Python · 257580 bytes Raw Blame History
1 """Tests for tool-batch execution on RuntimeContext."""
2
3 from __future__ import annotations
4
5 from pathlib import Path
6 from types import SimpleNamespace
7
8 import pytest
9
10 from loader.llm.base import Message, Role, ToolCall
11 from loader.runtime.context import RuntimeContext
12 from loader.runtime.dod import (
13 DefinitionOfDoneStore,
14 VerificationEvidence,
15 create_definition_of_done,
16 )
17 from loader.runtime.events import AgentEvent, TurnSummary
18 from loader.runtime.executor import ToolExecutionOutcome, ToolExecutionState
19 from loader.runtime.path_display import display_runtime_path
20 from loader.runtime.permissions import (
21 PermissionMode,
22 build_permission_policy,
23 load_permission_rules,
24 )
25 from loader.runtime.reasoning_types import (
26 ActionVerification,
27 ConfidenceAssessment,
28 ConfidenceLevel,
29 )
30 from loader.runtime.recovery import RecoveryContext
31 from loader.runtime.tool_batches import (
32 ToolBatchRunner,
33 )
34 from loader.runtime.tool_batches import (
35 _should_prioritize_missing_artifact as tool_batches_should_prioritize_missing_artifact,
36 )
37 from loader.runtime.workflow import sync_todos_to_definition_of_done
38 from loader.tools.base import ToolResult as RegistryToolResult
39 from loader.tools.base import create_default_registry
40 from tests.helpers.runtime_harness import ScriptedBackend
41
42
class FakeSession:
    """Minimal in-memory session double that records messages and timeline entries."""

    def __init__(self, messages: list[Message]) -> None:
        # Take a shallow copy so appends never touch the caller's list.
        self.messages = [*messages]
        self.workflow_timeline = []

    def append(self, message: Message) -> None:
        """Record ``message`` at the end of the conversation."""
        self.messages.append(message)

    def append_workflow_timeline_entry(self, entry) -> None:
        """Record ``entry`` at the end of the workflow timeline."""
        self.workflow_timeline.append(entry)
53
54
class FakeCodeFilter:
    """Stateless code-filter double whose reset is a no-op."""

    def reset(self) -> None:
        """Do nothing; there is no state to clear."""
        return
58
59
class FakeSafeguards:
    """Permissive safeguards double.

    Content passes through unfiltered, steering is never requested, and text
    loops are never reported. Only the result of ``detect_loop`` is
    configurable, through the ``detect_loop_result`` constructor argument.
    """

    def __init__(self, *, detect_loop_result: tuple[bool, str] = (False, "")) -> None:
        self._detect_loop_result = detect_loop_result
        self.code_filter = FakeCodeFilter()
        # Opaque placeholders for the tracker and validator attributes.
        self.action_tracker = object()
        self.validator = object()

    def filter_stream_chunk(self, content: str) -> str:
        """Return the streamed chunk unchanged."""
        return content

    def filter_complete_content(self, content: str) -> str:
        """Return the complete content unchanged."""
        return content

    def should_steer(self) -> bool:
        """Report that no steering is needed."""
        return False

    def get_steering_message(self) -> str | None:
        """Return ``None``: this double never produces a steering message."""
        return None

    def record_response(self, content: str) -> None:
        """Ignore the response; nothing is recorded."""
        return None

    def detect_text_loop(self, content: str) -> tuple[bool, str]:
        """Always report that no text loop was found."""
        return (False, "")

    def detect_loop(self) -> tuple[bool, str]:
        """Return the loop-detection result supplied at construction time."""
        return self._detect_loop_result
87
88
class FakeExecutor:
    """Executor double that replays queued outcomes in order and logs every call."""

    def __init__(self, outcomes: list[ToolExecutionOutcome]) -> None:
        # Copy the queue so the caller's list is not consumed.
        self._outcomes = [*outcomes]
        self.calls: list[ToolCall] = []

    async def execute_tool_call(self, tool_call: ToolCall, **_: object) -> ToolExecutionOutcome:
        """Log ``tool_call`` and hand back the next queued outcome.

        Extra keyword arguments are accepted and ignored.

        Raises:
            AssertionError: if no outcome is left in the queue. The call is
                still logged before the error is raised.
        """
        self.calls.append(tool_call)
        if self._outcomes:
            return self._outcomes.pop(0)
        raise AssertionError("No fake tool outcome queued")
99
100
def build_context(
    *,
    temp_dir: Path,
    messages: list[Message],
    safeguards: FakeSafeguards,
    assess_confidence,
    verify_action,
    recovery_context: RecoveryContext | None = None,
    confidence_scoring: bool = False,
    verification: bool = False,
    auto_recover: bool = True,
    min_confidence_for_action: int = 3,
) -> RuntimeContext:
    """Assemble a ``RuntimeContext`` wired with test doubles.

    The context uses a default tool registry rooted at ``temp_dir``, a
    workspace-write permission policy built from the rules loaded from
    ``temp_dir``, a ``ScriptedBackend``, and a ``FakeSession`` seeded with
    ``messages``. ``assess_confidence`` and ``verify_action`` are exposed as
    the context's reasoning services; the remaining keyword arguments are
    copied into the config namespaces.
    """
    registry = create_default_registry(temp_dir)
    registry.configure_workspace_root(temp_dir)
    rule_status = load_permission_rules(temp_dir)
    permission_policy = build_permission_policy(
        active_mode=PermissionMode.WORKSPACE_WRITE,
        workspace_root=temp_dir,
        tool_requirements=registry.get_tool_requirements(),
        rules=rule_status.rules,
    )

    # Plain namespaces stand in for the real config and reasoning objects.
    reasoning_config = SimpleNamespace(
        rollback=False,
        show_rollback_plan=False,
        completion_check=True,
        max_continuation_prompts=5,
        self_critique=False,
        confidence_scoring=confidence_scoring,
        min_confidence_for_action=min_confidence_for_action,
        verification=verification,
    )
    runtime_config = SimpleNamespace(
        force_react=False,
        max_recovery_attempts=2,
        auto_recover=auto_recover,
        reasoning=reasoning_config,
    )
    reasoning_services = SimpleNamespace(
        assess_confidence=assess_confidence,
        verify_action=verify_action,
    )

    return RuntimeContext(
        project_root=temp_dir,
        backend=ScriptedBackend(),
        registry=registry,
        session=FakeSession(messages),  # type: ignore[arg-type]
        config=runtime_config,
        capability_profile=SimpleNamespace(supports_native_tools=True),  # type: ignore[arg-type]
        project_context=None,
        permission_policy=permission_policy,
        permission_config_status=rule_status,
        workflow_mode="execute",
        safeguards=safeguards,
        reasoning=reasoning_services,
        recovery_context=recovery_context,
    )
156
157
def tool_outcome(
    *,
    tool_call: ToolCall,
    output: str,
    is_error: bool,
    state: ToolExecutionState = ToolExecutionState.EXECUTED,
    metadata: dict[str, object] | None = None,
) -> ToolExecutionOutcome:
    """Build a ``ToolExecutionOutcome`` in which every text field equals ``output``.

    The same ``output`` string is used for the tool-result message (display and
    result content), the event content, the result output, and the registry
    result. ``metadata`` is attached to the registry result; a falsy value is
    replaced by an empty dict.
    """
    result_message = Message.tool_result_message(
        tool_call_id=tool_call.id,
        display_content=output,
        result_content=output,
        is_error=is_error,
    )
    registry_result = RegistryToolResult(
        output=output,
        is_error=is_error,
        metadata=metadata if metadata else {},
    )
    return ToolExecutionOutcome(
        tool_call=tool_call,
        state=state,
        message=result_message,
        event_content=output,
        is_error=is_error,
        result_output=output,
        registry_result=registry_result,
    )
184
185
@pytest.mark.asyncio
async def test_tool_batch_runner_uses_context_for_confidence_gate(temp_dir: Path) -> None:
    """Check that a LOW confidence assessment keeps the tool call from running.

    The stubbed assessor stores the context string it is given. Afterwards the
    test expects no actions taken, no executor calls, the user's message inside
    the captured context, a user-role low-confidence warning as the last
    session message, and an emitted "confidence" event.
    """
    seen: dict[str, str] = {}

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        # Keep the context so the assertions below can inspect it.
        seen["context"] = context
        return ConfidenceAssessment(
            action=f"{tool_name} with {tool_args}",
            tool_name=tool_name,
            tool_args=tool_args,
            level=ConfidenceLevel.LOW,
            reasoning="Need to inspect the target first.",
            risks=["Unknown target file"],
        )

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run for skipped actions")

    history = [
        Message(role=Role.USER, content="Please inspect the project."),
        Message(role=Role.ASSISTANT, content="I will read the file next."),
    ]
    context = build_context(
        temp_dir=temp_dir,
        messages=history,
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        confidence_scoring=True,
        min_confidence_for_action=3,
    )
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))
    read_call = ToolCall(id="read-1", name="read", arguments={"file_path": "README.md"})
    recorded: list[AgentEvent] = []

    async def emit(event: AgentEvent) -> None:
        recorded.append(event)

    # The queued outcome must stay unused: the gate should skip execution.
    executor = FakeExecutor([tool_outcome(tool_call=read_call, output="unused", is_error=False)])
    result = await runner.execute_batch(
        tool_calls=[read_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=emit,
        summary=TurnSummary(final_response=""),
        dod=create_definition_of_done("Read the docs"),
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert result.actions_taken == []
    assert executor.calls == []
    assert "Please inspect the project." in seen["context"]
    last_message = context.session.messages[-1]
    assert last_message.role == Role.USER
    assert "[LOW CONFIDENCE WARNING]" in last_message.content
    assert any(event.type == "confidence" for event in recorded)
245
246
@pytest.mark.asyncio
async def test_tool_batch_runner_tracks_recovery_with_legacy_context(temp_dir: Path) -> None:
    """Run one failing bash call with auto-recovery on and check recovery tracking.

    Afterwards a recovery context must exist, the tool result message must be
    recorded on the summary and be the last session message, and a "recovery"
    event must have been emitted.
    """

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run for failed actions")

    context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=True,
    )
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))
    failing_call = ToolCall(id="bash-1", name="bash", arguments={"command": "pytest"})
    failed_outcome = tool_outcome(tool_call=failing_call, output="command failed", is_error=True)
    executor = FakeExecutor([failed_outcome])
    summary = TurnSummary(final_response="")
    recorded: list[AgentEvent] = []

    async def emit(event: AgentEvent) -> None:
        recorded.append(event)

    await runner.execute_batch(
        tool_calls=[failing_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=emit,
        summary=summary,
        dod=create_definition_of_done("Run tests"),
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert context.recovery_context is not None
    assert summary.tool_result_messages
    assert context.session.messages[-1] == summary.tool_result_messages[-1]
    assert any(event.type == "recovery" for event in recorded)
290
291
@pytest.mark.asyncio
async def test_tool_batch_runner_emits_tool_metadata(temp_dir: Path) -> None:
    """Check that registry-result metadata is carried on the "tool_result" event.

    A background bash call succeeds with job metadata attached; the first
    emitted "tool_result" event must expose the same metadata dict.
    """

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run for this scenario")

    context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))
    server_call = ToolCall(
        id="bash-1",
        name="bash",
        arguments={"command": "python -m http.server 8000", "background": True},
    )
    job_metadata = {
        "job_id": "bash-1",
        "status": "running",
        "background": True,
    }
    started_outcome = tool_outcome(
        tool_call=server_call,
        output="Started bash job bash-1",
        is_error=False,
        metadata=job_metadata,
    )
    executor = FakeExecutor([started_outcome])
    recorded: list[AgentEvent] = []

    async def emit(event: AgentEvent) -> None:
        recorded.append(event)

    await runner.execute_batch(
        tool_calls=[server_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=emit,
        summary=TurnSummary(final_response=""),
        dod=create_definition_of_done("Launch a preview server"),
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    # Only the first tool_result event is inspected.
    result_event = next(event for event in recorded if event.type == "tool_result")
    assert result_event.tool_metadata == job_metadata
350
351
@pytest.mark.asyncio
async def test_tool_batch_runner_verifies_with_context_services(temp_dir: Path) -> None:
    """Check that verification goes through the context's ``verify_action`` service.

    With verification enabled and a recovery context already in place, a
    successful read must be passed to the stubbed verifier exactly once. The
    same recovery object must stay on the context with the read recorded as a
    successful step, the tool result must be the last session message, and a
    "verification" event must be emitted.
    """
    verified_results: list[str] = []

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        # Log the verified output, then report a failed verification.
        verified_results.append(result)
        return ActionVerification(
            tool_name=tool_name,
            tool_args=tool_args,
            expected_outcome="Success",
            actual_result=result,
            verified=False,
            discrepancies=["File contents did not match"],
            needs_correction=True,
            correction_suggestion="Read the file before editing again.",
        )

    existing_recovery = RecoveryContext(
        original_tool="edit",
        original_args={"file_path": "README.md"},
    )
    context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        recovery_context=existing_recovery,
        verification=True,
    )
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))
    read_call = ToolCall(id="read-1", name="read", arguments={"file_path": "README.md"})
    read_outcome = tool_outcome(tool_call=read_call, output="file contents", is_error=False)
    executor = FakeExecutor([read_outcome])
    recorded: list[AgentEvent] = []

    async def emit(event: AgentEvent) -> None:
        recorded.append(event)

    await runner.execute_batch(
        tool_calls=[read_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=emit,
        summary=TurnSummary(final_response=""),
        dod=create_definition_of_done("Read the docs"),
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert verified_results == ["file contents"]
    assert context.recovery_context is existing_recovery
    assert existing_recovery.successful_steps == [
        ("read", {"file_path": "README.md"})
    ]
    last_message = context.session.messages[-1]
    assert last_message.role == Role.TOOL
    assert last_message.content == "file contents"
    assert any(event.type == "verification" for event in recorded)
415
416
@pytest.mark.asyncio
async def test_tool_batch_runner_preserves_recovery_context_across_diagnostic_success(
    temp_dir: Path,
) -> None:
    """Check that a successful ``bash`` listing does not clear an active recovery.

    A recovery context with one failed read attempt is in place. After a
    successful ``ls`` the same recovery object must remain on the context, with
    the bash call recorded as its only successful step.
    """

    async def assess_confidence(tool_name: str, tool_args: dict, context: str) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def verify_action(tool_name: str, tool_args: dict, result: str, expected: str = "") -> ActionVerification:
        raise AssertionError("Verification should not run for this scenario")

    existing_recovery = RecoveryContext(
        original_tool="read",
        original_args={"file_path": "chapters/04-data-types.html"},
    )
    existing_recovery.add_attempt(
        "read",
        {"file_path": "chapters/04-data-types.html"},
        "File not found",
    )
    context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        recovery_context=existing_recovery,
        auto_recover=False,
    )
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))
    listing_call = ToolCall(id="bash-1", name="bash", arguments={"command": "ls chapters"})
    listing_outcome = tool_outcome(
        tool_call=listing_call,
        output="01-introduction.html",
        is_error=False,
    )
    executor = FakeExecutor([listing_outcome])

    summary = TurnSummary(final_response="")
    await runner.execute_batch(
        tool_calls=[listing_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=summary,
        dod=create_definition_of_done("Fix the chapter links"),
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert context.recovery_context is existing_recovery
    expected_steps = [("bash", {"command": "ls chapters"})]
    assert existing_recovery.successful_steps == expected_steps
483
484
@pytest.mark.asyncio
async def test_tool_batch_runner_clears_recovery_context_after_successful_mutation(
    temp_dir: Path,
) -> None:
    """Check that a successful ``patch`` call clears an active recovery context.

    A recovery context with one failed read attempt is in place; after the
    patch outcome succeeds, the context's ``recovery_context`` must be ``None``.
    """

    async def assess_confidence(tool_name: str, tool_args: dict, context: str) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def verify_action(tool_name: str, tool_args: dict, result: str, expected: str = "") -> ActionVerification:
        raise AssertionError("Verification should not run for this scenario")

    existing_recovery = RecoveryContext(
        original_tool="read",
        original_args={"file_path": "chapters/04-data-types.html"},
    )
    existing_recovery.add_attempt(
        "read",
        {"file_path": "chapters/04-data-types.html"},
        "File not found",
    )
    context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        recovery_context=existing_recovery,
        auto_recover=False,
    )
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))
    single_hunk = {
        "old_start": 1,
        "old_lines": 1,
        "new_start": 1,
        "new_lines": 1,
        "lines": ["-a", "+b"],
    }
    patch_call = ToolCall(
        id="patch-1",
        name="patch",
        arguments={"file_path": "index.html", "hunks": [single_hunk]},
    )
    patch_outcome = tool_outcome(
        tool_call=patch_call,
        output="Patched index.html",
        is_error=False,
    )
    executor = FakeExecutor([patch_outcome])

    summary = TurnSummary(final_response="")
    await runner.execute_batch(
        tool_calls=[patch_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=summary,
        dod=create_definition_of_done("Fix the chapter links"),
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert context.recovery_context is None
551
552
@pytest.mark.asyncio
async def test_tool_batch_runner_queues_duplicate_observation_nudge(
    temp_dir: Path,
) -> None:
    """Check the steering nudge queued after a duplicate read of ``index.html``.

    The implementation plan lists ``chapters/04-variables.html``, which is
    never written to disk. When the executor reports the read as a DUPLICATE,
    exactly one persistent steering message must be queued. It has to tell the
    model to reuse the earlier observation, flag the missing artifact, and
    suggest writing the missing chapter. No ephemeral message may be queued.
    """

    async def assess_confidence(tool_name: str, tool_args: dict, context: str) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def verify_action(tool_name: str, tool_args: dict, result: str, expected: str = "") -> ActionVerification:
        raise AssertionError("Verification should not run for this scenario")

    # Prior conversation: a glob observation, a chapter read, and an index read.
    glob_observation = Message(
        role=Role.TOOL,
        content=(
            "Observation [glob]: Result: "
            f"{temp_dir}/chapters/01-introduction.html\n"
            f"{temp_dir}/chapters/02-setup.html\n"
            f"{temp_dir}/chapters/03-basics.html"
        ),
        tool_results=[],
    )
    chapter_read_request = Message(
        role=Role.ASSISTANT,
        content="I already inspected the first chapter title.",
        tool_calls=[
            ToolCall(
                id="read-ch1",
                name="read",
                arguments={"file_path": str(temp_dir / 'chapters' / '01-introduction.html')},
            )
        ],
    )
    chapter_read_result = Message.tool_result_message(
        tool_call_id="read-ch1",
        display_content="<h1>Chapter 1: Introduction to Fortran</h1>\n",
        result_content="<h1>Chapter 1: Introduction to Fortran</h1>\n",
    )
    index_read_request = Message(
        role=Role.ASSISTANT,
        content="I should update the index now.",
        tool_calls=[
            ToolCall(
                id="read-index",
                name="read",
                arguments={"file_path": str(temp_dir / 'index.html')},
            )
        ],
    )
    messages = [
        glob_observation,
        chapter_read_request,
        chapter_read_result,
        index_read_request,
    ]
    context = build_context(
        temp_dir=temp_dir,
        messages=messages,
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )

    # Workspace on disk: an index plus chapters 1-3. Chapter 4 is left missing.
    (temp_dir / "chapters").mkdir()
    (temp_dir / "index.html").write_text("<ul></ul>\n")
    (temp_dir / "chapters" / "01-introduction.html").write_text("<h1>Intro</h1>\n")
    (temp_dir / "chapters" / "02-setup.html").write_text("<h1>Setup</h1>\n")
    (temp_dir / "chapters" / "03-basics.html").write_text("<h1>Basics</h1>\n")
    implementation_plan = temp_dir / "implementation.md"
    plan_lines = [
        "# Implementation Plan",
        "",
        "## File Changes",
        f"- `{temp_dir / 'index.html'}`",
        f"- `{temp_dir / 'chapters' / '01-introduction.html'}`",
        f"- `{temp_dir / 'chapters' / '02-setup.html'}`",
        f"- `{temp_dir / 'chapters' / '03-basics.html'}`",
        f"- `{temp_dir / 'chapters' / '04-variables.html'}`",
    ]
    implementation_plan.write_text("\n".join(plan_lines))
    context.session.current_task = (
        f"Update {temp_dir / 'index.html'} with the right chapter links."
    )

    # Capture steering messages through the context callbacks.
    persistent_messages: list[str] = []
    ephemeral_messages: list[str] = []
    context.queue_steering_message_callback = persistent_messages.append
    context.queue_ephemeral_steering_message_callback = ephemeral_messages.append
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))
    tool_call = ToolCall(
        id="read-dup",
        name="read",
        arguments={"file_path": str(temp_dir / "index.html")},
    )
    duplicate_message = (
        "[Skipped - duplicate action: Already read "
        f"{temp_dir / 'index.html'} recently without any intervening changes; "
        "reuse the earlier read result instead of rereading]"
    )
    duplicate_result = Message.tool_result_message(
        tool_call_id=tool_call.id,
        display_content=duplicate_message,
        result_content=duplicate_message,
    )
    duplicate_outcome = ToolExecutionOutcome(
        tool_call=tool_call,
        state=ToolExecutionState.DUPLICATE,
        message=duplicate_result,
        event_content=duplicate_message,
        is_error=False,
        result_output=duplicate_message,
    )
    executor = FakeExecutor([duplicate_outcome])

    summary = TurnSummary(final_response="")
    dod = create_definition_of_done("Fix the chapter links")
    dod.implementation_plan = str(implementation_plan)
    dod.pending_items.append("Create the remaining chapter files")
    await runner.execute_batch(
        tool_calls=[tool_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=summary,
        dod=dod,
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert len(persistent_messages) == 1
    nudge = persistent_messages[0]
    assert "Reuse the earlier observation instead of repeating it." in nudge
    assert "A declared output artifact is still missing." in nudge
    assert "Resume by creating `04-variables.html` now." in nudge
    write_hint = (
        "Prefer one `write` call for "
        f"`{display_runtime_path(temp_dir / 'chapters' / '04-variables.html')}` instead of more rereads."
    )
    assert write_hint in nudge
    assert ephemeral_messages == []
702
703
@pytest.mark.asyncio
async def test_tool_batch_runner_duplicate_read_keeps_root_declared_missing_html_output_active(
    temp_dir: Path,
) -> None:
    """Check the nudge when the guide index links to a chapter that is not on disk.

    ``guide/index.html`` links to ``chapters/02-installation.html``, but only
    chapter one exists. The plan names the index and the chapters directory.
    After a DUPLICATE read of the index, exactly one persistent steering
    message must be queued. It has to mention the pending item, tell the model
    to create ``02-installation.html``, and must not claim that all planned
    artifacts already exist. No ephemeral message may be queued.
    """

    async def assess_confidence(tool_name: str, tool_args: dict, context: str) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should not run for this scenario")

    async def verify_action(tool_name: str, tool_args: dict, result: str, expected: str = "") -> ActionVerification:
        raise AssertionError("Verification should not run for this scenario")

    # Guide tree on disk: an index with two links, but only the first chapter.
    guide_root = temp_dir / "guide"
    chapters = guide_root / "chapters"
    chapters.mkdir(parents=True)
    index = guide_root / "index.html"
    chapter_one = chapters / "01-introduction.html"
    index.write_text(
        '<a href="chapters/01-introduction.html">Intro</a>\n'
        '<a href="chapters/02-installation.html">Install</a>\n'
    )
    chapter_one.write_text("<h1>Intro</h1>\n")

    implementation_plan = temp_dir / "implementation.md"
    plan_lines = [
        "# Implementation Plan",
        "",
        "## File Changes",
        f"- `{index}`",
        f"- `{chapters}/` (directory for chapter files)",
    ]
    implementation_plan.write_text("\n".join(plan_lines))

    earlier_read = Message(
        role=Role.ASSISTANT,
        content="I should keep building the guide.",
        tool_calls=[
            ToolCall(
                id="read-index",
                name="read",
                arguments={"file_path": str(index)},
            )
        ],
    )
    messages = [earlier_read]
    context = build_context(
        temp_dir=temp_dir,
        messages=messages,
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )
    context.session.current_task = f"Build the guide rooted at {index}."

    # Capture steering messages through the context callbacks.
    persistent_messages: list[str] = []
    ephemeral_messages: list[str] = []
    context.queue_steering_message_callback = persistent_messages.append
    context.queue_ephemeral_steering_message_callback = ephemeral_messages.append
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))
    tool_call = ToolCall(
        id="read-dup-rooted",
        name="read",
        arguments={"file_path": str(index)},
    )
    duplicate_message = (
        "[Skipped - duplicate action: Already read "
        f"{index} recently without any intervening changes; "
        "reuse the earlier read result instead of rereading]"
    )
    duplicate_result = Message.tool_result_message(
        tool_call_id=tool_call.id,
        display_content=duplicate_message,
        result_content=duplicate_message,
    )
    duplicate_outcome = ToolExecutionOutcome(
        tool_call=tool_call,
        state=ToolExecutionState.DUPLICATE,
        message=duplicate_result,
        event_content=duplicate_message,
        is_error=False,
        result_output=duplicate_message,
    )
    executor = FakeExecutor([duplicate_outcome])

    summary = TurnSummary(final_response="")
    dod = create_definition_of_done("Create a multi-file HTML guide with chapters.")
    dod.implementation_plan = str(implementation_plan)
    dod.touched_files = [str(index), str(chapter_one)]
    dod.completed_items = ["Create chapter files with appropriate content"]
    dod.pending_items.append("Create the remaining chapter files")

    await runner.execute_batch(
        tool_calls=[tool_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=summary,
        dod=dod,
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert len(persistent_messages) == 1
    nudge = persistent_messages[0]
    assert "Create the remaining chapter files" in nudge
    assert "Resume by creating `02-installation.html` now." in nudge
    assert "All explicitly planned artifacts already exist on disk." not in nudge
    assert ephemeral_messages == []
827
828
@pytest.mark.asyncio
async def test_tool_batch_runner_todo_write_does_not_regress_completed_file_todo(
    temp_dir: Path,
) -> None:
    """Check that a stale ``TodoWrite`` cannot move a finished file todo back to pending.

    The batch first writes ``03-first-website.html`` and then submits a todo
    list that still marks that file as pending. Afterwards the chapter-3 todo
    must be in ``completed_items`` and absent from ``pending_items``, while the
    chapter-4 todo stays pending.
    """

    async def assess_confidence(tool_name: str, tool_args: dict, context: str) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should not run for this scenario")

    async def verify_action(tool_name: str, tool_args: dict, result: str, expected: str = "") -> ActionVerification:
        raise AssertionError("Verification should not run for this scenario")

    def pending_todos() -> list[dict[str, str]]:
        # Build fresh list and dict objects on every call so the three
        # consumers below never share mutable state.
        return [
            {
                "content": "Create 03-first-website.html",
                "active_form": "Creating 03-first-website.html",
                "status": "pending",
            },
            {
                "content": "Create 04-configuration-basics.html",
                "active_form": "Creating 04-configuration-basics.html",
                "status": "pending",
            },
        ]

    context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))
    dod = create_definition_of_done("Create a multi-file nginx guide.")
    sync_todos_to_definition_of_done(dod, pending_todos())

    chapter_path = temp_dir / "guides" / "nginx" / "chapters" / "03-first-website.html"
    chapter_path.parent.mkdir(parents=True)
    write_call = ToolCall(
        id="write-ch3",
        name="write",
        arguments={"file_path": str(chapter_path), "content": "<html></html>\n"},
    )
    stale_todo_call = ToolCall(
        id="todo-stale",
        name="TodoWrite",
        arguments={"todos": pending_todos()},
    )
    write_outcome = tool_outcome(
        tool_call=write_call,
        output=f"Successfully wrote {chapter_path}",
        is_error=False,
    )
    stale_todo_outcome = tool_outcome(
        tool_call=stale_todo_call,
        output="Todos updated",
        is_error=False,
        metadata={"new_todos": pending_todos()},
    )
    executor = FakeExecutor([write_outcome, stale_todo_outcome])

    summary = TurnSummary(final_response="")
    await runner.execute_batch(
        tool_calls=[write_call, stale_todo_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=summary,
        dod=dod,
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert "Create 03-first-website.html" in dod.completed_items
    assert "Create 03-first-website.html" not in dod.pending_items
    assert "Create 04-configuration-basics.html" in dod.pending_items
946
947
@pytest.mark.asyncio
async def test_tool_batch_runner_proactively_queues_verified_html_inventory(
    temp_dir: Path,
) -> None:
    """Check what a successful chapter ``glob`` leaves behind.

    After globbing two existing chapter files, no persistent or ephemeral
    steering message may be queued. The summary must hold exactly one tool
    result message, and that message must not contain the text
    "Verified chapter inventory:".
    """
    # NOTE(review): the test name says "queues", but the assertions expect that
    # nothing is queued - confirm the name still matches the intended behavior.

    async def assess_confidence(tool_name: str, tool_args: dict, context: str) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def verify_action(tool_name: str, tool_args: dict, result: str, expected: str = "") -> ActionVerification:
        raise AssertionError("Verification should not run for this scenario")

    # Workspace on disk: two chapter files and an index with an empty list.
    chapters = temp_dir / "chapters"
    chapters.mkdir()
    (chapters / "01-introduction.html").write_text(
        "<h1>Chapter 1: Introduction to Fortran</h1>\n"
    )
    (chapters / "02-setup.html").write_text(
        "<h1>Chapter 2: Setting Up Your Environment</h1>\n"
    )
    (temp_dir / "index.html").write_text("<ul></ul>\n")

    context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )
    context.session.current_task = (
        f"Update {temp_dir / 'index.html'} so the chapter links match the sibling files."
    )

    # Capture steering messages through the context callbacks.
    persistent_messages: list[str] = []
    ephemeral_messages: list[str] = []
    context.queue_steering_message_callback = persistent_messages.append
    context.queue_ephemeral_steering_message_callback = ephemeral_messages.append
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))
    glob_call = ToolCall(
        id="glob-1",
        name="glob",
        arguments={"path": str(chapters), "pattern": "*.html"},
    )
    listed_paths = [
        str(chapters / "01-introduction.html"),
        str(chapters / "02-setup.html"),
    ]
    glob_outcome = tool_outcome(
        tool_call=glob_call,
        output="\n".join(listed_paths),
        is_error=False,
    )
    executor = FakeExecutor([glob_outcome])

    summary = TurnSummary(final_response="")
    await runner.execute_batch(
        tool_calls=[glob_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=summary,
        dod=create_definition_of_done("Fix the chapter links"),
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert persistent_messages == []
    assert ephemeral_messages == []
    assert len(summary.tool_result_messages) == 1
    assert "Verified chapter inventory:" not in summary.tool_result_messages[0].content
1032
1033
@pytest.mark.asyncio
async def test_tool_batch_runner_marks_validated_html_toc_completion_after_successful_edit(
    temp_dir: Path,
) -> None:
    """Expect no steering and no verification preview after a successful TOC edit.

    ``index.html`` is written with the replacement chapter list before the
    batch runs, and the scripted outcome reports the ``edit`` call as a success.
    """

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        """Fail the test if the runner asks for a confidence score."""
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        """Fail the test if the runner asks for action verification."""
        raise AssertionError("Verification should not run for this scenario")

    # The same sentence is used as the session task and as the definition-of-done goal.
    task = (
        "Update index.html so every chapter link and title matches the real HTML files in chapters/."
    )

    chapters_dir = temp_dir / "chapters"
    chapters_dir.mkdir()
    for file_name, heading in (
        ("01-introduction.html", "<h1>Chapter 1: Introduction to Fortran</h1>\n"),
        ("02-setup.html", "<h1>Chapter 2: Setting Up Your Environment</h1>\n"),
    ):
        (chapters_dir / file_name).write_text(heading)

    toc_open = '<ul class="chapter-list">\n'
    toc_close = "</ul>\n"
    stale_toc = (
        toc_open
        + ' <li><a href="chapters/01-old.html">Chapter 1: Old</a></li>\n'
        + ' <li><a href="chapters/02-old.html">Chapter 2: Old</a></li>\n'
        + toc_close
    )
    fixed_toc = (
        toc_open
        + ' <li><a href="chapters/01-introduction.html">Chapter 1: Introduction to Fortran</a></li>\n'
        + ' <li><a href="chapters/02-setup.html">Chapter 2: Setting Up Your Environment</a></li>\n'
        + toc_close
    )
    index_file = temp_dir / "index.html"
    # The file holds the replacement block up front, i.e. the state a successful edit leaves behind.
    index_file.write_text(fixed_toc)

    context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )
    context.session.current_task = task
    steering: list[str] = []
    transient: list[str] = []
    context.queue_steering_message_callback = steering.append
    context.queue_ephemeral_steering_message_callback = transient.append
    batch_runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))

    edit_call = ToolCall(
        id="edit-1",
        name="edit",
        arguments={
            "file_path": str(index_file),
            "old_string": stale_toc,
            "new_string": fixed_toc,
        },
    )
    scripted_outcome = tool_outcome(
        tool_call=edit_call,
        output=f"Successfully edited {index_file}",
        is_error=False,
    )
    executor = FakeExecutor([scripted_outcome])

    turn = TurnSummary(final_response="")
    await batch_runner.execute_batch(
        tool_calls=[edit_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=turn,
        dod=create_definition_of_done(task),
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert not any(
        "Semantic verification preview:" in message.content
        for message in turn.tool_result_messages
    )
    assert steering == []
    assert transient == []
1134
1135
@pytest.mark.asyncio
async def test_tool_batch_runner_does_not_apply_html_toc_handoff_to_reference_read(
    temp_dir: Path,
) -> None:
    """Reading ``index.html`` for a guide-authoring prompt must stay quiet.

    The test expects no persistent or ephemeral steering and no semantic
    verification preview in the tool result messages.
    """

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        """Fail the test if the runner asks for a confidence score."""
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        """Fail the test if the runner asks for action verification."""
        raise AssertionError("Verification should not run for this scenario")

    chapters_dir = temp_dir / "chapters"
    chapters_dir.mkdir()
    for file_name, heading in (
        ("01-introduction.html", "<h1>Chapter 1: Introduction to Fortran</h1>\n"),
        ("02-setup.html", "<h1>Chapter 2: Setting Up Your Environment</h1>\n"),
    ):
        (chapters_dir / file_name).write_text(heading)

    index_file = temp_dir / "index.html"
    toc_lines = [
        "<h2>Table of Contents</h2>\n",
        '<ul class="chapter-list">\n',
        ' <li><a href="chapters/01-introduction.html">Chapter 1: Introduction to Fortran</a></li>\n',
        ' <li><a href="chapters/02-setup.html">Chapter 2: Setting Up Your Environment</a></li>\n',
        "</ul>\n",
    ]
    index_file.write_text("".join(toc_lines))

    prompt = "".join(
        [
            "Have a look at ~/Loader/guides/fortran and chapters/ within. Get a feel ",
            "for the structure and cadence of the guide. We are going to make an all ",
            "new equally thorough guide on how to use the nginx tool.",
        ]
    )

    context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )
    context.session.current_task = prompt  # type: ignore[attr-defined]
    steering: list[str] = []
    transient: list[str] = []
    context.queue_steering_message_callback = steering.append
    context.queue_ephemeral_steering_message_callback = transient.append
    batch_runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))

    read_call = ToolCall(
        id="read-index",
        name="read",
        arguments={"file_path": str(index_file)},
    )
    # The scripted read result is whatever is currently stored in index.html.
    scripted_outcome = tool_outcome(
        tool_call=read_call,
        output=index_file.read_text(),
        is_error=False,
    )
    executor = FakeExecutor([scripted_outcome])

    turn = TurnSummary(final_response="")
    await batch_runner.execute_batch(
        tool_calls=[read_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=turn,
        dod=create_definition_of_done(prompt),
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert steering == []
    assert transient == []
    assert not any(
        "Semantic verification preview:" in message.content
        for message in turn.tool_result_messages
    )
1228
1229
@pytest.mark.asyncio
async def test_tool_batch_runner_queues_next_pending_todo_after_discovery_progress(
    temp_dir: Path,
) -> None:
    """A successful reference read completes the discovery todo and queues the next one.

    The persistent steering is expected to name the next pending todo, tell the
    agent to create ``chapters/``, and not mention the reference chapter file.
    """

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        """Fail the test if the runner asks for a confidence score."""
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        """Fail the test if the runner asks for action verification."""
        raise AssertionError("Verification should not run for this scenario")

    reference_html = "<h1>Introduction</h1>\n<p>Guide cadence.</p>\n"
    reference_file = temp_dir / "fortran" / "chapters" / "01-introduction.html"
    reference_file.parent.mkdir(parents=True)
    reference_file.write_text(reference_html)

    # Planned targets are only named in the plan; nothing under nginx/ is created here.
    nginx_dir = temp_dir / "Loader" / "guides" / "nginx"
    chapters_dir = nginx_dir / "chapters"
    plan_lines = [
        "# Implementation Plan",
        "",
        "## File Changes",
        f"- `{chapters_dir}/`",
        f"- `{nginx_dir / 'index.html'}`",
        "",
    ]
    plan_file = temp_dir / "implementation.md"
    plan_file.write_text("\n".join(plan_lines))

    context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )
    steering: list[str] = []
    transient: list[str] = []
    context.queue_steering_message_callback = steering.append
    context.queue_ephemeral_steering_message_callback = transient.append
    batch_runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))

    examine_title = (
        "Examine the existing Fortran guide structure to understand the cadence and format"
    )
    directory_title = "Create the nginx directory structure"
    index_title = "Create the nginx index.html file"
    todos = [
        {
            "content": title,
            "active_form": f"Working on: {title}",
            "status": "pending",
        }
        for title in (examine_title, directory_title, index_title)
    ]
    dod = create_definition_of_done("Create an equally thorough nginx guide.")
    dod.implementation_plan = str(plan_file)
    sync_todos_to_definition_of_done(dod, todos)

    read_call = ToolCall(
        id="read-reference",
        name="read",
        arguments={"file_path": str(reference_file)},
    )
    scripted_outcome = tool_outcome(
        tool_call=read_call,
        output=reference_html,
        is_error=False,
    )
    executor = FakeExecutor([scripted_outcome])

    turn = TurnSummary(final_response="")
    await batch_runner.execute_batch(
        tool_calls=[read_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=turn,
        dod=dod,
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert examine_title in dod.completed_items
    next_item_hint = f"Continue with the next pending item: `{directory_title}`"
    assert any(next_item_hint in message for message in steering)
    assert any("Resume by creating `chapters/` now." in message for message in steering)
    assert not any("01-introduction.html" in message for message in steering)
    assert transient == []
1348
1349
@pytest.mark.asyncio
async def test_tool_batch_runner_queues_setup_directory_before_file_when_plan_lists_index_first(
    temp_dir: Path,
) -> None:
    """Steering should ask for ``chapters/`` even when the plan lists ``index.html`` first.

    The implementation plan names the index file ahead of the chapters
    directory; the queued steering must still resume with the directory and
    must not contain the "create `index.html`" next-step text.
    """

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        """Fail the test if the runner asks for a confidence score."""
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        """Fail the test if the runner asks for action verification."""
        raise AssertionError("Verification should not run for this scenario")

    reference_html = "<h1>Introduction</h1>\n<p>Guide cadence.</p>\n"
    reference_file = temp_dir / "fortran" / "chapters" / "01-introduction.html"
    reference_file.parent.mkdir(parents=True)
    reference_file.write_text(reference_html)

    nginx_dir = temp_dir / "Loader" / "guides" / "nginx"
    chapters_dir = nginx_dir / "chapters"
    # Index file deliberately precedes the directory in the plan's file list.
    plan_lines = [
        "# Implementation Plan",
        "",
        "## File Changes",
        f"- `{nginx_dir / 'index.html'}`",
        f"- `{chapters_dir}/`",
        "",
    ]
    plan_file = temp_dir / "implementation.md"
    plan_file.write_text("\n".join(plan_lines))

    context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )
    steering: list[str] = []
    transient: list[str] = []
    context.queue_steering_message_callback = steering.append
    context.queue_ephemeral_steering_message_callback = transient.append
    batch_runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))

    directory_title = "Create the nginx directory structure"
    todo_titles = (
        "Examine the existing Fortran guide structure to understand the cadence and format",
        directory_title,
        "Create the nginx index.html file",
    )
    todos = [
        {
            "content": title,
            "active_form": f"Working on: {title}",
            "status": "pending",
        }
        for title in todo_titles
    ]
    dod = create_definition_of_done("Create an equally thorough nginx guide.")
    dod.implementation_plan = str(plan_file)
    sync_todos_to_definition_of_done(dod, todos, project_root=temp_dir)

    read_call = ToolCall(
        id="read-reference-index-first",
        name="read",
        arguments={"file_path": str(reference_file)},
    )
    scripted_outcome = tool_outcome(
        tool_call=read_call,
        output=reference_html,
        is_error=False,
    )
    executor = FakeExecutor([scripted_outcome])

    turn = TurnSummary(final_response="")
    await batch_runner.execute_batch(
        tool_calls=[read_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=turn,
        dod=dod,
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert steering
    next_item_hint = f"Continue with the next pending item: `{directory_title}`"
    assert any(next_item_hint in message for message in steering)
    assert any("Resume by creating `chapters/` now." in message for message in steering)
    assert not any("Next step: create `index.html`." in message for message in steering)
    assert transient == []
1469
1470
@pytest.mark.asyncio
async def test_tool_batch_runner_duplicate_reference_read_prefers_next_pending_todo(
    temp_dir: Path,
) -> None:
    """A duplicate reference read should produce one steering message naming the next todo.

    The single message must tell the agent to reuse the earlier observation,
    point at the next pending todo, and contain no "Update `" instruction.
    """

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        """Fail the test if the runner asks for a confidence score."""
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        """Fail the test if the runner asks for action verification."""
        raise AssertionError("Verification should not run for this scenario")

    reference_html = "<h1>Fortran Beginner's Guide</h1>\n"
    reference_file = temp_dir / "fortran" / "index.html"
    reference_file.parent.mkdir(parents=True)
    reference_file.write_text(reference_html)

    # Seed the session with a prior read observation of the same content.
    prior_observation = Message(
        role=Role.TOOL,
        content=f"Observation [read]: Result: {reference_html}",
    )
    context = build_context(
        temp_dir=temp_dir,
        messages=[prior_observation],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )
    prompt = "".join(
        [
            "Have a look at ~/Loader/guides/fortran and chapters/ within. Get a feel ",
            "for the structure and cadence of the guide. We are going to make an all ",
            "new equally thorough guide on how to use the nginx tool.",
        ]
    )
    context.session.current_task = prompt
    steering: list[str] = []
    transient: list[str] = []
    context.queue_steering_message_callback = steering.append
    context.queue_ephemeral_steering_message_callback = transient.append
    batch_runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))

    directory_title = "Create the nginx directory structure"
    todo_plan = (
        (
            "Examine the existing Fortran guide structure to understand the cadence and format",
            "completed",
        ),
        (directory_title, "pending"),
        ("Create the nginx index.html file", "pending"),
    )
    todos = [
        {
            "content": title,
            "active_form": f"Working on: {title}",
            "status": status,
        }
        for title, status in todo_plan
    ]
    dod = create_definition_of_done(prompt)
    sync_todos_to_definition_of_done(dod, todos)

    read_call = ToolCall(
        id="read-dup",
        name="read",
        arguments={"file_path": str(reference_file)},
    )
    skip_notice = (
        f"[Skipped - duplicate action: Already read {reference_file} recently "
        "without any intervening changes; "
        "reuse the earlier read result instead of rereading]"
    )
    duplicate_outcome = ToolExecutionOutcome(
        tool_call=read_call,
        state=ToolExecutionState.DUPLICATE,
        message=Message.tool_result_message(
            tool_call_id=read_call.id,
            display_content=skip_notice,
            result_content=skip_notice,
        ),
        event_content=skip_notice,
        is_error=False,
        result_output=skip_notice,
    )
    executor = FakeExecutor([duplicate_outcome])

    turn = TurnSummary(final_response="")
    await batch_runner.execute_batch(
        tool_calls=[read_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=turn,
        dod=dod,
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert len(steering) == 1
    handoff = steering[0]
    assert "Reuse the earlier observation instead of repeating it." in handoff
    assert f"Continue with the next pending item: `{directory_title}`" in handoff
    assert "Update `" not in handoff
    assert transient == []
1593
1594
@pytest.mark.asyncio
async def test_tool_batch_runner_successful_reference_read_prioritizes_concrete_missing_artifact(
    temp_dir: Path,
) -> None:
    """After a reference read, steering should target the planned file that is still missing.

    The first chapter exists and is recorded as touched, while ``index.html``
    is planned but absent. The steering must confirm the discovery todo, ask to
    create ``index.html``, and not fall back to the generic next pending todo.
    """

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        """Fail the test if the runner asks for a confidence score."""
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        """Fail the test if the runner asks for action verification."""
        raise AssertionError("Verification should not run for this scenario")

    nginx_dir = temp_dir / "Loader" / "guides" / "nginx"
    chapters_dir = nginx_dir / "chapters"
    chapters_dir.mkdir(parents=True)
    first_chapter = chapters_dir / "01-introduction.html"
    first_chapter.write_text("<html></html>\n")
    # Planned but intentionally never written.
    index_file = nginx_dir / "index.html"

    reference_file = (
        temp_dir / "Loader" / "guides" / "fortran" / "chapters" / "01-introduction.html"
    )
    reference_file.parent.mkdir(parents=True, exist_ok=True)
    reference_file.write_text("<h1>Introduction</h1>\n<p>Guide cadence.</p>\n")

    plan_lines = [
        "# Implementation Plan",
        "",
        "## File Changes",
        f"- `{nginx_dir}/`",
        f"- `{chapters_dir}/`",
        f"- `{index_file}`",
        f"- `{first_chapter}`",
        f"- `{chapters_dir / '02-installation.html'}`",
        "",
    ]
    plan_file = temp_dir / "implementation.md"
    plan_file.write_text("\n".join(plan_lines))

    context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )
    steering: list[str] = []
    transient: list[str] = []
    context.queue_steering_message_callback = steering.append
    context.queue_ephemeral_steering_message_callback = transient.append
    batch_runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))

    examine_title = (
        "Examine the existing Fortran guide structure to understand the format and cadence"
    )
    chapter_title = "Create each chapter file with appropriate content"
    style_title = "Ensure all files follow the same structure and style as the Fortran guide"
    todos = [
        {
            "content": title,
            "active_form": f"Working on: {title}",
            "status": "pending",
        }
        for title in (examine_title, chapter_title, style_title)
    ]
    dod = create_definition_of_done("Create a multi-file nginx guide.")
    dod.implementation_plan = str(plan_file)
    dod.touched_files.append(str(first_chapter))
    sync_todos_to_definition_of_done(dod, todos)

    read_call = ToolCall(
        id="read-reference-chapter",
        name="read",
        arguments={"file_path": str(reference_file)},
    )
    observation = "Observation [read]: Result: <h1>Introduction</h1>\n<p>Guide cadence.</p>\n"
    executed_outcome = ToolExecutionOutcome(
        tool_call=read_call,
        state=ToolExecutionState.EXECUTED,
        message=Message.tool_result_message(
            tool_call_id=read_call.id,
            display_content=observation,
            result_content=observation,
        ),
        event_content=observation,
        is_error=False,
        result_output=observation,
    )
    executor = FakeExecutor([executed_outcome])

    turn = TurnSummary(final_response="")
    await batch_runner.execute_batch(
        tool_calls=[read_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=turn,
        dod=dod,
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert steering
    progress_note = f"Confirmed progress: `{examine_title}`"
    assert any(progress_note in message for message in steering)
    assert any("Resume by creating `index.html` now." in message for message in steering)
    generic_hint = f"Continue with the next pending item: `{chapter_title}`"
    assert not any(generic_hint in message for message in steering)
    assert transient == []
1729
1730
@pytest.mark.asyncio
async def test_tool_batch_runner_duplicate_read_ignores_unplanned_expansion_after_plan_complete(
    temp_dir: Path,
) -> None:
    """With every planned file on disk, a duplicate read must not steer to an unplanned todo.

    The pending list starts with a chapter that is not in the plan; the single
    steering message is expected to name the verification item instead.
    """

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        """Fail the test if the runner asks for a confidence score."""
        raise AssertionError("Confidence scoring should not run for this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        """Fail the test if the runner asks for action verification."""
        raise AssertionError("Verification should not run for this scenario")

    nginx_dir = temp_dir / "guides" / "nginx"
    chapters_dir = nginx_dir / "chapters"
    nginx_dir.mkdir(parents=True)
    chapters_dir.mkdir()
    index_file = nginx_dir / "index.html"
    first_chapter = chapters_dir / "01-getting-started.html"
    second_chapter = chapters_dir / "02-installation.html"
    # Materialize every artifact the plan below lists.
    for target, body in (
        (index_file, "<html></html>\n"),
        (first_chapter, "<h1>One</h1>\n"),
        (second_chapter, "<h1>Two</h1>\n"),
    ):
        target.write_text(body)

    plan_lines = [
        "# Implementation Plan",
        "",
        "## File Changes",
        f"- `{nginx_dir}/`",
        f"- `{chapters_dir}/`",
        f"- `{index_file}`",
        f"- `{first_chapter}`",
        f"- `{second_chapter}`",
        "",
    ]
    plan_file = temp_dir / "implementation.md"
    plan_file.write_text("\n".join(plan_lines))

    context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )
    steering: list[str] = []
    transient: list[str] = []
    context.queue_steering_message_callback = steering.append
    context.queue_ephemeral_steering_message_callback = transient.append
    batch_runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))

    unplanned_item = "Create 07-performance-tuning.html"
    verification_item = "Verify all guide files are linked and complete"
    dod = create_definition_of_done("Create a multi-file nginx guide.")
    dod.implementation_plan = str(plan_file)
    dod.pending_items = [
        unplanned_item,
        verification_item,
        "Complete the requested work",
    ]

    read_call = ToolCall(
        id="read-dup",
        name="read",
        arguments={"file_path": str(first_chapter)},
    )
    skip_notice = (
        f"[Skipped - duplicate action: Already read {first_chapter} recently "
        "without any intervening changes; "
        "reuse the earlier read result instead of rereading]"
    )
    duplicate_outcome = ToolExecutionOutcome(
        tool_call=read_call,
        state=ToolExecutionState.DUPLICATE,
        message=Message.tool_result_message(
            tool_call_id=read_call.id,
            display_content=skip_notice,
            result_content=skip_notice,
        ),
        event_content=skip_notice,
        is_error=False,
        result_output=skip_notice,
    )
    executor = FakeExecutor([duplicate_outcome])

    turn = TurnSummary(final_response="")
    await batch_runner.execute_batch(
        tool_calls=[read_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=turn,
        dod=dod,
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert len(steering) == 1
    handoff = steering[0]
    assert verification_item in handoff
    assert unplanned_item not in handoff
    assert transient == []
1845
1846
@pytest.mark.asyncio
async def test_tool_batch_runner_duplicate_read_after_plan_complete_pushes_verification_handoff(
    temp_dir: Path,
) -> None:
    """A duplicate read after the plan is built should hand off to verification.

    All planned files exist and a verification command is configured; the one
    steering message must state that the planned artifacts exist, direct the
    agent to verification or final confirmation, and omit the unplanned todo.
    """

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        """Fail the test if the runner asks for a confidence score."""
        raise AssertionError("Confidence scoring should not run for this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        """Fail the test if the runner asks for action verification."""
        raise AssertionError("Verification should not run for this scenario")

    nginx_dir = temp_dir / "guides" / "nginx"
    chapters_dir = nginx_dir / "chapters"
    nginx_dir.mkdir(parents=True)
    chapters_dir.mkdir()
    index_file = nginx_dir / "index.html"
    first_chapter = chapters_dir / "01-getting-started.html"
    second_chapter = chapters_dir / "02-installation.html"
    # Materialize every artifact the plan below lists.
    for target, body in (
        (index_file, "<html></html>\n"),
        (first_chapter, "<h1>One</h1>\n"),
        (second_chapter, "<h1>Two</h1>\n"),
    ):
        target.write_text(body)

    plan_lines = [
        "# Implementation Plan",
        "",
        "## File Changes",
        f"- `{nginx_dir}/`",
        f"- `{chapters_dir}/`",
        f"- `{index_file}`",
        f"- `{first_chapter}`",
        f"- `{second_chapter}`",
        "",
    ]
    plan_file = temp_dir / "implementation.md"
    plan_file.write_text("\n".join(plan_lines))

    context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )
    steering: list[str] = []
    transient: list[str] = []
    context.queue_steering_message_callback = steering.append
    context.queue_ephemeral_steering_message_callback = transient.append
    batch_runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))

    unplanned_item = "Create 07-performance-tuning.html"
    dod = create_definition_of_done("Create a multi-file nginx guide.")
    dod.implementation_plan = str(plan_file)
    dod.verification_commands = [f"ls -la {nginx_dir}"]
    dod.pending_items = [unplanned_item, "Complete the requested work"]

    read_call = ToolCall(
        id="read-dup",
        name="read",
        arguments={"file_path": str(first_chapter)},
    )
    skip_notice = (
        f"[Skipped - duplicate action: Already read {first_chapter} recently "
        "without any intervening changes; "
        "reuse the earlier read result instead of rereading]"
    )
    duplicate_outcome = ToolExecutionOutcome(
        tool_call=read_call,
        state=ToolExecutionState.DUPLICATE,
        message=Message.tool_result_message(
            tool_call_id=read_call.id,
            display_content=skip_notice,
            result_content=skip_notice,
        ),
        event_content=skip_notice,
        is_error=False,
        result_output=skip_notice,
    )
    executor = FakeExecutor([duplicate_outcome])

    turn = TurnSummary(final_response="")
    await batch_runner.execute_batch(
        tool_calls=[read_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=turn,
        dod=dod,
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert len(steering) == 1
    handoff = steering[0]
    assert "All explicitly planned artifacts already exist on disk." in handoff
    assert "Move to verification or final confirmation using the files already on disk." in handoff
    assert unplanned_item not in handoff
    assert transient == []
1965
1966
@pytest.mark.asyncio
async def test_tool_batch_runner_duplicate_read_after_plan_complete_ignores_stale_creation_todos(
    temp_dir: Path,
) -> None:
    """Stale "create" todos for files that already exist must not appear in the handoff.

    The pending items still ask for both chapter files, which are already on
    disk; the single steering message should move to verification and mention
    neither stale todo.
    """

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        """Fail the test if the runner asks for a confidence score."""
        raise AssertionError("Confidence scoring should not run for this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        """Fail the test if the runner asks for action verification."""
        raise AssertionError("Verification should not run for this scenario")

    nginx_dir = temp_dir / "guides" / "nginx"
    chapters_dir = nginx_dir / "chapters"
    nginx_dir.mkdir(parents=True)
    chapters_dir.mkdir()
    index_file = nginx_dir / "index.html"
    first_chapter = chapters_dir / "01-getting-started.html"
    second_chapter = chapters_dir / "02-installation.html"
    # Materialize every artifact the plan below lists.
    for target, body in (
        (index_file, "<html></html>\n"),
        (first_chapter, "<h1>One</h1>\n"),
        (second_chapter, "<h1>Two</h1>\n"),
    ):
        target.write_text(body)

    plan_lines = [
        "# Implementation Plan",
        "",
        "## File Changes",
        f"- `{nginx_dir}/`",
        f"- `{chapters_dir}/`",
        f"- `{index_file}`",
        f"- `{first_chapter}`",
        f"- `{second_chapter}`",
        "",
    ]
    plan_file = temp_dir / "implementation.md"
    plan_file.write_text("\n".join(plan_lines))

    context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )
    steering: list[str] = []
    transient: list[str] = []
    context.queue_steering_message_callback = steering.append
    context.queue_ephemeral_steering_message_callback = transient.append
    batch_runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))

    stale_items = [
        "Create 01-getting-started.html",
        "Creating 02-installation.html",
    ]
    dod = create_definition_of_done("Create a multi-file nginx guide.")
    dod.implementation_plan = str(plan_file)
    dod.verification_commands = [f"ls -la {nginx_dir}"]
    dod.pending_items = [*stale_items, "Complete the requested work"]

    read_call = ToolCall(
        id="read-dup-built-stale",
        name="read",
        arguments={"file_path": str(first_chapter)},
    )
    skip_notice = (
        f"[Skipped - duplicate action: Already read {first_chapter} recently "
        "without any intervening changes; "
        "reuse the earlier read result instead of rereading]"
    )
    duplicate_outcome = ToolExecutionOutcome(
        tool_call=read_call,
        state=ToolExecutionState.DUPLICATE,
        message=Message.tool_result_message(
            tool_call_id=read_call.id,
            display_content=skip_notice,
            result_content=skip_notice,
        ),
        event_content=skip_notice,
        is_error=False,
        result_output=skip_notice,
    )
    executor = FakeExecutor([duplicate_outcome])

    turn = TurnSummary(final_response="")
    await batch_runner.execute_batch(
        tool_calls=[read_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=turn,
        dod=dod,
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert len(steering) == 1
    handoff = steering[0]
    assert "All explicitly planned artifacts already exist on disk." in handoff
    assert "Move to verification or final confirmation using the files already on disk." in handoff
    assert "Create 01-getting-started.html" not in handoff
    assert "Creating 02-installation.html" not in handoff
    assert transient == []
2087
2088
@pytest.mark.asyncio
async def test_tool_batch_runner_successful_read_after_plan_complete_pushes_review_handoff(
    temp_dir: Path,
) -> None:
    """A read after the plan is fully built queues one ephemeral review handoff.

    Every path listed in the implementation plan exists on disk before the
    batch runs, while the todo list still holds a "Create ..." item and a
    review item. The test expects no persistent steering message and exactly
    one ephemeral message that names the review item but not the create item.
    """

    async def assess_confidence(
        tool_name: str, tool_args: dict, context: str
    ) -> ConfidenceAssessment:
        # Fails the test if the runner asks for confidence scoring.
        raise AssertionError("Confidence scoring should not run for this scenario")

    async def verify_action(
        tool_name: str, tool_args: dict, result: str, expected: str = ""
    ) -> ActionVerification:
        # Fails the test if the runner asks for action verification.
        raise AssertionError("Verification should not run for this scenario")

    # Build the complete guide tree up front so nothing planned is missing.
    nginx_dir = temp_dir / "guides" / "nginx"
    chapter_dir = nginx_dir / "chapters"
    nginx_dir.mkdir(parents=True)
    chapter_dir.mkdir()
    index_file = nginx_dir / "index.html"
    first_chapter = chapter_dir / "01-getting-started.html"
    second_chapter = chapter_dir / "02-installation.html"
    for artifact, html in (
        (index_file, "<html></html>\n"),
        (first_chapter, "<h1>One</h1>\n"),
        (second_chapter, "<h1>Two</h1>\n"),
    ):
        artifact.write_text(html)

    plan_lines = [
        "# Implementation Plan",
        "",
        "## File Changes",
        f"- `{nginx_dir}/`",
        f"- `{chapter_dir}/`",
        f"- `{index_file}`",
        f"- `{first_chapter}`",
        f"- `{second_chapter}`",
        "",
    ]
    plan_file = temp_dir / "implementation.md"
    plan_file.write_text("\n".join(plan_lines))

    runtime_context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )
    persistent_queue: list[str] = []
    ephemeral_queue: list[str] = []
    runtime_context.queue_steering_message_callback = persistent_queue.append
    runtime_context.queue_ephemeral_steering_message_callback = ephemeral_queue.append
    runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))

    dod = create_definition_of_done("Create a multi-file nginx guide.")
    dod.implementation_plan = str(plan_file)
    dod.verification_commands = [f"ls -la {nginx_dir}"]
    pending_todos = [
        {
            "content": "Create 01-getting-started.html",
            "active_form": "Creating 01-getting-started.html",
            "status": "pending",
        },
        {
            "content": "Ensure all files are properly linked and formatted consistently",
            "active_form": "Reviewing guide consistency and linkage",
            "status": "pending",
        },
    ]
    sync_todos_to_definition_of_done(dod, pending_todos)

    read_call = ToolCall(
        id="read-built-review",
        name="read",
        arguments={"file_path": str(first_chapter)},
    )
    read_outcome = tool_outcome(
        tool_call=read_call,
        output=first_chapter.read_text(),
        is_error=False,
    )
    executor = FakeExecutor([read_outcome])

    await runner.execute_batch(
        tool_calls=[read_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=TurnSummary(final_response=""),
        dod=dod,
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert persistent_queue == []
    assert len(ephemeral_queue) == 1
    handoff = ephemeral_queue[0]
    expected_fragments = (
        "All explicitly planned artifacts already exist.",
        "Ensure all files are properly linked and formatted consistently",
        "do not keep broad-rereading the output set",
        "If no specific mismatch remains, move to verification now.",
    )
    for fragment in expected_fragments:
        assert fragment in handoff
    assert "Create 01-getting-started.html" not in handoff
2200
2201
@pytest.mark.asyncio
async def test_tool_batch_runner_successful_read_after_plan_complete_switches_to_verify(
    temp_dir: Path,
) -> None:
    """A read after the plan is fully built, with no todos, moves to verify mode.

    All planned paths exist on disk and no todo items are synced into the
    definition of done. The test expects a single persistent steering message
    announcing verification, no ephemeral message, and
    ``context.workflow_mode == "verify"`` after the batch.
    """

    async def assess_confidence(
        tool_name: str, tool_args: dict, context: str
    ) -> ConfidenceAssessment:
        # Fails the test if the runner asks for confidence scoring.
        raise AssertionError("Confidence scoring should not run for this scenario")

    async def verify_action(
        tool_name: str, tool_args: dict, result: str, expected: str = ""
    ) -> ActionVerification:
        # Fails the test if the runner asks for action verification.
        raise AssertionError("Verification should not run for this scenario")

    # Build the complete guide tree up front so nothing planned is missing.
    nginx_dir = temp_dir / "guides" / "nginx"
    chapter_dir = nginx_dir / "chapters"
    nginx_dir.mkdir(parents=True)
    chapter_dir.mkdir()
    index_file = nginx_dir / "index.html"
    first_chapter = chapter_dir / "01-getting-started.html"
    second_chapter = chapter_dir / "02-installation.html"
    for artifact, html in (
        (index_file, "<html></html>\n"),
        (first_chapter, "<h1>One</h1>\n"),
        (second_chapter, "<h1>Two</h1>\n"),
    ):
        artifact.write_text(html)

    plan_lines = [
        "# Implementation Plan",
        "",
        "## File Changes",
        f"- `{nginx_dir}/`",
        f"- `{chapter_dir}/`",
        f"- `{index_file}`",
        f"- `{first_chapter}`",
        f"- `{second_chapter}`",
        "",
    ]
    plan_file = temp_dir / "implementation.md"
    plan_file.write_text("\n".join(plan_lines))

    runtime_context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )
    persistent_queue: list[str] = []
    ephemeral_queue: list[str] = []
    runtime_context.queue_steering_message_callback = persistent_queue.append
    runtime_context.queue_ephemeral_steering_message_callback = ephemeral_queue.append
    runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))

    dod = create_definition_of_done("Create a multi-file nginx guide.")
    dod.implementation_plan = str(plan_file)
    dod.verification_commands = [f"ls -la {nginx_dir}"]

    read_call = ToolCall(
        id="read-built-verify",
        name="read",
        arguments={"file_path": str(first_chapter)},
    )
    read_outcome = tool_outcome(
        tool_call=read_call,
        output=first_chapter.read_text(),
        is_error=False,
    )
    executor = FakeExecutor([read_outcome])

    await runner.execute_batch(
        tool_calls=[read_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=TurnSummary(final_response=""),
        dod=dod,
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert len(persistent_queue) == 1
    handoff = persistent_queue[0]
    for fragment in (
        "All explicitly planned artifacts already exist.",
        "Verification should run next.",
        "stop broad rereads",
    ):
        assert fragment in handoff
    assert ephemeral_queue == []
    assert runtime_context.workflow_mode == "verify"
2296
2297
@pytest.mark.asyncio
async def test_tool_batch_runner_observation_handoff_pushes_mutation_step(
    temp_dir: Path,
) -> None:
    """Reading a reference file with an "examine" todo pending pushes the next todo.

    The todo list holds an examine-the-Fortran-guide item followed by a
    create-index item. After a successful read of the reference chapter, the
    test expects some persistent steering message to name the create-index
    item and to tell the model to stop gathering reference material, with no
    ephemeral message queued.
    """

    async def assess_confidence(
        tool_name: str, tool_args: dict, context: str
    ) -> ConfidenceAssessment:
        # Fails the test if the runner asks for confidence scoring.
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def verify_action(
        tool_name: str, tool_args: dict, result: str, expected: str = ""
    ) -> ActionVerification:
        # Fails the test if the runner asks for action verification.
        raise AssertionError("Verification should not run for this scenario")

    # The same text is written to disk and returned as the scripted read output.
    reference_html = "<h1>Introduction</h1>\n<p>Guide cadence.</p>\n"
    reference_file = temp_dir / "fortran" / "chapters" / "01-introduction.html"
    reference_file.parent.mkdir(parents=True)
    reference_file.write_text(reference_html)

    runtime_context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )
    persistent_queue: list[str] = []
    ephemeral_queue: list[str] = []
    runtime_context.queue_steering_message_callback = persistent_queue.append
    runtime_context.queue_ephemeral_steering_message_callback = ephemeral_queue.append
    runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))

    dod = create_definition_of_done("Create a multi-file nginx guide.")
    todo_contents = (
        "Examine the existing Fortran guide structure to understand the cadence and format",
        "Create the nginx index.html file",
    )
    pending_todos = [
        {
            "content": content,
            "active_form": f"Working on: {content}",
            "status": "pending",
        }
        for content in todo_contents
    ]
    sync_todos_to_definition_of_done(dod, pending_todos)

    read_call = ToolCall(
        id="read-reference",
        name="read",
        arguments={"file_path": str(reference_file)},
    )
    read_outcome = tool_outcome(
        tool_call=read_call,
        output=reference_html,
        is_error=False,
    )
    executor = FakeExecutor([read_outcome])

    await runner.execute_batch(
        tool_calls=[read_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=TurnSummary(final_response=""),
        dod=dod,
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    next_item_hint = "Continue with the next pending item: `Create the nginx index.html file`"
    stop_reading_hint = "stop gathering more reference material and perform the change now"
    assert any(next_item_hint in queued for queued in persistent_queue)
    assert any(stop_reading_hint in queued for queued in persistent_queue)
    assert ephemeral_queue == []
2390
2391
@pytest.mark.asyncio
async def test_tool_batch_runner_discovery_completion_handoff_stays_persistent(
    temp_dir: Path,
) -> None:
    """The handoff after a discovery read is queued as a persistent message.

    The todo list holds a "First, examine ..." item followed by a
    create-directory item. After a successful read of the reference chapter,
    the test expects a persistent steering message naming the
    create-directory item and no ephemeral message.
    """

    async def assess_confidence(
        tool_name: str, tool_args: dict, context: str
    ) -> ConfidenceAssessment:
        # Fails the test if the runner asks for confidence scoring.
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def verify_action(
        tool_name: str, tool_args: dict, result: str, expected: str = ""
    ) -> ActionVerification:
        # Fails the test if the runner asks for action verification.
        raise AssertionError("Verification should not run for this scenario")

    # The same text is written to disk and returned as the scripted read output.
    reference_html = "<h1>Introduction</h1>\n<p>Guide cadence.</p>\n"
    reference_file = temp_dir / "fortran" / "chapters" / "01-introduction.html"
    reference_file.parent.mkdir(parents=True)
    reference_file.write_text(reference_html)

    runtime_context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )
    persistent_queue: list[str] = []
    ephemeral_queue: list[str] = []
    runtime_context.queue_steering_message_callback = persistent_queue.append
    runtime_context.queue_ephemeral_steering_message_callback = ephemeral_queue.append
    runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))

    dod = create_definition_of_done("Create a multi-file nginx guide.")
    todo_contents = (
        "First, examine the existing fortran guide structure and content",
        "Create the nginx directory structure",
    )
    pending_todos = [
        {
            "content": content,
            "active_form": f"Working on: {content}",
            "status": "pending",
        }
        for content in todo_contents
    ]
    sync_todos_to_definition_of_done(dod, pending_todos)

    read_call = ToolCall(
        id="read-reference",
        name="read",
        arguments={"file_path": str(reference_file)},
    )
    read_outcome = tool_outcome(
        tool_call=read_call,
        output=reference_html,
        is_error=False,
    )
    executor = FakeExecutor([read_outcome])

    await runner.execute_batch(
        tool_calls=[read_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=TurnSummary(final_response=""),
        dod=dod,
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    next_item_hint = (
        "Continue with the next pending item: `Create the nginx directory structure`"
    )
    assert persistent_queue
    assert any(next_item_hint in queued for queued in persistent_queue)
    assert ephemeral_queue == []
2481
2482
@pytest.mark.asyncio
async def test_tool_batch_runner_missing_artifact_nudge_names_next_file_after_setup_mkdir(
    temp_dir: Path,
) -> None:
    """After a scripted ``mkdir -p``, the nudge names ``index.html`` as the next file.

    The plan lists the chapters directory and ``index.html``; neither is
    created on disk by the test itself. The scripted bash outcome reports
    success with empty output, and the test expects the last persistent
    steering message to announce that directory setup is complete and to point
    at ``index.html``, with no ephemeral message.
    """

    async def assess_confidence(
        tool_name: str, tool_args: dict, context: str
    ) -> ConfidenceAssessment:
        # Fails the test if the runner asks for confidence scoring.
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def verify_action(
        tool_name: str, tool_args: dict, result: str, expected: str = ""
    ) -> ActionVerification:
        # Fails the test if the runner asks for action verification.
        raise AssertionError("Verification should not run for this scenario")

    nginx_dir = temp_dir / "Loader" / "guides" / "nginx"
    chapter_dir = nginx_dir / "chapters"
    index_file = nginx_dir / "index.html"
    plan_lines = [
        "# Implementation Plan",
        "",
        "## File Changes",
        f"- `{chapter_dir}/`",
        f"- `{index_file}`",
        "",
    ]
    plan_file = temp_dir / "implementation.md"
    plan_file.write_text("\n".join(plan_lines))

    runtime_context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )
    persistent_queue: list[str] = []
    ephemeral_queue: list[str] = []
    runtime_context.queue_steering_message_callback = persistent_queue.append
    runtime_context.queue_ephemeral_steering_message_callback = ephemeral_queue.append
    runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))

    dod = create_definition_of_done("Create a multi-file nginx guide.")
    dod.implementation_plan = str(plan_file)
    pending_todos = [
        {
            "content": "Create the nginx directory structure",
            "active_form": "Creating the nginx directory structure",
            "status": "pending",
        },
        {
            "content": "Develop the main index.html file with proper structure",
            "active_form": "Developing the main index.html file with proper structure",
            "status": "pending",
        },
    ]
    sync_todos_to_definition_of_done(dod, pending_todos)

    mkdir_call = ToolCall(
        id="mkdir-nginx",
        name="bash",
        arguments={"command": f"mkdir -p {chapter_dir}"},
    )
    mkdir_outcome = tool_outcome(tool_call=mkdir_call, output="", is_error=False)
    executor = FakeExecutor([mkdir_outcome])

    await runner.execute_batch(
        tool_calls=[mkdir_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=TurnSummary(final_response=""),
        dod=dod,
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert persistent_queue
    nudge = persistent_queue[-1]
    for fragment in (
        "Directory setup is complete.",
        "Next step: create `index.html`.",
        "Write a compact but real initial version of that file now",
    ):
        assert fragment in nudge
    assert ephemeral_queue == []
2585
2586
@pytest.mark.asyncio
async def test_tool_batch_runner_first_chapter_handoff_stays_persistent_until_substantive_output_exists(
    temp_dir: Path,
) -> None:
    """After a scripted index write, the handoff to the first chapter is persistent.

    The chapters directory exists, but neither ``index.html`` nor
    ``01-introduction.html`` is written to disk by the test; the write outcome
    is scripted. The test expects the last persistent steering message to
    confirm progress, name ``01-introduction.html`` with its resolved path, and
    omit the "compact but real initial version" wording; no ephemeral message
    may be queued.
    """

    async def assess_confidence(
        tool_name: str, tool_args: dict, context: str
    ) -> ConfidenceAssessment:
        # Fails the test if the runner asks for confidence scoring.
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def verify_action(
        tool_name: str, tool_args: dict, result: str, expected: str = ""
    ) -> ActionVerification:
        # Fails the test if the runner asks for action verification.
        raise AssertionError("Verification should not run for this scenario")

    nginx_dir = temp_dir / "guides" / "nginx"
    chapter_dir = nginx_dir / "chapters"
    chapter_dir.mkdir(parents=True)
    index_file = nginx_dir / "index.html"
    intro_chapter = chapter_dir / "01-introduction.html"

    plan_lines = [
        "# Implementation Plan",
        "",
        "## File Changes",
        f"- `{chapter_dir}/`",
        f"- `{index_file}`",
        f"- `{intro_chapter}`",
        "",
    ]
    plan_file = temp_dir / "implementation.md"
    plan_file.write_text("\n".join(plan_lines))

    runtime_context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )
    persistent_queue: list[str] = []
    ephemeral_queue: list[str] = []
    runtime_context.queue_steering_message_callback = persistent_queue.append
    runtime_context.queue_ephemeral_steering_message_callback = ephemeral_queue.append
    runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))

    dod = create_definition_of_done("Create a multi-file nginx guide.")
    dod.implementation_plan = str(plan_file)
    pending_todos = [
        {
            "content": "Create the main index.html file with proper structure",
            "active_form": "Creating the main index.html file with proper structure",
            "status": "pending",
        },
        {
            "content": "Create each chapter file with appropriate content",
            "active_form": "Creating each chapter file with appropriate content",
            "status": "pending",
        },
    ]
    sync_todos_to_definition_of_done(dod, pending_todos)

    write_call = ToolCall(
        id="write-index",
        name="write",
        arguments={
            "file_path": str(index_file),
            "content": "<html></html>\n",
        },
    )
    write_outcome = tool_outcome(
        tool_call=write_call,
        output=f"Successfully wrote 14 bytes to {index_file}",
        is_error=False,
    )
    executor = FakeExecutor([write_outcome])

    await runner.execute_batch(
        tool_calls=[write_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=TurnSummary(final_response=""),
        dod=dod,
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert persistent_queue
    assert ephemeral_queue == []
    handoff = persistent_queue[-1]
    resolved_intro = intro_chapter.resolve(strict=False)
    for fragment in (
        "Confirmed progress:",
        "Next step: create `01-introduction.html`.",
        f"Prefer one `write(file_path=..., content=...)` call for `{resolved_intro}` now.",
        "Do not reread reference material or spend the next turn on bookkeeping.",
    ):
        assert fragment in handoff
    assert "Write a compact but real initial version of that file now" not in handoff
2701
2702
@pytest.mark.asyncio
async def test_tool_batch_runner_directory_handoff_uses_home_relative_path(
    temp_dir: Path,
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """With ``HOME`` set to the temp dir, the handoff shows a ``~/``-relative path.

    ``HOME`` is pointed at the resolved temp directory before anything else
    runs. After a scripted ``mkdir -p`` of the chapters directory, the test
    expects the last persistent steering message to name ``index.html`` and to
    include the path rendered as ``~/Loader/guides/nginx/index.html``.
    """
    home_dir = temp_dir.resolve(strict=False)
    monkeypatch.setenv("HOME", str(home_dir))

    async def assess_confidence(
        tool_name: str, tool_args: dict, context: str
    ) -> ConfidenceAssessment:
        # Fails the test if the runner asks for confidence scoring.
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def verify_action(
        tool_name: str, tool_args: dict, result: str, expected: str = ""
    ) -> ActionVerification:
        # Fails the test if the runner asks for action verification.
        raise AssertionError("Verification should not run for this scenario")

    nginx_dir = temp_dir / "Loader" / "guides" / "nginx"
    chapter_dir = nginx_dir / "chapters"
    index_file = nginx_dir / "index.html"

    plan_lines = [
        "# Implementation Plan",
        "",
        "## File Changes",
        f"- `{chapter_dir}/`",
        f"- `{index_file}`",
        "",
    ]
    plan_file = temp_dir / "implementation.md"
    plan_file.write_text("\n".join(plan_lines))

    runtime_context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )
    persistent_queue: list[str] = []
    runtime_context.queue_steering_message_callback = persistent_queue.append
    runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))

    dod = create_definition_of_done("Create a multi-file nginx guide.")
    dod.implementation_plan = str(plan_file)
    pending_todos = [
        {
            "content": "Create the nginx directory structure",
            "active_form": "Creating the nginx directory structure",
            "status": "pending",
        },
        {
            "content": "Develop the main index.html file with proper structure",
            "active_form": "Developing the main index.html file with proper structure",
            "status": "pending",
        },
    ]
    sync_todos_to_definition_of_done(dod, pending_todos)

    mkdir_call = ToolCall(
        id="mkdir-nginx-home",
        name="bash",
        arguments={"command": f"mkdir -p {chapter_dir}"},
    )
    mkdir_outcome = tool_outcome(tool_call=mkdir_call, output="", is_error=False)
    executor = FakeExecutor([mkdir_outcome])

    await runner.execute_batch(
        tool_calls=[mkdir_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=TurnSummary(final_response=""),
        dod=dod,
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert persistent_queue
    handoff = persistent_queue[-1]
    for fragment in (
        "Next step: create `index.html`.",
        "`~/Loader/guides/nginx/index.html`",
        "Write a compact but real initial version of that file now",
    ):
        assert fragment in handoff
2807
2808
@pytest.mark.asyncio
async def test_tool_batch_runner_redirects_post_write_self_audit_to_next_missing_artifact(
    temp_dir: Path,
) -> None:
    """Re-reading a just-written index redirects to the missing chapter file.

    ``index.html`` exists on disk and is recorded in ``dod.touched_files``,
    while the planned ``01-introduction.html`` does not exist. After a
    successful read of the index, the test expects the last persistent
    steering message to say the index contents are already known and to
    resume with ``01-introduction.html``; no ephemeral message may be queued.
    """

    async def assess_confidence(
        tool_name: str, tool_args: dict, context: str
    ) -> ConfidenceAssessment:
        # Fails the test if the runner asks for confidence scoring.
        raise AssertionError("Confidence scoring should not run in this scenario")

    async def verify_action(
        tool_name: str, tool_args: dict, result: str, expected: str = ""
    ) -> ActionVerification:
        # Fails the test if the runner asks for action verification.
        raise AssertionError("Verification should not run in this scenario")

    nginx_dir = temp_dir / "guides" / "nginx"
    chapter_dir = nginx_dir / "chapters"
    chapter_dir.mkdir(parents=True)
    index_file = nginx_dir / "index.html"
    index_lines = (
        "<html>",
        '<a href="chapters/01-introduction.html">Chapter 1: Introduction to Nginx</a>',
        '<a href="chapters/02-installation.html">Chapter 2: Installation and Setup</a>',
        "</html>",
    )
    # Each line is newline-terminated, including the last one.
    index_file.write_text("".join(f"{line}\n" for line in index_lines))

    intro_chapter = chapter_dir / "01-introduction.html"
    plan_lines = [
        "# Implementation Plan",
        "",
        "## File Changes",
        f"- `{nginx_dir}/`",
        f"- `{chapter_dir}/`",
        f"- `{index_file}`",
        f"- `{intro_chapter}`",
        "",
    ]
    plan_file = temp_dir / "implementation.md"
    plan_file.write_text("\n".join(plan_lines))

    runtime_context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )
    persistent_queue: list[str] = []
    ephemeral_queue: list[str] = []
    runtime_context.queue_steering_message_callback = persistent_queue.append
    runtime_context.queue_ephemeral_steering_message_callback = ephemeral_queue.append
    runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))

    dod = create_definition_of_done("Create a multi-file nginx guide.")
    dod.implementation_plan = str(plan_file)
    dod.touched_files.append(str(index_file))
    dod.completed_items.append("Develop the main index.html file for the nginx guide")
    dod.pending_items.append("Create chapter files for the nginx guide")

    read_call = ToolCall(
        id="read-index-self-audit",
        name="read",
        arguments={"file_path": str(index_file)},
    )
    read_outcome = tool_outcome(
        tool_call=read_call,
        output="1\t<html>\n",
        is_error=False,
    )
    executor = FakeExecutor([read_outcome])

    await runner.execute_batch(
        tool_calls=[read_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=TurnSummary(final_response=""),
        dod=dod,
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert persistent_queue
    redirect = persistent_queue[-1]
    for fragment in (
        "You already have the current contents of `index.html` from the successful write.",
        "Resume by creating `01-introduction.html` now.",
        "Do not spend another turn rereading the file you just wrote or on TodoWrite alone.",
    ):
        assert fragment in redirect
    assert ephemeral_queue == []
2915
2916
@pytest.mark.asyncio
async def test_tool_batch_runner_preserves_first_file_handoff_after_recovery_prompt(
    temp_dir: Path,
) -> None:
    """The first-chapter handoff is still persistent after a recovery prompt.

    The session starts with one user message carrying the
    ``[EMPTY ASSISTANT RESPONSE]`` recovery text. After a scripted index
    write, the test expects the last persistent steering message to name
    ``01-introduction.html`` and to omit the "compact but real initial
    version" wording, with no ephemeral message queued.
    """

    async def assess_confidence(
        tool_name: str, tool_args: dict, context: str
    ) -> ConfidenceAssessment:
        # Fails the test if the runner asks for confidence scoring.
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def verify_action(
        tool_name: str, tool_args: dict, result: str, expected: str = ""
    ) -> ActionVerification:
        # Fails the test if the runner asks for action verification.
        raise AssertionError("Verification should not run for this scenario")

    nginx_dir = temp_dir / "guides" / "nginx"
    chapter_dir = nginx_dir / "chapters"
    chapter_dir.mkdir(parents=True)
    index_file = nginx_dir / "index.html"
    intro_chapter = chapter_dir / "01-introduction.html"

    plan_lines = [
        "# Implementation Plan",
        "",
        "## File Changes",
        f"- `{chapter_dir}/`",
        f"- `{index_file}`",
        f"- `{intro_chapter}`",
        "",
    ]
    plan_file = temp_dir / "implementation.md"
    plan_file.write_text("\n".join(plan_lines))

    recovery_prompt = Message(
        role=Role.USER,
        content=(
            "[EMPTY ASSISTANT RESPONSE]\n"
            "Respond with that concrete mutation tool call now. Do not return an empty response."
        ),
    )
    runtime_context = build_context(
        temp_dir=temp_dir,
        messages=[recovery_prompt],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )
    persistent_queue: list[str] = []
    ephemeral_queue: list[str] = []
    runtime_context.queue_steering_message_callback = persistent_queue.append
    runtime_context.queue_ephemeral_steering_message_callback = ephemeral_queue.append
    runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))

    dod = create_definition_of_done("Create a multi-file nginx guide.")
    dod.implementation_plan = str(plan_file)
    pending_todos = [
        {
            "content": "Create the main index.html file with proper structure",
            "active_form": "Creating the main index.html file with proper structure",
            "status": "pending",
        },
        {
            "content": "Create each chapter file with appropriate content",
            "active_form": "Creating each chapter file with appropriate content",
            "status": "pending",
        },
    ]
    sync_todos_to_definition_of_done(dod, pending_todos)

    write_call = ToolCall(
        id="write-index-recovered",
        name="write",
        arguments={
            "file_path": str(index_file),
            "content": "<html></html>\n",
        },
    )
    write_outcome = tool_outcome(
        tool_call=write_call,
        output=f"Successfully wrote 14 bytes to {index_file}",
        is_error=False,
    )
    executor = FakeExecutor([write_outcome])

    await runner.execute_batch(
        tool_calls=[write_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=TurnSummary(final_response=""),
        dod=dod,
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert persistent_queue
    assert ephemeral_queue == []
    handoff = persistent_queue[-1]
    assert "Next step: create `01-introduction.html`." in handoff
    assert "Write a compact but real initial version of that file now" not in handoff
3033
3034
@pytest.mark.asyncio
async def test_tool_batch_runner_todowrite_uses_concrete_output_language_for_aggregate_chapter_step(
    temp_dir: Path,
) -> None:
    """TodoWrite follow-up names a concrete chapter file, not the aggregate todo.

    The index page links two chapter files that have not been written, and the
    only pending todo is the generic "Create chapter files ..." item.  After a
    successful ``TodoWrite`` the queued steering message must point at
    ``01-introduction.html`` and must not echo the generic pending item.
    """

    async def reject_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should not run in this scenario")

    async def reject_verification(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run in this scenario")

    def fresh_todos() -> list[dict[str, str]]:
        # Return brand-new dicts on every call so the DoD sync and the tool
        # call never share todo objects.
        return [
            {
                "content": "Develop the main index.html file with proper structure",
                "active_form": "Developing the main index.html file with proper structure",
                "status": "completed",
            },
            {
                "content": "Create chapter files with content and structure",
                "active_form": "Creating chapter files with content and structure",
                "status": "pending",
            },
        ]

    # On-disk layout: an index that links chapters which do not exist yet.
    nginx_dir = temp_dir / "guides" / "nginx"
    chapters_dir = nginx_dir / "chapters"
    chapters_dir.mkdir(parents=True)
    index_file = nginx_dir / "index.html"
    index_lines = (
        "<html>",
        '<a href="chapters/01-introduction.html">Chapter 1: Introduction to Nginx</a>',
        '<a href="chapters/02-installation.html">Chapter 2: Installation and Setup</a>',
        "</html>",
    )
    index_file.write_text("".join(f"{line}\n" for line in index_lines))

    plan_file = temp_dir / "implementation.md"
    plan_lines = (
        "# Implementation Plan",
        "",
        "## File Changes",
        f"- `{nginx_dir}/`",
        f"- `{chapters_dir}/`",
        f"- `{index_file}`",
        "",
    )
    plan_file.write_text("\n".join(plan_lines))

    runtime_context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=reject_confidence,
        verify_action=reject_verification,
    )
    steering: list[str] = []
    runtime_context.queue_steering_message_callback = steering.append
    runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))

    dod = create_definition_of_done("Create a multi-file nginx guide.")
    dod.implementation_plan = str(plan_file)
    dod.touched_files.append(str(index_file))
    sync_todos_to_definition_of_done(dod, fresh_todos())

    # The same list object backs both the call arguments and the outcome metadata.
    todos = fresh_todos()
    todo_call = ToolCall(
        id="todo-aggregate",
        name="TodoWrite",
        arguments={"todos": todos},
    )
    scripted_outcome = tool_outcome(
        tool_call=todo_call,
        output="Todos updated",
        is_error=False,
        metadata={"new_todos": todos},
    )
    executor = FakeExecutor([scripted_outcome])

    await runner.execute_batch(
        tool_calls=[todo_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=TurnSummary(final_response=""),
        dod=dod,
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert steering
    final_nudge = steering[-1]
    assert "Todo tracking is updated." in final_nudge
    assert "Next step: create `01-introduction.html`." in final_nudge
    generic_resume = (
        "Continue with the next pending item: `Create chapter files with content and structure`."
    )
    assert generic_resume not in final_nudge
3165
3166
@pytest.mark.asyncio
async def test_duplicate_observation_nudge_prioritizes_missing_artifact_over_review(
    temp_dir: Path,
) -> None:
    """Duplicate-read nudge steers toward the missing planned file, not the review todo.

    The plan lists ``06-ssl-configuration.html`` which is absent on disk, while
    the first pending todo is a consistency/review item.  The nudge queued for
    a repeated ``read`` of the index must mention the missing chapter and must
    not tell the agent to continue with the review item.
    """

    async def reject_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def reject_verification(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run for this scenario")

    nginx_dir = temp_dir / "guides" / "nginx"
    chapters_dir = nginx_dir / "chapters"
    chapters_dir.mkdir(parents=True)
    index_file = nginx_dir / "index.html"
    first_chapter = chapters_dir / "01-getting-started.html"
    # Planned but deliberately never written.
    ssl_chapter = chapters_dir / "06-ssl-configuration.html"
    first_chapter.write_text("<h1>One</h1>\n")
    index_file.write_text('<a href="chapters/01-getting-started.html">One</a>\n')

    plan_file = temp_dir / "implementation.md"
    plan_lines = (
        "# Implementation Plan",
        "",
        "## File Changes",
        f"- `{index_file}`",
        f"- `{first_chapter}`",
        f"- `{ssl_chapter}`",
        "",
    )
    plan_file.write_text("\n".join(plan_lines))

    runtime_context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=reject_confidence,
        verify_action=reject_verification,
        auto_recover=False,
    )
    persistent_messages: list[str] = []
    ephemeral_messages: list[str] = []
    runtime_context.queue_steering_message_callback = persistent_messages.append
    runtime_context.queue_ephemeral_steering_message_callback = ephemeral_messages.append
    runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))

    dod = create_definition_of_done("Create a multi-file nginx guide.")
    dod.implementation_plan = str(plan_file)
    pending_contents = (
        "Ensure all files are properly linked and formatted consistently",
        "Create the final chapter (06-ssl-configuration.html)",
    )
    sync_todos_to_definition_of_done(
        dod,
        [
            {
                "content": content,
                "active_form": f"Working on: {content}",
                "status": "pending",
            }
            for content in pending_contents
        ],
    )

    # The prioritisation helper must agree before the nudge itself is checked.
    assert tool_batches_should_prioritize_missing_artifact(
        dod=dod,
        next_pending=dod.pending_items[0],
        missing_artifact=(ssl_chapter, False),
        project_root=temp_dir,
    )

    repeated_read = ToolCall(
        id="dup-read",
        name="read",
        arguments={"file_path": str(index_file)},
    )
    runner._queue_duplicate_observation_nudge(repeated_read, dod=dod)  # type: ignore[attr-defined]

    assert persistent_messages
    nudge = persistent_messages[-1]
    assert "06-ssl-configuration.html" in nudge
    assert "Do not switch into review or consistency-check mode" in nudge
    review_resume = (
        "Continue with the next pending item: `Ensure all files are properly linked and formatted consistently`"
    )
    assert review_resume not in nudge
3261
3262
@pytest.mark.asyncio
async def test_tool_batch_runner_hands_off_to_verification_once_planned_artifacts_exist(
    temp_dir: Path,
) -> None:
    """A write that leaves every planned file on disk triggers the verification handoff.

    All files named in the implementation plan exist before the batch runs.
    After the scripted successful ``write`` the persistent steering messages
    must state that all planned artifacts exist, mention the remaining
    consistency todo, and direct the agent toward verification.
    """

    async def reject_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def reject_verification(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run for this scenario")

    nginx_dir = temp_dir / "guides" / "nginx"
    chapters_dir = nginx_dir / "chapters"
    chapters_dir.mkdir(parents=True)
    index_file = nginx_dir / "index.html"
    first_chapter = chapters_dir / "01-getting-started.html"
    second_chapter = chapters_dir / "02-installation.html"
    index_file.write_text('<a href="chapters/01-getting-started.html">One</a>\n')
    first_chapter.write_text("<h1>One</h1>\n")
    second_chapter.write_text("<h1>Two</h1>\n")

    plan_file = temp_dir / "implementation.md"
    plan_lines = (
        "# Implementation Plan",
        "",
        "## File Changes",
        f"- `{chapters_dir}/`",
        f"- `{index_file}`",
        f"- `{first_chapter}`",
        f"- `{second_chapter}`",
        "",
    )
    plan_file.write_text("\n".join(plan_lines))

    runtime_context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=reject_confidence,
        verify_action=reject_verification,
        auto_recover=False,
    )
    persistent_messages: list[str] = []
    ephemeral_messages: list[str] = []
    runtime_context.queue_steering_message_callback = persistent_messages.append
    runtime_context.queue_ephemeral_steering_message_callback = ephemeral_messages.append
    runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))

    dod = create_definition_of_done("Create a multi-file nginx guide.")
    dod.implementation_plan = str(plan_file)
    todo_states = (
        ("Create the guide files", "completed"),
        ("Ensure all files are properly linked and formatted consistently", "pending"),
    )
    sync_todos_to_definition_of_done(
        dod,
        [
            {
                "content": content,
                "active_form": f"Working on: {content}",
                "status": status,
            }
            for content, status in todo_states
        ],
    )

    write_call = ToolCall(
        id="write-final",
        name="write",
        arguments={
            "file_path": str(second_chapter),
            "content": "<h1>Two</h1>\n",
        },
    )
    scripted_outcome = tool_outcome(
        tool_call=write_call,
        output=f"Successfully wrote {second_chapter}",
        is_error=False,
    )
    executor = FakeExecutor([scripted_outcome])

    await runner.execute_batch(
        tool_calls=[write_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=TurnSummary(final_response=""),
        dod=dod,
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    # Each fragment must show up in at least one persistent steering message.
    expected_fragments = (
        "All explicitly planned artifacts now exist on disk.",
        "Ensure all files are properly linked and formatted consistently",
        "Move to verification once no specific mismatch remains.",
    )
    for fragment in expected_fragments:
        assert any(fragment in message for message in persistent_messages)
3383
3384
@pytest.mark.asyncio
async def test_tool_batch_runner_mutation_handoff_points_at_next_missing_artifact(
    temp_dir: Path,
) -> None:
    """After writing the index, the handoff names the first planned chapter still missing.

    Only ``index.html`` exists; both planned chapter files are absent.  The
    last persistent steering message must point at ``01-getting-started.html``
    and discourage bookkeeping, and nothing may be queued on the ephemeral
    channel.
    """

    async def reject_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should not run in this scenario")

    async def reject_verification(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run in this scenario")

    nginx_dir = temp_dir / "guides" / "nginx"
    chapters_dir = nginx_dir / "chapters"
    nginx_dir.mkdir(parents=True)
    chapters_dir.mkdir()
    index_file = nginx_dir / "index.html"
    index_file.write_text("<html></html>\n")
    # Planned chapters; neither is created by this test.
    first_chapter = chapters_dir / "01-getting-started.html"
    second_chapter = chapters_dir / "02-installation.html"

    plan_file = temp_dir / "implementation.md"
    plan_lines = (
        "# Implementation Plan",
        "",
        "## File Changes",
        f"- `{nginx_dir}/`",
        f"- `{index_file}`",
        f"- `{first_chapter}`",
        f"- `{second_chapter}`",
        "",
    )
    plan_file.write_text("\n".join(plan_lines))

    runtime_context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=reject_confidence,
        verify_action=reject_verification,
        auto_recover=False,
    )
    persistent_messages: list[str] = []
    ephemeral_messages: list[str] = []
    runtime_context.queue_steering_message_callback = persistent_messages.append
    runtime_context.queue_ephemeral_steering_message_callback = ephemeral_messages.append
    runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))

    dod = create_definition_of_done("Create a multi-file nginx guide.")
    dod.implementation_plan = str(plan_file)
    pending_contents = (
        "Create the main index.html file with proper structure",
        "Create each chapter file in sequence, following the established pattern",
        "Ensure all files are properly linked and formatted consistently",
    )
    sync_todos_to_definition_of_done(
        dod,
        [
            {
                "content": content,
                "active_form": f"Working on: {content}",
                "status": "pending",
            }
            for content in pending_contents
        ],
    )

    write_call = ToolCall(
        id="write-index",
        name="write",
        arguments={"file_path": str(index_file), "content": "<html></html>\n"},
    )
    scripted_outcome = tool_outcome(
        tool_call=write_call,
        output=f"Successfully wrote {index_file}",
        is_error=False,
    )
    executor = FakeExecutor([scripted_outcome])

    await runner.execute_batch(
        tool_calls=[write_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=TurnSummary(final_response=""),
        dod=dod,
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert persistent_messages
    assert ephemeral_messages == []
    handoff = persistent_messages[-1]
    assert "Next step: create `01-getting-started.html`." in handoff
    assert "Write a compact but real initial version of that file now" not in handoff
    assert "refresh `TodoWrite`" not in handoff
    assert "Do not reread reference material or spend the next turn on bookkeeping." in handoff
3494
3495
@pytest.mark.asyncio
async def test_tool_batch_runner_large_plan_does_not_claim_completion_early(
    temp_dir: Path,
) -> None:
    """With planned chapters still missing, the runner must not announce completion.

    Seven chapters are planned but only the first five exist on disk.  After a
    scripted successful ``write`` of chapter five, an ephemeral steering
    message must point at ``06-performance-tuning.html`` and no steering
    message on either channel may claim that all planned artifacts exist.
    """

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should not run in this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run in this scenario")

    guide_root = temp_dir / "guides" / "nginx"
    chapters = guide_root / "chapters"
    guide_root.mkdir(parents=True)
    chapters.mkdir()
    index_path = guide_root / "index.html"
    index_path.write_text("<html></html>\n")

    chapter_paths = [
        chapters / "01-getting-started.html",
        chapters / "02-installation.html",
        chapters / "03-first-website.html",
        chapters / "04-configuration-basics.html",
        chapters / "05-advanced-configurations.html",
        chapters / "06-performance-tuning.html",
        chapters / "07-security-best-practices.html",
    ]
    # Chapters 1-5 exist; chapters 6 and 7 are planned but never written.
    for chapter in chapter_paths[:4]:
        chapter.write_text(f"<h1>{chapter.stem}</h1>\n")
    chapter_paths[4].write_text("<h1>Advanced configurations</h1>\n")

    implementation_plan = temp_dir / "implementation.md"
    implementation_plan.write_text(
        "\n".join(
            [
                "# Implementation Plan",
                "",
                "## File Changes",
                f"- `{guide_root}/`",
                f"- `{chapters}/`",
                f"- `{index_path}`",
                *[f"- `{path}`" for path in chapter_paths],
                "",
            ]
        )
    )

    context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )
    persistent_messages: list[str] = []
    ephemeral_messages: list[str] = []
    context.queue_steering_message_callback = persistent_messages.append
    context.queue_ephemeral_steering_message_callback = ephemeral_messages.append
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))
    dod = create_definition_of_done("Create a thorough nginx guide.")
    dod.implementation_plan = str(implementation_plan)
    sync_todos_to_definition_of_done(
        dod,
        [
            {
                "content": "Create the nginx guide artifacts",
                "active_form": "Creating nginx guide artifacts",
                "status": "pending",
            },
            {
                "content": "Verify all guide files are linked and complete",
                "active_form": "Verifying guide linkage and completeness",
                "status": "pending",
            },
        ],
    )
    tool_call = ToolCall(
        id="write-chapter-05",
        name="write",
        arguments={
            "file_path": str(chapter_paths[4]),
            "content": "<h1>Advanced configurations</h1>\n",
        },
    )
    executor = FakeExecutor(
        [
            tool_outcome(
                tool_call=tool_call,
                output=f"Successfully wrote {chapter_paths[4]}",
                is_error=False,
            )
        ]
    )

    summary = TurnSummary(final_response="")
    await runner.execute_batch(
        tool_calls=[tool_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=summary,
        dod=dod,
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert any(
        "Next step: create `06-performance-tuning.html`." in message
        for message in ephemeral_messages
    )
    # A premature completion claim could be queued on either steering channel,
    # so inspect both; checking only the ephemeral one would let a persistent
    # "all artifacts exist" message slip through unnoticed.
    assert not any(
        "All explicitly planned artifacts now exist on disk." in message
        for message in (*persistent_messages, *ephemeral_messages)
    )
3622
3623
@pytest.mark.asyncio
async def test_tool_batch_runner_uses_compact_missing_artifact_nudge_after_substantial_progress(
    temp_dir: Path,
) -> None:
    """With most planned files written, the next-file nudge is queued ephemerally.

    The index and four of five planned chapters exist and are recorded as
    touched.  After a scripted successful rewrite of chapter four, the last
    ephemeral steering message must point at ``05-advanced-features.html``,
    discourage bookkeeping, and not ask for a ``TodoWrite`` refresh.
    """

    async def reject_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should not run in this scenario")

    async def reject_verification(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run in this scenario")

    nginx_dir = temp_dir / "guides" / "nginx"
    chapters_dir = nginx_dir / "chapters"
    nginx_dir.mkdir(parents=True)
    chapters_dir.mkdir()
    index_file = nginx_dir / "index.html"
    chapter_names = (
        "01-introduction.html",
        "02-installation.html",
        "03-configuration.html",
        "04-basic-usage.html",
        "05-advanced-features.html",
    )
    chapter_files = [chapters_dir / name for name in chapter_names]
    # Everything except the fifth chapter is already present on disk.
    already_written = (index_file, *chapter_files[:4])
    for existing in already_written:
        existing.write_text("<html></html>\n")

    plan_file = temp_dir / "implementation.md"
    plan_lines = [
        "# Implementation Plan",
        "",
        "## File Changes",
        f"- `{nginx_dir}/`",
        f"- `{chapters_dir}/`",
        f"- `{index_file}`",
    ]
    plan_lines.extend(f"- `{chapter}`" for chapter in chapter_files)
    plan_lines.append("")
    plan_file.write_text("\n".join(plan_lines))

    runtime_context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=reject_confidence,
        verify_action=reject_verification,
        auto_recover=False,
    )
    persistent_messages: list[str] = []
    ephemeral_messages: list[str] = []
    runtime_context.queue_steering_message_callback = persistent_messages.append
    runtime_context.queue_ephemeral_steering_message_callback = ephemeral_messages.append
    runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))

    dod = create_definition_of_done("Create a thorough nginx guide.")
    dod.implementation_plan = str(plan_file)
    dod.touched_files.extend([str(existing) for existing in already_written])
    dod.completed_items.extend(
        [
            "Create the nginx directory structure",
            "Create the main index.html file with proper structure",
        ]
    )
    remaining_todo = {
        "content": "Create each chapter file with appropriate content",
        "active_form": "Creating each chapter file with appropriate content",
        "status": "pending",
    }
    sync_todos_to_definition_of_done(dod, [remaining_todo])

    fourth_chapter = chapter_files[3]
    write_call = ToolCall(
        id="write-chapter-04",
        name="write",
        arguments={
            "file_path": str(fourth_chapter),
            "content": "<html>updated</html>\n",
        },
    )
    scripted_outcome = tool_outcome(
        tool_call=write_call,
        output=f"Successfully wrote {fourth_chapter}",
        is_error=False,
    )
    executor = FakeExecutor([scripted_outcome])

    await runner.execute_batch(
        tool_calls=[write_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=TurnSummary(final_response=""),
        dod=dod,
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert ephemeral_messages
    nudge = ephemeral_messages[-1]
    assert "Next step: create `05-advanced-features.html`." in nudge
    assert "Do not reread reference material or spend the next turn on bookkeeping." in nudge
    assert "refresh `TodoWrite`" not in nudge
3744
3745
@pytest.mark.asyncio
async def test_tool_batch_runner_todowrite_with_missing_artifact_requeues_exact_resume_step(
    temp_dir: Path,
) -> None:
    """A TodoWrite-only turn re-queues the exact file-creation step that is still open.

    ``02-installation.html`` is planned but absent.  After the scripted
    ``TodoWrite`` the last persistent steering message must name that file and
    ask for a concrete ``write`` call, while the ephemeral channel stays empty.
    """

    async def reject_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should not run in this scenario")

    async def reject_verification(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run in this scenario")

    def fresh_todos() -> list[dict[str, str]]:
        # New dict objects per call: the DoD sync, the tool-call arguments and
        # the outcome metadata each receive an independent copy.
        return [
            {
                "content": "Create 01-getting-started.html",
                "active_form": "Creating 01-getting-started.html",
                "status": "completed",
            },
            {
                "content": "Create 02-installation.html",
                "active_form": "Creating 02-installation.html",
                "status": "pending",
            },
        ]

    nginx_dir = temp_dir / "guides" / "nginx"
    chapters_dir = nginx_dir / "chapters"
    nginx_dir.mkdir(parents=True)
    chapters_dir.mkdir()
    index_file = nginx_dir / "index.html"
    index_file.write_text("<html></html>\n")
    first_chapter = chapters_dir / "01-getting-started.html"
    # Planned but intentionally left unwritten.
    second_chapter = chapters_dir / "02-installation.html"
    first_chapter.write_text("<h1>One</h1>\n")

    plan_file = temp_dir / "implementation.md"
    plan_lines = (
        "# Implementation Plan",
        "",
        "## File Changes",
        f"- `{nginx_dir}/`",
        f"- `{chapters_dir}/`",
        f"- `{index_file}`",
        f"- `{first_chapter}`",
        f"- `{second_chapter}`",
        "",
    )
    plan_file.write_text("\n".join(plan_lines))

    runtime_context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=reject_confidence,
        verify_action=reject_verification,
        auto_recover=False,
    )
    persistent_messages: list[str] = []
    ephemeral_messages: list[str] = []
    runtime_context.queue_steering_message_callback = persistent_messages.append
    runtime_context.queue_ephemeral_steering_message_callback = ephemeral_messages.append
    runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))

    dod = create_definition_of_done("Create a multi-file nginx guide.")
    dod.implementation_plan = str(plan_file)
    sync_todos_to_definition_of_done(dod, fresh_todos())
    dod.touched_files.extend([str(index_file), str(first_chapter)])

    todo_call = ToolCall(
        id="todo-only",
        name="TodoWrite",
        arguments={"todos": fresh_todos()},
    )
    scripted_outcome = tool_outcome(
        tool_call=todo_call,
        output="Todos updated",
        is_error=False,
        metadata={"new_todos": fresh_todos()},
    )
    executor = FakeExecutor([scripted_outcome])

    await runner.execute_batch(
        tool_calls=[todo_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=TurnSummary(final_response=""),
        dod=dod,
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert persistent_messages
    resume_nudge = persistent_messages[-1]
    assert "Todo tracking is updated. Next step: create `02-installation.html`." in resume_nudge
    assert "Prefer one `write(file_path=..., content=...)` call" in resume_nudge
    assert "Make your next response the concrete mutation tool call itself." in resume_nudge
    assert ephemeral_messages == []
3887
3888
@pytest.mark.asyncio
async def test_tool_batch_runner_todowrite_after_artifacts_exist_pushes_verification_handoff(
    temp_dir: Path,
) -> None:
    """TodoWrite after all planned files exist steers toward the verification todo.

    Every planned file is on disk, yet the todo list still opens with a
    reference-reading item.  The last queued steering message must announce
    that all planned artifacts exist, mention the verification todo, not
    mention the Fortran reference item, and leave ``workflow_mode`` at
    ``"execute"``.
    """

    async def reject_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should not run in this scenario")

    async def reject_verification(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run in this scenario")

    def fresh_todos() -> list[dict[str, str]]:
        # Independent dicts per call: sync, call arguments and metadata never alias.
        contents = (
            "First, examine the existing Fortran guide structure to understand the format and content organization",
            "Verify all guide files are linked and complete",
        )
        return [
            {
                "content": content,
                "active_form": f"Working on: {content}",
                "status": "pending",
            }
            for content in contents
        ]

    nginx_dir = temp_dir / "guides" / "nginx"
    chapters_dir = nginx_dir / "chapters"
    nginx_dir.mkdir(parents=True)
    chapters_dir.mkdir()
    index_file = nginx_dir / "index.html"
    first_chapter = chapters_dir / "01-getting-started.html"
    second_chapter = chapters_dir / "02-installation.html"
    index_file.write_text("<html></html>\n")
    first_chapter.write_text("<h1>One</h1>\n")
    second_chapter.write_text("<h1>Two</h1>\n")

    plan_file = temp_dir / "implementation.md"
    plan_lines = (
        "# Implementation Plan",
        "",
        "## File Changes",
        f"- `{nginx_dir}/`",
        f"- `{chapters_dir}/`",
        f"- `{index_file}`",
        f"- `{first_chapter}`",
        f"- `{second_chapter}`",
        "",
    )
    plan_file.write_text("\n".join(plan_lines))

    runtime_context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=reject_confidence,
        verify_action=reject_verification,
        auto_recover=False,
    )
    steering: list[str] = []
    runtime_context.queue_steering_message_callback = steering.append
    runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))

    dod = create_definition_of_done("Create a multi-file nginx guide.")
    dod.implementation_plan = str(plan_file)
    dod.verification_commands = [f"ls -la {nginx_dir}"]
    sync_todos_to_definition_of_done(dod, fresh_todos(), project_root=temp_dir)

    todo_call = ToolCall(
        id="todo-only",
        name="TodoWrite",
        arguments={"todos": fresh_todos()},
    )
    scripted_outcome = tool_outcome(
        tool_call=todo_call,
        output="Todos updated",
        is_error=False,
        metadata={"new_todos": fresh_todos()},
    )
    executor = FakeExecutor([scripted_outcome])

    await runner.execute_batch(
        tool_calls=[todo_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=TurnSummary(final_response=""),
        dod=dod,
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert steering
    handoff = steering[-1]
    assert "Todo tracking is updated. All explicitly planned artifacts now exist on disk." in handoff
    assert "Verify all guide files are linked and complete" in handoff
    assert "Move to verification once no specific mismatch remains." in handoff
    assert "reopen reference materials" in handoff
    assert "Fortran guide structure" not in handoff
    assert runtime_context.workflow_mode == "execute"
4032
4033
@pytest.mark.asyncio
async def test_tool_batch_runner_todowrite_after_outputs_exist_but_links_missing_still_handoffs_to_verify(
    temp_dir: Path,
) -> None:
    """TodoWrite with every planned file on disk switches the workflow to verify.

    The index links both existing chapters and additionally ``../index.html``.
    The sole todo is an in-progress "create chapter files" item.  The last
    queued steering message must announce that all planned artifacts exist and
    that verification runs next, and ``workflow_mode`` must become ``"verify"``.
    """

    async def reject_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should not run for this scenario")

    async def reject_verification(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run for this scenario")

    def fresh_todos() -> list[dict[str, str]]:
        # Independent dicts per call: sync, call arguments and metadata never alias.
        return [
            {
                "content": "Create chapter files following the established pattern",
                "active_form": "Creating chapter files",
                "status": "in_progress",
            }
        ]

    nginx_dir = temp_dir / "guides" / "nginx"
    chapters_dir = nginx_dir / "chapters"
    nginx_dir.mkdir(parents=True)
    chapters_dir.mkdir()
    index_file = nginx_dir / "index.html"
    first_chapter = chapters_dir / "01-introduction.html"
    second_chapter = chapters_dir / "02-installation.html"
    index_links = (
        '<a href="chapters/01-introduction.html">Intro</a>',
        '<a href="chapters/02-installation.html">Install</a>',
        '<a href="../index.html">Back</a>',
    )
    index_file.write_text("".join(f"{link}\n" for link in index_links))
    first_chapter.write_text("<html></html>\n")
    second_chapter.write_text("<html></html>\n")

    plan_file = temp_dir / "implementation.md"
    plan_lines = (
        "# Implementation Plan",
        "",
        "## File Changes",
        f"- `{nginx_dir}/`",
        f"- `{chapters_dir}/`",
        f"- `{index_file}`",
        f"- `{first_chapter}`",
        f"- `{second_chapter}`",
        "",
    )
    plan_file.write_text("\n".join(plan_lines))

    runtime_context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=reject_confidence,
        verify_action=reject_verification,
        auto_recover=False,
    )
    steering: list[str] = []
    runtime_context.queue_steering_message_callback = steering.append
    runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))

    dod = create_definition_of_done("Create a multi-file nginx guide.")
    dod.implementation_plan = str(plan_file)
    dod.verification_commands = [f"ls -la {nginx_dir}"]
    sync_todos_to_definition_of_done(dod, fresh_todos(), project_root=temp_dir)

    todo_call = ToolCall(
        id="todo-post-build",
        name="TodoWrite",
        arguments={"todos": fresh_todos()},
    )
    scripted_outcome = tool_outcome(
        tool_call=todo_call,
        output="Todos updated",
        is_error=False,
        metadata={"new_todos": fresh_todos()},
    )
    executor = FakeExecutor([scripted_outcome])

    await runner.execute_batch(
        tool_calls=[todo_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=TurnSummary(final_response=""),
        dod=dod,
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert steering
    handoff = steering[-1]
    assert "Todo tracking is updated. All explicitly planned artifacts now exist on disk." in handoff
    assert "Verification should run next." in handoff
    assert "Repair or verify the current files instead of expanding the artifact set." not in handoff
    assert runtime_context.workflow_mode == "verify"
4169
4170
@pytest.mark.asyncio
async def test_tool_batch_runner_preempts_post_build_audit_after_todowrite_verify_handoff(
    temp_dir: Path,
) -> None:
    """A read queued behind a TodoWrite must not execute when the plan is already built.

    The batch holds a ``TodoWrite`` followed by a ``read`` of the index file,
    with every planned path already present on disk.  The test expects only
    the ``TodoWrite`` to reach the executor, exactly one tool result message,
    the workflow mode to be ``"verify"`` and a queued steering message that
    announces verification.
    """

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should not run for this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run for this scenario")

    # Create every artifact the implementation plan mentions.
    guide_dir = temp_dir / "guides" / "nginx"
    chapters_dir = guide_dir / "chapters"
    guide_dir.mkdir(parents=True)
    chapters_dir.mkdir()
    index_file = guide_dir / "index.html"
    chapter_files = [
        chapters_dir / "01-introduction.html",
        chapters_dir / "02-installation.html",
    ]
    index_links = (
        '<li><a href="chapters/01-introduction.html">Chapter 1: Introduction</a></li>',
        '<li><a href="chapters/02-installation.html">Chapter 2: Installation</a></li>',
    )
    index_file.write_text("".join(f"{link}\n" for link in index_links))
    for chapter_file in chapter_files:
        chapter_file.write_text("<html></html>\n")

    # The plan lists both directories (with trailing slash) and all three files.
    plan_file = temp_dir / "implementation.md"
    planned_entries = [f"{guide_dir}/", f"{chapters_dir}/", index_file, *chapter_files]
    plan_file.write_text(
        "# Implementation Plan\n\n## File Changes\n"
        + "".join(f"- `{entry}`\n" for entry in planned_entries)
    )

    context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )
    steering: list[str] = []
    context.queue_steering_message_callback = steering.append
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))
    dod = create_definition_of_done("Create a multi-file nginx guide.")
    dod.implementation_plan = str(plan_file)
    dod.verification_commands = [f"ls -la {guide_dir}"]

    todo_call = ToolCall(
        id="todo-post-build-preempt",
        name="TodoWrite",
        arguments={"todos": []},
    )
    audit_read = ToolCall(
        id="read-after-todo",
        name="read",
        arguments={"file_path": str(index_file)},
    )
    todo_outcome = tool_outcome(
        tool_call=todo_call,
        output="Todos updated",
        is_error=False,
        metadata={"new_todos": []},
    )
    read_outcome = tool_outcome(
        tool_call=audit_read,
        output=index_file.read_text(),
        is_error=False,
    )
    executor = FakeExecutor([todo_outcome, read_outcome])

    summary = TurnSummary(final_response="")
    batch_result = await runner.execute_batch(
        tool_calls=[todo_call, audit_read],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=summary,
        dod=dod,
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert batch_result.continue_after_batch is True
    assert batch_result.halted is False
    # Only the TodoWrite ran; the trailing read never reached the executor.
    executed_ids = [call.id for call in executor.calls]
    assert executed_ids == ["todo-post-build-preempt"]
    assert len(summary.tool_result_messages) == 1
    assert context.workflow_mode == "verify"
    assert steering
    assert "Verification should run next." in steering[-1]
4289
4290
@pytest.mark.asyncio
async def test_tool_batch_runner_todowrite_complete_directory_plan_does_not_reinfer_first_child(
    temp_dir: Path,
) -> None:
    """A TodoWrite against a fully populated directory plan must hand off to verification.

    The plan names only the index file and the ``chapters/`` directory, and
    the session history contains a read of a reference chapter called
    ``01-introduction.html``.  The test expects the queued steering message to
    announce verification without mentioning ``01-introduction.html`` or
    "chapter files", and the tool result message to point at verification
    rather than at the fortran reference guide.
    """

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should not run for this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run for this scenario")

    # Reference chapter that the earlier assistant turn "read".
    reference_chapter = temp_dir / "fortran" / "chapters" / "01-introduction.html"
    reference_chapter.parent.mkdir(parents=True)
    reference_chapter.write_text("<h1>Introduction</h1>\n")

    # Generated guide: index plus three chapter files, all present.
    guide_dir = temp_dir / "Loader" / "guides" / "nginx"
    chapters_dir = guide_dir / "chapters"
    guide_dir.mkdir(parents=True)
    chapters_dir.mkdir()
    index_file = guide_dir / "index.html"
    chapter_files = [
        chapters_dir / "01-introduction.html",
        chapters_dir / "02-installation.html",
        chapters_dir / "03-basic-configuration.html",
    ]
    index_links = (
        '<a href="chapters/01-introduction.html">Introduction</a>',
        '<a href="chapters/02-installation.html">Installation</a>',
        '<a href="chapters/03-basic-configuration.html">Configuration</a>',
    )
    index_file.write_text("".join(f"{link}\n" for link in index_links))
    for chapter_file in chapter_files:
        chapter_file.write_text("<html></html>\n")

    # The plan lists the index file first, then the chapters directory.
    plan_file = temp_dir / "implementation.md"
    planned_entries = [guide_dir / "index.html", f"{chapters_dir}/"]
    plan_file.write_text(
        "# Implementation Plan\n\n## File Changes\n"
        + "".join(f"- `{entry}`\n" for entry in planned_entries)
    )

    reference_read = ToolCall(
        id="read-reference-child",
        name="read",
        arguments={"file_path": str(reference_chapter)},
    )
    history = [
        Message(
            role=Role.ASSISTANT,
            content="I examined the reference guide structure.",
            tool_calls=[reference_read],
        )
    ]
    context = build_context(
        temp_dir=temp_dir,
        messages=history,
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )
    steering: list[str] = []
    context.queue_steering_message_callback = steering.append
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))
    dod = create_definition_of_done("Create an equally thorough nginx guide.")
    dod.implementation_plan = str(plan_file)
    dod.verification_commands = [f"ls -la {guide_dir}"]

    todo_call = ToolCall(
        id="todo-complete-directory-plan",
        name="TodoWrite",
        arguments={"todos": []},
    )
    todo_outcome = tool_outcome(
        tool_call=todo_call,
        output="Todos updated",
        is_error=False,
        metadata={"new_todos": []},
    )
    executor = FakeExecutor([todo_outcome])

    summary = TurnSummary(final_response="")
    batch_result = await runner.execute_batch(
        tool_calls=[todo_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=summary,
        dod=dod,
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert batch_result.continue_after_batch is True
    assert steering
    steer = steering[-1]
    assert "Verification should run next." in steer
    # The steering text must not point back at an individual chapter.
    assert "01-introduction.html" not in steer
    assert "chapter files" not in steer.lower()
    assert context.workflow_mode == "verify"
    assert summary.tool_result_messages
    last_tool_result = summary.tool_result_messages[-1].content
    assert "verification should be reviewed next" in last_tool_result
    assert "fortran guide structure" not in last_tool_result.lower()
4419
4420
@pytest.mark.asyncio
async def test_tool_batch_runner_preempts_post_build_observation_batch_for_verify_handoff(
    temp_dir: Path,
) -> None:
    """Only the first observation call of a post-build batch may execute.

    The batch is a ``bash`` listing followed by a ``read`` of the index, run
    after the planned guide directory, chapters directory and index already
    exist.  The test expects just the ``bash`` call to reach the executor, the
    workflow mode to be ``"verify"`` and a steering message announcing
    verification.
    """

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should not run for this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run for this scenario")

    guide_dir = temp_dir / "guides" / "nginx"
    chapters_dir = guide_dir / "chapters"
    guide_dir.mkdir(parents=True)
    chapters_dir.mkdir()
    index_file = guide_dir / "index.html"
    chapter_names = (
        "01-introduction.html",
        "02-installation.html",
        "03-configuration.html",
    )
    # The index links only the first two chapters; a third file exists on disk.
    index_links = (
        '<li><a href="chapters/01-introduction.html">Chapter 1: Introduction</a></li>',
        '<li><a href="chapters/02-installation.html">Chapter 2: Installation</a></li>',
    )
    index_file.write_text("".join(f"{link}\n" for link in index_links))
    for chapter_name in chapter_names:
        (chapters_dir / chapter_name).write_text("<html></html>\n")

    plan_file = temp_dir / "implementation.md"
    planned_entries = [f"{guide_dir}/", f"{chapters_dir}/", index_file]
    plan_file.write_text(
        "# Implementation Plan\n\n## File Changes\n"
        + "".join(f"- `{entry}`\n" for entry in planned_entries)
    )

    context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )
    steering: list[str] = []
    context.queue_steering_message_callback = steering.append
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))
    dod = create_definition_of_done("Create an equally thorough nginx guide.")
    dod.implementation_plan = str(plan_file)
    dod.verification_commands = [f"ls -la {guide_dir}"]

    audit_bash = ToolCall(
        id="bash-post-build-audit",
        name="bash",
        arguments={"command": f"ls -la {guide_dir}"},
    )
    audit_read = ToolCall(
        id="read-index-after-audit",
        name="read",
        arguments={"file_path": str(index_file)},
    )
    bash_outcome = tool_outcome(
        tool_call=audit_bash,
        output="total 8\n",
        is_error=False,
    )
    read_outcome = tool_outcome(
        tool_call=audit_read,
        output=index_file.read_text(),
        is_error=False,
    )
    executor = FakeExecutor([bash_outcome, read_outcome])

    summary = TurnSummary(final_response="")
    batch_result = await runner.execute_batch(
        tool_calls=[audit_bash, audit_read],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=summary,
        dod=dod,
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert batch_result.continue_after_batch is True
    # The read that followed the bash audit was never executed.
    executed_ids = [call.id for call in executor.calls]
    assert executed_ids == ["bash-post-build-audit"]
    assert context.workflow_mode == "verify"
    assert steering
    assert "Verification should run next." in steering[-1]
4536
4537
@pytest.mark.asyncio
async def test_tool_batch_runner_preempts_post_build_observation_batch_during_consistency_review(
    temp_dir: Path,
) -> None:
    """With a pending consistency-review todo, a batch of two reads stops after the first.

    All planned artifacts exist and the definition of done carries one pending
    "review for consistency" todo.  The test expects only the first ``read``
    to reach the executor and the latest steering message (ephemeral if any
    was queued, otherwise the regular one) to state that the planned artifacts
    already exist and to mention the generated files.
    """

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should not run for this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run for this scenario")

    guide_dir = temp_dir / "guides" / "nginx"
    chapters_dir = guide_dir / "chapters"
    guide_dir.mkdir(parents=True)
    chapters_dir.mkdir()
    index_file = guide_dir / "index.html"
    first_chapter = chapters_dir / "01-introduction.html"
    other_chapters = (
        chapters_dir / "02-installation.html",
        chapters_dir / "03-basic-configuration.html",
    )
    # Every generated file gets the same placeholder document.
    for generated in (index_file, first_chapter, *other_chapters):
        generated.write_text("<html></html>\n")

    plan_file = temp_dir / "implementation.md"
    planned_entries = [f"{guide_dir}/", f"{chapters_dir}/", index_file]
    plan_file.write_text(
        "# Implementation Plan\n\n## File Changes\n"
        + "".join(f"- `{entry}`\n" for entry in planned_entries)
    )

    context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )
    steering: list[str] = []
    ephemeral_steering: list[str] = []
    context.queue_steering_message_callback = steering.append
    context.queue_ephemeral_steering_message_callback = ephemeral_steering.append
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))
    dod = create_definition_of_done("Create an equally thorough nginx guide.")
    dod.implementation_plan = str(plan_file)
    dod.verification_commands = [f"ls -la {guide_dir}"]
    review_todo = {
        "content": "Review the generated guide for consistency and completeness",
        "active_form": "Reviewing the generated guide for consistency and completeness",
        "status": "pending",
    }
    sync_todos_to_definition_of_done(dod, [review_todo], project_root=temp_dir)

    audit_read = ToolCall(
        id="read-index-during-review",
        name="read",
        arguments={"file_path": str(index_file)},
    )
    second_read = ToolCall(
        id="read-chapter-after-review",
        name="read",
        arguments={"file_path": str(first_chapter)},
    )
    index_outcome = tool_outcome(
        tool_call=audit_read,
        output=index_file.read_text(),
        is_error=False,
    )
    chapter_outcome = tool_outcome(
        tool_call=second_read,
        output=first_chapter.read_text(),
        is_error=False,
    )
    executor = FakeExecutor([index_outcome, chapter_outcome])

    summary = TurnSummary(final_response="")
    batch_result = await runner.execute_batch(
        tool_calls=[audit_read, second_read],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=summary,
        dod=dod,
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert batch_result.continue_after_batch is True
    executed_ids = [call.id for call in executor.calls]
    assert executed_ids == ["read-index-during-review"]
    # Accept the nudge from either channel, preferring the ephemeral one.
    delivered = ephemeral_steering if ephemeral_steering else steering
    assert delivered
    latest = delivered[-1]
    assert "All explicitly planned artifacts already exist." in latest
    assert "generated files" in latest
4659
4660
@pytest.mark.asyncio
async def test_tool_batch_runner_skips_post_build_user_question_during_consistency_review(
    temp_dir: Path,
) -> None:
    """An ``AskUserQuestion`` issued after all planned files exist is skipped.

    The test expects no executor call, a steering message that redirects the
    model to review/verification and tells it not to ask for clarification,
    the workflow mode set to ``"verify"`` and a tool result message marking
    the question as a skipped stale post-build question.
    """

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should not run for this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run for this scenario")

    guide_dir = temp_dir / "guides" / "nginx"
    chapters_dir = guide_dir / "chapters"
    guide_dir.mkdir(parents=True)
    chapters_dir.mkdir()
    index_file = guide_dir / "index.html"
    chapter_files = [
        chapters_dir / "01-introduction.html",
        chapters_dir / "02-installation.html",
    ]
    index_links = (
        '<li><a href="chapters/01-introduction.html">Chapter 1: Introduction</a></li>',
        '<li><a href="chapters/02-installation.html">Chapter 2: Installation</a></li>',
    )
    index_file.write_text("".join(f"{link}\n" for link in index_links))
    for chapter_file in chapter_files:
        chapter_file.write_text("<html></html>\n")

    plan_file = temp_dir / "implementation.md"
    planned_entries = [f"{guide_dir}/", f"{chapters_dir}/", index_file, *chapter_files]
    plan_file.write_text(
        "# Implementation Plan\n\n## File Changes\n"
        + "".join(f"- `{entry}`\n" for entry in planned_entries)
    )

    context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )
    steering: list[str] = []
    context.queue_steering_message_callback = steering.append
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))
    dod = create_definition_of_done("Create an equally thorough nginx guide.")
    dod.implementation_plan = str(plan_file)
    dod.verification_commands = [f"ls -la {guide_dir}"]
    dod.pending_items = ["Ensure all files are properly linked and formatted"]

    question_arguments = {
        "question": "Which specific aspects of the reference guide should I copy?",
        "context": "I already created the output files and want to ensure they match.",
    }
    question_call = ToolCall(
        id="ask-post-build-review",
        name="AskUserQuestion",
        arguments=question_arguments,
    )
    # No scripted outcomes: the executor is expected to stay untouched.
    executor = FakeExecutor([])

    summary = TurnSummary(final_response="")
    batch_result = await runner.execute_batch(
        tool_calls=[question_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=summary,
        dod=dod,
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert batch_result.continue_after_batch is True
    assert executor.calls == []
    assert steering
    steer = steering[-1]
    assert "The remaining work is review/verification of the generated files." in steer
    assert "Do not ask the user for more clarification about the reference pattern now." in steer
    assert "Verification should run next." in steer
    assert context.workflow_mode == "verify"
    assert summary.tool_result_messages
    last_tool_result = summary.tool_result_messages[-1].content
    assert "Skipped - stale post-build user question" in last_tool_result
4766
4767
@pytest.mark.asyncio
async def test_tool_batch_runner_rewrites_stale_todowrite_summary_from_reconciled_dod(
    temp_dir: Path,
) -> None:
    """The TodoWrite result summary must not echo a stale pending todo.

    The model submits a single pending "examine the fortran guide" todo
    although the planned guide directory, chapters directory and index already
    exist.  The test expects the tool result message to report an updated todo
    list that points at verification, with no "next pending:" entry and no
    mention of the fortran guide structure.
    """

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should not run for this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run for this scenario")

    guide_dir = temp_dir / "guides" / "nginx"
    chapters_dir = guide_dir / "chapters"
    guide_dir.mkdir(parents=True)
    chapters_dir.mkdir()
    index_file = guide_dir / "index.html"
    chapter_names = [
        "01-introduction.html",
        "02-installation.html",
        "03-basic-configuration.html",
        "04-advanced-usage.html",
        "05-troubleshooting.html",
    ]
    # Chapters are written first, the index last.
    for chapter_name in chapter_names:
        chapter_file = chapters_dir / chapter_name
        chapter_file.write_text("<html></html>\n")
    index_file.write_text("<html></html>\n")

    plan_file = temp_dir / "implementation.md"
    planned_entries = [f"{guide_dir}/", f"{chapters_dir}/", index_file]
    plan_file.write_text(
        "# Implementation Plan\n\n## File Changes\n"
        + "".join(f"- `{entry}`\n" for entry in planned_entries)
    )

    context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))
    dod = create_definition_of_done("Create an equally thorough nginx guide.")
    dod.implementation_plan = str(plan_file)
    dod.verification_commands = [f"ls -la {guide_dir}"]

    # Template for the stale todo; each consumer gets its own copy.
    stale_todo = {
        "content": "First, examine the existing fortran guide structure and content to understand the format",
        "active_form": "Working on: First, examine the existing fortran guide structure and content to understand the format",
        "status": "pending",
    }
    tool_call = ToolCall(
        id="todo-stale-summary",
        name="TodoWrite",
        arguments={"todos": [dict(stale_todo)]},
    )
    todo_outcome = tool_outcome(
        tool_call=tool_call,
        output="Todos updated",
        is_error=False,
        metadata={"new_todos": [dict(stale_todo)]},
    )
    executor = FakeExecutor([todo_outcome])

    summary = TurnSummary(final_response="")
    batch_result = await runner.execute_batch(
        tool_calls=[tool_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=summary,
        dod=dod,
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert batch_result.continue_after_batch is True
    assert summary.tool_result_messages
    tool_result_text = summary.tool_result_messages[-1].content
    assert "updated todo list" in tool_result_text
    assert "verification should be reviewed next" in tool_result_text
    assert "next pending:" not in tool_result_text
    assert "fortran guide structure" not in tool_result_text.lower()
4884
4885
@pytest.mark.asyncio
async def test_tool_batch_runner_todowrite_drops_unplanned_expansion_after_outputs_exist(
    temp_dir: Path,
) -> None:
    """A TodoWrite that adds an unplanned chapter must not steer toward creating it.

    All five planned paths exist, yet the submitted todo list contains a
    pending ``08-troubleshooting.html`` item that the plan never mentions.
    The test expects the steering message to announce that planned artifacts
    exist and verification is next, without the "Repair or verify" wording and
    without naming the unplanned chapter, and the workflow mode to be
    ``"verify"``.
    """

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should not run for this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run for this scenario")

    guide_dir = temp_dir / "guides" / "nginx"
    chapters_dir = guide_dir / "chapters"
    guide_dir.mkdir(parents=True)
    chapters_dir.mkdir()
    index_file = guide_dir / "index.html"
    chapter_files = [
        chapters_dir / "01-introduction.html",
        chapters_dir / "02-installation.html",
    ]
    index_links = (
        '<a href="chapters/01-introduction.html">Intro</a>',
        '<a href="chapters/02-installation.html">Install</a>',
        '<a href="../index.html">Back</a>',
    )
    index_file.write_text("".join(f"{link}\n" for link in index_links))
    for chapter_file in chapter_files:
        chapter_file.write_text("<html></html>\n")

    plan_file = temp_dir / "implementation.md"
    planned_entries = [f"{guide_dir}/", f"{chapters_dir}/", index_file, *chapter_files]
    plan_file.write_text(
        "# Implementation Plan\n\n## File Changes\n"
        + "".join(f"- `{entry}`\n" for entry in planned_entries)
    )

    context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )
    steering: list[str] = []
    context.queue_steering_message_callback = steering.append
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))
    dod = create_definition_of_done("Create a multi-file nginx guide.")
    dod.implementation_plan = str(plan_file)
    dod.verification_commands = [f"ls -la {guide_dir}"]

    # (content, active form, status) for each submitted todo; the last row is
    # the chapter that the implementation plan does not list.
    todo_rows = (
        ("Create index.html for nginx guide", "Creating index.html", "in_progress"),
        (
            "Create chapter 01-introduction.html",
            "Creating chapter 01-introduction.html",
            "completed",
        ),
        (
            "Create chapter 02-installation.html",
            "Creating chapter 02-installation.html",
            "completed",
        ),
        (
            "Create chapter 08-troubleshooting.html",
            "Creating chapter 08-troubleshooting.html",
            "pending",
        ),
    )

    def todo_dicts(form_key: str) -> list[dict[str, str]]:
        # The call arguments spell the key "activeForm", the executor metadata
        # spells it "active_form"; everything else is shared.
        return [
            {"content": content, form_key: active_form, "status": status}
            for content, active_form, status in todo_rows
        ]

    tool_call = ToolCall(
        id="todo-post-build-expansion",
        name="TodoWrite",
        arguments={"todos": todo_dicts("activeForm")},
    )
    todo_outcome = tool_outcome(
        tool_call=tool_call,
        output="Todos updated",
        is_error=False,
        metadata={"new_todos": todo_dicts("active_form")},
    )
    executor = FakeExecutor([todo_outcome])

    summary = TurnSummary(final_response="")
    await runner.execute_batch(
        tool_calls=[tool_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=summary,
        dod=dod,
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert steering
    steer = steering[-1]
    assert "Todo tracking is updated. All explicitly planned artifacts now exist on disk." in steer
    assert "Verification should run next." in steer
    assert "Repair or verify the current files instead of expanding the artifact set." not in steer
    assert "08-troubleshooting.html" not in steer
    assert context.workflow_mode == "verify"
5041
5042
@pytest.mark.asyncio
async def test_tool_batch_runner_todowrite_with_existing_output_roots_requeues_next_mutation(
    temp_dir: Path,
) -> None:
    """A TodoWrite while a linked chapter is still missing must steer to the next write.

    The output directories and the index exist, but the index links to
    ``chapters/01-introduction.html``, which has not been created.  The test
    expects the steering message to name ``01-introduction.html`` as the next
    step, to recommend a single ``write`` call and to ask for a concrete
    mutation tool call.
    """

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should not run in this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run in this scenario")

    guide_dir = temp_dir / "guides" / "nginx"
    chapters_dir = guide_dir / "chapters"
    guide_dir.mkdir(parents=True)
    chapters_dir.mkdir()
    index_file = guide_dir / "index.html"
    # The chapters directory stays empty; only the index is written.
    index_file.write_text(
        "<!DOCTYPE html>\n"
        "<html>\n"
        "<body>\n"
        '<a href="chapters/01-introduction.html">Introduction</a>\n'
        "</body>\n"
        "</html>\n"
    )

    plan_file = temp_dir / "implementation.md"
    planned_entries = [f"{guide_dir}/", f"{chapters_dir}/", index_file]
    plan_file.write_text(
        "# Implementation Plan\n\n## File Changes\n"
        + "".join(f"- `{entry}`\n" for entry in planned_entries)
    )

    context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )
    steering: list[str] = []
    context.queue_steering_message_callback = steering.append
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))
    dod = create_definition_of_done("Create a multi-file nginx guide.")
    dod.implementation_plan = str(plan_file)
    dod.touched_files.append(str(index_file))

    # (content, active form, status) rows shared by the DoD sync, the tool
    # call arguments and the executor metadata.
    todo_rows = (
        (
            "Examine the existing Fortran guide structure",
            "Examining the existing Fortran guide structure",
            "completed",
        ),
        (
            "Create the nginx directory structure",
            "Creating the nginx directory structure",
            "completed",
        ),
        (
            "Write the introduction chapter",
            "Writing the introduction chapter",
            "pending",
        ),
    )

    def fresh_todos() -> list[dict[str, str]]:
        # Build new dicts on every call so no consumer shares objects.
        return [
            {"content": content, "active_form": active_form, "status": status}
            for content, active_form, status in todo_rows
        ]

    sync_todos_to_definition_of_done(dod, fresh_todos(), project_root=temp_dir)

    tool_call = ToolCall(
        id="todo-next-mutation",
        name="TodoWrite",
        arguments={"todos": fresh_todos()},
    )
    todo_outcome = tool_outcome(
        tool_call=tool_call,
        output="Todos updated",
        is_error=False,
        metadata={"new_todos": fresh_todos()},
    )
    executor = FakeExecutor([todo_outcome])

    summary = TurnSummary(final_response="")
    await runner.execute_batch(
        tool_calls=[tool_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=summary,
        dod=dod,
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert steering
    steer = steering[-1]
    assert "Todo tracking is updated. Next step: create `01-introduction.html`." in steer
    assert "Prefer one `write(file_path=..., content=...)` call" in steer
    assert "Make your next response the concrete mutation tool call itself." in steer
5204
5205
@pytest.mark.asyncio
async def test_tool_batch_runner_todowrite_prefers_pending_index_over_empty_output_directory(
    temp_dir: Path,
) -> None:
    """Expect the TodoWrite follow-up to name ``index.html`` rather than a chapter.

    Setup: ``chapters/`` exists but is empty, ``index.html`` has not been
    written, the plan declares both, and the todo list carries a pending
    index item ahead of a pending first-chapter item.
    """

    async def _confidence_must_not_run(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should not run in this scenario")

    async def _verification_must_not_run(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run in this scenario")

    def _todo_items() -> list[dict[str, str]]:
        # Builds brand-new dicts on each call so the DoD sync and the tool
        # call never share list or dict objects.
        rows = (
            (
                "Examine the existing Fortran guide structure to understand the format and depth",
                "Examining the existing Fortran guide structure",
                "completed",
            ),
            (
                "Create the new nginx guide directory structure",
                "Creating the new nginx guide directory structure",
                "completed",
            ),
            (
                "Create a new index.html for the nginx guide",
                "Creating a new index.html for the nginx guide",
                "pending",
            ),
            (
                "Create the first chapter for the nginx guide",
                "Creating the first chapter for the nginx guide",
                "pending",
            ),
        )
        return [
            {"content": content, "active_form": active_form, "status": status}
            for content, active_form, status in rows
        ]

    # Only the chapters directory is created; index.html stays absent on disk.
    guide_root = temp_dir / "Loader" / "guides" / "nginx"
    chapters = guide_root / "chapters"
    chapters.mkdir(parents=True)
    index_path = guide_root / "index.html"

    plan_lines = [
        "# Implementation Plan",
        "",
        "## File Changes",
        f"- `{chapters}/`",
        f"- `{index_path}`",
        "",
    ]
    plan_path = temp_dir / "implementation.md"
    plan_path.write_text("\n".join(plan_lines))

    dod = create_definition_of_done("Create a multi-file nginx guide.")
    dod.implementation_plan = str(plan_path)
    sync_todos_to_definition_of_done(dod, _todo_items(), project_root=temp_dir)

    steering_log: list[str] = []
    context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=_confidence_must_not_run,
        verify_action=_verification_must_not_run,
        auto_recover=False,
    )
    context.queue_steering_message_callback = steering_log.append
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))

    # The same list object backs both the call arguments and the outcome metadata.
    todos = _todo_items()
    todo_call = ToolCall(
        id="todo-index-before-chapter",
        name="TodoWrite",
        arguments={"todos": todos},
    )
    scripted_outcome = tool_outcome(
        tool_call=todo_call,
        output="Todos updated",
        is_error=False,
        metadata={"new_todos": todos},
    )
    scripted_executor = FakeExecutor([scripted_outcome])

    turn_summary = TurnSummary(final_response="")
    await runner.execute_batch(
        tool_calls=[todo_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=turn_summary,
        dod=dod,
        executor=scripted_executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert steering_log
    nudge = steering_log[-1]
    assert "Todo tracking is updated. Next step: create `index.html`." in nudge
    assert f"Prefer one `write(file_path=..., content=...)` call for `{index_path.resolve(strict=False)}`" in nudge
    assert "01-introduction.html" not in nudge
5342
5343
@pytest.mark.asyncio
async def test_tool_batch_runner_todowrite_with_declared_child_targets_names_next_missing_file(
    temp_dir: Path,
) -> None:
    """Expect the TodoWrite follow-up to name ``introduction.html`` as the next file.

    Setup: ``index.html`` exists and links to two chapter files that are not
    on disk; the only pending todo is "Write the introduction chapter".
    """

    async def _confidence_must_not_run(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should not run in this scenario")

    async def _verification_must_not_run(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run in this scenario")

    guide_root = temp_dir / "guides" / "nginx"
    chapters = guide_root / "chapters"
    guide_root.mkdir(parents=True)
    chapters.mkdir()

    # The index references chapter files that have not been created yet.
    index_path = guide_root / "index.html"
    index_markup = [
        "<html>",
        '<a href="chapters/introduction.html">Introduction</a>',
        '<a href="chapters/installation.html">Installation</a>',
        "</html>",
    ]
    index_path.write_text("\n".join(index_markup) + "\n")

    plan_lines = [
        "# Implementation Plan",
        "",
        "## File Changes",
        f"- `{guide_root}/`",
        f"- `{chapters}/`",
        f"- `{index_path}`",
        "",
    ]
    plan_path = temp_dir / "implementation.md"
    plan_path.write_text("\n".join(plan_lines))

    dod = create_definition_of_done("Create a multi-file nginx guide.")
    dod.implementation_plan = str(plan_path)
    dod.pending_items = [
        "Write the introduction chapter",
        "Complete the requested work",
    ]
    dod.touched_files.append(str(index_path))

    steering_log: list[str] = []
    context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=_confidence_must_not_run,
        verify_action=_verification_must_not_run,
        auto_recover=False,
    )
    context.queue_steering_message_callback = steering_log.append
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))

    # The call arguments use the camelCase key; the outcome metadata uses snake_case.
    requested_todo = {
        "content": "Write the introduction chapter",
        "activeForm": "Writing the introduction chapter",
        "status": "pending",
    }
    recorded_todo = {
        "content": "Write the introduction chapter",
        "active_form": "Writing the introduction chapter",
        "status": "pending",
    }
    todo_call = ToolCall(
        id="todo-1",
        name="TodoWrite",
        arguments={"todos": [requested_todo]},
    )
    scripted_executor = FakeExecutor(
        [
            tool_outcome(
                tool_call=todo_call,
                output="Todos updated",
                is_error=False,
                metadata={"new_todos": [recorded_todo]},
            )
        ]
    )

    turn_summary = TurnSummary(final_response="")
    await runner.execute_batch(
        tool_calls=[todo_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=turn_summary,
        dod=dod,
        executor=scripted_executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert steering_log
    nudge = steering_log[-1]
    assert "Todo tracking is updated. Next step: create `introduction.html`." in nudge
    assert "Prefer one `write(file_path=..., content=...)` call" in nudge
    assert "Make your next response the concrete mutation tool call itself." in nudge
5467
5468
@pytest.mark.asyncio
async def test_tool_batch_runner_todowrite_names_concrete_pending_file_after_artifacts_exist(
    temp_dir: Path,
) -> None:
    """Expect the TodoWrite follow-up to name ``02-installation.html``.

    Setup: ``index.html`` and chapter one already exist on disk, the index
    also links to a second chapter that is missing, and the pending todo is
    "Creating Chapter 2: Installation and Setup".
    """

    async def _confidence_must_not_run(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should not run in this scenario")

    async def _verification_must_not_run(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run in this scenario")

    guide_root = temp_dir / "guides" / "nginx"
    chapters = guide_root / "chapters"
    guide_root.mkdir(parents=True)
    chapters.mkdir()

    index_path = guide_root / "index.html"
    chapter_one = chapters / "01-introduction.html"
    index_markup = [
        "<html>",
        '<a href="chapters/01-introduction.html">Chapter 1: Introduction to NGINX Tool</a>',
        '<a href="chapters/02-installation.html">Chapter 2: Installation and Setup</a>',
        "</html>",
    ]
    index_path.write_text("\n".join(index_markup) + "\n")
    # Only the first linked chapter is present; the second is left missing.
    chapter_one.write_text("<html></html>\n")

    plan_lines = [
        "# Implementation Plan",
        "",
        "## File Changes",
        f"- `{guide_root}/`",
        f"- `{chapters}/`",
        f"- `{index_path}`",
        "",
    ]
    plan_path = temp_dir / "implementation.md"
    plan_path.write_text("\n".join(plan_lines))

    dod = create_definition_of_done("Create a multi-file nginx guide.")
    dod.implementation_plan = str(plan_path)
    dod.pending_items = [
        "Creating Chapter 2: Installation and Setup",
        "Complete the requested work",
    ]
    dod.touched_files.extend([str(index_path), str(chapter_one)])

    steering_log: list[str] = []
    context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=_confidence_must_not_run,
        verify_action=_verification_must_not_run,
        auto_recover=False,
    )
    context.queue_steering_message_callback = steering_log.append
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))

    # The call arguments use the camelCase key; the outcome metadata uses snake_case.
    requested_todo = {
        "content": "Creating Chapter 2: Installation and Setup",
        "activeForm": "Creating Chapter 2: Installation and Setup",
        "status": "pending",
    }
    recorded_todo = {
        "content": "Creating Chapter 2: Installation and Setup",
        "active_form": "Creating Chapter 2: Installation and Setup",
        "status": "pending",
    }
    todo_call = ToolCall(
        id="todo-1",
        name="TodoWrite",
        arguments={"todos": [requested_todo]},
    )
    scripted_executor = FakeExecutor(
        [
            tool_outcome(
                tool_call=todo_call,
                output="Todos updated",
                is_error=False,
                metadata={"new_todos": [recorded_todo]},
            )
        ]
    )

    turn_summary = TurnSummary(final_response="")
    await runner.execute_batch(
        tool_calls=[todo_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=turn_summary,
        dod=dod,
        executor=scripted_executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert steering_log
    nudge = steering_log[-1]
    assert "Todo tracking is updated. Next step: create `02-installation.html`." in nudge
    assert "Prefer one `write(file_path=..., content=...)` call" in nudge
    assert "Make your next response the concrete mutation tool call itself" in nudge
5594
5595
@pytest.mark.asyncio
async def test_tool_batch_runner_todowrite_uses_observed_sibling_pattern_for_next_file(
    temp_dir: Path,
) -> None:
    """Expect the TodoWrite follow-up to name ``01-introduction.html``.

    Setup: the nginx index contains no chapter links, but the session history
    holds a ``read`` tool call on ``fortran/chapters/01-introduction.html``.
    """

    async def _confidence_must_not_run(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should not run in this scenario")

    async def _verification_must_not_run(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run in this scenario")

    # Reference guide whose chapter file appears in the session history below.
    reference_chapters = temp_dir / "fortran" / "chapters"
    reference_chapters.mkdir(parents=True)
    (reference_chapters / "01-introduction.html").write_text("<h1>Introduction</h1>\n")

    guide_root = temp_dir / "guides" / "nginx"
    chapters = guide_root / "chapters"
    guide_root.mkdir(parents=True)
    chapters.mkdir()
    index_path = guide_root / "index.html"
    index_path.write_text("<html></html>\n")

    plan_lines = [
        "# Implementation Plan",
        "",
        "## File Changes",
        f"- `{guide_root}/`",
        f"- `{chapters}/`",
        f"- `{index_path}`",
        "",
    ]
    plan_path = temp_dir / "implementation.md"
    plan_path.write_text("\n".join(plan_lines))

    dod = create_definition_of_done("Create a multi-file nginx guide.")
    dod.implementation_plan = str(plan_path)
    dod.pending_items = [
        "Write the introduction chapter",
        "Complete the requested work",
    ]
    dod.touched_files.append(str(index_path))

    steering_log: list[str] = []
    prior_read = ToolCall(
        id="read-ref-1",
        name="read",
        arguments={"file_path": str(reference_chapters / "01-introduction.html")},
    )
    history = [
        Message(
            role=Role.ASSISTANT,
            content="",
            tool_calls=[prior_read],
        )
    ]
    context = build_context(
        temp_dir=temp_dir,
        messages=history,
        safeguards=FakeSafeguards(),
        assess_confidence=_confidence_must_not_run,
        verify_action=_verification_must_not_run,
        auto_recover=False,
    )
    context.queue_steering_message_callback = steering_log.append
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))

    # The call arguments use the camelCase key; the outcome metadata uses snake_case.
    requested_todo = {
        "content": "Write the introduction chapter",
        "activeForm": "Writing the introduction chapter",
        "status": "pending",
    }
    recorded_todo = {
        "content": "Write the introduction chapter",
        "active_form": "Writing the introduction chapter",
        "status": "pending",
    }
    todo_call = ToolCall(
        id="todo-observed-1",
        name="TodoWrite",
        arguments={"todos": [requested_todo]},
    )
    scripted_executor = FakeExecutor(
        [
            tool_outcome(
                tool_call=todo_call,
                output="Todos updated",
                is_error=False,
                metadata={"new_todos": [recorded_todo]},
            )
        ]
    )

    turn_summary = TurnSummary(final_response="")
    await runner.execute_batch(
        tool_calls=[todo_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=turn_summary,
        dod=dod,
        executor=scripted_executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert steering_log
    nudge = steering_log[-1]
    assert "Todo tracking is updated. Next step: create `01-introduction.html`." in nudge
    assert "Prefer one `write(file_path=..., content=...)` call" in nudge
5724
5725
@pytest.mark.asyncio
async def test_tool_batch_runner_bookkeeping_note_with_missing_artifact_requeues_resume_step(
    temp_dir: Path,
) -> None:
    """Expect a working-note call to be answered with a "resume creating" nudge.

    Setup: the plan declares two chapter files, only the first exists on disk,
    and the batch contains a single ``notepad_write_working`` call.
    """

    async def _confidence_must_not_run(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should not run in this scenario")

    async def _verification_must_not_run(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run in this scenario")

    guide_root = temp_dir / "guides" / "nginx"
    chapters = guide_root / "chapters"
    guide_root.mkdir(parents=True)
    chapters.mkdir()

    index_path = guide_root / "index.html"
    chapter_one = chapters / "01-getting-started.html"
    chapter_two = chapters / "02-installation.html"
    # chapter_two is declared in the plan but deliberately never written.
    index_path.write_text("<html></html>\n")
    chapter_one.write_text("<h1>One</h1>\n")

    plan_lines = [
        "# Implementation Plan",
        "",
        "## File Changes",
        f"- `{guide_root}/`",
        f"- `{chapters}/`",
        f"- `{index_path}`",
        f"- `{chapter_one}`",
        f"- `{chapter_two}`",
        "",
    ]
    plan_path = temp_dir / "implementation.md"
    plan_path.write_text("\n".join(plan_lines))

    context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=_confidence_must_not_run,
        verify_action=_verification_must_not_run,
        auto_recover=False,
    )
    steering_log: list[str] = []
    context.queue_steering_message_callback = steering_log.append
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))

    dod = create_definition_of_done("Create a multi-file nginx guide.")
    dod.implementation_plan = str(plan_path)
    tracked_todos = [
        {
            "content": "Create 01-getting-started.html",
            "active_form": "Creating 01-getting-started.html",
            "status": "completed",
        },
        {
            "content": "Create 02-installation.html",
            "active_form": "Creating 02-installation.html",
            "status": "pending",
        },
    ]
    sync_todos_to_definition_of_done(dod, tracked_todos, project_root=temp_dir)
    dod.touched_files.extend([str(index_path), str(chapter_one)])

    note_call = ToolCall(
        id="working-note",
        name="notepad_write_working",
        arguments={"content": "Creating the second chapter file: Installation"},
    )
    scripted_executor = FakeExecutor(
        [
            tool_outcome(
                tool_call=note_call,
                output="Working note recorded",
                is_error=False,
            )
        ]
    )

    turn_summary = TurnSummary(final_response="")
    await runner.execute_batch(
        tool_calls=[note_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=turn_summary,
        dod=dod,
        executor=scripted_executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert steering_log
    nudge = steering_log[-1]
    expected_fragments = (
        "Bookkeeping note is recorded. A declared output artifact is still missing.",
        "Resume by creating `02-installation.html` now.",
        "Make your next response the concrete mutation tool call itself",
        "refresh `TodoWrite`",
        "Do not spend the next turn on additional notes, rediscovery, verification, or final confirmation",
    )
    for fragment in expected_fragments:
        assert fragment in nudge
5840
5841
@pytest.mark.asyncio
async def test_tool_batch_runner_working_note_respects_discovery_first_pending_step(
    temp_dir: Path,
) -> None:
    """Expect a working note to be followed by a nudge toward the discovery item.

    Setup: no output files or directories exist yet, and the first pending
    item is an "examine the existing fortran guide" step. The nudge must point
    at that item and must not ask for ``index.html`` to be created.
    """

    async def _confidence_must_not_run(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def _verification_must_not_run(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run in this scenario")

    plan_lines = [
        "# Implementation Plan",
        "",
        "## File Changes",
        f"- `{temp_dir / 'guides' / 'nginx' / 'index.html'}`",
        f"- `{temp_dir / 'guides' / 'nginx' / 'chapters'}`",
        "",
    ]
    plan_path = temp_dir / "implementation.md"
    plan_path.write_text("\n".join(plan_lines))

    context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=_confidence_must_not_run,
        verify_action=_verification_must_not_run,
        auto_recover=False,
    )
    steering_log: list[str] = []
    context.queue_steering_message_callback = steering_log.append
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))

    dod = create_definition_of_done("Create a multi-file nginx guide.")
    dod.implementation_plan = str(plan_path)
    # The discovery step is listed first, ahead of the two creation steps.
    outstanding_steps = [
        "First, examine the existing fortran guide structure and content to understand the format",
        "Create the nginx directory structure",
        "Develop the main index.html file for the nginx guide",
    ]
    dod.pending_items.extend(outstanding_steps)

    note_call = ToolCall(
        id="working-note",
        name="notepad_write_working",
        arguments={"content": "Analyzing the fortran guide structure before creating nginx guide"},
    )
    scripted_executor = FakeExecutor(
        [
            tool_outcome(
                tool_call=note_call,
                output="Working note recorded",
                is_error=False,
            )
        ]
    )

    turn_summary = TurnSummary(final_response="")
    await runner.execute_batch(
        tool_calls=[note_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=turn_summary,
        dod=dod,
        executor=scripted_executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert steering_log
    nudge = steering_log[-1]
    assert (
        "Continue with the next pending item: `First, examine the existing fortran guide structure and content to understand the format`."
        in nudge
    )
    assert "one concrete evidence-gathering tool call" in nudge
    assert "Resume by creating `index.html` now." not in nudge
5934
5935
@pytest.mark.asyncio
async def test_tool_batch_runner_working_note_prefers_declared_output_gap_over_stale_discovery(
    temp_dir: Path,
) -> None:
    """Expect a working note to yield a "resume creating chapter 2" nudge.

    Setup: the index and first chapter already exist and the index links to
    two further chapters that are missing. A discovery item is still listed
    first in the pending items, and the nudge must not point back at it.
    """

    async def _confidence_must_not_run(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def _verification_must_not_run(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run in this scenario")

    guide_root = temp_dir / "guides" / "nginx"
    chapters_dir = guide_root / "chapters"
    chapters_dir.mkdir(parents=True)

    index_path = guide_root / "index.html"
    first_chapter = chapters_dir / "01-introduction.html"
    # Written without a trailing newline; chapters 02 and 03 are linked but absent.
    index_links = [
        '<a href="chapters/01-introduction.html">Introduction</a>',
        '<a href="chapters/02-installation.html">Installation</a>',
        '<a href="chapters/03-configuration.html">Configuration</a>',
    ]
    index_path.write_text("\n".join(index_links))
    first_chapter.write_text("<h1>Introduction</h1>\n")

    plan_lines = [
        "# Implementation Plan",
        "",
        "## File Changes",
        f"- `{guide_root / 'index.html'}`",
        f"- `{chapters_dir}/`",
        "",
    ]
    plan_path = temp_dir / "implementation.md"
    plan_path.write_text("\n".join(plan_lines))

    context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=_confidence_must_not_run,
        verify_action=_verification_must_not_run,
        auto_recover=False,
    )
    steering_log: list[str] = []
    context.queue_steering_message_callback = steering_log.append
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))

    dod = create_definition_of_done("Create a multi-file nginx guide.")
    dod.implementation_plan = str(plan_path)
    outstanding_steps = [
        "First, examine the existing fortran guide structure and content to understand the format",
        "Create chapter files following the established pattern",
    ]
    dod.pending_items.extend(outstanding_steps)
    dod.touched_files.extend([str(index_path), str(first_chapter)])

    note_call = ToolCall(
        id="working-note",
        name="notepad_write_working",
        arguments={"content": "Created index and first chapter; next is chapter 2"},
    )
    scripted_executor = FakeExecutor(
        [
            tool_outcome(
                tool_call=note_call,
                output="Working note recorded",
                is_error=False,
            )
        ]
    )

    turn_summary = TurnSummary(final_response="")
    await runner.execute_batch(
        tool_calls=[note_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=turn_summary,
        dod=dod,
        executor=scripted_executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert steering_log
    nudge = steering_log[-1]
    assert "Bookkeeping note is recorded. A declared output artifact is still missing." in nudge
    assert "Resume by creating `02-installation.html` now." in nudge
    assert "Continue with the next pending item: `First, examine the existing fortran guide structure" not in nudge
6041
6042
@pytest.mark.asyncio
async def test_tool_batch_runner_shallow_glob_does_not_handoff_before_content_read(
    temp_dir: Path,
) -> None:
    """Expect no steering message after a ``glob`` that returns only directories.

    Setup: the reference fortran guide has an empty ``chapters/`` directory,
    the first pending item is a discovery step, and the scripted glob output
    lists just the two directory paths.
    """

    async def _confidence_must_not_run(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def _verification_must_not_run(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run in this scenario")

    fortran_root = temp_dir / "Loader" / "guides" / "fortran"
    chapters_dir = fortran_root / "chapters"
    chapters_dir.mkdir(parents=True)

    plan_lines = [
        "# Implementation Plan",
        "",
        "## File Changes",
        f"- `{temp_dir / 'Loader' / 'guides' / 'nginx' / 'index.html'}`",
        f"- `{temp_dir / 'Loader' / 'guides' / 'nginx' / 'chapters'}`",
        "",
    ]
    plan_path = temp_dir / "implementation.md"
    plan_path.write_text("\n".join(plan_lines))

    context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=_confidence_must_not_run,
        verify_action=_verification_must_not_run,
        auto_recover=False,
    )
    steering_log: list[str] = []
    context.queue_steering_message_callback = steering_log.append
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))

    dod = create_definition_of_done("Create a multi-file nginx guide.")
    dod.implementation_plan = str(plan_path)
    outstanding_steps = [
        "First, examine the existing fortran guide structure and content",
        "Create the nginx directory structure",
        "Develop the main index.html file for nginx guide",
    ]
    dod.pending_items.extend(outstanding_steps)

    glob_call = ToolCall(
        id="glob-1",
        name="glob",
        arguments={"pattern": "**", "path": str(fortran_root)},
    )
    # The scripted result names directories only; no file content is returned.
    scripted_executor = FakeExecutor(
        [
            tool_outcome(
                tool_call=glob_call,
                output=f"{fortran_root}\n{chapters_dir}",
                is_error=False,
            )
        ]
    )

    turn_summary = TurnSummary(final_response="")
    await runner.execute_batch(
        tool_calls=[glob_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=turn_summary,
        dod=dod,
        executor=scripted_executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert steering_log == []
6132
6133
@pytest.mark.asyncio
async def test_tool_batch_runner_hands_off_noop_toc_edit_when_file_is_already_valid(
    temp_dir: Path,
) -> None:
    """Expect no steering message when a blocked no-op edit targets a correct index.

    Setup: ``index.html`` already links to both chapter files with link texts
    equal to each chapter's ``<h1>``; the ``edit`` call uses identical old and
    new strings and its scripted outcome is a BLOCKED error.
    """

    async def _confidence_must_not_run(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should not run in this scenario")

    async def _verification_must_not_run(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run in this scenario")

    prompt = (
        "Have a look at ~/Loader/guides/fortran/index.html, then "
        "~/Loader/guides/fortran/chapters. The table of contents links in "
        "index.html are inaccurate and the href’s are wrong. Let’s update the "
        "links and their link texts to be correct."
    )

    chapters = temp_dir / "chapters"
    chapters.mkdir()
    chapter_sources = (
        ("01-introduction.html", "<h1>Chapter 1: Introduction to Fortran</h1>\n"),
        ("02-setup.html", "<h1>Chapter 2: Setting Up Your Environment</h1>\n"),
    )
    for file_name, heading in chapter_sources:
        (chapters / file_name).write_text(heading)

    # This block is the index content and also both sides of the edit request.
    current_block = (
        "<h2>Table of Contents</h2>\n"
        ' <ul class="chapter-list">\n'
        ' <li><a href="chapters/01-introduction.html">Chapter 1: Introduction to Fortran</a></li>\n'
        ' <li><a href="chapters/02-setup.html">Chapter 2: Setting Up Your Environment</a></li>\n'
        " </ul>\n"
    )
    index_path = temp_dir / "index.html"
    index_path.write_text(current_block)

    context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=_confidence_must_not_run,
        verify_action=_verification_must_not_run,
        auto_recover=False,
    )
    context.session.current_task = prompt  # type: ignore[attr-defined]
    steering_log: list[str] = []
    context.queue_steering_message_callback = steering_log.append
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))

    edit_call = ToolCall(
        id="edit-1",
        name="edit",
        arguments={
            "file_path": str(index_path),
            "old_string": current_block,
            "new_string": current_block,
        },
    )
    blocked_outcome = tool_outcome(
        tool_call=edit_call,
        output=(
            "[Blocked - old_string and new_string are identical - no change "
            "would occur] Suggestion: Provide different old and new strings"
        ),
        is_error=True,
        state=ToolExecutionState.BLOCKED,
    )
    scripted_executor = FakeExecutor([blocked_outcome])

    await runner.execute_batch(
        tool_calls=[edit_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=TurnSummary(final_response=""),
        dod=create_definition_of_done(prompt),
        executor=scripted_executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert steering_log == []
6227
6228
def test_tool_batch_runner_blocked_noop_edit_nudge_stays_on_active_repair_target(
    temp_dir: Path,
) -> None:
    """Expect the blocked no-op edit nudge to keep pointing at the repair target.

    Synchronous test: it calls ``_queue_blocked_html_edit_nudge`` directly
    with an ``edit`` call whose old and new strings are identical, after
    seeding the session with a "Repair focus" assistant message for the same
    file.
    """

    async def _confidence_must_not_run(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def _verification_must_not_run(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run in this scenario")

    repair_target = temp_dir / "guide" / "chapters" / "04-basic-usage.html"
    repair_focus = Message(
        role=Role.ASSISTANT,
        content=(
            "Repair focus:\n"
            f"- Fix the broken local reference `05-advanced-topics.html` in `{repair_target}`.\n"
            f"- Immediate next step: edit `{repair_target}`.\n"
            f"- If the broken reference should remain, create `{temp_dir / 'guide' / 'chapters' / '05-advanced-topics.html'}`; otherwise remove or replace `05-advanced-topics.html`.\n"
        ),
    )
    context = build_context(
        temp_dir=temp_dir,
        messages=[repair_focus],
        safeguards=FakeSafeguards(),
        assess_confidence=_confidence_must_not_run,
        verify_action=_verification_must_not_run,
    )
    queued: list[str] = []
    context.queue_steering_message_callback = queued.append
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))
    dod = create_definition_of_done("Repair a guide page.")

    # Identical old/new strings make this edit a no-op.
    noop_edit = ToolCall(
        id="edit-1",
        name="edit",
        arguments={
            "file_path": str(repair_target),
            "old_string": "same",
            "new_string": "same",
        },
    )
    runner._queue_blocked_html_edit_nudge(
        noop_edit,
        "[Blocked - old_string and new_string are identical - no change would occur] Suggestion: Provide different old and new strings",
        dod=dod,
    )

    assert queued
    first_nudge = queued[0]
    assert str(repair_target) in first_nudge
    assert "no on-disk change" in first_nudge
    assert "replace the surrounding block" in first_nudge
    assert "Do not reopen unrelated reference materials" in first_nudge
6289
6290
def test_tool_batch_runner_blocked_noop_edit_after_full_build_prefers_verification(
    temp_dir: Path,
) -> None:
    """With every planned file on disk, a blocked no-op edit nudges toward verification."""

    # Both hooks fail the test if they are ever invoked.
    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run in this scenario")

    guide_root = temp_dir / "guide"
    chapters = guide_root / "chapters"
    chapters.mkdir(parents=True)
    index_path = guide_root / "index.html"
    chapter_one = chapters / "01-introduction.html"
    for page in (index_path, chapter_one):
        page.write_text("<html></html>\n")

    # The plan lists exactly the two files created above.
    plan_lines = [
        "# Implementation Plan",
        "",
        "## File Changes",
        f"- `{index_path}`",
        f"- `{chapter_one}`",
        "",
    ]
    implementation_plan = temp_dir / "implementation.md"
    implementation_plan.write_text("\n".join(plan_lines))

    repair_focus = Message(
        role=Role.ASSISTANT,
        content=(
            "Repair focus:\n"
            f"- Confirm the final guide state in `{index_path}`.\n"
            f"- Immediate next step: verify `{index_path}` if no concrete mismatch remains.\n"
        ),
    )
    context = build_context(
        temp_dir=temp_dir,
        messages=[repair_focus],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
    )
    steering_messages: list[str] = []
    context.queue_steering_message_callback = steering_messages.append
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))

    dod = create_definition_of_done("Create a multi-file guide.")
    dod.implementation_plan = str(implementation_plan)
    dod.touched_files.extend([str(index_path), str(chapter_one)])
    dod.verification_commands = [f"ls -la {guide_root}"]

    noop_edit = ToolCall(
        id="edit-1",
        name="edit",
        arguments={
            "file_path": str(index_path),
            "old_string": "same",
            "new_string": "same",
        },
    )
    blocked_output = "[Blocked - old_string and new_string are identical - no change would occur] Suggestion: Provide different old and new strings"

    runner._queue_blocked_html_edit_nudge(noop_edit, blocked_output, dod=dod)

    assert steering_messages
    nudge = steering_messages[0]
    assert "All explicitly planned artifacts already exist." in nudge
    assert "Move to verification or final confirmation using the files already on disk." in nudge
    assert "replace the surrounding block" not in nudge
6374
6375
6376 async def _noop_emit(event: AgentEvent) -> None:
6377 return None
6378
6379
@pytest.mark.asyncio
async def test_tool_batch_runner_marks_verification_planned_after_new_mutation(
    temp_dir: Path,
) -> None:
    """A successful write on a fresh DoD marks verification as "planned".

    Checks the DoD state, the first verification attempt id/number, and the
    last workflow timeline entry recorded on the turn summary.
    """

    # Both hooks fail the test if they are ever invoked.
    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run for this scenario")

    context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
    )
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))
    tool_call = ToolCall(
        id="write-1",
        name="write",
        arguments={"file_path": str(temp_dir / "README.md"), "content": "updated\n"},
    )
    executor = FakeExecutor(
        [tool_outcome(tool_call=tool_call, output="wrote file", is_error=False)]
    )
    summary = TurnSummary(final_response="")
    dod = create_definition_of_done("Update README and verify it still works.")

    await runner.execute_batch(
        tool_calls=[tool_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        # Emitted events are never inspected here, so the shared no-op emitter suffices.
        emit=_noop_emit,
        summary=summary,
        dod=dod,
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert dod.last_verification_result == "planned"
    assert dod.verification_commands
    assert "Collect verification evidence" in dod.pending_items
    assert dod.active_verification_attempt_id == "verification-attempt-1"
    assert dod.active_verification_attempt_number == 1

    latest_entry = summary.workflow_timeline[-1]
    assert latest_entry.reason_code == "verification_planned"
    assert latest_entry.policy_outcome == "planned"
    observation = latest_entry.verification_observations[0]
    assert observation.status == "planned"
    assert observation.attempt_id == "verification-attempt-1"
    assert observation.attempt_number == 1
6451
6452
@pytest.mark.asyncio
async def test_tool_batch_runner_does_not_mark_verification_planned_after_setup_only_mkdir(
    temp_dir: Path,
) -> None:
    """A lone ``mkdir -p`` bash call must not put verification into the planned state."""

    # Both hooks fail the test if they are ever invoked.
    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run in this scenario")

    context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
    )
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))
    nginx_root = temp_dir / "Loader" / "guides" / "nginx"
    chapters = nginx_root / "chapters"
    implementation_plan = temp_dir / "implementation.md"
    implementation_plan.write_text(
        "\n".join(
            [
                "# Implementation Plan",
                "",
                "## File Changes",
                f"- `{chapters}/`",
                f"- `{nginx_root / 'index.html'}`",
                "",
            ]
        )
    )

    tool_call = ToolCall(
        id="mkdir-1",
        name="bash",
        arguments={"command": f"mkdir -p {chapters}"},
    )
    executor = FakeExecutor(
        [tool_outcome(tool_call=tool_call, output="", is_error=False)]
    )
    summary = TurnSummary(final_response="")
    dod = create_definition_of_done("Create an equally thorough nginx guide with chapters.")
    dod.implementation_plan = str(implementation_plan)

    await runner.execute_batch(
        tool_calls=[tool_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        # Emitted events are never inspected here, so the shared no-op emitter suffices.
        emit=_noop_emit,
        summary=summary,
        dod=dod,
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert dod.last_verification_result is None
    assert "Collect verification evidence" not in dod.pending_items
    assert not any(
        entry.reason_code == "verification_planned" for entry in summary.workflow_timeline
    )
6531
6532
@pytest.mark.asyncio
async def test_tool_batch_runner_does_not_mark_verification_planned_while_chapter_build_pending(
    temp_dir: Path,
) -> None:
    """Writing the index while a chapter item is still pending must not plan verification."""

    # Both hooks fail the test if they are ever invoked.
    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run in this scenario")

    context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
    )
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))
    nginx_root = temp_dir / "Loader" / "guides" / "nginx"
    chapters = nginx_root / "chapters"
    chapters.mkdir(parents=True)
    index_path = nginx_root / "index.html"
    implementation_plan = temp_dir / "implementation.md"
    implementation_plan.write_text(
        "\n".join(
            [
                "# Implementation Plan",
                "",
                "## File Changes",
                f"- `{nginx_root}/`",
                f"- `{chapters}/`",
                f"- `{index_path}`",
                "",
            ]
        )
    )

    tool_call = ToolCall(
        id="write-index",
        name="write",
        arguments={"file_path": str(index_path), "content": "<html></html>\n"},
    )
    executor = FakeExecutor(
        [tool_outcome(tool_call=tool_call, output="wrote file", is_error=False)]
    )
    summary = TurnSummary(final_response="")
    dod = create_definition_of_done("Create a multi-file nginx guide.")
    dod.implementation_plan = str(implementation_plan)
    dod.pending_items.extend(
        [
            "Develop the main index.html file with proper structure",
            "Create first nginx chapter",
        ]
    )

    await runner.execute_batch(
        tool_calls=[tool_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        # Emitted events are never inspected here, so the shared no-op emitter suffices.
        emit=_noop_emit,
        summary=summary,
        dod=dod,
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert dod.last_verification_result is None
    assert "Collect verification evidence" not in dod.pending_items
    assert "Create first nginx chapter" in dod.pending_items
    assert not any(
        entry.reason_code == "verification_planned" for entry in summary.workflow_timeline
    )
6621
6622
@pytest.mark.asyncio
async def test_tool_batch_runner_marks_passed_verification_stale_after_new_mutation(
    temp_dir: Path,
) -> None:
    """A write after a passed verification flips the DoD to "stale".

    Also checks that the stored evidence is cleared, the evidence item moves
    back to pending, a second attempt becomes active, and the last timeline
    entry records the stale observation.
    """

    # Both hooks fail the test if they are ever invoked.
    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run for this scenario")

    context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
    )
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))
    tool_call = ToolCall(
        id="write-1",
        name="write",
        arguments={"file_path": str(temp_dir / "README.md"), "content": "updated\n"},
    )
    executor = FakeExecutor(
        [tool_outcome(tool_call=tool_call, output="wrote file", is_error=False)]
    )
    summary = TurnSummary(final_response="")

    # Seed the DoD as if verification attempt 1 had already passed.
    dod = create_definition_of_done("Update README and verify it still works.")
    dod.verification_commands = ["uv run pytest -q"]
    dod.last_verification_result = "passed"
    dod.verification_attempt_counter = 1
    dod.active_verification_attempt_id = "verification-attempt-1"
    dod.active_verification_attempt_number = 1
    dod.evidence = [
        VerificationEvidence(
            command="uv run pytest -q",
            passed=True,
            stdout="401 passed",
            kind="test",
        )
    ]
    dod.completed_items.append("Collect verification evidence")

    await runner.execute_batch(
        tool_calls=[tool_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        # Emitted events are never inspected here, so the shared no-op emitter suffices.
        emit=_noop_emit,
        summary=summary,
        dod=dod,
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert dod.last_verification_result == "stale"
    assert dod.evidence == []
    assert "Collect verification evidence" in dod.pending_items
    assert "Collect verification evidence" not in dod.completed_items
    assert dod.active_verification_attempt_id == "verification-attempt-2"
    assert dod.active_verification_attempt_number == 2

    latest_entry = summary.workflow_timeline[-1]
    assert latest_entry.reason_code == "verification_stale"
    assert latest_entry.policy_outcome == "stale"
    observation = latest_entry.verification_observations[0]
    assert observation.status == "stale"
    assert observation.attempt_id == "verification-attempt-1"
    assert observation.attempt_number == 1
    assert observation.supersedes_attempt_id == "verification-attempt-2"
    assert observation.command == "uv run pytest -q"
6717
6718
def test_tool_batch_runner_blocked_active_repair_nudge_uses_repair_scope(temp_dir: Path) -> None:
    """The active-repair nudge names both the repair target and the missing chapter path."""

    # Both hooks fail the test if they are ever invoked.
    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run in this scenario")

    guide_root = temp_dir / "guide"
    repair_target = guide_root / "index.html"
    missing_chapter = guide_root / "chapters" / "01-getting-started.html"
    repair_focus = Message(
        role=Role.ASSISTANT,
        content=(
            "Repair focus:\n"
            f"- Fix the broken local reference `chapters/01-getting-started.html` in `{repair_target}`.\n"
            f"- Immediate next step: edit `{repair_target}`.\n"
            f"- If the broken reference should remain, create `{missing_chapter}`; otherwise remove or replace `chapters/01-getting-started.html`.\n"
        ),
    )
    context = build_context(
        temp_dir=temp_dir,
        messages=[repair_focus],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
    )
    steering_messages: list[str] = []
    context.queue_steering_message_callback = steering_messages.append
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))

    runner._queue_blocked_active_repair_nudge(
        "[Blocked - active repair scope: verification already identified the repair target.]"
    )

    assert steering_messages
    nudge = steering_messages[0]
    assert str(repair_target) in nudge
    assert str(missing_chapter) in nudge
    assert "Do not reopen unrelated reference materials" in nudge
6765
6766
def test_tool_batch_runner_blocked_active_repair_mutation_nudge_uses_allowed_paths(
    temp_dir: Path,
) -> None:
    """The repair-mutation nudge lists the repair target and the stylesheet it may create."""

    # Both hooks fail the test if they are ever invoked.
    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run in this scenario")

    guide_root = temp_dir / "guide"
    repair_target = guide_root / "chapters" / "05-advanced-configurations.html"
    stylesheet = guide_root / "styles.css"
    repair_focus = Message(
        role=Role.ASSISTANT,
        content=(
            "Repair focus:\n"
            f"- Fix the broken local reference `../styles.css` in `{repair_target}`.\n"
            f"- Immediate next step: edit `{repair_target}`.\n"
            f"- If the broken reference should remain, create `{stylesheet}`; otherwise remove or replace `../styles.css`.\n"
        ),
    )
    context = build_context(
        temp_dir=temp_dir,
        messages=[repair_focus],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
    )
    steering_messages: list[str] = []
    context.queue_steering_message_callback = steering_messages.append
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))

    runner._queue_blocked_active_repair_mutation_nudge(
        "[Blocked - active repair mutation scope: verification already identified the repair target.]"
    )

    assert steering_messages
    nudge = steering_messages[0]
    assert str(repair_target) in nudge
    assert str(stylesheet) in nudge
    assert "before widening the change set" in nudge
6816
6817
def test_tool_batch_runner_blocked_late_reference_drift_nudge_points_to_missing_artifact(
    temp_dir: Path,
) -> None:
    """The late-reference-drift nudge names the one planned chapter not yet on disk."""

    # Both hooks fail the test if they are ever invoked.
    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run in this scenario")

    context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
    )
    steering_messages: list[str] = []
    context.queue_steering_message_callback = steering_messages.append
    store = DefinitionOfDoneStore(temp_dir)
    dod = create_definition_of_done("Create a multi-file guide from a reference")

    # The plan lists four files using paths relative to the project root.
    plan_lines = [
        "# File Changes",
        "- `guide/index.html`",
        "- `guide/chapters/01-getting-started.html`",
        "- `guide/chapters/02-installation.html`",
        "- `guide/chapters/03-first-website.html`",
    ]
    plan_path = temp_dir / "implementation.md"
    plan_path.write_text("".join(f"{line}\n" for line in plan_lines))
    dod.implementation_plan = str(plan_path)

    # Only three of the four planned files are created; chapter 03 stays missing.
    guide_root = temp_dir / "guide"
    chapters_dir = guide_root / "chapters"
    chapters_dir.mkdir(parents=True, exist_ok=True)
    existing_files = (
        (guide_root / "index.html", "index"),
        (chapters_dir / "01-getting-started.html", "one"),
        (chapters_dir / "02-installation.html", "two"),
    )
    for artifact, body in existing_files:
        artifact.write_text(body)
    runner = ToolBatchRunner(context, store)

    runner._queue_blocked_late_reference_drift_nudge(
        "[Blocked - late reference drift: several planned artifacts already exist.]",
        dod=dod,
    )

    assert steering_messages
    nudge = steering_messages[0]
    assert "03-first-website.html" in nudge
    assert "older reference materials" in nudge
6870
6871
def test_tool_batch_runner_blocked_completed_artifact_scope_nudge_prefers_verification(
    temp_dir: Path,
) -> None:
    """With all planned artifacts present, the scope nudge switches the context to verify mode."""

    # Both hooks fail the test if they are ever invoked.
    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run in this scenario")

    guide_root = temp_dir / "guide"
    chapters = guide_root / "chapters"
    guide_root.mkdir(parents=True)
    chapters.mkdir()
    index_path = guide_root / "index.html"
    chapter_one = chapters / "01-getting-started.html"
    chapter_two = chapters / "02-installation.html"
    for artifact, body in ((index_path, "index"), (chapter_one, "one"), (chapter_two, "two")):
        artifact.write_text(body)

    # Every path named in the plan exists on disk at this point.
    planned_paths = (guide_root, chapters, index_path, chapter_one, chapter_two)
    plan_lines = ["# Implementation Plan", "", "## File Changes"]
    plan_lines.extend(f"- `{planned}`" for planned in planned_paths)
    plan_lines.append("")
    implementation_plan = temp_dir / "implementation.md"
    implementation_plan.write_text("\n".join(plan_lines))

    context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
    )
    steering_messages: list[str] = []
    context.queue_steering_message_callback = steering_messages.append
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))
    dod = create_definition_of_done("Create a multi-file guide from a reference")
    dod.implementation_plan = str(implementation_plan)
    dod.verification_commands = [f"ls -la {guide_root}"]
    pending_todo = {
        "content": "Verify all guide files are linked and complete",
        "active_form": "Working on: Verify all guide files are linked and complete",
        "status": "pending",
    }
    sync_todos_to_definition_of_done(dod, [pending_todo], project_root=temp_dir)

    runner._queue_blocked_completed_artifact_scope_nudge(
        "[Blocked - completed artifact set scope: all explicitly planned artifacts already exist.]",
        dod=dod,
    )

    assert steering_messages
    assert context.workflow_mode == "verify"
    nudge = steering_messages[0]
    assert "All explicitly planned artifacts already exist." in nudge
    assert "Verify all guide files are linked and complete" in nudge
    assert "Do not reopen earlier reference materials." in nudge
    assert "Verification should run next" in nudge
6954
6955
def test_tool_batch_runner_blocked_post_build_audit_nudge_switches_to_verify(
    temp_dir: Path,
) -> None:
    """A post-build audit block queues a nudge and leaves the context in verify mode."""

    # Both hooks fail the test if they are ever invoked.
    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run in this scenario")

    guide_root = temp_dir / "guide"
    chapters = guide_root / "chapters"
    guide_root.mkdir(parents=True)
    chapters.mkdir()
    index_path = guide_root / "index.html"
    chapter_one = chapters / "01-getting-started.html"
    chapter_two = chapters / "02-installation.html"
    for artifact, body in ((index_path, "index"), (chapter_one, "one"), (chapter_two, "two")):
        artifact.write_text(body)

    # Every path named in the plan exists on disk at this point.
    planned_paths = (guide_root, chapters, index_path, chapter_one, chapter_two)
    plan_lines = ["# Implementation Plan", "", "## File Changes"]
    plan_lines.extend(f"- `{planned}`" for planned in planned_paths)
    plan_lines.append("")
    implementation_plan = temp_dir / "implementation.md"
    implementation_plan.write_text("\n".join(plan_lines))

    context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
    )
    steering_messages: list[str] = []
    context.queue_steering_message_callback = steering_messages.append
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))
    dod = create_definition_of_done("Create a multi-file guide from a reference")
    dod.implementation_plan = str(implementation_plan)
    dod.verification_commands = [f"ls -la {guide_root}"]

    runner._queue_blocked_completed_artifact_scope_nudge(
        "[Blocked - post-build audit loop: all explicitly planned artifacts already exist.]",
        dod=dod,
    )

    assert steering_messages
    assert context.workflow_mode == "verify"
    nudge = steering_messages[0]
    assert "All explicitly planned artifacts already exist." in nudge
    assert "move to verification or final confirmation" in nudge
7025
7026
@pytest.mark.asyncio
async def test_tool_batch_runner_does_not_halt_on_repeated_post_build_audit_blocks(
    temp_dir: Path,
) -> None:
    """Three identical blocked audit calls in one batch must not halt the runner.

    Also checks that the consecutive-error count stays at zero, the context
    ends in verify mode, and a verification nudge was queued.
    """

    # Both hooks fail the test if they are ever invoked.
    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run in this scenario")

    guide_root = temp_dir / "guide"
    chapters = guide_root / "chapters"
    guide_root.mkdir(parents=True)
    chapters.mkdir()
    index_path = guide_root / "index.html"
    chapter_one = chapters / "01-getting-started.html"
    chapter_two = chapters / "02-installation.html"
    index_path.write_text("index")
    chapter_one.write_text("one")
    chapter_two.write_text("two")

    implementation_plan = temp_dir / "implementation.md"
    implementation_plan.write_text(
        "\n".join(
            [
                "# Implementation Plan",
                "",
                "## File Changes",
                f"- `{guide_root}`",
                f"- `{chapters}`",
                f"- `{index_path}`",
                f"- `{chapter_one}`",
                f"- `{chapter_two}`",
                "",
            ]
        )
    )

    context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
    )
    queued: list[str] = []
    context.queue_steering_message_callback = queued.append
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))
    dod = create_definition_of_done("Create a multi-file guide from a reference")
    dod.implementation_plan = str(implementation_plan)
    dod.verification_commands = [f"ls -la {guide_root}"]

    blocked_message = (
        "[Blocked - post-build audit loop: all explicitly planned artifacts already exist.]"
    )
    # Three audit calls with the same command, each scripted to come back blocked.
    tool_calls = [
        ToolCall(
            id=f"audit-{index}",
            name="bash",
            arguments={"command": f"cd {temp_dir} && ls -la guide/chapters/"},
        )
        for index in range(1, 4)
    ]
    executor = FakeExecutor(
        [
            tool_outcome(
                tool_call=tool_call,
                output=blocked_message,
                is_error=True,
                state=ToolExecutionState.BLOCKED,
            )
            for tool_call in tool_calls
        ]
    )

    result = await runner.execute_batch(
        tool_calls=tool_calls,
        tool_source="native",
        pending_tool_calls_seen=set(),
        # Emitted events are never inspected here, so the shared no-op emitter suffices.
        emit=_noop_emit,
        summary=TurnSummary(final_response=""),
        dod=dod,
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert result.halted is False
    assert result.consecutive_errors == 0
    assert context.workflow_mode == "verify"
    assert queued
    assert any("move to verification or final confirmation" in message for message in queued)
7134
7135
def test_tool_batch_runner_blocked_html_declared_target_nudge_uses_closest_declared_target(
    temp_dir: Path,
) -> None:
    """When the block message lists a closest declared target, the nudge repeats it."""

    # Both hooks fail the test if they are ever invoked.
    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run in this scenario")

    context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
    )
    steering_messages: list[str] = []
    context.queue_steering_message_callback = steering_messages.append
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))

    chapter_path = str(temp_dir / "guide" / "chapters" / "01-introduction.html")
    blocked_write = ToolCall(
        id="write-ch1",
        name="write",
        arguments={"file_path": chapter_path},
    )
    # This message ends with a "Closest declared local targets" clause.
    blocked_output = (
        "[Blocked - HTML page introduces new local targets outside the current declared artifact set] "
        "Suggestion: Keep non-root HTML pages within the root-declared local-link set and avoid "
        "introducing new sibling targets that the guide root does not declare, for example fix: 02-setup.html. "
        "Already-declared local targets include: chapters/01-introduction.html, chapters/02-installation.html, "
        "chapters/03-configuration.html. Closest declared local targets include: chapters/02-installation.html"
    )

    runner._queue_blocked_html_declared_target_nudge(blocked_write, blocked_output)

    assert steering_messages
    nudge = steering_messages[0]
    assert chapter_path in nudge
    assert "`chapters/02-installation.html`" in nudge
    assert "same file now" in nudge
7184
7185
def test_tool_batch_runner_blocked_html_declared_target_nudge_without_close_match(
    temp_dir: Path,
) -> None:
    """Without a closest-target clause, the nudge falls back to the declared target set."""

    # Both hooks fail the test if they are ever invoked.
    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run in this scenario")

    context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
    )
    steering_messages: list[str] = []
    context.queue_steering_message_callback = steering_messages.append
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))

    blocked_write = ToolCall(
        id="write-ch1",
        name="write",
        arguments={"file_path": str(temp_dir / "guide" / "chapters" / "introduction.html")},
    )
    # This message has no "Closest declared local targets" clause.
    blocked_output = (
        "[Blocked - HTML page introduces new local targets outside the current declared artifact set] "
        "Suggestion: Keep non-root HTML pages within the root-declared local-link set and avoid "
        "introducing new sibling targets that the guide root does not declare; remove or replace "
        "undeclared hrefs like: troubleshooting.html. "
        "Already-declared local targets include: chapters/introduction.html, chapters/installation.html, "
        "chapters/configuration.html."
    )

    runner._queue_blocked_html_declared_target_nudge(blocked_write, blocked_output)

    assert steering_messages
    nudge = steering_messages[0]
    assert "Remove the invented hrefs or keep local links within the declared target set" in nudge
    assert "`chapters/installation.html`" in nudge
    assert "closest declared target(s)" not in nudge
7235
7236
def test_tool_batch_runner_blocked_html_declared_file_creation_nudge_points_to_root(
    temp_dir: Path,
) -> None:
    """A blocked undeclared-file creation yields a nudge that names the guide root."""

    # Both hooks fail the test if they are ever invoked.
    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run in this scenario")

    context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
    )
    steering_messages: list[str] = []
    context.queue_steering_message_callback = steering_messages.append
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))
    dod = create_definition_of_done("Create a guide.")

    guide_root = temp_dir / "guide"
    target = guide_root / "chapters" / "troubleshooting.html"
    # Resolved once; the same path appears in the block message and in the assertion.
    resolved_index = (guide_root / "index.html").resolve(strict=False)
    blocked_write = ToolCall(
        id="write-troubleshooting",
        name="write",
        arguments={"file_path": str(target)},
    )
    blocked_output = (
        "[Blocked - HTML file creation falls outside the current declared artifact set] "
        "Suggestion: Keep new non-root HTML files within the root-declared artifact set and "
        f"update the guide root `{resolved_index}` "
        "before creating undeclared sibling pages, for example: chapters/troubleshooting.html. "
        "Already-declared local targets include: chapters/advanced-topics.html, "
        "chapters/basic-usage.html, chapters/configuration.html"
    )

    runner._queue_blocked_html_declared_file_creation_nudge(
        blocked_write,
        blocked_output,
        dod=dod,
    )

    assert steering_messages
    nudge = steering_messages[0]
    assert "update" in nudge.lower()
    assert str(resolved_index) in nudge
    assert "`chapters/troubleshooting.html`" in nudge
    assert "retry the file creation" in nudge
7290
7291
def test_tool_batch_runner_blocked_html_declared_file_creation_after_outputs_exist_prefers_verify(
    temp_dir: Path,
) -> None:
    """Check the file-creation nudge once every planned artifact is on disk.

    The guide root and both planned chapters are written to disk and listed
    in the implementation plan. A blocked write for an extra chapter is then
    expected to queue a message that steers towards verification instead of
    updating the guide root.
    """

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        # Guard stub: the test fails loudly if confidence scoring is invoked.
        raise AssertionError("Confidence scoring should not run in this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        # Guard stub: the test fails loudly if verification is invoked.
        raise AssertionError("Verification should not run in this scenario")

    # Lay out the guide on disk: a root page plus the two planned chapters.
    guide = temp_dir / "guide"
    chapters = guide / "chapters"
    guide.mkdir()
    chapters.mkdir()
    index = guide / "index.html"
    intro_page = chapters / "01-introduction.html"
    install_page = chapters / "02-installation.html"

    index_lines = [
        '<a href="chapters/01-introduction.html">Intro</a>',
        '<a href="chapters/02-installation.html">Install</a>',
        '<a href="../index.html">Back</a>',
        "",
    ]
    index.write_text("\n".join(index_lines))
    intro_page.write_text("<html></html>\n")
    install_page.write_text("<html></html>\n")

    # The plan lists exactly the three files created above.
    implementation_plan = temp_dir / "implementation.md"
    plan_lines = [
        "# Implementation Plan",
        "",
        "## File Changes",
        f"- `{index}`",
        f"- `{intro_page}`",
        f"- `{install_page}`",
        "",
    ]
    implementation_plan.write_text("\n".join(plan_lines))

    context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
    )
    # Capture every steering message the runner queues.
    steering_messages: list[str] = []
    context.queue_steering_message_callback = steering_messages.append
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))

    dod = create_definition_of_done("Create a guide.")
    dod.implementation_plan = str(implementation_plan)
    dod.verification_commands = [f"ls -la {guide}"]
    dod.touched_files = [str(page) for page in (index, intro_page, install_page)]

    # An extra chapter that is neither planned nor linked from the root.
    extra_page = chapters / "08-advanced-configuration.html"
    write_call = ToolCall(
        id="write-extra",
        name="write",
        arguments={"file_path": str(extra_page)},
    )
    blocked_text = (
        "[Blocked - HTML file creation falls outside the current declared artifact set] "
        "Suggestion: Keep new non-root HTML files within the root-declared artifact set and "
        f"update the guide root `{index.resolve(strict=False)}` before creating undeclared sibling pages, "
        "for example: chapters/08-advanced-configuration.html."
    )
    runner._queue_blocked_html_declared_file_creation_nudge(
        write_call,
        blocked_text,
        dod=dod,
    )

    assert steering_messages
    nudge = steering_messages[0]
    expected_fragments = (
        "All explicitly planned artifacts already exist on disk.",
        "Do not expand the output set with `chapters/08-advanced-configuration.html`.",
        "Move to verification or final confirmation using the files already on disk.",
    )
    for fragment in expected_fragments:
        assert fragment in nudge
    assert "update the guide root" not in nudge
7379
7380
def test_tool_batch_runner_blocked_html_declared_file_creation_prefers_closest_target(
    temp_dir: Path,
) -> None:
    """Check the file-creation nudge when the block names a closest target.

    The blocked message lists ``chapters/02-installation.html`` as the closest
    declared target. The queued steering message is expected to point at that
    target and not to suggest updating the guide root.
    """

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        # Guard stub: the test fails loudly if confidence scoring is invoked.
        raise AssertionError("Confidence scoring should not run in this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        # Guard stub: the test fails loudly if verification is invoked.
        raise AssertionError("Verification should not run in this scenario")

    context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
    )
    # Capture every steering message the runner queues.
    steering_messages: list[str] = []
    context.queue_steering_message_callback = steering_messages.append
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))
    dod = create_definition_of_done("Create a guide.")

    undeclared_page = temp_dir / "guide" / "chapters" / "02-basics.html"
    write_call = ToolCall(
        id="write-basics",
        name="write",
        arguments={"file_path": str(undeclared_page)},
    )
    blocked_text = (
        "[Blocked - HTML file creation falls outside the current declared artifact set] "
        "Suggestion: Keep new non-root HTML files within the root-declared artifact set. "
        "Do not create undeclared sibling page `chapters/02-basics.html`; use the closest declared local target instead. "
        "Already-declared local targets include: chapters/01-introduction.html, "
        "chapters/02-installation.html, chapters/03-basic-configuration.html. "
        "Closest declared local targets include: chapters/02-installation.html"
    )
    runner._queue_blocked_html_declared_file_creation_nudge(
        write_call,
        blocked_text,
        dod=dod,
    )

    assert steering_messages
    nudge = steering_messages[0]
    expected_fragments = (
        "Do not create `chapters/02-basics.html`.",
        "closest declared target instead: `chapters/02-installation.html`",
        "Already-declared local targets include:",
    )
    for fragment in expected_fragments:
        assert fragment in nudge
    assert "update the guide root" not in nudge
7434
7435
def test_tool_batch_runner_blocked_html_missing_target_after_outputs_exist_prefers_verify(
    temp_dir: Path,
) -> None:
    """Check the missing-target nudge once every planned artifact is on disk.

    The guide root and both planned chapters exist and are listed in the
    implementation plan. A blocked edit of the root that introduced a broken
    href is expected to queue a message telling the model to stay on the root
    file and repair existing links.
    """

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        # Guard stub: the test fails loudly if confidence scoring is invoked.
        raise AssertionError("Confidence scoring should not run in this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        # Guard stub: the test fails loudly if verification is invoked.
        raise AssertionError("Verification should not run in this scenario")

    # Lay out the guide on disk: a root page plus the two planned chapters.
    guide = temp_dir / "guide"
    chapters = guide / "chapters"
    guide.mkdir()
    chapters.mkdir()
    index = guide / "index.html"
    intro_page = chapters / "01-introduction.html"
    install_page = chapters / "02-installation.html"

    index_lines = [
        '<a href="chapters/01-introduction.html">Intro</a>',
        '<a href="chapters/02-installation.html">Install</a>',
        '<a href="../index.html">Back</a>',
        "",
    ]
    index.write_text("\n".join(index_lines))
    intro_page.write_text("<html></html>\n")
    install_page.write_text("<html></html>\n")

    # The plan lists exactly the three files created above.
    implementation_plan = temp_dir / "implementation.md"
    plan_lines = [
        "# Implementation Plan",
        "",
        "## File Changes",
        f"- `{index}`",
        f"- `{intro_page}`",
        f"- `{install_page}`",
        "",
    ]
    implementation_plan.write_text("\n".join(plan_lines))

    context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
    )
    # Capture every steering message the runner queues.
    steering_messages: list[str] = []
    context.queue_steering_message_callback = steering_messages.append
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))

    dod = create_definition_of_done("Create a guide.")
    dod.implementation_plan = str(implementation_plan)
    dod.verification_commands = [f"ls -la {guide}"]
    dod.touched_files = [str(page) for page in (index, intro_page, install_page)]

    edit_call = ToolCall(
        id="edit-root",
        name="edit",
        arguments={"file_path": str(index)},
    )
    blocked_text = (
        "[Blocked - Edited HTML links point to files that do not exist] "
        "Suggestion: Use only existing local targets for href values and avoid introducing missing links. "
        "Broken href(s): chapters/08-advanced-configuration.html. "
        "Replace them with an existing local target or remove the broken link."
    )
    runner._queue_blocked_html_missing_target_nudge(
        edit_call,
        blocked_text,
        dod=dod,
    )

    assert steering_messages
    nudge = steering_messages[0]
    expected_fragments = (
        "All explicitly planned artifacts already exist on disk.",
        f"Stay on `{index}`.",
        "Do not introduce new local-link targets beyond the current output set.",
        "Repair the existing generated files instead of expanding the guide.",
        "Replace broken hrefs with existing local targets or remove the broken link.",
    )
    for fragment in expected_fragments:
        assert fragment in nudge
7523
7524
@pytest.mark.asyncio
async def test_tool_batch_runner_blocked_empty_file_path_nudges_concrete_next_artifact(
    temp_dir: Path,
) -> None:
    """Check the nudge queued when a write is blocked for an empty file path.

    The plan lists an index page and two chapters, but only the index and the
    first chapter are written to disk. After the executor reports the blocked
    write, the queued steering message is expected to name the second chapter
    as the file to create next, and the block is expected to be recorded as
    the latest recovery attempt.
    """

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        # Guard stub: the test fails loudly if confidence scoring is invoked.
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        # Guard stub: the test fails loudly if verification is invoked.
        raise AssertionError("Verification should not run in this scenario")

    # Only the index and the first chapter are written; chapter two is left missing.
    guide_root = temp_dir / "guides" / "nginx"
    chapters = guide_root / "chapters"
    chapters.mkdir(parents=True)
    index_path = guide_root / "index.html"
    chapter_one = chapters / "01-introduction.html"
    chapter_two = chapters / "02-installation.html"
    index_path.write_text("<html></html>\n")
    chapter_one.write_text("<h1>Intro</h1>\n")

    implementation_plan = temp_dir / "implementation.md"
    plan_lines = [
        "# Implementation Plan",
        "",
        "## File Changes",
        f"- `{index_path}`",
        f"- `{chapter_one}`",
        f"- `{chapter_two}`",
        "",
    ]
    implementation_plan.write_text("\n".join(plan_lines))

    context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )
    # Capture every steering message the runner queues.
    queued: list[str] = []
    context.queue_steering_message_callback = queued.append
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))

    # The write call under test carries an empty `file_path`.
    tool_call = ToolCall(
        id="write-2",
        name="write",
        arguments={"file_path": "", "content": "<html></html>\n"},
    )
    blocked_message = "[Blocked - Empty file path] Suggestion: Provide a valid file path"
    blocked_result = Message.tool_result_message(
        tool_call_id=tool_call.id,
        display_content=blocked_message,
        result_content=blocked_message,
        is_error=True,
    )
    blocked_outcome = ToolExecutionOutcome(
        tool_call=tool_call,
        state=ToolExecutionState.BLOCKED,
        message=blocked_result,
        event_content=blocked_message,
        is_error=True,
        result_output=blocked_message,
    )
    # The fake executor is scripted with the single blocked outcome.
    executor = FakeExecutor([blocked_outcome])

    dod = create_definition_of_done("Create a multi-file nginx guide.")
    dod.implementation_plan = str(implementation_plan)
    dod.touched_files.extend([str(index_path), str(chapter_one)])
    dod.pending_items.append("Creating Chapter 2: Installation and Setup")

    await runner.execute_batch(
        tool_calls=[tool_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=TurnSummary(final_response=""),
        dod=dod,
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert queued
    assert "did not provide a valid `file_path`" in queued[0]
    assert "Resume by creating `02-installation.html` now." in queued[0]
    expected_write_hint = (
        f"Prefer one `write` call for `{display_runtime_path(chapter_two)}` instead of more rereads."
    )
    assert expected_write_hint in queued[0]
    assert context.recovery_context is not None
    assert context.recovery_context.attempts[-1].error == blocked_message