Python · 231591 bytes Raw Blame History
1 """Tests for tool-batch execution on RuntimeContext."""
2
3 from __future__ import annotations
4
5 from pathlib import Path
6 from types import SimpleNamespace
7
8 import pytest
9
10 from loader.llm.base import Message, Role, ToolCall
11 from loader.runtime.context import RuntimeContext
12 from loader.runtime.dod import (
13 DefinitionOfDoneStore,
14 VerificationEvidence,
15 create_definition_of_done,
16 )
17 from loader.runtime.events import AgentEvent, TurnSummary
18 from loader.runtime.executor import ToolExecutionOutcome, ToolExecutionState
19 from loader.runtime.path_display import display_runtime_path
20 from loader.runtime.permissions import (
21 PermissionMode,
22 build_permission_policy,
23 load_permission_rules,
24 )
25 from loader.runtime.reasoning_types import (
26 ActionVerification,
27 ConfidenceAssessment,
28 ConfidenceLevel,
29 )
30 from loader.runtime.recovery import RecoveryContext
31 from loader.runtime.tool_batches import (
32 ToolBatchRunner,
33 )
34 from loader.runtime.tool_batches import (
35 _should_prioritize_missing_artifact as tool_batches_should_prioritize_missing_artifact,
36 )
37 from loader.runtime.workflow import sync_todos_to_definition_of_done
38 from loader.tools.base import ToolResult as RegistryToolResult
39 from loader.tools.base import create_default_registry
40 from tests.helpers.runtime_harness import ScriptedBackend
41
42
class FakeSession:
    """In-memory session double that records messages and timeline entries."""

    def __init__(self, messages: list[Message]) -> None:
        # Take a shallow copy so appends never touch the caller's list.
        self.messages = [*messages]
        self.workflow_timeline = []

    def append(self, message: Message) -> None:
        """Add ``message`` to the end of the recorded transcript."""
        self.messages.append(message)

    def append_workflow_timeline_entry(self, entry) -> None:
        """Add ``entry`` to the end of the recorded workflow timeline."""
        self.workflow_timeline.append(entry)
53
54
class FakeCodeFilter:
    """Code-filter double whose ``reset`` is a no-op."""

    def reset(self) -> None:
        """Do nothing and return ``None``."""
58
59
class FakeSafeguards:
    """Safeguards double: pass-through filters, no steering, canned loop result."""

    def __init__(self, *, detect_loop_result: tuple[bool, str] = (False, "")) -> None:
        # Value handed back verbatim by ``detect_loop``.
        self._detect_loop_result = detect_loop_result
        self.code_filter = FakeCodeFilter()
        # Opaque placeholders; this double never inspects them.
        self.action_tracker = object()
        self.validator = object()

    def filter_stream_chunk(self, content: str) -> str:
        """Return the streamed chunk unchanged."""
        return content

    def filter_complete_content(self, content: str) -> str:
        """Return the complete content unchanged."""
        return content

    def should_steer(self) -> bool:
        """Always report that no steering is needed."""
        return False

    def get_steering_message(self) -> str | None:
        """Always return ``None``."""
        return None

    def record_response(self, content: str) -> None:
        """Ignore the response."""
        return None

    def detect_text_loop(self, content: str) -> tuple[bool, str]:
        """Always report that no text loop was found."""
        return (False, "")

    def detect_loop(self) -> tuple[bool, str]:
        """Return the result supplied at construction time."""
        return self._detect_loop_result
87
88
class FakeExecutor:
    """Executor double that replays queued outcomes in FIFO order."""

    def __init__(self, outcomes: list[ToolExecutionOutcome]) -> None:
        # Private copy of the queue; the caller's list is left untouched.
        self._outcomes = [*outcomes]
        self.calls: list[ToolCall] = []

    async def execute_tool_call(self, tool_call: ToolCall, **_: object) -> ToolExecutionOutcome:
        """Record ``tool_call`` and return the next queued outcome.

        Extra keyword arguments are accepted and ignored.

        Raises:
            AssertionError: if no outcome is left in the queue (the call is
                still recorded first).
        """
        self.calls.append(tool_call)
        if self._outcomes:
            return self._outcomes.pop(0)
        raise AssertionError("No fake tool outcome queued")
99
100
def build_context(
    *,
    temp_dir: Path,
    messages: list[Message],
    safeguards: FakeSafeguards,
    assess_confidence,
    verify_action,
    recovery_context: RecoveryContext | None = None,
    confidence_scoring: bool = False,
    verification: bool = False,
    auto_recover: bool = True,
    min_confidence_for_action: int = 3,
) -> RuntimeContext:
    """Assemble a ``RuntimeContext`` rooted at ``temp_dir`` for batch tests.

    The registry and permission policy are built from ``temp_dir``; the
    session is a ``FakeSession`` seeded with ``messages``; ``assess_confidence``
    and ``verify_action`` are exposed through the ``reasoning`` namespace. The
    remaining flags are copied into the config namespaces.
    """
    tool_registry = create_default_registry(temp_dir)
    tool_registry.configure_workspace_root(temp_dir)

    permission_rules = load_permission_rules(temp_dir)
    permission_policy = build_permission_policy(
        active_mode=PermissionMode.WORKSPACE_WRITE,
        workspace_root=temp_dir,
        tool_requirements=tool_registry.get_tool_requirements(),
        rules=permission_rules.rules,
    )

    reasoning_config = SimpleNamespace(
        rollback=False,
        show_rollback_plan=False,
        completion_check=True,
        max_continuation_prompts=5,
        self_critique=False,
        confidence_scoring=confidence_scoring,
        min_confidence_for_action=min_confidence_for_action,
        verification=verification,
    )
    runtime_config = SimpleNamespace(
        force_react=False,
        max_recovery_attempts=2,
        auto_recover=auto_recover,
        reasoning=reasoning_config,
    )
    reasoning_services = SimpleNamespace(
        assess_confidence=assess_confidence,
        verify_action=verify_action,
    )

    return RuntimeContext(
        project_root=temp_dir,
        backend=ScriptedBackend(),
        registry=tool_registry,
        session=FakeSession(messages),  # type: ignore[arg-type]
        config=runtime_config,
        capability_profile=SimpleNamespace(supports_native_tools=True),  # type: ignore[arg-type]
        project_context=None,
        permission_policy=permission_policy,
        permission_config_status=permission_rules,
        workflow_mode="execute",
        safeguards=safeguards,
        reasoning=reasoning_services,
        recovery_context=recovery_context,
    )
156
157
def tool_outcome(
    *,
    tool_call: ToolCall,
    output: str,
    is_error: bool,
    state: ToolExecutionState = ToolExecutionState.EXECUTED,
    metadata: dict[str, object] | None = None,
) -> ToolExecutionOutcome:
    """Build a ``ToolExecutionOutcome`` in which every text field is ``output``.

    ``metadata`` is attached to the registry result; a falsy value is replaced
    by a fresh empty dict.
    """
    result_message = Message.tool_result_message(
        tool_call_id=tool_call.id,
        display_content=output,
        result_content=output,
        is_error=is_error,
    )
    registry_result = RegistryToolResult(
        output=output,
        is_error=is_error,
        metadata=metadata or {},
    )
    return ToolExecutionOutcome(
        tool_call=tool_call,
        state=state,
        message=result_message,
        event_content=output,
        is_error=is_error,
        result_output=output,
        registry_result=registry_result,
    )
184
185
@pytest.mark.asyncio
async def test_tool_batch_runner_uses_context_for_confidence_gate(temp_dir: Path) -> None:
    """A LOW confidence assessment skips the tool and appends a user warning."""
    seen: dict[str, str] = {}
    emitted: list[AgentEvent] = []

    async def assess_confidence(tool_name: str, tool_args: dict, context: str) -> ConfidenceAssessment:
        # Capture the context string so the test can inspect what was passed.
        seen["context"] = context
        return ConfidenceAssessment(
            action=f"{tool_name} with {tool_args}",
            tool_name=tool_name,
            tool_args=tool_args,
            level=ConfidenceLevel.LOW,
            reasoning="Need to inspect the target first.",
            risks=["Unknown target file"],
        )

    async def verify_action(tool_name: str, tool_args: dict, result: str, expected: str = "") -> ActionVerification:
        raise AssertionError("Verification should not run for skipped actions")

    async def emit(event: AgentEvent) -> None:
        emitted.append(event)

    history = [
        Message(role=Role.USER, content="Please inspect the project."),
        Message(role=Role.ASSISTANT, content="I will read the file next."),
    ]
    context = build_context(
        temp_dir=temp_dir,
        messages=history,
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        confidence_scoring=True,
        min_confidence_for_action=3,
    )
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))
    read_call = ToolCall(id="read-1", name="read", arguments={"file_path": "README.md"})
    executor = FakeExecutor(
        [tool_outcome(tool_call=read_call, output="unused", is_error=False)]
    )

    result = await runner.execute_batch(
        tool_calls=[read_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=emit,
        summary=TurnSummary(final_response=""),
        dod=create_definition_of_done("Read the docs"),
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    # The gated call never reaches the executor.
    assert result.actions_taken == []
    assert executor.calls == []
    assert "Please inspect the project." in seen["context"]

    last_message = context.session.messages[-1]
    assert last_message.role == Role.USER
    assert "[LOW CONFIDENCE WARNING]" in last_message.content
    assert "confidence" in [event.type for event in emitted]
245
246
@pytest.mark.asyncio
async def test_tool_batch_runner_tracks_recovery_with_legacy_context(temp_dir: Path) -> None:
    """A failed tool call leaves a recovery context set and emits a recovery event."""
    emitted: list[AgentEvent] = []

    async def assess_confidence(tool_name: str, tool_args: dict, context: str) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def verify_action(tool_name: str, tool_args: dict, result: str, expected: str = "") -> ActionVerification:
        raise AssertionError("Verification should not run for failed actions")

    async def emit(event: AgentEvent) -> None:
        emitted.append(event)

    context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=True,
    )
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))
    bash_call = ToolCall(id="bash-1", name="bash", arguments={"command": "pytest"})
    failing_outcome = tool_outcome(tool_call=bash_call, output="command failed", is_error=True)
    executor = FakeExecutor([failing_outcome])
    summary = TurnSummary(final_response="")

    await runner.execute_batch(
        tool_calls=[bash_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=emit,
        summary=summary,
        dod=create_definition_of_done("Run tests"),
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert context.recovery_context is not None
    assert summary.tool_result_messages
    # The last session message equals the last tool result recorded on the summary.
    assert context.session.messages[-1] == summary.tool_result_messages[-1]
    assert any(event.type == "recovery" for event in emitted)
290
291
@pytest.mark.asyncio
async def test_tool_batch_runner_emits_tool_metadata(temp_dir: Path) -> None:
    """The ``tool_result`` event carries the registry result's metadata."""
    emitted: list[AgentEvent] = []

    async def assess_confidence(tool_name: str, tool_args: dict, context: str) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def verify_action(tool_name: str, tool_args: dict, result: str, expected: str = "") -> ActionVerification:
        raise AssertionError("Verification should not run for this scenario")

    async def emit(event: AgentEvent) -> None:
        emitted.append(event)

    context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))
    server_call = ToolCall(
        id="bash-1",
        name="bash",
        arguments={"command": "python -m http.server 8000", "background": True},
    )
    metadata = {
        "job_id": "bash-1",
        "status": "running",
        "background": True,
    }
    started_outcome = tool_outcome(
        tool_call=server_call,
        output="Started bash job bash-1",
        is_error=False,
        metadata=metadata,
    )
    executor = FakeExecutor([started_outcome])

    await runner.execute_batch(
        tool_calls=[server_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=emit,
        summary=TurnSummary(final_response=""),
        dod=create_definition_of_done("Launch a preview server"),
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    # First tool_result event; ``next`` raises StopIteration if none was emitted.
    result_event = next(event for event in emitted if event.type == "tool_result")
    assert result_event.tool_metadata == metadata
350
351
@pytest.mark.asyncio
async def test_tool_batch_runner_verifies_with_context_services(temp_dir: Path) -> None:
    """With verification on, the context's ``verify_action`` sees the tool output."""
    verified_results: list[str] = []
    emitted: list[AgentEvent] = []

    async def assess_confidence(tool_name: str, tool_args: dict, context: str) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def verify_action(tool_name: str, tool_args: dict, result: str, expected: str = "") -> ActionVerification:
        verified_results.append(result)
        return ActionVerification(
            tool_name=tool_name,
            tool_args=tool_args,
            expected_outcome="Success",
            actual_result=result,
            verified=False,
            discrepancies=["File contents did not match"],
            needs_correction=True,
            correction_suggestion="Read the file before editing again.",
        )

    async def emit(event: AgentEvent) -> None:
        emitted.append(event)

    prior_recovery = RecoveryContext(
        original_tool="edit",
        original_args={"file_path": "README.md"},
    )
    context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        recovery_context=prior_recovery,
        verification=True,
    )
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))
    read_call = ToolCall(id="read-1", name="read", arguments={"file_path": "README.md"})
    executor = FakeExecutor(
        [tool_outcome(tool_call=read_call, output="file contents", is_error=False)]
    )

    await runner.execute_batch(
        tool_calls=[read_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=emit,
        summary=TurnSummary(final_response=""),
        dod=create_definition_of_done("Read the docs"),
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert verified_results == ["file contents"]
    # The same recovery object is kept, and the read is logged as a successful step.
    assert context.recovery_context is prior_recovery
    assert prior_recovery.successful_steps == [("read", {"file_path": "README.md"})]

    last_message = context.session.messages[-1]
    assert last_message.role == Role.TOOL
    assert last_message.content == "file contents"
    assert any(event.type == "verification" for event in emitted)
415
416
@pytest.mark.asyncio
async def test_tool_batch_runner_preserves_recovery_context_across_diagnostic_success(
    temp_dir: Path,
) -> None:
    """A successful ``bash`` call keeps the existing recovery context in place."""

    async def assess_confidence(tool_name: str, tool_args: dict, context: str) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def verify_action(tool_name: str, tool_args: dict, result: str, expected: str = "") -> ActionVerification:
        raise AssertionError("Verification should not run for this scenario")

    failed_read_args = {"file_path": "chapters/04-data-types.html"}
    prior_recovery = RecoveryContext(
        original_tool="read",
        original_args={"file_path": "chapters/04-data-types.html"},
    )
    prior_recovery.add_attempt("read", failed_read_args, "File not found")

    context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        recovery_context=prior_recovery,
        auto_recover=False,
    )
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))
    list_call = ToolCall(
        id="bash-1",
        name="bash",
        arguments={"command": "ls chapters"},
    )
    listing_outcome = tool_outcome(
        tool_call=list_call,
        output="01-introduction.html",
        is_error=False,
    )
    executor = FakeExecutor([listing_outcome])
    summary = TurnSummary(final_response="")

    await runner.execute_batch(
        tool_calls=[list_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=summary,
        dod=create_definition_of_done("Fix the chapter links"),
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert context.recovery_context is prior_recovery
    assert prior_recovery.successful_steps == [("bash", {"command": "ls chapters"})]
483
484
@pytest.mark.asyncio
async def test_tool_batch_runner_clears_recovery_context_after_successful_mutation(
    temp_dir: Path,
) -> None:
    """A successful ``patch`` call resets the context's recovery context to ``None``."""

    async def assess_confidence(tool_name: str, tool_args: dict, context: str) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def verify_action(tool_name: str, tool_args: dict, result: str, expected: str = "") -> ActionVerification:
        raise AssertionError("Verification should not run for this scenario")

    failed_read_args = {"file_path": "chapters/04-data-types.html"}
    prior_recovery = RecoveryContext(
        original_tool="read",
        original_args={"file_path": "chapters/04-data-types.html"},
    )
    prior_recovery.add_attempt("read", failed_read_args, "File not found")

    context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        recovery_context=prior_recovery,
        auto_recover=False,
    )
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))

    single_line_hunk = {
        "old_start": 1,
        "old_lines": 1,
        "new_start": 1,
        "new_lines": 1,
        "lines": ["-a", "+b"],
    }
    patch_call = ToolCall(
        id="patch-1",
        name="patch",
        arguments={"file_path": "index.html", "hunks": [single_line_hunk]},
    )
    patched_outcome = tool_outcome(
        tool_call=patch_call,
        output="Patched index.html",
        is_error=False,
    )
    executor = FakeExecutor([patched_outcome])
    summary = TurnSummary(final_response="")

    await runner.execute_batch(
        tool_calls=[patch_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=summary,
        dod=create_definition_of_done("Fix the chapter links"),
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert context.recovery_context is None
551
552
@pytest.mark.asyncio
async def test_tool_batch_runner_queues_duplicate_observation_nudge(
    temp_dir: Path,
) -> None:
    """A DUPLICATE read queues one persistent nudge naming the missing planned file.

    The plan lists ``04-variables.html`` but the test never creates it; the
    nudge is expected to point at that file and no ephemeral message is queued.
    """

    async def assess_confidence(tool_name: str, tool_args: dict, context: str) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def verify_action(tool_name: str, tool_args: dict, result: str, expected: str = "") -> ActionVerification:
        raise AssertionError("Verification should not run for this scenario")

    # Path arithmetic only; nothing touches the disk until after build_context.
    chapters_dir = temp_dir / "chapters"
    index_path = temp_dir / "index.html"
    intro_path = chapters_dir / "01-introduction.html"
    setup_path = chapters_dir / "02-setup.html"
    basics_path = chapters_dir / "03-basics.html"
    missing_path = chapters_dir / "04-variables.html"

    glob_observation = Message(
        role=Role.TOOL,
        content=(
            "Observation [glob]: Result: "
            f"{temp_dir}/chapters/01-introduction.html\n"
            f"{temp_dir}/chapters/02-setup.html\n"
            f"{temp_dir}/chapters/03-basics.html"
        ),
        tool_results=[],
    )
    chapter_read_request = Message(
        role=Role.ASSISTANT,
        content="I already inspected the first chapter title.",
        tool_calls=[
            ToolCall(
                id="read-ch1",
                name="read",
                arguments={"file_path": str(intro_path)},
            )
        ],
    )
    chapter_read_result = Message.tool_result_message(
        tool_call_id="read-ch1",
        display_content="<h1>Chapter 1: Introduction to Fortran</h1>\n",
        result_content="<h1>Chapter 1: Introduction to Fortran</h1>\n",
    )
    index_read_request = Message(
        role=Role.ASSISTANT,
        content="I should update the index now.",
        tool_calls=[
            ToolCall(
                id="read-index",
                name="read",
                arguments={"file_path": str(index_path)},
            )
        ],
    )

    context = build_context(
        temp_dir=temp_dir,
        messages=[
            glob_observation,
            chapter_read_request,
            chapter_read_result,
            index_read_request,
        ],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )

    # Workspace fixture: an index plus three of the four planned chapters.
    chapters_dir.mkdir()
    index_path.write_text("<ul></ul>\n")
    intro_path.write_text("<h1>Intro</h1>\n")
    setup_path.write_text("<h1>Setup</h1>\n")
    basics_path.write_text("<h1>Basics</h1>\n")

    implementation_plan = temp_dir / "implementation.md"
    plan_lines = [
        "# Implementation Plan",
        "",
        "## File Changes",
        f"- `{index_path}`",
        f"- `{intro_path}`",
        f"- `{setup_path}`",
        f"- `{basics_path}`",
        f"- `{missing_path}`",
    ]
    implementation_plan.write_text("\n".join(plan_lines))

    context.session.current_task = (
        f"Update {index_path} with the right chapter links."
    )
    persistent_messages: list[str] = []
    ephemeral_messages: list[str] = []
    context.queue_steering_message_callback = persistent_messages.append
    context.queue_ephemeral_steering_message_callback = ephemeral_messages.append

    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))
    duplicate_call = ToolCall(
        id="read-dup",
        name="read",
        arguments={"file_path": str(index_path)},
    )
    duplicate_message = (
        "[Skipped - duplicate action: Already read "
        f"{index_path} recently without any intervening changes; "
        "reuse the earlier read result instead of rereading]"
    )
    duplicate_outcome = ToolExecutionOutcome(
        tool_call=duplicate_call,
        state=ToolExecutionState.DUPLICATE,
        message=Message.tool_result_message(
            tool_call_id=duplicate_call.id,
            display_content=duplicate_message,
            result_content=duplicate_message,
        ),
        event_content=duplicate_message,
        is_error=False,
        result_output=duplicate_message,
    )
    executor = FakeExecutor([duplicate_outcome])

    summary = TurnSummary(final_response="")
    dod = create_definition_of_done("Fix the chapter links")
    dod.implementation_plan = str(implementation_plan)
    dod.pending_items.append("Create the remaining chapter files")

    await runner.execute_batch(
        tool_calls=[duplicate_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=summary,
        dod=dod,
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert len(persistent_messages) == 1
    nudge = persistent_messages[0]
    assert "Reuse the earlier observation instead of repeating it." in nudge
    assert "A declared output artifact is still missing." in nudge
    assert "Resume by creating `04-variables.html` now." in nudge
    write_hint = (
        "Prefer one `write` call for "
        f"`{display_runtime_path(missing_path)}` instead of more rereads."
    )
    assert write_hint in nudge
    assert ephemeral_messages == []
702
703
@pytest.mark.asyncio
async def test_tool_batch_runner_duplicate_read_keeps_root_declared_missing_html_output_active(
    temp_dir: Path,
) -> None:
    """A duplicate index read still nudges toward the linked-but-absent chapter.

    The index links to ``02-installation.html``, which is never written. The
    nudge must name that file and the pending item, and must not claim that all
    planned artifacts exist.
    """

    async def assess_confidence(tool_name: str, tool_args: dict, context: str) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should not run for this scenario")

    async def verify_action(tool_name: str, tool_args: dict, result: str, expected: str = "") -> ActionVerification:
        raise AssertionError("Verification should not run for this scenario")

    # Fixture on disk before the context is built: index plus chapter one only.
    guide_root = temp_dir / "guide"
    chapters = guide_root / "chapters"
    chapters.mkdir(parents=True)
    index = guide_root / "index.html"
    chapter_one = chapters / "01-introduction.html"
    index.write_text(
        '<a href="chapters/01-introduction.html">Intro</a>\n'
        '<a href="chapters/02-installation.html">Install</a>\n'
    )
    chapter_one.write_text("<h1>Intro</h1>\n")

    implementation_plan = temp_dir / "implementation.md"
    plan_lines = [
        "# Implementation Plan",
        "",
        "## File Changes",
        f"- `{index}`",
        f"- `{chapters}/` (directory for chapter files)",
    ]
    implementation_plan.write_text("\n".join(plan_lines))

    earlier_read_request = Message(
        role=Role.ASSISTANT,
        content="I should keep building the guide.",
        tool_calls=[
            ToolCall(
                id="read-index",
                name="read",
                arguments={"file_path": str(index)},
            )
        ],
    )
    context = build_context(
        temp_dir=temp_dir,
        messages=[earlier_read_request],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )
    context.session.current_task = f"Build the guide rooted at {index}."
    persistent_messages: list[str] = []
    ephemeral_messages: list[str] = []
    context.queue_steering_message_callback = persistent_messages.append
    context.queue_ephemeral_steering_message_callback = ephemeral_messages.append

    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))
    duplicate_call = ToolCall(
        id="read-dup-rooted",
        name="read",
        arguments={"file_path": str(index)},
    )
    duplicate_message = (
        "[Skipped - duplicate action: Already read "
        f"{index} recently without any intervening changes; "
        "reuse the earlier read result instead of rereading]"
    )
    duplicate_outcome = ToolExecutionOutcome(
        tool_call=duplicate_call,
        state=ToolExecutionState.DUPLICATE,
        message=Message.tool_result_message(
            tool_call_id=duplicate_call.id,
            display_content=duplicate_message,
            result_content=duplicate_message,
        ),
        event_content=duplicate_message,
        is_error=False,
        result_output=duplicate_message,
    )
    executor = FakeExecutor([duplicate_outcome])

    summary = TurnSummary(final_response="")
    dod = create_definition_of_done("Create a multi-file HTML guide with chapters.")
    dod.implementation_plan = str(implementation_plan)
    dod.touched_files = [str(index), str(chapter_one)]
    dod.completed_items = ["Create chapter files with appropriate content"]
    dod.pending_items.append("Create the remaining chapter files")

    await runner.execute_batch(
        tool_calls=[duplicate_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=summary,
        dod=dod,
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert len(persistent_messages) == 1
    nudge = persistent_messages[0]
    assert "Create the remaining chapter files" in nudge
    assert "Resume by creating `02-installation.html` now." in nudge
    assert "All explicitly planned artifacts already exist on disk." not in nudge
    assert ephemeral_messages == []
827
828
@pytest.mark.asyncio
async def test_tool_batch_runner_todo_write_does_not_regress_completed_file_todo(
    temp_dir: Path,
) -> None:
    """A stale ``TodoWrite`` after a successful write must not reopen the file todo.

    The batch writes ``03-first-website.html`` and then replays a todo list
    that still marks that file as pending. The definition of done is expected
    to keep it completed while chapter four stays pending.
    """

    async def assess_confidence(tool_name: str, tool_args: dict, context: str) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should not run for this scenario")

    async def verify_action(tool_name: str, tool_args: dict, result: str, expected: str = "") -> ActionVerification:
        raise AssertionError("Verification should not run for this scenario")

    def pending_todos() -> list[dict[str, str]]:
        # Fresh dicts on every call so no two consumers share mutable state.
        return [
            {
                "content": f"Create {file_name}",
                "active_form": f"Creating {file_name}",
                "status": "pending",
            }
            for file_name in (
                "03-first-website.html",
                "04-configuration-basics.html",
            )
        ]

    context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))
    dod = create_definition_of_done("Create a multi-file nginx guide.")
    sync_todos_to_definition_of_done(dod, pending_todos())

    chapter_path = temp_dir / "guides" / "nginx" / "chapters" / "03-first-website.html"
    chapter_path.parent.mkdir(parents=True)

    write_call = ToolCall(
        id="write-ch3",
        name="write",
        arguments={"file_path": str(chapter_path), "content": "<html></html>\n"},
    )
    stale_todo_call = ToolCall(
        id="todo-stale",
        name="TodoWrite",
        arguments={"todos": pending_todos()},
    )
    write_result = tool_outcome(
        tool_call=write_call,
        output=f"Successfully wrote {chapter_path}",
        is_error=False,
    )
    stale_todo_result = tool_outcome(
        tool_call=stale_todo_call,
        output="Todos updated",
        is_error=False,
        metadata={"new_todos": pending_todos()},
    )
    executor = FakeExecutor([write_result, stale_todo_result])
    summary = TurnSummary(final_response="")

    await runner.execute_batch(
        tool_calls=[write_call, stale_todo_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=summary,
        dod=dod,
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert "Create 03-first-website.html" in dod.completed_items
    assert "Create 03-first-website.html" not in dod.pending_items
    assert "Create 04-configuration-basics.html" in dod.pending_items
946
947
@pytest.mark.asyncio
async def test_tool_batch_runner_proactively_queues_verified_html_inventory(
    temp_dir: Path,
) -> None:
    """A successful chapter glob queues no steering and adds no inventory text.

    NOTE: despite the test name, the assertions require that neither steering
    queue receives a message and that the single tool-result message does not
    contain ``Verified chapter inventory:``.
    """

    async def assess_confidence(tool_name: str, tool_args: dict, context: str) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def verify_action(tool_name: str, tool_args: dict, result: str, expected: str = "") -> ActionVerification:
        raise AssertionError("Verification should not run for this scenario")

    # Fixture on disk before the context is built.
    chapters = temp_dir / "chapters"
    chapters.mkdir()
    intro_path = chapters / "01-introduction.html"
    setup_path = chapters / "02-setup.html"
    intro_path.write_text("<h1>Chapter 1: Introduction to Fortran</h1>\n")
    setup_path.write_text("<h1>Chapter 2: Setting Up Your Environment</h1>\n")
    (temp_dir / "index.html").write_text("<ul></ul>\n")

    context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )
    context.session.current_task = (
        f"Update {temp_dir / 'index.html'} so the chapter links match the sibling files."
    )
    persistent_messages: list[str] = []
    ephemeral_messages: list[str] = []
    context.queue_steering_message_callback = persistent_messages.append
    context.queue_ephemeral_steering_message_callback = ephemeral_messages.append

    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))
    glob_call = ToolCall(
        id="glob-1",
        name="glob",
        arguments={"path": str(chapters), "pattern": "*.html"},
    )
    glob_listing = "\n".join([str(intro_path), str(setup_path)])
    executor = FakeExecutor(
        [tool_outcome(tool_call=glob_call, output=glob_listing, is_error=False)]
    )
    summary = TurnSummary(final_response="")

    await runner.execute_batch(
        tool_calls=[glob_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=summary,
        dod=create_definition_of_done("Fix the chapter links"),
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert persistent_messages == []
    assert ephemeral_messages == []
    assert len(summary.tool_result_messages) == 1
    assert "Verified chapter inventory:" not in summary.tool_result_messages[0].content
1032
1033
@pytest.mark.asyncio
async def test_tool_batch_runner_marks_validated_html_toc_completion_after_successful_edit(
    temp_dir: Path,
) -> None:
    """Feed one successful ``edit`` of ``index.html`` through the batch runner.

    ``index.html`` already contains the corrected chapter list on disk when
    the batch executes. The test asserts that no tool-result message contains
    "Semantic verification preview:" and that neither the persistent nor the
    ephemeral steering callback receives a message.
    """

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        # Reaching this stub fails the test.
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        # Reaching this stub fails the test.
        raise AssertionError("Verification should not run for this scenario")

    # The same sentence is used as the session task and the definition of done.
    task = (
        "Update index.html so every chapter link and title matches the real HTML files in chapters/."
    )

    chapter_dir = temp_dir / "chapters"
    chapter_dir.mkdir()
    for file_name, heading in (
        ("01-introduction.html", "<h1>Chapter 1: Introduction to Fortran</h1>\n"),
        ("02-setup.html", "<h1>Chapter 2: Setting Up Your Environment</h1>\n"),
    ):
        (chapter_dir / file_name).write_text(heading)

    toc_path = temp_dir / "index.html"
    stale_toc = (
        '<ul class="chapter-list">\n'
        ' <li><a href="chapters/01-old.html">Chapter 1: Old</a></li>\n'
        ' <li><a href="chapters/02-old.html">Chapter 2: Old</a></li>\n'
        "</ul>\n"
    )
    fixed_toc = (
        '<ul class="chapter-list">\n'
        ' <li><a href="chapters/01-introduction.html">Chapter 1: Introduction to Fortran</a></li>\n'
        ' <li><a href="chapters/02-setup.html">Chapter 2: Setting Up Your Environment</a></li>\n'
        "</ul>\n"
    )
    # The file on disk holds the post-edit content before the batch runs.
    toc_path.write_text(fixed_toc)

    runtime = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )
    runtime.session.current_task = task
    persistent: list[str] = []
    ephemeral: list[str] = []
    runtime.queue_steering_message_callback = persistent.append
    runtime.queue_ephemeral_steering_message_callback = ephemeral.append
    batch_runner = ToolBatchRunner(runtime, DefinitionOfDoneStore(temp_dir))

    edit_arguments = {
        "file_path": str(toc_path),
        "old_string": stale_toc,
        "new_string": fixed_toc,
    }
    edit_call = ToolCall(id="edit-1", name="edit", arguments=edit_arguments)
    scripted_outcome = tool_outcome(
        tool_call=edit_call,
        output=f"Successfully edited {toc_path}",
        is_error=False,
    )
    executor = FakeExecutor([scripted_outcome])

    turn_summary = TurnSummary(final_response="")
    dod = create_definition_of_done(task)
    await batch_runner.execute_batch(
        tool_calls=[edit_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=turn_summary,
        dod=dod,
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    preview_marker = "Semantic verification preview:"
    assert not any(
        preview_marker in tool_message.content
        for tool_message in turn_summary.tool_result_messages
    )
    assert persistent == []
    assert ephemeral == []
1134
1135
@pytest.mark.asyncio
async def test_tool_batch_runner_does_not_apply_html_toc_handoff_to_reference_read(
    temp_dir: Path,
) -> None:
    """Feed one successful ``read`` of ``index.html`` through the batch runner.

    The session task and definition of done both use a prompt about studying
    an existing guide and writing a new nginx guide. The test asserts that no
    steering message is queued on either callback and that no tool-result
    message contains "Semantic verification preview:".
    """

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        # Reaching this stub fails the test.
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        # Reaching this stub fails the test.
        raise AssertionError("Verification should not run for this scenario")

    chapter_dir = temp_dir / "chapters"
    chapter_dir.mkdir()
    for file_name, heading in (
        ("01-introduction.html", "<h1>Chapter 1: Introduction to Fortran</h1>\n"),
        ("02-setup.html", "<h1>Chapter 2: Setting Up Your Environment</h1>\n"),
    ):
        (chapter_dir / file_name).write_text(heading)

    toc_path = temp_dir / "index.html"
    toc_markup = (
        "<h2>Table of Contents</h2>\n"
        '<ul class="chapter-list">\n'
        ' <li><a href="chapters/01-introduction.html">Chapter 1: Introduction to Fortran</a></li>\n'
        ' <li><a href="chapters/02-setup.html">Chapter 2: Setting Up Your Environment</a></li>\n'
        "</ul>\n"
    )
    toc_path.write_text(toc_markup)

    prompt = (
        "Have a look at ~/Loader/guides/fortran and chapters/ within. Get a feel "
        "for the structure and cadence of the guide. We are going to make an all "
        "new equally thorough guide on how to use the nginx tool."
    )

    runtime = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )
    runtime.session.current_task = prompt  # type: ignore[attr-defined]
    persistent: list[str] = []
    ephemeral: list[str] = []
    runtime.queue_steering_message_callback = persistent.append
    runtime.queue_ephemeral_steering_message_callback = ephemeral.append
    batch_runner = ToolBatchRunner(runtime, DefinitionOfDoneStore(temp_dir))

    read_call = ToolCall(
        id="read-index",
        name="read",
        arguments={"file_path": str(toc_path)},
    )
    # The scripted tool output is whatever is currently stored in index.html.
    observed_markup = toc_path.read_text()
    scripted_outcome = tool_outcome(
        tool_call=read_call,
        output=observed_markup,
        is_error=False,
    )
    executor = FakeExecutor([scripted_outcome])

    turn_summary = TurnSummary(final_response="")
    dod = create_definition_of_done(prompt)
    await batch_runner.execute_batch(
        tool_calls=[read_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=turn_summary,
        dod=dod,
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert persistent == []
    assert ephemeral == []
    preview_marker = "Semantic verification preview:"
    assert not any(
        preview_marker in tool_message.content
        for tool_message in turn_summary.tool_result_messages
    )
1228
1229
@pytest.mark.asyncio
async def test_tool_batch_runner_queues_next_pending_todo_after_discovery_progress(
    temp_dir: Path,
) -> None:
    """Feed one successful read of a reference chapter through the batch runner.

    The definition of done has three pending todos and an implementation plan
    listing ``chapters/`` before ``index.html``. The test asserts that the
    "Examine ..." todo ends up in ``dod.completed_items``, that some persistent
    steering message names "Create the nginx directory structure" as the next
    pending item, that some persistent message says to resume by creating
    ``chapters/``, that no persistent message mentions
    ``01-introduction.html``, and that nothing is queued as ephemeral.
    """

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        # Reaching this stub fails the test.
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        # Reaching this stub fails the test.
        raise AssertionError("Verification should not run for this scenario")

    reference_chapter = temp_dir / "fortran" / "chapters" / "01-introduction.html"
    reference_chapter.parent.mkdir(parents=True)
    reference_chapter.write_text("<h1>Introduction</h1>\n<p>Guide cadence.</p>\n")

    # The nginx targets are only named in the plan; nothing is created on disk.
    nginx_root = temp_dir / "Loader" / "guides" / "nginx"
    nginx_chapters = nginx_root / "chapters"
    plan_lines = [
        "# Implementation Plan",
        "",
        "## File Changes",
        f"- `{nginx_chapters}/`",
        f"- `{nginx_root / 'index.html'}`",
        "",
    ]
    plan_path = temp_dir / "implementation.md"
    plan_path.write_text("\n".join(plan_lines))

    runtime = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )
    persistent: list[str] = []
    ephemeral: list[str] = []
    runtime.queue_steering_message_callback = persistent.append
    runtime.queue_ephemeral_steering_message_callback = ephemeral.append
    batch_runner = ToolBatchRunner(runtime, DefinitionOfDoneStore(temp_dir))

    dod = create_definition_of_done("Create an equally thorough nginx guide.")
    dod.implementation_plan = str(plan_path)
    examine_todo = (
        "Examine the existing Fortran guide structure to understand the cadence and format"
    )
    todo_texts = (
        examine_todo,
        "Create the nginx directory structure",
        "Create the nginx index.html file",
    )
    todos = [
        {
            "content": text,
            "active_form": f"Working on: {text}",
            "status": "pending",
        }
        for text in todo_texts
    ]
    sync_todos_to_definition_of_done(dod, todos)

    read_call = ToolCall(
        id="read-reference",
        name="read",
        arguments={"file_path": str(reference_chapter)},
    )
    scripted_outcome = tool_outcome(
        tool_call=read_call,
        output="<h1>Introduction</h1>\n<p>Guide cadence.</p>\n",
        is_error=False,
    )
    executor = FakeExecutor([scripted_outcome])

    turn_summary = TurnSummary(final_response="")
    await batch_runner.execute_batch(
        tool_calls=[read_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=turn_summary,
        dod=dod,
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert examine_todo in dod.completed_items

    next_item_hint = (
        "Continue with the next pending item: `Create the nginx directory structure`"
    )
    assert any(next_item_hint in text for text in persistent)

    resume_hint = "Resume by creating `chapters/` now."
    assert any(resume_hint in text for text in persistent)

    assert not any("01-introduction.html" in text for text in persistent)
    assert ephemeral == []
1348
1349
@pytest.mark.asyncio
async def test_tool_batch_runner_queues_setup_directory_before_file_when_plan_lists_index_first(
    temp_dir: Path,
) -> None:
    """Feed one successful reference read through the runner with an index-first plan.

    Here the implementation plan lists ``index.html`` before ``chapters/``.
    The test asserts that persistent steering is queued, that some message
    names "Create the nginx directory structure" as the next pending item,
    that some message says to resume by creating ``chapters/``, that no message
    contains "Next step: create `index.html`.", and that nothing is queued as
    ephemeral.
    """

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        # Reaching this stub fails the test.
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        # Reaching this stub fails the test.
        raise AssertionError("Verification should not run for this scenario")

    reference_chapter = temp_dir / "fortran" / "chapters" / "01-introduction.html"
    reference_chapter.parent.mkdir(parents=True)
    reference_chapter.write_text("<h1>Introduction</h1>\n<p>Guide cadence.</p>\n")

    # The nginx targets are only named in the plan; nothing is created on disk.
    nginx_root = temp_dir / "Loader" / "guides" / "nginx"
    nginx_chapters = nginx_root / "chapters"
    plan_lines = [
        "# Implementation Plan",
        "",
        "## File Changes",
        f"- `{nginx_root / 'index.html'}`",
        f"- `{nginx_chapters}/`",
        "",
    ]
    plan_path = temp_dir / "implementation.md"
    plan_path.write_text("\n".join(plan_lines))

    runtime = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )
    persistent: list[str] = []
    ephemeral: list[str] = []
    runtime.queue_steering_message_callback = persistent.append
    runtime.queue_ephemeral_steering_message_callback = ephemeral.append
    batch_runner = ToolBatchRunner(runtime, DefinitionOfDoneStore(temp_dir))

    dod = create_definition_of_done("Create an equally thorough nginx guide.")
    dod.implementation_plan = str(plan_path)
    todo_texts = (
        "Examine the existing Fortran guide structure to understand the cadence and format",
        "Create the nginx directory structure",
        "Create the nginx index.html file",
    )
    todos = [
        {
            "content": text,
            "active_form": f"Working on: {text}",
            "status": "pending",
        }
        for text in todo_texts
    ]
    sync_todos_to_definition_of_done(dod, todos, project_root=temp_dir)

    read_call = ToolCall(
        id="read-reference-index-first",
        name="read",
        arguments={"file_path": str(reference_chapter)},
    )
    scripted_outcome = tool_outcome(
        tool_call=read_call,
        output="<h1>Introduction</h1>\n<p>Guide cadence.</p>\n",
        is_error=False,
    )
    executor = FakeExecutor([scripted_outcome])

    turn_summary = TurnSummary(final_response="")
    await batch_runner.execute_batch(
        tool_calls=[read_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=turn_summary,
        dod=dod,
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert persistent

    next_item_hint = (
        "Continue with the next pending item: `Create the nginx directory structure`"
    )
    assert any(next_item_hint in text for text in persistent)

    resume_hint = "Resume by creating `chapters/` now."
    assert any(resume_hint in text for text in persistent)

    index_first_hint = "Next step: create `index.html`."
    assert not any(index_first_hint in text for text in persistent)
    assert ephemeral == []
1469
1470
@pytest.mark.asyncio
async def test_tool_batch_runner_duplicate_reference_read_prefers_next_pending_todo(
    temp_dir: Path,
) -> None:
    """Feed a DUPLICATE outcome for a reference read through the batch runner.

    The session history already contains a read observation, the first todo is
    completed and two remain pending. The test asserts that exactly one
    persistent steering message is queued, that it contains the
    reuse-the-earlier-observation sentence and names "Create the nginx
    directory structure" as the next pending item, that it does not contain
    "Update `", and that nothing is queued as ephemeral.
    """

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        # Reaching this stub fails the test.
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        # Reaching this stub fails the test.
        raise AssertionError("Verification should not run for this scenario")

    reference_index = temp_dir / "fortran" / "index.html"
    reference_index.parent.mkdir(parents=True)
    reference_index.write_text("<h1>Fortran Beginner's Guide</h1>\n")

    # Session history seeded with one earlier read observation.
    prior_observation = Message(
        role=Role.TOOL,
        content=(
            "Observation [read]: Result: "
            "<h1>Fortran Beginner's Guide</h1>\n"
        ),
    )
    runtime = build_context(
        temp_dir=temp_dir,
        messages=[prior_observation],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )
    prompt = (
        "Have a look at ~/Loader/guides/fortran and chapters/ within. Get a feel "
        "for the structure and cadence of the guide. We are going to make an all "
        "new equally thorough guide on how to use the nginx tool."
    )
    runtime.session.current_task = prompt
    persistent: list[str] = []
    ephemeral: list[str] = []
    runtime.queue_steering_message_callback = persistent.append
    runtime.queue_ephemeral_steering_message_callback = ephemeral.append
    batch_runner = ToolBatchRunner(runtime, DefinitionOfDoneStore(temp_dir))

    dod = create_definition_of_done(prompt)
    todo_states = (
        (
            "Examine the existing Fortran guide structure to understand the cadence and format",
            "completed",
        ),
        ("Create the nginx directory structure", "pending"),
        ("Create the nginx index.html file", "pending"),
    )
    todos = [
        {
            "content": text,
            "active_form": f"Working on: {text}",
            "status": state,
        }
        for text, state in todo_states
    ]
    sync_todos_to_definition_of_done(dod, todos)

    read_call = ToolCall(
        id="read-dup",
        name="read",
        arguments={"file_path": str(reference_index)},
    )
    skip_notice = (
        "[Skipped - duplicate action: Already read "
        f"{reference_index} recently without any intervening changes; "
        "reuse the earlier read result instead of rereading]"
    )
    skip_result = Message.tool_result_message(
        tool_call_id=read_call.id,
        display_content=skip_notice,
        result_content=skip_notice,
    )
    duplicate_outcome = ToolExecutionOutcome(
        tool_call=read_call,
        state=ToolExecutionState.DUPLICATE,
        message=skip_result,
        event_content=skip_notice,
        is_error=False,
        result_output=skip_notice,
    )
    executor = FakeExecutor([duplicate_outcome])

    turn_summary = TurnSummary(final_response="")
    await batch_runner.execute_batch(
        tool_calls=[read_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=turn_summary,
        dod=dod,
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert len(persistent) == 1
    steering = persistent[0]
    assert "Reuse the earlier observation instead of repeating it." in steering
    next_item_hint = (
        "Continue with the next pending item: `Create the nginx directory structure`"
    )
    assert next_item_hint in steering
    assert "Update `" not in steering
    assert ephemeral == []
1593
1594
@pytest.mark.asyncio
async def test_tool_batch_runner_successful_reference_read_prioritizes_concrete_missing_artifact(
    temp_dir: Path,
) -> None:
    """Feed an EXECUTED reference read through the runner with a partly built guide.

    The first nginx chapter exists on disk and is recorded in
    ``dod.touched_files``; ``index.html`` and the second chapter are named in
    the plan but never created. The test asserts that persistent steering is
    queued, that some message confirms progress on the "Examine ..." todo, that
    some message says to resume by creating ``index.html``, that no message
    names "Create each chapter file with appropriate content" as the next
    pending item, and that nothing is queued as ephemeral.
    """

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        # Reaching this stub fails the test.
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        # Reaching this stub fails the test.
        raise AssertionError("Verification should not run for this scenario")

    nginx_root = temp_dir / "Loader" / "guides" / "nginx"
    nginx_chapters = nginx_root / "chapters"
    nginx_chapters.mkdir(parents=True)
    first_chapter = nginx_chapters / "01-introduction.html"
    first_chapter.write_text("<html></html>\n")
    # index.html is named in the plan below but is not written to disk.
    toc_path = nginx_root / "index.html"

    reference_chapter = (
        temp_dir / "Loader" / "guides" / "fortran" / "chapters" / "01-introduction.html"
    )
    reference_chapter.parent.mkdir(parents=True, exist_ok=True)
    reference_chapter.write_text("<h1>Introduction</h1>\n<p>Guide cadence.</p>\n")

    plan_lines = [
        "# Implementation Plan",
        "",
        "## File Changes",
        f"- `{nginx_root}/`",
        f"- `{nginx_chapters}/`",
        f"- `{toc_path}`",
        f"- `{first_chapter}`",
        f"- `{nginx_chapters / '02-installation.html'}`",
        "",
    ]
    plan_path = temp_dir / "implementation.md"
    plan_path.write_text("\n".join(plan_lines))

    runtime = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )
    persistent: list[str] = []
    ephemeral: list[str] = []
    runtime.queue_steering_message_callback = persistent.append
    runtime.queue_ephemeral_steering_message_callback = ephemeral.append
    batch_runner = ToolBatchRunner(runtime, DefinitionOfDoneStore(temp_dir))

    dod = create_definition_of_done("Create a multi-file nginx guide.")
    dod.implementation_plan = str(plan_path)
    dod.touched_files.append(str(first_chapter))
    todo_texts = (
        "Examine the existing Fortran guide structure to understand the format and cadence",
        "Create each chapter file with appropriate content",
        "Ensure all files follow the same structure and style as the Fortran guide",
    )
    todos = [
        {
            "content": text,
            "active_form": f"Working on: {text}",
            "status": "pending",
        }
        for text in todo_texts
    ]
    sync_todos_to_definition_of_done(dod, todos)

    read_call = ToolCall(
        id="read-reference-chapter",
        name="read",
        arguments={"file_path": str(reference_chapter)},
    )
    observation = "Observation [read]: Result: <h1>Introduction</h1>\n<p>Guide cadence.</p>\n"
    observation_message = Message.tool_result_message(
        tool_call_id=read_call.id,
        display_content=observation,
        result_content=observation,
    )
    executed_outcome = ToolExecutionOutcome(
        tool_call=read_call,
        state=ToolExecutionState.EXECUTED,
        message=observation_message,
        event_content=observation,
        is_error=False,
        result_output=observation,
    )
    executor = FakeExecutor([executed_outcome])

    turn_summary = TurnSummary(final_response="")
    await batch_runner.execute_batch(
        tool_calls=[read_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=turn_summary,
        dod=dod,
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert persistent

    progress_hint = (
        "Confirmed progress: `Examine the existing Fortran guide structure to understand the format and cadence`"
    )
    assert any(progress_hint in text for text in persistent)

    resume_hint = "Resume by creating `index.html` now."
    assert any(resume_hint in text for text in persistent)

    generic_next_hint = (
        "Continue with the next pending item: `Create each chapter file with appropriate content`"
    )
    assert all(generic_next_hint not in text for text in persistent)
    assert ephemeral == []
1729
1730
@pytest.mark.asyncio
async def test_tool_batch_runner_duplicate_read_ignores_unplanned_expansion_after_plan_complete(
    temp_dir: Path,
) -> None:
    """Feed a DUPLICATE read outcome through the runner once all planned files exist.

    Every path listed in the implementation plan is present on disk, while
    ``dod.pending_items`` still holds a "Create 07-performance-tuning.html"
    entry that the plan does not list. The test asserts that exactly one
    persistent steering message is queued, that it mentions "Verify all guide
    files are linked and complete" and not the 07 chapter item, and that
    nothing is queued as ephemeral.
    """

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        # Reaching this stub fails the test.
        raise AssertionError("Confidence scoring should not run for this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        # Reaching this stub fails the test.
        raise AssertionError("Verification should not run for this scenario")

    nginx_root = temp_dir / "guides" / "nginx"
    nginx_chapters = nginx_root / "chapters"
    nginx_root.mkdir(parents=True)
    nginx_chapters.mkdir()
    toc_path = nginx_root / "index.html"
    first_chapter = nginx_chapters / "01-getting-started.html"
    second_chapter = nginx_chapters / "02-installation.html"
    for artifact, markup in (
        (toc_path, "<html></html>\n"),
        (first_chapter, "<h1>One</h1>\n"),
        (second_chapter, "<h1>Two</h1>\n"),
    ):
        artifact.write_text(markup)

    plan_lines = [
        "# Implementation Plan",
        "",
        "## File Changes",
        f"- `{nginx_root}/`",
        f"- `{nginx_chapters}/`",
        f"- `{toc_path}`",
        f"- `{first_chapter}`",
        f"- `{second_chapter}`",
        "",
    ]
    plan_path = temp_dir / "implementation.md"
    plan_path.write_text("\n".join(plan_lines))

    runtime = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )
    persistent: list[str] = []
    ephemeral: list[str] = []
    runtime.queue_steering_message_callback = persistent.append
    runtime.queue_ephemeral_steering_message_callback = ephemeral.append
    batch_runner = ToolBatchRunner(runtime, DefinitionOfDoneStore(temp_dir))

    dod = create_definition_of_done("Create a multi-file nginx guide.")
    dod.implementation_plan = str(plan_path)
    unplanned_item = "Create 07-performance-tuning.html"
    verification_item = "Verify all guide files are linked and complete"
    dod.pending_items = [
        unplanned_item,
        verification_item,
        "Complete the requested work",
    ]

    read_call = ToolCall(
        id="read-dup",
        name="read",
        arguments={"file_path": str(first_chapter)},
    )
    skip_notice = (
        "[Skipped - duplicate action: Already read "
        f"{first_chapter} recently without any intervening changes; "
        "reuse the earlier read result instead of rereading]"
    )
    skip_result = Message.tool_result_message(
        tool_call_id=read_call.id,
        display_content=skip_notice,
        result_content=skip_notice,
    )
    duplicate_outcome = ToolExecutionOutcome(
        tool_call=read_call,
        state=ToolExecutionState.DUPLICATE,
        message=skip_result,
        event_content=skip_notice,
        is_error=False,
        result_output=skip_notice,
    )
    executor = FakeExecutor([duplicate_outcome])

    turn_summary = TurnSummary(final_response="")
    await batch_runner.execute_batch(
        tool_calls=[read_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=turn_summary,
        dod=dod,
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert len(persistent) == 1
    steering = persistent[0]
    assert verification_item in steering
    assert unplanned_item not in steering
    assert ephemeral == []
1845
1846
@pytest.mark.asyncio
async def test_tool_batch_runner_duplicate_read_after_plan_complete_pushes_verification_handoff(
    temp_dir: Path,
) -> None:
    """Feed a DUPLICATE read outcome through the runner with verification commands set.

    Every path listed in the implementation plan is present on disk,
    ``dod.verification_commands`` holds one ``ls -la`` command, and
    ``dod.pending_items`` still contains a "Create 07-performance-tuning.html"
    entry. The test asserts that exactly one persistent steering message is
    queued, that it contains both the "All explicitly planned artifacts already
    exist on disk." and the "Move to verification or final confirmation ..."
    sentences, that it omits the 07 chapter item, and that nothing is queued
    as ephemeral.
    """

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        # Reaching this stub fails the test.
        raise AssertionError("Confidence scoring should not run for this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        # Reaching this stub fails the test.
        raise AssertionError("Verification should not run for this scenario")

    nginx_root = temp_dir / "guides" / "nginx"
    nginx_chapters = nginx_root / "chapters"
    nginx_root.mkdir(parents=True)
    nginx_chapters.mkdir()
    toc_path = nginx_root / "index.html"
    first_chapter = nginx_chapters / "01-getting-started.html"
    second_chapter = nginx_chapters / "02-installation.html"
    for artifact, markup in (
        (toc_path, "<html></html>\n"),
        (first_chapter, "<h1>One</h1>\n"),
        (second_chapter, "<h1>Two</h1>\n"),
    ):
        artifact.write_text(markup)

    plan_lines = [
        "# Implementation Plan",
        "",
        "## File Changes",
        f"- `{nginx_root}/`",
        f"- `{nginx_chapters}/`",
        f"- `{toc_path}`",
        f"- `{first_chapter}`",
        f"- `{second_chapter}`",
        "",
    ]
    plan_path = temp_dir / "implementation.md"
    plan_path.write_text("\n".join(plan_lines))

    runtime = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )
    persistent: list[str] = []
    ephemeral: list[str] = []
    runtime.queue_steering_message_callback = persistent.append
    runtime.queue_ephemeral_steering_message_callback = ephemeral.append
    batch_runner = ToolBatchRunner(runtime, DefinitionOfDoneStore(temp_dir))

    dod = create_definition_of_done("Create a multi-file nginx guide.")
    dod.implementation_plan = str(plan_path)
    dod.verification_commands = [f"ls -la {nginx_root}"]
    unplanned_item = "Create 07-performance-tuning.html"
    dod.pending_items = [
        unplanned_item,
        "Complete the requested work",
    ]

    read_call = ToolCall(
        id="read-dup",
        name="read",
        arguments={"file_path": str(first_chapter)},
    )
    skip_notice = (
        "[Skipped - duplicate action: Already read "
        f"{first_chapter} recently without any intervening changes; "
        "reuse the earlier read result instead of rereading]"
    )
    skip_result = Message.tool_result_message(
        tool_call_id=read_call.id,
        display_content=skip_notice,
        result_content=skip_notice,
    )
    duplicate_outcome = ToolExecutionOutcome(
        tool_call=read_call,
        state=ToolExecutionState.DUPLICATE,
        message=skip_result,
        event_content=skip_notice,
        is_error=False,
        result_output=skip_notice,
    )
    executor = FakeExecutor([duplicate_outcome])

    turn_summary = TurnSummary(final_response="")
    await batch_runner.execute_batch(
        tool_calls=[read_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=turn_summary,
        dod=dod,
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert len(persistent) == 1
    steering = persistent[0]
    assert "All explicitly planned artifacts already exist on disk." in steering
    handoff_hint = (
        "Move to verification or final confirmation using the files already on disk."
    )
    assert handoff_hint in steering
    assert unplanned_item not in steering
    assert ephemeral == []
1965
1966
@pytest.mark.asyncio
async def test_tool_batch_runner_duplicate_read_after_plan_complete_ignores_stale_creation_todos(
    temp_dir: Path,
) -> None:
    """Feed a DUPLICATE read outcome through the runner with stale creation todos.

    Every path listed in the implementation plan is present on disk, yet
    ``dod.pending_items`` still contains "Create 01-getting-started.html" and
    "Creating 02-installation.html". The test asserts that exactly one
    persistent steering message is queued, that it contains both the "All
    explicitly planned artifacts already exist on disk." and the "Move to
    verification or final confirmation ..." sentences, that it mentions neither
    creation item, and that nothing is queued as ephemeral.
    """

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        # Reaching this stub fails the test.
        raise AssertionError("Confidence scoring should not run for this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        # Reaching this stub fails the test.
        raise AssertionError("Verification should not run for this scenario")

    nginx_root = temp_dir / "guides" / "nginx"
    nginx_chapters = nginx_root / "chapters"
    nginx_root.mkdir(parents=True)
    nginx_chapters.mkdir()
    toc_path = nginx_root / "index.html"
    first_chapter = nginx_chapters / "01-getting-started.html"
    second_chapter = nginx_chapters / "02-installation.html"
    for artifact, markup in (
        (toc_path, "<html></html>\n"),
        (first_chapter, "<h1>One</h1>\n"),
        (second_chapter, "<h1>Two</h1>\n"),
    ):
        artifact.write_text(markup)

    plan_lines = [
        "# Implementation Plan",
        "",
        "## File Changes",
        f"- `{nginx_root}/`",
        f"- `{nginx_chapters}/`",
        f"- `{toc_path}`",
        f"- `{first_chapter}`",
        f"- `{second_chapter}`",
        "",
    ]
    plan_path = temp_dir / "implementation.md"
    plan_path.write_text("\n".join(plan_lines))

    runtime = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )
    persistent: list[str] = []
    ephemeral: list[str] = []
    runtime.queue_steering_message_callback = persistent.append
    runtime.queue_ephemeral_steering_message_callback = ephemeral.append
    batch_runner = ToolBatchRunner(runtime, DefinitionOfDoneStore(temp_dir))

    dod = create_definition_of_done("Create a multi-file nginx guide.")
    dod.implementation_plan = str(plan_path)
    dod.verification_commands = [f"ls -la {nginx_root}"]
    # Creation todos for files that are already on disk.
    stale_items = (
        "Create 01-getting-started.html",
        "Creating 02-installation.html",
    )
    dod.pending_items = [*stale_items, "Complete the requested work"]

    read_call = ToolCall(
        id="read-dup-built-stale",
        name="read",
        arguments={"file_path": str(first_chapter)},
    )
    skip_notice = (
        "[Skipped - duplicate action: Already read "
        f"{first_chapter} recently without any intervening changes; "
        "reuse the earlier read result instead of rereading]"
    )
    skip_result = Message.tool_result_message(
        tool_call_id=read_call.id,
        display_content=skip_notice,
        result_content=skip_notice,
    )
    duplicate_outcome = ToolExecutionOutcome(
        tool_call=read_call,
        state=ToolExecutionState.DUPLICATE,
        message=skip_result,
        event_content=skip_notice,
        is_error=False,
        result_output=skip_notice,
    )
    executor = FakeExecutor([duplicate_outcome])

    turn_summary = TurnSummary(final_response="")
    await batch_runner.execute_batch(
        tool_calls=[read_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=turn_summary,
        dod=dod,
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert len(persistent) == 1
    steering = persistent[0]
    assert "All explicitly planned artifacts already exist on disk." in steering
    handoff_hint = (
        "Move to verification or final confirmation using the files already on disk."
    )
    assert handoff_hint in steering
    assert "Create 01-getting-started.html" not in steering
    assert "Creating 02-installation.html" not in steering
    assert ephemeral == []
2087
2088
@pytest.mark.asyncio
async def test_tool_batch_runner_successful_read_after_plan_complete_pushes_review_handoff(
    temp_dir: Path,
) -> None:
    """Check the review handoff queued after a successful read of a finished plan.

    Every path listed in the implementation plan already exists on disk, and
    the todo list still carries one creation item plus one review-style item.
    After a successful ``read`` of an existing chapter the test expects exactly
    one ephemeral steering message (and no persistent one) that names the
    review item, omits the creation item, and points toward verification.
    """

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should not run for this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run for this scenario")

    def pending(content: str, active_form: str) -> dict:
        return {"content": content, "active_form": active_form, "status": "pending"}

    # Materialise the complete guide so that nothing in the plan is missing.
    guide_root = temp_dir / "guides" / "nginx"
    chapters = guide_root / "chapters"
    guide_root.mkdir(parents=True)
    chapters.mkdir()
    index_path = guide_root / "index.html"
    chapter_one = chapters / "01-getting-started.html"
    chapter_two = chapters / "02-installation.html"
    for artifact, body in (
        (index_path, "<html></html>\n"),
        (chapter_one, "<h1>One</h1>\n"),
        (chapter_two, "<h1>Two</h1>\n"),
    ):
        artifact.write_text(body)

    planned_entries = (
        f"{guide_root}/",
        f"{chapters}/",
        str(index_path),
        str(chapter_one),
        str(chapter_two),
    )
    plan_lines = ["# Implementation Plan", "", "## File Changes"]
    plan_lines.extend(f"- `{entry}`" for entry in planned_entries)
    implementation_plan = temp_dir / "implementation.md"
    implementation_plan.write_text("\n".join(plan_lines) + "\n")

    context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )
    # Capture both steering channels so the test can tell them apart.
    persistent: list[str] = []
    ephemeral: list[str] = []
    context.queue_steering_message_callback = persistent.append
    context.queue_ephemeral_steering_message_callback = ephemeral.append
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))

    dod = create_definition_of_done("Create a multi-file nginx guide.")
    dod.implementation_plan = str(implementation_plan)
    dod.verification_commands = [f"ls -la {guide_root}"]
    todos = [
        pending(
            "Create 01-getting-started.html",
            "Creating 01-getting-started.html",
        ),
        pending(
            "Ensure all files are properly linked and formatted consistently",
            "Reviewing guide consistency and linkage",
        ),
    ]
    sync_todos_to_definition_of_done(dod, todos)

    tool_call = ToolCall(
        id="read-built-review",
        name="read",
        arguments={"file_path": str(chapter_one)},
    )
    read_outcome = tool_outcome(
        tool_call=tool_call,
        output=chapter_one.read_text(),
        is_error=False,
    )
    executor = FakeExecutor([read_outcome])

    summary = TurnSummary(final_response="")
    await runner.execute_batch(
        tool_calls=[tool_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=summary,
        dod=dod,
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert persistent == []
    assert len(ephemeral) == 1
    message = ephemeral[0]
    for fragment in (
        "All explicitly planned artifacts already exist.",
        "Ensure all files are properly linked and formatted consistently",
    ):
        assert fragment in message
    assert "Create 01-getting-started.html" not in message
    for fragment in (
        "do not keep broad-rereading the output set",
        "If no specific mismatch remains, move to verification now.",
    ):
        assert fragment in message
2200
2201
@pytest.mark.asyncio
async def test_tool_batch_runner_successful_read_after_plan_complete_switches_to_verify(
    temp_dir: Path,
) -> None:
    """Check that a read after plan completion, with no todos, moves to verify.

    All planned artifacts exist and no todo items are synced into the
    definition of done. After a successful ``read`` the test expects a single
    persistent steering message announcing verification, no ephemeral
    message, and ``context.workflow_mode`` equal to ``"verify"``.
    """

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should not run for this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run for this scenario")

    # Materialise the complete guide so that nothing in the plan is missing.
    guide_root = temp_dir / "guides" / "nginx"
    chapters = guide_root / "chapters"
    guide_root.mkdir(parents=True)
    chapters.mkdir()
    index_path = guide_root / "index.html"
    chapter_one = chapters / "01-getting-started.html"
    chapter_two = chapters / "02-installation.html"
    for artifact, body in (
        (index_path, "<html></html>\n"),
        (chapter_one, "<h1>One</h1>\n"),
        (chapter_two, "<h1>Two</h1>\n"),
    ):
        artifact.write_text(body)

    planned_entries = (
        f"{guide_root}/",
        f"{chapters}/",
        str(index_path),
        str(chapter_one),
        str(chapter_two),
    )
    plan_lines = ["# Implementation Plan", "", "## File Changes"]
    plan_lines.extend(f"- `{entry}`" for entry in planned_entries)
    implementation_plan = temp_dir / "implementation.md"
    implementation_plan.write_text("\n".join(plan_lines) + "\n")

    context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )
    persistent: list[str] = []
    ephemeral: list[str] = []
    context.queue_steering_message_callback = persistent.append
    context.queue_ephemeral_steering_message_callback = ephemeral.append
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))

    # No todos are synced here: only the plan and a verification command.
    dod = create_definition_of_done("Create a multi-file nginx guide.")
    dod.implementation_plan = str(implementation_plan)
    dod.verification_commands = [f"ls -la {guide_root}"]

    tool_call = ToolCall(
        id="read-built-verify",
        name="read",
        arguments={"file_path": str(chapter_one)},
    )
    read_outcome = tool_outcome(
        tool_call=tool_call,
        output=chapter_one.read_text(),
        is_error=False,
    )
    executor = FakeExecutor([read_outcome])

    summary = TurnSummary(final_response="")
    await runner.execute_batch(
        tool_calls=[tool_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=summary,
        dod=dod,
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert len(persistent) == 1
    handoff = persistent[0]
    for fragment in (
        "All explicitly planned artifacts already exist.",
        "Verification should run next.",
        "stop broad rereads",
    ):
        assert fragment in handoff
    assert ephemeral == []
    assert context.workflow_mode == "verify"
2296
2297
@pytest.mark.asyncio
async def test_tool_batch_runner_observation_handoff_pushes_mutation_step(
    temp_dir: Path,
) -> None:
    """Check that reading reference material hands off to the next mutation item.

    The todo list starts with an "examine" item followed by a "create" item.
    After a successful ``read`` of the reference chapter the test expects a
    persistent steering message that names the create item and tells the
    model to stop gathering reference material; nothing may be queued on the
    ephemeral channel.
    """

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run for this scenario")

    def working_on(content: str) -> dict:
        # Both todos share the "Working on: <content>" active form.
        return {
            "content": content,
            "active_form": f"Working on: {content}",
            "status": "pending",
        }

    reference_html = "<h1>Introduction</h1>\n<p>Guide cadence.</p>\n"
    reference = temp_dir / "fortran" / "chapters" / "01-introduction.html"
    reference.parent.mkdir(parents=True)
    reference.write_text(reference_html)

    context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )
    persistent: list[str] = []
    ephemeral: list[str] = []
    context.queue_steering_message_callback = persistent.append
    context.queue_ephemeral_steering_message_callback = ephemeral.append
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))

    dod = create_definition_of_done("Create a multi-file nginx guide.")
    todos = [
        working_on(
            "Examine the existing Fortran guide structure to understand the cadence and format"
        ),
        working_on("Create the nginx index.html file"),
    ]
    sync_todos_to_definition_of_done(dod, todos)

    tool_call = ToolCall(
        id="read-reference",
        name="read",
        arguments={"file_path": str(reference)},
    )
    read_outcome = tool_outcome(
        tool_call=tool_call,
        output=reference_html,
        is_error=False,
    )
    executor = FakeExecutor([read_outcome])

    summary = TurnSummary(final_response="")
    await runner.execute_batch(
        tool_calls=[tool_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=summary,
        dod=dod,
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    next_item_hint = "Continue with the next pending item: `Create the nginx index.html file`"
    stop_reading_hint = "stop gathering more reference material and perform the change now"
    assert any(next_item_hint in queued for queued in persistent)
    assert any(stop_reading_hint in queued for queued in persistent)
    assert ephemeral == []
2390
2391
@pytest.mark.asyncio
async def test_tool_batch_runner_discovery_completion_handoff_stays_persistent(
    temp_dir: Path,
) -> None:
    """Check that the post-discovery handoff uses the persistent channel.

    The todo list holds a discovery item followed by a directory-creation
    item. After a successful ``read`` of the reference chapter the test
    expects a persistent steering message naming the directory-creation item
    and nothing on the ephemeral channel.
    """

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run for this scenario")

    def working_on(content: str) -> dict:
        # Both todos share the "Working on: <content>" active form.
        return {
            "content": content,
            "active_form": f"Working on: {content}",
            "status": "pending",
        }

    reference_html = "<h1>Introduction</h1>\n<p>Guide cadence.</p>\n"
    reference = temp_dir / "fortran" / "chapters" / "01-introduction.html"
    reference.parent.mkdir(parents=True)
    reference.write_text(reference_html)

    context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )
    persistent: list[str] = []
    ephemeral: list[str] = []
    context.queue_steering_message_callback = persistent.append
    context.queue_ephemeral_steering_message_callback = ephemeral.append
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))

    dod = create_definition_of_done("Create a multi-file nginx guide.")
    todos = [
        working_on("First, examine the existing fortran guide structure and content"),
        working_on("Create the nginx directory structure"),
    ]
    sync_todos_to_definition_of_done(dod, todos)

    tool_call = ToolCall(
        id="read-reference",
        name="read",
        arguments={"file_path": str(reference)},
    )
    read_outcome = tool_outcome(
        tool_call=tool_call,
        output=reference_html,
        is_error=False,
    )
    executor = FakeExecutor([read_outcome])

    summary = TurnSummary(final_response="")
    await runner.execute_batch(
        tool_calls=[tool_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=summary,
        dod=dod,
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    next_item_hint = (
        "Continue with the next pending item: `Create the nginx directory structure`"
    )
    assert persistent
    assert any(next_item_hint in queued for queued in persistent)
    assert ephemeral == []
2481
2482
@pytest.mark.asyncio
async def test_tool_batch_runner_missing_artifact_nudge_names_next_file_after_setup_mkdir(
    temp_dir: Path,
) -> None:
    """Check the nudge queued after a successful ``mkdir -p`` setup command.

    The plan lists the chapters directory and ``index.html``. After a
    successful ``bash`` ``mkdir -p`` outcome the test expects the last
    persistent steering message to say directory setup is complete, name
    ``index.html`` as the next file, and ask for a compact initial version;
    the ephemeral channel must stay empty.
    """

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run for this scenario")

    def pending(content: str, active_form: str) -> dict:
        return {"content": content, "active_form": active_form, "status": "pending"}

    # The test itself creates none of these paths on disk.
    nginx_root = temp_dir / "Loader" / "guides" / "nginx"
    chapters = nginx_root / "chapters"

    plan_lines = [
        "# Implementation Plan",
        "",
        "## File Changes",
        f"- `{chapters}/`",
        f"- `{nginx_root / 'index.html'}`",
    ]
    implementation_plan = temp_dir / "implementation.md"
    implementation_plan.write_text("\n".join(plan_lines) + "\n")

    context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )
    persistent: list[str] = []
    ephemeral: list[str] = []
    context.queue_steering_message_callback = persistent.append
    context.queue_ephemeral_steering_message_callback = ephemeral.append
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))

    dod = create_definition_of_done("Create a multi-file nginx guide.")
    dod.implementation_plan = str(implementation_plan)
    todos = [
        pending(
            "Create the nginx directory structure",
            "Creating the nginx directory structure",
        ),
        pending(
            "Develop the main index.html file with proper structure",
            "Developing the main index.html file with proper structure",
        ),
    ]
    sync_todos_to_definition_of_done(dod, todos)

    tool_call = ToolCall(
        id="mkdir-nginx",
        name="bash",
        arguments={"command": f"mkdir -p {chapters}"},
    )
    mkdir_outcome = tool_outcome(tool_call=tool_call, output="", is_error=False)
    executor = FakeExecutor([mkdir_outcome])

    summary = TurnSummary(final_response="")
    await runner.execute_batch(
        tool_calls=[tool_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=summary,
        dod=dod,
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert persistent
    message = persistent[-1]
    for fragment in (
        "Directory setup is complete.",
        "Next step: create `index.html`.",
        "Write a compact but real initial version of that file now",
    ):
        assert fragment in message
    assert ephemeral == []
2585
2586
@pytest.mark.asyncio
async def test_tool_batch_runner_first_chapter_handoff_stays_persistent_until_substantive_output_exists(
    temp_dir: Path,
) -> None:
    """Check the handoff queued after the index write when a chapter is still planned.

    The plan lists the chapters directory, ``index.html`` and
    ``01-introduction.html``. After a successful ``write`` outcome for the
    index the test expects the last persistent steering message to confirm
    progress, name the first chapter as the next step together with its
    resolved path, and omit the "compact but real initial version" wording.
    The ephemeral channel must stay empty.
    """

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run for this scenario")

    def pending(content: str, active_form: str) -> dict:
        return {"content": content, "active_form": active_form, "status": "pending"}

    # Only the directories exist; the test writes no guide file itself.
    nginx_root = temp_dir / "guides" / "nginx"
    chapters = nginx_root / "chapters"
    chapters.mkdir(parents=True)
    index_path = nginx_root / "index.html"

    plan_lines = [
        "# Implementation Plan",
        "",
        "## File Changes",
        f"- `{chapters}/`",
        f"- `{index_path}`",
        f"- `{chapters / '01-introduction.html'}`",
    ]
    implementation_plan = temp_dir / "implementation.md"
    implementation_plan.write_text("\n".join(plan_lines) + "\n")

    context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )
    persistent: list[str] = []
    ephemeral: list[str] = []
    context.queue_steering_message_callback = persistent.append
    context.queue_ephemeral_steering_message_callback = ephemeral.append
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))

    dod = create_definition_of_done("Create a multi-file nginx guide.")
    dod.implementation_plan = str(implementation_plan)
    todos = [
        pending(
            "Create the main index.html file with proper structure",
            "Creating the main index.html file with proper structure",
        ),
        pending(
            "Create each chapter file with appropriate content",
            "Creating each chapter file with appropriate content",
        ),
    ]
    sync_todos_to_definition_of_done(dod, todos)

    tool_call = ToolCall(
        id="write-index",
        name="write",
        arguments={
            "file_path": str(index_path),
            "content": "<html></html>\n",
        },
    )
    write_outcome = tool_outcome(
        tool_call=tool_call,
        output=f"Successfully wrote 14 bytes to {index_path}",
        is_error=False,
    )
    executor = FakeExecutor([write_outcome])

    summary = TurnSummary(final_response="")
    await runner.execute_batch(
        tool_calls=[tool_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=summary,
        dod=dod,
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert persistent
    assert ephemeral == []
    message = persistent[-1]
    assert "Confirmed progress:" in message
    assert "Next step: create `01-introduction.html`." in message
    resolved_chapter = (chapters / "01-introduction.html").resolve(strict=False)
    write_hint = (
        f"Prefer one `write(file_path=..., content=...)` call for `{resolved_chapter}` now."
    )
    assert write_hint in message
    assert "Write a compact but real initial version of that file now" not in message
    assert "Do not reread reference material or spend the next turn on bookkeeping." in message
2701
2702
@pytest.mark.asyncio
async def test_tool_batch_runner_directory_handoff_uses_home_relative_path(
    temp_dir: Path,
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """Check that the directory handoff shows the next file as a ``~/`` path.

    ``HOME`` is pointed at the resolved ``temp_dir`` and the planned guide
    lives beneath it. After a successful ``mkdir -p`` outcome the test
    expects the last persistent steering message to name ``index.html`` as
    the next step and to contain ``~/Loader/guides/nginx/index.html``.
    """
    home_dir = temp_dir.resolve(strict=False)
    monkeypatch.setenv("HOME", str(home_dir))

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run for this scenario")

    def pending(content: str, active_form: str) -> dict:
        return {"content": content, "active_form": active_form, "status": "pending"}

    # The test itself creates none of these paths on disk.
    nginx_root = temp_dir / "Loader" / "guides" / "nginx"
    chapters = nginx_root / "chapters"
    index_path = nginx_root / "index.html"

    plan_lines = [
        "# Implementation Plan",
        "",
        "## File Changes",
        f"- `{chapters}/`",
        f"- `{index_path}`",
    ]
    implementation_plan = temp_dir / "implementation.md"
    implementation_plan.write_text("\n".join(plan_lines) + "\n")

    context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )
    # Only the persistent channel is captured in this test.
    persistent: list[str] = []
    context.queue_steering_message_callback = persistent.append
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))

    dod = create_definition_of_done("Create a multi-file nginx guide.")
    dod.implementation_plan = str(implementation_plan)
    todos = [
        pending(
            "Create the nginx directory structure",
            "Creating the nginx directory structure",
        ),
        pending(
            "Develop the main index.html file with proper structure",
            "Developing the main index.html file with proper structure",
        ),
    ]
    sync_todos_to_definition_of_done(dod, todos)

    tool_call = ToolCall(
        id="mkdir-nginx-home",
        name="bash",
        arguments={"command": f"mkdir -p {chapters}"},
    )
    mkdir_outcome = tool_outcome(tool_call=tool_call, output="", is_error=False)
    executor = FakeExecutor([mkdir_outcome])

    summary = TurnSummary(final_response="")
    await runner.execute_batch(
        tool_calls=[tool_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=summary,
        dod=dod,
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert persistent
    message = persistent[-1]
    for fragment in (
        "Next step: create `index.html`.",
        "`~/Loader/guides/nginx/index.html`",
        "Write a compact but real initial version of that file now",
    ):
        assert fragment in message
2807
2808
@pytest.mark.asyncio
async def test_tool_batch_runner_redirects_post_write_self_audit_to_next_missing_artifact(
    temp_dir: Path,
) -> None:
    """Check that rereading a just-written file is redirected to the next artifact.

    ``index.html`` exists, is recorded in ``dod.touched_files`` and links to
    chapter files; ``01-introduction.html`` is planned but was not created by
    the test. After a successful ``read`` of the index the test expects the
    last persistent steering message to say the contents are already known
    and to resume with the first chapter; the ephemeral channel stays empty.
    """

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should not run in this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run in this scenario")

    nginx_root = temp_dir / "guides" / "nginx"
    chapters = nginx_root / "chapters"
    chapters.mkdir(parents=True)
    index_path = nginx_root / "index.html"
    index_lines = (
        "<html>",
        '<a href="chapters/01-introduction.html">Chapter 1: Introduction to Nginx</a>',
        '<a href="chapters/02-installation.html">Chapter 2: Installation and Setup</a>',
        "</html>",
    )
    # Every line, including the last, is newline-terminated.
    index_path.write_text("".join(f"{line}\n" for line in index_lines))

    plan_lines = [
        "# Implementation Plan",
        "",
        "## File Changes",
        f"- `{nginx_root}/`",
        f"- `{chapters}/`",
        f"- `{index_path}`",
        f"- `{chapters / '01-introduction.html'}`",
    ]
    implementation_plan = temp_dir / "implementation.md"
    implementation_plan.write_text("\n".join(plan_lines) + "\n")

    context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )
    persistent: list[str] = []
    ephemeral: list[str] = []
    context.queue_steering_message_callback = persistent.append
    context.queue_ephemeral_steering_message_callback = ephemeral.append
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))

    # Record the index as already touched and completed; chapters remain pending.
    dod = create_definition_of_done("Create a multi-file nginx guide.")
    dod.implementation_plan = str(implementation_plan)
    dod.touched_files.append(str(index_path))
    dod.completed_items.append("Develop the main index.html file for the nginx guide")
    dod.pending_items.append("Create chapter files for the nginx guide")

    tool_call = ToolCall(
        id="read-index-self-audit",
        name="read",
        arguments={"file_path": str(index_path)},
    )
    read_outcome = tool_outcome(
        tool_call=tool_call,
        output="1\t<html>\n",
        is_error=False,
    )
    executor = FakeExecutor([read_outcome])

    summary = TurnSummary(final_response="")
    await runner.execute_batch(
        tool_calls=[tool_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=summary,
        dod=dod,
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert persistent
    message = persistent[-1]
    for fragment in (
        "You already have the current contents of `index.html` from the successful write.",
        "Resume by creating `01-introduction.html` now.",
        "Do not spend another turn rereading the file you just wrote or on TodoWrite alone.",
    ):
        assert fragment in message
    assert ephemeral == []
2915
2916
@pytest.mark.asyncio
async def test_tool_batch_runner_preserves_first_file_handoff_after_recovery_prompt(
    temp_dir: Path,
) -> None:
    """Check the first-file handoff when the session starts with a recovery prompt.

    The context is seeded with a user message carrying the
    ``[EMPTY ASSISTANT RESPONSE]`` recovery text. After a successful
    ``write`` outcome for ``index.html`` the test expects the last persistent
    steering message to name ``01-introduction.html`` as the next step and
    to omit the "compact but real initial version" wording; nothing may be
    queued on the ephemeral channel.
    """

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run for this scenario")

    def pending(content: str, active_form: str) -> dict:
        return {"content": content, "active_form": active_form, "status": "pending"}

    # Only the directories exist; the test writes no guide file itself.
    nginx_root = temp_dir / "guides" / "nginx"
    chapters = nginx_root / "chapters"
    chapters.mkdir(parents=True)
    index_path = nginx_root / "index.html"

    plan_lines = [
        "# Implementation Plan",
        "",
        "## File Changes",
        f"- `{chapters}/`",
        f"- `{index_path}`",
        f"- `{chapters / '01-introduction.html'}`",
    ]
    implementation_plan = temp_dir / "implementation.md"
    implementation_plan.write_text("\n".join(plan_lines) + "\n")

    recovery_prompt = Message(
        role=Role.USER,
        content=(
            "[EMPTY ASSISTANT RESPONSE]\n"
            "Respond with that concrete mutation tool call now. Do not return an empty response."
        ),
    )
    context = build_context(
        temp_dir=temp_dir,
        messages=[recovery_prompt],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )
    persistent: list[str] = []
    ephemeral: list[str] = []
    context.queue_steering_message_callback = persistent.append
    context.queue_ephemeral_steering_message_callback = ephemeral.append
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))

    dod = create_definition_of_done("Create a multi-file nginx guide.")
    dod.implementation_plan = str(implementation_plan)
    todos = [
        pending(
            "Create the main index.html file with proper structure",
            "Creating the main index.html file with proper structure",
        ),
        pending(
            "Create each chapter file with appropriate content",
            "Creating each chapter file with appropriate content",
        ),
    ]
    sync_todos_to_definition_of_done(dod, todos)

    tool_call = ToolCall(
        id="write-index-recovered",
        name="write",
        arguments={
            "file_path": str(index_path),
            "content": "<html></html>\n",
        },
    )
    write_outcome = tool_outcome(
        tool_call=tool_call,
        output=f"Successfully wrote 14 bytes to {index_path}",
        is_error=False,
    )
    executor = FakeExecutor([write_outcome])

    summary = TurnSummary(final_response="")
    await runner.execute_batch(
        tool_calls=[tool_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=summary,
        dod=dod,
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert persistent
    assert ephemeral == []
    message = persistent[-1]
    assert "Next step: create `01-introduction.html`." in message
    assert "Write a compact but real initial version of that file now" not in message
3033
3034
@pytest.mark.asyncio
async def test_tool_batch_runner_todowrite_uses_concrete_output_language_for_aggregate_chapter_step(
    temp_dir: Path,
) -> None:
    """TodoWrite follow-up names a concrete file, not the aggregate todo text.

    Setup: ``index.html`` exists and links to two chapter files that have not
    been written, and the only pending todo is the aggregate "Create chapter
    files with content and structure". After a successful ``TodoWrite``
    batch, the last queued steering message must point at
    ``01-introduction.html`` and must not echo the aggregate todo wording.
    """

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        # Any invocation fails the test: this hook must stay unused here.
        raise AssertionError("Confidence scoring should not run in this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        # Any invocation fails the test: this hook must stay unused here.
        raise AssertionError("Verification should not run in this scenario")

    def todo_snapshot() -> list[dict[str, str]]:
        # Builds a new list (and new dicts) on every call so the DoD sync and
        # the tool call never share todo objects.
        return [
            {
                "content": "Develop the main index.html file with proper structure",
                "active_form": "Developing the main index.html file with proper structure",
                "status": "completed",
            },
            {
                "content": "Create chapter files with content and structure",
                "active_form": "Creating chapter files with content and structure",
                "status": "pending",
            },
        ]

    # --- on-disk fixture: index only, chapters directory left empty ---------
    nginx_dir = temp_dir / "guides" / "nginx"
    chapter_dir = nginx_dir / "chapters"
    chapter_dir.mkdir(parents=True)
    index_file = nginx_dir / "index.html"
    index_markup = "\n".join(
        (
            "<html>",
            '<a href="chapters/01-introduction.html">Chapter 1: Introduction to Nginx</a>',
            '<a href="chapters/02-installation.html">Chapter 2: Installation and Setup</a>',
            "</html>",
        )
    )
    index_file.write_text(index_markup + "\n")

    plan_file = temp_dir / "implementation.md"
    plan_lines = [
        "# Implementation Plan",
        "",
        "## File Changes",
        f"- `{nginx_dir}/`",
        f"- `{chapter_dir}/`",
        f"- `{index_file}`",
        "",
    ]
    plan_file.write_text("\n".join(plan_lines))

    # --- runtime wiring -------------------------------------------------------
    context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
    )
    queued_messages: list[str] = []
    context.queue_steering_message_callback = queued_messages.append
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))

    dod = create_definition_of_done("Create a multi-file nginx guide.")
    dod.implementation_plan = str(plan_file)
    dod.touched_files.append(str(index_file))
    sync_todos_to_definition_of_done(dod, todo_snapshot())

    # The tool call arguments and the outcome metadata share one list object.
    todos = todo_snapshot()
    tool_call = ToolCall(
        id="todo-aggregate",
        name="TodoWrite",
        arguments={"todos": todos},
    )
    scripted_outcome = tool_outcome(
        tool_call=tool_call,
        output="Todos updated",
        is_error=False,
        metadata={"new_todos": todos},
    )
    executor = FakeExecutor([scripted_outcome])

    summary = TurnSummary(final_response="")
    await runner.execute_batch(
        tool_calls=[tool_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=summary,
        dod=dod,
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert queued_messages
    final_message = queued_messages[-1]
    assert "Todo tracking is updated." in final_message
    assert "Next step: create `01-introduction.html`." in final_message
    aggregate_wording = (
        "Continue with the next pending item: `Create chapter files with content and structure`."
    )
    assert aggregate_wording not in final_message
3165
3166
@pytest.mark.asyncio
async def test_duplicate_observation_nudge_prioritizes_missing_artifact_over_review(
    temp_dir: Path,
) -> None:
    """Duplicate-read nudge points at the missing chapter, not the review todo.

    Setup: the plan lists ``06-ssl-configuration.html`` which is absent on
    disk, while the first pending todo is a consistency/review item. The
    prioritisation helper must return truthy for that combination, and the
    nudge queued for a duplicate ``read`` must mention the missing chapter,
    warn against review mode, and not fall back to the review todo wording.
    """

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        # Any invocation fails the test.
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        # Any invocation fails the test.
        raise AssertionError("Verification should not run for this scenario")

    # --- on-disk fixture: index + chapter one exist, chapter six does not ----
    nginx_dir = temp_dir / "guides" / "nginx"
    chapter_dir = nginx_dir / "chapters"
    chapter_dir.mkdir(parents=True)
    index_file = nginx_dir / "index.html"
    first_chapter = chapter_dir / "01-getting-started.html"
    absent_chapter = chapter_dir / "06-ssl-configuration.html"
    first_chapter.write_text("<h1>One</h1>\n")
    index_file.write_text('<a href="chapters/01-getting-started.html">One</a>\n')

    plan_file = temp_dir / "implementation.md"
    plan_lines = [
        "# Implementation Plan",
        "",
        "## File Changes",
        f"- `{index_file}`",
        f"- `{first_chapter}`",
        f"- `{absent_chapter}`",
        "",
    ]
    plan_file.write_text("\n".join(plan_lines))

    # --- runtime wiring -------------------------------------------------------
    context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )
    persistent_messages: list[str] = []
    ephemeral_messages: list[str] = []
    context.queue_steering_message_callback = persistent_messages.append
    context.queue_ephemeral_steering_message_callback = ephemeral_messages.append
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))

    dod = create_definition_of_done("Create a multi-file nginx guide.")
    dod.implementation_plan = str(plan_file)
    pending_todos = [
        {
            "content": "Ensure all files are properly linked and formatted consistently",
            "active_form": "Working on: Ensure all files are properly linked and formatted consistently",
            "status": "pending",
        },
        {
            "content": "Create the final chapter (06-ssl-configuration.html)",
            "active_form": "Working on: Create the final chapter (06-ssl-configuration.html)",
            "status": "pending",
        },
    ]
    sync_todos_to_definition_of_done(dod, pending_todos)

    # Check the helper directly before exercising the nudge path.
    should_prioritize = tool_batches_should_prioritize_missing_artifact(
        dod=dod,
        next_pending=dod.pending_items[0],
        missing_artifact=(absent_chapter, False),
        project_root=temp_dir,
    )
    assert should_prioritize

    tool_call = ToolCall(
        id="dup-read",
        name="read",
        arguments={"file_path": str(index_file)},
    )
    runner._queue_duplicate_observation_nudge(tool_call, dod=dod)  # type: ignore[attr-defined]

    assert persistent_messages
    nudge = persistent_messages[-1]
    assert "06-ssl-configuration.html" in nudge
    assert "Do not switch into review or consistency-check mode" in nudge
    review_wording = (
        "Continue with the next pending item: `Ensure all files are properly linked and formatted consistently`"
    )
    assert review_wording not in nudge
3261
3262
@pytest.mark.asyncio
async def test_tool_batch_runner_hands_off_to_verification_once_planned_artifacts_exist(
    temp_dir: Path,
) -> None:
    """A write that leaves every planned file on disk triggers the handoff.

    Setup: the index and both planned chapters already exist and the only
    pending todo is a consistency check. After a successful ``write`` batch,
    the persistent steering queue must contain the "all artifacts exist"
    notice, the pending todo text, and the move-to-verification instruction.
    """

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        # Any invocation fails the test.
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        # Any invocation fails the test.
        raise AssertionError("Verification should not run for this scenario")

    # --- on-disk fixture: every planned file is present ----------------------
    nginx_dir = temp_dir / "guides" / "nginx"
    chapter_dir = nginx_dir / "chapters"
    chapter_dir.mkdir(parents=True)
    index_file = nginx_dir / "index.html"
    first_chapter = chapter_dir / "01-getting-started.html"
    second_chapter = chapter_dir / "02-installation.html"
    seeded_files = (
        (index_file, '<a href="chapters/01-getting-started.html">One</a>\n'),
        (first_chapter, "<h1>One</h1>\n"),
        (second_chapter, "<h1>Two</h1>\n"),
    )
    for target, markup in seeded_files:
        target.write_text(markup)

    plan_file = temp_dir / "implementation.md"
    plan_lines = [
        "# Implementation Plan",
        "",
        "## File Changes",
        f"- `{chapter_dir}/`",
        f"- `{index_file}`",
        f"- `{first_chapter}`",
        f"- `{second_chapter}`",
        "",
    ]
    plan_file.write_text("\n".join(plan_lines))

    # --- runtime wiring -------------------------------------------------------
    context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )
    persistent_messages: list[str] = []
    ephemeral_messages: list[str] = []
    context.queue_steering_message_callback = persistent_messages.append
    context.queue_ephemeral_steering_message_callback = ephemeral_messages.append
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))

    dod = create_definition_of_done("Create a multi-file nginx guide.")
    dod.implementation_plan = str(plan_file)
    tracked_todos = [
        {
            "content": "Create the guide files",
            "active_form": "Working on: Create the guide files",
            "status": "completed",
        },
        {
            "content": "Ensure all files are properly linked and formatted consistently",
            "active_form": "Working on: Ensure all files are properly linked and formatted consistently",
            "status": "pending",
        },
    ]
    sync_todos_to_definition_of_done(dod, tracked_todos)

    tool_call = ToolCall(
        id="write-final",
        name="write",
        arguments={
            "file_path": str(second_chapter),
            "content": "<h1>Two</h1>\n",
        },
    )
    scripted_outcome = tool_outcome(
        tool_call=tool_call,
        output=f"Successfully wrote {second_chapter}",
        is_error=False,
    )
    executor = FakeExecutor([scripted_outcome])

    summary = TurnSummary(final_response="")
    await runner.execute_batch(
        tool_calls=[tool_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=summary,
        dod=dod,
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    # Each fragment must show up in at least one persistent steering message.
    required_fragments = (
        "All explicitly planned artifacts now exist on disk.",
        "Ensure all files are properly linked and formatted consistently",
        "Move to verification once no specific mismatch remains.",
    )
    for fragment in required_fragments:
        assert any(fragment in message for message in persistent_messages)
3383
3384
@pytest.mark.asyncio
async def test_tool_batch_runner_mutation_handoff_points_at_next_missing_artifact(
    temp_dir: Path,
) -> None:
    """After writing the index, steering names the first missing chapter.

    Setup: only ``index.html`` exists; the plan also lists two chapter files.
    After a successful ``write`` of the index, the last persistent steering
    message must name ``01-getting-started.html`` as the next step, include
    the no-bookkeeping instruction, and omit both the "compact but real
    initial version" and "refresh `TodoWrite`" wording. Nothing may be queued
    on the ephemeral channel.
    """

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        # Any invocation fails the test.
        raise AssertionError("Confidence scoring should not run in this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        # Any invocation fails the test.
        raise AssertionError("Verification should not run in this scenario")

    # --- on-disk fixture: index present, no chapter files --------------------
    nginx_dir = temp_dir / "guides" / "nginx"
    chapter_dir = nginx_dir / "chapters"
    nginx_dir.mkdir(parents=True)
    chapter_dir.mkdir()
    index_file = nginx_dir / "index.html"
    index_file.write_text("<html></html>\n")
    first_chapter = chapter_dir / "01-getting-started.html"
    second_chapter = chapter_dir / "02-installation.html"

    plan_file = temp_dir / "implementation.md"
    plan_lines = [
        "# Implementation Plan",
        "",
        "## File Changes",
        f"- `{nginx_dir}/`",
        f"- `{index_file}`",
        f"- `{first_chapter}`",
        f"- `{second_chapter}`",
        "",
    ]
    plan_file.write_text("\n".join(plan_lines))

    # --- runtime wiring -------------------------------------------------------
    context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )
    persistent_messages: list[str] = []
    ephemeral_messages: list[str] = []
    context.queue_steering_message_callback = persistent_messages.append
    context.queue_ephemeral_steering_message_callback = ephemeral_messages.append
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))

    dod = create_definition_of_done("Create a multi-file nginx guide.")
    dod.implementation_plan = str(plan_file)
    pending_todos = [
        {
            "content": "Create the main index.html file with proper structure",
            "active_form": "Working on: Create the main index.html file with proper structure",
            "status": "pending",
        },
        {
            "content": "Create each chapter file in sequence, following the established pattern",
            "active_form": "Working on: Create each chapter file in sequence, following the established pattern",
            "status": "pending",
        },
        {
            "content": "Ensure all files are properly linked and formatted consistently",
            "active_form": "Working on: Ensure all files are properly linked and formatted consistently",
            "status": "pending",
        },
    ]
    sync_todos_to_definition_of_done(dod, pending_todos)

    tool_call = ToolCall(
        id="write-index",
        name="write",
        arguments={"file_path": str(index_file), "content": "<html></html>\n"},
    )
    scripted_outcome = tool_outcome(
        tool_call=tool_call,
        output=f"Successfully wrote {index_file}",
        is_error=False,
    )
    executor = FakeExecutor([scripted_outcome])

    summary = TurnSummary(final_response="")
    await runner.execute_batch(
        tool_calls=[tool_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=summary,
        dod=dod,
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert persistent_messages
    assert ephemeral_messages == []
    handoff = persistent_messages[-1]
    assert "Next step: create `01-getting-started.html`." in handoff
    assert "Write a compact but real initial version of that file now" not in handoff
    assert "refresh `TodoWrite`" not in handoff
    assert "Do not reread reference material or spend the next turn on bookkeeping." in handoff
3494
3495
@pytest.mark.asyncio
async def test_tool_batch_runner_large_plan_does_not_claim_completion_early(
    temp_dir: Path,
) -> None:
    """A partially built seven-chapter plan must not be reported as complete.

    Setup: the plan lists an index plus seven chapters; the index and
    chapters 01-05 exist on disk, 06 and 07 do not. After a successful
    ``write`` of chapter 05, an ephemeral steering message must name
    ``06-performance-tuning.html`` as the next step, and no steering message
    on either channel may claim that all planned artifacts exist.
    """

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        # Any invocation fails the test.
        raise AssertionError("Confidence scoring should not run in this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        # Any invocation fails the test.
        raise AssertionError("Verification should not run in this scenario")

    # --- on-disk fixture: index + chapters 01-05 present, 06/07 missing ------
    guide_root = temp_dir / "guides" / "nginx"
    chapters = guide_root / "chapters"
    guide_root.mkdir(parents=True)
    chapters.mkdir()
    index_path = guide_root / "index.html"
    index_path.write_text("<html></html>\n")

    chapter_paths = [
        chapters / "01-getting-started.html",
        chapters / "02-installation.html",
        chapters / "03-first-website.html",
        chapters / "04-configuration-basics.html",
        chapters / "05-advanced-configurations.html",
        chapters / "06-performance-tuning.html",
        chapters / "07-security-best-practices.html",
    ]
    for chapter in chapter_paths[:4]:
        chapter.write_text(f"<h1>{chapter.stem}</h1>\n")
    # Chapter 05 is the file the scripted write call targets below.
    chapter_paths[4].write_text("<h1>Advanced configurations</h1>\n")

    implementation_plan = temp_dir / "implementation.md"
    implementation_plan.write_text(
        "\n".join(
            [
                "# Implementation Plan",
                "",
                "## File Changes",
                f"- `{guide_root}/`",
                f"- `{chapters}/`",
                f"- `{index_path}`",
                *[f"- `{path}`" for path in chapter_paths],
                "",
            ]
        )
    )

    # --- runtime wiring -------------------------------------------------------
    context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )
    persistent_messages: list[str] = []
    ephemeral_messages: list[str] = []
    context.queue_steering_message_callback = persistent_messages.append
    context.queue_ephemeral_steering_message_callback = ephemeral_messages.append
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))
    dod = create_definition_of_done("Create a thorough nginx guide.")
    dod.implementation_plan = str(implementation_plan)
    sync_todos_to_definition_of_done(
        dod,
        [
            {
                "content": "Create the nginx guide artifacts",
                "active_form": "Creating nginx guide artifacts",
                "status": "pending",
            },
            {
                "content": "Verify all guide files are linked and complete",
                "active_form": "Verifying guide linkage and completeness",
                "status": "pending",
            },
        ],
    )
    tool_call = ToolCall(
        id="write-chapter-05",
        name="write",
        arguments={
            "file_path": str(chapter_paths[4]),
            "content": "<h1>Advanced configurations</h1>\n",
        },
    )
    executor = FakeExecutor(
        [
            tool_outcome(
                tool_call=tool_call,
                output=f"Successfully wrote {chapter_paths[4]}",
                is_error=False,
            )
        ]
    )

    summary = TurnSummary(final_response="")
    await runner.execute_batch(
        tool_calls=[tool_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=summary,
        dod=dod,
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert any(
        "Next step: create `06-performance-tuning.html`." in message
        for message in ephemeral_messages
    )
    # The completion notice must be absent from BOTH channels: checking only
    # the ephemeral queue would let a premature claim on the persistent queue
    # slip through unnoticed.
    assert not any(
        "All explicitly planned artifacts now exist on disk." in message
        for message in (*persistent_messages, *ephemeral_messages)
    )
3622
3623
@pytest.mark.asyncio
async def test_tool_batch_runner_uses_compact_missing_artifact_nudge_after_substantial_progress(
    temp_dir: Path,
) -> None:
    """With most files written, the nudge for the last chapter is ephemeral.

    Setup: the index and chapters 01-04 exist and are recorded as touched;
    chapter 05 is planned but absent. After a successful rewrite of chapter
    04, the last ephemeral steering message must name
    ``05-advanced-features.html`` as the next step, include the
    no-bookkeeping instruction, and omit the "refresh `TodoWrite`" wording.
    """

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        # Any invocation fails the test.
        raise AssertionError("Confidence scoring should not run in this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        # Any invocation fails the test.
        raise AssertionError("Verification should not run in this scenario")

    # --- on-disk fixture: index + chapters 01-04 present, 05 missing ---------
    nginx_dir = temp_dir / "guides" / "nginx"
    chapter_dir = nginx_dir / "chapters"
    nginx_dir.mkdir(parents=True)
    chapter_dir.mkdir()
    index_file = nginx_dir / "index.html"
    chapter_files = [
        chapter_dir / "01-introduction.html",
        chapter_dir / "02-installation.html",
        chapter_dir / "03-configuration.html",
        chapter_dir / "04-basic-usage.html",
        chapter_dir / "05-advanced-features.html",
    ]
    already_written = (index_file, *chapter_files[:4])
    for existing in already_written:
        existing.write_text("<html></html>\n")

    plan_file = temp_dir / "implementation.md"
    plan_lines = [
        "# Implementation Plan",
        "",
        "## File Changes",
        f"- `{nginx_dir}/`",
        f"- `{chapter_dir}/`",
        f"- `{index_file}`",
    ]
    plan_lines.extend(f"- `{chapter}`" for chapter in chapter_files)
    plan_lines.append("")
    plan_file.write_text("\n".join(plan_lines))

    # --- runtime wiring -------------------------------------------------------
    context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )
    persistent_messages: list[str] = []
    ephemeral_messages: list[str] = []
    context.queue_steering_message_callback = persistent_messages.append
    context.queue_ephemeral_steering_message_callback = ephemeral_messages.append
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))

    dod = create_definition_of_done("Create a thorough nginx guide.")
    dod.implementation_plan = str(plan_file)
    dod.touched_files.extend(str(existing) for existing in already_written)
    dod.completed_items.extend(
        [
            "Create the nginx directory structure",
            "Create the main index.html file with proper structure",
        ]
    )
    remaining_todos = [
        {
            "content": "Create each chapter file with appropriate content",
            "active_form": "Creating each chapter file with appropriate content",
            "status": "pending",
        }
    ]
    sync_todos_to_definition_of_done(dod, remaining_todos)

    fourth_chapter = chapter_files[3]
    tool_call = ToolCall(
        id="write-chapter-04",
        name="write",
        arguments={
            "file_path": str(fourth_chapter),
            "content": "<html>updated</html>\n",
        },
    )
    scripted_outcome = tool_outcome(
        tool_call=tool_call,
        output=f"Successfully wrote {fourth_chapter}",
        is_error=False,
    )
    executor = FakeExecutor([scripted_outcome])

    summary = TurnSummary(final_response="")
    await runner.execute_batch(
        tool_calls=[tool_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=summary,
        dod=dod,
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert ephemeral_messages
    nudge = ephemeral_messages[-1]
    assert "Next step: create `05-advanced-features.html`." in nudge
    assert "Do not reread reference material or spend the next turn on bookkeeping." in nudge
    assert "refresh `TodoWrite`" not in nudge
3744
3745
@pytest.mark.asyncio
async def test_tool_batch_runner_todowrite_with_missing_artifact_requeues_exact_resume_step(
    temp_dir: Path,
) -> None:
    """A TodoWrite-only turn re-queues the exact file that is still missing.

    Setup: the index and chapter one exist; chapter two is planned, pending
    in the todo list, and absent on disk. After a successful ``TodoWrite``
    batch, the last persistent steering message must name
    ``02-installation.html``, recommend a single ``write`` call, and demand
    a concrete mutation tool call next. The ephemeral channel stays empty.
    """

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        # Any invocation fails the test.
        raise AssertionError("Confidence scoring should not run in this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        # Any invocation fails the test.
        raise AssertionError("Verification should not run in this scenario")

    def todo_snapshot() -> list[dict[str, str]]:
        # Builds a new list (and new dicts) on every call so the DoD sync,
        # the tool-call arguments and the outcome metadata stay independent.
        return [
            {
                "content": "Create 01-getting-started.html",
                "active_form": "Creating 01-getting-started.html",
                "status": "completed",
            },
            {
                "content": "Create 02-installation.html",
                "active_form": "Creating 02-installation.html",
                "status": "pending",
            },
        ]

    # --- on-disk fixture: index + chapter one present, chapter two missing ---
    nginx_dir = temp_dir / "guides" / "nginx"
    chapter_dir = nginx_dir / "chapters"
    nginx_dir.mkdir(parents=True)
    chapter_dir.mkdir()
    index_file = nginx_dir / "index.html"
    index_file.write_text("<html></html>\n")
    first_chapter = chapter_dir / "01-getting-started.html"
    second_chapter = chapter_dir / "02-installation.html"
    first_chapter.write_text("<h1>One</h1>\n")

    plan_file = temp_dir / "implementation.md"
    plan_lines = [
        "# Implementation Plan",
        "",
        "## File Changes",
        f"- `{nginx_dir}/`",
        f"- `{chapter_dir}/`",
        f"- `{index_file}`",
        f"- `{first_chapter}`",
        f"- `{second_chapter}`",
        "",
    ]
    plan_file.write_text("\n".join(plan_lines))

    # --- runtime wiring -------------------------------------------------------
    context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )
    persistent_messages: list[str] = []
    ephemeral_messages: list[str] = []
    context.queue_steering_message_callback = persistent_messages.append
    context.queue_ephemeral_steering_message_callback = ephemeral_messages.append
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))

    dod = create_definition_of_done("Create a multi-file nginx guide.")
    dod.implementation_plan = str(plan_file)
    sync_todos_to_definition_of_done(dod, todo_snapshot())
    dod.touched_files.extend([str(index_file), str(first_chapter)])

    tool_call = ToolCall(
        id="todo-only",
        name="TodoWrite",
        arguments={"todos": todo_snapshot()},
    )
    scripted_outcome = tool_outcome(
        tool_call=tool_call,
        output="Todos updated",
        is_error=False,
        metadata={"new_todos": todo_snapshot()},
    )
    executor = FakeExecutor([scripted_outcome])

    summary = TurnSummary(final_response="")
    await runner.execute_batch(
        tool_calls=[tool_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=summary,
        dod=dod,
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert persistent_messages
    resume_step = persistent_messages[-1]
    assert "Todo tracking is updated. Next step: create `02-installation.html`." in resume_step
    assert "Prefer one `write(file_path=..., content=...)` call" in resume_step
    assert "Make your next response the concrete mutation tool call itself." in resume_step
    assert ephemeral_messages == []
3887
3888
@pytest.mark.asyncio
async def test_tool_batch_runner_todowrite_after_artifacts_exist_pushes_verification_handoff(
    temp_dir: Path,
) -> None:
    """TodoWrite after all files exist steers toward verification.

    Setup: every planned file is on disk, a verification command is set, and
    the todo list holds a stale "examine the existing Fortran guide" item
    plus a "Verify all guide files…" item. After a successful ``TodoWrite``
    batch, the last steering message must announce that all artifacts exist,
    reference the verify todo and the move-to-verification instruction,
    mention "reopen reference materials", and omit the Fortran wording. The
    context's ``workflow_mode`` must be ``"execute"`` afterwards.
    """

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        # Any invocation fails the test.
        raise AssertionError("Confidence scoring should not run in this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        # Any invocation fails the test.
        raise AssertionError("Verification should not run in this scenario")

    def todo_snapshot() -> list[dict[str, str]]:
        # Builds a new list (and new dicts) on every call so the DoD sync,
        # the tool-call arguments and the outcome metadata stay independent.
        return [
            {
                "content": "First, examine the existing Fortran guide structure to understand the format and content organization",
                "active_form": "Working on: First, examine the existing Fortran guide structure to understand the format and content organization",
                "status": "pending",
            },
            {
                "content": "Verify all guide files are linked and complete",
                "active_form": "Working on: Verify all guide files are linked and complete",
                "status": "pending",
            },
        ]

    # --- on-disk fixture: every planned file is present ----------------------
    nginx_dir = temp_dir / "guides" / "nginx"
    chapter_dir = nginx_dir / "chapters"
    nginx_dir.mkdir(parents=True)
    chapter_dir.mkdir()
    index_file = nginx_dir / "index.html"
    first_chapter = chapter_dir / "01-getting-started.html"
    second_chapter = chapter_dir / "02-installation.html"
    seeded_files = (
        (index_file, "<html></html>\n"),
        (first_chapter, "<h1>One</h1>\n"),
        (second_chapter, "<h1>Two</h1>\n"),
    )
    for target, markup in seeded_files:
        target.write_text(markup)

    plan_file = temp_dir / "implementation.md"
    plan_lines = [
        "# Implementation Plan",
        "",
        "## File Changes",
        f"- `{nginx_dir}/`",
        f"- `{chapter_dir}/`",
        f"- `{index_file}`",
        f"- `{first_chapter}`",
        f"- `{second_chapter}`",
        "",
    ]
    plan_file.write_text("\n".join(plan_lines))

    # --- runtime wiring -------------------------------------------------------
    context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )
    queued_messages: list[str] = []
    context.queue_steering_message_callback = queued_messages.append
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))

    dod = create_definition_of_done("Create a multi-file nginx guide.")
    dod.implementation_plan = str(plan_file)
    dod.verification_commands = [f"ls -la {nginx_dir}"]
    sync_todos_to_definition_of_done(
        dod,
        todo_snapshot(),
        project_root=temp_dir,
    )

    tool_call = ToolCall(
        id="todo-only",
        name="TodoWrite",
        arguments={"todos": todo_snapshot()},
    )
    scripted_outcome = tool_outcome(
        tool_call=tool_call,
        output="Todos updated",
        is_error=False,
        metadata={"new_todos": todo_snapshot()},
    )
    executor = FakeExecutor([scripted_outcome])

    summary = TurnSummary(final_response="")
    await runner.execute_batch(
        tool_calls=[tool_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=summary,
        dod=dod,
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert queued_messages
    handoff = queued_messages[-1]
    expected_fragments = (
        "Todo tracking is updated. All explicitly planned artifacts now exist on disk.",
        "Verify all guide files are linked and complete",
        "Move to verification once no specific mismatch remains.",
        "reopen reference materials",
    )
    for fragment in expected_fragments:
        assert fragment in handoff
    assert "Fortran guide structure" not in handoff
    assert context.workflow_mode == "execute"
4032
4033
@pytest.mark.asyncio
async def test_tool_batch_runner_todowrite_after_outputs_exist_but_links_missing_still_handoffs_to_verify(
    temp_dir: Path,
) -> None:
    """TodoWrite with all outputs present switches the workflow to verify.

    Setup: the index and both planned chapters exist; the index also carries
    a ``../index.html`` link whose target is not created by this test. The
    single todo is an in-progress "create chapter files" item. After a
    successful ``TodoWrite`` batch, the last steering message must announce
    that all artifacts exist and that verification runs next, must not use
    the "Repair or verify the current files…" wording, and the context's
    ``workflow_mode`` must be ``"verify"``.
    """

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        # Any invocation fails the test.
        raise AssertionError("Confidence scoring should not run for this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        # Any invocation fails the test.
        raise AssertionError("Verification should not run for this scenario")

    def todo_snapshot() -> list[dict[str, str]]:
        # Builds a new list (and new dict) on every call so the DoD sync,
        # the tool-call arguments and the outcome metadata stay independent.
        return [
            {
                "content": "Create chapter files following the established pattern",
                "active_form": "Creating chapter files",
                "status": "in_progress",
            }
        ]

    # --- on-disk fixture: all planned outputs present -------------------------
    nginx_dir = temp_dir / "guides" / "nginx"
    chapter_dir = nginx_dir / "chapters"
    nginx_dir.mkdir(parents=True)
    chapter_dir.mkdir()
    index_file = nginx_dir / "index.html"
    first_chapter = chapter_dir / "01-introduction.html"
    second_chapter = chapter_dir / "02-installation.html"
    index_lines = (
        '<a href="chapters/01-introduction.html">Intro</a>',
        '<a href="chapters/02-installation.html">Install</a>',
        '<a href="../index.html">Back</a>',
        "",
    )
    index_file.write_text("\n".join(index_lines))
    for chapter in (first_chapter, second_chapter):
        chapter.write_text("<html></html>\n")

    plan_file = temp_dir / "implementation.md"
    plan_lines = [
        "# Implementation Plan",
        "",
        "## File Changes",
        f"- `{nginx_dir}/`",
        f"- `{chapter_dir}/`",
        f"- `{index_file}`",
        f"- `{first_chapter}`",
        f"- `{second_chapter}`",
        "",
    ]
    plan_file.write_text("\n".join(plan_lines))

    # --- runtime wiring -------------------------------------------------------
    context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )
    queued_messages: list[str] = []
    context.queue_steering_message_callback = queued_messages.append
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))

    dod = create_definition_of_done("Create a multi-file nginx guide.")
    dod.implementation_plan = str(plan_file)
    dod.verification_commands = [f"ls -la {nginx_dir}"]
    sync_todos_to_definition_of_done(
        dod,
        todo_snapshot(),
        project_root=temp_dir,
    )

    tool_call = ToolCall(
        id="todo-post-build",
        name="TodoWrite",
        arguments={"todos": todo_snapshot()},
    )
    scripted_outcome = tool_outcome(
        tool_call=tool_call,
        output="Todos updated",
        is_error=False,
        metadata={"new_todos": todo_snapshot()},
    )
    executor = FakeExecutor([scripted_outcome])

    summary = TurnSummary(final_response="")
    await runner.execute_batch(
        tool_calls=[tool_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=summary,
        dod=dod,
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert queued_messages
    handoff = queued_messages[-1]
    assert "Todo tracking is updated. All explicitly planned artifacts now exist on disk." in handoff
    assert "Verification should run next." in handoff
    assert "Repair or verify the current files instead of expanding the artifact set." not in handoff
    assert context.workflow_mode == "verify"
4169
4170
@pytest.mark.asyncio
async def test_tool_batch_runner_todowrite_drops_unplanned_expansion_after_outputs_exist(
    temp_dir: Path,
) -> None:
    """A TodoWrite that names an unplanned chapter must not widen the artifact set.

    Every path listed in the implementation plan already exists on disk, but
    the submitted todo list still carries a pending ``08-troubleshooting.html``
    entry. The last queued steering message is expected to announce that all
    planned artifacts exist, point at verification, and never mention the
    unplanned chapter; the context must end in ``verify`` workflow mode.
    """

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should not run for this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run for this scenario")

    def todo_entries(form_key: str) -> list[dict[str, str]]:
        # The tool-call arguments spell the key "activeForm" while the executor
        # metadata spells it "active_form"; everything else is identical.
        rows = (
            ("Create index.html for nginx guide", "Creating index.html", "in_progress"),
            (
                "Create chapter 01-introduction.html",
                "Creating chapter 01-introduction.html",
                "completed",
            ),
            (
                "Create chapter 02-installation.html",
                "Creating chapter 02-installation.html",
                "completed",
            ),
            (
                "Create chapter 08-troubleshooting.html",
                "Creating chapter 08-troubleshooting.html",
                "pending",
            ),
        )
        return [
            {"content": content, form_key: active, "status": status}
            for content, active, status in rows
        ]

    # --- on-disk layout: index plus both planned chapters already exist ---
    nginx_root = temp_dir / "guides" / "nginx"
    chapters_dir = nginx_root / "chapters"
    nginx_root.mkdir(parents=True)
    chapters_dir.mkdir()
    index_file = nginx_root / "index.html"
    intro_file = chapters_dir / "01-introduction.html"
    install_file = chapters_dir / "02-installation.html"
    index_links = (
        '<a href="chapters/01-introduction.html">Intro</a>',
        '<a href="chapters/02-installation.html">Install</a>',
        '<a href="../index.html">Back</a>',
    )
    index_file.write_text("".join(f"{link}\n" for link in index_links))
    intro_file.write_text("<html></html>\n")
    install_file.write_text("<html></html>\n")

    # --- implementation plan declaring exactly those artifacts ---
    plan_file = temp_dir / "implementation.md"
    plan_lines = [
        "# Implementation Plan",
        "",
        "## File Changes",
        f"- `{nginx_root}/`",
        f"- `{chapters_dir}/`",
        f"- `{index_file}`",
        f"- `{intro_file}`",
        f"- `{install_file}`",
    ]
    plan_file.write_text("".join(f"{line}\n" for line in plan_lines))

    runtime_ctx = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )
    steering_log: list[str] = []
    runtime_ctx.queue_steering_message_callback = steering_log.append
    batch_runner = ToolBatchRunner(runtime_ctx, DefinitionOfDoneStore(temp_dir))
    done_def = create_definition_of_done("Create a multi-file nginx guide.")
    done_def.implementation_plan = str(plan_file)
    done_def.verification_commands = [f"ls -la {nginx_root}"]

    todo_call = ToolCall(
        id="todo-post-build-expansion",
        name="TodoWrite",
        arguments={"todos": todo_entries("activeForm")},
    )
    scripted_outcome = tool_outcome(
        tool_call=todo_call,
        output="Todos updated",
        is_error=False,
        metadata={"new_todos": todo_entries("active_form")},
    )
    fake_executor = FakeExecutor([scripted_outcome])

    turn_summary = TurnSummary(final_response="")
    await batch_runner.execute_batch(
        tool_calls=[todo_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=turn_summary,
        dod=done_def,
        executor=fake_executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert steering_log
    steering = steering_log[-1]
    assert "Todo tracking is updated. All explicitly planned artifacts now exist on disk." in steering
    assert "Verification should run next." in steering
    assert "Repair or verify the current files instead of expanding the artifact set." not in steering
    assert "08-troubleshooting.html" not in steering
    assert runtime_ctx.workflow_mode == "verify"
4326
4327
@pytest.mark.asyncio
async def test_tool_batch_runner_todowrite_with_existing_output_roots_requeues_next_mutation(
    temp_dir: Path,
) -> None:
    """A TodoWrite with a pending chapter queues a concrete "create file" nudge.

    The planned output roots and ``index.html`` exist, and the index links to
    ``chapters/01-introduction.html``, which has not been written. The last
    queued steering message is expected to name that file as the next step and
    ask for a direct ``write`` mutation call.
    """

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should not run in this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run in this scenario")

    def fresh_todos() -> list[dict[str, str]]:
        # Returns new list/dict objects on every call so the DoD sync, the
        # tool-call arguments, and the executor metadata never share state.
        rows = (
            (
                "Examine the existing Fortran guide structure",
                "Examining the existing Fortran guide structure",
                "completed",
            ),
            (
                "Create the nginx directory structure",
                "Creating the nginx directory structure",
                "completed",
            ),
            (
                "Write the introduction chapter",
                "Writing the introduction chapter",
                "pending",
            ),
        )
        return [
            {"content": content, "active_form": active, "status": status}
            for content, active, status in rows
        ]

    # --- on-disk layout: roots and index exist, the linked chapter does not ---
    nginx_root = temp_dir / "guides" / "nginx"
    chapters_dir = nginx_root / "chapters"
    nginx_root.mkdir(parents=True)
    chapters_dir.mkdir()
    index_file = nginx_root / "index.html"
    index_markup = (
        "<!DOCTYPE html>",
        "<html>",
        "<body>",
        '<a href="chapters/01-introduction.html">Introduction</a>',
        "</body>",
        "</html>",
    )
    index_file.write_text("".join(f"{row}\n" for row in index_markup))

    plan_file = temp_dir / "implementation.md"
    plan_lines = [
        "# Implementation Plan",
        "",
        "## File Changes",
        f"- `{nginx_root}/`",
        f"- `{chapters_dir}/`",
        f"- `{index_file}`",
    ]
    plan_file.write_text("".join(f"{line}\n" for line in plan_lines))

    runtime_ctx = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )
    steering_log: list[str] = []
    runtime_ctx.queue_steering_message_callback = steering_log.append
    batch_runner = ToolBatchRunner(runtime_ctx, DefinitionOfDoneStore(temp_dir))
    done_def = create_definition_of_done("Create a multi-file nginx guide.")
    done_def.implementation_plan = str(plan_file)
    done_def.touched_files.append(str(index_file))
    sync_todos_to_definition_of_done(
        done_def,
        fresh_todos(),
        project_root=temp_dir,
    )

    todo_call = ToolCall(
        id="todo-next-mutation",
        name="TodoWrite",
        arguments={"todos": fresh_todos()},
    )
    scripted_outcome = tool_outcome(
        tool_call=todo_call,
        output="Todos updated",
        is_error=False,
        metadata={"new_todos": fresh_todos()},
    )
    fake_executor = FakeExecutor([scripted_outcome])

    turn_summary = TurnSummary(final_response="")
    await batch_runner.execute_batch(
        tool_calls=[todo_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=turn_summary,
        dod=done_def,
        executor=fake_executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert steering_log
    steering = steering_log[-1]
    assert "Todo tracking is updated. Next step: create `01-introduction.html`." in steering
    assert "Prefer one `write(file_path=..., content=...)` call" in steering
    assert "Make your next response the concrete mutation tool call itself." in steering
4489
4490
@pytest.mark.asyncio
async def test_tool_batch_runner_todowrite_prefers_pending_index_over_empty_output_directory(
    temp_dir: Path,
) -> None:
    """The missing ``index.html`` is named before any chapter file.

    Only the (empty) ``chapters`` directory exists; the plan also declares
    ``index.html``, which is absent. With both an index todo and a first-chapter
    todo pending, the last queued steering message is expected to name
    ``index.html`` with its resolved path and not mention
    ``01-introduction.html``.
    """

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should not run in this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run in this scenario")

    def fresh_todos() -> list[dict[str, str]]:
        rows = (
            (
                "Examine the existing Fortran guide structure to understand the format and depth",
                "Examining the existing Fortran guide structure",
                "completed",
            ),
            (
                "Create the new nginx guide directory structure",
                "Creating the new nginx guide directory structure",
                "completed",
            ),
            (
                "Create a new index.html for the nginx guide",
                "Creating a new index.html for the nginx guide",
                "pending",
            ),
            (
                "Create the first chapter for the nginx guide",
                "Creating the first chapter for the nginx guide",
                "pending",
            ),
        )
        return [
            {"content": content, "active_form": active, "status": status}
            for content, active, status in rows
        ]

    # --- on-disk layout: only the chapters directory exists ---
    nginx_root = temp_dir / "Loader" / "guides" / "nginx"
    chapters_dir = nginx_root / "chapters"
    chapters_dir.mkdir(parents=True)
    index_file = nginx_root / "index.html"

    plan_file = temp_dir / "implementation.md"
    plan_lines = [
        "# Implementation Plan",
        "",
        "## File Changes",
        f"- `{chapters_dir}/`",
        f"- `{index_file}`",
    ]
    plan_file.write_text("".join(f"{line}\n" for line in plan_lines))

    done_def = create_definition_of_done("Create a multi-file nginx guide.")
    done_def.implementation_plan = str(plan_file)
    sync_todos_to_definition_of_done(
        done_def,
        fresh_todos(),
        project_root=temp_dir,
    )

    steering_log: list[str] = []
    runtime_ctx = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )
    runtime_ctx.queue_steering_message_callback = steering_log.append
    batch_runner = ToolBatchRunner(runtime_ctx, DefinitionOfDoneStore(temp_dir))

    # One list object is deliberately shared by the call arguments and the
    # executor metadata.
    submitted_todos = fresh_todos()
    todo_call = ToolCall(
        id="todo-index-before-chapter",
        name="TodoWrite",
        arguments={"todos": submitted_todos},
    )
    scripted_outcome = tool_outcome(
        tool_call=todo_call,
        output="Todos updated",
        is_error=False,
        metadata={"new_todos": submitted_todos},
    )
    fake_executor = FakeExecutor([scripted_outcome])

    turn_summary = TurnSummary(final_response="")
    await batch_runner.execute_batch(
        tool_calls=[todo_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=turn_summary,
        dod=done_def,
        executor=fake_executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert steering_log
    steering = steering_log[-1]
    assert "Todo tracking is updated. Next step: create `index.html`." in steering
    assert f"Prefer one `write(file_path=..., content=...)` call for `{index_file.resolve(strict=False)}`" in steering
    assert "01-introduction.html" not in steering
4627
4628
@pytest.mark.asyncio
async def test_tool_batch_runner_todowrite_with_declared_child_targets_names_next_missing_file(
    temp_dir: Path,
) -> None:
    """Links inside the existing index determine which file is named next.

    ``index.html`` links to ``chapters/introduction.html`` and
    ``chapters/installation.html``; neither exists. With "Write the
    introduction chapter" pending, the last queued steering message is
    expected to name ``introduction.html`` and request a direct ``write`` call.
    """

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should not run in this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run in this scenario")

    pending_content = "Write the introduction chapter"
    pending_active = "Writing the introduction chapter"

    # --- on-disk layout: index exists, both linked chapters are missing ---
    nginx_root = temp_dir / "guides" / "nginx"
    chapters_dir = nginx_root / "chapters"
    nginx_root.mkdir(parents=True)
    chapters_dir.mkdir()
    index_file = nginx_root / "index.html"
    index_markup = (
        "<html>",
        '<a href="chapters/introduction.html">Introduction</a>',
        '<a href="chapters/installation.html">Installation</a>',
        "</html>",
    )
    index_file.write_text("".join(f"{row}\n" for row in index_markup))

    plan_file = temp_dir / "implementation.md"
    plan_lines = [
        "# Implementation Plan",
        "",
        "## File Changes",
        f"- `{nginx_root}/`",
        f"- `{chapters_dir}/`",
        f"- `{index_file}`",
    ]
    plan_file.write_text("".join(f"{line}\n" for line in plan_lines))

    done_def = create_definition_of_done("Create a multi-file nginx guide.")
    done_def.implementation_plan = str(plan_file)
    done_def.pending_items = [
        pending_content,
        "Complete the requested work",
    ]
    done_def.touched_files.append(str(index_file))

    steering_log: list[str] = []
    runtime_ctx = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )
    runtime_ctx.queue_steering_message_callback = steering_log.append
    batch_runner = ToolBatchRunner(runtime_ctx, DefinitionOfDoneStore(temp_dir))

    # Arguments use the camelCase key, metadata the snake_case one.
    todo_call = ToolCall(
        id="todo-1",
        name="TodoWrite",
        arguments={
            "todos": [
                {
                    "content": pending_content,
                    "activeForm": pending_active,
                    "status": "pending",
                }
            ]
        },
    )
    scripted_outcome = tool_outcome(
        tool_call=todo_call,
        output="Todos updated",
        is_error=False,
        metadata={
            "new_todos": [
                {
                    "content": pending_content,
                    "active_form": pending_active,
                    "status": "pending",
                }
            ]
        },
    )
    fake_executor = FakeExecutor([scripted_outcome])

    turn_summary = TurnSummary(final_response="")
    await batch_runner.execute_batch(
        tool_calls=[todo_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=turn_summary,
        dod=done_def,
        executor=fake_executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert steering_log
    steering = steering_log[-1]
    assert "Todo tracking is updated. Next step: create `introduction.html`." in steering
    assert "Prefer one `write(file_path=..., content=...)` call" in steering
    assert "Make your next response the concrete mutation tool call itself." in steering
4752
4753
@pytest.mark.asyncio
async def test_tool_batch_runner_todowrite_names_concrete_pending_file_after_artifacts_exist(
    temp_dir: Path,
) -> None:
    """A chapter-titled todo is mapped onto the concrete missing file name.

    The index and ``01-introduction.html`` exist; the index also links to
    ``chapters/02-installation.html``, which does not. With "Creating Chapter
    2: Installation and Setup" pending, the last queued steering message is
    expected to name ``02-installation.html`` as the next file to create.
    """

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should not run in this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run in this scenario")

    # The todo uses the same text for its content and its active form.
    pending_label = "Creating Chapter 2: Installation and Setup"

    # --- on-disk layout: index and chapter one exist, chapter two is missing ---
    nginx_root = temp_dir / "guides" / "nginx"
    chapters_dir = nginx_root / "chapters"
    nginx_root.mkdir(parents=True)
    chapters_dir.mkdir()
    index_file = nginx_root / "index.html"
    intro_file = chapters_dir / "01-introduction.html"
    index_markup = (
        "<html>",
        '<a href="chapters/01-introduction.html">Chapter 1: Introduction to NGINX Tool</a>',
        '<a href="chapters/02-installation.html">Chapter 2: Installation and Setup</a>',
        "</html>",
    )
    index_file.write_text("".join(f"{row}\n" for row in index_markup))
    intro_file.write_text("<html></html>\n")

    plan_file = temp_dir / "implementation.md"
    plan_lines = [
        "# Implementation Plan",
        "",
        "## File Changes",
        f"- `{nginx_root}/`",
        f"- `{chapters_dir}/`",
        f"- `{index_file}`",
    ]
    plan_file.write_text("".join(f"{line}\n" for line in plan_lines))

    done_def = create_definition_of_done("Create a multi-file nginx guide.")
    done_def.implementation_plan = str(plan_file)
    done_def.pending_items = [
        pending_label,
        "Complete the requested work",
    ]
    done_def.touched_files.extend([str(index_file), str(intro_file)])

    steering_log: list[str] = []
    runtime_ctx = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )
    runtime_ctx.queue_steering_message_callback = steering_log.append
    batch_runner = ToolBatchRunner(runtime_ctx, DefinitionOfDoneStore(temp_dir))

    todo_call = ToolCall(
        id="todo-1",
        name="TodoWrite",
        arguments={
            "todos": [
                {
                    "content": pending_label,
                    "activeForm": pending_label,
                    "status": "pending",
                }
            ]
        },
    )
    scripted_outcome = tool_outcome(
        tool_call=todo_call,
        output="Todos updated",
        is_error=False,
        metadata={
            "new_todos": [
                {
                    "content": pending_label,
                    "active_form": pending_label,
                    "status": "pending",
                }
            ]
        },
    )
    fake_executor = FakeExecutor([scripted_outcome])

    turn_summary = TurnSummary(final_response="")
    await batch_runner.execute_batch(
        tool_calls=[todo_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=turn_summary,
        dod=done_def,
        executor=fake_executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert steering_log
    steering = steering_log[-1]
    assert "Todo tracking is updated. Next step: create `02-installation.html`." in steering
    assert "Prefer one `write(file_path=..., content=...)` call" in steering
    assert "Make your next response the concrete mutation tool call itself" in steering
4879
4880
@pytest.mark.asyncio
async def test_tool_batch_runner_todowrite_uses_observed_sibling_pattern_for_next_file(
    temp_dir: Path,
) -> None:
    """A previously read reference chapter supplies the next file name.

    The session history holds a ``read`` tool call for
    ``fortran/chapters/01-introduction.html``. The nginx index exists but
    contains no links, and the chapters directory is empty. With "Write the
    introduction chapter" pending, the last queued steering message is
    expected to name ``01-introduction.html``.
    """

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should not run in this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run in this scenario")

    pending_content = "Write the introduction chapter"
    pending_active = "Writing the introduction chapter"

    # --- reference guide that the assistant "read" earlier in the session ---
    reference_dir = temp_dir / "fortran" / "chapters"
    reference_dir.mkdir(parents=True)
    reference_chapter = reference_dir / "01-introduction.html"
    reference_chapter.write_text("<h1>Introduction</h1>\n")

    # --- target guide: index without links, empty chapters directory ---
    nginx_root = temp_dir / "guides" / "nginx"
    chapters_dir = nginx_root / "chapters"
    nginx_root.mkdir(parents=True)
    chapters_dir.mkdir()
    index_file = nginx_root / "index.html"
    index_file.write_text("<html></html>\n")

    plan_file = temp_dir / "implementation.md"
    plan_lines = [
        "# Implementation Plan",
        "",
        "## File Changes",
        f"- `{nginx_root}/`",
        f"- `{chapters_dir}/`",
        f"- `{index_file}`",
    ]
    plan_file.write_text("".join(f"{line}\n" for line in plan_lines))

    done_def = create_definition_of_done("Create a multi-file nginx guide.")
    done_def.implementation_plan = str(plan_file)
    done_def.pending_items = [
        pending_content,
        "Complete the requested work",
    ]
    done_def.touched_files.append(str(index_file))

    steering_log: list[str] = []
    prior_read = Message(
        role=Role.ASSISTANT,
        content="",
        tool_calls=[
            ToolCall(
                id="read-ref-1",
                name="read",
                arguments={"file_path": str(reference_chapter)},
            )
        ],
    )
    runtime_ctx = build_context(
        temp_dir=temp_dir,
        messages=[prior_read],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )
    runtime_ctx.queue_steering_message_callback = steering_log.append
    batch_runner = ToolBatchRunner(runtime_ctx, DefinitionOfDoneStore(temp_dir))

    todo_call = ToolCall(
        id="todo-observed-1",
        name="TodoWrite",
        arguments={
            "todos": [
                {
                    "content": pending_content,
                    "activeForm": pending_active,
                    "status": "pending",
                }
            ]
        },
    )
    scripted_outcome = tool_outcome(
        tool_call=todo_call,
        output="Todos updated",
        is_error=False,
        metadata={
            "new_todos": [
                {
                    "content": pending_content,
                    "active_form": pending_active,
                    "status": "pending",
                }
            ]
        },
    )
    fake_executor = FakeExecutor([scripted_outcome])

    turn_summary = TurnSummary(final_response="")
    await batch_runner.execute_batch(
        tool_calls=[todo_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=turn_summary,
        dod=done_def,
        executor=fake_executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert steering_log
    steering = steering_log[-1]
    assert "Todo tracking is updated. Next step: create `01-introduction.html`." in steering
    assert "Prefer one `write(file_path=..., content=...)` call" in steering
5009
5010
@pytest.mark.asyncio
async def test_tool_batch_runner_bookkeeping_note_with_missing_artifact_requeues_resume_step(
    temp_dir: Path,
) -> None:
    """A working note written while a planned file is missing triggers a resume nudge.

    The plan declares two chapters, but only ``01-getting-started.html`` has
    been written. After a ``notepad_write_working`` call, the last queued
    steering message is expected to say an artifact is still missing and to
    direct the model to create ``02-installation.html`` next.
    """

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should not run in this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run in this scenario")

    # --- on-disk layout: index and chapter one exist, chapter two is missing ---
    nginx_root = temp_dir / "guides" / "nginx"
    chapters_dir = nginx_root / "chapters"
    nginx_root.mkdir(parents=True)
    chapters_dir.mkdir()
    index_file = nginx_root / "index.html"
    started_file = chapters_dir / "01-getting-started.html"
    install_file = chapters_dir / "02-installation.html"
    index_file.write_text("<html></html>\n")
    started_file.write_text("<h1>One</h1>\n")

    plan_file = temp_dir / "implementation.md"
    plan_lines = [
        "# Implementation Plan",
        "",
        "## File Changes",
        f"- `{nginx_root}/`",
        f"- `{chapters_dir}/`",
        f"- `{index_file}`",
        f"- `{started_file}`",
        f"- `{install_file}`",
    ]
    plan_file.write_text("".join(f"{line}\n" for line in plan_lines))

    runtime_ctx = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )
    steering_log: list[str] = []
    runtime_ctx.queue_steering_message_callback = steering_log.append
    batch_runner = ToolBatchRunner(runtime_ctx, DefinitionOfDoneStore(temp_dir))
    done_def = create_definition_of_done("Create a multi-file nginx guide.")
    done_def.implementation_plan = str(plan_file)
    tracked_todos = [
        {
            "content": f"Create {file_name}",
            "active_form": f"Creating {file_name}",
            "status": status,
        }
        for file_name, status in (
            ("01-getting-started.html", "completed"),
            ("02-installation.html", "pending"),
        )
    ]
    sync_todos_to_definition_of_done(
        done_def,
        tracked_todos,
        project_root=temp_dir,
    )
    done_def.touched_files.extend([str(index_file), str(started_file)])

    note_call = ToolCall(
        id="working-note",
        name="notepad_write_working",
        arguments={"content": "Creating the second chapter file: Installation"},
    )
    scripted_outcome = tool_outcome(
        tool_call=note_call,
        output="Working note recorded",
        is_error=False,
    )
    fake_executor = FakeExecutor([scripted_outcome])

    turn_summary = TurnSummary(final_response="")
    await batch_runner.execute_batch(
        tool_calls=[note_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=turn_summary,
        dod=done_def,
        executor=fake_executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert steering_log
    steering = steering_log[-1]
    assert "Bookkeeping note is recorded. A declared output artifact is still missing." in steering
    assert "Resume by creating `02-installation.html` now." in steering
    assert "Make your next response the concrete mutation tool call itself" in steering
    assert "refresh `TodoWrite`" in steering
    assert "Do not spend the next turn on additional notes, rediscovery, verification, or final confirmation" in steering
5125
5126
@pytest.mark.asyncio
async def test_tool_batch_runner_working_note_respects_discovery_first_pending_step(
    temp_dir: Path,
) -> None:
    """A working note must not skip a pending discovery step.

    Nothing from the plan exists on disk yet, and the first pending item is an
    "examine the existing fortran guide" step. After a
    ``notepad_write_working`` call, the last queued steering message is
    expected to continue with that pending item via an evidence-gathering tool
    call rather than telling the model to create ``index.html``.
    """

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run in this scenario")

    # The plan only names these paths; neither is created on disk here.
    planned_root = temp_dir / "guides" / "nginx"
    planned_index = planned_root / "index.html"
    planned_chapters = planned_root / "chapters"

    plan_file = temp_dir / "implementation.md"
    plan_lines = [
        "# Implementation Plan",
        "",
        "## File Changes",
        f"- `{planned_index}`",
        f"- `{planned_chapters}`",
    ]
    plan_file.write_text("".join(f"{line}\n" for line in plan_lines))

    runtime_ctx = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )
    steering_log: list[str] = []
    runtime_ctx.queue_steering_message_callback = steering_log.append
    batch_runner = ToolBatchRunner(runtime_ctx, DefinitionOfDoneStore(temp_dir))
    done_def = create_definition_of_done("Create a multi-file nginx guide.")
    done_def.implementation_plan = str(plan_file)
    done_def.pending_items.extend(
        [
            "First, examine the existing fortran guide structure and content to understand the format",
            "Create the nginx directory structure",
            "Develop the main index.html file for the nginx guide",
        ]
    )

    note_call = ToolCall(
        id="working-note",
        name="notepad_write_working",
        arguments={"content": "Analyzing the fortran guide structure before creating nginx guide"},
    )
    scripted_outcome = tool_outcome(
        tool_call=note_call,
        output="Working note recorded",
        is_error=False,
    )
    fake_executor = FakeExecutor([scripted_outcome])

    turn_summary = TurnSummary(final_response="")
    await batch_runner.execute_batch(
        tool_calls=[note_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=turn_summary,
        dod=done_def,
        executor=fake_executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert steering_log
    steering = steering_log[-1]
    assert (
        "Continue with the next pending item: `First, examine the existing fortran guide structure and content to understand the format`."
        in steering
    )
    assert "one concrete evidence-gathering tool call" in steering
    assert "Resume by creating `index.html` now." not in steering
5219
5220
@pytest.mark.asyncio
async def test_tool_batch_runner_working_note_prefers_declared_output_gap_over_stale_discovery(
    temp_dir: Path,
) -> None:
    """A working note must steer toward the missing chapter, not a stale discovery item.

    The index on disk links three chapters but only the first one exists, and
    the DoD still lists an old "examine the fortran guide" item as pending.
    After a successful ``notepad_write_working`` call, the last queued steering
    message is expected to name ``02-installation.html`` and not the stale
    pending item.
    """

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run in this scenario")

    # Partially built guide: index plus the first chapter only.
    nginx_root = temp_dir / "guides" / "nginx"
    chapter_root = nginx_root / "chapters"
    chapter_root.mkdir(parents=True)
    index_file = nginx_root / "index.html"
    intro_chapter = chapter_root / "01-introduction.html"
    toc_links = [
        '<a href="chapters/01-introduction.html">Introduction</a>',
        '<a href="chapters/02-installation.html">Installation</a>',
        '<a href="chapters/03-configuration.html">Configuration</a>',
    ]
    index_file.write_text("\n".join(toc_links))
    intro_chapter.write_text("<h1>Introduction</h1>\n")

    # Implementation plan declaring the index file and the chapters directory.
    plan_file = temp_dir / "implementation.md"
    plan_lines = [
        "# Implementation Plan",
        "",
        "## File Changes",
        f"- `{index_file}`",
        f"- `{chapter_root}/`",
        "",
    ]
    plan_file.write_text("\n".join(plan_lines))

    context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )
    steering: list[str] = []
    context.queue_steering_message_callback = steering.append
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))

    dod = create_definition_of_done("Create a multi-file nginx guide.")
    dod.implementation_plan = str(plan_file)
    stale_and_current_items = [
        "First, examine the existing fortran guide structure and content to understand the format",
        "Create chapter files following the established pattern",
    ]
    dod.pending_items.extend(stale_and_current_items)
    dod.touched_files.extend([str(index_file), str(intro_chapter)])

    note_call = ToolCall(
        id="working-note",
        name="notepad_write_working",
        arguments={"content": "Created index and first chapter; next is chapter 2"},
    )
    note_outcome = tool_outcome(
        tool_call=note_call,
        output="Working note recorded",
        is_error=False,
    )
    executor = FakeExecutor([note_outcome])

    summary = TurnSummary(final_response="")
    await runner.execute_batch(
        tool_calls=[note_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=summary,
        dod=dod,
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert steering
    latest = steering[-1]
    assert "Bookkeeping note is recorded. A declared output artifact is still missing." in latest
    assert "Resume by creating `02-installation.html` now." in latest
    assert "Continue with the next pending item: `First, examine the existing fortran guide structure" not in latest
5326
5327
@pytest.mark.asyncio
async def test_tool_batch_runner_shallow_glob_does_not_handoff_before_content_read(
    temp_dir: Path,
) -> None:
    """A glob that only lists directories must not queue any steering message.

    The reference guide directory exists but contains no files, and the glob
    result lists just the two directories. The test expects the steering queue
    to stay empty after the batch runs.
    """

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run in this scenario")

    guides_root = temp_dir / "Loader" / "guides"
    reference_root = guides_root / "fortran"
    reference_chapters = reference_root / "chapters"
    reference_chapters.mkdir(parents=True)

    # The plan targets a sibling nginx guide that does not exist yet.
    planned_root = guides_root / "nginx"
    plan_file = temp_dir / "implementation.md"
    plan_lines = [
        "# Implementation Plan",
        "",
        "## File Changes",
        f"- `{planned_root / 'index.html'}`",
        f"- `{planned_root / 'chapters'}`",
        "",
    ]
    plan_file.write_text("\n".join(plan_lines))

    context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )
    steering: list[str] = []
    context.queue_steering_message_callback = steering.append
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))

    dod = create_definition_of_done("Create a multi-file nginx guide.")
    dod.implementation_plan = str(plan_file)
    dod.pending_items.extend(
        [
            "First, examine the existing fortran guide structure and content",
            "Create the nginx directory structure",
            "Develop the main index.html file for nginx guide",
        ]
    )

    glob_call = ToolCall(
        id="glob-1",
        name="glob",
        arguments={"pattern": "**", "path": str(reference_root)},
    )
    glob_outcome = tool_outcome(
        tool_call=glob_call,
        output=f"{reference_root}\n{reference_chapters}",
        is_error=False,
    )
    executor = FakeExecutor([glob_outcome])

    summary = TurnSummary(final_response="")
    await runner.execute_batch(
        tool_calls=[glob_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=summary,
        dod=dod,
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert steering == []
5417
5418
@pytest.mark.asyncio
async def test_tool_batch_runner_hands_off_noop_toc_edit_when_file_is_already_valid(
    temp_dir: Path,
) -> None:
    """A blocked no-op edit on an already-correct table of contents queues no nudge.

    ``index.html`` already links to the two chapter files that exist on disk.
    The ``edit`` call uses the same text for ``old_string`` and ``new_string``
    and its outcome is reported as BLOCKED; the test expects the steering queue
    to remain empty.
    """

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should not run in this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run in this scenario")

    task_prompt = (
        "Have a look at ~/Loader/guides/fortran/index.html, then "
        "~/Loader/guides/fortran/chapters. The table of contents links in "
        "index.html are inaccurate and the href’s are wrong. Let’s update the "
        "links and their link texts to be correct."
    )

    chapter_root = temp_dir / "chapters"
    chapter_root.mkdir()
    intro_file = chapter_root / "01-introduction.html"
    intro_file.write_text("<h1>Chapter 1: Introduction to Fortran</h1>\n")
    setup_file = chapter_root / "02-setup.html"
    setup_file.write_text("<h1>Chapter 2: Setting Up Your Environment</h1>\n")

    # The TOC markup doubles as file content and as both sides of the edit.
    toc_block = (
        "<h2>Table of Contents</h2>\n"
        ' <ul class="chapter-list">\n'
        ' <li><a href="chapters/01-introduction.html">Chapter 1: Introduction to Fortran</a></li>\n'
        ' <li><a href="chapters/02-setup.html">Chapter 2: Setting Up Your Environment</a></li>\n'
        " </ul>\n"
    )
    index_file = temp_dir / "index.html"
    index_file.write_text(toc_block)

    context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )
    context.session.current_task = task_prompt  # type: ignore[attr-defined]
    steering: list[str] = []
    context.queue_steering_message_callback = steering.append
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))

    edit_call = ToolCall(
        id="edit-1",
        name="edit",
        arguments={
            "file_path": str(index_file),
            "old_string": toc_block,
            "new_string": toc_block,
        },
    )
    blocked_outcome = tool_outcome(
        tool_call=edit_call,
        output=(
            "[Blocked - old_string and new_string are identical - no change "
            "would occur] Suggestion: Provide different old and new strings"
        ),
        is_error=True,
        state=ToolExecutionState.BLOCKED,
    )
    executor = FakeExecutor([blocked_outcome])

    await runner.execute_batch(
        tool_calls=[edit_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=TurnSummary(final_response=""),
        dod=create_definition_of_done(task_prompt),
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert steering == []
5512
5513
def test_tool_batch_runner_blocked_noop_edit_nudge_stays_on_active_repair_target(
    temp_dir: Path,
) -> None:
    """The blocked no-op edit nudge keeps pointing at the active repair target.

    The session holds an assistant "Repair focus" message naming a chapter
    file. Calling ``_queue_blocked_html_edit_nudge`` for an identical-string
    edit on that file is expected to queue a message that mentions the target
    and tells the model to replace the surrounding block.
    """

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run in this scenario")

    chapter_root = temp_dir / "guide" / "chapters"
    repair_file = chapter_root / "04-basic-usage.html"
    missing_file = chapter_root / "05-advanced-topics.html"
    repair_focus = Message(
        role=Role.ASSISTANT,
        content=(
            "Repair focus:\n"
            f"- Fix the broken local reference `05-advanced-topics.html` in `{repair_file}`.\n"
            f"- Immediate next step: edit `{repair_file}`.\n"
            f"- If the broken reference should remain, create `{missing_file}`; otherwise remove or replace `05-advanced-topics.html`.\n"
        ),
    )
    context = build_context(
        temp_dir=temp_dir,
        messages=[repair_focus],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
    )
    nudges: list[str] = []
    context.queue_steering_message_callback = nudges.append
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))
    dod = create_definition_of_done("Repair a guide page.")

    noop_edit = ToolCall(
        id="edit-1",
        name="edit",
        arguments={
            "file_path": str(repair_file),
            "old_string": "same",
            "new_string": "same",
        },
    )
    blocked_text = "[Blocked - old_string and new_string are identical - no change would occur] Suggestion: Provide different old and new strings"
    runner._queue_blocked_html_edit_nudge(noop_edit, blocked_text, dod=dod)

    assert nudges
    first_nudge = nudges[0]
    assert str(repair_file) in first_nudge
    assert "no on-disk change" in first_nudge
    assert "replace the surrounding block" in first_nudge
    assert "Do not reopen unrelated reference materials" in first_nudge
5574
5575
def test_tool_batch_runner_blocked_noop_edit_after_full_build_prefers_verification(
    temp_dir: Path,
) -> None:
    """Once every planned file exists, a blocked no-op edit nudges toward verification.

    Both files named in the implementation plan are on disk and recorded as
    touched. The queued message is expected to say all planned artifacts exist
    and to omit the "replace the surrounding block" advice.
    """

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run in this scenario")

    guide_dir = temp_dir / "guide"
    chapter_dir = guide_dir / "chapters"
    chapter_dir.mkdir(parents=True)
    index_file = guide_dir / "index.html"
    first_chapter = chapter_dir / "01-introduction.html"
    index_file.write_text("<html></html>\n")
    first_chapter.write_text("<html></html>\n")

    plan_file = temp_dir / "implementation.md"
    plan_lines = [
        "# Implementation Plan",
        "",
        "## File Changes",
        f"- `{index_file}`",
        f"- `{first_chapter}`",
        "",
    ]
    plan_file.write_text("\n".join(plan_lines))

    repair_focus = Message(
        role=Role.ASSISTANT,
        content=(
            "Repair focus:\n"
            f"- Confirm the final guide state in `{index_file}`.\n"
            f"- Immediate next step: verify `{index_file}` if no concrete mismatch remains.\n"
        ),
    )
    context = build_context(
        temp_dir=temp_dir,
        messages=[repair_focus],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
    )
    nudges: list[str] = []
    context.queue_steering_message_callback = nudges.append
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))

    dod = create_definition_of_done("Create a multi-file guide.")
    dod.implementation_plan = str(plan_file)
    dod.touched_files.extend([str(index_file), str(first_chapter)])
    dod.verification_commands = [f"ls -la {guide_dir}"]

    noop_edit = ToolCall(
        id="edit-1",
        name="edit",
        arguments={
            "file_path": str(index_file),
            "old_string": "same",
            "new_string": "same",
        },
    )
    blocked_text = "[Blocked - old_string and new_string are identical - no change would occur] Suggestion: Provide different old and new strings"
    runner._queue_blocked_html_edit_nudge(noop_edit, blocked_text, dod=dod)

    assert nudges
    first_nudge = nudges[0]
    assert "All explicitly planned artifacts already exist." in first_nudge
    assert "Move to verification or final confirmation using the files already on disk." in first_nudge
    assert "replace the surrounding block" not in first_nudge
5659
5660
async def _noop_emit(event: AgentEvent) -> None:
    """Accept an event and do nothing with it; always resolves to ``None``."""
5663
5664
@pytest.mark.asyncio
async def test_tool_batch_runner_marks_verification_planned_after_new_mutation(
    temp_dir: Path,
) -> None:
    """A successful ``write`` on a fresh DoD should plan verification attempt 1.

    After the batch, the DoD is expected to report ``"planned"``, carry
    verification commands, list "Collect verification evidence" as pending,
    and the last timeline entry is expected to describe the planned attempt.
    """

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run for this scenario")

    context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
    )
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))

    write_call = ToolCall(
        id="write-1",
        name="write",
        arguments={"file_path": str(temp_dir / "README.md"), "content": "updated\n"},
    )
    write_outcome = tool_outcome(tool_call=write_call, output="wrote file", is_error=False)
    executor = FakeExecutor([write_outcome])
    summary = TurnSummary(final_response="")
    dod = create_definition_of_done("Update README and verify it still works.")
    captured: list[AgentEvent] = []

    async def emit(event: AgentEvent) -> None:
        captured.append(event)

    await runner.execute_batch(
        tool_calls=[write_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=emit,
        summary=summary,
        dod=dod,
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    # State recorded on the definition of done.
    assert dod.last_verification_result == "planned"
    assert dod.verification_commands
    assert "Collect verification evidence" in dod.pending_items
    assert dod.active_verification_attempt_id == "verification-attempt-1"
    assert dod.active_verification_attempt_number == 1

    # Shape of the most recent workflow timeline entry.
    assert summary.workflow_timeline[-1].reason_code == "verification_planned"
    assert summary.workflow_timeline[-1].policy_outcome == "planned"
    assert summary.workflow_timeline[-1].verification_observations[0].status == "planned"
    assert (
        summary.workflow_timeline[-1].verification_observations[0].attempt_id
        == "verification-attempt-1"
    )
    assert summary.workflow_timeline[-1].verification_observations[0].attempt_number == 1
5736
5737
@pytest.mark.asyncio
async def test_tool_batch_runner_does_not_mark_verification_planned_after_setup_only_mkdir(
    temp_dir: Path,
) -> None:
    """A ``mkdir -p`` bash call alone must not plan verification.

    The plan lists a chapters directory and an index file, and the only tool
    call creates the directory. The test expects no verification result, no
    "Collect verification evidence" item and no ``verification_planned``
    timeline entry.
    """

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run in this scenario")

    context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
    )
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))

    guide_dir = temp_dir / "Loader" / "guides" / "nginx"
    chapter_dir = guide_dir / "chapters"
    plan_file = temp_dir / "implementation.md"
    plan_lines = [
        "# Implementation Plan",
        "",
        "## File Changes",
        f"- `{chapter_dir}/`",
        f"- `{guide_dir / 'index.html'}`",
        "",
    ]
    plan_file.write_text("\n".join(plan_lines))

    mkdir_call = ToolCall(
        id="mkdir-1",
        name="bash",
        arguments={"command": f"mkdir -p {chapter_dir}"},
    )
    mkdir_outcome = tool_outcome(tool_call=mkdir_call, output="", is_error=False)
    executor = FakeExecutor([mkdir_outcome])
    summary = TurnSummary(final_response="")
    dod = create_definition_of_done("Create an equally thorough nginx guide with chapters.")
    dod.implementation_plan = str(plan_file)
    captured: list[AgentEvent] = []

    async def emit(event: AgentEvent) -> None:
        captured.append(event)

    await runner.execute_batch(
        tool_calls=[mkdir_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=emit,
        summary=summary,
        dod=dod,
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert dod.last_verification_result is None
    assert "Collect verification evidence" not in dod.pending_items
    assert not any(
        entry.reason_code == "verification_planned" for entry in summary.workflow_timeline
    )
5816
5817
@pytest.mark.asyncio
async def test_tool_batch_runner_does_not_mark_verification_planned_while_chapter_build_pending(
    temp_dir: Path,
) -> None:
    """Writing the index while a chapter item is still pending must not plan verification.

    The DoD still lists "Create first nginx chapter" as pending when the index
    file is written. The test expects that item to remain pending and no
    verification planning to be recorded.
    """

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run in this scenario")

    context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
    )
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))

    guide_dir = temp_dir / "Loader" / "guides" / "nginx"
    chapter_dir = guide_dir / "chapters"
    chapter_dir.mkdir(parents=True)
    index_file = guide_dir / "index.html"
    plan_file = temp_dir / "implementation.md"
    plan_lines = [
        "# Implementation Plan",
        "",
        "## File Changes",
        f"- `{guide_dir}/`",
        f"- `{chapter_dir}/`",
        f"- `{index_file}`",
        "",
    ]
    plan_file.write_text("\n".join(plan_lines))

    write_call = ToolCall(
        id="write-index",
        name="write",
        arguments={"file_path": str(index_file), "content": "<html></html>\n"},
    )
    write_outcome = tool_outcome(tool_call=write_call, output="wrote file", is_error=False)
    executor = FakeExecutor([write_outcome])
    summary = TurnSummary(final_response="")
    dod = create_definition_of_done("Create a multi-file nginx guide.")
    dod.implementation_plan = str(plan_file)
    dod.pending_items.extend(
        [
            "Develop the main index.html file with proper structure",
            "Create first nginx chapter",
        ]
    )
    captured: list[AgentEvent] = []

    async def emit(event: AgentEvent) -> None:
        captured.append(event)

    await runner.execute_batch(
        tool_calls=[write_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=emit,
        summary=summary,
        dod=dod,
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert dod.last_verification_result is None
    assert "Collect verification evidence" not in dod.pending_items
    assert "Create first nginx chapter" in dod.pending_items
    assert not any(
        entry.reason_code == "verification_planned" for entry in summary.workflow_timeline
    )
5906
5907
@pytest.mark.asyncio
async def test_tool_batch_runner_marks_passed_verification_stale_after_new_mutation(
    temp_dir: Path,
) -> None:
    """A new ``write`` after a passed verification should mark that verification stale.

    The DoD starts with a passed attempt 1 and one piece of evidence. After the
    batch, the test expects the result to be ``"stale"``, the evidence list to
    be empty, the evidence item to be pending again, attempt 2 to be active,
    and the last timeline entry to describe the stale attempt.
    """

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run for this scenario")

    context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
    )
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))

    write_call = ToolCall(
        id="write-1",
        name="write",
        arguments={"file_path": str(temp_dir / "README.md"), "content": "updated\n"},
    )
    write_outcome = tool_outcome(tool_call=write_call, output="wrote file", is_error=False)
    executor = FakeExecutor([write_outcome])
    summary = TurnSummary(final_response="")

    # Seed the DoD as if verification attempt 1 had already passed.
    dod = create_definition_of_done("Update README and verify it still works.")
    dod.verification_commands = ["uv run pytest -q"]
    dod.last_verification_result = "passed"
    dod.verification_attempt_counter = 1
    dod.active_verification_attempt_id = "verification-attempt-1"
    dod.active_verification_attempt_number = 1
    passed_evidence = VerificationEvidence(
        command="uv run pytest -q",
        passed=True,
        stdout="401 passed",
        kind="test",
    )
    dod.evidence = [passed_evidence]
    dod.completed_items.append("Collect verification evidence")
    captured: list[AgentEvent] = []

    async def emit(event: AgentEvent) -> None:
        captured.append(event)

    await runner.execute_batch(
        tool_calls=[write_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=emit,
        summary=summary,
        dod=dod,
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    # State recorded on the definition of done.
    assert dod.last_verification_result == "stale"
    assert dod.evidence == []
    assert "Collect verification evidence" in dod.pending_items
    assert "Collect verification evidence" not in dod.completed_items
    assert dod.active_verification_attempt_id == "verification-attempt-2"
    assert dod.active_verification_attempt_number == 2

    # Shape of the most recent workflow timeline entry.
    assert summary.workflow_timeline[-1].reason_code == "verification_stale"
    assert summary.workflow_timeline[-1].policy_outcome == "stale"
    assert summary.workflow_timeline[-1].verification_observations[0].status == "stale"
    assert (
        summary.workflow_timeline[-1].verification_observations[0].attempt_id
        == "verification-attempt-1"
    )
    assert summary.workflow_timeline[-1].verification_observations[0].attempt_number == 1
    assert (
        summary.workflow_timeline[-1].verification_observations[0].supersedes_attempt_id
        == "verification-attempt-2"
    )
    assert (
        summary.workflow_timeline[-1].verification_observations[0].command
        == "uv run pytest -q"
    )
6002
6003
def test_tool_batch_runner_blocked_active_repair_nudge_uses_repair_scope(temp_dir: Path) -> None:
    """The active-repair nudge should quote both paths from the repair-focus message.

    The session contains a "Repair focus" message naming the file to edit and
    the chapter file it references. The queued nudge is expected to mention
    both paths and to warn against reopening unrelated reference materials.
    """

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run in this scenario")

    guide_dir = temp_dir / "guide"
    repair_file = guide_dir / "index.html"
    referenced_chapter = guide_dir / "chapters" / "01-getting-started.html"
    repair_focus = Message(
        role=Role.ASSISTANT,
        content=(
            "Repair focus:\n"
            f"- Fix the broken local reference `chapters/01-getting-started.html` in `{repair_file}`.\n"
            f"- Immediate next step: edit `{repair_file}`.\n"
            f"- If the broken reference should remain, create `{referenced_chapter}`; otherwise remove or replace `chapters/01-getting-started.html`.\n"
        ),
    )
    context = build_context(
        temp_dir=temp_dir,
        messages=[repair_focus],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
    )
    nudges: list[str] = []
    context.queue_steering_message_callback = nudges.append
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))

    runner._queue_blocked_active_repair_nudge(
        "[Blocked - active repair scope: verification already identified the repair target.]"
    )

    assert nudges
    first_nudge = nudges[0]
    assert str(repair_file) in first_nudge
    assert str(temp_dir / "guide" / "chapters" / "01-getting-started.html") in first_nudge
    assert "Do not reopen unrelated reference materials" in first_nudge
6050
6051
def test_tool_batch_runner_blocked_active_repair_mutation_nudge_uses_allowed_paths(
    temp_dir: Path,
) -> None:
    """The mutation-scope nudge should list the repair target and the stylesheet path.

    The "Repair focus" message names a chapter file to edit and a stylesheet
    that may be created. The queued nudge is expected to mention both paths and
    the phrase "before widening the change set".
    """

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run in this scenario")

    guide_dir = temp_dir / "guide"
    repair_file = guide_dir / "chapters" / "05-advanced-configurations.html"
    css_file = guide_dir / "styles.css"
    repair_focus = Message(
        role=Role.ASSISTANT,
        content=(
            "Repair focus:\n"
            f"- Fix the broken local reference `../styles.css` in `{repair_file}`.\n"
            f"- Immediate next step: edit `{repair_file}`.\n"
            f"- If the broken reference should remain, create `{css_file}`; otherwise remove or replace `../styles.css`.\n"
        ),
    )
    context = build_context(
        temp_dir=temp_dir,
        messages=[repair_focus],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
    )
    nudges: list[str] = []
    context.queue_steering_message_callback = nudges.append
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))

    runner._queue_blocked_active_repair_mutation_nudge(
        "[Blocked - active repair mutation scope: verification already identified the repair target.]"
    )

    assert nudges
    first_nudge = nudges[0]
    assert str(repair_file) in first_nudge
    assert str(css_file) in first_nudge
    assert "before widening the change set" in first_nudge
6101
6102
def test_tool_batch_runner_blocked_late_reference_drift_nudge_points_to_missing_artifact(
    temp_dir: Path,
) -> None:
    """The late-reference-drift nudge should name the one planned file still missing.

    The plan lists four files and three of them exist on disk. The queued nudge
    is expected to mention ``03-first-website.html`` and the phrase
    "older reference materials".
    """

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run in this scenario")

    context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
    )
    nudges: list[str] = []
    context.queue_steering_message_callback = nudges.append
    store = DefinitionOfDoneStore(temp_dir)
    dod = create_definition_of_done("Create a multi-file guide from a reference")

    # Plan with relative paths; every line, including the last, ends in a newline.
    plan_file = temp_dir / "implementation.md"
    plan_lines = (
        "# File Changes",
        "- `guide/index.html`",
        "- `guide/chapters/01-getting-started.html`",
        "- `guide/chapters/02-installation.html`",
        "- `guide/chapters/03-first-website.html`",
    )
    plan_file.write_text("".join(f"{line}\n" for line in plan_lines))
    dod.implementation_plan = str(plan_file)

    # Create everything except the third chapter.
    guide_dir = temp_dir / "guide"
    chapter_dir = guide_dir / "chapters"
    chapter_dir.mkdir(parents=True, exist_ok=True)
    (guide_dir / "index.html").write_text("index")
    (chapter_dir / "01-getting-started.html").write_text("one")
    (chapter_dir / "02-installation.html").write_text("two")
    runner = ToolBatchRunner(context, store)

    runner._queue_blocked_late_reference_drift_nudge(
        "[Blocked - late reference drift: several planned artifacts already exist.]",
        dod=dod,
    )

    assert nudges
    first_nudge = nudges[0]
    assert "03-first-website.html" in first_nudge
    assert "older reference materials" in first_nudge
6155
6156
def test_tool_batch_runner_blocked_completed_artifact_scope_nudge_prefers_verification(
    temp_dir: Path,
) -> None:
    """With every planned artifact on disk, the scope nudge should move to verification.

    All directories and files in the plan exist and the only synced todo is a
    pending "Verify all guide files..." item. The test expects
    ``context.workflow_mode`` to become ``"verify"`` and the queued nudge to
    mention that todo and that verification should run next.
    """

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run in this scenario")

    guide_dir = temp_dir / "guide"
    chapter_dir = guide_dir / "chapters"
    guide_dir.mkdir(parents=True)
    chapter_dir.mkdir()
    index_file = guide_dir / "index.html"
    first_chapter = chapter_dir / "01-getting-started.html"
    second_chapter = chapter_dir / "02-installation.html"
    index_file.write_text("index")
    first_chapter.write_text("one")
    second_chapter.write_text("two")

    plan_file = temp_dir / "implementation.md"
    plan_lines = [
        "# Implementation Plan",
        "",
        "## File Changes",
        f"- `{guide_dir}`",
        f"- `{chapter_dir}`",
        f"- `{index_file}`",
        f"- `{first_chapter}`",
        f"- `{second_chapter}`",
        "",
    ]
    plan_file.write_text("\n".join(plan_lines))

    context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
    )
    nudges: list[str] = []
    context.queue_steering_message_callback = nudges.append
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))

    dod = create_definition_of_done("Create a multi-file guide from a reference")
    dod.implementation_plan = str(plan_file)
    dod.verification_commands = [f"ls -la {guide_dir}"]
    verify_todo = {
        "content": "Verify all guide files are linked and complete",
        "active_form": "Working on: Verify all guide files are linked and complete",
        "status": "pending",
    }
    sync_todos_to_definition_of_done(dod, [verify_todo], project_root=temp_dir)

    runner._queue_blocked_completed_artifact_scope_nudge(
        "[Blocked - completed artifact set scope: all explicitly planned artifacts already exist.]",
        dod=dod,
    )

    assert nudges
    assert context.workflow_mode == "verify"
    first_nudge = nudges[0]
    assert "All explicitly planned artifacts already exist." in first_nudge
    assert "Verify all guide files are linked and complete" in first_nudge
    assert "Do not reopen earlier reference materials." in first_nudge
    assert "Verification should run next" in first_nudge
6239
6240
def test_tool_batch_runner_blocked_post_build_audit_nudge_switches_to_verify(
    temp_dir: Path,
) -> None:
    """Check the nudge queued for a blocked post-build audit loop.

    Every path listed in the implementation plan is created on disk before the
    runner is asked to handle the blocked message. The test expects a queued
    steering message and ``context.workflow_mode`` to read ``"verify"``.
    """

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        # Guard stub: fails the test if confidence scoring is ever requested.
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        # Guard stub: fails the test if action verification is ever requested.
        raise AssertionError("Verification should not run in this scenario")

    # Lay out the guide tree so that every planned artifact already exists.
    guide_root = temp_dir / "guide"
    chapters = guide_root / "chapters"
    guide_root.mkdir(parents=True)
    chapters.mkdir()
    index_path = guide_root / "index.html"
    chapter_one = chapters / "01-getting-started.html"
    chapter_two = chapters / "02-installation.html"
    for artifact, body in ((index_path, "index"), (chapter_one, "one"), (chapter_two, "two")):
        artifact.write_text(body)

    # The plan lists both directories and all three files under "File Changes".
    plan_lines = ["# Implementation Plan", "", "## File Changes"]
    plan_lines += [
        f"- `{planned}`"
        for planned in (guide_root, chapters, index_path, chapter_one, chapter_two)
    ]
    plan_lines.append("")
    implementation_plan = temp_dir / "implementation.md"
    implementation_plan.write_text("\n".join(plan_lines))

    steering: list[str] = []
    context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
    )
    context.queue_steering_message_callback = steering.append
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))

    dod = create_definition_of_done("Create a multi-file guide from a reference")
    dod.implementation_plan = str(implementation_plan)
    dod.verification_commands = [f"ls -la {guide_root}"]

    runner._queue_blocked_completed_artifact_scope_nudge(
        "[Blocked - post-build audit loop: all explicitly planned artifacts already exist.]",
        dod=dod,
    )

    assert steering
    assert context.workflow_mode == "verify"
    nudge = steering[0]
    assert "All explicitly planned artifacts already exist." in nudge
    assert "move to verification or final confirmation" in nudge
6310
6311
@pytest.mark.asyncio
async def test_tool_batch_runner_does_not_halt_on_repeated_post_build_audit_blocks(
    temp_dir: Path,
) -> None:
    """Three blocked audit commands in one batch must not halt the runner.

    The executor is scripted to return the same post-build audit block for
    three identical ``bash`` calls. The test expects the batch result to be
    un-halted with a zero consecutive-error count, the workflow mode to read
    ``"verify"``, and a steering nudge pointing at verification to be queued.
    """

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        # Guard stub: fails the test if confidence scoring is ever requested.
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        # Guard stub: fails the test if action verification is ever requested.
        raise AssertionError("Verification should not run in this scenario")

    # Every planned artifact exists on disk before the batch runs.
    guide_root = temp_dir / "guide"
    chapters = guide_root / "chapters"
    guide_root.mkdir(parents=True)
    chapters.mkdir()
    index_path = guide_root / "index.html"
    chapter_one = chapters / "01-getting-started.html"
    chapter_two = chapters / "02-installation.html"
    index_path.write_text("index")
    chapter_one.write_text("one")
    chapter_two.write_text("two")

    implementation_plan = temp_dir / "implementation.md"
    implementation_plan.write_text(
        "\n".join(
            [
                "# Implementation Plan",
                "",
                "## File Changes",
                f"- `{guide_root}`",
                f"- `{chapters}`",
                f"- `{index_path}`",
                f"- `{chapter_one}`",
                f"- `{chapter_two}`",
                "",
            ]
        )
    )

    context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
    )
    queued: list[str] = []
    context.queue_steering_message_callback = queued.append
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))
    dod = create_definition_of_done("Create a multi-file guide from a reference")
    dod.implementation_plan = str(implementation_plan)
    dod.verification_commands = [f"ls -la {guide_root}"]

    blocked_message = (
        "[Blocked - post-build audit loop: all explicitly planned artifacts already exist.]"
    )
    # Three identical audit commands, distinguished only by their call ids.
    tool_calls = [
        ToolCall(
            id=f"audit-{index}",
            name="bash",
            arguments={"command": f"cd {temp_dir} && ls -la guide/chapters/"},
        )
        for index in range(1, 4)
    ]
    executor = FakeExecutor(
        [
            tool_outcome(
                tool_call=tool_call,
                output=blocked_message,
                is_error=True,
                state=ToolExecutionState.BLOCKED,
            )
            for tool_call in tool_calls
        ]
    )

    # Emitted events are not inspected here, so the shared no-op emitter is
    # used instead of collecting them into a list nobody reads.
    result = await runner.execute_batch(
        tool_calls=tool_calls,
        tool_source="native",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=TurnSummary(final_response=""),
        dod=dod,
        executor=executor,
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert result.halted is False
    assert result.consecutive_errors == 0
    assert context.workflow_mode == "verify"
    assert queued
    assert any("move to verification or final confirmation" in message for message in queued)
6419
6420
def test_tool_batch_runner_blocked_html_declared_target_nudge_uses_closest_declared_target(
    temp_dir: Path,
) -> None:
    """The declared-target nudge should surface the closest declared target.

    The blocked message names ``chapters/02-installation.html`` as the closest
    declared local target. The queued steering message is expected to mention
    the file being written, that target in backticks, and "same file now".
    """

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        # Guard stub: fails the test if confidence scoring is ever requested.
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        # Guard stub: fails the test if action verification is ever requested.
        raise AssertionError("Verification should not run in this scenario")

    steering: list[str] = []
    context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
    )
    context.queue_steering_message_callback = steering.append
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))

    chapter_path = str(temp_dir / "guide" / "chapters" / "01-introduction.html")
    blocked_write = ToolCall(
        id="write-ch1",
        name="write",
        arguments={"file_path": chapter_path},
    )
    blocked_message = (
        "[Blocked - HTML page introduces new local targets outside the current declared artifact set] "
        "Suggestion: Keep non-root HTML pages within the root-declared local-link set and avoid "
        "introducing new sibling targets that the guide root does not declare, for example fix: 02-setup.html. "
        "Already-declared local targets include: chapters/01-introduction.html, chapters/02-installation.html, "
        "chapters/03-configuration.html. Closest declared local targets include: chapters/02-installation.html"
    )

    runner._queue_blocked_html_declared_target_nudge(blocked_write, blocked_message)

    assert steering
    nudge = steering[0]
    assert chapter_path in nudge
    assert "`chapters/02-installation.html`" in nudge
    assert "same file now" in nudge
6469
6470
def test_tool_batch_runner_blocked_html_declared_target_nudge_without_close_match(
    temp_dir: Path,
) -> None:
    """The declared-target nudge when the block lists no closest targets.

    The blocked message here has no "Closest declared local targets" section.
    The queued steering message is expected to tell the model to remove the
    invented hrefs, to mention a declared target, and to omit the
    "closest declared target(s)" wording.
    """

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        # Guard stub: fails the test if confidence scoring is ever requested.
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        # Guard stub: fails the test if action verification is ever requested.
        raise AssertionError("Verification should not run in this scenario")

    steering: list[str] = []
    context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
    )
    context.queue_steering_message_callback = steering.append
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))

    blocked_write = ToolCall(
        id="write-ch1",
        name="write",
        arguments={"file_path": str(temp_dir / "guide" / "chapters" / "introduction.html")},
    )
    blocked_message = (
        "[Blocked - HTML page introduces new local targets outside the current declared artifact set] "
        "Suggestion: Keep non-root HTML pages within the root-declared local-link set and avoid "
        "introducing new sibling targets that the guide root does not declare; remove or replace "
        "undeclared hrefs like: troubleshooting.html. "
        "Already-declared local targets include: chapters/introduction.html, chapters/installation.html, "
        "chapters/configuration.html."
    )

    runner._queue_blocked_html_declared_target_nudge(blocked_write, blocked_message)

    assert steering
    nudge = steering[0]
    assert "Remove the invented hrefs or keep local links within the declared target set" in nudge
    assert "`chapters/installation.html`" in nudge
    assert "closest declared target(s)" not in nudge
6520
6521
def test_tool_batch_runner_blocked_html_declared_file_creation_nudge_points_to_root(
    temp_dir: Path,
) -> None:
    """The file-creation nudge should point back at the guide root.

    Nothing is created on disk and the definition of done carries no plan. The
    queued steering message is expected to mention updating, the resolved guide
    root path, the blocked target in backticks, and retrying the creation.
    """

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        # Guard stub: fails the test if confidence scoring is ever requested.
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        # Guard stub: fails the test if action verification is ever requested.
        raise AssertionError("Verification should not run in this scenario")

    steering: list[str] = []
    context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
    )
    context.queue_steering_message_callback = steering.append
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))
    dod = create_definition_of_done("Create a guide.")

    # The same resolved root path is embedded in the block text and asserted on.
    guide_index = (temp_dir / "guide" / "index.html").resolve(strict=False)
    target = temp_dir / "guide" / "chapters" / "troubleshooting.html"
    blocked_write = ToolCall(
        id="write-troubleshooting",
        name="write",
        arguments={"file_path": str(target)},
    )
    blocked_message = (
        "[Blocked - HTML file creation falls outside the current declared artifact set] "
        "Suggestion: Keep new non-root HTML files within the root-declared artifact set and "
        f"update the guide root `{guide_index}` "
        "before creating undeclared sibling pages, for example: chapters/troubleshooting.html. "
        "Already-declared local targets include: chapters/advanced-topics.html, "
        "chapters/basic-usage.html, chapters/configuration.html"
    )

    runner._queue_blocked_html_declared_file_creation_nudge(
        blocked_write,
        blocked_message,
        dod=dod,
    )

    assert steering
    nudge = steering[0]
    assert "update" in nudge.lower()
    assert str(guide_index) in nudge
    assert "`chapters/troubleshooting.html`" in nudge
    assert "retry the file creation" in nudge
6575
6576
def test_tool_batch_runner_blocked_html_declared_file_creation_after_outputs_exist_prefers_verify(
    temp_dir: Path,
) -> None:
    """A blocked extra page, once all planned files exist, should steer to verify.

    The index and both planned chapters are on disk and recorded as touched
    files. The queued steering message is expected to say the planned artifacts
    exist, to reject the extra chapter, to point at verification, and to omit
    the "update the guide root" advice.
    """

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        # Guard stub: fails the test if confidence scoring is ever requested.
        raise AssertionError("Confidence scoring should not run in this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        # Guard stub: fails the test if action verification is ever requested.
        raise AssertionError("Verification should not run in this scenario")

    guide = temp_dir / "guide"
    chapters = guide / "chapters"
    guide.mkdir()
    chapters.mkdir()

    index = guide / "index.html"
    intro_page = chapters / "01-introduction.html"
    install_page = chapters / "02-installation.html"
    planned_outputs = [index, intro_page, install_page]

    index_links = [
        '<a href="chapters/01-introduction.html">Intro</a>',
        '<a href="chapters/02-installation.html">Install</a>',
        '<a href="../index.html">Back</a>',
        "",
    ]
    index.write_text("\n".join(index_links))
    for chapter_page in (intro_page, install_page):
        chapter_page.write_text("<html></html>\n")

    # The plan lists exactly the three files that were just written.
    plan_lines = ["# Implementation Plan", "", "## File Changes"]
    plan_lines += [f"- `{planned}`" for planned in planned_outputs]
    plan_lines.append("")
    implementation_plan = temp_dir / "implementation.md"
    implementation_plan.write_text("\n".join(plan_lines))

    steering: list[str] = []
    context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
    )
    context.queue_steering_message_callback = steering.append
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))

    dod = create_definition_of_done("Create a guide.")
    dod.implementation_plan = str(implementation_plan)
    dod.verification_commands = [f"ls -la {guide}"]
    dod.touched_files = [str(planned) for planned in planned_outputs]

    # The blocked write targets a chapter that is not part of the plan.
    target = guide / "chapters" / "08-advanced-configuration.html"
    blocked_write = ToolCall(
        id="write-extra",
        name="write",
        arguments={"file_path": str(target)},
    )
    blocked_message = (
        "[Blocked - HTML file creation falls outside the current declared artifact set] "
        "Suggestion: Keep new non-root HTML files within the root-declared artifact set and "
        f"update the guide root `{index.resolve(strict=False)}` before creating undeclared sibling pages, "
        "for example: chapters/08-advanced-configuration.html."
    )

    runner._queue_blocked_html_declared_file_creation_nudge(
        blocked_write,
        blocked_message,
        dod=dod,
    )

    assert steering
    nudge = steering[0]
    assert "All explicitly planned artifacts already exist on disk." in nudge
    assert "Do not expand the output set with `chapters/08-advanced-configuration.html`." in nudge
    assert "Move to verification or final confirmation using the files already on disk." in nudge
    assert "update the guide root" not in nudge
6664
6665
def test_tool_batch_runner_blocked_html_missing_target_after_outputs_exist_prefers_verify(
    temp_dir: Path,
) -> None:
    """A blocked missing-link edit, once all planned files exist, should not expand scope.

    The index and both planned chapters are on disk and recorded as touched
    files. The blocked edit of the index references a chapter that does not
    exist; the queued steering message is expected to say the planned artifacts
    exist and to ask for repair of the existing files rather than new targets.
    """

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        # Guard stub: fails the test if confidence scoring is ever requested.
        raise AssertionError("Confidence scoring should not run in this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        # Guard stub: fails the test if action verification is ever requested.
        raise AssertionError("Verification should not run in this scenario")

    guide = temp_dir / "guide"
    chapters = guide / "chapters"
    guide.mkdir()
    chapters.mkdir()

    index = guide / "index.html"
    intro_page = chapters / "01-introduction.html"
    install_page = chapters / "02-installation.html"
    planned_outputs = [index, intro_page, install_page]

    index_links = [
        '<a href="chapters/01-introduction.html">Intro</a>',
        '<a href="chapters/02-installation.html">Install</a>',
        '<a href="../index.html">Back</a>',
        "",
    ]
    index.write_text("\n".join(index_links))
    for chapter_page in (intro_page, install_page):
        chapter_page.write_text("<html></html>\n")

    # The plan lists exactly the three files that were just written.
    plan_lines = ["# Implementation Plan", "", "## File Changes"]
    plan_lines += [f"- `{planned}`" for planned in planned_outputs]
    plan_lines.append("")
    implementation_plan = temp_dir / "implementation.md"
    implementation_plan.write_text("\n".join(plan_lines))

    steering: list[str] = []
    context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
    )
    context.queue_steering_message_callback = steering.append
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))

    dod = create_definition_of_done("Create a guide.")
    dod.implementation_plan = str(implementation_plan)
    dod.verification_commands = [f"ls -la {guide}"]
    dod.touched_files = [str(planned) for planned in planned_outputs]

    blocked_edit = ToolCall(
        id="edit-root",
        name="edit",
        arguments={"file_path": str(index)},
    )
    blocked_message = (
        "[Blocked - Edited HTML links point to files that do not exist] "
        "Suggestion: Use only existing local targets for href values and avoid introducing missing links, "
        "for example fix: chapters/08-advanced-configuration.html"
    )

    runner._queue_blocked_html_missing_target_nudge(
        blocked_edit,
        blocked_message,
        dod=dod,
    )

    assert steering
    nudge = steering[0]
    assert "All explicitly planned artifacts already exist on disk." in nudge
    assert "Do not introduce new local-link targets beyond the current output set." in nudge
    assert "Repair the existing generated files instead of expanding the guide." in nudge
6750
6751
@pytest.mark.asyncio
async def test_tool_batch_runner_blocked_empty_file_path_nudges_concrete_next_artifact(
    temp_dir: Path,
) -> None:
    """A ``write`` blocked for an empty ``file_path`` should name the next artifact.

    The plan lists an index and two chapters, but only the index and the first
    chapter are written. The queued steering message is expected to call out
    the invalid ``file_path`` and point at ``02-installation.html``; the
    blocked message must also be the error of the last recorded recovery attempt.
    """

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        # Guard stub: fails the test if confidence scoring is ever requested.
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        # Guard stub: fails the test if action verification is ever requested.
        raise AssertionError("Verification should not run in this scenario")

    guide_root = temp_dir / "guides" / "nginx"
    chapters = guide_root / "chapters"
    chapters.mkdir(parents=True)
    index_path = guide_root / "index.html"
    chapter_one = chapters / "01-introduction.html"
    # chapter_two is planned but deliberately never written to disk.
    chapter_two = chapters / "02-installation.html"
    index_path.write_text("<html></html>\n")
    chapter_one.write_text("<h1>Intro</h1>\n")

    plan_lines = ["# Implementation Plan", "", "## File Changes"]
    plan_lines += [f"- `{planned}`" for planned in (index_path, chapter_one, chapter_two)]
    plan_lines.append("")
    implementation_plan = temp_dir / "implementation.md"
    implementation_plan.write_text("\n".join(plan_lines))

    steering: list[str] = []
    context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )
    context.queue_steering_message_callback = steering.append
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))

    tool_call = ToolCall(
        id="write-2",
        name="write",
        arguments={"file_path": "", "content": "<html></html>\n"},
    )
    blocked_message = "[Blocked - Empty file path] Suggestion: Provide a valid file path"
    # The scripted outcome carries the same blocked text in every field.
    blocked_result = Message.tool_result_message(
        tool_call_id=tool_call.id,
        display_content=blocked_message,
        result_content=blocked_message,
        is_error=True,
    )
    blocked_outcome = ToolExecutionOutcome(
        tool_call=tool_call,
        state=ToolExecutionState.BLOCKED,
        message=blocked_result,
        event_content=blocked_message,
        is_error=True,
        result_output=blocked_message,
    )
    executor = FakeExecutor([blocked_outcome])

    dod = create_definition_of_done("Create a multi-file nginx guide.")
    dod.implementation_plan = str(implementation_plan)
    dod.touched_files.extend([str(index_path), str(chapter_one)])
    dod.pending_items.append("Creating Chapter 2: Installation and Setup")

    await runner.execute_batch(
        tool_calls=[tool_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=TurnSummary(final_response=""),
        dod=dod,
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert steering
    nudge = steering[0]
    assert "did not provide a valid `file_path`" in nudge
    assert "Resume by creating `02-installation.html` now." in nudge
    assert (
        f"Prefer one `write` call for `{display_runtime_path(chapter_two)}` instead of more rereads."
        in nudge
    )
    assert context.recovery_context is not None
    assert context.recovery_context.attempts[-1].error == blocked_message