Python · 239230 bytes Raw Blame History
1 """Tests for tool-batch execution on RuntimeContext."""
2
3 from __future__ import annotations
4
5 from pathlib import Path
6 from types import SimpleNamespace
7
8 import pytest
9
10 from loader.llm.base import Message, Role, ToolCall
11 from loader.runtime.context import RuntimeContext
12 from loader.runtime.dod import (
13 DefinitionOfDoneStore,
14 VerificationEvidence,
15 create_definition_of_done,
16 )
17 from loader.runtime.events import AgentEvent, TurnSummary
18 from loader.runtime.executor import ToolExecutionOutcome, ToolExecutionState
19 from loader.runtime.path_display import display_runtime_path
20 from loader.runtime.permissions import (
21 PermissionMode,
22 build_permission_policy,
23 load_permission_rules,
24 )
25 from loader.runtime.reasoning_types import (
26 ActionVerification,
27 ConfidenceAssessment,
28 ConfidenceLevel,
29 )
30 from loader.runtime.recovery import RecoveryContext
31 from loader.runtime.tool_batches import (
32 ToolBatchRunner,
33 )
34 from loader.runtime.tool_batches import (
35 _should_prioritize_missing_artifact as tool_batches_should_prioritize_missing_artifact,
36 )
37 from loader.runtime.workflow import sync_todos_to_definition_of_done
38 from loader.tools.base import ToolResult as RegistryToolResult
39 from loader.tools.base import create_default_registry
40 from tests.helpers.runtime_harness import ScriptedBackend
41
42
class FakeSession:
    """In-memory session double that records messages and timeline entries.

    The seeded messages are copied on construction, so appending through the
    fake never mutates the list the test passed in.
    """

    def __init__(self, messages: list[Message]) -> None:
        self.workflow_timeline = []
        self.messages = [*messages]

    def append(self, message: Message) -> None:
        """Record *message* at the end of the conversation."""
        self.messages.append(message)

    def append_workflow_timeline_entry(self, entry) -> None:
        """Record *entry* at the end of the workflow timeline."""
        self.workflow_timeline.append(entry)
53
54
class FakeCodeFilter:
    """Stateless code-filter double exposing only the ``reset`` hook."""

    def reset(self) -> None:
        """Do nothing; the fake holds no state to clear."""
58
59
class FakeSafeguards:
    """Inert safeguards double: content passes through and nothing steers.

    Only ``detect_loop`` is configurable; it returns whatever tuple was given
    as ``detect_loop_result`` at construction time.
    """

    def __init__(self, *, detect_loop_result: tuple[bool, str] = (False, "")) -> None:
        self._detect_loop_result = detect_loop_result
        # Opaque placeholders: bare objects, so they expose no behaviour.
        self.action_tracker = object()
        self.validator = object()
        self.code_filter = FakeCodeFilter()

    def filter_stream_chunk(self, content: str) -> str:
        """Return the streamed chunk unchanged."""
        return content

    def filter_complete_content(self, content: str) -> str:
        """Return the complete content unchanged."""
        return content

    def should_steer(self) -> bool:
        """Report that steering is never needed."""
        return False

    def get_steering_message(self) -> str | None:
        """Return ``None``: the fake never has a steering message."""
        return None

    def record_response(self, content: str) -> None:
        """Accept a response and discard it."""
        return None

    def detect_text_loop(self, content: str) -> tuple[bool, str]:
        """Report that no text loop was found."""
        return (False, "")

    def detect_loop(self) -> tuple[bool, str]:
        """Return the loop-detection result configured on construction."""
        return self._detect_loop_result
87
88
class FakeExecutor:
    """Executor double that replays pre-queued outcomes in FIFO order.

    Every requested tool call is recorded in ``calls``, including one that
    arrives after the queue has run dry.
    """

    def __init__(self, outcomes: list[ToolExecutionOutcome]) -> None:
        self.calls: list[ToolCall] = []
        # Private copy so draining the queue leaves the caller's list intact.
        self._outcomes = [*outcomes]

    async def execute_tool_call(self, tool_call: ToolCall, **_: object) -> ToolExecutionOutcome:
        """Record *tool_call* and hand back the next queued outcome.

        Extra keyword arguments are accepted and ignored.

        Raises:
            AssertionError: if no queued outcome is left.
        """
        self.calls.append(tool_call)
        if self._outcomes:
            return self._outcomes.pop(0)
        raise AssertionError("No fake tool outcome queued")
99
100
def build_context(
    *,
    temp_dir: Path,
    messages: list[Message],
    safeguards: FakeSafeguards,
    assess_confidence,
    verify_action,
    recovery_context: RecoveryContext | None = None,
    confidence_scoring: bool = False,
    verification: bool = False,
    auto_recover: bool = True,
    min_confidence_for_action: int = 3,
) -> RuntimeContext:
    """Assemble a ``RuntimeContext`` rooted at *temp_dir* for these tests.

    The default tool registry and the permission rules are both loaded from
    *temp_dir*; the policy runs in ``WORKSPACE_WRITE`` mode. The session is a
    ``FakeSession`` seeded with *messages* and the backend a ``ScriptedBackend``.

    Args:
        temp_dir: Workspace and project root.
        messages: Initial conversation given to the fake session.
        safeguards: Safeguards double installed on the context.
        assess_confidence: Callable exposed as ``reasoning.assess_confidence``.
        verify_action: Callable exposed as ``reasoning.verify_action``.
        recovery_context: Pre-existing recovery context, if any.
        confidence_scoring: Value for ``config.reasoning.confidence_scoring``.
        verification: Value for ``config.reasoning.verification``.
        auto_recover: Value for ``config.auto_recover``.
        min_confidence_for_action: Value for
            ``config.reasoning.min_confidence_for_action``.

    Returns:
        The constructed context with ``workflow_mode="execute"``.
    """
    tool_registry = create_default_registry(temp_dir)
    tool_registry.configure_workspace_root(temp_dir)
    permission_rules = load_permission_rules(temp_dir)
    permission_policy = build_permission_policy(
        active_mode=PermissionMode.WORKSPACE_WRITE,
        workspace_root=temp_dir,
        tool_requirements=tool_registry.get_tool_requirements(),
        rules=permission_rules.rules,
    )

    # Plain namespaces stand in for the real config and reasoning objects.
    reasoning_settings = SimpleNamespace(
        rollback=False,
        show_rollback_plan=False,
        completion_check=True,
        max_continuation_prompts=5,
        self_critique=False,
        confidence_scoring=confidence_scoring,
        min_confidence_for_action=min_confidence_for_action,
        verification=verification,
    )
    runtime_config = SimpleNamespace(
        force_react=False,
        max_recovery_attempts=2,
        auto_recover=auto_recover,
        reasoning=reasoning_settings,
    )
    reasoning_services = SimpleNamespace(
        assess_confidence=assess_confidence,
        verify_action=verify_action,
    )

    return RuntimeContext(
        project_root=temp_dir,
        backend=ScriptedBackend(),
        registry=tool_registry,
        session=FakeSession(messages),  # type: ignore[arg-type]
        config=runtime_config,
        capability_profile=SimpleNamespace(supports_native_tools=True),  # type: ignore[arg-type]
        project_context=None,
        permission_policy=permission_policy,
        permission_config_status=permission_rules,
        workflow_mode="execute",
        safeguards=safeguards,
        reasoning=reasoning_services,
        recovery_context=recovery_context,
    )
156
157
def tool_outcome(
    *,
    tool_call: ToolCall,
    output: str,
    is_error: bool,
    state: ToolExecutionState = ToolExecutionState.EXECUTED,
    metadata: dict[str, object] | None = None,
) -> ToolExecutionOutcome:
    """Build a ``ToolExecutionOutcome`` whose every text field is *output*.

    Args:
        tool_call: The call the outcome answers; its id tags the result message.
        output: Text used for the display, result, event and registry output.
        is_error: Error flag copied onto the message, outcome and registry result.
        state: Execution state to report; defaults to ``EXECUTED``.
        metadata: Registry-result metadata; a falsy value becomes ``{}``.
    """
    result_message = Message.tool_result_message(
        tool_call_id=tool_call.id,
        display_content=output,
        result_content=output,
        is_error=is_error,
    )
    registry_result = RegistryToolResult(
        output=output,
        is_error=is_error,
        metadata=metadata if metadata else {},
    )
    return ToolExecutionOutcome(
        tool_call=tool_call,
        state=state,
        message=result_message,
        event_content=output,
        is_error=is_error,
        result_output=output,
        registry_result=registry_result,
    )
184
185
@pytest.mark.asyncio
async def test_tool_batch_runner_uses_context_for_confidence_gate(temp_dir: Path) -> None:
    """A LOW confidence assessment skips the tool call and warns instead.

    The assessor stub records the context string it is handed so the test can
    check that the seeded user message reached it.
    """
    seen: dict[str, str] = {}
    emitted: list[AgentEvent] = []

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        seen["context"] = context
        return ConfidenceAssessment(
            action=f"{tool_name} with {tool_args}",
            tool_name=tool_name,
            tool_args=tool_args,
            level=ConfidenceLevel.LOW,
            reasoning="Need to inspect the target first.",
            risks=["Unknown target file"],
        )

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run for skipped actions")

    async def emit(event: AgentEvent) -> None:
        emitted.append(event)

    history = [
        Message(role=Role.USER, content="Please inspect the project."),
        Message(role=Role.ASSISTANT, content="I will read the file next."),
    ]
    context = build_context(
        temp_dir=temp_dir,
        messages=history,
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        confidence_scoring=True,
        min_confidence_for_action=3,
    )
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))
    read_call = ToolCall(id="read-1", name="read", arguments={"file_path": "README.md"})
    # The queued outcome must stay unused: the gate should stop the call first.
    executor = FakeExecutor([tool_outcome(tool_call=read_call, output="unused", is_error=False)])

    result = await runner.execute_batch(
        tool_calls=[read_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=emit,
        summary=TurnSummary(final_response=""),
        dod=create_definition_of_done("Read the docs"),
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert result.actions_taken == []
    assert executor.calls == []
    assert "Please inspect the project." in seen["context"]
    last_message = context.session.messages[-1]
    assert last_message.role == Role.USER
    assert "[LOW CONFIDENCE WARNING]" in last_message.content
    assert any(event.type == "confidence" for event in emitted)
245
246
@pytest.mark.asyncio
async def test_tool_batch_runner_tracks_recovery_with_legacy_context(temp_dir: Path) -> None:
    """A failing tool call with auto-recovery on leaves a recovery context set.

    Also checks that the failing tool result is recorded on the summary, is the
    last session message, and that a ``recovery`` event is emitted.
    """
    emitted: list[AgentEvent] = []

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run for failed actions")

    async def emit(event: AgentEvent) -> None:
        emitted.append(event)

    context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=True,
    )
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))
    bash_call = ToolCall(id="bash-1", name="bash", arguments={"command": "pytest"})
    failing_outcome = tool_outcome(tool_call=bash_call, output="command failed", is_error=True)
    executor = FakeExecutor([failing_outcome])
    summary = TurnSummary(final_response="")

    await runner.execute_batch(
        tool_calls=[bash_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=emit,
        summary=summary,
        dod=create_definition_of_done("Run tests"),
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert context.recovery_context is not None
    assert summary.tool_result_messages
    assert context.session.messages[-1] == summary.tool_result_messages[-1]
    assert any(event.type == "recovery" for event in emitted)
290
291
@pytest.mark.asyncio
async def test_tool_batch_runner_emits_tool_metadata(temp_dir: Path) -> None:
    """The ``tool_result`` event carries the registry result's metadata.

    A background ``bash`` call is answered by a fake outcome whose registry
    result holds job metadata; the emitted event must expose that same mapping
    as ``tool_metadata``.
    """

    async def assess_confidence(tool_name: str, tool_args: dict, context: str) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def verify_action(tool_name: str, tool_args: dict, result: str, expected: str = "") -> ActionVerification:
        raise AssertionError("Verification should not run for this scenario")

    context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))
    tool_call = ToolCall(
        id="bash-1",
        name="bash",
        arguments={"command": "python -m http.server 8000", "background": True},
    )
    metadata = {
        "job_id": "bash-1",
        "status": "running",
        "background": True,
    }
    executor = FakeExecutor(
        [
            tool_outcome(
                tool_call=tool_call,
                output="Started bash job bash-1",
                is_error=False,
                metadata=metadata,
            )
        ]
    )
    events: list[AgentEvent] = []

    async def emit(event: AgentEvent) -> None:
        events.append(event)

    await runner.execute_batch(
        tool_calls=[tool_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=emit,
        summary=TurnSummary(final_response=""),
        dod=create_definition_of_done("Launch a preview server"),
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    # Pass a default to next(): a bare StopIteration inside a coroutine is
    # re-raised as "RuntimeError: coroutine raised StopIteration", which would
    # hide the real problem (no tool_result event) behind an unrelated error.
    tool_result = next(
        (event for event in events if event.type == "tool_result"),
        None,
    )
    assert tool_result is not None, "no tool_result event was emitted"
    assert tool_result.tool_metadata == metadata
350
351
@pytest.mark.asyncio
async def test_tool_batch_runner_verifies_with_context_services(temp_dir: Path) -> None:
    """With verification on, a successful read is handed to ``verify_action``.

    The verifier stub reports a failed verification; the test checks that the
    pre-existing recovery context is kept and records the read as a successful
    step, that the tool result stays the last session message, and that a
    ``verification`` event is emitted.
    """
    verified_results: list[str] = []
    emitted: list[AgentEvent] = []

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        verified_results.append(result)
        return ActionVerification(
            tool_name=tool_name,
            tool_args=tool_args,
            expected_outcome="Success",
            actual_result=result,
            verified=False,
            discrepancies=["File contents did not match"],
            needs_correction=True,
            correction_suggestion="Read the file before editing again.",
        )

    async def emit(event: AgentEvent) -> None:
        emitted.append(event)

    existing_recovery = RecoveryContext(
        original_tool="edit",
        original_args={"file_path": "README.md"},
    )
    context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        recovery_context=existing_recovery,
        verification=True,
    )
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))
    read_call = ToolCall(id="read-1", name="read", arguments={"file_path": "README.md"})
    executor = FakeExecutor([tool_outcome(tool_call=read_call, output="file contents", is_error=False)])

    await runner.execute_batch(
        tool_calls=[read_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=emit,
        summary=TurnSummary(final_response=""),
        dod=create_definition_of_done("Read the docs"),
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert verified_results == ["file contents"]
    assert context.recovery_context is existing_recovery
    assert existing_recovery.successful_steps == [("read", {"file_path": "README.md"})]
    last_message = context.session.messages[-1]
    assert last_message.role == Role.TOOL
    assert last_message.content == "file contents"
    assert any(event.type == "verification" for event in emitted)
415
416
@pytest.mark.asyncio
async def test_tool_batch_runner_preserves_recovery_context_across_diagnostic_success(
    temp_dir: Path,
) -> None:
    """A successful ``bash`` listing keeps the active recovery context.

    The recovery context starts with one failed ``read`` attempt; after the
    ``ls`` succeeds it must be the same object and list the bash call as its
    only successful step.
    """
    missing_chapter = {"file_path": "chapters/04-data-types.html"}

    async def assess_confidence(tool_name: str, tool_args: dict, context: str) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def verify_action(tool_name: str, tool_args: dict, result: str, expected: str = "") -> ActionVerification:
        raise AssertionError("Verification should not run for this scenario")

    # Separate dict copies: the original passed two independent literals.
    existing_recovery = RecoveryContext(original_tool="read", original_args=dict(missing_chapter))
    existing_recovery.add_attempt("read", dict(missing_chapter), "File not found")

    context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        recovery_context=existing_recovery,
        auto_recover=False,
    )
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))
    list_call = ToolCall(id="bash-1", name="bash", arguments={"command": "ls chapters"})
    listing_outcome = tool_outcome(tool_call=list_call, output="01-introduction.html", is_error=False)
    executor = FakeExecutor([listing_outcome])
    summary = TurnSummary(final_response="")

    await runner.execute_batch(
        tool_calls=[list_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=summary,
        dod=create_definition_of_done("Fix the chapter links"),
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert context.recovery_context is existing_recovery
    assert existing_recovery.successful_steps == [("bash", {"command": "ls chapters"})]
483
484
@pytest.mark.asyncio
async def test_tool_batch_runner_clears_recovery_context_after_successful_mutation(
    temp_dir: Path,
) -> None:
    """A successful ``patch`` call resets ``context.recovery_context`` to None.

    The recovery context starts with one failed ``read`` attempt, mirroring the
    diagnostic-success scenario, but here the follow-up call is a patch.
    """
    missing_chapter = {"file_path": "chapters/04-data-types.html"}

    async def assess_confidence(tool_name: str, tool_args: dict, context: str) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def verify_action(tool_name: str, tool_args: dict, result: str, expected: str = "") -> ActionVerification:
        raise AssertionError("Verification should not run for this scenario")

    # Separate dict copies: the original passed two independent literals.
    existing_recovery = RecoveryContext(original_tool="read", original_args=dict(missing_chapter))
    existing_recovery.add_attempt("read", dict(missing_chapter), "File not found")

    context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        recovery_context=existing_recovery,
        auto_recover=False,
    )
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))
    single_line_hunk = {
        "old_start": 1,
        "old_lines": 1,
        "new_start": 1,
        "new_lines": 1,
        "lines": ["-a", "+b"],
    }
    patch_call = ToolCall(
        id="patch-1",
        name="patch",
        arguments={"file_path": "index.html", "hunks": [single_line_hunk]},
    )
    patched_outcome = tool_outcome(tool_call=patch_call, output="Patched index.html", is_error=False)
    executor = FakeExecutor([patched_outcome])
    summary = TurnSummary(final_response="")

    await runner.execute_batch(
        tool_calls=[patch_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=summary,
        dod=create_definition_of_done("Fix the chapter links"),
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert context.recovery_context is None
551
552
@pytest.mark.asyncio
async def test_tool_batch_runner_queues_duplicate_observation_nudge(
    temp_dir: Path,
) -> None:
    """A duplicate read queues one persistent nudge toward the missing file.

    The implementation plan lists ``chapters/04-variables.html``, which this
    test never creates on disk; the queued message is expected to name it and
    recommend a ``write`` call. No ephemeral message may be queued.
    """
    index_path = temp_dir / "index.html"
    chapters_dir = temp_dir / "chapters"
    first_chapter = chapters_dir / "01-introduction.html"
    missing_chapter = chapters_dir / "04-variables.html"

    async def assess_confidence(tool_name: str, tool_args: dict, context: str) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def verify_action(tool_name: str, tool_args: dict, result: str, expected: str = "") -> ActionVerification:
        raise AssertionError("Verification should not run for this scenario")

    # History: a glob observation, a read of chapter 1 with its result, and a
    # read of index.html that makes the next index read a repeat.
    glob_observation = Message(
        role=Role.TOOL,
        content=(
            "Observation [glob]: Result: "
            f"{temp_dir}/chapters/01-introduction.html\n"
            f"{temp_dir}/chapters/02-setup.html\n"
            f"{temp_dir}/chapters/03-basics.html"
        ),
        tool_results=[],
    )
    chapter_read_request = Message(
        role=Role.ASSISTANT,
        content="I already inspected the first chapter title.",
        tool_calls=[
            ToolCall(
                id="read-ch1",
                name="read",
                arguments={"file_path": str(first_chapter)},
            )
        ],
    )
    chapter_read_result = Message.tool_result_message(
        tool_call_id="read-ch1",
        display_content="<h1>Chapter 1: Introduction to Fortran</h1>\n",
        result_content="<h1>Chapter 1: Introduction to Fortran</h1>\n",
    )
    index_read_request = Message(
        role=Role.ASSISTANT,
        content="I should update the index now.",
        tool_calls=[
            ToolCall(
                id="read-index",
                name="read",
                arguments={"file_path": str(index_path)},
            )
        ],
    )
    history = [
        glob_observation,
        chapter_read_request,
        chapter_read_result,
        index_read_request,
    ]

    context = build_context(
        temp_dir=temp_dir,
        messages=history,
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )

    # The workspace files are created only after the context has been built.
    chapters_dir.mkdir()
    index_path.write_text("<ul></ul>\n")
    first_chapter.write_text("<h1>Intro</h1>\n")
    (chapters_dir / "02-setup.html").write_text("<h1>Setup</h1>\n")
    (chapters_dir / "03-basics.html").write_text("<h1>Basics</h1>\n")

    planned_paths = [
        index_path,
        first_chapter,
        chapters_dir / "02-setup.html",
        chapters_dir / "03-basics.html",
        missing_chapter,
    ]
    implementation_plan = temp_dir / "implementation.md"
    implementation_plan.write_text(
        "\n".join(
            [
                "# Implementation Plan",
                "",
                "## File Changes",
                *(f"- `{planned}`" for planned in planned_paths),
            ]
        )
    )

    context.session.current_task = f"Update {index_path} with the right chapter links."
    persistent_messages: list[str] = []
    ephemeral_messages: list[str] = []
    context.queue_steering_message_callback = persistent_messages.append
    context.queue_ephemeral_steering_message_callback = ephemeral_messages.append

    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))
    repeat_read = ToolCall(
        id="read-dup",
        name="read",
        arguments={"file_path": str(index_path)},
    )
    duplicate_message = (
        "[Skipped - duplicate action: Already read "
        f"{index_path} recently without any intervening changes; "
        "reuse the earlier read result instead of rereading]"
    )
    duplicate_outcome = ToolExecutionOutcome(
        tool_call=repeat_read,
        state=ToolExecutionState.DUPLICATE,
        message=Message.tool_result_message(
            tool_call_id=repeat_read.id,
            display_content=duplicate_message,
            result_content=duplicate_message,
        ),
        event_content=duplicate_message,
        is_error=False,
        result_output=duplicate_message,
    )
    executor = FakeExecutor([duplicate_outcome])

    summary = TurnSummary(final_response="")
    dod = create_definition_of_done("Fix the chapter links")
    dod.implementation_plan = str(implementation_plan)
    dod.pending_items.append("Create the remaining chapter files")

    await runner.execute_batch(
        tool_calls=[repeat_read],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=summary,
        dod=dod,
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert len(persistent_messages) == 1
    nudge = persistent_messages[0]
    assert "Reuse the earlier observation instead of repeating it." in nudge
    assert "A declared output artifact is still missing." in nudge
    assert "Resume by creating `04-variables.html` now." in nudge
    assert (
        "Prefer one `write` call for "
        f"`{display_runtime_path(missing_chapter)}` instead of more rereads."
        in nudge
    )
    assert ephemeral_messages == []
702
703
@pytest.mark.asyncio
async def test_tool_batch_runner_duplicate_read_keeps_root_declared_missing_html_output_active(
    temp_dir: Path,
) -> None:
    """A duplicate index read still steers toward the linked missing chapter.

    ``index.html`` links to ``chapters/02-installation.html``, which is not
    created here, and the plan lists only the index and the chapters directory.
    The single persistent nudge must name that chapter and must not claim that
    all planned artifacts exist.
    """

    async def assess_confidence(tool_name: str, tool_args: dict, context: str) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should not run for this scenario")

    async def verify_action(tool_name: str, tool_args: dict, result: str, expected: str = "") -> ActionVerification:
        raise AssertionError("Verification should not run for this scenario")

    guide_root = temp_dir / "guide"
    chapters = guide_root / "chapters"
    chapters.mkdir(parents=True)
    index = guide_root / "index.html"
    chapter_one = chapters / "01-introduction.html"
    index.write_text(
        '<a href="chapters/01-introduction.html">Intro</a>\n'
        '<a href="chapters/02-installation.html">Install</a>\n'
    )
    chapter_one.write_text("<h1>Intro</h1>\n")

    plan_lines = [
        "# Implementation Plan",
        "",
        "## File Changes",
        f"- `{index}`",
        f"- `{chapters}/` (directory for chapter files)",
    ]
    implementation_plan = temp_dir / "implementation.md"
    implementation_plan.write_text("\n".join(plan_lines))

    earlier_index_read = Message(
        role=Role.ASSISTANT,
        content="I should keep building the guide.",
        tool_calls=[
            ToolCall(
                id="read-index",
                name="read",
                arguments={"file_path": str(index)},
            )
        ],
    )
    context = build_context(
        temp_dir=temp_dir,
        messages=[earlier_index_read],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )
    context.session.current_task = f"Build the guide rooted at {index}."
    persistent_messages: list[str] = []
    ephemeral_messages: list[str] = []
    context.queue_steering_message_callback = persistent_messages.append
    context.queue_ephemeral_steering_message_callback = ephemeral_messages.append

    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))
    repeat_read = ToolCall(
        id="read-dup-rooted",
        name="read",
        arguments={"file_path": str(index)},
    )
    duplicate_message = (
        "[Skipped - duplicate action: Already read "
        f"{index} recently without any intervening changes; "
        "reuse the earlier read result instead of rereading]"
    )
    duplicate_outcome = ToolExecutionOutcome(
        tool_call=repeat_read,
        state=ToolExecutionState.DUPLICATE,
        message=Message.tool_result_message(
            tool_call_id=repeat_read.id,
            display_content=duplicate_message,
            result_content=duplicate_message,
        ),
        event_content=duplicate_message,
        is_error=False,
        result_output=duplicate_message,
    )
    executor = FakeExecutor([duplicate_outcome])

    summary = TurnSummary(final_response="")
    dod = create_definition_of_done("Create a multi-file HTML guide with chapters.")
    dod.implementation_plan = str(implementation_plan)
    dod.touched_files = [str(index), str(chapter_one)]
    dod.completed_items = ["Create chapter files with appropriate content"]
    dod.pending_items.append("Create the remaining chapter files")

    await runner.execute_batch(
        tool_calls=[repeat_read],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=summary,
        dod=dod,
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert len(persistent_messages) == 1
    nudge = persistent_messages[0]
    assert "Create the remaining chapter files" in nudge
    assert "Resume by creating `02-installation.html` now." in nudge
    assert "All explicitly planned artifacts already exist on disk." not in nudge
    assert ephemeral_messages == []
827
828
@pytest.mark.asyncio
async def test_tool_batch_runner_todo_write_does_not_regress_completed_file_todo(
    temp_dir: Path,
) -> None:
    """A stale ``TodoWrite`` must not move a finished file todo back to pending.

    The batch first reports a successful write of ``03-first-website.html`` and
    then a ``TodoWrite`` that still lists both chapters as pending. Afterwards
    chapter 3 must be completed and chapter 4 still pending.
    """

    async def assess_confidence(tool_name: str, tool_args: dict, context: str) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should not run for this scenario")

    async def verify_action(tool_name: str, tool_args: dict, result: str, expected: str = "") -> ActionVerification:
        raise AssertionError("Verification should not run for this scenario")

    def pending_todos() -> list[dict[str, str]]:
        # Fresh dicts on every call, so the three consumers never share state.
        return [
            {
                "content": "Create 03-first-website.html",
                "active_form": "Creating 03-first-website.html",
                "status": "pending",
            },
            {
                "content": "Create 04-configuration-basics.html",
                "active_form": "Creating 04-configuration-basics.html",
                "status": "pending",
            },
        ]

    context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))
    dod = create_definition_of_done("Create a multi-file nginx guide.")
    sync_todos_to_definition_of_done(dod, pending_todos())

    chapter_path = temp_dir / "guides" / "nginx" / "chapters" / "03-first-website.html"
    chapter_path.parent.mkdir(parents=True)
    write_call = ToolCall(
        id="write-ch3",
        name="write",
        arguments={"file_path": str(chapter_path), "content": "<html></html>\n"},
    )
    stale_todo_call = ToolCall(
        id="todo-stale",
        name="TodoWrite",
        arguments={"todos": pending_todos()},
    )
    write_outcome = tool_outcome(
        tool_call=write_call,
        output=f"Successfully wrote {chapter_path}",
        is_error=False,
    )
    stale_todo_outcome = tool_outcome(
        tool_call=stale_todo_call,
        output="Todos updated",
        is_error=False,
        metadata={"new_todos": pending_todos()},
    )
    executor = FakeExecutor([write_outcome, stale_todo_outcome])

    summary = TurnSummary(final_response="")
    await runner.execute_batch(
        tool_calls=[write_call, stale_todo_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=summary,
        dod=dod,
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert "Create 03-first-website.html" in dod.completed_items
    assert "Create 03-first-website.html" not in dod.pending_items
    assert "Create 04-configuration-basics.html" in dod.pending_items
946
947
@pytest.mark.asyncio
async def test_tool_batch_runner_proactively_queues_verified_html_inventory(
    temp_dir: Path,
) -> None:
    """A successful chapter glob queues no steering and adds no inventory text.

    NOTE(review): despite the test name, the assertions check that neither
    steering queue receives a message and that the single tool result message
    does not contain "Verified chapter inventory:".
    """

    async def assess_confidence(tool_name: str, tool_args: dict, context: str) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def verify_action(tool_name: str, tool_args: dict, result: str, expected: str = "") -> ActionVerification:
        raise AssertionError("Verification should not run for this scenario")

    chapters = temp_dir / "chapters"
    chapters.mkdir()
    intro_chapter = chapters / "01-introduction.html"
    setup_chapter = chapters / "02-setup.html"
    intro_chapter.write_text("<h1>Chapter 1: Introduction to Fortran</h1>\n")
    setup_chapter.write_text("<h1>Chapter 2: Setting Up Your Environment</h1>\n")
    index_path = temp_dir / "index.html"
    index_path.write_text("<ul></ul>\n")

    context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )
    context.session.current_task = (
        f"Update {index_path} so the chapter links match the sibling files."
    )
    persistent_messages: list[str] = []
    ephemeral_messages: list[str] = []
    context.queue_steering_message_callback = persistent_messages.append
    context.queue_ephemeral_steering_message_callback = ephemeral_messages.append

    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))
    glob_call = ToolCall(
        id="glob-1",
        name="glob",
        arguments={"path": str(chapters), "pattern": "*.html"},
    )
    glob_listing = "\n".join([str(intro_chapter), str(setup_chapter)])
    executor = FakeExecutor(
        [tool_outcome(tool_call=glob_call, output=glob_listing, is_error=False)]
    )

    summary = TurnSummary(final_response="")
    await runner.execute_batch(
        tool_calls=[glob_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=summary,
        dod=create_definition_of_done("Fix the chapter links"),
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert persistent_messages == []
    assert ephemeral_messages == []
    recorded_results = summary.tool_result_messages
    assert len(recorded_results) == 1
    assert "Verified chapter inventory:" not in recorded_results[0].content
1032
1033
@pytest.mark.asyncio
async def test_tool_batch_runner_marks_validated_html_toc_completion_after_successful_edit(
    temp_dir: Path,
) -> None:
    """A successful TOC edit adds no preview text and queues no steering.

    ``index.html`` already holds the corrected chapter list when the scripted
    ``edit`` outcome is replayed. The test then checks that no tool result
    message contains "Semantic verification preview:" and that neither the
    persistent nor the ephemeral steering queue received anything.
    """

    async def reject_confidence_scoring(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def reject_verification(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run for this scenario")

    # Real chapter files that the table of contents is expected to match.
    chapters_dir = temp_dir / "chapters"
    chapters_dir.mkdir()
    for file_name, heading in (
        ("01-introduction.html", "<h1>Chapter 1: Introduction to Fortran</h1>\n"),
        ("02-setup.html", "<h1>Chapter 2: Setting Up Your Environment</h1>\n"),
    ):
        (chapters_dir / file_name).write_text(heading)

    toc_path = temp_dir / "index.html"
    stale_toc = "".join(
        [
            '<ul class="chapter-list">\n',
            ' <li><a href="chapters/01-old.html">Chapter 1: Old</a></li>\n',
            ' <li><a href="chapters/02-old.html">Chapter 2: Old</a></li>\n',
            "</ul>\n",
        ]
    )
    fixed_toc = "".join(
        [
            '<ul class="chapter-list">\n',
            ' <li><a href="chapters/01-introduction.html">Chapter 1: Introduction to Fortran</a></li>\n',
            ' <li><a href="chapters/02-setup.html">Chapter 2: Setting Up Your Environment</a></li>\n',
            "</ul>\n",
        ]
    )
    # The executor is scripted, so the post-edit content is written up front.
    toc_path.write_text(fixed_toc)

    task = (
        "Update index.html so every chapter link and title matches the real HTML files in chapters/."
    )
    runtime_context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=reject_confidence_scoring,
        verify_action=reject_verification,
        auto_recover=False,
    )
    runtime_context.session.current_task = task

    persistent_queue: list[str] = []
    ephemeral_queue: list[str] = []
    runtime_context.queue_steering_message_callback = persistent_queue.append
    runtime_context.queue_ephemeral_steering_message_callback = ephemeral_queue.append

    runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))
    edit_call = ToolCall(
        id="edit-1",
        name="edit",
        arguments={
            "file_path": str(toc_path),
            "old_string": stale_toc,
            "new_string": fixed_toc,
        },
    )
    scripted_outcome = tool_outcome(
        tool_call=edit_call,
        output=f"Successfully edited {toc_path}",
        is_error=False,
    )
    executor = FakeExecutor([scripted_outcome])

    summary = TurnSummary(final_response="")
    dod = create_definition_of_done(task)
    await runner.execute_batch(
        tool_calls=[edit_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=summary,
        dod=dod,
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert not any(
        "Semantic verification preview:" in result_message.content
        for result_message in summary.tool_result_messages
    )
    assert persistent_queue == []
    assert ephemeral_queue == []
1134
1135
@pytest.mark.asyncio
async def test_tool_batch_runner_does_not_apply_html_toc_handoff_to_reference_read(
    temp_dir: Path,
) -> None:
    """Reading an ``index.html`` for reference queues no steering messages.

    The task prompt asks to study an existing guide before writing a new one.
    After the scripted ``read`` of ``index.html`` the test checks that both
    steering queues are empty and that no tool result message contains
    "Semantic verification preview:".
    """

    async def reject_confidence_scoring(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def reject_verification(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run for this scenario")

    chapters_dir = temp_dir / "chapters"
    chapters_dir.mkdir()
    for file_name, heading in (
        ("01-introduction.html", "<h1>Chapter 1: Introduction to Fortran</h1>\n"),
        ("02-setup.html", "<h1>Chapter 2: Setting Up Your Environment</h1>\n"),
    ):
        (chapters_dir / file_name).write_text(heading)

    toc_path = temp_dir / "index.html"
    toc_markup = "".join(
        [
            "<h2>Table of Contents</h2>\n",
            '<ul class="chapter-list">\n',
            ' <li><a href="chapters/01-introduction.html">Chapter 1: Introduction to Fortran</a></li>\n',
            ' <li><a href="chapters/02-setup.html">Chapter 2: Setting Up Your Environment</a></li>\n',
            "</ul>\n",
        ]
    )
    toc_path.write_text(toc_markup)

    prompt = (
        "Have a look at ~/Loader/guides/fortran and chapters/ within. Get a feel "
        "for the structure and cadence of the guide. We are going to make an all "
        "new equally thorough guide on how to use the nginx tool."
    )

    runtime_context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=reject_confidence_scoring,
        verify_action=reject_verification,
        auto_recover=False,
    )
    runtime_context.session.current_task = prompt  # type: ignore[attr-defined]

    persistent_queue: list[str] = []
    ephemeral_queue: list[str] = []
    runtime_context.queue_steering_message_callback = persistent_queue.append
    runtime_context.queue_ephemeral_steering_message_callback = ephemeral_queue.append

    runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))
    read_call = ToolCall(
        id="read-index",
        name="read",
        arguments={"file_path": str(toc_path)},
    )
    # The scripted result echoes the file exactly as it sits on disk.
    scripted_outcome = tool_outcome(
        tool_call=read_call,
        output=toc_path.read_text(),
        is_error=False,
    )
    executor = FakeExecutor([scripted_outcome])

    summary = TurnSummary(final_response="")
    dod = create_definition_of_done(prompt)
    await runner.execute_batch(
        tool_calls=[read_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=summary,
        dod=dod,
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert persistent_queue == []
    assert ephemeral_queue == []
    assert not any(
        "Semantic verification preview:" in result_message.content
        for result_message in summary.tool_result_messages
    )
1228
1229
@pytest.mark.asyncio
async def test_tool_batch_runner_queues_next_pending_todo_after_discovery_progress(
    temp_dir: Path,
) -> None:
    """A reference read completes the discovery todo and queues the next one.

    After the scripted ``read`` of a Fortran chapter, the test expects the
    "Examine ..." todo in ``dod.completed_items`` and a persistent steering
    message that names the next pending todo and tells the agent to create
    ``chapters/``, without mentioning the reference file that was read.
    """

    async def reject_confidence_scoring(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def reject_verification(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run for this scenario")

    reference_markup = "<h1>Introduction</h1>\n<p>Guide cadence.</p>\n"
    reference_path = temp_dir / "fortran" / "chapters" / "01-introduction.html"
    reference_path.parent.mkdir(parents=True)
    reference_path.write_text(reference_markup)

    # Planned targets are listed in the plan but never created on disk here.
    nginx_root = temp_dir / "Loader" / "guides" / "nginx"
    planned_chapters_dir = nginx_root / "chapters"
    plan_path = temp_dir / "implementation.md"
    plan_lines = [
        "# Implementation Plan",
        "",
        "## File Changes",
        f"- `{planned_chapters_dir}/`",
        f"- `{nginx_root / 'index.html'}`",
        "",
    ]
    plan_path.write_text("\n".join(plan_lines))

    runtime_context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=reject_confidence_scoring,
        verify_action=reject_verification,
        auto_recover=False,
    )
    persistent_queue: list[str] = []
    ephemeral_queue: list[str] = []
    runtime_context.queue_steering_message_callback = persistent_queue.append
    runtime_context.queue_ephemeral_steering_message_callback = ephemeral_queue.append

    runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))
    dod = create_definition_of_done("Create an equally thorough nginx guide.")
    dod.implementation_plan = str(plan_path)

    discovery_todo = (
        "Examine the existing Fortran guide structure to understand the cadence and format"
    )
    todo_texts = (
        discovery_todo,
        "Create the nginx directory structure",
        "Create the nginx index.html file",
    )
    todos = [
        {
            "content": text,
            "active_form": f"Working on: {text}",
            "status": "pending",
        }
        for text in todo_texts
    ]
    sync_todos_to_definition_of_done(dod, todos)

    read_call = ToolCall(
        id="read-reference",
        name="read",
        arguments={"file_path": str(reference_path)},
    )
    scripted_outcome = tool_outcome(
        tool_call=read_call,
        output="<h1>Introduction</h1>\n<p>Guide cadence.</p>\n",
        is_error=False,
    )
    executor = FakeExecutor([scripted_outcome])

    summary = TurnSummary(final_response="")
    await runner.execute_batch(
        tool_calls=[read_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=summary,
        dod=dod,
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert discovery_todo in dod.completed_items
    assert any(
        "Continue with the next pending item: `Create the nginx directory structure`"
        in queued
        for queued in persistent_queue
    )
    assert any(
        "Resume by creating `chapters/` now." in queued
        for queued in persistent_queue
    )
    assert not any("01-introduction.html" in queued for queued in persistent_queue)
    assert ephemeral_queue == []
1348
1349
@pytest.mark.asyncio
async def test_tool_batch_runner_queues_setup_directory_before_file_when_plan_lists_index_first(
    temp_dir: Path,
) -> None:
    """Directory creation is still suggested first when the plan lists index.html first.

    The implementation plan names ``index.html`` before ``chapters/``. After
    the scripted reference ``read``, the persistent steering messages must
    point at the directory-structure todo and ``chapters/``, and must not
    contain "Next step: create `index.html`.".
    """

    async def reject_confidence_scoring(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def reject_verification(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run for this scenario")

    reference_path = temp_dir / "fortran" / "chapters" / "01-introduction.html"
    reference_path.parent.mkdir(parents=True)
    reference_path.write_text("<h1>Introduction</h1>\n<p>Guide cadence.</p>\n")

    nginx_root = temp_dir / "Loader" / "guides" / "nginx"
    planned_chapters_dir = nginx_root / "chapters"
    plan_path = temp_dir / "implementation.md"
    # The file entry deliberately precedes the directory entry.
    plan_lines = [
        "# Implementation Plan",
        "",
        "## File Changes",
        f"- `{nginx_root / 'index.html'}`",
        f"- `{planned_chapters_dir}/`",
        "",
    ]
    plan_path.write_text("\n".join(plan_lines))

    runtime_context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=reject_confidence_scoring,
        verify_action=reject_verification,
        auto_recover=False,
    )
    persistent_queue: list[str] = []
    ephemeral_queue: list[str] = []
    runtime_context.queue_steering_message_callback = persistent_queue.append
    runtime_context.queue_ephemeral_steering_message_callback = ephemeral_queue.append

    runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))
    dod = create_definition_of_done("Create an equally thorough nginx guide.")
    dod.implementation_plan = str(plan_path)

    todo_texts = (
        "Examine the existing Fortran guide structure to understand the cadence and format",
        "Create the nginx directory structure",
        "Create the nginx index.html file",
    )
    todos = [
        {
            "content": text,
            "active_form": f"Working on: {text}",
            "status": "pending",
        }
        for text in todo_texts
    ]
    sync_todos_to_definition_of_done(dod, todos, project_root=temp_dir)

    read_call = ToolCall(
        id="read-reference-index-first",
        name="read",
        arguments={"file_path": str(reference_path)},
    )
    scripted_outcome = tool_outcome(
        tool_call=read_call,
        output="<h1>Introduction</h1>\n<p>Guide cadence.</p>\n",
        is_error=False,
    )
    executor = FakeExecutor([scripted_outcome])

    summary = TurnSummary(final_response="")
    await runner.execute_batch(
        tool_calls=[read_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=summary,
        dod=dod,
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert persistent_queue
    assert any(
        "Continue with the next pending item: `Create the nginx directory structure`"
        in queued
        for queued in persistent_queue
    )
    assert any(
        "Resume by creating `chapters/` now." in queued
        for queued in persistent_queue
    )
    assert not any(
        "Next step: create `index.html`." in queued
        for queued in persistent_queue
    )
    assert ephemeral_queue == []
1469
1470
@pytest.mark.asyncio
async def test_tool_batch_runner_duplicate_reference_read_prefers_next_pending_todo(
    temp_dir: Path,
) -> None:
    """A duplicate reference read yields one nudge toward the next pending todo.

    The executor reports the ``read`` as ``DUPLICATE``. Exactly one persistent
    steering message is expected: it tells the agent to reuse the earlier
    observation and continue with the directory-structure todo, and it does
    not contain "Update `".
    """

    async def reject_confidence_scoring(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def reject_verification(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run for this scenario")

    reference_path = temp_dir / "fortran" / "index.html"
    reference_path.parent.mkdir(parents=True)
    reference_path.write_text("<h1>Fortran Beginner's Guide</h1>\n")

    # Session history already holds the earlier read observation.
    earlier_observation = Message(
        role=Role.TOOL,
        content="Observation [read]: Result: " + "<h1>Fortran Beginner's Guide</h1>\n",
    )
    runtime_context = build_context(
        temp_dir=temp_dir,
        messages=[earlier_observation],
        safeguards=FakeSafeguards(),
        assess_confidence=reject_confidence_scoring,
        verify_action=reject_verification,
        auto_recover=False,
    )
    prompt = (
        "Have a look at ~/Loader/guides/fortran and chapters/ within. Get a feel "
        "for the structure and cadence of the guide. We are going to make an all "
        "new equally thorough guide on how to use the nginx tool."
    )
    runtime_context.session.current_task = prompt

    persistent_queue: list[str] = []
    ephemeral_queue: list[str] = []
    runtime_context.queue_steering_message_callback = persistent_queue.append
    runtime_context.queue_ephemeral_steering_message_callback = ephemeral_queue.append

    runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))
    dod = create_definition_of_done(prompt)

    todo_states = (
        (
            "Examine the existing Fortran guide structure to understand the cadence and format",
            "completed",
        ),
        ("Create the nginx directory structure", "pending"),
        ("Create the nginx index.html file", "pending"),
    )
    todos = [
        {
            "content": text,
            "active_form": f"Working on: {text}",
            "status": state,
        }
        for text, state in todo_states
    ]
    sync_todos_to_definition_of_done(dod, todos)

    read_call = ToolCall(
        id="read-dup",
        name="read",
        arguments={"file_path": str(reference_path)},
    )
    skip_notice = (
        "[Skipped - duplicate action: Already read "
        f"{reference_path} recently without any intervening changes; "
        "reuse the earlier read result instead of rereading]"
    )
    skip_result = Message.tool_result_message(
        tool_call_id=read_call.id,
        display_content=skip_notice,
        result_content=skip_notice,
    )
    duplicate_outcome = ToolExecutionOutcome(
        tool_call=read_call,
        state=ToolExecutionState.DUPLICATE,
        message=skip_result,
        event_content=skip_notice,
        is_error=False,
        result_output=skip_notice,
    )
    executor = FakeExecutor([duplicate_outcome])

    summary = TurnSummary(final_response="")
    await runner.execute_batch(
        tool_calls=[read_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=summary,
        dod=dod,
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert len(persistent_queue) == 1
    nudge = persistent_queue[0]
    assert "Reuse the earlier observation instead of repeating it." in nudge
    assert (
        "Continue with the next pending item: `Create the nginx directory structure`"
        in nudge
    )
    assert "Update `" not in nudge
    assert ephemeral_queue == []
1593
1594
@pytest.mark.asyncio
async def test_tool_batch_runner_successful_reference_read_prioritizes_concrete_missing_artifact(
    temp_dir: Path,
) -> None:
    """A planned-but-missing file wins over the next generic todo.

    The first chapter exists on disk while ``index.html`` from the plan does
    not. After a successful reference ``read``, the persistent steering must
    confirm the discovery todo and tell the agent to create ``index.html``,
    and must not fall back to the "Create each chapter file ..." todo.
    """

    async def reject_confidence_scoring(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def reject_verification(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run for this scenario")

    guides_dir = temp_dir / "Loader" / "guides"
    guide_root = guides_dir / "nginx"
    chapters_dir = guide_root / "chapters"
    chapters_dir.mkdir(parents=True)
    chapter_one = chapters_dir / "01-introduction.html"
    chapter_one.write_text("<html></html>\n")
    # index.html is only named in the plan; it is never written here.
    index_path = guide_root / "index.html"

    reference_path = guides_dir / "fortran" / "chapters" / "01-introduction.html"
    reference_path.parent.mkdir(parents=True, exist_ok=True)
    reference_path.write_text("<h1>Introduction</h1>\n<p>Guide cadence.</p>\n")

    plan_path = temp_dir / "implementation.md"
    plan_lines = [
        "# Implementation Plan",
        "",
        "## File Changes",
        f"- `{guide_root}/`",
        f"- `{chapters_dir}/`",
        f"- `{index_path}`",
        f"- `{chapter_one}`",
        f"- `{chapters_dir / '02-installation.html'}`",
        "",
    ]
    plan_path.write_text("\n".join(plan_lines))

    runtime_context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=reject_confidence_scoring,
        verify_action=reject_verification,
        auto_recover=False,
    )
    persistent_queue: list[str] = []
    ephemeral_queue: list[str] = []
    runtime_context.queue_steering_message_callback = persistent_queue.append
    runtime_context.queue_ephemeral_steering_message_callback = ephemeral_queue.append

    runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))
    dod = create_definition_of_done("Create a multi-file nginx guide.")
    dod.implementation_plan = str(plan_path)
    dod.touched_files.append(str(chapter_one))

    todo_texts = (
        "Examine the existing Fortran guide structure to understand the format and cadence",
        "Create each chapter file with appropriate content",
        "Ensure all files follow the same structure and style as the Fortran guide",
    )
    todos = [
        {
            "content": text,
            "active_form": f"Working on: {text}",
            "status": "pending",
        }
        for text in todo_texts
    ]
    sync_todos_to_definition_of_done(dod, todos)

    read_call = ToolCall(
        id="read-reference-chapter",
        name="read",
        arguments={"file_path": str(reference_path)},
    )
    observation = "Observation [read]: Result: <h1>Introduction</h1>\n<p>Guide cadence.</p>\n"
    read_result = Message.tool_result_message(
        tool_call_id=read_call.id,
        display_content=observation,
        result_content=observation,
    )
    executed_outcome = ToolExecutionOutcome(
        tool_call=read_call,
        state=ToolExecutionState.EXECUTED,
        message=read_result,
        event_content=observation,
        is_error=False,
        result_output=observation,
    )
    executor = FakeExecutor([executed_outcome])

    summary = TurnSummary(final_response="")
    await runner.execute_batch(
        tool_calls=[read_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=summary,
        dod=dod,
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert persistent_queue
    assert any(
        "Confirmed progress: `Examine the existing Fortran guide structure to understand the format and cadence`"
        in queued
        for queued in persistent_queue
    )
    assert any(
        "Resume by creating `index.html` now." in queued
        for queued in persistent_queue
    )
    assert all(
        "Continue with the next pending item: `Create each chapter file with appropriate content`"
        not in queued
        for queued in persistent_queue
    )
    assert ephemeral_queue == []
1729
1730
@pytest.mark.asyncio
async def test_tool_batch_runner_duplicate_read_ignores_unplanned_expansion_after_plan_complete(
    temp_dir: Path,
) -> None:
    """With the plan built, a duplicate read skips todos that add unplanned files.

    Every artifact in the implementation plan exists on disk. The pending
    items include a file that is not in the plan plus a verification item. The
    single persistent steering message must mention the verification item and
    must not mention the unplanned file.
    """

    async def reject_confidence_scoring(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should not run for this scenario")

    async def reject_verification(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run for this scenario")

    guide_root = temp_dir / "guides" / "nginx"
    chapters_dir = guide_root / "chapters"
    guide_root.mkdir(parents=True)
    chapters_dir.mkdir()
    index_path = guide_root / "index.html"
    chapter_one = chapters_dir / "01-getting-started.html"
    chapter_two = chapters_dir / "02-installation.html"
    for target, markup in (
        (index_path, "<html></html>\n"),
        (chapter_one, "<h1>One</h1>\n"),
        (chapter_two, "<h1>Two</h1>\n"),
    ):
        target.write_text(markup)

    plan_path = temp_dir / "implementation.md"
    planned_dirs = [f"- `{directory}/`" for directory in (guide_root, chapters_dir)]
    planned_files = [
        f"- `{artifact}`" for artifact in (index_path, chapter_one, chapter_two)
    ]
    plan_lines = [
        "# Implementation Plan",
        "",
        "## File Changes",
        *planned_dirs,
        *planned_files,
        "",
    ]
    plan_path.write_text("\n".join(plan_lines))

    runtime_context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=reject_confidence_scoring,
        verify_action=reject_verification,
        auto_recover=False,
    )
    persistent_queue: list[str] = []
    ephemeral_queue: list[str] = []
    runtime_context.queue_steering_message_callback = persistent_queue.append
    runtime_context.queue_ephemeral_steering_message_callback = ephemeral_queue.append

    runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))
    dod = create_definition_of_done("Create a multi-file nginx guide.")
    dod.implementation_plan = str(plan_path)
    # The first pending item targets a file that the plan never lists.
    dod.pending_items = [
        "Create 07-performance-tuning.html",
        "Verify all guide files are linked and complete",
        "Complete the requested work",
    ]

    read_call = ToolCall(
        id="read-dup",
        name="read",
        arguments={"file_path": str(chapter_one)},
    )
    skip_notice = (
        "[Skipped - duplicate action: Already read "
        f"{chapter_one} recently without any intervening changes; "
        "reuse the earlier read result instead of rereading]"
    )
    skip_result = Message.tool_result_message(
        tool_call_id=read_call.id,
        display_content=skip_notice,
        result_content=skip_notice,
    )
    duplicate_outcome = ToolExecutionOutcome(
        tool_call=read_call,
        state=ToolExecutionState.DUPLICATE,
        message=skip_result,
        event_content=skip_notice,
        is_error=False,
        result_output=skip_notice,
    )
    executor = FakeExecutor([duplicate_outcome])

    summary = TurnSummary(final_response="")
    await runner.execute_batch(
        tool_calls=[read_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=summary,
        dod=dod,
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert len(persistent_queue) == 1
    nudge = persistent_queue[0]
    assert "Verify all guide files are linked and complete" in nudge
    assert "Create 07-performance-tuning.html" not in nudge
    assert ephemeral_queue == []
1845
1846
@pytest.mark.asyncio
async def test_tool_batch_runner_duplicate_read_after_plan_complete_pushes_verification_handoff(
    temp_dir: Path,
) -> None:
    """With the plan built, a duplicate read hands off to verification.

    All planned artifacts exist and the definition of done carries a
    verification command. The single persistent steering message must say the
    planned artifacts already exist and direct the agent to verification or
    final confirmation, without mentioning the unplanned pending file.
    """

    async def reject_confidence_scoring(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should not run for this scenario")

    async def reject_verification(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run for this scenario")

    guide_root = temp_dir / "guides" / "nginx"
    chapters_dir = guide_root / "chapters"
    guide_root.mkdir(parents=True)
    chapters_dir.mkdir()
    index_path = guide_root / "index.html"
    chapter_one = chapters_dir / "01-getting-started.html"
    chapter_two = chapters_dir / "02-installation.html"
    for target, markup in (
        (index_path, "<html></html>\n"),
        (chapter_one, "<h1>One</h1>\n"),
        (chapter_two, "<h1>Two</h1>\n"),
    ):
        target.write_text(markup)

    plan_path = temp_dir / "implementation.md"
    planned_dirs = [f"- `{directory}/`" for directory in (guide_root, chapters_dir)]
    planned_files = [
        f"- `{artifact}`" for artifact in (index_path, chapter_one, chapter_two)
    ]
    plan_lines = [
        "# Implementation Plan",
        "",
        "## File Changes",
        *planned_dirs,
        *planned_files,
        "",
    ]
    plan_path.write_text("\n".join(plan_lines))

    runtime_context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=reject_confidence_scoring,
        verify_action=reject_verification,
        auto_recover=False,
    )
    persistent_queue: list[str] = []
    ephemeral_queue: list[str] = []
    runtime_context.queue_steering_message_callback = persistent_queue.append
    runtime_context.queue_ephemeral_steering_message_callback = ephemeral_queue.append

    runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))
    dod = create_definition_of_done("Create a multi-file nginx guide.")
    dod.implementation_plan = str(plan_path)
    dod.verification_commands = [f"ls -la {guide_root}"]
    # The only creation todo left targets a file outside the plan.
    dod.pending_items = [
        "Create 07-performance-tuning.html",
        "Complete the requested work",
    ]

    read_call = ToolCall(
        id="read-dup",
        name="read",
        arguments={"file_path": str(chapter_one)},
    )
    skip_notice = (
        "[Skipped - duplicate action: Already read "
        f"{chapter_one} recently without any intervening changes; "
        "reuse the earlier read result instead of rereading]"
    )
    skip_result = Message.tool_result_message(
        tool_call_id=read_call.id,
        display_content=skip_notice,
        result_content=skip_notice,
    )
    duplicate_outcome = ToolExecutionOutcome(
        tool_call=read_call,
        state=ToolExecutionState.DUPLICATE,
        message=skip_result,
        event_content=skip_notice,
        is_error=False,
        result_output=skip_notice,
    )
    executor = FakeExecutor([duplicate_outcome])

    summary = TurnSummary(final_response="")
    await runner.execute_batch(
        tool_calls=[read_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=summary,
        dod=dod,
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert len(persistent_queue) == 1
    handoff = persistent_queue[0]
    assert "All explicitly planned artifacts already exist on disk." in handoff
    assert (
        "Move to verification or final confirmation using the files already on disk."
        in handoff
    )
    assert "Create 07-performance-tuning.html" not in handoff
    assert ephemeral_queue == []
1965
1966
@pytest.mark.asyncio
async def test_tool_batch_runner_duplicate_read_after_plan_complete_ignores_stale_creation_todos(
    temp_dir: Path,
) -> None:
    """Creation todos for files that already exist are left out of the handoff.

    The pending items still say "Create"/"Creating" for two chapter files that
    are already on disk. After a duplicate ``read``, the single persistent
    steering message must be the verification handoff and must not repeat
    either stale creation todo.
    """

    async def reject_confidence_scoring(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should not run for this scenario")

    async def reject_verification(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run for this scenario")

    guide_root = temp_dir / "guides" / "nginx"
    chapters_dir = guide_root / "chapters"
    guide_root.mkdir(parents=True)
    chapters_dir.mkdir()
    index_path = guide_root / "index.html"
    chapter_one = chapters_dir / "01-getting-started.html"
    chapter_two = chapters_dir / "02-installation.html"
    for target, markup in (
        (index_path, "<html></html>\n"),
        (chapter_one, "<h1>One</h1>\n"),
        (chapter_two, "<h1>Two</h1>\n"),
    ):
        target.write_text(markup)

    plan_path = temp_dir / "implementation.md"
    planned_dirs = [f"- `{directory}/`" for directory in (guide_root, chapters_dir)]
    planned_files = [
        f"- `{artifact}`" for artifact in (index_path, chapter_one, chapter_two)
    ]
    plan_lines = [
        "# Implementation Plan",
        "",
        "## File Changes",
        *planned_dirs,
        *planned_files,
        "",
    ]
    plan_path.write_text("\n".join(plan_lines))

    runtime_context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=reject_confidence_scoring,
        verify_action=reject_verification,
        auto_recover=False,
    )
    persistent_queue: list[str] = []
    ephemeral_queue: list[str] = []
    runtime_context.queue_steering_message_callback = persistent_queue.append
    runtime_context.queue_ephemeral_steering_message_callback = ephemeral_queue.append

    runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))
    dod = create_definition_of_done("Create a multi-file nginx guide.")
    dod.implementation_plan = str(plan_path)
    dod.verification_commands = [f"ls -la {guide_root}"]
    # Both creation todos refer to chapter files written above.
    dod.pending_items = [
        "Create 01-getting-started.html",
        "Creating 02-installation.html",
        "Complete the requested work",
    ]

    read_call = ToolCall(
        id="read-dup-built-stale",
        name="read",
        arguments={"file_path": str(chapter_one)},
    )
    skip_notice = (
        "[Skipped - duplicate action: Already read "
        f"{chapter_one} recently without any intervening changes; "
        "reuse the earlier read result instead of rereading]"
    )
    skip_result = Message.tool_result_message(
        tool_call_id=read_call.id,
        display_content=skip_notice,
        result_content=skip_notice,
    )
    duplicate_outcome = ToolExecutionOutcome(
        tool_call=read_call,
        state=ToolExecutionState.DUPLICATE,
        message=skip_result,
        event_content=skip_notice,
        is_error=False,
        result_output=skip_notice,
    )
    executor = FakeExecutor([duplicate_outcome])

    summary = TurnSummary(final_response="")
    await runner.execute_batch(
        tool_calls=[read_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=summary,
        dod=dod,
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert len(persistent_queue) == 1
    handoff = persistent_queue[0]
    assert "All explicitly planned artifacts already exist on disk." in handoff
    assert (
        "Move to verification or final confirmation using the files already on disk."
        in handoff
    )
    assert "Create 01-getting-started.html" not in handoff
    assert "Creating 02-installation.html" not in handoff
    assert ephemeral_queue == []
2087
2088
@pytest.mark.asyncio
async def test_tool_batch_runner_successful_read_after_plan_complete_pushes_review_handoff(
    temp_dir: Path,
) -> None:
    """A read after every planned artifact exists yields one ephemeral review handoff.

    Every path listed in the implementation plan is created up front. Two todos
    are pending: one naming a chapter file that already exists and one review
    item. After a successful ``read`` of that chapter, the test expects no
    persistent steering message and exactly one ephemeral message that names
    the review item but not the already-satisfied creation item.
    """

    # Both reasoning hooks must stay untouched; reaching either fails the test.
    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should not run for this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run for this scenario")

    # Materialise every artifact the plan mentions before the batch runs.
    guide_dir = temp_dir / "guides" / "nginx"
    chapters_dir = guide_dir / "chapters"
    guide_dir.mkdir(parents=True)
    chapters_dir.mkdir()
    index_file = guide_dir / "index.html"
    first_chapter = chapters_dir / "01-getting-started.html"
    second_chapter = chapters_dir / "02-installation.html"
    for artifact, html in (
        (index_file, "<html></html>\n"),
        (first_chapter, "<h1>One</h1>\n"),
        (second_chapter, "<h1>Two</h1>\n"),
    ):
        artifact.write_text(html)

    plan_lines = [
        "# Implementation Plan",
        "",
        "## File Changes",
        f"- `{guide_dir}/`",
        f"- `{chapters_dir}/`",
        f"- `{index_file}`",
        f"- `{first_chapter}`",
        f"- `{second_chapter}`",
        "",
    ]
    plan_file = temp_dir / "implementation.md"
    plan_file.write_text("\n".join(plan_lines))

    runtime_context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )
    # Capture both steering channels separately so the test can tell them apart.
    persistent_log: list[str] = []
    ephemeral_log: list[str] = []
    runtime_context.queue_steering_message_callback = persistent_log.append
    runtime_context.queue_ephemeral_steering_message_callback = ephemeral_log.append
    runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))

    dod = create_definition_of_done("Create a multi-file nginx guide.")
    dod.implementation_plan = str(plan_file)
    dod.verification_commands = [f"ls -la {guide_dir}"]
    pending_todos = [
        {
            "content": "Create 01-getting-started.html",
            "active_form": "Creating 01-getting-started.html",
            "status": "pending",
        },
        {
            "content": "Ensure all files are properly linked and formatted consistently",
            "active_form": "Reviewing guide consistency and linkage",
            "status": "pending",
        },
    ]
    sync_todos_to_definition_of_done(dod, pending_todos)

    read_call = ToolCall(
        id="read-built-review",
        name="read",
        arguments={"file_path": str(first_chapter)},
    )
    read_outcome = tool_outcome(
        tool_call=read_call,
        output=first_chapter.read_text(),
        is_error=False,
    )
    executor = FakeExecutor([read_outcome])

    turn_summary = TurnSummary(final_response="")
    await runner.execute_batch(
        tool_calls=[read_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=turn_summary,
        dod=dod,
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert persistent_log == []
    assert len(ephemeral_log) == 1
    handoff = ephemeral_log[0]
    for expected_fragment in (
        "All explicitly planned artifacts already exist.",
        "Ensure all files are properly linked and formatted consistently",
        "do not keep broad-rereading the output set",
        "If no specific mismatch remains, move to verification now.",
    ):
        assert expected_fragment in handoff
    # The creation todo is already satisfied on disk, so it must not be echoed.
    assert "Create 01-getting-started.html" not in handoff
2200
2201
@pytest.mark.asyncio
async def test_tool_batch_runner_successful_read_after_plan_complete_switches_to_verify(
    temp_dir: Path,
) -> None:
    """A read with all planned artifacts present and no todos moves to verify mode.

    All files from the implementation plan exist and no todo items are synced
    into the definition of done. After a successful ``read``, the test expects
    a single persistent steering message announcing verification, no ephemeral
    message, and ``workflow_mode`` on the context equal to ``"verify"``.
    """

    # Both reasoning hooks must stay untouched; reaching either fails the test.
    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should not run for this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run for this scenario")

    # Materialise every artifact the plan mentions before the batch runs.
    guide_dir = temp_dir / "guides" / "nginx"
    chapters_dir = guide_dir / "chapters"
    guide_dir.mkdir(parents=True)
    chapters_dir.mkdir()
    index_file = guide_dir / "index.html"
    first_chapter = chapters_dir / "01-getting-started.html"
    second_chapter = chapters_dir / "02-installation.html"
    for artifact, html in (
        (index_file, "<html></html>\n"),
        (first_chapter, "<h1>One</h1>\n"),
        (second_chapter, "<h1>Two</h1>\n"),
    ):
        artifact.write_text(html)

    plan_lines = [
        "# Implementation Plan",
        "",
        "## File Changes",
        f"- `{guide_dir}/`",
        f"- `{chapters_dir}/`",
        f"- `{index_file}`",
        f"- `{first_chapter}`",
        f"- `{second_chapter}`",
        "",
    ]
    plan_file = temp_dir / "implementation.md"
    plan_file.write_text("\n".join(plan_lines))

    runtime_context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )
    # Capture both steering channels separately so the test can tell them apart.
    persistent_log: list[str] = []
    ephemeral_log: list[str] = []
    runtime_context.queue_steering_message_callback = persistent_log.append
    runtime_context.queue_ephemeral_steering_message_callback = ephemeral_log.append
    runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))

    dod = create_definition_of_done("Create a multi-file nginx guide.")
    dod.implementation_plan = str(plan_file)
    dod.verification_commands = [f"ls -la {guide_dir}"]

    read_call = ToolCall(
        id="read-built-verify",
        name="read",
        arguments={"file_path": str(first_chapter)},
    )
    read_outcome = tool_outcome(
        tool_call=read_call,
        output=first_chapter.read_text(),
        is_error=False,
    )
    executor = FakeExecutor([read_outcome])

    turn_summary = TurnSummary(final_response="")
    await runner.execute_batch(
        tool_calls=[read_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=turn_summary,
        dod=dod,
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert len(persistent_log) == 1
    verify_nudge = persistent_log[0]
    for expected_fragment in (
        "All explicitly planned artifacts already exist.",
        "Verification should run next.",
        "stop broad rereads",
    ):
        assert expected_fragment in verify_nudge
    assert ephemeral_log == []
    assert runtime_context.workflow_mode == "verify"
2296
2297
@pytest.mark.asyncio
async def test_tool_batch_runner_observation_handoff_pushes_mutation_step(
    temp_dir: Path,
) -> None:
    """Reading reference material hands off to the next pending creation todo.

    The todo list starts with an "examine" item followed by a "create" item.
    After a successful ``read`` of a reference file, the test expects some
    persistent steering message to name the creation item and to tell the
    agent to stop gathering reference material, with nothing queued on the
    ephemeral channel.
    """

    # Both reasoning hooks must stay untouched; reaching either fails the test.
    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run for this scenario")

    reference_html = "<h1>Introduction</h1>\n<p>Guide cadence.</p>\n"
    reference_file = temp_dir / "fortran" / "chapters" / "01-introduction.html"
    reference_file.parent.mkdir(parents=True)
    reference_file.write_text(reference_html)

    runtime_context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )
    # Capture both steering channels separately so the test can tell them apart.
    persistent_log: list[str] = []
    ephemeral_log: list[str] = []
    runtime_context.queue_steering_message_callback = persistent_log.append
    runtime_context.queue_ephemeral_steering_message_callback = ephemeral_log.append
    runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))

    dod = create_definition_of_done("Create a multi-file nginx guide.")
    todo_contents = (
        "Examine the existing Fortran guide structure to understand the cadence and format",
        "Create the nginx index.html file",
    )
    sync_todos_to_definition_of_done(
        dod,
        [
            {
                "content": todo_text,
                "active_form": f"Working on: {todo_text}",
                "status": "pending",
            }
            for todo_text in todo_contents
        ],
    )

    read_call = ToolCall(
        id="read-reference",
        name="read",
        arguments={"file_path": str(reference_file)},
    )
    read_outcome = tool_outcome(
        tool_call=read_call,
        output=reference_html,
        is_error=False,
    )
    executor = FakeExecutor([read_outcome])

    turn_summary = TurnSummary(final_response="")
    await runner.execute_batch(
        tool_calls=[read_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=turn_summary,
        dod=dod,
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    # Each fragment only has to appear in at least one persistent message.
    next_item_hint = (
        "Continue with the next pending item: `Create the nginx index.html file`"
    )
    stop_reading_hint = "stop gathering more reference material and perform the change now"
    assert any(next_item_hint in queued for queued in persistent_log)
    assert any(stop_reading_hint in queued for queued in persistent_log)
    assert ephemeral_log == []
2390
2391
@pytest.mark.asyncio
async def test_tool_batch_runner_discovery_completion_handoff_stays_persistent(
    temp_dir: Path,
) -> None:
    """The handoff from a discovery todo to the next todo uses the persistent channel.

    The first todo is an "examine" item and the second asks for the nginx
    directory structure. After a successful ``read`` of a reference file, the
    test expects at least one persistent steering message naming the second
    todo, and an empty ephemeral queue.
    """

    # Both reasoning hooks must stay untouched; reaching either fails the test.
    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run for this scenario")

    reference_html = "<h1>Introduction</h1>\n<p>Guide cadence.</p>\n"
    reference_file = temp_dir / "fortran" / "chapters" / "01-introduction.html"
    reference_file.parent.mkdir(parents=True)
    reference_file.write_text(reference_html)

    runtime_context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )
    # Capture both steering channels separately so the test can tell them apart.
    persistent_log: list[str] = []
    ephemeral_log: list[str] = []
    runtime_context.queue_steering_message_callback = persistent_log.append
    runtime_context.queue_ephemeral_steering_message_callback = ephemeral_log.append
    runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))

    dod = create_definition_of_done("Create a multi-file nginx guide.")
    todo_contents = (
        "First, examine the existing fortran guide structure and content",
        "Create the nginx directory structure",
    )
    sync_todos_to_definition_of_done(
        dod,
        [
            {
                "content": todo_text,
                "active_form": f"Working on: {todo_text}",
                "status": "pending",
            }
            for todo_text in todo_contents
        ],
    )

    read_call = ToolCall(
        id="read-reference",
        name="read",
        arguments={"file_path": str(reference_file)},
    )
    read_outcome = tool_outcome(
        tool_call=read_call,
        output=reference_html,
        is_error=False,
    )
    executor = FakeExecutor([read_outcome])

    turn_summary = TurnSummary(final_response="")
    await runner.execute_batch(
        tool_calls=[read_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=turn_summary,
        dod=dod,
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    next_item_hint = (
        "Continue with the next pending item: `Create the nginx directory structure`"
    )
    assert persistent_log
    assert any(next_item_hint in queued for queued in persistent_log)
    assert ephemeral_log == []
2481
2482
@pytest.mark.asyncio
async def test_tool_batch_runner_missing_artifact_nudge_names_next_file_after_setup_mkdir(
    temp_dir: Path,
) -> None:
    """After a successful ``mkdir -p`` the nudge names the next planned file.

    The plan lists the chapters directory and ``index.html``; the test itself
    creates neither on disk. The executor is handed a successful ``bash``
    outcome for ``mkdir -p``. The last persistent steering message is expected
    to say directory setup is complete and to point at ``index.html``, with
    nothing queued on the ephemeral channel.
    """

    # Both reasoning hooks must stay untouched; reaching either fails the test.
    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run for this scenario")

    nginx_dir = temp_dir / "Loader" / "guides" / "nginx"
    chapters_dir = nginx_dir / "chapters"
    index_file = nginx_dir / "index.html"
    plan_lines = [
        "# Implementation Plan",
        "",
        "## File Changes",
        f"- `{chapters_dir}/`",
        f"- `{index_file}`",
        "",
    ]
    plan_file = temp_dir / "implementation.md"
    plan_file.write_text("\n".join(plan_lines))

    runtime_context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )
    # Capture both steering channels separately so the test can tell them apart.
    persistent_log: list[str] = []
    ephemeral_log: list[str] = []
    runtime_context.queue_steering_message_callback = persistent_log.append
    runtime_context.queue_ephemeral_steering_message_callback = ephemeral_log.append
    runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))

    dod = create_definition_of_done("Create a multi-file nginx guide.")
    dod.implementation_plan = str(plan_file)
    pending_todos = [
        {
            "content": "Create the nginx directory structure",
            "active_form": "Creating the nginx directory structure",
            "status": "pending",
        },
        {
            "content": "Develop the main index.html file with proper structure",
            "active_form": "Developing the main index.html file with proper structure",
            "status": "pending",
        },
    ]
    sync_todos_to_definition_of_done(dod, pending_todos)

    mkdir_call = ToolCall(
        id="mkdir-nginx",
        name="bash",
        arguments={"command": f"mkdir -p {chapters_dir}"},
    )
    mkdir_outcome = tool_outcome(tool_call=mkdir_call, output="", is_error=False)
    executor = FakeExecutor([mkdir_outcome])

    turn_summary = TurnSummary(final_response="")
    await runner.execute_batch(
        tool_calls=[mkdir_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=turn_summary,
        dod=dod,
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert persistent_log
    nudge = persistent_log[-1]
    for expected_fragment in (
        "Directory setup is complete.",
        "Next step: create `index.html`.",
        "Write a compact but real initial version of that file now",
    ):
        assert expected_fragment in nudge
    assert ephemeral_log == []
2585
2586
@pytest.mark.asyncio
async def test_tool_batch_runner_first_chapter_handoff_stays_persistent_until_substantive_output_exists(
    temp_dir: Path,
) -> None:
    """A successful index write produces a persistent handoff to the first chapter.

    The chapters directory exists, while ``index.html`` and the first chapter
    are only listed in the plan. The executor is handed a successful ``write``
    outcome for the index. The last persistent steering message is expected to
    confirm progress, name ``01-introduction.html`` with its resolved path, and
    omit the "compact but real initial version" wording; the ephemeral channel
    stays empty.
    """

    # Both reasoning hooks must stay untouched; reaching either fails the test.
    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run for this scenario")

    nginx_dir = temp_dir / "guides" / "nginx"
    chapters_dir = nginx_dir / "chapters"
    chapters_dir.mkdir(parents=True)
    index_file = nginx_dir / "index.html"
    first_chapter = chapters_dir / "01-introduction.html"

    plan_lines = [
        "# Implementation Plan",
        "",
        "## File Changes",
        f"- `{chapters_dir}/`",
        f"- `{index_file}`",
        f"- `{first_chapter}`",
        "",
    ]
    plan_file = temp_dir / "implementation.md"
    plan_file.write_text("\n".join(plan_lines))

    runtime_context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )
    # Capture both steering channels separately so the test can tell them apart.
    persistent_log: list[str] = []
    ephemeral_log: list[str] = []
    runtime_context.queue_steering_message_callback = persistent_log.append
    runtime_context.queue_ephemeral_steering_message_callback = ephemeral_log.append
    runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))

    dod = create_definition_of_done("Create a multi-file nginx guide.")
    dod.implementation_plan = str(plan_file)
    pending_todos = [
        {
            "content": "Create the main index.html file with proper structure",
            "active_form": "Creating the main index.html file with proper structure",
            "status": "pending",
        },
        {
            "content": "Create each chapter file with appropriate content",
            "active_form": "Creating each chapter file with appropriate content",
            "status": "pending",
        },
    ]
    sync_todos_to_definition_of_done(dod, pending_todos)

    write_call = ToolCall(
        id="write-index",
        name="write",
        arguments={
            "file_path": str(index_file),
            "content": "<html></html>\n",
        },
    )
    write_outcome = tool_outcome(
        tool_call=write_call,
        output=f"Successfully wrote 14 bytes to {index_file}",
        is_error=False,
    )
    executor = FakeExecutor([write_outcome])

    turn_summary = TurnSummary(final_response="")
    await runner.execute_batch(
        tool_calls=[write_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=turn_summary,
        dod=dod,
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert persistent_log
    assert ephemeral_log == []
    handoff = persistent_log[-1]
    resolved_chapter = first_chapter.resolve(strict=False)
    for expected_fragment in (
        "Confirmed progress:",
        "Next step: create `01-introduction.html`.",
        f"Prefer one `write(file_path=..., content=...)` call for `{resolved_chapter}` now.",
        "Do not reread reference material or spend the next turn on bookkeeping.",
    ):
        assert expected_fragment in handoff
    assert "Write a compact but real initial version of that file now" not in handoff
2701
2702
@pytest.mark.asyncio
async def test_tool_batch_runner_directory_handoff_uses_home_relative_path(
    temp_dir: Path,
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """The post-``mkdir`` handoff shows the next file as a ``~``-relative path.

    ``HOME`` is pointed at the resolved temp directory, and the plan lists a
    chapters directory plus ``index.html`` beneath it. After a successful
    ``mkdir -p`` outcome, the last persistent steering message is expected to
    contain ``~/Loader/guides/nginx/index.html``.
    """
    home_dir = temp_dir.resolve(strict=False)
    monkeypatch.setenv("HOME", str(home_dir))

    # Both reasoning hooks must stay untouched; reaching either fails the test.
    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run for this scenario")

    nginx_dir = temp_dir / "Loader" / "guides" / "nginx"
    chapters_dir = nginx_dir / "chapters"
    index_file = nginx_dir / "index.html"

    plan_lines = [
        "# Implementation Plan",
        "",
        "## File Changes",
        f"- `{chapters_dir}/`",
        f"- `{index_file}`",
        "",
    ]
    plan_file = temp_dir / "implementation.md"
    plan_file.write_text("\n".join(plan_lines))

    runtime_context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )
    persistent_log: list[str] = []
    runtime_context.queue_steering_message_callback = persistent_log.append
    runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))

    dod = create_definition_of_done("Create a multi-file nginx guide.")
    dod.implementation_plan = str(plan_file)
    pending_todos = [
        {
            "content": "Create the nginx directory structure",
            "active_form": "Creating the nginx directory structure",
            "status": "pending",
        },
        {
            "content": "Develop the main index.html file with proper structure",
            "active_form": "Developing the main index.html file with proper structure",
            "status": "pending",
        },
    ]
    sync_todos_to_definition_of_done(dod, pending_todos)

    mkdir_call = ToolCall(
        id="mkdir-nginx-home",
        name="bash",
        arguments={"command": f"mkdir -p {chapters_dir}"},
    )
    mkdir_outcome = tool_outcome(tool_call=mkdir_call, output="", is_error=False)
    executor = FakeExecutor([mkdir_outcome])

    turn_summary = TurnSummary(final_response="")
    await runner.execute_batch(
        tool_calls=[mkdir_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=turn_summary,
        dod=dod,
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert persistent_log
    nudge = persistent_log[-1]
    for expected_fragment in (
        "Next step: create `index.html`.",
        "`~/Loader/guides/nginx/index.html`",
        "Write a compact but real initial version of that file now",
    ):
        assert expected_fragment in nudge
2807
2808
@pytest.mark.asyncio
async def test_tool_batch_runner_redirects_post_write_self_audit_to_next_missing_artifact(
    temp_dir: Path,
) -> None:
    """Rereading a just-written index is redirected to the next missing chapter.

    ``index.html`` exists and is recorded in ``dod.touched_files``, while the
    planned ``01-introduction.html`` does not exist. After a successful
    ``read`` of the index, the last persistent steering message is expected to
    say the contents are already known and to resume by creating the chapter;
    the ephemeral channel stays empty.
    """

    # Both reasoning hooks must stay untouched; reaching either fails the test.
    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should not run in this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run in this scenario")

    nginx_dir = temp_dir / "guides" / "nginx"
    chapters_dir = nginx_dir / "chapters"
    chapters_dir.mkdir(parents=True)
    index_file = nginx_dir / "index.html"
    index_lines = [
        "<html>",
        '<a href="chapters/01-introduction.html">Chapter 1: Introduction to Nginx</a>',
        '<a href="chapters/02-installation.html">Chapter 2: Installation and Setup</a>',
        "</html>",
    ]
    index_file.write_text("\n".join(index_lines) + "\n")

    first_chapter = chapters_dir / "01-introduction.html"
    plan_lines = [
        "# Implementation Plan",
        "",
        "## File Changes",
        f"- `{nginx_dir}/`",
        f"- `{chapters_dir}/`",
        f"- `{index_file}`",
        f"- `{first_chapter}`",
        "",
    ]
    plan_file = temp_dir / "implementation.md"
    plan_file.write_text("\n".join(plan_lines))

    runtime_context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )
    # Capture both steering channels separately so the test can tell them apart.
    persistent_log: list[str] = []
    ephemeral_log: list[str] = []
    runtime_context.queue_steering_message_callback = persistent_log.append
    runtime_context.queue_ephemeral_steering_message_callback = ephemeral_log.append
    runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))

    # Seed the definition of done as if the index had already been written.
    dod = create_definition_of_done("Create a multi-file nginx guide.")
    dod.implementation_plan = str(plan_file)
    dod.touched_files.append(str(index_file))
    dod.completed_items.append("Develop the main index.html file for the nginx guide")
    dod.pending_items.append("Create chapter files for the nginx guide")

    read_call = ToolCall(
        id="read-index-self-audit",
        name="read",
        arguments={"file_path": str(index_file)},
    )
    read_outcome = tool_outcome(
        tool_call=read_call,
        output="1\t<html>\n",
        is_error=False,
    )
    executor = FakeExecutor([read_outcome])

    turn_summary = TurnSummary(final_response="")
    await runner.execute_batch(
        tool_calls=[read_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=turn_summary,
        dod=dod,
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert persistent_log
    redirect = persistent_log[-1]
    for expected_fragment in (
        "You already have the current contents of `index.html` from the successful write.",
        "Resume by creating `01-introduction.html` now.",
        "Do not spend another turn rereading the file you just wrote or on TodoWrite alone.",
    ):
        assert expected_fragment in redirect
    assert ephemeral_log == []
2915
2916
@pytest.mark.asyncio
async def test_tool_batch_runner_preserves_first_file_handoff_after_recovery_prompt(
    temp_dir: Path,
) -> None:
    """The first-chapter handoff is still produced after an empty-response prompt.

    The session starts with a user message carrying the
    ``[EMPTY ASSISTANT RESPONSE]`` recovery text. After a successful ``write``
    outcome for ``index.html``, the last persistent steering message is
    expected to name ``01-introduction.html`` as the next step and to omit the
    "compact but real initial version" wording; the ephemeral channel stays
    empty.
    """

    # Both reasoning hooks must stay untouched; reaching either fails the test.
    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run for this scenario")

    nginx_dir = temp_dir / "guides" / "nginx"
    chapters_dir = nginx_dir / "chapters"
    chapters_dir.mkdir(parents=True)
    index_file = nginx_dir / "index.html"
    first_chapter = chapters_dir / "01-introduction.html"

    plan_lines = [
        "# Implementation Plan",
        "",
        "## File Changes",
        f"- `{chapters_dir}/`",
        f"- `{index_file}`",
        f"- `{first_chapter}`",
        "",
    ]
    plan_file = temp_dir / "implementation.md"
    plan_file.write_text("\n".join(plan_lines))

    recovery_prompt = Message(
        role=Role.USER,
        content=(
            "[EMPTY ASSISTANT RESPONSE]\n"
            "Respond with that concrete mutation tool call now. Do not return an empty response."
        ),
    )
    runtime_context = build_context(
        temp_dir=temp_dir,
        messages=[recovery_prompt],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )
    # Capture both steering channels separately so the test can tell them apart.
    persistent_log: list[str] = []
    ephemeral_log: list[str] = []
    runtime_context.queue_steering_message_callback = persistent_log.append
    runtime_context.queue_ephemeral_steering_message_callback = ephemeral_log.append
    runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))

    dod = create_definition_of_done("Create a multi-file nginx guide.")
    dod.implementation_plan = str(plan_file)
    pending_todos = [
        {
            "content": "Create the main index.html file with proper structure",
            "active_form": "Creating the main index.html file with proper structure",
            "status": "pending",
        },
        {
            "content": "Create each chapter file with appropriate content",
            "active_form": "Creating each chapter file with appropriate content",
            "status": "pending",
        },
    ]
    sync_todos_to_definition_of_done(dod, pending_todos)

    write_call = ToolCall(
        id="write-index-recovered",
        name="write",
        arguments={
            "file_path": str(index_file),
            "content": "<html></html>\n",
        },
    )
    write_outcome = tool_outcome(
        tool_call=write_call,
        output=f"Successfully wrote 14 bytes to {index_file}",
        is_error=False,
    )
    executor = FakeExecutor([write_outcome])

    turn_summary = TurnSummary(final_response="")
    await runner.execute_batch(
        tool_calls=[write_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=turn_summary,
        dod=dod,
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert persistent_log
    assert ephemeral_log == []
    handoff = persistent_log[-1]
    assert "Next step: create `01-introduction.html`." in handoff
    assert "Write a compact but real initial version of that file now" not in handoff
3033
3034
@pytest.mark.asyncio
async def test_tool_batch_runner_todowrite_uses_concrete_output_language_for_aggregate_chapter_step(
    temp_dir: Path,
) -> None:
    """TodoWrite follow-up names a concrete chapter file for an aggregate todo.

    ``index.html`` is on disk and links two chapter pages, but no chapter file
    has been written. The only pending todo is the generic "Create chapter
    files with content and structure". After a successful ``TodoWrite`` the
    queued steering message must point at ``01-introduction.html`` rather
    than repeat the todo text.
    """

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        # Tripwire: the test fails if the runner asks for confidence scoring.
        raise AssertionError("Confidence scoring should not run in this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        # Tripwire: the test fails if the runner asks for action verification.
        raise AssertionError("Verification should not run in this scenario")

    def aggregate_todos() -> list[dict[str, str]]:
        # Fresh list and dicts on every call, so the copy handed to the DoD
        # sync is never the same object as the one submitted via TodoWrite.
        return [
            {
                "content": "Develop the main index.html file with proper structure",
                "active_form": "Developing the main index.html file with proper structure",
                "status": "completed",
            },
            {
                "content": "Create chapter files with content and structure",
                "active_form": "Creating chapter files with content and structure",
                "status": "pending",
            },
        ]

    nginx_root = temp_dir / "guides" / "nginx"
    chapter_dir = nginx_root / "chapters"
    chapter_dir.mkdir(parents=True)

    index_file = nginx_root / "index.html"
    index_lines = [
        "<html>",
        '<a href="chapters/01-introduction.html">Chapter 1: Introduction to Nginx</a>',
        '<a href="chapters/02-installation.html">Chapter 2: Installation and Setup</a>',
        "</html>",
    ]
    index_file.write_text("\n".join(index_lines) + "\n")

    plan_file = temp_dir / "implementation.md"
    plan_lines = [
        "# Implementation Plan",
        "",
        "## File Changes",
        f"- `{nginx_root}/`",
        f"- `{chapter_dir}/`",
        f"- `{index_file}`",
        "",
    ]
    plan_file.write_text("\n".join(plan_lines))

    runtime_context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
    )
    steering: list[str] = []
    runtime_context.queue_steering_message_callback = steering.append
    batch_runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))

    done_spec = create_definition_of_done("Create a multi-file nginx guide.")
    done_spec.implementation_plan = str(plan_file)
    done_spec.touched_files.append(str(index_file))
    sync_todos_to_definition_of_done(done_spec, aggregate_todos())

    # One list object backs both the tool arguments and the outcome metadata.
    submitted_todos = aggregate_todos()
    todo_call = ToolCall(
        id="todo-aggregate",
        name="TodoWrite",
        arguments={"todos": submitted_todos},
    )
    scripted_outcome = tool_outcome(
        tool_call=todo_call,
        output="Todos updated",
        is_error=False,
        metadata={"new_todos": submitted_todos},
    )
    scripted_executor = FakeExecutor([scripted_outcome])

    await batch_runner.execute_batch(
        tool_calls=[todo_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=TurnSummary(final_response=""),
        dod=done_spec,
        executor=scripted_executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert steering
    latest = steering[-1]
    assert "Todo tracking is updated." in latest
    assert "Next step: create `01-introduction.html`." in latest
    generic_followup = (
        "Continue with the next pending item: `Create chapter files with content and structure`."
    )
    assert generic_followup not in latest
3165
3166
@pytest.mark.asyncio
async def test_duplicate_observation_nudge_prioritizes_missing_artifact_over_review(
    temp_dir: Path,
) -> None:
    """Duplicate-read nudge points at the missing chapter, not the review todo.

    The plan lists ``06-ssl-configuration.html``, which is never written,
    while the first pending todo is a consistency review. The nudge queued
    for a repeated ``read`` of ``index.html`` must mention the missing file
    and must not tell the agent to continue with the review item.
    """

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        # Tripwire: the test fails if confidence scoring is requested.
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        # Tripwire: the test fails if action verification is requested.
        raise AssertionError("Verification should not run for this scenario")

    nginx_root = temp_dir / "guides" / "nginx"
    chapter_dir = nginx_root / "chapters"
    chapter_dir.mkdir(parents=True)

    index_file = nginx_root / "index.html"
    first_chapter = chapter_dir / "01-getting-started.html"
    # Planned but deliberately left off disk.
    missing_chapter = chapter_dir / "06-ssl-configuration.html"
    first_chapter.write_text("<h1>One</h1>\n")
    index_file.write_text("<a href=\"chapters/01-getting-started.html\">One</a>\n")

    plan_file = temp_dir / "implementation.md"
    plan_lines = [
        "# Implementation Plan",
        "",
        "## File Changes",
        f"- `{index_file}`",
        f"- `{first_chapter}`",
        f"- `{missing_chapter}`",
        "",
    ]
    plan_file.write_text("\n".join(plan_lines))

    runtime_context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )
    persistent_queue: list[str] = []
    ephemeral_queue: list[str] = []
    runtime_context.queue_steering_message_callback = persistent_queue.append
    runtime_context.queue_ephemeral_steering_message_callback = ephemeral_queue.append
    batch_runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))

    done_spec = create_definition_of_done("Create a multi-file nginx guide.")
    done_spec.implementation_plan = str(plan_file)
    pending_todos = [
        {
            "content": "Ensure all files are properly linked and formatted consistently",
            "active_form": "Working on: Ensure all files are properly linked and formatted consistently",
            "status": "pending",
        },
        {
            "content": "Create the final chapter (06-ssl-configuration.html)",
            "active_form": "Working on: Create the final chapter (06-ssl-configuration.html)",
            "status": "pending",
        },
    ]
    sync_todos_to_definition_of_done(done_spec, pending_todos)

    # The module-level predicate itself must favour the missing artifact.
    should_prioritize = tool_batches_should_prioritize_missing_artifact(
        dod=done_spec,
        next_pending=done_spec.pending_items[0],
        missing_artifact=(missing_chapter, False),
        project_root=temp_dir,
    )
    assert should_prioritize

    repeated_read = ToolCall(
        id="dup-read",
        name="read",
        arguments={"file_path": str(index_file)},
    )
    batch_runner._queue_duplicate_observation_nudge(repeated_read, dod=done_spec)  # type: ignore[attr-defined]

    assert persistent_queue
    nudge = persistent_queue[-1]
    assert "06-ssl-configuration.html" in nudge
    assert "Do not switch into review or consistency-check mode" in nudge
    review_followup = (
        "Continue with the next pending item: `Ensure all files are properly linked and formatted consistently`"
    )
    assert review_followup not in nudge
3261
3262
@pytest.mark.asyncio
async def test_tool_batch_runner_hands_off_to_verification_once_planned_artifacts_exist(
    temp_dir: Path,
) -> None:
    """A write that leaves every planned file on disk triggers a verify handoff.

    The index and both chapters exist before the scripted ``write`` of
    chapter two runs. The persistent steering queue must then announce that
    all planned artifacts exist, mention the remaining consistency todo, and
    tell the agent to move to verification.
    """

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        # Tripwire: the test fails if confidence scoring is requested.
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        # Tripwire: the test fails if action verification is requested.
        raise AssertionError("Verification should not run for this scenario")

    nginx_root = temp_dir / "guides" / "nginx"
    chapter_dir = nginx_root / "chapters"
    chapter_dir.mkdir(parents=True)

    index_file = nginx_root / "index.html"
    first_chapter = chapter_dir / "01-getting-started.html"
    second_chapter = chapter_dir / "02-installation.html"
    seed_files = (
        (index_file, "<a href=\"chapters/01-getting-started.html\">One</a>\n"),
        (first_chapter, "<h1>One</h1>\n"),
        (second_chapter, "<h1>Two</h1>\n"),
    )
    for target, body in seed_files:
        target.write_text(body)

    plan_file = temp_dir / "implementation.md"
    plan_lines = [
        "# Implementation Plan",
        "",
        "## File Changes",
        f"- `{chapter_dir}/`",
        f"- `{index_file}`",
        f"- `{first_chapter}`",
        f"- `{second_chapter}`",
        "",
    ]
    plan_file.write_text("\n".join(plan_lines))

    runtime_context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )
    persistent_queue: list[str] = []
    ephemeral_queue: list[str] = []
    runtime_context.queue_steering_message_callback = persistent_queue.append
    runtime_context.queue_ephemeral_steering_message_callback = ephemeral_queue.append
    batch_runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))

    done_spec = create_definition_of_done("Create a multi-file nginx guide.")
    done_spec.implementation_plan = str(plan_file)
    tracked_todos = [
        {
            "content": "Create the guide files",
            "active_form": "Working on: Create the guide files",
            "status": "completed",
        },
        {
            "content": "Ensure all files are properly linked and formatted consistently",
            "active_form": "Working on: Ensure all files are properly linked and formatted consistently",
            "status": "pending",
        },
    ]
    sync_todos_to_definition_of_done(done_spec, tracked_todos)

    final_write = ToolCall(
        id="write-final",
        name="write",
        arguments={
            "file_path": str(second_chapter),
            "content": "<h1>Two</h1>\n",
        },
    )
    scripted_outcome = tool_outcome(
        tool_call=final_write,
        output=f"Successfully wrote {second_chapter}",
        is_error=False,
    )
    scripted_executor = FakeExecutor([scripted_outcome])

    await batch_runner.execute_batch(
        tool_calls=[final_write],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=TurnSummary(final_response=""),
        dod=done_spec,
        executor=scripted_executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    # Each fragment must appear in at least one persistent steering message.
    expected_fragments = (
        "All explicitly planned artifacts now exist on disk.",
        "Ensure all files are properly linked and formatted consistently",
        "Move to verification once no specific mismatch remains.",
    )
    for fragment in expected_fragments:
        assert any(fragment in queued for queued in persistent_queue)
3383
3384
@pytest.mark.asyncio
async def test_tool_batch_runner_mutation_handoff_points_at_next_missing_artifact(
    temp_dir: Path,
) -> None:
    """After writing the index, the handoff names the first unwritten chapter.

    Only ``index.html`` exists; both planned chapters are absent. The
    persistent steering message queued after the scripted ``write`` must
    direct the agent to create ``01-getting-started.html``, and nothing may
    be queued on the ephemeral channel.
    """

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        # Tripwire: the test fails if confidence scoring is requested.
        raise AssertionError("Confidence scoring should not run in this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        # Tripwire: the test fails if action verification is requested.
        raise AssertionError("Verification should not run in this scenario")

    nginx_root = temp_dir / "guides" / "nginx"
    chapter_dir = nginx_root / "chapters"
    nginx_root.mkdir(parents=True)
    chapter_dir.mkdir()

    index_file = nginx_root / "index.html"
    index_file.write_text("<html></html>\n")
    # Neither chapter is written: both are planned-but-missing artifacts.
    first_chapter = chapter_dir / "01-getting-started.html"
    second_chapter = chapter_dir / "02-installation.html"

    plan_file = temp_dir / "implementation.md"
    plan_lines = [
        "# Implementation Plan",
        "",
        "## File Changes",
        f"- `{nginx_root}/`",
        f"- `{index_file}`",
        f"- `{first_chapter}`",
        f"- `{second_chapter}`",
        "",
    ]
    plan_file.write_text("\n".join(plan_lines))

    runtime_context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )
    persistent_queue: list[str] = []
    ephemeral_queue: list[str] = []
    runtime_context.queue_steering_message_callback = persistent_queue.append
    runtime_context.queue_ephemeral_steering_message_callback = ephemeral_queue.append
    batch_runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))

    done_spec = create_definition_of_done("Create a multi-file nginx guide.")
    done_spec.implementation_plan = str(plan_file)
    pending_todos = [
        {
            "content": "Create the main index.html file with proper structure",
            "active_form": "Working on: Create the main index.html file with proper structure",
            "status": "pending",
        },
        {
            "content": "Create each chapter file in sequence, following the established pattern",
            "active_form": "Working on: Create each chapter file in sequence, following the established pattern",
            "status": "pending",
        },
        {
            "content": "Ensure all files are properly linked and formatted consistently",
            "active_form": "Working on: Ensure all files are properly linked and formatted consistently",
            "status": "pending",
        },
    ]
    sync_todos_to_definition_of_done(done_spec, pending_todos)

    index_write = ToolCall(
        id="write-index",
        name="write",
        arguments={"file_path": str(index_file), "content": "<html></html>\n"},
    )
    scripted_outcome = tool_outcome(
        tool_call=index_write,
        output=f"Successfully wrote {index_file}",
        is_error=False,
    )
    scripted_executor = FakeExecutor([scripted_outcome])

    await batch_runner.execute_batch(
        tool_calls=[index_write],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=TurnSummary(final_response=""),
        dod=done_spec,
        executor=scripted_executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert persistent_queue
    assert ephemeral_queue == []
    handoff = persistent_queue[-1]
    assert "Next step: create `01-getting-started.html`." in handoff
    assert "Write a compact but real initial version of that file now" not in handoff
    assert "refresh `TodoWrite`" not in handoff
    assert "Do not reread reference material or spend the next turn on bookkeeping." in handoff
3494
3495
@pytest.mark.asyncio
async def test_tool_batch_runner_large_plan_does_not_claim_completion_early(
    temp_dir: Path,
) -> None:
    """A partially built seven-chapter plan must not be reported as complete.

    The plan lists the guide root, the chapters directory, ``index.html`` and
    seven chapter files. Only the index and chapters 01-05 are on disk when
    the scripted ``write`` of chapter 05 runs. The runner must queue an
    ephemeral nudge naming ``06-performance-tuning.html`` and must not claim,
    on either steering channel, that all planned artifacts exist.
    """

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        # Tripwire: the test fails if confidence scoring is requested.
        raise AssertionError("Confidence scoring should not run in this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        # Tripwire: the test fails if action verification is requested.
        raise AssertionError("Verification should not run in this scenario")

    guide_root = temp_dir / "guides" / "nginx"
    chapters = guide_root / "chapters"
    guide_root.mkdir(parents=True)
    chapters.mkdir()
    index_path = guide_root / "index.html"
    index_path.write_text("<html></html>\n")

    chapter_paths = [
        chapters / "01-getting-started.html",
        chapters / "02-installation.html",
        chapters / "03-first-website.html",
        chapters / "04-configuration-basics.html",
        chapters / "05-advanced-configurations.html",
        chapters / "06-performance-tuning.html",
        chapters / "07-security-best-practices.html",
    ]
    # Chapters 01-04 pre-exist; chapter 05 is the file the scripted write
    # targets. Chapters 06 and 07 stay missing on purpose.
    for chapter in chapter_paths[:4]:
        chapter.write_text(f"<h1>{chapter.stem}</h1>\n")
    chapter_paths[4].write_text("<h1>Advanced configurations</h1>\n")

    implementation_plan = temp_dir / "implementation.md"
    implementation_plan.write_text(
        "\n".join(
            [
                "# Implementation Plan",
                "",
                "## File Changes",
                f"- `{guide_root}/`",
                f"- `{chapters}/`",
                f"- `{index_path}`",
                *[f"- `{path}`" for path in chapter_paths],
                "",
            ]
        )
    )

    context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )
    persistent_messages: list[str] = []
    ephemeral_messages: list[str] = []
    context.queue_steering_message_callback = persistent_messages.append
    context.queue_ephemeral_steering_message_callback = ephemeral_messages.append
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))
    dod = create_definition_of_done("Create a thorough nginx guide.")
    dod.implementation_plan = str(implementation_plan)
    sync_todos_to_definition_of_done(
        dod,
        [
            {
                "content": "Create the nginx guide artifacts",
                "active_form": "Creating nginx guide artifacts",
                "status": "pending",
            },
            {
                "content": "Verify all guide files are linked and complete",
                "active_form": "Verifying guide linkage and completeness",
                "status": "pending",
            },
        ],
    )
    tool_call = ToolCall(
        id="write-chapter-05",
        name="write",
        arguments={
            "file_path": str(chapter_paths[4]),
            "content": "<h1>Advanced configurations</h1>\n",
        },
    )
    executor = FakeExecutor(
        [
            tool_outcome(
                tool_call=tool_call,
                output=f"Successfully wrote {chapter_paths[4]}",
                is_error=False,
            )
        ]
    )

    summary = TurnSummary(final_response="")
    await runner.execute_batch(
        tool_calls=[tool_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=summary,
        dod=dod,
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert any(
        "Next step: create `06-performance-tuning.html`." in message
        for message in ephemeral_messages
    )
    # Check both channels: a premature completion claim routed through the
    # persistent queue would slip past an ephemeral-only scan.
    assert not any(
        "All explicitly planned artifacts now exist on disk." in message
        for message in (*persistent_messages, *ephemeral_messages)
    )
3622
3623
@pytest.mark.asyncio
async def test_tool_batch_runner_uses_compact_missing_artifact_nudge_after_substantial_progress(
    temp_dir: Path,
) -> None:
    """With most planned files written, the nudge is the compact next-step form.

    The index and chapters 01-04 exist and are recorded as touched; chapter 05
    is planned but absent. After a rewrite of chapter 04 the latest ephemeral
    message must name ``05-advanced-features.html``, include the
    no-bookkeeping reminder, and omit the ``TodoWrite`` refresh hint.
    """

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        # Tripwire: the test fails if confidence scoring is requested.
        raise AssertionError("Confidence scoring should not run in this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        # Tripwire: the test fails if action verification is requested.
        raise AssertionError("Verification should not run in this scenario")

    nginx_root = temp_dir / "guides" / "nginx"
    chapter_dir = nginx_root / "chapters"
    nginx_root.mkdir(parents=True)
    chapter_dir.mkdir()

    index_file = nginx_root / "index.html"
    chapter_files = [
        chapter_dir / "01-introduction.html",
        chapter_dir / "02-installation.html",
        chapter_dir / "03-configuration.html",
        chapter_dir / "04-basic-usage.html",
        chapter_dir / "05-advanced-features.html",
    ]
    # Everything except the last chapter is already on disk.
    already_written = (index_file, *chapter_files[:4])
    for existing in already_written:
        existing.write_text("<html></html>\n")

    plan_file = temp_dir / "implementation.md"
    plan_lines = [
        "# Implementation Plan",
        "",
        "## File Changes",
        f"- `{nginx_root}/`",
        f"- `{chapter_dir}/`",
        f"- `{index_file}`",
        *[f"- `{chapter}`" for chapter in chapter_files],
        "",
    ]
    plan_file.write_text("\n".join(plan_lines))

    runtime_context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )
    persistent_queue: list[str] = []
    ephemeral_queue: list[str] = []
    runtime_context.queue_steering_message_callback = persistent_queue.append
    runtime_context.queue_ephemeral_steering_message_callback = ephemeral_queue.append
    batch_runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))

    done_spec = create_definition_of_done("Create a thorough nginx guide.")
    done_spec.implementation_plan = str(plan_file)
    done_spec.touched_files.extend([str(existing) for existing in already_written])
    done_spec.completed_items.extend(
        [
            "Create the nginx directory structure",
            "Create the main index.html file with proper structure",
        ]
    )
    remaining_todos = [
        {
            "content": "Create each chapter file with appropriate content",
            "active_form": "Creating each chapter file with appropriate content",
            "status": "pending",
        }
    ]
    sync_todos_to_definition_of_done(done_spec, remaining_todos)

    fourth_chapter = chapter_files[3]
    chapter_write = ToolCall(
        id="write-chapter-04",
        name="write",
        arguments={
            "file_path": str(fourth_chapter),
            "content": "<html>updated</html>\n",
        },
    )
    scripted_outcome = tool_outcome(
        tool_call=chapter_write,
        output=f"Successfully wrote {fourth_chapter}",
        is_error=False,
    )
    scripted_executor = FakeExecutor([scripted_outcome])

    await batch_runner.execute_batch(
        tool_calls=[chapter_write],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=TurnSummary(final_response=""),
        dod=done_spec,
        executor=scripted_executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert ephemeral_queue
    nudge = ephemeral_queue[-1]
    assert "Next step: create `05-advanced-features.html`." in nudge
    assert "Do not reread reference material or spend the next turn on bookkeeping." in nudge
    assert "refresh `TodoWrite`" not in nudge
3744
3745
@pytest.mark.asyncio
async def test_tool_batch_runner_todowrite_with_missing_artifact_requeues_exact_resume_step(
    temp_dir: Path,
) -> None:
    """A bookkeeping-only TodoWrite re-queues the exact next file to create.

    The index and chapter one exist; chapter two is planned but absent. After
    a ``TodoWrite`` that changes nothing, the latest persistent steering
    message must name ``02-installation.html``, recommend a single ``write``
    call, and nothing may be queued on the ephemeral channel.
    """

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        # Tripwire: the test fails if confidence scoring is requested.
        raise AssertionError("Confidence scoring should not run in this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        # Tripwire: the test fails if action verification is requested.
        raise AssertionError("Verification should not run in this scenario")

    def chapter_todos() -> list[dict[str, str]]:
        # Fresh list and dicts per call: the DoD sync, the tool arguments and
        # the outcome metadata each receive an independent copy.
        return [
            {
                "content": "Create 01-getting-started.html",
                "active_form": "Creating 01-getting-started.html",
                "status": "completed",
            },
            {
                "content": "Create 02-installation.html",
                "active_form": "Creating 02-installation.html",
                "status": "pending",
            },
        ]

    nginx_root = temp_dir / "guides" / "nginx"
    chapter_dir = nginx_root / "chapters"
    nginx_root.mkdir(parents=True)
    chapter_dir.mkdir()

    index_file = nginx_root / "index.html"
    index_file.write_text("<html></html>\n")
    first_chapter = chapter_dir / "01-getting-started.html"
    second_chapter = chapter_dir / "02-installation.html"  # planned, never written
    first_chapter.write_text("<h1>One</h1>\n")

    plan_file = temp_dir / "implementation.md"
    plan_lines = [
        "# Implementation Plan",
        "",
        "## File Changes",
        f"- `{nginx_root}/`",
        f"- `{chapter_dir}/`",
        f"- `{index_file}`",
        f"- `{first_chapter}`",
        f"- `{second_chapter}`",
        "",
    ]
    plan_file.write_text("\n".join(plan_lines))

    runtime_context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )
    persistent_queue: list[str] = []
    ephemeral_queue: list[str] = []
    runtime_context.queue_steering_message_callback = persistent_queue.append
    runtime_context.queue_ephemeral_steering_message_callback = ephemeral_queue.append
    batch_runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))

    done_spec = create_definition_of_done("Create a multi-file nginx guide.")
    done_spec.implementation_plan = str(plan_file)
    sync_todos_to_definition_of_done(done_spec, chapter_todos())
    # Touched files are recorded only after the todo sync, as in the scenario.
    done_spec.touched_files.extend([str(index_file), str(first_chapter)])

    todo_call = ToolCall(
        id="todo-only",
        name="TodoWrite",
        arguments={"todos": chapter_todos()},
    )
    scripted_outcome = tool_outcome(
        tool_call=todo_call,
        output="Todos updated",
        is_error=False,
        metadata={"new_todos": chapter_todos()},
    )
    scripted_executor = FakeExecutor([scripted_outcome])

    await batch_runner.execute_batch(
        tool_calls=[todo_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=TurnSummary(final_response=""),
        dod=done_spec,
        executor=scripted_executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert persistent_queue
    resume_step = persistent_queue[-1]
    assert "Todo tracking is updated. Next step: create `02-installation.html`." in resume_step
    assert "Prefer one `write(file_path=..., content=...)` call" in resume_step
    assert "Make your next response the concrete mutation tool call itself." in resume_step
    assert ephemeral_queue == []
3887
3888
@pytest.mark.asyncio
async def test_tool_batch_runner_todowrite_after_artifacts_exist_pushes_verification_handoff(
    temp_dir: Path,
) -> None:
    """TodoWrite with every planned file present steers toward verification.

    The index and both chapters exist, while the todo list still opens with a
    stale "examine the existing Fortran guide" item. The queued message must
    announce that all planned artifacts exist, mention the verify todo, and
    leave out the Fortran item; the workflow mode must still be ``execute``.
    """

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        # Tripwire: the test fails if confidence scoring is requested.
        raise AssertionError("Confidence scoring should not run in this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        # Tripwire: the test fails if action verification is requested.
        raise AssertionError("Verification should not run in this scenario")

    def stale_todos() -> list[dict[str, str]]:
        # Fresh list and dicts per call: the DoD sync, the tool arguments and
        # the outcome metadata each receive an independent copy.
        return [
            {
                "content": "First, examine the existing Fortran guide structure to understand the format and content organization",
                "active_form": "Working on: First, examine the existing Fortran guide structure to understand the format and content organization",
                "status": "pending",
            },
            {
                "content": "Verify all guide files are linked and complete",
                "active_form": "Working on: Verify all guide files are linked and complete",
                "status": "pending",
            },
        ]

    nginx_root = temp_dir / "guides" / "nginx"
    chapter_dir = nginx_root / "chapters"
    nginx_root.mkdir(parents=True)
    chapter_dir.mkdir()

    index_file = nginx_root / "index.html"
    first_chapter = chapter_dir / "01-getting-started.html"
    second_chapter = chapter_dir / "02-installation.html"
    seed_files = (
        (index_file, "<html></html>\n"),
        (first_chapter, "<h1>One</h1>\n"),
        (second_chapter, "<h1>Two</h1>\n"),
    )
    for target, body in seed_files:
        target.write_text(body)

    plan_file = temp_dir / "implementation.md"
    plan_lines = [
        "# Implementation Plan",
        "",
        "## File Changes",
        f"- `{nginx_root}/`",
        f"- `{chapter_dir}/`",
        f"- `{index_file}`",
        f"- `{first_chapter}`",
        f"- `{second_chapter}`",
        "",
    ]
    plan_file.write_text("\n".join(plan_lines))

    runtime_context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )
    steering: list[str] = []
    runtime_context.queue_steering_message_callback = steering.append
    batch_runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))

    done_spec = create_definition_of_done("Create a multi-file nginx guide.")
    done_spec.implementation_plan = str(plan_file)
    done_spec.verification_commands = [f"ls -la {nginx_root}"]
    sync_todos_to_definition_of_done(done_spec, stale_todos(), project_root=temp_dir)

    todo_call = ToolCall(
        id="todo-only",
        name="TodoWrite",
        arguments={"todos": stale_todos()},
    )
    scripted_outcome = tool_outcome(
        tool_call=todo_call,
        output="Todos updated",
        is_error=False,
        metadata={"new_todos": stale_todos()},
    )
    scripted_executor = FakeExecutor([scripted_outcome])

    await batch_runner.execute_batch(
        tool_calls=[todo_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=TurnSummary(final_response=""),
        dod=done_spec,
        executor=scripted_executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert steering
    handoff = steering[-1]
    required_fragments = (
        "Todo tracking is updated. All explicitly planned artifacts now exist on disk.",
        "Verify all guide files are linked and complete",
        "Move to verification once no specific mismatch remains.",
        "reopen reference materials",
    )
    for fragment in required_fragments:
        assert fragment in handoff
    assert "Fortran guide structure" not in handoff
    assert runtime_context.workflow_mode == "execute"
4032
4033
@pytest.mark.asyncio
async def test_tool_batch_runner_todowrite_after_outputs_exist_but_links_missing_still_handoffs_to_verify(
    temp_dir: Path,
) -> None:
    """TodoWrite switches to verify mode once every planned output exists.

    The index and both chapters are on disk. The index also carries a
    ``../index.html`` link for which this test creates no file (presumably
    the "links missing" case in the test name). With a single in-progress
    "create chapter files" todo, the queued message must say verification
    runs next and the workflow mode must become ``verify``.
    """

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        # Tripwire: the test fails if confidence scoring is requested.
        raise AssertionError("Confidence scoring should not run for this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        # Tripwire: the test fails if action verification is requested.
        raise AssertionError("Verification should not run for this scenario")

    def in_progress_todos() -> list[dict[str, str]]:
        # Fresh list and dict per call: the DoD sync, the tool arguments and
        # the outcome metadata each receive an independent copy.
        return [
            {
                "content": "Create chapter files following the established pattern",
                "active_form": "Creating chapter files",
                "status": "in_progress",
            }
        ]

    nginx_root = temp_dir / "guides" / "nginx"
    chapter_dir = nginx_root / "chapters"
    nginx_root.mkdir(parents=True)
    chapter_dir.mkdir()

    index_file = nginx_root / "index.html"
    first_chapter = chapter_dir / "01-introduction.html"
    second_chapter = chapter_dir / "02-installation.html"
    index_lines = [
        '<a href="chapters/01-introduction.html">Intro</a>',
        '<a href="chapters/02-installation.html">Install</a>',
        '<a href="../index.html">Back</a>',
        "",
    ]
    index_file.write_text("\n".join(index_lines))
    for chapter in (first_chapter, second_chapter):
        chapter.write_text("<html></html>\n")

    plan_file = temp_dir / "implementation.md"
    plan_lines = [
        "# Implementation Plan",
        "",
        "## File Changes",
        f"- `{nginx_root}/`",
        f"- `{chapter_dir}/`",
        f"- `{index_file}`",
        f"- `{first_chapter}`",
        f"- `{second_chapter}`",
        "",
    ]
    plan_file.write_text("\n".join(plan_lines))

    runtime_context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )
    steering: list[str] = []
    runtime_context.queue_steering_message_callback = steering.append
    batch_runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))

    done_spec = create_definition_of_done("Create a multi-file nginx guide.")
    done_spec.implementation_plan = str(plan_file)
    done_spec.verification_commands = [f"ls -la {nginx_root}"]
    sync_todos_to_definition_of_done(done_spec, in_progress_todos(), project_root=temp_dir)

    todo_call = ToolCall(
        id="todo-post-build",
        name="TodoWrite",
        arguments={"todos": in_progress_todos()},
    )
    scripted_outcome = tool_outcome(
        tool_call=todo_call,
        output="Todos updated",
        is_error=False,
        metadata={"new_todos": in_progress_todos()},
    )
    scripted_executor = FakeExecutor([scripted_outcome])

    await batch_runner.execute_batch(
        tool_calls=[todo_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=TurnSummary(final_response=""),
        dod=done_spec,
        executor=scripted_executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert steering
    handoff = steering[-1]
    assert "Todo tracking is updated. All explicitly planned artifacts now exist on disk." in handoff
    assert "Verification should run next." in handoff
    expansion_warning = "Repair or verify the current files instead of expanding the artifact set."
    assert expansion_warning not in handoff
    assert runtime_context.workflow_mode == "verify"
4169
4170
@pytest.mark.asyncio
async def test_tool_batch_runner_preempts_post_build_audit_after_todowrite_verify_handoff(
    temp_dir: Path,
) -> None:
    """Check that a TodoWrite verify handoff stops the rest of the batch.

    Every artifact named in the implementation plan already exists, and the
    batch holds a ``TodoWrite`` call followed by a ``read`` of the index. The
    assertions require that only the ``TodoWrite`` call reaches the executor,
    that the workflow mode becomes ``"verify"``, and that the last queued
    steering message asks for verification.
    """

    async def reject_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should not run for this scenario")

    async def reject_verification(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run for this scenario")

    # Lay out the finished guide: an index plus two chapter pages.
    site_root = temp_dir / "guides" / "nginx"
    chapter_dir = site_root / "chapters"
    site_root.mkdir(parents=True)
    chapter_dir.mkdir()
    index_html = site_root / "index.html"
    intro_html = chapter_dir / "01-introduction.html"
    install_html = chapter_dir / "02-installation.html"
    for page in (index_html, intro_html, install_html):
        page.write_text("<html></html>\n")

    # The plan lists both directories and all three files.
    planned_entries = (
        f"{site_root}/",
        f"{chapter_dir}/",
        index_html,
        intro_html,
        install_html,
    )
    plan_lines = [
        "# Implementation Plan",
        "",
        "## File Changes",
        *(f"- `{entry}`" for entry in planned_entries),
    ]
    plan_path = temp_dir / "implementation.md"
    plan_path.write_text("".join(f"{line}\n" for line in plan_lines))

    runtime_context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=reject_confidence,
        verify_action=reject_verification,
        auto_recover=False,
    )
    steering: list[str] = []
    runtime_context.queue_steering_message_callback = steering.append
    batch_runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))
    done_spec = create_definition_of_done("Create a multi-file nginx guide.")
    done_spec.implementation_plan = str(plan_path)
    done_spec.verification_commands = [f"ls -la {site_root}"]

    todo_call = ToolCall(
        id="todo-post-build-preempt",
        name="TodoWrite",
        arguments={"todos": []},
    )
    audit_read = ToolCall(
        id="read-after-todo",
        name="read",
        arguments={"file_path": str(index_html)},
    )
    todo_outcome = tool_outcome(
        tool_call=todo_call,
        output="Todos updated",
        is_error=False,
        metadata={"new_todos": []},
    )
    read_outcome = tool_outcome(
        tool_call=audit_read,
        output=index_html.read_text(),
        is_error=False,
    )
    fake_executor = FakeExecutor([todo_outcome, read_outcome])

    turn = TurnSummary(final_response="")
    batch_result = await batch_runner.execute_batch(
        tool_calls=[todo_call, audit_read],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=turn,
        dod=done_spec,
        executor=fake_executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert batch_result.continue_after_batch is True
    assert batch_result.halted is False
    # Only the TodoWrite call may have been executed; the read is preempted.
    executed_ids = [call.id for call in fake_executor.calls]
    assert executed_ids == ["todo-post-build-preempt"]
    assert len(turn.tool_result_messages) == 1
    assert runtime_context.workflow_mode == "verify"
    assert steering
    assert "Verification should run next." in steering[-1]
4281
4282
@pytest.mark.asyncio
async def test_tool_batch_runner_todowrite_complete_directory_plan_does_not_reinfer_first_child(
    temp_dir: Path,
) -> None:
    """Check that a fully built directory plan hands off to verification.

    The plan names only the index file and the chapters directory, the index
    links to three chapter pages that all exist, and the session history
    contains a ``read`` of a same-named reference chapter. After an empty
    ``TodoWrite``, the last steering message must ask for verification and
    must not mention ``01-introduction.html`` or "chapter files".
    """

    async def reject_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should not run for this scenario")

    async def reject_verification(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run for this scenario")

    # Reference chapter that the assistant "read" earlier in the session.
    reference_page = temp_dir / "fortran" / "chapters" / "01-introduction.html"
    reference_page.parent.mkdir(parents=True)
    reference_page.write_text("<h1>Introduction</h1>\n")

    site_root = temp_dir / "Loader" / "guides" / "nginx"
    chapter_dir = site_root / "chapters"
    site_root.mkdir(parents=True)
    chapter_dir.mkdir()
    index_html = site_root / "index.html"
    chapter_links = (
        ("01-introduction.html", "Introduction"),
        ("02-installation.html", "Installation"),
        ("03-basic-configuration.html", "Configuration"),
    )
    index_html.write_text(
        "".join(
            f'<a href="chapters/{file_name}">{label}</a>\n'
            for file_name, label in chapter_links
        )
    )
    for file_name, _label in chapter_links:
        (chapter_dir / file_name).write_text("<html></html>\n")

    plan_lines = (
        "# Implementation Plan",
        "",
        "## File Changes",
        f"- `{index_html}`",
        f"- `{chapter_dir}/`",
    )
    plan_path = temp_dir / "implementation.md"
    plan_path.write_text("".join(f"{line}\n" for line in plan_lines))

    reference_read = ToolCall(
        id="read-reference-child",
        name="read",
        arguments={"file_path": str(reference_page)},
    )
    history = [
        Message(
            role=Role.ASSISTANT,
            content="I examined the reference guide structure.",
            tool_calls=[reference_read],
        )
    ]
    runtime_context = build_context(
        temp_dir=temp_dir,
        messages=history,
        safeguards=FakeSafeguards(),
        assess_confidence=reject_confidence,
        verify_action=reject_verification,
        auto_recover=False,
    )
    steering: list[str] = []
    runtime_context.queue_steering_message_callback = steering.append
    batch_runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))
    done_spec = create_definition_of_done("Create an equally thorough nginx guide.")
    done_spec.implementation_plan = str(plan_path)
    done_spec.verification_commands = [f"ls -la {site_root}"]

    todo_call = ToolCall(
        id="todo-complete-directory-plan",
        name="TodoWrite",
        arguments={"todos": []},
    )
    todo_outcome = tool_outcome(
        tool_call=todo_call,
        output="Todos updated",
        is_error=False,
        metadata={"new_todos": []},
    )
    fake_executor = FakeExecutor([todo_outcome])

    turn = TurnSummary(final_response="")
    batch_result = await batch_runner.execute_batch(
        tool_calls=[todo_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=turn,
        dod=done_spec,
        executor=fake_executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert batch_result.continue_after_batch is True
    assert steering
    last_message = steering[-1]
    assert "Verification should run next." in last_message
    # The reference chapter name must not leak into the guidance.
    assert "01-introduction.html" not in last_message
    assert "chapter files" not in last_message.lower()
    assert runtime_context.workflow_mode == "verify"
4408
4409
@pytest.mark.asyncio
async def test_tool_batch_runner_todowrite_drops_unplanned_expansion_after_outputs_exist(
    temp_dir: Path,
) -> None:
    """Check that an unplanned todo does not widen a completed artifact set.

    All planned files exist on disk, but the ``TodoWrite`` payload still lists
    a pending ``08-troubleshooting.html`` chapter that is not in the plan. The
    last steering message must announce the verify handoff, must not mention
    the extra chapter, and the workflow mode must be ``"verify"``.
    """

    async def reject_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should not run for this scenario")

    async def reject_verification(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run for this scenario")

    site_root = temp_dir / "guides" / "nginx"
    chapter_dir = site_root / "chapters"
    site_root.mkdir(parents=True)
    chapter_dir.mkdir()
    index_html = site_root / "index.html"
    intro_html = chapter_dir / "01-introduction.html"
    install_html = chapter_dir / "02-installation.html"
    index_links = (
        ("chapters/01-introduction.html", "Intro"),
        ("chapters/02-installation.html", "Install"),
        ("../index.html", "Back"),
    )
    index_html.write_text(
        "".join(f'<a href="{href}">{label}</a>\n' for href, label in index_links)
    )
    for chapter_page in (intro_html, install_html):
        chapter_page.write_text("<html></html>\n")

    planned_entries = (
        f"{site_root}/",
        f"{chapter_dir}/",
        index_html,
        intro_html,
        install_html,
    )
    plan_lines = [
        "# Implementation Plan",
        "",
        "## File Changes",
        *(f"- `{entry}`" for entry in planned_entries),
    ]
    plan_path = temp_dir / "implementation.md"
    plan_path.write_text("".join(f"{line}\n" for line in plan_lines))

    runtime_context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=reject_confidence,
        verify_action=reject_verification,
        auto_recover=False,
    )
    steering: list[str] = []
    runtime_context.queue_steering_message_callback = steering.append
    batch_runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))
    done_spec = create_definition_of_done("Create a multi-file nginx guide.")
    done_spec.implementation_plan = str(plan_path)
    done_spec.verification_commands = [f"ls -la {site_root}"]

    # (content, active form, status); the final row is the unplanned chapter.
    todo_rows = (
        ("Create index.html for nginx guide", "Creating index.html", "in_progress"),
        (
            "Create chapter 01-introduction.html",
            "Creating chapter 01-introduction.html",
            "completed",
        ),
        (
            "Create chapter 02-installation.html",
            "Creating chapter 02-installation.html",
            "completed",
        ),
        (
            "Create chapter 08-troubleshooting.html",
            "Creating chapter 08-troubleshooting.html",
            "pending",
        ),
    )

    def todos_keyed_by(form_key: str) -> list[dict[str, str]]:
        # The call arguments use "activeForm"; the outcome metadata uses "active_form".
        return [
            {"content": content, form_key: active_form, "status": status}
            for content, active_form, status in todo_rows
        ]

    todo_call = ToolCall(
        id="todo-post-build-expansion",
        name="TodoWrite",
        arguments={"todos": todos_keyed_by("activeForm")},
    )
    todo_outcome = tool_outcome(
        tool_call=todo_call,
        output="Todos updated",
        is_error=False,
        metadata={"new_todos": todos_keyed_by("active_form")},
    )
    fake_executor = FakeExecutor([todo_outcome])

    turn = TurnSummary(final_response="")
    await batch_runner.execute_batch(
        tool_calls=[todo_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=turn,
        dod=done_spec,
        executor=fake_executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert steering
    last_message = steering[-1]
    assert (
        "Todo tracking is updated. All explicitly planned artifacts now exist on disk."
        in last_message
    )
    assert "Verification should run next." in last_message
    assert (
        "Repair or verify the current files instead of expanding the artifact set."
        not in last_message
    )
    assert "08-troubleshooting.html" not in last_message
    assert runtime_context.workflow_mode == "verify"
4565
4566
@pytest.mark.asyncio
async def test_tool_batch_runner_todowrite_with_existing_output_roots_requeues_next_mutation(
    temp_dir: Path,
) -> None:
    """Check that TodoWrite re-queues the next concrete file to create.

    The output directories and the index exist, the index links to
    ``chapters/01-introduction.html`` (which has not been written), and a
    chapter todo is still pending. The last steering message must name
    ``01-introduction.html`` as the next step and ask for a mutation call.
    """

    async def reject_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should not run in this scenario")

    async def reject_verification(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run in this scenario")

    site_root = temp_dir / "guides" / "nginx"
    chapter_dir = site_root / "chapters"
    site_root.mkdir(parents=True)
    chapter_dir.mkdir()
    index_html = site_root / "index.html"
    index_html.write_text(
        "<!DOCTYPE html>\n"
        "<html>\n"
        "<body>\n"
        '<a href="chapters/01-introduction.html">Introduction</a>\n'
        "</body>\n"
        "</html>\n"
    )

    plan_lines = (
        "# Implementation Plan",
        "",
        "## File Changes",
        f"- `{site_root}/`",
        f"- `{chapter_dir}/`",
        f"- `{index_html}`",
    )
    plan_path = temp_dir / "implementation.md"
    plan_path.write_text("".join(f"{line}\n" for line in plan_lines))

    def fresh_todos() -> list[dict[str, str]]:
        # Build new list/dict objects on every call so no payload is shared.
        rows = (
            (
                "Examine the existing Fortran guide structure",
                "Examining the existing Fortran guide structure",
                "completed",
            ),
            (
                "Create the nginx directory structure",
                "Creating the nginx directory structure",
                "completed",
            ),
            (
                "Write the introduction chapter",
                "Writing the introduction chapter",
                "pending",
            ),
        )
        return [
            {"content": content, "active_form": active_form, "status": status}
            for content, active_form, status in rows
        ]

    runtime_context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=reject_confidence,
        verify_action=reject_verification,
        auto_recover=False,
    )
    steering: list[str] = []
    runtime_context.queue_steering_message_callback = steering.append
    batch_runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))
    done_spec = create_definition_of_done("Create a multi-file nginx guide.")
    done_spec.implementation_plan = str(plan_path)
    done_spec.touched_files.append(str(index_html))
    sync_todos_to_definition_of_done(
        done_spec,
        fresh_todos(),
        project_root=temp_dir,
    )

    todo_call = ToolCall(
        id="todo-next-mutation",
        name="TodoWrite",
        arguments={"todos": fresh_todos()},
    )
    todo_outcome = tool_outcome(
        tool_call=todo_call,
        output="Todos updated",
        is_error=False,
        metadata={"new_todos": fresh_todos()},
    )
    fake_executor = FakeExecutor([todo_outcome])

    turn = TurnSummary(final_response="")
    await batch_runner.execute_batch(
        tool_calls=[todo_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=turn,
        dod=done_spec,
        executor=fake_executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert steering
    last_message = steering[-1]
    assert (
        "Todo tracking is updated. Next step: create `01-introduction.html`."
        in last_message
    )
    assert "Prefer one `write(file_path=..., content=...)` call" in last_message
    assert (
        "Make your next response the concrete mutation tool call itself."
        in last_message
    )
4728
4729
@pytest.mark.asyncio
async def test_tool_batch_runner_todowrite_prefers_pending_index_over_empty_output_directory(
    temp_dir: Path,
) -> None:
    """Check that a missing planned index is chosen before a chapter file.

    Only the (empty) chapters directory exists; the plan lists that directory
    and ``index.html``, and both an index todo and a first-chapter todo are
    pending. The last steering message must point at ``index.html`` with its
    resolved path and must not mention ``01-introduction.html``.
    """

    async def reject_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should not run in this scenario")

    async def reject_verification(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run in this scenario")

    site_root = temp_dir / "Loader" / "guides" / "nginx"
    chapter_dir = site_root / "chapters"
    chapter_dir.mkdir(parents=True)
    # The index is planned but deliberately not written.
    index_html = site_root / "index.html"
    plan_lines = (
        "# Implementation Plan",
        "",
        "## File Changes",
        f"- `{chapter_dir}/`",
        f"- `{index_html}`",
    )
    plan_path = temp_dir / "implementation.md"
    plan_path.write_text("".join(f"{line}\n" for line in plan_lines))

    def build_todos() -> list[dict[str, str]]:
        # Return a brand-new todo list each time it is called.
        rows = (
            (
                "Examine the existing Fortran guide structure to understand the format and depth",
                "Examining the existing Fortran guide structure",
                "completed",
            ),
            (
                "Create the new nginx guide directory structure",
                "Creating the new nginx guide directory structure",
                "completed",
            ),
            (
                "Create a new index.html for the nginx guide",
                "Creating a new index.html for the nginx guide",
                "pending",
            ),
            (
                "Create the first chapter for the nginx guide",
                "Creating the first chapter for the nginx guide",
                "pending",
            ),
        )
        return [
            {"content": content, "active_form": active_form, "status": status}
            for content, active_form, status in rows
        ]

    done_spec = create_definition_of_done("Create a multi-file nginx guide.")
    done_spec.implementation_plan = str(plan_path)
    sync_todos_to_definition_of_done(
        done_spec,
        build_todos(),
        project_root=temp_dir,
    )

    steering: list[str] = []
    runtime_context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=reject_confidence,
        verify_action=reject_verification,
        auto_recover=False,
    )
    runtime_context.queue_steering_message_callback = steering.append
    batch_runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))

    # One list object is shared by the call arguments and the outcome metadata.
    todos = build_todos()
    todo_call = ToolCall(
        id="todo-index-before-chapter",
        name="TodoWrite",
        arguments={"todos": todos},
    )
    todo_outcome = tool_outcome(
        tool_call=todo_call,
        output="Todos updated",
        is_error=False,
        metadata={"new_todos": todos},
    )
    fake_executor = FakeExecutor([todo_outcome])

    turn = TurnSummary(final_response="")
    await batch_runner.execute_batch(
        tool_calls=[todo_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=turn,
        dod=done_spec,
        executor=fake_executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert steering
    last_message = steering[-1]
    assert "Todo tracking is updated. Next step: create `index.html`." in last_message
    expected_hint = (
        f"Prefer one `write(file_path=..., content=...)` call for `{index_html.resolve(strict=False)}`"
    )
    assert expected_hint in last_message
    assert "01-introduction.html" not in last_message
4866
4867
@pytest.mark.asyncio
async def test_tool_batch_runner_todowrite_with_declared_child_targets_names_next_missing_file(
    temp_dir: Path,
) -> None:
    """Check that the next step is named after a child linked from the index.

    The index links to ``chapters/introduction.html`` and
    ``chapters/installation.html``, neither of which exists, and the pending
    todo is "Write the introduction chapter". The last steering message must
    name ``introduction.html`` and ask for a concrete mutation call.
    """

    async def reject_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should not run in this scenario")

    async def reject_verification(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run in this scenario")

    site_root = temp_dir / "guides" / "nginx"
    chapter_dir = site_root / "chapters"
    site_root.mkdir(parents=True)
    chapter_dir.mkdir()
    index_html = site_root / "index.html"
    index_lines = (
        "<html>",
        '<a href="chapters/introduction.html">Introduction</a>',
        '<a href="chapters/installation.html">Installation</a>',
        "</html>",
    )
    index_html.write_text("".join(f"{line}\n" for line in index_lines))

    plan_lines = (
        "# Implementation Plan",
        "",
        "## File Changes",
        f"- `{site_root}/`",
        f"- `{chapter_dir}/`",
        f"- `{index_html}`",
    )
    plan_path = temp_dir / "implementation.md"
    plan_path.write_text("".join(f"{line}\n" for line in plan_lines))

    done_spec = create_definition_of_done("Create a multi-file nginx guide.")
    done_spec.implementation_plan = str(plan_path)
    done_spec.pending_items = [
        "Write the introduction chapter",
        "Complete the requested work",
    ]
    done_spec.touched_files.append(str(index_html))

    steering: list[str] = []
    runtime_context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=reject_confidence,
        verify_action=reject_verification,
        auto_recover=False,
    )
    runtime_context.queue_steering_message_callback = steering.append
    batch_runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))

    # Call arguments use "activeForm"; the outcome metadata uses "active_form".
    requested_todo = {
        "content": "Write the introduction chapter",
        "activeForm": "Writing the introduction chapter",
        "status": "pending",
    }
    recorded_todo = {
        "content": "Write the introduction chapter",
        "active_form": "Writing the introduction chapter",
        "status": "pending",
    }
    todo_call = ToolCall(
        id="todo-1",
        name="TodoWrite",
        arguments={"todos": [requested_todo]},
    )
    todo_outcome = tool_outcome(
        tool_call=todo_call,
        output="Todos updated",
        is_error=False,
        metadata={"new_todos": [recorded_todo]},
    )
    fake_executor = FakeExecutor([todo_outcome])

    turn = TurnSummary(final_response="")
    await batch_runner.execute_batch(
        tool_calls=[todo_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=turn,
        dod=done_spec,
        executor=fake_executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert steering
    last_message = steering[-1]
    assert (
        "Todo tracking is updated. Next step: create `introduction.html`."
        in last_message
    )
    assert "Prefer one `write(file_path=..., content=...)` call" in last_message
    assert (
        "Make your next response the concrete mutation tool call itself."
        in last_message
    )
4991
4992
@pytest.mark.asyncio
async def test_tool_batch_runner_todowrite_names_concrete_pending_file_after_artifacts_exist(
    temp_dir: Path,
) -> None:
    """Check that the pending chapter is mapped to its concrete file name.

    The index and chapter one exist, the index also links to
    ``chapters/02-installation.html`` (not yet written), and the pending todo
    is "Creating Chapter 2: Installation and Setup". The last steering message
    must name ``02-installation.html`` as the file to create next.
    """

    async def reject_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should not run in this scenario")

    async def reject_verification(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run in this scenario")

    site_root = temp_dir / "guides" / "nginx"
    chapter_dir = site_root / "chapters"
    site_root.mkdir(parents=True)
    chapter_dir.mkdir()
    index_html = site_root / "index.html"
    intro_html = chapter_dir / "01-introduction.html"
    index_lines = (
        "<html>",
        '<a href="chapters/01-introduction.html">Chapter 1: Introduction to NGINX Tool</a>',
        '<a href="chapters/02-installation.html">Chapter 2: Installation and Setup</a>',
        "</html>",
    )
    index_html.write_text("".join(f"{line}\n" for line in index_lines))
    intro_html.write_text("<html></html>\n")

    plan_lines = (
        "# Implementation Plan",
        "",
        "## File Changes",
        f"- `{site_root}/`",
        f"- `{chapter_dir}/`",
        f"- `{index_html}`",
    )
    plan_path = temp_dir / "implementation.md"
    plan_path.write_text("".join(f"{line}\n" for line in plan_lines))

    done_spec = create_definition_of_done("Create a multi-file nginx guide.")
    done_spec.implementation_plan = str(plan_path)
    done_spec.pending_items = [
        "Creating Chapter 2: Installation and Setup",
        "Complete the requested work",
    ]
    done_spec.touched_files.extend([str(index_html), str(intro_html)])

    steering: list[str] = []
    runtime_context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=reject_confidence,
        verify_action=reject_verification,
        auto_recover=False,
    )
    runtime_context.queue_steering_message_callback = steering.append
    batch_runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))

    # Call arguments use "activeForm"; the outcome metadata uses "active_form".
    requested_todo = {
        "content": "Creating Chapter 2: Installation and Setup",
        "activeForm": "Creating Chapter 2: Installation and Setup",
        "status": "pending",
    }
    recorded_todo = {
        "content": "Creating Chapter 2: Installation and Setup",
        "active_form": "Creating Chapter 2: Installation and Setup",
        "status": "pending",
    }
    todo_call = ToolCall(
        id="todo-1",
        name="TodoWrite",
        arguments={"todos": [requested_todo]},
    )
    todo_outcome = tool_outcome(
        tool_call=todo_call,
        output="Todos updated",
        is_error=False,
        metadata={"new_todos": [recorded_todo]},
    )
    fake_executor = FakeExecutor([todo_outcome])

    turn = TurnSummary(final_response="")
    await batch_runner.execute_batch(
        tool_calls=[todo_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=turn,
        dod=done_spec,
        executor=fake_executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert steering
    last_message = steering[-1]
    assert (
        "Todo tracking is updated. Next step: create `02-installation.html`."
        in last_message
    )
    assert "Prefer one `write(file_path=..., content=...)` call" in last_message
    assert (
        "Make your next response the concrete mutation tool call itself"
        in last_message
    )
5118
5119
@pytest.mark.asyncio
async def test_tool_batch_runner_todowrite_uses_observed_sibling_pattern_for_next_file(
    temp_dir: Path,
) -> None:
    """Check that a previously read reference chapter shapes the next file name.

    The nginx index contains no links, but the session history holds a
    ``read`` of ``fortran/chapters/01-introduction.html``. With "Write the
    introduction chapter" pending, the last steering message must name
    ``01-introduction.html`` as the next file to create.
    """

    async def reject_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should not run in this scenario")

    async def reject_verification(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run in this scenario")

    # Reference guide whose chapter naming was observed earlier in the session.
    reference_dir = temp_dir / "fortran" / "chapters"
    reference_dir.mkdir(parents=True)
    reference_page = reference_dir / "01-introduction.html"
    reference_page.write_text("<h1>Introduction</h1>\n")

    site_root = temp_dir / "guides" / "nginx"
    chapter_dir = site_root / "chapters"
    site_root.mkdir(parents=True)
    chapter_dir.mkdir()
    index_html = site_root / "index.html"
    index_html.write_text("<html></html>\n")

    plan_lines = (
        "# Implementation Plan",
        "",
        "## File Changes",
        f"- `{site_root}/`",
        f"- `{chapter_dir}/`",
        f"- `{index_html}`",
    )
    plan_path = temp_dir / "implementation.md"
    plan_path.write_text("".join(f"{line}\n" for line in plan_lines))

    done_spec = create_definition_of_done("Create a multi-file nginx guide.")
    done_spec.implementation_plan = str(plan_path)
    done_spec.pending_items = [
        "Write the introduction chapter",
        "Complete the requested work",
    ]
    done_spec.touched_files.append(str(index_html))

    steering: list[str] = []
    reference_read = ToolCall(
        id="read-ref-1",
        name="read",
        arguments={"file_path": str(reference_page)},
    )
    history = [
        Message(
            role=Role.ASSISTANT,
            content="",
            tool_calls=[reference_read],
        )
    ]
    runtime_context = build_context(
        temp_dir=temp_dir,
        messages=history,
        safeguards=FakeSafeguards(),
        assess_confidence=reject_confidence,
        verify_action=reject_verification,
        auto_recover=False,
    )
    runtime_context.queue_steering_message_callback = steering.append
    batch_runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))

    # Call arguments use "activeForm"; the outcome metadata uses "active_form".
    requested_todo = {
        "content": "Write the introduction chapter",
        "activeForm": "Writing the introduction chapter",
        "status": "pending",
    }
    recorded_todo = {
        "content": "Write the introduction chapter",
        "active_form": "Writing the introduction chapter",
        "status": "pending",
    }
    todo_call = ToolCall(
        id="todo-observed-1",
        name="TodoWrite",
        arguments={"todos": [requested_todo]},
    )
    todo_outcome = tool_outcome(
        tool_call=todo_call,
        output="Todos updated",
        is_error=False,
        metadata={"new_todos": [recorded_todo]},
    )
    fake_executor = FakeExecutor([todo_outcome])

    turn = TurnSummary(final_response="")
    await batch_runner.execute_batch(
        tool_calls=[todo_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=turn,
        dod=done_spec,
        executor=fake_executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert steering
    last_message = steering[-1]
    assert (
        "Todo tracking is updated. Next step: create `01-introduction.html`."
        in last_message
    )
    assert "Prefer one `write(file_path=..., content=...)` call" in last_message
5248
5249
@pytest.mark.asyncio
async def test_tool_batch_runner_bookkeeping_note_with_missing_artifact_requeues_resume_step(
    temp_dir: Path,
) -> None:
    """A working note while a planned chapter is missing queues a resume nudge.

    The implementation plan lists ``index.html`` and two chapter files, but only
    the index and the first chapter are written to disk. After a successful
    ``notepad_write_working`` call, the last queued steering message must name
    ``02-installation.html`` as the file to create next.
    """

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        # Raises if ever invoked, so the test fails loudly.
        raise AssertionError("Confidence scoring should not run in this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        # Raises if ever invoked, so the test fails loudly.
        raise AssertionError("Verification should not run in this scenario")

    # On-disk state: the index and chapter one exist, chapter two does not.
    guide_dir = temp_dir / "guides" / "nginx"
    chapters_dir = guide_dir / "chapters"
    guide_dir.mkdir(parents=True)
    chapters_dir.mkdir()
    index_file = guide_dir / "index.html"
    first_chapter = chapters_dir / "01-getting-started.html"
    second_chapter = chapters_dir / "02-installation.html"
    index_file.write_text("<html></html>\n")
    first_chapter.write_text("<h1>One</h1>\n")

    plan_lines = [
        "# Implementation Plan",
        "",
        "## File Changes",
        f"- `{guide_dir}/`",
        f"- `{chapters_dir}/`",
        f"- `{index_file}`",
        f"- `{first_chapter}`",
        f"- `{second_chapter}`",
        "",
    ]
    plan_file = temp_dir / "implementation.md"
    plan_file.write_text("\n".join(plan_lines))

    runtime_context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )
    steering: list[str] = []
    runtime_context.queue_steering_message_callback = steering.append
    runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))

    dod = create_definition_of_done("Create a multi-file nginx guide.")
    dod.implementation_plan = str(plan_file)
    todos = [
        {
            "content": "Create 01-getting-started.html",
            "active_form": "Creating 01-getting-started.html",
            "status": "completed",
        },
        {
            "content": "Create 02-installation.html",
            "active_form": "Creating 02-installation.html",
            "status": "pending",
        },
    ]
    sync_todos_to_definition_of_done(dod, todos, project_root=temp_dir)
    dod.touched_files.extend([str(index_file), str(first_chapter)])

    note_call = ToolCall(
        id="working-note",
        name="notepad_write_working",
        arguments={"content": "Creating the second chapter file: Installation"},
    )
    scripted_executor = FakeExecutor(
        [
            tool_outcome(
                tool_call=note_call,
                output="Working note recorded",
                is_error=False,
            )
        ]
    )

    await runner.execute_batch(
        tool_calls=[note_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=TurnSummary(final_response=""),
        dod=dod,
        executor=scripted_executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert steering
    latest = steering[-1]
    expected_fragments = (
        "Bookkeeping note is recorded. A declared output artifact is still missing.",
        "Resume by creating `02-installation.html` now.",
        "Make your next response the concrete mutation tool call itself",
        "refresh `TodoWrite`",
        "Do not spend the next turn on additional notes, rediscovery, verification, or final confirmation",
    )
    for fragment in expected_fragments:
        assert fragment in latest
5364
5365
@pytest.mark.asyncio
async def test_tool_batch_runner_working_note_respects_discovery_first_pending_step(
    temp_dir: Path,
) -> None:
    """A working note keeps the discovery item first when it leads the pending list.

    The first pending item is an "examine the existing guide" step and nothing
    has been written to disk. After a successful ``notepad_write_working`` call,
    the last steering message must point at that pending item and must not tell
    the model to create ``index.html``.
    """

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        # Raises if ever invoked, so the test fails loudly.
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        # Raises if ever invoked, so the test fails loudly.
        raise AssertionError("Verification should not run in this scenario")

    nginx_dir = temp_dir / "guides" / "nginx"
    plan_lines = [
        "# Implementation Plan",
        "",
        "## File Changes",
        f"- `{nginx_dir / 'index.html'}`",
        f"- `{nginx_dir / 'chapters'}`",
        "",
    ]
    plan_file = temp_dir / "implementation.md"
    plan_file.write_text("\n".join(plan_lines))

    runtime_context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )
    steering: list[str] = []
    runtime_context.queue_steering_message_callback = steering.append
    runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))

    discovery_item = (
        "First, examine the existing fortran guide structure and content to understand the format"
    )
    dod = create_definition_of_done("Create a multi-file nginx guide.")
    dod.implementation_plan = str(plan_file)
    dod.pending_items.extend(
        [
            discovery_item,
            "Create the nginx directory structure",
            "Develop the main index.html file for the nginx guide",
        ]
    )

    note_call = ToolCall(
        id="working-note",
        name="notepad_write_working",
        arguments={"content": "Analyzing the fortran guide structure before creating nginx guide"},
    )
    scripted_executor = FakeExecutor(
        [
            tool_outcome(
                tool_call=note_call,
                output="Working note recorded",
                is_error=False,
            )
        ]
    )

    await runner.execute_batch(
        tool_calls=[note_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=TurnSummary(final_response=""),
        dod=dod,
        executor=scripted_executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert steering
    latest = steering[-1]
    assert f"Continue with the next pending item: `{discovery_item}`." in latest
    assert "one concrete evidence-gathering tool call" in latest
    assert "Resume by creating `index.html` now." not in latest
5458
5459
@pytest.mark.asyncio
async def test_tool_batch_runner_working_note_prefers_declared_output_gap_over_stale_discovery(
    temp_dir: Path,
) -> None:
    """A missing linked chapter outranks a stale discovery item after a working note.

    ``index.html`` links to three chapters but only the first exists on disk,
    while the pending list still starts with an "examine the existing guide"
    item. The last steering message must ask for ``02-installation.html`` and
    must not fall back to the discovery item.
    """

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        # Raises if ever invoked, so the test fails loudly.
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        # Raises if ever invoked, so the test fails loudly.
        raise AssertionError("Verification should not run in this scenario")

    guide_dir = temp_dir / "guides" / "nginx"
    chapter_dir = guide_dir / "chapters"
    chapter_dir.mkdir(parents=True)
    index_file = guide_dir / "index.html"
    intro_chapter = chapter_dir / "01-introduction.html"
    # The index references chapters 02 and 03, which are never written.
    index_links = [
        '<a href="chapters/01-introduction.html">Introduction</a>',
        '<a href="chapters/02-installation.html">Installation</a>',
        '<a href="chapters/03-configuration.html">Configuration</a>',
    ]
    index_file.write_text("\n".join(index_links))
    intro_chapter.write_text("<h1>Introduction</h1>\n")

    plan_lines = [
        "# Implementation Plan",
        "",
        "## File Changes",
        f"- `{index_file}`",
        f"- `{chapter_dir}/`",
        "",
    ]
    plan_file = temp_dir / "implementation.md"
    plan_file.write_text("\n".join(plan_lines))

    runtime_context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )
    steering: list[str] = []
    runtime_context.queue_steering_message_callback = steering.append
    runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))

    dod = create_definition_of_done("Create a multi-file nginx guide.")
    dod.implementation_plan = str(plan_file)
    dod.pending_items.extend(
        [
            "First, examine the existing fortran guide structure and content to understand the format",
            "Create chapter files following the established pattern",
        ]
    )
    dod.touched_files.extend([str(index_file), str(intro_chapter)])

    note_call = ToolCall(
        id="working-note",
        name="notepad_write_working",
        arguments={"content": "Created index and first chapter; next is chapter 2"},
    )
    scripted_executor = FakeExecutor(
        [
            tool_outcome(
                tool_call=note_call,
                output="Working note recorded",
                is_error=False,
            )
        ]
    )

    await runner.execute_batch(
        tool_calls=[note_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=TurnSummary(final_response=""),
        dod=dod,
        executor=scripted_executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert steering
    latest = steering[-1]
    for fragment in (
        "Bookkeeping note is recorded. A declared output artifact is still missing.",
        "Resume by creating `02-installation.html` now.",
    ):
        assert fragment in latest
    assert (
        "Continue with the next pending item: `First, examine the existing fortran guide structure"
        not in latest
    )
5565
5566
@pytest.mark.asyncio
async def test_tool_batch_runner_shallow_glob_does_not_handoff_before_content_read(
    temp_dir: Path,
) -> None:
    """A directory-only ``glob`` result must not queue any steering message.

    The pending list starts with an "examine the existing guide" item and the
    scripted ``glob`` output lists only the reference guide's two directories.
    No steering message may be queued after that batch.
    """

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        # Raises if ever invoked, so the test fails loudly.
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        # Raises if ever invoked, so the test fails loudly.
        raise AssertionError("Verification should not run in this scenario")

    guides_dir = temp_dir / "Loader" / "guides"
    reference_root = guides_dir / "fortran"
    reference_chapters = reference_root / "chapters"
    reference_chapters.mkdir(parents=True)

    plan_lines = [
        "# Implementation Plan",
        "",
        "## File Changes",
        f"- `{guides_dir / 'nginx' / 'index.html'}`",
        f"- `{guides_dir / 'nginx' / 'chapters'}`",
        "",
    ]
    plan_file = temp_dir / "implementation.md"
    plan_file.write_text("\n".join(plan_lines))

    runtime_context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )
    steering: list[str] = []
    runtime_context.queue_steering_message_callback = steering.append
    runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))

    dod = create_definition_of_done("Create a multi-file nginx guide.")
    dod.implementation_plan = str(plan_file)
    dod.pending_items.extend(
        [
            "First, examine the existing fortran guide structure and content",
            "Create the nginx directory structure",
            "Develop the main index.html file for nginx guide",
        ]
    )

    glob_call = ToolCall(
        id="glob-1",
        name="glob",
        arguments={"pattern": "**", "path": str(reference_root)},
    )
    # The scripted output contains directories only, no file contents.
    glob_output = f"{reference_root}\n{reference_chapters}"
    scripted_executor = FakeExecutor(
        [
            tool_outcome(
                tool_call=glob_call,
                output=glob_output,
                is_error=False,
            )
        ]
    )

    await runner.execute_batch(
        tool_calls=[glob_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=TurnSummary(final_response=""),
        dod=dod,
        executor=scripted_executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert steering == []
5656
5657
@pytest.mark.asyncio
async def test_tool_batch_runner_hands_off_noop_toc_edit_when_file_is_already_valid(
    temp_dir: Path,
) -> None:
    """A blocked identical-string edit on an already-correct index queues nothing.

    The index's table of contents links to two chapter files that both exist,
    with link texts equal to their ``<h1>`` headings. The scripted ``edit`` uses
    the same text for ``old_string`` and ``new_string`` and comes back BLOCKED.
    No steering message may be queued.
    """

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        # Raises if ever invoked, so the test fails loudly.
        raise AssertionError("Confidence scoring should not run in this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        # Raises if ever invoked, so the test fails loudly.
        raise AssertionError("Verification should not run in this scenario")

    task_prompt = (
        "Have a look at ~/Loader/guides/fortran/index.html, then "
        "~/Loader/guides/fortran/chapters. The table of contents links in "
        "index.html are inaccurate and the href’s are wrong. Let’s update the "
        "links and their link texts to be correct."
    )

    chapter_dir = temp_dir / "chapters"
    chapter_dir.mkdir()
    chapter_headings = {
        "01-introduction.html": "<h1>Chapter 1: Introduction to Fortran</h1>\n",
        "02-setup.html": "<h1>Chapter 2: Setting Up Your Environment</h1>\n",
    }
    for file_name, heading in chapter_headings.items():
        (chapter_dir / file_name).write_text(heading)

    toc_block = (
        "<h2>Table of Contents</h2>\n"
        ' <ul class="chapter-list">\n'
        ' <li><a href="chapters/01-introduction.html">Chapter 1: Introduction to Fortran</a></li>\n'
        ' <li><a href="chapters/02-setup.html">Chapter 2: Setting Up Your Environment</a></li>\n'
        " </ul>\n"
    )
    index_file = temp_dir / "index.html"
    index_file.write_text(toc_block)

    runtime_context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )
    runtime_context.session.current_task = task_prompt  # type: ignore[attr-defined]
    steering: list[str] = []
    runtime_context.queue_steering_message_callback = steering.append
    runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))

    # old_string and new_string are deliberately the same text.
    edit_call = ToolCall(
        id="edit-1",
        name="edit",
        arguments={
            "file_path": str(index_file),
            "old_string": toc_block,
            "new_string": toc_block,
        },
    )
    blocked_output = (
        "[Blocked - old_string and new_string are identical - no change "
        "would occur] Suggestion: Provide different old and new strings"
    )
    scripted_executor = FakeExecutor(
        [
            tool_outcome(
                tool_call=edit_call,
                output=blocked_output,
                is_error=True,
                state=ToolExecutionState.BLOCKED,
            )
        ]
    )

    await runner.execute_batch(
        tool_calls=[edit_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=TurnSummary(final_response=""),
        dod=create_definition_of_done(task_prompt),
        executor=scripted_executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert steering == []
5751
5752
def test_tool_batch_runner_blocked_noop_edit_nudge_stays_on_active_repair_target(
    temp_dir: Path,
) -> None:
    """The blocked no-op edit nudge keeps the model on the repair-focus file.

    The session holds one assistant message with a "Repair focus" block naming
    the repair target. Calling ``_queue_blocked_html_edit_nudge`` for an
    identical old/new edit on that file must queue a message that names the
    target and asks for a real replacement.
    """

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        # Raises if ever invoked, so the test fails loudly.
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        # Raises if ever invoked, so the test fails loudly.
        raise AssertionError("Verification should not run in this scenario")

    chapter_dir = temp_dir / "guide" / "chapters"
    repair_target = chapter_dir / "04-basic-usage.html"
    missing_chapter = chapter_dir / "05-advanced-topics.html"
    repair_focus = (
        "Repair focus:\n"
        f"- Fix the broken local reference `05-advanced-topics.html` in `{repair_target}`.\n"
        f"- Immediate next step: edit `{repair_target}`.\n"
        f"- If the broken reference should remain, create `{missing_chapter}`; otherwise remove or replace `05-advanced-topics.html`.\n"
    )
    runtime_context = build_context(
        temp_dir=temp_dir,
        messages=[Message(role=Role.ASSISTANT, content=repair_focus)],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
    )
    nudges: list[str] = []
    runtime_context.queue_steering_message_callback = nudges.append
    runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))
    dod = create_definition_of_done("Repair a guide page.")

    noop_edit = ToolCall(
        id="edit-1",
        name="edit",
        arguments={
            "file_path": str(repair_target),
            "old_string": "same",
            "new_string": "same",
        },
    )
    blocked_output = "[Blocked - old_string and new_string are identical - no change would occur] Suggestion: Provide different old and new strings"
    runner._queue_blocked_html_edit_nudge(noop_edit, blocked_output, dod=dod)

    assert nudges
    first_nudge = nudges[0]
    for fragment in (
        str(repair_target),
        "no on-disk change",
        "replace the surrounding block",
        "Do not reopen unrelated reference materials",
    ):
        assert fragment in first_nudge
5813
5814
def test_tool_batch_runner_blocked_noop_edit_after_full_build_prefers_verification(
    temp_dir: Path,
) -> None:
    """With every planned file on disk, a blocked no-op edit points to verification.

    Both files listed in the implementation plan exist and are recorded as
    touched. The queued nudge must say all planned artifacts exist and steer
    toward verification, without asking to replace the surrounding block.
    """

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        # Raises if ever invoked, so the test fails loudly.
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        # Raises if ever invoked, so the test fails loudly.
        raise AssertionError("Verification should not run in this scenario")

    guide_dir = temp_dir / "guide"
    chapter_dir = guide_dir / "chapters"
    chapter_dir.mkdir(parents=True)
    index_file = guide_dir / "index.html"
    intro_chapter = chapter_dir / "01-introduction.html"
    index_file.write_text("<html></html>\n")
    intro_chapter.write_text("<html></html>\n")

    plan_lines = [
        "# Implementation Plan",
        "",
        "## File Changes",
        f"- `{index_file}`",
        f"- `{intro_chapter}`",
        "",
    ]
    plan_file = temp_dir / "implementation.md"
    plan_file.write_text("\n".join(plan_lines))

    repair_focus = (
        "Repair focus:\n"
        f"- Confirm the final guide state in `{index_file}`.\n"
        f"- Immediate next step: verify `{index_file}` if no concrete mismatch remains.\n"
    )
    runtime_context = build_context(
        temp_dir=temp_dir,
        messages=[Message(role=Role.ASSISTANT, content=repair_focus)],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
    )
    nudges: list[str] = []
    runtime_context.queue_steering_message_callback = nudges.append
    runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))

    dod = create_definition_of_done("Create a multi-file guide.")
    dod.implementation_plan = str(plan_file)
    dod.touched_files.extend([str(index_file), str(intro_chapter)])
    dod.verification_commands = [f"ls -la {guide_dir}"]

    noop_edit = ToolCall(
        id="edit-1",
        name="edit",
        arguments={
            "file_path": str(index_file),
            "old_string": "same",
            "new_string": "same",
        },
    )
    blocked_output = "[Blocked - old_string and new_string are identical - no change would occur] Suggestion: Provide different old and new strings"
    runner._queue_blocked_html_edit_nudge(noop_edit, blocked_output, dod=dod)

    assert nudges
    first_nudge = nudges[0]
    assert "All explicitly planned artifacts already exist." in first_nudge
    assert "Move to verification or final confirmation using the files already on disk." in first_nudge
    assert "replace the surrounding block" not in first_nudge
5898
5899
5900 async def _noop_emit(event: AgentEvent) -> None:
5901 return None
5902
5903
@pytest.mark.asyncio
async def test_tool_batch_runner_marks_verification_planned_after_new_mutation(
    temp_dir: Path,
) -> None:
    """A successful ``write`` on a fresh definition of done plans verification.

    After the batch, the definition of done must report ``"planned"`` with
    attempt 1 active, and the last workflow timeline entry must carry a matching
    planned observation.
    """

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        # Raises if ever invoked, so the test fails loudly.
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        # Raises if ever invoked, so the test fails loudly.
        raise AssertionError("Verification should not run for this scenario")

    runtime_context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
    )
    runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))
    write_call = ToolCall(
        id="write-1",
        name="write",
        arguments={"file_path": str(temp_dir / "README.md"), "content": "updated\n"},
    )
    scripted_executor = FakeExecutor(
        [tool_outcome(tool_call=write_call, output="wrote file", is_error=False)]
    )
    summary = TurnSummary(final_response="")
    dod = create_definition_of_done("Update README and verify it still works.")
    events: list[AgentEvent] = []

    async def emit(event: AgentEvent) -> None:
        events.append(event)

    await runner.execute_batch(
        tool_calls=[write_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=emit,
        summary=summary,
        dod=dod,
        executor=scripted_executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    expected_attempt_id = "verification-attempt-1"

    # State recorded on the definition of done.
    assert dod.last_verification_result == "planned"
    assert dod.verification_commands
    assert "Collect verification evidence" in dod.pending_items
    assert dod.active_verification_attempt_id == expected_attempt_id
    assert dod.active_verification_attempt_number == 1

    # State recorded on the most recent workflow timeline entry.
    last_entry = summary.workflow_timeline[-1]
    assert last_entry.reason_code == "verification_planned"
    assert last_entry.policy_outcome == "planned"
    observation = last_entry.verification_observations[0]
    assert observation.status == "planned"
    assert observation.attempt_id == expected_attempt_id
    assert observation.attempt_number == 1
5975
5976
@pytest.mark.asyncio
async def test_tool_batch_runner_does_not_mark_verification_planned_after_setup_only_mkdir(
    temp_dir: Path,
) -> None:
    """A ``mkdir -p`` bash call alone must not plan verification.

    The plan lists a chapters directory and an ``index.html``; the only tool
    call is a scripted ``bash`` call creating the directory. Afterwards the
    definition of done must have no verification result and the timeline must
    have no ``verification_planned`` entry.
    """

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        # Raises if ever invoked, so the test fails loudly.
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        # Raises if ever invoked, so the test fails loudly.
        raise AssertionError("Verification should not run in this scenario")

    runtime_context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
    )
    runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))

    nginx_dir = temp_dir / "Loader" / "guides" / "nginx"
    chapter_dir = nginx_dir / "chapters"
    plan_lines = [
        "# Implementation Plan",
        "",
        "## File Changes",
        f"- `{chapter_dir}/`",
        f"- `{nginx_dir / 'index.html'}`",
        "",
    ]
    plan_file = temp_dir / "implementation.md"
    plan_file.write_text("\n".join(plan_lines))

    mkdir_call = ToolCall(
        id="mkdir-1",
        name="bash",
        arguments={"command": f"mkdir -p {chapter_dir}"},
    )
    scripted_executor = FakeExecutor(
        [tool_outcome(tool_call=mkdir_call, output="", is_error=False)]
    )
    summary = TurnSummary(final_response="")
    dod = create_definition_of_done("Create an equally thorough nginx guide with chapters.")
    dod.implementation_plan = str(plan_file)
    events: list[AgentEvent] = []

    async def emit(event: AgentEvent) -> None:
        events.append(event)

    await runner.execute_batch(
        tool_calls=[mkdir_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=emit,
        summary=summary,
        dod=dod,
        executor=scripted_executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert dod.last_verification_result is None
    assert "Collect verification evidence" not in dod.pending_items
    assert all(
        entry.reason_code != "verification_planned" for entry in summary.workflow_timeline
    )
6055
6056
@pytest.mark.asyncio
async def test_tool_batch_runner_does_not_mark_verification_planned_while_chapter_build_pending(
    temp_dir: Path,
) -> None:
    """Writing ``index.html`` while a chapter item is pending must not plan verification.

    The pending list still contains "Create first nginx chapter" when the
    scripted ``write`` of the index succeeds. That item must stay pending, and
    neither the definition of done nor the timeline may record planned
    verification.
    """

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        # Raises if ever invoked, so the test fails loudly.
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        # Raises if ever invoked, so the test fails loudly.
        raise AssertionError("Verification should not run in this scenario")

    runtime_context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
    )
    runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))

    nginx_dir = temp_dir / "Loader" / "guides" / "nginx"
    chapter_dir = nginx_dir / "chapters"
    chapter_dir.mkdir(parents=True)
    index_file = nginx_dir / "index.html"
    plan_lines = [
        "# Implementation Plan",
        "",
        "## File Changes",
        f"- `{nginx_dir}/`",
        f"- `{chapter_dir}/`",
        f"- `{index_file}`",
        "",
    ]
    plan_file = temp_dir / "implementation.md"
    plan_file.write_text("\n".join(plan_lines))

    write_call = ToolCall(
        id="write-index",
        name="write",
        arguments={"file_path": str(index_file), "content": "<html></html>\n"},
    )
    scripted_executor = FakeExecutor(
        [tool_outcome(tool_call=write_call, output="wrote file", is_error=False)]
    )
    summary = TurnSummary(final_response="")
    chapter_item = "Create first nginx chapter"
    dod = create_definition_of_done("Create a multi-file nginx guide.")
    dod.implementation_plan = str(plan_file)
    dod.pending_items.extend(
        [
            "Develop the main index.html file with proper structure",
            chapter_item,
        ]
    )
    events: list[AgentEvent] = []

    async def emit(event: AgentEvent) -> None:
        events.append(event)

    await runner.execute_batch(
        tool_calls=[write_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=emit,
        summary=summary,
        dod=dod,
        executor=scripted_executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert dod.last_verification_result is None
    assert "Collect verification evidence" not in dod.pending_items
    assert chapter_item in dod.pending_items
    assert all(
        entry.reason_code != "verification_planned" for entry in summary.workflow_timeline
    )
6145
6146
@pytest.mark.asyncio
async def test_tool_batch_runner_marks_passed_verification_stale_after_new_mutation(
    temp_dir: Path,
) -> None:
    """A new ``write`` after passed verification marks that verification stale.

    The definition of done starts with attempt 1 recorded as passed, with one
    piece of evidence. After the scripted ``write`` the result must be
    ``"stale"``, the evidence cleared, attempt 2 active, and the last timeline
    entry must describe attempt 1 as stale.
    """

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        # Raises if ever invoked, so the test fails loudly.
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        # Raises if ever invoked, so the test fails loudly.
        raise AssertionError("Verification should not run for this scenario")

    runtime_context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
    )
    runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))
    write_call = ToolCall(
        id="write-1",
        name="write",
        arguments={"file_path": str(temp_dir / "README.md"), "content": "updated\n"},
    )
    scripted_executor = FakeExecutor(
        [tool_outcome(tool_call=write_call, output="wrote file", is_error=False)]
    )
    summary = TurnSummary(final_response="")

    # Seed a definition of done whose first verification attempt already passed.
    verification_command = "uv run pytest -q"
    evidence_item = "Collect verification evidence"
    dod = create_definition_of_done("Update README and verify it still works.")
    dod.verification_commands = [verification_command]
    dod.last_verification_result = "passed"
    dod.verification_attempt_counter = 1
    dod.active_verification_attempt_id = "verification-attempt-1"
    dod.active_verification_attempt_number = 1
    dod.evidence = [
        VerificationEvidence(
            command=verification_command,
            passed=True,
            stdout="401 passed",
            kind="test",
        )
    ]
    dod.completed_items.append(evidence_item)
    events: list[AgentEvent] = []

    async def emit(event: AgentEvent) -> None:
        events.append(event)

    await runner.execute_batch(
        tool_calls=[write_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=emit,
        summary=summary,
        dod=dod,
        executor=scripted_executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    # State recorded on the definition of done.
    assert dod.last_verification_result == "stale"
    assert dod.evidence == []
    assert evidence_item in dod.pending_items
    assert evidence_item not in dod.completed_items
    assert dod.active_verification_attempt_id == "verification-attempt-2"
    assert dod.active_verification_attempt_number == 2

    # State recorded on the most recent workflow timeline entry.
    last_entry = summary.workflow_timeline[-1]
    assert last_entry.reason_code == "verification_stale"
    assert last_entry.policy_outcome == "stale"
    observation = last_entry.verification_observations[0]
    assert observation.status == "stale"
    assert observation.attempt_id == "verification-attempt-1"
    assert observation.attempt_number == 1
    assert observation.supersedes_attempt_id == "verification-attempt-2"
    assert observation.command == verification_command
6241
6242
def test_tool_batch_runner_blocked_active_repair_nudge_uses_repair_scope(temp_dir: Path) -> None:
    """The blocked active-repair nudge names both paths from the repair focus.

    The session holds one assistant message with a "Repair focus" block. The
    queued nudge must mention the repair target and the chapter file that the
    focus offers to create, and must warn against reopening unrelated material.
    """

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        # Raises if ever invoked, so the test fails loudly.
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        # Raises if ever invoked, so the test fails loudly.
        raise AssertionError("Verification should not run in this scenario")

    guide_dir = temp_dir / "guide"
    repair_target = guide_dir / "index.html"
    missing_chapter = guide_dir / "chapters" / "01-getting-started.html"
    repair_focus = (
        "Repair focus:\n"
        f"- Fix the broken local reference `chapters/01-getting-started.html` in `{repair_target}`.\n"
        f"- Immediate next step: edit `{repair_target}`.\n"
        f"- If the broken reference should remain, create `{missing_chapter}`; otherwise remove or replace `chapters/01-getting-started.html`.\n"
    )
    runtime_context = build_context(
        temp_dir=temp_dir,
        messages=[Message(role=Role.ASSISTANT, content=repair_focus)],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
    )
    nudges: list[str] = []
    runtime_context.queue_steering_message_callback = nudges.append
    runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))

    runner._queue_blocked_active_repair_nudge(
        "[Blocked - active repair scope: verification already identified the repair target.]"
    )

    assert nudges
    first_nudge = nudges[0]
    for fragment in (
        str(repair_target),
        str(missing_chapter),
        "Do not reopen unrelated reference materials",
    ):
        assert fragment in first_nudge
6289
6290
def test_tool_batch_runner_blocked_active_repair_mutation_nudge_uses_allowed_paths(
    temp_dir: Path,
) -> None:
    """Check that the blocked repair-mutation nudge names the paths from the repair focus.

    The seeded "Repair focus" message points at a chapter file and a stylesheet;
    both paths, plus the "before widening the change set" wording, must appear
    in the first queued steering message.
    """

    async def assess_confidence(tool_name: str, tool_args: dict, context: str) -> ConfidenceAssessment:
        # Fails the test loudly if the runner ever asks for a confidence score.
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def verify_action(
        tool_name: str, tool_args: dict, result: str, expected: str = ""
    ) -> ActionVerification:
        # Fails the test loudly if the runner ever asks for action verification.
        raise AssertionError("Verification should not run in this scenario")

    guide_dir = temp_dir / "guide"
    repair_target = guide_dir / "chapters" / "05-advanced-configurations.html"
    stylesheet = guide_dir / "styles.css"
    repair_focus = Message(
        role=Role.ASSISTANT,
        content=(
            "Repair focus:\n"
            f"- Fix the broken local reference `../styles.css` in `{repair_target}`.\n"
            f"- Immediate next step: edit `{repair_target}`.\n"
            f"- If the broken reference should remain, create `{stylesheet}`; otherwise remove or replace `../styles.css`.\n"
        ),
    )
    runtime_context = build_context(
        temp_dir=temp_dir,
        messages=[repair_focus],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
    )
    steering_messages: list[str] = []
    runtime_context.queue_steering_message_callback = steering_messages.append
    batch_runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))

    batch_runner._queue_blocked_active_repair_mutation_nudge(
        "[Blocked - active repair mutation scope: verification already identified the repair target.]"
    )

    assert steering_messages
    nudge = steering_messages[0]
    for expected_fragment in (
        str(repair_target),
        str(stylesheet),
        "before widening the change set",
    ):
        assert expected_fragment in nudge
6340
6341
def test_tool_batch_runner_blocked_late_reference_drift_nudge_points_to_missing_artifact(
    temp_dir: Path,
) -> None:
    """Check that the late-reference-drift nudge names the planned file that is still missing.

    The plan lists four guide files but only three are written to disk, so the
    first queued steering message must mention `03-first-website.html` and
    the "older reference materials" wording.
    """

    async def assess_confidence(tool_name: str, tool_args: dict, context: str) -> ConfidenceAssessment:
        # Fails the test loudly if the runner ever asks for a confidence score.
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def verify_action(
        tool_name: str, tool_args: dict, result: str, expected: str = ""
    ) -> ActionVerification:
        # Fails the test loudly if the runner ever asks for action verification.
        raise AssertionError("Verification should not run in this scenario")

    runtime_context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
    )
    steering_messages: list[str] = []
    runtime_context.queue_steering_message_callback = steering_messages.append
    dod_store = DefinitionOfDoneStore(temp_dir)
    dod = create_definition_of_done("Create a multi-file guide from a reference")

    # The plan uses project-relative paths and includes one file that is never created below.
    plan_path = temp_dir / "implementation.md"
    plan_path.write_text(
        "# File Changes\n"
        "- `guide/index.html`\n"
        "- `guide/chapters/01-getting-started.html`\n"
        "- `guide/chapters/02-installation.html`\n"
        "- `guide/chapters/03-first-website.html`\n"
    )
    dod.implementation_plan = str(plan_path)

    guide_dir = temp_dir / "guide"
    chapters_dir = guide_dir / "chapters"
    chapters_dir.mkdir(parents=True, exist_ok=True)
    existing_artifacts = (
        (guide_dir / "index.html", "index"),
        (chapters_dir / "01-getting-started.html", "one"),
        (chapters_dir / "02-installation.html", "two"),
    )
    for artifact_path, artifact_text in existing_artifacts:
        artifact_path.write_text(artifact_text)

    batch_runner = ToolBatchRunner(runtime_context, dod_store)
    batch_runner._queue_blocked_late_reference_drift_nudge(
        "[Blocked - late reference drift: several planned artifacts already exist.]",
        dod=dod,
    )

    assert steering_messages
    nudge = steering_messages[0]
    assert "03-first-website.html" in nudge
    assert "older reference materials" in nudge
6394
6395
def test_tool_batch_runner_blocked_completed_artifact_scope_nudge_prefers_verification(
    temp_dir: Path,
) -> None:
    """Check that a completed-artifact-scope block steers the run toward verification.

    Every path listed in the plan exists on disk and one verification todo is
    pending. After the nudge is queued the context must be in "verify" mode
    and the first steering message must carry the expected guidance text.
    """

    async def assess_confidence(tool_name: str, tool_args: dict, context: str) -> ConfidenceAssessment:
        # Fails the test loudly if the runner ever asks for a confidence score.
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def verify_action(
        tool_name: str, tool_args: dict, result: str, expected: str = ""
    ) -> ActionVerification:
        # Fails the test loudly if the runner ever asks for action verification.
        raise AssertionError("Verification should not run in this scenario")

    guide_root = temp_dir / "guide"
    chapters = guide_root / "chapters"
    index_path = guide_root / "index.html"
    chapter_one = chapters / "01-getting-started.html"
    chapter_two = chapters / "02-installation.html"

    guide_root.mkdir(parents=True)
    chapters.mkdir()
    for artifact_path, artifact_text in (
        (index_path, "index"),
        (chapter_one, "one"),
        (chapter_two, "two"),
    ):
        artifact_path.write_text(artifact_text)

    plan_lines = [
        "# Implementation Plan",
        "",
        "## File Changes",
        f"- `{guide_root}`",
        f"- `{chapters}`",
        f"- `{index_path}`",
        f"- `{chapter_one}`",
        f"- `{chapter_two}`",
        "",
    ]
    implementation_plan = temp_dir / "implementation.md"
    implementation_plan.write_text("\n".join(plan_lines))

    runtime_context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
    )
    steering_messages: list[str] = []
    runtime_context.queue_steering_message_callback = steering_messages.append
    batch_runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))

    dod = create_definition_of_done("Create a multi-file guide from a reference")
    dod.implementation_plan = str(implementation_plan)
    dod.verification_commands = [f"ls -la {guide_root}"]
    pending_todo = {
        "content": "Verify all guide files are linked and complete",
        "active_form": "Working on: Verify all guide files are linked and complete",
        "status": "pending",
    }
    sync_todos_to_definition_of_done(dod, [pending_todo], project_root=temp_dir)

    batch_runner._queue_blocked_completed_artifact_scope_nudge(
        "[Blocked - completed artifact set scope: all explicitly planned artifacts already exist.]",
        dod=dod,
    )

    assert steering_messages
    assert runtime_context.workflow_mode == "verify"
    nudge = steering_messages[0]
    for expected_fragment in (
        "All explicitly planned artifacts already exist.",
        "Verify all guide files are linked and complete",
        "Do not reopen earlier reference materials.",
        "Verification should run next",
    ):
        assert expected_fragment in nudge
6478
6479
def test_tool_batch_runner_blocked_post_build_audit_nudge_switches_to_verify(
    temp_dir: Path,
) -> None:
    """Check that a post-build audit block flips the context into "verify" mode.

    All planned artifacts exist and no todos are synced. The completed-artifact
    nudge is queued with a post-build audit message; the first steering message
    must tell the model to move to verification or final confirmation.
    """

    async def assess_confidence(tool_name: str, tool_args: dict, context: str) -> ConfidenceAssessment:
        # Fails the test loudly if the runner ever asks for a confidence score.
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def verify_action(
        tool_name: str, tool_args: dict, result: str, expected: str = ""
    ) -> ActionVerification:
        # Fails the test loudly if the runner ever asks for action verification.
        raise AssertionError("Verification should not run in this scenario")

    guide_root = temp_dir / "guide"
    chapters = guide_root / "chapters"
    index_path = guide_root / "index.html"
    chapter_one = chapters / "01-getting-started.html"
    chapter_two = chapters / "02-installation.html"

    guide_root.mkdir(parents=True)
    chapters.mkdir()
    for artifact_path, artifact_text in (
        (index_path, "index"),
        (chapter_one, "one"),
        (chapter_two, "two"),
    ):
        artifact_path.write_text(artifact_text)

    plan_lines = [
        "# Implementation Plan",
        "",
        "## File Changes",
        f"- `{guide_root}`",
        f"- `{chapters}`",
        f"- `{index_path}`",
        f"- `{chapter_one}`",
        f"- `{chapter_two}`",
        "",
    ]
    implementation_plan = temp_dir / "implementation.md"
    implementation_plan.write_text("\n".join(plan_lines))

    runtime_context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
    )
    steering_messages: list[str] = []
    runtime_context.queue_steering_message_callback = steering_messages.append
    batch_runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))

    dod = create_definition_of_done("Create a multi-file guide from a reference")
    dod.implementation_plan = str(implementation_plan)
    dod.verification_commands = [f"ls -la {guide_root}"]

    batch_runner._queue_blocked_completed_artifact_scope_nudge(
        "[Blocked - post-build audit loop: all explicitly planned artifacts already exist.]",
        dod=dod,
    )

    assert steering_messages
    assert runtime_context.workflow_mode == "verify"
    nudge = steering_messages[0]
    assert "All explicitly planned artifacts already exist." in nudge
    assert "move to verification or final confirmation" in nudge
6549
6550
@pytest.mark.asyncio
async def test_tool_batch_runner_does_not_halt_on_repeated_post_build_audit_blocks(
    temp_dir: Path,
) -> None:
    """Check that three identical post-build audit blocks in one batch do not halt the run.

    The fake executor returns a BLOCKED outcome for each of three `bash`
    calls. The batch result must report no halt and zero consecutive errors,
    the context must be in "verify" mode, and at least one queued steering
    message must point at verification or final confirmation.
    """

    async def assess_confidence(tool_name: str, tool_args: dict, context: str) -> ConfidenceAssessment:
        # Fails the test loudly if the runner ever asks for a confidence score.
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def verify_action(
        tool_name: str, tool_args: dict, result: str, expected: str = ""
    ) -> ActionVerification:
        # Fails the test loudly if the runner ever asks for action verification.
        raise AssertionError("Verification should not run in this scenario")

    guide_root = temp_dir / "guide"
    chapters = guide_root / "chapters"
    index_path = guide_root / "index.html"
    chapter_one = chapters / "01-getting-started.html"
    chapter_two = chapters / "02-installation.html"

    guide_root.mkdir(parents=True)
    chapters.mkdir()
    for artifact_path, artifact_text in (
        (index_path, "index"),
        (chapter_one, "one"),
        (chapter_two, "two"),
    ):
        artifact_path.write_text(artifact_text)

    plan_lines = [
        "# Implementation Plan",
        "",
        "## File Changes",
        f"- `{guide_root}`",
        f"- `{chapters}`",
        f"- `{index_path}`",
        f"- `{chapter_one}`",
        f"- `{chapter_two}`",
        "",
    ]
    implementation_plan = temp_dir / "implementation.md"
    implementation_plan.write_text("\n".join(plan_lines))

    runtime_context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
    )
    steering_messages: list[str] = []
    runtime_context.queue_steering_message_callback = steering_messages.append
    batch_runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))

    dod = create_definition_of_done("Create a multi-file guide from a reference")
    dod.implementation_plan = str(implementation_plan)
    dod.verification_commands = [f"ls -la {guide_root}"]

    blocked_message = (
        "[Blocked - post-build audit loop: all explicitly planned artifacts already exist.]"
    )
    # Three audit calls that differ only by id; each one gets a BLOCKED outcome.
    audit_calls = [
        ToolCall(
            id=f"audit-{index}",
            name="bash",
            arguments={"command": f"cd {temp_dir} && ls -la guide/chapters/"},
        )
        for index in range(1, 4)
    ]
    blocked_outcomes = [
        tool_outcome(
            tool_call=audit_call,
            output=blocked_message,
            is_error=True,
            state=ToolExecutionState.BLOCKED,
        )
        for audit_call in audit_calls
    ]
    executor = FakeExecutor(blocked_outcomes)
    emitted_events: list[AgentEvent] = []

    async def emit(event: AgentEvent) -> None:
        emitted_events.append(event)

    batch_result = await batch_runner.execute_batch(
        tool_calls=audit_calls,
        tool_source="native",
        pending_tool_calls_seen=set(),
        emit=emit,
        summary=TurnSummary(final_response=""),
        dod=dod,
        executor=executor,
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert batch_result.halted is False
    assert batch_result.consecutive_errors == 0
    assert runtime_context.workflow_mode == "verify"
    assert steering_messages
    assert any(
        "move to verification or final confirmation" in message for message in steering_messages
    )
6658
6659
def test_tool_batch_runner_blocked_html_declared_target_nudge_uses_closest_declared_target(
    temp_dir: Path,
) -> None:
    """Check that the declared-target nudge surfaces the closest declared link target.

    The blocked message ends with a "Closest declared local targets include"
    clause. The first steering message must mention the file being written,
    the backtick-quoted closest target, and the "same file now" wording.
    """

    async def assess_confidence(tool_name: str, tool_args: dict, context: str) -> ConfidenceAssessment:
        # Fails the test loudly if the runner ever asks for a confidence score.
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def verify_action(
        tool_name: str, tool_args: dict, result: str, expected: str = ""
    ) -> ActionVerification:
        # Fails the test loudly if the runner ever asks for action verification.
        raise AssertionError("Verification should not run in this scenario")

    runtime_context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
    )
    steering_messages: list[str] = []
    runtime_context.queue_steering_message_callback = steering_messages.append
    batch_runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))

    written_page = str(temp_dir / "guide" / "chapters" / "01-introduction.html")
    blocked_write = ToolCall(
        id="write-ch1",
        name="write",
        arguments={"file_path": written_page},
    )
    blocked_message = (
        "[Blocked - HTML page introduces new local targets outside the current declared artifact set] "
        "Suggestion: Keep non-root HTML pages within the root-declared local-link set and avoid "
        "introducing new sibling targets that the guide root does not declare, for example fix: 02-setup.html. "
        "Already-declared local targets include: chapters/01-introduction.html, chapters/02-installation.html, "
        "chapters/03-configuration.html. Closest declared local targets include: chapters/02-installation.html"
    )

    batch_runner._queue_blocked_html_declared_target_nudge(blocked_write, blocked_message)

    assert steering_messages
    nudge = steering_messages[0]
    for expected_fragment in (
        written_page,
        "`chapters/02-installation.html`",
        "same file now",
    ):
        assert expected_fragment in nudge
6708
6709
def test_tool_batch_runner_blocked_html_declared_target_nudge_without_close_match(
    temp_dir: Path,
) -> None:
    """Check the declared-target nudge when the blocked message offers no closest match.

    The blocked message lists already-declared targets but has no "Closest
    declared local targets" clause. The nudge must fall back to the generic
    "remove the invented hrefs" guidance and must not mention closest targets.
    """

    async def assess_confidence(tool_name: str, tool_args: dict, context: str) -> ConfidenceAssessment:
        # Fails the test loudly if the runner ever asks for a confidence score.
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def verify_action(
        tool_name: str, tool_args: dict, result: str, expected: str = ""
    ) -> ActionVerification:
        # Fails the test loudly if the runner ever asks for action verification.
        raise AssertionError("Verification should not run in this scenario")

    runtime_context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
    )
    steering_messages: list[str] = []
    runtime_context.queue_steering_message_callback = steering_messages.append
    batch_runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))

    blocked_write = ToolCall(
        id="write-ch1",
        name="write",
        arguments={"file_path": str(temp_dir / "guide" / "chapters" / "introduction.html")},
    )
    blocked_message = (
        "[Blocked - HTML page introduces new local targets outside the current declared artifact set] "
        "Suggestion: Keep non-root HTML pages within the root-declared local-link set and avoid "
        "introducing new sibling targets that the guide root does not declare; remove or replace "
        "undeclared hrefs like: troubleshooting.html. "
        "Already-declared local targets include: chapters/introduction.html, chapters/installation.html, "
        "chapters/configuration.html."
    )

    batch_runner._queue_blocked_html_declared_target_nudge(blocked_write, blocked_message)

    assert steering_messages
    nudge = steering_messages[0]
    assert "Remove the invented hrefs or keep local links within the declared target set" in nudge
    assert "`chapters/installation.html`" in nudge
    assert "closest declared target(s)" not in nudge
6759
6760
def test_tool_batch_runner_blocked_html_declared_file_creation_nudge_points_to_root(
    temp_dir: Path,
) -> None:
    """Check that a blocked undeclared-file creation nudges toward updating the guide root.

    Nothing exists on disk and the definition of done has no plan. The first
    steering message must mention updating, the resolved guide root path, the
    undeclared page, and retrying the file creation.
    """

    async def assess_confidence(tool_name: str, tool_args: dict, context: str) -> ConfidenceAssessment:
        # Fails the test loudly if the runner ever asks for a confidence score.
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def verify_action(
        tool_name: str, tool_args: dict, result: str, expected: str = ""
    ) -> ActionVerification:
        # Fails the test loudly if the runner ever asks for action verification.
        raise AssertionError("Verification should not run in this scenario")

    runtime_context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
    )
    steering_messages: list[str] = []
    runtime_context.queue_steering_message_callback = steering_messages.append
    batch_runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))
    dod = create_definition_of_done("Create a guide.")

    target = temp_dir / "guide" / "chapters" / "troubleshooting.html"
    # Resolved once and reused for both the blocked message and the assertion.
    guide_index = (temp_dir / "guide" / "index.html").resolve(strict=False)
    blocked_write = ToolCall(
        id="write-troubleshooting",
        name="write",
        arguments={"file_path": str(target)},
    )
    blocked_message = (
        "[Blocked - HTML file creation falls outside the current declared artifact set] "
        "Suggestion: Keep new non-root HTML files within the root-declared artifact set and "
        f"update the guide root `{guide_index}` "
        "before creating undeclared sibling pages, for example: chapters/troubleshooting.html. "
        "Already-declared local targets include: chapters/advanced-topics.html, "
        "chapters/basic-usage.html, chapters/configuration.html"
    )

    batch_runner._queue_blocked_html_declared_file_creation_nudge(
        blocked_write,
        blocked_message,
        dod=dod,
    )

    assert steering_messages
    nudge = steering_messages[0]
    assert "update" in nudge.lower()
    assert str(guide_index) in nudge
    assert "`chapters/troubleshooting.html`" in nudge
    assert "retry the file creation" in nudge
6814
6815
def test_tool_batch_runner_blocked_html_declared_file_creation_after_outputs_exist_prefers_verify(
    temp_dir: Path,
) -> None:
    """Check that an undeclared-file block after all planned outputs exist prefers verification.

    The plan's three files are on disk and recorded as touched. The nudge for
    an extra chapter must say not to expand the output set and to move on to
    verification, and must not suggest updating the guide root.
    """

    async def assess_confidence(tool_name: str, tool_args: dict, context: str) -> ConfidenceAssessment:
        # Fails the test loudly if the runner ever asks for a confidence score.
        raise AssertionError("Confidence scoring should not run in this scenario")

    async def verify_action(
        tool_name: str, tool_args: dict, result: str, expected: str = ""
    ) -> ActionVerification:
        # Fails the test loudly if the runner ever asks for action verification.
        raise AssertionError("Verification should not run in this scenario")

    guide = temp_dir / "guide"
    chapters = guide / "chapters"
    index = guide / "index.html"
    intro_page = chapters / "01-introduction.html"
    install_page = chapters / "02-installation.html"

    guide.mkdir()
    chapters.mkdir()
    index_links = [
        '<a href="chapters/01-introduction.html">Intro</a>',
        '<a href="chapters/02-installation.html">Install</a>',
        '<a href="../index.html">Back</a>',
        "",
    ]
    index.write_text("\n".join(index_links))
    intro_page.write_text("<html></html>\n")
    install_page.write_text("<html></html>\n")

    plan_lines = [
        "# Implementation Plan",
        "",
        "## File Changes",
        f"- `{index}`",
        f"- `{intro_page}`",
        f"- `{install_page}`",
        "",
    ]
    implementation_plan = temp_dir / "implementation.md"
    implementation_plan.write_text("\n".join(plan_lines))

    runtime_context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
    )
    steering_messages: list[str] = []
    runtime_context.queue_steering_message_callback = steering_messages.append
    batch_runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))

    dod = create_definition_of_done("Create a guide.")
    dod.implementation_plan = str(implementation_plan)
    dod.verification_commands = [f"ls -la {guide}"]
    dod.touched_files = [str(index), str(intro_page), str(install_page)]

    target = guide / "chapters" / "08-advanced-configuration.html"
    blocked_write = ToolCall(
        id="write-extra",
        name="write",
        arguments={"file_path": str(target)},
    )
    blocked_message = (
        "[Blocked - HTML file creation falls outside the current declared artifact set] "
        "Suggestion: Keep new non-root HTML files within the root-declared artifact set and "
        f"update the guide root `{index.resolve(strict=False)}` before creating undeclared sibling pages, "
        "for example: chapters/08-advanced-configuration.html."
    )

    batch_runner._queue_blocked_html_declared_file_creation_nudge(
        blocked_write,
        blocked_message,
        dod=dod,
    )

    assert steering_messages
    nudge = steering_messages[0]
    for expected_fragment in (
        "All explicitly planned artifacts already exist on disk.",
        "Do not expand the output set with `chapters/08-advanced-configuration.html`.",
        "Move to verification or final confirmation using the files already on disk.",
    ):
        assert expected_fragment in nudge
    assert "update the guide root" not in nudge
6903
6904
def test_tool_batch_runner_blocked_html_missing_target_after_outputs_exist_prefers_verify(
    temp_dir: Path,
) -> None:
    """Check that a missing-link block after all planned outputs exist steers toward repair.

    The plan's three files are on disk and recorded as touched. When an edit
    of the guide root is blocked for linking to a missing chapter, the nudge
    must say not to add new link targets and to repair the existing files.
    """

    async def assess_confidence(tool_name: str, tool_args: dict, context: str) -> ConfidenceAssessment:
        # Fails the test loudly if the runner ever asks for a confidence score.
        raise AssertionError("Confidence scoring should not run in this scenario")

    async def verify_action(
        tool_name: str, tool_args: dict, result: str, expected: str = ""
    ) -> ActionVerification:
        # Fails the test loudly if the runner ever asks for action verification.
        raise AssertionError("Verification should not run in this scenario")

    guide = temp_dir / "guide"
    chapters = guide / "chapters"
    index = guide / "index.html"
    intro_page = chapters / "01-introduction.html"
    install_page = chapters / "02-installation.html"

    guide.mkdir()
    chapters.mkdir()
    index_links = [
        '<a href="chapters/01-introduction.html">Intro</a>',
        '<a href="chapters/02-installation.html">Install</a>',
        '<a href="../index.html">Back</a>',
        "",
    ]
    index.write_text("\n".join(index_links))
    intro_page.write_text("<html></html>\n")
    install_page.write_text("<html></html>\n")

    plan_lines = [
        "# Implementation Plan",
        "",
        "## File Changes",
        f"- `{index}`",
        f"- `{intro_page}`",
        f"- `{install_page}`",
        "",
    ]
    implementation_plan = temp_dir / "implementation.md"
    implementation_plan.write_text("\n".join(plan_lines))

    runtime_context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
    )
    steering_messages: list[str] = []
    runtime_context.queue_steering_message_callback = steering_messages.append
    batch_runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))

    dod = create_definition_of_done("Create a guide.")
    dod.implementation_plan = str(implementation_plan)
    dod.verification_commands = [f"ls -la {guide}"]
    dod.touched_files = [str(index), str(intro_page), str(install_page)]

    blocked_edit = ToolCall(
        id="edit-root",
        name="edit",
        arguments={"file_path": str(index)},
    )
    blocked_message = (
        "[Blocked - Edited HTML links point to files that do not exist] "
        "Suggestion: Use only existing local targets for href values and avoid introducing missing links, "
        "for example fix: chapters/08-advanced-configuration.html"
    )

    batch_runner._queue_blocked_html_missing_target_nudge(
        blocked_edit,
        blocked_message,
        dod=dod,
    )

    assert steering_messages
    nudge = steering_messages[0]
    for expected_fragment in (
        "All explicitly planned artifacts already exist on disk.",
        "Do not introduce new local-link targets beyond the current output set.",
        "Repair the existing generated files instead of expanding the guide.",
    ):
        assert expected_fragment in nudge
6989
6990
@pytest.mark.asyncio
async def test_tool_batch_runner_blocked_empty_file_path_nudges_concrete_next_artifact(
    temp_dir: Path,
) -> None:
    """Check that a `write` blocked for an empty `file_path` is nudged to the next planned file.

    Two of the three planned files exist; the second chapter does not. After
    the batch runs, the first steering message must name that chapter as the
    file to create, and the blocked message must be recorded as the error of
    the latest recovery attempt on the context.
    """

    async def assess_confidence(tool_name: str, tool_args: dict, context: str) -> ConfidenceAssessment:
        # Fails the test loudly if the runner ever asks for a confidence score.
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def verify_action(
        tool_name: str, tool_args: dict, result: str, expected: str = ""
    ) -> ActionVerification:
        # Fails the test loudly if the runner ever asks for action verification.
        raise AssertionError("Verification should not run in this scenario")

    guide_root = temp_dir / "guides" / "nginx"
    chapters = guide_root / "chapters"
    index_path = guide_root / "index.html"
    chapter_one = chapters / "01-introduction.html"
    # Planned but deliberately never written, so it is the next artifact to create.
    chapter_two = chapters / "02-installation.html"

    chapters.mkdir(parents=True)
    index_path.write_text("<html></html>\n")
    chapter_one.write_text("<h1>Intro</h1>\n")

    plan_lines = [
        "# Implementation Plan",
        "",
        "## File Changes",
        f"- `{index_path}`",
        f"- `{chapter_one}`",
        f"- `{chapter_two}`",
        "",
    ]
    implementation_plan = temp_dir / "implementation.md"
    implementation_plan.write_text("\n".join(plan_lines))

    runtime_context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )
    steering_messages: list[str] = []
    runtime_context.queue_steering_message_callback = steering_messages.append
    batch_runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))

    tool_call = ToolCall(
        id="write-2",
        name="write",
        arguments={"file_path": "", "content": "<html></html>\n"},
    )
    blocked_message = "[Blocked - Empty file path] Suggestion: Provide a valid file path"
    blocked_result = Message.tool_result_message(
        tool_call_id=tool_call.id,
        display_content=blocked_message,
        result_content=blocked_message,
        is_error=True,
    )
    blocked_outcome = ToolExecutionOutcome(
        tool_call=tool_call,
        state=ToolExecutionState.BLOCKED,
        message=blocked_result,
        event_content=blocked_message,
        is_error=True,
        result_output=blocked_message,
    )
    executor = FakeExecutor([blocked_outcome])

    dod = create_definition_of_done("Create a multi-file nginx guide.")
    dod.implementation_plan = str(implementation_plan)
    dod.touched_files.extend([str(index_path), str(chapter_one)])
    dod.pending_items.append("Creating Chapter 2: Installation and Setup")

    await batch_runner.execute_batch(
        tool_calls=[tool_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=TurnSummary(final_response=""),
        dod=dod,
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert steering_messages
    nudge = steering_messages[0]
    assert "did not provide a valid `file_path`" in nudge
    assert "Resume by creating `02-installation.html` now." in nudge
    assert (
        f"Prefer one `write` call for `{display_runtime_path(chapter_two)}` instead of more rereads."
        in nudge
    )
    assert runtime_context.recovery_context is not None
    assert runtime_context.recovery_context.attempts[-1].error == blocked_message