Python · 284971 bytes Raw Blame History
1 """Tests for tool-batch execution on RuntimeContext."""
2
3 from __future__ import annotations
4
5 from pathlib import Path
6 from types import SimpleNamespace
7
8 import pytest
9
10 from loader.llm.base import Message, Role, ToolCall
11 from loader.runtime.context import RuntimeContext
12 from loader.runtime.dod import (
13 DefinitionOfDoneStore,
14 VerificationEvidence,
15 create_definition_of_done,
16 )
17 from loader.runtime.events import AgentEvent, TurnSummary
18 from loader.runtime.executor import ToolExecutionOutcome, ToolExecutionState
19 from loader.runtime.path_display import display_runtime_path
20 from loader.runtime.permissions import (
21 PermissionMode,
22 build_permission_policy,
23 load_permission_rules,
24 )
25 from loader.runtime.reasoning_types import (
26 ActionVerification,
27 ConfidenceAssessment,
28 ConfidenceLevel,
29 )
30 from loader.runtime.recovery import RecoveryContext
31 from loader.runtime.tool_batches import (
32 ToolBatchRunner,
33 )
34 from loader.runtime.tool_batches import (
35 _should_prioritize_missing_artifact as tool_batches_should_prioritize_missing_artifact,
36 )
37 from loader.runtime.workflow import sync_todos_to_definition_of_done
38 from loader.tools.base import ToolResult as RegistryToolResult
39 from loader.tools.base import create_default_registry
40 from tests.helpers.runtime_harness import ScriptedBackend
41
42
class FakeSession:
    """In-memory session double used by the tests in this module.

    It stores a private copy of the seeded messages and collects workflow
    timeline entries so a test can inspect what was appended.
    """

    def __init__(self, messages: list[Message]) -> None:
        # Copy the input so later appends never leak into the caller's list.
        self.messages = [*messages]
        self.workflow_timeline = []

    def append(self, message: Message) -> None:
        """Record ``message`` at the end of the message list."""
        self.messages.append(message)

    def append_workflow_timeline_entry(self, entry) -> None:
        """Record ``entry`` at the end of the workflow timeline."""
        self.workflow_timeline.append(entry)
53
54
class FakeCodeFilter:
    """Code-filter double whose only method, ``reset``, is a no-op."""

    def reset(self) -> None:
        """Do nothing and return ``None``; there is no state to clear."""
58
59
class FakeSafeguards:
    """Safeguards double with pass-through filters and a scripted loop result.

    Only ``detect_loop`` is configurable; every other method returns a fixed
    value and ignores its argument.
    """

    def __init__(self, *, detect_loop_result: tuple[bool, str] = (False, "")) -> None:
        # Value handed back verbatim by ``detect_loop``.
        self._detect_loop_result = detect_loop_result
        self.code_filter = FakeCodeFilter()
        # Opaque placeholders; this class never calls into them.
        self.action_tracker = object()
        self.validator = object()

    def filter_stream_chunk(self, content: str) -> str:
        """Return the streamed chunk unchanged."""
        return content

    def filter_complete_content(self, content: str) -> str:
        """Return the complete content unchanged."""
        return content

    def should_steer(self) -> bool:
        """Always report that no steering is needed."""
        return False

    def get_steering_message(self) -> str | None:
        """Always return ``None`` (there is never a steering message)."""
        return None

    def record_response(self, content: str) -> None:
        """Accept a response and discard it."""
        return None

    def detect_text_loop(self, content: str) -> tuple[bool, str]:
        """Always report that no text loop was found."""
        return (False, "")

    def detect_loop(self) -> tuple[bool, str]:
        """Return the result supplied to the constructor."""
        return self._detect_loop_result
87
88
class FakeExecutor:
    """Executor double that replays queued outcomes in first-in, first-out order."""

    def __init__(self, outcomes: list[ToolExecutionOutcome]) -> None:
        # Private copy of the scripted outcomes; consumed from the front.
        self._outcomes = [*outcomes]
        # Every tool call received, in arrival order.
        self.calls: list[ToolCall] = []

    async def execute_tool_call(self, tool_call: ToolCall, **_: object) -> ToolExecutionOutcome:
        """Record ``tool_call`` and hand back the next scripted outcome.

        Extra keyword arguments are accepted and ignored.

        Raises:
            AssertionError: if no outcome is left in the queue. The call is
                still recorded in ``calls`` before the error is raised.
        """
        self.calls.append(tool_call)
        if self._outcomes:
            return self._outcomes.pop(0)
        raise AssertionError("No fake tool outcome queued")
99
100
def build_context(
    *,
    temp_dir: Path,
    messages: list[Message],
    safeguards: FakeSafeguards,
    assess_confidence,
    verify_action,
    recovery_context: RecoveryContext | None = None,
    confidence_scoring: bool = False,
    verification: bool = False,
    auto_recover: bool = True,
    min_confidence_for_action: int = 3,
) -> RuntimeContext:
    """Assemble a ``RuntimeContext`` rooted at ``temp_dir`` for these tests.

    The registry and permission policy come from the real factories, while
    the session, config, capability profile and reasoning services are
    lightweight stand-ins.

    Args:
        temp_dir: Directory used as project root and workspace root.
        messages: Messages used to seed the fake session.
        safeguards: Safeguards double placed on the context.
        assess_confidence: Callable exposed as ``reasoning.assess_confidence``.
        verify_action: Callable exposed as ``reasoning.verify_action``.
        recovery_context: Optional recovery context to pre-populate.
        confidence_scoring: Value for ``config.reasoning.confidence_scoring``.
        verification: Value for ``config.reasoning.verification``.
        auto_recover: Value for ``config.auto_recover``.
        min_confidence_for_action: Value for
            ``config.reasoning.min_confidence_for_action``.

    Returns:
        The constructed ``RuntimeContext`` with ``workflow_mode`` set to
        ``"execute"``.
    """
    registry = create_default_registry(temp_dir)
    registry.configure_workspace_root(temp_dir)

    rule_status = load_permission_rules(temp_dir)
    permission_policy = build_permission_policy(
        active_mode=PermissionMode.WORKSPACE_WRITE,
        workspace_root=temp_dir,
        tool_requirements=registry.get_tool_requirements(),
        rules=rule_status.rules,
    )

    backend = ScriptedBackend()
    session = FakeSession(messages)
    reasoning_settings = SimpleNamespace(
        rollback=False,
        show_rollback_plan=False,
        completion_check=True,
        max_continuation_prompts=5,
        self_critique=False,
        confidence_scoring=confidence_scoring,
        min_confidence_for_action=min_confidence_for_action,
        verification=verification,
    )
    runtime_config = SimpleNamespace(
        force_react=False,
        max_recovery_attempts=2,
        auto_recover=auto_recover,
        reasoning=reasoning_settings,
    )
    capability_profile = SimpleNamespace(supports_native_tools=True)
    reasoning_services = SimpleNamespace(
        assess_confidence=assess_confidence,
        verify_action=verify_action,
    )

    return RuntimeContext(
        project_root=temp_dir,
        backend=backend,
        registry=registry,
        session=session,  # type: ignore[arg-type]
        config=runtime_config,
        capability_profile=capability_profile,  # type: ignore[arg-type]
        project_context=None,
        permission_policy=permission_policy,
        permission_config_status=rule_status,
        workflow_mode="execute",
        safeguards=safeguards,
        reasoning=reasoning_services,
        recovery_context=recovery_context,
    )
156
157
def tool_outcome(
    *,
    tool_call: ToolCall,
    output: str,
    is_error: bool,
    state: ToolExecutionState = ToolExecutionState.EXECUTED,
    metadata: dict[str, object] | None = None,
) -> ToolExecutionOutcome:
    """Build a ``ToolExecutionOutcome`` whose text fields all equal ``output``.

    Args:
        tool_call: The call the outcome belongs to; its ``id`` links the
            tool-result message.
        output: Text used for the display content, result content, event
            content, result output and registry output.
        is_error: Error flag copied to the message, outcome and registry result.
        state: Execution state to report; defaults to ``EXECUTED``.
        metadata: Registry-result metadata; a falsy value becomes a new
            empty dict.

    Returns:
        The assembled outcome.
    """
    result_message = Message.tool_result_message(
        tool_call_id=tool_call.id,
        display_content=output,
        result_content=output,
        is_error=is_error,
    )
    registry_result = RegistryToolResult(
        output=output,
        is_error=is_error,
        metadata=metadata if metadata else {},
    )
    return ToolExecutionOutcome(
        tool_call=tool_call,
        state=state,
        message=result_message,
        event_content=output,
        is_error=is_error,
        result_output=output,
        registry_result=registry_result,
    )
184
185
@pytest.mark.asyncio
async def test_tool_batch_runner_uses_context_for_confidence_gate(temp_dir: Path) -> None:
    """A LOW confidence assessment must keep the tool call from executing.

    Expectations: no action is reported, the executor is never called, the
    context given to the assessor contains the user's message, a
    low-confidence warning is appended as a user message, and a
    "confidence" event is emitted.
    """
    captured: dict[str, str] = {}

    async def assess_confidence(tool_name: str, tool_args: dict, context: str) -> ConfidenceAssessment:
        # Keep the context string so it can be inspected after the batch runs.
        captured["context"] = context
        return ConfidenceAssessment(
            action=f"{tool_name} with {tool_args}",
            tool_name=tool_name,
            tool_args=tool_args,
            level=ConfidenceLevel.LOW,
            reasoning="Need to inspect the target first.",
            risks=["Unknown target file"],
        )

    async def verify_action(tool_name: str, tool_args: dict, result: str, expected: str = "") -> ActionVerification:
        raise AssertionError("Verification should not run for skipped actions")

    history = [
        Message(role=Role.USER, content="Please inspect the project."),
        Message(role=Role.ASSISTANT, content="I will read the file next."),
    ]
    context = build_context(
        temp_dir=temp_dir,
        messages=history,
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        confidence_scoring=True,
        min_confidence_for_action=3,
    )
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))
    read_call = ToolCall(id="read-1", name="read", arguments={"file_path": "README.md"})
    emitted: list[AgentEvent] = []

    async def collect(event: AgentEvent) -> None:
        emitted.append(event)

    # One outcome is queued, but it should never be consumed.
    unused_outcome = tool_outcome(tool_call=read_call, output="unused", is_error=False)
    executor = FakeExecutor([unused_outcome])
    result = await runner.execute_batch(
        tool_calls=[read_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=collect,
        summary=TurnSummary(final_response=""),
        dod=create_definition_of_done("Read the docs"),
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert result.actions_taken == []
    assert executor.calls == []
    assert "Please inspect the project." in captured["context"]
    last_message = context.session.messages[-1]
    assert last_message.role == Role.USER
    assert "[LOW CONFIDENCE WARNING]" in last_message.content
    assert any(event.type == "confidence" for event in emitted)
245
246
@pytest.mark.asyncio
async def test_tool_batch_runner_tracks_recovery_with_legacy_context(temp_dir: Path) -> None:
    """A failing tool call with ``auto_recover`` on must start recovery tracking.

    Expectations: the context ends up with a recovery context, the tool
    result is recorded both in the summary and as the last session message,
    and a "recovery" event is emitted.
    """

    async def assess_confidence(tool_name: str, tool_args: dict, context: str) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def verify_action(tool_name: str, tool_args: dict, result: str, expected: str = "") -> ActionVerification:
        raise AssertionError("Verification should not run for failed actions")

    context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=True,
    )
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))
    bash_call = ToolCall(id="bash-1", name="bash", arguments={"command": "pytest"})
    failed_outcome = tool_outcome(tool_call=bash_call, output="command failed", is_error=True)
    executor = FakeExecutor([failed_outcome])
    summary = TurnSummary(final_response="")
    emitted: list[AgentEvent] = []

    async def collect(event: AgentEvent) -> None:
        emitted.append(event)

    await runner.execute_batch(
        tool_calls=[bash_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=collect,
        summary=summary,
        dod=create_definition_of_done("Run tests"),
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert context.recovery_context is not None
    assert summary.tool_result_messages
    assert context.session.messages[-1] == summary.tool_result_messages[-1]
    assert any(event.type == "recovery" for event in emitted)
290
291
@pytest.mark.asyncio
async def test_tool_batch_runner_emits_tool_metadata(temp_dir: Path) -> None:
    """The "tool_result" event must carry the registry result's metadata."""

    async def assess_confidence(tool_name: str, tool_args: dict, context: str) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def verify_action(tool_name: str, tool_args: dict, result: str, expected: str = "") -> ActionVerification:
        raise AssertionError("Verification should not run for this scenario")

    context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))
    server_call = ToolCall(
        id="bash-1",
        name="bash",
        arguments={"command": "python -m http.server 8000", "background": True},
    )
    metadata = {
        "job_id": "bash-1",
        "status": "running",
        "background": True,
    }
    started_outcome = tool_outcome(
        tool_call=server_call,
        output="Started bash job bash-1",
        is_error=False,
        metadata=metadata,
    )
    executor = FakeExecutor([started_outcome])
    emitted: list[AgentEvent] = []

    async def collect(event: AgentEvent) -> None:
        emitted.append(event)

    await runner.execute_batch(
        tool_calls=[server_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=collect,
        summary=TurnSummary(final_response=""),
        dod=create_definition_of_done("Launch a preview server"),
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    # The first "tool_result" event is the one under test.
    tool_result = next(event for event in emitted if event.type == "tool_result")
    assert tool_result.tool_metadata == metadata
350
351
@pytest.mark.asyncio
async def test_tool_batch_runner_verifies_with_context_services(temp_dir: Path) -> None:
    """With verification enabled, the context's ``verify_action`` must be used.

    Expectations: the verifier receives the tool output, the pre-existing
    recovery context stays in place and records the successful step, the
    tool result remains the last session message, and a "verification"
    event is emitted.
    """
    verification_calls: list[str] = []

    async def assess_confidence(tool_name: str, tool_args: dict, context: str) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def verify_action(tool_name: str, tool_args: dict, result: str, expected: str = "") -> ActionVerification:
        # Record what was verified, then report a failed verification.
        verification_calls.append(result)
        return ActionVerification(
            tool_name=tool_name,
            tool_args=tool_args,
            expected_outcome="Success",
            actual_result=result,
            verified=False,
            discrepancies=["File contents did not match"],
            needs_correction=True,
            correction_suggestion="Read the file before editing again.",
        )

    existing_recovery = RecoveryContext(
        original_tool="edit",
        original_args={"file_path": "README.md"},
    )
    context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        recovery_context=existing_recovery,
        verification=True,
    )
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))
    read_call = ToolCall(id="read-1", name="read", arguments={"file_path": "README.md"})
    read_outcome = tool_outcome(tool_call=read_call, output="file contents", is_error=False)
    executor = FakeExecutor([read_outcome])
    emitted: list[AgentEvent] = []

    async def collect(event: AgentEvent) -> None:
        emitted.append(event)

    await runner.execute_batch(
        tool_calls=[read_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=collect,
        summary=TurnSummary(final_response=""),
        dod=create_definition_of_done("Read the docs"),
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert verification_calls == ["file contents"]
    assert context.recovery_context is existing_recovery
    assert existing_recovery.successful_steps == [
        ("read", {"file_path": "README.md"})
    ]
    last_message = context.session.messages[-1]
    assert last_message.role == Role.TOOL
    assert last_message.content == "file contents"
    assert any(event.type == "verification" for event in emitted)
415
416
@pytest.mark.asyncio
async def test_tool_batch_runner_preserves_recovery_context_across_diagnostic_success(
    temp_dir: Path,
) -> None:
    """A successful ``bash`` ``ls`` call must not discard the active recovery context.

    The same recovery object has to remain on the context, with the
    successful call recorded in its ``successful_steps``.
    """

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run for this scenario")

    missing_chapter = "chapters/04-data-types.html"
    existing_recovery = RecoveryContext(
        original_tool="read",
        original_args={"file_path": missing_chapter},
    )
    # Seed one failed attempt so the recovery context is already in progress.
    existing_recovery.add_attempt(
        "read",
        {"file_path": missing_chapter},
        "File not found",
    )
    context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        recovery_context=existing_recovery,
        auto_recover=False,
    )
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))
    listing_call = ToolCall(
        id="bash-1",
        name="bash",
        arguments={"command": "ls chapters"},
    )
    listing_outcome = tool_outcome(
        tool_call=listing_call,
        output="01-introduction.html",
        is_error=False,
    )
    executor = FakeExecutor([listing_outcome])

    summary = TurnSummary(final_response="")
    await runner.execute_batch(
        tool_calls=[listing_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=summary,
        dod=create_definition_of_done("Fix the chapter links"),
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert context.recovery_context is existing_recovery
    assert existing_recovery.successful_steps == [
        ("bash", {"command": "ls chapters"})
    ]
483
484
@pytest.mark.asyncio
async def test_tool_batch_runner_clears_recovery_context_after_successful_mutation(
    temp_dir: Path,
) -> None:
    """A successful ``patch`` call must reset the context's recovery context to ``None``."""

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run for this scenario")

    missing_chapter = "chapters/04-data-types.html"
    existing_recovery = RecoveryContext(
        original_tool="read",
        original_args={"file_path": missing_chapter},
    )
    # Seed one failed attempt so the recovery context is already in progress.
    existing_recovery.add_attempt(
        "read",
        {"file_path": missing_chapter},
        "File not found",
    )
    context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        recovery_context=existing_recovery,
        auto_recover=False,
    )
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))
    single_hunk = {"old_start": 1, "old_lines": 1, "new_start": 1, "new_lines": 1, "lines": ["-a", "+b"]}
    patch_call = ToolCall(
        id="patch-1",
        name="patch",
        arguments={
            "file_path": "index.html",
            "hunks": [single_hunk],
        },
    )
    patch_outcome = tool_outcome(
        tool_call=patch_call,
        output="Patched index.html",
        is_error=False,
    )
    executor = FakeExecutor([patch_outcome])

    summary = TurnSummary(final_response="")
    await runner.execute_batch(
        tool_calls=[patch_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=summary,
        dod=create_definition_of_done("Fix the chapter links"),
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert context.recovery_context is None
551
552
@pytest.mark.asyncio
async def test_tool_batch_runner_queues_duplicate_observation_nudge(
    temp_dir: Path,
) -> None:
    """A DUPLICATE read must queue one persistent nudge toward the missing file.

    The implementation plan lists ``04-variables.html`` but the test never
    creates it on disk. After a duplicate read of ``index.html`` exactly one
    persistent steering message is expected, telling the model to reuse the
    earlier observation and to create that file; no ephemeral message is
    expected.
    """

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run for this scenario")

    index_path = temp_dir / "index.html"
    chapters_dir = temp_dir / "chapters"
    intro_path = chapters_dir / "01-introduction.html"
    setup_path = chapters_dir / "02-setup.html"
    basics_path = chapters_dir / "03-basics.html"
    variables_path = chapters_dir / "04-variables.html"

    glob_observation = Message(
        role=Role.TOOL,
        content=(
            "Observation [glob]: Result: "
            f"{temp_dir}/chapters/01-introduction.html\n"
            f"{temp_dir}/chapters/02-setup.html\n"
            f"{temp_dir}/chapters/03-basics.html"
        ),
        tool_results=[],
    )
    chapter_read_request = Message(
        role=Role.ASSISTANT,
        content="I already inspected the first chapter title.",
        tool_calls=[
            ToolCall(
                id="read-ch1",
                name="read",
                arguments={"file_path": str(intro_path)},
            )
        ],
    )
    chapter_read_result = Message.tool_result_message(
        tool_call_id="read-ch1",
        display_content="<h1>Chapter 1: Introduction to Fortran</h1>\n",
        result_content="<h1>Chapter 1: Introduction to Fortran</h1>\n",
    )
    index_read_request = Message(
        role=Role.ASSISTANT,
        content="I should update the index now.",
        tool_calls=[
            ToolCall(
                id="read-index",
                name="read",
                arguments={"file_path": str(index_path)},
            )
        ],
    )
    messages = [
        glob_observation,
        chapter_read_request,
        chapter_read_result,
        index_read_request,
    ]
    context = build_context(
        temp_dir=temp_dir,
        messages=messages,
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )

    # Files are created after the context is built, as in the original setup.
    chapters_dir.mkdir()
    index_path.write_text("<ul></ul>\n")
    intro_path.write_text("<h1>Intro</h1>\n")
    setup_path.write_text("<h1>Setup</h1>\n")
    basics_path.write_text("<h1>Basics</h1>\n")

    implementation_plan = temp_dir / "implementation.md"
    plan_lines = [
        "# Implementation Plan",
        "",
        "## File Changes",
        f"- `{index_path}`",
        f"- `{intro_path}`",
        f"- `{setup_path}`",
        f"- `{basics_path}`",
        f"- `{variables_path}`",
    ]
    implementation_plan.write_text("\n".join(plan_lines))

    context.session.current_task = (
        f"Update {index_path} with the right chapter links."
    )
    persistent_messages: list[str] = []
    ephemeral_messages: list[str] = []
    context.queue_steering_message_callback = persistent_messages.append
    context.queue_ephemeral_steering_message_callback = ephemeral_messages.append
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))
    duplicate_call = ToolCall(
        id="read-dup",
        name="read",
        arguments={"file_path": str(index_path)},
    )
    duplicate_message = (
        "[Skipped - duplicate action: Already read "
        f"{index_path} recently without any intervening changes; "
        "reuse the earlier read result instead of rereading]"
    )
    duplicate_outcome = ToolExecutionOutcome(
        tool_call=duplicate_call,
        state=ToolExecutionState.DUPLICATE,
        message=Message.tool_result_message(
            tool_call_id=duplicate_call.id,
            display_content=duplicate_message,
            result_content=duplicate_message,
        ),
        event_content=duplicate_message,
        is_error=False,
        result_output=duplicate_message,
    )
    executor = FakeExecutor([duplicate_outcome])

    summary = TurnSummary(final_response="")
    dod = create_definition_of_done("Fix the chapter links")
    dod.implementation_plan = str(implementation_plan)
    dod.pending_items.append("Create the remaining chapter files")
    await runner.execute_batch(
        tool_calls=[duplicate_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=summary,
        dod=dod,
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert len(persistent_messages) == 1
    nudge = persistent_messages[0]
    assert "Reuse the earlier observation instead of repeating it." in nudge
    assert "A declared output artifact is still missing." in nudge
    assert "Resume by creating `04-variables.html` now." in nudge
    assert (
        "Prefer one `write` call for "
        f"`{display_runtime_path(variables_path)}` instead of more rereads."
        in nudge
    )
    assert ephemeral_messages == []
702
703
@pytest.mark.asyncio
async def test_tool_batch_runner_duplicate_read_keeps_root_declared_missing_html_output_active(
    temp_dir: Path,
) -> None:
    """A duplicate read of the index must still steer toward a linked-but-missing chapter.

    ``index.html`` links to ``chapters/02-installation.html``, which is not
    created on disk, while the plan names only the index and the chapters
    directory. The single persistent nudge must keep the pending item active,
    ask for ``02-installation.html``, and must not claim that all planned
    artifacts already exist.
    """

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should not run for this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run for this scenario")

    guide_root = temp_dir / "guide"
    chapters = guide_root / "chapters"
    chapters.mkdir(parents=True)
    index = guide_root / "index.html"
    chapter_one = chapters / "01-introduction.html"
    index.write_text(
        '<a href="chapters/01-introduction.html">Intro</a>\n'
        '<a href="chapters/02-installation.html">Install</a>\n'
    )
    chapter_one.write_text("<h1>Intro</h1>\n")

    implementation_plan = temp_dir / "implementation.md"
    plan_lines = [
        "# Implementation Plan",
        "",
        "## File Changes",
        f"- `{index}`",
        f"- `{chapters}/` (directory for chapter files)",
    ]
    implementation_plan.write_text("\n".join(plan_lines))

    earlier_read = Message(
        role=Role.ASSISTANT,
        content="I should keep building the guide.",
        tool_calls=[
            ToolCall(
                id="read-index",
                name="read",
                arguments={"file_path": str(index)},
            )
        ],
    )
    context = build_context(
        temp_dir=temp_dir,
        messages=[earlier_read],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )
    context.session.current_task = f"Build the guide rooted at {index}."
    persistent_messages: list[str] = []
    ephemeral_messages: list[str] = []
    context.queue_steering_message_callback = persistent_messages.append
    context.queue_ephemeral_steering_message_callback = ephemeral_messages.append
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))
    duplicate_call = ToolCall(
        id="read-dup-rooted",
        name="read",
        arguments={"file_path": str(index)},
    )
    duplicate_message = (
        "[Skipped - duplicate action: Already read "
        f"{index} recently without any intervening changes; "
        "reuse the earlier read result instead of rereading]"
    )
    duplicate_outcome = ToolExecutionOutcome(
        tool_call=duplicate_call,
        state=ToolExecutionState.DUPLICATE,
        message=Message.tool_result_message(
            tool_call_id=duplicate_call.id,
            display_content=duplicate_message,
            result_content=duplicate_message,
        ),
        event_content=duplicate_message,
        is_error=False,
        result_output=duplicate_message,
    )
    executor = FakeExecutor([duplicate_outcome])

    summary = TurnSummary(final_response="")
    dod = create_definition_of_done("Create a multi-file HTML guide with chapters.")
    dod.implementation_plan = str(implementation_plan)
    dod.touched_files = [str(index), str(chapter_one)]
    dod.completed_items = ["Create chapter files with appropriate content"]
    dod.pending_items.append("Create the remaining chapter files")

    await runner.execute_batch(
        tool_calls=[duplicate_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=summary,
        dod=dod,
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert len(persistent_messages) == 1
    nudge = persistent_messages[0]
    assert "Create the remaining chapter files" in nudge
    assert "Resume by creating `02-installation.html` now." in nudge
    assert "All explicitly planned artifacts already exist on disk." not in nudge
    assert ephemeral_messages == []
827
828
@pytest.mark.asyncio
async def test_tool_batch_runner_duplicate_read_after_edit_mismatch_steers_to_mutation(
    temp_dir: Path,
) -> None:
    """A duplicate read after a failed ``edit`` must steer toward a concrete mutation.

    The recovery context records an ``edit`` whose ``old_string`` was not
    found. When the same file is then read again and reported as DUPLICATE,
    exactly one persistent steering message is expected, and it must mention
    the last edit, the ``old_string`` mismatch, and a ``write`` with
    complete replacement content.
    """

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should not run for this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run for this scenario")

    target = temp_dir / "guide" / "chapters" / "02-installation.html"
    target.parent.mkdir(parents=True)
    target.write_text(
        "<h1>Chapter 2: Installation Guide</h1>\n"
        "<p>This chapter is still too thin.</p>\n"
    )

    failed_edit_args = {
        "file_path": str(target),
        "old_string": "<h1>Installation</h1>",
        "new_string": "<h1>Installation</h1><p>Expanded.</p>",
    }
    # Each consumer gets its own dict copy, matching the original's two
    # separate literals.
    recovery_context = RecoveryContext(
        original_tool="edit",
        original_args=dict(failed_edit_args),
        max_retries=2,
    )
    recovery_context.add_attempt(
        "edit",
        dict(failed_edit_args),
        "old_string not found in file. Make sure it matches exactly.",
    )
    context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        recovery_context=recovery_context,
        auto_recover=False,
    )
    persistent_messages: list[str] = []
    context.queue_steering_message_callback = persistent_messages.append
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))
    duplicate_call = ToolCall(
        id="read-dup-after-edit-miss",
        name="read",
        arguments={"file_path": str(target)},
    )
    duplicate_message = (
        "[Skipped - duplicate action: Already read "
        f"{target} recently without any intervening changes; "
        "reuse the earlier read result instead of rereading]"
    )
    duplicate_outcome = ToolExecutionOutcome(
        tool_call=duplicate_call,
        state=ToolExecutionState.DUPLICATE,
        message=Message.tool_result_message(
            tool_call_id=duplicate_call.id,
            display_content=duplicate_message,
            result_content=duplicate_message,
        ),
        event_content=duplicate_message,
        is_error=False,
        result_output=duplicate_message,
    )
    executor = FakeExecutor([duplicate_outcome])
    dod = create_definition_of_done("Expand thin generated guide chapters.")

    await runner.execute_batch(
        tool_calls=[duplicate_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=TurnSummary(final_response=""),
        dod=dod,
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert len(persistent_messages) == 1
    nudge = persistent_messages[0]
    assert "last edit" in nudge
    assert "`old_string` did not exactly match" in nudge
    assert "send one concrete mutation now" in nudge
    assert "`write` with the complete replacement content" in nudge
931
932
@pytest.mark.asyncio
async def test_tool_batch_runner_todo_write_does_not_regress_completed_file_todo(
    temp_dir: Path,
) -> None:
    """A stale ``TodoWrite`` must not move a finished file todo back to pending.

    The batch first writes ``03-first-website.html`` and then submits a todo
    list that still marks that file as pending. Afterwards the chapter-3 todo
    must be completed and not pending, while the chapter-4 todo stays pending.
    """

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should not run for this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run for this scenario")

    def pending_todos() -> list[dict[str, str]]:
        # Build fresh list and dict objects on every call so the three
        # consumers below never share mutable state.
        return [
            {
                "content": "Create 03-first-website.html",
                "active_form": "Creating 03-first-website.html",
                "status": "pending",
            },
            {
                "content": "Create 04-configuration-basics.html",
                "active_form": "Creating 04-configuration-basics.html",
                "status": "pending",
            },
        ]

    context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))
    dod = create_definition_of_done("Create a multi-file nginx guide.")
    sync_todos_to_definition_of_done(dod, pending_todos())

    chapter_path = temp_dir / "guides" / "nginx" / "chapters" / "03-first-website.html"
    chapter_path.parent.mkdir(parents=True)
    write_call = ToolCall(
        id="write-ch3",
        name="write",
        arguments={"file_path": str(chapter_path), "content": "<html></html>\n"},
    )
    stale_todo_call = ToolCall(
        id="todo-stale",
        name="TodoWrite",
        arguments={"todos": pending_todos()},
    )
    write_outcome = tool_outcome(
        tool_call=write_call,
        output=f"Successfully wrote {chapter_path}",
        is_error=False,
    )
    stale_todo_outcome = tool_outcome(
        tool_call=stale_todo_call,
        output="Todos updated",
        is_error=False,
        metadata={"new_todos": pending_todos()},
    )
    executor = FakeExecutor([write_outcome, stale_todo_outcome])

    summary = TurnSummary(final_response="")
    await runner.execute_batch(
        tool_calls=[write_call, stale_todo_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=summary,
        dod=dod,
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert "Create 03-first-website.html" in dod.completed_items
    assert "Create 03-first-website.html" not in dod.pending_items
    assert "Create 04-configuration-basics.html" in dod.pending_items
1050
1051
@pytest.mark.asyncio
async def test_tool_batch_runner_proactively_queues_verified_html_inventory(
    temp_dir: Path,
) -> None:
    """Feed one successful ``glob`` over ``chapters/`` through the batch runner.

    The session task asks to update ``index.html`` and the scripted executor
    returns the two chapter paths.  The assertions require that neither
    steering callback received a message, that exactly one tool-result
    message was recorded, and that it lacks the
    ``Verified chapter inventory:`` marker.

    NOTE(review): the test name says "proactively queues", yet the assertions
    demand that nothing is queued -- confirm which behavior is intended.
    """

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        # Raises unconditionally; this scenario is not expected to call it.
        raise AssertionError("Verification should not run for this scenario")

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        # Raises unconditionally; this scenario is not expected to call it.
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    # On-disk fixture: two chapter files plus an index with an empty list.
    chapters_dir = temp_dir / "chapters"
    chapters_dir.mkdir()
    intro_path = chapters_dir / "01-introduction.html"
    setup_path = chapters_dir / "02-setup.html"
    intro_path.write_text("<h1>Chapter 1: Introduction to Fortran</h1>\n")
    setup_path.write_text("<h1>Chapter 2: Setting Up Your Environment</h1>\n")
    index_path = temp_dir / "index.html"
    index_path.write_text("<ul></ul>\n")

    runtime_context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )
    runtime_context.session.current_task = (
        f"Update {index_path} so the chapter links match the sibling files."
    )
    # Capture whatever gets handed to either steering callback.
    steering_log: list[str] = []
    ephemeral_log: list[str] = []
    runtime_context.queue_steering_message_callback = steering_log.append
    runtime_context.queue_ephemeral_steering_message_callback = ephemeral_log.append
    batch_runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))

    glob_call = ToolCall(
        id="glob-1",
        name="glob",
        arguments={"path": str(chapters_dir), "pattern": "*.html"},
    )
    glob_outcome = tool_outcome(
        tool_call=glob_call,
        output=f"{intro_path}\n{setup_path}",
        is_error=False,
    )
    scripted_executor = FakeExecutor([glob_outcome])

    turn_summary = TurnSummary(final_response="")
    definition_of_done = create_definition_of_done("Fix the chapter links")
    await batch_runner.execute_batch(
        tool_calls=[glob_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=turn_summary,
        dod=definition_of_done,
        executor=scripted_executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert steering_log == []
    assert ephemeral_log == []
    recorded = turn_summary.tool_result_messages
    assert len(recorded) == 1
    assert "Verified chapter inventory:" not in recorded[0].content
1136
1137
@pytest.mark.asyncio
async def test_tool_batch_runner_marks_validated_html_toc_completion_after_successful_edit(
    temp_dir: Path,
) -> None:
    """Run a successful ``edit`` of ``index.html`` whose result already matches the chapters.

    ``index.html`` is written with the corrected link list before the batch
    runs, and the scripted ``edit`` call swaps the stale list for that same
    corrected list.  The assertions require that no tool-result message
    contains ``Semantic verification preview:`` and that neither steering
    callback received a message.
    """

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        # Raises unconditionally; this scenario is not expected to call it.
        raise AssertionError("Verification should not run for this scenario")

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        # Raises unconditionally; this scenario is not expected to call it.
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    chapters_dir = temp_dir / "chapters"
    chapters_dir.mkdir()
    for file_name, heading in (
        ("01-introduction.html", "<h1>Chapter 1: Introduction to Fortran</h1>\n"),
        ("02-setup.html", "<h1>Chapter 2: Setting Up Your Environment</h1>\n"),
    ):
        (chapters_dir / file_name).write_text(heading)

    index_path = temp_dir / "index.html"
    stale_toc = "".join(
        [
            '<ul class="chapter-list">\n',
            ' <li><a href="chapters/01-old.html">Chapter 1: Old</a></li>\n',
            ' <li><a href="chapters/02-old.html">Chapter 2: Old</a></li>\n',
            "</ul>\n",
        ]
    )
    fresh_toc = "".join(
        [
            '<ul class="chapter-list">\n',
            ' <li><a href="chapters/01-introduction.html">Chapter 1: Introduction to Fortran</a></li>\n',
            ' <li><a href="chapters/02-setup.html">Chapter 2: Setting Up Your Environment</a></li>\n',
            "</ul>\n",
        ]
    )
    # The file on disk already holds the post-edit content.
    index_path.write_text(fresh_toc)

    task = (
        "Update index.html so every chapter link and title matches the real HTML files in chapters/."
    )
    runtime_context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )
    runtime_context.session.current_task = task
    steering_log: list[str] = []
    ephemeral_log: list[str] = []
    runtime_context.queue_steering_message_callback = steering_log.append
    runtime_context.queue_ephemeral_steering_message_callback = ephemeral_log.append
    batch_runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))

    edit_call = ToolCall(
        id="edit-1",
        name="edit",
        arguments={
            "file_path": str(index_path),
            "old_string": stale_toc,
            "new_string": fresh_toc,
        },
    )
    edit_outcome = tool_outcome(
        tool_call=edit_call,
        output=f"Successfully edited {index_path}",
        is_error=False,
    )
    scripted_executor = FakeExecutor([edit_outcome])

    turn_summary = TurnSummary(final_response="")
    definition_of_done = create_definition_of_done(task)
    await batch_runner.execute_batch(
        tool_calls=[edit_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=turn_summary,
        dod=definition_of_done,
        executor=scripted_executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert not any(
        "Semantic verification preview:" in recorded.content
        for recorded in turn_summary.tool_result_messages
    )
    assert steering_log == []
    assert ephemeral_log == []
1238
1239
@pytest.mark.asyncio
async def test_tool_batch_runner_does_not_apply_html_toc_handoff_to_reference_read(
    temp_dir: Path,
) -> None:
    """Read an ``index.html`` with a table of contents under a "study the guide" task.

    The session task and the definition of done both use a prompt about
    studying the Fortran guide to write a new nginx guide.  After a single
    successful ``read`` of ``index.html``, the assertions require that
    neither steering callback received a message and that no tool-result
    message contains ``Semantic verification preview:``.
    """

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        # Raises unconditionally; this scenario is not expected to call it.
        raise AssertionError("Verification should not run for this scenario")

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        # Raises unconditionally; this scenario is not expected to call it.
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    chapters_dir = temp_dir / "chapters"
    chapters_dir.mkdir()
    for file_name, heading in (
        ("01-introduction.html", "<h1>Chapter 1: Introduction to Fortran</h1>\n"),
        ("02-setup.html", "<h1>Chapter 2: Setting Up Your Environment</h1>\n"),
    ):
        (chapters_dir / file_name).write_text(heading)

    index_path = temp_dir / "index.html"
    index_html = "".join(
        [
            "<h2>Table of Contents</h2>\n",
            '<ul class="chapter-list">\n',
            ' <li><a href="chapters/01-introduction.html">Chapter 1: Introduction to Fortran</a></li>\n',
            ' <li><a href="chapters/02-setup.html">Chapter 2: Setting Up Your Environment</a></li>\n',
            "</ul>\n",
        ]
    )
    index_path.write_text(index_html)

    prompt = "".join(
        [
            "Have a look at ~/Loader/guides/fortran and chapters/ within. Get a feel ",
            "for the structure and cadence of the guide. We are going to make an all ",
            "new equally thorough guide on how to use the nginx tool.",
        ]
    )

    runtime_context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )
    runtime_context.session.current_task = prompt  # type: ignore[attr-defined]
    steering_log: list[str] = []
    ephemeral_log: list[str] = []
    runtime_context.queue_steering_message_callback = steering_log.append
    runtime_context.queue_ephemeral_steering_message_callback = ephemeral_log.append
    batch_runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))

    read_call = ToolCall(
        id="read-index",
        name="read",
        arguments={"file_path": str(index_path)},
    )
    # The scripted result is whatever is currently stored in index.html.
    read_outcome = tool_outcome(
        tool_call=read_call,
        output=index_path.read_text(),
        is_error=False,
    )
    scripted_executor = FakeExecutor([read_outcome])

    turn_summary = TurnSummary(final_response="")
    definition_of_done = create_definition_of_done(prompt)
    await batch_runner.execute_batch(
        tool_calls=[read_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=turn_summary,
        dod=definition_of_done,
        executor=scripted_executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert steering_log == []
    assert ephemeral_log == []
    assert not any(
        "Semantic verification preview:" in recorded.content
        for recorded in turn_summary.tool_result_messages
    )
1332
1333
@pytest.mark.asyncio
async def test_tool_batch_runner_queues_next_pending_todo_after_discovery_progress(
    temp_dir: Path,
) -> None:
    """Read a reference chapter while three todos are pending and a plan lists ``chapters/`` first.

    After the successful ``read``, the assertions require that the
    "Examine ..." todo is in ``dod.completed_items``, that some persistent
    steering message names ``Create the nginx directory structure`` as the
    next pending item, that some persistent message says to resume by
    creating ``chapters/``, that no persistent message mentions
    ``01-introduction.html``, and that no ephemeral message was queued.
    """

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        # Raises unconditionally; this scenario is not expected to call it.
        raise AssertionError("Verification should not run for this scenario")

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        # Raises unconditionally; this scenario is not expected to call it.
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    reference_html = "<h1>Introduction</h1>\n<p>Guide cadence.</p>\n"
    reference_path = temp_dir / "fortran" / "chapters" / "01-introduction.html"
    reference_path.parent.mkdir(parents=True)
    reference_path.write_text(reference_html)

    # Planned nginx artifacts; none of them are created on disk here.
    nginx_root = temp_dir / "Loader" / "guides" / "nginx"
    chapters_dir = nginx_root / "chapters"
    nginx_index = nginx_root / "index.html"
    plan_path = temp_dir / "implementation.md"
    plan_path.write_text(
        "# Implementation Plan\n"
        "\n"
        "## File Changes\n"
        f"- `{chapters_dir}/`\n"
        f"- `{nginx_index}`\n"
    )

    runtime_context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )
    steering_log: list[str] = []
    ephemeral_log: list[str] = []
    runtime_context.queue_steering_message_callback = steering_log.append
    runtime_context.queue_ephemeral_steering_message_callback = ephemeral_log.append
    batch_runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))

    definition_of_done = create_definition_of_done("Create an equally thorough nginx guide.")
    definition_of_done.implementation_plan = str(plan_path)
    examine_todo = (
        "Examine the existing Fortran guide structure to understand the cadence and format"
    )
    todo_contents = (
        examine_todo,
        "Create the nginx directory structure",
        "Create the nginx index.html file",
    )
    sync_todos_to_definition_of_done(
        definition_of_done,
        [
            {
                "content": todo,
                "active_form": f"Working on: {todo}",
                "status": "pending",
            }
            for todo in todo_contents
        ],
    )

    read_call = ToolCall(
        id="read-reference",
        name="read",
        arguments={"file_path": str(reference_path)},
    )
    read_outcome = tool_outcome(
        tool_call=read_call,
        output=reference_html,
        is_error=False,
    )
    scripted_executor = FakeExecutor([read_outcome])

    turn_summary = TurnSummary(final_response="")
    await batch_runner.execute_batch(
        tool_calls=[read_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=turn_summary,
        dod=definition_of_done,
        executor=scripted_executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert examine_todo in definition_of_done.completed_items
    next_item_hint = (
        "Continue with the next pending item: `Create the nginx directory structure`"
    )
    assert any(next_item_hint in queued for queued in steering_log)
    assert any("Resume by creating `chapters/` now." in queued for queued in steering_log)
    assert not any("01-introduction.html" in queued for queued in steering_log)
    assert ephemeral_log == []
1452
1453
@pytest.mark.asyncio
async def test_tool_batch_runner_queues_setup_directory_before_file_when_plan_lists_index_first(
    temp_dir: Path,
) -> None:
    """Read a reference chapter when the plan lists ``index.html`` ahead of ``chapters/``.

    Todos are synced with ``project_root=temp_dir``.  After the successful
    ``read``, the assertions require at least one persistent steering
    message, one naming ``Create the nginx directory structure`` as the next
    pending item, one saying to resume by creating ``chapters/``, none
    containing ``Next step: create `index.html`.``, and no ephemeral
    messages.
    """

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        # Raises unconditionally; this scenario is not expected to call it.
        raise AssertionError("Verification should not run for this scenario")

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        # Raises unconditionally; this scenario is not expected to call it.
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    reference_html = "<h1>Introduction</h1>\n<p>Guide cadence.</p>\n"
    reference_path = temp_dir / "fortran" / "chapters" / "01-introduction.html"
    reference_path.parent.mkdir(parents=True)
    reference_path.write_text(reference_html)

    # Planned nginx artifacts; none of them are created on disk here.
    nginx_root = temp_dir / "Loader" / "guides" / "nginx"
    chapters_dir = nginx_root / "chapters"
    nginx_index = nginx_root / "index.html"
    plan_path = temp_dir / "implementation.md"
    # The file entry deliberately precedes the directory entry.
    plan_path.write_text(
        "# Implementation Plan\n"
        "\n"
        "## File Changes\n"
        f"- `{nginx_index}`\n"
        f"- `{chapters_dir}/`\n"
    )

    runtime_context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )
    steering_log: list[str] = []
    ephemeral_log: list[str] = []
    runtime_context.queue_steering_message_callback = steering_log.append
    runtime_context.queue_ephemeral_steering_message_callback = ephemeral_log.append
    batch_runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))

    definition_of_done = create_definition_of_done("Create an equally thorough nginx guide.")
    definition_of_done.implementation_plan = str(plan_path)
    todo_contents = (
        "Examine the existing Fortran guide structure to understand the cadence and format",
        "Create the nginx directory structure",
        "Create the nginx index.html file",
    )
    pending_todos = [
        {
            "content": todo,
            "active_form": f"Working on: {todo}",
            "status": "pending",
        }
        for todo in todo_contents
    ]
    sync_todos_to_definition_of_done(
        definition_of_done,
        pending_todos,
        project_root=temp_dir,
    )

    read_call = ToolCall(
        id="read-reference-index-first",
        name="read",
        arguments={"file_path": str(reference_path)},
    )
    read_outcome = tool_outcome(
        tool_call=read_call,
        output=reference_html,
        is_error=False,
    )
    scripted_executor = FakeExecutor([read_outcome])

    turn_summary = TurnSummary(final_response="")
    await batch_runner.execute_batch(
        tool_calls=[read_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=turn_summary,
        dod=definition_of_done,
        executor=scripted_executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert steering_log
    next_item_hint = (
        "Continue with the next pending item: `Create the nginx directory structure`"
    )
    assert any(next_item_hint in queued for queued in steering_log)
    assert any("Resume by creating `chapters/` now." in queued for queued in steering_log)
    assert not any("Next step: create `index.html`." in queued for queued in steering_log)
    assert ephemeral_log == []
1573
1574
@pytest.mark.asyncio
async def test_tool_batch_runner_duplicate_reference_read_prefers_next_pending_todo(
    temp_dir: Path,
) -> None:
    """Replay a ``DUPLICATE`` read of a reference file while later todos are still pending.

    The session history already holds a tool message with the earlier read
    result, the "Examine ..." todo is synced as completed, and two creation
    todos are pending.  The assertions require exactly one persistent
    steering message that tells the model to reuse the earlier observation,
    names ``Create the nginx directory structure`` as the next pending item,
    and does not contain ``Update ```; no ephemeral message may be queued.
    """

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        # Raises unconditionally; this scenario is not expected to call it.
        raise AssertionError("Verification should not run for this scenario")

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        # Raises unconditionally; this scenario is not expected to call it.
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    reference_html = "<h1>Fortran Beginner's Guide</h1>\n"
    reference_path = temp_dir / "fortran" / "index.html"
    reference_path.parent.mkdir(parents=True)
    reference_path.write_text(reference_html)

    # Session history: the file was already read once.
    earlier_read = Message(
        role=Role.TOOL,
        content=f"Observation [read]: Result: {reference_html}",
    )
    runtime_context = build_context(
        temp_dir=temp_dir,
        messages=[earlier_read],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )
    prompt = "".join(
        [
            "Have a look at ~/Loader/guides/fortran and chapters/ within. Get a feel ",
            "for the structure and cadence of the guide. We are going to make an all ",
            "new equally thorough guide on how to use the nginx tool.",
        ]
    )
    runtime_context.session.current_task = prompt
    steering_log: list[str] = []
    ephemeral_log: list[str] = []
    runtime_context.queue_steering_message_callback = steering_log.append
    runtime_context.queue_ephemeral_steering_message_callback = ephemeral_log.append
    batch_runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))

    definition_of_done = create_definition_of_done(prompt)
    todo_states = (
        (
            "Examine the existing Fortran guide structure to understand the cadence and format",
            "completed",
        ),
        ("Create the nginx directory structure", "pending"),
        ("Create the nginx index.html file", "pending"),
    )
    sync_todos_to_definition_of_done(
        definition_of_done,
        [
            {
                "content": todo,
                "active_form": f"Working on: {todo}",
                "status": status,
            }
            for todo, status in todo_states
        ],
    )

    read_call = ToolCall(
        id="read-dup",
        name="read",
        arguments={"file_path": str(reference_path)},
    )
    skip_notice = (
        "[Skipped - duplicate action: Already read "
        f"{reference_path} recently without any intervening changes; "
        "reuse the earlier read result instead of rereading]"
    )
    skip_message = Message.tool_result_message(
        tool_call_id=read_call.id,
        display_content=skip_notice,
        result_content=skip_notice,
    )
    duplicate_outcome = ToolExecutionOutcome(
        tool_call=read_call,
        state=ToolExecutionState.DUPLICATE,
        message=skip_message,
        event_content=skip_notice,
        is_error=False,
        result_output=skip_notice,
    )
    scripted_executor = FakeExecutor([duplicate_outcome])

    turn_summary = TurnSummary(final_response="")
    await batch_runner.execute_batch(
        tool_calls=[read_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=turn_summary,
        dod=definition_of_done,
        executor=scripted_executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert len(steering_log) == 1
    handoff = steering_log[0]
    assert "Reuse the earlier observation instead of repeating it." in handoff
    assert (
        "Continue with the next pending item: `Create the nginx directory structure`"
        in handoff
    )
    assert "Update `" not in handoff
    assert ephemeral_log == []
1697
1698
@pytest.mark.asyncio
async def test_tool_batch_runner_successful_reference_read_prioritizes_concrete_missing_artifact(
    temp_dir: Path,
) -> None:
    """Read a reference chapter while the planned ``index.html`` does not exist on disk.

    The nginx ``chapters/`` directory and ``01-introduction.html`` exist and
    the latter is recorded in ``dod.touched_files``; ``index.html`` and
    ``02-installation.html`` are listed in the plan but never written.  The
    assertions require a persistent steering message confirming progress on
    the "Examine ..." todo, one saying to resume by creating ``index.html``,
    none naming ``Create each chapter file with appropriate content`` as the
    next pending item, and no ephemeral messages.
    """

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        # Raises unconditionally; this scenario is not expected to call it.
        raise AssertionError("Verification should not run for this scenario")

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        # Raises unconditionally; this scenario is not expected to call it.
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    guides_root = temp_dir / "Loader" / "guides"

    # Target guide: only the first chapter exists; index.html is not written.
    guide_root = guides_root / "nginx"
    chapters_dir = guide_root / "chapters"
    chapters_dir.mkdir(parents=True)
    first_chapter = chapters_dir / "01-introduction.html"
    first_chapter.write_text("<html></html>\n")
    index_path = guide_root / "index.html"

    # Reference guide chapter that the scripted read targets.
    reference_html = "<h1>Introduction</h1>\n<p>Guide cadence.</p>\n"
    reference_path = guides_root / "fortran" / "chapters" / "01-introduction.html"
    reference_path.parent.mkdir(parents=True, exist_ok=True)
    reference_path.write_text(reference_html)

    plan_path = temp_dir / "implementation.md"
    plan_path.write_text(
        "# Implementation Plan\n"
        "\n"
        "## File Changes\n"
        f"- `{guide_root}/`\n"
        f"- `{chapters_dir}/`\n"
        f"- `{index_path}`\n"
        f"- `{first_chapter}`\n"
        f"- `{chapters_dir / '02-installation.html'}`\n"
    )

    runtime_context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )
    steering_log: list[str] = []
    ephemeral_log: list[str] = []
    runtime_context.queue_steering_message_callback = steering_log.append
    runtime_context.queue_ephemeral_steering_message_callback = ephemeral_log.append
    batch_runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))

    definition_of_done = create_definition_of_done("Create a multi-file nginx guide.")
    definition_of_done.implementation_plan = str(plan_path)
    definition_of_done.touched_files.append(str(first_chapter))
    todo_contents = (
        "Examine the existing Fortran guide structure to understand the format and cadence",
        "Create each chapter file with appropriate content",
        "Ensure all files follow the same structure and style as the Fortran guide",
    )
    sync_todos_to_definition_of_done(
        definition_of_done,
        [
            {
                "content": todo,
                "active_form": f"Working on: {todo}",
                "status": "pending",
            }
            for todo in todo_contents
        ],
    )

    read_call = ToolCall(
        id="read-reference-chapter",
        name="read",
        arguments={"file_path": str(reference_path)},
    )
    observation = f"Observation [read]: Result: {reference_html}"
    observation_message = Message.tool_result_message(
        tool_call_id=read_call.id,
        display_content=observation,
        result_content=observation,
    )
    executed_outcome = ToolExecutionOutcome(
        tool_call=read_call,
        state=ToolExecutionState.EXECUTED,
        message=observation_message,
        event_content=observation,
        is_error=False,
        result_output=observation,
    )
    scripted_executor = FakeExecutor([executed_outcome])

    turn_summary = TurnSummary(final_response="")
    await batch_runner.execute_batch(
        tool_calls=[read_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=turn_summary,
        dod=definition_of_done,
        executor=scripted_executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert steering_log
    progress_note = (
        "Confirmed progress: `Examine the existing Fortran guide structure to understand the format and cadence`"
    )
    assert any(progress_note in queued for queued in steering_log)
    assert any("Resume by creating `index.html` now." in queued for queued in steering_log)
    todo_handoff = (
        "Continue with the next pending item: `Create each chapter file with appropriate content`"
    )
    assert not any(todo_handoff in queued for queued in steering_log)
    assert ephemeral_log == []
1833
1834
@pytest.mark.asyncio
async def test_tool_batch_runner_duplicate_read_ignores_unplanned_expansion_after_plan_complete(
    temp_dir: Path,
) -> None:
    """Replay a ``DUPLICATE`` read once every artifact listed in the plan exists on disk.

    ``dod.pending_items`` holds a creation item for
    ``07-performance-tuning.html`` (not in the plan), a verification item,
    and a generic completion item.  The assertions require exactly one
    persistent steering message that mentions the verification item and not
    the ``07-performance-tuning.html`` item, and no ephemeral messages.
    """

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        # Raises unconditionally; this scenario is not expected to call it.
        raise AssertionError("Verification should not run for this scenario")

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        # Raises unconditionally; this scenario is not expected to call it.
        raise AssertionError("Confidence scoring should not run for this scenario")

    # Every path that the plan below lists is created here.
    guide_root = temp_dir / "guides" / "nginx"
    chapters_dir = guide_root / "chapters"
    guide_root.mkdir(parents=True)
    chapters_dir.mkdir()
    index_path = guide_root / "index.html"
    first_chapter = chapters_dir / "01-getting-started.html"
    second_chapter = chapters_dir / "02-installation.html"
    for artifact, body in (
        (index_path, "<html></html>\n"),
        (first_chapter, "<h1>One</h1>\n"),
        (second_chapter, "<h1>Two</h1>\n"),
    ):
        artifact.write_text(body)

    plan_path = temp_dir / "implementation.md"
    plan_path.write_text(
        "# Implementation Plan\n"
        "\n"
        "## File Changes\n"
        f"- `{guide_root}/`\n"
        f"- `{chapters_dir}/`\n"
        f"- `{index_path}`\n"
        f"- `{first_chapter}`\n"
        f"- `{second_chapter}`\n"
    )

    runtime_context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )
    steering_log: list[str] = []
    ephemeral_log: list[str] = []
    runtime_context.queue_steering_message_callback = steering_log.append
    runtime_context.queue_ephemeral_steering_message_callback = ephemeral_log.append
    batch_runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))

    definition_of_done = create_definition_of_done("Create a multi-file nginx guide.")
    definition_of_done.implementation_plan = str(plan_path)
    unplanned_item = "Create 07-performance-tuning.html"
    verification_item = "Verify all guide files are linked and complete"
    definition_of_done.pending_items = [
        unplanned_item,
        verification_item,
        "Complete the requested work",
    ]

    read_call = ToolCall(
        id="read-dup",
        name="read",
        arguments={"file_path": str(first_chapter)},
    )
    skip_notice = (
        "[Skipped - duplicate action: Already read "
        f"{first_chapter} recently without any intervening changes; "
        "reuse the earlier read result instead of rereading]"
    )
    skip_message = Message.tool_result_message(
        tool_call_id=read_call.id,
        display_content=skip_notice,
        result_content=skip_notice,
    )
    duplicate_outcome = ToolExecutionOutcome(
        tool_call=read_call,
        state=ToolExecutionState.DUPLICATE,
        message=skip_message,
        event_content=skip_notice,
        is_error=False,
        result_output=skip_notice,
    )
    scripted_executor = FakeExecutor([duplicate_outcome])

    turn_summary = TurnSummary(final_response="")
    await batch_runner.execute_batch(
        tool_calls=[read_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=turn_summary,
        dod=definition_of_done,
        executor=scripted_executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert len(steering_log) == 1
    handoff = steering_log[0]
    assert verification_item in handoff
    assert unplanned_item not in handoff
    assert ephemeral_log == []
1949
1950
@pytest.mark.asyncio
async def test_tool_batch_runner_duplicate_read_after_plan_complete_pushes_verification_handoff(
    temp_dir: Path,
) -> None:
    """Replay a ``DUPLICATE`` read when the plan is fully on disk and a verification command is set.

    ``dod.verification_commands`` holds one ``ls -la`` command and
    ``dod.pending_items`` holds a creation item for
    ``07-performance-tuning.html`` (not in the plan) plus a generic
    completion item.  The assertions require exactly one persistent steering
    message stating that all planned artifacts exist and asking for a final
    response, without mentioning the ``07-performance-tuning.html`` item,
    and no ephemeral messages.
    """

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        # Raises unconditionally; this scenario is not expected to call it.
        raise AssertionError("Verification should not run for this scenario")

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        # Raises unconditionally; this scenario is not expected to call it.
        raise AssertionError("Confidence scoring should not run for this scenario")

    # Every path that the plan below lists is created here.
    guide_root = temp_dir / "guides" / "nginx"
    chapters_dir = guide_root / "chapters"
    guide_root.mkdir(parents=True)
    chapters_dir.mkdir()
    index_path = guide_root / "index.html"
    first_chapter = chapters_dir / "01-getting-started.html"
    second_chapter = chapters_dir / "02-installation.html"
    for artifact, body in (
        (index_path, "<html></html>\n"),
        (first_chapter, "<h1>One</h1>\n"),
        (second_chapter, "<h1>Two</h1>\n"),
    ):
        artifact.write_text(body)

    plan_path = temp_dir / "implementation.md"
    plan_path.write_text(
        "# Implementation Plan\n"
        "\n"
        "## File Changes\n"
        f"- `{guide_root}/`\n"
        f"- `{chapters_dir}/`\n"
        f"- `{index_path}`\n"
        f"- `{first_chapter}`\n"
        f"- `{second_chapter}`\n"
    )

    runtime_context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )
    steering_log: list[str] = []
    ephemeral_log: list[str] = []
    runtime_context.queue_steering_message_callback = steering_log.append
    runtime_context.queue_ephemeral_steering_message_callback = ephemeral_log.append
    batch_runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))

    definition_of_done = create_definition_of_done("Create a multi-file nginx guide.")
    definition_of_done.implementation_plan = str(plan_path)
    definition_of_done.verification_commands = [f"ls -la {guide_root}"]
    unplanned_item = "Create 07-performance-tuning.html"
    definition_of_done.pending_items = [
        unplanned_item,
        "Complete the requested work",
    ]

    read_call = ToolCall(
        id="read-dup",
        name="read",
        arguments={"file_path": str(first_chapter)},
    )
    skip_notice = (
        "[Skipped - duplicate action: Already read "
        f"{first_chapter} recently without any intervening changes; "
        "reuse the earlier read result instead of rereading]"
    )
    skip_message = Message.tool_result_message(
        tool_call_id=read_call.id,
        display_content=skip_notice,
        result_content=skip_notice,
    )
    duplicate_outcome = ToolExecutionOutcome(
        tool_call=read_call,
        state=ToolExecutionState.DUPLICATE,
        message=skip_message,
        event_content=skip_notice,
        is_error=False,
        result_output=skip_notice,
    )
    scripted_executor = FakeExecutor([duplicate_outcome])

    turn_summary = TurnSummary(final_response="")
    await batch_runner.execute_batch(
        tool_calls=[read_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=turn_summary,
        dod=definition_of_done,
        executor=scripted_executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert len(steering_log) == 1
    handoff = steering_log[0]
    assert "All explicitly planned artifacts already exist on disk." in handoff
    assert (
        "Finish with a final response now so Loader can run verification automatically."
        in handoff
    )
    assert unplanned_item not in handoff
    assert ephemeral_log == []
2069
2070
@pytest.mark.asyncio
async def test_tool_batch_runner_duplicate_read_after_plan_complete_ignores_stale_creation_todos(
    temp_dir: Path,
) -> None:
    """Stale creation todos must not leak into the duplicate-read nudge.

    Every path listed in the implementation plan already exists on disk, yet
    the definition of done still carries pending "Create ..." items for those
    same files.  After a duplicate ``read`` outcome the test expects exactly
    one persistent steering message that asks for a final response and quotes
    neither stale todo, and nothing on the ephemeral channel.
    """

    async def reject_confidence_scoring(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should not run for this scenario")

    async def reject_verification(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run for this scenario")

    # Build the finished guide tree that the plan refers to.
    guide_dir = temp_dir / "guides" / "nginx"
    chapters_dir = guide_dir / "chapters"
    guide_dir.mkdir(parents=True)
    chapters_dir.mkdir()
    index_file = guide_dir / "index.html"
    first_chapter = chapters_dir / "01-getting-started.html"
    second_chapter = chapters_dir / "02-installation.html"
    for artifact, markup in (
        (index_file, "<html></html>\n"),
        (first_chapter, "<h1>One</h1>\n"),
        (second_chapter, "<h1>Two</h1>\n"),
    ):
        artifact.write_text(markup)

    plan_file = temp_dir / "implementation.md"
    plan_entries = [
        f"- `{guide_dir}/`",
        f"- `{chapters_dir}/`",
        f"- `{index_file}`",
        f"- `{first_chapter}`",
        f"- `{second_chapter}`",
    ]
    plan_file.write_text(
        "\n".join(["# Implementation Plan", "", "## File Changes", *plan_entries, ""])
    )

    runtime_context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=reject_confidence_scoring,
        verify_action=reject_verification,
        auto_recover=False,
    )
    # Capture both steering channels so the assertions can tell them apart.
    persistent_queue: list[str] = []
    ephemeral_queue: list[str] = []
    runtime_context.queue_steering_message_callback = persistent_queue.append
    runtime_context.queue_ephemeral_steering_message_callback = ephemeral_queue.append

    dod_store = DefinitionOfDoneStore(temp_dir)
    runner = ToolBatchRunner(runtime_context, dod_store)
    dod = create_definition_of_done("Create a multi-file nginx guide.")
    dod.implementation_plan = str(plan_file)
    dod.verification_commands = [f"ls -la {guide_dir}"]
    dod.pending_items = [
        "Create 01-getting-started.html",
        "Creating 02-installation.html",
        "Complete the requested work",
    ]

    read_call = ToolCall(
        id="read-dup-built-stale",
        name="read",
        arguments={"file_path": str(first_chapter)},
    )
    skip_notice = (
        f"[Skipped - duplicate action: Already read {first_chapter} recently "
        "without any intervening changes; reuse the earlier read result "
        "instead of rereading]"
    )
    duplicate_outcome = ToolExecutionOutcome(
        tool_call=read_call,
        state=ToolExecutionState.DUPLICATE,
        message=Message.tool_result_message(
            tool_call_id=read_call.id,
            display_content=skip_notice,
            result_content=skip_notice,
        ),
        event_content=skip_notice,
        is_error=False,
        result_output=skip_notice,
    )
    executor = FakeExecutor([duplicate_outcome])

    await runner.execute_batch(
        tool_calls=[read_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=TurnSummary(final_response=""),
        dod=dod,
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert len(persistent_queue) == 1
    nudge = persistent_queue[0]
    assert "All explicitly planned artifacts already exist on disk." in nudge
    assert (
        "Finish with a final response now so Loader can run verification automatically."
        in nudge
    )
    assert "Create 01-getting-started.html" not in nudge
    assert "Creating 02-installation.html" not in nudge
    assert ephemeral_queue == []
2191
2192
@pytest.mark.asyncio
async def test_tool_batch_runner_successful_read_after_plan_complete_pushes_review_handoff(
    temp_dir: Path,
) -> None:
    """A successful read after the plan is built yields an ephemeral review handoff.

    All planned artifacts exist on disk while the synced todos hold one stale
    creation item and one review item.  The test expects no persistent
    steering message and exactly one ephemeral message that names the review
    item, omits the stale creation item, and tells the model to finish when
    no specific mismatch remains.
    """

    async def reject_confidence_scoring(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should not run for this scenario")

    async def reject_verification(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run for this scenario")

    # Build the finished guide tree that the plan refers to.
    guide_dir = temp_dir / "guides" / "nginx"
    chapters_dir = guide_dir / "chapters"
    guide_dir.mkdir(parents=True)
    chapters_dir.mkdir()
    index_file = guide_dir / "index.html"
    first_chapter = chapters_dir / "01-getting-started.html"
    second_chapter = chapters_dir / "02-installation.html"
    for artifact, markup in (
        (index_file, "<html></html>\n"),
        (first_chapter, "<h1>One</h1>\n"),
        (second_chapter, "<h1>Two</h1>\n"),
    ):
        artifact.write_text(markup)

    plan_file = temp_dir / "implementation.md"
    plan_entries = [
        f"- `{guide_dir}/`",
        f"- `{chapters_dir}/`",
        f"- `{index_file}`",
        f"- `{first_chapter}`",
        f"- `{second_chapter}`",
    ]
    plan_file.write_text(
        "\n".join(["# Implementation Plan", "", "## File Changes", *plan_entries, ""])
    )

    runtime_context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=reject_confidence_scoring,
        verify_action=reject_verification,
        auto_recover=False,
    )
    # Capture both steering channels so the assertions can tell them apart.
    persistent_queue: list[str] = []
    ephemeral_queue: list[str] = []
    runtime_context.queue_steering_message_callback = persistent_queue.append
    runtime_context.queue_ephemeral_steering_message_callback = ephemeral_queue.append

    dod_store = DefinitionOfDoneStore(temp_dir)
    runner = ToolBatchRunner(runtime_context, dod_store)
    dod = create_definition_of_done("Create a multi-file nginx guide.")
    dod.implementation_plan = str(plan_file)
    dod.verification_commands = [f"ls -la {guide_dir}"]
    # (content, active_form) pairs; both todos are synced as pending.
    todo_specs = (
        (
            "Create 01-getting-started.html",
            "Creating 01-getting-started.html",
        ),
        (
            "Ensure all files are properly linked and formatted consistently",
            "Reviewing guide consistency and linkage",
        ),
    )
    sync_todos_to_definition_of_done(
        dod,
        [
            {"content": content, "active_form": active_form, "status": "pending"}
            for content, active_form in todo_specs
        ],
    )

    read_call = ToolCall(
        id="read-built-review",
        name="read",
        arguments={"file_path": str(first_chapter)},
    )
    read_outcome = tool_outcome(
        tool_call=read_call,
        output=first_chapter.read_text(),
        is_error=False,
    )
    executor = FakeExecutor([read_outcome])

    await runner.execute_batch(
        tool_calls=[read_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=TurnSummary(final_response=""),
        dod=dod,
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert persistent_queue == []
    assert len(ephemeral_queue) == 1
    handoff = ephemeral_queue[0]
    assert "All explicitly planned artifacts already exist." in handoff
    assert "Ensure all files are properly linked and formatted consistently" in handoff
    assert "Create 01-getting-started.html" not in handoff
    assert "do not keep broad-rereading the output set" in handoff
    assert "If no specific mismatch remains, finish with a final response so Loader can verify." in handoff
2304
2305
@pytest.mark.asyncio
async def test_tool_batch_runner_successful_read_after_plan_complete_switches_to_verify(
    temp_dir: Path,
) -> None:
    """A successful read after the plan is built moves the workflow to verify.

    Every planned artifact exists and the test syncs no todos.  One
    persistent steering message is expected, asking for a final response and
    to stop broad rereads; the ephemeral channel stays empty and the
    context's ``workflow_mode`` ends up as ``"verify"``.
    """

    async def reject_confidence_scoring(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should not run for this scenario")

    async def reject_verification(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run for this scenario")

    # Build the finished guide tree that the plan refers to.
    guide_dir = temp_dir / "guides" / "nginx"
    chapters_dir = guide_dir / "chapters"
    guide_dir.mkdir(parents=True)
    chapters_dir.mkdir()
    index_file = guide_dir / "index.html"
    first_chapter = chapters_dir / "01-getting-started.html"
    second_chapter = chapters_dir / "02-installation.html"
    for artifact, markup in (
        (index_file, "<html></html>\n"),
        (first_chapter, "<h1>One</h1>\n"),
        (second_chapter, "<h1>Two</h1>\n"),
    ):
        artifact.write_text(markup)

    plan_file = temp_dir / "implementation.md"
    plan_entries = [
        f"- `{guide_dir}/`",
        f"- `{chapters_dir}/`",
        f"- `{index_file}`",
        f"- `{first_chapter}`",
        f"- `{second_chapter}`",
    ]
    plan_file.write_text(
        "\n".join(["# Implementation Plan", "", "## File Changes", *plan_entries, ""])
    )

    runtime_context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=reject_confidence_scoring,
        verify_action=reject_verification,
        auto_recover=False,
    )
    # Capture both steering channels so the assertions can tell them apart.
    persistent_queue: list[str] = []
    ephemeral_queue: list[str] = []
    runtime_context.queue_steering_message_callback = persistent_queue.append
    runtime_context.queue_ephemeral_steering_message_callback = ephemeral_queue.append

    dod_store = DefinitionOfDoneStore(temp_dir)
    runner = ToolBatchRunner(runtime_context, dod_store)
    dod = create_definition_of_done("Create a multi-file nginx guide.")
    dod.implementation_plan = str(plan_file)
    dod.verification_commands = [f"ls -la {guide_dir}"]

    read_call = ToolCall(
        id="read-built-verify",
        name="read",
        arguments={"file_path": str(first_chapter)},
    )
    read_outcome = tool_outcome(
        tool_call=read_call,
        output=first_chapter.read_text(),
        is_error=False,
    )
    executor = FakeExecutor([read_outcome])

    await runner.execute_batch(
        tool_calls=[read_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=TurnSummary(final_response=""),
        dod=dod,
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert len(persistent_queue) == 1
    nudge = persistent_queue[0]
    assert "All explicitly planned artifacts already exist." in nudge
    assert "Finish with a final response now so Loader can run verification automatically." in nudge
    assert "stop broad rereads" in nudge
    assert ephemeral_queue == []
    assert runtime_context.workflow_mode == "verify"
2400
2401
@pytest.mark.asyncio
async def test_tool_batch_runner_observation_handoff_pushes_mutation_step(
    temp_dir: Path,
) -> None:
    """Reading reference material should hand off to the pending creation step.

    The synced todos are an "examine the reference guide" item followed by a
    "create index.html" item.  After a successful read of the reference file,
    some persistent steering message must name the creation item as the next
    pending item and tell the model to stop gathering reference material.
    Nothing may be queued on the ephemeral channel.
    """

    async def reject_confidence_scoring(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def reject_verification(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run for this scenario")

    # The same markup is written to disk and returned as the read output.
    reference_html = "<h1>Introduction</h1>\n<p>Guide cadence.</p>\n"
    reference_file = temp_dir / "fortran" / "chapters" / "01-introduction.html"
    reference_file.parent.mkdir(parents=True)
    reference_file.write_text(reference_html)

    runtime_context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=reject_confidence_scoring,
        verify_action=reject_verification,
        auto_recover=False,
    )
    # Capture both steering channels so the assertions can tell them apart.
    persistent_queue: list[str] = []
    ephemeral_queue: list[str] = []
    runtime_context.queue_steering_message_callback = persistent_queue.append
    runtime_context.queue_ephemeral_steering_message_callback = ephemeral_queue.append

    dod_store = DefinitionOfDoneStore(temp_dir)
    runner = ToolBatchRunner(runtime_context, dod_store)
    dod = create_definition_of_done("Create a multi-file nginx guide.")
    pending_contents = (
        "Examine the existing Fortran guide structure to understand the cadence and format",
        "Create the nginx index.html file",
    )
    sync_todos_to_definition_of_done(
        dod,
        [
            {
                "content": content,
                "active_form": f"Working on: {content}",
                "status": "pending",
            }
            for content in pending_contents
        ],
    )

    read_call = ToolCall(
        id="read-reference",
        name="read",
        arguments={"file_path": str(reference_file)},
    )
    read_outcome = tool_outcome(
        tool_call=read_call,
        output=reference_html,
        is_error=False,
    )
    executor = FakeExecutor([read_outcome])

    await runner.execute_batch(
        tool_calls=[read_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=TurnSummary(final_response=""),
        dod=dod,
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    next_item_hint = "Continue with the next pending item: `Create the nginx index.html file`"
    assert any(next_item_hint in queued for queued in persistent_queue)
    stop_hint = "stop gathering more reference material and perform the change now"
    assert any(stop_hint in queued for queued in persistent_queue)
    assert ephemeral_queue == []
2494
2495
@pytest.mark.asyncio
async def test_tool_batch_runner_discovery_completion_handoff_stays_persistent(
    temp_dir: Path,
) -> None:
    """The handoff after a discovery read goes out on the persistent channel.

    The synced todos are an "examine the reference guide" item followed by a
    "create the directory structure" item.  After a successful read of the
    reference file, the persistent queue must be non-empty and contain a
    message naming the directory item as the next pending item, while the
    ephemeral queue stays empty.
    """

    async def reject_confidence_scoring(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def reject_verification(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run for this scenario")

    # The same markup is written to disk and returned as the read output.
    reference_html = "<h1>Introduction</h1>\n<p>Guide cadence.</p>\n"
    reference_file = temp_dir / "fortran" / "chapters" / "01-introduction.html"
    reference_file.parent.mkdir(parents=True)
    reference_file.write_text(reference_html)

    runtime_context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=reject_confidence_scoring,
        verify_action=reject_verification,
        auto_recover=False,
    )
    # Capture both steering channels so the assertions can tell them apart.
    persistent_queue: list[str] = []
    ephemeral_queue: list[str] = []
    runtime_context.queue_steering_message_callback = persistent_queue.append
    runtime_context.queue_ephemeral_steering_message_callback = ephemeral_queue.append

    dod_store = DefinitionOfDoneStore(temp_dir)
    runner = ToolBatchRunner(runtime_context, dod_store)
    dod = create_definition_of_done("Create a multi-file nginx guide.")
    pending_contents = (
        "First, examine the existing fortran guide structure and content",
        "Create the nginx directory structure",
    )
    sync_todos_to_definition_of_done(
        dod,
        [
            {
                "content": content,
                "active_form": f"Working on: {content}",
                "status": "pending",
            }
            for content in pending_contents
        ],
    )

    read_call = ToolCall(
        id="read-reference",
        name="read",
        arguments={"file_path": str(reference_file)},
    )
    read_outcome = tool_outcome(
        tool_call=read_call,
        output=reference_html,
        is_error=False,
    )
    executor = FakeExecutor([read_outcome])

    await runner.execute_batch(
        tool_calls=[read_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=TurnSummary(final_response=""),
        dod=dod,
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert persistent_queue
    next_item_hint = (
        "Continue with the next pending item: `Create the nginx directory structure`"
    )
    assert any(next_item_hint in queued for queued in persistent_queue)
    assert ephemeral_queue == []
2585
2586
@pytest.mark.asyncio
async def test_tool_batch_runner_missing_artifact_nudge_names_next_file_after_setup_mkdir(
    temp_dir: Path,
) -> None:
    """After a successful setup ``mkdir`` the nudge names the next planned file.

    The plan lists the chapters directory and ``index.html``.  The batch
    contains one ``bash`` call running ``mkdir -p`` on the chapters directory,
    for which the fake executor is given a successful outcome.  The last
    persistent steering message must say directory setup is complete and name
    ``index.html`` as the next file to create; the ephemeral queue stays
    empty.
    """

    async def reject_confidence_scoring(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def reject_verification(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run for this scenario")

    # Neither path is created by the test itself.
    nginx_dir = temp_dir / "Loader" / "guides" / "nginx"
    chapters_dir = nginx_dir / "chapters"
    index_file = nginx_dir / "index.html"

    plan_file = temp_dir / "implementation.md"
    plan_lines = [
        "# Implementation Plan",
        "",
        "## File Changes",
        f"- `{chapters_dir}/`",
        f"- `{index_file}`",
        "",
    ]
    plan_file.write_text("\n".join(plan_lines))

    runtime_context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=reject_confidence_scoring,
        verify_action=reject_verification,
        auto_recover=False,
    )
    # Capture both steering channels so the assertions can tell them apart.
    persistent_queue: list[str] = []
    ephemeral_queue: list[str] = []
    runtime_context.queue_steering_message_callback = persistent_queue.append
    runtime_context.queue_ephemeral_steering_message_callback = ephemeral_queue.append

    dod_store = DefinitionOfDoneStore(temp_dir)
    runner = ToolBatchRunner(runtime_context, dod_store)
    dod = create_definition_of_done("Create a multi-file nginx guide.")
    dod.implementation_plan = str(plan_file)
    # (content, active_form) pairs; both todos are synced as pending.
    todo_specs = (
        (
            "Create the nginx directory structure",
            "Creating the nginx directory structure",
        ),
        (
            "Develop the main index.html file with proper structure",
            "Developing the main index.html file with proper structure",
        ),
    )
    sync_todos_to_definition_of_done(
        dod,
        [
            {"content": content, "active_form": active_form, "status": "pending"}
            for content, active_form in todo_specs
        ],
    )

    mkdir_call = ToolCall(
        id="mkdir-nginx",
        name="bash",
        arguments={"command": f"mkdir -p {chapters_dir}"},
    )
    mkdir_outcome = tool_outcome(tool_call=mkdir_call, output="", is_error=False)
    executor = FakeExecutor([mkdir_outcome])

    await runner.execute_batch(
        tool_calls=[mkdir_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=TurnSummary(final_response=""),
        dod=dod,
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert persistent_queue
    latest = persistent_queue[-1]
    assert "Directory setup is complete." in latest
    assert "Next step: create `index.html`." in latest
    assert "Write a compact but real initial version of that file now" in latest
    assert ephemeral_queue == []
2689
2690
@pytest.mark.asyncio
async def test_tool_batch_runner_first_chapter_handoff_stays_persistent_until_substantive_output_exists(
    temp_dir: Path,
) -> None:
    """Writing ``index.html`` hands off persistently to the first chapter.

    The plan lists the chapters directory, ``index.html`` and
    ``chapters/01-introduction.html``; only the chapters directory exists on
    disk.  After a successful ``write`` outcome for ``index.html``, the last
    persistent steering message must confirm progress, name
    ``01-introduction.html`` as the next file, and recommend a single
    ``write`` call for its resolved path.  It must not contain the "compact
    but real initial version" wording, and the ephemeral queue stays empty.
    """

    async def reject_confidence_scoring(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def reject_verification(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run for this scenario")

    nginx_dir = temp_dir / "guides" / "nginx"
    chapters_dir = nginx_dir / "chapters"
    chapters_dir.mkdir(parents=True)
    index_file = nginx_dir / "index.html"
    intro_chapter = chapters_dir / "01-introduction.html"

    plan_file = temp_dir / "implementation.md"
    plan_lines = [
        "# Implementation Plan",
        "",
        "## File Changes",
        f"- `{chapters_dir}/`",
        f"- `{index_file}`",
        f"- `{intro_chapter}`",
        "",
    ]
    plan_file.write_text("\n".join(plan_lines))

    runtime_context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=reject_confidence_scoring,
        verify_action=reject_verification,
        auto_recover=False,
    )
    # Capture both steering channels so the assertions can tell them apart.
    persistent_queue: list[str] = []
    ephemeral_queue: list[str] = []
    runtime_context.queue_steering_message_callback = persistent_queue.append
    runtime_context.queue_ephemeral_steering_message_callback = ephemeral_queue.append

    dod_store = DefinitionOfDoneStore(temp_dir)
    runner = ToolBatchRunner(runtime_context, dod_store)
    dod = create_definition_of_done("Create a multi-file nginx guide.")
    dod.implementation_plan = str(plan_file)
    # (content, active_form) pairs; both todos are synced as pending.
    todo_specs = (
        (
            "Create the main index.html file with proper structure",
            "Creating the main index.html file with proper structure",
        ),
        (
            "Create each chapter file with appropriate content",
            "Creating each chapter file with appropriate content",
        ),
    )
    sync_todos_to_definition_of_done(
        dod,
        [
            {"content": content, "active_form": active_form, "status": "pending"}
            for content, active_form in todo_specs
        ],
    )

    write_call = ToolCall(
        id="write-index",
        name="write",
        arguments={
            "file_path": str(index_file),
            "content": "<html></html>\n",
        },
    )
    write_outcome = tool_outcome(
        tool_call=write_call,
        output=f"Successfully wrote 14 bytes to {index_file}",
        is_error=False,
    )
    executor = FakeExecutor([write_outcome])

    await runner.execute_batch(
        tool_calls=[write_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=TurnSummary(final_response=""),
        dod=dod,
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert persistent_queue
    assert ephemeral_queue == []
    latest = persistent_queue[-1]
    assert "Confirmed progress:" in latest
    assert "Next step: create `01-introduction.html`." in latest
    resolved_intro = intro_chapter.resolve(strict=False)
    assert (
        f"Prefer one `write(file_path=..., content=...)` call for `{resolved_intro}` now."
        in latest
    )
    assert "Write a compact but real initial version of that file now" not in latest
    assert "Do not reread reference material or spend the next turn on bookkeeping." in latest
2805
2806
@pytest.mark.asyncio
async def test_tool_batch_runner_directory_handoff_uses_home_relative_path(
    temp_dir: Path,
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """The post-``mkdir`` handoff shows the next file relative to ``~``.

    ``HOME`` is pointed at the resolved ``temp_dir`` and the planned guide
    lives under ``Loader/guides/nginx`` inside it.  After a successful
    ``mkdir -p`` outcome, the last persistent steering message must name
    ``index.html`` as the next step and refer to it as
    ``~/Loader/guides/nginx/index.html``.
    """
    monkeypatch.setenv("HOME", str(temp_dir.resolve(strict=False)))

    async def reject_confidence_scoring(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def reject_verification(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run for this scenario")

    # Neither path is created by the test itself.
    nginx_dir = temp_dir / "Loader" / "guides" / "nginx"
    chapters_dir = nginx_dir / "chapters"
    index_file = nginx_dir / "index.html"

    plan_file = temp_dir / "implementation.md"
    plan_lines = [
        "# Implementation Plan",
        "",
        "## File Changes",
        f"- `{chapters_dir}/`",
        f"- `{index_file}`",
        "",
    ]
    plan_file.write_text("\n".join(plan_lines))

    runtime_context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=reject_confidence_scoring,
        verify_action=reject_verification,
        auto_recover=False,
    )
    # Only the persistent channel is captured in this test.
    persistent_queue: list[str] = []
    runtime_context.queue_steering_message_callback = persistent_queue.append

    dod_store = DefinitionOfDoneStore(temp_dir)
    runner = ToolBatchRunner(runtime_context, dod_store)
    dod = create_definition_of_done("Create a multi-file nginx guide.")
    dod.implementation_plan = str(plan_file)
    # (content, active_form) pairs; both todos are synced as pending.
    todo_specs = (
        (
            "Create the nginx directory structure",
            "Creating the nginx directory structure",
        ),
        (
            "Develop the main index.html file with proper structure",
            "Developing the main index.html file with proper structure",
        ),
    )
    sync_todos_to_definition_of_done(
        dod,
        [
            {"content": content, "active_form": active_form, "status": "pending"}
            for content, active_form in todo_specs
        ],
    )

    mkdir_call = ToolCall(
        id="mkdir-nginx-home",
        name="bash",
        arguments={"command": f"mkdir -p {chapters_dir}"},
    )
    mkdir_outcome = tool_outcome(tool_call=mkdir_call, output="", is_error=False)
    executor = FakeExecutor([mkdir_outcome])

    await runner.execute_batch(
        tool_calls=[mkdir_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=TurnSummary(final_response=""),
        dod=dod,
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert persistent_queue
    latest = persistent_queue[-1]
    assert "Next step: create `index.html`." in latest
    assert "`~/Loader/guides/nginx/index.html`" in latest
    assert "Write a compact but real initial version of that file now" in latest
2911
2912
@pytest.mark.asyncio
async def test_tool_batch_runner_redirects_post_write_self_audit_to_next_missing_artifact(
    temp_dir: Path,
) -> None:
    """Rereading a just-written file is redirected to the next missing artifact.

    ``index.html`` exists on disk and is recorded in the definition of done
    as a touched file with its todo completed, while the planned
    ``chapters/01-introduction.html`` has not been created.  After a
    successful ``read`` of ``index.html``, the last persistent steering
    message must say the contents are already known from the write, tell the
    model to create ``01-introduction.html``, and warn against another
    reread.  The ephemeral queue stays empty.
    """

    async def reject_confidence_scoring(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should not run in this scenario")

    async def reject_verification(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run in this scenario")

    nginx_dir = temp_dir / "guides" / "nginx"
    chapters_dir = nginx_dir / "chapters"
    chapters_dir.mkdir(parents=True)
    index_file = nginx_dir / "index.html"
    intro_chapter = chapters_dir / "01-introduction.html"

    # Every line of the index, including the last, is newline-terminated.
    index_lines = (
        "<html>",
        '<a href="chapters/01-introduction.html">Chapter 1: Introduction to Nginx</a>',
        '<a href="chapters/02-installation.html">Chapter 2: Installation and Setup</a>',
        "</html>",
    )
    index_file.write_text("".join(f"{line}\n" for line in index_lines))

    plan_file = temp_dir / "implementation.md"
    plan_lines = [
        "# Implementation Plan",
        "",
        "## File Changes",
        f"- `{nginx_dir}/`",
        f"- `{chapters_dir}/`",
        f"- `{index_file}`",
        f"- `{intro_chapter}`",
        "",
    ]
    plan_file.write_text("\n".join(plan_lines))

    runtime_context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=reject_confidence_scoring,
        verify_action=reject_verification,
        auto_recover=False,
    )
    # Capture both steering channels so the assertions can tell them apart.
    persistent_queue: list[str] = []
    ephemeral_queue: list[str] = []
    runtime_context.queue_steering_message_callback = persistent_queue.append
    runtime_context.queue_ephemeral_steering_message_callback = ephemeral_queue.append

    dod_store = DefinitionOfDoneStore(temp_dir)
    runner = ToolBatchRunner(runtime_context, dod_store)
    dod = create_definition_of_done("Create a multi-file nginx guide.")
    dod.implementation_plan = str(plan_file)
    # Record the index as already written and its todo as done.
    dod.touched_files.append(str(index_file))
    dod.completed_items.append("Develop the main index.html file for the nginx guide")
    dod.pending_items.append("Create chapter files for the nginx guide")

    read_call = ToolCall(
        id="read-index-self-audit",
        name="read",
        arguments={"file_path": str(index_file)},
    )
    read_outcome = tool_outcome(
        tool_call=read_call,
        output="1\t<html>\n",
        is_error=False,
    )
    executor = FakeExecutor([read_outcome])

    await runner.execute_batch(
        tool_calls=[read_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=TurnSummary(final_response=""),
        dod=dod,
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert persistent_queue
    latest = persistent_queue[-1]
    assert "You already have the current contents of `index.html` from the successful write." in latest
    assert "Resume by creating `01-introduction.html` now." in latest
    assert "Do not spend another turn rereading the file you just wrote or on TodoWrite alone." in latest
    assert ephemeral_queue == []
3019
3020
@pytest.mark.asyncio
async def test_tool_batch_runner_preserves_first_file_handoff_after_recovery_prompt(
    temp_dir: Path,
) -> None:
    """First-file handoff still fires when the turn follows an empty-response prompt.

    The session history holds a single ``[EMPTY ASSISTANT RESPONSE]`` user message.
    After a scripted successful ``write`` of ``index.html`` the runner must queue a
    persistent steering message that names ``01-introduction.html`` as the next step,
    must not use the "compact but real initial version" wording, and must leave the
    ephemeral channel empty.
    """

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        # Tripwire: reaching this hook fails the test immediately.
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        # Tripwire: reaching this hook fails the test immediately.
        raise AssertionError("Verification should not run for this scenario")

    site_root = temp_dir / "guides" / "nginx"
    chapters_dir = site_root / "chapters"
    chapters_dir.mkdir(parents=True)
    # The test itself never creates index.html; only the scripted outcome reports the write.
    index_file = site_root / "index.html"

    plan_lines = [
        "# Implementation Plan",
        "",
        "## File Changes",
        f"- `{chapters_dir}/`",
        f"- `{index_file}`",
        f"- `{chapters_dir / '01-introduction.html'}`",
        "",
    ]
    plan_file = temp_dir / "implementation.md"
    plan_file.write_text("\n".join(plan_lines))

    recovery_prompt = Message(
        role=Role.USER,
        content=(
            "[EMPTY ASSISTANT RESPONSE]\n"
            "Respond with that concrete mutation tool call now. Do not return an empty response."
        ),
    )
    runtime_context = build_context(
        temp_dir=temp_dir,
        messages=[recovery_prompt],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )

    # Capture both steering channels so each can be asserted on separately.
    persistent_messages: list[str] = []
    ephemeral_messages: list[str] = []
    runtime_context.queue_steering_message_callback = persistent_messages.append
    runtime_context.queue_ephemeral_steering_message_callback = ephemeral_messages.append

    batch_runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))
    dod = create_definition_of_done("Create a multi-file nginx guide.")
    dod.implementation_plan = str(plan_file)
    sync_todos_to_definition_of_done(
        dod,
        [
            {
                "content": "Create the main index.html file with proper structure",
                "active_form": "Creating the main index.html file with proper structure",
                "status": "pending",
            },
            {
                "content": "Create each chapter file with appropriate content",
                "active_form": "Creating each chapter file with appropriate content",
                "status": "pending",
            },
        ],
    )

    write_call = ToolCall(
        id="write-index-recovered",
        name="write",
        arguments={
            "file_path": str(index_file),
            "content": "<html></html>\n",
        },
    )
    scripted_outcome = tool_outcome(
        tool_call=write_call,
        output=f"Successfully wrote 14 bytes to {index_file}",
        is_error=False,
    )
    scripted_executor = FakeExecutor([scripted_outcome])

    turn_summary = TurnSummary(final_response="")
    await batch_runner.execute_batch(
        tool_calls=[write_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=turn_summary,
        dod=dod,
        executor=scripted_executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert persistent_messages
    assert ephemeral_messages == []
    handoff = persistent_messages[-1]
    assert "Next step: create `01-introduction.html`." in handoff
    assert "Write a compact but real initial version of that file now" not in handoff
3137
3138
@pytest.mark.asyncio
async def test_tool_batch_runner_todowrite_uses_concrete_output_language_for_aggregate_chapter_step(
    temp_dir: Path,
) -> None:
    """A TodoWrite with an aggregate "chapter files" item yields a concrete next file.

    ``index.html`` exists on disk and links to two chapter files, while the plan lists
    only the directories and ``index.html``.  After a scripted ``TodoWrite`` the last
    queued steering message must confirm the todo update and name
    ``01-introduction.html`` as the next step, rather than echoing the aggregate
    pending todo text.
    """

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        # Tripwire: reaching this hook fails the test immediately.
        raise AssertionError("Confidence scoring should not run in this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        # Tripwire: reaching this hook fails the test immediately.
        raise AssertionError("Verification should not run in this scenario")

    def aggregate_todos() -> list[dict[str, str]]:
        """Return a fresh copy of the two-item todo list used by this test."""
        return [
            {
                "content": "Develop the main index.html file with proper structure",
                "active_form": "Developing the main index.html file with proper structure",
                "status": "completed",
            },
            {
                "content": "Create chapter files with content and structure",
                "active_form": "Creating chapter files with content and structure",
                "status": "pending",
            },
        ]

    site_root = temp_dir / "guides" / "nginx"
    chapters_dir = site_root / "chapters"
    chapters_dir.mkdir(parents=True)
    index_file = site_root / "index.html"
    index_lines = [
        "<html>",
        '<a href="chapters/01-introduction.html">Chapter 1: Introduction to Nginx</a>',
        '<a href="chapters/02-installation.html">Chapter 2: Installation and Setup</a>',
        "</html>",
    ]
    index_file.write_text("\n".join(index_lines) + "\n")

    plan_lines = [
        "# Implementation Plan",
        "",
        "## File Changes",
        f"- `{site_root}/`",
        f"- `{chapters_dir}/`",
        f"- `{index_file}`",
        "",
    ]
    plan_file = temp_dir / "implementation.md"
    plan_file.write_text("\n".join(plan_lines))

    runtime_context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
    )
    queued_messages: list[str] = []
    runtime_context.queue_steering_message_callback = queued_messages.append

    batch_runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))
    dod = create_definition_of_done("Create a multi-file nginx guide.")
    dod.implementation_plan = str(plan_file)
    dod.touched_files.append(str(index_file))
    sync_todos_to_definition_of_done(dod, aggregate_todos())

    # The tool-call arguments and the outcome metadata share one list object.
    todos = aggregate_todos()
    todo_call = ToolCall(
        id="todo-aggregate",
        name="TodoWrite",
        arguments={"todos": todos},
    )
    scripted_outcome = tool_outcome(
        tool_call=todo_call,
        output="Todos updated",
        is_error=False,
        metadata={"new_todos": todos},
    )
    scripted_executor = FakeExecutor([scripted_outcome])

    turn_summary = TurnSummary(final_response="")
    await batch_runner.execute_batch(
        tool_calls=[todo_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=turn_summary,
        dod=dod,
        executor=scripted_executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert queued_messages
    nudge = queued_messages[-1]
    assert "Todo tracking is updated." in nudge
    assert "Next step: create `01-introduction.html`." in nudge
    generic_continue = (
        "Continue with the next pending item: `Create chapter files with content and structure`."
    )
    assert generic_continue not in nudge
3269
3270
@pytest.mark.asyncio
async def test_duplicate_observation_nudge_prioritizes_missing_artifact_over_review(
    temp_dir: Path,
) -> None:
    """A duplicate-read nudge points at the missing planned file, not the review todo.

    The plan lists ``06-ssl-configuration.html``, which is absent from disk, while the
    first pending todo is a linking/consistency review item.  The test first checks the
    prioritisation helper directly, then triggers the duplicate-observation nudge for a
    repeated ``read`` of ``index.html`` and inspects the last persistent message.
    """

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        # Tripwire: reaching this hook fails the test immediately.
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        # Tripwire: reaching this hook fails the test immediately.
        raise AssertionError("Verification should not run for this scenario")

    site_root = temp_dir / "guides" / "nginx"
    chapters_dir = site_root / "chapters"
    chapters_dir.mkdir(parents=True)
    index_file = site_root / "index.html"
    first_chapter = chapters_dir / "01-getting-started.html"
    missing_chapter = chapters_dir / "06-ssl-configuration.html"  # planned, never written
    first_chapter.write_text("<h1>One</h1>\n")
    index_file.write_text('<a href="chapters/01-getting-started.html">One</a>\n')

    plan_lines = [
        "# Implementation Plan",
        "",
        "## File Changes",
        f"- `{index_file}`",
        f"- `{first_chapter}`",
        f"- `{missing_chapter}`",
        "",
    ]
    plan_file = temp_dir / "implementation.md"
    plan_file.write_text("\n".join(plan_lines))

    runtime_context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )
    persistent_messages: list[str] = []
    ephemeral_messages: list[str] = []
    runtime_context.queue_steering_message_callback = persistent_messages.append
    runtime_context.queue_ephemeral_steering_message_callback = ephemeral_messages.append

    batch_runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))
    dod = create_definition_of_done("Create a multi-file nginx guide.")
    dod.implementation_plan = str(plan_file)
    sync_todos_to_definition_of_done(
        dod,
        [
            {
                "content": "Ensure all files are properly linked and formatted consistently",
                "active_form": "Working on: Ensure all files are properly linked and formatted consistently",
                "status": "pending",
            },
            {
                "content": "Create the final chapter (06-ssl-configuration.html)",
                "active_form": "Working on: Create the final chapter (06-ssl-configuration.html)",
                "status": "pending",
            },
        ],
    )

    # Direct check of the helper before exercising the nudge path.
    assert tool_batches_should_prioritize_missing_artifact(
        dod=dod,
        next_pending=dod.pending_items[0],
        missing_artifact=(missing_chapter, False),
        project_root=temp_dir,
    )

    repeated_read = ToolCall(
        id="dup-read",
        name="read",
        arguments={"file_path": str(index_file)},
    )
    batch_runner._queue_duplicate_observation_nudge(repeated_read, dod=dod)  # type: ignore[attr-defined]

    assert persistent_messages
    nudge = persistent_messages[-1]
    assert "06-ssl-configuration.html" in nudge
    assert "Do not switch into review or consistency-check mode" in nudge
    review_continue = (
        "Continue with the next pending item: `Ensure all files are properly linked and formatted consistently`"
    )
    assert review_continue not in nudge
3365
3366
@pytest.mark.asyncio
async def test_tool_batch_runner_hands_off_to_verification_once_planned_artifacts_exist(
    temp_dir: Path,
) -> None:
    """Once every planned file exists, a write triggers the verification handoff.

    ``index.html`` and both planned chapter files are present on disk before the
    scripted ``write`` of the second chapter runs.  Afterwards the persistent channel
    must contain the "all artifacts exist" statement, the pending review todo text,
    and the instruction to finish with a final response.
    """

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        # Tripwire: reaching this hook fails the test immediately.
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        # Tripwire: reaching this hook fails the test immediately.
        raise AssertionError("Verification should not run for this scenario")

    site_root = temp_dir / "guides" / "nginx"
    chapters_dir = site_root / "chapters"
    chapters_dir.mkdir(parents=True)
    index_file = site_root / "index.html"
    first_chapter = chapters_dir / "01-getting-started.html"
    second_chapter = chapters_dir / "02-installation.html"
    index_file.write_text('<a href="chapters/01-getting-started.html">One</a>\n')
    first_chapter.write_text("<h1>One</h1>\n")
    second_chapter.write_text("<h1>Two</h1>\n")

    plan_lines = [
        "# Implementation Plan",
        "",
        "## File Changes",
        f"- `{chapters_dir}/`",
        f"- `{index_file}`",
        f"- `{first_chapter}`",
        f"- `{second_chapter}`",
        "",
    ]
    plan_file = temp_dir / "implementation.md"
    plan_file.write_text("\n".join(plan_lines))

    runtime_context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )
    persistent_messages: list[str] = []
    ephemeral_messages: list[str] = []
    runtime_context.queue_steering_message_callback = persistent_messages.append
    runtime_context.queue_ephemeral_steering_message_callback = ephemeral_messages.append

    batch_runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))
    dod = create_definition_of_done("Create a multi-file nginx guide.")
    dod.implementation_plan = str(plan_file)
    sync_todos_to_definition_of_done(
        dod,
        [
            {
                "content": "Create the guide files",
                "active_form": "Working on: Create the guide files",
                "status": "completed",
            },
            {
                "content": "Ensure all files are properly linked and formatted consistently",
                "active_form": "Working on: Ensure all files are properly linked and formatted consistently",
                "status": "pending",
            },
        ],
    )

    write_call = ToolCall(
        id="write-final",
        name="write",
        arguments={
            "file_path": str(second_chapter),
            "content": "<h1>Two</h1>\n",
        },
    )
    scripted_outcome = tool_outcome(
        tool_call=write_call,
        output=f"Successfully wrote {second_chapter}",
        is_error=False,
    )
    scripted_executor = FakeExecutor([scripted_outcome])

    turn_summary = TurnSummary(final_response="")
    await batch_runner.execute_batch(
        tool_calls=[write_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=turn_summary,
        dod=dod,
        executor=scripted_executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    # Each fragment may appear in any persistent message, not necessarily the same one.
    expected_fragments = (
        "All explicitly planned artifacts now exist on disk.",
        "Ensure all files are properly linked and formatted consistently",
        "Finish with a final response once no specific mismatch remains so Loader can verify.",
    )
    for fragment in expected_fragments:
        assert any(fragment in queued for queued in persistent_messages)
3488
3489
@pytest.mark.asyncio
async def test_tool_batch_runner_mutation_handoff_points_at_next_missing_artifact(
    temp_dir: Path,
) -> None:
    """After writing ``index.html`` the handoff names the first missing planned chapter.

    Only ``index.html`` exists on disk; the plan also lists two chapter files.  After a
    scripted successful ``write`` of ``index.html`` the last persistent message must
    point at ``01-getting-started.html``, must avoid the "compact but real" and
    "refresh `TodoWrite`" wording, and must include the no-bookkeeping instruction.
    The ephemeral channel must stay empty.
    """

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        # Tripwire: reaching this hook fails the test immediately.
        raise AssertionError("Confidence scoring should not run in this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        # Tripwire: reaching this hook fails the test immediately.
        raise AssertionError("Verification should not run in this scenario")

    site_root = temp_dir / "guides" / "nginx"
    chapters_dir = site_root / "chapters"
    site_root.mkdir(parents=True)
    chapters_dir.mkdir()
    index_file = site_root / "index.html"
    index_file.write_text("<html></html>\n")
    # Neither chapter file is created by this test.
    first_chapter = chapters_dir / "01-getting-started.html"
    second_chapter = chapters_dir / "02-installation.html"

    plan_file = temp_dir / "implementation.md"
    plan_lines = [
        "# Implementation Plan",
        "",
        "## File Changes",
        f"- `{site_root}/`",
        f"- `{index_file}`",
        f"- `{first_chapter}`",
        f"- `{second_chapter}`",
        "",
    ]
    plan_file.write_text("\n".join(plan_lines))

    runtime_context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )
    persistent_messages: list[str] = []
    ephemeral_messages: list[str] = []
    runtime_context.queue_steering_message_callback = persistent_messages.append
    runtime_context.queue_ephemeral_steering_message_callback = ephemeral_messages.append

    batch_runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))
    dod = create_definition_of_done("Create a multi-file nginx guide.")
    dod.implementation_plan = str(plan_file)
    sync_todos_to_definition_of_done(
        dod,
        [
            {
                "content": "Create the main index.html file with proper structure",
                "active_form": "Working on: Create the main index.html file with proper structure",
                "status": "pending",
            },
            {
                "content": "Create each chapter file in sequence, following the established pattern",
                "active_form": "Working on: Create each chapter file in sequence, following the established pattern",
                "status": "pending",
            },
            {
                "content": "Ensure all files are properly linked and formatted consistently",
                "active_form": "Working on: Ensure all files are properly linked and formatted consistently",
                "status": "pending",
            },
        ],
    )

    write_call = ToolCall(
        id="write-index",
        name="write",
        arguments={"file_path": str(index_file), "content": "<html></html>\n"},
    )
    scripted_outcome = tool_outcome(
        tool_call=write_call,
        output=f"Successfully wrote {index_file}",
        is_error=False,
    )
    scripted_executor = FakeExecutor([scripted_outcome])

    turn_summary = TurnSummary(final_response="")
    await batch_runner.execute_batch(
        tool_calls=[write_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=turn_summary,
        dod=dod,
        executor=scripted_executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert persistent_messages
    assert ephemeral_messages == []
    handoff = persistent_messages[-1]
    assert "Next step: create `01-getting-started.html`." in handoff
    assert "Write a compact but real initial version of that file now" not in handoff
    assert "refresh `TodoWrite`" not in handoff
    assert "Do not reread reference material or spend the next turn on bookkeeping." in handoff
3599
3600
@pytest.mark.asyncio
async def test_tool_batch_runner_large_plan_does_not_claim_completion_early(
    temp_dir: Path,
) -> None:
    """With planned chapters still missing, the runner must not announce completion.

    The plan lists ``index.html`` plus seven chapter files.  ``index.html`` and the
    first five chapters exist on disk; chapters 06 and 07 are never created.  After a
    scripted successful ``write`` of chapter 05 the ephemeral channel must name
    ``06-performance-tuning.html`` as the next step, and the "all planned artifacts
    exist" statement must not appear on either steering channel.
    """

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        # Tripwire: reaching this hook fails the test immediately.
        raise AssertionError("Confidence scoring should not run in this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        # Tripwire: reaching this hook fails the test immediately.
        raise AssertionError("Verification should not run in this scenario")

    guide_root = temp_dir / "guides" / "nginx"
    chapters = guide_root / "chapters"
    guide_root.mkdir(parents=True)
    chapters.mkdir()
    index_path = guide_root / "index.html"
    index_path.write_text("<html></html>\n")

    chapter_paths = [
        chapters / "01-getting-started.html",
        chapters / "02-installation.html",
        chapters / "03-first-website.html",
        chapters / "04-configuration-basics.html",
        chapters / "05-advanced-configurations.html",
        chapters / "06-performance-tuning.html",
        chapters / "07-security-best-practices.html",
    ]
    # Chapters 01-05 exist on disk; 06 and 07 stay missing for the whole test.
    for chapter in chapter_paths[:4]:
        chapter.write_text(f"<h1>{chapter.stem}</h1>\n")
    chapter_paths[4].write_text("<h1>Advanced configurations</h1>\n")

    implementation_plan = temp_dir / "implementation.md"
    implementation_plan.write_text(
        "\n".join(
            [
                "# Implementation Plan",
                "",
                "## File Changes",
                f"- `{guide_root}/`",
                f"- `{chapters}/`",
                f"- `{index_path}`",
                *[f"- `{path}`" for path in chapter_paths],
                "",
            ]
        )
    )

    context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )
    persistent_messages: list[str] = []
    ephemeral_messages: list[str] = []
    context.queue_steering_message_callback = persistent_messages.append
    context.queue_ephemeral_steering_message_callback = ephemeral_messages.append
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))
    dod = create_definition_of_done("Create a thorough nginx guide.")
    dod.implementation_plan = str(implementation_plan)
    sync_todos_to_definition_of_done(
        dod,
        [
            {
                "content": "Create the nginx guide artifacts",
                "active_form": "Creating nginx guide artifacts",
                "status": "pending",
            },
            {
                "content": "Verify all guide files are linked and complete",
                "active_form": "Verifying guide linkage and completeness",
                "status": "pending",
            },
        ],
    )
    tool_call = ToolCall(
        id="write-chapter-05",
        name="write",
        arguments={
            "file_path": str(chapter_paths[4]),
            "content": "<h1>Advanced configurations</h1>\n",
        },
    )
    executor = FakeExecutor(
        [
            tool_outcome(
                tool_call=tool_call,
                output=f"Successfully wrote {chapter_paths[4]}",
                is_error=False,
            )
        ]
    )

    summary = TurnSummary(final_response="")
    await runner.execute_batch(
        tool_calls=[tool_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=summary,
        dod=dod,
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert any(
        "Next step: create `06-performance-tuning.html`." in message
        for message in ephemeral_messages
    )
    # Check both channels: a premature completion claim is wrong wherever it is
    # queued, and looking only at the ephemeral list would let one queued on the
    # persistent channel slip through.
    assert not any(
        "All explicitly planned artifacts now exist on disk." in message
        for message in (*persistent_messages, *ephemeral_messages)
    )
3727
3728
@pytest.mark.asyncio
async def test_tool_batch_runner_uses_compact_missing_artifact_nudge_after_substantial_progress(
    temp_dir: Path,
) -> None:
    """With most planned files written, the next-file nudge goes out on the ephemeral channel.

    ``index.html`` and chapters 01-04 exist on disk and are recorded as touched;
    chapter 05 is planned but absent.  After a scripted successful ``write`` of
    chapter 04 the last ephemeral message must name ``05-advanced-features.html``,
    include the no-bookkeeping instruction, and omit "refresh `TodoWrite`".
    """

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        # Tripwire: reaching this hook fails the test immediately.
        raise AssertionError("Confidence scoring should not run in this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        # Tripwire: reaching this hook fails the test immediately.
        raise AssertionError("Verification should not run in this scenario")

    site_root = temp_dir / "guides" / "nginx"
    chapters_dir = site_root / "chapters"
    site_root.mkdir(parents=True)
    chapters_dir.mkdir()
    index_file = site_root / "index.html"
    planned_chapters = [
        chapters_dir / "01-introduction.html",
        chapters_dir / "02-installation.html",
        chapters_dir / "03-configuration.html",
        chapters_dir / "04-basic-usage.html",
        chapters_dir / "05-advanced-features.html",
    ]
    # Everything except the fifth chapter is already on disk.
    existing_files = (index_file, *planned_chapters[:4])
    for existing in existing_files:
        existing.write_text("<html></html>\n")

    plan_lines = [
        "# Implementation Plan",
        "",
        "## File Changes",
        f"- `{site_root}/`",
        f"- `{chapters_dir}/`",
        f"- `{index_file}`",
        *[f"- `{chapter}`" for chapter in planned_chapters],
        "",
    ]
    plan_file = temp_dir / "implementation.md"
    plan_file.write_text("\n".join(plan_lines))

    runtime_context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )
    persistent_messages: list[str] = []
    ephemeral_messages: list[str] = []
    runtime_context.queue_steering_message_callback = persistent_messages.append
    runtime_context.queue_ephemeral_steering_message_callback = ephemeral_messages.append

    batch_runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))
    dod = create_definition_of_done("Create a thorough nginx guide.")
    dod.implementation_plan = str(plan_file)
    dod.touched_files.extend(str(existing) for existing in existing_files)
    dod.completed_items.extend(
        [
            "Create the nginx directory structure",
            "Create the main index.html file with proper structure",
        ]
    )
    sync_todos_to_definition_of_done(
        dod,
        [
            {
                "content": "Create each chapter file with appropriate content",
                "active_form": "Creating each chapter file with appropriate content",
                "status": "pending",
            }
        ],
    )

    fourth_chapter = planned_chapters[3]
    write_call = ToolCall(
        id="write-chapter-04",
        name="write",
        arguments={
            "file_path": str(fourth_chapter),
            "content": "<html>updated</html>\n",
        },
    )
    scripted_outcome = tool_outcome(
        tool_call=write_call,
        output=f"Successfully wrote {fourth_chapter}",
        is_error=False,
    )
    scripted_executor = FakeExecutor([scripted_outcome])

    turn_summary = TurnSummary(final_response="")
    await batch_runner.execute_batch(
        tool_calls=[write_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=turn_summary,
        dod=dod,
        executor=scripted_executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert ephemeral_messages
    nudge = ephemeral_messages[-1]
    assert "Next step: create `05-advanced-features.html`." in nudge
    assert "Do not reread reference material or spend the next turn on bookkeeping." in nudge
    assert "refresh `TodoWrite`" not in nudge
3849
3850
@pytest.mark.asyncio
async def test_tool_batch_runner_todowrite_with_missing_artifact_requeues_exact_resume_step(
    temp_dir: Path,
) -> None:
    """A TodoWrite-only turn with a planned file still missing re-issues the exact next step.

    ``index.html`` and chapter 01 exist on disk; chapter 02 is planned but absent.
    After a scripted ``TodoWrite`` the last persistent message must name
    ``02-installation.html`` as the next step, recommend a single ``write`` call, and
    demand a concrete mutation tool call.  The ephemeral channel must stay empty.
    """

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        # Tripwire: reaching this hook fails the test immediately.
        raise AssertionError("Confidence scoring should not run in this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        # Tripwire: reaching this hook fails the test immediately.
        raise AssertionError("Verification should not run in this scenario")

    def chapter_todos() -> list[dict[str, str]]:
        """Return a fresh copy of the two-item todo list used throughout this test."""
        return [
            {
                "content": "Create 01-getting-started.html",
                "active_form": "Creating 01-getting-started.html",
                "status": "completed",
            },
            {
                "content": "Create 02-installation.html",
                "active_form": "Creating 02-installation.html",
                "status": "pending",
            },
        ]

    site_root = temp_dir / "guides" / "nginx"
    chapters_dir = site_root / "chapters"
    site_root.mkdir(parents=True)
    chapters_dir.mkdir()
    index_file = site_root / "index.html"
    index_file.write_text("<html></html>\n")
    first_chapter = chapters_dir / "01-getting-started.html"
    second_chapter = chapters_dir / "02-installation.html"  # planned, never written
    first_chapter.write_text("<h1>One</h1>\n")

    plan_lines = [
        "# Implementation Plan",
        "",
        "## File Changes",
        f"- `{site_root}/`",
        f"- `{chapters_dir}/`",
        f"- `{index_file}`",
        f"- `{first_chapter}`",
        f"- `{second_chapter}`",
        "",
    ]
    plan_file = temp_dir / "implementation.md"
    plan_file.write_text("\n".join(plan_lines))

    runtime_context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )
    persistent_messages: list[str] = []
    ephemeral_messages: list[str] = []
    runtime_context.queue_steering_message_callback = persistent_messages.append
    runtime_context.queue_ephemeral_steering_message_callback = ephemeral_messages.append

    batch_runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))
    dod = create_definition_of_done("Create a multi-file nginx guide.")
    dod.implementation_plan = str(plan_file)
    sync_todos_to_definition_of_done(dod, chapter_todos())
    dod.touched_files.extend([str(index_file), str(first_chapter)])

    # Separate copies for the call arguments and the outcome metadata.
    todo_call = ToolCall(
        id="todo-only",
        name="TodoWrite",
        arguments={"todos": chapter_todos()},
    )
    scripted_outcome = tool_outcome(
        tool_call=todo_call,
        output="Todos updated",
        is_error=False,
        metadata={"new_todos": chapter_todos()},
    )
    scripted_executor = FakeExecutor([scripted_outcome])

    turn_summary = TurnSummary(final_response="")
    await batch_runner.execute_batch(
        tool_calls=[todo_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=turn_summary,
        dod=dod,
        executor=scripted_executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert persistent_messages
    resume_step = persistent_messages[-1]
    assert "Todo tracking is updated. Next step: create `02-installation.html`." in resume_step
    assert "Prefer one `write(file_path=..., content=...)` call" in resume_step
    assert "Make your next response the concrete mutation tool call itself." in resume_step
    assert ephemeral_messages == []
3992
3993
@pytest.mark.asyncio
async def test_tool_batch_runner_todowrite_after_artifacts_exist_pushes_verification_handoff(
    temp_dir: Path,
) -> None:
    """A TodoWrite-only turn after all planned files exist hands off to verification.

    Every file listed in the plan is present on disk, and the todo list still carries
    a stale "examine the Fortran guide" research item ahead of the verification item.
    After a scripted ``TodoWrite`` the last queued message must announce that all
    planned artifacts exist, mention the verification todo and "reopen reference
    materials", ask for a final response, and omit the stale Fortran item.  The
    context must still report the ``"execute"`` workflow mode.
    """

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        # Tripwire: reaching this hook fails the test immediately.
        raise AssertionError("Confidence scoring should not run in this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        # Tripwire: reaching this hook fails the test immediately.
        raise AssertionError("Verification should not run in this scenario")

    def stale_todos() -> list[dict[str, str]]:
        """Return a fresh copy of the two-item todo list used throughout this test."""
        return [
            {
                "content": "First, examine the existing Fortran guide structure to understand the format and content organization",
                "active_form": "Working on: First, examine the existing Fortran guide structure to understand the format and content organization",
                "status": "pending",
            },
            {
                "content": "Verify all guide files are linked and complete",
                "active_form": "Working on: Verify all guide files are linked and complete",
                "status": "pending",
            },
        ]

    site_root = temp_dir / "guides" / "nginx"
    chapters_dir = site_root / "chapters"
    site_root.mkdir(parents=True)
    chapters_dir.mkdir()
    index_file = site_root / "index.html"
    first_chapter = chapters_dir / "01-getting-started.html"
    second_chapter = chapters_dir / "02-installation.html"
    index_file.write_text("<html></html>\n")
    first_chapter.write_text("<h1>One</h1>\n")
    second_chapter.write_text("<h1>Two</h1>\n")

    plan_lines = [
        "# Implementation Plan",
        "",
        "## File Changes",
        f"- `{site_root}/`",
        f"- `{chapters_dir}/`",
        f"- `{index_file}`",
        f"- `{first_chapter}`",
        f"- `{second_chapter}`",
        "",
    ]
    plan_file = temp_dir / "implementation.md"
    plan_file.write_text("\n".join(plan_lines))

    runtime_context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )
    queued_messages: list[str] = []
    runtime_context.queue_steering_message_callback = queued_messages.append

    batch_runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))
    dod = create_definition_of_done("Create a multi-file nginx guide.")
    dod.implementation_plan = str(plan_file)
    dod.verification_commands = [f"ls -la {site_root}"]
    sync_todos_to_definition_of_done(
        dod,
        stale_todos(),
        project_root=temp_dir,
    )

    # Separate copies for the call arguments and the outcome metadata.
    todo_call = ToolCall(
        id="todo-only",
        name="TodoWrite",
        arguments={"todos": stale_todos()},
    )
    scripted_outcome = tool_outcome(
        tool_call=todo_call,
        output="Todos updated",
        is_error=False,
        metadata={"new_todos": stale_todos()},
    )
    scripted_executor = FakeExecutor([scripted_outcome])

    turn_summary = TurnSummary(final_response="")
    await batch_runner.execute_batch(
        tool_calls=[todo_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=turn_summary,
        dod=dod,
        executor=scripted_executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert queued_messages
    handoff = queued_messages[-1]
    assert "Todo tracking is updated. All explicitly planned artifacts now exist on disk." in handoff
    assert "Verify all guide files are linked and complete" in handoff
    finish_instruction = (
        "Finish with a final response once no specific mismatch remains so Loader can verify."
    )
    assert finish_instruction in handoff
    assert "reopen reference materials" in handoff
    assert "Fortran guide structure" not in handoff
    assert runtime_context.workflow_mode == "execute"
4140
4141
@pytest.mark.asyncio
async def test_tool_batch_runner_todowrite_after_outputs_exist_but_links_missing_still_handoffs_to_verify(
    temp_dir: Path,
) -> None:
    """TodoWrite issued once every planned file exists must hand off to verify.

    The index page contains a ``../index.html`` link whose target this test
    never creates. The queued steering message is still expected to ask for a
    final response, and the workflow mode is expected to become ``verify``.
    """

    async def reject_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should not run for this scenario")

    async def reject_verification(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run for this scenario")

    def in_progress_todo() -> dict[str, str]:
        # Build a fresh dict on every call so the DoD sync, the tool arguments
        # and the outcome metadata never share a single mutable object.
        return {
            "content": "Create chapter files following the established pattern",
            "active_form": "Creating chapter files",
            "status": "in_progress",
        }

    # --- generated guide already on disk -------------------------------------
    site_dir = temp_dir / "guides" / "nginx"
    chapter_dir = site_dir / "chapters"
    site_dir.mkdir(parents=True)
    chapter_dir.mkdir()
    index_page = site_dir / "index.html"
    intro_page = chapter_dir / "01-introduction.html"
    install_page = chapter_dir / "02-installation.html"
    index_lines = [
        '<a href="chapters/01-introduction.html">Intro</a>',
        '<a href="chapters/02-installation.html">Install</a>',
        '<a href="../index.html">Back</a>',
        "",
    ]
    index_page.write_text("\n".join(index_lines))
    for page in (intro_page, install_page):
        page.write_text("<html></html>\n")

    # --- implementation plan naming every artifact above ---------------------
    plan_path = temp_dir / "implementation.md"
    plan_lines = [
        "# Implementation Plan",
        "",
        "## File Changes",
        f"- `{site_dir}/`",
        f"- `{chapter_dir}/`",
        f"- `{index_page}`",
        f"- `{intro_page}`",
        f"- `{install_page}`",
        "",
    ]
    plan_path.write_text("\n".join(plan_lines))

    context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=reject_confidence,
        verify_action=reject_verification,
        auto_recover=False,
    )
    steering_log: list[str] = []
    context.queue_steering_message_callback = steering_log.append
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))

    dod = create_definition_of_done("Create a multi-file nginx guide.")
    dod.implementation_plan = str(plan_path)
    dod.verification_commands = [f"ls -la {site_dir}"]
    sync_todos_to_definition_of_done(
        dod,
        [in_progress_todo()],
        project_root=temp_dir,
    )

    todo_call = ToolCall(
        id="todo-post-build",
        name="TodoWrite",
        arguments={"todos": [in_progress_todo()]},
    )
    scripted_outcome = tool_outcome(
        tool_call=todo_call,
        output="Todos updated",
        is_error=False,
        metadata={"new_todos": [in_progress_todo()]},
    )
    executor = FakeExecutor([scripted_outcome])

    turn_summary = TurnSummary(final_response="")
    await runner.execute_batch(
        tool_calls=[todo_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=turn_summary,
        dod=dod,
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert steering_log
    nudge = steering_log[-1]
    for expected_fragment in (
        "Todo tracking is updated. All explicitly planned artifacts now exist on disk.",
        "Finish with a final response now so Loader can run verification automatically.",
    ):
        assert expected_fragment in nudge
    assert "Repair or verify the current files instead of expanding the artifact set." not in nudge
    assert context.workflow_mode == "verify"
4277
4278
@pytest.mark.asyncio
async def test_tool_batch_runner_todowrite_during_quality_repair_requires_mutation(
    temp_dir: Path,
) -> None:
    """TodoWrite alone must not close an active HTML content-quality repair.

    The conversation holds a "Repair focus" message for one chapter and the
    workflow starts in ``verify``. After a TodoWrite marks the todo completed,
    the test expects a steering message pointing back at the file edit, the
    workflow mode set to ``execute``, no final response, and the DoD
    pending/completed lists left exactly as they were before the call.
    """

    async def reject_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should not run for this scenario")

    async def reject_verification(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run for this scenario")

    def quality_todo(status: str) -> dict[str, str]:
        # Fresh dict per call; only the status differs between the seeded todo
        # and the one submitted through TodoWrite.
        return {
            "content": "Expand generated chapters to satisfy quality verification",
            "active_form": "Expanding generated chapters",
            "status": status,
        }

    site_dir = temp_dir / "guides" / "nginx"
    chapter_dir = site_dir / "chapters"
    chapter_dir.mkdir(parents=True)
    index_page = site_dir / "index.html"
    thin_chapter = chapter_dir / "01-introduction.html"
    for page in (index_page, thin_chapter):
        page.write_text("<html></html>\n")

    plan_path = temp_dir / "implementation.md"
    plan_lines = [
        "# Implementation Plan",
        "",
        "## File Changes",
        f"- `{site_dir}/`",
        f"- `{chapter_dir}/`",
        f"- `{index_page}`",
        f"- `{thin_chapter}`",
        "",
    ]
    plan_path.write_text("\n".join(plan_lines))

    repair_brief = (
        "Repair focus:\n"
        f"- Improve `{thin_chapter}`: thin content (409 text chars, expected at least 1758).\n"
        f"- Improve `{thin_chapter}`: insufficient structured content (6 blocks, expected at least 18).\n"
        f"- Immediate next step: edit `{thin_chapter}`.\n"
    )
    context = build_context(
        temp_dir=temp_dir,
        messages=[Message(role=Role.USER, content=repair_brief)],
        safeguards=FakeSafeguards(),
        assess_confidence=reject_confidence,
        verify_action=reject_verification,
        auto_recover=False,
    )
    context.set_workflow_mode("verify")
    steering_log: list[str] = []
    response_events: list[str] = []
    context.queue_steering_message_callback = steering_log.append
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))

    dod = create_definition_of_done("Create a multi-file nginx guide.")
    dod.implementation_plan = str(plan_path)
    dod.verification_commands = [f"ls -la {site_dir}"]
    sync_todos_to_definition_of_done(
        dod,
        [quality_todo("in_progress")],
        project_root=temp_dir,
    )
    # Snapshot the DoD bookkeeping so the test can prove TodoWrite left it alone.
    pending_snapshot = list(dod.pending_items)
    completed_snapshot = list(dod.completed_items)

    todo_call = ToolCall(
        id="todo-quality",
        name="TodoWrite",
        arguments={"todos": [quality_todo("completed")]},
    )
    scripted_outcome = tool_outcome(
        tool_call=todo_call,
        output="Todos updated",
        is_error=False,
        metadata={"new_todos": [quality_todo("completed")]},
    )
    executor = FakeExecutor([scripted_outcome])

    async def record_responses(event: AgentEvent) -> None:
        # Only "response" events matter; everything else is ignored.
        if event.type != "response":
            return
        response_events.append(str(event.content))

    turn_summary = TurnSummary(final_response="")
    batch_result = await runner.execute_batch(
        tool_calls=[todo_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=record_responses,
        summary=turn_summary,
        dod=dod,
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert steering_log
    nudge = steering_log[-1]
    for expected_fragment in (
        "verification still has an active HTML content-quality repair",
        "TodoWrite cannot satisfy that verifier",
        f"Immediate next step: edit `{thin_chapter.resolve(strict=False)}`",
        "thin content",
    ):
        assert expected_fragment in nudge
    assert "Finish with a final response now" not in nudge
    assert context.workflow_mode == "execute"
    assert batch_result.halted is False
    assert turn_summary.final_response == ""
    assert not response_events
    assert dod.pending_items == pending_snapshot
    assert dod.completed_items == completed_snapshot
4426
4427
def test_todowrite_quality_repair_nudge_uses_exact_anchor_after_stale_context(
    temp_dir: Path,
) -> None:
    """The TodoWrite resume nudge should quote the file's current closing tail.

    The history holds a repair-focus message followed by a failed ``edit``
    observation ("old_string not found in file"). Calling
    ``_queue_todowrite_resume_nudge`` directly is expected to queue a message
    that names the chapter and supplies ``</body></html>`` as the ``old_string``
    anchor.
    """

    async def reject_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence should not run for direct nudge test")

    async def reject_verification(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run for direct nudge test")

    chapter_dir = temp_dir / "guides" / "nginx" / "chapters"
    chapter_dir.mkdir(parents=True)
    target_chapter = chapter_dir / "05-load-balancing.html"
    target_chapter.write_text("<html><body><h1>Load Balancing</h1></body></html>\n")

    repair_request = Message(
        role=Role.USER,
        content=(
            "Repair focus:\n"
            f"- Improve `{target_chapter}`: thin content "
            "(846 text chars, expected at least 1758).\n"
            f"- Immediate next step: edit `{target_chapter}`.\n"
        ),
    )
    failed_edit_observation = Message(
        role=Role.TOOL,
        content=(
            "Observation [edit]: Error: Failed to complete the operation "
            f"after 2 attempts for {target_chapter}. old_string not found in file."
        ),
    )
    context = build_context(
        temp_dir=temp_dir,
        messages=[repair_request, failed_edit_observation],
        safeguards=FakeSafeguards(),
        assess_confidence=reject_confidence,
        verify_action=reject_verification,
        auto_recover=False,
    )
    steering_log: list[str] = []
    context.queue_steering_message_callback = steering_log.append
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))
    dod = create_definition_of_done("Create a multi-file nginx guide.")

    runner._queue_todowrite_resume_nudge(dod=dod)

    assert steering_log
    nudge = steering_log[-1]
    for expected_fragment in (
        f"Immediate next step: edit `{target_chapter.resolve(strict=False)}`",
        "`edit(file_path=..., old_string=..., new_string=...)`",
        "Use this exact current closing-tail anchor as `old_string`",
        "```html\n</body></html>\n```",
        "do not call `read`, `patch`, `write`, or TodoWrite again first",
    ):
        assert expected_fragment in nudge
4490
4491
@pytest.mark.asyncio
async def test_tool_batch_runner_preempts_post_build_audit_after_todowrite_verify_handoff(
    temp_dir: Path,
) -> None:
    """A read queued behind a verify-handoff TodoWrite must never execute.

    The batch holds a TodoWrite followed by a ``read`` of the index page. With
    every planned file already on disk, only the TodoWrite is expected to reach
    the executor, a single tool result is recorded, and the workflow mode is
    expected to become ``verify``.
    """

    async def reject_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should not run for this scenario")

    async def reject_verification(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run for this scenario")

    site_dir = temp_dir / "guides" / "nginx"
    chapter_dir = site_dir / "chapters"
    site_dir.mkdir(parents=True)
    chapter_dir.mkdir()
    index_page = site_dir / "index.html"
    intro_page = chapter_dir / "01-introduction.html"
    install_page = chapter_dir / "02-installation.html"
    index_lines = [
        '<li><a href="chapters/01-introduction.html">Chapter 1: Introduction</a></li>',
        '<li><a href="chapters/02-installation.html">Chapter 2: Installation</a></li>',
        "",
    ]
    index_page.write_text("\n".join(index_lines))
    for page in (intro_page, install_page):
        page.write_text("<html></html>\n")

    plan_path = temp_dir / "implementation.md"
    plan_lines = [
        "# Implementation Plan",
        "",
        "## File Changes",
        f"- `{site_dir}/`",
        f"- `{chapter_dir}/`",
        f"- `{index_page}`",
        f"- `{intro_page}`",
        f"- `{install_page}`",
        "",
    ]
    plan_path.write_text("\n".join(plan_lines))

    context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=reject_confidence,
        verify_action=reject_verification,
        auto_recover=False,
    )
    steering_log: list[str] = []
    context.queue_steering_message_callback = steering_log.append
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))
    dod = create_definition_of_done("Create a multi-file nginx guide.")
    dod.implementation_plan = str(plan_path)
    dod.verification_commands = [f"ls -la {site_dir}"]

    todo_call = ToolCall(
        id="todo-post-build-preempt",
        name="TodoWrite",
        arguments={"todos": []},
    )
    trailing_read = ToolCall(
        id="read-after-todo",
        name="read",
        arguments={"file_path": str(index_page)},
    )
    # An outcome is scripted for the read as well, so if it is not preempted it
    # would show up in executor.calls.
    scripted_outcomes = [
        tool_outcome(
            tool_call=todo_call,
            output="Todos updated",
            is_error=False,
            metadata={"new_todos": []},
        ),
        tool_outcome(
            tool_call=trailing_read,
            output=index_page.read_text(),
            is_error=False,
        ),
    ]
    executor = FakeExecutor(scripted_outcomes)

    turn_summary = TurnSummary(final_response="")
    batch_result = await runner.execute_batch(
        tool_calls=[todo_call, trailing_read],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=turn_summary,
        dod=dod,
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert batch_result.continue_after_batch is True
    assert batch_result.halted is False
    executed_ids = [call.id for call in executor.calls]
    assert executed_ids == ["todo-post-build-preempt"]
    assert len(turn_summary.tool_result_messages) == 1
    assert context.workflow_mode == "verify"
    assert steering_log
    assert "Finish with a final response now so Loader can run verification automatically." in steering_log[-1]
4610
4611
@pytest.mark.asyncio
async def test_tool_batch_runner_todowrite_complete_directory_plan_does_not_reinfer_first_child(
    temp_dir: Path,
) -> None:
    """An empty TodoWrite against a directory-level plan halts for verification.

    The plan lists only the index file and the chapters directory, while the
    history shows a read of a reference chapter that shares its file name with a
    generated chapter. The batch is expected to halt with the fixed
    "Todo tracking is complete" response, and the last tool result must not
    mention ``01-introduction.html``, "chapter files" or the fortran guide.
    """

    async def reject_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should not run for this scenario")

    async def reject_verification(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run for this scenario")

    # Reference material that was read earlier in the conversation.
    reference_chapter = temp_dir / "fortran" / "chapters" / "01-introduction.html"
    reference_chapter.parent.mkdir(parents=True)
    reference_chapter.write_text("<h1>Introduction</h1>\n")

    # Generated guide, fully present on disk.
    site_dir = temp_dir / "Loader" / "guides" / "nginx"
    chapter_dir = site_dir / "chapters"
    site_dir.mkdir(parents=True)
    chapter_dir.mkdir()
    index_page = site_dir / "index.html"
    generated_chapters = (
        chapter_dir / "01-introduction.html",
        chapter_dir / "02-installation.html",
        chapter_dir / "03-basic-configuration.html",
    )
    index_lines = [
        '<a href="chapters/01-introduction.html">Introduction</a>',
        '<a href="chapters/02-installation.html">Installation</a>',
        '<a href="chapters/03-basic-configuration.html">Configuration</a>',
        "",
    ]
    index_page.write_text("\n".join(index_lines))
    for page in generated_chapters:
        page.write_text("<html></html>\n")

    plan_path = temp_dir / "implementation.md"
    plan_lines = [
        "# Implementation Plan",
        "",
        "## File Changes",
        f"- `{site_dir / 'index.html'}`",
        f"- `{chapter_dir}/`",
        "",
    ]
    plan_path.write_text("\n".join(plan_lines))

    earlier_reference_read = ToolCall(
        id="read-reference-child",
        name="read",
        arguments={"file_path": str(reference_chapter)},
    )
    history = [
        Message(
            role=Role.ASSISTANT,
            content="I examined the reference guide structure.",
            tool_calls=[earlier_reference_read],
        )
    ]
    context = build_context(
        temp_dir=temp_dir,
        messages=history,
        safeguards=FakeSafeguards(),
        assess_confidence=reject_confidence,
        verify_action=reject_verification,
        auto_recover=False,
    )
    steering_log: list[str] = []
    context.queue_steering_message_callback = steering_log.append
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))
    dod = create_definition_of_done("Create an equally thorough nginx guide.")
    dod.implementation_plan = str(plan_path)
    dod.verification_commands = [f"ls -la {site_dir}"]

    todo_call = ToolCall(
        id="todo-complete-directory-plan",
        name="TodoWrite",
        arguments={"todos": []},
    )
    scripted_outcome = tool_outcome(
        tool_call=todo_call,
        output="Todos updated",
        is_error=False,
        metadata={"new_todos": []},
    )
    executor = FakeExecutor([scripted_outcome])

    turn_summary = TurnSummary(final_response="")
    batch_result = await runner.execute_batch(
        tool_calls=[todo_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=turn_summary,
        dod=dod,
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert batch_result.halted is True
    assert batch_result.final_response == (
        "Todo tracking is complete; running Loader verification on the generated "
        "files now."
    )
    assert turn_summary.final_response == batch_result.final_response
    assert context.workflow_mode == "verify"
    assert turn_summary.tool_result_messages
    last_tool_text = turn_summary.tool_result_messages[-1].content
    assert "final response should be provided next for Loader verification" in last_tool_text
    assert "01-introduction.html" not in last_tool_text
    lowered_tool_text = last_tool_text.lower()
    assert "chapter files" not in lowered_tool_text
    assert "fortran guide structure" not in lowered_tool_text
4745
4746
@pytest.mark.asyncio
async def test_tool_batch_runner_preempts_post_build_observation_batch_for_verify_handoff(
    temp_dir: Path,
) -> None:
    """A post-build audit batch stops after its first observation.

    The batch holds a ``bash`` listing followed by a ``read``. With the planned
    directories and index present, only the ``bash`` call is expected to reach
    the executor, the workflow mode is expected to become ``verify``, and a
    steering message asking for a final response is expected.
    """

    async def reject_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should not run for this scenario")

    async def reject_verification(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run for this scenario")

    site_dir = temp_dir / "guides" / "nginx"
    chapter_dir = site_dir / "chapters"
    site_dir.mkdir(parents=True)
    chapter_dir.mkdir()
    index_page = site_dir / "index.html"
    generated_chapters = (
        chapter_dir / "01-introduction.html",
        chapter_dir / "02-installation.html",
        chapter_dir / "03-configuration.html",
    )
    index_lines = [
        '<li><a href="chapters/01-introduction.html">Chapter 1: Introduction</a></li>',
        '<li><a href="chapters/02-installation.html">Chapter 2: Installation</a></li>',
        "",
    ]
    index_page.write_text("\n".join(index_lines))
    for page in generated_chapters:
        page.write_text("<html></html>\n")

    plan_path = temp_dir / "implementation.md"
    plan_lines = [
        "# Implementation Plan",
        "",
        "## File Changes",
        f"- `{site_dir}/`",
        f"- `{chapter_dir}/`",
        f"- `{index_page}`",
        "",
    ]
    plan_path.write_text("\n".join(plan_lines))

    context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=reject_confidence,
        verify_action=reject_verification,
        auto_recover=False,
    )
    steering_log: list[str] = []
    context.queue_steering_message_callback = steering_log.append
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))
    dod = create_definition_of_done("Create an equally thorough nginx guide.")
    dod.implementation_plan = str(plan_path)
    dod.verification_commands = [f"ls -la {site_dir}"]

    listing_call = ToolCall(
        id="bash-post-build-audit",
        name="bash",
        arguments={"command": f"ls -la {site_dir}"},
    )
    index_read_call = ToolCall(
        id="read-index-after-audit",
        name="read",
        arguments={"file_path": str(index_page)},
    )
    scripted_outcomes = [
        tool_outcome(
            tool_call=listing_call,
            output="total 8\n",
            is_error=False,
        ),
        tool_outcome(
            tool_call=index_read_call,
            output=index_page.read_text(),
            is_error=False,
        ),
    ]
    executor = FakeExecutor(scripted_outcomes)

    turn_summary = TurnSummary(final_response="")
    batch_result = await runner.execute_batch(
        tool_calls=[listing_call, index_read_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=turn_summary,
        dod=dod,
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert batch_result.continue_after_batch is True
    executed_ids = [call.id for call in executor.calls]
    assert executed_ids == ["bash-post-build-audit"]
    assert context.workflow_mode == "verify"
    assert steering_log
    assert "Finish with a final response now so Loader can run verification automatically." in steering_log[-1]
4862
4863
@pytest.mark.asyncio
async def test_tool_batch_runner_preempts_post_build_observation_batch_during_consistency_review(
    temp_dir: Path,
) -> None:
    """A pending consistency-review todo still limits the batch to one read.

    Two ``read`` calls are submitted while the only DoD todo is a pending
    review item. Only the first read is expected to reach the executor, and a
    steering message (ephemeral if one was queued, otherwise regular) must say
    that all planned artifacts already exist.
    """

    async def reject_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should not run for this scenario")

    async def reject_verification(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run for this scenario")

    site_dir = temp_dir / "guides" / "nginx"
    chapter_dir = site_dir / "chapters"
    site_dir.mkdir(parents=True)
    chapter_dir.mkdir()
    index_page = site_dir / "index.html"
    intro_page = chapter_dir / "01-introduction.html"
    install_page = chapter_dir / "02-installation.html"
    config_page = chapter_dir / "03-basic-configuration.html"
    for page in (index_page, intro_page, install_page, config_page):
        page.write_text("<html></html>\n")

    plan_path = temp_dir / "implementation.md"
    plan_lines = [
        "# Implementation Plan",
        "",
        "## File Changes",
        f"- `{site_dir}/`",
        f"- `{chapter_dir}/`",
        f"- `{index_page}`",
        "",
    ]
    plan_path.write_text("\n".join(plan_lines))

    context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=reject_confidence,
        verify_action=reject_verification,
        auto_recover=False,
    )
    steering_log: list[str] = []
    ephemeral_log: list[str] = []
    context.queue_steering_message_callback = steering_log.append
    context.queue_ephemeral_steering_message_callback = ephemeral_log.append
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))
    dod = create_definition_of_done("Create an equally thorough nginx guide.")
    dod.implementation_plan = str(plan_path)
    dod.verification_commands = [f"ls -la {site_dir}"]
    review_todo = {
        "content": "Review the generated guide for consistency and completeness",
        "active_form": "Reviewing the generated guide for consistency and completeness",
        "status": "pending",
    }
    sync_todos_to_definition_of_done(
        dod,
        [review_todo],
        project_root=temp_dir,
    )

    first_read = ToolCall(
        id="read-index-during-review",
        name="read",
        arguments={"file_path": str(index_page)},
    )
    follow_up_read = ToolCall(
        id="read-chapter-after-review",
        name="read",
        arguments={"file_path": str(intro_page)},
    )
    scripted_outcomes = [
        tool_outcome(
            tool_call=first_read,
            output=index_page.read_text(),
            is_error=False,
        ),
        tool_outcome(
            tool_call=follow_up_read,
            output=intro_page.read_text(),
            is_error=False,
        ),
    ]
    executor = FakeExecutor(scripted_outcomes)

    turn_summary = TurnSummary(final_response="")
    batch_result = await runner.execute_batch(
        tool_calls=[first_read, follow_up_read],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=turn_summary,
        dod=dod,
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert batch_result.continue_after_batch is True
    executed_ids = [call.id for call in executor.calls]
    assert executed_ids == ["read-index-during-review"]
    # Prefer the ephemeral channel when anything landed there.
    delivered = ephemeral_log if ephemeral_log else steering_log
    assert delivered
    latest = delivered[-1]
    assert "All explicitly planned artifacts already exist." in latest
    assert "generated files" in latest
4985
4986
@pytest.mark.asyncio
async def test_tool_batch_runner_skips_post_build_user_question_during_consistency_review(
    temp_dir: Path,
) -> None:
    """An AskUserQuestion raised after all outputs exist is skipped, not run.

    With every planned file on disk and one review-style pending item, the
    question call must never reach the executor. A steering message is expected
    to redirect towards a final response, the workflow mode is expected to be
    ``verify``, and the recorded tool result must carry the "Skipped" marker.
    """

    async def reject_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should not run for this scenario")

    async def reject_verification(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run for this scenario")

    site_dir = temp_dir / "guides" / "nginx"
    chapter_dir = site_dir / "chapters"
    site_dir.mkdir(parents=True)
    chapter_dir.mkdir()
    index_page = site_dir / "index.html"
    intro_page = chapter_dir / "01-introduction.html"
    install_page = chapter_dir / "02-installation.html"
    index_lines = [
        '<li><a href="chapters/01-introduction.html">Chapter 1: Introduction</a></li>',
        '<li><a href="chapters/02-installation.html">Chapter 2: Installation</a></li>',
        "",
    ]
    index_page.write_text("\n".join(index_lines))
    for page in (intro_page, install_page):
        page.write_text("<html></html>\n")

    plan_path = temp_dir / "implementation.md"
    plan_lines = [
        "# Implementation Plan",
        "",
        "## File Changes",
        f"- `{site_dir}/`",
        f"- `{chapter_dir}/`",
        f"- `{index_page}`",
        f"- `{intro_page}`",
        f"- `{install_page}`",
        "",
    ]
    plan_path.write_text("\n".join(plan_lines))

    context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=reject_confidence,
        verify_action=reject_verification,
        auto_recover=False,
    )
    steering_log: list[str] = []
    context.queue_steering_message_callback = steering_log.append
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))
    dod = create_definition_of_done("Create an equally thorough nginx guide.")
    dod.implementation_plan = str(plan_path)
    dod.verification_commands = [f"ls -la {site_dir}"]
    dod.pending_items = ["Ensure all files are properly linked and formatted"]

    stale_question = ToolCall(
        id="ask-post-build-review",
        name="AskUserQuestion",
        arguments={
            "question": "Which specific aspects of the reference guide should I copy?",
            "context": "I already created the output files and want to ensure they match.",
        },
    )
    # No scripted outcomes: the question must be skipped before execution.
    executor = FakeExecutor([])

    turn_summary = TurnSummary(final_response="")
    batch_result = await runner.execute_batch(
        tool_calls=[stale_question],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=turn_summary,
        dod=dod,
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert batch_result.continue_after_batch is True
    assert executor.calls == []
    assert steering_log
    latest_nudge = steering_log[-1]
    for expected_fragment in (
        "The remaining work is review/verification of the generated files.",
        "Do not ask the user for more clarification about the reference pattern now.",
        "Finish with a final response now so Loader can run verification automatically.",
    ):
        assert expected_fragment in latest_nudge
    assert context.workflow_mode == "verify"
    assert turn_summary.tool_result_messages
    assert "Skipped - stale post-build user question" in turn_summary.tool_result_messages[-1].content
5092
5093
@pytest.mark.asyncio
async def test_tool_batch_runner_rewrites_stale_todowrite_summary_from_reconciled_dod(
    temp_dir: Path,
) -> None:
    """A stale TodoWrite must not leak its pending item into the tool summary.

    The guide root, chapters directory, index and five chapter files listed or
    implied by the plan already exist on disk, yet the TodoWrite call carries a
    single pending discovery todo. The batch is expected to halt with a
    hand-off final response, and the recorded tool result must not mention the
    stale todo.
    """

    async def assess_confidence(
        tool_name: str, tool_args: dict, context: str
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should not run for this scenario")

    async def verify_action(
        tool_name: str, tool_args: dict, result: str, expected: str = ""
    ) -> ActionVerification:
        raise AssertionError("Verification should not run for this scenario")

    def stale_todo() -> dict[str, str]:
        # Build a fresh dict on every call so the tool arguments and the
        # executor metadata never share the same object.
        return {
            "content": "First, examine the existing fortran guide structure and content to understand the format",
            "active_form": "Working on: First, examine the existing fortran guide structure and content to understand the format",
            "status": "pending",
        }

    # Lay out the finished guide tree on disk.
    nginx_root = temp_dir / "guides" / "nginx"
    chapters_dir = nginx_root / "chapters"
    nginx_root.mkdir(parents=True)
    chapters_dir.mkdir()
    index_file = nginx_root / "index.html"
    blank_page = "<html></html>\n"
    chapter_names = (
        "01-introduction.html",
        "02-installation.html",
        "03-basic-configuration.html",
        "04-advanced-usage.html",
        "05-troubleshooting.html",
    )
    for chapter_name in chapter_names:
        (chapters_dir / chapter_name).write_text(blank_page)
    index_file.write_text(blank_page)

    # Implementation plan that declares the guide tree as its file changes.
    plan_path = temp_dir / "implementation.md"
    plan_lines = [
        "# Implementation Plan",
        "",
        "## File Changes",
        f"- `{nginx_root}/`",
        f"- `{chapters_dir}/`",
        f"- `{index_file}`",
        "",
    ]
    plan_path.write_text("\n".join(plan_lines))

    context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))
    dod = create_definition_of_done("Create an equally thorough nginx guide.")
    dod.implementation_plan = str(plan_path)
    dod.verification_commands = [f"ls -la {nginx_root}"]

    tool_call = ToolCall(
        id="todo-stale-summary",
        name="TodoWrite",
        arguments={"todos": [stale_todo()]},
    )
    scripted_outcome = tool_outcome(
        tool_call=tool_call,
        output="Todos updated",
        is_error=False,
        metadata={"new_todos": [stale_todo()]},
    )
    executor = FakeExecutor([scripted_outcome])

    summary = TurnSummary(final_response="")
    batch_result = await runner.execute_batch(
        tool_calls=[tool_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=summary,
        dod=dod,
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    expected_final = (
        "Todo tracking is complete; running Loader verification on the generated files now."
    )
    assert batch_result.halted is True
    assert batch_result.final_response == expected_final
    assert summary.final_response == batch_result.final_response
    assert summary.tool_result_messages

    tool_message = summary.tool_result_messages[-1].content
    assert "updated todo list" in tool_message
    assert "final response should be provided next for Loader verification" in tool_message
    assert "next pending:" not in tool_message
    assert "fortran guide structure" not in tool_message.lower()
5215
5216
@pytest.mark.asyncio
async def test_tool_batch_runner_todowrite_drops_unplanned_expansion_after_outputs_exist(
    temp_dir: Path,
) -> None:
    """A TodoWrite that adds an unplanned file must not expand the artifact set.

    The index and both planned chapters exist on disk. The TodoWrite call adds
    a pending `08-troubleshooting.html` entry that the plan never declared. The
    queued steering message is expected to ask for a final response, leave the
    extra file unmentioned, and the workflow mode should become "verify".
    """

    async def assess_confidence(
        tool_name: str, tool_args: dict, context: str
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should not run for this scenario")

    async def verify_action(
        tool_name: str, tool_args: dict, result: str, expected: str = ""
    ) -> ActionVerification:
        raise AssertionError("Verification should not run for this scenario")

    # Guide tree with every planned artifact present.
    nginx_root = temp_dir / "guides" / "nginx"
    chapters_dir = nginx_root / "chapters"
    nginx_root.mkdir(parents=True)
    chapters_dir.mkdir()
    index_file = nginx_root / "index.html"
    intro_file = chapters_dir / "01-introduction.html"
    install_file = chapters_dir / "02-installation.html"
    index_lines = [
        '<a href="chapters/01-introduction.html">Intro</a>',
        '<a href="chapters/02-installation.html">Install</a>',
        '<a href="../index.html">Back</a>',
        "",
    ]
    index_file.write_text("\n".join(index_lines))
    intro_file.write_text("<html></html>\n")
    install_file.write_text("<html></html>\n")

    plan_path = temp_dir / "implementation.md"
    plan_lines = [
        "# Implementation Plan",
        "",
        "## File Changes",
        f"- `{nginx_root}/`",
        f"- `{chapters_dir}/`",
        f"- `{index_file}`",
        f"- `{intro_file}`",
        f"- `{install_file}`",
        "",
    ]
    plan_path.write_text("\n".join(plan_lines))

    context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )
    steering: list[str] = []
    context.queue_steering_message_callback = steering.append
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))
    dod = create_definition_of_done("Create a multi-file nginx guide.")
    dod.implementation_plan = str(plan_path)
    dod.verification_commands = [f"ls -la {nginx_root}"]

    # (content, active form, status) for each todo. The tool arguments use the
    # camelCase "activeForm" key while the executor metadata uses "active_form".
    todo_rows = (
        ("Create index.html for nginx guide", "Creating index.html", "in_progress"),
        (
            "Create chapter 01-introduction.html",
            "Creating chapter 01-introduction.html",
            "completed",
        ),
        (
            "Create chapter 02-installation.html",
            "Creating chapter 02-installation.html",
            "completed",
        ),
        (
            "Create chapter 08-troubleshooting.html",
            "Creating chapter 08-troubleshooting.html",
            "pending",
        ),
    )
    tool_call = ToolCall(
        id="todo-post-build-expansion",
        name="TodoWrite",
        arguments={
            "todos": [
                {"content": content, "activeForm": active, "status": status}
                for content, active, status in todo_rows
            ]
        },
    )
    scripted_outcome = tool_outcome(
        tool_call=tool_call,
        output="Todos updated",
        is_error=False,
        metadata={
            "new_todos": [
                {"content": content, "active_form": active, "status": status}
                for content, active, status in todo_rows
            ]
        },
    )
    executor = FakeExecutor([scripted_outcome])

    summary = TurnSummary(final_response="")
    await runner.execute_batch(
        tool_calls=[tool_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=summary,
        dod=dod,
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert steering
    nudge = steering[-1]
    assert "Todo tracking is updated. All explicitly planned artifacts now exist on disk." in nudge
    assert "Finish with a final response now so Loader can run verification automatically." in nudge
    assert "Repair or verify the current files instead of expanding the artifact set." not in nudge
    assert "08-troubleshooting.html" not in nudge
    assert context.workflow_mode == "verify"
5372
5373
@pytest.mark.asyncio
async def test_tool_batch_runner_todowrite_with_existing_output_roots_requeues_next_mutation(
    temp_dir: Path,
) -> None:
    """A TodoWrite with a pending chapter is steered to the next concrete write.

    The output directories and an index linking to `chapters/01-introduction.html`
    exist, but the chapter file does not. After a TodoWrite that leaves
    "Write the introduction chapter" pending, the queued steering message is
    expected to name `01-introduction.html` as the next file to create.
    """

    async def assess_confidence(
        tool_name: str, tool_args: dict, context: str
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should not run in this scenario")

    async def verify_action(
        tool_name: str, tool_args: dict, result: str, expected: str = ""
    ) -> ActionVerification:
        raise AssertionError("Verification should not run in this scenario")

    def fresh_todos() -> list[dict[str, str]]:
        # A new list of new dicts per call; the DoD sync, the tool arguments
        # and the executor metadata each receive independent copies.
        return [
            {
                "content": "Examine the existing Fortran guide structure",
                "active_form": "Examining the existing Fortran guide structure",
                "status": "completed",
            },
            {
                "content": "Create the nginx directory structure",
                "active_form": "Creating the nginx directory structure",
                "status": "completed",
            },
            {
                "content": "Write the introduction chapter",
                "active_form": "Writing the introduction chapter",
                "status": "pending",
            },
        ]

    # Output roots exist; only the index has been written so far.
    nginx_root = temp_dir / "guides" / "nginx"
    chapters_dir = nginx_root / "chapters"
    nginx_root.mkdir(parents=True)
    chapters_dir.mkdir()
    index_file = nginx_root / "index.html"
    index_lines = [
        "<!DOCTYPE html>",
        "<html>",
        "<body>",
        '<a href="chapters/01-introduction.html">Introduction</a>',
        "</body>",
        "</html>",
        "",
    ]
    index_file.write_text("\n".join(index_lines))

    plan_path = temp_dir / "implementation.md"
    plan_lines = [
        "# Implementation Plan",
        "",
        "## File Changes",
        f"- `{nginx_root}/`",
        f"- `{chapters_dir}/`",
        f"- `{index_file}`",
        "",
    ]
    plan_path.write_text("\n".join(plan_lines))

    context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )
    steering: list[str] = []
    context.queue_steering_message_callback = steering.append
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))
    dod = create_definition_of_done("Create a multi-file nginx guide.")
    dod.implementation_plan = str(plan_path)
    dod.touched_files.append(str(index_file))
    sync_todos_to_definition_of_done(dod, fresh_todos(), project_root=temp_dir)

    tool_call = ToolCall(
        id="todo-next-mutation",
        name="TodoWrite",
        arguments={"todos": fresh_todos()},
    )
    scripted_outcome = tool_outcome(
        tool_call=tool_call,
        output="Todos updated",
        is_error=False,
        metadata={"new_todos": fresh_todos()},
    )
    executor = FakeExecutor([scripted_outcome])

    summary = TurnSummary(final_response="")
    await runner.execute_batch(
        tool_calls=[tool_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=summary,
        dod=dod,
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert steering
    nudge = steering[-1]
    assert "Todo tracking is updated. Next step: create `01-introduction.html`." in nudge
    assert "Prefer one `write(file_path=..., content=...)` call" in nudge
    assert "Make your next response the concrete mutation tool call itself." in nudge
5535
5536
@pytest.mark.asyncio
async def test_tool_batch_runner_todowrite_prefers_pending_index_over_empty_output_directory(
    temp_dir: Path,
) -> None:
    """The pending index file is named as the next step before any chapter file.

    Only the empty chapters directory exists; the planned `index.html` has not
    been written. With both an index todo and a first-chapter todo pending, the
    queued steering message is expected to point at `index.html` (with its
    resolved path) and not mention `01-introduction.html`.
    """

    async def assess_confidence(
        tool_name: str, tool_args: dict, context: str
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should not run in this scenario")

    async def verify_action(
        tool_name: str, tool_args: dict, result: str, expected: str = ""
    ) -> ActionVerification:
        raise AssertionError("Verification should not run in this scenario")

    def build_todos() -> list[dict[str, str]]:
        # Returns a brand-new list each time it is called.
        return [
            {
                "content": "Examine the existing Fortran guide structure to understand the format and depth",
                "active_form": "Examining the existing Fortran guide structure",
                "status": "completed",
            },
            {
                "content": "Create the new nginx guide directory structure",
                "active_form": "Creating the new nginx guide directory structure",
                "status": "completed",
            },
            {
                "content": "Create a new index.html for the nginx guide",
                "active_form": "Creating a new index.html for the nginx guide",
                "status": "pending",
            },
            {
                "content": "Create the first chapter for the nginx guide",
                "active_form": "Creating the first chapter for the nginx guide",
                "status": "pending",
            },
        ]

    # Only the (empty) chapters directory is created; index.html stays missing.
    nginx_root = temp_dir / "Loader" / "guides" / "nginx"
    chapters_dir = nginx_root / "chapters"
    chapters_dir.mkdir(parents=True)
    index_file = nginx_root / "index.html"

    plan_path = temp_dir / "implementation.md"
    plan_lines = [
        "# Implementation Plan",
        "",
        "## File Changes",
        f"- `{chapters_dir}/`",
        f"- `{index_file}`",
        "",
    ]
    plan_path.write_text("\n".join(plan_lines))

    dod = create_definition_of_done("Create a multi-file nginx guide.")
    dod.implementation_plan = str(plan_path)
    sync_todos_to_definition_of_done(dod, build_todos(), project_root=temp_dir)

    steering: list[str] = []
    context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )
    context.queue_steering_message_callback = steering.append
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))

    # One list object is deliberately shared by the tool arguments and the
    # executor metadata.
    todos = build_todos()
    tool_call = ToolCall(
        id="todo-index-before-chapter",
        name="TodoWrite",
        arguments={"todos": todos},
    )
    scripted_outcome = tool_outcome(
        tool_call=tool_call,
        output="Todos updated",
        is_error=False,
        metadata={"new_todos": todos},
    )
    executor = FakeExecutor([scripted_outcome])

    summary = TurnSummary(final_response="")
    await runner.execute_batch(
        tool_calls=[tool_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=summary,
        dod=dod,
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert steering
    nudge = steering[-1]
    resolved_index = index_file.resolve(strict=False)
    assert "Todo tracking is updated. Next step: create `index.html`." in nudge
    assert f"Prefer one `write(file_path=..., content=...)` call for `{resolved_index}`" in nudge
    assert "01-introduction.html" not in nudge
5673
5674
@pytest.mark.asyncio
async def test_tool_batch_runner_todowrite_with_declared_child_targets_names_next_missing_file(
    temp_dir: Path,
) -> None:
    """The next missing file is taken from links declared in the existing index.

    The index links to `chapters/introduction.html` and
    `chapters/installation.html`, neither of which exists. With
    "Write the introduction chapter" pending, the queued steering message is
    expected to name `introduction.html` as the file to create next.
    """

    async def assess_confidence(
        tool_name: str, tool_args: dict, context: str
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should not run in this scenario")

    async def verify_action(
        tool_name: str, tool_args: dict, result: str, expected: str = ""
    ) -> ActionVerification:
        raise AssertionError("Verification should not run in this scenario")

    nginx_root = temp_dir / "guides" / "nginx"
    chapters_dir = nginx_root / "chapters"
    nginx_root.mkdir(parents=True)
    chapters_dir.mkdir()
    index_file = nginx_root / "index.html"
    index_lines = (
        "<html>",
        '<a href="chapters/introduction.html">Introduction</a>',
        '<a href="chapters/installation.html">Installation</a>',
        "</html>",
    )
    # Every line, including the last, is newline-terminated.
    index_file.write_text("".join(f"{line}\n" for line in index_lines))

    plan_path = temp_dir / "implementation.md"
    plan_lines = [
        "# Implementation Plan",
        "",
        "## File Changes",
        f"- `{nginx_root}/`",
        f"- `{chapters_dir}/`",
        f"- `{index_file}`",
        "",
    ]
    plan_path.write_text("\n".join(plan_lines))

    dod = create_definition_of_done("Create a multi-file nginx guide.")
    dod.implementation_plan = str(plan_path)
    dod.pending_items = [
        "Write the introduction chapter",
        "Complete the requested work",
    ]
    dod.touched_files.append(str(index_file))

    steering: list[str] = []
    context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )
    context.queue_steering_message_callback = steering.append
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))

    # The tool arguments use "activeForm"; the executor metadata uses "active_form".
    requested_todo = {
        "content": "Write the introduction chapter",
        "activeForm": "Writing the introduction chapter",
        "status": "pending",
    }
    recorded_todo = {
        "content": "Write the introduction chapter",
        "active_form": "Writing the introduction chapter",
        "status": "pending",
    }
    tool_call = ToolCall(
        id="todo-1",
        name="TodoWrite",
        arguments={"todos": [requested_todo]},
    )
    scripted_outcome = tool_outcome(
        tool_call=tool_call,
        output="Todos updated",
        is_error=False,
        metadata={"new_todos": [recorded_todo]},
    )
    executor = FakeExecutor([scripted_outcome])

    summary = TurnSummary(final_response="")
    await runner.execute_batch(
        tool_calls=[tool_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=summary,
        dod=dod,
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert steering
    nudge = steering[-1]
    assert "Todo tracking is updated. Next step: create `introduction.html`." in nudge
    assert "Prefer one `write(file_path=..., content=...)` call" in nudge
    assert "Make your next response the concrete mutation tool call itself." in nudge
5798
5799
@pytest.mark.asyncio
async def test_tool_batch_runner_todowrite_names_concrete_pending_file_after_artifacts_exist(
    temp_dir: Path,
) -> None:
    """A pending chapter todo is resolved to the concrete file name still missing.

    The index and the first chapter exist; the index also links to
    `chapters/02-installation.html`, which does not exist. With
    "Creating Chapter 2: Installation and Setup" pending, the queued steering
    message is expected to name `02-installation.html` as the next file.
    """

    async def assess_confidence(
        tool_name: str, tool_args: dict, context: str
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should not run in this scenario")

    async def verify_action(
        tool_name: str, tool_args: dict, result: str, expected: str = ""
    ) -> ActionVerification:
        raise AssertionError("Verification should not run in this scenario")

    nginx_root = temp_dir / "guides" / "nginx"
    chapters_dir = nginx_root / "chapters"
    nginx_root.mkdir(parents=True)
    chapters_dir.mkdir()
    index_file = nginx_root / "index.html"
    first_chapter = chapters_dir / "01-introduction.html"
    index_lines = (
        "<html>",
        '<a href="chapters/01-introduction.html">Chapter 1: Introduction to NGINX Tool</a>',
        '<a href="chapters/02-installation.html">Chapter 2: Installation and Setup</a>',
        "</html>",
    )
    # Every line, including the last, is newline-terminated.
    index_file.write_text("".join(f"{line}\n" for line in index_lines))
    first_chapter.write_text("<html></html>\n")

    plan_path = temp_dir / "implementation.md"
    plan_lines = [
        "# Implementation Plan",
        "",
        "## File Changes",
        f"- `{nginx_root}/`",
        f"- `{chapters_dir}/`",
        f"- `{index_file}`",
        "",
    ]
    plan_path.write_text("\n".join(plan_lines))

    dod = create_definition_of_done("Create a multi-file nginx guide.")
    dod.implementation_plan = str(plan_path)
    dod.pending_items = [
        "Creating Chapter 2: Installation and Setup",
        "Complete the requested work",
    ]
    dod.touched_files.extend([str(index_file), str(first_chapter)])

    steering: list[str] = []
    context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )
    context.queue_steering_message_callback = steering.append
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))

    # The tool arguments use "activeForm"; the executor metadata uses "active_form".
    requested_todo = {
        "content": "Creating Chapter 2: Installation and Setup",
        "activeForm": "Creating Chapter 2: Installation and Setup",
        "status": "pending",
    }
    recorded_todo = {
        "content": "Creating Chapter 2: Installation and Setup",
        "active_form": "Creating Chapter 2: Installation and Setup",
        "status": "pending",
    }
    tool_call = ToolCall(
        id="todo-1",
        name="TodoWrite",
        arguments={"todos": [requested_todo]},
    )
    scripted_outcome = tool_outcome(
        tool_call=tool_call,
        output="Todos updated",
        is_error=False,
        metadata={"new_todos": [recorded_todo]},
    )
    executor = FakeExecutor([scripted_outcome])

    summary = TurnSummary(final_response="")
    await runner.execute_batch(
        tool_calls=[tool_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=summary,
        dod=dod,
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert steering
    nudge = steering[-1]
    assert "Todo tracking is updated. Next step: create `02-installation.html`." in nudge
    assert "Prefer one `write(file_path=..., content=...)` call" in nudge
    assert "Make your next response the concrete mutation tool call itself" in nudge
5925
5926
@pytest.mark.asyncio
async def test_tool_batch_runner_todowrite_uses_observed_sibling_pattern_for_next_file(
    temp_dir: Path,
) -> None:
    """The next file name follows a reference file the assistant already read.

    The session history contains a `read` call on
    `fortran/chapters/01-introduction.html`. The nginx index exists but links
    to no chapters. With "Write the introduction chapter" pending, the queued
    steering message is expected to name `01-introduction.html`.
    """

    async def assess_confidence(
        tool_name: str, tool_args: dict, context: str
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should not run in this scenario")

    async def verify_action(
        tool_name: str, tool_args: dict, result: str, expected: str = ""
    ) -> ActionVerification:
        raise AssertionError("Verification should not run in this scenario")

    # Reference guide that was read earlier in the session.
    reference_dir = temp_dir / "fortran" / "chapters"
    reference_dir.mkdir(parents=True)
    reference_file = reference_dir / "01-introduction.html"
    reference_file.write_text("<h1>Introduction</h1>\n")

    # Target guide: directories and a bare index only.
    nginx_root = temp_dir / "guides" / "nginx"
    chapters_dir = nginx_root / "chapters"
    nginx_root.mkdir(parents=True)
    chapters_dir.mkdir()
    index_file = nginx_root / "index.html"
    index_file.write_text("<html></html>\n")

    plan_path = temp_dir / "implementation.md"
    plan_lines = [
        "# Implementation Plan",
        "",
        "## File Changes",
        f"- `{nginx_root}/`",
        f"- `{chapters_dir}/`",
        f"- `{index_file}`",
        "",
    ]
    plan_path.write_text("\n".join(plan_lines))

    dod = create_definition_of_done("Create a multi-file nginx guide.")
    dod.implementation_plan = str(plan_path)
    dod.pending_items = [
        "Write the introduction chapter",
        "Complete the requested work",
    ]
    dod.touched_files.append(str(index_file))

    steering: list[str] = []
    prior_read = Message(
        role=Role.ASSISTANT,
        content="",
        tool_calls=[
            ToolCall(
                id="read-ref-1",
                name="read",
                arguments={"file_path": str(reference_file)},
            )
        ],
    )
    context = build_context(
        temp_dir=temp_dir,
        messages=[prior_read],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )
    context.queue_steering_message_callback = steering.append
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))

    # The tool arguments use "activeForm"; the executor metadata uses "active_form".
    requested_todo = {
        "content": "Write the introduction chapter",
        "activeForm": "Writing the introduction chapter",
        "status": "pending",
    }
    recorded_todo = {
        "content": "Write the introduction chapter",
        "active_form": "Writing the introduction chapter",
        "status": "pending",
    }
    tool_call = ToolCall(
        id="todo-observed-1",
        name="TodoWrite",
        arguments={"todos": [requested_todo]},
    )
    scripted_outcome = tool_outcome(
        tool_call=tool_call,
        output="Todos updated",
        is_error=False,
        metadata={"new_todos": [recorded_todo]},
    )
    executor = FakeExecutor([scripted_outcome])

    summary = TurnSummary(final_response="")
    await runner.execute_batch(
        tool_calls=[tool_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=summary,
        dod=dod,
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert steering
    nudge = steering[-1]
    assert "Todo tracking is updated. Next step: create `01-introduction.html`." in nudge
    assert "Prefer one `write(file_path=..., content=...)` call" in nudge
6055
6056
@pytest.mark.asyncio
async def test_tool_batch_runner_bookkeeping_note_with_missing_artifact_requeues_resume_step(
    temp_dir: Path,
) -> None:
    """A working note is followed by a nudge to create the missing planned file.

    The plan declares two chapters but only `01-getting-started.html` is on
    disk. After a `notepad_write_working` call, the queued steering message is
    expected to tell the model to resume by creating `02-installation.html`.
    """

    async def assess_confidence(
        tool_name: str, tool_args: dict, context: str
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should not run in this scenario")

    async def verify_action(
        tool_name: str, tool_args: dict, result: str, expected: str = ""
    ) -> ActionVerification:
        raise AssertionError("Verification should not run in this scenario")

    nginx_root = temp_dir / "guides" / "nginx"
    chapters_dir = nginx_root / "chapters"
    nginx_root.mkdir(parents=True)
    chapters_dir.mkdir()
    index_file = nginx_root / "index.html"
    written_chapter = chapters_dir / "01-getting-started.html"
    # Declared in the plan but intentionally never written.
    missing_chapter = chapters_dir / "02-installation.html"
    index_file.write_text("<html></html>\n")
    written_chapter.write_text("<h1>One</h1>\n")

    plan_path = temp_dir / "implementation.md"
    plan_lines = [
        "# Implementation Plan",
        "",
        "## File Changes",
        f"- `{nginx_root}/`",
        f"- `{chapters_dir}/`",
        f"- `{index_file}`",
        f"- `{written_chapter}`",
        f"- `{missing_chapter}`",
        "",
    ]
    plan_path.write_text("\n".join(plan_lines))

    context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )
    steering: list[str] = []
    context.queue_steering_message_callback = steering.append
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))
    dod = create_definition_of_done("Create a multi-file nginx guide.")
    dod.implementation_plan = str(plan_path)
    tracked_todos = [
        {
            "content": "Create 01-getting-started.html",
            "active_form": "Creating 01-getting-started.html",
            "status": "completed",
        },
        {
            "content": "Create 02-installation.html",
            "active_form": "Creating 02-installation.html",
            "status": "pending",
        },
    ]
    sync_todos_to_definition_of_done(dod, tracked_todos, project_root=temp_dir)
    dod.touched_files.extend([str(index_file), str(written_chapter)])

    tool_call = ToolCall(
        id="working-note",
        name="notepad_write_working",
        arguments={"content": "Creating the second chapter file: Installation"},
    )
    scripted_outcome = tool_outcome(
        tool_call=tool_call,
        output="Working note recorded",
        is_error=False,
    )
    executor = FakeExecutor([scripted_outcome])

    summary = TurnSummary(final_response="")
    await runner.execute_batch(
        tool_calls=[tool_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=summary,
        dod=dod,
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert steering
    nudge = steering[-1]
    assert "Bookkeeping note is recorded. A declared output artifact is still missing." in nudge
    assert "Resume by creating `02-installation.html` now." in nudge
    assert "Make your next response the concrete mutation tool call itself" in nudge
    assert "refresh `TodoWrite`" in nudge
    assert "Do not spend the next turn on additional notes, rediscovery, verification, or final confirmation" in nudge
6171
6172
@pytest.mark.asyncio
async def test_tool_batch_runner_working_note_respects_discovery_first_pending_step(
    temp_dir: Path,
) -> None:
    """A working note does not skip a discovery step that is still first in line.

    Nothing has been created on disk and the first pending item is a discovery
    task. After a `notepad_write_working` call, the queued steering message is
    expected to continue with that discovery item and not tell the model to
    create `index.html`.
    """

    async def assess_confidence(
        tool_name: str, tool_args: dict, context: str
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def verify_action(
        tool_name: str, tool_args: dict, result: str, expected: str = ""
    ) -> ActionVerification:
        raise AssertionError("Verification should not run in this scenario")

    # The plan names the outputs, but none of them are created in this test.
    nginx_root = temp_dir / "guides" / "nginx"
    plan_path = temp_dir / "implementation.md"
    plan_lines = [
        "# Implementation Plan",
        "",
        "## File Changes",
        f"- `{nginx_root / 'index.html'}`",
        f"- `{nginx_root / 'chapters'}`",
        "",
    ]
    plan_path.write_text("\n".join(plan_lines))

    context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )
    steering: list[str] = []
    context.queue_steering_message_callback = steering.append
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))
    dod = create_definition_of_done("Create a multi-file nginx guide.")
    dod.implementation_plan = str(plan_path)
    discovery_item = (
        "First, examine the existing fortran guide structure and content to understand the format"
    )
    dod.pending_items.extend(
        [
            discovery_item,
            "Create the nginx directory structure",
            "Develop the main index.html file for the nginx guide",
        ]
    )

    tool_call = ToolCall(
        id="working-note",
        name="notepad_write_working",
        arguments={"content": "Analyzing the fortran guide structure before creating nginx guide"},
    )
    scripted_outcome = tool_outcome(
        tool_call=tool_call,
        output="Working note recorded",
        is_error=False,
    )
    executor = FakeExecutor([scripted_outcome])

    summary = TurnSummary(final_response="")
    await runner.execute_batch(
        tool_calls=[tool_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=summary,
        dod=dod,
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert steering
    nudge = steering[-1]
    expected_continue = (
        "Continue with the next pending item: `First, examine the existing fortran guide structure and content to understand the format`."
    )
    assert expected_continue in nudge
    assert "one concrete evidence-gathering tool call" in nudge
    assert "Resume by creating `index.html` now." not in nudge
6265
6266
@pytest.mark.asyncio
async def test_tool_batch_runner_working_note_prefers_declared_output_gap_over_stale_discovery(
    temp_dir: Path,
) -> None:
    """A working-note call steers toward the missing linked chapter file.

    The index on disk links three chapters but only the first one exists, so
    the queued steering message is expected to name `02-installation.html`
    instead of the discovery item at the head of the pending list.
    """

    async def forbid_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def forbid_verification(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run in this scenario")

    # On-disk state: index with three chapter links, only chapter one written.
    nginx_dir = temp_dir / "guides" / "nginx"
    chapter_dir = nginx_dir / "chapters"
    chapter_dir.mkdir(parents=True)
    index_file = nginx_dir / "index.html"
    intro_chapter = chapter_dir / "01-introduction.html"
    index_links = [
        '<a href="chapters/01-introduction.html">Introduction</a>',
        '<a href="chapters/02-installation.html">Installation</a>',
        '<a href="chapters/03-configuration.html">Configuration</a>',
    ]
    index_file.write_text("\n".join(index_links))
    intro_chapter.write_text("<h1>Introduction</h1>\n")

    plan_file = temp_dir / "implementation.md"
    plan_lines = [
        "# Implementation Plan",
        "",
        "## File Changes",
        f"- `{nginx_dir / 'index.html'}`",
        f"- `{chapter_dir}/`",
        "",
    ]
    plan_file.write_text("\n".join(plan_lines))

    runtime_context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=forbid_confidence,
        verify_action=forbid_verification,
        auto_recover=False,
    )
    steering: list[str] = []
    runtime_context.queue_steering_message_callback = steering.append
    runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))

    dod = create_definition_of_done("Create a multi-file nginx guide.")
    dod.implementation_plan = str(plan_file)
    dod.pending_items.extend(
        [
            "First, examine the existing fortran guide structure and content to understand the format",
            "Create chapter files following the established pattern",
        ]
    )
    dod.touched_files.extend([str(index_file), str(intro_chapter)])

    note_call = ToolCall(
        id="working-note",
        name="notepad_write_working",
        arguments={"content": "Created index and first chapter; next is chapter 2"},
    )
    scripted_executor = FakeExecutor(
        [
            tool_outcome(
                tool_call=note_call,
                output="Working note recorded",
                is_error=False,
            )
        ]
    )
    turn_summary = TurnSummary(final_response="")

    await runner.execute_batch(
        tool_calls=[note_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=turn_summary,
        dod=dod,
        executor=scripted_executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert steering
    latest = steering[-1]
    assert "Bookkeeping note is recorded. A declared output artifact is still missing." in latest
    assert "Resume by creating `02-installation.html` now." in latest
    assert "Continue with the next pending item: `First, examine the existing fortran guide structure" not in latest
6372
6373
@pytest.mark.asyncio
async def test_tool_batch_runner_shallow_glob_does_not_handoff_before_content_read(
    temp_dir: Path,
) -> None:
    """A glob that only lists directories must not queue a steering message.

    The scripted glob output contains just the fortran root and its chapters
    directory, and the test expects the steering queue to stay empty.
    """

    async def forbid_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def forbid_verification(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run in this scenario")

    guides_dir = temp_dir / "Loader" / "guides"
    fortran_root = guides_dir / "fortran"
    fortran_chapters = fortran_root / "chapters"
    fortran_chapters.mkdir(parents=True)

    # The plan declares nginx outputs that do not exist yet.
    nginx_root = guides_dir / "nginx"
    plan_file = temp_dir / "implementation.md"
    plan_lines = [
        "# Implementation Plan",
        "",
        "## File Changes",
        f"- `{nginx_root / 'index.html'}`",
        f"- `{nginx_root / 'chapters'}`",
        "",
    ]
    plan_file.write_text("\n".join(plan_lines))

    runtime_context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=forbid_confidence,
        verify_action=forbid_verification,
        auto_recover=False,
    )
    steering: list[str] = []
    runtime_context.queue_steering_message_callback = steering.append
    runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))

    dod = create_definition_of_done("Create a multi-file nginx guide.")
    dod.implementation_plan = str(plan_file)
    dod.pending_items.extend(
        [
            "First, examine the existing fortran guide structure and content",
            "Create the nginx directory structure",
            "Develop the main index.html file for nginx guide",
        ]
    )

    glob_call = ToolCall(
        id="glob-1",
        name="glob",
        arguments={"pattern": "**", "path": str(fortran_root)},
    )
    listing = f"{fortran_root}\n{fortran_chapters}"
    scripted_executor = FakeExecutor(
        [tool_outcome(tool_call=glob_call, output=listing, is_error=False)]
    )
    turn_summary = TurnSummary(final_response="")

    await runner.execute_batch(
        tool_calls=[glob_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=turn_summary,
        dod=dod,
        executor=scripted_executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert steering == []
6463
6464
@pytest.mark.asyncio
async def test_tool_batch_runner_hands_off_noop_toc_edit_when_file_is_already_valid(
    temp_dir: Path,
) -> None:
    """A blocked identical-string edit on an already-correct TOC queues nothing.

    The index's table of contents already points at the two chapter files and
    repeats their headings, and the scripted edit outcome is BLOCKED because
    old and new strings are identical. No steering message is expected.
    """

    async def forbid_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should not run in this scenario")

    async def forbid_verification(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run in this scenario")

    task_prompt = (
        "Have a look at ~/Loader/guides/fortran/index.html, then "
        "~/Loader/guides/fortran/chapters. The table of contents links in "
        "index.html are inaccurate and the href’s are wrong. Let’s update the "
        "links and their link texts to be correct."
    )

    chapter_dir = temp_dir / "chapters"
    chapter_dir.mkdir()
    (chapter_dir / "01-introduction.html").write_text(
        "<h1>Chapter 1: Introduction to Fortran</h1>\n"
    )
    (chapter_dir / "02-setup.html").write_text(
        "<h1>Chapter 2: Setting Up Your Environment</h1>\n"
    )

    toc_block = (
        "<h2>Table of Contents</h2>\n"
        ' <ul class="chapter-list">\n'
        ' <li><a href="chapters/01-introduction.html">Chapter 1: Introduction to Fortran</a></li>\n'
        ' <li><a href="chapters/02-setup.html">Chapter 2: Setting Up Your Environment</a></li>\n'
        " </ul>\n"
    )
    index_file = temp_dir / "index.html"
    index_file.write_text(toc_block)

    runtime_context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=forbid_confidence,
        verify_action=forbid_verification,
        auto_recover=False,
    )
    runtime_context.session.current_task = task_prompt  # type: ignore[attr-defined]
    steering: list[str] = []
    runtime_context.queue_steering_message_callback = steering.append
    runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))

    # The edit replaces the block with itself, so the executor reports a block.
    edit_call = ToolCall(
        id="edit-1",
        name="edit",
        arguments={
            "file_path": str(index_file),
            "old_string": toc_block,
            "new_string": toc_block,
        },
    )
    blocked_output = (
        "[Blocked - old_string and new_string are identical - no change "
        "would occur] Suggestion: Provide different old and new strings"
    )
    scripted_executor = FakeExecutor(
        [
            tool_outcome(
                tool_call=edit_call,
                output=blocked_output,
                is_error=True,
                state=ToolExecutionState.BLOCKED,
            )
        ]
    )

    await runner.execute_batch(
        tool_calls=[edit_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=TurnSummary(final_response=""),
        dod=create_definition_of_done(task_prompt),
        executor=scripted_executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert steering == []
6558
6559
def test_tool_batch_runner_blocked_noop_edit_nudge_stays_on_active_repair_target(
    temp_dir: Path,
) -> None:
    """The blocked no-op edit nudge keeps pointing at the repair-focus file.

    The session holds an assistant "Repair focus" message naming the chapter
    to edit; the nudge queued for an identical-string edit must mention that
    same path and tell the model to replace the surrounding block.
    """

    async def forbid_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def forbid_verification(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run in this scenario")

    chapter_dir = temp_dir / "guide" / "chapters"
    repair_target = chapter_dir / "04-basic-usage.html"
    missing_chapter = chapter_dir / "05-advanced-topics.html"
    repair_brief = (
        "Repair focus:\n"
        f"- Fix the broken local reference `05-advanced-topics.html` in `{repair_target}`.\n"
        f"- Immediate next step: edit `{repair_target}`.\n"
        f"- If the broken reference should remain, create `{missing_chapter}`; otherwise remove or replace `05-advanced-topics.html`.\n"
    )

    runtime_context = build_context(
        temp_dir=temp_dir,
        messages=[Message(role=Role.ASSISTANT, content=repair_brief)],
        safeguards=FakeSafeguards(),
        assess_confidence=forbid_confidence,
        verify_action=forbid_verification,
    )
    queued: list[str] = []
    runtime_context.queue_steering_message_callback = queued.append
    runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))
    dod = create_definition_of_done("Repair a guide page.")

    noop_edit = ToolCall(
        id="edit-1",
        name="edit",
        arguments={
            "file_path": str(repair_target),
            "old_string": "same",
            "new_string": "same",
        },
    )
    blocked_output = "[Blocked - old_string and new_string are identical - no change would occur] Suggestion: Provide different old and new strings"

    # Call the private nudge helper directly instead of running a full batch.
    runner._queue_blocked_html_edit_nudge(noop_edit, blocked_output, dod=dod)

    assert queued
    nudge = queued[0]
    assert str(repair_target) in nudge
    assert "no on-disk change" in nudge
    assert "replace the surrounding block" in nudge
    assert "Do not reopen unrelated reference materials" in nudge
6620
6621
def test_tool_batch_runner_blocked_noop_edit_after_full_build_prefers_verification(
    temp_dir: Path,
) -> None:
    """With every planned file present, the no-op edit nudge asks to finish.

    Both files listed in the implementation plan exist on disk and the repair
    focus only asks to confirm the final state, so the queued nudge should
    tell the model to give a final response rather than retry the edit.
    """

    async def forbid_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def forbid_verification(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run in this scenario")

    guide_root = temp_dir / "guide"
    chapter_dir = guide_root / "chapters"
    chapter_dir.mkdir(parents=True)
    index_path = guide_root / "index.html"
    chapter_one = chapter_dir / "01-introduction.html"
    planned_files = [index_path, chapter_one]
    for planned in planned_files:
        planned.write_text("<html></html>\n")

    plan_file = temp_dir / "implementation.md"
    plan_lines = [
        "# Implementation Plan",
        "",
        "## File Changes",
        f"- `{index_path}`",
        f"- `{chapter_one}`",
        "",
    ]
    plan_file.write_text("\n".join(plan_lines))

    repair_brief = (
        "Repair focus:\n"
        f"- Confirm the final guide state in `{index_path}`.\n"
        f"- Immediate next step: verify `{index_path}` if no concrete mismatch remains.\n"
    )
    runtime_context = build_context(
        temp_dir=temp_dir,
        messages=[Message(role=Role.ASSISTANT, content=repair_brief)],
        safeguards=FakeSafeguards(),
        assess_confidence=forbid_confidence,
        verify_action=forbid_verification,
    )
    queued: list[str] = []
    runtime_context.queue_steering_message_callback = queued.append
    runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))

    dod = create_definition_of_done("Create a multi-file guide.")
    dod.implementation_plan = str(plan_file)
    dod.touched_files.extend([str(index_path), str(chapter_one)])
    dod.verification_commands = [f"ls -la {guide_root}"]

    noop_edit = ToolCall(
        id="edit-1",
        name="edit",
        arguments={
            "file_path": str(index_path),
            "old_string": "same",
            "new_string": "same",
        },
    )
    blocked_output = "[Blocked - old_string and new_string are identical - no change would occur] Suggestion: Provide different old and new strings"

    runner._queue_blocked_html_edit_nudge(noop_edit, blocked_output, dod=dod)

    assert queued
    nudge = queued[0]
    assert "All explicitly planned artifacts already exist." in nudge
    assert "Finish with a final response now so Loader can run verification automatically." in nudge
    assert "replace the surrounding block" not in nudge
6705
6706
def test_tool_batch_runner_blocked_noop_edit_keeps_quality_repair_active_after_full_build(
    temp_dir: Path,
) -> None:
    """A pending content-quality repair outranks the "finish now" hand-off.

    All planned files exist, but the user-role repair focus still lists thin
    content for chapter two. The nudge must restate that repair focus and must
    not tell the model to finish.
    """

    async def forbid_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def forbid_verification(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run in this scenario")

    guide_root = temp_dir / "guide"
    chapter_dir = guide_root / "chapters"
    chapter_dir.mkdir(parents=True)
    index_path = guide_root / "index.html"
    chapter_one = chapter_dir / "01-introduction.html"
    chapter_two = chapter_dir / "02-installation.html"
    planned_files = [index_path, chapter_one, chapter_two]
    for planned in planned_files:
        planned.write_text("<html></html>\n")

    plan_file = temp_dir / "implementation.md"
    plan_lines = [
        "# Implementation Plan",
        "",
        "## File Changes",
        f"- `{index_path}`",
        f"- `{chapter_one}`",
        f"- `{chapter_two}`",
        "",
    ]
    plan_file.write_text("\n".join(plan_lines))

    repair_brief = (
        "Repair focus:\n"
        f"- Improve `{chapter_two}`: thin content (504 text chars, expected at least 1758).\n"
        f"- Improve `{chapter_two}`: insufficient structured content (6 blocks, expected at least 18).\n"
        f"- Immediate next step: edit `{chapter_two}`.\n"
    )
    runtime_context = build_context(
        temp_dir=temp_dir,
        messages=[Message(role=Role.USER, content=repair_brief)],
        safeguards=FakeSafeguards(),
        assess_confidence=forbid_confidence,
        verify_action=forbid_verification,
    )
    queued: list[str] = []
    runtime_context.queue_steering_message_callback = queued.append
    runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))

    dod = create_definition_of_done("Create a multi-file guide.")
    dod.implementation_plan = str(plan_file)
    dod.touched_files.extend([str(index_path), str(chapter_one), str(chapter_two)])
    dod.verification_commands = [f"ls -la {guide_root}"]

    noop_edit = ToolCall(
        id="edit-1",
        name="edit",
        arguments={
            "file_path": str(chapter_two),
            "old_string": "same",
            "new_string": "same",
        },
    )
    blocked_output = "[Blocked - old_string and new_string are identical - no change would occur] Suggestion: Provide different old and new strings"

    runner._queue_blocked_html_edit_nudge(noop_edit, blocked_output, dod=dod)

    assert queued
    nudge = queued[0]
    assert "active content-quality repair is not complete" in nudge
    assert "Repair focus:" in nudge
    assert f"Immediate next step: edit `{chapter_two}`" in nudge
    assert "thin content" in nudge
    assert "TodoWrite cannot satisfy" not in nudge
    assert "Finish with a final response now" not in nudge
6797
6798
async def _noop_emit(event: AgentEvent) -> None:
    """Discard *event*; used where a test needs an emit callback but ignores events."""
6801
6802
@pytest.mark.asyncio
async def test_tool_batch_runner_marks_verification_planned_after_new_mutation(
    temp_dir: Path,
) -> None:
    """A successful write marks verification as planned on the DoD and timeline.

    After one scripted `write` call the definition of done should carry a
    "planned" verification result, the first attempt id and number, and the
    last timeline entry should record a matching planned observation.
    """

    async def forbid_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def forbid_verification(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run for this scenario")

    runtime_context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=forbid_confidence,
        verify_action=forbid_verification,
    )
    runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))

    readme_path = temp_dir / "README.md"
    write_call = ToolCall(
        id="write-1",
        name="write",
        arguments={"file_path": str(readme_path), "content": "updated\n"},
    )
    scripted_executor = FakeExecutor(
        [tool_outcome(tool_call=write_call, output="wrote file", is_error=False)]
    )
    summary = TurnSummary(final_response="")
    dod = create_definition_of_done("Update README and verify it still works.")
    captured_events: list[AgentEvent] = []

    async def record_event(event: AgentEvent) -> None:
        captured_events.append(event)

    await runner.execute_batch(
        tool_calls=[write_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=record_event,
        summary=summary,
        dod=dod,
        executor=scripted_executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    # Definition-of-done bookkeeping.
    assert dod.last_verification_result == "planned"
    assert dod.verification_commands
    assert "Collect verification evidence" in dod.pending_items
    assert dod.active_verification_attempt_id == "verification-attempt-1"
    assert dod.active_verification_attempt_number == 1

    # Timeline entry recorded for the planned verification.
    last_entry = summary.workflow_timeline[-1]
    assert last_entry.reason_code == "verification_planned"
    assert last_entry.policy_outcome == "planned"
    observation = last_entry.verification_observations[0]
    assert observation.status == "planned"
    assert observation.attempt_id == "verification-attempt-1"
    assert observation.attempt_number == 1
6874
6875
@pytest.mark.asyncio
async def test_tool_batch_runner_does_not_mark_verification_planned_after_setup_only_mkdir(
    temp_dir: Path,
) -> None:
    """A `mkdir -p` of a planned directory must not plan verification.

    The only tool call is a bash `mkdir -p` for the chapters directory named
    in the plan; afterwards the DoD has no verification result and no timeline
    entry carries the `verification_planned` reason code.
    """

    async def forbid_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def forbid_verification(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run in this scenario")

    runtime_context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=forbid_confidence,
        verify_action=forbid_verification,
    )
    runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))

    # Neither directory is created here; only the plan file is written.
    nginx_root = temp_dir / "Loader" / "guides" / "nginx"
    chapter_dir = nginx_root / "chapters"
    plan_file = temp_dir / "implementation.md"
    plan_lines = [
        "# Implementation Plan",
        "",
        "## File Changes",
        f"- `{chapter_dir}/`",
        f"- `{nginx_root / 'index.html'}`",
        "",
    ]
    plan_file.write_text("\n".join(plan_lines))

    mkdir_call = ToolCall(
        id="mkdir-1",
        name="bash",
        arguments={"command": f"mkdir -p {chapter_dir}"},
    )
    scripted_executor = FakeExecutor(
        [tool_outcome(tool_call=mkdir_call, output="", is_error=False)]
    )
    summary = TurnSummary(final_response="")
    dod = create_definition_of_done("Create an equally thorough nginx guide with chapters.")
    dod.implementation_plan = str(plan_file)
    captured_events: list[AgentEvent] = []

    async def record_event(event: AgentEvent) -> None:
        captured_events.append(event)

    await runner.execute_batch(
        tool_calls=[mkdir_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=record_event,
        summary=summary,
        dod=dod,
        executor=scripted_executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert dod.last_verification_result is None
    assert "Collect verification evidence" not in dod.pending_items
    assert not any(
        entry.reason_code == "verification_planned" for entry in summary.workflow_timeline
    )
6954
6955
@pytest.mark.asyncio
async def test_tool_batch_runner_does_not_mark_verification_planned_while_chapter_build_pending(
    temp_dir: Path,
) -> None:
    """Writing the index while a chapter item is pending does not plan verification.

    After the scripted index write the DoD still has no verification result,
    the "Create first nginx chapter" item remains pending, and no timeline
    entry carries the `verification_planned` reason code.
    """

    async def forbid_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def forbid_verification(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run in this scenario")

    runtime_context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=forbid_confidence,
        verify_action=forbid_verification,
    )
    runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))

    nginx_root = temp_dir / "Loader" / "guides" / "nginx"
    chapter_dir = nginx_root / "chapters"
    chapter_dir.mkdir(parents=True)
    index_path = nginx_root / "index.html"
    plan_file = temp_dir / "implementation.md"
    plan_lines = [
        "# Implementation Plan",
        "",
        "## File Changes",
        f"- `{nginx_root}/`",
        f"- `{chapter_dir}/`",
        f"- `{index_path}`",
        "",
    ]
    plan_file.write_text("\n".join(plan_lines))

    index_write = ToolCall(
        id="write-index",
        name="write",
        arguments={"file_path": str(index_path), "content": "<html></html>\n"},
    )
    scripted_executor = FakeExecutor(
        [tool_outcome(tool_call=index_write, output="wrote file", is_error=False)]
    )
    summary = TurnSummary(final_response="")
    dod = create_definition_of_done("Create a multi-file nginx guide.")
    dod.implementation_plan = str(plan_file)
    dod.pending_items.extend(
        [
            "Develop the main index.html file with proper structure",
            "Create first nginx chapter",
        ]
    )
    captured_events: list[AgentEvent] = []

    async def record_event(event: AgentEvent) -> None:
        captured_events.append(event)

    await runner.execute_batch(
        tool_calls=[index_write],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=record_event,
        summary=summary,
        dod=dod,
        executor=scripted_executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert dod.last_verification_result is None
    assert "Collect verification evidence" not in dod.pending_items
    assert "Create first nginx chapter" in dod.pending_items
    assert not any(
        entry.reason_code == "verification_planned" for entry in summary.workflow_timeline
    )
7044
7045
@pytest.mark.asyncio
async def test_tool_batch_runner_marks_passed_verification_stale_after_new_mutation(
    temp_dir: Path,
) -> None:
    """A write after a passed verification turns that verification stale.

    The DoD starts with a passed attempt 1 and its evidence. After the write,
    the result is "stale", the evidence is cleared, the evidence item is
    pending again, and the active attempt has advanced to attempt 2. The
    timeline records the stale observation for attempt 1.
    """

    async def forbid_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def forbid_verification(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run for this scenario")

    runtime_context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=forbid_confidence,
        verify_action=forbid_verification,
    )
    runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))

    readme_path = temp_dir / "README.md"
    write_call = ToolCall(
        id="write-1",
        name="write",
        arguments={"file_path": str(readme_path), "content": "updated\n"},
    )
    scripted_executor = FakeExecutor(
        [tool_outcome(tool_call=write_call, output="wrote file", is_error=False)]
    )
    summary = TurnSummary(final_response="")

    # Seed a DoD whose first verification attempt has already passed.
    dod = create_definition_of_done("Update README and verify it still works.")
    dod.verification_commands = ["uv run pytest -q"]
    dod.last_verification_result = "passed"
    dod.verification_attempt_counter = 1
    dod.active_verification_attempt_id = "verification-attempt-1"
    dod.active_verification_attempt_number = 1
    passed_evidence = VerificationEvidence(
        command="uv run pytest -q",
        passed=True,
        stdout="401 passed",
        kind="test",
    )
    dod.evidence = [passed_evidence]
    dod.completed_items.append("Collect verification evidence")
    captured_events: list[AgentEvent] = []

    async def record_event(event: AgentEvent) -> None:
        captured_events.append(event)

    await runner.execute_batch(
        tool_calls=[write_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=record_event,
        summary=summary,
        dod=dod,
        executor=scripted_executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    # Definition-of-done bookkeeping after the mutation.
    assert dod.last_verification_result == "stale"
    assert dod.evidence == []
    assert "Collect verification evidence" in dod.pending_items
    assert "Collect verification evidence" not in dod.completed_items
    assert dod.active_verification_attempt_id == "verification-attempt-2"
    assert dod.active_verification_attempt_number == 2

    # Timeline entry describing the superseded attempt.
    last_entry = summary.workflow_timeline[-1]
    assert last_entry.reason_code == "verification_stale"
    assert last_entry.policy_outcome == "stale"
    observation = last_entry.verification_observations[0]
    assert observation.status == "stale"
    assert observation.attempt_id == "verification-attempt-1"
    assert observation.attempt_number == 1
    assert observation.supersedes_attempt_id == "verification-attempt-2"
    assert observation.command == "uv run pytest -q"
7140
7141
def test_tool_batch_runner_blocked_active_repair_nudge_uses_repair_scope(temp_dir: Path) -> None:
    """The blocked active-repair nudge names the paths from the repair focus.

    The queued message must mention both the file to edit and the chapter file
    that the repair focus offers to create.
    """

    async def forbid_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def forbid_verification(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run in this scenario")

    guide_dir = temp_dir / "guide"
    repair_target = guide_dir / "index.html"
    missing_chapter = guide_dir / "chapters" / "01-getting-started.html"
    repair_brief = (
        "Repair focus:\n"
        f"- Fix the broken local reference `chapters/01-getting-started.html` in `{repair_target}`.\n"
        f"- Immediate next step: edit `{repair_target}`.\n"
        f"- If the broken reference should remain, create `{missing_chapter}`; otherwise remove or replace `chapters/01-getting-started.html`.\n"
    )

    runtime_context = build_context(
        temp_dir=temp_dir,
        messages=[Message(role=Role.ASSISTANT, content=repair_brief)],
        safeguards=FakeSafeguards(),
        assess_confidence=forbid_confidence,
        verify_action=forbid_verification,
    )
    queued: list[str] = []
    runtime_context.queue_steering_message_callback = queued.append
    runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))

    blocked_output = "[Blocked - active repair scope: verification already identified the repair target.]"
    runner._queue_blocked_active_repair_nudge(blocked_output)

    assert queued
    nudge = queued[0]
    assert str(repair_target) in nudge
    assert str(temp_dir / "guide" / "chapters" / "01-getting-started.html") in nudge
    assert "Do not reopen unrelated reference materials" in nudge
7188
7189
def test_tool_batch_runner_blocked_active_repair_mutation_nudge_uses_allowed_paths(
    temp_dir: Path,
) -> None:
    """The blocked repair-mutation nudge lists the paths named by the repair focus.

    The queued message must mention the chapter to edit and the stylesheet the
    repair focus offers to create, and warn against widening the change set.
    """

    async def forbid_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def forbid_verification(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run in this scenario")

    guide_dir = temp_dir / "guide"
    repair_target = guide_dir / "chapters" / "05-advanced-configurations.html"
    stylesheet = guide_dir / "styles.css"
    repair_brief = (
        "Repair focus:\n"
        f"- Fix the broken local reference `../styles.css` in `{repair_target}`.\n"
        f"- Immediate next step: edit `{repair_target}`.\n"
        f"- If the broken reference should remain, create `{stylesheet}`; otherwise remove or replace `../styles.css`.\n"
    )

    runtime_context = build_context(
        temp_dir=temp_dir,
        messages=[Message(role=Role.ASSISTANT, content=repair_brief)],
        safeguards=FakeSafeguards(),
        assess_confidence=forbid_confidence,
        verify_action=forbid_verification,
    )
    queued: list[str] = []
    runtime_context.queue_steering_message_callback = queued.append
    runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))

    blocked_output = "[Blocked - active repair mutation scope: verification already identified the repair target.]"
    runner._queue_blocked_active_repair_mutation_nudge(blocked_output)

    assert queued
    nudge = queued[0]
    assert str(repair_target) in nudge
    assert str(stylesheet) in nudge
    assert "before widening the change set" in nudge
7239
7240
def test_tool_batch_runner_duplicate_repair_mutation_restates_verifier_deltas(
    temp_dir: Path,
) -> None:
    """The duplicate-mutation nudge repeats the outstanding repair findings.

    With a user-role repair focus listing two quality findings, the nudge for
    a duplicate write must say the write was skipped, forbid resubmitting the
    same content, and restate both findings.
    """

    async def forbid_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def forbid_verification(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run in this scenario")

    guide_dir = temp_dir / "guide"
    index_path = guide_dir / "index.html"
    chapter_path = guide_dir / "chapters" / "02-installation.html"
    repair_brief = (
        "Repair focus:\n"
        f"- Improve `{index_path}`: insufficient structured content (9 blocks, expected at least 12).\n"
        f"- Improve `{chapter_path}`: thin content (526 text chars, expected at least 1758).\n"
        f"- Immediate next step: edit `{index_path}`.\n"
        "- Update the listed generated artifacts directly; do not recreate the artifact set.\n"
    )

    runtime_context = build_context(
        temp_dir=temp_dir,
        messages=[Message(role=Role.USER, content=repair_brief)],
        safeguards=FakeSafeguards(),
        assess_confidence=forbid_confidence,
        verify_action=forbid_verification,
    )
    queued: list[str] = []
    runtime_context.queue_steering_message_callback = queued.append
    runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))
    dod = create_definition_of_done("Create a multi-file guide.")

    duplicate_write = ToolCall(
        id="dup-write",
        name="write",
        arguments={"file_path": str(index_path), "content": "<h1>same</h1>"},
    )
    runner._queue_duplicate_mutation_nudge(duplicate_write, dod=dod)  # type: ignore[attr-defined]

    assert queued
    nudge = queued[0]
    assert "skipped because it would not change" in nudge
    assert "Do not submit the same content again" in nudge
    assert "insufficient structured content" in nudge
    assert "thin content" in nudge
    assert "make one real edit" in nudge
7299
7300
@pytest.mark.asyncio
async def test_tool_batch_runner_quality_repair_success_hands_to_next_target(
    temp_dir: Path,
) -> None:
    """Expect a successful write to the first quality target to queue a handoff to the second.

    The repair focus lists two quality targets in order. After the scripted
    executor reports a successful write to the first one, one of the queued
    steering messages must point at the second target and restate its finding.
    """

    # Neither callback may be reached in this scenario; both fail loudly if called.
    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run in this scenario")

    chapters = temp_dir / "guide" / "chapters"
    first = chapters / "01-introduction.html"
    second = chapters / "02-installation.html"
    chapters.mkdir(parents=True)
    first.write_text("<h1>Intro</h1>\n")
    second.write_text("<h1>Install</h1>\n")
    context = build_context(
        temp_dir=temp_dir,
        messages=[
            Message(
                role=Role.ASSISTANT,
                content=(
                    "Repair focus:\n"
                    f"- Improve `{first}`: thin content (400 text chars, expected at least 1758).\n"
                    f"- Improve `{second}`: insufficient structured content (6 blocks, expected at least 18).\n"
                    f"- Immediate next step: edit `{first}` with a substantial expansion or replacement.\n"
                    "- Repair every listed quality target in order before any final answer.\n"
                ),
            )
        ],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
    )
    # Capture steering messages instead of delivering them anywhere.
    queued: list[str] = []
    context.queue_steering_message_callback = queued.append
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))
    dod = create_definition_of_done("Repair generated HTML guide quality.")
    tool_call = ToolCall(
        id="write-intro",
        name="write",
        arguments={
            "file_path": str(first),
            "content": "<h1>Intro</h1><p>Substantial expansion.</p>\n",
        },
    )

    await runner.execute_batch(
        tool_calls=[tool_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=TurnSummary(final_response=""),
        dod=dod,
        executor=FakeExecutor(
            [
                tool_outcome(
                    tool_call=tool_call,
                    output=f"Successfully wrote {first}",
                    is_error=False,
                )
            ]
        ),  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert queued
    # Use a default with next(): a bare next() on an exhausted generator raises
    # StopIteration, which inside a coroutine is reported as
    # "RuntimeError: coroutine raised StopIteration" rather than a test failure.
    handoff = next(
        (message for message in queued if "next listed quality target" in message),
        None,
    )
    assert handoff is not None, f"no handoff nudge was queued; got: {queued!r}"
    assert str(second.resolve(strict=False)) in handoff
    assert "Do not rerun verification" in handoff
    assert "Repair focus:" in handoff
    assert "insufficient structured content" in handoff
    assert f"Immediate next step: edit `{second.resolve(strict=False)}`" in handoff
    assert all("All explicitly planned artifacts now exist" not in message for message in queued)
7387
7388
@pytest.mark.asyncio
async def test_tool_batch_runner_hands_off_after_active_repair_support_file_write(
    temp_dir: Path,
) -> None:
    """Expect a handoff nudge after the stylesheet named in the repair focus is written."""

    # Neither callback may be reached in this scenario; both fail loudly if called.
    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run in this scenario")

    guide_dir = temp_dir / "guide"
    repair_target = guide_dir / "index.html"
    stylesheet = guide_dir / "style.css"
    guide_dir.mkdir(parents=True)
    repair_target.write_text('<link rel="stylesheet" href="style.css">\n')
    repair_focus = (
        "Repair focus:\n"
        f"- Fix the broken local reference `style.css` in `{repair_target}`.\n"
        f"- Immediate next step: edit `{repair_target}`.\n"
        f"- If the broken reference should remain, create `{stylesheet}`; otherwise remove or replace `style.css`.\n"
    )
    context = build_context(
        temp_dir=temp_dir,
        messages=[Message(role=Role.ASSISTANT, content=repair_focus)],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
    )
    # Capture steering messages instead of delivering them anywhere.
    nudges: list[str] = []
    context.queue_steering_message_callback = nudges.append
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))
    dod = create_definition_of_done("Repair a guide stylesheet link.")
    tool_call = ToolCall(
        id="write-style",
        name="write",
        arguments={
            "file_path": str(stylesheet),
            "content": "body { font-family: sans-serif; }\n",
        },
    )
    # The executor is scripted to report one successful write.
    scripted_outcomes = [
        tool_outcome(
            tool_call=tool_call,
            output=f"Successfully wrote {stylesheet}",
            is_error=False,
        )
    ]

    await runner.execute_batch(
        tool_calls=[tool_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=TurnSummary(final_response=""),
        dod=dod,
        executor=FakeExecutor(scripted_outcomes),  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert nudges
    # Each fragment may appear in any of the queued messages.
    for expected_fragment in (
        "support file for the active verification repair now exists",
        "Do not retarget",
        "Loader can re-run verification",
    ):
        assert any(expected_fragment in message for message in nudges)
7468
7469
def test_tool_batch_runner_blocked_late_reference_drift_nudge_points_to_missing_artifact(
    temp_dir: Path,
) -> None:
    """Expect the late-reference-drift nudge to name the one planned file not yet on disk."""

    # Neither callback may be reached in this scenario; both fail loudly if called.
    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run in this scenario")

    context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
    )
    # Capture steering messages instead of delivering them anywhere.
    nudges: list[str] = []
    context.queue_steering_message_callback = nudges.append
    store = DefinitionOfDoneStore(temp_dir)
    dod = create_definition_of_done("Create a multi-file guide from a reference")

    # The plan lists four artifacts; only the first three are created below.
    plan_path = temp_dir / "implementation.md"
    plan_text = "".join(
        (
            "# File Changes\n",
            "- `guide/index.html`\n",
            "- `guide/chapters/01-getting-started.html`\n",
            "- `guide/chapters/02-installation.html`\n",
            "- `guide/chapters/03-first-website.html`\n",
        )
    )
    plan_path.write_text(plan_text)
    dod.implementation_plan = str(plan_path)

    guide_dir = temp_dir / "guide"
    chapters_dir = guide_dir / "chapters"
    chapters_dir.mkdir(parents=True, exist_ok=True)
    for artifact, body in (
        (guide_dir / "index.html", "index"),
        (chapters_dir / "01-getting-started.html", "one"),
        (chapters_dir / "02-installation.html", "two"),
    ):
        artifact.write_text(body)
    runner = ToolBatchRunner(context, store)

    blocked_output = "[Blocked - late reference drift: several planned artifacts already exist.]"
    runner._queue_blocked_late_reference_drift_nudge(blocked_output, dod=dod)

    assert nudges
    first_nudge = nudges[0]
    assert "03-first-website.html" in first_nudge
    assert "older reference materials" in first_nudge
7522
7523
def test_tool_batch_runner_blocked_completed_artifact_scope_nudge_prefers_verification(
    temp_dir: Path,
) -> None:
    """Expect the completed-artifact-scope nudge to switch to verify mode and cite the pending todo."""

    # Neither callback may be reached in this scenario; both fail loudly if called.
    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run in this scenario")

    # Create every artifact that the implementation plan below lists.
    guide_root = temp_dir / "guide"
    chapters = guide_root / "chapters"
    index_path = guide_root / "index.html"
    chapter_one = chapters / "01-getting-started.html"
    chapter_two = chapters / "02-installation.html"
    guide_root.mkdir(parents=True)
    chapters.mkdir()
    for artifact, body in (
        (index_path, "index"),
        (chapter_one, "one"),
        (chapter_two, "two"),
    ):
        artifact.write_text(body)

    plan_lines = [
        "# Implementation Plan",
        "",
        "## File Changes",
        f"- `{guide_root}`",
        f"- `{chapters}`",
        f"- `{index_path}`",
        f"- `{chapter_one}`",
        f"- `{chapter_two}`",
        "",
    ]
    implementation_plan = temp_dir / "implementation.md"
    implementation_plan.write_text("\n".join(plan_lines))

    context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
    )
    # Capture steering messages instead of delivering them anywhere.
    nudges: list[str] = []
    context.queue_steering_message_callback = nudges.append
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))
    dod = create_definition_of_done("Create a multi-file guide from a reference")
    dod.implementation_plan = str(implementation_plan)
    dod.verification_commands = [f"ls -la {guide_root}"]
    pending_todo = {
        "content": "Verify all guide files are linked and complete",
        "active_form": "Working on: Verify all guide files are linked and complete",
        "status": "pending",
    }
    sync_todos_to_definition_of_done(dod, [pending_todo], project_root=temp_dir)

    blocked_output = (
        "[Blocked - completed artifact set scope: all explicitly planned artifacts already exist.]"
    )
    runner._queue_blocked_completed_artifact_scope_nudge(blocked_output, dod=dod)

    assert nudges
    assert context.workflow_mode == "verify"
    first_nudge = nudges[0]
    for expected_fragment in (
        "All explicitly planned artifacts already exist.",
        "Verify all guide files are linked and complete",
        "Do not reopen earlier reference materials.",
        "Finish with a final response so Loader can verify",
    ):
        assert expected_fragment in first_nudge
7606
7607
def test_tool_batch_runner_blocked_post_build_audit_nudge_switches_to_verify(
    temp_dir: Path,
) -> None:
    """Expect a post-build audit block to switch to verify mode and ask for a final response."""

    # Neither callback may be reached in this scenario; both fail loudly if called.
    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run in this scenario")

    # Create every artifact that the implementation plan below lists.
    guide_root = temp_dir / "guide"
    chapters = guide_root / "chapters"
    index_path = guide_root / "index.html"
    chapter_one = chapters / "01-getting-started.html"
    chapter_two = chapters / "02-installation.html"
    guide_root.mkdir(parents=True)
    chapters.mkdir()
    for artifact, body in (
        (index_path, "index"),
        (chapter_one, "one"),
        (chapter_two, "two"),
    ):
        artifact.write_text(body)

    plan_lines = [
        "# Implementation Plan",
        "",
        "## File Changes",
        f"- `{guide_root}`",
        f"- `{chapters}`",
        f"- `{index_path}`",
        f"- `{chapter_one}`",
        f"- `{chapter_two}`",
        "",
    ]
    implementation_plan = temp_dir / "implementation.md"
    implementation_plan.write_text("\n".join(plan_lines))

    context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
    )
    # Capture steering messages instead of delivering them anywhere.
    nudges: list[str] = []
    context.queue_steering_message_callback = nudges.append
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))
    dod = create_definition_of_done("Create a multi-file guide from a reference")
    dod.implementation_plan = str(implementation_plan)
    dod.verification_commands = [f"ls -la {guide_root}"]

    blocked_output = (
        "[Blocked - post-build audit loop: all explicitly planned artifacts already exist.]"
    )
    runner._queue_blocked_completed_artifact_scope_nudge(blocked_output, dod=dod)

    assert nudges
    assert context.workflow_mode == "verify"
    first_nudge = nudges[0]
    assert "All explicitly planned artifacts already exist." in first_nudge
    assert "finish with a final response so Loader can verify" in first_nudge
7677
7678
@pytest.mark.asyncio
async def test_tool_batch_runner_does_not_halt_on_repeated_post_build_audit_blocks(
    temp_dir: Path,
) -> None:
    """Expect three blocked audit calls in one batch to leave the runner un-halted in verify mode."""

    # Neither callback may be reached in this scenario; both fail loudly if called.
    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run in this scenario")

    # Create every artifact that the implementation plan below lists.
    guide_root = temp_dir / "guide"
    chapters = guide_root / "chapters"
    index_path = guide_root / "index.html"
    chapter_one = chapters / "01-getting-started.html"
    chapter_two = chapters / "02-installation.html"
    guide_root.mkdir(parents=True)
    chapters.mkdir()
    for artifact, body in (
        (index_path, "index"),
        (chapter_one, "one"),
        (chapter_two, "two"),
    ):
        artifact.write_text(body)

    plan_lines = [
        "# Implementation Plan",
        "",
        "## File Changes",
        f"- `{guide_root}`",
        f"- `{chapters}`",
        f"- `{index_path}`",
        f"- `{chapter_one}`",
        f"- `{chapter_two}`",
        "",
    ]
    implementation_plan = temp_dir / "implementation.md"
    implementation_plan.write_text("\n".join(plan_lines))

    context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
    )
    # Capture steering messages instead of delivering them anywhere.
    nudges: list[str] = []
    context.queue_steering_message_callback = nudges.append
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))
    dod = create_definition_of_done("Create a multi-file guide from a reference")
    dod.implementation_plan = str(implementation_plan)
    dod.verification_commands = [f"ls -la {guide_root}"]

    blocked_message = (
        "[Blocked - post-build audit loop: all explicitly planned artifacts already exist.]"
    )
    # Three identical audit commands, each scripted to come back BLOCKED.
    tool_calls = [
        ToolCall(
            id=f"audit-{index}",
            name="bash",
            arguments={"command": f"cd {temp_dir} && ls -la guide/chapters/"},
        )
        for index in range(1, 4)
    ]
    blocked_outcomes = [
        tool_outcome(
            tool_call=tool_call,
            output=blocked_message,
            is_error=True,
            state=ToolExecutionState.BLOCKED,
        )
        for tool_call in tool_calls
    ]
    executor = FakeExecutor(blocked_outcomes)
    emitted: list[AgentEvent] = []

    async def emit(event: AgentEvent) -> None:
        emitted.append(event)

    result = await runner.execute_batch(
        tool_calls=tool_calls,
        tool_source="native",
        pending_tool_calls_seen=set(),
        emit=emit,
        summary=TurnSummary(final_response=""),
        dod=dod,
        executor=executor,
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert result.halted is False
    assert result.consecutive_errors == 0
    assert context.workflow_mode == "verify"
    assert nudges
    assert any(
        "finish with a final response so Loader can verify" in message for message in nudges
    )
7786
7787
def test_tool_batch_runner_blocked_html_declared_target_nudge_uses_closest_declared_target(
    temp_dir: Path,
) -> None:
    """Expect the declared-target nudge to name the blocked file and the closest declared target."""

    # Neither callback may be reached in this scenario; both fail loudly if called.
    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run in this scenario")

    context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
    )
    # Capture steering messages instead of delivering them anywhere.
    nudges: list[str] = []
    context.queue_steering_message_callback = nudges.append
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))

    blocked_file = str(temp_dir / "guide" / "chapters" / "01-introduction.html")
    blocked_write = ToolCall(
        id="write-ch1",
        name="write",
        arguments={"file_path": blocked_file},
    )
    blocked_output = (
        "[Blocked - HTML page introduces new local targets outside the current declared artifact set] "
        "Suggestion: Keep non-root HTML pages within the root-declared local-link set and avoid "
        "introducing new sibling targets that the guide root does not declare, for example fix: 02-setup.html. "
        "Already-declared local targets include: chapters/01-introduction.html, chapters/02-installation.html, "
        "chapters/03-configuration.html. Closest declared local targets include: chapters/02-installation.html"
    )

    runner._queue_blocked_html_declared_target_nudge(blocked_write, blocked_output)

    assert nudges
    first_nudge = nudges[0]
    assert blocked_file in first_nudge
    assert "`chapters/02-installation.html`" in first_nudge
    assert "same file now" in first_nudge
7836
7837
def test_tool_batch_runner_blocked_html_declared_target_nudge_without_close_match(
    temp_dir: Path,
) -> None:
    """Expect the nudge to list the allowed hrefs when the block reports no closest target."""

    # Neither callback may be reached in this scenario; both fail loudly if called.
    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run in this scenario")

    context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
    )
    # Capture steering messages instead of delivering them anywhere.
    nudges: list[str] = []
    context.queue_steering_message_callback = nudges.append
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))

    blocked_write = ToolCall(
        id="write-ch1",
        name="write",
        arguments={"file_path": str(temp_dir / "guide" / "chapters" / "introduction.html")},
    )
    # This block output carries an "Allowed hrefs" list and no "Closest declared" list.
    blocked_output = (
        "[Blocked - HTML page introduces new local targets outside the current declared artifact set] "
        "Suggestion: Keep non-root HTML pages within the root-declared local-link set and avoid "
        "introducing new sibling targets that the guide root does not declare; remove or replace "
        "undeclared hrefs like: troubleshooting.html. "
        "Already-declared local targets include: chapters/introduction.html, chapters/installation.html, "
        "chapters/configuration.html. Allowed hrefs from this file include: ../index.html, "
        "installation.html, configuration.html."
    )

    runner._queue_blocked_html_declared_target_nudge(blocked_write, blocked_output)

    assert nudges
    first_nudge = nudges[0]
    assert "use only these exact href values" in first_nudge
    assert "`installation.html`" in first_nudge
    assert "`../index.html`" in first_nudge
    assert "closest declared target(s)" not in first_nudge
7889
7890
def test_tool_batch_runner_blocked_html_declared_file_creation_nudge_points_to_root(
    temp_dir: Path,
) -> None:
    """Expect the file-creation nudge to point at the guide root before retrying the creation."""

    # Neither callback may be reached in this scenario; both fail loudly if called.
    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run in this scenario")

    context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
    )
    # Capture steering messages instead of delivering them anywhere.
    nudges: list[str] = []
    context.queue_steering_message_callback = nudges.append
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))
    dod = create_definition_of_done("Create a guide.")

    target = temp_dir / "guide" / "chapters" / "troubleshooting.html"
    blocked_write = ToolCall(
        id="write-troubleshooting",
        name="write",
        arguments={"file_path": str(target)},
    )
    blocked_output = (
        "[Blocked - HTML file creation falls outside the current declared artifact set] "
        "Suggestion: Keep new non-root HTML files within the root-declared artifact set and "
        f"update the guide root `{(temp_dir / 'guide' / 'index.html').resolve(strict=False)}` "
        "before creating undeclared sibling pages, for example: chapters/troubleshooting.html. "
        "Already-declared local targets include: chapters/advanced-topics.html, "
        "chapters/basic-usage.html, chapters/configuration.html"
    )

    runner._queue_blocked_html_declared_file_creation_nudge(
        blocked_write,
        blocked_output,
        dod=dod,
    )

    assert nudges
    first_nudge = nudges[0]
    resolved_root = str((temp_dir / "guide" / "index.html").resolve(strict=False))
    assert "update" in first_nudge.lower()
    assert resolved_root in first_nudge
    assert "`chapters/troubleshooting.html`" in first_nudge
    assert "retry the file creation" in first_nudge
7944
7945
def test_tool_batch_runner_blocked_html_declared_file_creation_after_outputs_exist_prefers_verify(
    temp_dir: Path,
) -> None:
    """Expect the nudge to ask for a final response when every planned artifact is already on disk."""

    # Neither callback may be reached in this scenario; both fail loudly if called.
    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should not run in this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run in this scenario")

    # Create the three artifacts that the implementation plan below lists.
    guide = temp_dir / "guide"
    chapters = guide / "chapters"
    guide.mkdir()
    chapters.mkdir()
    index = guide / "index.html"
    intro_page = chapters / "01-introduction.html"
    install_page = chapters / "02-installation.html"
    index_links = [
        '<a href="chapters/01-introduction.html">Intro</a>',
        '<a href="chapters/02-installation.html">Install</a>',
        '<a href="../index.html">Back</a>',
        "",
    ]
    index.write_text("\n".join(index_links))
    for page in (intro_page, install_page):
        page.write_text("<html></html>\n")

    plan_lines = [
        "# Implementation Plan",
        "",
        "## File Changes",
        f"- `{index}`",
        f"- `{chapters / '01-introduction.html'}`",
        f"- `{chapters / '02-installation.html'}`",
        "",
    ]
    implementation_plan = temp_dir / "implementation.md"
    implementation_plan.write_text("\n".join(plan_lines))

    context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
    )
    # Capture steering messages instead of delivering them anywhere.
    nudges: list[str] = []
    context.queue_steering_message_callback = nudges.append
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))
    dod = create_definition_of_done("Create a guide.")
    dod.implementation_plan = str(implementation_plan)
    dod.verification_commands = [f"ls -la {guide}"]
    dod.touched_files = [str(path) for path in (index, intro_page, install_page)]

    # The blocked write targets a chapter that the plan does not list.
    target = guide / "chapters" / "08-advanced-configuration.html"
    blocked_write = ToolCall(
        id="write-extra",
        name="write",
        arguments={"file_path": str(target)},
    )
    blocked_output = (
        "[Blocked - HTML file creation falls outside the current declared artifact set] "
        "Suggestion: Keep new non-root HTML files within the root-declared artifact set and "
        f"update the guide root `{index.resolve(strict=False)}` before creating undeclared sibling pages, "
        "for example: chapters/08-advanced-configuration.html."
    )

    runner._queue_blocked_html_declared_file_creation_nudge(
        blocked_write,
        blocked_output,
        dod=dod,
    )

    assert nudges
    first_nudge = nudges[0]
    for expected_fragment in (
        "All explicitly planned artifacts already exist on disk.",
        "Do not expand the output set with `chapters/08-advanced-configuration.html`.",
        "Finish with a final response now so Loader can run verification automatically.",
    ):
        assert expected_fragment in first_nudge
    assert "update the guide root" not in first_nudge
8033
8034
def test_tool_batch_runner_blocked_html_declared_file_creation_prefers_closest_target(
    temp_dir: Path,
) -> None:
    """Expect the file-creation nudge to redirect to the closest declared target, not the root."""

    # Neither callback may be reached in this scenario; both fail loudly if called.
    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should not run in this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run in this scenario")

    context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
    )
    # Capture steering messages instead of delivering them anywhere.
    nudges: list[str] = []
    context.queue_steering_message_callback = nudges.append
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))
    dod = create_definition_of_done("Create a guide.")

    target = temp_dir / "guide" / "chapters" / "02-basics.html"
    blocked_write = ToolCall(
        id="write-basics",
        name="write",
        arguments={"file_path": str(target)},
    )
    blocked_output = (
        "[Blocked - HTML file creation falls outside the current declared artifact set] "
        "Suggestion: Keep new non-root HTML files within the root-declared artifact set. "
        "Do not create undeclared sibling page `chapters/02-basics.html`; use the closest declared local target instead. "
        "Already-declared local targets include: chapters/01-introduction.html, "
        "chapters/02-installation.html, chapters/03-basic-configuration.html. "
        "Closest declared local targets include: chapters/02-installation.html"
    )

    runner._queue_blocked_html_declared_file_creation_nudge(
        blocked_write,
        blocked_output,
        dod=dod,
    )

    assert nudges
    first_nudge = nudges[0]
    for expected_fragment in (
        "Do not create `chapters/02-basics.html`.",
        "closest declared target instead: `chapters/02-installation.html`",
        "Already-declared local targets include:",
    ):
        assert expected_fragment in first_nudge
    assert "update the guide root" not in first_nudge
8088
8089
def test_tool_batch_runner_blocked_html_missing_target_after_outputs_exist_prefers_verify(
    temp_dir: Path,
) -> None:
    """Expect the missing-target nudge to keep the edit on the same file once all planned files exist."""

    # Neither callback may be reached in this scenario; both fail loudly if called.
    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should not run in this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run in this scenario")

    # Create the three artifacts that the implementation plan below lists.
    guide = temp_dir / "guide"
    chapters = guide / "chapters"
    guide.mkdir()
    chapters.mkdir()
    index = guide / "index.html"
    intro_page = chapters / "01-introduction.html"
    install_page = chapters / "02-installation.html"
    index_links = [
        '<a href="chapters/01-introduction.html">Intro</a>',
        '<a href="chapters/02-installation.html">Install</a>',
        '<a href="../index.html">Back</a>',
        "",
    ]
    index.write_text("\n".join(index_links))
    for page in (intro_page, install_page):
        page.write_text("<html></html>\n")

    plan_lines = [
        "# Implementation Plan",
        "",
        "## File Changes",
        f"- `{index}`",
        f"- `{chapters / '01-introduction.html'}`",
        f"- `{chapters / '02-installation.html'}`",
        "",
    ]
    implementation_plan = temp_dir / "implementation.md"
    implementation_plan.write_text("\n".join(plan_lines))

    context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
    )
    # Capture steering messages instead of delivering them anywhere.
    nudges: list[str] = []
    context.queue_steering_message_callback = nudges.append
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))
    dod = create_definition_of_done("Create a guide.")
    dod.implementation_plan = str(implementation_plan)
    dod.verification_commands = [f"ls -la {guide}"]
    dod.touched_files = [str(path) for path in (index, intro_page, install_page)]

    blocked_edit = ToolCall(
        id="edit-root",
        name="edit",
        arguments={"file_path": str(index)},
    )
    blocked_output = (
        "[Blocked - Edited HTML links point to files that do not exist] "
        "Suggestion: Use only existing local targets for href values and avoid introducing missing links. "
        "Broken href(s): chapters/08-advanced-configuration.html. "
        "Replace them with an existing local target or remove the broken link."
    )

    runner._queue_blocked_html_missing_target_nudge(
        blocked_edit,
        blocked_output,
        dod=dod,
    )

    assert nudges
    first_nudge = nudges[0]
    for expected_fragment in (
        "All explicitly planned artifacts already exist on disk.",
        f"Stay on `{index}`.",
        "Do not introduce new local-link targets beyond the current output set.",
        "Repair the existing generated files instead of expanding the guide.",
        "Replace broken hrefs with existing local targets or remove the broken link.",
    ):
        assert expected_fragment in first_nudge
8177
8178
def test_tool_batch_runner_blocked_html_asset_nudge_retries_same_file(
    temp_dir: Path,
) -> None:
    """Expect the missing-asset nudge to name the blocked file and the href to remove or replace."""

    # Neither callback may be reached in this scenario; both fail loudly if called.
    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should not run in this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run in this scenario")

    context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
    )
    # Capture steering messages instead of delivering them anywhere.
    nudges: list[str] = []
    context.queue_steering_message_callback = nudges.append
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))
    target = temp_dir / "guide" / "chapters" / "03-configuration.html"
    blocked_write = ToolCall(
        id="write-config",
        name="write",
        arguments={"file_path": str(target)},
    )
    blocked_output = (
        "[Blocked - HTML local asset references do not exist] Suggestion: "
        "Use only existing local assets for non-HTML href values. "
        "Missing local asset href(s): ../styles.css. Remove the asset link, "
        "create the referenced asset first, inline the styling/content, or point "
        "the href at an existing local file."
    )

    runner._queue_blocked_html_asset_nudge(blocked_write, blocked_output)

    assert nudges
    first_nudge = nudges[0]
    for expected_fragment in (
        str(target),
        "was not created or updated",
        "Remove or replace `../styles.css`.",
        "Do not resend the same `<link>` tag",
        "do not claim completion",
    ):
        assert expected_fragment in first_nudge
8230
8231
def test_tool_batch_runner_repeated_blocked_html_asset_nudge_forces_href_removal(
    temp_dir: Path,
) -> None:
    """Check the escalated nudge when the same asset block repeats.

    The session is seeded with the blocked event twice (once at context
    construction, once via ``session.append``) before the nudge is queued
    for the same event text. The queued message must report the repeat
    count and demand that the offending ``../style.css`` line be removed.
    """

    async def _confidence_must_not_run(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        # Scoring is not expected on this code path; fail loudly if reached.
        raise AssertionError("Confidence scoring should not run in this scenario")

    async def _verification_must_not_run(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        # Verification is not expected on this code path either.
        raise AssertionError("Verification should not run in this scenario")

    blocked_event = (
        "[Blocked - HTML local asset references do not exist] Suggestion: "
        "Use only existing local assets for non-HTML href values. "
        "Missing local asset href(s): ../style.css. Remove the asset link, "
        "create the referenced asset first, inline the styling/content, or point "
        "the href at an existing local file."
    )

    seeded_history = [Message(role=Role.TOOL, content=blocked_event)]
    runtime_context = build_context(
        temp_dir=temp_dir,
        messages=seeded_history,
        safeguards=FakeSafeguards(),
        assess_confidence=_confidence_must_not_run,
        verify_action=_verification_must_not_run,
    )
    # Record a second occurrence of the same blocked event in the session.
    runtime_context.session.append(Message(role=Role.TOOL, content=blocked_event))

    steering_messages: list[str] = []
    runtime_context.queue_steering_message_callback = steering_messages.append
    batch_runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))

    blocked_target = temp_dir / "guide" / "chapters" / "05-troubleshooting.html"
    blocked_write = ToolCall(
        id="write-troubleshooting",
        name="write",
        arguments={"file_path": str(blocked_target)},
    )
    batch_runner._queue_blocked_html_asset_nudge(blocked_write, blocked_event)

    assert steering_messages
    first_nudge = steering_messages[0]
    for fragment in (
        "blocked 2 times",
        "`../style.css`",
        "line removed",
        "Do not resend another",
    ):
        assert fragment in first_nudge
8284
8285
@pytest.mark.asyncio
async def test_tool_batch_runner_blocked_empty_file_path_nudges_concrete_next_artifact(
    temp_dir: Path,
) -> None:
    """Check the nudge queued when a ``write`` is blocked for an empty path.

    The implementation plan lists three files; two exist on disk and are
    recorded as touched, while ``02-installation.html`` does not exist.
    After the batch runs with a scripted BLOCKED outcome, the queued
    steering message must point at that missing chapter and the recovery
    context must record the blocked message as the latest attempt's error.
    """

    async def _confidence_must_not_run(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def _verification_must_not_run(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run in this scenario")

    # Lay out the guide tree: index and chapter one exist, chapter two does not.
    guide_root = temp_dir / "guides" / "nginx"
    chapters_dir = guide_root / "chapters"
    chapters_dir.mkdir(parents=True)
    index_path = guide_root / "index.html"
    chapter_one = chapters_dir / "01-introduction.html"
    chapter_two = chapters_dir / "02-installation.html"
    index_path.write_text("<html></html>\n")
    chapter_one.write_text("<h1>Intro</h1>\n")

    # Write the plan that names all three artifacts under "File Changes".
    plan_lines = ["# Implementation Plan", "", "## File Changes"]
    plan_lines += [f"- `{planned}`" for planned in (index_path, chapter_one, chapter_two)]
    plan_lines.append("")  # trailing empty entry yields the final newline
    plan_path = temp_dir / "implementation.md"
    plan_path.write_text("\n".join(plan_lines))

    runtime_context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=_confidence_must_not_run,
        verify_action=_verification_must_not_run,
        auto_recover=False,
    )
    steering_messages: list[str] = []
    runtime_context.queue_steering_message_callback = steering_messages.append
    batch_runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))

    # The scripted executor returns a single BLOCKED outcome for the empty-path write.
    empty_path_write = ToolCall(
        id="write-2",
        name="write",
        arguments={"file_path": "", "content": "<html></html>\n"},
    )
    blocked_message = "[Blocked - Empty file path] Suggestion: Provide a valid file path"
    blocked_result = Message.tool_result_message(
        tool_call_id=empty_path_write.id,
        display_content=blocked_message,
        result_content=blocked_message,
        is_error=True,
    )
    blocked_outcome = ToolExecutionOutcome(
        tool_call=empty_path_write,
        state=ToolExecutionState.BLOCKED,
        message=blocked_result,
        event_content=blocked_message,
        is_error=True,
        result_output=blocked_message,
    )
    scripted_executor = FakeExecutor([blocked_outcome])

    definition = create_definition_of_done("Create a multi-file nginx guide.")
    definition.implementation_plan = str(plan_path)
    definition.touched_files.extend([str(index_path), str(chapter_one)])
    definition.pending_items.append("Creating Chapter 2: Installation and Setup")

    await batch_runner.execute_batch(
        tool_calls=[empty_path_write],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=TurnSummary(final_response=""),
        dod=definition,
        executor=scripted_executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert steering_messages
    first_nudge = steering_messages[0]
    prefer_write_hint = (
        f"Prefer one `write` call for `{display_runtime_path(chapter_two)}` instead of more rereads."
    )
    for fragment in (
        "did not provide a valid `file_path`",
        "Resume by creating `02-installation.html` now.",
        prefer_write_hint,
    ):
        assert fragment in first_nudge

    recovery = runtime_context.recovery_context
    assert recovery is not None
    assert recovery.attempts[-1].error == blocked_message