Python · 224940 bytes Raw Blame History
1 """Tests for tool-batch execution on RuntimeContext."""
2
3 from __future__ import annotations
4
5 from pathlib import Path
6 from types import SimpleNamespace
7
8 import pytest
9
10 from loader.llm.base import Message, Role, ToolCall
11 from loader.runtime.context import RuntimeContext
12 from loader.runtime.dod import (
13 DefinitionOfDoneStore,
14 VerificationEvidence,
15 create_definition_of_done,
16 )
17 from loader.runtime.events import AgentEvent, TurnSummary
18 from loader.runtime.executor import ToolExecutionOutcome, ToolExecutionState
19 from loader.runtime.path_display import display_runtime_path
20 from loader.runtime.permissions import (
21 PermissionMode,
22 build_permission_policy,
23 load_permission_rules,
24 )
25 from loader.runtime.reasoning_types import (
26 ActionVerification,
27 ConfidenceAssessment,
28 ConfidenceLevel,
29 )
30 from loader.runtime.recovery import RecoveryContext
31 from loader.runtime.tool_batches import (
32 ToolBatchRunner,
33 )
34 from loader.runtime.tool_batches import (
35 _should_prioritize_missing_artifact as tool_batches_should_prioritize_missing_artifact,
36 )
37 from loader.runtime.workflow import sync_todos_to_definition_of_done
38 from loader.tools.base import ToolResult as RegistryToolResult
39 from loader.tools.base import create_default_registry
40 from tests.helpers.runtime_harness import ScriptedBackend
41
42
class FakeSession:
    """In-memory session double that records messages and timeline entries.

    The seed messages are copied on construction, so later appends never
    touch the list the caller passed in.
    """

    def __init__(self, messages: list[Message]) -> None:
        # Private copy of the seed transcript.
        self.messages = [*messages]
        self.workflow_timeline = []

    def append(self, message: Message) -> None:
        """Add *message* to the end of the recorded transcript."""
        self.messages.append(message)

    def append_workflow_timeline_entry(self, entry) -> None:
        """Add *entry* to the end of the recorded workflow timeline."""
        self.workflow_timeline.append(entry)
53
54
class FakeCodeFilter:
    """Code-filter double with nothing to reset."""

    def reset(self) -> None:
        """Do nothing and return ``None``."""
58
59
class FakeSafeguards:
    """Safeguards double: content passes through and steering never fires.

    Only ``detect_loop`` is configurable; it returns whatever tuple was given
    as ``detect_loop_result`` at construction time.
    """

    def __init__(self, *, detect_loop_result: tuple[bool, str] = (False, "")) -> None:
        self._detect_loop_result = detect_loop_result
        # Opaque placeholders; this double never calls into them.
        self.action_tracker = object()
        self.validator = object()
        self.code_filter = FakeCodeFilter()

    def filter_stream_chunk(self, content: str) -> str:
        """Return the streamed chunk unchanged."""
        return content

    def filter_complete_content(self, content: str) -> str:
        """Return the completed content unchanged."""
        return content

    def should_steer(self) -> bool:
        """Report that no steering is needed."""
        return False

    def get_steering_message(self) -> str | None:
        """Return ``None``; this double has no steering message."""
        return None

    def record_response(self, content: str) -> None:
        """Ignore the response."""

    def detect_text_loop(self, content: str) -> tuple[bool, str]:
        """Report that no text loop was detected."""
        return (False, "")

    def detect_loop(self) -> tuple[bool, str]:
        """Return the loop-detection result configured in the constructor."""
        return self._detect_loop_result
87
88
class FakeExecutor:
    """Executor double that replays queued outcomes in first-in, first-out order.

    Every tool call it receives is appended to ``calls`` so tests can assert
    on what was (or was not) executed.
    """

    def __init__(self, outcomes: list[ToolExecutionOutcome]) -> None:
        self.calls: list[ToolCall] = []
        # Private copy of the queue; the caller's list is left intact.
        self._outcomes = [*outcomes]

    async def execute_tool_call(self, tool_call: ToolCall, **_: object) -> ToolExecutionOutcome:
        """Record *tool_call* and hand back the next queued outcome.

        Extra keyword arguments are accepted and ignored.

        Raises:
            AssertionError: if no queued outcome is left.
        """
        self.calls.append(tool_call)
        if self._outcomes:
            return self._outcomes.pop(0)
        raise AssertionError("No fake tool outcome queued")
99
100
def build_context(
    *,
    temp_dir: Path,
    messages: list[Message],
    safeguards: FakeSafeguards,
    assess_confidence,
    verify_action,
    recovery_context: RecoveryContext | None = None,
    confidence_scoring: bool = False,
    verification: bool = False,
    auto_recover: bool = True,
    min_confidence_for_action: int = 3,
) -> RuntimeContext:
    """Assemble a ``RuntimeContext`` rooted at *temp_dir* for tool-batch tests.

    The context uses the default tool registry, a workspace-write permission
    policy built from the rules loaded out of *temp_dir*, a ``ScriptedBackend``
    and a ``FakeSession`` seeded with *messages*.

    Args:
        temp_dir: Project and workspace root for the registry and policy.
        messages: Seed transcript for the fake session.
        safeguards: Safeguards double installed on the context.
        assess_confidence: Callable exposed as ``reasoning.assess_confidence``.
        verify_action: Callable exposed as ``reasoning.verify_action``.
        recovery_context: Recovery context to start with, if any.
        confidence_scoring: Value for ``config.reasoning.confidence_scoring``.
        verification: Value for ``config.reasoning.verification``.
        auto_recover: Value for ``config.auto_recover``.
        min_confidence_for_action: Value for
            ``config.reasoning.min_confidence_for_action``.

    Returns:
        The constructed runtime context in ``"execute"`` workflow mode.
    """
    tool_registry = create_default_registry(temp_dir)
    tool_registry.configure_workspace_root(temp_dir)

    permission_rules = load_permission_rules(temp_dir)
    permission_policy = build_permission_policy(
        active_mode=PermissionMode.WORKSPACE_WRITE,
        workspace_root=temp_dir,
        tool_requirements=tool_registry.get_tool_requirements(),
        rules=permission_rules.rules,
    )

    # Plain namespaces stand in for the real config and reasoning objects.
    reasoning_config = SimpleNamespace(
        rollback=False,
        show_rollback_plan=False,
        completion_check=True,
        max_continuation_prompts=5,
        self_critique=False,
        confidence_scoring=confidence_scoring,
        min_confidence_for_action=min_confidence_for_action,
        verification=verification,
    )
    runtime_config = SimpleNamespace(
        force_react=False,
        max_recovery_attempts=2,
        auto_recover=auto_recover,
        reasoning=reasoning_config,
    )
    reasoning_services = SimpleNamespace(
        assess_confidence=assess_confidence,
        verify_action=verify_action,
    )

    return RuntimeContext(
        project_root=temp_dir,
        backend=ScriptedBackend(),
        registry=tool_registry,
        session=FakeSession(messages),  # type: ignore[arg-type]
        config=runtime_config,
        capability_profile=SimpleNamespace(supports_native_tools=True),  # type: ignore[arg-type]
        project_context=None,
        permission_policy=permission_policy,
        permission_config_status=permission_rules,
        workflow_mode="execute",
        safeguards=safeguards,
        reasoning=reasoning_services,
        recovery_context=recovery_context,
    )
156
157
def tool_outcome(
    *,
    tool_call: ToolCall,
    output: str,
    is_error: bool,
    state: ToolExecutionState = ToolExecutionState.EXECUTED,
    metadata: dict[str, object] | None = None,
) -> ToolExecutionOutcome:
    """Build a ``ToolExecutionOutcome`` whose every text field is *output*.

    Args:
        tool_call: The call the outcome belongs to; its id tags the message.
        output: Text used for the display, result and event content alike.
        is_error: Error flag applied to the message, outcome and registry result.
        state: Execution state to report; defaults to ``EXECUTED``.
        metadata: Registry-result metadata; a falsy value becomes ``{}``.

    Returns:
        The assembled outcome, including a registry ``ToolResult``.
    """
    result_message = Message.tool_result_message(
        tool_call_id=tool_call.id,
        display_content=output,
        result_content=output,
        is_error=is_error,
    )
    registry_result = RegistryToolResult(
        output=output,
        is_error=is_error,
        metadata=metadata if metadata else {},
    )
    return ToolExecutionOutcome(
        tool_call=tool_call,
        state=state,
        message=result_message,
        event_content=output,
        is_error=is_error,
        result_output=output,
        registry_result=registry_result,
    )
184
185
@pytest.mark.asyncio
async def test_tool_batch_runner_uses_context_for_confidence_gate(temp_dir: Path) -> None:
    """A LOW confidence assessment skips the tool call and warns via the session.

    The scorer records the context string it receives so the test can check
    that the user's message reached it.
    """
    seen: dict[str, str] = {}

    async def low_confidence(tool_name: str, tool_args: dict, context: str) -> ConfidenceAssessment:
        seen["context"] = context
        return ConfidenceAssessment(
            action=f"{tool_name} with {tool_args}",
            tool_name=tool_name,
            tool_args=tool_args,
            level=ConfidenceLevel.LOW,
            reasoning="Need to inspect the target first.",
            risks=["Unknown target file"],
        )

    async def forbid_verification(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run for skipped actions")

    history = [
        Message(role=Role.USER, content="Please inspect the project."),
        Message(role=Role.ASSISTANT, content="I will read the file next."),
    ]
    context = build_context(
        temp_dir=temp_dir,
        messages=history,
        safeguards=FakeSafeguards(),
        assess_confidence=low_confidence,
        verify_action=forbid_verification,
        confidence_scoring=True,
        min_confidence_for_action=3,
    )
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))
    read_call = ToolCall(id="read-1", name="read", arguments={"file_path": "README.md"})
    recorded: list[AgentEvent] = []

    async def record_event(event: AgentEvent) -> None:
        recorded.append(event)

    # One outcome is queued, but the gate should stop it from being consumed.
    executor = FakeExecutor([tool_outcome(tool_call=read_call, output="unused", is_error=False)])
    result = await runner.execute_batch(
        tool_calls=[read_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=record_event,
        summary=TurnSummary(final_response=""),
        dod=create_definition_of_done("Read the docs"),
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert result.actions_taken == []
    assert executor.calls == []
    assert "Please inspect the project." in seen["context"]

    last_message = context.session.messages[-1]
    assert last_message.role == Role.USER
    assert "[LOW CONFIDENCE WARNING]" in last_message.content
    assert any(event.type == "confidence" for event in recorded)
245
246
@pytest.mark.asyncio
async def test_tool_batch_runner_tracks_recovery_with_legacy_context(temp_dir: Path) -> None:
    """A failed tool call with auto-recover on creates a recovery context.

    Also checks that the tool result is the last session message and that a
    ``recovery`` event is emitted.
    """

    async def forbid_confidence(tool_name: str, tool_args: dict, context: str) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def forbid_verification(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run for failed actions")

    context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=forbid_confidence,
        verify_action=forbid_verification,
        auto_recover=True,
    )
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))
    failing_call = ToolCall(id="bash-1", name="bash", arguments={"command": "pytest"})
    failure = tool_outcome(tool_call=failing_call, output="command failed", is_error=True)
    executor = FakeExecutor([failure])
    summary = TurnSummary(final_response="")
    recorded: list[AgentEvent] = []

    async def record_event(event: AgentEvent) -> None:
        recorded.append(event)

    await runner.execute_batch(
        tool_calls=[failing_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=record_event,
        summary=summary,
        dod=create_definition_of_done("Run tests"),
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert context.recovery_context is not None
    assert summary.tool_result_messages
    assert context.session.messages[-1] == summary.tool_result_messages[-1]
    emitted_types = [event.type for event in recorded]
    assert "recovery" in emitted_types
290
291
@pytest.mark.asyncio
async def test_tool_batch_runner_emits_tool_metadata(temp_dir: Path) -> None:
    """The ``tool_result`` event carries the registry result's metadata.

    A background ``bash`` call is replayed with job metadata attached, and
    the first emitted ``tool_result`` event must expose that same metadata.
    """

    async def assess_confidence(tool_name: str, tool_args: dict, context: str) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def verify_action(tool_name: str, tool_args: dict, result: str, expected: str = "") -> ActionVerification:
        raise AssertionError("Verification should not run for this scenario")

    context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))
    tool_call = ToolCall(
        id="bash-1",
        name="bash",
        arguments={"command": "python -m http.server 8000", "background": True},
    )
    metadata = {
        "job_id": "bash-1",
        "status": "running",
        "background": True,
    }
    executor = FakeExecutor(
        [
            tool_outcome(
                tool_call=tool_call,
                output="Started bash job bash-1",
                is_error=False,
                metadata=metadata,
            )
        ]
    )
    events: list[AgentEvent] = []

    async def emit(event: AgentEvent) -> None:
        events.append(event)

    await runner.execute_batch(
        tool_calls=[tool_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=emit,
        summary=TurnSummary(final_response=""),
        dod=create_definition_of_done("Launch a preview server"),
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    # A bare next() on an exhausted generator raises StopIteration, which a
    # coroutine turns into an opaque RuntimeError. Collect the matches and
    # assert explicitly so a missing event fails with a readable message.
    tool_results = [event for event in events if event.type == "tool_result"]
    assert tool_results, "expected a tool_result event to be emitted"
    assert tool_results[0].tool_metadata == metadata
350
351
@pytest.mark.asyncio
async def test_tool_batch_runner_verifies_with_context_services(temp_dir: Path) -> None:
    """With verification on, the context's ``verify_action`` sees the tool output.

    The verifier reports a discrepancy. The pre-existing recovery context must
    stay in place and record the successful read, and a ``verification`` event
    must be emitted.
    """
    verified_outputs: list[str] = []

    async def forbid_confidence(tool_name: str, tool_args: dict, context: str) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def failing_verification(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        verified_outputs.append(result)
        return ActionVerification(
            tool_name=tool_name,
            tool_args=tool_args,
            expected_outcome="Success",
            actual_result=result,
            verified=False,
            discrepancies=["File contents did not match"],
            needs_correction=True,
            correction_suggestion="Read the file before editing again.",
        )

    prior_recovery = RecoveryContext(
        original_tool="edit",
        original_args={"file_path": "README.md"},
    )
    context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=forbid_confidence,
        verify_action=failing_verification,
        recovery_context=prior_recovery,
        verification=True,
    )
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))
    read_call = ToolCall(id="read-1", name="read", arguments={"file_path": "README.md"})
    executor = FakeExecutor(
        [tool_outcome(tool_call=read_call, output="file contents", is_error=False)]
    )
    recorded: list[AgentEvent] = []

    async def record_event(event: AgentEvent) -> None:
        recorded.append(event)

    await runner.execute_batch(
        tool_calls=[read_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=record_event,
        summary=TurnSummary(final_response=""),
        dod=create_definition_of_done("Read the docs"),
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert verified_outputs == ["file contents"]
    assert context.recovery_context is prior_recovery
    assert prior_recovery.successful_steps == [("read", {"file_path": "README.md"})]

    last_message = context.session.messages[-1]
    assert last_message.role == Role.TOOL
    assert last_message.content == "file contents"
    emitted_types = [event.type for event in recorded]
    assert "verification" in emitted_types
415
416
@pytest.mark.asyncio
async def test_tool_batch_runner_preserves_recovery_context_across_diagnostic_success(
    temp_dir: Path,
) -> None:
    """A successful ``bash`` listing keeps the existing recovery context.

    The context must remain the same object and record the ``bash`` call as a
    successful step.
    """

    async def forbid_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def forbid_verification(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run for this scenario")

    # Recovery state left behind by an earlier failed read.
    missing_file_args = {"file_path": "chapters/04-data-types.html"}
    prior_recovery = RecoveryContext(
        original_tool="read",
        original_args={"file_path": "chapters/04-data-types.html"},
    )
    prior_recovery.add_attempt("read", missing_file_args, "File not found")

    context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=forbid_confidence,
        verify_action=forbid_verification,
        recovery_context=prior_recovery,
        auto_recover=False,
    )
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))
    listing_call = ToolCall(
        id="bash-1",
        name="bash",
        arguments={"command": "ls chapters"},
    )
    listing_outcome = tool_outcome(
        tool_call=listing_call,
        output="01-introduction.html",
        is_error=False,
    )
    executor = FakeExecutor([listing_outcome])

    summary = TurnSummary(final_response="")
    await runner.execute_batch(
        tool_calls=[listing_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=summary,
        dod=create_definition_of_done("Fix the chapter links"),
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert context.recovery_context is prior_recovery
    expected_steps = [("bash", {"command": "ls chapters"})]
    assert prior_recovery.successful_steps == expected_steps
483
484
@pytest.mark.asyncio
async def test_tool_batch_runner_clears_recovery_context_after_successful_mutation(
    temp_dir: Path,
) -> None:
    """A successful ``patch`` call clears the existing recovery context."""

    async def forbid_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def forbid_verification(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run for this scenario")

    # Recovery state left behind by an earlier failed read.
    missing_file_args = {"file_path": "chapters/04-data-types.html"}
    prior_recovery = RecoveryContext(
        original_tool="read",
        original_args={"file_path": "chapters/04-data-types.html"},
    )
    prior_recovery.add_attempt("read", missing_file_args, "File not found")

    context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=forbid_confidence,
        verify_action=forbid_verification,
        recovery_context=prior_recovery,
        auto_recover=False,
    )
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))

    single_line_hunk = {
        "old_start": 1,
        "old_lines": 1,
        "new_start": 1,
        "new_lines": 1,
        "lines": ["-a", "+b"],
    }
    patch_call = ToolCall(
        id="patch-1",
        name="patch",
        arguments={"file_path": "index.html", "hunks": [single_line_hunk]},
    )
    patch_outcome = tool_outcome(
        tool_call=patch_call,
        output="Patched index.html",
        is_error=False,
    )
    executor = FakeExecutor([patch_outcome])

    summary = TurnSummary(final_response="")
    await runner.execute_batch(
        tool_calls=[patch_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=summary,
        dod=create_definition_of_done("Fix the chapter links"),
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert context.recovery_context is None
551
552
@pytest.mark.asyncio
async def test_tool_batch_runner_queues_duplicate_observation_nudge(
    temp_dir: Path,
) -> None:
    """A duplicate read queues one persistent nudge towards the missing artifact.

    The implementation plan lists ``04-variables.html``, which is never
    created on disk. After a ``DUPLICATE`` read of ``index.html`` the single
    persistent steering message must point at creating that file, and no
    ephemeral message may be queued.
    """

    async def forbid_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def forbid_verification(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run for this scenario")

    index_path = temp_dir / "index.html"
    chapters_dir = temp_dir / "chapters"
    chapter_one_heading = "<h1>Chapter 1: Introduction to Fortran</h1>\n"

    # Transcript: a glob observation, a read of chapter 1, then a read of the index.
    glob_observation = Message(
        role=Role.TOOL,
        content=(
            "Observation [glob]: Result: "
            f"{temp_dir}/chapters/01-introduction.html\n"
            f"{temp_dir}/chapters/02-setup.html\n"
            f"{temp_dir}/chapters/03-basics.html"
        ),
        tool_results=[],
    )
    chapter_one_read = Message(
        role=Role.ASSISTANT,
        content="I already inspected the first chapter title.",
        tool_calls=[
            ToolCall(
                id="read-ch1",
                name="read",
                arguments={"file_path": str(chapters_dir / "01-introduction.html")},
            )
        ],
    )
    chapter_one_result = Message.tool_result_message(
        tool_call_id="read-ch1",
        display_content=chapter_one_heading,
        result_content=chapter_one_heading,
    )
    index_read = Message(
        role=Role.ASSISTANT,
        content="I should update the index now.",
        tool_calls=[
            ToolCall(
                id="read-index",
                name="read",
                arguments={"file_path": str(index_path)},
            )
        ],
    )

    context = build_context(
        temp_dir=temp_dir,
        messages=[glob_observation, chapter_one_read, chapter_one_result, index_read],
        safeguards=FakeSafeguards(),
        assess_confidence=forbid_confidence,
        verify_action=forbid_verification,
        auto_recover=False,
    )

    # Workspace on disk: the index plus chapters 1-3 (chapter 4 stays absent).
    chapters_dir.mkdir()
    index_path.write_text("<ul></ul>\n")
    for file_name, body in (
        ("01-introduction.html", "<h1>Intro</h1>\n"),
        ("02-setup.html", "<h1>Setup</h1>\n"),
        ("03-basics.html", "<h1>Basics</h1>\n"),
    ):
        (chapters_dir / file_name).write_text(body)

    implementation_plan = temp_dir / "implementation.md"
    plan_lines = [
        "# Implementation Plan",
        "",
        "## File Changes",
        f"- `{index_path}`",
        f"- `{chapters_dir / '01-introduction.html'}`",
        f"- `{chapters_dir / '02-setup.html'}`",
        f"- `{chapters_dir / '03-basics.html'}`",
        f"- `{chapters_dir / '04-variables.html'}`",
    ]
    implementation_plan.write_text("\n".join(plan_lines))

    context.session.current_task = f"Update {index_path} with the right chapter links."
    persistent_messages: list[str] = []
    ephemeral_messages: list[str] = []
    context.queue_steering_message_callback = persistent_messages.append
    context.queue_ephemeral_steering_message_callback = ephemeral_messages.append

    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))
    repeat_read = ToolCall(
        id="read-dup",
        name="read",
        arguments={"file_path": str(index_path)},
    )
    skip_notice = (
        "[Skipped - duplicate action: Already read "
        f"{index_path} recently without any intervening changes; "
        "reuse the earlier read result instead of rereading]"
    )
    duplicate_outcome = ToolExecutionOutcome(
        tool_call=repeat_read,
        state=ToolExecutionState.DUPLICATE,
        message=Message.tool_result_message(
            tool_call_id=repeat_read.id,
            display_content=skip_notice,
            result_content=skip_notice,
        ),
        event_content=skip_notice,
        is_error=False,
        result_output=skip_notice,
    )
    executor = FakeExecutor([duplicate_outcome])

    summary = TurnSummary(final_response="")
    dod = create_definition_of_done("Fix the chapter links")
    dod.implementation_plan = str(implementation_plan)
    dod.pending_items.append("Create the remaining chapter files")
    await runner.execute_batch(
        tool_calls=[repeat_read],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=summary,
        dod=dod,
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert len(persistent_messages) == 1
    nudge = persistent_messages[0]
    assert "Reuse the earlier observation instead of repeating it." in nudge
    assert "A declared output artifact is still missing." in nudge
    assert "Resume by creating `04-variables.html` now." in nudge
    write_hint = (
        "Prefer one `write` call for "
        f"`{display_runtime_path(chapters_dir / '04-variables.html')}` instead of more rereads."
    )
    assert write_hint in nudge
    assert ephemeral_messages == []
702
703
@pytest.mark.asyncio
async def test_tool_batch_runner_duplicate_read_keeps_root_declared_missing_html_output_active(
    temp_dir: Path,
) -> None:
    """A duplicate index read still nudges towards a linked but missing chapter.

    The index links to ``02-installation.html``, which is not on disk, while
    the plan only names the index and the chapters directory. The single
    persistent message must ask for that file and must not claim that every
    planned artifact already exists.
    """

    async def forbid_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should not run for this scenario")

    async def forbid_verification(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run for this scenario")

    # Guide layout on disk: an index linking two chapters, only one of which exists.
    guide_root = temp_dir / "guide"
    chapters = guide_root / "chapters"
    chapters.mkdir(parents=True)
    index = guide_root / "index.html"
    chapter_one = chapters / "01-introduction.html"
    index.write_text(
        '<a href="chapters/01-introduction.html">Intro</a>\n'
        '<a href="chapters/02-installation.html">Install</a>\n'
    )
    chapter_one.write_text("<h1>Intro</h1>\n")

    implementation_plan = temp_dir / "implementation.md"
    plan_lines = [
        "# Implementation Plan",
        "",
        "## File Changes",
        f"- `{index}`",
        f"- `{chapters}/` (directory for chapter files)",
    ]
    implementation_plan.write_text("\n".join(plan_lines))

    earlier_index_read = Message(
        role=Role.ASSISTANT,
        content="I should keep building the guide.",
        tool_calls=[
            ToolCall(
                id="read-index",
                name="read",
                arguments={"file_path": str(index)},
            )
        ],
    )
    context = build_context(
        temp_dir=temp_dir,
        messages=[earlier_index_read],
        safeguards=FakeSafeguards(),
        assess_confidence=forbid_confidence,
        verify_action=forbid_verification,
        auto_recover=False,
    )
    context.session.current_task = f"Build the guide rooted at {index}."
    persistent_messages: list[str] = []
    ephemeral_messages: list[str] = []
    context.queue_steering_message_callback = persistent_messages.append
    context.queue_ephemeral_steering_message_callback = ephemeral_messages.append

    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))
    repeat_read = ToolCall(
        id="read-dup-rooted",
        name="read",
        arguments={"file_path": str(index)},
    )
    skip_notice = (
        "[Skipped - duplicate action: Already read "
        f"{index} recently without any intervening changes; "
        "reuse the earlier read result instead of rereading]"
    )
    duplicate_outcome = ToolExecutionOutcome(
        tool_call=repeat_read,
        state=ToolExecutionState.DUPLICATE,
        message=Message.tool_result_message(
            tool_call_id=repeat_read.id,
            display_content=skip_notice,
            result_content=skip_notice,
        ),
        event_content=skip_notice,
        is_error=False,
        result_output=skip_notice,
    )
    executor = FakeExecutor([duplicate_outcome])

    summary = TurnSummary(final_response="")
    dod = create_definition_of_done("Create a multi-file HTML guide with chapters.")
    dod.implementation_plan = str(implementation_plan)
    dod.touched_files = [str(index), str(chapter_one)]
    dod.completed_items = ["Create chapter files with appropriate content"]
    dod.pending_items.append("Create the remaining chapter files")

    await runner.execute_batch(
        tool_calls=[repeat_read],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=summary,
        dod=dod,
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert len(persistent_messages) == 1
    nudge = persistent_messages[0]
    assert "Create the remaining chapter files" in nudge
    assert "Resume by creating `02-installation.html` now." in nudge
    assert "All explicitly planned artifacts already exist on disk." not in nudge
    assert ephemeral_messages == []
827
828
@pytest.mark.asyncio
async def test_tool_batch_runner_todo_write_does_not_regress_completed_file_todo(
    temp_dir: Path,
) -> None:
    """A stale ``TodoWrite`` must not reopen a todo satisfied earlier in the batch.

    The batch writes ``03-first-website.html`` and then replays a todo list
    that still marks both chapters pending. Afterwards chapter 3 must be
    completed and chapter 4 must still be pending.
    """

    async def forbid_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should not run for this scenario")

    async def forbid_verification(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run for this scenario")

    def all_pending_todos() -> list[dict[str, str]]:
        # Fresh dicts on every call so the three consumers never share state.
        return [
            {
                "content": "Create 03-first-website.html",
                "active_form": "Creating 03-first-website.html",
                "status": "pending",
            },
            {
                "content": "Create 04-configuration-basics.html",
                "active_form": "Creating 04-configuration-basics.html",
                "status": "pending",
            },
        ]

    context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=forbid_confidence,
        verify_action=forbid_verification,
        auto_recover=False,
    )
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))
    dod = create_definition_of_done("Create a multi-file nginx guide.")
    sync_todos_to_definition_of_done(dod, all_pending_todos())

    chapter_path = temp_dir / "guides" / "nginx" / "chapters" / "03-first-website.html"
    chapter_path.parent.mkdir(parents=True)
    write_call = ToolCall(
        id="write-ch3",
        name="write",
        arguments={"file_path": str(chapter_path), "content": "<html></html>\n"},
    )
    stale_todo_call = ToolCall(
        id="todo-stale",
        name="TodoWrite",
        arguments={"todos": all_pending_todos()},
    )

    write_outcome = tool_outcome(
        tool_call=write_call,
        output=f"Successfully wrote {chapter_path}",
        is_error=False,
    )
    stale_todo_outcome = tool_outcome(
        tool_call=stale_todo_call,
        output="Todos updated",
        is_error=False,
        metadata={"new_todos": all_pending_todos()},
    )
    executor = FakeExecutor([write_outcome, stale_todo_outcome])

    summary = TurnSummary(final_response="")
    await runner.execute_batch(
        tool_calls=[write_call, stale_todo_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=summary,
        dod=dod,
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    written_todo = "Create 03-first-website.html"
    assert written_todo in dod.completed_items
    assert written_todo not in dod.pending_items
    assert "Create 04-configuration-basics.html" in dod.pending_items
946
947
@pytest.mark.asyncio
async def test_tool_batch_runner_proactively_queues_verified_html_inventory(
    temp_dir: Path,
) -> None:
    """A successful chapter glob queues nothing and leaves the result untouched.

    The test asserts that no persistent or ephemeral steering message is
    queued, and that the single tool result does not contain the
    "Verified chapter inventory:" text.
    """
    # NOTE(review): the test name says "proactively queues", but the assertions
    # pin that nothing is queued — confirm the name still matches the intent.

    async def forbid_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def forbid_verification(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run for this scenario")

    # Workspace on disk: two chapter files and an index with an empty list.
    chapters = temp_dir / "chapters"
    chapters.mkdir()
    intro_path = chapters / "01-introduction.html"
    setup_path = chapters / "02-setup.html"
    index_path = temp_dir / "index.html"
    intro_path.write_text("<h1>Chapter 1: Introduction to Fortran</h1>\n")
    setup_path.write_text("<h1>Chapter 2: Setting Up Your Environment</h1>\n")
    index_path.write_text("<ul></ul>\n")

    context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=forbid_confidence,
        verify_action=forbid_verification,
        auto_recover=False,
    )
    context.session.current_task = (
        f"Update {index_path} so the chapter links match the sibling files."
    )
    persistent_messages: list[str] = []
    ephemeral_messages: list[str] = []
    context.queue_steering_message_callback = persistent_messages.append
    context.queue_ephemeral_steering_message_callback = ephemeral_messages.append

    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))
    glob_call = ToolCall(
        id="glob-1",
        name="glob",
        arguments={"path": str(chapters), "pattern": "*.html"},
    )
    glob_listing = "\n".join([str(intro_path), str(setup_path)])
    executor = FakeExecutor(
        [tool_outcome(tool_call=glob_call, output=glob_listing, is_error=False)]
    )

    summary = TurnSummary(final_response="")
    await runner.execute_batch(
        tool_calls=[glob_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=summary,
        dod=create_definition_of_done("Fix the chapter links"),
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert persistent_messages == []
    assert ephemeral_messages == []
    assert len(summary.tool_result_messages) == 1
    only_result = summary.tool_result_messages[0]
    assert "Verified chapter inventory:" not in only_result.content
1032
1033
@pytest.mark.asyncio
async def test_tool_batch_runner_marks_validated_html_toc_completion_after_successful_edit(
    temp_dir: Path,
) -> None:
    """A successful TOC edit adds no verification preview and no steering.

    The index file on disk already contains the corrected chapter list when
    the scripted ``edit`` outcome is processed. The batch must finish without
    a "Semantic verification preview:" in any tool result and without queueing
    persistent or ephemeral steering messages.
    """

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        # Guard: reaching this hook fails the test.
        raise AssertionError("Verification should not run for this scenario")

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        # Guard: reaching this hook fails the test.
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    # Real chapter files that the table of contents is supposed to mirror.
    chapter_dir = temp_dir / "chapters"
    chapter_dir.mkdir()
    for file_name, heading in (
        ("01-introduction.html", "Chapter 1: Introduction to Fortran"),
        ("02-setup.html", "Chapter 2: Setting Up Your Environment"),
    ):
        (chapter_dir / file_name).write_text(f"<h1>{heading}</h1>\n")

    toc_path = temp_dir / "index.html"
    stale_toc = (
        '<ul class="chapter-list">\n'
        ' <li><a href="chapters/01-old.html">Chapter 1: Old</a></li>\n'
        ' <li><a href="chapters/02-old.html">Chapter 2: Old</a></li>\n'
        "</ul>\n"
    )
    fixed_toc = (
        '<ul class="chapter-list">\n'
        ' <li><a href="chapters/01-introduction.html">Chapter 1: Introduction to Fortran</a></li>\n'
        ' <li><a href="chapters/02-setup.html">Chapter 2: Setting Up Your Environment</a></li>\n'
        "</ul>\n"
    )
    # The file is written in its post-edit state; the executor below is scripted.
    toc_path.write_text(fixed_toc)

    task = (
        "Update index.html so every chapter link and title matches the real HTML files in chapters/."
    )
    ctx = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )
    ctx.session.current_task = task

    # Capture every steering message the runner tries to queue.
    persistent: list[str] = []
    ephemeral: list[str] = []
    ctx.queue_steering_message_callback = persistent.append
    ctx.queue_ephemeral_steering_message_callback = ephemeral.append

    batch_runner = ToolBatchRunner(ctx, DefinitionOfDoneStore(temp_dir))
    edit_call = ToolCall(
        id="edit-1",
        name="edit",
        arguments={
            "file_path": str(toc_path),
            "old_string": stale_toc,
            "new_string": fixed_toc,
        },
    )
    scripted_outcome = tool_outcome(
        tool_call=edit_call,
        output=f"Successfully edited {toc_path}",
        is_error=False,
    )
    fake_executor = FakeExecutor([scripted_outcome])

    turn_summary = TurnSummary(final_response="")
    await batch_runner.execute_batch(
        tool_calls=[edit_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=turn_summary,
        dod=create_definition_of_done(task),
        executor=fake_executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert not any(
        "Semantic verification preview:" in result_message.content
        for result_message in turn_summary.tool_result_messages
    )
    assert persistent == []
    assert ephemeral == []
1134
1135
@pytest.mark.asyncio
async def test_tool_batch_runner_does_not_apply_html_toc_handoff_to_reference_read(
    temp_dir: Path,
) -> None:
    """Reading an index file for reference triggers no TOC handoff.

    The task prompt asks to study an existing guide rather than fix its table
    of contents. A plain ``read`` of ``index.html`` must queue no steering
    messages and add no "Semantic verification preview:" to the tool results.
    """

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        # Guard: reaching this hook fails the test.
        raise AssertionError("Verification should not run for this scenario")

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        # Guard: reaching this hook fails the test.
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    chapter_dir = temp_dir / "chapters"
    chapter_dir.mkdir()
    for file_name, heading in (
        ("01-introduction.html", "Chapter 1: Introduction to Fortran"),
        ("02-setup.html", "Chapter 2: Setting Up Your Environment"),
    ):
        (chapter_dir / file_name).write_text(f"<h1>{heading}</h1>\n")

    toc_path = temp_dir / "index.html"
    toc_html = (
        "<h2>Table of Contents</h2>\n"
        '<ul class="chapter-list">\n'
        ' <li><a href="chapters/01-introduction.html">Chapter 1: Introduction to Fortran</a></li>\n'
        ' <li><a href="chapters/02-setup.html">Chapter 2: Setting Up Your Environment</a></li>\n'
        "</ul>\n"
    )
    toc_path.write_text(toc_html)

    prompt = (
        "Have a look at ~/Loader/guides/fortran and chapters/ within. Get a feel "
        "for the structure and cadence of the guide. We are going to make an all "
        "new equally thorough guide on how to use the nginx tool."
    )

    ctx = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )
    ctx.session.current_task = prompt  # type: ignore[attr-defined]

    # Capture every steering message the runner tries to queue.
    persistent: list[str] = []
    ephemeral: list[str] = []
    ctx.queue_steering_message_callback = persistent.append
    ctx.queue_ephemeral_steering_message_callback = ephemeral.append

    batch_runner = ToolBatchRunner(ctx, DefinitionOfDoneStore(temp_dir))
    read_call = ToolCall(
        id="read-index",
        name="read",
        arguments={"file_path": str(toc_path)},
    )
    scripted_outcome = tool_outcome(
        tool_call=read_call,
        output=toc_path.read_text(),
        is_error=False,
    )
    fake_executor = FakeExecutor([scripted_outcome])

    turn_summary = TurnSummary(final_response="")
    await batch_runner.execute_batch(
        tool_calls=[read_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=turn_summary,
        dod=create_definition_of_done(prompt),
        executor=fake_executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert persistent == []
    assert ephemeral == []
    assert not any(
        "Semantic verification preview:" in result_message.content
        for result_message in turn_summary.tool_result_messages
    )
1228
1229
@pytest.mark.asyncio
async def test_tool_batch_runner_queues_next_pending_todo_after_discovery_progress(
    temp_dir: Path,
) -> None:
    """A reference read completes the discovery todo and steers to the next one.

    After the scripted ``read`` of the reference chapter, the first todo must
    appear in ``completed_items`` and a persistent steering message must name
    the next pending todo and tell the agent to create ``chapters/``. The
    reference file name must not appear in any steering message.
    """

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        # Guard: reaching this hook fails the test.
        raise AssertionError("Verification should not run for this scenario")

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        # Guard: reaching this hook fails the test.
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    reference_html = "<h1>Introduction</h1>\n<p>Guide cadence.</p>\n"
    reference_path = temp_dir / "fortran" / "chapters" / "01-introduction.html"
    reference_path.parent.mkdir(parents=True)
    reference_path.write_text(reference_html)

    # Planned output locations; nothing under nginx_root is created on disk.
    nginx_root = temp_dir / "Loader" / "guides" / "nginx"
    chapter_dir = nginx_root / "chapters"
    plan_path = temp_dir / "implementation.md"
    plan_lines = [
        "# Implementation Plan",
        "",
        "## File Changes",
        f"- `{chapter_dir}/`",
        f"- `{nginx_root / 'index.html'}`",
        "",
    ]
    plan_path.write_text("\n".join(plan_lines))

    ctx = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )

    # Capture every steering message the runner tries to queue.
    persistent: list[str] = []
    ephemeral: list[str] = []
    ctx.queue_steering_message_callback = persistent.append
    ctx.queue_ephemeral_steering_message_callback = ephemeral.append

    batch_runner = ToolBatchRunner(ctx, DefinitionOfDoneStore(temp_dir))
    definition = create_definition_of_done("Create an equally thorough nginx guide.")
    definition.implementation_plan = str(plan_path)

    discovery_todo = (
        "Examine the existing Fortran guide structure to understand the cadence and format"
    )
    directory_todo = "Create the nginx directory structure"
    todo_entries = [
        {"content": text, "active_form": f"Working on: {text}", "status": "pending"}
        for text in (
            discovery_todo,
            directory_todo,
            "Create the nginx index.html file",
        )
    ]
    sync_todos_to_definition_of_done(definition, todo_entries)

    read_call = ToolCall(
        id="read-reference",
        name="read",
        arguments={"file_path": str(reference_path)},
    )
    scripted_outcome = tool_outcome(
        tool_call=read_call,
        output=reference_html,
        is_error=False,
    )
    fake_executor = FakeExecutor([scripted_outcome])

    turn_summary = TurnSummary(final_response="")
    await batch_runner.execute_batch(
        tool_calls=[read_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=turn_summary,
        dod=definition,
        executor=fake_executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert discovery_todo in definition.completed_items
    next_item_hint = f"Continue with the next pending item: `{directory_todo}`"
    assert any(next_item_hint in queued for queued in persistent)
    assert any("Resume by creating `chapters/` now." in queued for queued in persistent)
    assert not any("01-introduction.html" in queued for queued in persistent)
    assert ephemeral == []
1348
1349
@pytest.mark.asyncio
async def test_tool_batch_runner_queues_setup_directory_before_file_when_plan_lists_index_first(
    temp_dir: Path,
) -> None:
    """Directory setup is steered before the index file, even if the plan lists it second.

    The implementation plan names ``index.html`` ahead of ``chapters/``. The
    persistent steering must still point at the directory todo and at creating
    ``chapters/``, and must not say "Next step: create `index.html`.".
    """

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        # Guard: reaching this hook fails the test.
        raise AssertionError("Verification should not run for this scenario")

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        # Guard: reaching this hook fails the test.
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    reference_html = "<h1>Introduction</h1>\n<p>Guide cadence.</p>\n"
    reference_path = temp_dir / "fortran" / "chapters" / "01-introduction.html"
    reference_path.parent.mkdir(parents=True)
    reference_path.write_text(reference_html)

    # Planned output locations; the index file is deliberately listed first.
    nginx_root = temp_dir / "Loader" / "guides" / "nginx"
    chapter_dir = nginx_root / "chapters"
    plan_path = temp_dir / "implementation.md"
    plan_lines = [
        "# Implementation Plan",
        "",
        "## File Changes",
        f"- `{nginx_root / 'index.html'}`",
        f"- `{chapter_dir}/`",
        "",
    ]
    plan_path.write_text("\n".join(plan_lines))

    ctx = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )

    # Capture every steering message the runner tries to queue.
    persistent: list[str] = []
    ephemeral: list[str] = []
    ctx.queue_steering_message_callback = persistent.append
    ctx.queue_ephemeral_steering_message_callback = ephemeral.append

    batch_runner = ToolBatchRunner(ctx, DefinitionOfDoneStore(temp_dir))
    definition = create_definition_of_done("Create an equally thorough nginx guide.")
    definition.implementation_plan = str(plan_path)

    directory_todo = "Create the nginx directory structure"
    todo_entries = [
        {"content": text, "active_form": f"Working on: {text}", "status": "pending"}
        for text in (
            "Examine the existing Fortran guide structure to understand the cadence and format",
            directory_todo,
            "Create the nginx index.html file",
        )
    ]
    sync_todos_to_definition_of_done(
        definition,
        todo_entries,
        project_root=temp_dir,
    )

    read_call = ToolCall(
        id="read-reference-index-first",
        name="read",
        arguments={"file_path": str(reference_path)},
    )
    scripted_outcome = tool_outcome(
        tool_call=read_call,
        output=reference_html,
        is_error=False,
    )
    fake_executor = FakeExecutor([scripted_outcome])

    turn_summary = TurnSummary(final_response="")
    await batch_runner.execute_batch(
        tool_calls=[read_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=turn_summary,
        dod=definition,
        executor=fake_executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert persistent
    next_item_hint = f"Continue with the next pending item: `{directory_todo}`"
    assert any(next_item_hint in queued for queued in persistent)
    assert any("Resume by creating `chapters/` now." in queued for queued in persistent)
    assert not any("Next step: create `index.html`." in queued for queued in persistent)
    assert ephemeral == []
1469
1470
@pytest.mark.asyncio
async def test_tool_batch_runner_duplicate_reference_read_prefers_next_pending_todo(
    temp_dir: Path,
) -> None:
    """A duplicate reference read yields one nudge toward the next pending todo.

    The discovery todo is already completed and the executor reports the read
    as a duplicate. Exactly one persistent steering message must be queued. It
    must tell the agent to reuse the earlier observation and continue with the
    directory todo, and it must not contain an "Update `" instruction.
    """

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        # Guard: reaching this hook fails the test.
        raise AssertionError("Verification should not run for this scenario")

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        # Guard: reaching this hook fails the test.
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    reference_path = temp_dir / "fortran" / "index.html"
    reference_path.parent.mkdir(parents=True)
    reference_path.write_text("<h1>Fortran Beginner's Guide</h1>\n")

    prompt = (
        "Have a look at ~/Loader/guides/fortran and chapters/ within. Get a feel "
        "for the structure and cadence of the guide. We are going to make an all "
        "new equally thorough guide on how to use the nginx tool."
    )
    # Session history already holds the earlier read observation.
    prior_messages = [
        Message(
            role=Role.TOOL,
            content=(
                "Observation [read]: Result: "
                "<h1>Fortran Beginner's Guide</h1>\n"
            ),
        )
    ]
    ctx = build_context(
        temp_dir=temp_dir,
        messages=prior_messages,
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )
    ctx.session.current_task = prompt

    # Capture every steering message the runner tries to queue.
    persistent: list[str] = []
    ephemeral: list[str] = []
    ctx.queue_steering_message_callback = persistent.append
    ctx.queue_ephemeral_steering_message_callback = ephemeral.append

    batch_runner = ToolBatchRunner(ctx, DefinitionOfDoneStore(temp_dir))
    definition = create_definition_of_done(prompt)

    directory_todo = "Create the nginx directory structure"
    todo_entries = [
        {"content": text, "active_form": f"Working on: {text}", "status": status}
        for text, status in (
            (
                "Examine the existing Fortran guide structure to understand the cadence and format",
                "completed",
            ),
            (directory_todo, "pending"),
            ("Create the nginx index.html file", "pending"),
        )
    ]
    sync_todos_to_definition_of_done(definition, todo_entries)

    read_call = ToolCall(
        id="read-dup",
        name="read",
        arguments={"file_path": str(reference_path)},
    )
    skip_notice = (
        "[Skipped - duplicate action: Already read "
        f"{reference_path} recently without any intervening changes; "
        "reuse the earlier read result instead of rereading]"
    )
    skip_result = Message.tool_result_message(
        tool_call_id=read_call.id,
        display_content=skip_notice,
        result_content=skip_notice,
    )
    duplicate_outcome = ToolExecutionOutcome(
        tool_call=read_call,
        state=ToolExecutionState.DUPLICATE,
        message=skip_result,
        event_content=skip_notice,
        is_error=False,
        result_output=skip_notice,
    )
    fake_executor = FakeExecutor([duplicate_outcome])

    turn_summary = TurnSummary(final_response="")
    await batch_runner.execute_batch(
        tool_calls=[read_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=turn_summary,
        dod=definition,
        executor=fake_executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert len(persistent) == 1
    nudge = persistent[0]
    assert "Reuse the earlier observation instead of repeating it." in nudge
    assert f"Continue with the next pending item: `{directory_todo}`" in nudge
    assert "Update `" not in nudge
    assert ephemeral == []
1593
1594
@pytest.mark.asyncio
async def test_tool_batch_runner_successful_reference_read_prioritizes_concrete_missing_artifact(
    temp_dir: Path,
) -> None:
    """A missing planned file takes priority over the next generic todo.

    Chapter one exists on disk, but the planned ``index.html`` does not. After
    a successful reference read, persistent steering must confirm the
    discovery todo and tell the agent to create ``index.html``. It must not
    fall back to the generic "Create each chapter file" todo.
    """

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        # Guard: reaching this hook fails the test.
        raise AssertionError("Verification should not run for this scenario")

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        # Guard: reaching this hook fails the test.
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    # Partially built guide: chapter one exists, the index file is never written.
    guide_root = temp_dir / "Loader" / "guides" / "nginx"
    chapter_dir = guide_root / "chapters"
    chapter_dir.mkdir(parents=True)
    first_chapter = chapter_dir / "01-introduction.html"
    first_chapter.write_text("<html></html>\n")
    toc_path = guide_root / "index.html"

    reference_path = (
        temp_dir / "Loader" / "guides" / "fortran" / "chapters" / "01-introduction.html"
    )
    reference_path.parent.mkdir(parents=True, exist_ok=True)
    reference_path.write_text("<h1>Introduction</h1>\n<p>Guide cadence.</p>\n")

    plan_path = temp_dir / "implementation.md"
    plan_lines = [
        "# Implementation Plan",
        "",
        "## File Changes",
        f"- `{guide_root}/`",
        f"- `{chapter_dir}/`",
        f"- `{toc_path}`",
        f"- `{first_chapter}`",
        f"- `{chapter_dir / '02-installation.html'}`",
        "",
    ]
    plan_path.write_text("\n".join(plan_lines))

    ctx = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )

    # Capture every steering message the runner tries to queue.
    persistent: list[str] = []
    ephemeral: list[str] = []
    ctx.queue_steering_message_callback = persistent.append
    ctx.queue_ephemeral_steering_message_callback = ephemeral.append

    batch_runner = ToolBatchRunner(ctx, DefinitionOfDoneStore(temp_dir))
    definition = create_definition_of_done("Create a multi-file nginx guide.")
    definition.implementation_plan = str(plan_path)
    definition.touched_files.append(str(first_chapter))

    discovery_todo = (
        "Examine the existing Fortran guide structure to understand the format and cadence"
    )
    chapters_todo = "Create each chapter file with appropriate content"
    todo_entries = [
        {"content": text, "active_form": f"Working on: {text}", "status": "pending"}
        for text in (
            discovery_todo,
            chapters_todo,
            "Ensure all files follow the same structure and style as the Fortran guide",
        )
    ]
    sync_todos_to_definition_of_done(definition, todo_entries)

    read_call = ToolCall(
        id="read-reference-chapter",
        name="read",
        arguments={"file_path": str(reference_path)},
    )
    observation = "Observation [read]: Result: <h1>Introduction</h1>\n<p>Guide cadence.</p>\n"
    observation_message = Message.tool_result_message(
        tool_call_id=read_call.id,
        display_content=observation,
        result_content=observation,
    )
    executed_outcome = ToolExecutionOutcome(
        tool_call=read_call,
        state=ToolExecutionState.EXECUTED,
        message=observation_message,
        event_content=observation,
        is_error=False,
        result_output=observation,
    )
    fake_executor = FakeExecutor([executed_outcome])

    turn_summary = TurnSummary(final_response="")
    await batch_runner.execute_batch(
        tool_calls=[read_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=turn_summary,
        dod=definition,
        executor=fake_executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert persistent
    assert any(f"Confirmed progress: `{discovery_todo}`" in queued for queued in persistent)
    assert any("Resume by creating `index.html` now." in queued for queued in persistent)
    generic_hint = f"Continue with the next pending item: `{chapters_todo}`"
    assert not any(generic_hint in queued for queued in persistent)
    assert ephemeral == []
1729
1730
@pytest.mark.asyncio
async def test_tool_batch_runner_duplicate_read_ignores_unplanned_expansion_after_plan_complete(
    temp_dir: Path,
) -> None:
    """Once the plan is built, a duplicate read does not steer to unplanned files.

    Every artifact in the implementation plan exists on disk. The pending list
    still holds a creation item for a file the plan never mentions. The single
    persistent steering message must reference the verification item and must
    not mention the unplanned creation item.
    """

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        # Guard: reaching this hook fails the test.
        raise AssertionError("Verification should not run for this scenario")

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        # Guard: reaching this hook fails the test.
        raise AssertionError("Confidence scoring should not run for this scenario")

    # Fully built guide: every planned path below exists on disk.
    guide_root = temp_dir / "guides" / "nginx"
    chapter_dir = guide_root / "chapters"
    guide_root.mkdir(parents=True)
    chapter_dir.mkdir()
    toc_path = guide_root / "index.html"
    first_chapter = chapter_dir / "01-getting-started.html"
    second_chapter = chapter_dir / "02-installation.html"
    for target, body in (
        (toc_path, "<html></html>\n"),
        (first_chapter, "<h1>One</h1>\n"),
        (second_chapter, "<h1>Two</h1>\n"),
    ):
        target.write_text(body)

    plan_path = temp_dir / "implementation.md"
    planned_entries = (
        f"{guide_root}/",
        f"{chapter_dir}/",
        f"{toc_path}",
        f"{first_chapter}",
        f"{second_chapter}",
    )
    plan_lines = [
        "# Implementation Plan",
        "",
        "## File Changes",
        *(f"- `{entry}`" for entry in planned_entries),
        "",
    ]
    plan_path.write_text("\n".join(plan_lines))

    ctx = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )

    # Capture every steering message the runner tries to queue.
    persistent: list[str] = []
    ephemeral: list[str] = []
    ctx.queue_steering_message_callback = persistent.append
    ctx.queue_ephemeral_steering_message_callback = ephemeral.append

    batch_runner = ToolBatchRunner(ctx, DefinitionOfDoneStore(temp_dir))
    definition = create_definition_of_done("Create a multi-file nginx guide.")
    definition.implementation_plan = str(plan_path)
    unplanned_item = "Create 07-performance-tuning.html"
    verify_item = "Verify all guide files are linked and complete"
    definition.pending_items = [
        unplanned_item,
        verify_item,
        "Complete the requested work",
    ]

    read_call = ToolCall(
        id="read-dup",
        name="read",
        arguments={"file_path": str(first_chapter)},
    )
    skip_notice = (
        "[Skipped - duplicate action: Already read "
        f"{first_chapter} recently without any intervening changes; "
        "reuse the earlier read result instead of rereading]"
    )
    skip_result = Message.tool_result_message(
        tool_call_id=read_call.id,
        display_content=skip_notice,
        result_content=skip_notice,
    )
    duplicate_outcome = ToolExecutionOutcome(
        tool_call=read_call,
        state=ToolExecutionState.DUPLICATE,
        message=skip_result,
        event_content=skip_notice,
        is_error=False,
        result_output=skip_notice,
    )
    fake_executor = FakeExecutor([duplicate_outcome])

    turn_summary = TurnSummary(final_response="")
    await batch_runner.execute_batch(
        tool_calls=[read_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=turn_summary,
        dod=definition,
        executor=fake_executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert len(persistent) == 1
    nudge = persistent[0]
    assert verify_item in nudge
    assert unplanned_item not in nudge
    assert ephemeral == []
1845
1846
@pytest.mark.asyncio
async def test_tool_batch_runner_duplicate_read_after_plan_complete_pushes_verification_handoff(
    temp_dir: Path,
) -> None:
    """A duplicate read after the plan is built hands off to verification.

    All planned artifacts exist and a verification command is configured. The
    single persistent steering message must state that the planned artifacts
    already exist and direct the agent to verification or final confirmation.
    It must not mention the unplanned creation item still in the pending list.
    """

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        # Guard: reaching this hook fails the test.
        raise AssertionError("Verification should not run for this scenario")

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        # Guard: reaching this hook fails the test.
        raise AssertionError("Confidence scoring should not run for this scenario")

    # Fully built guide: every planned path below exists on disk.
    guide_root = temp_dir / "guides" / "nginx"
    chapter_dir = guide_root / "chapters"
    guide_root.mkdir(parents=True)
    chapter_dir.mkdir()
    toc_path = guide_root / "index.html"
    first_chapter = chapter_dir / "01-getting-started.html"
    second_chapter = chapter_dir / "02-installation.html"
    for target, body in (
        (toc_path, "<html></html>\n"),
        (first_chapter, "<h1>One</h1>\n"),
        (second_chapter, "<h1>Two</h1>\n"),
    ):
        target.write_text(body)

    plan_path = temp_dir / "implementation.md"
    planned_entries = (
        f"{guide_root}/",
        f"{chapter_dir}/",
        f"{toc_path}",
        f"{first_chapter}",
        f"{second_chapter}",
    )
    plan_lines = [
        "# Implementation Plan",
        "",
        "## File Changes",
        *(f"- `{entry}`" for entry in planned_entries),
        "",
    ]
    plan_path.write_text("\n".join(plan_lines))

    ctx = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )

    # Capture every steering message the runner tries to queue.
    persistent: list[str] = []
    ephemeral: list[str] = []
    ctx.queue_steering_message_callback = persistent.append
    ctx.queue_ephemeral_steering_message_callback = ephemeral.append

    batch_runner = ToolBatchRunner(ctx, DefinitionOfDoneStore(temp_dir))
    definition = create_definition_of_done("Create a multi-file nginx guide.")
    definition.implementation_plan = str(plan_path)
    definition.verification_commands = [f"ls -la {guide_root}"]
    unplanned_item = "Create 07-performance-tuning.html"
    definition.pending_items = [
        unplanned_item,
        "Complete the requested work",
    ]

    read_call = ToolCall(
        id="read-dup",
        name="read",
        arguments={"file_path": str(first_chapter)},
    )
    skip_notice = (
        "[Skipped - duplicate action: Already read "
        f"{first_chapter} recently without any intervening changes; "
        "reuse the earlier read result instead of rereading]"
    )
    skip_result = Message.tool_result_message(
        tool_call_id=read_call.id,
        display_content=skip_notice,
        result_content=skip_notice,
    )
    duplicate_outcome = ToolExecutionOutcome(
        tool_call=read_call,
        state=ToolExecutionState.DUPLICATE,
        message=skip_result,
        event_content=skip_notice,
        is_error=False,
        result_output=skip_notice,
    )
    fake_executor = FakeExecutor([duplicate_outcome])

    turn_summary = TurnSummary(final_response="")
    await batch_runner.execute_batch(
        tool_calls=[read_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=turn_summary,
        dod=definition,
        executor=fake_executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert len(persistent) == 1
    handoff = persistent[0]
    assert "All explicitly planned artifacts already exist on disk." in handoff
    assert (
        "Move to verification or final confirmation using the files already on disk."
        in handoff
    )
    assert unplanned_item not in handoff
    assert ephemeral == []
1965
1966
@pytest.mark.asyncio
async def test_tool_batch_runner_duplicate_read_after_plan_complete_ignores_stale_creation_todos(
    temp_dir: Path,
) -> None:
    """Stale "create" todos for files that already exist are not surfaced.

    All planned artifacts exist on disk, yet the pending list still carries
    creation items for two of them. The single persistent steering message
    must hand off to verification and must mention neither stale item.
    """

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        # Guard: reaching this hook fails the test.
        raise AssertionError("Verification should not run for this scenario")

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        # Guard: reaching this hook fails the test.
        raise AssertionError("Confidence scoring should not run for this scenario")

    # Fully built guide: every planned path below exists on disk.
    guide_root = temp_dir / "guides" / "nginx"
    chapter_dir = guide_root / "chapters"
    guide_root.mkdir(parents=True)
    chapter_dir.mkdir()
    toc_path = guide_root / "index.html"
    first_chapter = chapter_dir / "01-getting-started.html"
    second_chapter = chapter_dir / "02-installation.html"
    for target, body in (
        (toc_path, "<html></html>\n"),
        (first_chapter, "<h1>One</h1>\n"),
        (second_chapter, "<h1>Two</h1>\n"),
    ):
        target.write_text(body)

    plan_path = temp_dir / "implementation.md"
    planned_entries = (
        f"{guide_root}/",
        f"{chapter_dir}/",
        f"{toc_path}",
        f"{first_chapter}",
        f"{second_chapter}",
    )
    plan_lines = [
        "# Implementation Plan",
        "",
        "## File Changes",
        *(f"- `{entry}`" for entry in planned_entries),
        "",
    ]
    plan_path.write_text("\n".join(plan_lines))

    ctx = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )

    # Capture every steering message the runner tries to queue.
    persistent: list[str] = []
    ephemeral: list[str] = []
    ctx.queue_steering_message_callback = persistent.append
    ctx.queue_ephemeral_steering_message_callback = ephemeral.append

    batch_runner = ToolBatchRunner(ctx, DefinitionOfDoneStore(temp_dir))
    definition = create_definition_of_done("Create a multi-file nginx guide.")
    definition.implementation_plan = str(plan_path)
    definition.verification_commands = [f"ls -la {guide_root}"]
    # Both creation items refer to chapter files that were written above.
    stale_items = (
        "Create 01-getting-started.html",
        "Creating 02-installation.html",
    )
    definition.pending_items = [*stale_items, "Complete the requested work"]

    read_call = ToolCall(
        id="read-dup-built-stale",
        name="read",
        arguments={"file_path": str(first_chapter)},
    )
    skip_notice = (
        "[Skipped - duplicate action: Already read "
        f"{first_chapter} recently without any intervening changes; "
        "reuse the earlier read result instead of rereading]"
    )
    skip_result = Message.tool_result_message(
        tool_call_id=read_call.id,
        display_content=skip_notice,
        result_content=skip_notice,
    )
    duplicate_outcome = ToolExecutionOutcome(
        tool_call=read_call,
        state=ToolExecutionState.DUPLICATE,
        message=skip_result,
        event_content=skip_notice,
        is_error=False,
        result_output=skip_notice,
    )
    fake_executor = FakeExecutor([duplicate_outcome])

    turn_summary = TurnSummary(final_response="")
    await batch_runner.execute_batch(
        tool_calls=[read_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=turn_summary,
        dod=definition,
        executor=fake_executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert len(persistent) == 1
    handoff = persistent[0]
    assert "All explicitly planned artifacts already exist on disk." in handoff
    assert (
        "Move to verification or final confirmation using the files already on disk."
        in handoff
    )
    assert stale_items[0] not in handoff
    assert stale_items[1] not in handoff
    assert ephemeral == []
2087
2088
@pytest.mark.asyncio
async def test_tool_batch_runner_successful_read_after_plan_complete_pushes_review_handoff(
    temp_dir: Path,
) -> None:
    """Expect one ephemeral review handoff when a read succeeds after the plan is built.

    Setup: every artifact listed in the implementation plan already exists on
    disk, and the todo list holds one stale "create" item plus one review item.
    Expected: no persistent steering message, and exactly one ephemeral message
    that names the review item, omits the stale create item, and points toward
    verification.
    """

    async def forbid_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should not run for this scenario")

    async def forbid_verification(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run for this scenario")

    # Build the complete guide on disk: index plus two chapters.
    nginx_dir = temp_dir / "guides" / "nginx"
    chapters_dir = nginx_dir / "chapters"
    nginx_dir.mkdir(parents=True)
    chapters_dir.mkdir()
    index_file = nginx_dir / "index.html"
    first_chapter = chapters_dir / "01-getting-started.html"
    second_chapter = chapters_dir / "02-installation.html"
    for artifact, body in (
        (index_file, "<html></html>\n"),
        (first_chapter, "<h1>One</h1>\n"),
        (second_chapter, "<h1>Two</h1>\n"),
    ):
        artifact.write_text(body)

    # The plan lists exactly the artifacts created above.
    plan_lines = [
        "# Implementation Plan",
        "",
        "## File Changes",
        f"- `{nginx_dir}/`",
        f"- `{chapters_dir}/`",
        f"- `{index_file}`",
        f"- `{first_chapter}`",
        f"- `{second_chapter}`",
        "",
    ]
    plan_path = temp_dir / "implementation.md"
    plan_path.write_text("\n".join(plan_lines))

    runtime_context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=forbid_confidence,
        verify_action=forbid_verification,
        auto_recover=False,
    )
    # Capture both steering channels so the test can tell them apart.
    persistent_log: list[str] = []
    ephemeral_log: list[str] = []
    runtime_context.queue_steering_message_callback = persistent_log.append
    runtime_context.queue_ephemeral_steering_message_callback = ephemeral_log.append
    batch_runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))

    done_spec = create_definition_of_done("Create a multi-file nginx guide.")
    done_spec.implementation_plan = str(plan_path)
    done_spec.verification_commands = [f"ls -la {nginx_dir}"]
    stale_create_todo = {
        "content": "Create 01-getting-started.html",
        "active_form": "Creating 01-getting-started.html",
        "status": "pending",
    }
    review_todo = {
        "content": "Ensure all files are properly linked and formatted consistently",
        "active_form": "Reviewing guide consistency and linkage",
        "status": "pending",
    }
    sync_todos_to_definition_of_done(done_spec, [stale_create_todo, review_todo])

    read_call = ToolCall(
        id="read-built-review",
        name="read",
        arguments={"file_path": str(first_chapter)},
    )
    read_outcome = tool_outcome(
        tool_call=read_call,
        output=first_chapter.read_text(),
        is_error=False,
    )
    scripted_executor = FakeExecutor([read_outcome])

    await batch_runner.execute_batch(
        tool_calls=[read_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=TurnSummary(final_response=""),
        dod=done_spec,
        executor=scripted_executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert persistent_log == []
    assert len(ephemeral_log) == 1
    handoff = ephemeral_log[0]
    assert "All explicitly planned artifacts already exist." in handoff
    assert "Ensure all files are properly linked and formatted consistently" in handoff
    assert "Create 01-getting-started.html" not in handoff
    assert "do not keep broad-rereading the output set" in handoff
    assert "If no specific mismatch remains, move to verification now." in handoff
2200
2201
@pytest.mark.asyncio
async def test_tool_batch_runner_observation_handoff_pushes_mutation_step(
    temp_dir: Path,
) -> None:
    """Expect a persistent nudge toward the creation todo after a reference read.

    Setup: an "examine the reference guide" todo precedes a "create index.html"
    todo, and the batch is a single successful read of the reference chapter.
    Expected: some persistent message names the creation todo as the next
    pending item and tells the model to stop gathering reference material;
    nothing is queued on the ephemeral channel.
    """

    async def forbid_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def forbid_verification(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run for this scenario")

    # The reference chapter the assistant "reads" during the batch.
    reference_html = "<h1>Introduction</h1>\n<p>Guide cadence.</p>\n"
    reference_file = temp_dir / "fortran" / "chapters" / "01-introduction.html"
    reference_file.parent.mkdir(parents=True)
    reference_file.write_text(reference_html)

    runtime_context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=forbid_confidence,
        verify_action=forbid_verification,
        auto_recover=False,
    )
    persistent_log: list[str] = []
    ephemeral_log: list[str] = []
    runtime_context.queue_steering_message_callback = persistent_log.append
    runtime_context.queue_ephemeral_steering_message_callback = ephemeral_log.append
    batch_runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))

    done_spec = create_definition_of_done("Create a multi-file nginx guide.")
    observe_todo = {
        "content": "Examine the existing Fortran guide structure to understand the cadence and format",
        "active_form": "Working on: Examine the existing Fortran guide structure to understand the cadence and format",
        "status": "pending",
    }
    create_todo = {
        "content": "Create the nginx index.html file",
        "active_form": "Working on: Create the nginx index.html file",
        "status": "pending",
    }
    sync_todos_to_definition_of_done(done_spec, [observe_todo, create_todo])

    read_call = ToolCall(
        id="read-reference",
        name="read",
        arguments={"file_path": str(reference_file)},
    )
    read_outcome = tool_outcome(
        tool_call=read_call,
        output=reference_html,
        is_error=False,
    )
    scripted_executor = FakeExecutor([read_outcome])

    await batch_runner.execute_batch(
        tool_calls=[read_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=TurnSummary(final_response=""),
        dod=done_spec,
        executor=scripted_executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    next_item_phrase = (
        "Continue with the next pending item: `Create the nginx index.html file`"
    )
    stop_gathering_phrase = (
        "stop gathering more reference material and perform the change now"
    )
    assert any(next_item_phrase in queued for queued in persistent_log)
    assert any(stop_gathering_phrase in queued for queued in persistent_log)
    assert ephemeral_log == []
2294
2295
@pytest.mark.asyncio
async def test_tool_batch_runner_discovery_completion_handoff_stays_persistent(
    temp_dir: Path,
) -> None:
    """Expect the handoff after a discovery read to use the persistent channel.

    Setup: a discovery todo ("First, examine ...") is followed by a
    directory-creation todo, and the batch is one successful reference read.
    Expected: at least one persistent message names the directory-creation
    todo as the next pending item, and the ephemeral channel stays empty.
    """

    async def forbid_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def forbid_verification(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run for this scenario")

    reference_html = "<h1>Introduction</h1>\n<p>Guide cadence.</p>\n"
    reference_file = temp_dir / "fortran" / "chapters" / "01-introduction.html"
    reference_file.parent.mkdir(parents=True)
    reference_file.write_text(reference_html)

    runtime_context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=forbid_confidence,
        verify_action=forbid_verification,
        auto_recover=False,
    )
    persistent_log: list[str] = []
    ephemeral_log: list[str] = []
    runtime_context.queue_steering_message_callback = persistent_log.append
    runtime_context.queue_ephemeral_steering_message_callback = ephemeral_log.append
    batch_runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))

    done_spec = create_definition_of_done("Create a multi-file nginx guide.")
    discovery_todo = {
        "content": "First, examine the existing fortran guide structure and content",
        "active_form": "Working on: First, examine the existing fortran guide structure and content",
        "status": "pending",
    }
    mkdir_todo = {
        "content": "Create the nginx directory structure",
        "active_form": "Working on: Create the nginx directory structure",
        "status": "pending",
    }
    sync_todos_to_definition_of_done(done_spec, [discovery_todo, mkdir_todo])

    read_call = ToolCall(
        id="read-reference",
        name="read",
        arguments={"file_path": str(reference_file)},
    )
    read_outcome = tool_outcome(
        tool_call=read_call,
        output=reference_html,
        is_error=False,
    )
    scripted_executor = FakeExecutor([read_outcome])

    await batch_runner.execute_batch(
        tool_calls=[read_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=TurnSummary(final_response=""),
        dod=done_spec,
        executor=scripted_executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    next_item_phrase = (
        "Continue with the next pending item: `Create the nginx directory structure`"
    )
    assert persistent_log
    assert any(next_item_phrase in queued for queued in persistent_log)
    assert ephemeral_log == []
2385
2386
@pytest.mark.asyncio
async def test_tool_batch_runner_missing_artifact_nudge_names_next_file_after_setup_mkdir(
    temp_dir: Path,
) -> None:
    """Expect the steering after a setup ``mkdir`` to name the next planned file.

    Setup: the plan lists a chapters directory and ``index.html``; neither is
    created by the test itself. The batch is one successful ``mkdir -p`` bash
    call reported by the fake executor.
    Expected: the last persistent message says directory setup is complete,
    names ``index.html`` as the next step, and asks for a compact initial
    version; the ephemeral channel stays empty.
    """

    async def forbid_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def forbid_verification(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run for this scenario")

    nginx_dir = temp_dir / "Loader" / "guides" / "nginx"
    chapters_dir = nginx_dir / "chapters"
    plan_lines = [
        "# Implementation Plan",
        "",
        "## File Changes",
        f"- `{chapters_dir}/`",
        f"- `{nginx_dir / 'index.html'}`",
        "",
    ]
    plan_path = temp_dir / "implementation.md"
    plan_path.write_text("\n".join(plan_lines))

    runtime_context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=forbid_confidence,
        verify_action=forbid_verification,
        auto_recover=False,
    )
    persistent_log: list[str] = []
    ephemeral_log: list[str] = []
    runtime_context.queue_steering_message_callback = persistent_log.append
    runtime_context.queue_ephemeral_steering_message_callback = ephemeral_log.append
    batch_runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))

    done_spec = create_definition_of_done("Create a multi-file nginx guide.")
    done_spec.implementation_plan = str(plan_path)
    setup_todo = {
        "content": "Create the nginx directory structure",
        "active_form": "Creating the nginx directory structure",
        "status": "pending",
    }
    index_todo = {
        "content": "Develop the main index.html file with proper structure",
        "active_form": "Developing the main index.html file with proper structure",
        "status": "pending",
    }
    sync_todos_to_definition_of_done(done_spec, [setup_todo, index_todo])

    mkdir_call = ToolCall(
        id="mkdir-nginx",
        name="bash",
        arguments={"command": f"mkdir -p {chapters_dir}"},
    )
    mkdir_outcome = tool_outcome(tool_call=mkdir_call, output="", is_error=False)
    scripted_executor = FakeExecutor([mkdir_outcome])

    await batch_runner.execute_batch(
        tool_calls=[mkdir_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=TurnSummary(final_response=""),
        dod=done_spec,
        executor=scripted_executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert persistent_log
    nudge = persistent_log[-1]
    assert "Directory setup is complete." in nudge
    assert "Next step: create `index.html`." in nudge
    assert "Write a compact but real initial version of that file now" in nudge
    assert ephemeral_log == []
2489
2490
@pytest.mark.asyncio
async def test_tool_batch_runner_first_chapter_handoff_stays_persistent_until_substantive_output_exists(
    temp_dir: Path,
) -> None:
    """Expect a persistent handoff to the first chapter after the index write.

    Setup: the chapters directory exists, the plan lists ``index.html`` and
    ``01-introduction.html``, and the batch is one successful ``write`` of the
    index as reported by the fake executor.
    Expected: the last persistent message confirms progress, names
    ``01-introduction.html`` as the next step with its resolved path, omits the
    "compact but real initial version" wording, and discourages rereading;
    the ephemeral channel stays empty.
    """

    async def forbid_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def forbid_verification(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run for this scenario")

    nginx_dir = temp_dir / "guides" / "nginx"
    chapters_dir = nginx_dir / "chapters"
    chapters_dir.mkdir(parents=True)
    index_file = nginx_dir / "index.html"

    plan_lines = [
        "# Implementation Plan",
        "",
        "## File Changes",
        f"- `{chapters_dir}/`",
        f"- `{index_file}`",
        f"- `{chapters_dir / '01-introduction.html'}`",
        "",
    ]
    plan_path = temp_dir / "implementation.md"
    plan_path.write_text("\n".join(plan_lines))

    runtime_context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=forbid_confidence,
        verify_action=forbid_verification,
        auto_recover=False,
    )
    persistent_log: list[str] = []
    ephemeral_log: list[str] = []
    runtime_context.queue_steering_message_callback = persistent_log.append
    runtime_context.queue_ephemeral_steering_message_callback = ephemeral_log.append
    batch_runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))

    done_spec = create_definition_of_done("Create a multi-file nginx guide.")
    done_spec.implementation_plan = str(plan_path)
    index_todo = {
        "content": "Create the main index.html file with proper structure",
        "active_form": "Creating the main index.html file with proper structure",
        "status": "pending",
    }
    chapters_todo = {
        "content": "Create each chapter file with appropriate content",
        "active_form": "Creating each chapter file with appropriate content",
        "status": "pending",
    }
    sync_todos_to_definition_of_done(done_spec, [index_todo, chapters_todo])

    write_call = ToolCall(
        id="write-index",
        name="write",
        arguments={
            "file_path": str(index_file),
            "content": "<html></html>\n",
        },
    )
    write_outcome = tool_outcome(
        tool_call=write_call,
        output=f"Successfully wrote 14 bytes to {index_file}",
        is_error=False,
    )
    scripted_executor = FakeExecutor([write_outcome])

    await batch_runner.execute_batch(
        tool_calls=[write_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=TurnSummary(final_response=""),
        dod=done_spec,
        executor=scripted_executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert persistent_log
    assert ephemeral_log == []
    handoff = persistent_log[-1]
    assert "Confirmed progress:" in handoff
    assert "Next step: create `01-introduction.html`." in handoff
    # Resolved after the batch, matching when the original expectation was computed.
    resolved_chapter = (chapters_dir / "01-introduction.html").resolve(strict=False)
    expected_write_hint = (
        f"Prefer one `write(file_path=..., content=...)` call for `{resolved_chapter}` now."
    )
    assert expected_write_hint in handoff
    assert "Write a compact but real initial version of that file now" not in handoff
    assert "Do not reread reference material or spend the next turn on bookkeeping." in handoff
2605
2606
@pytest.mark.asyncio
async def test_tool_batch_runner_directory_handoff_uses_home_relative_path(
    temp_dir: Path,
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """Expect the post-``mkdir`` handoff to show the next file as a ``~/`` path.

    Setup: ``HOME`` is pointed at the resolved temp directory, the plan lists a
    chapters directory and ``index.html`` beneath it, and the batch is one
    successful ``mkdir -p`` bash call reported by the fake executor.
    Expected: the last persistent message names ``index.html`` as the next
    step and contains the path rendered as ``~/Loader/guides/nginx/index.html``.
    """
    monkeypatch.setenv("HOME", str(temp_dir.resolve(strict=False)))

    async def forbid_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def forbid_verification(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run for this scenario")

    nginx_dir = temp_dir / "Loader" / "guides" / "nginx"
    chapters_dir = nginx_dir / "chapters"
    index_file = nginx_dir / "index.html"

    plan_lines = [
        "# Implementation Plan",
        "",
        "## File Changes",
        f"- `{chapters_dir}/`",
        f"- `{index_file}`",
        "",
    ]
    plan_path = temp_dir / "implementation.md"
    plan_path.write_text("\n".join(plan_lines))

    runtime_context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=forbid_confidence,
        verify_action=forbid_verification,
        auto_recover=False,
    )
    # Only the persistent channel is captured in this scenario.
    persistent_log: list[str] = []
    runtime_context.queue_steering_message_callback = persistent_log.append
    batch_runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))

    done_spec = create_definition_of_done("Create a multi-file nginx guide.")
    done_spec.implementation_plan = str(plan_path)
    setup_todo = {
        "content": "Create the nginx directory structure",
        "active_form": "Creating the nginx directory structure",
        "status": "pending",
    }
    index_todo = {
        "content": "Develop the main index.html file with proper structure",
        "active_form": "Developing the main index.html file with proper structure",
        "status": "pending",
    }
    sync_todos_to_definition_of_done(done_spec, [setup_todo, index_todo])

    mkdir_call = ToolCall(
        id="mkdir-nginx-home",
        name="bash",
        arguments={"command": f"mkdir -p {chapters_dir}"},
    )
    mkdir_outcome = tool_outcome(tool_call=mkdir_call, output="", is_error=False)
    scripted_executor = FakeExecutor([mkdir_outcome])

    await batch_runner.execute_batch(
        tool_calls=[mkdir_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=TurnSummary(final_response=""),
        dod=done_spec,
        executor=scripted_executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert persistent_log
    nudge = persistent_log[-1]
    assert "Next step: create `index.html`." in nudge
    assert "`~/Loader/guides/nginx/index.html`" in nudge
    assert "Write a compact but real initial version of that file now" in nudge
2711
2712
@pytest.mark.asyncio
async def test_tool_batch_runner_redirects_post_write_self_audit_to_next_missing_artifact(
    temp_dir: Path,
) -> None:
    """Expect a reread of a just-written file to be redirected to the next missing file.

    Setup: ``index.html`` exists, is recorded in ``touched_files``, and its todo
    is marked completed; the plan also lists ``01-introduction.html``, which is
    not on disk. The batch is one successful read of the index.
    Expected: the last persistent message says the index contents are already
    known from the write, tells the model to create ``01-introduction.html``,
    and discourages another reread; the ephemeral channel stays empty.
    """

    async def forbid_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should not run in this scenario")

    async def forbid_verification(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run in this scenario")

    nginx_dir = temp_dir / "guides" / "nginx"
    chapters_dir = nginx_dir / "chapters"
    chapters_dir.mkdir(parents=True)
    index_file = nginx_dir / "index.html"
    index_lines = [
        "<html>",
        '<a href="chapters/01-introduction.html">Chapter 1: Introduction to Nginx</a>',
        '<a href="chapters/02-installation.html">Chapter 2: Installation and Setup</a>',
        "</html>",
    ]
    index_file.write_text("\n".join(index_lines) + "\n")

    plan_lines = [
        "# Implementation Plan",
        "",
        "## File Changes",
        f"- `{nginx_dir}/`",
        f"- `{chapters_dir}/`",
        f"- `{index_file}`",
        f"- `{chapters_dir / '01-introduction.html'}`",
        "",
    ]
    plan_path = temp_dir / "implementation.md"
    plan_path.write_text("\n".join(plan_lines))

    runtime_context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=forbid_confidence,
        verify_action=forbid_verification,
        auto_recover=False,
    )
    persistent_log: list[str] = []
    ephemeral_log: list[str] = []
    runtime_context.queue_steering_message_callback = persistent_log.append
    runtime_context.queue_ephemeral_steering_message_callback = ephemeral_log.append
    batch_runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))

    # Record the index as already written and its todo as done.
    done_spec = create_definition_of_done("Create a multi-file nginx guide.")
    done_spec.implementation_plan = str(plan_path)
    done_spec.touched_files.append(str(index_file))
    done_spec.completed_items.append("Develop the main index.html file for the nginx guide")
    done_spec.pending_items.append("Create chapter files for the nginx guide")

    reread_call = ToolCall(
        id="read-index-self-audit",
        name="read",
        arguments={"file_path": str(index_file)},
    )
    reread_outcome = tool_outcome(
        tool_call=reread_call,
        output="1\t<html>\n",
        is_error=False,
    )
    scripted_executor = FakeExecutor([reread_outcome])

    await batch_runner.execute_batch(
        tool_calls=[reread_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=TurnSummary(final_response=""),
        dod=done_spec,
        executor=scripted_executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert persistent_log
    redirect = persistent_log[-1]
    assert "You already have the current contents of `index.html` from the successful write." in redirect
    assert "Resume by creating `01-introduction.html` now." in redirect
    assert "Do not spend another turn rereading the file you just wrote or on TodoWrite alone." in redirect
    assert ephemeral_log == []
2819
2820
@pytest.mark.asyncio
async def test_tool_batch_runner_softens_first_file_handoff_after_recovery_prompt(
    temp_dir: Path,
) -> None:
    """Expect the post-write handoff to go ephemeral when a recovery prompt is in history.

    Setup: the session already holds an ``[EMPTY ASSISTANT RESPONSE]`` user
    message, the plan lists ``index.html`` and ``01-introduction.html``, and the
    batch is one successful ``write`` of the index.
    Expected: nothing is queued persistently; the last ephemeral message names
    ``01-introduction.html`` as the next step and omits the "compact but real
    initial version" wording.
    """

    async def forbid_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def forbid_verification(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run for this scenario")

    nginx_dir = temp_dir / "guides" / "nginx"
    chapters_dir = nginx_dir / "chapters"
    chapters_dir.mkdir(parents=True)
    index_file = nginx_dir / "index.html"

    plan_lines = [
        "# Implementation Plan",
        "",
        "## File Changes",
        f"- `{chapters_dir}/`",
        f"- `{index_file}`",
        f"- `{chapters_dir / '01-introduction.html'}`",
        "",
    ]
    plan_path = temp_dir / "implementation.md"
    plan_path.write_text("\n".join(plan_lines))

    # Prior recovery prompt that distinguishes this scenario from the plain write case.
    recovery_prompt = Message(
        role=Role.USER,
        content=(
            "[EMPTY ASSISTANT RESPONSE]\n"
            "Respond with that concrete mutation tool call now. Do not return an empty response."
        ),
    )
    runtime_context = build_context(
        temp_dir=temp_dir,
        messages=[recovery_prompt],
        safeguards=FakeSafeguards(),
        assess_confidence=forbid_confidence,
        verify_action=forbid_verification,
        auto_recover=False,
    )
    persistent_log: list[str] = []
    ephemeral_log: list[str] = []
    runtime_context.queue_steering_message_callback = persistent_log.append
    runtime_context.queue_ephemeral_steering_message_callback = ephemeral_log.append
    batch_runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))

    done_spec = create_definition_of_done("Create a multi-file nginx guide.")
    done_spec.implementation_plan = str(plan_path)
    index_todo = {
        "content": "Create the main index.html file with proper structure",
        "active_form": "Creating the main index.html file with proper structure",
        "status": "pending",
    }
    chapters_todo = {
        "content": "Create each chapter file with appropriate content",
        "active_form": "Creating each chapter file with appropriate content",
        "status": "pending",
    }
    sync_todos_to_definition_of_done(done_spec, [index_todo, chapters_todo])

    write_call = ToolCall(
        id="write-index-recovered",
        name="write",
        arguments={
            "file_path": str(index_file),
            "content": "<html></html>\n",
        },
    )
    write_outcome = tool_outcome(
        tool_call=write_call,
        output=f"Successfully wrote 14 bytes to {index_file}",
        is_error=False,
    )
    scripted_executor = FakeExecutor([write_outcome])

    await batch_runner.execute_batch(
        tool_calls=[write_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=TurnSummary(final_response=""),
        dod=done_spec,
        executor=scripted_executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert persistent_log == []
    assert ephemeral_log
    handoff = ephemeral_log[-1]
    assert "Next step: create `01-introduction.html`." in handoff
    assert "Write a compact but real initial version of that file now" not in handoff
2937
2938
@pytest.mark.asyncio
async def test_tool_batch_runner_todowrite_uses_concrete_output_language_for_aggregate_chapter_step(
    temp_dir: Path,
) -> None:
    """Expect the post-TodoWrite steering to name a concrete file, not the aggregate todo.

    Setup: ``index.html`` exists on disk and links to two chapter files; the
    plan lists only the guide directories and the index. The todo list has the
    index item completed and an aggregate "create chapter files" item pending.
    The batch is one successful ``TodoWrite`` carrying the same todos.
    Expected: the last queued message confirms todo tracking, names
    ``01-introduction.html`` as the next step, and does not echo the aggregate
    todo as the next pending item.
    """

    async def forbid_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should not run in this scenario")

    async def forbid_verification(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run in this scenario")

    def build_todos() -> list[dict[str, str]]:
        # Fresh list and dicts on every call, so the DoD sync and the tool
        # call never share objects (matching the two separate literals this
        # helper replaces).
        return [
            {
                "content": "Develop the main index.html file with proper structure",
                "active_form": "Developing the main index.html file with proper structure",
                "status": "completed",
            },
            {
                "content": "Create chapter files with content and structure",
                "active_form": "Creating chapter files with content and structure",
                "status": "pending",
            },
        ]

    nginx_dir = temp_dir / "guides" / "nginx"
    chapters_dir = nginx_dir / "chapters"
    chapters_dir.mkdir(parents=True)
    index_file = nginx_dir / "index.html"
    index_lines = [
        "<html>",
        '<a href="chapters/01-introduction.html">Chapter 1: Introduction to Nginx</a>',
        '<a href="chapters/02-installation.html">Chapter 2: Installation and Setup</a>',
        "</html>",
    ]
    index_file.write_text("\n".join(index_lines) + "\n")

    plan_lines = [
        "# Implementation Plan",
        "",
        "## File Changes",
        f"- `{nginx_dir}/`",
        f"- `{chapters_dir}/`",
        f"- `{index_file}`",
        "",
    ]
    plan_path = temp_dir / "implementation.md"
    plan_path.write_text("\n".join(plan_lines))

    # Unlike the neighbouring tests, auto_recover is left at build_context's default.
    runtime_context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=forbid_confidence,
        verify_action=forbid_verification,
    )
    steering_log: list[str] = []
    runtime_context.queue_steering_message_callback = steering_log.append
    batch_runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))

    done_spec = create_definition_of_done("Create a multi-file nginx guide.")
    done_spec.implementation_plan = str(plan_path)
    done_spec.touched_files.append(str(index_file))
    sync_todos_to_definition_of_done(done_spec, build_todos())

    submitted_todos = build_todos()
    todo_call = ToolCall(
        id="todo-aggregate",
        name="TodoWrite",
        arguments={"todos": submitted_todos},
    )
    todo_outcome = tool_outcome(
        tool_call=todo_call,
        output="Todos updated",
        is_error=False,
        metadata={"new_todos": submitted_todos},
    )
    scripted_executor = FakeExecutor([todo_outcome])

    await batch_runner.execute_batch(
        tool_calls=[todo_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=TurnSummary(final_response=""),
        dod=done_spec,
        executor=scripted_executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert steering_log
    nudge = steering_log[-1]
    assert "Todo tracking is updated." in nudge
    assert "Next step: create `01-introduction.html`." in nudge
    aggregate_echo = (
        "Continue with the next pending item: `Create chapter files with content and structure`."
    )
    assert aggregate_echo not in nudge
3069
3070
@pytest.mark.asyncio
async def test_duplicate_observation_nudge_prioritizes_missing_artifact_over_review(
    temp_dir: Path,
) -> None:
    """Check the duplicate-read nudge names the missing planned chapter.

    The plan lists ``06-ssl-configuration.html``, which is never written to
    disk, while the first pending todo is a review/consistency item. The
    steering message queued for a repeated ``read`` must mention the missing
    file and must not tell the model to continue with the review item.
    """

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run for this scenario")

    # Fixture on disk: the index and the first chapter exist, the final chapter does not.
    guide_dir = temp_dir / "guides" / "nginx"
    chapters_dir = guide_dir / "chapters"
    chapters_dir.mkdir(parents=True)
    index_file = guide_dir / "index.html"
    first_chapter = chapters_dir / "01-getting-started.html"
    missing_chapter = chapters_dir / "06-ssl-configuration.html"
    first_chapter.write_text("<h1>One</h1>\n")
    index_file.write_text('<a href="chapters/01-getting-started.html">One</a>\n')

    plan_lines = [
        "# Implementation Plan",
        "",
        "## File Changes",
        *(f"- `{planned}`" for planned in (index_file, first_chapter, missing_chapter)),
        "",
    ]
    plan_file = temp_dir / "implementation.md"
    plan_file.write_text("\n".join(plan_lines))

    sticky_nudges: list[str] = []
    transient_nudges: list[str] = []
    runtime_context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )
    runtime_context.queue_steering_message_callback = sticky_nudges.append
    runtime_context.queue_ephemeral_steering_message_callback = transient_nudges.append
    runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))

    dod = create_definition_of_done("Create a multi-file nginx guide.")
    dod.implementation_plan = str(plan_file)
    review_item = "Ensure all files are properly linked and formatted consistently"
    final_chapter_item = "Create the final chapter (06-ssl-configuration.html)"
    sync_todos_to_definition_of_done(
        dod,
        [
            {"content": item, "active_form": f"Working on: {item}", "status": "pending"}
            for item in (review_item, final_chapter_item)
        ],
    )
    assert tool_batches_should_prioritize_missing_artifact(
        dod=dod,
        next_pending=dod.pending_items[0],
        missing_artifact=(missing_chapter, False),
        project_root=temp_dir,
    )

    duplicate_read = ToolCall(
        id="dup-read",
        name="read",
        arguments={"file_path": str(index_file)},
    )
    runner._queue_duplicate_observation_nudge(duplicate_read, dod=dod)  # type: ignore[attr-defined]

    assert sticky_nudges
    nudge = sticky_nudges[-1]
    assert "06-ssl-configuration.html" in nudge
    assert "Do not switch into review or consistency-check mode" in nudge
    stale_directive = f"Continue with the next pending item: `{review_item}`"
    assert stale_directive not in nudge
3165
3166
@pytest.mark.asyncio
async def test_tool_batch_runner_hands_off_to_verification_once_planned_artifacts_exist(
    temp_dir: Path,
) -> None:
    """Check the post-write nudge hands off to verification when nothing is missing.

    Every path in the implementation plan already exists on disk when the
    final ``write`` is executed, so the persistent steering messages must
    announce that, mention the remaining review todo, and point at
    verification.
    """

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run for this scenario")

    guide_dir = temp_dir / "guides" / "nginx"
    chapters_dir = guide_dir / "chapters"
    chapters_dir.mkdir(parents=True)
    index_file = guide_dir / "index.html"
    first_chapter = chapters_dir / "01-getting-started.html"
    second_chapter = chapters_dir / "02-installation.html"
    # All planned files are present before the batch runs.
    fixture_contents = (
        (index_file, '<a href="chapters/01-getting-started.html">One</a>\n'),
        (first_chapter, "<h1>One</h1>\n"),
        (second_chapter, "<h1>Two</h1>\n"),
    )
    for target, body in fixture_contents:
        target.write_text(body)

    plan_lines = [
        "# Implementation Plan",
        "",
        "## File Changes",
        f"- `{chapters_dir}/`",
        *(f"- `{planned}`" for planned in (index_file, first_chapter, second_chapter)),
        "",
    ]
    plan_file = temp_dir / "implementation.md"
    plan_file.write_text("\n".join(plan_lines))

    sticky_nudges: list[str] = []
    transient_nudges: list[str] = []
    runtime_context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )
    runtime_context.queue_steering_message_callback = sticky_nudges.append
    runtime_context.queue_ephemeral_steering_message_callback = transient_nudges.append
    runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))

    dod = create_definition_of_done("Create a multi-file nginx guide.")
    dod.implementation_plan = str(plan_file)
    review_item = "Ensure all files are properly linked and formatted consistently"
    todo_states = (("Create the guide files", "completed"), (review_item, "pending"))
    sync_todos_to_definition_of_done(
        dod,
        [
            {"content": text, "active_form": f"Working on: {text}", "status": state}
            for text, state in todo_states
        ],
    )

    final_write = ToolCall(
        id="write-final",
        name="write",
        arguments={
            "file_path": str(second_chapter),
            "content": "<h1>Two</h1>\n",
        },
    )
    scripted_outcome = tool_outcome(
        tool_call=final_write,
        output=f"Successfully wrote {second_chapter}",
        is_error=False,
    )
    executor = FakeExecutor([scripted_outcome])

    turn_summary = TurnSummary(final_response="")
    await runner.execute_batch(
        tool_calls=[final_write],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=turn_summary,
        dod=dod,
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    # Each fragment must appear in at least one persistent steering message.
    expected_fragments = (
        "All explicitly planned artifacts now exist on disk.",
        review_item,
        "Move to verification once no specific mismatch remains.",
    )
    for fragment in expected_fragments:
        assert any(fragment in nudge for nudge in sticky_nudges)
3287
3288
@pytest.mark.asyncio
async def test_tool_batch_runner_mutation_handoff_points_at_next_missing_artifact(
    temp_dir: Path,
) -> None:
    """Check the nudge after writing the index names the first missing chapter.

    Only ``index.html`` exists on disk; both planned chapters are absent. The
    last persistent steering message must point at ``01-getting-started.html``
    and nothing may be queued on the ephemeral channel.
    """

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should not run in this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run in this scenario")

    guide_dir = temp_dir / "guides" / "nginx"
    chapters_dir = guide_dir / "chapters"
    guide_dir.mkdir(parents=True)
    chapters_dir.mkdir()
    index_file = guide_dir / "index.html"
    index_file.write_text("<html></html>\n")
    # Planned but deliberately not created.
    planned_chapters = (
        chapters_dir / "01-getting-started.html",
        chapters_dir / "02-installation.html",
    )

    plan_lines = [
        "# Implementation Plan",
        "",
        "## File Changes",
        f"- `{guide_dir}/`",
        f"- `{index_file}`",
        *(f"- `{chapter}`" for chapter in planned_chapters),
        "",
    ]
    plan_file = temp_dir / "implementation.md"
    plan_file.write_text("\n".join(plan_lines))

    sticky_nudges: list[str] = []
    transient_nudges: list[str] = []
    runtime_context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )
    runtime_context.queue_steering_message_callback = sticky_nudges.append
    runtime_context.queue_ephemeral_steering_message_callback = transient_nudges.append
    runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))

    dod = create_definition_of_done("Create a multi-file nginx guide.")
    dod.implementation_plan = str(plan_file)
    pending_items = (
        "Create the main index.html file with proper structure",
        "Create each chapter file in sequence, following the established pattern",
        "Ensure all files are properly linked and formatted consistently",
    )
    sync_todos_to_definition_of_done(
        dod,
        [
            {"content": item, "active_form": f"Working on: {item}", "status": "pending"}
            for item in pending_items
        ],
    )

    index_write = ToolCall(
        id="write-index",
        name="write",
        arguments={"file_path": str(index_file), "content": "<html></html>\n"},
    )
    scripted_outcome = tool_outcome(
        tool_call=index_write,
        output=f"Successfully wrote {index_file}",
        is_error=False,
    )
    executor = FakeExecutor([scripted_outcome])

    turn_summary = TurnSummary(final_response="")
    await runner.execute_batch(
        tool_calls=[index_write],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=turn_summary,
        dod=dod,
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert sticky_nudges
    assert transient_nudges == []
    nudge = sticky_nudges[-1]
    assert "Next step: create `01-getting-started.html`." in nudge
    assert "Write a compact but real initial version of that file now" not in nudge
    assert "refresh `TodoWrite`" not in nudge
    assert "Do not reread reference material or spend the next turn on bookkeeping." in nudge
3398
3399
@pytest.mark.asyncio
async def test_tool_batch_runner_large_plan_does_not_claim_completion_early(
    temp_dir: Path,
) -> None:
    """Check a seven-chapter plan is not reported complete after chapter five.

    Chapters 01-05 and the index exist on disk; chapters 06 and 07 are planned
    but missing. After the write of chapter 05 is executed, an ephemeral
    steering message must point at ``06-performance-tuning.html`` and no
    steering message on either channel may claim that all planned artifacts
    exist.
    """

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should not run in this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run in this scenario")

    guide_root = temp_dir / "guides" / "nginx"
    chapters = guide_root / "chapters"
    guide_root.mkdir(parents=True)
    chapters.mkdir()
    index_path = guide_root / "index.html"
    index_path.write_text("<html></html>\n")

    chapter_paths = [
        chapters / "01-getting-started.html",
        chapters / "02-installation.html",
        chapters / "03-first-website.html",
        chapters / "04-configuration-basics.html",
        chapters / "05-advanced-configurations.html",
        chapters / "06-performance-tuning.html",
        chapters / "07-security-best-practices.html",
    ]
    # Chapters 01-04 get a stem-based heading; chapter 05 matches the content
    # of the write call below. Chapters 06 and 07 stay missing.
    for chapter in chapter_paths[:4]:
        chapter.write_text(f"<h1>{chapter.stem}</h1>\n")
    chapter_paths[4].write_text("<h1>Advanced configurations</h1>\n")

    implementation_plan = temp_dir / "implementation.md"
    implementation_plan.write_text(
        "\n".join(
            [
                "# Implementation Plan",
                "",
                "## File Changes",
                f"- `{guide_root}/`",
                f"- `{chapters}/`",
                f"- `{index_path}`",
                *[f"- `{path}`" for path in chapter_paths],
                "",
            ]
        )
    )

    context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )
    persistent_messages: list[str] = []
    ephemeral_messages: list[str] = []
    context.queue_steering_message_callback = persistent_messages.append
    context.queue_ephemeral_steering_message_callback = ephemeral_messages.append
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))
    dod = create_definition_of_done("Create a thorough nginx guide.")
    dod.implementation_plan = str(implementation_plan)
    sync_todos_to_definition_of_done(
        dod,
        [
            {
                "content": "Create the nginx guide artifacts",
                "active_form": "Creating nginx guide artifacts",
                "status": "pending",
            },
            {
                "content": "Verify all guide files are linked and complete",
                "active_form": "Verifying guide linkage and completeness",
                "status": "pending",
            },
        ],
    )
    tool_call = ToolCall(
        id="write-chapter-05",
        name="write",
        arguments={
            "file_path": str(chapter_paths[4]),
            "content": "<h1>Advanced configurations</h1>\n",
        },
    )
    executor = FakeExecutor(
        [
            tool_outcome(
                tool_call=tool_call,
                output=f"Successfully wrote {chapter_paths[4]}",
                is_error=False,
            )
        ]
    )

    summary = TurnSummary(final_response="")
    await runner.execute_batch(
        tool_calls=[tool_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=summary,
        dod=dod,
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert any(
        "Next step: create `06-performance-tuning.html`." in message
        for message in ephemeral_messages
    )
    # Inspect BOTH steering channels: a premature completion claim queued on
    # the persistent channel would otherwise go unnoticed by this test.
    completion_claim = "All explicitly planned artifacts now exist on disk."
    assert not any(
        completion_claim in message
        for message in (*persistent_messages, *ephemeral_messages)
    )
3526
3527
@pytest.mark.asyncio
async def test_tool_batch_runner_uses_compact_missing_artifact_nudge_after_substantial_progress(
    temp_dir: Path,
) -> None:
    """Check the compact "next step" nudge is queued on the ephemeral channel.

    The index and chapters 01-04 exist and are recorded as touched files;
    chapter 05 is planned but missing. After a rewrite of chapter 04, the last
    ephemeral steering message must name ``05-advanced-features.html`` and
    must not ask for a ``TodoWrite`` refresh.
    """

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should not run in this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run in this scenario")

    guide_dir = temp_dir / "guides" / "nginx"
    chapters_dir = guide_dir / "chapters"
    guide_dir.mkdir(parents=True)
    chapters_dir.mkdir()
    index_file = guide_dir / "index.html"
    chapter_names = (
        "01-introduction.html",
        "02-installation.html",
        "03-configuration.html",
        "04-basic-usage.html",
        "05-advanced-features.html",
    )
    chapter_files = [chapters_dir / name for name in chapter_names]
    # Everything except the fifth chapter already exists on disk.
    existing_files = (index_file, *chapter_files[:4])
    for existing in existing_files:
        existing.write_text("<html></html>\n")

    plan_lines = [
        "# Implementation Plan",
        "",
        "## File Changes",
        f"- `{guide_dir}/`",
        f"- `{chapters_dir}/`",
        f"- `{index_file}`",
        *(f"- `{chapter}`" for chapter in chapter_files),
        "",
    ]
    plan_file = temp_dir / "implementation.md"
    plan_file.write_text("\n".join(plan_lines))

    sticky_nudges: list[str] = []
    transient_nudges: list[str] = []
    runtime_context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )
    runtime_context.queue_steering_message_callback = sticky_nudges.append
    runtime_context.queue_ephemeral_steering_message_callback = transient_nudges.append
    runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))

    dod = create_definition_of_done("Create a thorough nginx guide.")
    dod.implementation_plan = str(plan_file)
    dod.touched_files.extend(str(existing) for existing in existing_files)
    dod.completed_items.extend(
        [
            "Create the nginx directory structure",
            "Create the main index.html file with proper structure",
        ]
    )
    remaining_todo = {
        "content": "Create each chapter file with appropriate content",
        "active_form": "Creating each chapter file with appropriate content",
        "status": "pending",
    }
    sync_todos_to_definition_of_done(dod, [remaining_todo])

    fourth_chapter = chapter_files[3]
    chapter_write = ToolCall(
        id="write-chapter-04",
        name="write",
        arguments={
            "file_path": str(fourth_chapter),
            "content": "<html>updated</html>\n",
        },
    )
    scripted_outcome = tool_outcome(
        tool_call=chapter_write,
        output=f"Successfully wrote {fourth_chapter}",
        is_error=False,
    )
    executor = FakeExecutor([scripted_outcome])

    turn_summary = TurnSummary(final_response="")
    await runner.execute_batch(
        tool_calls=[chapter_write],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=turn_summary,
        dod=dod,
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert transient_nudges
    nudge = transient_nudges[-1]
    assert "Next step: create `05-advanced-features.html`." in nudge
    assert "Do not reread reference material or spend the next turn on bookkeeping." in nudge
    assert "refresh `TodoWrite`" not in nudge
3648
3649
@pytest.mark.asyncio
async def test_tool_batch_runner_todowrite_with_missing_artifact_requeues_exact_resume_step(
    temp_dir: Path,
) -> None:
    """Check a bookkeeping-only ``TodoWrite`` is answered with the exact next file.

    The second planned chapter is missing from disk. After a ``TodoWrite``
    call is executed, the last persistent steering message must name
    ``02-installation.html`` and ask for a concrete mutation tool call, and
    the ephemeral channel must stay empty.
    """

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should not run in this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run in this scenario")

    def todo_snapshot() -> list[dict[str, str]]:
        # Fresh list and dicts per call so no two consumers share an object.
        chapter_states = (
            ("01-getting-started.html", "completed"),
            ("02-installation.html", "pending"),
        )
        return [
            {
                "content": f"Create {name}",
                "active_form": f"Creating {name}",
                "status": state,
            }
            for name, state in chapter_states
        ]

    guide_dir = temp_dir / "guides" / "nginx"
    chapters_dir = guide_dir / "chapters"
    guide_dir.mkdir(parents=True)
    chapters_dir.mkdir()
    index_file = guide_dir / "index.html"
    index_file.write_text("<html></html>\n")
    first_chapter = chapters_dir / "01-getting-started.html"
    second_chapter = chapters_dir / "02-installation.html"  # planned, never written
    first_chapter.write_text("<h1>One</h1>\n")

    plan_lines = [
        "# Implementation Plan",
        "",
        "## File Changes",
        f"- `{guide_dir}/`",
        f"- `{chapters_dir}/`",
        *(f"- `{planned}`" for planned in (index_file, first_chapter, second_chapter)),
        "",
    ]
    plan_file = temp_dir / "implementation.md"
    plan_file.write_text("\n".join(plan_lines))

    sticky_nudges: list[str] = []
    transient_nudges: list[str] = []
    runtime_context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )
    runtime_context.queue_steering_message_callback = sticky_nudges.append
    runtime_context.queue_ephemeral_steering_message_callback = transient_nudges.append
    runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))

    dod = create_definition_of_done("Create a multi-file nginx guide.")
    dod.implementation_plan = str(plan_file)
    sync_todos_to_definition_of_done(dod, todo_snapshot())
    dod.touched_files.extend([str(index_file), str(first_chapter)])

    todo_call = ToolCall(
        id="todo-only",
        name="TodoWrite",
        arguments={"todos": todo_snapshot()},
    )
    scripted_outcome = tool_outcome(
        tool_call=todo_call,
        output="Todos updated",
        is_error=False,
        metadata={"new_todos": todo_snapshot()},
    )
    executor = FakeExecutor([scripted_outcome])

    turn_summary = TurnSummary(final_response="")
    await runner.execute_batch(
        tool_calls=[todo_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=turn_summary,
        dod=dod,
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert sticky_nudges
    nudge = sticky_nudges[-1]
    assert "Todo tracking is updated. Next step: create `02-installation.html`." in nudge
    assert "Prefer one `write(file_path=..., content=...)` call" in nudge
    assert "Make your next response the concrete mutation tool call itself." in nudge
    assert transient_nudges == []
3791
3792
@pytest.mark.asyncio
async def test_tool_batch_runner_todowrite_after_artifacts_exist_pushes_verification_handoff(
    temp_dir: Path,
) -> None:
    """Check a ``TodoWrite`` issued after all planned files exist hands off to verification.

    All planned paths are on disk and the todo list still opens with an
    "examine the Fortran guide" item. The last queued steering message must
    announce that the artifacts exist, reference the verification todo, and
    must not mention the Fortran item.
    """

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should not run in this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run in this scenario")

    examine_item = (
        "First, examine the existing Fortran guide structure to understand the format and content organization"
    )
    verify_item = "Verify all guide files are linked and complete"

    def pending_todos() -> list[dict[str, str]]:
        # Fresh list and dicts per call so no two consumers share an object.
        return [
            {"content": text, "active_form": f"Working on: {text}", "status": "pending"}
            for text in (examine_item, verify_item)
        ]

    guide_dir = temp_dir / "guides" / "nginx"
    chapters_dir = guide_dir / "chapters"
    guide_dir.mkdir(parents=True)
    chapters_dir.mkdir()
    index_file = guide_dir / "index.html"
    first_chapter = chapters_dir / "01-getting-started.html"
    second_chapter = chapters_dir / "02-installation.html"
    fixture_contents = (
        (index_file, "<html></html>\n"),
        (first_chapter, "<h1>One</h1>\n"),
        (second_chapter, "<h1>Two</h1>\n"),
    )
    for target, body in fixture_contents:
        target.write_text(body)

    plan_lines = [
        "# Implementation Plan",
        "",
        "## File Changes",
        f"- `{guide_dir}/`",
        f"- `{chapters_dir}/`",
        *(f"- `{planned}`" for planned in (index_file, first_chapter, second_chapter)),
        "",
    ]
    plan_file = temp_dir / "implementation.md"
    plan_file.write_text("\n".join(plan_lines))

    steering_log: list[str] = []
    runtime_context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )
    runtime_context.queue_steering_message_callback = steering_log.append
    runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))

    dod = create_definition_of_done("Create a multi-file nginx guide.")
    dod.implementation_plan = str(plan_file)
    dod.verification_commands = [f"ls -la {guide_dir}"]
    sync_todos_to_definition_of_done(dod, pending_todos(), project_root=temp_dir)

    todo_call = ToolCall(
        id="todo-only",
        name="TodoWrite",
        arguments={"todos": pending_todos()},
    )
    scripted_outcome = tool_outcome(
        tool_call=todo_call,
        output="Todos updated",
        is_error=False,
        metadata={"new_todos": pending_todos()},
    )
    executor = FakeExecutor([scripted_outcome])

    turn_summary = TurnSummary(final_response="")
    await runner.execute_batch(
        tool_calls=[todo_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=turn_summary,
        dod=dod,
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert steering_log
    nudge = steering_log[-1]
    assert "Todo tracking is updated. All explicitly planned artifacts now exist on disk." in nudge
    assert verify_item in nudge
    assert "Move to verification once no specific mismatch remains." in nudge
    assert "reopen reference materials" in nudge
    assert "Fortran guide structure" not in nudge
3935
3936
@pytest.mark.asyncio
async def test_tool_batch_runner_todowrite_after_outputs_exist_but_links_missing_still_handoffs_to_verify(
    temp_dir: Path,
) -> None:
    """Check the verification hand-off when the index links outside the plan.

    All planned files exist, and the index additionally links to
    ``../index.html``, which is not part of the plan. After a ``TodoWrite``
    with an in-progress "create chapter files" item, the last queued steering
    message must tell the model to repair or verify the current files rather
    than expand the artifact set.
    """

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should not run for this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run for this scenario")

    def in_progress_todos() -> list[dict[str, str]]:
        # Fresh list and dict per call so no two consumers share an object.
        return [
            {
                "content": "Create chapter files following the established pattern",
                "active_form": "Creating chapter files",
                "status": "in_progress",
            }
        ]

    guide_dir = temp_dir / "guides" / "nginx"
    chapters_dir = guide_dir / "chapters"
    guide_dir.mkdir(parents=True)
    chapters_dir.mkdir()
    index_file = guide_dir / "index.html"
    first_chapter = chapters_dir / "01-introduction.html"
    second_chapter = chapters_dir / "02-installation.html"
    anchors = (
        '<a href="chapters/01-introduction.html">Intro</a>',
        '<a href="chapters/02-installation.html">Install</a>',
        '<a href="../index.html">Back</a>',
    )
    # One anchor per line, each terminated by a newline.
    index_file.write_text("".join(f"{anchor}\n" for anchor in anchors))
    for chapter in (first_chapter, second_chapter):
        chapter.write_text("<html></html>\n")

    plan_lines = [
        "# Implementation Plan",
        "",
        "## File Changes",
        f"- `{guide_dir}/`",
        f"- `{chapters_dir}/`",
        *(f"- `{planned}`" for planned in (index_file, first_chapter, second_chapter)),
        "",
    ]
    plan_file = temp_dir / "implementation.md"
    plan_file.write_text("\n".join(plan_lines))

    steering_log: list[str] = []
    runtime_context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )
    runtime_context.queue_steering_message_callback = steering_log.append
    runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))

    dod = create_definition_of_done("Create a multi-file nginx guide.")
    dod.implementation_plan = str(plan_file)
    dod.verification_commands = [f"ls -la {guide_dir}"]
    sync_todos_to_definition_of_done(dod, in_progress_todos(), project_root=temp_dir)

    todo_call = ToolCall(
        id="todo-post-build",
        name="TodoWrite",
        arguments={"todos": in_progress_todos()},
    )
    scripted_outcome = tool_outcome(
        tool_call=todo_call,
        output="Todos updated",
        is_error=False,
        metadata={"new_todos": in_progress_todos()},
    )
    executor = FakeExecutor([scripted_outcome])

    turn_summary = TurnSummary(final_response="")
    await runner.execute_batch(
        tool_calls=[todo_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=turn_summary,
        dod=dod,
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert steering_log
    nudge = steering_log[-1]
    assert "Todo tracking is updated. All explicitly planned artifacts now exist on disk." in nudge
    assert "Repair or verify the current files instead of expanding the artifact set." in nudge
    assert "Move to verification or final confirmation using the files already on disk." in nudge
4071
4072
@pytest.mark.asyncio
async def test_tool_batch_runner_todowrite_drops_unplanned_expansion_after_outputs_exist(
    temp_dir: Path,
) -> None:
    """A TodoWrite that adds an unplanned chapter must not steer toward creating it.

    Every path listed in the implementation plan already exists on disk, yet the
    submitted todo list carries a pending ``08-troubleshooting.html`` entry. The
    queued steering message is expected to point at repair/verification and to
    leave the unplanned file unmentioned.
    """

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should not run for this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run for this scenario")

    # Lay out the fully built guide: index plus both planned chapters.
    nginx_root = temp_dir / "guides" / "nginx"
    chapter_dir = nginx_root / "chapters"
    nginx_root.mkdir(parents=True)
    chapter_dir.mkdir()
    index_file = nginx_root / "index.html"
    intro_file = chapter_dir / "01-introduction.html"
    install_file = chapter_dir / "02-installation.html"
    index_file.write_text(
        '<a href="chapters/01-introduction.html">Intro</a>\n'
        '<a href="chapters/02-installation.html">Install</a>\n'
        '<a href="../index.html">Back</a>\n'
    )
    for chapter_file in (intro_file, install_file):
        chapter_file.write_text("<html></html>\n")

    # The plan declares exactly the artifacts created above.
    plan_file = temp_dir / "implementation.md"
    declared_entries = [
        f"{nginx_root}/",
        f"{chapter_dir}/",
        f"{index_file}",
        f"{intro_file}",
        f"{install_file}",
    ]
    plan_file.write_text(
        "\n".join(
            [
                "# Implementation Plan",
                "",
                "## File Changes",
                *(f"- `{entry}`" for entry in declared_entries),
                "",
            ]
        )
    )

    queued_messages: list[str] = []
    runtime_context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )
    runtime_context.queue_steering_message_callback = queued_messages.append
    runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))
    dod = create_definition_of_done("Create a multi-file nginx guide.")
    dod.implementation_plan = str(plan_file)
    dod.verification_commands = [f"ls -la {nginx_root}"]

    # (content, active form, status) rows; the last row is the unplanned extra.
    todo_rows = (
        ("Create index.html for nginx guide", "Creating index.html", "in_progress"),
        (
            "Create chapter 01-introduction.html",
            "Creating chapter 01-introduction.html",
            "completed",
        ),
        (
            "Create chapter 02-installation.html",
            "Creating chapter 02-installation.html",
            "completed",
        ),
        (
            "Create chapter 08-troubleshooting.html",
            "Creating chapter 08-troubleshooting.html",
            "pending",
        ),
    )
    # The tool call carries camelCase keys, the outcome metadata snake_case ones.
    submitted_todos = [
        {"content": content, "activeForm": active, "status": status}
        for content, active, status in todo_rows
    ]
    recorded_todos = [
        {"content": content, "active_form": active, "status": status}
        for content, active, status in todo_rows
    ]

    tool_call = ToolCall(
        id="todo-post-build-expansion",
        name="TodoWrite",
        arguments={"todos": submitted_todos},
    )
    outcome = tool_outcome(
        tool_call=tool_call,
        output="Todos updated",
        is_error=False,
        metadata={"new_todos": recorded_todos},
    )
    executor = FakeExecutor([outcome])

    await runner.execute_batch(
        tool_calls=[tool_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=TurnSummary(final_response=""),
        dod=dod,
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert queued_messages
    steering = queued_messages[-1]
    for expected_fragment in (
        "Todo tracking is updated. All explicitly planned artifacts now exist on disk.",
        "Repair or verify the current files instead of expanding the artifact set.",
        "Move to verification or final confirmation using the files already on disk.",
    ):
        assert expected_fragment in steering
    assert "08-troubleshooting.html" not in steering
4227
4228
@pytest.mark.asyncio
async def test_tool_batch_runner_todowrite_with_existing_output_roots_requeues_next_mutation(
    temp_dir: Path,
) -> None:
    """A no-op TodoWrite with output roots in place should steer to the next write.

    The guide directories and ``index.html`` exist, and the index links to
    ``chapters/01-introduction.html``, which has not been written. The same
    todo list is synced into the DoD and then re-submitted via TodoWrite; the
    queued steering message is expected to name ``01-introduction.html`` as the
    next file to create.
    """

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should not run in this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run in this scenario")

    def fresh_todos() -> list[dict[str, str]]:
        # Build a new list on every call so no two consumers share objects.
        rows = (
            (
                "Examine the existing Fortran guide structure",
                "Examining the existing Fortran guide structure",
                "completed",
            ),
            (
                "Create the nginx directory structure",
                "Creating the nginx directory structure",
                "completed",
            ),
            (
                "Write the introduction chapter",
                "Writing the introduction chapter",
                "pending",
            ),
        )
        return [
            {"content": content, "active_form": active, "status": status}
            for content, active, status in rows
        ]

    nginx_root = temp_dir / "guides" / "nginx"
    chapter_dir = nginx_root / "chapters"
    nginx_root.mkdir(parents=True)
    chapter_dir.mkdir()
    index_file = nginx_root / "index.html"
    index_markup = (
        "<!DOCTYPE html>",
        "<html>",
        "<body>",
        '<a href="chapters/01-introduction.html">Introduction</a>',
        "</body>",
        "</html>",
    )
    index_file.write_text("".join(f"{line}\n" for line in index_markup))

    plan_file = temp_dir / "implementation.md"
    plan_file.write_text(
        "# Implementation Plan\n"
        "\n"
        "## File Changes\n"
        f"- `{nginx_root}/`\n"
        f"- `{chapter_dir}/`\n"
        f"- `{index_file}`\n"
    )

    queued_messages: list[str] = []
    runtime_context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )
    runtime_context.queue_steering_message_callback = queued_messages.append
    runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))
    dod = create_definition_of_done("Create a multi-file nginx guide.")
    dod.implementation_plan = str(plan_file)
    dod.touched_files.append(str(index_file))
    sync_todos_to_definition_of_done(dod, fresh_todos(), project_root=temp_dir)

    tool_call = ToolCall(
        id="todo-next-mutation",
        name="TodoWrite",
        arguments={"todos": fresh_todos()},
    )
    outcome = tool_outcome(
        tool_call=tool_call,
        output="Todos updated",
        is_error=False,
        metadata={"new_todos": fresh_todos()},
    )
    executor = FakeExecutor([outcome])

    await runner.execute_batch(
        tool_calls=[tool_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=TurnSummary(final_response=""),
        dod=dod,
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert queued_messages
    steering = queued_messages[-1]
    for expected_fragment in (
        "Todo tracking is updated. Next step: create `01-introduction.html`.",
        "Prefer one `write(file_path=..., content=...)` call",
        "Make your next response the concrete mutation tool call itself.",
    ):
        assert expected_fragment in steering
4390
4391
@pytest.mark.asyncio
async def test_tool_batch_runner_todowrite_prefers_pending_index_over_empty_output_directory(
    temp_dir: Path,
) -> None:
    """With an empty chapters directory and no index yet, steer to ``index.html``.

    Only the ``chapters`` directory exists; the plan declares that directory and
    ``index.html``. Pending todos cover both the index and a first chapter. The
    queued steering message is expected to name ``index.html`` (with its
    resolved path) and not mention ``01-introduction.html``.
    """

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should not run in this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run in this scenario")

    def make_todos() -> list[dict[str, str]]:
        # Returns a brand-new list of todo dicts on each call.
        rows = (
            (
                "Examine the existing Fortran guide structure to understand the format and depth",
                "Examining the existing Fortran guide structure",
                "completed",
            ),
            (
                "Create the new nginx guide directory structure",
                "Creating the new nginx guide directory structure",
                "completed",
            ),
            (
                "Create a new index.html for the nginx guide",
                "Creating a new index.html for the nginx guide",
                "pending",
            ),
            (
                "Create the first chapter for the nginx guide",
                "Creating the first chapter for the nginx guide",
                "pending",
            ),
        )
        return [
            {"content": content, "active_form": active, "status": status}
            for content, active, status in rows
        ]

    nginx_root = temp_dir / "Loader" / "guides" / "nginx"
    chapter_dir = nginx_root / "chapters"
    chapter_dir.mkdir(parents=True)
    index_file = nginx_root / "index.html"  # declared in the plan, never written
    plan_file = temp_dir / "implementation.md"
    plan_file.write_text(
        "# Implementation Plan\n"
        "\n"
        "## File Changes\n"
        f"- `{chapter_dir}/`\n"
        f"- `{index_file}`\n"
    )

    dod = create_definition_of_done("Create a multi-file nginx guide.")
    dod.implementation_plan = str(plan_file)
    sync_todos_to_definition_of_done(dod, make_todos(), project_root=temp_dir)

    queued_messages: list[str] = []
    runtime_context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )
    runtime_context.queue_steering_message_callback = queued_messages.append
    runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))

    # One list object is shared by the tool-call arguments and the outcome metadata.
    todos = make_todos()
    tool_call = ToolCall(
        id="todo-index-before-chapter",
        name="TodoWrite",
        arguments={"todos": todos},
    )
    outcome = tool_outcome(
        tool_call=tool_call,
        output="Todos updated",
        is_error=False,
        metadata={"new_todos": todos},
    )
    executor = FakeExecutor([outcome])

    await runner.execute_batch(
        tool_calls=[tool_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=TurnSummary(final_response=""),
        dod=dod,
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert queued_messages
    steering = queued_messages[-1]
    resolved_index = index_file.resolve(strict=False)
    assert "Todo tracking is updated. Next step: create `index.html`." in steering
    assert (
        f"Prefer one `write(file_path=..., content=...)` call for `{resolved_index}`"
        in steering
    )
    assert "01-introduction.html" not in steering
4528
4529
@pytest.mark.asyncio
async def test_tool_batch_runner_todowrite_with_declared_child_targets_names_next_missing_file(
    temp_dir: Path,
) -> None:
    """Links inside an existing index should yield the next missing file name.

    ``index.html`` exists and links to ``chapters/introduction.html`` and
    ``chapters/installation.html``; neither chapter file has been written. With
    "Write the introduction chapter" pending, the queued steering message is
    expected to name ``introduction.html`` as the next file to create.
    """

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should not run in this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run in this scenario")

    nginx_root = temp_dir / "guides" / "nginx"
    chapter_dir = nginx_root / "chapters"
    nginx_root.mkdir(parents=True)
    chapter_dir.mkdir()
    index_file = nginx_root / "index.html"
    index_file.write_text(
        "<html>\n"
        '<a href="chapters/introduction.html">Introduction</a>\n'
        '<a href="chapters/installation.html">Installation</a>\n'
        "</html>\n"
    )

    plan_file = temp_dir / "implementation.md"
    plan_lines = [
        "# Implementation Plan",
        "",
        "## File Changes",
        f"- `{nginx_root}/`",
        f"- `{chapter_dir}/`",
        f"- `{index_file}`",
    ]
    plan_file.write_text("".join(f"{line}\n" for line in plan_lines))

    dod = create_definition_of_done("Create a multi-file nginx guide.")
    dod.implementation_plan = str(plan_file)
    dod.pending_items = [
        "Write the introduction chapter",
        "Complete the requested work",
    ]
    dod.touched_files.append(str(index_file))

    queued_messages: list[str] = []
    runtime_context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )
    runtime_context.queue_steering_message_callback = queued_messages.append
    runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))

    # The tool call uses the camelCase key, the outcome metadata the snake_case one.
    todo_content = "Write the introduction chapter"
    todo_active = "Writing the introduction chapter"
    tool_call = ToolCall(
        id="todo-1",
        name="TodoWrite",
        arguments={
            "todos": [
                {"content": todo_content, "activeForm": todo_active, "status": "pending"}
            ]
        },
    )
    outcome = tool_outcome(
        tool_call=tool_call,
        output="Todos updated",
        is_error=False,
        metadata={
            "new_todos": [
                {"content": todo_content, "active_form": todo_active, "status": "pending"}
            ]
        },
    )
    executor = FakeExecutor([outcome])

    await runner.execute_batch(
        tool_calls=[tool_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=TurnSummary(final_response=""),
        dod=dod,
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert queued_messages
    steering = queued_messages[-1]
    for expected_fragment in (
        "Todo tracking is updated. Next step: create `introduction.html`.",
        "Prefer one `write(file_path=..., content=...)` call",
        "Make your next response the concrete mutation tool call itself.",
    ):
        assert expected_fragment in steering
4653
4654
@pytest.mark.asyncio
async def test_tool_batch_runner_todowrite_names_concrete_pending_file_after_artifacts_exist(
    temp_dir: Path,
) -> None:
    """A descriptive pending todo should be mapped to a concrete missing file.

    The index and chapter one exist; the index also links to
    ``chapters/02-installation.html``, which is absent. The pending todo is
    phrased as "Creating Chapter 2: Installation and Setup", and the queued
    steering message is expected to name ``02-installation.html``.
    """

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should not run in this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run in this scenario")

    nginx_root = temp_dir / "guides" / "nginx"
    chapter_dir = nginx_root / "chapters"
    nginx_root.mkdir(parents=True)
    chapter_dir.mkdir()
    index_file = nginx_root / "index.html"
    intro_file = chapter_dir / "01-introduction.html"
    index_markup = (
        "<html>",
        '<a href="chapters/01-introduction.html">Chapter 1: Introduction to NGINX Tool</a>',
        '<a href="chapters/02-installation.html">Chapter 2: Installation and Setup</a>',
        "</html>",
    )
    index_file.write_text("".join(f"{line}\n" for line in index_markup))
    intro_file.write_text("<html></html>\n")

    plan_file = temp_dir / "implementation.md"
    plan_file.write_text(
        "# Implementation Plan\n"
        "\n"
        "## File Changes\n"
        f"- `{nginx_root}/`\n"
        f"- `{chapter_dir}/`\n"
        f"- `{index_file}`\n"
    )

    pending_label = "Creating Chapter 2: Installation and Setup"
    dod = create_definition_of_done("Create a multi-file nginx guide.")
    dod.implementation_plan = str(plan_file)
    dod.pending_items = [pending_label, "Complete the requested work"]
    dod.touched_files.extend([str(index_file), str(intro_file)])

    queued_messages: list[str] = []
    runtime_context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )
    runtime_context.queue_steering_message_callback = queued_messages.append
    runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))

    # Content and active form are the same text for this todo.
    tool_call = ToolCall(
        id="todo-1",
        name="TodoWrite",
        arguments={
            "todos": [
                {
                    "content": pending_label,
                    "activeForm": pending_label,
                    "status": "pending",
                }
            ]
        },
    )
    outcome = tool_outcome(
        tool_call=tool_call,
        output="Todos updated",
        is_error=False,
        metadata={
            "new_todos": [
                {
                    "content": pending_label,
                    "active_form": pending_label,
                    "status": "pending",
                }
            ]
        },
    )
    executor = FakeExecutor([outcome])

    await runner.execute_batch(
        tool_calls=[tool_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=TurnSummary(final_response=""),
        dod=dod,
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert queued_messages
    steering = queued_messages[-1]
    for expected_fragment in (
        "Todo tracking is updated. Next step: create `02-installation.html`.",
        "Prefer one `write(file_path=..., content=...)` call",
        "Make your next response the concrete mutation tool call itself",
    ):
        assert expected_fragment in steering
4780
4781
@pytest.mark.asyncio
async def test_tool_batch_runner_todowrite_uses_observed_sibling_pattern_for_next_file(
    temp_dir: Path,
) -> None:
    """A previously read reference chapter should inform the next file name.

    The session history holds an assistant ``read`` call on
    ``fortran/chapters/01-introduction.html``. The nginx index exists but holds
    no chapter links. With "Write the introduction chapter" pending, the queued
    steering message is expected to name ``01-introduction.html``.
    """

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should not run in this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run in this scenario")

    # Reference guide whose chapter file appears in the message history below.
    reference_dir = temp_dir / "fortran" / "chapters"
    reference_dir.mkdir(parents=True)
    reference_intro = reference_dir / "01-introduction.html"
    reference_intro.write_text("<h1>Introduction</h1>\n")

    nginx_root = temp_dir / "guides" / "nginx"
    chapter_dir = nginx_root / "chapters"
    nginx_root.mkdir(parents=True)
    chapter_dir.mkdir()
    index_file = nginx_root / "index.html"
    index_file.write_text("<html></html>\n")

    plan_file = temp_dir / "implementation.md"
    plan_lines = [
        "# Implementation Plan",
        "",
        "## File Changes",
        f"- `{nginx_root}/`",
        f"- `{chapter_dir}/`",
        f"- `{index_file}`",
    ]
    plan_file.write_text("".join(f"{line}\n" for line in plan_lines))

    dod = create_definition_of_done("Create a multi-file nginx guide.")
    dod.implementation_plan = str(plan_file)
    dod.pending_items = [
        "Write the introduction chapter",
        "Complete the requested work",
    ]
    dod.touched_files.append(str(index_file))

    prior_read = Message(
        role=Role.ASSISTANT,
        content="",
        tool_calls=[
            ToolCall(
                id="read-ref-1",
                name="read",
                arguments={"file_path": str(reference_intro)},
            )
        ],
    )
    queued_messages: list[str] = []
    runtime_context = build_context(
        temp_dir=temp_dir,
        messages=[prior_read],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )
    runtime_context.queue_steering_message_callback = queued_messages.append
    runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))

    todo_content = "Write the introduction chapter"
    todo_active = "Writing the introduction chapter"
    tool_call = ToolCall(
        id="todo-observed-1",
        name="TodoWrite",
        arguments={
            "todos": [
                {"content": todo_content, "activeForm": todo_active, "status": "pending"}
            ]
        },
    )
    outcome = tool_outcome(
        tool_call=tool_call,
        output="Todos updated",
        is_error=False,
        metadata={
            "new_todos": [
                {"content": todo_content, "active_form": todo_active, "status": "pending"}
            ]
        },
    )
    executor = FakeExecutor([outcome])

    await runner.execute_batch(
        tool_calls=[tool_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=TurnSummary(final_response=""),
        dod=dod,
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert queued_messages
    steering = queued_messages[-1]
    for expected_fragment in (
        "Todo tracking is updated. Next step: create `01-introduction.html`.",
        "Prefer one `write(file_path=..., content=...)` call",
    ):
        assert expected_fragment in steering
4910
4911
@pytest.mark.asyncio
async def test_tool_batch_runner_bookkeeping_note_with_missing_artifact_requeues_resume_step(
    temp_dir: Path,
) -> None:
    """A working note written while a planned file is missing should re-steer.

    The plan declares two chapters but only ``01-getting-started.html`` exists.
    After a ``notepad_write_working`` call, the queued steering message is
    expected to tell the agent to resume by creating ``02-installation.html``
    instead of spending the turn on more bookkeeping.
    """

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should not run in this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run in this scenario")

    nginx_root = temp_dir / "guides" / "nginx"
    chapter_dir = nginx_root / "chapters"
    nginx_root.mkdir(parents=True)
    chapter_dir.mkdir()
    index_file = nginx_root / "index.html"
    first_chapter = chapter_dir / "01-getting-started.html"
    second_chapter = chapter_dir / "02-installation.html"  # planned, never written
    index_file.write_text("<html></html>\n")
    first_chapter.write_text("<h1>One</h1>\n")

    plan_file = temp_dir / "implementation.md"
    declared_entries = [
        f"{nginx_root}/",
        f"{chapter_dir}/",
        f"{index_file}",
        f"{first_chapter}",
        f"{second_chapter}",
    ]
    plan_file.write_text(
        "\n".join(
            [
                "# Implementation Plan",
                "",
                "## File Changes",
                *(f"- `{entry}`" for entry in declared_entries),
                "",
            ]
        )
    )

    queued_messages: list[str] = []
    runtime_context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )
    runtime_context.queue_steering_message_callback = queued_messages.append
    runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))
    dod = create_definition_of_done("Create a multi-file nginx guide.")
    dod.implementation_plan = str(plan_file)
    tracked_todos = [
        {"content": content, "active_form": active, "status": status}
        for content, active, status in (
            (
                "Create 01-getting-started.html",
                "Creating 01-getting-started.html",
                "completed",
            ),
            ("Create 02-installation.html", "Creating 02-installation.html", "pending"),
        )
    ]
    sync_todos_to_definition_of_done(dod, tracked_todos, project_root=temp_dir)
    dod.touched_files.extend([str(index_file), str(first_chapter)])

    tool_call = ToolCall(
        id="working-note",
        name="notepad_write_working",
        arguments={"content": "Creating the second chapter file: Installation"},
    )
    outcome = tool_outcome(
        tool_call=tool_call,
        output="Working note recorded",
        is_error=False,
    )
    executor = FakeExecutor([outcome])

    await runner.execute_batch(
        tool_calls=[tool_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=TurnSummary(final_response=""),
        dod=dod,
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert queued_messages
    steering = queued_messages[-1]
    for expected_fragment in (
        "Bookkeeping note is recorded. A declared output artifact is still missing.",
        "Resume by creating `02-installation.html` now.",
        "Make your next response the concrete mutation tool call itself",
        "refresh `TodoWrite`",
        "Do not spend the next turn on additional notes, rediscovery, verification, or final confirmation",
    ):
        assert expected_fragment in steering
5026
5027
@pytest.mark.asyncio
async def test_tool_batch_runner_working_note_respects_discovery_first_pending_step(
    temp_dir: Path,
) -> None:
    """A working note must not jump ahead of a pending discovery step.

    Nothing from the plan exists on disk, and the pending items added to the DoD
    begin with an "examine the existing fortran guide" step. After a
    ``notepad_write_working`` call, the queued steering message is expected to
    continue with that item and ask for an evidence-gathering tool call rather
    than telling the agent to create ``index.html``.
    """

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run in this scenario")

    # Paths are only named in the plan; nothing is created on disk.
    nginx_root = temp_dir / "guides" / "nginx"
    plan_file = temp_dir / "implementation.md"
    plan_file.write_text(
        "# Implementation Plan\n"
        "\n"
        "## File Changes\n"
        f"- `{nginx_root / 'index.html'}`\n"
        f"- `{nginx_root / 'chapters'}`\n"
    )

    queued_messages: list[str] = []
    runtime_context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )
    runtime_context.queue_steering_message_callback = queued_messages.append
    runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))
    dod = create_definition_of_done("Create a multi-file nginx guide.")
    dod.implementation_plan = str(plan_file)
    discovery_item = (
        "First, examine the existing fortran guide structure and content to understand the format"
    )
    dod.pending_items.extend(
        [
            discovery_item,
            "Create the nginx directory structure",
            "Develop the main index.html file for the nginx guide",
        ]
    )

    tool_call = ToolCall(
        id="working-note",
        name="notepad_write_working",
        arguments={"content": "Analyzing the fortran guide structure before creating nginx guide"},
    )
    outcome = tool_outcome(
        tool_call=tool_call,
        output="Working note recorded",
        is_error=False,
    )
    executor = FakeExecutor([outcome])

    await runner.execute_batch(
        tool_calls=[tool_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=TurnSummary(final_response=""),
        dod=dod,
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert queued_messages
    steering = queued_messages[-1]
    assert f"Continue with the next pending item: `{discovery_item}`." in steering
    assert "one concrete evidence-gathering tool call" in steering
    assert "Resume by creating `index.html` now." not in steering
5120
5121
@pytest.mark.asyncio
async def test_tool_batch_runner_working_note_prefers_declared_output_gap_over_stale_discovery(
    temp_dir: Path,
) -> None:
    """A missing linked chapter should outrank a leftover discovery todo.

    The index and first chapter exist and are recorded as touched files; the
    index also links to chapters 02 and 03, which are absent. A discovery-style
    item is still in the pending list. After a ``notepad_write_working`` call,
    the queued steering message is expected to resume with
    ``02-installation.html`` and not fall back to the discovery item.
    """

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run in this scenario")

    nginx_root = temp_dir / "guides" / "nginx"
    chapter_dir = nginx_root / "chapters"
    chapter_dir.mkdir(parents=True)
    index_file = nginx_root / "index.html"
    intro_file = chapter_dir / "01-introduction.html"
    # No trailing newline after the last link, matching the joined original.
    index_file.write_text(
        '<a href="chapters/01-introduction.html">Introduction</a>\n'
        '<a href="chapters/02-installation.html">Installation</a>\n'
        '<a href="chapters/03-configuration.html">Configuration</a>'
    )
    intro_file.write_text("<h1>Introduction</h1>\n")

    plan_file = temp_dir / "implementation.md"
    plan_file.write_text(
        "# Implementation Plan\n"
        "\n"
        "## File Changes\n"
        f"- `{nginx_root / 'index.html'}`\n"
        f"- `{chapter_dir}/`\n"
    )

    queued_messages: list[str] = []
    runtime_context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )
    runtime_context.queue_steering_message_callback = queued_messages.append
    runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))
    dod = create_definition_of_done("Create a multi-file nginx guide.")
    dod.implementation_plan = str(plan_file)
    dod.pending_items.extend(
        [
            "First, examine the existing fortran guide structure and content to understand the format",
            "Create chapter files following the established pattern",
        ]
    )
    dod.touched_files.extend([str(index_file), str(intro_file)])

    tool_call = ToolCall(
        id="working-note",
        name="notepad_write_working",
        arguments={"content": "Created index and first chapter; next is chapter 2"},
    )
    outcome = tool_outcome(
        tool_call=tool_call,
        output="Working note recorded",
        is_error=False,
    )
    executor = FakeExecutor([outcome])

    await runner.execute_batch(
        tool_calls=[tool_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=TurnSummary(final_response=""),
        dod=dod,
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert queued_messages
    steering = queued_messages[-1]
    for expected_fragment in (
        "Bookkeeping note is recorded. A declared output artifact is still missing.",
        "Resume by creating `02-installation.html` now.",
    ):
        assert expected_fragment in steering
    assert (
        "Continue with the next pending item: `First, examine the existing fortran guide structure"
        not in steering
    )
5227
5228
@pytest.mark.asyncio
async def test_tool_batch_runner_shallow_glob_does_not_handoff_before_content_read(
    temp_dir: Path,
) -> None:
    """A glob whose output only lists reference directories queues no steering message.

    The batch runs a single successful ``glob`` over the fortran reference
    tree (which contains only a ``chapters`` directory) while the definition
    of done still has pending items; the steering queue must stay empty.
    """

    # Both callbacks fail loudly if the runner ever invokes them.
    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run in this scenario")

    # Reference guide on disk: a root with an empty chapters directory.
    fortran_root = temp_dir / "Loader" / "guides" / "fortran"
    chapters_dir = fortran_root / "chapters"
    chapters_dir.mkdir(parents=True)

    # Implementation plan declaring the nginx artifacts still to be built.
    plan_lines = [
        "# Implementation Plan",
        "",
        "## File Changes",
        f"- `{temp_dir / 'Loader' / 'guides' / 'nginx' / 'index.html'}`",
        f"- `{temp_dir / 'Loader' / 'guides' / 'nginx' / 'chapters'}`",
        "",
    ]
    implementation_plan = temp_dir / "implementation.md"
    implementation_plan.write_text("\n".join(plan_lines))

    runtime_context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )
    steering_messages: list[str] = []
    runtime_context.queue_steering_message_callback = steering_messages.append
    runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))

    dod = create_definition_of_done("Create a multi-file nginx guide.")
    dod.implementation_plan = str(implementation_plan)
    dod.pending_items.extend(
        [
            "First, examine the existing fortran guide structure and content",
            "Create the nginx directory structure",
            "Develop the main index.html file for nginx guide",
        ]
    )

    glob_call = ToolCall(
        id="glob-1",
        name="glob",
        arguments={"pattern": "**", "path": str(fortran_root)},
    )
    glob_outcome = tool_outcome(
        tool_call=glob_call,
        output=f"{fortran_root}\n{chapters_dir}",
        is_error=False,
    )
    executor = FakeExecutor([glob_outcome])

    summary = TurnSummary(final_response="")
    await runner.execute_batch(
        tool_calls=[glob_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=summary,
        dod=dod,
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert steering_messages == []
5318
5319
@pytest.mark.asyncio
async def test_tool_batch_runner_hands_off_noop_toc_edit_when_file_is_already_valid(
    temp_dir: Path,
) -> None:
    """A blocked identical-string edit of an already-correct TOC queues no steering message.

    ``index.html`` on disk already links both chapter files with their
    headings. The assistant submits an ``edit`` whose old and new strings are
    the same block, and the executor reports it as blocked.
    """

    # Both callbacks fail loudly if the runner ever invokes them.
    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should not run in this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run in this scenario")

    prompt = (
        "Have a look at ~/Loader/guides/fortran/index.html, then "
        "~/Loader/guides/fortran/chapters. The table of contents links in "
        "index.html are inaccurate and the href’s are wrong. Let’s update the "
        "links and their link texts to be correct."
    )

    # Two chapter files whose headings match the link texts in the index.
    chapters = temp_dir / "chapters"
    chapters.mkdir()
    intro_chapter = chapters / "01-introduction.html"
    intro_chapter.write_text("<h1>Chapter 1: Introduction to Fortran</h1>\n")
    setup_chapter = chapters / "02-setup.html"
    setup_chapter.write_text("<h1>Chapter 2: Setting Up Your Environment</h1>\n")

    current_block = (
        "<h2>Table of Contents</h2>\n"
        ' <ul class="chapter-list">\n'
        ' <li><a href="chapters/01-introduction.html">Chapter 1: Introduction to Fortran</a></li>\n'
        ' <li><a href="chapters/02-setup.html">Chapter 2: Setting Up Your Environment</a></li>\n'
        " </ul>\n"
    )
    index_path = temp_dir / "index.html"
    index_path.write_text(current_block)

    runtime_context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )
    runtime_context.session.current_task = prompt  # type: ignore[attr-defined]
    steering_messages: list[str] = []
    runtime_context.queue_steering_message_callback = steering_messages.append
    runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))

    # The edit replaces the block with itself, so nothing would change on disk.
    noop_edit = ToolCall(
        id="edit-1",
        name="edit",
        arguments={
            "file_path": str(index_path),
            "old_string": current_block,
            "new_string": current_block,
        },
    )
    blocked_outcome = tool_outcome(
        tool_call=noop_edit,
        output=(
            "[Blocked - old_string and new_string are identical - no change "
            "would occur] Suggestion: Provide different old and new strings"
        ),
        is_error=True,
        state=ToolExecutionState.BLOCKED,
    )
    executor = FakeExecutor([blocked_outcome])

    summary = TurnSummary(final_response="")
    dod = create_definition_of_done(prompt)
    await runner.execute_batch(
        tool_calls=[noop_edit],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=summary,
        dod=dod,
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert steering_messages == []
5413
5414
def test_tool_batch_runner_blocked_noop_edit_nudge_stays_on_active_repair_target(
    temp_dir: Path,
) -> None:
    """The blocked no-op edit nudge names the repair target from the repair-focus message.

    The session holds an assistant "Repair focus" message pointing at one
    chapter file; a blocked identical-string edit on that file must queue a
    nudge that mentions the same path and suggests replacing the surrounding
    block.
    """

    # Both callbacks fail loudly if the runner ever invokes them.
    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run in this scenario")

    repair_target = temp_dir / "guide" / "chapters" / "04-basic-usage.html"
    repair_focus = Message(
        role=Role.ASSISTANT,
        content=(
            "Repair focus:\n"
            f"- Fix the broken local reference `05-advanced-topics.html` in `{repair_target}`.\n"
            f"- Immediate next step: edit `{repair_target}`.\n"
            f"- If the broken reference should remain, create `{temp_dir / 'guide' / 'chapters' / '05-advanced-topics.html'}`; otherwise remove or replace `05-advanced-topics.html`.\n"
        ),
    )
    runtime_context = build_context(
        temp_dir=temp_dir,
        messages=[repair_focus],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
    )
    queued: list[str] = []
    runtime_context.queue_steering_message_callback = queued.append
    runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))
    dod = create_definition_of_done("Repair a guide page.")

    noop_edit = ToolCall(
        id="edit-1",
        name="edit",
        arguments={
            "file_path": str(repair_target),
            "old_string": "same",
            "new_string": "same",
        },
    )
    runner._queue_blocked_html_edit_nudge(
        noop_edit,
        "[Blocked - old_string and new_string are identical - no change would occur] Suggestion: Provide different old and new strings",
        dod=dod,
    )

    assert queued
    nudge = queued[0]
    assert str(repair_target) in nudge
    assert "no on-disk change" in nudge
    assert "replace the surrounding block" in nudge
    assert "Do not reopen unrelated reference materials" in nudge
5475
5476
def test_tool_batch_runner_blocked_noop_edit_after_full_build_prefers_verification(
    temp_dir: Path,
) -> None:
    """With every planned file on disk, a blocked no-op edit nudge points to verification.

    Both files listed in the implementation plan exist and are recorded as
    touched; the queued nudge must say the planned artifacts exist and must
    not suggest replacing the surrounding block.
    """

    # Both callbacks fail loudly if the runner ever invokes them.
    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run in this scenario")

    # Build the complete guide on disk.
    guide_root = temp_dir / "guide"
    chapters = guide_root / "chapters"
    chapters.mkdir(parents=True)
    index_path = guide_root / "index.html"
    chapter_one = chapters / "01-introduction.html"
    index_path.write_text("<html></html>\n")
    chapter_one.write_text("<html></html>\n")

    plan_lines = [
        "# Implementation Plan",
        "",
        "## File Changes",
        f"- `{index_path}`",
        f"- `{chapter_one}`",
        "",
    ]
    implementation_plan = temp_dir / "implementation.md"
    implementation_plan.write_text("\n".join(plan_lines))

    repair_focus = Message(
        role=Role.ASSISTANT,
        content=(
            "Repair focus:\n"
            f"- Confirm the final guide state in `{index_path}`.\n"
            f"- Immediate next step: verify `{index_path}` if no concrete mismatch remains.\n"
        ),
    )
    runtime_context = build_context(
        temp_dir=temp_dir,
        messages=[repair_focus],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
    )
    queued: list[str] = []
    runtime_context.queue_steering_message_callback = queued.append
    runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))

    dod = create_definition_of_done("Create a multi-file guide.")
    dod.implementation_plan = str(implementation_plan)
    dod.touched_files.extend([str(index_path), str(chapter_one)])
    dod.verification_commands = [f"ls -la {guide_root}"]

    noop_edit = ToolCall(
        id="edit-1",
        name="edit",
        arguments={
            "file_path": str(index_path),
            "old_string": "same",
            "new_string": "same",
        },
    )
    runner._queue_blocked_html_edit_nudge(
        noop_edit,
        "[Blocked - old_string and new_string are identical - no change would occur] Suggestion: Provide different old and new strings",
        dod=dod,
    )

    assert queued
    nudge = queued[0]
    assert "All explicitly planned artifacts already exist." in nudge
    assert "Move to verification or final confirmation using the files already on disk." in nudge
    assert "replace the surrounding block" not in nudge
5560
5561
5562 async def _noop_emit(event: AgentEvent) -> None:
5563 return None
5564
5565
@pytest.mark.asyncio
async def test_tool_batch_runner_marks_verification_planned_after_new_mutation(
    temp_dir: Path,
) -> None:
    """A successful ``write`` marks verification as planned on the definition of done.

    After the batch, the DoD carries a first verification attempt and the
    last workflow-timeline entry records a ``verification_planned``
    observation for that attempt.
    """

    # Both callbacks fail loudly if the runner ever invokes them.
    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run for this scenario")

    runtime_context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
    )
    runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))

    write_call = ToolCall(
        id="write-1",
        name="write",
        arguments={"file_path": str(temp_dir / "README.md"), "content": "updated\n"},
    )
    write_outcome = tool_outcome(tool_call=write_call, output="wrote file", is_error=False)
    executor = FakeExecutor([write_outcome])
    summary = TurnSummary(final_response="")
    dod = create_definition_of_done("Update README and verify it still works.")
    events: list[AgentEvent] = []

    async def emit(event: AgentEvent) -> None:
        events.append(event)

    await runner.execute_batch(
        tool_calls=[write_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=emit,
        summary=summary,
        dod=dod,
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    # State recorded on the definition of done.
    assert dod.last_verification_result == "planned"
    assert dod.verification_commands
    assert "Collect verification evidence" in dod.pending_items
    assert dod.active_verification_attempt_id == "verification-attempt-1"
    assert dod.active_verification_attempt_number == 1

    # State recorded on the most recent timeline entry.
    latest_entry = summary.workflow_timeline[-1]
    assert latest_entry.reason_code == "verification_planned"
    assert latest_entry.policy_outcome == "planned"
    observation = latest_entry.verification_observations[0]
    assert observation.status == "planned"
    assert observation.attempt_id == "verification-attempt-1"
    assert observation.attempt_number == 1
5637
5638
@pytest.mark.asyncio
async def test_tool_batch_runner_does_not_mark_verification_planned_after_setup_only_mkdir(
    temp_dir: Path,
) -> None:
    """A successful ``mkdir -p`` bash call leaves verification unplanned.

    After the batch the DoD has no verification result, no "Collect
    verification evidence" pending item, and no ``verification_planned``
    timeline entry.
    """

    # Both callbacks fail loudly if the runner ever invokes them.
    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run in this scenario")

    runtime_context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
    )
    runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))

    nginx_root = temp_dir / "Loader" / "guides" / "nginx"
    chapters = nginx_root / "chapters"
    plan_lines = [
        "# Implementation Plan",
        "",
        "## File Changes",
        f"- `{chapters}/`",
        f"- `{nginx_root / 'index.html'}`",
        "",
    ]
    implementation_plan = temp_dir / "implementation.md"
    implementation_plan.write_text("\n".join(plan_lines))

    mkdir_call = ToolCall(
        id="mkdir-1",
        name="bash",
        arguments={"command": f"mkdir -p {chapters}"},
    )
    mkdir_outcome = tool_outcome(tool_call=mkdir_call, output="", is_error=False)
    executor = FakeExecutor([mkdir_outcome])
    summary = TurnSummary(final_response="")
    dod = create_definition_of_done("Create an equally thorough nginx guide with chapters.")
    dod.implementation_plan = str(implementation_plan)
    events: list[AgentEvent] = []

    async def emit(event: AgentEvent) -> None:
        events.append(event)

    await runner.execute_batch(
        tool_calls=[mkdir_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=emit,
        summary=summary,
        dod=dod,
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert dod.last_verification_result is None
    assert "Collect verification evidence" not in dod.pending_items
    planned_entries = [
        entry
        for entry in summary.workflow_timeline
        if entry.reason_code == "verification_planned"
    ]
    assert not planned_entries
5717
5718
@pytest.mark.asyncio
async def test_tool_batch_runner_does_not_mark_verification_planned_while_chapter_build_pending(
    temp_dir: Path,
) -> None:
    """Writing ``index.html`` while a chapter item is still pending leaves verification unplanned.

    The chapter pending item must survive the batch, and neither the DoD nor
    the workflow timeline may record a planned verification.
    """

    # Both callbacks fail loudly if the runner ever invokes them.
    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run in this scenario")

    runtime_context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
    )
    runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))

    # The guide directories exist; the index file itself is not created on disk here.
    nginx_root = temp_dir / "Loader" / "guides" / "nginx"
    chapters = nginx_root / "chapters"
    chapters.mkdir(parents=True)
    index_path = nginx_root / "index.html"

    plan_lines = [
        "# Implementation Plan",
        "",
        "## File Changes",
        f"- `{nginx_root}/`",
        f"- `{chapters}/`",
        f"- `{index_path}`",
        "",
    ]
    implementation_plan = temp_dir / "implementation.md"
    implementation_plan.write_text("\n".join(plan_lines))

    write_call = ToolCall(
        id="write-index",
        name="write",
        arguments={"file_path": str(index_path), "content": "<html></html>\n"},
    )
    write_outcome = tool_outcome(tool_call=write_call, output="wrote file", is_error=False)
    executor = FakeExecutor([write_outcome])
    summary = TurnSummary(final_response="")
    dod = create_definition_of_done("Create a multi-file nginx guide.")
    dod.implementation_plan = str(implementation_plan)
    dod.pending_items.extend(
        [
            "Develop the main index.html file with proper structure",
            "Create first nginx chapter",
        ]
    )
    events: list[AgentEvent] = []

    async def emit(event: AgentEvent) -> None:
        events.append(event)

    await runner.execute_batch(
        tool_calls=[write_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=emit,
        summary=summary,
        dod=dod,
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert dod.last_verification_result is None
    assert "Collect verification evidence" not in dod.pending_items
    assert "Create first nginx chapter" in dod.pending_items
    planned_entries = [
        entry
        for entry in summary.workflow_timeline
        if entry.reason_code == "verification_planned"
    ]
    assert not planned_entries
5807
5808
@pytest.mark.asyncio
async def test_tool_batch_runner_marks_passed_verification_stale_after_new_mutation(
    temp_dir: Path,
) -> None:
    """A new ``write`` after a passed verification marks that verification stale.

    The DoD starts with one passed attempt and its evidence. After the batch
    the evidence is cleared, the evidence item returns to pending, the active
    attempt advances to number 2, and the last timeline entry records a stale
    observation for attempt 1.
    """

    # Both callbacks fail loudly if the runner ever invokes them.
    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run for this scenario")

    runtime_context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
    )
    runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))

    write_call = ToolCall(
        id="write-1",
        name="write",
        arguments={"file_path": str(temp_dir / "README.md"), "content": "updated\n"},
    )
    write_outcome = tool_outcome(tool_call=write_call, output="wrote file", is_error=False)
    executor = FakeExecutor([write_outcome])
    summary = TurnSummary(final_response="")

    # Seed the DoD as if verification attempt 1 had already passed.
    dod = create_definition_of_done("Update README and verify it still works.")
    dod.verification_commands = ["uv run pytest -q"]
    dod.last_verification_result = "passed"
    dod.verification_attempt_counter = 1
    dod.active_verification_attempt_id = "verification-attempt-1"
    dod.active_verification_attempt_number = 1
    passed_evidence = VerificationEvidence(
        command="uv run pytest -q",
        passed=True,
        stdout="401 passed",
        kind="test",
    )
    dod.evidence = [passed_evidence]
    dod.completed_items.append("Collect verification evidence")
    events: list[AgentEvent] = []

    async def emit(event: AgentEvent) -> None:
        events.append(event)

    await runner.execute_batch(
        tool_calls=[write_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=emit,
        summary=summary,
        dod=dod,
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    # State recorded on the definition of done.
    assert dod.last_verification_result == "stale"
    assert dod.evidence == []
    assert "Collect verification evidence" in dod.pending_items
    assert "Collect verification evidence" not in dod.completed_items
    assert dod.active_verification_attempt_id == "verification-attempt-2"
    assert dod.active_verification_attempt_number == 2

    # State recorded on the most recent timeline entry.
    latest_entry = summary.workflow_timeline[-1]
    assert latest_entry.reason_code == "verification_stale"
    assert latest_entry.policy_outcome == "stale"
    observation = latest_entry.verification_observations[0]
    assert observation.status == "stale"
    assert observation.attempt_id == "verification-attempt-1"
    assert observation.attempt_number == 1
    assert observation.supersedes_attempt_id == "verification-attempt-2"
    assert observation.command == "uv run pytest -q"
5903
5904
def test_tool_batch_runner_blocked_active_repair_nudge_uses_repair_scope(temp_dir: Path) -> None:
    """The active-repair nudge names both paths from the repair-focus message.

    The queued message must mention the file to edit and the missing chapter
    the repair-focus message offers to create.
    """

    # Both callbacks fail loudly if the runner ever invokes them.
    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run in this scenario")

    repair_target = temp_dir / "guide" / "index.html"
    repair_focus = Message(
        role=Role.ASSISTANT,
        content=(
            "Repair focus:\n"
            f"- Fix the broken local reference `chapters/01-getting-started.html` in `{repair_target}`.\n"
            f"- Immediate next step: edit `{repair_target}`.\n"
            f"- If the broken reference should remain, create `{temp_dir / 'guide' / 'chapters' / '01-getting-started.html'}`; otherwise remove or replace `chapters/01-getting-started.html`.\n"
        ),
    )
    runtime_context = build_context(
        temp_dir=temp_dir,
        messages=[repair_focus],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
    )
    queued: list[str] = []
    runtime_context.queue_steering_message_callback = queued.append
    runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))

    runner._queue_blocked_active_repair_nudge(
        "[Blocked - active repair scope: verification already identified the repair target.]"
    )

    assert queued
    nudge = queued[0]
    assert str(repair_target) in nudge
    assert str(temp_dir / "guide" / "chapters" / "01-getting-started.html") in nudge
    assert "Do not reopen unrelated reference materials" in nudge
5951
5952
def test_tool_batch_runner_blocked_active_repair_mutation_nudge_uses_allowed_paths(
    temp_dir: Path,
) -> None:
    """The active-repair mutation nudge lists the paths named in the repair-focus message.

    Both the chapter to edit and the stylesheet that may be created must
    appear in the queued message.
    """

    # Both callbacks fail loudly if the runner ever invokes them.
    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run in this scenario")

    guide_root = temp_dir / "guide"
    repair_target = guide_root / "chapters" / "05-advanced-configurations.html"
    stylesheet = guide_root / "styles.css"
    repair_focus = Message(
        role=Role.ASSISTANT,
        content=(
            "Repair focus:\n"
            f"- Fix the broken local reference `../styles.css` in `{repair_target}`.\n"
            f"- Immediate next step: edit `{repair_target}`.\n"
            f"- If the broken reference should remain, create `{stylesheet}`; otherwise remove or replace `../styles.css`.\n"
        ),
    )
    runtime_context = build_context(
        temp_dir=temp_dir,
        messages=[repair_focus],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
    )
    queued: list[str] = []
    runtime_context.queue_steering_message_callback = queued.append
    runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))

    runner._queue_blocked_active_repair_mutation_nudge(
        "[Blocked - active repair mutation scope: verification already identified the repair target.]"
    )

    assert queued
    nudge = queued[0]
    assert str(repair_target) in nudge
    assert str(stylesheet) in nudge
    assert "before widening the change set" in nudge
6002
6003
def test_tool_batch_runner_blocked_late_reference_drift_nudge_points_to_missing_artifact(
    temp_dir: Path,
) -> None:
    """The late-reference-drift nudge names the planned file that is still missing.

    Three of the four files in the plan exist on disk; the queued message
    must mention the fourth, ``03-first-website.html``.
    """

    # Both callbacks fail loudly if the runner ever invokes them.
    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run in this scenario")

    runtime_context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
    )
    queued: list[str] = []
    runtime_context.queue_steering_message_callback = queued.append
    store = DefinitionOfDoneStore(temp_dir)
    dod = create_definition_of_done("Create a multi-file guide from a reference")

    # The plan lists four artifacts by relative path.
    plan_path = temp_dir / "implementation.md"
    plan_path.write_text(
        "# File Changes\n"
        "- `guide/index.html`\n"
        "- `guide/chapters/01-getting-started.html`\n"
        "- `guide/chapters/02-installation.html`\n"
        "- `guide/chapters/03-first-website.html`\n"
    )
    dod.implementation_plan = str(plan_path)

    # Only the first three artifacts exist on disk.
    guide_root = temp_dir / "guide"
    chapters = guide_root / "chapters"
    chapters.mkdir(parents=True, exist_ok=True)
    (guide_root / "index.html").write_text("index")
    (chapters / "01-getting-started.html").write_text("one")
    (chapters / "02-installation.html").write_text("two")
    runner = ToolBatchRunner(runtime_context, store)

    runner._queue_blocked_late_reference_drift_nudge(
        "[Blocked - late reference drift: several planned artifacts already exist.]",
        dod=dod,
    )

    assert queued
    nudge = queued[0]
    assert "03-first-website.html" in nudge
    assert "older reference materials" in nudge
6056
6057
def test_tool_batch_runner_blocked_completed_artifact_scope_nudge_prefers_verification(
    temp_dir: Path,
) -> None:
    """With all planned artifacts on disk, the completed-artifact nudge moves to verification.

    The context's workflow mode must become ``"verify"`` and the queued
    message must mention the pending verification todo synced into the DoD.
    """

    # Both callbacks fail loudly if the runner ever invokes them.
    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run in this scenario")

    # Every artifact named in the plan below is created on disk.
    guide_root = temp_dir / "guide"
    chapters = guide_root / "chapters"
    guide_root.mkdir(parents=True)
    chapters.mkdir()
    index_path = guide_root / "index.html"
    chapter_one = chapters / "01-getting-started.html"
    chapter_two = chapters / "02-installation.html"
    index_path.write_text("index")
    chapter_one.write_text("one")
    chapter_two.write_text("two")

    plan_lines = [
        "# Implementation Plan",
        "",
        "## File Changes",
        f"- `{guide_root}`",
        f"- `{chapters}`",
        f"- `{index_path}`",
        f"- `{chapter_one}`",
        f"- `{chapter_two}`",
        "",
    ]
    implementation_plan = temp_dir / "implementation.md"
    implementation_plan.write_text("\n".join(plan_lines))

    runtime_context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
    )
    queued: list[str] = []
    runtime_context.queue_steering_message_callback = queued.append
    runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))

    dod = create_definition_of_done("Create a multi-file guide from a reference")
    dod.implementation_plan = str(implementation_plan)
    dod.verification_commands = [f"ls -la {guide_root}"]
    verification_todo = {
        "content": "Verify all guide files are linked and complete",
        "active_form": "Working on: Verify all guide files are linked and complete",
        "status": "pending",
    }
    sync_todos_to_definition_of_done(
        dod,
        [verification_todo],
        project_root=temp_dir,
    )

    runner._queue_blocked_completed_artifact_scope_nudge(
        "[Blocked - completed artifact set scope: all explicitly planned artifacts already exist.]",
        dod=dod,
    )

    assert queued
    assert runtime_context.workflow_mode == "verify"
    nudge = queued[0]
    assert "All explicitly planned artifacts already exist." in nudge
    assert "Verify all guide files are linked and complete" in nudge
    assert "Do not reopen earlier reference materials." in nudge
    assert "Verification should run next" in nudge
6140
6141
def test_tool_batch_runner_blocked_post_build_audit_nudge_switches_to_verify(
    temp_dir: Path,
) -> None:
    """A post-build audit-loop block switches the workflow mode to ``"verify"``.

    All planned artifacts exist on disk and the DoD has no extra pending
    todos; the queued message must point to verification or final
    confirmation.
    """

    # Both callbacks fail loudly if the runner ever invokes them.
    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run in this scenario")

    # Every artifact named in the plan below is created on disk.
    guide_root = temp_dir / "guide"
    chapters = guide_root / "chapters"
    guide_root.mkdir(parents=True)
    chapters.mkdir()
    index_path = guide_root / "index.html"
    chapter_one = chapters / "01-getting-started.html"
    chapter_two = chapters / "02-installation.html"
    index_path.write_text("index")
    chapter_one.write_text("one")
    chapter_two.write_text("two")

    plan_lines = [
        "# Implementation Plan",
        "",
        "## File Changes",
        f"- `{guide_root}`",
        f"- `{chapters}`",
        f"- `{index_path}`",
        f"- `{chapter_one}`",
        f"- `{chapter_two}`",
        "",
    ]
    implementation_plan = temp_dir / "implementation.md"
    implementation_plan.write_text("\n".join(plan_lines))

    runtime_context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
    )
    queued: list[str] = []
    runtime_context.queue_steering_message_callback = queued.append
    runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))

    dod = create_definition_of_done("Create a multi-file guide from a reference")
    dod.implementation_plan = str(implementation_plan)
    dod.verification_commands = [f"ls -la {guide_root}"]

    runner._queue_blocked_completed_artifact_scope_nudge(
        "[Blocked - post-build audit loop: all explicitly planned artifacts already exist.]",
        dod=dod,
    )

    assert queued
    assert runtime_context.workflow_mode == "verify"
    nudge = queued[0]
    assert "All explicitly planned artifacts already exist." in nudge
    assert "move to verification or final confirmation" in nudge
6211
6212
def test_tool_batch_runner_blocked_html_declared_target_nudge_uses_closest_declared_target(
    temp_dir: Path,
) -> None:
    """The declared-target nudge quotes the closest declared target from the block message.

    The block message names ``chapters/02-installation.html`` as the closest
    declared local target; the queued nudge must mention both it and the file
    being written.
    """

    # Both callbacks fail loudly if the runner ever invokes them.
    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run in this scenario")

    runtime_context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
    )
    queued: list[str] = []
    runtime_context.queue_steering_message_callback = queued.append
    runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))

    chapter_path = temp_dir / "guide" / "chapters" / "01-introduction.html"
    blocked_write = ToolCall(
        id="write-ch1",
        name="write",
        arguments={"file_path": str(chapter_path)},
    )
    block_message = (
        "[Blocked - HTML page introduces new local targets outside the current declared artifact set] "
        "Suggestion: Keep non-root HTML pages within the root-declared local-link set and avoid "
        "introducing new sibling targets that the guide root does not declare, for example fix: 02-setup.html. "
        "Already-declared local targets include: chapters/01-introduction.html, chapters/02-installation.html, "
        "chapters/03-configuration.html. Closest declared local targets include: chapters/02-installation.html"
    )
    runner._queue_blocked_html_declared_target_nudge(blocked_write, block_message)

    assert queued
    nudge = queued[0]
    assert str(temp_dir / "guide" / "chapters" / "01-introduction.html") in nudge
    assert "`chapters/02-installation.html`" in nudge
    assert "same file now" in nudge
6261
6262
def test_tool_batch_runner_blocked_html_declared_target_nudge_without_close_match(
    temp_dir: Path,
) -> None:
    """Check the declared-target nudge when the block message has no closest-target clause.

    The queued steering message must tell the model to remove the invented
    hrefs, quote an already-declared target, and must not contain the
    "closest declared target(s)" wording.
    """

    async def assess_confidence(tool_name: str, tool_args: dict, context: str) -> ConfidenceAssessment:
        # Fails the test if the runner ever asks for a confidence score.
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def verify_action(
        tool_name: str, tool_args: dict, result: str, expected: str = ""
    ) -> ActionVerification:
        # Fails the test if the runner ever asks for action verification.
        raise AssertionError("Verification should not run in this scenario")

    steering: list[str] = []
    runtime = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
    )
    runtime.queue_steering_message_callback = steering.append
    batch_runner = ToolBatchRunner(runtime, DefinitionOfDoneStore(temp_dir))

    blocked_write = ToolCall(
        id="write-ch1",
        name="write",
        arguments={"file_path": str(temp_dir / "guide" / "chapters" / "introduction.html")},
    )
    blocked_text = (
        "[Blocked - HTML page introduces new local targets outside the current declared artifact set] "
        "Suggestion: Keep non-root HTML pages within the root-declared local-link set and avoid "
        "introducing new sibling targets that the guide root does not declare; remove or replace "
        "undeclared hrefs like: troubleshooting.html. "
        "Already-declared local targets include: chapters/introduction.html, chapters/installation.html, "
        "chapters/configuration.html."
    )

    batch_runner._queue_blocked_html_declared_target_nudge(blocked_write, blocked_text)

    # Guard the index below: at least one steering message must have been queued.
    assert steering
    nudge = steering[0]
    assert "Remove the invented hrefs or keep local links within the declared target set" in nudge
    assert "`chapters/installation.html`" in nudge
    assert "closest declared target(s)" not in nudge
6312
6313
def test_tool_batch_runner_blocked_html_declared_file_creation_nudge_points_to_root(
    temp_dir: Path,
) -> None:
    """Check the file-creation nudge points back at the guide root.

    Nothing is created on disk here. The queued steering message must mention
    updating, include the resolved guide-root path from the block message,
    quote the undeclared page, and ask to retry the file creation.
    """

    async def assess_confidence(tool_name: str, tool_args: dict, context: str) -> ConfidenceAssessment:
        # Fails the test if the runner ever asks for a confidence score.
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def verify_action(
        tool_name: str, tool_args: dict, result: str, expected: str = ""
    ) -> ActionVerification:
        # Fails the test if the runner ever asks for action verification.
        raise AssertionError("Verification should not run in this scenario")

    steering: list[str] = []
    runtime = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
    )
    runtime.queue_steering_message_callback = steering.append
    batch_runner = ToolBatchRunner(runtime, DefinitionOfDoneStore(temp_dir))
    done = create_definition_of_done("Create a guide.")

    # Resolved once and reused for both the block message and the assertion.
    resolved_root = (temp_dir / "guide" / "index.html").resolve(strict=False)
    undeclared_page = temp_dir / "guide" / "chapters" / "troubleshooting.html"
    blocked_write = ToolCall(
        id="write-troubleshooting",
        name="write",
        arguments={"file_path": str(undeclared_page)},
    )
    blocked_text = (
        "[Blocked - HTML file creation falls outside the current declared artifact set] "
        "Suggestion: Keep new non-root HTML files within the root-declared artifact set and "
        f"update the guide root `{resolved_root}` "
        "before creating undeclared sibling pages, for example: chapters/troubleshooting.html. "
        "Already-declared local targets include: chapters/advanced-topics.html, "
        "chapters/basic-usage.html, chapters/configuration.html"
    )

    batch_runner._queue_blocked_html_declared_file_creation_nudge(
        blocked_write,
        blocked_text,
        dod=done,
    )

    # Guard the index below: at least one steering message must have been queued.
    assert steering
    nudge = steering[0]
    assert "update" in nudge.lower()
    assert str(resolved_root) in nudge
    assert "`chapters/troubleshooting.html`" in nudge
    assert "retry the file creation" in nudge
6367
6368
def test_tool_batch_runner_blocked_html_declared_file_creation_after_outputs_exist_prefers_verify(
    temp_dir: Path,
) -> None:
    """Check the file-creation nudge once every planned page is already on disk.

    The implementation plan lists three pages, all written before the nudge
    is requested. The queued steering message must say the planned artifacts
    exist, refuse to expand the output set with the extra chapter, direct the
    model to verification, and must not ask to update the guide root.
    """

    async def assess_confidence(tool_name: str, tool_args: dict, context: str) -> ConfidenceAssessment:
        # Fails the test if the runner ever asks for a confidence score.
        raise AssertionError("Confidence scoring should not run in this scenario")

    async def verify_action(
        tool_name: str, tool_args: dict, result: str, expected: str = ""
    ) -> ActionVerification:
        # Fails the test if the runner ever asks for action verification.
        raise AssertionError("Verification should not run in this scenario")

    # Lay out the guide: a root index linking to two chapter pages.
    guide_dir = temp_dir / "guide"
    chapter_dir = guide_dir / "chapters"
    guide_dir.mkdir()
    chapter_dir.mkdir()
    root_page = guide_dir / "index.html"
    intro_page = chapter_dir / "01-introduction.html"
    install_page = chapter_dir / "02-installation.html"
    root_links = [
        '<a href="chapters/01-introduction.html">Intro</a>',
        '<a href="chapters/02-installation.html">Install</a>',
        '<a href="../index.html">Back</a>',
        "",
    ]
    root_page.write_text("\n".join(root_links))
    for chapter_page in (intro_page, install_page):
        chapter_page.write_text("<html></html>\n")

    # The plan declares exactly the three pages that now exist.
    planned_pages = (root_page, intro_page, install_page)
    plan_lines = ["# Implementation Plan", "", "## File Changes"]
    plan_lines.extend(f"- `{page}`" for page in planned_pages)
    plan_lines.append("")
    plan_file = temp_dir / "implementation.md"
    plan_file.write_text("\n".join(plan_lines))

    steering: list[str] = []
    runtime = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
    )
    runtime.queue_steering_message_callback = steering.append
    batch_runner = ToolBatchRunner(runtime, DefinitionOfDoneStore(temp_dir))

    done = create_definition_of_done("Create a guide.")
    done.implementation_plan = str(plan_file)
    done.verification_commands = [f"ls -la {guide_dir}"]
    done.touched_files = [str(page) for page in planned_pages]

    extra_page = guide_dir / "chapters" / "08-advanced-configuration.html"
    blocked_write = ToolCall(
        id="write-extra",
        name="write",
        arguments={"file_path": str(extra_page)},
    )
    blocked_text = (
        "[Blocked - HTML file creation falls outside the current declared artifact set] "
        "Suggestion: Keep new non-root HTML files within the root-declared artifact set and "
        f"update the guide root `{root_page.resolve(strict=False)}` before creating undeclared sibling pages, "
        "for example: chapters/08-advanced-configuration.html."
    )

    batch_runner._queue_blocked_html_declared_file_creation_nudge(
        blocked_write,
        blocked_text,
        dod=done,
    )

    # Guard the index below: at least one steering message must have been queued.
    assert steering
    nudge = steering[0]
    assert "All explicitly planned artifacts already exist on disk." in nudge
    assert "Do not expand the output set with `chapters/08-advanced-configuration.html`." in nudge
    assert "Move to verification or final confirmation using the files already on disk." in nudge
    assert "update the guide root" not in nudge
6456
6457
def test_tool_batch_runner_blocked_html_missing_target_after_outputs_exist_prefers_verify(
    temp_dir: Path,
) -> None:
    """Check the missing-target nudge once every planned page is already on disk.

    An edit of the root index was blocked for linking to a file that does not
    exist. With all planned pages present, the queued steering message must
    say the planned artifacts exist, forbid new local-link targets, and ask
    for the existing files to be repaired instead of expanding the guide.
    """

    async def assess_confidence(tool_name: str, tool_args: dict, context: str) -> ConfidenceAssessment:
        # Fails the test if the runner ever asks for a confidence score.
        raise AssertionError("Confidence scoring should not run in this scenario")

    async def verify_action(
        tool_name: str, tool_args: dict, result: str, expected: str = ""
    ) -> ActionVerification:
        # Fails the test if the runner ever asks for action verification.
        raise AssertionError("Verification should not run in this scenario")

    # Lay out the guide: a root index linking to two chapter pages.
    guide_dir = temp_dir / "guide"
    chapter_dir = guide_dir / "chapters"
    guide_dir.mkdir()
    chapter_dir.mkdir()
    root_page = guide_dir / "index.html"
    intro_page = chapter_dir / "01-introduction.html"
    install_page = chapter_dir / "02-installation.html"
    root_links = [
        '<a href="chapters/01-introduction.html">Intro</a>',
        '<a href="chapters/02-installation.html">Install</a>',
        '<a href="../index.html">Back</a>',
        "",
    ]
    root_page.write_text("\n".join(root_links))
    for chapter_page in (intro_page, install_page):
        chapter_page.write_text("<html></html>\n")

    # The plan declares exactly the three pages that now exist.
    planned_pages = (root_page, intro_page, install_page)
    plan_lines = ["# Implementation Plan", "", "## File Changes"]
    plan_lines.extend(f"- `{page}`" for page in planned_pages)
    plan_lines.append("")
    plan_file = temp_dir / "implementation.md"
    plan_file.write_text("\n".join(plan_lines))

    steering: list[str] = []
    runtime = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
    )
    runtime.queue_steering_message_callback = steering.append
    batch_runner = ToolBatchRunner(runtime, DefinitionOfDoneStore(temp_dir))

    done = create_definition_of_done("Create a guide.")
    done.implementation_plan = str(plan_file)
    done.verification_commands = [f"ls -la {guide_dir}"]
    done.touched_files = [str(page) for page in planned_pages]

    blocked_edit = ToolCall(
        id="edit-root",
        name="edit",
        arguments={"file_path": str(root_page)},
    )
    blocked_text = (
        "[Blocked - Edited HTML links point to files that do not exist] "
        "Suggestion: Use only existing local targets for href values and avoid introducing missing links, "
        "for example fix: chapters/08-advanced-configuration.html"
    )

    batch_runner._queue_blocked_html_missing_target_nudge(
        blocked_edit,
        blocked_text,
        dod=done,
    )

    # Guard the index below: at least one steering message must have been queued.
    assert steering
    nudge = steering[0]
    assert "All explicitly planned artifacts already exist on disk." in nudge
    assert "Do not introduce new local-link targets beyond the current output set." in nudge
    assert "Repair the existing generated files instead of expanding the guide." in nudge
6542
6543
@pytest.mark.asyncio
async def test_tool_batch_runner_blocked_empty_file_path_nudges_concrete_next_artifact(
    temp_dir: Path,
) -> None:
    """Check that a blocked empty-`file_path` write is steered to the next planned file.

    The plan lists three pages; the index and chapter one exist on disk while
    chapter two is never written. After running a batch whose only outcome is
    a BLOCKED empty-path write, the queued steering message must name
    `02-installation.html` as the file to create and the blocked message must
    be recorded as the latest recovery attempt's error.
    """

    async def assess_confidence(tool_name: str, tool_args: dict, context: str) -> ConfidenceAssessment:
        # Fails the test if the runner ever asks for a confidence score.
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def verify_action(
        tool_name: str, tool_args: dict, result: str, expected: str = ""
    ) -> ActionVerification:
        # Fails the test if the runner ever asks for action verification.
        raise AssertionError("Verification should not run in this scenario")

    guide_root = temp_dir / "guides" / "nginx"
    chapters = guide_root / "chapters"
    chapters.mkdir(parents=True)
    index_path = guide_root / "index.html"
    chapter_one = chapters / "01-introduction.html"
    chapter_two = chapters / "02-installation.html"
    # chapter_two is deliberately left unwritten: it is the next missing artifact.
    index_path.write_text("<html></html>\n")
    chapter_one.write_text("<h1>Intro</h1>\n")

    plan_lines = ["# Implementation Plan", "", "## File Changes"]
    plan_lines.extend(f"- `{planned}`" for planned in (index_path, chapter_one, chapter_two))
    plan_lines.append("")
    implementation_plan = temp_dir / "implementation.md"
    implementation_plan.write_text("\n".join(plan_lines))

    context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )
    queued: list[str] = []
    context.queue_steering_message_callback = queued.append
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))

    tool_call = ToolCall(
        id="write-2",
        name="write",
        arguments={"file_path": "", "content": "<html></html>\n"},
    )
    blocked_message = "[Blocked - Empty file path] Suggestion: Provide a valid file path"
    # The scripted executor hands back this single BLOCKED outcome for the write.
    blocked_result = Message.tool_result_message(
        tool_call_id=tool_call.id,
        display_content=blocked_message,
        result_content=blocked_message,
        is_error=True,
    )
    blocked_outcome = ToolExecutionOutcome(
        tool_call=tool_call,
        state=ToolExecutionState.BLOCKED,
        message=blocked_result,
        event_content=blocked_message,
        is_error=True,
        result_output=blocked_message,
    )
    executor = FakeExecutor([blocked_outcome])

    dod = create_definition_of_done("Create a multi-file nginx guide.")
    dod.implementation_plan = str(implementation_plan)
    dod.touched_files.extend([str(index_path), str(chapter_one)])
    dod.pending_items.append("Creating Chapter 2: Installation and Setup")

    await runner.execute_batch(
        tool_calls=[tool_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=TurnSummary(final_response=""),
        dod=dod,
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    # Guard the index below: at least one steering message must have been queued.
    assert queued
    assert "did not provide a valid `file_path`" in queued[0]
    assert "Resume by creating `02-installation.html` now." in queued[0]
    expected_hint = (
        f"Prefer one `write` call for `{display_runtime_path(chapter_two)}` instead of more rereads."
    )
    assert expected_hint in queued[0]
    assert context.recovery_context is not None
    assert context.recovery_context.attempts[-1].error == blocked_message