Python · 185615 bytes Raw Blame History
1 """Tests for tool-batch execution on RuntimeContext."""
2
3 from __future__ import annotations
4
5 from pathlib import Path
6 from types import SimpleNamespace
7
8 import pytest
9
10 from loader.llm.base import Message, Role, ToolCall
11 from loader.runtime.context import RuntimeContext
12 from loader.runtime.dod import (
13 DefinitionOfDoneStore,
14 VerificationEvidence,
15 create_definition_of_done,
16 )
17 from loader.runtime.events import AgentEvent, TurnSummary
18 from loader.runtime.executor import ToolExecutionOutcome, ToolExecutionState
19 from loader.runtime.permissions import (
20 PermissionMode,
21 build_permission_policy,
22 load_permission_rules,
23 )
24 from loader.runtime.reasoning_types import (
25 ActionVerification,
26 ConfidenceAssessment,
27 ConfidenceLevel,
28 )
29 from loader.runtime.recovery import RecoveryContext
30 from loader.runtime.tool_batches import (
31 ToolBatchRunner,
32 )
33 from loader.runtime.tool_batches import (
34 _should_prioritize_missing_artifact as tool_batches_should_prioritize_missing_artifact,
35 )
36 from loader.runtime.workflow import sync_todos_to_definition_of_done
37 from loader.tools.base import ToolResult as RegistryToolResult
38 from loader.tools.base import create_default_registry
39 from tests.helpers.runtime_harness import ScriptedBackend
40
41
class FakeSession:
    """In-memory session double that records messages and workflow timeline entries."""

    def __init__(self, messages: list[Message]) -> None:
        # Take a shallow copy so appends never touch the caller's list.
        self.workflow_timeline = []
        self.messages = [*messages]

    def append(self, message: Message) -> None:
        """Add ``message`` to the end of the recorded transcript."""
        transcript = self.messages
        transcript.append(message)

    def append_workflow_timeline_entry(self, entry) -> None:
        """Add ``entry`` to the end of the recorded workflow timeline."""
        timeline = self.workflow_timeline
        timeline.append(entry)
52
53
class FakeCodeFilter:
    """Code-filter double whose ``reset`` is a no-op."""

    def reset(self) -> None:
        """Do nothing and return ``None``; the fake keeps no state."""
57
58
class FakeSafeguards:
    """Safeguards double: pass-through filters, no steering, configurable loop detection."""

    def __init__(self, *, detect_loop_result: tuple[bool, str] = (False, "")) -> None:
        # Value handed back verbatim by ``detect_loop``.
        self._detect_loop_result = detect_loop_result
        self.code_filter = FakeCodeFilter()
        # Opaque placeholders; this fake never calls into them.
        self.validator = object()
        self.action_tracker = object()

    def filter_stream_chunk(self, content: str) -> str:
        """Return the streamed chunk unchanged."""
        return content

    def filter_complete_content(self, content: str) -> str:
        """Return the complete content unchanged."""
        return content

    def should_steer(self) -> bool:
        """Always report that no steering is needed."""
        return False

    def get_steering_message(self) -> str | None:
        """Always return ``None``; this fake has no steering message."""
        return None

    def record_response(self, content: str) -> None:
        """Ignore the response; nothing is recorded."""

    def detect_text_loop(self, content: str) -> tuple[bool, str]:
        """Always report that no text loop was detected."""
        return (False, "")

    def detect_loop(self) -> tuple[bool, str]:
        """Return the loop-detection result supplied at construction time."""
        return self._detect_loop_result
86
87
class FakeExecutor:
    """Executor double that records tool calls and replays queued outcomes in order."""

    def __init__(self, outcomes: list[ToolExecutionOutcome]) -> None:
        self.calls: list[ToolCall] = []
        # Shallow copy so consuming outcomes leaves the caller's list intact.
        self._outcomes = [*outcomes]

    async def execute_tool_call(self, tool_call: ToolCall, **_: object) -> ToolExecutionOutcome:
        """Record ``tool_call`` and return the next queued outcome.

        Extra keyword arguments are accepted and ignored.

        Raises:
            AssertionError: if no outcome is left in the queue. The call is
                still recorded before the error is raised.
        """
        self.calls.append(tool_call)
        if self._outcomes:
            return self._outcomes.pop(0)
        raise AssertionError("No fake tool outcome queued")
98
99
def build_context(
    *,
    temp_dir: Path,
    messages: list[Message],
    safeguards: FakeSafeguards,
    assess_confidence,
    verify_action,
    recovery_context: RecoveryContext | None = None,
    confidence_scoring: bool = False,
    verification: bool = False,
    auto_recover: bool = True,
    min_confidence_for_action: int = 3,
) -> RuntimeContext:
    """Assemble a ``RuntimeContext`` wired to test doubles rooted at ``temp_dir``.

    The registry and permission policy are built from ``temp_dir``; the
    session is a ``FakeSession`` seeded with ``messages``; the config and
    reasoning services are plain ``SimpleNamespace`` objects carrying the
    supplied callables and feature toggles.
    """
    registry = create_default_registry(temp_dir)
    registry.configure_workspace_root(temp_dir)
    rule_status = load_permission_rules(temp_dir)
    permission_policy = build_permission_policy(
        active_mode=PermissionMode.WORKSPACE_WRITE,
        workspace_root=temp_dir,
        tool_requirements=registry.get_tool_requirements(),
        rules=rule_status.rules,
    )

    # Reasoning toggles live on a nested namespace of the runtime config.
    reasoning_config = SimpleNamespace(
        rollback=False,
        show_rollback_plan=False,
        completion_check=True,
        max_continuation_prompts=5,
        self_critique=False,
        confidence_scoring=confidence_scoring,
        min_confidence_for_action=min_confidence_for_action,
        verification=verification,
    )
    runtime_config = SimpleNamespace(
        force_react=False,
        max_recovery_attempts=2,
        auto_recover=auto_recover,
        reasoning=reasoning_config,
    )
    reasoning_services = SimpleNamespace(
        assess_confidence=assess_confidence,
        verify_action=verify_action,
    )

    return RuntimeContext(
        project_root=temp_dir,
        backend=ScriptedBackend(),
        registry=registry,
        session=FakeSession(messages),  # type: ignore[arg-type]
        config=runtime_config,
        capability_profile=SimpleNamespace(supports_native_tools=True),  # type: ignore[arg-type]
        project_context=None,
        permission_policy=permission_policy,
        permission_config_status=rule_status,
        workflow_mode="execute",
        safeguards=safeguards,
        reasoning=reasoning_services,
        recovery_context=recovery_context,
    )
155
156
def tool_outcome(
    *,
    tool_call: ToolCall,
    output: str,
    is_error: bool,
    state: ToolExecutionState = ToolExecutionState.EXECUTED,
    metadata: dict[str, object] | None = None,
) -> ToolExecutionOutcome:
    """Build a ``ToolExecutionOutcome`` for ``tool_call`` from one output string.

    ``output`` is reused for the display content, result content, event
    content and registry result; a missing or empty ``metadata`` becomes ``{}``.
    """
    result_message = Message.tool_result_message(
        tool_call_id=tool_call.id,
        display_content=output,
        result_content=output,
        is_error=is_error,
    )
    registry_result = RegistryToolResult(
        output=output,
        is_error=is_error,
        metadata=metadata if metadata else {},
    )
    return ToolExecutionOutcome(
        tool_call=tool_call,
        state=state,
        message=result_message,
        event_content=output,
        is_error=is_error,
        result_output=output,
        registry_result=registry_result,
    )
183
184
@pytest.mark.asyncio
async def test_tool_batch_runner_uses_context_for_confidence_gate(temp_dir: Path) -> None:
    """A LOW confidence assessment skips the tool call and appends a warning message."""
    seen: dict[str, str] = {}
    emitted: list[AgentEvent] = []

    async def assess_confidence(tool_name: str, tool_args: dict, context: str) -> ConfidenceAssessment:
        # Capture the context string handed to the assessor for the assertion below.
        seen["context"] = context
        return ConfidenceAssessment(
            action=f"{tool_name} with {tool_args}",
            tool_name=tool_name,
            tool_args=tool_args,
            level=ConfidenceLevel.LOW,
            reasoning="Need to inspect the target first.",
            risks=["Unknown target file"],
        )

    async def verify_action(tool_name: str, tool_args: dict, result: str, expected: str = "") -> ActionVerification:
        raise AssertionError("Verification should not run for skipped actions")

    async def emit(event: AgentEvent) -> None:
        emitted.append(event)

    history = [
        Message(role=Role.USER, content="Please inspect the project."),
        Message(role=Role.ASSISTANT, content="I will read the file next."),
    ]
    runtime = build_context(
        temp_dir=temp_dir,
        messages=history,
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        confidence_scoring=True,
        min_confidence_for_action=3,
    )
    runner = ToolBatchRunner(runtime, DefinitionOfDoneStore(temp_dir))
    read_call = ToolCall(id="read-1", name="read", arguments={"file_path": "README.md"})
    # The queued outcome must stay unused: the confidence gate should skip execution.
    executor = FakeExecutor([tool_outcome(tool_call=read_call, output="unused", is_error=False)])

    result = await runner.execute_batch(
        tool_calls=[read_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=emit,
        summary=TurnSummary(final_response=""),
        dod=create_definition_of_done("Read the docs"),
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert result.actions_taken == []
    assert executor.calls == []
    assert "Please inspect the project." in seen["context"]
    last_message = runtime.session.messages[-1]
    assert last_message.role == Role.USER
    assert "[LOW CONFIDENCE WARNING]" in last_message.content
    assert "confidence" in [event.type for event in emitted]
244
245
@pytest.mark.asyncio
async def test_tool_batch_runner_tracks_recovery_with_legacy_context(temp_dir: Path) -> None:
    """A failing tool call with auto-recover on sets a recovery context and emits a recovery event."""
    emitted: list[AgentEvent] = []

    async def assess_confidence(tool_name: str, tool_args: dict, context: str) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def verify_action(tool_name: str, tool_args: dict, result: str, expected: str = "") -> ActionVerification:
        raise AssertionError("Verification should not run for failed actions")

    async def emit(event: AgentEvent) -> None:
        emitted.append(event)

    runtime = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=True,
    )
    runner = ToolBatchRunner(runtime, DefinitionOfDoneStore(temp_dir))
    bash_call = ToolCall(id="bash-1", name="bash", arguments={"command": "pytest"})
    failing_outcome = tool_outcome(tool_call=bash_call, output="command failed", is_error=True)
    executor = FakeExecutor([failing_outcome])
    summary = TurnSummary(final_response="")

    await runner.execute_batch(
        tool_calls=[bash_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=emit,
        summary=summary,
        dod=create_definition_of_done("Run tests"),
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert runtime.recovery_context is not None
    assert summary.tool_result_messages
    # The session transcript ends with the same tool result recorded on the summary.
    assert runtime.session.messages[-1] == summary.tool_result_messages[-1]
    assert any(event.type == "recovery" for event in emitted)
289
290
@pytest.mark.asyncio
async def test_tool_batch_runner_emits_tool_metadata(temp_dir: Path) -> None:
    """The emitted ``tool_result`` event carries the registry result's metadata."""
    emitted: list[AgentEvent] = []

    async def assess_confidence(tool_name: str, tool_args: dict, context: str) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def verify_action(tool_name: str, tool_args: dict, result: str, expected: str = "") -> ActionVerification:
        raise AssertionError("Verification should not run for this scenario")

    async def emit(event: AgentEvent) -> None:
        emitted.append(event)

    runtime = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )
    runner = ToolBatchRunner(runtime, DefinitionOfDoneStore(temp_dir))
    server_call = ToolCall(
        id="bash-1",
        name="bash",
        arguments={"command": "python -m http.server 8000", "background": True},
    )
    metadata = {
        "job_id": "bash-1",
        "status": "running",
        "background": True,
    }
    started = tool_outcome(
        tool_call=server_call,
        output="Started bash job bash-1",
        is_error=False,
        metadata=metadata,
    )
    executor = FakeExecutor([started])

    await runner.execute_batch(
        tool_calls=[server_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=emit,
        summary=TurnSummary(final_response=""),
        dod=create_definition_of_done("Launch a preview server"),
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    # ``next`` raises StopIteration if no tool_result event was emitted at all.
    result_event = next(event for event in emitted if event.type == "tool_result")
    assert result_event.tool_metadata == metadata
349
350
@pytest.mark.asyncio
async def test_tool_batch_runner_verifies_with_context_services(temp_dir: Path) -> None:
    """With verification on, ``verify_action`` gets the tool output and a verification event is emitted."""
    verified_results: list[str] = []
    emitted: list[AgentEvent] = []

    async def assess_confidence(tool_name: str, tool_args: dict, context: str) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def verify_action(tool_name: str, tool_args: dict, result: str, expected: str = "") -> ActionVerification:
        # Record what the runner passed in, then report a failed verification.
        verified_results.append(result)
        return ActionVerification(
            tool_name=tool_name,
            tool_args=tool_args,
            expected_outcome="Success",
            actual_result=result,
            verified=False,
            discrepancies=["File contents did not match"],
            needs_correction=True,
            correction_suggestion="Read the file before editing again.",
        )

    async def emit(event: AgentEvent) -> None:
        emitted.append(event)

    prior_recovery = RecoveryContext(
        original_tool="edit",
        original_args={"file_path": "README.md"},
    )
    runtime = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        recovery_context=prior_recovery,
        verification=True,
    )
    runner = ToolBatchRunner(runtime, DefinitionOfDoneStore(temp_dir))
    read_call = ToolCall(id="read-1", name="read", arguments={"file_path": "README.md"})
    executor = FakeExecutor([tool_outcome(tool_call=read_call, output="file contents", is_error=False)])

    await runner.execute_batch(
        tool_calls=[read_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=emit,
        summary=TurnSummary(final_response=""),
        dod=create_definition_of_done("Read the docs"),
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert verified_results == ["file contents"]
    # The pre-existing recovery context is kept and records the successful read.
    assert runtime.recovery_context is prior_recovery
    assert prior_recovery.successful_steps == [
        ("read", {"file_path": "README.md"})
    ]
    last_message = runtime.session.messages[-1]
    assert last_message.role == Role.TOOL
    assert last_message.content == "file contents"
    assert any(event.type == "verification" for event in emitted)
414
415
@pytest.mark.asyncio
async def test_tool_batch_runner_preserves_recovery_context_across_diagnostic_success(
    temp_dir: Path,
) -> None:
    """A successful ``bash`` listing keeps the existing recovery context and records the step."""

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run for this scenario")

    # Seed a recovery context that already holds one failed read attempt.
    missing_chapter = {"file_path": "chapters/04-data-types.html"}
    prior_recovery = RecoveryContext(
        original_tool="read",
        original_args={"file_path": "chapters/04-data-types.html"},
    )
    prior_recovery.add_attempt("read", missing_chapter, "File not found")

    runtime = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        recovery_context=prior_recovery,
        auto_recover=False,
    )
    runner = ToolBatchRunner(runtime, DefinitionOfDoneStore(temp_dir))
    listing_call = ToolCall(
        id="bash-1",
        name="bash",
        arguments={"command": "ls chapters"},
    )
    listing_outcome = tool_outcome(
        tool_call=listing_call,
        output="01-introduction.html",
        is_error=False,
    )
    executor = FakeExecutor([listing_outcome])
    summary = TurnSummary(final_response="")

    await runner.execute_batch(
        tool_calls=[listing_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=summary,
        dod=create_definition_of_done("Fix the chapter links"),
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert runtime.recovery_context is prior_recovery
    assert prior_recovery.successful_steps == [
        ("bash", {"command": "ls chapters"})
    ]
482
483
@pytest.mark.asyncio
async def test_tool_batch_runner_clears_recovery_context_after_successful_mutation(
    temp_dir: Path,
) -> None:
    """A successful ``patch`` outcome resets the context's recovery state to ``None``."""

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run for this scenario")

    # Seed a recovery context that already holds one failed read attempt.
    missing_chapter = {"file_path": "chapters/04-data-types.html"}
    prior_recovery = RecoveryContext(
        original_tool="read",
        original_args={"file_path": "chapters/04-data-types.html"},
    )
    prior_recovery.add_attempt("read", missing_chapter, "File not found")

    runtime = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        recovery_context=prior_recovery,
        auto_recover=False,
    )
    runner = ToolBatchRunner(runtime, DefinitionOfDoneStore(temp_dir))
    single_hunk = {"old_start": 1, "old_lines": 1, "new_start": 1, "new_lines": 1, "lines": ["-a", "+b"]}
    patch_call = ToolCall(
        id="patch-1",
        name="patch",
        arguments={
            "file_path": "index.html",
            "hunks": [single_hunk],
        },
    )
    patch_outcome = tool_outcome(
        tool_call=patch_call,
        output="Patched index.html",
        is_error=False,
    )
    executor = FakeExecutor([patch_outcome])
    summary = TurnSummary(final_response="")

    await runner.execute_batch(
        tool_calls=[patch_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=summary,
        dod=create_definition_of_done("Fix the chapter links"),
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert runtime.recovery_context is None
550
551
@pytest.mark.asyncio
async def test_tool_batch_runner_queues_duplicate_observation_nudge(
    temp_dir: Path,
) -> None:
    """A DUPLICATE read outcome queues one persistent nudge that names the missing planned file."""

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run for this scenario")

    chapters_dir = temp_dir / "chapters"
    index_file = temp_dir / "index.html"

    # Transcript so far: a glob observation, a read of chapter 1, and a pending index read.
    glob_observation = Message(
        role=Role.TOOL,
        content=(
            "Observation [glob]: Result: "
            f"{temp_dir}/chapters/01-introduction.html\n"
            f"{temp_dir}/chapters/02-setup.html\n"
            f"{temp_dir}/chapters/03-basics.html"
        ),
        tool_results=[],
    )
    chapter_read_request = Message(
        role=Role.ASSISTANT,
        content="I already inspected the first chapter title.",
        tool_calls=[
            ToolCall(
                id="read-ch1",
                name="read",
                arguments={"file_path": str(chapters_dir / "01-introduction.html")},
            )
        ],
    )
    chapter_read_result = Message.tool_result_message(
        tool_call_id="read-ch1",
        display_content="<h1>Chapter 1: Introduction to Fortran</h1>\n",
        result_content="<h1>Chapter 1: Introduction to Fortran</h1>\n",
    )
    index_read_request = Message(
        role=Role.ASSISTANT,
        content="I should update the index now.",
        tool_calls=[
            ToolCall(
                id="read-index",
                name="read",
                arguments={"file_path": str(index_file)},
            )
        ],
    )
    messages = [
        glob_observation,
        chapter_read_request,
        chapter_read_result,
        index_read_request,
    ]
    runtime = build_context(
        temp_dir=temp_dir,
        messages=messages,
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )

    # Workspace files are created only after the context exists, as in the original setup.
    chapters_dir.mkdir()
    index_file.write_text("<ul></ul>\n")
    (chapters_dir / "01-introduction.html").write_text("<h1>Intro</h1>\n")
    (chapters_dir / "02-setup.html").write_text("<h1>Setup</h1>\n")
    (chapters_dir / "03-basics.html").write_text("<h1>Basics</h1>\n")

    # The plan lists a fourth chapter file that is never written to disk.
    implementation_plan = temp_dir / "implementation.md"
    plan_lines = [
        "# Implementation Plan",
        "",
        "## File Changes",
        f"- `{index_file}`",
        f"- `{chapters_dir / '01-introduction.html'}`",
        f"- `{chapters_dir / '02-setup.html'}`",
        f"- `{chapters_dir / '03-basics.html'}`",
        f"- `{chapters_dir / '04-variables.html'}`",
    ]
    implementation_plan.write_text("\n".join(plan_lines))

    runtime.session.current_task = (
        f"Update {index_file} with the right chapter links."
    )
    persistent_messages: list[str] = []
    ephemeral_messages: list[str] = []
    runtime.queue_steering_message_callback = persistent_messages.append
    runtime.queue_ephemeral_steering_message_callback = ephemeral_messages.append
    runner = ToolBatchRunner(runtime, DefinitionOfDoneStore(temp_dir))

    reread_call = ToolCall(
        id="read-dup",
        name="read",
        arguments={"file_path": str(index_file)},
    )
    duplicate_message = (
        "[Skipped - duplicate action: Already read "
        f"{index_file} recently without any intervening changes; "
        "reuse the earlier read result instead of rereading]"
    )
    duplicate_outcome = ToolExecutionOutcome(
        tool_call=reread_call,
        state=ToolExecutionState.DUPLICATE,
        message=Message.tool_result_message(
            tool_call_id=reread_call.id,
            display_content=duplicate_message,
            result_content=duplicate_message,
        ),
        event_content=duplicate_message,
        is_error=False,
        result_output=duplicate_message,
    )
    executor = FakeExecutor([duplicate_outcome])

    summary = TurnSummary(final_response="")
    dod = create_definition_of_done("Fix the chapter links")
    dod.implementation_plan = str(implementation_plan)
    dod.pending_items.append("Create the remaining chapter files")

    await runner.execute_batch(
        tool_calls=[reread_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=summary,
        dod=dod,
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert len(persistent_messages) == 1
    nudge = persistent_messages[0]
    assert "Reuse the earlier observation instead of repeating it." in nudge
    assert "A declared output artifact is still missing." in nudge
    assert "Resume by creating `04-variables.html` now." in nudge
    assert (
        f"Prefer one `write` call for `{chapters_dir / '04-variables.html'}` instead of more rereads."
        in nudge
    )
    assert ephemeral_messages == []
700
701
@pytest.mark.asyncio
async def test_tool_batch_runner_todo_write_does_not_regress_completed_file_todo(
    temp_dir: Path,
) -> None:
    """A stale ``TodoWrite`` after a successful write must not move the finished todo back to pending."""

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should not run for this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run for this scenario")

    def pending_todos() -> list[dict[str, str]]:
        # Fresh dicts on every call so the three payloads below never alias each other.
        return [
            {
                "content": "Create 03-first-website.html",
                "active_form": "Creating 03-first-website.html",
                "status": "pending",
            },
            {
                "content": "Create 04-configuration-basics.html",
                "active_form": "Creating 04-configuration-basics.html",
                "status": "pending",
            },
        ]

    runtime = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )
    runner = ToolBatchRunner(runtime, DefinitionOfDoneStore(temp_dir))
    dod = create_definition_of_done("Create a multi-file nginx guide.")
    sync_todos_to_definition_of_done(dod, pending_todos())

    chapter_path = temp_dir / "guides" / "nginx" / "chapters" / "03-first-website.html"
    chapter_path.parent.mkdir(parents=True)
    write_call = ToolCall(
        id="write-ch3",
        name="write",
        arguments={"file_path": str(chapter_path), "content": "<html></html>\n"},
    )
    # The todo update still lists chapter 3 as pending even though it was just written.
    stale_todo_call = ToolCall(
        id="todo-stale",
        name="TodoWrite",
        arguments={"todos": pending_todos()},
    )
    write_outcome = tool_outcome(
        tool_call=write_call,
        output=f"Successfully wrote {chapter_path}",
        is_error=False,
    )
    stale_todo_outcome = tool_outcome(
        tool_call=stale_todo_call,
        output="Todos updated",
        is_error=False,
        metadata={"new_todos": pending_todos()},
    )
    executor = FakeExecutor([write_outcome, stale_todo_outcome])

    summary = TurnSummary(final_response="")
    await runner.execute_batch(
        tool_calls=[write_call, stale_todo_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=summary,
        dod=dod,
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert "Create 03-first-website.html" in dod.completed_items
    assert "Create 03-first-website.html" not in dod.pending_items
    assert "Create 04-configuration-basics.html" in dod.pending_items
819
820
@pytest.mark.asyncio
async def test_tool_batch_runner_proactively_queues_verified_html_inventory(
    temp_dir: Path,
) -> None:
    """A successful chapter ``glob`` queues no steering and adds no inventory text to the tool result."""

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run for this scenario")

    # Workspace: two chapter files plus an index with an empty list.
    chapters = temp_dir / "chapters"
    chapters.mkdir()
    intro_file = chapters / "01-introduction.html"
    setup_file = chapters / "02-setup.html"
    intro_file.write_text(
        "<h1>Chapter 1: Introduction to Fortran</h1>\n"
    )
    setup_file.write_text(
        "<h1>Chapter 2: Setting Up Your Environment</h1>\n"
    )
    (temp_dir / "index.html").write_text("<ul></ul>\n")

    runtime = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )
    runtime.session.current_task = (
        f"Update {temp_dir / 'index.html'} so the chapter links match the sibling files."
    )
    persistent_messages: list[str] = []
    ephemeral_messages: list[str] = []
    runtime.queue_steering_message_callback = persistent_messages.append
    runtime.queue_ephemeral_steering_message_callback = ephemeral_messages.append
    runner = ToolBatchRunner(runtime, DefinitionOfDoneStore(temp_dir))

    glob_call = ToolCall(
        id="glob-1",
        name="glob",
        arguments={"path": str(chapters), "pattern": "*.html"},
    )
    glob_outcome = tool_outcome(
        tool_call=glob_call,
        output="\n".join([str(intro_file), str(setup_file)]),
        is_error=False,
    )
    executor = FakeExecutor([glob_outcome])

    summary = TurnSummary(final_response="")
    await runner.execute_batch(
        tool_calls=[glob_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=summary,
        dod=create_definition_of_done("Fix the chapter links"),
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert persistent_messages == []
    assert ephemeral_messages == []
    assert len(summary.tool_result_messages) == 1
    only_result = summary.tool_result_messages[0]
    assert "Verified chapter inventory:" not in only_result.content
905
906
@pytest.mark.asyncio
async def test_tool_batch_runner_marks_validated_html_toc_completion_after_successful_edit(
    temp_dir: Path,
) -> None:
    """A successful index ``edit`` adds no semantic-verification preview and queues no steering."""

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run for this scenario")

    task = "Update index.html so every chapter link and title matches the real HTML files in chapters/."

    chapters = temp_dir / "chapters"
    chapters.mkdir()
    (chapters / "01-introduction.html").write_text(
        "<h1>Chapter 1: Introduction to Fortran</h1>\n"
    )
    (chapters / "02-setup.html").write_text(
        "<h1>Chapter 2: Setting Up Your Environment</h1>\n"
    )

    old_block = (
        '<ul class="chapter-list">\n'
        '  <li><a href="chapters/01-old.html">Chapter 1: Old</a></li>\n'
        '  <li><a href="chapters/02-old.html">Chapter 2: Old</a></li>\n'
        "</ul>\n"
    )
    new_block = (
        '<ul class="chapter-list">\n'
        '  <li><a href="chapters/01-introduction.html">Chapter 1: Introduction to Fortran</a></li>\n'
        '  <li><a href="chapters/02-setup.html">Chapter 2: Setting Up Your Environment</a></li>\n'
        "</ul>\n"
    )
    # The executor is faked, so the file on disk is written in its post-edit form up front.
    index_path = temp_dir / "index.html"
    index_path.write_text(new_block)

    runtime = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )
    runtime.session.current_task = task
    persistent_messages: list[str] = []
    ephemeral_messages: list[str] = []
    runtime.queue_steering_message_callback = persistent_messages.append
    runtime.queue_ephemeral_steering_message_callback = ephemeral_messages.append
    runner = ToolBatchRunner(runtime, DefinitionOfDoneStore(temp_dir))

    edit_call = ToolCall(
        id="edit-1",
        name="edit",
        arguments={
            "file_path": str(index_path),
            "old_string": old_block,
            "new_string": new_block,
        },
    )
    edit_outcome = tool_outcome(
        tool_call=edit_call,
        output=f"Successfully edited {index_path}",
        is_error=False,
    )
    executor = FakeExecutor([edit_outcome])

    summary = TurnSummary(final_response="")
    await runner.execute_batch(
        tool_calls=[edit_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=summary,
        dod=create_definition_of_done(task),
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert all(
        "Semantic verification preview:" not in message.content
        for message in summary.tool_result_messages
    )
    assert persistent_messages == []
    assert ephemeral_messages == []
1007
1008
@pytest.mark.asyncio
async def test_tool_batch_runner_does_not_apply_html_toc_handoff_to_reference_read(
    temp_dir: Path,
) -> None:
    """A plain read of a reference ``index.html`` must not queue any steering.

    The task prompt only asks to study an existing guide before writing a new
    one. After the scripted read is run through the batch runner, the test
    expects both steering queues to be empty and no
    "Semantic verification preview:" text in the recorded tool results.
    """

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        # Reaching this stub fails the test: confidence scoring is not expected.
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        # Reaching this stub fails the test: verification is not expected.
        raise AssertionError("Verification should not run for this scenario")

    # Lay out a small guide on disk: two chapter pages and an index linking them.
    chapters_dir = temp_dir / "chapters"
    chapters_dir.mkdir()
    chapter_pages = {
        "01-introduction.html": "<h1>Chapter 1: Introduction to Fortran</h1>\n",
        "02-setup.html": "<h1>Chapter 2: Setting Up Your Environment</h1>\n",
    }
    for page_name, page_html in chapter_pages.items():
        (chapters_dir / page_name).write_text(page_html)

    index_html = (
        "<h2>Table of Contents</h2>\n"
        '<ul class="chapter-list">\n'
        ' <li><a href="chapters/01-introduction.html">Chapter 1: Introduction to Fortran</a></li>\n'
        ' <li><a href="chapters/02-setup.html">Chapter 2: Setting Up Your Environment</a></li>\n'
        "</ul>\n"
    )
    index_path = temp_dir / "index.html"
    index_path.write_text(index_html)

    prompt = (
        "Have a look at ~/Loader/guides/fortran and chapters/ within. Get a feel "
        "for the structure and cadence of the guide. We are going to make an all "
        "new equally thorough guide on how to use the nginx tool."
    )

    runtime_context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )
    runtime_context.session.current_task = prompt  # type: ignore[attr-defined]

    # Capture whatever steering the runner queues so it can be inspected below.
    persistent_queue: list[str] = []
    ephemeral_queue: list[str] = []
    runtime_context.queue_steering_message_callback = persistent_queue.append
    runtime_context.queue_ephemeral_steering_message_callback = ephemeral_queue.append
    batch_runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))

    read_call = ToolCall(
        id="read-index",
        name="read",
        arguments={"file_path": str(index_path)},
    )
    scripted_outcome = tool_outcome(
        tool_call=read_call,
        output=index_path.read_text(),
        is_error=False,
    )
    scripted_executor = FakeExecutor([scripted_outcome])

    turn_summary = TurnSummary(final_response="")
    await batch_runner.execute_batch(
        tool_calls=[read_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=turn_summary,
        dod=create_definition_of_done(prompt),
        executor=scripted_executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert persistent_queue == []
    assert ephemeral_queue == []
    assert not any(
        "Semantic verification preview:" in recorded.content
        for recorded in turn_summary.tool_result_messages
    )
1101
1102
@pytest.mark.asyncio
async def test_tool_batch_runner_queues_next_pending_todo_after_discovery_progress(
    temp_dir: Path,
) -> None:
    """A reference read completes the discovery todo and hands off to the next one.

    The implementation plan lists ``chapters/`` before ``index.html``. After
    the scripted read, the test expects the "Examine ..." item to be in
    ``completed_items`` and the persistent steering to name the directory
    todo and ``chapters/``, without mentioning the reference file name.
    """

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        # Reaching this stub fails the test: confidence scoring is not expected.
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        # Reaching this stub fails the test: verification is not expected.
        raise AssertionError("Verification should not run for this scenario")

    # Existing reference chapter that the scripted tool call "reads".
    reference = temp_dir / "fortran" / "chapters" / "01-introduction.html"
    reference.parent.mkdir(parents=True)
    reference.write_text("<h1>Introduction</h1>\n<p>Guide cadence.</p>\n")

    # Planned (not yet created) nginx guide layout, directory listed first.
    nginx_root = temp_dir / "Loader" / "guides" / "nginx"
    chapters_dir = nginx_root / "chapters"
    plan_lines = [
        "# Implementation Plan",
        "",
        "## File Changes",
        f"- `{chapters_dir}/`",
        f"- `{nginx_root / 'index.html'}`",
        "",
    ]
    plan_path = temp_dir / "implementation.md"
    plan_path.write_text("\n".join(plan_lines))

    runtime_context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )
    persistent_queue: list[str] = []
    ephemeral_queue: list[str] = []
    runtime_context.queue_steering_message_callback = persistent_queue.append
    runtime_context.queue_ephemeral_steering_message_callback = ephemeral_queue.append
    batch_runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))

    definition_of_done = create_definition_of_done("Create an equally thorough nginx guide.")
    definition_of_done.implementation_plan = str(plan_path)
    todos = [
        {
            "content": "Examine the existing Fortran guide structure to understand the cadence and format",
            "active_form": "Working on: Examine the existing Fortran guide structure to understand the cadence and format",
            "status": "pending",
        },
        {
            "content": "Create the nginx directory structure",
            "active_form": "Working on: Create the nginx directory structure",
            "status": "pending",
        },
        {
            "content": "Create the nginx index.html file",
            "active_form": "Working on: Create the nginx index.html file",
            "status": "pending",
        },
    ]
    sync_todos_to_definition_of_done(definition_of_done, todos)

    read_call = ToolCall(
        id="read-reference",
        name="read",
        arguments={"file_path": str(reference)},
    )
    scripted_outcome = tool_outcome(
        tool_call=read_call,
        output="<h1>Introduction</h1>\n<p>Guide cadence.</p>\n",
        is_error=False,
    )
    scripted_executor = FakeExecutor([scripted_outcome])

    turn_summary = TurnSummary(final_response="")
    await batch_runner.execute_batch(
        tool_calls=[read_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=turn_summary,
        dod=definition_of_done,
        executor=scripted_executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    completed = definition_of_done.completed_items
    assert (
        "Examine the existing Fortran guide structure to understand the cadence and format"
        in completed
    )
    assert any(
        "Continue with the next pending item: `Create the nginx directory structure`"
        in queued
        for queued in persistent_queue
    )
    assert any(
        "Resume by creating `chapters/` now." in queued
        for queued in persistent_queue
    )
    # The reference file that was just read must not be offered as a target.
    assert not any("01-introduction.html" in queued for queued in persistent_queue)
    assert ephemeral_queue == []
1221
1222
@pytest.mark.asyncio
async def test_tool_batch_runner_queues_setup_directory_before_file_when_plan_lists_index_first(
    temp_dir: Path,
) -> None:
    """The directory todo is still steered first when the plan lists ``index.html`` first.

    The plan's "File Changes" section puts ``index.html`` ahead of
    ``chapters/``. The test expects the persistent steering to point at the
    directory todo and ``chapters/``, and never to say
    "Next step: create `index.html`.".
    """

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        # Reaching this stub fails the test: confidence scoring is not expected.
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        # Reaching this stub fails the test: verification is not expected.
        raise AssertionError("Verification should not run for this scenario")

    # Existing reference chapter that the scripted tool call "reads".
    reference = temp_dir / "fortran" / "chapters" / "01-introduction.html"
    reference.parent.mkdir(parents=True)
    reference.write_text("<h1>Introduction</h1>\n<p>Guide cadence.</p>\n")

    # Planned nginx guide layout, deliberately listing the index file first.
    nginx_root = temp_dir / "Loader" / "guides" / "nginx"
    chapters_dir = nginx_root / "chapters"
    plan_lines = [
        "# Implementation Plan",
        "",
        "## File Changes",
        f"- `{nginx_root / 'index.html'}`",
        f"- `{chapters_dir}/`",
        "",
    ]
    plan_path = temp_dir / "implementation.md"
    plan_path.write_text("\n".join(plan_lines))

    runtime_context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )
    persistent_queue: list[str] = []
    ephemeral_queue: list[str] = []
    runtime_context.queue_steering_message_callback = persistent_queue.append
    runtime_context.queue_ephemeral_steering_message_callback = ephemeral_queue.append
    batch_runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))

    definition_of_done = create_definition_of_done("Create an equally thorough nginx guide.")
    definition_of_done.implementation_plan = str(plan_path)
    todos = [
        {
            "content": "Examine the existing Fortran guide structure to understand the cadence and format",
            "active_form": "Working on: Examine the existing Fortran guide structure to understand the cadence and format",
            "status": "pending",
        },
        {
            "content": "Create the nginx directory structure",
            "active_form": "Working on: Create the nginx directory structure",
            "status": "pending",
        },
        {
            "content": "Create the nginx index.html file",
            "active_form": "Working on: Create the nginx index.html file",
            "status": "pending",
        },
    ]
    sync_todos_to_definition_of_done(
        definition_of_done,
        todos,
        project_root=temp_dir,
    )

    read_call = ToolCall(
        id="read-reference-index-first",
        name="read",
        arguments={"file_path": str(reference)},
    )
    scripted_outcome = tool_outcome(
        tool_call=read_call,
        output="<h1>Introduction</h1>\n<p>Guide cadence.</p>\n",
        is_error=False,
    )
    scripted_executor = FakeExecutor([scripted_outcome])

    turn_summary = TurnSummary(final_response="")
    await batch_runner.execute_batch(
        tool_calls=[read_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=turn_summary,
        dod=definition_of_done,
        executor=scripted_executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert persistent_queue
    assert any(
        "Continue with the next pending item: `Create the nginx directory structure`"
        in queued
        for queued in persistent_queue
    )
    assert any(
        "Resume by creating `chapters/` now." in queued
        for queued in persistent_queue
    )
    # Plan order alone must not promote the index file ahead of its directory.
    assert not any(
        "Next step: create `index.html`." in queued
        for queued in persistent_queue
    )
    assert ephemeral_queue == []
1342
1343
@pytest.mark.asyncio
async def test_tool_batch_runner_duplicate_reference_read_prefers_next_pending_todo(
    temp_dir: Path,
) -> None:
    """A duplicate reference read yields one steering message naming the next todo.

    The session already holds a read observation and the discovery todo is
    marked completed. When the executor reports the read as a duplicate, the
    test expects exactly one persistent message that asks to reuse the
    earlier observation, names the directory todo, and contains no
    "Update `" wording.
    """

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        # Reaching this stub fails the test: confidence scoring is not expected.
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        # Reaching this stub fails the test: verification is not expected.
        raise AssertionError("Verification should not run for this scenario")

    reference = temp_dir / "fortran" / "index.html"
    reference.parent.mkdir(parents=True)
    reference.write_text("<h1>Fortran Beginner's Guide</h1>\n")

    prompt = (
        "Have a look at ~/Loader/guides/fortran and chapters/ within. Get a feel "
        "for the structure and cadence of the guide. We are going to make an all "
        "new equally thorough guide on how to use the nginx tool."
    )

    # Seed the session with the observation produced by the earlier read.
    prior_observation = Message(
        role=Role.TOOL,
        content=(
            "Observation [read]: Result: "
            "<h1>Fortran Beginner's Guide</h1>\n"
        ),
    )
    runtime_context = build_context(
        temp_dir=temp_dir,
        messages=[prior_observation],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )
    runtime_context.session.current_task = prompt
    persistent_queue: list[str] = []
    ephemeral_queue: list[str] = []
    runtime_context.queue_steering_message_callback = persistent_queue.append
    runtime_context.queue_ephemeral_steering_message_callback = ephemeral_queue.append
    batch_runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))

    definition_of_done = create_definition_of_done(prompt)
    todos = [
        {
            "content": "Examine the existing Fortran guide structure to understand the cadence and format",
            "active_form": "Working on: Examine the existing Fortran guide structure to understand the cadence and format",
            "status": "completed",
        },
        {
            "content": "Create the nginx directory structure",
            "active_form": "Working on: Create the nginx directory structure",
            "status": "pending",
        },
        {
            "content": "Create the nginx index.html file",
            "active_form": "Working on: Create the nginx index.html file",
            "status": "pending",
        },
    ]
    sync_todos_to_definition_of_done(definition_of_done, todos)

    read_call = ToolCall(
        id="read-dup",
        name="read",
        arguments={"file_path": str(reference)},
    )
    duplicate_message = (
        "[Skipped - duplicate action: Already read "
        f"{reference} recently without any intervening changes; "
        "reuse the earlier read result instead of rereading]"
    )
    # Script the executor to report the read as a skipped duplicate.
    duplicate_result = Message.tool_result_message(
        tool_call_id=read_call.id,
        display_content=duplicate_message,
        result_content=duplicate_message,
    )
    duplicate_outcome = ToolExecutionOutcome(
        tool_call=read_call,
        state=ToolExecutionState.DUPLICATE,
        message=duplicate_result,
        event_content=duplicate_message,
        is_error=False,
        result_output=duplicate_message,
    )
    scripted_executor = FakeExecutor([duplicate_outcome])

    turn_summary = TurnSummary(final_response="")
    await batch_runner.execute_batch(
        tool_calls=[read_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=turn_summary,
        dod=definition_of_done,
        executor=scripted_executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert len(persistent_queue) == 1
    steering = persistent_queue[0]
    assert "Reuse the earlier observation instead of repeating it." in steering
    assert (
        "Continue with the next pending item: `Create the nginx directory structure`"
        in steering
    )
    assert "Update `" not in steering
    assert ephemeral_queue == []
1466
1467
@pytest.mark.asyncio
async def test_tool_batch_runner_successful_reference_read_prioritizes_concrete_missing_artifact(
    temp_dir: Path,
) -> None:
    """After a reference read, steering names the missing planned file, not a generic todo.

    The first nginx chapter exists on disk and is recorded as touched, while
    the planned ``index.html`` does not exist. The test expects the
    persistent steering to confirm the discovery todo and to say
    "Resume by creating `index.html` now.", without falling back to the
    generic "Create each chapter file ..." todo.
    """

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        # Reaching this stub fails the test: confidence scoring is not expected.
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        # Reaching this stub fails the test: verification is not expected.
        raise AssertionError("Verification should not run for this scenario")

    # Partially built nginx guide: chapter one exists, index.html is only planned.
    guide_root = temp_dir / "Loader" / "guides" / "nginx"
    chapters_dir = guide_root / "chapters"
    chapters_dir.mkdir(parents=True)
    chapter_one = chapters_dir / "01-introduction.html"
    chapter_one.write_text("<html></html>\n")
    index_path = guide_root / "index.html"

    # Reference chapter from the Fortran guide that the scripted call "reads".
    reference = temp_dir / "Loader" / "guides" / "fortran" / "chapters" / "01-introduction.html"
    reference.parent.mkdir(parents=True, exist_ok=True)
    reference.write_text("<h1>Introduction</h1>\n<p>Guide cadence.</p>\n")

    plan_lines = [
        "# Implementation Plan",
        "",
        "## File Changes",
        f"- `{guide_root}/`",
        f"- `{chapters_dir}/`",
        f"- `{index_path}`",
        f"- `{chapter_one}`",
        f"- `{chapters_dir / '02-installation.html'}`",
        "",
    ]
    plan_path = temp_dir / "implementation.md"
    plan_path.write_text("\n".join(plan_lines))

    runtime_context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )
    persistent_queue: list[str] = []
    ephemeral_queue: list[str] = []
    runtime_context.queue_steering_message_callback = persistent_queue.append
    runtime_context.queue_ephemeral_steering_message_callback = ephemeral_queue.append
    batch_runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))

    definition_of_done = create_definition_of_done("Create a multi-file nginx guide.")
    definition_of_done.implementation_plan = str(plan_path)
    definition_of_done.touched_files.append(str(chapter_one))
    todos = [
        {
            "content": "Examine the existing Fortran guide structure to understand the format and cadence",
            "active_form": "Working on: Examine the existing Fortran guide structure to understand the format and cadence",
            "status": "pending",
        },
        {
            "content": "Create each chapter file with appropriate content",
            "active_form": "Working on: Create each chapter file with appropriate content",
            "status": "pending",
        },
        {
            "content": "Ensure all files follow the same structure and style as the Fortran guide",
            "active_form": "Working on: Ensure all files follow the same structure and style as the Fortran guide",
            "status": "pending",
        },
    ]
    sync_todos_to_definition_of_done(definition_of_done, todos)

    read_call = ToolCall(
        id="read-reference-chapter",
        name="read",
        arguments={"file_path": str(reference)},
    )
    read_output = "Observation [read]: Result: <h1>Introduction</h1>\n<p>Guide cadence.</p>\n"
    # Script a successfully executed read carrying the observation text.
    read_result = Message.tool_result_message(
        tool_call_id=read_call.id,
        display_content=read_output,
        result_content=read_output,
    )
    executed_outcome = ToolExecutionOutcome(
        tool_call=read_call,
        state=ToolExecutionState.EXECUTED,
        message=read_result,
        event_content=read_output,
        is_error=False,
        result_output=read_output,
    )
    scripted_executor = FakeExecutor([executed_outcome])

    turn_summary = TurnSummary(final_response="")
    await batch_runner.execute_batch(
        tool_calls=[read_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=turn_summary,
        dod=definition_of_done,
        executor=scripted_executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert persistent_queue
    assert any(
        "Confirmed progress: `Examine the existing Fortran guide structure to understand the format and cadence`"
        in queued
        for queued in persistent_queue
    )
    assert any("Resume by creating `index.html` now." in queued for queued in persistent_queue)
    # The generic next-todo wording must lose out to the concrete missing file.
    assert all(
        "Continue with the next pending item: `Create each chapter file with appropriate content`"
        not in queued
        for queued in persistent_queue
    )
    assert ephemeral_queue == []
1602
1603
@pytest.mark.asyncio
async def test_tool_batch_runner_duplicate_read_ignores_unplanned_expansion_after_plan_complete(
    temp_dir: Path,
) -> None:
    """With every planned file on disk, a duplicate read skips unplanned creation todos.

    The pending items include "Create 07-performance-tuning.html", which the
    implementation plan never lists. The test expects a single persistent
    message that mentions the verification item and omits that creation item.
    """

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        # Reaching this stub fails the test: confidence scoring is not expected.
        raise AssertionError("Confidence scoring should not run for this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        # Reaching this stub fails the test: verification is not expected.
        raise AssertionError("Verification should not run for this scenario")

    # Build the complete guide on disk so every planned artifact already exists.
    guide_root = temp_dir / "guides" / "nginx"
    chapters_dir = guide_root / "chapters"
    guide_root.mkdir(parents=True)
    chapters_dir.mkdir()
    index_path = guide_root / "index.html"
    chapter_one = chapters_dir / "01-getting-started.html"
    chapter_two = chapters_dir / "02-installation.html"
    index_path.write_text("<html></html>\n")
    chapter_one.write_text("<h1>One</h1>\n")
    chapter_two.write_text("<h1>Two</h1>\n")

    plan_lines = [
        "# Implementation Plan",
        "",
        "## File Changes",
        f"- `{guide_root}/`",
        f"- `{chapters_dir}/`",
        f"- `{index_path}`",
        f"- `{chapter_one}`",
        f"- `{chapter_two}`",
        "",
    ]
    plan_path = temp_dir / "implementation.md"
    plan_path.write_text("\n".join(plan_lines))

    runtime_context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )
    persistent_queue: list[str] = []
    ephemeral_queue: list[str] = []
    runtime_context.queue_steering_message_callback = persistent_queue.append
    runtime_context.queue_ephemeral_steering_message_callback = ephemeral_queue.append
    batch_runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))

    definition_of_done = create_definition_of_done("Create a multi-file nginx guide.")
    definition_of_done.implementation_plan = str(plan_path)
    # First item targets a file the plan never mentions.
    definition_of_done.pending_items = [
        "Create 07-performance-tuning.html",
        "Verify all guide files are linked and complete",
        "Complete the requested work",
    ]

    read_call = ToolCall(
        id="read-dup",
        name="read",
        arguments={"file_path": str(chapter_one)},
    )
    duplicate_message = (
        "[Skipped - duplicate action: Already read "
        f"{chapter_one} recently without any intervening changes; "
        "reuse the earlier read result instead of rereading]"
    )
    # Script the executor to report the read as a skipped duplicate.
    duplicate_result = Message.tool_result_message(
        tool_call_id=read_call.id,
        display_content=duplicate_message,
        result_content=duplicate_message,
    )
    duplicate_outcome = ToolExecutionOutcome(
        tool_call=read_call,
        state=ToolExecutionState.DUPLICATE,
        message=duplicate_result,
        event_content=duplicate_message,
        is_error=False,
        result_output=duplicate_message,
    )
    scripted_executor = FakeExecutor([duplicate_outcome])

    turn_summary = TurnSummary(final_response="")
    await batch_runner.execute_batch(
        tool_calls=[read_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=turn_summary,
        dod=definition_of_done,
        executor=scripted_executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert len(persistent_queue) == 1
    steering = persistent_queue[0]
    assert "Verify all guide files are linked and complete" in steering
    assert "Create 07-performance-tuning.html" not in steering
    assert ephemeral_queue == []
1718
1719
@pytest.mark.asyncio
async def test_tool_batch_runner_duplicate_read_after_plan_complete_pushes_verification_handoff(
    temp_dir: Path,
) -> None:
    """With every planned file on disk, a duplicate read steers toward verification.

    The definition of done carries a verification command and a pending
    creation item for a file the plan never lists. The test expects one
    persistent message stating that all planned artifacts exist and that
    work should move to verification, with no mention of that creation item.
    """

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        # Reaching this stub fails the test: confidence scoring is not expected.
        raise AssertionError("Confidence scoring should not run for this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        # Reaching this stub fails the test: verification is not expected.
        raise AssertionError("Verification should not run for this scenario")

    # Build the complete guide on disk so every planned artifact already exists.
    guide_root = temp_dir / "guides" / "nginx"
    chapters_dir = guide_root / "chapters"
    guide_root.mkdir(parents=True)
    chapters_dir.mkdir()
    index_path = guide_root / "index.html"
    chapter_one = chapters_dir / "01-getting-started.html"
    chapter_two = chapters_dir / "02-installation.html"
    index_path.write_text("<html></html>\n")
    chapter_one.write_text("<h1>One</h1>\n")
    chapter_two.write_text("<h1>Two</h1>\n")

    plan_lines = [
        "# Implementation Plan",
        "",
        "## File Changes",
        f"- `{guide_root}/`",
        f"- `{chapters_dir}/`",
        f"- `{index_path}`",
        f"- `{chapter_one}`",
        f"- `{chapter_two}`",
        "",
    ]
    plan_path = temp_dir / "implementation.md"
    plan_path.write_text("\n".join(plan_lines))

    runtime_context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )
    persistent_queue: list[str] = []
    ephemeral_queue: list[str] = []
    runtime_context.queue_steering_message_callback = persistent_queue.append
    runtime_context.queue_ephemeral_steering_message_callback = ephemeral_queue.append
    batch_runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))

    definition_of_done = create_definition_of_done("Create a multi-file nginx guide.")
    definition_of_done.implementation_plan = str(plan_path)
    definition_of_done.verification_commands = [f"ls -la {guide_root}"]
    # First item targets a file the plan never mentions.
    definition_of_done.pending_items = [
        "Create 07-performance-tuning.html",
        "Complete the requested work",
    ]

    read_call = ToolCall(
        id="read-dup",
        name="read",
        arguments={"file_path": str(chapter_one)},
    )
    duplicate_message = (
        "[Skipped - duplicate action: Already read "
        f"{chapter_one} recently without any intervening changes; "
        "reuse the earlier read result instead of rereading]"
    )
    # Script the executor to report the read as a skipped duplicate.
    duplicate_result = Message.tool_result_message(
        tool_call_id=read_call.id,
        display_content=duplicate_message,
        result_content=duplicate_message,
    )
    duplicate_outcome = ToolExecutionOutcome(
        tool_call=read_call,
        state=ToolExecutionState.DUPLICATE,
        message=duplicate_result,
        event_content=duplicate_message,
        is_error=False,
        result_output=duplicate_message,
    )
    scripted_executor = FakeExecutor([duplicate_outcome])

    turn_summary = TurnSummary(final_response="")
    await batch_runner.execute_batch(
        tool_calls=[read_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=turn_summary,
        dod=definition_of_done,
        executor=scripted_executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert len(persistent_queue) == 1
    steering = persistent_queue[0]
    assert "All explicitly planned artifacts already exist." in steering
    assert (
        "Move to verification or final confirmation using the files already on disk."
        in steering
    )
    assert "Create 07-performance-tuning.html" not in steering
    assert ephemeral_queue == []
1838
1839
@pytest.mark.asyncio
async def test_tool_batch_runner_duplicate_read_after_plan_complete_ignores_stale_creation_todos(
    temp_dir: Path,
) -> None:
    """Creation todos for files that already exist are left out of the handoff.

    Both chapter files are on disk, yet the pending items still say
    "Create 01-getting-started.html" and "Creating 02-installation.html".
    The test expects one persistent message that declares the planned
    artifacts complete, points at verification, and repeats neither item.
    """

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        # Reaching this stub fails the test: confidence scoring is not expected.
        raise AssertionError("Confidence scoring should not run for this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        # Reaching this stub fails the test: verification is not expected.
        raise AssertionError("Verification should not run for this scenario")

    # Build the complete guide on disk so every planned artifact already exists.
    guide_root = temp_dir / "guides" / "nginx"
    chapters_dir = guide_root / "chapters"
    guide_root.mkdir(parents=True)
    chapters_dir.mkdir()
    index_path = guide_root / "index.html"
    chapter_one = chapters_dir / "01-getting-started.html"
    chapter_two = chapters_dir / "02-installation.html"
    index_path.write_text("<html></html>\n")
    chapter_one.write_text("<h1>One</h1>\n")
    chapter_two.write_text("<h1>Two</h1>\n")

    plan_lines = [
        "# Implementation Plan",
        "",
        "## File Changes",
        f"- `{guide_root}/`",
        f"- `{chapters_dir}/`",
        f"- `{index_path}`",
        f"- `{chapter_one}`",
        f"- `{chapter_two}`",
        "",
    ]
    plan_path = temp_dir / "implementation.md"
    plan_path.write_text("\n".join(plan_lines))

    runtime_context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )
    persistent_queue: list[str] = []
    ephemeral_queue: list[str] = []
    runtime_context.queue_steering_message_callback = persistent_queue.append
    runtime_context.queue_ephemeral_steering_message_callback = ephemeral_queue.append
    batch_runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))

    definition_of_done = create_definition_of_done("Create a multi-file nginx guide.")
    definition_of_done.implementation_plan = str(plan_path)
    definition_of_done.verification_commands = [f"ls -la {guide_root}"]
    # The first two items refer to chapter files that were written above.
    definition_of_done.pending_items = [
        "Create 01-getting-started.html",
        "Creating 02-installation.html",
        "Complete the requested work",
    ]

    read_call = ToolCall(
        id="read-dup-built-stale",
        name="read",
        arguments={"file_path": str(chapter_one)},
    )
    duplicate_message = (
        "[Skipped - duplicate action: Already read "
        f"{chapter_one} recently without any intervening changes; "
        "reuse the earlier read result instead of rereading]"
    )
    # Script the executor to report the read as a skipped duplicate.
    duplicate_result = Message.tool_result_message(
        tool_call_id=read_call.id,
        display_content=duplicate_message,
        result_content=duplicate_message,
    )
    duplicate_outcome = ToolExecutionOutcome(
        tool_call=read_call,
        state=ToolExecutionState.DUPLICATE,
        message=duplicate_result,
        event_content=duplicate_message,
        is_error=False,
        result_output=duplicate_message,
    )
    scripted_executor = FakeExecutor([duplicate_outcome])

    turn_summary = TurnSummary(final_response="")
    await batch_runner.execute_batch(
        tool_calls=[read_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=turn_summary,
        dod=definition_of_done,
        executor=scripted_executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert len(persistent_queue) == 1
    steering = persistent_queue[0]
    assert "All explicitly planned artifacts already exist." in steering
    assert (
        "Move to verification or final confirmation using the files already on disk."
        in steering
    )
    assert "Create 01-getting-started.html" not in steering
    assert "Creating 02-installation.html" not in steering
    assert ephemeral_queue == []
1960
1961
@pytest.mark.asyncio
async def test_tool_batch_runner_observation_handoff_pushes_mutation_step(
    temp_dir: Path,
) -> None:
    """After a reference read, steering urges the next todo's change to be made.

    No implementation plan is attached here. With a discovery todo followed
    by a "Create the nginx index.html file" todo, the test expects the
    persistent steering to name that todo and to contain the
    "stop gathering more reference material and perform the change now" text.
    """

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        # Reaching this stub fails the test: confidence scoring is not expected.
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        # Reaching this stub fails the test: verification is not expected.
        raise AssertionError("Verification should not run for this scenario")

    # Existing reference chapter that the scripted tool call "reads".
    reference = temp_dir / "fortran" / "chapters" / "01-introduction.html"
    reference.parent.mkdir(parents=True)
    reference.write_text("<h1>Introduction</h1>\n<p>Guide cadence.</p>\n")

    runtime_context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )
    persistent_queue: list[str] = []
    ephemeral_queue: list[str] = []
    runtime_context.queue_steering_message_callback = persistent_queue.append
    runtime_context.queue_ephemeral_steering_message_callback = ephemeral_queue.append
    batch_runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))

    definition_of_done = create_definition_of_done("Create a multi-file nginx guide.")
    todos = [
        {
            "content": "Examine the existing Fortran guide structure to understand the cadence and format",
            "active_form": "Working on: Examine the existing Fortran guide structure to understand the cadence and format",
            "status": "pending",
        },
        {
            "content": "Create the nginx index.html file",
            "active_form": "Working on: Create the nginx index.html file",
            "status": "pending",
        },
    ]
    sync_todos_to_definition_of_done(definition_of_done, todos)

    read_call = ToolCall(
        id="read-reference",
        name="read",
        arguments={"file_path": str(reference)},
    )
    scripted_outcome = tool_outcome(
        tool_call=read_call,
        output="<h1>Introduction</h1>\n<p>Guide cadence.</p>\n",
        is_error=False,
    )
    scripted_executor = FakeExecutor([scripted_outcome])

    turn_summary = TurnSummary(final_response="")
    await batch_runner.execute_batch(
        tool_calls=[read_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=turn_summary,
        dod=definition_of_done,
        executor=scripted_executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert any(
        "Continue with the next pending item: `Create the nginx index.html file`"
        in queued
        for queued in persistent_queue
    )
    assert any(
        "stop gathering more reference material and perform the change now" in queued
        for queued in persistent_queue
    )
    assert ephemeral_queue == []
2054
2055
@pytest.mark.asyncio
async def test_tool_batch_runner_discovery_completion_handoff_stays_persistent(
    temp_dir: Path,
) -> None:
    """Expect a persistent handoff to the next todo after a reference read.

    The batch contains one successful ``read`` of an existing reference page
    while two todos are pending. The steering message naming
    ``Create the nginx directory structure`` must arrive through the
    persistent callback, and the ephemeral callback must receive nothing.
    """

    async def reject_confidence_scoring(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def reject_verification(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run for this scenario")

    # The same markup is written to disk and returned by the scripted read.
    reference_html = "<h1>Introduction</h1>\n<p>Guide cadence.</p>\n"
    reference_page = temp_dir / "fortran" / "chapters" / "01-introduction.html"
    reference_page.parent.mkdir(parents=True)
    reference_page.write_text(reference_html)

    runtime_context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=reject_confidence_scoring,
        verify_action=reject_verification,
        auto_recover=False,
    )
    # Capture both steering channels separately so the test can tell them apart.
    persistent_queue: list[str] = []
    ephemeral_queue: list[str] = []
    runtime_context.queue_steering_message_callback = persistent_queue.append
    runtime_context.queue_ephemeral_steering_message_callback = ephemeral_queue.append
    batch_runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))

    definition = create_definition_of_done("Create a multi-file nginx guide.")
    todo_items = [
        {
            "content": content,
            "active_form": f"Working on: {content}",
            "status": "pending",
        }
        for content in (
            "First, examine the existing fortran guide structure and content",
            "Create the nginx directory structure",
        )
    ]
    sync_todos_to_definition_of_done(definition, todo_items)

    read_call = ToolCall(
        id="read-reference",
        name="read",
        arguments={"file_path": str(reference_page)},
    )
    scripted_executor = FakeExecutor(
        [tool_outcome(tool_call=read_call, output=reference_html, is_error=False)]
    )

    await batch_runner.execute_batch(
        tool_calls=[read_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=TurnSummary(final_response=""),
        dod=definition,
        executor=scripted_executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert persistent_queue
    handoff = "Continue with the next pending item: `Create the nginx directory structure`"
    assert any(handoff in queued for queued in persistent_queue)
    assert not ephemeral_queue
2145
2146
@pytest.mark.asyncio
async def test_tool_batch_runner_missing_artifact_nudge_names_next_file_after_setup_mkdir(
    temp_dir: Path,
) -> None:
    """Expect the nudge after a setup ``mkdir`` to name the next planned file.

    The implementation plan lists a chapters directory and ``index.html``. The
    batch is a single successful ``bash`` call running ``mkdir -p``. The last
    persistent steering message must report that directory setup is complete,
    name the next pending todo, and ask for ``index.html`` to be created;
    nothing may be queued on the ephemeral channel.
    """

    async def reject_confidence_scoring(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def reject_verification(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run for this scenario")

    # Neither directory is created on disk here; the mkdir call is scripted.
    nginx_root = temp_dir / "Loader" / "guides" / "nginx"
    chapters = nginx_root / "chapters"
    plan_lines = [
        "# Implementation Plan",
        "",
        "## File Changes",
        f"- `{chapters}/`",
        f"- `{nginx_root / 'index.html'}`",
        "",
    ]
    plan_path = temp_dir / "implementation.md"
    plan_path.write_text("\n".join(plan_lines))

    runtime_context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=reject_confidence_scoring,
        verify_action=reject_verification,
        auto_recover=False,
    )
    persistent_queue: list[str] = []
    ephemeral_queue: list[str] = []
    runtime_context.queue_steering_message_callback = persistent_queue.append
    runtime_context.queue_ephemeral_steering_message_callback = ephemeral_queue.append
    batch_runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))

    definition = create_definition_of_done("Create a multi-file nginx guide.")
    definition.implementation_plan = str(plan_path)
    todo_items = [
        {
            "content": "Create the nginx directory structure",
            "active_form": "Creating the nginx directory structure",
            "status": "pending",
        },
        {
            "content": "Develop the main index.html file with proper structure",
            "active_form": "Developing the main index.html file with proper structure",
            "status": "pending",
        },
    ]
    sync_todos_to_definition_of_done(definition, todo_items)

    mkdir_call = ToolCall(
        id="mkdir-nginx",
        name="bash",
        arguments={"command": f"mkdir -p {chapters}"},
    )
    scripted_executor = FakeExecutor(
        [tool_outcome(tool_call=mkdir_call, output="", is_error=False)]
    )

    await batch_runner.execute_batch(
        tool_calls=[mkdir_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=TurnSummary(final_response=""),
        dod=definition,
        executor=scripted_executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert persistent_queue
    latest = persistent_queue[-1]
    expected_fragments = (
        "Directory setup is complete.",
        "Continue with the next pending item: `Develop the main index.html file with proper structure`.",
        "Resume by creating `index.html` now.",
    )
    for fragment in expected_fragments:
        assert fragment in latest
    assert not ephemeral_queue
2249
2250
@pytest.mark.asyncio
async def test_tool_batch_runner_first_chapter_handoff_becomes_ephemeral_after_first_file(
    temp_dir: Path,
) -> None:
    """Expect an ephemeral handoff to the first chapter after writing the index.

    The plan lists the chapters directory, ``index.html`` and
    ``01-introduction.html``. The batch is one successful ``write`` of the
    index. No persistent steering message may be queued; the last ephemeral
    message must confirm progress, name ``01-introduction.html`` as the next
    step with its resolved path, and discourage rereading or bookkeeping.
    """

    async def reject_confidence_scoring(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def reject_verification(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run for this scenario")

    nginx_root = temp_dir / "guides" / "nginx"
    chapters = nginx_root / "chapters"
    chapters.mkdir(parents=True)
    index_path = nginx_root / "index.html"
    # Planned but never created on disk in this test.
    first_chapter = chapters / "01-introduction.html"

    plan_lines = [
        "# Implementation Plan",
        "",
        "## File Changes",
        f"- `{chapters}/`",
        f"- `{index_path}`",
        f"- `{first_chapter}`",
        "",
    ]
    plan_path = temp_dir / "implementation.md"
    plan_path.write_text("\n".join(plan_lines))

    runtime_context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=reject_confidence_scoring,
        verify_action=reject_verification,
        auto_recover=False,
    )
    persistent_queue: list[str] = []
    ephemeral_queue: list[str] = []
    runtime_context.queue_steering_message_callback = persistent_queue.append
    runtime_context.queue_ephemeral_steering_message_callback = ephemeral_queue.append
    batch_runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))

    definition = create_definition_of_done("Create a multi-file nginx guide.")
    definition.implementation_plan = str(plan_path)
    todo_items = [
        {
            "content": "Create the main index.html file with proper structure",
            "active_form": "Creating the main index.html file with proper structure",
            "status": "pending",
        },
        {
            "content": "Create each chapter file with appropriate content",
            "active_form": "Creating each chapter file with appropriate content",
            "status": "pending",
        },
    ]
    sync_todos_to_definition_of_done(definition, todo_items)

    write_call = ToolCall(
        id="write-index",
        name="write",
        arguments={
            "file_path": str(index_path),
            "content": "<html></html>\n",
        },
    )
    scripted_executor = FakeExecutor(
        [
            tool_outcome(
                tool_call=write_call,
                output=f"Successfully wrote 14 bytes to {index_path}",
                is_error=False,
            )
        ]
    )

    await batch_runner.execute_batch(
        tool_calls=[write_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=TurnSummary(final_response=""),
        dod=definition,
        executor=scripted_executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert persistent_queue == []
    assert ephemeral_queue
    latest = ephemeral_queue[-1]
    resolved_chapter = first_chapter.resolve(strict=False)
    expected_fragments = (
        "Confirmed progress:",
        "Next step: create `01-introduction.html`.",
        f"Prefer one `write(file_path=..., content=...)` call for `{resolved_chapter}` now.",
        "Do not reread reference material or spend the next turn on bookkeeping.",
    )
    for fragment in expected_fragments:
        assert fragment in latest
2364
2365
@pytest.mark.asyncio
async def test_tool_batch_runner_redirects_post_write_self_audit_to_next_missing_artifact(
    temp_dir: Path,
) -> None:
    """Expect a reread of a just-written file to be redirected to the next artifact.

    ``index.html`` exists on disk and is recorded in ``touched_files``; the
    plan also lists ``01-introduction.html``, which does not exist. The batch
    is one successful ``read`` of the index. The last persistent steering
    message must say the index contents are already known, ask for
    ``01-introduction.html`` to be created, and warn against another reread or
    a TodoWrite-only turn. The ephemeral channel must stay empty.
    """

    async def reject_confidence_scoring(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should not run in this scenario")

    async def reject_verification(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run in this scenario")

    nginx_root = temp_dir / "guides" / "nginx"
    chapters = nginx_root / "chapters"
    chapters.mkdir(parents=True)
    index_path = nginx_root / "index.html"
    # The trailing empty entry yields the final newline of the file.
    index_lines = [
        "<html>",
        '<a href="chapters/01-introduction.html">Chapter 1: Introduction to Nginx</a>',
        '<a href="chapters/02-installation.html">Chapter 2: Installation and Setup</a>',
        "</html>",
        "",
    ]
    index_path.write_text("\n".join(index_lines))

    plan_lines = [
        "# Implementation Plan",
        "",
        "## File Changes",
        f"- `{nginx_root}/`",
        f"- `{chapters}/`",
        f"- `{index_path}`",
        f"- `{chapters / '01-introduction.html'}`",
        "",
    ]
    plan_path = temp_dir / "implementation.md"
    plan_path.write_text("\n".join(plan_lines))

    runtime_context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=reject_confidence_scoring,
        verify_action=reject_verification,
        auto_recover=False,
    )
    persistent_queue: list[str] = []
    ephemeral_queue: list[str] = []
    runtime_context.queue_steering_message_callback = persistent_queue.append
    runtime_context.queue_ephemeral_steering_message_callback = ephemeral_queue.append
    batch_runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))

    # The definition of done is populated directly instead of through todos.
    definition = create_definition_of_done("Create a multi-file nginx guide.")
    definition.implementation_plan = str(plan_path)
    definition.touched_files.append(str(index_path))
    definition.completed_items.append("Develop the main index.html file for the nginx guide")
    definition.pending_items.append("Create chapter files for the nginx guide")

    audit_call = ToolCall(
        id="read-index-self-audit",
        name="read",
        arguments={"file_path": str(index_path)},
    )
    scripted_executor = FakeExecutor(
        [tool_outcome(tool_call=audit_call, output="1\t<html>\n", is_error=False)]
    )

    await batch_runner.execute_batch(
        tool_calls=[audit_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=TurnSummary(final_response=""),
        dod=definition,
        executor=scripted_executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert persistent_queue
    latest = persistent_queue[-1]
    expected_fragments = (
        "You already have the current contents of `index.html` from the successful write.",
        "Resume by creating `01-introduction.html` now.",
        "Do not spend another turn rereading the file you just wrote or on TodoWrite alone.",
    )
    for fragment in expected_fragments:
        assert fragment in latest
    assert not ephemeral_queue
2472
2473
@pytest.mark.asyncio
async def test_tool_batch_runner_softens_first_file_handoff_after_recovery_prompt(
    temp_dir: Path,
) -> None:
    """Expect an ephemeral-only handoff when the session holds a recovery prompt.

    The session starts with one user message carrying the
    ``[EMPTY ASSISTANT RESPONSE]`` prompt. After a successful ``write`` of
    ``index.html``, the persistent channel must stay empty and the last
    ephemeral message must name ``01-introduction.html`` as the next step.
    """

    async def reject_confidence_scoring(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def reject_verification(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run for this scenario")

    nginx_root = temp_dir / "guides" / "nginx"
    chapters = nginx_root / "chapters"
    chapters.mkdir(parents=True)
    index_path = nginx_root / "index.html"

    plan_lines = [
        "# Implementation Plan",
        "",
        "## File Changes",
        f"- `{chapters}/`",
        f"- `{index_path}`",
        f"- `{chapters / '01-introduction.html'}`",
        "",
    ]
    plan_path = temp_dir / "implementation.md"
    plan_path.write_text("\n".join(plan_lines))

    # Pre-existing session history: a single recovery prompt from the user role.
    recovery_prompt = Message(
        role=Role.USER,
        content=(
            "[EMPTY ASSISTANT RESPONSE]\n"
            "Respond with that concrete mutation tool call now. Do not return an empty response."
        ),
    )
    runtime_context = build_context(
        temp_dir=temp_dir,
        messages=[recovery_prompt],
        safeguards=FakeSafeguards(),
        assess_confidence=reject_confidence_scoring,
        verify_action=reject_verification,
        auto_recover=False,
    )
    persistent_queue: list[str] = []
    ephemeral_queue: list[str] = []
    runtime_context.queue_steering_message_callback = persistent_queue.append
    runtime_context.queue_ephemeral_steering_message_callback = ephemeral_queue.append
    batch_runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))

    definition = create_definition_of_done("Create a multi-file nginx guide.")
    definition.implementation_plan = str(plan_path)
    todo_items = [
        {
            "content": "Create the main index.html file with proper structure",
            "active_form": "Creating the main index.html file with proper structure",
            "status": "pending",
        },
        {
            "content": "Create each chapter file with appropriate content",
            "active_form": "Creating each chapter file with appropriate content",
            "status": "pending",
        },
    ]
    sync_todos_to_definition_of_done(definition, todo_items)

    write_call = ToolCall(
        id="write-index-recovered",
        name="write",
        arguments={
            "file_path": str(index_path),
            "content": "<html></html>\n",
        },
    )
    scripted_executor = FakeExecutor(
        [
            tool_outcome(
                tool_call=write_call,
                output=f"Successfully wrote 14 bytes to {index_path}",
                is_error=False,
            )
        ]
    )

    await batch_runner.execute_batch(
        tool_calls=[write_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=TurnSummary(final_response=""),
        dod=definition,
        executor=scripted_executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert persistent_queue == []
    assert ephemeral_queue
    latest = ephemeral_queue[-1]
    assert "Next step: create `01-introduction.html`." in latest
2589
2590
@pytest.mark.asyncio
async def test_tool_batch_runner_todowrite_uses_concrete_output_language_for_aggregate_chapter_step(
    temp_dir: Path,
) -> None:
    """Expect a TodoWrite nudge to name a concrete file, not the aggregate todo.

    ``index.html`` exists, links to two chapter files and is recorded in
    ``touched_files``; the plan lists only directories and the index. The batch
    is one successful ``TodoWrite`` whose pending item is the aggregate
    "Create chapter files ..." step. The last queued steering message must
    refer to ``01-introduction.html`` as the next concrete output and must not
    use the "next pending item" wording for the aggregate todo.
    """

    async def reject_confidence_scoring(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should not run in this scenario")

    async def reject_verification(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run in this scenario")

    def build_todos() -> list[dict[str, str]]:
        # Return a fresh list each call so the synced todos and the tool-call
        # payload remain distinct objects.
        return [
            {
                "content": "Develop the main index.html file with proper structure",
                "active_form": "Developing the main index.html file with proper structure",
                "status": "completed",
            },
            {
                "content": "Create chapter files with content and structure",
                "active_form": "Creating chapter files with content and structure",
                "status": "pending",
            },
        ]

    guide_root = temp_dir / "guides" / "nginx"
    chapters = guide_root / "chapters"
    chapters.mkdir(parents=True)
    index_path = guide_root / "index.html"
    # The trailing empty entry yields the final newline of the file.
    index_lines = [
        "<html>",
        '<a href="chapters/01-introduction.html">Chapter 1: Introduction to Nginx</a>',
        '<a href="chapters/02-installation.html">Chapter 2: Installation and Setup</a>',
        "</html>",
        "",
    ]
    index_path.write_text("\n".join(index_lines))

    plan_lines = [
        "# Implementation Plan",
        "",
        "## File Changes",
        f"- `{guide_root}/`",
        f"- `{chapters}/`",
        f"- `{index_path}`",
        "",
    ]
    plan_path = temp_dir / "implementation.md"
    plan_path.write_text("\n".join(plan_lines))

    # auto_recover is intentionally not passed here, matching this scenario's setup.
    runtime_context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=reject_confidence_scoring,
        verify_action=reject_verification,
    )
    steering_queue: list[str] = []
    runtime_context.queue_steering_message_callback = steering_queue.append
    batch_runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))

    definition = create_definition_of_done("Create a multi-file nginx guide.")
    definition.implementation_plan = str(plan_path)
    definition.touched_files.append(str(index_path))
    sync_todos_to_definition_of_done(definition, build_todos())

    # One list object is shared by the call arguments and the outcome metadata.
    todos = build_todos()
    todo_call = ToolCall(
        id="todo-aggregate",
        name="TodoWrite",
        arguments={"todos": todos},
    )
    scripted_executor = FakeExecutor(
        [
            tool_outcome(
                tool_call=todo_call,
                output="Todos updated",
                is_error=False,
                metadata={"new_todos": todos},
            )
        ]
    )

    await batch_runner.execute_batch(
        tool_calls=[todo_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=TurnSummary(final_response=""),
        dod=definition,
        executor=scripted_executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert steering_queue
    latest = steering_queue[-1]
    assert "Continue with the next concrete output: `01-introduction.html`." in latest
    assert "Resume by creating `01-introduction.html` now." in latest
    aggregate_wording = (
        "Continue with the next pending item: `Create chapter files with content and structure`."
    )
    assert aggregate_wording not in latest
2721
2722
@pytest.mark.asyncio
async def test_duplicate_observation_nudge_prioritizes_missing_artifact_over_review(
    temp_dir: Path,
) -> None:
    """Expect the duplicate-read nudge to point at the missing planned chapter.

    The index and first chapter exist, while the planned
    ``06-ssl-configuration.html`` does not. The first pending todo is a
    review-style item. The test first checks the prioritisation helper
    directly, then calls the runner's duplicate-observation nudge and expects
    the last persistent message to name the missing chapter, discourage review
    mode, and omit the "next pending item" wording for the review todo.
    """

    async def reject_confidence_scoring(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def reject_verification(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run for this scenario")

    guide_root = temp_dir / "guides" / "nginx"
    chapters = guide_root / "chapters"
    chapters.mkdir(parents=True)
    index_path = guide_root / "index.html"
    chapter_one = chapters / "01-getting-started.html"
    # Planned in the implementation plan but deliberately left absent on disk.
    missing_chapter = chapters / "06-ssl-configuration.html"
    chapter_one.write_text("<h1>One</h1>\n")
    index_path.write_text('<a href="chapters/01-getting-started.html">One</a>\n')

    plan_lines = [
        "# Implementation Plan",
        "",
        "## File Changes",
        f"- `{index_path}`",
        f"- `{chapter_one}`",
        f"- `{missing_chapter}`",
        "",
    ]
    plan_path = temp_dir / "implementation.md"
    plan_path.write_text("\n".join(plan_lines))

    runtime_context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=reject_confidence_scoring,
        verify_action=reject_verification,
        auto_recover=False,
    )
    persistent_queue: list[str] = []
    ephemeral_queue: list[str] = []
    runtime_context.queue_steering_message_callback = persistent_queue.append
    runtime_context.queue_ephemeral_steering_message_callback = ephemeral_queue.append
    batch_runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))

    definition = create_definition_of_done("Create a multi-file nginx guide.")
    definition.implementation_plan = str(plan_path)
    todo_items = [
        {
            "content": content,
            "active_form": f"Working on: {content}",
            "status": "pending",
        }
        for content in (
            "Ensure all files are properly linked and formatted consistently",
            "Create the final chapter (06-ssl-configuration.html)",
        )
    ]
    sync_todos_to_definition_of_done(definition, todo_items)

    # Check the module-level helper directly before exercising the runner.
    assert tool_batches_should_prioritize_missing_artifact(
        dod=definition,
        next_pending=definition.pending_items[0],
        missing_artifact=(missing_chapter, False),
        project_root=temp_dir,
    )

    duplicate_read = ToolCall(
        id="dup-read",
        name="read",
        arguments={"file_path": str(index_path)},
    )
    batch_runner._queue_duplicate_observation_nudge(duplicate_read, dod=definition)  # type: ignore[attr-defined]

    assert persistent_queue
    latest = persistent_queue[-1]
    assert "06-ssl-configuration.html" in latest
    assert "Do not switch into review or consistency-check mode" in latest
    review_wording = (
        "Continue with the next pending item: `Ensure all files are properly linked and formatted consistently`"
    )
    assert review_wording not in latest
2817
2818
@pytest.mark.asyncio
async def test_tool_batch_runner_hands_off_to_verification_once_planned_artifacts_exist(
    temp_dir: Path,
) -> None:
    """Expect a verification handoff once every planned artifact is on disk.

    The index and both planned chapters already exist when the batch (one
    successful ``write`` of the second chapter) runs. The persistent steering
    messages must include the "all planned artifacts exist" statement, the
    pending review todo, and the instruction to move to verification.
    """

    async def reject_confidence_scoring(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def reject_verification(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run for this scenario")

    guide_root = temp_dir / "guides" / "nginx"
    chapters = guide_root / "chapters"
    chapters.mkdir(parents=True)
    index_path = guide_root / "index.html"
    chapter_one = chapters / "01-getting-started.html"
    chapter_two = chapters / "02-installation.html"
    index_path.write_text('<a href="chapters/01-getting-started.html">One</a>\n')
    chapter_one.write_text("<h1>One</h1>\n")
    chapter_two.write_text("<h1>Two</h1>\n")

    plan_lines = [
        "# Implementation Plan",
        "",
        "## File Changes",
        f"- `{chapters}/`",
        f"- `{index_path}`",
        f"- `{chapter_one}`",
        f"- `{chapter_two}`",
        "",
    ]
    plan_path = temp_dir / "implementation.md"
    plan_path.write_text("\n".join(plan_lines))

    runtime_context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=reject_confidence_scoring,
        verify_action=reject_verification,
        auto_recover=False,
    )
    persistent_queue: list[str] = []
    ephemeral_queue: list[str] = []
    runtime_context.queue_steering_message_callback = persistent_queue.append
    runtime_context.queue_ephemeral_steering_message_callback = ephemeral_queue.append
    batch_runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))

    definition = create_definition_of_done("Create a multi-file nginx guide.")
    definition.implementation_plan = str(plan_path)
    todo_items = [
        {
            "content": content,
            "active_form": f"Working on: {content}",
            "status": status,
        }
        for content, status in (
            ("Create the guide files", "completed"),
            ("Ensure all files are properly linked and formatted consistently", "pending"),
        )
    ]
    sync_todos_to_definition_of_done(definition, todo_items)

    final_write = ToolCall(
        id="write-final",
        name="write",
        arguments={
            "file_path": str(chapter_two),
            "content": "<h1>Two</h1>\n",
        },
    )
    scripted_executor = FakeExecutor(
        [
            tool_outcome(
                tool_call=final_write,
                output=f"Successfully wrote {chapter_two}",
                is_error=False,
            )
        ]
    )

    await batch_runner.execute_batch(
        tool_calls=[final_write],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=TurnSummary(final_response=""),
        dod=definition,
        executor=scripted_executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    # Each fragment may appear in any of the queued persistent messages.
    expected_fragments = (
        "All explicitly planned artifacts now exist.",
        "Ensure all files are properly linked and formatted consistently",
        "Move to verification once no specific mismatch remains.",
    )
    for fragment in expected_fragments:
        assert any(fragment in queued for queued in persistent_queue)
2939
2940
@pytest.mark.asyncio
async def test_tool_batch_runner_mutation_handoff_points_at_next_missing_artifact(
    temp_dir: Path,
) -> None:
    """Expect the post-write handoff to name the next missing planned file.

    ``index.html`` exists and two planned chapters do not. After one successful
    ``write`` of the index, the persistent channel must stay empty and the last
    ephemeral message must name ``01-getting-started.html`` as the next step,
    avoid asking for a ``TodoWrite`` refresh, and discourage rereading or
    bookkeeping.
    """

    async def reject_confidence_scoring(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should not run in this scenario")

    async def reject_verification(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run in this scenario")

    guide_root = temp_dir / "guides" / "nginx"
    chapters = guide_root / "chapters"
    # parents=True creates guide_root together with the chapters directory.
    chapters.mkdir(parents=True)
    index_path = guide_root / "index.html"
    index_path.write_text("<html></html>\n")
    chapter_one = chapters / "01-getting-started.html"
    chapter_two = chapters / "02-installation.html"

    plan_lines = [
        "# Implementation Plan",
        "",
        "## File Changes",
        f"- `{guide_root}/`",
        f"- `{index_path}`",
        f"- `{chapter_one}`",
        f"- `{chapter_two}`",
        "",
    ]
    plan_path = temp_dir / "implementation.md"
    plan_path.write_text("\n".join(plan_lines))

    runtime_context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=reject_confidence_scoring,
        verify_action=reject_verification,
        auto_recover=False,
    )
    persistent_queue: list[str] = []
    ephemeral_queue: list[str] = []
    runtime_context.queue_steering_message_callback = persistent_queue.append
    runtime_context.queue_ephemeral_steering_message_callback = ephemeral_queue.append
    batch_runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))

    definition = create_definition_of_done("Create a multi-file nginx guide.")
    definition.implementation_plan = str(plan_path)
    todo_items = [
        {
            "content": content,
            "active_form": f"Working on: {content}",
            "status": "pending",
        }
        for content in (
            "Create the main index.html file with proper structure",
            "Create each chapter file in sequence, following the established pattern",
            "Ensure all files are properly linked and formatted consistently",
        )
    ]
    sync_todos_to_definition_of_done(definition, todo_items)

    write_call = ToolCall(
        id="write-index",
        name="write",
        arguments={"file_path": str(index_path), "content": "<html></html>\n"},
    )
    scripted_executor = FakeExecutor(
        [
            tool_outcome(
                tool_call=write_call,
                output=f"Successfully wrote {index_path}",
                is_error=False,
            )
        ]
    )

    await batch_runner.execute_batch(
        tool_calls=[write_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=TurnSummary(final_response=""),
        dod=definition,
        executor=scripted_executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert persistent_queue == []
    assert ephemeral_queue
    latest = ephemeral_queue[-1]
    assert "Next step: create `01-getting-started.html`." in latest
    assert "refresh `TodoWrite`" not in latest
    assert "Do not reread reference material or spend the next turn on bookkeeping." in latest
3049
3050
@pytest.mark.asyncio
async def test_tool_batch_runner_large_plan_does_not_claim_completion_early(
    temp_dir: Path,
) -> None:
    """With two of seven planned chapters absent, the nudge names chapter six.

    The plan lists the index plus seven chapters; only the index and the first
    five chapters exist on disk. After a scripted ``write`` of chapter five,
    some ephemeral steering message must point at ``06-performance-tuning.html``
    and none may state that all planned artifacts exist.
    """

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        # Tripwire: the test fails if this stub is ever awaited.
        raise AssertionError("Confidence scoring should not run in this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        # Tripwire: the test fails if this stub is ever awaited.
        raise AssertionError("Verification should not run in this scenario")

    # On-disk layout: index plus chapters one through five.
    nginx_dir = temp_dir / "guides" / "nginx"
    chapters_dir = nginx_dir / "chapters"
    nginx_dir.mkdir(parents=True)
    chapters_dir.mkdir()
    index_file = nginx_dir / "index.html"
    index_file.write_text("<html></html>\n")

    planned_chapters = [
        chapters_dir / file_name
        for file_name in (
            "01-getting-started.html",
            "02-installation.html",
            "03-first-website.html",
            "04-configuration-basics.html",
            "05-advanced-configurations.html",
            "06-performance-tuning.html",
            "07-security-best-practices.html",
        )
    ]
    fifth_chapter = planned_chapters[4]
    for existing in planned_chapters[:4]:
        existing.write_text(f"<h1>{existing.stem}</h1>\n")
    fifth_chapter.write_text("<h1>Advanced configurations</h1>\n")

    # The plan declares every artifact, including the two not yet written.
    plan_lines = [
        "# Implementation Plan",
        "",
        "## File Changes",
        f"- `{nginx_dir}/`",
        f"- `{chapters_dir}/`",
        f"- `{index_file}`",
    ]
    plan_lines.extend(f"- `{chapter}`" for chapter in planned_chapters)
    plan_file = temp_dir / "implementation.md"
    plan_file.write_text("".join(f"{line}\n" for line in plan_lines))

    runtime_context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )
    persistent_messages: list[str] = []
    ephemeral_messages: list[str] = []
    runtime_context.queue_steering_message_callback = persistent_messages.append
    runtime_context.queue_ephemeral_steering_message_callback = ephemeral_messages.append
    runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))

    dod = create_definition_of_done("Create a thorough nginx guide.")
    dod.implementation_plan = str(plan_file)
    pending_todos = [
        {
            "content": "Create the nginx guide artifacts",
            "active_form": "Creating nginx guide artifacts",
            "status": "pending",
        },
        {
            "content": "Verify all guide files are linked and complete",
            "active_form": "Verifying guide linkage and completeness",
            "status": "pending",
        },
    ]
    sync_todos_to_definition_of_done(dod, pending_todos)

    tool_call = ToolCall(
        id="write-chapter-05",
        name="write",
        arguments={
            "file_path": str(fifth_chapter),
            "content": "<h1>Advanced configurations</h1>\n",
        },
    )
    scripted_outcome = tool_outcome(
        tool_call=tool_call,
        output=f"Successfully wrote {fifth_chapter}",
        is_error=False,
    )
    executor = FakeExecutor([scripted_outcome])

    await runner.execute_batch(
        tool_calls=[tool_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=TurnSummary(final_response=""),
        dod=dod,
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert any(
        "Next step: create `06-performance-tuning.html`." in queued
        for queued in ephemeral_messages
    )
    assert all(
        "All explicitly planned artifacts now exist." not in queued
        for queued in ephemeral_messages
    )
3177
3178
@pytest.mark.asyncio
async def test_tool_batch_runner_uses_compact_missing_artifact_nudge_after_substantial_progress(
    temp_dir: Path,
) -> None:
    """After four of five chapters exist, the nudge is the compact form.

    The index and chapters one through four are on disk and recorded as
    touched files; chapter five is planned but absent. After a scripted
    rewrite of chapter four, the last ephemeral steering message must name
    ``05-advanced-features.html`` and must not ask for a ``TodoWrite`` refresh.
    """

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        # Tripwire: the test fails if this stub is ever awaited.
        raise AssertionError("Confidence scoring should not run in this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        # Tripwire: the test fails if this stub is ever awaited.
        raise AssertionError("Verification should not run in this scenario")

    nginx_dir = temp_dir / "guides" / "nginx"
    chapters_dir = nginx_dir / "chapters"
    nginx_dir.mkdir(parents=True)
    chapters_dir.mkdir()
    index_file = nginx_dir / "index.html"
    planned_chapters = [
        chapters_dir / file_name
        for file_name in (
            "01-introduction.html",
            "02-installation.html",
            "03-configuration.html",
            "04-basic-usage.html",
            "05-advanced-features.html",
        )
    ]
    fourth_chapter = planned_chapters[3]

    # Everything except the final chapter already exists.
    already_written = [index_file, *planned_chapters[:4]]
    for artifact in already_written:
        artifact.write_text("<html></html>\n")

    plan_lines = [
        "# Implementation Plan",
        "",
        "## File Changes",
        f"- `{nginx_dir}/`",
        f"- `{chapters_dir}/`",
        f"- `{index_file}`",
    ]
    plan_lines.extend(f"- `{chapter}`" for chapter in planned_chapters)
    plan_file = temp_dir / "implementation.md"
    plan_file.write_text("".join(f"{line}\n" for line in plan_lines))

    runtime_context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )
    persistent_messages: list[str] = []
    ephemeral_messages: list[str] = []
    runtime_context.queue_steering_message_callback = persistent_messages.append
    runtime_context.queue_ephemeral_steering_message_callback = ephemeral_messages.append
    runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))

    dod = create_definition_of_done("Create a thorough nginx guide.")
    dod.implementation_plan = str(plan_file)
    dod.touched_files.extend(str(artifact) for artifact in already_written)
    dod.completed_items.extend(
        [
            "Create the nginx directory structure",
            "Create the main index.html file with proper structure",
        ]
    )
    remaining_todos = [
        {
            "content": "Create each chapter file with appropriate content",
            "active_form": "Creating each chapter file with appropriate content",
            "status": "pending",
        }
    ]
    sync_todos_to_definition_of_done(dod, remaining_todos)

    tool_call = ToolCall(
        id="write-chapter-04",
        name="write",
        arguments={
            "file_path": str(fourth_chapter),
            "content": "<html>updated</html>\n",
        },
    )
    scripted_outcome = tool_outcome(
        tool_call=tool_call,
        output=f"Successfully wrote {fourth_chapter}",
        is_error=False,
    )
    executor = FakeExecutor([scripted_outcome])

    await runner.execute_batch(
        tool_calls=[tool_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=TurnSummary(final_response=""),
        dod=dod,
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert ephemeral_messages
    latest = ephemeral_messages[-1]
    for fragment in (
        "Next step: create `05-advanced-features.html`.",
        "Do not reread reference material or spend the next turn on bookkeeping.",
    ):
        assert fragment in latest
    assert "refresh `TodoWrite`" not in latest
3299
3300
@pytest.mark.asyncio
async def test_tool_batch_runner_todowrite_with_missing_artifact_requeues_exact_resume_step(
    temp_dir: Path,
) -> None:
    """A TodoWrite-only batch with a planned file still absent names that file.

    The plan lists two chapters but only the first exists on disk. After a
    batch containing just ``TodoWrite``, the persistent steering queue must
    end with a message that tells the model to create ``02-installation.html``,
    and the ephemeral queue must stay empty.
    """

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        # Tripwire: the test fails if this stub is ever awaited.
        raise AssertionError("Confidence scoring should not run in this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        # Tripwire: the test fails if this stub is ever awaited.
        raise AssertionError("Verification should not run in this scenario")

    def todo_entries() -> list[dict[str, str]]:
        # Built fresh on each call so no two consumers share list or dict objects.
        return [
            {
                "content": "Create 01-getting-started.html",
                "active_form": "Creating 01-getting-started.html",
                "status": "completed",
            },
            {
                "content": "Create 02-installation.html",
                "active_form": "Creating 02-installation.html",
                "status": "pending",
            },
        ]

    nginx_dir = temp_dir / "guides" / "nginx"
    chapters_dir = nginx_dir / "chapters"
    nginx_dir.mkdir(parents=True)
    chapters_dir.mkdir()
    index_file = nginx_dir / "index.html"
    index_file.write_text("<html></html>\n")
    first_chapter = chapters_dir / "01-getting-started.html"
    second_chapter = chapters_dir / "02-installation.html"  # planned, never written
    first_chapter.write_text("<h1>One</h1>\n")

    plan_file = temp_dir / "implementation.md"
    plan_file.write_text(
        "# Implementation Plan\n"
        "\n"
        "## File Changes\n"
        f"- `{nginx_dir}/`\n"
        f"- `{chapters_dir}/`\n"
        f"- `{index_file}`\n"
        f"- `{first_chapter}`\n"
        f"- `{second_chapter}`\n"
    )

    runtime_context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )
    persistent_messages: list[str] = []
    ephemeral_messages: list[str] = []
    runtime_context.queue_steering_message_callback = persistent_messages.append
    runtime_context.queue_ephemeral_steering_message_callback = ephemeral_messages.append
    runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))

    dod = create_definition_of_done("Create a multi-file nginx guide.")
    dod.implementation_plan = str(plan_file)
    sync_todos_to_definition_of_done(dod, todo_entries())
    dod.touched_files.extend([str(index_file), str(first_chapter)])

    tool_call = ToolCall(
        id="todo-only",
        name="TodoWrite",
        arguments={"todos": todo_entries()},
    )
    scripted_outcome = tool_outcome(
        tool_call=tool_call,
        output="Todos updated",
        is_error=False,
        metadata={"new_todos": todo_entries()},
    )
    executor = FakeExecutor([scripted_outcome])

    await runner.execute_batch(
        tool_calls=[tool_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=TurnSummary(final_response=""),
        dod=dod,
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert persistent_messages
    latest = persistent_messages[-1]
    for fragment in (
        "Todo tracking is updated. A declared output artifact is still missing.",
        "Resume by creating `02-installation.html` now.",
        "refresh `TodoWrite`",
        "Do not spend the next turn on TodoWrite alone",
    ):
        assert fragment in latest
    assert ephemeral_messages == []
3443
3444
@pytest.mark.asyncio
async def test_tool_batch_runner_todowrite_after_artifacts_exist_pushes_verification_handoff(
    temp_dir: Path,
) -> None:
    """Once every planned file exists, a TodoWrite-only batch hands off to verification.

    The index and both planned chapters are on disk and a verification command
    is set on the definition of done. The last queued steering message must
    announce that all planned artifacts exist, mention the verification todo,
    and omit the stale "Fortran guide structure" todo text.
    """

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        # Tripwire: the test fails if this stub is ever awaited.
        raise AssertionError("Confidence scoring should not run in this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        # Tripwire: the test fails if this stub is ever awaited.
        raise AssertionError("Verification should not run in this scenario")

    def pending_todos() -> list[dict[str, str]]:
        # Built fresh on each call so no two consumers share list or dict objects.
        return [
            {
                "content": content,
                "active_form": f"Working on: {content}",
                "status": "pending",
            }
            for content in (
                "First, examine the existing Fortran guide structure to understand the format and content organization",
                "Verify all guide files are linked and complete",
            )
        ]

    nginx_dir = temp_dir / "guides" / "nginx"
    chapters_dir = nginx_dir / "chapters"
    nginx_dir.mkdir(parents=True)
    chapters_dir.mkdir()
    index_file = nginx_dir / "index.html"
    first_chapter = chapters_dir / "01-getting-started.html"
    second_chapter = chapters_dir / "02-installation.html"
    # Every file the plan declares is present before the batch runs.
    index_file.write_text("<html></html>\n")
    first_chapter.write_text("<h1>One</h1>\n")
    second_chapter.write_text("<h1>Two</h1>\n")

    plan_file = temp_dir / "implementation.md"
    plan_file.write_text(
        "# Implementation Plan\n"
        "\n"
        "## File Changes\n"
        f"- `{nginx_dir}/`\n"
        f"- `{chapters_dir}/`\n"
        f"- `{index_file}`\n"
        f"- `{first_chapter}`\n"
        f"- `{second_chapter}`\n"
    )

    runtime_context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )
    queued_messages: list[str] = []
    runtime_context.queue_steering_message_callback = queued_messages.append
    runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))

    dod = create_definition_of_done("Create a multi-file nginx guide.")
    dod.implementation_plan = str(plan_file)
    dod.verification_commands = [f"ls -la {nginx_dir}"]
    sync_todos_to_definition_of_done(
        dod,
        pending_todos(),
        project_root=temp_dir,
    )

    tool_call = ToolCall(
        id="todo-only",
        name="TodoWrite",
        arguments={"todos": pending_todos()},
    )
    scripted_outcome = tool_outcome(
        tool_call=tool_call,
        output="Todos updated",
        is_error=False,
        metadata={"new_todos": pending_todos()},
    )
    executor = FakeExecutor([scripted_outcome])

    await runner.execute_batch(
        tool_calls=[tool_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=TurnSummary(final_response=""),
        dod=dod,
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert queued_messages
    latest = queued_messages[-1]
    for fragment in (
        "Todo tracking is updated. All explicitly planned artifacts now exist.",
        "Verify all guide files are linked and complete",
        "Move to verification once no specific mismatch remains.",
        "reopen reference materials",
    ):
        assert fragment in latest
    assert "Fortran guide structure" not in latest
3587
3588
@pytest.mark.asyncio
async def test_tool_batch_runner_todowrite_with_existing_output_roots_requeues_next_mutation(
    temp_dir: Path,
) -> None:
    """With output roots present, a TodoWrite-only batch names the next file to write.

    The plan declares only the guide directory, the chapters directory and the
    index. The index exists and links to ``chapters/01-introduction.html``,
    which is absent. The last queued steering message must continue with the
    pending "Write the introduction chapter" item and name
    ``01-introduction.html`` as the file to create.
    """

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        # Tripwire: the test fails if this stub is ever awaited.
        raise AssertionError("Confidence scoring should not run in this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        # Tripwire: the test fails if this stub is ever awaited.
        raise AssertionError("Verification should not run in this scenario")

    def todo_entries() -> list[dict[str, str]]:
        # Built fresh on each call so no two consumers share list or dict objects.
        return [
            {
                "content": "Examine the existing Fortran guide structure",
                "active_form": "Examining the existing Fortran guide structure",
                "status": "completed",
            },
            {
                "content": "Create the nginx directory structure",
                "active_form": "Creating the nginx directory structure",
                "status": "completed",
            },
            {
                "content": "Write the introduction chapter",
                "active_form": "Writing the introduction chapter",
                "status": "pending",
            },
        ]

    nginx_dir = temp_dir / "guides" / "nginx"
    chapters_dir = nginx_dir / "chapters"
    nginx_dir.mkdir(parents=True)
    chapters_dir.mkdir()
    index_file = nginx_dir / "index.html"
    # The index links to a chapter file that has not been created.
    index_file.write_text(
        "<!DOCTYPE html>\n"
        "<html>\n"
        "<body>\n"
        '<a href="chapters/01-introduction.html">Introduction</a>\n'
        "</body>\n"
        "</html>\n"
    )

    plan_file = temp_dir / "implementation.md"
    plan_file.write_text(
        "# Implementation Plan\n"
        "\n"
        "## File Changes\n"
        f"- `{nginx_dir}/`\n"
        f"- `{chapters_dir}/`\n"
        f"- `{index_file}`\n"
    )

    runtime_context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )
    queued_messages: list[str] = []
    runtime_context.queue_steering_message_callback = queued_messages.append
    runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))

    dod = create_definition_of_done("Create a multi-file nginx guide.")
    dod.implementation_plan = str(plan_file)
    dod.touched_files.append(str(index_file))
    sync_todos_to_definition_of_done(
        dod,
        todo_entries(),
        project_root=temp_dir,
    )

    tool_call = ToolCall(
        id="todo-next-mutation",
        name="TodoWrite",
        arguments={"todos": todo_entries()},
    )
    scripted_outcome = tool_outcome(
        tool_call=tool_call,
        output="Todos updated",
        is_error=False,
        metadata={"new_todos": todo_entries()},
    )
    executor = FakeExecutor([scripted_outcome])

    await runner.execute_batch(
        tool_calls=[tool_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=TurnSummary(final_response=""),
        dod=dod,
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert queued_messages
    latest = queued_messages[-1]
    for fragment in (
        "Todo tracking is updated. A declared output artifact is still missing.",
        "Continue with the next pending item: `Write the introduction chapter`.",
        "Resume by creating `01-introduction.html` now.",
        "Prefer one `write` call for `",
        "01-introduction.html` instead of more rereads.",
        "Do not spend the next turn on TodoWrite alone",
    ):
        assert fragment in latest
3753
3754
@pytest.mark.asyncio
async def test_tool_batch_runner_todowrite_prefers_pending_index_over_empty_output_directory(
    temp_dir: Path,
) -> None:
    """The missing planned index is the resume target, not a guessed chapter file.

    The chapters directory exists but is empty and ``index.html`` has not been
    written. Both are declared in the plan, and the first pending todo is the
    index. The last queued steering message must point at ``index.html`` with
    its resolved path and must not mention ``01-introduction.html``.
    """

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        # Tripwire: the test fails if this stub is ever awaited.
        raise AssertionError("Confidence scoring should not run in this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        # Tripwire: the test fails if this stub is ever awaited.
        raise AssertionError("Verification should not run in this scenario")

    def todo_entries() -> list[dict[str, str]]:
        # Built fresh on each call; callers decide which consumers share a list.
        return [
            {
                "content": "Examine the existing Fortran guide structure to understand the format and depth",
                "active_form": "Examining the existing Fortran guide structure",
                "status": "completed",
            },
            {
                "content": "Create the new nginx guide directory structure",
                "active_form": "Creating the new nginx guide directory structure",
                "status": "completed",
            },
            {
                "content": "Create a new index.html for the nginx guide",
                "active_form": "Creating a new index.html for the nginx guide",
                "status": "pending",
            },
            {
                "content": "Create the first chapter for the nginx guide",
                "active_form": "Creating the first chapter for the nginx guide",
                "status": "pending",
            },
        ]

    nginx_dir = temp_dir / "Loader" / "guides" / "nginx"
    chapters_dir = nginx_dir / "chapters"
    chapters_dir.mkdir(parents=True)
    index_file = nginx_dir / "index.html"  # declared in the plan, never written
    plan_file = temp_dir / "implementation.md"
    plan_file.write_text(
        "# Implementation Plan\n"
        "\n"
        "## File Changes\n"
        f"- `{chapters_dir}/`\n"
        f"- `{index_file}`\n"
    )

    dod = create_definition_of_done("Create a multi-file nginx guide.")
    dod.implementation_plan = str(plan_file)
    sync_todos_to_definition_of_done(
        dod,
        todo_entries(),
        project_root=temp_dir,
    )

    queued_messages: list[str] = []
    runtime_context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )
    runtime_context.queue_steering_message_callback = queued_messages.append
    runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))

    # One list object is shared by the tool-call arguments and the outcome metadata.
    todos = todo_entries()
    tool_call = ToolCall(
        id="todo-index-before-chapter",
        name="TodoWrite",
        arguments={"todos": todos},
    )
    scripted_outcome = tool_outcome(
        tool_call=tool_call,
        output="Todos updated",
        is_error=False,
        metadata={"new_todos": todos},
    )
    executor = FakeExecutor([scripted_outcome])

    await runner.execute_batch(
        tool_calls=[tool_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=TurnSummary(final_response=""),
        dod=dod,
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert queued_messages
    latest = queued_messages[-1]
    for fragment in (
        "Continue with the next pending item: `Create a new index.html for the nginx guide`.",
        "Resume by creating `index.html` now.",
        f"Prefer one `write` call for `{index_file.resolve(strict=False)}`",
    ):
        assert fragment in latest
    assert "01-introduction.html" not in latest
3892
3893
@pytest.mark.asyncio
async def test_tool_batch_runner_todowrite_with_declared_child_targets_names_next_missing_file(
    temp_dir: Path,
) -> None:
    """Links in the existing index determine the next missing file that is named.

    The index links to ``chapters/introduction.html`` and
    ``chapters/installation.html``; neither exists. With "Write the
    introduction chapter" pending, the last queued steering message must tell
    the model to create ``introduction.html``.
    """

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        # Tripwire: the test fails if this stub is ever awaited.
        raise AssertionError("Confidence scoring should not run in this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        # Tripwire: the test fails if this stub is ever awaited.
        raise AssertionError("Verification should not run in this scenario")

    nginx_dir = temp_dir / "guides" / "nginx"
    chapters_dir = nginx_dir / "chapters"
    nginx_dir.mkdir(parents=True)
    chapters_dir.mkdir()
    index_file = nginx_dir / "index.html"
    index_file.write_text(
        "<html>\n"
        '<a href="chapters/introduction.html">Introduction</a>\n'
        '<a href="chapters/installation.html">Installation</a>\n'
        "</html>\n"
    )

    plan_file = temp_dir / "implementation.md"
    plan_file.write_text(
        "# Implementation Plan\n"
        "\n"
        "## File Changes\n"
        f"- `{nginx_dir}/`\n"
        f"- `{chapters_dir}/`\n"
        f"- `{index_file}`\n"
    )

    dod = create_definition_of_done("Create a multi-file nginx guide.")
    dod.implementation_plan = str(plan_file)
    dod.pending_items = [
        "Write the introduction chapter",
        "Complete the requested work",
    ]
    dod.touched_files.append(str(index_file))

    queued_messages: list[str] = []
    runtime_context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )
    runtime_context.queue_steering_message_callback = queued_messages.append
    runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))

    # The call arguments use the camelCase key; the outcome metadata uses snake_case.
    requested_todo = {
        "content": "Write the introduction chapter",
        "activeForm": "Writing the introduction chapter",
        "status": "pending",
    }
    recorded_todo = {
        "content": "Write the introduction chapter",
        "active_form": "Writing the introduction chapter",
        "status": "pending",
    }
    tool_call = ToolCall(
        id="todo-1",
        name="TodoWrite",
        arguments={"todos": [requested_todo]},
    )
    scripted_outcome = tool_outcome(
        tool_call=tool_call,
        output="Todos updated",
        is_error=False,
        metadata={"new_todos": [recorded_todo]},
    )
    executor = FakeExecutor([scripted_outcome])

    await runner.execute_batch(
        tool_calls=[tool_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=TurnSummary(final_response=""),
        dod=dod,
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert queued_messages
    latest = queued_messages[-1]
    for fragment in (
        "Todo tracking is updated. A declared output artifact is still missing.",
        "Continue with the next pending item: `Write the introduction chapter`.",
        "Resume by creating `introduction.html` now.",
        "Prefer one `write` call for `",
        "introduction.html` instead of more rereads.",
        "Do not spend the next turn on TodoWrite alone",
    ):
        assert fragment in latest
4020
4021
@pytest.mark.asyncio
async def test_tool_batch_runner_todowrite_names_concrete_pending_file_after_artifacts_exist(
    temp_dir: Path,
) -> None:
    """A pending chapter todo resolves to the concrete file linked from the index.

    Every path declared in the plan exists, and chapter one is written. The
    index also links to ``chapters/02-installation.html``, which is absent.
    With "Creating Chapter 2: Installation and Setup" pending, the last queued
    steering message must name ``02-installation.html`` and its resolved path.
    """

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        # Tripwire: the test fails if this stub is ever awaited.
        raise AssertionError("Confidence scoring should not run in this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        # Tripwire: the test fails if this stub is ever awaited.
        raise AssertionError("Verification should not run in this scenario")

    nginx_dir = temp_dir / "guides" / "nginx"
    chapters_dir = nginx_dir / "chapters"
    nginx_dir.mkdir(parents=True)
    chapters_dir.mkdir()
    index_file = nginx_dir / "index.html"
    first_chapter = chapters_dir / "01-introduction.html"
    index_file.write_text(
        "<html>\n"
        '<a href="chapters/01-introduction.html">Chapter 1: Introduction to NGINX Tool</a>\n'
        '<a href="chapters/02-installation.html">Chapter 2: Installation and Setup</a>\n'
        "</html>\n"
    )
    first_chapter.write_text("<html></html>\n")

    plan_file = temp_dir / "implementation.md"
    plan_file.write_text(
        "# Implementation Plan\n"
        "\n"
        "## File Changes\n"
        f"- `{nginx_dir}/`\n"
        f"- `{chapters_dir}/`\n"
        f"- `{index_file}`\n"
    )

    dod = create_definition_of_done("Create a multi-file nginx guide.")
    dod.implementation_plan = str(plan_file)
    dod.pending_items = [
        "Creating Chapter 2: Installation and Setup",
        "Complete the requested work",
    ]
    dod.touched_files.extend([str(index_file), str(first_chapter)])

    queued_messages: list[str] = []
    runtime_context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )
    runtime_context.queue_steering_message_callback = queued_messages.append
    runner = ToolBatchRunner(runtime_context, DefinitionOfDoneStore(temp_dir))

    # The call arguments use the camelCase key; the outcome metadata uses snake_case.
    requested_todo = {
        "content": "Creating Chapter 2: Installation and Setup",
        "activeForm": "Creating Chapter 2: Installation and Setup",
        "status": "pending",
    }
    recorded_todo = {
        "content": "Creating Chapter 2: Installation and Setup",
        "active_form": "Creating Chapter 2: Installation and Setup",
        "status": "pending",
    }
    tool_call = ToolCall(
        id="todo-1",
        name="TodoWrite",
        arguments={"todos": [requested_todo]},
    )
    scripted_outcome = tool_outcome(
        tool_call=tool_call,
        output="Todos updated",
        is_error=False,
        metadata={"new_todos": [recorded_todo]},
    )
    executor = FakeExecutor([scripted_outcome])

    await runner.execute_batch(
        tool_calls=[tool_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=TurnSummary(final_response=""),
        dod=dod,
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert queued_messages
    latest = queued_messages[-1]
    second_chapter = chapters_dir / "02-installation.html"
    for fragment in (
        "Todo tracking is updated. A declared output artifact is still missing.",
        "Continue with the next pending item: `Creating Chapter 2: Installation and Setup`.",
        "Resume by creating `02-installation.html` now.",
        f"Prefer one `write` call for `{second_chapter.resolve(strict=False)}` instead of more rereads.",
        "Make your next response the concrete mutation tool call itself",
    ):
        assert fragment in latest
4153
4154
@pytest.mark.asyncio
async def test_tool_batch_runner_todowrite_uses_observed_sibling_pattern_for_next_file(
    temp_dir: Path,
) -> None:
    """Check the steering message queued after a ``TodoWrite`` with an empty ``chapters/``.

    The session history contains a ``read`` of a reference guide's
    ``chapters/01-introduction.html``, while the planned nginx ``chapters/``
    directory has no files yet. After a successful ``TodoWrite`` batch the
    test asserts that the last queued steering message names
    ``01-introduction.html`` as the file to create and mentions the observed
    filename pattern from the other ``chapters/`` directory.
    """

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should not run in this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run in this scenario")

    # Reference guide chapter that the scripted history has already read.
    reference_chapters = temp_dir / "fortran" / "chapters"
    reference_chapters.mkdir(parents=True)
    reference_intro = reference_chapters / "01-introduction.html"
    reference_intro.write_text("<h1>Introduction</h1>\n")

    # Target guide: the index exists, the chapters directory is empty.
    guide_root = temp_dir / "guides" / "nginx"
    chapters = guide_root / "chapters"
    guide_root.mkdir(parents=True)
    chapters.mkdir()
    index_path = guide_root / "index.html"
    index_path.write_text("<html></html>\n")

    plan_lines = [
        "# Implementation Plan",
        "",
        "## File Changes",
        f"- `{guide_root}/`",
        f"- `{chapters}/`",
        f"- `{index_path}`",
        "",
    ]
    implementation_plan = temp_dir / "implementation.md"
    implementation_plan.write_text("\n".join(plan_lines))

    dod = create_definition_of_done("Create a multi-file nginx guide.")
    dod.implementation_plan = str(implementation_plan)
    dod.pending_items = [
        "Write the introduction chapter",
        "Complete the requested work",
    ]
    dod.touched_files.append(str(index_path))

    prior_read = ToolCall(
        id="read-ref-1",
        name="read",
        arguments={"file_path": str(reference_intro)},
    )
    history = [Message(role=Role.ASSISTANT, content="", tool_calls=[prior_read])]

    queued_messages: list[str] = []
    context = build_context(
        temp_dir=temp_dir,
        messages=history,
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )
    context.queue_steering_message_callback = queued_messages.append
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))

    requested_todo = {
        "content": "Write the introduction chapter",
        "activeForm": "Writing the introduction chapter",
        "status": "pending",
    }
    tool_call = ToolCall(
        id="todo-observed-1",
        name="TodoWrite",
        arguments={"todos": [requested_todo]},
    )
    # The recorded metadata uses the snake_case ``active_form`` key.
    recorded_todo = {
        "content": "Write the introduction chapter",
        "active_form": "Writing the introduction chapter",
        "status": "pending",
    }
    outcome = tool_outcome(
        tool_call=tool_call,
        output="Todos updated",
        is_error=False,
        metadata={"new_todos": [recorded_todo]},
    )
    executor = FakeExecutor([outcome])

    summary = TurnSummary(final_response="")
    await runner.execute_batch(
        tool_calls=[tool_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=summary,
        dod=dod,
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert queued_messages
    message = queued_messages[-1]
    assert "Todo tracking is updated. A declared output artifact is still missing." in message
    assert "Continue with the next pending item: `Write the introduction chapter`." in message
    assert "Resume by creating `01-introduction.html` now." in message
    sibling_pattern_hint = (
        "It mirrors the observed filename pattern from another `chapters/` directory "
        "you already inspected."
    )
    assert sibling_pattern_hint in message
    assert "01-introduction.html` instead of more rereads." in message
4290
4291
@pytest.mark.asyncio
async def test_tool_batch_runner_bookkeeping_note_with_missing_artifact_requeues_resume_step(
    temp_dir: Path,
) -> None:
    """Check the steering message queued after a working note while a planned file is missing.

    The plan lists two chapter files but only the first exists on disk. After
    a successful ``notepad_write_working`` batch the test asserts that the
    last queued steering message tells the agent to create
    ``02-installation.html`` via a concrete mutation tool call.
    """

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should not run in this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run in this scenario")

    guide_root = temp_dir / "guides" / "nginx"
    chapters = guide_root / "chapters"
    guide_root.mkdir(parents=True)
    chapters.mkdir()
    index_path = guide_root / "index.html"
    chapter_one = chapters / "01-getting-started.html"
    chapter_two = chapters / "02-installation.html"
    # Only the index and the first chapter are written; chapter two stays absent.
    index_path.write_text("<html></html>\n")
    chapter_one.write_text("<h1>One</h1>\n")

    plan_lines = [
        "# Implementation Plan",
        "",
        "## File Changes",
        f"- `{guide_root}/`",
        f"- `{chapters}/`",
        f"- `{index_path}`",
        f"- `{chapter_one}`",
        f"- `{chapter_two}`",
        "",
    ]
    implementation_plan = temp_dir / "implementation.md"
    implementation_plan.write_text("\n".join(plan_lines))

    context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )
    queued_messages: list[str] = []
    context.queue_steering_message_callback = queued_messages.append
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))

    dod = create_definition_of_done("Create a multi-file nginx guide.")
    dod.implementation_plan = str(implementation_plan)
    todos = [
        {
            "content": "Create 01-getting-started.html",
            "active_form": "Creating 01-getting-started.html",
            "status": "completed",
        },
        {
            "content": "Create 02-installation.html",
            "active_form": "Creating 02-installation.html",
            "status": "pending",
        },
    ]
    sync_todos_to_definition_of_done(dod, todos, project_root=temp_dir)
    dod.touched_files.extend([str(index_path), str(chapter_one)])

    tool_call = ToolCall(
        id="working-note",
        name="notepad_write_working",
        arguments={"content": "Creating the second chapter file: Installation"},
    )
    outcome = tool_outcome(
        tool_call=tool_call,
        output="Working note recorded",
        is_error=False,
    )
    executor = FakeExecutor([outcome])

    summary = TurnSummary(final_response="")
    await runner.execute_batch(
        tool_calls=[tool_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=summary,
        dod=dod,
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert queued_messages
    message = queued_messages[-1]
    expected_fragments = (
        "Bookkeeping note is recorded. A declared output artifact is still missing.",
        "Resume by creating `02-installation.html` now.",
        "Make your next response the concrete mutation tool call itself",
        "refresh `TodoWrite`",
        "Do not spend the next turn on additional notes, rediscovery, verification, or final confirmation",
    )
    for fragment in expected_fragments:
        assert fragment in message
4406
4407
@pytest.mark.asyncio
async def test_tool_batch_runner_working_note_respects_discovery_first_pending_step(
    temp_dir: Path,
) -> None:
    """Check that a working note keeps the steering on a pending discovery step.

    The first pending item is an "examine the existing guide" step and none
    of the planned nginx files exist. After a successful
    ``notepad_write_working`` batch the test asserts that the last queued
    steering message continues with that discovery item, asks for one
    evidence-gathering tool call, and does not tell the agent to create
    ``index.html``.
    """

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run in this scenario")

    # Planned output locations; nothing under them is created in this test.
    nginx_root = temp_dir / "guides" / "nginx"
    plan_lines = [
        "# Implementation Plan",
        "",
        "## File Changes",
        f"- `{nginx_root / 'index.html'}`",
        f"- `{nginx_root / 'chapters'}`",
        "",
    ]
    implementation_plan = temp_dir / "implementation.md"
    implementation_plan.write_text("\n".join(plan_lines))

    context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )
    queued_messages: list[str] = []
    context.queue_steering_message_callback = queued_messages.append
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))

    discovery_step = (
        "First, examine the existing fortran guide structure and content to understand the format"
    )
    dod = create_definition_of_done("Create a multi-file nginx guide.")
    dod.implementation_plan = str(implementation_plan)
    dod.pending_items.extend(
        [
            discovery_step,
            "Create the nginx directory structure",
            "Develop the main index.html file for the nginx guide",
        ]
    )

    tool_call = ToolCall(
        id="working-note",
        name="notepad_write_working",
        arguments={"content": "Analyzing the fortran guide structure before creating nginx guide"},
    )
    outcome = tool_outcome(
        tool_call=tool_call,
        output="Working note recorded",
        is_error=False,
    )
    executor = FakeExecutor([outcome])

    summary = TurnSummary(final_response="")
    await runner.execute_batch(
        tool_calls=[tool_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=summary,
        dod=dod,
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert queued_messages
    message = queued_messages[-1]
    assert f"Continue with the next pending item: `{discovery_step}`." in message
    assert "one concrete evidence-gathering tool call" in message
    assert "Resume by creating `index.html` now." not in message
4500
4501
@pytest.mark.asyncio
async def test_tool_batch_runner_working_note_prefers_declared_output_gap_over_stale_discovery(
    temp_dir: Path,
) -> None:
    """Check that a missing linked chapter outranks a leftover discovery item.

    ``index.html`` links to three chapters but only the first exists, and the
    pending list still starts with an "examine the existing guide" step.
    After a successful ``notepad_write_working`` batch the test asserts that
    the last queued steering message asks for ``02-installation.html`` and
    does not continue with the discovery item.
    """

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run in this scenario")

    guide_root = temp_dir / "guides" / "nginx"
    chapters_dir = guide_root / "chapters"
    chapters_dir.mkdir(parents=True)
    index_path = guide_root / "index.html"
    first_chapter = chapters_dir / "01-introduction.html"

    # The index references chapters 01-03; only chapter 01 is written below.
    index_links = [
        '<a href="chapters/01-introduction.html">Introduction</a>',
        '<a href="chapters/02-installation.html">Installation</a>',
        '<a href="chapters/03-configuration.html">Configuration</a>',
    ]
    index_path.write_text("\n".join(index_links))
    first_chapter.write_text("<h1>Introduction</h1>\n")

    plan_lines = [
        "# Implementation Plan",
        "",
        "## File Changes",
        f"- `{guide_root / 'index.html'}`",
        f"- `{chapters_dir}/`",
        "",
    ]
    implementation_plan = temp_dir / "implementation.md"
    implementation_plan.write_text("\n".join(plan_lines))

    context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )
    queued_messages: list[str] = []
    context.queue_steering_message_callback = queued_messages.append
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))

    dod = create_definition_of_done("Create a multi-file nginx guide.")
    dod.implementation_plan = str(implementation_plan)
    dod.pending_items.extend(
        [
            "First, examine the existing fortran guide structure and content to understand the format",
            "Create chapter files following the established pattern",
        ]
    )
    dod.touched_files.extend([str(index_path), str(first_chapter)])

    tool_call = ToolCall(
        id="working-note",
        name="notepad_write_working",
        arguments={"content": "Created index and first chapter; next is chapter 2"},
    )
    outcome = tool_outcome(
        tool_call=tool_call,
        output="Working note recorded",
        is_error=False,
    )
    executor = FakeExecutor([outcome])

    summary = TurnSummary(final_response="")
    await runner.execute_batch(
        tool_calls=[tool_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=summary,
        dod=dod,
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert queued_messages
    message = queued_messages[-1]
    assert "Bookkeeping note is recorded. A declared output artifact is still missing." in message
    assert "Resume by creating `02-installation.html` now." in message
    stale_discovery_prefix = (
        "Continue with the next pending item: `First, examine the existing fortran guide structure"
    )
    assert stale_discovery_prefix not in message
4607
4608
@pytest.mark.asyncio
async def test_tool_batch_runner_shallow_glob_does_not_handoff_before_content_read(
    temp_dir: Path,
) -> None:
    """Check that a directory-only ``glob`` result queues no steering message.

    The glob output lists just the reference guide root and its ``chapters``
    directory. With a discovery step first in the pending list, the test
    asserts that nothing is queued after the batch runs.
    """

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run in this scenario")

    guides_dir = temp_dir / "Loader" / "guides"
    fortran_root = guides_dir / "fortran"
    chapters_dir = fortran_root / "chapters"
    chapters_dir.mkdir(parents=True)

    # Planned nginx outputs; none of them are created in this test.
    plan_lines = [
        "# Implementation Plan",
        "",
        "## File Changes",
        f"- `{guides_dir / 'nginx' / 'index.html'}`",
        f"- `{guides_dir / 'nginx' / 'chapters'}`",
        "",
    ]
    implementation_plan = temp_dir / "implementation.md"
    implementation_plan.write_text("\n".join(plan_lines))

    context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )
    queued_messages: list[str] = []
    context.queue_steering_message_callback = queued_messages.append
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))

    dod = create_definition_of_done("Create a multi-file nginx guide.")
    dod.implementation_plan = str(implementation_plan)
    dod.pending_items.extend(
        [
            "First, examine the existing fortran guide structure and content",
            "Create the nginx directory structure",
            "Develop the main index.html file for nginx guide",
        ]
    )

    tool_call = ToolCall(
        id="glob-1",
        name="glob",
        arguments={"pattern": "**", "path": str(fortran_root)},
    )
    outcome = tool_outcome(
        tool_call=tool_call,
        output=f"{fortran_root}\n{chapters_dir}",
        is_error=False,
    )
    executor = FakeExecutor([outcome])

    summary = TurnSummary(final_response="")
    await runner.execute_batch(
        tool_calls=[tool_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=summary,
        dod=dod,
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert queued_messages == []
4698
4699
@pytest.mark.asyncio
async def test_tool_batch_runner_hands_off_noop_toc_edit_when_file_is_already_valid(
    temp_dir: Path,
) -> None:
    """Check that a blocked identical-string ``edit`` queues no steering message.

    ``index.html`` already links to both existing chapter files with matching
    titles. The scripted ``edit`` call uses the same text for ``old_string``
    and ``new_string`` and comes back as a BLOCKED error; the test asserts
    that the steering queue stays empty.
    """

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should not run in this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run in this scenario")

    prompt = (
        "Have a look at ~/Loader/guides/fortran/index.html, then "
        "~/Loader/guides/fortran/chapters. The table of contents links in "
        "index.html are inaccurate and the href’s are wrong. Let’s update the "
        "links and their link texts to be correct."
    )

    chapters = temp_dir / "chapters"
    chapters.mkdir()
    chapter_bodies = {
        "01-introduction.html": "<h1>Chapter 1: Introduction to Fortran</h1>\n",
        "02-setup.html": "<h1>Chapter 2: Setting Up Your Environment</h1>\n",
    }
    for filename, body in chapter_bodies.items():
        (chapters / filename).write_text(body)

    # Table of contents whose hrefs and link texts match the chapter files above.
    current_block = (
        "<h2>Table of Contents</h2>\n"
        ' <ul class="chapter-list">\n'
        ' <li><a href="chapters/01-introduction.html">Chapter 1: Introduction to Fortran</a></li>\n'
        ' <li><a href="chapters/02-setup.html">Chapter 2: Setting Up Your Environment</a></li>\n'
        " </ul>\n"
    )
    index_path = temp_dir / "index.html"
    index_path.write_text(current_block)

    context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )
    context.session.current_task = prompt  # type: ignore[attr-defined]
    queued_messages: list[str] = []
    context.queue_steering_message_callback = queued_messages.append
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))

    tool_call = ToolCall(
        id="edit-1",
        name="edit",
        arguments={
            "file_path": str(index_path),
            "old_string": current_block,
            "new_string": current_block,
        },
    )
    blocked_output = (
        "[Blocked - old_string and new_string are identical - no change "
        "would occur] Suggestion: Provide different old and new strings"
    )
    outcome = tool_outcome(
        tool_call=tool_call,
        output=blocked_output,
        is_error=True,
        state=ToolExecutionState.BLOCKED,
    )
    executor = FakeExecutor([outcome])

    await runner.execute_batch(
        tool_calls=[tool_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=TurnSummary(final_response=""),
        dod=create_definition_of_done(prompt),
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert queued_messages == []
4793
4794
def test_tool_batch_runner_blocked_noop_edit_nudge_stays_on_active_repair_target(
    temp_dir: Path,
) -> None:
    """Check the nudge queued for a blocked identical-string edit on the repair target.

    The session history carries a "Repair focus" message naming the target
    file. ``_queue_blocked_html_edit_nudge`` is called directly with an
    identical-string ``edit`` on that file, and the test asserts that the
    first queued nudge mentions the target path and the expected guidance
    phrases.
    """

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run in this scenario")

    chapters_dir = temp_dir / "guide" / "chapters"
    repair_target = chapters_dir / "04-basic-usage.html"
    missing_chapter = chapters_dir / "05-advanced-topics.html"
    repair_focus = (
        "Repair focus:\n"
        f"- Fix the broken local reference `05-advanced-topics.html` in `{repair_target}`.\n"
        f"- Immediate next step: edit `{repair_target}`.\n"
        f"- If the broken reference should remain, create `{missing_chapter}`; otherwise remove or replace `05-advanced-topics.html`.\n"
    )

    context = build_context(
        temp_dir=temp_dir,
        messages=[Message(role=Role.ASSISTANT, content=repair_focus)],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
    )
    queued: list[str] = []
    context.queue_steering_message_callback = queued.append
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))

    noop_edit = ToolCall(
        id="edit-1",
        name="edit",
        arguments={
            "file_path": str(repair_target),
            "old_string": "same",
            "new_string": "same",
        },
    )
    runner._queue_blocked_html_edit_nudge(
        noop_edit,
        "[Blocked - old_string and new_string are identical - no change would occur] Suggestion: Provide different old and new strings",
    )

    assert queued
    nudge = queued[0]
    assert str(repair_target) in nudge
    assert "no on-disk change" in nudge
    assert "replace the surrounding block" in nudge
    assert "Do not reopen unrelated reference materials" in nudge
4853
4854
async def _noop_emit(event: AgentEvent) -> None:
    """Accept an event and discard it; used where a test ignores emitted events."""
    return
4857
4858
@pytest.mark.asyncio
async def test_tool_batch_runner_marks_verification_planned_after_new_mutation(
    temp_dir: Path,
) -> None:
    """Check the definition-of-done state after a successful ``write`` batch.

    Starting from a fresh definition of done, the test asserts that after the
    batch the verification result is ``"planned"``, verification attempt 1 is
    active, and the last workflow timeline entry records the planned
    verification with matching attempt identifiers.
    """

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run for this scenario")

    context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
    )
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))

    tool_call = ToolCall(
        id="write-1",
        name="write",
        arguments={"file_path": str(temp_dir / "README.md"), "content": "updated\n"},
    )
    outcome = tool_outcome(tool_call=tool_call, output="wrote file", is_error=False)
    executor = FakeExecutor([outcome])
    summary = TurnSummary(final_response="")
    dod = create_definition_of_done("Update README and verify it still works.")
    events: list[AgentEvent] = []

    async def emit(event: AgentEvent) -> None:
        events.append(event)

    await runner.execute_batch(
        tool_calls=[tool_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=emit,
        summary=summary,
        dod=dod,
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    # State recorded on the definition of done.
    assert dod.last_verification_result == "planned"
    assert dod.verification_commands
    assert "Collect verification evidence" in dod.pending_items
    assert dod.active_verification_attempt_id == "verification-attempt-1"
    assert dod.active_verification_attempt_number == 1

    # Most recent workflow timeline entry and its first observation.
    latest_entry = summary.workflow_timeline[-1]
    assert latest_entry.reason_code == "verification_planned"
    assert latest_entry.policy_outcome == "planned"
    observation = latest_entry.verification_observations[0]
    assert observation.status == "planned"
    assert observation.attempt_id == "verification-attempt-1"
    assert observation.attempt_number == 1
4930
4931
@pytest.mark.asyncio
async def test_tool_batch_runner_does_not_mark_verification_planned_after_setup_only_mkdir(
    temp_dir: Path,
) -> None:
    """Check that a ``mkdir -p`` bash call does not plan verification.

    The only tool call creates the planned ``chapters`` directory. The test
    asserts that afterwards the definition of done has no verification
    result, no "Collect verification evidence" pending item, and no
    ``verification_planned`` entry in the workflow timeline.
    """

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run in this scenario")

    context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
    )
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))

    nginx_root = temp_dir / "Loader" / "guides" / "nginx"
    chapters = nginx_root / "chapters"
    plan_lines = [
        "# Implementation Plan",
        "",
        "## File Changes",
        f"- `{chapters}/`",
        f"- `{nginx_root / 'index.html'}`",
        "",
    ]
    implementation_plan = temp_dir / "implementation.md"
    implementation_plan.write_text("\n".join(plan_lines))

    tool_call = ToolCall(
        id="mkdir-1",
        name="bash",
        arguments={"command": f"mkdir -p {chapters}"},
    )
    outcome = tool_outcome(tool_call=tool_call, output="", is_error=False)
    executor = FakeExecutor([outcome])
    summary = TurnSummary(final_response="")
    dod = create_definition_of_done("Create an equally thorough nginx guide with chapters.")
    dod.implementation_plan = str(implementation_plan)
    events: list[AgentEvent] = []

    async def emit(event: AgentEvent) -> None:
        events.append(event)

    await runner.execute_batch(
        tool_calls=[tool_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=emit,
        summary=summary,
        dod=dod,
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert dod.last_verification_result is None
    assert "Collect verification evidence" not in dod.pending_items
    assert all(
        entry.reason_code != "verification_planned" for entry in summary.workflow_timeline
    )
5010
5011
@pytest.mark.asyncio
async def test_tool_batch_runner_marks_passed_verification_stale_after_new_mutation(
    temp_dir: Path,
) -> None:
    """Check the definition-of-done state when a ``write`` follows a passed verification.

    The definition of done starts with a passed attempt 1 and its evidence.
    After a successful ``write`` batch the test asserts that the result is
    ``"stale"``, the evidence list is empty, "Collect verification evidence"
    is pending again, attempt 2 is active, and the last timeline entry
    records the stale observation for attempt 1.
    """

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run for this scenario")

    context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
    )
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))

    tool_call = ToolCall(
        id="write-1",
        name="write",
        arguments={"file_path": str(temp_dir / "README.md"), "content": "updated\n"},
    )
    outcome = tool_outcome(tool_call=tool_call, output="wrote file", is_error=False)
    executor = FakeExecutor([outcome])
    summary = TurnSummary(final_response="")

    # Seed a definition of done whose first verification attempt already passed.
    dod = create_definition_of_done("Update README and verify it still works.")
    dod.verification_commands = ["uv run pytest -q"]
    dod.last_verification_result = "passed"
    dod.verification_attempt_counter = 1
    dod.active_verification_attempt_id = "verification-attempt-1"
    dod.active_verification_attempt_number = 1
    passed_evidence = VerificationEvidence(
        command="uv run pytest -q",
        passed=True,
        stdout="401 passed",
        kind="test",
    )
    dod.evidence = [passed_evidence]
    dod.completed_items.append("Collect verification evidence")
    events: list[AgentEvent] = []

    async def emit(event: AgentEvent) -> None:
        events.append(event)

    await runner.execute_batch(
        tool_calls=[tool_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=emit,
        summary=summary,
        dod=dod,
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    # State recorded on the definition of done.
    assert dod.last_verification_result == "stale"
    assert dod.evidence == []
    assert "Collect verification evidence" in dod.pending_items
    assert "Collect verification evidence" not in dod.completed_items
    assert dod.active_verification_attempt_id == "verification-attempt-2"
    assert dod.active_verification_attempt_number == 2

    # Most recent workflow timeline entry and its first observation.
    latest_entry = summary.workflow_timeline[-1]
    assert latest_entry.reason_code == "verification_stale"
    assert latest_entry.policy_outcome == "stale"
    observation = latest_entry.verification_observations[0]
    assert observation.status == "stale"
    assert observation.attempt_id == "verification-attempt-1"
    assert observation.attempt_number == 1
    assert observation.supersedes_attempt_id == "verification-attempt-2"
    assert observation.command == "uv run pytest -q"
5106
5107
def test_tool_batch_runner_blocked_active_repair_nudge_uses_repair_scope(temp_dir: Path) -> None:
    """Check the nudge queued for a blocked call during an active repair.

    The session history carries a "Repair focus" message naming the file to
    edit and the chapter file that may be created.
    ``_queue_blocked_active_repair_nudge`` is called directly, and the test
    asserts that the first queued nudge contains both paths and the warning
    about unrelated reference materials.
    """

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run in this scenario")

    guide_dir = temp_dir / "guide"
    repair_target = guide_dir / "index.html"
    missing_chapter = guide_dir / "chapters" / "01-getting-started.html"
    repair_focus = (
        "Repair focus:\n"
        f"- Fix the broken local reference `chapters/01-getting-started.html` in `{repair_target}`.\n"
        f"- Immediate next step: edit `{repair_target}`.\n"
        f"- If the broken reference should remain, create `{missing_chapter}`; otherwise remove or replace `chapters/01-getting-started.html`.\n"
    )

    context = build_context(
        temp_dir=temp_dir,
        messages=[Message(role=Role.ASSISTANT, content=repair_focus)],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
    )
    queued: list[str] = []
    context.queue_steering_message_callback = queued.append
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))

    runner._queue_blocked_active_repair_nudge(
        "[Blocked - active repair scope: verification already identified the repair target.]"
    )

    assert queued
    nudge = queued[0]
    assert str(repair_target) in nudge
    assert str(missing_chapter) in nudge
    assert "Do not reopen unrelated reference materials" in nudge
5154
5155
def test_tool_batch_runner_blocked_active_repair_mutation_nudge_uses_allowed_paths(
    temp_dir: Path,
) -> None:
    """Check the nudge queued for a blocked mutation during an active repair.

    The session history carries a "Repair focus" message naming the chapter
    to edit and the stylesheet that may be created.
    ``_queue_blocked_active_repair_mutation_nudge`` is called directly, and
    the test asserts that the first queued nudge contains both paths and the
    "before widening the change set" phrase.
    """

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run in this scenario")

    guide_dir = temp_dir / "guide"
    repair_target = guide_dir / "chapters" / "05-advanced-configurations.html"
    stylesheet = guide_dir / "styles.css"
    repair_focus = (
        "Repair focus:\n"
        f"- Fix the broken local reference `../styles.css` in `{repair_target}`.\n"
        f"- Immediate next step: edit `{repair_target}`.\n"
        f"- If the broken reference should remain, create `{stylesheet}`; otherwise remove or replace `../styles.css`.\n"
    )

    context = build_context(
        temp_dir=temp_dir,
        messages=[Message(role=Role.ASSISTANT, content=repair_focus)],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
    )
    queued: list[str] = []
    context.queue_steering_message_callback = queued.append
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))

    runner._queue_blocked_active_repair_mutation_nudge(
        "[Blocked - active repair mutation scope: verification already identified the repair target.]"
    )

    assert queued
    nudge = queued[0]
    assert str(repair_target) in nudge
    assert str(stylesheet) in nudge
    assert "before widening the change set" in nudge
5205
5206
def test_tool_batch_runner_blocked_late_reference_drift_nudge_points_to_missing_artifact(
    temp_dir: Path,
) -> None:
    """Check the late-reference-drift nudge points at the planned file that is missing.

    The implementation plan lists four guide files; only the first three are
    written to disk. The queued steering message must mention the remaining one
    (`03-first-website.html`) and the phrase "older reference materials".
    """

    async def _reject_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        # Stub: fails the test loudly if confidence scoring is ever invoked.
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def _reject_verification(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        # Stub: fails the test loudly if action verification is ever invoked.
        raise AssertionError("Verification should not run in this scenario")

    context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=_reject_confidence,
        verify_action=_reject_verification,
    )
    # Capture steering messages instead of delivering them anywhere.
    steering_messages: list[str] = []
    context.queue_steering_message_callback = steering_messages.append

    dod_store = DefinitionOfDoneStore(temp_dir)
    dod = create_definition_of_done("Create a multi-file guide from a reference")

    # Plan declares four artifacts, using paths relative to the project root.
    plan_text = (
        "# File Changes\n"
        "- `guide/index.html`\n"
        "- `guide/chapters/01-getting-started.html`\n"
        "- `guide/chapters/02-installation.html`\n"
        "- `guide/chapters/03-first-website.html`\n"
    )
    plan_path = temp_dir / "implementation.md"
    plan_path.write_text(plan_text)
    dod.implementation_plan = str(plan_path)

    # Materialize every planned artifact except the third chapter.
    guide_dir = temp_dir / "guide"
    chapters_dir = guide_dir / "chapters"
    chapters_dir.mkdir(parents=True, exist_ok=True)
    existing_artifacts = (
        (guide_dir / "index.html", "index"),
        (chapters_dir / "01-getting-started.html", "one"),
        (chapters_dir / "02-installation.html", "two"),
    )
    for artifact_path, artifact_body in existing_artifacts:
        artifact_path.write_text(artifact_body)

    batch_runner = ToolBatchRunner(context, dod_store)

    batch_runner._queue_blocked_late_reference_drift_nudge(
        "[Blocked - late reference drift: several planned artifacts already exist.]",
        dod=dod,
    )

    assert steering_messages
    nudge = steering_messages[0]
    assert "03-first-website.html" in nudge
    assert "older reference materials" in nudge
5259
5260
def test_tool_batch_runner_blocked_completed_artifact_scope_nudge_prefers_verification(
    temp_dir: Path,
) -> None:
    """Check the completed-artifact-scope nudge steers toward the pending verify todo.

    Every path listed in the implementation plan exists on disk and the only
    todo is a pending verification item. The queued steering message must state
    that all planned artifacts exist, echo the todo text, and tell the agent not
    to reopen earlier reference materials.
    """

    async def _reject_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        # Stub: fails the test loudly if confidence scoring is ever invoked.
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def _reject_verification(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        # Stub: fails the test loudly if action verification is ever invoked.
        raise AssertionError("Verification should not run in this scenario")

    # Create the full planned artifact set on disk.
    guide_root = temp_dir / "guide"
    chapters = guide_root / "chapters"
    guide_root.mkdir(parents=True)
    chapters.mkdir()
    index_path = guide_root / "index.html"
    chapter_one = chapters / "01-getting-started.html"
    chapter_two = chapters / "02-installation.html"
    for artifact_path, artifact_body in (
        (index_path, "index"),
        (chapter_one, "one"),
        (chapter_two, "two"),
    ):
        artifact_path.write_text(artifact_body)

    # The plan lists both directories and all three files as absolute paths.
    planned_paths = (guide_root, chapters, index_path, chapter_one, chapter_two)
    plan_lines = [
        "# Implementation Plan",
        "",
        "## File Changes",
        *(f"- `{planned}`" for planned in planned_paths),
        "",
    ]
    implementation_plan = temp_dir / "implementation.md"
    implementation_plan.write_text("\n".join(plan_lines))

    context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=_reject_confidence,
        verify_action=_reject_verification,
    )
    # Capture steering messages instead of delivering them anywhere.
    steering_messages: list[str] = []
    context.queue_steering_message_callback = steering_messages.append
    batch_runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))

    dod = create_definition_of_done("Create a multi-file guide from a reference")
    dod.implementation_plan = str(implementation_plan)
    dod.verification_commands = [f"ls -la {guide_root}"]
    pending_verify_todo = {
        "content": "Verify all guide files are linked and complete",
        "active_form": "Working on: Verify all guide files are linked and complete",
        "status": "pending",
    }
    sync_todos_to_definition_of_done(
        dod,
        [pending_verify_todo],
        project_root=temp_dir,
    )

    batch_runner._queue_blocked_completed_artifact_scope_nudge(
        "[Blocked - completed artifact set scope: all explicitly planned artifacts already exist.]",
        dod=dod,
    )

    assert steering_messages
    nudge = steering_messages[0]
    assert "All explicitly planned artifacts already exist." in nudge
    assert "Verify all guide files are linked and complete" in nudge
    assert "Do not reopen earlier reference materials." in nudge
5341
5342
def test_tool_batch_runner_blocked_html_declared_target_nudge_uses_closest_declared_target(
    temp_dir: Path,
) -> None:
    """Check the HTML declared-target nudge cites the closest declared target.

    A blocked `write` to a chapter page carries a message that lists the
    already-declared targets and a closest match. The queued steering message
    must name the file being written, quote `chapters/02-installation.html` in
    backticks, and contain the phrase "same file now".
    """

    async def _reject_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        # Stub: fails the test loudly if confidence scoring is ever invoked.
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def _reject_verification(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        # Stub: fails the test loudly if action verification is ever invoked.
        raise AssertionError("Verification should not run in this scenario")

    context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=_reject_confidence,
        verify_action=_reject_verification,
    )
    # Capture steering messages instead of delivering them anywhere.
    steering_messages: list[str] = []
    context.queue_steering_message_callback = steering_messages.append
    batch_runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))

    written_page = temp_dir / "guide" / "chapters" / "01-introduction.html"
    blocked_write = ToolCall(
        id="write-ch1",
        name="write",
        arguments={"file_path": str(written_page)},
    )
    blocked_message = (
        "[Blocked - HTML page introduces new local targets outside the current declared artifact set] "
        "Suggestion: Keep non-root HTML pages within the root-declared local-link set and avoid "
        "introducing new sibling targets that the guide root does not declare, for example fix: 02-setup.html. "
        "Already-declared local targets include: chapters/01-introduction.html, chapters/02-installation.html, "
        "chapters/03-configuration.html. Closest declared local targets include: chapters/02-installation.html"
    )

    batch_runner._queue_blocked_html_declared_target_nudge(blocked_write, blocked_message)

    assert steering_messages
    nudge = steering_messages[0]
    assert str(written_page) in nudge
    assert "`chapters/02-installation.html`" in nudge
    assert "same file now" in nudge
5391
5392
@pytest.mark.asyncio
async def test_tool_batch_runner_blocked_empty_file_path_nudges_concrete_next_artifact(
    temp_dir: Path,
) -> None:
    """Check a blocked empty-`file_path` write yields a nudge naming the next artifact.

    The plan lists an index page and two chapters; only the index and the first
    chapter exist. A scripted executor returns a BLOCKED outcome for a `write`
    call whose `file_path` is empty. After `execute_batch`, the steering message
    must point at `02-installation.html`, and the last attempt on the recovery
    context must carry the blocked message as its error.
    """

    async def _reject_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        # Stub: fails the test loudly if confidence scoring is ever invoked.
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def _reject_verification(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        # Stub: fails the test loudly if action verification is ever invoked.
        raise AssertionError("Verification should not run in this scenario")

    guide_root = temp_dir / "guides" / "nginx"
    chapters = guide_root / "chapters"
    chapters.mkdir(parents=True)
    index_path = guide_root / "index.html"
    chapter_one = chapters / "01-introduction.html"
    chapter_two = chapters / "02-installation.html"
    # chapter_two is deliberately left uncreated: it is the "next artifact".
    index_path.write_text("<html></html>\n")
    chapter_one.write_text("<h1>Intro</h1>\n")

    plan_lines = [
        "# Implementation Plan",
        "",
        "## File Changes",
        *(f"- `{planned}`" for planned in (index_path, chapter_one, chapter_two)),
        "",
    ]
    implementation_plan = temp_dir / "implementation.md"
    implementation_plan.write_text("\n".join(plan_lines))

    context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=_reject_confidence,
        verify_action=_reject_verification,
        auto_recover=False,
    )
    # Capture steering messages instead of delivering them anywhere.
    steering_messages: list[str] = []
    context.queue_steering_message_callback = steering_messages.append
    batch_runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))

    tool_call = ToolCall(
        id="write-2",
        name="write",
        arguments={"file_path": "", "content": "<html></html>\n"},
    )
    blocked_message = "[Blocked - Empty file path] Suggestion: Provide a valid file path"
    blocked_result = Message.tool_result_message(
        tool_call_id=tool_call.id,
        display_content=blocked_message,
        result_content=blocked_message,
        is_error=True,
    )
    blocked_outcome = ToolExecutionOutcome(
        tool_call=tool_call,
        state=ToolExecutionState.BLOCKED,
        message=blocked_result,
        event_content=blocked_message,
        is_error=True,
        result_output=blocked_message,
    )
    executor = FakeExecutor([blocked_outcome])

    dod = create_definition_of_done("Create a multi-file nginx guide.")
    dod.implementation_plan = str(implementation_plan)
    dod.touched_files.extend([str(touched) for touched in (index_path, chapter_one)])
    dod.pending_items.append("Creating Chapter 2: Installation and Setup")

    await batch_runner.execute_batch(
        tool_calls=[tool_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=TurnSummary(final_response=""),
        dod=dod,
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert steering_messages
    nudge = steering_messages[0]
    assert "did not provide a valid `file_path`" in nudge
    assert "Resume by creating `02-installation.html` now." in nudge
    assert (
        f"Prefer one `write` call for `{chapter_two}` instead of more rereads."
        in nudge
    )
    assert context.recovery_context is not None
    assert context.recovery_context.attempts[-1].error == blocked_message
5497 assert context.recovery_context.attempts[-1].error == blocked_message