Python · 177638 bytes Raw Blame History
1 """Tests for tool-batch execution on RuntimeContext."""
2
3 from __future__ import annotations
4
5 from pathlib import Path
6 from types import SimpleNamespace
7
8 import pytest
9
10 from loader.llm.base import Message, Role, ToolCall
11 from loader.runtime.context import RuntimeContext
12 from loader.runtime.dod import (
13 DefinitionOfDoneStore,
14 VerificationEvidence,
15 create_definition_of_done,
16 )
17 from loader.runtime.events import AgentEvent, TurnSummary
18 from loader.runtime.executor import ToolExecutionOutcome, ToolExecutionState
19 from loader.runtime.permissions import (
20 PermissionMode,
21 build_permission_policy,
22 load_permission_rules,
23 )
24 from loader.runtime.reasoning_types import (
25 ActionVerification,
26 ConfidenceAssessment,
27 ConfidenceLevel,
28 )
29 from loader.runtime.recovery import RecoveryContext
30 from loader.runtime.tool_batches import (
31 ToolBatchRunner,
32 )
33 from loader.runtime.tool_batches import (
34 _should_prioritize_missing_artifact as tool_batches_should_prioritize_missing_artifact,
35 )
36 from loader.runtime.workflow import sync_todos_to_definition_of_done
37 from loader.tools.base import ToolResult as RegistryToolResult
38 from loader.tools.base import create_default_registry
39 from tests.helpers.runtime_harness import ScriptedBackend
40
41
class FakeSession:
    """In-memory session double that records messages and workflow timeline entries."""

    def __init__(self, messages: list[Message]) -> None:
        # Shallow-copy the input so later appends never touch the caller's list.
        self.messages = [*messages]
        self.workflow_timeline = []

    def append(self, message: Message) -> None:
        """Add ``message`` to the end of the recorded conversation."""
        self.messages.append(message)

    def append_workflow_timeline_entry(self, entry) -> None:
        """Record a workflow timeline ``entry`` in arrival order."""
        self.workflow_timeline.append(entry)
52
53
class FakeCodeFilter:
    """Code-filter double whose ``reset`` does nothing."""

    def reset(self) -> None:
        """No-op: this fake keeps no state to clear."""
57
58
class FakeSafeguards:
    """Safeguards double: pass-through filters, no steering, configurable loop detection."""

    def __init__(self, *, detect_loop_result: tuple[bool, str] = (False, "")) -> None:
        # ``detect_loop`` replays exactly this tuple on every call.
        self._detect_loop_result = detect_loop_result
        self.code_filter = FakeCodeFilter()
        # Opaque placeholders for the tracker and validator attributes.
        self.action_tracker = object()
        self.validator = object()

    def filter_stream_chunk(self, content: str) -> str:
        """Return the streamed chunk unchanged."""
        return content

    def filter_complete_content(self, content: str) -> str:
        """Return the completed content unchanged."""
        return content

    def should_steer(self) -> bool:
        """Never request steering."""
        return False

    def get_steering_message(self) -> str | None:
        """There is never a steering message to deliver."""
        return None

    def record_response(self, content: str) -> None:
        """Ignore the recorded response."""

    def detect_text_loop(self, content: str) -> tuple[bool, str]:
        """Always report that no text loop was detected."""
        return (False, "")

    def detect_loop(self) -> tuple[bool, str]:
        """Return the loop-detection result supplied at construction time."""
        return self._detect_loop_result
86
87
class FakeExecutor:
    """Executor double that replays queued outcomes and logs every tool call it receives."""

    def __init__(self, outcomes: list[ToolExecutionOutcome]) -> None:
        self.calls: list[ToolCall] = []
        # Private copy: outcomes are consumed front-to-back as calls arrive.
        self._outcomes = [*outcomes]

    async def execute_tool_call(self, tool_call: ToolCall, **_: object) -> ToolExecutionOutcome:
        """Log ``tool_call`` and hand back the next queued outcome.

        Extra keyword arguments are accepted and ignored. Raises
        ``AssertionError`` when no outcome is left in the queue.
        """
        # Log first so that even an unexpected call is visible in ``calls``.
        self.calls.append(tool_call)
        if self._outcomes:
            return self._outcomes.pop(0)
        raise AssertionError("No fake tool outcome queued")
98
99
def build_context(
    *,
    temp_dir: Path,
    messages: list[Message],
    safeguards: FakeSafeguards,
    assess_confidence,
    verify_action,
    recovery_context: RecoveryContext | None = None,
    confidence_scoring: bool = False,
    verification: bool = False,
    auto_recover: bool = True,
    min_confidence_for_action: int = 3,
) -> RuntimeContext:
    """Assemble a ``RuntimeContext`` rooted at ``temp_dir`` for tool-batch tests.

    The context uses the default tool registry, a WORKSPACE_WRITE permission
    policy built from the rules loaded out of ``temp_dir``, a
    ``ScriptedBackend`` and a ``FakeSession`` seeded with ``messages``.
    ``assess_confidence`` and ``verify_action`` are exposed as the reasoning
    services; the remaining keyword flags are copied into the config
    namespaces.
    """
    registry = create_default_registry(temp_dir)
    registry.configure_workspace_root(temp_dir)

    rule_status = load_permission_rules(temp_dir)
    policy = build_permission_policy(
        active_mode=PermissionMode.WORKSPACE_WRITE,
        workspace_root=temp_dir,
        tool_requirements=registry.get_tool_requirements(),
        rules=rule_status.rules,
    )

    # Plain namespaces stand in for the real config objects.
    reasoning_config = SimpleNamespace(
        rollback=False,
        show_rollback_plan=False,
        completion_check=True,
        max_continuation_prompts=5,
        self_critique=False,
        confidence_scoring=confidence_scoring,
        min_confidence_for_action=min_confidence_for_action,
        verification=verification,
    )
    runtime_config = SimpleNamespace(
        force_react=False,
        max_recovery_attempts=2,
        auto_recover=auto_recover,
        reasoning=reasoning_config,
    )
    reasoning_services = SimpleNamespace(
        assess_confidence=assess_confidence,
        verify_action=verify_action,
    )

    return RuntimeContext(
        project_root=temp_dir,
        backend=ScriptedBackend(),
        registry=registry,
        session=FakeSession(messages),  # type: ignore[arg-type]
        config=runtime_config,
        capability_profile=SimpleNamespace(supports_native_tools=True),  # type: ignore[arg-type]
        project_context=None,
        permission_policy=policy,
        permission_config_status=rule_status,
        workflow_mode="execute",
        safeguards=safeguards,
        reasoning=reasoning_services,
        recovery_context=recovery_context,
    )
155
156
def tool_outcome(
    *,
    tool_call: ToolCall,
    output: str,
    is_error: bool,
    state: ToolExecutionState = ToolExecutionState.EXECUTED,
    metadata: dict[str, object] | None = None,
) -> ToolExecutionOutcome:
    """Build a ``ToolExecutionOutcome`` in which every text field carries ``output``.

    ``metadata`` is attached to the registry result; a missing or empty value
    becomes a fresh empty dict.
    """
    result_message = Message.tool_result_message(
        tool_call_id=tool_call.id,
        display_content=output,
        result_content=output,
        is_error=is_error,
    )
    registry_result = RegistryToolResult(
        output=output,
        is_error=is_error,
        metadata=metadata if metadata else {},
    )
    return ToolExecutionOutcome(
        tool_call=tool_call,
        state=state,
        message=result_message,
        event_content=output,
        is_error=is_error,
        result_output=output,
        registry_result=registry_result,
    )
183
184
@pytest.mark.asyncio
async def test_tool_batch_runner_uses_context_for_confidence_gate(temp_dir: Path) -> None:
    """Expect a LOW confidence assessment to skip the tool call and add a warning.

    The fake scorer stores the conversation context it is given so the test
    can confirm that the session history reached it.
    """
    seen: dict[str, str] = {}

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        seen["context"] = context
        return ConfidenceAssessment(
            action=f"{tool_name} with {tool_args}",
            tool_name=tool_name,
            tool_args=tool_args,
            level=ConfidenceLevel.LOW,
            reasoning="Need to inspect the target first.",
            risks=["Unknown target file"],
        )

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run for skipped actions")

    history = [
        Message(role=Role.USER, content="Please inspect the project."),
        Message(role=Role.ASSISTANT, content="I will read the file next."),
    ]
    runtime = build_context(
        temp_dir=temp_dir,
        messages=history,
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        confidence_scoring=True,
        min_confidence_for_action=3,
    )
    batch_runner = ToolBatchRunner(runtime, DefinitionOfDoneStore(temp_dir))
    read_call = ToolCall(id="read-1", name="read", arguments={"file_path": "README.md"})
    emitted: list[AgentEvent] = []

    async def emit(event: AgentEvent) -> None:
        emitted.append(event)

    # The queued outcome must never be consumed: the gate blocks execution.
    fake_executor = FakeExecutor(
        [tool_outcome(tool_call=read_call, output="unused", is_error=False)]
    )
    result = await batch_runner.execute_batch(
        tool_calls=[read_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=emit,
        summary=TurnSummary(final_response=""),
        dod=create_definition_of_done("Read the docs"),
        executor=fake_executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert result.actions_taken == []
    assert fake_executor.calls == []
    assert "Please inspect the project." in seen["context"]
    last_message = runtime.session.messages[-1]
    assert last_message.role == Role.USER
    assert "[LOW CONFIDENCE WARNING]" in last_message.content
    assert any(event.type == "confidence" for event in emitted)
244
245
@pytest.mark.asyncio
async def test_tool_batch_runner_tracks_recovery_with_legacy_context(temp_dir: Path) -> None:
    """Expect a failing tool call with auto-recover on to start recovery tracking.

    After the batch the context must hold a recovery context, the failing tool
    result must be the last session message, and a ``recovery`` event must
    have been emitted.
    """

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run for failed actions")

    runtime = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=True,
    )
    batch_runner = ToolBatchRunner(runtime, DefinitionOfDoneStore(temp_dir))
    failing_call = ToolCall(id="bash-1", name="bash", arguments={"command": "pytest"})
    fake_executor = FakeExecutor(
        [tool_outcome(tool_call=failing_call, output="command failed", is_error=True)]
    )
    turn_summary = TurnSummary(final_response="")
    emitted: list[AgentEvent] = []

    async def emit(event: AgentEvent) -> None:
        emitted.append(event)

    await batch_runner.execute_batch(
        tool_calls=[failing_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=emit,
        summary=turn_summary,
        dod=create_definition_of_done("Run tests"),
        executor=fake_executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert runtime.recovery_context is not None
    assert turn_summary.tool_result_messages
    assert runtime.session.messages[-1] == turn_summary.tool_result_messages[-1]
    assert any(event.type == "recovery" for event in emitted)
289
290
@pytest.mark.asyncio
async def test_tool_batch_runner_emits_tool_metadata(temp_dir: Path) -> None:
    """Expect registry-result metadata to be surfaced on the ``tool_result`` event."""

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run for this scenario")

    runtime = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )
    batch_runner = ToolBatchRunner(runtime, DefinitionOfDoneStore(temp_dir))
    server_call = ToolCall(
        id="bash-1",
        name="bash",
        arguments={"command": "python -m http.server 8000", "background": True},
    )
    metadata = {"job_id": "bash-1", "status": "running", "background": True}
    started_outcome = tool_outcome(
        tool_call=server_call,
        output="Started bash job bash-1",
        is_error=False,
        metadata=metadata,
    )
    fake_executor = FakeExecutor([started_outcome])
    emitted: list[AgentEvent] = []

    async def emit(event: AgentEvent) -> None:
        emitted.append(event)

    await batch_runner.execute_batch(
        tool_calls=[server_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=emit,
        summary=TurnSummary(final_response=""),
        dod=create_definition_of_done("Launch a preview server"),
        executor=fake_executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    # Only the first tool_result event is inspected.
    tool_result = next(event for event in emitted if event.type == "tool_result")
    assert tool_result.tool_metadata == metadata
349
350
@pytest.mark.asyncio
async def test_tool_batch_runner_verifies_with_context_services(temp_dir: Path) -> None:
    """Expect verification to go through the context's ``verify_action`` service.

    A pre-existing recovery context is supplied. The test asserts it stays the
    same object and records the successful ``read`` step, even though the fake
    verifier reports a discrepancy.
    """
    verified_results: list[str] = []

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        # Remember every result handed over so the test can assert on it.
        verified_results.append(result)
        return ActionVerification(
            tool_name=tool_name,
            tool_args=tool_args,
            expected_outcome="Success",
            actual_result=result,
            verified=False,
            discrepancies=["File contents did not match"],
            needs_correction=True,
            correction_suggestion="Read the file before editing again.",
        )

    existing_recovery = RecoveryContext(
        original_tool="edit",
        original_args={"file_path": "README.md"},
    )
    runtime = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        recovery_context=existing_recovery,
        verification=True,
    )
    batch_runner = ToolBatchRunner(runtime, DefinitionOfDoneStore(temp_dir))
    read_call = ToolCall(id="read-1", name="read", arguments={"file_path": "README.md"})
    fake_executor = FakeExecutor(
        [tool_outcome(tool_call=read_call, output="file contents", is_error=False)]
    )
    emitted: list[AgentEvent] = []

    async def emit(event: AgentEvent) -> None:
        emitted.append(event)

    await batch_runner.execute_batch(
        tool_calls=[read_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=emit,
        summary=TurnSummary(final_response=""),
        dod=create_definition_of_done("Read the docs"),
        executor=fake_executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert verified_results == ["file contents"]
    assert runtime.recovery_context is existing_recovery
    assert existing_recovery.successful_steps == [("read", {"file_path": "README.md"})]
    last_message = runtime.session.messages[-1]
    assert last_message.role == Role.TOOL
    assert last_message.content == "file contents"
    assert any(event.type == "verification" for event in emitted)
414
415
@pytest.mark.asyncio
async def test_tool_batch_runner_preserves_recovery_context_across_diagnostic_success(
    temp_dir: Path,
) -> None:
    """Expect a successful ``bash`` listing to keep the existing recovery context.

    The recovery context already holds one failed ``read`` attempt. After the
    batch it must still be the same object and must have recorded the
    ``bash`` call as a successful step.
    """

    async def assess_confidence(tool_name: str, tool_args: dict, context: str) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def verify_action(tool_name: str, tool_args: dict, result: str, expected: str = "") -> ActionVerification:
        raise AssertionError("Verification should not run for this scenario")

    missing_chapter_args = {"file_path": "chapters/04-data-types.html"}
    # Separate dict literals keep the original args and the attempt args unshared.
    existing_recovery = RecoveryContext(
        original_tool="read",
        original_args={"file_path": "chapters/04-data-types.html"},
    )
    existing_recovery.add_attempt("read", missing_chapter_args, "File not found")

    runtime = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        recovery_context=existing_recovery,
        auto_recover=False,
    )
    batch_runner = ToolBatchRunner(runtime, DefinitionOfDoneStore(temp_dir))
    listing_call = ToolCall(id="bash-1", name="bash", arguments={"command": "ls chapters"})
    listing_outcome = tool_outcome(
        tool_call=listing_call,
        output="01-introduction.html",
        is_error=False,
    )
    fake_executor = FakeExecutor([listing_outcome])

    turn_summary = TurnSummary(final_response="")
    await batch_runner.execute_batch(
        tool_calls=[listing_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=turn_summary,
        dod=create_definition_of_done("Fix the chapter links"),
        executor=fake_executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert runtime.recovery_context is existing_recovery
    assert existing_recovery.successful_steps == [("bash", {"command": "ls chapters"})]
482
483
@pytest.mark.asyncio
async def test_tool_batch_runner_clears_recovery_context_after_successful_mutation(
    temp_dir: Path,
) -> None:
    """Expect a successful ``patch`` call to clear the pending recovery context."""

    async def assess_confidence(tool_name: str, tool_args: dict, context: str) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def verify_action(tool_name: str, tool_args: dict, result: str, expected: str = "") -> ActionVerification:
        raise AssertionError("Verification should not run for this scenario")

    # Recovery state left behind by an earlier failed read.
    existing_recovery = RecoveryContext(
        original_tool="read",
        original_args={"file_path": "chapters/04-data-types.html"},
    )
    existing_recovery.add_attempt(
        "read",
        {"file_path": "chapters/04-data-types.html"},
        "File not found",
    )

    runtime = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        recovery_context=existing_recovery,
        auto_recover=False,
    )
    batch_runner = ToolBatchRunner(runtime, DefinitionOfDoneStore(temp_dir))

    single_line_hunk = {
        "old_start": 1,
        "old_lines": 1,
        "new_start": 1,
        "new_lines": 1,
        "lines": ["-a", "+b"],
    }
    patch_call = ToolCall(
        id="patch-1",
        name="patch",
        arguments={"file_path": "index.html", "hunks": [single_line_hunk]},
    )
    patch_outcome = tool_outcome(
        tool_call=patch_call,
        output="Patched index.html",
        is_error=False,
    )
    fake_executor = FakeExecutor([patch_outcome])

    turn_summary = TurnSummary(final_response="")
    await batch_runner.execute_batch(
        tool_calls=[patch_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=turn_summary,
        dod=create_definition_of_done("Fix the chapter links"),
        executor=fake_executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert runtime.recovery_context is None
550
551
@pytest.mark.asyncio
async def test_tool_batch_runner_queues_duplicate_observation_nudge(
    temp_dir: Path,
) -> None:
    """Expect a DUPLICATE read outcome to queue exactly one persistent nudge.

    The implementation plan lists ``04-variables.html``, which is never
    created on disk. The nudge must tell the model to reuse the earlier
    observation and to write that missing file. No ephemeral message may be
    queued.
    """

    async def assess_confidence(tool_name: str, tool_args: dict, context: str) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def verify_action(tool_name: str, tool_args: dict, result: str, expected: str = "") -> ActionVerification:
        raise AssertionError("Verification should not run for this scenario")

    index_html = temp_dir / "index.html"
    chapters_dir = temp_dir / "chapters"

    # Prior conversation: a glob observation, a read of chapter 1, and a read of the index.
    glob_observation = Message(
        role=Role.TOOL,
        content=(
            "Observation [glob]: Result: "
            f"{temp_dir}/chapters/01-introduction.html\n"
            f"{temp_dir}/chapters/02-setup.html\n"
            f"{temp_dir}/chapters/03-basics.html"
        ),
        tool_results=[],
    )
    chapter_read_request = Message(
        role=Role.ASSISTANT,
        content="I already inspected the first chapter title.",
        tool_calls=[
            ToolCall(
                id="read-ch1",
                name="read",
                arguments={"file_path": str(chapters_dir / "01-introduction.html")},
            )
        ],
    )
    chapter_read_result = Message.tool_result_message(
        tool_call_id="read-ch1",
        display_content="<h1>Chapter 1: Introduction to Fortran</h1>\n",
        result_content="<h1>Chapter 1: Introduction to Fortran</h1>\n",
    )
    index_read_request = Message(
        role=Role.ASSISTANT,
        content="I should update the index now.",
        tool_calls=[
            ToolCall(
                id="read-index",
                name="read",
                arguments={"file_path": str(index_html)},
            )
        ],
    )
    runtime = build_context(
        temp_dir=temp_dir,
        messages=[
            glob_observation,
            chapter_read_request,
            chapter_read_result,
            index_read_request,
        ],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )

    # Workspace files: the index plus three of the four planned chapters.
    chapters_dir.mkdir()
    index_html.write_text("<ul></ul>\n")
    for file_name, markup in (
        ("01-introduction.html", "<h1>Intro</h1>\n"),
        ("02-setup.html", "<h1>Setup</h1>\n"),
        ("03-basics.html", "<h1>Basics</h1>\n"),
    ):
        (chapters_dir / file_name).write_text(markup)

    planned_paths = [
        index_html,
        *(
            chapters_dir / file_name
            for file_name in (
                "01-introduction.html",
                "02-setup.html",
                "03-basics.html",
                "04-variables.html",
            )
        ),
    ]
    plan_lines = [
        "# Implementation Plan",
        "",
        "## File Changes",
        *(f"- `{path}`" for path in planned_paths),
    ]
    implementation_plan = temp_dir / "implementation.md"
    implementation_plan.write_text("\n".join(plan_lines))

    runtime.session.current_task = (
        f"Update {temp_dir / 'index.html'} with the right chapter links."
    )
    persistent_messages: list[str] = []
    ephemeral_messages: list[str] = []
    runtime.queue_steering_message_callback = persistent_messages.append
    runtime.queue_ephemeral_steering_message_callback = ephemeral_messages.append

    batch_runner = ToolBatchRunner(runtime, DefinitionOfDoneStore(temp_dir))
    duplicate_call = ToolCall(
        id="read-dup",
        name="read",
        arguments={"file_path": str(index_html)},
    )
    duplicate_message = (
        "[Skipped - duplicate action: Already read "
        f"{temp_dir / 'index.html'} recently without any intervening changes; "
        "reuse the earlier read result instead of rereading]"
    )
    # Built by hand because the helper always attaches a registry result.
    duplicate_outcome = ToolExecutionOutcome(
        tool_call=duplicate_call,
        state=ToolExecutionState.DUPLICATE,
        message=Message.tool_result_message(
            tool_call_id=duplicate_call.id,
            display_content=duplicate_message,
            result_content=duplicate_message,
        ),
        event_content=duplicate_message,
        is_error=False,
        result_output=duplicate_message,
    )
    fake_executor = FakeExecutor([duplicate_outcome])

    turn_summary = TurnSummary(final_response="")
    dod = create_definition_of_done("Fix the chapter links")
    dod.implementation_plan = str(implementation_plan)
    dod.pending_items.append("Create the remaining chapter files")
    await batch_runner.execute_batch(
        tool_calls=[duplicate_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=turn_summary,
        dod=dod,
        executor=fake_executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert len(persistent_messages) == 1
    nudge = persistent_messages[0]
    assert "Reuse the earlier observation instead of repeating it." in nudge
    assert "A declared output artifact is still missing." in nudge
    assert "Resume by creating `04-variables.html` now." in nudge
    assert (
        f"Prefer one `write` call for `{temp_dir / 'chapters' / '04-variables.html'}` instead of more rereads."
        in nudge
    )
    assert ephemeral_messages == []
700
701
@pytest.mark.asyncio
async def test_tool_batch_runner_todo_write_does_not_regress_completed_file_todo(
    temp_dir: Path,
) -> None:
    """Expect a stale ``TodoWrite`` not to reopen a todo satisfied by an earlier write.

    The batch first reports a successful ``write`` of
    ``03-first-website.html`` and then a ``TodoWrite`` whose payload still
    lists that file as pending. The chapter 3 item must end up completed,
    while the chapter 4 item stays pending.
    """

    async def assess_confidence(tool_name: str, tool_args: dict, context: str) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should not run for this scenario")

    async def verify_action(tool_name: str, tool_args: dict, result: str, expected: str = "") -> ActionVerification:
        raise AssertionError("Verification should not run for this scenario")

    def pending_todos() -> list[dict[str, str]]:
        # Fresh objects on every call so the three consumers never share state.
        return [
            {
                "content": "Create 03-first-website.html",
                "active_form": "Creating 03-first-website.html",
                "status": "pending",
            },
            {
                "content": "Create 04-configuration-basics.html",
                "active_form": "Creating 04-configuration-basics.html",
                "status": "pending",
            },
        ]

    runtime = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )
    batch_runner = ToolBatchRunner(runtime, DefinitionOfDoneStore(temp_dir))
    dod = create_definition_of_done("Create a multi-file nginx guide.")
    sync_todos_to_definition_of_done(dod, pending_todos())

    chapter_path = temp_dir / "guides" / "nginx" / "chapters" / "03-first-website.html"
    chapter_path.parent.mkdir(parents=True)
    write_call = ToolCall(
        id="write-ch3",
        name="write",
        arguments={"file_path": str(chapter_path), "content": "<html></html>\n"},
    )
    stale_todo_call = ToolCall(
        id="todo-stale",
        name="TodoWrite",
        arguments={"todos": pending_todos()},
    )
    write_outcome = tool_outcome(
        tool_call=write_call,
        output=f"Successfully wrote {chapter_path}",
        is_error=False,
    )
    stale_todo_outcome = tool_outcome(
        tool_call=stale_todo_call,
        output="Todos updated",
        is_error=False,
        metadata={"new_todos": pending_todos()},
    )
    fake_executor = FakeExecutor([write_outcome, stale_todo_outcome])

    turn_summary = TurnSummary(final_response="")
    await batch_runner.execute_batch(
        tool_calls=[write_call, stale_todo_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=turn_summary,
        dod=dod,
        executor=fake_executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert "Create 03-first-website.html" in dod.completed_items
    assert "Create 03-first-website.html" not in dod.pending_items
    assert "Create 04-configuration-basics.html" in dod.pending_items
819
820
@pytest.mark.asyncio
async def test_tool_batch_runner_proactively_queues_verified_html_inventory(
    temp_dir: Path,
) -> None:
    """Check what a successful chapter ``glob`` leaves behind.

    NOTE: despite the test name, the assertions require that nothing is
    queued. Both steering queues must stay empty, and the single tool result
    message must not contain ``Verified chapter inventory:``.
    """

    async def assess_confidence(tool_name: str, tool_args: dict, context: str) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def verify_action(tool_name: str, tool_args: dict, result: str, expected: str = "") -> ActionVerification:
        raise AssertionError("Verification should not run for this scenario")

    chapters_dir = temp_dir / "chapters"
    chapters_dir.mkdir()
    intro_path = chapters_dir / "01-introduction.html"
    setup_path = chapters_dir / "02-setup.html"
    intro_path.write_text("<h1>Chapter 1: Introduction to Fortran</h1>\n")
    setup_path.write_text("<h1>Chapter 2: Setting Up Your Environment</h1>\n")
    (temp_dir / "index.html").write_text("<ul></ul>\n")

    runtime = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )
    runtime.session.current_task = (
        f"Update {temp_dir / 'index.html'} so the chapter links match the sibling files."
    )
    persistent_messages: list[str] = []
    ephemeral_messages: list[str] = []
    runtime.queue_steering_message_callback = persistent_messages.append
    runtime.queue_ephemeral_steering_message_callback = ephemeral_messages.append

    batch_runner = ToolBatchRunner(runtime, DefinitionOfDoneStore(temp_dir))
    glob_call = ToolCall(
        id="glob-1",
        name="glob",
        arguments={"path": str(chapters_dir), "pattern": "*.html"},
    )
    # The fake glob output lists both chapter files, one path per line.
    glob_outcome = tool_outcome(
        tool_call=glob_call,
        output="\n".join(str(path) for path in (intro_path, setup_path)),
        is_error=False,
    )
    fake_executor = FakeExecutor([glob_outcome])

    turn_summary = TurnSummary(final_response="")
    await batch_runner.execute_batch(
        tool_calls=[glob_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=turn_summary,
        dod=create_definition_of_done("Fix the chapter links"),
        executor=fake_executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert persistent_messages == []
    assert ephemeral_messages == []
    assert len(turn_summary.tool_result_messages) == 1
    assert "Verified chapter inventory:" not in turn_summary.tool_result_messages[0].content
905
906
@pytest.mark.asyncio
async def test_tool_batch_runner_marks_validated_html_toc_completion_after_successful_edit(
    temp_dir: Path,
) -> None:
    """Check the aftermath of a successful ``edit`` that fixes the index chapter list.

    ``index.html`` is written with the corrected list up front, because the
    fake executor never touches the disk. The assertions require that no tool
    result message contains ``Semantic verification preview:`` and that both
    steering queues stay empty.
    """

    async def assess_confidence(tool_name: str, tool_args: dict, context: str) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def verify_action(tool_name: str, tool_args: dict, result: str, expected: str = "") -> ActionVerification:
        raise AssertionError("Verification should not run for this scenario")

    task = (
        "Update index.html so every chapter link and title matches the real HTML files in chapters/."
    )

    chapters_dir = temp_dir / "chapters"
    chapters_dir.mkdir()
    (chapters_dir / "01-introduction.html").write_text(
        "<h1>Chapter 1: Introduction to Fortran</h1>\n"
    )
    (chapters_dir / "02-setup.html").write_text(
        "<h1>Chapter 2: Setting Up Your Environment</h1>\n"
    )

    # ``old_block`` only appears in the edit arguments; the file already holds ``new_block``.
    old_block = (
        '<ul class="chapter-list">\n'
        '  <li><a href="chapters/01-old.html">Chapter 1: Old</a></li>\n'
        '  <li><a href="chapters/02-old.html">Chapter 2: Old</a></li>\n'
        "</ul>\n"
    )
    new_block = (
        '<ul class="chapter-list">\n'
        '  <li><a href="chapters/01-introduction.html">Chapter 1: Introduction to Fortran</a></li>\n'
        '  <li><a href="chapters/02-setup.html">Chapter 2: Setting Up Your Environment</a></li>\n'
        "</ul>\n"
    )
    index_path = temp_dir / "index.html"
    index_path.write_text(new_block)

    runtime = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )
    runtime.session.current_task = task
    persistent_messages: list[str] = []
    ephemeral_messages: list[str] = []
    runtime.queue_steering_message_callback = persistent_messages.append
    runtime.queue_ephemeral_steering_message_callback = ephemeral_messages.append

    batch_runner = ToolBatchRunner(runtime, DefinitionOfDoneStore(temp_dir))
    edit_call = ToolCall(
        id="edit-1",
        name="edit",
        arguments={
            "file_path": str(index_path),
            "old_string": old_block,
            "new_string": new_block,
        },
    )
    edit_outcome = tool_outcome(
        tool_call=edit_call,
        output=f"Successfully edited {index_path}",
        is_error=False,
    )
    fake_executor = FakeExecutor([edit_outcome])

    turn_summary = TurnSummary(final_response="")
    await batch_runner.execute_batch(
        tool_calls=[edit_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=turn_summary,
        dod=create_definition_of_done(task),
        executor=fake_executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert all(
        "Semantic verification preview:" not in message.content
        for message in turn_summary.tool_result_messages
    )
    assert persistent_messages == []
    assert ephemeral_messages == []
1007
1008
@pytest.mark.asyncio
async def test_tool_batch_runner_does_not_apply_html_toc_handoff_to_reference_read(
    temp_dir: Path,
) -> None:
    """A read of a reference ``index.html`` must not produce a TOC handoff.

    The task prompt only asks to study an existing guide, so after a
    successful ``read`` the test expects no queued steering messages and no
    "Semantic verification preview:" text in the tool-result messages.
    """

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        # Tripwire: the test fails if a confidence score is requested.
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        # Tripwire: the test fails if action verification is requested.
        raise AssertionError("Verification should not run for this scenario")

    # Reference guide on disk: two chapter pages plus an index linking to both.
    chapter_dir = temp_dir / "chapters"
    chapter_dir.mkdir()
    chapter_pages = {
        "01-introduction.html": "<h1>Chapter 1: Introduction to Fortran</h1>\n",
        "02-setup.html": "<h1>Chapter 2: Setting Up Your Environment</h1>\n",
    }
    for page_name, page_html in chapter_pages.items():
        (chapter_dir / page_name).write_text(page_html)

    index_file = temp_dir / "index.html"
    index_file.write_text(
        "".join(
            [
                "<h2>Table of Contents</h2>\n",
                '<ul class="chapter-list">\n',
                ' <li><a href="chapters/01-introduction.html">Chapter 1: Introduction to Fortran</a></li>\n',
                ' <li><a href="chapters/02-setup.html">Chapter 2: Setting Up Your Environment</a></li>\n',
                "</ul>\n",
            ]
        )
    )

    task_prompt = " ".join(
        [
            "Have a look at ~/Loader/guides/fortran and chapters/ within.",
            "Get a feel for the structure and cadence of the guide.",
            "We are going to make an all new equally thorough guide on how to use the nginx tool.",
        ]
    )

    context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )
    context.session.current_task = task_prompt  # type: ignore[attr-defined]

    # Capture whatever the runner queues on either steering channel.
    queued_persistent: list[str] = []
    queued_ephemeral: list[str] = []
    context.queue_steering_message_callback = queued_persistent.append
    context.queue_ephemeral_steering_message_callback = queued_ephemeral.append

    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))
    read_call = ToolCall(
        id="read-index",
        name="read",
        arguments={"file_path": str(index_file)},
    )
    # The scripted executor replays the index contents as a successful read.
    scripted_read = tool_outcome(
        tool_call=read_call,
        output=index_file.read_text(),
        is_error=False,
    )
    executor = FakeExecutor([scripted_read])

    turn_summary = TurnSummary(final_response="")
    dod = create_definition_of_done(task_prompt)
    await runner.execute_batch(
        tool_calls=[read_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=turn_summary,
        dod=dod,
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert queued_persistent == []
    assert queued_ephemeral == []
    assert not any(
        "Semantic verification preview:" in tool_message.content
        for tool_message in turn_summary.tool_result_messages
    )
1101
1102
@pytest.mark.asyncio
async def test_tool_batch_runner_queues_next_pending_todo_after_discovery_progress(
    temp_dir: Path,
) -> None:
    """A reference read completes the discovery todo and queues the next one.

    After the read, the "Examine ..." todo is expected in
    ``dod.completed_items`` and the persistent steering queue should name the
    directory-structure todo and the ``chapters/`` creation step, without
    mentioning the reference file that was just read.
    """

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        # Tripwire: the test fails if a confidence score is requested.
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        # Tripwire: the test fails if action verification is requested.
        raise AssertionError("Verification should not run for this scenario")

    # Existing reference chapter that the scripted tool call will "read".
    reference_html = "<h1>Introduction</h1>\n<p>Guide cadence.</p>\n"
    reference_file = temp_dir / "fortran" / "chapters" / "01-introduction.html"
    reference_file.parent.mkdir(parents=True)
    reference_file.write_text(reference_html)

    # Implementation plan that lists the chapters directory before index.html.
    target_root = temp_dir / "Loader" / "guides" / "nginx"
    target_chapters = target_root / "chapters"
    target_index = target_root / "index.html"
    plan_file = temp_dir / "implementation.md"
    plan_file.write_text(
        "# Implementation Plan\n"
        "\n"
        "## File Changes\n"
        f"- `{target_chapters}/`\n"
        f"- `{target_index}`\n"
    )

    context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )
    # Capture whatever the runner queues on either steering channel.
    queued_persistent: list[str] = []
    queued_ephemeral: list[str] = []
    context.queue_steering_message_callback = queued_persistent.append
    context.queue_ephemeral_steering_message_callback = queued_ephemeral.append
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))

    discovery_todo = (
        "Examine the existing Fortran guide structure to understand the cadence and format"
    )
    todo_titles = (
        discovery_todo,
        "Create the nginx directory structure",
        "Create the nginx index.html file",
    )
    dod = create_definition_of_done("Create an equally thorough nginx guide.")
    dod.implementation_plan = str(plan_file)
    sync_todos_to_definition_of_done(
        dod,
        [
            {
                "content": title,
                "active_form": f"Working on: {title}",
                "status": "pending",
            }
            for title in todo_titles
        ],
    )

    read_call = ToolCall(
        id="read-reference",
        name="read",
        arguments={"file_path": str(reference_file)},
    )
    scripted_read = tool_outcome(
        tool_call=read_call,
        output=reference_html,
        is_error=False,
    )
    executor = FakeExecutor([scripted_read])

    await runner.execute_batch(
        tool_calls=[read_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=TurnSummary(final_response=""),
        dod=dod,
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert discovery_todo in dod.completed_items
    # Each expected fragment must appear in at least one persistent message.
    expected_fragments = (
        "Continue with the next pending item: `Create the nginx directory structure`",
        "Resume by creating `chapters/` now.",
    )
    for fragment in expected_fragments:
        assert any(fragment in note for note in queued_persistent)
    assert not any("01-introduction.html" in note for note in queued_persistent)
    assert queued_ephemeral == []
1221
1222
@pytest.mark.asyncio
async def test_tool_batch_runner_queues_setup_directory_before_file_when_plan_lists_index_first(
    temp_dir: Path,
) -> None:
    """Directory setup is queued first even when the plan lists index.html first.

    The implementation plan names ``index.html`` ahead of ``chapters/``; the
    persistent steering messages are still expected to point at the
    directory-structure todo and at creating ``chapters/``, and must not
    contain "Next step: create `index.html`.".
    """

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        # Tripwire: the test fails if a confidence score is requested.
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        # Tripwire: the test fails if action verification is requested.
        raise AssertionError("Verification should not run for this scenario")

    # Existing reference chapter that the scripted tool call will "read".
    reference_body = "<h1>Introduction</h1>\n<p>Guide cadence.</p>\n"
    reference_page = temp_dir / "fortran" / "chapters" / "01-introduction.html"
    reference_page.parent.mkdir(parents=True)
    reference_page.write_text(reference_body)

    # Plan entries deliberately put index.html ahead of the chapters directory.
    guide_dir = temp_dir / "Loader" / "guides" / "nginx"
    plan_entries = [
        f"- `{guide_dir / 'index.html'}`",
        f"- `{guide_dir / 'chapters'}/`",
    ]
    plan_path = temp_dir / "implementation.md"
    plan_path.write_text(
        "\n".join(["# Implementation Plan", "", "## File Changes", *plan_entries, ""])
    )

    context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )
    # Capture whatever the runner queues on either steering channel.
    sticky_notes: list[str] = []
    transient_notes: list[str] = []
    context.queue_steering_message_callback = sticky_notes.append
    context.queue_ephemeral_steering_message_callback = transient_notes.append
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))

    dod = create_definition_of_done("Create an equally thorough nginx guide.")
    dod.implementation_plan = str(plan_path)
    pending_todos = []
    for title in (
        "Examine the existing Fortran guide structure to understand the cadence and format",
        "Create the nginx directory structure",
        "Create the nginx index.html file",
    ):
        pending_todos.append(
            {
                "content": title,
                "active_form": f"Working on: {title}",
                "status": "pending",
            }
        )
    sync_todos_to_definition_of_done(dod, pending_todos, project_root=temp_dir)

    read_call = ToolCall(
        id="read-reference-index-first",
        name="read",
        arguments={"file_path": str(reference_page)},
    )
    executor = FakeExecutor(
        [tool_outcome(tool_call=read_call, output=reference_body, is_error=False)]
    )

    await runner.execute_batch(
        tool_calls=[read_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=TurnSummary(final_response=""),
        dod=dod,
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    def mentioned(fragment: str) -> bool:
        # True when any persistent steering message contains the fragment.
        return any(fragment in note for note in sticky_notes)

    assert sticky_notes
    assert mentioned(
        "Continue with the next pending item: `Create the nginx directory structure`"
    )
    assert mentioned("Resume by creating `chapters/` now.")
    assert not mentioned("Next step: create `index.html`.")
    assert transient_notes == []
1342
1343
@pytest.mark.asyncio
async def test_tool_batch_runner_duplicate_reference_read_prefers_next_pending_todo(
    temp_dir: Path,
) -> None:
    """A duplicate reference read steers toward the next pending todo.

    The executor reports the read as ``DUPLICATE``. Exactly one persistent
    message is expected: it tells the model to reuse the earlier observation
    and to continue with the directory-structure todo, and it contains no
    "Update `" instruction.
    """

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        # Tripwire: the test fails if a confidence score is requested.
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        # Tripwire: the test fails if action verification is requested.
        raise AssertionError("Verification should not run for this scenario")

    reference_heading = "<h1>Fortran Beginner's Guide</h1>\n"
    reference_index = temp_dir / "fortran" / "index.html"
    reference_index.parent.mkdir(parents=True)
    reference_index.write_text(reference_heading)

    task_prompt = " ".join(
        [
            "Have a look at ~/Loader/guides/fortran and chapters/ within.",
            "Get a feel for the structure and cadence of the guide.",
            "We are going to make an all new equally thorough guide on how to use the nginx tool.",
        ]
    )

    # Session history already holds the earlier observation of the same file.
    prior_observation = Message(
        role=Role.TOOL,
        content="Observation [read]: Result: " + reference_heading,
    )
    context = build_context(
        temp_dir=temp_dir,
        messages=[prior_observation],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )
    context.session.current_task = task_prompt

    # Capture whatever the runner queues on either steering channel.
    queued_persistent: list[str] = []
    queued_ephemeral: list[str] = []
    context.queue_steering_message_callback = queued_persistent.append
    context.queue_ephemeral_steering_message_callback = queued_ephemeral.append
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))

    # Discovery is already completed; the two creation todos are still pending.
    todo_states = (
        (
            "Examine the existing Fortran guide structure to understand the cadence and format",
            "completed",
        ),
        ("Create the nginx directory structure", "pending"),
        ("Create the nginx index.html file", "pending"),
    )
    dod = create_definition_of_done(task_prompt)
    sync_todos_to_definition_of_done(
        dod,
        [
            {
                "content": title,
                "active_form": f"Working on: {title}",
                "status": status,
            }
            for title, status in todo_states
        ],
    )

    read_call = ToolCall(
        id="read-dup",
        name="read",
        arguments={"file_path": str(reference_index)},
    )
    skip_notice = (
        f"[Skipped - duplicate action: Already read {reference_index} recently "
        "without any intervening changes; "
        "reuse the earlier read result instead of rereading]"
    )
    skip_result = Message.tool_result_message(
        tool_call_id=read_call.id,
        display_content=skip_notice,
        result_content=skip_notice,
    )
    duplicate_outcome = ToolExecutionOutcome(
        tool_call=read_call,
        state=ToolExecutionState.DUPLICATE,
        message=skip_result,
        event_content=skip_notice,
        is_error=False,
        result_output=skip_notice,
    )
    executor = FakeExecutor([duplicate_outcome])

    await runner.execute_batch(
        tool_calls=[read_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=TurnSummary(final_response=""),
        dod=dod,
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert len(queued_persistent) == 1
    handoff = queued_persistent[0]
    assert "Reuse the earlier observation instead of repeating it." in handoff
    assert (
        "Continue with the next pending item: `Create the nginx directory structure`"
        in handoff
    )
    assert "Update `" not in handoff
    assert queued_ephemeral == []
1466
1467
@pytest.mark.asyncio
async def test_tool_batch_runner_successful_reference_read_prioritizes_concrete_missing_artifact(
    temp_dir: Path,
) -> None:
    """A missing planned file outranks the next generic todo after a read.

    The plan lists ``index.html``, which does not exist on disk. After a
    successful reference read the persistent messages should confirm the
    discovery todo and say "Resume by creating `index.html` now." rather than
    continuing with the generic "Create each chapter file ..." todo.
    """

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        # Tripwire: the test fails if a confidence score is requested.
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        # Tripwire: the test fails if action verification is requested.
        raise AssertionError("Verification should not run for this scenario")

    guides_dir = temp_dir / "Loader" / "guides"

    # Target guide: the first chapter exists, index.html is intentionally absent.
    nginx_dir = guides_dir / "nginx"
    nginx_chapters = nginx_dir / "chapters"
    nginx_chapters.mkdir(parents=True)
    first_chapter = nginx_chapters / "01-introduction.html"
    first_chapter.write_text("<html></html>\n")
    missing_index = nginx_dir / "index.html"

    # Reference chapter that the scripted tool call will "read".
    reference_page = guides_dir / "fortran" / "chapters" / "01-introduction.html"
    reference_page.parent.mkdir(parents=True, exist_ok=True)
    reference_page.write_text("<h1>Introduction</h1>\n<p>Guide cadence.</p>\n")

    planned_entries = (
        f"{nginx_dir}/",
        f"{nginx_chapters}/",
        f"{missing_index}",
        f"{first_chapter}",
        f"{nginx_chapters / '02-installation.html'}",
    )
    plan_file = temp_dir / "implementation.md"
    plan_file.write_text(
        "# Implementation Plan\n\n## File Changes\n"
        + "".join(f"- `{entry}`\n" for entry in planned_entries)
    )

    context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )
    # Capture whatever the runner queues on either steering channel.
    queued_persistent: list[str] = []
    queued_ephemeral: list[str] = []
    context.queue_steering_message_callback = queued_persistent.append
    context.queue_ephemeral_steering_message_callback = queued_ephemeral.append
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))

    dod = create_definition_of_done("Create a multi-file nginx guide.")
    dod.implementation_plan = str(plan_file)
    dod.touched_files.append(str(first_chapter))
    todo_titles = (
        "Examine the existing Fortran guide structure to understand the format and cadence",
        "Create each chapter file with appropriate content",
        "Ensure all files follow the same structure and style as the Fortran guide",
    )
    sync_todos_to_definition_of_done(
        dod,
        [
            {
                "content": title,
                "active_form": f"Working on: {title}",
                "status": "pending",
            }
            for title in todo_titles
        ],
    )

    read_call = ToolCall(
        id="read-reference-chapter",
        name="read",
        arguments={"file_path": str(reference_page)},
    )
    observation = (
        "Observation [read]: Result: <h1>Introduction</h1>\n<p>Guide cadence.</p>\n"
    )
    observation_message = Message.tool_result_message(
        tool_call_id=read_call.id,
        display_content=observation,
        result_content=observation,
    )
    executed_outcome = ToolExecutionOutcome(
        tool_call=read_call,
        state=ToolExecutionState.EXECUTED,
        message=observation_message,
        event_content=observation,
        is_error=False,
        result_output=observation,
    )
    executor = FakeExecutor([executed_outcome])

    await runner.execute_batch(
        tool_calls=[read_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=TurnSummary(final_response=""),
        dod=dod,
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    def mentioned(fragment: str) -> bool:
        # True when any persistent steering message contains the fragment.
        return any(fragment in note for note in queued_persistent)

    assert queued_persistent
    assert mentioned(
        "Confirmed progress: `Examine the existing Fortran guide structure to understand the format and cadence`"
    )
    assert mentioned("Resume by creating `index.html` now.")
    assert not mentioned(
        "Continue with the next pending item: `Create each chapter file with appropriate content`"
    )
    assert queued_ephemeral == []
1602
1603
@pytest.mark.asyncio
async def test_tool_batch_runner_duplicate_read_ignores_unplanned_expansion_after_plan_complete(
    temp_dir: Path,
) -> None:
    """With every planned file on disk, unplanned creation todos are skipped.

    All artifacts named in the implementation plan exist. After a duplicate
    read the single persistent message is expected to mention the
    verification-style pending item and not the unplanned
    "Create 07-performance-tuning.html" item.
    """

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        # Tripwire: the test fails if a confidence score is requested.
        raise AssertionError("Confidence scoring should not run for this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        # Tripwire: the test fails if action verification is requested.
        raise AssertionError("Verification should not run for this scenario")

    # Materialize every artifact the plan mentions.
    nginx_dir = temp_dir / "guides" / "nginx"
    chapter_dir = nginx_dir / "chapters"
    nginx_dir.mkdir(parents=True)
    chapter_dir.mkdir()
    index_file = nginx_dir / "index.html"
    first_chapter = chapter_dir / "01-getting-started.html"
    second_chapter = chapter_dir / "02-installation.html"
    for artifact, body in (
        (index_file, "<html></html>\n"),
        (first_chapter, "<h1>One</h1>\n"),
        (second_chapter, "<h1>Two</h1>\n"),
    ):
        artifact.write_text(body)

    plan_lines = ["# Implementation Plan", "", "## File Changes"]
    plan_lines += [f"- `{nginx_dir}/`", f"- `{chapter_dir}/`"]
    plan_lines += [f"- `{path}`" for path in (index_file, first_chapter, second_chapter)]
    plan_lines.append("")
    plan_file = temp_dir / "implementation.md"
    plan_file.write_text("\n".join(plan_lines))

    context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )
    # Capture whatever the runner queues on either steering channel.
    queued_persistent: list[str] = []
    queued_ephemeral: list[str] = []
    context.queue_steering_message_callback = queued_persistent.append
    context.queue_ephemeral_steering_message_callback = queued_ephemeral.append
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))

    unplanned_item = "Create 07-performance-tuning.html"
    verification_item = "Verify all guide files are linked and complete"
    dod = create_definition_of_done("Create a multi-file nginx guide.")
    dod.implementation_plan = str(plan_file)
    dod.pending_items = [
        unplanned_item,
        verification_item,
        "Complete the requested work",
    ]

    read_call = ToolCall(
        id="read-dup",
        name="read",
        arguments={"file_path": str(first_chapter)},
    )
    skip_notice = (
        f"[Skipped - duplicate action: Already read {first_chapter} recently "
        "without any intervening changes; "
        "reuse the earlier read result instead of rereading]"
    )
    skip_result = Message.tool_result_message(
        tool_call_id=read_call.id,
        display_content=skip_notice,
        result_content=skip_notice,
    )
    duplicate_outcome = ToolExecutionOutcome(
        tool_call=read_call,
        state=ToolExecutionState.DUPLICATE,
        message=skip_result,
        event_content=skip_notice,
        is_error=False,
        result_output=skip_notice,
    )
    executor = FakeExecutor([duplicate_outcome])

    await runner.execute_batch(
        tool_calls=[read_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=TurnSummary(final_response=""),
        dod=dod,
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert len(queued_persistent) == 1
    handoff = queued_persistent[0]
    assert verification_item in handoff
    assert unplanned_item not in handoff
    assert queued_ephemeral == []
1718
1719
@pytest.mark.asyncio
async def test_tool_batch_runner_duplicate_read_after_plan_complete_pushes_verification_handoff(
    temp_dir: Path,
) -> None:
    """A duplicate read after the plan is built hands off to verification.

    Every planned artifact exists and the definition of done carries a
    verification command. The single persistent message is expected to state
    that all planned artifacts exist and to direct the model to verification
    or final confirmation, without naming the unplanned creation item.
    """

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        # Tripwire: the test fails if a confidence score is requested.
        raise AssertionError("Confidence scoring should not run for this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        # Tripwire: the test fails if action verification is requested.
        raise AssertionError("Verification should not run for this scenario")

    # Materialize every artifact the plan mentions.
    nginx_dir = temp_dir / "guides" / "nginx"
    chapter_dir = nginx_dir / "chapters"
    nginx_dir.mkdir(parents=True)
    chapter_dir.mkdir()
    planned_files = {
        nginx_dir / "index.html": "<html></html>\n",
        chapter_dir / "01-getting-started.html": "<h1>One</h1>\n",
        chapter_dir / "02-installation.html": "<h1>Two</h1>\n",
    }
    for planned_path, body in planned_files.items():
        planned_path.write_text(body)
    index_file, first_chapter, second_chapter = planned_files

    plan_file = temp_dir / "implementation.md"
    plan_file.write_text(
        "# Implementation Plan\n"
        "\n"
        "## File Changes\n"
        f"- `{nginx_dir}/`\n"
        f"- `{chapter_dir}/`\n"
        f"- `{index_file}`\n"
        f"- `{first_chapter}`\n"
        f"- `{second_chapter}`\n"
    )

    context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )
    # Capture whatever the runner queues on either steering channel.
    sticky_notes: list[str] = []
    transient_notes: list[str] = []
    context.queue_steering_message_callback = sticky_notes.append
    context.queue_ephemeral_steering_message_callback = transient_notes.append
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))

    dod = create_definition_of_done("Create a multi-file nginx guide.")
    dod.implementation_plan = str(plan_file)
    dod.verification_commands = [f"ls -la {nginx_dir}"]
    dod.pending_items = [
        "Create 07-performance-tuning.html",
        "Complete the requested work",
    ]

    read_call = ToolCall(
        id="read-dup",
        name="read",
        arguments={"file_path": str(first_chapter)},
    )
    skip_notice = "".join(
        [
            "[Skipped - duplicate action: Already read ",
            f"{first_chapter} recently without any intervening changes; ",
            "reuse the earlier read result instead of rereading]",
        ]
    )
    duplicate_outcome = ToolExecutionOutcome(
        tool_call=read_call,
        state=ToolExecutionState.DUPLICATE,
        message=Message.tool_result_message(
            tool_call_id=read_call.id,
            display_content=skip_notice,
            result_content=skip_notice,
        ),
        event_content=skip_notice,
        is_error=False,
        result_output=skip_notice,
    )
    executor = FakeExecutor([duplicate_outcome])

    await runner.execute_batch(
        tool_calls=[read_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=TurnSummary(final_response=""),
        dod=dod,
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert len(sticky_notes) == 1
    (handoff,) = sticky_notes
    for required in (
        "All explicitly planned artifacts already exist.",
        "Move to verification or final confirmation using the files already on disk.",
    ):
        assert required in handoff
    assert "Create 07-performance-tuning.html" not in handoff
    assert transient_notes == []
1838
1839
@pytest.mark.asyncio
async def test_tool_batch_runner_duplicate_read_after_plan_complete_ignores_stale_creation_todos(
    temp_dir: Path,
) -> None:
    """Stale "create" todos for files that already exist are not resurfaced.

    The pending items still name chapter files that are already on disk.
    After a duplicate read the single persistent message is expected to hand
    off to verification and to mention neither stale creation item.
    """

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        # Tripwire: the test fails if a confidence score is requested.
        raise AssertionError("Confidence scoring should not run for this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        # Tripwire: the test fails if action verification is requested.
        raise AssertionError("Verification should not run for this scenario")

    # Materialize every artifact the plan mentions.
    nginx_dir = temp_dir / "guides" / "nginx"
    chapter_dir = nginx_dir / "chapters"
    nginx_dir.mkdir(parents=True)
    chapter_dir.mkdir()
    index_file = nginx_dir / "index.html"
    index_file.write_text("<html></html>\n")
    first_chapter = chapter_dir / "01-getting-started.html"
    first_chapter.write_text("<h1>One</h1>\n")
    second_chapter = chapter_dir / "02-installation.html"
    second_chapter.write_text("<h1>Two</h1>\n")

    plan_entries = [
        f"{nginx_dir}/",
        f"{chapter_dir}/",
        str(index_file),
        str(first_chapter),
        str(second_chapter),
    ]
    plan_file = temp_dir / "implementation.md"
    plan_file.write_text(
        "\n".join(
            [
                "# Implementation Plan",
                "",
                "## File Changes",
                *(f"- `{entry}`" for entry in plan_entries),
                "",
            ]
        )
    )

    context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )
    # Capture whatever the runner queues on either steering channel.
    queued_persistent: list[str] = []
    queued_ephemeral: list[str] = []
    context.queue_steering_message_callback = queued_persistent.append
    context.queue_ephemeral_steering_message_callback = queued_ephemeral.append
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))

    # Both creation items refer to chapter files that were written above.
    stale_items = ("Create 01-getting-started.html", "Creating 02-installation.html")
    dod = create_definition_of_done("Create a multi-file nginx guide.")
    dod.implementation_plan = str(plan_file)
    dod.verification_commands = [f"ls -la {nginx_dir}"]
    dod.pending_items = [*stale_items, "Complete the requested work"]

    read_call = ToolCall(
        id="read-dup-built-stale",
        name="read",
        arguments={"file_path": str(first_chapter)},
    )
    skip_notice = (
        f"[Skipped - duplicate action: Already read {first_chapter} recently "
        "without any intervening changes; "
        "reuse the earlier read result instead of rereading]"
    )
    skip_result = Message.tool_result_message(
        tool_call_id=read_call.id,
        display_content=skip_notice,
        result_content=skip_notice,
    )
    executor = FakeExecutor(
        [
            ToolExecutionOutcome(
                tool_call=read_call,
                state=ToolExecutionState.DUPLICATE,
                message=skip_result,
                event_content=skip_notice,
                is_error=False,
                result_output=skip_notice,
            )
        ]
    )

    await runner.execute_batch(
        tool_calls=[read_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=TurnSummary(final_response=""),
        dod=dod,
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert len(queued_persistent) == 1
    handoff = queued_persistent[0]
    assert "All explicitly planned artifacts already exist." in handoff
    assert (
        "Move to verification or final confirmation using the files already on disk."
        in handoff
    )
    for stale_item in stale_items:
        assert stale_item not in handoff
    assert queued_ephemeral == []
1960
1961
@pytest.mark.asyncio
async def test_tool_batch_runner_observation_handoff_pushes_mutation_step(
    temp_dir: Path,
) -> None:
    """After a reference read, the handoff pushes the model to start editing.

    With a discovery todo followed by a file-creation todo (and no
    implementation plan), the persistent steering messages are expected to
    name the creation todo and to tell the model to stop gathering reference
    material and perform the change.
    """

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        # Tripwire: the test fails if a confidence score is requested.
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        # Tripwire: the test fails if action verification is requested.
        raise AssertionError("Verification should not run for this scenario")

    # Existing reference chapter that the scripted tool call will "read".
    reference_html = "<h1>Introduction</h1>\n<p>Guide cadence.</p>\n"
    reference_file = temp_dir / "fortran" / "chapters" / "01-introduction.html"
    reference_file.parent.mkdir(parents=True)
    reference_file.write_text(reference_html)

    context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )
    # Capture whatever the runner queues on either steering channel.
    queued_persistent: list[str] = []
    queued_ephemeral: list[str] = []
    context.queue_steering_message_callback = queued_persistent.append
    context.queue_ephemeral_steering_message_callback = queued_ephemeral.append
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))

    todo_titles = (
        "Examine the existing Fortran guide structure to understand the cadence and format",
        "Create the nginx index.html file",
    )
    dod = create_definition_of_done("Create a multi-file nginx guide.")
    sync_todos_to_definition_of_done(
        dod,
        [
            {
                "content": title,
                "active_form": f"Working on: {title}",
                "status": "pending",
            }
            for title in todo_titles
        ],
    )

    read_call = ToolCall(
        id="read-reference",
        name="read",
        arguments={"file_path": str(reference_file)},
    )
    scripted_read = tool_outcome(
        tool_call=read_call,
        output=reference_html,
        is_error=False,
    )
    executor = FakeExecutor([scripted_read])

    await runner.execute_batch(
        tool_calls=[read_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=TurnSummary(final_response=""),
        dod=dod,
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    # Each expected fragment must appear in at least one persistent message.
    for fragment in (
        "Continue with the next pending item: `Create the nginx index.html file`",
        "stop gathering more reference material and perform the change now",
    ):
        assert any(fragment in note for note in queued_persistent)
    assert queued_ephemeral == []
2054
2055
@pytest.mark.asyncio
async def test_tool_batch_runner_discovery_completion_handoff_stays_persistent(
    temp_dir: Path,
) -> None:
    """A successful reference read hands off to the next todo persistently.

    Two todos are pending: a discovery item ("First, examine ...") and
    "Create the nginx directory structure". After one successful ``read``
    of the reference chapter, the test expects a persistent steering
    message naming the second item and nothing on the ephemeral channel.
    """

    async def _no_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def _no_verification(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run for this scenario")

    # Reference chapter on disk; the scripted tool output echoes the same text.
    reference_html = "<h1>Introduction</h1>\n<p>Guide cadence.</p>\n"
    reference = temp_dir / "fortran" / "chapters" / "01-introduction.html"
    reference.parent.mkdir(parents=True)
    reference.write_text(reference_html)

    context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=_no_confidence,
        verify_action=_no_verification,
        auto_recover=False,
    )
    # Capture both steering channels so each can be asserted on separately.
    persisted: list[str] = []
    transient: list[str] = []
    context.queue_steering_message_callback = persisted.append
    context.queue_ephemeral_steering_message_callback = transient.append
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))

    todos = [
        {
            "content": "First, examine the existing fortran guide structure and content",
            "active_form": "Working on: First, examine the existing fortran guide structure and content",
            "status": "pending",
        },
        {
            "content": "Create the nginx directory structure",
            "active_form": "Working on: Create the nginx directory structure",
            "status": "pending",
        },
    ]
    dod = create_definition_of_done("Create a multi-file nginx guide.")
    sync_todos_to_definition_of_done(dod, todos)

    read_call = ToolCall(
        id="read-reference",
        name="read",
        arguments={"file_path": str(reference)},
    )
    executor = FakeExecutor(
        [tool_outcome(tool_call=read_call, output=reference_html, is_error=False)]
    )

    await runner.execute_batch(
        tool_calls=[read_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=TurnSummary(final_response=""),
        dod=dod,
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    handoff = "Continue with the next pending item: `Create the nginx directory structure`"
    assert persisted
    assert any(handoff in text for text in persisted)
    assert transient == []
2145
2146
@pytest.mark.asyncio
async def test_tool_batch_runner_missing_artifact_nudge_stays_quiet_after_setup_mkdir(
    temp_dir: Path,
) -> None:
    """A setup ``mkdir -p`` for the planned directory queues no steering at all.

    The implementation plan lists the chapters directory and ``index.html``;
    the only tool call is a successful ``bash`` ``mkdir -p`` of that
    directory. Both the persistent and the ephemeral channel must stay empty.
    """

    async def _no_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def _no_verification(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run for this scenario")

    nginx_root = temp_dir / "Loader" / "guides" / "nginx"
    chapters = nginx_root / "chapters"
    index_path = nginx_root / "index.html"

    # Neither planned path exists on disk yet; only the plan file is written.
    plan_lines = [
        "# Implementation Plan",
        "",
        "## File Changes",
        f"- `{chapters}/`",
        f"- `{index_path}`",
        "",
    ]
    implementation_plan = temp_dir / "implementation.md"
    implementation_plan.write_text("\n".join(plan_lines))

    context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=_no_confidence,
        verify_action=_no_verification,
        auto_recover=False,
    )
    persisted: list[str] = []
    transient: list[str] = []
    context.queue_steering_message_callback = persisted.append
    context.queue_ephemeral_steering_message_callback = transient.append
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))

    todos = [
        {
            "content": "Create the nginx directory structure",
            "active_form": "Creating the nginx directory structure",
            "status": "pending",
        },
        {
            "content": "Develop the main index.html file with proper structure",
            "active_form": "Developing the main index.html file with proper structure",
            "status": "pending",
        },
    ]
    dod = create_definition_of_done("Create a multi-file nginx guide.")
    dod.implementation_plan = str(implementation_plan)
    sync_todos_to_definition_of_done(dod, todos)

    mkdir_call = ToolCall(
        id="mkdir-nginx",
        name="bash",
        arguments={"command": f"mkdir -p {chapters}"},
    )
    executor = FakeExecutor(
        [tool_outcome(tool_call=mkdir_call, output="", is_error=False)]
    )

    await runner.execute_batch(
        tool_calls=[mkdir_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=TurnSummary(final_response=""),
        dod=dod,
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert persisted == []
    assert transient == []
2245
2246
@pytest.mark.asyncio
async def test_tool_batch_runner_first_file_handoff_stays_persistent(
    temp_dir: Path,
) -> None:
    """Writing ``index.html`` queues a persistent handoff to the first chapter.

    The plan lists the chapters directory, ``index.html`` and
    ``01-introduction.html``. After a successful ``write`` of the index,
    the last persistent message must confirm progress, name the chapter
    as the next step with its resolved path, and forbid rereads and
    bookkeeping. The ephemeral channel must stay empty.
    """

    async def _no_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def _no_verification(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run for this scenario")

    nginx_root = temp_dir / "guides" / "nginx"
    chapters = nginx_root / "chapters"
    chapters.mkdir(parents=True)
    index_path = nginx_root / "index.html"
    first_chapter = chapters / "01-introduction.html"

    plan_lines = [
        "# Implementation Plan",
        "",
        "## File Changes",
        f"- `{chapters}/`",
        f"- `{index_path}`",
        f"- `{first_chapter}`",
        "",
    ]
    implementation_plan = temp_dir / "implementation.md"
    implementation_plan.write_text("\n".join(plan_lines))

    context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=_no_confidence,
        verify_action=_no_verification,
        auto_recover=False,
    )
    persisted: list[str] = []
    transient: list[str] = []
    context.queue_steering_message_callback = persisted.append
    context.queue_ephemeral_steering_message_callback = transient.append
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))

    todos = [
        {
            "content": "Create the main index.html file with proper structure",
            "active_form": "Creating the main index.html file with proper structure",
            "status": "pending",
        },
        {
            "content": "Create each chapter file with appropriate content",
            "active_form": "Creating each chapter file with appropriate content",
            "status": "pending",
        },
    ]
    dod = create_definition_of_done("Create a multi-file nginx guide.")
    dod.implementation_plan = str(implementation_plan)
    sync_todos_to_definition_of_done(dod, todos)

    write_call = ToolCall(
        id="write-index",
        name="write",
        arguments={
            "file_path": str(index_path),
            "content": "<html></html>\n",
        },
    )
    executor = FakeExecutor(
        [
            tool_outcome(
                tool_call=write_call,
                output=f"Successfully wrote 14 bytes to {index_path}",
                is_error=False,
            )
        ]
    )

    await runner.execute_batch(
        tool_calls=[write_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=TurnSummary(final_response=""),
        dod=dod,
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert persisted
    latest = persisted[-1]
    expected_fragments = (
        "Confirmed progress:",
        "Next step: create `01-introduction.html`.",
        f"Prefer one `write(file_path=..., content=...)` call for `{first_chapter.resolve(strict=False)}` now.",
        "Do not reread reference material or spend the next turn on bookkeeping.",
    )
    for fragment in expected_fragments:
        assert fragment in latest
    assert transient == []
2360
2361
@pytest.mark.asyncio
async def test_tool_batch_runner_softens_first_file_handoff_after_recovery_prompt(
    temp_dir: Path,
) -> None:
    """After an empty-response recovery prompt the handoff goes ephemeral.

    Same plan and todos as the plain first-file handoff scenario, but the
    session history already holds a ``[EMPTY ASSISTANT RESPONSE]`` user
    message. After the successful ``write`` of ``index.html`` nothing may
    be queued persistently, and the last ephemeral message must tell the
    model to resume by creating ``01-introduction.html``.
    """

    async def _no_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def _no_verification(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run for this scenario")

    nginx_root = temp_dir / "guides" / "nginx"
    chapters = nginx_root / "chapters"
    chapters.mkdir(parents=True)
    index_path = nginx_root / "index.html"
    first_chapter = chapters / "01-introduction.html"

    plan_lines = [
        "# Implementation Plan",
        "",
        "## File Changes",
        f"- `{chapters}/`",
        f"- `{index_path}`",
        f"- `{first_chapter}`",
        "",
    ]
    implementation_plan = temp_dir / "implementation.md"
    implementation_plan.write_text("\n".join(plan_lines))

    # Pre-existing history: the recovery prompt sent after an empty reply.
    recovery_prompt = Message(
        role=Role.USER,
        content=(
            "[EMPTY ASSISTANT RESPONSE]\n"
            "Respond with that concrete mutation tool call now. Do not return an empty response."
        ),
    )
    context = build_context(
        temp_dir=temp_dir,
        messages=[recovery_prompt],
        safeguards=FakeSafeguards(),
        assess_confidence=_no_confidence,
        verify_action=_no_verification,
        auto_recover=False,
    )
    persisted: list[str] = []
    transient: list[str] = []
    context.queue_steering_message_callback = persisted.append
    context.queue_ephemeral_steering_message_callback = transient.append
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))

    todos = [
        {
            "content": "Create the main index.html file with proper structure",
            "active_form": "Creating the main index.html file with proper structure",
            "status": "pending",
        },
        {
            "content": "Create each chapter file with appropriate content",
            "active_form": "Creating each chapter file with appropriate content",
            "status": "pending",
        },
    ]
    dod = create_definition_of_done("Create a multi-file nginx guide.")
    dod.implementation_plan = str(implementation_plan)
    sync_todos_to_definition_of_done(dod, todos)

    write_call = ToolCall(
        id="write-index-recovered",
        name="write",
        arguments={
            "file_path": str(index_path),
            "content": "<html></html>\n",
        },
    )
    executor = FakeExecutor(
        [
            tool_outcome(
                tool_call=write_call,
                output=f"Successfully wrote 14 bytes to {index_path}",
                is_error=False,
            )
        ]
    )

    await runner.execute_batch(
        tool_calls=[write_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=TurnSummary(final_response=""),
        dod=dod,
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert persisted == []
    assert transient
    latest = transient[-1]
    assert "Resume by creating `01-introduction.html` now." in latest
2477
2478
@pytest.mark.asyncio
async def test_duplicate_observation_nudge_prioritizes_missing_artifact_over_review(
    temp_dir: Path,
) -> None:
    """A duplicate-read nudge points at the missing chapter, not the review todo.

    ``index.html`` and chapter one exist while the planned
    ``06-ssl-configuration.html`` does not. The first pending todo is a
    review item. The prioritisation helper must return a truthy value for
    that combination, and the nudge queued for a duplicate ``read`` must
    name the missing chapter, forbid switching into review mode, and not
    steer toward the review item.
    """

    async def _no_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def _no_verification(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run for this scenario")

    guide_root = temp_dir / "guides" / "nginx"
    chapters = guide_root / "chapters"
    chapters.mkdir(parents=True)
    index_path = guide_root / "index.html"
    chapter_one = chapters / "01-getting-started.html"
    missing_chapter = chapters / "06-ssl-configuration.html"
    chapter_one.write_text("<h1>One</h1>\n")
    index_path.write_text('<a href="chapters/01-getting-started.html">One</a>\n')

    plan_lines = [
        "# Implementation Plan",
        "",
        "## File Changes",
        f"- `{index_path}`",
        f"- `{chapter_one}`",
        f"- `{missing_chapter}`",
        "",
    ]
    implementation_plan = temp_dir / "implementation.md"
    implementation_plan.write_text("\n".join(plan_lines))

    context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=_no_confidence,
        verify_action=_no_verification,
        auto_recover=False,
    )
    persisted: list[str] = []
    transient: list[str] = []
    context.queue_steering_message_callback = persisted.append
    context.queue_ephemeral_steering_message_callback = transient.append
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))

    todos = [
        {
            "content": "Ensure all files are properly linked and formatted consistently",
            "active_form": "Working on: Ensure all files are properly linked and formatted consistently",
            "status": "pending",
        },
        {
            "content": "Create the final chapter (06-ssl-configuration.html)",
            "active_form": "Working on: Create the final chapter (06-ssl-configuration.html)",
            "status": "pending",
        },
    ]
    dod = create_definition_of_done("Create a multi-file nginx guide.")
    dod.implementation_plan = str(implementation_plan)
    sync_todos_to_definition_of_done(dod, todos)

    # Check the helper directly before exercising the nudge that relies on it.
    assert tool_batches_should_prioritize_missing_artifact(
        dod=dod,
        next_pending=dod.pending_items[0],
        missing_artifact=(missing_chapter, False),
        project_root=temp_dir,
    )

    duplicate_read = ToolCall(
        id="dup-read",
        name="read",
        arguments={"file_path": str(index_path)},
    )
    runner._queue_duplicate_observation_nudge(duplicate_read, dod=dod)  # type: ignore[attr-defined]

    assert persisted
    latest = persisted[-1]
    review_handoff = (
        "Continue with the next pending item: `Ensure all files are properly linked and formatted consistently`"
    )
    assert "06-ssl-configuration.html" in latest
    assert "Do not switch into review or consistency-check mode" in latest
    assert review_handoff not in latest
2573
2574
@pytest.mark.asyncio
async def test_tool_batch_runner_hands_off_to_verification_once_planned_artifacts_exist(
    temp_dir: Path,
) -> None:
    """Once every planned file exists, the handoff turns toward verification.

    The index and both planned chapters are already on disk, the creation
    todo is completed and only the consistency todo is pending. After a
    successful ``write`` of the second chapter, the persistent messages
    must say all planned artifacts exist, mention the pending consistency
    item, and point toward verification.
    """

    async def _no_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def _no_verification(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run for this scenario")

    guide_root = temp_dir / "guides" / "nginx"
    chapters = guide_root / "chapters"
    chapters.mkdir(parents=True)
    index_path = guide_root / "index.html"
    chapter_one = chapters / "01-getting-started.html"
    chapter_two = chapters / "02-installation.html"
    index_path.write_text('<a href="chapters/01-getting-started.html">One</a>\n')
    chapter_one.write_text("<h1>One</h1>\n")
    chapter_two.write_text("<h1>Two</h1>\n")

    plan_lines = [
        "# Implementation Plan",
        "",
        "## File Changes",
        f"- `{chapters}/`",
        f"- `{index_path}`",
        f"- `{chapter_one}`",
        f"- `{chapter_two}`",
        "",
    ]
    implementation_plan = temp_dir / "implementation.md"
    implementation_plan.write_text("\n".join(plan_lines))

    context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=_no_confidence,
        verify_action=_no_verification,
        auto_recover=False,
    )
    persisted: list[str] = []
    transient: list[str] = []
    context.queue_steering_message_callback = persisted.append
    context.queue_ephemeral_steering_message_callback = transient.append
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))

    todos = [
        {
            "content": "Create the guide files",
            "active_form": "Working on: Create the guide files",
            "status": "completed",
        },
        {
            "content": "Ensure all files are properly linked and formatted consistently",
            "active_form": "Working on: Ensure all files are properly linked and formatted consistently",
            "status": "pending",
        },
    ]
    dod = create_definition_of_done("Create a multi-file nginx guide.")
    dod.implementation_plan = str(implementation_plan)
    sync_todos_to_definition_of_done(dod, todos)

    write_call = ToolCall(
        id="write-final",
        name="write",
        arguments={
            "file_path": str(chapter_two),
            "content": "<h1>Two</h1>\n",
        },
    )
    executor = FakeExecutor(
        [
            tool_outcome(
                tool_call=write_call,
                output=f"Successfully wrote {chapter_two}",
                is_error=False,
            )
        ]
    )

    await runner.execute_batch(
        tool_calls=[write_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=TurnSummary(final_response=""),
        dod=dod,
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    # Each fragment must show up in at least one persistent message; they
    # need not all be in the same message.
    expected_fragments = (
        "All explicitly planned artifacts now exist.",
        "Ensure all files are properly linked and formatted consistently",
        "Move to verification once no specific mismatch remains.",
    )
    for fragment in expected_fragments:
        assert any(fragment in text for text in persisted)
2695
2696
@pytest.mark.asyncio
async def test_tool_batch_runner_mutation_handoff_points_at_next_missing_artifact(
    temp_dir: Path,
) -> None:
    """After writing the index, the handoff names the first missing chapter.

    ``index.html`` exists; both planned chapters do not. After a
    successful ``write`` of the index, the last persistent message must
    name ``01-getting-started.html`` as the next step with its resolved
    path, must not ask for a ``TodoWrite`` refresh, and must forbid
    rereads and bookkeeping.
    """

    async def _no_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should not run in this scenario")

    async def _no_verification(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run in this scenario")

    guide_root = temp_dir / "guides" / "nginx"
    chapters = guide_root / "chapters"
    guide_root.mkdir(parents=True)
    chapters.mkdir()
    index_path = guide_root / "index.html"
    index_path.write_text("<html></html>\n")
    # Planned but intentionally not created.
    chapter_one = chapters / "01-getting-started.html"
    chapter_two = chapters / "02-installation.html"

    plan_lines = [
        "# Implementation Plan",
        "",
        "## File Changes",
        f"- `{guide_root}/`",
        f"- `{index_path}`",
        f"- `{chapter_one}`",
        f"- `{chapter_two}`",
        "",
    ]
    implementation_plan = temp_dir / "implementation.md"
    implementation_plan.write_text("\n".join(plan_lines))

    context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=_no_confidence,
        verify_action=_no_verification,
        auto_recover=False,
    )
    persisted: list[str] = []
    transient: list[str] = []
    context.queue_steering_message_callback = persisted.append
    context.queue_ephemeral_steering_message_callback = transient.append
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))

    todos = [
        {
            "content": "Create the main index.html file with proper structure",
            "active_form": "Working on: Create the main index.html file with proper structure",
            "status": "pending",
        },
        {
            "content": "Create each chapter file in sequence, following the established pattern",
            "active_form": "Working on: Create each chapter file in sequence, following the established pattern",
            "status": "pending",
        },
        {
            "content": "Ensure all files are properly linked and formatted consistently",
            "active_form": "Working on: Ensure all files are properly linked and formatted consistently",
            "status": "pending",
        },
    ]
    dod = create_definition_of_done("Create a multi-file nginx guide.")
    dod.implementation_plan = str(implementation_plan)
    sync_todos_to_definition_of_done(dod, todos)

    write_call = ToolCall(
        id="write-index",
        name="write",
        arguments={"file_path": str(index_path), "content": "<html></html>\n"},
    )
    executor = FakeExecutor(
        [
            tool_outcome(
                tool_call=write_call,
                output=f"Successfully wrote {index_path}",
                is_error=False,
            )
        ]
    )

    await runner.execute_batch(
        tool_calls=[write_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=TurnSummary(final_response=""),
        dod=dod,
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert persisted
    latest = persisted[-1]
    write_hint = (
        f"Prefer one `write(file_path=..., content=...)` call for `{chapter_one.resolve(strict=False)}` now."
    )
    assert "Next step: create `01-getting-started.html`." in latest
    assert write_hint in latest
    assert "refresh `TodoWrite`" not in latest
    assert "Do not reread reference material or spend the next turn on bookkeeping." in latest
2808
2809
@pytest.mark.asyncio
async def test_tool_batch_runner_large_plan_does_not_claim_completion_early(
    temp_dir: Path,
) -> None:
    """A partly finished seven-chapter plan must not be reported as complete.

    The plan lists the guide root, the chapters directory, ``index.html``
    and seven chapters. The index and chapters one to five exist on disk;
    chapters six and seven do not. After a successful ``write`` of chapter
    five, an ephemeral message must tell the model to resume with
    ``06-performance-tuning.html``, and no steering message on either
    channel may claim that all planned artifacts exist.
    """

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should not run in this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run in this scenario")

    guide_root = temp_dir / "guides" / "nginx"
    chapters = guide_root / "chapters"
    guide_root.mkdir(parents=True)
    chapters.mkdir()
    index_path = guide_root / "index.html"
    index_path.write_text("<html></html>\n")

    chapter_paths = [
        chapters / "01-getting-started.html",
        chapters / "02-installation.html",
        chapters / "03-first-website.html",
        chapters / "04-configuration-basics.html",
        chapters / "05-advanced-configurations.html",
        chapters / "06-performance-tuning.html",
        chapters / "07-security-best-practices.html",
    ]
    # Chapters 1-5 exist before the batch runs; 6 and 7 stay missing.
    for chapter in chapter_paths[:4]:
        chapter.write_text(f"<h1>{chapter.stem}</h1>\n")
    chapter_paths[4].write_text("<h1>Advanced configurations</h1>\n")

    implementation_plan = temp_dir / "implementation.md"
    implementation_plan.write_text(
        "\n".join(
            [
                "# Implementation Plan",
                "",
                "## File Changes",
                f"- `{guide_root}/`",
                f"- `{chapters}/`",
                f"- `{index_path}`",
                *[f"- `{path}`" for path in chapter_paths],
                "",
            ]
        )
    )

    context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )
    persistent_messages: list[str] = []
    ephemeral_messages: list[str] = []
    context.queue_steering_message_callback = persistent_messages.append
    context.queue_ephemeral_steering_message_callback = ephemeral_messages.append
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))
    dod = create_definition_of_done("Create a thorough nginx guide.")
    dod.implementation_plan = str(implementation_plan)
    sync_todos_to_definition_of_done(
        dod,
        [
            {
                "content": "Create the nginx guide artifacts",
                "active_form": "Creating nginx guide artifacts",
                "status": "pending",
            },
            {
                "content": "Verify all guide files are linked and complete",
                "active_form": "Verifying guide linkage and completeness",
                "status": "pending",
            },
        ],
    )
    tool_call = ToolCall(
        id="write-chapter-05",
        name="write",
        arguments={
            "file_path": str(chapter_paths[4]),
            "content": "<h1>Advanced configurations</h1>\n",
        },
    )
    executor = FakeExecutor(
        [
            tool_outcome(
                tool_call=tool_call,
                output=f"Successfully wrote {chapter_paths[4]}",
                is_error=False,
            )
        ]
    )

    summary = TurnSummary(final_response="")
    await runner.execute_batch(
        tool_calls=[tool_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=summary,
        dod=dod,
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert any(
        "Resume by creating `06-performance-tuning.html` now." in message
        for message in ephemeral_messages
    )
    # Check BOTH channels for the completion claim. The sibling
    # "hands off to verification" test receives this text on the persistent
    # channel, so looking only at ephemeral messages would let a premature
    # claim slip through unnoticed.
    completion_claim = "All explicitly planned artifacts now exist."
    assert not any(
        completion_claim in message
        for message in (*persistent_messages, *ephemeral_messages)
    )
2936
2937
@pytest.mark.asyncio
async def test_tool_batch_runner_uses_compact_missing_artifact_nudge_after_substantial_progress(
    temp_dir: Path,
) -> None:
    """With most of the plan done, the missing-artifact nudge is short and ephemeral.

    The index and chapters one to four exist and are recorded as touched
    files; two setup items are already completed; chapter five is the only
    planned file missing. After a successful rewrite of chapter four, the
    last ephemeral message must tell the model to resume with
    ``05-advanced-features.html``, forbid TodoWrite, verification and
    rereads until it exists, and not ask for a ``TodoWrite`` refresh.
    """

    async def _no_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should not run in this scenario")

    async def _no_verification(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run in this scenario")

    guide_root = temp_dir / "guides" / "nginx"
    chapters = guide_root / "chapters"
    guide_root.mkdir(parents=True)
    chapters.mkdir()
    index_path = guide_root / "index.html"
    chapter_paths = [
        chapters / "01-introduction.html",
        chapters / "02-installation.html",
        chapters / "03-configuration.html",
        chapters / "04-basic-usage.html",
        chapters / "05-advanced-features.html",
    ]
    # Everything except the fifth chapter is already on disk.
    already_written = (index_path, *chapter_paths[:4])
    for existing in already_written:
        existing.write_text("<html></html>\n")

    plan_lines = [
        "# Implementation Plan",
        "",
        "## File Changes",
        f"- `{guide_root}/`",
        f"- `{chapters}/`",
        f"- `{index_path}`",
    ]
    plan_lines.extend(f"- `{path}`" for path in chapter_paths)
    plan_lines.append("")
    implementation_plan = temp_dir / "implementation.md"
    implementation_plan.write_text("\n".join(plan_lines))

    context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=_no_confidence,
        verify_action=_no_verification,
        auto_recover=False,
    )
    persisted: list[str] = []
    transient: list[str] = []
    context.queue_steering_message_callback = persisted.append
    context.queue_ephemeral_steering_message_callback = transient.append
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))

    dod = create_definition_of_done("Create a thorough nginx guide.")
    dod.implementation_plan = str(implementation_plan)
    # Seed prior progress before the todo list is synced.
    dod.touched_files.extend(map(str, already_written))
    dod.completed_items.extend(
        [
            "Create the nginx directory structure",
            "Create the main index.html file with proper structure",
        ]
    )
    todos = [
        {
            "content": "Create each chapter file with appropriate content",
            "active_form": "Creating each chapter file with appropriate content",
            "status": "pending",
        }
    ]
    sync_todos_to_definition_of_done(dod, todos)

    fourth_chapter = chapter_paths[3]
    write_call = ToolCall(
        id="write-chapter-04",
        name="write",
        arguments={
            "file_path": str(fourth_chapter),
            "content": "<html>updated</html>\n",
        },
    )
    executor = FakeExecutor(
        [
            tool_outcome(
                tool_call=write_call,
                output=f"Successfully wrote {fourth_chapter}",
                is_error=False,
            )
        ]
    )

    await runner.execute_batch(
        tool_calls=[write_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=TurnSummary(final_response=""),
        dod=dod,
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert transient
    latest = transient[-1]
    assert "Resume by creating `05-advanced-features.html` now." in latest
    assert "No TodoWrite, no verification, no rereads until that artifact exists." in latest
    assert "refresh `TodoWrite`" not in latest
3058
3059
@pytest.mark.asyncio
async def test_tool_batch_runner_todowrite_with_missing_artifact_requeues_exact_resume_step(
    temp_dir: Path,
) -> None:
    """TodoWrite-only batch while a planned file is missing queues a persistent resume nudge.

    The plan declares two chapter files but only the first exists on disk.
    After a batch containing just ``TodoWrite``, the persistent steering
    queue must name ``02-installation.html`` as the next file to create and
    the ephemeral steering queue must stay empty.
    """

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        # Reaching this hook at all is a test failure.
        raise AssertionError("Confidence scoring should not run in this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        # Reaching this hook at all is a test failure.
        raise AssertionError("Verification should not run in this scenario")

    def todo_snapshot() -> list[dict[str, str]]:
        # Build a fresh list on every call so the DoD sync, the tool-call
        # arguments, and the outcome metadata never share objects.
        return [
            {
                "content": "Create 01-getting-started.html",
                "active_form": "Creating 01-getting-started.html",
                "status": "completed",
            },
            {
                "content": "Create 02-installation.html",
                "active_form": "Creating 02-installation.html",
                "status": "pending",
            },
        ]

    # On-disk layout: index and chapter one exist, chapter two does not.
    nginx_dir = temp_dir / "guides" / "nginx"
    chapters_dir = nginx_dir / "chapters"
    index_file = nginx_dir / "index.html"
    first_chapter = chapters_dir / "01-getting-started.html"
    missing_chapter = chapters_dir / "02-installation.html"
    nginx_dir.mkdir(parents=True)
    chapters_dir.mkdir()
    index_file.write_text("<html></html>\n")
    first_chapter.write_text("<h1>One</h1>\n")

    # The implementation plan declares both directories and all three files.
    plan_file = temp_dir / "implementation.md"
    declared_outputs = [
        f"{nginx_dir}/",
        f"{chapters_dir}/",
        index_file,
        first_chapter,
        missing_chapter,
    ]
    plan_file.write_text(
        "\n".join(
            ["# Implementation Plan", "", "## File Changes"]
            + [f"- `{target}`" for target in declared_outputs]
            + [""]
        )
    )

    context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )
    persistent_messages: list[str] = []
    ephemeral_messages: list[str] = []
    context.queue_steering_message_callback = persistent_messages.append
    context.queue_ephemeral_steering_message_callback = ephemeral_messages.append
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))

    dod = create_definition_of_done("Create a multi-file nginx guide.")
    dod.implementation_plan = str(plan_file)
    sync_todos_to_definition_of_done(dod, todo_snapshot())
    dod.touched_files.extend([str(index_file), str(first_chapter)])

    tool_call = ToolCall(
        id="todo-only",
        name="TodoWrite",
        arguments={"todos": todo_snapshot()},
    )
    scripted_outcome = tool_outcome(
        tool_call=tool_call,
        output="Todos updated",
        is_error=False,
        metadata={"new_todos": todo_snapshot()},
    )
    executor = FakeExecutor([scripted_outcome])

    await runner.execute_batch(
        tool_calls=[tool_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=TurnSummary(final_response=""),
        dod=dod,
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert persistent_messages
    message = persistent_messages[-1]
    for expected_fragment in (
        "Todo tracking is updated. A declared output artifact is still missing.",
        "Resume by creating `02-installation.html` now.",
        "refresh `TodoWrite`",
        "Do not spend the next turn on TodoWrite alone",
    ):
        assert expected_fragment in message
    assert not ephemeral_messages
3202
3203
@pytest.mark.asyncio
async def test_tool_batch_runner_todowrite_after_artifacts_exist_pushes_verification_handoff(
    temp_dir: Path,
) -> None:
    """TodoWrite-only batch after every planned file exists steers toward verification.

    All files listed in the plan are present on disk. The queued steering
    message must mention the verification todo, tell the agent to move to
    verification, and must not resurface the stale discovery todo about the
    Fortran guide.
    """

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        # Reaching this hook at all is a test failure.
        raise AssertionError("Confidence scoring should not run in this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        # Reaching this hook at all is a test failure.
        raise AssertionError("Verification should not run in this scenario")

    def todo_snapshot() -> list[dict[str, str]]:
        # Fresh list per call; the same two pending todos are used for the
        # DoD sync, the tool-call arguments, and the outcome metadata.
        return [
            {
                "content": "First, examine the existing Fortran guide structure to understand the format and content organization",
                "active_form": "Working on: First, examine the existing Fortran guide structure to understand the format and content organization",
                "status": "pending",
            },
            {
                "content": "Verify all guide files are linked and complete",
                "active_form": "Working on: Verify all guide files are linked and complete",
                "status": "pending",
            },
        ]

    # On-disk layout: every declared file already exists.
    nginx_dir = temp_dir / "guides" / "nginx"
    chapters_dir = nginx_dir / "chapters"
    nginx_dir.mkdir(parents=True)
    chapters_dir.mkdir()
    index_file = nginx_dir / "index.html"
    first_chapter = chapters_dir / "01-getting-started.html"
    second_chapter = chapters_dir / "02-installation.html"
    for artifact, body in (
        (index_file, "<html></html>\n"),
        (first_chapter, "<h1>One</h1>\n"),
        (second_chapter, "<h1>Two</h1>\n"),
    ):
        artifact.write_text(body)

    plan_file = temp_dir / "implementation.md"
    declared_outputs = [
        f"{nginx_dir}/",
        f"{chapters_dir}/",
        index_file,
        first_chapter,
        second_chapter,
    ]
    plan_file.write_text(
        "\n".join(
            ["# Implementation Plan", "", "## File Changes"]
            + [f"- `{target}`" for target in declared_outputs]
            + [""]
        )
    )

    context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )
    queued_messages: list[str] = []
    context.queue_steering_message_callback = queued_messages.append
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))

    dod = create_definition_of_done("Create a multi-file nginx guide.")
    dod.implementation_plan = str(plan_file)
    dod.verification_commands = [f"ls -la {nginx_dir}"]
    sync_todos_to_definition_of_done(dod, todo_snapshot(), project_root=temp_dir)

    tool_call = ToolCall(
        id="todo-only",
        name="TodoWrite",
        arguments={"todos": todo_snapshot()},
    )
    scripted_outcome = tool_outcome(
        tool_call=tool_call,
        output="Todos updated",
        is_error=False,
        metadata={"new_todos": todo_snapshot()},
    )
    executor = FakeExecutor([scripted_outcome])

    await runner.execute_batch(
        tool_calls=[tool_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=TurnSummary(final_response=""),
        dod=dod,
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert queued_messages
    message = queued_messages[-1]
    for expected_fragment in (
        "Todo tracking is updated. All explicitly planned artifacts now exist.",
        "Verify all guide files are linked and complete",
        "Move to verification once no specific mismatch remains.",
        "reopen reference materials",
    ):
        assert expected_fragment in message
    # The stale discovery todo must not be echoed back to the agent.
    assert "Fortran guide structure" not in message
3346
3347
@pytest.mark.asyncio
async def test_tool_batch_runner_todowrite_with_existing_output_roots_requeues_next_mutation(
    temp_dir: Path,
) -> None:
    """TodoWrite-only batch with output roots present names the next file to write.

    The plan declares only the guide root, the chapters directory, and the
    index. The index links to ``chapters/01-introduction.html``, which does
    not exist. The queued steering message must name that file as the resume
    target for the pending "Write the introduction chapter" todo.
    """

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        # Reaching this hook at all is a test failure.
        raise AssertionError("Confidence scoring should not run in this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        # Reaching this hook at all is a test failure.
        raise AssertionError("Verification should not run in this scenario")

    def todo_snapshot() -> list[dict[str, str]]:
        # Fresh list per call: two completed setup todos and one pending write.
        return [
            {
                "content": "Examine the existing Fortran guide structure",
                "active_form": "Examining the existing Fortran guide structure",
                "status": "completed",
            },
            {
                "content": "Create the nginx directory structure",
                "active_form": "Creating the nginx directory structure",
                "status": "completed",
            },
            {
                "content": "Write the introduction chapter",
                "active_form": "Writing the introduction chapter",
                "status": "pending",
            },
        ]

    nginx_dir = temp_dir / "guides" / "nginx"
    chapters_dir = nginx_dir / "chapters"
    nginx_dir.mkdir(parents=True)
    chapters_dir.mkdir()
    index_file = nginx_dir / "index.html"
    # The index links to a chapter file that has not been created yet.
    index_file.write_text(
        "<!DOCTYPE html>\n"
        "<html>\n"
        "<body>\n"
        '<a href="chapters/01-introduction.html">Introduction</a>\n'
        "</body>\n"
        "</html>\n"
    )

    plan_file = temp_dir / "implementation.md"
    declared_outputs = [f"{nginx_dir}/", f"{chapters_dir}/", index_file]
    plan_file.write_text(
        "\n".join(
            ["# Implementation Plan", "", "## File Changes"]
            + [f"- `{target}`" for target in declared_outputs]
            + [""]
        )
    )

    context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )
    queued_messages: list[str] = []
    context.queue_steering_message_callback = queued_messages.append
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))

    dod = create_definition_of_done("Create a multi-file nginx guide.")
    dod.implementation_plan = str(plan_file)
    dod.touched_files.append(str(index_file))
    sync_todos_to_definition_of_done(dod, todo_snapshot(), project_root=temp_dir)

    tool_call = ToolCall(
        id="todo-next-mutation",
        name="TodoWrite",
        arguments={"todos": todo_snapshot()},
    )
    scripted_outcome = tool_outcome(
        tool_call=tool_call,
        output="Todos updated",
        is_error=False,
        metadata={"new_todos": todo_snapshot()},
    )
    executor = FakeExecutor([scripted_outcome])

    await runner.execute_batch(
        tool_calls=[tool_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=TurnSummary(final_response=""),
        dod=dod,
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert queued_messages
    message = queued_messages[-1]
    for expected_fragment in (
        "Todo tracking is updated. A declared output artifact is still missing.",
        "Continue with the next pending item: `Write the introduction chapter`.",
        "Resume by creating `01-introduction.html` now.",
        "Prefer one `write` call for `",
        "01-introduction.html` instead of more rereads.",
        "Do not spend the next turn on TodoWrite alone",
    ):
        assert expected_fragment in message
3512
3513
@pytest.mark.asyncio
async def test_tool_batch_runner_todowrite_prefers_pending_index_over_empty_output_directory(
    temp_dir: Path,
) -> None:
    """TodoWrite-only batch targets the missing declared index, not a guessed chapter file.

    The chapters directory exists but is empty, and the declared
    ``index.html`` has not been written. The queued steering message must
    point at the index (the first pending todo) and must not mention
    ``01-introduction.html``.
    """

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        # Reaching this hook at all is a test failure.
        raise AssertionError("Confidence scoring should not run in this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        # Reaching this hook at all is a test failure.
        raise AssertionError("Verification should not run in this scenario")

    def todo_snapshot() -> list[dict[str, str]]:
        # Fresh list per call: two completed setup todos, then index and
        # first chapter still pending (index first).
        return [
            {
                "content": "Examine the existing Fortran guide structure to understand the format and depth",
                "active_form": "Examining the existing Fortran guide structure",
                "status": "completed",
            },
            {
                "content": "Create the new nginx guide directory structure",
                "active_form": "Creating the new nginx guide directory structure",
                "status": "completed",
            },
            {
                "content": "Create a new index.html for the nginx guide",
                "active_form": "Creating a new index.html for the nginx guide",
                "status": "pending",
            },
            {
                "content": "Create the first chapter for the nginx guide",
                "active_form": "Creating the first chapter for the nginx guide",
                "status": "pending",
            },
        ]

    # Only the directories exist; index.html is declared but never written.
    nginx_dir = temp_dir / "Loader" / "guides" / "nginx"
    chapters_dir = nginx_dir / "chapters"
    chapters_dir.mkdir(parents=True)
    index_file = nginx_dir / "index.html"

    plan_file = temp_dir / "implementation.md"
    plan_file.write_text(
        "\n".join(
            ["# Implementation Plan", "", "## File Changes"]
            + [f"- `{target}`" for target in (f"{chapters_dir}/", index_file)]
            + [""]
        )
    )

    dod = create_definition_of_done("Create a multi-file nginx guide.")
    dod.implementation_plan = str(plan_file)
    sync_todos_to_definition_of_done(dod, todo_snapshot(), project_root=temp_dir)

    queued_messages: list[str] = []
    context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )
    context.queue_steering_message_callback = queued_messages.append
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))

    # One list object is deliberately shared between the tool-call arguments
    # and the outcome metadata.
    todos = todo_snapshot()
    tool_call = ToolCall(
        id="todo-index-before-chapter",
        name="TodoWrite",
        arguments={"todos": todos},
    )
    scripted_outcome = tool_outcome(
        tool_call=tool_call,
        output="Todos updated",
        is_error=False,
        metadata={"new_todos": todos},
    )
    executor = FakeExecutor([scripted_outcome])

    await runner.execute_batch(
        tool_calls=[tool_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=TurnSummary(final_response=""),
        dod=dod,
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert queued_messages
    message = queued_messages[-1]
    resolved_index = index_file.resolve(strict=False)
    for expected_fragment in (
        "Continue with the next pending item: `Create a new index.html for the nginx guide`.",
        "Resume by creating `index.html` now.",
        f"Prefer one `write` call for `{resolved_index}`",
    ):
        assert expected_fragment in message
    assert "01-introduction.html" not in message
3651
3652
@pytest.mark.asyncio
async def test_tool_batch_runner_todowrite_with_declared_child_targets_names_next_missing_file(
    temp_dir: Path,
) -> None:
    """TodoWrite-only batch names the first missing file linked from the existing index.

    The index links to ``chapters/introduction.html`` and
    ``chapters/installation.html``; neither exists. With "Write the
    introduction chapter" pending, the queued steering message must name
    ``introduction.html`` as the resume target.
    """

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        # Reaching this hook at all is a test failure.
        raise AssertionError("Confidence scoring should not run in this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        # Reaching this hook at all is a test failure.
        raise AssertionError("Verification should not run in this scenario")

    nginx_dir = temp_dir / "guides" / "nginx"
    chapters_dir = nginx_dir / "chapters"
    nginx_dir.mkdir(parents=True)
    chapters_dir.mkdir()
    index_file = nginx_dir / "index.html"
    # The index references two chapter files that are not on disk.
    index_file.write_text(
        "<html>\n"
        '<a href="chapters/introduction.html">Introduction</a>\n'
        '<a href="chapters/installation.html">Installation</a>\n'
        "</html>\n"
    )

    plan_file = temp_dir / "implementation.md"
    declared_outputs = [f"{nginx_dir}/", f"{chapters_dir}/", index_file]
    plan_file.write_text(
        "\n".join(
            ["# Implementation Plan", "", "## File Changes"]
            + [f"- `{target}`" for target in declared_outputs]
            + [""]
        )
    )

    dod = create_definition_of_done("Create a multi-file nginx guide.")
    dod.implementation_plan = str(plan_file)
    dod.pending_items = [
        "Write the introduction chapter",
        "Complete the requested work",
    ]
    dod.touched_files.append(str(index_file))

    queued_messages: list[str] = []
    context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )
    context.queue_steering_message_callback = queued_messages.append
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))

    # The tool-call arguments use the camelCase ``activeForm`` key, while the
    # outcome metadata uses snake_case ``active_form``.
    requested_todo = {
        "content": "Write the introduction chapter",
        "activeForm": "Writing the introduction chapter",
        "status": "pending",
    }
    recorded_todo = {
        "content": "Write the introduction chapter",
        "active_form": "Writing the introduction chapter",
        "status": "pending",
    }
    tool_call = ToolCall(
        id="todo-1",
        name="TodoWrite",
        arguments={"todos": [requested_todo]},
    )
    scripted_outcome = tool_outcome(
        tool_call=tool_call,
        output="Todos updated",
        is_error=False,
        metadata={"new_todos": [recorded_todo]},
    )
    executor = FakeExecutor([scripted_outcome])

    await runner.execute_batch(
        tool_calls=[tool_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=TurnSummary(final_response=""),
        dod=dod,
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert queued_messages
    message = queued_messages[-1]
    for expected_fragment in (
        "Todo tracking is updated. A declared output artifact is still missing.",
        "Continue with the next pending item: `Write the introduction chapter`.",
        "Resume by creating `introduction.html` now.",
        "Prefer one `write` call for `",
        "introduction.html` instead of more rereads.",
        "Do not spend the next turn on TodoWrite alone",
    ):
        assert expected_fragment in message
3779
3780
@pytest.mark.asyncio
async def test_tool_batch_runner_todowrite_names_concrete_pending_file_after_artifacts_exist(
    temp_dir: Path,
) -> None:
    """TodoWrite-only batch maps a pending chapter todo to the concrete linked file.

    The index and chapter one exist; the index also links to
    ``chapters/02-installation.html``, which does not. With "Creating
    Chapter 2: Installation and Setup" pending, the queued steering message
    must name the resolved path of ``02-installation.html`` as the write
    target.
    """

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        # Reaching this hook at all is a test failure.
        raise AssertionError("Confidence scoring should not run in this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        # Reaching this hook at all is a test failure.
        raise AssertionError("Verification should not run in this scenario")

    nginx_dir = temp_dir / "guides" / "nginx"
    chapters_dir = nginx_dir / "chapters"
    nginx_dir.mkdir(parents=True)
    chapters_dir.mkdir()
    index_file = nginx_dir / "index.html"
    first_chapter = chapters_dir / "01-introduction.html"
    # The index links to chapter one (present) and chapter two (absent).
    index_file.write_text(
        "<html>\n"
        '<a href="chapters/01-introduction.html">Chapter 1: Introduction to NGINX Tool</a>\n'
        '<a href="chapters/02-installation.html">Chapter 2: Installation and Setup</a>\n'
        "</html>\n"
    )
    first_chapter.write_text("<html></html>\n")

    plan_file = temp_dir / "implementation.md"
    declared_outputs = [f"{nginx_dir}/", f"{chapters_dir}/", index_file]
    plan_file.write_text(
        "\n".join(
            ["# Implementation Plan", "", "## File Changes"]
            + [f"- `{target}`" for target in declared_outputs]
            + [""]
        )
    )

    dod = create_definition_of_done("Create a multi-file nginx guide.")
    dod.implementation_plan = str(plan_file)
    dod.pending_items = [
        "Creating Chapter 2: Installation and Setup",
        "Complete the requested work",
    ]
    dod.touched_files.extend([str(index_file), str(first_chapter)])

    queued_messages: list[str] = []
    context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )
    context.queue_steering_message_callback = queued_messages.append
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))

    # The tool-call arguments use the camelCase ``activeForm`` key, while the
    # outcome metadata uses snake_case ``active_form``.
    requested_todo = {
        "content": "Creating Chapter 2: Installation and Setup",
        "activeForm": "Creating Chapter 2: Installation and Setup",
        "status": "pending",
    }
    recorded_todo = {
        "content": "Creating Chapter 2: Installation and Setup",
        "active_form": "Creating Chapter 2: Installation and Setup",
        "status": "pending",
    }
    tool_call = ToolCall(
        id="todo-1",
        name="TodoWrite",
        arguments={"todos": [requested_todo]},
    )
    scripted_outcome = tool_outcome(
        tool_call=tool_call,
        output="Todos updated",
        is_error=False,
        metadata={"new_todos": [recorded_todo]},
    )
    executor = FakeExecutor([scripted_outcome])

    await runner.execute_batch(
        tool_calls=[tool_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=TurnSummary(final_response=""),
        dod=dod,
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert queued_messages
    message = queued_messages[-1]
    expected_target = (chapters_dir / "02-installation.html").resolve(strict=False)
    for expected_fragment in (
        "Todo tracking is updated. A declared output artifact is still missing.",
        "Continue with the next pending item: `Creating Chapter 2: Installation and Setup`.",
        "Resume by creating `02-installation.html` now.",
        f"Prefer one `write` call for `{expected_target}` instead of more rereads.",
        "Make your next response the concrete mutation tool call itself",
    ):
        assert expected_fragment in message
3912
3913
@pytest.mark.asyncio
async def test_tool_batch_runner_todowrite_uses_observed_sibling_pattern_for_next_file(
    temp_dir: Path,
) -> None:
    """TodoWrite-only batch borrows a filename from a previously read sibling ``chapters/`` dir.

    The nginx index has no links and the nginx chapters directory is empty,
    but the session history holds a ``read`` of
    ``fortran/chapters/01-introduction.html``. The queued steering message
    must propose ``01-introduction.html`` and say that the name mirrors the
    observed pattern.
    """

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        # Reaching this hook at all is a test failure.
        raise AssertionError("Confidence scoring should not run in this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        # Reaching this hook at all is a test failure.
        raise AssertionError("Verification should not run in this scenario")

    # Reference guide that the assistant has already inspected.
    reference_dir = temp_dir / "fortran" / "chapters"
    reference_dir.mkdir(parents=True)
    reference_intro = reference_dir / "01-introduction.html"
    reference_intro.write_text("<h1>Introduction</h1>\n")

    # Target guide: directories and a link-free index only.
    nginx_dir = temp_dir / "guides" / "nginx"
    chapters_dir = nginx_dir / "chapters"
    nginx_dir.mkdir(parents=True)
    chapters_dir.mkdir()
    index_file = nginx_dir / "index.html"
    index_file.write_text("<html></html>\n")

    plan_file = temp_dir / "implementation.md"
    declared_outputs = [f"{nginx_dir}/", f"{chapters_dir}/", index_file]
    plan_file.write_text(
        "\n".join(
            ["# Implementation Plan", "", "## File Changes"]
            + [f"- `{target}`" for target in declared_outputs]
            + [""]
        )
    )

    dod = create_definition_of_done("Create a multi-file nginx guide.")
    dod.implementation_plan = str(plan_file)
    dod.pending_items = [
        "Write the introduction chapter",
        "Complete the requested work",
    ]
    dod.touched_files.append(str(index_file))

    # Session history: a single assistant turn that read the reference chapter.
    prior_read = ToolCall(
        id="read-ref-1",
        name="read",
        arguments={"file_path": str(reference_intro)},
    )
    history = [Message(role=Role.ASSISTANT, content="", tool_calls=[prior_read])]

    queued_messages: list[str] = []
    context = build_context(
        temp_dir=temp_dir,
        messages=history,
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )
    context.queue_steering_message_callback = queued_messages.append
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))

    # The tool-call arguments use the camelCase ``activeForm`` key, while the
    # outcome metadata uses snake_case ``active_form``.
    requested_todo = {
        "content": "Write the introduction chapter",
        "activeForm": "Writing the introduction chapter",
        "status": "pending",
    }
    recorded_todo = {
        "content": "Write the introduction chapter",
        "active_form": "Writing the introduction chapter",
        "status": "pending",
    }
    tool_call = ToolCall(
        id="todo-observed-1",
        name="TodoWrite",
        arguments={"todos": [requested_todo]},
    )
    scripted_outcome = tool_outcome(
        tool_call=tool_call,
        output="Todos updated",
        is_error=False,
        metadata={"new_todos": [recorded_todo]},
    )
    executor = FakeExecutor([scripted_outcome])

    await runner.execute_batch(
        tool_calls=[tool_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=TurnSummary(final_response=""),
        dod=dod,
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert queued_messages
    message = queued_messages[-1]
    for expected_fragment in (
        "Todo tracking is updated. A declared output artifact is still missing.",
        "Continue with the next pending item: `Write the introduction chapter`.",
        "Resume by creating `01-introduction.html` now.",
        "It mirrors the observed filename pattern from another `chapters/` directory "
        "you already inspected.",
        "01-introduction.html` instead of more rereads.",
    ):
        assert expected_fragment in message
4049
4050
@pytest.mark.asyncio
async def test_tool_batch_runner_bookkeeping_note_with_missing_artifact_requeues_resume_step(
    temp_dir: Path,
) -> None:
    """Working-note-only batch while a planned file is missing queues a resume nudge.

    The plan declares two chapter files but only the first exists. After a
    batch containing just ``notepad_write_working``, the queued steering
    message must acknowledge the note and name ``02-installation.html`` as
    the file to create next.
    """

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        # Reaching this hook at all is a test failure.
        raise AssertionError("Confidence scoring should not run in this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        # Reaching this hook at all is a test failure.
        raise AssertionError("Verification should not run in this scenario")

    # On-disk layout: index and chapter one exist, chapter two does not.
    nginx_dir = temp_dir / "guides" / "nginx"
    chapters_dir = nginx_dir / "chapters"
    nginx_dir.mkdir(parents=True)
    chapters_dir.mkdir()
    index_file = nginx_dir / "index.html"
    first_chapter = chapters_dir / "01-getting-started.html"
    missing_chapter = chapters_dir / "02-installation.html"
    index_file.write_text("<html></html>\n")
    first_chapter.write_text("<h1>One</h1>\n")

    plan_file = temp_dir / "implementation.md"
    declared_outputs = [
        f"{nginx_dir}/",
        f"{chapters_dir}/",
        index_file,
        first_chapter,
        missing_chapter,
    ]
    plan_file.write_text(
        "\n".join(
            ["# Implementation Plan", "", "## File Changes"]
            + [f"- `{target}`" for target in declared_outputs]
            + [""]
        )
    )

    context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )
    queued_messages: list[str] = []
    context.queue_steering_message_callback = queued_messages.append
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))

    dod = create_definition_of_done("Create a multi-file nginx guide.")
    dod.implementation_plan = str(plan_file)
    tracked_todos = [
        {
            "content": "Create 01-getting-started.html",
            "active_form": "Creating 01-getting-started.html",
            "status": "completed",
        },
        {
            "content": "Create 02-installation.html",
            "active_form": "Creating 02-installation.html",
            "status": "pending",
        },
    ]
    sync_todos_to_definition_of_done(dod, tracked_todos, project_root=temp_dir)
    dod.touched_files.extend([str(index_file), str(first_chapter)])

    # The batch contains only a bookkeeping note, no mutation.
    tool_call = ToolCall(
        id="working-note",
        name="notepad_write_working",
        arguments={"content": "Creating the second chapter file: Installation"},
    )
    scripted_outcome = tool_outcome(
        tool_call=tool_call,
        output="Working note recorded",
        is_error=False,
    )
    executor = FakeExecutor([scripted_outcome])

    await runner.execute_batch(
        tool_calls=[tool_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=TurnSummary(final_response=""),
        dod=dod,
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert queued_messages
    message = queued_messages[-1]
    for expected_fragment in (
        "Bookkeeping note is recorded. A declared output artifact is still missing.",
        "Resume by creating `02-installation.html` now.",
        "Make your next response the concrete mutation tool call itself",
        "refresh `TodoWrite`",
        "Do not spend the next turn on additional notes, rediscovery, verification, or final confirmation",
    ):
        assert expected_fragment in message
4165
4166
@pytest.mark.asyncio
async def test_tool_batch_runner_working_note_respects_discovery_first_pending_step(
    temp_dir: Path,
) -> None:
    """Check the nudge queued after a working note while discovery is pending.

    The definition of done lists an "examine the existing fortran guide" item
    first, and this test creates none of the planned nginx files.  After a
    successful ``notepad_write_working`` call, the last queued steering message
    must continue with that first pending item, ask for one evidence-gathering
    tool call, and must not say to create ``index.html``.
    """

    async def confidence_must_not_run(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def verification_must_not_run(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run in this scenario")

    # Plan declares the nginx index page and its chapters directory.
    nginx_root = temp_dir / "guides" / "nginx"
    plan_lines = [
        "# Implementation Plan",
        "",
        "## File Changes",
        f"- `{nginx_root / 'index.html'}`",
        f"- `{nginx_root / 'chapters'}`",
        "",
    ]
    plan_path = temp_dir / "implementation.md"
    plan_path.write_text("\n".join(plan_lines))

    context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=confidence_must_not_run,
        verify_action=verification_must_not_run,
        auto_recover=False,
    )
    # Capture every steering message the runner queues.
    steering: list[str] = []
    context.queue_steering_message_callback = steering.append
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))

    dod = create_definition_of_done("Create a multi-file nginx guide.")
    dod.implementation_plan = str(plan_path)
    pending = [
        "First, examine the existing fortran guide structure and content to understand the format",
        "Create the nginx directory structure",
        "Develop the main index.html file for the nginx guide",
    ]
    dod.pending_items.extend(pending)

    note_call = ToolCall(
        id="working-note",
        name="notepad_write_working",
        arguments={"content": "Analyzing the fortran guide structure before creating nginx guide"},
    )
    note_outcome = tool_outcome(
        tool_call=note_call,
        output="Working note recorded",
        is_error=False,
    )
    executor = FakeExecutor([note_outcome])

    turn_summary = TurnSummary(final_response="")
    await runner.execute_batch(
        tool_calls=[note_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=turn_summary,
        dod=dod,
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert steering
    nudge = steering[-1]
    assert (
        "Continue with the next pending item: `First, examine the existing fortran guide structure and content to understand the format`."
        in nudge
    )
    assert "one concrete evidence-gathering tool call" in nudge
    assert "Resume by creating `index.html` now." not in nudge
4259
4260
@pytest.mark.asyncio
async def test_tool_batch_runner_working_note_prefers_declared_output_gap_over_stale_discovery(
    temp_dir: Path,
) -> None:
    """Check that a missing linked chapter outranks a leftover discovery item.

    The index page and first chapter are written to disk and recorded as
    touched files, while the index links to two more chapters that the test
    never creates.  After a successful working note, the last queued steering
    message must say to create ``02-installation.html`` and must not fall back
    to the discovery-style first pending item.
    """

    async def confidence_must_not_run(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def verification_must_not_run(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run in this scenario")

    guide_root = temp_dir / "guides" / "nginx"
    chapters_dir = guide_root / "chapters"
    chapters_dir.mkdir(parents=True)
    index_path = guide_root / "index.html"
    first_chapter = chapters_dir / "01-introduction.html"

    # The index links three chapters; only the first is written below.
    index_links = [
        '<a href="chapters/01-introduction.html">Introduction</a>',
        '<a href="chapters/02-installation.html">Installation</a>',
        '<a href="chapters/03-configuration.html">Configuration</a>',
    ]
    index_path.write_text("\n".join(index_links))
    first_chapter.write_text("<h1>Introduction</h1>\n")

    plan_lines = [
        "# Implementation Plan",
        "",
        "## File Changes",
        f"- `{index_path}`",
        f"- `{chapters_dir}/`",
        "",
    ]
    plan_path = temp_dir / "implementation.md"
    plan_path.write_text("\n".join(plan_lines))

    context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=confidence_must_not_run,
        verify_action=verification_must_not_run,
        auto_recover=False,
    )
    steering: list[str] = []
    context.queue_steering_message_callback = steering.append
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))

    dod = create_definition_of_done("Create a multi-file nginx guide.")
    dod.implementation_plan = str(plan_path)
    pending = [
        "First, examine the existing fortran guide structure and content to understand the format",
        "Create chapter files following the established pattern",
    ]
    dod.pending_items.extend(pending)
    dod.touched_files.extend([str(index_path), str(first_chapter)])

    note_call = ToolCall(
        id="working-note",
        name="notepad_write_working",
        arguments={"content": "Created index and first chapter; next is chapter 2"},
    )
    note_outcome = tool_outcome(
        tool_call=note_call,
        output="Working note recorded",
        is_error=False,
    )
    executor = FakeExecutor([note_outcome])

    turn_summary = TurnSummary(final_response="")
    await runner.execute_batch(
        tool_calls=[note_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=turn_summary,
        dod=dod,
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert steering
    nudge = steering[-1]
    assert "Bookkeeping note is recorded. A declared output artifact is still missing." in nudge
    assert "Resume by creating `02-installation.html` now." in nudge
    assert "Continue with the next pending item: `First, examine the existing fortran guide structure" not in nudge
4366
4367
@pytest.mark.asyncio
async def test_tool_batch_runner_shallow_glob_does_not_handoff_before_content_read(
    temp_dir: Path,
) -> None:
    """Check that a directory-only glob result queues no steering message.

    The scripted ``glob`` outcome lists just the fortran guide root and its
    ``chapters`` directory.  With a discovery item first in the pending list,
    the runner must leave the steering queue empty after the batch.
    """

    async def confidence_must_not_run(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def verification_must_not_run(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run in this scenario")

    guides_dir = temp_dir / "Loader" / "guides"
    fortran_root = guides_dir / "fortran"
    chapters_dir = fortran_root / "chapters"
    chapters_dir.mkdir(parents=True)

    plan_lines = [
        "# Implementation Plan",
        "",
        "## File Changes",
        f"- `{guides_dir / 'nginx' / 'index.html'}`",
        f"- `{guides_dir / 'nginx' / 'chapters'}`",
        "",
    ]
    plan_path = temp_dir / "implementation.md"
    plan_path.write_text("\n".join(plan_lines))

    context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=confidence_must_not_run,
        verify_action=verification_must_not_run,
        auto_recover=False,
    )
    steering: list[str] = []
    context.queue_steering_message_callback = steering.append
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))

    dod = create_definition_of_done("Create a multi-file nginx guide.")
    dod.implementation_plan = str(plan_path)
    pending = [
        "First, examine the existing fortran guide structure and content",
        "Create the nginx directory structure",
        "Develop the main index.html file for nginx guide",
    ]
    dod.pending_items.extend(pending)

    glob_call = ToolCall(
        id="glob-1",
        name="glob",
        arguments={"pattern": "**", "path": str(fortran_root)},
    )
    # The scripted output contains two directory paths and no file entries.
    glob_outcome = tool_outcome(
        tool_call=glob_call,
        output=f"{fortran_root}\n{chapters_dir}",
        is_error=False,
    )
    executor = FakeExecutor([glob_outcome])

    turn_summary = TurnSummary(final_response="")
    await runner.execute_batch(
        tool_calls=[glob_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=turn_summary,
        dod=dod,
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert steering == []
4457
4458
@pytest.mark.asyncio
async def test_tool_batch_runner_hands_off_noop_toc_edit_when_file_is_already_valid(
    temp_dir: Path,
) -> None:
    """Check that a blocked no-op edit of a matching TOC queues no nudge.

    The index on disk links to the two chapter files written by this test,
    with link texts equal to their headings.  The scripted ``edit`` uses the
    same text for ``old_string`` and ``new_string`` and comes back BLOCKED;
    the steering queue must stay empty afterwards.
    """

    async def confidence_must_not_run(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should not run in this scenario")

    async def verification_must_not_run(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run in this scenario")

    prompt = (
        "Have a look at ~/Loader/guides/fortran/index.html, then "
        "~/Loader/guides/fortran/chapters. The table of contents links in "
        "index.html are inaccurate and the href’s are wrong. Let’s update the "
        "links and their link texts to be correct."
    )

    chapters = temp_dir / "chapters"
    chapters.mkdir()
    chapter_pages = (
        ("01-introduction.html", "<h1>Chapter 1: Introduction to Fortran</h1>\n"),
        ("02-setup.html", "<h1>Chapter 2: Setting Up Your Environment</h1>\n"),
    )
    for filename, heading in chapter_pages:
        (chapters / filename).write_text(heading)

    current_block = (
        "<h2>Table of Contents</h2>\n"
        ' <ul class="chapter-list">\n'
        ' <li><a href="chapters/01-introduction.html">Chapter 1: Introduction to Fortran</a></li>\n'
        ' <li><a href="chapters/02-setup.html">Chapter 2: Setting Up Your Environment</a></li>\n'
        " </ul>\n"
    )
    index_path = temp_dir / "index.html"
    index_path.write_text(current_block)

    context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=confidence_must_not_run,
        verify_action=verification_must_not_run,
        auto_recover=False,
    )
    context.session.current_task = prompt  # type: ignore[attr-defined]
    steering: list[str] = []
    context.queue_steering_message_callback = steering.append
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))

    # Old and new strings are the same text, so the edit changes nothing.
    edit_call = ToolCall(
        id="edit-1",
        name="edit",
        arguments={
            "file_path": str(index_path),
            "old_string": current_block,
            "new_string": current_block,
        },
    )
    blocked_outcome = tool_outcome(
        tool_call=edit_call,
        output=(
            "[Blocked - old_string and new_string are identical - no change "
            "would occur] Suggestion: Provide different old and new strings"
        ),
        is_error=True,
        state=ToolExecutionState.BLOCKED,
    )
    executor = FakeExecutor([blocked_outcome])

    turn_summary = TurnSummary(final_response="")
    dod = create_definition_of_done(prompt)
    await runner.execute_batch(
        tool_calls=[edit_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=turn_summary,
        dod=dod,
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert steering == []
4552
4553
def test_tool_batch_runner_blocked_noop_edit_nudge_stays_on_active_repair_target(
    temp_dir: Path,
) -> None:
    """Check the nudge for a blocked no-op edit aimed at the repair target.

    The session holds an assistant "Repair focus" message naming the target
    file.  Calling ``_queue_blocked_html_edit_nudge`` with an identical-strings
    edit on that file must queue a message that names the target and carries
    the expected guidance phrases.
    """

    async def confidence_must_not_run(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def verification_must_not_run(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run in this scenario")

    chapters_dir = temp_dir / "guide" / "chapters"
    repair_target = chapters_dir / "04-basic-usage.html"
    repair_focus = Message(
        role=Role.ASSISTANT,
        content=(
            "Repair focus:\n"
            f"- Fix the broken local reference `05-advanced-topics.html` in `{repair_target}`.\n"
            f"- Immediate next step: edit `{repair_target}`.\n"
            f"- If the broken reference should remain, create `{chapters_dir / '05-advanced-topics.html'}`; otherwise remove or replace `05-advanced-topics.html`.\n"
        ),
    )
    context = build_context(
        temp_dir=temp_dir,
        messages=[repair_focus],
        safeguards=FakeSafeguards(),
        assess_confidence=confidence_must_not_run,
        verify_action=verification_must_not_run,
    )
    nudges: list[str] = []
    context.queue_steering_message_callback = nudges.append
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))

    noop_edit = ToolCall(
        id="edit-1",
        name="edit",
        arguments={
            "file_path": str(repair_target),
            "old_string": "same",
            "new_string": "same",
        },
    )
    runner._queue_blocked_html_edit_nudge(
        noop_edit,
        "[Blocked - old_string and new_string are identical - no change would occur] Suggestion: Provide different old and new strings",
    )

    assert nudges
    first_nudge = nudges[0]
    assert str(repair_target) in first_nudge
    assert "no on-disk change" in first_nudge
    assert "replace the surrounding block" in first_nudge
    assert "Do not reopen unrelated reference materials" in first_nudge
4612
4613
async def _noop_emit(event: AgentEvent) -> None:
    """Accept an agent event and discard it; used where a test ignores events."""
    return
4616
4617
@pytest.mark.asyncio
async def test_tool_batch_runner_marks_verification_planned_after_new_mutation(
    temp_dir: Path,
) -> None:
    """Check that a successful ``write`` marks verification as planned.

    After the batch, the definition of done must report a "planned"
    verification with attempt number 1, and the last workflow timeline entry
    must carry a matching "planned" observation.
    """

    async def confidence_must_not_run(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def verification_must_not_run(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run for this scenario")

    context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=confidence_must_not_run,
        verify_action=verification_must_not_run,
    )
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))

    write_call = ToolCall(
        id="write-1",
        name="write",
        arguments={"file_path": str(temp_dir / "README.md"), "content": "updated\n"},
    )
    write_outcome = tool_outcome(tool_call=write_call, output="wrote file", is_error=False)
    executor = FakeExecutor([write_outcome])
    summary = TurnSummary(final_response="")
    dod = create_definition_of_done("Update README and verify it still works.")
    events: list[AgentEvent] = []

    async def emit(event: AgentEvent) -> None:
        events.append(event)

    await runner.execute_batch(
        tool_calls=[write_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=emit,
        summary=summary,
        dod=dod,
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    # State recorded on the definition of done.
    assert dod.last_verification_result == "planned"
    assert dod.verification_commands
    assert "Collect verification evidence" in dod.pending_items
    assert dod.active_verification_attempt_id == "verification-attempt-1"
    assert dod.active_verification_attempt_number == 1

    # State recorded on the most recent timeline entry.
    latest_entry = summary.workflow_timeline[-1]
    assert latest_entry.reason_code == "verification_planned"
    assert latest_entry.policy_outcome == "planned"
    observation = latest_entry.verification_observations[0]
    assert observation.status == "planned"
    assert observation.attempt_id == "verification-attempt-1"
    assert observation.attempt_number == 1
4689
4690
@pytest.mark.asyncio
async def test_tool_batch_runner_does_not_mark_verification_planned_after_setup_only_mkdir(
    temp_dir: Path,
) -> None:
    """Check that a lone ``mkdir -p`` bash call does not plan verification.

    After the batch, the definition of done must have no verification result
    and no "Collect verification evidence" pending item, and no timeline entry
    may carry the ``verification_planned`` reason code.
    """

    async def confidence_must_not_run(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def verification_must_not_run(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run in this scenario")

    context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=confidence_must_not_run,
        verify_action=verification_must_not_run,
    )
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))

    nginx_root = temp_dir / "Loader" / "guides" / "nginx"
    chapters = nginx_root / "chapters"
    plan_lines = [
        "# Implementation Plan",
        "",
        "## File Changes",
        f"- `{chapters}/`",
        f"- `{nginx_root / 'index.html'}`",
        "",
    ]
    plan_path = temp_dir / "implementation.md"
    plan_path.write_text("\n".join(plan_lines))

    mkdir_call = ToolCall(
        id="mkdir-1",
        name="bash",
        arguments={"command": f"mkdir -p {chapters}"},
    )
    mkdir_outcome = tool_outcome(tool_call=mkdir_call, output="", is_error=False)
    executor = FakeExecutor([mkdir_outcome])
    summary = TurnSummary(final_response="")
    dod = create_definition_of_done("Create an equally thorough nginx guide with chapters.")
    dod.implementation_plan = str(plan_path)
    events: list[AgentEvent] = []

    async def emit(event: AgentEvent) -> None:
        events.append(event)

    await runner.execute_batch(
        tool_calls=[mkdir_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=emit,
        summary=summary,
        dod=dod,
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert dod.last_verification_result is None
    assert "Collect verification evidence" not in dod.pending_items
    assert all(
        entry.reason_code != "verification_planned" for entry in summary.workflow_timeline
    )
4769
4770
@pytest.mark.asyncio
async def test_tool_batch_runner_marks_passed_verification_stale_after_new_mutation(
    temp_dir: Path,
) -> None:
    """Check that a ``write`` after a passed verification marks it stale.

    The definition of done starts with a passed first attempt and one piece of
    evidence.  After the batch, the result must be "stale", the evidence
    cleared, the evidence item moved back to pending, the active attempt
    advanced to number 2, and the last timeline entry must carry a "stale"
    observation for the original command.
    """

    async def confidence_must_not_run(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def verification_must_not_run(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run for this scenario")

    context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=confidence_must_not_run,
        verify_action=verification_must_not_run,
    )
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))

    write_call = ToolCall(
        id="write-1",
        name="write",
        arguments={"file_path": str(temp_dir / "README.md"), "content": "updated\n"},
    )
    write_outcome = tool_outcome(tool_call=write_call, output="wrote file", is_error=False)
    executor = FakeExecutor([write_outcome])
    summary = TurnSummary(final_response="")

    # Seed the definition of done as if attempt 1 had already passed.
    dod = create_definition_of_done("Update README and verify it still works.")
    dod.verification_commands = ["uv run pytest -q"]
    dod.last_verification_result = "passed"
    dod.verification_attempt_counter = 1
    dod.active_verification_attempt_id = "verification-attempt-1"
    dod.active_verification_attempt_number = 1
    passed_evidence = VerificationEvidence(
        command="uv run pytest -q",
        passed=True,
        stdout="401 passed",
        kind="test",
    )
    dod.evidence = [passed_evidence]
    dod.completed_items.append("Collect verification evidence")
    events: list[AgentEvent] = []

    async def emit(event: AgentEvent) -> None:
        events.append(event)

    await runner.execute_batch(
        tool_calls=[write_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=emit,
        summary=summary,
        dod=dod,
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    # State recorded on the definition of done.
    assert dod.last_verification_result == "stale"
    assert dod.evidence == []
    assert "Collect verification evidence" in dod.pending_items
    assert "Collect verification evidence" not in dod.completed_items
    assert dod.active_verification_attempt_id == "verification-attempt-2"
    assert dod.active_verification_attempt_number == 2

    # State recorded on the most recent timeline entry.
    latest_entry = summary.workflow_timeline[-1]
    assert latest_entry.reason_code == "verification_stale"
    assert latest_entry.policy_outcome == "stale"
    observation = latest_entry.verification_observations[0]
    assert observation.status == "stale"
    assert observation.attempt_id == "verification-attempt-1"
    assert observation.attempt_number == 1
    assert observation.supersedes_attempt_id == "verification-attempt-2"
    assert observation.command == "uv run pytest -q"
4865
4866
def test_tool_batch_runner_blocked_active_repair_nudge_uses_repair_scope(temp_dir: Path) -> None:
    """Check that the active-repair nudge names both paths from the repair focus.

    The session holds an assistant "Repair focus" message naming the file to
    edit and the chapter file that could be created instead.  The queued
    message must mention both paths and warn against reopening unrelated
    reference materials.
    """

    async def confidence_must_not_run(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def verification_must_not_run(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run in this scenario")

    repair_target = temp_dir / "guide" / "index.html"
    missing_chapter = temp_dir / "guide" / "chapters" / "01-getting-started.html"
    repair_focus = Message(
        role=Role.ASSISTANT,
        content=(
            "Repair focus:\n"
            f"- Fix the broken local reference `chapters/01-getting-started.html` in `{repair_target}`.\n"
            f"- Immediate next step: edit `{repair_target}`.\n"
            f"- If the broken reference should remain, create `{missing_chapter}`; otherwise remove or replace `chapters/01-getting-started.html`.\n"
        ),
    )
    context = build_context(
        temp_dir=temp_dir,
        messages=[repair_focus],
        safeguards=FakeSafeguards(),
        assess_confidence=confidence_must_not_run,
        verify_action=verification_must_not_run,
    )
    nudges: list[str] = []
    context.queue_steering_message_callback = nudges.append
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))

    runner._queue_blocked_active_repair_nudge(
        "[Blocked - active repair scope: verification already identified the repair target.]"
    )

    assert nudges
    first_nudge = nudges[0]
    assert str(repair_target) in first_nudge
    assert str(missing_chapter) in first_nudge
    assert "Do not reopen unrelated reference materials" in first_nudge
4913
4914
def test_tool_batch_runner_blocked_active_repair_mutation_nudge_uses_allowed_paths(
    temp_dir: Path,
) -> None:
    """Check that the repair-mutation nudge names both paths from the repair focus.

    The repair focus names a chapter file to edit and a stylesheet that could
    be created.  The queued message must mention both and include the
    "before widening the change set" guidance.
    """

    async def confidence_must_not_run(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def verification_must_not_run(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run in this scenario")

    guide_dir = temp_dir / "guide"
    repair_target = guide_dir / "chapters" / "05-advanced-configurations.html"
    stylesheet = guide_dir / "styles.css"
    repair_focus = Message(
        role=Role.ASSISTANT,
        content=(
            "Repair focus:\n"
            f"- Fix the broken local reference `../styles.css` in `{repair_target}`.\n"
            f"- Immediate next step: edit `{repair_target}`.\n"
            f"- If the broken reference should remain, create `{stylesheet}`; otherwise remove or replace `../styles.css`.\n"
        ),
    )
    context = build_context(
        temp_dir=temp_dir,
        messages=[repair_focus],
        safeguards=FakeSafeguards(),
        assess_confidence=confidence_must_not_run,
        verify_action=verification_must_not_run,
    )
    nudges: list[str] = []
    context.queue_steering_message_callback = nudges.append
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))

    runner._queue_blocked_active_repair_mutation_nudge(
        "[Blocked - active repair mutation scope: verification already identified the repair target.]"
    )

    assert nudges
    first_nudge = nudges[0]
    assert str(repair_target) in first_nudge
    assert str(stylesheet) in first_nudge
    assert "before widening the change set" in first_nudge
4964
4965
def test_tool_batch_runner_blocked_late_reference_drift_nudge_points_to_missing_artifact(
    temp_dir: Path,
) -> None:
    """Check that the late-reference-drift nudge names the unwritten artifact.

    The plan lists an index and three chapters; the test writes the index and
    the first two chapters only.  The queued message must mention
    ``03-first-website.html`` and "older reference materials".
    """

    async def confidence_must_not_run(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def verification_must_not_run(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run in this scenario")

    context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=confidence_must_not_run,
        verify_action=verification_must_not_run,
    )
    nudges: list[str] = []
    context.queue_steering_message_callback = nudges.append
    store = DefinitionOfDoneStore(temp_dir)
    dod = create_definition_of_done("Create a multi-file guide from a reference")

    plan_path = temp_dir / "implementation.md"
    plan_path.write_text(
        "# File Changes\n"
        "- `guide/index.html`\n"
        "- `guide/chapters/01-getting-started.html`\n"
        "- `guide/chapters/02-installation.html`\n"
        "- `guide/chapters/03-first-website.html`\n"
    )
    dod.implementation_plan = str(plan_path)

    # Write every planned file except the third chapter.
    guide_dir = temp_dir / "guide"
    chapters_dir = guide_dir / "chapters"
    chapters_dir.mkdir(parents=True, exist_ok=True)
    existing_files = (
        (guide_dir / "index.html", "index"),
        (chapters_dir / "01-getting-started.html", "one"),
        (chapters_dir / "02-installation.html", "two"),
    )
    for file_path, body in existing_files:
        file_path.write_text(body)

    runner = ToolBatchRunner(context, store)

    runner._queue_blocked_late_reference_drift_nudge(
        "[Blocked - late reference drift: several planned artifacts already exist.]",
        dod=dod,
    )

    assert nudges
    first_nudge = nudges[0]
    assert "03-first-website.html" in first_nudge
    assert "older reference materials" in first_nudge
5018
5019
def test_tool_batch_runner_blocked_completed_artifact_scope_nudge_prefers_verification(
    temp_dir: Path,
) -> None:
    """Check the nudge queued when every planned artifact is already on disk.

    The test writes all planned files and syncs one pending "Verify ..." todo
    into the definition of done.  The queued message must state that all
    planned artifacts exist, mention that pending todo, and tell the model not
    to reopen earlier reference materials.
    """

    async def confidence_must_not_run(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def verification_must_not_run(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run in this scenario")

    guide_root = temp_dir / "guide"
    chapters = guide_root / "chapters"
    guide_root.mkdir(parents=True)
    chapters.mkdir()
    index_path = guide_root / "index.html"
    chapter_one = chapters / "01-getting-started.html"
    chapter_two = chapters / "02-installation.html"
    for file_path, body in ((index_path, "index"), (chapter_one, "one"), (chapter_two, "two")):
        file_path.write_text(body)

    # The plan lists both directories and all three files created above.
    planned_paths = [guide_root, chapters, index_path, chapter_one, chapter_two]
    plan_lines = ["# Implementation Plan", "", "## File Changes"]
    plan_lines.extend(f"- `{planned}`" for planned in planned_paths)
    plan_lines.append("")
    plan_path = temp_dir / "implementation.md"
    plan_path.write_text("\n".join(plan_lines))

    context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=confidence_must_not_run,
        verify_action=verification_must_not_run,
    )
    nudges: list[str] = []
    context.queue_steering_message_callback = nudges.append
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))

    dod = create_definition_of_done("Create a multi-file guide from a reference")
    dod.implementation_plan = str(plan_path)
    dod.verification_commands = [f"ls -la {guide_root}"]
    verify_todo = {
        "content": "Verify all guide files are linked and complete",
        "active_form": "Working on: Verify all guide files are linked and complete",
        "status": "pending",
    }
    sync_todos_to_definition_of_done(
        dod,
        [verify_todo],
        project_root=temp_dir,
    )

    runner._queue_blocked_completed_artifact_scope_nudge(
        "[Blocked - completed artifact set scope: all explicitly planned artifacts already exist.]",
        dod=dod,
    )

    assert nudges
    first_nudge = nudges[0]
    assert "All explicitly planned artifacts already exist." in first_nudge
    assert "Verify all guide files are linked and complete" in first_nudge
    assert "Do not reopen earlier reference materials." in first_nudge
5100
5101
def test_tool_batch_runner_blocked_html_declared_target_nudge_uses_closest_declared_target(
    temp_dir: Path,
) -> None:
    """Check the nudge for a write blocked over undeclared HTML link targets.

    The blocked message names ``chapters/02-installation.html`` as the closest
    declared target.  The queued message must mention the file being written,
    that closest target in backticks, and the phrase "same file now".
    """

    async def confidence_must_not_run(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def verification_must_not_run(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run in this scenario")

    context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=confidence_must_not_run,
        verify_action=verification_must_not_run,
    )
    nudges: list[str] = []
    context.queue_steering_message_callback = nudges.append
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))

    written_path = str(temp_dir / "guide" / "chapters" / "01-introduction.html")
    blocked_write = ToolCall(
        id="write-ch1",
        name="write",
        arguments={"file_path": written_path},
    )
    blocked_text = (
        "[Blocked - HTML page introduces new local targets outside the current declared artifact set] "
        "Suggestion: Keep non-root HTML pages within the root-declared local-link set and avoid "
        "introducing new sibling targets that the guide root does not declare, for example fix: 02-setup.html. "
        "Already-declared local targets include: chapters/01-introduction.html, chapters/02-installation.html, "
        "chapters/03-configuration.html. Closest declared local targets include: chapters/02-installation.html"
    )
    runner._queue_blocked_html_declared_target_nudge(blocked_write, blocked_text)

    assert nudges
    first_nudge = nudges[0]
    assert written_path in first_nudge
    assert "`chapters/02-installation.html`" in first_nudge
    assert "same file now" in first_nudge
5150
5151
@pytest.mark.asyncio
async def test_tool_batch_runner_blocked_empty_file_path_nudges_concrete_next_artifact(
    temp_dir: Path,
) -> None:
    """Blocked write with an empty ``file_path`` nudges toward the next planned file.

    The guide index and chapter one are written to disk, the implementation
    plan additionally lists chapter two, and the fake executor reports a
    blocked ``write`` call whose ``file_path`` is empty. The first queued
    steering message must point at chapter two, and the last recovery attempt
    must carry the blocked message as its error.
    """

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        # Stub handed to build_context; raises if it is ever awaited.
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        # Stub handed to build_context; raises if it is ever awaited.
        raise AssertionError("Verification should not run in this scenario")

    guide_root = temp_dir / "guides" / "nginx"
    chapters = guide_root / "chapters"
    chapters.mkdir(parents=True)
    index_path = guide_root / "index.html"
    chapter_one = chapters / "01-introduction.html"
    chapter_two = chapters / "02-installation.html"
    # Only the index and chapter one are written; chapter two stays absent.
    index_path.write_text("<html></html>\n")
    chapter_one.write_text("<h1>Intro</h1>\n")

    # The plan lists all three files, including the one not yet on disk.
    plan_lines = [
        "# Implementation Plan",
        "",
        "## File Changes",
        *(f"- `{planned}`" for planned in (index_path, chapter_one, chapter_two)),
        "",
    ]
    implementation_plan = temp_dir / "implementation.md"
    implementation_plan.write_text("\n".join(plan_lines))

    context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )
    queued: list[str] = []
    context.queue_steering_message_callback = queued.append
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))

    blocked_message = "[Blocked - Empty file path] Suggestion: Provide a valid file path"
    tool_call = ToolCall(
        id="write-2",
        name="write",
        arguments={"file_path": "", "content": "<html></html>\n"},
    )
    blocked_result = Message.tool_result_message(
        tool_call_id=tool_call.id,
        display_content=blocked_message,
        result_content=blocked_message,
        is_error=True,
    )
    blocked_outcome = ToolExecutionOutcome(
        tool_call=tool_call,
        state=ToolExecutionState.BLOCKED,
        message=blocked_result,
        event_content=blocked_message,
        is_error=True,
        result_output=blocked_message,
    )

    dod = create_definition_of_done("Create a multi-file nginx guide.")
    dod.implementation_plan = str(implementation_plan)
    dod.touched_files.extend([str(existing) for existing in (index_path, chapter_one)])
    dod.pending_items.append("Creating Chapter 2: Installation and Setup")

    # The fake executor is scripted with the single blocked outcome built above.
    executor = FakeExecutor([blocked_outcome])

    await runner.execute_batch(
        tool_calls=[tool_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=TurnSummary(final_response=""),
        dod=dod,
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert queued
    nudge = queued[0]
    assert "did not provide a valid `file_path`" in nudge
    assert "Resume by creating `02-installation.html` now." in nudge
    assert f"Prefer one `write` call for `{chapter_two}` instead of more rereads." in nudge

    recovery = context.recovery_context
    assert recovery is not None
    assert recovery.attempts[-1].error == blocked_message