Python · 164917 bytes Raw Blame History
1 """Tests for tool-batch execution on RuntimeContext."""
2
3 from __future__ import annotations
4
5 from pathlib import Path
6 from types import SimpleNamespace
7
8 import pytest
9
10 from loader.llm.base import Message, Role, ToolCall
11 from loader.runtime.context import RuntimeContext
12 from loader.runtime.dod import (
13 DefinitionOfDoneStore,
14 VerificationEvidence,
15 create_definition_of_done,
16 )
17 from loader.runtime.events import AgentEvent, TurnSummary
18 from loader.runtime.executor import ToolExecutionOutcome, ToolExecutionState
19 from loader.runtime.permissions import (
20 PermissionMode,
21 build_permission_policy,
22 load_permission_rules,
23 )
24 from loader.runtime.reasoning_types import (
25 ActionVerification,
26 ConfidenceAssessment,
27 ConfidenceLevel,
28 )
29 from loader.runtime.recovery import RecoveryContext
30 from loader.runtime.tool_batches import (
31 ToolBatchRunner,
32 )
33 from loader.runtime.tool_batches import (
34 _should_prioritize_missing_artifact as tool_batches_should_prioritize_missing_artifact,
35 )
36 from loader.runtime.workflow import sync_todos_to_definition_of_done
37 from loader.tools.base import ToolResult as RegistryToolResult
38 from loader.tools.base import create_default_registry
39 from tests.helpers.runtime_harness import ScriptedBackend
40
41
class FakeSession:
    """In-memory session double: collects messages and workflow-timeline entries."""

    def __init__(self, messages: list[Message]) -> None:
        # Defensive copy so callers can keep reusing their seed list safely.
        self.messages = [*messages]
        self.workflow_timeline: list = []

    def append(self, message: Message) -> None:
        """Record a conversation message."""
        self.messages.append(message)

    def append_workflow_timeline_entry(self, entry) -> None:
        """Record a workflow timeline entry (shape is opaque to the fake)."""
        self.workflow_timeline.append(entry)
52
53
class FakeCodeFilter:
    """No-op stand-in for the safeguards code filter."""

    def reset(self) -> None:
        """Nothing to reset on the fake; implicitly returns None."""
57
58
class FakeSafeguards:
    """Pass-through safeguards stub with a configurable loop-detection result.

    Content filters return their input unchanged, steering is always off, and
    only ``detect_loop`` replays the tuple the test configured.
    """

    def __init__(self, *, detect_loop_result: tuple[bool, str] = (False, "")) -> None:
        # Opaque placeholders; the runner only needs the attributes to exist.
        self.action_tracker = object()
        self.validator = object()
        self.code_filter = FakeCodeFilter()
        self._detect_loop_result = detect_loop_result

    def filter_stream_chunk(self, content: str) -> str:
        """Pass streamed content through unchanged."""
        return content

    def filter_complete_content(self, content: str) -> str:
        """Pass completed content through unchanged."""
        return content

    def should_steer(self) -> bool:
        """Steering is never requested by the fake."""
        return False

    def get_steering_message(self) -> str | None:
        """No steering message is ever available."""
        return None

    def record_response(self, content: str) -> None:
        """Accept and discard the response."""
        return None

    def detect_text_loop(self, content: str) -> tuple[bool, str]:
        """Text-loop detection is always negative in tests."""
        return (False, "")

    def detect_loop(self) -> tuple[bool, str]:
        """Replay whatever loop-detection outcome the test configured."""
        return self._detect_loop_result
86
87
class FakeExecutor:
    """Replays pre-queued ToolExecutionOutcome objects and records every call.

    Outcomes are consumed in FIFO order; running out of them is a test bug and
    raises immediately.
    """

    def __init__(self, outcomes: list[ToolExecutionOutcome]) -> None:
        self._outcomes = list(outcomes)
        self.calls: list[ToolCall] = []

    async def execute_tool_call(self, tool_call: ToolCall, **_: object) -> ToolExecutionOutcome:
        """Record the call and hand back the next queued outcome."""
        self.calls.append(tool_call)
        if self._outcomes:
            return self._outcomes.pop(0)
        raise AssertionError("No fake tool outcome queued")
98
99
def build_context(
    *,
    temp_dir: Path,
    messages: list[Message],
    safeguards: FakeSafeguards,
    assess_confidence,
    verify_action,
    recovery_context: RecoveryContext | None = None,
    confidence_scoring: bool = False,
    verification: bool = False,
    auto_recover: bool = True,
    min_confidence_for_action: int = 3,
) -> RuntimeContext:
    """Assemble a RuntimeContext wired with test fakes for ToolBatchRunner tests.

    The registry and permission policy are real (rooted at ``temp_dir``), while
    the session, backend, safeguards, and reasoning hooks are stand-ins supplied
    by the caller. Flags mirror the reasoning-config switches individual tests
    need to toggle.
    """
    tool_registry = create_default_registry(temp_dir)
    tool_registry.configure_workspace_root(temp_dir)
    rules_status = load_permission_rules(temp_dir)
    # Reasoning knobs the tests care about; everything else stays disabled.
    reasoning_config = SimpleNamespace(
        rollback=False,
        show_rollback_plan=False,
        completion_check=True,
        max_continuation_prompts=5,
        self_critique=False,
        confidence_scoring=confidence_scoring,
        min_confidence_for_action=min_confidence_for_action,
        verification=verification,
    )
    return RuntimeContext(
        project_root=temp_dir,
        backend=ScriptedBackend(),
        registry=tool_registry,
        session=FakeSession(messages),  # type: ignore[arg-type]
        config=SimpleNamespace(
            force_react=False,
            max_recovery_attempts=2,
            auto_recover=auto_recover,
            reasoning=reasoning_config,
        ),
        capability_profile=SimpleNamespace(supports_native_tools=True),  # type: ignore[arg-type]
        project_context=None,
        permission_policy=build_permission_policy(
            active_mode=PermissionMode.WORKSPACE_WRITE,
            workspace_root=temp_dir,
            tool_requirements=tool_registry.get_tool_requirements(),
            rules=rules_status.rules,
        ),
        permission_config_status=rules_status,
        workflow_mode="execute",
        safeguards=safeguards,
        reasoning=SimpleNamespace(
            assess_confidence=assess_confidence,
            verify_action=verify_action,
        ),
        recovery_context=recovery_context,
    )
155
156
def tool_outcome(
    *,
    tool_call: ToolCall,
    output: str,
    is_error: bool,
    state: ToolExecutionState = ToolExecutionState.EXECUTED,
    metadata: dict[str, object] | None = None,
) -> ToolExecutionOutcome:
    """Build a fully-populated ToolExecutionOutcome for a FakeExecutor queue.

    The same ``output`` string is used for the session message, the event
    payload, and the registry result so assertions can match any of them.
    """
    result_message = Message.tool_result_message(
        tool_call_id=tool_call.id,
        display_content=output,
        result_content=output,
        is_error=is_error,
    )
    registry_result = RegistryToolResult(
        output=output,
        is_error=is_error,
        metadata=metadata or {},
    )
    return ToolExecutionOutcome(
        tool_call=tool_call,
        state=state,
        message=result_message,
        event_content=output,
        is_error=is_error,
        result_output=output,
        registry_result=registry_result,
    )
183
184
@pytest.mark.asyncio
async def test_tool_batch_runner_uses_context_for_confidence_gate(temp_dir: Path) -> None:
    """A LOW-confidence assessment must skip execution and warn the model.

    The runner should pass recent conversation history to the confidence
    assessor, skip the tool call entirely, append a low-confidence warning as
    a user message, and emit a ``confidence`` event.
    """
    captured: dict[str, str] = {}

    async def assess_confidence(tool_name: str, tool_args: dict, context: str) -> ConfidenceAssessment:
        # Capture the assessment context so we can assert it contains history.
        captured["context"] = context
        return ConfidenceAssessment(
            action=f"{tool_name} with {tool_args}",
            tool_name=tool_name,
            tool_args=tool_args,
            level=ConfidenceLevel.LOW,
            reasoning="Need to inspect the target first.",
            risks=["Unknown target file"],
        )

    async def verify_action(tool_name: str, tool_args: dict, result: str, expected: str = "") -> ActionVerification:
        raise AssertionError("Verification should not run for skipped actions")

    context = build_context(
        temp_dir=temp_dir,
        messages=[
            Message(role=Role.USER, content="Please inspect the project."),
            Message(role=Role.ASSISTANT, content="I will read the file next."),
        ],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        confidence_scoring=True,
        min_confidence_for_action=3,
    )
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))
    tool_call = ToolCall(id="read-1", name="read", arguments={"file_path": "README.md"})
    events: list[AgentEvent] = []

    async def emit(event: AgentEvent) -> None:
        events.append(event)

    # The queued outcome must never be consumed: the confidence gate skips it.
    executor = FakeExecutor([tool_outcome(tool_call=tool_call, output="unused", is_error=False)])
    result = await runner.execute_batch(
        tool_calls=[tool_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=emit,
        summary=TurnSummary(final_response=""),
        dod=create_definition_of_done("Read the docs"),
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert result.actions_taken == []
    assert executor.calls == []
    assert "Please inspect the project." in captured["context"]
    # The warning is injected as a trailing USER message for the next turn.
    assert context.session.messages[-1].role == Role.USER
    assert "[LOW CONFIDENCE WARNING]" in context.session.messages[-1].content
    event_types = [event.type for event in events]
    assert "confidence" in event_types
244
245
@pytest.mark.asyncio
async def test_tool_batch_runner_tracks_recovery_with_legacy_context(temp_dir: Path) -> None:
    """A failed tool call should populate a recovery context and emit a recovery event."""

    async def assess_confidence(tool_name: str, tool_args: dict, context: str) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def verify_action(tool_name: str, tool_args: dict, result: str, expected: str = "") -> ActionVerification:
        raise AssertionError("Verification should not run for failed actions")

    context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=True,
    )
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))
    tool_call = ToolCall(id="bash-1", name="bash", arguments={"command": "pytest"})
    # The queued outcome reports an error, so the runner must start recovery.
    executor = FakeExecutor([tool_outcome(tool_call=tool_call, output="command failed", is_error=True)])
    summary = TurnSummary(final_response="")
    events: list[AgentEvent] = []

    async def emit(event: AgentEvent) -> None:
        events.append(event)

    await runner.execute_batch(
        tool_calls=[tool_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=emit,
        summary=summary,
        dod=create_definition_of_done("Run tests"),
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert context.recovery_context is not None
    assert summary.tool_result_messages
    # The tool-result message must land in both the summary and the session.
    assert context.session.messages[-1] == summary.tool_result_messages[-1]
    assert any(event.type == "recovery" for event in events)
289
290
@pytest.mark.asyncio
async def test_tool_batch_runner_emits_tool_metadata(temp_dir: Path) -> None:
    """Registry-result metadata should be forwarded on the ``tool_result`` event."""

    async def assess_confidence(tool_name: str, tool_args: dict, context: str) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def verify_action(tool_name: str, tool_args: dict, result: str, expected: str = "") -> ActionVerification:
        raise AssertionError("Verification should not run for this scenario")

    context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))
    tool_call = ToolCall(
        id="bash-1",
        name="bash",
        arguments={"command": "python -m http.server 8000", "background": True},
    )
    # Metadata describing a background job, as the bash tool would report it.
    metadata = {
        "job_id": "bash-1",
        "status": "running",
        "background": True,
    }
    executor = FakeExecutor(
        [
            tool_outcome(
                tool_call=tool_call,
                output="Started bash job bash-1",
                is_error=False,
                metadata=metadata,
            )
        ]
    )
    events: list[AgentEvent] = []

    async def emit(event: AgentEvent) -> None:
        events.append(event)

    await runner.execute_batch(
        tool_calls=[tool_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=emit,
        summary=TurnSummary(final_response=""),
        dod=create_definition_of_done("Launch a preview server"),
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    # The registry result's metadata rides along on the emitted event unchanged.
    tool_result = next(event for event in events if event.type == "tool_result")
    assert tool_result.tool_metadata == metadata
349
350
@pytest.mark.asyncio
async def test_tool_batch_runner_verifies_with_context_services(temp_dir: Path) -> None:
    """Verification should run via the context's reasoning service on success.

    Even when verification reports discrepancies, the successful step is
    recorded on the pre-existing recovery context, a ``verification`` event is
    emitted, and the raw tool-result message still reaches the session.
    """
    verification_calls: list[str] = []

    async def assess_confidence(tool_name: str, tool_args: dict, context: str) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def verify_action(tool_name: str, tool_args: dict, result: str, expected: str = "") -> ActionVerification:
        verification_calls.append(result)
        # Report a failed verification that requests a correction.
        return ActionVerification(
            tool_name=tool_name,
            tool_args=tool_args,
            expected_outcome="Success",
            actual_result=result,
            verified=False,
            discrepancies=["File contents did not match"],
            needs_correction=True,
            correction_suggestion="Read the file before editing again.",
        )

    existing_recovery = RecoveryContext(
        original_tool="edit",
        original_args={"file_path": "README.md"},
    )
    context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        recovery_context=existing_recovery,
        verification=True,
    )
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))
    tool_call = ToolCall(id="read-1", name="read", arguments={"file_path": "README.md"})
    executor = FakeExecutor([tool_outcome(tool_call=tool_call, output="file contents", is_error=False)])
    events: list[AgentEvent] = []

    async def emit(event: AgentEvent) -> None:
        events.append(event)

    await runner.execute_batch(
        tool_calls=[tool_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=emit,
        summary=TurnSummary(final_response=""),
        dod=create_definition_of_done("Read the docs"),
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert verification_calls == ["file contents"]
    # The pre-existing recovery context is preserved and updated in place.
    assert context.recovery_context is existing_recovery
    assert existing_recovery.successful_steps == [
        ("read", {"file_path": "README.md"})
    ]
    assert context.session.messages[-1].role == Role.TOOL
    assert context.session.messages[-1].content == "file contents"
    assert any(event.type == "verification" for event in events)
414
415
@pytest.mark.asyncio
async def test_tool_batch_runner_preserves_recovery_context_across_diagnostic_success(
    temp_dir: Path,
) -> None:
    """A successful read-only diagnostic must not clear an active recovery context."""

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run for this scenario")

    # Recovery is mid-flight for a failed read of a chapter file.
    existing_recovery = RecoveryContext(
        original_tool="read",
        original_args={"file_path": "chapters/04-data-types.html"},
    )
    existing_recovery.add_attempt(
        "read",
        {"file_path": "chapters/04-data-types.html"},
        "File not found",
    )
    context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        recovery_context=existing_recovery,
        auto_recover=False,
    )
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))
    # A diagnostic (non-mutating) command run while recovering.
    tool_call = ToolCall(
        id="bash-1",
        name="bash",
        arguments={"command": "ls chapters"},
    )
    executor = FakeExecutor(
        [tool_outcome(tool_call=tool_call, output="01-introduction.html", is_error=False)]
    )

    summary = TurnSummary(final_response="")
    await runner.execute_batch(
        tool_calls=[tool_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=summary,
        dod=create_definition_of_done("Fix the chapter links"),
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    # The diagnostic succeeded, but recovery stays active with the step logged.
    assert context.recovery_context is existing_recovery
    assert existing_recovery.successful_steps == [
        ("bash", {"command": "ls chapters"})
    ]
482
483
@pytest.mark.asyncio
async def test_tool_batch_runner_clears_recovery_context_after_successful_mutation(
    temp_dir: Path,
) -> None:
    """A successful mutating tool call (patch) should end the recovery episode."""

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run for this scenario")

    # Recovery is mid-flight for a failed read of a chapter file.
    existing_recovery = RecoveryContext(
        original_tool="read",
        original_args={"file_path": "chapters/04-data-types.html"},
    )
    existing_recovery.add_attempt(
        "read",
        {"file_path": "chapters/04-data-types.html"},
        "File not found",
    )
    context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        recovery_context=existing_recovery,
        auto_recover=False,
    )
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))
    # A mutating call: patch rewrites a single line in index.html.
    tool_call = ToolCall(
        id="patch-1",
        name="patch",
        arguments={
            "file_path": "index.html",
            "hunks": [{"old_start": 1, "old_lines": 1, "new_start": 1, "new_lines": 1, "lines": ["-a", "+b"]}],
        },
    )
    executor = FakeExecutor(
        [tool_outcome(tool_call=tool_call, output="Patched index.html", is_error=False)]
    )

    summary = TurnSummary(final_response="")
    await runner.execute_batch(
        tool_calls=[tool_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=summary,
        dod=create_definition_of_done("Fix the chapter links"),
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    # A successful mutation means the original failure was worked around.
    assert context.recovery_context is None
550
551
@pytest.mark.asyncio
async def test_tool_batch_runner_queues_duplicate_observation_nudge(
    temp_dir: Path,
) -> None:
    """A skipped duplicate read should queue one persistent steering nudge.

    With a declared-but-missing output artifact (``04-variables.html`` from the
    implementation plan), the nudge must tell the model to reuse the earlier
    observation and to resume by writing the missing file.
    """

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run for this scenario")

    # Conversation history: a glob observation, a completed chapter read, and
    # an assistant turn that has just read index.html once already.
    messages = [
        Message(
            role=Role.TOOL,
            content=(
                "Observation [glob]: Result: "
                f"{temp_dir}/chapters/01-introduction.html\n"
                f"{temp_dir}/chapters/02-setup.html\n"
                f"{temp_dir}/chapters/03-basics.html"
            ),
            tool_results=[],
        ),
        Message(
            role=Role.ASSISTANT,
            content="I already inspected the first chapter title.",
            tool_calls=[
                ToolCall(
                    id="read-ch1",
                    name="read",
                    arguments={"file_path": str(temp_dir / 'chapters' / '01-introduction.html')},
                )
            ],
        ),
        Message.tool_result_message(
            tool_call_id="read-ch1",
            display_content="<h1>Chapter 1: Introduction to Fortran</h1>\n",
            result_content="<h1>Chapter 1: Introduction to Fortran</h1>\n",
        ),
        Message(
            role=Role.ASSISTANT,
            content="I should update the index now.",
            tool_calls=[
                ToolCall(
                    id="read-index",
                    name="read",
                    arguments={"file_path": str(temp_dir / 'index.html')},
                )
            ],
        ),
    ]
    context = build_context(
        temp_dir=temp_dir,
        messages=messages,
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )
    # Materialize the workspace the history refers to. Note 04-variables.html
    # is deliberately absent: it is the plan's missing artifact.
    (temp_dir / "chapters").mkdir()
    (temp_dir / "index.html").write_text("<ul></ul>\n")
    (temp_dir / "chapters" / "01-introduction.html").write_text("<h1>Intro</h1>\n")
    (temp_dir / "chapters" / "02-setup.html").write_text("<h1>Setup</h1>\n")
    (temp_dir / "chapters" / "03-basics.html").write_text("<h1>Basics</h1>\n")
    implementation_plan = temp_dir / "implementation.md"
    implementation_plan.write_text(
        "\n".join(
            [
                "# Implementation Plan",
                "",
                "## File Changes",
                f"- `{temp_dir / 'index.html'}`",
                f"- `{temp_dir / 'chapters' / '01-introduction.html'}`",
                f"- `{temp_dir / 'chapters' / '02-setup.html'}`",
                f"- `{temp_dir / 'chapters' / '03-basics.html'}`",
                f"- `{temp_dir / 'chapters' / '04-variables.html'}`",
            ]
        )
    )
    context.session.current_task = (
        f"Update {temp_dir / 'index.html'} with the right chapter links."
    )
    # Capture both steering channels so we can assert which one is used.
    persistent_messages: list[str] = []
    ephemeral_messages: list[str] = []
    context.queue_steering_message_callback = persistent_messages.append
    context.queue_ephemeral_steering_message_callback = ephemeral_messages.append
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))
    tool_call = ToolCall(
        id="read-dup",
        name="read",
        arguments={"file_path": str(temp_dir / "index.html")},
    )
    duplicate_message = (
        "[Skipped - duplicate action: Already read "
        f"{temp_dir / 'index.html'} recently without any intervening changes; "
        "reuse the earlier read result instead of rereading]"
    )
    # The executor reports the call as a DUPLICATE rather than executing it.
    executor = FakeExecutor(
        [
            ToolExecutionOutcome(
                tool_call=tool_call,
                state=ToolExecutionState.DUPLICATE,
                message=Message.tool_result_message(
                    tool_call_id=tool_call.id,
                    display_content=duplicate_message,
                    result_content=duplicate_message,
                ),
                event_content=duplicate_message,
                is_error=False,
                result_output=duplicate_message,
            )
        ]
    )

    summary = TurnSummary(final_response="")
    dod = create_definition_of_done("Fix the chapter links")
    dod.implementation_plan = str(implementation_plan)
    dod.pending_items.append("Create the remaining chapter files")
    await runner.execute_batch(
        tool_calls=[tool_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=summary,
        dod=dod,
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    # Exactly one persistent nudge, combining the duplicate warning with a
    # concrete next step toward the missing artifact; nothing ephemeral.
    assert len(persistent_messages) == 1
    assert "Reuse the earlier observation instead of repeating it." in persistent_messages[0]
    assert "A declared output artifact is still missing." in persistent_messages[0]
    assert "Resume by creating `04-variables.html` now." in persistent_messages[0]
    assert (
        f"Prefer one `write` call for `{temp_dir / 'chapters' / '04-variables.html'}` instead of more rereads."
        in persistent_messages[0]
    )
    assert ephemeral_messages == []
700
701
@pytest.mark.asyncio
async def test_tool_batch_runner_todo_write_does_not_regress_completed_file_todo(
    temp_dir: Path,
) -> None:
    """A stale TodoWrite must not revert a file-creation todo already completed.

    The batch first writes chapter 03 (completing its todo), then replays a
    TodoWrite that still lists both todos as pending; the DoD must keep chapter
    03 completed while leaving chapter 04 pending.
    """

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should not run for this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run for this scenario")

    context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))
    dod = create_definition_of_done("Create a multi-file nginx guide.")
    # Seed the DoD with two pending file-creation todos.
    sync_todos_to_definition_of_done(
        dod,
        [
            {
                "content": "Create 03-first-website.html",
                "active_form": "Creating 03-first-website.html",
                "status": "pending",
            },
            {
                "content": "Create 04-configuration-basics.html",
                "active_form": "Creating 04-configuration-basics.html",
                "status": "pending",
            },
        ],
    )

    chapter_path = temp_dir / "guides" / "nginx" / "chapters" / "03-first-website.html"
    chapter_path.parent.mkdir(parents=True)
    write_call = ToolCall(
        id="write-ch3",
        name="write",
        arguments={"file_path": str(chapter_path), "content": "<html></html>\n"},
    )
    # TodoWrite whose payload has not caught up with the write above.
    stale_todo_call = ToolCall(
        id="todo-stale",
        name="TodoWrite",
        arguments={
            "todos": [
                {
                    "content": "Create 03-first-website.html",
                    "active_form": "Creating 03-first-website.html",
                    "status": "pending",
                },
                {
                    "content": "Create 04-configuration-basics.html",
                    "active_form": "Creating 04-configuration-basics.html",
                    "status": "pending",
                },
            ]
        },
    )
    executor = FakeExecutor(
        [
            tool_outcome(
                tool_call=write_call,
                output=f"Successfully wrote {chapter_path}",
                is_error=False,
            ),
            tool_outcome(
                tool_call=stale_todo_call,
                output="Todos updated",
                is_error=False,
                metadata={
                    "new_todos": [
                        {
                            "content": "Create 03-first-website.html",
                            "active_form": "Creating 03-first-website.html",
                            "status": "pending",
                        },
                        {
                            "content": "Create 04-configuration-basics.html",
                            "active_form": "Creating 04-configuration-basics.html",
                            "status": "pending",
                        },
                    ]
                },
            ),
        ]
    )

    summary = TurnSummary(final_response="")
    await runner.execute_batch(
        tool_calls=[write_call, stale_todo_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=summary,
        dod=dod,
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    # Chapter 03 stays completed despite the stale payload; 04 stays pending.
    assert "Create 03-first-website.html" in dod.completed_items
    assert "Create 03-first-website.html" not in dod.pending_items
    assert "Create 04-configuration-basics.html" in dod.pending_items
819
820
@pytest.mark.asyncio
async def test_tool_batch_runner_proactively_queues_verified_html_inventory(
    temp_dir: Path,
) -> None:
    """A glob over existing chapter files should pass through without nudges.

    When the glob result matches reality, no steering messages are queued and
    the single tool-result message carries no injected inventory preamble.
    """

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run for this scenario")

    # Real files on disk so the glob observation is accurate.
    chapters = temp_dir / "chapters"
    chapters.mkdir()
    (chapters / "01-introduction.html").write_text(
        "<h1>Chapter 1: Introduction to Fortran</h1>\n"
    )
    (chapters / "02-setup.html").write_text(
        "<h1>Chapter 2: Setting Up Your Environment</h1>\n"
    )
    (temp_dir / "index.html").write_text("<ul></ul>\n")

    context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )
    context.session.current_task = (
        f"Update {temp_dir / 'index.html'} so the chapter links match the sibling files."
    )
    # Capture both steering channels so we can assert neither is used.
    persistent_messages: list[str] = []
    ephemeral_messages: list[str] = []
    context.queue_steering_message_callback = persistent_messages.append
    context.queue_ephemeral_steering_message_callback = ephemeral_messages.append
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))
    tool_call = ToolCall(
        id="glob-1",
        name="glob",
        arguments={"path": str(chapters), "pattern": "*.html"},
    )
    executor = FakeExecutor(
        [
            tool_outcome(
                tool_call=tool_call,
                output="\n".join(
                    [
                        str(chapters / "01-introduction.html"),
                        str(chapters / "02-setup.html"),
                    ]
                ),
                is_error=False,
            )
        ]
    )

    summary = TurnSummary(final_response="")
    await runner.execute_batch(
        tool_calls=[tool_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=summary,
        dod=create_definition_of_done("Fix the chapter links"),
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert persistent_messages == []
    assert ephemeral_messages == []
    assert len(summary.tool_result_messages) == 1
    assert "Verified chapter inventory:" not in summary.tool_result_messages[0].content
905
906
@pytest.mark.asyncio
async def test_tool_batch_runner_marks_validated_html_toc_completion_after_successful_edit(
    temp_dir: Path,
) -> None:
    """An edit that brings the TOC in sync should produce no previews or nudges.

    index.html already contains the corrected block on disk, so the successful
    edit validates cleanly: no semantic-verification preview is appended to the
    tool-result messages and no steering messages are queued.
    """

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run for this scenario")

    # Real chapter files whose headings match the new TOC entries.
    chapters = temp_dir / "chapters"
    chapters.mkdir()
    (chapters / "01-introduction.html").write_text(
        "<h1>Chapter 1: Introduction to Fortran</h1>\n"
    )
    (chapters / "02-setup.html").write_text(
        "<h1>Chapter 2: Setting Up Your Environment</h1>\n"
    )
    index_path = temp_dir / "index.html"
    old_block = (
        '<ul class="chapter-list">\n'
        ' <li><a href="chapters/01-old.html">Chapter 1: Old</a></li>\n'
        ' <li><a href="chapters/02-old.html">Chapter 2: Old</a></li>\n'
        "</ul>\n"
    )
    new_block = (
        '<ul class="chapter-list">\n'
        ' <li><a href="chapters/01-introduction.html">Chapter 1: Introduction to Fortran</a></li>\n'
        ' <li><a href="chapters/02-setup.html">Chapter 2: Setting Up Your Environment</a></li>\n'
        "</ul>\n"
    )
    # Disk already holds the corrected block, as it would after the edit ran.
    index_path.write_text(new_block)

    context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )
    context.session.current_task = (
        "Update index.html so every chapter link and title matches the real HTML files in chapters/."
    )
    # Capture both steering channels so we can assert neither is used.
    persistent_messages: list[str] = []
    ephemeral_messages: list[str] = []
    context.queue_steering_message_callback = persistent_messages.append
    context.queue_ephemeral_steering_message_callback = ephemeral_messages.append
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))
    tool_call = ToolCall(
        id="edit-1",
        name="edit",
        arguments={
            "file_path": str(index_path),
            "old_string": old_block,
            "new_string": new_block,
        },
    )
    executor = FakeExecutor(
        [
            tool_outcome(
                tool_call=tool_call,
                output=f"Successfully edited {index_path}",
                is_error=False,
            )
        ]
    )

    summary = TurnSummary(final_response="")
    await runner.execute_batch(
        tool_calls=[tool_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=summary,
        dod=create_definition_of_done(
            "Update index.html so every chapter link and title matches the real HTML files in chapters/."
        ),
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert all(
        "Semantic verification preview:" not in message.content
        for message in summary.tool_result_messages
    )
    assert persistent_messages == []
    assert ephemeral_messages == []
1007
1008
@pytest.mark.asyncio
async def test_tool_batch_runner_does_not_apply_html_toc_handoff_to_reference_read(
    temp_dir: Path,
) -> None:
    """A read of a reference index.html must not trigger the HTML TOC handoff.

    The task prompt only asks to *look at* an existing guide, so reading its
    index should queue no steering messages (persistent or ephemeral) and
    attach no semantic-verification preview to the tool result.
    """

    # Reasoning hooks must stay unused here; raising makes any
    # unexpected invocation fail the test loudly.
    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run for this scenario")

    # Fixture: two chapter files plus an index whose TOC links/titles
    # already match them exactly — nothing for a handoff to correct.
    chapters = temp_dir / "chapters"
    chapters.mkdir()
    (chapters / "01-introduction.html").write_text(
        "<h1>Chapter 1: Introduction to Fortran</h1>\n"
    )
    (chapters / "02-setup.html").write_text(
        "<h1>Chapter 2: Setting Up Your Environment</h1>\n"
    )
    index_path = temp_dir / "index.html"
    index_path.write_text(
        "<h2>Table of Contents</h2>\n"
        '<ul class="chapter-list">\n'
        '  <li><a href="chapters/01-introduction.html">Chapter 1: Introduction to Fortran</a></li>\n'
        '  <li><a href="chapters/02-setup.html">Chapter 2: Setting Up Your Environment</a></li>\n'
        "</ul>\n"
    )

    # Discovery-flavored prompt: the fortran guide is reference material
    # for a brand-new nginx guide, not an edit target.
    prompt = (
        "Have a look at ~/Loader/guides/fortran and chapters/ within. Get a feel "
        "for the structure and cadence of the guide. We are going to make an all "
        "new equally thorough guide on how to use the nginx tool."
    )

    context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )
    context.session.current_task = prompt  # type: ignore[attr-defined]
    # Capture both steering channels so we can assert they stay empty.
    persistent_messages: list[str] = []
    ephemeral_messages: list[str] = []
    context.queue_steering_message_callback = persistent_messages.append
    context.queue_ephemeral_steering_message_callback = ephemeral_messages.append
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))
    tool_call = ToolCall(
        id="read-index",
        name="read",
        arguments={"file_path": str(index_path)},
    )
    executor = FakeExecutor(
        [
            tool_outcome(
                tool_call=tool_call,
                output=index_path.read_text(),
                is_error=False,
            )
        ]
    )

    summary = TurnSummary(final_response="")
    await runner.execute_batch(
        tool_calls=[tool_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=summary,
        dod=create_definition_of_done(prompt),
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    # No steering of either kind, and no verification preview injected
    # into the tool result messages.
    assert persistent_messages == []
    assert ephemeral_messages == []
    assert all(
        "Semantic verification preview:" not in message.content
        for message in summary.tool_result_messages
    )
1101
1102
@pytest.mark.asyncio
async def test_tool_batch_runner_queues_next_pending_todo_after_discovery_progress(
    temp_dir: Path,
) -> None:
    """After a discovery read completes the first todo, the runner hands off.

    Reading the reference chapter should mark the "examine" todo complete,
    queue a persistent steering message pointing at the next pending todo
    (directory creation) with a concrete "Resume by creating" nudge drawn
    from the implementation plan, and never echo the reference file name.
    """

    # Reasoning hooks must stay unused; raising surfaces accidental calls.
    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run for this scenario")

    # Reference material exists on disk; the planned nginx artifacts do not.
    reference = temp_dir / "fortran" / "chapters" / "01-introduction.html"
    reference.parent.mkdir(parents=True)
    reference.write_text("<h1>Introduction</h1>\n<p>Guide cadence.</p>\n")
    nginx_root = temp_dir / "Loader" / "guides" / "nginx"
    chapters = nginx_root / "chapters"
    implementation_plan = temp_dir / "implementation.md"
    implementation_plan.write_text(
        "\n".join(
            [
                "# Implementation Plan",
                "",
                "## File Changes",
                f"- `{chapters}/`",
                f"- `{nginx_root / 'index.html'}`",
                "",
            ]
        )
    )

    context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )
    persistent_messages: list[str] = []
    ephemeral_messages: list[str] = []
    context.queue_steering_message_callback = persistent_messages.append
    context.queue_ephemeral_steering_message_callback = ephemeral_messages.append
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))
    dod = create_definition_of_done("Create an equally thorough nginx guide.")
    dod.implementation_plan = str(implementation_plan)
    # Three pending todos: one discovery step, then two creation steps.
    sync_todos_to_definition_of_done(
        dod,
        [
            {
                "content": "Examine the existing Fortran guide structure to understand the cadence and format",
                "active_form": "Working on: Examine the existing Fortran guide structure to understand the cadence and format",
                "status": "pending",
            },
            {
                "content": "Create the nginx directory structure",
                "active_form": "Working on: Create the nginx directory structure",
                "status": "pending",
            },
            {
                "content": "Create the nginx index.html file",
                "active_form": "Working on: Create the nginx index.html file",
                "status": "pending",
            },
        ],
    )
    tool_call = ToolCall(
        id="read-reference",
        name="read",
        arguments={"file_path": str(reference)},
    )
    executor = FakeExecutor(
        [
            tool_outcome(
                tool_call=tool_call,
                output="<h1>Introduction</h1>\n<p>Guide cadence.</p>\n",
                is_error=False,
            )
        ]
    )

    summary = TurnSummary(final_response="")
    await runner.execute_batch(
        tool_calls=[tool_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=summary,
        dod=dod,
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    # The discovery todo is marked complete and the persistent handoff
    # targets the next pending todo with a concrete resume artifact.
    assert (
        "Examine the existing Fortran guide structure to understand the cadence and format"
        in dod.completed_items
    )
    assert any(
        "Continue with the next pending item: `Create the nginx directory structure`"
        in message
        for message in persistent_messages
    )
    assert any(
        "Resume by creating `chapters/` now." in message
        for message in persistent_messages
    )
    # The reference file must not leak into steering; ephemeral stays empty.
    assert all("01-introduction.html" not in message for message in persistent_messages)
    assert ephemeral_messages == []
1221
1222
@pytest.mark.asyncio
async def test_tool_batch_runner_duplicate_reference_read_prefers_next_pending_todo(
    temp_dir: Path,
) -> None:
    """A duplicate reference read yields one handoff to the next pending todo.

    With the discovery todo already completed, a DUPLICATE read outcome should
    queue exactly one persistent message that tells the agent to reuse the
    earlier observation and continue with the next pending todo — not an
    "Update `…`" mutation nudge.
    """

    # Reasoning hooks must stay unused; raising surfaces accidental calls.
    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run for this scenario")

    reference = temp_dir / "fortran" / "index.html"
    reference.parent.mkdir(parents=True)
    reference.write_text("<h1>Fortran Beginner's Guide</h1>\n")

    # Seed history with the earlier read observation the duplicate refers to.
    messages = [
        Message(
            role=Role.TOOL,
            content=(
                "Observation [read]: Result: "
                "<h1>Fortran Beginner's Guide</h1>\n"
            ),
        )
    ]
    context = build_context(
        temp_dir=temp_dir,
        messages=messages,
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )
    prompt = (
        "Have a look at ~/Loader/guides/fortran and chapters/ within. Get a feel "
        "for the structure and cadence of the guide. We are going to make an all "
        "new equally thorough guide on how to use the nginx tool."
    )
    context.session.current_task = prompt  # type: ignore[attr-defined]
    persistent_messages: list[str] = []
    ephemeral_messages: list[str] = []
    context.queue_steering_message_callback = persistent_messages.append
    context.queue_ephemeral_steering_message_callback = ephemeral_messages.append
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))
    dod = create_definition_of_done(prompt)
    # Discovery is already done; two creation todos remain pending.
    sync_todos_to_definition_of_done(
        dod,
        [
            {
                "content": "Examine the existing Fortran guide structure to understand the cadence and format",
                "active_form": "Working on: Examine the existing Fortran guide structure to understand the cadence and format",
                "status": "completed",
            },
            {
                "content": "Create the nginx directory structure",
                "active_form": "Working on: Create the nginx directory structure",
                "status": "pending",
            },
            {
                "content": "Create the nginx index.html file",
                "active_form": "Working on: Create the nginx index.html file",
                "status": "pending",
            },
        ],
    )
    tool_call = ToolCall(
        id="read-dup",
        name="read",
        arguments={"file_path": str(reference)},
    )
    # Simulate the executor's duplicate-action short-circuit for the read.
    duplicate_message = (
        "[Skipped - duplicate action: Already read "
        f"{reference} recently without any intervening changes; "
        "reuse the earlier read result instead of rereading]"
    )
    executor = FakeExecutor(
        [
            ToolExecutionOutcome(
                tool_call=tool_call,
                state=ToolExecutionState.DUPLICATE,
                message=Message.tool_result_message(
                    tool_call_id=tool_call.id,
                    display_content=duplicate_message,
                    result_content=duplicate_message,
                ),
                event_content=duplicate_message,
                is_error=False,
                result_output=duplicate_message,
            )
        ]
    )

    summary = TurnSummary(final_response="")
    await runner.execute_batch(
        tool_calls=[tool_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=summary,
        dod=dod,
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    # Exactly one persistent handoff: reuse the observation, move to the
    # next pending todo; no "Update `…`" mutation nudge, no ephemeral noise.
    assert len(persistent_messages) == 1
    assert "Reuse the earlier observation instead of repeating it." in persistent_messages[0]
    assert (
        "Continue with the next pending item: `Create the nginx directory structure`"
        in persistent_messages[0]
    )
    assert "Update `" not in persistent_messages[0]
    assert ephemeral_messages == []
1345
1346
@pytest.mark.asyncio
async def test_tool_batch_runner_successful_reference_read_prioritizes_concrete_missing_artifact(
    temp_dir: Path,
) -> None:
    """The resume nudge names the first planned artifact still missing on disk.

    With some planned files already created (and tracked in touched_files),
    a successful reference read should confirm discovery progress and steer
    toward creating the missing `index.html` — a concrete artifact — rather
    than re-announcing the generic "Create each chapter file" todo.
    """

    # Reasoning hooks must stay unused; raising surfaces accidental calls.
    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run for this scenario")

    # Partially built target: chapter one exists, index.html does not.
    guide_root = temp_dir / "Loader" / "guides" / "nginx"
    chapters = guide_root / "chapters"
    chapters.mkdir(parents=True)
    chapter_one = chapters / "01-introduction.html"
    chapter_one.write_text("<html></html>\n")
    index_path = guide_root / "index.html"

    reference = temp_dir / "Loader" / "guides" / "fortran" / "chapters" / "01-introduction.html"
    reference.parent.mkdir(parents=True, exist_ok=True)
    reference.write_text("<h1>Introduction</h1>\n<p>Guide cadence.</p>\n")

    # The plan lists both existing and still-missing artifacts.
    implementation_plan = temp_dir / "implementation.md"
    implementation_plan.write_text(
        "\n".join(
            [
                "# Implementation Plan",
                "",
                "## File Changes",
                f"- `{guide_root}/`",
                f"- `{chapters}/`",
                f"- `{index_path}`",
                f"- `{chapter_one}`",
                f"- `{chapters / '02-installation.html'}`",
                "",
            ]
        )
    )

    context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )
    persistent_messages: list[str] = []
    ephemeral_messages: list[str] = []
    context.queue_steering_message_callback = persistent_messages.append
    context.queue_ephemeral_steering_message_callback = ephemeral_messages.append
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))
    dod = create_definition_of_done("Create a multi-file nginx guide.")
    dod.implementation_plan = str(implementation_plan)
    dod.touched_files.append(str(chapter_one))
    sync_todos_to_definition_of_done(
        dod,
        [
            {
                "content": "Examine the existing Fortran guide structure to understand the format and cadence",
                "active_form": "Working on: Examine the existing Fortran guide structure to understand the format and cadence",
                "status": "pending",
            },
            {
                "content": "Create each chapter file with appropriate content",
                "active_form": "Working on: Create each chapter file with appropriate content",
                "status": "pending",
            },
            {
                "content": "Ensure all files follow the same structure and style as the Fortran guide",
                "active_form": "Working on: Ensure all files follow the same structure and style as the Fortran guide",
                "status": "pending",
            },
        ],
    )
    tool_call = ToolCall(
        id="read-reference-chapter",
        name="read",
        arguments={"file_path": str(reference)},
    )
    read_output = "Observation [read]: Result: <h1>Introduction</h1>\n<p>Guide cadence.</p>\n"
    executor = FakeExecutor(
        [
            ToolExecutionOutcome(
                tool_call=tool_call,
                state=ToolExecutionState.EXECUTED,
                message=Message.tool_result_message(
                    tool_call_id=tool_call.id,
                    display_content=read_output,
                    result_content=read_output,
                ),
                event_content=read_output,
                is_error=False,
                result_output=read_output,
            )
        ]
    )

    summary = TurnSummary(final_response="")
    await runner.execute_batch(
        tool_calls=[tool_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=summary,
        dod=dod,
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    # Progress is confirmed and the nudge names the concrete missing file,
    # suppressing the generic next-pending-item phrasing.
    assert persistent_messages
    assert any(
        "Confirmed progress: `Examine the existing Fortran guide structure to understand the format and cadence`"
        in message
        for message in persistent_messages
    )
    assert any("Resume by creating `index.html` now." in message for message in persistent_messages)
    assert not any(
        "Continue with the next pending item: `Create each chapter file with appropriate content`"
        in message
        for message in persistent_messages
    )
    assert ephemeral_messages == []
1481
1482
@pytest.mark.asyncio
async def test_tool_batch_runner_duplicate_read_ignores_unplanned_expansion_after_plan_complete(
    temp_dir: Path,
) -> None:
    """Once all planned artifacts exist, unplanned expansion todos are skipped.

    Every file the implementation plan lists already exists on disk. A
    duplicate read should steer toward the verification todo and ignore the
    pending item that would create a chapter the plan never mentioned
    (07-performance-tuning.html).
    """

    # Reasoning hooks must stay unused; raising surfaces accidental calls.
    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should not run for this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run for this scenario")

    # Fully built target: index plus both planned chapters exist.
    guide_root = temp_dir / "guides" / "nginx"
    chapters = guide_root / "chapters"
    guide_root.mkdir(parents=True)
    chapters.mkdir()
    index_path = guide_root / "index.html"
    chapter_one = chapters / "01-getting-started.html"
    chapter_two = chapters / "02-installation.html"
    index_path.write_text("<html></html>\n")
    chapter_one.write_text("<h1>One</h1>\n")
    chapter_two.write_text("<h1>Two</h1>\n")

    implementation_plan = temp_dir / "implementation.md"
    implementation_plan.write_text(
        "\n".join(
            [
                "# Implementation Plan",
                "",
                "## File Changes",
                f"- `{guide_root}/`",
                f"- `{chapters}/`",
                f"- `{index_path}`",
                f"- `{chapter_one}`",
                f"- `{chapter_two}`",
                "",
            ]
        )
    )

    context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )
    persistent_messages: list[str] = []
    ephemeral_messages: list[str] = []
    context.queue_steering_message_callback = persistent_messages.append
    context.queue_ephemeral_steering_message_callback = ephemeral_messages.append
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))
    dod = create_definition_of_done("Create a multi-file nginx guide.")
    dod.implementation_plan = str(implementation_plan)
    # First item is unplanned scope expansion; second is legitimate verification.
    dod.pending_items = [
        "Create 07-performance-tuning.html",
        "Verify all guide files are linked and complete",
        "Complete the requested work",
    ]

    tool_call = ToolCall(
        id="read-dup",
        name="read",
        arguments={"file_path": str(chapter_one)},
    )
    # Simulate the executor's duplicate-action short-circuit for the read.
    duplicate_message = (
        "[Skipped - duplicate action: Already read "
        f"{chapter_one} recently without any intervening changes; "
        "reuse the earlier read result instead of rereading]"
    )
    executor = FakeExecutor(
        [
            ToolExecutionOutcome(
                tool_call=tool_call,
                state=ToolExecutionState.DUPLICATE,
                message=Message.tool_result_message(
                    tool_call_id=tool_call.id,
                    display_content=duplicate_message,
                    result_content=duplicate_message,
                ),
                event_content=duplicate_message,
                is_error=False,
                result_output=duplicate_message,
            )
        ]
    )

    summary = TurnSummary(final_response="")
    await runner.execute_batch(
        tool_calls=[tool_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=summary,
        dod=dod,
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    # Single handoff naming the verification todo; the unplanned chapter
    # never appears in the steering message.
    assert len(persistent_messages) == 1
    assert "Verify all guide files are linked and complete" in persistent_messages[0]
    assert "Create 07-performance-tuning.html" not in persistent_messages[0]
    assert ephemeral_messages == []
1597
1598
@pytest.mark.asyncio
async def test_tool_batch_runner_duplicate_read_after_plan_complete_pushes_verification_handoff(
    temp_dir: Path,
) -> None:
    """With the plan fully realized, a duplicate read pushes toward verification.

    All planned files exist and only unplanned/generic pending items remain,
    so the duplicate read should produce a single persistent message stating
    that all planned artifacts exist and directing the agent to verification
    or final confirmation — not to the unplanned chapter.
    """

    # Reasoning hooks must stay unused; raising surfaces accidental calls.
    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should not run for this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run for this scenario")

    # Fully built target: index plus both planned chapters exist.
    guide_root = temp_dir / "guides" / "nginx"
    chapters = guide_root / "chapters"
    guide_root.mkdir(parents=True)
    chapters.mkdir()
    index_path = guide_root / "index.html"
    chapter_one = chapters / "01-getting-started.html"
    chapter_two = chapters / "02-installation.html"
    index_path.write_text("<html></html>\n")
    chapter_one.write_text("<h1>One</h1>\n")
    chapter_two.write_text("<h1>Two</h1>\n")

    implementation_plan = temp_dir / "implementation.md"
    implementation_plan.write_text(
        "\n".join(
            [
                "# Implementation Plan",
                "",
                "## File Changes",
                f"- `{guide_root}/`",
                f"- `{chapters}/`",
                f"- `{index_path}`",
                f"- `{chapter_one}`",
                f"- `{chapter_two}`",
                "",
            ]
        )
    )

    context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )
    persistent_messages: list[str] = []
    ephemeral_messages: list[str] = []
    context.queue_steering_message_callback = persistent_messages.append
    context.queue_ephemeral_steering_message_callback = ephemeral_messages.append
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))
    dod = create_definition_of_done("Create a multi-file nginx guide.")
    dod.implementation_plan = str(implementation_plan)
    dod.verification_commands = [f"ls -la {guide_root}"]
    # Only an unplanned expansion and a generic wrap-up item remain pending.
    dod.pending_items = [
        "Create 07-performance-tuning.html",
        "Complete the requested work",
    ]

    tool_call = ToolCall(
        id="read-dup",
        name="read",
        arguments={"file_path": str(chapter_one)},
    )
    # Simulate the executor's duplicate-action short-circuit for the read.
    duplicate_message = (
        "[Skipped - duplicate action: Already read "
        f"{chapter_one} recently without any intervening changes; "
        "reuse the earlier read result instead of rereading]"
    )
    executor = FakeExecutor(
        [
            ToolExecutionOutcome(
                tool_call=tool_call,
                state=ToolExecutionState.DUPLICATE,
                message=Message.tool_result_message(
                    tool_call_id=tool_call.id,
                    display_content=duplicate_message,
                    result_content=duplicate_message,
                ),
                event_content=duplicate_message,
                is_error=False,
                result_output=duplicate_message,
            )
        ]
    )

    summary = TurnSummary(final_response="")
    await runner.execute_batch(
        tool_calls=[tool_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=summary,
        dod=dod,
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    # Single verification handoff; the unplanned chapter is never mentioned.
    assert len(persistent_messages) == 1
    assert "All explicitly planned artifacts already exist." in persistent_messages[0]
    assert (
        "Move to verification or final confirmation using the files already on disk."
        in persistent_messages[0]
    )
    assert "Create 07-performance-tuning.html" not in persistent_messages[0]
    assert ephemeral_messages == []
1717
1718
@pytest.mark.asyncio
async def test_tool_batch_runner_duplicate_read_after_plan_complete_ignores_stale_creation_todos(
    temp_dir: Path,
) -> None:
    """Stale "create X" todos for files that already exist are not re-surfaced.

    The pending list still contains creation items for chapters that are
    already on disk. After a duplicate read, the verification handoff must
    not repeat those stale creation todos.
    """

    # Reasoning hooks must stay unused; raising surfaces accidental calls.
    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should not run for this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run for this scenario")

    # Fully built target: index plus both planned chapters exist.
    guide_root = temp_dir / "guides" / "nginx"
    chapters = guide_root / "chapters"
    guide_root.mkdir(parents=True)
    chapters.mkdir()
    index_path = guide_root / "index.html"
    chapter_one = chapters / "01-getting-started.html"
    chapter_two = chapters / "02-installation.html"
    index_path.write_text("<html></html>\n")
    chapter_one.write_text("<h1>One</h1>\n")
    chapter_two.write_text("<h1>Two</h1>\n")

    implementation_plan = temp_dir / "implementation.md"
    implementation_plan.write_text(
        "\n".join(
            [
                "# Implementation Plan",
                "",
                "## File Changes",
                f"- `{guide_root}/`",
                f"- `{chapters}/`",
                f"- `{index_path}`",
                f"- `{chapter_one}`",
                f"- `{chapter_two}`",
                "",
            ]
        )
    )

    context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )
    persistent_messages: list[str] = []
    ephemeral_messages: list[str] = []
    context.queue_steering_message_callback = persistent_messages.append
    context.queue_ephemeral_steering_message_callback = ephemeral_messages.append
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))
    dod = create_definition_of_done("Create a multi-file nginx guide.")
    dod.implementation_plan = str(implementation_plan)
    dod.verification_commands = [f"ls -la {guide_root}"]
    # Stale creation todos (both files already exist) plus a generic item.
    dod.pending_items = [
        "Create 01-getting-started.html",
        "Creating 02-installation.html",
        "Complete the requested work",
    ]

    tool_call = ToolCall(
        id="read-dup-built-stale",
        name="read",
        arguments={"file_path": str(chapter_one)},
    )
    # Simulate the executor's duplicate-action short-circuit for the read.
    duplicate_message = (
        "[Skipped - duplicate action: Already read "
        f"{chapter_one} recently without any intervening changes; "
        "reuse the earlier read result instead of rereading]"
    )
    executor = FakeExecutor(
        [
            ToolExecutionOutcome(
                tool_call=tool_call,
                state=ToolExecutionState.DUPLICATE,
                message=Message.tool_result_message(
                    tool_call_id=tool_call.id,
                    display_content=duplicate_message,
                    result_content=duplicate_message,
                ),
                event_content=duplicate_message,
                is_error=False,
                result_output=duplicate_message,
            )
        ]
    )

    summary = TurnSummary(final_response="")
    await runner.execute_batch(
        tool_calls=[tool_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=summary,
        dod=dod,
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    # Verification handoff only; neither stale creation todo is repeated.
    assert len(persistent_messages) == 1
    assert "All explicitly planned artifacts already exist." in persistent_messages[0]
    assert (
        "Move to verification or final confirmation using the files already on disk."
        in persistent_messages[0]
    )
    assert "Create 01-getting-started.html" not in persistent_messages[0]
    assert "Creating 02-installation.html" not in persistent_messages[0]
    assert ephemeral_messages == []
1839
1840
@pytest.mark.asyncio
async def test_tool_batch_runner_observation_handoff_pushes_mutation_step(
    temp_dir: Path,
) -> None:
    """After enough observation, the handoff urges an actual mutation.

    When the only remaining todo after the discovery read is a creation
    step, the persistent steering should both name that next pending item
    and explicitly tell the agent to stop gathering reference material.
    """

    # Reasoning hooks must stay unused; raising surfaces accidental calls.
    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run for this scenario")

    reference = temp_dir / "fortran" / "chapters" / "01-introduction.html"
    reference.parent.mkdir(parents=True)
    reference.write_text("<h1>Introduction</h1>\n<p>Guide cadence.</p>\n")

    context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )
    persistent_messages: list[str] = []
    ephemeral_messages: list[str] = []
    context.queue_steering_message_callback = persistent_messages.append
    context.queue_ephemeral_steering_message_callback = ephemeral_messages.append
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))
    dod = create_definition_of_done("Create a multi-file nginx guide.")
    # One discovery todo followed directly by a creation (mutation) todo.
    sync_todos_to_definition_of_done(
        dod,
        [
            {
                "content": "Examine the existing Fortran guide structure to understand the cadence and format",
                "active_form": "Working on: Examine the existing Fortran guide structure to understand the cadence and format",
                "status": "pending",
            },
            {
                "content": "Create the nginx index.html file",
                "active_form": "Working on: Create the nginx index.html file",
                "status": "pending",
            },
        ],
    )
    tool_call = ToolCall(
        id="read-reference",
        name="read",
        arguments={"file_path": str(reference)},
    )
    executor = FakeExecutor(
        [
            tool_outcome(
                tool_call=tool_call,
                output="<h1>Introduction</h1>\n<p>Guide cadence.</p>\n",
                is_error=False,
            )
        ]
    )

    summary = TurnSummary(final_response="")
    await runner.execute_batch(
        tool_calls=[tool_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=summary,
        dod=dod,
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    # Handoff names the creation todo AND pushes the agent to mutate now.
    assert any(
        "Continue with the next pending item: `Create the nginx index.html file`"
        in message
        for message in persistent_messages
    )
    assert any(
        "stop gathering more reference material and perform the change now" in message
        for message in persistent_messages
    )
    assert ephemeral_messages == []
1933
1934
@pytest.mark.asyncio
async def test_tool_batch_runner_discovery_completion_handoff_stays_persistent(
    temp_dir: Path,
) -> None:
    """The discovery-completion handoff goes on the persistent channel.

    Completing the "examine" todo via a reference read should queue the
    next-pending-item message as persistent steering (not ephemeral), so
    it survives into subsequent turns.
    """

    # Reasoning hooks must stay unused; raising surfaces accidental calls.
    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run for this scenario")

    reference = temp_dir / "fortran" / "chapters" / "01-introduction.html"
    reference.parent.mkdir(parents=True)
    reference.write_text("<h1>Introduction</h1>\n<p>Guide cadence.</p>\n")

    context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )
    persistent_messages: list[str] = []
    ephemeral_messages: list[str] = []
    context.queue_steering_message_callback = persistent_messages.append
    context.queue_ephemeral_steering_message_callback = ephemeral_messages.append
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))
    dod = create_definition_of_done("Create a multi-file nginx guide.")
    sync_todos_to_definition_of_done(
        dod,
        [
            {
                "content": "First, examine the existing fortran guide structure and content",
                "active_form": "Working on: First, examine the existing fortran guide structure and content",
                "status": "pending",
            },
            {
                "content": "Create the nginx directory structure",
                "active_form": "Working on: Create the nginx directory structure",
                "status": "pending",
            },
        ],
    )
    tool_call = ToolCall(
        id="read-reference",
        name="read",
        arguments={"file_path": str(reference)},
    )
    executor = FakeExecutor(
        [
            tool_outcome(
                tool_call=tool_call,
                output="<h1>Introduction</h1>\n<p>Guide cadence.</p>\n",
                is_error=False,
            )
        ]
    )

    summary = TurnSummary(final_response="")
    await runner.execute_batch(
        tool_calls=[tool_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=summary,
        dod=dod,
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    # The handoff landed on the persistent channel; ephemeral stays empty.
    assert persistent_messages
    assert any(
        "Continue with the next pending item: `Create the nginx directory structure`"
        in message
        for message in persistent_messages
    )
    assert ephemeral_messages == []
2024
2025
@pytest.mark.asyncio
async def test_tool_batch_runner_missing_artifact_nudge_prefers_pending_index_after_mkdir(
    temp_dir: Path,
) -> None:
    """After a successful `mkdir` satisfies the first todo, the runner should
    queue a persistent nudge targeting the next planned artifact
    (`index.html`) by name, without the generic missing-artifact wording and
    without using the ephemeral steering channel."""

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        # Guard: this scenario must never reach confidence scoring.
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        # Guard: this scenario must never reach action verification.
        raise AssertionError("Verification should not run for this scenario")

    nginx_root = temp_dir / "Loader" / "guides" / "nginx"
    chapters = nginx_root / "chapters"
    implementation_plan = temp_dir / "implementation.md"
    # Plan declares a chapters/ directory and an index.html artifact;
    # neither exists on disk yet.
    implementation_plan.write_text(
        "\n".join(
            [
                "# Implementation Plan",
                "",
                "## File Changes",
                f"- `{chapters}/`",
                f"- `{nginx_root / 'index.html'}`",
                "",
            ]
        )
    )

    context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )
    # Capture both steering channels so the test can assert which one is used.
    persistent_messages: list[str] = []
    ephemeral_messages: list[str] = []
    context.queue_steering_message_callback = persistent_messages.append
    context.queue_ephemeral_steering_message_callback = ephemeral_messages.append
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))
    dod = create_definition_of_done("Create a multi-file nginx guide.")
    dod.implementation_plan = str(implementation_plan)
    sync_todos_to_definition_of_done(
        dod,
        [
            {
                "content": "Create the nginx directory structure",
                "active_form": "Creating the nginx directory structure",
                "status": "pending",
            },
            {
                "content": "Develop the main index.html file with proper structure",
                "active_form": "Developing the main index.html file with proper structure",
                "status": "pending",
            },
        ],
    )

    # A bash mkdir creates the chapters directory, completing the first todo.
    tool_call = ToolCall(
        id="mkdir-nginx",
        name="bash",
        arguments={"command": f"mkdir -p {chapters}"},
    )
    executor = FakeExecutor(
        [
            tool_outcome(
                tool_call=tool_call,
                output="",
                is_error=False,
            )
        ]
    )

    summary = TurnSummary(final_response="")
    await runner.execute_batch(
        tool_calls=[tool_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=summary,
        dod=dod,
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    # Nudge must be persistent, name the concrete next artifact, and skip
    # the generic fallback phrasings.
    assert persistent_messages
    message = persistent_messages[-1]
    assert "Next step: create `index.html`." in message
    assert (
        f"Prefer one `write(file_path=..., content=...)` call for `{(nginx_root / 'index.html').resolve(strict=False)}` now."
        in message
    )
    assert "One declared output artifact is still missing." not in message
    assert "Do not reread reference material or spend the next turn on bookkeeping." in message
    assert "Resume by creating the next output file under `chapters/` now." not in message
    assert ephemeral_messages == []
2133
2134
@pytest.mark.asyncio
async def test_tool_batch_runner_first_file_handoff_stays_persistent(
    temp_dir: Path,
) -> None:
    """Writing the first planned file should queue a persistent handoff that
    names the next missing chapter artifact; nothing may be sent on the
    ephemeral steering channel."""

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        # Guard: this scenario must never reach confidence scoring.
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        # Guard: this scenario must never reach action verification.
        raise AssertionError("Verification should not run for this scenario")

    nginx_root = temp_dir / "guides" / "nginx"
    chapters = nginx_root / "chapters"
    chapters.mkdir(parents=True)
    index_path = nginx_root / "index.html"

    implementation_plan = temp_dir / "implementation.md"
    # Plan declares index.html (written by the tool call below) and a first
    # chapter file that stays missing, so the handoff should point there.
    implementation_plan.write_text(
        "\n".join(
            [
                "# Implementation Plan",
                "",
                "## File Changes",
                f"- `{chapters}/`",
                f"- `{index_path}`",
                f"- `{chapters / '01-introduction.html'}`",
                "",
            ]
        )
    )

    context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )
    # Capture both steering channels so the test can assert which one is used.
    persistent_messages: list[str] = []
    ephemeral_messages: list[str] = []
    context.queue_steering_message_callback = persistent_messages.append
    context.queue_ephemeral_steering_message_callback = ephemeral_messages.append
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))
    dod = create_definition_of_done("Create a multi-file nginx guide.")
    dod.implementation_plan = str(implementation_plan)
    sync_todos_to_definition_of_done(
        dod,
        [
            {
                "content": "Create the main index.html file with proper structure",
                "active_form": "Creating the main index.html file with proper structure",
                "status": "pending",
            },
            {
                "content": "Create each chapter file with appropriate content",
                "active_form": "Creating each chapter file with appropriate content",
                "status": "pending",
            },
        ],
    )

    # Successful write of the first planned output file.
    tool_call = ToolCall(
        id="write-index",
        name="write",
        arguments={
            "file_path": str(index_path),
            "content": "<html></html>\n",
        },
    )
    executor = FakeExecutor(
        [
            tool_outcome(
                tool_call=tool_call,
                output=f"Successfully wrote 14 bytes to {index_path}",
                is_error=False,
            )
        ]
    )

    summary = TurnSummary(final_response="")
    await runner.execute_batch(
        tool_calls=[tool_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=summary,
        dod=dod,
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    # Handoff must acknowledge progress, name the next chapter file, and
    # stay on the persistent channel only.
    assert persistent_messages
    message = persistent_messages[-1]
    assert "Confirmed progress:" in message
    assert "Resume by creating `01-introduction.html` now." in message
    assert (
        f"Prefer one `write` call for `{(chapters / '01-introduction.html').resolve(strict=False)}` "
        "instead of more rereads."
        in message
    )
    assert "Do not move to verification, final confirmation, or TodoWrite-only bookkeeping" in message
    assert ephemeral_messages == []
2249
2250
@pytest.mark.asyncio
async def test_duplicate_observation_nudge_prioritizes_missing_artifact_over_review(
    temp_dir: Path,
) -> None:
    """A duplicate read of an already-seen file should nudge toward the still
    missing planned chapter rather than the pending review-style todo."""

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        # This scenario must never reach confidence scoring.
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        # This scenario must never reach verification.
        raise AssertionError("Verification should not run for this scenario")

    # Lay out a guide tree where one planned chapter is still absent.
    nginx_dir = temp_dir / "guides" / "nginx"
    chapter_dir = nginx_dir / "chapters"
    chapter_dir.mkdir(parents=True)
    landing_page = nginx_dir / "index.html"
    first_chapter = chapter_dir / "01-getting-started.html"
    landing_page.write_text("<a href=\"chapters/01-getting-started.html\">One</a>\n")
    first_chapter.write_text("<h1>One</h1>\n")

    # The implementation plan also names 06-ssl-configuration.html, which
    # was never written.
    plan_path = temp_dir / "implementation.md"
    plan_lines = [
        "# Implementation Plan",
        "",
        "## File Changes",
        f"- `{landing_page}`",
        f"- `{first_chapter}`",
        f"- `{chapter_dir / '06-ssl-configuration.html'}`",
        "",
    ]
    plan_path.write_text("\n".join(plan_lines))

    ctx = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )
    captured: list[str] = []
    ctx.queue_steering_message_callback = captured.append
    batch_runner = ToolBatchRunner(ctx, DefinitionOfDoneStore(temp_dir))
    dod = create_definition_of_done("Create a multi-file nginx guide.")
    dod.implementation_plan = str(plan_path)
    sync_todos_to_definition_of_done(
        dod,
        [
            {
                "content": item,
                "active_form": f"Working on: {item}",
                "status": "pending",
            }
            for item in (
                "Ensure all files are properly linked and formatted consistently",
                "Create the final chapter (06-ssl-configuration.html)",
            )
        ],
    )
    # Sanity-check the helper directly: the missing chapter must outrank
    # the review-style first pending todo.
    assert tool_batches_should_prioritize_missing_artifact(
        dod=dod,
        next_pending=dod.pending_items[0],
        missing_artifact=(chapter_dir / "06-ssl-configuration.html", False),
        project_root=temp_dir,
    )

    duplicate_read = ToolCall(
        id="dup-read",
        name="read",
        arguments={"file_path": str(landing_page)},
    )
    batch_runner._queue_duplicate_observation_nudge(duplicate_read, dod=dod)  # type: ignore[attr-defined]

    assert captured
    latest = captured[-1]
    assert "06-ssl-configuration.html" in latest
    assert "Do not switch into review or consistency-check mode" in latest
    assert (
        "Continue with the next pending item: `Ensure all files are properly linked and formatted consistently`"
        not in latest
    )
2343
2344
@pytest.mark.asyncio
async def test_tool_batch_runner_hands_off_to_verification_once_planned_artifacts_exist(
    temp_dir: Path,
) -> None:
    """Once every artifact listed in the implementation plan exists on disk,
    the runner should announce that the artifact phase is complete and steer
    toward the pending consistency/verification todo."""

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        # Guard: this scenario must never reach confidence scoring.
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        # Guard: this scenario must never reach action verification.
        raise AssertionError("Verification should not run for this scenario")

    # All planned files already exist before the final write lands.
    guide_root = temp_dir / "guides" / "nginx"
    chapters = guide_root / "chapters"
    chapters.mkdir(parents=True)
    index_path = guide_root / "index.html"
    chapter_one = chapters / "01-getting-started.html"
    chapter_two = chapters / "02-installation.html"
    index_path.write_text("<a href=\"chapters/01-getting-started.html\">One</a>\n")
    chapter_one.write_text("<h1>One</h1>\n")
    chapter_two.write_text("<h1>Two</h1>\n")

    implementation_plan = temp_dir / "implementation.md"
    implementation_plan.write_text(
        "\n".join(
            [
                "# Implementation Plan",
                "",
                "## File Changes",
                f"- `{chapters}/`",
                f"- `{index_path}`",
                f"- `{chapter_one}`",
                f"- `{chapter_two}`",
                "",
            ]
        )
    )

    context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )
    queued_messages: list[str] = []
    context.queue_steering_message_callback = queued_messages.append
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))
    dod = create_definition_of_done("Create a multi-file nginx guide.")
    dod.implementation_plan = str(implementation_plan)
    sync_todos_to_definition_of_done(
        dod,
        [
            {
                "content": "Create the guide files",
                "active_form": "Working on: Create the guide files",
                "status": "completed",
            },
            {
                "content": "Ensure all files are properly linked and formatted consistently",
                "active_form": "Working on: Ensure all files are properly linked and formatted consistently",
                "status": "pending",
            },
        ],
    )
    # The final planned artifact is (re)written by this batch.
    tool_call = ToolCall(
        id="write-final",
        name="write",
        arguments={
            "file_path": str(chapter_two),
            "content": "<h1>Two</h1>\n",
        },
    )
    executor = FakeExecutor(
        [
            tool_outcome(
                tool_call=tool_call,
                output=f"Successfully wrote {chapter_two}",
                is_error=False,
            )
        ]
    )

    summary = TurnSummary(final_response="")
    await runner.execute_batch(
        tool_calls=[tool_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=summary,
        dod=dod,
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    # Handoff must announce completion and point at the pending review todo.
    assert any(
        "All explicitly planned artifacts now exist." in message
        for message in queued_messages
    )
    assert any(
        "Ensure all files are properly linked and formatted consistently" in message
        for message in queued_messages
    )
    assert any(
        "Move to verification once no specific mismatch remains." in message
        for message in queued_messages
    )
2463
2464
@pytest.mark.asyncio
async def test_tool_batch_runner_mutation_handoff_points_at_next_missing_artifact(
    temp_dir: Path,
) -> None:
    """After a mutating write, with two planned chapters still missing, the
    handoff should name the next missing artifact and forbid verification or
    bookkeeping-only turns."""

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        # Guard: this scenario must never reach confidence scoring.
        raise AssertionError("Confidence scoring should not run in this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        # Guard: this scenario must never reach action verification.
        raise AssertionError("Verification should not run in this scenario")

    # index.html exists; chapter_one and chapter_two are declared but absent.
    guide_root = temp_dir / "guides" / "nginx"
    chapters = guide_root / "chapters"
    guide_root.mkdir(parents=True)
    chapters.mkdir()
    index_path = guide_root / "index.html"
    index_path.write_text("<html></html>\n")
    chapter_one = chapters / "01-getting-started.html"
    chapter_two = chapters / "02-installation.html"
    implementation_plan = temp_dir / "implementation.md"
    implementation_plan.write_text(
        "\n".join(
            [
                "# Implementation Plan",
                "",
                "## File Changes",
                f"- `{guide_root}/`",
                f"- `{index_path}`",
                f"- `{chapter_one}`",
                f"- `{chapter_two}`",
                "",
            ]
        )
    )

    context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )
    queued_messages: list[str] = []
    context.queue_steering_message_callback = queued_messages.append
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))
    dod = create_definition_of_done("Create a multi-file nginx guide.")
    dod.implementation_plan = str(implementation_plan)
    sync_todos_to_definition_of_done(
        dod,
        [
            {
                "content": "Create the main index.html file with proper structure",
                "active_form": "Working on: Create the main index.html file with proper structure",
                "status": "pending",
            },
            {
                "content": "Create each chapter file in sequence, following the established pattern",
                "active_form": "Working on: Create each chapter file in sequence, following the established pattern",
                "status": "pending",
            },
            {
                "content": "Ensure all files are properly linked and formatted consistently",
                "active_form": "Working on: Ensure all files are properly linked and formatted consistently",
                "status": "pending",
            },
        ],
    )
    tool_call = ToolCall(
        id="write-index",
        name="write",
        arguments={"file_path": str(index_path), "content": "<html></html>\n"},
    )
    executor = FakeExecutor(
        [tool_outcome(tool_call=tool_call, output=f"Successfully wrote {index_path}", is_error=False)]
    )

    summary = TurnSummary(final_response="")
    await runner.execute_batch(
        tool_calls=[tool_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=summary,
        dod=dod,
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    # Handoff must name the first missing chapter and keep the model in
    # artifact-producing mode.
    assert queued_messages
    message = queued_messages[-1]
    assert "Resume by creating `01-getting-started.html` now." in message
    assert "refresh `TodoWrite`" in message
    assert "Do not move to verification, final confirmation, or TodoWrite-only bookkeeping" in message
    assert "Do not spend another turn on working notes or rediscovery alone." in message
2571
2572
@pytest.mark.asyncio
async def test_tool_batch_runner_large_plan_does_not_claim_completion_early(
    temp_dir: Path,
) -> None:
    """With a seven-chapter plan of which only five chapters exist, writing
    chapter 05 must steer toward chapter 06 and must NOT claim all planned
    artifacts exist."""

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        # Guard: this scenario must never reach confidence scoring.
        raise AssertionError("Confidence scoring should not run in this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        # Guard: this scenario must never reach action verification.
        raise AssertionError("Verification should not run in this scenario")

    guide_root = temp_dir / "guides" / "nginx"
    chapters = guide_root / "chapters"
    guide_root.mkdir(parents=True)
    chapters.mkdir()
    index_path = guide_root / "index.html"
    index_path.write_text("<html></html>\n")

    # Seven planned chapters; only the first five end up on disk
    # (chapters 06 and 07 remain missing).
    chapter_paths = [
        chapters / "01-getting-started.html",
        chapters / "02-installation.html",
        chapters / "03-first-website.html",
        chapters / "04-configuration-basics.html",
        chapters / "05-advanced-configurations.html",
        chapters / "06-performance-tuning.html",
        chapters / "07-security-best-practices.html",
    ]
    for chapter in chapter_paths[:4]:
        chapter.write_text(f"<h1>{chapter.stem}</h1>\n")
    chapter_paths[4].write_text("<h1>Advanced configurations</h1>\n")

    implementation_plan = temp_dir / "implementation.md"
    implementation_plan.write_text(
        "\n".join(
            [
                "# Implementation Plan",
                "",
                "## File Changes",
                f"- `{guide_root}/`",
                f"- `{chapters}/`",
                f"- `{index_path}`",
                *[f"- `{path}`" for path in chapter_paths],
                "",
            ]
        )
    )

    context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )
    queued_messages: list[str] = []
    context.queue_steering_message_callback = queued_messages.append
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))
    dod = create_definition_of_done("Create a thorough nginx guide.")
    dod.implementation_plan = str(implementation_plan)
    sync_todos_to_definition_of_done(
        dod,
        [
            {
                "content": "Create the nginx guide artifacts",
                "active_form": "Creating nginx guide artifacts",
                "status": "pending",
            },
            {
                "content": "Verify all guide files are linked and complete",
                "active_form": "Verifying guide linkage and completeness",
                "status": "pending",
            },
        ],
    )
    tool_call = ToolCall(
        id="write-chapter-05",
        name="write",
        arguments={
            "file_path": str(chapter_paths[4]),
            "content": "<h1>Advanced configurations</h1>\n",
        },
    )
    executor = FakeExecutor(
        [
            tool_outcome(
                tool_call=tool_call,
                output=f"Successfully wrote {chapter_paths[4]}",
                is_error=False,
            )
        ]
    )

    summary = TurnSummary(final_response="")
    await runner.execute_batch(
        tool_calls=[tool_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=summary,
        dod=dod,
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    # Must point at the next missing chapter, never at early completion.
    assert any(
        "Resume by creating `06-performance-tuning.html` now." in message
        for message in queued_messages
    )
    assert not any(
        "All explicitly planned artifacts now exist." in message
        for message in queued_messages
    )
2697
2698
@pytest.mark.asyncio
async def test_tool_batch_runner_uses_compact_missing_artifact_nudge_after_substantial_progress(
    temp_dir: Path,
) -> None:
    """With substantial progress recorded (touched files + completed items),
    the missing-artifact nudge should use the compact wording and drop the
    TodoWrite-refresh suggestion."""

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        # Guard: this scenario must never reach confidence scoring.
        raise AssertionError("Confidence scoring should not run in this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        # Guard: this scenario must never reach action verification.
        raise AssertionError("Verification should not run in this scenario")

    guide_root = temp_dir / "guides" / "nginx"
    chapters = guide_root / "chapters"
    guide_root.mkdir(parents=True)
    chapters.mkdir()
    index_path = guide_root / "index.html"
    # Five planned chapters; the last one (05) never gets written.
    chapter_paths = [
        chapters / "01-introduction.html",
        chapters / "02-installation.html",
        chapters / "03-configuration.html",
        chapters / "04-basic-usage.html",
        chapters / "05-advanced-features.html",
    ]
    for path in (index_path, *chapter_paths[:4]):
        path.write_text("<html></html>\n")

    implementation_plan = temp_dir / "implementation.md"
    implementation_plan.write_text(
        "\n".join(
            [
                "# Implementation Plan",
                "",
                "## File Changes",
                f"- `{guide_root}/`",
                f"- `{chapters}/`",
                f"- `{index_path}`",
                *[f"- `{path}`" for path in chapter_paths],
                "",
            ]
        )
    )

    context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )
    queued_messages: list[str] = []
    context.queue_steering_message_callback = queued_messages.append
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))
    dod = create_definition_of_done("Create a thorough nginx guide.")
    dod.implementation_plan = str(implementation_plan)
    # Seed the DoD with prior progress so the compact nudge path is taken.
    dod.touched_files.extend(str(path) for path in (index_path, *chapter_paths[:4]))
    dod.completed_items.extend(
        [
            "Create the nginx directory structure",
            "Create the main index.html file with proper structure",
        ]
    )
    sync_todos_to_definition_of_done(
        dod,
        [
            {
                "content": "Create each chapter file with appropriate content",
                "active_form": "Creating each chapter file with appropriate content",
                "status": "pending",
            }
        ],
    )
    tool_call = ToolCall(
        id="write-chapter-04",
        name="write",
        arguments={
            "file_path": str(chapter_paths[3]),
            "content": "<html>updated</html>\n",
        },
    )
    executor = FakeExecutor(
        [
            tool_outcome(
                tool_call=tool_call,
                output=f"Successfully wrote {chapter_paths[3]}",
                is_error=False,
            )
        ]
    )

    summary = TurnSummary(final_response="")
    await runner.execute_batch(
        tool_calls=[tool_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=summary,
        dod=dod,
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    # Compact nudge: names the missing artifact, omits TodoWrite refresh.
    assert queued_messages
    message = queued_messages[-1]
    assert "Resume by creating `05-advanced-features.html` now." in message
    assert "No TodoWrite, no verification, no rereads until that artifact exists." in message
    assert "refresh `TodoWrite`" not in message
2817
2818
@pytest.mark.asyncio
async def test_tool_batch_runner_todowrite_with_missing_artifact_requeues_exact_resume_step(
    temp_dir: Path,
) -> None:
    """A TodoWrite-only batch, while a declared artifact is still missing,
    should queue a message that acknowledges the todo update but requeues the
    exact next file-creation step."""

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        # Guard: this scenario must never reach confidence scoring.
        raise AssertionError("Confidence scoring should not run in this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        # Guard: this scenario must never reach action verification.
        raise AssertionError("Verification should not run in this scenario")

    # chapter_two is declared in the plan but never written.
    guide_root = temp_dir / "guides" / "nginx"
    chapters = guide_root / "chapters"
    guide_root.mkdir(parents=True)
    chapters.mkdir()
    index_path = guide_root / "index.html"
    index_path.write_text("<html></html>\n")
    chapter_one = chapters / "01-getting-started.html"
    chapter_two = chapters / "02-installation.html"
    chapter_one.write_text("<h1>One</h1>\n")

    implementation_plan = temp_dir / "implementation.md"
    implementation_plan.write_text(
        "\n".join(
            [
                "# Implementation Plan",
                "",
                "## File Changes",
                f"- `{guide_root}/`",
                f"- `{chapters}/`",
                f"- `{index_path}`",
                f"- `{chapter_one}`",
                f"- `{chapter_two}`",
                "",
            ]
        )
    )

    context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )
    queued_messages: list[str] = []
    context.queue_steering_message_callback = queued_messages.append
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))
    dod = create_definition_of_done("Create a multi-file nginx guide.")
    dod.implementation_plan = str(implementation_plan)
    sync_todos_to_definition_of_done(
        dod,
        [
            {
                "content": "Create 01-getting-started.html",
                "active_form": "Creating 01-getting-started.html",
                "status": "completed",
            },
            {
                "content": "Create 02-installation.html",
                "active_form": "Creating 02-installation.html",
                "status": "pending",
            },
        ],
    )
    dod.touched_files.extend([str(index_path), str(chapter_one)])

    # Batch contains only a TodoWrite call — no file mutation.
    tool_call = ToolCall(
        id="todo-only",
        name="TodoWrite",
        arguments={
            "todos": [
                {
                    "content": "Create 01-getting-started.html",
                    "active_form": "Creating 01-getting-started.html",
                    "status": "completed",
                },
                {
                    "content": "Create 02-installation.html",
                    "active_form": "Creating 02-installation.html",
                    "status": "pending",
                },
            ]
        },
    )
    executor = FakeExecutor(
        [
            tool_outcome(
                tool_call=tool_call,
                output="Todos updated",
                is_error=False,
                metadata={
                    "new_todos": [
                        {
                            "content": "Create 01-getting-started.html",
                            "active_form": "Creating 01-getting-started.html",
                            "status": "completed",
                        },
                        {
                            "content": "Create 02-installation.html",
                            "active_form": "Creating 02-installation.html",
                            "status": "pending",
                        },
                    ]
                },
            )
        ]
    )

    summary = TurnSummary(final_response="")
    await runner.execute_batch(
        tool_calls=[tool_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=summary,
        dod=dod,
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    # Nudge must re-anchor on the missing artifact, not the bookkeeping.
    assert queued_messages
    message = queued_messages[-1]
    assert "Todo tracking is updated. A declared output artifact is still missing." in message
    assert "Resume by creating `02-installation.html` now." in message
    assert "refresh `TodoWrite`" in message
    assert "Do not spend the next turn on TodoWrite alone" in message
2958
2959
@pytest.mark.asyncio
async def test_tool_batch_runner_todowrite_after_artifacts_exist_pushes_verification_handoff(
    temp_dir: Path,
) -> None:
    """A TodoWrite-only batch, after every planned artifact exists, should
    push a verification handoff that names the verify todo — and must not
    echo the stale reference-reading todo back to the model."""

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        # Guard: this scenario must never reach confidence scoring.
        raise AssertionError("Confidence scoring should not run in this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        # Guard: this scenario must never reach action verification.
        raise AssertionError("Verification should not run in this scenario")

    # Every planned file already exists on disk.
    guide_root = temp_dir / "guides" / "nginx"
    chapters = guide_root / "chapters"
    guide_root.mkdir(parents=True)
    chapters.mkdir()
    index_path = guide_root / "index.html"
    chapter_one = chapters / "01-getting-started.html"
    chapter_two = chapters / "02-installation.html"
    index_path.write_text("<html></html>\n")
    chapter_one.write_text("<h1>One</h1>\n")
    chapter_two.write_text("<h1>Two</h1>\n")

    implementation_plan = temp_dir / "implementation.md"
    implementation_plan.write_text(
        "\n".join(
            [
                "# Implementation Plan",
                "",
                "## File Changes",
                f"- `{guide_root}/`",
                f"- `{chapters}/`",
                f"- `{index_path}`",
                f"- `{chapter_one}`",
                f"- `{chapter_two}`",
                "",
            ]
        )
    )

    context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )
    queued_messages: list[str] = []
    context.queue_steering_message_callback = queued_messages.append
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))
    dod = create_definition_of_done("Create a multi-file nginx guide.")
    dod.implementation_plan = str(implementation_plan)
    dod.verification_commands = [f"ls -la {guide_root}"]
    sync_todos_to_definition_of_done(
        dod,
        [
            {
                "content": "First, examine the existing Fortran guide structure to understand the format and content organization",
                "active_form": "Working on: First, examine the existing Fortran guide structure to understand the format and content organization",
                "status": "pending",
            },
            {
                "content": "Verify all guide files are linked and complete",
                "active_form": "Working on: Verify all guide files are linked and complete",
                "status": "pending",
            },
        ],
        project_root=temp_dir,
    )

    # Batch contains only a TodoWrite call — no file mutation.
    tool_call = ToolCall(
        id="todo-only",
        name="TodoWrite",
        arguments={
            "todos": [
                {
                    "content": "First, examine the existing Fortran guide structure to understand the format and content organization",
                    "active_form": "Working on: First, examine the existing Fortran guide structure to understand the format and content organization",
                    "status": "pending",
                },
                {
                    "content": "Verify all guide files are linked and complete",
                    "active_form": "Working on: Verify all guide files are linked and complete",
                    "status": "pending",
                },
            ]
        },
    )
    executor = FakeExecutor(
        [
            tool_outcome(
                tool_call=tool_call,
                output="Todos updated",
                is_error=False,
                metadata={
                    "new_todos": [
                        {
                            "content": "First, examine the existing Fortran guide structure to understand the format and content organization",
                            "active_form": "Working on: First, examine the existing Fortran guide structure to understand the format and content organization",
                            "status": "pending",
                        },
                        {
                            "content": "Verify all guide files are linked and complete",
                            "active_form": "Working on: Verify all guide files are linked and complete",
                            "status": "pending",
                        },
                    ]
                },
            )
        ]
    )

    summary = TurnSummary(final_response="")
    await runner.execute_batch(
        tool_calls=[tool_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=summary,
        dod=dod,
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    # Handoff must steer to verification and suppress the stale
    # reference-reading todo.
    assert queued_messages
    message = queued_messages[-1]
    assert "Todo tracking is updated. All explicitly planned artifacts now exist." in message
    assert "Verify all guide files are linked and complete" in message
    assert "Move to verification once no specific mismatch remains." in message
    assert "reopen reference materials" in message
    assert "Fortran guide structure" not in message
3102
3103
@pytest.mark.asyncio
async def test_tool_batch_runner_todowrite_with_existing_output_roots_requeues_next_mutation(
    temp_dir: Path,
) -> None:
    """A TodoWrite-only batch must requeue the next concrete mutation.

    Scenario: the declared output roots (guide dir, chapters dir, index.html)
    already exist, but the chapter file linked from index.html does not. After
    a batch containing only TodoWrite, the runner should queue a steering
    message that names `01-introduction.html` as the next file to create
    instead of allowing further bookkeeping or rereads.
    """

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        # Reflection hooks must not fire for a pure TodoWrite batch.
        raise AssertionError("Confidence scoring should not run in this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run in this scenario")

    # On-disk state: roots and index exist; the linked chapter file is missing.
    guide_root = temp_dir / "guides" / "nginx"
    chapters = guide_root / "chapters"
    guide_root.mkdir(parents=True)
    chapters.mkdir()
    index_path = guide_root / "index.html"
    index_path.write_text(
        "\n".join(
            [
                "<!DOCTYPE html>",
                "<html>",
                "<body>",
                '<a href="chapters/01-introduction.html">Introduction</a>',
                "</body>",
                "</html>",
                "",
            ]
        )
    )

    implementation_plan = temp_dir / "implementation.md"
    implementation_plan.write_text(
        "\n".join(
            [
                "# Implementation Plan",
                "",
                "## File Changes",
                f"- `{guide_root}/`",
                f"- `{chapters}/`",
                f"- `{index_path}`",
                "",
            ]
        )
    )

    context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )
    queued_messages: list[str] = []
    context.queue_steering_message_callback = queued_messages.append
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))
    dod = create_definition_of_done("Create a multi-file nginx guide.")
    dod.implementation_plan = str(implementation_plan)
    dod.touched_files.append(str(index_path))
    # Two items completed, chapter writing still pending.
    sync_todos_to_definition_of_done(
        dod,
        [
            {
                "content": "Examine the existing Fortran guide structure",
                "active_form": "Examining the existing Fortran guide structure",
                "status": "completed",
            },
            {
                "content": "Create the nginx directory structure",
                "active_form": "Creating the nginx directory structure",
                "status": "completed",
            },
            {
                "content": "Write the introduction chapter",
                "active_form": "Writing the introduction chapter",
                "status": "pending",
            },
        ],
        project_root=temp_dir,
    )

    # The batch is a TodoWrite call whose payload mirrors the synced todos.
    tool_call = ToolCall(
        id="todo-next-mutation",
        name="TodoWrite",
        arguments={
            "todos": [
                {
                    "content": "Examine the existing Fortran guide structure",
                    "active_form": "Examining the existing Fortran guide structure",
                    "status": "completed",
                },
                {
                    "content": "Create the nginx directory structure",
                    "active_form": "Creating the nginx directory structure",
                    "status": "completed",
                },
                {
                    "content": "Write the introduction chapter",
                    "active_form": "Writing the introduction chapter",
                    "status": "pending",
                },
            ]
        },
    )
    executor = FakeExecutor(
        [
            tool_outcome(
                tool_call=tool_call,
                output="Todos updated",
                is_error=False,
                metadata={
                    "new_todos": [
                        {
                            "content": "Examine the existing Fortran guide structure",
                            "active_form": "Examining the existing Fortran guide structure",
                            "status": "completed",
                        },
                        {
                            "content": "Create the nginx directory structure",
                            "active_form": "Creating the nginx directory structure",
                            "status": "completed",
                        },
                        {
                            "content": "Write the introduction chapter",
                            "active_form": "Writing the introduction chapter",
                            "status": "pending",
                        },
                    ]
                },
            )
        ]
    )

    summary = TurnSummary(final_response="")
    await runner.execute_batch(
        tool_calls=[tool_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=summary,
        dod=dod,
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    # The steering nudge must name the concrete missing file and forbid
    # spending another turn on bookkeeping alone.
    assert queued_messages
    message = queued_messages[-1]
    assert "Todo tracking is updated. A declared output artifact is still missing." in message
    assert "Continue with the next pending item: `Write the introduction chapter`." in message
    assert "Resume by creating `01-introduction.html` now." in message
    assert "It is the next missing declared output under `chapters/`." in message
    assert "Prefer one `write` call for `" in message
    assert "01-introduction.html` instead of more rereads." in message
    assert "Do not spend the next turn on TodoWrite alone" in message
3269
3270
@pytest.mark.asyncio
async def test_tool_batch_runner_todowrite_with_declared_child_targets_names_next_missing_file(
    temp_dir: Path,
) -> None:
    """Child targets declared only via index links should still be named.

    The plan declares directories and index.html; the index links two chapter
    files that do not exist. The steering nudge after TodoWrite must name the
    first missing linked file (`introduction.html`).
    """

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should not run in this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run in this scenario")

    guide_root = temp_dir / "guides" / "nginx"
    chapters = guide_root / "chapters"
    guide_root.mkdir(parents=True)
    chapters.mkdir()
    index_path = guide_root / "index.html"
    # Index declares two child chapter files; neither exists on disk.
    index_path.write_text(
        "\n".join(
            [
                "<html>",
                '<a href="chapters/introduction.html">Introduction</a>',
                '<a href="chapters/installation.html">Installation</a>',
                "</html>",
            ]
        )
        + "\n"
    )

    implementation_plan = temp_dir / "implementation.md"
    implementation_plan.write_text(
        "\n".join(
            [
                "# Implementation Plan",
                "",
                "## File Changes",
                f"- `{guide_root}/`",
                f"- `{chapters}/`",
                f"- `{index_path}`",
                "",
            ]
        )
    )

    dod = create_definition_of_done("Create a multi-file nginx guide.")
    dod.implementation_plan = str(implementation_plan)
    dod.pending_items = [
        "Write the introduction chapter",
        "Complete the requested work",
    ]
    dod.touched_files.append(str(index_path))

    queued_messages: list[str] = []
    context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )
    context.queue_steering_message_callback = queued_messages.append
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))

    # NOTE: camelCase `activeForm` here (vs snake_case in metadata) is
    # deliberate test data exercising both accepted key spellings.
    tool_call = ToolCall(
        id="todo-1",
        name="TodoWrite",
        arguments={
            "todos": [
                {
                    "content": "Write the introduction chapter",
                    "activeForm": "Writing the introduction chapter",
                    "status": "pending",
                }
            ]
        },
    )
    executor = FakeExecutor(
        [
            tool_outcome(
                tool_call=tool_call,
                output="Todos updated",
                is_error=False,
                metadata={
                    "new_todos": [
                        {
                            "content": "Write the introduction chapter",
                            "active_form": "Writing the introduction chapter",
                            "status": "pending",
                        }
                    ]
                },
            )
        ]
    )

    summary = TurnSummary(final_response="")
    await runner.execute_batch(
        tool_calls=[tool_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=summary,
        dod=dod,
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert queued_messages
    message = queued_messages[-1]
    assert "Todo tracking is updated. A declared output artifact is still missing." in message
    assert "Continue with the next pending item: `Write the introduction chapter`." in message
    assert "Resume by creating `introduction.html` now." in message
    assert "It is the next missing declared output under `chapters/`." in message
    assert "Prefer one `write` call for `" in message
    assert "introduction.html` instead of more rereads." in message
    assert "Do not spend the next turn on TodoWrite alone" in message
3398
3399
@pytest.mark.asyncio
async def test_tool_batch_runner_todowrite_names_concrete_pending_file_after_artifacts_exist(
    temp_dir: Path,
) -> None:
    """With chapter 1 done, the nudge must point at the chapter-2 file.

    Index links two chapters; only `01-introduction.html` exists. The steering
    message must name `02-installation.html` and include the full resolved
    path in the suggested `write` call.
    """

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should not run in this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run in this scenario")

    guide_root = temp_dir / "guides" / "nginx"
    chapters = guide_root / "chapters"
    guide_root.mkdir(parents=True)
    chapters.mkdir()
    index_path = guide_root / "index.html"
    chapter_one = chapters / "01-introduction.html"
    index_path.write_text(
        "\n".join(
            [
                "<html>",
                '<a href="chapters/01-introduction.html">Chapter 1: Introduction to NGINX Tool</a>',
                '<a href="chapters/02-installation.html">Chapter 2: Installation and Setup</a>',
                "</html>",
            ]
        )
        + "\n"
    )
    # Only chapter 1 is materialized; chapter 2 remains the gap.
    chapter_one.write_text("<html></html>\n")

    implementation_plan = temp_dir / "implementation.md"
    implementation_plan.write_text(
        "\n".join(
            [
                "# Implementation Plan",
                "",
                "## File Changes",
                f"- `{guide_root}/`",
                f"- `{chapters}/`",
                f"- `{index_path}`",
                "",
            ]
        )
    )

    dod = create_definition_of_done("Create a multi-file nginx guide.")
    dod.implementation_plan = str(implementation_plan)
    dod.pending_items = [
        "Creating Chapter 2: Installation and Setup",
        "Complete the requested work",
    ]
    dod.touched_files.extend([str(index_path), str(chapter_one)])

    queued_messages: list[str] = []
    context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )
    context.queue_steering_message_callback = queued_messages.append
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))

    tool_call = ToolCall(
        id="todo-1",
        name="TodoWrite",
        arguments={
            "todos": [
                {
                    "content": "Creating Chapter 2: Installation and Setup",
                    "activeForm": "Creating Chapter 2: Installation and Setup",
                    "status": "pending",
                }
            ]
        },
    )
    executor = FakeExecutor(
        [
            tool_outcome(
                tool_call=tool_call,
                output="Todos updated",
                is_error=False,
                metadata={
                    "new_todos": [
                        {
                            "content": "Creating Chapter 2: Installation and Setup",
                            "active_form": "Creating Chapter 2: Installation and Setup",
                            "status": "pending",
                        }
                    ]
                },
            )
        ]
    )

    summary = TurnSummary(final_response="")
    await runner.execute_batch(
        tool_calls=[tool_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=summary,
        dod=dod,
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert queued_messages
    message = queued_messages[-1]
    assert "Todo tracking is updated. A declared output artifact is still missing." in message
    assert "Continue with the next pending item: `Creating Chapter 2: Installation and Setup`." in message
    assert "Resume by creating `02-installation.html` now." in message
    # The nudge should carry the absolute, resolved target path.
    assert (
        f"Prefer one `write` call for `{(chapters / '02-installation.html').resolve(strict=False)}` "
        "instead of more rereads."
        in message
    )
    assert "Make your next response the concrete mutation tool call itself" in message
3531
3532
@pytest.mark.asyncio
async def test_tool_batch_runner_todowrite_uses_observed_sibling_pattern_for_next_file(
    temp_dir: Path,
) -> None:
    """The nudge may borrow a filename pattern from an inspected sibling tree.

    The index declares no chapter links, but the conversation history shows a
    `read` of `fortran/chapters/01-introduction.html`. The steering message
    should propose the same `01-introduction.html` name and say it mirrors the
    observed pattern.
    """

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should not run in this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run in this scenario")

    # Reference guide whose chapter naming pattern was already inspected.
    reference_chapters = temp_dir / "fortran" / "chapters"
    reference_chapters.mkdir(parents=True)
    (reference_chapters / "01-introduction.html").write_text("<h1>Introduction</h1>\n")

    guide_root = temp_dir / "guides" / "nginx"
    chapters = guide_root / "chapters"
    guide_root.mkdir(parents=True)
    chapters.mkdir()
    index_path = guide_root / "index.html"
    index_path.write_text("<html></html>\n")

    implementation_plan = temp_dir / "implementation.md"
    implementation_plan.write_text(
        "\n".join(
            [
                "# Implementation Plan",
                "",
                "## File Changes",
                f"- `{guide_root}/`",
                f"- `{chapters}/`",
                f"- `{index_path}`",
                "",
            ]
        )
    )

    dod = create_definition_of_done("Create a multi-file nginx guide.")
    dod.implementation_plan = str(implementation_plan)
    dod.pending_items = [
        "Write the introduction chapter",
        "Complete the requested work",
    ]
    dod.touched_files.append(str(index_path))

    queued_messages: list[str] = []
    # History includes a prior read of the reference chapter file, which is
    # what lets the runner infer the sibling filename pattern.
    context = build_context(
        temp_dir=temp_dir,
        messages=[
            Message(
                role=Role.ASSISTANT,
                content="",
                tool_calls=[
                    ToolCall(
                        id="read-ref-1",
                        name="read",
                        arguments={"file_path": str(reference_chapters / "01-introduction.html")},
                    )
                ],
            )
        ],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )
    context.queue_steering_message_callback = queued_messages.append
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))

    tool_call = ToolCall(
        id="todo-observed-1",
        name="TodoWrite",
        arguments={
            "todos": [
                {
                    "content": "Write the introduction chapter",
                    "activeForm": "Writing the introduction chapter",
                    "status": "pending",
                }
            ]
        },
    )
    executor = FakeExecutor(
        [
            tool_outcome(
                tool_call=tool_call,
                output="Todos updated",
                is_error=False,
                metadata={
                    "new_todos": [
                        {
                            "content": "Write the introduction chapter",
                            "active_form": "Writing the introduction chapter",
                            "status": "pending",
                        }
                    ]
                },
            )
        ]
    )

    summary = TurnSummary(final_response="")
    await runner.execute_batch(
        tool_calls=[tool_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=summary,
        dod=dod,
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert queued_messages
    message = queued_messages[-1]
    assert "Todo tracking is updated. A declared output artifact is still missing." in message
    assert "Continue with the next pending item: `Write the introduction chapter`." in message
    assert "Resume by creating `01-introduction.html` now." in message
    assert (
        "It mirrors the observed filename pattern from another `chapters/` directory "
        "you already inspected."
        in message
    )
    assert "01-introduction.html` instead of more rereads." in message
3668
3669
@pytest.mark.asyncio
async def test_tool_batch_runner_bookkeeping_note_with_missing_artifact_requeues_resume_step(
    temp_dir: Path,
) -> None:
    """A notepad working-note batch must also trigger the resume nudge.

    The plan declares `02-installation.html` but only chapter 1 exists.
    Recording a working note should queue a steering message naming the
    missing file and demanding the mutation as the very next response.
    """

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should not run in this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run in this scenario")

    guide_root = temp_dir / "guides" / "nginx"
    chapters = guide_root / "chapters"
    guide_root.mkdir(parents=True)
    chapters.mkdir()
    index_path = guide_root / "index.html"
    chapter_one = chapters / "01-getting-started.html"
    chapter_two = chapters / "02-installation.html"
    index_path.write_text("<html></html>\n")
    # chapter_two is declared in the plan below but never written to disk.
    chapter_one.write_text("<h1>One</h1>\n")

    implementation_plan = temp_dir / "implementation.md"
    implementation_plan.write_text(
        "\n".join(
            [
                "# Implementation Plan",
                "",
                "## File Changes",
                f"- `{guide_root}/`",
                f"- `{chapters}/`",
                f"- `{index_path}`",
                f"- `{chapter_one}`",
                f"- `{chapter_two}`",
                "",
            ]
        )
    )

    context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )
    queued_messages: list[str] = []
    context.queue_steering_message_callback = queued_messages.append
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))
    dod = create_definition_of_done("Create a multi-file nginx guide.")
    dod.implementation_plan = str(implementation_plan)
    sync_todos_to_definition_of_done(
        dod,
        [
            {
                "content": "Create 01-getting-started.html",
                "active_form": "Creating 01-getting-started.html",
                "status": "completed",
            },
            {
                "content": "Create 02-installation.html",
                "active_form": "Creating 02-installation.html",
                "status": "pending",
            },
        ],
        project_root=temp_dir,
    )
    dod.touched_files.extend([str(index_path), str(chapter_one)])

    # The batch is a bookkeeping note, not a mutation.
    tool_call = ToolCall(
        id="working-note",
        name="notepad_write_working",
        arguments={"content": "Creating the second chapter file: Installation"},
    )
    executor = FakeExecutor(
        [
            tool_outcome(
                tool_call=tool_call,
                output="Working note recorded",
                is_error=False,
            )
        ]
    )

    summary = TurnSummary(final_response="")
    await runner.execute_batch(
        tool_calls=[tool_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=summary,
        dod=dod,
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert queued_messages
    message = queued_messages[-1]
    assert "Bookkeeping note is recorded. A declared output artifact is still missing." in message
    assert "Resume by creating `02-installation.html` now." in message
    assert "Make your next response the concrete mutation tool call itself" in message
    assert "refresh `TodoWrite`" in message
    assert "Do not spend the next turn on additional notes, rediscovery, verification, or final confirmation" in message
3784
3785
@pytest.mark.asyncio
async def test_tool_batch_runner_working_note_respects_discovery_first_pending_step(
    temp_dir: Path,
) -> None:
    """A working note must defer to a discovery-first pending todo.

    Nothing declared in the plan exists on disk and the first pending item is
    an examination step, so the steering nudge should push one concrete
    evidence-gathering call rather than jump straight to creating
    `index.html`.
    """

    async def fail_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def fail_verification(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run in this scenario")

    # Declare outputs in the plan without materializing any of them.
    nginx_root = temp_dir / "guides" / "nginx"
    plan_path = temp_dir / "implementation.md"
    plan_path.write_text(
        "# Implementation Plan\n"
        "\n"
        "## File Changes\n"
        f"- `{nginx_root / 'index.html'}`\n"
        f"- `{nginx_root / 'chapters'}`\n"
    )

    context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=fail_confidence,
        verify_action=fail_verification,
        auto_recover=False,
    )
    steering: list[str] = []
    context.queue_steering_message_callback = steering.append
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))

    dod = create_definition_of_done("Create a multi-file nginx guide.")
    dod.implementation_plan = str(plan_path)
    dod.pending_items += [
        "First, examine the existing fortran guide structure and content to understand the format",
        "Create the nginx directory structure",
        "Develop the main index.html file for the nginx guide",
    ]

    note_call = ToolCall(
        id="working-note",
        name="notepad_write_working",
        arguments={"content": "Analyzing the fortran guide structure before creating nginx guide"},
    )
    outcomes = [
        tool_outcome(
            tool_call=note_call,
            output="Working note recorded",
            is_error=False,
        )
    ]

    await runner.execute_batch(
        tool_calls=[note_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=TurnSummary(final_response=""),
        dod=dod,
        executor=FakeExecutor(outcomes),  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    # The nudge targets the discovery step, not file creation.
    assert steering
    nudge = steering[-1]
    assert (
        "Continue with the next pending item: `First, examine the existing fortran guide structure and content to understand the format`."
        in nudge
    )
    assert "one concrete evidence-gathering tool call" in nudge
    assert "Resume by creating `index.html` now." not in nudge
3878
3879
@pytest.mark.asyncio
async def test_tool_batch_runner_working_note_prefers_declared_output_gap_over_stale_discovery(
    temp_dir: Path,
) -> None:
    """A concrete output gap outranks a stale discovery-first pending item.

    Index links three chapters but only the first exists, so even though the
    first pending item is an examination step, the nudge should name
    `02-installation.html` instead of re-running discovery.
    """

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run in this scenario")

    guide_root = temp_dir / "guides" / "nginx"
    chapters_dir = guide_root / "chapters"
    chapters_dir.mkdir(parents=True)
    index_path = guide_root / "index.html"
    first_chapter = chapters_dir / "01-introduction.html"
    # Three declared links; only the first chapter file will exist.
    index_path.write_text(
        "\n".join(
            [
                '<a href="chapters/01-introduction.html">Introduction</a>',
                '<a href="chapters/02-installation.html">Installation</a>',
                '<a href="chapters/03-configuration.html">Configuration</a>',
            ]
        )
    )
    first_chapter.write_text("<h1>Introduction</h1>\n")

    implementation_plan = temp_dir / "implementation.md"
    implementation_plan.write_text(
        "\n".join(
            [
                "# Implementation Plan",
                "",
                "## File Changes",
                f"- `{guide_root / 'index.html'}`",
                f"- `{chapters_dir}/`",
                "",
            ]
        )
    )

    context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )
    queued_messages: list[str] = []
    context.queue_steering_message_callback = queued_messages.append
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))
    dod = create_definition_of_done("Create a multi-file nginx guide.")
    dod.implementation_plan = str(implementation_plan)
    # Discovery item is stale: the structure has clearly been examined already.
    dod.pending_items.extend(
        [
            "First, examine the existing fortran guide structure and content to understand the format",
            "Create chapter files following the established pattern",
        ]
    )
    dod.touched_files.extend([str(index_path), str(first_chapter)])

    tool_call = ToolCall(
        id="working-note",
        name="notepad_write_working",
        arguments={"content": "Created index and first chapter; next is chapter 2"},
    )
    executor = FakeExecutor(
        [
            tool_outcome(
                tool_call=tool_call,
                output="Working note recorded",
                is_error=False,
            )
        ]
    )

    summary = TurnSummary(final_response="")
    await runner.execute_batch(
        tool_calls=[tool_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=summary,
        dod=dod,
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert queued_messages
    message = queued_messages[-1]
    assert "Bookkeeping note is recorded. A declared output artifact is still missing." in message
    assert "Resume by creating `02-installation.html` now." in message
    assert "Continue with the next pending item: `First, examine the existing fortran guide structure" not in message
3984 assert "Continue with the next pending item: `First, examine the existing fortran guide structure" not in message
3985
3986
@pytest.mark.asyncio
async def test_tool_batch_runner_shallow_glob_does_not_handoff_before_content_read(
    temp_dir: Path,
) -> None:
    """A shallow directory glob must not trigger any steering hand-off.

    The glob result contains only directory paths — no file content has been
    read yet — so the runner should stay silent and let discovery continue.
    """

    async def fail_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def fail_verification(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run in this scenario")

    # Reference tree holds only directories, no chapter files.
    fortran_root = temp_dir / "Loader" / "guides" / "fortran"
    chapter_dir = fortran_root / "chapters"
    chapter_dir.mkdir(parents=True)

    nginx_root = temp_dir / "Loader" / "guides" / "nginx"
    plan_path = temp_dir / "implementation.md"
    plan_path.write_text(
        "# Implementation Plan\n"
        "\n"
        "## File Changes\n"
        f"- `{nginx_root / 'index.html'}`\n"
        f"- `{nginx_root / 'chapters'}`\n"
    )

    context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=fail_confidence,
        verify_action=fail_verification,
        auto_recover=False,
    )
    steering: list[str] = []
    context.queue_steering_message_callback = steering.append
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))

    dod = create_definition_of_done("Create a multi-file nginx guide.")
    dod.implementation_plan = str(plan_path)
    dod.pending_items += [
        "First, examine the existing fortran guide structure and content",
        "Create the nginx directory structure",
        "Develop the main index.html file for nginx guide",
    ]

    glob_call = ToolCall(
        id="glob-1",
        name="glob",
        arguments={"pattern": "**", "path": str(fortran_root)},
    )
    outcomes = [
        tool_outcome(
            tool_call=glob_call,
            output=f"{fortran_root}\n{chapter_dir}",
            is_error=False,
        )
    ]

    await runner.execute_batch(
        tool_calls=[glob_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=TurnSummary(final_response=""),
        dod=dod,
        executor=FakeExecutor(outcomes),  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    # A listing alone is not enough evidence to steer on.
    assert steering == []
4076
4077
@pytest.mark.asyncio
async def test_tool_batch_runner_hands_off_noop_toc_edit_when_file_is_already_valid(
    temp_dir: Path,
) -> None:
    """A blocked no-op edit on an already-correct TOC must not queue a nudge.

    The index's table of contents already matches the chapter files, so the
    identical old/new edit is blocked by the executor and the runner should
    hand off quietly (no steering message).
    """

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should not run in this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run in this scenario")

    prompt = (
        "Have a look at ~/Loader/guides/fortran/index.html, then "
        "~/Loader/guides/fortran/chapters. The table of contents links in "
        "index.html are inaccurate and the href’s are wrong. Let’s update the "
        "links and their link texts to be correct."
    )
    # Chapter files whose titles already agree with the TOC below.
    chapters = temp_dir / "chapters"
    chapters.mkdir()
    (chapters / "01-introduction.html").write_text(
        "<h1>Chapter 1: Introduction to Fortran</h1>\n"
    )
    (chapters / "02-setup.html").write_text(
        "<h1>Chapter 2: Setting Up Your Environment</h1>\n"
    )
    current_block = (
        "<h2>Table of Contents</h2>\n"
        '    <ul class="chapter-list">\n'
        '        <li><a href="chapters/01-introduction.html">Chapter 1: Introduction to Fortran</a></li>\n'
        '        <li><a href="chapters/02-setup.html">Chapter 2: Setting Up Your Environment</a></li>\n'
        "    </ul>\n"
    )
    index_path = temp_dir / "index.html"
    index_path.write_text(current_block)

    context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )
    context.session.current_task = prompt  # type: ignore[attr-defined]
    queued_messages: list[str] = []
    context.queue_steering_message_callback = queued_messages.append
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))
    # old_string == new_string: the executor blocks this as a no-op.
    tool_call = ToolCall(
        id="edit-1",
        name="edit",
        arguments={
            "file_path": str(index_path),
            "old_string": current_block,
            "new_string": current_block,
        },
    )
    executor = FakeExecutor(
        [
            tool_outcome(
                tool_call=tool_call,
                output=(
                    "[Blocked - old_string and new_string are identical - no change "
                    "would occur] Suggestion: Provide different old and new strings"
                ),
                is_error=True,
                state=ToolExecutionState.BLOCKED,
            )
        ]
    )

    await runner.execute_batch(
        tool_calls=[tool_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=TurnSummary(final_response=""),
        dod=create_definition_of_done(prompt),
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert queued_messages == []
4171
4172
def test_tool_batch_runner_blocked_noop_edit_nudge_stays_on_active_repair_target(
    temp_dir: Path,
) -> None:
    """A blocked no-op edit nudge must keep steering at the active repair target."""

    async def _unexpected_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def _unexpected_verification(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run in this scenario")

    repair_target = temp_dir / "guide" / "chapters" / "04-basic-usage.html"
    missing_chapter = temp_dir / "guide" / "chapters" / "05-advanced-topics.html"
    # Prior assistant message that establishes the active repair scope.
    repair_focus = Message(
        role=Role.ASSISTANT,
        content=(
            "Repair focus:\n"
            f"- Fix the broken local reference `05-advanced-topics.html` in `{repair_target}`.\n"
            f"- Immediate next step: edit `{repair_target}`.\n"
            f"- If the broken reference should remain, create `{missing_chapter}`; otherwise remove or replace `05-advanced-topics.html`.\n"
        ),
    )
    ctx = build_context(
        temp_dir=temp_dir,
        messages=[repair_focus],
        safeguards=FakeSafeguards(),
        assess_confidence=_unexpected_confidence,
        verify_action=_unexpected_verification,
    )
    captured: list[str] = []
    ctx.queue_steering_message_callback = captured.append
    batch_runner = ToolBatchRunner(ctx, DefinitionOfDoneStore(temp_dir))

    noop_edit = ToolCall(
        id="edit-1",
        name="edit",
        arguments={
            "file_path": str(repair_target),
            "old_string": "same",
            "new_string": "same",
        },
    )
    batch_runner._queue_blocked_html_edit_nudge(
        noop_edit,
        "[Blocked - old_string and new_string are identical - no change would occur] Suggestion: Provide different old and new strings",
    )

    assert captured
    nudge = captured[0]
    assert str(repair_target) in nudge
    assert "no on-disk change" in nudge
    assert "replace the surrounding block" in nudge
    assert "Do not reopen unrelated reference materials" in nudge
4231
4232
async def _noop_emit(event: AgentEvent) -> None:
    """Event sink for tests that do not inspect emitted agent events."""
    return None
4235
4236
@pytest.mark.asyncio
async def test_tool_batch_runner_marks_verification_planned_after_new_mutation(
    temp_dir: Path,
) -> None:
    """A fresh mutation should open verification attempt 1 on the definition of done."""

    async def _unexpected_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def _unexpected_verification(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run for this scenario")

    ctx = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=_unexpected_confidence,
        verify_action=_unexpected_verification,
    )
    batch_runner = ToolBatchRunner(ctx, DefinitionOfDoneStore(temp_dir))
    write_call = ToolCall(
        id="write-1",
        name="write",
        arguments={"file_path": str(temp_dir / "README.md"), "content": "updated\n"},
    )
    executor = FakeExecutor(
        [tool_outcome(tool_call=write_call, output="wrote file", is_error=False)]
    )
    summary = TurnSummary(final_response="")
    dod = create_definition_of_done("Update README and verify it still works.")
    seen_events: list[AgentEvent] = []

    async def emit(event: AgentEvent) -> None:
        seen_events.append(event)

    await batch_runner.execute_batch(
        tool_calls=[write_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=emit,
        summary=summary,
        dod=dod,
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert dod.last_verification_result == "planned"
    assert dod.verification_commands
    assert "Collect verification evidence" in dod.pending_items
    assert dod.active_verification_attempt_id == "verification-attempt-1"
    assert dod.active_verification_attempt_number == 1
    # The final timeline entry must record the planned verification attempt.
    last_entry = summary.workflow_timeline[-1]
    observation = last_entry.verification_observations[0]
    assert last_entry.reason_code == "verification_planned"
    assert last_entry.policy_outcome == "planned"
    assert observation.status == "planned"
    assert observation.attempt_id == "verification-attempt-1"
    assert observation.attempt_number == 1
4308
4309
@pytest.mark.asyncio
async def test_tool_batch_runner_does_not_mark_verification_planned_after_setup_only_mkdir(
    temp_dir: Path,
) -> None:
    """Setup-only `mkdir` commands must not schedule a verification attempt."""

    async def _unexpected_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def _unexpected_verification(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run in this scenario")

    ctx = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=_unexpected_confidence,
        verify_action=_unexpected_verification,
    )
    batch_runner = ToolBatchRunner(ctx, DefinitionOfDoneStore(temp_dir))
    nginx_root = temp_dir / "Loader" / "guides" / "nginx"
    chapters = nginx_root / "chapters"
    implementation_plan = temp_dir / "implementation.md"
    plan_lines = [
        "# Implementation Plan",
        "",
        "## File Changes",
        f"- `{chapters}/`",
        f"- `{nginx_root / 'index.html'}`",
        "",
    ]
    implementation_plan.write_text("\n".join(plan_lines))

    mkdir_call = ToolCall(
        id="mkdir-1",
        name="bash",
        arguments={"command": f"mkdir -p {chapters}"},
    )
    executor = FakeExecutor(
        [tool_outcome(tool_call=mkdir_call, output="", is_error=False)]
    )
    summary = TurnSummary(final_response="")
    dod = create_definition_of_done("Create an equally thorough nginx guide with chapters.")
    dod.implementation_plan = str(implementation_plan)
    seen_events: list[AgentEvent] = []

    async def emit(event: AgentEvent) -> None:
        seen_events.append(event)

    await batch_runner.execute_batch(
        tool_calls=[mkdir_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=emit,
        summary=summary,
        dod=dod,
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert dod.last_verification_result is None
    assert "Collect verification evidence" not in dod.pending_items
    # No timeline entry may claim that verification was planned.
    planned_entries = [
        entry
        for entry in summary.workflow_timeline
        if entry.reason_code == "verification_planned"
    ]
    assert not planned_entries
4388
4389
@pytest.mark.asyncio
async def test_tool_batch_runner_marks_passed_verification_stale_after_new_mutation(
    temp_dir: Path,
) -> None:
    """A new mutation invalidates previously collected passing verification evidence."""

    async def _unexpected_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def _unexpected_verification(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run for this scenario")

    ctx = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=_unexpected_confidence,
        verify_action=_unexpected_verification,
    )
    batch_runner = ToolBatchRunner(ctx, DefinitionOfDoneStore(temp_dir))
    write_call = ToolCall(
        id="write-1",
        name="write",
        arguments={"file_path": str(temp_dir / "README.md"), "content": "updated\n"},
    )
    executor = FakeExecutor(
        [tool_outcome(tool_call=write_call, output="wrote file", is_error=False)]
    )
    summary = TurnSummary(final_response="")
    # Seed the DoD as if verification attempt 1 already passed.
    dod = create_definition_of_done("Update README and verify it still works.")
    dod.verification_commands = ["uv run pytest -q"]
    dod.last_verification_result = "passed"
    dod.verification_attempt_counter = 1
    dod.active_verification_attempt_id = "verification-attempt-1"
    dod.active_verification_attempt_number = 1
    dod.evidence = [
        VerificationEvidence(
            command="uv run pytest -q",
            passed=True,
            stdout="401 passed",
            kind="test",
        )
    ]
    dod.completed_items.append("Collect verification evidence")
    seen_events: list[AgentEvent] = []

    async def emit(event: AgentEvent) -> None:
        seen_events.append(event)

    await batch_runner.execute_batch(
        tool_calls=[write_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=emit,
        summary=summary,
        dod=dod,
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    # The earlier pass is stale, its evidence dropped, and attempt 2 is opened.
    assert dod.last_verification_result == "stale"
    assert dod.evidence == []
    assert "Collect verification evidence" in dod.pending_items
    assert "Collect verification evidence" not in dod.completed_items
    assert dod.active_verification_attempt_id == "verification-attempt-2"
    assert dod.active_verification_attempt_number == 2
    last_entry = summary.workflow_timeline[-1]
    observation = last_entry.verification_observations[0]
    assert last_entry.reason_code == "verification_stale"
    assert last_entry.policy_outcome == "stale"
    assert observation.status == "stale"
    assert observation.attempt_id == "verification-attempt-1"
    assert observation.attempt_number == 1
    assert observation.supersedes_attempt_id == "verification-attempt-2"
    assert observation.command == "uv run pytest -q"
4484
4485
def test_tool_batch_runner_blocked_active_repair_nudge_uses_repair_scope(temp_dir: Path) -> None:
    """The active-repair nudge should cite both the repair target and the missing file."""

    async def _unexpected_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def _unexpected_verification(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run in this scenario")

    repair_target = temp_dir / "guide" / "index.html"
    missing_chapter = temp_dir / "guide" / "chapters" / "01-getting-started.html"
    repair_focus = Message(
        role=Role.ASSISTANT,
        content=(
            "Repair focus:\n"
            f"- Fix the broken local reference `chapters/01-getting-started.html` in `{repair_target}`.\n"
            f"- Immediate next step: edit `{repair_target}`.\n"
            f"- If the broken reference should remain, create `{missing_chapter}`; otherwise remove or replace `chapters/01-getting-started.html`.\n"
        ),
    )
    ctx = build_context(
        temp_dir=temp_dir,
        messages=[repair_focus],
        safeguards=FakeSafeguards(),
        assess_confidence=_unexpected_confidence,
        verify_action=_unexpected_verification,
    )
    captured: list[str] = []
    ctx.queue_steering_message_callback = captured.append
    batch_runner = ToolBatchRunner(ctx, DefinitionOfDoneStore(temp_dir))

    batch_runner._queue_blocked_active_repair_nudge(
        "[Blocked - active repair scope: verification already identified the repair target.]"
    )

    assert captured
    nudge = captured[0]
    assert str(repair_target) in nudge
    assert str(missing_chapter) in nudge
    assert "Do not reopen unrelated reference materials" in nudge
4532
4533
def test_tool_batch_runner_blocked_active_repair_mutation_nudge_uses_allowed_paths(
    temp_dir: Path,
) -> None:
    """The mutation-scope nudge should enumerate both allowed repair paths."""

    async def _unexpected_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def _unexpected_verification(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run in this scenario")

    repair_target = temp_dir / "guide" / "chapters" / "05-advanced-configurations.html"
    stylesheet = temp_dir / "guide" / "styles.css"
    repair_focus = Message(
        role=Role.ASSISTANT,
        content=(
            "Repair focus:\n"
            f"- Fix the broken local reference `../styles.css` in `{repair_target}`.\n"
            f"- Immediate next step: edit `{repair_target}`.\n"
            f"- If the broken reference should remain, create `{stylesheet}`; otherwise remove or replace `../styles.css`.\n"
        ),
    )
    ctx = build_context(
        temp_dir=temp_dir,
        messages=[repair_focus],
        safeguards=FakeSafeguards(),
        assess_confidence=_unexpected_confidence,
        verify_action=_unexpected_verification,
    )
    captured: list[str] = []
    ctx.queue_steering_message_callback = captured.append
    batch_runner = ToolBatchRunner(ctx, DefinitionOfDoneStore(temp_dir))

    batch_runner._queue_blocked_active_repair_mutation_nudge(
        "[Blocked - active repair mutation scope: verification already identified the repair target.]"
    )

    assert captured
    nudge = captured[0]
    assert str(repair_target) in nudge
    assert str(stylesheet) in nudge
    assert "before widening the change set" in nudge
4583
4584
def test_tool_batch_runner_blocked_late_reference_drift_nudge_points_to_missing_artifact(
    temp_dir: Path,
) -> None:
    """The late-reference-drift nudge should single out the one still-missing artifact."""

    async def _unexpected_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def _unexpected_verification(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run in this scenario")

    ctx = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=_unexpected_confidence,
        verify_action=_unexpected_verification,
    )
    captured: list[str] = []
    ctx.queue_steering_message_callback = captured.append
    store = DefinitionOfDoneStore(temp_dir)
    dod = create_definition_of_done("Create a multi-file guide from a reference")
    plan_path = temp_dir / "implementation.md"
    plan_path.write_text(
        "# File Changes\n"
        "- `guide/index.html`\n"
        "- `guide/chapters/01-getting-started.html`\n"
        "- `guide/chapters/02-installation.html`\n"
        "- `guide/chapters/03-first-website.html`\n"
    )
    dod.implementation_plan = str(plan_path)
    # Create every planned artifact except chapter 03.
    chapters_dir = temp_dir / "guide" / "chapters"
    chapters_dir.mkdir(parents=True, exist_ok=True)
    (temp_dir / "guide" / "index.html").write_text("index")
    (chapters_dir / "01-getting-started.html").write_text("one")
    (chapters_dir / "02-installation.html").write_text("two")
    batch_runner = ToolBatchRunner(ctx, store)

    batch_runner._queue_blocked_late_reference_drift_nudge(
        "[Blocked - late reference drift: several planned artifacts already exist.]",
        dod=dod,
    )

    assert captured
    nudge = captured[0]
    assert "03-first-website.html" in nudge
    assert "older reference materials" in nudge
4637
4638
def test_tool_batch_runner_blocked_completed_artifact_scope_nudge_prefers_verification(
    temp_dir: Path,
) -> None:
    """When every planned artifact exists, the nudge should steer toward verification."""

    async def _unexpected_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def _unexpected_verification(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run in this scenario")

    # Materialize the full planned artifact set on disk.
    guide_root = temp_dir / "guide"
    chapters = guide_root / "chapters"
    guide_root.mkdir(parents=True)
    chapters.mkdir()
    index_path = guide_root / "index.html"
    chapter_one = chapters / "01-getting-started.html"
    chapter_two = chapters / "02-installation.html"
    index_path.write_text("index")
    chapter_one.write_text("one")
    chapter_two.write_text("two")

    implementation_plan = temp_dir / "implementation.md"
    plan_lines = [
        "# Implementation Plan",
        "",
        "## File Changes",
        f"- `{guide_root}`",
        f"- `{chapters}`",
        f"- `{index_path}`",
        f"- `{chapter_one}`",
        f"- `{chapter_two}`",
        "",
    ]
    implementation_plan.write_text("\n".join(plan_lines))

    ctx = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=_unexpected_confidence,
        verify_action=_unexpected_verification,
    )
    captured: list[str] = []
    ctx.queue_steering_message_callback = captured.append
    batch_runner = ToolBatchRunner(ctx, DefinitionOfDoneStore(temp_dir))
    dod = create_definition_of_done("Create a multi-file guide from a reference")
    dod.implementation_plan = str(implementation_plan)
    dod.verification_commands = [f"ls -la {guide_root}"]
    verification_todo = {
        "content": "Verify all guide files are linked and complete",
        "active_form": "Working on: Verify all guide files are linked and complete",
        "status": "pending",
    }
    sync_todos_to_definition_of_done(dod, [verification_todo], project_root=temp_dir)

    batch_runner._queue_blocked_completed_artifact_scope_nudge(
        "[Blocked - completed artifact set scope: all explicitly planned artifacts already exist.]",
        dod=dod,
    )

    assert captured
    nudge = captured[0]
    assert "All explicitly planned artifacts already exist." in nudge
    assert "Verify all guide files are linked and complete" in nudge
    assert "Do not reopen earlier reference materials." in nudge
4719
4720
def test_tool_batch_runner_blocked_html_declared_target_nudge_uses_closest_declared_target(
    temp_dir: Path,
) -> None:
    """The declared-target nudge should quote the closest declared sibling target."""

    async def _unexpected_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def _unexpected_verification(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run in this scenario")

    ctx = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=_unexpected_confidence,
        verify_action=_unexpected_verification,
    )
    captured: list[str] = []
    ctx.queue_steering_message_callback = captured.append
    batch_runner = ToolBatchRunner(ctx, DefinitionOfDoneStore(temp_dir))

    chapter_path = temp_dir / "guide" / "chapters" / "01-introduction.html"
    blocked_write = ToolCall(
        id="write-ch1",
        name="write",
        arguments={"file_path": str(chapter_path)},
    )
    batch_runner._queue_blocked_html_declared_target_nudge(
        blocked_write,
        (
            "[Blocked - HTML page introduces new local targets outside the current declared artifact set] "
            "Suggestion: Keep non-root HTML pages within the root-declared local-link set and avoid "
            "introducing new sibling targets that the guide root does not declare, for example fix: 02-setup.html. "
            "Already-declared local targets include: chapters/01-introduction.html, chapters/02-installation.html, "
            "chapters/03-configuration.html. Closest declared local targets include: chapters/02-installation.html"
        ),
    )

    assert captured
    nudge = captured[0]
    assert str(chapter_path) in nudge
    assert "`chapters/02-installation.html`" in nudge
    assert "same file now" in nudge
4769
4770
@pytest.mark.asyncio
async def test_tool_batch_runner_blocked_empty_file_path_nudges_concrete_next_artifact(
    temp_dir: Path,
) -> None:
    """A blocked empty-`file_path` write should nudge toward the next planned artifact.

    Seeds an implementation plan with three files, creates the first two, then
    blocks a write with an empty path and checks the steering message names the
    concrete next artifact (chapter 2) and records the recovery attempt.
    """

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        # Confidence scoring must never be invoked on this path.
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        # Action verification must never be invoked on this path.
        raise AssertionError("Verification should not run in this scenario")

    # Planned artifact set: index plus two chapters; only chapter 2 is missing.
    guide_root = temp_dir / "guides" / "nginx"
    chapters = guide_root / "chapters"
    chapters.mkdir(parents=True)
    index_path = guide_root / "index.html"
    chapter_one = chapters / "01-introduction.html"
    chapter_two = chapters / "02-installation.html"
    index_path.write_text("<html></html>\n")
    chapter_one.write_text("<h1>Intro</h1>\n")

    implementation_plan = temp_dir / "implementation.md"
    implementation_plan.write_text(
        "\n".join(
            [
                "# Implementation Plan",
                "",
                "## File Changes",
                f"- `{index_path}`",
                f"- `{chapter_one}`",
                f"- `{chapter_two}`",
                "",
            ]
        )
    )

    context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )
    queued: list[str] = []
    context.queue_steering_message_callback = queued.append
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))
    # A write with an empty file_path, which the executor reports as BLOCKED.
    tool_call = ToolCall(
        id="write-2",
        name="write",
        arguments={"file_path": "", "content": "<html></html>\n"},
    )
    blocked_message = "[Blocked - Empty file path] Suggestion: Provide a valid file path"
    executor = FakeExecutor(
        [
            ToolExecutionOutcome(
                tool_call=tool_call,
                state=ToolExecutionState.BLOCKED,
                message=Message.tool_result_message(
                    tool_call_id=tool_call.id,
                    display_content=blocked_message,
                    result_content=blocked_message,
                    is_error=True,
                ),
                event_content=blocked_message,
                is_error=True,
                result_output=blocked_message,
            )
        ]
    )
    dod = create_definition_of_done("Create a multi-file nginx guide.")
    dod.implementation_plan = str(implementation_plan)
    dod.touched_files.extend([str(index_path), str(chapter_one)])
    dod.pending_items.append("Creating Chapter 2: Installation and Setup")

    await runner.execute_batch(
        tool_calls=[tool_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=TurnSummary(final_response=""),
        dod=dod,
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    # The steering nudge must name the missing chapter and prefer a single write.
    assert queued
    assert "did not provide a valid `file_path`" in queued[0]
    assert "Resume by creating `02-installation.html` now." in queued[0]
    assert (
        f"Prefer one `write` call for `{chapter_two}` instead of more rereads."
        in queued[0]
    )
    assert context.recovery_context is not None
    assert context.recovery_context.attempts[-1].error == blocked_message