Python · 80202 bytes Raw Blame History
1 """Deterministic runtime parity coverage for the current Loader loop."""
2
3 from __future__ import annotations
4
5 import json
6 from pathlib import Path
7
8 import pytest
9
10 from loader.agent.loop import Agent, AgentConfig
11 from loader.llm.base import CompletionResponse, Role, StreamChunk, ToolCall
12 from loader.runtime.capabilities import resolve_capability_profile
13 from loader.runtime.permissions import PermissionMode
14 from tests.helpers.runtime_harness import (
15 ScriptedBackend,
16 run_explore_scenario,
17 run_scenario,
18 )
19
# Ordered list of implemented parity scenarios. The order and content must
# match tests/fixtures/runtime_parity_manifest.json exactly; the pairing is
# enforced by test_runtime_parity_manifest_matches_implemented_cases below.
SCENARIO_NAMES = [
    "streaming_text",
    "read_file_roundtrip",
    "multi_tool_turn_roundtrip",
    "turn_summary_smoke_for_multi_tool_turn",
    "write_file_allowed",
    "write_file_denied",
    "bash_stdout_roundtrip",
    "bash_confirmation_prompt_approved",
    "bash_confirmation_prompt_denied",
    "read_only_mode_denies_write",
    "read_only_mode_denies_mutating_bash",
    "read_only_mode_allows_safe_bash",
    "workspace_write_denies_write_outside_root",
    "danger_full_access_allows_dangerous_bash",
    "prompt_mode_prompts_destructive_write",
    "allow_mode_skips_prompt_for_destructive_write",
    "deny_rule_blocks_allowed_mode",
    "ask_rule_prompts_even_when_mode_would_allow",
    "raw_json_tool_call_fallback",
    "raw_json_todowrite_tool_call_fallback",
    "raw_json_patch_tool_call_fallback",
    "raw_json_ask_user_question_tool_call_fallback",
    "raw_bracket_ask_user_question_tool_call_fallback",
    "native_and_raw_tool_paths_share_executor_trace",
    "backend_capability_probe_refreshes_native_tool_mode",
    "run_streaming_delegates_to_primary_runtime",
    "definition_of_done_verify_phase",
    "verify_failure_routes_to_fix_loop",
    "verify_retry_budget_exhaustion",
    "ambiguous_prompt_routes_to_clarify",
    "complex_prompt_routes_to_plan",
    "verify_failure_fix_loop_does_not_reroute_workflow",
    "conversational_task_skips_verify_phase",
    "explore_mode_skips_dod_and_router",
    "explore_mode_denies_write",
    "explore_mode_ignores_global_allow_policy",
    "non_mutating_completion_no_longer_forces_continuation",
    "tool_result_contract_regression",
]
60
61
def load_manifest() -> list[dict[str, str]]:
    """Load the auditable parity scenario manifest."""

    # The manifest fixture lives alongside this test module under fixtures/.
    manifest = Path(__file__).parent.joinpath("fixtures", "runtime_parity_manifest.json")
    return json.loads(manifest.read_text())
67
68
def non_streaming_config(*, completion_check: bool = False) -> AgentConfig:
    """Shared config for deterministic complete() tests."""

    # Streaming off so ScriptedBackend completions (not streams) drive the loop;
    # a small iteration cap keeps runaway loops from hanging the suite.
    cfg = AgentConfig(auto_context=False, stream=False, max_iterations=8)
    cfg.reasoning.completion_check = completion_check
    return cfg
75
76
def native_tool_response(
    *tool_calls: ToolCall,
    content: str = "Using tools.",
) -> CompletionResponse:
    """Build a completion that includes native tool calls."""

    # Materialize the varargs tuple so callers get a mutable list on the response.
    return CompletionResponse(content=content, tool_calls=[*tool_calls])
84
85
def final_response(content: str) -> CompletionResponse:
    """Build a completion with no further tool calls (ends the agent loop)."""

    response = CompletionResponse(content=content)
    return response
90
91
def tool_event_names(run) -> list[str]:
    """Return emitted tool event names in order, excluding verification-phase calls."""

    names: list[str] = []
    for event in run.events:
        # Only non-verification tool_call events with a truthy tool name count.
        if event.type != "tool_call" or event.phase == "verification":
            continue
        if event.tool_name:
            names.append(event.tool_name)
    return names
100
101
def tool_result_messages(run) -> list[str]:
    """Return emitted tool result messages in order, excluding verification-phase results."""

    messages: list[str] = []
    for event in run.events:
        if event.type == "tool_result" and event.phase != "verification":
            messages.append(event.content)
    return messages
110
111
def verification_commands(run) -> list[str]:
    """Return verification-phase bash commands, as strings, in emission order."""

    commands: list[str] = []
    for event in run.events:
        if event.type != "tool_call" or event.phase != "verification":
            continue
        # tool_args may be None; fall back to an empty command string.
        args = event.tool_args or {}
        commands.append(str(args.get("command", "")))
    return commands
120
121
def trace_event_names(run) -> list[str]:
    """Return recorded runtime trace event names from the last turn summary."""

    summary = run.agent.last_turn_summary
    # A completed run must always leave a turn summary behind.
    assert summary is not None
    names: list[str] = []
    for entry in summary.trace:
        names.append(entry.name)
    return names
128
129
def dod_statuses(run) -> list[str]:
    """Return Definition-of-Done statuses emitted during a run, in order."""

    statuses: list[str] = []
    for event in run.events:
        if event.type == "dod_status" and event.dod_status:
            statuses.append(event.dod_status)
    return statuses
138
139
def workflow_modes(run) -> list[str]:
    """Return emitted workflow modes in order, skipping falsy values."""

    modes: list[str] = []
    for event in run.events:
        if event.type != "workflow_mode":
            continue
        if event.workflow_mode:
            modes.append(event.workflow_mode)
    return modes
148
149
def artifact_kinds(run) -> list[str]:
    """Return emitted artifact kinds in order, skipping falsy values."""

    # Two-stage filter: select artifact events lazily, then drop empty kinds.
    candidates = (
        event.artifact_kind
        for event in run.events
        if event.type == "artifact"
    )
    return [kind for kind in candidates if kind]
158
159
@pytest.mark.asyncio
async def test_runtime_parity_manifest_matches_implemented_cases() -> None:
    """The JSON manifest must list exactly the implemented scenarios, in order."""
    manifest_names = [entry["name"] for entry in load_manifest()]
    assert manifest_names == SCENARIO_NAMES
164
165
@pytest.mark.asyncio
async def test_streaming_text_scenario() -> None:
    """A plain streamed reply surfaces verbatim, via one stream call, with no tools."""
    backend = ScriptedBackend(
        streams=[
            [
                StreamChunk(content="Mock streaming ", is_done=False),
                StreamChunk(
                    content="says hello from Loader.",
                    full_content="Mock streaming says hello from Loader.",
                    is_done=True,
                ),
            ]
        ]
    )

    run = await run_scenario("hello there", backend, config=AgentConfig(auto_context=False))

    assert run.response == "Mock streaming says hello from Loader."
    assert [call.mode for call in run.invocations] == ["stream"]
    assert not tool_event_names(run)
186
187
@pytest.mark.asyncio
async def test_read_file_roundtrip(temp_dir: Path) -> None:
    """A native read tool call executes and its output is fed back as a TOOL message."""
    fixture = temp_dir / "fixture.txt"
    fixture.write_text("alpha parity line\nbeta line\n")

    backend = ScriptedBackend(
        completions=[
            native_tool_response(
                ToolCall(id="read-1", name="read", arguments={"file_path": str(fixture)}),
                content="I'll inspect that file.",
            ),
            final_response("The file contains alpha parity line."),
        ]
    )

    run = await run_scenario(
        "Read the fixture file and summarize it.",
        backend,
        config=non_streaming_config(),
        project_root=temp_dir,
    )

    assert "alpha parity line" in run.response
    assert tool_event_names(run) == ["read"]
    assert any("alpha parity line" in message for message in tool_result_messages(run))
    # Two backend calls: one issuing the tool call, one producing the final text.
    assert len(run.invocations) == 2
    assert any(message.role == Role.TOOL for message in run.invocations[1].messages)
215
216
@pytest.mark.asyncio
@pytest.mark.parametrize("alias_key", ["file", "filepath"])
async def test_read_file_alias_roundtrip(temp_dir: Path, alias_key: str) -> None:
    """Alternate argument keys ('file', 'filepath') are accepted as read path aliases."""
    fixture = temp_dir / "fixture.txt"
    fixture.write_text("alpha parity line\nbeta line\n")

    backend = ScriptedBackend(
        completions=[
            native_tool_response(
                ToolCall(id="read-1", name="read", arguments={alias_key: str(fixture)}),
                content="I'll inspect that file.",
            ),
            final_response("The file contains alpha parity line."),
        ]
    )

    run = await run_scenario(
        "Read the fixture file and summarize it.",
        backend,
        config=non_streaming_config(),
        project_root=temp_dir,
    )

    assert "alpha parity line" in run.response
    assert tool_event_names(run) == ["read"]
    assert any("alpha parity line" in message for message in tool_result_messages(run))
243
244
@pytest.mark.asyncio
async def test_multi_tool_turn_roundtrip(temp_dir: Path) -> None:
    """Two tool calls in one assistant turn both execute, in order, with two results."""
    fixture = temp_dir / "fixture.txt"
    fixture.write_text("alpha parity line\nbeta line\ngamma parity line\n")

    backend = ScriptedBackend(
        completions=[
            native_tool_response(
                ToolCall(id="read-1", name="read", arguments={"file_path": str(fixture)}),
                ToolCall(
                    id="grep-1",
                    name="grep",
                    arguments={"pattern": "parity", "path": str(fixture)},
                ),
                content="I'll inspect the file and count parity matches.",
            ),
            final_response("The file has two parity lines, including alpha parity line."),
        ]
    )

    run = await run_scenario(
        "Inspect the fixture and find parity lines.",
        backend,
        config=non_streaming_config(),
        project_root=temp_dir,
    )

    assert tool_event_names(run) == ["read", "grep"]
    assert len(tool_result_messages(run)) == 2
    assert "two parity lines" in run.response
275
276
@pytest.mark.asyncio
async def test_turn_summary_smoke_for_multi_tool_turn(temp_dir: Path) -> None:
    """The turn summary records iterations, messages, and a tool_batch trace event."""
    fixture = temp_dir / "fixture.txt"
    fixture.write_text("alpha parity line\nbeta line\ngamma parity line\n")

    backend = ScriptedBackend(
        completions=[
            native_tool_response(
                ToolCall(id="read-1", name="read", arguments={"file_path": str(fixture)}),
                ToolCall(
                    id="grep-1",
                    name="grep",
                    arguments={"pattern": "parity", "path": str(fixture)},
                ),
                content="I'll inspect the file and count parity matches.",
            ),
            final_response("The file has two parity lines, including alpha parity line."),
        ]
    )

    run = await run_scenario(
        "Inspect the fixture and find parity lines.",
        backend,
        config=non_streaming_config(),
        project_root=temp_dir,
    )

    summary = run.agent.last_turn_summary
    assert summary is not None
    assert summary.final_response == run.response
    # One iteration for the tool batch, one for the final completion.
    assert summary.iterations == 2
    assert len(summary.assistant_messages) == 2
    assert len(summary.tool_result_messages) == 2
    assert "assistant.tool_batch" in trace_event_names(run)
311
312
@pytest.mark.asyncio
async def test_write_file_allowed(temp_dir: Path) -> None:
    """Under the default mode, a write inside the workspace lands on disk."""
    target = temp_dir / "allowed.txt"
    backend = ScriptedBackend(
        completions=[
            native_tool_response(
                ToolCall(
                    id="write-1",
                    name="write",
                    arguments={"file_path": str(target), "content": "hello from loader\n"},
                ),
                content="I'll create the file now.",
            ),
            final_response("Successfully created the file."),
        ]
    )

    run = await run_scenario(
        "Create allowed.txt with a greeting.",
        backend,
        config=non_streaming_config(),
        project_root=temp_dir,
    )

    assert target.read_text() == "hello from loader\n"
    assert "Successfully created the file." in run.response
    assert tool_event_names(run) == ["write"]
340
341
@pytest.mark.asyncio
async def test_write_file_denied(temp_dir: Path) -> None:
    """In PROMPT mode a declined confirmation prevents the write from happening."""
    target = temp_dir / "denied.txt"
    config = non_streaming_config()
    config.permission_mode = PermissionMode.PROMPT
    backend = ScriptedBackend(
        completions=[
            native_tool_response(
                ToolCall(
                    id="write-1",
                    name="write",
                    arguments={"file_path": str(target), "content": "should not exist\n"},
                ),
                content="I'll create the file if you approve it.",
            ),
            final_response("I skipped the write as requested."),
        ]
    )

    async def deny_confirmation(tool_name: str, message: str, details: str) -> bool:
        # The prompt must identify the tool, ask for approval, and surface the mode.
        assert tool_name == "write"
        assert "approval" in message.lower()
        assert "active_mode=prompt" in details
        return False

    run = await run_scenario(
        "Create denied.txt with a greeting.",
        backend,
        config=config,
        project_root=temp_dir,
        on_confirmation=deny_confirmation,
    )

    assert not target.exists()
    assert "skipped the write" in run.response.lower()
    assert any(event.type == "confirmation" for event in run.events)
378
379
@pytest.mark.asyncio
async def test_bash_stdout_roundtrip(temp_dir: Path, monkeypatch: pytest.MonkeyPatch) -> None:
    """Bash stdout (pwd in the temp dir) is round-tripped into the tool result."""
    monkeypatch.chdir(temp_dir)
    backend = ScriptedBackend(
        completions=[
            native_tool_response(
                ToolCall(id="bash-1", name="bash", arguments={"command": "pwd"}),
                content="I'll check the current directory.",
            ),
            final_response("Confirmed the working directory."),
        ]
    )

    run = await run_scenario(
        "Tell me the current directory.",
        backend,
        config=non_streaming_config(),
        project_root=temp_dir,
    )

    assert str(temp_dir) in tool_result_messages(run)[0]
    assert "Confirmed the working directory." in run.response
402
403
@pytest.mark.asyncio
async def test_bash_confirmation_prompt_approved(
    temp_dir: Path,
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """In PROMPT mode an approved bash confirmation lets the command run."""
    monkeypatch.chdir(temp_dir)
    target = temp_dir / "approved.txt"
    config = non_streaming_config()
    config.permission_mode = PermissionMode.PROMPT
    backend = ScriptedBackend(
        completions=[
            native_tool_response(
                ToolCall(id="bash-1", name="bash", arguments={"command": "touch approved.txt"}),
                content="I'll create the file after approval.",
            ),
            final_response("The shell command completed."),
        ]
    )

    async def approve_confirmation(tool_name: str, message: str, details: str) -> bool:
        # The confirmation payload must surface the exact command being run.
        assert tool_name == "bash"
        assert "approval" in message.lower()
        assert "touch approved.txt" in details
        return True

    run = await run_scenario(
        "Create approved.txt using bash.",
        backend,
        config=config,
        project_root=temp_dir,
        on_confirmation=approve_confirmation,
    )

    assert target.exists()
    assert "shell command completed" in run.response.lower()
    assert any(event.type == "confirmation" for event in run.events)
440
441
@pytest.mark.asyncio
async def test_bash_confirmation_prompt_denied(
    temp_dir: Path,
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """In PROMPT mode a denied bash confirmation stops the command from running."""
    monkeypatch.chdir(temp_dir)
    target = temp_dir / "denied-bash.txt"
    config = non_streaming_config()
    config.permission_mode = PermissionMode.PROMPT
    backend = ScriptedBackend(
        completions=[
            native_tool_response(
                ToolCall(id="bash-1", name="bash", arguments={"command": "touch denied-bash.txt"}),
                content="I'll create the file if you allow it.",
            ),
            final_response("I left the shell command undone."),
        ]
    )

    async def deny_confirmation(tool_name: str, message: str, details: str) -> bool:
        assert tool_name == "bash"
        assert "touch denied-bash.txt" in details
        return False

    run = await run_scenario(
        "Create denied-bash.txt using bash.",
        backend,
        config=config,
        project_root=temp_dir,
        on_confirmation=deny_confirmation,
    )

    assert not target.exists()
    assert "left the shell command undone" in run.response.lower()
    assert any(event.type == "confirmation" for event in run.events)
477
478
@pytest.mark.asyncio
async def test_read_only_mode_denies_write(temp_dir: Path) -> None:
    """READ_ONLY mode blocks the write tool and reports the required mode."""
    config = non_streaming_config()
    config.permission_mode = PermissionMode.READ_ONLY
    # auto_recover off so the denial surfaces directly instead of being retried.
    config.auto_recover = False
    target = temp_dir / "blocked-by-policy.txt"
    backend = ScriptedBackend(
        completions=[
            native_tool_response(
                ToolCall(
                    id="write-1",
                    name="write",
                    arguments={"file_path": str(target), "content": "denied\n"},
                ),
                content="I'll create the file.",
            ),
            final_response("The write was blocked."),
        ]
    )

    run = await run_scenario(
        "Create blocked-by-policy.txt.",
        backend,
        config=config,
        project_root=temp_dir,
    )

    assert not target.exists()
    assert any("requires workspace-write" in message for message in tool_result_messages(run))
508
509
@pytest.mark.asyncio
async def test_read_only_mode_denies_mutating_bash(temp_dir: Path) -> None:
    """READ_ONLY mode blocks a mutating bash command (touch) with the same policy message."""
    config = non_streaming_config()
    config.permission_mode = PermissionMode.READ_ONLY
    config.auto_recover = False
    target = temp_dir / "bash-blocked.txt"
    backend = ScriptedBackend(
        completions=[
            native_tool_response(
                ToolCall(
                    id="bash-1",
                    name="bash",
                    arguments={"command": f"touch {target}"},
                ),
                content="I'll create the file with bash.",
            ),
            final_response("The bash command was blocked."),
        ]
    )

    run = await run_scenario(
        "Create bash-blocked.txt using bash.",
        backend,
        config=config,
        project_root=temp_dir,
    )

    assert not target.exists()
    assert any("requires workspace-write" in message for message in tool_result_messages(run))
539
540
@pytest.mark.asyncio
async def test_read_only_mode_allows_safe_bash(temp_dir: Path) -> None:
    """READ_ONLY mode still permits a non-mutating bash command (pwd)."""
    config = non_streaming_config()
    config.permission_mode = PermissionMode.READ_ONLY
    backend = ScriptedBackend(
        completions=[
            native_tool_response(
                ToolCall(id="bash-1", name="bash", arguments={"command": "pwd"}),
                content="I'll inspect the current directory.",
            ),
            final_response("Inspected the current directory."),
        ]
    )

    run = await run_scenario(
        "Show the current directory.",
        backend,
        config=config,
        project_root=temp_dir,
    )

    assert tool_event_names(run) == ["bash"]
    # No policy denial text ("requires ...") should appear in any tool result.
    assert not any("requires" in message for message in tool_result_messages(run))
564
565
@pytest.mark.asyncio
async def test_workspace_write_denies_write_outside_root(temp_dir: Path) -> None:
    """A write targeting a path outside the project root is declined/blocked."""
    config = non_streaming_config()
    config.auto_recover = False
    outside = temp_dir.parent / "outside-root.txt"
    # Clean up any residue from a previous run so the existence assert is valid.
    if outside.exists():
        outside.unlink()

    backend = ScriptedBackend(
        completions=[
            native_tool_response(
                ToolCall(
                    id="write-1",
                    name="write",
                    arguments={"file_path": str(outside), "content": "outside\n"},
                ),
                content="I'll write outside the workspace.",
            ),
            final_response("The write was blocked."),
        ]
    )

    async def decline_confirmation(_name: str, _msg: str, _details: str) -> bool:
        return False

    run = await run_scenario(
        "Write a file outside the workspace.",
        backend,
        config=config,
        project_root=temp_dir,
        on_confirmation=decline_confirmation,
    )

    assert not outside.exists()
    # Depending on the path taken, either a confirmation decline or an
    # outside-workspace policy message is acceptable evidence of the block.
    assert any(
        "declined" in message.lower() or "outside workspace" in message.lower()
        for message in tool_result_messages(run)
    )
604
605
@pytest.mark.asyncio
async def test_danger_full_access_allows_dangerous_bash(temp_dir: Path) -> None:
    """DANGER_FULL_ACCESS runs a chmod without any denial or confirmation prompt."""
    target = temp_dir / "mode.txt"
    target.write_text("hello\n")
    config = non_streaming_config()
    config.permission_mode = PermissionMode.DANGER_FULL_ACCESS
    backend = ScriptedBackend(
        completions=[
            native_tool_response(
                ToolCall(
                    id="bash-1",
                    name="bash",
                    arguments={"command": f"chmod 600 {target}"},
                ),
                content="I'll change the file permissions.",
            ),
            final_response("Updated the file permissions."),
        ]
    )

    run = await run_scenario(
        "Lock down mode.txt permissions.",
        backend,
        config=config,
        project_root=temp_dir,
    )

    assert tool_event_names(run) == ["bash"]
    assert not any("requires" in message for message in tool_result_messages(run))
    assert not any(event.type == "confirmation" for event in run.events)
636
637
@pytest.mark.asyncio
async def test_prompt_mode_prompts_destructive_write(temp_dir: Path) -> None:
    """PROMPT mode asks before a write; approval lets the file be created."""
    target = temp_dir / "prompted.txt"
    config = non_streaming_config()
    config.permission_mode = PermissionMode.PROMPT
    backend = ScriptedBackend(
        completions=[
            native_tool_response(
                ToolCall(
                    id="write-1",
                    name="write",
                    arguments={"file_path": str(target), "content": "prompted\n"},
                ),
                content="I'll create the file after approval.",
            ),
            final_response("The file was created."),
        ]
    )
    prompts: list[str] = []

    async def approve_confirmation(tool_name: str, message: str, details: str) -> bool:
        assert tool_name == "write"
        prompts.append(details)
        return True

    run = await run_scenario(
        "Create prompted.txt after approval.",
        backend,
        config=config,
        project_root=temp_dir,
        on_confirmation=approve_confirmation,
    )

    assert target.read_text() == "prompted\n"
    assert prompts and "active_mode=prompt" in prompts[0]
    assert any(event.type == "confirmation" for event in run.events)
674
675
@pytest.mark.asyncio
async def test_allow_mode_skips_prompt_for_destructive_write(temp_dir: Path) -> None:
    """ALLOW mode writes directly — the confirmation callback is never invoked."""
    target = temp_dir / "allow-mode.txt"
    config = non_streaming_config()
    config.permission_mode = PermissionMode.ALLOW
    backend = ScriptedBackend(
        completions=[
            native_tool_response(
                ToolCall(
                    id="write-1",
                    name="write",
                    arguments={"file_path": str(target), "content": "allow mode\n"},
                ),
                content="I'll create the file directly.",
            ),
            final_response("The file was created."),
        ]
    )
    prompts: list[str] = []

    # Would deny if ever called; the test asserts it is never reached.
    async def unexpected_confirmation(tool_name: str, message: str, details: str) -> bool:
        prompts.append(tool_name)
        return False

    run = await run_scenario(
        "Create allow-mode.txt directly.",
        backend,
        config=config,
        project_root=temp_dir,
        on_confirmation=unexpected_confirmation,
    )

    assert target.read_text() == "allow mode\n"
    assert prompts == []
    assert not any(event.type == "confirmation" for event in run.events)
    assert "The file was created." in run.response
712
713
@pytest.mark.asyncio
async def test_deny_rule_blocks_allowed_mode(temp_dir: Path) -> None:
    """A project deny rule overrides ALLOW mode and emits a permission_denied trace."""
    loader_root = temp_dir / ".loader"
    loader_root.mkdir()
    (loader_root / "permission-rules.json").write_text(
        '{"deny": [{"tool": "write", "path_contains": "secrets"}]}\n'
    )
    target = temp_dir / "secrets.txt"
    config = non_streaming_config()
    config.permission_mode = PermissionMode.ALLOW
    config.auto_recover = False
    backend = ScriptedBackend(
        completions=[
            native_tool_response(
                ToolCall(
                    id="write-1",
                    name="write",
                    arguments={"file_path": str(target), "content": "denied\n"},
                ),
                content="I'll write the secret file.",
            ),
            final_response("The write was blocked by policy."),
        ]
    )

    run = await run_scenario(
        "Create secrets.txt.",
        backend,
        config=config,
        project_root=temp_dir,
    )

    assert not target.exists()
    assert any("denied by rule" in message for message in tool_result_messages(run))
    assert "tool.permission_denied" in trace_event_names(run)
749
750
@pytest.mark.asyncio
async def test_ask_rule_prompts_even_when_mode_would_allow(temp_dir: Path) -> None:
    """A project ask rule forces a confirmation even under ALLOW mode."""
    loader_root = temp_dir / ".loader"
    loader_root.mkdir()
    (loader_root / "permission-rules.json").write_text(
        '{"ask": [{"tool": "write", "path_contains": "README"}]}\n'
    )
    target = temp_dir / "README.md"
    config = non_streaming_config()
    config.permission_mode = PermissionMode.ALLOW
    backend = ScriptedBackend(
        completions=[
            native_tool_response(
                ToolCall(
                    id="write-1",
                    name="write",
                    arguments={"file_path": str(target), "content": "ask rule\n"},
                ),
                content="I'll update the README if you approve it.",
            ),
            final_response("The write was declined."),
        ]
    )
    prompts: list[str] = []

    async def deny_confirmation(tool_name: str, message: str, details: str) -> bool:
        prompts.append(details)
        return False

    run = await run_scenario(
        "Update README.md.",
        backend,
        config=config,
        project_root=temp_dir,
        on_confirmation=deny_confirmation,
    )

    assert not target.exists()
    # The prompt details must name the ask rule that triggered the confirmation.
    assert prompts and "matched_ask_rule=tool=write, path_contains=README" in prompts[0]
    assert any(event.type == "confirmation" for event in run.events)
    assert "declined" in run.response.lower()
792
793
@pytest.mark.asyncio
async def test_raw_json_tool_call_fallback(temp_dir: Path) -> None:
    """A tool call emitted as raw JSON text (split across stream chunks) is recovered."""
    fixture = temp_dir / "fixture.txt"
    fixture.write_text("alpha parity line\n")
    raw_json = f'{{"name": "read", "arguments": {{"file_path": "{fixture}"}}}}'

    backend = ScriptedBackend(
        streams=[
            [
                # Split mid-JSON to exercise reassembly from partial chunks.
                StreamChunk(content=raw_json[:25], is_done=False),
                StreamChunk(content=raw_json[25:], full_content=raw_json, is_done=True),
            ],
            [
                StreamChunk(
                    content="Recovered the raw JSON tool call and read the file.",
                    full_content="Recovered the raw JSON tool call and read the file.",
                    is_done=True,
                )
            ],
        ]
    )

    run = await run_scenario(
        "Read the fixture file.",
        backend,
        config=AgentConfig(auto_context=False, max_iterations=8),
        project_root=temp_dir,
    )

    assert tool_event_names(run) == ["read"]
    assert any("alpha parity line" in message for message in tool_result_messages(run))
    assert "Recovered the raw JSON tool call" in run.response
826
827
@pytest.mark.asyncio
async def test_raw_json_todowrite_tool_call_fallback(temp_dir: Path) -> None:
    """A raw-JSON TodoWrite call is recovered and persisted to the todo store."""
    raw_json = json.dumps(
        {
            "name": "TodoWrite",
            "arguments": {
                "todos": [
                    {
                        "content": "Run tests",
                        "active_form": "Running tests",
                        "status": "completed",
                    }
                ]
            },
        }
    )
    backend = ScriptedBackend(
        completions=[
            CompletionResponse(content=raw_json),
            final_response("Tracked the current todo list."),
        ]
    )

    run = await run_scenario(
        "Track the current work items.",
        backend,
        config=non_streaming_config(),
        project_root=temp_dir,
    )

    todo_store = temp_dir / ".loader" / "todos" / "active.json"
    assert tool_event_names(run) == ["TodoWrite"]
    # NOTE(review): an all-completed todo list apparently persists as an empty
    # active store — confirm this pruning behavior against the TodoWrite tool.
    assert json.loads(todo_store.read_text()) == []
    assert "Tracked the current todo list." in run.response
862
863
@pytest.mark.asyncio
async def test_raw_json_patch_tool_call_fallback(temp_dir: Path) -> None:
    """A raw-JSON patch call with structured hunks is recovered and applied."""
    target = temp_dir / "sample.txt"
    target.write_text("alpha\nbeta\ngamma\n")
    raw_json = json.dumps(
        {
            "name": "patch",
            "arguments": {
                "file_path": str(target),
                "hunks": [
                    {
                        "old_start": 2,
                        "old_lines": 1,
                        "new_start": 2,
                        "new_lines": 1,
                        "lines": ["-beta", "+beta updated"],
                    }
                ],
            },
        }
    )
    backend = ScriptedBackend(
        completions=[
            CompletionResponse(content=raw_json),
            final_response("Patched sample.txt."),
        ]
    )

    run = await run_scenario(
        "Update sample.txt.",
        backend,
        config=non_streaming_config(),
        project_root=temp_dir,
    )

    assert tool_event_names(run) == ["patch"]
    assert target.read_text() == "alpha\nbeta updated\ngamma\n"
    assert "Patched sample.txt." in run.response
902
903
@pytest.mark.asyncio
async def test_native_patch_tool_accepts_unified_diff_string(temp_dir: Path) -> None:
    """The patch tool also accepts a unified-diff string via the 'patch' argument."""
    target = temp_dir / "sample.txt"
    target.write_text("alpha\nbeta\ngamma\n")

    backend = ScriptedBackend(
        completions=[
            native_tool_response(
                ToolCall(
                    id="patch-1",
                    name="patch",
                    arguments={
                        "file_path": str(target),
                        "patch": (
                            "--- a/sample.txt\n"
                            "+++ b/sample.txt\n"
                            "@@ -2,1 +2,1 @@\n"
                            "-beta\n"
                            "+beta updated\n"
                        ),
                    },
                ),
                content="I'll patch the file directly.",
            ),
            final_response("Patched sample.txt."),
        ]
    )

    run = await run_scenario(
        "Update sample.txt.",
        backend,
        config=non_streaming_config(),
        project_root=temp_dir,
    )

    assert tool_event_names(run) == ["patch"]
    assert target.read_text() == "alpha\nbeta updated\ngamma\n"
    assert "Patched sample.txt." in run.response
942
943
@pytest.mark.asyncio
async def test_raw_json_ask_user_question_tool_call_fallback(temp_dir: Path) -> None:
    """A raw-JSON AskUserQuestion call is recovered; options render as 'label - description'."""
    raw_json = json.dumps(
        {
            "name": "AskUserQuestion",
            "arguments": {
                "title": "Path Choice",
                "context": "Choose the safer Loader cleanup path.",
                "question": "Which path should we take?",
                "options": [
                    {
                        "label": "Plan first",
                        "description": "Keep the next move documented.",
                    },
                    {
                        "label": "Execute now",
                        "description": "Start changing code immediately.",
                    },
                ],
            },
        }
    )
    backend = ScriptedBackend(
        completions=[
            CompletionResponse(content=raw_json),
            final_response("We'll execute now."),
        ]
    )

    async def answer(question: str, options: list[str] | None) -> str:
        assert "Which path should we take?" in question
        assert options == [
            "Plan first - Keep the next move documented.",
            "Execute now - Start changing code immediately.",
        ]
        # "2" selects the second option by index.
        return "2"

    run = await run_scenario(
        "Decide the next path before changing code.",
        backend,
        config=non_streaming_config(),
        project_root=temp_dir,
        on_user_question=answer,
    )

    assert tool_event_names(run) == ["AskUserQuestion"]
    assert any("Execute now" in message for message in tool_result_messages(run))
    assert "We'll execute now." in run.response
992
993
@pytest.mark.asyncio
async def test_raw_bracket_ask_user_question_tool_call_fallback(temp_dir: Path) -> None:
    """Bracket-style pseudo tool-call text ('[calls ... tool with: ...]') is recovered while streaming."""
    backend = ScriptedBackend(
        streams=[
            [
                StreamChunk(
                    content='[calls askuserquestion tool with: question="Which path should we take?"]',
                    full_content='[calls askuserquestion tool with: question="Which path should we take?"]',
                    is_done=True,
                )
            ],
            [
                StreamChunk(
                    content="We'll plan first.",
                    full_content="We'll plan first.",
                    is_done=True,
                )
            ],
        ]
    )

    async def answer(question: str, options: list[str] | None) -> str:
        assert "Which path should we take?" in question
        # Bracket form carries no options payload.
        assert options is None
        return "Plan first"

    run = await run_scenario(
        "Read the fixture file.",
        backend,
        config=AgentConfig(auto_context=False, max_iterations=8),
        project_root=temp_dir,
        on_user_question=answer,
    )

    assert tool_event_names(run) == ["AskUserQuestion"]
    assert any('"answer": "Plan first"' in message for message in tool_result_messages(run))
    assert "We'll plan first." in run.response
1031
1032
@pytest.mark.asyncio
async def test_non_streaming_bracket_ask_user_question_tool_call_fallback(
    temp_dir: Path,
) -> None:
    """The bracket-style fallback also works on the non-streaming complete() path."""
    backend = ScriptedBackend(
        completions=[
            CompletionResponse(
                content='[calls askuserquestion tool with: question="Which path should we take?"]'
            ),
            final_response("We'll plan first."),
        ]
    )

    async def answer(question: str, options: list[str] | None) -> str:
        assert "Which path should we take?" in question
        assert options is None
        return "Plan first"

    run = await run_scenario(
        "Read the fixture file.",
        backend,
        config=non_streaming_config(),
        project_root=temp_dir,
        on_user_question=answer,
    )

    assert tool_event_names(run) == ["AskUserQuestion"]
    assert any('"answer": "Plan first"' in message for message in tool_result_messages(run))
    assert "We'll plan first." in run.response
1062
1063
@pytest.mark.asyncio
async def test_native_and_raw_tool_paths_share_executor_trace(temp_dir: Path) -> None:
    """Native and raw-text tool calls flow through the same executor trace events,
    differing only in the recorded source ("native" vs "raw_text")."""
    native_fixture = temp_dir / "native.txt"
    native_fixture.write_text("native parity line\n")
    native_backend = ScriptedBackend(
        completions=[
            native_tool_response(
                ToolCall(id="read-1", name="read", arguments={"file_path": str(native_fixture)}),
                content="I'll inspect the native tool result.",
            ),
            final_response("Native read complete."),
        ]
    )
    native_run = await run_scenario(
        "Read native.txt.",
        native_backend,
        config=non_streaming_config(),
        project_root=temp_dir,
    )

    raw_fixture = temp_dir / "raw.txt"
    raw_fixture.write_text("raw parity line\n")
    raw_json = f'{{"name": "read", "arguments": {{"file_path": "{raw_fixture}"}}}}'
    raw_backend = ScriptedBackend(
        streams=[
            [
                StreamChunk(content=raw_json[:20], is_done=False),
                StreamChunk(content=raw_json[20:], full_content=raw_json, is_done=True),
            ],
            [
                StreamChunk(
                    content="Raw read complete.",
                    full_content="Raw read complete.",
                    is_done=True,
                )
            ],
        ]
    )
    raw_run = await run_scenario(
        "Read raw.txt.",
        raw_backend,
        config=AgentConfig(auto_context=False, max_iterations=8),
        project_root=temp_dir,
    )

    # Both paths must emit the shared executor trace milestones.
    for run in (native_run, raw_run):
        names = trace_event_names(run)
        assert "assistant.tool_batch" in names
        assert "tool.received" in names
        assert "tool.executed" in names

    native_summary = native_run.agent.last_turn_summary
    raw_summary = raw_run.agent.last_turn_summary
    assert native_summary is not None
    assert raw_summary is not None
    assert any(
        event.name == "tool.received" and event.data["source"] == "native"
        for event in native_summary.trace
    )
    assert any(
        event.name == "tool.received" and event.data["source"] == "raw_text"
        for event in raw_summary.trace
    )
1127
1128
@pytest.mark.asyncio
async def test_backend_capability_probe_refreshes_native_tool_mode(
    temp_dir: Path,
) -> None:
    """Probing an unknown model refreshes its capability profile mid-run.

    The backend initially reports no native-tool support; after the agent
    calls describe_model(), capability_profile() resolves with the probed
    family details and the native read path is used.
    """
    fixture = temp_dir / "fixture.txt"
    fixture.write_text("capability probe line\n")

    class LazyCapabilityBackend(ScriptedBackend):
        """Backend whose capability profile only resolves after describe_model()."""

        def __init__(self, completions: list[CompletionResponse]) -> None:
            super().__init__(completions=completions, supports_native_tools=False)
            self.model = "custom-qwen-build"
            self._described = False

        async def describe_model(self) -> dict[str, dict[str, list[str]]]:
            """Record that the probe ran and return the model family details."""
            self._described = True
            return {"details": {"families": ["qwen2.5"]}}

        def capability_profile(self):
            """Resolve with family details only once the probe has happened."""
            model_details = (
                {"details": {"families": ["qwen2.5"]}} if self._described else None
            )
            return resolve_capability_profile(
                self.model,
                model_details=model_details,
            )

    backend = LazyCapabilityBackend(
        completions=[
            native_tool_response(
                ToolCall(id="read-1", name="read", arguments={"file_path": str(fixture)}),
                content="I'll inspect that file after probing capabilities.",
            ),
            final_response("Capability probing enabled the native read."),
        ]
    )

    run = await run_scenario(
        "Read the fixture file after checking model capabilities.",
        backend,
        config=non_streaming_config(),
        project_root=temp_dir,
    )

    assert backend._described
    assert not run.agent.use_react
    assert run.invocations[0].tools is not None
    assert tool_event_names(run) == ["read"]
    assert "Capability probing enabled the native read." in run.response
1177
1178
@pytest.mark.asyncio
async def test_run_streaming_delegates_to_primary_runtime(temp_dir: Path) -> None:
    """run_streaming surfaces tool calls/results and records a turn summary."""
    fixture = temp_dir / "streaming.txt"
    fixture.write_text("streamed runtime line\n")
    first_turn = [
        StreamChunk(
            content="I'll inspect the file now.",
            full_content="I'll inspect the file now.",
            tool_calls=[
                ToolCall(id="read-1", name="read", arguments={"file_path": str(fixture)})
            ],
            is_done=True,
        )
    ]
    second_turn = [
        StreamChunk(
            content="Finished reading the streamed fixture.",
            full_content="Finished reading the streamed fixture.",
            is_done=True,
        )
    ]
    backend = ScriptedBackend(streams=[first_turn, second_turn])
    agent = Agent(
        backend=backend,
        config=AgentConfig(auto_context=False, max_iterations=8),
        project_root=temp_dir,
    )

    events = []
    async for event in agent.run_streaming("Read the streamed fixture file."):
        events.append(event)

    saw_read_call = any(
        event.type == "tool_call" and event.tool_name == "read" for event in events
    )
    saw_read_result = any(
        event.type == "tool_result" and "streamed runtime line" in event.content
        for event in events
    )
    assert saw_read_call
    assert saw_read_result
    summary = agent.last_turn_summary
    assert summary is not None
    assert summary.final_response.startswith("Finished reading the streamed fixture.")
1221
1222
@pytest.mark.asyncio
async def test_definition_of_done_verify_phase(temp_dir: Path) -> None:
    """A simple file-creation task drives the DoD through draft -> verifying -> done."""
    target = temp_dir / "verified.txt"
    write_call = ToolCall(
        id="write-1",
        name="write",
        arguments={"file_path": str(target), "content": "verified\n"},
    )
    backend = ScriptedBackend(
        completions=[
            native_tool_response(write_call, content="I'll create the file now."),
            final_response("Created verified.txt."),
        ]
    )

    run = await run_scenario(
        "Create verified.txt with a line of text.",
        backend,
        config=non_streaming_config(),
        project_root=temp_dir,
    )

    assert verification_commands(run) == [f"test -f {target}"]
    assert dod_statuses(run) == ["draft", "verifying", "done"]
    assert "Verification:" in run.response
    summary = run.agent.last_turn_summary
    assert summary is not None
    assert summary.verification_status == "passed"
    assert summary.definition_of_done is not None
1253
1254
@pytest.mark.asyncio
async def test_verify_failure_routes_to_fix_loop(
    temp_dir: Path,
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """A failing verification command routes the turn into the fix loop.

    The first scripted write produces deliberately broken Python so
    `python broken.py` fails; the second write repairs it, after which
    verification passes.
    """
    monkeypatch.chdir(temp_dir)
    target = temp_dir / "broken.py"
    backend = ScriptedBackend(
        completions=[
            native_tool_response(
                ToolCall(
                    id="write-1",
                    name="write",
                    # Intentionally invalid syntax so verification fails first.
                    arguments={"file_path": str(target), "content": "print(\n"},
                ),
                content="I'll create the script.",
            ),
            final_response("Created broken.py."),
            native_tool_response(
                ToolCall(
                    id="write-2",
                    name="write",
                    arguments={
                        "file_path": str(target),
                        "content": "print('fixed from verify loop')\n",
                    },
                ),
                content="I'll fix the verification failure.",
            ),
            final_response("Fixed broken.py."),
        ]
    )

    run = await run_scenario(
        "Create broken.py and make sure it runs.",
        backend,
        config=non_streaming_config(),
        project_root=temp_dir,
    )

    assert target.read_text() == "print('fixed from verify loop')\n"
    # The verify command runs once per attempt: fail, then pass.
    assert verification_commands(run) == ["python broken.py", "python broken.py"]
    assert "fixing" in dod_statuses(run)
    assert "Verification:" in run.response
    assert run.agent.last_turn_summary is not None
    assert run.agent.last_turn_summary.verification_status == "passed"
1301
1302
@pytest.mark.asyncio
async def test_verify_retry_budget_exhaustion(
    temp_dir: Path,
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """With a retry budget of 1, repeated verify failures end in a failed DoD.

    Both scripted writes emit the same broken script, so verification never
    passes; the run must report the failure instead of looping further.
    """
    monkeypatch.chdir(temp_dir)
    target = temp_dir / "still-broken.py"
    config = non_streaming_config()
    config.verification_retry_budget = 1
    backend = ScriptedBackend(
        completions=[
            native_tool_response(
                ToolCall(
                    id="write-1",
                    name="write",
                    arguments={"file_path": str(target), "content": "print(\n"},
                ),
                content="I'll create the script.",
            ),
            final_response("Created still-broken.py."),
            native_tool_response(
                ToolCall(
                    id="write-2",
                    name="write",
                    # The "fix" is identical broken syntax, so verify fails again.
                    arguments={"file_path": str(target), "content": "print(\n"},
                ),
                content="I'll try one more fix.",
            ),
            final_response("Tried to fix still-broken.py."),
        ]
    )

    run = await run_scenario(
        "Create still-broken.py and make sure it runs.",
        backend,
        config=config,
        project_root=temp_dir,
    )

    assert "couldn't verify" in run.response.lower()
    assert dod_statuses(run)[-1] == "failed"
    assert run.agent.last_turn_summary is not None
    assert run.agent.last_turn_summary.verification_status == "failed"
1346
1347
@pytest.mark.asyncio
async def test_ambiguous_prompt_routes_to_clarify(temp_dir: Path) -> None:
    """An ambiguous prompt routes to clarify mode and persists a clarify brief.

    The scripted turn asks one AskUserQuestion, then emits a full clarify
    brief; the workflow should record clarify -> execute and write the brief
    artifact to disk.
    """
    backend = ScriptedBackend(
        completions=[
            native_tool_response(
                ToolCall(
                    id="ask-1",
                    name="AskUserQuestion",
                    arguments={
                        "question": (
                            "What outcome matters most, and what should stay out of scope?"
                        )
                    },
                ),
                content="I need one clarification first.",
            ),
            # The clarify-brief markdown the router is expected to persist.
            final_response(
                "\n".join(
                    [
                        "## Task Statement",
                        "Improve Loader so it feels more like claw-code.",
                        "",
                        "## Desired Outcome",
                        "- Make Loader more reliable without broad redesign.",
                        "",
                        "## In Scope",
                        "- Tighten the runtime workflow around the user-facing goal.",
                        "",
                        "## Non Goals",
                        "- Rebuild unrelated subsystems.",
                        "",
                        "## Decision Boundaries",
                        "- Escalate before changing unrelated UX patterns.",
                        "",
                        "## Constraints",
                        "- Stay inside the current repository.",
                        "",
                        "## Likely Touchpoints",
                        "- Runtime entry points and prompt behavior.",
                        "",
                        "## Assumptions",
                        "- The user wants a narrow runtime-quality improvement.",
                        "",
                        "## Acceptance Criteria",
                        "- The improvement stays focused on runtime behavior.",
                    ]
                )
            ),
            final_response("I have the brief and can move forward."),
        ]
    )

    async def answer(question: str, options: list[str] | None) -> str:
        assert "outcome matters most" in question.lower()
        assert options is None
        return "Do not redesign the whole interface."

    run = await run_scenario(
        "Improve Loader so it feels more like claw-code.",
        backend,
        config=non_streaming_config(),
        project_root=temp_dir,
        on_user_question=answer,
    )

    dod = run.agent.last_turn_summary.definition_of_done
    assert dod is not None
    assert workflow_modes(run)[:2] == ["clarify", "execute"]
    assert artifact_kinds(run) == ["clarify_brief"]
    assert dod.clarify_brief is not None
    assert Path(dod.clarify_brief).exists()
1419
1420
@pytest.mark.asyncio
async def test_complex_prompt_routes_to_plan(temp_dir: Path) -> None:
    """A complex prompt routes to plan mode and wires the verification plan.

    The first scripted completion emits an implementation plan plus a
    verification plan (separated by the <<<VERIFICATION>>> marker); the
    run should then execute, verify with the planned command, and avoid
    any decomposition/subtask events.
    """
    target = temp_dir / "planned.txt"
    backend = ScriptedBackend(
        completions=[
            final_response(
                "\n".join(
                    [
                        "# Implementation Plan",
                        "",
                        "## File Changes",
                        f"- Create {target.name} in the workspace root.",
                        "",
                        "## Execution Order",
                        f"1. Write {target.name}.",
                        "2. Confirm the file exists.",
                        "",
                        "## Risks",
                        "- Writing the wrong file path.",
                        "",
                        "<<<VERIFICATION>>>",
                        "",
                        "# Verification Plan",
                        "",
                        "## Acceptance Criteria",
                        f"- {target.name} exists in the workspace root.",
                        "",
                        "## Verification Commands",
                        f"- `test -f {target}`",
                        "",
                        "## Notes",
                        "- Use a deterministic file existence check.",
                    ]
                )
            ),
            native_tool_response(
                ToolCall(
                    id="write-1",
                    name="write",
                    arguments={"file_path": str(target), "content": "planned output\n"},
                ),
                content="I'll create the file now.",
            ),
            final_response("The file is in place."),
        ]
    )

    run = await run_scenario(
        "Implement a persistent workflow mode router with clarify artifacts, "
        "planning artifacts, and verification-plan wiring in the runtime.",
        backend,
        config=non_streaming_config(),
        project_root=temp_dir,
    )

    dod = run.agent.last_turn_summary.definition_of_done
    assert dod is not None
    assert workflow_modes(run)[:3] == ["plan", "execute", "verify"]
    assert artifact_kinds(run) == ["implementation_plan", "verification_plan"]
    assert not any(event.type == "decomposition" for event in run.events)
    assert not any(event.type == "subtask" for event in run.events)
    assert dod.verification_commands == [f"test -f {target}"]
    assert verification_commands(run) == [f"test -f {target}"]
1484
1485
@pytest.mark.asyncio
async def test_verify_failure_fix_loop_does_not_reroute_workflow(temp_dir: Path) -> None:
    """A verify-failure fix loop stays in execute/verify without re-planning.

    The planned `grep -q fixed` check fails after the first draft write and
    passes after the corrective write; the plan phase must run exactly once
    and clarify must never be entered.
    """
    target = temp_dir / "retry.txt"
    backend = ScriptedBackend(
        completions=[
            final_response(
                "\n".join(
                    [
                        "# Implementation Plan",
                        "",
                        "## File Changes",
                        f"- Create {target.name}.",
                        "",
                        "## Execution Order",
                        f"1. Write {target.name}.",
                        "2. Fix it if verification fails.",
                        "",
                        "## Risks",
                        "- Initial content may be wrong.",
                        "",
                        "<<<VERIFICATION>>>",
                        "",
                        "# Verification Plan",
                        "",
                        "## Acceptance Criteria",
                        "- The file contains the word fixed.",
                        "",
                        "## Verification Commands",
                        f"- `grep -q fixed {target}`",
                        "",
                        "## Notes",
                        "- Retry if the first write misses the target string.",
                    ]
                )
            ),
            native_tool_response(
                ToolCall(
                    id="write-1",
                    name="write",
                    # First draft omits "fixed", so the grep check fails.
                    arguments={"file_path": str(target), "content": "draft output\n"},
                ),
                content="I'll write the first draft.",
            ),
            final_response("First draft is written."),
            native_tool_response(
                ToolCall(
                    id="write-2",
                    name="write",
                    arguments={"file_path": str(target), "content": "fixed output\n"},
                ),
                content="I'll correct the file.",
            ),
            final_response("The file now contains the fixed output."),
        ]
    )

    run = await run_scenario(
        "Implement a persistent workflow mode router with clarify artifacts, "
        "planning artifacts, and verification-plan wiring in the runtime.",
        backend,
        config=non_streaming_config(),
        project_root=temp_dir,
    )

    modes = workflow_modes(run)
    assert modes.count("plan") == 1
    assert modes.count("clarify") == 0
    assert modes.count("execute") >= 2
    assert modes.count("verify") >= 2
1555
1556
@pytest.mark.asyncio
async def test_conversational_task_skips_verify_phase() -> None:
    """Plain chit-chat yields a direct reply with no DoD and no turn summary."""
    greeting = StreamChunk(content="Hello there.", full_content="Hello there.", is_done=True)
    backend = ScriptedBackend(streams=[[greeting]])

    run = await run_scenario("hello there", backend, config=AgentConfig(auto_context=False))

    assert run.response == "Hello there."
    assert not dod_statuses(run)
    assert run.agent.last_turn_summary is None
1572
1573
@pytest.mark.asyncio
async def test_explore_mode_skips_dod_and_router(temp_dir: Path) -> None:
    """Explore mode answers a lookup without DoD, workflow routing, or write tools.

    A single grep call should complete the run; no DoD artifacts are created
    and the advertised tool list must exclude mutating tools like write.
    """
    target = temp_dir / "feature.py"
    target.write_text("def important_helper():\n    return 1\n")
    backend = ScriptedBackend(
        completions=[
            native_tool_response(
                ToolCall(
                    id="grep-1",
                    name="grep",
                    arguments={
                        "pattern": "important_helper",
                        "path": str(temp_dir),
                        "include": "*.py",
                    },
                ),
                content="I'll search for that helper.",
            ),
            final_response("important_helper is defined in feature.py."),
        ]
    )

    run = await run_explore_scenario(
        "Where is important_helper defined?",
        backend,
        config=non_streaming_config(),
        project_root=temp_dir,
    )

    assert "feature.py" in run.response
    assert tool_event_names(run) == ["grep"]
    assert not dod_statuses(run)
    assert not workflow_modes(run)
    assert run.agent.last_turn_summary is not None
    assert run.agent.last_turn_summary.definition_of_done is None
    assert run.agent.last_turn_summary.workflow_mode == "explore"
    assert "explore.completed" in trace_event_names(run)
    assert not (temp_dir / ".loader" / "dod").exists()
    assert run.invocations[0].tools is not None
    assert "write" not in {tool["name"] for tool in run.invocations[0].tools or []}
1614
1615
@pytest.mark.asyncio
async def test_explore_mode_denies_write(temp_dir: Path) -> None:
    """Explore mode rejects write attempts even under WORKSPACE_WRITE permissions."""
    target = temp_dir / "new.txt"
    config = non_streaming_config()
    config.permission_mode = PermissionMode.WORKSPACE_WRITE
    write_attempt = ToolCall(
        id="write-1",
        name="write",
        arguments={
            "file_path": str(target),
            "content": "not allowed\n",
        },
    )
    backend = ScriptedBackend(
        completions=[
            native_tool_response(write_attempt, content="I'll write a file."),
            final_response("Explore mode is read-only, so I cannot make that change here."),
        ]
    )

    run = await run_explore_scenario(
        "Create a new file anyway.",
        backend,
        config=config,
        project_root=temp_dir,
    )

    assert not target.exists()
    assert tool_event_names(run) == ["write"]
    result_messages = tool_result_messages(run)
    assert any("read-only" in message.lower() for message in result_messages)
    assert "cannot make that change" in run.response.lower()
    assert "tool.permission_denied" in trace_event_names(run)
    assert not dod_statuses(run)
    assert not workflow_modes(run)
    assert not (temp_dir / ".loader" / "dod").exists()
1653
1654
@pytest.mark.asyncio
async def test_explore_mode_ignores_global_allow_policy(temp_dir: Path) -> None:
    """Explore mode stays read-only even when rules and mode would allow writes.

    A project-level allow rule matching the target path plus ALLOW permission
    mode must still not override explore mode's write denial.
    """
    loader_root = temp_dir / ".loader"
    loader_root.mkdir()
    # Allow rule that would normally permit this exact write.
    (loader_root / "permission-rules.json").write_text(
        '{"allow": [{"tool": "write", "path_contains": "new.txt"}]}\n'
    )
    target = temp_dir / "new.txt"
    config = non_streaming_config()
    config.permission_mode = PermissionMode.ALLOW
    backend = ScriptedBackend(
        completions=[
            native_tool_response(
                ToolCall(
                    id="write-1",
                    name="write",
                    arguments={
                        "file_path": str(target),
                        "content": "still denied\n",
                    },
                ),
                content="I'll write a file.",
            ),
            final_response("Explore mode is read-only, so I cannot make that change here."),
        ]
    )

    run = await run_explore_scenario(
        "Create a new file anyway.",
        backend,
        config=config,
        project_root=temp_dir,
    )

    assert not target.exists()
    assert any("read-only" in message.lower() for message in tool_result_messages(run))
    assert "tool.permission_denied" in trace_event_names(run)
    assert not dod_statuses(run)
    assert not workflow_modes(run)
1694
1695
@pytest.mark.asyncio
async def test_informational_completion_allows_explicit_done_without_continuation(
    temp_dir: Path,
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """An explanatory prompt ending in "Done." triggers no completion check or tools."""
    monkeypatch.chdir(temp_dir)
    target = temp_dir / "hello.py"
    backend = ScriptedBackend(completions=[final_response("Done.")])
    config = non_streaming_config(completion_check=True)

    run = await run_scenario(
        "Explain how a hello.py file would work.",
        backend,
        config=config,
        project_root=temp_dir,
    )

    assert not target.exists()
    completion_events = [event for event in run.events if event.type == "completion_check"]
    assert not completion_events
    assert tool_event_names(run) == []
    assert run.response == "Done."
1721
1722
@pytest.mark.asyncio
async def test_tool_result_contract_regression() -> None:
    """Regression: duplicate-skip and validation-failure paths must not raise TypeError.

    Errors from both branches are collected so a failure in the first branch
    does not mask a failure in the second.
    """
    errors: list[str] = []
    duplicate_path = "/tmp/already-created.txt"

    duplicate_backend = ScriptedBackend(
        completions=[
            native_tool_response(
                ToolCall(
                    id="dup-1",
                    name="write",
                    arguments={"file_path": duplicate_path, "content": "already there\n"},
                ),
                content="I'll create the file again.",
            ),
            final_response("Skipped the duplicate write."),
        ]
    )
    duplicate_agent = Agent(duplicate_backend, config=non_streaming_config())
    # Pre-record the identical write so the scripted call hits the duplicate path.
    duplicate_agent.safeguards.record_action(
        "write",
        {"file_path": duplicate_path, "content": "already there\n"},
    )

    try:
        await duplicate_agent.run("Create /tmp/already-created.txt again.")
    except TypeError as exc:
        errors.append(f"duplicate branch raised {exc}")

    validation_backend = ScriptedBackend(
        completions=[
            # Empty command should be rejected by argument validation.
            native_tool_response(
                ToolCall(id="invalid-1", name="bash", arguments={"command": ""}),
                content="I'll run that command.",
            ),
            final_response("Blocked the invalid command."),
        ]
    )
    validation_agent = Agent(validation_backend, config=non_streaming_config())

    try:
        await validation_agent.run("Run an empty command.")
    except TypeError as exc:
        errors.append(f"validation branch raised {exc}")

    assert not errors, "\n".join(errors)
1768 assert not errors, "\n".join(errors)
1769
1770
@pytest.mark.asyncio
async def test_duplicate_read_is_skipped_without_intervening_mutation(
    temp_dir: Path,
) -> None:
    """Rereading an unchanged file is short-circuited as a duplicate action."""
    fixture = temp_dir / "index.html"
    fixture.write_text("alpha parity line\n")

    first_read = ToolCall(id="read-1", name="read", arguments={"file_path": str(fixture)})
    repeat_read = ToolCall(id="read-2", name="read", arguments={"file_path": str(fixture)})
    backend = ScriptedBackend(
        completions=[
            native_tool_response(first_read, content="I'll inspect the file."),
            native_tool_response(repeat_read, content="I'll reread the same file."),
            final_response("I'll use the existing file contents instead of rereading."),
        ]
    )

    run = await run_scenario(
        "Inspect index.html and keep moving.",
        backend,
        config=non_streaming_config(),
        project_root=temp_dir,
    )

    assert tool_event_names(run) == ["read", "read"]
    messages = tool_result_messages(run)
    assert any("alpha parity line" in message for message in messages)
    duplicate_notes = [
        message
        for message in messages
        if "Skipped - duplicate action" in message and "Already read" in message
    ]
    assert duplicate_notes
    assert "existing file contents" in run.response
1807
1808
@pytest.mark.asyncio
async def test_duplicate_observation_queues_steering_to_reuse_prior_evidence(
    temp_dir: Path,
) -> None:
    """A repeated read of the same file queues steering to reuse prior evidence.

    After a glob and a first read of index.html, a second read of the same
    index should produce both a reuse hint in the tool result and a steering
    event naming the file.
    """
    chapters = temp_dir / "chapters"
    chapters.mkdir()
    (chapters / "01-introduction.html").write_text("<h1>Chapter 1: Introduction to Fortran</h1>\n")
    (chapters / "02-setup.html").write_text("<h1>Chapter 2: Setting Up Fortran</h1>\n")
    index_file = temp_dir / "index.html"
    index_file.write_text("broken table of contents\n")

    backend = ScriptedBackend(
        completions=[
            native_tool_response(
                ToolCall(
                    id="glob-1",
                    name="glob",
                    arguments={"path": str(chapters), "pattern": "*.html"},
                ),
                content="I'll inspect the chapter inventory first.",
            ),
            native_tool_response(
                ToolCall(
                    id="read-1",
                    name="read",
                    arguments={"file_path": str(index_file)},
                ),
                content="I'll inspect the index next.",
            ),
            # Deliberate duplicate observation of the same index file.
            native_tool_response(
                ToolCall(
                    id="read-2",
                    name="read",
                    arguments={"file_path": str(index_file)},
                ),
                content="I'll reopen the index.",
            ),
            final_response("I'll reuse the earlier evidence and patch the index next."),
        ]
    )

    run = await run_scenario(
        "Update index.html so the table of contents links are correct.",
        backend,
        config=non_streaming_config(),
        project_root=temp_dir,
    )

    messages = tool_result_messages(run)
    steering_messages = [
        event.content
        for event in run.events
        if event.type == "steering" and event.content
    ]

    assert any("reuse the earlier read result instead of rereading" in message for message in messages)
    assert any("Reuse the earlier observation instead of repeating it." in message for message in steering_messages)
    assert any("index.html" in message for message in steering_messages)
1867
1868
@pytest.mark.asyncio
async def test_relative_file_read_stays_on_recent_external_context(
    temp_dir: Path,
) -> None:
    """A relative read resolves against the recently used external directory.

    After reading an index outside the project root, a follow-up read of the
    bare "index.html" must not snap back to the project root (which has no
    such file).
    """
    # External directory is a sibling of the project root, not inside it.
    external_dir = temp_dir.parent / f"{temp_dir.name}-external-guide"
    external_dir.mkdir(exist_ok=True)
    external_index = external_dir / "index.html"
    external_index.write_text("external guide index\n")

    backend = ScriptedBackend(
        completions=[
            native_tool_response(
                ToolCall(
                    id="read-1",
                    name="read",
                    arguments={"file_path": str(external_index)},
                ),
                content="I'll inspect the external index first.",
            ),
            # Relative path: should resolve to the external guide, not the repo.
            native_tool_response(
                ToolCall(
                    id="read-2",
                    name="read",
                    arguments={"file_path": "index.html"},
                ),
                content="I'll reopen index.html in the same guide.",
            ),
            final_response("I stayed on the external guide instead of snapping back to the repo."),
        ]
    )

    run = await run_scenario(
        "Inspect the external guide index twice.",
        backend,
        config=non_streaming_config(),
        project_root=temp_dir,
    )

    assert tool_event_names(run) == ["read", "read"]
    messages = tool_result_messages(run)
    assert any("external guide index" in message for message in messages)
    assert not any("File not found: index.html" in message for message in messages)
    assert any(
        "Skipped - duplicate action" in message or "external guide index" in message
        for message in messages[1:]
    )
1915
1916
@pytest.mark.asyncio
async def test_blocked_shell_text_rewrite_queues_file_tool_steering(
    temp_dir: Path,
) -> None:
    """A sed-style shell rewrite is blocked and steers toward the edit tool.

    The scripted bash call is rejected with a brittleness warning plus a
    steering event; the follow-up edit tool call then performs the change.
    """
    target = temp_dir / "notes.txt"
    target.write_text("old value\n")

    backend = ScriptedBackend(
        completions=[
            # Shell text rewrite that the runtime should block.
            native_tool_response(
                ToolCall(
                    id="bash-1",
                    name="bash",
                    arguments={"command": "sed -i '1s/old/new/' notes.txt"},
                ),
                content="I'll update the file with sed.",
            ),
            native_tool_response(
                ToolCall(
                    id="edit-1",
                    name="edit",
                    arguments={
                        "file_path": str(target),
                        "old_string": "old value",
                        "new_string": "new value",
                    },
                ),
                content="I'll switch to the edit tool instead.",
            ),
            final_response("Updated the file with Loader's file tools."),
        ]
    )

    run = await run_scenario(
        "Update notes.txt from old value to new value.",
        backend,
        config=non_streaming_config(),
        project_root=temp_dir,
    )

    assert tool_event_names(run) == ["bash", "edit"]
    assert target.read_text() == "new value\n"
    messages = tool_result_messages(run)
    assert any("Shell-based text rewrites are brittle" in message for message in messages)
    steering_messages = [
        event.content
        for event in run.events
        if event.type == "steering" and event.content
    ]
    assert any("Use Loader's file tools for this text edit" in message for message in steering_messages)
1967
1968
@pytest.mark.asyncio
async def test_blocked_html_index_edit_queues_inventory_reuse_steering(
    temp_dir: Path,
) -> None:
    """An edit pointing TOC links at nonexistent files is blocked without steering.

    The scripted edit rewrites a valid chapter link to a file that does not
    exist; the tool result must carry the broken-link warning while no
    steering events are queued.
    """
    chapters = temp_dir / "chapters"
    chapters.mkdir()
    (chapters / "05-input-output.html").write_text("<h1>Chapter 5: Input and Output</h1>\n")
    index_file = temp_dir / "index.html"
    index_file.write_text(
        '<ul class="chapter-list">\n'
        '  <li><a href="chapters/05-input-output.html">Chapter 5: Input and Output</a></li>\n'
        '</ul>\n'
    )

    backend = ScriptedBackend(
        completions=[
            native_tool_response(
                ToolCall(
                    id="glob-1",
                    name="glob",
                    arguments={"path": str(chapters), "pattern": "*.html"},
                ),
                content="I'll check which chapter files exist first.",
            ),
            # The new link targets 05-control-structures.html, which does not exist.
            native_tool_response(
                ToolCall(
                    id="edit-1",
                    name="edit",
                    arguments={
                        "file_path": str(index_file),
                        "old_string": '<li><a href="chapters/05-input-output.html">Chapter 5: Input and Output</a></li>',
                        "new_string": '<li><a href="chapters/05-control-structures.html">Chapter 5: Control Structures</a></li>',
                    },
                ),
                content="I'll update the TOC entry.",
            ),
            final_response("I'll reuse the known chapter inventory and correct the TOC."),
        ]
    )

    run = await run_scenario(
        "Fix the index table of contents so it matches the chapters directory.",
        backend,
        config=non_streaming_config(),
        project_root=temp_dir,
    )

    messages = tool_result_messages(run)
    steering_messages = [
        event.content
        for event in run.events
        if event.type == "steering" and event.content
    ]

    assert any("Edited HTML links point to files that do not exist" in message for message in messages)
    assert steering_messages == []
2025
2026
@pytest.mark.asyncio
async def test_full_path_glob_pattern_still_injects_verified_html_inventory(
    temp_dir: Path,
) -> None:
    """A glob whose pattern embeds the full path adds no inventory banner."""
    chapters = temp_dir / "chapters"
    chapters.mkdir()
    (chapters / "01-introduction.html").write_text(
        "<h1>Chapter 1: Introduction to Fortran</h1>\n"
    )
    (chapters / "02-setup.html").write_text(
        "<h1>Chapter 2: Setting Up Fortran</h1>\n"
    )
    index_file = temp_dir / "index.html"
    index_file.write_text("broken table of contents\n")

    # Pattern carries the absolute path instead of a separate "path" argument.
    glob_call = ToolCall(
        id="glob-1",
        name="glob",
        arguments={"pattern": f"{chapters}/*.html"},
    )
    backend = ScriptedBackend(
        completions=[
            native_tool_response(glob_call, content="I'll inspect the chapter inventory first."),
            final_response("I'll update index.html using the verified inventory."),
        ]
    )

    run = await run_scenario(
        "Fix index.html so the chapter links match the real chapter files.",
        backend,
        config=non_streaming_config(),
        project_root=temp_dir,
    )

    assert tool_event_names(run) == ["glob"]
    for message in tool_result_messages(run):
        assert "Verified chapter inventory:" not in message
2066
2067
@pytest.mark.asyncio
async def test_verified_html_inventory_blocks_redundant_chapter_reread(
    temp_dir: Path,
) -> None:
    """Reading a chapter file after a glob does not inject inventory banners.

    Neither the verified-inventory banner nor the sibling-inventory hint
    should appear in the tool results for this glob-then-read sequence.
    """
    chapters = temp_dir / "chapters"
    chapters.mkdir()
    (chapters / "01-introduction.html").write_text(
        "<h1>Chapter 1: Introduction to Fortran</h1>\n"
    )
    (chapters / "02-setup.html").write_text(
        "<h1>Chapter 2: Setting Up Your Environment</h1>\n"
    )
    index_file = temp_dir / "index.html"
    index_file.write_text("broken table of contents\n")

    backend = ScriptedBackend(
        completions=[
            native_tool_response(
                ToolCall(
                    id="glob-1",
                    name="glob",
                    arguments={"path": str(chapters), "pattern": "*.html"},
                ),
                content="I'll inspect the chapter inventory first.",
            ),
            native_tool_response(
                ToolCall(
                    id="read-1",
                    name="read",
                    arguments={"file_path": str(chapters / '01-introduction.html')},
                ),
                content="I'll open the first chapter file to extract its title.",
            ),
            final_response("I'll update index.html using the verified chapter inventory."),
        ]
    )

    run = await run_scenario(
        "Fix index.html so the chapter links and titles match the real chapter files.",
        backend,
        config=non_streaming_config(),
        project_root=temp_dir,
    )

    messages = tool_result_messages(run)
    assert all("Verified chapter inventory:" not in message for message in messages)
    assert all("verified sibling chapter inventory" not in message for message in messages)
2115
2116
@pytest.mark.asyncio
async def test_successful_html_toc_edit_blocks_post_success_reread_and_steers_to_finish(
    temp_dir: Path,
) -> None:
    """After a successful TOC edit, a confirming reread needs no extra nudging.

    The index starts as the stale ``old_block`` and the scripted edit rewrites
    it to ``new_block`` (matching the real chapter files). The post-edit reread
    must produce no semantic verification preview and no steering events, and
    the run finishes with the final summary.
    """
    chapters = temp_dir / "chapters"
    chapters.mkdir()
    (chapters / "01-introduction.html").write_text(
        "<h1>Chapter 1: Introduction to Fortran</h1>\n"
    )
    (chapters / "02-setup.html").write_text(
        "<h1>Chapter 2: Setting Up Your Environment</h1>\n"
    )
    index_file = temp_dir / "index.html"
    old_block = (
        '<h2>Table of Contents</h2>\n'
        '<ul class="chapter-list">\n'
        ' <li><a href="chapters/01-old.html">Chapter 1: Old</a></li>\n'
        ' <li><a href="chapters/02-old.html">Chapter 2: Old</a></li>\n'
        '</ul>\n'
    )
    new_block = (
        '<h2>Table of Contents</h2>\n'
        '<ul class="chapter-list">\n'
        ' <li><a href="chapters/01-introduction.html">Chapter 1: Introduction to Fortran</a></li>\n'
        ' <li><a href="chapters/02-setup.html">Chapter 2: Setting Up Your Environment</a></li>\n'
        '</ul>\n'
    )
    # Seed the index with the stale block directly. (The previous version
    # rebuilt this via a chain of str.replace calls on new_block, which
    # produced exactly the same bytes as old_block.)
    index_file.write_text(old_block)

    backend = ScriptedBackend(
        completions=[
            native_tool_response(
                ToolCall(
                    id="glob-1",
                    name="glob",
                    arguments={"path": str(chapters), "pattern": "*.html"},
                ),
                content="I'll inspect the chapter inventory first.",
            ),
            native_tool_response(
                ToolCall(
                    id="read-1",
                    name="read",
                    arguments={"file_path": str(index_file)},
                ),
                content="I'll inspect index.html next.",
            ),
            native_tool_response(
                ToolCall(
                    id="edit-1",
                    name="edit",
                    arguments={
                        "file_path": str(index_file),
                        "old_string": old_block,
                        "new_string": new_block,
                    },
                ),
                content="I'll fix the TOC now.",
            ),
            # Post-success reread: should pass through without extra previews.
            native_tool_response(
                ToolCall(
                    id="read-2",
                    name="read",
                    arguments={"file_path": str(index_file)},
                ),
                content="I'll reread index.html to confirm the change.",
            ),
            final_response(
                "I updated index.html so the table of contents matches the real chapter files."
            ),
        ]
    )

    run = await run_scenario(
        "Update index.html so every chapter link and title matches the real HTML files in chapters/.",
        backend,
        config=non_streaming_config(),
        project_root=temp_dir,
    )

    messages = tool_result_messages(run)
    steering_messages = [
        event.content
        for event in run.events
        if event.type == "steering" and event.content
    ]

    assert all(
        "Semantic verification preview:" not in message
        for message in messages
    )
    assert steering_messages == []
    assert "updated index.html" in run.response.lower()
2210
2211
@pytest.mark.asyncio
async def test_exact_prompt_finishes_when_index_toc_is_already_correct(
    temp_dir: Path,
) -> None:
    """When the TOC already matches the chapters, the agent finishes without edits.

    Expectations: no semantic-verification previews in tool results, no
    steering events, exactly one non-verification read of the index, and a
    final answer stating that no edit is needed.
    """
    chapter_dir = temp_dir / "chapters"
    chapter_dir.mkdir()
    (chapter_dir / "01-introduction.html").write_text(
        "<h1>Chapter 1: Introduction to Fortran</h1>\n"
    )
    (chapter_dir / "02-setup.html").write_text(
        "<h1>Chapter 2: Setting Up Your Environment</h1>\n"
    )
    toc_path = temp_dir / "index.html"
    # The index already lists the correct hrefs and link texts.
    toc_path.write_text(
        "<h2>Table of Contents</h2>\n"
        ' <ul class="chapter-list">\n'
        ' <li><a href="chapters/01-introduction.html">Chapter 1: Introduction to Fortran</a></li>\n'
        ' <li><a href="chapters/02-setup.html">Chapter 2: Setting Up Your Environment</a></li>\n'
        " </ul>\n"
    )

    scripted = ScriptedBackend(
        completions=[
            native_tool_response(
                ToolCall(
                    id="read-1",
                    name="read",
                    arguments={"file_path": str(toc_path)},
                ),
                content="I'll inspect index.html first.",
            ),
            # A second, narrower read of the same file — should be tolerated.
            native_tool_response(
                ToolCall(
                    id="read-2",
                    name="read",
                    arguments={"file_path": str(toc_path), "offset": 1, "limit": 8},
                ),
                content="I'll reread just the table-of-contents lines.",
            ),
            final_response(
                "The table of contents is already correct, so no edit is needed."
            ),
        ]
    )

    user_prompt = (
        "Have a look at ~/Loader/guides/fortran/index.html, then "
        "~/Loader/guides/fortran/chapters. The table of contents links in "
        "index.html are inaccurate and the href’s are wrong. Let’s update the "
        "links and their link texts to be correct."
    )
    result = await run_scenario(
        user_prompt,
        scripted,
        config=non_streaming_config(),
        project_root=temp_dir,
    )

    tool_outputs = tool_result_messages(result)
    steering = [
        event.content
        for event in result.events
        if event.type == "steering" and event.content
    ]

    assert not any(
        "Semantic verification preview:" in output for output in tool_outputs
    )
    assert steering == []
    # Only one read should count outside the verification phase.
    primary_reads = [
        event
        for event in result.events
        if event.type == "tool_call"
        and event.tool_name == "read"
        and event.phase != "verification"
    ]
    assert len(primary_reads) == 1
    assert "no edit is needed" in result.response.lower()
2298
2299
@pytest.mark.asyncio
async def test_interleaved_reread_is_allowed_once_without_intervening_mutation(
    temp_dir: Path,
) -> None:
    """Reading A, then B, then A again must not trip duplicate-action skipping.

    All three scripted reads should execute: the index content appears twice
    in tool results and the chapter content once, with no skip markers.
    """
    toc_path = temp_dir / "index.html"
    body_path = temp_dir / "chapter-1.html"
    toc_path.write_text("table of contents\n")
    body_path.write_text("chapter body\n")

    scripted = ScriptedBackend(
        completions=[
            native_tool_response(
                ToolCall(
                    id="read-1",
                    name="read",
                    arguments={"file_path": str(toc_path)},
                ),
                content="I'll inspect the index first.",
            ),
            native_tool_response(
                ToolCall(
                    id="read-2",
                    name="read",
                    arguments={"file_path": str(body_path)},
                ),
                content="I'll inspect the chapter next.",
            ),
            # Same target as read-1 with no mutation in between.
            native_tool_response(
                ToolCall(
                    id="read-3",
                    name="read",
                    arguments={"file_path": str(toc_path)},
                ),
                content="I'll reopen the index to reconcile the findings.",
            ),
            final_response("I re-opened the index after checking the chapter."),
        ]
    )

    result = await run_scenario(
        "Inspect the index, inspect a chapter, then return to the index.",
        scripted,
        config=non_streaming_config(),
        project_root=temp_dir,
    )

    assert tool_event_names(result) == ["read", "read", "read"]
    outputs = tool_result_messages(result)
    assert all("Skipped - duplicate action" not in output for output in outputs)
    assert len([output for output in outputs if "table of contents" in output]) == 2
    assert any("chapter body" in output for output in outputs)
2351
2352
@pytest.mark.asyncio
async def test_repeated_bash_probe_is_allowed_after_mutation(
    temp_dir: Path,
) -> None:
    """Re-running an identical bash command after a mutation must not be skipped.

    Sequence: ls probe -> edit -> identical ls probe. Because the edit
    mutates state between the two probes, the second ls should execute
    normally and no "Skipped - duplicate action" marker should appear.
    """
    target = temp_dir / "notes.txt"
    target.write_text("old value\n")
    # Identical command string for both probes — the duplicate-detection case.
    list_command = f"ls -1 {temp_dir}"

    backend = ScriptedBackend(
        completions=[
            native_tool_response(
                ToolCall(id="bash-1", name="bash", arguments={"command": list_command}),
                content="I'll inspect the directory first.",
            ),
            # Intervening mutation that should reset duplicate tracking.
            native_tool_response(
                ToolCall(
                    id="edit-1",
                    name="edit",
                    arguments={
                        "file_path": str(target),
                        "old_string": "old value",
                        "new_string": "new value",
                    },
                ),
                content="I'll update the file.",
            ),
            native_tool_response(
                ToolCall(id="bash-2", name="bash", arguments={"command": list_command}),
                content="I'll list the directory again after the edit.",
            ),
            final_response("I re-ran ls after the edit without hitting duplicate rejection."),
        ]
    )

    run = await run_scenario(
        "Inspect the directory, edit the file, then inspect again.",
        backend,
        config=non_streaming_config(),
        project_root=temp_dir,
    )

    assert tool_event_names(run) == ["bash", "edit", "bash"]
    messages = tool_result_messages(run)
    assert not any("Skipped - duplicate action" in message for message in messages)
    # Both ls runs should list the file, so it appears at least twice.
    assert sum("notes.txt" in message for message in messages) >= 2
    # Confirm the edit actually landed on disk.
    assert target.read_text() == "new value\n"