1 """Deterministic runtime parity coverage for the current Loader loop."""
2
3 from __future__ import annotations
4
5 import json
6 from pathlib import Path
7
8 import pytest
9
10 from loader.agent.loop import Agent, AgentConfig
11 from loader.llm.base import CompletionResponse, Role, StreamChunk, ToolCall
12 from loader.runtime.capabilities import resolve_capability_profile
13 from loader.runtime.permissions import PermissionMode
14 from tests.helpers.runtime_harness import (
15 ScriptedBackend,
16 run_explore_scenario,
17 run_scenario,
18 )
19
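# Keep this list in sync with tests/fixtures/runtime_parity_manifest.json;
# test_runtime_parity_manifest_matches_implemented_cases asserts an exact match.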
SCENARIO_NAMES = [
    "streaming_text",
    "read_file_roundtrip",
    "multi_tool_turn_roundtrip",
    "turn_summary_smoke_for_multi_tool_turn",
    "write_file_allowed",
    "write_file_denied",
    "bash_stdout_roundtrip",
    "bash_confirmation_prompt_approved",
    "bash_confirmation_prompt_denied",
    "read_only_mode_denies_write",
    "read_only_mode_denies_mutating_bash",
    "read_only_mode_allows_safe_bash",
    "workspace_write_denies_write_outside_root",
    "danger_full_access_allows_dangerous_bash",
    "prompt_mode_prompts_destructive_write",
    "allow_mode_skips_prompt_for_destructive_write",
    "deny_rule_blocks_allowed_mode",
    "ask_rule_prompts_even_when_mode_would_allow",
    "raw_json_tool_call_fallback",
    "raw_json_todowrite_tool_call_fallback",
    "raw_json_patch_tool_call_fallback",
    "raw_json_ask_user_question_tool_call_fallback",
    "raw_bracket_ask_user_question_tool_call_fallback",
    "native_and_raw_tool_paths_share_executor_trace",
    "backend_capability_probe_refreshes_native_tool_mode",
    "run_streaming_delegates_to_primary_runtime",
    "definition_of_done_verify_phase",
    "verify_failure_routes_to_fix_loop",
    "verify_retry_budget_exhaustion",
    "ambiguous_prompt_routes_to_clarify",
    "complex_prompt_routes_to_plan",
    "verify_failure_fix_loop_does_not_reroute_workflow",
    "conversational_task_skips_verify_phase",
    "explore_mode_skips_dod_and_router",
    "explore_mode_denies_write",
    "explore_mode_ignores_global_allow_policy",
    "non_mutating_completion_no_longer_forces_continuation",
    "tool_result_contract_regression",
]


def load_manifest() -> list[dict[str, str]]:
    """Load the auditable parity scenario manifest."""

    manifest_path = Path(__file__).parent / "fixtures" / "runtime_parity_manifest.json"
    return json.loads(manifest_path.read_text())


def non_streaming_config(*, completion_check: bool = False) -> AgentConfig:
    """Shared config for deterministic complete() tests."""

    config = AgentConfig(auto_context=False, stream=False, max_iterations=8)
    config.reasoning.completion_check = completion_check
    return config


def native_tool_response(
    *tool_calls: ToolCall,
    content: str = "Using tools.",
) -> CompletionResponse:
    """Build a completion that includes native tool calls."""

    return CompletionResponse(content=content, tool_calls=list(tool_calls))


def final_response(content: str) -> CompletionResponse:
    """Build a completion with no further tool calls."""

    return CompletionResponse(content=content)


def tool_event_names(run) -> list[str]:
    """Return emitted tool event names in order."""

    return [
        event.tool_name
        for event in run.events
        if event.type == "tool_call" and event.tool_name and event.phase != "verification"
    ]


def tool_result_messages(run) -> list[str]:
    """Return emitted tool result messages in order."""

    return [
        event.content
        for event in run.events
        if event.type == "tool_result" and event.phase != "verification"
    ]


def verification_commands(run) -> list[str]:
    """Return verification-phase bash commands."""

    return [
        str((event.tool_args or {}).get("command", ""))
        for event in run.events
        if event.type == "tool_call" and event.phase == "verification"
    ]


def trace_event_names(run) -> list[str]:
    """Return recorded runtime trace event names."""

    summary = run.agent.last_turn_summary
    assert summary is not None
    return [event.name for event in summary.trace]


def dod_statuses(run) -> list[str]:
    """Return DoD statuses emitted during a run."""

    return [
        event.dod_status
        for event in run.events
        if event.type == "dod_status" and event.dod_status
    ]


def workflow_modes(run) -> list[str]:
    """Return emitted workflow modes in order."""

    return [
        event.workflow_mode
        for event in run.events
        if event.type == "workflow_mode" and event.workflow_mode
    ]


def artifact_kinds(run) -> list[str]:
    """Return emitted artifact kinds in order."""

    return [
        event.artifact_kind
        for event in run.events
        if event.type == "artifact" and event.artifact_kind
    ]


@pytest.mark.asyncio
async def test_runtime_parity_manifest_matches_implemented_cases() -> None:
    manifest_names = [entry["name"] for entry in load_manifest()]
    assert manifest_names == SCENARIO_NAMES


@pytest.mark.asyncio
async def test_streaming_text_scenario() -> None:
    backend = ScriptedBackend(
        streams=[
            [
                StreamChunk(content="Mock streaming ", is_done=False),
                StreamChunk(
                    content="says hello from Loader.",
                    full_content="Mock streaming says hello from Loader.",
                    is_done=True,
                ),
            ]
        ]
    )

    run = await run_scenario("hello there", backend, config=AgentConfig(auto_context=False))

    assert run.response == "Mock streaming says hello from Loader."
    assert [call.mode for call in run.invocations] == ["stream"]
    assert not tool_event_names(run)


@pytest.mark.asyncio
async def test_read_file_roundtrip(temp_dir: Path) -> None:
    fixture = temp_dir / "fixture.txt"
    fixture.write_text("alpha parity line\nbeta line\n")

    backend = ScriptedBackend(
        completions=[
            native_tool_response(
                ToolCall(id="read-1", name="read", arguments={"file_path": str(fixture)}),
                content="I'll inspect that file.",
            ),
            final_response("The file contains alpha parity line."),
        ]
    )

    run = await run_scenario(
        "Read the fixture file and summarize it.",
        backend,
        config=non_streaming_config(),
        project_root=temp_dir,
    )

    assert "alpha parity line" in run.response
    assert tool_event_names(run) == ["read"]
    assert any("alpha parity line" in message for message in tool_result_messages(run))
    assert len(run.invocations) == 2
    assert any(message.role == Role.TOOL for message in run.invocations[1].messages)


@pytest.mark.asyncio
@pytest.mark.parametrize("alias_key", ["file", "filepath"])
async def test_read_file_alias_roundtrip(temp_dir: Path, alias_key: str) -> None:
    fixture = temp_dir / "fixture.txt"
    fixture.write_text("alpha parity line\nbeta line\n")

    backend = ScriptedBackend(
        completions=[
            native_tool_response(
                ToolCall(id="read-1", name="read", arguments={alias_key: str(fixture)}),
                content="I'll inspect that file.",
            ),
            final_response("The file contains alpha parity line."),
        ]
    )

    run = await run_scenario(
        "Read the fixture file and summarize it.",
        backend,
        config=non_streaming_config(),
        project_root=temp_dir,
    )

    assert "alpha parity line" in run.response
    assert tool_event_names(run) == ["read"]
    assert any("alpha parity line" in message for message in tool_result_messages(run))


@pytest.mark.asyncio
async def test_multi_tool_turn_roundtrip(temp_dir: Path) -> None:
    fixture = temp_dir / "fixture.txt"
    fixture.write_text("alpha parity line\nbeta line\ngamma parity line\n")

    backend = ScriptedBackend(
        completions=[
            native_tool_response(
                ToolCall(id="read-1", name="read", arguments={"file_path": str(fixture)}),
                ToolCall(
                    id="grep-1",
                    name="grep",
                    arguments={"pattern": "parity", "path": str(fixture)},
                ),
                content="I'll inspect the file and count parity matches.",
            ),
            final_response("The file has two parity lines, including alpha parity line."),
        ]
    )

    run = await run_scenario(
        "Inspect the fixture and find parity lines.",
        backend,
        config=non_streaming_config(),
        project_root=temp_dir,
    )

    assert tool_event_names(run) == ["read", "grep"]
    assert len(tool_result_messages(run)) == 2
    assert "two parity lines" in run.response


@pytest.mark.asyncio
async def test_turn_summary_smoke_for_multi_tool_turn(temp_dir: Path) -> None:
    fixture = temp_dir / "fixture.txt"
    fixture.write_text("alpha parity line\nbeta line\ngamma parity line\n")

    backend = ScriptedBackend(
        completions=[
            native_tool_response(
                ToolCall(id="read-1", name="read", arguments={"file_path": str(fixture)}),
                ToolCall(
                    id="grep-1",
                    name="grep",
                    arguments={"pattern": "parity", "path": str(fixture)},
                ),
                content="I'll inspect the file and count parity matches.",
            ),
            final_response("The file has two parity lines, including alpha parity line."),
        ]
    )

    run = await run_scenario(
        "Inspect the fixture and find parity lines.",
        backend,
        config=non_streaming_config(),
        project_root=temp_dir,
    )

    summary = run.agent.last_turn_summary
    assert summary is not None
    assert summary.final_response == run.response
    assert summary.iterations == 2
    assert len(summary.assistant_messages) == 2
    assert len(summary.tool_result_messages) == 2
    assert "assistant.tool_batch" in trace_event_names(run)


@pytest.mark.asyncio
async def test_write_file_allowed(temp_dir: Path) -> None:
    target = temp_dir / "allowed.txt"
    backend = ScriptedBackend(
        completions=[
            native_tool_response(
                ToolCall(
                    id="write-1",
                    name="write",
                    arguments={"file_path": str(target), "content": "hello from loader\n"},
                ),
                content="I'll create the file now.",
            ),
            final_response("Successfully created the file."),
        ]
    )

    run = await run_scenario(
        "Create allowed.txt with a greeting.",
        backend,
        config=non_streaming_config(),
        project_root=temp_dir,
    )

    assert target.read_text() == "hello from loader\n"
    assert "Successfully created the file." in run.response
    assert tool_event_names(run) == ["write"]


@pytest.mark.asyncio
async def test_write_file_denied(temp_dir: Path) -> None:
    target = temp_dir / "denied.txt"
    config = non_streaming_config()
    config.permission_mode = PermissionMode.PROMPT
    backend = ScriptedBackend(
        completions=[
            native_tool_response(
                ToolCall(
                    id="write-1",
                    name="write",
                    arguments={"file_path": str(target), "content": "should not exist\n"},
                ),
                content="I'll create the file if you approve it.",
            ),
            final_response("I skipped the write as requested."),
        ]
    )

    async def deny_confirmation(tool_name: str, message: str, details: str) -> bool:
        assert tool_name == "write"
        assert "approval" in message.lower()
        assert "active_mode=prompt" in details
        return False

    run = await run_scenario(
        "Create denied.txt with a greeting.",
        backend,
        config=config,
        project_root=temp_dir,
        on_confirmation=deny_confirmation,
    )

    assert not target.exists()
    assert "skipped the write" in run.response.lower()
    assert any(event.type == "confirmation" for event in run.events)


@pytest.mark.asyncio
async def test_bash_stdout_roundtrip(temp_dir: Path, monkeypatch: pytest.MonkeyPatch) -> None:
    monkeypatch.chdir(temp_dir)
    backend = ScriptedBackend(
        completions=[
            native_tool_response(
                ToolCall(id="bash-1", name="bash", arguments={"command": "pwd"}),
                content="I'll check the current directory.",
            ),
            final_response("Confirmed the working directory."),
        ]
    )

    run = await run_scenario(
        "Tell me the current directory.",
        backend,
        config=non_streaming_config(),
        project_root=temp_dir,
    )

    assert str(temp_dir) in tool_result_messages(run)[0]
    assert "Confirmed the working directory." in run.response


@pytest.mark.asyncio
async def test_bash_confirmation_prompt_approved(
    temp_dir: Path,
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    monkeypatch.chdir(temp_dir)
    target = temp_dir / "approved.txt"
    config = non_streaming_config()
    config.permission_mode = PermissionMode.PROMPT
    backend = ScriptedBackend(
        completions=[
            native_tool_response(
                ToolCall(id="bash-1", name="bash", arguments={"command": "touch approved.txt"}),
                content="I'll create the file after approval.",
            ),
            final_response("The shell command completed."),
        ]
    )

    async def approve_confirmation(tool_name: str, message: str, details: str) -> bool:
        assert tool_name == "bash"
        assert "approval" in message.lower()
        assert "touch approved.txt" in details
        return True

    run = await run_scenario(
        "Create approved.txt using bash.",
        backend,
        config=config,
        project_root=temp_dir,
        on_confirmation=approve_confirmation,
    )

    assert target.exists()
    assert "shell command completed" in run.response.lower()
    assert any(event.type == "confirmation" for event in run.events)


@pytest.mark.asyncio
async def test_bash_confirmation_prompt_denied(
    temp_dir: Path,
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    monkeypatch.chdir(temp_dir)
    target = temp_dir / "denied-bash.txt"
    config = non_streaming_config()
    config.permission_mode = PermissionMode.PROMPT
    backend = ScriptedBackend(
        completions=[
            native_tool_response(
                ToolCall(id="bash-1", name="bash", arguments={"command": "touch denied-bash.txt"}),
                content="I'll create the file if you allow it.",
            ),
            final_response("I left the shell command undone."),
        ]
    )

    async def deny_confirmation(tool_name: str, message: str, details: str) -> bool:
        assert tool_name == "bash"
        assert "touch denied-bash.txt" in details
        return False

    run = await run_scenario(
        "Create denied-bash.txt using bash.",
        backend,
        config=config,
        project_root=temp_dir,
        on_confirmation=deny_confirmation,
    )

    assert not target.exists()
    assert "left the shell command undone" in run.response.lower()
    assert any(event.type == "confirmation" for event in run.events)


@pytest.mark.asyncio
async def test_read_only_mode_denies_write(temp_dir: Path) -> None:
    config = non_streaming_config()
    config.permission_mode = PermissionMode.READ_ONLY
    config.auto_recover = False
    target = temp_dir / "blocked-by-policy.txt"
    backend = ScriptedBackend(
        completions=[
            native_tool_response(
                ToolCall(
                    id="write-1",
                    name="write",
                    arguments={"file_path": str(target), "content": "denied\n"},
                ),
                content="I'll create the file.",
            ),
            final_response("The write was blocked."),
        ]
    )

    run = await run_scenario(
        "Create blocked-by-policy.txt.",
        backend,
        config=config,
        project_root=temp_dir,
    )

    assert not target.exists()
    assert any("requires workspace-write" in message for message in tool_result_messages(run))


@pytest.mark.asyncio
async def test_read_only_mode_denies_mutating_bash(temp_dir: Path) -> None:
    config = non_streaming_config()
    config.permission_mode = PermissionMode.READ_ONLY
    config.auto_recover = False
    target = temp_dir / "bash-blocked.txt"
    backend = ScriptedBackend(
        completions=[
            native_tool_response(
                ToolCall(
                    id="bash-1",
                    name="bash",
                    arguments={"command": f"touch {target}"},
                ),
                content="I'll create the file with bash.",
            ),
            final_response("The bash command was blocked."),
        ]
    )

    run = await run_scenario(
        "Create bash-blocked.txt using bash.",
        backend,
        config=config,
        project_root=temp_dir,
    )

    assert not target.exists()
    assert any("requires workspace-write" in message for message in tool_result_messages(run))


@pytest.mark.asyncio
async def test_read_only_mode_allows_safe_bash(temp_dir: Path) -> None:
    config = non_streaming_config()
    config.permission_mode = PermissionMode.READ_ONLY
    backend = ScriptedBackend(
        completions=[
            native_tool_response(
                ToolCall(id="bash-1", name="bash", arguments={"command": "pwd"}),
                content="I'll inspect the current directory.",
            ),
            final_response("Inspected the current directory."),
        ]
    )

    run = await run_scenario(
        "Show the current directory.",
        backend,
        config=config,
        project_root=temp_dir,
    )

    assert tool_event_names(run) == ["bash"]
    assert not any("requires" in message for message in tool_result_messages(run))


@pytest.mark.asyncio
async def test_workspace_write_denies_write_outside_root(temp_dir: Path) -> None:
    config = non_streaming_config()
    config.auto_recover = False
    outside = temp_dir.parent / "outside-root.txt"
    if outside.exists():
        outside.unlink()

    backend = ScriptedBackend(
        completions=[
            native_tool_response(
                ToolCall(
                    id="write-1",
                    name="write",
                    arguments={"file_path": str(outside), "content": "outside\n"},
                ),
                content="I'll write outside the workspace.",
            ),
            final_response("The write was blocked."),
        ]
    )

    async def decline_confirmation(_name: str, _msg: str, _details: str) -> bool:
        return False

    run = await run_scenario(
        "Write a file outside the workspace.",
        backend,
        config=config,
        project_root=temp_dir,
        on_confirmation=decline_confirmation,
    )

    assert not outside.exists()
    assert any(
        "declined" in message.lower() or "outside workspace" in message.lower()
        for message in tool_result_messages(run)
    )


@pytest.mark.asyncio
async def test_danger_full_access_allows_dangerous_bash(temp_dir: Path) -> None:
    target = temp_dir / "mode.txt"
    target.write_text("hello\n")
    config = non_streaming_config()
    config.permission_mode = PermissionMode.DANGER_FULL_ACCESS
    backend = ScriptedBackend(
        completions=[
            native_tool_response(
                ToolCall(
                    id="bash-1",
                    name="bash",
                    arguments={"command": f"chmod 600 {target}"},
                ),
                content="I'll change the file permissions.",
            ),
            final_response("Updated the file permissions."),
        ]
    )

    run = await run_scenario(
        "Lock down mode.txt permissions.",
        backend,
        config=config,
        project_root=temp_dir,
    )

    assert tool_event_names(run) == ["bash"]
    assert not any("requires" in message for message in tool_result_messages(run))
    assert not any(event.type == "confirmation" for event in run.events)


@pytest.mark.asyncio
async def test_prompt_mode_prompts_destructive_write(temp_dir: Path) -> None:
    target = temp_dir / "prompted.txt"
    config = non_streaming_config()
    config.permission_mode = PermissionMode.PROMPT
    backend = ScriptedBackend(
        completions=[
            native_tool_response(
                ToolCall(
                    id="write-1",
                    name="write",
                    arguments={"file_path": str(target), "content": "prompted\n"},
                ),
                content="I'll create the file after approval.",
            ),
            final_response("The file was created."),
        ]
    )
    prompts: list[str] = []

    async def approve_confirmation(tool_name: str, message: str, details: str) -> bool:
        assert tool_name == "write"
        prompts.append(details)
        return True

    run = await run_scenario(
        "Create prompted.txt after approval.",
        backend,
        config=config,
        project_root=temp_dir,
        on_confirmation=approve_confirmation,
    )

    assert target.read_text() == "prompted\n"
    assert prompts and "active_mode=prompt" in prompts[0]
    assert any(event.type == "confirmation" for event in run.events)


@pytest.mark.asyncio
async def test_allow_mode_skips_prompt_for_destructive_write(temp_dir: Path) -> None:
    target = temp_dir / "allow-mode.txt"
    config = non_streaming_config()
    config.permission_mode = PermissionMode.ALLOW
    backend = ScriptedBackend(
        completions=[
            native_tool_response(
                ToolCall(
                    id="write-1",
                    name="write",
                    arguments={"file_path": str(target), "content": "allow mode\n"},
                ),
                content="I'll create the file directly.",
            ),
            final_response("The file was created."),
        ]
    )
    prompts: list[str] = []

    async def unexpected_confirmation(tool_name: str, message: str, details: str) -> bool:
        prompts.append(tool_name)
        return False

    run = await run_scenario(
        "Create allow-mode.txt directly.",
        backend,
        config=config,
        project_root=temp_dir,
        on_confirmation=unexpected_confirmation,
    )

    assert target.read_text() == "allow mode\n"
    assert prompts == []
    assert not any(event.type == "confirmation" for event in run.events)
    assert "The file was created." in run.response


@pytest.mark.asyncio
async def test_deny_rule_blocks_allowed_mode(temp_dir: Path) -> None:
    loader_root = temp_dir / ".loader"
    loader_root.mkdir()
    (loader_root / "permission-rules.json").write_text(
        '{"deny": [{"tool": "write", "path_contains": "secrets"}]}\n'
    )
    target = temp_dir / "secrets.txt"
    config = non_streaming_config()
    config.permission_mode = PermissionMode.ALLOW
    config.auto_recover = False
    backend = ScriptedBackend(
        completions=[
            native_tool_response(
                ToolCall(
                    id="write-1",
                    name="write",
                    arguments={"file_path": str(target), "content": "denied\n"},
                ),
                content="I'll write the secret file.",
            ),
            final_response("The write was blocked by policy."),
        ]
    )

    run = await run_scenario(
        "Create secrets.txt.",
        backend,
        config=config,
        project_root=temp_dir,
    )

    assert not target.exists()
    assert any("denied by rule" in message for message in tool_result_messages(run))
    assert "tool.permission_denied" in trace_event_names(run)


@pytest.mark.asyncio
async def test_ask_rule_prompts_even_when_mode_would_allow(temp_dir: Path) -> None:
    loader_root = temp_dir / ".loader"
    loader_root.mkdir()
    (loader_root / "permission-rules.json").write_text(
        '{"ask": [{"tool": "write", "path_contains": "README"}]}\n'
    )
    target = temp_dir / "README.md"
    config = non_streaming_config()
    config.permission_mode = PermissionMode.ALLOW
    backend = ScriptedBackend(
        completions=[
            native_tool_response(
                ToolCall(
                    id="write-1",
                    name="write",
                    arguments={"file_path": str(target), "content": "ask rule\n"},
                ),
                content="I'll update the README if you approve it.",
            ),
            final_response("The write was declined."),
        ]
    )
    prompts: list[str] = []

    async def deny_confirmation(tool_name: str, message: str, details: str) -> bool:
        prompts.append(details)
        return False

    run = await run_scenario(
        "Update README.md.",
        backend,
        config=config,
        project_root=temp_dir,
        on_confirmation=deny_confirmation,
    )

    assert not target.exists()
    assert prompts and "matched_ask_rule=tool=write, path_contains=README" in prompts[0]
    assert any(event.type == "confirmation" for event in run.events)
    assert "declined" in run.response.lower()


@pytest.mark.asyncio
async def test_raw_json_tool_call_fallback(temp_dir: Path) -> None:
    fixture = temp_dir / "fixture.txt"
    fixture.write_text("alpha parity line\n")
    raw_json = f'{{"name": "read", "arguments": {{"file_path": "{fixture}"}}}}'

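    # Split the raw JSON across two stream chunks so the fallback parser has to
    # reassemble the tool call from partial text.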
    backend = ScriptedBackend(
        streams=[
            [
                StreamChunk(content=raw_json[:25], is_done=False),
                StreamChunk(content=raw_json[25:], full_content=raw_json, is_done=True),
            ],
            [
                StreamChunk(
                    content="Recovered the raw JSON tool call and read the file.",
                    full_content="Recovered the raw JSON tool call and read the file.",
                    is_done=True,
                )
            ],
        ]
    )

    run = await run_scenario(
        "Read the fixture file.",
        backend,
        config=AgentConfig(auto_context=False, max_iterations=8),
        project_root=temp_dir,
    )

    assert tool_event_names(run) == ["read"]
    assert any("alpha parity line" in message for message in tool_result_messages(run))
    assert "Recovered the raw JSON tool call" in run.response


@pytest.mark.asyncio
async def test_raw_json_todowrite_tool_call_fallback(temp_dir: Path) -> None:
    raw_json = json.dumps(
        {
            "name": "TodoWrite",
            "arguments": {
                "todos": [
                    {
                        "content": "Run tests",
                        "active_form": "Running tests",
                        "status": "completed",
                    }
                ]
            },
        }
    )
    backend = ScriptedBackend(
        completions=[
            CompletionResponse(content=raw_json),
            final_response("Tracked the current todo list."),
        ]
    )

    run = await run_scenario(
        "Track the current work items.",
        backend,
        config=non_streaming_config(),
        project_root=temp_dir,
    )

    todo_store = temp_dir / ".loader" / "todos" / "active.json"
    assert tool_event_names(run) == ["TodoWrite"]
    assert json.loads(todo_store.read_text()) == []
    assert "Tracked the current todo list." in run.response


@pytest.mark.asyncio
async def test_raw_json_patch_tool_call_fallback(temp_dir: Path) -> None:
    target = temp_dir / "sample.txt"
    target.write_text("alpha\nbeta\ngamma\n")
    raw_json = json.dumps(
        {
            "name": "patch",
            "arguments": {
                "file_path": str(target),
                "hunks": [
                    {
                        "old_start": 2,
                        "old_lines": 1,
                        "new_start": 2,
                        "new_lines": 1,
                        "lines": ["-beta", "+beta updated"],
                    }
                ],
            },
        }
    )
    backend = ScriptedBackend(
        completions=[
            CompletionResponse(content=raw_json),
            final_response("Patched sample.txt."),
        ]
    )

    run = await run_scenario(
        "Update sample.txt.",
        backend,
        config=non_streaming_config(),
        project_root=temp_dir,
    )

    assert tool_event_names(run) == ["patch"]
    assert target.read_text() == "alpha\nbeta updated\ngamma\n"
    assert "Patched sample.txt." in run.response


@pytest.mark.asyncio
async def test_raw_json_ask_user_question_tool_call_fallback(temp_dir: Path) -> None:
    raw_json = json.dumps(
        {
            "name": "AskUserQuestion",
            "arguments": {
                "title": "Path Choice",
                "context": "Choose the safer Loader cleanup path.",
                "question": "Which path should we take?",
                "options": [
                    {
                        "label": "Plan first",
                        "description": "Keep the next move documented.",
                    },
                    {
                        "label": "Execute now",
                        "description": "Start changing code immediately.",
                    },
                ],
            },
        }
    )
    backend = ScriptedBackend(
        completions=[
            CompletionResponse(content=raw_json),
            final_response("We'll execute now."),
        ]
    )

    async def answer(question: str, options: list[str] | None) -> str:
        assert "Which path should we take?" in question
        assert options == [
            "Plan first - Keep the next move documented.",
            "Execute now - Start changing code immediately.",
        ]
        return "2"

    run = await run_scenario(
        "Decide the next path before changing code.",
        backend,
        config=non_streaming_config(),
        project_root=temp_dir,
        on_user_question=answer,
    )

    assert tool_event_names(run) == ["AskUserQuestion"]
    assert any("Execute now" in message for message in tool_result_messages(run))
    assert "We'll execute now." in run.response


@pytest.mark.asyncio
async def test_raw_bracket_ask_user_question_tool_call_fallback(temp_dir: Path) -> None:
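    # The assistant emits a bracketed pseudo tool call as plain text; the runtime
    # should recover it as an AskUserQuestion invocation.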
    backend = ScriptedBackend(
        streams=[
            [
                StreamChunk(
                    content='[calls askuserquestion tool with: question="Which path should we take?"]',
                    full_content='[calls askuserquestion tool with: question="Which path should we take?"]',
                    is_done=True,
                )
            ],
            [
                StreamChunk(
                    content="We'll plan first.",
                    full_content="We'll plan first.",
                    is_done=True,
                )
            ],
        ]
    )

    async def answer(question: str, options: list[str] | None) -> str:
        assert "Which path should we take?" in question
        assert options is None
        return "Plan first"

    run = await run_scenario(
        "Read the fixture file.",
        backend,
        config=AgentConfig(auto_context=False, max_iterations=8),
        project_root=temp_dir,
        on_user_question=answer,
    )

    assert tool_event_names(run) == ["AskUserQuestion"]
    assert any('"answer": "Plan first"' in message for message in tool_result_messages(run))
    assert "We'll plan first." in run.response


@pytest.mark.asyncio
async def test_non_streaming_bracket_ask_user_question_tool_call_fallback(
    temp_dir: Path,
) -> None:
    backend = ScriptedBackend(
        completions=[
            CompletionResponse(
                content='[calls askuserquestion tool with: question="Which path should we take?"]'
            ),
            final_response("We'll plan first."),
        ]
    )

    async def answer(question: str, options: list[str] | None) -> str:
        assert "Which path should we take?" in question
        assert options is None
        return "Plan first"

    run = await run_scenario(
        "Read the fixture file.",
        backend,
        config=non_streaming_config(),
        project_root=temp_dir,
        on_user_question=answer,
    )

    assert tool_event_names(run) == ["AskUserQuestion"]
    assert any('"answer": "Plan first"' in message for message in tool_result_messages(run))
    assert "We'll plan first." in run.response


@pytest.mark.asyncio
async def test_native_and_raw_tool_paths_share_executor_trace(temp_dir: Path) -> None:
    native_fixture = temp_dir / "native.txt"
    native_fixture.write_text("native parity line\n")
    native_backend = ScriptedBackend(
        completions=[
            native_tool_response(
                ToolCall(id="read-1", name="read", arguments={"file_path": str(native_fixture)}),
                content="I'll inspect the native tool result.",
            ),
            final_response("Native read complete."),
        ]
    )
    native_run = await run_scenario(
        "Read native.txt.",
        native_backend,
        config=non_streaming_config(),
        project_root=temp_dir,
    )

    raw_fixture = temp_dir / "raw.txt"
    raw_fixture.write_text("raw parity line\n")
    raw_json = f'{{"name": "read", "arguments": {{"file_path": "{raw_fixture}"}}}}'
    raw_backend = ScriptedBackend(
        streams=[
            [
                StreamChunk(content=raw_json[:20], is_done=False),
                StreamChunk(content=raw_json[20:], full_content=raw_json, is_done=True),
            ],
            [
                StreamChunk(
                    content="Raw read complete.",
                    full_content="Raw read complete.",
                    is_done=True,
                )
            ],
        ]
    )
    raw_run = await run_scenario(
        "Read raw.txt.",
        raw_backend,
        config=AgentConfig(auto_context=False, max_iterations=8),
        project_root=temp_dir,
    )

    for run in (native_run, raw_run):
        names = trace_event_names(run)
        assert "assistant.tool_batch" in names
        assert "tool.received" in names
        assert "tool.executed" in names

    native_summary = native_run.agent.last_turn_summary
    raw_summary = raw_run.agent.last_turn_summary
    assert native_summary is not None
    assert raw_summary is not None
    assert any(
        event.name == "tool.received" and event.data["source"] == "native"
        for event in native_summary.trace
    )
    assert any(
        event.name == "tool.received" and event.data["source"] == "raw_text"
        for event in raw_summary.trace
    )


@pytest.mark.asyncio
async def test_backend_capability_probe_refreshes_native_tool_mode(
    temp_dir: Path,
) -> None:
    fixture = temp_dir / "fixture.txt"
    fixture.write_text("capability probe line\n")

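    # Simulates a backend that only advertises native tool support once
    # describe_model() has been called, forcing the capability probe to run.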
    class LazyCapabilityBackend(ScriptedBackend):
        def __init__(self, completions: list[CompletionResponse]) -> None:
            super().__init__(completions=completions, supports_native_tools=False)
            self.model = "custom-qwen-build"
            self._described = False

        async def describe_model(self) -> dict[str, dict[str, list[str]]]:
            self._described = True
            return {"details": {"families": ["qwen2.5"]}}

        def capability_profile(self):
            model_details = (
                {"details": {"families": ["qwen2.5"]}} if self._described else None
            )
            return resolve_capability_profile(
                self.model,
                model_details=model_details,
            )

    backend = LazyCapabilityBackend(
        completions=[
            native_tool_response(
                ToolCall(id="read-1", name="read", arguments={"file_path": str(fixture)}),
                content="I'll inspect that file after probing capabilities.",
            ),
            final_response("Capability probing enabled the native read."),
        ]
    )

    run = await run_scenario(
        "Read the fixture file after checking model capabilities.",
        backend,
        config=non_streaming_config(),
        project_root=temp_dir,
    )

    assert backend._described
    assert not run.agent.use_react
    assert run.invocations[0].tools is not None
    assert tool_event_names(run) == ["read"]
    assert "Capability probing enabled the native read." in run.response


@pytest.mark.asyncio
async def test_run_streaming_delegates_to_primary_runtime(temp_dir: Path) -> None:
    fixture = temp_dir / "streaming.txt"
    fixture.write_text("streamed runtime line\n")
    backend = ScriptedBackend(
        streams=[
            [
                StreamChunk(
                    content="I'll inspect the file now.",
                    full_content="I'll inspect the file now.",
                    tool_calls=[
                        ToolCall(id="read-1", name="read", arguments={"file_path": str(fixture)})
                    ],
                    is_done=True,
                )
            ],
            [
                StreamChunk(
                    content="Finished reading the streamed fixture.",
                    full_content="Finished reading the streamed fixture.",
                    is_done=True,
                )
            ],
        ]
    )
    agent = Agent(
        backend=backend,
        config=AgentConfig(auto_context=False, max_iterations=8),
        project_root=temp_dir,
    )

    events = [event async for event in agent.run_streaming("Read the streamed fixture file.")]

    assert any(event.type == "tool_call" and event.tool_name == "read" for event in events)
    assert any(
        event.type == "tool_result" and "streamed runtime line" in event.content
        for event in events
    )
    assert agent.last_turn_summary is not None
    assert agent.last_turn_summary.final_response.startswith(
        "Finished reading the streamed fixture."
    )


@pytest.mark.asyncio
async def test_definition_of_done_verify_phase(temp_dir: Path) -> None:
    target = temp_dir / "verified.txt"
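    # The DoD layer should synthesize a `test -f` check for the created file.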
    backend = ScriptedBackend(
        completions=[
            native_tool_response(
                ToolCall(
                    id="write-1",
                    name="write",
                    arguments={"file_path": str(target), "content": "verified\n"},
                ),
                content="I'll create the file now.",
            ),
            final_response("Created verified.txt."),
        ]
    )

    run = await run_scenario(
        "Create verified.txt with a line of text.",
        backend,
        config=non_streaming_config(),
        project_root=temp_dir,
    )

    assert verification_commands(run) == [f"test -f {target}"]
    assert dod_statuses(run) == ["draft", "verifying", "done"]
    assert "Verification:" in run.response
    assert run.agent.last_turn_summary is not None
    assert run.agent.last_turn_summary.verification_status == "passed"
    assert run.agent.last_turn_summary.definition_of_done is not None


@pytest.mark.asyncio
async def test_verify_failure_routes_to_fix_loop(
    temp_dir: Path,
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    monkeypatch.chdir(temp_dir)
    target = temp_dir / "broken.py"
    backend = ScriptedBackend(
        completions=[
            native_tool_response(
                ToolCall(
                    id="write-1",
                    name="write",
                    arguments={"file_path": str(target), "content": "print(\n"},
                ),
                content="I'll create the script.",
            ),
            final_response("Created broken.py."),
            native_tool_response(
                ToolCall(
                    id="write-2",
                    name="write",
                    arguments={
                        "file_path": str(target),
                        "content": "print('fixed from verify loop')\n",
                    },
                ),
                content="I'll fix the verification failure.",
            ),
            final_response("Fixed broken.py."),
        ]
    )

    run = await run_scenario(
        "Create broken.py and make sure it runs.",
        backend,
        config=non_streaming_config(),
        project_root=temp_dir,
    )

    assert target.read_text() == "print('fixed from verify loop')\n"
    assert verification_commands(run) == ["python broken.py", "python broken.py"]
    assert "fixing" in dod_statuses(run)
    assert "Verification:" in run.response
    assert run.agent.last_turn_summary is not None
    assert run.agent.last_turn_summary.verification_status == "passed"


@pytest.mark.asyncio
async def test_verify_retry_budget_exhaustion(
    temp_dir: Path,
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    monkeypatch.chdir(temp_dir)
    target = temp_dir / "still-broken.py"
    config = non_streaming_config()
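    # A budget of 1 permits exactly one fix attempt before verification gives up.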
    config.verification_retry_budget = 1
    backend = ScriptedBackend(
        completions=[
            native_tool_response(
                ToolCall(
                    id="write-1",
                    name="write",
                    arguments={"file_path": str(target), "content": "print(\n"},
                ),
                content="I'll create the script.",
            ),
            final_response("Created still-broken.py."),
            native_tool_response(
                ToolCall(
                    id="write-2",
                    name="write",
                    arguments={"file_path": str(target), "content": "print(\n"},
                ),
                content="I'll try one more fix.",
            ),
            final_response("Tried to fix still-broken.py."),
        ]
    )

    run = await run_scenario(
        "Create still-broken.py and make sure it runs.",
        backend,
        config=config,
        project_root=temp_dir,
    )

    assert "couldn't verify" in run.response.lower()
    assert dod_statuses(run)[-1] == "failed"
    assert run.agent.last_turn_summary is not None
    assert run.agent.last_turn_summary.verification_status == "failed"


@pytest.mark.asyncio
async def test_ambiguous_prompt_routes_to_clarify(temp_dir: Path) -> None:
    backend = ScriptedBackend(
        completions=[
            native_tool_response(
                ToolCall(
                    id="ask-1",
                    name="AskUserQuestion",
                    arguments={
                        "question": (
                            "What outcome matters most, and what should stay out of scope?"
                        )
                    },
                ),
                content="I need one clarification first.",
            ),
            final_response(
                "\n".join(
                    [
                        "## Task Statement",
                        "Improve Loader so it feels more like claw-code.",
                        "",
                        "## Desired Outcome",
                        "- Make Loader more reliable without broad redesign.",
                        "",
                        "## In Scope",
                        "- Tighten the runtime workflow around the user-facing goal.",
                        "",
                        "## Non Goals",
                        "- Rebuild unrelated subsystems.",
                        "",
                        "## Decision Boundaries",
                        "- Escalate before changing unrelated UX patterns.",
                        "",
                        "## Constraints",
                        "- Stay inside the current repository.",
                        "",
                        "## Likely Touchpoints",
                        "- Runtime entry points and prompt behavior.",
                        "",
                        "## Assumptions",
                        "- The user wants a narrow runtime-quality improvement.",
                        "",
                        "## Acceptance Criteria",
                        "- The improvement stays focused on runtime behavior.",
                    ]
                )
            ),
            final_response("I have the brief and can move forward."),
        ]
    )

    async def answer(question: str, options: list[str] | None) -> str:
        assert "outcome matters most" in question.lower()
        assert options is None
        return "Do not redesign the whole interface."

    run = await run_scenario(
        "Improve Loader so it feels more like claw-code.",
        backend,
        config=non_streaming_config(),
        project_root=temp_dir,
        on_user_question=answer,
    )

    dod = run.agent.last_turn_summary.definition_of_done
    assert dod is not None
    assert workflow_modes(run)[:2] == ["clarify", "execute"]
    assert artifact_kinds(run) == ["clarify_brief"]
    assert dod.clarify_brief is not None
    assert Path(dod.clarify_brief).exists()


@pytest.mark.asyncio
async def test_complex_prompt_routes_to_plan(temp_dir: Path) -> None:
    target = temp_dir / "planned.txt"
    backend = ScriptedBackend(
        completions=[
            final_response(
                "\n".join(
                    [
                        "# Implementation Plan",
                        "",
                        "## File Changes",
                        f"- Create {target.name} in the workspace root.",
                        "",
                        "## Execution Order",
                        f"1. Write {target.name}.",
                        "2. Confirm the file exists.",
                        "",
                        "## Risks",
                        "- Writing the wrong file path.",
                        "",
                        "<<<VERIFICATION>>>",
                        "",
                        "# Verification Plan",
                        "",
                        "## Acceptance Criteria",
                        f"- {target.name} exists in the workspace root.",
                        "",
                        "## Verification Commands",
                        f"- `test -f {target}`",
                        "",
                        "## Notes",
                        "- Use a deterministic file existence check.",
                    ]
                )
            ),
            native_tool_response(
                ToolCall(
                    id="write-1",
                    name="write",
                    arguments={"file_path": str(target), "content": "planned output\n"},
                ),
                content="I'll create the file now.",
            ),
            final_response("The file is in place."),
        ]
    )

    run = await run_scenario(
        "Implement a persistent workflow mode router with clarify artifacts, "
        "planning artifacts, and verification-plan wiring in the runtime.",
        backend,
        config=non_streaming_config(),
        project_root=temp_dir,
    )

    dod = run.agent.last_turn_summary.definition_of_done
    assert dod is not None
    assert workflow_modes(run)[:3] == ["plan", "execute", "verify"]
    assert artifact_kinds(run) == ["implementation_plan", "verification_plan"]
    assert not any(event.type == "decomposition" for event in run.events)
    assert not any(event.type == "subtask" for event in run.events)
    assert dod.verification_commands == [f"test -f {target}"]
    assert verification_commands(run) == [f"test -f {target}"]


@pytest.mark.asyncio
async def test_verify_failure_fix_loop_does_not_reroute_workflow(temp_dir: Path) -> None:
    target = temp_dir / "retry.txt"
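    # The scripted first write fails `grep -q fixed`; the fix loop must stay in
    # execute/verify without re-entering plan or clarify.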
    backend = ScriptedBackend(
        completions=[
            final_response(
                "\n".join(
                    [
                        "# Implementation Plan",
                        "",
                        "## File Changes",
                        f"- Create {target.name}.",
                        "",
                        "## Execution Order",
                        f"1. Write {target.name}.",
                        "2. Fix it if verification fails.",
                        "",
                        "## Risks",
                        "- Initial content may be wrong.",
                        "",
                        "<<<VERIFICATION>>>",
                        "",
                        "# Verification Plan",
                        "",
                        "## Acceptance Criteria",
                        "- The file contains the word fixed.",
                        "",
                        "## Verification Commands",
                        f"- `grep -q fixed {target}`",
                        "",
                        "## Notes",
                        "- Retry if the first write misses the target string.",
                    ]
                )
            ),
            native_tool_response(
                ToolCall(
                    id="write-1",
                    name="write",
                    arguments={"file_path": str(target), "content": "draft output\n"},
                ),
                content="I'll write the first draft.",
            ),
            final_response("First draft is written."),
            native_tool_response(
                ToolCall(
                    id="write-2",
                    name="write",
                    arguments={"file_path": str(target), "content": "fixed output\n"},
                ),
                content="I'll correct the file.",
            ),
            final_response("The file now contains the fixed output."),
        ]
    )

    run = await run_scenario(
        "Implement a persistent workflow mode router with clarify artifacts, "
        "planning artifacts, and verification-plan wiring in the runtime.",
        backend,
        config=non_streaming_config(),
        project_root=temp_dir,
    )

    modes = workflow_modes(run)
    assert modes.count("plan") == 1
    assert modes.count("clarify") == 0
    assert modes.count("execute") >= 2
    assert modes.count("verify") >= 2


@pytest.mark.asyncio
async def test_conversational_task_skips_verify_phase() -> None:
    backend = ScriptedBackend(
        streams=[
            [
                StreamChunk(content="Hello there.", full_content="Hello there.", is_done=True),
            ]
        ]
    )

    run = await run_scenario("hello there", backend, config=AgentConfig(auto_context=False))

    assert run.response == "Hello there."
    assert not dod_statuses(run)
    assert run.agent.last_turn_summary is None


@pytest.mark.asyncio
async def test_explore_mode_skips_dod_and_router(temp_dir: Path) -> None:
    target = temp_dir / "feature.py"
    target.write_text("def important_helper():\n return 1\n")
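    # Explore runs should answer from read-only tools without creating DoD
    # artifacts or emitting workflow router events.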
    backend = ScriptedBackend(
        completions=[
            native_tool_response(
                ToolCall(
                    id="grep-1",
                    name="grep",
                    arguments={
                        "pattern": "important_helper",
                        "path": str(temp_dir),
                        "include": "*.py",
                    },
                ),
                content="I'll search for that helper.",
            ),
            final_response("important_helper is defined in feature.py."),
        ]
    )

    run = await run_explore_scenario(
        "Where is important_helper defined?",
        backend,
        config=non_streaming_config(),
        project_root=temp_dir,
    )

    assert "feature.py" in run.response
    assert tool_event_names(run) == ["grep"]
    assert not dod_statuses(run)
    assert not workflow_modes(run)
    assert run.agent.last_turn_summary is not None
    assert run.agent.last_turn_summary.definition_of_done is None
    assert run.agent.last_turn_summary.workflow_mode == "explore"
    assert "explore.completed" in trace_event_names(run)
    assert not (temp_dir / ".loader" / "dod").exists()
    assert run.invocations[0].tools is not None
    assert "write" not in {tool["name"] for tool in run.invocations[0].tools or []}


@pytest.mark.asyncio
async def test_explore_mode_denies_write(temp_dir: Path) -> None:
    target = temp_dir / "new.txt"
    config = non_streaming_config()
    config.permission_mode = PermissionMode.WORKSPACE_WRITE
    backend = ScriptedBackend(
        completions=[
            native_tool_response(
                ToolCall(
                    id="write-1",
                    name="write",
                    arguments={
                        "file_path": str(target),
                        "content": "not allowed\n",
                    },
                ),
                content="I'll write a file.",
            ),
            final_response("Explore mode is read-only, so I cannot make that change here."),
        ]
    )

    run = await run_explore_scenario(
        "Create a new file anyway.",
        backend,
        config=config,
        project_root=temp_dir,
    )

    assert not target.exists()
    assert tool_event_names(run) == ["write"]
    assert any("read-only" in message.lower() for message in tool_result_messages(run))
    assert "cannot make that change" in run.response.lower()
    assert "tool.permission_denied" in trace_event_names(run)
    assert not dod_statuses(run)
    assert not workflow_modes(run)
    assert not (temp_dir / ".loader" / "dod").exists()


@pytest.mark.asyncio
async def test_explore_mode_ignores_global_allow_policy(temp_dir: Path) -> None:
    loader_root = temp_dir / ".loader"
    loader_root.mkdir()
    (loader_root / "permission-rules.json").write_text(
        '{"allow": [{"tool": "write", "path_contains": "new.txt"}]}\n'
    )
    target = temp_dir / "new.txt"
    config = non_streaming_config()
    config.permission_mode = PermissionMode.ALLOW
    backend = ScriptedBackend(
        completions=[
            native_tool_response(
                ToolCall(
                    id="write-1",
                    name="write",
                    arguments={
                        "file_path": str(target),
                        "content": "still denied\n",
                    },
                ),
                content="I'll write a file.",
            ),
            final_response("Explore mode is read-only, so I cannot make that change here."),
        ]
    )

    run = await run_explore_scenario(
        "Create a new file anyway.",
        backend,
        config=config,
        project_root=temp_dir,
    )

    assert not target.exists()
    assert any("read-only" in message.lower() for message in tool_result_messages(run))
    assert "tool.permission_denied" in trace_event_names(run)
    assert not dod_statuses(run)
    assert not workflow_modes(run)


@pytest.mark.asyncio
async def test_informational_completion_allows_explicit_done_without_continuation(
    temp_dir: Path,
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    monkeypatch.chdir(temp_dir)
    target = temp_dir / "hello.py"
    backend = ScriptedBackend(
        completions=[
            final_response("Done."),
        ]
    )
    config = non_streaming_config(completion_check=True)

    run = await run_scenario(
        "Explain how a hello.py file would work.",
        backend,
        config=config,
        project_root=temp_dir,
    )

    assert not target.exists()
    assert not any(event.type == "completion_check" for event in run.events)
    assert tool_event_names(run) == []
    assert run.response == "Done."


@pytest.mark.asyncio
async def test_tool_result_contract_regression() -> None:
    errors: list[str] = []
    duplicate_path = "/tmp/already-created.txt"

    duplicate_backend = ScriptedBackend(
        completions=[
            native_tool_response(
                ToolCall(
                    id="dup-1",
                    name="write",
                    arguments={"file_path": duplicate_path, "content": "already there\n"},
                ),
                content="I'll create the file again.",
            ),
            final_response("Skipped the duplicate write."),
        ]
    )
    duplicate_agent = Agent(duplicate_backend, config=non_streaming_config())
    duplicate_agent.safeguards.record_action(
        "write",
        {"file_path": duplicate_path, "content": "already there\n"},
    )

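    # Both branches must produce structured skip/error results; a TypeError
    # here would mean the tool-result contract was violated.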
    try:
        await duplicate_agent.run("Create /tmp/already-created.txt again.")
    except TypeError as exc:
        errors.append(f"duplicate branch raised {exc}")

    validation_backend = ScriptedBackend(
        completions=[
            native_tool_response(
                ToolCall(id="invalid-1", name="bash", arguments={"command": ""}),
                content="I'll run that command.",
            ),
            final_response("Blocked the invalid command."),
        ]
    )
    validation_agent = Agent(validation_backend, config=non_streaming_config())

    try:
        await validation_agent.run("Run an empty command.")
    except TypeError as exc:
        errors.append(f"validation branch raised {exc}")

    assert not errors, "\n".join(errors)


@pytest.mark.asyncio
async def test_duplicate_read_is_skipped_without_intervening_mutation(
    temp_dir: Path,
) -> None:
    fixture = temp_dir / "index.html"
    fixture.write_text("alpha parity line\n")

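    # Two identical reads with no mutation in between: the second should be
    # skipped as a duplicate action.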
    backend = ScriptedBackend(
        completions=[
            native_tool_response(
                ToolCall(id="read-1", name="read", arguments={"file_path": str(fixture)}),
                content="I'll inspect the file.",
            ),
            native_tool_response(
                ToolCall(id="read-2", name="read", arguments={"file_path": str(fixture)}),
                content="I'll reread the same file.",
            ),
            final_response("I'll use the existing file contents instead of rereading."),
        ]
    )

    run = await run_scenario(
        "Inspect index.html and keep moving.",
        backend,
        config=non_streaming_config(),
        project_root=temp_dir,
    )

    assert tool_event_names(run) == ["read", "read"]
    messages = tool_result_messages(run)
    assert any("alpha parity line" in message for message in messages)
    assert any(
        "Skipped - duplicate action" in message and "Already read" in message
        for message in messages
    )
    assert "existing file contents" in run.response


@pytest.mark.asyncio
async def test_interleaved_reread_is_allowed_once_without_intervening_mutation(
    temp_dir: Path,
) -> None:
    index_file = temp_dir / "index.html"
    chapter_file = temp_dir / "chapter-1.html"
    index_file.write_text("table of contents\n")
    chapter_file.write_text("chapter body\n")

    backend = ScriptedBackend(
        completions=[
            native_tool_response(
                ToolCall(
                    id="read-1",
                    name="read",
                    arguments={"file_path": str(index_file)},
                ),
                content="I'll inspect the index first.",
            ),
            native_tool_response(
                ToolCall(
                    id="read-2",
                    name="read",
                    arguments={"file_path": str(chapter_file)},
                ),
                content="I'll inspect the chapter next.",
            ),
            native_tool_response(
                ToolCall(
                    id="read-3",
                    name="read",
                    arguments={"file_path": str(index_file)},
                ),
                content="I'll reopen the index to reconcile the findings.",
            ),
            final_response("I re-opened the index after checking the chapter."),
        ]
    )

    run = await run_scenario(
        "Inspect the index, inspect a chapter, then return to the index.",
        backend,
        config=non_streaming_config(),
        project_root=temp_dir,
    )

    assert tool_event_names(run) == ["read", "read", "read"]
    messages = tool_result_messages(run)
    assert not any("Skipped - duplicate action" in message for message in messages)
    assert sum("table of contents" in message for message in messages) == 2
    assert any("chapter body" in message for message in messages)


@pytest.mark.asyncio
async def test_repeated_bash_probe_is_allowed_after_mutation(
    temp_dir: Path,
) -> None:
    target = temp_dir / "notes.txt"
    target.write_text("old value\n")
    list_command = f"ls -1 {temp_dir}"

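    # The edit between the two identical `ls` probes should reset the
    # duplicate-action guard, so the second probe runs normally.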
    backend = ScriptedBackend(
        completions=[
            native_tool_response(
                ToolCall(id="bash-1", name="bash", arguments={"command": list_command}),
                content="I'll inspect the directory first.",
            ),
            native_tool_response(
                ToolCall(
                    id="edit-1",
                    name="edit",
                    arguments={
                        "file_path": str(target),
                        "old_string": "old value",
                        "new_string": "new value",
                    },
                ),
                content="I'll update the file.",
            ),
            native_tool_response(
                ToolCall(id="bash-2", name="bash", arguments={"command": list_command}),
                content="I'll list the directory again after the edit.",
            ),
            final_response("I re-ran ls after the edit without hitting duplicate rejection."),
        ]
    )

    run = await run_scenario(
        "Inspect the directory, edit the file, then inspect again.",
        backend,
        config=non_streaming_config(),
        project_root=temp_dir,
    )

    assert tool_event_names(run) == ["bash", "edit", "bash"]
    messages = tool_result_messages(run)
    assert not any("Skipped - duplicate action" in message for message in messages)
    assert sum("notes.txt" in message for message in messages) >= 2
    assert target.read_text() == "new value\n"