Python · 34096 bytes Raw Blame History
1 """Direct tests for tool-batch confidence, verification, and recovery helpers."""
2
3 from __future__ import annotations
4
5 from pathlib import Path
6 from types import SimpleNamespace
7
8 import pytest
9
10 from loader.llm.base import Message, Role, ToolCall
11 from loader.runtime.context import RuntimeContext
12 from loader.runtime.events import AgentEvent
13 from loader.runtime.executor import ToolExecutionOutcome, ToolExecutionState
14 from loader.runtime.permissions import (
15 PermissionMode,
16 build_permission_policy,
17 load_permission_rules,
18 )
19 from loader.runtime.reasoning_types import (
20 ActionVerification,
21 ConfidenceAssessment,
22 ConfidenceLevel,
23 )
24 from loader.runtime.recovery import RecoveryContext
25 from loader.runtime.tool_batch_checks import (
26 ToolBatchConfidenceGate,
27 ToolBatchVerificationGate,
28 )
29 from loader.runtime.tool_batch_recovery import ToolBatchRecoveryController
30 from loader.tools.base import ToolResult as RegistryToolResult
31 from loader.tools.base import create_default_registry
32 from tests.helpers.runtime_harness import ScriptedBackend
33
34
class FakeSession:
    """Minimal session double that only keeps an in-memory message list."""

    def __init__(self, messages: list[Message]) -> None:
        # Shallow-copy the input so later appends never touch the caller's list.
        self.messages = [*messages]

    def append(self, message: Message) -> None:
        """Add ``message`` to the end of the stored transcript."""
        self.messages.append(message)
41
42
class FakeCodeFilter:
    """Stateless code-filter double whose ``reset`` is a no-op."""

    def reset(self) -> None:
        """Do nothing and return ``None``."""
46
47
class FakeSafeguards:
    """Pass-through safeguards double: content is never altered and steering never fires."""

    def __init__(self) -> None:
        # Bare ``object()`` placeholders plus a no-op code filter.
        self.code_filter = FakeCodeFilter()
        self.validator = object()
        self.action_tracker = object()

    def filter_stream_chunk(self, content: str) -> str:
        """Return the streamed chunk unchanged."""
        return content

    def filter_complete_content(self, content: str) -> str:
        """Return the complete content unchanged."""
        return content

    def should_steer(self) -> bool:
        """Always report that no steering is needed."""
        return False

    def get_steering_message(self) -> str | None:
        """Always return ``None``; this fake has no steering message."""
        return None

    def record_response(self, content: str) -> None:
        """Ignore the response; nothing is recorded."""
68
69
def build_context(
    *,
    temp_dir: Path,
    messages: list[Message],
    assess_confidence,
    verify_action,
    recovery_context: RecoveryContext | None = None,
    confidence_scoring: bool = False,
    verification: bool = False,
    min_confidence_for_action: int = 3,
) -> RuntimeContext:
    """Assemble a ``RuntimeContext`` rooted at ``temp_dir`` and wired to test fakes.

    Args:
        temp_dir: Used as project root, workspace root and registry root.
        messages: Initial transcript handed to ``FakeSession``.
        assess_confidence: Callable exposed as ``context.reasoning.assess_confidence``.
        verify_action: Callable exposed as ``context.reasoning.verify_action``.
        recovery_context: Optional pre-existing recovery state.
        confidence_scoring: Value for ``config.reasoning.confidence_scoring``.
        verification: Value for ``config.reasoning.verification``.
        min_confidence_for_action: Value for
            ``config.reasoning.min_confidence_for_action``.

    Returns:
        The populated ``RuntimeContext``.
    """
    tool_registry = create_default_registry(temp_dir)
    tool_registry.configure_workspace_root(temp_dir)

    permission_status = load_permission_rules(temp_dir)
    permission_policy = build_permission_policy(
        active_mode=PermissionMode.WORKSPACE_WRITE,
        workspace_root=temp_dir,
        tool_requirements=tool_registry.get_tool_requirements(),
        rules=permission_status.rules,
    )

    # Reasoning toggles: only the three keyword-controlled flags vary per test.
    reasoning_settings = SimpleNamespace(
        rollback=False,
        show_rollback_plan=False,
        completion_check=True,
        max_continuation_prompts=5,
        self_critique=False,
        confidence_scoring=confidence_scoring,
        min_confidence_for_action=min_confidence_for_action,
        verification=verification,
    )
    runtime_config = SimpleNamespace(
        force_react=False,
        max_recovery_attempts=2,
        auto_recover=True,
        reasoning=reasoning_settings,
    )
    reasoning_hooks = SimpleNamespace(
        assess_confidence=assess_confidence,
        verify_action=verify_action,
    )

    return RuntimeContext(
        project_root=temp_dir,
        backend=ScriptedBackend(),
        registry=tool_registry,
        session=FakeSession(messages),  # type: ignore[arg-type]
        config=runtime_config,
        capability_profile=SimpleNamespace(supports_native_tools=True),  # type: ignore[arg-type]
        project_context=None,
        permission_policy=permission_policy,
        permission_config_status=permission_status,
        workflow_mode="execute",
        safeguards=FakeSafeguards(),
        reasoning=reasoning_hooks,
        recovery_context=recovery_context,
    )
122
123
def tool_outcome(
    *,
    tool_call: ToolCall,
    output: str,
    is_error: bool,
) -> ToolExecutionOutcome:
    """Build an ``EXECUTED`` outcome for ``tool_call`` that reports ``output`` everywhere.

    The same ``output`` text is used for the tool-result message, the event
    content, the raw result output and the registry result; ``is_error`` is
    propagated to each of them.
    """
    result_message = Message.tool_result_message(
        tool_call_id=tool_call.id,
        display_content=output,
        result_content=output,
        is_error=is_error,
    )
    registry_result = RegistryToolResult(output=output, is_error=is_error)
    return ToolExecutionOutcome(
        tool_call=tool_call,
        state=ToolExecutionState.EXECUTED,
        message=result_message,
        event_content=output,
        is_error=is_error,
        result_output=output,
        registry_result=registry_result,
    )
144
145
@pytest.mark.asyncio
async def test_tool_batch_confidence_gate_skips_low_confidence_actions(
    temp_dir: Path,
) -> None:
    """A LOW confidence assessment makes the gate skip the call and warn the session."""
    seen_contexts: list[str] = []

    async def low_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        # Remember the context text the gate built so it can be inspected below.
        seen_contexts.append(context)
        return ConfidenceAssessment(
            action=f"{tool_name} with {tool_args}",
            tool_name=tool_name,
            tool_args=tool_args,
            level=ConfidenceLevel.LOW,
            reasoning="Need more context first.",
            risks=["Unknown file contents"],
        )

    async def verification_must_not_run(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run here")

    history = [
        Message(role=Role.USER, content="Inspect the README."),
        Message(role=Role.ASSISTANT, content="I'll read it next."),
    ]
    context = build_context(
        temp_dir=temp_dir,
        messages=history,
        assess_confidence=low_confidence,
        verify_action=verification_must_not_run,
        confidence_scoring=True,
    )
    emitted: list[AgentEvent] = []

    async def emit(event: AgentEvent) -> None:
        emitted.append(event)

    read_call = ToolCall(id="read-1", name="read", arguments={"file_path": "README.md"})
    should_skip = await ToolBatchConfidenceGate(context).should_skip(
        tool_call=read_call,
        emit=emit,
    )

    assert should_skip is True
    assert "Inspect the README." in seen_contexts[-1]
    last_message = context.session.messages[-1]
    assert last_message.role == Role.USER
    assert "[LOW CONFIDENCE WARNING]" in last_message.content
    assert [event.type for event in emitted] == ["confidence"]
190
191
@pytest.mark.asyncio
async def test_tool_batch_verification_gate_requests_correction(
    temp_dir: Path,
) -> None:
    """A verification that needs correction is pushed back to the session as a USER message."""

    async def confidence_must_not_run(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence should not run here")

    async def needs_correction(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        # NOTE(review): ``verified=True`` is combined with ``needs_correction=True``
        # while the test expects "[VERIFICATION FAILED]" — confirm this is intended.
        return ActionVerification(
            tool_name=tool_name,
            tool_args=tool_args,
            expected_outcome="Success",
            actual_result=result,
            verified=True,
            discrepancies=["Output did not match the requested content"],
            needs_correction=True,
            correction_suggestion="Read the file before editing again.",
        )

    context = build_context(
        temp_dir=temp_dir,
        messages=[],
        assess_confidence=confidence_must_not_run,
        verify_action=needs_correction,
        verification=True,
    )
    read_call = ToolCall(id="read-1", name="read", arguments={"file_path": "README.md"})
    read_outcome = tool_outcome(
        tool_call=read_call,
        output="unexpected contents",
        is_error=False,
    )
    emitted: list[AgentEvent] = []

    async def emit(event: AgentEvent) -> None:
        emitted.append(event)

    should_continue = await ToolBatchVerificationGate(context).should_continue(
        tool_call=read_call,
        outcome=read_outcome,
        emit=emit,
    )

    assert should_continue is True
    last_message = context.session.messages[-1]
    assert last_message.role == Role.USER
    assert "[VERIFICATION FAILED]" in last_message.content
    assert [event.type for event in emitted] == ["verification"]
236
237
@pytest.mark.asyncio
async def test_tool_batch_recovery_controller_returns_follow_up(
    temp_dir: Path,
) -> None:
    """A failed tool call yields a follow-up, a recovery context and a recovery event."""

    async def confidence_must_not_run(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence should not run here")

    async def verification_must_not_run(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run here")

    context = build_context(
        temp_dir=temp_dir,
        messages=[],
        assess_confidence=confidence_must_not_run,
        verify_action=verification_must_not_run,
    )
    context.session.current_task = (  # type: ignore[attr-defined]
        "Update index.html so every chapter link and title matches the real HTML files in chapters/."
    )

    failed_call = ToolCall(id="bash-1", name="bash", arguments={"command": "pytest"})
    failed_outcome = tool_outcome(
        tool_call=failed_call,
        output="command failed",
        is_error=True,
    )
    emitted: list[AgentEvent] = []

    async def emit(event: AgentEvent) -> None:
        emitted.append(event)

    recovery = ToolBatchRecoveryController(context)
    follow_up = await recovery.build_follow_up(
        tool_call=failed_call,
        outcome=failed_outcome,
        emit=emit,
    )

    assert follow_up is not None
    assert context.recovery_context is not None
    assert "Previous attempts:" in follow_up.content
    assert any(item.type == "recovery" for item in emitted)
275
276
@pytest.mark.asyncio
async def test_tool_batch_recovery_controller_includes_known_state_for_missing_file(
    temp_dir: Path,
) -> None:
    """A missing-file read produces a follow-up that restates findings from the transcript."""

    async def confidence_must_not_run(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence should not run here")

    async def verification_must_not_run(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run here")

    # NOTE(review): this observation hard-codes one developer's absolute home
    # directory while the tool calls below use ``~`` — confirm that is intended.
    glob_observation = Message(
        role=Role.TOOL,
        content=(
            "Observation [glob]: Result: "
            "/Users/mfwolffe/Loader/guides/fortran/chapters/01-introduction.html\n"
            "/Users/mfwolffe/Loader/guides/fortran/chapters/02-setup.html\n"
            "/Users/mfwolffe/Loader/guides/fortran/chapters/03-basics.html\n"
            "/Users/mfwolffe/Loader/guides/fortran/chapters/04-variables.html"
        ),
        tool_results=[],
    )
    setup_read_request = Message(
        role=Role.ASSISTANT,
        content="I already inspected the setup chapter.",
        tool_calls=[
            ToolCall(
                id="read-setup",
                name="read",
                arguments={"file_path": "~/Loader/guides/fortran/chapters/02-setup.html"},
            )
        ],
    )
    setup_read_result = Message.tool_result_message(
        tool_call_id="read-setup",
        display_content="<h1>Chapter 2: Setting Up Fortran</h1>\n",
        result_content="<h1>Chapter 2: Setting Up Fortran</h1>\n",
    )
    notepad_observation = Message(
        role=Role.TOOL,
        content=(
            "Observation [notepad_write_working]: Result: "
            "- 02-basic-syntax.html -> 02-setup.html\n"
            "- 03-variables-data-types.html -> 03-basics.html\n"
            "- 04-operators-expressions.html -> 04-variables.html"
        ),
        tool_results=[],
    )
    index_read_request = Message(
        role=Role.ASSISTANT,
        content="I should update the index now.",
        tool_calls=[
            ToolCall(
                id="read-index",
                name="read",
                arguments={"file_path": "~/Loader/guides/fortran/index.html"},
            )
        ],
    )

    context = build_context(
        temp_dir=temp_dir,
        messages=[
            glob_observation,
            setup_read_request,
            setup_read_result,
            notepad_observation,
            index_read_request,
        ],
        assess_confidence=confidence_must_not_run,
        verify_action=verification_must_not_run,
    )
    context.session.current_task = (  # type: ignore[attr-defined]
        "Update ~/Loader/guides/fortran/index.html with the right chapter links."
    )

    missing_path = "~/Loader/guides/fortran/chapters/04-data-types.html"
    missing_read = ToolCall(
        id="read-missing",
        name="read",
        arguments={"file_path": missing_path},
    )
    missing_outcome = tool_outcome(
        tool_call=missing_read,
        output=f"File not found: {missing_path}",
        is_error=True,
    )
    emitted: list[AgentEvent] = []

    async def emit(event: AgentEvent) -> None:
        emitted.append(event)

    follow_up = await ToolBatchRecoveryController(context).build_follow_up(
        tool_call=missing_read,
        outcome=missing_outcome,
        emit=emit,
    )

    assert follow_up is not None
    expected_fragments = (
        "## CONTINUE FROM KNOWN STATE",
        "apply the fix using confirmed findings",
        "## ACTION BIAS FOR THIS RECOVERY",
        "Prefer edit/write/patch on the target file",
        "04-variables.html",
        "02-basic-syntax.html -> 02-setup.html",
    )
    for fragment in expected_fragments:
        assert fragment in follow_up.content
    assert any(item.type == "recovery" for item in emitted)
377
378
@pytest.mark.asyncio
async def test_tool_batch_recovery_controller_suggests_known_sibling_files(
    temp_dir: Path,
) -> None:
    """A missing file next to real siblings gets those siblings suggested as candidates."""

    async def confidence_must_not_run(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence should not run here")

    async def verification_must_not_run(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run here")

    # Two real chapter files live beside the path that will be reported missing.
    chapters = temp_dir / "chapters"
    chapters.mkdir()
    on_disk = {
        "04-variables.html": "<h1>Chapter 4: Variables and Data Types</h1>\n",
        "05-input-output.html": "<h1>Chapter 5: Input and Output</h1>\n",
    }
    for file_name, html in on_disk.items():
        (chapters / file_name).write_text(html)

    context = build_context(
        temp_dir=temp_dir,
        messages=[],
        assess_confidence=confidence_must_not_run,
        verify_action=verification_must_not_run,
    )

    missing_file = chapters / "04-data-types.html"
    missing_read = ToolCall(
        id="read-missing",
        name="read",
        arguments={"file_path": str(missing_file)},
    )
    missing_outcome = tool_outcome(
        tool_call=missing_read,
        output=f"File not found: {missing_file}",
        is_error=True,
    )
    emitted: list[AgentEvent] = []

    async def emit(event: AgentEvent) -> None:
        emitted.append(event)

    follow_up = await ToolBatchRecoveryController(context).build_follow_up(
        tool_call=missing_read,
        outcome=missing_outcome,
        emit=emit,
    )

    assert follow_up is not None
    reply = follow_up.content
    assert "## LIKELY FILE CANDIDATES" in reply
    assert "`04-variables.html`" in reply
    assert "instead of retrying the missing path" in reply
432
433
@pytest.mark.asyncio
async def test_tool_batch_recovery_controller_includes_current_html_target_excerpt(
    temp_dir: Path,
) -> None:
    """A failed patch on ``index.html`` gets a numbered excerpt of the on-disk file."""

    async def confidence_must_not_run(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence should not run here")

    async def verification_must_not_run(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run here")

    chapters = temp_dir / "chapters"
    chapters.mkdir()
    (chapters / "01-introduction.html").write_text(
        "<h1>Chapter 1: Introduction to Fortran</h1>\n"
    )
    (chapters / "02-setup.html").write_text(
        "<h1>Chapter 2: Setting Up Your Environment</h1>\n"
    )

    # The second link points at a chapter file that does not exist on disk.
    index = temp_dir / "index.html"
    index.write_text(
        "<h2>Table of Contents</h2>\n"
        '<ul class="chapter-list">\n'
        ' <li><a href="chapters/01-introduction.html">Chapter 1: Introduction to Fortran</a></li>\n'
        ' <li><a href="chapters/02-basic-syntax.html">Chapter 2: Basic Syntax</a></li>\n'
        "</ul>\n"
    )

    context = build_context(
        temp_dir=temp_dir,
        messages=[],
        assess_confidence=confidence_must_not_run,
        verify_action=verification_must_not_run,
    )
    context.session.current_task = (  # type: ignore[attr-defined]
        "Update index.html so every chapter link and title matches the real HTML files in chapters/."
    )

    bad_hunk = {
        "old_start": 1,
        "old_lines": 1,
        "new_start": 1,
        "new_lines": 1,
        "lines": ["-bad", "+good"],
    }
    patch_call = ToolCall(
        id="patch-index",
        name="patch",
        arguments={"file_path": str(index), "hunks": [bad_hunk]},
    )
    patch_outcome = tool_outcome(
        tool_call=patch_call,
        output="Patch failed: hunk did not apply cleanly",
        is_error=True,
    )
    emitted: list[AgentEvent] = []

    async def emit(event: AgentEvent) -> None:
        emitted.append(event)

    follow_up = await ToolBatchRecoveryController(context).build_follow_up(
        tool_call=patch_call,
        outcome=patch_outcome,
        emit=emit,
    )

    assert follow_up is not None
    reply = follow_up.content
    assert "## CURRENT TARGET EXCERPT" in reply
    assert "- Target file:" in reply
    assert "index.html" in reply
    # Either heading is accepted for the excerpt block.
    accepted_headings = (
        "Closest on-disk block to the requested patch:",
        "Current file contents near the requested patch location:",
    )
    assert any(heading in reply for heading in accepted_headings)
    assert '1 | <h2>Table of Contents</h2>' in reply
    assert (
        '3 | <li><a href="chapters/01-introduction.html">Chapter 1: Introduction to Fortran</a></li>'
        in reply
    )
    assert "Use the exact on-disk text above" in reply
    assert "Verified chapter inventory:" not in reply
519
520
@pytest.mark.asyncio
async def test_tool_batch_recovery_controller_includes_current_target_excerpt_for_edit_mismatch(
    temp_dir: Path,
) -> None:
    """An edit whose ``old_string`` is absent gets the on-disk excerpt and stale-edit guidance."""

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence should not run here")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run here")

    # The file has a "## Status" heading; the edit below looks for "## Runtime".
    guide = temp_dir / "guide.md"
    guide.write_text(
        "# Loader Guide\n"
        "\n"
        "## Overview\n"
        "Loader helps agentic coding workflows.\n"
        "\n"
        "## Status\n"
        "The runtime is stable.\n"
    )

    context = build_context(
        temp_dir=temp_dir,
        messages=[],
        assess_confidence=assess_confidence,
        verify_action=verify_action,
    )
    context.session.current_task = (  # type: ignore[attr-defined]
        "Update guide.md to mention the runtime is resilient."
    )
    controller = ToolBatchRecoveryController(context)
    tool_call = ToolCall(
        id="edit-guide",
        name="edit",
        arguments={
            "file_path": str(guide),
            "old_string": "## Runtime\nThe runtime is stable.\n",
            "new_string": "## Runtime\nThe runtime is resilient.\n",
        },
    )
    outcome = tool_outcome(
        tool_call=tool_call,
        output="old_string not found in file. Make sure it matches exactly.",
        is_error=True,
    )

    follow_up = await controller.build_follow_up(
        tool_call=tool_call,
        outcome=outcome,
        # Pass the coroutine function itself; wrapping it in a lambda added
        # nothing and hid the fact that the callback is async.
        emit=_noop_emit,
    )

    assert follow_up is not None
    assert "## CURRENT TARGET EXCERPT" in follow_up.content
    assert "- Target file:" in follow_up.content
    assert "guide.md" in follow_up.content
    assert "Closest on-disk block to the requested edit:" in follow_up.content
    assert "6 | ## Status" in follow_up.content
    assert "7 | The runtime is stable." in follow_up.content
    assert "replace the containing block in one edit" in follow_up.content
    assert "## STALE EDIT RECOVERY" in follow_up.content
    assert "do not retry it from memory" in follow_up.content
    assert "complete replacement file" in follow_up.content
582
583
@pytest.mark.asyncio
async def test_tool_batch_recovery_controller_scopes_known_state_to_active_target(
    temp_dir: Path,
) -> None:
    """A failed nginx edit must not steer the follow-up toward the fortran index."""

    async def confidence_must_not_run(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence should not run here")

    async def verification_must_not_run(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run here")

    nginx_root = temp_dir / "nginx"
    nginx_chapters = nginx_root / "chapters"
    nginx_chapters.mkdir(parents=True)
    nginx_index = nginx_root / "index.html"
    nginx_index.write_text(
        "<h2>Table of Contents</h2>\n"
        "<ul>\n"
        ' <li><a href="chapters/01_getting_started.html">Getting Started with NGINX</a></li>\n'
        ' <li><a href="chapters/02_installation.html">Installation</a></li>\n'
        "</ul>\n"
    )
    (nginx_chapters / "01_getting_started.html").write_text(
        "<h1>Getting Started with NGINX</h1>\n"
    )

    # The only transcript entry mentions the *fortran* index, not the nginx one.
    fortran_index = temp_dir / "fortran" / "index.html"
    fortran_observation = Message(
        role=Role.TOOL,
        content=(
            "Observation [read]: Result: "
            f"{fortran_index}\n"
            "Semantic verification preview: validated 12 toc links in index.html"
        ),
    )
    context = build_context(
        temp_dir=temp_dir,
        messages=[fortran_observation],
        assess_confidence=confidence_must_not_run,
        verify_action=verification_must_not_run,
    )
    context.session.current_task = (  # type: ignore[attr-defined]
        "Have a look at ~/Loader/guides/fortran and chapters/ within. Get a feel "
        "for the structure and cadence of the guide. We are going to make an all "
        "new equally thorough guide on how to use the nginx tool. It will live in "
        "~/Loader/guides/nginx/index.html and ~/Loader/guides/nginx/chapters/."
    )

    edit_call = ToolCall(
        id="edit-nginx",
        name="edit",
        arguments={
            "file_path": str(nginx_index),
            "old_string": "<ul>\n</ul>",
            "new_string": '<ul class="chapter-list">\n</ul>',
        },
    )
    edit_outcome = tool_outcome(
        tool_call=edit_call,
        output=(
            "Tool execution error: EditTool.execute() missing 1 required positional "
            "argument: 'new_string'"
        ),
        is_error=True,
    )
    emitted: list[AgentEvent] = []

    async def emit(event: AgentEvent) -> None:
        emitted.append(event)

    follow_up = await ToolBatchRecoveryController(context).build_follow_up(
        tool_call=edit_call,
        outcome=edit_outcome,
        emit=emit,
    )

    assert follow_up is not None
    unwanted_hint = f"Preferred next step: Update `{fortran_index}`"
    assert unwanted_hint not in follow_up.content
673
674
@pytest.mark.asyncio
async def test_tool_batch_recovery_controller_prioritizes_active_verification_repair_target(
    temp_dir: Path,
) -> None:
    """With a repair-focus message in the transcript, the follow-up centers on that target."""

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence should not run here")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run here")

    # The index links to 01-introduction.html, but only 01-getting-started.html exists.
    nginx_root = temp_dir / "Loader" / "guides" / "nginx"
    chapters = nginx_root / "chapters"
    chapters.mkdir(parents=True)
    index = nginx_root / "index.html"
    index.write_text(
        "<ul>\n"
        ' <li><a href="chapters/01-introduction.html">Introduction</a></li>\n'
        "</ul>\n"
    )
    (chapters / "01-getting-started.html").write_text("<h1>Getting Started</h1>\n")

    repair_message = (
        "[DEFINITION OF DONE CHECK FAILED]\n"
        "Repair focus:\n"
        f"- Fix the broken local reference `chapters/01-introduction.html` in `{index}`.\n"
        f"- Immediate next step: edit `{index}`.\n"
        f"- If the broken reference should remain, create `{chapters / '01-introduction.html'}`; "
        "otherwise remove or replace `chapters/01-introduction.html`.\n"
        "- Do not reread unrelated reference materials or restart discovery while this "
        "concrete repair target is unresolved.\n"
    )

    context = build_context(
        temp_dir=temp_dir,
        messages=[
            Message(role=Role.USER, content=repair_message),
            Message(
                role=Role.TOOL,
                content=(
                    "Observation [glob]: Result: "
                    f"{chapters / '01-getting-started.html'}"
                ),
            ),
        ],
        assess_confidence=assess_confidence,
        verify_action=verify_action,
    )
    context.session.current_task = (  # type: ignore[attr-defined]
        "Have a look at ~/Loader/guides/fortran and chapters/ within. Get a feel "
        "for the structure and cadence of the guide. We are going to make an all "
        "new equally thorough guide on how to use the nginx tool."
    )
    controller = ToolBatchRecoveryController(context)
    tool_call = ToolCall(
        id="read-bad-path",
        name="read",
        arguments={"path": "~/nginx-guide/chapter1.html"},
    )
    outcome = tool_outcome(
        tool_call=tool_call,
        output="File not found: ~/nginx-guide/chapter1.html",
        is_error=True,
    )

    follow_up = await controller.build_follow_up(
        tool_call=tool_call,
        outcome=outcome,
        # Pass the coroutine function itself; wrapping it in a lambda added
        # nothing and hid the fact that the callback is async.
        emit=_noop_emit,
    )

    assert follow_up is not None
    assert "## ACTIVE REPAIR TARGET" in follow_up.content
    assert str(index) in follow_up.content
    assert "chapters/01-introduction.html" in follow_up.content
    assert "Do not go back to the original reference guide" in follow_up.content
    assert "Current task: Have a look at ~/Loader/guides/fortran" not in follow_up.content
    assert "~/nginx-guide/chapter1.html" in follow_up.content
761
762
@pytest.mark.asyncio
async def test_tool_batch_recovery_controller_reuses_context_for_related_missing_files(
    temp_dir: Path,
) -> None:
    """A second missing-file read in the same directory extends the existing recovery context."""

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence should not run here")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run here")

    # Seed a recovery context that already holds one failed read attempt.
    existing = RecoveryContext(
        original_tool="read",
        original_args={"file_path": "~/Loader/guides/fortran/chapters/04-data-types.html"},
        max_retries=3,
    )
    existing.add_attempt(
        "read",
        {"file_path": "~/Loader/guides/fortran/chapters/04-data-types.html"},
        "File not found: ~/Loader/guides/fortran/chapters/04-data-types.html",
    )
    context = build_context(
        temp_dir=temp_dir,
        messages=[],
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        recovery_context=existing,
    )
    controller = ToolBatchRecoveryController(context)
    tool_call = ToolCall(
        id="read-missing-2",
        name="read",
        arguments={"file_path": "~/Loader/guides/fortran/chapters/02-basic-syntax.html"},
    )
    outcome = tool_outcome(
        tool_call=tool_call,
        output="File not found: ~/Loader/guides/fortran/chapters/02-basic-syntax.html",
        is_error=True,
    )

    follow_up = await controller.build_follow_up(
        tool_call=tool_call,
        outcome=outcome,
        # Pass the coroutine function itself; wrapping it in a lambda added
        # nothing and hid the fact that the callback is async.
        emit=_noop_emit,
    )

    assert follow_up is not None
    assert context.recovery_context is existing
    assert len(existing.attempts) == 2
    assert "## Current attempt: 2/3" in follow_up.content
    assert "02-basic-syntax.html" in follow_up.content
822
823
@pytest.mark.asyncio
async def test_tool_batch_recovery_controller_uses_generic_loop_guidance(
    temp_dir: Path,
) -> None:
    """Repeating an identical failed read emits an error event with generic guidance."""

    async def confidence_must_not_run(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence should not run here")

    async def verification_must_not_run(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run here")

    # The same path, arguments and error text are used for the prior attempt
    # and for the new failing call.
    missing_path = "~/Loader/guides/nginx/chapters/01-introduction.html"
    not_found = f"File not found: {missing_path}"

    existing = RecoveryContext(
        original_tool="read",
        original_args={"file_path": missing_path},
        max_retries=3,
    )
    existing.add_attempt("read", {"file_path": missing_path}, not_found)

    context = build_context(
        temp_dir=temp_dir,
        messages=[],
        assess_confidence=confidence_must_not_run,
        verify_action=verification_must_not_run,
        recovery_context=existing,
    )
    repeat_call = ToolCall(
        id="read-missing-repeat",
        name="read",
        arguments={"file_path": missing_path},
    )
    repeat_outcome = tool_outcome(
        tool_call=repeat_call,
        output=not_found,
        is_error=True,
    )
    emitted: list[AgentEvent] = []

    async def emit(event: AgentEvent) -> None:
        emitted.append(event)

    follow_up = await ToolBatchRecoveryController(context).build_follow_up(
        tool_call=repeat_call,
        outcome=repeat_outcome,
        emit=emit,
    )

    assert follow_up is not None
    error_events = [item for item in emitted if item.type == "error"]
    assert error_events
    first_error = error_events[0]
    assert "read a config file first" not in first_error.content
    assert "verify the current result" in first_error.content
887
888
@pytest.mark.asyncio
async def test_tool_batch_recovery_controller_surfaces_missing_write_payload_fix(
    temp_dir: Path,
) -> None:
    """A ``write`` call carrying only size metadata gets a payload-format fix in the follow-up."""

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence should not run here")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run here")

    context = build_context(
        temp_dir=temp_dir,
        messages=[
            Message(
                role=Role.USER,
                content="Create ~/Loader/guides/nginx/index.html",
            )
        ],
        assess_confidence=assess_confidence,
        verify_action=verify_action,
    )
    controller = ToolBatchRecoveryController(context)
    # The arguments describe the content's size but omit ``content`` itself.
    tool_call = ToolCall(
        id="write-metadata-only",
        name="write",
        arguments={
            "file_path": "~/Loader/guides/nginx/index.html",
            "content_chars": 1354,
            "content_lines": 30,
        },
    )
    outcome = tool_outcome(
        tool_call=tool_call,
        output=(
            "[Validation warning] Writing empty content to file\n"
            "Tool execution error: WriteTool.execute() missing 1 required "
            "positional argument: 'content'"
        ),
        is_error=True,
    )

    follow_up = await controller.build_follow_up(
        tool_call=tool_call,
        outcome=outcome,
        # Pass the coroutine function itself; wrapping it in a lambda added
        # nothing and hid the fact that the callback is async.
        emit=_noop_emit,
    )

    assert follow_up is not None
    assert "## PAYLOAD FORMAT FIX" in follow_up.content
    assert "content_chars" in follow_up.content
    assert "write(file_path=..., content='...')" in follow_up.content
    assert "index.html" in follow_up.content
950
951
@pytest.mark.asyncio
async def test_tool_batch_recovery_controller_resets_context_for_unrelated_failures(
    temp_dir: Path,
) -> None:
    """A failure unrelated to the stored recovery context starts a fresh context."""

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence should not run here")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run here")

    # Existing context tracks a missing-file read; the new failure is a bash command.
    existing = RecoveryContext(
        original_tool="read",
        original_args={"file_path": "~/Loader/guides/fortran/chapters/04-data-types.html"},
        max_retries=3,
    )
    existing.add_attempt(
        "read",
        {"file_path": "~/Loader/guides/fortran/chapters/04-data-types.html"},
        "File not found: ~/Loader/guides/fortran/chapters/04-data-types.html",
    )
    context = build_context(
        temp_dir=temp_dir,
        messages=[],
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        recovery_context=existing,
    )
    controller = ToolBatchRecoveryController(context)
    tool_call = ToolCall(
        id="bash-timeout",
        name="bash",
        arguments={"command": "pytest"},
    )
    outcome = tool_outcome(
        tool_call=tool_call,
        output="command failed",
        is_error=True,
    )

    follow_up = await controller.build_follow_up(
        tool_call=tool_call,
        outcome=outcome,
        # Pass the coroutine function itself; wrapping it in a lambda added
        # nothing and hid the fact that the callback is async.
        emit=_noop_emit,
    )

    assert follow_up is not None
    assert context.recovery_context is not None
    assert context.recovery_context is not existing
    assert len(context.recovery_context.attempts) == 1
    # "1/2": build_context configures max_recovery_attempts=2.
    assert "## Current attempt: 1/2" in follow_up.content
1011
1012
async def _noop_emit(event: AgentEvent) -> None:
    """Discard ``event``; an emit callback for tests that ignore emitted events."""