| 1 | """Direct tests for tool-batch confidence, verification, and recovery helpers.""" |
| 2 | |
| 3 | from __future__ import annotations |
| 4 | |
| 5 | from pathlib import Path |
| 6 | from types import SimpleNamespace |
| 7 | |
| 8 | import pytest |
| 9 | |
| 10 | from loader.llm.base import Message, Role, ToolCall |
| 11 | from loader.runtime.context import RuntimeContext |
| 12 | from loader.runtime.events import AgentEvent |
| 13 | from loader.runtime.executor import ToolExecutionOutcome, ToolExecutionState |
| 14 | from loader.runtime.permissions import ( |
| 15 | PermissionMode, |
| 16 | build_permission_policy, |
| 17 | load_permission_rules, |
| 18 | ) |
| 19 | from loader.runtime.reasoning_types import ( |
| 20 | ActionVerification, |
| 21 | ConfidenceAssessment, |
| 22 | ConfidenceLevel, |
| 23 | ) |
| 24 | from loader.runtime.recovery import RecoveryContext |
| 25 | from loader.runtime.tool_batch_checks import ( |
| 26 | ToolBatchConfidenceGate, |
| 27 | ToolBatchVerificationGate, |
| 28 | ) |
| 29 | from loader.runtime.tool_batch_recovery import ToolBatchRecoveryController |
| 30 | from loader.tools.base import ToolResult as RegistryToolResult |
| 31 | from loader.tools.base import create_default_registry |
| 32 | from tests.helpers.runtime_harness import ScriptedBackend |
| 33 | |
| 34 | |
class FakeSession:
    """In-memory session double that only keeps an ordered list of messages."""

    def __init__(self, messages: list[Message]) -> None:
        # Take a shallow copy so later appends never touch the caller's list.
        self.messages = [*messages]

    def append(self, message: Message) -> None:
        """Record ``message`` at the end of the stored conversation."""
        self.messages += [message]
| 41 | |
| 42 | |
class FakeCodeFilter:
    """Code-filter double whose ``reset`` has no effect."""

    def reset(self) -> None:
        """Do nothing and return ``None``."""
| 46 | |
| 47 | |
class FakeSafeguards:
    """Pass-through safeguards double: content is never altered and steering is off."""

    def __init__(self) -> None:
        # Bare ``object()`` placeholders fill the tracker and validator slots.
        self.action_tracker, self.validator = object(), object()
        self.code_filter = FakeCodeFilter()

    def filter_stream_chunk(self, content: str) -> str:
        """Return the streamed chunk unchanged."""
        return content

    def filter_complete_content(self, content: str) -> str:
        """Return the complete content unchanged."""
        return content

    def should_steer(self) -> bool:
        """Always report that no steering is needed."""
        return False

    def get_steering_message(self) -> str | None:
        """Always return ``None``; this double never produces a steering message."""

    def record_response(self, content: str) -> None:
        """Ignore the response and return ``None``."""
| 68 | |
| 69 | |
def build_context(
    *,
    temp_dir: Path,
    messages: list[Message],
    assess_confidence,
    verify_action,
    recovery_context: RecoveryContext | None = None,
    confidence_scoring: bool = False,
    verification: bool = False,
    min_confidence_for_action: int = 3,
) -> RuntimeContext:
    """Assemble a ``RuntimeContext`` rooted at ``temp_dir`` for these tests.

    The default tool registry and a ``WORKSPACE_WRITE`` permission policy are
    combined with test doubles: a ``ScriptedBackend``, a ``FakeSession`` seeded
    with ``messages``, ``FakeSafeguards``, and a reasoning namespace that
    exposes only the two supplied callables.

    Args:
        temp_dir: Directory used as project root and workspace root.
        messages: Initial messages handed to the fake session.
        assess_confidence: Callable exposed as ``reasoning.assess_confidence``.
        verify_action: Callable exposed as ``reasoning.verify_action``.
        recovery_context: Optional recovery context stored on the result.
        confidence_scoring: Value for ``config.reasoning.confidence_scoring``.
        verification: Value for ``config.reasoning.verification``.
        min_confidence_for_action: Value for
            ``config.reasoning.min_confidence_for_action``.

    Returns:
        The assembled ``RuntimeContext`` in ``"execute"`` workflow mode.
    """
    tool_registry = create_default_registry(temp_dir)
    tool_registry.configure_workspace_root(temp_dir)

    permission_status = load_permission_rules(temp_dir)
    permission_policy = build_permission_policy(
        active_mode=PermissionMode.WORKSPACE_WRITE,
        workspace_root=temp_dir,
        tool_requirements=tool_registry.get_tool_requirements(),
        rules=permission_status.rules,
    )

    # Only the three keyword-driven reasoning flags vary between tests.
    reasoning_settings = SimpleNamespace(
        rollback=False,
        show_rollback_plan=False,
        completion_check=True,
        max_continuation_prompts=5,
        self_critique=False,
        confidence_scoring=confidence_scoring,
        min_confidence_for_action=min_confidence_for_action,
        verification=verification,
    )
    runtime_config = SimpleNamespace(
        force_react=False,
        max_recovery_attempts=2,
        auto_recover=True,
        reasoning=reasoning_settings,
    )
    reasoning_hooks = SimpleNamespace(
        assess_confidence=assess_confidence,
        verify_action=verify_action,
    )

    return RuntimeContext(
        project_root=temp_dir,
        backend=ScriptedBackend(),
        registry=tool_registry,
        session=FakeSession(messages),  # type: ignore[arg-type]
        config=runtime_config,
        capability_profile=SimpleNamespace(supports_native_tools=True),  # type: ignore[arg-type]
        project_context=None,
        permission_policy=permission_policy,
        permission_config_status=permission_status,
        workflow_mode="execute",
        safeguards=FakeSafeguards(),
        reasoning=reasoning_hooks,
        recovery_context=recovery_context,
    )
| 122 | |
| 123 | |
def tool_outcome(
    *,
    tool_call: ToolCall,
    output: str,
    is_error: bool,
) -> ToolExecutionOutcome:
    """Build an ``EXECUTED`` outcome for ``tool_call`` carrying ``output``.

    The same ``output`` text is used for the display content, result content,
    event content and registry result; ``is_error`` is propagated to the
    message, the outcome and the registry result alike.
    """
    result_message = Message.tool_result_message(
        tool_call_id=tool_call.id,
        display_content=output,
        result_content=output,
        is_error=is_error,
    )
    registry_result = RegistryToolResult(output=output, is_error=is_error)
    return ToolExecutionOutcome(
        tool_call=tool_call,
        state=ToolExecutionState.EXECUTED,
        message=result_message,
        event_content=output,
        is_error=is_error,
        result_output=output,
        registry_result=registry_result,
    )
| 144 | |
| 145 | |
@pytest.mark.asyncio
async def test_tool_batch_confidence_gate_skips_low_confidence_actions(
    temp_dir: Path,
) -> None:
    """A LOW confidence assessment makes the gate skip the call and warn the session."""
    seen: dict[str, str] = {}
    emitted: list[AgentEvent] = []

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        # Keep the context string so the test can inspect what the gate passed in.
        seen["context"] = context
        return ConfidenceAssessment(
            action=f"{tool_name} with {tool_args}",
            tool_name=tool_name,
            tool_args=tool_args,
            level=ConfidenceLevel.LOW,
            reasoning="Need more context first.",
            risks=["Unknown file contents"],
        )

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run here")

    async def emit(event: AgentEvent) -> None:
        emitted.append(event)

    history = [
        Message(role=Role.USER, content="Inspect the README."),
        Message(role=Role.ASSISTANT, content="I'll read it next."),
    ]
    runtime = build_context(
        temp_dir=temp_dir,
        messages=history,
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        confidence_scoring=True,
    )
    gate = ToolBatchConfidenceGate(runtime)
    read_call = ToolCall(id="read-1", name="read", arguments={"file_path": "README.md"})

    should_skip = await gate.should_skip(tool_call=read_call, emit=emit)

    assert should_skip is True
    assert "Inspect the README." in seen["context"]
    final_message = runtime.session.messages[-1]
    assert final_message.role == Role.USER
    assert "[LOW CONFIDENCE WARNING]" in final_message.content
    assert [event.type for event in emitted] == ["confidence"]
| 190 | |
| 191 | |
@pytest.mark.asyncio
async def test_tool_batch_verification_gate_requests_correction(
    temp_dir: Path,
) -> None:
    """A verification that needs correction adds a failure notice and continues."""
    emitted: list[AgentEvent] = []

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence should not run here")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        # NOTE(review): ``verified=True`` is combined with ``needs_correction=True``
        # while the test expects "[VERIFICATION FAILED]" — confirm this is intended.
        return ActionVerification(
            tool_name=tool_name,
            tool_args=tool_args,
            expected_outcome="Success",
            actual_result=result,
            verified=True,
            discrepancies=["Output did not match the requested content"],
            needs_correction=True,
            correction_suggestion="Read the file before editing again.",
        )

    async def emit(event: AgentEvent) -> None:
        emitted.append(event)

    runtime = build_context(
        temp_dir=temp_dir,
        messages=[],
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        verification=True,
    )
    gate = ToolBatchVerificationGate(runtime)
    read_call = ToolCall(id="read-1", name="read", arguments={"file_path": "README.md"})
    read_outcome = tool_outcome(
        tool_call=read_call,
        output="unexpected contents",
        is_error=False,
    )

    should_continue = await gate.should_continue(
        tool_call=read_call,
        outcome=read_outcome,
        emit=emit,
    )

    assert should_continue is True
    final_message = runtime.session.messages[-1]
    assert final_message.role == Role.USER
    assert "[VERIFICATION FAILED]" in final_message.content
    assert [event.type for event in emitted] == ["verification"]
| 236 | |
| 237 | |
@pytest.mark.asyncio
async def test_tool_batch_recovery_controller_returns_follow_up(
    temp_dir: Path,
) -> None:
    """A failed ``bash`` call yields a follow-up, a recovery context and a recovery event."""
    emitted: list[AgentEvent] = []

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence should not run here")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run here")

    async def emit(event: AgentEvent) -> None:
        emitted.append(event)

    runtime = build_context(
        temp_dir=temp_dir,
        messages=[],
        assess_confidence=assess_confidence,
        verify_action=verify_action,
    )
    task = (
        "Update index.html so every chapter link and title matches the real HTML files in chapters/."
    )
    runtime.session.current_task = task
    controller = ToolBatchRecoveryController(runtime)
    bash_call = ToolCall(id="bash-1", name="bash", arguments={"command": "pytest"})
    failed = tool_outcome(tool_call=bash_call, output="command failed", is_error=True)

    follow_up = await controller.build_follow_up(
        tool_call=bash_call,
        outcome=failed,
        emit=emit,
    )

    assert follow_up is not None
    assert runtime.recovery_context is not None
    assert "Previous attempts:" in follow_up.content
    assert any(event.type == "recovery" for event in emitted)
| 275 | |
| 276 | |
@pytest.mark.asyncio
async def test_tool_batch_recovery_controller_includes_known_state_for_missing_file(
    temp_dir: Path,
) -> None:
    """A missing-file read after earlier discovery yields known-state recovery guidance."""
    emitted: list[AgentEvent] = []

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence should not run here")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run here")

    async def emit(event: AgentEvent) -> None:
        emitted.append(event)

    # Earlier glob observation: one absolute chapter path per line.
    globbed_dir = "/Users/mfwolffe/Loader/guides/fortran/chapters"
    glob_listing = "\n".join(
        f"{globbed_dir}/{name}"
        for name in (
            "01-introduction.html",
            "02-setup.html",
            "03-basics.html",
            "04-variables.html",
        )
    )
    # Earlier notepad observation: stale link name -> real file name.
    rename_notes = "\n".join(
        (
            "- 02-basic-syntax.html -> 02-setup.html",
            "- 03-variables-data-types.html -> 03-basics.html",
            "- 04-operators-expressions.html -> 04-variables.html",
        )
    )
    setup_heading = "<h1>Chapter 2: Setting Up Fortran</h1>\n"

    history = [
        Message(
            role=Role.TOOL,
            content="Observation [glob]: Result: " + glob_listing,
            tool_results=[],
        ),
        Message(
            role=Role.ASSISTANT,
            content="I already inspected the setup chapter.",
            tool_calls=[
                ToolCall(
                    id="read-setup",
                    name="read",
                    arguments={"file_path": "~/Loader/guides/fortran/chapters/02-setup.html"},
                )
            ],
        ),
        Message.tool_result_message(
            tool_call_id="read-setup",
            display_content=setup_heading,
            result_content=setup_heading,
        ),
        Message(
            role=Role.TOOL,
            content="Observation [notepad_write_working]: Result: " + rename_notes,
            tool_results=[],
        ),
        Message(
            role=Role.ASSISTANT,
            content="I should update the index now.",
            tool_calls=[
                ToolCall(
                    id="read-index",
                    name="read",
                    arguments={"file_path": "~/Loader/guides/fortran/index.html"},
                )
            ],
        ),
    ]
    runtime = build_context(
        temp_dir=temp_dir,
        messages=history,
        assess_confidence=assess_confidence,
        verify_action=verify_action,
    )
    runtime.session.current_task = (
        "Update ~/Loader/guides/fortran/index.html with the right chapter links."
    )
    controller = ToolBatchRecoveryController(runtime)

    missing_path = "~/Loader/guides/fortran/chapters/04-data-types.html"
    missing_call = ToolCall(
        id="read-missing",
        name="read",
        arguments={"file_path": missing_path},
    )
    failed = tool_outcome(
        tool_call=missing_call,
        output=f"File not found: {missing_path}",
        is_error=True,
    )

    follow_up = await controller.build_follow_up(
        tool_call=missing_call,
        outcome=failed,
        emit=emit,
    )

    assert follow_up is not None
    for fragment in (
        "## CONTINUE FROM KNOWN STATE",
        "apply the fix using confirmed findings",
        "## ACTION BIAS FOR THIS RECOVERY",
        "Prefer edit/write/patch on the target file",
        "04-variables.html",
        "02-basic-syntax.html -> 02-setup.html",
    ):
        assert fragment in follow_up.content
    assert any(event.type == "recovery" for event in emitted)
| 377 | |
| 378 | |
@pytest.mark.asyncio
async def test_tool_batch_recovery_controller_suggests_known_sibling_files(
    temp_dir: Path,
) -> None:
    """A read of a missing chapter suggests an existing sibling file as a candidate."""
    emitted: list[AgentEvent] = []

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence should not run here")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run here")

    async def emit(event: AgentEvent) -> None:
        emitted.append(event)

    # Two real chapters exist on disk; the requested one does not.
    chapters_dir = temp_dir / "chapters"
    chapters_dir.mkdir()
    on_disk = {
        "04-variables.html": "<h1>Chapter 4: Variables and Data Types</h1>\n",
        "05-input-output.html": "<h1>Chapter 5: Input and Output</h1>\n",
    }
    for file_name, body in on_disk.items():
        (chapters_dir / file_name).write_text(body)

    history: list[Message] = []
    runtime = build_context(
        temp_dir=temp_dir,
        messages=history,
        assess_confidence=assess_confidence,
        verify_action=verify_action,
    )
    controller = ToolBatchRecoveryController(runtime)

    missing_file = chapters_dir / "04-data-types.html"
    missing_call = ToolCall(
        id="read-missing",
        name="read",
        arguments={"file_path": str(missing_file)},
    )
    failed = tool_outcome(
        tool_call=missing_call,
        output=f"File not found: {missing_file}",
        is_error=True,
    )

    follow_up = await controller.build_follow_up(
        tool_call=missing_call,
        outcome=failed,
        emit=emit,
    )

    assert follow_up is not None
    for fragment in (
        "## LIKELY FILE CANDIDATES",
        "`04-variables.html`",
        "instead of retrying the missing path",
    ):
        assert fragment in follow_up.content
| 432 | |
| 433 | |
@pytest.mark.asyncio
async def test_tool_batch_recovery_controller_includes_current_html_target_excerpt(
    temp_dir: Path,
) -> None:
    """A failed patch on ``index.html`` makes the follow-up quote the on-disk file."""
    emitted: list[AgentEvent] = []

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence should not run here")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run here")

    async def emit(event: AgentEvent) -> None:
        emitted.append(event)

    chapters_dir = temp_dir / "chapters"
    chapters_dir.mkdir()
    (chapters_dir / "01-introduction.html").write_text(
        "<h1>Chapter 1: Introduction to Fortran</h1>\n"
    )
    (chapters_dir / "02-setup.html").write_text(
        "<h1>Chapter 2: Setting Up Your Environment</h1>\n"
    )
    # The index's second link points at a chapter file that is not on disk.
    index_path = temp_dir / "index.html"
    index_path.write_text(
        "<h2>Table of Contents</h2>\n"
        '<ul class="chapter-list">\n'
        ' <li><a href="chapters/01-introduction.html">Chapter 1: Introduction to Fortran</a></li>\n'
        ' <li><a href="chapters/02-basic-syntax.html">Chapter 2: Basic Syntax</a></li>\n'
        "</ul>\n"
    )

    runtime = build_context(
        temp_dir=temp_dir,
        messages=[],
        assess_confidence=assess_confidence,
        verify_action=verify_action,
    )
    runtime.session.current_task = (
        "Update index.html so every chapter link and title matches the real HTML files in chapters/."
    )
    controller = ToolBatchRecoveryController(runtime)

    bad_hunk = {
        "old_start": 1,
        "old_lines": 1,
        "new_start": 1,
        "new_lines": 1,
        "lines": ["-bad", "+good"],
    }
    patch_call = ToolCall(
        id="patch-index",
        name="patch",
        arguments={"file_path": str(index_path), "hunks": [bad_hunk]},
    )
    failed = tool_outcome(
        tool_call=patch_call,
        output="Patch failed: hunk did not apply cleanly",
        is_error=True,
    )

    follow_up = await controller.build_follow_up(
        tool_call=patch_call,
        outcome=failed,
        emit=emit,
    )

    assert follow_up is not None
    assert "## CURRENT TARGET EXCERPT" in follow_up.content
    assert "- Target file:" in follow_up.content
    assert "index.html" in follow_up.content
    # Either excerpt heading is acceptable.
    assert any(
        heading in follow_up.content
        for heading in (
            "Closest on-disk block to the requested patch:",
            "Current file contents near the requested patch location:",
        )
    )
    assert '1 | <h2>Table of Contents</h2>' in follow_up.content
    first_link_line = (
        '3 | <li><a href="chapters/01-introduction.html">Chapter 1: Introduction to Fortran</a></li>'
    )
    assert first_link_line in follow_up.content
    assert "Use the exact on-disk text above" in follow_up.content
    assert "Verified chapter inventory:" not in follow_up.content
| 519 | |
| 520 | |
@pytest.mark.asyncio
async def test_tool_batch_recovery_controller_includes_current_target_excerpt_for_edit_mismatch(
    temp_dir: Path,
) -> None:
    """An ``edit`` whose ``old_string`` is absent yields an excerpt and stale-edit guidance."""

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence should not run here")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run here")

    # Seven-line guide; the stable-runtime sentence sits under "## Status".
    guide_lines = [
        "# Loader Guide",
        "",
        "## Overview",
        "Loader helps agentic coding workflows.",
        "",
        "## Status",
        "The runtime is stable.",
    ]
    guide_path = temp_dir / "guide.md"
    guide_path.write_text("\n".join(guide_lines) + "\n")

    runtime = build_context(
        temp_dir=temp_dir,
        messages=[],
        assess_confidence=assess_confidence,
        verify_action=verify_action,
    )
    runtime.session.current_task = "Update guide.md to mention the runtime is resilient."
    controller = ToolBatchRecoveryController(runtime)

    # The edit targets a "## Runtime" heading that the file does not contain.
    edit_call = ToolCall(
        id="edit-guide",
        name="edit",
        arguments={
            "file_path": str(guide_path),
            "old_string": "## Runtime\nThe runtime is stable.\n",
            "new_string": "## Runtime\nThe runtime is resilient.\n",
        },
    )
    failed = tool_outcome(
        tool_call=edit_call,
        output="old_string not found in file. Make sure it matches exactly.",
        is_error=True,
    )

    follow_up = await controller.build_follow_up(
        tool_call=edit_call,
        outcome=failed,
        emit=lambda event: _noop_emit(event),
    )

    assert follow_up is not None
    for fragment in (
        "## CURRENT TARGET EXCERPT",
        "- Target file:",
        "guide.md",
        "Closest on-disk block to the requested edit:",
        "6 | ## Status",
        "7 | The runtime is stable.",
        "replace the containing block in one edit",
        "## STALE EDIT RECOVERY",
        "do not retry it from memory",
        "complete replacement file",
    ):
        assert fragment in follow_up.content
| 582 | |
| 583 | |
@pytest.mark.asyncio
async def test_tool_batch_recovery_controller_scopes_known_state_to_active_target(
    temp_dir: Path,
) -> None:
    """A failed nginx edit must not produce a next-step hint for the fortran index."""
    emitted: list[AgentEvent] = []

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence should not run here")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run here")

    async def emit(event: AgentEvent) -> None:
        emitted.append(event)

    nginx_dir = temp_dir / "nginx"
    nginx_chapters = nginx_dir / "chapters"
    nginx_chapters.mkdir(parents=True)
    nginx_index = nginx_dir / "index.html"
    nginx_index.write_text(
        "<h2>Table of Contents</h2>\n"
        "<ul>\n"
        ' <li><a href="chapters/01_getting_started.html">Getting Started with NGINX</a></li>\n'
        ' <li><a href="chapters/02_installation.html">Installation</a></li>\n'
        "</ul>\n"
    )
    (nginx_chapters / "01_getting_started.html").write_text(
        "<h1>Getting Started with NGINX</h1>\n"
    )

    # The only prior observation refers to the *fortran* index, not the nginx one.
    fortran_index = temp_dir / "fortran" / "index.html"
    prior_observation = Message(
        role=Role.TOOL,
        content=(
            f"Observation [read]: Result: {fortran_index}\n"
            "Semantic verification preview: validated 12 toc links in index.html"
        ),
    )

    runtime = build_context(
        temp_dir=temp_dir,
        messages=[prior_observation],
        assess_confidence=assess_confidence,
        verify_action=verify_action,
    )
    task = (
        "Have a look at ~/Loader/guides/fortran and chapters/ within. Get a feel "
        "for the structure and cadence of the guide. We are going to make an all "
        "new equally thorough guide on how to use the nginx tool. It will live in "
        "~/Loader/guides/nginx/index.html and ~/Loader/guides/nginx/chapters/."
    )
    runtime.session.current_task = task  # type: ignore[attr-defined]
    controller = ToolBatchRecoveryController(runtime)

    edit_call = ToolCall(
        id="edit-nginx",
        name="edit",
        arguments={
            "file_path": str(nginx_index),
            "old_string": "<ul>\n</ul>",
            "new_string": '<ul class="chapter-list">\n</ul>',
        },
    )
    failed = tool_outcome(
        tool_call=edit_call,
        output=(
            "Tool execution error: EditTool.execute() missing 1 required positional "
            "argument: 'new_string'"
        ),
        is_error=True,
    )

    follow_up = await controller.build_follow_up(
        tool_call=edit_call,
        outcome=failed,
        emit=emit,
    )

    assert follow_up is not None
    unrelated_hint = f"Preferred next step: Update `{fortran_index}`"
    assert unrelated_hint not in follow_up.content
| 673 | |
| 674 | |
@pytest.mark.asyncio
async def test_tool_batch_recovery_controller_prioritizes_active_verification_repair_target(
    temp_dir: Path,
) -> None:
    """With a repair-focus message in the session, the follow-up names that repair target."""

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence should not run here")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run here")

    guide_root = temp_dir / "Loader" / "guides" / "nginx"
    chapters_dir = guide_root / "chapters"
    chapters_dir.mkdir(parents=True)
    index_path = guide_root / "index.html"
    # The index links to a chapter name that differs from the file on disk.
    index_path.write_text(
        "<ul>\n"
        ' <li><a href="chapters/01-introduction.html">Introduction</a></li>\n'
        "</ul>\n"
    )
    existing_chapter = chapters_dir / "01-getting-started.html"
    existing_chapter.write_text("<h1>Getting Started</h1>\n")

    linked_chapter = chapters_dir / "01-introduction.html"
    repair_lines = (
        "[DEFINITION OF DONE CHECK FAILED]",
        "Repair focus:",
        f"- Fix the broken local reference `chapters/01-introduction.html` in `{index_path}`.",
        f"- Immediate next step: edit `{index_path}`.",
        f"- If the broken reference should remain, create `{linked_chapter}`; "
        "otherwise remove or replace `chapters/01-introduction.html`.",
        "- Do not reread unrelated reference materials or restart discovery while this "
        "concrete repair target is unresolved.",
    )
    # Every line, including the last, ends with a newline.
    repair_message = "".join(f"{line}\n" for line in repair_lines)

    history = [
        Message(role=Role.USER, content=repair_message),
        Message(
            role=Role.TOOL,
            content=f"Observation [glob]: Result: {existing_chapter}",
        ),
    ]
    runtime = build_context(
        temp_dir=temp_dir,
        messages=history,
        assess_confidence=assess_confidence,
        verify_action=verify_action,
    )
    task = (
        "Have a look at ~/Loader/guides/fortran and chapters/ within. Get a feel "
        "for the structure and cadence of the guide. We are going to make an all "
        "new equally thorough guide on how to use the nginx tool."
    )
    runtime.session.current_task = task  # type: ignore[attr-defined]
    controller = ToolBatchRecoveryController(runtime)

    wrong_path = "~/nginx-guide/chapter1.html"
    read_call = ToolCall(
        id="read-bad-path",
        name="read",
        arguments={"path": wrong_path},
    )
    failed = tool_outcome(
        tool_call=read_call,
        output=f"File not found: {wrong_path}",
        is_error=True,
    )

    follow_up = await controller.build_follow_up(
        tool_call=read_call,
        outcome=failed,
        emit=lambda event: _noop_emit(event),
    )

    assert follow_up is not None
    assert "## ACTIVE REPAIR TARGET" in follow_up.content
    assert str(index_path) in follow_up.content
    assert "chapters/01-introduction.html" in follow_up.content
    assert "Do not go back to the original reference guide" in follow_up.content
    assert "Current task: Have a look at ~/Loader/guides/fortran" not in follow_up.content
    assert wrong_path in follow_up.content
| 761 | |
| 762 | |
@pytest.mark.asyncio
async def test_tool_batch_recovery_controller_reuses_context_for_related_missing_files(
    temp_dir: Path,
) -> None:
    """A second missing-file read is added to the existing recovery context."""

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence should not run here")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run here")

    first_missing = "~/Loader/guides/fortran/chapters/04-data-types.html"
    second_missing = "~/Loader/guides/fortran/chapters/02-basic-syntax.html"

    # Seed a recovery context that already holds one failed read attempt.
    prior_recovery = RecoveryContext(
        original_tool="read",
        original_args={"file_path": first_missing},
        max_retries=3,
    )
    prior_recovery.add_attempt(
        "read",
        {"file_path": first_missing},
        f"File not found: {first_missing}",
    )

    runtime = build_context(
        temp_dir=temp_dir,
        messages=[],
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        recovery_context=prior_recovery,
    )
    controller = ToolBatchRecoveryController(runtime)
    read_call = ToolCall(
        id="read-missing-2",
        name="read",
        arguments={"file_path": second_missing},
    )
    failed = tool_outcome(
        tool_call=read_call,
        output=f"File not found: {second_missing}",
        is_error=True,
    )

    follow_up = await controller.build_follow_up(
        tool_call=read_call,
        outcome=failed,
        emit=lambda event: _noop_emit(event),
    )

    assert follow_up is not None
    # The same context object must still be installed, now holding two attempts.
    assert runtime.recovery_context is prior_recovery
    assert len(prior_recovery.attempts) == 2
    assert "## Current attempt: 2/3" in follow_up.content
    assert "02-basic-syntax.html" in follow_up.content
| 822 | |
| 823 | |
@pytest.mark.asyncio
async def test_tool_batch_recovery_controller_uses_generic_loop_guidance(
    temp_dir: Path,
) -> None:
    """Repeating an already-failed read emits an error event with generic guidance."""
    emitted: list[AgentEvent] = []

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence should not run here")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run here")

    async def emit(event: AgentEvent) -> None:
        emitted.append(event)

    missing_path = "~/Loader/guides/nginx/chapters/01-introduction.html"
    not_found = f"File not found: {missing_path}"

    # The seeded context already records this exact read failing once.
    prior_recovery = RecoveryContext(
        original_tool="read",
        original_args={"file_path": missing_path},
        max_retries=3,
    )
    prior_recovery.add_attempt("read", {"file_path": missing_path}, not_found)

    runtime = build_context(
        temp_dir=temp_dir,
        messages=[],
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        recovery_context=prior_recovery,
    )
    controller = ToolBatchRecoveryController(runtime)
    repeat_call = ToolCall(
        id="read-missing-repeat",
        name="read",
        arguments={"file_path": missing_path},
    )
    failed = tool_outcome(tool_call=repeat_call, output=not_found, is_error=True)

    follow_up = await controller.build_follow_up(
        tool_call=repeat_call,
        outcome=failed,
        emit=emit,
    )

    assert follow_up is not None
    error_events = [event for event in emitted if event.type == "error"]
    assert error_events
    first_error = error_events[0]
    assert "read a config file first" not in first_error.content
    assert "verify the current result" in first_error.content
| 887 | |
| 888 | |
@pytest.mark.asyncio
async def test_tool_batch_recovery_controller_surfaces_missing_write_payload_fix(
    temp_dir: Path,
) -> None:
    """Check the follow-up built for a ``write`` call that carried no content.

    The failing call passes only ``file_path`` plus ``content_chars`` /
    ``content_lines`` metadata, and the tool output reports the missing
    ``content`` argument. The follow-up message must include the payload
    format fix section, name the stray metadata key, show the expected call
    shape, and mention the target file.
    """
    target_path = "~/Loader/guides/nginx/index.html"

    # Neither reasoning hook may be reached on the recovery path.
    async def fail_if_assessed(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence should not run here")

    async def fail_if_verified(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run here")

    user_request = Message(role=Role.USER, content=f"Create {target_path}")
    runtime = build_context(
        temp_dir=temp_dir,
        messages=[user_request],
        assess_confidence=fail_if_assessed,
        verify_action=fail_if_verified,
    )
    recovery = ToolBatchRecoveryController(runtime)

    # The arguments describe the content's size but omit the content itself.
    metadata_only_call = ToolCall(
        id="write-metadata-only",
        name="write",
        arguments={
            "file_path": target_path,
            "content_chars": 1354,
            "content_lines": 30,
        },
    )
    error_text = "\n".join(
        [
            "[Validation warning] Writing empty content to file",
            "Tool execution error: WriteTool.execute() missing 1 required "
            "positional argument: 'content'",
        ]
    )
    failed_outcome = tool_outcome(
        tool_call=metadata_only_call,
        output=error_text,
        is_error=True,
    )

    follow_up = await recovery.build_follow_up(
        tool_call=metadata_only_call,
        outcome=failed_outcome,
        emit=lambda event: _noop_emit(event),
    )

    assert follow_up is not None
    expected_fragments = (
        "## PAYLOAD FORMAT FIX",
        "content_chars",
        "write(file_path=..., content='...')",
        "index.html",
    )
    for fragment in expected_fragments:
        assert fragment in follow_up.content
| 950 | |
| 951 | |
@pytest.mark.asyncio
async def test_tool_batch_recovery_controller_resets_context_for_unrelated_failures(
    temp_dir: Path,
) -> None:
    """Check that a failure of a different tool does not reuse the old context.

    The runtime starts with a recovery context that records one failed
    ``read`` (``max_retries=3``). A failing ``bash`` call is then processed;
    afterwards the runtime must hold a different recovery context with a
    single attempt, and the follow-up must report attempt 1/2.
    """
    stale_path = "~/Loader/guides/fortran/chapters/04-data-types.html"

    # Neither reasoning hook may be reached on the recovery path.
    async def fail_if_assessed(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence should not run here")

    async def fail_if_verified(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run here")

    # Pre-existing context for an earlier, unrelated read failure.
    stale_recovery = RecoveryContext(
        original_tool="read",
        original_args={"file_path": stale_path},
        max_retries=3,
    )
    stale_recovery.add_attempt(
        "read",
        {"file_path": stale_path},
        f"File not found: {stale_path}",
    )

    runtime = build_context(
        temp_dir=temp_dir,
        messages=[],
        assess_confidence=fail_if_assessed,
        verify_action=fail_if_verified,
        recovery_context=stale_recovery,
    )
    recovery = ToolBatchRecoveryController(runtime)

    bash_call = ToolCall(
        id="bash-timeout",
        name="bash",
        arguments={"command": "pytest"},
    )
    failed_outcome = tool_outcome(
        tool_call=bash_call,
        output="command failed",
        is_error=True,
    )

    follow_up = await recovery.build_follow_up(
        tool_call=bash_call,
        outcome=failed_outcome,
        emit=lambda event: _noop_emit(event),
    )

    assert follow_up is not None
    assert runtime.recovery_context is not None
    assert runtime.recovery_context is not stale_recovery
    assert len(runtime.recovery_context.attempts) == 1
    assert "## Current attempt: 1/2" in follow_up.content
| 1011 | |
| 1012 | |
async def _noop_emit(event: AgentEvent) -> None:
    """Accept an emitted event and discard it; the awaited result is ``None``."""