Adopt runtime-owned action reasoning services
- SHA: ad61aba70e9bb47b380336335314b48b4fb9444d (ad61aba)
- Parents: 8902e62
- Tree: 771c2bb

| Status | File | + | - |
|---|---|---|---|
| M | src/loader/agent/loop.py | 2 | 100 |
| A | src/loader/runtime/action_reasoning.py | 227 | 0 |
| M | src/loader/runtime/context.py | 50 | 5 |
| A | src/loader/runtime/reasoning_service.py | 100 | 0 |
| M | src/loader/runtime/tool_batches.py | 6 | 6 |
| M | tests/test_assistant_turns.py | 0 | 4 |
| M | tests/test_finalization.py | 0 | 4 |
| M | tests/test_repair.py | 0 | 4 |
| M | tests/test_runtime_context.py | 3 | 3 |
| M | tests/test_tool_batches.py | 11 | 13 |
src/loader/agent/loop.py (modified)
@@ -20,6 +20,7 @@ from ..runtime.permissions import ( | ||
| 20 | 20 | load_permission_rules, |
| 21 | 21 | ) |
| 22 | 22 | from ..runtime.prompt_history import PromptSnapshot |
| 23 | +from ..runtime.reasoning_service import RuntimeReasoningService | |
| 23 | 24 | from ..runtime.session import ConversationSession |
| 24 | 25 | from ..runtime.workflow import WorkflowMode |
| 25 | 26 | from ..tools.base import ToolRegistry, create_default_registry |
@@ -33,25 +34,15 @@ from .planner import ( | ||
| 33 | 34 | ) |
| 34 | 35 | from .prompts import build_system_prompt_result |
| 35 | 36 | from .reasoning import ( |
| 36 | - CONFIDENCE_PROMPT, | |
| 37 | 37 | DECOMPOSITION_PROMPT, |
| 38 | 38 | SELF_CRITIQUE_PROMPT, |
| 39 | - VERIFICATION_PROMPT, | |
| 40 | - ActionVerification, | |
| 41 | - ConfidenceAssessment, | |
| 42 | - ConfidenceLevel, | |
| 43 | 39 | SelfCritique, |
| 44 | 40 | TaskDecomposition, |
| 45 | - estimate_confidence_quick, | |
| 46 | 41 | is_conversational, |
| 47 | - parse_confidence, | |
| 48 | 42 | parse_decomposition, |
| 49 | 43 | parse_self_critique, |
| 50 | - parse_verification, | |
| 51 | - quick_verify, | |
| 52 | 44 | should_decompose, |
| 53 | 45 | ) |
| 54 | -from .recovery import RecoveryContext | |
| 55 | 46 | from .safeguards import RuntimeSafeguards |
| 56 | 47 | |
| 57 | 48 | |
@@ -151,9 +142,6 @@ class Agent: | ||
| 151 | 142 | self.capability_profile = resolve_backend_capability_profile(self.backend) |
| 152 | 143 | self.last_turn_summary: TurnSummary | None = None |
| 153 | 144 | |
| 154 | - # Recovery tracking | |
| 155 | - self._recovery_context: RecoveryContext | None = None | |
| 156 | - | |
| 157 | 145 | # Steering: allow user to send messages during execution |
| 158 | 146 | self._steering_queue: asyncio.Queue[str] = asyncio.Queue() |
| 159 | 147 | self._is_running: bool = False |
@@ -376,12 +364,6 @@ class Agent: | ||
| 376 | 364 | if context is not None: |
| 377 | 365 | context.capability_profile = self.capability_profile |
| 378 | 366 | |
| 379 | - def _get_recovery_context() -> RecoveryContext | None: | |
| 380 | - return self._recovery_context | |
| 381 | - | |
| 382 | - def _set_recovery_context(value: RecoveryContext | None) -> None: | |
| 383 | - self._recovery_context = value | |
| 384 | - | |
| 385 | 367 | context = RuntimeContext( |
| 386 | 368 | project_root=self.project_root, |
| 387 | 369 | backend=self.backend, |
@@ -400,11 +382,8 @@ class Agent: | ||
| 400 | 382 | queue_steering_message=_queue_steering_message, |
| 401 | 383 | set_workflow_mode=_set_workflow_mode, |
| 402 | 384 | refresh_capability_profile=_refresh_capability_profile, |
| 403 | - assess_confidence=self._assess_confidence, | |
| 404 | - verify_action=self._verify_action, | |
| 405 | - get_recovery_context=_get_recovery_context, | |
| 406 | - set_recovery_context=_set_recovery_context, | |
| 407 | 385 | ), |
| 386 | + reasoning=RuntimeReasoningService(self.backend, self.config), | |
| 408 | 387 | prompt_format=self.prompt_format, |
| 409 | 388 | prompt_sections=list(self.prompt_sections), |
| 410 | 389 | ) |
@@ -484,82 +463,6 @@ class Agent: | ||
| 484 | 463 | ) |
| 485 | 464 | return parse_self_critique(critique_response.content, response) |
| 486 | 465 | |
| 487 | - async def _assess_confidence( | |
| 488 | - self, | |
| 489 | - tool_name: str, | |
| 490 | - tool_args: dict, | |
| 491 | - context: str = "", | |
| 492 | - ) -> ConfidenceAssessment: | |
| 493 | - """Assess confidence in a tool action.""" | |
| 494 | - cfg = self.config.reasoning | |
| 495 | - | |
| 496 | - # Try quick heuristic first | |
| 497 | - if cfg.use_quick_confidence: | |
| 498 | - quick_level = estimate_confidence_quick(tool_name, tool_args, context) | |
| 499 | - # Only call LLM if quick estimate is low | |
| 500 | - if quick_level.value >= ConfidenceLevel.MEDIUM.value: | |
| 501 | - return ConfidenceAssessment( | |
| 502 | - action=f"{tool_name} with {tool_args}", | |
| 503 | - tool_name=tool_name, | |
| 504 | - tool_args=tool_args, | |
| 505 | - level=quick_level, | |
| 506 | - reasoning="Quick heuristic assessment", | |
| 507 | - ) | |
| 508 | - | |
| 509 | - # Full LLM assessment | |
| 510 | - action = f"Call {tool_name} with arguments: {tool_args}" | |
| 511 | - prompt = CONFIDENCE_PROMPT.format( | |
| 512 | - action=action, | |
| 513 | - tool_name=tool_name, | |
| 514 | - tool_args=tool_args, | |
| 515 | - context=context[-2000:] if context else "No prior context", | |
| 516 | - ) | |
| 517 | - response = await self.backend.complete( | |
| 518 | - messages=[Message(role=Role.USER, content=prompt)], | |
| 519 | - tools=None, | |
| 520 | - temperature=0.3, | |
| 521 | - max_tokens=300, | |
| 522 | - ) | |
| 523 | - return parse_confidence(response.content, tool_name, tool_args) | |
| 524 | - | |
| 525 | - async def _verify_action( | |
| 526 | - self, | |
| 527 | - tool_name: str, | |
| 528 | - tool_args: dict, | |
| 529 | - result: str, | |
| 530 | - expected: str = "", | |
| 531 | - ) -> ActionVerification: | |
| 532 | - """Verify that an action produced the expected result.""" | |
| 533 | - cfg = self.config.reasoning | |
| 534 | - | |
| 535 | - # Try quick verification first | |
| 536 | - if cfg.use_quick_verification: | |
| 537 | - quick_result = quick_verify(tool_name, tool_args, result) | |
| 538 | - if quick_result: | |
| 539 | - return ActionVerification( | |
| 540 | - tool_name=tool_name, | |
| 541 | - tool_args=tool_args, | |
| 542 | - expected_outcome=expected or "Success", | |
| 543 | - actual_result=result[:500], | |
| 544 | - verified=True, | |
| 545 | - verification_method="quick_heuristic", | |
| 546 | - ) | |
| 547 | - | |
| 548 | - # Full LLM verification | |
| 549 | - prompt = VERIFICATION_PROMPT.format( | |
| 550 | - tool_name=tool_name, | |
| 551 | - tool_args=tool_args, | |
| 552 | - expected=expected or "The action should complete successfully", | |
| 553 | - result=result[:2000], # Truncate long results | |
| 554 | - ) | |
| 555 | - response = await self.backend.complete( | |
| 556 | - messages=[Message(role=Role.USER, content=prompt)], | |
| 557 | - tools=None, | |
| 558 | - temperature=0.3, | |
| 559 | - max_tokens=300, | |
| 560 | - ) | |
| 561 | - return parse_verification(response.content, tool_name, tool_args, expected, result) | |
| 562 | - | |
| 563 | 466 | async def _handle_conversational( |
| 564 | 467 | self, |
| 565 | 468 | user_message: str, |
@@ -1179,7 +1082,6 @@ class Agent: | ||
| 1179 | 1082 | self.prompt_format = None |
| 1180 | 1083 | self.prompt_sections = [] |
| 1181 | 1084 | self.session = self._create_session(messages=self.messages) |
| 1182 | - self._recovery_context = None | |
| 1183 | 1085 | self._current_task = None |
| 1184 | 1086 | self.last_turn_summary = None |
| 1185 | 1087 | self.workflow_mode = WorkflowMode.EXECUTE.value |
src/loader/runtime/action_reasoning.py (added)
@@ -0,0 +1,227 @@ | ||
| 1 | +"""Runtime-owned prompts and heuristics for action reasoning.""" | |
| 2 | + | |
| 3 | +from __future__ import annotations | |
| 4 | + | |
| 5 | +import re | |
| 6 | + | |
| 7 | +from .reasoning_types import ( | |
| 8 | + ActionVerification, | |
| 9 | + ConfidenceAssessment, | |
| 10 | + ConfidenceLevel, | |
| 11 | +) | |
| 12 | + | |
| 13 | +CONFIDENCE_PROMPT = """Rate your confidence in this action before executing. | |
| 14 | + | |
| 15 | +Action: {action} | |
| 16 | +Tool: {tool_name} | |
| 17 | +Arguments: {tool_args} | |
| 18 | + | |
| 19 | +Previous context: | |
| 20 | +{context} | |
| 21 | + | |
| 22 | +Consider: | |
| 23 | +1. Do you have enough information to proceed? | |
| 24 | +2. What could go wrong? | |
| 25 | +3. Is this the right approach? | |
| 26 | +4. Are there better alternatives? | |
| 27 | + | |
| 28 | +Respond in this exact JSON format: | |
| 29 | +{{ | |
| 30 | + "confidence": 1-5, // 1=very low, 2=low, 3=medium, 4=high, 5=very high | |
| 31 | + "reasoning": "Why this confidence level", | |
| 32 | + "risks": ["Risk 1", "Risk 2"], | |
| 33 | + "mitigations": ["How to mitigate risk 1"], | |
| 34 | + "requires_verification": true/false, | |
| 35 | + "alternative_approaches": ["Alternative 1 if confidence is low"] | |
| 36 | +}} | |
| 37 | + | |
| 38 | +Only output the JSON, no other text.""" | |
| 39 | + | |
| 40 | + | |
| 41 | +VERIFICATION_PROMPT = """Verify that the action produced the expected result. | |
| 42 | + | |
| 43 | +Action taken: | |
| 44 | +- Tool: {tool_name} | |
| 45 | +- Arguments: {tool_args} | |
| 46 | + | |
| 47 | +Expected outcome: {expected} | |
| 48 | + | |
| 49 | +Actual result: | |
| 50 | +{result} | |
| 51 | + | |
| 52 | +Analyze: | |
| 53 | +1. Did the action succeed? | |
| 54 | +2. Does the result match expectations? | |
| 55 | +3. Are there any unexpected side effects? | |
| 56 | +4. Is any correction needed? | |
| 57 | + | |
| 58 | +Respond in this exact JSON format: | |
| 59 | +{{ | |
| 60 | + "verified": true/false, | |
| 61 | + "verification_method": "How you verified (e.g., output_contains, no_error, file_created)", | |
| 62 | + "discrepancies": ["Discrepancy 1 if any"], | |
| 63 | + "needs_correction": true/false, | |
| 64 | + "correction_suggestion": "What to do if correction is needed" | |
| 65 | +}} | |
| 66 | + | |
| 67 | +Only output the JSON, no other text.""" | |
| 68 | + | |
| 69 | + | |
| 70 | +def parse_confidence( | |
| 71 | + response: str, | |
| 72 | + tool_name: str, | |
| 73 | + tool_args: dict, | |
| 74 | +) -> ConfidenceAssessment: | |
| 75 | + """Parse LLM response into ConfidenceAssessment.""" | |
| 76 | + | |
| 77 | + import json | |
| 78 | + | |
| 79 | + json_match = re.search(r"\{.*\}", response, re.DOTALL) | |
| 80 | + if not json_match: | |
| 81 | + return ConfidenceAssessment( | |
| 82 | + action="", | |
| 83 | + tool_name=tool_name, | |
| 84 | + tool_args=tool_args, | |
| 85 | + ) | |
| 86 | + | |
| 87 | + try: | |
| 88 | + data = json.loads(json_match.group()) | |
| 89 | + confidence_val = data.get("confidence", 3) | |
| 90 | + level = ConfidenceLevel(max(1, min(5, confidence_val))) | |
| 91 | + | |
| 92 | + return ConfidenceAssessment( | |
| 93 | + action=data.get("action", ""), | |
| 94 | + tool_name=tool_name, | |
| 95 | + tool_args=tool_args, | |
| 96 | + level=level, | |
| 97 | + reasoning=data.get("reasoning", ""), | |
| 98 | + risks=data.get("risks", []), | |
| 99 | + mitigations=data.get("mitigations", []), | |
| 100 | + requires_verification=data.get("requires_verification", level.value <= 3), | |
| 101 | + ) | |
| 102 | + except (json.JSONDecodeError, ValueError): | |
| 103 | + return ConfidenceAssessment( | |
| 104 | + action="", | |
| 105 | + tool_name=tool_name, | |
| 106 | + tool_args=tool_args, | |
| 107 | + ) | |
| 108 | + | |
| 109 | + | |
| 110 | +def parse_verification( | |
| 111 | + response: str, | |
| 112 | + tool_name: str, | |
| 113 | + tool_args: dict, | |
| 114 | + expected: str, | |
| 115 | + result: str, | |
| 116 | +) -> ActionVerification: | |
| 117 | + """Parse LLM response into ActionVerification.""" | |
| 118 | + | |
| 119 | + import json | |
| 120 | + | |
| 121 | + json_match = re.search(r"\{.*\}", response, re.DOTALL) | |
| 122 | + if not json_match: | |
| 123 | + return ActionVerification( | |
| 124 | + tool_name=tool_name, | |
| 125 | + tool_args=tool_args, | |
| 126 | + expected_outcome=expected, | |
| 127 | + actual_result=result, | |
| 128 | + verified="error" not in result.lower(), | |
| 129 | + ) | |
| 130 | + | |
| 131 | + try: | |
| 132 | + data = json.loads(json_match.group()) | |
| 133 | + return ActionVerification( | |
| 134 | + tool_name=tool_name, | |
| 135 | + tool_args=tool_args, | |
| 136 | + expected_outcome=expected, | |
| 137 | + actual_result=result, | |
| 138 | + verified=data.get("verified", False), | |
| 139 | + verification_method=data.get("verification_method", ""), | |
| 140 | + discrepancies=data.get("discrepancies", []), | |
| 141 | + needs_correction=data.get("needs_correction", False), | |
| 142 | + correction_suggestion=data.get("correction_suggestion", ""), | |
| 143 | + ) | |
| 144 | + except json.JSONDecodeError: | |
| 145 | + return ActionVerification( | |
| 146 | + tool_name=tool_name, | |
| 147 | + tool_args=tool_args, | |
| 148 | + expected_outcome=expected, | |
| 149 | + actual_result=result, | |
| 150 | + verified="error" not in result.lower(), | |
| 151 | + ) | |
| 152 | + | |
| 153 | + | |
| 154 | +def estimate_confidence_quick( | |
| 155 | + tool_name: str, | |
| 156 | + tool_args: dict, | |
| 157 | + context: str = "", | |
| 158 | +) -> ConfidenceLevel: | |
| 159 | + """Estimate action confidence with heuristics before an LLM call.""" | |
| 160 | + | |
| 161 | + del context | |
| 162 | + | |
| 163 | + if tool_name in {"read", "glob", "grep", "git"}: | |
| 164 | + return ConfidenceLevel.HIGH | |
| 165 | + | |
| 166 | + if tool_name == "write": | |
| 167 | + file_path = tool_args.get("file_path", "") | |
| 168 | + if not file_path: | |
| 169 | + return ConfidenceLevel.LOW | |
| 170 | + return ConfidenceLevel.MEDIUM | |
| 171 | + | |
| 172 | + if tool_name == "edit": | |
| 173 | + old_string = tool_args.get("old_string", "") | |
| 174 | + if not old_string: | |
| 175 | + return ConfidenceLevel.LOW | |
| 176 | + return ConfidenceLevel.MEDIUM | |
| 177 | + | |
| 178 | + if tool_name == "patch": | |
| 179 | + hunks = tool_args.get("hunks", []) | |
| 180 | + if not hunks: | |
| 181 | + return ConfidenceLevel.LOW | |
| 182 | + return ConfidenceLevel.MEDIUM | |
| 183 | + | |
| 184 | + if tool_name == "bash": | |
| 185 | + command = tool_args.get("command", "") | |
| 186 | + dangerous = ["rm -rf", "sudo", "chmod 777", "dd if=", "> /dev/"] | |
| 187 | + if any(pattern in command for pattern in dangerous): | |
| 188 | + return ConfidenceLevel.VERY_LOW | |
| 189 | + safe_read = ["ls", "cat", "grep", "find", "pwd", "echo", "head", "tail"] | |
| 190 | + if any(command.strip().startswith(prefix) for prefix in safe_read): | |
| 191 | + return ConfidenceLevel.HIGH | |
| 192 | + return ConfidenceLevel.MEDIUM | |
| 193 | + | |
| 194 | + return ConfidenceLevel.MEDIUM | |
| 195 | + | |
| 196 | + | |
| 197 | +def quick_verify(tool_name: str, tool_args: dict, result: str) -> bool: | |
| 198 | + """Estimate whether a tool action succeeded without an LLM call.""" | |
| 199 | + | |
| 200 | + del tool_args | |
| 201 | + | |
| 202 | + result_lower = result.lower() | |
| 203 | + error_indicators = [ | |
| 204 | + "error:", | |
| 205 | + "failed", | |
| 206 | + "not found", | |
| 207 | + "permission denied", | |
| 208 | + "no such file", | |
| 209 | + "command not found", | |
| 210 | + "exception", | |
| 211 | + "traceback", | |
| 212 | + "fatal:", | |
| 213 | + "cannot", | |
| 214 | + ] | |
| 215 | + if any(indicator in result_lower for indicator in error_indicators): | |
| 216 | + return False | |
| 217 | + | |
| 218 | + if tool_name == "write": | |
| 219 | + return "created" in result_lower or "wrote" in result_lower or len(result) < 200 | |
| 220 | + if tool_name in {"edit", "patch"}: | |
| 221 | + return "edited" in result_lower or "patched" in result_lower or "+" in result or "-" in result | |
| 222 | + if tool_name in {"read", "git"}: | |
| 223 | + return len(result.strip()) > 0 | |
| 224 | + if tool_name == "bash": | |
| 225 | + return True | |
| 226 | + | |
| 227 | + return True | |
src/loader/runtime/context.py (modified)
@@ -2,7 +2,7 @@ | ||
| 2 | 2 | |
| 3 | 3 | from __future__ import annotations |
| 4 | 4 | |
| 5 | -from collections.abc import Awaitable, Callable | |
| 5 | +from collections.abc import Callable | |
| 6 | 6 | from dataclasses import dataclass, field |
| 7 | 7 | from pathlib import Path |
| 8 | 8 | from typing import Any, Protocol |
@@ -74,6 +74,28 @@ class RuntimeSafeguardsProtocol(Protocol): | ||
| 74 | 74 | def record_response(self, content: str) -> None: |
| 75 | 75 | """Record a completed assistant response for safeguard bookkeeping.""" |
| 76 | 76 | |
| 77 | + | |
| 78 | +class RuntimeReasoningServiceProtocol(Protocol): | |
| 79 | + """Typed action-reasoning surface the runtime can rely on.""" | |
| 80 | + | |
| 81 | + async def assess_confidence( | |
| 82 | + self, | |
| 83 | + tool_name: str, | |
| 84 | + tool_args: dict[str, Any], | |
| 85 | + context: str = "", | |
| 86 | + ) -> ConfidenceAssessment: | |
| 87 | + """Assess confidence in a planned tool action.""" | |
| 88 | + | |
| 89 | + async def verify_action( | |
| 90 | + self, | |
| 91 | + tool_name: str, | |
| 92 | + tool_args: dict[str, Any], | |
| 93 | + result: str, | |
| 94 | + expected: str = "", | |
| 95 | + ) -> ActionVerification: | |
| 96 | + """Verify that a tool action produced the desired result.""" | |
| 97 | + | |
| 98 | + | |
| 77 | 99 | @dataclass(slots=True) |
| 78 | 100 | class RuntimeLegacyServices: |
| 79 | 101 | """Explicit migration seams for legacy agent-owned behavior.""" |
@@ -83,10 +105,6 @@ class RuntimeLegacyServices: | ||
| 83 | 105 | queue_steering_message: Callable[[str], None] |
| 84 | 106 | set_workflow_mode: Callable[[str], None] |
| 85 | 107 | refresh_capability_profile: Callable[[], None] |
| 86 | - assess_confidence: Callable[[str, dict[str, Any], str], Awaitable[ConfidenceAssessment]] | |
| 87 | - verify_action: Callable[[str, dict[str, Any], str, str], Awaitable[ActionVerification]] | |
| 88 | - get_recovery_context: Callable[[], RecoveryContext | None] | |
| 89 | - set_recovery_context: Callable[[RecoveryContext | None], None] | |
| 90 | 108 | |
| 91 | 109 | |
| 92 | 110 | @dataclass(slots=True) |
@@ -105,6 +123,8 @@ class RuntimeContext: | ||
| 105 | 123 | workflow_mode: str |
| 106 | 124 | safeguards: RuntimeSafeguardsProtocol |
| 107 | 125 | legacy: RuntimeLegacyServices |
| 126 | + reasoning: RuntimeReasoningServiceProtocol | None = None | |
| 127 | + recovery_context: RecoveryContext | None = None | |
| 108 | 128 | prompt_format: str | None = None |
| 109 | 129 | prompt_sections: list[str] = field(default_factory=list) |
| 110 | 130 | |
@@ -131,3 +151,28 @@ class RuntimeContext: | ||
| 131 | 151 | """Return rule counts for the active permission policy.""" |
| 132 | 152 | |
| 133 | 153 | return self.permission_policy.rule_counts() |
| 154 | + | |
| 155 | + async def assess_confidence( | |
| 156 | + self, | |
| 157 | + tool_name: str, | |
| 158 | + tool_args: dict[str, Any], | |
| 159 | + context: str = "", | |
| 160 | + ) -> ConfidenceAssessment: | |
| 161 | + """Assess confidence using the primary runtime reasoning service.""" | |
| 162 | + | |
| 163 | + if self.reasoning is None: | |
| 164 | + raise RuntimeError("RuntimeContext.reasoning is required for confidence checks") | |
| 165 | + return await self.reasoning.assess_confidence(tool_name, tool_args, context) | |
| 166 | + | |
| 167 | + async def verify_action( | |
| 168 | + self, | |
| 169 | + tool_name: str, | |
| 170 | + tool_args: dict[str, Any], | |
| 171 | + result: str, | |
| 172 | + expected: str = "", | |
| 173 | + ) -> ActionVerification: | |
| 174 | + """Verify a tool action using the primary runtime reasoning service.""" | |
| 175 | + | |
| 176 | + if self.reasoning is None: | |
| 177 | + raise RuntimeError("RuntimeContext.reasoning is required for verification checks") | |
| 178 | + return await self.reasoning.verify_action(tool_name, tool_args, result, expected) | |
src/loader/runtime/reasoning_service.py (added)
@@ -0,0 +1,100 @@ | ||
| 1 | +"""Runtime-owned confidence and verification services.""" | |
| 2 | + | |
| 3 | +from __future__ import annotations | |
| 4 | + | |
| 5 | +from typing import Any | |
| 6 | + | |
| 7 | +from ..llm.base import LLMBackend, Message, Role | |
| 8 | +from .action_reasoning import ( | |
| 9 | + CONFIDENCE_PROMPT, | |
| 10 | + VERIFICATION_PROMPT, | |
| 11 | + estimate_confidence_quick, | |
| 12 | + parse_confidence, | |
| 13 | + parse_verification, | |
| 14 | + quick_verify, | |
| 15 | +) | |
| 16 | +from .reasoning_types import ( | |
| 17 | + ActionVerification, | |
| 18 | + ConfidenceAssessment, | |
| 19 | + ConfidenceLevel, | |
| 20 | +) | |
| 21 | + | |
| 22 | + | |
| 23 | +class RuntimeReasoningService: | |
| 24 | + """Provide confidence scoring and verification without Agent callbacks.""" | |
| 25 | + | |
| 26 | + def __init__(self, backend: LLMBackend, config: Any) -> None: | |
| 27 | + self.backend = backend | |
| 28 | + self.config = config | |
| 29 | + | |
| 30 | + async def assess_confidence( | |
| 31 | + self, | |
| 32 | + tool_name: str, | |
| 33 | + tool_args: dict[str, Any], | |
| 34 | + context: str = "", | |
| 35 | + ) -> ConfidenceAssessment: | |
| 36 | + """Assess confidence in a planned tool action.""" | |
| 37 | + | |
| 38 | + cfg = self.config.reasoning | |
| 39 | + | |
| 40 | + if getattr(cfg, "use_quick_confidence", True): | |
| 41 | + quick_level = estimate_confidence_quick(tool_name, tool_args, context) | |
| 42 | + if quick_level.value >= ConfidenceLevel.MEDIUM.value: | |
| 43 | + return ConfidenceAssessment( | |
| 44 | + action=f"{tool_name} with {tool_args}", | |
| 45 | + tool_name=tool_name, | |
| 46 | + tool_args=tool_args, | |
| 47 | + level=quick_level, | |
| 48 | + reasoning="Quick heuristic assessment", | |
| 49 | + ) | |
| 50 | + | |
| 51 | + action = f"Call {tool_name} with arguments: {tool_args}" | |
| 52 | + prompt = CONFIDENCE_PROMPT.format( | |
| 53 | + action=action, | |
| 54 | + tool_name=tool_name, | |
| 55 | + tool_args=tool_args, | |
| 56 | + context=context[-2000:] if context else "No prior context", | |
| 57 | + ) | |
| 58 | + response = await self.backend.complete( | |
| 59 | + messages=[Message(role=Role.USER, content=prompt)], | |
| 60 | + tools=None, | |
| 61 | + temperature=0.3, | |
| 62 | + max_tokens=300, | |
| 63 | + ) | |
| 64 | + return parse_confidence(response.content, tool_name, tool_args) | |
| 65 | + | |
| 66 | + async def verify_action( | |
| 67 | + self, | |
| 68 | + tool_name: str, | |
| 69 | + tool_args: dict[str, Any], | |
| 70 | + result: str, | |
| 71 | + expected: str = "", | |
| 72 | + ) -> ActionVerification: | |
| 73 | + """Verify that a completed tool action achieved its goal.""" | |
| 74 | + | |
| 75 | + cfg = self.config.reasoning | |
| 76 | + | |
| 77 | + if getattr(cfg, "use_quick_verification", True): | |
| 78 | + if quick_verify(tool_name, tool_args, result): | |
| 79 | + return ActionVerification( | |
| 80 | + tool_name=tool_name, | |
| 81 | + tool_args=tool_args, | |
| 82 | + expected_outcome=expected or "Success", | |
| 83 | + actual_result=result[:500], | |
| 84 | + verified=True, | |
| 85 | + verification_method="quick_heuristic", | |
| 86 | + ) | |
| 87 | + | |
| 88 | + prompt = VERIFICATION_PROMPT.format( | |
| 89 | + tool_name=tool_name, | |
| 90 | + tool_args=tool_args, | |
| 91 | + expected=expected or "The action should complete successfully", | |
| 92 | + result=result[:2000], | |
| 93 | + ) | |
| 94 | + response = await self.backend.complete( | |
| 95 | + messages=[Message(role=Role.USER, content=prompt)], | |
| 96 | + tools=None, | |
| 97 | + temperature=0.3, | |
| 98 | + max_tokens=300, | |
| 99 | + ) | |
| 100 | + return parse_verification(response.content, tool_name, tool_args, expected, result) | |
src/loader/runtime/tool_batches.py (modified)
@@ -181,7 +181,7 @@ class ToolBatchRunner: | ||
| 181 | 181 | for message in self.context.messages[-5:] |
| 182 | 182 | if message.content |
| 183 | 183 | ) |
| 184 | - confidence = await self.context.legacy.assess_confidence( | |
| 184 | + confidence = await self.context.assess_confidence( | |
| 185 | 185 | tool_call.name, |
| 186 | 186 | tool_call.arguments, |
| 187 | 187 | context, |
@@ -224,7 +224,7 @@ class ToolBatchRunner: | ||
| 224 | 224 | if isinstance(new_todos, list): |
| 225 | 225 | sync_todos_to_definition_of_done(dod, new_todos) |
| 226 | 226 | self.dod_store.save(dod) |
| 227 | - self.context.legacy.set_recovery_context(None) | |
| 227 | + self.context.recovery_context = None | |
| 228 | 228 | return None |
| 229 | 229 | |
| 230 | 230 | async def _run_post_tool_verification( |
@@ -244,7 +244,7 @@ class ToolBatchRunner: | ||
| 244 | 244 | ): |
| 245 | 245 | return False |
| 246 | 246 | |
| 247 | - verification = await self.context.legacy.verify_action( | |
| 247 | + verification = await self.context.verify_action( | |
| 248 | 248 | tool_call.name, |
| 249 | 249 | tool_call.arguments, |
| 250 | 250 | outcome.result_output, |
@@ -277,14 +277,14 @@ class ToolBatchRunner: | ||
| 277 | 277 | ) -> Message | None: |
| 278 | 278 | """Generate a recovery follow-up after an executed tool failure.""" |
| 279 | 279 | |
| 280 | - recovery_context = self.context.legacy.get_recovery_context() | |
| 280 | + recovery_context = self.context.recovery_context | |
| 281 | 281 | if recovery_context is None: |
| 282 | 282 | recovery_context = RecoveryContext( |
| 283 | 283 | original_tool=tool_call.name, |
| 284 | 284 | original_args=tool_call.arguments, |
| 285 | 285 | max_retries=self.context.config.max_recovery_attempts, |
| 286 | 286 | ) |
| 287 | - self.context.legacy.set_recovery_context(recovery_context) | |
| 287 | + self.context.recovery_context = recovery_context | |
| 288 | 288 | |
| 289 | 289 | if recovery_context.is_similar_attempt( |
| 290 | 290 | tool_call.name, |
@@ -341,7 +341,7 @@ class ToolBatchRunner: | ||
| 341 | 341 | tool_name=tool_call.name, |
| 342 | 342 | ) |
| 343 | 343 | ) |
| 344 | - self.context.legacy.set_recovery_context(None) | |
| 344 | + self.context.recovery_context = None | |
| 345 | 345 | return Message.tool_result_message( |
| 346 | 346 | tool_call_id=tool_call.id, |
| 347 | 347 | display_content=(f"Observation [{tool_call.name}]: Error: {failure_message}"), |
tests/test_assistant_turns.py (modified)
@@ -137,10 +137,6 @@ def build_runtime_context( | ||
| 137 | 137 | queue_steering_message=queued_messages.append, |
| 138 | 138 | set_workflow_mode=lambda mode: None, |
| 139 | 139 | refresh_capability_profile=lambda: None, |
| 140 | - assess_confidence=lambda tool_name, tool_args, context: None, # type: ignore[arg-type] | |
| 141 | - verify_action=lambda tool_name, tool_args, result, expected: None, # type: ignore[arg-type] | |
| 142 | - get_recovery_context=lambda: None, | |
| 143 | - set_recovery_context=lambda value: None, | |
| 144 | 140 | ), |
| 145 | 141 | ) |
| 146 | 142 | return context, queued_messages |
tests/test_finalization.py (modified)
@@ -122,10 +122,6 @@ def build_context(temp_dir: Path, session: FakeSession) -> RuntimeContext: | ||
| 122 | 122 | queue_steering_message=lambda message: None, |
| 123 | 123 | set_workflow_mode=lambda mode: None, |
| 124 | 124 | refresh_capability_profile=lambda: None, |
| 125 | - assess_confidence=lambda tool_name, tool_args, context: None, # type: ignore[arg-type] | |
| 126 | - verify_action=lambda tool_name, tool_args, result, expected: None, # type: ignore[arg-type] | |
| 127 | - get_recovery_context=lambda: None, | |
| 128 | - set_recovery_context=lambda value: None, | |
| 129 | 125 | ), |
| 130 | 126 | ) |
| 131 | 127 | |
tests/test_repair.py (modified)
@@ -91,10 +91,6 @@ def build_context( | ||
| 91 | 91 | queue_steering_message=lambda message: None, |
| 92 | 92 | set_workflow_mode=lambda mode: None, |
| 93 | 93 | refresh_capability_profile=lambda: None, |
| 94 | - assess_confidence=lambda tool_name, tool_args, context: None, # type: ignore[arg-type] | |
| 95 | - verify_action=lambda tool_name, tool_args, result, expected: None, # type: ignore[arg-type] | |
| 96 | - get_recovery_context=lambda: None, | |
| 97 | - set_recovery_context=lambda value: None, | |
| 98 | 94 | ), |
| 99 | 95 | ) |
| 100 | 96 | |
tests/test_runtime_context.py (modified)
@@ -36,6 +36,7 @@ def test_agent_builds_typed_runtime_context(temp_dir: Path) -> None: | ||
| 36 | 36 | assert context.use_react == agent.use_react |
| 37 | 37 | assert context.active_permission_mode == agent.active_permission_mode |
| 38 | 38 | assert context.active_permission_rule_counts == agent.active_permission_rule_counts |
| 39 | + assert context.reasoning is not None | |
| 39 | 40 | assert context.legacy.message_history() is agent.messages |
| 40 | 41 | |
| 41 | 42 | |
@@ -56,9 +57,8 @@ def test_runtime_context_legacy_services_stay_in_sync(temp_dir: Path) -> None: | ||
| 56 | 57 | assert context.workflow_mode == "clarify" |
| 57 | 58 | |
| 58 | 59 | recovery = RecoveryContext(original_tool="read", original_args={"file_path": "README.md"}) |
| 59 | - context.legacy.set_recovery_context(recovery) | |
| 60 | - assert agent._recovery_context is recovery | |
| 61 | - assert context.legacy.get_recovery_context() is recovery | |
| 60 | + context.recovery_context = recovery | |
| 61 | + assert context.recovery_context is recovery | |
| 62 | 62 | |
| 63 | 63 | context.legacy.refresh_capability_profile() |
| 64 | 64 | assert context.capability_profile == agent.capability_profile |
tests/test_tool_batches.py (modified)
@@ -2,7 +2,6 @@ | ||
| 2 | 2 | |
| 3 | 3 | from __future__ import annotations |
| 4 | 4 | |
| 5 | -from dataclasses import dataclass | |
| 6 | 5 | from pathlib import Path |
| 7 | 6 | from types import SimpleNamespace |
| 8 | 7 | |
@@ -18,14 +17,13 @@ from loader.runtime.permissions import ( | ||
| 18 | 17 | build_permission_policy, |
| 19 | 18 | load_permission_rules, |
| 20 | 19 | ) |
| 21 | -from loader.runtime.recovery import RecoveryContext | |
| 22 | 20 | from loader.runtime.reasoning_types import ( |
| 23 | 21 | ActionVerification, |
| 24 | 22 | ConfidenceAssessment, |
| 25 | 23 | ConfidenceLevel, |
| 26 | 24 | ) |
| 25 | +from loader.runtime.recovery import RecoveryContext | |
| 27 | 26 | from loader.runtime.tool_batches import ToolBatchRunner |
| 28 | -from loader.runtime.tracing import RuntimeTracer | |
| 29 | 27 | from loader.tools.base import ToolResult as RegistryToolResult |
| 30 | 28 | from loader.tools.base import create_default_registry |
| 31 | 29 | from tests.helpers.runtime_harness import ScriptedBackend |
@@ -97,7 +95,7 @@ def build_context( | ||
| 97 | 95 | verification: bool = False, |
| 98 | 96 | auto_recover: bool = True, |
| 99 | 97 | min_confidence_for_action: int = 3, |
| 100 | -) -> tuple[RuntimeContext, dict[str, RecoveryContext | None]]: | |
| 98 | +) -> RuntimeContext: | |
| 101 | 99 | registry = create_default_registry(temp_dir) |
| 102 | 100 | registry.configure_workspace_root(temp_dir) |
| 103 | 101 | rule_status = load_permission_rules(temp_dir) |
@@ -107,7 +105,6 @@ def build_context( | ||
| 107 | 105 | tool_requirements=registry.get_tool_requirements(), |
| 108 | 106 | rules=rule_status.rules, |
| 109 | 107 | ) |
| 110 | - recovery_holder = {"value": recovery_context} | |
| 111 | 108 | context = RuntimeContext( |
| 112 | 109 | project_root=temp_dir, |
| 113 | 110 | backend=ScriptedBackend(), |
@@ -140,13 +137,14 @@ def build_context( | ||
| 140 | 137 | queue_steering_message=lambda message: None, |
| 141 | 138 | set_workflow_mode=lambda mode: None, |
| 142 | 139 | refresh_capability_profile=lambda: None, |
| 140 | + ), | |
| 141 | + reasoning=SimpleNamespace( | |
| 143 | 142 | assess_confidence=assess_confidence, |
| 144 | 143 | verify_action=verify_action, |
| 145 | - get_recovery_context=lambda: recovery_holder["value"], | |
| 146 | - set_recovery_context=lambda value: recovery_holder.__setitem__("value", value), | |
| 147 | 144 | ), |
| 145 | + recovery_context=recovery_context, | |
| 148 | 146 | ) |
| 149 | - return context, recovery_holder | |
| 147 | + return context | |
| 150 | 148 | |
| 151 | 149 | |
| 152 | 150 | def tool_outcome( |
@@ -189,7 +187,7 @@ async def test_tool_batch_runner_uses_context_for_confidence_gate(temp_dir: Path | ||
| 189 | 187 | async def verify_action(tool_name: str, tool_args: dict, result: str, expected: str = "") -> ActionVerification: |
| 190 | 188 | raise AssertionError("Verification should not run for skipped actions") |
| 191 | 189 | |
| 192 | - context, _ = build_context( | |
| 190 | + context = build_context( | |
| 193 | 191 | temp_dir=temp_dir, |
| 194 | 192 | messages=[ |
| 195 | 193 | Message(role=Role.USER, content="Please inspect the project."), |
@@ -239,7 +237,7 @@ async def test_tool_batch_runner_tracks_recovery_with_legacy_context(temp_dir: P | ||
| 239 | 237 | async def verify_action(tool_name: str, tool_args: dict, result: str, expected: str = "") -> ActionVerification: |
| 240 | 238 | raise AssertionError("Verification should not run for failed actions") |
| 241 | 239 | |
| 242 | - context, recovery_holder = build_context( | |
| 240 | + context = build_context( | |
| 243 | 241 | temp_dir=temp_dir, |
| 244 | 242 | messages=[], |
| 245 | 243 | safeguards=FakeSafeguards(), |
@@ -270,7 +268,7 @@ async def test_tool_batch_runner_tracks_recovery_with_legacy_context(temp_dir: P | ||
| 270 | 268 | consecutive_errors=0, |
| 271 | 269 | ) |
| 272 | 270 | |
| 273 | - assert recovery_holder["value"] is not None | |
| 271 | + assert context.recovery_context is not None | |
| 274 | 272 | assert summary.tool_result_messages |
| 275 | 273 | assert context.session.messages[-1] == summary.tool_result_messages[-1] |
| 276 | 274 | assert any(event.type == "recovery" for event in events) |
@@ -300,7 +298,7 @@ async def test_tool_batch_runner_verifies_with_context_services(temp_dir: Path) | ||
| 300 | 298 | original_tool="edit", |
| 301 | 299 | original_args={"file_path": "README.md"}, |
| 302 | 300 | ) |
| 303 | - context, recovery_holder = build_context( | |
| 301 | + context = build_context( | |
| 304 | 302 | temp_dir=temp_dir, |
| 305 | 303 | messages=[], |
| 306 | 304 | safeguards=FakeSafeguards(), |
@@ -332,7 +330,7 @@ async def test_tool_batch_runner_verifies_with_context_services(temp_dir: Path) | ||
| 332 | 330 | ) |
| 333 | 331 | |
| 334 | 332 | assert verification_calls == ["file contents"] |
| 335 | - assert recovery_holder["value"] is None | |
| 333 | + assert context.recovery_context is None | |
| 336 | 334 | assert context.session.messages[-1].role == Role.TOOL |
| 337 | 335 | assert context.session.messages[-1].content == "file contents" |
| 338 | 336 | assert any(event.type == "verification" for event in events) |