Refactor Loader into a typed turn runtime
- SHA
3c5be8521db3e1a409a1d619ae6c3e45bb0f2c26- Parents
-
51c2c2e - Tree
4283571
3c5be85
3c5be8521db3e1a409a1d619ae6c3e45bb0f2c2651c2c2e
4283571| Status | File | + | - |
|---|---|---|---|
| M |
src/loader/agent/loop.py
|
32 | 939 |
| M |
src/loader/llm/base.py
|
29 | 1 |
| M |
src/loader/runtime/__init__.py
|
6 | 1 |
| M |
src/loader/runtime/capabilities.py
|
40 | 1 |
| A |
src/loader/runtime/conversation.py
|
705 | 0 |
| A |
src/loader/runtime/executor.py
|
217 | 0 |
src/loader/agent/loop.pymodified@@ -8,6 +8,10 @@ from typing import AsyncIterator, Awaitable, Callable | ||
| 8 | 8 | from ..llm.base import LLMBackend, Message, Role, ToolCall |
| 9 | 9 | from ..tools.base import ToolRegistry, create_default_registry, ConfirmationRequired |
| 10 | 10 | from ..context.project import ProjectContext, detect_project |
| 11 | +from ..runtime.capabilities import resolve_backend_capability_profile | |
| 12 | +from ..runtime.conversation import ConversationRuntime | |
| 13 | +from ..runtime.events import AgentEvent, TurnSummary | |
| 14 | +from ..runtime.session import ConversationSession | |
| 11 | 15 | from .prompts import build_system_prompt |
| 12 | 16 | from .parsing import parse_tool_calls, format_tool_result |
| 13 | 17 | from .planner import Plan, parse_plan, should_plan, format_step_prompt, PLANNING_PROMPT, SHOULD_PLAN_PROMPT |
@@ -45,7 +49,7 @@ from .reasoning import ( | ||
| 45 | 49 | estimate_complexity, |
| 46 | 50 | get_token_budget, |
| 47 | 51 | ) |
| 48 | -from .safeguards import RuntimeSafeguards, ValidationResult | |
| 52 | +from .safeguards import RuntimeSafeguards | |
| 49 | 53 | |
| 50 | 54 | |
| 51 | 55 | @dataclass |
@@ -99,34 +103,6 @@ class AgentConfig: | ||
| 99 | 103 | self.reasoning = ReasoningConfig() |
| 100 | 104 | |
| 101 | 105 | |
| 102 | -@dataclass | |
| 103 | -class AgentEvent: | |
| 104 | - """Event emitted during agent execution.""" | |
| 105 | - # Event types: thinking, tool_call, tool_result, response, error, plan, step, | |
| 106 | - # recovery, stream, confirmation, steering, decomposition, subtask, critique, | |
| 107 | - # confidence, verification | |
| 108 | - type: str | |
| 109 | - content: str = "" | |
| 110 | - tool_name: str | None = None | |
| 111 | - tool_args: dict | None = None | |
| 112 | - step_info: str | None = None # For step progress like "[2/5] Doing X" | |
| 113 | - recovery_attempt: int | None = None # For recovery events | |
| 114 | - is_stream_end: bool = False # For stream events - indicates final chunk | |
| 115 | - confirm_message: str | None = None # For confirmation events | |
| 116 | - confirm_details: str | None = None # For confirmation events | |
| 117 | - is_error: bool = False # For tool_result events | |
| 118 | - | |
| 119 | - # Reasoning events | |
| 120 | - decomposition: TaskDecomposition | None = None # For decomposition events | |
| 121 | - subtask: Subtask | None = None # For subtask events | |
| 122 | - critique: SelfCritique | None = None # For critique events | |
| 123 | - confidence: ConfidenceAssessment | None = None # For confidence events | |
| 124 | - verification: ActionVerification | None = None # For verification events | |
| 125 | - completion_check: TaskCompletionCheck | None = None # For completion events | |
| 126 | - rollback_plan: RollbackPlan | None = None # For rollback events | |
| 127 | - rollback_action: RollbackAction | None = None # For individual rollback action | |
| 128 | - | |
| 129 | - | |
| 130 | 106 | class Agent: |
| 131 | 107 | """The main agent that orchestrates the LLM and tools.""" |
| 132 | 108 | |
@@ -141,8 +117,15 @@ class Agent: | ||
| 141 | 117 | self.registry = registry or create_default_registry() |
| 142 | 118 | self.config = config or AgentConfig() |
| 143 | 119 | self.messages: list[Message] = [] |
| 120 | + self.session = ConversationSession( | |
| 121 | + system_message_factory=self._get_system_message, | |
| 122 | + few_shot_factory=self._get_few_shot_examples, | |
| 123 | + messages=self.messages, | |
| 124 | + ) | |
| 144 | 125 | self._system_message: Message | None = None |
| 145 | 126 | self._use_react: bool | None = None |
| 127 | + self.capability_profile = resolve_backend_capability_profile(self.backend) | |
| 128 | + self.last_turn_summary: TurnSummary | None = None | |
| 146 | 129 | |
| 147 | 130 | # Recovery tracking |
| 148 | 131 | self._recovery_context: RecoveryContext | None = None |
@@ -199,19 +182,7 @@ class Agent: | ||
| 199 | 182 | self._use_react = True |
| 200 | 183 | return True |
| 201 | 184 | |
| 202 | - # Check if backend supports native tools | |
| 203 | - if hasattr(self.backend, "supports_native_tools"): | |
| 204 | - supports_native = self.backend.supports_native_tools() | |
| 205 | - self._use_react = not supports_native | |
| 206 | - # Debug log | |
| 207 | - try: | |
| 208 | - with open("/tmp/loader_debug.log", "a") as f: | |
| 209 | - f.write(f"[loop] use_react: supports_native={supports_native}, use_react={self._use_react}\n") | |
| 210 | - except Exception: | |
| 211 | - pass | |
| 212 | - else: | |
| 213 | - # Default to ReAct for unknown backends | |
| 214 | - self._use_react = True | |
| 185 | + self._use_react = not self.capability_profile.supports_native_tools | |
| 215 | 186 | |
| 216 | 187 | return self._use_react |
| 217 | 188 | |
@@ -234,14 +205,13 @@ class Agent: | ||
| 234 | 205 | |
| 235 | 206 | def _build_messages(self) -> list[Message]: |
| 236 | 207 | """Build the full message list for the LLM.""" |
| 237 | - messages = [self._get_system_message()] | |
| 208 | + return self.session.build_request_messages() | |
| 238 | 209 | |
| 239 | - # Add few-shot examples if this is a fresh conversation | |
| 240 | - if len(self.messages) <= 2: # User message + maybe prefill | |
| 241 | - messages.extend(self._get_few_shot_examples()) | |
| 210 | + def refresh_capability_profile(self) -> None: | |
| 211 | + """Refresh the runtime capability profile from the current backend.""" | |
| 242 | 212 | |
| 243 | - messages.extend(self.messages) | |
| 244 | - return messages | |
| 213 | + self.capability_profile = resolve_backend_capability_profile(self.backend) | |
| 214 | + self._use_react = None | |
| 245 | 215 | |
| 246 | 216 | def _get_few_shot_examples(self) -> list[Message]: |
| 247 | 217 | """Get few-shot examples demonstrating proper tool use.""" |
@@ -612,897 +582,15 @@ class Agent: | ||
| 612 | 582 | original_task: str | None = None, |
| 613 | 583 | ) -> str: |
| 614 | 584 | """Inner execution loop without planning.""" |
| 615 | - iterations = 0 | |
| 616 | - final_response = "" | |
| 617 | - actions_taken: list[str] = [] # Track what we've done | |
| 618 | - continuation_count = 0 # How many times we've nudged to continue | |
| 619 | - empty_retry_count = 0 # How many times we've retried on empty response | |
| 620 | - MAX_EMPTY_RETRIES = 5 # More retries before giving up - small models need patience | |
| 621 | - extracted_iterations = 0 # How many times we've extracted bracket-format tool calls | |
| 622 | - MAX_EXTRACTED_ITERATIONS = 3 # Limit extracted tool call loops | |
| 623 | - consecutive_errors = 0 # Track consecutive tool errors | |
| 624 | - | |
| 625 | - # Adaptive token budgeting based on task complexity | |
| 626 | - complexity = estimate_complexity(task) | |
| 627 | - max_tokens, _ = get_token_budget(complexity) | |
| 628 | - # Use configured max_tokens as ceiling, complexity as floor | |
| 629 | - effective_max_tokens = min(self.config.max_tokens, max(max_tokens, 512)) | |
| 630 | - | |
| 631 | - # Rollback planning | |
| 632 | - rollback_plan = RollbackPlan() if self.config.reasoning.rollback else None | |
| 633 | - | |
| 634 | - while iterations < self.config.max_iterations: | |
| 635 | - iterations += 1 | |
| 636 | 585 | |
| 637 | - # On first iteration, add assistant prefilling to guide tool use | |
| 638 | - if iterations == 1 and len(self.messages) == 1: # Just the user's message | |
| 639 | - # Check if task looks like it needs immediate action | |
| 640 | - task_lower = task.lower() | |
| 641 | - action_keywords = ['create', 'write', 'make', 'run', 'execute', 'build', 'install', 'delete', 'remove', 'add', 'edit', 'modify', 'update', 'fix'] | |
| 642 | - if any(kw in task_lower for kw in action_keywords): | |
| 643 | - # Prime with partial assistant response - start of tool call | |
| 644 | - self.messages.append(Message( | |
| 645 | - role=Role.ASSISTANT, | |
| 646 | - content="[", | |
| 647 | - )) | |
| 648 | - try: | |
| 649 | - with open("/tmp/loader_debug.log", "a") as f: | |
| 650 | - f.write(f"[loop] Added assistant prefill '[' for action task\n") | |
| 651 | - except Exception: | |
| 652 | - pass | |
| 653 | - | |
| 654 | - # Check for steering messages from user | |
| 655 | - steering_messages = self._drain_steering_queue() | |
| 656 | - for steer_msg in steering_messages: | |
| 657 | - await emit(AgentEvent(type="steering", content=steer_msg)) | |
| 658 | - self.messages.append(Message( | |
| 659 | - role=Role.USER, | |
| 660 | - content=f"[USER INTERRUPTION]: {steer_msg}", | |
| 661 | - )) | |
| 662 | - | |
| 663 | - # Get completion from LLM | |
| 664 | - await emit(AgentEvent(type="thinking")) | |
| 665 | - | |
| 666 | - # Reset code block filter state for this LLM call | |
| 667 | - self.safeguards.code_filter.reset() | |
| 668 | - | |
| 669 | - # Pass tools only for native tool calling | |
| 670 | - tools = None if self.use_react else self.registry.get_schemas() | |
| 671 | - | |
| 672 | - # Use streaming or regular completion | |
| 673 | - pending_tool_calls_seen: set[str] = set() # Track IDs of pending tool calls shown | |
| 674 | - if self.config.stream: | |
| 675 | - full_content = "" | |
| 676 | - full_content_unfiltered = "" # Keep original for history | |
| 677 | - tool_calls: list[ToolCall] = [] | |
| 678 | - | |
| 679 | - async for chunk in self.backend.stream( | |
| 680 | - messages=self._build_messages(), | |
| 681 | - tools=tools, | |
| 682 | - temperature=self.config.temperature, | |
| 683 | - max_tokens=effective_max_tokens, | |
| 684 | - ): | |
| 685 | - # Filter content through safeguards (removes code blocks) | |
| 686 | - filtered_content = "" | |
| 687 | - if chunk.content: | |
| 688 | - filtered_content = self.safeguards.filter_stream_chunk(chunk.content) | |
| 689 | - full_content_unfiltered += chunk.content | |
| 690 | - | |
| 691 | - # Emit stream events for filtered content OR for final chunk (to signal end) | |
| 692 | - if filtered_content or chunk.is_done: | |
| 693 | - await emit(AgentEvent( | |
| 694 | - type="stream", | |
| 695 | - content=filtered_content, | |
| 696 | - is_stream_end=chunk.is_done, | |
| 697 | - )) | |
| 698 | - | |
| 699 | - # Check if we should inject steering (bad patterns detected) | |
| 700 | - if self.safeguards.should_steer(): | |
| 701 | - steering_msg = self.safeguards.get_steering_message() | |
| 702 | - if steering_msg: | |
| 703 | - # Queue steering for next iteration | |
| 704 | - self._steering_queue.put_nowait(steering_msg) | |
| 705 | - | |
| 706 | - # Show pending tool calls as they're detected (ReAct mode interleaving) | |
| 707 | - if chunk.pending_tool_call and chunk.pending_tool_call.id not in pending_tool_calls_seen: | |
| 708 | - pending_tool_calls_seen.add(chunk.pending_tool_call.id) | |
| 709 | - await emit(AgentEvent( | |
| 710 | - type="tool_call", | |
| 711 | - tool_name=chunk.pending_tool_call.name, | |
| 712 | - tool_args=chunk.pending_tool_call.arguments, | |
| 713 | - )) | |
| 714 | - if chunk.is_done: | |
| 715 | - full_content = chunk.full_content or full_content_unfiltered | |
| 716 | - tool_calls = chunk.tool_calls | |
| 717 | - # Debug log | |
| 718 | - try: | |
| 719 | - with open("/tmp/loader_debug.log", "a") as f: | |
| 720 | - f.write(f"[loop] chunk.is_done: got {len(tool_calls)} tool_calls\n") | |
| 721 | - except Exception: | |
| 722 | - pass | |
| 723 | - | |
| 724 | - content = full_content | |
| 725 | - response_content = full_content | |
| 726 | - else: | |
| 727 | - response = await self.backend.complete( | |
| 728 | - messages=self._build_messages(), | |
| 729 | - tools=tools, | |
| 730 | - temperature=self.config.temperature, | |
| 731 | - max_tokens=effective_max_tokens, | |
| 732 | - ) | |
| 733 | - # Filter content through safeguards (removes code blocks) | |
| 734 | - response_content = response.content # Keep original for history | |
| 735 | - content = self.safeguards.filter_complete_content(response.content) | |
| 736 | - tool_calls = response.tool_calls if not self.use_react else [] | |
| 737 | - | |
| 738 | - # Check if we should inject steering (bad patterns detected) | |
| 739 | - if self.safeguards.should_steer(): | |
| 740 | - steering_msg = self.safeguards.get_steering_message() | |
| 741 | - if steering_msg: | |
| 742 | - self._steering_queue.put_nowait(steering_msg) | |
| 743 | - | |
| 744 | - # Handle empty responses (common with small models after clarifications) | |
| 745 | - if not content.strip(): | |
| 746 | - empty_retry_count += 1 | |
| 747 | - if empty_retry_count <= MAX_EMPTY_RETRIES: | |
| 748 | - # Use progressively more direct prompts | |
| 749 | - task_context = original_task or task | |
| 750 | - retry_prompts = [ | |
| 751 | - # Retry 1: Gentle nudge with action focus | |
| 752 | - f"Great! Now let me proceed with the task. I'll start by using my tools.", | |
| 753 | - # Retry 2: More explicit about what to do | |
| 754 | - f"I understand. Let me create that now using my tools (write, bash, etc.).", | |
| 755 | - # Retry 3: Very direct instruction | |
| 756 | - f"Proceeding with: {task_context[:80]}. I'll use the write tool to create the files.", | |
| 757 | - # Retry 4: Action-first prompt | |
| 758 | - f"Starting now. First step: create the necessary files and directories.", | |
| 759 | - # Retry 5: Last attempt with full context | |
| 760 | - f"Let me complete this task step by step. The goal is: {task_context[:100]}", | |
| 761 | - ] | |
| 762 | - prompt = retry_prompts[min(empty_retry_count - 1, len(retry_prompts) - 1)] | |
| 763 | - self.messages.append(Message( | |
| 764 | - role=Role.ASSISTANT, | |
| 765 | - content=prompt, # Add as assistant message to give model a "running start" | |
| 766 | - )) | |
| 767 | - continue | |
| 768 | - else: | |
| 769 | - # Give up after max retries - but make the message less alarming | |
| 770 | - await emit(AgentEvent( | |
| 771 | - type="response", | |
| 772 | - content="I need a bit more direction. What specifically would you like me to create or do?", | |
| 773 | - )) | |
| 774 | - break | |
| 775 | - | |
| 776 | - # Get tool calls - either native or parsed from text | |
| 777 | - if self.use_react: | |
| 778 | - # Parse tool calls from text (ReAct mode) | |
| 779 | - parsed = parse_tool_calls(content) | |
| 780 | - tool_calls = parsed.tool_calls | |
| 781 | - content = parsed.content | |
| 782 | - | |
| 783 | - # Check if this is a final answer | |
| 784 | - if parsed.is_final_answer and not tool_calls: | |
| 785 | - final_response = content | |
| 786 | - self.messages.append(Message( | |
| 787 | - role=Role.ASSISTANT, | |
| 788 | - content=response_content, # Keep original for history | |
| 789 | - )) | |
| 790 | - await emit(AgentEvent(type="response", content=final_response)) | |
| 791 | - break | |
| 792 | - | |
| 793 | - # If there are tool calls, execute them | |
| 794 | - if tool_calls: | |
| 795 | - # Debug log | |
| 796 | - try: | |
| 797 | - with open("/tmp/loader_debug.log", "a") as f: | |
| 798 | - f.write(f"[loop] executing {len(tool_calls)} tool_calls\n") | |
| 799 | - for tc in tool_calls: | |
| 800 | - f.write(f"[loop] - {tc.name}: id={tc.id}, args_keys={list(tc.arguments.keys())}\n") | |
| 801 | - except Exception: | |
| 802 | - pass | |
| 803 | - | |
| 804 | - # Add assistant message with tool calls | |
| 805 | - self.messages.append(Message( | |
| 806 | - role=Role.ASSISTANT, | |
| 807 | - content=response_content, | |
| 808 | - tool_calls=tool_calls, | |
| 809 | - )) | |
| 810 | - | |
| 811 | - # Execute each tool (with recovery logic) | |
| 812 | - for tool_call in tool_calls: | |
| 813 | - cfg = self.config.reasoning | |
| 814 | - | |
| 815 | - # Confidence scoring before execution | |
| 816 | - if cfg.confidence_scoring: | |
| 817 | - context = "\n".join( | |
| 818 | - m.content[:500] for m in self.messages[-5:] | |
| 819 | - if m.content | |
| 820 | - ) | |
| 821 | - confidence = await self._assess_confidence( | |
| 822 | - tool_call.name, | |
| 823 | - tool_call.arguments, | |
| 824 | - context, | |
| 825 | - ) | |
| 826 | - await emit(AgentEvent( | |
| 827 | - type="confidence", | |
| 828 | - content=f"Confidence: {confidence.level.name} ({confidence.score}/5)", | |
| 829 | - confidence=confidence, | |
| 830 | - tool_name=tool_call.name, | |
| 831 | - )) | |
| 832 | - | |
| 833 | - # If confidence is too low, ask LLM to reconsider | |
| 834 | - if confidence.score < cfg.min_confidence_for_action: | |
| 835 | - low_conf_msg = ( | |
| 836 | - f"[LOW CONFIDENCE WARNING] The planned action has low confidence " | |
| 837 | - f"({confidence.level.name}).\n" | |
| 838 | - f"Reasoning: {confidence.reasoning}\n" | |
| 839 | - f"Risks: {', '.join(confidence.risks)}\n" | |
| 840 | - f"Consider an alternative approach or gather more information first." | |
| 841 | - ) | |
| 842 | - self.messages.append(Message( | |
| 843 | - role=Role.USER, | |
| 844 | - content=low_conf_msg, | |
| 845 | - )) | |
| 846 | - continue # Skip this tool call, let LLM reconsider | |
| 847 | - | |
| 848 | - # Only emit tool_call if not already shown during streaming | |
| 849 | - if tool_call.id not in pending_tool_calls_seen: | |
| 850 | - try: | |
| 851 | - with open("/tmp/loader_debug.log", "a") as f: | |
| 852 | - f.write(f"[loop] emitting tool_call event for {tool_call.name}\n") | |
| 853 | - except Exception: | |
| 854 | - pass | |
| 855 | - await emit(AgentEvent( | |
| 856 | - type="tool_call", | |
| 857 | - tool_name=tool_call.name, | |
| 858 | - tool_args=tool_call.arguments, | |
| 859 | - )) | |
| 860 | - else: | |
| 861 | - try: | |
| 862 | - with open("/tmp/loader_debug.log", "a") as f: | |
| 863 | - f.write(f"[loop] SKIPPING tool_call event for {tool_call.name} (already in pending_seen)\n") | |
| 864 | - except Exception: | |
| 865 | - pass | |
| 866 | - | |
| 867 | - # Track this action for completion checking | |
| 868 | - action_desc = f"{tool_call.name}: {str(tool_call.arguments)[:100]}" | |
| 869 | - actions_taken.append(action_desc) | |
| 870 | - | |
| 871 | - # Check for duplicate actions using safeguards | |
| 872 | - is_dup, dup_reason = self.safeguards.check_duplicate( | |
| 873 | - tool_call.name, tool_call.arguments | |
| 874 | - ) | |
| 875 | - if is_dup: | |
| 876 | - try: | |
| 877 | - with open("/tmp/loader_debug.log", "a") as f: | |
| 878 | - f.write(f"[loop] SKIPPING duplicate: {dup_reason}\n") | |
| 879 | - except Exception: | |
| 880 | - pass | |
| 881 | - # Add a tool result indicating skip | |
| 882 | - self.messages.append(Message( | |
| 883 | - role=Role.TOOL, | |
| 884 | - content=f"[Skipped - duplicate action: {dup_reason}]", | |
| 885 | - tool_call_id=tool_call.id, | |
| 886 | - )) | |
| 887 | - continue # Skip to next tool call | |
| 888 | - | |
| 889 | - # Pre-action validation | |
| 890 | - validation = self.safeguards.validate_action( | |
| 891 | - tool_call.name, tool_call.arguments | |
| 892 | - ) | |
| 893 | - if not validation.valid: | |
| 894 | - try: | |
| 895 | - with open("/tmp/loader_debug.log", "a") as f: | |
| 896 | - f.write(f"[loop] BLOCKED by validation: {validation.reason}\n") | |
| 897 | - except Exception: | |
| 898 | - pass | |
| 899 | - # Add a tool result with the validation error | |
| 900 | - error_msg = f"[Blocked - {validation.reason}]" | |
| 901 | - if validation.suggestion: | |
| 902 | - error_msg += f" Suggestion: {validation.suggestion}" | |
| 903 | - self.messages.append(Message( | |
| 904 | - role=Role.TOOL, | |
| 905 | - content=error_msg, | |
| 906 | - tool_call_id=tool_call.id, | |
| 907 | - )) | |
| 908 | - await emit(AgentEvent( | |
| 909 | - type="tool_result", | |
| 910 | - content=error_msg, | |
| 911 | - tool_name=tool_call.name, | |
| 912 | - is_error=True, | |
| 913 | - )) | |
| 914 | - continue # Skip to next tool call | |
| 915 | - | |
| 916 | - # Rollback planning: create rollback action before destructive ops | |
| 917 | - if rollback_plan and is_destructive_tool(tool_call.name, tool_call.arguments): | |
| 918 | - async def read_file_for_backup(path: str) -> str: | |
| 919 | - """Read file contents for backup.""" | |
| 920 | - result = await self.registry.execute("read", file_path=path) | |
| 921 | - return result.output if not result.is_error else "" | |
| 922 | - | |
| 923 | - rollback_action = await create_rollback_plan_for_action( | |
| 924 | - tool_call.name, | |
| 925 | - tool_call.arguments, | |
| 926 | - read_file_for_backup, | |
| 927 | - ) | |
| 928 | - if rollback_action: | |
| 929 | - rollback_plan.actions.append(rollback_action) | |
| 930 | - if self.config.reasoning.show_rollback_plan: | |
| 931 | - await emit(AgentEvent( | |
| 932 | - type="rollback", | |
| 933 | - content=f"Rollback tracked: {rollback_action.description}", | |
| 934 | - rollback_action=rollback_action, | |
| 935 | - )) | |
| 936 | - | |
| 937 | - # Try to execute, handling confirmation if needed | |
| 938 | - try: | |
| 939 | - result = await self.registry.execute( | |
| 940 | - tool_call.name, | |
| 941 | - **tool_call.arguments, | |
| 942 | - ) | |
| 943 | - except ConfirmationRequired as conf: | |
| 944 | - # Emit confirmation event | |
| 945 | - await emit(AgentEvent( | |
| 946 | - type="confirmation", | |
| 947 | - tool_name=conf.tool_name, | |
| 948 | - confirm_message=conf.message, | |
| 949 | - confirm_details=conf.details, | |
| 950 | - )) | |
| 951 | - | |
| 952 | - # If we have a confirmation callback, ask user | |
| 953 | - if on_confirmation: | |
| 954 | - confirmed = await on_confirmation( | |
| 955 | - conf.tool_name, | |
| 956 | - conf.message, | |
| 957 | - conf.details, | |
| 958 | - ) | |
| 959 | - if confirmed: | |
| 960 | - # Re-execute with skip_confirmation | |
| 961 | - old_skip = self.registry.skip_confirmation | |
| 962 | - self.registry.skip_confirmation = True | |
| 963 | - try: | |
| 964 | - result = await self.registry.execute( | |
| 965 | - tool_call.name, | |
| 966 | - **tool_call.arguments, | |
| 967 | - ) | |
| 968 | - finally: | |
| 969 | - self.registry.skip_confirmation = old_skip | |
| 970 | - else: | |
| 971 | - # User declined - create a skip result | |
| 972 | - from ..tools.base import ToolResult | |
| 973 | - result = ToolResult( | |
| 974 | - output=f"Tool {tool_call.name} was declined by user", | |
| 975 | - is_error=False, | |
| 976 | - ) | |
| 977 | - else: | |
| 978 | - # No callback - treat as auto-confirmed (for non-TUI mode) | |
| 979 | - old_skip = self.registry.skip_confirmation | |
| 980 | - self.registry.skip_confirmation = True | |
| 981 | - try: | |
| 982 | - result = await self.registry.execute( | |
| 983 | - tool_call.name, | |
| 984 | - **tool_call.arguments, | |
| 985 | - ) | |
| 986 | - finally: | |
| 987 | - self.registry.skip_confirmation = old_skip | |
| 988 | - | |
| 989 | - # Handle errors with recovery | |
| 990 | - if result.is_error and self.config.auto_recover: | |
| 991 | - # Initialize or update recovery context | |
| 992 | - if self._recovery_context is None: | |
| 993 | - self._recovery_context = RecoveryContext( | |
| 994 | - original_tool=tool_call.name, | |
| 995 | - original_args=tool_call.arguments, | |
| 996 | - max_retries=self.config.max_recovery_attempts, | |
| 997 | - ) | |
| 998 | - | |
| 999 | - # Check if this or a similar call was already tried (loop detection) | |
| 1000 | - if self._recovery_context.is_similar_attempt(tool_call.name, tool_call.arguments): | |
| 1001 | - await emit(AgentEvent( | |
| 1002 | - type="error", | |
| 1003 | - content=f"Loop detected: already tried a similar command. Try a DIFFERENT approach (e.g., read a config file first).", | |
| 1004 | - tool_name=tool_call.name, | |
| 1005 | - )) | |
| 1006 | - else: | |
| 1007 | - # Record this attempt | |
| 1008 | - self._recovery_context.add_attempt( | |
| 1009 | - tool_call.name, | |
| 1010 | - tool_call.arguments, | |
| 1011 | - result.output, | |
| 1012 | - ) | |
| 1013 | - | |
| 1014 | - # Can we retry? | |
| 1015 | - if self._recovery_context.can_retry(): | |
| 1016 | - attempt_num = len(self._recovery_context.attempts) | |
| 1017 | - await emit(AgentEvent( | |
| 1018 | - type="recovery", | |
| 1019 | - content=f"Tool failed, attempting recovery ({attempt_num}/{self._recovery_context.max_retries})", | |
| 1020 | - tool_name=tool_call.name, | |
| 1021 | - recovery_attempt=attempt_num, | |
| 1022 | - )) | |
| 1023 | - | |
| 1024 | - # Add recovery prompt for LLM | |
| 1025 | - recovery_prompt = format_recovery_prompt( | |
| 1026 | - self._recovery_context, | |
| 1027 | - tool_call.name, | |
| 1028 | - tool_call.arguments, | |
| 1029 | - result.output, | |
| 1030 | - ) | |
| 1031 | - self.messages.append(Message( | |
| 1032 | - role=Role.TOOL, | |
| 1033 | - content=recovery_prompt, | |
| 1034 | - )) | |
| 1035 | - | |
| 1036 | - # Continue to let LLM try an alternative | |
| 1037 | - continue | |
| 1038 | - else: | |
| 1039 | - # Max retries exceeded | |
| 1040 | - failure_msg = format_failure_message(self._recovery_context) | |
| 1041 | - await emit(AgentEvent( | |
| 1042 | - type="error", | |
| 1043 | - content=failure_msg, | |
| 1044 | - tool_name=tool_call.name, | |
| 1045 | - )) | |
| 1046 | - self._recovery_context = None | |
| 1047 | - | |
| 1048 | - # Add the final error result | |
| 1049 | - result_text = format_tool_result( | |
| 1050 | - tool_call.name, | |
| 1051 | - failure_msg, | |
| 1052 | - is_error=True, | |
| 1053 | - ) | |
| 1054 | - self.messages.append(Message( | |
| 1055 | - role=Role.TOOL, | |
| 1056 | - content=result_text, | |
| 1057 | - )) | |
| 1058 | - continue | |
| 1059 | - else: | |
| 1060 | - # Success or no auto-recover - clear recovery context | |
| 1061 | - if not result.is_error: | |
| 1062 | - self._recovery_context = None | |
| 1063 | - # Record successful action to prevent duplicates | |
| 1064 | - self.safeguards.record_action(tool_call.name, tool_call.arguments) | |
| 1065 | - | |
| 1066 | - # Check for repetitive loop pattern | |
| 1067 | - is_loop, loop_desc = self.safeguards.detect_loop() | |
| 1068 | - if is_loop: | |
| 1069 | - await emit(AgentEvent( | |
| 1070 | - type="error", | |
| 1071 | - content=f"Loop detected: {loop_desc}. Stopping to prevent repetitive behavior.", | |
| 1072 | - )) | |
| 1073 | - final_response = "I noticed I was repeating the same actions. Let me know what you'd like me to do differently." | |
| 1074 | - self.messages.append(Message( | |
| 1075 | - role=Role.ASSISTANT, | |
| 1076 | - content=final_response, | |
| 1077 | - )) | |
| 1078 | - await emit(AgentEvent(type="response", content=final_response)) | |
| 1079 | - return final_response | |
| 1080 | - | |
| 1081 | - await emit(AgentEvent( | |
| 1082 | - type="tool_result", | |
| 1083 | - content=result.output, | |
| 1084 | - tool_name=tool_call.name, | |
| 1085 | - is_error=result.is_error, | |
| 1086 | - )) | |
| 1087 | - | |
| 1088 | - # Post-action verification | |
| 1089 | - if cfg.verification and not result.is_error: | |
| 1090 | - verification = await self._verify_action( | |
| 1091 | - tool_call.name, | |
| 1092 | - tool_call.arguments, | |
| 1093 | - result.output, | |
| 1094 | - ) | |
| 1095 | - await emit(AgentEvent( | |
| 1096 | - type="verification", | |
| 1097 | - content=f"Verified: {verification.verified}", | |
| 1098 | - verification=verification, | |
| 1099 | - tool_name=tool_call.name, | |
| 1100 | - )) | |
| 1101 | - | |
| 1102 | - if not verification.verified and verification.needs_correction: | |
| 1103 | - # Add correction suggestion for LLM | |
| 1104 | - correction_msg = ( | |
| 1105 | - f"[VERIFICATION FAILED] The action did not produce expected results.\n" | |
| 1106 | - f"Discrepancies: {', '.join(verification.discrepancies)}\n" | |
| 1107 | - f"Suggestion: {verification.correction_suggestion}" | |
| 1108 | - ) | |
| 1109 | - self.messages.append(Message( | |
| 1110 | - role=Role.USER, | |
| 1111 | - content=correction_msg, | |
| 1112 | - )) | |
| 1113 | - # Don't add the tool result - let LLM try correction | |
| 1114 | - continue | |
| 1115 | - | |
| 1116 | - # Add tool result message | |
| 1117 | - result_text = format_tool_result( | |
| 1118 | - tool_call.name, | |
| 1119 | - result.output, | |
| 1120 | - result.is_error, | |
| 1121 | - ) | |
| 1122 | - self.messages.append(Message( | |
| 1123 | - role=Role.TOOL, | |
| 1124 | - content=result_text, | |
| 1125 | - )) | |
| 1126 | - | |
| 1127 | - # Continue the loop to get next response | |
| 1128 | - continue | |
| 1129 | - | |
| 1130 | - # No tool calls - check if model outputted raw JSON tool calls as text | |
| 1131 | - # Some small models do this instead of using the proper API | |
| 1132 | - if not tool_calls: | |
| 1133 | - try: | |
| 1134 | - with open("/tmp/loader_debug.log", "a") as f: | |
| 1135 | - f.write(f"[loop] no tool_calls, checking for raw JSON/bracket format in content (len={len(content)})\n") | |
| 1136 | - except Exception: | |
| 1137 | - pass | |
| 1138 | - raw_tool_calls = self._extract_raw_json_tool_calls(content) | |
| 1139 | - try: | |
| 1140 | - with open("/tmp/loader_debug.log", "a") as f: | |
| 1141 | - f.write(f"[loop] _extract_raw_json_tool_calls returned {len(raw_tool_calls)} calls\n") | |
| 1142 | - for tc in raw_tool_calls: | |
| 1143 | - f.write(f"[loop] - {tc.name}: {list(tc.arguments.keys())}\n") | |
| 1144 | - except Exception: | |
| 1145 | - pass | |
| 1146 | - if raw_tool_calls: | |
| 1147 | - # Successfully extracted tool calls from raw JSON - use them | |
| 1148 | - tool_calls = raw_tool_calls | |
| 1149 | - # Clear the streamed content (it was raw JSON, looks ugly) | |
| 1150 | - await emit(AgentEvent(type="clear_stream")) | |
| 1151 | - | |
| 1152 | - # If we now have tool calls (from raw JSON extraction), execute them | |
| 1153 | - if tool_calls: | |
| 1154 | - extracted_iterations += 1 | |
| 1155 | - | |
| 1156 | - # Check if we've exceeded extraction limits | |
| 1157 | - if extracted_iterations > MAX_EXTRACTED_ITERATIONS: | |
| 1158 | - # Model keeps outputting bracket-format calls - stop and let user continue | |
| 1159 | - final_response = content | |
| 1160 | - self.messages.append(Message(role=Role.ASSISTANT, content=response_content)) | |
| 1161 | - await emit(AgentEvent( | |
| 1162 | - type="response", | |
| 1163 | - content=final_response + "\n\nLet me know if you'd like me to continue or make changes." | |
| 1164 | - )) | |
| 1165 | - break | |
| 1166 | - | |
| 1167 | - try: | |
| 1168 | - with open("/tmp/loader_debug.log", "a") as f: | |
| 1169 | - f.write(f"[loop] executing {len(tool_calls)} extracted tool calls (iteration {extracted_iterations})\n") | |
| 1170 | - except Exception: | |
| 1171 | - pass | |
| 1172 | - | |
| 1173 | - # Track errors in this batch | |
| 1174 | - batch_errors = 0 | |
| 1175 | - | |
| 1176 | - # This duplicates the tool execution logic above, but that's intentional | |
| 1177 | - # to handle the case where raw JSON tool calls are extracted | |
| 1178 | - for i, tc in enumerate(tool_calls): | |
| 1179 | - # Skip browser/display commands that don't work in terminal | |
| 1180 | - if tc.name == "bash": | |
| 1181 | - cmd = tc.arguments.get("command", "") | |
| 1182 | - if any(x in cmd for x in ["xdg-open", "open ", "firefox", "chrome", "browser"]): | |
| 1183 | - try: | |
| 1184 | - with open("/tmp/loader_debug.log", "a") as f: | |
| 1185 | - f.write(f"[loop] skipping browser command: {cmd[:50]}\n") | |
| 1186 | - except Exception: | |
| 1187 | - pass | |
| 1188 | - continue | |
| 1189 | - | |
| 1190 | - # Use safeguards for duplicate checking | |
| 1191 | - is_dup, dup_reason = self.safeguards.check_duplicate(tc.name, tc.arguments) | |
| 1192 | - if is_dup: | |
| 1193 | - try: | |
| 1194 | - with open("/tmp/loader_debug.log", "a") as f: | |
| 1195 | - f.write(f"[loop] skipping duplicate: {dup_reason}\n") | |
| 1196 | - except Exception: | |
| 1197 | - pass | |
| 1198 | - continue | |
| 1199 | - | |
| 1200 | - # Pre-action validation | |
| 1201 | - validation = self.safeguards.validate_action(tc.name, tc.arguments) | |
| 1202 | - if not validation.valid: | |
| 1203 | - try: | |
| 1204 | - with open("/tmp/loader_debug.log", "a") as f: | |
| 1205 | - f.write(f"[loop] BLOCKED by validation: {validation.reason}\n") | |
| 1206 | - except Exception: | |
| 1207 | - pass | |
| 1208 | - error_msg = f"[Blocked - {validation.reason}]" | |
| 1209 | - if validation.suggestion: | |
| 1210 | - error_msg += f" Suggestion: {validation.suggestion}" | |
| 1211 | - await emit(AgentEvent( | |
| 1212 | - type="tool_result", | |
| 1213 | - content=error_msg, | |
| 1214 | - tool_name=tc.name, | |
| 1215 | - is_error=True, | |
| 1216 | - )) | |
| 1217 | - self.messages.append(Message( | |
| 1218 | - role=Role.TOOL, | |
| 1219 | - content=error_msg, | |
| 1220 | - )) | |
| 1221 | - batch_errors += 1 | |
| 1222 | - continue | |
| 1223 | - | |
| 1224 | - # Small delay between tool executions for better UX | |
| 1225 | - if i > 0: | |
| 1226 | - await asyncio.sleep(0.4) | |
| 1227 | - try: | |
| 1228 | - with open("/tmp/loader_debug.log", "a") as f: | |
| 1229 | - f.write(f"[loop] executing extracted tool: {tc.name} args={tc.arguments}\n") | |
| 1230 | - except Exception: | |
| 1231 | - pass | |
| 1232 | - actions_taken.append(f"{tc.name}: {str(tc.arguments)[:50]}...") | |
| 1233 | - await emit(AgentEvent( | |
| 1234 | - type="tool_call", | |
| 1235 | - tool_name=tc.name, | |
| 1236 | - tool_args=tc.arguments, | |
| 1237 | - )) | |
| 1238 | - | |
| 1239 | - # Execute the tool | |
| 1240 | - is_error = False | |
| 1241 | - try: | |
| 1242 | - result = await self.registry.execute(tc.name, **tc.arguments) | |
| 1243 | - result_text = result.output | |
| 1244 | - is_error = result.is_error | |
| 1245 | - except ConfirmationRequired as e: | |
| 1246 | - # Emit confirmation event | |
| 1247 | - await emit(AgentEvent( | |
| 1248 | - type="confirmation", | |
| 1249 | - tool_name=e.tool_name, | |
| 1250 | - confirm_message=e.message, | |
| 1251 | - confirm_details=e.details, | |
| 1252 | - )) | |
| 1253 | - if on_confirmation: | |
| 1254 | - confirmed = await on_confirmation(tc.name, e.message, e.details) | |
| 1255 | - if confirmed: | |
| 1256 | - # Re-execute with skip_confirmation | |
| 1257 | - old_skip = self.registry.skip_confirmation | |
| 1258 | - self.registry.skip_confirmation = True | |
| 1259 | - try: | |
| 1260 | - result = await self.registry.execute(tc.name, **tc.arguments) | |
| 1261 | - result_text = result.output | |
| 1262 | - is_error = result.is_error | |
| 1263 | - finally: | |
| 1264 | - self.registry.skip_confirmation = old_skip | |
| 1265 | - else: | |
| 1266 | - result_text = "Tool execution cancelled by user." | |
| 1267 | - else: | |
| 1268 | - # No callback - auto-confirm for extracted tool calls | |
| 1269 | - old_skip = self.registry.skip_confirmation | |
| 1270 | - self.registry.skip_confirmation = True | |
| 1271 | - try: | |
| 1272 | - result = await self.registry.execute(tc.name, **tc.arguments) | |
| 1273 | - result_text = result.output | |
| 1274 | - is_error = result.is_error | |
| 1275 | - finally: | |
| 1276 | - self.registry.skip_confirmation = old_skip | |
| 1277 | - except Exception as e: | |
| 1278 | - result_text = f"Error: {e}" | |
| 1279 | - is_error = True | |
| 1280 | - | |
| 1281 | - # Track errors | |
| 1282 | - if is_error: | |
| 1283 | - batch_errors += 1 | |
| 1284 | - consecutive_errors += 1 | |
| 1285 | - else: | |
| 1286 | - consecutive_errors = 0 # Reset on success | |
| 1287 | - # Record successful action to prevent duplicates | |
| 1288 | - self.safeguards.record_action(tc.name, tc.arguments) | |
| 1289 | - | |
| 1290 | - # Check for repetitive loop pattern | |
| 1291 | - is_loop, loop_desc = self.safeguards.detect_loop() | |
| 1292 | - if is_loop: | |
| 1293 | - await emit(AgentEvent( | |
| 1294 | - type="error", | |
| 1295 | - content=f"Loop detected: {loop_desc}. Stopping to prevent repetitive behavior.", | |
| 1296 | - )) | |
| 1297 | - final_response = "I noticed I was repeating the same actions. Let me know what you'd like me to do differently." | |
| 1298 | - self.messages.append(Message( | |
| 1299 | - role=Role.ASSISTANT, | |
| 1300 | - content=final_response, | |
| 1301 | - )) | |
| 1302 | - await emit(AgentEvent(type="response", content=final_response)) | |
| 1303 | - return final_response | |
| 1304 | - | |
| 1305 | - await emit(AgentEvent( | |
| 1306 | - type="tool_result", | |
| 1307 | - content=result_text, | |
| 1308 | - tool_name=tc.name, | |
| 1309 | - is_error=is_error, | |
| 1310 | - )) | |
| 1311 | - | |
| 1312 | - self.messages.append(Message( | |
| 1313 | - role=Role.ASSISTANT, | |
| 1314 | - content=response_content, | |
| 1315 | - )) | |
| 1316 | - self.messages.append(Message( | |
| 1317 | - role=Role.TOOL, | |
| 1318 | - content=result_text, | |
| 1319 | - )) | |
| 1320 | - | |
| 1321 | - # After executing batch, check if we should stop | |
| 1322 | - # Stop if: all tools in batch failed, or we have many consecutive errors | |
| 1323 | - if batch_errors == len(tool_calls) or consecutive_errors >= 3: | |
| 1324 | - # All failed or too many consecutive errors - stop trying | |
| 1325 | - final_response = "I ran into some issues. Let me know if you'd like me to try a different approach." | |
| 1326 | - await emit(AgentEvent(type="response", content=final_response)) | |
| 1327 | - break | |
| 1328 | - | |
| 1329 | - continue | |
| 1330 | - | |
| 1331 | - # No tool calls - check if model is describing instead of acting | |
| 1332 | - # IMPORTANT: Check ORIGINAL content before safeguards filtered it! | |
| 1333 | - # Debug log | |
| 1334 | - try: | |
| 1335 | - has_unexecuted = self._contains_unexecuted_code(response_content) | |
| 1336 | - with open("/tmp/loader_debug.log", "a") as f: | |
| 1337 | - f.write(f"[chatbot-check] iterations={iterations}, has_unexecuted={has_unexecuted}\n") | |
| 1338 | - f.write(f"[chatbot-check] response_content (first 200): {response_content[:200]}\n") | |
| 1339 | - f.write(f"[chatbot-check] filtered content (first 200): {content[:200]}\n") | |
| 1340 | - except Exception: | |
| 1341 | - pass | |
| 1342 | - | |
| 1343 | - if self._contains_unexecuted_code(response_content) and iterations < self.config.max_iterations - 1: | |
| 1344 | - # Model outputted code blocks without using tools - nudge it | |
| 1345 | - try: | |
| 1346 | - with open("/tmp/loader_debug.log", "a") as f: | |
| 1347 | - f.write(f"[chatbot-check] TRIGGERING chatbot recovery\n") | |
| 1348 | - except Exception: | |
| 1349 | - pass | |
| 1350 | - # Silently steer - don't show error to user (internal correction) | |
| 1351 | - self.messages.append(Message( | |
| 1352 | - role=Role.ASSISTANT, | |
| 1353 | - content=response_content, | |
| 1354 | - )) | |
| 1355 | - self.messages.append(Message( | |
| 1356 | - role=Role.USER, | |
| 1357 | - content="CRITICAL ERROR: You are PRETENDING to use tools instead of actually using them.\n\n" | |
| 1358 | - "DO NOT write:\n" | |
| 1359 | - "- 'Used bash tool with command...' (THIS IS FAKE)\n" | |
| 1360 | - "- 'Created a file using the write tool...' (THIS IS FAKE)\n" | |
| 1361 | - "- 'Here is what I did:' followed by descriptions\n" | |
| 1362 | - "- Numbered steps or instructions\n" | |
| 1363 | - "- Code blocks for me to copy\n\n" | |
| 1364 | - "Your tool calls MUST go through the proper tool interface.\n" | |
| 1365 | - "Writing 'Used bash tool...' does NOT execute anything!\n\n" | |
| 1366 | - "ACTUALLY call the tools using the tool_call mechanism.\n" | |
| 1367 | - "DO IT NOW - stop narrating and start executing.", | |
| 1368 | - )) | |
| 1369 | - continue | |
| 1370 | - | |
| 1371 | - # No tool calls and early in the task - MAY be giving up too soon | |
| 1372 | - # But only intervene if we haven't done ANY work yet | |
| 1373 | - if not self.use_react and len(actions_taken) == 0 and iterations < self.config.max_iterations - 2: | |
| 1374 | - # Check if response looks like deflection without having done anything | |
| 1375 | - deflection_phrases = ["you can", "you should", "you could", "try running"] | |
| 1376 | - looks_like_deflection = any(p in content.lower() for p in deflection_phrases) | |
| 1377 | - | |
| 1378 | - if looks_like_deflection: | |
| 1379 | - self.messages.append(Message( | |
| 1380 | - role=Role.ASSISTANT, | |
| 1381 | - content=response_content, | |
| 1382 | - )) | |
| 1383 | - self.messages.append(Message( | |
| 1384 | - role=Role.USER, | |
| 1385 | - content="Please use your tools to execute the task rather than telling me what to do.", | |
| 1386 | - )) | |
| 1387 | - continue | |
| 1388 | - | |
| 1389 | - # Self-critique before finalizing (if enabled and response has substance) | |
| 1390 | - cfg = self.config.reasoning | |
| 1391 | - if cfg.self_critique and len(content) > 100: | |
| 1392 | - # Check if we should critique this response | |
| 1393 | - is_code_response = "```" in content or any( | |
| 1394 | - keyword in content.lower() | |
| 1395 | - for keyword in ["def ", "function ", "class ", "import "] | |
| 1396 | - ) | |
| 1397 | - if should_self_critique(content, is_code=is_code_response): | |
| 1398 | - context = task | |
| 1399 | - critique = await self._self_critique(content, context) | |
| 1400 | - | |
| 1401 | - await emit(AgentEvent( | |
| 1402 | - type="critique", | |
| 1403 | - content=f"Self-critique: {len(critique.issues_found)} issues found", | |
| 1404 | - critique=critique, | |
| 1405 | - )) | |
| 1406 | - | |
| 1407 | - if critique.can_revise(): | |
| 1408 | - # Ask for revision | |
| 1409 | - revision_msg = ( | |
| 1410 | - f"[SELF-CRITIQUE] Review your response:\n" | |
| 1411 | - f"Issues found: {', '.join(critique.issues_found)}\n" | |
| 1412 | - f"Suggestions: {', '.join(critique.suggestions)}\n\n" | |
| 1413 | - "Please provide an improved response addressing these issues." | |
| 1414 | - ) | |
| 1415 | - self.messages.append(Message( | |
| 1416 | - role=Role.ASSISTANT, | |
| 1417 | - content=response_content, | |
| 1418 | - )) | |
| 1419 | - self.messages.append(Message( | |
| 1420 | - role=Role.USER, | |
| 1421 | - content=revision_msg, | |
| 1422 | - )) | |
| 1423 | - critique.revision_count += 1 | |
| 1424 | - continue # Loop to get revised response | |
| 1425 | - | |
| 1426 | - # Check for text loop (agent repeating the same response) | |
| 1427 | - is_text_loop, text_loop_desc = self.safeguards.detect_text_loop(content) | |
| 1428 | - if is_text_loop: | |
| 1429 | - await emit(AgentEvent( | |
| 1430 | - type="error", | |
| 1431 | - content=f"Text loop detected: {text_loop_desc}. Stopping.", | |
| 1432 | - )) | |
| 1433 | - final_response = "I seem to be repeating myself. Let me know if you'd like me to try a different approach." | |
| 1434 | - self.messages.append(Message( | |
| 1435 | - role=Role.ASSISTANT, | |
| 1436 | - content=final_response, | |
| 1437 | - )) | |
| 1438 | - await emit(AgentEvent(type="response", content=final_response)) | |
| 1439 | - return final_response | |
| 1440 | - | |
| 1441 | - # Record response for future loop detection | |
| 1442 | - self.safeguards.record_response(content) | |
| 1443 | - | |
| 1444 | - # Task completion check - don't give up too early! | |
| 1445 | - # Use original_task if available (for multi-turn conversations) | |
| 1446 | - effective_task = original_task or task | |
| 1447 | - if cfg.completion_check and continuation_count < cfg.max_continuation_prompts: | |
| 1448 | - # Quick heuristic check first | |
| 1449 | - if cfg.use_quick_completion: | |
| 1450 | - is_premature = detect_premature_completion(effective_task, content, actions_taken) | |
| 1451 | - else: | |
| 1452 | - is_premature = False | |
| 1453 | - | |
| 1454 | - if is_premature: | |
| 1455 | - continuation_count += 1 | |
| 1456 | - continuation_prompt = get_continuation_prompt(effective_task, actions_taken, content) | |
| 1457 | - | |
| 1458 | - await emit(AgentEvent( | |
| 1459 | - type="completion_check", | |
| 1460 | - content=f"Task may be incomplete ({len(actions_taken)} actions taken)", | |
| 1461 | - completion_check=TaskCompletionCheck( | |
| 1462 | - original_task=effective_task, | |
| 1463 | - is_complete=False, | |
| 1464 | - accomplished=[a.split(":")[0] for a in actions_taken], | |
| 1465 | - continuation_prompt=continuation_prompt, | |
| 1466 | - ), | |
| 1467 | - )) | |
| 1468 | - | |
| 1469 | - # Add the assistant's response and nudge to continue | |
| 1470 | - self.messages.append(Message( | |
| 1471 | - role=Role.ASSISTANT, | |
| 1472 | - content=response_content, | |
| 1473 | - )) | |
| 1474 | - self.messages.append(Message( | |
| 1475 | - role=Role.USER, | |
| 1476 | - content=continuation_prompt, | |
| 1477 | - )) | |
| 1478 | - continue # Loop to get continuation | |
| 1479 | - | |
| 1480 | - # This is the final response | |
| 1481 | - final_response = content | |
| 1482 | - | |
| 1483 | - # If we completed actions, add follow-up question to encourage continued conversation | |
| 1484 | - if actions_taken and final_response.strip(): | |
| 1485 | - # Only add if the response doesn't already end with a question | |
| 1486 | - if not final_response.rstrip().endswith('?'): | |
| 1487 | - final_response = final_response.rstrip() + "\n\nWould you like me to make any changes or additions?" | |
| 1488 | - | |
| 1489 | - self.messages.append(Message( | |
| 1490 | - role=Role.ASSISTANT, | |
| 1491 | - content=response_content, | |
| 1492 | - )) | |
| 1493 | - | |
| 1494 | - # Emit rollback plan summary if we tracked any actions | |
| 1495 | - if rollback_plan and rollback_plan.actions: | |
| 1496 | - await emit(AgentEvent( | |
| 1497 | - type="rollback_summary", | |
| 1498 | - content=f"Rollback plan: {len(rollback_plan.actions)} action(s) tracked", | |
| 1499 | - rollback_plan=rollback_plan, | |
| 1500 | - )) | |
| 1501 | - | |
| 1502 | - await emit(AgentEvent(type="response", content=final_response)) | |
| 1503 | - break | |
| 1504 | - | |
| 1505 | - return final_response | |
| 586 | + runtime = ConversationRuntime(self) | |
| 587 | + self.last_turn_summary = await runtime.run_turn( | |
| 588 | + task, | |
| 589 | + emit, | |
| 590 | + on_confirmation=on_confirmation, | |
| 591 | + original_task=original_task, | |
| 592 | + ) | |
| 593 | + return self.last_turn_summary.final_response | |
| 1506 | 594 | |
| 1507 | 595 | async def run_streaming( |
| 1508 | 596 | self, |
@@ -1923,7 +1011,12 @@ class Agent: | ||
| 1923 | 1011 | def clear_history(self) -> None: |
| 1924 | 1012 | """Clear conversation history.""" |
| 1925 | 1013 | self.messages = [] |
| 1014 | + self.session = ConversationSession( | |
| 1015 | + system_message_factory=self._get_system_message, | |
| 1016 | + few_shot_factory=self._get_few_shot_examples, | |
| 1017 | + messages=self.messages, | |
| 1018 | + ) | |
| 1926 | 1019 | self._recovery_context = None |
| 1927 | 1020 | self._current_task = None |
| 1928 | - self._executed_commands = set() # Clear command dedup tracking | |
| 1021 | + self.last_turn_summary = None | |
| 1929 | 1022 | self.safeguards.reset() # Reset all runtime safeguards |
src/loader/llm/base.pymodified@@ -1,9 +1,10 @@ | ||
| 1 | 1 | """Base classes for LLM backends.""" |
| 2 | 2 | |
| 3 | 3 | from abc import ABC, abstractmethod |
| 4 | +from collections.abc import AsyncIterator | |
| 4 | 5 | from dataclasses import dataclass, field |
| 5 | 6 | from enum import Enum |
| 6 | -from typing import AsyncIterator, Any | |
| 7 | +from typing import Any | |
| 7 | 8 | |
| 8 | 9 | |
| 9 | 10 | class Role(str, Enum): |
@@ -38,6 +39,29 @@ class Message: | ||
| 38 | 39 | tool_calls: list[ToolCall] = field(default_factory=list) |
| 39 | 40 | tool_results: list[ToolResult] = field(default_factory=list) |
| 40 | 41 | |
| 42 | + @classmethod | |
| 43 | + def tool_result_message( | |
| 44 | + cls, | |
| 45 | + *, | |
| 46 | + tool_call_id: str, | |
| 47 | + display_content: str, | |
| 48 | + result_content: str, | |
| 49 | + is_error: bool = False, | |
| 50 | + ) -> "Message": | |
| 51 | + """Build a tool-result message with a typed tool result payload.""" | |
| 52 | + | |
| 53 | + return cls( | |
| 54 | + role=Role.TOOL, | |
| 55 | + content=display_content, | |
| 56 | + tool_results=[ | |
| 57 | + ToolResult( | |
| 58 | + tool_call_id=tool_call_id, | |
| 59 | + content=result_content, | |
| 60 | + is_error=is_error, | |
| 61 | + ) | |
| 62 | + ], | |
| 63 | + ) | |
| 64 | + | |
| 41 | 65 | def to_dict(self) -> dict[str, Any]: |
| 42 | 66 | """Convert to dict for API calls.""" |
| 43 | 67 | result: dict[str, Any] = { |
@@ -49,6 +73,10 @@ class Message: | ||
| 49 | 73 | {"id": tc.id, "name": tc.name, "arguments": tc.arguments} |
| 50 | 74 | for tc in self.tool_calls |
| 51 | 75 | ] |
| 76 | + if self.tool_results: | |
| 77 | + primary_result = self.tool_results[0] | |
| 78 | + result["tool_call_id"] = primary_result.tool_call_id | |
| 79 | + result["is_error"] = primary_result.is_error | |
| 52 | 80 | return result |
| 53 | 81 | |
| 54 | 82 | |
src/loader/runtime/__init__.pymodified@@ -1,6 +1,10 @@ | ||
| 1 | 1 | """Runtime primitives for Loader's turn engine.""" |
| 2 | 2 | |
| 3 | -from .capabilities import CapabilityProfile, resolve_capability_profile | |
| 3 | +from .capabilities import ( | |
| 4 | + CapabilityProfile, | |
| 5 | + resolve_backend_capability_profile, | |
| 6 | + resolve_capability_profile, | |
| 7 | +) | |
| 4 | 8 | from .events import AgentEvent, TurnSummary |
| 5 | 9 | from .session import ConversationSession |
| 6 | 10 | from .tracing import RuntimeTraceEvent, RuntimeTracer |
@@ -12,5 +16,6 @@ __all__ = [ | ||
| 12 | 16 | "RuntimeTraceEvent", |
| 13 | 17 | "RuntimeTracer", |
| 14 | 18 | "TurnSummary", |
| 19 | + "resolve_backend_capability_profile", | |
| 15 | 20 | "resolve_capability_profile", |
| 16 | 21 | ] |
src/loader/runtime/capabilities.pymodified@@ -3,12 +3,24 @@ | ||
| 3 | 3 | from __future__ import annotations |
| 4 | 4 | |
| 5 | 5 | from dataclasses import dataclass, field |
| 6 | -from typing import Any, Literal | |
| 6 | +from typing import Any, Literal, Protocol | |
| 7 | 7 | |
| 8 | 8 | ToolCallFormat = Literal["native", "json_tag", "bracket"] |
| 9 | 9 | VerificationStrictness = Literal["lax", "standard", "strict"] |
| 10 | 10 | |
| 11 | 11 | |
| 12 | +class SupportsCapabilityProfile(Protocol): | |
| 13 | + """Runtime interface for backends that can describe capabilities.""" | |
| 14 | + | |
| 15 | + def capability_profile(self) -> CapabilityProfile: ... | |
| 16 | + | |
| 17 | + | |
| 18 | +class SupportsNativeTools(Protocol): | |
| 19 | + """Runtime interface for backends that can explicitly report tool support.""" | |
| 20 | + | |
| 21 | + def supports_native_tools(self) -> bool: ... | |
| 22 | + | |
| 23 | + | |
| 12 | 24 | @dataclass(frozen=True) |
| 13 | 25 | class CapabilityProfile: |
| 14 | 26 | """Resolved model/runtime capability profile.""" |
@@ -203,3 +215,30 @@ def resolve_capability_profile( | ||
| 203 | 215 | verification_strictness="standard", |
| 204 | 216 | notes=["Unknown model family; defaulting to safe ReAct-style tool use."], |
| 205 | 217 | ) |
| 218 | + | |
| 219 | + | |
| 220 | +def resolve_backend_capability_profile(backend: Any) -> CapabilityProfile: | |
| 221 | + """Resolve capabilities from the backend first, then fall back to model heuristics.""" | |
| 222 | + | |
| 223 | + explicit_profile = getattr(backend, "capability_profile", None) | |
| 224 | + if callable(explicit_profile): | |
| 225 | + profile = explicit_profile() | |
| 226 | + if isinstance(profile, CapabilityProfile): | |
| 227 | + return profile | |
| 228 | + | |
| 229 | + model_name = getattr(backend, "model", backend.__class__.__name__) | |
| 230 | + explicit_native_tools = getattr(backend, "supports_native_tools", None) | |
| 231 | + if callable(explicit_native_tools): | |
| 232 | + supports_native_tools = bool(explicit_native_tools()) | |
| 233 | + preferred_tool_call_format: ToolCallFormat = ( | |
| 234 | + "native" if supports_native_tools else "json_tag" | |
| 235 | + ) | |
| 236 | + return _profile( | |
| 237 | + model_name, | |
| 238 | + supports_native_tools=supports_native_tools, | |
| 239 | + preferred_tool_call_format=preferred_tool_call_format, | |
| 240 | + verification_strictness="standard", | |
| 241 | + notes=["Resolved from backend capability surface."], | |
| 242 | + ) | |
| 243 | + | |
| 244 | + return resolve_capability_profile(model_name) | |
src/loader/runtime/conversation.pyadded@@ -0,0 +1,705 @@ | ||
| 1 | +"""Typed turn engine for Loader runtime execution.""" | |
| 2 | + | |
| 3 | +from __future__ import annotations | |
| 4 | + | |
| 5 | +from collections.abc import Awaitable, Callable | |
| 6 | +from dataclasses import dataclass, field | |
| 7 | +from typing import Any | |
| 8 | + | |
| 9 | +from ..agent.parsing import parse_tool_calls | |
| 10 | +from ..agent.reasoning import ( | |
| 11 | + RollbackPlan, | |
| 12 | + TaskCompletionCheck, | |
| 13 | + create_rollback_plan_for_action, | |
| 14 | + detect_premature_completion, | |
| 15 | + estimate_complexity, | |
| 16 | + get_continuation_prompt, | |
| 17 | + get_token_budget, | |
| 18 | + is_destructive_tool, | |
| 19 | + should_self_critique, | |
| 20 | +) | |
| 21 | +from ..agent.recovery import RecoveryContext, format_failure_message, format_recovery_prompt | |
| 22 | +from ..llm.base import Message, Role, ToolCall | |
| 23 | +from .events import AgentEvent, TurnSummary | |
| 24 | +from .executor import ToolExecutionState, ToolExecutor | |
| 25 | +from .tracing import RuntimeTracer | |
| 26 | + | |
| 27 | +EventSink = Callable[[AgentEvent], Awaitable[None]] | |
| 28 | +ConfirmationHandler = Callable[[str, str, str], Awaitable[bool]] | None | |
| 29 | + | |
| 30 | + | |
| 31 | +@dataclass | |
| 32 | +class AssistantTurn: | |
| 33 | + """Assistant output for one iteration of the conversation loop.""" | |
| 34 | + | |
| 35 | + content: str | |
| 36 | + response_content: str | |
| 37 | + tool_calls: list[ToolCall] | |
| 38 | + pending_tool_calls_seen: set[str] = field(default_factory=set) | |
| 39 | + usage: dict[str, int] = field(default_factory=dict) | |
| 40 | + | |
| 41 | + | |
| 42 | +class ConversationRuntime: | |
| 43 | + """Runs one explicit conversation turn against the current session.""" | |
| 44 | + | |
| 45 | + def __init__(self, agent: Any) -> None: | |
| 46 | + self.agent = agent | |
| 47 | + self.tracer = RuntimeTracer() | |
| 48 | + self.executor = ToolExecutor(agent.registry, agent.safeguards, self.tracer) | |
| 49 | + | |
| 50 | + async def run_turn( | |
| 51 | + self, | |
| 52 | + task: str, | |
| 53 | + emit: EventSink, | |
| 54 | + on_confirmation: ConfirmationHandler = None, | |
| 55 | + original_task: str | None = None, | |
| 56 | + ) -> TurnSummary: | |
| 57 | + """Run one task turn and return a structured summary.""" | |
| 58 | + | |
| 59 | + iterations = 0 | |
| 60 | + final_response = "" | |
| 61 | + actions_taken: list[str] = [] | |
| 62 | + continuation_count = 0 | |
| 63 | + empty_retry_count = 0 | |
| 64 | + max_empty_retries = 5 | |
| 65 | + extracted_iterations = 0 | |
| 66 | + max_extracted_iterations = 3 | |
| 67 | + consecutive_errors = 0 | |
| 68 | + | |
| 69 | + complexity = estimate_complexity(task) | |
| 70 | + max_tokens, _ = get_token_budget(complexity) | |
| 71 | + effective_max_tokens = min(self.agent.config.max_tokens, max(max_tokens, 512)) | |
| 72 | + | |
| 73 | + rollback_plan = RollbackPlan() if self.agent.config.reasoning.rollback else None | |
| 74 | + summary = TurnSummary(final_response="") | |
| 75 | + | |
| 76 | + while iterations < self.agent.config.max_iterations: | |
| 77 | + iterations += 1 | |
| 78 | + summary.iterations = iterations | |
| 79 | + self.tracer.record("turn.iteration_started", iteration=iterations) | |
| 80 | + | |
| 81 | + if iterations == 1 and len(self.agent.messages) == 1: | |
| 82 | + task_lower = task.lower() | |
| 83 | + action_keywords = [ | |
| 84 | + "create", | |
| 85 | + "write", | |
| 86 | + "make", | |
| 87 | + "run", | |
| 88 | + "execute", | |
| 89 | + "build", | |
| 90 | + "install", | |
| 91 | + "delete", | |
| 92 | + "remove", | |
| 93 | + "add", | |
| 94 | + "edit", | |
| 95 | + "modify", | |
| 96 | + "update", | |
| 97 | + "fix", | |
| 98 | + ] | |
| 99 | + if any(keyword in task_lower for keyword in action_keywords): | |
| 100 | + self.agent.session.append(Message(role=Role.ASSISTANT, content="[")) | |
| 101 | + | |
| 102 | + steering_messages = self.agent._drain_steering_queue() | |
| 103 | + for steering_message in steering_messages: | |
| 104 | + await emit(AgentEvent(type="steering", content=steering_message)) | |
| 105 | + self.agent.session.append( | |
| 106 | + Message( | |
| 107 | + role=Role.USER, | |
| 108 | + content=f"[USER INTERRUPTION]: {steering_message}", | |
| 109 | + ) | |
| 110 | + ) | |
| 111 | + | |
| 112 | + await emit(AgentEvent(type="thinking")) | |
| 113 | + assistant_turn = await self._request_assistant_turn( | |
| 114 | + emit=emit, | |
| 115 | + max_tokens=effective_max_tokens, | |
| 116 | + ) | |
| 117 | + self._merge_usage(summary.usage, assistant_turn.usage) | |
| 118 | + | |
| 119 | + content = assistant_turn.content | |
| 120 | + response_content = assistant_turn.response_content | |
| 121 | + tool_calls = list(assistant_turn.tool_calls) | |
| 122 | + pending_tool_calls_seen = set(assistant_turn.pending_tool_calls_seen) | |
| 123 | + | |
| 124 | + if not content.strip(): | |
| 125 | + empty_retry_count += 1 | |
| 126 | + if empty_retry_count <= max_empty_retries: | |
| 127 | + task_context = original_task or task | |
| 128 | + retry_prompts = [ | |
| 129 | + "Great! Now let me proceed with the task. I'll start by using my tools.", | |
| 130 | + "I understand. Let me create that now using my tools (write, bash, etc.).", | |
| 131 | + f"Proceeding with: {task_context[:80]}. I'll use the write tool to create the files.", | |
| 132 | + "Starting now. First step: create the necessary files and directories.", | |
| 133 | + f"Let me complete this task step by step. The goal is: {task_context[:100]}", | |
| 134 | + ] | |
| 135 | + prompt = retry_prompts[min(empty_retry_count - 1, len(retry_prompts) - 1)] | |
| 136 | + self.agent.session.append(Message(role=Role.ASSISTANT, content=prompt)) | |
| 137 | + continue | |
| 138 | + | |
| 139 | + final_response = ( | |
| 140 | + "I need a bit more direction. What specifically would you like me to create or do?" | |
| 141 | + ) | |
| 142 | + summary.final_response = final_response | |
| 143 | + summary.failures.append("assistant returned empty output repeatedly") | |
| 144 | + await emit(AgentEvent(type="response", content=final_response)) | |
| 145 | + break | |
| 146 | + | |
| 147 | + if self.agent.use_react: | |
| 148 | + parsed = parse_tool_calls(content) | |
| 149 | + tool_calls = parsed.tool_calls | |
| 150 | + content = parsed.content | |
| 151 | + | |
| 152 | + if parsed.is_final_answer and not tool_calls: | |
| 153 | + assistant_message = Message(role=Role.ASSISTANT, content=response_content) | |
| 154 | + self.agent.session.append(assistant_message) | |
| 155 | + summary.assistant_messages.append(assistant_message) | |
| 156 | + final_response = content | |
| 157 | + summary.final_response = final_response | |
| 158 | + self.tracer.record("turn.completed", reason="final_answer") | |
| 159 | + await emit(AgentEvent(type="response", content=final_response)) | |
| 160 | + break | |
| 161 | + | |
| 162 | + tool_source = "native" | |
| 163 | + if not tool_calls: | |
| 164 | + raw_tool_calls = self.agent._extract_raw_json_tool_calls(response_content) | |
| 165 | + if raw_tool_calls: | |
| 166 | + tool_calls = raw_tool_calls | |
| 167 | + tool_source = "raw_text" | |
| 168 | + await emit(AgentEvent(type="clear_stream")) | |
| 169 | + | |
| 170 | + if tool_calls: | |
| 171 | + if tool_source == "raw_text": | |
| 172 | + extracted_iterations += 1 | |
| 173 | + if extracted_iterations > max_extracted_iterations: | |
| 174 | + final_response = ( | |
| 175 | + content | |
| 176 | + + "\n\nLet me know if you'd like me to continue or make changes." | |
| 177 | + ) | |
| 178 | + assistant_message = Message(role=Role.ASSISTANT, content=response_content) | |
| 179 | + self.agent.session.append(assistant_message) | |
| 180 | + summary.assistant_messages.append(assistant_message) | |
| 181 | + summary.final_response = final_response | |
| 182 | + summary.failures.append("raw tool extraction exceeded iteration budget") | |
| 183 | + await emit(AgentEvent(type="response", content=final_response)) | |
| 184 | + break | |
| 185 | + | |
| 186 | + assistant_message = Message( | |
| 187 | + role=Role.ASSISTANT, | |
| 188 | + content=response_content, | |
| 189 | + tool_calls=tool_calls, | |
| 190 | + ) | |
| 191 | + self.agent.session.append(assistant_message) | |
| 192 | + summary.assistant_messages.append(assistant_message) | |
| 193 | + self.tracer.record( | |
| 194 | + "assistant.tool_batch", | |
| 195 | + tool_count=len(tool_calls), | |
| 196 | + source=tool_source, | |
| 197 | + ) | |
| 198 | + | |
| 199 | + for tool_call in tool_calls: | |
| 200 | + cfg = self.agent.config.reasoning | |
| 201 | + | |
| 202 | + if cfg.confidence_scoring: | |
| 203 | + context = "\n".join( | |
| 204 | + message.content[:500] | |
| 205 | + for message in self.agent.messages[-5:] | |
| 206 | + if message.content | |
| 207 | + ) | |
| 208 | + confidence = await self.agent._assess_confidence( | |
| 209 | + tool_call.name, | |
| 210 | + tool_call.arguments, | |
| 211 | + context, | |
| 212 | + ) | |
| 213 | + await emit( | |
| 214 | + AgentEvent( | |
| 215 | + type="confidence", | |
| 216 | + content=f"Confidence: {confidence.level.name} ({confidence.score}/5)", | |
| 217 | + confidence=confidence, | |
| 218 | + tool_name=tool_call.name, | |
| 219 | + ) | |
| 220 | + ) | |
| 221 | + if confidence.score < cfg.min_confidence_for_action: | |
| 222 | + low_confidence_message = ( | |
| 223 | + "[LOW CONFIDENCE WARNING] The planned action has low confidence " | |
| 224 | + f"({confidence.level.name}).\n" | |
| 225 | + f"Reasoning: {confidence.reasoning}\n" | |
| 226 | + f"Risks: {', '.join(confidence.risks)}\n" | |
| 227 | + "Consider an alternative approach or gather more information first." | |
| 228 | + ) | |
| 229 | + self.agent.session.append( | |
| 230 | + Message(role=Role.USER, content=low_confidence_message) | |
| 231 | + ) | |
| 232 | + continue | |
| 233 | + | |
| 234 | + if tool_call.id not in pending_tool_calls_seen: | |
| 235 | + await emit( | |
| 236 | + AgentEvent( | |
| 237 | + type="tool_call", | |
| 238 | + tool_name=tool_call.name, | |
| 239 | + tool_args=tool_call.arguments, | |
| 240 | + ) | |
| 241 | + ) | |
| 242 | + | |
| 243 | + actions_taken.append(f"{tool_call.name}: {str(tool_call.arguments)[:100]}") | |
| 244 | + | |
| 245 | + if rollback_plan and is_destructive_tool(tool_call.name, tool_call.arguments): | |
| 246 | + | |
| 247 | + async def read_file_for_backup(path: str) -> str: | |
| 248 | + read_result = await self.agent.registry.execute("read", file_path=path) | |
| 249 | + return read_result.output if not read_result.is_error else "" | |
| 250 | + | |
| 251 | + rollback_action = await create_rollback_plan_for_action( | |
| 252 | + tool_call.name, | |
| 253 | + tool_call.arguments, | |
| 254 | + read_file_for_backup, | |
| 255 | + ) | |
| 256 | + if rollback_action: | |
| 257 | + rollback_plan.actions.append(rollback_action) | |
| 258 | + if self.agent.config.reasoning.show_rollback_plan: | |
| 259 | + await emit( | |
| 260 | + AgentEvent( | |
| 261 | + type="rollback", | |
| 262 | + content=f"Rollback tracked: {rollback_action.description}", | |
| 263 | + rollback_action=rollback_action, | |
| 264 | + ) | |
| 265 | + ) | |
| 266 | + | |
| 267 | + outcome = await self.executor.execute_tool_call( | |
| 268 | + tool_call, | |
| 269 | + on_confirmation=on_confirmation, | |
| 270 | + emit_confirmation=self._emit_confirmation(emit), | |
| 271 | + source=tool_source, | |
| 272 | + ) | |
| 273 | + | |
| 274 | + if ( | |
| 275 | + outcome.state == ToolExecutionState.EXECUTED | |
| 276 | + and outcome.is_error | |
| 277 | + and self.agent.config.auto_recover | |
| 278 | + ): | |
| 279 | + recovery_result = await self._handle_recovery(tool_call, outcome, emit) | |
| 280 | + if recovery_result is not None: | |
| 281 | + summary.tool_result_messages.append(recovery_result) | |
| 282 | + self.agent.session.append(recovery_result) | |
| 283 | + continue | |
| 284 | + | |
| 285 | + if outcome.state == ToolExecutionState.EXECUTED and not outcome.is_error: | |
| 286 | + self.agent._recovery_context = None | |
| 287 | + is_loop, loop_description = self.agent.safeguards.detect_loop() | |
| 288 | + if is_loop: | |
| 289 | + final_response = ( | |
| 290 | + "I noticed I was repeating the same actions. " | |
| 291 | + "Let me know what you'd like me to do differently." | |
| 292 | + ) | |
| 293 | + summary.final_response = final_response | |
| 294 | + summary.failures.append(loop_description) | |
| 295 | + loop_message = Message(role=Role.ASSISTANT, content=final_response) | |
| 296 | + self.agent.session.append(loop_message) | |
| 297 | + summary.assistant_messages.append(loop_message) | |
| 298 | + await emit( | |
| 299 | + AgentEvent( | |
| 300 | + type="error", | |
| 301 | + content=( | |
| 302 | + f"Loop detected: {loop_description}. " | |
| 303 | + "Stopping to prevent repetitive behavior." | |
| 304 | + ), | |
| 305 | + ) | |
| 306 | + ) | |
| 307 | + await emit(AgentEvent(type="response", content=final_response)) | |
| 308 | + return self._finalize_summary(summary) | |
| 309 | + | |
| 310 | + if outcome.is_error: | |
| 311 | + consecutive_errors += 1 | |
| 312 | + else: | |
| 313 | + consecutive_errors = 0 | |
| 314 | + | |
| 315 | + await emit( | |
| 316 | + AgentEvent( | |
| 317 | + type="tool_result", | |
| 318 | + content=outcome.event_content, | |
| 319 | + tool_name=tool_call.name, | |
| 320 | + is_error=outcome.is_error, | |
| 321 | + ) | |
| 322 | + ) | |
| 323 | + | |
| 324 | + if ( | |
| 325 | + cfg.verification | |
| 326 | + and outcome.state == ToolExecutionState.EXECUTED | |
| 327 | + and not outcome.is_error | |
| 328 | + ): | |
| 329 | + verification = await self.agent._verify_action( | |
| 330 | + tool_call.name, | |
| 331 | + tool_call.arguments, | |
| 332 | + outcome.result_output, | |
| 333 | + ) | |
| 334 | + await emit( | |
| 335 | + AgentEvent( | |
| 336 | + type="verification", | |
| 337 | + content=f"Verified: {verification.verified}", | |
| 338 | + verification=verification, | |
| 339 | + tool_name=tool_call.name, | |
| 340 | + ) | |
| 341 | + ) | |
| 342 | + if not verification.verified and verification.needs_correction: | |
| 343 | + correction_message = ( | |
| 344 | + "[VERIFICATION FAILED] The action did not produce expected results.\n" | |
| 345 | + f"Discrepancies: {', '.join(verification.discrepancies)}\n" | |
| 346 | + f"Suggestion: {verification.correction_suggestion}" | |
| 347 | + ) | |
| 348 | + self.agent.session.append( | |
| 349 | + Message(role=Role.USER, content=correction_message) | |
| 350 | + ) | |
| 351 | + continue | |
| 352 | + | |
| 353 | + self.agent.session.append(outcome.message) | |
| 354 | + summary.tool_result_messages.append(outcome.message) | |
| 355 | + | |
| 356 | + if consecutive_errors >= 3: | |
| 357 | + final_response = ( | |
| 358 | + "I ran into some issues. Let me know if you'd like me to try a different approach." | |
| 359 | + ) | |
| 360 | + summary.final_response = final_response | |
| 361 | + summary.failures.append("three consecutive tool errors") | |
| 362 | + await emit(AgentEvent(type="response", content=final_response)) | |
| 363 | + break | |
| 364 | + | |
| 365 | + continue | |
| 366 | + | |
| 367 | + if self.agent._contains_unexecuted_code(response_content): | |
| 368 | + if iterations < self.agent.config.max_iterations - 1: | |
| 369 | + self.agent.session.append(Message(role=Role.ASSISTANT, content=response_content)) | |
| 370 | + self.agent.session.append( | |
| 371 | + Message( | |
| 372 | + role=Role.USER, | |
| 373 | + content=( | |
| 374 | + "CRITICAL ERROR: You are PRETENDING to use tools instead of actually " | |
| 375 | + "using them.\n\n" | |
| 376 | + "DO NOT write:\n" | |
| 377 | + "- 'Used bash tool with command...' (THIS IS FAKE)\n" | |
| 378 | + "- 'Created a file using the write tool...' (THIS IS FAKE)\n" | |
| 379 | + "- 'Here is what I did:' followed by descriptions\n" | |
| 380 | + "- Numbered steps or instructions\n" | |
| 381 | + "- Code blocks for me to copy\n\n" | |
| 382 | + "Your tool calls MUST go through the proper tool interface.\n" | |
| 383 | + "Writing 'Used bash tool...' does NOT execute anything!\n\n" | |
| 384 | + "ACTUALLY call the tools using the tool_call mechanism.\n" | |
| 385 | + "DO IT NOW - stop narrating and start executing." | |
| 386 | + ), | |
| 387 | + ) | |
| 388 | + ) | |
| 389 | + continue | |
| 390 | + | |
| 391 | + if ( | |
| 392 | + not self.agent.use_react | |
| 393 | + and len(actions_taken) == 0 | |
| 394 | + and iterations < self.agent.config.max_iterations - 2 | |
| 395 | + ): | |
| 396 | + deflection_phrases = ["you can", "you should", "you could", "try running"] | |
| 397 | + if any(phrase in content.lower() for phrase in deflection_phrases): | |
| 398 | + self.agent.session.append(Message(role=Role.ASSISTANT, content=response_content)) | |
| 399 | + self.agent.session.append( | |
| 400 | + Message( | |
| 401 | + role=Role.USER, | |
| 402 | + content="Please use your tools to execute the task rather than telling me what to do.", | |
| 403 | + ) | |
| 404 | + ) | |
| 405 | + continue | |
| 406 | + | |
| 407 | + cfg = self.agent.config.reasoning | |
| 408 | + if cfg.self_critique and len(content) > 100: | |
| 409 | + is_code_response = "```" in content or any( | |
| 410 | + keyword in content.lower() | |
| 411 | + for keyword in ["def ", "function ", "class ", "import "] | |
| 412 | + ) | |
| 413 | + if should_self_critique(content, is_code=is_code_response): | |
| 414 | + critique = await self.agent._self_critique(content, task) | |
| 415 | + await emit( | |
| 416 | + AgentEvent( | |
| 417 | + type="critique", | |
| 418 | + content=f"Self-critique: {len(critique.issues_found)} issues found", | |
| 419 | + critique=critique, | |
| 420 | + ) | |
| 421 | + ) | |
| 422 | + if critique.can_revise(): | |
| 423 | + revision_message = ( | |
| 424 | + "[SELF-CRITIQUE] Review your response:\n" | |
| 425 | + f"Issues found: {', '.join(critique.issues_found)}\n" | |
| 426 | + f"Suggestions: {', '.join(critique.suggestions)}\n\n" | |
| 427 | + "Please provide an improved response addressing these issues." | |
| 428 | + ) | |
| 429 | + self.agent.session.append(Message(role=Role.ASSISTANT, content=response_content)) | |
| 430 | + self.agent.session.append(Message(role=Role.USER, content=revision_message)) | |
| 431 | + critique.revision_count += 1 | |
| 432 | + continue | |
| 433 | + | |
| 434 | + is_text_loop, loop_description = self.agent.safeguards.detect_text_loop(content) | |
| 435 | + if is_text_loop: | |
| 436 | + final_response = ( | |
| 437 | + "I seem to be repeating myself. Let me know if you'd like me to try a different approach." | |
| 438 | + ) | |
| 439 | + summary.final_response = final_response | |
| 440 | + summary.failures.append(loop_description) | |
| 441 | + final_message = Message(role=Role.ASSISTANT, content=final_response) | |
| 442 | + self.agent.session.append(final_message) | |
| 443 | + summary.assistant_messages.append(final_message) | |
| 444 | + await emit( | |
| 445 | + AgentEvent( | |
| 446 | + type="error", | |
| 447 | + content=f"Text loop detected: {loop_description}. Stopping.", | |
| 448 | + ) | |
| 449 | + ) | |
| 450 | + await emit(AgentEvent(type="response", content=final_response)) | |
| 451 | + return self._finalize_summary(summary) | |
| 452 | + | |
| 453 | + self.agent.safeguards.record_response(content) | |
| 454 | + effective_task = original_task or task | |
| 455 | + if cfg.completion_check and continuation_count < cfg.max_continuation_prompts: | |
| 456 | + is_premature = ( | |
| 457 | + detect_premature_completion(effective_task, content, actions_taken) | |
| 458 | + if cfg.use_quick_completion | |
| 459 | + else False | |
| 460 | + ) | |
| 461 | + if is_premature: | |
| 462 | + continuation_count += 1 | |
| 463 | + continuation_prompt = get_continuation_prompt( | |
| 464 | + effective_task, | |
| 465 | + actions_taken, | |
| 466 | + content, | |
| 467 | + ) | |
| 468 | + await emit( | |
| 469 | + AgentEvent( | |
| 470 | + type="completion_check", | |
| 471 | + content=f"Task may be incomplete ({len(actions_taken)} actions taken)", | |
| 472 | + completion_check=TaskCompletionCheck( | |
| 473 | + original_task=effective_task, | |
| 474 | + is_complete=False, | |
| 475 | + accomplished=[action.split(":")[0] for action in actions_taken], | |
| 476 | + continuation_prompt=continuation_prompt, | |
| 477 | + ), | |
| 478 | + ) | |
| 479 | + ) | |
| 480 | + self.agent.session.append(Message(role=Role.ASSISTANT, content=response_content)) | |
| 481 | + self.agent.session.append(Message(role=Role.USER, content=continuation_prompt)) | |
| 482 | + continue | |
| 483 | + | |
| 484 | + final_response = content | |
| 485 | + if actions_taken and final_response.strip() and not final_response.rstrip().endswith("?"): | |
| 486 | + final_response = ( | |
| 487 | + final_response.rstrip() | |
| 488 | + + "\n\nWould you like me to make any changes or additions?" | |
| 489 | + ) | |
| 490 | + | |
| 491 | + final_message = Message(role=Role.ASSISTANT, content=response_content) | |
| 492 | + self.agent.session.append(final_message) | |
| 493 | + summary.assistant_messages.append(final_message) | |
| 494 | + | |
| 495 | + if rollback_plan and rollback_plan.actions: | |
| 496 | + await emit( | |
| 497 | + AgentEvent( | |
| 498 | + type="rollback_summary", | |
| 499 | + content=f"Rollback plan: {len(rollback_plan.actions)} action(s) tracked", | |
| 500 | + rollback_plan=rollback_plan, | |
| 501 | + ) | |
| 502 | + ) | |
| 503 | + | |
| 504 | + summary.final_response = final_response | |
| 505 | + await emit(AgentEvent(type="response", content=final_response)) | |
| 506 | + break | |
| 507 | + | |
| 508 | + return self._finalize_summary(summary) | |
| 509 | + | |
    async def _request_assistant_turn(
        self,
        *,
        emit: EventSink,
        max_tokens: int,
    ) -> AssistantTurn:
        """Request one assistant completion from the backend and collect the turn.

        Handles both the streaming and blocking paths: emits ``stream`` and early
        ``tool_call`` events while streaming, applies the safeguards content
        filter, and forwards any steering message into the agent's steering
        queue. Returns an :class:`AssistantTurn` with the content and tool calls.
        """
        # Reset the per-turn code filter so stream filtering starts fresh.
        self.agent.safeguards.code_filter.reset()
        # In ReAct mode tool schemas live in the prompt, so none are passed to the API.
        tools = None if self.agent.use_react else self.agent.registry.get_schemas()
        self.tracer.record(
            "assistant.requested",
            use_react=self.agent.use_react,
            stream=self.agent.config.stream,
            max_tokens=max_tokens,
        )

        if self.agent.config.stream:
            full_content = ""
            full_content_unfiltered = ""
            tool_calls: list[ToolCall] = []
            pending_tool_calls_seen: set[str] = set()

            async for chunk in self.agent.backend.stream(
                messages=self.agent.session.build_request_messages(),
                tools=tools,
                temperature=self.agent.config.temperature,
                max_tokens=max_tokens,
            ):
                filtered_content = ""
                if chunk.content:
                    filtered_content = self.agent.safeguards.filter_stream_chunk(chunk.content)
                    full_content_unfiltered += chunk.content

                # Emit filtered text as it arrives; always emit the terminal chunk
                # so consumers observe is_stream_end even when its content is empty.
                if filtered_content or chunk.is_done:
                    await emit(
                        AgentEvent(
                            type="stream",
                            content=filtered_content,
                            is_stream_end=chunk.is_done,
                        )
                    )

                # Mid-stream steering: hand any pending guidance to the agent's queue.
                if self.agent.safeguards.should_steer():
                    steering_message = self.agent.safeguards.get_steering_message()
                    if steering_message:
                        self.agent._steering_queue.put_nowait(steering_message)

                # Surface each tool call once, as soon as the backend announces it.
                if chunk.pending_tool_call and chunk.pending_tool_call.id not in pending_tool_calls_seen:
                    pending_tool_calls_seen.add(chunk.pending_tool_call.id)
                    await emit(
                        AgentEvent(
                            type="tool_call",
                            tool_name=chunk.pending_tool_call.name,
                            tool_args=chunk.pending_tool_call.arguments,
                        )
                    )

                if chunk.is_done:
                    # Prefer the backend's assembled transcript; fall back to our
                    # own accumulation of raw chunk content.
                    full_content = chunk.full_content or full_content_unfiltered
                    tool_calls = chunk.tool_calls

            self.tracer.record(
                "assistant.responded",
                stream=True,
                tool_call_count=len(tool_calls),
                content_length=len(full_content),
            )
            # NOTE(review): in the streaming path both content fields carry the
            # unfiltered transcript (filtering was applied per-chunk for display
            # only), while the blocking path below returns filtered `content` --
            # confirm this asymmetry is intended.
            return AssistantTurn(
                content=full_content,
                response_content=full_content,
                tool_calls=tool_calls,
                pending_tool_calls_seen=pending_tool_calls_seen,
            )

        # Blocking (non-streaming) completion path.
        response = await self.agent.backend.complete(
            messages=self.agent.session.build_request_messages(),
            tools=tools,
            temperature=self.agent.config.temperature,
            max_tokens=max_tokens,
        )
        # Preserve the raw content separately from the filtered copy.
        response_content = response.content
        content = self.agent.safeguards.filter_complete_content(response.content)
        # ReAct mode parses tool calls out of the text elsewhere, so drop API ones.
        tool_calls = response.tool_calls if not self.agent.use_react else []
        if self.agent.safeguards.should_steer():
            steering_message = self.agent.safeguards.get_steering_message()
            if steering_message:
                self.agent._steering_queue.put_nowait(steering_message)
        self.tracer.record(
            "assistant.responded",
            stream=False,
            tool_call_count=len(tool_calls),
            content_length=len(content),
        )
        return AssistantTurn(
            content=content,
            response_content=response_content,
            tool_calls=tool_calls,
            usage=response.usage,
        )
| 608 | + | |
| 609 | + async def _handle_recovery( | |
| 610 | + self, | |
| 611 | + tool_call: ToolCall, | |
| 612 | + outcome, | |
| 613 | + emit: EventSink, | |
| 614 | + ) -> Message | None: | |
| 615 | + if self.agent._recovery_context is None: | |
| 616 | + self.agent._recovery_context = RecoveryContext( | |
| 617 | + original_tool=tool_call.name, | |
| 618 | + original_args=tool_call.arguments, | |
| 619 | + max_retries=self.agent.config.max_recovery_attempts, | |
| 620 | + ) | |
| 621 | + | |
| 622 | + if self.agent._recovery_context.is_similar_attempt(tool_call.name, tool_call.arguments): | |
| 623 | + await emit( | |
| 624 | + AgentEvent( | |
| 625 | + type="error", | |
| 626 | + content=( | |
| 627 | + "Loop detected: already tried a similar command. " | |
| 628 | + "Try a DIFFERENT approach (e.g., read a config file first)." | |
| 629 | + ), | |
| 630 | + tool_name=tool_call.name, | |
| 631 | + ) | |
| 632 | + ) | |
| 633 | + else: | |
| 634 | + self.agent._recovery_context.add_attempt( | |
| 635 | + tool_call.name, | |
| 636 | + tool_call.arguments, | |
| 637 | + outcome.result_output, | |
| 638 | + ) | |
| 639 | + | |
| 640 | + if self.agent._recovery_context.can_retry(): | |
| 641 | + attempt_number = len(self.agent._recovery_context.attempts) | |
| 642 | + await emit( | |
| 643 | + AgentEvent( | |
| 644 | + type="recovery", | |
| 645 | + content=( | |
| 646 | + "Tool failed, attempting recovery " | |
| 647 | + f"({attempt_number}/{self.agent._recovery_context.max_retries})" | |
| 648 | + ), | |
| 649 | + tool_name=tool_call.name, | |
| 650 | + recovery_attempt=attempt_number, | |
| 651 | + ) | |
| 652 | + ) | |
| 653 | + recovery_prompt = format_recovery_prompt( | |
| 654 | + self.agent._recovery_context, | |
| 655 | + tool_call.name, | |
| 656 | + tool_call.arguments, | |
| 657 | + outcome.result_output, | |
| 658 | + ) | |
| 659 | + return Message.tool_result_message( | |
| 660 | + tool_call_id=tool_call.id, | |
| 661 | + display_content=recovery_prompt, | |
| 662 | + result_content=recovery_prompt, | |
| 663 | + is_error=True, | |
| 664 | + ) | |
| 665 | + | |
| 666 | + failure_message = format_failure_message(self.agent._recovery_context) | |
| 667 | + await emit( | |
| 668 | + AgentEvent( | |
| 669 | + type="error", | |
| 670 | + content=failure_message, | |
| 671 | + tool_name=tool_call.name, | |
| 672 | + ) | |
| 673 | + ) | |
| 674 | + self.agent._recovery_context = None | |
| 675 | + return Message.tool_result_message( | |
| 676 | + tool_call_id=tool_call.id, | |
| 677 | + display_content=( | |
| 678 | + f"Observation [{tool_call.name}]: Error: {failure_message}" | |
| 679 | + ), | |
| 680 | + result_content=failure_message, | |
| 681 | + is_error=True, | |
| 682 | + ) | |
| 683 | + | |
| 684 | + def _finalize_summary(self, summary: TurnSummary) -> TurnSummary: | |
| 685 | + summary.trace = list(self.tracer.events) | |
| 686 | + return summary | |
| 687 | + | |
| 688 | + @staticmethod | |
| 689 | + def _merge_usage(target: dict[str, int], update: dict[str, int]) -> None: | |
| 690 | + for key, value in update.items(): | |
| 691 | + target[key] = target.get(key, 0) + value | |
| 692 | + | |
| 693 | + @staticmethod | |
| 694 | + def _emit_confirmation(emit: EventSink): | |
| 695 | + async def _emit(tool_name: str, message: str, details: str) -> None: | |
| 696 | + await emit( | |
| 697 | + AgentEvent( | |
| 698 | + type="confirmation", | |
| 699 | + tool_name=tool_name, | |
| 700 | + confirm_message=message, | |
| 701 | + confirm_details=details, | |
| 702 | + ) | |
| 703 | + ) | |
| 704 | + | |
| 705 | + return _emit | |
src/loader/runtime/executor.py (added) @@ -0,0 +1,217 @@
| 1 | +"""Unified tool execution path for runtime turns.""" | |
| 2 | + | |
| 3 | +from __future__ import annotations | |
| 4 | + | |
| 5 | +from collections.abc import Awaitable, Callable | |
| 6 | +from dataclasses import dataclass | |
| 7 | +from enum import StrEnum | |
| 8 | + | |
| 9 | +from ..agent.parsing import format_tool_result | |
| 10 | +from ..agent.recovery import ErrorCategory, categorize_error | |
| 11 | +from ..llm.base import Message, ToolCall | |
| 12 | +from ..tools.base import ConfirmationRequired, ToolRegistry | |
| 13 | +from ..tools.base import ToolResult as RegistryToolResult | |
| 14 | +from .tracing import RuntimeTracer | |
| 15 | + | |
# Optional async approval callback: (tool_name, message, details) -> confirmed?
BrowserConfirmation = Callable[[str, str, str], Awaitable[bool]] | None
# Optional async notifier used to surface a confirmation prompt to the UI.
ConfirmationEmitter = Callable[[str, str, str], Awaitable[None]] | None
| 18 | + | |
| 19 | + | |
class ToolExecutionState(StrEnum):
    """Outcome states for one tool call.

    String-valued so states serialize cleanly into trace events and logs.
    """

    EXECUTED = "executed"    # ran through the registry (may still be an error result)
    DUPLICATE = "duplicate"  # skipped: safeguards flagged it as a repeat action
    BLOCKED = "blocked"      # refused before execution (validation or browser command)
    DECLINED = "declined"    # user rejected the confirmation prompt
| 27 | + | |
| 28 | + | |
@dataclass
class ToolExecutionOutcome:
    """Structured outcome for one tool call."""

    # The call that was (or was not) executed.
    tool_call: ToolCall
    # How the call terminated; see ToolExecutionState.
    state: ToolExecutionState
    # Tool-result message ready to append to the conversation session.
    message: Message
    # Raw text to attach to the emitted tool_result event.
    event_content: str
    # True when the underlying result (or the block/validation) was an error.
    is_error: bool
    # Plain tool output (or the block/skip message when nothing executed).
    result_output: str
    # Error classification; only set when is_error is True.
    error_category: ErrorCategory | None = None
    # The registry's original result; None when execution never happened.
    registry_result: RegistryToolResult | None = None
| 41 | + | |
| 42 | + | |
class ToolExecutor:
    """Centralizes duplicate checks, validation, execution, and result messages.

    Every tool call flows through :meth:`execute_tool_call`, which applies the
    safeguard gates (browser-command block, duplicate check, validation), runs
    the registry, and packages the result into a :class:`ToolExecutionOutcome`,
    tracing each step along the way.
    """

    def __init__(self, registry: ToolRegistry, safeguards, tracer: RuntimeTracer) -> None:
        # `safeguards` is duck-typed: it must provide check_duplicate,
        # validate_action, and record_action (presumably RuntimeSafeguards
        # from agent.safeguards -- TODO confirm and annotate).
        self.registry = registry
        self.safeguards = safeguards
        self.tracer = tracer

    async def execute_tool_call(
        self,
        tool_call: ToolCall,
        *,
        on_confirmation: BrowserConfirmation = None,
        emit_confirmation: ConfirmationEmitter = None,
        source: str,
    ) -> ToolExecutionOutcome:
        """Execute a tool call through one consistent runtime path.

        Args:
            tool_call: The call to run.
            on_confirmation: Optional async approval callback; when absent, a
                confirmation request is auto-approved.
            emit_confirmation: Optional async notifier fired before approval.
            source: Label recorded in the trace for where the call came from.

        Returns:
            A ToolExecutionOutcome describing what happened; never raises for
            tool-level failures (those surface as is_error=True).
        """

        self.tracer.record(
            "tool.received",
            tool_name=tool_call.name,
            tool_call_id=tool_call.id,
            source=source,
        )

        # Gate 1: refuse bash commands that would open a browser/display.
        browser_block = self._browser_command_message(tool_call)
        if browser_block is not None:
            return self._blocked_outcome(tool_call, browser_block)

        # Gate 2: skip actions the safeguards consider exact repeats.
        is_duplicate, duplicate_reason = self.safeguards.check_duplicate(
            tool_call.name,
            tool_call.arguments,
        )
        if is_duplicate:
            self.tracer.record(
                "tool.duplicate",
                tool_name=tool_call.name,
                tool_call_id=tool_call.id,
                reason=duplicate_reason,
            )
            duplicate_message = f"[Skipped - duplicate action: {duplicate_reason}]"
            # Duplicates are reported as non-errors so the model is informed
            # without triggering error-recovery paths.
            return ToolExecutionOutcome(
                tool_call=tool_call,
                state=ToolExecutionState.DUPLICATE,
                message=Message.tool_result_message(
                    tool_call_id=tool_call.id,
                    display_content=duplicate_message,
                    result_content=duplicate_message,
                ),
                event_content=duplicate_message,
                is_error=False,
                result_output=duplicate_message,
            )

        # Gate 3: structural validation of the action itself.
        validation = self.safeguards.validate_action(tool_call.name, tool_call.arguments)
        if not validation.valid:
            error_message = f"[Blocked - {validation.reason}]"
            if validation.suggestion:
                error_message += f" Suggestion: {validation.suggestion}"
            self.tracer.record(
                "tool.blocked",
                tool_name=tool_call.name,
                tool_call_id=tool_call.id,
                reason=validation.reason,
            )
            return self._blocked_outcome(tool_call, error_message)

        # All gates passed: actually run the tool (with confirmation handling).
        result = await self._execute_registry(
            tool_call,
            on_confirmation,
            emit_confirmation,
        )
        result_text = format_tool_result(
            tool_call.name,
            result.output,
            result.is_error,
        )
        if not result.is_error:
            # NOTE(review): a declined result is also not an error, so declined
            # calls get recorded here and may later be skipped as duplicates --
            # confirm that is intended.
            self.safeguards.record_action(tool_call.name, tool_call.arguments)

        category = categorize_error(result.output) if result.is_error else None
        state = ToolExecutionState.EXECUTED
        # Decline detection matches the exact sentinel string produced by
        # _execute_registry below; keep the two in sync.
        if result.output == f"Tool {tool_call.name} was declined by user":
            state = ToolExecutionState.DECLINED

        self.tracer.record(
            "tool.executed",
            tool_name=tool_call.name,
            tool_call_id=tool_call.id,
            state=state,
            is_error=result.is_error,
        )
        return ToolExecutionOutcome(
            tool_call=tool_call,
            state=state,
            message=Message.tool_result_message(
                tool_call_id=tool_call.id,
                display_content=result_text,
                result_content=result.output,
                is_error=result.is_error,
            ),
            event_content=result.output,
            is_error=result.is_error,
            result_output=result.output,
            error_category=category,
            registry_result=result,
        )

    def _blocked_outcome(self, tool_call: ToolCall, message: str) -> ToolExecutionOutcome:
        """Build the error outcome for a call refused before execution."""
        return ToolExecutionOutcome(
            tool_call=tool_call,
            state=ToolExecutionState.BLOCKED,
            message=Message.tool_result_message(
                tool_call_id=tool_call.id,
                display_content=message,
                result_content=message,
                is_error=True,
            ),
            event_content=message,
            is_error=True,
            result_output=message,
            error_category=categorize_error(message),
        )

    async def _execute_registry(
        self,
        tool_call: ToolCall,
        on_confirmation: BrowserConfirmation,
        emit_confirmation: ConfirmationEmitter,
    ) -> RegistryToolResult:
        """Run the tool via the registry, resolving any confirmation request.

        Without an on_confirmation callback the request is auto-approved.
        A decline is reported as a non-error result with a sentinel message.
        """
        try:
            return await self.registry.execute(tool_call.name, **tool_call.arguments)
        except ConfirmationRequired as confirmation:
            self.tracer.record(
                "tool.confirmation_requested",
                tool_name=confirmation.tool_name,
                tool_call_id=tool_call.id,
            )
            # Notify listeners before asking for the decision.
            if emit_confirmation:
                await emit_confirmation(
                    confirmation.tool_name,
                    confirmation.message,
                    confirmation.details,
                )
            if on_confirmation:
                confirmed = await on_confirmation(
                    confirmation.tool_name,
                    confirmation.message,
                    confirmation.details,
                )
            else:
                confirmed = True

            if not confirmed:
                return RegistryToolResult(
                    output=f"Tool {tool_call.name} was declined by user",
                    is_error=False,
                )

            # Re-run with confirmation suppressed, restoring the flag even on
            # failure so one approval never leaks into later calls.
            previous_skip = self.registry.skip_confirmation
            self.registry.skip_confirmation = True
            try:
                return await self.registry.execute(tool_call.name, **tool_call.arguments)
            finally:
                self.registry.skip_confirmation = previous_skip

    def _browser_command_message(self, tool_call: ToolCall) -> str | None:
        """Return a block message when a bash call looks like a browser launch.

        Returns None for non-bash tools and for commands with no browser terms.
        """
        if tool_call.name != "bash":
            return None

        command = str(tool_call.arguments.get("command", ""))
        # Substring match is deliberately broad; note it can false-positive on
        # words containing "open " or "chrome" (e.g. paths) -- accepted as a
        # conservative block.
        browser_terms = ["xdg-open", "open ", "firefox", "chrome", "browser"]
        if any(term in command for term in browser_terms):
            return "[Blocked - Browser/display commands are not supported in the terminal runtime]"
        return None