`6e4f880`

fixes to agent behavior and more i forget

Authored by mfwolffe <wolffemf@dukes.jmu.edu> 4 months ago

SHA: 6e4f880d2032c3e49e6642c143ba919fdb197fb0
Parents: 833086d
Tree: f3fed0c

11 changed files

Status	File	+	-
A	`package-lock.json`	56	0
A	`package.json`	30	0
M	`src/loader/agent/loop.py`	61	20
M	`src/loader/agent/parsing.py`	50	0
M	`src/loader/agent/reasoning.py`	67	88
M	`src/loader/llm/base.py`	3	0
M	`src/loader/llm/ollama.py`	105	17
M	`src/loader/ui/adapter.py`	78	11
M	`src/loader/ui/app.py`	27	0
M	`src/loader/ui/widgets/streaming.py`	15	1
M	`tests/test_parsing.py`	31	0

package-lock.json added

 +{
 +  "name": "loader",
 +  "version": "1.0.0",
 +  "lockfileVersion": 3,
 +  "requires": true,
 +  "packages": {
 +    "": {
 +      "name": "loader",
 +      "version": "1.0.0",
 +      "license": "ISC",
 +      "dependencies": {
 +        "react": "^19.2.3",
 +        "react-dom": "^19.2.3",
 +        "react-parallax-tilt": "^1.7.315"
 +      },
 +      "devDependencies": {}
 +    },
 +    "node_modules/react": {
 +      "version": "19.2.3",
 +      "resolved": "https://registry.npmjs.org/react/-/react-19.2.3.tgz",
 +      "integrity": "sha512-Ku/hhYbVjOQnXDZFv2+RibmLFGwFdeeKHFcOTlrt7xplBnya5OGn/hIRDsqDiSUcfORsDC7MPxwork8jBwsIWA==",
 +      "license": "MIT",
 +      "engines": {
 +        "node": ">=0.10.0"
 +      }
 +    },
 +    "node_modules/react-dom": {
 +      "version": "19.2.3",
 +      "resolved": "https://registry.npmjs.org/react-dom/-/react-dom-19.2.3.tgz",
 +      "integrity": "sha512-yELu4WmLPw5Mr/lmeEpox5rw3RETacE++JgHqQzd2dg+YbJuat3jH4ingc+WPZhxaoFzdv9y33G+F7Nl5O0GBg==",
 +      "license": "MIT",
 +      "dependencies": {
 +        "scheduler": "^0.27.0"
 +      },
 +      "peerDependencies": {
 +        "react": "^19.2.3"
 +      }
 +    },
 +    "node_modules/react-parallax-tilt": {
 +      "version": "1.7.315",
 +      "resolved": "https://registry.npmjs.org/react-parallax-tilt/-/react-parallax-tilt-1.7.315.tgz",
 +      "integrity": "sha512-m0I2yPEmzEC+qGelF+8P+L60lH/S50OJE+pz1bVmurnkKNMyd2Q4qhtAi8zRibNkwFd6oOGvA8qEqAySBbAOJg==",
 +      "license": "MIT",
 +      "peerDependencies": {
 +        "react": "^15.0.0 || ^16.0.0 || ^17.0.0 || ^18.0.0 || ^19.0.0",
 +        "react-dom": "^15.0.0 || ^16.0.0 || ^17.0.0 || ^18.0.0 || ^19.0.0"
 +      }
 +    },
 +    "node_modules/scheduler": {
 +      "version": "0.27.0",
 +      "resolved": "https://registry.npmjs.org/scheduler/-/scheduler-0.27.0.tgz",
 +      "integrity": "sha512-eNv+WrVbKu1f3vbYJT/xtiF5syA5HPIMtf9IgY/nKg0sWqzAUEvqY/xm7OcZc/qafLx/iO9FgOmeSAp4v5ti/Q==",
 +      "license": "MIT"
 +    }
 +  }
 +}

package.json added

 +{
 +  "dependencies": {
 +    "react": "^19.2.3",
 +    "react-dom": "^19.2.3",
 +    "react-parallax-tilt": "^1.7.315"
 +  },
 +  "name": "loader",
 +  "version": "1.0.0",
 +  "description": "Local agentic coding assistant. Runs on your hardware with local LLMs.",
 +  "main": "index.js",
 +  "directories": {
 +    "doc": "docs",
 +    "test": "tests"
 +  },
 +  "scripts": {
 +    "test": "echo \"Error: no test specified\" && exit 1"
 +  },
 +  "repository": {
 +    "type": "git",
 +    "url": "git+https://github.com/tenseleyFlow/loader.git"
 +  },
 +  "keywords": [],
 +  "author": "",
 +  "license": "ISC",
 +  "type": "commonjs",
 +  "bugs": {
 +    "url": "https://github.com/tenseleyFlow/loader/issues"
 +  },
 +  "homepage": "https://github.com/tenseleyFlow/loader#readme"
 +}

src/loader/agent/loop.py modified

          # Check if backend supports native tools
          if hasattr(self.backend, "supports_native_tools"):
 -            self._use_react = not self.backend.supports_native_tools()
 +            supports_native = self.backend.supports_native_tools()
 +            self._use_react = not supports_native
 +            # Debug log
 +            try:
 +                with open("/tmp/loader_debug.log", "a") as f:
 +                    f.write(f"[loop] use_react: supports_native={supports_native}, use_react={self._use_react}\n")
 +            except Exception:
 +                pass
          else:
              # Default to ReAct for unknown backends
              self._use_react = True
              tools = None if self.use_react else self.registry.get_schemas()
              # Use streaming or regular completion
 +            pending_tool_calls_seen: set[str] = set()  # Track IDs of pending tool calls shown
              if self.config.stream:
                  full_content = ""
                  tool_calls: list[ToolCall] = []
                      temperature=self.config.temperature,
                      max_tokens=effective_max_tokens,
                  ):
 -                    if chunk.content:
 +                    # Emit stream events for content OR for final chunk (to signal end)
 +                    if chunk.content or chunk.is_done:
                          await emit(AgentEvent(
                              type="stream",
                              content=chunk.content,
                              is_stream_end=chunk.is_done,
                          ))
 +                    # Show pending tool calls as they're detected (ReAct mode interleaving)
 +                    if chunk.pending_tool_call and chunk.pending_tool_call.id not in pending_tool_calls_seen:
 +                        pending_tool_calls_seen.add(chunk.pending_tool_call.id)
 +                        await emit(AgentEvent(
 +                            type="tool_call",
 +                            tool_name=chunk.pending_tool_call.name,
 +                            tool_args=chunk.pending_tool_call.arguments,
 +                        ))
                      if chunk.is_done:
                          full_content = chunk.full_content or full_content
                          tool_calls = chunk.tool_calls
 +                        # Debug log
 +                        try:
 +                            with open("/tmp/loader_debug.log", "a") as f:
 +                                f.write(f"[loop] chunk.is_done: got {len(tool_calls)} tool_calls\n")
 +                        except Exception:
 +                            pass
                  content = full_content
                  response_content = full_content
              # If there are tool calls, execute them
              if tool_calls:
 +                # Debug log
 +                try:
 +                    with open("/tmp/loader_debug.log", "a") as f:
 +                        f.write(f"[loop] executing {len(tool_calls)} tool_calls\n")
 +                        for tc in tool_calls:
 +                            f.write(f"[loop]   - {tc.name}: id={tc.id}, args_keys={list(tc.arguments.keys())}\n")
 +                except Exception:
 +                    pass
++
                  # Add assistant message with tool calls
                  self.messages.append(Message(
                      role=Role.ASSISTANT,
                              ))
                              continue  # Skip this tool call, let LLM reconsider
 -                    await emit(AgentEvent(
 -                        type="tool_call",
 -                        tool_name=tool_call.name,
 -                        tool_args=tool_call.arguments,
 -                    ))
 +                    # Only emit tool_call if not already shown during streaming
 +                    if tool_call.id not in pending_tool_calls_seen:
 +                        try:
 +                            with open("/tmp/loader_debug.log", "a") as f:
 +                                f.write(f"[loop] emitting tool_call event for {tool_call.name}\n")
 +                        except Exception:
 +                            pass
 +                        await emit(AgentEvent(
 +                            type="tool_call",
 +                            tool_name=tool_call.name,
 +                            tool_args=tool_call.arguments,
 +                        ))
 +                    else:
 +                        try:
 +                            with open("/tmp/loader_debug.log", "a") as f:
 +                                f.write(f"[loop] SKIPPING tool_call event for {tool_call.name} (already in pending_seen)\n")
 +                        except Exception:
 +                            pass
                      # Track this action for completion checking
                      action_desc = f"{tool_call.name}: {str(tool_call.arguments)[:100]}"
                  ))
                  continue
 -            # No tool calls and early in the task - likely giving up too soon
 -            # This catches native mode models that stop without using tools
 -            if not self.use_react and len(actions_taken) < 5 and iterations < self.config.max_iterations - 2:
 -                # Check if response looks like a stopping point but we haven't done much
 -                stopping_phrases = [
 -                    "let me know", "feel free", "hope this", "happy to help",
 -                    "anything else", "is there", "that's", "all done", "complete",
 -                ]
 -                looks_like_stopping = any(p in content.lower() for p in stopping_phrases)
+-
 -                if looks_like_stopping or len(content) < 150:
 +            # No tool calls and early in the task - MAY be giving up too soon
 +            # But only intervene if we haven't done ANY work yet
 +            if not self.use_react and len(actions_taken) == 0 and iterations < self.config.max_iterations - 2:
 +                # Check if response looks like deflection without having done anything
 +                deflection_phrases = ["you can", "you should", "you could", "try running"]
 +                looks_like_deflection = any(p in content.lower() for p in deflection_phrases)
++
 +                if looks_like_deflection:
                      self.messages.append(Message(
                          role=Role.ASSISTANT,
                          content=response_content,
                      ))
                      self.messages.append(Message(
                          role=Role.USER,
 -                        content="You stopped without completing the task. Continue executing - "
 -                                "use your tools to finish the job. Don't describe what to do, DO IT.",
 +                        content="Please use your tools to execute the task rather than telling me what to do.",
                      ))
                      continue

src/loader/agent/parsing.py modified

      return {}
 +def _parse_bracket_args(args_str: str) -> dict:
 +    """Parse arguments from bracketed tool call format.
++
 +    Handles formats like:
 +        file_path=/tmp/test.txt, content="hello world"
 +        command="ls -la"
 +        file_path="test.py", old_string="foo", new_string="bar"
++
 +    Args:
 +        args_str: The arguments string (everything after "tool with:" or "tool:")
++
 +    Returns:
 +        Dictionary of parsed arguments
 +    """
 +    args = {}
++
 +    # Pattern to match key=value pairs where value can be:
 +    # - quoted string (single or double quotes)
 +    # - unquoted value (until comma or end)
 +    pattern = r'(\w+)\s*=\s*(?:"([^"]*?)"|\'([^\']*?)\'|([^,\]]+?))\s*(?:,|$)'
++
 +    for match in re.finditer(pattern, args_str):
 +        key = match.group(1)
 +        # Value is in one of the capture groups (2=double quoted, 3=single quoted, 4=unquoted)
 +        value = match.group(2) or match.group(3) or match.group(4)
 +        if value is not None:
 +            value = value.strip()
 +            args[key] = value
++
 +    return args
++
++
  def parse_tool_calls(text: str) -> ParsedResponse:
      """Parse tool calls from LLM text output.
          if tool_calls:
              content = re.sub(bare_json_pattern, "", content)
 +    # Pattern 3: Bracketed format [calls/USE tool with/: key=value, ...]
 +    # Examples:
 +    #   [calls write tool with: file_path=/tmp/test.txt, content="hello"]
 +    #   [USE bash tool: command="ls -la"]
 +    if not tool_calls:
 +        bracket_pattern = r'\[(?:calls|USE)\s+(\w+)\s+tool(?:\s+with)?[:\s]+([^\]]+)\]'
 +        for i, (name, args_str) in enumerate(re.findall(bracket_pattern, text, re.IGNORECASE)):
 +            args = _parse_bracket_args(args_str)
 +            if args:
 +                tool_calls.append(ToolCall(
 +                    id=f"call_{i}",
 +                    name=name.lower(),
 +                    arguments=args,
 +                ))
 +        # Remove bracketed tool calls from content
 +        if tool_calls:
 +            content = re.sub(bracket_pattern, "", content, flags=re.IGNORECASE)
++
      # Clean up content
      content = content.strip()

src/loader/agent/reasoning.py modified

      """Quick heuristic to detect if agent is stopping too early.
      Returns True if the agent might be giving up prematurely.
 +    This should be CONSERVATIVE - only trigger when really needed,
 +    not for simple tasks that are genuinely complete.
      """
      task_lower = task.lower()
      response_lower = response.lower()
 -    # Keywords that suggest the task should involve multiple steps
 -    multi_step_indicators = [
 -        "create a", "build a", "make a", "set up", "setup",
 -        "initialize", "scaffold", "generate", "implement",
 -        "project", "application", "app", "website", "api",
 -        "add", "write", "develop", "design", "help me",
 -    ]
 +    # If no actions taken at all and task requires action, that's premature
 +    if not actions_taken:
 +        # But only if this looks like an actionable task
 +        action_verbs = ["create", "write", "make", "edit", "fix", "add", "delete", "run"]
 +        if any(verb in task_lower for verb in action_verbs):
 +            return True
 +        return False  # Informational/conversational tasks don't need actions
 -    # Keywords that suggest testing/verification should happen
 -    verification_indicators = [
 -        "test", "run", "start", "launch", "verify", "check",
 -        "demo", "show", "demonstrate", "work", "function",
 +    # If we took actions and got successful results, trust that we're done
 +    # Check for success indicators in response
 +    success_indicators = [
 +        "successfully", "created", "written", "done", "completed",
 +        "file now contains", "has been updated", "installed",
+     ]
 +    if any(ind in response_lower for ind in success_indicators) and len(actions_taken) >= 1:
 +        return False  # Likely actually done
 -    # Keywords in response that suggest premature completion
 -    premature_phrases = [
 -        "i've created", "i created", "file has been created",
 -        "here's the", "i've set up the basic", "i've written",
 -        "you can now", "you should now", "you can run",
 -        "that's it", "all done", "complete", "finished",
 -        "let me know", "feel free to", "hope this helps",
 -        "is there anything else",
 +    # Keywords that suggest COMPLEX multi-step tasks (not simple ones)
 +    complex_indicators = [
 +        "set up a project", "create a project", "build a complete",
 +        "scaffold", "initialize a new", "create a full",
 +        "implement a full", "develop a complete",
+     ]
 +    is_complex = any(ind in task_lower for ind in complex_indicators)
 -    # Check if this looks like a multi-step task
 -    is_multi_step = any(ind in task_lower for ind in multi_step_indicators)
+-
 -    # Check if verification was expected but not done
 -    expects_verification = any(ind in task_lower for ind in verification_indicators)
 +    # Simple creation tasks don't need follow-up
 +    simple_creation = [
 +        "create a file", "write a file", "make a file",
 +        "add a function", "edit the", "fix the", "update the",
 +        "read the", "show me", "list",
 +    ]
 +    is_simple = any(ind in task_lower for ind in simple_creation)
 -    # Check for premature completion phrases
 -    has_premature_phrase = any(phrase in response_lower for phrase in premature_phrases)
 +    # If it's a simple task with at least one action, it's probably done
 +    if is_simple and len(actions_taken) >= 1:
 +        return False
 -    # Action count thresholds
 -    few_actions = len(actions_taken) < 3
 -    very_few_actions = len(actions_taken) < 2
 +    # Explicit verification requests need bash
 +    explicit_verification = ["and test", "and run", "and verify", "make sure it works"]
 +    needs_verification = any(ind in task_lower for ind in explicit_verification)
      # Categorize what actions were taken
      action_types = set()
          elif "glob" in action_lower or "grep" in action_lower:
              action_types.add("search")
 -    # More aggressive detection:
 +    # Detection rules (more conservative):
 -    # 1. Multi-step task with premature phrases and few actions
 -    if is_multi_step and has_premature_phrase and few_actions:
 +    # 1. Complex project tasks with very few actions
 +    if is_complex and len(actions_taken) < 3:
          return True
 -    # 2. Multi-step task with very few actions (regardless of phrases)
 -    if is_multi_step and very_few_actions:
 +    # 2. Explicitly requested verification but no bash run
 +    if needs_verification and "bash" not in action_types:
          return True
 -    # 3. Only wrote/edited files but never ran/tested anything
 -    if action_types and action_types <= {"write", "edit", "read"} and few_actions:
 -        # Wrote files but never executed bash to test
 -        if "write" in action_types or "edit" in action_types:
 -            return True
+-
 -    # 4. Verification expected but no bash commands run
 -    if expects_verification and "bash" not in action_types:
 -        return True
+-
 -    # 5. Response has chatbot-style "let me know" phrases
 -    chatbot_phrases = ["let me know", "feel free", "hope this", "happy to help"]
 -    if any(phrase in response_lower for phrase in chatbot_phrases):
 -        return True
+-
 -    # 6. Response is very short but task seems substantial
 -    if len(response) < 200 and is_multi_step and len(actions_taken) > 0:
 +    # 3. Chatbot-style deflection with no real work done
 +    deflection_phrases = ["you can now", "you should", "you can run", "you can use"]
 +    if any(phrase in response_lower for phrase in deflection_phrases) and len(actions_taken) < 2:
          return True
      return False
      """Generate a prompt to encourage the agent to continue.
      Returns a prompt that nudges the agent to follow through.
 +    Should be helpful, not aggressive.
      """
      task_lower = task.lower()
      actions_str = ", ".join(a.split(":")[0] for a in actions_taken[-5:]) if actions_taken else "none"
      # Determine what type of follow-up is needed
      follow_ups = []
 -    # Project setup tasks should initialize
 -    if any(kw in task_lower for kw in ["node", "npm", "javascript", "react", "vue", "next"]):
 -        if not any("npm" in a for a in actions_taken):
 -            follow_ups.append("Run `npm install` to install dependencies")
 -            follow_ups.append("Start the development server to verify it works")
+-
 -    if any(kw in task_lower for kw in ["python", "pip", "django", "flask", "fastapi"]):
 -        if not any("pip" in a or "uv" in a for a in actions_taken):
 -            follow_ups.append("Install dependencies with pip/uv")
 -            follow_ups.append("Run the application to verify it works")
+-
 -    # Test tasks should run tests
 -    if "test" in task_lower:
 +    # Only suggest package install if explicitly mentioned in task
 +    if any(kw in task_lower for kw in ["install", "dependencies", "set up project"]):
 +        if "node" in task_lower or "npm" in task_lower:
 +            if not any("npm" in a for a in actions_taken):
 +                follow_ups.append("Run `npm install` to install dependencies")
 +        if "python" in task_lower or "pip" in task_lower:
 +            if not any("pip" in a or "uv" in a for a in actions_taken):
 +                follow_ups.append("Install dependencies")
++
 +    # Only suggest running tests if "test" is explicitly in task
 +    if "test" in task_lower and "run" in task_lower:
          if not any("test" in a or "pytest" in a or "jest" in a for a in actions_taken):
 -            follow_ups.append("Run the tests to verify they pass")
+-
 -    # Build tasks should verify build
 -    if "build" in task_lower or "compile" in task_lower:
 -        if not any("build" in a or "compile" in a for a in actions_taken):
 -            follow_ups.append("Run the build to verify it succeeds")
 +            follow_ups.append("Run the tests")
 -    # Generic follow-ups for creation tasks
 -    if any(kw in task_lower for kw in ["create", "make", "build", "set up"]):
 -        if len(actions_taken) < 3:
 -            follow_ups.append("Verify the creation was successful")
 -            follow_ups.append("Demonstrate that it works as expected")
 +    # If task explicitly asks to run/verify, remind to do so
 +    if any(kw in task_lower for kw in ["and run", "and test", "and verify", "make sure it works"]):
 +        follow_ups.append("Execute what was created to verify it works")
      if follow_ups:
 -        steps = "\n".join(f"- {step}" for step in follow_ups[:3])
 +        steps = "\n".join(f"- {step}" for step in follow_ups[:2])
          return (
 -            f"STOP - You are NOT done. The task was: \"{task}\"\n\n"
 -            f"Actions so far: {actions_str}\n"
 -            f"You MUST also:\n{steps}\n\n"
 -            f"DO NOT respond with text. USE YOUR TOOLS NOW to complete these steps."
 +            f"The task was: \"{task}\"\n\n"
 +            f"You may need to also:\n{steps}\n\n"
 +            f"If the task is actually complete, just confirm what was done."
+         )
 -    # Generic continuation - be forceful
 +    # Generic - be gentle
      return (
 -        f"INCOMPLETE. Task: \"{task}\"\n"
 -        f"Actions taken: {actions_str} ({len(actions_taken)} total)\n\n"
 -        f"You stopped too early. What about:\n"
 -        f"- Testing/verifying the result?\n"
 -        f"- Running what you created?\n"
 -        f"- Installing dependencies?\n\n"
 -        f"USE YOUR TOOLS to continue. Do not just describe - EXECUTE."
 +        f"Task: \"{task}\"\n"
 +        f"You took {len(actions_taken)} action(s). "
 +        f"If there's more to do, continue. Otherwise, confirm completion."
+     )

src/loader/llm/base.py modified

      full_content: str = ""  # Accumulated full content (only set when is_done=True)
      tool_calls: list[ToolCall] = field(default_factory=list)
      is_done: bool = False
 +    # Pending tool call detected during streaming (ReAct mode)
 +    # This allows showing tool widgets as they're detected, before streaming ends
 +    pending_tool_call: ToolCall | None = None
  @dataclass

src/loader/llm/ollama.py modified

          if "tool_calls" in message:
              for i, tc in enumerate(message["tool_calls"]):
                  func = tc.get("function", {})
 +                # Arguments may be a JSON string or dict
 +                args = func.get("arguments", {})
 +                if isinstance(args, str):
 +                    try:
 +                        args = json.loads(args)
 +                    except json.JSONDecodeError:
 +                        args = {}
                  tool_calls.append(ToolCall(
                      id=tc.get("id", f"call_{i}"),
                      name=func.get("name", ""),
 -                    arguments=func.get("arguments", {}),
 +                    arguments=args,
                  ))
          else:
              # Try to parse tool calls from text
              async for chunk in self._stream_response(response):
                  yield chunk
 +    def _debug_log(self, message: str) -> None:
 +        """Write debug message to log file."""
 +        try:
 +            with open("/tmp/loader_debug.log", "a") as f:
 +                f.write(f"[ollama] {message}\n")
 +        except Exception:
 +            pass
++
      async def _stream_response(self, response) -> AsyncIterator[StreamChunk]:
          """Internal helper to stream response chunks."""
          import re
          full_content = ""
          display_content = ""  # Content to show (filtered)
          json_buffer = ""  # Buffer for potential tool call JSON
 +        tool_call_buffer = ""  # Buffer for <tool_call> block content
          in_json_block = False
          in_think_block = False  # For reasoning models like deepseek-r1
          in_tool_call_block = False  # For ReAct <tool_call> tags
 +        detected_tool_calls: list[ToolCall] = []  # Track tool calls found during streaming
 +        tool_call_counter = 0
          async for line in response.aiter_lines():
              if not line:
                  tool_calls = []
                  # Check for native tool calls first
                  if "tool_calls" in message:
 +                    self._debug_log(f"is_done: found native tool_calls in message: {len(message['tool_calls'])}")
                      for i, tc in enumerate(message["tool_calls"]):
                          func = tc.get("function", {})
 +                        # Arguments may be a JSON string or dict
 +                        args = func.get("arguments", {})
 +                        if isinstance(args, str):
 +                            try:
 +                                args = json.loads(args)
 +                            except json.JSONDecodeError:
 +                                args = {}
                          tool_calls.append(ToolCall(
                              id=tc.get("id", f"call_{i}"),
                              name=func.get("name", ""),
 -                            arguments=func.get("arguments", {}),
 +                            arguments=args,
                          ))
                  else:
 -                    # Try to parse tool calls from text
 -                    clean_content, tool_calls = self._parse_tool_calls(full_content)
 -                    display_content = clean_content
+-
 +                    # Use detected tool calls from streaming, or parse from text
 +                    if detected_tool_calls:
 +                        self._debug_log(f"is_done: using {len(detected_tool_calls)} detected_tool_calls from streaming")
 +                        tool_calls = detected_tool_calls
 +                    else:
 +                        self._debug_log(f"is_done: parsing tool calls from text (len={len(full_content)})")
 +                        self._debug_log(f"is_done: full_content = {repr(full_content[:500])}")
 +                        clean_content, tool_calls = self._parse_tool_calls(full_content)
 +                        self._debug_log(f"is_done: parsed {len(tool_calls)} tool calls")
 +                        display_content = clean_content
++
 +                self._debug_log(f"is_done: yielding final chunk with {len(tool_calls)} tool_calls")
                  yield StreamChunk(
                      content="",  # Don't emit final chunk content (already streamed)
                      full_content=display_content or full_content,
                      # Skip content inside think block
                      continue
 -                # Filter out <tool_call> blocks from ReAct mode
 +                # Filter out <tool_call> blocks from ReAct mode - but parse them!
                  if "<tool_call>" in chunk_content:
                      in_tool_call_block = True
 +                    tool_call_buffer = ""  # Reset buffer
                      # Keep content before <tool_call>
                      before = chunk_content.split("<tool_call>")[0]
                      if before:
                          display_content += before
                          yield StreamChunk(content=before)
 +                    # Start buffering the tool call content
 +                    after_tag = chunk_content.split("<tool_call>", 1)[-1]
 +                    if "</tool_call>" in after_tag:
 +                        # Complete tool call in same chunk
 +                        tool_json = after_tag.split("</tool_call>")[0]
 +                        after_close = after_tag.split("</tool_call>", 1)[-1]
 +                        in_tool_call_block = False
 +                        # Parse and yield the tool call
 +                        try:
 +                            tc_data = json.loads(tool_json.strip())
 +                            tc = ToolCall(
 +                                id=f"call_{tool_call_counter}",
 +                                name=tc_data.get("name", ""),
 +                                arguments=tc_data.get("arguments", tc_data.get("parameters", {})),
 +                            )
 +                            tool_call_counter += 1
 +                            detected_tool_calls.append(tc)
 +                            yield StreamChunk(content="", pending_tool_call=tc)
 +                        except (json.JSONDecodeError, KeyError):
 +                            pass
 +                        if after_close.strip():
 +                            display_content += after_close
 +                            yield StreamChunk(content=after_close)
 +                    else:
 +                        tool_call_buffer = after_tag
                      continue
                  elif in_tool_call_block:
                      if "</tool_call>" in chunk_content:
                          in_tool_call_block = False
 -                        # Keep content after </tool_call>
 -                        after = chunk_content.split("</tool_call>")[-1]
 -                        if after:
 -                            display_content += after
 -                            yield StreamChunk(content=after)
 -                    # Skip content inside tool_call block
 +                        # Complete the tool call buffer
 +                        tool_json = tool_call_buffer + chunk_content.split("</tool_call>")[0]
 +                        after_close = chunk_content.split("</tool_call>", 1)[-1]
 +                        # Parse and yield the tool call
 +                        try:
 +                            tc_data = json.loads(tool_json.strip())
 +                            tc = ToolCall(
 +                                id=f"call_{tool_call_counter}",
 +                                name=tc_data.get("name", ""),
 +                                arguments=tc_data.get("arguments", tc_data.get("parameters", {})),
 +                            )
 +                            tool_call_counter += 1
 +                            detected_tool_calls.append(tc)
 +                            yield StreamChunk(content="", pending_tool_call=tc)
 +                        except (json.JSONDecodeError, KeyError):
 +                            pass
 +                        if after_close.strip():
 +                            display_content += after_close
 +                            yield StreamChunk(content=after_close)
 +                    else:
 +                        # Still accumulating tool call content
 +                        tool_call_buffer += chunk_content
                      continue
 -                # Filter out tool call JSON from display
 +                # Filter out tool call JSON from display (bare JSON without tags)
                  # Detect start of JSON tool call
                  if not in_json_block and '{"name"' in chunk_content:
                      in_json_block = True
                      open_braces = json_buffer.count('{')
                      close_braces = json_buffer.count('}')
                      if close_braces >= open_braces and open_braces > 0:
 -                        # JSON block complete, don't display it
 +                        # JSON block complete, try to parse it
                          in_json_block = False
 -                        # Check for content after the JSON
                          try:
                              # Find where JSON ends
                              last_brace = json_buffer.rfind('}')
 +                            json_str = json_buffer[:last_brace + 1]
                              after_json = json_buffer[last_brace + 1:]
 +                            # Try to parse as tool call
 +                            tc_data = json.loads(json_str)
 +                            if "name" in tc_data:
 +                                tc = ToolCall(
 +                                    id=f"call_{tool_call_counter}",
 +                                    name=tc_data.get("name", ""),
 +                                    arguments=tc_data.get("arguments", tc_data.get("parameters", {})),
 +                                )
 +                                tool_call_counter += 1
 +                                detected_tool_calls.append(tc)
 +                                yield StreamChunk(content="", pending_tool_call=tc)
                              if after_json.strip():
                                  display_content += after_json
                                  yield StreamChunk(content=after_json)
 -                        except Exception:
 +                        except (json.JSONDecodeError, KeyError):
 +                            # Not valid JSON, just discard
                              pass
                          json_buffer = ""
                  else:

src/loader/ui/adapter.py modified

  class EventAdapter:
      """Adapts Agent callback events to Textual messages."""
 +    DEBUG_LOG_FILE = "/tmp/loader_debug.log"
++
      def __init__(self, app: "LoaderApp") -> None:  # noqa: F821
 +        # Keeps a reference to the app that messages are posted to, and starts
 +        # with an empty queue of pending (tool_name, args) pairs.
          self.app = app
          self._tool_args_queue: list[tuple[str, dict]] = []  # Queue of (tool_name, args)
 +        # Clear debug log on start
 +        # Opening in "w" mode truncates DEBUG_LOG_FILE, so after construction the
 +        # file holds only the header line written below.
 +        try:
 +            with open(self.DEBUG_LOG_FILE, "w") as f:
 +                f.write("=== Loader Debug Log ===\n")
 +        except Exception:
 +            # Best-effort: failure to create or truncate the log is ignored.
 +            pass
++
 +    def _debug_log(self, message: str) -> None:
 +        """Write debug message to log file."""
 +        # Appends `message` followed by a newline to DEBUG_LOG_FILE; the file is
 +        # opened and closed again on every call.
 +        # NOTE(review): handle_event calls this for every event, so the file is
 +        # reopened per event; the stdlib `logging` module may be a better fit.
 +        try:
 +            with open(self.DEBUG_LOG_FILE, "a") as f:
 +                f.write(f"{message}\n")
 +        except Exception:
 +            # Errors derived from Exception are swallowed so debug logging
 +            # cannot interrupt event handling.
 +            pass
      def handle_event(self, event: AgentEvent) -> None:
          """Convert AgentEvent to appropriate Textual message and post it."""
 +        self._debug_log(f"handle_event: type={event.type}")
          match event.type:
              case "thinking":
                  self.app.post_message(ThinkingStarted())
              case "tool_call":
                  # Queue args for matching with result (FIFO)
 -                self._tool_args_queue.append((event.tool_name or "", event.tool_args or {}))
 +                tool_name = event.tool_name or ""
 +                tool_args = event.tool_args or {}
 +                self._tool_args_queue.append((tool_name, tool_args))
++
 +                # Debug: log tool args for edit/write (helps diagnose diff view issues)
 +                self._debug_log(f"tool_call '{tool_name}': queued, keys={list(tool_args.keys())}")
 +                if tool_name == "write":
 +                    content = tool_args.get("content", "")
 +                    self._debug_log(f"  write content: {len(content) if content else 0} chars")
 +                elif tool_name == "edit":
 +                    self._debug_log(f"  edit old_string: {bool(tool_args.get('old_string'))}, new_string: {bool(tool_args.get('new_string'))}")
++
                  self.app.post_message(
                      ToolCallStarted(
 -                        tool_name=event.tool_name or "",
 -                        tool_args=event.tool_args or {},
 +                        tool_name=tool_name,
 +                        tool_args=tool_args,
+                     )
+                 )
                          if queued_name == tool_name:
                              tool_args = queued_args
                              self._tool_args_queue.pop(i)
 +                            self._debug_log(f"tool_result '{tool_name}': matched in queue, keys={list(tool_args.keys())}")
                              break
                      else:
                          # No match found, use FIFO
 -                        _, tool_args = self._tool_args_queue.pop(0)
 +                        popped_name, tool_args = self._tool_args_queue.pop(0)
 +                        self._debug_log(f"tool_result '{tool_name}': no match, used FIFO (got '{popped_name}'), keys={list(tool_args.keys())}")
 +                else:
 +                    self._debug_log(f"tool_result '{tool_name}': queue was EMPTY!")
                  # Extract diff info for edit/write tools
                  old_string = None
                  new_string = None
                  file_path = None
 -                if tool_name == "edit" and tool_args:
 -                    old_string = tool_args.get("old_string")
 -                    new_string = tool_args.get("new_string")
 -                    file_path = tool_args.get("file_path")
 -                elif tool_name == "write" and tool_args:
 +                if tool_name == "edit":
 +                    if tool_args:
 +                        # Try multiple key names that models might use
 +                        old_string = (
 +                            tool_args.get("old_string")
 +                            or tool_args.get("old")
 +                            or tool_args.get("original")
 +                            or tool_args.get("search")
 +                            or tool_args.get("find")
 +                        )
 +                        new_string = (
 +                            tool_args.get("new_string")
 +                            or tool_args.get("new")
 +                            or tool_args.get("replacement")
 +                            or tool_args.get("replace")
 +                        )
 +                        file_path = (
 +                            tool_args.get("file_path")
 +                            or tool_args.get("path")
 +                            or tool_args.get("filename")
 +                            or tool_args.get("file")
 +                        )
 +                        self._debug_log(f"  edit extracted: old={bool(old_string)} ({len(old_string) if old_string else 0} chars), new={bool(new_string)} ({len(new_string) if new_string else 0} chars), path={file_path}")
 +                    else:
 +                        self._debug_log(f"  edit: tool_args was empty!")
 +                elif tool_name == "write":
                      # For writes, content is the new file content
 -                    new_string = tool_args.get("content")
 -                    file_path = tool_args.get("file_path")
 +                    # Try multiple key names that models might use
 +                    if tool_args:
 +                        new_string = (
 +                            tool_args.get("content")
 +                            or tool_args.get("contents")
 +                            or tool_args.get("text")
 +                            or tool_args.get("data")
 +                        )
 +                        file_path = (
 +                            tool_args.get("file_path")
 +                            or tool_args.get("path")
 +                            or tool_args.get("filename")
 +                        )
 +                        self._debug_log(f"  write extracted: new={bool(new_string)} ({len(new_string) if new_string else 0} chars), path={file_path}")
 +                    else:
 +                        self._debug_log(f"  write: tool_args was empty!")
                  self.app.post_message(
                      ToolCallCompleted(

src/loader/ui/app.py modified

          self._tool_widget_queue: list[ToolCallWidget] = []  # Queue of pending tool widgets
          self._timer_handle = None
 +    def _debug_log(self, message: str) -> None:
 +        """Write debug message to log file."""
 +        # Appends `message` followed by a newline to /tmp/loader_debug.log.
 +        # NOTE(review): the path literal is repeated here; it is the same file
 +        # named by EventAdapter.DEBUG_LOG_FILE, so consider sharing one constant.
 +        try:
 +            with open("/tmp/loader_debug.log", "a") as f:
 +                f.write(f"{message}\n")
 +        except Exception:
 +            # Errors derived from Exception are swallowed; logging is best-effort.
 +            pass
++
      def compose(self) -> ComposeResult:
          yield Container(
              ScrollableContainer(id="message-area"),
          # If agent is running, this is a steering message
          if self.is_generating and self.agent.is_running:
 +            # Finalize current streaming so new content appears below user's message
 +            if self._current_streaming is not None:
 +                self._current_streaming.stop_streaming()
 +                self._current_streaming = None
              self._add_steering_message(user_input)
              self.agent.steer(user_input)
              return
          """Handle tool call start."""
          msg_area = self.query_one("#message-area", ScrollableContainer)
 +        # Finalize any ongoing streaming - tool calls interrupt thinking
 +        if self._current_streaming is not None:
 +            self._current_streaming.stop_streaming()
 +            self._current_streaming = None
++
          # Create tool widget
          widget = ToolCallWidget(
              tool_name=message.tool_name,
          """Handle tool call completion."""
          msg_area = self.query_one("#message-area", ScrollableContainer)
 +        # Debug: log what we received
 +        try:
 +            with open("/tmp/loader_debug.log", "a") as f:
 +                f.write(f"on_tool_call_completed: tool={message.tool_name}, new_string={bool(message.new_string)}, old_string={bool(message.old_string)}, file_path={message.file_path}\n")
 +        except Exception:
 +            pass
++
          # Get the corresponding tool widget from queue (FIFO)
          tool_widget = self._tool_widget_queue.pop(0) if self._tool_widget_queue else None
          # Check if this is an edit tool with diff info
          if message.tool_name == "edit" and message.old_string and message.new_string:
              # Replace tool widget with diff widget
 +            self._debug_log("  -> showing EDIT diff widget")
              if tool_widget:
                  tool_widget.remove()
              msg_area.mount(diff_widget)
          # Check if this is a write tool - show as diff (new file)
          elif message.tool_name == "write" and message.new_string:
 +            self._debug_log("  -> showing WRITE diff widget")
              if tool_widget:
                  tool_widget.remove()
              msg_area.mount(diff_widget)
          elif tool_widget:
              # Update existing tool widget with result
 +            self._debug_log("  -> showing regular tool widget result")
              tool_widget.set_result(
                  message.content, is_error=message.is_error
+             )

src/loader/ui/widgets/streaming.py modified

      def render(self) -> Text:
          """Render the content with optional cursor."""
          # Use Text object to avoid markup interpretation of LLM output
 -        text = Text(self._content_buffer)
 +        # Clean any tool_call tags that slipped through filtering
 +        # The cleaned string is a local value; self._content_buffer is not
 +        # reassigned in this method.
 +        content = self._clean_tool_tags(self._content_buffer)
 +        text = Text(content)
          if self.is_streaming:
              text.append("|", style="dim")  # Cursor indicator
          return text
 +    def _clean_tool_tags(self, content: str) -> str:
 +        """Remove any tool_call/think tags that weren't filtered during streaming."""
 +        # Takes the raw buffer text and returns a new string; `self` is not read.
 +        # `re` is imported locally, so the import statement runs on each call.
 +        import re
 +        # Remove <tool_call>...</tool_call> blocks
 +        # Non-greedy `.*?` stops at the first closing tag; DOTALL lets the match
 +        # span newlines; IGNORECASE accepts any letter case in the tags.
 +        content = re.sub(r'<tool_call>.*?</tool_call>', '', content, flags=re.DOTALL | re.IGNORECASE)
 +        # Remove orphaned tags
 +        # Only the tag text itself is removed here; anything that followed an
 +        # unmatched opening tag stays in the output.
 +        # NOTE(review): while a tool call is still streaming (no closing tag yet),
 +        # its partial JSON would therefore remain visible — confirm this is intended.
 +        content = re.sub(r'</?tool_call>', '', content, flags=re.IGNORECASE)
 +        content = re.sub(r'</?think>', '', content, flags=re.IGNORECASE)
 +        # Clean up excess newlines from removed blocks
 +        # Runs of three or more newlines collapse to exactly two.
 +        content = re.sub(r'\n{3,}', '\n\n', content)
 +        return content
++
      def append(self, chunk: str) -> None:
          """Append a chunk to the content."""
          self._content_buffer += chunk

tests/test_parsing.py modified

          assert "<tool_call>" not in result.content
          assert "</tool_call>" not in result.content
 +    def test_parse_bracketed_calls_format(self):
 +        """Test parsing [calls tool with: key=value] format."""
 +        # The bracketed call sits between two lines of ordinary prose; file_path
 +        # is unquoted and content is double-quoted in the input.
 +        text = '''I'll create the file now.
[calls write tool with: file_path=/tmp/test.txt, content="hello world"]
Created the file.'''
 +        result = parse_tool_calls(text)
 +        # Exactly one call is expected, with the surrounding quotes stripped
 +        # from the content value.
 +        assert len(result.tool_calls) == 1
 +        assert result.tool_calls[0].name == "write"
 +        assert result.tool_calls[0].arguments["file_path"] == "/tmp/test.txt"
 +        assert result.tool_calls[0].arguments["content"] == "hello world"
 +        # Bracketed call should be removed from content
 +        assert "[calls" not in result.content
++
 +    def test_parse_bracketed_use_format(self):
 +        """Test parsing [USE tool: key=value] format."""
 +        # Input is a single bracketed call with no surrounding prose.
 +        text = '[USE bash tool: command="ls -la"]'
 +        result = parse_tool_calls(text)
 +        # The quoted value contains a space; it is expected back intact and unquoted.
 +        assert len(result.tool_calls) == 1
 +        assert result.tool_calls[0].name == "bash"
 +        assert result.tool_calls[0].arguments["command"] == "ls -la"
++
 +    def test_parse_bracketed_edit_format(self):
 +        """Test parsing bracketed format with edit tool."""
 +        # Three comma-separated key="value" pairs, all double-quoted in the input.
 +        text = '[calls edit tool with: file_path="test.py", old_string="foo", new_string="bar"]'
 +        result = parse_tool_calls(text)
 +        # Each argument is expected under its own key with the quotes removed.
 +        assert len(result.tool_calls) == 1
 +        assert result.tool_calls[0].name == "edit"
 +        assert result.tool_calls[0].arguments["file_path"] == "test.py"
 +        assert result.tool_calls[0].arguments["old_string"] == "foo"
 +        assert result.tool_calls[0].arguments["new_string"] == "bar"
++
  class TestFormatToolResult:
      """Tests for format_tool_result function."""