`0397de7`

fix: detect and filter hallucinated tool use narrations

Models sometimes describe using tools instead of actually calling them,
outputting text like 'Used bash tool with command...' or 'Here is what
I did:' followed by fake tool descriptions.

- Add hallucination detection patterns in _contains_unexecuted_code()
- Update steering message to explicitly address narration problem
- Add streaming filter to hide hallucinated narrations from users
- Add a complete-content (non-streaming) filter for hallucination patterns

Authored by mfwolffe <wolffemf@dukes.jmu.edu> 4 months ago

SHA: 0397de7221225f3e072ed7f38d1d44826d06aa3c
Parents: dafa5bc
Tree: dabf095

2 changed files

Status	File	+	-
M	`src/loader/agent/loop.py`	26	8
M	`src/loader/agent/safeguards.py`	44	0

src/loader/agent/loop.py modified

                  ))
                  self.messages.append(Message(
                      role=Role.USER,
 -                    content="CRITICAL ERROR: You are giving me instructions to copy instead of EXECUTING the task.\n\n"
 +                    content="CRITICAL ERROR: You are PRETENDING to use tools instead of actually using them.\n\n"
                              "DO NOT write:\n"
 -                            "- Numbered steps (1., 2., 3.)\n"
 -                            "- Instructions like 'Open your terminal...'\n"
 -                            "- Code blocks for me to copy\n"
 -                            "- 'You can run...', 'Create this file...'\n\n"
 -                            "INSTEAD: Use your bash and write tools RIGHT NOW to execute the task.\n"
 -                            "Example: [call write tool], [call bash tool]\n"
 -                            "DO IT NOW - don't describe it.",
 +                            "- 'Used bash tool with command...' (THIS IS FAKE)\n"
 +                            "- 'Created a file using the write tool...' (THIS IS FAKE)\n"
 +                            "- 'Here is what I did:' followed by descriptions\n"
 +                            "- Numbered steps or instructions\n"
 +                            "- Code blocks for me to copy\n\n"
 +                            "Your tool calls MUST go through the proper tool interface.\n"
 +                            "Writing 'Used bash tool...' does NOT execute anything!\n\n"
 +                            "ACTUALLY call the tools using the tool_call mechanism.\n"
 +                            "DO IT NOW - stop narrating and start executing.",
                  ))
                  continue
              if re.search(pattern, content, re.IGNORECASE):
                  return True
 +        # Check for hallucinated/narrated tool uses - model DESCRIBES using tools
 +        # but doesn't actually call them (past tense narration)
 +        hallucination_patterns = [
 +            r'used\s+`?(?:bash|write|read|edit|glob|grep)`?\s+tool',  # "Used bash tool..."
 +            r'used\s+the\s+`?(?:bash|write|read|edit|glob|grep)`?\s+tool',  # "Used the bash tool..."
 +            r'using\s+the\s+`?(?:bash|write|read|edit|glob|grep)`?\s+tool',  # "...using the write tool"
 +            r'with\s+file_path\s*=\s*[`\'"]',  # "with file_path=`..." (narrated parameter)
 +            r'with\s+command\s*[`\'"]',  # "with command `..." (narrated bash)
 +            r'i\s+(ran|executed|created|wrote|read)\s+(the\s+)?(command|file)',  # "I ran the command"
 +            r'\*\s*used\s+`',  # "* Used `bash`..." (bullet point narration)
 +            r'here\s+is\s+what\s+i\s+did:',  # "Here is what I did:"
 +        ]
 +        for pattern in hallucination_patterns:
 +            if re.search(pattern, content, re.IGNORECASE):
 +                return True
++
          # Look for markdown code blocks
          code_blocks = re.findall(r'```(\w*)\n(.*?)```', content, re.DOTALL)

src/loader/agent/safeguards.py modified

                  was_filtered = True
                  continue
 +            # Check for hallucinated tool narration and filter the line
 +            hallucination_match = re.search(
 +                r'([Uu]sed\s+`?(?:bash|write|read|edit|glob|grep)`?\s+tool|'
 +                r'[Uu]sing\s+the\s+`?(?:bash|write|read|edit|glob|grep)`?\s+tool|'
 +                r'with\s+file_path\s*=\s*[`\'"]|'
 +                r'with\s+command\s*[`\'"]|'
 +                r'[Hh]ere\s+is\s+what\s+[Ii]\s+did:)',
 +                self._buffer
 +            )
 +            if hallucination_match:
 +                # Find end of line and remove whole line
 +                line_start = self._buffer.rfind('\n', 0, hallucination_match.start()) + 1
 +                line_end = self._buffer.find('\n', hallucination_match.end())
 +                if line_end == -1:
 +                    # Line continues to end of buffer - wait for more
 +                    if line_start > 0:
 +                        result_parts.append(self._buffer[:line_start])
 +                    self._buffer = self._buffer[line_start:]
 +                    break
 +                else:
 +                    # Remove the whole line
 +                    result_parts.append(self._buffer[:line_start])
 +                    removed.append(self._buffer[line_start:line_end])
 +                    self._buffer = self._buffer[line_end:]
 +                    was_filtered = True
 +                    continue
++
              # Check for preamble patterns and filter the line
              preamble_match = re.search(
                  r'(Here is a JSON response|Here are the function calls|'
              removed.extend(matches)
              filtered = re.sub(pattern, '', filtered, flags=re.IGNORECASE | re.MULTILINE)
 +        # Pattern to match hallucinated/narrated tool uses (remove entire line)
 +        # These are lines where model describes using tools instead of actually calling them
 +        hallucination_patterns = [
 +            r'^.*[Uu]sed\s+`?(?:bash|write|read|edit|glob|grep)`?\s+tool.*$',  # "Used bash tool..."
 +            r'^.*[Uu]sing\s+the\s+`?(?:bash|write|read|edit|glob|grep)`?\s+tool.*$',  # "...using the write tool"
 +            r'^.*with\s+file_path\s*=\s*[`\'"][^`\'"]+[`\'"].*$',  # Narrated file_path parameter
 +            r'^.*with\s+command\s*[`\'"][^`\'"]+[`\'"].*$',  # Narrated bash command
 +            r'^\s*\*\s*[Uu]sed\s+`.*$',  # "* Used `bash`..." (bullet point narration)
 +            r'^.*[Hh]ere\s+is\s+what\s+[Ii]\s+did:.*$',  # "Here is what I did:"
 +            r'^\s*\d+\.\s+[Uu]sed\s+.*tool.*$',  # "1. Used bash tool..."
 +            r'^\s*\d+\.\s+[Cc]reated\s+.*using\s+the\s+.*tool.*$',  # "1. Created... using the write tool"
 +        ]
 +        for pattern in hallucination_patterns:
 +            matches = re.findall(pattern, filtered, re.MULTILINE)
 +            removed.extend(matches)
 +            filtered = re.sub(pattern, '', filtered, flags=re.MULTILINE)
++
          # Filter internal recovery/system prompts (multiline blocks)
          internal_prompt_patterns = [
              # Recovery prompts