tenseleyflow/loader / 0397de7

Browse files

fix: detect and filter hallucinated tool use narrations

Models sometimes describe using tools instead of actually calling them,
outputting text like 'Used bash tool with command...' or 'Here is what
I did:' followed by fake tool descriptions.

- Add hallucination detection patterns in _contains_unexecuted_code()
- Update steering message to explicitly address narration problem
- Add streaming filter to hide hallucinated narrations from users
- Add a complete-content filter for hallucination patterns
Authored by mfwolffe <wolffemf@dukes.jmu.edu>
SHA
0397de7221225f3e072ed7f38d1d44826d06aa3c
Parents
dafa5bc
Tree
dabf095

2 changed files

Status  File  +  -
M src/loader/agent/loop.py 26 8
M src/loader/agent/safeguards.py 44 0
src/loader/agent/loop.py (modified)
@@ -1314,15 +1314,17 @@ class Agent:
13141314
                 ))
13151315
                 self.messages.append(Message(
13161316
                     role=Role.USER,
1317
-                    content="CRITICAL ERROR: You are giving me instructions to copy instead of EXECUTING the task.\n\n"
1317
+                    content="CRITICAL ERROR: You are PRETENDING to use tools instead of actually using them.\n\n"
13181318
                             "DO NOT write:\n"
1319
-                            "- Numbered steps (1., 2., 3.)\n"
1320
-                            "- Instructions like 'Open your terminal...'\n"
1321
-                            "- Code blocks for me to copy\n"
1322
-                            "- 'You can run...', 'Create this file...'\n\n"
1323
-                            "INSTEAD: Use your bash and write tools RIGHT NOW to execute the task.\n"
1324
-                            "Example: [call write tool], [call bash tool]\n"
1325
-                            "DO IT NOW - don't describe it.",
1319
+                            "- 'Used bash tool with command...' (THIS IS FAKE)\n"
1320
+                            "- 'Created a file using the write tool...' (THIS IS FAKE)\n"
1321
+                            "- 'Here is what I did:' followed by descriptions\n"
1322
+                            "- Numbered steps or instructions\n"
1323
+                            "- Code blocks for me to copy\n\n"
1324
+                            "Your tool calls MUST go through the proper tool interface.\n"
1325
+                            "Writing 'Used bash tool...' does NOT execute anything!\n\n"
1326
+                            "ACTUALLY call the tools using the tool_call mechanism.\n"
1327
+                            "DO IT NOW - stop narrating and start executing.",
13261328
                 ))
13271329
                 continue
13281330
 
@@ -1579,6 +1581,22 @@ class Agent:
15791581
             if re.search(pattern, content, re.IGNORECASE):
15801582
                 return True
15811583
 
1584
+        # Check for hallucinated/narrated tool uses - model DESCRIBES using tools
1585
+        # but doesn't actually call them (past tense narration)
1586
+        hallucination_patterns = [
1587
+            r'used\s+`?(?:bash|write|read|edit|glob|grep)`?\s+tool',  # "Used bash tool..."
1588
+            r'used\s+the\s+`?(?:bash|write|read|edit|glob|grep)`?\s+tool',  # "Used the bash tool..."
1589
+            r'using\s+the\s+`?(?:bash|write|read|edit|glob|grep)`?\s+tool',  # "...using the write tool"
1590
+            r'with\s+file_path\s*=\s*[`\'"]',  # "with file_path=`..." (narrated parameter)
1591
+            r'with\s+command\s*[`\'"]',  # "with command `..." (narrated bash)
1592
+            r'i\s+(ran|executed|created|wrote|read)\s+(the\s+)?(command|file)',  # "I ran the command"
1593
+            r'\*\s*used\s+`',  # "* Used `bash`..." (bullet point narration)
1594
+            r'here\s+is\s+what\s+i\s+did:',  # "Here is what I did:"
1595
+        ]
1596
+        for pattern in hallucination_patterns:
1597
+            if re.search(pattern, content, re.IGNORECASE):
1598
+                return True
1599
+
15821600
         # Look for markdown code blocks
15831601
         code_blocks = re.findall(r'```(\w*)\n(.*?)```', content, re.DOTALL)
15841602
 
src/loader/agent/safeguards.py (modified)
@@ -130,6 +130,33 @@ class CodeBlockFilter:
130130
                 was_filtered = True
131131
                 continue
132132
 
133
+            # Check for hallucinated tool narration and filter the line
134
+            hallucination_match = re.search(
135
+                r'([Uu]sed\s+`?(?:bash|write|read|edit|glob|grep)`?\s+tool|'
136
+                r'[Uu]sing\s+the\s+`?(?:bash|write|read|edit|glob|grep)`?\s+tool|'
137
+                r'with\s+file_path\s*=\s*[`\'"]|'
138
+                r'with\s+command\s*[`\'"]|'
139
+                r'[Hh]ere\s+is\s+what\s+[Ii]\s+did:)',
140
+                self._buffer
141
+            )
142
+            if hallucination_match:
143
+                # Find end of line and remove whole line
144
+                line_start = self._buffer.rfind('\n', 0, hallucination_match.start()) + 1
145
+                line_end = self._buffer.find('\n', hallucination_match.end())
146
+                if line_end == -1:
147
+                    # Line continues to end of buffer - wait for more
148
+                    if line_start > 0:
149
+                        result_parts.append(self._buffer[:line_start])
150
+                    self._buffer = self._buffer[line_start:]
151
+                    break
152
+                else:
153
+                    # Remove the whole line
154
+                    result_parts.append(self._buffer[:line_start])
155
+                    removed.append(self._buffer[line_start:line_end])
156
+                    self._buffer = self._buffer[line_end:]
157
+                    was_filtered = True
158
+                    continue
159
+
133160
             # Check for preamble patterns and filter the line
134161
             preamble_match = re.search(
135162
                 r'(Here is a JSON response|Here are the function calls|'
@@ -273,6 +300,23 @@ class CodeBlockFilter:
273300
             removed.extend(matches)
274301
             filtered = re.sub(pattern, '', filtered, flags=re.IGNORECASE | re.MULTILINE)
275302
 
303
+        # Pattern to match hallucinated/narrated tool uses (remove entire line)
304
+        # These are lines where model describes using tools instead of actually calling them
305
+        hallucination_patterns = [
306
+            r'^.*[Uu]sed\s+`?(?:bash|write|read|edit|glob|grep)`?\s+tool.*$',  # "Used bash tool..."
307
+            r'^.*[Uu]sing\s+the\s+`?(?:bash|write|read|edit|glob|grep)`?\s+tool.*$',  # "...using the write tool"
308
+            r'^.*with\s+file_path\s*=\s*[`\'"][^`\'"]+[`\'"].*$',  # Narrated file_path parameter
309
+            r'^.*with\s+command\s*[`\'"][^`\'"]+[`\'"].*$',  # Narrated bash command
310
+            r'^\s*\*\s*[Uu]sed\s+`.*$',  # "* Used `bash`..." (bullet point narration)
311
+            r'^.*[Hh]ere\s+is\s+what\s+[Ii]\s+did:.*$',  # "Here is what I did:"
312
+            r'^\s*\d+\.\s+[Uu]sed\s+.*tool.*$',  # "1. Used bash tool..."
313
+            r'^\s*\d+\.\s+[Cc]reated\s+.*using\s+the\s+.*tool.*$',  # "1. Created... using the write tool"
314
+        ]
315
+        for pattern in hallucination_patterns:
316
+            matches = re.findall(pattern, filtered, re.MULTILINE)
317
+            removed.extend(matches)
318
+            filtered = re.sub(pattern, '', filtered, flags=re.MULTILINE)
319
+
276320
         # Filter internal recovery/system prompts (multiline blocks)
277321
         internal_prompt_patterns = [
278322
             # Recovery prompts