@@ -1,595 +1,21 @@ |
| 1 | | -"""Runtime safeguards to improve agent behavior. |
| 2 | | - |
| 3 | | -These safeguards help keep the agent on track when models don't follow |
| 4 | | -instructions perfectly. They work at runtime to filter, detect, and correct |
| 5 | | -problematic patterns. |
| 6 | | -""" |
| 7 | | - |
| 8 | | -import re |
| 9 | | -from dataclasses import dataclass, field |
| 10 | | -from pathlib import Path |
| 11 | | - |
| 12 | | -from ..runtime.safeguard_services import ( |
| 13 | | - ActionTracker, |
| 14 | | - PreActionValidator, |
| 15 | | - ValidationResult, |
| 1 | +"""Compatibility exports for runtime-owned safeguards.""" |
| 2 | + |
| 3 | +from ..runtime.safeguard_services import ActionTracker, PreActionValidator, ValidationResult |
| 4 | +from ..runtime.safeguards import ( |
| 5 | + CodeBlockFilter, |
| 6 | + FilterResult, |
| 7 | + PatternDetector, |
| 8 | + PatternMatch, |
| 9 | + RuntimeSafeguards, |
| 16 | 10 | ) |
| 17 | 11 | |
| 18 | | - |
@dataclass
class FilterResult:
    """Outcome of a single filtering pass.

    Attributes:
        content: Text that survived filtering.
        was_filtered: True when the pass suppressed (or is in the middle of
            suppressing) some text.
        removed_blocks: Fragments that were removed, in removal order.
    """
    content: str  # Filtered content
    was_filtered: bool  # Whether any filtering occurred
    removed_blocks: list[str] = field(default_factory=list)  # What was removed


class CodeBlockFilter:
    """Strip tool-call leakage from model text, either streamed or complete.

    Removes fenced code blocks (```...```), bracket pseudo-calls such as
    ``[calls X tool with ...]``, JSON tool-call objects, and lines that narrate
    or announce tool use.  The streaming entry point keeps state between
    chunks so that a construct split across chunk boundaries is still removed.

    Candidate for removal once the typed runtime makes tool-call leakage
    structurally impossible.
    """

    # --- Start markers used by the streaming filter -------------------------
    _BRACKET_START = re.compile(r'\[(?=(?:calls?|USE|output)\s*[:\s])', re.IGNORECASE)
    _JSON_TOOL_START = re.compile(r'\{\s*"name"\s*:\s*"(?:write|read|edit|bash|glob|grep)"')
    _NARRATION_INLINE = re.compile(
        r'([Uu]sed\s+`?(?:bash|write|read|edit|glob|grep)`?\s+tool|'
        r'[Uu]sing\s+the\s+`?(?:bash|write|read|edit|glob|grep)`?\s+tool|'
        r'with\s+file_path\s*=\s*[`\'"]|'
        r'with\s+command\s*[`\'"]|'
        r'[Hh]ere\s+is\s+what\s+[Ii]\s+did:)'
    )
    _PREAMBLE_INLINE = re.compile(
        r'(Here is a JSON response|Here are the function calls|'
        r'Here is the response with|I will respond with|'
        r'The following JSON|Below is the)',
        re.IGNORECASE,
    )
    _FENCE_OPEN = re.compile(r'```(\w*)\n?')

    # --- Whole-text patterns used by filter_complete, in application order ---
    _CODE_BLOCK = re.compile(r'```\w*\n?[\s\S]*?```')
    _BRACKET_CALLS = tuple(re.compile(p, re.IGNORECASE) for p in (
        r'\[calls?\s+\w+\s+tool\s+with[:\s][^\]]+\]',
        r'\[USE\s+\w+\s+tool[:\s][^\]]+\]',
        r'\[output[:\s][^\]]+\]',  # Fake outputs from model
    ))
    _PREAMBLE_LINES = tuple(re.compile(p, re.IGNORECASE | re.MULTILINE) for p in (
        r'^.*Here is a JSON response.*$',
        r'^.*Here are the function calls.*$',
        r'^.*Here is the response with.*$',
        r'^.*I will respond with.*$',
        r'^.*The following (JSON|function calls|tool calls).*$',
        r'^.*Below (is|are) the (JSON|function|tool).*$',
    ))
    _NARRATION_LINES = tuple(re.compile(p, re.MULTILINE) for p in (
        r'^.*[Uu]sed\s+`?(?:bash|write|read|edit|glob|grep)`?\s+tool.*$',  # "Used bash tool..."
        r'^.*[Uu]sing\s+the\s+`?(?:bash|write|read|edit|glob|grep)`?\s+tool.*$',  # "...using the write tool"
        r'^.*with\s+file_path\s*=\s*[`\'"][^`\'"]+[`\'"].*$',  # Narrated file_path parameter
        r'^.*with\s+command\s*[`\'"][^`\'"]+[`\'"].*$',  # Narrated bash command
        r'^\s*\*\s*[Uu]sed\s+`.*$',  # "* Used `bash`..." (bullet point narration)
        r'^.*[Hh]ere\s+is\s+what\s+[Ii]\s+did:.*$',  # "Here is what I did:"
        r'^\s*\d+\.\s+[Uu]sed\s+.*tool.*$',  # "1. Used bash tool..."
        r'^\s*\d+\.\s+[Cc]reated\s+.*using\s+the\s+.*tool.*$',  # "1. Created... using the write tool"
    ))
    _INTERNAL_PROMPTS = tuple(re.compile(p, re.MULTILINE) for p in (
        # Recovery prompts
        r'## TOOL FAILURE - INVESTIGATE AND ADAPT[\s\S]*?What will you do\?',
        r'## REQUIRED: Choose ONE[\s\S]*?(?=\n\n|\Z)',
        r'## CRITICAL RULES:[\s\S]*?(?=\n\n|\Z)',
        r'## Current attempt:.*$',
        r'\*\*Your next action should gather information[\s\S]*?What will you do\?',
        # Observation prefixes
        r'^Observation \[[\w]+\]:.*$',
    ))

    def __init__(self):
        self.reset()

    def reset(self):
        """Reset filter state."""
        self._buffer = ""
        self._in_code_block = False
        self._block_lang = ""
        self._current_block = ""
        self._in_bracket = False
        self._bracket_content = ""
        self._in_json_tool = False
        self._json_brace_count = 0
        # String/escape tracking so braces inside JSON strings are not counted.
        self._json_in_string = False
        self._json_escaped = False
        self._json_content = ""

    def _is_bracket_tool_start(self, text: str) -> bool:
        """Check if text looks like start of a bracket tool call."""
        # Patterns like: [calls, [call, [USE
        return bool(re.match(r'\[(?:calls?|USE)\s', text, re.IGNORECASE))

    @staticmethod
    def _scan_json(text: str, depth: int, in_string: bool,
                   escaped: bool) -> tuple[int, int, bool, bool]:
        """Advance JSON brace/string state over ``text``.

        Returns ``(end, depth, in_string, escaped)`` where ``end`` is the index
        just past the brace that brings ``depth`` back to zero, or ``-1`` when
        ``text`` ends before the object closes.  Braces inside double-quoted
        strings are ignored.
        """
        for i, ch in enumerate(text):
            if in_string:
                if escaped:
                    escaped = False
                elif ch == '\\':
                    escaped = True
                elif ch == '"':
                    in_string = False
            elif ch == '"':
                in_string = True
            elif ch == '{':
                depth += 1
            elif ch == '}':
                depth -= 1
                if depth == 0:
                    return i + 1, depth, in_string, escaped
        return -1, depth, in_string, escaped

    def _earliest_marker(self):
        """Return ``(kind, match)`` for the start marker nearest the buffer start.

        ``kind`` is ``"bracket"``, ``"json"``, ``"line"`` or ``"fence"``;
        ``None`` is returned when the buffer contains no marker.
        """
        candidates = (
            ("bracket", self._BRACKET_START.search(self._buffer)),
            ("json", self._JSON_TOOL_START.search(self._buffer)),
            ("line", self._NARRATION_INLINE.search(self._buffer)),
            ("line", self._PREAMBLE_INLINE.search(self._buffer)),
            ("fence", self._FENCE_OPEN.search(self._buffer)),
        )
        found = [(kind, m) for kind, m in candidates if m]
        if not found:
            return None
        return min(found, key=lambda item: item[1].start())

    def _consume_code_block(self, removed: list[str]) -> bool:
        """Swallow code-block text; return False when more input is needed."""
        end = self._buffer.find('```')
        if end == -1:
            # Keep trailing backticks in the buffer: they may be the first
            # part of a closing fence that is completed by the next chunk.
            body = self._buffer.rstrip('`')
            self._current_block += body
            self._buffer = self._buffer[len(body):]
            return False
        self._current_block += self._buffer[:end]
        removed.append(f"```{self._block_lang}\n{self._current_block}```")
        self._buffer = self._buffer[end + 3:]
        self._in_code_block = False
        self._current_block = ""
        self._block_lang = ""
        return True

    def _consume_bracket(self, removed: list[str]) -> None:
        """Swallow bracket-call text up to and including the closing ``]``."""
        end = self._buffer.find(']')
        if end == -1:
            self._bracket_content += self._buffer
            self._buffer = ""
            return
        self._bracket_content += self._buffer[:end]
        removed.append(f"[{self._bracket_content}]")
        self._buffer = self._buffer[end + 1:]
        self._in_bracket = False
        self._bracket_content = ""

    def _consume_json(self, removed: list[str]) -> None:
        """Swallow JSON tool-call text up to its matching closing brace."""
        end, self._json_brace_count, self._json_in_string, self._json_escaped = (
            self._scan_json(self._buffer, self._json_brace_count,
                            self._json_in_string, self._json_escaped)
        )
        if end == -1:
            self._json_content += self._buffer
            self._buffer = ""
            return
        removed.append(self._json_content + self._buffer[:end])
        self._buffer = self._buffer[end:]
        self._in_json_tool = False
        self._json_content = ""

    def filter_chunk(self, chunk: str) -> FilterResult:
        """Filter a streaming chunk, removing code blocks and bracket tool calls.

        Text may be held back inside the filter until a later chunk resolves
        it: trailing backticks (a possible partial fence) and a suspect
        narration/preamble line that has not yet been terminated by a newline.
        Call :meth:`flush` after the last chunk to retrieve such text.

        Args:
            chunk: The next piece of streamed text.

        Returns:
            A ``FilterResult`` with the text that can be shown now, whether any
            suppression happened during this call, and the fragments whose
            removal completed during this call.
        """
        if not chunk:
            return FilterResult(content="", was_filtered=False)

        kept: list[str] = []
        removed: list[str] = []
        was_filtered = False
        self._buffer += chunk

        while self._buffer:
            # An open suppressed region always takes priority: its interior
            # must never be scanned for other markers or emitted.
            if self._in_code_block:
                was_filtered = True
                if not self._consume_code_block(removed):
                    break
                continue
            if self._in_bracket:
                was_filtered = True
                self._consume_bracket(removed)
                continue
            if self._in_json_tool:
                was_filtered = True
                self._consume_json(removed)
                continue

            marker = self._earliest_marker()
            if marker is None:
                # Emit everything except trailing backticks, which could be
                # the beginning of a ``` fence split across chunks.
                text = self._buffer.rstrip('`')
                kept.append(text)
                self._buffer = self._buffer[len(text):]
                break

            kind, match = marker
            if kind == "line":
                # Narration/preamble: drop the whole line containing the match.
                line_start = self._buffer.rfind('\n', 0, match.start()) + 1
                line_end = self._buffer.find('\n', match.end())
                kept.append(self._buffer[:line_start])
                if line_end == -1:
                    # Line is not finished yet; wait for more input.
                    self._buffer = self._buffer[line_start:]
                    break
                removed.append(self._buffer[line_start:line_end])
                self._buffer = self._buffer[line_end:]
                was_filtered = True
                continue

            # Region markers: emit the text before them, then enter the state.
            kept.append(self._buffer[:match.start()])
            was_filtered = True
            if kind == "bracket":
                self._buffer = self._buffer[match.start() + 1:]  # Skip the [
                self._in_bracket = True
            elif kind == "json":
                self._buffer = self._buffer[match.start():]
                self._in_json_tool = True
                self._json_brace_count = 0  # Counting starts at the leading {
                self._json_in_string = False
                self._json_escaped = False
                self._json_content = ""
            else:  # fence
                self._block_lang = match.group(1)
                self._buffer = self._buffer[match.end():]
                self._in_code_block = True

        return FilterResult(
            content="".join(kept),
            was_filtered=was_filtered,
            removed_blocks=removed,
        )

    def flush(self) -> FilterResult:
        """Finish a stream: resolve held-back text and reset the filter.

        An unterminated code block, bracket call or JSON call is reported in
        ``removed_blocks``.  A held-back line that matches a narration or
        preamble marker is removed; any other held-back text (for example a
        trailing backtick) is returned as content.
        """
        removed: list[str] = []
        kept = ""
        if self._in_code_block:
            removed.append(f"```{self._block_lang}\n{self._current_block}{self._buffer}")
        elif self._in_bracket:
            removed.append(f"[{self._bracket_content}{self._buffer}")
        elif self._in_json_tool:
            removed.append(self._json_content + self._buffer)
        elif (self._NARRATION_INLINE.search(self._buffer)
              or self._PREAMBLE_INLINE.search(self._buffer)):
            removed.append(self._buffer)
        else:
            kept = self._buffer
        self.reset()
        return FilterResult(content=kept, was_filtered=bool(removed), removed_blocks=removed)

    @staticmethod
    def _strip(text: str, regex: "re.Pattern[str]", removed: list[str]) -> str:
        """Delete every match of ``regex`` from ``text``, recording each one.

        Uses ``finditer``/``group(0)`` so the full matched text is recorded
        even when the pattern contains capture groups.
        """
        removed.extend(m.group(0) for m in regex.finditer(text))
        return regex.sub('', text)

    def _remove_json_tool_calls(self, text: str, removed: list[str]) -> str:
        """Delete JSON tool-call objects (with nested braces) from ``text``."""
        while True:
            match = self._JSON_TOOL_START.search(text)
            if not match:
                return text
            end, _, _, _ = self._scan_json(text[match.start():], 0, False, False)
            if end == -1:
                return text  # Couldn't find matching brace
            stop = match.start() + end
            removed.append(text[match.start():stop])
            text = text[:match.start()] + text[stop:]

    def filter_complete(self, content: str) -> FilterResult:
        """Filter complete content (non-streaming), removing code blocks, bracket tool calls, and preambles.

        Removal order: fenced code blocks, bracket calls, JSON tool calls,
        preamble lines, narrated tool-use lines, internal recovery prompts.
        Runs of three or more newlines left behind are collapsed to two and
        the result is stripped.  Streaming state is not touched.

        Args:
            content: The full text to filter.

        Returns:
            A ``FilterResult`` whose ``removed_blocks`` holds the full text of
            every removed fragment.
        """
        removed: list[str] = []

        filtered = self._strip(content, self._CODE_BLOCK, removed)
        for regex in self._BRACKET_CALLS:
            filtered = self._strip(filtered, regex, removed)

        filtered = self._remove_json_tool_calls(filtered, removed)

        for regex in self._PREAMBLE_LINES + self._NARRATION_LINES + self._INTERNAL_PROMPTS:
            filtered = self._strip(filtered, regex, removed)

        # Clean up multiple blank lines left behind
        filtered = re.sub(r'\n{3,}', '\n\n', filtered)

        return FilterResult(
            content=filtered.strip(),
            was_filtered=bool(removed),
            removed_blocks=removed,
        )
| 353 | | - |
| 354 | | - |
| 355 | | -@dataclass |
| 356 | | -class PatternMatch: |
| 357 | | - """A detected problematic pattern.""" |
| 358 | | - pattern_type: str # 'code_block', 'narration', 'preview', 'repetition' |
| 359 | | - match_text: str |
| 360 | | - severity: str # 'low', 'medium', 'high' |
| 361 | | - |
| 362 | | - |
| 363 | | -class PatternDetector: |
| 364 | | - """Detects problematic patterns in agent output. |
| 365 | | - |
| 366 | | - Patterns include: |
| 367 | | - - Code blocks (which should be tool calls instead) |
| 368 | | - - Narration ("I will call...", "Now I'll...") |
| 369 | | - - Previews ("The file will look like:", "After editing:") |
| 370 | | - - Repetitive commands |
| 371 | | - """ |
| 372 | | - |
| 373 | | - # Narration patterns - model announcing what it will do instead of doing it |
| 374 | | - NARRATION_PATTERNS = [ |
| 375 | | - (r"I('ll| will) (use|call|execute|run) the (\w+) tool", "narration", "high"), |
| 376 | | - (r"Let me (use|call|execute|run) the (\w+) tool", "narration", "high"), |
| 377 | | - (r"Now I('ll| will) (create|write|edit|run|execute)", "narration", "medium"), |
| 378 | | - (r"I('m going to| am going to) (use|call|create|write)", "narration", "medium"), |
| 379 | | - (r"First,? I('ll| will) (use|call|create)", "narration", "medium"), |
| 380 | | - (r"Next,? I('ll| will) (use|call|create)", "narration", "medium"), |
| 381 | | - ] |
| 382 | | - |
| 383 | | - # Preview patterns - model showing content instead of using tools |
| 384 | | - PREVIEW_PATTERNS = [ |
| 385 | | - (r"(The|This) file will (look like|contain|have):", "preview", "high"), |
| 386 | | - (r"After editing,? (the file|it) will (look like|contain):", "preview", "high"), |
| 387 | | - (r"Here('s| is) (the|what) (content|code|file):", "preview", "high"), |
| 388 | | - (r"Save this (to|as|in) [\w./]+:", "preview", "high"), |
| 389 | | - (r"Create a file (with|containing):", "preview", "medium"), |
| 390 | | - (r"(The|Your) [\w./]+ (should|will) (look like|contain):", "preview", "medium"), |
| 391 | | - ] |
| 392 | | - |
| 393 | | - # Preamble patterns - model describing JSON/function calls instead of using them |
| 394 | | - PREAMBLE_PATTERNS = [ |
| 395 | | - (r"Here is a JSON response", "preamble", "high"), |
| 396 | | - (r"Here are the function calls", "preamble", "high"), |
| 397 | | - (r"Here is the response with", "preamble", "high"), |
| 398 | | - (r"I will respond with", "preamble", "high"), |
| 399 | | - (r"The following (JSON|function calls|tool calls)", "preamble", "high"), |
| 400 | | - (r"Below (is|are) the (JSON|function|tool)", "preamble", "high"), |
| 401 | | - ] |
| 402 | | - |
| 403 | | - # Code block patterns |
| 404 | | - CODE_BLOCK_PATTERNS = [ |
| 405 | | - (r'```\w+\n', "code_block", "high"), |
| 406 | | - (r'```\n', "code_block", "medium"), |
| 407 | | - ] |
| 408 | | - |
| 409 | | - def __init__(self): |
| 410 | | - self._all_patterns = ( |
| 411 | | - self.NARRATION_PATTERNS + |
| 412 | | - self.PREVIEW_PATTERNS + |
| 413 | | - self.PREAMBLE_PATTERNS + |
| 414 | | - self.CODE_BLOCK_PATTERNS |
| 415 | | - ) |
| 416 | | - self._recent_detections: list[PatternMatch] = [] |
| 417 | | - |
| 418 | | - def reset(self): |
| 419 | | - """Reset detection state.""" |
| 420 | | - self._recent_detections = [] |
| 421 | | - |
| 422 | | - def detect(self, content: str) -> list[PatternMatch]: |
| 423 | | - """Detect problematic patterns in content.""" |
| 424 | | - matches = [] |
| 425 | | - |
| 426 | | - for pattern, ptype, severity in self._all_patterns: |
| 427 | | - for match in re.finditer(pattern, content, re.IGNORECASE): |
| 428 | | - matches.append(PatternMatch( |
| 429 | | - pattern_type=ptype, |
| 430 | | - match_text=match.group(0), |
| 431 | | - severity=severity, |
| 432 | | - )) |
| 433 | | - |
| 434 | | - self._recent_detections.extend(matches) |
| 435 | | - return matches |
| 436 | | - |
| 437 | | - def has_high_severity(self, content: str) -> bool: |
| 438 | | - """Check if content has high-severity patterns.""" |
| 439 | | - matches = self.detect(content) |
| 440 | | - return any(m.severity == "high" for m in matches) |
| 441 | | - |
| 442 | | - def get_steering_message(self, matches: list[PatternMatch]) -> str | None: |
| 443 | | - """Generate a steering message based on detected patterns. |
| 444 | | - |
| 445 | | - Returns None if no steering needed. |
| 446 | | - """ |
| 447 | | - if not matches: |
| 448 | | - return None |
| 449 | | - |
| 450 | | - # Prioritize high severity |
| 451 | | - high_severity = [m for m in matches if m.severity == "high"] |
| 452 | | - if not high_severity: |
| 453 | | - return None |
| 454 | | - |
| 455 | | - # Generate appropriate steering message |
| 456 | | - pattern_types = set(m.pattern_type for m in high_severity) |
| 457 | | - |
| 458 | | - if "preamble" in pattern_types: |
| 459 | | - return ( |
| 460 | | - "[STOP] Do not describe JSON or function calls. " |
| 461 | | - "Just USE the tools directly. No preambles." |
| 462 | | - ) |
| 463 | | - elif "code_block" in pattern_types or "preview" in pattern_types: |
| 464 | | - return ( |
| 465 | | - "[REMINDER] Do not show code blocks or previews. " |
| 466 | | - "Use tools directly to create/edit files. " |
| 467 | | - "No ```code```, just call the tool." |
| 468 | | - ) |
| 469 | | - elif "narration" in pattern_types: |
| 470 | | - return ( |
| 471 | | - "[REMINDER] Don't announce tool calls. " |
| 472 | | - "Just use the tool directly without narration." |
| 473 | | - ) |
| 474 | | - |
| 475 | | - return None |
| 476 | | - |
| 477 | | - |
class RuntimeSafeguards:
    """Facade bundling the output filter, pattern detector, action tracker and validator.

    Typical call sequence::

        guards = RuntimeSafeguards()

        # While streaming model output:
        visible = guards.filter_stream_chunk(chunk)
        if guards.should_steer():
            hint = guards.get_steering_message()

        # Before running a tool:
        duplicate, why = guards.check_duplicate(tool_name, args)
        verdict = guards.validate_action(tool_name, args)

        # After running a tool:
        guards.record_action(tool_name, args)
    """

    def __init__(self):
        self.code_filter = CodeBlockFilter()
        self.pattern_detector = PatternDetector()
        self.action_tracker = ActionTracker()
        self.validator = PreActionValidator()
        # Steering text waiting to be picked up by get_steering_message().
        self._pending_steering: str | None = None
        # Raw streamed text not yet scanned for patterns.
        self._accumulated_content = ""

    def reset(self):
        """Return every stateful component to its initial state.

        The validator is left untouched.
        """
        for component in (self.code_filter, self.pattern_detector, self.action_tracker):
            component.reset()
        self._pending_steering = None
        self._accumulated_content = ""

    def filter_stream_chunk(self, chunk: str) -> str:
        """Filter one streamed chunk and return the text that may be shown.

        The raw chunk is also accumulated; once more than 200 characters have
        piled up they are scanned for steering-worthy patterns and all but the
        last 100 characters are discarded.
        """
        visible = self.code_filter.filter_chunk(chunk).content

        window = self._accumulated_content + chunk
        if len(window) <= 200:
            # Not enough text yet to be worth scanning.
            self._accumulated_content = window
            return visible

        detected = self.pattern_detector.detect(window)
        if detected:
            message = self.pattern_detector.get_steering_message(detected)
            if message:
                self._pending_steering = message
        # Keep a tail so a pattern straddling the cut can still be matched.
        self._accumulated_content = window[-100:]
        return visible

    def filter_complete_content(self, content: str) -> str:
        """Filter a whole (non-streamed) response and return the cleaned text.

        The unfiltered text is scanned for patterns; a resulting steering
        message, if any, is queued.
        """
        cleaned = self.code_filter.filter_complete(content).content

        detected = self.pattern_detector.detect(content)
        if detected:
            message = self.pattern_detector.get_steering_message(detected)
            if message:
                self._pending_steering = message

        return cleaned

    def should_steer(self) -> bool:
        """Tell whether a steering message is queued."""
        return self._pending_steering is not None

    def get_steering_message(self) -> str | None:
        """Hand out the queued steering message (or None) and clear the queue."""
        message, self._pending_steering = self._pending_steering, None
        return message

    def check_duplicate(self, tool_name: str, arguments: dict) -> tuple[bool, str]:
        """Ask the action tracker whether this tool call is a duplicate."""
        return self.action_tracker.check_tool_call(tool_name, arguments)

    def record_action(self, tool_name: str, arguments: dict) -> None:
        """Log a finished tool call with the action tracker."""
        self.action_tracker.record_tool_call(tool_name, arguments)

    def detect_loop(self) -> tuple[bool, str]:
        """Delegate repetitive-loop detection to the action tracker.

        Returns (is_loop, pattern_description).
        """
        return self.action_tracker.detect_loop()

    def validate_action(self, tool_name: str, arguments: dict) -> ValidationResult:
        """Run the pre-action validator on a tool call and return its result."""
        return self.validator.validate(tool_name, arguments)

    def record_response(self, response: str) -> None:
        """Log a text response with the action tracker for loop detection."""
        self.action_tracker.record_response(response)

    def detect_text_loop(self, response: str) -> tuple[bool, str]:
        """Delegate repeated-response detection to the action tracker.

        Returns (is_loop, description).
        """
        return self.action_tracker.detect_text_loop(response)
| 12 | +__all__ = [ |
| 13 | + "ActionTracker", |
| 14 | + "CodeBlockFilter", |
| 15 | + "FilterResult", |
| 16 | + "PatternDetector", |
| 17 | + "PatternMatch", |
| 18 | + "PreActionValidator", |
| 19 | + "RuntimeSafeguards", |
| 20 | + "ValidationResult", |
| 21 | +] |