@@ -6,6 +6,71 @@ import re |
| 6 | 6 | |
| 7 | 7 | from .reasoning_types import TaskCompletionCheck |
| 8 | 8 | |
# Verbs whose presence (as a plain substring of the lower-cased task) marks the
# task as one that requires the assistant to actually do something.
_ACTION_VERBS = ("create", "write", "make", "edit", "fix", "add", "delete", "run")
# Phrases that mark a task as a larger, multi-step job; such tasks are treated
# as unfinished when fewer than three actions were taken.
_COMPLEX_INDICATORS = (
    "set up a project",
    "create a project",
    "build a complete",
    "scaffold",
    "initialize a new",
    "create a full",
    "implement a full",
    "develop a complete",
)
# Phrases that mark a small, single-step task. They are matched as substrings,
# so short entries such as "list" also match inside longer words.
_SIMPLE_TASK_INDICATORS = (
    "create a file",
    "write a file",
    "make a file",
    "add a function",
    "edit the",
    "fix the",
    "update the",
    "read the",
    "show me",
    "list",
    "design a webpage",
    "create a webpage",
    "make a webpage",
    "create a page",
    "design a page",
    "create an html",
    "make an html",
    "write an html",
    "help me design",
    "create a simple",
    "make a simple",
    "write a simple",
)
# Task phrases that explicitly ask for the result to be run or verified.
_VERIFICATION_INDICATORS = ("and test", "and run", "and verify", "make sure it works")
# Response phrases that hand the next step back to the user instead of doing it.
_DEFLECTION_PHRASES = ("you can now", "you should", "you can run", "you can use")
# Task prefixes (note the trailing space) checked with str.startswith to
# recognise requests that only need an answer, not an action.
_INFORMATIONAL_PREFIXES = (
    "explain ",
    "describe ",
    "summarize ",
    "compare ",
    "outline ",
    "review ",
    "analyze ",
    "what ",
    "how ",
    "why ",
    "which ",
    "who ",
    "where ",
    "when ",
)
# Whole responses (after lower-casing and stripping) that count as an explicit
# completion statement; compared by exact membership, not substring.
_EXPLICIT_COMPLETIONS = {
    "done",
    "done.",
    "completed",
    "completed.",
    "all set",
    "all set.",
}
# Task substrings that mean dependencies or project setup are part of the job.
_INSTALL_HINTS = ("install", "dependencies", "set up project")
# Task substrings used to pick the ecosystem for install evidence / follow-ups.
_NODE_HINTS = ("node", "npm")
_PYTHON_HINTS = ("python", "pip")
| 73 | + |
| 9 | 74 | COMPLETION_CHECK_PROMPT = """Evaluate if this task has been FULLY completed. |
| 10 | 75 | |
| 11 | 76 | Original task: {task} |
@@ -39,146 +104,152 @@ def detect_premature_completion( |
| 39 | 104 | actions_taken: list[str], |
| 40 | 105 | ) -> bool: |
| 41 | 106 | """Heuristically detect when the assistant is stopping too early.""" |
| 42 | | - |
| 43 | | - task_lower = task.lower() |
| 44 | | - response_lower = response.lower() |
| 45 | | - |
| 46 | | - if not actions_taken: |
| 47 | | - explicit_completion = response_lower.strip() |
| 48 | | - if explicit_completion in { |
| 49 | | - "done", |
| 50 | | - "done.", |
| 51 | | - "completed", |
| 52 | | - "completed.", |
| 53 | | - "all set", |
| 54 | | - "all set.", |
| 55 | | - }: |
| 56 | | - return False |
| 57 | | - action_verbs = ["create", "write", "make", "edit", "fix", "add", "delete", "run"] |
| 58 | | - if any(verb in task_lower for verb in action_verbs): |
| 59 | | - return True |
| 60 | | - return False |
| 61 | | - |
| 62 | | - success_indicators = [ |
| 63 | | - "successfully", |
| 64 | | - "created", |
| 65 | | - "written", |
| 66 | | - "done", |
| 67 | | - "completed", |
| 68 | | - "file now contains", |
| 69 | | - "has been updated", |
| 70 | | - "installed", |
| 71 | | - ] |
| 72 | | - if any(indicator in response_lower for indicator in success_indicators): |
| 73 | | - return False |
| 74 | | - |
| 75 | | - complex_indicators = [ |
| 76 | | - "set up a project", |
| 77 | | - "create a project", |
| 78 | | - "build a complete", |
| 79 | | - "scaffold", |
| 80 | | - "initialize a new", |
| 81 | | - "create a full", |
| 82 | | - "implement a full", |
| 83 | | - "develop a complete", |
| 84 | | - ] |
| 85 | | - is_complex = any(indicator in task_lower for indicator in complex_indicators) |
| 86 | | - |
| 87 | | - simple_creation = [ |
| 88 | | - "create a file", |
| 89 | | - "write a file", |
| 90 | | - "make a file", |
| 91 | | - "add a function", |
| 92 | | - "edit the", |
| 93 | | - "fix the", |
| 94 | | - "update the", |
| 95 | | - "read the", |
| 96 | | - "show me", |
| 97 | | - "list", |
| 98 | | - "design a webpage", |
| 99 | | - "create a webpage", |
| 100 | | - "make a webpage", |
| 101 | | - "create a page", |
| 102 | | - "design a page", |
| 103 | | - "create an html", |
| 104 | | - "make an html", |
| 105 | | - "write an html", |
| 106 | | - "help me design", |
| 107 | | - "create a simple", |
| 108 | | - "make a simple", |
| 109 | | - "write a simple", |
| 110 | | - ] |
| 111 | | - is_simple = any(indicator in task_lower for indicator in simple_creation) |
| 112 | | - |
| 113 | | - if "write" in str(actions_taken).lower() and len(actions_taken) >= 1: |
| 107 | + if not actions_taken and response.lower().strip() in _EXPLICIT_COMPLETIONS: |
| 114 | 108 | return False |
| 115 | | - if is_simple and len(actions_taken) >= 1: |
| 116 | | - return False |
| 117 | | - |
| 118 | | - explicit_verification = ["and test", "and run", "and verify", "make sure it works"] |
| 119 | | - needs_verification = any(indicator in task_lower for indicator in explicit_verification) |
| 120 | | - |
| 121 | | - action_types = set() |
| 122 | | - for action in actions_taken: |
| 123 | | - action_lower = action.lower() |
| 124 | | - if "write" in action_lower: |
| 125 | | - action_types.add("write") |
| 126 | | - elif "edit" in action_lower: |
| 127 | | - action_types.add("edit") |
| 128 | | - elif "bash" in action_lower: |
| 129 | | - action_types.add("bash") |
| 130 | | - elif "read" in action_lower: |
| 131 | | - action_types.add("read") |
| 132 | | - elif "glob" in action_lower or "grep" in action_lower: |
| 133 | | - action_types.add("search") |
| 134 | | - |
| 135 | | - if is_complex and len(actions_taken) < 3: |
| 136 | | - return True |
| 137 | | - if needs_verification and "bash" not in action_types: |
| 138 | | - return True |
| 139 | | - |
| 140 | | - deflection_phrases = ["you can now", "you should", "you can run", "you can use"] |
| 141 | | - if any(phrase in response_lower for phrase in deflection_phrases) and len(actions_taken) < 2: |
| 142 | | - return True |
| 143 | | - |
| 144 | | - return False |
| 109 | + return not assess_completion_follow_through( |
| 110 | + task=task, |
| 111 | + response=response, |
| 112 | + actions_taken=actions_taken, |
| 113 | + ).is_complete |
| 145 | 114 | |
| 146 | 115 | |
def get_continuation_prompt(task: str, actions_taken: list[str], response: str) -> str:
    """Return the follow-through prompt for a response that may be incomplete.

    The prompt text is taken from the ``continuation_prompt`` field of the
    assessment built by ``assess_completion_follow_through`` for the same
    task, response and actions.
    """
    assessment = assess_completion_follow_through(
        task=task,
        response=response,
        actions_taken=actions_taken,
    )
    return assessment.continuation_prompt
| 149 | 123 | |
| 150 | | - del response |
| 151 | 124 | |
| 152 | | - task_lower = task.lower() |
| 153 | | - follow_ups: list[str] = [] |
| 125 | +def assess_completion_follow_through( |
| 126 | + *, |
| 127 | + task: str, |
| 128 | + response: str, |
| 129 | + actions_taken: list[str], |
| 130 | +) -> TaskCompletionCheck: |
| 131 | + """Build a typed follow-through assessment for one candidate response.""" |
| 132 | + |
| 133 | + task_lower = task.lower().strip() |
| 134 | + response_lower = response.lower().strip() |
| 135 | + action_types = _action_types(actions_taken) |
| 136 | + informational = _is_informational_task(task_lower) |
| 137 | + complex_task = any(indicator in task_lower for indicator in _COMPLEX_INDICATORS) |
| 138 | + simple_task = any(indicator in task_lower for indicator in _SIMPLE_TASK_INDICATORS) |
| 139 | + requires_verification = any( |
| 140 | + indicator in task_lower for indicator in _VERIFICATION_INDICATORS |
| 141 | + ) |
| 142 | + requires_install = any(indicator in task_lower for indicator in _INSTALL_HINTS) |
| 143 | + |
| 144 | + accomplished = [_summarize_action(action) for action in actions_taken] |
| 145 | + required_evidence = _required_evidence( |
| 146 | + task_lower=task_lower, |
| 147 | + informational=informational, |
| 148 | + complex_task=complex_task, |
| 149 | + requires_verification=requires_verification, |
| 150 | + requires_install=requires_install, |
| 151 | + ) |
| 152 | + missing_evidence: list[str] = [] |
| 153 | + remaining: list[str] = [] |
| 154 | + suggested_next_steps: list[str] = [] |
| 155 | + |
| 156 | + if informational: |
| 157 | + return TaskCompletionCheck( |
| 158 | + original_task=task, |
| 159 | + is_complete=bool(response.strip()), |
| 160 | + accomplished=accomplished, |
| 161 | + required_evidence=required_evidence, |
| 162 | + missing_evidence=[], |
| 163 | + remaining=[], |
| 164 | + suggested_next_steps=[], |
| 165 | + continuation_prompt=_format_continuation_prompt( |
| 166 | + task=task, |
| 167 | + missing_evidence=[], |
| 168 | + suggested_next_steps=[], |
| 169 | + action_count=len(actions_taken), |
| 170 | + ), |
| 171 | + ) |
| 172 | + |
| 173 | + if not actions_taken and _requires_action(task_lower): |
| 174 | + _append_follow_through_gap( |
| 175 | + missing_evidence, |
| 176 | + remaining, |
| 177 | + suggested_next_steps, |
| 178 | + evidence="showing the requested work was actually carried out", |
| 179 | + remaining_item="Perform the requested work instead of stopping at intent or narration", |
| 180 | + next_step="Carry out the requested change or command now", |
| 181 | + ) |
| 154 | 182 | |
| 155 | | - if any(keyword in task_lower for keyword in ["install", "dependencies", "set up project"]): |
| 156 | | - if "node" in task_lower or "npm" in task_lower: |
| 157 | | - if not any("npm" in action for action in actions_taken): |
| 158 | | - follow_ups.append("Run `npm install` to install dependencies") |
| 159 | | - if "python" in task_lower or "pip" in task_lower: |
| 160 | | - if not any("pip" in action or "uv" in action for action in actions_taken): |
| 161 | | - follow_ups.append("Install dependencies") |
| 183 | + if requires_install and not _has_install_evidence(task_lower, action_types, actions_taken): |
| 184 | + _append_follow_through_gap( |
| 185 | + missing_evidence, |
| 186 | + remaining, |
| 187 | + suggested_next_steps, |
| 188 | + evidence="showing dependencies or setup steps were completed", |
| 189 | + remaining_item="Install or initialize the required dependencies", |
| 190 | + next_step=_install_follow_up(task_lower), |
| 191 | + ) |
| 162 | 192 | |
| 163 | | - if "test" in task_lower and "run" in task_lower: |
| 164 | | - if not any("test" in action or "pytest" in action or "jest" in action for action in actions_taken): |
| 165 | | - follow_ups.append("Run the tests") |
| 193 | + if requires_verification and not _has_verification_evidence(action_types, actions_taken): |
| 194 | + _append_follow_through_gap( |
| 195 | + missing_evidence, |
| 196 | + remaining, |
| 197 | + suggested_next_steps, |
| 198 | + evidence="showing the result was run or verified", |
| 199 | + remaining_item="Run the result and capture a concrete verification outcome", |
| 200 | + next_step="Execute what you created or run the relevant tests now", |
| 201 | + ) |
| 166 | 202 | |
| 167 | | - if any(keyword in task_lower for keyword in ["and run", "and test", "and verify", "make sure it works"]): |
| 168 | | - follow_ups.append("Execute what was created to verify it works") |
| 203 | + if complex_task and len(actions_taken) < 3: |
| 204 | + _append_follow_through_gap( |
| 205 | + missing_evidence, |
| 206 | + remaining, |
| 207 | + suggested_next_steps, |
| 208 | + evidence="showing the broader end-to-end implementation or setup was completed", |
| 209 | + remaining_item="Finish the larger end-to-end task instead of stopping after a partial step", |
| 210 | + next_step="Continue through the remaining setup or implementation steps", |
| 211 | + ) |
| 169 | 212 | |
| 170 | | - if follow_ups: |
| 171 | | - steps = "\n".join(f"- {step}" for step in follow_ups[:2]) |
| 172 | | - return ( |
| 173 | | - f'The task was: "{task}"\n\n' |
| 174 | | - f"You may need to also:\n{steps}\n\n" |
| 175 | | - "If the task is actually complete, just confirm what was done." |
| 213 | + if ( |
| 214 | + any(phrase in response_lower for phrase in _DEFLECTION_PHRASES) |
| 215 | + and len(actions_taken) < 2 |
| 216 | + ): |
| 217 | + _append_follow_through_gap( |
| 218 | + missing_evidence, |
| 219 | + remaining, |
| 220 | + suggested_next_steps, |
| 221 | + evidence="showing execution evidence rather than instructions handed back to the user", |
| 222 | + remaining_item="Perform the work yourself or state concretely what you already verified", |
| 223 | + next_step="Continue the task instead of handing the next step to the user", |
| 176 | 224 | ) |
| 177 | 225 | |
| 178 | | - return ( |
| 179 | | - f'Task: "{task}"\n' |
| 180 | | - f"You took {len(actions_taken)} action(s). " |
| 181 | | - "If there's more to do, continue. Otherwise, confirm completion." |
| 226 | + if "write" in action_types and actions_taken and simple_task: |
| 227 | + missing_evidence = [ |
| 228 | + item |
| 229 | + for item in missing_evidence |
| 230 | + if item != "showing the requested work was actually carried out" |
| 231 | + ] |
| 232 | + remaining = [ |
| 233 | + item |
| 234 | + for item in remaining |
| 235 | + if item != "Perform the requested work instead of stopping at intent or narration" |
| 236 | + ] |
| 237 | + |
| 238 | + is_complete = not missing_evidence |
| 239 | + return TaskCompletionCheck( |
| 240 | + original_task=task, |
| 241 | + is_complete=is_complete, |
| 242 | + accomplished=accomplished, |
| 243 | + required_evidence=required_evidence, |
| 244 | + missing_evidence=missing_evidence, |
| 245 | + remaining=remaining, |
| 246 | + suggested_next_steps=suggested_next_steps, |
| 247 | + continuation_prompt=_format_continuation_prompt( |
| 248 | + task=task, |
| 249 | + missing_evidence=missing_evidence, |
| 250 | + suggested_next_steps=suggested_next_steps, |
| 251 | + action_count=len(actions_taken), |
| 252 | + ), |
| 182 | 253 | ) |
| 183 | 254 | |
| 184 | 255 | |
@@ -207,9 +278,154 @@ def parse_completion_check(response: str, original_task: str) -> TaskCompletionC |
| 207 | 278 | original_task=original_task, |
| 208 | 279 | is_complete=data.get("is_complete", False), |
| 209 | 280 | accomplished=data.get("accomplished", []), |
| 281 | + required_evidence=data.get("required_evidence", []), |
| 282 | + missing_evidence=data.get("missing_evidence", data.get("remaining", [])), |
| 210 | 283 | remaining=data.get("remaining", []), |
| 211 | 284 | suggested_next_steps=next_steps, |
| 212 | 285 | continuation_prompt=continuation, |
| 213 | 286 | ) |
| 214 | 287 | except json.JSONDecodeError: |
| 215 | 288 | return TaskCompletionCheck(original_task=original_task) |
| 289 | + |
| 290 | + |
| 291 | +def _action_types(actions_taken: list[str]) -> set[str]: |
| 292 | + action_types: set[str] = set() |
| 293 | + for action in actions_taken: |
| 294 | + action_lower = action.lower() |
| 295 | + if "write" in action_lower: |
| 296 | + action_types.add("write") |
| 297 | + elif "edit" in action_lower or "patch" in action_lower: |
| 298 | + action_types.add("edit") |
| 299 | + elif "bash" in action_lower or "shell" in action_lower: |
| 300 | + action_types.add("bash") |
| 301 | + elif "read" in action_lower: |
| 302 | + action_types.add("read") |
| 303 | + elif "glob" in action_lower or "grep" in action_lower or "search" in action_lower: |
| 304 | + action_types.add("search") |
| 305 | + elif "todo" in action_lower: |
| 306 | + action_types.add("workflow") |
| 307 | + return action_types |
| 308 | + |
| 309 | + |
def _is_informational_task(task_lower: str) -> bool:
    """Return True when the task opens with one of the informational prefixes.

    Args:
        task_lower: The task text; the comparison is case-sensitive, so the
            caller is expected to pass it already lower-cased and stripped.

    Returns:
        True if ``task_lower`` starts with any entry of
        ``_INFORMATIONAL_PREFIXES``, otherwise False.
    """
    # No separate "question word + trailing '?'" check is needed: the question
    # words ("what ", "how ", "why ", ...) are themselves members of
    # _INFORMATIONAL_PREFIXES, so such a check could never change the result.
    # NOTE(review): this means a task like "when done, run the tests" is also
    # treated as informational -- confirm whether question words should only
    # count when the task ends with "?".
    return task_lower.startswith(_INFORMATIONAL_PREFIXES)
| 318 | + |
| 319 | + |
def _requires_action(task_lower: str) -> bool:
    """Return True when the task text contains an action verb or simple-task phrase.

    Both checks are plain substring tests against ``task_lower``.
    """
    for marker in (*_ACTION_VERBS, *_SIMPLE_TASK_INDICATORS):
        if marker in task_lower:
            return True
    return False
| 324 | + |
| 325 | + |
def _required_evidence(
    *,
    task_lower: str,
    informational: bool,
    complex_task: bool,
    requires_verification: bool,
    requires_install: bool,
) -> list[str]:
    """List the kinds of evidence a complete response to this task must show.

    Informational tasks need no evidence and yield an empty list. Otherwise
    one entry is emitted per applicable requirement, always in the order:
    action carried out, install/setup, verification, end-to-end completion.
    """
    if informational:
        return []

    checklist = (
        (
            _requires_action(task_lower),
            "showing the requested work was actually carried out",
        ),
        (requires_install, "showing dependencies or setup steps were completed"),
        (requires_verification, "showing the result was run or verified"),
        (
            complex_task,
            "showing the broader end-to-end implementation or setup was completed",
        ),
    )
    return [evidence for needed, evidence in checklist if needed]
| 347 | + |
| 348 | + |
def _has_install_evidence(
    task_lower: str,
    action_types: set[str],
    actions_taken: list[str],
) -> bool:
    """Return True when the recorded actions suggest install/setup work was done.

    ``action_types`` is accepted for signature symmetry but not consulted.
    All checks are case-insensitive substring tests over the actions joined
    into one string.
    """
    del action_types  # intentionally unused
    joined_actions = " ".join(actions_taken).lower()

    def task_mentions(hints: tuple[str, ...]) -> bool:
        return any(hint in task_lower for hint in hints)

    node_evidence = task_mentions(_NODE_HINTS) and "npm" in joined_actions
    python_evidence = task_mentions(_PYTHON_HINTS) and any(
        tool in joined_actions for tool in ("pip", "uv")
    )
    # NOTE(review): these are substring matches, so e.g. "__init__.py" or
    # "setup.py" in an action also counts as evidence -- confirm intended.
    generic_evidence = any(
        word in joined_actions for word in ("install", "init", "setup")
    )
    return node_evidence or python_evidence or generic_evidence
| 363 | + |
| 364 | + |
| 365 | +def _has_verification_evidence( |
| 366 | + action_types: set[str], |
| 367 | + actions_taken: list[str], |
| 368 | +) -> bool: |
| 369 | + if "bash" in action_types: |
| 370 | + return True |
| 371 | + action_text = " ".join(actions_taken).lower() |
| 372 | + return any( |
| 373 | + token in action_text |
| 374 | + for token in ("test", "pytest", "jest", "verify", "run", "execute") |
| 375 | + ) |
| 376 | + |
| 377 | + |
def _install_follow_up(task_lower: str) -> str:
    """Pick the install follow-up step that fits the ecosystem named in the task.

    Node hints are checked before Python hints; a generic step is returned
    when neither set of hints appears in ``task_lower``.
    """
    ecosystem_steps = (
        (_NODE_HINTS, "Run `npm install` to install dependencies"),
        (_PYTHON_HINTS, "Install the Python dependencies"),
    )
    for hints, step in ecosystem_steps:
        if any(hint in task_lower for hint in hints):
            return step
    return "Install or initialize the required dependencies now"
| 384 | + |
| 385 | + |
| 386 | +def _append_follow_through_gap( |
| 387 | + missing_evidence: list[str], |
| 388 | + remaining: list[str], |
| 389 | + suggested_next_steps: list[str], |
| 390 | + *, |
| 391 | + evidence: str, |
| 392 | + remaining_item: str, |
| 393 | + next_step: str, |
| 394 | +) -> None: |
| 395 | + if evidence not in missing_evidence: |
| 396 | + missing_evidence.append(evidence) |
| 397 | + if remaining_item not in remaining: |
| 398 | + remaining.append(remaining_item) |
| 399 | + if next_step not in suggested_next_steps: |
| 400 | + suggested_next_steps.append(next_step) |
| 401 | + |
| 402 | + |
| 403 | +def _format_continuation_prompt( |
| 404 | + *, |
| 405 | + task: str, |
| 406 | + missing_evidence: list[str], |
| 407 | + suggested_next_steps: list[str], |
| 408 | + action_count: int, |
| 409 | +) -> str: |
| 410 | + if suggested_next_steps: |
| 411 | + evidence_lines = "\n".join(f"- {item}" for item in missing_evidence[:2]) |
| 412 | + step_lines = "\n".join(f"- {step}" for step in suggested_next_steps[:3]) |
| 413 | + return ( |
| 414 | + f'The task was: "{task}"\n\n' |
| 415 | + "The response still needs concrete evidence for:\n" |
| 416 | + f"{evidence_lines}\n\n" |
| 417 | + "Continue with:\n" |
| 418 | + f"{step_lines}\n\n" |
| 419 | + "If the task is actually complete, confirm the missing evidence explicitly." |
| 420 | + ) |
| 421 | + |
| 422 | + return ( |
| 423 | + f'Task: "{task}"\n' |
| 424 | + f"You took {action_count} action(s). " |
| 425 | + "If there's more to do, continue. Otherwise, confirm completion." |
| 426 | + ) |
| 427 | + |
| 428 | + |
| 429 | +def _summarize_action(action: str) -> str: |
| 430 | + head, _, _ = action.partition(":") |
| 431 | + return head.strip() or action.strip() |