@@ -6,6 +6,71 @@ import re |
| 6 | | 6 | |
| 7 | from .reasoning_types import TaskCompletionCheck | 7 | from .reasoning_types import TaskCompletionCheck |
| 8 | | 8 | |
| | 9 | +_ACTION_VERBS = ("create", "write", "make", "edit", "fix", "add", "delete", "run") |
| | 10 | +_COMPLEX_INDICATORS = ( |
| | 11 | + "set up a project", |
| | 12 | + "create a project", |
| | 13 | + "build a complete", |
| | 14 | + "scaffold", |
| | 15 | + "initialize a new", |
| | 16 | + "create a full", |
| | 17 | + "implement a full", |
| | 18 | + "develop a complete", |
| | 19 | +) |
| | 20 | +_SIMPLE_TASK_INDICATORS = ( |
| | 21 | + "create a file", |
| | 22 | + "write a file", |
| | 23 | + "make a file", |
| | 24 | + "add a function", |
| | 25 | + "edit the", |
| | 26 | + "fix the", |
| | 27 | + "update the", |
| | 28 | + "read the", |
| | 29 | + "show me", |
| | 30 | + "list", |
| | 31 | + "design a webpage", |
| | 32 | + "create a webpage", |
| | 33 | + "make a webpage", |
| | 34 | + "create a page", |
| | 35 | + "design a page", |
| | 36 | + "create an html", |
| | 37 | + "make an html", |
| | 38 | + "write an html", |
| | 39 | + "help me design", |
| | 40 | + "create a simple", |
| | 41 | + "make a simple", |
| | 42 | + "write a simple", |
| | 43 | +) |
| | 44 | +_VERIFICATION_INDICATORS = ("and test", "and run", "and verify", "make sure it works") |
| | 45 | +_DEFLECTION_PHRASES = ("you can now", "you should", "you can run", "you can use") |
| | 46 | +_INFORMATIONAL_PREFIXES = ( |
| | 47 | + "explain ", |
| | 48 | + "describe ", |
| | 49 | + "summarize ", |
| | 50 | + "compare ", |
| | 51 | + "outline ", |
| | 52 | + "review ", |
| | 53 | + "analyze ", |
| | 54 | + "what ", |
| | 55 | + "how ", |
| | 56 | + "why ", |
| | 57 | + "which ", |
| | 58 | + "who ", |
| | 59 | + "where ", |
| | 60 | + "when ", |
| | 61 | +) |
| | 62 | +_EXPLICIT_COMPLETIONS = { |
| | 63 | + "done", |
| | 64 | + "done.", |
| | 65 | + "completed", |
| | 66 | + "completed.", |
| | 67 | + "all set", |
| | 68 | + "all set.", |
| | 69 | +} |
| | 70 | +_INSTALL_HINTS = ("install", "dependencies", "set up project") |
| | 71 | +_NODE_HINTS = ("node", "npm") |
| | 72 | +_PYTHON_HINTS = ("python", "pip") |
| | 73 | + |
| 9 | COMPLETION_CHECK_PROMPT = """Evaluate if this task has been FULLY completed. | 74 | COMPLETION_CHECK_PROMPT = """Evaluate if this task has been FULLY completed. |
| 10 | | 75 | |
| 11 | Original task: {task} | 76 | Original task: {task} |
@@ -39,146 +104,152 @@ def detect_premature_completion( |
| 39 | actions_taken: list[str], | 104 | actions_taken: list[str], |
| 40 | ) -> bool: | 105 | ) -> bool: |
| 41 | """Heuristically detect when the assistant is stopping too early.""" | 106 | """Heuristically detect when the assistant is stopping too early.""" |
| 42 | - | 107 | + if not actions_taken and response.lower().strip() in _EXPLICIT_COMPLETIONS: |
| 43 | - task_lower = task.lower() | | |
| 44 | - response_lower = response.lower() | | |
| 45 | - | | |
| 46 | - if not actions_taken: | | |
| 47 | - explicit_completion = response_lower.strip() | | |
| 48 | - if explicit_completion in { | | |
| 49 | - "done", | | |
| 50 | - "done.", | | |
| 51 | - "completed", | | |
| 52 | - "completed.", | | |
| 53 | - "all set", | | |
| 54 | - "all set.", | | |
| 55 | - }: | | |
| 56 | - return False | | |
| 57 | - action_verbs = ["create", "write", "make", "edit", "fix", "add", "delete", "run"] | | |
| 58 | - if any(verb in task_lower for verb in action_verbs): | | |
| 59 | - return True | | |
| 60 | - return False | | |
| 61 | - | | |
| 62 | - success_indicators = [ | | |
| 63 | - "successfully", | | |
| 64 | - "created", | | |
| 65 | - "written", | | |
| 66 | - "done", | | |
| 67 | - "completed", | | |
| 68 | - "file now contains", | | |
| 69 | - "has been updated", | | |
| 70 | - "installed", | | |
| 71 | - ] | | |
| 72 | - if any(indicator in response_lower for indicator in success_indicators): | | |
| 73 | - return False | | |
| 74 | - | | |
| 75 | - complex_indicators = [ | | |
| 76 | - "set up a project", | | |
| 77 | - "create a project", | | |
| 78 | - "build a complete", | | |
| 79 | - "scaffold", | | |
| 80 | - "initialize a new", | | |
| 81 | - "create a full", | | |
| 82 | - "implement a full", | | |
| 83 | - "develop a complete", | | |
| 84 | - ] | | |
| 85 | - is_complex = any(indicator in task_lower for indicator in complex_indicators) | | |
| 86 | - | | |
| 87 | - simple_creation = [ | | |
| 88 | - "create a file", | | |
| 89 | - "write a file", | | |
| 90 | - "make a file", | | |
| 91 | - "add a function", | | |
| 92 | - "edit the", | | |
| 93 | - "fix the", | | |
| 94 | - "update the", | | |
| 95 | - "read the", | | |
| 96 | - "show me", | | |
| 97 | - "list", | | |
| 98 | - "design a webpage", | | |
| 99 | - "create a webpage", | | |
| 100 | - "make a webpage", | | |
| 101 | - "create a page", | | |
| 102 | - "design a page", | | |
| 103 | - "create an html", | | |
| 104 | - "make an html", | | |
| 105 | - "write an html", | | |
| 106 | - "help me design", | | |
| 107 | - "create a simple", | | |
| 108 | - "make a simple", | | |
| 109 | - "write a simple", | | |
| 110 | - ] | | |
| 111 | - is_simple = any(indicator in task_lower for indicator in simple_creation) | | |
| 112 | - | | |
| 113 | - if "write" in str(actions_taken).lower() and len(actions_taken) >= 1: | | |
| 114 | return False | 108 | return False |
| 115 | - if is_simple and len(actions_taken) >= 1: | 109 | + return not assess_completion_follow_through( |
| 116 | - return False | 110 | + task=task, |
| 117 | - | 111 | + response=response, |
| 118 | - explicit_verification = ["and test", "and run", "and verify", "make sure it works"] | 112 | + actions_taken=actions_taken, |
| 119 | - needs_verification = any(indicator in task_lower for indicator in explicit_verification) | 113 | + ).is_complete |
| 120 | - | | |
| 121 | - action_types = set() | | |
| 122 | - for action in actions_taken: | | |
| 123 | - action_lower = action.lower() | | |
| 124 | - if "write" in action_lower: | | |
| 125 | - action_types.add("write") | | |
| 126 | - elif "edit" in action_lower: | | |
| 127 | - action_types.add("edit") | | |
| 128 | - elif "bash" in action_lower: | | |
| 129 | - action_types.add("bash") | | |
| 130 | - elif "read" in action_lower: | | |
| 131 | - action_types.add("read") | | |
| 132 | - elif "glob" in action_lower or "grep" in action_lower: | | |
| 133 | - action_types.add("search") | | |
| 134 | - | | |
| 135 | - if is_complex and len(actions_taken) < 3: | | |
| 136 | - return True | | |
| 137 | - if needs_verification and "bash" not in action_types: | | |
| 138 | - return True | | |
| 139 | - | | |
| 140 | - deflection_phrases = ["you can now", "you should", "you can run", "you can use"] | | |
| 141 | - if any(phrase in response_lower for phrase in deflection_phrases) and len(actions_taken) < 2: | | |
| 142 | - return True | | |
| 143 | - | | |
| 144 | - return False | | |
| 145 | | 114 | |
| 146 | | 115 | |
| 147 | def get_continuation_prompt(task: str, actions_taken: list[str], response: str) -> str: | 116 | def get_continuation_prompt(task: str, actions_taken: list[str], response: str) -> str: |
| 148 | """Generate a helpful follow-through prompt for incomplete tasks.""" | 117 | """Generate a helpful follow-through prompt for incomplete tasks.""" |
| | 118 | + return assess_completion_follow_through( |
| | 119 | + task=task, |
| | 120 | + response=response, |
| | 121 | + actions_taken=actions_taken, |
| | 122 | + ).continuation_prompt |
| 149 | | 123 | |
| 150 | - del response | | |
| 151 | | 124 | |
| 152 | - task_lower = task.lower() | 125 | +def assess_completion_follow_through( |
| 153 | - follow_ups: list[str] = [] | 126 | + *, |
| | 127 | + task: str, |
| | 128 | + response: str, |
| | 129 | + actions_taken: list[str], |
| | 130 | +) -> TaskCompletionCheck: |
| | 131 | + """Build a typed follow-through assessment for one candidate response.""" |
| | 132 | + |
| | 133 | + task_lower = task.lower().strip() |
| | 134 | + response_lower = response.lower().strip() |
| | 135 | + action_types = _action_types(actions_taken) |
| | 136 | + informational = _is_informational_task(task_lower) |
| | 137 | + complex_task = any(indicator in task_lower for indicator in _COMPLEX_INDICATORS) |
| | 138 | + simple_task = any(indicator in task_lower for indicator in _SIMPLE_TASK_INDICATORS) |
| | 139 | + requires_verification = any( |
| | 140 | + indicator in task_lower for indicator in _VERIFICATION_INDICATORS |
| | 141 | + ) |
| | 142 | + requires_install = any(indicator in task_lower for indicator in _INSTALL_HINTS) |
| | 143 | + |
| | 144 | + accomplished = [_summarize_action(action) for action in actions_taken] |
| | 145 | + required_evidence = _required_evidence( |
| | 146 | + task_lower=task_lower, |
| | 147 | + informational=informational, |
| | 148 | + complex_task=complex_task, |
| | 149 | + requires_verification=requires_verification, |
| | 150 | + requires_install=requires_install, |
| | 151 | + ) |
| | 152 | + missing_evidence: list[str] = [] |
| | 153 | + remaining: list[str] = [] |
| | 154 | + suggested_next_steps: list[str] = [] |
| | 155 | + |
| | 156 | + if informational: |
| | 157 | + return TaskCompletionCheck( |
| | 158 | + original_task=task, |
| | 159 | + is_complete=bool(response.strip()), |
| | 160 | + accomplished=accomplished, |
| | 161 | + required_evidence=required_evidence, |
| | 162 | + missing_evidence=[], |
| | 163 | + remaining=[], |
| | 164 | + suggested_next_steps=[], |
| | 165 | + continuation_prompt=_format_continuation_prompt( |
| | 166 | + task=task, |
| | 167 | + missing_evidence=[], |
| | 168 | + suggested_next_steps=[], |
| | 169 | + action_count=len(actions_taken), |
| | 170 | + ), |
| | 171 | + ) |
| | 172 | + |
| | 173 | + if not actions_taken and _requires_action(task_lower): |
| | 174 | + _append_follow_through_gap( |
| | 175 | + missing_evidence, |
| | 176 | + remaining, |
| | 177 | + suggested_next_steps, |
| | 178 | + evidence="showing the requested work was actually carried out", |
| | 179 | + remaining_item="Perform the requested work instead of stopping at intent or narration", |
| | 180 | + next_step="Carry out the requested change or command now", |
| | 181 | + ) |
| 154 | | 182 | |
| 155 | - if any(keyword in task_lower for keyword in ["install", "dependencies", "set up project"]): | 183 | + if requires_install and not _has_install_evidence(task_lower, action_types, actions_taken): |
| 156 | - if "node" in task_lower or "npm" in task_lower: | 184 | + _append_follow_through_gap( |
| 157 | - if not any("npm" in action for action in actions_taken): | 185 | + missing_evidence, |
| 158 | - follow_ups.append("Run `npm install` to install dependencies") | 186 | + remaining, |
| 159 | - if "python" in task_lower or "pip" in task_lower: | 187 | + suggested_next_steps, |
| 160 | - if not any("pip" in action or "uv" in action for action in actions_taken): | 188 | + evidence="showing dependencies or setup steps were completed", |
| 161 | - follow_ups.append("Install dependencies") | 189 | + remaining_item="Install or initialize the required dependencies", |
| | 190 | + next_step=_install_follow_up(task_lower), |
| | 191 | + ) |
| 162 | | 192 | |
| 163 | - if "test" in task_lower and "run" in task_lower: | 193 | + if requires_verification and not _has_verification_evidence(action_types, actions_taken): |
| 164 | - if not any("test" in action or "pytest" in action or "jest" in action for action in actions_taken): | 194 | + _append_follow_through_gap( |
| 165 | - follow_ups.append("Run the tests") | 195 | + missing_evidence, |
| | 196 | + remaining, |
| | 197 | + suggested_next_steps, |
| | 198 | + evidence="showing the result was run or verified", |
| | 199 | + remaining_item="Run the result and capture a concrete verification outcome", |
| | 200 | + next_step="Execute what you created or run the relevant tests now", |
| | 201 | + ) |
| 166 | | 202 | |
| 167 | - if any(keyword in task_lower for keyword in ["and run", "and test", "and verify", "make sure it works"]): | 203 | + if complex_task and len(actions_taken) < 3: |
| 168 | - follow_ups.append("Execute what was created to verify it works") | 204 | + _append_follow_through_gap( |
| | 205 | + missing_evidence, |
| | 206 | + remaining, |
| | 207 | + suggested_next_steps, |
| | 208 | + evidence="showing the broader end-to-end implementation or setup was completed", |
| | 209 | + remaining_item="Finish the larger end-to-end task instead of stopping after a partial step", |
| | 210 | + next_step="Continue through the remaining setup or implementation steps", |
| | 211 | + ) |
| 169 | | 212 | |
| 170 | - if follow_ups: | 213 | + if ( |
| 171 | - steps = "\n".join(f"- {step}" for step in follow_ups[:2]) | 214 | + any(phrase in response_lower for phrase in _DEFLECTION_PHRASES) |
| 172 | - return ( | 215 | + and len(actions_taken) < 2 |
| 173 | - f'The task was: "{task}"\n\n' | 216 | + ): |
| 174 | - f"You may need to also:\n{steps}\n\n" | 217 | + _append_follow_through_gap( |
| 175 | - "If the task is actually complete, just confirm what was done." | 218 | + missing_evidence, |
| | 219 | + remaining, |
| | 220 | + suggested_next_steps, |
| | 221 | + evidence="showing execution evidence rather than instructions handed back to the user", |
| | 222 | + remaining_item="Perform the work yourself or state concretely what you already verified", |
| | 223 | + next_step="Continue the task instead of handing the next step to the user", |
| 176 | ) | 224 | ) |
| 177 | | 225 | |
| 178 | - return ( | 226 | + if "write" in action_types and actions_taken and simple_task: |
| 179 | - f'Task: "{task}"\n' | 227 | + missing_evidence = [ |
| 180 | - f"You took {len(actions_taken)} action(s). " | 228 | + item |
| 181 | - "If there's more to do, continue. Otherwise, confirm completion." | 229 | + for item in missing_evidence |
| | 230 | + if item != "showing the requested work was actually carried out" |
| | 231 | + ] |
| | 232 | + remaining = [ |
| | 233 | + item |
| | 234 | + for item in remaining |
| | 235 | + if item != "Perform the requested work instead of stopping at intent or narration" |
| | 236 | + ] |
| | 237 | + |
| | 238 | + is_complete = not missing_evidence |
| | 239 | + return TaskCompletionCheck( |
| | 240 | + original_task=task, |
| | 241 | + is_complete=is_complete, |
| | 242 | + accomplished=accomplished, |
| | 243 | + required_evidence=required_evidence, |
| | 244 | + missing_evidence=missing_evidence, |
| | 245 | + remaining=remaining, |
| | 246 | + suggested_next_steps=suggested_next_steps, |
| | 247 | + continuation_prompt=_format_continuation_prompt( |
| | 248 | + task=task, |
| | 249 | + missing_evidence=missing_evidence, |
| | 250 | + suggested_next_steps=suggested_next_steps, |
| | 251 | + action_count=len(actions_taken), |
| | 252 | + ), |
| 182 | ) | 253 | ) |
| 183 | | 254 | |
| 184 | | 255 | |
@@ -207,9 +278,154 @@ def parse_completion_check(response: str, original_task: str) -> TaskCompletionC |
| 207 | original_task=original_task, | 278 | original_task=original_task, |
| 208 | is_complete=data.get("is_complete", False), | 279 | is_complete=data.get("is_complete", False), |
| 209 | accomplished=data.get("accomplished", []), | 280 | accomplished=data.get("accomplished", []), |
| | 281 | + required_evidence=data.get("required_evidence", []), |
| | 282 | + missing_evidence=data.get("missing_evidence", data.get("remaining", [])), |
| 210 | remaining=data.get("remaining", []), | 283 | remaining=data.get("remaining", []), |
| 211 | suggested_next_steps=next_steps, | 284 | suggested_next_steps=next_steps, |
| 212 | continuation_prompt=continuation, | 285 | continuation_prompt=continuation, |
| 213 | ) | 286 | ) |
| 214 | except json.JSONDecodeError: | 287 | except json.JSONDecodeError: |
| 215 | return TaskCompletionCheck(original_task=original_task) | 288 | return TaskCompletionCheck(original_task=original_task) |
| | 289 | + |
| | 290 | + |
| | 291 | +def _action_types(actions_taken: list[str]) -> set[str]: |
| | 292 | + action_types: set[str] = set() |
| | 293 | + for action in actions_taken: |
| | 294 | + action_lower = action.lower() |
| | 295 | + if "write" in action_lower: |
| | 296 | + action_types.add("write") |
| | 297 | + elif "edit" in action_lower or "patch" in action_lower: |
| | 298 | + action_types.add("edit") |
| | 299 | + elif "bash" in action_lower or "shell" in action_lower: |
| | 300 | + action_types.add("bash") |
| | 301 | + elif "read" in action_lower: |
| | 302 | + action_types.add("read") |
| | 303 | + elif "glob" in action_lower or "grep" in action_lower or "search" in action_lower: |
| | 304 | + action_types.add("search") |
| | 305 | + elif "todo" in action_lower: |
| | 306 | + action_types.add("workflow") |
| | 307 | + return action_types |
| | 308 | + |
| | 309 | + |
| | 310 | +def _is_informational_task(task_lower: str) -> bool: |
| | 311 | + if task_lower.startswith(_INFORMATIONAL_PREFIXES): |
| | 312 | + return True |
| | 313 | + if task_lower.endswith("?") and task_lower.startswith( |
| | 314 | + ("what ", "how ", "why ", "which ", "who ", "where ", "when ") |
| | 315 | + ): |
| | 316 | + return True |
| | 317 | + return False |
| | 318 | + |
| | 319 | + |
| | 320 | +def _requires_action(task_lower: str) -> bool: |
| | 321 | + return any(verb in task_lower for verb in _ACTION_VERBS) or any( |
| | 322 | + indicator in task_lower for indicator in _SIMPLE_TASK_INDICATORS |
| | 323 | + ) |
| | 324 | + |
| | 325 | + |
| | 326 | +def _required_evidence( |
| | 327 | + *, |
| | 328 | + task_lower: str, |
| | 329 | + informational: bool, |
| | 330 | + complex_task: bool, |
| | 331 | + requires_verification: bool, |
| | 332 | + requires_install: bool, |
| | 333 | +) -> list[str]: |
| | 334 | + if informational: |
| | 335 | + return [] |
| | 336 | + |
| | 337 | + required: list[str] = [] |
| | 338 | + if _requires_action(task_lower): |
| | 339 | + required.append("showing the requested work was actually carried out") |
| | 340 | + if requires_install: |
| | 341 | + required.append("showing dependencies or setup steps were completed") |
| | 342 | + if requires_verification: |
| | 343 | + required.append("showing the result was run or verified") |
| | 344 | + if complex_task: |
| | 345 | + required.append("showing the broader end-to-end implementation or setup was completed") |
| | 346 | + return required |
| | 347 | + |
| | 348 | + |
| | 349 | +def _has_install_evidence( |
| | 350 | + task_lower: str, |
| | 351 | + action_types: set[str], |
| | 352 | + actions_taken: list[str], |
| | 353 | +) -> bool: |
| | 354 | + del action_types |
| | 355 | + action_text = " ".join(actions_taken).lower() |
| | 356 | + if any(hint in task_lower for hint in _NODE_HINTS) and "npm" in action_text: |
| | 357 | + return True |
| | 358 | + if any(hint in task_lower for hint in _PYTHON_HINTS) and ( |
| | 359 | + "pip" in action_text or "uv" in action_text |
| | 360 | + ): |
| | 361 | + return True |
| | 362 | + return "install" in action_text or "init" in action_text or "setup" in action_text |
| | 363 | + |
| | 364 | + |
| | 365 | +def _has_verification_evidence( |
| | 366 | + action_types: set[str], |
| | 367 | + actions_taken: list[str], |
| | 368 | +) -> bool: |
| | 369 | + if "bash" in action_types: |
| | 370 | + return True |
| | 371 | + action_text = " ".join(actions_taken).lower() |
| | 372 | + return any( |
| | 373 | + token in action_text |
| | 374 | + for token in ("test", "pytest", "jest", "verify", "run", "execute") |
| | 375 | + ) |
| | 376 | + |
| | 377 | + |
| | 378 | +def _install_follow_up(task_lower: str) -> str: |
| | 379 | + if any(hint in task_lower for hint in _NODE_HINTS): |
| | 380 | + return "Run `npm install` to install dependencies" |
| | 381 | + if any(hint in task_lower for hint in _PYTHON_HINTS): |
| | 382 | + return "Install the Python dependencies" |
| | 383 | + return "Install or initialize the required dependencies now" |
| | 384 | + |
| | 385 | + |
| | 386 | +def _append_follow_through_gap( |
| | 387 | + missing_evidence: list[str], |
| | 388 | + remaining: list[str], |
| | 389 | + suggested_next_steps: list[str], |
| | 390 | + *, |
| | 391 | + evidence: str, |
| | 392 | + remaining_item: str, |
| | 393 | + next_step: str, |
| | 394 | +) -> None: |
| | 395 | + if evidence not in missing_evidence: |
| | 396 | + missing_evidence.append(evidence) |
| | 397 | + if remaining_item not in remaining: |
| | 398 | + remaining.append(remaining_item) |
| | 399 | + if next_step not in suggested_next_steps: |
| | 400 | + suggested_next_steps.append(next_step) |
| | 401 | + |
| | 402 | + |
| | 403 | +def _format_continuation_prompt( |
| | 404 | + *, |
| | 405 | + task: str, |
| | 406 | + missing_evidence: list[str], |
| | 407 | + suggested_next_steps: list[str], |
| | 408 | + action_count: int, |
| | 409 | +) -> str: |
| | 410 | + if suggested_next_steps: |
| | 411 | + evidence_lines = "\n".join(f"- {item}" for item in missing_evidence[:2]) |
| | 412 | + step_lines = "\n".join(f"- {step}" for step in suggested_next_steps[:3]) |
| | 413 | + return ( |
| | 414 | + f'The task was: "{task}"\n\n' |
| | 415 | + "The response still needs concrete evidence for:\n" |
| | 416 | + f"{evidence_lines}\n\n" |
| | 417 | + "Continue with:\n" |
| | 418 | + f"{step_lines}\n\n" |
| | 419 | + "If the task is actually complete, confirm the missing evidence explicitly." |
| | 420 | + ) |
| | 421 | + |
| | 422 | + return ( |
| | 423 | + f'Task: "{task}"\n' |
| | 424 | + f"You took {action_count} action(s). " |
| | 425 | + "If there's more to do, continue. Otherwise, confirm completion." |
| | 426 | + ) |
| | 427 | + |
| | 428 | + |
| | 429 | +def _summarize_action(action: str) -> str: |
| | 430 | + head, _, _ = action.partition(":") |
| | 431 | + return head.strip() or action.strip() |