@@ -722,46 +722,52 @@ def detect_premature_completion(task: str, response: str, actions_taken: list[st |
| 722 | """Quick heuristic to detect if agent is stopping too early. | 722 | """Quick heuristic to detect if agent is stopping too early. |
| 723 | | 723 | |
| 724 | Returns True if the agent might be giving up prematurely. | 724 | Returns True if the agent might be giving up prematurely. |
| | 725 | + This should be CONSERVATIVE - only trigger when really needed, |
| | 726 | + not for simple tasks that are genuinely complete. |
| 725 | """ | 727 | """ |
| 726 | task_lower = task.lower() | 728 | task_lower = task.lower() |
| 727 | response_lower = response.lower() | 729 | response_lower = response.lower() |
| 728 | | 730 | |
| 729 | - # Keywords that suggest the task should involve multiple steps | 731 | + # If no actions taken at all and task requires action, that's premature |
| 730 | - multi_step_indicators = [ | 732 | + if not actions_taken: |
| 731 | - "create a", "build a", "make a", "set up", "setup", | 733 | + # But only if this looks like an actionable task |
| 732 | - "initialize", "scaffold", "generate", "implement", | 734 | + action_verbs = ["create", "write", "make", "edit", "fix", "add", "delete", "run"] |
| 733 | - "project", "application", "app", "website", "api", | 735 | + if any(verb in task_lower for verb in action_verbs): |
| 734 | - "add", "write", "develop", "design", "help me", | 736 | + return True |
| 735 | - ] | 737 | + return False # Informational/conversational tasks don't need actions |
| 736 | | 738 | |
| 737 | - # Keywords that suggest testing/verification should happen | 739 | + # If we took actions and got successful results, trust that we're done |
| 738 | - verification_indicators = [ | 740 | + # Check for success indicators in response |
| 739 | - "test", "run", "start", "launch", "verify", "check", | 741 | + success_indicators = [ |
| 740 | - "demo", "show", "demonstrate", "work", "function", | 742 | + "successfully", "created", "written", "done", "completed", |
| | 743 | + "file now contains", "has been updated", "installed", |
| 741 | ] | 744 | ] |
| | 745 | + if any(ind in response_lower for ind in success_indicators) and len(actions_taken) >= 1: |
| | 746 | + return False # Likely actually done |
| 742 | | 747 | |
| 743 | - # Keywords in response that suggest premature completion | 748 | + # Keywords that suggest COMPLEX multi-step tasks (not simple ones) |
| 744 | - premature_phrases = [ | 749 | + complex_indicators = [ |
| 745 | - "i've created", "i created", "file has been created", | 750 | + "set up a project", "create a project", "build a complete", |
| 746 | - "here's the", "i've set up the basic", "i've written", | 751 | + "scaffold", "initialize a new", "create a full", |
| 747 | - "you can now", "you should now", "you can run", | 752 | + "implement a full", "develop a complete", |
| 748 | - "that's it", "all done", "complete", "finished", | | |
| 749 | - "let me know", "feel free to", "hope this helps", | | |
| 750 | - "is there anything else", | | |
| 751 | ] | 753 | ] |
| | 754 | + is_complex = any(ind in task_lower for ind in complex_indicators) |
| 752 | | 755 | |
| 753 | - # Check if this looks like a multi-step task | 756 | + # Simple creation tasks don't need follow-up |
| 754 | - is_multi_step = any(ind in task_lower for ind in multi_step_indicators) | 757 | + simple_creation = [ |
| 755 | - | 758 | + "create a file", "write a file", "make a file", |
| 756 | - # Check if verification was expected but not done | 759 | + "add a function", "edit the", "fix the", "update the", |
| 757 | - expects_verification = any(ind in task_lower for ind in verification_indicators) | 760 | + "read the", "show me", "list", |
| | 761 | + ] |
| | 762 | + is_simple = any(ind in task_lower for ind in simple_creation) |
| 758 | | 763 | |
| 759 | - # Check for premature completion phrases | 764 | + # If it's a simple task with at least one action, it's probably done |
| 760 | - has_premature_phrase = any(phrase in response_lower for phrase in premature_phrases) | 765 | + if is_simple and len(actions_taken) >= 1: |
| | 766 | + return False |
| 761 | | 767 | |
| 762 | - # Action count thresholds | 768 | + # Explicit verification requests need bash |
| 763 | - few_actions = len(actions_taken) < 3 | 769 | + explicit_verification = ["and test", "and run", "and verify", "make sure it works"] |
| 764 | - very_few_actions = len(actions_taken) < 2 | 770 | + needs_verification = any(ind in task_lower for ind in explicit_verification) |
| 765 | | 771 | |
| 766 | # Categorize what actions were taken | 772 | # Categorize what actions were taken |
| 767 | action_types = set() | 773 | action_types = set() |
@@ -778,33 +784,19 @@ def detect_premature_completion(task: str, response: str, actions_taken: list[st |
| 778 | elif "glob" in action_lower or "grep" in action_lower: | 784 | elif "glob" in action_lower or "grep" in action_lower: |
| 779 | action_types.add("search") | 785 | action_types.add("search") |
| 780 | | 786 | |
| 781 | - # More aggressive detection: | 787 | + # Detection rules (more conservative): |
| 782 | | 788 | |
| 783 | - # 1. Multi-step task with premature phrases and few actions | 789 | + # 1. Complex project tasks with very few actions |
| 784 | - if is_multi_step and has_premature_phrase and few_actions: | 790 | + if is_complex and len(actions_taken) < 3: |
| 785 | return True | 791 | return True |
| 786 | | 792 | |
| 787 | - # 2. Multi-step task with very few actions (regardless of phrases) | 793 | + # 2. Explicitly requested verification but no bash run |
| 788 | - if is_multi_step and very_few_actions: | 794 | + if needs_verification and "bash" not in action_types: |
| 789 | return True | 795 | return True |
| 790 | | 796 | |
| 791 | - # 3. Only wrote/edited files but never ran/tested anything | 797 | + # 3. Chatbot-style deflection with no real work done |
| 792 | - if action_types and action_types <= {"write", "edit", "read"} and few_actions: | 798 | + deflection_phrases = ["you can now", "you should", "you can run", "you can use"] |
| 793 | - # Wrote files but never executed bash to test | 799 | + if any(phrase in response_lower for phrase in deflection_phrases) and len(actions_taken) < 2: |
| 794 | - if "write" in action_types or "edit" in action_types: | | |
| 795 | - return True | | |
| 796 | - | | |
| 797 | - # 4. Verification expected but no bash commands run | | |
| 798 | - if expects_verification and "bash" not in action_types: | | |
| 799 | - return True | | |
| 800 | - | | |
| 801 | - # 5. Response has chatbot-style "let me know" phrases | | |
| 802 | - chatbot_phrases = ["let me know", "feel free", "hope this", "happy to help"] | | |
| 803 | - if any(phrase in response_lower for phrase in chatbot_phrases): | | |
| 804 | - return True | | |
| 805 | - | | |
| 806 | - # 6. Response is very short but task seems substantial | | |
| 807 | - if len(response) < 200 and is_multi_step and len(actions_taken) > 0: | | |
| 808 | return True | 800 | return True |
| 809 | | 801 | |
| 810 | return False | 802 | return False |
@@ -814,6 +806,7 @@ def get_continuation_prompt(task: str, actions_taken: list[str], response: str) |
| 814 | """Generate a prompt to encourage the agent to continue. | 806 | """Generate a prompt to encourage the agent to continue. |
| 815 | | 807 | |
| 816 | Returns a prompt that nudges the agent to follow through. | 808 | Returns a prompt that nudges the agent to follow through. |
| | 809 | + Should be helpful, not aggressive. |
| 817 | """ | 810 | """ |
| 818 | task_lower = task.lower() | 811 | task_lower = task.lower() |
| 819 | actions_str = ", ".join(a.split(":")[0] for a in actions_taken[-5:]) if actions_taken else "none" | 812 | actions_str = ", ".join(a.split(":")[0] for a in actions_taken[-5:]) if actions_taken else "none" |
@@ -821,51 +814,37 @@ def get_continuation_prompt(task: str, actions_taken: list[str], response: str) |
| 821 | # Determine what type of follow-up is needed | 814 | # Determine what type of follow-up is needed |
| 822 | follow_ups = [] | 815 | follow_ups = [] |
| 823 | | 816 | |
| 824 | - # Project setup tasks should initialize | 817 | + # Only suggest package install if explicitly mentioned in task |
| 825 | - if any(kw in task_lower for kw in ["node", "npm", "javascript", "react", "vue", "next"]): | 818 | + if any(kw in task_lower for kw in ["install", "dependencies", "set up project"]): |
| 826 | - if not any("npm" in a for a in actions_taken): | 819 | + if "node" in task_lower or "npm" in task_lower: |
| 827 | - follow_ups.append("Run `npm install` to install dependencies") | 820 | + if not any("npm" in a for a in actions_taken): |
| 828 | - follow_ups.append("Start the development server to verify it works") | 821 | + follow_ups.append("Run `npm install` to install dependencies") |
| 829 | - | 822 | + if "python" in task_lower or "pip" in task_lower: |
| 830 | - if any(kw in task_lower for kw in ["python", "pip", "django", "flask", "fastapi"]): | 823 | + if not any("pip" in a or "uv" in a for a in actions_taken): |
| 831 | - if not any("pip" in a or "uv" in a for a in actions_taken): | 824 | + follow_ups.append("Install dependencies") |
| 832 | - follow_ups.append("Install dependencies with pip/uv") | 825 | + |
| 833 | - follow_ups.append("Run the application to verify it works") | 826 | + # Only suggest running tests if "test" is explicitly in task |
| 834 | - | 827 | + if "test" in task_lower and "run" in task_lower: |
| 835 | - # Test tasks should run tests | | |
| 836 | - if "test" in task_lower: | | |
| 837 | if not any("test" in a or "pytest" in a or "jest" in a for a in actions_taken): | 828 | if not any("test" in a or "pytest" in a or "jest" in a for a in actions_taken): |
| 838 | - follow_ups.append("Run the tests to verify they pass") | 829 | + follow_ups.append("Run the tests") |
| 839 | - | | |
| 840 | - # Build tasks should verify build | | |
| 841 | - if "build" in task_lower or "compile" in task_lower: | | |
| 842 | - if not any("build" in a or "compile" in a for a in actions_taken): | | |
| 843 | - follow_ups.append("Run the build to verify it succeeds") | | |
| 844 | | 830 | |
| 845 | - # Generic follow-ups for creation tasks | 831 | + # If task explicitly asks to run/verify, remind to do so |
| 846 | - if any(kw in task_lower for kw in ["create", "make", "build", "set up"]): | 832 | + if any(kw in task_lower for kw in ["and run", "and test", "and verify", "make sure it works"]): |
| 847 | - if len(actions_taken) < 3: | 833 | + follow_ups.append("Execute what was created to verify it works") |
| 848 | - follow_ups.append("Verify the creation was successful") | | |
| 849 | - follow_ups.append("Demonstrate that it works as expected") | | |
| 850 | | 834 | |
| 851 | if follow_ups: | 835 | if follow_ups: |
| 852 | - steps = "\n".join(f"- {step}" for step in follow_ups[:3]) | 836 | + steps = "\n".join(f"- {step}" for step in follow_ups[:2]) |
| 853 | return ( | 837 | return ( |
| 854 | - f"STOP - You are NOT done. The task was: \"{task}\"\n\n" | 838 | + f"The task was: \"{task}\"\n\n" |
| 855 | - f"Actions so far: {actions_str}\n" | 839 | + f"You may need to also:\n{steps}\n\n" |
| 856 | - f"You MUST also:\n{steps}\n\n" | 840 | + f"If the task is actually complete, just confirm what was done." |
| 857 | - f"DO NOT respond with text. USE YOUR TOOLS NOW to complete these steps." | | |
| 858 | ) | 841 | ) |
| 859 | | 842 | |
| 860 | - # Generic continuation - be forceful | 843 | + # Generic - be gentle |
| 861 | return ( | 844 | return ( |
| 862 | - f"INCOMPLETE. Task: \"{task}\"\n" | 845 | + f"Task: \"{task}\"\n" |
| 863 | - f"Actions taken: {actions_str} ({len(actions_taken)} total)\n\n" | 846 | + f"You took {len(actions_taken)} action(s). " |
| 864 | - f"You stopped too early. What about:\n" | 847 | + f"If there's more to do, continue. Otherwise, confirm completion." |
| 865 | - f"- Testing/verifying the result?\n" | | |
| 866 | - f"- Running what you created?\n" | | |
| 867 | - f"- Installing dependencies?\n\n" | | |
| 868 | - f"USE YOUR TOOLS to continue. Do not just describe - EXECUTE." | | |
| 869 | ) | 848 | ) |
| 870 | | 849 | |
| 871 | | 850 | |