@@ -722,46 +722,52 @@ def detect_premature_completion(task: str, response: str, actions_taken: list[st |
| 722 | 722 | """Quick heuristic to detect if agent is stopping too early. |
| 723 | 723 | |
| 724 | 724 | Returns True if the agent might be giving up prematurely. |
| 725 | + This should be CONSERVATIVE - only trigger when really needed, |
| 726 | + not for simple tasks that are genuinely complete. |
| 725 | 727 | """ |
| 726 | 728 | task_lower = task.lower() |
| 727 | 729 | response_lower = response.lower() |
| 728 | 730 | |
| 729 | | - # Keywords that suggest the task should involve multiple steps |
| 730 | | - multi_step_indicators = [ |
| 731 | | - "create a", "build a", "make a", "set up", "setup", |
| 732 | | - "initialize", "scaffold", "generate", "implement", |
| 733 | | - "project", "application", "app", "website", "api", |
| 734 | | - "add", "write", "develop", "design", "help me", |
| 735 | | - ] |
| 731 | + # If no actions taken at all and task requires action, that's premature |
| 732 | + if not actions_taken: |
| 733 | + # But only if this looks like an actionable task |
| 734 | + action_verbs = ["create", "write", "make", "edit", "fix", "add", "delete", "run"] |
| 735 | + if any(verb in task_lower for verb in action_verbs): |
| 736 | + return True |
| 737 | + return False # Informational/conversational tasks don't need actions |
| 736 | 738 | |
| 737 | | - # Keywords that suggest testing/verification should happen |
| 738 | | - verification_indicators = [ |
| 739 | | - "test", "run", "start", "launch", "verify", "check", |
| 740 | | - "demo", "show", "demonstrate", "work", "function", |
| 739 | + # If we took actions and got successful results, trust that we're done |
| 740 | + # Check for success indicators in response |
| 741 | + success_indicators = [ |
| 742 | + "successfully", "created", "written", "done", "completed", |
| 743 | + "file now contains", "has been updated", "installed", |
| 741 | 744 | ] |
| 745 | + if any(ind in response_lower for ind in success_indicators) and len(actions_taken) >= 1: |
| 746 | + return False # Likely actually done |
| 742 | 747 | |
| 743 | | - # Keywords in response that suggest premature completion |
| 744 | | - premature_phrases = [ |
| 745 | | - "i've created", "i created", "file has been created", |
| 746 | | - "here's the", "i've set up the basic", "i've written", |
| 747 | | - "you can now", "you should now", "you can run", |
| 748 | | - "that's it", "all done", "complete", "finished", |
| 749 | | - "let me know", "feel free to", "hope this helps", |
| 750 | | - "is there anything else", |
| 748 | + # Keywords that suggest COMPLEX multi-step tasks (not simple ones) |
| 749 | + complex_indicators = [ |
| 750 | + "set up a project", "create a project", "build a complete", |
| 751 | + "scaffold", "initialize a new", "create a full", |
| 752 | + "implement a full", "develop a complete", |
| 751 | 753 | ] |
| 754 | + is_complex = any(ind in task_lower for ind in complex_indicators) |
| 752 | 755 | |
| 753 | | - # Check if this looks like a multi-step task |
| 754 | | - is_multi_step = any(ind in task_lower for ind in multi_step_indicators) |
| 755 | | - |
| 756 | | - # Check if verification was expected but not done |
| 757 | | - expects_verification = any(ind in task_lower for ind in verification_indicators) |
| 756 | + # Simple single-step tasks (create, edit, fix, read, list) don't need follow-up |
| 757 | + simple_creation = [ |
| 758 | + "create a file", "write a file", "make a file", |
| 759 | + "add a function", "edit the", "fix the", "update the", |
| 760 | + "read the", "show me", "list", |
| 761 | + ] |
| 762 | + is_simple = any(ind in task_lower for ind in simple_creation) |
| 758 | 763 | |
| 759 | | - # Check for premature completion phrases |
| 760 | | - has_premature_phrase = any(phrase in response_lower for phrase in premature_phrases) |
| 764 | + # If it's a simple task with at least one action, it's probably done |
| 765 | + if is_simple and len(actions_taken) >= 1: |
| 766 | + return False |
| 761 | 767 | |
| 762 | | - # Action count thresholds |
| 763 | | - few_actions = len(actions_taken) < 3 |
| 764 | | - very_few_actions = len(actions_taken) < 2 |
| 768 | + # Explicit verification requests need bash |
| 769 | + explicit_verification = ["and test", "and run", "and verify", "make sure it works"] |
| 770 | + needs_verification = any(ind in task_lower for ind in explicit_verification) |
| 765 | 771 | |
| 766 | 772 | # Categorize what actions were taken |
| 767 | 773 | action_types = set() |
@@ -778,33 +784,19 @@ def detect_premature_completion(task: str, response: str, actions_taken: list[st |
| 778 | 784 | elif "glob" in action_lower or "grep" in action_lower: |
| 779 | 785 | action_types.add("search") |
| 780 | 786 | |
| 781 | | - # More aggressive detection: |
| 787 | + # Detection rules (more conservative): |
| 782 | 788 | |
| 783 | | - # 1. Multi-step task with premature phrases and few actions |
| 784 | | - if is_multi_step and has_premature_phrase and few_actions: |
| 789 | + # 1. Complex project tasks with very few actions |
| 790 | + if is_complex and len(actions_taken) < 3: |
| 785 | 791 | return True |
| 786 | 792 | |
| 787 | | - # 2. Multi-step task with very few actions (regardless of phrases) |
| 788 | | - if is_multi_step and very_few_actions: |
| 793 | + # 2. Explicitly requested verification but no bash run |
| 794 | + if needs_verification and "bash" not in action_types: |
| 789 | 795 | return True |
| 790 | 796 | |
| 791 | | - # 3. Only wrote/edited files but never ran/tested anything |
| 792 | | - if action_types and action_types <= {"write", "edit", "read"} and few_actions: |
| 793 | | - # Wrote files but never executed bash to test |
| 794 | | - if "write" in action_types or "edit" in action_types: |
| 795 | | - return True |
| 796 | | - |
| 797 | | - # 4. Verification expected but no bash commands run |
| 798 | | - if expects_verification and "bash" not in action_types: |
| 799 | | - return True |
| 800 | | - |
| 801 | | - # 5. Response has chatbot-style "let me know" phrases |
| 802 | | - chatbot_phrases = ["let me know", "feel free", "hope this", "happy to help"] |
| 803 | | - if any(phrase in response_lower for phrase in chatbot_phrases): |
| 804 | | - return True |
| 805 | | - |
| 806 | | - # 6. Response is very short but task seems substantial |
| 807 | | - if len(response) < 200 and is_multi_step and len(actions_taken) > 0: |
| 797 | + # 3. Chatbot-style deflection after fewer than two actions |
| 798 | + deflection_phrases = ["you can now", "you should", "you can run", "you can use"] |
| 799 | + if any(phrase in response_lower for phrase in deflection_phrases) and len(actions_taken) < 2: |
| 808 | 800 | return True |
| 809 | 801 | |
| 810 | 802 | return False |
@@ -814,6 +806,7 @@ def get_continuation_prompt(task: str, actions_taken: list[str], response: str) |
| 814 | 806 | """Generate a prompt to encourage the agent to continue. |
| 815 | 807 | |
| 816 | 808 | Returns a prompt that nudges the agent to follow through. |
| 809 | + Should be helpful, not aggressive. |
| 817 | 810 | """ |
| 818 | 811 | task_lower = task.lower() |
| 819 | 812 | actions_str = ", ".join(a.split(":")[0] for a in actions_taken[-5:]) if actions_taken else "none" |
@@ -821,51 +814,37 @@ def get_continuation_prompt(task: str, actions_taken: list[str], response: str) |
| 821 | 814 | # Determine what type of follow-up is needed |
| 822 | 815 | follow_ups = [] |
| 823 | 816 | |
| 824 | | - # Project setup tasks should initialize |
| 825 | | - if any(kw in task_lower for kw in ["node", "npm", "javascript", "react", "vue", "next"]): |
| 826 | | - if not any("npm" in a for a in actions_taken): |
| 827 | | - follow_ups.append("Run `npm install` to install dependencies") |
| 828 | | - follow_ups.append("Start the development server to verify it works") |
| 829 | | - |
| 830 | | - if any(kw in task_lower for kw in ["python", "pip", "django", "flask", "fastapi"]): |
| 831 | | - if not any("pip" in a or "uv" in a for a in actions_taken): |
| 832 | | - follow_ups.append("Install dependencies with pip/uv") |
| 833 | | - follow_ups.append("Run the application to verify it works") |
| 834 | | - |
| 835 | | - # Test tasks should run tests |
| 836 | | - if "test" in task_lower: |
| 817 | + # Only suggest package install if explicitly mentioned in task |
| 818 | + if any(kw in task_lower for kw in ["install", "dependencies", "set up project"]): |
| 819 | + if "node" in task_lower or "npm" in task_lower: |
| 820 | + if not any("npm" in a for a in actions_taken): |
| 821 | + follow_ups.append("Run `npm install` to install dependencies") |
| 822 | + if "python" in task_lower or "pip" in task_lower: |
| 823 | + if not any("pip" in a or "uv" in a for a in actions_taken): |
| 824 | + follow_ups.append("Install dependencies") |
| 825 | + |
| 826 | + # Only suggest running tests if the task mentions both "test" and "run" |
| 827 | + if "test" in task_lower and "run" in task_lower: |
| 837 | 828 | if not any("test" in a or "pytest" in a or "jest" in a for a in actions_taken): |
| 838 | | - follow_ups.append("Run the tests to verify they pass") |
| 839 | | - |
| 840 | | - # Build tasks should verify build |
| 841 | | - if "build" in task_lower or "compile" in task_lower: |
| 842 | | - if not any("build" in a or "compile" in a for a in actions_taken): |
| 843 | | - follow_ups.append("Run the build to verify it succeeds") |
| 829 | + follow_ups.append("Run the tests") |
| 844 | 830 | |
| 845 | | - # Generic follow-ups for creation tasks |
| 846 | | - if any(kw in task_lower for kw in ["create", "make", "build", "set up"]): |
| 847 | | - if len(actions_taken) < 3: |
| 848 | | - follow_ups.append("Verify the creation was successful") |
| 849 | | - follow_ups.append("Demonstrate that it works as expected") |
| 831 | + # If task explicitly asks to run/verify, remind to do so |
| 832 | + if any(kw in task_lower for kw in ["and run", "and test", "and verify", "make sure it works"]): |
| 833 | + follow_ups.append("Execute what was created to verify it works") |
| 850 | 834 | |
| 851 | 835 | if follow_ups: |
| 852 | | - steps = "\n".join(f"- {step}" for step in follow_ups[:3]) |
| 836 | + steps = "\n".join(f"- {step}" for step in follow_ups[:2]) |
| 853 | 837 | return ( |
| 854 | | - f"STOP - You are NOT done. The task was: \"{task}\"\n\n" |
| 855 | | - f"Actions so far: {actions_str}\n" |
| 856 | | - f"You MUST also:\n{steps}\n\n" |
| 857 | | - f"DO NOT respond with text. USE YOUR TOOLS NOW to complete these steps." |
| 838 | + f"The task was: \"{task}\"\n\n" |
| 839 | + f"You may need to also:\n{steps}\n\n" |
| 840 | + f"If the task is actually complete, just confirm what was done." |
| 858 | 841 | ) |
| 859 | 842 | |
| 860 | | - # Generic continuation - be forceful |
| 843 | + # Generic - be gentle |
| 861 | 844 | return ( |
| 862 | | - f"INCOMPLETE. Task: \"{task}\"\n" |
| 863 | | - f"Actions taken: {actions_str} ({len(actions_taken)} total)\n\n" |
| 864 | | - f"You stopped too early. What about:\n" |
| 865 | | - f"- Testing/verifying the result?\n" |
| 866 | | - f"- Running what you created?\n" |
| 867 | | - f"- Installing dependencies?\n\n" |
| 868 | | - f"USE YOUR TOOLS to continue. Do not just describe - EXECUTE." |
| 845 | + f"Task: \"{task}\"\n" |
| 846 | + f"You took {len(actions_taken)} action(s). " |
| 847 | + f"If there's more to do, continue. Otherwise, confirm completion." |
| 869 | 848 | ) |
| 870 | 849 | |
| 871 | 850 | |