`e83c568`

Add typed completion follow-through assessment

Authored by

espadonne 1 month ago

SHA: e83c568e08b785112bccaa24824cf3495e435d0c
Parents: e2158c6
Tree: df3f54d

6 changed files

Status	File	+	-
M	`src/loader/runtime/completion_policy.py`	18	13
M	`src/loader/runtime/task_completion.py`	343	127
M	`tests/test_completion_policy.py`	41	1
M	`tests/test_reasoning_compat.py`	1	0
M	`tests/test_reasoning_types.py`	2	0
M	`tests/test_turn_completion.py`	2	1

src/loader/runtime/completion_policy.py (modified)

  from .context import RuntimeContext
  from .events import AgentEvent, TurnSummary
  from .reasoning_types import TaskCompletionCheck
 -from .task_completion import detect_premature_completion, get_continuation_prompt
 +from .task_completion import assess_completion_follow_through, detect_premature_completion
  EventSink = Callable[[AgentEvent], Awaitable[None]]
      should_continue: bool
      decision_code: str
      decision_summary: str
 +    completion_check: TaskCompletionCheck | None = None
  class CompletionPolicy:
          """Nudge non-mutating tasks to continue when completion looks premature."""
          cfg = self.context.config.reasoning
 +        completion_check = (
 +            assess_completion_follow_through(
 +                task=task,
 +                response=content,
 +                actions_taken=actions_taken,
 +            )
 +            if cfg.use_quick_completion
 +            else None
 +        )
          if continuation_count >= cfg.max_continuation_prompts:
              return ContinuationDecision(
                  should_continue=False,
                  decision_code="continuation_budget_exhausted",
                  decision_summary="accepted the response because the continuation budget was exhausted",
 +                completion_check=completion_check,
+             )
          is_premature = (
                  should_continue=False,
                  decision_code="completion_response_accepted",
                  decision_summary="accepted the response because completion heuristics found no missing follow-through",
 +                completion_check=completion_check,
+             )
 -        continuation_prompt = get_continuation_prompt(
 -            task,
 -            actions_taken,
 -            content,
 -        )
          await emit(
              AgentEvent(
                  type="completion_check",
                  content=f"Task may be incomplete ({len(actions_taken)} actions taken)",
 -                completion_check=TaskCompletionCheck(
 -                    original_task=task,
 -                    is_complete=False,
 -                    accomplished=[action.split(":")[0] for action in actions_taken],
 -                    continuation_prompt=continuation_prompt,
 -                ),
 +                completion_check=completion_check,
+             )
+         )
          self.context.session.append(Message(role=Role.ASSISTANT, content=response_content))
 -        self.context.session.append(Message(role=Role.USER, content=continuation_prompt))
 +        self.context.session.append(
 +            Message(role=Role.USER, content=completion_check.continuation_prompt)
 +        )
          return ContinuationDecision(
              should_continue=True,
              decision_code="premature_completion_nudge",
              decision_summary="requested one continuation because the non-mutating response looked incomplete",
 +            completion_check=completion_check,
+         )
      @staticmethod

src/loader/runtime/task_completion.py (modified)

  from .reasoning_types import TaskCompletionCheck
 +_ACTION_VERBS = ("create", "write", "make", "edit", "fix", "add", "delete", "run")
 +_COMPLEX_INDICATORS = (
 +    "set up a project",
 +    "create a project",
 +    "build a complete",
 +    "scaffold",
 +    "initialize a new",
 +    "create a full",
 +    "implement a full",
 +    "develop a complete",
 +)
 +_SIMPLE_TASK_INDICATORS = (
 +    "create a file",
 +    "write a file",
 +    "make a file",
 +    "add a function",
 +    "edit the",
 +    "fix the",
 +    "update the",
 +    "read the",
 +    "show me",
 +    "list",
 +    "design a webpage",
 +    "create a webpage",
 +    "make a webpage",
 +    "create a page",
 +    "design a page",
 +    "create an html",
 +    "make an html",
 +    "write an html",
 +    "help me design",
 +    "create a simple",
 +    "make a simple",
 +    "write a simple",
 +)
 +_VERIFICATION_INDICATORS = ("and test", "and run", "and verify", "make sure it works")
 +_DEFLECTION_PHRASES = ("you can now", "you should", "you can run", "you can use")
 +_INFORMATIONAL_PREFIXES = (
 +    "explain ",
 +    "describe ",
 +    "summarize ",
 +    "compare ",
 +    "outline ",
 +    "review ",
 +    "analyze ",
 +    "what ",
 +    "how ",
 +    "why ",
 +    "which ",
 +    "who ",
 +    "where ",
 +    "when ",
 +)
 +_EXPLICIT_COMPLETIONS = {
 +    "done",
 +    "done.",
 +    "completed",
 +    "completed.",
 +    "all set",
 +    "all set.",
 +}
 +_INSTALL_HINTS = ("install", "dependencies", "set up project")
 +_NODE_HINTS = ("node", "npm")
 +_PYTHON_HINTS = ("python", "pip")
++
  COMPLETION_CHECK_PROMPT = """Evaluate if this task has been FULLY completed.
  Original task: {task}
      actions_taken: list[str],
  ) -> bool:
      """Heuristically detect when the assistant is stopping too early."""
+-
 -    task_lower = task.lower()
 -    response_lower = response.lower()
+-
 -    if not actions_taken:
 -        explicit_completion = response_lower.strip()
 -        if explicit_completion in {
 -            "done",
 -            "done.",
 -            "completed",
 -            "completed.",
 -            "all set",
 -            "all set.",
 -        }:
 -            return False
 -        action_verbs = ["create", "write", "make", "edit", "fix", "add", "delete", "run"]
 -        if any(verb in task_lower for verb in action_verbs):
 -            return True
 -        return False
+-
 -    success_indicators = [
 -        "successfully",
 -        "created",
 -        "written",
 -        "done",
 -        "completed",
 -        "file now contains",
 -        "has been updated",
 -        "installed",
 -    ]
 -    if any(indicator in response_lower for indicator in success_indicators):
 -        return False
+-
 -    complex_indicators = [
 -        "set up a project",
 -        "create a project",
 -        "build a complete",
 -        "scaffold",
 -        "initialize a new",
 -        "create a full",
 -        "implement a full",
 -        "develop a complete",
 -    ]
 -    is_complex = any(indicator in task_lower for indicator in complex_indicators)
+-
 -    simple_creation = [
 -        "create a file",
 -        "write a file",
 -        "make a file",
 -        "add a function",
 -        "edit the",
 -        "fix the",
 -        "update the",
 -        "read the",
 -        "show me",
 -        "list",
 -        "design a webpage",
 -        "create a webpage",
 -        "make a webpage",
 -        "create a page",
 -        "design a page",
 -        "create an html",
 -        "make an html",
 -        "write an html",
 -        "help me design",
 -        "create a simple",
 -        "make a simple",
 -        "write a simple",
 -    ]
 -    is_simple = any(indicator in task_lower for indicator in simple_creation)
+-
 -    if "write" in str(actions_taken).lower() and len(actions_taken) >= 1:
 +    if not actions_taken and response.lower().strip() in _EXPLICIT_COMPLETIONS:
          return False
 -    if is_simple and len(actions_taken) >= 1:
 -        return False
+-
 -    explicit_verification = ["and test", "and run", "and verify", "make sure it works"]
 -    needs_verification = any(indicator in task_lower for indicator in explicit_verification)
+-
 -    action_types = set()
 -    for action in actions_taken:
 -        action_lower = action.lower()
 -        if "write" in action_lower:
 -            action_types.add("write")
 -        elif "edit" in action_lower:
 -            action_types.add("edit")
 -        elif "bash" in action_lower:
 -            action_types.add("bash")
 -        elif "read" in action_lower:
 -            action_types.add("read")
 -        elif "glob" in action_lower or "grep" in action_lower:
 -            action_types.add("search")
+-
 -    if is_complex and len(actions_taken) < 3:
 -        return True
 -    if needs_verification and "bash" not in action_types:
 -        return True
+-
 -    deflection_phrases = ["you can now", "you should", "you can run", "you can use"]
 -    if any(phrase in response_lower for phrase in deflection_phrases) and len(actions_taken) < 2:
 -        return True
+-
 -    return False
 +    return not assess_completion_follow_through(
 +        task=task,
 +        response=response,
 +        actions_taken=actions_taken,
 +    ).is_complete
  def get_continuation_prompt(task: str, actions_taken: list[str], response: str) -> str:
      """Generate a helpful follow-through prompt for incomplete tasks."""
 +    return assess_completion_follow_through(
 +        task=task,
 +        response=response,
 +        actions_taken=actions_taken,
 +    ).continuation_prompt
 -    del response
 -    task_lower = task.lower()
 -    follow_ups: list[str] = []
 +def assess_completion_follow_through(
 +    *,
 +    task: str,
 +    response: str,
 +    actions_taken: list[str],
 +) -> TaskCompletionCheck:
 +    """Build a typed follow-through assessment for one candidate response."""
++
 +    task_lower = task.lower().strip()
 +    response_lower = response.lower().strip()
 +    action_types = _action_types(actions_taken)
 +    informational = _is_informational_task(task_lower)
 +    complex_task = any(indicator in task_lower for indicator in _COMPLEX_INDICATORS)
 +    simple_task = any(indicator in task_lower for indicator in _SIMPLE_TASK_INDICATORS)
 +    requires_verification = any(
 +        indicator in task_lower for indicator in _VERIFICATION_INDICATORS
 +    )
 +    requires_install = any(indicator in task_lower for indicator in _INSTALL_HINTS)
++
 +    accomplished = [_summarize_action(action) for action in actions_taken]
 +    required_evidence = _required_evidence(
 +        task_lower=task_lower,
 +        informational=informational,
 +        complex_task=complex_task,
 +        requires_verification=requires_verification,
 +        requires_install=requires_install,
 +    )
 +    missing_evidence: list[str] = []
 +    remaining: list[str] = []
 +    suggested_next_steps: list[str] = []
++
 +    if informational:
 +        return TaskCompletionCheck(
 +            original_task=task,
 +            is_complete=bool(response.strip()),
 +            accomplished=accomplished,
 +            required_evidence=required_evidence,
 +            missing_evidence=[],
 +            remaining=[],
 +            suggested_next_steps=[],
 +            continuation_prompt=_format_continuation_prompt(
 +                task=task,
 +                missing_evidence=[],
 +                suggested_next_steps=[],
 +                action_count=len(actions_taken),
 +            ),
 +        )
++
 +    if not actions_taken and _requires_action(task_lower):
 +        _append_follow_through_gap(
 +            missing_evidence,
 +            remaining,
 +            suggested_next_steps,
 +            evidence="showing the requested work was actually carried out",
 +            remaining_item="Perform the requested work instead of stopping at intent or narration",
 +            next_step="Carry out the requested change or command now",
 +        )
 -    if any(keyword in task_lower for keyword in ["install", "dependencies", "set up project"]):
 -        if "node" in task_lower or "npm" in task_lower:
 -            if not any("npm" in action for action in actions_taken):
 -                follow_ups.append("Run `npm install` to install dependencies")
 -        if "python" in task_lower or "pip" in task_lower:
 -            if not any("pip" in action or "uv" in action for action in actions_taken):
 -                follow_ups.append("Install dependencies")
 +    if requires_install and not _has_install_evidence(task_lower, action_types, actions_taken):
 +        _append_follow_through_gap(
 +            missing_evidence,
 +            remaining,
 +            suggested_next_steps,
 +            evidence="showing dependencies or setup steps were completed",
 +            remaining_item="Install or initialize the required dependencies",
 +            next_step=_install_follow_up(task_lower),
 +        )
 -    if "test" in task_lower and "run" in task_lower:
 -        if not any("test" in action or "pytest" in action or "jest" in action for action in actions_taken):
 -            follow_ups.append("Run the tests")
 +    if requires_verification and not _has_verification_evidence(action_types, actions_taken):
 +        _append_follow_through_gap(
 +            missing_evidence,
 +            remaining,
 +            suggested_next_steps,
 +            evidence="showing the result was run or verified",
 +            remaining_item="Run the result and capture a concrete verification outcome",
 +            next_step="Execute what you created or run the relevant tests now",
 +        )
 -    if any(keyword in task_lower for keyword in ["and run", "and test", "and verify", "make sure it works"]):
 -        follow_ups.append("Execute what was created to verify it works")
 +    if complex_task and len(actions_taken) < 3:
 +        _append_follow_through_gap(
 +            missing_evidence,
 +            remaining,
 +            suggested_next_steps,
 +            evidence="showing the broader end-to-end implementation or setup was completed",
 +            remaining_item="Finish the larger end-to-end task instead of stopping after a partial step",
 +            next_step="Continue through the remaining setup or implementation steps",
 +        )
 -    if follow_ups:
 -        steps = "\n".join(f"- {step}" for step in follow_ups[:2])
 -        return (
 -            f'The task was: "{task}"\n\n'
 -            f"You may need to also:\n{steps}\n\n"
 -            "If the task is actually complete, just confirm what was done."
 +    if (
 +        any(phrase in response_lower for phrase in _DEFLECTION_PHRASES)
 +        and len(actions_taken) < 2
 +    ):
 +        _append_follow_through_gap(
 +            missing_evidence,
 +            remaining,
 +            suggested_next_steps,
 +            evidence="showing execution evidence rather than instructions handed back to the user",
 +            remaining_item="Perform the work yourself or state concretely what you already verified",
 +            next_step="Continue the task instead of handing the next step to the user",
+         )
 -    return (
 -        f'Task: "{task}"\n'
 -        f"You took {len(actions_taken)} action(s). "
 -        "If there's more to do, continue. Otherwise, confirm completion."
 +    if "write" in action_types and actions_taken and simple_task:
 +        missing_evidence = [
 +            item
 +            for item in missing_evidence
 +            if item != "showing the requested work was actually carried out"
 +        ]
 +        remaining = [
 +            item
 +            for item in remaining
 +            if item != "Perform the requested work instead of stopping at intent or narration"
 +        ]
++
 +    is_complete = not missing_evidence
 +    return TaskCompletionCheck(
 +        original_task=task,
 +        is_complete=is_complete,
 +        accomplished=accomplished,
 +        required_evidence=required_evidence,
 +        missing_evidence=missing_evidence,
 +        remaining=remaining,
 +        suggested_next_steps=suggested_next_steps,
 +        continuation_prompt=_format_continuation_prompt(
 +            task=task,
 +            missing_evidence=missing_evidence,
 +            suggested_next_steps=suggested_next_steps,
 +            action_count=len(actions_taken),
 +        ),
+     )
              original_task=original_task,
              is_complete=data.get("is_complete", False),
              accomplished=data.get("accomplished", []),
 +            required_evidence=data.get("required_evidence", []),
 +            missing_evidence=data.get("missing_evidence", data.get("remaining", [])),
              remaining=data.get("remaining", []),
              suggested_next_steps=next_steps,
              continuation_prompt=continuation,
+         )
      except json.JSONDecodeError:
          return TaskCompletionCheck(original_task=original_task)
++
++
 +def _action_types(actions_taken: list[str]) -> set[str]:
 +    """Return the set of coarse action categories found in ``actions_taken``.
 +    Each action contributes at most one category: the first substring test
 +    that matches its lowercased text wins. Actions that match no test are
 +    ignored.
 +    """
 +    action_types: set[str] = set()
 +    for action in actions_taken:
 +        action_lower = action.lower()
 +        # NOTE(review): the tests run over the whole action string, so an
 +        # argument that merely contains "write" (e.g. a file path) would be
 +        # classified as a write -- confirm the action format with callers.
 +        if "write" in action_lower:
 +            action_types.add("write")
 +        elif "edit" in action_lower or "patch" in action_lower:
 +            action_types.add("edit")
 +        elif "bash" in action_lower or "shell" in action_lower:
 +            action_types.add("bash")
 +        elif "read" in action_lower:
 +            action_types.add("read")
 +        elif "glob" in action_lower or "grep" in action_lower or "search" in action_lower:
 +            action_types.add("search")
 +        elif "todo" in action_lower:
 +            # "todo" actions are grouped under the "workflow" category.
 +            action_types.add("workflow")
 +    return action_types
++
++
 +def _is_informational_task(task_lower: str) -> bool:
 +    """Return True when the lowercased task starts with an informational prefix.
 +    ``task_lower`` is expected to be already lowercased and stripped by the
 +    caller; the prefixes in ``_INFORMATIONAL_PREFIXES`` are matched verbatim.
 +    """
 +    # _INFORMATIONAL_PREFIXES already contains every question-word prefix
 +    # ("what ", "how ", "why ", "which ", "who ", "where ", "when "), so one
 +    # prefix test covers questions with or without a trailing "?".
 +    return task_lower.startswith(_INFORMATIONAL_PREFIXES)
++
++
 +def _requires_action(task_lower: str) -> bool:
 +    """Return True when the task text implies that work must be performed.
 +    The task qualifies if it contains any ``_ACTION_VERBS`` entry or any
 +    ``_SIMPLE_TASK_INDICATORS`` entry as a plain substring.
 +    """
 +    return any(verb in task_lower for verb in _ACTION_VERBS) or any(
 +        indicator in task_lower for indicator in _SIMPLE_TASK_INDICATORS
 +    )
++
++
 +def _required_evidence(
 +    *,
 +    task_lower: str,
 +    informational: bool,
 +    complex_task: bool,
 +    requires_verification: bool,
 +    requires_install: bool,
 +) -> list[str]:
 +    """List the evidence descriptions a response must provide for this task.
 +    Informational tasks require no evidence. Otherwise one description is
 +    appended per applicable condition, in a fixed order: action, install,
 +    verification, then complex end-to-end work.
 +    """
 +    if informational:
 +        return []
++
 +    required: list[str] = []
 +    if _requires_action(task_lower):
 +        required.append("showing the requested work was actually carried out")
 +    if requires_install:
 +        required.append("showing dependencies or setup steps were completed")
 +    if requires_verification:
 +        required.append("showing the result was run or verified")
 +    if complex_task:
 +        required.append("showing the broader end-to-end implementation or setup was completed")
 +    return required
++
++
 +def _has_install_evidence(
 +    task_lower: str,
 +    action_types: set[str],
 +    actions_taken: list[str],
 +) -> bool:
 +    """Return True when the recorded actions suggest an install/setup step ran.
 +    The check is a substring search over all actions joined and lowercased:
 +    "npm" for tasks mentioning a Node hint, "pip" or "uv" for tasks
 +    mentioning a Python hint, and otherwise "install", "init" or "setup".
 +    """
 +    # The parameter is accepted but not consulted by this implementation.
 +    del action_types
 +    action_text = " ".join(actions_taken).lower()
 +    if any(hint in task_lower for hint in _NODE_HINTS) and "npm" in action_text:
 +        return True
 +    # NOTE(review): "uv" and "init" are short substrings and may match
 +    # unrelated text such as file names -- confirm this looseness is intended.
 +    if any(hint in task_lower for hint in _PYTHON_HINTS) and (
 +        "pip" in action_text or "uv" in action_text
 +    ):
 +        return True
 +    return "install" in action_text or "init" in action_text or "setup" in action_text
++
++
 +def _has_verification_evidence(
 +    action_types: set[str],
 +    actions_taken: list[str],
 +) -> bool:
 +    """Return True when the actions suggest the result was run or verified.
 +    Any action in the "bash" category counts. Otherwise the joined, lowercased
 +    action text must contain one of the verification tokens as a substring.
 +    """
 +    if "bash" in action_types:
 +        return True
 +    action_text = " ".join(actions_taken).lower()
 +    # NOTE(review): substring tokens such as "test" or "run" also match file
 +    # names in action arguments -- confirm that is acceptable for callers.
 +    return any(
 +        token in action_text
 +        for token in ("test", "pytest", "jest", "verify", "run", "execute")
 +    )
++
++
 +def _install_follow_up(task_lower: str) -> str:
 +    """Return the suggested install step for the task.
 +    Node hints are checked before Python hints; a generic instruction is
 +    returned when the task mentions neither.
 +    """
 +    if any(hint in task_lower for hint in _NODE_HINTS):
 +        return "Run `npm install` to install dependencies"
 +    if any(hint in task_lower for hint in _PYTHON_HINTS):
 +        return "Install the Python dependencies"
 +    return "Install or initialize the required dependencies now"
++
++
 +def _append_follow_through_gap(
 +    missing_evidence: list[str],
 +    remaining: list[str],
 +    suggested_next_steps: list[str],
 +    *,
 +    evidence: str,
 +    remaining_item: str,
 +    next_step: str,
 +) -> None:
 +    """Record one follow-through gap in the three accumulator lists.
 +    The lists are mutated in place. Each value is appended only if it is not
 +    already present in its list, so repeated gaps do not create duplicates.
 +    """
 +    if evidence not in missing_evidence:
 +        missing_evidence.append(evidence)
 +    if remaining_item not in remaining:
 +        remaining.append(remaining_item)
 +    if next_step not in suggested_next_steps:
 +        suggested_next_steps.append(next_step)
++
++
 +def _format_continuation_prompt(
 +    *,
 +    task: str,
 +    missing_evidence: list[str],
 +    suggested_next_steps: list[str],
 +    action_count: int,
 +) -> str:
 +    """Build the follow-up prompt text for a candidate response.
 +    With at least one suggested step, the prompt quotes the task and lists up
 +    to two missing-evidence items and up to three next steps. Without any
 +    suggested step, a generic prompt reporting ``action_count`` is returned.
 +    """
 +    if suggested_next_steps:
 +        # Only the first two evidence items and first three steps are shown.
 +        evidence_lines = "\n".join(f"- {item}" for item in missing_evidence[:2])
 +        step_lines = "\n".join(f"- {step}" for step in suggested_next_steps[:3])
 +        return (
 +            f'The task was: "{task}"\n\n'
 +            "The response still needs concrete evidence for:\n"
 +            f"{evidence_lines}\n\n"
 +            "Continue with:\n"
 +            f"{step_lines}\n\n"
 +            "If the task is actually complete, confirm the missing evidence explicitly."
 +        )
++
 +    return (
 +        f'Task: "{task}"\n'
 +        f"You took {action_count} action(s). "
 +        "If there's more to do, continue. Otherwise, confirm completion."
 +    )
++
++
 +def _summarize_action(action: str) -> str:
 +    """Return the text before the first ":" in ``action``, stripped.
 +    Falls back to the whole stripped action when that leading part is empty.
 +    """
 +    head, _, _ = action.partition(":")
 +    return head.strip() or action.strip()

tests/test_completion_policy.py (modified)

      load_permission_rules,
+ )
  from loader.runtime.task_completion import (
 +    assess_completion_follow_through,
      detect_premature_completion,
      get_continuation_prompt,
+ )
          "The script has been created.",
+     )
 -    assert "Run the tests" in prompt or "verify it works" in prompt
 +    assert "Continue with" in prompt
 +    assert "run the relevant tests" in prompt.lower() or "verify" in prompt.lower()
++
++
 +def test_assess_completion_follow_through_tracks_missing_evidence() -> None:
 +    """A written script with no run/test action is reported as incomplete.
 +    Both the action and verification evidence are required, but only the
 +    verification evidence is missing because a write action was recorded.
 +    """
 +    check = assess_completion_follow_through(
 +        task="Create the script and test that it works.",
 +        response="The script has been created.",
 +        actions_taken=["write: script.py"],
 +    )
++
 +    assert check.is_complete is False
 +    assert "showing the requested work was actually carried out" in check.required_evidence
 +    assert "showing the result was run or verified" in check.required_evidence
 +    assert check.missing_evidence == ["showing the result was run or verified"]
 +    assert check.suggested_next_steps == [
 +        "Execute what you created or run the relevant tests now"
 +    ]
++
++
 +def test_assess_completion_follow_through_accepts_informational_tasks() -> None:
 +    """An "Explain ..." task with a non-empty response and no actions is complete.
 +    Informational tasks carry no required or missing evidence.
 +    """
 +    check = assess_completion_follow_through(
 +        task="Explain how Loader's workflow timeline works.",
 +        response="Loader records workflow decisions and policy events in a timeline.",
 +        actions_taken=[],
 +    )
++
 +    assert check.is_complete is True
 +    assert check.required_evidence == []
 +    assert check.missing_evidence == []
  @pytest.mark.asyncio
      assert decision.decision_summary == (
          "requested one continuation because the non-mutating response looked incomplete"
+     )
 +    assert decision.completion_check is not None
 +    assert decision.completion_check.missing_evidence == [
 +        "showing the requested work was actually carried out",
 +        "showing the result was run or verified",
 +    ]
      assert context.session.messages[-2] == Message(
          role=Role.ASSISTANT,
          content="I can handle that.",
      assert context.session.messages[-1].role == Role.USER
      assert "verify it works" in context.session.messages[-1].content.lower()
      assert events[0].type == "completion_check"
 +    assert events[0].completion_check is not None
 +    assert events[0].completion_check.missing_evidence == [
 +        "showing the requested work was actually carried out",
 +        "showing the result was run or verified",
 +    ]

tests/test_reasoning_compat.py (modified)

      assert completion.is_complete is False
      assert completion.remaining == ["Run the tests"]
 +    assert completion.missing_evidence == ["Run the tests"]
      assert "Run pytest -q" in completion.continuation_prompt

tests/test_reasoning_types.py (modified)

      assert critique.can_revise() is True
      assert completion.is_complete is False
      assert completion.accomplished == []
 +    assert completion.required_evidence == []
 +    assert completion.missing_evidence == []

tests/test_turn_completion.py (modified)

      assert prepared.summary.workflow_timeline[-1].policy_stage == "continuation_check"
      assert prepared.summary.workflow_timeline[-1].policy_outcome == "continue"
      assert agent.session.messages[-1].role.value == "user"
 -    assert "If there's more to do, continue" in agent.session.messages[-1].content
 +    assert "concrete evidence" in agent.session.messages[-1].content
 +    assert "Carry out the requested change or command now" in agent.session.messages[-1].content
      assert any(event.type == "completion_check" for event in events)