`e83c568`

Add typed completion follow-through assessment

Authored by

espadonne 1 month ago

SHA: e83c568e08b785112bccaa24824cf3495e435d0c
Parents: e2158c6
Tree: df3f54d

6 changed files

Status	File	+	-
M	`src/loader/runtime/completion_policy.py`	18	13
M	`src/loader/runtime/task_completion.py`	343	127
M	`tests/test_completion_policy.py`	41	1
M	`tests/test_reasoning_compat.py`	1	0
M	`tests/test_reasoning_types.py`	2	0
M	`tests/test_turn_completion.py`	2	1

src/loader/runtime/completion_policy.py (modified)

  from .context import RuntimeContext
  from .events import AgentEvent, TurnSummary
  from .reasoning_types import TaskCompletionCheck
--from .task_completion import detect_premature_completion, get_continuation_prompt
++from .task_completion import assess_completion_follow_through, detect_premature_completion
  EventSink = Callable[[AgentEvent], Awaitable[None]]
      should_continue: bool
      decision_code: str
      decision_summary: str
++    completion_check: TaskCompletionCheck | None = None
  class CompletionPolicy:
          """Nudge non-mutating tasks to continue when completion looks premature."""
          cfg = self.context.config.reasoning
++        completion_check = (
++            assess_completion_follow_through(
++                task=task,
++                response=content,
++                actions_taken=actions_taken,
++            )
++            if cfg.use_quick_completion
++            else None
++        )
          if continuation_count >= cfg.max_continuation_prompts:
              return ContinuationDecision(
                  should_continue=False,
                  decision_code="continuation_budget_exhausted",
                  decision_summary="accepted the response because the continuation budget was exhausted",
++                completion_check=completion_check,
+             )
          is_premature = (
                  should_continue=False,
                  decision_code="completion_response_accepted",
                  decision_summary="accepted the response because completion heuristics found no missing follow-through",
++                completion_check=completion_check,
+             )
--        continuation_prompt = get_continuation_prompt(
--            task,
--            actions_taken,
--            content,
--        )
          await emit(
              AgentEvent(
                  type="completion_check",
                  content=f"Task may be incomplete ({len(actions_taken)} actions taken)",
--                completion_check=TaskCompletionCheck(
++                completion_check=completion_check,
--                    original_task=task,
--                    is_complete=False,
--                    accomplished=[action.split(":")[0] for action in actions_taken],
--                    continuation_prompt=continuation_prompt,
--                ),
+             )
+         )
          self.context.session.append(Message(role=Role.ASSISTANT, content=response_content))
--        self.context.session.append(Message(role=Role.USER, content=continuation_prompt))
++        self.context.session.append(
++            Message(role=Role.USER, content=completion_check.continuation_prompt)
++        )
          return ContinuationDecision(
              should_continue=True,
              decision_code="premature_completion_nudge",
              decision_summary="requested one continuation because the non-mutating response looked incomplete",
++            completion_check=completion_check,
+         )
      @staticmethod

src/loader/runtime/task_completion.py (modified)

  from .reasoning_types import TaskCompletionCheck
++_ACTION_VERBS = ("create", "write", "make", "edit", "fix", "add", "delete", "run")
++_COMPLEX_INDICATORS = (
++    "set up a project",
++    "create a project",
++    "build a complete",
++    "scaffold",
++    "initialize a new",
++    "create a full",
++    "implement a full",
++    "develop a complete",
++)
++_SIMPLE_TASK_INDICATORS = (
++    "create a file",
++    "write a file",
++    "make a file",
++    "add a function",
++    "edit the",
++    "fix the",
++    "update the",
++    "read the",
++    "show me",
++    "list",
++    "design a webpage",
++    "create a webpage",
++    "make a webpage",
++    "create a page",
++    "design a page",
++    "create an html",
++    "make an html",
++    "write an html",
++    "help me design",
++    "create a simple",
++    "make a simple",
++    "write a simple",
++)
++_VERIFICATION_INDICATORS = ("and test", "and run", "and verify", "make sure it works")
++_DEFLECTION_PHRASES = ("you can now", "you should", "you can run", "you can use")
++_INFORMATIONAL_PREFIXES = (
++    "explain ",
++    "describe ",
++    "summarize ",
++    "compare ",
++    "outline ",
++    "review ",
++    "analyze ",
++    "what ",
++    "how ",
++    "why ",
++    "which ",
++    "who ",
++    "where ",
++    "when ",
++)
++_EXPLICIT_COMPLETIONS = {
++    "done",
++    "done.",
++    "completed",
++    "completed.",
++    "all set",
++    "all set.",
++}
++_INSTALL_HINTS = ("install", "dependencies", "set up project")
++_NODE_HINTS = ("node", "npm")
++_PYTHON_HINTS = ("python", "pip")
++
  COMPLETION_CHECK_PROMPT = """Evaluate if this task has been FULLY completed.
  Original task: {task}
      actions_taken: list[str],
  ) -> bool:
      """Heuristically detect when the assistant is stopping too early."""
--
++    if not actions_taken and response.lower().strip() in _EXPLICIT_COMPLETIONS:
--    task_lower = task.lower()
--    response_lower = response.lower()
--
--    if not actions_taken:
--        explicit_completion = response_lower.strip()
--        if explicit_completion in {
--            "done",
--            "done.",
--            "completed",
--            "completed.",
--            "all set",
--            "all set.",
--        }:
--            return False
--        action_verbs = ["create", "write", "make", "edit", "fix", "add", "delete", "run"]
--        if any(verb in task_lower for verb in action_verbs):
--            return True
--        return False
--
--    success_indicators = [
--        "successfully",
--        "created",
--        "written",
--        "done",
--        "completed",
--        "file now contains",
--        "has been updated",
--        "installed",
--    ]
--    if any(indicator in response_lower for indicator in success_indicators):
--        return False
--
--    complex_indicators = [
--        "set up a project",
--        "create a project",
--        "build a complete",
--        "scaffold",
--        "initialize a new",
--        "create a full",
--        "implement a full",
--        "develop a complete",
--    ]
--    is_complex = any(indicator in task_lower for indicator in complex_indicators)
--
--    simple_creation = [
--        "create a file",
--        "write a file",
--        "make a file",
--        "add a function",
--        "edit the",
--        "fix the",
--        "update the",
--        "read the",
--        "show me",
--        "list",
--        "design a webpage",
--        "create a webpage",
--        "make a webpage",
--        "create a page",
--        "design a page",
--        "create an html",
--        "make an html",
--        "write an html",
--        "help me design",
--        "create a simple",
--        "make a simple",
--        "write a simple",
--    ]
--    is_simple = any(indicator in task_lower for indicator in simple_creation)
--
--    if "write" in str(actions_taken).lower() and len(actions_taken) >= 1:
          return False
--    if is_simple and len(actions_taken) >= 1:
++    return not assess_completion_follow_through(
--        return False
++        task=task,
--
++        response=response,
--    explicit_verification = ["and test", "and run", "and verify", "make sure it works"]
++        actions_taken=actions_taken,
--    needs_verification = any(indicator in task_lower for indicator in explicit_verification)
++    ).is_complete
--
--    action_types = set()
--    for action in actions_taken:
--        action_lower = action.lower()
--        if "write" in action_lower:
--            action_types.add("write")
--        elif "edit" in action_lower:
--            action_types.add("edit")
--        elif "bash" in action_lower:
--            action_types.add("bash")
--        elif "read" in action_lower:
--            action_types.add("read")
--        elif "glob" in action_lower or "grep" in action_lower:
--            action_types.add("search")
--
--    if is_complex and len(actions_taken) < 3:
--        return True
--    if needs_verification and "bash" not in action_types:
--        return True
--
--    deflection_phrases = ["you can now", "you should", "you can run", "you can use"]
--    if any(phrase in response_lower for phrase in deflection_phrases) and len(actions_taken) < 2:
--        return True
--
--    return False
  def get_continuation_prompt(task: str, actions_taken: list[str], response: str) -> str:
      """Generate a helpful follow-through prompt for incomplete tasks."""
++    return assess_completion_follow_through(
++        task=task,
++        response=response,
++        actions_taken=actions_taken,
++    ).continuation_prompt
--    del response
--    task_lower = task.lower()
++def assess_completion_follow_through(
--    follow_ups: list[str] = []
++    *,
++    task: str,
++    response: str,
++    actions_taken: list[str],
++) -> TaskCompletionCheck:
++    """Build a typed follow-through assessment for one candidate response."""
++
++    task_lower = task.lower().strip()
++    response_lower = response.lower().strip()
++    action_types = _action_types(actions_taken)
++    informational = _is_informational_task(task_lower)
++    complex_task = any(indicator in task_lower for indicator in _COMPLEX_INDICATORS)
++    simple_task = any(indicator in task_lower for indicator in _SIMPLE_TASK_INDICATORS)
++    requires_verification = any(
++        indicator in task_lower for indicator in _VERIFICATION_INDICATORS
++    )
++    requires_install = any(indicator in task_lower for indicator in _INSTALL_HINTS)
++
++    accomplished = [_summarize_action(action) for action in actions_taken]
++    required_evidence = _required_evidence(
++        task_lower=task_lower,
++        informational=informational,
++        complex_task=complex_task,
++        requires_verification=requires_verification,
++        requires_install=requires_install,
++    )
++    missing_evidence: list[str] = []
++    remaining: list[str] = []
++    suggested_next_steps: list[str] = []
++
++    if informational:
++        return TaskCompletionCheck(
++            original_task=task,
++            is_complete=bool(response.strip()),
++            accomplished=accomplished,
++            required_evidence=required_evidence,
++            missing_evidence=[],
++            remaining=[],
++            suggested_next_steps=[],
++            continuation_prompt=_format_continuation_prompt(
++                task=task,
++                missing_evidence=[],
++                suggested_next_steps=[],
++                action_count=len(actions_taken),
++            ),
++        )
++
++    if not actions_taken and _requires_action(task_lower):
++        _append_follow_through_gap(
++            missing_evidence,
++            remaining,
++            suggested_next_steps,
++            evidence="showing the requested work was actually carried out",
++            remaining_item="Perform the requested work instead of stopping at intent or narration",
++            next_step="Carry out the requested change or command now",
++        )
--    if any(keyword in task_lower for keyword in ["install", "dependencies", "set up project"]):
++    if requires_install and not _has_install_evidence(task_lower, action_types, actions_taken):
--        if "node" in task_lower or "npm" in task_lower:
++        _append_follow_through_gap(
--            if not any("npm" in action for action in actions_taken):
++            missing_evidence,
--                follow_ups.append("Run `npm install` to install dependencies")
++            remaining,
--        if "python" in task_lower or "pip" in task_lower:
++            suggested_next_steps,
--            if not any("pip" in action or "uv" in action for action in actions_taken):
++            evidence="showing dependencies or setup steps were completed",
--                follow_ups.append("Install dependencies")
++            remaining_item="Install or initialize the required dependencies",
++            next_step=_install_follow_up(task_lower),
++        )
--    if "test" in task_lower and "run" in task_lower:
++    if requires_verification and not _has_verification_evidence(action_types, actions_taken):
--        if not any("test" in action or "pytest" in action or "jest" in action for action in actions_taken):
++        _append_follow_through_gap(
--            follow_ups.append("Run the tests")
++            missing_evidence,
++            remaining,
++            suggested_next_steps,
++            evidence="showing the result was run or verified",
++            remaining_item="Run the result and capture a concrete verification outcome",
++            next_step="Execute what you created or run the relevant tests now",
++        )
--    if any(keyword in task_lower for keyword in ["and run", "and test", "and verify", "make sure it works"]):
++    if complex_task and len(actions_taken) < 3:
--        follow_ups.append("Execute what was created to verify it works")
++        _append_follow_through_gap(
++            missing_evidence,
++            remaining,
++            suggested_next_steps,
++            evidence="showing the broader end-to-end implementation or setup was completed",
++            remaining_item="Finish the larger end-to-end task instead of stopping after a partial step",
++            next_step="Continue through the remaining setup or implementation steps",
++        )
--    if follow_ups:
++    if (
--        steps = "\n".join(f"- {step}" for step in follow_ups[:2])
++        any(phrase in response_lower for phrase in _DEFLECTION_PHRASES)
--        return (
++        and len(actions_taken) < 2
--            f'The task was: "{task}"\n\n'
++    ):
--            f"You may need to also:\n{steps}\n\n"
++        _append_follow_through_gap(
--            "If the task is actually complete, just confirm what was done."
++            missing_evidence,
++            remaining,
++            suggested_next_steps,
++            evidence="showing execution evidence rather than instructions handed back to the user",
++            remaining_item="Perform the work yourself or state concretely what you already verified",
++            next_step="Continue the task instead of handing the next step to the user",
+         )
--    return (
++    if "write" in action_types and actions_taken and simple_task:
--        f'Task: "{task}"\n'
++        missing_evidence = [
--        f"You took {len(actions_taken)} action(s). "
++            item
--        "If there's more to do, continue. Otherwise, confirm completion."
++            for item in missing_evidence
++            if item != "showing the requested work was actually carried out"
++        ]
++        remaining = [
++            item
++            for item in remaining
++            if item != "Perform the requested work instead of stopping at intent or narration"
++        ]
++
++    is_complete = not missing_evidence
++    return TaskCompletionCheck(
++        original_task=task,
++        is_complete=is_complete,
++        accomplished=accomplished,
++        required_evidence=required_evidence,
++        missing_evidence=missing_evidence,
++        remaining=remaining,
++        suggested_next_steps=suggested_next_steps,
++        continuation_prompt=_format_continuation_prompt(
++            task=task,
++            missing_evidence=missing_evidence,
++            suggested_next_steps=suggested_next_steps,
++            action_count=len(actions_taken),
++        ),
+     )
              original_task=original_task,
              is_complete=data.get("is_complete", False),
              accomplished=data.get("accomplished", []),
++            required_evidence=data.get("required_evidence", []),
++            missing_evidence=data.get("missing_evidence", data.get("remaining", [])),
              remaining=data.get("remaining", []),
              suggested_next_steps=next_steps,
              continuation_prompt=continuation,
+         )
      except json.JSONDecodeError:
          return TaskCompletionCheck(original_task=original_task)
++
++
++def _action_types(actions_taken: list[str]) -> set[str]:
++    action_types: set[str] = set()
++    for action in actions_taken:
++        action_lower = action.lower()
++        if "write" in action_lower:
++            action_types.add("write")
++        elif "edit" in action_lower or "patch" in action_lower:
++            action_types.add("edit")
++        elif "bash" in action_lower or "shell" in action_lower:
++            action_types.add("bash")
++        elif "read" in action_lower:
++            action_types.add("read")
++        elif "glob" in action_lower or "grep" in action_lower or "search" in action_lower:
++            action_types.add("search")
++        elif "todo" in action_lower:
++            action_types.add("workflow")
++    return action_types
++
++
++def _is_informational_task(task_lower: str) -> bool:
++    if task_lower.startswith(_INFORMATIONAL_PREFIXES):
++        return True
++    if task_lower.endswith("?") and task_lower.startswith(
++        ("what ", "how ", "why ", "which ", "who ", "where ", "when ")
++    ):
++        return True
++    return False
++
++
++def _requires_action(task_lower: str) -> bool:
++    return any(verb in task_lower for verb in _ACTION_VERBS) or any(
++        indicator in task_lower for indicator in _SIMPLE_TASK_INDICATORS
++    )
++
++
++def _required_evidence(
++    *,
++    task_lower: str,
++    informational: bool,
++    complex_task: bool,
++    requires_verification: bool,
++    requires_install: bool,
++) -> list[str]:
++    if informational:
++        return []
++
++    required: list[str] = []
++    if _requires_action(task_lower):
++        required.append("showing the requested work was actually carried out")
++    if requires_install:
++        required.append("showing dependencies or setup steps were completed")
++    if requires_verification:
++        required.append("showing the result was run or verified")
++    if complex_task:
++        required.append("showing the broader end-to-end implementation or setup was completed")
++    return required
++
++
++def _has_install_evidence(
++    task_lower: str,
++    action_types: set[str],
++    actions_taken: list[str],
++) -> bool:
++    del action_types
++    action_text = " ".join(actions_taken).lower()
++    if any(hint in task_lower for hint in _NODE_HINTS) and "npm" in action_text:
++        return True
++    if any(hint in task_lower for hint in _PYTHON_HINTS) and (
++        "pip" in action_text or "uv" in action_text
++    ):
++        return True
++    return "install" in action_text or "init" in action_text or "setup" in action_text
++
++
++def _has_verification_evidence(
++    action_types: set[str],
++    actions_taken: list[str],
++) -> bool:
++    if "bash" in action_types:
++        return True
++    action_text = " ".join(actions_taken).lower()
++    return any(
++        token in action_text
++        for token in ("test", "pytest", "jest", "verify", "run", "execute")
++    )
++
++
++def _install_follow_up(task_lower: str) -> str:
++    if any(hint in task_lower for hint in _NODE_HINTS):
++        return "Run `npm install` to install dependencies"
++    if any(hint in task_lower for hint in _PYTHON_HINTS):
++        return "Install the Python dependencies"
++    return "Install or initialize the required dependencies now"
++
++
++def _append_follow_through_gap(
++    missing_evidence: list[str],
++    remaining: list[str],
++    suggested_next_steps: list[str],
++    *,
++    evidence: str,
++    remaining_item: str,
++    next_step: str,
++) -> None:
++    if evidence not in missing_evidence:
++        missing_evidence.append(evidence)
++    if remaining_item not in remaining:
++        remaining.append(remaining_item)
++    if next_step not in suggested_next_steps:
++        suggested_next_steps.append(next_step)
++
++
++def _format_continuation_prompt(
++    *,
++    task: str,
++    missing_evidence: list[str],
++    suggested_next_steps: list[str],
++    action_count: int,
++) -> str:
++    if suggested_next_steps:
++        evidence_lines = "\n".join(f"- {item}" for item in missing_evidence[:2])
++        step_lines = "\n".join(f"- {step}" for step in suggested_next_steps[:3])
++        return (
++            f'The task was: "{task}"\n\n'
++            "The response still needs concrete evidence for:\n"
++            f"{evidence_lines}\n\n"
++            "Continue with:\n"
++            f"{step_lines}\n\n"
++            "If the task is actually complete, confirm the missing evidence explicitly."
++        )
++
++    return (
++        f'Task: "{task}"\n'
++        f"You took {action_count} action(s). "
++        "If there's more to do, continue. Otherwise, confirm completion."
++    )
++
++
++def _summarize_action(action: str) -> str:
++    head, _, _ = action.partition(":")
++    return head.strip() or action.strip()

tests/test_completion_policy.py (modified)

      load_permission_rules,
+ )
  from loader.runtime.task_completion import (
++    assess_completion_follow_through,
      detect_premature_completion,
      get_continuation_prompt,
+ )
          "The script has been created.",
+     )
--    assert "Run the tests" in prompt or "verify it works" in prompt
++    assert "Continue with" in prompt
++    assert "run the relevant tests" in prompt.lower() or "verify" in prompt.lower()
++
++
++def test_assess_completion_follow_through_tracks_missing_evidence() -> None:
++    check = assess_completion_follow_through(
++        task="Create the script and test that it works.",
++        response="The script has been created.",
++        actions_taken=["write: script.py"],
++    )
++
++    assert check.is_complete is False
++    assert "showing the requested work was actually carried out" in check.required_evidence
++    assert "showing the result was run or verified" in check.required_evidence
++    assert check.missing_evidence == ["showing the result was run or verified"]
++    assert check.suggested_next_steps == [
++        "Execute what you created or run the relevant tests now"
++    ]
++
++
++def test_assess_completion_follow_through_accepts_informational_tasks() -> None:
++    check = assess_completion_follow_through(
++        task="Explain how Loader's workflow timeline works.",
++        response="Loader records workflow decisions and policy events in a timeline.",
++        actions_taken=[],
++    )
++
++    assert check.is_complete is True
++    assert check.required_evidence == []
++    assert check.missing_evidence == []
  @pytest.mark.asyncio
      assert decision.decision_summary == (
          "requested one continuation because the non-mutating response looked incomplete"
+     )
++    assert decision.completion_check is not None
++    assert decision.completion_check.missing_evidence == [
++        "showing the requested work was actually carried out",
++        "showing the result was run or verified",
++    ]
      assert context.session.messages[-2] == Message(
          role=Role.ASSISTANT,
          content="I can handle that.",
      assert context.session.messages[-1].role == Role.USER
      assert "verify it works" in context.session.messages[-1].content.lower()
      assert events[0].type == "completion_check"
++    assert events[0].completion_check is not None
++    assert events[0].completion_check.missing_evidence == [
++        "showing the requested work was actually carried out",
++        "showing the result was run or verified",
++    ]

tests/test_reasoning_compat.py (modified)


 
     assert completion.is_complete is False
     assert completion.remaining == ["Run the tests"]
+    assert completion.missing_evidence == ["Run the tests"]
     assert "Run pytest -q" in completion.continuation_prompt
 
 

tests/test_reasoning_types.py (modified)

`@@ -60,3 +60,5 @@` def test_self_critique_and_completion_defaults_are_stable() -> None:
60	assert critique.can_revise() is True	60	assert critique.can_revise() is True
61	assert completion.is_complete is False	61	assert completion.is_complete is False
62	assert completion.accomplished == []	62	assert completion.accomplished == []
		63	+ assert completion.required_evidence == []
		64	+ assert completion.missing_evidence == []

tests/test_turn_completion.py (modified)


     assert prepared.summary.workflow_timeline[-1].policy_stage == "continuation_check"
     assert prepared.summary.workflow_timeline[-1].policy_outcome == "continue"
     assert agent.session.messages[-1].role.value == "user"
+    assert "concrete evidence" in agent.session.messages[-1].content
+    assert "Carry out the requested change or command now" in agent.session.messages[-1].content
     assert any(event.type == "completion_check" for event in events)
 
 

`@@ -56,6 +56,7 @@` def test_parse_completion_check_builds_continuation_prompt() -> None:
56		56
57	assert completion.is_complete is False	57	assert completion.is_complete is False
58	assert completion.remaining == ["Run the tests"]	58	assert completion.remaining == ["Run the tests"]
		59	+ assert completion.missing_evidence == ["Run the tests"]
59	assert "Run pytest -q" in completion.continuation_prompt	60	assert "Run pytest -q" in completion.continuation_prompt
60		61
61		62

`@@ -82,7 +82,8 @@` async def test_turn_completion_requests_continuation_for_premature_text_response
82	assert prepared.summary.workflow_timeline[-1].policy_stage == "continuation_check"	82	assert prepared.summary.workflow_timeline[-1].policy_stage == "continuation_check"
83	assert prepared.summary.workflow_timeline[-1].policy_outcome == "continue"	83	assert prepared.summary.workflow_timeline[-1].policy_outcome == "continue"
84	assert agent.session.messages[-1].role.value == "user"	84	assert agent.session.messages[-1].role.value == "user"
85	- assert "If there's more to do, continue" in agent.session.messages[-1].content	85	+ assert "concrete evidence" in agent.session.messages[-1].content
		86	+ assert "Carry out the requested change or command now" in agent.session.messages[-1].content
86	assert any(event.type == "completion_check" for event in events)	87	assert any(event.type == "completion_check" for event in events)
87		88
88		89