tenseleyflow/loader / e83c568

Browse files

Add typed completion follow-through assessment

Authored by espadonne
SHA
e83c568e08b785112bccaa24824cf3495e435d0c
Parents
e2158c6
Tree
df3f54d

6 changed files

Status | File | + | -
M src/loader/runtime/completion_policy.py 18 13
M src/loader/runtime/task_completion.py 343 127
M tests/test_completion_policy.py 41 1
M tests/test_reasoning_compat.py 1 0
M tests/test_reasoning_types.py 2 0
M tests/test_turn_completion.py 2 1
src/loader/runtime/completion_policy.py (modified)
@@ -9,7 +9,7 @@ from ..llm.base import Message, Role
99
 from .context import RuntimeContext
1010
 from .events import AgentEvent, TurnSummary
1111
 from .reasoning_types import TaskCompletionCheck
12
-from .task_completion import detect_premature_completion, get_continuation_prompt
12
+from .task_completion import assess_completion_follow_through, detect_premature_completion
1313
 
1414
 EventSink = Callable[[AgentEvent], Awaitable[None]]
1515
 
@@ -31,6 +31,7 @@ class ContinuationDecision:
3131
     should_continue: bool
3232
     decision_code: str
3333
     decision_summary: str
34
+    completion_check: TaskCompletionCheck | None = None
3435
 
3536
 
3637
 class CompletionPolicy:
@@ -92,11 +93,21 @@ class CompletionPolicy:
9293
         """Nudge non-mutating tasks to continue when completion looks premature."""
9394
 
9495
         cfg = self.context.config.reasoning
96
+        completion_check = (
97
+            assess_completion_follow_through(
98
+                task=task,
99
+                response=content,
100
+                actions_taken=actions_taken,
101
+            )
102
+            if cfg.use_quick_completion
103
+            else None
104
+        )
95105
         if continuation_count >= cfg.max_continuation_prompts:
96106
             return ContinuationDecision(
97107
                 should_continue=False,
98108
                 decision_code="continuation_budget_exhausted",
99109
                 decision_summary="accepted the response because the continuation budget was exhausted",
110
+                completion_check=completion_check,
100111
             )
101112
 
102113
         is_premature = (
@@ -109,31 +120,25 @@ class CompletionPolicy:
109120
                 should_continue=False,
110121
                 decision_code="completion_response_accepted",
111122
                 decision_summary="accepted the response because completion heuristics found no missing follow-through",
123
+                completion_check=completion_check,
112124
             )
113125
 
114
-        continuation_prompt = get_continuation_prompt(
115
-            task,
116
-            actions_taken,
117
-            content,
118
-        )
119126
         await emit(
120127
             AgentEvent(
121128
                 type="completion_check",
122129
                 content=f"Task may be incomplete ({len(actions_taken)} actions taken)",
123
-                completion_check=TaskCompletionCheck(
124
-                    original_task=task,
125
-                    is_complete=False,
126
-                    accomplished=[action.split(":")[0] for action in actions_taken],
127
-                    continuation_prompt=continuation_prompt,
128
-                ),
130
+                completion_check=completion_check,
129131
             )
130132
         )
131133
         self.context.session.append(Message(role=Role.ASSISTANT, content=response_content))
132
-        self.context.session.append(Message(role=Role.USER, content=continuation_prompt))
134
+        self.context.session.append(
135
+            Message(role=Role.USER, content=completion_check.continuation_prompt)
136
+        )
133137
         return ContinuationDecision(
134138
             should_continue=True,
135139
             decision_code="premature_completion_nudge",
136140
             decision_summary="requested one continuation because the non-mutating response looked incomplete",
141
+            completion_check=completion_check,
137142
         )
138143
 
139144
     @staticmethod
src/loader/runtime/task_completion.py (modified)
@@ -6,6 +6,71 @@ import re
66
 
77
 from .reasoning_types import TaskCompletionCheck
88
 
9
+_ACTION_VERBS = ("create", "write", "make", "edit", "fix", "add", "delete", "run")
10
+_COMPLEX_INDICATORS = (
11
+    "set up a project",
12
+    "create a project",
13
+    "build a complete",
14
+    "scaffold",
15
+    "initialize a new",
16
+    "create a full",
17
+    "implement a full",
18
+    "develop a complete",
19
+)
20
+_SIMPLE_TASK_INDICATORS = (
21
+    "create a file",
22
+    "write a file",
23
+    "make a file",
24
+    "add a function",
25
+    "edit the",
26
+    "fix the",
27
+    "update the",
28
+    "read the",
29
+    "show me",
30
+    "list",
31
+    "design a webpage",
32
+    "create a webpage",
33
+    "make a webpage",
34
+    "create a page",
35
+    "design a page",
36
+    "create an html",
37
+    "make an html",
38
+    "write an html",
39
+    "help me design",
40
+    "create a simple",
41
+    "make a simple",
42
+    "write a simple",
43
+)
44
+_VERIFICATION_INDICATORS = ("and test", "and run", "and verify", "make sure it works")
45
+_DEFLECTION_PHRASES = ("you can now", "you should", "you can run", "you can use")
46
+_INFORMATIONAL_PREFIXES = (
47
+    "explain ",
48
+    "describe ",
49
+    "summarize ",
50
+    "compare ",
51
+    "outline ",
52
+    "review ",
53
+    "analyze ",
54
+    "what ",
55
+    "how ",
56
+    "why ",
57
+    "which ",
58
+    "who ",
59
+    "where ",
60
+    "when ",
61
+)
62
+_EXPLICIT_COMPLETIONS = {
63
+    "done",
64
+    "done.",
65
+    "completed",
66
+    "completed.",
67
+    "all set",
68
+    "all set.",
69
+}
70
+_INSTALL_HINTS = ("install", "dependencies", "set up project")
71
+_NODE_HINTS = ("node", "npm")
72
+_PYTHON_HINTS = ("python", "pip")
73
+
974
 COMPLETION_CHECK_PROMPT = """Evaluate if this task has been FULLY completed.
1075
 
1176
 Original task: {task}
@@ -39,146 +104,152 @@ def detect_premature_completion(
39104
     actions_taken: list[str],
40105
 ) -> bool:
41106
     """Heuristically detect when the assistant is stopping too early."""
42
-
43
-    task_lower = task.lower()
44
-    response_lower = response.lower()
45
-
46
-    if not actions_taken:
47
-        explicit_completion = response_lower.strip()
48
-        if explicit_completion in {
49
-            "done",
50
-            "done.",
51
-            "completed",
52
-            "completed.",
53
-            "all set",
54
-            "all set.",
55
-        }:
56
-            return False
57
-        action_verbs = ["create", "write", "make", "edit", "fix", "add", "delete", "run"]
58
-        if any(verb in task_lower for verb in action_verbs):
59
-            return True
60
-        return False
61
-
62
-    success_indicators = [
63
-        "successfully",
64
-        "created",
65
-        "written",
66
-        "done",
67
-        "completed",
68
-        "file now contains",
69
-        "has been updated",
70
-        "installed",
71
-    ]
72
-    if any(indicator in response_lower for indicator in success_indicators):
73
-        return False
74
-
75
-    complex_indicators = [
76
-        "set up a project",
77
-        "create a project",
78
-        "build a complete",
79
-        "scaffold",
80
-        "initialize a new",
81
-        "create a full",
82
-        "implement a full",
83
-        "develop a complete",
84
-    ]
85
-    is_complex = any(indicator in task_lower for indicator in complex_indicators)
86
-
87
-    simple_creation = [
88
-        "create a file",
89
-        "write a file",
90
-        "make a file",
91
-        "add a function",
92
-        "edit the",
93
-        "fix the",
94
-        "update the",
95
-        "read the",
96
-        "show me",
97
-        "list",
98
-        "design a webpage",
99
-        "create a webpage",
100
-        "make a webpage",
101
-        "create a page",
102
-        "design a page",
103
-        "create an html",
104
-        "make an html",
105
-        "write an html",
106
-        "help me design",
107
-        "create a simple",
108
-        "make a simple",
109
-        "write a simple",
110
-    ]
111
-    is_simple = any(indicator in task_lower for indicator in simple_creation)
112
-
113
-    if "write" in str(actions_taken).lower() and len(actions_taken) >= 1:
107
+    if not actions_taken and response.lower().strip() in _EXPLICIT_COMPLETIONS:
114108
         return False
115
-    if is_simple and len(actions_taken) >= 1:
116
-        return False
117
-
118
-    explicit_verification = ["and test", "and run", "and verify", "make sure it works"]
119
-    needs_verification = any(indicator in task_lower for indicator in explicit_verification)
120
-
121
-    action_types = set()
122
-    for action in actions_taken:
123
-        action_lower = action.lower()
124
-        if "write" in action_lower:
125
-            action_types.add("write")
126
-        elif "edit" in action_lower:
127
-            action_types.add("edit")
128
-        elif "bash" in action_lower:
129
-            action_types.add("bash")
130
-        elif "read" in action_lower:
131
-            action_types.add("read")
132
-        elif "glob" in action_lower or "grep" in action_lower:
133
-            action_types.add("search")
134
-
135
-    if is_complex and len(actions_taken) < 3:
136
-        return True
137
-    if needs_verification and "bash" not in action_types:
138
-        return True
139
-
140
-    deflection_phrases = ["you can now", "you should", "you can run", "you can use"]
141
-    if any(phrase in response_lower for phrase in deflection_phrases) and len(actions_taken) < 2:
142
-        return True
143
-
144
-    return False
109
+    return not assess_completion_follow_through(
110
+        task=task,
111
+        response=response,
112
+        actions_taken=actions_taken,
113
+    ).is_complete
145114
 
146115
 
147116
 def get_continuation_prompt(task: str, actions_taken: list[str], response: str) -> str:
148117
     """Generate a helpful follow-through prompt for incomplete tasks."""
118
+    return assess_completion_follow_through(
119
+        task=task,
120
+        response=response,
121
+        actions_taken=actions_taken,
122
+    ).continuation_prompt
149123
 
150
-    del response
151124
 
152
-    task_lower = task.lower()
153
-    follow_ups: list[str] = []
125
def assess_completion_follow_through(
    *,
    task: str,
    response: str,
    actions_taken: list[str],
) -> TaskCompletionCheck:
    """Build a typed follow-through assessment for one candidate response.

    Args:
        task: The original task text.
        response: The assistant's candidate final response.
        actions_taken: Recorded actions for the turn, one string per action.

    Returns:
        A ``TaskCompletionCheck`` listing what was accomplished, which
        evidence the task requires, which evidence is still missing, and a
        continuation prompt. ``is_complete`` is true when no evidence is
        missing; for informational tasks it is true whenever the response is
        non-blank.
    """

    task_lower = task.lower().strip()
    response_lower = response.lower().strip()
    action_types = _action_types(actions_taken)
    informational = _is_informational_task(task_lower)
    complex_task = any(indicator in task_lower for indicator in _COMPLEX_INDICATORS)
    requires_verification = any(
        indicator in task_lower for indicator in _VERIFICATION_INDICATORS
    )
    requires_install = any(indicator in task_lower for indicator in _INSTALL_HINTS)

    accomplished = [_summarize_action(action) for action in actions_taken]
    required_evidence = _required_evidence(
        task_lower=task_lower,
        informational=informational,
        complex_task=complex_task,
        requires_verification=requires_verification,
        requires_install=requires_install,
    )

    # Informational tasks need no execution evidence: any non-blank answer is
    # accepted and no follow-through gaps are recorded.
    if informational:
        return TaskCompletionCheck(
            original_task=task,
            is_complete=bool(response.strip()),
            accomplished=accomplished,
            required_evidence=required_evidence,
            missing_evidence=[],
            remaining=[],
            suggested_next_steps=[],
            continuation_prompt=_format_continuation_prompt(
                task=task,
                missing_evidence=[],
                suggested_next_steps=[],
                action_count=len(actions_taken),
            ),
        )

    missing_evidence: list[str] = []
    remaining: list[str] = []
    suggested_next_steps: list[str] = []

    # The task asks for action but nothing was done at all.
    if not actions_taken and _requires_action(task_lower):
        _append_follow_through_gap(
            missing_evidence,
            remaining,
            suggested_next_steps,
            evidence="showing the requested work was actually carried out",
            remaining_item="Perform the requested work instead of stopping at intent or narration",
            next_step="Carry out the requested change or command now",
        )

    if requires_install and not _has_install_evidence(task_lower, action_types, actions_taken):
        _append_follow_through_gap(
            missing_evidence,
            remaining,
            suggested_next_steps,
            evidence="showing dependencies or setup steps were completed",
            remaining_item="Install or initialize the required dependencies",
            next_step=_install_follow_up(task_lower),
        )

    if requires_verification and not _has_verification_evidence(action_types, actions_taken):
        _append_follow_through_gap(
            missing_evidence,
            remaining,
            suggested_next_steps,
            evidence="showing the result was run or verified",
            remaining_item="Run the result and capture a concrete verification outcome",
            next_step="Execute what you created or run the relevant tests now",
        )

    # A broad task with fewer than three recorded actions is treated as partial.
    if complex_task and len(actions_taken) < 3:
        _append_follow_through_gap(
            missing_evidence,
            remaining,
            suggested_next_steps,
            evidence="showing the broader end-to-end implementation or setup was completed",
            remaining_item="Finish the larger end-to-end task instead of stopping after a partial step",
            next_step="Continue through the remaining setup or implementation steps",
        )

    # The response hands the next step back to the user after at most one action.
    if (
        any(phrase in response_lower for phrase in _DEFLECTION_PHRASES)
        and len(actions_taken) < 2
    ):
        _append_follow_through_gap(
            missing_evidence,
            remaining,
            suggested_next_steps,
            evidence="showing execution evidence rather than instructions handed back to the user",
            remaining_item="Perform the work yourself or state concretely what you already verified",
            next_step="Continue the task instead of handing the next step to the user",
        )

    # NOTE: an earlier version filtered the "work was actually carried out" gap
    # back out when a write action was recorded for a simple task. That gap is
    # only ever added when no actions were recorded, so the filter could never
    # remove anything and has been dropped.

    return TaskCompletionCheck(
        original_task=task,
        is_complete=not missing_evidence,
        accomplished=accomplished,
        required_evidence=required_evidence,
        missing_evidence=missing_evidence,
        remaining=remaining,
        suggested_next_steps=suggested_next_steps,
        continuation_prompt=_format_continuation_prompt(
            task=task,
            missing_evidence=missing_evidence,
            suggested_next_steps=suggested_next_steps,
            action_count=len(actions_taken),
        ),
    )
183254
 
184255
 
@@ -207,9 +278,154 @@ def parse_completion_check(response: str, original_task: str) -> TaskCompletionC
207278
             original_task=original_task,
208279
             is_complete=data.get("is_complete", False),
209280
             accomplished=data.get("accomplished", []),
281
+            required_evidence=data.get("required_evidence", []),
282
+            missing_evidence=data.get("missing_evidence", data.get("remaining", [])),
210283
             remaining=data.get("remaining", []),
211284
             suggested_next_steps=next_steps,
212285
             continuation_prompt=continuation,
213286
         )
214287
     except json.JSONDecodeError:
215288
         return TaskCompletionCheck(original_task=original_task)
289
+
290
+
291
+def _action_types(actions_taken: list[str]) -> set[str]:
292
+    action_types: set[str] = set()
293
+    for action in actions_taken:
294
+        action_lower = action.lower()
295
+        if "write" in action_lower:
296
+            action_types.add("write")
297
+        elif "edit" in action_lower or "patch" in action_lower:
298
+            action_types.add("edit")
299
+        elif "bash" in action_lower or "shell" in action_lower:
300
+            action_types.add("bash")
301
+        elif "read" in action_lower:
302
+            action_types.add("read")
303
+        elif "glob" in action_lower or "grep" in action_lower or "search" in action_lower:
304
+            action_types.add("search")
305
+        elif "todo" in action_lower:
306
+            action_types.add("workflow")
307
+    return action_types
308
+
309
+
310
def _is_informational_task(task_lower: str) -> bool:
    """Return whether the task opens with an informational prefix.

    ``task_lower`` is expected to be lower-cased and stripped already, as the
    caller in this module does.

    The previous extra check for question-word tasks ending in ``?`` was
    unreachable: every prefix it tested is already in
    ``_INFORMATIONAL_PREFIXES``, so the first check had returned by then.
    """
    return task_lower.startswith(_INFORMATIONAL_PREFIXES)
318
+
319
+
320
def _requires_action(task_lower: str) -> bool:
    """Return whether the task text contains an action verb or a simple-task phrase.

    Matching is by plain substring against ``_ACTION_VERBS`` first and then
    ``_SIMPLE_TASK_INDICATORS``.
    """
    for marker in (*_ACTION_VERBS, *_SIMPLE_TASK_INDICATORS):
        if marker in task_lower:
            return True
    return False
324
+
325
+
326
+def _required_evidence(
327
+    *,
328
+    task_lower: str,
329
+    informational: bool,
330
+    complex_task: bool,
331
+    requires_verification: bool,
332
+    requires_install: bool,
333
+) -> list[str]:
334
+    if informational:
335
+        return []
336
+
337
+    required: list[str] = []
338
+    if _requires_action(task_lower):
339
+        required.append("showing the requested work was actually carried out")
340
+    if requires_install:
341
+        required.append("showing dependencies or setup steps were completed")
342
+    if requires_verification:
343
+        required.append("showing the result was run or verified")
344
+    if complex_task:
345
+        required.append("showing the broader end-to-end implementation or setup was completed")
346
+    return required
347
+
348
+
349
def _has_install_evidence(
    task_lower: str,
    action_types: set[str],
    actions_taken: list[str],
) -> bool:
    """Return whether the recorded actions show an install or setup step.

    Args:
        task_lower: Lower-cased task text, used to pick the ecosystem hints.
        action_types: Unused; kept so the signature stays unchanged.
        actions_taken: Recorded actions for the turn, one string per action.

    Returns:
        True when the actions mention ``npm`` for a Node task, ``pip`` or
        ``uv`` as a standalone command for a Python task, or any of
        ``install`` / ``init`` / ``setup`` for any task.
    """
    del action_types
    action_text = " ".join(actions_taken).lower()

    if any(hint in task_lower for hint in _NODE_HINTS) and "npm" in action_text:
        return True

    # Match `pip` / `uv` only as standalone command tokens, so names such as
    # "pipeline" or "uvicorn" do not count as install evidence. `pip3` and
    # `pip_install` still match because only letters are excluded after `pip`.
    python_tool = r"(?<![a-z0-9])(?:pip(?![a-z])|uv(?![a-z0-9]))"
    if any(hint in task_lower for hint in _PYTHON_HINTS) and (
        re.search(python_tool, action_text) is not None
    ):
        return True

    # Touching a package's `__init__.py` is not an initialization step, so the
    # dunder name is removed before looking for the generic markers.
    setup_text = action_text.replace("__init__", "")
    return any(marker in setup_text for marker in ("install", "init", "setup"))
363
+
364
+
365
+def _has_verification_evidence(
366
+    action_types: set[str],
367
+    actions_taken: list[str],
368
+) -> bool:
369
+    if "bash" in action_types:
370
+        return True
371
+    action_text = " ".join(actions_taken).lower()
372
+    return any(
373
+        token in action_text
374
+        for token in ("test", "pytest", "jest", "verify", "run", "execute")
375
+    )
376
+
377
+
378
def _install_follow_up(task_lower: str) -> str:
    """Pick the install suggestion matching the ecosystem named in the task.

    Node hints are checked before Python hints; a generic suggestion is
    returned when neither appears in the task text.
    """
    suggestions = (
        (_NODE_HINTS, "Run `npm install` to install dependencies"),
        (_PYTHON_HINTS, "Install the Python dependencies"),
    )
    for hints, suggestion in suggestions:
        if any(hint in task_lower for hint in hints):
            return suggestion
    return "Install or initialize the required dependencies now"
384
+
385
+
386
+def _append_follow_through_gap(
387
+    missing_evidence: list[str],
388
+    remaining: list[str],
389
+    suggested_next_steps: list[str],
390
+    *,
391
+    evidence: str,
392
+    remaining_item: str,
393
+    next_step: str,
394
+) -> None:
395
+    if evidence not in missing_evidence:
396
+        missing_evidence.append(evidence)
397
+    if remaining_item not in remaining:
398
+        remaining.append(remaining_item)
399
+    if next_step not in suggested_next_steps:
400
+        suggested_next_steps.append(next_step)
401
+
402
+
403
+def _format_continuation_prompt(
404
+    *,
405
+    task: str,
406
+    missing_evidence: list[str],
407
+    suggested_next_steps: list[str],
408
+    action_count: int,
409
+) -> str:
410
+    if suggested_next_steps:
411
+        evidence_lines = "\n".join(f"- {item}" for item in missing_evidence[:2])
412
+        step_lines = "\n".join(f"- {step}" for step in suggested_next_steps[:3])
413
+        return (
414
+            f'The task was: "{task}"\n\n'
415
+            "The response still needs concrete evidence for:\n"
416
+            f"{evidence_lines}\n\n"
417
+            "Continue with:\n"
418
+            f"{step_lines}\n\n"
419
+            "If the task is actually complete, confirm the missing evidence explicitly."
420
+        )
421
+
422
+    return (
423
+        f'Task: "{task}"\n'
424
+        f"You took {action_count} action(s). "
425
+        "If there's more to do, continue. Otherwise, confirm completion."
426
+    )
427
+
428
+
429
+def _summarize_action(action: str) -> str:
430
+    head, _, _ = action.partition(":")
431
+    return head.strip() or action.strip()
tests/test_completion_policy.py (modified)
@@ -17,6 +17,7 @@ from loader.runtime.permissions import (
1717
     load_permission_rules,
1818
 )
1919
 from loader.runtime.task_completion import (
20
+    assess_completion_follow_through,
2021
     detect_premature_completion,
2122
     get_continuation_prompt,
2223
 )
@@ -128,7 +129,36 @@ def test_get_continuation_prompt_surfaces_missing_verification_steps() -> None:
128129
         "The script has been created.",
129130
     )
130131
 
131
-    assert "Run the tests" in prompt or "verify it works" in prompt
132
+    assert "Continue with" in prompt
133
+    assert "run the relevant tests" in prompt.lower() or "verify" in prompt.lower()
134
+
135
+
136
def test_assess_completion_follow_through_tracks_missing_evidence() -> None:
    """A create-and-test task with only a write action still lacks verification."""
    carried_out = "showing the requested work was actually carried out"
    verified = "showing the result was run or verified"

    assessment = assess_completion_follow_through(
        task="Create the script and test that it works.",
        response="The script has been created.",
        actions_taken=["write: script.py"],
    )

    assert assessment.is_complete is False
    assert carried_out in assessment.required_evidence
    assert verified in assessment.required_evidence
    assert assessment.missing_evidence == [verified]
    assert assessment.suggested_next_steps == [
        "Execute what you created or run the relevant tests now"
    ]
150
+
151
+
152
def test_assess_completion_follow_through_accepts_informational_tasks() -> None:
    """An explanatory task is complete with a non-empty answer and no actions."""
    assessment = assess_completion_follow_through(
        task="Explain how Loader's workflow timeline works.",
        response="Loader records workflow decisions and policy events in a timeline.",
        actions_taken=[],
    )

    assert assessment.is_complete is True
    assert assessment.required_evidence == []
    assert assessment.missing_evidence == []
132162
 
133163
 
134164
 @pytest.mark.asyncio
@@ -194,6 +224,11 @@ async def test_completion_policy_requests_continuation_using_runtime_context(
194224
     assert decision.decision_summary == (
195225
         "requested one continuation because the non-mutating response looked incomplete"
196226
     )
227
+    assert decision.completion_check is not None
228
+    assert decision.completion_check.missing_evidence == [
229
+        "showing the requested work was actually carried out",
230
+        "showing the result was run or verified",
231
+    ]
197232
     assert context.session.messages[-2] == Message(
198233
         role=Role.ASSISTANT,
199234
         content="I can handle that.",
@@ -201,3 +236,8 @@ async def test_completion_policy_requests_continuation_using_runtime_context(
201236
     assert context.session.messages[-1].role == Role.USER
202237
     assert "verify it works" in context.session.messages[-1].content.lower()
203238
     assert events[0].type == "completion_check"
239
+    assert events[0].completion_check is not None
240
+    assert events[0].completion_check.missing_evidence == [
241
+        "showing the requested work was actually carried out",
242
+        "showing the result was run or verified",
243
+    ]
tests/test_reasoning_compat.py (modified)
@@ -56,6 +56,7 @@ def test_parse_completion_check_builds_continuation_prompt() -> None:
5656
 
5757
     assert completion.is_complete is False
5858
     assert completion.remaining == ["Run the tests"]
59
+    assert completion.missing_evidence == ["Run the tests"]
5960
     assert "Run pytest -q" in completion.continuation_prompt
6061
 
6162
 
tests/test_reasoning_types.py (modified)
@@ -60,3 +60,5 @@ def test_self_critique_and_completion_defaults_are_stable() -> None:
6060
     assert critique.can_revise() is True
6161
     assert completion.is_complete is False
6262
     assert completion.accomplished == []
63
+    assert completion.required_evidence == []
64
+    assert completion.missing_evidence == []
tests/test_turn_completion.py (modified)
@@ -82,7 +82,8 @@ async def test_turn_completion_requests_continuation_for_premature_text_response
8282
     assert prepared.summary.workflow_timeline[-1].policy_stage == "continuation_check"
8383
     assert prepared.summary.workflow_timeline[-1].policy_outcome == "continue"
8484
     assert agent.session.messages[-1].role.value == "user"
85
-    assert "If there's more to do, continue" in agent.session.messages[-1].content
85
+    assert "concrete evidence" in agent.session.messages[-1].content
86
+    assert "Carry out the requested change or command now" in agent.session.messages[-1].content
8687
     assert any(event.type == "completion_check" for event in events)
8788
 
8889