tenseleyflow/loader / 3a703a1

Browse files

Harden qwen recovery loops and verification

Authored by espadonne
SHA
3a703a1e283382d4405282386f066a805123c57f
Parents
f0d4490
Tree
132f2bd

17 changed files

Status | File | Additions (+) | Deletions (-)
M src/loader/runtime/finalization.py 8 0
M src/loader/runtime/recovery.py 8 6
M src/loader/runtime/safeguard_services.py 30 7
M src/loader/runtime/tool_batch_recovery.py 107 2
M src/loader/runtime/tool_batches.py 43 0
M src/loader/runtime/workflow.py 222 2
M src/loader/tools/file_tools.py 27 10
M src/loader/tools/fs_safety.py 62 0
M src/loader/utils/file_mutations.py 19 1
M tests/test_completion_policy.py 74 1
M tests/test_expanded_tools.py 37 0
M tests/test_finalization.py 69 1
M tests/test_runtime_harness.py 100 0
M tests/test_safeguard_services.py 15 1
M tests/test_tool_batch_policies.py 62 0
M tests/test_tool_batches.py 102 0
M tests/test_workflow.py 112 0
src/loader/runtime/finalization.py — modified
@@ -231,6 +231,14 @@ class TurnFinalizer:
231
                 project_root=self.context.project_root,
231
                 project_root=self.context.project_root,
232
                 task_statement=dod.task_statement,
232
                 task_statement=dod.task_statement,
233
             )
233
             )
234
+        else:
235
+            for command in derive_verification_commands(
236
+                dod,
237
+                project_root=self.context.project_root,
238
+                task_statement=dod.task_statement,
239
+            ):
240
+                if command not in dod.verification_commands:
241
+                    dod.verification_commands.append(command)
234
 
242
 
235
         await self.set_workflow_mode(
243
         await self.set_workflow_mode(
236
             ModeDecision.transition(
244
             ModeDecision.transition(
src/loader/runtime/recovery.py — modified
@@ -690,15 +690,17 @@ The command failed. You MUST analyze the error and take a DIFFERENT action.
690
 {hints}
690
 {hints}
691
 
691
 
692
 ## CRITICAL RULES:
692
 ## CRITICAL RULES:
693
-1. **INVESTIGATE FIRST** - Read config files, list directories, check what exists
693
+1. Start from the error and the state you already know
694
-2. **DO NOT** just retry the same command with slight variations
694
+2. Investigate only if a specific fact is still missing
695
-3. **DO NOT** try `npm start` then `npm run start` - these are the same thing!
695
+3. If you already have enough confirmed evidence, apply the fix instead of rereading the same files
696
-4. **READ THE ERROR** - It usually tells you exactly what's wrong
696
+4. **DO NOT** just retry the same command with slight variations
697
-5. If the error says "missing script: start", read package.json to see what scripts exist
697
+5. **DO NOT** try `npm start` then `npm run start` - these are the same thing!
698
+6. **READ THE ERROR** - It usually tells you exactly what's wrong
699
+7. If the error says "missing script: start", read package.json to see what scripts exist
698
 
700
 
699
 ## Current attempt: {attempt_count}/{max_retries}
701
 ## Current attempt: {attempt_count}/{max_retries}
700
 
702
 
701
-**Your next action should gather information OR try a fundamentally different approach.**
703
+**Your next action should either gather the missing information OR apply the fix using confirmed findings.**
702
 What will you do?"""
704
 What will you do?"""
703
 
705
 
704
 
706
 
src/loader/runtime/safeguard_services.py — modified
@@ -76,6 +76,11 @@ class ActionTracker:
76
         sig = str(hash(str(hunks)))
76
         sig = str(hash(str(hunks)))
77
         return sig in self._files_edited.get(norm_path, [])
77
         return sig in self._files_edited.get(norm_path, [])
78
 
78
 
79
+    def would_duplicate_raw_patch(self, file_path: str, patch_text: str) -> bool:
80
+        norm_path = self._normalize_path(file_path)
81
+        sig = str(hash(patch_text))
82
+        return sig in self._files_edited.get(norm_path, [])
83
+
79
     def would_duplicate_command(self, command: str) -> bool:
84
     def would_duplicate_command(self, command: str) -> bool:
80
         norm_cmd = self._normalize_command(command)
85
         norm_cmd = self._normalize_command(command)
81
         return norm_cmd in self._commands_run
86
         return norm_cmd in self._commands_run
@@ -123,8 +128,12 @@ class ActionTracker:
123
         elif tool_name == "patch":
128
         elif tool_name == "patch":
124
             file_path = arguments.get("file_path", "")
129
             file_path = arguments.get("file_path", "")
125
             hunks = arguments.get("hunks", [])
130
             hunks = arguments.get("hunks", [])
126
-            if isinstance(hunks, list) and self.would_duplicate_patch(file_path, hunks):
131
+            raw_patch = arguments.get("patch") or arguments.get("diff") or arguments.get("patch_text")
132
+            if isinstance(hunks, list) and hunks and self.would_duplicate_patch(file_path, hunks):
127
                 return True, f"Same patch already applied to: {file_path}"
133
                 return True, f"Same patch already applied to: {file_path}"
134
+            if isinstance(raw_patch, str) and raw_patch.strip():
135
+                if self.would_duplicate_raw_patch(file_path, raw_patch):
136
+                    return True, f"Same patch already applied to: {file_path}"
128
 
137
 
129
         elif tool_name == "read":
138
         elif tool_name == "read":
130
             read_key = self._make_read_key(arguments)
139
             read_key = self._make_read_key(arguments)
@@ -135,7 +144,8 @@ class ActionTracker:
135
                     (
144
                     (
136
                         "Already read "
145
                         "Already read "
137
                         f"{str(arguments.get('file_path', '')).strip()} "
146
                         f"{str(arguments.get('file_path', '')).strip()} "
138
-                        "recently without any intervening changes"
147
+                        "recently without any intervening changes; "
148
+                        "reuse the earlier read result instead of rereading"
139
                     ),
149
                     ),
140
                     repeat_threshold=self.READ_REPEAT_THRESHOLD,
150
                     repeat_threshold=self.READ_REPEAT_THRESHOLD,
141
                 )
151
                 )
@@ -148,7 +158,10 @@ class ActionTracker:
148
                 duplicate, reason = self._check_recent_observation(
158
                 duplicate, reason = self._check_recent_observation(
149
                     self._recent_searches,
159
                     self._recent_searches,
150
                     observation_key,
160
                     observation_key,
151
-                    "Already ran the same search recently without any intervening changes",
161
+                    (
162
+                        "Already ran the same search recently without any intervening "
163
+                        "changes; reuse the earlier search result instead of rerunning it"
164
+                    ),
152
                     repeat_threshold=self.SEARCH_REPEAT_THRESHOLD,
165
                     repeat_threshold=self.SEARCH_REPEAT_THRESHOLD,
153
                 )
166
                 )
154
                 if duplicate:
167
                 if duplicate:
@@ -160,7 +173,10 @@ class ActionTracker:
160
                 duplicate, reason = self._check_recent_observation(
173
                 duplicate, reason = self._check_recent_observation(
161
                     self._recent_bash_observations,
174
                     self._recent_bash_observations,
162
                     self._normalize_command(command),
175
                     self._normalize_command(command),
163
-                    "Already ran the same read-only shell probe recently without any intervening changes",
176
+                    (
177
+                        "Already ran the same read-only shell probe recently without any "
178
+                        "intervening changes; reuse the earlier shell output instead of rerunning it"
179
+                    ),
164
                     repeat_threshold=self.BASH_OBSERVATION_REPEAT_THRESHOLD,
180
                     repeat_threshold=self.BASH_OBSERVATION_REPEAT_THRESHOLD,
165
                 )
181
                 )
166
                 if duplicate:
182
                 if duplicate:
@@ -196,7 +212,11 @@ class ActionTracker:
196
             file_path = arguments.get("file_path", "")
212
             file_path = arguments.get("file_path", "")
197
             hunks = arguments.get("hunks", [])
213
             hunks = arguments.get("hunks", [])
198
             if file_path:
214
             if file_path:
199
-                self.record_edit(file_path, str(hunks), "structured_patch")
215
+                raw_patch = arguments.get("patch") or arguments.get("diff") or arguments.get("patch_text")
216
+                if isinstance(hunks, list) and hunks:
217
+                    self.record_edit(file_path, str(hunks), "structured_patch")
218
+                elif isinstance(raw_patch, str) and raw_patch.strip():
219
+                    self.record_edit(file_path, raw_patch, "raw_patch")
200
                 self._note_mutation()
220
                 self._note_mutation()
201
 
221
 
202
         elif tool_name == "read":
222
         elif tool_name == "read":
@@ -592,6 +612,7 @@ class PreActionValidator:
592
     def _validate_patch(self, arguments: dict) -> ValidationResult:
612
     def _validate_patch(self, arguments: dict) -> ValidationResult:
593
         file_path = arguments.get("file_path", "")
613
         file_path = arguments.get("file_path", "")
594
         hunks = arguments.get("hunks", [])
614
         hunks = arguments.get("hunks", [])
615
+        raw_patch = arguments.get("patch") or arguments.get("diff") or arguments.get("patch_text")
595
 
616
 
596
         if not file_path or not str(file_path).strip():
617
         if not file_path or not str(file_path).strip():
597
             return ValidationResult(
618
             return ValidationResult(
@@ -605,11 +626,13 @@ class PreActionValidator:
605
         if not path_result.valid:
626
         if not path_result.valid:
606
             return path_result
627
             return path_result
607
 
628
 
608
-        if not isinstance(hunks, list) or not hunks:
629
+        has_hunks = isinstance(hunks, list) and bool(hunks)
630
+        has_raw_patch = isinstance(raw_patch, str) and bool(raw_patch.strip())
631
+        if not has_hunks and not has_raw_patch:
609
             return ValidationResult(
632
             return ValidationResult(
610
                 valid=False,
633
                 valid=False,
611
                 reason="Patch hunks are missing",
634
                 reason="Patch hunks are missing",
612
-                suggestion="Provide one or more structured patch hunks",
635
+                suggestion="Provide structured patch hunks or a unified diff patch string",
613
                 severity="error",
636
                 severity="error",
614
             )
637
             )
615
 
638
 
src/loader/runtime/tool_batch_recovery.py — modified
@@ -3,6 +3,9 @@
3
 from __future__ import annotations
3
 from __future__ import annotations
4
 
4
 
5
 from collections.abc import Awaitable, Callable
5
 from collections.abc import Awaitable, Callable
6
+from difflib import SequenceMatcher
7
+from pathlib import Path
8
+import re
6
 
9
 
7
 from ..llm.base import Message, ToolCall
10
 from ..llm.base import Message, ToolCall
8
 from .compaction import infer_preferred_next_step, summarize_confirmed_facts
11
 from .compaction import infer_preferred_next_step, summarize_confirmed_facts
@@ -82,7 +85,11 @@ class ToolBatchRecoveryController:
82
                 tool_call.arguments,
85
                 tool_call.arguments,
83
                 outcome.result_output,
86
                 outcome.result_output,
84
             )
87
             )
85
-            recovery_prompt = self._augment_recovery_prompt(recovery_prompt)
88
+            recovery_prompt = self._augment_recovery_prompt(
89
+                recovery_prompt,
90
+                tool_call=tool_call,
91
+                outcome=outcome,
92
+            )
86
             return Message.tool_result_message(
93
             return Message.tool_result_message(
87
                 tool_call_id=tool_call.id,
94
                 tool_call_id=tool_call.id,
88
                 display_content=recovery_prompt,
95
                 display_content=recovery_prompt,
@@ -106,7 +113,13 @@ class ToolBatchRecoveryController:
106
             is_error=True,
113
             is_error=True,
107
         )
114
         )
108
 
115
 
109
-    def _augment_recovery_prompt(self, prompt: str) -> str:
116
+    def _augment_recovery_prompt(
117
+        self,
118
+        prompt: str,
119
+        *,
120
+        tool_call: ToolCall,
121
+        outcome: ToolExecutionOutcome,
122
+    ) -> str:
110
         """Append transcript-aware recovery guidance when recent facts exist."""
123
         """Append transcript-aware recovery guidance when recent facts exist."""
111
 
124
 
112
         session = self.context.session
125
         session = self.context.session
@@ -116,6 +129,7 @@ class ToolBatchRecoveryController:
116
             session.messages,
129
             session.messages,
117
             current_task=current_task,
130
             current_task=current_task,
118
         )
131
         )
132
+        actionable_known_state = bool(confirmed_facts and preferred_next_step)
119
         if not confirmed_facts and not preferred_next_step and not current_task:
133
         if not confirmed_facts and not preferred_next_step and not current_task:
120
             return prompt
134
             return prompt
121
 
135
 
@@ -130,4 +144,95 @@ class ToolBatchRecoveryController:
130
             "- Preserve progress: do not restart by rereading already-confirmed files "
144
             "- Preserve progress: do not restart by rereading already-confirmed files "
131
             "unless you need genuinely new evidence."
145
             "unless you need genuinely new evidence."
132
         )
146
         )
147
+        if actionable_known_state:
148
+            lines.extend(
149
+                [
150
+                    "",
151
+                    "## ACTION BIAS FOR THIS RECOVERY",
152
+                    "- The confirmed findings above are already enough to keep moving.",
153
+                    "- Prefer edit/write/patch on the target file over rereading the same files.",
154
+                    "- Only inspect one more file if a specific filename, href, or title is still unknown.",
155
+                    "- Treat the preferred next step as the default path forward.",
156
+                ]
157
+            )
158
+        candidate_lines = self._file_not_found_candidate_lines(tool_call, outcome)
159
+        if candidate_lines:
160
+            lines.extend(["", "## LIKELY FILE CANDIDATES", *candidate_lines])
133
         return "\n".join(lines)
161
         return "\n".join(lines)
162
+
163
+    def _file_not_found_candidate_lines(
164
+        self,
165
+        tool_call: ToolCall,
166
+        outcome: ToolExecutionOutcome,
167
+    ) -> list[str]:
168
+        if tool_call.name not in {"read", "write", "edit", "patch"}:
169
+            return []
170
+        if "not found" not in outcome.result_output.lower():
171
+            return []
172
+
173
+        missing_path = self._canonicalize_path(
174
+            str(
175
+                tool_call.arguments.get("file_path")
176
+                or tool_call.arguments.get("path")
177
+                or ""
178
+            ).strip()
179
+        )
180
+        if not missing_path:
181
+            return []
182
+
183
+        candidates = self._rank_known_file_candidates(missing_path)
184
+        if not candidates:
185
+            return []
186
+
187
+        names = ", ".join(f"`{Path(candidate).name}`" for candidate in candidates[:3])
188
+        return [
189
+            f"- Requested file does not exist: `{missing_path}`",
190
+            f"- Closest known files in the same directory: {names}",
191
+            "- Prefer one of those exact filenames instead of retrying the missing path.",
192
+        ]
193
+
194
+    def _rank_known_file_candidates(self, missing_path: str) -> list[str]:
195
+        missing_parent = str(Path(missing_path).parent)
196
+        missing_name = Path(missing_path).name
197
+        missing_prefix = missing_name.split("-", 1)[0]
198
+
199
+        ranked: list[tuple[float, str]] = []
200
+        seen: set[str] = set()
201
+        for candidate in self._known_file_paths():
202
+            if candidate == missing_path:
203
+                continue
204
+            if str(Path(candidate).parent) != missing_parent:
205
+                continue
206
+            name = Path(candidate).name
207
+            if name in seen:
208
+                continue
209
+            seen.add(name)
210
+
211
+            score = SequenceMatcher(None, missing_name, name).ratio()
212
+            if missing_prefix and name.startswith(f"{missing_prefix}-"):
213
+                score += 1.0
214
+            ranked.append((score, candidate))
215
+
216
+        ranked.sort(key=lambda item: (-item[0], item[1]))
217
+        return [candidate for _, candidate in ranked]
218
+
219
+    def _known_file_paths(self) -> list[str]:
220
+        pattern = re.compile(r"(?:~|/)[^\s`\"']+\.html")
221
+        discovered: list[str] = []
222
+        seen: set[str] = set()
223
+        for message in self.context.session.messages:
224
+            for raw_path in pattern.findall(message.content):
225
+                candidate = self._canonicalize_path(raw_path)
226
+                if not candidate or candidate in seen:
227
+                    continue
228
+                seen.add(candidate)
229
+                discovered.append(candidate)
230
+        return discovered
231
+
232
+    def _canonicalize_path(self, raw_path: str) -> str:
233
+        if not raw_path:
234
+            return ""
235
+        try:
236
+            return str(Path(raw_path).expanduser().resolve(strict=False))
237
+        except (OSError, RuntimeError, ValueError):
238
+            return str(Path(raw_path).expanduser())
src/loader/runtime/tool_batches.py — modified
@@ -31,6 +31,8 @@ from .verification_observations import (
31
     VerificationObservationStatus,
31
     VerificationObservationStatus,
32
 )
32
 )
33
 from .workflow import sync_todos_to_definition_of_done
33
 from .workflow import sync_todos_to_definition_of_done
34
+from .workflow import advance_todos_from_tool_call
35
+from .compaction import infer_preferred_next_step
34
 
36
 
35
 EventSink = Callable[[AgentEvent], Awaitable[None]]
37
 EventSink = Callable[[AgentEvent], Awaitable[None]]
36
 ConfirmationHandler = (
38
 ConfirmationHandler = (
@@ -215,6 +217,8 @@ class ToolBatchRunner:
215
             # otherwise the model operates blind and loops.
217
             # otherwise the model operates blind and loops.
216
             self.context.session.append(outcome.message)
218
             self.context.session.append(outcome.message)
217
             summary.tool_result_messages.append(outcome.message)
219
             summary.tool_result_messages.append(outcome.message)
220
+            if outcome.state == ToolExecutionState.DUPLICATE:
221
+                self._queue_duplicate_observation_nudge(tool_call)
218
 
222
 
219
             should_continue = await self.verification_gate.should_continue(
223
             should_continue = await self.verification_gate.should_continue(
220
                 tool_call=tool_call,
224
                 tool_call=tool_call,
@@ -247,6 +251,43 @@ class ToolBatchRunner:
247
 
251
 
248
         return result
252
         return result
249
 
253
 
254
+    def _queue_duplicate_observation_nudge(self, tool_call: ToolCall) -> None:
255
+        """Queue a concrete next-step nudge after duplicate observational actions."""
256
+
257
+        if tool_call.name not in {"read", "glob", "grep", "bash"}:
258
+            return
259
+
260
+        current_task = getattr(self.context.session, "current_task", None)
261
+        preferred_next_step = infer_preferred_next_step(
262
+            self.context.session.messages,
263
+            current_task=current_task,
264
+        )
265
+        if preferred_next_step:
266
+            self.context.queue_steering_message(
267
+                "Reuse the earlier observation instead of repeating it. "
268
+                f"{preferred_next_step} "
269
+                "Only gather more evidence if a specific filename, href, or title is still unknown."
270
+            )
271
+            return
272
+
273
+        target_path = str(
274
+            tool_call.arguments.get("file_path")
275
+            or tool_call.arguments.get("path")
276
+            or ""
277
+        ).strip()
278
+        if target_path:
279
+            self.context.queue_steering_message(
280
+                "Reuse the earlier observation instead of repeating it. "
281
+                f"Use the current contents of `{target_path}` and take a different next step. "
282
+                "Only gather more evidence if a specific filename, href, or title is still unknown."
283
+            )
284
+            return
285
+
286
+        self.context.queue_steering_message(
287
+            "Reuse the earlier observation instead of repeating it. "
288
+            "Choose a different next step that makes progress."
289
+        )
290
+
250
     async def _record_successful_execution(
291
     async def _record_successful_execution(
251
         self,
292
         self,
252
         *,
293
         *,
@@ -279,6 +320,8 @@ class ToolBatchRunner:
279
             new_todos = outcome.registry_result.metadata.get("new_todos", [])
320
             new_todos = outcome.registry_result.metadata.get("new_todos", [])
280
             if isinstance(new_todos, list):
321
             if isinstance(new_todos, list):
281
                 sync_todos_to_definition_of_done(dod, new_todos)
322
                 sync_todos_to_definition_of_done(dod, new_todos)
323
+        else:
324
+            advance_todos_from_tool_call(dod, tool_call)
282
         self.dod_store.save(dod)
325
         self.dod_store.save(dod)
283
         recovery_context = self.context.recovery_context
326
         recovery_context = self.context.recovery_context
284
         if recovery_context is not None:
327
         if recovery_context is not None:
src/loader/runtime/workflow.py — modified
@@ -8,6 +8,7 @@ from datetime import UTC, datetime
8
 from pathlib import Path
8
 from pathlib import Path
9
 from typing import ClassVar
9
 from typing import ClassVar
10
 
10
 
11
+from ..llm.base import ToolCall
11
 from .clarify_grounding import ClarifyGrounding
12
 from .clarify_grounding import ClarifyGrounding
12
 from .dod import slugify
13
 from .dod import slugify
13
 from .workflow_policy import (
14
 from .workflow_policy import (
@@ -43,6 +44,7 @@ __all__ = [
43
     "WorkflowSignalPacket",
44
     "WorkflowSignalPacket",
44
     "WorkflowTimelineEntry",
45
     "WorkflowTimelineEntry",
45
     "WorkflowTimelineEntryKind",
46
     "WorkflowTimelineEntryKind",
47
+    "advance_todos_from_tool_call",
46
     "build_execute_bridge",
48
     "build_execute_bridge",
47
     "enrich_clarify_brief_with_grounding",
49
     "enrich_clarify_brief_with_grounding",
48
     "extract_verification_commands_from_markdown",
50
     "extract_verification_commands_from_markdown",
@@ -67,6 +69,64 @@ _GENERIC_CONSTRAINTS = {
67
 _GENERIC_ASSUMPTIONS = {
69
 _GENERIC_ASSUMPTIONS = {
68
     "Unspecified details stay unchanged unless evidence says otherwise.",
70
     "Unspecified details stay unchanged unless evidence says otherwise.",
69
 }
71
 }
72
+_SPECIAL_TODO_ITEMS = {
73
+    "Complete the requested work",
74
+    "Collect verification evidence",
75
+}
76
+_READ_STEP_HINTS = (
77
+    "read",
78
+    "examine",
79
+    "inspect",
80
+    "review",
81
+    "check",
82
+    "look at",
83
+    "look through",
84
+    "open",
85
+    "understand",
86
+    "study",
87
+)
88
+_SEARCH_STEP_HINTS = (
89
+    "list",
90
+    "find",
91
+    "search",
92
+    "scan",
93
+    "discover",
94
+    "locate",
95
+    "enumerate",
96
+    "gather",
97
+)
98
+_PARSE_STEP_HINTS = (
99
+    "parse",
100
+    "extract",
101
+    "identify",
102
+    "map",
103
+    "determine",
104
+)
105
+_MUTATION_STEP_HINTS = (
106
+    "update",
107
+    "edit",
108
+    "write",
109
+    "fix",
110
+    "modify",
111
+    "change",
112
+    "patch",
113
+    "replace",
114
+    "correct",
115
+    "rewrite",
116
+)
117
+_VERIFY_STEP_HINTS = (
118
+    "verify",
119
+    "validation",
120
+    "validate",
121
+    "test",
122
+    "confirm",
123
+    "check",
124
+)
125
+_SHELL_COMMAND_START = re.compile(
126
+    r"(?<![\w/.-])("
127
+    r"ls|grep|pytest|uv|python3?|html5validator|cargo|npm|node|mypy|ruff|find|git|cat|sed|head|tail"
128
+    r")\b"
129
+)
70
 
130
 
71
 _SECTION_ALIASES = {
131
 _SECTION_ALIASES = {
72
     "task statement": "task_statement",
132
     "task statement": "task_statement",
@@ -486,6 +546,116 @@ def sync_todos_to_definition_of_done(
486
     dod.completed_items = list(dict.fromkeys(completed + special_completed))
546
     dod.completed_items = list(dict.fromkeys(completed + special_completed))
487
 
547
 
488
 
548
 
549
+def advance_todos_from_tool_call(dod, tool_call: ToolCall) -> bool:
550
+    """Advance the best-matching pending todo from a successful tool call."""
551
+
552
+    best_index: int | None = None
553
+    best_score = 0
554
+
555
+    for index, item in enumerate(dod.pending_items):
556
+        label = item.strip()
557
+        if not label or label in _SPECIAL_TODO_ITEMS:
558
+            continue
559
+        score = _todo_progress_score(label, tool_call)
560
+        if score > best_score:
561
+            best_index = index
562
+            best_score = score
563
+
564
+    if best_index is None or best_score <= 0:
565
+        return False
566
+
567
+    completed = dod.pending_items.pop(best_index)
568
+    if completed not in dod.completed_items:
569
+        dod.completed_items.append(completed)
570
+    return True
571
+
572
+
573
+def _todo_progress_score(item: str, tool_call: ToolCall) -> int:
574
+    text = item.lower()
575
+    name = tool_call.name
576
+    file_path = str(tool_call.arguments.get("file_path", "")).strip().lower()
577
+    path = str(tool_call.arguments.get("path", "")).strip().lower()
578
+    pattern = str(tool_call.arguments.get("pattern", "")).strip().lower()
579
+    command = str(tool_call.arguments.get("command", "")).strip().lower()
580
+    combined = " ".join(part for part in (file_path, path, pattern, command) if part)
581
+
582
+    path_hint = file_path or path
583
+    basename = Path(path_hint).name.lower() if path_hint else ""
584
+    parent = Path(path_hint).parent.name.lower() if path_hint else ""
585
+
586
+    score = 0
587
+    if basename and basename in text:
588
+        score += 3
589
+    if parent and parent not in {"", "."} and parent in text:
590
+        score += 2
591
+    if "index" in text and "index" in combined:
592
+        score += 2
593
+    if "chapter" in text and ("chapter" in basename or "chapters" in combined):
594
+        score += 1
595
+    if "html" in text and ".html" in combined:
596
+        score += 1
597
+
598
+    if name == "read":
599
+        if _contains_any(text, _READ_STEP_HINTS):
600
+            score += 2
601
+        if _contains_any(text, _PARSE_STEP_HINTS) and ".html" in combined:
602
+            score += 1
603
+    elif name in {"glob", "grep"}:
604
+        if _contains_any(text, _SEARCH_STEP_HINTS):
605
+            score += 2
606
+        if name == "glob" and _contains_any(text, _READ_STEP_HINTS) and ".html" in combined:
607
+            score += 1
608
+    elif name == "bash":
609
+        if _looks_like_verification_command(command):
610
+            if _contains_any(text, _VERIFY_STEP_HINTS):
611
+                score += 3
612
+        elif _looks_like_search_command(command):
613
+            if _contains_any(text, _SEARCH_STEP_HINTS):
614
+                score += 2
615
+        elif _looks_like_read_command(command):
616
+            if _contains_any(text, _READ_STEP_HINTS):
617
+                score += 2
618
+    elif name in {"write", "edit", "patch"}:
619
+        if _contains_any(text, _MUTATION_STEP_HINTS):
620
+            score += 3
621
+
622
+    if name in {"write", "edit", "patch"} and _contains_any(text, _VERIFY_STEP_HINTS):
623
+        return 0
624
+    return score
625
+
626
+
627
+def _contains_any(text: str, candidates: tuple[str, ...]) -> bool:
628
+    return any(candidate in text for candidate in candidates)
629
+
630
+
631
+def _looks_like_search_command(command: str) -> bool:
632
+    return any(token in command for token in (" ls", "ls ", "find ", "rg ", "grep ", "glob "))
633
+
634
+
635
+def _looks_like_read_command(command: str) -> bool:
636
+    return any(token in command for token in ("cat ", "sed ", "head ", "tail "))
637
+
638
+
639
+def _looks_like_verification_command(command: str) -> bool:
640
+    return any(
641
+        token in command
642
+        for token in (
643
+            "pytest",
644
+            "unittest",
645
+            " test",
646
+            " check",
647
+            " verify",
648
+            "html5validator",
649
+            "mypy",
650
+            "ruff",
651
+            "lint",
652
+            "grep ",
653
+            "diff ",
654
+            "cmp ",
655
+        )
656
+    )
657
+
658
+
489
 def extract_verification_commands_from_markdown(markdown: str) -> list[str]:
659
 def extract_verification_commands_from_markdown(markdown: str) -> list[str]:
490
     """Extract verification commands from a verification-plan markdown document."""
660
     """Extract verification commands from a verification-plan markdown document."""
491
 
661
 
@@ -686,11 +856,61 @@ def _mark_explicit_section(brief: ClarifyBrief, section: str) -> None:
686
 def _extract_commands(items: list[str]) -> list[str]:
856
 def _extract_commands(items: list[str]) -> list[str]:
687
     commands: list[str] = []
857
     commands: list[str] = []
688
     for item in items:
858
     for item in items:
689
-        match = re.match(r"^`(.+)`$", item)
859
+        text = item.strip()
690
-        commands.append((match.group(1) if match else item).strip())
860
+        if not text:
861
+            continue
862
+
863
+        # Code fences often contain shell comments plus the actual command lines.
864
+        if "```" in text:
865
+            text = text.replace("```bash", "```").replace("```sh", "```")
866
+            if "\n" not in text:
867
+                commands.extend(_extract_collapsed_shell_commands(text))
868
+                continue
869
+
870
+        lines = text.splitlines() if "\n" in text or "```" in text else [text]
871
+        for line in lines:
872
+            candidate = line.strip()
873
+            if not candidate or candidate.startswith("```"):
874
+                continue
875
+            candidate = re.sub(r"^-\s+", "", candidate)
876
+            match = re.match(r"^`(.+)`$", candidate)
877
+            candidate = (match.group(1) if match else candidate).strip()
878
+            if candidate.startswith("#"):
879
+                candidate = _extract_shell_command_from_text(candidate)
880
+                if not candidate:
881
+                    continue
882
+            if candidate:
883
+                commands.append(candidate)
691
     return [command for command in commands if command]
884
     return [command for command in commands if command]
692
 
885
 
693
 
886
 
887
+def _extract_collapsed_shell_commands(text: str) -> list[str]:
888
+    stripped = re.sub(r"```(?:\w+)?", "", text).strip()
889
+    if not stripped:
890
+        return []
891
+
892
+    matches = list(_SHELL_COMMAND_START.finditer(stripped))
893
+    if not matches:
894
+        extracted = _extract_shell_command_from_text(stripped)
895
+        return [extracted] if extracted else []
896
+
897
+    commands: list[str] = []
898
+    for index, match in enumerate(matches):
899
+        start = match.start()
900
+        end = matches[index + 1].start() if index + 1 < len(matches) else len(stripped)
901
+        candidate = stripped[start:end].strip()
902
+        if candidate:
903
+            commands.append(candidate)
904
+    return commands
905
+
906
+
907
+def _extract_shell_command_from_text(text: str) -> str:
908
+    match = _SHELL_COMMAND_START.search(text)
909
+    if match is None:
910
+        return ""
911
+    return text[match.start():].strip()
912
+
913
+
694
 def _has_concrete_anchor(task: str) -> bool:
914
 def _has_concrete_anchor(task: str) -> bool:
695
     return any(
915
     return any(
696
         re.search(pattern, task)
916
         re.search(pattern, task)
src/loader/tools/file_tools.py — modified
@@ -13,6 +13,7 @@ from .fs_safety import (
13
     ensure_safe_to_read,
13
     ensure_safe_to_read,
14
     ensure_safe_to_write,
14
     ensure_safe_to_write,
15
     make_structured_patch,
15
     make_structured_patch,
16
+    parse_unified_diff_patch,
16
     resolve_workspace_path,
17
     resolve_workspace_path,
17
 )
18
 )
18
 
19
 
@@ -447,7 +448,8 @@ class PatchTool(Tool):
447
     def description(self) -> str:
448
     def description(self) -> str:
448
         return (
449
         return (
449
             "Apply structured patch hunks to a file. Prefer this for larger "
450
             "Apply structured patch hunks to a file. Prefer this for larger "
450
-            "or multi-line edits where exact old/new string replacement is brittle."
451
+            "or multi-line edits where exact old/new string replacement is brittle. "
452
+            "A raw unified diff string is also accepted via `patch`."
451
         )
453
         )
452
 
454
 
453
     @property
455
     @property
@@ -483,8 +485,15 @@ class PatchTool(Tool):
483
                         ],
485
                         ],
484
                     },
486
                     },
485
                 },
487
                 },
488
+                "patch": {
489
+                    "type": "string",
490
+                    "description": (
491
+                        "Optional unified diff patch string. Loader will parse this "
492
+                        "into structured hunks when possible."
493
+                    ),
494
+                },
486
             },
495
             },
487
-            "required": ["file_path", "hunks"],
496
+            "required": ["file_path"],
488
         }
497
         }
489
 
498
 
490
     @property
499
     @property
@@ -505,7 +514,8 @@ class PatchTool(Tool):
505
     async def execute(
514
     async def execute(
506
         self,
515
         self,
507
         file_path: str,
516
         file_path: str,
508
-        hunks: list[dict[str, Any]],
517
+        hunks: list[dict[str, Any]] | None = None,
518
+        patch: str | None = None,
509
         **kwargs: Any,
519
         **kwargs: Any,
510
     ) -> ToolResult:
520
     ) -> ToolResult:
511
         kwargs.pop("_skip_confirmation", None)
521
         kwargs.pop("_skip_confirmation", None)
@@ -544,13 +554,20 @@ class PatchTool(Tool):
544
             ensure_safe_to_read(path)
554
             ensure_safe_to_read(path)
545
             original_content = await asyncio.to_thread(path.read_text)
555
             original_content = await asyncio.to_thread(path.read_text)
546
             original_lines = original_content.splitlines()
556
             original_lines = original_content.splitlines()
547
-            parsed_hunks = [
557
+            raw_patch = patch or kwargs.get("diff") or kwargs.get("patch_text")
548
-                StructuredPatchHunk.from_dict_with_original(
558
+            parsed_hunks: list[StructuredPatchHunk]
549
-                    hunk,
559
+            if hunks:
550
-                    original_lines=original_lines,
560
+                parsed_hunks = [
551
-                )
561
+                    StructuredPatchHunk.from_dict_with_original(
552
-                for hunk in hunks
562
+                        hunk,
553
-            ]
563
+                        original_lines=original_lines,
564
+                    )
565
+                    for hunk in hunks
566
+                ]
567
+            elif isinstance(raw_patch, str) and raw_patch.strip():
568
+                parsed_hunks = parse_unified_diff_patch(raw_patch)
569
+            else:
570
+                parsed_hunks = []
554
             if not parsed_hunks:
571
             if not parsed_hunks:
555
                 raise ValueError("hunks must not be empty")
572
                 raise ValueError("hunks must not be empty")
556
             updated_content = apply_structured_patch(original_content, parsed_hunks)
573
             updated_content = apply_structured_patch(original_content, parsed_hunks)
src/loader/tools/fs_safety.pymodified
@@ -4,6 +4,7 @@ from __future__ import annotations
4
 
4
 
5
 from dataclasses import asdict, dataclass
5
 from dataclasses import asdict, dataclass
6
 from pathlib import Path
6
 from pathlib import Path
7
+import re
7
 
8
 
8
 MAX_READ_SIZE = 10 * 1024 * 1024
9
 MAX_READ_SIZE = 10 * 1024 * 1024
9
 MAX_WRITE_SIZE = 10 * 1024 * 1024
10
 MAX_WRITE_SIZE = 10 * 1024 * 1024
@@ -233,3 +234,64 @@ def _expect_patch_line(
233
             "structured patch context mismatch: "
234
             "structured patch context mismatch: "
234
             f"expected {expected!r}, found {actual!r}"
235
             f"expected {expected!r}, found {actual!r}"
235
         )
236
         )
237
+
238
+
239
# Unified-diff hunk header, e.g. "@@ -12,3 +12,4 @@". The line-count parts
# are optional and default to 1 per the unified-diff convention. `match`
# (not fullmatch) is used below, so trailing section text after the closing
# "@@" is tolerated.
_UNIFIED_DIFF_HUNK_RE = re.compile(
    r"^@@ -(?P<old_start>\d+)(?:,(?P<old_lines>\d+))? "
    r"\+(?P<new_start>\d+)(?:,(?P<new_lines>\d+))? @@"
)


def parse_unified_diff_patch(patch_text: str) -> list[StructuredPatchHunk]:
    """Parse a unified diff string into structured patch hunks.

    ``---``/``+++`` file headers and ``\\ No newline at end of file``
    markers are skipped. A missing line count in a hunk header defaults
    to 1. Entirely empty lines inside a hunk are recovered as blank
    context lines (" ") — many diff producers strip the trailing space
    from blank context lines, and the original strict check rejected
    such otherwise-valid patches.

    Raises:
        ValueError: if the text is empty, a hunk header is malformed,
            a body line appears before any hunk header, or a body line
            carries a prefix other than " ", "+", or "-".
    """
    text = str(patch_text)
    if not text.strip():
        raise ValueError("patch text is empty")

    hunks: list[StructuredPatchHunk] = []
    current_hunk: StructuredPatchHunk | None = None

    for raw_line in text.splitlines():
        if raw_line.startswith(("--- ", "+++ ")):
            continue
        if raw_line.startswith("@@"):
            match = _UNIFIED_DIFF_HUNK_RE.match(raw_line)
            if match is None:
                raise ValueError(
                    "patch text contains an invalid unified-diff hunk header"
                )
            # A new header closes out the hunk being accumulated.
            if current_hunk is not None:
                hunks.append(current_hunk)
            current_hunk = StructuredPatchHunk(
                old_start=int(match.group("old_start")),
                old_lines=int(match.group("old_lines") or 1),
                new_start=int(match.group("new_start")),
                new_lines=int(match.group("new_lines") or 1),
                lines=[],
            )
            continue

        if raw_line == r"\ No newline at end of file":
            continue

        if current_hunk is None:
            # Tolerate blank filler before the first hunk, but a real body
            # line with no preceding header is malformed input.
            if not raw_line.strip():
                continue
            raise ValueError(
                "patch text must include at least one unified-diff hunk header"
            )

        if raw_line == "":
            # Hardening: whitespace-stripped diffs drop the " " prefix on
            # blank context lines; recover them instead of failing.
            current_hunk.lines.append(" ")
            continue

        prefix = raw_line[:1]
        if prefix not in {" ", "+", "-"}:
            raise ValueError(
                "patch text contains a diff line without a valid prefix"
            )
        current_hunk.lines.append(raw_line)

    if current_hunk is not None:
        hunks.append(current_hunk)

    if not hunks:
        raise ValueError("patch text must include at least one unified-diff hunk")

    return hunks
src/loader/utils/file_mutations.pymodified
@@ -11,7 +11,11 @@ from rich.console import Group
11
 from rich.panel import Panel
11
 from rich.panel import Panel
12
 from rich.text import Text
12
 from rich.text import Text
13
 
13
 
14
-from ..tools.fs_safety import StructuredPatchHunk, make_structured_patch
14
+from ..tools.fs_safety import (
15
+    StructuredPatchHunk,
16
+    make_structured_patch,
17
+    parse_unified_diff_patch,
18
+)
15
 
19
 
16
 FILE_MUTATION_TOOLS = {"write", "edit", "patch"}
20
 FILE_MUTATION_TOOLS = {"write", "edit", "patch"}
17
 DIFF_TRUNCATION_NOTICE = "truncated for display; full result preserved in session"
21
 DIFF_TRUNCATION_NOTICE = "truncated for display; full result preserved in session"
@@ -98,6 +102,8 @@ def build_file_mutation_preview(
98
         structured_patch = _coerce_patch_hunks(info.get("hunks")) or _coerce_patch_hunks(
102
         structured_patch = _coerce_patch_hunks(info.get("hunks")) or _coerce_patch_hunks(
99
             args.get("hunks")
103
             args.get("hunks")
100
         )
104
         )
105
+    if not structured_patch and tool_name == "patch":
106
+        structured_patch = _coerce_raw_patch_hunks(info) or _coerce_raw_patch_hunks(args)
101
 
107
 
102
     old_text = _extract_old_text(tool_name, info) or _extract_old_text(tool_name, args)
108
     old_text = _extract_old_text(tool_name, info) or _extract_old_text(tool_name, args)
103
     new_text = _extract_new_text(tool_name, info) or _extract_new_text(tool_name, args)
109
     new_text = _extract_new_text(tool_name, info) or _extract_new_text(tool_name, args)
@@ -192,6 +198,18 @@ def _coerce_patch_hunks(value: Any) -> list[StructuredPatchHunk]:
192
     return hunks
198
     return hunks
193
 
199
 
194
 
200
 
201
def _coerce_raw_patch_hunks(payload: dict[str, Any]) -> list[StructuredPatchHunk]:
    """Best-effort parse of a raw-diff tool argument into structured hunks.

    The known raw-patch argument names are tried in priority order; the
    first non-blank string value that parses cleanly wins. Candidates that
    fail to parse are skipped silently — this is preview-building code, so
    an unusable diff simply yields no hunks.
    """
    for value in (payload.get(key) for key in ("patch", "diff", "patch_text")):
        if isinstance(value, str) and value.strip():
            try:
                return parse_unified_diff_patch(value)
            except ValueError:
                pass
    return []
211
+
212
+
195
 def _extract_file_path(payload: dict[str, Any]) -> str | None:
213
 def _extract_file_path(payload: dict[str, Any]) -> str | None:
196
     for key in ("file_path", "filePath", "path", "filename", "file"):
214
     for key in ("file_path", "filePath", "path", "filename", "file"):
197
         value = payload.get(key)
215
         value = payload.get(key)
tests/test_completion_policy.pymodified
@@ -7,7 +7,7 @@ from types import SimpleNamespace
7
 
7
 
8
 import pytest
8
 import pytest
9
 
9
 
10
-from loader.llm.base import Message, Role
10
+from loader.llm.base import Message, Role, ToolCall
11
 from loader.runtime.completion_policy import CompletionPolicy
11
 from loader.runtime.completion_policy import CompletionPolicy
12
 from loader.runtime.context import RuntimeContext
12
 from loader.runtime.context import RuntimeContext
13
 from loader.runtime.dod import VerificationEvidence, create_definition_of_done
13
 from loader.runtime.dod import VerificationEvidence, create_definition_of_done
@@ -24,6 +24,7 @@ from loader.runtime.task_completion import (
24
     detect_premature_completion,
24
     detect_premature_completion,
25
     get_continuation_prompt,
25
     get_continuation_prompt,
26
 )
26
 )
27
+from loader.runtime.workflow import advance_todos_from_tool_call, sync_todos_to_definition_of_done
27
 from loader.runtime.verification_observations import (
28
 from loader.runtime.verification_observations import (
28
     VerificationObservationStatus,
29
     VerificationObservationStatus,
29
     verification_attempt_id,
30
     verification_attempt_id,
@@ -338,6 +339,78 @@ def test_completion_assessment_attaches_typed_verification_provenance() -> None:
338
     assert assessment.evidence_provenance[0].summary == "verification failed for `pytest -q`"
339
     assert assessment.evidence_provenance[0].summary == "verification failed for `pytest -q`"
339
 
340
 
340
 
341
 
342
def test_completion_assessment_uses_advanced_todo_progress_for_next_step() -> None:
    """Completion assessment should surface the first still-open todo.

    Seeds a four-item todo list on the definition of done, advances the
    first three items via read/glob/read tool calls, then asserts that the
    remaining item ("Update index.html ...") is reported both as missing
    evidence and as the suggested next step.
    """
    dod = create_definition_of_done("Fix the chapter links in index.html.")
    # Four pending todos mirroring an inspect-then-edit plan.
    sync_todos_to_definition_of_done(
        dod,
        [
            {
                "content": "First, examine the current index.html file to understand its structure",
                "active_form": "Working on: First, examine the current index.html file to understand its structure",
                "status": "pending",
            },
            {
                "content": "List and read all HTML files in the chapters directory to extract chapter information",
                "active_form": "Working on: List and read all HTML files in the chapters directory to extract chapter information",
                "status": "pending",
            },
            {
                "content": "Parse chapter titles from each HTML file",
                "active_form": "Working on: Parse chapter titles from each HTML file",
                "status": "pending",
            },
            {
                "content": "Update index.html with correct chapter links and titles",
                "active_form": "Working on: Update index.html with correct chapter links and titles",
                "status": "pending",
            },
        ],
    )
    # Each call below is expected to advance one todo — presumably matched
    # against the todo content by advance_todos_from_tool_call; confirm in
    # loader.runtime.workflow.
    advance_todos_from_tool_call(
        dod,
        ToolCall(
            id="read-index",
            name="read",
            arguments={"file_path": "/tmp/fortran/index.html"},
        ),
    )
    advance_todos_from_tool_call(
        dod,
        ToolCall(
            id="glob-chapters",
            name="glob",
            arguments={"path": "/tmp/fortran/chapters", "pattern": "*.html"},
        ),
    )
    advance_todos_from_tool_call(
        dod,
        ToolCall(
            id="read-chapter",
            name="read",
            arguments={"file_path": "/tmp/fortran/chapters/01-introduction.html"},
        ),
    )

    # The response claims intent to edit but no mutating action was taken.
    assessment = assess_completion_follow_through_with_provenance(
        task="Update /tmp/fortran/index.html so every chapter link is correct.",
        response="I'll update the index.html file with the correct chapter links and titles.",
        actions_taken=[
            "read: {'file_path': '/tmp/fortran/index.html'}",
            "glob: {'path': '/tmp/fortran/chapters', 'pattern': '*.html'}",
            "read: {'file_path': '/tmp/fortran/chapters/01-introduction.html'}",
        ],
        dod=dod,
    )

    # The one untouched todo drives both the missing-evidence entry and the
    # suggested next step.
    assert assessment.check.missing_evidence[0] == (
        "completion of tracked work items "
        "(Update index.html with correct chapter links and titles)"
    )
    assert assessment.check.suggested_next_steps[0] == (
        "Complete the tracked item: Update index.html with correct chapter links and titles"
    )
412
+
413
+
341
 @pytest.mark.asyncio
414
 @pytest.mark.asyncio
342
 async def test_completion_policy_stops_for_text_loop_using_runtime_context(
415
 async def test_completion_policy_stops_for_text_loop_using_runtime_context(
343
     temp_dir: Path,
416
     temp_dir: Path,
tests/test_expanded_tools.pymodified
@@ -87,6 +87,43 @@ async def test_patch_tool_accepts_replacement_block_hunks(temp_dir: Path) -> Non
87
     assert result.metadata["structured_patch"]
87
     assert result.metadata["structured_patch"]
88
 
88
 
89
 
89
 
90
@pytest.mark.asyncio
async def test_patch_tool_accepts_unified_diff_string(temp_dir: Path) -> None:
    """A raw unified diff passed via `patch` is parsed and applied in place."""
    sample_file = temp_dir / "sample.txt"
    sample_file.write_text("alpha\nbeta\ngamma\n")
    unified_diff = (
        "--- a/sample.txt\n"
        "+++ b/sample.txt\n"
        "@@ -2,1 +2,1 @@\n"
        "-beta\n"
        "+beta updated\n"
    )

    outcome = await PatchTool(workspace_root=temp_dir).execute(
        file_path=str(sample_file),
        patch=unified_diff,
    )

    assert outcome.is_error is False
    assert sample_file.read_text() == "alpha\nbeta updated\ngamma\n"
    assert outcome.metadata["structured_patch"]
110
+
111
+
112
@pytest.mark.asyncio
async def test_patch_tool_rejects_invalid_unified_diff_string(temp_dir: Path) -> None:
    """A malformed `@@` hunk header surfaces as a tool error, not a crash."""
    sample_file = temp_dir / "sample.txt"
    sample_file.write_text("alpha\nbeta\ngamma\n")
    patch_tool = PatchTool(workspace_root=temp_dir)

    outcome = await patch_tool.execute(
        file_path=str(sample_file),
        patch="--- a/sample.txt\n+++ b/sample.txt\n@@ ...\n",
    )

    assert outcome.is_error is True
    assert "invalid unified-diff hunk header" in outcome.output
125
+
126
+
90
 @pytest.mark.asyncio
127
 @pytest.mark.asyncio
91
 async def test_git_tool_inspects_read_only_repo_state(temp_dir: Path) -> None:
128
 async def test_git_tool_inspects_read_only_repo_state(temp_dir: Path) -> None:
92
     subprocess.run(["git", "init", "--quiet"], cwd=temp_dir, check=True)
129
     subprocess.run(["git", "init", "--quiet"], cwd=temp_dir, check=True)
tests/test_finalization.pymodified
@@ -113,6 +113,22 @@ class FakeExecutor:
113
         return self._outcomes.pop(0)
113
         return self._outcomes.pop(0)
114
 
114
 
115
 
115
 
116
class RecordingExecutor:
    """Test double that records every executed command and always succeeds."""

    def __init__(self) -> None:
        # Commands captured in execution order for later assertions.
        self.commands: list[str] = []

    async def execute_tool_call(self, tool_call: ToolCall, **_: object) -> ToolExecutionOutcome:
        # Record the shell command (empty string when absent), then fake a
        # clean exit so the caller's success path runs.
        self.commands.append(str(tool_call.arguments.get("command", "")))
        return tool_outcome(
            tool_call=tool_call,
            output="ok",
            is_error=False,
            exit_code=0,
            stdout="ok",
        )
130
+
131
+
116
 def build_context(temp_dir: Path, session: FakeSession) -> RuntimeContext:
132
 def build_context(temp_dir: Path, session: FakeSession) -> RuntimeContext:
117
     registry = create_default_registry(temp_dir)
133
     registry = create_default_registry(temp_dir)
118
     registry.configure_workspace_root(temp_dir)
134
     registry.configure_workspace_root(temp_dir)
@@ -301,7 +317,6 @@ async def test_turn_finalizer_records_passed_verification_observation(
301
     )
317
     )
302
     dod = create_definition_of_done("Update the runtime tests.")
318
     dod = create_definition_of_done("Update the runtime tests.")
303
     dod.mutating_actions.append("write")
319
     dod.mutating_actions.append("write")
304
-    dod.touched_files.append(str(temp_dir / "tests" / "test_runtime.py"))
305
     dod.verification_commands = ["uv run pytest -q"]
320
     dod.verification_commands = ["uv run pytest -q"]
306
     summary = TurnSummary(final_response="")
321
     summary = TurnSummary(final_response="")
307
     tool_call = ToolCall(
322
     tool_call = ToolCall(
@@ -360,6 +375,59 @@ async def test_turn_finalizer_records_passed_verification_observation(
360
     assert [item.status for item in session.workflow_timeline[-1].verification_observations] == [
375
     assert [item.status for item in session.workflow_timeline[-1].verification_observations] == [
361
         VerificationObservationStatus.PASSED.value
376
         VerificationObservationStatus.PASSED.value
362
     ]
377
     ]
378
+
379
+
380
@pytest.mark.asyncio
async def test_turn_finalizer_appends_runtime_semantic_verifier_to_planned_commands(
    temp_dir: Path,
) -> None:
    """The DoD gate runs the planned command plus a derived python3 verifier.

    Builds a tiny chapter/index fixture, seeds the definition of done with a
    single grep verification command, and asserts that the executor saw both
    that grep and an additional command starting with a python3 heredoc —
    presumably appended by derive_verification_commands; confirm in
    finalization.py.
    """
    chapters = temp_dir / "chapters"
    chapters.mkdir()
    (chapters / "01-introduction.html").write_text(
        "<h1>Chapter 1: Introduction to Fortran</h1>\n"
    )
    # The index already links the chapter correctly, so verification can pass.
    index = temp_dir / "index.html"
    index.write_text(
        "\n".join(
            [
                '<ul class="chapter-list">',
                '  <li><a href="chapters/01-introduction.html">Chapter 1: Introduction to Fortran</a></li>',
                "</ul>",
            ]
        )
    )

    session = FakeSession()
    context = build_context(temp_dir, session)
    finalizer = TurnFinalizer(
        context,
        RuntimeTracer(),
        DefinitionOfDoneStore(temp_dir),
        set_workflow_mode=_noop_set_workflow_mode,
    )
    dod = create_definition_of_done(
        "Update index.html so the table of contents links and chapter titles are correct."
    )
    # A mutation plus a touched file makes the gate treat verification as due.
    dod.mutating_actions.append("edit")
    dod.touched_files.append(str(index))
    dod.verification_commands = ['grep -n "href=" index.html']
    summary = TurnSummary(final_response="")
    executor = RecordingExecutor()

    async def capture(event) -> None:
        # Events are irrelevant here; only executed commands are asserted.
        return None

    result = await finalizer.run_definition_of_done_gate(
        dod=dod,
        candidate_response="Updated the index.html links.",
        emit=capture,
        summary=summary,
        executor=executor,  # type: ignore[arg-type]
    )

    assert result.should_continue is False
    # The explicitly planned command ran...
    assert any(command == 'grep -n "href=" index.html' for command in executor.commands)
    # ...and a derived python3 heredoc verifier ran alongside it.
    assert any(command.startswith("/usr/bin/python3 - <<'PY'") for command in executor.commands)
363
     assert (
431
     assert (
364
         session.workflow_timeline[-1].verification_observations[0].attempt_id
432
         session.workflow_timeline[-1].verification_observations[0].attempt_id
365
         == "verification-attempt-1"
433
         == "verification-attempt-1"
tests/test_runtime_harness.pymodified
@@ -901,6 +901,46 @@ async def test_raw_json_patch_tool_call_fallback(temp_dir: Path) -> None:
901
     assert "Patched sample.txt." in run.response
901
     assert "Patched sample.txt." in run.response
902
 
902
 
903
 
903
 
904
@pytest.mark.asyncio
async def test_native_patch_tool_accepts_unified_diff_string(temp_dir: Path) -> None:
    """End-to-end: a native `patch` tool call with a raw unified diff mutates the file.

    Scripts one native tool call carrying the diff via the `patch` argument
    followed by a final response, then asserts the tool ran once and the
    target file content was rewritten.
    """
    target = temp_dir / "sample.txt"
    target.write_text("alpha\nbeta\ngamma\n")

    # Scripted completions are consumed in order: tool call, then final text.
    backend = ScriptedBackend(
        completions=[
            native_tool_response(
                ToolCall(
                    id="patch-1",
                    name="patch",
                    arguments={
                        "file_path": str(target),
                        "patch": (
                            "--- a/sample.txt\n"
                            "+++ b/sample.txt\n"
                            "@@ -2,1 +2,1 @@\n"
                            "-beta\n"
                            "+beta updated\n"
                        ),
                    },
                ),
                content="I'll patch the file directly.",
            ),
            final_response("Patched sample.txt."),
        ]
    )

    run = await run_scenario(
        "Update sample.tx" "t.",
        backend,
        config=non_streaming_config(),
        project_root=temp_dir,
    )

    # Exactly one tool event, the diff applied, and the final text surfaced.
    assert tool_event_names(run) == ["patch"]
    assert target.read_text() == "alpha\nbeta updated\ngamma\n"
    assert "Patched sample.txt." in run.response
942
+
943
+
904
 @pytest.mark.asyncio
944
 @pytest.mark.asyncio
905
 async def test_raw_json_ask_user_question_tool_call_fallback(temp_dir: Path) -> None:
945
 async def test_raw_json_ask_user_question_tool_call_fallback(temp_dir: Path) -> None:
906
     raw_json = json.dumps(
946
     raw_json = json.dumps(
@@ -1766,6 +1806,66 @@ async def test_duplicate_read_is_skipped_without_intervening_mutation(
1766
     assert "existing file contents" in run.response
1806
     assert "existing file contents" in run.response
1767
 
1807
 
1768
 
1808
 
1809
+@pytest.mark.asyncio
1810
+async def test_duplicate_observation_queues_steering_to_reuse_prior_evidence(
1811
+    temp_dir: Path,
1812
+) -> None:
1813
+    chapters = temp_dir / "chapters"
1814
+    chapters.mkdir()
1815
+    (chapters / "01-introduction.html").write_text("<h1>Chapter 1: Introduction to Fortran</h1>\n")
1816
+    (chapters / "02-setup.html").write_text("<h1>Chapter 2: Setting Up Fortran</h1>\n")
1817
+    index_file = temp_dir / "index.html"
1818
+    index_file.write_text("broken table of contents\n")
1819
+
1820
+    backend = ScriptedBackend(
1821
+        completions=[
1822
+            native_tool_response(
1823
+                ToolCall(
1824
+                    id="glob-1",
1825
+                    name="glob",
1826
+                    arguments={"path": str(chapters), "pattern": "*.html"},
1827
+                ),
1828
+                content="I'll inspect the chapter inventory first.",
1829
+            ),
1830
+            native_tool_response(
1831
+                ToolCall(
1832
+                    id="read-1",
1833
+                    name="read",
1834
+                    arguments={"file_path": str(index_file)},
1835
+                ),
1836
+                content="I'll inspect the index next.",
1837
+            ),
1838
+            native_tool_response(
1839
+                ToolCall(
1840
+                    id="read-2",
1841
+                    name="read",
1842
+                    arguments={"file_path": str(index_file)},
1843
+                ),
1844
+                content="I'll reopen the index.",
1845
+            ),
1846
+            final_response("I'll reuse the earlier evidence and patch the index next."),
1847
+        ]
1848
+    )
1849
+
1850
+    run = await run_scenario(
1851
+        "Update index.html so the table of contents links are correct.",
1852
+        backend,
1853
+        config=non_streaming_config(),
1854
+        project_root=temp_dir,
1855
+    )
1856
+
1857
+    messages = tool_result_messages(run)
1858
+    steering_messages = [
1859
+        event.content
1860
+        for event in run.events
1861
+        if event.type == "steering" and event.content
1862
+    ]
1863
+
1864
+    assert any("reuse the earlier read result instead of rereading" in message for message in messages)
1865
+    assert any("Reuse the earlier observation instead of repeating it." in message for message in steering_messages)
1866
+    assert any("index.html" in message for message in steering_messages)
1867
+
1868
+
1769
 @pytest.mark.asyncio
1869
 @pytest.mark.asyncio
1770
 async def test_interleaved_reread_is_allowed_once_without_intervening_mutation(
1870
 async def test_interleaved_reread_is_allowed_once_without_intervening_mutation(
1771
     temp_dir: Path,
1871
     temp_dir: Path,
tests/test_safeguard_services.pymodified
@@ -158,11 +158,25 @@ def test_pre_action_validator_blocks_patch_without_hunks() -> None:
158
     assert result == ValidationResult(
158
     assert result == ValidationResult(
159
         valid=False,
159
         valid=False,
160
         reason="Patch hunks are missing",
160
         reason="Patch hunks are missing",
161
-        suggestion="Provide one or more structured patch hunks",
161
+        suggestion="Provide structured patch hunks or a unified diff patch string",
162
         severity="error",
162
         severity="error",
163
     )
163
     )
164
 
164
 
165
 
165
 
166
def test_pre_action_validator_allows_patch_string_without_hunks() -> None:
    """A raw unified diff supplied via `patch` satisfies the hunk requirement."""
    arguments = {
        "file_path": "notes.txt",
        "patch": "--- a/notes.txt\n+++ b/notes.txt\n@@ -1,1 +1,1 @@\n-old\n+new\n",
    }

    outcome = PreActionValidator().validate("patch", arguments)

    assert outcome == ValidationResult(valid=True)
178
+
179
+
166
 def test_runtime_safeguards_wrap_runtime_owned_services() -> None:
180
 def test_runtime_safeguards_wrap_runtime_owned_services() -> None:
167
     safeguards = RuntimeSafeguards()
181
     safeguards = RuntimeSafeguards()
168
 
182
 
tests/test_tool_batch_policies.pymodified
@@ -349,12 +349,74 @@ async def test_tool_batch_recovery_controller_includes_known_state_for_missing_f
349
 
349
 
350
     assert follow_up is not None
350
     assert follow_up is not None
351
     assert "## CONTINUE FROM KNOWN STATE" in follow_up.content
351
     assert "## CONTINUE FROM KNOWN STATE" in follow_up.content
352
+    assert "apply the fix using confirmed findings" in follow_up.content
353
+    assert "## ACTION BIAS FOR THIS RECOVERY" in follow_up.content
354
+    assert "Prefer edit/write/patch on the target file" in follow_up.content
352
     assert "04-variables.html" in follow_up.content
355
     assert "04-variables.html" in follow_up.content
353
     assert "02-basic-syntax.html -> 02-setup.html" in follow_up.content
356
     assert "02-basic-syntax.html -> 02-setup.html" in follow_up.content
354
     assert "`~/Loader/guides/fortran/index.html`" in follow_up.content
357
     assert "`~/Loader/guides/fortran/index.html`" in follow_up.content
355
     assert any(event.type == "recovery" for event in events)
358
     assert any(event.type == "recovery" for event in events)
356
 
359
 
357
 
360
 
361
@pytest.mark.asyncio
async def test_tool_batch_recovery_controller_suggests_known_sibling_files(
    temp_dir: Path,
) -> None:
    """A missing-file read yields a follow-up listing likely sibling files.

    Seeds the transcript with a glob observation of five chapter files, then
    fails a read of a non-existent sibling path and asserts the recovery
    follow-up names the closest known candidate instead of suggesting a
    retry of the missing path.
    """
    # These two callbacks must never fire: this recovery path is expected to
    # be handled without confidence scoring or action verification.
    async def assess_confidence(tool_name: str, tool_args: dict, context: str) -> ConfidenceAssessment:
        raise AssertionError("Confidence should not run here")

    async def verify_action(tool_name: str, tool_args: dict, result: str, expected: str = "") -> ActionVerification:
        raise AssertionError("Verification should not run here")

    # Prior glob observation establishing the known chapter inventory.
    messages = [
        Message(
            role=Role.TOOL,
            content=(
                "Observation [glob]: Result: "
                "/private/tmp/fortran-qwen-recovery-check/chapters/01-introduction.html\n"
                "/private/tmp/fortran-qwen-recovery-check/chapters/02-setup.html\n"
                "/private/tmp/fortran-qwen-recovery-check/chapters/03-basics.html\n"
                "/private/tmp/fortran-qwen-recovery-check/chapters/04-variables.html\n"
                "/private/tmp/fortran-qwen-recovery-check/chapters/05-input-output.html"
            ),
            tool_results=[],
        ),
    ]
    context = build_context(
        temp_dir=temp_dir,
        messages=messages,
        assess_confidence=assess_confidence,
        verify_action=verify_action,
    )
    controller = ToolBatchRecoveryController(context)
    # The read targets a path that was never in the glob output. Note the
    # /tmp vs /private/tmp prefix difference — presumably the controller
    # normalizes or fuzzy-matches paths; confirm in tool_batch_recovery.py.
    tool_call = ToolCall(
        id="read-missing",
        name="read",
        arguments={"file_path": "/tmp/fortran-qwen-recovery-check/chapters/04-data-types.html"},
    )
    outcome = tool_outcome(
        tool_call=tool_call,
        output="File not found: /tmp/fortran-qwen-recovery-check/chapters/04-data-types.html",
        is_error=True,
    )

    events: list[AgentEvent] = []

    async def emit(event: AgentEvent) -> None:
        events.append(event)

    follow_up = await controller.build_follow_up(
        tool_call=tool_call,
        outcome=outcome,
        emit=emit,
    )

    # The follow-up names the closest known sibling ("04-variables.html")
    # rather than recommending a retry of the missing path.
    assert follow_up is not None
    assert "## LIKELY FILE CANDIDATES" in follow_up.content
    assert "`04-variables.html`" in follow_up.content
    assert "instead of retrying the missing path" in follow_up.content
418
+
419
+
358
 @pytest.mark.asyncio
420
 @pytest.mark.asyncio
359
 async def test_tool_batch_recovery_controller_reuses_context_for_related_missing_files(
421
 async def test_tool_batch_recovery_controller_reuses_context_for_related_missing_files(
360
     temp_dir: Path,
422
     temp_dir: Path,
tests/test_tool_batches.pymodified
@@ -540,6 +540,108 @@ async def test_tool_batch_runner_clears_recovery_context_after_successful_mutati
540
     assert context.recovery_context is None
540
     assert context.recovery_context is None
541
 
541
 
542
 
542
 
543
+@pytest.mark.asyncio
544
+async def test_tool_batch_runner_queues_duplicate_observation_nudge(
545
+    temp_dir: Path,
546
+) -> None:
547
+    async def assess_confidence(
548
+        tool_name: str,
549
+        tool_args: dict,
550
+        context: str,
551
+    ) -> ConfidenceAssessment:
552
+        raise AssertionError("Confidence scoring should be disabled in this scenario")
553
+
554
+    async def verify_action(
555
+        tool_name: str,
556
+        tool_args: dict,
557
+        result: str,
558
+        expected: str = "",
559
+    ) -> ActionVerification:
560
+        raise AssertionError("Verification should not run for this scenario")
561
+
562
+    messages = [
563
+        Message(
564
+            role=Role.TOOL,
565
+            content=(
566
+                "Observation [glob]: Result: "
567
+                f"{temp_dir}/chapters/01-introduction.html\n"
568
+                f"{temp_dir}/chapters/02-setup.html\n"
569
+                f"{temp_dir}/chapters/03-basics.html"
570
+            ),
571
+            tool_results=[],
572
+        ),
573
+        Message(
574
+            role=Role.ASSISTANT,
575
+            content="I should update the index now.",
576
+            tool_calls=[
577
+                ToolCall(
578
+                    id="read-index",
579
+                    name="read",
580
+                    arguments={"file_path": str(temp_dir / 'index.html')},
581
+                )
582
+            ],
583
+        ),
584
+    ]
585
+    context = build_context(
586
+        temp_dir=temp_dir,
587
+        messages=messages,
588
+        safeguards=FakeSafeguards(),
589
+        assess_confidence=assess_confidence,
590
+        verify_action=verify_action,
591
+        auto_recover=False,
592
+    )
593
+    context.session.current_task = (
594
+        f"Update {temp_dir / 'index.html'} with the right chapter links."
595
+    )
596
+    queued_messages: list[str] = []
597
+    context.queue_steering_message_callback = queued_messages.append
598
+    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))
599
+    tool_call = ToolCall(
600
+        id="read-dup",
601
+        name="read",
602
+        arguments={"file_path": str(temp_dir / "index.html")},
603
+    )
604
+    duplicate_message = (
605
+        "[Skipped - duplicate action: Already read "
606
+        f"{temp_dir / 'index.html'} recently without any intervening changes; "
607
+        "reuse the earlier read result instead of rereading]"
608
+    )
609
+    executor = FakeExecutor(
610
+        [
611
+            ToolExecutionOutcome(
612
+                tool_call=tool_call,
613
+                state=ToolExecutionState.DUPLICATE,
614
+                message=Message.tool_result_message(
615
+                    tool_call_id=tool_call.id,
616
+                    display_content=duplicate_message,
617
+                    result_content=duplicate_message,
618
+                ),
619
+                event_content=duplicate_message,
620
+                is_error=False,
621
+                result_output=duplicate_message,
622
+            )
623
+        ]
624
+    )
625
+
626
+    await runner.execute_batch(
627
+        tool_calls=[tool_call],
628
+        tool_source="assistant",
629
+        pending_tool_calls_seen=set(),
630
+        emit=_noop_emit,
631
+        summary=TurnSummary(final_response=""),
632
+        dod=create_definition_of_done("Fix the chapter links"),
633
+        executor=executor,  # type: ignore[arg-type]
634
+        on_confirmation=None,
635
+        on_user_question=None,
636
+        emit_confirmation=None,
637
+        consecutive_errors=0,
638
+    )
639
+
640
+    assert len(queued_messages) == 1
641
+    assert "Reuse the earlier observation instead of repeating it." in queued_messages[0]
642
+    assert "index.html" in queued_messages[0]
643
+
644
+
543
 async def _noop_emit(event: AgentEvent) -> None:
645
 async def _noop_emit(event: AgentEvent) -> None:
544
     return None
646
     return None
545
 
647
 
tests/test_workflow.pymodified
@@ -4,6 +4,7 @@ from __future__ import annotations
4
 
4
 
5
 from pathlib import Path
5
 from pathlib import Path
6
 
6
 
7
+from loader.llm.base import ToolCall
7
 from loader.runtime.clarify_grounding import ClarifyGrounding, ClarifyRepoFact
8
 from loader.runtime.clarify_grounding import ClarifyGrounding, ClarifyRepoFact
8
 from loader.runtime.dod import DefinitionOfDoneStore, create_definition_of_done
9
 from loader.runtime.dod import DefinitionOfDoneStore, create_definition_of_done
9
 from loader.runtime.workflow import (
10
 from loader.runtime.workflow import (
@@ -12,6 +13,7 @@ from loader.runtime.workflow import (
12
     PlanningArtifacts,
13
     PlanningArtifacts,
13
     WorkflowArtifactStore,
14
     WorkflowArtifactStore,
14
     WorkflowMode,
15
     WorkflowMode,
16
+    advance_todos_from_tool_call,
15
     build_execute_bridge,
17
     build_execute_bridge,
16
     enrich_clarify_brief_with_grounding,
18
     enrich_clarify_brief_with_grounding,
17
     extract_verification_commands_from_markdown,
19
     extract_verification_commands_from_markdown,
@@ -183,6 +185,26 @@ def test_planning_artifacts_recover_embedded_verification_from_legacy_separator(
183
     ]
185
     ]
184
 
186
 
185
 
187
 
188
+def test_extract_verification_commands_from_markdown_splits_code_blocks() -> None:
189
+    markdown = "\n".join(
190
+        [
191
+            "# Verification Plan",
192
+            "",
193
+            "## Verification Commands",
194
+            "```bash",
195
+            "# Check chapter files",
196
+            "ls chapters",
197
+            "grep -n \"href=\" index.html",
198
+            "```",
199
+        ]
200
+    )
201
+
202
+    assert extract_verification_commands_from_markdown(markdown) == [
203
+        "ls chapters",
204
+        'grep -n "href=" index.html',
205
+    ]
206
+
207
+
186
 def test_workflow_artifact_store_and_bridge_round_trip(tmp_path: Path) -> None:
208
 def test_workflow_artifact_store_and_bridge_round_trip(tmp_path: Path) -> None:
187
     store = WorkflowArtifactStore(tmp_path)
209
     store = WorkflowArtifactStore(tmp_path)
188
     brief = ClarifyBrief.fallback(
210
     brief = ClarifyBrief.fallback(
@@ -250,3 +272,93 @@ def test_sync_todos_to_definition_of_done_preserves_runtime_items() -> None:
250
     assert "Writing router" in dod.pending_items
272
     assert "Writing router" in dod.pending_items
251
     assert "Collect verification evidence" in dod.pending_items
273
     assert "Collect verification evidence" in dod.pending_items
252
     assert "Update tests" in dod.completed_items
274
     assert "Update tests" in dod.completed_items
275
+
276
+
277
+def test_advance_todos_from_tool_call_tracks_plan_progress() -> None:
278
+    dod = create_definition_of_done("Fix the chapter links in index.html.")
279
+    sync_todos_to_definition_of_done(
280
+        dod,
281
+        [
282
+            {
283
+                "content": "First, examine the current index.html file to understand its structure",
284
+                "active_form": "Working on: First, examine the current index.html file to understand its structure",
285
+                "status": "pending",
286
+            },
287
+            {
288
+                "content": "List and read all HTML files in the chapters directory to extract chapter information",
289
+                "active_form": "Working on: List and read all HTML files in the chapters directory to extract chapter information",
290
+                "status": "pending",
291
+            },
292
+            {
293
+                "content": "Parse chapter titles from each HTML file",
294
+                "active_form": "Working on: Parse chapter titles from each HTML file",
295
+                "status": "pending",
296
+            },
297
+            {
298
+                "content": "Update index.html with correct chapter links and titles",
299
+                "active_form": "Working on: Update index.html with correct chapter links and titles",
300
+                "status": "pending",
301
+            },
302
+            {
303
+                "content": "Verify the updated index.html file is properly formatted",
304
+                "active_form": "Working on: Verify the updated index.html file is properly formatted",
305
+                "status": "pending",
306
+            },
307
+        ],
308
+    )
309
+
310
+    assert advance_todos_from_tool_call(
311
+        dod,
312
+        ToolCall(
313
+            id="read-index",
314
+            name="read",
315
+            arguments={"file_path": "/tmp/fortran/index.html"},
316
+        ),
317
+    )
318
+    assert (
319
+        "First, examine the current index.html file to understand its structure"
320
+        in dod.completed_items
321
+    )
322
+
323
+    assert advance_todos_from_tool_call(
324
+        dod,
325
+        ToolCall(
326
+            id="glob-chapters",
327
+            name="glob",
328
+            arguments={"path": "/tmp/fortran/chapters", "pattern": "*.html"},
329
+        ),
330
+    )
331
+    assert (
332
+        "List and read all HTML files in the chapters directory to extract chapter information"
333
+        in dod.completed_items
334
+    )
335
+
336
+    assert advance_todos_from_tool_call(
337
+        dod,
338
+        ToolCall(
339
+            id="read-chapter",
340
+            name="read",
341
+            arguments={"file_path": "/tmp/fortran/chapters/01-introduction.html"},
342
+        ),
343
+    )
344
+    assert "Parse chapter titles from each HTML file" in dod.completed_items
345
+
346
+    assert advance_todos_from_tool_call(
347
+        dod,
348
+        ToolCall(
349
+            id="patch-index",
350
+            name="patch",
351
+            arguments={"file_path": "/tmp/fortran/index.html", "hunks": []},
352
+        ),
353
+    )
354
+    assert "Update index.html with correct chapter links and titles" in dod.completed_items
355
+
356
+    assert advance_todos_from_tool_call(
357
+        dod,
358
+        ToolCall(
359
+            id="verify-index",
360
+            name="bash",
361
+            arguments={"command": "grep -o 'href=\"[^\"]*\"' /tmp/fortran/index.html"},
362
+        ),
363
+    )
364
+    assert "Verify the updated index.html file is properly formatted" in dod.completed_items