tenseleyflow/loader / 297e213

Browse files

Strengthen qwen progress and verification handoffs

Authored by espadonne
SHA
297e213d29e61587549386472439e606500e8b69
Parents
918941f
Tree
7288310

13 changed files

Status | File | + | -
M src/loader/runtime/artifact_invalidation.py 40 3
M src/loader/runtime/dod.py 28 0
M src/loader/runtime/finalization.py 1 0
M src/loader/runtime/tool_batches.py 34 1
M src/loader/runtime/workflow.py 170 1
M src/loader/runtime/workflow_lanes.py 44 0
M src/loader/runtime/workflow_recovery.py 19 0
M tests/test_artifact_invalidation.py 25 0
M tests/test_dod.py 23 0
M tests/test_finalization.py 47 0
M tests/test_tool_batches.py 95 0
M tests/test_workflow.py 151 0
M tests/test_workflow_runtime.py 15 0
src/loader/runtime/artifact_invalidation.py (modified)
@@ -49,13 +49,18 @@ class ArtifactInvalidationAssessor:
4949
         unexpected_paths = [
5050
             name
5151
             for path in touched_files
52
-            if (name := _path_name(path)) and name.lower() not in plan_text
52
+            if (name := _path_name(path)) and not _text_covers_path_reference(plan_text, path)
5353
         ]
5454
         confirmed_touchpoints = [
5555
             name
5656
             for path in touched_files
5757
             if (name := _path_name(path))
5858
         ]
59
+        confirmed_touchpoint_keys = {
60
+            _path_reference_identity(path)
61
+            for path in touched_files
62
+            if _path_reference_identity(path)
63
+        }
5964
         inferred_touchpoints = [
6065
             item
6166
             for item in _extract_path_mentions(
@@ -63,7 +68,7 @@ class ArtifactInvalidationAssessor:
6368
                 implementation_text,
6469
                 verification_text,
6570
             )
66
-            if _path_name(item) not in confirmed_touchpoints
71
+            if _path_reference_identity(item) not in confirmed_touchpoint_keys
6772
         ]
6873
         stale_plan = False
6974
         stale_brief = False
@@ -147,7 +152,11 @@ class ArtifactInvalidationAssessor:
147152
                     )
148153
 
149154
             out_of_brief_paths = [
150
-                name for name in unexpected_paths if name.lower() not in brief_text
155
+                name
156
+                for path in touched_files
157
+                if (name := _path_name(path))
158
+                and name in unexpected_paths
159
+                and not _text_covers_path_reference(brief_text, path)
151160
             ]
152161
             if out_of_brief_paths:
153162
                 stale_brief = True
@@ -200,6 +209,34 @@ def _path_name(path: str) -> str:
200209
     return normalized.rsplit("/", maxsplit=1)[-1].strip()
201210
 
202211
 
212
+def _path_reference_identity(path: str) -> str:
213
+    normalized = _path_name(path)
214
+    if not normalized:
215
+        return ""
216
+    return _canonical_path_reference(normalized)
217
+
218
+
219
+def _text_covers_path_reference(text: str, path: str) -> bool:
220
+    normalized_text = text.lower()
221
+    candidates = [candidate for candidate in (str(path).strip(), _path_name(path)) if candidate]
222
+
223
+    for candidate in candidates:
224
+        if candidate.lower() in normalized_text:
225
+            return True
226
+
227
+    canonical_text = _canonical_path_reference(text)
228
+    return any(
229
+        canonical_candidate and canonical_candidate in canonical_text
230
+        for canonical_candidate in (_canonical_path_reference(candidate) for candidate in candidates)
231
+    )
232
+
233
+
234
+def _canonical_path_reference(value: str) -> str:
235
+    normalized = value.lower().strip()
236
+    normalized = re.sub(r"[^a-z0-9]+", " ", normalized)
237
+    return " ".join(normalized.split())
238
+
239
+
203240
 def _text_covers_requirement(text: str, requirement: str) -> bool:
204241
     normalized_text = text.lower()
205242
     normalized_requirement = requirement.lower()
src/loader/runtime/dod.py (modified)
@@ -208,6 +208,7 @@ def derive_verification_commands(
208208
     *,
209209
     project_root: Path,
210210
     task_statement: str,
211
+    supplement_existing: bool = False,
211212
 ) -> list[str]:
212213
     """Generate verification commands from execution history and project shape."""
213214
 
@@ -234,6 +235,8 @@ def derive_verification_commands(
234235
 
235236
     if commands:
236237
         return commands
238
+    if supplement_existing:
239
+        return commands
237240
 
238241
     if dod.task_size == "small":
239242
         for path_str in dod.touched_files[:3]:
@@ -245,6 +248,11 @@ def derive_verification_commands(
245248
                     commands,
246249
                     f"python -m py_compile {shlex.quote(str(effective_path))}",
247250
                 )
251
+    elif _uses_external_artifacts_only(dod, project_root=project_root):
252
+        for path_str in dod.touched_files[:3]:
253
+            path = Path(path_str)
254
+            effective_path = path if path.is_absolute() else (project_root / path)
255
+            _append_unique(commands, f"test -f {shlex.quote(str(effective_path))}")
248256
     else:
249257
         if (project_root / "pyproject.toml").exists():
250258
             _append_unique(commands, "uv run pytest -q")
@@ -407,6 +415,26 @@ def _append_unique(items: list[str], value: str) -> None:
407415
         items.append(value)
408416
 
409417
 
418
+def _uses_external_artifacts_only(dod: DefinitionOfDone, *, project_root: Path) -> bool:
419
+    touched = [Path(path) for path in dod.touched_files if str(path).strip()]
420
+    if not touched:
421
+        return False
422
+    try:
423
+        root = project_root.resolve()
424
+    except FileNotFoundError:
425
+        root = project_root
426
+    external = [path for path in touched if not _path_is_within_root(path, root)]
427
+    return bool(external) and len(external) == len(touched)
428
+
429
+
430
+def _path_is_within_root(path: Path, root: Path) -> bool:
431
+    try:
432
+        path.resolve().relative_to(root)
433
+        return True
434
+    except ValueError:
435
+        return False
436
+
437
+
410438
 def synthesize_todo_items(dod: DefinitionOfDone) -> list[dict[str, str]]:
411439
     """Build a todo item list from the current DoD state.
412440
 
src/loader/runtime/finalization.py (modified)
@@ -294,6 +294,7 @@ class TurnFinalizer:
294294
                 dod,
295295
                 project_root=self.context.project_root,
296296
                 task_statement=dod.task_statement,
297
+                supplement_existing=True,
297298
             ):
298299
                 if command not in dod.verification_commands:
299300
                     dod.verification_commands.append(command)
src/loader/runtime/tool_batches.py (modified)
@@ -46,6 +46,19 @@ _TODO_NUDGE_EXCLUDED_ITEMS = {
4646
     "Complete the requested work",
4747
     _VERIFY_ITEM,
4848
 }
49
+_MUTATION_TODO_HINTS = (
50
+    "create",
51
+    "update",
52
+    "edit",
53
+    "write",
54
+    "fix",
55
+    "modify",
56
+    "change",
57
+    "patch",
58
+    "replace",
59
+    "correct",
60
+    "rewrite",
61
+)
4962
 
5063
 
5164
 @dataclass
@@ -290,18 +303,26 @@ class ToolBatchRunner:
290303
             max_items=2,
291304
         )
292305
         if next_pending and not html_toc_rule.task_targets_html_toc(current_task):
306
+            mutation_suffix = ""
307
+            if _todo_is_mutation_step(next_pending):
308
+                mutation_suffix = (
309
+                    " You already have enough evidence for that step, so stop gathering "
310
+                    "more reference material and perform the change now."
311
+                )
293312
             if confirmed_facts:
294313
                 self.context.queue_steering_message(
295314
                     "Reuse the earlier observation instead of repeating it. "
296315
                     f"Confirmed facts: {confirmed_facts}. "
297316
                     f"Continue with the next pending item: `{next_pending}`. "
298317
                     "Only gather more evidence if a specific fact required for that step is still unknown."
318
+                    + mutation_suffix
299319
                 )
300320
             else:
301321
                 self.context.queue_steering_message(
302322
                     "Reuse the earlier observation instead of repeating it. "
303323
                     f"Continue with the next pending item: `{next_pending}`. "
304324
                     "Only gather more evidence if a specific fact required for that step is still unknown."
325
+                    + mutation_suffix
305326
                 )
306327
             return
307328
 
@@ -752,10 +773,17 @@ class ToolBatchRunner:
752773
         if not completed_label or not next_pending or next_pending == completed_label:
753774
             return
754775
 
776
+        mutation_suffix = ""
777
+        if _todo_is_mutation_step(next_pending):
778
+            mutation_suffix = (
779
+                " You already have enough evidence for that step, so stop gathering "
780
+                "more reference material and perform the change now."
781
+            )
782
+
755783
         self.context.queue_steering_message(
756784
             f"Confirmed progress: `{completed_label}` is now satisfied by the successful "
757785
             f"`{tool_call.name}` result. Continue with the next pending item: "
758
-            f"`{next_pending}` instead of rereading the same evidence."
786
+            f"`{next_pending}` instead of rereading the same evidence.{mutation_suffix}"
759787
         )
760788
 
761789
 
@@ -795,6 +823,11 @@ def _mark_verification_stale(
795823
         dod.pending_items.append(_VERIFY_ITEM)
796824
 
797825
 
826
+def _todo_is_mutation_step(label: str) -> bool:
827
+    lowered = label.lower()
828
+    return any(token in lowered for token in _MUTATION_TODO_HINTS)
829
+
830
+
798831
 def _mark_verification_planned(
799832
     *,
800833
     context: RuntimeContext,
src/loader/runtime/workflow.py (modified)
@@ -125,6 +125,75 @@ _VERIFY_STEP_HINTS = (
125125
     "confirm",
126126
     "check",
127127
 )
128
+_AGGREGATE_TODO_HINTS = (
129
+    "each ",
130
+    "all ",
131
+    "every ",
132
+    "sequence",
133
+    "multiple ",
134
+    "across ",
135
+    "consistently",
136
+    "properly linked",
137
+    "directory structure",
138
+)
139
+_ACTIONABLE_STEP_VERBS = {
140
+    "add",
141
+    "apply",
142
+    "build",
143
+    "check",
144
+    "confirm",
145
+    "create",
146
+    "document",
147
+    "edit",
148
+    "ensure",
149
+    "fix",
150
+    "implement",
151
+    "inspect",
152
+    "list",
153
+    "move",
154
+    "parse",
155
+    "patch",
156
+    "read",
157
+    "refactor",
158
+    "remove",
159
+    "rename",
160
+    "reorder",
161
+    "rerun",
162
+    "re-run",
163
+    "review",
164
+    "run",
165
+    "search",
166
+    "test",
167
+    "update",
168
+    "validate",
169
+    "verify",
170
+    "write",
171
+}
172
+_RETROSPECTIVE_STEP_VERBS = {
173
+    "added",
174
+    "applied",
175
+    "built",
176
+    "checked",
177
+    "completed",
178
+    "confirmed",
179
+    "created",
180
+    "edited",
181
+    "ensured",
182
+    "examined",
183
+    "generated",
184
+    "implemented",
185
+    "inspected",
186
+    "listed",
187
+    "looked",
188
+    "parsed",
189
+    "patched",
190
+    "read",
191
+    "reviewed",
192
+    "updated",
193
+    "validated",
194
+    "verified",
195
+    "wrote",
196
+}
128197
 _TASK_COVERAGE_STOP_WORDS = {
129198
     "the",
130199
     "and",
@@ -491,6 +560,41 @@ class PlanningArtifacts:
491560
             implementation_steps=list(self.implementation_steps),
492561
         )
493562
 
563
+    def with_progress_context(
564
+        self,
565
+        *,
566
+        touched_files: list[str],
567
+        completed_items: list[str],
568
+    ) -> PlanningArtifacts:
569
+        """Return one copy that preserves already-confirmed execution progress."""
570
+
571
+        progress_items: list[str] = []
572
+        for raw_path in touched_files:
573
+            path_text = str(raw_path).strip()
574
+            if not path_text:
575
+                continue
576
+            progress_items.append(f"Already touched during execution: `{path_text}`.")
577
+        for raw_item in completed_items:
578
+            item = str(raw_item).strip()
579
+            if not item or item in _SPECIAL_TODO_ITEMS:
580
+                continue
581
+            progress_items.append(f"Already completed during execution: {item}.")
582
+
583
+        if not progress_items:
584
+            return self
585
+
586
+        return PlanningArtifacts(
587
+            implementation_markdown=_replace_markdown_section_items(
588
+                self.implementation_markdown,
589
+                "Confirmed Progress",
590
+                list(dict.fromkeys(progress_items)),
591
+            ),
592
+            verification_markdown=self.verification_markdown,
593
+            verification_commands=list(self.verification_commands),
594
+            acceptance_criteria=list(self.acceptance_criteria),
595
+            implementation_steps=list(self.implementation_steps),
596
+        )
597
+
494598
 
495599
 class WorkflowArtifactStore:
496600
     """Persist briefs and plans under `.loader/`."""
@@ -627,6 +731,15 @@ def merge_refreshed_todos_with_existing_scope(
627731
         and item not in _SPECIAL_TODO_ITEMS
628732
         and _task_text_covers_requirement(task_statement, item)
629733
     ]
734
+    refreshed_candidates = [
735
+        item.strip()
736
+        for item in refreshed_steps
737
+        if item.strip()
738
+        and (
739
+            not (grounded_completed or grounded_pending)
740
+            or _looks_actionable_refresh_step(item)
741
+        )
742
+    ]
630743
 
631744
     todos: list[dict[str, str]] = []
632745
     seen: set[str] = set()
@@ -641,7 +754,7 @@ def merge_refreshed_todos_with_existing_scope(
641754
                 "status": "completed",
642755
             }
643756
         )
644
-    for item in [*grounded_pending, *refreshed_steps]:
757
+    for item in [*grounded_pending, *refreshed_candidates]:
645758
         label = item.strip()
646759
         if not label or label in seen:
647760
             continue
@@ -740,7 +853,14 @@ def _todo_progress_score(item: str, tool_call: ToolCall) -> int:
740853
         elif _looks_like_read_command(command):
741854
             if _contains_any(text, _READ_STEP_HINTS):
742855
                 score += 2
856
+        elif _looks_like_fs_mutation_command(command):
857
+            if _contains_any(text, _MUTATION_STEP_HINTS):
858
+                score += 3
859
+            if "directory" in text and "mkdir" in command:
860
+                score += 2
743861
     elif name in {"write", "edit", "patch"}:
862
+        if _todo_describes_aggregate_mutation(text) and basename and basename not in text:
863
+            return 0
744864
         if _contains_any(text, _MUTATION_STEP_HINTS):
745865
             score += 3
746866
 
@@ -753,6 +873,13 @@ def _contains_any(text: str, candidates: tuple[str, ...]) -> bool:
753873
     return any(candidate in text for candidate in candidates)
754874
 
755875
 
876
+def _todo_describes_aggregate_mutation(text: str) -> bool:
877
+    return _contains_any(text, _AGGREGATE_TODO_HINTS) and _contains_any(
878
+        text,
879
+        _MUTATION_STEP_HINTS,
880
+    )
881
+
882
+
756883
 def _looks_like_search_command(command: str) -> bool:
757884
     return any(token in command for token in (" ls", "ls ", "find ", "rg ", "grep ", "glob "))
758885
 
@@ -781,6 +908,27 @@ def _looks_like_verification_command(command: str) -> bool:
781908
     )
782909
 
783910
 
911
+def _looks_like_fs_mutation_command(command: str) -> bool:
912
+    stripped = command.strip()
913
+    return any(
914
+        stripped.startswith(prefix)
915
+        for prefix in (
916
+            "mkdir ",
917
+            "mkdir\t",
918
+            "touch ",
919
+            "touch\t",
920
+            "cp ",
921
+            "cp\t",
922
+            "mv ",
923
+            "mv\t",
924
+            "ln ",
925
+            "ln\t",
926
+            "install ",
927
+            "install\t",
928
+        )
929
+    )
930
+
931
+
784932
 def extract_verification_commands_from_markdown(markdown: str) -> list[str]:
785933
     """Extract verification commands from a verification-plan markdown document."""
786934
 
@@ -1057,6 +1205,27 @@ def _requirement_describes_output_scope(requirement: str) -> bool:
10571205
     )
10581206
 
10591207
 
1208
+def _looks_actionable_refresh_step(step: str) -> bool:
1209
+    normalized = step.strip()
1210
+    if not normalized:
1211
+        return False
1212
+    if re.fullmatch(r"(?:[\w.-]+/)*[\w.-]+\.[A-Za-z0-9]+", normalized):
1213
+        return False
1214
+
1215
+    lowered = normalized.lower()
1216
+    lowered = re.sub(r"^(?:first|next|then|finally|afterward|afterwards)\b[,:]?\s*", "", lowered)
1217
+    first_word_match = re.match(r"^[a-z-]+", lowered)
1218
+    if first_word_match is None:
1219
+        return False
1220
+
1221
+    first_word = first_word_match.group(0)
1222
+    if first_word in _RETROSPECTIVE_STEP_VERBS:
1223
+        return False
1224
+    if first_word in _ACTIONABLE_STEP_VERBS:
1225
+        return True
1226
+    return False
1227
+
1228
+
10601229
 def _mark_explicit_section(brief: ClarifyBrief, section: str) -> None:
10611230
     if section in brief.explicit_sections:
10621231
         return
src/loader/runtime/workflow_lanes.py (modified)
@@ -208,6 +208,10 @@ class WorkflowLaneRunner:
208208
                 refreshed_acceptance_criteria=list(artifacts.acceptance_criteria),
209209
             )
210210
             artifacts = artifacts.with_acceptance_criteria(preserved_acceptance)
211
+            artifacts = artifacts.with_progress_context(
212
+                touched_files=list(dod.touched_files),
213
+                completed_items=list(dod.completed_items),
214
+            )
211215
         implementation_path, verification_path = self.artifact_store.write_plan(
212216
             task,
213217
             artifacts,
@@ -610,6 +614,41 @@ class WorkflowLaneRunner:
610614
 
611615
         refresh_block = ""
612616
         if refresh_reasons:
617
+            progress_lines: list[str] = []
618
+            touched = [str(path).strip() for path in dod.touched_files if str(path).strip()]
619
+            completed = [
620
+                item.strip()
621
+                for item in dod.completed_items
622
+                if item.strip()
623
+                and item not in {"Complete the requested work", "Collect verification evidence"}
624
+            ]
625
+            pending = [
626
+                item.strip()
627
+                for item in dod.pending_items
628
+                if item.strip()
629
+                and item not in {"Complete the requested work", "Collect verification evidence"}
630
+            ]
631
+            if touched:
632
+                progress_lines.extend(
633
+                    [
634
+                        "Already touched during execution:",
635
+                        *[f"- {item}" for item in touched[:12]],
636
+                    ]
637
+                )
638
+            if completed:
639
+                progress_lines.extend(
640
+                    [
641
+                        "Already completed work:",
642
+                        *[f"- {item}" for item in completed[:12]],
643
+                    ]
644
+                )
645
+            if pending:
646
+                progress_lines.extend(
647
+                    [
648
+                        "Still pending:",
649
+                        *[f"- {item}" for item in pending[:12]],
650
+                    ]
651
+                )
613652
             refresh_block = (
614653
                 "Refresh the existing planning artifacts instead of creating a fresh plan "
615654
                 "from scratch.\n"
@@ -619,6 +658,11 @@ class WorkflowLaneRunner:
619658
                 "artifact.\n"
620659
                 "Use the current task state and these recovery reasons:\n"
621660
                 + "\n".join(f"- {item}" for item in refresh_reasons)
661
+                + (
662
+                    ("\n\nCurrent execution progress:\n" + "\n".join(progress_lines))
663
+                    if progress_lines
664
+                    else ""
665
+                )
622666
                 + "\n\n"
623667
             )
624668
 
src/loader/runtime/workflow_recovery.py (modified)
@@ -29,6 +29,10 @@ UserQuestionHandler = Callable[[str, list[str] | None], Awaitable[str]] | None
2929
 WorkflowModeSetter = Callable[..., Awaitable[None]]
3030
 TimelineAppender = Callable[..., None]
3131
 BridgeAppender = Callable[[DefinitionOfDone], None]
32
+_RECOVERY_TODO_EXCLUDED_ITEMS = {
33
+    "Complete the requested work",
34
+    "Collect verification evidence",
35
+}
3236
 
3337
 
3438
 class WorkflowRecoveryController:
@@ -186,6 +190,21 @@ class WorkflowRecoveryController:
186190
             summary=summary,
187191
         )
188192
         self.append_execute_bridge(dod)
193
+        next_pending = next(
194
+            (
195
+                item
196
+                for item in dod.pending_items
197
+                if item not in _RECOVERY_TODO_EXCLUDED_ITEMS
198
+            ),
199
+            None,
200
+        )
201
+        if next_pending:
202
+            self.context.queue_steering_message(
203
+                "Plan refresh preserved the progress already made. "
204
+                f"Reuse the existing files and confirmed facts, then continue with the next "
205
+                f"pending item: `{next_pending}`. "
206
+                "Do not restart from initial discovery unless a specific missing fact blocks that step."
207
+            )
189208
         return True
190209
 
191210
     async def _run_clarify_reentry_for_drift(
tests/test_artifact_invalidation.py (modified)
@@ -67,3 +67,28 @@ def test_artifact_invalidation_can_force_full_replan_when_brief_and_plan_drift()
6767
         for item in freshness.evidence
6868
     )
6969
     assert freshness.evidence_summary
70
+
71
+
72
+def test_artifact_invalidation_treats_path_separator_variants_as_same_touchpoint() -> None:
73
+    assessor = ArtifactInvalidationAssessor()
74
+
75
+    freshness = assessor.assess(
76
+        task_statement="Build a multi-file nginx guide.",
77
+        clarify_text=None,
78
+        implementation_text=(
79
+            "# Implementation Plan\n"
80
+            "- Create 01-getting-started.html in the chapters directory.\n"
81
+        ),
82
+        verification_text=(
83
+            "# Verification Plan\n"
84
+            "## Acceptance Criteria\n"
85
+            "- 01-getting-started.html exists.\n"
86
+        ),
87
+        acceptance_criteria=["01-getting-started.html exists."],
88
+        touched_files=["/tmp/chapters/01_getting_started.html"],
89
+        last_verification_result=None,
90
+    )
91
+
92
+    assert freshness.stale_plan is False
93
+    assert freshness.stale_brief is False
94
+    assert "touched_files_outside_plan" not in freshness.reason_codes
tests/test_dod.py (modified)
@@ -143,6 +143,29 @@ def test_derive_verification_commands_adds_semantic_html_toc_check(tmp_path: Pat
143143
     assert not any(command == f"test -f {index}" for command in commands)
144144
 
145145
 
146
+def test_derive_verification_commands_avoids_repo_defaults_for_external_artifacts(
147
+    tmp_path: Path,
148
+) -> None:
149
+    (tmp_path / "pyproject.toml").write_text("[project]\nname='loader'\n")
150
+    (tmp_path / "package.json").write_text("{}\n")
151
+    external_root = tmp_path.parent / "external-guide"
152
+    external_root.mkdir(exist_ok=True)
153
+    external_index = external_root / "index.html"
154
+    external_index.write_text("<html></html>\n")
155
+
156
+    dod = create_definition_of_done("Create an external nginx guide.")
157
+    dod.task_size = "standard"
158
+    dod.touched_files = [str(external_index)]
159
+
160
+    commands = derive_verification_commands(
161
+        dod,
162
+        project_root=tmp_path,
163
+        task_statement=dod.task_statement,
164
+    )
165
+
166
+    assert commands == [f"test -f {external_index}"]
167
+
168
+
146169
 def test_build_verification_summary_keeps_concrete_missing_link_details() -> None:
147170
     summary = build_verification_summary(
148171
         [
tests/test_finalization.py (modified)
@@ -434,6 +434,53 @@ async def test_turn_finalizer_appends_runtime_semantic_verifier_to_planned_comma
434434
     )
435435
 
436436
 
437
+@pytest.mark.asyncio
438
+async def test_turn_finalizer_does_not_append_repo_defaults_to_external_verification_plan(
439
+    temp_dir: Path,
440
+) -> None:
441
+    (temp_dir / "pyproject.toml").write_text("[project]\nname='loader'\n")
442
+    (temp_dir / "package.json").write_text("{}\n")
443
+    external_root = temp_dir.parent / "external-nginx-guide"
444
+    external_root.mkdir(exist_ok=True)
445
+    external_index = external_root / "index.html"
446
+    external_index.write_text("<html></html>\n")
447
+
448
+    session = FakeSession()
449
+    context = build_context(temp_dir, session)
450
+    finalizer = TurnFinalizer(
451
+        context,
452
+        RuntimeTracer(),
453
+        DefinitionOfDoneStore(temp_dir),
454
+        set_workflow_mode=_noop_set_workflow_mode,
455
+    )
456
+    dod = create_definition_of_done("Create an external nginx guide.")
457
+    dod.mutating_actions.append("write")
458
+    dod.touched_files.append(str(external_index))
459
+    dod.verification_commands = [
460
+        f"ls -la {external_root}",
461
+        f"grep -n \"html\" {external_index}",
462
+    ]
463
+    summary = TurnSummary(final_response="")
464
+    executor = RecordingExecutor()
465
+
466
+    async def capture(event) -> None:
467
+        return None
468
+
469
+    result = await finalizer.run_definition_of_done_gate(
470
+        dod=dod,
471
+        candidate_response="Created the external nginx guide.",
472
+        emit=capture,
473
+        summary=summary,
474
+        executor=executor,  # type: ignore[arg-type]
475
+    )
476
+
477
+    assert result.should_continue is False
478
+    assert executor.commands == [
479
+        f"ls -la {external_root}",
480
+        f'grep -n "html" {external_index}',
481
+    ]
482
+
483
+
437484
 @pytest.mark.asyncio
438485
 async def test_turn_finalizer_records_missing_verification_observation(
439486
     temp_dir: Path,
tests/test_tool_batches.py (modified)
@@ -1041,6 +1041,10 @@ async def test_tool_batch_runner_queues_next_pending_todo_after_discovery_progre
10411041
         in message
10421042
         for message in queued_messages
10431043
     )
1044
+    assert any(
1045
+        "stop gathering more reference material and perform the change now" in message
1046
+        for message in queued_messages
1047
+    )
10441048
 
10451049
 
10461050
 @pytest.mark.asyncio
@@ -1161,6 +1165,97 @@ async def test_tool_batch_runner_duplicate_reference_read_prefers_next_pending_t
11611165
     assert "Update `" not in queued_messages[0]
11621166
 
11631167
 
1168
+@pytest.mark.asyncio
1169
+async def test_tool_batch_runner_observation_handoff_pushes_mutation_step(
1170
+    temp_dir: Path,
1171
+) -> None:
1172
+    async def assess_confidence(
1173
+        tool_name: str,
1174
+        tool_args: dict,
1175
+        context: str,
1176
+    ) -> ConfidenceAssessment:
1177
+        raise AssertionError("Confidence scoring should be disabled in this scenario")
1178
+
1179
+    async def verify_action(
1180
+        tool_name: str,
1181
+        tool_args: dict,
1182
+        result: str,
1183
+        expected: str = "",
1184
+    ) -> ActionVerification:
1185
+        raise AssertionError("Verification should not run for this scenario")
1186
+
1187
+    reference = temp_dir / "fortran" / "index.html"
1188
+    reference.parent.mkdir(parents=True)
1189
+    reference.write_text("<h1>Fortran Beginner's Guide</h1>\n")
1190
+
1191
+    context = build_context(
1192
+        temp_dir=temp_dir,
1193
+        messages=[],
1194
+        safeguards=FakeSafeguards(),
1195
+        assess_confidence=assess_confidence,
1196
+        verify_action=verify_action,
1197
+        auto_recover=False,
1198
+    )
1199
+    queued_messages: list[str] = []
1200
+    context.queue_steering_message_callback = queued_messages.append
1201
+    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))
1202
+    dod = create_definition_of_done("Create a multi-file nginx guide.")
1203
+    sync_todos_to_definition_of_done(
1204
+        dod,
1205
+        [
1206
+            {
1207
+                "content": "Examine the existing Fortran guide structure to understand the cadence and format",
1208
+                "active_form": "Working on: Examine the existing Fortran guide structure to understand the cadence and format",
1209
+                "status": "pending",
1210
+            },
1211
+            {
1212
+                "content": "Create the nginx index.html file",
1213
+                "active_form": "Working on: Create the nginx index.html file",
1214
+                "status": "pending",
1215
+            },
1216
+        ],
1217
+    )
1218
+    tool_call = ToolCall(
1219
+        id="read-reference",
1220
+        name="read",
1221
+        arguments={"file_path": str(reference)},
1222
+    )
1223
+    executor = FakeExecutor(
1224
+        [
1225
+            tool_outcome(
1226
+                tool_call=tool_call,
1227
+                output="<h1>Fortran Beginner's Guide</h1>\n",
1228
+                is_error=False,
1229
+            )
1230
+        ]
1231
+    )
1232
+
1233
+    summary = TurnSummary(final_response="")
1234
+    await runner.execute_batch(
1235
+        tool_calls=[tool_call],
1236
+        tool_source="assistant",
1237
+        pending_tool_calls_seen=set(),
1238
+        emit=_noop_emit,
1239
+        summary=summary,
1240
+        dod=dod,
1241
+        executor=executor,  # type: ignore[arg-type]
1242
+        on_confirmation=None,
1243
+        on_user_question=None,
1244
+        emit_confirmation=None,
1245
+        consecutive_errors=0,
1246
+    )
1247
+
1248
+    assert any(
1249
+        "Continue with the next pending item: `Create the nginx index.html file`"
1250
+        in message
1251
+        for message in queued_messages
1252
+    )
1253
+    assert any(
1254
+        "stop gathering more reference material and perform the change now" in message
1255
+        for message in queued_messages
1256
+    )
1257
+
1258
+
11641259
 @pytest.mark.asyncio
11651260
 async def test_tool_batch_runner_hands_off_noop_toc_edit_when_file_is_already_valid(
11661261
     temp_dir: Path,
tests/test_workflow.py (modified)
@@ -345,6 +345,47 @@ def test_planning_artifacts_with_acceptance_criteria_rewrites_verification_markd
345345
     )
346346
 
347347
 
348
+def test_planning_artifacts_with_progress_context_records_touched_and_completed_work() -> None:
349
+    artifacts = PlanningArtifacts.from_model_output(
350
+        "\n".join(
351
+            [
352
+                "# Implementation Plan",
353
+                "",
354
+                "## Execution Order",
355
+                "1. Create the guide files.",
356
+                "",
357
+                "<<<VERIFICATION>>>",
358
+                "",
359
+                "# Verification Plan",
360
+                "",
361
+                "## Acceptance Criteria",
362
+                "- At least one chapter file exists.",
363
+                "",
364
+                "## Verification Commands",
365
+                "- `find chapters -name \"*.html\" | wc -l`",
366
+            ]
367
+        ),
368
+        task_statement="Create a thorough nginx guide.",
369
+    )
370
+
371
+    updated = artifacts.with_progress_context(
372
+        touched_files=["/tmp/nginx/index.html"],
373
+        completed_items=[
374
+            "Create the guide scaffold",
375
+            "Collect verification evidence",
376
+        ],
377
+    )
378
+
379
+    assert "## Confirmed Progress" in updated.implementation_markdown
380
+    assert "Already touched during execution: `/tmp/nginx/index.html`." in (
381
+        updated.implementation_markdown
382
+    )
383
+    assert "Already completed during execution: Create the guide scaffold." in (
384
+        updated.implementation_markdown
385
+    )
386
+    assert "Collect verification evidence" not in updated.implementation_markdown
387
+
388
+
348389
 def test_merge_refreshed_todos_with_existing_scope_keeps_grounded_progress() -> None:
349390
     task = (
350391
         "Create an equally thorough nginx guide with index.html plus chapter files "
@@ -371,6 +412,48 @@ def test_merge_refreshed_todos_with_existing_scope_keeps_grounded_progress() ->
371412
     )
372413
 
373414
 
415
+def test_merge_refreshed_todos_with_existing_scope_filters_retro_refresh_noise() -> None:
416
+    task = (
417
+        "Create an equally thorough nginx guide with index.html plus chapter files "
418
+        "covering getting started, installation, first website setup, configs, and "
419
+        "advanced topics."
420
+    )
421
+
422
+    todos = merge_refreshed_todos_with_existing_scope(
423
+        task,
424
+        existing_pending_items=[
425
+            "Create each chapter file in sequence, following the same structure as the Fortran guide",
426
+            "Ensure all files are properly linked and formatted consistently",
427
+        ],
428
+        existing_completed_items=[
429
+            "First, examine the existing Fortran guide structure to understand the format and cadence",
430
+            "Create the directory structure for the new nginx guide",
431
+            "Create the main index.html file",
432
+        ],
433
+        refreshed_steps=[
434
+            "First examined the existing Fortran guide structure to understand format and cadence",
435
+            "Created the main index.html file with navigation",
436
+            "Created chapter files in sequence:",
437
+            "01-getting-started.html",
438
+            "02-installation.html",
439
+            "03-first-website.html",
440
+            "04-configuring.html",
441
+            "All files properly linked with navigation between chapters",
442
+            "Verify the final navigation links across the guide",
443
+        ],
444
+    )
445
+
446
+    labels = {item["content"]: item["status"] for item in todos}
447
+    assert (
448
+        labels["Create each chapter file in sequence, following the same structure as the Fortran guide"]
449
+        == "pending"
450
+    )
451
+    assert labels["Ensure all files are properly linked and formatted consistently"] == "pending"
452
+    assert labels["Verify the final navigation links across the guide"] == "pending"
453
+    assert "Created chapter files in sequence:" not in labels
454
+    assert "04-configuring.html" not in labels
455
+
456
+
374457
 def test_workflow_artifact_store_and_bridge_round_trip(tmp_path: Path) -> None:
375458
     store = WorkflowArtifactStore(tmp_path)
376459
     brief = ClarifyBrief.fallback(
@@ -528,3 +611,71 @@ def test_advance_todos_from_tool_call_tracks_plan_progress() -> None:
528611
         ),
529612
     )
530613
     assert "Verify the updated index.html file is properly formatted" in dod.completed_items
614
+
615
+
616
+def test_advance_todos_from_tool_call_keeps_aggregate_mutation_steps_pending() -> None:
617
+    dod = create_definition_of_done("Create a multi-file nginx guide.")
618
+    sync_todos_to_definition_of_done(
619
+        dod,
620
+        [
621
+            {
622
+                "content": "Create each chapter file in sequence, following the same structure as the Fortran guide",
623
+                "active_form": "Working on: Create each chapter file in sequence, following the same structure as the Fortran guide",
624
+                "status": "pending",
625
+            },
626
+            {
627
+                "content": "Ensure all files are properly linked and formatted consistently",
628
+                "active_form": "Working on: Ensure all files are properly linked and formatted consistently",
629
+                "status": "pending",
630
+            },
631
+        ],
632
+    )
633
+
634
+    assert (
635
+        advance_todos_from_tool_call(
636
+            dod,
637
+            ToolCall(
638
+                id="write-one-chapter",
639
+                name="write",
640
+                arguments={
641
+                    "file_path": "/tmp/nginx/chapters/01-getting-started.html",
642
+                    "content": "<html></html>",
643
+                },
644
+            ),
645
+        )
646
+        is False
647
+    )
648
+    assert (
649
+        "Create each chapter file in sequence, following the same structure as the Fortran guide"
650
+        in dod.pending_items
651
+    )
652
+
653
+
654
+def test_advance_todos_from_tool_call_tracks_bash_directory_creation_progress() -> None:
655
+    dod = create_definition_of_done("Create a multi-file nginx guide.")
656
+    sync_todos_to_definition_of_done(
657
+        dod,
658
+        [
659
+            {
660
+                "content": "Create the nginx directory structure",
661
+                "active_form": "Working on: Create the nginx directory structure",
662
+                "status": "pending",
663
+            },
664
+            {
665
+                "content": "Create index.html for nginx guide",
666
+                "active_form": "Working on: Create index.html for nginx guide",
667
+                "status": "pending",
668
+            },
669
+        ],
670
+    )
671
+
672
+    assert advance_todos_from_tool_call(
673
+        dod,
674
+        ToolCall(
675
+            id="mkdir-nginx",
676
+            name="bash",
677
+            arguments={"command": "mkdir -p ~/Loader/guides/nginx/chapters"},
678
+        ),
679
+    )
680
+    assert "Create the nginx directory structure" in dod.completed_items
681
+    assert "Create index.html for nginx guide" in dod.pending_items
tests/test_workflow_runtime.py — modified
@@ -1438,6 +1438,21 @@ async def test_stale_plan_artifacts_trigger_targeted_plan_refresh(
14381438
         entry.reason_code == "plan_refresh_completed"
14391439
         for entry in run.agent.last_turn_summary.workflow_timeline
14401440
     )
1441
+    refresh_prompt = next(
1442
+        invocation.messages[-1].content
1443
+        for invocation in backend.invocations
1444
+        if "Refresh the existing planning artifacts instead of creating a fresh plan from scratch."
1445
+        in invocation.messages[-1].content
1446
+    )
1447
+    assert "Current execution progress:" in refresh_prompt
1448
+    assert "Already touched during execution:" in refresh_prompt
1449
+    assert f"- {target}" in refresh_prompt
1450
+    assert any(
1451
+        "Plan refresh preserved the progress already made." in message.content
1452
+        and "Do not restart from initial discovery" in message.content
1453
+        for invocation in backend.invocations
1454
+        for message in invocation.messages
1455
+    )
14411456
 
14421457
 
14431458
 @pytest.mark.asyncio