tenseleyflow/loader / 297e213

Browse files

Strengthen qwen progress and verification handoffs

Authored by espadonne
SHA
297e213d29e61587549386472439e606500e8b69
Parents
918941f
Tree
7288310

13 changed files

Status | File | + | -
M src/loader/runtime/artifact_invalidation.py 40 3
M src/loader/runtime/dod.py 28 0
M src/loader/runtime/finalization.py 1 0
M src/loader/runtime/tool_batches.py 34 1
M src/loader/runtime/workflow.py 170 1
M src/loader/runtime/workflow_lanes.py 44 0
M src/loader/runtime/workflow_recovery.py 19 0
M tests/test_artifact_invalidation.py 25 0
M tests/test_dod.py 23 0
M tests/test_finalization.py 47 0
M tests/test_tool_batches.py 95 0
M tests/test_workflow.py 151 0
M tests/test_workflow_runtime.py 15 0
src/loader/runtime/artifact_invalidation.py (modified)
@@ -49,13 +49,18 @@ class ArtifactInvalidationAssessor:
49
         unexpected_paths = [
49
         unexpected_paths = [
50
             name
50
             name
51
             for path in touched_files
51
             for path in touched_files
52
-            if (name := _path_name(path)) and name.lower() not in plan_text
52
+            if (name := _path_name(path)) and not _text_covers_path_reference(plan_text, path)
53
         ]
53
         ]
54
         confirmed_touchpoints = [
54
         confirmed_touchpoints = [
55
             name
55
             name
56
             for path in touched_files
56
             for path in touched_files
57
             if (name := _path_name(path))
57
             if (name := _path_name(path))
58
         ]
58
         ]
59
+        confirmed_touchpoint_keys = {
60
+            _path_reference_identity(path)
61
+            for path in touched_files
62
+            if _path_reference_identity(path)
63
+        }
59
         inferred_touchpoints = [
64
         inferred_touchpoints = [
60
             item
65
             item
61
             for item in _extract_path_mentions(
66
             for item in _extract_path_mentions(
@@ -63,7 +68,7 @@ class ArtifactInvalidationAssessor:
63
                 implementation_text,
68
                 implementation_text,
64
                 verification_text,
69
                 verification_text,
65
             )
70
             )
66
-            if _path_name(item) not in confirmed_touchpoints
71
+            if _path_reference_identity(item) not in confirmed_touchpoint_keys
67
         ]
72
         ]
68
         stale_plan = False
73
         stale_plan = False
69
         stale_brief = False
74
         stale_brief = False
@@ -147,7 +152,11 @@ class ArtifactInvalidationAssessor:
147
                     )
152
                     )
148
 
153
 
149
             out_of_brief_paths = [
154
             out_of_brief_paths = [
150
-                name for name in unexpected_paths if name.lower() not in brief_text
155
+                name
156
+                for path in touched_files
157
+                if (name := _path_name(path))
158
+                and name in unexpected_paths
159
+                and not _text_covers_path_reference(brief_text, path)
151
             ]
160
             ]
152
             if out_of_brief_paths:
161
             if out_of_brief_paths:
153
                 stale_brief = True
162
                 stale_brief = True
@@ -200,6 +209,34 @@ def _path_name(path: str) -> str:
200
     return normalized.rsplit("/", maxsplit=1)[-1].strip()
209
     return normalized.rsplit("/", maxsplit=1)[-1].strip()
201
 
210
 
202
 
211
 
212
def _path_reference_identity(path: str) -> str:
    """Return a separator-insensitive identity key for the file name of *path*.

    The key is the canonical form of the path's final component; an empty
    string is returned when the path yields no usable name.
    """
    basename = _path_name(path)
    return _canonical_path_reference(basename) if basename else ""
217
+
218
+
219
def _text_covers_path_reference(text: str, path: str) -> bool:
    """Report whether *text* mentions *path* or its file name.

    Two passes are made over the same candidates (the stripped full path and
    the bare file name):

    1. a case-insensitive literal substring check, then
    2. a substring check on the canonical forms, so that punctuation and
       separator variants of the same name still count as a mention.
    """
    haystack = text.lower()
    references = tuple(
        reference
        for reference in (str(path).strip(), _path_name(path))
        if reference
    )

    if any(reference.lower() in haystack for reference in references):
        return True

    canonical_haystack = _canonical_path_reference(text)
    for reference in references:
        key = _canonical_path_reference(reference)
        # An empty key would be a substring of everything; never accept it.
        if key and key in canonical_haystack:
            return True
    return False
232
+
233
+
234
+def _canonical_path_reference(value: str) -> str:
235
+    normalized = value.lower().strip()
236
+    normalized = re.sub(r"[^a-z0-9]+", " ", normalized)
237
+    return " ".join(normalized.split())
238
+
239
+
203
 def _text_covers_requirement(text: str, requirement: str) -> bool:
240
 def _text_covers_requirement(text: str, requirement: str) -> bool:
204
     normalized_text = text.lower()
241
     normalized_text = text.lower()
205
     normalized_requirement = requirement.lower()
242
     normalized_requirement = requirement.lower()
src/loader/runtime/dod.py (modified)
@@ -208,6 +208,7 @@ def derive_verification_commands(
208
     *,
208
     *,
209
     project_root: Path,
209
     project_root: Path,
210
     task_statement: str,
210
     task_statement: str,
211
+    supplement_existing: bool = False,
211
 ) -> list[str]:
212
 ) -> list[str]:
212
     """Generate verification commands from execution history and project shape."""
213
     """Generate verification commands from execution history and project shape."""
213
 
214
 
@@ -234,6 +235,8 @@ def derive_verification_commands(
234
 
235
 
235
     if commands:
236
     if commands:
236
         return commands
237
         return commands
238
+    if supplement_existing:
239
+        return commands
237
 
240
 
238
     if dod.task_size == "small":
241
     if dod.task_size == "small":
239
         for path_str in dod.touched_files[:3]:
242
         for path_str in dod.touched_files[:3]:
@@ -245,6 +248,11 @@ def derive_verification_commands(
245
                     commands,
248
                     commands,
246
                     f"python -m py_compile {shlex.quote(str(effective_path))}",
249
                     f"python -m py_compile {shlex.quote(str(effective_path))}",
247
                 )
250
                 )
251
+    elif _uses_external_artifacts_only(dod, project_root=project_root):
252
+        for path_str in dod.touched_files[:3]:
253
+            path = Path(path_str)
254
+            effective_path = path if path.is_absolute() else (project_root / path)
255
+            _append_unique(commands, f"test -f {shlex.quote(str(effective_path))}")
248
     else:
256
     else:
249
         if (project_root / "pyproject.toml").exists():
257
         if (project_root / "pyproject.toml").exists():
250
             _append_unique(commands, "uv run pytest -q")
258
             _append_unique(commands, "uv run pytest -q")
@@ -407,6 +415,26 @@ def _append_unique(items: list[str], value: str) -> None:
407
         items.append(value)
415
         items.append(value)
408
 
416
 
409
 
417
 
418
+def _uses_external_artifacts_only(dod: DefinitionOfDone, *, project_root: Path) -> bool:
419
+    touched = [Path(path) for path in dod.touched_files if str(path).strip()]
420
+    if not touched:
421
+        return False
422
+    try:
423
+        root = project_root.resolve()
424
+    except FileNotFoundError:
425
+        root = project_root
426
+    external = [path for path in touched if not _path_is_within_root(path, root)]
427
+    return bool(external) and len(external) == len(touched)
428
+
429
+
430
+def _path_is_within_root(path: Path, root: Path) -> bool:
431
+    try:
432
+        path.resolve().relative_to(root)
433
+        return True
434
+    except ValueError:
435
+        return False
436
+
437
+
410
 def synthesize_todo_items(dod: DefinitionOfDone) -> list[dict[str, str]]:
438
 def synthesize_todo_items(dod: DefinitionOfDone) -> list[dict[str, str]]:
411
     """Build a todo item list from the current DoD state.
439
     """Build a todo item list from the current DoD state.
412
 
440
 
src/loader/runtime/finalization.py (modified)
@@ -294,6 +294,7 @@ class TurnFinalizer:
294
                 dod,
294
                 dod,
295
                 project_root=self.context.project_root,
295
                 project_root=self.context.project_root,
296
                 task_statement=dod.task_statement,
296
                 task_statement=dod.task_statement,
297
+                supplement_existing=True,
297
             ):
298
             ):
298
                 if command not in dod.verification_commands:
299
                 if command not in dod.verification_commands:
299
                     dod.verification_commands.append(command)
300
                     dod.verification_commands.append(command)
src/loader/runtime/tool_batches.py (modified)
@@ -46,6 +46,19 @@ _TODO_NUDGE_EXCLUDED_ITEMS = {
46
     "Complete the requested work",
46
     "Complete the requested work",
47
     _VERIFY_ITEM,
47
     _VERIFY_ITEM,
48
 }
48
 }
49
+_MUTATION_TODO_HINTS = (
50
+    "create",
51
+    "update",
52
+    "edit",
53
+    "write",
54
+    "fix",
55
+    "modify",
56
+    "change",
57
+    "patch",
58
+    "replace",
59
+    "correct",
60
+    "rewrite",
61
+)
49
 
62
 
50
 
63
 
51
 @dataclass
64
 @dataclass
@@ -290,18 +303,26 @@ class ToolBatchRunner:
290
             max_items=2,
303
             max_items=2,
291
         )
304
         )
292
         if next_pending and not html_toc_rule.task_targets_html_toc(current_task):
305
         if next_pending and not html_toc_rule.task_targets_html_toc(current_task):
306
+            mutation_suffix = ""
307
+            if _todo_is_mutation_step(next_pending):
308
+                mutation_suffix = (
309
+                    " You already have enough evidence for that step, so stop gathering "
310
+                    "more reference material and perform the change now."
311
+                )
293
             if confirmed_facts:
312
             if confirmed_facts:
294
                 self.context.queue_steering_message(
313
                 self.context.queue_steering_message(
295
                     "Reuse the earlier observation instead of repeating it. "
314
                     "Reuse the earlier observation instead of repeating it. "
296
                     f"Confirmed facts: {confirmed_facts}. "
315
                     f"Confirmed facts: {confirmed_facts}. "
297
                     f"Continue with the next pending item: `{next_pending}`. "
316
                     f"Continue with the next pending item: `{next_pending}`. "
298
                     "Only gather more evidence if a specific fact required for that step is still unknown."
317
                     "Only gather more evidence if a specific fact required for that step is still unknown."
318
+                    + mutation_suffix
299
                 )
319
                 )
300
             else:
320
             else:
301
                 self.context.queue_steering_message(
321
                 self.context.queue_steering_message(
302
                     "Reuse the earlier observation instead of repeating it. "
322
                     "Reuse the earlier observation instead of repeating it. "
303
                     f"Continue with the next pending item: `{next_pending}`. "
323
                     f"Continue with the next pending item: `{next_pending}`. "
304
                     "Only gather more evidence if a specific fact required for that step is still unknown."
324
                     "Only gather more evidence if a specific fact required for that step is still unknown."
325
+                    + mutation_suffix
305
                 )
326
                 )
306
             return
327
             return
307
 
328
 
@@ -752,10 +773,17 @@ class ToolBatchRunner:
752
         if not completed_label or not next_pending or next_pending == completed_label:
773
         if not completed_label or not next_pending or next_pending == completed_label:
753
             return
774
             return
754
 
775
 
776
+        mutation_suffix = ""
777
+        if _todo_is_mutation_step(next_pending):
778
+            mutation_suffix = (
779
+                " You already have enough evidence for that step, so stop gathering "
780
+                "more reference material and perform the change now."
781
+            )
782
+
755
         self.context.queue_steering_message(
783
         self.context.queue_steering_message(
756
             f"Confirmed progress: `{completed_label}` is now satisfied by the successful "
784
             f"Confirmed progress: `{completed_label}` is now satisfied by the successful "
757
             f"`{tool_call.name}` result. Continue with the next pending item: "
785
             f"`{tool_call.name}` result. Continue with the next pending item: "
758
-            f"`{next_pending}` instead of rereading the same evidence."
786
+            f"`{next_pending}` instead of rereading the same evidence.{mutation_suffix}"
759
         )
787
         )
760
 
788
 
761
 
789
 
@@ -795,6 +823,11 @@ def _mark_verification_stale(
795
         dod.pending_items.append(_VERIFY_ITEM)
823
         dod.pending_items.append(_VERIFY_ITEM)
796
 
824
 
797
 
825
 
826
def _todo_is_mutation_step(label: str) -> bool:
    """Report whether a todo label contains any mutation hint.

    Matching is case-insensitive and purely substring based: a hint counts
    wherever it occurs inside the label, including inside a longer word.
    """
    haystack = label.lower()
    for hint in _MUTATION_TODO_HINTS:
        if hint in haystack:
            return True
    return False
829
+
830
+
798
 def _mark_verification_planned(
831
 def _mark_verification_planned(
799
     *,
832
     *,
800
     context: RuntimeContext,
833
     context: RuntimeContext,
src/loader/runtime/workflow.py (modified)
@@ -125,6 +125,75 @@ _VERIFY_STEP_HINTS = (
125
     "confirm",
125
     "confirm",
126
     "check",
126
     "check",
127
 )
127
 )
128
+_AGGREGATE_TODO_HINTS = (
129
+    "each ",
130
+    "all ",
131
+    "every ",
132
+    "sequence",
133
+    "multiple ",
134
+    "across ",
135
+    "consistently",
136
+    "properly linked",
137
+    "directory structure",
138
+)
139
+_ACTIONABLE_STEP_VERBS = {
140
+    "add",
141
+    "apply",
142
+    "build",
143
+    "check",
144
+    "confirm",
145
+    "create",
146
+    "document",
147
+    "edit",
148
+    "ensure",
149
+    "fix",
150
+    "implement",
151
+    "inspect",
152
+    "list",
153
+    "move",
154
+    "parse",
155
+    "patch",
156
+    "read",
157
+    "refactor",
158
+    "remove",
159
+    "rename",
160
+    "reorder",
161
+    "rerun",
162
+    "re-run",
163
+    "review",
164
+    "run",
165
+    "search",
166
+    "test",
167
+    "update",
168
+    "validate",
169
+    "verify",
170
+    "write",
171
+}
172
+_RETROSPECTIVE_STEP_VERBS = {
173
+    "added",
174
+    "applied",
175
+    "built",
176
+    "checked",
177
+    "completed",
178
+    "confirmed",
179
+    "created",
180
+    "edited",
181
+    "ensured",
182
+    "examined",
183
+    "generated",
184
+    "implemented",
185
+    "inspected",
186
+    "listed",
187
+    "looked",
188
+    "parsed",
189
+    "patched",
190
+    "read",
191
+    "reviewed",
192
+    "updated",
193
+    "validated",
194
+    "verified",
195
+    "wrote",
196
+}
128
 _TASK_COVERAGE_STOP_WORDS = {
197
 _TASK_COVERAGE_STOP_WORDS = {
129
     "the",
198
     "the",
130
     "and",
199
     "and",
@@ -491,6 +560,41 @@ class PlanningArtifacts:
491
             implementation_steps=list(self.implementation_steps),
560
             implementation_steps=list(self.implementation_steps),
492
         )
561
         )
493
 
562
 
563
+    def with_progress_context(
564
+        self,
565
+        *,
566
+        touched_files: list[str],
567
+        completed_items: list[str],
568
+    ) -> PlanningArtifacts:
569
+        """Return one copy that preserves already-confirmed execution progress."""
570
+
571
+        progress_items: list[str] = []
572
+        for raw_path in touched_files:
573
+            path_text = str(raw_path).strip()
574
+            if not path_text:
575
+                continue
576
+            progress_items.append(f"Already touched during execution: `{path_text}`.")
577
+        for raw_item in completed_items:
578
+            item = str(raw_item).strip()
579
+            if not item or item in _SPECIAL_TODO_ITEMS:
580
+                continue
581
+            progress_items.append(f"Already completed during execution: {item}.")
582
+
583
+        if not progress_items:
584
+            return self
585
+
586
+        return PlanningArtifacts(
587
+            implementation_markdown=_replace_markdown_section_items(
588
+                self.implementation_markdown,
589
+                "Confirmed Progress",
590
+                list(dict.fromkeys(progress_items)),
591
+            ),
592
+            verification_markdown=self.verification_markdown,
593
+            verification_commands=list(self.verification_commands),
594
+            acceptance_criteria=list(self.acceptance_criteria),
595
+            implementation_steps=list(self.implementation_steps),
596
+        )
597
+
494
 
598
 
495
 class WorkflowArtifactStore:
599
 class WorkflowArtifactStore:
496
     """Persist briefs and plans under `.loader/`."""
600
     """Persist briefs and plans under `.loader/`."""
@@ -627,6 +731,15 @@ def merge_refreshed_todos_with_existing_scope(
627
         and item not in _SPECIAL_TODO_ITEMS
731
         and item not in _SPECIAL_TODO_ITEMS
628
         and _task_text_covers_requirement(task_statement, item)
732
         and _task_text_covers_requirement(task_statement, item)
629
     ]
733
     ]
734
+    refreshed_candidates = [
735
+        item.strip()
736
+        for item in refreshed_steps
737
+        if item.strip()
738
+        and (
739
+            not (grounded_completed or grounded_pending)
740
+            or _looks_actionable_refresh_step(item)
741
+        )
742
+    ]
630
 
743
 
631
     todos: list[dict[str, str]] = []
744
     todos: list[dict[str, str]] = []
632
     seen: set[str] = set()
745
     seen: set[str] = set()
@@ -641,7 +754,7 @@ def merge_refreshed_todos_with_existing_scope(
641
                 "status": "completed",
754
                 "status": "completed",
642
             }
755
             }
643
         )
756
         )
644
-    for item in [*grounded_pending, *refreshed_steps]:
757
+    for item in [*grounded_pending, *refreshed_candidates]:
645
         label = item.strip()
758
         label = item.strip()
646
         if not label or label in seen:
759
         if not label or label in seen:
647
             continue
760
             continue
@@ -740,7 +853,14 @@ def _todo_progress_score(item: str, tool_call: ToolCall) -> int:
740
         elif _looks_like_read_command(command):
853
         elif _looks_like_read_command(command):
741
             if _contains_any(text, _READ_STEP_HINTS):
854
             if _contains_any(text, _READ_STEP_HINTS):
742
                 score += 2
855
                 score += 2
856
+        elif _looks_like_fs_mutation_command(command):
857
+            if _contains_any(text, _MUTATION_STEP_HINTS):
858
+                score += 3
859
+            if "directory" in text and "mkdir" in command:
860
+                score += 2
743
     elif name in {"write", "edit", "patch"}:
861
     elif name in {"write", "edit", "patch"}:
862
+        if _todo_describes_aggregate_mutation(text) and basename and basename not in text:
863
+            return 0
744
         if _contains_any(text, _MUTATION_STEP_HINTS):
864
         if _contains_any(text, _MUTATION_STEP_HINTS):
745
             score += 3
865
             score += 3
746
 
866
 
@@ -753,6 +873,13 @@ def _contains_any(text: str, candidates: tuple[str, ...]) -> bool:
753
     return any(candidate in text for candidate in candidates)
873
     return any(candidate in text for candidate in candidates)
754
 
874
 
755
 
875
 
876
def _todo_describes_aggregate_mutation(text: str) -> bool:
    """Report whether *text* carries both an aggregate hint and a mutation hint."""
    if not _contains_any(text, _AGGREGATE_TODO_HINTS):
        return False
    return _contains_any(text, _MUTATION_STEP_HINTS)
881
+
882
+
756
 def _looks_like_search_command(command: str) -> bool:
883
 def _looks_like_search_command(command: str) -> bool:
757
     return any(token in command for token in (" ls", "ls ", "find ", "rg ", "grep ", "glob "))
884
     return any(token in command for token in (" ls", "ls ", "find ", "rg ", "grep ", "glob "))
758
 
885
 
@@ -781,6 +908,27 @@ def _looks_like_verification_command(command: str) -> bool:
781
     )
908
     )
782
 
909
 
783
 
910
 
911
+def _looks_like_fs_mutation_command(command: str) -> bool:
912
+    stripped = command.strip()
913
+    return any(
914
+        stripped.startswith(prefix)
915
+        for prefix in (
916
+            "mkdir ",
917
+            "mkdir\t",
918
+            "touch ",
919
+            "touch\t",
920
+            "cp ",
921
+            "cp\t",
922
+            "mv ",
923
+            "mv\t",
924
+            "ln ",
925
+            "ln\t",
926
+            "install ",
927
+            "install\t",
928
+        )
929
+    )
930
+
931
+
784
 def extract_verification_commands_from_markdown(markdown: str) -> list[str]:
932
 def extract_verification_commands_from_markdown(markdown: str) -> list[str]:
785
     """Extract verification commands from a verification-plan markdown document."""
933
     """Extract verification commands from a verification-plan markdown document."""
786
 
934
 
@@ -1057,6 +1205,27 @@ def _requirement_describes_output_scope(requirement: str) -> bool:
1057
     )
1205
     )
1058
 
1206
 
1059
 
1207
 
1208
def _looks_actionable_refresh_step(step: str) -> bool:
    """Report whether a refreshed plan step reads like an instruction.

    A step is rejected when it is blank, when it is nothing but a file path
    with an extension, when it does not start with a lowercase-letter/hyphen
    word (after an optional leading sequencing word such as "first" or "then"
    is removed), or when that first word is a retrospective verb.  It is
    accepted only when the first word is one of the actionable verbs.
    """
    candidate = step.strip()
    # A bare path such as "docs/index.html" names a file, not an action.
    if not candidate or re.fullmatch(r"(?:[\w.-]+/)*[\w.-]+\.[A-Za-z0-9]+", candidate):
        return False

    remainder = re.sub(
        r"^(?:first|next|then|finally|afterward|afterwards)\b[,:]?\s*",
        "",
        candidate.lower(),
    )
    leading = re.match(r"^[a-z-]+", remainder)
    if leading is None:
        return False

    verb = leading.group(0)
    # NOTE(review): "read" appears in both verb sets; because the retrospective
    # set wins, an imperative "Read ..." step is rejected. Confirm this is
    # intended.
    return verb not in _RETROSPECTIVE_STEP_VERBS and verb in _ACTIONABLE_STEP_VERBS
1227
+
1228
+
1060
 def _mark_explicit_section(brief: ClarifyBrief, section: str) -> None:
1229
 def _mark_explicit_section(brief: ClarifyBrief, section: str) -> None:
1061
     if section in brief.explicit_sections:
1230
     if section in brief.explicit_sections:
1062
         return
1231
         return
src/loader/runtime/workflow_lanes.py (modified)
@@ -208,6 +208,10 @@ class WorkflowLaneRunner:
208
                 refreshed_acceptance_criteria=list(artifacts.acceptance_criteria),
208
                 refreshed_acceptance_criteria=list(artifacts.acceptance_criteria),
209
             )
209
             )
210
             artifacts = artifacts.with_acceptance_criteria(preserved_acceptance)
210
             artifacts = artifacts.with_acceptance_criteria(preserved_acceptance)
211
+            artifacts = artifacts.with_progress_context(
212
+                touched_files=list(dod.touched_files),
213
+                completed_items=list(dod.completed_items),
214
+            )
211
         implementation_path, verification_path = self.artifact_store.write_plan(
215
         implementation_path, verification_path = self.artifact_store.write_plan(
212
             task,
216
             task,
213
             artifacts,
217
             artifacts,
@@ -610,6 +614,41 @@ class WorkflowLaneRunner:
610
 
614
 
611
         refresh_block = ""
615
         refresh_block = ""
612
         if refresh_reasons:
616
         if refresh_reasons:
617
+            progress_lines: list[str] = []
618
+            touched = [str(path).strip() for path in dod.touched_files if str(path).strip()]
619
+            completed = [
620
+                item.strip()
621
+                for item in dod.completed_items
622
+                if item.strip()
623
+                and item not in {"Complete the requested work", "Collect verification evidence"}
624
+            ]
625
+            pending = [
626
+                item.strip()
627
+                for item in dod.pending_items
628
+                if item.strip()
629
+                and item not in {"Complete the requested work", "Collect verification evidence"}
630
+            ]
631
+            if touched:
632
+                progress_lines.extend(
633
+                    [
634
+                        "Already touched during execution:",
635
+                        *[f"- {item}" for item in touched[:12]],
636
+                    ]
637
+                )
638
+            if completed:
639
+                progress_lines.extend(
640
+                    [
641
+                        "Already completed work:",
642
+                        *[f"- {item}" for item in completed[:12]],
643
+                    ]
644
+                )
645
+            if pending:
646
+                progress_lines.extend(
647
+                    [
648
+                        "Still pending:",
649
+                        *[f"- {item}" for item in pending[:12]],
650
+                    ]
651
+                )
613
             refresh_block = (
652
             refresh_block = (
614
                 "Refresh the existing planning artifacts instead of creating a fresh plan "
653
                 "Refresh the existing planning artifacts instead of creating a fresh plan "
615
                 "from scratch.\n"
654
                 "from scratch.\n"
@@ -619,6 +658,11 @@ class WorkflowLaneRunner:
619
                 "artifact.\n"
658
                 "artifact.\n"
620
                 "Use the current task state and these recovery reasons:\n"
659
                 "Use the current task state and these recovery reasons:\n"
621
                 + "\n".join(f"- {item}" for item in refresh_reasons)
660
                 + "\n".join(f"- {item}" for item in refresh_reasons)
661
+                + (
662
+                    ("\n\nCurrent execution progress:\n" + "\n".join(progress_lines))
663
+                    if progress_lines
664
+                    else ""
665
+                )
622
                 + "\n\n"
666
                 + "\n\n"
623
             )
667
             )
624
 
668
 
src/loader/runtime/workflow_recovery.py (modified)
@@ -29,6 +29,10 @@ UserQuestionHandler = Callable[[str, list[str] | None], Awaitable[str]] | None
29
 WorkflowModeSetter = Callable[..., Awaitable[None]]
29
 WorkflowModeSetter = Callable[..., Awaitable[None]]
30
 TimelineAppender = Callable[..., None]
30
 TimelineAppender = Callable[..., None]
31
 BridgeAppender = Callable[[DefinitionOfDone], None]
31
 BridgeAppender = Callable[[DefinitionOfDone], None]
32
+_RECOVERY_TODO_EXCLUDED_ITEMS = {
33
+    "Complete the requested work",
34
+    "Collect verification evidence",
35
+}
32
 
36
 
33
 
37
 
34
 class WorkflowRecoveryController:
38
 class WorkflowRecoveryController:
@@ -186,6 +190,21 @@ class WorkflowRecoveryController:
186
             summary=summary,
190
             summary=summary,
187
         )
191
         )
188
         self.append_execute_bridge(dod)
192
         self.append_execute_bridge(dod)
193
+        next_pending = next(
194
+            (
195
+                item
196
+                for item in dod.pending_items
197
+                if item not in _RECOVERY_TODO_EXCLUDED_ITEMS
198
+            ),
199
+            None,
200
+        )
201
+        if next_pending:
202
+            self.context.queue_steering_message(
203
+                "Plan refresh preserved the progress already made. "
204
+                f"Reuse the existing files and confirmed facts, then continue with the next "
205
+                f"pending item: `{next_pending}`. "
206
+                "Do not restart from initial discovery unless a specific missing fact blocks that step."
207
+            )
189
         return True
208
         return True
190
 
209
 
191
     async def _run_clarify_reentry_for_drift(
210
     async def _run_clarify_reentry_for_drift(
tests/test_artifact_invalidation.py (modified)
@@ -67,3 +67,28 @@ def test_artifact_invalidation_can_force_full_replan_when_brief_and_plan_drift()
67
         for item in freshness.evidence
67
         for item in freshness.evidence
68
     )
68
     )
69
     assert freshness.evidence_summary
69
     assert freshness.evidence_summary
70
+
71
+
72
def test_artifact_invalidation_treats_path_separator_variants_as_same_touchpoint() -> None:
    """A touched ``01_getting_started.html`` counts as the planned ``01-getting-started.html``."""
    implementation_plan = (
        "# Implementation Plan\n"
        "- Create 01-getting-started.html in the chapters directory.\n"
    )
    verification_plan = (
        "# Verification Plan\n"
        "## Acceptance Criteria\n"
        "- 01-getting-started.html exists.\n"
    )

    freshness = ArtifactInvalidationAssessor().assess(
        task_statement="Build a multi-file nginx guide.",
        clarify_text=None,
        implementation_text=implementation_plan,
        verification_text=verification_plan,
        acceptance_criteria=["01-getting-started.html exists."],
        # Same chapter file, but written with underscores instead of hyphens.
        touched_files=["/tmp/chapters/01_getting_started.html"],
        last_verification_result=None,
    )

    assert freshness.stale_plan is False
    assert freshness.stale_brief is False
    assert "touched_files_outside_plan" not in freshness.reason_codes
tests/test_dod.py (modified)
@@ -143,6 +143,29 @@ def test_derive_verification_commands_adds_semantic_html_toc_check(tmp_path: Pat
143
     assert not any(command == f"test -f {index}" for command in commands)
143
     assert not any(command == f"test -f {index}" for command in commands)
144
 
144
 
145
 
145
 
146
def test_derive_verification_commands_avoids_repo_defaults_for_external_artifacts(
    tmp_path: Path,
) -> None:
    """Touching only files outside the project yields existence checks, not repo defaults.

    The project root and the external directory are siblings *inside*
    ``tmp_path`` so the test never writes outside its own fixture directory
    and cannot collide with other tests or earlier runs.
    """
    import shlex

    project_root = tmp_path / "project"
    project_root.mkdir()
    (project_root / "pyproject.toml").write_text("[project]\nname='loader'\n")
    (project_root / "package.json").write_text("{}\n")
    external_root = tmp_path / "external-guide"
    external_root.mkdir()
    external_index = external_root / "index.html"
    external_index.write_text("<html></html>\n")

    dod = create_definition_of_done("Create an external nginx guide.")
    dod.task_size = "standard"
    dod.touched_files = [str(external_index)]

    commands = derive_verification_commands(
        dod,
        project_root=project_root,
        task_statement=dod.task_statement,
    )

    # The implementation shell-quotes the path, so the expectation must too.
    assert commands == [f"test -f {shlex.quote(str(external_index))}"]
167
+
168
+
146
 def test_build_verification_summary_keeps_concrete_missing_link_details() -> None:
169
 def test_build_verification_summary_keeps_concrete_missing_link_details() -> None:
147
     summary = build_verification_summary(
170
     summary = build_verification_summary(
148
         [
171
         [
tests/test_finalization.py (modified)
@@ -434,6 +434,53 @@ async def test_turn_finalizer_appends_runtime_semantic_verifier_to_planned_comma
434
     )
434
     )
435
 
435
 
436
 
436
 
437
+@pytest.mark.asyncio
438
+async def test_turn_finalizer_does_not_append_repo_defaults_to_external_verification_plan(
439
+    temp_dir: Path,
440
+) -> None:
441
+    (temp_dir / "pyproject.toml").write_text("[project]\nname='loader'\n")
442
+    (temp_dir / "package.json").write_text("{}\n")
443
+    external_root = temp_dir.parent / "external-nginx-guide"
444
+    external_root.mkdir(exist_ok=True)
445
+    external_index = external_root / "index.html"
446
+    external_index.write_text("<html></html>\n")
447
+
448
+    session = FakeSession()
449
+    context = build_context(temp_dir, session)
450
+    finalizer = TurnFinalizer(
451
+        context,
452
+        RuntimeTracer(),
453
+        DefinitionOfDoneStore(temp_dir),
454
+        set_workflow_mode=_noop_set_workflow_mode,
455
+    )
456
+    dod = create_definition_of_done("Create an external nginx guide.")
457
+    dod.mutating_actions.append("write")
458
+    dod.touched_files.append(str(external_index))
459
+    dod.verification_commands = [
460
+        f"ls -la {external_root}",
461
+        f"grep -n \"html\" {external_index}",
462
+    ]
463
+    summary = TurnSummary(final_response="")
464
+    executor = RecordingExecutor()
465
+
466
+    async def capture(event) -> None:
467
+        return None
468
+
469
+    result = await finalizer.run_definition_of_done_gate(
470
+        dod=dod,
471
+        candidate_response="Created the external nginx guide.",
472
+        emit=capture,
473
+        summary=summary,
474
+        executor=executor,  # type: ignore[arg-type]
475
+    )
476
+
477
+    assert result.should_continue is False
478
+    assert executor.commands == [
479
+        f"ls -la {external_root}",
480
+        f'grep -n "html" {external_index}',
481
+    ]
482
+
483
+
437
 @pytest.mark.asyncio
484
 @pytest.mark.asyncio
438
 async def test_turn_finalizer_records_missing_verification_observation(
485
 async def test_turn_finalizer_records_missing_verification_observation(
439
     temp_dir: Path,
486
     temp_dir: Path,
tests/test_tool_batches.pymodified
@@ -1041,6 +1041,10 @@ async def test_tool_batch_runner_queues_next_pending_todo_after_discovery_progre
1041
         in message
1041
         in message
1042
         for message in queued_messages
1042
         for message in queued_messages
1043
     )
1043
     )
1044
+    assert any(
1045
+        "stop gathering more reference material and perform the change now" in message
1046
+        for message in queued_messages
1047
+    )
1044
 
1048
 
1045
 
1049
 
1046
 @pytest.mark.asyncio
1050
 @pytest.mark.asyncio
@@ -1161,6 +1165,97 @@ async def test_tool_batch_runner_duplicate_reference_read_prefers_next_pending_t
1161
     assert "Update `" not in queued_messages[0]
1165
     assert "Update `" not in queued_messages[0]
1162
 
1166
 
1163
 
1167
 
1168
+@pytest.mark.asyncio
1169
+async def test_tool_batch_runner_observation_handoff_pushes_mutation_step(
1170
+    temp_dir: Path,
1171
+) -> None:
1172
+    async def assess_confidence(
1173
+        tool_name: str,
1174
+        tool_args: dict,
1175
+        context: str,
1176
+    ) -> ConfidenceAssessment:
1177
+        raise AssertionError("Confidence scoring should be disabled in this scenario")
1178
+
1179
+    async def verify_action(
1180
+        tool_name: str,
1181
+        tool_args: dict,
1182
+        result: str,
1183
+        expected: str = "",
1184
+    ) -> ActionVerification:
1185
+        raise AssertionError("Verification should not run for this scenario")
1186
+
1187
+    reference = temp_dir / "fortran" / "index.html"
1188
+    reference.parent.mkdir(parents=True)
1189
+    reference.write_text("<h1>Fortran Beginner's Guide</h1>\n")
1190
+
1191
+    context = build_context(
1192
+        temp_dir=temp_dir,
1193
+        messages=[],
1194
+        safeguards=FakeSafeguards(),
1195
+        assess_confidence=assess_confidence,
1196
+        verify_action=verify_action,
1197
+        auto_recover=False,
1198
+    )
1199
+    queued_messages: list[str] = []
1200
+    context.queue_steering_message_callback = queued_messages.append
1201
+    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))
1202
+    dod = create_definition_of_done("Create a multi-file nginx guide.")
1203
+    sync_todos_to_definition_of_done(
1204
+        dod,
1205
+        [
1206
+            {
1207
+                "content": "Examine the existing Fortran guide structure to understand the cadence and format",
1208
+                "active_form": "Working on: Examine the existing Fortran guide structure to understand the cadence and format",
1209
+                "status": "pending",
1210
+            },
1211
+            {
1212
+                "content": "Create the nginx index.html file",
1213
+                "active_form": "Working on: Create the nginx index.html file",
1214
+                "status": "pending",
1215
+            },
1216
+        ],
1217
+    )
1218
+    tool_call = ToolCall(
1219
+        id="read-reference",
1220
+        name="read",
1221
+        arguments={"file_path": str(reference)},
1222
+    )
1223
+    executor = FakeExecutor(
1224
+        [
1225
+            tool_outcome(
1226
+                tool_call=tool_call,
1227
+                output="<h1>Fortran Beginner's Guide</h1>\n",
1228
+                is_error=False,
1229
+            )
1230
+        ]
1231
+    )
1232
+
1233
+    summary = TurnSummary(final_response="")
1234
+    await runner.execute_batch(
1235
+        tool_calls=[tool_call],
1236
+        tool_source="assistant",
1237
+        pending_tool_calls_seen=set(),
1238
+        emit=_noop_emit,
1239
+        summary=summary,
1240
+        dod=dod,
1241
+        executor=executor,  # type: ignore[arg-type]
1242
+        on_confirmation=None,
1243
+        on_user_question=None,
1244
+        emit_confirmation=None,
1245
+        consecutive_errors=0,
1246
+    )
1247
+
1248
+    assert any(
1249
+        "Continue with the next pending item: `Create the nginx index.html file`"
1250
+        in message
1251
+        for message in queued_messages
1252
+    )
1253
+    assert any(
1254
+        "stop gathering more reference material and perform the change now" in message
1255
+        for message in queued_messages
1256
+    )
1257
+
1258
+
1164
 @pytest.mark.asyncio
1259
 @pytest.mark.asyncio
1165
 async def test_tool_batch_runner_hands_off_noop_toc_edit_when_file_is_already_valid(
1260
 async def test_tool_batch_runner_hands_off_noop_toc_edit_when_file_is_already_valid(
1166
     temp_dir: Path,
1261
     temp_dir: Path,
tests/test_workflow.pymodified
@@ -345,6 +345,47 @@ def test_planning_artifacts_with_acceptance_criteria_rewrites_verification_markd
345
     )
345
     )
346
 
346
 
347
 
347
 
348
+def test_planning_artifacts_with_progress_context_records_touched_and_completed_work() -> None:
349
+    artifacts = PlanningArtifacts.from_model_output(
350
+        "\n".join(
351
+            [
352
+                "# Implementation Plan",
353
+                "",
354
+                "## Execution Order",
355
+                "1. Create the guide files.",
356
+                "",
357
+                "<<<VERIFICATION>>>",
358
+                "",
359
+                "# Verification Plan",
360
+                "",
361
+                "## Acceptance Criteria",
362
+                "- At least one chapter file exists.",
363
+                "",
364
+                "## Verification Commands",
365
+                "- `find chapters -name \"*.html\" | wc -l`",
366
+            ]
367
+        ),
368
+        task_statement="Create a thorough nginx guide.",
369
+    )
370
+
371
+    updated = artifacts.with_progress_context(
372
+        touched_files=["/tmp/nginx/index.html"],
373
+        completed_items=[
374
+            "Create the guide scaffold",
375
+            "Collect verification evidence",
376
+        ],
377
+    )
378
+
379
+    assert "## Confirmed Progress" in updated.implementation_markdown
380
+    assert "Already touched during execution: `/tmp/nginx/index.html`." in (
381
+        updated.implementation_markdown
382
+    )
383
+    assert "Already completed during execution: Create the guide scaffold." in (
384
+        updated.implementation_markdown
385
+    )
386
+    assert "Collect verification evidence" not in updated.implementation_markdown
387
+
388
+
348
 def test_merge_refreshed_todos_with_existing_scope_keeps_grounded_progress() -> None:
389
 def test_merge_refreshed_todos_with_existing_scope_keeps_grounded_progress() -> None:
349
     task = (
390
     task = (
350
         "Create an equally thorough nginx guide with index.html plus chapter files "
391
         "Create an equally thorough nginx guide with index.html plus chapter files "
@@ -371,6 +412,48 @@ def test_merge_refreshed_todos_with_existing_scope_keeps_grounded_progress() ->
371
     )
412
     )
372
 
413
 
373
 
414
 
415
+def test_merge_refreshed_todos_with_existing_scope_filters_retro_refresh_noise() -> None:
416
+    task = (
417
+        "Create an equally thorough nginx guide with index.html plus chapter files "
418
+        "covering getting started, installation, first website setup, configs, and "
419
+        "advanced topics."
420
+    )
421
+
422
+    todos = merge_refreshed_todos_with_existing_scope(
423
+        task,
424
+        existing_pending_items=[
425
+            "Create each chapter file in sequence, following the same structure as the Fortran guide",
426
+            "Ensure all files are properly linked and formatted consistently",
427
+        ],
428
+        existing_completed_items=[
429
+            "First, examine the existing Fortran guide structure to understand the format and cadence",
430
+            "Create the directory structure for the new nginx guide",
431
+            "Create the main index.html file",
432
+        ],
433
+        refreshed_steps=[
434
+            "First examined the existing Fortran guide structure to understand format and cadence",
435
+            "Created the main index.html file with navigation",
436
+            "Created chapter files in sequence:",
437
+            "01-getting-started.html",
438
+            "02-installation.html",
439
+            "03-first-website.html",
440
+            "04-configuring.html",
441
+            "All files properly linked with navigation between chapters",
442
+            "Verify the final navigation links across the guide",
443
+        ],
444
+    )
445
+
446
+    labels = {item["content"]: item["status"] for item in todos}
447
+    assert (
448
+        labels["Create each chapter file in sequence, following the same structure as the Fortran guide"]
449
+        == "pending"
450
+    )
451
+    assert labels["Ensure all files are properly linked and formatted consistently"] == "pending"
452
+    assert labels["Verify the final navigation links across the guide"] == "pending"
453
+    assert "Created chapter files in sequence:" not in labels
454
+    assert "04-configuring.html" not in labels
455
+
456
+
374
 def test_workflow_artifact_store_and_bridge_round_trip(tmp_path: Path) -> None:
457
 def test_workflow_artifact_store_and_bridge_round_trip(tmp_path: Path) -> None:
375
     store = WorkflowArtifactStore(tmp_path)
458
     store = WorkflowArtifactStore(tmp_path)
376
     brief = ClarifyBrief.fallback(
459
     brief = ClarifyBrief.fallback(
@@ -528,3 +611,71 @@ def test_advance_todos_from_tool_call_tracks_plan_progress() -> None:
528
         ),
611
         ),
529
     )
612
     )
530
     assert "Verify the updated index.html file is properly formatted" in dod.completed_items
613
     assert "Verify the updated index.html file is properly formatted" in dod.completed_items
614
+
615
+
616
+def test_advance_todos_from_tool_call_keeps_aggregate_mutation_steps_pending() -> None:
617
+    dod = create_definition_of_done("Create a multi-file nginx guide.")
618
+    sync_todos_to_definition_of_done(
619
+        dod,
620
+        [
621
+            {
622
+                "content": "Create each chapter file in sequence, following the same structure as the Fortran guide",
623
+                "active_form": "Working on: Create each chapter file in sequence, following the same structure as the Fortran guide",
624
+                "status": "pending",
625
+            },
626
+            {
627
+                "content": "Ensure all files are properly linked and formatted consistently",
628
+                "active_form": "Working on: Ensure all files are properly linked and formatted consistently",
629
+                "status": "pending",
630
+            },
631
+        ],
632
+    )
633
+
634
+    assert (
635
+        advance_todos_from_tool_call(
636
+            dod,
637
+            ToolCall(
638
+                id="write-one-chapter",
639
+                name="write",
640
+                arguments={
641
+                    "file_path": "/tmp/nginx/chapters/01-getting-started.html",
642
+                    "content": "<html></html>",
643
+                },
644
+            ),
645
+        )
646
+        is False
647
+    )
648
+    assert (
649
+        "Create each chapter file in sequence, following the same structure as the Fortran guide"
650
+        in dod.pending_items
651
+    )
652
+
653
+
654
+def test_advance_todos_from_tool_call_tracks_bash_directory_creation_progress() -> None:
655
+    dod = create_definition_of_done("Create a multi-file nginx guide.")
656
+    sync_todos_to_definition_of_done(
657
+        dod,
658
+        [
659
+            {
660
+                "content": "Create the nginx directory structure",
661
+                "active_form": "Working on: Create the nginx directory structure",
662
+                "status": "pending",
663
+            },
664
+            {
665
+                "content": "Create index.html for nginx guide",
666
+                "active_form": "Working on: Create index.html for nginx guide",
667
+                "status": "pending",
668
+            },
669
+        ],
670
+    )
671
+
672
+    assert advance_todos_from_tool_call(
673
+        dod,
674
+        ToolCall(
675
+            id="mkdir-nginx",
676
+            name="bash",
677
+            arguments={"command": "mkdir -p ~/Loader/guides/nginx/chapters"},
678
+        ),
679
+    )
680
+    assert "Create the nginx directory structure" in dod.completed_items
681
+    assert "Create index.html for nginx guide" in dod.pending_items
tests/test_workflow_runtime.pymodified
@@ -1438,6 +1438,21 @@ async def test_stale_plan_artifacts_trigger_targeted_plan_refresh(
1438
         entry.reason_code == "plan_refresh_completed"
1438
         entry.reason_code == "plan_refresh_completed"
1439
         for entry in run.agent.last_turn_summary.workflow_timeline
1439
         for entry in run.agent.last_turn_summary.workflow_timeline
1440
     )
1440
     )
1441
+    refresh_prompt = next(
1442
+        invocation.messages[-1].content
1443
+        for invocation in backend.invocations
1444
+        if "Refresh the existing planning artifacts instead of creating a fresh plan from scratch."
1445
+        in invocation.messages[-1].content
1446
+    )
1447
+    assert "Current execution progress:" in refresh_prompt
1448
+    assert "Already touched during execution:" in refresh_prompt
1449
+    assert f"- {target}" in refresh_prompt
1450
+    assert any(
1451
+        "Plan refresh preserved the progress already made." in message.content
1452
+        and "Do not restart from initial discovery" in message.content
1453
+        for invocation in backend.invocations
1454
+        for message in invocation.messages
1455
+    )
1441
 
1456
 
1442
 
1457
 
1443
 @pytest.mark.asyncio
1458
 @pytest.mark.asyncio