tenseleyflow/loader / f0d4490

Browse files

Refine qwen recovery loops and verification

Authored by espadonne
SHA
f0d44903e030e62ea940f004e583842d18ab991c
Parents
03423c5
Tree
dde0785

11 changed files

Status | File | + | -
M src/loader/runtime/dod.py 83 0
M src/loader/runtime/finalization.py 9 0
M src/loader/runtime/hooks.py 30 0
M src/loader/runtime/safeguard_services.py 73 22
M src/loader/runtime/workflow.py 41 2
M tests/test_dod.py 36 0
M tests/test_permissions.py 39 0
M tests/test_runtime_harness.py 53 0
M tests/test_safeguard_services.py 43 0
M tests/test_workflow.py 33 0
M tests/test_workflow_runtime.py 79 0
src/loader/runtime/dod.pymodified
@@ -208,6 +208,11 @@ def derive_verification_commands(
208208
     """Generate verification commands from execution history and project shape."""
209209
 
210210
     commands: list[str] = []
211
+    semantic_command = _derive_html_toc_verification_command(
212
+        dod,
213
+        project_root=project_root,
214
+        task_statement=task_statement,
215
+    )
211216
 
212217
     explicit = [cmd for cmd in dod.successful_commands if _is_verification_command(cmd)]
213218
     for command in explicit:
@@ -220,6 +225,9 @@ def derive_verification_commands(
220225
             if path.suffix == ".py":
221226
                 _append_unique(commands, f"python {shlex.quote(path.name)}")
222227
 
228
+    if semantic_command:
229
+        _append_unique(commands, semantic_command)
230
+
223231
     if commands:
224232
         return commands
225233
 
@@ -465,6 +473,81 @@ def _extract_files_from_bash(command: str) -> list[str]:
465473
     return []
466474
 
467475
 
476
+def _derive_html_toc_verification_command(
477
+    dod: DefinitionOfDone,
478
+    *,
479
+    project_root: Path,
480
+    task_statement: str,
481
+) -> str | None:
482
+    task_hints = " ".join([task_statement, *dod.acceptance_criteria]).lower()
483
+    if not any(
484
+        hint in task_hints
485
+        for hint in ("href", "link", "links", "table of contents", "chapter title")
486
+    ):
487
+        return None
488
+
489
+    for path_str in dod.touched_files:
490
+        path = Path(path_str)
491
+        effective_path = path if path.is_absolute() else (project_root / path)
492
+        if effective_path.name != "index.html" or effective_path.suffix != ".html":
493
+            continue
494
+        if not (effective_path.parent / "chapters").is_dir():
495
+            continue
496
+        return _build_html_toc_verification_command(effective_path)
497
+    return None
498
+
499
+
500
+def _build_html_toc_verification_command(index_path: Path) -> str:
501
+    path_literal = repr(str(index_path))
502
+    return "\n".join(
503
+        [
504
+            "/usr/bin/python3 - <<'PY'",
505
+            "from pathlib import Path",
506
+            "import re",
507
+            "import sys",
508
+            "",
509
+            f"index = Path({path_literal}).expanduser()",
510
+            "root = index.parent",
511
+            "text = index.read_text()",
512
+            "section_match = re.search(r'<ul class=\"chapter-list\">(.*?)</ul>', text, re.S)",
513
+            "if section_match is None:",
514
+            "    print('Missing chapter-list table of contents', file=sys.stderr)",
515
+            "    raise SystemExit(1)",
516
+            "links = re.findall(r'<a href=\"([^\"]+)\">([^<]+)</a>', section_match.group(1))",
517
+            "if not links:",
518
+            "    print('No chapter links found in table of contents', file=sys.stderr)",
519
+            "    raise SystemExit(1)",
520
+            "",
521
+            "missing = []",
522
+            "mismatched = []",
523
+            "for href, label in links:",
524
+            "    target = (root / href).resolve()",
525
+            "    if not target.exists():",
526
+            "        missing.append(f'{href} -> missing')",
527
+            "        continue",
528
+            "    body = target.read_text()",
529
+            "    match = re.search(r'<h1>(.*?)</h1>', body, re.S)",
530
+            "    title = match.group(1).strip() if match else ''",
531
+            "    if title and label.strip() != title:",
532
+            "        mismatched.append(f'{href} -> {label.strip()} != {title}')",
533
+            "",
534
+            "if missing or mismatched:",
535
+            "    if missing:",
536
+            "        print('Missing links:', file=sys.stderr)",
537
+            "        for item in missing:",
538
+            "            print(item, file=sys.stderr)",
539
+            "    if mismatched:",
540
+            "        print('Title mismatches:', file=sys.stderr)",
541
+            "        for item in mismatched:",
542
+            "            print(item, file=sys.stderr)",
543
+            "    raise SystemExit(1)",
544
+            "",
545
+            "print(f'validated {len(links)} toc links in {index.name}')",
546
+            "PY",
547
+        ]
548
+    )
549
+
550
+
468551
 def _first_non_empty_line(text: str) -> str:
469552
     for line in text.splitlines():
470553
         stripped = line.strip()
src/loader/runtime/finalization.pymodified
@@ -216,6 +216,15 @@ class TurnFinalizer:
216216
                 Path(dod.verification_plan).read_text()
217217
             )
218218
 
219
+        if (
220
+            not dod.verification_commands
221
+            and dod.implementation_plan
222
+            and Path(dod.implementation_plan).exists()
223
+        ):
224
+            dod.verification_commands = extract_verification_commands_from_markdown(
225
+                Path(dod.implementation_plan).read_text()
226
+            )
227
+
219228
         if not dod.verification_commands:
220229
             dod.verification_commands = derive_verification_commands(
221230
                 dod,
src/loader/runtime/hooks.pymodified
@@ -131,6 +131,35 @@ class FilePathAliasHook(BaseToolHook):
131131
         return HookResult()
132132
 
133133
 
134
class SearchPathAliasHook(BaseToolHook):
    """Normalize common search-path aliases before validation and execution."""

    # Only search-oriented tools accept a "path" argument worth normalizing.
    _SEARCH_TOOLS = frozenset({"glob", "grep"})
    # Alias keys models commonly emit instead of the canonical "path".
    _ALIASES = ("directory", "dir", "folder")

    async def pre_tool_use(self, context: HookContext) -> HookResult:
        """Rewrite the first non-empty alias key into the canonical ``path``."""
        call = context.tool_call
        if call.name not in self._SEARCH_TOOLS:
            return HookResult()

        raw_args = call.arguments
        if str(raw_args.get("path", "")).strip():
            # A usable path is already present; nothing to normalize.
            return HookResult()

        for alias_key in self._ALIASES:
            value = raw_args.get(alias_key)
            if not str(value or "").strip():
                continue

            rewritten = dict(raw_args)
            rewritten["path"] = value
            # Strip every alias key so downstream validation sees one path.
            for stale_key in self._ALIASES:
                rewritten.pop(stale_key, None)
            return HookResult(updated_arguments=rewritten)

        return HookResult()
161
+
162
+
134163
 class HookManager:
135164
     """Runs tool hooks across Loader's three lifecycle events."""
136165
 
@@ -327,6 +356,7 @@ def build_default_tool_hooks(
327356
     return HookManager(
328357
         [
329358
             FilePathAliasHook(),
359
+            SearchPathAliasHook(),
330360
             DuplicateActionHook(action_tracker),
331361
             ActionValidationHook(validator),
332362
             RollbackTrackingHook(registry, rollback_plan),
src/loader/runtime/safeguard_services.pymodified
@@ -16,6 +16,9 @@ class ActionTracker:
1616
     LOOP_REPEAT_THRESHOLD = 2
1717
     MAX_RESPONSE_HISTORY = 5
1818
     OBSERVATION_REPEAT_WINDOW = 8
19
+    READ_REPEAT_THRESHOLD = 3
20
+    SEARCH_REPEAT_THRESHOLD = 2
21
+    BASH_OBSERVATION_REPEAT_THRESHOLD = 2
1922
 
2023
     def __init__(self) -> None:
2124
         self._file_writes: dict[str, list[str]] = {}
@@ -26,9 +29,9 @@ class ActionTracker:
2629
         self._response_history: list[str] = []
2730
         self._action_index = 0
2831
         self._mutation_epoch = 0
29
-        self._recent_reads: dict[str, tuple[int, int]] = {}
30
-        self._recent_searches: dict[str, tuple[int, int]] = {}
31
-        self._recent_bash_observations: dict[str, tuple[int, int]] = {}
32
+        self._recent_reads: dict[str, tuple[int, int, int]] = {}
33
+        self._recent_searches: dict[str, tuple[int, int, int]] = {}
34
+        self._recent_bash_observations: dict[str, tuple[int, int, int]] = {}
3235
 
3336
     def reset(self) -> None:
3437
         self._file_writes.clear()
@@ -124,12 +127,17 @@ class ActionTracker:
124127
                 return True, f"Same patch already applied to: {file_path}"
125128
 
126129
         elif tool_name == "read":
127
-            file_path = str(arguments.get("file_path", "")).strip()
128
-            if file_path:
130
+            read_key = self._make_read_key(arguments)
131
+            if read_key:
129132
                 duplicate, reason = self._check_recent_observation(
130133
                     self._recent_reads,
131
-                    self._normalize_path(file_path),
132
-                    f"Already read {file_path} recently without any intervening changes",
134
+                    read_key,
135
+                    (
136
+                        "Already read "
137
+                        f"{str(arguments.get('file_path', '')).strip()} "
138
+                        "recently without any intervening changes"
139
+                    ),
140
+                    repeat_threshold=self.READ_REPEAT_THRESHOLD,
133141
                 )
134142
                 if duplicate:
135143
                     return True, reason
@@ -141,6 +149,7 @@ class ActionTracker:
141149
                     self._recent_searches,
142150
                     observation_key,
143151
                     "Already ran the same search recently without any intervening changes",
152
+                    repeat_threshold=self.SEARCH_REPEAT_THRESHOLD,
144153
                 )
145154
                 if duplicate:
146155
                     return True, reason
@@ -152,6 +161,7 @@ class ActionTracker:
152161
                     self._recent_bash_observations,
153162
                     self._normalize_command(command),
154163
                     "Already ran the same read-only shell probe recently without any intervening changes",
164
+                    repeat_threshold=self.BASH_OBSERVATION_REPEAT_THRESHOLD,
155165
                 )
156166
                 if duplicate:
157167
                     return True, reason
@@ -190,19 +200,19 @@ class ActionTracker:
190200
                 self._note_mutation()
191201
 
192202
         elif tool_name == "read":
193
-            file_path = str(arguments.get("file_path", "")).strip()
194
-            if file_path:
195
-                self._recent_reads[self._normalize_path(file_path)] = (
196
-                    self._mutation_epoch,
197
-                    self._action_index,
203
+            read_key = self._make_read_key(arguments)
204
+            if read_key:
205
+                self._record_observation(
206
+                    self._recent_reads,
207
+                    read_key,
198208
                 )
199209
 
200210
         elif tool_name in {"glob", "grep"}:
201211
             observation_key = self._make_search_key(tool_name, arguments)
202212
             if observation_key:
203
-                self._recent_searches[observation_key] = (
204
-                    self._mutation_epoch,
205
-                    self._action_index,
213
+                self._record_observation(
214
+                    self._recent_searches,
215
+                    observation_key,
206216
                 )
207217
 
208218
         elif tool_name == "bash":
@@ -212,9 +222,9 @@ class ActionTracker:
212222
                 if self._is_mutating_bash(command):
213223
                     self._note_mutation()
214224
                 elif self._is_observational_bash(command):
215
-                    self._recent_bash_observations[self._normalize_command(command)] = (
216
-                        self._mutation_epoch,
217
-                        self._action_index,
225
+                    self._record_observation(
226
+                        self._recent_bash_observations,
227
+                        self._normalize_command(command),
218228
                     )
219229
 
220230
     def detect_loop(self) -> tuple[bool, str]:
@@ -299,20 +309,49 @@ class ActionTracker:
299309
 
300310
     def _check_recent_observation(
301311
         self,
302
-        cache: dict[str, tuple[int, int]],
312
+        cache: dict[str, tuple[int, int, int]],
303313
         key: str,
304314
         reason: str,
315
+        *,
316
+        repeat_threshold: int,
305317
     ) -> tuple[bool, str]:
306318
         last_seen = cache.get(key)
307319
         if last_seen is None:
308320
             return False, ""
309321
 
310
-        last_epoch, last_index = last_seen
322
+        last_epoch, last_index, repeat_count = last_seen
311323
         if last_epoch != self._mutation_epoch:
312324
             return False, ""
313
-        if (self._action_index - last_index) > self.OBSERVATION_REPEAT_WINDOW:
325
+        gap = self._action_index - last_index
326
+        if gap > self.OBSERVATION_REPEAT_WINDOW:
314327
             return False, ""
315
-        return True, reason
328
+        if gap <= 0:
329
+            return True, reason
330
+        if repeat_count >= repeat_threshold:
331
+            return True, reason
332
+        return False, ""
333
+
334
+    def _record_observation(
335
+        self,
336
+        cache: dict[str, tuple[int, int, int]],
337
+        key: str,
338
+    ) -> None:
339
+        last_seen = cache.get(key)
340
+        if last_seen is None:
341
+            cache[key] = (self._mutation_epoch, self._action_index, 1)
342
+            return
343
+
344
+        last_epoch, last_index, repeat_count = last_seen
345
+        gap = self._action_index - last_index
346
+        if last_epoch != self._mutation_epoch or gap > self.OBSERVATION_REPEAT_WINDOW:
347
+            cache[key] = (self._mutation_epoch, self._action_index, 1)
348
+            return
349
+
350
+        cache[key] = (
351
+            self._mutation_epoch,
352
+            self._action_index,
353
+            repeat_count + 1,
354
+        )
316355
 
317356
     def _make_search_key(self, tool_name: str, arguments: dict) -> str | None:
318357
         pattern = str(arguments.get("pattern", "")).strip()
@@ -322,6 +361,18 @@ class ActionTracker:
322361
         normalized_path = self._normalize_path(path) if path else ""
323362
         return f"{tool_name}:{normalized_path}:{pattern}"
324363
 
364
+    def _make_read_key(self, arguments: dict) -> str | None:
365
+        file_path = str(arguments.get("file_path", "")).strip()
366
+        if not file_path:
367
+            return None
368
+        offset = str(arguments.get("offset", "")).strip()
369
+        limit = str(arguments.get("limit", "")).strip()
370
+        return (
371
+            f"{self._normalize_path(file_path)}"
372
+            f":offset={offset or 'full'}"
373
+            f":limit={limit or 'all'}"
374
+        )
375
+
325376
     def _is_observational_bash(self, command: str) -> bool:
326377
         norm_cmd = self._normalize_command(command)
327378
         if not norm_cmd:
src/loader/runtime/workflow.pymodified
@@ -52,6 +52,10 @@ __all__ = [
5252
 ]
5353
 
5454
 VERIFICATION_SEPARATOR = "<<<VERIFICATION>>>"
55
+_VERIFICATION_SEPARATORS = (
56
+    VERIFICATION_SEPARATOR,
57
+    "<<VERIFICATION>>",
58
+)
5559
 _GENERIC_TOUCHPOINTS = {
5660
     "Determine the concrete files during execution.",
5761
     "Identify exact files during planning or execution.",
@@ -518,12 +522,47 @@ def build_execute_bridge(
518522
 
519523
 
520524
 def _split_plan_output(model_output: str) -> tuple[str, str]:
521
-    if VERIFICATION_SEPARATOR in model_output:
522
-        implementation, verification = model_output.split(VERIFICATION_SEPARATOR, maxsplit=1)
525
+    for separator in _VERIFICATION_SEPARATORS:
526
+        if separator not in model_output:
527
+            continue
528
+        implementation, verification = model_output.split(separator, maxsplit=1)
529
+        split = _split_embedded_verification_heading(
530
+            implementation.strip(),
531
+            fallback_verification=verification.strip(),
532
+        )
533
+        if split is not None:
534
+            return split
523535
         return implementation.strip(), verification.strip()
536
+
537
+    split = _split_embedded_verification_heading(model_output.strip())
538
+    if split is not None:
539
+        return split
524540
     return model_output.strip(), ""
525541
 
526542
 
543
+def _split_embedded_verification_heading(
544
+    implementation_markdown: str,
545
+    *,
546
+    fallback_verification: str = "",
547
+) -> tuple[str, str] | None:
548
+    match = re.search(r"(?m)^#\s+Verification Plan\s*$", implementation_markdown)
549
+    if match is None:
550
+        if fallback_verification.strip():
551
+            return implementation_markdown, fallback_verification.strip()
552
+        return None
553
+
554
+    implementation = implementation_markdown[:match.start()].rstrip()
555
+    verification = implementation_markdown[match.start():].strip()
556
+    if not implementation:
557
+        implementation = implementation_markdown.strip()
558
+        verification = fallback_verification.strip()
559
+    if not verification:
560
+        verification = fallback_verification.strip()
561
+    if not implementation or not verification:
562
+        return None
563
+    return implementation, verification
564
+
565
+
527566
 def _ensure_heading(markdown: str, heading: str) -> str:
528567
     stripped = markdown.strip()
529568
     if not stripped:
tests/test_dod.pymodified
@@ -103,3 +103,39 @@ def test_record_successful_tool_call_preserves_absolute_path_string(tmp_path: Pa
103103
     )
104104
 
105105
     assert dod.touched_files == [str(absolute_path)]
106
+
107
+
108
+def test_derive_verification_commands_adds_semantic_html_toc_check(tmp_path: Path) -> None:
109
+    chapters = tmp_path / "chapters"
110
+    chapters.mkdir()
111
+    (chapters / "01-introduction.html").write_text(
112
+        "<h1>Chapter 1: Introduction to Fortran</h1>\n"
113
+    )
114
+    index = tmp_path / "index.html"
115
+    index.write_text(
116
+        "\n".join(
117
+            [
118
+                '<ul class="chapter-list">',
119
+                '  <li><a href="chapters/01-introduction.html">Chapter 1: Introduction to Fortran</a></li>',
120
+                "</ul>",
121
+            ]
122
+        )
123
+    )
124
+
125
+    dod = create_definition_of_done(
126
+        "Update index.html so the table of contents links and hrefs are correct."
127
+    )
128
+    dod.acceptance_criteria = [
129
+        "All table of contents links in index.html point to existing chapter files.",
130
+        "All link texts match the actual chapter titles.",
131
+    ]
132
+    dod.touched_files = [str(index)]
133
+
134
+    commands = derive_verification_commands(
135
+        dod,
136
+        project_root=tmp_path,
137
+        task_statement=dod.task_statement,
138
+    )
139
+
140
+    assert any(command.startswith("/usr/bin/python3 - <<'PY'") for command in commands)
141
+    assert not any(command == f"test -f {index}" for command in commands)
tests/test_permissions.pymodified
@@ -15,6 +15,7 @@ from loader.runtime.hooks import (
1515
     HookContext,
1616
     HookManager,
1717
     HookResult,
18
+    SearchPathAliasHook,
1819
 )
1920
 from loader.runtime.permissions import (
2021
     PermissionMode,
@@ -342,3 +343,41 @@ async def test_file_path_alias_hook_canonicalizes_common_aliases(
342343
     assert result.updated_arguments["file_path"] == expected_path
343344
     for alias in ("file", "filepath", "filePath", "filename", "path"):
344345
         assert alias not in result.updated_arguments
346
+
347
+
348
+@pytest.mark.asyncio
349
+@pytest.mark.parametrize(
350
+    ("tool_name", "arguments", "expected_path"),
351
+    [
352
+        ("glob", {"pattern": "*.html", "directory": "chapters"}, "chapters"),
353
+        ("grep", {"pattern": "alpha", "dir": "src"}, "src"),
354
+    ],
355
+)
356
+async def test_search_path_alias_hook_canonicalizes_common_aliases(
357
+    temp_dir: Path,
358
+    tool_name: str,
359
+    arguments: dict[str, object],
360
+    expected_path: str,
361
+) -> None:
362
+    registry = create_default_registry(temp_dir)
363
+    policy = build_permission_policy(
364
+        active_mode=PermissionMode.WORKSPACE_WRITE,
365
+        workspace_root=temp_dir,
366
+        tool_requirements=registry.get_tool_requirements(),
367
+    )
368
+    hook = SearchPathAliasHook()
369
+
370
+    result = await hook.pre_tool_use(
371
+        HookContext(
372
+            tool_call=ToolCall(id=f"{tool_name}-1", name=tool_name, arguments=arguments),
373
+            tool=registry.get(tool_name),
374
+            registry=registry,
375
+            permission_policy=policy,
376
+            source="native",
377
+        )
378
+    )
379
+
380
+    assert result.updated_arguments is not None
381
+    assert result.updated_arguments["path"] == expected_path
382
+    for alias in ("directory", "dir", "folder"):
383
+        assert alias not in result.updated_arguments
tests/test_runtime_harness.pymodified
@@ -1766,6 +1766,59 @@ async def test_duplicate_read_is_skipped_without_intervening_mutation(
17661766
     assert "existing file contents" in run.response
17671767
 
17681768
 
1769
+@pytest.mark.asyncio
1770
+async def test_interleaved_reread_is_allowed_once_without_intervening_mutation(
1771
+    temp_dir: Path,
1772
+) -> None:
1773
+    index_file = temp_dir / "index.html"
1774
+    chapter_file = temp_dir / "chapter-1.html"
1775
+    index_file.write_text("table of contents\n")
1776
+    chapter_file.write_text("chapter body\n")
1777
+
1778
+    backend = ScriptedBackend(
1779
+        completions=[
1780
+            native_tool_response(
1781
+                ToolCall(
1782
+                    id="read-1",
1783
+                    name="read",
1784
+                    arguments={"file_path": str(index_file)},
1785
+                ),
1786
+                content="I'll inspect the index first.",
1787
+            ),
1788
+            native_tool_response(
1789
+                ToolCall(
1790
+                    id="read-2",
1791
+                    name="read",
1792
+                    arguments={"file_path": str(chapter_file)},
1793
+                ),
1794
+                content="I'll inspect the chapter next.",
1795
+            ),
1796
+            native_tool_response(
1797
+                ToolCall(
1798
+                    id="read-3",
1799
+                    name="read",
1800
+                    arguments={"file_path": str(index_file)},
1801
+                ),
1802
+                content="I'll reopen the index to reconcile the findings.",
1803
+            ),
1804
+            final_response("I re-opened the index after checking the chapter."),
1805
+        ]
1806
+    )
1807
+
1808
+    run = await run_scenario(
1809
+        "Inspect the index, inspect a chapter, then return to the index.",
1810
+        backend,
1811
+        config=non_streaming_config(),
1812
+        project_root=temp_dir,
1813
+    )
1814
+
1815
+    assert tool_event_names(run) == ["read", "read", "read"]
1816
+    messages = tool_result_messages(run)
1817
+    assert not any("Skipped - duplicate action" in message for message in messages)
1818
+    assert sum("table of contents" in message for message in messages) == 2
1819
+    assert any("chapter body" in message for message in messages)
1820
+
1821
+
17691822
 @pytest.mark.asyncio
17701823
 async def test_repeated_bash_probe_is_allowed_after_mutation(
17711824
     temp_dir: Path,
tests/test_safeguard_services.pymodified
@@ -88,6 +88,49 @@ def test_action_tracker_blocks_repeated_read_without_changes(tmp_path) -> None:
8888
     assert str(file_path) in reason
8989
 
9090
 
91
+def test_action_tracker_allows_one_interleaved_reread_without_changes(tmp_path) -> None:
92
+    tracker = ActionTracker()
93
+    index_path = tmp_path / "index.html"
94
+    chapter_path = tmp_path / "chapter-1.html"
95
+
96
+    tracker.record_tool_call("read", {"file_path": str(index_path)})
97
+    tracker.record_tool_call("read", {"file_path": str(chapter_path)})
98
+
99
+    assert tracker.check_tool_call("read", {"file_path": str(index_path)}) == (False, "")
100
+
101
+
102
+def test_action_tracker_allows_reading_a_different_slice_of_the_same_file(tmp_path) -> None:
103
+    tracker = ActionTracker()
104
+    index_path = tmp_path / "index.html"
105
+
106
+    tracker.record_tool_call("read", {"file_path": str(index_path)})
107
+
108
+    assert tracker.check_tool_call(
109
+        "read",
110
+        {"file_path": str(index_path), "offset": 1, "limit": 50},
111
+    ) == (False, "")
112
+
113
+
114
+def test_action_tracker_blocks_fourth_interleaved_reread_without_changes(tmp_path) -> None:
115
+    tracker = ActionTracker()
116
+    index_path = tmp_path / "index.html"
117
+    chapter_a = tmp_path / "chapter-1.html"
118
+    chapter_b = tmp_path / "chapter-2.html"
119
+    chapter_c = tmp_path / "chapter-3.html"
120
+
121
+    tracker.record_tool_call("read", {"file_path": str(index_path)})
122
+    tracker.record_tool_call("read", {"file_path": str(chapter_a)})
123
+    tracker.record_tool_call("read", {"file_path": str(index_path)})
124
+    tracker.record_tool_call("read", {"file_path": str(chapter_b)})
125
+    tracker.record_tool_call("read", {"file_path": str(index_path)})
126
+    tracker.record_tool_call("read", {"file_path": str(chapter_c)})
127
+
128
+    is_duplicate, reason = tracker.check_tool_call("read", {"file_path": str(index_path)})
129
+
130
+    assert is_duplicate is True
131
+    assert str(index_path) in reason
132
+
133
+
91134
 def test_action_tracker_allows_repeated_read_after_mutation(tmp_path) -> None:
92135
     tracker = ActionTracker()
93136
     file_path = tmp_path / "index.html"
tests/test_workflow.pymodified
@@ -150,6 +150,39 @@ def test_planning_artifacts_round_trip_and_extract_commands() -> None:
150150
     ]
151151
 
152152
 
153
+def test_planning_artifacts_recover_embedded_verification_from_legacy_separator() -> None:
154
+    artifacts = PlanningArtifacts.from_model_output(
155
+        "\n".join(
156
+            [
157
+                "# Implementation Plan",
158
+                "",
159
+                "## Execution Order",
160
+                "1. Inspect index.html.",
161
+                "2. Fix the chapter links.",
162
+                "",
163
+                "# Verification Plan",
164
+                "",
165
+                "## Acceptance Criteria",
166
+                "- All chapter links point to real files.",
167
+                "",
168
+                "## Verification Commands",
169
+                "- `grep -o 'href=\"[^\"]*\"' index.html`",
170
+                "- `ls chapters`",
171
+                "",
172
+                "<<VERIFICATION>>",
173
+            ]
174
+        ),
175
+        task_statement="Fix the broken chapter links in index.html.",
176
+    )
177
+
178
+    assert "## Verification Commands" not in artifacts.implementation_markdown
179
+    assert "## Verification Commands" in artifacts.verification_markdown
180
+    assert artifacts.verification_commands == [
181
+        "grep -o 'href=\"[^\"]*\"' index.html",
182
+        "ls chapters",
183
+    ]
184
+
185
+
153186
 def test_workflow_artifact_store_and_bridge_round_trip(tmp_path: Path) -> None:
154187
     store = WorkflowArtifactStore(tmp_path)
155188
     brief = ClarifyBrief.fallback(
tests/test_workflow_runtime.pymodified
@@ -72,6 +72,16 @@ def artifact_kinds(run) -> list[str]:
7272
     ]
7373
 
7474
 
75
def verification_commands(run) -> list[str]:
    """Return verification-phase bash commands."""

    commands: list[str] = []
    for event in run.events:
        if event.type != "tool_call" or event.phase != "verification":
            continue
        arguments = event.tool_args or {}
        commands.append(str(arguments.get("command", "")))
    return commands
83
+
84
+
7585
 def workflow_timeline_kinds(run) -> list[str]:
7686
     assert run.agent.last_turn_summary is not None
7787
     return [entry.kind for entry in run.agent.last_turn_summary.workflow_timeline]
@@ -1247,6 +1257,75 @@ async def test_verify_failure_returns_to_execute_without_retriggering_plan(
12471257
     assert "fixed output" in target.read_text()
12481258
 
12491259
 
1260
+@pytest.mark.asyncio
1261
+async def test_plan_mode_recovers_verification_commands_from_legacy_separator(
1262
+    temp_dir: Path,
1263
+) -> None:
1264
+    target = temp_dir / "planned.txt"
1265
+    backend = ScriptedBackend(
1266
+        completions=[
1267
+            CompletionResponse(
1268
+                content="\n".join(
1269
+                    [
1270
+                        "# Implementation Plan",
1271
+                        "",
1272
+                        "## File Changes",
1273
+                        f"- Create {target.name} in the workspace root.",
1274
+                        "",
1275
+                        "## Execution Order",
1276
+                        f"1. Write {target.name}.",
1277
+                        "2. Verify the file exists.",
1278
+                        "",
1279
+                        "## Risks",
1280
+                        "- Losing the verification commands during parsing.",
1281
+                        "",
1282
+                        "# Verification Plan",
1283
+                        "",
1284
+                        "## Acceptance Criteria",
1285
+                        f"- {target.name} exists in the workspace root.",
1286
+                        "",
1287
+                        "## Verification Commands",
1288
+                        f"- `test -f {target}`",
1289
+                        "",
1290
+                        "## Notes",
1291
+                        "- This simulates a legacy separator emitted after the plan body.",
1292
+                        "",
1293
+                        "<<VERIFICATION>>",
1294
+                    ]
1295
+                )
1296
+            ),
1297
+            CompletionResponse(
1298
+                content="I'll create the planned artifact.",
1299
+                tool_calls=[
1300
+                    ToolCall(
1301
+                        id="write-1",
1302
+                        name="write",
1303
+                        arguments={
1304
+                            "file_path": str(target),
1305
+                            "content": "planned output\n",
1306
+                        },
1307
+                    )
1308
+                ],
1309
+            ),
1310
+            CompletionResponse(content="The planned artifact is in place."),
1311
+        ]
1312
+    )
1313
+
1314
+    run = await run_scenario(
1315
+        "Implement a persistent workflow mode router with clarify artifacts, "
1316
+        "planning artifacts, and verification-plan wiring in the runtime.",
1317
+        backend,
1318
+        config=non_streaming_config(),
1319
+        project_root=temp_dir,
1320
+    )
1321
+
1322
+    dod = run.agent.last_turn_summary.definition_of_done
1323
+    assert dod is not None
1324
+    assert dod.verification_commands == [f"test -f {target}"]
1325
+    assert verification_commands(run) == [f"test -f {target}"]
1326
+    assert Path(dod.verification_plan).read_text().count("## Verification Commands") == 1
1327
+
1328
+
12501329
 @pytest.mark.asyncio
12511330
 async def test_stale_plan_artifacts_trigger_targeted_plan_refresh(
12521331
     temp_dir: Path,