Refine qwen recovery loops and verification
- SHA: f0d44903e030e62ea940f004e583842d18ab991c
- Parents: 03423c5
- Tree: dde0785

| Status | File | + | - |
|---|---|---|---|
| M |
src/loader/runtime/dod.py
|
83 | 0 |
| M |
src/loader/runtime/finalization.py
|
9 | 0 |
| M |
src/loader/runtime/hooks.py
|
30 | 0 |
| M |
src/loader/runtime/safeguard_services.py
|
73 | 22 |
| M |
src/loader/runtime/workflow.py
|
41 | 2 |
| M |
tests/test_dod.py
|
36 | 0 |
| M |
tests/test_permissions.py
|
39 | 0 |
| M |
tests/test_runtime_harness.py
|
53 | 0 |
| M |
tests/test_safeguard_services.py
|
43 | 0 |
| M |
tests/test_workflow.py
|
33 | 0 |
| M |
tests/test_workflow_runtime.py
|
79 | 0 |
src/loader/runtime/dod.py (modified) @@ -208,6 +208,11 @@ def derive_verification_commands(
| 208 | 208 | """Generate verification commands from execution history and project shape.""" |
| 209 | 209 | |
| 210 | 210 | commands: list[str] = [] |
| 211 | + semantic_command = _derive_html_toc_verification_command( | |
| 212 | + dod, | |
| 213 | + project_root=project_root, | |
| 214 | + task_statement=task_statement, | |
| 215 | + ) | |
| 211 | 216 | |
| 212 | 217 | explicit = [cmd for cmd in dod.successful_commands if _is_verification_command(cmd)] |
| 213 | 218 | for command in explicit: |
@@ -220,6 +225,9 @@ def derive_verification_commands( | ||
| 220 | 225 | if path.suffix == ".py": |
| 221 | 226 | _append_unique(commands, f"python {shlex.quote(path.name)}") |
| 222 | 227 | |
| 228 | + if semantic_command: | |
| 229 | + _append_unique(commands, semantic_command) | |
| 230 | + | |
| 223 | 231 | if commands: |
| 224 | 232 | return commands |
| 225 | 233 | |
@@ -465,6 +473,81 @@ def _extract_files_from_bash(command: str) -> list[str]: | ||
| 465 | 473 | return [] |
| 466 | 474 | |
| 467 | 475 | |
| 476 | +def _derive_html_toc_verification_command( | |
| 477 | + dod: DefinitionOfDone, | |
| 478 | + *, | |
| 479 | + project_root: Path, | |
| 480 | + task_statement: str, | |
| 481 | +) -> str | None: | |
| 482 | + task_hints = " ".join([task_statement, *dod.acceptance_criteria]).lower() | |
| 483 | + if not any( | |
| 484 | + hint in task_hints | |
| 485 | + for hint in ("href", "link", "links", "table of contents", "chapter title") | |
| 486 | + ): | |
| 487 | + return None | |
| 488 | + | |
| 489 | + for path_str in dod.touched_files: | |
| 490 | + path = Path(path_str) | |
| 491 | + effective_path = path if path.is_absolute() else (project_root / path) | |
| 492 | + if effective_path.name != "index.html" or effective_path.suffix != ".html": | |
| 493 | + continue | |
| 494 | + if not (effective_path.parent / "chapters").is_dir(): | |
| 495 | + continue | |
| 496 | + return _build_html_toc_verification_command(effective_path) | |
| 497 | + return None | |
| 498 | + | |
| 499 | + | |
| 500 | +def _build_html_toc_verification_command(index_path: Path) -> str: | |
| 501 | + path_literal = repr(str(index_path)) | |
| 502 | + return "\n".join( | |
| 503 | + [ | |
| 504 | + "/usr/bin/python3 - <<'PY'", | |
| 505 | + "from pathlib import Path", | |
| 506 | + "import re", | |
| 507 | + "import sys", | |
| 508 | + "", | |
| 509 | + f"index = Path({path_literal}).expanduser()", | |
| 510 | + "root = index.parent", | |
| 511 | + "text = index.read_text()", | |
| 512 | + "section_match = re.search(r'<ul class=\"chapter-list\">(.*?)</ul>', text, re.S)", | |
| 513 | + "if section_match is None:", | |
| 514 | + " print('Missing chapter-list table of contents', file=sys.stderr)", | |
| 515 | + " raise SystemExit(1)", | |
| 516 | + "links = re.findall(r'<a href=\"([^\"]+)\">([^<]+)</a>', section_match.group(1))", | |
| 517 | + "if not links:", | |
| 518 | + " print('No chapter links found in table of contents', file=sys.stderr)", | |
| 519 | + " raise SystemExit(1)", | |
| 520 | + "", | |
| 521 | + "missing = []", | |
| 522 | + "mismatched = []", | |
| 523 | + "for href, label in links:", | |
| 524 | + " target = (root / href).resolve()", | |
| 525 | + " if not target.exists():", | |
| 526 | + " missing.append(f'{href} -> missing')", | |
| 527 | + " continue", | |
| 528 | + " body = target.read_text()", | |
| 529 | + " match = re.search(r'<h1>(.*?)</h1>', body, re.S)", | |
| 530 | + " title = match.group(1).strip() if match else ''", | |
| 531 | + " if title and label.strip() != title:", | |
| 532 | + " mismatched.append(f'{href} -> {label.strip()} != {title}')", | |
| 533 | + "", | |
| 534 | + "if missing or mismatched:", | |
| 535 | + " if missing:", | |
| 536 | + " print('Missing links:', file=sys.stderr)", | |
| 537 | + " for item in missing:", | |
| 538 | + " print(item, file=sys.stderr)", | |
| 539 | + " if mismatched:", | |
| 540 | + " print('Title mismatches:', file=sys.stderr)", | |
| 541 | + " for item in mismatched:", | |
| 542 | + " print(item, file=sys.stderr)", | |
| 543 | + " raise SystemExit(1)", | |
| 544 | + "", | |
| 545 | + "print(f'validated {len(links)} toc links in {index.name}')", | |
| 546 | + "PY", | |
| 547 | + ] | |
| 548 | + ) | |
| 549 | + | |
| 550 | + | |
| 468 | 551 | def _first_non_empty_line(text: str) -> str: |
| 469 | 552 | for line in text.splitlines(): |
| 470 | 553 | stripped = line.strip() |
src/loader/runtime/finalization.py (modified) @@ -216,6 +216,15 @@ class TurnFinalizer:
| 216 | 216 | Path(dod.verification_plan).read_text() |
| 217 | 217 | ) |
| 218 | 218 | |
| 219 | + if ( | |
| 220 | + not dod.verification_commands | |
| 221 | + and dod.implementation_plan | |
| 222 | + and Path(dod.implementation_plan).exists() | |
| 223 | + ): | |
| 224 | + dod.verification_commands = extract_verification_commands_from_markdown( | |
| 225 | + Path(dod.implementation_plan).read_text() | |
| 226 | + ) | |
| 227 | + | |
| 219 | 228 | if not dod.verification_commands: |
| 220 | 229 | dod.verification_commands = derive_verification_commands( |
| 221 | 230 | dod, |
src/loader/runtime/hooks.py (modified) @@ -131,6 +131,35 @@ class FilePathAliasHook(BaseToolHook):
| 131 | 131 | return HookResult() |
| 132 | 132 | |
| 133 | 133 | |
| 134 | +class SearchPathAliasHook(BaseToolHook): | |
| 135 | + """Normalize common search-path aliases before validation and execution.""" | |
| 136 | + | |
| 137 | + _SEARCH_TOOLS = frozenset({"glob", "grep"}) | |
| 138 | + _ALIASES = ("directory", "dir", "folder") | |
| 139 | + | |
| 140 | + async def pre_tool_use(self, context: HookContext) -> HookResult: | |
| 141 | + if context.tool_call.name not in self._SEARCH_TOOLS: | |
| 142 | + return HookResult() | |
| 143 | + | |
| 144 | + arguments = context.tool_call.arguments | |
| 145 | + path = str(arguments.get("path", "")).strip() | |
| 146 | + if path: | |
| 147 | + return HookResult() | |
| 148 | + | |
| 149 | + for alias in self._ALIASES: | |
| 150 | + candidate = arguments.get(alias) | |
| 151 | + if not str(candidate or "").strip(): | |
| 152 | + continue | |
| 153 | + | |
| 154 | + updated_arguments = dict(arguments) | |
| 155 | + updated_arguments["path"] = candidate | |
| 156 | + for cleanup_key in self._ALIASES: | |
| 157 | + updated_arguments.pop(cleanup_key, None) | |
| 158 | + return HookResult(updated_arguments=updated_arguments) | |
| 159 | + | |
| 160 | + return HookResult() | |
| 161 | + | |
| 162 | + | |
| 134 | 163 | class HookManager: |
| 135 | 164 | """Runs tool hooks across Loader's three lifecycle events.""" |
| 136 | 165 | |
@@ -327,6 +356,7 @@ def build_default_tool_hooks( | ||
| 327 | 356 | return HookManager( |
| 328 | 357 | [ |
| 329 | 358 | FilePathAliasHook(), |
| 359 | + SearchPathAliasHook(), | |
| 330 | 360 | DuplicateActionHook(action_tracker), |
| 331 | 361 | ActionValidationHook(validator), |
| 332 | 362 | RollbackTrackingHook(registry, rollback_plan), |
src/loader/runtime/safeguard_services.py (modified) @@ -16,6 +16,9 @@ class ActionTracker:
| 16 | 16 | LOOP_REPEAT_THRESHOLD = 2 |
| 17 | 17 | MAX_RESPONSE_HISTORY = 5 |
| 18 | 18 | OBSERVATION_REPEAT_WINDOW = 8 |
| 19 | + READ_REPEAT_THRESHOLD = 3 | |
| 20 | + SEARCH_REPEAT_THRESHOLD = 2 | |
| 21 | + BASH_OBSERVATION_REPEAT_THRESHOLD = 2 | |
| 19 | 22 | |
| 20 | 23 | def __init__(self) -> None: |
| 21 | 24 | self._file_writes: dict[str, list[str]] = {} |
@@ -26,9 +29,9 @@ class ActionTracker: | ||
| 26 | 29 | self._response_history: list[str] = [] |
| 27 | 30 | self._action_index = 0 |
| 28 | 31 | self._mutation_epoch = 0 |
| 29 | - self._recent_reads: dict[str, tuple[int, int]] = {} | |
| 30 | - self._recent_searches: dict[str, tuple[int, int]] = {} | |
| 31 | - self._recent_bash_observations: dict[str, tuple[int, int]] = {} | |
| 32 | + self._recent_reads: dict[str, tuple[int, int, int]] = {} | |
| 33 | + self._recent_searches: dict[str, tuple[int, int, int]] = {} | |
| 34 | + self._recent_bash_observations: dict[str, tuple[int, int, int]] = {} | |
| 32 | 35 | |
| 33 | 36 | def reset(self) -> None: |
| 34 | 37 | self._file_writes.clear() |
@@ -124,12 +127,17 @@ class ActionTracker: | ||
| 124 | 127 | return True, f"Same patch already applied to: {file_path}" |
| 125 | 128 | |
| 126 | 129 | elif tool_name == "read": |
| 127 | - file_path = str(arguments.get("file_path", "")).strip() | |
| 128 | - if file_path: | |
| 130 | + read_key = self._make_read_key(arguments) | |
| 131 | + if read_key: | |
| 129 | 132 | duplicate, reason = self._check_recent_observation( |
| 130 | 133 | self._recent_reads, |
| 131 | - self._normalize_path(file_path), | |
| 132 | - f"Already read {file_path} recently without any intervening changes", | |
| 134 | + read_key, | |
| 135 | + ( | |
| 136 | + "Already read " | |
| 137 | + f"{str(arguments.get('file_path', '')).strip()} " | |
| 138 | + "recently without any intervening changes" | |
| 139 | + ), | |
| 140 | + repeat_threshold=self.READ_REPEAT_THRESHOLD, | |
| 133 | 141 | ) |
| 134 | 142 | if duplicate: |
| 135 | 143 | return True, reason |
@@ -141,6 +149,7 @@ class ActionTracker: | ||
| 141 | 149 | self._recent_searches, |
| 142 | 150 | observation_key, |
| 143 | 151 | "Already ran the same search recently without any intervening changes", |
| 152 | + repeat_threshold=self.SEARCH_REPEAT_THRESHOLD, | |
| 144 | 153 | ) |
| 145 | 154 | if duplicate: |
| 146 | 155 | return True, reason |
@@ -152,6 +161,7 @@ class ActionTracker: | ||
| 152 | 161 | self._recent_bash_observations, |
| 153 | 162 | self._normalize_command(command), |
| 154 | 163 | "Already ran the same read-only shell probe recently without any intervening changes", |
| 164 | + repeat_threshold=self.BASH_OBSERVATION_REPEAT_THRESHOLD, | |
| 155 | 165 | ) |
| 156 | 166 | if duplicate: |
| 157 | 167 | return True, reason |
@@ -190,19 +200,19 @@ class ActionTracker: | ||
| 190 | 200 | self._note_mutation() |
| 191 | 201 | |
| 192 | 202 | elif tool_name == "read": |
| 193 | - file_path = str(arguments.get("file_path", "")).strip() | |
| 194 | - if file_path: | |
| 195 | - self._recent_reads[self._normalize_path(file_path)] = ( | |
| 196 | - self._mutation_epoch, | |
| 197 | - self._action_index, | |
| 203 | + read_key = self._make_read_key(arguments) | |
| 204 | + if read_key: | |
| 205 | + self._record_observation( | |
| 206 | + self._recent_reads, | |
| 207 | + read_key, | |
| 198 | 208 | ) |
| 199 | 209 | |
| 200 | 210 | elif tool_name in {"glob", "grep"}: |
| 201 | 211 | observation_key = self._make_search_key(tool_name, arguments) |
| 202 | 212 | if observation_key: |
| 203 | - self._recent_searches[observation_key] = ( | |
| 204 | - self._mutation_epoch, | |
| 205 | - self._action_index, | |
| 213 | + self._record_observation( | |
| 214 | + self._recent_searches, | |
| 215 | + observation_key, | |
| 206 | 216 | ) |
| 207 | 217 | |
| 208 | 218 | elif tool_name == "bash": |
@@ -212,9 +222,9 @@ class ActionTracker: | ||
| 212 | 222 | if self._is_mutating_bash(command): |
| 213 | 223 | self._note_mutation() |
| 214 | 224 | elif self._is_observational_bash(command): |
| 215 | - self._recent_bash_observations[self._normalize_command(command)] = ( | |
| 216 | - self._mutation_epoch, | |
| 217 | - self._action_index, | |
| 225 | + self._record_observation( | |
| 226 | + self._recent_bash_observations, | |
| 227 | + self._normalize_command(command), | |
| 218 | 228 | ) |
| 219 | 229 | |
| 220 | 230 | def detect_loop(self) -> tuple[bool, str]: |
@@ -299,20 +309,49 @@ class ActionTracker: | ||
| 299 | 309 | |
| 300 | 310 | def _check_recent_observation( |
| 301 | 311 | self, |
| 302 | - cache: dict[str, tuple[int, int]], | |
| 312 | + cache: dict[str, tuple[int, int, int]], | |
| 303 | 313 | key: str, |
| 304 | 314 | reason: str, |
| 315 | + *, | |
| 316 | + repeat_threshold: int, | |
| 305 | 317 | ) -> tuple[bool, str]: |
| 306 | 318 | last_seen = cache.get(key) |
| 307 | 319 | if last_seen is None: |
| 308 | 320 | return False, "" |
| 309 | 321 | |
| 310 | - last_epoch, last_index = last_seen | |
| 322 | + last_epoch, last_index, repeat_count = last_seen | |
| 311 | 323 | if last_epoch != self._mutation_epoch: |
| 312 | 324 | return False, "" |
| 313 | - if (self._action_index - last_index) > self.OBSERVATION_REPEAT_WINDOW: | |
| 325 | + gap = self._action_index - last_index | |
| 326 | + if gap > self.OBSERVATION_REPEAT_WINDOW: | |
| 314 | 327 | return False, "" |
| 315 | - return True, reason | |
| 328 | + if gap <= 0: | |
| 329 | + return True, reason | |
| 330 | + if repeat_count >= repeat_threshold: | |
| 331 | + return True, reason | |
| 332 | + return False, "" | |
| 333 | + | |
| 334 | + def _record_observation( | |
| 335 | + self, | |
| 336 | + cache: dict[str, tuple[int, int, int]], | |
| 337 | + key: str, | |
| 338 | + ) -> None: | |
| 339 | + last_seen = cache.get(key) | |
| 340 | + if last_seen is None: | |
| 341 | + cache[key] = (self._mutation_epoch, self._action_index, 1) | |
| 342 | + return | |
| 343 | + | |
| 344 | + last_epoch, last_index, repeat_count = last_seen | |
| 345 | + gap = self._action_index - last_index | |
| 346 | + if last_epoch != self._mutation_epoch or gap > self.OBSERVATION_REPEAT_WINDOW: | |
| 347 | + cache[key] = (self._mutation_epoch, self._action_index, 1) | |
| 348 | + return | |
| 349 | + | |
| 350 | + cache[key] = ( | |
| 351 | + self._mutation_epoch, | |
| 352 | + self._action_index, | |
| 353 | + repeat_count + 1, | |
| 354 | + ) | |
| 316 | 355 | |
| 317 | 356 | def _make_search_key(self, tool_name: str, arguments: dict) -> str | None: |
| 318 | 357 | pattern = str(arguments.get("pattern", "")).strip() |
@@ -322,6 +361,18 @@ class ActionTracker: | ||
| 322 | 361 | normalized_path = self._normalize_path(path) if path else "" |
| 323 | 362 | return f"{tool_name}:{normalized_path}:{pattern}" |
| 324 | 363 | |
| 364 | + def _make_read_key(self, arguments: dict) -> str | None: | |
| 365 | + file_path = str(arguments.get("file_path", "")).strip() | |
| 366 | + if not file_path: | |
| 367 | + return None | |
| 368 | + offset = str(arguments.get("offset", "")).strip() | |
| 369 | + limit = str(arguments.get("limit", "")).strip() | |
| 370 | + return ( | |
| 371 | + f"{self._normalize_path(file_path)}" | |
| 372 | + f":offset={offset or 'full'}" | |
| 373 | + f":limit={limit or 'all'}" | |
| 374 | + ) | |
| 375 | + | |
| 325 | 376 | def _is_observational_bash(self, command: str) -> bool: |
| 326 | 377 | norm_cmd = self._normalize_command(command) |
| 327 | 378 | if not norm_cmd: |
src/loader/runtime/workflow.py (modified) @@ -52,6 +52,10 @@ __all__ = [
| 52 | 52 | ] |
| 53 | 53 | |
| 54 | 54 | VERIFICATION_SEPARATOR = "<<<VERIFICATION>>>" |
| 55 | +_VERIFICATION_SEPARATORS = ( | |
| 56 | + VERIFICATION_SEPARATOR, | |
| 57 | + "<<VERIFICATION>>", | |
| 58 | +) | |
| 55 | 59 | _GENERIC_TOUCHPOINTS = { |
| 56 | 60 | "Determine the concrete files during execution.", |
| 57 | 61 | "Identify exact files during planning or execution.", |
@@ -518,12 +522,47 @@ def build_execute_bridge( | ||
| 518 | 522 | |
| 519 | 523 | |
| 520 | 524 | def _split_plan_output(model_output: str) -> tuple[str, str]: |
| 521 | - if VERIFICATION_SEPARATOR in model_output: | |
| 522 | - implementation, verification = model_output.split(VERIFICATION_SEPARATOR, maxsplit=1) | |
| 525 | + for separator in _VERIFICATION_SEPARATORS: | |
| 526 | + if separator not in model_output: | |
| 527 | + continue | |
| 528 | + implementation, verification = model_output.split(separator, maxsplit=1) | |
| 529 | + split = _split_embedded_verification_heading( | |
| 530 | + implementation.strip(), | |
| 531 | + fallback_verification=verification.strip(), | |
| 532 | + ) | |
| 533 | + if split is not None: | |
| 534 | + return split | |
| 523 | 535 | return implementation.strip(), verification.strip() |
| 536 | + | |
| 537 | + split = _split_embedded_verification_heading(model_output.strip()) | |
| 538 | + if split is not None: | |
| 539 | + return split | |
| 524 | 540 | return model_output.strip(), "" |
| 525 | 541 | |
| 526 | 542 | |
| 543 | +def _split_embedded_verification_heading( | |
| 544 | + implementation_markdown: str, | |
| 545 | + *, | |
| 546 | + fallback_verification: str = "", | |
| 547 | +) -> tuple[str, str] | None: | |
| 548 | + match = re.search(r"(?m)^#\s+Verification Plan\s*$", implementation_markdown) | |
| 549 | + if match is None: | |
| 550 | + if fallback_verification.strip(): | |
| 551 | + return implementation_markdown, fallback_verification.strip() | |
| 552 | + return None | |
| 553 | + | |
| 554 | + implementation = implementation_markdown[:match.start()].rstrip() | |
| 555 | + verification = implementation_markdown[match.start():].strip() | |
| 556 | + if not implementation: | |
| 557 | + implementation = implementation_markdown.strip() | |
| 558 | + verification = fallback_verification.strip() | |
| 559 | + if not verification: | |
| 560 | + verification = fallback_verification.strip() | |
| 561 | + if not implementation or not verification: | |
| 562 | + return None | |
| 563 | + return implementation, verification | |
| 564 | + | |
| 565 | + | |
| 527 | 566 | def _ensure_heading(markdown: str, heading: str) -> str: |
| 528 | 567 | stripped = markdown.strip() |
| 529 | 568 | if not stripped: |
tests/test_dod.py (modified) @@ -103,3 +103,39 @@ def test_record_successful_tool_call_preserves_absolute_path_string(tmp_path: Path
| 103 | 103 | ) |
| 104 | 104 | |
| 105 | 105 | assert dod.touched_files == [str(absolute_path)] |
| 106 | + | |
| 107 | + | |
| 108 | +def test_derive_verification_commands_adds_semantic_html_toc_check(tmp_path: Path) -> None: | |
| 109 | + chapters = tmp_path / "chapters" | |
| 110 | + chapters.mkdir() | |
| 111 | + (chapters / "01-introduction.html").write_text( | |
| 112 | + "<h1>Chapter 1: Introduction to Fortran</h1>\n" | |
| 113 | + ) | |
| 114 | + index = tmp_path / "index.html" | |
| 115 | + index.write_text( | |
| 116 | + "\n".join( | |
| 117 | + [ | |
| 118 | + '<ul class="chapter-list">', | |
| 119 | + ' <li><a href="chapters/01-introduction.html">Chapter 1: Introduction to Fortran</a></li>', | |
| 120 | + "</ul>", | |
| 121 | + ] | |
| 122 | + ) | |
| 123 | + ) | |
| 124 | + | |
| 125 | + dod = create_definition_of_done( | |
| 126 | + "Update index.html so the table of contents links and hrefs are correct." | |
| 127 | + ) | |
| 128 | + dod.acceptance_criteria = [ | |
| 129 | + "All table of contents links in index.html point to existing chapter files.", | |
| 130 | + "All link texts match the actual chapter titles.", | |
| 131 | + ] | |
| 132 | + dod.touched_files = [str(index)] | |
| 133 | + | |
| 134 | + commands = derive_verification_commands( | |
| 135 | + dod, | |
| 136 | + project_root=tmp_path, | |
| 137 | + task_statement=dod.task_statement, | |
| 138 | + ) | |
| 139 | + | |
| 140 | + assert any(command.startswith("/usr/bin/python3 - <<'PY'") for command in commands) | |
| 141 | + assert not any(command == f"test -f {index}" for command in commands) | |
tests/test_permissions.py (modified) @@ -15,6 +15,7 @@ from loader.runtime.hooks import (
| 15 | 15 | HookContext, |
| 16 | 16 | HookManager, |
| 17 | 17 | HookResult, |
| 18 | + SearchPathAliasHook, | |
| 18 | 19 | ) |
| 19 | 20 | from loader.runtime.permissions import ( |
| 20 | 21 | PermissionMode, |
@@ -342,3 +343,41 @@ async def test_file_path_alias_hook_canonicalizes_common_aliases( | ||
| 342 | 343 | assert result.updated_arguments["file_path"] == expected_path |
| 343 | 344 | for alias in ("file", "filepath", "filePath", "filename", "path"): |
| 344 | 345 | assert alias not in result.updated_arguments |
| 346 | + | |
| 347 | + | |
| 348 | +@pytest.mark.asyncio | |
| 349 | +@pytest.mark.parametrize( | |
| 350 | + ("tool_name", "arguments", "expected_path"), | |
| 351 | + [ | |
| 352 | + ("glob", {"pattern": "*.html", "directory": "chapters"}, "chapters"), | |
| 353 | + ("grep", {"pattern": "alpha", "dir": "src"}, "src"), | |
| 354 | + ], | |
| 355 | +) | |
| 356 | +async def test_search_path_alias_hook_canonicalizes_common_aliases( | |
| 357 | + temp_dir: Path, | |
| 358 | + tool_name: str, | |
| 359 | + arguments: dict[str, object], | |
| 360 | + expected_path: str, | |
| 361 | +) -> None: | |
| 362 | + registry = create_default_registry(temp_dir) | |
| 363 | + policy = build_permission_policy( | |
| 364 | + active_mode=PermissionMode.WORKSPACE_WRITE, | |
| 365 | + workspace_root=temp_dir, | |
| 366 | + tool_requirements=registry.get_tool_requirements(), | |
| 367 | + ) | |
| 368 | + hook = SearchPathAliasHook() | |
| 369 | + | |
| 370 | + result = await hook.pre_tool_use( | |
| 371 | + HookContext( | |
| 372 | + tool_call=ToolCall(id=f"{tool_name}-1", name=tool_name, arguments=arguments), | |
| 373 | + tool=registry.get(tool_name), | |
| 374 | + registry=registry, | |
| 375 | + permission_policy=policy, | |
| 376 | + source="native", | |
| 377 | + ) | |
| 378 | + ) | |
| 379 | + | |
| 380 | + assert result.updated_arguments is not None | |
| 381 | + assert result.updated_arguments["path"] == expected_path | |
| 382 | + for alias in ("directory", "dir", "folder"): | |
| 383 | + assert alias not in result.updated_arguments | |
tests/test_runtime_harness.py (modified) @@ -1766,6 +1766,59 @@ async def test_duplicate_read_is_skipped_without_intervening_mutation(
| 1766 | 1766 | assert "existing file contents" in run.response |
| 1767 | 1767 | |
| 1768 | 1768 | |
| 1769 | +@pytest.mark.asyncio | |
| 1770 | +async def test_interleaved_reread_is_allowed_once_without_intervening_mutation( | |
| 1771 | + temp_dir: Path, | |
| 1772 | +) -> None: | |
| 1773 | + index_file = temp_dir / "index.html" | |
| 1774 | + chapter_file = temp_dir / "chapter-1.html" | |
| 1775 | + index_file.write_text("table of contents\n") | |
| 1776 | + chapter_file.write_text("chapter body\n") | |
| 1777 | + | |
| 1778 | + backend = ScriptedBackend( | |
| 1779 | + completions=[ | |
| 1780 | + native_tool_response( | |
| 1781 | + ToolCall( | |
| 1782 | + id="read-1", | |
| 1783 | + name="read", | |
| 1784 | + arguments={"file_path": str(index_file)}, | |
| 1785 | + ), | |
| 1786 | + content="I'll inspect the index first.", | |
| 1787 | + ), | |
| 1788 | + native_tool_response( | |
| 1789 | + ToolCall( | |
| 1790 | + id="read-2", | |
| 1791 | + name="read", | |
| 1792 | + arguments={"file_path": str(chapter_file)}, | |
| 1793 | + ), | |
| 1794 | + content="I'll inspect the chapter next.", | |
| 1795 | + ), | |
| 1796 | + native_tool_response( | |
| 1797 | + ToolCall( | |
| 1798 | + id="read-3", | |
| 1799 | + name="read", | |
| 1800 | + arguments={"file_path": str(index_file)}, | |
| 1801 | + ), | |
| 1802 | + content="I'll reopen the index to reconcile the findings.", | |
| 1803 | + ), | |
| 1804 | + final_response("I re-opened the index after checking the chapter."), | |
| 1805 | + ] | |
| 1806 | + ) | |
| 1807 | + | |
| 1808 | + run = await run_scenario( | |
| 1809 | + "Inspect the index, inspect a chapter, then return to the index.", | |
| 1810 | + backend, | |
| 1811 | + config=non_streaming_config(), | |
| 1812 | + project_root=temp_dir, | |
| 1813 | + ) | |
| 1814 | + | |
| 1815 | + assert tool_event_names(run) == ["read", "read", "read"] | |
| 1816 | + messages = tool_result_messages(run) | |
| 1817 | + assert not any("Skipped - duplicate action" in message for message in messages) | |
| 1818 | + assert sum("table of contents" in message for message in messages) == 2 | |
| 1819 | + assert any("chapter body" in message for message in messages) | |
| 1820 | + | |
| 1821 | + | |
| 1769 | 1822 | @pytest.mark.asyncio |
| 1770 | 1823 | async def test_repeated_bash_probe_is_allowed_after_mutation( |
| 1771 | 1824 | temp_dir: Path, |
tests/test_safeguard_services.py (modified) @@ -88,6 +88,49 @@ def test_action_tracker_blocks_repeated_read_without_changes(tmp_path) -> None:
| 88 | 88 | assert str(file_path) in reason |
| 89 | 89 | |
| 90 | 90 | |
| 91 | +def test_action_tracker_allows_one_interleaved_reread_without_changes(tmp_path) -> None: | |
| 92 | + tracker = ActionTracker() | |
| 93 | + index_path = tmp_path / "index.html" | |
| 94 | + chapter_path = tmp_path / "chapter-1.html" | |
| 95 | + | |
| 96 | + tracker.record_tool_call("read", {"file_path": str(index_path)}) | |
| 97 | + tracker.record_tool_call("read", {"file_path": str(chapter_path)}) | |
| 98 | + | |
| 99 | + assert tracker.check_tool_call("read", {"file_path": str(index_path)}) == (False, "") | |
| 100 | + | |
| 101 | + | |
| 102 | +def test_action_tracker_allows_reading_a_different_slice_of_the_same_file(tmp_path) -> None: | |
| 103 | + tracker = ActionTracker() | |
| 104 | + index_path = tmp_path / "index.html" | |
| 105 | + | |
| 106 | + tracker.record_tool_call("read", {"file_path": str(index_path)}) | |
| 107 | + | |
| 108 | + assert tracker.check_tool_call( | |
| 109 | + "read", | |
| 110 | + {"file_path": str(index_path), "offset": 1, "limit": 50}, | |
| 111 | + ) == (False, "") | |
| 112 | + | |
| 113 | + | |
| 114 | +def test_action_tracker_blocks_fourth_interleaved_reread_without_changes(tmp_path) -> None: | |
| 115 | + tracker = ActionTracker() | |
| 116 | + index_path = tmp_path / "index.html" | |
| 117 | + chapter_a = tmp_path / "chapter-1.html" | |
| 118 | + chapter_b = tmp_path / "chapter-2.html" | |
| 119 | + chapter_c = tmp_path / "chapter-3.html" | |
| 120 | + | |
| 121 | + tracker.record_tool_call("read", {"file_path": str(index_path)}) | |
| 122 | + tracker.record_tool_call("read", {"file_path": str(chapter_a)}) | |
| 123 | + tracker.record_tool_call("read", {"file_path": str(index_path)}) | |
| 124 | + tracker.record_tool_call("read", {"file_path": str(chapter_b)}) | |
| 125 | + tracker.record_tool_call("read", {"file_path": str(index_path)}) | |
| 126 | + tracker.record_tool_call("read", {"file_path": str(chapter_c)}) | |
| 127 | + | |
| 128 | + is_duplicate, reason = tracker.check_tool_call("read", {"file_path": str(index_path)}) | |
| 129 | + | |
| 130 | + assert is_duplicate is True | |
| 131 | + assert str(index_path) in reason | |
| 132 | + | |
| 133 | + | |
| 91 | 134 | def test_action_tracker_allows_repeated_read_after_mutation(tmp_path) -> None: |
| 92 | 135 | tracker = ActionTracker() |
| 93 | 136 | file_path = tmp_path / "index.html" |
tests/test_workflow.py (modified) @@ -150,6 +150,39 @@ def test_planning_artifacts_round_trip_and_extract_commands() -> None:
| 150 | 150 | ] |
| 151 | 151 | |
| 152 | 152 | |
| 153 | +def test_planning_artifacts_recover_embedded_verification_from_legacy_separator() -> None: | |
| 154 | + artifacts = PlanningArtifacts.from_model_output( | |
| 155 | + "\n".join( | |
| 156 | + [ | |
| 157 | + "# Implementation Plan", | |
| 158 | + "", | |
| 159 | + "## Execution Order", | |
| 160 | + "1. Inspect index.html.", | |
| 161 | + "2. Fix the chapter links.", | |
| 162 | + "", | |
| 163 | + "# Verification Plan", | |
| 164 | + "", | |
| 165 | + "## Acceptance Criteria", | |
| 166 | + "- All chapter links point to real files.", | |
| 167 | + "", | |
| 168 | + "## Verification Commands", | |
| 169 | + "- `grep -o 'href=\"[^\"]*\"' index.html`", | |
| 170 | + "- `ls chapters`", | |
| 171 | + "", | |
| 172 | + "<<VERIFICATION>>", | |
| 173 | + ] | |
| 174 | + ), | |
| 175 | + task_statement="Fix the broken chapter links in index.html.", | |
| 176 | + ) | |
| 177 | + | |
| 178 | + assert "## Verification Commands" not in artifacts.implementation_markdown | |
| 179 | + assert "## Verification Commands" in artifacts.verification_markdown | |
| 180 | + assert artifacts.verification_commands == [ | |
| 181 | + "grep -o 'href=\"[^\"]*\"' index.html", | |
| 182 | + "ls chapters", | |
| 183 | + ] | |
| 184 | + | |
| 185 | + | |
| 153 | 186 | def test_workflow_artifact_store_and_bridge_round_trip(tmp_path: Path) -> None: |
| 154 | 187 | store = WorkflowArtifactStore(tmp_path) |
| 155 | 188 | brief = ClarifyBrief.fallback( |
tests/test_workflow_runtime.py (modified) @@ -72,6 +72,16 @@ def artifact_kinds(run) -> list[str]:
| 72 | 72 | ] |
| 73 | 73 | |
| 74 | 74 | |
| 75 | +def verification_commands(run) -> list[str]: | |
| 76 | + """Return verification-phase bash commands.""" | |
| 77 | + | |
| 78 | + return [ | |
| 79 | + str((event.tool_args or {}).get("command", "")) | |
| 80 | + for event in run.events | |
| 81 | + if event.type == "tool_call" and event.phase == "verification" | |
| 82 | + ] | |
| 83 | + | |
| 84 | + | |
| 75 | 85 | def workflow_timeline_kinds(run) -> list[str]: |
| 76 | 86 | assert run.agent.last_turn_summary is not None |
| 77 | 87 | return [entry.kind for entry in run.agent.last_turn_summary.workflow_timeline] |
@@ -1247,6 +1257,75 @@ async def test_verify_failure_returns_to_execute_without_retriggering_plan( | ||
| 1247 | 1257 | assert "fixed output" in target.read_text() |
| 1248 | 1258 | |
| 1249 | 1259 | |
| 1260 | +@pytest.mark.asyncio | |
| 1261 | +async def test_plan_mode_recovers_verification_commands_from_legacy_separator( | |
| 1262 | + temp_dir: Path, | |
| 1263 | +) -> None: | |
| 1264 | + target = temp_dir / "planned.txt" | |
| 1265 | + backend = ScriptedBackend( | |
| 1266 | + completions=[ | |
| 1267 | + CompletionResponse( | |
| 1268 | + content="\n".join( | |
| 1269 | + [ | |
| 1270 | + "# Implementation Plan", | |
| 1271 | + "", | |
| 1272 | + "## File Changes", | |
| 1273 | + f"- Create {target.name} in the workspace root.", | |
| 1274 | + "", | |
| 1275 | + "## Execution Order", | |
| 1276 | + f"1. Write {target.name}.", | |
| 1277 | + "2. Verify the file exists.", | |
| 1278 | + "", | |
| 1279 | + "## Risks", | |
| 1280 | + "- Losing the verification commands during parsing.", | |
| 1281 | + "", | |
| 1282 | + "# Verification Plan", | |
| 1283 | + "", | |
| 1284 | + "## Acceptance Criteria", | |
| 1285 | + f"- {target.name} exists in the workspace root.", | |
| 1286 | + "", | |
| 1287 | + "## Verification Commands", | |
| 1288 | + f"- `test -f {target}`", | |
| 1289 | + "", | |
| 1290 | + "## Notes", | |
| 1291 | + "- This simulates a legacy separator emitted after the plan body.", | |
| 1292 | + "", | |
| 1293 | + "<<VERIFICATION>>", | |
| 1294 | + ] | |
| 1295 | + ) | |
| 1296 | + ), | |
| 1297 | + CompletionResponse( | |
| 1298 | + content="I'll create the planned artifact.", | |
| 1299 | + tool_calls=[ | |
| 1300 | + ToolCall( | |
| 1301 | + id="write-1", | |
| 1302 | + name="write", | |
| 1303 | + arguments={ | |
| 1304 | + "file_path": str(target), | |
| 1305 | + "content": "planned output\n", | |
| 1306 | + }, | |
| 1307 | + ) | |
| 1308 | + ], | |
| 1309 | + ), | |
| 1310 | + CompletionResponse(content="The planned artifact is in place."), | |
| 1311 | + ] | |
| 1312 | + ) | |
| 1313 | + | |
| 1314 | + run = await run_scenario( | |
| 1315 | + "Implement a persistent workflow mode router with clarify artifacts, " | |
| 1316 | + "planning artifacts, and verification-plan wiring in the runtime.", | |
| 1317 | + backend, | |
| 1318 | + config=non_streaming_config(), | |
| 1319 | + project_root=temp_dir, | |
| 1320 | + ) | |
| 1321 | + | |
| 1322 | + dod = run.agent.last_turn_summary.definition_of_done | |
| 1323 | + assert dod is not None | |
| 1324 | + assert dod.verification_commands == [f"test -f {target}"] | |
| 1325 | + assert verification_commands(run) == [f"test -f {target}"] | |
| 1326 | + assert Path(dod.verification_plan).read_text().count("## Verification Commands") == 1 | |
| 1327 | + | |
| 1328 | + | |
| 1250 | 1329 | @pytest.mark.asyncio |
| 1251 | 1330 | async def test_stale_plan_artifacts_trigger_targeted_plan_refresh( |
| 1252 | 1331 | temp_dir: Path, |