Strengthen qwen progress and verification handoffs

Status	File	+	-
M	`src/loader/runtime/artifact_invalidation.py`	40	3
M	`src/loader/runtime/dod.py`	28	0
M	`src/loader/runtime/finalization.py`	1	0
M	`src/loader/runtime/tool_batches.py`	34	1
M	`src/loader/runtime/workflow.py`	170	1
M	`src/loader/runtime/workflow_lanes.py`	44	0
M	`src/loader/runtime/workflow_recovery.py`	19	0
M	`tests/test_artifact_invalidation.py`	25	0
M	`tests/test_dod.py`	23	0
M	`tests/test_finalization.py`	47	0
M	`tests/test_tool_batches.py`	95	0
M	`tests/test_workflow.py`	151	0
M	`tests/test_workflow_runtime.py`	15	0

src/loader/runtime/artifact_invalidation.py modified

          unexpected_paths = [
              name
              for path in touched_files
--            if (name := _path_name(path)) and name.lower() not in plan_text
++            if (name := _path_name(path)) and not _text_covers_path_reference(plan_text, path)
+         ]
          confirmed_touchpoints = [
              name
              for path in touched_files
              if (name := _path_name(path))
+         ]
++        confirmed_touchpoint_keys = {
++            _path_reference_identity(path)
++            for path in touched_files
++            if _path_reference_identity(path)
++        }
          inferred_touchpoints = [
              item
              for item in _extract_path_mentions(
                  implementation_text,
                  verification_text,
+             )
--            if _path_name(item) not in confirmed_touchpoints
++            if _path_reference_identity(item) not in confirmed_touchpoint_keys
+         ]
          stale_plan = False
          stale_brief = False
+                     )
              out_of_brief_paths = [
--                name for name in unexpected_paths if name.lower() not in brief_text
++                name
++                for path in touched_files
++                if (name := _path_name(path))
++                and name in unexpected_paths
++                and not _text_covers_path_reference(brief_text, path)
+             ]
              if out_of_brief_paths:
                  stale_brief = True
      return normalized.rsplit("/", maxsplit=1)[-1].strip()
++def _path_reference_identity(path: str) -> str:
++    normalized = _path_name(path)
++    if not normalized:
++        return ""
++    return _canonical_path_reference(normalized)
++
++
++def _text_covers_path_reference(text: str, path: str) -> bool:
++    normalized_text = text.lower()
++    candidates = [candidate for candidate in (str(path).strip(), _path_name(path)) if candidate]
++
++    for candidate in candidates:
++        if candidate.lower() in normalized_text:
++            return True
++
++    canonical_text = _canonical_path_reference(text)
++    return any(
++        canonical_candidate and canonical_candidate in canonical_text
++        for canonical_candidate in (_canonical_path_reference(candidate) for candidate in candidates)
++    )
++
++
++def _canonical_path_reference(value: str) -> str:
++    normalized = value.lower().strip()
++    normalized = re.sub(r"[^a-z0-9]+", " ", normalized)
++    return " ".join(normalized.split())
++
++
  def _text_covers_requirement(text: str, requirement: str) -> bool:
      normalized_text = text.lower()
      normalized_requirement = requirement.lower()

src/loader/runtime/dod.py modified

      *,
      project_root: Path,
      task_statement: str,
++    supplement_existing: bool = False,
  ) -> list[str]:
      """Generate verification commands from execution history and project shape."""
      if commands:
          return commands
++    if supplement_existing:
++        return commands
      if dod.task_size == "small":
          for path_str in dod.touched_files[:3]:
                      commands,
                      f"python -m py_compile {shlex.quote(str(effective_path))}",
+                 )
++    elif _uses_external_artifacts_only(dod, project_root=project_root):
++        for path_str in dod.touched_files[:3]:
++            path = Path(path_str)
++            effective_path = path if path.is_absolute() else (project_root / path)
++            _append_unique(commands, f"test -f {shlex.quote(str(effective_path))}")
      else:
          if (project_root / "pyproject.toml").exists():
              _append_unique(commands, "uv run pytest -q")
          items.append(value)
++def _uses_external_artifacts_only(dod: DefinitionOfDone, *, project_root: Path) -> bool:
++    touched = [Path(path) for path in dod.touched_files if str(path).strip()]
++    if not touched:
++        return False
++    try:
++        root = project_root.resolve()
++    except FileNotFoundError:
++        root = project_root
++    external = [path for path in touched if not _path_is_within_root(path, root)]
++    return bool(external) and len(external) == len(touched)
++
++
++def _path_is_within_root(path: Path, root: Path) -> bool:
++    try:
++        path.resolve().relative_to(root)
++        return True
++    except ValueError:
++        return False
++
++
  def synthesize_todo_items(dod: DefinitionOfDone) -> list[dict[str, str]]:
      """Build a todo item list from the current DoD state.

src/loader/runtime/finalization.py modified


                 dod,
                 project_root=self.context.project_root,
                 task_statement=dod.task_statement,
+                supplement_existing=True,
             ):
                 if command not in dod.verification_commands:
                     dod.verification_commands.append(command)

src/loader/runtime/tool_batches.py modified

      "Complete the requested work",
      _VERIFY_ITEM,
+ }
++_MUTATION_TODO_HINTS = (
++    "create",
++    "update",
++    "edit",
++    "write",
++    "fix",
++    "modify",
++    "change",
++    "patch",
++    "replace",
++    "correct",
++    "rewrite",
++)
  @dataclass
              max_items=2,
+         )
          if next_pending and not html_toc_rule.task_targets_html_toc(current_task):
++            mutation_suffix = ""
++            if _todo_is_mutation_step(next_pending):
++                mutation_suffix = (
++                    " You already have enough evidence for that step, so stop gathering "
++                    "more reference material and perform the change now."
++                )
              if confirmed_facts:
                  self.context.queue_steering_message(
                      "Reuse the earlier observation instead of repeating it. "
                      f"Confirmed facts: {confirmed_facts}. "
                      f"Continue with the next pending item: `{next_pending}`. "
                      "Only gather more evidence if a specific fact required for that step is still unknown."
++                    + mutation_suffix
+                 )
              else:
                  self.context.queue_steering_message(
                      "Reuse the earlier observation instead of repeating it. "
                      f"Continue with the next pending item: `{next_pending}`. "
                      "Only gather more evidence if a specific fact required for that step is still unknown."
++                    + mutation_suffix
+                 )
              return
          if not completed_label or not next_pending or next_pending == completed_label:
              return
++        mutation_suffix = ""
++        if _todo_is_mutation_step(next_pending):
++            mutation_suffix = (
++                " You already have enough evidence for that step, so stop gathering "
++                "more reference material and perform the change now."
++            )
++
          self.context.queue_steering_message(
              f"Confirmed progress: `{completed_label}` is now satisfied by the successful "
              f"`{tool_call.name}` result. Continue with the next pending item: "
--            f"`{next_pending}` instead of rereading the same evidence."
++            f"`{next_pending}` instead of rereading the same evidence.{mutation_suffix}"
+         )
          dod.pending_items.append(_VERIFY_ITEM)
++def _todo_is_mutation_step(label: str) -> bool:
++    lowered = label.lower()
++    return any(token in lowered for token in _MUTATION_TODO_HINTS)
++
++
  def _mark_verification_planned(
      *,
      context: RuntimeContext,

src/loader/runtime/workflow.py modified

      "confirm",
      "check",
+ )
++_AGGREGATE_TODO_HINTS = (
++    "each ",
++    "all ",
++    "every ",
++    "sequence",
++    "multiple ",
++    "across ",
++    "consistently",
++    "properly linked",
++    "directory structure",
++)
++_ACTIONABLE_STEP_VERBS = {
++    "add",
++    "apply",
++    "build",
++    "check",
++    "confirm",
++    "create",
++    "document",
++    "edit",
++    "ensure",
++    "fix",
++    "implement",
++    "inspect",
++    "list",
++    "move",
++    "parse",
++    "patch",
++    "read",
++    "refactor",
++    "remove",
++    "rename",
++    "reorder",
++    "rerun",
++    "re-run",
++    "review",
++    "run",
++    "search",
++    "test",
++    "update",
++    "validate",
++    "verify",
++    "write",
++}
++_RETROSPECTIVE_STEP_VERBS = {
++    "added",
++    "applied",
++    "built",
++    "checked",
++    "completed",
++    "confirmed",
++    "created",
++    "edited",
++    "ensured",
++    "examined",
++    "generated",
++    "implemented",
++    "inspected",
++    "listed",
++    "looked",
++    "parsed",
++    "patched",
++    "read",
++    "reviewed",
++    "updated",
++    "validated",
++    "verified",
++    "wrote",
++}
  _TASK_COVERAGE_STOP_WORDS = {
      "the",
      "and",
              implementation_steps=list(self.implementation_steps),
+         )
++    def with_progress_context(
++        self,
++        *,
++        touched_files: list[str],
++        completed_items: list[str],
++    ) -> PlanningArtifacts:
++        """Return one copy that preserves already-confirmed execution progress."""
++
++        progress_items: list[str] = []
++        for raw_path in touched_files:
++            path_text = str(raw_path).strip()
++            if not path_text:
++                continue
++            progress_items.append(f"Already touched during execution: `{path_text}`.")
++        for raw_item in completed_items:
++            item = str(raw_item).strip()
++            if not item or item in _SPECIAL_TODO_ITEMS:
++                continue
++            progress_items.append(f"Already completed during execution: {item}.")
++
++        if not progress_items:
++            return self
++
++        return PlanningArtifacts(
++            implementation_markdown=_replace_markdown_section_items(
++                self.implementation_markdown,
++                "Confirmed Progress",
++                list(dict.fromkeys(progress_items)),
++            ),
++            verification_markdown=self.verification_markdown,
++            verification_commands=list(self.verification_commands),
++            acceptance_criteria=list(self.acceptance_criteria),
++            implementation_steps=list(self.implementation_steps),
++        )
++
  class WorkflowArtifactStore:
      """Persist briefs and plans under `.loader/`."""
          and item not in _SPECIAL_TODO_ITEMS
          and _task_text_covers_requirement(task_statement, item)
+     ]
++    refreshed_candidates = [
++        item.strip()
++        for item in refreshed_steps
++        if item.strip()
++        and (
++            not (grounded_completed or grounded_pending)
++            or _looks_actionable_refresh_step(item)
++        )
++    ]
      todos: list[dict[str, str]] = []
      seen: set[str] = set()
                  "status": "completed",
+             }
+         )
--    for item in [*grounded_pending, *refreshed_steps]:
++    for item in [*grounded_pending, *refreshed_candidates]:
          label = item.strip()
          if not label or label in seen:
              continue
          elif _looks_like_read_command(command):
              if _contains_any(text, _READ_STEP_HINTS):
                  score += 2
++        elif _looks_like_fs_mutation_command(command):
++            if _contains_any(text, _MUTATION_STEP_HINTS):
++                score += 3
++            if "directory" in text and "mkdir" in command:
++                score += 2
      elif name in {"write", "edit", "patch"}:
++        if _todo_describes_aggregate_mutation(text) and basename and basename not in text:
++            return 0
          if _contains_any(text, _MUTATION_STEP_HINTS):
              score += 3
      return any(candidate in text for candidate in candidates)
++def _todo_describes_aggregate_mutation(text: str) -> bool:
++    return _contains_any(text, _AGGREGATE_TODO_HINTS) and _contains_any(
++        text,
++        _MUTATION_STEP_HINTS,
++    )
++
++
  def _looks_like_search_command(command: str) -> bool:
      return any(token in command for token in (" ls", "ls ", "find ", "rg ", "grep ", "glob "))
+     )
++def _looks_like_fs_mutation_command(command: str) -> bool:
++    stripped = command.strip()
++    return any(
++        stripped.startswith(prefix)
++        for prefix in (
++            "mkdir ",
++            "mkdir\t",
++            "touch ",
++            "touch\t",
++            "cp ",
++            "cp\t",
++            "mv ",
++            "mv\t",
++            "ln ",
++            "ln\t",
++            "install ",
++            "install\t",
++        )
++    )
++
++
  def extract_verification_commands_from_markdown(markdown: str) -> list[str]:
      """Extract verification commands from a verification-plan markdown document."""
+     )
++def _looks_actionable_refresh_step(step: str) -> bool:
++    normalized = step.strip()
++    if not normalized:
++        return False
++    if re.fullmatch(r"(?:[\w.-]+/)*[\w.-]+\.[A-Za-z0-9]+", normalized):
++        return False
++
++    lowered = normalized.lower()
++    lowered = re.sub(r"^(?:first|next|then|finally|afterward|afterwards)\b[,:]?\s*", "", lowered)
++    first_word_match = re.match(r"^[a-z-]+", lowered)
++    if first_word_match is None:
++        return False
++
++    first_word = first_word_match.group(0)
++    if first_word in _RETROSPECTIVE_STEP_VERBS:
++        return False
++    if first_word in _ACTIONABLE_STEP_VERBS:
++        return True
++    return False
++
++
  def _mark_explicit_section(brief: ClarifyBrief, section: str) -> None:
      if section in brief.explicit_sections:
          return

src/loader/runtime/workflow_lanes.py modified

                  refreshed_acceptance_criteria=list(artifacts.acceptance_criteria),
+             )
              artifacts = artifacts.with_acceptance_criteria(preserved_acceptance)
++            artifacts = artifacts.with_progress_context(
++                touched_files=list(dod.touched_files),
++                completed_items=list(dod.completed_items),
++            )
          implementation_path, verification_path = self.artifact_store.write_plan(
              task,
              artifacts,
          refresh_block = ""
          if refresh_reasons:
++            progress_lines: list[str] = []
++            touched = [str(path).strip() for path in dod.touched_files if str(path).strip()]
++            completed = [
++                item.strip()
++                for item in dod.completed_items
++                if item.strip()
++                and item not in {"Complete the requested work", "Collect verification evidence"}
++            ]
++            pending = [
++                item.strip()
++                for item in dod.pending_items
++                if item.strip()
++                and item not in {"Complete the requested work", "Collect verification evidence"}
++            ]
++            if touched:
++                progress_lines.extend(
++                    [
++                        "Already touched during execution:",
++                        *[f"- {item}" for item in touched[:12]],
++                    ]
++                )
++            if completed:
++                progress_lines.extend(
++                    [
++                        "Already completed work:",
++                        *[f"- {item}" for item in completed[:12]],
++                    ]
++                )
++            if pending:
++                progress_lines.extend(
++                    [
++                        "Still pending:",
++                        *[f"- {item}" for item in pending[:12]],
++                    ]
++                )
              refresh_block = (
                  "Refresh the existing planning artifacts instead of creating a fresh plan "
                  "from scratch.\n"
                  "artifact.\n"
                  "Use the current task state and these recovery reasons:\n"
                  + "\n".join(f"- {item}" for item in refresh_reasons)
++                + (
++                    ("\n\nCurrent execution progress:\n" + "\n".join(progress_lines))
++                    if progress_lines
++                    else ""
++                )
                  + "\n\n"
+             )

src/loader/runtime/workflow_recovery.py modified

  WorkflowModeSetter = Callable[..., Awaitable[None]]
  TimelineAppender = Callable[..., None]
  BridgeAppender = Callable[[DefinitionOfDone], None]
++_RECOVERY_TODO_EXCLUDED_ITEMS = {
++    "Complete the requested work",
++    "Collect verification evidence",
++}
  class WorkflowRecoveryController:
              summary=summary,
+         )
          self.append_execute_bridge(dod)
++        next_pending = next(
++            (
++                item
++                for item in dod.pending_items
++                if item not in _RECOVERY_TODO_EXCLUDED_ITEMS
++            ),
++            None,
++        )
++        if next_pending:
++            self.context.queue_steering_message(
++                "Plan refresh preserved the progress already made. "
++                f"Reuse the existing files and confirmed facts, then continue with the next "
++                f"pending item: `{next_pending}`. "
++                "Do not restart from initial discovery unless a specific missing fact blocks that step."
++            )
          return True
      async def _run_clarify_reentry_for_drift(

tests/test_artifact_invalidation.py modified

          for item in freshness.evidence
+     )
      assert freshness.evidence_summary
++
++
++def test_artifact_invalidation_treats_path_separator_variants_as_same_touchpoint() -> None:
++    assessor = ArtifactInvalidationAssessor()
++
++    freshness = assessor.assess(
++        task_statement="Build a multi-file nginx guide.",
++        clarify_text=None,
++        implementation_text=(
++            "# Implementation Plan\n"
++            "- Create 01-getting-started.html in the chapters directory.\n"
++        ),
++        verification_text=(
++            "# Verification Plan\n"
++            "## Acceptance Criteria\n"
++            "- 01-getting-started.html exists.\n"
++        ),
++        acceptance_criteria=["01-getting-started.html exists."],
++        touched_files=["/tmp/chapters/01_getting_started.html"],
++        last_verification_result=None,
++    )
++
++    assert freshness.stale_plan is False
++    assert freshness.stale_brief is False
++    assert "touched_files_outside_plan" not in freshness.reason_codes

tests/test_dod.py modified

      assert not any(command == f"test -f {index}" for command in commands)
++def test_derive_verification_commands_avoids_repo_defaults_for_external_artifacts(
++    tmp_path: Path,
++) -> None:
++    (tmp_path / "pyproject.toml").write_text("[project]\nname='loader'\n")
++    (tmp_path / "package.json").write_text("{}\n")
++    external_root = tmp_path.parent / "external-guide"
++    external_root.mkdir(exist_ok=True)
++    external_index = external_root / "index.html"
++    external_index.write_text("<html></html>\n")
++
++    dod = create_definition_of_done("Create an external nginx guide.")
++    dod.task_size = "standard"
++    dod.touched_files = [str(external_index)]
++
++    commands = derive_verification_commands(
++        dod,
++        project_root=tmp_path,
++        task_statement=dod.task_statement,
++    )
++
++    assert commands == [f"test -f {external_index}"]
++
++
  def test_build_verification_summary_keeps_concrete_missing_link_details() -> None:
      summary = build_verification_summary(
+         [

tests/test_finalization.py modified

+     )
++@pytest.mark.asyncio
++async def test_turn_finalizer_does_not_append_repo_defaults_to_external_verification_plan(
++    temp_dir: Path,
++) -> None:
++    (temp_dir / "pyproject.toml").write_text("[project]\nname='loader'\n")
++    (temp_dir / "package.json").write_text("{}\n")
++    external_root = temp_dir.parent / "external-nginx-guide"
++    external_root.mkdir(exist_ok=True)
++    external_index = external_root / "index.html"
++    external_index.write_text("<html></html>\n")
++
++    session = FakeSession()
++    context = build_context(temp_dir, session)
++    finalizer = TurnFinalizer(
++        context,
++        RuntimeTracer(),
++        DefinitionOfDoneStore(temp_dir),
++        set_workflow_mode=_noop_set_workflow_mode,
++    )
++    dod = create_definition_of_done("Create an external nginx guide.")
++    dod.mutating_actions.append("write")
++    dod.touched_files.append(str(external_index))
++    dod.verification_commands = [
++        f"ls -la {external_root}",
++        f"grep -n \"html\" {external_index}",
++    ]
++    summary = TurnSummary(final_response="")
++    executor = RecordingExecutor()
++
++    async def capture(event) -> None:
++        return None
++
++    result = await finalizer.run_definition_of_done_gate(
++        dod=dod,
++        candidate_response="Created the external nginx guide.",
++        emit=capture,
++        summary=summary,
++        executor=executor,  # type: ignore[arg-type]
++    )
++
++    assert result.should_continue is False
++    assert executor.commands == [
++        f"ls -la {external_root}",
++        f'grep -n "html" {external_index}',
++    ]
++
++
  @pytest.mark.asyncio
  async def test_turn_finalizer_records_missing_verification_observation(
      temp_dir: Path,

tests/test_tool_batches.py modified

          in message
          for message in queued_messages
+     )
++    assert any(
++        "stop gathering more reference material and perform the change now" in message
++        for message in queued_messages
++    )
  @pytest.mark.asyncio
      assert "Update `" not in queued_messages[0]
++@pytest.mark.asyncio
++async def test_tool_batch_runner_observation_handoff_pushes_mutation_step(
++    temp_dir: Path,
++) -> None:
++    async def assess_confidence(
++        tool_name: str,
++        tool_args: dict,
++        context: str,
++    ) -> ConfidenceAssessment:
++        raise AssertionError("Confidence scoring should be disabled in this scenario")
++
++    async def verify_action(
++        tool_name: str,
++        tool_args: dict,
++        result: str,
++        expected: str = "",
++    ) -> ActionVerification:
++        raise AssertionError("Verification should not run for this scenario")
++
++    reference = temp_dir / "fortran" / "index.html"
++    reference.parent.mkdir(parents=True)
++    reference.write_text("<h1>Fortran Beginner's Guide</h1>\n")
++
++    context = build_context(
++        temp_dir=temp_dir,
++        messages=[],
++        safeguards=FakeSafeguards(),
++        assess_confidence=assess_confidence,
++        verify_action=verify_action,
++        auto_recover=False,
++    )
++    queued_messages: list[str] = []
++    context.queue_steering_message_callback = queued_messages.append
++    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))
++    dod = create_definition_of_done("Create a multi-file nginx guide.")
++    sync_todos_to_definition_of_done(
++        dod,
++        [
++            {
++                "content": "Examine the existing Fortran guide structure to understand the cadence and format",
++                "active_form": "Working on: Examine the existing Fortran guide structure to understand the cadence and format",
++                "status": "pending",
++            },
++            {
++                "content": "Create the nginx index.html file",
++                "active_form": "Working on: Create the nginx index.html file",
++                "status": "pending",
++            },
++        ],
++    )
++    tool_call = ToolCall(
++        id="read-reference",
++        name="read",
++        arguments={"file_path": str(reference)},
++    )
++    executor = FakeExecutor(
++        [
++            tool_outcome(
++                tool_call=tool_call,
++                output="<h1>Fortran Beginner's Guide</h1>\n",
++                is_error=False,
++            )
++        ]
++    )
++
++    summary = TurnSummary(final_response="")
++    await runner.execute_batch(
++        tool_calls=[tool_call],
++        tool_source="assistant",
++        pending_tool_calls_seen=set(),
++        emit=_noop_emit,
++        summary=summary,
++        dod=dod,
++        executor=executor,  # type: ignore[arg-type]
++        on_confirmation=None,
++        on_user_question=None,
++        emit_confirmation=None,
++        consecutive_errors=0,
++    )
++
++    assert any(
++        "Continue with the next pending item: `Create the nginx index.html file`"
++        in message
++        for message in queued_messages
++    )
++    assert any(
++        "stop gathering more reference material and perform the change now" in message
++        for message in queued_messages
++    )
++
++
  @pytest.mark.asyncio
  async def test_tool_batch_runner_hands_off_noop_toc_edit_when_file_is_already_valid(
      temp_dir: Path,

tests/test_workflow.py modified

+     )
++def test_planning_artifacts_with_progress_context_records_touched_and_completed_work() -> None:
++    artifacts = PlanningArtifacts.from_model_output(
++        "\n".join(
++            [
++                "# Implementation Plan",
++                "",
++                "## Execution Order",
++                "1. Create the guide files.",
++                "",
++                "<<<VERIFICATION>>>",
++                "",
++                "# Verification Plan",
++                "",
++                "## Acceptance Criteria",
++                "- At least one chapter file exists.",
++                "",
++                "## Verification Commands",
++                "- `find chapters -name \"*.html\" | wc -l`",
++            ]
++        ),
++        task_statement="Create a thorough nginx guide.",
++    )
++
++    updated = artifacts.with_progress_context(
++        touched_files=["/tmp/nginx/index.html"],
++        completed_items=[
++            "Create the guide scaffold",
++            "Collect verification evidence",
++        ],
++    )
++
++    assert "## Confirmed Progress" in updated.implementation_markdown
++    assert "Already touched during execution: `/tmp/nginx/index.html`." in (
++        updated.implementation_markdown
++    )
++    assert "Already completed during execution: Create the guide scaffold." in (
++        updated.implementation_markdown
++    )
++    assert "Collect verification evidence" not in updated.implementation_markdown
++
++
  def test_merge_refreshed_todos_with_existing_scope_keeps_grounded_progress() -> None:
      task = (
          "Create an equally thorough nginx guide with index.html plus chapter files "
+     )
++def test_merge_refreshed_todos_with_existing_scope_filters_retro_refresh_noise() -> None:
++    task = (
++        "Create an equally thorough nginx guide with index.html plus chapter files "
++        "covering getting started, installation, first website setup, configs, and "
++        "advanced topics."
++    )
++
++    todos = merge_refreshed_todos_with_existing_scope(
++        task,
++        existing_pending_items=[
++            "Create each chapter file in sequence, following the same structure as the Fortran guide",
++            "Ensure all files are properly linked and formatted consistently",
++        ],
++        existing_completed_items=[
++            "First, examine the existing Fortran guide structure to understand the format and cadence",
++            "Create the directory structure for the new nginx guide",
++            "Create the main index.html file",
++        ],
++        refreshed_steps=[
++            "First examined the existing Fortran guide structure to understand format and cadence",
++            "Created the main index.html file with navigation",
++            "Created chapter files in sequence:",
++            "01-getting-started.html",
++            "02-installation.html",
++            "03-first-website.html",
++            "04-configuring.html",
++            "All files properly linked with navigation between chapters",
++            "Verify the final navigation links across the guide",
++        ],
++    )
++
++    labels = {item["content"]: item["status"] for item in todos}
++    assert (
++        labels["Create each chapter file in sequence, following the same structure as the Fortran guide"]
++        == "pending"
++    )
++    assert labels["Ensure all files are properly linked and formatted consistently"] == "pending"
++    assert labels["Verify the final navigation links across the guide"] == "pending"
++    assert "Created chapter files in sequence:" not in labels
++    assert "04-configuring.html" not in labels
++
++
  def test_workflow_artifact_store_and_bridge_round_trip(tmp_path: Path) -> None:
      store = WorkflowArtifactStore(tmp_path)
      brief = ClarifyBrief.fallback(
          ),
+     )
      assert "Verify the updated index.html file is properly formatted" in dod.completed_items
++
++
++def test_advance_todos_from_tool_call_keeps_aggregate_mutation_steps_pending() -> None:
++    dod = create_definition_of_done("Create a multi-file nginx guide.")
++    sync_todos_to_definition_of_done(
++        dod,
++        [
++            {
++                "content": "Create each chapter file in sequence, following the same structure as the Fortran guide",
++                "active_form": "Working on: Create each chapter file in sequence, following the same structure as the Fortran guide",
++                "status": "pending",
++            },
++            {
++                "content": "Ensure all files are properly linked and formatted consistently",
++                "active_form": "Working on: Ensure all files are properly linked and formatted consistently",
++                "status": "pending",
++            },
++        ],
++    )
++
++    assert (
++        advance_todos_from_tool_call(
++            dod,
++            ToolCall(
++                id="write-one-chapter",
++                name="write",
++                arguments={
++                    "file_path": "/tmp/nginx/chapters/01-getting-started.html",
++                    "content": "<html></html>",
++                },
++            ),
++        )
++        is False
++    )
++    assert (
++        "Create each chapter file in sequence, following the same structure as the Fortran guide"
++        in dod.pending_items
++    )
++
++
++def test_advance_todos_from_tool_call_tracks_bash_directory_creation_progress() -> None:
++    dod = create_definition_of_done("Create a multi-file nginx guide.")
++    sync_todos_to_definition_of_done(
++        dod,
++        [
++            {
++                "content": "Create the nginx directory structure",
++                "active_form": "Working on: Create the nginx directory structure",
++                "status": "pending",
++            },
++            {
++                "content": "Create index.html for nginx guide",
++                "active_form": "Working on: Create index.html for nginx guide",
++                "status": "pending",
++            },
++        ],
++    )
++
++    assert advance_todos_from_tool_call(
++        dod,
++        ToolCall(
++            id="mkdir-nginx",
++            name="bash",
++            arguments={"command": "mkdir -p ~/Loader/guides/nginx/chapters"},
++        ),
++    )
++    assert "Create the nginx directory structure" in dod.completed_items
++    assert "Create index.html for nginx guide" in dod.pending_items

tests/test_workflow_runtime.py modified

          entry.reason_code == "plan_refresh_completed"
          for entry in run.agent.last_turn_summary.workflow_timeline
+     )
++    refresh_prompt = next(
++        invocation.messages[-1].content
++        for invocation in backend.invocations
++        if "Refresh the existing planning artifacts instead of creating a fresh plan from scratch."
++        in invocation.messages[-1].content
++    )
++    assert "Current execution progress:" in refresh_prompt
++    assert "Already touched during execution:" in refresh_prompt
++    assert f"- {target}" in refresh_prompt
++    assert any(
++        "Plan refresh preserved the progress already made." in message.content
++        and "Do not restart from initial discovery" in message.content
++        for invocation in backend.invocations
++        for message in invocation.messages
++    )
  @pytest.mark.asyncio

`@@ -294,6 +294,7 @@` class TurnFinalizer:
294	dod,	294	dod,
295	project_root=self.context.project_root,	295	project_root=self.context.project_root,
296	task_statement=dod.task_statement,	296	task_statement=dod.task_statement,
		297	+ supplement_existing=True,
297	):	298	):
298	if command not in dod.verification_commands:	299	if command not in dod.verification_commands:
299	dod.verification_commands.append(command)	300	dod.verification_commands.append(command)

tenseleyflow/loader / `297e213`

13 changed files