tenseleyflow/loader / 297e213

Browse files

Strengthen qwen progress and verification handoffs

Authored by espadonne
SHA
297e213d29e61587549386472439e606500e8b69
Parents
918941f
Tree
7288310

13 changed files

Status | File | + | -
M src/loader/runtime/artifact_invalidation.py 40 3
M src/loader/runtime/dod.py 28 0
M src/loader/runtime/finalization.py 1 0
M src/loader/runtime/tool_batches.py 34 1
M src/loader/runtime/workflow.py 170 1
M src/loader/runtime/workflow_lanes.py 44 0
M src/loader/runtime/workflow_recovery.py 19 0
M tests/test_artifact_invalidation.py 25 0
M tests/test_dod.py 23 0
M tests/test_finalization.py 47 0
M tests/test_tool_batches.py 95 0
M tests/test_workflow.py 151 0
M tests/test_workflow_runtime.py 15 0
src/loader/runtime/artifact_invalidation.py (modified)
@@ -49,13 +49,18 @@ class ArtifactInvalidationAssessor:
4949
         unexpected_paths = [
5050
             name
5151
             for path in touched_files
52
-            if (name := _path_name(path)) and name.lower() not in plan_text
52
+            if (name := _path_name(path)) and not _text_covers_path_reference(plan_text, path)
5353
         ]
5454
         confirmed_touchpoints = [
5555
             name
5656
             for path in touched_files
5757
             if (name := _path_name(path))
5858
         ]
59
+        confirmed_touchpoint_keys = {
60
+            _path_reference_identity(path)
61
+            for path in touched_files
62
+            if _path_reference_identity(path)
63
+        }
5964
         inferred_touchpoints = [
6065
             item
6166
             for item in _extract_path_mentions(
@@ -63,7 +68,7 @@ class ArtifactInvalidationAssessor:
6368
                 implementation_text,
6469
                 verification_text,
6570
             )
66
-            if _path_name(item) not in confirmed_touchpoints
71
+            if _path_reference_identity(item) not in confirmed_touchpoint_keys
6772
         ]
6873
         stale_plan = False
6974
         stale_brief = False
@@ -147,7 +152,11 @@ class ArtifactInvalidationAssessor:
147152
                     )
148153
 
149154
             out_of_brief_paths = [
150
-                name for name in unexpected_paths if name.lower() not in brief_text
155
+                name
156
+                for path in touched_files
157
+                if (name := _path_name(path))
158
+                and name in unexpected_paths
159
+                and not _text_covers_path_reference(brief_text, path)
151160
             ]
152161
             if out_of_brief_paths:
153162
                 stale_brief = True
@@ -200,6 +209,34 @@ def _path_name(path: str) -> str:
200209
     return normalized.rsplit("/", maxsplit=1)[-1].strip()
201210
 
202211
 
212
+def _path_reference_identity(path: str) -> str:
213
+    normalized = _path_name(path)
214
+    if not normalized:
215
+        return ""
216
+    return _canonical_path_reference(normalized)
217
+
218
+
219
+def _text_covers_path_reference(text: str, path: str) -> bool:
220
+    normalized_text = text.lower()
221
+    candidates = [candidate for candidate in (str(path).strip(), _path_name(path)) if candidate]
222
+
223
+    for candidate in candidates:
224
+        if candidate.lower() in normalized_text:
225
+            return True
226
+
227
+    canonical_text = _canonical_path_reference(text)
228
+    return any(
229
+        canonical_candidate and canonical_candidate in canonical_text
230
+        for canonical_candidate in (_canonical_path_reference(candidate) for candidate in candidates)
231
+    )
232
+
233
+
234
+def _canonical_path_reference(value: str) -> str:
235
+    normalized = value.lower().strip()
236
+    normalized = re.sub(r"[^a-z0-9]+", " ", normalized)
237
+    return " ".join(normalized.split())
238
+
239
+
203240
 def _text_covers_requirement(text: str, requirement: str) -> bool:
204241
     normalized_text = text.lower()
205242
     normalized_requirement = requirement.lower()
src/loader/runtime/dod.py (modified)
@@ -208,6 +208,7 @@ def derive_verification_commands(
208208
     *,
209209
     project_root: Path,
210210
     task_statement: str,
211
+    supplement_existing: bool = False,
211212
 ) -> list[str]:
212213
     """Generate verification commands from execution history and project shape."""
213214
 
@@ -234,6 +235,8 @@ def derive_verification_commands(
234235
 
235236
     if commands:
236237
         return commands
238
+    if supplement_existing:
239
+        return commands
237240
 
238241
     if dod.task_size == "small":
239242
         for path_str in dod.touched_files[:3]:
@@ -245,6 +248,11 @@ def derive_verification_commands(
245248
                     commands,
246249
                     f"python -m py_compile {shlex.quote(str(effective_path))}",
247250
                 )
251
+    elif _uses_external_artifacts_only(dod, project_root=project_root):
252
+        for path_str in dod.touched_files[:3]:
253
+            path = Path(path_str)
254
+            effective_path = path if path.is_absolute() else (project_root / path)
255
+            _append_unique(commands, f"test -f {shlex.quote(str(effective_path))}")
248256
     else:
249257
         if (project_root / "pyproject.toml").exists():
250258
             _append_unique(commands, "uv run pytest -q")
@@ -407,6 +415,26 @@ def _append_unique(items: list[str], value: str) -> None:
407415
         items.append(value)
408416
 
409417
 
418
+def _uses_external_artifacts_only(dod: DefinitionOfDone, *, project_root: Path) -> bool:
419
+    touched = [Path(path) for path in dod.touched_files if str(path).strip()]
420
+    if not touched:
421
+        return False
422
+    try:
423
+        root = project_root.resolve()
424
+    except FileNotFoundError:
425
+        root = project_root
426
+    external = [path for path in touched if not _path_is_within_root(path, root)]
427
+    return bool(external) and len(external) == len(touched)
428
+
429
+
430
+def _path_is_within_root(path: Path, root: Path) -> bool:
431
+    try:
432
+        path.resolve().relative_to(root)
433
+        return True
434
+    except ValueError:
435
+        return False
436
+
437
+
410438
 def synthesize_todo_items(dod: DefinitionOfDone) -> list[dict[str, str]]:
411439
     """Build a todo item list from the current DoD state.
412440
 
src/loader/runtime/finalization.py (modified)
@@ -294,6 +294,7 @@ class TurnFinalizer:
294294
                 dod,
295295
                 project_root=self.context.project_root,
296296
                 task_statement=dod.task_statement,
297
+                supplement_existing=True,
297298
             ):
298299
                 if command not in dod.verification_commands:
299300
                     dod.verification_commands.append(command)
src/loader/runtime/tool_batches.py (modified)
@@ -46,6 +46,19 @@ _TODO_NUDGE_EXCLUDED_ITEMS = {
4646
     "Complete the requested work",
4747
     _VERIFY_ITEM,
4848
 }
49
+_MUTATION_TODO_HINTS = (
50
+    "create",
51
+    "update",
52
+    "edit",
53
+    "write",
54
+    "fix",
55
+    "modify",
56
+    "change",
57
+    "patch",
58
+    "replace",
59
+    "correct",
60
+    "rewrite",
61
+)
4962
 
5063
 
5164
 @dataclass
@@ -290,18 +303,26 @@ class ToolBatchRunner:
290303
             max_items=2,
291304
         )
292305
         if next_pending and not html_toc_rule.task_targets_html_toc(current_task):
306
+            mutation_suffix = ""
307
+            if _todo_is_mutation_step(next_pending):
308
+                mutation_suffix = (
309
+                    " You already have enough evidence for that step, so stop gathering "
310
+                    "more reference material and perform the change now."
311
+                )
293312
             if confirmed_facts:
294313
                 self.context.queue_steering_message(
295314
                     "Reuse the earlier observation instead of repeating it. "
296315
                     f"Confirmed facts: {confirmed_facts}. "
297316
                     f"Continue with the next pending item: `{next_pending}`. "
298317
                     "Only gather more evidence if a specific fact required for that step is still unknown."
318
+                    + mutation_suffix
299319
                 )
300320
             else:
301321
                 self.context.queue_steering_message(
302322
                     "Reuse the earlier observation instead of repeating it. "
303323
                     f"Continue with the next pending item: `{next_pending}`. "
304324
                     "Only gather more evidence if a specific fact required for that step is still unknown."
325
+                    + mutation_suffix
305326
                 )
306327
             return
307328
 
@@ -752,10 +773,17 @@ class ToolBatchRunner:
752773
         if not completed_label or not next_pending or next_pending == completed_label:
753774
             return
754775
 
776
+        mutation_suffix = ""
777
+        if _todo_is_mutation_step(next_pending):
778
+            mutation_suffix = (
779
+                " You already have enough evidence for that step, so stop gathering "
780
+                "more reference material and perform the change now."
781
+            )
782
+
755783
         self.context.queue_steering_message(
756784
             f"Confirmed progress: `{completed_label}` is now satisfied by the successful "
757785
             f"`{tool_call.name}` result. Continue with the next pending item: "
758
-            f"`{next_pending}` instead of rereading the same evidence."
786
+            f"`{next_pending}` instead of rereading the same evidence.{mutation_suffix}"
759787
         )
760788
 
761789
 
@@ -795,6 +823,11 @@ def _mark_verification_stale(
795823
         dod.pending_items.append(_VERIFY_ITEM)
796824
 
797825
 
826
+def _todo_is_mutation_step(label: str) -> bool:
827
+    lowered = label.lower()
828
+    return any(token in lowered for token in _MUTATION_TODO_HINTS)
829
+
830
+
798831
 def _mark_verification_planned(
799832
     *,
800833
     context: RuntimeContext,
src/loader/runtime/workflow.py (modified)
@@ -125,6 +125,75 @@ _VERIFY_STEP_HINTS = (
125125
     "confirm",
126126
     "check",
127127
 )
128
+_AGGREGATE_TODO_HINTS = (
129
+    "each ",
130
+    "all ",
131
+    "every ",
132
+    "sequence",
133
+    "multiple ",
134
+    "across ",
135
+    "consistently",
136
+    "properly linked",
137
+    "directory structure",
138
+)
139
+_ACTIONABLE_STEP_VERBS = {
140
+    "add",
141
+    "apply",
142
+    "build",
143
+    "check",
144
+    "confirm",
145
+    "create",
146
+    "document",
147
+    "edit",
148
+    "ensure",
149
+    "fix",
150
+    "implement",
151
+    "inspect",
152
+    "list",
153
+    "move",
154
+    "parse",
155
+    "patch",
156
+    "read",
157
+    "refactor",
158
+    "remove",
159
+    "rename",
160
+    "reorder",
161
+    "rerun",
162
+    "re-run",
163
+    "review",
164
+    "run",
165
+    "search",
166
+    "test",
167
+    "update",
168
+    "validate",
169
+    "verify",
170
+    "write",
171
+}
172
+_RETROSPECTIVE_STEP_VERBS = {
173
+    "added",
174
+    "applied",
175
+    "built",
176
+    "checked",
177
+    "completed",
178
+    "confirmed",
179
+    "created",
180
+    "edited",
181
+    "ensured",
182
+    "examined",
183
+    "generated",
184
+    "implemented",
185
+    "inspected",
186
+    "listed",
187
+    "looked",
188
+    "parsed",
189
+    "patched",
190
+    "read",
191
+    "reviewed",
192
+    "updated",
193
+    "validated",
194
+    "verified",
195
+    "wrote",
196
+}
128197
 _TASK_COVERAGE_STOP_WORDS = {
129198
     "the",
130199
     "and",
@@ -491,6 +560,41 @@ class PlanningArtifacts:
491560
             implementation_steps=list(self.implementation_steps),
492561
         )
493562
 
563
+    def with_progress_context(
564
+        self,
565
+        *,
566
+        touched_files: list[str],
567
+        completed_items: list[str],
568
+    ) -> PlanningArtifacts:
569
+        """Return one copy that preserves already-confirmed execution progress."""
570
+
571
+        progress_items: list[str] = []
572
+        for raw_path in touched_files:
573
+            path_text = str(raw_path).strip()
574
+            if not path_text:
575
+                continue
576
+            progress_items.append(f"Already touched during execution: `{path_text}`.")
577
+        for raw_item in completed_items:
578
+            item = str(raw_item).strip()
579
+            if not item or item in _SPECIAL_TODO_ITEMS:
580
+                continue
581
+            progress_items.append(f"Already completed during execution: {item}.")
582
+
583
+        if not progress_items:
584
+            return self
585
+
586
+        return PlanningArtifacts(
587
+            implementation_markdown=_replace_markdown_section_items(
588
+                self.implementation_markdown,
589
+                "Confirmed Progress",
590
+                list(dict.fromkeys(progress_items)),
591
+            ),
592
+            verification_markdown=self.verification_markdown,
593
+            verification_commands=list(self.verification_commands),
594
+            acceptance_criteria=list(self.acceptance_criteria),
595
+            implementation_steps=list(self.implementation_steps),
596
+        )
597
+
494598
 
495599
 class WorkflowArtifactStore:
496600
     """Persist briefs and plans under `.loader/`."""
@@ -627,6 +731,15 @@ def merge_refreshed_todos_with_existing_scope(
627731
         and item not in _SPECIAL_TODO_ITEMS
628732
         and _task_text_covers_requirement(task_statement, item)
629733
     ]
734
+    refreshed_candidates = [
735
+        item.strip()
736
+        for item in refreshed_steps
737
+        if item.strip()
738
+        and (
739
+            not (grounded_completed or grounded_pending)
740
+            or _looks_actionable_refresh_step(item)
741
+        )
742
+    ]
630743
 
631744
     todos: list[dict[str, str]] = []
632745
     seen: set[str] = set()
@@ -641,7 +754,7 @@ def merge_refreshed_todos_with_existing_scope(
641754
                 "status": "completed",
642755
             }
643756
         )
644
-    for item in [*grounded_pending, *refreshed_steps]:
757
+    for item in [*grounded_pending, *refreshed_candidates]:
645758
         label = item.strip()
646759
         if not label or label in seen:
647760
             continue
@@ -740,7 +853,14 @@ def _todo_progress_score(item: str, tool_call: ToolCall) -> int:
740853
         elif _looks_like_read_command(command):
741854
             if _contains_any(text, _READ_STEP_HINTS):
742855
                 score += 2
856
+        elif _looks_like_fs_mutation_command(command):
857
+            if _contains_any(text, _MUTATION_STEP_HINTS):
858
+                score += 3
859
+            if "directory" in text and "mkdir" in command:
860
+                score += 2
743861
     elif name in {"write", "edit", "patch"}:
862
+        if _todo_describes_aggregate_mutation(text) and basename and basename not in text:
863
+            return 0
744864
         if _contains_any(text, _MUTATION_STEP_HINTS):
745865
             score += 3
746866
 
@@ -753,6 +873,13 @@ def _contains_any(text: str, candidates: tuple[str, ...]) -> bool:
753873
     return any(candidate in text for candidate in candidates)
754874
 
755875
 
876
+def _todo_describes_aggregate_mutation(text: str) -> bool:
877
+    return _contains_any(text, _AGGREGATE_TODO_HINTS) and _contains_any(
878
+        text,
879
+        _MUTATION_STEP_HINTS,
880
+    )
881
+
882
+
756883
 def _looks_like_search_command(command: str) -> bool:
757884
     return any(token in command for token in (" ls", "ls ", "find ", "rg ", "grep ", "glob "))
758885
 
@@ -781,6 +908,27 @@ def _looks_like_verification_command(command: str) -> bool:
781908
     )
782909
 
783910
 
911
+def _looks_like_fs_mutation_command(command: str) -> bool:
912
+    stripped = command.strip()
913
+    return any(
914
+        stripped.startswith(prefix)
915
+        for prefix in (
916
+            "mkdir ",
917
+            "mkdir\t",
918
+            "touch ",
919
+            "touch\t",
920
+            "cp ",
921
+            "cp\t",
922
+            "mv ",
923
+            "mv\t",
924
+            "ln ",
925
+            "ln\t",
926
+            "install ",
927
+            "install\t",
928
+        )
929
+    )
930
+
931
+
784932
 def extract_verification_commands_from_markdown(markdown: str) -> list[str]:
785933
     """Extract verification commands from a verification-plan markdown document."""
786934
 
@@ -1057,6 +1205,27 @@ def _requirement_describes_output_scope(requirement: str) -> bool:
10571205
     )
10581206
 
10591207
 
1208
+def _looks_actionable_refresh_step(step: str) -> bool:
1209
+    normalized = step.strip()
1210
+    if not normalized:
1211
+        return False
1212
+    if re.fullmatch(r"(?:[\w.-]+/)*[\w.-]+\.[A-Za-z0-9]+", normalized):
1213
+        return False
1214
+
1215
+    lowered = normalized.lower()
1216
+    lowered = re.sub(r"^(?:first|next|then|finally|afterward|afterwards)\b[,:]?\s*", "", lowered)
1217
+    first_word_match = re.match(r"^[a-z-]+", lowered)
1218
+    if first_word_match is None:
1219
+        return False
1220
+
1221
+    first_word = first_word_match.group(0)
1222
+    if first_word in _RETROSPECTIVE_STEP_VERBS:
1223
+        return False
1224
+    if first_word in _ACTIONABLE_STEP_VERBS:
1225
+        return True
1226
+    return False
1227
+
1228
+
10601229
 def _mark_explicit_section(brief: ClarifyBrief, section: str) -> None:
10611230
     if section in brief.explicit_sections:
10621231
         return
src/loader/runtime/workflow_lanes.py (modified)
@@ -208,6 +208,10 @@ class WorkflowLaneRunner:
208208
                 refreshed_acceptance_criteria=list(artifacts.acceptance_criteria),
209209
             )
210210
             artifacts = artifacts.with_acceptance_criteria(preserved_acceptance)
211
+            artifacts = artifacts.with_progress_context(
212
+                touched_files=list(dod.touched_files),
213
+                completed_items=list(dod.completed_items),
214
+            )
211215
         implementation_path, verification_path = self.artifact_store.write_plan(
212216
             task,
213217
             artifacts,
@@ -610,6 +614,41 @@ class WorkflowLaneRunner:
610614
 
611615
         refresh_block = ""
612616
         if refresh_reasons:
617
+            progress_lines: list[str] = []
618
+            touched = [str(path).strip() for path in dod.touched_files if str(path).strip()]
619
+            completed = [
620
+                item.strip()
621
+                for item in dod.completed_items
622
+                if item.strip()
623
+                and item not in {"Complete the requested work", "Collect verification evidence"}
624
+            ]
625
+            pending = [
626
+                item.strip()
627
+                for item in dod.pending_items
628
+                if item.strip()
629
+                and item not in {"Complete the requested work", "Collect verification evidence"}
630
+            ]
631
+            if touched:
632
+                progress_lines.extend(
633
+                    [
634
+                        "Already touched during execution:",
635
+                        *[f"- {item}" for item in touched[:12]],
636
+                    ]
637
+                )
638
+            if completed:
639
+                progress_lines.extend(
640
+                    [
641
+                        "Already completed work:",
642
+                        *[f"- {item}" for item in completed[:12]],
643
+                    ]
644
+                )
645
+            if pending:
646
+                progress_lines.extend(
647
+                    [
648
+                        "Still pending:",
649
+                        *[f"- {item}" for item in pending[:12]],
650
+                    ]
651
+                )
613652
             refresh_block = (
614653
                 "Refresh the existing planning artifacts instead of creating a fresh plan "
615654
                 "from scratch.\n"
@@ -619,6 +658,11 @@ class WorkflowLaneRunner:
619658
                 "artifact.\n"
620659
                 "Use the current task state and these recovery reasons:\n"
621660
                 + "\n".join(f"- {item}" for item in refresh_reasons)
661
+                + (
662
+                    ("\n\nCurrent execution progress:\n" + "\n".join(progress_lines))
663
+                    if progress_lines
664
+                    else ""
665
+                )
622666
                 + "\n\n"
623667
             )
624668
 
src/loader/runtime/workflow_recovery.py (modified)
@@ -29,6 +29,10 @@ UserQuestionHandler = Callable[[str, list[str] | None], Awaitable[str]] | None
2929
 WorkflowModeSetter = Callable[..., Awaitable[None]]
3030
 TimelineAppender = Callable[..., None]
3131
 BridgeAppender = Callable[[DefinitionOfDone], None]
32
+_RECOVERY_TODO_EXCLUDED_ITEMS = {
33
+    "Complete the requested work",
34
+    "Collect verification evidence",
35
+}
3236
 
3337
 
3438
 class WorkflowRecoveryController:
@@ -186,6 +190,21 @@ class WorkflowRecoveryController:
186190
             summary=summary,
187191
         )
188192
         self.append_execute_bridge(dod)
193
+        next_pending = next(
194
+            (
195
+                item
196
+                for item in dod.pending_items
197
+                if item not in _RECOVERY_TODO_EXCLUDED_ITEMS
198
+            ),
199
+            None,
200
+        )
201
+        if next_pending:
202
+            self.context.queue_steering_message(
203
+                "Plan refresh preserved the progress already made. "
204
+                f"Reuse the existing files and confirmed facts, then continue with the next "
205
+                f"pending item: `{next_pending}`. "
206
+                "Do not restart from initial discovery unless a specific missing fact blocks that step."
207
+            )
189208
         return True
190209
 
191210
     async def _run_clarify_reentry_for_drift(
tests/test_artifact_invalidation.py (modified)
@@ -67,3 +67,28 @@ def test_artifact_invalidation_can_force_full_replan_when_brief_and_plan_drift()
6767
         for item in freshness.evidence
6868
     )
6969
     assert freshness.evidence_summary
70
+
71
+
72
+def test_artifact_invalidation_treats_path_separator_variants_as_same_touchpoint() -> None:
73
+    assessor = ArtifactInvalidationAssessor()
74
+
75
+    freshness = assessor.assess(
76
+        task_statement="Build a multi-file nginx guide.",
77
+        clarify_text=None,
78
+        implementation_text=(
79
+            "# Implementation Plan\n"
80
+            "- Create 01-getting-started.html in the chapters directory.\n"
81
+        ),
82
+        verification_text=(
83
+            "# Verification Plan\n"
84
+            "## Acceptance Criteria\n"
85
+            "- 01-getting-started.html exists.\n"
86
+        ),
87
+        acceptance_criteria=["01-getting-started.html exists."],
88
+        touched_files=["/tmp/chapters/01_getting_started.html"],
89
+        last_verification_result=None,
90
+    )
91
+
92
+    assert freshness.stale_plan is False
93
+    assert freshness.stale_brief is False
94
+    assert "touched_files_outside_plan" not in freshness.reason_codes
tests/test_dod.py (modified)
@@ -143,6 +143,29 @@ def test_derive_verification_commands_adds_semantic_html_toc_check(tmp_path: Pat
143143
     assert not any(command == f"test -f {index}" for command in commands)
144144
 
145145
 
146
+def test_derive_verification_commands_avoids_repo_defaults_for_external_artifacts(
147
+    tmp_path: Path,
148
+) -> None:
149
+    (tmp_path / "pyproject.toml").write_text("[project]\nname='loader'\n")
150
+    (tmp_path / "package.json").write_text("{}\n")
151
+    external_root = tmp_path.parent / "external-guide"
152
+    external_root.mkdir(exist_ok=True)
153
+    external_index = external_root / "index.html"
154
+    external_index.write_text("<html></html>\n")
155
+
156
+    dod = create_definition_of_done("Create an external nginx guide.")
157
+    dod.task_size = "standard"
158
+    dod.touched_files = [str(external_index)]
159
+
160
+    commands = derive_verification_commands(
161
+        dod,
162
+        project_root=tmp_path,
163
+        task_statement=dod.task_statement,
164
+    )
165
+
166
+    assert commands == [f"test -f {external_index}"]
167
+
168
+
146169
 def test_build_verification_summary_keeps_concrete_missing_link_details() -> None:
147170
     summary = build_verification_summary(
148171
         [
tests/test_finalization.py (modified)
@@ -434,6 +434,53 @@ async def test_turn_finalizer_appends_runtime_semantic_verifier_to_planned_comma
434434
     )
435435
 
436436
 
437
+@pytest.mark.asyncio
438
+async def test_turn_finalizer_does_not_append_repo_defaults_to_external_verification_plan(
439
+    temp_dir: Path,
440
+) -> None:
441
+    (temp_dir / "pyproject.toml").write_text("[project]\nname='loader'\n")
442
+    (temp_dir / "package.json").write_text("{}\n")
443
+    external_root = temp_dir.parent / "external-nginx-guide"
444
+    external_root.mkdir(exist_ok=True)
445
+    external_index = external_root / "index.html"
446
+    external_index.write_text("<html></html>\n")
447
+
448
+    session = FakeSession()
449
+    context = build_context(temp_dir, session)
450
+    finalizer = TurnFinalizer(
451
+        context,
452
+        RuntimeTracer(),
453
+        DefinitionOfDoneStore(temp_dir),
454
+        set_workflow_mode=_noop_set_workflow_mode,
455
+    )
456
+    dod = create_definition_of_done("Create an external nginx guide.")
457
+    dod.mutating_actions.append("write")
458
+    dod.touched_files.append(str(external_index))
459
+    dod.verification_commands = [
460
+        f"ls -la {external_root}",
461
+        f"grep -n \"html\" {external_index}",
462
+    ]
463
+    summary = TurnSummary(final_response="")
464
+    executor = RecordingExecutor()
465
+
466
+    async def capture(event) -> None:
467
+        return None
468
+
469
+    result = await finalizer.run_definition_of_done_gate(
470
+        dod=dod,
471
+        candidate_response="Created the external nginx guide.",
472
+        emit=capture,
473
+        summary=summary,
474
+        executor=executor,  # type: ignore[arg-type]
475
+    )
476
+
477
+    assert result.should_continue is False
478
+    assert executor.commands == [
479
+        f"ls -la {external_root}",
480
+        f'grep -n "html" {external_index}',
481
+    ]
482
+
483
+
437484
 @pytest.mark.asyncio
438485
 async def test_turn_finalizer_records_missing_verification_observation(
439486
     temp_dir: Path,
tests/test_tool_batches.py (modified)
@@ -1041,6 +1041,10 @@ async def test_tool_batch_runner_queues_next_pending_todo_after_discovery_progre
10411041
         in message
10421042
         for message in queued_messages
10431043
     )
1044
+    assert any(
1045
+        "stop gathering more reference material and perform the change now" in message
1046
+        for message in queued_messages
1047
+    )
10441048
 
10451049
 
10461050
 @pytest.mark.asyncio
@@ -1161,6 +1165,97 @@ async def test_tool_batch_runner_duplicate_reference_read_prefers_next_pending_t
11611165
     assert "Update `" not in queued_messages[0]
11621166
 
11631167
 
1168
+@pytest.mark.asyncio
1169
+async def test_tool_batch_runner_observation_handoff_pushes_mutation_step(
1170
+    temp_dir: Path,
1171
+) -> None:
1172
+    async def assess_confidence(
1173
+        tool_name: str,
1174
+        tool_args: dict,
1175
+        context: str,
1176
+    ) -> ConfidenceAssessment:
1177
+        raise AssertionError("Confidence scoring should be disabled in this scenario")
1178
+
1179
+    async def verify_action(
1180
+        tool_name: str,
1181
+        tool_args: dict,
1182
+        result: str,
1183
+        expected: str = "",
1184
+    ) -> ActionVerification:
1185
+        raise AssertionError("Verification should not run for this scenario")
1186
+
1187
+    reference = temp_dir / "fortran" / "index.html"
1188
+    reference.parent.mkdir(parents=True)
1189
+    reference.write_text("<h1>Fortran Beginner's Guide</h1>\n")
1190
+
1191
+    context = build_context(
1192
+        temp_dir=temp_dir,
1193
+        messages=[],
1194
+        safeguards=FakeSafeguards(),
1195
+        assess_confidence=assess_confidence,
1196
+        verify_action=verify_action,
1197
+        auto_recover=False,
1198
+    )
1199
+    queued_messages: list[str] = []
1200
+    context.queue_steering_message_callback = queued_messages.append
1201
+    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))
1202
+    dod = create_definition_of_done("Create a multi-file nginx guide.")
1203
+    sync_todos_to_definition_of_done(
1204
+        dod,
1205
+        [
1206
+            {
1207
+                "content": "Examine the existing Fortran guide structure to understand the cadence and format",
1208
+                "active_form": "Working on: Examine the existing Fortran guide structure to understand the cadence and format",
1209
+                "status": "pending",
1210
+            },
1211
+            {
1212
+                "content": "Create the nginx index.html file",
1213
+                "active_form": "Working on: Create the nginx index.html file",
1214
+                "status": "pending",
1215
+            },
1216
+        ],
1217
+    )
1218
+    tool_call = ToolCall(
1219
+        id="read-reference",
1220
+        name="read",
1221
+        arguments={"file_path": str(reference)},
1222
+    )
1223
+    executor = FakeExecutor(
1224
+        [
1225
+            tool_outcome(
1226
+                tool_call=tool_call,
1227
+                output="<h1>Fortran Beginner's Guide</h1>\n",
1228
+                is_error=False,
1229
+            )
1230
+        ]
1231
+    )
1232
+
1233
+    summary = TurnSummary(final_response="")
1234
+    await runner.execute_batch(
1235
+        tool_calls=[tool_call],
1236
+        tool_source="assistant",
1237
+        pending_tool_calls_seen=set(),
1238
+        emit=_noop_emit,
1239
+        summary=summary,
1240
+        dod=dod,
1241
+        executor=executor,  # type: ignore[arg-type]
1242
+        on_confirmation=None,
1243
+        on_user_question=None,
1244
+        emit_confirmation=None,
1245
+        consecutive_errors=0,
1246
+    )
1247
+
1248
+    assert any(
1249
+        "Continue with the next pending item: `Create the nginx index.html file`"
1250
+        in message
1251
+        for message in queued_messages
1252
+    )
1253
+    assert any(
1254
+        "stop gathering more reference material and perform the change now" in message
1255
+        for message in queued_messages
1256
+    )
1257
+
1258
+
11641259
 @pytest.mark.asyncio
11651260
 async def test_tool_batch_runner_hands_off_noop_toc_edit_when_file_is_already_valid(
11661261
     temp_dir: Path,
tests/test_workflow.py (modified)
@@ -345,6 +345,47 @@ def test_planning_artifacts_with_acceptance_criteria_rewrites_verification_markd
345345
     )
346346
 
347347
 
348
+def test_planning_artifacts_with_progress_context_records_touched_and_completed_work() -> None:
349
+    artifacts = PlanningArtifacts.from_model_output(
350
+        "\n".join(
351
+            [
352
+                "# Implementation Plan",
353
+                "",
354
+                "## Execution Order",
355
+                "1. Create the guide files.",
356
+                "",
357
+                "<<<VERIFICATION>>>",
358
+                "",
359
+                "# Verification Plan",
360
+                "",
361
+                "## Acceptance Criteria",
362
+                "- At least one chapter file exists.",
363
+                "",
364
+                "## Verification Commands",
365
+                "- `find chapters -name \"*.html\" | wc -l`",
366
+            ]
367
+        ),
368
+        task_statement="Create a thorough nginx guide.",
369
+    )
370
+
371
+    updated = artifacts.with_progress_context(
372
+        touched_files=["/tmp/nginx/index.html"],
373
+        completed_items=[
374
+            "Create the guide scaffold",
375
+            "Collect verification evidence",
376
+        ],
377
+    )
378
+
379
+    assert "## Confirmed Progress" in updated.implementation_markdown
380
+    assert "Already touched during execution: `/tmp/nginx/index.html`." in (
381
+        updated.implementation_markdown
382
+    )
383
+    assert "Already completed during execution: Create the guide scaffold." in (
384
+        updated.implementation_markdown
385
+    )
386
+    assert "Collect verification evidence" not in updated.implementation_markdown
387
+
388
+
348389
 def test_merge_refreshed_todos_with_existing_scope_keeps_grounded_progress() -> None:
349390
     task = (
350391
         "Create an equally thorough nginx guide with index.html plus chapter files "
@@ -371,6 +412,48 @@ def test_merge_refreshed_todos_with_existing_scope_keeps_grounded_progress() ->
371412
     )
372413
 
373414
 
415
+def test_merge_refreshed_todos_with_existing_scope_filters_retro_refresh_noise() -> None:
416
+    task = (
417
+        "Create an equally thorough nginx guide with index.html plus chapter files "
418
+        "covering getting started, installation, first website setup, configs, and "
419
+        "advanced topics."
420
+    )
421
+
422
+    todos = merge_refreshed_todos_with_existing_scope(
423
+        task,
424
+        existing_pending_items=[
425
+            "Create each chapter file in sequence, following the same structure as the Fortran guide",
426
+            "Ensure all files are properly linked and formatted consistently",
427
+        ],
428
+        existing_completed_items=[
429
+            "First, examine the existing Fortran guide structure to understand the format and cadence",
430
+            "Create the directory structure for the new nginx guide",
431
+            "Create the main index.html file",
432
+        ],
433
+        refreshed_steps=[
434
+            "First examined the existing Fortran guide structure to understand format and cadence",
435
+            "Created the main index.html file with navigation",
436
+            "Created chapter files in sequence:",
437
+            "01-getting-started.html",
438
+            "02-installation.html",
439
+            "03-first-website.html",
440
+            "04-configuring.html",
441
+            "All files properly linked with navigation between chapters",
442
+            "Verify the final navigation links across the guide",
443
+        ],
444
+    )
445
+
446
+    labels = {item["content"]: item["status"] for item in todos}
447
+    assert (
448
+        labels["Create each chapter file in sequence, following the same structure as the Fortran guide"]
449
+        == "pending"
450
+    )
451
+    assert labels["Ensure all files are properly linked and formatted consistently"] == "pending"
452
+    assert labels["Verify the final navigation links across the guide"] == "pending"
453
+    assert "Created chapter files in sequence:" not in labels
454
+    assert "04-configuring.html" not in labels
455
+
456
+
374457
 def test_workflow_artifact_store_and_bridge_round_trip(tmp_path: Path) -> None:
375458
     store = WorkflowArtifactStore(tmp_path)
376459
     brief = ClarifyBrief.fallback(
@@ -528,3 +611,71 @@ def test_advance_todos_from_tool_call_tracks_plan_progress() -> None:
528611
         ),
529612
     )
530613
     assert "Verify the updated index.html file is properly formatted" in dod.completed_items
614
+
615
+
616
+def test_advance_todos_from_tool_call_keeps_aggregate_mutation_steps_pending() -> None:
617
+    dod = create_definition_of_done("Create a multi-file nginx guide.")
618
+    sync_todos_to_definition_of_done(
619
+        dod,
620
+        [
621
+            {
622
+                "content": "Create each chapter file in sequence, following the same structure as the Fortran guide",
623
+                "active_form": "Working on: Create each chapter file in sequence, following the same structure as the Fortran guide",
624
+                "status": "pending",
625
+            },
626
+            {
627
+                "content": "Ensure all files are properly linked and formatted consistently",
628
+                "active_form": "Working on: Ensure all files are properly linked and formatted consistently",
629
+                "status": "pending",
630
+            },
631
+        ],
632
+    )
633
+
634
+    assert (
635
+        advance_todos_from_tool_call(
636
+            dod,
637
+            ToolCall(
638
+                id="write-one-chapter",
639
+                name="write",
640
+                arguments={
641
+                    "file_path": "/tmp/nginx/chapters/01-getting-started.html",
642
+                    "content": "<html></html>",
643
+                },
644
+            ),
645
+        )
646
+        is False
647
+    )
648
+    assert (
649
+        "Create each chapter file in sequence, following the same structure as the Fortran guide"
650
+        in dod.pending_items
651
+    )
652
+
653
+
654
+def test_advance_todos_from_tool_call_tracks_bash_directory_creation_progress() -> None:
655
+    dod = create_definition_of_done("Create a multi-file nginx guide.")
656
+    sync_todos_to_definition_of_done(
657
+        dod,
658
+        [
659
+            {
660
+                "content": "Create the nginx directory structure",
661
+                "active_form": "Working on: Create the nginx directory structure",
662
+                "status": "pending",
663
+            },
664
+            {
665
+                "content": "Create index.html for nginx guide",
666
+                "active_form": "Working on: Create index.html for nginx guide",
667
+                "status": "pending",
668
+            },
669
+        ],
670
+    )
671
+
672
+    assert advance_todos_from_tool_call(
673
+        dod,
674
+        ToolCall(
675
+            id="mkdir-nginx",
676
+            name="bash",
677
+            arguments={"command": "mkdir -p ~/Loader/guides/nginx/chapters"},
678
+        ),
679
+    )
680
+    assert "Create the nginx directory structure" in dod.completed_items
681
+    assert "Create index.html for nginx guide" in dod.pending_items
tests/test_workflow_runtime.py — modified
@@ -1438,6 +1438,21 @@ async def test_stale_plan_artifacts_trigger_targeted_plan_refresh(
14381438
         entry.reason_code == "plan_refresh_completed"
14391439
         for entry in run.agent.last_turn_summary.workflow_timeline
14401440
     )
1441
+    refresh_prompt = next(
1442
+        invocation.messages[-1].content
1443
+        for invocation in backend.invocations
1444
+        if "Refresh the existing planning artifacts instead of creating a fresh plan from scratch."
1445
+        in invocation.messages[-1].content
1446
+    )
1447
+    assert "Current execution progress:" in refresh_prompt
1448
+    assert "Already touched during execution:" in refresh_prompt
1449
+    assert f"- {target}" in refresh_prompt
1450
+    assert any(
1451
+        "Plan refresh preserved the progress already made." in message.content
1452
+        and "Do not restart from initial discovery" in message.content
1453
+        for invocation in backend.invocations
1454
+        for message in invocation.messages
1455
+    )
14411456
 
14421457
 
14431458
 @pytest.mark.asyncio