`3a703a1`

Harden qwen recovery loops and verification

Authored by

espadonne 3 weeks ago

SHA: 3a703a1e283382d4405282386f066a805123c57f
Parents: f0d4490
Tree: 132f2bd

17 changed files

Status	File	+	-
M	`src/loader/runtime/finalization.py`	8	0
M	`src/loader/runtime/recovery.py`	8	6
M	`src/loader/runtime/safeguard_services.py`	30	7
M	`src/loader/runtime/tool_batch_recovery.py`	107	2
M	`src/loader/runtime/tool_batches.py`	43	0
M	`src/loader/runtime/workflow.py`	222	2
M	`src/loader/tools/file_tools.py`	27	10
M	`src/loader/tools/fs_safety.py`	62	0
M	`src/loader/utils/file_mutations.py`	19	1
M	`tests/test_completion_policy.py`	74	1
M	`tests/test_expanded_tools.py`	37	0
M	`tests/test_finalization.py`	69	1
M	`tests/test_runtime_harness.py`	100	0
M	`tests/test_safeguard_services.py`	15	1
M	`tests/test_tool_batch_policies.py`	62	0
M	`tests/test_tool_batches.py`	102	0
M	`tests/test_workflow.py`	112	0

src/loader/runtime/finalization.py (modified)

                  project_root=self.context.project_root,
                  task_statement=dod.task_statement,
+             )
 +        else:
 +            for command in derive_verification_commands(
 +                dod,
 +                project_root=self.context.project_root,
 +                task_statement=dod.task_statement,
 +            ):
 +                if command not in dod.verification_commands:
 +                    dod.verification_commands.append(command)
          await self.set_workflow_mode(
              ModeDecision.transition(

src/loader/runtime/recovery.py (modified)

  {hints}
  ## CRITICAL RULES:
 -1. **INVESTIGATE FIRST** - Read config files, list directories, check what exists
 -2. **DO NOT** just retry the same command with slight variations
 -3. **DO NOT** try `npm start` then `npm run start` - these are the same thing!
 -4. **READ THE ERROR** - It usually tells you exactly what's wrong
 -5. If the error says "missing script: start", read package.json to see what scripts exist
 +1. Start from the error and the state you already know
 +2. Investigate only if a specific fact is still missing
 +3. If you already have enough confirmed evidence, apply the fix instead of rereading the same files
 +4. **DO NOT** just retry the same command with slight variations
 +5. **DO NOT** try `npm start` then `npm run start` - these are the same thing!
 +6. **READ THE ERROR** - It usually tells you exactly what's wrong
 +7. If the error says "missing script: start", read package.json to see what scripts exist
  ## Current attempt: {attempt_count}/{max_retries}
 -**Your next action should gather information OR try a fundamentally different approach.**
 +**Your next action should either gather the missing information OR apply the fix using confirmed findings.**
  What will you do?"""

src/loader/runtime/safeguard_services.py (modified)

          sig = str(hash(str(hunks)))
          return sig in self._files_edited.get(norm_path, [])
 +    def would_duplicate_raw_patch(self, file_path: str, patch_text: str) -> bool:
 +        norm_path = self._normalize_path(file_path)
 +        sig = str(hash(patch_text))
 +        return sig in self._files_edited.get(norm_path, [])
++
      def would_duplicate_command(self, command: str) -> bool:
          norm_cmd = self._normalize_command(command)
          return norm_cmd in self._commands_run
          elif tool_name == "patch":
              file_path = arguments.get("file_path", "")
              hunks = arguments.get("hunks", [])
 -            if isinstance(hunks, list) and self.would_duplicate_patch(file_path, hunks):
 +            raw_patch = arguments.get("patch") or arguments.get("diff") or arguments.get("patch_text")
 +            if isinstance(hunks, list) and hunks and self.would_duplicate_patch(file_path, hunks):
                  return True, f"Same patch already applied to: {file_path}"
 +            if isinstance(raw_patch, str) and raw_patch.strip():
 +                if self.would_duplicate_raw_patch(file_path, raw_patch):
 +                    return True, f"Same patch already applied to: {file_path}"
          elif tool_name == "read":
              read_key = self._make_read_key(arguments)
+                     (
                          "Already read "
                          f"{str(arguments.get('file_path', '')).strip()} "
 -                        "recently without any intervening changes"
 +                        "recently without any intervening changes; "
 +                        "reuse the earlier read result instead of rereading"
                      ),
                      repeat_threshold=self.READ_REPEAT_THRESHOLD,
+                 )
                  duplicate, reason = self._check_recent_observation(
                      self._recent_searches,
                      observation_key,
 -                    "Already ran the same search recently without any intervening changes",
 +                    (
 +                        "Already ran the same search recently without any intervening "
 +                        "changes; reuse the earlier search result instead of rerunning it"
 +                    ),
                      repeat_threshold=self.SEARCH_REPEAT_THRESHOLD,
+                 )
                  if duplicate:
                  duplicate, reason = self._check_recent_observation(
                      self._recent_bash_observations,
                      self._normalize_command(command),
 -                    "Already ran the same read-only shell probe recently without any intervening changes",
 +                    (
 +                        "Already ran the same read-only shell probe recently without any "
 +                        "intervening changes; reuse the earlier shell output instead of rerunning it"
 +                    ),
                      repeat_threshold=self.BASH_OBSERVATION_REPEAT_THRESHOLD,
+                 )
                  if duplicate:
              file_path = arguments.get("file_path", "")
              hunks = arguments.get("hunks", [])
              if file_path:
 -                self.record_edit(file_path, str(hunks), "structured_patch")
 +                raw_patch = arguments.get("patch") or arguments.get("diff") or arguments.get("patch_text")
 +                if isinstance(hunks, list) and hunks:
 +                    self.record_edit(file_path, str(hunks), "structured_patch")
 +                elif isinstance(raw_patch, str) and raw_patch.strip():
 +                    self.record_edit(file_path, raw_patch, "raw_patch")
                  self._note_mutation()
          elif tool_name == "read":
      def _validate_patch(self, arguments: dict) -> ValidationResult:
          file_path = arguments.get("file_path", "")
          hunks = arguments.get("hunks", [])
 +        raw_patch = arguments.get("patch") or arguments.get("diff") or arguments.get("patch_text")
          if not file_path or not str(file_path).strip():
              return ValidationResult(
          if not path_result.valid:
              return path_result
 -        if not isinstance(hunks, list) or not hunks:
 +        has_hunks = isinstance(hunks, list) and bool(hunks)
 +        has_raw_patch = isinstance(raw_patch, str) and bool(raw_patch.strip())
 +        if not has_hunks and not has_raw_patch:
              return ValidationResult(
                  valid=False,
                  reason="Patch hunks are missing",
 -                suggestion="Provide one or more structured patch hunks",
 +                suggestion="Provide structured patch hunks or a unified diff patch string",
                  severity="error",
+             )

src/loader/runtime/tool_batch_recovery.py (modified)

  from __future__ import annotations
  from collections.abc import Awaitable, Callable
 +from difflib import SequenceMatcher
 +from pathlib import Path
 +import re
  from ..llm.base import Message, ToolCall
  from .compaction import infer_preferred_next_step, summarize_confirmed_facts
                  tool_call.arguments,
                  outcome.result_output,
+             )
 -            recovery_prompt = self._augment_recovery_prompt(recovery_prompt)
 +            recovery_prompt = self._augment_recovery_prompt(
 +                recovery_prompt,
 +                tool_call=tool_call,
 +                outcome=outcome,
 +            )
              return Message.tool_result_message(
                  tool_call_id=tool_call.id,
                  display_content=recovery_prompt,
              is_error=True,
+         )
 -    def _augment_recovery_prompt(self, prompt: str) -> str:
 +    def _augment_recovery_prompt(
 +        self,
 +        prompt: str,
 +        *,
 +        tool_call: ToolCall,
 +        outcome: ToolExecutionOutcome,
 +    ) -> str:
          """Append transcript-aware recovery guidance when recent facts exist."""
          session = self.context.session
              session.messages,
              current_task=current_task,
+         )
 +        actionable_known_state = bool(confirmed_facts and preferred_next_step)
          if not confirmed_facts and not preferred_next_step and not current_task:
              return prompt
              "- Preserve progress: do not restart by rereading already-confirmed files "
              "unless you need genuinely new evidence."
+         )
 +        if actionable_known_state:
 +            lines.extend(
 +                [
 +                    "",
 +                    "## ACTION BIAS FOR THIS RECOVERY",
 +                    "- The confirmed findings above are already enough to keep moving.",
 +                    "- Prefer edit/write/patch on the target file over rereading the same files.",
 +                    "- Only inspect one more file if a specific filename, href, or title is still unknown.",
 +                    "- Treat the preferred next step as the default path forward.",
 +                ]
 +            )
 +        candidate_lines = self._file_not_found_candidate_lines(tool_call, outcome)
 +        if candidate_lines:
 +            lines.extend(["", "## LIKELY FILE CANDIDATES", *candidate_lines])
          return "\n".join(lines)
++
 +    def _file_not_found_candidate_lines(
 +        self,
 +        tool_call: ToolCall,
 +        outcome: ToolExecutionOutcome,
 +    ) -> list[str]:
 +        if tool_call.name not in {"read", "write", "edit", "patch"}:
 +            return []
 +        if "not found" not in outcome.result_output.lower():
 +            return []
++
 +        missing_path = self._canonicalize_path(
 +            str(
 +                tool_call.arguments.get("file_path")
 +                or tool_call.arguments.get("path")
 +                or ""
 +            ).strip()
 +        )
 +        if not missing_path:
 +            return []
++
 +        candidates = self._rank_known_file_candidates(missing_path)
 +        if not candidates:
 +            return []
++
 +        names = ", ".join(f"`{Path(candidate).name}`" for candidate in candidates[:3])
 +        return [
 +            f"- Requested file does not exist: `{missing_path}`",
 +            f"- Closest known files in the same directory: {names}",
 +            "- Prefer one of those exact filenames instead of retrying the missing path.",
 +        ]
++
 +    def _rank_known_file_candidates(self, missing_path: str) -> list[str]:
 +        missing_parent = str(Path(missing_path).parent)
 +        missing_name = Path(missing_path).name
 +        missing_prefix = missing_name.split("-", 1)[0]
++
 +        ranked: list[tuple[float, str]] = []
 +        seen: set[str] = set()
 +        for candidate in self._known_file_paths():
 +            if candidate == missing_path:
 +                continue
 +            if str(Path(candidate).parent) != missing_parent:
 +                continue
 +            name = Path(candidate).name
 +            if name in seen:
 +                continue
 +            seen.add(name)
++
 +            score = SequenceMatcher(None, missing_name, name).ratio()
 +            if missing_prefix and name.startswith(f"{missing_prefix}-"):
 +                score += 1.0
 +            ranked.append((score, candidate))
++
 +        ranked.sort(key=lambda item: (-item[0], item[1]))
 +        return [candidate for _, candidate in ranked]
++
 +    def _known_file_paths(self) -> list[str]:
 +        pattern = re.compile(r"(?:~|/)[^\s`\"']+\.html")
 +        discovered: list[str] = []
 +        seen: set[str] = set()
 +        for message in self.context.session.messages:
 +            for raw_path in pattern.findall(message.content):
 +                candidate = self._canonicalize_path(raw_path)
 +                if not candidate or candidate in seen:
 +                    continue
 +                seen.add(candidate)
 +                discovered.append(candidate)
 +        return discovered
++
 +    def _canonicalize_path(self, raw_path: str) -> str:
 +        if not raw_path:
 +            return ""
 +        try:
 +            return str(Path(raw_path).expanduser().resolve(strict=False))
 +        except (OSError, RuntimeError, ValueError):
 +            return str(Path(raw_path).expanduser())

src/loader/runtime/tool_batches.py (modified)

      VerificationObservationStatus,
+ )
  from .workflow import sync_todos_to_definition_of_done
 +from .workflow import advance_todos_from_tool_call
 +from .compaction import infer_preferred_next_step
  EventSink = Callable[[AgentEvent], Awaitable[None]]
  ConfirmationHandler = (
              # otherwise the model operates blind and loops.
              self.context.session.append(outcome.message)
              summary.tool_result_messages.append(outcome.message)
 +            if outcome.state == ToolExecutionState.DUPLICATE:
 +                self._queue_duplicate_observation_nudge(tool_call)
              should_continue = await self.verification_gate.should_continue(
                  tool_call=tool_call,
          return result
 +    def _queue_duplicate_observation_nudge(self, tool_call: ToolCall) -> None:
 +        """Queue a concrete next-step nudge after duplicate observational actions."""
++
 +        if tool_call.name not in {"read", "glob", "grep", "bash"}:
 +            return
++
 +        current_task = getattr(self.context.session, "current_task", None)
 +        preferred_next_step = infer_preferred_next_step(
 +            self.context.session.messages,
 +            current_task=current_task,
 +        )
 +        if preferred_next_step:
 +            self.context.queue_steering_message(
 +                "Reuse the earlier observation instead of repeating it. "
 +                f"{preferred_next_step} "
 +                "Only gather more evidence if a specific filename, href, or title is still unknown."
 +            )
 +            return
++
 +        target_path = str(
 +            tool_call.arguments.get("file_path")
 +            or tool_call.arguments.get("path")
 +            or ""
 +        ).strip()
 +        if target_path:
 +            self.context.queue_steering_message(
 +                "Reuse the earlier observation instead of repeating it. "
 +                f"Use the current contents of `{target_path}` and take a different next step. "
 +                "Only gather more evidence if a specific filename, href, or title is still unknown."
 +            )
 +            return
++
 +        self.context.queue_steering_message(
 +            "Reuse the earlier observation instead of repeating it. "
 +            "Choose a different next step that makes progress."
 +        )
++
      async def _record_successful_execution(
          self,
          *,
              new_todos = outcome.registry_result.metadata.get("new_todos", [])
              if isinstance(new_todos, list):
                  sync_todos_to_definition_of_done(dod, new_todos)
 +        else:
 +            advance_todos_from_tool_call(dod, tool_call)
          self.dod_store.save(dod)
          recovery_context = self.context.recovery_context
          if recovery_context is not None:

src/loader/runtime/workflow.py (modified)

  from pathlib import Path
  from typing import ClassVar
 +from ..llm.base import ToolCall
  from .clarify_grounding import ClarifyGrounding
  from .dod import slugify
  from .workflow_policy import (
      "WorkflowSignalPacket",
      "WorkflowTimelineEntry",
      "WorkflowTimelineEntryKind",
 +    "advance_todos_from_tool_call",
      "build_execute_bridge",
      "enrich_clarify_brief_with_grounding",
      "extract_verification_commands_from_markdown",
  _GENERIC_ASSUMPTIONS = {
      "Unspecified details stay unchanged unless evidence says otherwise.",
+ }
 +_SPECIAL_TODO_ITEMS = {
 +    "Complete the requested work",
 +    "Collect verification evidence",
 +}
 +_READ_STEP_HINTS = (
 +    "read",
 +    "examine",
 +    "inspect",
 +    "review",
 +    "check",
 +    "look at",
 +    "look through",
 +    "open",
 +    "understand",
 +    "study",
 +)
 +_SEARCH_STEP_HINTS = (
 +    "list",
 +    "find",
 +    "search",
 +    "scan",
 +    "discover",
 +    "locate",
 +    "enumerate",
 +    "gather",
 +)
 +_PARSE_STEP_HINTS = (
 +    "parse",
 +    "extract",
 +    "identify",
 +    "map",
 +    "determine",
 +)
 +_MUTATION_STEP_HINTS = (
 +    "update",
 +    "edit",
 +    "write",
 +    "fix",
 +    "modify",
 +    "change",
 +    "patch",
 +    "replace",
 +    "correct",
 +    "rewrite",
 +)
 +_VERIFY_STEP_HINTS = (
 +    "verify",
 +    "validation",
 +    "validate",
 +    "test",
 +    "confirm",
 +    "check",
 +)
 +_SHELL_COMMAND_START = re.compile(
 +    r"(?<![\w/.-])("
 +    r"ls|grep|pytest|uv|python3?|html5validator|cargo|npm|node|mypy|ruff|find|git|cat|sed|head|tail"
 +    r")\b"
 +)
  _SECTION_ALIASES = {
      "task statement": "task_statement",
      dod.completed_items = list(dict.fromkeys(completed + special_completed))
 +def advance_todos_from_tool_call(dod, tool_call: ToolCall) -> bool:
 +    """Advance the best-matching pending todo from a successful tool call."""
++
 +    best_index: int | None = None
 +    best_score = 0
++
 +    for index, item in enumerate(dod.pending_items):
 +        label = item.strip()
 +        if not label or label in _SPECIAL_TODO_ITEMS:
 +            continue
 +        score = _todo_progress_score(label, tool_call)
 +        if score > best_score:
 +            best_index = index
 +            best_score = score
++
 +    if best_index is None or best_score <= 0:
 +        return False
++
 +    completed = dod.pending_items.pop(best_index)
 +    if completed not in dod.completed_items:
 +        dod.completed_items.append(completed)
 +    return True
++
++
 +def _todo_progress_score(item: str, tool_call: ToolCall) -> int:
 +    text = item.lower()
 +    name = tool_call.name
 +    file_path = str(tool_call.arguments.get("file_path", "")).strip().lower()
 +    path = str(tool_call.arguments.get("path", "")).strip().lower()
 +    pattern = str(tool_call.arguments.get("pattern", "")).strip().lower()
 +    command = str(tool_call.arguments.get("command", "")).strip().lower()
 +    combined = " ".join(part for part in (file_path, path, pattern, command) if part)
++
 +    path_hint = file_path or path
 +    basename = Path(path_hint).name.lower() if path_hint else ""
 +    parent = Path(path_hint).parent.name.lower() if path_hint else ""
++
 +    score = 0
 +    if basename and basename in text:
 +        score += 3
 +    if parent and parent not in {"", "."} and parent in text:
 +        score += 2
 +    if "index" in text and "index" in combined:
 +        score += 2
 +    if "chapter" in text and ("chapter" in basename or "chapters" in combined):
 +        score += 1
 +    if "html" in text and ".html" in combined:
 +        score += 1
++
 +    if name == "read":
 +        if _contains_any(text, _READ_STEP_HINTS):
 +            score += 2
 +        if _contains_any(text, _PARSE_STEP_HINTS) and ".html" in combined:
 +            score += 1
 +    elif name in {"glob", "grep"}:
 +        if _contains_any(text, _SEARCH_STEP_HINTS):
 +            score += 2
 +        if name == "glob" and _contains_any(text, _READ_STEP_HINTS) and ".html" in combined:
 +            score += 1
 +    elif name == "bash":
 +        if _looks_like_verification_command(command):
 +            if _contains_any(text, _VERIFY_STEP_HINTS):
 +                score += 3
 +        elif _looks_like_search_command(command):
 +            if _contains_any(text, _SEARCH_STEP_HINTS):
 +                score += 2
 +        elif _looks_like_read_command(command):
 +            if _contains_any(text, _READ_STEP_HINTS):
 +                score += 2
 +    elif name in {"write", "edit", "patch"}:
 +        if _contains_any(text, _MUTATION_STEP_HINTS):
 +            score += 3
++
 +    if name in {"write", "edit", "patch"} and _contains_any(text, _VERIFY_STEP_HINTS):
 +        return 0
 +    return score
++
++
 +def _contains_any(text: str, candidates: tuple[str, ...]) -> bool:
 +    return any(candidate in text for candidate in candidates)
++
++
 +def _looks_like_search_command(command: str) -> bool:
 +    return any(token in command for token in (" ls", "ls ", "find ", "rg ", "grep ", "glob "))
++
++
 +def _looks_like_read_command(command: str) -> bool:
 +    return any(token in command for token in ("cat ", "sed ", "head ", "tail "))
++
++
 +def _looks_like_verification_command(command: str) -> bool:
 +    return any(
 +        token in command
 +        for token in (
 +            "pytest",
 +            "unittest",
 +            " test",
 +            " check",
 +            " verify",
 +            "html5validator",
 +            "mypy",
 +            "ruff",
 +            "lint",
 +            "grep ",
 +            "diff ",
 +            "cmp ",
 +        )
 +    )
++
++
  def extract_verification_commands_from_markdown(markdown: str) -> list[str]:
      """Extract verification commands from a verification-plan markdown document."""
  def _extract_commands(items: list[str]) -> list[str]:
      commands: list[str] = []
      for item in items:
 -        match = re.match(r"^`(.+)`$", item)
 -        commands.append((match.group(1) if match else item).strip())
 +        text = item.strip()
 +        if not text:
 +            continue
++
 +        # Code fences often contain shell comments plus the actual command lines.
 +        if "```" in text:
 +            text = text.replace("```bash", "```").replace("```sh", "```")
 +            if "\n" not in text:
 +                commands.extend(_extract_collapsed_shell_commands(text))
 +                continue
++
 +        lines = text.splitlines() if "\n" in text or "```" in text else [text]
 +        for line in lines:
 +            candidate = line.strip()
 +            if not candidate or candidate.startswith("```"):
 +                continue
 +            candidate = re.sub(r"^-\s+", "", candidate)
 +            match = re.match(r"^`(.+)`$", candidate)
 +            candidate = (match.group(1) if match else candidate).strip()
 +            if candidate.startswith("#"):
 +                candidate = _extract_shell_command_from_text(candidate)
 +                if not candidate:
 +                    continue
 +            if candidate:
 +                commands.append(candidate)
      return [command for command in commands if command]
 +def _extract_collapsed_shell_commands(text: str) -> list[str]:
 +    stripped = re.sub(r"```(?:\w+)?", "", text).strip()
 +    if not stripped:
 +        return []
++
 +    matches = list(_SHELL_COMMAND_START.finditer(stripped))
 +    if not matches:
 +        extracted = _extract_shell_command_from_text(stripped)
 +        return [extracted] if extracted else []
++
 +    commands: list[str] = []
 +    for index, match in enumerate(matches):
 +        start = match.start()
 +        end = matches[index + 1].start() if index + 1 < len(matches) else len(stripped)
 +        candidate = stripped[start:end].strip()
 +        if candidate:
 +            commands.append(candidate)
 +    return commands
++
++
 +def _extract_shell_command_from_text(text: str) -> str:
 +    match = _SHELL_COMMAND_START.search(text)
 +    if match is None:
 +        return ""
 +    return text[match.start():].strip()
++
++
  def _has_concrete_anchor(task: str) -> bool:
      return any(
          re.search(pattern, task)

src/loader/tools/file_tools.py (modified)

      ensure_safe_to_read,
      ensure_safe_to_write,
      make_structured_patch,
 +    parse_unified_diff_patch,
      resolve_workspace_path,
+ )
      def description(self) -> str:
          return (
              "Apply structured patch hunks to a file. Prefer this for larger "
 -            "or multi-line edits where exact old/new string replacement is brittle."
 +            "or multi-line edits where exact old/new string replacement is brittle. "
 +            "A raw unified diff string is also accepted via `patch`."
+         )
      @property
                          ],
                      },
                  },
 +                "patch": {
 +                    "type": "string",
 +                    "description": (
 +                        "Optional unified diff patch string. Loader will parse this "
 +                        "into structured hunks when possible."
 +                    ),
 +                },
              },
 -            "required": ["file_path", "hunks"],
 +            "required": ["file_path"],
+         }
      @property
      async def execute(
          self,
          file_path: str,
 -        hunks: list[dict[str, Any]],
 +        hunks: list[dict[str, Any]] | None = None,
 +        patch: str | None = None,
          **kwargs: Any,
      ) -> ToolResult:
          kwargs.pop("_skip_confirmation", None)
              ensure_safe_to_read(path)
              original_content = await asyncio.to_thread(path.read_text)
              original_lines = original_content.splitlines()
 -            parsed_hunks = [
 -                StructuredPatchHunk.from_dict_with_original(
 -                    hunk,
 -                    original_lines=original_lines,
 -                )
 -                for hunk in hunks
 -            ]
 +            raw_patch = patch or kwargs.get("diff") or kwargs.get("patch_text")
 +            parsed_hunks: list[StructuredPatchHunk]
 +            if hunks:
 +                parsed_hunks = [
 +                    StructuredPatchHunk.from_dict_with_original(
 +                        hunk,
 +                        original_lines=original_lines,
 +                    )
 +                    for hunk in hunks
 +                ]
 +            elif isinstance(raw_patch, str) and raw_patch.strip():
 +                parsed_hunks = parse_unified_diff_patch(raw_patch)
 +            else:
 +                parsed_hunks = []
              if not parsed_hunks:
                  raise ValueError("hunks must not be empty")
              updated_content = apply_structured_patch(original_content, parsed_hunks)

src/loader/tools/fs_safety.py (modified)

  from dataclasses import asdict, dataclass
  from pathlib import Path
 +import re
  MAX_READ_SIZE = 10 * 1024 * 1024
  MAX_WRITE_SIZE = 10 * 1024 * 1024
              "structured patch context mismatch: "
              f"expected {expected!r}, found {actual!r}"
+         )
++
++
 +_UNIFIED_DIFF_HUNK_RE = re.compile(
 +    r"^@@ -(?P<old_start>\d+)(?:,(?P<old_lines>\d+))? "
 +    r"\+(?P<new_start>\d+)(?:,(?P<new_lines>\d+))? @@"
 +)
++
++
 +def parse_unified_diff_patch(patch_text: str) -> list[StructuredPatchHunk]:
 +    """Parse a unified diff string into structured patch hunks."""
++
 +    if not str(patch_text).strip():
 +        raise ValueError("patch text is empty")
++
 +    hunks: list[StructuredPatchHunk] = []
 +    current_hunk: StructuredPatchHunk | None = None
++
 +    for raw_line in str(patch_text).splitlines():
 +        if raw_line.startswith(("--- ", "+++ ")):
 +            continue
 +        if raw_line.startswith("@@"):
 +            match = _UNIFIED_DIFF_HUNK_RE.match(raw_line)
 +            if match is None:
 +                raise ValueError(
 +                    "patch text contains an invalid unified-diff hunk header"
 +                )
 +            if current_hunk is not None:
 +                hunks.append(current_hunk)
 +            current_hunk = StructuredPatchHunk(
 +                old_start=int(match.group("old_start")),
 +                old_lines=int(match.group("old_lines") or 1),
 +                new_start=int(match.group("new_start")),
 +                new_lines=int(match.group("new_lines") or 1),
 +                lines=[],
 +            )
 +            continue
++
 +        if raw_line == r"\ No newline at end of file":
 +            continue
++
 +        if current_hunk is None:
 +            if not raw_line.strip():
 +                continue
 +            raise ValueError(
 +                "patch text must include at least one unified-diff hunk header"
 +            )
++
 +        prefix = raw_line[:1]
 +        if prefix not in {" ", "+", "-"}:
 +            raise ValueError(
 +                "patch text contains a diff line without a valid prefix"
 +            )
 +        current_hunk.lines.append(raw_line)
++
 +    if current_hunk is not None:
 +        hunks.append(current_hunk)
++
 +    if not hunks:
 +        raise ValueError("patch text must include at least one unified-diff hunk")
++
 +    return hunks

src/loader/utils/file_mutations.py (modified)

  from rich.panel import Panel
  from rich.text import Text
 -from ..tools.fs_safety import StructuredPatchHunk, make_structured_patch
 +from ..tools.fs_safety import (
 +    StructuredPatchHunk,
 +    make_structured_patch,
 +    parse_unified_diff_patch,
 +)
  FILE_MUTATION_TOOLS = {"write", "edit", "patch"}
  DIFF_TRUNCATION_NOTICE = "truncated for display; full result preserved in session"
          structured_patch = _coerce_patch_hunks(info.get("hunks")) or _coerce_patch_hunks(
              args.get("hunks")
+         )
 +    if not structured_patch and tool_name == "patch":
 +        structured_patch = _coerce_raw_patch_hunks(info) or _coerce_raw_patch_hunks(args)
      old_text = _extract_old_text(tool_name, info) or _extract_old_text(tool_name, args)
      new_text = _extract_new_text(tool_name, info) or _extract_new_text(tool_name, args)
      return hunks
 +def _coerce_raw_patch_hunks(payload: dict[str, Any]) -> list[StructuredPatchHunk]:
 +    for key in ("patch", "diff", "patch_text"):
 +        value = payload.get(key)
 +        if not isinstance(value, str) or not value.strip():
 +            continue
 +        try:
 +            return parse_unified_diff_patch(value)
 +        except ValueError:
 +            continue
 +    return []
++
++
  def _extract_file_path(payload: dict[str, Any]) -> str | None:
      for key in ("file_path", "filePath", "path", "filename", "file"):
          value = payload.get(key)

tests/test_completion_policy.py (modified)

  import pytest
 -from loader.llm.base import Message, Role
 +from loader.llm.base import Message, Role, ToolCall
  from loader.runtime.completion_policy import CompletionPolicy
  from loader.runtime.context import RuntimeContext
  from loader.runtime.dod import VerificationEvidence, create_definition_of_done
      detect_premature_completion,
      get_continuation_prompt,
+ )
 +from loader.runtime.workflow import advance_todos_from_tool_call, sync_todos_to_definition_of_done
  from loader.runtime.verification_observations import (
      VerificationObservationStatus,
      verification_attempt_id,
      assert assessment.evidence_provenance[0].summary == "verification failed for `pytest -q`"
 +def test_completion_assessment_uses_advanced_todo_progress_for_next_step() -> None:
 +    dod = create_definition_of_done("Fix the chapter links in index.html.")
 +    sync_todos_to_definition_of_done(
 +        dod,
 +        [
 +            {
 +                "content": "First, examine the current index.html file to understand its structure",
 +                "active_form": "Working on: First, examine the current index.html file to understand its structure",
 +                "status": "pending",
 +            },
 +            {
 +                "content": "List and read all HTML files in the chapters directory to extract chapter information",
 +                "active_form": "Working on: List and read all HTML files in the chapters directory to extract chapter information",
 +                "status": "pending",
 +            },
 +            {
 +                "content": "Parse chapter titles from each HTML file",
 +                "active_form": "Working on: Parse chapter titles from each HTML file",
 +                "status": "pending",
 +            },
 +            {
 +                "content": "Update index.html with correct chapter links and titles",
 +                "active_form": "Working on: Update index.html with correct chapter links and titles",
 +                "status": "pending",
 +            },
 +        ],
 +    )
 +    advance_todos_from_tool_call(
 +        dod,
 +        ToolCall(
 +            id="read-index",
 +            name="read",
 +            arguments={"file_path": "/tmp/fortran/index.html"},
 +        ),
 +    )
 +    advance_todos_from_tool_call(
 +        dod,
 +        ToolCall(
 +            id="glob-chapters",
 +            name="glob",
 +            arguments={"path": "/tmp/fortran/chapters", "pattern": "*.html"},
 +        ),
 +    )
 +    advance_todos_from_tool_call(
 +        dod,
 +        ToolCall(
 +            id="read-chapter",
 +            name="read",
 +            arguments={"file_path": "/tmp/fortran/chapters/01-introduction.html"},
 +        ),
 +    )
++
 +    assessment = assess_completion_follow_through_with_provenance(
 +        task="Update /tmp/fortran/index.html so every chapter link is correct.",
 +        response="I'll update the index.html file with the correct chapter links and titles.",
 +        actions_taken=[
 +            "read: {'file_path': '/tmp/fortran/index.html'}",
 +            "glob: {'path': '/tmp/fortran/chapters', 'pattern': '*.html'}",
 +            "read: {'file_path': '/tmp/fortran/chapters/01-introduction.html'}",
 +        ],
 +        dod=dod,
 +    )
++
 +    assert assessment.check.missing_evidence[0] == (
 +        "completion of tracked work items "
 +        "(Update index.html with correct chapter links and titles)"
 +    )
 +    assert assessment.check.suggested_next_steps[0] == (
 +        "Complete the tracked item: Update index.html with correct chapter links and titles"
 +    )
++
++
  @pytest.mark.asyncio
  async def test_completion_policy_stops_for_text_loop_using_runtime_context(
      temp_dir: Path,

tests/test_expanded_tools.py modified

      assert result.metadata["structured_patch"]
 +@pytest.mark.asyncio
 +async def test_patch_tool_accepts_unified_diff_string(temp_dir: Path) -> None:
 +    target = temp_dir / "sample.txt"
 +    target.write_text("alpha\nbeta\ngamma\n")
 +    tool = PatchTool(workspace_root=temp_dir)
++
 +    result = await tool.execute(
 +        file_path=str(target),
 +        patch=(
 +            "--- a/sample.txt\n"
 +            "+++ b/sample.txt\n"
 +            "@@ -2,1 +2,1 @@\n"
 +            "-beta\n"
 +            "+beta updated\n"
 +        ),
 +    )
++
 +    assert result.is_error is False
 +    assert target.read_text() == "alpha\nbeta updated\ngamma\n"
 +    assert result.metadata["structured_patch"]
++
++
 +@pytest.mark.asyncio
 +async def test_patch_tool_rejects_invalid_unified_diff_string(temp_dir: Path) -> None:
 +    target = temp_dir / "sample.txt"
 +    target.write_text("alpha\nbeta\ngamma\n")
 +    tool = PatchTool(workspace_root=temp_dir)
++
 +    result = await tool.execute(
 +        file_path=str(target),
 +        patch="--- a/sample.txt\n+++ b/sample.txt\n@@ ...\n",
 +    )
++
 +    assert result.is_error is True
 +    assert "invalid unified-diff hunk header" in result.output
++
++
  @pytest.mark.asyncio
  async def test_git_tool_inspects_read_only_repo_state(temp_dir: Path) -> None:
      subprocess.run(["git", "init", "--quiet"], cwd=temp_dir, check=True)

tests/test_finalization.py modified

          return self._outcomes.pop(0)
 +class RecordingExecutor:
 +    def __init__(self) -> None:
 +        self.commands: list[str] = []
++
 +    async def execute_tool_call(self, tool_call: ToolCall, **_: object) -> ToolExecutionOutcome:
 +        command = str(tool_call.arguments.get("command", ""))
 +        self.commands.append(command)
 +        return tool_outcome(
 +            tool_call=tool_call,
 +            output="ok",
 +            is_error=False,
 +            exit_code=0,
 +            stdout="ok",
 +        )
++
++
  def build_context(temp_dir: Path, session: FakeSession) -> RuntimeContext:
      registry = create_default_registry(temp_dir)
      registry.configure_workspace_root(temp_dir)
+     )
      dod = create_definition_of_done("Update the runtime tests.")
      dod.mutating_actions.append("write")
 -    dod.touched_files.append(str(temp_dir / "tests" / "test_runtime.py"))
      dod.verification_commands = ["uv run pytest -q"]
      summary = TurnSummary(final_response="")
      tool_call = ToolCall(
      assert [item.status for item in session.workflow_timeline[-1].verification_observations] == [
          VerificationObservationStatus.PASSED.value
+     ]
++
++
 +@pytest.mark.asyncio
 +async def test_turn_finalizer_appends_runtime_semantic_verifier_to_planned_commands(
 +    temp_dir: Path,
 +) -> None:
 +    chapters = temp_dir / "chapters"
 +    chapters.mkdir()
 +    (chapters / "01-introduction.html").write_text(
 +        "<h1>Chapter 1: Introduction to Fortran</h1>\n"
 +    )
 +    index = temp_dir / "index.html"
 +    index.write_text(
 +        "\n".join(
 +            [
 +                '<ul class="chapter-list">',
 +                '  <li><a href="chapters/01-introduction.html">Chapter 1: Introduction to Fortran</a></li>',
 +                "</ul>",
 +            ]
 +        )
 +    )
++
 +    session = FakeSession()
 +    context = build_context(temp_dir, session)
 +    finalizer = TurnFinalizer(
 +        context,
 +        RuntimeTracer(),
 +        DefinitionOfDoneStore(temp_dir),
 +        set_workflow_mode=_noop_set_workflow_mode,
 +    )
 +    dod = create_definition_of_done(
 +        "Update index.html so the table of contents links and chapter titles are correct."
 +    )
 +    dod.mutating_actions.append("edit")
 +    dod.touched_files.append(str(index))
 +    dod.verification_commands = ['grep -n "href=" index.html']
 +    summary = TurnSummary(final_response="")
 +    executor = RecordingExecutor()
++
 +    async def capture(event) -> None:
 +        return None
++
 +    result = await finalizer.run_definition_of_done_gate(
 +        dod=dod,
 +        candidate_response="Updated the index.html links.",
 +        emit=capture,
 +        summary=summary,
 +        executor=executor,  # type: ignore[arg-type]
 +    )
++
 +    assert result.should_continue is False
 +    assert any(command == 'grep -n "href=" index.html' for command in executor.commands)
 +    assert any(command.startswith("/usr/bin/python3 - <<'PY'") for command in executor.commands)
      assert (
          session.workflow_timeline[-1].verification_observations[0].attempt_id
          == "verification-attempt-1"

tests/test_runtime_harness.py modified

      assert "Patched sample.txt." in run.response
 +@pytest.mark.asyncio
 +async def test_native_patch_tool_accepts_unified_diff_string(temp_dir: Path) -> None:
 +    target = temp_dir / "sample.txt"
 +    target.write_text("alpha\nbeta\ngamma\n")
++
 +    backend = ScriptedBackend(
 +        completions=[
 +            native_tool_response(
 +                ToolCall(
 +                    id="patch-1",
 +                    name="patch",
 +                    arguments={
 +                        "file_path": str(target),
 +                        "patch": (
 +                            "--- a/sample.txt\n"
 +                            "+++ b/sample.txt\n"
 +                            "@@ -2,1 +2,1 @@\n"
 +                            "-beta\n"
 +                            "+beta updated\n"
 +                        ),
 +                    },
 +                ),
 +                content="I'll patch the file directly.",
 +            ),
 +            final_response("Patched sample.txt."),
 +        ]
 +    )
++
 +    run = await run_scenario(
 +        "Update sample.txt.",
 +        backend,
 +        config=non_streaming_config(),
 +        project_root=temp_dir,
 +    )
++
 +    assert tool_event_names(run) == ["patch"]
 +    assert target.read_text() == "alpha\nbeta updated\ngamma\n"
 +    assert "Patched sample.txt." in run.response
++
++
  @pytest.mark.asyncio
  async def test_raw_json_ask_user_question_tool_call_fallback(temp_dir: Path) -> None:
      raw_json = json.dumps(
      assert "existing file contents" in run.response
 +@pytest.mark.asyncio
 +async def test_duplicate_observation_queues_steering_to_reuse_prior_evidence(
 +    temp_dir: Path,
 +) -> None:
 +    chapters = temp_dir / "chapters"
 +    chapters.mkdir()
 +    (chapters / "01-introduction.html").write_text("<h1>Chapter 1: Introduction to Fortran</h1>\n")
 +    (chapters / "02-setup.html").write_text("<h1>Chapter 2: Setting Up Fortran</h1>\n")
 +    index_file = temp_dir / "index.html"
 +    index_file.write_text("broken table of contents\n")
++
 +    backend = ScriptedBackend(
 +        completions=[
 +            native_tool_response(
 +                ToolCall(
 +                    id="glob-1",
 +                    name="glob",
 +                    arguments={"path": str(chapters), "pattern": "*.html"},
 +                ),
 +                content="I'll inspect the chapter inventory first.",
 +            ),
 +            native_tool_response(
 +                ToolCall(
 +                    id="read-1",
 +                    name="read",
 +                    arguments={"file_path": str(index_file)},
 +                ),
 +                content="I'll inspect the index next.",
 +            ),
 +            native_tool_response(
 +                ToolCall(
 +                    id="read-2",
 +                    name="read",
 +                    arguments={"file_path": str(index_file)},
 +                ),
 +                content="I'll reopen the index.",
 +            ),
 +            final_response("I'll reuse the earlier evidence and patch the index next."),
 +        ]
 +    )
++
 +    run = await run_scenario(
 +        "Update index.html so the table of contents links are correct.",
 +        backend,
 +        config=non_streaming_config(),
 +        project_root=temp_dir,
 +    )
++
 +    messages = tool_result_messages(run)
 +    steering_messages = [
 +        event.content
 +        for event in run.events
 +        if event.type == "steering" and event.content
 +    ]
++
 +    assert any("reuse the earlier read result instead of rereading" in message for message in messages)
 +    assert any("Reuse the earlier observation instead of repeating it." in message for message in steering_messages)
 +    assert any("index.html" in message for message in steering_messages)
++
++
  @pytest.mark.asyncio
  async def test_interleaved_reread_is_allowed_once_without_intervening_mutation(
      temp_dir: Path,

tests/test_safeguard_services.py modified

      assert result == ValidationResult(
          valid=False,
          reason="Patch hunks are missing",
 -        suggestion="Provide one or more structured patch hunks",
 +        suggestion="Provide structured patch hunks or a unified diff patch string",
          severity="error",
+     )
 +def test_pre_action_validator_allows_patch_string_without_hunks() -> None:
 +    validator = PreActionValidator()
++
 +    result = validator.validate(
 +        "patch",
 +        {
 +            "file_path": "notes.txt",
 +            "patch": "--- a/notes.txt\n+++ b/notes.txt\n@@ -1,1 +1,1 @@\n-old\n+new\n",
 +        },
 +    )
++
 +    assert result == ValidationResult(valid=True)
++
++
  def test_runtime_safeguards_wrap_runtime_owned_services() -> None:
      safeguards = RuntimeSafeguards()

tests/test_tool_batch_policies.py modified

      assert follow_up is not None
      assert "## CONTINUE FROM KNOWN STATE" in follow_up.content
 +    assert "apply the fix using confirmed findings" in follow_up.content
 +    assert "## ACTION BIAS FOR THIS RECOVERY" in follow_up.content
 +    assert "Prefer edit/write/patch on the target file" in follow_up.content
      assert "04-variables.html" in follow_up.content
      assert "02-basic-syntax.html -> 02-setup.html" in follow_up.content
      assert "`~/Loader/guides/fortran/index.html`" in follow_up.content
      assert any(event.type == "recovery" for event in events)
 +@pytest.mark.asyncio
 +async def test_tool_batch_recovery_controller_suggests_known_sibling_files(
 +    temp_dir: Path,
 +) -> None:
 +    async def assess_confidence(tool_name: str, tool_args: dict, context: str) -> ConfidenceAssessment:
 +        raise AssertionError("Confidence should not run here")
++
 +    async def verify_action(tool_name: str, tool_args: dict, result: str, expected: str = "") -> ActionVerification:
 +        raise AssertionError("Verification should not run here")
++
 +    messages = [
 +        Message(
 +            role=Role.TOOL,
 +            content=(
 +                "Observation [glob]: Result: "
 +                "/private/tmp/fortran-qwen-recovery-check/chapters/01-introduction.html\n"
 +                "/private/tmp/fortran-qwen-recovery-check/chapters/02-setup.html\n"
 +                "/private/tmp/fortran-qwen-recovery-check/chapters/03-basics.html\n"
 +                "/private/tmp/fortran-qwen-recovery-check/chapters/04-variables.html\n"
 +                "/private/tmp/fortran-qwen-recovery-check/chapters/05-input-output.html"
 +            ),
 +            tool_results=[],
 +        ),
 +    ]
 +    context = build_context(
 +        temp_dir=temp_dir,
 +        messages=messages,
 +        assess_confidence=assess_confidence,
 +        verify_action=verify_action,
 +    )
 +    controller = ToolBatchRecoveryController(context)
 +    tool_call = ToolCall(
 +        id="read-missing",
 +        name="read",
 +        arguments={"file_path": "/tmp/fortran-qwen-recovery-check/chapters/04-data-types.html"},
 +    )
 +    outcome = tool_outcome(
 +        tool_call=tool_call,
 +        output="File not found: /tmp/fortran-qwen-recovery-check/chapters/04-data-types.html",
 +        is_error=True,
 +    )
++
 +    events: list[AgentEvent] = []
++
 +    async def emit(event: AgentEvent) -> None:
 +        events.append(event)
++
 +    follow_up = await controller.build_follow_up(
 +        tool_call=tool_call,
 +        outcome=outcome,
 +        emit=emit,
 +    )
++
 +    assert follow_up is not None
 +    assert "## LIKELY FILE CANDIDATES" in follow_up.content
 +    assert "`04-variables.html`" in follow_up.content
 +    assert "instead of retrying the missing path" in follow_up.content
++
++
  @pytest.mark.asyncio
  async def test_tool_batch_recovery_controller_reuses_context_for_related_missing_files(
      temp_dir: Path,

tests/test_tool_batches.py modified

      assert context.recovery_context is None
 +@pytest.mark.asyncio
 +async def test_tool_batch_runner_queues_duplicate_observation_nudge(
 +    temp_dir: Path,
 +) -> None:
 +    async def assess_confidence(
 +        tool_name: str,
 +        tool_args: dict,
 +        context: str,
 +    ) -> ConfidenceAssessment:
 +        raise AssertionError("Confidence scoring should be disabled in this scenario")
++
 +    async def verify_action(
 +        tool_name: str,
 +        tool_args: dict,
 +        result: str,
 +        expected: str = "",
 +    ) -> ActionVerification:
 +        raise AssertionError("Verification should not run for this scenario")
++
 +    messages = [
 +        Message(
 +            role=Role.TOOL,
 +            content=(
 +                "Observation [glob]: Result: "
 +                f"{temp_dir}/chapters/01-introduction.html\n"
 +                f"{temp_dir}/chapters/02-setup.html\n"
 +                f"{temp_dir}/chapters/03-basics.html"
 +            ),
 +            tool_results=[],
 +        ),
 +        Message(
 +            role=Role.ASSISTANT,
 +            content="I should update the index now.",
 +            tool_calls=[
 +                ToolCall(
 +                    id="read-index",
 +                    name="read",
 +                    arguments={"file_path": str(temp_dir / 'index.html')},
 +                )
 +            ],
 +        ),
 +    ]
 +    context = build_context(
 +        temp_dir=temp_dir,
 +        messages=messages,
 +        safeguards=FakeSafeguards(),
 +        assess_confidence=assess_confidence,
 +        verify_action=verify_action,
 +        auto_recover=False,
 +    )
 +    context.session.current_task = (
 +        f"Update {temp_dir / 'index.html'} with the right chapter links."
 +    )
 +    queued_messages: list[str] = []
 +    context.queue_steering_message_callback = queued_messages.append
 +    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))
 +    tool_call = ToolCall(
 +        id="read-dup",
 +        name="read",
 +        arguments={"file_path": str(temp_dir / "index.html")},
 +    )
 +    duplicate_message = (
 +        "[Skipped - duplicate action: Already read "
 +        f"{temp_dir / 'index.html'} recently without any intervening changes; "
 +        "reuse the earlier read result instead of rereading]"
 +    )
 +    executor = FakeExecutor(
 +        [
 +            ToolExecutionOutcome(
 +                tool_call=tool_call,
 +                state=ToolExecutionState.DUPLICATE,
 +                message=Message.tool_result_message(
 +                    tool_call_id=tool_call.id,
 +                    display_content=duplicate_message,
 +                    result_content=duplicate_message,
 +                ),
 +                event_content=duplicate_message,
 +                is_error=False,
 +                result_output=duplicate_message,
 +            )
 +        ]
 +    )
++
 +    await runner.execute_batch(
 +        tool_calls=[tool_call],
 +        tool_source="assistant",
 +        pending_tool_calls_seen=set(),
 +        emit=_noop_emit,
 +        summary=TurnSummary(final_response=""),
 +        dod=create_definition_of_done("Fix the chapter links"),
 +        executor=executor,  # type: ignore[arg-type]
 +        on_confirmation=None,
 +        on_user_question=None,
 +        emit_confirmation=None,
 +        consecutive_errors=0,
 +    )
++
 +    assert len(queued_messages) == 1
 +    assert "Reuse the earlier observation instead of repeating it." in queued_messages[0]
 +    assert "index.html" in queued_messages[0]
++
++
  async def _noop_emit(event: AgentEvent) -> None:
      return None

tests/test_workflow.py modified

  from pathlib import Path
 +from loader.llm.base import ToolCall
  from loader.runtime.clarify_grounding import ClarifyGrounding, ClarifyRepoFact
  from loader.runtime.dod import DefinitionOfDoneStore, create_definition_of_done
  from loader.runtime.workflow import (
      PlanningArtifacts,
      WorkflowArtifactStore,
      WorkflowMode,
 +    advance_todos_from_tool_call,
      build_execute_bridge,
      enrich_clarify_brief_with_grounding,
      extract_verification_commands_from_markdown,
+     ]
 +def test_extract_verification_commands_from_markdown_splits_code_blocks() -> None:
 +    markdown = "\n".join(
 +        [
 +            "# Verification Plan",
 +            "",
 +            "## Verification Commands",
 +            "```bash",
 +            "# Check chapter files",
 +            "ls chapters",
 +            "grep -n \"href=\" index.html",
 +            "```",
 +        ]
 +    )
++
 +    assert extract_verification_commands_from_markdown(markdown) == [
 +        "ls chapters",
 +        'grep -n "href=" index.html',
 +    ]
++
++
  def test_workflow_artifact_store_and_bridge_round_trip(tmp_path: Path) -> None:
      store = WorkflowArtifactStore(tmp_path)
      brief = ClarifyBrief.fallback(
      assert "Writing router" in dod.pending_items
      assert "Collect verification evidence" in dod.pending_items
      assert "Update tests" in dod.completed_items
++
++
 +def test_advance_todos_from_tool_call_tracks_plan_progress() -> None:
 +    dod = create_definition_of_done("Fix the chapter links in index.html.")
 +    sync_todos_to_definition_of_done(
 +        dod,
 +        [
 +            {
 +                "content": "First, examine the current index.html file to understand its structure",
 +                "active_form": "Working on: First, examine the current index.html file to understand its structure",
 +                "status": "pending",
 +            },
 +            {
 +                "content": "List and read all HTML files in the chapters directory to extract chapter information",
 +                "active_form": "Working on: List and read all HTML files in the chapters directory to extract chapter information",
 +                "status": "pending",
 +            },
 +            {
 +                "content": "Parse chapter titles from each HTML file",
 +                "active_form": "Working on: Parse chapter titles from each HTML file",
 +                "status": "pending",
 +            },
 +            {
 +                "content": "Update index.html with correct chapter links and titles",
 +                "active_form": "Working on: Update index.html with correct chapter links and titles",
 +                "status": "pending",
 +            },
 +            {
 +                "content": "Verify the updated index.html file is properly formatted",
 +                "active_form": "Working on: Verify the updated index.html file is properly formatted",
 +                "status": "pending",
 +            },
 +        ],
 +    )
++
 +    assert advance_todos_from_tool_call(
 +        dod,
 +        ToolCall(
 +            id="read-index",
 +            name="read",
 +            arguments={"file_path": "/tmp/fortran/index.html"},
 +        ),
 +    )
 +    assert (
 +        "First, examine the current index.html file to understand its structure"
 +        in dod.completed_items
 +    )
++
 +    assert advance_todos_from_tool_call(
 +        dod,
 +        ToolCall(
 +            id="glob-chapters",
 +            name="glob",
 +            arguments={"path": "/tmp/fortran/chapters", "pattern": "*.html"},
 +        ),
 +    )
 +    assert (
 +        "List and read all HTML files in the chapters directory to extract chapter information"
 +        in dod.completed_items
 +    )
++
 +    assert advance_todos_from_tool_call(
 +        dod,
 +        ToolCall(
 +            id="read-chapter",
 +            name="read",
 +            arguments={"file_path": "/tmp/fortran/chapters/01-introduction.html"},
 +        ),
 +    )
 +    assert "Parse chapter titles from each HTML file" in dod.completed_items
++
 +    assert advance_todos_from_tool_call(
 +        dod,
 +        ToolCall(
 +            id="patch-index",
 +            name="patch",
 +            arguments={"file_path": "/tmp/fortran/index.html", "hunks": []},
 +        ),
 +    )
 +    assert "Update index.html with correct chapter links and titles" in dod.completed_items
++
 +    assert advance_todos_from_tool_call(
 +        dod,
 +        ToolCall(
 +            id="verify-index",
 +            name="bash",
 +            arguments={"command": "grep -o 'href=\"[^\"]*\"' /tmp/fortran/index.html"},
 +        ),
 +    )
 +    assert "Verify the updated index.html file is properly formatted" in dod.completed_items