`6d1c5ec`

Name concrete TodoWrite targets

Authored by

espadonne 2 weeks ago

SHA: 6d1c5ec23dbc5fc901c34bff3b791ad0f8cfb258
Parents: 67175cc
Tree: dc0e0f5

5 changed files

Status	File	+	-
M	`src/loader/runtime/repair.py`	3	158
M	`src/loader/runtime/tool_batches.py`	28	0
M	`src/loader/runtime/workflow.py`	172	0
M	`tests/test_runtime_repair_flows.py`	102	0
M	`tests/test_tool_batches.py`	132	0

src/loader/runtime/repair.py modified

+ )
  from .parsing import parse_tool_calls
  from .workflow import (
 +    infer_pending_todo_output_target,
      preferred_pending_todo_item,
      reconcile_aggregate_completion_steps,
      todo_file_candidates,
          dod: DefinitionOfDone,
          item: str,
      ) -> Path | None:
 -        candidates = todo_file_candidates(item)
 -        if not candidates:
 -            return self._infer_pending_item_target_from_html_graph(dod, item)
+-
 -        planned_targets = collect_planned_artifact_targets(
 -            dod,
 -            project_root=self.context.project_root,
 -            max_paths=12,
 -        )
 -        planned_files = {
 -            target.name.lower(): target
 -            for target, expect_directory in planned_targets
 -            if not expect_directory
 -        }
 -        planned_directories = [
 -            target
 -            for target, expect_directory in planned_targets
 -            if expect_directory
 -        ]
 -        touched_paths = [
 -            Path(path)
 -            for path in dod.touched_files
 -            if str(path).strip()
 -        ]
+-
 -        for candidate in candidates:
 -            candidate_str = str(candidate)
 -            if candidate.is_absolute() or candidate_str.startswith("~"):
 -                return Path(candidate_str).expanduser()
+-
 -            planned_match = planned_files.get(candidate.name.lower())
 -            if planned_match is not None:
 -                return planned_match
+-
 -            for touched in reversed(touched_paths):
 -                if touched.name.lower() == candidate.name.lower():
 -                    continue
 -                if candidate.suffix and touched.suffix.lower() != candidate.suffix.lower():
 -                    continue
 -                return touched.parent / candidate.name
+-
 -            for directory in planned_directories:
 -                return directory / candidate.name
+-
 -        return None
+-
 -    def _infer_pending_item_target_from_html_graph(
 -        self,
 -        dod: DefinitionOfDone,
 -        item: str,
 -    ) -> Path | None:
 -        target_label = _normalize_pending_output_label(item)
 -        if not target_label:
 -            return None
+-
 -        html_files = self._pending_item_html_sources(dod)
 -        matches: list[tuple[int, bool, Path]] = []
 -        for html_file in html_files:
 -            try:
 -                content = html_file.read_text()
 -            except OSError:
 -                continue
 -            for href, link_text in _iter_local_html_links(html_file, content):
 -                resolved = (html_file.parent / href).resolve(strict=False)
 -                score = _pending_output_link_match_score(
 -                    target_label,
 -                    _normalize_pending_output_label(link_text),
 -                )
 -                if score <= 0:
 -                    continue
 -                matches.append((score, not resolved.exists(), resolved))
+-
 -        if not matches:
 -            return None
 -        matches.sort(key=lambda item: (item[0], item[1], str(item[2])), reverse=True)
 -        return matches[0][2]
+-
 -    def _pending_item_html_sources(self, dod: DefinitionOfDone) -> list[Path]:
 -        planned_targets = collect_planned_artifact_targets(
 +        return infer_pending_todo_output_target(
              dod,
 +            item,
              project_root=self.context.project_root,
 -            max_paths=12,
+         )
 -        html_sources: list[Path] = []
 -        seen: set[str] = set()
+-
 -        for raw_path in dod.touched_files:
 -            path = Path(raw_path).expanduser().resolve(strict=False)
 -            if path.suffix.lower() not in {".html", ".htm"}:
 -                continue
 -            key = str(path)
 -            if key in seen:
 -                continue
 -            seen.add(key)
 -            html_sources.append(path)
+-
 -        for target, expect_directory in planned_targets:
 -            if expect_directory or target.suffix.lower() not in {".html", ".htm"}:
 -                continue
 -            key = str(target)
 -            if key in seen:
 -                continue
 -            seen.add(key)
 -            html_sources.append(target)
+-
 -        return html_sources
      def _preferred_resume_pending_item(
          self,
  def _todo_is_consistency_review_step(label: str) -> bool:
      lowered = label.lower()
      return any(token in lowered for token in _CONSISTENCY_REVIEW_HINTS)
+-
+-
 -def _normalize_pending_output_label(value: str) -> str:
 -    text = " ".join(str(value).strip().split()).lower()
 -    if not text:
 -        return ""
 -    text = re.sub(
 -        r"^(?:working on:\s*)?(?:create|creating|write|writing|build|building|develop|developing)\s+",
 -        "",
 -        text,
 -    )
 -    text = re.sub(r"\bfor nginx guide\b", "", text)
 -    text = re.sub(r"[^a-z0-9]+", " ", text)
 -    return " ".join(text.split())
+-
+-
 -def _pending_output_link_match_score(todo_label: str, link_label: str) -> int:
 -    if not todo_label or not link_label:
 -        return 0
 -    if todo_label == link_label:
 -        return 3
 -    if todo_label in link_label or link_label in todo_label:
 -        return 2
 -    todo_tokens = {token for token in todo_label.split() if len(token) > 2}
 -    link_tokens = {token for token in link_label.split() if len(token) > 2}
 -    if not todo_tokens or not link_tokens:
 -        return 0
 -    overlap = todo_tokens & link_tokens
 -    if len(overlap) >= min(3, len(todo_tokens), len(link_tokens)):
 -        return 1
 -    return 0
+-
+-
 -def _iter_local_html_links(file_path: Path, content: str) -> list[tuple[str, str]]:
 -    pattern = re.compile(
 -        r"<a\b[^>]*href\s*=\s*[\"']([^\"']+)[\"'][^>]*>(.*?)</a>",
 -        re.IGNORECASE | re.DOTALL,
 -    )
 -    links: list[tuple[str, str]] = []
 -    seen: set[tuple[str, str]] = set()
 -    for href, inner_html in pattern.findall(content):
 -        target = href.strip()
 -        if not target or target.startswith(("#", "http://", "https://", "mailto:")):
 -            continue
 -        trimmed_target = target.split("?", 1)[0].split("#", 1)[0]
 -        if Path(trimmed_target).suffix.lower() not in {".html", ".htm"}:
 -            continue
 -        label = re.sub(r"<[^>]+>", " ", inner_html)
 -        label = " ".join(label.split())
 -        key = (trimmed_target, label)
 -        if key in seen:
 -            continue
 -        seen.add(key)
 -        links.append((trimmed_target, label))
 -    return links

src/loader/runtime/tool_batches.py modified

  from .workflow import (
      advance_todos_from_tool_call,
      effective_pending_todo_items,
 +    infer_pending_todo_output_target,
      preferred_pending_todo_item,
      reconcile_aggregate_completion_steps,
      sync_todos_to_definition_of_done,
+         )
          if missing_artifact is None:
              if next_pending and _todo_is_mutation_step(next_pending):
 +                pending_target = infer_pending_todo_output_target(
 +                    dod,
 +                    next_pending,
 +                    project_root=self.context.project_root,
 +                )
 +                if pending_target is not None:
 +                    concrete_message = (
 +                        "Todo tracking is updated. Continue with the next pending item: "
 +                        f"`{next_pending}`. Resume by creating `{pending_target.name}` now. "
 +                        f"Prefer one `write` call for `{pending_target}` instead of more rereads. "
 +                    )
 +                    if not pending_target.parent.exists():
 +                        concrete_message += (
 +                            "The `write` tool can create that file's parent directories "
 +                            "automatically, so do the write in one step instead of stopping "
 +                            "for a separate mkdir. "
 +                        )
 +                    concrete_message += (
 +                        "Use the current output files as the source of truth, and do not "
 +                        "reopen reference materials unless one specific fact required for "
 +                        "that step is still unknown. Make your next response the concrete "
 +                        "mutation tool call itself, not another bookkeeping-only turn. "
 +                        "Perform the mutation now instead of spending another turn on "
 +                        "planning, rereads, or verification."
 +                    )
 +                    self.context.queue_steering_message(concrete_message)
 +                    return
                  self.context.queue_steering_message(
                      "Todo tracking is updated. Continue with the next pending item: "
                      f"`{next_pending}`. Use the current output files as the source of "

src/loader/runtime/workflow.py modified

      "effective_pending_todo_items",
      "enrich_clarify_brief_with_grounding",
      "extract_verification_commands_from_markdown",
 +    "infer_pending_todo_output_target",
      "load_brief",
      "load_planning_artifacts",
      "merge_refreshed_todos_with_existing_scope",
      return candidates
 +def infer_pending_todo_output_target(
 +    dod,
 +    item: str,
 +    *,
 +    project_root: Path | None = None,
 +) -> Path | None:
 +    """Infer the concrete file path a pending todo is asking the model to mutate."""
++
 +    root = project_root or Path.cwd()
 +    candidates = todo_file_candidates(item)
 +    planned_targets = collect_planned_artifact_targets(
 +        dod,
 +        project_root=root,
 +        max_paths=12,
 +    )
++
 +    if candidates:
 +        planned_files = {
 +            target.name.lower(): target
 +            for target, expect_directory in planned_targets
 +            if not expect_directory
 +        }
 +        planned_directories = [
 +            target
 +            for target, expect_directory in planned_targets
 +            if expect_directory
 +        ]
 +        touched_paths = [
 +            Path(path)
 +            for path in dod.touched_files
 +            if str(path).strip()
 +        ]
++
 +        for candidate in candidates:
 +            candidate_str = str(candidate)
 +            if candidate.is_absolute() or candidate_str.startswith("~"):
 +                return Path(candidate_str).expanduser()
++
 +            planned_match = planned_files.get(candidate.name.lower())
 +            if planned_match is not None:
 +                return planned_match
++
 +            for touched in reversed(touched_paths):
 +                if touched.name.lower() == candidate.name.lower():
 +                    continue
 +                if candidate.suffix and touched.suffix.lower() != candidate.suffix.lower():
 +                    continue
 +                return touched.parent / candidate.name
++
 +            for directory in planned_directories:
 +                return directory / candidate.name
++
 +    target_label = _normalize_pending_output_label(item)
 +    if not target_label:
 +        return None
++
 +    matches: list[tuple[int, bool, Path]] = []
 +    for html_file in _pending_item_html_sources(
 +        dod,
 +        project_root=root,
 +    ):
 +        try:
 +            content = html_file.read_text()
 +        except OSError:
 +            continue
 +        for href, link_text in _iter_local_html_links(content):
 +            resolved = (html_file.parent / href).resolve(strict=False)
 +            score = _pending_output_link_match_score(
 +                target_label,
 +                _normalize_pending_output_label(link_text),
 +            )
 +            if score <= 0:
 +                continue
 +            matches.append((score, not resolved.exists(), resolved))
++
 +    if not matches:
 +        return None
 +    matches.sort(key=lambda item: (item[0], item[1], str(item[2])), reverse=True)
 +    return matches[0][2]
++
++
  def preserve_task_grounded_acceptance_criteria(
      task_statement: str,
      *,
      return list(dict.fromkeys([*grounded_existing, *refreshed_acceptance_criteria]))
 +def _pending_item_html_sources(
 +    dod,
 +    *,
 +    project_root: Path,
 +) -> list[Path]:
 +    planned_targets = collect_planned_artifact_targets(
 +        dod,
 +        project_root=project_root,
 +        max_paths=12,
 +    )
 +    html_sources: list[Path] = []
 +    seen: set[str] = set()
++
 +    for raw_path in dod.touched_files:
 +        path = Path(raw_path).expanduser().resolve(strict=False)
 +        if path.suffix.lower() not in {".html", ".htm"}:
 +            continue
 +        key = str(path)
 +        if key in seen:
 +            continue
 +        seen.add(key)
 +        html_sources.append(path)
++
 +    for target, expect_directory in planned_targets:
 +        if expect_directory or target.suffix.lower() not in {".html", ".htm"}:
 +            continue
 +        key = str(target)
 +        if key in seen:
 +            continue
 +        seen.add(key)
 +        html_sources.append(target)
++
 +    return html_sources
++
++
 +def _normalize_pending_output_label(value: str) -> str:
 +    text = " ".join(str(value).strip().split()).lower()
 +    if not text:
 +        return ""
 +    text = re.sub(
 +        r"^(?:working on:\s*)?(?:create|creating|write|writing|build|building|develop|developing)\s+",
 +        "",
 +        text,
 +    )
 +    text = re.sub(r"\bfor nginx guide\b", "", text)
 +    text = re.sub(r"[^a-z0-9]+", " ", text)
 +    return " ".join(text.split())
++
++
 +def _pending_output_link_match_score(todo_label: str, link_label: str) -> int:
 +    if not todo_label or not link_label:
 +        return 0
 +    if todo_label == link_label:
 +        return 3
 +    if todo_label in link_label or link_label in todo_label:
 +        return 2
 +    todo_tokens = {token for token in todo_label.split() if len(token) > 2}
 +    link_tokens = {token for token in link_label.split() if len(token) > 2}
 +    if not todo_tokens or not link_tokens:
 +        return 0
 +    overlap = todo_tokens & link_tokens
 +    if len(overlap) >= min(3, len(todo_tokens), len(link_tokens)):
 +        return 1
 +    return 0
++
++
 +def _iter_local_html_links(content: str) -> list[tuple[str, str]]:
 +    pattern = re.compile(
 +        r"<a\b[^>]*href\s*=\s*[\"']([^\"']+)[\"'][^>]*>(.*?)</a>",
 +        re.IGNORECASE | re.DOTALL,
 +    )
 +    links: list[tuple[str, str]] = []
 +    seen: set[tuple[str, str]] = set()
 +    for href, inner_html in pattern.findall(content):
 +        target = href.strip()
 +        if not target or target.startswith(("#", "http://", "https://", "mailto:")):
 +            continue
 +        trimmed_target = target.split("?", 1)[0].split("#", 1)[0]
 +        if Path(trimmed_target).suffix.lower() not in {".html", ".htm"}:
 +            continue
 +        label = re.sub(r"<[^>]+>", " ", inner_html)
 +        label = " ".join(label.split())
 +        key = (trimmed_target, label)
 +        if key in seen:
 +            continue
 +        seen.add(key)
 +        links.append((trimmed_target, label))
 +    return links
++
++
  def merge_refreshed_todos_with_existing_scope(
      task_statement: str,
      *,

tests/test_runtime_repair_flows.py modified

      assert sum("retry 1/2" in message for message in retry_messages) >= 2
 +@pytest.mark.asyncio
 +async def test_empty_response_retry_budget_resets_after_todowrite_turn(
 +    temp_dir: Path,
 +) -> None:
 +    first = temp_dir / "index.html"
 +    second = temp_dir / "chapters" / "01-introduction.html"
 +    backend = ScriptedBackend(
 +        completions=[
 +            CompletionResponse(content=""),
 +            CompletionResponse(
 +                content="I'll create the guide index now.",
 +                tool_calls=[
 +                    ToolCall(
 +                        id="write-1",
 +                        name="write",
 +                        arguments={
 +                            "file_path": str(first),
 +                            "content": "<html></html>\n",
 +                        },
 +                    )
 +                ],
 +            ),
 +            CompletionResponse(
 +                content="I'll create the first chapter now.",
 +                tool_calls=[
 +                    ToolCall(
 +                        id="write-2",
 +                        name="write",
 +                        arguments={
 +                            "file_path": str(second),
 +                            "content": "<html></html>\n",
 +                        },
 +                    )
 +                ],
 +            ),
 +            CompletionResponse(
 +                content="I'll update the task list now.",
 +                tool_calls=[
 +                    ToolCall(
 +                        id="todo-1",
 +                        name="TodoWrite",
 +                        arguments={
 +                            "todos": [
 +                                {
 +                                    "content": "Create index.html",
 +                                    "status": "completed",
 +                                    "active_form": "Creating index.html",
 +                                },
 +                                {
 +                                    "content": "Create 01-introduction.html",
 +                                    "status": "completed",
 +                                    "active_form": "Creating 01-introduction.html",
 +                                },
 +                                {
 +                                    "content": "Create 02-installation.html",
 +                                    "status": "pending",
 +                                    "active_form": "Creating 02-installation.html",
 +                                },
 +                            ]
 +                        },
 +                    )
 +                ],
 +            ),
 +            CompletionResponse(content=""),
 +            CompletionResponse(
 +                content="I'll create the second chapter now.",
 +                tool_calls=[
 +                    ToolCall(
 +                        id="write-3",
 +                        name="write",
 +                        arguments={
 +                            "file_path": str(temp_dir / "chapters" / "02-installation.html"),
 +                            "content": "<html></html>\n",
 +                        },
 +                    )
 +                ],
 +            ),
 +            CompletionResponse(content="The guide files are created."),
 +        ]
 +    )
++
 +    run = await run_scenario(
 +        "Create a small nginx guide.",
 +        backend,
 +        config=non_streaming_config(),
 +        project_root=temp_dir,
 +    )
++
 +    assert run.response.startswith("The guide files are created.")
 +    retry_messages: list[str] = []
 +    for invocation in backend.invocations:
 +        for message in invocation.messages:
 +            if message.role != Role.USER or "[EMPTY ASSISTANT RESPONSE]" not in message.content:
 +                continue
 +            if retry_messages and retry_messages[-1] == message.content:
 +                continue
 +            retry_messages.append(message.content)
 +    assert len(retry_messages) >= 2
 +    assert all("retry 2/2" not in message for message in retry_messages)
 +    assert sum("retry 1/2" in message for message in retry_messages) >= 2
++
++
  @pytest.mark.asyncio
  async def test_repeated_empty_responses_fail_honestly_after_one_retry(
      temp_dir: Path,

tests/test_tool_batches.py modified

      assert "Do not spend the next turn on TodoWrite alone" in message
 +@pytest.mark.asyncio
 +async def test_tool_batch_runner_todowrite_names_concrete_pending_file_after_artifacts_exist(
 +    temp_dir: Path,
 +) -> None:
 +    async def assess_confidence(
 +        tool_name: str,
 +        tool_args: dict,
 +        context: str,
 +    ) -> ConfidenceAssessment:
 +        raise AssertionError("Confidence scoring should not run in this scenario")
++
 +    async def verify_action(
 +        tool_name: str,
 +        tool_args: dict,
 +        result: str,
 +        expected: str = "",
 +    ) -> ActionVerification:
 +        raise AssertionError("Verification should not run in this scenario")
++
 +    guide_root = temp_dir / "guides" / "nginx"
 +    chapters = guide_root / "chapters"
 +    guide_root.mkdir(parents=True)
 +    chapters.mkdir()
 +    index_path = guide_root / "index.html"
 +    chapter_one = chapters / "01-introduction.html"
 +    index_path.write_text(
 +        "\n".join(
 +            [
 +                "<html>",
 +                '<a href="chapters/01-introduction.html">Chapter 1: Introduction to NGINX Tool</a>',
 +                '<a href="chapters/02-installation.html">Chapter 2: Installation and Setup</a>',
 +                "</html>",
 +            ]
 +        )
 +        + "\n"
 +    )
 +    chapter_one.write_text("<html></html>\n")
++
 +    implementation_plan = temp_dir / "implementation.md"
 +    implementation_plan.write_text(
 +        "\n".join(
 +            [
 +                "# Implementation Plan",
 +                "",
 +                "## File Changes",
 +                f"- `{guide_root}/`",
 +                f"- `{chapters}/`",
 +                f"- `{index_path}`",
 +                "",
 +            ]
 +        )
 +    )
++
 +    dod = create_definition_of_done("Create a multi-file nginx guide.")
 +    dod.implementation_plan = str(implementation_plan)
 +    dod.pending_items = [
 +        "Creating Chapter 2: Installation and Setup",
 +        "Complete the requested work",
 +    ]
 +    dod.touched_files.extend([str(index_path), str(chapter_one)])
++
 +    queued_messages: list[str] = []
 +    context = build_context(
 +        temp_dir=temp_dir,
 +        messages=[],
 +        safeguards=FakeSafeguards(),
 +        assess_confidence=assess_confidence,
 +        verify_action=verify_action,
 +        auto_recover=False,
 +    )
 +    context.queue_steering_message_callback = queued_messages.append
 +    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))
++
 +    tool_call = ToolCall(
 +        id="todo-1",
 +        name="TodoWrite",
 +        arguments={
 +            "todos": [
 +                {
 +                    "content": "Creating Chapter 2: Installation and Setup",
 +                    "activeForm": "Creating Chapter 2: Installation and Setup",
 +                    "status": "pending",
 +                }
 +            ]
 +        },
 +    )
 +    executor = FakeExecutor(
 +        [
 +            tool_outcome(
 +                tool_call=tool_call,
 +                output="Todos updated",
 +                is_error=False,
 +                metadata={
 +                    "new_todos": [
 +                        {
 +                            "content": "Creating Chapter 2: Installation and Setup",
 +                            "active_form": "Creating Chapter 2: Installation and Setup",
 +                            "status": "pending",
 +                        }
 +                    ]
 +                },
 +            )
 +        ]
 +    )
++
 +    summary = TurnSummary(final_response="")
 +    await runner.execute_batch(
 +        tool_calls=[tool_call],
 +        tool_source="assistant",
 +        pending_tool_calls_seen=set(),
 +        emit=_noop_emit,
 +        summary=summary,
 +        dod=dod,
 +        executor=executor,  # type: ignore[arg-type]
 +        on_confirmation=None,
 +        on_user_question=None,
 +        emit_confirmation=None,
 +        consecutive_errors=0,
 +    )
++
 +    assert queued_messages
 +    message = queued_messages[-1]
 +    assert "Todo tracking is updated. Continue with the next pending item: `Creating Chapter 2: Installation and Setup`." in message
 +    assert "Resume by creating `02-installation.html` now." in message
 +    assert (
 +        f"Prefer one `write` call for `{(chapters / '02-installation.html').resolve(strict=False)}` "
 +        "instead of more rereads."
 +        in message
 +    )
 +    assert "Make your next response the concrete mutation tool call itself" in message
++
++
  @pytest.mark.asyncio
  async def test_tool_batch_runner_todowrite_uses_observed_sibling_pattern_for_next_file(
      temp_dir: Path,