`39d15a6`

Delay setup-only verification planning

Authored by

espadonne 2 weeks ago

SHA: 39d15a655a7fa1ee038c8f6e5ca8be9c0bcf4d1f
Parents: 36515f8
Tree: bf5d20b

2 changed files

Status	File	+	-
M	`src/loader/runtime/tool_batches.py`	37	1
M	`tests/test_tool_batches.py`	80	0

src/loader/runtime/tool_batches.py modified

                  dod=dod,
                  tool_call=tool_call,
              )
 -        elif is_mutating:
 +        elif is_mutating and _should_plan_verification_for_tool_call(
 +            dod,
 +            tool_call=tool_call,
 +            project_root=self.context.project_root,
 +        ):
              _mark_verification_planned(
                  context=self.context,
                  summary=summary,
      return any(token in lowered for token in _MUTATION_TODO_HINTS)
 +def _should_plan_verification_for_tool_call(
 +    dod: DefinitionOfDone,
 +    *,
 +    tool_call: ToolCall,
 +    project_root: Path,
 +) -> bool:
 +    if tool_call.name in {"write", "edit", "patch"}:
 +        return True
 +    if tool_call.name != "bash":
 +        return False
 +    if any(
 +        Path(path).expanduser().resolve(strict=False).suffix
 +        for path in dod.touched_files
 +        if str(path).strip()
 +    ):
 +        return True
 +    return any(
 +        not expect_directory
 +        and planned_artifact_target_satisfied(
 +            dod,
 +            target=target,
 +            expect_directory=False,
 +            project_root=project_root,
 +        )
 +        for target, expect_directory in collect_planned_artifact_targets(
 +            dod,
 +            project_root=project_root,
 +            max_paths=12,
 +        )
 +    )
 +
 +
  def _mark_verification_planned(
      *,
      context: RuntimeContext,

tests/test_tool_batches.py modified

      )
 +@pytest.mark.asyncio
 +async def test_tool_batch_runner_does_not_mark_verification_planned_after_setup_only_mkdir(
 +    temp_dir: Path,
 +) -> None:
 +    async def assess_confidence(
 +        tool_name: str,
 +        tool_args: dict,
 +        context: str,
 +    ) -> ConfidenceAssessment:
 +        raise AssertionError("Confidence scoring should be disabled in this scenario")
 +
 +    async def verify_action(
 +        tool_name: str,
 +        tool_args: dict,
 +        result: str,
 +        expected: str = "",
 +    ) -> ActionVerification:
 +        raise AssertionError("Verification should not run in this scenario")
 +
 +    context = build_context(
 +        temp_dir=temp_dir,
 +        messages=[],
 +        safeguards=FakeSafeguards(),
 +        assess_confidence=assess_confidence,
 +        verify_action=verify_action,
 +    )
 +    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))
 +    nginx_root = temp_dir / "Loader" / "guides" / "nginx"
 +    chapters = nginx_root / "chapters"
 +    implementation_plan = temp_dir / "implementation.md"
 +    implementation_plan.write_text(
 +        "\n".join(
 +            [
 +                "# Implementation Plan",
 +                "",
 +                "## File Changes",
 +                f"- `{chapters}/`",
 +                f"- `{nginx_root / 'index.html'}`",
 +                "",
 +            ]
 +        )
 +    )
 +
 +    tool_call = ToolCall(
 +        id="mkdir-1",
 +        name="bash",
 +        arguments={"command": f"mkdir -p {chapters}"},
 +    )
 +    executor = FakeExecutor(
 +        [tool_outcome(tool_call=tool_call, output="", is_error=False)]
 +    )
 +    summary = TurnSummary(final_response="")
 +    dod = create_definition_of_done("Create an equally thorough nginx guide with chapters.")
 +    dod.implementation_plan = str(implementation_plan)
 +    events: list[AgentEvent] = []
 +
 +    async def emit(event: AgentEvent) -> None:
 +        events.append(event)
 +
 +    await runner.execute_batch(
 +        tool_calls=[tool_call],
 +        tool_source="assistant",
 +        pending_tool_calls_seen=set(),
 +        emit=emit,
 +        summary=summary,
 +        dod=dod,
 +        executor=executor,  # type: ignore[arg-type]
 +        on_confirmation=None,
 +        on_user_question=None,
 +        emit_confirmation=None,
 +        consecutive_errors=0,
 +    )
 +
 +    assert dod.last_verification_result is None
 +    assert "Collect verification evidence" not in dod.pending_items
 +    assert not any(
 +        entry.reason_code == "verification_planned" for entry in summary.workflow_timeline
 +    )
 +
 +
  @pytest.mark.asyncio
  async def test_tool_batch_runner_marks_passed_verification_stale_after_new_mutation(
      temp_dir: Path,