@@ -4029,6 +4029,86 @@ async def test_tool_batch_runner_marks_verification_planned_after_new_mutation( |
| 4029 | 4029 | ) |
| 4030 | 4030 | |
| 4031 | 4031 | |
@pytest.mark.asyncio
async def test_tool_batch_runner_does_not_mark_verification_planned_after_setup_only_mkdir(
    temp_dir: Path,
) -> None:
    """A batch holding only ``mkdir -p`` must leave verification state untouched.

    The batch contains a single ``bash`` call that creates a directory which
    is also listed under "File Changes" in the implementation plan. After the
    batch runs, the definition of done must have no verification result, no
    "Collect verification evidence" pending item, and the turn summary must
    carry no ``verification_planned`` timeline entry.
    """

    # Both callbacks fail loudly: this scenario must never reach them.
    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run in this scenario")

    # Pure path arithmetic; nothing is created on disk here.
    guide_root = temp_dir / "Loader" / "guides" / "nginx"
    chapters_dir = guide_root / "chapters"
    plan_path = temp_dir / "implementation.md"

    runner_context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
    )
    runner = ToolBatchRunner(runner_context, DefinitionOfDoneStore(temp_dir))

    # The plan names both the directory being created and a file inside the
    # guide root that this batch does not touch.
    plan_text = (
        "# Implementation Plan\n"
        "\n"
        "## File Changes\n"
        f"- `{chapters_dir}/`\n"
        f"- `{guide_root / 'index.html'}`\n"
    )
    plan_path.write_text(plan_text)

    mkdir_call = ToolCall(
        id="mkdir-1",
        name="bash",
        arguments={"command": f"mkdir -p {chapters_dir}"},
    )
    scripted_outcome = tool_outcome(tool_call=mkdir_call, output="", is_error=False)
    executor = FakeExecutor([scripted_outcome])

    summary = TurnSummary(final_response="")
    dod = create_definition_of_done("Create an equally thorough nginx guide with chapters.")
    dod.implementation_plan = str(plan_path)

    emitted: list[AgentEvent] = []

    async def record_event(event: AgentEvent) -> None:
        emitted.append(event)

    await runner.execute_batch(
        tool_calls=[mkdir_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=record_event,
        summary=summary,
        dod=dod,
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert dod.last_verification_result is None
    assert "Collect verification evidence" not in dod.pending_items
    assert all(
        entry.reason_code != "verification_planned" for entry in summary.workflow_timeline
    )
| 4110 | + |
| 4111 | + |
| 4032 | 4112 | @pytest.mark.asyncio |
| 4033 | 4113 | async def test_tool_batch_runner_marks_passed_verification_stale_after_new_mutation( |
| 4034 | 4114 | temp_dir: Path, |