Preempt post-build audit batches
Authored by
mfwolffe <wolffemf@dukes.jmu.edu>
- SHA: 0bd46e04b8a974223491d1f6e6c2ab610e0148f1
- Parents: 8b65eda
- Tree: eec838f
0bd46e0
0bd46e04b8a974223491d1f6e6c2ab610e0148f1
8b65eda
eec838f

| Status | File | + | - |
|---|---|---|---|
| M | src/loader/runtime/response_route_handlers.py | 7 | 0 |
| M | src/loader/runtime/tool_batches.py | 35 | 0 |
| M | tests/test_tool_batches.py | 112 | 0 |

src/loader/runtime/response_route_handlers.py (modified)
@@ -140,6 +140,13 @@ class ToolBatchRouteHandler: | ||
| 140 | 140 | emit_confirmation=emit_confirmation, |
| 141 | 141 | consecutive_errors=context.consecutive_errors, |
| 142 | 142 | ) |
| 143 | + if batch_result.continue_after_batch: | |
| 144 | + return ResponseRouteDecision( | |
| 145 | + action=ResponseRouteAction.CONTINUE, | |
| 146 | + continuation_count=context.continuation_count, | |
| 147 | + consecutive_errors=batch_result.consecutive_errors, | |
| 148 | + new_actions_taken=batch_result.actions_taken, | |
| 149 | + ) | |
| 143 | 150 | if batch_result.halted: |
| 144 | 151 | return ResponseRouteDecision( |
| 145 | 152 | action=ResponseRouteAction.FINALIZE, |

src/loader/runtime/tool_batches.py (modified)
@@ -147,6 +147,7 @@ class ToolBatchResult: | ||
| 147 | 147 | actions_taken: list[str] = field(default_factory=list) |
| 148 | 148 | consecutive_errors: int = 0 |
| 149 | 149 | halted: bool = False |
| 150 | + continue_after_batch: bool = False | |
| 150 | 151 | final_response: str = "" |
| 151 | 152 | |
| 152 | 153 | |
@@ -318,6 +319,12 @@ class ToolBatchRunner: | ||
| 318 | 319 | # otherwise the model operates blind and loops. |
| 319 | 320 | self.context.session.append(outcome.message) |
| 320 | 321 | summary.tool_result_messages.append(outcome.message) |
| 322 | + if self._should_preempt_for_verification_handoff( | |
| 323 | + tool_call=executed_tool_call, | |
| 324 | + dod=dod, | |
| 325 | + ): | |
| 326 | + result.continue_after_batch = True | |
| 327 | + return result | |
| 321 | 328 | if outcome.state == ToolExecutionState.DUPLICATE: |
| 322 | 329 | self._queue_duplicate_observation_nudge(tool_call, dod=dod) |
| 323 | 330 | elif outcome.state == ToolExecutionState.BLOCKED: |
@@ -394,6 +401,34 @@ class ToolBatchRunner: | ||
| 394 | 401 | |
| 395 | 402 | return result |
| 396 | 403 | |
| 404 | + def _should_preempt_for_verification_handoff( | |
| 405 | + self, | |
| 406 | + *, | |
| 407 | + tool_call: ToolCall, | |
| 408 | + dod: DefinitionOfDone, | |
| 409 | + ) -> bool: | |
| 410 | + """Yield back to the main loop once post-build work has clearly transitioned to verify.""" | |
| 411 | + | |
| 412 | + if self.context.workflow_mode != "verify": | |
| 413 | + return False | |
| 414 | + if dod.status in {"fixing", "done"}: | |
| 415 | + return False | |
| 416 | + if not all_planned_artifact_outputs_exist(dod, project_root=self.context.project_root): | |
| 417 | + return False | |
| 418 | + verification_commands = dod.verification_commands or derive_verification_commands( | |
| 419 | + dod, | |
| 420 | + project_root=self.context.project_root, | |
| 421 | + task_statement=getattr(self.context.session, "current_task", "") or "", | |
| 422 | + supplement_existing=True, | |
| 423 | + ) | |
| 424 | + if not verification_commands: | |
| 425 | + return False | |
| 426 | + return tool_call.name in ( | |
| 427 | + {"TodoWrite"} | |
| 428 | + | _OBSERVATION_TOOLS | |
| 429 | + | _BOOKKEEPING_NOTE_TOOL_NAMES | |
| 430 | + ) | |
| 431 | + | |
| 397 | 432 | def _queue_duplicate_observation_nudge( |
| 398 | 433 | self, |
| 399 | 434 | tool_call: ToolCall, |

tests/test_tool_batches.py (modified)
@@ -4168,6 +4168,118 @@ async def test_tool_batch_runner_todowrite_after_outputs_exist_but_links_missing | ||
| 4168 | 4168 | assert context.workflow_mode == "verify" |
| 4169 | 4169 | |
| 4170 | 4170 | |
| 4171 | +@pytest.mark.asyncio | |
| 4172 | +async def test_tool_batch_runner_preempts_post_build_audit_after_todowrite_verify_handoff( | |
| 4173 | + temp_dir: Path, | |
| 4174 | +) -> None: | |
| 4175 | + async def assess_confidence( | |
| 4176 | + tool_name: str, | |
| 4177 | + tool_args: dict, | |
| 4178 | + context: str, | |
| 4179 | + ) -> ConfidenceAssessment: | |
| 4180 | + raise AssertionError("Confidence scoring should not run for this scenario") | |
| 4181 | + | |
| 4182 | + async def verify_action( | |
| 4183 | + tool_name: str, | |
| 4184 | + tool_args: dict, | |
| 4185 | + result: str, | |
| 4186 | + expected: str = "", | |
| 4187 | + ) -> ActionVerification: | |
| 4188 | + raise AssertionError("Verification should not run for this scenario") | |
| 4189 | + | |
| 4190 | + guide_root = temp_dir / "guides" / "nginx" | |
| 4191 | + chapters = guide_root / "chapters" | |
| 4192 | + guide_root.mkdir(parents=True) | |
| 4193 | + chapters.mkdir() | |
| 4194 | + index_path = guide_root / "index.html" | |
| 4195 | + chapter_one = chapters / "01-introduction.html" | |
| 4196 | + chapter_two = chapters / "02-installation.html" | |
| 4197 | + index_path.write_text("<html></html>\n") | |
| 4198 | + chapter_one.write_text("<html></html>\n") | |
| 4199 | + chapter_two.write_text("<html></html>\n") | |
| 4200 | + | |
| 4201 | + implementation_plan = temp_dir / "implementation.md" | |
| 4202 | + implementation_plan.write_text( | |
| 4203 | + "\n".join( | |
| 4204 | + [ | |
| 4205 | + "# Implementation Plan", | |
| 4206 | + "", | |
| 4207 | + "## File Changes", | |
| 4208 | + f"- `{guide_root}/`", | |
| 4209 | + f"- `{chapters}/`", | |
| 4210 | + f"- `{index_path}`", | |
| 4211 | + f"- `{chapter_one}`", | |
| 4212 | + f"- `{chapter_two}`", | |
| 4213 | + "", | |
| 4214 | + ] | |
| 4215 | + ) | |
| 4216 | + ) | |
| 4217 | + | |
| 4218 | + context = build_context( | |
| 4219 | + temp_dir=temp_dir, | |
| 4220 | + messages=[], | |
| 4221 | + safeguards=FakeSafeguards(), | |
| 4222 | + assess_confidence=assess_confidence, | |
| 4223 | + verify_action=verify_action, | |
| 4224 | + auto_recover=False, | |
| 4225 | + ) | |
| 4226 | + queued_messages: list[str] = [] | |
| 4227 | + context.queue_steering_message_callback = queued_messages.append | |
| 4228 | + runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir)) | |
| 4229 | + dod = create_definition_of_done("Create a multi-file nginx guide.") | |
| 4230 | + dod.implementation_plan = str(implementation_plan) | |
| 4231 | + dod.verification_commands = [f"ls -la {guide_root}"] | |
| 4232 | + | |
| 4233 | + todo_call = ToolCall( | |
| 4234 | + id="todo-post-build-preempt", | |
| 4235 | + name="TodoWrite", | |
| 4236 | + arguments={"todos": []}, | |
| 4237 | + ) | |
| 4238 | + audit_read = ToolCall( | |
| 4239 | + id="read-after-todo", | |
| 4240 | + name="read", | |
| 4241 | + arguments={"file_path": str(index_path)}, | |
| 4242 | + ) | |
| 4243 | + executor = FakeExecutor( | |
| 4244 | + [ | |
| 4245 | + tool_outcome( | |
| 4246 | + tool_call=todo_call, | |
| 4247 | + output="Todos updated", | |
| 4248 | + is_error=False, | |
| 4249 | + metadata={"new_todos": []}, | |
| 4250 | + ), | |
| 4251 | + tool_outcome( | |
| 4252 | + tool_call=audit_read, | |
| 4253 | + output=index_path.read_text(), | |
| 4254 | + is_error=False, | |
| 4255 | + ), | |
| 4256 | + ] | |
| 4257 | + ) | |
| 4258 | + | |
| 4259 | + summary = TurnSummary(final_response="") | |
| 4260 | + result = await runner.execute_batch( | |
| 4261 | + tool_calls=[todo_call, audit_read], | |
| 4262 | + tool_source="assistant", | |
| 4263 | + pending_tool_calls_seen=set(), | |
| 4264 | + emit=_noop_emit, | |
| 4265 | + summary=summary, | |
| 4266 | + dod=dod, | |
| 4267 | + executor=executor, # type: ignore[arg-type] | |
| 4268 | + on_confirmation=None, | |
| 4269 | + on_user_question=None, | |
| 4270 | + emit_confirmation=None, | |
| 4271 | + consecutive_errors=0, | |
| 4272 | + ) | |
| 4273 | + | |
| 4274 | + assert result.continue_after_batch is True | |
| 4275 | + assert result.halted is False | |
| 4276 | + assert [call.id for call in executor.calls] == ["todo-post-build-preempt"] | |
| 4277 | + assert len(summary.tool_result_messages) == 1 | |
| 4278 | + assert context.workflow_mode == "verify" | |
| 4279 | + assert queued_messages | |
| 4280 | + assert "Verification should run next." in queued_messages[-1] | |
| 4281 | + | |
| 4282 | + | |
| 4171 | 4283 | @pytest.mark.asyncio |
| 4172 | 4284 | async def test_tool_batch_runner_todowrite_drops_unplanned_expansion_after_outputs_exist( |
| 4173 | 4285 | temp_dir: Path, |