`0bd46e0`

Preempt post-build audit batches

Authored by mfwolffe <wolffemf@dukes.jmu.edu> 1 week ago

SHA: 0bd46e04b8a974223491d1f6e6c2ab610e0148f1
Parents: 8b65eda
Tree: eec838f

3 changed files

Status	File	+
M	`src/loader/runtime/response_route_handlers.py`	7
M	`src/loader/runtime/tool_batches.py`	35
M	`tests/test_tool_batches.py`	112

src/loader/runtime/response_route_handlers.py (modified)

              emit_confirmation=emit_confirmation,
              consecutive_errors=context.consecutive_errors,
+         )
 +        if batch_result.continue_after_batch:
 +            return ResponseRouteDecision(
 +                action=ResponseRouteAction.CONTINUE,
 +                continuation_count=context.continuation_count,
 +                consecutive_errors=batch_result.consecutive_errors,
 +                new_actions_taken=batch_result.actions_taken,
 +            )
          if batch_result.halted:
              return ResponseRouteDecision(
                  action=ResponseRouteAction.FINALIZE,

src/loader/runtime/tool_batches.py (modified)

      actions_taken: list[str] = field(default_factory=list)
      consecutive_errors: int = 0
      halted: bool = False
 +    continue_after_batch: bool = False
      final_response: str = ""
              # otherwise the model operates blind and loops.
              self.context.session.append(outcome.message)
              summary.tool_result_messages.append(outcome.message)
 +            if self._should_preempt_for_verification_handoff(
 +                tool_call=executed_tool_call,
 +                dod=dod,
 +            ):
 +                result.continue_after_batch = True
 +                return result
              if outcome.state == ToolExecutionState.DUPLICATE:
                  self._queue_duplicate_observation_nudge(tool_call, dod=dod)
              elif outcome.state == ToolExecutionState.BLOCKED:
          return result
 +    def _should_preempt_for_verification_handoff(
 +        self,
 +        *,
 +        tool_call: ToolCall,
 +        dod: DefinitionOfDone,
 +    ) -> bool:
 +        """Yield back to the main loop once post-build work has clearly transitioned to verify."""
++
 +        if self.context.workflow_mode != "verify":
 +            return False  # preemption only applies while the workflow mode is "verify"
 +        if dod.status in {"fixing", "done"}:
 +            return False  # never preempt when the DoD status is "fixing" or "done"
 +        if not all_planned_artifact_outputs_exist(dod, project_root=self.context.project_root):
 +            return False  # at least one planned artifact output does not exist yet
 +        verification_commands = dod.verification_commands or derive_verification_commands(
 +            dod,
 +            project_root=self.context.project_root,
 +            task_statement=getattr(self.context.session, "current_task", "") or "",
 +            supplement_existing=True,
 +        )  # commands are derived only when dod.verification_commands is empty/falsy
 +        if not verification_commands:
 +            return False  # no verification commands available
 +        return tool_call.name in (
 +            {"TodoWrite"}
 +            | _OBSERVATION_TOOLS
 +            | _BOOKKEEPING_NOTE_TOOL_NAMES
 +        )  # True only for TodoWrite or a tool named in either module-level collection
++
      def _queue_duplicate_observation_nudge(
          self,
          tool_call: ToolCall,

tests/test_tool_batches.py (modified)

      assert context.workflow_mode == "verify"
 +@pytest.mark.asyncio
 +async def test_tool_batch_runner_preempts_post_build_audit_after_todowrite_verify_handoff(
 +    temp_dir: Path,
 +) -> None:
 +    async def assess_confidence(
 +        tool_name: str,
 +        tool_args: dict,
 +        context: str,
 +    ) -> ConfidenceAssessment:
 +        raise AssertionError("Confidence scoring should not run for this scenario")  # fails the test if invoked
++
 +    async def verify_action(
 +        tool_name: str,
 +        tool_args: dict,
 +        result: str,
 +        expected: str = "",
 +    ) -> ActionVerification:
 +        raise AssertionError("Verification should not run for this scenario")  # fails the test if invoked
++
 +    guide_root = temp_dir / "guides" / "nginx"  # guide directory tree and HTML files are created on disk below
 +    chapters = guide_root / "chapters"
 +    guide_root.mkdir(parents=True)
 +    chapters.mkdir()
 +    index_path = guide_root / "index.html"
 +    chapter_one = chapters / "01-introduction.html"
 +    chapter_two = chapters / "02-installation.html"
 +    index_path.write_text("<html></html>\n")
 +    chapter_one.write_text("<html></html>\n")
 +    chapter_two.write_text("<html></html>\n")
++
 +    implementation_plan = temp_dir / "implementation.md"  # plan file listing the paths created above
 +    implementation_plan.write_text(
 +        "\n".join(
 +            [
 +                "# Implementation Plan",
 +                "",
 +                "## File Changes",
 +                f"- `{guide_root}/`",
 +                f"- `{chapters}/`",
 +                f"- `{index_path}`",
 +                f"- `{chapter_one}`",
 +                f"- `{chapter_two}`",
 +                "",
 +            ]
 +        )
 +    )
++
 +    context = build_context(
 +        temp_dir=temp_dir,
 +        messages=[],
 +        safeguards=FakeSafeguards(),
 +        assess_confidence=assess_confidence,
 +        verify_action=verify_action,
 +        auto_recover=False,
 +    )
 +    queued_messages: list[str] = []
 +    context.queue_steering_message_callback = queued_messages.append  # capture queued steering messages
 +    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))
 +    dod = create_definition_of_done("Create a multi-file nginx guide.")
 +    dod.implementation_plan = str(implementation_plan)
 +    dod.verification_commands = [f"ls -la {guide_root}"]  # explicit, non-empty verification command list
++
 +    todo_call = ToolCall(
 +        id="todo-post-build-preempt",
 +        name="TodoWrite",
 +        arguments={"todos": []},
 +    )
 +    audit_read = ToolCall(
 +        id="read-after-todo",
 +        name="read",
 +        arguments={"file_path": str(index_path)},
 +    )
 +    executor = FakeExecutor(
 +        [
 +            tool_outcome(
 +                tool_call=todo_call,
 +                output="Todos updated",
 +                is_error=False,
 +                metadata={"new_todos": []},
 +            ),
 +            tool_outcome(
 +                tool_call=audit_read,
 +                output=index_path.read_text(),
 +                is_error=False,
 +            ),
 +        ]
 +    )
++
 +    summary = TurnSummary(final_response="")
 +    result = await runner.execute_batch(
 +        tool_calls=[todo_call, audit_read],  # TodoWrite first, then the follow-up read
 +        tool_source="assistant",
 +        pending_tool_calls_seen=set(),
 +        emit=_noop_emit,
 +        summary=summary,
 +        dod=dod,
 +        executor=executor,  # type: ignore[arg-type]
 +        on_confirmation=None,
 +        on_user_question=None,
 +        emit_confirmation=None,
 +        consecutive_errors=0,
 +    )
++
 +    assert result.continue_after_batch is True
 +    assert result.halted is False
 +    assert [call.id for call in executor.calls] == ["todo-post-build-preempt"]  # the read call is absent
 +    assert len(summary.tool_result_messages) == 1  # exactly one tool result message recorded
 +    assert context.workflow_mode == "verify"
 +    assert queued_messages
 +    assert "Verification should run next." in queued_messages[-1]
++
++
  @pytest.mark.asyncio
  async def test_tool_batch_runner_todowrite_drops_unplanned_expansion_after_outputs_exist(
      temp_dir: Path,