`ad61aba`

Adopt runtime-owned action reasoning services

Authored by

espadonne 1 month ago

SHA: ad61aba70e9bb47b380336335314b48b4fb9444d
Parents: 8902e62
Tree: 771c2bb

10 changed files

Status	File	+	-
M	`src/loader/agent/loop.py`	2	100
A	`src/loader/runtime/action_reasoning.py`	227	0
M	`src/loader/runtime/context.py`	50	5
A	`src/loader/runtime/reasoning_service.py`	100	0
M	`src/loader/runtime/tool_batches.py`	6	6
M	`tests/test_assistant_turns.py`	0	4
M	`tests/test_finalization.py`	0	4
M	`tests/test_repair.py`	0	4
M	`tests/test_runtime_context.py`	3	3
M	`tests/test_tool_batches.py`	11	13

src/loader/agent/loop.py (modified)

      load_permission_rules,
+ )
  from ..runtime.prompt_history import PromptSnapshot
 +from ..runtime.reasoning_service import RuntimeReasoningService
  from ..runtime.session import ConversationSession
  from ..runtime.workflow import WorkflowMode
  from ..tools.base import ToolRegistry, create_default_registry
+ )
  from .prompts import build_system_prompt_result
  from .reasoning import (
 -    CONFIDENCE_PROMPT,
      DECOMPOSITION_PROMPT,
      SELF_CRITIQUE_PROMPT,
 -    VERIFICATION_PROMPT,
 -    ActionVerification,
 -    ConfidenceAssessment,
 -    ConfidenceLevel,
      SelfCritique,
      TaskDecomposition,
 -    estimate_confidence_quick,
      is_conversational,
 -    parse_confidence,
      parse_decomposition,
      parse_self_critique,
 -    parse_verification,
 -    quick_verify,
      should_decompose,
+ )
 -from .recovery import RecoveryContext
  from .safeguards import RuntimeSafeguards
          self.capability_profile = resolve_backend_capability_profile(self.backend)
          self.last_turn_summary: TurnSummary | None = None
 -        # Recovery tracking
 -        self._recovery_context: RecoveryContext | None = None
+-
          # Steering: allow user to send messages during execution
          self._steering_queue: asyncio.Queue[str] = asyncio.Queue()
          self._is_running: bool = False
              if context is not None:
                  context.capability_profile = self.capability_profile
 -        def _get_recovery_context() -> RecoveryContext | None:
 -            return self._recovery_context
+-
 -        def _set_recovery_context(value: RecoveryContext | None) -> None:
 -            self._recovery_context = value
+-
          context = RuntimeContext(
              project_root=self.project_root,
              backend=self.backend,
                  queue_steering_message=_queue_steering_message,
                  set_workflow_mode=_set_workflow_mode,
                  refresh_capability_profile=_refresh_capability_profile,
 -                assess_confidence=self._assess_confidence,
 -                verify_action=self._verify_action,
 -                get_recovery_context=_get_recovery_context,
 -                set_recovery_context=_set_recovery_context,
              ),
 +            reasoning=RuntimeReasoningService(self.backend, self.config),
              prompt_format=self.prompt_format,
              prompt_sections=list(self.prompt_sections),
+         )
+         )
          return parse_self_critique(critique_response.content, response)
 -    async def _assess_confidence(
 -        self,
 -        tool_name: str,
 -        tool_args: dict,
 -        context: str = "",
 -    ) -> ConfidenceAssessment:
 -        """Assess confidence in a tool action."""
 -        cfg = self.config.reasoning
+-
 -        # Try quick heuristic first
 -        if cfg.use_quick_confidence:
 -            quick_level = estimate_confidence_quick(tool_name, tool_args, context)
 -            # Only call LLM if quick estimate is low
 -            if quick_level.value >= ConfidenceLevel.MEDIUM.value:
 -                return ConfidenceAssessment(
 -                    action=f"{tool_name} with {tool_args}",
 -                    tool_name=tool_name,
 -                    tool_args=tool_args,
 -                    level=quick_level,
 -                    reasoning="Quick heuristic assessment",
 -                )
+-
 -        # Full LLM assessment
 -        action = f"Call {tool_name} with arguments: {tool_args}"
 -        prompt = CONFIDENCE_PROMPT.format(
 -            action=action,
 -            tool_name=tool_name,
 -            tool_args=tool_args,
 -            context=context[-2000:] if context else "No prior context",
 -        )
 -        response = await self.backend.complete(
 -            messages=[Message(role=Role.USER, content=prompt)],
 -            tools=None,
 -            temperature=0.3,
 -            max_tokens=300,
 -        )
 -        return parse_confidence(response.content, tool_name, tool_args)
+-
 -    async def _verify_action(
 -        self,
 -        tool_name: str,
 -        tool_args: dict,
 -        result: str,
 -        expected: str = "",
 -    ) -> ActionVerification:
 -        """Verify that an action produced the expected result."""
 -        cfg = self.config.reasoning
+-
 -        # Try quick verification first
 -        if cfg.use_quick_verification:
 -            quick_result = quick_verify(tool_name, tool_args, result)
 -            if quick_result:
 -                return ActionVerification(
 -                    tool_name=tool_name,
 -                    tool_args=tool_args,
 -                    expected_outcome=expected or "Success",
 -                    actual_result=result[:500],
 -                    verified=True,
 -                    verification_method="quick_heuristic",
 -                )
+-
 -        # Full LLM verification
 -        prompt = VERIFICATION_PROMPT.format(
 -            tool_name=tool_name,
 -            tool_args=tool_args,
 -            expected=expected or "The action should complete successfully",
 -            result=result[:2000],  # Truncate long results
 -        )
 -        response = await self.backend.complete(
 -            messages=[Message(role=Role.USER, content=prompt)],
 -            tools=None,
 -            temperature=0.3,
 -            max_tokens=300,
 -        )
 -        return parse_verification(response.content, tool_name, tool_args, expected, result)
+-
      async def _handle_conversational(
          self,
          user_message: str,
          self.prompt_format = None
          self.prompt_sections = []
          self.session = self._create_session(messages=self.messages)
 -        self._recovery_context = None
          self._current_task = None
          self.last_turn_summary = None
          self.workflow_mode = WorkflowMode.EXECUTE.value

src/loader/runtime/action_reasoning.py (added)

 +"""Runtime-owned prompts and heuristics for action reasoning."""
++
 +from __future__ import annotations
++
 +import re
++
 +from .reasoning_types import (
 +    ActionVerification,
 +    ConfidenceAssessment,
 +    ConfidenceLevel,
 +)
++
 +# Prompt template asking the model to rate a planned tool call. It is a
 +# str.format() template with the fields {action}, {tool_name}, {tool_args}
 +# and {context}; the braces of the JSON example are doubled so they come out
 +# of .format() as literal "{" and "}".
 +CONFIDENCE_PROMPT = """Rate your confidence in this action before executing.
++
 +Action: {action}
 +Tool: {tool_name}
 +Arguments: {tool_args}
++
 +Previous context:
 +{context}
++
 +Consider:
 +1. Do you have enough information to proceed?
 +2. What could go wrong?
 +3. Is this the right approach?
 +4. Are there better alternatives?
++
 +Respond in this exact JSON format:
 +{{
 +  "confidence": 1-5,  // 1=very low, 2=low, 3=medium, 4=high, 5=very high
 +  "reasoning": "Why this confidence level",
 +  "risks": ["Risk 1", "Risk 2"],
 +  "mitigations": ["How to mitigate risk 1"],
 +  "requires_verification": true/false,
 +  "alternative_approaches": ["Alternative 1 if confidence is low"]
 +}}
++
 +Only output the JSON, no other text."""
++
++
 +# Prompt template asking the model to judge the outcome of a tool call. It is
 +# a str.format() template with the fields {tool_name}, {tool_args},
 +# {expected} and {result}; the braces of the JSON example are doubled so they
 +# come out of .format() as literal "{" and "}".
 +VERIFICATION_PROMPT = """Verify that the action produced the expected result.
++
 +Action taken:
 +- Tool: {tool_name}
 +- Arguments: {tool_args}
++
 +Expected outcome: {expected}
++
 +Actual result:
 +{result}
++
 +Analyze:
 +1. Did the action succeed?
 +2. Does the result match expectations?
 +3. Are there any unexpected side effects?
 +4. Is any correction needed?
++
 +Respond in this exact JSON format:
 +{{
 +  "verified": true/false,
 +  "verification_method": "How you verified (e.g., output_contains, no_error, file_created)",
 +  "discrepancies": ["Discrepancy 1 if any"],
 +  "needs_correction": true/false,
 +  "correction_suggestion": "What to do if correction is needed"
 +}}
++
 +Only output the JSON, no other text."""
++
++
 +def parse_confidence(
 +    response: str,
 +    tool_name: str,
 +    tool_args: dict,
 +) -> ConfidenceAssessment:
 +    """Parse an LLM confidence reply into a ConfidenceAssessment.
++
 +    The text from the first ``{`` to the last ``}`` of *response* is decoded
 +    as JSON. Its ``confidence`` field is coerced to an int and clamped to the
 +    range 1..5 before being mapped onto ConfidenceLevel.
++
 +    Args:
 +        response: Raw model output; an empty or missing reply is tolerated.
 +        tool_name: Name of the tool the assessment is about.
 +        tool_args: Arguments of the planned tool call.
++
 +    Returns:
 +        The parsed assessment, or a default assessment (empty ``action``,
 +        only ``tool_name``/``tool_args`` set) when no JSON object is found,
 +        the JSON does not decode, or ``confidence`` is not a usable number.
 +    """
++
 +    import json
++
 +    # A missing reply is treated like a reply without JSON.
 +    json_match = re.search(r"\{.*\}", response or "", re.DOTALL)
 +    if json_match is None:
 +        return ConfidenceAssessment(
 +            action="",
 +            tool_name=tool_name,
 +            tool_args=tool_args,
 +        )
++
 +    try:
 +        data = json.loads(json_match.group())
 +        # Models sometimes answer "4", 4.0 or null. Coerce before clamping so
 +        # a non-integer score falls back to the default assessment instead of
 +        # raising TypeError out of the parser.
 +        confidence_val = int(data.get("confidence", 3))
 +        level = ConfidenceLevel(max(1, min(5, confidence_val)))
++
 +        return ConfidenceAssessment(
 +            action=data.get("action", ""),
 +            tool_name=tool_name,
 +            tool_args=tool_args,
 +            level=level,
 +            reasoning=data.get("reasoning", ""),
 +            risks=data.get("risks", []),
 +            mitigations=data.get("mitigations", []),
 +            requires_verification=data.get("requires_verification", level.value <= 3),
 +        )
 +    except (ValueError, TypeError, OverflowError):
 +        # json.JSONDecodeError is a ValueError; TypeError/OverflowError cover
 +        # null, list-valued and infinite confidence scores.
 +        return ConfidenceAssessment(
 +            action="",
 +            tool_name=tool_name,
 +            tool_args=tool_args,
 +        )
++
++
 +def parse_verification(
 +    response: str,
 +    tool_name: str,
 +    tool_args: dict,
 +    expected: str,
 +    result: str,
 +) -> ActionVerification:
 +    """Turn an LLM verification reply into an ActionVerification.
++
 +    The text from the first ``{`` to the last ``}`` of *response* is decoded
 +    as JSON. When no such span exists, or it does not decode, the verdict
 +    falls back to checking that "error" does not occur in *result*
 +    (case-insensitive).
 +    """
++
 +    import json
++
 +    # Fields shared by the parsed verdict and the fallback verdict.
 +    common = {
 +        "tool_name": tool_name,
 +        "tool_args": tool_args,
 +        "expected_outcome": expected,
 +        "actual_result": result,
 +    }
++
 +    payload = None
 +    found = re.search(r"\{.*\}", response, re.DOTALL)
 +    if found:
 +        try:
 +            payload = json.loads(found.group())
 +        except json.JSONDecodeError:
 +            payload = None
++
 +    if payload is None:
 +        return ActionVerification(**common, verified="error" not in result.lower())
++
 +    return ActionVerification(
 +        **common,
 +        verified=payload.get("verified", False),
 +        verification_method=payload.get("verification_method", ""),
 +        discrepancies=payload.get("discrepancies", []),
 +        needs_correction=payload.get("needs_correction", False),
 +        correction_suggestion=payload.get("correction_suggestion", ""),
 +    )
++
++
 +def estimate_confidence_quick(
 +    tool_name: str,
 +    tool_args: dict,
 +    context: str = "",
 +) -> ConfidenceLevel:
 +    """Estimate action confidence with heuristics before an LLM call.
++
 +    Args:
 +        tool_name: Name of the tool about to run.
 +        tool_args: Arguments of the planned call.
 +        context: Accepted for signature compatibility; not used.
++
 +    Returns:
 +        HIGH for read-only tools and for bash commands whose program is a
 +        known read-only utility, VERY_LOW for bash commands containing a
 +        dangerous pattern, LOW for write/edit/patch calls missing their key
 +        argument, and MEDIUM otherwise.
 +    """
++
 +    del context
++
 +    if tool_name in {"read", "glob", "grep", "git"}:
 +        return ConfidenceLevel.HIGH
++
 +    # Mutating tools are only plausible when their key argument is non-empty.
 +    required_arg = {"write": "file_path", "edit": "old_string", "patch": "hunks"}
 +    if tool_name in required_arg:
 +        if not tool_args.get(required_arg[tool_name]):
 +            return ConfidenceLevel.LOW
 +        return ConfidenceLevel.MEDIUM
++
 +    if tool_name == "bash":
 +        # Tolerate a missing or null command instead of failing on `in`.
 +        command = str(tool_args.get("command") or "")
 +        dangerous = ("rm -rf", "sudo", "chmod 777", "dd if=", "> /dev/")
 +        if any(pattern in command for pattern in dangerous):
 +            return ConfidenceLevel.VERY_LOW
 +        # Compare the whole first word, not a string prefix, so that "lsof",
 +        # "catapult" or "tailscale" are not mistaken for ls/cat/tail.
 +        words = command.split()
 +        program = words[0] if words else ""
 +        if program in {"ls", "cat", "grep", "find", "pwd", "echo", "head", "tail"}:
 +            return ConfidenceLevel.HIGH
 +        return ConfidenceLevel.MEDIUM
++
 +    return ConfidenceLevel.MEDIUM
++
++
 +def quick_verify(tool_name: str, tool_args: dict, result: str) -> bool:
 +    """Guess from the output text alone whether a tool call succeeded.
++
 +    Any known failure phrase in *result* (case-insensitive) means failure.
 +    Otherwise the verdict depends on the tool: write needs a success word or
 +    a short output, edit/patch need a success word or a "+"/"-" character,
 +    read/git need non-blank output, and every other tool passes.
 +    """
++
 +    del tool_args  # part of the signature, not consulted by the heuristics
++
 +    lowered = result.lower()
 +    failure_markers = (
 +        "error:",
 +        "failed",
 +        "not found",
 +        "permission denied",
 +        "no such file",
 +        "command not found",
 +        "exception",
 +        "traceback",
 +        "fatal:",
 +        "cannot",
 +    )
 +    for marker in failure_markers:
 +        if marker in lowered:
 +            return False
++
 +    if tool_name == "write":
 +        if "created" in lowered or "wrote" in lowered:
 +            return True
 +        return len(result) < 200
 +    if tool_name in ("edit", "patch"):
 +        if "edited" in lowered or "patched" in lowered:
 +            return True
 +        return "+" in result or "-" in result
 +    if tool_name in ("read", "git"):
 +        return bool(result.strip())
++
 +    # bash and any unrecognised tool pass once no failure phrase was seen.
 +    return True

src/loader/runtime/context.py (modified)

  from __future__ import annotations
 -from collections.abc import Awaitable, Callable
 +from collections.abc import Callable
  from dataclasses import dataclass, field
  from pathlib import Path
  from typing import Any, Protocol
      def record_response(self, content: str) -> None:
          """Record a completed assistant response for safeguard bookkeeping."""
++
 +class RuntimeReasoningServiceProtocol(Protocol):
 +    """Structural type for the action-reasoning service used by the runtime.
++
 +    Any object offering these two coroutines satisfies the protocol.
 +    """
++
 +    async def assess_confidence(
 +        self,
 +        tool_name: str,
 +        tool_args: dict[str, Any],
 +        context: str = "",
 +    ) -> ConfidenceAssessment:
 +        """Return a confidence assessment for a tool call that has not run yet."""
 +        ...
++
 +    async def verify_action(
 +        self,
 +        tool_name: str,
 +        tool_args: dict[str, Any],
 +        result: str,
 +        expected: str = "",
 +    ) -> ActionVerification:
 +        """Return a verdict on whether a finished tool call gave the wanted result."""
 +        ...
++
++
  @dataclass(slots=True)
  class RuntimeLegacyServices:
      """Explicit migration seams for legacy agent-owned behavior."""
      queue_steering_message: Callable[[str], None]
      set_workflow_mode: Callable[[str], None]
      refresh_capability_profile: Callable[[], None]
 -    assess_confidence: Callable[[str, dict[str, Any], str], Awaitable[ConfidenceAssessment]]
 -    verify_action: Callable[[str, dict[str, Any], str, str], Awaitable[ActionVerification]]
 -    get_recovery_context: Callable[[], RecoveryContext | None]
 -    set_recovery_context: Callable[[RecoveryContext | None], None]
  @dataclass(slots=True)
      workflow_mode: str
      safeguards: RuntimeSafeguardsProtocol
      legacy: RuntimeLegacyServices
 +    reasoning: RuntimeReasoningServiceProtocol | None = None
 +    recovery_context: RecoveryContext | None = None
      prompt_format: str | None = None
      prompt_sections: list[str] = field(default_factory=list)
          """Return rule counts for the active permission policy."""
          return self.permission_policy.rule_counts()
++
 +    async def assess_confidence(
 +        self,
 +        tool_name: str,
 +        tool_args: dict[str, Any],
 +        context: str = "",
 +    ) -> ConfidenceAssessment:
 +        """Forward a confidence check to the attached reasoning service.
++
 +        Raises:
 +            RuntimeError: If ``self.reasoning`` is None.
 +        """
++
 +        service = self.reasoning
 +        if service is not None:
 +            return await service.assess_confidence(tool_name, tool_args, context)
 +        raise RuntimeError("RuntimeContext.reasoning is required for confidence checks")
++
 +    async def verify_action(
 +        self,
 +        tool_name: str,
 +        tool_args: dict[str, Any],
 +        result: str,
 +        expected: str = "",
 +    ) -> ActionVerification:
 +        """Forward a post-action check to the attached reasoning service.
++
 +        Raises:
 +            RuntimeError: If ``self.reasoning`` is None.
 +        """
++
 +        service = self.reasoning
 +        if service is not None:
 +            return await service.verify_action(tool_name, tool_args, result, expected)
 +        raise RuntimeError("RuntimeContext.reasoning is required for verification checks")

src/loader/runtime/reasoning_service.py (added)

 +"""Runtime-owned confidence and verification services."""
++
 +from __future__ import annotations
++
 +from typing import Any
++
 +from ..llm.base import LLMBackend, Message, Role
 +from .action_reasoning import (
 +    CONFIDENCE_PROMPT,
 +    VERIFICATION_PROMPT,
 +    estimate_confidence_quick,
 +    parse_confidence,
 +    parse_verification,
 +    quick_verify,
 +)
 +from .reasoning_types import (
 +    ActionVerification,
 +    ConfidenceAssessment,
 +    ConfidenceLevel,
 +)
++
++
 +class RuntimeReasoningService:
 +    """Score planned tool calls and check finished ones for the runtime.
++
 +    The service holds the LLM backend and the config itself, so callers do
 +    not have to route these checks through Agent callbacks.
 +    """
++
 +    def __init__(self, backend: LLMBackend, config: Any) -> None:
 +        self.config = config
 +        self.backend = backend
++
 +    async def assess_confidence(
 +        self,
 +        tool_name: str,
 +        tool_args: dict[str, Any],
 +        context: str = "",
 +    ) -> ConfidenceAssessment:
 +        """Rate how confident the runtime should be in a planned tool call.
++
 +        The heuristic runs first unless ``config.reasoning`` sets
 +        ``use_quick_confidence`` to a false value (a missing flag counts as
 +        enabled). A heuristic level of MEDIUM or above is returned directly;
 +        otherwise the backend is asked, with only the last 2000 characters of
 +        *context* included in the prompt.
 +        """
++
 +        heuristic_enabled = getattr(self.config.reasoning, "use_quick_confidence", True)
 +        if heuristic_enabled:
 +            heuristic_level = estimate_confidence_quick(tool_name, tool_args, context)
 +            if heuristic_level.value >= ConfidenceLevel.MEDIUM.value:
 +                return ConfidenceAssessment(
 +                    action=f"{tool_name} with {tool_args}",
 +                    tool_name=tool_name,
 +                    tool_args=tool_args,
 +                    level=heuristic_level,
 +                    reasoning="Quick heuristic assessment",
 +                )
++
 +        recent_context = context[-2000:] if context else "No prior context"
 +        prompt = CONFIDENCE_PROMPT.format(
 +            action=f"Call {tool_name} with arguments: {tool_args}",
 +            tool_name=tool_name,
 +            tool_args=tool_args,
 +            context=recent_context,
 +        )
 +        reply = await self.backend.complete(
 +            messages=[Message(role=Role.USER, content=prompt)],
 +            tools=None,
 +            temperature=0.3,
 +            max_tokens=300,
 +        )
 +        return parse_confidence(reply.content, tool_name, tool_args)
++
 +    async def verify_action(
 +        self,
 +        tool_name: str,
 +        tool_args: dict[str, Any],
 +        result: str,
 +        expected: str = "",
 +    ) -> ActionVerification:
 +        """Check whether a finished tool call achieved its goal.
++
 +        The heuristic runs first unless ``config.reasoning`` sets
 +        ``use_quick_verification`` to a false value (a missing flag counts as
 +        enabled). If it passes, a verified result is returned that keeps the
 +        first 500 characters of *result*. Otherwise the backend is asked, with
 +        the first 2000 characters of *result* included in the prompt.
 +        """
++
 +        heuristic_enabled = getattr(self.config.reasoning, "use_quick_verification", True)
 +        if heuristic_enabled and quick_verify(tool_name, tool_args, result):
 +            return ActionVerification(
 +                tool_name=tool_name,
 +                tool_args=tool_args,
 +                expected_outcome=expected or "Success",
 +                actual_result=result[:500],
 +                verified=True,
 +                verification_method="quick_heuristic",
 +            )
++
 +        prompt = VERIFICATION_PROMPT.format(
 +            tool_name=tool_name,
 +            tool_args=tool_args,
 +            expected=expected or "The action should complete successfully",
 +            result=result[:2000],
 +        )
 +        reply = await self.backend.complete(
 +            messages=[Message(role=Role.USER, content=prompt)],
 +            tools=None,
 +            temperature=0.3,
 +            max_tokens=300,
 +        )
 +        return parse_verification(reply.content, tool_name, tool_args, expected, result)

src/loader/runtime/tool_batches.py (modified)

              for message in self.context.messages[-5:]
              if message.content
+         )
 -        confidence = await self.context.legacy.assess_confidence(
 +        confidence = await self.context.assess_confidence(
              tool_call.name,
              tool_call.arguments,
              context,
              if isinstance(new_todos, list):
                  sync_todos_to_definition_of_done(dod, new_todos)
          self.dod_store.save(dod)
 -        self.context.legacy.set_recovery_context(None)
 +        self.context.recovery_context = None
          return None
      async def _run_post_tool_verification(
          ):
              return False
 -        verification = await self.context.legacy.verify_action(
 +        verification = await self.context.verify_action(
              tool_call.name,
              tool_call.arguments,
              outcome.result_output,
      ) -> Message | None:
          """Generate a recovery follow-up after an executed tool failure."""
 -        recovery_context = self.context.legacy.get_recovery_context()
 +        recovery_context = self.context.recovery_context
          if recovery_context is None:
              recovery_context = RecoveryContext(
                  original_tool=tool_call.name,
                  original_args=tool_call.arguments,
                  max_retries=self.context.config.max_recovery_attempts,
+             )
 -            self.context.legacy.set_recovery_context(recovery_context)
 +            self.context.recovery_context = recovery_context
          if recovery_context.is_similar_attempt(
              tool_call.name,
                  tool_name=tool_call.name,
+             )
+         )
 -        self.context.legacy.set_recovery_context(None)
 +        self.context.recovery_context = None
          return Message.tool_result_message(
              tool_call_id=tool_call.id,
              display_content=(f"Observation [{tool_call.name}]: Error: {failure_message}"),

tests/test_assistant_turns.py (modified)

              queue_steering_message=queued_messages.append,
              set_workflow_mode=lambda mode: None,
              refresh_capability_profile=lambda: None,
 -            assess_confidence=lambda tool_name, tool_args, context: None,  # type: ignore[arg-type]
 -            verify_action=lambda tool_name, tool_args, result, expected: None,  # type: ignore[arg-type]
 -            get_recovery_context=lambda: None,
 -            set_recovery_context=lambda value: None,
          ),
+     )
      return context, queued_messages

tests/test_finalization.py (modified)

              queue_steering_message=lambda message: None,
              set_workflow_mode=lambda mode: None,
              refresh_capability_profile=lambda: None,
 -            assess_confidence=lambda tool_name, tool_args, context: None,  # type: ignore[arg-type]
 -            verify_action=lambda tool_name, tool_args, result, expected: None,  # type: ignore[arg-type]
 -            get_recovery_context=lambda: None,
 -            set_recovery_context=lambda value: None,
          ),
+     )

tests/test_repair.py (modified)

              queue_steering_message=lambda message: None,
              set_workflow_mode=lambda mode: None,
              refresh_capability_profile=lambda: None,
 -            assess_confidence=lambda tool_name, tool_args, context: None,  # type: ignore[arg-type]
 -            verify_action=lambda tool_name, tool_args, result, expected: None,  # type: ignore[arg-type]
 -            get_recovery_context=lambda: None,
 -            set_recovery_context=lambda value: None,
          ),
+     )

tests/test_runtime_context.py (modified)

      assert context.use_react == agent.use_react
      assert context.active_permission_mode == agent.active_permission_mode
      assert context.active_permission_rule_counts == agent.active_permission_rule_counts
 +    assert context.reasoning is not None
      assert context.legacy.message_history() is agent.messages
      assert context.workflow_mode == "clarify"
      recovery = RecoveryContext(original_tool="read", original_args={"file_path": "README.md"})
 -    context.legacy.set_recovery_context(recovery)
 -    assert agent._recovery_context is recovery
 -    assert context.legacy.get_recovery_context() is recovery
 +    context.recovery_context = recovery
 +    assert context.recovery_context is recovery
      context.legacy.refresh_capability_profile()
      assert context.capability_profile == agent.capability_profile

tests/test_tool_batches.py (modified)

  from __future__ import annotations
 -from dataclasses import dataclass
  from pathlib import Path
  from types import SimpleNamespace
      build_permission_policy,
      load_permission_rules,
+ )
 -from loader.runtime.recovery import RecoveryContext
  from loader.runtime.reasoning_types import (
      ActionVerification,
      ConfidenceAssessment,
      ConfidenceLevel,
+ )
 +from loader.runtime.recovery import RecoveryContext
  from loader.runtime.tool_batches import ToolBatchRunner
 -from loader.runtime.tracing import RuntimeTracer
  from loader.tools.base import ToolResult as RegistryToolResult
  from loader.tools.base import create_default_registry
  from tests.helpers.runtime_harness import ScriptedBackend
      verification: bool = False,
      auto_recover: bool = True,
      min_confidence_for_action: int = 3,
 -) -> tuple[RuntimeContext, dict[str, RecoveryContext | None]]:
 +) -> RuntimeContext:
      registry = create_default_registry(temp_dir)
      registry.configure_workspace_root(temp_dir)
      rule_status = load_permission_rules(temp_dir)
          tool_requirements=registry.get_tool_requirements(),
          rules=rule_status.rules,
+     )
 -    recovery_holder = {"value": recovery_context}
      context = RuntimeContext(
          project_root=temp_dir,
          backend=ScriptedBackend(),
              queue_steering_message=lambda message: None,
              set_workflow_mode=lambda mode: None,
              refresh_capability_profile=lambda: None,
 +        ),
 +        reasoning=SimpleNamespace(
              assess_confidence=assess_confidence,
              verify_action=verify_action,
 -            get_recovery_context=lambda: recovery_holder["value"],
 -            set_recovery_context=lambda value: recovery_holder.__setitem__("value", value),
          ),
 +        recovery_context=recovery_context,
+     )
 -    return context, recovery_holder
 +    return context
  def tool_outcome(
      async def verify_action(tool_name: str, tool_args: dict, result: str, expected: str = "") -> ActionVerification:
          raise AssertionError("Verification should not run for skipped actions")
 -    context, _ = build_context(
 +    context = build_context(
          temp_dir=temp_dir,
          messages=[
              Message(role=Role.USER, content="Please inspect the project."),
      async def verify_action(tool_name: str, tool_args: dict, result: str, expected: str = "") -> ActionVerification:
          raise AssertionError("Verification should not run for failed actions")
 -    context, recovery_holder = build_context(
 +    context = build_context(
          temp_dir=temp_dir,
          messages=[],
          safeguards=FakeSafeguards(),
          consecutive_errors=0,
+     )
 -    assert recovery_holder["value"] is not None
 +    assert context.recovery_context is not None
      assert summary.tool_result_messages
      assert context.session.messages[-1] == summary.tool_result_messages[-1]
      assert any(event.type == "recovery" for event in events)
          original_tool="edit",
          original_args={"file_path": "README.md"},
+     )
 -    context, recovery_holder = build_context(
 +    context = build_context(
          temp_dir=temp_dir,
          messages=[],
          safeguards=FakeSafeguards(),
+     )
      assert verification_calls == ["file contents"]
 -    assert recovery_holder["value"] is None
 +    assert context.recovery_context is None
      assert context.session.messages[-1].role == Role.TOOL
      assert context.session.messages[-1].content == "file contents"
      assert any(event.type == "verification" for event in events)