`e36e64f`

Move reasoning types into runtime

Authored by

espadonne 1 month ago

SHA: e36e64fc543bee5048462c1e91f98561b4ecce7f
Parents: 50ab16b
Tree: 2a55e16

7 changed files

Status	File	+	-
M	`src/loader/agent/reasoning.py`	9	173
M	`src/loader/runtime/context.py`	1	4
M	`src/loader/runtime/events.py`	3	3
A	`src/loader/runtime/reasoning_types.py`	199	0
M	`src/loader/ui/adapter.py`	1	1
A	`tests/test_reasoning_types.py`	62	0
M	`tests/test_tool_batches.py`	5	1

src/loader/agent/reasoning.py (modified)

  """
  import re
 -from dataclasses import dataclass, field
 -from enum import Enum
  from typing import Any
  from ..runtime.rollback import (
      get_undo_command,
      is_destructive_tool,
  )
 +from ..runtime.reasoning_types import (
 +    ActionVerification,
 +    ConfidenceAssessment,
 +    ConfidenceLevel,
 +    SelfCritique,
 +    Subtask,
 +    TaskCompletionCheck,
 +    TaskDecomposition,
 +)
  # === Query Classification ===
      return budgets.get(complexity, (1024, 8192))
 -class ConfidenceLevel(Enum):
 -    """Confidence levels for actions."""
 -    VERY_LOW = 1      # < 20% - Need more information
 -    LOW = 2           # 20-40% - Uncertain, may need verification
 -    MEDIUM = 3        # 40-60% - Reasonable guess
 -    HIGH = 4          # 60-80% - Confident
 -    VERY_HIGH = 5     # 80-100% - Certain
+-
+-
 -@dataclass
 -class Subtask:
 -    """A decomposed subtask with dependencies."""
 -    id: str
 -    description: str
 -    dependencies: list[str] = field(default_factory=list)  # IDs of subtasks this depends on
 -    verification: str = ""  # How to verify this subtask succeeded
 -    status: str = "pending"  # pending, in_progress, completed, failed, skipped
 -    result: str = ""
 -    attempts: int = 0
 -    max_attempts: int = 2
+-
+-
 -@dataclass
 -class TaskDecomposition:
 -    """A decomposed task with ordered subtasks."""
 -    original_task: str
 -    subtasks: list[Subtask] = field(default_factory=list)
 -    current_index: int = 0
 -    rollback_points: list[int] = field(default_factory=list)  # Indices where we can safely rollback
+-
 -    def next_subtask(self) -> Subtask | None:
 -        """Get the next pending subtask that has all dependencies met."""
 -        completed_ids = {st.id for st in self.subtasks if st.status == "completed"}
+-
 -        for st in self.subtasks:
 -            if st.status == "pending":
 -                # Check if all dependencies are completed
 -                if all(dep in completed_ids for dep in st.dependencies):
 -                    return st
 -        return None
+-
 -    def mark_completed(self, subtask_id: str, result: str = "") -> None:
 -        """Mark a subtask as completed."""
 -        for st in self.subtasks:
 -            if st.id == subtask_id:
 -                st.status = "completed"
 -                st.result = result
 -                break
+-
 -    def mark_failed(self, subtask_id: str, error: str = "") -> None:
 -        """Mark a subtask as failed."""
 -        for st in self.subtasks:
 -            if st.id == subtask_id:
 -                st.status = "failed"
 -                st.result = error
 -                st.attempts += 1
 -                break
+-
 -    def can_retry(self, subtask_id: str) -> bool:
 -        """Check if a subtask can be retried."""
 -        for st in self.subtasks:
 -            if st.id == subtask_id:
 -                return st.attempts < st.max_attempts
 -        return False
+-
 -    def reset_for_retry(self, subtask_id: str) -> None:
 -        """Reset a subtask for retry."""
 -        for st in self.subtasks:
 -            if st.id == subtask_id:
 -                st.status = "pending"
 -                break
+-
 -    def progress_str(self) -> str:
 -        """Get progress string like '[2/5]'."""
 -        completed = sum(1 for st in self.subtasks if st.status == "completed")
 -        total = len(self.subtasks)
 -        return f"[{completed}/{total}]"
+-
 -    def is_complete(self) -> bool:
 -        """Check if all subtasks are completed."""
 -        return all(st.status in ("completed", "skipped") for st in self.subtasks)
+-
 -    def has_failures(self) -> bool:
 -        """Check if any subtask has failed (and can't be retried)."""
 -        return any(
 -            st.status == "failed" and st.attempts >= st.max_attempts
 -            for st in self.subtasks
 -        )
+-
 -    def to_prompt(self) -> str:
 -        """Format decomposition for LLM prompt."""
 -        lines = [f"Task: {self.original_task}", "", "Subtasks:"]
 -        for i, st in enumerate(self.subtasks, 1):
 -            status_icon = {
 -                "pending": "○",
 -                "in_progress": "◐",
 -                "completed": "●",
 -                "failed": "✗",
 -                "skipped": "⊘",
 -            }.get(st.status, "?")
 -            deps = f" (after: {', '.join(st.dependencies)})" if st.dependencies else ""
 -            lines.append(f"  {status_icon} {i}. {st.description}{deps}")
 -            if st.verification:
 -                lines.append(f"      Verify: {st.verification}")
 -        return "\n".join(lines)
+-
+-
 -@dataclass
 -class SelfCritique:
 -    """Result of self-critique analysis."""
 -    original_response: str
 -    issues_found: list[str] = field(default_factory=list)
 -    suggestions: list[str] = field(default_factory=list)
 -    should_revise: bool = False
 -    revised_response: str = ""
 -    revision_count: int = 0
 -    max_revisions: int = 2
+-
 -    def can_revise(self) -> bool:
 -        """Check if we can do another revision."""
 -        return self.should_revise and self.revision_count < self.max_revisions
+-
+-
 -@dataclass
 -class ConfidenceAssessment:
 -    """Confidence assessment for an action."""
 -    action: str  # Description of the action
 -    tool_name: str
 -    tool_args: dict[str, Any]
 -    level: ConfidenceLevel = ConfidenceLevel.MEDIUM
 -    reasoning: str = ""
 -    risks: list[str] = field(default_factory=list)
 -    mitigations: list[str] = field(default_factory=list)
 -    requires_verification: bool = False
+-
 -    @property
 -    def score(self) -> int:
 -        """Get numeric score 1-5."""
 -        return self.level.value
+-
 -    @property
 -    def is_low_confidence(self) -> bool:
 -        """Check if confidence is low enough to warrant caution."""
 -        return self.level.value <= ConfidenceLevel.LOW.value
+-
+-
 -@dataclass
 -class ActionVerification:
 -    """Verification result for a completed action."""
 -    tool_name: str
 -    tool_args: dict[str, Any]
 -    expected_outcome: str
 -    actual_result: str
 -    verified: bool = False
 -    verification_method: str = ""  # How we verified (e.g., "file_exists", "output_contains")
 -    discrepancies: list[str] = field(default_factory=list)
 -    needs_correction: bool = False
 -    correction_suggestion: str = ""
+-
+-
  # Prompts for reasoning stages
  DECOMPOSITION_PROMPT = """Analyze this task and break it down into atomic subtasks.
  # === Task Completion Detection ===
 -@dataclass
 -class TaskCompletionCheck:
 -    """Result of checking if a task is complete."""
 -    original_task: str
 -    is_complete: bool = False
 -    accomplished: list[str] = field(default_factory=list)
 -    remaining: list[str] = field(default_factory=list)
 -    suggested_next_steps: list[str] = field(default_factory=list)
 -    continuation_prompt: str = ""
+-
+-
  COMPLETION_CHECK_PROMPT = """Evaluate if this task has been FULLY completed.
  Original task: {task}

src/loader/runtime/context.py (modified)

  from pathlib import Path
  from typing import Any, Protocol
 -from ..agent.reasoning import (
 -    ActionVerification,
 -    ConfidenceAssessment,
 -)
  from ..context.project import ProjectContext
  from ..llm.base import LLMBackend, Message
  from ..tools.base import ToolRegistry
  from .capabilities import CapabilityProfile
  from .permissions import PermissionConfigStatus, PermissionPolicy
  from .recovery import RecoveryContext
 +from .reasoning_types import ActionVerification, ConfidenceAssessment
  from .session import ConversationSession

src/loader/runtime/events.py (modified)

  from dataclasses import dataclass, field
  from typing import Any
 -from ..agent.reasoning import (
 +from ..llm.base import Message
 +from .dod import DefinitionOfDone
 +from .reasoning_types import (
      ActionVerification,
      ConfidenceAssessment,
      SelfCritique,
      TaskCompletionCheck,
      TaskDecomposition,
  )
 -from ..llm.base import Message
 -from .dod import DefinitionOfDone
  from .rollback import RollbackAction, RollbackPlan
  from .tracing import RuntimeTraceEvent

src/loader/runtime/reasoning_types.py (added)

 +"""Runtime-owned typed surfaces shared with reasoning flows."""
++
 +from __future__ import annotations
++
 +from dataclasses import dataclass, field
 +from enum import Enum
 +from typing import Any
++
++
 +class ConfidenceLevel(Enum):
 +    """Confidence levels for actions."""
++
 +    VERY_LOW = 1
 +    LOW = 2
 +    MEDIUM = 3
 +    HIGH = 4
 +    VERY_HIGH = 5
++
++
 +@dataclass
 +class Subtask:
 +    """A decomposed subtask with dependencies."""
++
 +    id: str
 +    description: str
 +    dependencies: list[str] = field(default_factory=list)
 +    verification: str = ""
 +    status: str = "pending"
 +    result: str = ""
 +    attempts: int = 0
 +    max_attempts: int = 2
++
++
 +@dataclass
 +class TaskDecomposition:
 +    """A decomposed task with ordered subtasks."""
++
 +    original_task: str
 +    subtasks: list[Subtask] = field(default_factory=list)
 +    current_index: int = 0
 +    rollback_points: list[int] = field(default_factory=list)
++
 +    def next_subtask(self) -> Subtask | None:
 +        """Get the next pending subtask that has all dependencies met."""
++
 +        completed_ids = {subtask.id for subtask in self.subtasks if subtask.status == "completed"}
 +        for subtask in self.subtasks:
 +            if subtask.status == "pending" and all(
 +                dependency in completed_ids for dependency in subtask.dependencies
 +            ):
 +                return subtask
 +        return None
++
 +    def mark_completed(self, subtask_id: str, result: str = "") -> None:
 +        """Mark a subtask as completed."""
++
 +        for subtask in self.subtasks:
 +            if subtask.id == subtask_id:
 +                subtask.status = "completed"
 +                subtask.result = result
 +                break
++
 +    def mark_failed(self, subtask_id: str, error: str = "") -> None:
 +        """Mark a subtask as failed."""
++
 +        for subtask in self.subtasks:
 +            if subtask.id == subtask_id:
 +                subtask.status = "failed"
 +                subtask.result = error
 +                subtask.attempts += 1
 +                break
++
 +    def can_retry(self, subtask_id: str) -> bool:
 +        """Check if a subtask can be retried."""
++
 +        for subtask in self.subtasks:
 +            if subtask.id == subtask_id:
 +                return subtask.attempts < subtask.max_attempts
 +        return False
++
 +    def reset_for_retry(self, subtask_id: str) -> None:
 +        """Reset a subtask for retry."""
++
 +        for subtask in self.subtasks:
 +            if subtask.id == subtask_id:
 +                subtask.status = "pending"
 +                break
++
 +    def progress_str(self) -> str:
 +        """Get progress string like '[2/5]'."""
++
 +        completed = sum(1 for subtask in self.subtasks if subtask.status == "completed")
 +        return f"[{completed}/{len(self.subtasks)}]"
++
 +    def is_complete(self) -> bool:
 +        """Check if all subtasks are completed."""
++
 +        return all(subtask.status in ("completed", "skipped") for subtask in self.subtasks)
++
 +    def has_failures(self) -> bool:
 +        """Check if any subtask has failed and exhausted retries."""
++
 +        return any(
 +            subtask.status == "failed" and subtask.attempts >= subtask.max_attempts
 +            for subtask in self.subtasks
 +        )
++
 +    def to_prompt(self) -> str:
 +        """Format decomposition for LLM prompt."""
++
 +        lines = [f"Task: {self.original_task}", "", "Subtasks:"]
 +        for index, subtask in enumerate(self.subtasks, 1):
 +            status_icon = {
 +                "pending": "○",
 +                "in_progress": "◐",
 +                "completed": "●",
 +                "failed": "✗",
 +                "skipped": "⊘",
 +            }.get(subtask.status, "?")
 +            dependencies = (
 +                f" (after: {', '.join(subtask.dependencies)})"
 +                if subtask.dependencies
 +                else ""
 +            )
 +            lines.append(f"  {status_icon} {index}. {subtask.description}{dependencies}")
 +            if subtask.verification:
 +                lines.append(f"      Verify: {subtask.verification}")
 +        return "\n".join(lines)
++
++
 +@dataclass
 +class SelfCritique:
 +    """Result of self-critique analysis."""
++
 +    original_response: str
 +    issues_found: list[str] = field(default_factory=list)
 +    suggestions: list[str] = field(default_factory=list)
 +    should_revise: bool = False
 +    revised_response: str = ""
 +    revision_count: int = 0
 +    max_revisions: int = 2
++
 +    def can_revise(self) -> bool:
 +        """Check if we can do another revision."""
++
 +        return self.should_revise and self.revision_count < self.max_revisions
++
++
 +@dataclass
 +class ConfidenceAssessment:
 +    """Confidence assessment for an action."""
++
 +    action: str
 +    tool_name: str
 +    tool_args: dict[str, Any]
 +    level: ConfidenceLevel = ConfidenceLevel.MEDIUM
 +    reasoning: str = ""
 +    risks: list[str] = field(default_factory=list)
 +    mitigations: list[str] = field(default_factory=list)
 +    requires_verification: bool = False
++
 +    @property
 +    def score(self) -> int:
 +        """Get numeric score 1-5."""
++
 +        return self.level.value
++
 +    @property
 +    def is_low_confidence(self) -> bool:
 +        """Check if confidence is low enough to warrant caution."""
++
 +        return self.level.value <= ConfidenceLevel.LOW.value
++
++
 +@dataclass
 +class ActionVerification:
 +    """Verification result for a completed action."""
++
 +    tool_name: str
 +    tool_args: dict[str, Any]
 +    expected_outcome: str
 +    actual_result: str
 +    verified: bool = False
 +    verification_method: str = ""
 +    discrepancies: list[str] = field(default_factory=list)
 +    needs_correction: bool = False
 +    correction_suggestion: str = ""
++
++
 +@dataclass
 +class TaskCompletionCheck:
 +    """Result of checking if a task is complete."""
++
 +    original_task: str
 +    is_complete: bool = False
 +    accomplished: list[str] = field(default_factory=list)
 +    remaining: list[str] = field(default_factory=list)
 +    suggested_next_steps: list[str] = field(default_factory=list)
 +    continuation_prompt: str = ""

src/loader/ui/adapter.py (modified)

  from ..agent.loop import AgentEvent
  if TYPE_CHECKING:
 -    from ..agent.reasoning import (
 +    from ..runtime.reasoning_types import (
          ActionVerification,
          ConfidenceAssessment,
          SelfCritique,

tests/test_reasoning_types.py (added)

 +"""Tests for runtime-owned reasoning type surfaces."""
++
 +from __future__ import annotations
++
 +from loader.runtime.reasoning_types import (
 +    ConfidenceAssessment,
 +    ConfidenceLevel,
 +    SelfCritique,
 +    Subtask,
 +    TaskCompletionCheck,
 +    TaskDecomposition,
 +)
++
++
 +def test_task_decomposition_tracks_progress_and_retry_state() -> None:
 +    decomposition = TaskDecomposition(
 +        original_task="Ship feature",
 +        subtasks=[
 +            Subtask(id="1", description="Read spec"),
 +            Subtask(id="2", description="Implement", dependencies=["1"]),
 +        ],
 +    )
++
 +    assert decomposition.next_subtask() is decomposition.subtasks[0]
 +    assert decomposition.progress_str() == "[0/2]"
++
 +    decomposition.mark_completed("1", "done")
++
 +    assert decomposition.progress_str() == "[1/2]"
 +    assert decomposition.next_subtask() is decomposition.subtasks[1]
++
 +    decomposition.mark_failed("2", "test failure")
++
 +    assert decomposition.can_retry("2") is True
 +    decomposition.reset_for_retry("2")
 +    assert decomposition.subtasks[1].status == "pending"
++
++
 +def test_confidence_assessment_exposes_score_helpers() -> None:
 +    assessment = ConfidenceAssessment(
 +        action="Write file",
 +        tool_name="write",
 +        tool_args={"file_path": "notes.txt"},
 +        level=ConfidenceLevel.LOW,
 +    )
++
 +    assert assessment.score == 2
 +    assert assessment.is_low_confidence is True
++
++
 +def test_self_critique_and_completion_defaults_are_stable() -> None:
 +    critique = SelfCritique(
 +        original_response="draft",
 +        should_revise=True,
 +        revision_count=1,
 +        max_revisions=2,
 +    )
 +    completion = TaskCompletionCheck(original_task="Ship feature")
++
 +    assert critique.can_revise() is True
 +    assert completion.is_complete is False
 +    assert completion.accomplished == []

tests/test_tool_batches.py (modified)

  import pytest
 -from loader.agent.reasoning import ActionVerification, ConfidenceAssessment, ConfidenceLevel
  from loader.llm.base import Message, Role, ToolCall
  from loader.runtime.context import RuntimeContext, RuntimeLegacyServices
  from loader.runtime.dod import DefinitionOfDoneStore, create_definition_of_done
      load_permission_rules,
  )
  from loader.runtime.recovery import RecoveryContext
 +from loader.runtime.reasoning_types import (
 +    ActionVerification,
 +    ConfidenceAssessment,
 +    ConfidenceLevel,
 +)
  from loader.runtime.tool_batches import ToolBatchRunner
  from loader.runtime.tracing import RuntimeTracer
  from loader.tools.base import ToolResult as RegistryToolResult