Move reasoning types into runtime
- SHA: e36e64fc543bee5048462c1e91f98561b4ecce7f (short: e36e64f)
- Parents: 50ab16b
- Tree: 2a55e16

| Status | File | + | - |
|---|---|---|---|
| M | src/loader/agent/reasoning.py | 9 | 173 |
| M | src/loader/runtime/context.py | 1 | 4 |
| M | src/loader/runtime/events.py | 3 | 3 |
| A | src/loader/runtime/reasoning_types.py | 199 | 0 |
| M | src/loader/ui/adapter.py | 1 | 1 |
| A | tests/test_reasoning_types.py | 62 | 0 |
| M | tests/test_tool_batches.py | 5 | 1 |
src/loader/agent/reasoning.py (modified)
@@ -10,8 +10,6 @@ enabled to improve the agent's decision-making: | ||
| 10 | 10 | """ |
| 11 | 11 | |
| 12 | 12 | import re |
| 13 | -from dataclasses import dataclass, field | |
| 14 | -from enum import Enum | |
| 15 | 13 | from typing import Any |
| 16 | 14 | |
| 17 | 15 | from ..runtime.rollback import ( |
@@ -23,6 +21,15 @@ from ..runtime.rollback import ( | ||
| 23 | 21 | get_undo_command, |
| 24 | 22 | is_destructive_tool, |
| 25 | 23 | ) |
| 24 | +from ..runtime.reasoning_types import ( | |
| 25 | + ActionVerification, | |
| 26 | + ConfidenceAssessment, | |
| 27 | + ConfidenceLevel, | |
| 28 | + SelfCritique, | |
| 29 | + Subtask, | |
| 30 | + TaskCompletionCheck, | |
| 31 | + TaskDecomposition, | |
| 32 | +) | |
| 26 | 33 | |
| 27 | 34 | |
| 28 | 35 | # === Query Classification === |
@@ -133,166 +140,6 @@ def get_token_budget(complexity: str) -> tuple[int, int]: | ||
| 133 | 140 | return budgets.get(complexity, (1024, 8192)) |
| 134 | 141 | |
| 135 | 142 | |
| 136 | -class ConfidenceLevel(Enum): | |
| 137 | - """Confidence levels for actions.""" | |
| 138 | - VERY_LOW = 1 # < 20% - Need more information | |
| 139 | - LOW = 2 # 20-40% - Uncertain, may need verification | |
| 140 | - MEDIUM = 3 # 40-60% - Reasonable guess | |
| 141 | - HIGH = 4 # 60-80% - Confident | |
| 142 | - VERY_HIGH = 5 # 80-100% - Certain | |
| 143 | - | |
| 144 | - | |
| 145 | -@dataclass | |
| 146 | -class Subtask: | |
| 147 | - """A decomposed subtask with dependencies.""" | |
| 148 | - id: str | |
| 149 | - description: str | |
| 150 | - dependencies: list[str] = field(default_factory=list) # IDs of subtasks this depends on | |
| 151 | - verification: str = "" # How to verify this subtask succeeded | |
| 152 | - status: str = "pending" # pending, in_progress, completed, failed, skipped | |
| 153 | - result: str = "" | |
| 154 | - attempts: int = 0 | |
| 155 | - max_attempts: int = 2 | |
| 156 | - | |
| 157 | - | |
| 158 | -@dataclass | |
| 159 | -class TaskDecomposition: | |
| 160 | - """A decomposed task with ordered subtasks.""" | |
| 161 | - original_task: str | |
| 162 | - subtasks: list[Subtask] = field(default_factory=list) | |
| 163 | - current_index: int = 0 | |
| 164 | - rollback_points: list[int] = field(default_factory=list) # Indices where we can safely rollback | |
| 165 | - | |
| 166 | - def next_subtask(self) -> Subtask | None: | |
| 167 | - """Get the next pending subtask that has all dependencies met.""" | |
| 168 | - completed_ids = {st.id for st in self.subtasks if st.status == "completed"} | |
| 169 | - | |
| 170 | - for st in self.subtasks: | |
| 171 | - if st.status == "pending": | |
| 172 | - # Check if all dependencies are completed | |
| 173 | - if all(dep in completed_ids for dep in st.dependencies): | |
| 174 | - return st | |
| 175 | - return None | |
| 176 | - | |
| 177 | - def mark_completed(self, subtask_id: str, result: str = "") -> None: | |
| 178 | - """Mark a subtask as completed.""" | |
| 179 | - for st in self.subtasks: | |
| 180 | - if st.id == subtask_id: | |
| 181 | - st.status = "completed" | |
| 182 | - st.result = result | |
| 183 | - break | |
| 184 | - | |
| 185 | - def mark_failed(self, subtask_id: str, error: str = "") -> None: | |
| 186 | - """Mark a subtask as failed.""" | |
| 187 | - for st in self.subtasks: | |
| 188 | - if st.id == subtask_id: | |
| 189 | - st.status = "failed" | |
| 190 | - st.result = error | |
| 191 | - st.attempts += 1 | |
| 192 | - break | |
| 193 | - | |
| 194 | - def can_retry(self, subtask_id: str) -> bool: | |
| 195 | - """Check if a subtask can be retried.""" | |
| 196 | - for st in self.subtasks: | |
| 197 | - if st.id == subtask_id: | |
| 198 | - return st.attempts < st.max_attempts | |
| 199 | - return False | |
| 200 | - | |
| 201 | - def reset_for_retry(self, subtask_id: str) -> None: | |
| 202 | - """Reset a subtask for retry.""" | |
| 203 | - for st in self.subtasks: | |
| 204 | - if st.id == subtask_id: | |
| 205 | - st.status = "pending" | |
| 206 | - break | |
| 207 | - | |
| 208 | - def progress_str(self) -> str: | |
| 209 | - """Get progress string like '[2/5]'.""" | |
| 210 | - completed = sum(1 for st in self.subtasks if st.status == "completed") | |
| 211 | - total = len(self.subtasks) | |
| 212 | - return f"[{completed}/{total}]" | |
| 213 | - | |
| 214 | - def is_complete(self) -> bool: | |
| 215 | - """Check if all subtasks are completed.""" | |
| 216 | - return all(st.status in ("completed", "skipped") for st in self.subtasks) | |
| 217 | - | |
| 218 | - def has_failures(self) -> bool: | |
| 219 | - """Check if any subtask has failed (and can't be retried).""" | |
| 220 | - return any( | |
| 221 | - st.status == "failed" and st.attempts >= st.max_attempts | |
| 222 | - for st in self.subtasks | |
| 223 | - ) | |
| 224 | - | |
| 225 | - def to_prompt(self) -> str: | |
| 226 | - """Format decomposition for LLM prompt.""" | |
| 227 | - lines = [f"Task: {self.original_task}", "", "Subtasks:"] | |
| 228 | - for i, st in enumerate(self.subtasks, 1): | |
| 229 | - status_icon = { | |
| 230 | - "pending": "○", | |
| 231 | - "in_progress": "◐", | |
| 232 | - "completed": "●", | |
| 233 | - "failed": "✗", | |
| 234 | - "skipped": "⊘", | |
| 235 | - }.get(st.status, "?") | |
| 236 | - deps = f" (after: {', '.join(st.dependencies)})" if st.dependencies else "" | |
| 237 | - lines.append(f" {status_icon} {i}. {st.description}{deps}") | |
| 238 | - if st.verification: | |
| 239 | - lines.append(f" Verify: {st.verification}") | |
| 240 | - return "\n".join(lines) | |
| 241 | - | |
| 242 | - | |
| 243 | -@dataclass | |
| 244 | -class SelfCritique: | |
| 245 | - """Result of self-critique analysis.""" | |
| 246 | - original_response: str | |
| 247 | - issues_found: list[str] = field(default_factory=list) | |
| 248 | - suggestions: list[str] = field(default_factory=list) | |
| 249 | - should_revise: bool = False | |
| 250 | - revised_response: str = "" | |
| 251 | - revision_count: int = 0 | |
| 252 | - max_revisions: int = 2 | |
| 253 | - | |
| 254 | - def can_revise(self) -> bool: | |
| 255 | - """Check if we can do another revision.""" | |
| 256 | - return self.should_revise and self.revision_count < self.max_revisions | |
| 257 | - | |
| 258 | - | |
| 259 | -@dataclass | |
| 260 | -class ConfidenceAssessment: | |
| 261 | - """Confidence assessment for an action.""" | |
| 262 | - action: str # Description of the action | |
| 263 | - tool_name: str | |
| 264 | - tool_args: dict[str, Any] | |
| 265 | - level: ConfidenceLevel = ConfidenceLevel.MEDIUM | |
| 266 | - reasoning: str = "" | |
| 267 | - risks: list[str] = field(default_factory=list) | |
| 268 | - mitigations: list[str] = field(default_factory=list) | |
| 269 | - requires_verification: bool = False | |
| 270 | - | |
| 271 | - @property | |
| 272 | - def score(self) -> int: | |
| 273 | - """Get numeric score 1-5.""" | |
| 274 | - return self.level.value | |
| 275 | - | |
| 276 | - @property | |
| 277 | - def is_low_confidence(self) -> bool: | |
| 278 | - """Check if confidence is low enough to warrant caution.""" | |
| 279 | - return self.level.value <= ConfidenceLevel.LOW.value | |
| 280 | - | |
| 281 | - | |
| 282 | -@dataclass | |
| 283 | -class ActionVerification: | |
| 284 | - """Verification result for a completed action.""" | |
| 285 | - tool_name: str | |
| 286 | - tool_args: dict[str, Any] | |
| 287 | - expected_outcome: str | |
| 288 | - actual_result: str | |
| 289 | - verified: bool = False | |
| 290 | - verification_method: str = "" # How we verified (e.g., "file_exists", "output_contains") | |
| 291 | - discrepancies: list[str] = field(default_factory=list) | |
| 292 | - needs_correction: bool = False | |
| 293 | - correction_suggestion: str = "" | |
| 294 | - | |
| 295 | - | |
| 296 | 143 | # Prompts for reasoning stages |
| 297 | 144 | |
| 298 | 145 | DECOMPOSITION_PROMPT = """Analyze this task and break it down into atomic subtasks. |
@@ -705,17 +552,6 @@ def quick_verify(tool_name: str, tool_args: dict, result: str) -> bool: | ||
| 705 | 552 | |
| 706 | 553 | # === Task Completion Detection === |
| 707 | 554 | |
| 708 | -@dataclass | |
| 709 | -class TaskCompletionCheck: | |
| 710 | - """Result of checking if a task is complete.""" | |
| 711 | - original_task: str | |
| 712 | - is_complete: bool = False | |
| 713 | - accomplished: list[str] = field(default_factory=list) | |
| 714 | - remaining: list[str] = field(default_factory=list) | |
| 715 | - suggested_next_steps: list[str] = field(default_factory=list) | |
| 716 | - continuation_prompt: str = "" | |
| 717 | - | |
| 718 | - | |
| 719 | 555 | COMPLETION_CHECK_PROMPT = """Evaluate if this task has been FULLY completed. |
| 720 | 556 | |
| 721 | 557 | Original task: {task} |
src/loader/runtime/context.py (modified)
@@ -7,16 +7,13 @@ from dataclasses import dataclass | ||
| 7 | 7 | from pathlib import Path |
| 8 | 8 | from typing import Any, Protocol |
| 9 | 9 | |
| 10 | -from ..agent.reasoning import ( | |
| 11 | - ActionVerification, | |
| 12 | - ConfidenceAssessment, | |
| 13 | -) | |
| 14 | 10 | from ..context.project import ProjectContext |
| 15 | 11 | from ..llm.base import LLMBackend, Message |
| 16 | 12 | from ..tools.base import ToolRegistry |
| 17 | 13 | from .capabilities import CapabilityProfile |
| 18 | 14 | from .permissions import PermissionConfigStatus, PermissionPolicy |
| 19 | 15 | from .recovery import RecoveryContext |
| 16 | +from .reasoning_types import ActionVerification, ConfidenceAssessment | |
| 20 | 17 | from .session import ConversationSession |
| 21 | 18 | |
| 22 | 19 | |
src/loader/runtime/events.py (modified)
@@ -5,7 +5,9 @@ from __future__ import annotations | ||
| 5 | 5 | from dataclasses import dataclass, field |
| 6 | 6 | from typing import Any |
| 7 | 7 | |
| 8 | -from ..agent.reasoning import ( | |
| 8 | +from ..llm.base import Message | |
| 9 | +from .dod import DefinitionOfDone | |
| 10 | +from .reasoning_types import ( | |
| 9 | 11 | ActionVerification, |
| 10 | 12 | ConfidenceAssessment, |
| 11 | 13 | SelfCritique, |
@@ -13,8 +15,6 @@ from ..agent.reasoning import ( | ||
| 13 | 15 | TaskCompletionCheck, |
| 14 | 16 | TaskDecomposition, |
| 15 | 17 | ) |
| 16 | -from ..llm.base import Message | |
| 17 | -from .dod import DefinitionOfDone | |
| 18 | 18 | from .rollback import RollbackAction, RollbackPlan |
| 19 | 19 | from .tracing import RuntimeTraceEvent |
| 20 | 20 | |
src/loader/runtime/reasoning_types.py (added)
@@ -0,0 +1,199 @@ | ||
| 1 | +"""Runtime-owned typed surfaces shared with reasoning flows.""" | |
| 2 | + | |
| 3 | +from __future__ import annotations | |
| 4 | + | |
| 5 | +from dataclasses import dataclass, field | |
| 6 | +from enum import Enum | |
| 7 | +from typing import Any | |
| 8 | + | |
| 9 | + | |
| 10 | +class ConfidenceLevel(Enum): | |
| 11 | + """Confidence levels for actions.""" | |
| 12 | + | |
| 13 | + VERY_LOW = 1 | |
| 14 | + LOW = 2 | |
| 15 | + MEDIUM = 3 | |
| 16 | + HIGH = 4 | |
| 17 | + VERY_HIGH = 5 | |
| 18 | + | |
| 19 | + | |
| 20 | +@dataclass | |
| 21 | +class Subtask: | |
| 22 | + """A decomposed subtask with dependencies.""" | |
| 23 | + | |
| 24 | + id: str | |
| 25 | + description: str | |
| 26 | + dependencies: list[str] = field(default_factory=list) | |
| 27 | + verification: str = "" | |
| 28 | + status: str = "pending" | |
| 29 | + result: str = "" | |
| 30 | + attempts: int = 0 | |
| 31 | + max_attempts: int = 2 | |
| 32 | + | |
| 33 | + | |
| 34 | +@dataclass | |
| 35 | +class TaskDecomposition: | |
| 36 | + """A decomposed task with ordered subtasks.""" | |
| 37 | + | |
| 38 | + original_task: str | |
| 39 | + subtasks: list[Subtask] = field(default_factory=list) | |
| 40 | + current_index: int = 0 | |
| 41 | + rollback_points: list[int] = field(default_factory=list) | |
| 42 | + | |
| 43 | + def next_subtask(self) -> Subtask | None: | |
| 44 | + """Get the next pending subtask that has all dependencies met.""" | |
| 45 | + | |
| 46 | + completed_ids = {subtask.id for subtask in self.subtasks if subtask.status == "completed"} | |
| 47 | + for subtask in self.subtasks: | |
| 48 | + if subtask.status == "pending" and all( | |
| 49 | + dependency in completed_ids for dependency in subtask.dependencies | |
| 50 | + ): | |
| 51 | + return subtask | |
| 52 | + return None | |
| 53 | + | |
| 54 | + def mark_completed(self, subtask_id: str, result: str = "") -> None: | |
| 55 | + """Mark a subtask as completed.""" | |
| 56 | + | |
| 57 | + for subtask in self.subtasks: | |
| 58 | + if subtask.id == subtask_id: | |
| 59 | + subtask.status = "completed" | |
| 60 | + subtask.result = result | |
| 61 | + break | |
| 62 | + | |
| 63 | + def mark_failed(self, subtask_id: str, error: str = "") -> None: | |
| 64 | + """Mark a subtask as failed.""" | |
| 65 | + | |
| 66 | + for subtask in self.subtasks: | |
| 67 | + if subtask.id == subtask_id: | |
| 68 | + subtask.status = "failed" | |
| 69 | + subtask.result = error | |
| 70 | + subtask.attempts += 1 | |
| 71 | + break | |
| 72 | + | |
| 73 | + def can_retry(self, subtask_id: str) -> bool: | |
| 74 | + """Check if a subtask can be retried.""" | |
| 75 | + | |
| 76 | + for subtask in self.subtasks: | |
| 77 | + if subtask.id == subtask_id: | |
| 78 | + return subtask.attempts < subtask.max_attempts | |
| 79 | + return False | |
| 80 | + | |
| 81 | + def reset_for_retry(self, subtask_id: str) -> None: | |
| 82 | + """Reset a subtask for retry.""" | |
| 83 | + | |
| 84 | + for subtask in self.subtasks: | |
| 85 | + if subtask.id == subtask_id: | |
| 86 | + subtask.status = "pending" | |
| 87 | + break | |
| 88 | + | |
| 89 | + def progress_str(self) -> str: | |
| 90 | + """Get progress string like '[2/5]'.""" | |
| 91 | + | |
| 92 | + completed = sum(1 for subtask in self.subtasks if subtask.status == "completed") | |
| 93 | + return f"[{completed}/{len(self.subtasks)}]" | |
| 94 | + | |
| 95 | + def is_complete(self) -> bool: | |
| 96 | + """Check if all subtasks are completed.""" | |
| 97 | + | |
| 98 | + return all(subtask.status in ("completed", "skipped") for subtask in self.subtasks) | |
| 99 | + | |
| 100 | + def has_failures(self) -> bool: | |
| 101 | + """Check if any subtask has failed and exhausted retries.""" | |
| 102 | + | |
| 103 | + return any( | |
| 104 | + subtask.status == "failed" and subtask.attempts >= subtask.max_attempts | |
| 105 | + for subtask in self.subtasks | |
| 106 | + ) | |
| 107 | + | |
| 108 | + def to_prompt(self) -> str: | |
| 109 | + """Format decomposition for LLM prompt.""" | |
| 110 | + | |
| 111 | + lines = [f"Task: {self.original_task}", "", "Subtasks:"] | |
| 112 | + for index, subtask in enumerate(self.subtasks, 1): | |
| 113 | + status_icon = { | |
| 114 | + "pending": "○", | |
| 115 | + "in_progress": "◐", | |
| 116 | + "completed": "●", | |
| 117 | + "failed": "✗", | |
| 118 | + "skipped": "⊘", | |
| 119 | + }.get(subtask.status, "?") | |
| 120 | + dependencies = ( | |
| 121 | + f" (after: {', '.join(subtask.dependencies)})" | |
| 122 | + if subtask.dependencies | |
| 123 | + else "" | |
| 124 | + ) | |
| 125 | + lines.append(f" {status_icon} {index}. {subtask.description}{dependencies}") | |
| 126 | + if subtask.verification: | |
| 127 | + lines.append(f" Verify: {subtask.verification}") | |
| 128 | + return "\n".join(lines) | |
| 129 | + | |
| 130 | + | |
| 131 | +@dataclass | |
| 132 | +class SelfCritique: | |
| 133 | + """Result of self-critique analysis.""" | |
| 134 | + | |
| 135 | + original_response: str | |
| 136 | + issues_found: list[str] = field(default_factory=list) | |
| 137 | + suggestions: list[str] = field(default_factory=list) | |
| 138 | + should_revise: bool = False | |
| 139 | + revised_response: str = "" | |
| 140 | + revision_count: int = 0 | |
| 141 | + max_revisions: int = 2 | |
| 142 | + | |
| 143 | + def can_revise(self) -> bool: | |
| 144 | + """Check if we can do another revision.""" | |
| 145 | + | |
| 146 | + return self.should_revise and self.revision_count < self.max_revisions | |
| 147 | + | |
| 148 | + | |
| 149 | +@dataclass | |
| 150 | +class ConfidenceAssessment: | |
| 151 | + """Confidence assessment for an action.""" | |
| 152 | + | |
| 153 | + action: str | |
| 154 | + tool_name: str | |
| 155 | + tool_args: dict[str, Any] | |
| 156 | + level: ConfidenceLevel = ConfidenceLevel.MEDIUM | |
| 157 | + reasoning: str = "" | |
| 158 | + risks: list[str] = field(default_factory=list) | |
| 159 | + mitigations: list[str] = field(default_factory=list) | |
| 160 | + requires_verification: bool = False | |
| 161 | + | |
| 162 | + @property | |
| 163 | + def score(self) -> int: | |
| 164 | + """Get numeric score 1-5.""" | |
| 165 | + | |
| 166 | + return self.level.value | |
| 167 | + | |
| 168 | + @property | |
| 169 | + def is_low_confidence(self) -> bool: | |
| 170 | + """Check if confidence is low enough to warrant caution.""" | |
| 171 | + | |
| 172 | + return self.level.value <= ConfidenceLevel.LOW.value | |
| 173 | + | |
| 174 | + | |
| 175 | +@dataclass | |
| 176 | +class ActionVerification: | |
| 177 | + """Verification result for a completed action.""" | |
| 178 | + | |
| 179 | + tool_name: str | |
| 180 | + tool_args: dict[str, Any] | |
| 181 | + expected_outcome: str | |
| 182 | + actual_result: str | |
| 183 | + verified: bool = False | |
| 184 | + verification_method: str = "" | |
| 185 | + discrepancies: list[str] = field(default_factory=list) | |
| 186 | + needs_correction: bool = False | |
| 187 | + correction_suggestion: str = "" | |
| 188 | + | |
| 189 | + | |
| 190 | +@dataclass | |
| 191 | +class TaskCompletionCheck: | |
| 192 | + """Result of checking if a task is complete.""" | |
| 193 | + | |
| 194 | + original_task: str | |
| 195 | + is_complete: bool = False | |
| 196 | + accomplished: list[str] = field(default_factory=list) | |
| 197 | + remaining: list[str] = field(default_factory=list) | |
| 198 | + suggested_next_steps: list[str] = field(default_factory=list) | |
| 199 | + continuation_prompt: str = "" | |
src/loader/ui/adapter.py (modified)
@@ -8,7 +8,7 @@ from textual.message import Message | ||
| 8 | 8 | from ..agent.loop import AgentEvent |
| 9 | 9 | |
| 10 | 10 | if TYPE_CHECKING: |
| 11 | - from ..agent.reasoning import ( | |
| 11 | + from ..runtime.reasoning_types import ( | |
| 12 | 12 | ActionVerification, |
| 13 | 13 | ConfidenceAssessment, |
| 14 | 14 | SelfCritique, |
tests/test_reasoning_types.py (added)
@@ -0,0 +1,62 @@ | ||
| 1 | +"""Tests for runtime-owned reasoning type surfaces.""" | |
| 2 | + | |
| 3 | +from __future__ import annotations | |
| 4 | + | |
| 5 | +from loader.runtime.reasoning_types import ( | |
| 6 | + ConfidenceAssessment, | |
| 7 | + ConfidenceLevel, | |
| 8 | + SelfCritique, | |
| 9 | + Subtask, | |
| 10 | + TaskCompletionCheck, | |
| 11 | + TaskDecomposition, | |
| 12 | +) | |
| 13 | + | |
| 14 | + | |
| 15 | +def test_task_decomposition_tracks_progress_and_retry_state() -> None: | |
| 16 | + decomposition = TaskDecomposition( | |
| 17 | + original_task="Ship feature", | |
| 18 | + subtasks=[ | |
| 19 | + Subtask(id="1", description="Read spec"), | |
| 20 | + Subtask(id="2", description="Implement", dependencies=["1"]), | |
| 21 | + ], | |
| 22 | + ) | |
| 23 | + | |
| 24 | + assert decomposition.next_subtask() is decomposition.subtasks[0] | |
| 25 | + assert decomposition.progress_str() == "[0/2]" | |
| 26 | + | |
| 27 | + decomposition.mark_completed("1", "done") | |
| 28 | + | |
| 29 | + assert decomposition.progress_str() == "[1/2]" | |
| 30 | + assert decomposition.next_subtask() is decomposition.subtasks[1] | |
| 31 | + | |
| 32 | + decomposition.mark_failed("2", "test failure") | |
| 33 | + | |
| 34 | + assert decomposition.can_retry("2") is True | |
| 35 | + decomposition.reset_for_retry("2") | |
| 36 | + assert decomposition.subtasks[1].status == "pending" | |
| 37 | + | |
| 38 | + | |
| 39 | +def test_confidence_assessment_exposes_score_helpers() -> None: | |
| 40 | + assessment = ConfidenceAssessment( | |
| 41 | + action="Write file", | |
| 42 | + tool_name="write", | |
| 43 | + tool_args={"file_path": "notes.txt"}, | |
| 44 | + level=ConfidenceLevel.LOW, | |
| 45 | + ) | |
| 46 | + | |
| 47 | + assert assessment.score == 2 | |
| 48 | + assert assessment.is_low_confidence is True | |
| 49 | + | |
| 50 | + | |
| 51 | +def test_self_critique_and_completion_defaults_are_stable() -> None: | |
| 52 | + critique = SelfCritique( | |
| 53 | + original_response="draft", | |
| 54 | + should_revise=True, | |
| 55 | + revision_count=1, | |
| 56 | + max_revisions=2, | |
| 57 | + ) | |
| 58 | + completion = TaskCompletionCheck(original_task="Ship feature") | |
| 59 | + | |
| 60 | + assert critique.can_revise() is True | |
| 61 | + assert completion.is_complete is False | |
| 62 | + assert completion.accomplished == [] | |
tests/test_tool_batches.py (modified)
@@ -8,7 +8,6 @@ from types import SimpleNamespace | ||
| 8 | 8 | |
| 9 | 9 | import pytest |
| 10 | 10 | |
| 11 | -from loader.agent.reasoning import ActionVerification, ConfidenceAssessment, ConfidenceLevel | |
| 12 | 11 | from loader.llm.base import Message, Role, ToolCall |
| 13 | 12 | from loader.runtime.context import RuntimeContext, RuntimeLegacyServices |
| 14 | 13 | from loader.runtime.dod import DefinitionOfDoneStore, create_definition_of_done |
@@ -20,6 +19,11 @@ from loader.runtime.permissions import ( | ||
| 20 | 19 | load_permission_rules, |
| 21 | 20 | ) |
| 22 | 21 | from loader.runtime.recovery import RecoveryContext |
| 22 | +from loader.runtime.reasoning_types import ( | |
| 23 | + ActionVerification, | |
| 24 | + ConfidenceAssessment, | |
| 25 | + ConfidenceLevel, | |
| 26 | +) | |
| 23 | 27 | from loader.runtime.tool_batches import ToolBatchRunner |
| 24 | 28 | from loader.runtime.tracing import RuntimeTracer |
| 25 | 29 | from loader.tools.base import ToolResult as RegistryToolResult |