tenseleyflow/loader / e36e64f

Browse files

Move reasoning types into runtime

Authored by espadonne
SHA
e36e64fc543bee5048462c1e91f98561b4ecce7f
Parents
50ab16b
Tree
2a55e16

7 changed files

Status  File  +  -
M src/loader/agent/reasoning.py 9 173
M src/loader/runtime/context.py 1 4
M src/loader/runtime/events.py 3 3
A src/loader/runtime/reasoning_types.py 199 0
M src/loader/ui/adapter.py 1 1
A tests/test_reasoning_types.py 62 0
M tests/test_tool_batches.py 5 1
src/loader/agent/reasoning.py (modified)
@@ -10,8 +10,6 @@ enabled to improve the agent's decision-making:
1010
 """
1111
 
1212
 import re
13
-from dataclasses import dataclass, field
14
-from enum import Enum
1513
 from typing import Any
1614
 
1715
 from ..runtime.rollback import (
@@ -23,6 +21,15 @@ from ..runtime.rollback import (
2321
     get_undo_command,
2422
     is_destructive_tool,
2523
 )
24
+from ..runtime.reasoning_types import (
25
+    ActionVerification,
26
+    ConfidenceAssessment,
27
+    ConfidenceLevel,
28
+    SelfCritique,
29
+    Subtask,
30
+    TaskCompletionCheck,
31
+    TaskDecomposition,
32
+)
2633
 
2734
 
2835
 # === Query Classification ===
@@ -133,166 +140,6 @@ def get_token_budget(complexity: str) -> tuple[int, int]:
133140
     return budgets.get(complexity, (1024, 8192))
134141
 
135142
 
136
-class ConfidenceLevel(Enum):
137
-    """Confidence levels for actions."""
138
-    VERY_LOW = 1      # < 20% - Need more information
139
-    LOW = 2           # 20-40% - Uncertain, may need verification
140
-    MEDIUM = 3        # 40-60% - Reasonable guess
141
-    HIGH = 4          # 60-80% - Confident
142
-    VERY_HIGH = 5     # 80-100% - Certain
143
-
144
-
145
-@dataclass
146
-class Subtask:
147
-    """A decomposed subtask with dependencies."""
148
-    id: str
149
-    description: str
150
-    dependencies: list[str] = field(default_factory=list)  # IDs of subtasks this depends on
151
-    verification: str = ""  # How to verify this subtask succeeded
152
-    status: str = "pending"  # pending, in_progress, completed, failed, skipped
153
-    result: str = ""
154
-    attempts: int = 0
155
-    max_attempts: int = 2
156
-
157
-
158
-@dataclass
159
-class TaskDecomposition:
160
-    """A decomposed task with ordered subtasks."""
161
-    original_task: str
162
-    subtasks: list[Subtask] = field(default_factory=list)
163
-    current_index: int = 0
164
-    rollback_points: list[int] = field(default_factory=list)  # Indices where we can safely rollback
165
-
166
-    def next_subtask(self) -> Subtask | None:
167
-        """Get the next pending subtask that has all dependencies met."""
168
-        completed_ids = {st.id for st in self.subtasks if st.status == "completed"}
169
-
170
-        for st in self.subtasks:
171
-            if st.status == "pending":
172
-                # Check if all dependencies are completed
173
-                if all(dep in completed_ids for dep in st.dependencies):
174
-                    return st
175
-        return None
176
-
177
-    def mark_completed(self, subtask_id: str, result: str = "") -> None:
178
-        """Mark a subtask as completed."""
179
-        for st in self.subtasks:
180
-            if st.id == subtask_id:
181
-                st.status = "completed"
182
-                st.result = result
183
-                break
184
-
185
-    def mark_failed(self, subtask_id: str, error: str = "") -> None:
186
-        """Mark a subtask as failed."""
187
-        for st in self.subtasks:
188
-            if st.id == subtask_id:
189
-                st.status = "failed"
190
-                st.result = error
191
-                st.attempts += 1
192
-                break
193
-
194
-    def can_retry(self, subtask_id: str) -> bool:
195
-        """Check if a subtask can be retried."""
196
-        for st in self.subtasks:
197
-            if st.id == subtask_id:
198
-                return st.attempts < st.max_attempts
199
-        return False
200
-
201
-    def reset_for_retry(self, subtask_id: str) -> None:
202
-        """Reset a subtask for retry."""
203
-        for st in self.subtasks:
204
-            if st.id == subtask_id:
205
-                st.status = "pending"
206
-                break
207
-
208
-    def progress_str(self) -> str:
209
-        """Get progress string like '[2/5]'."""
210
-        completed = sum(1 for st in self.subtasks if st.status == "completed")
211
-        total = len(self.subtasks)
212
-        return f"[{completed}/{total}]"
213
-
214
-    def is_complete(self) -> bool:
215
-        """Check if all subtasks are completed."""
216
-        return all(st.status in ("completed", "skipped") for st in self.subtasks)
217
-
218
-    def has_failures(self) -> bool:
219
-        """Check if any subtask has failed (and can't be retried)."""
220
-        return any(
221
-            st.status == "failed" and st.attempts >= st.max_attempts
222
-            for st in self.subtasks
223
-        )
224
-
225
-    def to_prompt(self) -> str:
226
-        """Format decomposition for LLM prompt."""
227
-        lines = [f"Task: {self.original_task}", "", "Subtasks:"]
228
-        for i, st in enumerate(self.subtasks, 1):
229
-            status_icon = {
230
-                "pending": "○",
231
-                "in_progress": "◐",
232
-                "completed": "●",
233
-                "failed": "✗",
234
-                "skipped": "⊘",
235
-            }.get(st.status, "?")
236
-            deps = f" (after: {', '.join(st.dependencies)})" if st.dependencies else ""
237
-            lines.append(f"  {status_icon} {i}. {st.description}{deps}")
238
-            if st.verification:
239
-                lines.append(f"      Verify: {st.verification}")
240
-        return "\n".join(lines)
241
-
242
-
243
-@dataclass
244
-class SelfCritique:
245
-    """Result of self-critique analysis."""
246
-    original_response: str
247
-    issues_found: list[str] = field(default_factory=list)
248
-    suggestions: list[str] = field(default_factory=list)
249
-    should_revise: bool = False
250
-    revised_response: str = ""
251
-    revision_count: int = 0
252
-    max_revisions: int = 2
253
-
254
-    def can_revise(self) -> bool:
255
-        """Check if we can do another revision."""
256
-        return self.should_revise and self.revision_count < self.max_revisions
257
-
258
-
259
-@dataclass
260
-class ConfidenceAssessment:
261
-    """Confidence assessment for an action."""
262
-    action: str  # Description of the action
263
-    tool_name: str
264
-    tool_args: dict[str, Any]
265
-    level: ConfidenceLevel = ConfidenceLevel.MEDIUM
266
-    reasoning: str = ""
267
-    risks: list[str] = field(default_factory=list)
268
-    mitigations: list[str] = field(default_factory=list)
269
-    requires_verification: bool = False
270
-
271
-    @property
272
-    def score(self) -> int:
273
-        """Get numeric score 1-5."""
274
-        return self.level.value
275
-
276
-    @property
277
-    def is_low_confidence(self) -> bool:
278
-        """Check if confidence is low enough to warrant caution."""
279
-        return self.level.value <= ConfidenceLevel.LOW.value
280
-
281
-
282
-@dataclass
283
-class ActionVerification:
284
-    """Verification result for a completed action."""
285
-    tool_name: str
286
-    tool_args: dict[str, Any]
287
-    expected_outcome: str
288
-    actual_result: str
289
-    verified: bool = False
290
-    verification_method: str = ""  # How we verified (e.g., "file_exists", "output_contains")
291
-    discrepancies: list[str] = field(default_factory=list)
292
-    needs_correction: bool = False
293
-    correction_suggestion: str = ""
294
-
295
-
296143
 # Prompts for reasoning stages
297144
 
298145
 DECOMPOSITION_PROMPT = """Analyze this task and break it down into atomic subtasks.
@@ -705,17 +552,6 @@ def quick_verify(tool_name: str, tool_args: dict, result: str) -> bool:
705552
 
706553
 # === Task Completion Detection ===
707554
 
708
-@dataclass
709
-class TaskCompletionCheck:
710
-    """Result of checking if a task is complete."""
711
-    original_task: str
712
-    is_complete: bool = False
713
-    accomplished: list[str] = field(default_factory=list)
714
-    remaining: list[str] = field(default_factory=list)
715
-    suggested_next_steps: list[str] = field(default_factory=list)
716
-    continuation_prompt: str = ""
717
-
718
-
719555
 COMPLETION_CHECK_PROMPT = """Evaluate if this task has been FULLY completed.
720556
 
721557
 Original task: {task}
src/loader/runtime/context.py (modified)
@@ -7,16 +7,13 @@ from dataclasses import dataclass
77
 from pathlib import Path
88
 from typing import Any, Protocol
99
 
10
-from ..agent.reasoning import (
11
-    ActionVerification,
12
-    ConfidenceAssessment,
13
-)
1410
 from ..context.project import ProjectContext
1511
 from ..llm.base import LLMBackend, Message
1612
 from ..tools.base import ToolRegistry
1713
 from .capabilities import CapabilityProfile
1814
 from .permissions import PermissionConfigStatus, PermissionPolicy
1915
 from .recovery import RecoveryContext
16
+from .reasoning_types import ActionVerification, ConfidenceAssessment
2017
 from .session import ConversationSession
2118
 
2219
 
src/loader/runtime/events.py (modified)
@@ -5,7 +5,9 @@ from __future__ import annotations
55
 from dataclasses import dataclass, field
66
 from typing import Any
77
 
8
-from ..agent.reasoning import (
8
+from ..llm.base import Message
9
+from .dod import DefinitionOfDone
10
+from .reasoning_types import (
911
     ActionVerification,
1012
     ConfidenceAssessment,
1113
     SelfCritique,
@@ -13,8 +15,6 @@ from ..agent.reasoning import (
1315
     TaskCompletionCheck,
1416
     TaskDecomposition,
1517
 )
16
-from ..llm.base import Message
17
-from .dod import DefinitionOfDone
1818
 from .rollback import RollbackAction, RollbackPlan
1919
 from .tracing import RuntimeTraceEvent
2020
 
src/loader/runtime/reasoning_types.py (added)
@@ -0,0 +1,199 @@
1
+"""Runtime-owned typed surfaces shared with reasoning flows."""
2
+
3
+from __future__ import annotations
4
+
5
+from dataclasses import dataclass, field
6
+from enum import Enum
7
+from typing import Any
8
+
9
+
10
+class ConfidenceLevel(Enum):
11
+    """Confidence levels for actions."""
12
+
13
+    VERY_LOW = 1
14
+    LOW = 2
15
+    MEDIUM = 3
16
+    HIGH = 4
17
+    VERY_HIGH = 5
18
+
19
+
20
+@dataclass
21
+class Subtask:
22
+    """A decomposed subtask with dependencies."""
23
+
24
+    id: str
25
+    description: str
26
+    dependencies: list[str] = field(default_factory=list)
27
+    verification: str = ""
28
+    status: str = "pending"
29
+    result: str = ""
30
+    attempts: int = 0
31
+    max_attempts: int = 2
32
+
33
+
34
+@dataclass
35
+class TaskDecomposition:
36
+    """A decomposed task with ordered subtasks."""
37
+
38
+    original_task: str
39
+    subtasks: list[Subtask] = field(default_factory=list)
40
+    current_index: int = 0
41
+    rollback_points: list[int] = field(default_factory=list)
42
+
43
+    def next_subtask(self) -> Subtask | None:
44
+        """Get the next pending subtask that has all dependencies met."""
45
+
46
+        completed_ids = {subtask.id for subtask in self.subtasks if subtask.status == "completed"}
47
+        for subtask in self.subtasks:
48
+            if subtask.status == "pending" and all(
49
+                dependency in completed_ids for dependency in subtask.dependencies
50
+            ):
51
+                return subtask
52
+        return None
53
+
54
+    def mark_completed(self, subtask_id: str, result: str = "") -> None:
55
+        """Mark a subtask as completed."""
56
+
57
+        for subtask in self.subtasks:
58
+            if subtask.id == subtask_id:
59
+                subtask.status = "completed"
60
+                subtask.result = result
61
+                break
62
+
63
+    def mark_failed(self, subtask_id: str, error: str = "") -> None:
64
+        """Mark a subtask as failed."""
65
+
66
+        for subtask in self.subtasks:
67
+            if subtask.id == subtask_id:
68
+                subtask.status = "failed"
69
+                subtask.result = error
70
+                subtask.attempts += 1
71
+                break
72
+
73
+    def can_retry(self, subtask_id: str) -> bool:
74
+        """Check if a subtask can be retried."""
75
+
76
+        for subtask in self.subtasks:
77
+            if subtask.id == subtask_id:
78
+                return subtask.attempts < subtask.max_attempts
79
+        return False
80
+
81
+    def reset_for_retry(self, subtask_id: str) -> None:
82
+        """Reset a subtask for retry."""
83
+
84
+        for subtask in self.subtasks:
85
+            if subtask.id == subtask_id:
86
+                subtask.status = "pending"
87
+                break
88
+
89
+    def progress_str(self) -> str:
90
+        """Get progress string like '[2/5]'."""
91
+
92
+        completed = sum(1 for subtask in self.subtasks if subtask.status == "completed")
93
+        return f"[{completed}/{len(self.subtasks)}]"
94
+
95
+    def is_complete(self) -> bool:
96
+        """Check if all subtasks are completed."""
97
+
98
+        return all(subtask.status in ("completed", "skipped") for subtask in self.subtasks)
99
+
100
+    def has_failures(self) -> bool:
101
+        """Check if any subtask has failed and exhausted retries."""
102
+
103
+        return any(
104
+            subtask.status == "failed" and subtask.attempts >= subtask.max_attempts
105
+            for subtask in self.subtasks
106
+        )
107
+
108
+    def to_prompt(self) -> str:
109
+        """Format decomposition for LLM prompt."""
110
+
111
+        lines = [f"Task: {self.original_task}", "", "Subtasks:"]
112
+        for index, subtask in enumerate(self.subtasks, 1):
113
+            status_icon = {
114
+                "pending": "○",
115
+                "in_progress": "◐",
116
+                "completed": "●",
117
+                "failed": "✗",
118
+                "skipped": "⊘",
119
+            }.get(subtask.status, "?")
120
+            dependencies = (
121
+                f" (after: {', '.join(subtask.dependencies)})"
122
+                if subtask.dependencies
123
+                else ""
124
+            )
125
+            lines.append(f"  {status_icon} {index}. {subtask.description}{dependencies}")
126
+            if subtask.verification:
127
+                lines.append(f"      Verify: {subtask.verification}")
128
+        return "\n".join(lines)
129
+
130
+
131
+@dataclass
132
+class SelfCritique:
133
+    """Result of self-critique analysis."""
134
+
135
+    original_response: str
136
+    issues_found: list[str] = field(default_factory=list)
137
+    suggestions: list[str] = field(default_factory=list)
138
+    should_revise: bool = False
139
+    revised_response: str = ""
140
+    revision_count: int = 0
141
+    max_revisions: int = 2
142
+
143
+    def can_revise(self) -> bool:
144
+        """Check if we can do another revision."""
145
+
146
+        return self.should_revise and self.revision_count < self.max_revisions
147
+
148
+
149
+@dataclass
150
+class ConfidenceAssessment:
151
+    """Confidence assessment for an action."""
152
+
153
+    action: str
154
+    tool_name: str
155
+    tool_args: dict[str, Any]
156
+    level: ConfidenceLevel = ConfidenceLevel.MEDIUM
157
+    reasoning: str = ""
158
+    risks: list[str] = field(default_factory=list)
159
+    mitigations: list[str] = field(default_factory=list)
160
+    requires_verification: bool = False
161
+
162
+    @property
163
+    def score(self) -> int:
164
+        """Get numeric score 1-5."""
165
+
166
+        return self.level.value
167
+
168
+    @property
169
+    def is_low_confidence(self) -> bool:
170
+        """Check if confidence is low enough to warrant caution."""
171
+
172
+        return self.level.value <= ConfidenceLevel.LOW.value
173
+
174
+
175
+@dataclass
176
+class ActionVerification:
177
+    """Verification result for a completed action."""
178
+
179
+    tool_name: str
180
+    tool_args: dict[str, Any]
181
+    expected_outcome: str
182
+    actual_result: str
183
+    verified: bool = False
184
+    verification_method: str = ""
185
+    discrepancies: list[str] = field(default_factory=list)
186
+    needs_correction: bool = False
187
+    correction_suggestion: str = ""
188
+
189
+
190
+@dataclass
191
+class TaskCompletionCheck:
192
+    """Result of checking if a task is complete."""
193
+
194
+    original_task: str
195
+    is_complete: bool = False
196
+    accomplished: list[str] = field(default_factory=list)
197
+    remaining: list[str] = field(default_factory=list)
198
+    suggested_next_steps: list[str] = field(default_factory=list)
199
+    continuation_prompt: str = ""
src/loader/ui/adapter.py (modified)
@@ -8,7 +8,7 @@ from textual.message import Message
88
 from ..agent.loop import AgentEvent
99
 
1010
 if TYPE_CHECKING:
11
-    from ..agent.reasoning import (
11
+    from ..runtime.reasoning_types import (
1212
         ActionVerification,
1313
         ConfidenceAssessment,
1414
         SelfCritique,
tests/test_reasoning_types.py (added)
@@ -0,0 +1,62 @@
1
+"""Tests for runtime-owned reasoning type surfaces."""
2
+
3
+from __future__ import annotations
4
+
5
+from loader.runtime.reasoning_types import (
6
+    ConfidenceAssessment,
7
+    ConfidenceLevel,
8
+    SelfCritique,
9
+    Subtask,
10
+    TaskCompletionCheck,
11
+    TaskDecomposition,
12
+)
13
+
14
+
15
+def test_task_decomposition_tracks_progress_and_retry_state() -> None:
16
+    decomposition = TaskDecomposition(
17
+        original_task="Ship feature",
18
+        subtasks=[
19
+            Subtask(id="1", description="Read spec"),
20
+            Subtask(id="2", description="Implement", dependencies=["1"]),
21
+        ],
22
+    )
23
+
24
+    assert decomposition.next_subtask() is decomposition.subtasks[0]
25
+    assert decomposition.progress_str() == "[0/2]"
26
+
27
+    decomposition.mark_completed("1", "done")
28
+
29
+    assert decomposition.progress_str() == "[1/2]"
30
+    assert decomposition.next_subtask() is decomposition.subtasks[1]
31
+
32
+    decomposition.mark_failed("2", "test failure")
33
+
34
+    assert decomposition.can_retry("2") is True
35
+    decomposition.reset_for_retry("2")
36
+    assert decomposition.subtasks[1].status == "pending"
37
+
38
+
39
+def test_confidence_assessment_exposes_score_helpers() -> None:
40
+    assessment = ConfidenceAssessment(
41
+        action="Write file",
42
+        tool_name="write",
43
+        tool_args={"file_path": "notes.txt"},
44
+        level=ConfidenceLevel.LOW,
45
+    )
46
+
47
+    assert assessment.score == 2
48
+    assert assessment.is_low_confidence is True
49
+
50
+
51
+def test_self_critique_and_completion_defaults_are_stable() -> None:
52
+    critique = SelfCritique(
53
+        original_response="draft",
54
+        should_revise=True,
55
+        revision_count=1,
56
+        max_revisions=2,
57
+    )
58
+    completion = TaskCompletionCheck(original_task="Ship feature")
59
+
60
+    assert critique.can_revise() is True
61
+    assert completion.is_complete is False
62
+    assert completion.accomplished == []
tests/test_tool_batches.py (modified)
@@ -8,7 +8,6 @@ from types import SimpleNamespace
88
 
99
 import pytest
1010
 
11
-from loader.agent.reasoning import ActionVerification, ConfidenceAssessment, ConfidenceLevel
1211
 from loader.llm.base import Message, Role, ToolCall
1312
 from loader.runtime.context import RuntimeContext, RuntimeLegacyServices
1413
 from loader.runtime.dod import DefinitionOfDoneStore, create_definition_of_done
@@ -20,6 +19,11 @@ from loader.runtime.permissions import (
2019
     load_permission_rules,
2120
 )
2221
 from loader.runtime.recovery import RecoveryContext
22
+from loader.runtime.reasoning_types import (
23
+    ActionVerification,
24
+    ConfidenceAssessment,
25
+    ConfidenceLevel,
26
+)
2327
 from loader.runtime.tool_batches import ToolBatchRunner
2428
 from loader.runtime.tracing import RuntimeTracer
2529
 from loader.tools.base import ToolResult as RegistryToolResult