Soften recovered handoffs
Authored by
mfwolffe <wolffemf@dukes.jmu.edu>
- SHA
6ee06b156ec0a550b3976c82b11dc279d956b65d- Parents
-
1403a6a - Tree
6dfed4b
6ee06b1
6ee06b156ec0a550b3976c82b11dc279d956b65d1403a6a
6dfed4b| Status | File | + | - |
|---|---|---|---|
| M |
src/loader/runtime/tool_batches.py
|
26 | 6 |
| M |
tests/test_tool_batches.py
|
117 | 0 |
src/loader/runtime/tool_batches.pymodified@@ -1130,6 +1130,9 @@ class ToolBatchRunner: | ||
| 1130 | 1130 | dod, |
| 1131 | 1131 | project_root=self.context.project_root, |
| 1132 | 1132 | ) |
| 1133 | + session_messages = list(getattr(self.context.session, "messages", []) or []) | |
| 1134 | + if use_persistent_handoff and _recent_recovery_prompt(session_messages): | |
| 1135 | + use_persistent_handoff = False | |
| 1133 | 1136 | queue_message = ( |
| 1134 | 1137 | self.context.queue_steering_message |
| 1135 | 1138 | if use_persistent_handoff |
@@ -1143,7 +1146,7 @@ class ToolBatchRunner: | ||
| 1143 | 1146 | compact_resume = _compact_missing_artifact_handoff( |
| 1144 | 1147 | (resume_target, False), |
| 1145 | 1148 | project_root=self.context.project_root, |
| 1146 | - messages=list(getattr(self.context.session, "messages", []) or []), | |
| 1149 | + messages=session_messages, | |
| 1147 | 1150 | ) |
| 1148 | 1151 | if compact_resume: |
| 1149 | 1152 | queue_message( |
@@ -1160,7 +1163,7 @@ class ToolBatchRunner: | ||
| 1160 | 1163 | compact_handoff = _compact_missing_artifact_handoff( |
| 1161 | 1164 | missing_artifact, |
| 1162 | 1165 | project_root=self.context.project_root, |
| 1163 | - messages=list(getattr(self.context.session, "messages", []) or []), | |
| 1166 | + messages=session_messages, | |
| 1164 | 1167 | ) |
| 1165 | 1168 | if compact_handoff: |
| 1166 | 1169 | queue_message( |
@@ -1194,10 +1197,11 @@ class ToolBatchRunner: | ||
| 1194 | 1197 | *, |
| 1195 | 1198 | dod: DefinitionOfDone, |
| 1196 | 1199 | ) -> None: |
| 1200 | + session_messages = list(getattr(self.context.session, "messages", []) or []) | |
| 1197 | 1201 | missing_artifact = _next_missing_planned_artifact( |
| 1198 | 1202 | dod, |
| 1199 | 1203 | project_root=self.context.project_root, |
| 1200 | - messages=list(getattr(self.context.session, "messages", []) or []), | |
| 1204 | + messages=session_messages, | |
| 1201 | 1205 | ) |
| 1202 | 1206 | next_pending = preferred_pending_todo_item( |
| 1203 | 1207 | dod, |
@@ -1311,7 +1315,7 @@ class ToolBatchRunner: | ||
| 1311 | 1315 | + _missing_artifact_resume_suffix( |
| 1312 | 1316 | missing_artifact, |
| 1313 | 1317 | project_root=self.context.project_root, |
| 1314 | - messages=list(getattr(self.context.session, "messages", []) or []), | |
| 1318 | + messages=session_messages, | |
| 1315 | 1319 | ) |
| 1316 | 1320 | + todo_refresh |
| 1317 | 1321 | + " Do not spend the next turn on TodoWrite alone, bookkeeping notes, " |
@@ -1327,10 +1331,11 @@ class ToolBatchRunner: | ||
| 1327 | 1331 | if tool_call.name not in _BOOKKEEPING_NOTE_TOOL_NAMES: |
| 1328 | 1332 | return |
| 1329 | 1333 | |
| 1334 | + session_messages = list(getattr(self.context.session, "messages", []) or []) | |
| 1330 | 1335 | missing_artifact = _next_missing_planned_artifact( |
| 1331 | 1336 | dod, |
| 1332 | 1337 | project_root=self.context.project_root, |
| 1333 | - messages=list(getattr(self.context.session, "messages", []) or []), | |
| 1338 | + messages=session_messages, | |
| 1334 | 1339 | ) |
| 1335 | 1340 | if missing_artifact is None: |
| 1336 | 1341 | return |
@@ -1377,7 +1382,7 @@ class ToolBatchRunner: | ||
| 1377 | 1382 | + _missing_artifact_resume_suffix( |
| 1378 | 1383 | missing_artifact, |
| 1379 | 1384 | project_root=self.context.project_root, |
| 1380 | - messages=list(getattr(self.context.session, "messages", []) or []), | |
| 1385 | + messages=session_messages, | |
| 1381 | 1386 | ) |
| 1382 | 1387 | + todo_refresh |
| 1383 | 1388 | + " Do not spend the next turn on additional notes, rediscovery, " |
@@ -2133,6 +2138,21 @@ def _is_pure_directory_creation_tool_call(tool_call: ToolCall) -> bool: | ||
| 2133 | 2138 | return bool(parts) and parts[0] == "mkdir" |
| 2134 | 2139 | |
| 2135 | 2140 | |
| 2141 | +def _recent_recovery_prompt(messages: list[Any]) -> bool: | |
| 2142 | + for message in reversed(messages[-4:]): | |
| 2143 | + role = getattr(message, "role", None) | |
| 2144 | + if getattr(role, "value", role) != "user": | |
| 2145 | + continue | |
| 2146 | + content = getattr(message, "content", "") | |
| 2147 | + if not isinstance(content, str): | |
| 2148 | + continue | |
| 2149 | + if content.startswith("[EMPTY ASSISTANT RESPONSE]"): | |
| 2150 | + return True | |
| 2151 | + if content.startswith("[CONTINUE CURRENT STEP]"): | |
| 2152 | + return True | |
| 2153 | + return False | |
| 2154 | + | |
| 2155 | + | |
| 2136 | 2156 | def _tool_call_label(tool_call: ToolCall) -> str: |
| 2137 | 2157 | """Human-readable label for one tool call.""" |
| 2138 | 2158 | name = tool_call.name |
tests/test_tool_batches.pymodified@@ -2238,6 +2238,123 @@ async def test_tool_batch_runner_first_file_handoff_stays_persistent( | ||
| 2238 | 2238 | assert ephemeral_messages == [] |
| 2239 | 2239 | |
| 2240 | 2240 | |
| 2241 | +@pytest.mark.asyncio | |
| 2242 | +async def test_tool_batch_runner_softens_first_file_handoff_after_recovery_prompt( | |
| 2243 | + temp_dir: Path, | |
| 2244 | +) -> None: | |
| 2245 | + async def assess_confidence( | |
| 2246 | + tool_name: str, | |
| 2247 | + tool_args: dict, | |
| 2248 | + context: str, | |
| 2249 | + ) -> ConfidenceAssessment: | |
| 2250 | + raise AssertionError("Confidence scoring should be disabled in this scenario") | |
| 2251 | + | |
| 2252 | + async def verify_action( | |
| 2253 | + tool_name: str, | |
| 2254 | + tool_args: dict, | |
| 2255 | + result: str, | |
| 2256 | + expected: str = "", | |
| 2257 | + ) -> ActionVerification: | |
| 2258 | + raise AssertionError("Verification should not run for this scenario") | |
| 2259 | + | |
| 2260 | + nginx_root = temp_dir / "guides" / "nginx" | |
| 2261 | + chapters = nginx_root / "chapters" | |
| 2262 | + chapters.mkdir(parents=True) | |
| 2263 | + index_path = nginx_root / "index.html" | |
| 2264 | + | |
| 2265 | + implementation_plan = temp_dir / "implementation.md" | |
| 2266 | + implementation_plan.write_text( | |
| 2267 | + "\n".join( | |
| 2268 | + [ | |
| 2269 | + "# Implementation Plan", | |
| 2270 | + "", | |
| 2271 | + "## File Changes", | |
| 2272 | + f"- `{chapters}/`", | |
| 2273 | + f"- `{index_path}`", | |
| 2274 | + f"- `{chapters / '01-introduction.html'}`", | |
| 2275 | + "", | |
| 2276 | + ] | |
| 2277 | + ) | |
| 2278 | + ) | |
| 2279 | + | |
| 2280 | + context = build_context( | |
| 2281 | + temp_dir=temp_dir, | |
| 2282 | + messages=[ | |
| 2283 | + Message( | |
| 2284 | + role=Role.USER, | |
| 2285 | + content=( | |
| 2286 | + "[EMPTY ASSISTANT RESPONSE]\n" | |
| 2287 | + "Respond with that concrete mutation tool call now. Do not return an empty response." | |
| 2288 | + ), | |
| 2289 | + ) | |
| 2290 | + ], | |
| 2291 | + safeguards=FakeSafeguards(), | |
| 2292 | + assess_confidence=assess_confidence, | |
| 2293 | + verify_action=verify_action, | |
| 2294 | + auto_recover=False, | |
| 2295 | + ) | |
| 2296 | + persistent_messages: list[str] = [] | |
| 2297 | + ephemeral_messages: list[str] = [] | |
| 2298 | + context.queue_steering_message_callback = persistent_messages.append | |
| 2299 | + context.queue_ephemeral_steering_message_callback = ephemeral_messages.append | |
| 2300 | + runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir)) | |
| 2301 | + dod = create_definition_of_done("Create a multi-file nginx guide.") | |
| 2302 | + dod.implementation_plan = str(implementation_plan) | |
| 2303 | + sync_todos_to_definition_of_done( | |
| 2304 | + dod, | |
| 2305 | + [ | |
| 2306 | + { | |
| 2307 | + "content": "Create the main index.html file with proper structure", | |
| 2308 | + "active_form": "Creating the main index.html file with proper structure", | |
| 2309 | + "status": "pending", | |
| 2310 | + }, | |
| 2311 | + { | |
| 2312 | + "content": "Create each chapter file with appropriate content", | |
| 2313 | + "active_form": "Creating each chapter file with appropriate content", | |
| 2314 | + "status": "pending", | |
| 2315 | + }, | |
| 2316 | + ], | |
| 2317 | + ) | |
| 2318 | + | |
| 2319 | + tool_call = ToolCall( | |
| 2320 | + id="write-index-recovered", | |
| 2321 | + name="write", | |
| 2322 | + arguments={ | |
| 2323 | + "file_path": str(index_path), | |
| 2324 | + "content": "<html></html>\n", | |
| 2325 | + }, | |
| 2326 | + ) | |
| 2327 | + executor = FakeExecutor( | |
| 2328 | + [ | |
| 2329 | + tool_outcome( | |
| 2330 | + tool_call=tool_call, | |
| 2331 | + output=f"Successfully wrote 14 bytes to {index_path}", | |
| 2332 | + is_error=False, | |
| 2333 | + ) | |
| 2334 | + ] | |
| 2335 | + ) | |
| 2336 | + | |
| 2337 | + summary = TurnSummary(final_response="") | |
| 2338 | + await runner.execute_batch( | |
| 2339 | + tool_calls=[tool_call], | |
| 2340 | + tool_source="assistant", | |
| 2341 | + pending_tool_calls_seen=set(), | |
| 2342 | + emit=_noop_emit, | |
| 2343 | + summary=summary, | |
| 2344 | + dod=dod, | |
| 2345 | + executor=executor, # type: ignore[arg-type] | |
| 2346 | + on_confirmation=None, | |
| 2347 | + on_user_question=None, | |
| 2348 | + emit_confirmation=None, | |
| 2349 | + consecutive_errors=0, | |
| 2350 | + ) | |
| 2351 | + | |
| 2352 | + assert persistent_messages == [] | |
| 2353 | + assert ephemeral_messages | |
| 2354 | + message = ephemeral_messages[-1] | |
| 2355 | + assert "Resume by creating `01-introduction.html` now." in message | |
| 2356 | + | |
| 2357 | + | |
| 2241 | 2358 | @pytest.mark.asyncio |
| 2242 | 2359 | async def test_duplicate_observation_nudge_prioritizes_missing_artifact_over_review( |
| 2243 | 2360 | temp_dir: Path, |