Tighten late-stage build nudges
- SHA
8331de4e0a4e71229c4b667413521822169c53c8- Parents
-
d913493 - Tree
8b87952
8331de4
8331de4e0a4e71229c4b667413521822169c53c8d913493
8b87952| Status | File | + | - |
|---|---|---|---|
| M |
src/loader/runtime/repair.py
|
27 | 0 |
| M |
src/loader/runtime/tool_batches.py
|
37 | 0 |
| M |
tests/test_repair.py
|
81 | 0 |
| M |
tests/test_tool_batches.py
|
120 | 0 |
src/loader/runtime/repair.pymodified@@ -239,6 +239,29 @@ class ResponseRepairer: | ||
| 239 | 239 | retry_number: int, |
| 240 | 240 | max_empty_retries: int, |
| 241 | 241 | ) -> str: |
| 242 | + if dod is not None and self._should_compact_empty_retry_message(dod): | |
| 243 | + compact_lines: list[str] = [] | |
| 244 | + compact_lines.extend(self._planned_artifact_progress_lines(dod)[:2]) | |
| 245 | + compact_lines.extend( | |
| 246 | + self._next_step_resume_lines( | |
| 247 | + dod, | |
| 248 | + retry_number=retry_number, | |
| 249 | + ) | |
| 250 | + ) | |
| 251 | + return "\n".join( | |
| 252 | + [ | |
| 253 | + "[EMPTY ASSISTANT RESPONSE]", | |
| 254 | + ( | |
| 255 | + "Your last response was empty " | |
| 256 | + f"(retry {retry_number}/{max_empty_retries}). Continue from the " | |
| 257 | + "exact next step below." | |
| 258 | + ), | |
| 259 | + *[f"- {line}" for line in compact_lines], | |
| 260 | + "", | |
| 261 | + "Respond with that concrete mutation tool call now. Do not return an empty response.", | |
| 262 | + ] | |
| 263 | + ) | |
| 264 | + | |
| 242 | 265 | progress_lines: list[str] = [] |
| 243 | 266 | if dod is not None: |
| 244 | 267 | reconcile_aggregate_completion_steps( |
@@ -343,6 +366,10 @@ class ResponseRepairer: | ||
| 343 | 366 | return base_max_empty_retries |
| 344 | 367 | return base_max_empty_retries + _LATE_STAGE_EMPTY_RETRY_EXTRA |
| 345 | 368 | |
| 369 | + def _should_compact_empty_retry_message(self, dod: DefinitionOfDone) -> bool: | |
| 370 | + completed_artifacts, missing_artifacts = self._planned_artifact_counts(dod) | |
| 371 | + return completed_artifacts >= 7 and missing_artifacts > 0 | |
| 372 | + | |
| 346 | 373 | def _planned_artifact_counts(self, dod: DefinitionOfDone) -> tuple[int, int]: |
| 347 | 374 | completed = 0 |
| 348 | 375 | missing = 0 |
src/loader/runtime/tool_batches.pymodified@@ -913,6 +913,19 @@ class ToolBatchRunner: | ||
| 913 | 913 | dod, |
| 914 | 914 | project_root=self.context.project_root, |
| 915 | 915 | ) |
| 916 | + if _late_stage_missing_artifact_build( | |
| 917 | + dod, | |
| 918 | + project_root=self.context.project_root, | |
| 919 | + ): | |
| 920 | + self.context.queue_steering_message( | |
| 921 | + f"Confirmed progress: {current_label} is now recorded." | |
| 922 | + + _missing_artifact_resume_suffix( | |
| 923 | + missing_artifact, | |
| 924 | + project_root=self.context.project_root, | |
| 925 | + ) | |
| 926 | + + " No TodoWrite, no verification, no rereads until that artifact exists." | |
| 927 | + ) | |
| 928 | + return | |
| 916 | 929 | self.context.queue_steering_message( |
| 917 | 930 | f"Confirmed progress: {current_label} is now recorded." |
| 918 | 931 | " One explicitly planned artifact is still missing." |
@@ -1129,6 +1142,30 @@ def _next_missing_planned_artifact( | ||
| 1129 | 1142 | return None |
| 1130 | 1143 | |
| 1131 | 1144 | |
| 1145 | +def _late_stage_missing_artifact_build( | |
| 1146 | + dod: DefinitionOfDone, | |
| 1147 | + *, | |
| 1148 | + project_root: Path, | |
| 1149 | +) -> bool: | |
| 1150 | + completed = 0 | |
| 1151 | + missing = 0 | |
| 1152 | + for target, expect_directory in collect_planned_artifact_targets( | |
| 1153 | + dod, | |
| 1154 | + project_root=project_root, | |
| 1155 | + max_paths=12, | |
| 1156 | + ): | |
| 1157 | + if planned_artifact_target_satisfied( | |
| 1158 | + dod, | |
| 1159 | + target=target, | |
| 1160 | + expect_directory=expect_directory, | |
| 1161 | + project_root=project_root, | |
| 1162 | + ): | |
| 1163 | + completed += 1 | |
| 1164 | + else: | |
| 1165 | + missing += 1 | |
| 1166 | + return completed >= 7 and missing > 0 | |
| 1167 | + | |
| 1168 | + | |
| 1132 | 1169 | def _missing_artifact_resume_suffix( |
| 1133 | 1170 | missing_artifact: tuple[Path, bool] | None, |
| 1134 | 1171 | *, |
tests/test_repair.pymodified@@ -459,6 +459,87 @@ def test_empty_response_retry_budget_extends_for_late_stage_multi_artifact_progr | ||
| 459 | 459 | assert "Follow the same one-file-at-a-time mutation pattern" in decision.retry_message |
| 460 | 460 | |
| 461 | 461 | |
| 462 | +def test_empty_response_retry_uses_compact_prompt_after_substantial_progress( | |
| 463 | + temp_dir: Path, | |
| 464 | +) -> None: | |
| 465 | + context = build_context( | |
| 466 | + temp_dir=temp_dir, | |
| 467 | + use_react=False, | |
| 468 | + ) | |
| 469 | + context.session.messages.append( | |
| 470 | + SimpleNamespace( | |
| 471 | + content=( | |
| 472 | + "Observation [notepad_write_working]: Result: " | |
| 473 | + "- [2026-04-23T19:00:00Z] Creating fifth chapter file: Advanced features" | |
| 474 | + ) | |
| 475 | + ) | |
| 476 | + ) | |
| 477 | + repairer = ResponseRepairer(context) | |
| 478 | + | |
| 479 | + guide_root = temp_dir / "guides" / "nginx" | |
| 480 | + chapters = guide_root / "chapters" | |
| 481 | + chapters.mkdir(parents=True) | |
| 482 | + index_path = guide_root / "index.html" | |
| 483 | + chapter_one = chapters / "01-getting-started.html" | |
| 484 | + chapter_two = chapters / "02-installation.html" | |
| 485 | + chapter_three = chapters / "03-first-website.html" | |
| 486 | + chapter_four = chapters / "04-configuration-basics.html" | |
| 487 | + chapter_five = chapters / "05-advanced-features.html" | |
| 488 | + index_path.write_text("<html></html>\n") | |
| 489 | + chapter_one.write_text("<h1>One</h1>\n") | |
| 490 | + chapter_two.write_text("<h1>Two</h1>\n") | |
| 491 | + chapter_three.write_text("<h1>Three</h1>\n") | |
| 492 | + chapter_four.write_text("<h1>Four</h1>\n") | |
| 493 | + | |
| 494 | + implementation_plan = temp_dir / "implementation.md" | |
| 495 | + implementation_plan.write_text( | |
| 496 | + "\n".join( | |
| 497 | + [ | |
| 498 | + "# Implementation Plan", | |
| 499 | + "", | |
| 500 | + "## File Changes", | |
| 501 | + f"- `{guide_root}/`", | |
| 502 | + f"- `{chapters}/`", | |
| 503 | + f"- `{index_path}`", | |
| 504 | + f"- `{chapter_one}`", | |
| 505 | + f"- `{chapter_two}`", | |
| 506 | + f"- `{chapter_three}`", | |
| 507 | + f"- `{chapter_four}`", | |
| 508 | + f"- `{chapter_five}`", | |
| 509 | + "", | |
| 510 | + ] | |
| 511 | + ) | |
| 512 | + ) | |
| 513 | + | |
| 514 | + dod = create_definition_of_done("Create a multi-file nginx guide.") | |
| 515 | + dod.implementation_plan = str(implementation_plan) | |
| 516 | + dod.touched_files.extend( | |
| 517 | + [str(index_path), str(chapter_one), str(chapter_two), str(chapter_three)] | |
| 518 | + ) | |
| 519 | + dod.completed_items.extend( | |
| 520 | + [ | |
| 521 | + "Create the directory structure for the new nginx guide", | |
| 522 | + "Create the main index.html file with proper structure", | |
| 523 | + ] | |
| 524 | + ) | |
| 525 | + dod.pending_items.append("Create each chapter file in sequence") | |
| 526 | + | |
| 527 | + decision = repairer.handle_empty_response( | |
| 528 | + task="Create a multi-file nginx guide.", | |
| 529 | + original_task=None, | |
| 530 | + empty_retry_count=3, | |
| 531 | + max_empty_retries=2, | |
| 532 | + dod=dod, | |
| 533 | + ) | |
| 534 | + | |
| 535 | + assert decision.should_continue is True | |
| 536 | + assert decision.retry_message is not None | |
| 537 | + assert "Continue from the exact next step below." in decision.retry_message | |
| 538 | + assert "Latest working note:" not in decision.retry_message | |
| 539 | + assert "Confirmed completed work:" not in decision.retry_message | |
| 540 | + assert "Next pending item:" not in decision.retry_message | |
| 541 | + | |
| 542 | + | |
| 462 | 543 | def test_empty_response_retry_points_at_next_output_file_when_planned_directory_is_empty( |
| 463 | 544 | temp_dir: Path, |
| 464 | 545 | ) -> None: |
tests/test_tool_batches.pymodified@@ -2183,6 +2183,126 @@ async def test_tool_batch_runner_large_plan_does_not_claim_completion_early( | ||
| 2183 | 2183 | ) |
| 2184 | 2184 | |
| 2185 | 2185 | |
| 2186 | +@pytest.mark.asyncio | |
| 2187 | +async def test_tool_batch_runner_uses_compact_missing_artifact_nudge_after_substantial_progress( | |
| 2188 | + temp_dir: Path, | |
| 2189 | +) -> None: | |
| 2190 | + async def assess_confidence( | |
| 2191 | + tool_name: str, | |
| 2192 | + tool_args: dict, | |
| 2193 | + context: str, | |
| 2194 | + ) -> ConfidenceAssessment: | |
| 2195 | + raise AssertionError("Confidence scoring should not run in this scenario") | |
| 2196 | + | |
| 2197 | + async def verify_action( | |
| 2198 | + tool_name: str, | |
| 2199 | + tool_args: dict, | |
| 2200 | + result: str, | |
| 2201 | + expected: str = "", | |
| 2202 | + ) -> ActionVerification: | |
| 2203 | + raise AssertionError("Verification should not run in this scenario") | |
| 2204 | + | |
| 2205 | + guide_root = temp_dir / "guides" / "nginx" | |
| 2206 | + chapters = guide_root / "chapters" | |
| 2207 | + guide_root.mkdir(parents=True) | |
| 2208 | + chapters.mkdir() | |
| 2209 | + index_path = guide_root / "index.html" | |
| 2210 | + chapter_paths = [ | |
| 2211 | + chapters / "01-introduction.html", | |
| 2212 | + chapters / "02-installation.html", | |
| 2213 | + chapters / "03-configuration.html", | |
| 2214 | + chapters / "04-basic-usage.html", | |
| 2215 | + chapters / "05-advanced-features.html", | |
| 2216 | + ] | |
| 2217 | + for path in (index_path, *chapter_paths[:4]): | |
| 2218 | + path.write_text("<html></html>\n") | |
| 2219 | + | |
| 2220 | + implementation_plan = temp_dir / "implementation.md" | |
| 2221 | + implementation_plan.write_text( | |
| 2222 | + "\n".join( | |
| 2223 | + [ | |
| 2224 | + "# Implementation Plan", | |
| 2225 | + "", | |
| 2226 | + "## File Changes", | |
| 2227 | + f"- `{guide_root}/`", | |
| 2228 | + f"- `{chapters}/`", | |
| 2229 | + f"- `{index_path}`", | |
| 2230 | + *[f"- `{path}`" for path in chapter_paths], | |
| 2231 | + "", | |
| 2232 | + ] | |
| 2233 | + ) | |
| 2234 | + ) | |
| 2235 | + | |
| 2236 | + context = build_context( | |
| 2237 | + temp_dir=temp_dir, | |
| 2238 | + messages=[], | |
| 2239 | + safeguards=FakeSafeguards(), | |
| 2240 | + assess_confidence=assess_confidence, | |
| 2241 | + verify_action=verify_action, | |
| 2242 | + auto_recover=False, | |
| 2243 | + ) | |
| 2244 | + queued_messages: list[str] = [] | |
| 2245 | + context.queue_steering_message_callback = queued_messages.append | |
| 2246 | + runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir)) | |
| 2247 | + dod = create_definition_of_done("Create a thorough nginx guide.") | |
| 2248 | + dod.implementation_plan = str(implementation_plan) | |
| 2249 | + dod.touched_files.extend(str(path) for path in (index_path, *chapter_paths[:4])) | |
| 2250 | + dod.completed_items.extend( | |
| 2251 | + [ | |
| 2252 | + "Create the nginx directory structure", | |
| 2253 | + "Create the main index.html file with proper structure", | |
| 2254 | + ] | |
| 2255 | + ) | |
| 2256 | + sync_todos_to_definition_of_done( | |
| 2257 | + dod, | |
| 2258 | + [ | |
| 2259 | + { | |
| 2260 | + "content": "Create each chapter file with appropriate content", | |
| 2261 | + "active_form": "Creating each chapter file with appropriate content", | |
| 2262 | + "status": "pending", | |
| 2263 | + } | |
| 2264 | + ], | |
| 2265 | + ) | |
| 2266 | + tool_call = ToolCall( | |
| 2267 | + id="write-chapter-04", | |
| 2268 | + name="write", | |
| 2269 | + arguments={ | |
| 2270 | + "file_path": str(chapter_paths[3]), | |
| 2271 | + "content": "<html>updated</html>\n", | |
| 2272 | + }, | |
| 2273 | + ) | |
| 2274 | + executor = FakeExecutor( | |
| 2275 | + [ | |
| 2276 | + tool_outcome( | |
| 2277 | + tool_call=tool_call, | |
| 2278 | + output=f"Successfully wrote {chapter_paths[3]}", | |
| 2279 | + is_error=False, | |
| 2280 | + ) | |
| 2281 | + ] | |
| 2282 | + ) | |
| 2283 | + | |
| 2284 | + summary = TurnSummary(final_response="") | |
| 2285 | + await runner.execute_batch( | |
| 2286 | + tool_calls=[tool_call], | |
| 2287 | + tool_source="assistant", | |
| 2288 | + pending_tool_calls_seen=set(), | |
| 2289 | + emit=_noop_emit, | |
| 2290 | + summary=summary, | |
| 2291 | + dod=dod, | |
| 2292 | + executor=executor, # type: ignore[arg-type] | |
| 2293 | + on_confirmation=None, | |
| 2294 | + on_user_question=None, | |
| 2295 | + emit_confirmation=None, | |
| 2296 | + consecutive_errors=0, | |
| 2297 | + ) | |
| 2298 | + | |
| 2299 | + assert queued_messages | |
| 2300 | + message = queued_messages[-1] | |
| 2301 | + assert "Resume by creating `05-advanced-features.html` now." in message | |
| 2302 | + assert "No TodoWrite, no verification, no rereads until that artifact exists." in message | |
| 2303 | + assert "refresh `TodoWrite`" not in message | |
| 2304 | + | |
| 2305 | + | |
| 2186 | 2306 | @pytest.mark.asyncio |
| 2187 | 2307 | async def test_tool_batch_runner_todowrite_with_missing_artifact_requeues_exact_resume_step( |
| 2188 | 2308 | temp_dir: Path, |