Recover stale guide edits
Authored by
mfwolffe <wolffemf@dukes.jmu.edu>
- SHA
7a4495ac004853da076b066fea24c139f2940e15- Parents
-
bea3236 - Tree
6e040e4
7a4495a
7a4495ac004853da076b066fea24c139f2940e15bea3236
6e040e4| Status | File | + | - |
|---|---|---|---|
| M |
src/loader/runtime/finalization.py
|
8 | 2 |
| M |
src/loader/runtime/recovery.py
|
44 | 8 |
| M |
src/loader/runtime/tool_batch_recovery.py
|
33 | 0 |
| M |
src/loader/runtime/tool_batches.py
|
63 | 0 |
| M |
tests/test_finalization.py
|
1 | 0 |
| M |
tests/test_recovery.py
|
45 | 0 |
| M |
tests/test_tool_batch_policies.py
|
3 | 0 |
| M |
tests/test_tool_batches.py
|
84 | 0 |
src/loader/runtime/finalization.pymodified@@ -1362,6 +1362,9 @@ def _html_quality_repair_strategy_lines() -> list[str]: | ||
| 1362 | 1362 | "exceed the expected text-character floor; if it says insufficient " |
| 1363 | 1363 | "structured content, add enough real sections, lists, code, tables, or " |
| 1364 | 1364 | "other content blocks to exceed the block floor.", |
| 1365 | + "- Prefer a complete `write` or a `patch`/`edit` anchored to exact current " | |
| 1366 | + "on-disk text; do not use remembered minimal `old_string` snippets for " | |
| 1367 | + "generated-document expansion.", | |
| 1365 | 1368 | "- Treat structured content as substantive page body material, not table-of-" |
| 1366 | 1369 | "contents inflation: do not add duplicate navigation entries, relabel links " |
| 1367 | 1370 | "to existing pages as new chapters, or introduce new missing page links just " |
@@ -1451,10 +1454,13 @@ def _build_verification_failure_recovery_nudge( | ||
| 1451 | 1454 | return ( |
| 1452 | 1455 | "Verification now identifies generated artifact content quality issues. " |
| 1453 | 1456 | "Do not restart discovery or keep auditing unrelated files. " |
| 1454 | - "Your next response should be one concrete `edit` or `write`-style tool " | |
| 1457 | + "Your next response should be one concrete `write`, `patch`, or exact-current " | |
| 1458 | + "`edit` tool " | |
| 1455 | 1459 | f"call that expands `{primary_target.artifact_path}` to address: " |
| 1456 | 1460 | f"{primary_target.issue}. Make a substantial change that clears the stated " |
| 1457 | - "threshold, not a small incremental edit, and do not summarize completion " | |
| 1461 | + "threshold, not a small incremental edit. Do not use remembered minimal " | |
| 1462 | + "`old_string` snippets; anchor edits to current on-disk text or rewrite the " | |
| 1463 | + "file completely. Do not summarize completion " | |
| 1458 | 1464 | f"after only one target is touched.{remaining_hint}" |
| 1459 | 1465 | ) |
| 1460 | 1466 | |
src/loader/runtime/recovery.pymodified@@ -872,6 +872,27 @@ def get_recovery_hints( | ||
| 872 | 872 | return "\n".join(f"- {hint}" for hint in category_hints) |
| 873 | 873 | |
| 874 | 874 | |
| 875 | +def _edit_old_string_mismatch_hints( | |
| 876 | + tool_name: str, | |
| 877 | + args: dict[str, Any] | None, | |
| 878 | + error: str, | |
| 879 | +) -> list[str]: | |
| 880 | + """Return specific recovery hints for stale edit replacement strings.""" | |
| 881 | + | |
| 882 | + if tool_name != "edit": | |
| 883 | + return [] | |
| 884 | + if "old_string not found" not in str(error or "").lower(): | |
| 885 | + return [] | |
| 886 | + | |
| 887 | + target = str((args or {}).get("file_path") or (args or {}).get("path") or "").strip() | |
| 888 | + target_suffix = f" for `{target}`" if target else "" | |
| 889 | + return [ | |
| 890 | + f"`old_string` is stale or copied from memory{target_suffix}; do not retry the same text.", | |
| 891 | + "If you use `edit`, copy one exact contiguous `old_string` from the current on-disk file.", | |
| 892 | + "If the repair rewrites most of the file, use `write` with the complete replacement content or `patch` anchored to current text instead.", | |
| 893 | + ] | |
| 894 | + | |
| 895 | + | |
| 875 | 896 | RECOVERY_PROMPT = """## TOOL FAILURE - INVESTIGATE AND ADAPT |
| 876 | 897 | |
| 877 | 898 | The command failed. You MUST analyze the error and take a DIFFERENT action. |
@@ -911,6 +932,9 @@ def format_recovery_prompt( | ||
| 911 | 932 | |
| 912 | 933 | category = categorize_error(error) |
| 913 | 934 | hints = get_recovery_hints(category, tool_name, args) |
| 935 | + mismatch_hints = _edit_old_string_mismatch_hints(tool_name, args, error) | |
| 936 | + if mismatch_hints: | |
| 937 | + hints = "\n".join(f"- {hint}" for hint in mismatch_hints) + "\n" + hints | |
| 914 | 938 | args_str = ", ".join(f"{key}={value!r}" for key, value in args.items()) |
| 915 | 939 | |
| 916 | 940 | return RECOVERY_PROMPT.format( |
@@ -1004,14 +1028,26 @@ def format_failure_message(context: RecoveryContext) -> str: | ||
| 1004 | 1028 | ], |
| 1005 | 1029 | } |
| 1006 | 1030 | |
| 1007 | - specific_suggestions = suggestions.get( | |
| 1008 | - last_category, | |
| 1009 | - [ | |
| 1010 | - "Manually check the file/directory structure", | |
| 1011 | - "Review the error messages for clues", | |
| 1012 | - "Try a completely different approach", | |
| 1013 | - ], | |
| 1014 | - ) | |
| 1031 | + last_attempt = context.attempts[-1] if context.attempts else None | |
| 1032 | + if ( | |
| 1033 | + last_attempt is not None | |
| 1034 | + and last_attempt.tool_name == "edit" | |
| 1035 | + and "old_string not found" in str(last_attempt.error or "").lower() | |
| 1036 | + ): | |
| 1037 | + specific_suggestions = [ | |
| 1038 | + "`old_string` was stale; do not retry the same remembered text", | |
| 1039 | + "Use exact current on-disk text for `edit`, or switch to `patch`/`write`", | |
| 1040 | + "For large generated-file repairs, replace the whole file or anchor a patch to current content", | |
| 1041 | + ] | |
| 1042 | + else: | |
| 1043 | + specific_suggestions = suggestions.get( | |
| 1044 | + last_category, | |
| 1045 | + [ | |
| 1046 | + "Manually check the file/directory structure", | |
| 1047 | + "Review the error messages for clues", | |
| 1048 | + "Try a completely different approach", | |
| 1049 | + ], | |
| 1050 | + ) | |
| 1015 | 1051 | |
| 1016 | 1052 | lines.extend(["", "Suggestions:"]) |
| 1017 | 1053 | for suggestion in specific_suggestions: |
src/loader/runtime/tool_batch_recovery.pymodified@@ -238,11 +238,44 @@ class ToolBatchRecoveryController: | ||
| 238 | 238 | target_excerpt_lines = self._target_excerpt_lines(tool_call) |
| 239 | 239 | if target_excerpt_lines: |
| 240 | 240 | lines.extend(["", "## CURRENT TARGET EXCERPT", *target_excerpt_lines]) |
| 241 | + stale_edit_lines = self._stale_edit_recovery_lines(tool_call, outcome) | |
| 242 | + if stale_edit_lines: | |
| 243 | + lines.extend(["", "## STALE EDIT RECOVERY", *stale_edit_lines]) | |
| 241 | 244 | payload_fix_lines = self._missing_payload_fix_lines(tool_call, outcome) |
| 242 | 245 | if payload_fix_lines: |
| 243 | 246 | lines.extend(["", "## PAYLOAD FORMAT FIX", *payload_fix_lines]) |
| 244 | 247 | return "\n".join(lines) |
| 245 | 248 | |
| 249 | + def _stale_edit_recovery_lines( | |
| 250 | + self, | |
| 251 | + tool_call: ToolCall, | |
| 252 | + outcome: ToolExecutionOutcome, | |
| 253 | + ) -> list[str]: | |
| 254 | + """Steer old_string misses away from repeated remembered edits.""" | |
| 255 | + | |
| 256 | + if tool_call.name != "edit": | |
| 257 | + return [] | |
| 258 | + if "old_string not found" not in outcome.result_output.lower(): | |
| 259 | + return [] | |
| 260 | + | |
| 261 | + raw_path = str( | |
| 262 | + tool_call.arguments.get("file_path") | |
| 263 | + or tool_call.arguments.get("path") | |
| 264 | + or "" | |
| 265 | + ).strip() | |
| 266 | + target = self._canonicalize_path(raw_path) | |
| 267 | + target_line = ( | |
| 268 | + f"- The failed `old_string` is stale for `{target}`; do not retry it from memory." | |
| 269 | + if target | |
| 270 | + else "- The failed `old_string` is stale; do not retry it from memory." | |
| 271 | + ) | |
| 272 | + return [ | |
| 273 | + target_line, | |
| 274 | + "- If using `edit`, copy an exact contiguous `old_string` from the current target excerpt above.", | |
| 275 | + "- For generated-document expansion or large rewrites, prefer `write` with the complete replacement file or `patch` anchored to current text.", | |
| 276 | + "- Do not reopen unrelated reference materials before applying the next mutation.", | |
| 277 | + ] | |
| 278 | + | |
| 246 | 279 | def _missing_payload_fix_lines( |
| 247 | 280 | self, |
| 248 | 281 | tool_call: ToolCall, |
src/loader/runtime/tool_batches.pymodified@@ -1000,6 +1000,23 @@ class ToolBatchRunner: | ||
| 1000 | 1000 | if changed_path not in allowed_paths: |
| 1001 | 1001 | return |
| 1002 | 1002 | |
| 1003 | + if _repair_context_is_html_quality(repair): | |
| 1004 | + next_target = _next_quality_repair_path( | |
| 1005 | + repair, | |
| 1006 | + changed_path=changed_path, | |
| 1007 | + ) | |
| 1008 | + if next_target: | |
| 1009 | + self.context.queue_steering_message( | |
| 1010 | + "The active HTML content-quality repair target was updated. " | |
| 1011 | + "If the current file now comfortably clears its stated threshold, " | |
| 1012 | + f"continue directly with the next listed quality target `{next_target}` " | |
| 1013 | + "using one substantial write/edit/patch anchored to current content. " | |
| 1014 | + "If it still looks thin, expand this same file further now. " | |
| 1015 | + "Do not rerun verification, reopen unrelated references, or summarize " | |
| 1016 | + "completion after only one quality target." | |
| 1017 | + ) | |
| 1018 | + return | |
| 1019 | + | |
| 1003 | 1020 | if changed_path == str(Path(repair.artifact_path).expanduser().resolve(strict=False)): |
| 1004 | 1021 | self.context.queue_steering_message( |
| 1005 | 1022 | "The active verification repair target was updated. " |
@@ -3245,6 +3262,52 @@ def _recent_edit_string_mismatch_target(recovery_context: RecoveryContext | None | ||
| 3245 | 3262 | return "" |
| 3246 | 3263 | |
| 3247 | 3264 | |
| 3265 | +def _repair_context_is_html_quality(repair: Any) -> bool: | |
| 3266 | + """Return whether the active repair context is for generated HTML quality.""" | |
| 3267 | + | |
| 3268 | + return any( | |
| 3269 | + _repair_line_is_html_quality(line) | |
| 3270 | + for line in getattr(repair, "repair_lines", ()) or () | |
| 3271 | + ) | |
| 3272 | + | |
| 3273 | + | |
| 3274 | +def _repair_line_is_html_quality(line: str) -> bool: | |
| 3275 | + lowered = str(line or "").lower() | |
| 3276 | + return ( | |
| 3277 | + "thin content" in lowered | |
| 3278 | + or "insufficient structured content" in lowered | |
| 3279 | + or "content-quality" in lowered | |
| 3280 | + or "content quality" in lowered | |
| 3281 | + ) | |
| 3282 | + | |
| 3283 | + | |
| 3284 | +def _next_quality_repair_path(repair: Any, *, changed_path: str) -> str: | |
| 3285 | + """Return the next concrete repair file after a successful quality mutation.""" | |
| 3286 | + | |
| 3287 | + try: | |
| 3288 | + normalized_changed = str(Path(changed_path).expanduser().resolve(strict=False)) | |
| 3289 | + except (OSError, RuntimeError, ValueError): | |
| 3290 | + normalized_changed = str(Path(changed_path).expanduser()) | |
| 3291 | + | |
| 3292 | + normalized_paths: list[str] = [] | |
| 3293 | + for raw_path in getattr(repair, "allowed_paths", ()) or (): | |
| 3294 | + try: | |
| 3295 | + normalized = str(Path(raw_path).expanduser().resolve(strict=False)) | |
| 3296 | + except (OSError, RuntimeError, ValueError): | |
| 3297 | + normalized = str(Path(raw_path).expanduser()) | |
| 3298 | + if normalized and normalized not in normalized_paths: | |
| 3299 | + normalized_paths.append(normalized) | |
| 3300 | + | |
| 3301 | + if normalized_changed in normalized_paths: | |
| 3302 | + index = normalized_paths.index(normalized_changed) | |
| 3303 | + if index + 1 < len(normalized_paths): | |
| 3304 | + return normalized_paths[index + 1] | |
| 3305 | + for normalized in normalized_paths: | |
| 3306 | + if normalized != normalized_changed: | |
| 3307 | + return normalized | |
| 3308 | + return "" | |
| 3309 | + | |
| 3310 | + | |
| 3248 | 3311 | def _tool_call_targets_path(tool_call: ToolCall, target: str) -> bool: |
| 3249 | 3312 | if not target: |
| 3250 | 3313 | return False |
tests/test_finalization.pymodified@@ -489,6 +489,7 @@ def test_verification_repair_guidance_keeps_multi_file_quality_worklist( | ||
| 489 | 489 | assert f"Improve `{chapter_paths[0]}`: thin content" in guidance |
| 490 | 490 | assert f"Improve `{chapter_paths[-1]}`: thin content" in guidance |
| 491 | 491 | assert "add enough concrete prose" in guidance |
| 492 | + assert "exact current on-disk text" in guidance | |
| 492 | 493 | assert "not table-of-contents inflation" in guidance |
| 493 | 494 | assert "do not add duplicate navigation entries" in guidance |
| 494 | 495 | assert "do not stop after touching only the first file" in guidance |
tests/test_recovery.pymodified@@ -224,6 +224,32 @@ class TestFormatRecoveryPrompt: | ||
| 224 | 224 | assert "content_chars" in prompt |
| 225 | 225 | assert "index.html" in prompt |
| 226 | 226 | |
| 227 | + def test_format_recovery_prompt_for_old_string_miss_prefers_current_text(self): | |
| 228 | + ctx = RecoveryContext( | |
| 229 | + original_tool="edit", | |
| 230 | + original_args={ | |
| 231 | + "file_path": "~/Loader/guides/nginx/chapters/02-installation.html", | |
| 232 | + "old_string": "<h1>Installation</h1>", | |
| 233 | + "new_string": "<h1>Installation</h1><p>Expanded.</p>", | |
| 234 | + }, | |
| 235 | + ) | |
| 236 | + ctx.add_attempt( | |
| 237 | + "edit", | |
| 238 | + ctx.original_args, | |
| 239 | + "old_string not found in file. Make sure it matches exactly.", | |
| 240 | + ) | |
| 241 | + | |
| 242 | + prompt = format_recovery_prompt( | |
| 243 | + ctx, | |
| 244 | + "edit", | |
| 245 | + ctx.original_args, | |
| 246 | + "old_string not found in file. Make sure it matches exactly.", | |
| 247 | + ) | |
| 248 | + | |
| 249 | + assert "`old_string` is stale" in prompt | |
| 250 | + assert "current on-disk file" in prompt | |
| 251 | + assert "write` with the complete replacement content" in prompt | |
| 252 | + | |
| 227 | 253 | |
| 228 | 254 | class TestFormatFailureMessage: |
| 229 | 255 | """Tests for failure message formatting.""" |
@@ -244,3 +270,22 @@ class TestFormatFailureMessage: | ||
| 244 | 270 | assert "Error 1" in msg |
| 245 | 271 | assert "Error 2" in msg |
| 246 | 272 | assert "Error 3" in msg |
| 273 | + | |
| 274 | + def test_format_failure_message_for_old_string_miss_is_specific(self): | |
| 275 | + ctx = RecoveryContext( | |
| 276 | + original_tool="edit", | |
| 277 | + original_args={ | |
| 278 | + "file_path": "guide.html", | |
| 279 | + "old_string": "<h1>Old</h1>", | |
| 280 | + "new_string": "<h1>New</h1>", | |
| 281 | + }, | |
| 282 | + max_retries=2, | |
| 283 | + ) | |
| 284 | + ctx.add_attempt("edit", ctx.original_args, "old_string not found in file") | |
| 285 | + ctx.add_attempt("edit", ctx.original_args, "old_string not found in file") | |
| 286 | + | |
| 287 | + msg = format_failure_message(ctx) | |
| 288 | + | |
| 289 | + assert "`old_string` was stale" in msg | |
| 290 | + assert "exact current on-disk text" in msg | |
| 291 | + assert "Try a completely different approach" not in msg | |
tests/test_tool_batch_policies.pymodified@@ -576,6 +576,9 @@ async def test_tool_batch_recovery_controller_includes_current_target_excerpt_fo | ||
| 576 | 576 | assert "6 | ## Status" in follow_up.content |
| 577 | 577 | assert "7 | The runtime is stable." in follow_up.content |
| 578 | 578 | assert "replace the containing block in one edit" in follow_up.content |
| 579 | + assert "## STALE EDIT RECOVERY" in follow_up.content | |
| 580 | + assert "do not retry it from memory" in follow_up.content | |
| 581 | + assert "complete replacement file" in follow_up.content | |
| 579 | 582 | |
| 580 | 583 | |
| 581 | 584 | @pytest.mark.asyncio |
tests/test_tool_batches.pymodified@@ -6993,6 +6993,90 @@ def test_tool_batch_runner_duplicate_repair_mutation_restates_verifier_deltas( | ||
| 6993 | 6993 | assert "make one real edit" in queued[0] |
| 6994 | 6994 | |
| 6995 | 6995 | |
| 6996 | +@pytest.mark.asyncio | |
| 6997 | +async def test_tool_batch_runner_quality_repair_success_hands_to_next_target( | |
| 6998 | + temp_dir: Path, | |
| 6999 | +) -> None: | |
| 7000 | + async def assess_confidence( | |
| 7001 | + tool_name: str, | |
| 7002 | + tool_args: dict, | |
| 7003 | + context: str, | |
| 7004 | + ) -> ConfidenceAssessment: | |
| 7005 | + raise AssertionError("Confidence scoring should be disabled in this scenario") | |
| 7006 | + | |
| 7007 | + async def verify_action( | |
| 7008 | + tool_name: str, | |
| 7009 | + tool_args: dict, | |
| 7010 | + result: str, | |
| 7011 | + expected: str = "", | |
| 7012 | + ) -> ActionVerification: | |
| 7013 | + raise AssertionError("Verification should not run in this scenario") | |
| 7014 | + | |
| 7015 | + chapters = temp_dir / "guide" / "chapters" | |
| 7016 | + first = chapters / "01-introduction.html" | |
| 7017 | + second = chapters / "02-installation.html" | |
| 7018 | + chapters.mkdir(parents=True) | |
| 7019 | + first.write_text("<h1>Intro</h1>\n") | |
| 7020 | + second.write_text("<h1>Install</h1>\n") | |
| 7021 | + context = build_context( | |
| 7022 | + temp_dir=temp_dir, | |
| 7023 | + messages=[ | |
| 7024 | + Message( | |
| 7025 | + role=Role.ASSISTANT, | |
| 7026 | + content=( | |
| 7027 | + "Repair focus:\n" | |
| 7028 | + f"- Improve `{first}`: thin content (400 text chars, expected at least 1758).\n" | |
| 7029 | + f"- Improve `{second}`: insufficient structured content (6 blocks, expected at least 18).\n" | |
| 7030 | + f"- Immediate next step: edit `{first}` with a substantial expansion or replacement.\n" | |
| 7031 | + "- Repair every listed quality target in order before any final answer.\n" | |
| 7032 | + ), | |
| 7033 | + ) | |
| 7034 | + ], | |
| 7035 | + safeguards=FakeSafeguards(), | |
| 7036 | + assess_confidence=assess_confidence, | |
| 7037 | + verify_action=verify_action, | |
| 7038 | + ) | |
| 7039 | + queued: list[str] = [] | |
| 7040 | + context.queue_steering_message_callback = queued.append | |
| 7041 | + runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir)) | |
| 7042 | + dod = create_definition_of_done("Repair generated HTML guide quality.") | |
| 7043 | + tool_call = ToolCall( | |
| 7044 | + id="write-intro", | |
| 7045 | + name="write", | |
| 7046 | + arguments={ | |
| 7047 | + "file_path": str(first), | |
| 7048 | + "content": "<h1>Intro</h1><p>Substantial expansion.</p>\n", | |
| 7049 | + }, | |
| 7050 | + ) | |
| 7051 | + | |
| 7052 | + await runner.execute_batch( | |
| 7053 | + tool_calls=[tool_call], | |
| 7054 | + tool_source="assistant", | |
| 7055 | + pending_tool_calls_seen=set(), | |
| 7056 | + emit=_noop_emit, | |
| 7057 | + summary=TurnSummary(final_response=""), | |
| 7058 | + dod=dod, | |
| 7059 | + executor=FakeExecutor( | |
| 7060 | + [ | |
| 7061 | + tool_outcome( | |
| 7062 | + tool_call=tool_call, | |
| 7063 | + output=f"Successfully wrote {first}", | |
| 7064 | + is_error=False, | |
| 7065 | + ) | |
| 7066 | + ] | |
| 7067 | + ), # type: ignore[arg-type] | |
| 7068 | + on_confirmation=None, | |
| 7069 | + on_user_question=None, | |
| 7070 | + emit_confirmation=None, | |
| 7071 | + consecutive_errors=0, | |
| 7072 | + ) | |
| 7073 | + | |
| 7074 | + assert queued | |
| 7075 | + assert any("next listed quality target" in message for message in queued) | |
| 7076 | + assert any(str(second.resolve(strict=False)) in message for message in queued) | |
| 7077 | + assert any("Do not rerun verification" in message for message in queued) | |
| 7078 | + | |
| 7079 | + | |
| 6996 | 7080 | @pytest.mark.asyncio |
| 6997 | 7081 | async def test_tool_batch_runner_hands_off_after_active_repair_support_file_write( |
| 6998 | 7082 | temp_dir: Path, |