Refine repair loop steering
Authored by
mfwolffe <wolffemf@dukes.jmu.edu>
- SHA
c10dbd184e4fba56433bf8f81f7aca5dd0217ea0- Parents
-
b210a18 - Tree
15f7763
c10dbd1
c10dbd184e4fba56433bf8f81f7aca5dd0217ea0b210a18
15f7763| Status | File | + | - |
|---|---|---|---|
| M |
src/loader/runtime/hooks.py
|
4 | 5 |
| M |
src/loader/runtime/tool_batches.py
|
57 | 0 |
| M |
tests/test_permissions.py
|
9 | 18 |
| M |
tests/test_tool_batches.py
|
104 | 0 |
src/loader/runtime/hooks.pymodified@@ -997,6 +997,8 @@ class LateReferenceDriftHook(BaseToolHook): | ||
| 997 | 997 | async def pre_tool_use(self, context: HookContext) -> HookResult: |
| 998 | 998 | if context.tool_call.name not in _OBSERVATION_TOOLS: |
| 999 | 999 | return HookResult() |
| 1000 | + if context.source == "verification": | |
| 1001 | + return HookResult() | |
| 1000 | 1002 | |
| 1001 | 1003 | completed_scope = self._completed_artifact_scope() |
| 1002 | 1004 | if completed_scope is not None: |
@@ -1039,9 +1041,6 @@ class LateReferenceDriftHook(BaseToolHook): | ||
| 1039 | 1041 | terminal_state="blocked", |
| 1040 | 1042 | ) |
| 1041 | 1043 | |
| 1042 | - if context.source == "verification": | |
| 1043 | - return HookResult() | |
| 1044 | - | |
| 1045 | 1044 | late_stage = self._late_stage_missing_artifact() |
| 1046 | 1045 | if late_stage is None: |
| 1047 | 1046 | return HookResult() |
@@ -1142,6 +1141,8 @@ class LateReferenceDriftHook(BaseToolHook): | ||
| 1142 | 1141 | return False |
| 1143 | 1142 | |
| 1144 | 1143 | async def post_tool_use(self, context: HookContext) -> HookResult: |
| 1144 | + if context.source == "verification": | |
| 1145 | + return HookResult() | |
| 1145 | 1146 | if _tool_call_is_effective_mutation(context.tool_call): |
| 1146 | 1147 | self._reset_completed_scope_state() |
| 1147 | 1148 | return HookResult() |
@@ -1150,8 +1151,6 @@ class LateReferenceDriftHook(BaseToolHook): | ||
| 1150 | 1151 | |
| 1151 | 1152 | completed_scope = self._completed_artifact_scope() |
| 1152 | 1153 | if completed_scope is None: |
| 1153 | - if context.source == "verification": | |
| 1154 | - return HookResult() | |
| 1155 | 1154 | self._reset_completed_scope_state() |
| 1156 | 1155 | return HookResult() |
| 1157 | 1156 | |
src/loader/runtime/tool_batches.pymodified@@ -529,6 +529,20 @@ class ToolBatchRunner: | ||
| 529 | 529 | self.context.session.messages, |
| 530 | 530 | max_items=2, |
| 531 | 531 | ) |
| 532 | + edit_mismatch_target = _recent_edit_string_mismatch_target( | |
| 533 | + self.context.recovery_context, | |
| 534 | + ) | |
| 535 | + if edit_mismatch_target and _tool_call_targets_path(tool_call, edit_mismatch_target): | |
| 536 | + self.context.queue_steering_message( | |
| 537 | + "Reuse the earlier observation instead of repeating it. " | |
| 538 | + f"The last edit on `{edit_mismatch_target}` failed because `old_string` " | |
| 539 | + "did not exactly match the current file. Use the already-read contents " | |
| 540 | + "as the source of truth and send one concrete mutation now: `edit` with " | |
| 541 | + "an exact `old_string` copied from that file, `patch`, or `write` with " | |
| 542 | + "the complete replacement content if the change rewrites most of the file. " | |
| 543 | + "Do not read the same file again first." | |
| 544 | + ) | |
| 545 | + return | |
| 532 | 546 | if _should_prioritize_missing_artifact( |
| 533 | 547 | dod=dod, |
| 534 | 548 | next_pending=next_pending, |
@@ -3107,6 +3121,49 @@ def _is_recoverable_guidance_block(event_content: str) -> bool: | ||
| 3107 | 3121 | ) |
| 3108 | 3122 | |
| 3109 | 3123 | |
| 3124 | +def _recent_edit_string_mismatch_target(recovery_context: RecoveryContext | None) -> str: | |
| 3125 | + """Return the active edit target when recovery is from an old_string miss.""" | |
| 3126 | + | |
| 3127 | + if recovery_context is None: | |
| 3128 | + return "" | |
| 3129 | + for attempt in reversed(recovery_context.attempts): | |
| 3130 | + if attempt.tool_name != "edit": | |
| 3131 | + continue | |
| 3132 | + if "old_string not found" not in str(attempt.error or "").lower(): | |
| 3133 | + continue | |
| 3134 | + target = str( | |
| 3135 | + attempt.arguments.get("file_path") | |
| 3136 | + or attempt.arguments.get("path") | |
| 3137 | + or "" | |
| 3138 | + ).strip() | |
| 3139 | + if target: | |
| 3140 | + return target | |
| 3141 | + return "" | |
| 3142 | + | |
| 3143 | + | |
| 3144 | +def _tool_call_targets_path(tool_call: ToolCall, target: str) -> bool: | |
| 3145 | + if not target: | |
| 3146 | + return False | |
| 3147 | + candidate = str( | |
| 3148 | + tool_call.arguments.get("file_path") | |
| 3149 | + or tool_call.arguments.get("path") | |
| 3150 | + or "" | |
| 3151 | + ).strip() | |
| 3152 | + if not candidate and tool_call.name == "bash": | |
| 3153 | + paths = _extract_bash_paths( | |
| 3154 | + str(tool_call.arguments.get("command") or "").strip(), | |
| 3155 | + ) | |
| 3156 | + candidate = paths[0] if paths else "" | |
| 3157 | + if not candidate: | |
| 3158 | + return False | |
| 3159 | + try: | |
| 3160 | + return Path(candidate).expanduser().resolve(strict=False) == Path( | |
| 3161 | + target | |
| 3162 | + ).expanduser().resolve(strict=False) | |
| 3163 | + except (OSError, RuntimeError, ValueError): | |
| 3164 | + return candidate == target | |
| 3165 | + | |
| 3166 | + | |
| 3110 | 3167 | def _active_repair_focus_preview(repair_lines: list[str], *, max_lines: int = 4) -> str: |
| 3111 | 3168 | """Compact repair-focus bullets for steering after no-op mutations.""" |
| 3112 | 3169 | |
tests/test_permissions.pymodified@@ -1770,7 +1770,7 @@ async def test_late_reference_drift_hook_blocks_reference_reads_when_outputs_exi | ||
| 1770 | 1770 | |
| 1771 | 1771 | |
| 1772 | 1772 | @pytest.mark.asyncio |
| 1773 | -async def test_late_reference_drift_hook_blocks_verification_reference_reads_after_artifacts_exist( | |
| 1773 | +async def test_late_reference_drift_hook_allows_verification_reference_reads_after_artifacts_exist( | |
| 1774 | 1774 | temp_dir: Path, |
| 1775 | 1775 | ) -> None: |
| 1776 | 1776 | registry = create_default_registry(temp_dir) |
@@ -1826,10 +1826,7 @@ async def test_late_reference_drift_hook_blocks_verification_reference_reads_aft | ||
| 1826 | 1826 | ) |
| 1827 | 1827 | ) |
| 1828 | 1828 | |
| 1829 | - assert result.decision == HookDecision.DENY | |
| 1830 | - assert result.terminal_state == "blocked" | |
| 1831 | - assert result.message is not None | |
| 1832 | - assert "completed artifact set scope" in result.message | |
| 1829 | + assert result.decision == HookDecision.CONTINUE | |
| 1833 | 1830 | |
| 1834 | 1831 | |
| 1835 | 1832 | @pytest.mark.asyncio |
@@ -1902,7 +1899,7 @@ async def test_late_reference_drift_hook_blocks_excessive_post_build_self_audits | ||
| 1902 | 1899 | |
| 1903 | 1900 | |
| 1904 | 1901 | @pytest.mark.asyncio |
| 1905 | -async def test_late_reference_drift_hook_blocks_excessive_post_build_self_audits_during_verification( | |
| 1902 | +async def test_late_reference_drift_hook_allows_post_build_self_audits_during_verification( | |
| 1906 | 1903 | temp_dir: Path, |
| 1907 | 1904 | ) -> None: |
| 1908 | 1905 | registry = create_default_registry(temp_dir) |
@@ -1962,12 +1959,9 @@ async def test_late_reference_drift_hook_blocks_excessive_post_build_self_audits | ||
| 1962 | 1959 | assert result.decision == HookDecision.CONTINUE |
| 1963 | 1960 | await hook.post_tool_use(context) |
| 1964 | 1961 | |
| 1965 | - blocked = await hook.pre_tool_use(make_context(5)) | |
| 1962 | + result = await hook.pre_tool_use(make_context(5)) | |
| 1966 | 1963 | |
| 1967 | - assert blocked.decision == HookDecision.DENY | |
| 1968 | - assert blocked.terminal_state == "blocked" | |
| 1969 | - assert blocked.message is not None | |
| 1970 | - assert "post-build audit loop" in blocked.message | |
| 1964 | + assert result.decision == HookDecision.CONTINUE | |
| 1971 | 1965 | |
| 1972 | 1966 | |
| 1973 | 1967 | @pytest.mark.asyncio |
@@ -2025,7 +2019,7 @@ async def test_late_reference_drift_hook_blocks_relative_bash_reference_reads_af | ||
| 2025 | 2019 | tool=registry.get("bash"), |
| 2026 | 2020 | registry=registry, |
| 2027 | 2021 | permission_policy=policy, |
| 2028 | - source="verification", | |
| 2022 | + source="native", | |
| 2029 | 2023 | ) |
| 2030 | 2024 | ) |
| 2031 | 2025 | |
@@ -2036,7 +2030,7 @@ async def test_late_reference_drift_hook_blocks_relative_bash_reference_reads_af | ||
| 2036 | 2030 | |
| 2037 | 2031 | |
| 2038 | 2032 | @pytest.mark.asyncio |
| 2039 | -async def test_late_reference_drift_hook_blocks_relative_bash_post_build_audit_loop( | |
| 2033 | +async def test_late_reference_drift_hook_allows_relative_bash_post_build_audit_loop_during_verification( | |
| 2040 | 2034 | temp_dir: Path, |
| 2041 | 2035 | ) -> None: |
| 2042 | 2036 | registry = create_default_registry(temp_dir) |
@@ -2097,12 +2091,9 @@ async def test_late_reference_drift_hook_blocks_relative_bash_post_build_audit_l | ||
| 2097 | 2091 | assert result.decision == HookDecision.CONTINUE |
| 2098 | 2092 | await hook.post_tool_use(context) |
| 2099 | 2093 | |
| 2100 | - blocked = await hook.pre_tool_use(make_context(5)) | |
| 2094 | + result = await hook.pre_tool_use(make_context(5)) | |
| 2101 | 2095 | |
| 2102 | - assert blocked.decision == HookDecision.DENY | |
| 2103 | - assert blocked.terminal_state == "blocked" | |
| 2104 | - assert blocked.message is not None | |
| 2105 | - assert "post-build audit loop" in blocked.message | |
| 2096 | + assert result.decision == HookDecision.CONTINUE | |
| 2106 | 2097 | |
| 2107 | 2098 | |
| 2108 | 2099 | @pytest.mark.asyncio |
tests/test_tool_batches.pymodified@@ -826,6 +826,110 @@ async def test_tool_batch_runner_duplicate_read_keeps_root_declared_missing_html | ||
| 826 | 826 | assert ephemeral_messages == [] |
| 827 | 827 | |
| 828 | 828 | |
| 829 | +@pytest.mark.asyncio | |
| 830 | +async def test_tool_batch_runner_duplicate_read_after_edit_mismatch_steers_to_mutation( | |
| 831 | + temp_dir: Path, | |
| 832 | +) -> None: | |
| 833 | + async def assess_confidence( | |
| 834 | + tool_name: str, | |
| 835 | + tool_args: dict, | |
| 836 | + context: str, | |
| 837 | + ) -> ConfidenceAssessment: | |
| 838 | + raise AssertionError("Confidence scoring should not run for this scenario") | |
| 839 | + | |
| 840 | + async def verify_action( | |
| 841 | + tool_name: str, | |
| 842 | + tool_args: dict, | |
| 843 | + result: str, | |
| 844 | + expected: str = "", | |
| 845 | + ) -> ActionVerification: | |
| 846 | + raise AssertionError("Verification should not run for this scenario") | |
| 847 | + | |
| 848 | + target = temp_dir / "guide" / "chapters" / "02-installation.html" | |
| 849 | + target.parent.mkdir(parents=True) | |
| 850 | + target.write_text( | |
| 851 | + "<h1>Chapter 2: Installation Guide</h1>\n" | |
| 852 | + "<p>This chapter is still too thin.</p>\n" | |
| 853 | + ) | |
| 854 | + recovery_context = RecoveryContext( | |
| 855 | + original_tool="edit", | |
| 856 | + original_args={ | |
| 857 | + "file_path": str(target), | |
| 858 | + "old_string": "<h1>Installation</h1>", | |
| 859 | + "new_string": "<h1>Installation</h1><p>Expanded.</p>", | |
| 860 | + }, | |
| 861 | + max_retries=2, | |
| 862 | + ) | |
| 863 | + recovery_context.add_attempt( | |
| 864 | + "edit", | |
| 865 | + { | |
| 866 | + "file_path": str(target), | |
| 867 | + "old_string": "<h1>Installation</h1>", | |
| 868 | + "new_string": "<h1>Installation</h1><p>Expanded.</p>", | |
| 869 | + }, | |
| 870 | + "old_string not found in file. Make sure it matches exactly.", | |
| 871 | + ) | |
| 872 | + context = build_context( | |
| 873 | + temp_dir=temp_dir, | |
| 874 | + messages=[], | |
| 875 | + safeguards=FakeSafeguards(), | |
| 876 | + assess_confidence=assess_confidence, | |
| 877 | + verify_action=verify_action, | |
| 878 | + recovery_context=recovery_context, | |
| 879 | + auto_recover=False, | |
| 880 | + ) | |
| 881 | + persistent_messages: list[str] = [] | |
| 882 | + context.queue_steering_message_callback = persistent_messages.append | |
| 883 | + runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir)) | |
| 884 | + tool_call = ToolCall( | |
| 885 | + id="read-dup-after-edit-miss", | |
| 886 | + name="read", | |
| 887 | + arguments={"file_path": str(target)}, | |
| 888 | + ) | |
| 889 | + duplicate_message = ( | |
| 890 | + "[Skipped - duplicate action: Already read " | |
| 891 | + f"{target} recently without any intervening changes; " | |
| 892 | + "reuse the earlier read result instead of rereading]" | |
| 893 | + ) | |
| 894 | + executor = FakeExecutor( | |
| 895 | + [ | |
| 896 | + ToolExecutionOutcome( | |
| 897 | + tool_call=tool_call, | |
| 898 | + state=ToolExecutionState.DUPLICATE, | |
| 899 | + message=Message.tool_result_message( | |
| 900 | + tool_call_id=tool_call.id, | |
| 901 | + display_content=duplicate_message, | |
| 902 | + result_content=duplicate_message, | |
| 903 | + ), | |
| 904 | + event_content=duplicate_message, | |
| 905 | + is_error=False, | |
| 906 | + result_output=duplicate_message, | |
| 907 | + ) | |
| 908 | + ] | |
| 909 | + ) | |
| 910 | + dod = create_definition_of_done("Expand thin generated guide chapters.") | |
| 911 | + | |
| 912 | + await runner.execute_batch( | |
| 913 | + tool_calls=[tool_call], | |
| 914 | + tool_source="assistant", | |
| 915 | + pending_tool_calls_seen=set(), | |
| 916 | + emit=_noop_emit, | |
| 917 | + summary=TurnSummary(final_response=""), | |
| 918 | + dod=dod, | |
| 919 | + executor=executor, # type: ignore[arg-type] | |
| 920 | + on_confirmation=None, | |
| 921 | + on_user_question=None, | |
| 922 | + emit_confirmation=None, | |
| 923 | + consecutive_errors=0, | |
| 924 | + ) | |
| 925 | + | |
| 926 | + assert len(persistent_messages) == 1 | |
| 927 | + assert "last edit" in persistent_messages[0] | |
| 928 | + assert "`old_string` did not exactly match" in persistent_messages[0] | |
| 929 | + assert "send one concrete mutation now" in persistent_messages[0] | |
| 930 | + assert "`write` with the complete replacement content" in persistent_messages[0] | |
| 931 | + | |
| 932 | + | |
| 829 | 933 | @pytest.mark.asyncio |
| 830 | 934 | async def test_tool_batch_runner_todo_write_does_not_regress_completed_file_todo( |
| 831 | 935 | temp_dir: Path, |