Delay verification during builds
Authored by
mfwolffe <wolffemf@dukes.jmu.edu>
- SHA
edbe39782458b675172af014a36949d31b0de32f- Parents
-
c9cf957 - Tree
24e1a2b
edbe397
edbe39782458b675172af014a36949d31b0de32fc9cf957
24e1a2b| Status | File | + | - |
|---|---|---|---|
| M |
src/loader/runtime/tool_batches.py
|
31 | 18 |
| M |
tests/test_tool_batches.py
|
90 | 0 |
src/loader/runtime/tool_batches.pymodified@@ -913,24 +913,6 @@ class ToolBatchRunner: | ||
| 913 | 913 | is_mutating = is_state_mutating_tool_call(tool_call) |
| 914 | 914 | previously_verified = dod.last_verification_result == "passed" |
| 915 | 915 | record_successful_tool_call(dod, tool_call) |
| 916 | - if previously_verified and is_mutating: | |
| 917 | - _mark_verification_stale( | |
| 918 | - context=self.context, | |
| 919 | - summary=summary, | |
| 920 | - dod=dod, | |
| 921 | - tool_call=tool_call, | |
| 922 | - ) | |
| 923 | - elif is_mutating and _should_plan_verification_for_tool_call( | |
| 924 | - dod, | |
| 925 | - tool_call=tool_call, | |
| 926 | - project_root=self.context.project_root, | |
| 927 | - ): | |
| 928 | - _mark_verification_planned( | |
| 929 | - context=self.context, | |
| 930 | - summary=summary, | |
| 931 | - dod=dod, | |
| 932 | - tool_call=tool_call, | |
| 933 | - ) | |
| 934 | 916 | if tool_call.name == "TodoWrite" and outcome.registry_result is not None: |
| 935 | 917 | new_todos = outcome.registry_result.metadata.get("new_todos", []) |
| 936 | 918 | if isinstance(new_todos, list): |
@@ -964,6 +946,24 @@ class ToolBatchRunner: | ||
| 964 | 946 | tool_call=tool_call, |
| 965 | 947 | dod=dod, |
| 966 | 948 | ) |
| 949 | + if previously_verified and is_mutating: | |
| 950 | + _mark_verification_stale( | |
| 951 | + context=self.context, | |
| 952 | + summary=summary, | |
| 953 | + dod=dod, | |
| 954 | + tool_call=tool_call, | |
| 955 | + ) | |
| 956 | + elif is_mutating and _should_plan_verification_for_tool_call( | |
| 957 | + dod, | |
| 958 | + tool_call=tool_call, | |
| 959 | + project_root=self.context.project_root, | |
| 960 | + ): | |
| 961 | + _mark_verification_planned( | |
| 962 | + context=self.context, | |
| 963 | + summary=summary, | |
| 964 | + dod=dod, | |
| 965 | + tool_call=tool_call, | |
| 966 | + ) | |
| 967 | 967 | self.dod_store.save(dod) |
| 968 | 968 | recovery_context = self.context.recovery_context |
| 969 | 969 | if recovery_context is not None: |
@@ -2145,6 +2145,19 @@ def _should_plan_verification_for_tool_call( | ||
| 2145 | 2145 | tool_call: ToolCall, |
| 2146 | 2146 | project_root: Path, |
| 2147 | 2147 | ) -> bool: |
| 2148 | + actionable_pending = [ | |
| 2149 | + item | |
| 2150 | + for item in effective_pending_todo_items( | |
| 2151 | + dod, | |
| 2152 | + project_root=project_root, | |
| 2153 | + ) | |
| 2154 | + if item not in _TODO_NUDGE_EXCLUDED_ITEMS | |
| 2155 | + ] | |
| 2156 | + if any( | |
| 2157 | + _todo_is_mutation_step(item) or _todo_is_consistency_review_step(item) | |
| 2158 | + for item in actionable_pending | |
| 2159 | + ): | |
| 2160 | + return False | |
| 2148 | 2161 | if tool_call.name in {"write", "edit", "patch"}: |
| 2149 | 2162 | return True |
| 2150 | 2163 | if tool_call.name != "bash": |
tests/test_tool_batches.pymodified@@ -5096,6 +5096,96 @@ async def test_tool_batch_runner_does_not_mark_verification_planned_after_setup_ | ||
| 5096 | 5096 | ) |
| 5097 | 5097 | |
| 5098 | 5098 | |
| 5099 | +@pytest.mark.asyncio | |
| 5100 | +async def test_tool_batch_runner_does_not_mark_verification_planned_while_chapter_build_pending( | |
| 5101 | + temp_dir: Path, | |
| 5102 | +) -> None: | |
| 5103 | + async def assess_confidence( | |
| 5104 | + tool_name: str, | |
| 5105 | + tool_args: dict, | |
| 5106 | + context: str, | |
| 5107 | + ) -> ConfidenceAssessment: | |
| 5108 | + raise AssertionError("Confidence scoring should be disabled in this scenario") | |
| 5109 | + | |
| 5110 | + async def verify_action( | |
| 5111 | + tool_name: str, | |
| 5112 | + tool_args: dict, | |
| 5113 | + result: str, | |
| 5114 | + expected: str = "", | |
| 5115 | + ) -> ActionVerification: | |
| 5116 | + raise AssertionError("Verification should not run in this scenario") | |
| 5117 | + | |
| 5118 | + context = build_context( | |
| 5119 | + temp_dir=temp_dir, | |
| 5120 | + messages=[], | |
| 5121 | + safeguards=FakeSafeguards(), | |
| 5122 | + assess_confidence=assess_confidence, | |
| 5123 | + verify_action=verify_action, | |
| 5124 | + ) | |
| 5125 | + runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir)) | |
| 5126 | + nginx_root = temp_dir / "Loader" / "guides" / "nginx" | |
| 5127 | + chapters = nginx_root / "chapters" | |
| 5128 | + chapters.mkdir(parents=True) | |
| 5129 | + index_path = nginx_root / "index.html" | |
| 5130 | + implementation_plan = temp_dir / "implementation.md" | |
| 5131 | + implementation_plan.write_text( | |
| 5132 | + "\n".join( | |
| 5133 | + [ | |
| 5134 | + "# Implementation Plan", | |
| 5135 | + "", | |
| 5136 | + "## File Changes", | |
| 5137 | + f"- `{nginx_root}/`", | |
| 5138 | + f"- `{chapters}/`", | |
| 5139 | + f"- `{index_path}`", | |
| 5140 | + "", | |
| 5141 | + ] | |
| 5142 | + ) | |
| 5143 | + ) | |
| 5144 | + | |
| 5145 | + tool_call = ToolCall( | |
| 5146 | + id="write-index", | |
| 5147 | + name="write", | |
| 5148 | + arguments={"file_path": str(index_path), "content": "<html></html>\n"}, | |
| 5149 | + ) | |
| 5150 | + executor = FakeExecutor( | |
| 5151 | + [tool_outcome(tool_call=tool_call, output="wrote file", is_error=False)] | |
| 5152 | + ) | |
| 5153 | + summary = TurnSummary(final_response="") | |
| 5154 | + dod = create_definition_of_done("Create a multi-file nginx guide.") | |
| 5155 | + dod.implementation_plan = str(implementation_plan) | |
| 5156 | + dod.pending_items.extend( | |
| 5157 | + [ | |
| 5158 | + "Develop the main index.html file with proper structure", | |
| 5159 | + "Create first nginx chapter", | |
| 5160 | + ] | |
| 5161 | + ) | |
| 5162 | + events: list[AgentEvent] = [] | |
| 5163 | + | |
| 5164 | + async def emit(event: AgentEvent) -> None: | |
| 5165 | + events.append(event) | |
| 5166 | + | |
| 5167 | + await runner.execute_batch( | |
| 5168 | + tool_calls=[tool_call], | |
| 5169 | + tool_source="assistant", | |
| 5170 | + pending_tool_calls_seen=set(), | |
| 5171 | + emit=emit, | |
| 5172 | + summary=summary, | |
| 5173 | + dod=dod, | |
| 5174 | + executor=executor, # type: ignore[arg-type] | |
| 5175 | + on_confirmation=None, | |
| 5176 | + on_user_question=None, | |
| 5177 | + emit_confirmation=None, | |
| 5178 | + consecutive_errors=0, | |
| 5179 | + ) | |
| 5180 | + | |
| 5181 | + assert dod.last_verification_result is None | |
| 5182 | + assert "Collect verification evidence" not in dod.pending_items | |
| 5183 | + assert "Create first nginx chapter" in dod.pending_items | |
| 5184 | + assert not any( | |
| 5185 | + entry.reason_code == "verification_planned" for entry in summary.workflow_timeline | |
| 5186 | + ) | |
| 5187 | + | |
| 5188 | + | |
| 5099 | 5189 | @pytest.mark.asyncio |
| 5100 | 5190 | async def test_tool_batch_runner_marks_passed_verification_stale_after_new_mutation( |
| 5101 | 5191 | temp_dir: Path, |