tenseleyflow/loader / edbe397

Browse files

Delay verification during builds

Authored by mfwolffe <wolffemf@dukes.jmu.edu>
SHA
edbe39782458b675172af014a36949d31b0de32f
Parents
c9cf957
Tree
24e1a2b

2 changed files

StatusFile+-
M src/loader/runtime/tool_batches.py 31 18
M tests/test_tool_batches.py 90 0
src/loader/runtime/tool_batches.pymodified
@@ -913,24 +913,6 @@ class ToolBatchRunner:
913913
         is_mutating = is_state_mutating_tool_call(tool_call)
914914
         previously_verified = dod.last_verification_result == "passed"
915915
         record_successful_tool_call(dod, tool_call)
916
-        if previously_verified and is_mutating:
917
-            _mark_verification_stale(
918
-                context=self.context,
919
-                summary=summary,
920
-                dod=dod,
921
-                tool_call=tool_call,
922
-            )
923
-        elif is_mutating and _should_plan_verification_for_tool_call(
924
-            dod,
925
-            tool_call=tool_call,
926
-            project_root=self.context.project_root,
927
-        ):
928
-            _mark_verification_planned(
929
-                context=self.context,
930
-                summary=summary,
931
-                dod=dod,
932
-                tool_call=tool_call,
933
-            )
934916
         if tool_call.name == "TodoWrite" and outcome.registry_result is not None:
935917
             new_todos = outcome.registry_result.metadata.get("new_todos", [])
936918
             if isinstance(new_todos, list):
@@ -964,6 +946,24 @@ class ToolBatchRunner:
964946
                 tool_call=tool_call,
965947
                 dod=dod,
966948
             )
949
+        if previously_verified and is_mutating:
950
+            _mark_verification_stale(
951
+                context=self.context,
952
+                summary=summary,
953
+                dod=dod,
954
+                tool_call=tool_call,
955
+            )
956
+        elif is_mutating and _should_plan_verification_for_tool_call(
957
+            dod,
958
+            tool_call=tool_call,
959
+            project_root=self.context.project_root,
960
+        ):
961
+            _mark_verification_planned(
962
+                context=self.context,
963
+                summary=summary,
964
+                dod=dod,
965
+                tool_call=tool_call,
966
+            )
967967
         self.dod_store.save(dod)
968968
         recovery_context = self.context.recovery_context
969969
         if recovery_context is not None:
@@ -2145,6 +2145,19 @@ def _should_plan_verification_for_tool_call(
21452145
     tool_call: ToolCall,
21462146
     project_root: Path,
21472147
 ) -> bool:
2148
+    actionable_pending = [
2149
+        item
2150
+        for item in effective_pending_todo_items(
2151
+            dod,
2152
+            project_root=project_root,
2153
+        )
2154
+        if item not in _TODO_NUDGE_EXCLUDED_ITEMS
2155
+    ]
2156
+    if any(
2157
+        _todo_is_mutation_step(item) or _todo_is_consistency_review_step(item)
2158
+        for item in actionable_pending
2159
+    ):
2160
+        return False
21482161
     if tool_call.name in {"write", "edit", "patch"}:
21492162
         return True
21502163
     if tool_call.name != "bash":
tests/test_tool_batches.pymodified
@@ -5096,6 +5096,96 @@ async def test_tool_batch_runner_does_not_mark_verification_planned_after_setup_
50965096
     )
50975097
 
50985098
 
5099
+@pytest.mark.asyncio
5100
+async def test_tool_batch_runner_does_not_mark_verification_planned_while_chapter_build_pending(
5101
+    temp_dir: Path,
5102
+) -> None:
5103
+    async def assess_confidence(
5104
+        tool_name: str,
5105
+        tool_args: dict,
5106
+        context: str,
5107
+    ) -> ConfidenceAssessment:
5108
+        raise AssertionError("Confidence scoring should be disabled in this scenario")
5109
+
5110
+    async def verify_action(
5111
+        tool_name: str,
5112
+        tool_args: dict,
5113
+        result: str,
5114
+        expected: str = "",
5115
+    ) -> ActionVerification:
5116
+        raise AssertionError("Verification should not run in this scenario")
5117
+
5118
+    context = build_context(
5119
+        temp_dir=temp_dir,
5120
+        messages=[],
5121
+        safeguards=FakeSafeguards(),
5122
+        assess_confidence=assess_confidence,
5123
+        verify_action=verify_action,
5124
+    )
5125
+    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))
5126
+    nginx_root = temp_dir / "Loader" / "guides" / "nginx"
5127
+    chapters = nginx_root / "chapters"
5128
+    chapters.mkdir(parents=True)
5129
+    index_path = nginx_root / "index.html"
5130
+    implementation_plan = temp_dir / "implementation.md"
5131
+    implementation_plan.write_text(
5132
+        "\n".join(
5133
+            [
5134
+                "# Implementation Plan",
5135
+                "",
5136
+                "## File Changes",
5137
+                f"- `{nginx_root}/`",
5138
+                f"- `{chapters}/`",
5139
+                f"- `{index_path}`",
5140
+                "",
5141
+            ]
5142
+        )
5143
+    )
5144
+
5145
+    tool_call = ToolCall(
5146
+        id="write-index",
5147
+        name="write",
5148
+        arguments={"file_path": str(index_path), "content": "<html></html>\n"},
5149
+    )
5150
+    executor = FakeExecutor(
5151
+        [tool_outcome(tool_call=tool_call, output="wrote file", is_error=False)]
5152
+    )
5153
+    summary = TurnSummary(final_response="")
5154
+    dod = create_definition_of_done("Create a multi-file nginx guide.")
5155
+    dod.implementation_plan = str(implementation_plan)
5156
+    dod.pending_items.extend(
5157
+        [
5158
+            "Develop the main index.html file with proper structure",
5159
+            "Create first nginx chapter",
5160
+        ]
5161
+    )
5162
+    events: list[AgentEvent] = []
5163
+
5164
+    async def emit(event: AgentEvent) -> None:
5165
+        events.append(event)
5166
+
5167
+    await runner.execute_batch(
5168
+        tool_calls=[tool_call],
5169
+        tool_source="assistant",
5170
+        pending_tool_calls_seen=set(),
5171
+        emit=emit,
5172
+        summary=summary,
5173
+        dod=dod,
5174
+        executor=executor,  # type: ignore[arg-type]
5175
+        on_confirmation=None,
5176
+        on_user_question=None,
5177
+        emit_confirmation=None,
5178
+        consecutive_errors=0,
5179
+    )
5180
+
5181
+    assert dod.last_verification_result is None
5182
+    assert "Collect verification evidence" not in dod.pending_items
5183
+    assert "Create first nginx chapter" in dod.pending_items
5184
+    assert not any(
5185
+        entry.reason_code == "verification_planned" for entry in summary.workflow_timeline
5186
+    )
5187
+
5188
+
50995189
 @pytest.mark.asyncio
51005190
 async def test_tool_batch_runner_marks_passed_verification_stale_after_new_mutation(
51015191
     temp_dir: Path,