tenseleyflow/loader / 96f060e

Browse files

Invalidate stale verification after new mutations

Authored by espadonne
SHA
96f060ef3f81960a9a5b59ea6dc2c50eedd499a4
Parents
c568344
Tree
6d75ea2

6 changed files

Status | File | + | -
M src/loader/runtime/completion_policy.py 3 0
M src/loader/runtime/task_completion.py 64 0
M src/loader/runtime/tool_batches.py 107 1
M src/loader/runtime/verification_observations.py 1 0
M tests/test_completion_policy.py 68 0
M tests/test_tool_batches.py 89 1
src/loader/runtime/completion_policy.pymodified
@@ -264,6 +264,9 @@ class CompletionPolicy:
264264
         for entry in verification_observations:
265265
             if entry.status == VerificationObservationStatus.FAILED.value:
266266
                 return CompletionPolicy._render_observation(entry)
267
+        for entry in verification_observations:
268
+            if entry.status == VerificationObservationStatus.STALE.value:
269
+                return CompletionPolicy._render_observation(entry)
267270
         for entry in verification_observations:
268271
             if entry.status == VerificationObservationStatus.MISSING.value:
269272
                 return CompletionPolicy._render_observation(entry)
src/loader/runtime/task_completion.pymodified
@@ -119,6 +119,7 @@ class _FollowThroughFacts:
119119
     has_install_evidence: bool
120120
     has_verification_evidence: bool
121121
     has_failed_verification: bool
122
+    has_stale_verification: bool
122123
     verification_command: str | None
123124
     pending_items: list[str]
124125
     accomplished: list[str]
@@ -319,6 +320,30 @@ def assess_completion_follow_through_with_provenance(
319320
                 status=EvidenceProvenanceStatus.CONTRADICTS,
320321
             ):
321322
                 _append_unique_provenance(evidence_provenance, entry)
323
+        elif facts.has_stale_verification:
324
+            _append_follow_through_gap(
325
+                missing_evidence,
326
+                remaining,
327
+                suggested_next_steps,
328
+                evidence=_stale_verification_evidence(facts.verification_command),
329
+                remaining_item="Rerun verification after the implementation changed again",
330
+                next_step=_stale_verification_follow_up(facts.verification_command),
331
+            )
332
+            _append_unique_provenance(
333
+                evidence_provenance,
334
+                EvidenceProvenance(
335
+                    category="verification",
336
+                    source="dod.last_verification_result",
337
+                    summary=(
338
+                        "previous verification became stale for "
339
+                        f"`{facts.verification_command}` after new mutating work"
340
+                        if facts.verification_command
341
+                        else "previous verification became stale after new mutating work"
342
+                    ),
343
+                    status=EvidenceProvenanceStatus.MISSING.value,
344
+                    subject=facts.verification_command,
345
+                ),
346
+            )
322347
         elif not facts.has_verification_evidence:
323348
             _append_follow_through_gap(
324349
                 missing_evidence,
@@ -726,6 +751,7 @@ def _build_follow_through_facts(
726751
     has_install_evidence = _has_install_evidence(task_lower, action_types, actions_taken)
727752
     has_verification_evidence = _has_verification_evidence(action_types, actions_taken)
728753
     has_failed_verification = False
754
+    has_stale_verification = False
729755
     verification_command: str | None = None
730756
     pending_items: list[str] = []
731757
 
@@ -735,6 +761,7 @@ def _build_follow_through_facts(
735761
             has_install_evidence=has_install_evidence,
736762
             has_verification_evidence=has_verification_evidence,
737763
             has_failed_verification=has_failed_verification,
764
+            has_stale_verification=has_stale_verification,
738765
             verification_command=verification_command,
739766
             pending_items=pending_items,
740767
             accomplished=accomplished,
@@ -758,6 +785,7 @@ def _build_follow_through_facts(
758785
         dod.last_verification_result == "failed"
759786
         or any(not evidence.passed for evidence in dod.evidence)
760787
     )
788
+    has_stale_verification = dod.last_verification_result == "stale"
761789
     has_recorded_work = has_recorded_work or bool(
762790
         dod.touched_files
763791
         or dod.successful_commands
@@ -765,6 +793,7 @@ def _build_follow_through_facts(
765793
         or dod.completed_items
766794
         or has_verification_evidence
767795
         or has_failed_verification
796
+        or has_stale_verification
768797
     )
769798
     for evidence in dod.evidence:
770799
         if not evidence.passed:
@@ -785,6 +814,7 @@ def _build_follow_through_facts(
785814
         has_install_evidence=has_install_evidence,
786815
         has_verification_evidence=has_verification_evidence,
787816
         has_failed_verification=has_failed_verification,
817
+        has_stale_verification=has_stale_verification,
788818
         verification_command=verification_command,
789819
         pending_items=pending_items,
790820
         accomplished=accomplished,
@@ -827,6 +857,15 @@ def _failed_verification_evidence(verification_command: str | None) -> str:
827857
     return "a passing verification result (current verification is still failing)"
828858
 
829859
 
860
+def _stale_verification_evidence(verification_command: str | None) -> str:
861
+    if verification_command:
862
+        return (
863
+            f"a fresh passing verification result from `{verification_command}` "
864
+            "(previous verification became stale after new mutating work)"
865
+        )
866
+    return "a fresh passing verification result after new mutating work"
867
+
868
+
830869
 def _verification_follow_up(
831870
     *,
832871
     task_lower: str,
@@ -843,6 +882,12 @@ def _verification_retry_step(verification_command: str | None) -> str:
843882
     return "Fix the failing verification result and rerun it"
844883
 
845884
 
885
+def _stale_verification_follow_up(verification_command: str | None) -> str:
886
+    if verification_command:
887
+        return f"Rerun `{verification_command}` now that the implementation changed again"
888
+    return "Rerun the relevant verification now that the implementation changed again"
889
+
890
+
846891
 def _verification_provenance(
847892
     *,
848893
     dod: DefinitionOfDone | None,
@@ -945,6 +990,25 @@ def _observed_completion_verification(
945990
             )
946991
         return observations
947992
 
993
+    if dod.last_verification_result == VerificationObservationStatus.STALE.value:
994
+        if verification_command:
995
+            return [
996
+                VerificationObservation(
997
+                    status=VerificationObservationStatus.STALE.value,
998
+                    summary=(
999
+                        "verification became stale for "
1000
+                        f"`{verification_command}` after new mutating work"
1001
+                    ),
1002
+                    command=verification_command,
1003
+                )
1004
+            ]
1005
+        return [
1006
+            VerificationObservation(
1007
+                status=VerificationObservationStatus.STALE.value,
1008
+                summary="previous verification became stale after new mutating work",
1009
+            )
1010
+        ]
1011
+
9481012
     if verification_command:
9491013
         return [
9501014
             VerificationObservation(
src/loader/runtime/tool_batches.pymodified
@@ -7,17 +7,30 @@ from dataclasses import dataclass, field
77
 
88
 from ..llm.base import ToolCall
99
 from .context import RuntimeContext
10
-from .dod import DefinitionOfDone, DefinitionOfDoneStore, record_successful_tool_call
10
+from .dod import (
11
+    DefinitionOfDone,
12
+    DefinitionOfDoneStore,
13
+    is_state_mutating_tool_call,
14
+    record_successful_tool_call,
15
+)
1116
 from .events import AgentEvent, TurnSummary
17
+from .evidence_provenance import EvidenceProvenance, EvidenceProvenanceStatus
1218
 from .executor import ToolExecutionState, ToolExecutor
19
+from .policy_timeline import append_verification_timeline_entry
1320
 from .tool_batch_checks import ToolBatchConfidenceGate, ToolBatchVerificationGate
1421
 from .tool_batch_recovery import ToolBatchRecoveryController
22
+from .verification_observations import (
23
+    VerificationObservation,
24
+    VerificationObservationStatus,
25
+)
1526
 from .workflow import sync_todos_to_definition_of_done
1627
 
1728
 EventSink = Callable[[AgentEvent], Awaitable[None]]
1829
 ConfirmationHandler = Callable[[str, str, str], Awaitable[bool]] | None
1930
 UserQuestionHandler = Callable[[str, list[str] | None], Awaitable[str]] | None
2031
 
32
+_VERIFY_ITEM = "Collect verification evidence"
33
+
2134
 
2235
 @dataclass
2336
 class ToolBatchResult:
@@ -190,7 +203,15 @@ class ToolBatchRunner:
190203
     ) -> str | None:
191204
         """Update DoD bookkeeping after a successful tool execution."""
192205
 
206
+        previously_verified = dod.last_verification_result == "passed"
193207
         record_successful_tool_call(dod, tool_call)
208
+        if previously_verified and is_state_mutating_tool_call(tool_call):
209
+            _mark_verification_stale(
210
+                context=self.context,
211
+                summary=summary,
212
+                dod=dod,
213
+                tool_call=tool_call,
214
+            )
194215
         if tool_call.name == "TodoWrite" and outcome.registry_result is not None:
195216
             new_todos = outcome.registry_result.metadata.get("new_todos", [])
196217
             if isinstance(new_todos, list):
@@ -198,3 +219,88 @@ class ToolBatchRunner:
198219
         self.dod_store.save(dod)
199220
         self.context.recovery_context = None
200221
         return None
222
+
223
+
224
def _mark_verification_stale(
    *,
    context: RuntimeContext,
    summary: TurnSummary,
    dod: DefinitionOfDone,
    tool_call: ToolCall,
) -> None:
    """Record that a mutating tool call invalidated the last verification.

    A ``verification_stale`` entry is appended to the verification timeline,
    after which ``dod`` is reset: the last result becomes ``stale``, recorded
    evidence is dropped, and the verification item moves from completed back
    to pending.
    """
    cause = _stale_verification_detail(tool_call)

    # Build the timeline payload before mutating ``dod``: the command lookup
    # can fall back to ``dod.evidence``, which is cleared further down.
    provenance = _stale_verification_provenance(dod, detail=cause)
    observations = _stale_verification_observations(dod, detail=cause)
    append_verification_timeline_entry(
        context,
        summary,
        reason_code="verification_stale",
        reason_summary="previous verification became stale after new mutating work",
        evidence_summary=[f"fresh verification required after {cause}"],
        evidence_provenance=provenance,
        verification_observations=observations,
    )

    dod.last_verification_result = VerificationObservationStatus.STALE.value
    dod.evidence = []
    # Strip every occurrence in place so the list object itself is preserved.
    dod.completed_items[:] = [
        item for item in dod.completed_items if item != _VERIFY_ITEM
    ]
    if _VERIFY_ITEM not in dod.pending_items:
        dod.pending_items.append(_VERIFY_ITEM)
250
+
251
+
252
def _stale_verification_observations(
    dod: DefinitionOfDone,
    *,
    detail: str,
) -> list[VerificationObservation]:
    """Build one stale runtime observation per command tracked for ``dod``.

    ``detail`` is attached unchanged to every observation.
    """
    stale_status = VerificationObservationStatus.STALE.value
    observations: list[VerificationObservation] = []
    for stale_command in _stale_verification_commands(dod):
        observations.append(
            VerificationObservation(
                status=stale_status,
                summary=(
                    f"verification became stale for `{stale_command}` "
                    "after new mutating work"
                ),
                command=stale_command,
                kind="runtime",
                detail=detail,
            )
        )
    return observations
267
+
268
+
269
def _stale_verification_provenance(
    dod: DefinitionOfDone,
    *,
    detail: str,
) -> list[EvidenceProvenance]:
    """Build one missing-evidence provenance record per command tracked for ``dod``.

    ``detail`` is attached unchanged to every record.
    """
    missing_status = EvidenceProvenanceStatus.MISSING.value
    records: list[EvidenceProvenance] = []
    for stale_command in _stale_verification_commands(dod):
        records.append(
            EvidenceProvenance(
                category="verification",
                source="tool_execution",
                summary=(
                    f"fresh verification required for `{stale_command}` "
                    "after new mutating work"
                ),
                status=missing_status,
                subject=stale_command,
                detail=detail,
            )
        )
    return records
285
+
286
+
287
+def _stale_verification_commands(dod: DefinitionOfDone) -> list[str]:
288
+    commands = [command for command in dod.verification_commands if command]
289
+    if commands:
290
+        return commands
291
+    observed = [evidence.command for evidence in dod.evidence if evidence.command]
292
+    if observed:
293
+        return observed
294
+    return ["verification"]
295
+
296
+
297
+def _stale_verification_detail(tool_call: ToolCall) -> str:
298
+    if tool_call.name in {"write", "edit", "patch"}:
299
+        file_path = str(tool_call.arguments.get("file_path", "")).strip()
300
+        if file_path:
301
+            return f"{tool_call.name} changed {file_path}"
302
+    if tool_call.name == "bash":
303
+        command = str(tool_call.arguments.get("command", "")).strip()
304
+        if command:
305
+            return f"bash ran `{command}`"
306
+    return f"{tool_call.name} changed the workspace"
src/loader/runtime/verification_observations.pymodified
@@ -11,6 +11,7 @@ class VerificationObservationStatus(StrEnum):
1111
     """How one verification observation resolved at runtime."""
1212
 
1313
     PENDING = "pending"
14
+    STALE = "stale"
1415
     PASSED = "passed"
1516
     FAILED = "failed"
1617
     SKIPPED = "skipped"
tests/test_completion_policy.pymodified
@@ -219,6 +219,27 @@ def test_assess_completion_follow_through_surfaces_failing_verification() -> Non
219219
     ]
220220
 
221221
 
222
def test_assess_completion_follow_through_requires_fresh_verification_when_stale() -> None:
    """A stale verification result must block completion and ask for a rerun."""
    task = "Run pytest -q and make sure it works."
    stale_dod = create_definition_of_done(task)
    stale_dod.verification_commands = ["pytest -q"]
    stale_dod.last_verification_result = "stale"

    result = assess_completion_follow_through(
        task=task,
        response="The tests were already handled.",
        actions_taken=["write: README.md"],
        dod=stale_dod,
    )

    expected_missing = [
        "a fresh passing verification result from `pytest -q` (previous verification became stale after new mutating work)"
    ]
    expected_steps = ["Rerun `pytest -q` now that the implementation changed again"]

    assert result.is_complete is False
    assert result.missing_evidence == expected_missing
    assert result.suggested_next_steps == expected_steps
241
+
242
+
222243
 def test_completion_assessment_attaches_typed_verification_provenance() -> None:
223244
     dod = create_definition_of_done("Run pytest -q and make sure it works.")
224245
     dod.verification_commands = ["pytest -q"]
@@ -485,6 +506,53 @@ async def test_completion_policy_uses_missing_observed_verification_when_budget_
485506
     assert events[0].type == "completion_check"
486507
 
487508
 
509
@pytest.mark.asyncio
async def test_completion_policy_uses_stale_observed_verification_when_budget_is_exhausted(
    temp_dir: Path,
) -> None:
    """With no continuation budget left, the policy finalizes and cites the stale observation."""
    task = "Run pytest -q and make sure it works."
    reply = "The tests were already handled."
    recorded = []

    async def emit(event) -> None:
        recorded.append(event)

    context = build_context(
        temp_dir,
        safeguards=FakeSafeguards(),
        max_continuation_prompts=1,
    )
    policy = CompletionPolicy(context)
    stale_dod = create_definition_of_done(task)
    stale_dod.verification_commands = ["pytest -q"]
    stale_dod.last_verification_result = "stale"

    decision = await policy.maybe_continue_for_completion(
        content=reply,
        response_content=reply,
        task=task,
        actions_taken=["write: README.md"],
        continuation_count=1,
        emit=emit,
        dod=stale_dod,
    )

    assert decision.should_continue is False
    assert decision.should_finalize is True
    assert decision.decision_code == "continuation_budget_exhausted"
    assert decision.decision_summary == (
        "stopped because the continuation budget was exhausted while observed "
        "verification still showed verification became stale for `pytest -q` "
        "after new mutating work"
    )
    assert decision.final_response == (
        "I stopped because the continuation budget was exhausted and observed "
        "verification still showed: verification became stale for `pytest -q` "
        "after new mutating work."
    )
    observed_statuses = [item.status for item in decision.verification_observations]
    assert observed_statuses == [VerificationObservationStatus.STALE.value]
    assert recorded[0].type == "completion_check"
554
+
555
+
488556
 @pytest.mark.asyncio
489557
 async def test_completion_policy_finalizes_when_budget_is_exhausted(
490558
     temp_dir: Path,
tests/test_tool_batches.pymodified
@@ -9,7 +9,11 @@ import pytest
99
 
1010
 from loader.llm.base import Message, Role, ToolCall
1111
 from loader.runtime.context import RuntimeContext
12
-from loader.runtime.dod import DefinitionOfDoneStore, create_definition_of_done
12
+from loader.runtime.dod import (
13
+    DefinitionOfDoneStore,
14
+    VerificationEvidence,
15
+    create_definition_of_done,
16
+)
1317
 from loader.runtime.events import AgentEvent, TurnSummary
1418
 from loader.runtime.executor import ToolExecutionOutcome, ToolExecutionState
1519
 from loader.runtime.permissions import (
@@ -32,10 +36,14 @@ from tests.helpers.runtime_harness import ScriptedBackend
3236
 class FakeSession:
3337
     def __init__(self, messages: list[Message]) -> None:
3438
         self.messages = list(messages)
39
+        self.workflow_timeline = []
3540
 
3641
     def append(self, message: Message) -> None:
3742
         self.messages.append(message)
3843
 
44
+    def append_workflow_timeline_entry(self, entry) -> None:
45
+        self.workflow_timeline.append(entry)
46
+
3947
 
4048
 class FakeCodeFilter:
4149
     def reset(self) -> None:
@@ -327,3 +335,83 @@ async def test_tool_batch_runner_verifies_with_context_services(temp_dir: Path)
327335
     assert context.session.messages[-1].role == Role.TOOL
328336
     assert context.session.messages[-1].content == "file contents"
329337
     assert any(event.type == "verification" for event in events)
338
+
339
+
340
@pytest.mark.asyncio
async def test_tool_batch_runner_marks_passed_verification_stale_after_new_mutation(
    temp_dir: Path,
) -> None:
    """A successful write after a passed verification must flip the DoD to stale."""
    verify_item = "Collect verification evidence"
    verify_command = "uv run pytest -q"

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run for this scenario")

    recorded: list[AgentEvent] = []

    async def emit(event: AgentEvent) -> None:
        recorded.append(event)

    context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
    )
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))
    readme_write = ToolCall(
        id="write-1",
        name="write",
        arguments={"file_path": str(temp_dir / "README.md"), "content": "updated\n"},
    )
    executor = FakeExecutor(
        [tool_outcome(tool_call=readme_write, output="wrote file", is_error=False)]
    )
    summary = TurnSummary(final_response="")

    # Start from a definition of done that already holds a passing verification.
    dod = create_definition_of_done("Update README and verify it still works.")
    dod.verification_commands = [verify_command]
    dod.last_verification_result = "passed"
    dod.evidence = [
        VerificationEvidence(
            command=verify_command,
            passed=True,
            stdout="401 passed",
            kind="test",
        )
    ]
    dod.completed_items.append(verify_item)

    await runner.execute_batch(
        tool_calls=[readme_write],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=emit,
        summary=summary,
        dod=dod,
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert dod.last_verification_result == "stale"
    assert dod.evidence == []
    assert verify_item in dod.pending_items
    assert verify_item not in dod.completed_items

    last_entry = summary.workflow_timeline[-1]
    assert last_entry.reason_code == "verification_stale"
    assert last_entry.policy_outcome == "stale"
    assert last_entry.verification_observations[0].status == "stale"
    assert last_entry.verification_observations[0].command == verify_command