Invalidate stale verification after new mutations

- SHA: 96f060ef3f81960a9a5b59ea6dc2c50eedd499a4 (short: 96f060e)
- Parents: c568344
- Tree: 6d75ea2
| Status | File | + | - |
|---|---|---|---|
| M | src/loader/runtime/completion_policy.py | 3 | 0 |
| M | src/loader/runtime/task_completion.py | 64 | 0 |
| M | src/loader/runtime/tool_batches.py | 107 | 1 |
| M | src/loader/runtime/verification_observations.py | 1 | 0 |
| M | tests/test_completion_policy.py | 68 | 0 |
| M | tests/test_tool_batches.py | 89 | 1 |
src/loader/runtime/completion_policy.py (modified)
@@ -264,6 +264,9 @@ class CompletionPolicy:
| 264 | 264 | for entry in verification_observations: |
| 265 | 265 | if entry.status == VerificationObservationStatus.FAILED.value: |
| 266 | 266 | return CompletionPolicy._render_observation(entry) |
| 267 | + for entry in verification_observations: | |
| 268 | + if entry.status == VerificationObservationStatus.STALE.value: | |
| 269 | + return CompletionPolicy._render_observation(entry) | |
| 267 | 270 | for entry in verification_observations: |
| 268 | 271 | if entry.status == VerificationObservationStatus.MISSING.value: |
| 269 | 272 | return CompletionPolicy._render_observation(entry) |
src/loader/runtime/task_completion.py (modified)
@@ -119,6 +119,7 @@ class _FollowThroughFacts:
| 119 | 119 | has_install_evidence: bool |
| 120 | 120 | has_verification_evidence: bool |
| 121 | 121 | has_failed_verification: bool |
| 122 | + has_stale_verification: bool | |
| 122 | 123 | verification_command: str | None |
| 123 | 124 | pending_items: list[str] |
| 124 | 125 | accomplished: list[str] |
@@ -319,6 +320,30 @@ def assess_completion_follow_through_with_provenance( | ||
| 319 | 320 | status=EvidenceProvenanceStatus.CONTRADICTS, |
| 320 | 321 | ): |
| 321 | 322 | _append_unique_provenance(evidence_provenance, entry) |
| 323 | + elif facts.has_stale_verification: | |
| 324 | + _append_follow_through_gap( | |
| 325 | + missing_evidence, | |
| 326 | + remaining, | |
| 327 | + suggested_next_steps, | |
| 328 | + evidence=_stale_verification_evidence(facts.verification_command), | |
| 329 | + remaining_item="Rerun verification after the implementation changed again", | |
| 330 | + next_step=_stale_verification_follow_up(facts.verification_command), | |
| 331 | + ) | |
| 332 | + _append_unique_provenance( | |
| 333 | + evidence_provenance, | |
| 334 | + EvidenceProvenance( | |
| 335 | + category="verification", | |
| 336 | + source="dod.last_verification_result", | |
| 337 | + summary=( | |
| 338 | + "previous verification became stale for " | |
| 339 | + f"`{facts.verification_command}` after new mutating work" | |
| 340 | + if facts.verification_command | |
| 341 | + else "previous verification became stale after new mutating work" | |
| 342 | + ), | |
| 343 | + status=EvidenceProvenanceStatus.MISSING.value, | |
| 344 | + subject=facts.verification_command, | |
| 345 | + ), | |
| 346 | + ) | |
| 322 | 347 | elif not facts.has_verification_evidence: |
| 323 | 348 | _append_follow_through_gap( |
| 324 | 349 | missing_evidence, |
@@ -726,6 +751,7 @@ def _build_follow_through_facts( | ||
| 726 | 751 | has_install_evidence = _has_install_evidence(task_lower, action_types, actions_taken) |
| 727 | 752 | has_verification_evidence = _has_verification_evidence(action_types, actions_taken) |
| 728 | 753 | has_failed_verification = False |
| 754 | + has_stale_verification = False | |
| 729 | 755 | verification_command: str | None = None |
| 730 | 756 | pending_items: list[str] = [] |
| 731 | 757 | |
@@ -735,6 +761,7 @@ def _build_follow_through_facts( | ||
| 735 | 761 | has_install_evidence=has_install_evidence, |
| 736 | 762 | has_verification_evidence=has_verification_evidence, |
| 737 | 763 | has_failed_verification=has_failed_verification, |
| 764 | + has_stale_verification=has_stale_verification, | |
| 738 | 765 | verification_command=verification_command, |
| 739 | 766 | pending_items=pending_items, |
| 740 | 767 | accomplished=accomplished, |
@@ -758,6 +785,7 @@ def _build_follow_through_facts( | ||
| 758 | 785 | dod.last_verification_result == "failed" |
| 759 | 786 | or any(not evidence.passed for evidence in dod.evidence) |
| 760 | 787 | ) |
| 788 | + has_stale_verification = dod.last_verification_result == "stale" | |
| 761 | 789 | has_recorded_work = has_recorded_work or bool( |
| 762 | 790 | dod.touched_files |
| 763 | 791 | or dod.successful_commands |
@@ -765,6 +793,7 @@ def _build_follow_through_facts( | ||
| 765 | 793 | or dod.completed_items |
| 766 | 794 | or has_verification_evidence |
| 767 | 795 | or has_failed_verification |
| 796 | + or has_stale_verification | |
| 768 | 797 | ) |
| 769 | 798 | for evidence in dod.evidence: |
| 770 | 799 | if not evidence.passed: |
@@ -785,6 +814,7 @@ def _build_follow_through_facts( | ||
| 785 | 814 | has_install_evidence=has_install_evidence, |
| 786 | 815 | has_verification_evidence=has_verification_evidence, |
| 787 | 816 | has_failed_verification=has_failed_verification, |
| 817 | + has_stale_verification=has_stale_verification, | |
| 788 | 818 | verification_command=verification_command, |
| 789 | 819 | pending_items=pending_items, |
| 790 | 820 | accomplished=accomplished, |
@@ -827,6 +857,15 @@ def _failed_verification_evidence(verification_command: str | None) -> str: | ||
| 827 | 857 | return "a passing verification result (current verification is still failing)" |
| 828 | 858 | |
| 829 | 859 | |
| 860 | +def _stale_verification_evidence(verification_command: str | None) -> str: | |
| 861 | + if verification_command: | |
| 862 | + return ( | |
| 863 | + f"a fresh passing verification result from `{verification_command}` " | |
| 864 | + "(previous verification became stale after new mutating work)" | |
| 865 | + ) | |
| 866 | + return "a fresh passing verification result after new mutating work" | |
| 867 | + | |
| 868 | + | |
| 830 | 869 | def _verification_follow_up( |
| 831 | 870 | *, |
| 832 | 871 | task_lower: str, |
@@ -843,6 +882,12 @@ def _verification_retry_step(verification_command: str | None) -> str: | ||
| 843 | 882 | return "Fix the failing verification result and rerun it" |
| 844 | 883 | |
| 845 | 884 | |
| 885 | +def _stale_verification_follow_up(verification_command: str | None) -> str: | |
| 886 | + if verification_command: | |
| 887 | + return f"Rerun `{verification_command}` now that the implementation changed again" | |
| 888 | + return "Rerun the relevant verification now that the implementation changed again" | |
| 889 | + | |
| 890 | + | |
| 846 | 891 | def _verification_provenance( |
| 847 | 892 | *, |
| 848 | 893 | dod: DefinitionOfDone | None, |
@@ -945,6 +990,25 @@ def _observed_completion_verification( | ||
| 945 | 990 | ) |
| 946 | 991 | return observations |
| 947 | 992 | |
| 993 | + if dod.last_verification_result == VerificationObservationStatus.STALE.value: | |
| 994 | + if verification_command: | |
| 995 | + return [ | |
| 996 | + VerificationObservation( | |
| 997 | + status=VerificationObservationStatus.STALE.value, | |
| 998 | + summary=( | |
| 999 | + "verification became stale for " | |
| 1000 | + f"`{verification_command}` after new mutating work" | |
| 1001 | + ), | |
| 1002 | + command=verification_command, | |
| 1003 | + ) | |
| 1004 | + ] | |
| 1005 | + return [ | |
| 1006 | + VerificationObservation( | |
| 1007 | + status=VerificationObservationStatus.STALE.value, | |
| 1008 | + summary="previous verification became stale after new mutating work", | |
| 1009 | + ) | |
| 1010 | + ] | |
| 1011 | + | |
| 948 | 1012 | if verification_command: |
| 949 | 1013 | return [ |
| 950 | 1014 | VerificationObservation( |
src/loader/runtime/tool_batches.py (modified)
@@ -7,17 +7,30 @@ from dataclasses import dataclass, field
| 7 | 7 | |
| 8 | 8 | from ..llm.base import ToolCall |
| 9 | 9 | from .context import RuntimeContext |
| 10 | -from .dod import DefinitionOfDone, DefinitionOfDoneStore, record_successful_tool_call | |
| 10 | +from .dod import ( | |
| 11 | + DefinitionOfDone, | |
| 12 | + DefinitionOfDoneStore, | |
| 13 | + is_state_mutating_tool_call, | |
| 14 | + record_successful_tool_call, | |
| 15 | +) | |
| 11 | 16 | from .events import AgentEvent, TurnSummary |
| 17 | +from .evidence_provenance import EvidenceProvenance, EvidenceProvenanceStatus | |
| 12 | 18 | from .executor import ToolExecutionState, ToolExecutor |
| 19 | +from .policy_timeline import append_verification_timeline_entry | |
| 13 | 20 | from .tool_batch_checks import ToolBatchConfidenceGate, ToolBatchVerificationGate |
| 14 | 21 | from .tool_batch_recovery import ToolBatchRecoveryController |
| 22 | +from .verification_observations import ( | |
| 23 | + VerificationObservation, | |
| 24 | + VerificationObservationStatus, | |
| 25 | +) | |
| 15 | 26 | from .workflow import sync_todos_to_definition_of_done |
| 16 | 27 | |
| 17 | 28 | EventSink = Callable[[AgentEvent], Awaitable[None]] |
| 18 | 29 | ConfirmationHandler = Callable[[str, str, str], Awaitable[bool]] | None |
| 19 | 30 | UserQuestionHandler = Callable[[str, list[str] | None], Awaitable[str]] | None |
| 20 | 31 | |
| 32 | +_VERIFY_ITEM = "Collect verification evidence" | |
| 33 | + | |
| 21 | 34 | |
| 22 | 35 | @dataclass |
| 23 | 36 | class ToolBatchResult: |
@@ -190,7 +203,15 @@ class ToolBatchRunner: | ||
| 190 | 203 | ) -> str | None: |
| 191 | 204 | """Update DoD bookkeeping after a successful tool execution.""" |
| 192 | 205 | |
| 206 | + previously_verified = dod.last_verification_result == "passed" | |
| 193 | 207 | record_successful_tool_call(dod, tool_call) |
| 208 | + if previously_verified and is_state_mutating_tool_call(tool_call): | |
| 209 | + _mark_verification_stale( | |
| 210 | + context=self.context, | |
| 211 | + summary=summary, | |
| 212 | + dod=dod, | |
| 213 | + tool_call=tool_call, | |
| 214 | + ) | |
| 194 | 215 | if tool_call.name == "TodoWrite" and outcome.registry_result is not None: |
| 195 | 216 | new_todos = outcome.registry_result.metadata.get("new_todos", []) |
| 196 | 217 | if isinstance(new_todos, list): |
@@ -198,3 +219,88 @@ class ToolBatchRunner: | ||
| 198 | 219 | self.dod_store.save(dod) |
| 199 | 220 | self.context.recovery_context = None |
| 200 | 221 | return None |
| 222 | + | |
| 223 | + | |
| 224 | +def _mark_verification_stale( | |
| 225 | + *, | |
| 226 | + context: RuntimeContext, | |
| 227 | + summary: TurnSummary, | |
| 228 | + dod: DefinitionOfDone, | |
| 229 | + tool_call: ToolCall, | |
| 230 | +) -> None: | |
| 231 | + detail = _stale_verification_detail(tool_call) | |
| 232 | + append_verification_timeline_entry( | |
| 233 | + context, | |
| 234 | + summary, | |
| 235 | + reason_code="verification_stale", | |
| 236 | + reason_summary="previous verification became stale after new mutating work", | |
| 237 | + evidence_summary=[f"fresh verification required after {detail}"], | |
| 238 | + evidence_provenance=_stale_verification_provenance(dod, detail=detail), | |
| 239 | + verification_observations=_stale_verification_observations( | |
| 240 | + dod, | |
| 241 | + detail=detail, | |
| 242 | + ), | |
| 243 | + ) | |
| 244 | + dod.last_verification_result = VerificationObservationStatus.STALE.value | |
| 245 | + dod.evidence = [] | |
| 246 | + while _VERIFY_ITEM in dod.completed_items: | |
| 247 | + dod.completed_items.remove(_VERIFY_ITEM) | |
| 248 | + if _VERIFY_ITEM not in dod.pending_items: | |
| 249 | + dod.pending_items.append(_VERIFY_ITEM) | |
| 250 | + | |
| 251 | + | |
| 252 | +def _stale_verification_observations( | |
| 253 | + dod: DefinitionOfDone, | |
| 254 | + *, | |
| 255 | + detail: str, | |
| 256 | +) -> list[VerificationObservation]: | |
| 257 | + return [ | |
| 258 | + VerificationObservation( | |
| 259 | + status=VerificationObservationStatus.STALE.value, | |
| 260 | + summary=f"verification became stale for `{command}` after new mutating work", | |
| 261 | + command=command, | |
| 262 | + kind="runtime", | |
| 263 | + detail=detail, | |
| 264 | + ) | |
| 265 | + for command in _stale_verification_commands(dod) | |
| 266 | + ] | |
| 267 | + | |
| 268 | + | |
| 269 | +def _stale_verification_provenance( | |
| 270 | + dod: DefinitionOfDone, | |
| 271 | + *, | |
| 272 | + detail: str, | |
| 273 | +) -> list[EvidenceProvenance]: | |
| 274 | + return [ | |
| 275 | + EvidenceProvenance( | |
| 276 | + category="verification", | |
| 277 | + source="tool_execution", | |
| 278 | + summary=f"fresh verification required for `{command}` after new mutating work", | |
| 279 | + status=EvidenceProvenanceStatus.MISSING.value, | |
| 280 | + subject=command, | |
| 281 | + detail=detail, | |
| 282 | + ) | |
| 283 | + for command in _stale_verification_commands(dod) | |
| 284 | + ] | |
| 285 | + | |
| 286 | + | |
| 287 | +def _stale_verification_commands(dod: DefinitionOfDone) -> list[str]: | |
| 288 | + commands = [command for command in dod.verification_commands if command] | |
| 289 | + if commands: | |
| 290 | + return commands | |
| 291 | + observed = [evidence.command for evidence in dod.evidence if evidence.command] | |
| 292 | + if observed: | |
| 293 | + return observed | |
| 294 | + return ["verification"] | |
| 295 | + | |
| 296 | + | |
| 297 | +def _stale_verification_detail(tool_call: ToolCall) -> str: | |
| 298 | + if tool_call.name in {"write", "edit", "patch"}: | |
| 299 | + file_path = str(tool_call.arguments.get("file_path", "")).strip() | |
| 300 | + if file_path: | |
| 301 | + return f"{tool_call.name} changed {file_path}" | |
| 302 | + if tool_call.name == "bash": | |
| 303 | + command = str(tool_call.arguments.get("command", "")).strip() | |
| 304 | + if command: | |
| 305 | + return f"bash ran `{command}`" | |
| 306 | + return f"{tool_call.name} changed the workspace" | |
src/loader/runtime/verification_observations.py (modified)
@@ -11,6 +11,7 @@ class VerificationObservationStatus(StrEnum):
| 11 | 11 | """How one verification observation resolved at runtime.""" |
| 12 | 12 | |
| 13 | 13 | PENDING = "pending" |
| 14 | + STALE = "stale" | |
| 14 | 15 | PASSED = "passed" |
| 15 | 16 | FAILED = "failed" |
| 16 | 17 | SKIPPED = "skipped" |
tests/test_completion_policy.py (modified)
@@ -219,6 +219,27 @@ def test_assess_completion_follow_through_surfaces_failing_verification() -> None:
| 219 | 219 | ] |
| 220 | 220 | |
| 221 | 221 | |
| 222 | +def test_assess_completion_follow_through_requires_fresh_verification_when_stale() -> None: | |
| 223 | + dod = create_definition_of_done("Run pytest -q and make sure it works.") | |
| 224 | + dod.verification_commands = ["pytest -q"] | |
| 225 | + dod.last_verification_result = "stale" | |
| 226 | + | |
| 227 | + check = assess_completion_follow_through( | |
| 228 | + task="Run pytest -q and make sure it works.", | |
| 229 | + response="The tests were already handled.", | |
| 230 | + actions_taken=["write: README.md"], | |
| 231 | + dod=dod, | |
| 232 | + ) | |
| 233 | + | |
| 234 | + assert check.is_complete is False | |
| 235 | + assert check.missing_evidence == [ | |
| 236 | + "a fresh passing verification result from `pytest -q` (previous verification became stale after new mutating work)" | |
| 237 | + ] | |
| 238 | + assert check.suggested_next_steps == [ | |
| 239 | + "Rerun `pytest -q` now that the implementation changed again" | |
| 240 | + ] | |
| 241 | + | |
| 242 | + | |
| 222 | 243 | def test_completion_assessment_attaches_typed_verification_provenance() -> None: |
| 223 | 244 | dod = create_definition_of_done("Run pytest -q and make sure it works.") |
| 224 | 245 | dod.verification_commands = ["pytest -q"] |
@@ -485,6 +506,53 @@ async def test_completion_policy_uses_missing_observed_verification_when_budget_ | ||
| 485 | 506 | assert events[0].type == "completion_check" |
| 486 | 507 | |
| 487 | 508 | |
| 509 | +@pytest.mark.asyncio | |
| 510 | +async def test_completion_policy_uses_stale_observed_verification_when_budget_is_exhausted( | |
| 511 | + temp_dir: Path, | |
| 512 | +) -> None: | |
| 513 | + context = build_context( | |
| 514 | + temp_dir, | |
| 515 | + safeguards=FakeSafeguards(), | |
| 516 | + max_continuation_prompts=1, | |
| 517 | + ) | |
| 518 | + policy = CompletionPolicy(context) | |
| 519 | + dod = create_definition_of_done("Run pytest -q and make sure it works.") | |
| 520 | + dod.verification_commands = ["pytest -q"] | |
| 521 | + dod.last_verification_result = "stale" | |
| 522 | + events = [] | |
| 523 | + | |
| 524 | + async def emit(event) -> None: | |
| 525 | + events.append(event) | |
| 526 | + | |
| 527 | + decision = await policy.maybe_continue_for_completion( | |
| 528 | + content="The tests were already handled.", | |
| 529 | + response_content="The tests were already handled.", | |
| 530 | + task="Run pytest -q and make sure it works.", | |
| 531 | + actions_taken=["write: README.md"], | |
| 532 | + continuation_count=1, | |
| 533 | + emit=emit, | |
| 534 | + dod=dod, | |
| 535 | + ) | |
| 536 | + | |
| 537 | + assert decision.should_continue is False | |
| 538 | + assert decision.should_finalize is True | |
| 539 | + assert decision.decision_code == "continuation_budget_exhausted" | |
| 540 | + assert decision.decision_summary == ( | |
| 541 | + "stopped because the continuation budget was exhausted while observed " | |
| 542 | + "verification still showed verification became stale for `pytest -q` " | |
| 543 | + "after new mutating work" | |
| 544 | + ) | |
| 545 | + assert decision.final_response == ( | |
| 546 | + "I stopped because the continuation budget was exhausted and observed " | |
| 547 | + "verification still showed: verification became stale for `pytest -q` " | |
| 548 | + "after new mutating work." | |
| 549 | + ) | |
| 550 | + assert [item.status for item in decision.verification_observations] == [ | |
| 551 | + VerificationObservationStatus.STALE.value | |
| 552 | + ] | |
| 553 | + assert events[0].type == "completion_check" | |
| 554 | + | |
| 555 | + | |
| 488 | 556 | @pytest.mark.asyncio |
| 489 | 557 | async def test_completion_policy_finalizes_when_budget_is_exhausted( |
| 490 | 558 | temp_dir: Path, |
tests/test_tool_batches.py (modified)
@@ -9,7 +9,11 @@ import pytest
| 9 | 9 | |
| 10 | 10 | from loader.llm.base import Message, Role, ToolCall |
| 11 | 11 | from loader.runtime.context import RuntimeContext |
| 12 | -from loader.runtime.dod import DefinitionOfDoneStore, create_definition_of_done | |
| 12 | +from loader.runtime.dod import ( | |
| 13 | + DefinitionOfDoneStore, | |
| 14 | + VerificationEvidence, | |
| 15 | + create_definition_of_done, | |
| 16 | +) | |
| 13 | 17 | from loader.runtime.events import AgentEvent, TurnSummary |
| 14 | 18 | from loader.runtime.executor import ToolExecutionOutcome, ToolExecutionState |
| 15 | 19 | from loader.runtime.permissions import ( |
@@ -32,10 +36,14 @@ from tests.helpers.runtime_harness import ScriptedBackend | ||
| 32 | 36 | class FakeSession: |
| 33 | 37 | def __init__(self, messages: list[Message]) -> None: |
| 34 | 38 | self.messages = list(messages) |
| 39 | + self.workflow_timeline = [] | |
| 35 | 40 | |
| 36 | 41 | def append(self, message: Message) -> None: |
| 37 | 42 | self.messages.append(message) |
| 38 | 43 | |
| 44 | + def append_workflow_timeline_entry(self, entry) -> None: | |
| 45 | + self.workflow_timeline.append(entry) | |
| 46 | + | |
| 39 | 47 | |
| 40 | 48 | class FakeCodeFilter: |
| 41 | 49 | def reset(self) -> None: |
@@ -327,3 +335,83 @@ async def test_tool_batch_runner_verifies_with_context_services(temp_dir: Path) | ||
| 327 | 335 | assert context.session.messages[-1].role == Role.TOOL |
| 328 | 336 | assert context.session.messages[-1].content == "file contents" |
| 329 | 337 | assert any(event.type == "verification" for event in events) |
| 338 | + | |
| 339 | + | |
| 340 | +@pytest.mark.asyncio | |
| 341 | +async def test_tool_batch_runner_marks_passed_verification_stale_after_new_mutation( | |
| 342 | + temp_dir: Path, | |
| 343 | +) -> None: | |
| 344 | + async def assess_confidence( | |
| 345 | + tool_name: str, | |
| 346 | + tool_args: dict, | |
| 347 | + context: str, | |
| 348 | + ) -> ConfidenceAssessment: | |
| 349 | + raise AssertionError("Confidence scoring should be disabled in this scenario") | |
| 350 | + | |
| 351 | + async def verify_action( | |
| 352 | + tool_name: str, | |
| 353 | + tool_args: dict, | |
| 354 | + result: str, | |
| 355 | + expected: str = "", | |
| 356 | + ) -> ActionVerification: | |
| 357 | + raise AssertionError("Verification should not run for this scenario") | |
| 358 | + | |
| 359 | + context = build_context( | |
| 360 | + temp_dir=temp_dir, | |
| 361 | + messages=[], | |
| 362 | + safeguards=FakeSafeguards(), | |
| 363 | + assess_confidence=assess_confidence, | |
| 364 | + verify_action=verify_action, | |
| 365 | + ) | |
| 366 | + runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir)) | |
| 367 | + tool_call = ToolCall( | |
| 368 | + id="write-1", | |
| 369 | + name="write", | |
| 370 | + arguments={"file_path": str(temp_dir / "README.md"), "content": "updated\n"}, | |
| 371 | + ) | |
| 372 | + executor = FakeExecutor( | |
| 373 | + [tool_outcome(tool_call=tool_call, output="wrote file", is_error=False)] | |
| 374 | + ) | |
| 375 | + summary = TurnSummary(final_response="") | |
| 376 | + dod = create_definition_of_done("Update README and verify it still works.") | |
| 377 | + dod.verification_commands = ["uv run pytest -q"] | |
| 378 | + dod.last_verification_result = "passed" | |
| 379 | + dod.evidence = [ | |
| 380 | + VerificationEvidence( | |
| 381 | + command="uv run pytest -q", | |
| 382 | + passed=True, | |
| 383 | + stdout="401 passed", | |
| 384 | + kind="test", | |
| 385 | + ) | |
| 386 | + ] | |
| 387 | + dod.completed_items.append("Collect verification evidence") | |
| 388 | + events: list[AgentEvent] = [] | |
| 389 | + | |
| 390 | + async def emit(event: AgentEvent) -> None: | |
| 391 | + events.append(event) | |
| 392 | + | |
| 393 | + await runner.execute_batch( | |
| 394 | + tool_calls=[tool_call], | |
| 395 | + tool_source="assistant", | |
| 396 | + pending_tool_calls_seen=set(), | |
| 397 | + emit=emit, | |
| 398 | + summary=summary, | |
| 399 | + dod=dod, | |
| 400 | + executor=executor, # type: ignore[arg-type] | |
| 401 | + on_confirmation=None, | |
| 402 | + on_user_question=None, | |
| 403 | + emit_confirmation=None, | |
| 404 | + consecutive_errors=0, | |
| 405 | + ) | |
| 406 | + | |
| 407 | + assert dod.last_verification_result == "stale" | |
| 408 | + assert dod.evidence == [] | |
| 409 | + assert "Collect verification evidence" in dod.pending_items | |
| 410 | + assert "Collect verification evidence" not in dod.completed_items | |
| 411 | + assert summary.workflow_timeline[-1].reason_code == "verification_stale" | |
| 412 | + assert summary.workflow_timeline[-1].policy_outcome == "stale" | |
| 413 | + assert summary.workflow_timeline[-1].verification_observations[0].status == "stale" | |
| 414 | + assert ( | |
| 415 | + summary.workflow_timeline[-1].verification_observations[0].command | |
| 416 | + == "uv run pytest -q" | |
| 417 | + ) | |