`96f060e`

Invalidate stale verification after new mutations

Authored by

espadonne 1 month ago

SHA: 96f060ef3f81960a9a5b59ea6dc2c50eedd499a4
Parents: c568344
Tree: 6d75ea2

6 changed files

Status	File	+	-
M	`src/loader/runtime/completion_policy.py`	3	0
M	`src/loader/runtime/task_completion.py`	64	0
M	`src/loader/runtime/tool_batches.py`	107	1
M	`src/loader/runtime/verification_observations.py`	1	0
M	`tests/test_completion_policy.py`	68	0
M	`tests/test_tool_batches.py`	89	1

src/loader/runtime/completion_policy.py (modified)

          for entry in verification_observations:
              if entry.status == VerificationObservationStatus.FAILED.value:
                  return CompletionPolicy._render_observation(entry)
 +        for entry in verification_observations:
 +            if entry.status == VerificationObservationStatus.STALE.value:
 +                return CompletionPolicy._render_observation(entry)
          for entry in verification_observations:
              if entry.status == VerificationObservationStatus.MISSING.value:
                  return CompletionPolicy._render_observation(entry)

src/loader/runtime/task_completion.py (modified)

      has_install_evidence: bool
      has_verification_evidence: bool
      has_failed_verification: bool
 +    has_stale_verification: bool
      verification_command: str | None
      pending_items: list[str]
      accomplished: list[str]
                  status=EvidenceProvenanceStatus.CONTRADICTS,
              ):
                  _append_unique_provenance(evidence_provenance, entry)
 +        elif facts.has_stale_verification:
 +            _append_follow_through_gap(
 +                missing_evidence,
 +                remaining,
 +                suggested_next_steps,
 +                evidence=_stale_verification_evidence(facts.verification_command),
 +                remaining_item="Rerun verification after the implementation changed again",
 +                next_step=_stale_verification_follow_up(facts.verification_command),
 +            )
 +            _append_unique_provenance(
 +                evidence_provenance,
 +                EvidenceProvenance(
 +                    category="verification",
 +                    source="dod.last_verification_result",
 +                    summary=(
 +                        "previous verification became stale for "
 +                        f"`{facts.verification_command}` after new mutating work"
 +                        if facts.verification_command
 +                        else "previous verification became stale after new mutating work"
 +                    ),
 +                    status=EvidenceProvenanceStatus.MISSING.value,
 +                    subject=facts.verification_command,
 +                ),
 +            )
          elif not facts.has_verification_evidence:
              _append_follow_through_gap(
                  missing_evidence,
      has_install_evidence = _has_install_evidence(task_lower, action_types, actions_taken)
      has_verification_evidence = _has_verification_evidence(action_types, actions_taken)
      has_failed_verification = False
 +    has_stale_verification = False
      verification_command: str | None = None
      pending_items: list[str] = []
              has_install_evidence=has_install_evidence,
              has_verification_evidence=has_verification_evidence,
              has_failed_verification=has_failed_verification,
 +            has_stale_verification=has_stale_verification,
              verification_command=verification_command,
              pending_items=pending_items,
              accomplished=accomplished,
          dod.last_verification_result == "failed"
          or any(not evidence.passed for evidence in dod.evidence)
      )
 +    has_stale_verification = dod.last_verification_result == "stale"
      has_recorded_work = has_recorded_work or bool(
          dod.touched_files
          or dod.successful_commands
          or dod.completed_items
          or has_verification_evidence
          or has_failed_verification
 +        or has_stale_verification
      )
      for evidence in dod.evidence:
          if not evidence.passed:
          has_install_evidence=has_install_evidence,
          has_verification_evidence=has_verification_evidence,
          has_failed_verification=has_failed_verification,
 +        has_stale_verification=has_stale_verification,
          verification_command=verification_command,
          pending_items=pending_items,
          accomplished=accomplished,
      return "a passing verification result (current verification is still failing)"
 +def _stale_verification_evidence(verification_command: str | None) -> str:
 +    if verification_command:
 +        return (
 +            f"a fresh passing verification result from `{verification_command}` "
 +            "(previous verification became stale after new mutating work)"
 +        )
 +    return "a fresh passing verification result after new mutating work"
++
++
  def _verification_follow_up(
      *,
      task_lower: str,
      return "Fix the failing verification result and rerun it"
 +def _stale_verification_follow_up(verification_command: str | None) -> str:
 +    if verification_command:
 +        return f"Rerun `{verification_command}` now that the implementation changed again"
 +    return "Rerun the relevant verification now that the implementation changed again"
++
++
  def _verification_provenance(
      *,
      dod: DefinitionOfDone | None,
              )
          return observations
 +    if dod.last_verification_result == VerificationObservationStatus.STALE.value:
 +        if verification_command:
 +            return [
 +                VerificationObservation(
 +                    status=VerificationObservationStatus.STALE.value,
 +                    summary=(
 +                        "verification became stale for "
 +                        f"`{verification_command}` after new mutating work"
 +                    ),
 +                    command=verification_command,
 +                )
 +            ]
 +        return [
 +            VerificationObservation(
 +                status=VerificationObservationStatus.STALE.value,
 +                summary="previous verification became stale after new mutating work",
 +            )
 +        ]
++
      if verification_command:
          return [
              VerificationObservation(

src/loader/runtime/tool_batches.py (modified)

  from ..llm.base import ToolCall
  from .context import RuntimeContext
 -from .dod import DefinitionOfDone, DefinitionOfDoneStore, record_successful_tool_call
 +from .dod import (
 +    DefinitionOfDone,
 +    DefinitionOfDoneStore,
 +    is_state_mutating_tool_call,
 +    record_successful_tool_call,
 +)
  from .events import AgentEvent, TurnSummary
 +from .evidence_provenance import EvidenceProvenance, EvidenceProvenanceStatus
  from .executor import ToolExecutionState, ToolExecutor
 +from .policy_timeline import append_verification_timeline_entry
  from .tool_batch_checks import ToolBatchConfidenceGate, ToolBatchVerificationGate
  from .tool_batch_recovery import ToolBatchRecoveryController
 +from .verification_observations import (
 +    VerificationObservation,
 +    VerificationObservationStatus,
 +)
  from .workflow import sync_todos_to_definition_of_done
  EventSink = Callable[[AgentEvent], Awaitable[None]]
  ConfirmationHandler = Callable[[str, str, str], Awaitable[bool]] | None
  UserQuestionHandler = Callable[[str, list[str] | None], Awaitable[str]] | None
 +_VERIFY_ITEM = "Collect verification evidence"
++
  @dataclass
  class ToolBatchResult:
      ) -> str | None:
          """Update DoD bookkeeping after a successful tool execution."""
 +        previously_verified = dod.last_verification_result == "passed"
          record_successful_tool_call(dod, tool_call)
 +        if previously_verified and is_state_mutating_tool_call(tool_call):
 +            _mark_verification_stale(
 +                context=self.context,
 +                summary=summary,
 +                dod=dod,
 +                tool_call=tool_call,
 +            )
          if tool_call.name == "TodoWrite" and outcome.registry_result is not None:
              new_todos = outcome.registry_result.metadata.get("new_todos", [])
              if isinstance(new_todos, list):
          self.dod_store.save(dod)
          self.context.recovery_context = None
          return None
++
++
 +def _mark_verification_stale(
 +    *,
 +    context: RuntimeContext,
 +    summary: TurnSummary,
 +    dod: DefinitionOfDone,
 +    tool_call: ToolCall,
 +) -> None:
 +    detail = _stale_verification_detail(tool_call)
 +    append_verification_timeline_entry(
 +        context,
 +        summary,
 +        reason_code="verification_stale",
 +        reason_summary="previous verification became stale after new mutating work",
 +        evidence_summary=[f"fresh verification required after {detail}"],
 +        evidence_provenance=_stale_verification_provenance(dod, detail=detail),
 +        verification_observations=_stale_verification_observations(
 +            dod,
 +            detail=detail,
 +        ),
 +    )
 +    dod.last_verification_result = VerificationObservationStatus.STALE.value
 +    dod.evidence = []
 +    while _VERIFY_ITEM in dod.completed_items:
 +        dod.completed_items.remove(_VERIFY_ITEM)
 +    if _VERIFY_ITEM not in dod.pending_items:
 +        dod.pending_items.append(_VERIFY_ITEM)
++
++
 +def _stale_verification_observations(
 +    dod: DefinitionOfDone,
 +    *,
 +    detail: str,
 +) -> list[VerificationObservation]:
 +    return [
 +        VerificationObservation(
 +            status=VerificationObservationStatus.STALE.value,
 +            summary=f"verification became stale for `{command}` after new mutating work",
 +            command=command,
 +            kind="runtime",
 +            detail=detail,
 +        )
 +        for command in _stale_verification_commands(dod)
 +    ]
++
++
 +def _stale_verification_provenance(
 +    dod: DefinitionOfDone,
 +    *,
 +    detail: str,
 +) -> list[EvidenceProvenance]:
 +    return [
 +        EvidenceProvenance(
 +            category="verification",
 +            source="tool_execution",
 +            summary=f"fresh verification required for `{command}` after new mutating work",
 +            status=EvidenceProvenanceStatus.MISSING.value,
 +            subject=command,
 +            detail=detail,
 +        )
 +        for command in _stale_verification_commands(dod)
 +    ]
++
++
 +def _stale_verification_commands(dod: DefinitionOfDone) -> list[str]:
 +    commands = [command for command in dod.verification_commands if command]
 +    if commands:
 +        return commands
 +    observed = [evidence.command for evidence in dod.evidence if evidence.command]
 +    if observed:
 +        return observed
 +    return ["verification"]
++
++
 +def _stale_verification_detail(tool_call: ToolCall) -> str:
 +    if tool_call.name in {"write", "edit", "patch"}:
 +        file_path = str(tool_call.arguments.get("file_path", "")).strip()
 +        if file_path:
 +            return f"{tool_call.name} changed {file_path}"
 +    if tool_call.name == "bash":
 +        command = str(tool_call.arguments.get("command", "")).strip()
 +        if command:
 +            return f"bash ran `{command}`"
 +    return f"{tool_call.name} changed the workspace"

src/loader/runtime/verification_observations.py (modified)

      """How one verification observation resolved at runtime."""
      PENDING = "pending"
 +    STALE = "stale"
      PASSED = "passed"
      FAILED = "failed"
      SKIPPED = "skipped"

tests/test_completion_policy.py (modified)

      ]
 +def test_assess_completion_follow_through_requires_fresh_verification_when_stale() -> None:
 +    dod = create_definition_of_done("Run pytest -q and make sure it works.")
 +    dod.verification_commands = ["pytest -q"]
 +    dod.last_verification_result = "stale"
++
 +    check = assess_completion_follow_through(
 +        task="Run pytest -q and make sure it works.",
 +        response="The tests were already handled.",
 +        actions_taken=["write: README.md"],
 +        dod=dod,
 +    )
++
 +    assert check.is_complete is False
 +    assert check.missing_evidence == [
 +        "a fresh passing verification result from `pytest -q` (previous verification became stale after new mutating work)"
 +    ]
 +    assert check.suggested_next_steps == [
 +        "Rerun `pytest -q` now that the implementation changed again"
 +    ]
++
++
  def test_completion_assessment_attaches_typed_verification_provenance() -> None:
      dod = create_definition_of_done("Run pytest -q and make sure it works.")
      dod.verification_commands = ["pytest -q"]
      assert events[0].type == "completion_check"
 +@pytest.mark.asyncio
 +async def test_completion_policy_uses_stale_observed_verification_when_budget_is_exhausted(
 +    temp_dir: Path,
 +) -> None:
 +    context = build_context(
 +        temp_dir,
 +        safeguards=FakeSafeguards(),
 +        max_continuation_prompts=1,
 +    )
 +    policy = CompletionPolicy(context)
 +    dod = create_definition_of_done("Run pytest -q and make sure it works.")
 +    dod.verification_commands = ["pytest -q"]
 +    dod.last_verification_result = "stale"
 +    events = []
++
 +    async def emit(event) -> None:
 +        events.append(event)
++
 +    decision = await policy.maybe_continue_for_completion(
 +        content="The tests were already handled.",
 +        response_content="The tests were already handled.",
 +        task="Run pytest -q and make sure it works.",
 +        actions_taken=["write: README.md"],
 +        continuation_count=1,
 +        emit=emit,
 +        dod=dod,
 +    )
++
 +    assert decision.should_continue is False
 +    assert decision.should_finalize is True
 +    assert decision.decision_code == "continuation_budget_exhausted"
 +    assert decision.decision_summary == (
 +        "stopped because the continuation budget was exhausted while observed "
 +        "verification still showed verification became stale for `pytest -q` "
 +        "after new mutating work"
 +    )
 +    assert decision.final_response == (
 +        "I stopped because the continuation budget was exhausted and observed "
 +        "verification still showed: verification became stale for `pytest -q` "
 +        "after new mutating work."
 +    )
 +    assert [item.status for item in decision.verification_observations] == [
 +        VerificationObservationStatus.STALE.value
 +    ]
 +    assert events[0].type == "completion_check"
++
++
  @pytest.mark.asyncio
  async def test_completion_policy_finalizes_when_budget_is_exhausted(
      temp_dir: Path,

tests/test_tool_batches.py (modified)

  from loader.llm.base import Message, Role, ToolCall
  from loader.runtime.context import RuntimeContext
 -from loader.runtime.dod import DefinitionOfDoneStore, create_definition_of_done
 +from loader.runtime.dod import (
 +    DefinitionOfDoneStore,
 +    VerificationEvidence,
 +    create_definition_of_done,
 +)
  from loader.runtime.events import AgentEvent, TurnSummary
  from loader.runtime.executor import ToolExecutionOutcome, ToolExecutionState
  from loader.runtime.permissions import (
  class FakeSession:
      def __init__(self, messages: list[Message]) -> None:
          self.messages = list(messages)
 +        self.workflow_timeline = []
      def append(self, message: Message) -> None:
          self.messages.append(message)
 +    def append_workflow_timeline_entry(self, entry) -> None:
 +        self.workflow_timeline.append(entry)
++
  class FakeCodeFilter:
      def reset(self) -> None:
      assert context.session.messages[-1].role == Role.TOOL
      assert context.session.messages[-1].content == "file contents"
      assert any(event.type == "verification" for event in events)
++
++
 +@pytest.mark.asyncio
 +async def test_tool_batch_runner_marks_passed_verification_stale_after_new_mutation(
 +    temp_dir: Path,
 +) -> None:
 +    async def assess_confidence(
 +        tool_name: str,
 +        tool_args: dict,
 +        context: str,
 +    ) -> ConfidenceAssessment:
 +        raise AssertionError("Confidence scoring should be disabled in this scenario")
++
 +    async def verify_action(
 +        tool_name: str,
 +        tool_args: dict,
 +        result: str,
 +        expected: str = "",
 +    ) -> ActionVerification:
 +        raise AssertionError("Verification should not run for this scenario")
++
 +    context = build_context(
 +        temp_dir=temp_dir,
 +        messages=[],
 +        safeguards=FakeSafeguards(),
 +        assess_confidence=assess_confidence,
 +        verify_action=verify_action,
 +    )
 +    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))
 +    tool_call = ToolCall(
 +        id="write-1",
 +        name="write",
 +        arguments={"file_path": str(temp_dir / "README.md"), "content": "updated\n"},
 +    )
 +    executor = FakeExecutor(
 +        [tool_outcome(tool_call=tool_call, output="wrote file", is_error=False)]
 +    )
 +    summary = TurnSummary(final_response="")
 +    dod = create_definition_of_done("Update README and verify it still works.")
 +    dod.verification_commands = ["uv run pytest -q"]
 +    dod.last_verification_result = "passed"
 +    dod.evidence = [
 +        VerificationEvidence(
 +            command="uv run pytest -q",
 +            passed=True,
 +            stdout="401 passed",
 +            kind="test",
 +        )
 +    ]
 +    dod.completed_items.append("Collect verification evidence")
 +    events: list[AgentEvent] = []
++
 +    async def emit(event: AgentEvent) -> None:
 +        events.append(event)
++
 +    await runner.execute_batch(
 +        tool_calls=[tool_call],
 +        tool_source="assistant",
 +        pending_tool_calls_seen=set(),
 +        emit=emit,
 +        summary=summary,
 +        dod=dod,
 +        executor=executor,  # type: ignore[arg-type]
 +        on_confirmation=None,
 +        on_user_question=None,
 +        emit_confirmation=None,
 +        consecutive_errors=0,
 +    )
++
 +    assert dod.last_verification_result == "stale"
 +    assert dod.evidence == []
 +    assert "Collect verification evidence" in dod.pending_items
 +    assert "Collect verification evidence" not in dod.completed_items
 +    assert summary.workflow_timeline[-1].reason_code == "verification_stale"
 +    assert summary.workflow_timeline[-1].policy_outcome == "stale"
 +    assert summary.workflow_timeline[-1].verification_observations[0].status == "stale"
 +    assert (
 +        summary.workflow_timeline[-1].verification_observations[0].command
 +        == "uv run pytest -q"
 +    )