"""Tests for completion-policy helpers.""" from __future__ import annotations from pathlib import Path from types import SimpleNamespace import pytest from loader.llm.base import Message, Role, ToolCall from loader.runtime.completion_policy import CompletionPolicy from loader.runtime.context import RuntimeContext from loader.runtime.dod import VerificationEvidence, create_definition_of_done from loader.runtime.events import TurnSummary from loader.runtime.evidence_provenance import EvidenceProvenanceStatus from loader.runtime.permissions import ( PermissionMode, build_permission_policy, load_permission_rules, ) from loader.runtime.task_completion import ( assess_completion_follow_through, assess_completion_follow_through_with_provenance, detect_premature_completion, get_continuation_prompt, ) from loader.runtime.workflow import advance_todos_from_tool_call, sync_todos_to_definition_of_done from loader.runtime.verification_observations import ( VerificationObservationStatus, verification_attempt_id, ) from loader.tools.base import create_default_registry from tests.helpers.runtime_harness import ScriptedBackend class FakeCodeFilter: def reset(self) -> None: return None class FakeSafeguards: def __init__(self, *, text_loop: tuple[bool, str] = (False, "")) -> None: self.action_tracker = object() self.validator = object() self.code_filter = FakeCodeFilter() self._text_loop = text_loop self.recorded: list[str] = [] def filter_stream_chunk(self, content: str) -> str: return content def filter_complete_content(self, content: str) -> str: return content def should_steer(self) -> bool: return False def get_steering_message(self) -> str | None: return None def record_response(self, content: str) -> None: self.recorded.append(content) def detect_text_loop(self, content: str) -> tuple[bool, str]: return self._text_loop def detect_loop(self) -> tuple[bool, str]: return False, "" class FakeSession: def __init__(self) -> None: self.messages: list[Message] = [] def append(self, message: Message) -> None: self.messages.append(message) def build_context( temp_dir: Path, *, safeguards: FakeSafeguards, max_continuation_prompts: int = 5, use_quick_completion: bool = True, ) -> RuntimeContext: registry = create_default_registry(temp_dir) registry.configure_workspace_root(temp_dir) rule_status = load_permission_rules(temp_dir) policy = build_permission_policy( active_mode=PermissionMode.WORKSPACE_WRITE, workspace_root=temp_dir, tool_requirements=registry.get_tool_requirements(), rules=rule_status.rules, ) return RuntimeContext( project_root=temp_dir, backend=ScriptedBackend(), registry=registry, session=FakeSession(), # type: ignore[arg-type] config=SimpleNamespace( force_react=False, reasoning=SimpleNamespace( max_continuation_prompts=max_continuation_prompts, use_quick_completion=use_quick_completion, ), ), capability_profile=SimpleNamespace(supports_native_tools=True), # type: ignore[arg-type] project_context=None, permission_policy=policy, permission_config_status=rule_status, workflow_mode="execute", safeguards=safeguards, ) def test_completion_policy_finalize_response_text_keeps_original_response() -> None: response = CompletionPolicy.finalize_response_text( content="Inspected the file successfully.", actions_taken=["read: README.md"], ) assert response == "Inspected the file successfully." 


def test_detect_premature_completion_respects_explicit_done_without_actions() -> None:
    assert detect_premature_completion(
        "Explain how Loader works.",
        "Done.",
        [],
    ) is False


def test_get_continuation_prompt_surfaces_missing_verification_steps() -> None:
    prompt = get_continuation_prompt(
        "Create the script and test that it works.",
        ["write: script.py"],
        "The script has been created.",
    )
    assert "Continue with" in prompt
    assert "run the relevant tests" in prompt.lower() or "verify" in prompt.lower()


def test_assess_completion_follow_through_tracks_missing_evidence() -> None:
    check = assess_completion_follow_through(
        task="Create the script and test that it works.",
        response="The script has been created.",
        actions_taken=["write: script.py"],
    )
    assert check.is_complete is False
    assert "showing the requested work was actually carried out" in check.required_evidence
    assert "showing the result was run or verified" in check.required_evidence
    assert check.missing_evidence == ["showing the result was run or verified"]
    assert check.suggested_next_steps == [
        "Execute what you created or run the relevant tests now"
    ]


def test_assess_completion_follow_through_accepts_informational_tasks() -> None:
    check = assess_completion_follow_through(
        task="Explain how Loader's workflow timeline works.",
        response="Loader records workflow decisions and policy events in a timeline.",
        actions_taken=[],
    )
    assert check.is_complete is True
    assert check.required_evidence == []
    assert check.missing_evidence == []


def test_assess_completion_follow_through_uses_passing_verification_evidence() -> None:
    dod = create_definition_of_done("Run pytest -q and make sure it works.")
    dod.verification_commands = ["pytest -q"]
    dod.evidence = [
        VerificationEvidence(
            command="pytest -q",
            passed=True,
            stdout="342 passed",
            kind="test",
        )
    ]
    dod.last_verification_result = "passed"
    check = assess_completion_follow_through(
        task="Run pytest -q and make sure it works.",
        response="The test suite passed.",
        actions_taken=[],
        dod=dod,
    )
    assert check.is_complete is True
    assert check.missing_evidence == []
    assert "verified: pytest -q" in check.accomplished


def test_assess_completion_follow_through_surfaces_failing_verification() -> None:
    dod = create_definition_of_done("Run pytest -q and make sure it works.")
    dod.verification_commands = ["pytest -q"]
    dod.evidence = [
        VerificationEvidence(
            command="pytest -q",
            passed=False,
            stderr="1 failed",
            kind="test",
        )
    ]
    dod.last_verification_result = "failed"
    check = assess_completion_follow_through(
        task="Run pytest -q and make sure it works.",
        response="The tests are done.",
        actions_taken=[],
        dod=dod,
    )
    assert check.is_complete is False
    assert check.missing_evidence == [
        "a passing verification result from `pytest -q` (current verification is still failing)"
    ]
    assert check.suggested_next_steps == [
        "Fix the failing `pytest -q` result and rerun it"
    ]


def test_assess_completion_follow_through_surfaces_planned_verification() -> None:
    dod = create_definition_of_done("Run pytest -q and make sure it works.")
    dod.verification_commands = ["pytest -q"]
    dod.last_verification_result = "planned"
    check = assess_completion_follow_through(
        task="Run pytest -q and make sure it works.",
        response="The tests are next.",
        actions_taken=["write: README.md"],
        dod=dod,
    )
    assert check.is_complete is False
    assert check.missing_evidence == [
        "a passing verification result from `pytest -q` (verification is planned but has not run yet)"
    ]
    assert check.suggested_next_steps == ["Run the planned verification `pytest -q` now"]


def test_assess_completion_follow_through_surfaces_pending_verification() -> None:
    dod = create_definition_of_done("Run pytest -q and make sure it works.")
    dod.verification_commands = ["pytest -q"]
    dod.last_verification_result = "pending"
    check = assess_completion_follow_through(
        task="Run pytest -q and make sure it works.",
        response="Verification is underway.",
        actions_taken=["write: README.md"],
        dod=dod,
    )
    assert check.is_complete is False
    assert check.missing_evidence == [
        "a completed passing verification result from `pytest -q` (verification is still pending)"
    ]
    assert check.suggested_next_steps == [
        "Finish running `pytest -q` and capture the result"
    ]


def test_assess_completion_follow_through_requires_fresh_verification_when_stale() -> None:
    dod = create_definition_of_done("Run pytest -q and make sure it works.")
    dod.verification_commands = ["pytest -q"]
    dod.last_verification_result = "stale"
    dod.verification_attempt_counter = 2
    dod.active_verification_attempt_id = verification_attempt_id(2)
    dod.active_verification_attempt_number = 2
    check = assess_completion_follow_through(
        task="Run pytest -q and make sure it works.",
        response="The tests were already handled.",
        actions_taken=["write: README.md"],
        dod=dod,
    )
    assert check.is_complete is False
    assert check.missing_evidence == [
        "a fresh passing verification result from `pytest -q` (previous verification became stale after new mutating work)"
    ]
    assert check.suggested_next_steps == [
        "Rerun `pytest -q` now that the implementation changed again"
    ]


def test_completion_assessment_projects_superseded_verification_attempt_for_stale_result() -> None:
    dod = create_definition_of_done("Run pytest -q and make sure it works.")
    dod.verification_commands = ["pytest -q"]
    dod.last_verification_result = "stale"
    dod.verification_attempt_counter = 2
    dod.active_verification_attempt_id = verification_attempt_id(2)
    dod.active_verification_attempt_number = 2
    assessment = assess_completion_follow_through_with_provenance(
        task="Run pytest -q and make sure it works.",
        response="The tests were already handled.",
        actions_taken=["write: README.md"],
        dod=dod,
    )
    assert [item.status for item in assessment.verification_observations] == [
        VerificationObservationStatus.STALE.value
    ]
    assert assessment.verification_observations[0].attempt_id == verification_attempt_id(1)
    assert assessment.verification_observations[0].attempt_number == 1
    assert assessment.verification_observations[0].supersedes_attempt_id == (
        verification_attempt_id(2)
    )


def test_completion_assessment_attaches_typed_verification_provenance() -> None:
    dod = create_definition_of_done("Run pytest -q and make sure it works.")
    dod.verification_commands = ["pytest -q"]
    dod.evidence = [
        VerificationEvidence(
            command="pytest -q",
            passed=False,
            stderr="1 failed",
            kind="test",
        )
    ]
    dod.last_verification_result = "failed"
    assessment = assess_completion_follow_through_with_provenance(
        task="Run pytest -q and make sure it works.",
        response="The tests are done.",
        actions_taken=[],
        dod=dod,
    )
    assert assessment.check.is_complete is False
    assert [item.status for item in assessment.evidence_provenance] == [
        EvidenceProvenanceStatus.CONTRADICTS.value
    ]
    assert assessment.evidence_provenance[0].summary == "verification failed for `pytest -q`"


def test_completion_assessment_uses_advanced_todo_progress_for_next_step() -> None:
    dod = create_definition_of_done("Fix the chapter links in index.html.")
    sync_todos_to_definition_of_done(
        dod,
        [
            {
                "content": "First, examine the current index.html file to understand its structure",
                "active_form": "Working on: First, examine the current index.html file to understand its structure",
                "status": "pending",
            },
            {
                "content": "List and read all HTML files in the chapters directory to extract chapter information",
                "active_form": "Working on: List and read all HTML files in the chapters directory to extract chapter information",
                "status": "pending",
            },
            {
                "content": "Parse chapter titles from each HTML file",
                "active_form": "Working on: Parse chapter titles from each HTML file",
                "status": "pending",
            },
            {
                "content": "Update index.html with correct chapter links and titles",
                "active_form": "Working on: Update index.html with correct chapter links and titles",
                "status": "pending",
            },
        ],
    )
    advance_todos_from_tool_call(
        dod,
        ToolCall(
            id="read-index",
            name="read",
            arguments={"file_path": "/tmp/fortran/index.html"},
        ),
    )
    advance_todos_from_tool_call(
        dod,
        ToolCall(
            id="glob-chapters",
            name="glob",
            arguments={"path": "/tmp/fortran/chapters", "pattern": "*.html"},
        ),
    )
    advance_todos_from_tool_call(
        dod,
        ToolCall(
            id="read-chapter",
            name="read",
            arguments={"file_path": "/tmp/fortran/chapters/01-introduction.html"},
        ),
    )
    assessment = assess_completion_follow_through_with_provenance(
        task="Update /tmp/fortran/index.html so every chapter link is correct.",
        response="I'll update the index.html file with the correct chapter links and titles.",
        actions_taken=[
            "read: {'file_path': '/tmp/fortran/index.html'}",
            "glob: {'path': '/tmp/fortran/chapters', 'pattern': '*.html'}",
            "read: {'file_path': '/tmp/fortran/chapters/01-introduction.html'}",
        ],
        dod=dod,
    )
    assert assessment.check.missing_evidence[0] == (
        "completion of tracked work items "
        "(Update index.html with correct chapter links and titles)"
    )
    assert assessment.check.suggested_next_steps[0] == (
        "Complete the tracked item: Update index.html with correct chapter links and titles"
    )


@pytest.mark.asyncio
async def test_completion_policy_stops_for_text_loop_using_runtime_context(
    temp_dir: Path,
) -> None:
    context = build_context(
        temp_dir,
        safeguards=FakeSafeguards(text_loop=(True, "assistant repeated the same summary")),
    )
    policy = CompletionPolicy(context)
    summary = TurnSummary(final_response="")
    events = []

    async def emit(event) -> None:
        events.append(event)

    decision = await policy.maybe_stop_for_text_loop(
        content="Same summary again.",
        emit=emit,
        summary=summary,
    )
    assert decision.should_stop is True
    assert decision.decision_code == "text_loop_bailout"
    assert decision.decision_summary == (
        "stopped after detecting a repeated text loop"
    )
    assert summary.final_response == (
        "I stopped because I was repeating myself and couldn't make further progress."
    )
    assert summary.assistant_messages[-1].role == Role.ASSISTANT
    assert context.session.messages[-1].content == summary.final_response
    assert events[0].type == "error"
    assert events[1].type == "response"


@pytest.mark.asyncio
async def test_completion_policy_requests_continuation_using_runtime_context(
    temp_dir: Path,
) -> None:
    context = build_context(
        temp_dir,
        safeguards=FakeSafeguards(),
    )
    policy = CompletionPolicy(context)
    events = []

    async def emit(event) -> None:
        events.append(event)

    decision = await policy.maybe_continue_for_completion(
        content="I can handle that.",
        response_content="I can handle that.",
        task="Create the file and verify it works.",
        actions_taken=[],
        continuation_count=0,
        emit=emit,
    )
    assert decision.should_continue is True
    assert decision.decision_code == "premature_completion_nudge"
    assert decision.decision_summary == (
        "requested one continuation because the non-mutating response looked incomplete"
    )
    assert decision.completion_check is not None
    assert decision.completion_check.missing_evidence == [
        "showing the requested work was actually carried out",
        "showing the result was run or verified",
    ]
    assert context.session.messages[-2] == Message(
        role=Role.ASSISTANT,
        content="I can handle that.",
    )
    assert context.session.messages[-1].role == Role.USER
    assert "verify it works" in context.session.messages[-1].content.lower()
    assert events[0].type == "completion_check"
    assert events[0].completion_check is not None
    assert events[0].completion_check.missing_evidence == [
        "showing the requested work was actually carried out",
        "showing the result was run or verified",
    ]
    assert [item.status for item in decision.evidence_provenance] == [
        EvidenceProvenanceStatus.MISSING.value,
        EvidenceProvenanceStatus.MISSING.value,
    ]


@pytest.mark.asyncio
async def test_completion_policy_accepts_passed_verification_from_dod(
    temp_dir: Path,
) -> None:
    context = build_context(
        temp_dir,
        safeguards=FakeSafeguards(),
    )
    policy = CompletionPolicy(context)
    dod = create_definition_of_done("Run pytest -q and make sure it works.")
    dod.verification_commands = ["pytest -q"]
    dod.evidence = [
        VerificationEvidence(
            command="pytest -q",
            passed=True,
            stdout="342 passed",
            kind="test",
        )
    ]
    dod.last_verification_result = "passed"
    events = []

    async def emit(event) -> None:
        events.append(event)

    decision = await policy.maybe_continue_for_completion(
        content="The tests passed.",
        response_content="The tests passed.",
        task="Run pytest -q and make sure it works.",
        actions_taken=[],
        continuation_count=0,
        emit=emit,
        dod=dod,
    )
    assert decision.should_continue is False
    assert decision.should_finalize is False
    assert decision.decision_code == "completion_response_accepted"
    assert decision.completion_check is not None
    assert decision.completion_check.missing_evidence == []
    assert events == []
    assert [item.summary for item in decision.evidence_provenance] == [
        "verification passed for `pytest -q`"
    ]


@pytest.mark.asyncio
async def test_completion_policy_finalizes_with_concrete_failed_verification_gap(
    temp_dir: Path,
) -> None:
    context = build_context(
        temp_dir,
        safeguards=FakeSafeguards(),
        max_continuation_prompts=1,
    )
    policy = CompletionPolicy(context)
    dod = create_definition_of_done("Run pytest -q and make sure it works.")
    dod.verification_commands = ["pytest -q"]
    dod.evidence = [
        VerificationEvidence(
            command="pytest -q",
            passed=False,
            stderr="1 failed",
            kind="test",
        )
    ]
    dod.last_verification_result = "failed"
    dod.verification_attempt_counter = 2
    dod.active_verification_attempt_id = verification_attempt_id(2)
    dod.active_verification_attempt_number = 2
    events = []

    async def emit(event) -> None:
        events.append(event)

    decision = await policy.maybe_continue_for_completion(
        content="The tests are done.",
        response_content="The tests are done.",
        task="Run pytest -q and make sure it works.",
        actions_taken=[],
        continuation_count=1,
        emit=emit,
        dod=dod,
    )
    assert decision.should_continue is False
    assert decision.should_finalize is True
    assert decision.decision_code == "continuation_budget_exhausted"
    assert decision.decision_summary == (
        "stopped because the continuation budget was exhausted while observed "
        "verification still showed verification failed for `pytest -q` "
        "[1 failed; attempt 2]"
    )
    assert decision.completion_check is not None
    assert decision.completion_check.missing_evidence == [
        "a passing verification result from `pytest -q` (current verification is still failing)"
    ]
    assert decision.final_response == (
        "I stopped because the continuation budget was exhausted and observed "
        "verification still showed: verification failed for `pytest -q` "
        "[1 failed; attempt 2]."
    )
    assert events[0].type == "completion_check"
    assert [item.status for item in decision.evidence_provenance] == [
        EvidenceProvenanceStatus.CONTRADICTS.value
    ]
    assert [item.status for item in decision.verification_observations] == [
        VerificationObservationStatus.FAILED.value
    ]
    assert decision.verification_observations[0].attempt_number == 2


@pytest.mark.asyncio
async def test_completion_policy_uses_missing_observed_verification_when_budget_is_exhausted(
    temp_dir: Path,
) -> None:
    context = build_context(
        temp_dir,
        safeguards=FakeSafeguards(),
        max_continuation_prompts=1,
    )
    policy = CompletionPolicy(context)
    dod = create_definition_of_done("Run pytest -q and make sure it works.")
    dod.verification_commands = ["pytest -q"]
    dod.last_verification_result = "failed"
    dod.verification_attempt_counter = 3
    dod.active_verification_attempt_id = verification_attempt_id(3)
    dod.active_verification_attempt_number = 3
    events = []

    async def emit(event) -> None:
        events.append(event)

    decision = await policy.maybe_continue_for_completion(
        content="The tests are done.",
        response_content="The tests are done.",
        task="Run pytest -q and make sure it works.",
        actions_taken=[],
        continuation_count=1,
        emit=emit,
        dod=dod,
    )
    assert decision.should_continue is False
    assert decision.should_finalize is True
    assert decision.decision_code == "continuation_budget_exhausted"
    assert decision.decision_summary == (
        "stopped because the continuation budget was exhausted while observed "
        "verification still showed verification did not produce an observed "
        "result for `pytest -q` [attempt 3]"
    )
    assert decision.final_response == (
        "I stopped because the continuation budget was exhausted and observed "
        "verification still showed: verification did not produce an observed "
        "result for `pytest -q` [attempt 3]."
    )
    assert [item.status for item in decision.verification_observations] == [
        VerificationObservationStatus.MISSING.value
    ]
    assert decision.verification_observations[0].attempt_number == 3
    assert events[0].type == "completion_check"


@pytest.mark.asyncio
async def test_completion_policy_uses_pending_observed_verification_when_budget_is_exhausted(
    temp_dir: Path,
) -> None:
    context = build_context(
        temp_dir,
        safeguards=FakeSafeguards(),
        max_continuation_prompts=1,
    )
    policy = CompletionPolicy(context)
    dod = create_definition_of_done("Run pytest -q and make sure it works.")
    dod.verification_commands = ["pytest -q"]
    dod.last_verification_result = "pending"
    dod.verification_attempt_counter = 4
    dod.active_verification_attempt_id = verification_attempt_id(4)
    dod.active_verification_attempt_number = 4
    events = []

    async def emit(event) -> None:
        events.append(event)

    decision = await policy.maybe_continue_for_completion(
        content="Verification is underway.",
        response_content="Verification is underway.",
        task="Run pytest -q and make sure it works.",
        actions_taken=["write: README.md"],
        continuation_count=1,
        emit=emit,
        dod=dod,
    )
    assert decision.should_continue is False
    assert decision.should_finalize is True
    assert decision.decision_code == "continuation_budget_exhausted"
    assert decision.decision_summary == (
        "stopped because the continuation budget was exhausted while observed "
        "verification still showed verification pending for `pytest -q` [attempt 4]"
    )
    assert decision.final_response == (
        "I stopped because the continuation budget was exhausted and observed "
        "verification still showed: verification pending for `pytest -q` [attempt 4]."
    )
    assert [item.status for item in decision.verification_observations] == [
        VerificationObservationStatus.PENDING.value
    ]
    assert decision.verification_observations[0].attempt_number == 4
    assert events[0].type == "completion_check"


@pytest.mark.asyncio
async def test_completion_policy_uses_stale_observed_verification_when_budget_is_exhausted(
    temp_dir: Path,
) -> None:
    context = build_context(
        temp_dir,
        safeguards=FakeSafeguards(),
        max_continuation_prompts=1,
    )
    policy = CompletionPolicy(context)
    dod = create_definition_of_done("Run pytest -q and make sure it works.")
    dod.verification_commands = ["pytest -q"]
    dod.last_verification_result = "stale"
    dod.verification_attempt_counter = 2
    dod.active_verification_attempt_id = verification_attempt_id(2)
    dod.active_verification_attempt_number = 2
    events = []

    async def emit(event) -> None:
        events.append(event)

    decision = await policy.maybe_continue_for_completion(
        content="The tests were already handled.",
        response_content="The tests were already handled.",
        task="Run pytest -q and make sure it works.",
        actions_taken=["write: README.md"],
        continuation_count=1,
        emit=emit,
        dod=dod,
    )
    assert decision.should_continue is False
    assert decision.should_finalize is True
    assert decision.decision_code == "continuation_budget_exhausted"
    assert decision.decision_summary == (
        "stopped because the continuation budget was exhausted while observed "
        "verification still showed verification became stale for `pytest -q` "
        "after new mutating work [attempt 1 -> attempt 2]"
    )
    assert decision.final_response == (
        "I stopped because the continuation budget was exhausted and observed "
        "verification still showed: verification became stale for `pytest -q` "
        "after new mutating work [attempt 1 -> attempt 2]."
    )
    assert [item.status for item in decision.verification_observations] == [
        VerificationObservationStatus.STALE.value
    ]
    assert decision.verification_observations[0].attempt_number == 1
    assert decision.verification_observations[0].supersedes_attempt_id == (
        verification_attempt_id(2)
    )
    assert events[0].type == "completion_check"


@pytest.mark.asyncio
async def test_completion_policy_finalizes_when_budget_is_exhausted(
    temp_dir: Path,
) -> None:
    context = build_context(
        temp_dir,
        safeguards=FakeSafeguards(),
        max_continuation_prompts=1,
    )
    policy = CompletionPolicy(context)
    events = []

    async def emit(event) -> None:
        events.append(event)

    decision = await policy.maybe_continue_for_completion(
        content="I looked into it.",
        response_content="I looked into it.",
        task="Fix the README heading.",
        actions_taken=[],
        continuation_count=1,
        emit=emit,
    )
    assert decision.should_continue is False
    assert decision.should_finalize is True
    assert decision.decision_code == "continuation_budget_exhausted"
    assert decision.completion_check is not None
    assert decision.completion_check.missing_evidence == [
        "showing the requested work was actually carried out"
    ]
    assert "Missing evidence" in decision.final_response
    assert decision.verification_observations == []
    assert events[0].type == "completion_check"