| 1 | """Tests for completion-policy helpers.""" |
| 2 | |
| 3 | from __future__ import annotations |
| 4 | |
| 5 | from pathlib import Path |
| 6 | from types import SimpleNamespace |
| 7 | |
| 8 | import pytest |
| 9 | |
| 10 | from loader.llm.base import Message, Role |
| 11 | from loader.runtime.completion_policy import CompletionPolicy |
| 12 | from loader.runtime.context import RuntimeContext |
| 13 | from loader.runtime.dod import VerificationEvidence, create_definition_of_done |
| 14 | from loader.runtime.events import TurnSummary |
| 15 | from loader.runtime.evidence_provenance import EvidenceProvenanceStatus |
| 16 | from loader.runtime.permissions import ( |
| 17 | PermissionMode, |
| 18 | build_permission_policy, |
| 19 | load_permission_rules, |
| 20 | ) |
| 21 | from loader.runtime.task_completion import ( |
| 22 | assess_completion_follow_through, |
| 23 | assess_completion_follow_through_with_provenance, |
| 24 | detect_premature_completion, |
| 25 | get_continuation_prompt, |
| 26 | ) |
| 27 | from loader.runtime.verification_observations import ( |
| 28 | VerificationObservationStatus, |
| 29 | verification_attempt_id, |
| 30 | ) |
| 31 | from loader.tools.base import create_default_registry |
| 32 | from tests.helpers.runtime_harness import ScriptedBackend |
| 33 | |
| 34 | |
| 35 | class FakeCodeFilter: |
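    """No-op stand-in for the safeguards code filter; only a reset() hook is needed."""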
    def reset(self) -> None:
        return None


class FakeSafeguards:
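    """Pass-through safeguards double with a configurable text-loop verdict."""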
    def __init__(self, *, text_loop: tuple[bool, str] = (False, "")) -> None:
        self.action_tracker = object()
        self.validator = object()
        self.code_filter = FakeCodeFilter()
        self._text_loop = text_loop
        self.recorded: list[str] = []

    def filter_stream_chunk(self, content: str) -> str:
        return content

    def filter_complete_content(self, content: str) -> str:
        return content

    def should_steer(self) -> bool:
        return False

    def get_steering_message(self) -> str | None:
        return None

    def record_response(self, content: str) -> None:
        self.recorded.append(content)

    def detect_text_loop(self, content: str) -> tuple[bool, str]:
        return self._text_loop

    def detect_loop(self) -> tuple[bool, str]:
        return False, ""


class FakeSession:
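    """Minimal session double that only records appended messages."""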
    def __init__(self) -> None:
        self.messages: list[Message] = []

    def append(self, message: Message) -> None:
        self.messages.append(message)


def build_context(
    temp_dir: Path,
    *,
    safeguards: FakeSafeguards,
    max_continuation_prompts: int = 5,
    use_quick_completion: bool = True,
) -> RuntimeContext:
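    """Build a RuntimeContext backed by fakes so CompletionPolicy can be tested in isolation."""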
    registry = create_default_registry(temp_dir)
    registry.configure_workspace_root(temp_dir)
    rule_status = load_permission_rules(temp_dir)
    policy = build_permission_policy(
        active_mode=PermissionMode.WORKSPACE_WRITE,
        workspace_root=temp_dir,
        tool_requirements=registry.get_tool_requirements(),
        rules=rule_status.rules,
    )
    return RuntimeContext(
        project_root=temp_dir,
        backend=ScriptedBackend(),
        registry=registry,
        session=FakeSession(),  # type: ignore[arg-type]
        config=SimpleNamespace(
            force_react=False,
            reasoning=SimpleNamespace(
                max_continuation_prompts=max_continuation_prompts,
                use_quick_completion=use_quick_completion,
            ),
        ),
        capability_profile=SimpleNamespace(supports_native_tools=True),  # type: ignore[arg-type]
        project_context=None,
        permission_policy=policy,
        permission_config_status=rule_status,
        workflow_mode="execute",
        safeguards=safeguards,
    )


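# Helper-level tests: the pure completion-assessment functions, no RuntimeContext needed.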
def test_completion_policy_finalize_response_text_keeps_original_response() -> None:
    response = CompletionPolicy.finalize_response_text(
        content="Inspected the file successfully.",
        actions_taken=["read: README.md"],
    )

    assert response == "Inspected the file successfully."


def test_detect_premature_completion_respects_explicit_done_without_actions() -> None:
    assert detect_premature_completion(
        "Explain how Loader works.",
        "Done.",
        [],
    ) is False


def test_get_continuation_prompt_surfaces_missing_verification_steps() -> None:
    prompt = get_continuation_prompt(
        "Create the script and test that it works.",
        ["write: script.py"],
        "The script has been created.",
    )

    assert "Continue with" in prompt
    assert "run the relevant tests" in prompt.lower() or "verify" in prompt.lower()


def test_assess_completion_follow_through_tracks_missing_evidence() -> None:
    check = assess_completion_follow_through(
        task="Create the script and test that it works.",
        response="The script has been created.",
        actions_taken=["write: script.py"],
    )

    assert check.is_complete is False
    assert "showing the requested work was actually carried out" in check.required_evidence
    assert "showing the result was run or verified" in check.required_evidence
    assert check.missing_evidence == ["showing the result was run or verified"]
    assert check.suggested_next_steps == [
        "Execute what you created or run the relevant tests now"
    ]


def test_assess_completion_follow_through_accepts_informational_tasks() -> None:
    check = assess_completion_follow_through(
        task="Explain how Loader's workflow timeline works.",
        response="Loader records workflow decisions and policy events in a timeline.",
        actions_taken=[],
    )

    assert check.is_complete is True
    assert check.required_evidence == []
    assert check.missing_evidence == []


def test_assess_completion_follow_through_uses_passing_verification_evidence() -> None:
    dod = create_definition_of_done("Run pytest -q and make sure it works.")
    dod.verification_commands = ["pytest -q"]
    dod.evidence = [
        VerificationEvidence(
            command="pytest -q",
            passed=True,
            stdout="342 passed",
            kind="test",
        )
    ]
    dod.last_verification_result = "passed"

    check = assess_completion_follow_through(
        task="Run pytest -q and make sure it works.",
        response="The test suite passed.",
        actions_taken=[],
        dod=dod,
    )

    assert check.is_complete is True
    assert check.missing_evidence == []
    assert "verified: pytest -q" in check.accomplished


def test_assess_completion_follow_through_surfaces_failing_verification() -> None:
    dod = create_definition_of_done("Run pytest -q and make sure it works.")
    dod.verification_commands = ["pytest -q"]
    dod.evidence = [
        VerificationEvidence(
            command="pytest -q",
            passed=False,
            stderr="1 failed",
            kind="test",
        )
    ]
    dod.last_verification_result = "failed"

    check = assess_completion_follow_through(
        task="Run pytest -q and make sure it works.",
        response="The tests are done.",
        actions_taken=[],
        dod=dod,
    )

    assert check.is_complete is False
    assert check.missing_evidence == [
        "a passing verification result from `pytest -q` (current verification is still failing)"
    ]
    assert check.suggested_next_steps == [
        "Fix the failing `pytest -q` result and rerun it"
    ]


def test_assess_completion_follow_through_surfaces_planned_verification() -> None:
    dod = create_definition_of_done("Run pytest -q and make sure it works.")
    dod.verification_commands = ["pytest -q"]
    dod.last_verification_result = "planned"

    check = assess_completion_follow_through(
        task="Run pytest -q and make sure it works.",
        response="The tests are next.",
        actions_taken=["write: README.md"],
        dod=dod,
    )

    assert check.is_complete is False
    assert check.missing_evidence == [
        "a passing verification result from `pytest -q` (verification is planned but has not run yet)"
    ]
    assert check.suggested_next_steps == ["Run the planned verification `pytest -q` now"]


def test_assess_completion_follow_through_surfaces_pending_verification() -> None:
    dod = create_definition_of_done("Run pytest -q and make sure it works.")
    dod.verification_commands = ["pytest -q"]
    dod.last_verification_result = "pending"

    check = assess_completion_follow_through(
        task="Run pytest -q and make sure it works.",
        response="Verification is underway.",
        actions_taken=["write: README.md"],
        dod=dod,
    )

    assert check.is_complete is False
    assert check.missing_evidence == [
        "a completed passing verification result from `pytest -q` (verification is still pending)"
    ]
    assert check.suggested_next_steps == [
        "Finish running `pytest -q` and capture the result"
    ]


def test_assess_completion_follow_through_requires_fresh_verification_when_stale() -> None:
    dod = create_definition_of_done("Run pytest -q and make sure it works.")
    dod.verification_commands = ["pytest -q"]
    dod.last_verification_result = "stale"
    dod.verification_attempt_counter = 2
    dod.active_verification_attempt_id = verification_attempt_id(2)
    dod.active_verification_attempt_number = 2

    check = assess_completion_follow_through(
        task="Run pytest -q and make sure it works.",
        response="The tests were already handled.",
        actions_taken=["write: README.md"],
        dod=dod,
    )

    assert check.is_complete is False
    assert check.missing_evidence == [
        "a fresh passing verification result from `pytest -q` (previous verification became stale after new mutating work)"
    ]
    assert check.suggested_next_steps == [
        "Rerun `pytest -q` now that the implementation changed again"
    ]


def test_completion_assessment_projects_superseded_verification_attempt_for_stale_result() -> None:
    dod = create_definition_of_done("Run pytest -q and make sure it works.")
    dod.verification_commands = ["pytest -q"]
    dod.last_verification_result = "stale"
    dod.verification_attempt_counter = 2
    dod.active_verification_attempt_id = verification_attempt_id(2)
    dod.active_verification_attempt_number = 2

    assessment = assess_completion_follow_through_with_provenance(
        task="Run pytest -q and make sure it works.",
        response="The tests were already handled.",
        actions_taken=["write: README.md"],
        dod=dod,
    )

    assert [item.status for item in assessment.verification_observations] == [
        VerificationObservationStatus.STALE.value
    ]
    assert assessment.verification_observations[0].attempt_id == verification_attempt_id(1)
    assert assessment.verification_observations[0].attempt_number == 1
    assert assessment.verification_observations[0].supersedes_attempt_id == (
        verification_attempt_id(2)
    )


def test_completion_assessment_attaches_typed_verification_provenance() -> None:
    dod = create_definition_of_done("Run pytest -q and make sure it works.")
    dod.verification_commands = ["pytest -q"]
    dod.evidence = [
        VerificationEvidence(
            command="pytest -q",
            passed=False,
            stderr="1 failed",
            kind="test",
        )
    ]
    dod.last_verification_result = "failed"

    assessment = assess_completion_follow_through_with_provenance(
        task="Run pytest -q and make sure it works.",
        response="The tests are done.",
        actions_taken=[],
        dod=dod,
    )

    assert assessment.check.is_complete is False
    assert [item.status for item in assessment.evidence_provenance] == [
        EvidenceProvenanceStatus.CONTRADICTS.value
    ]
    assert assessment.evidence_provenance[0].summary == "verification failed for `pytest -q`"


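# CompletionPolicy tests driven through a RuntimeContext built with the fakes above.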
@pytest.mark.asyncio
async def test_completion_policy_stops_for_text_loop_using_runtime_context(
    temp_dir: Path,
) -> None:
    context = build_context(
        temp_dir,
        safeguards=FakeSafeguards(text_loop=(True, "assistant repeated the same summary")),
    )
    policy = CompletionPolicy(context)
    summary = TurnSummary(final_response="")
    events = []

    async def emit(event) -> None:
        events.append(event)

    decision = await policy.maybe_stop_for_text_loop(
        content="Same summary again.",
        emit=emit,
        summary=summary,
    )

    assert decision.should_stop is True
    assert decision.decision_code == "text_loop_bailout"
    assert decision.decision_summary == (
        "stopped after detecting a repeated text loop"
    )
    assert summary.final_response == (
        "I stopped because I was repeating myself and couldn't make further progress."
    )
    assert summary.assistant_messages[-1].role == Role.ASSISTANT
    assert context.session.messages[-1].content == summary.final_response
    assert events[0].type == "error"
    assert events[1].type == "response"


@pytest.mark.asyncio
async def test_completion_policy_requests_continuation_using_runtime_context(
    temp_dir: Path,
) -> None:
    context = build_context(
        temp_dir,
        safeguards=FakeSafeguards(),
    )
    policy = CompletionPolicy(context)
    events = []

    async def emit(event) -> None:
        events.append(event)

    decision = await policy.maybe_continue_for_completion(
        content="I can handle that.",
        response_content="I can handle that.",
        task="Create the file and verify it works.",
        actions_taken=[],
        continuation_count=0,
        emit=emit,
    )

    assert decision.should_continue is True
    assert decision.decision_code == "premature_completion_nudge"
    assert decision.decision_summary == (
        "requested one continuation because the non-mutating response looked incomplete"
    )
    assert decision.completion_check is not None
    assert decision.completion_check.missing_evidence == [
        "showing the requested work was actually carried out",
        "showing the result was run or verified",
    ]
    assert context.session.messages[-2] == Message(
        role=Role.ASSISTANT,
        content="I can handle that.",
    )
    assert context.session.messages[-1].role == Role.USER
    assert "verify it works" in context.session.messages[-1].content.lower()
    assert events[0].type == "completion_check"
    assert events[0].completion_check is not None
    assert events[0].completion_check.missing_evidence == [
        "showing the requested work was actually carried out",
        "showing the result was run or verified",
    ]
    assert [item.status for item in decision.evidence_provenance] == [
        EvidenceProvenanceStatus.MISSING.value,
        EvidenceProvenanceStatus.MISSING.value,
    ]


@pytest.mark.asyncio
async def test_completion_policy_accepts_passed_verification_from_dod(
    temp_dir: Path,
) -> None:
    context = build_context(
        temp_dir,
        safeguards=FakeSafeguards(),
    )
    policy = CompletionPolicy(context)
    dod = create_definition_of_done("Run pytest -q and make sure it works.")
    dod.verification_commands = ["pytest -q"]
    dod.evidence = [
        VerificationEvidence(
            command="pytest -q",
            passed=True,
            stdout="342 passed",
            kind="test",
        )
    ]
    dod.last_verification_result = "passed"
    events = []

    async def emit(event) -> None:
        events.append(event)

    decision = await policy.maybe_continue_for_completion(
        content="The tests passed.",
        response_content="The tests passed.",
        task="Run pytest -q and make sure it works.",
        actions_taken=[],
        continuation_count=0,
        emit=emit,
        dod=dod,
    )

    assert decision.should_continue is False
    assert decision.should_finalize is False
    assert decision.decision_code == "completion_response_accepted"
    assert decision.completion_check is not None
    assert decision.completion_check.missing_evidence == []
    assert events == []
    assert [item.summary for item in decision.evidence_provenance] == [
        "verification passed for `pytest -q`"
    ]


@pytest.mark.asyncio
async def test_completion_policy_finalizes_with_concrete_failed_verification_gap(
    temp_dir: Path,
) -> None:
    context = build_context(
        temp_dir,
        safeguards=FakeSafeguards(),
        max_continuation_prompts=1,
    )
    policy = CompletionPolicy(context)
    dod = create_definition_of_done("Run pytest -q and make sure it works.")
    dod.verification_commands = ["pytest -q"]
    dod.evidence = [
        VerificationEvidence(
            command="pytest -q",
            passed=False,
            stderr="1 failed",
            kind="test",
        )
    ]
    dod.last_verification_result = "failed"
    dod.verification_attempt_counter = 2
    dod.active_verification_attempt_id = verification_attempt_id(2)
    dod.active_verification_attempt_number = 2
    events = []

    async def emit(event) -> None:
        events.append(event)

    decision = await policy.maybe_continue_for_completion(
        content="The tests are done.",
        response_content="The tests are done.",
        task="Run pytest -q and make sure it works.",
        actions_taken=[],
        continuation_count=1,
        emit=emit,
        dod=dod,
    )

    assert decision.should_continue is False
    assert decision.should_finalize is True
    assert decision.decision_code == "continuation_budget_exhausted"
    assert decision.decision_summary == (
        "stopped because the continuation budget was exhausted while observed "
        "verification still showed verification failed for `pytest -q` "
        "[1 failed; attempt 2]"
    )
    assert decision.completion_check is not None
    assert decision.completion_check.missing_evidence == [
        "a passing verification result from `pytest -q` (current verification is still failing)"
    ]
    assert decision.final_response == (
        "I stopped because the continuation budget was exhausted and observed "
        "verification still showed: verification failed for `pytest -q` "
        "[1 failed; attempt 2]."
    )
    assert events[0].type == "completion_check"
    assert [item.status for item in decision.evidence_provenance] == [
        EvidenceProvenanceStatus.CONTRADICTS.value
    ]
    assert [item.status for item in decision.verification_observations] == [
        VerificationObservationStatus.FAILED.value
    ]
    assert decision.verification_observations[0].attempt_number == 2


@pytest.mark.asyncio
async def test_completion_policy_uses_missing_observed_verification_when_budget_is_exhausted(
    temp_dir: Path,
) -> None:
    context = build_context(
        temp_dir,
        safeguards=FakeSafeguards(),
        max_continuation_prompts=1,
    )
    policy = CompletionPolicy(context)
    dod = create_definition_of_done("Run pytest -q and make sure it works.")
    dod.verification_commands = ["pytest -q"]
    dod.last_verification_result = "failed"
    dod.verification_attempt_counter = 3
    dod.active_verification_attempt_id = verification_attempt_id(3)
    dod.active_verification_attempt_number = 3
    events = []

    async def emit(event) -> None:
        events.append(event)

    decision = await policy.maybe_continue_for_completion(
        content="The tests are done.",
        response_content="The tests are done.",
        task="Run pytest -q and make sure it works.",
        actions_taken=[],
        continuation_count=1,
        emit=emit,
        dod=dod,
    )

    assert decision.should_continue is False
    assert decision.should_finalize is True
    assert decision.decision_code == "continuation_budget_exhausted"
    assert decision.decision_summary == (
        "stopped because the continuation budget was exhausted while observed "
        "verification still showed verification did not produce an observed "
        "result for `pytest -q` [attempt 3]"
    )
    assert decision.final_response == (
        "I stopped because the continuation budget was exhausted and observed "
        "verification still showed: verification did not produce an observed "
        "result for `pytest -q` [attempt 3]."
    )
    assert [item.status for item in decision.verification_observations] == [
        VerificationObservationStatus.MISSING.value
    ]
    assert decision.verification_observations[0].attempt_number == 3
    assert events[0].type == "completion_check"


@pytest.mark.asyncio
async def test_completion_policy_uses_pending_observed_verification_when_budget_is_exhausted(
    temp_dir: Path,
) -> None:
    context = build_context(
        temp_dir,
        safeguards=FakeSafeguards(),
        max_continuation_prompts=1,
    )
    policy = CompletionPolicy(context)
    dod = create_definition_of_done("Run pytest -q and make sure it works.")
    dod.verification_commands = ["pytest -q"]
    dod.last_verification_result = "pending"
    dod.verification_attempt_counter = 4
    dod.active_verification_attempt_id = verification_attempt_id(4)
    dod.active_verification_attempt_number = 4
    events = []

    async def emit(event) -> None:
        events.append(event)

    decision = await policy.maybe_continue_for_completion(
        content="Verification is underway.",
        response_content="Verification is underway.",
        task="Run pytest -q and make sure it works.",
        actions_taken=["write: README.md"],
        continuation_count=1,
        emit=emit,
        dod=dod,
    )

    assert decision.should_continue is False
    assert decision.should_finalize is True
    assert decision.decision_code == "continuation_budget_exhausted"
    assert decision.decision_summary == (
        "stopped because the continuation budget was exhausted while observed "
        "verification still showed verification pending for `pytest -q` [attempt 4]"
    )
    assert decision.final_response == (
        "I stopped because the continuation budget was exhausted and observed "
        "verification still showed: verification pending for `pytest -q` [attempt 4]."
    )
    assert [item.status for item in decision.verification_observations] == [
        VerificationObservationStatus.PENDING.value
    ]
    assert decision.verification_observations[0].attempt_number == 4
    assert events[0].type == "completion_check"


@pytest.mark.asyncio
async def test_completion_policy_uses_stale_observed_verification_when_budget_is_exhausted(
    temp_dir: Path,
) -> None:
    context = build_context(
        temp_dir,
        safeguards=FakeSafeguards(),
        max_continuation_prompts=1,
    )
    policy = CompletionPolicy(context)
    dod = create_definition_of_done("Run pytest -q and make sure it works.")
    dod.verification_commands = ["pytest -q"]
    dod.last_verification_result = "stale"
    dod.verification_attempt_counter = 2
    dod.active_verification_attempt_id = verification_attempt_id(2)
    dod.active_verification_attempt_number = 2
    events = []

    async def emit(event) -> None:
        events.append(event)

    decision = await policy.maybe_continue_for_completion(
        content="The tests were already handled.",
        response_content="The tests were already handled.",
        task="Run pytest -q and make sure it works.",
        actions_taken=["write: README.md"],
        continuation_count=1,
        emit=emit,
        dod=dod,
    )

    assert decision.should_continue is False
    assert decision.should_finalize is True
    assert decision.decision_code == "continuation_budget_exhausted"
    assert decision.decision_summary == (
        "stopped because the continuation budget was exhausted while observed "
        "verification still showed verification became stale for `pytest -q` "
        "after new mutating work [attempt 1 -> attempt 2]"
    )
    assert decision.final_response == (
        "I stopped because the continuation budget was exhausted and observed "
        "verification still showed: verification became stale for `pytest -q` "
        "after new mutating work [attempt 1 -> attempt 2]."
    )
    assert [item.status for item in decision.verification_observations] == [
        VerificationObservationStatus.STALE.value
    ]
    assert decision.verification_observations[0].attempt_number == 1
    assert decision.verification_observations[0].supersedes_attempt_id == (
        verification_attempt_id(2)
    )
    assert events[0].type == "completion_check"


@pytest.mark.asyncio
async def test_completion_policy_finalizes_when_budget_is_exhausted(
    temp_dir: Path,
) -> None:
    context = build_context(
        temp_dir,
        safeguards=FakeSafeguards(),
        max_continuation_prompts=1,
    )
    policy = CompletionPolicy(context)
    events = []

    async def emit(event) -> None:
        events.append(event)

    decision = await policy.maybe_continue_for_completion(
        content="I looked into it.",
        response_content="I looked into it.",
        task="Fix the README heading.",
        actions_taken=[],
        continuation_count=1,
        emit=emit,
    )

    assert decision.should_continue is False
    assert decision.should_finalize is True
    assert decision.decision_code == "continuation_budget_exhausted"
    assert decision.completion_check is not None
    assert decision.completion_check.missing_evidence == [
        "showing the requested work was actually carried out"
    ]
    assert "Missing evidence" in decision.final_response
    assert decision.verification_observations == []
    assert events[0].type == "completion_check"