| 1 | """Tests for completion-policy helpers.""" |
| 2 | |
| 3 | from __future__ import annotations |
| 4 | |
| 5 | from pathlib import Path |
| 6 | from types import SimpleNamespace |
| 7 | |
| 8 | import pytest |
| 9 | |
| 10 | from loader.llm.base import Message, Role, ToolCall |
| 11 | from loader.runtime.completion_policy import CompletionPolicy |
| 12 | from loader.runtime.context import RuntimeContext |
| 13 | from loader.runtime.dod import VerificationEvidence, create_definition_of_done |
| 14 | from loader.runtime.events import TurnSummary |
| 15 | from loader.runtime.evidence_provenance import EvidenceProvenanceStatus |
| 16 | from loader.runtime.permissions import ( |
| 17 | PermissionMode, |
| 18 | build_permission_policy, |
| 19 | load_permission_rules, |
| 20 | ) |
| 21 | from loader.runtime.task_completion import ( |
| 22 | assess_completion_follow_through, |
| 23 | assess_completion_follow_through_with_provenance, |
| 24 | detect_premature_completion, |
| 25 | get_continuation_prompt, |
| 26 | ) |
| 27 | from loader.runtime.workflow import advance_todos_from_tool_call, sync_todos_to_definition_of_done |
| 28 | from loader.runtime.verification_observations import ( |
| 29 | VerificationObservationStatus, |
| 30 | verification_attempt_id, |
| 31 | ) |
| 32 | from loader.tools.base import create_default_registry |
| 33 | from tests.helpers.runtime_harness import ScriptedBackend |
| 34 | |
| 35 | |
class FakeCodeFilter:
    """Stateless stand-in for the runtime code filter."""

    def reset(self) -> None:
        """No-op: the fake holds no state that could be cleared."""
        return None
| 39 | |
| 40 | |
class FakeSafeguards:
    """Safeguards test double: pass-through filters plus a scripted text-loop verdict."""

    def __init__(self, *, text_loop: tuple[bool, str] = (False, "")) -> None:
        # Attribute surface that RuntimeContext consumers expect to exist.
        self.action_tracker = object()
        self.validator = object()
        self.code_filter = FakeCodeFilter()
        # Responses recorded via record_response, in call order.
        self.recorded: list[str] = []
        self._text_loop = text_loop

    def filter_stream_chunk(self, content: str) -> str:
        """Pass streamed content through unchanged."""
        return content

    def filter_complete_content(self, content: str) -> str:
        """Pass completed content through unchanged."""
        return content

    def should_steer(self) -> bool:
        """Never request steering."""
        return False

    def get_steering_message(self) -> str | None:
        """No steering message is ever available."""
        return None

    def record_response(self, content: str) -> None:
        """Remember *content* so tests can inspect what was recorded."""
        self.recorded.append(content)

    def detect_text_loop(self, content: str) -> tuple[bool, str]:
        """Return the scripted verdict regardless of *content*."""
        return self._text_loop

    def detect_loop(self) -> tuple[bool, str]:
        """Report that no action loop was detected."""
        return (False, "")
| 69 | |
| 70 | |
class FakeSession:
    """Conversation stub that simply accumulates appended messages."""

    def __init__(self) -> None:
        # Messages land here in exactly the order they were appended.
        self.messages: list[Message] = []

    def append(self, message: Message) -> None:
        """Record *message* without any processing."""
        self.messages += [message]
| 77 | |
| 78 | |
def build_context(
    temp_dir: Path,
    *,
    safeguards: FakeSafeguards,
    max_continuation_prompts: int = 5,
    use_quick_completion: bool = True,
) -> RuntimeContext:
    """Assemble a RuntimeContext mixing real policy plumbing with fakes.

    The tool registry and permission policy are built for real against
    *temp_dir* (workspace-write mode), while backend, session, config, and
    capability profile are lightweight stand-ins so CompletionPolicy can be
    exercised in isolation.
    """
    registry = create_default_registry(temp_dir)
    registry.configure_workspace_root(temp_dir)
    # Real permission rules/policy so tool requirements behave as in production.
    rule_status = load_permission_rules(temp_dir)
    policy = build_permission_policy(
        active_mode=PermissionMode.WORKSPACE_WRITE,
        workspace_root=temp_dir,
        tool_requirements=registry.get_tool_requirements(),
        rules=rule_status.rules,
    )
    return RuntimeContext(
        project_root=temp_dir,
        backend=ScriptedBackend(),
        registry=registry,
        session=FakeSession(),  # type: ignore[arg-type]
        # SimpleNamespace mimics only the config fields CompletionPolicy reads.
        config=SimpleNamespace(
            force_react=False,
            reasoning=SimpleNamespace(
                max_continuation_prompts=max_continuation_prompts,
                use_quick_completion=use_quick_completion,
            ),
        ),
        capability_profile=SimpleNamespace(supports_native_tools=True),  # type: ignore[arg-type]
        project_context=None,
        permission_policy=policy,
        permission_config_status=rule_status,
        workflow_mode="execute",
        safeguards=safeguards,
    )
| 114 | |
| 115 | |
def test_completion_policy_finalize_response_text_keeps_original_response() -> None:
    """finalize_response_text must not rewrite a response that already reads well."""
    original = "Inspected the file successfully."
    finalized = CompletionPolicy.finalize_response_text(
        content=original,
        actions_taken=["read: README.md"],
    )

    assert finalized == original
| 123 | |
| 124 | |
def test_detect_premature_completion_respects_explicit_done_without_actions() -> None:
    """An informational task answered with a bare "Done." is not premature."""
    flagged = detect_premature_completion("Explain how Loader works.", "Done.", [])

    assert flagged is False
| 131 | |
| 132 | |
def test_get_continuation_prompt_surfaces_missing_verification_steps() -> None:
    """The continuation prompt should nudge toward the unverified test step."""
    prompt = get_continuation_prompt(
        "Create the script and test that it works.",
        ["write: script.py"],
        "The script has been created.",
    )
    lowered = prompt.lower()

    assert "Continue with" in prompt
    assert "run the relevant tests" in lowered or "verify" in lowered
| 142 | |
| 143 | |
def test_assess_completion_follow_through_tracks_missing_evidence() -> None:
    """Writing a script without running it leaves verification evidence missing."""
    outcome = assess_completion_follow_through(
        task="Create the script and test that it works.",
        response="The script has been created.",
        actions_taken=["write: script.py"],
    )

    assert outcome.is_complete is False
    for requirement in (
        "showing the requested work was actually carried out",
        "showing the result was run or verified",
    ):
        assert requirement in outcome.required_evidence
    assert outcome.missing_evidence == ["showing the result was run or verified"]
    assert outcome.suggested_next_steps == [
        "Execute what you created or run the relevant tests now"
    ]
| 158 | |
| 159 | |
def test_assess_completion_follow_through_accepts_informational_tasks() -> None:
    """Purely informational tasks need no actions or evidence to be complete."""
    outcome = assess_completion_follow_through(
        task="Explain how Loader's workflow timeline works.",
        response="Loader records workflow decisions and policy events in a timeline.",
        actions_taken=[],
    )

    assert outcome.is_complete is True
    assert outcome.required_evidence == []
    assert outcome.missing_evidence == []
| 170 | |
| 171 | |
def test_assess_completion_follow_through_uses_passing_verification_evidence() -> None:
    """Recorded passing verification lets a non-mutating response count as done."""
    done = create_definition_of_done("Run pytest -q and make sure it works.")
    done.verification_commands = ["pytest -q"]
    done.evidence = [
        VerificationEvidence(command="pytest -q", passed=True, stdout="342 passed", kind="test")
    ]
    done.last_verification_result = "passed"

    outcome = assess_completion_follow_through(
        task="Run pytest -q and make sure it works.",
        response="The test suite passed.",
        actions_taken=[],
        dod=done,
    )

    assert outcome.is_complete is True
    assert outcome.missing_evidence == []
    assert "verified: pytest -q" in outcome.accomplished
| 195 | |
| 196 | |
def test_assess_completion_follow_through_surfaces_failing_verification() -> None:
    """A failed verification run blocks completion and demands a rerun."""
    done = create_definition_of_done("Run pytest -q and make sure it works.")
    done.verification_commands = ["pytest -q"]
    done.evidence = [
        VerificationEvidence(command="pytest -q", passed=False, stderr="1 failed", kind="test")
    ]
    done.last_verification_result = "failed"

    outcome = assess_completion_follow_through(
        task="Run pytest -q and make sure it works.",
        response="The tests are done.",
        actions_taken=[],
        dod=done,
    )

    assert outcome.is_complete is False
    assert outcome.missing_evidence == [
        "a passing verification result from `pytest -q` (current verification is still failing)"
    ]
    assert outcome.suggested_next_steps == [
        "Fix the failing `pytest -q` result and rerun it"
    ]
| 224 | |
| 225 | |
def test_assess_completion_follow_through_surfaces_planned_verification() -> None:
    """Verification that is merely planned is not enough to close the task."""
    done = create_definition_of_done("Run pytest -q and make sure it works.")
    done.verification_commands = ["pytest -q"]
    done.last_verification_result = "planned"

    outcome = assess_completion_follow_through(
        task="Run pytest -q and make sure it works.",
        response="The tests are next.",
        actions_taken=["write: README.md"],
        dod=done,
    )

    assert outcome.is_complete is False
    assert outcome.missing_evidence == [
        "a passing verification result from `pytest -q` (verification is planned but has not run yet)"
    ]
    assert outcome.suggested_next_steps == ["Run the planned verification `pytest -q` now"]
| 243 | |
| 244 | |
def test_assess_completion_follow_through_surfaces_pending_verification() -> None:
    """An in-flight verification must finish before the task can be complete."""
    done = create_definition_of_done("Run pytest -q and make sure it works.")
    done.verification_commands = ["pytest -q"]
    done.last_verification_result = "pending"

    outcome = assess_completion_follow_through(
        task="Run pytest -q and make sure it works.",
        response="Verification is underway.",
        actions_taken=["write: README.md"],
        dod=done,
    )

    assert outcome.is_complete is False
    assert outcome.missing_evidence == [
        "a completed passing verification result from `pytest -q` (verification is still pending)"
    ]
    assert outcome.suggested_next_steps == [
        "Finish running `pytest -q` and capture the result"
    ]
| 264 | |
| 265 | |
def test_assess_completion_follow_through_requires_fresh_verification_when_stale() -> None:
    """Verification made stale by new mutating work must be rerun."""
    done = create_definition_of_done("Run pytest -q and make sure it works.")
    done.verification_commands = ["pytest -q"]
    done.last_verification_result = "stale"
    done.verification_attempt_counter = 2
    done.active_verification_attempt_number = 2
    done.active_verification_attempt_id = verification_attempt_id(2)

    outcome = assess_completion_follow_through(
        task="Run pytest -q and make sure it works.",
        response="The tests were already handled.",
        actions_taken=["write: README.md"],
        dod=done,
    )

    assert outcome.is_complete is False
    assert outcome.missing_evidence == [
        "a fresh passing verification result from `pytest -q` (previous verification became stale after new mutating work)"
    ]
    assert outcome.suggested_next_steps == [
        "Rerun `pytest -q` now that the implementation changed again"
    ]
| 288 | |
| 289 | |
def test_completion_assessment_projects_superseded_verification_attempt_for_stale_result() -> None:
    """A stale result projects the prior attempt as superseded observation data."""
    done = create_definition_of_done("Run pytest -q and make sure it works.")
    done.verification_commands = ["pytest -q"]
    done.last_verification_result = "stale"
    done.verification_attempt_counter = 2
    done.active_verification_attempt_number = 2
    done.active_verification_attempt_id = verification_attempt_id(2)

    assessment = assess_completion_follow_through_with_provenance(
        task="Run pytest -q and make sure it works.",
        response="The tests were already handled.",
        actions_taken=["write: README.md"],
        dod=done,
    )

    assert [item.status for item in assessment.verification_observations] == [
        VerificationObservationStatus.STALE.value
    ]
    observation = assessment.verification_observations[0]
    assert observation.attempt_id == verification_attempt_id(1)
    assert observation.attempt_number == 1
    assert observation.supersedes_attempt_id == verification_attempt_id(2)
| 313 | |
| 314 | |
def test_completion_assessment_attaches_typed_verification_provenance() -> None:
    """Failed evidence surfaces as CONTRADICTS provenance on the assessment."""
    done = create_definition_of_done("Run pytest -q and make sure it works.")
    done.verification_commands = ["pytest -q"]
    done.evidence = [
        VerificationEvidence(command="pytest -q", passed=False, stderr="1 failed", kind="test")
    ]
    done.last_verification_result = "failed"

    assessment = assess_completion_follow_through_with_provenance(
        task="Run pytest -q and make sure it works.",
        response="The tests are done.",
        actions_taken=[],
        dod=done,
    )

    assert assessment.check.is_complete is False
    assert [item.status for item in assessment.evidence_provenance] == [
        EvidenceProvenanceStatus.CONTRADICTS.value
    ]
    assert assessment.evidence_provenance[0].summary == "verification failed for `pytest -q`"
| 340 | |
| 341 | |
def test_completion_assessment_uses_advanced_todo_progress_for_next_step() -> None:
    """The remaining tracked todo drives the missing evidence and next step."""
    dod = create_definition_of_done("Fix the chapter links in index.html.")
    steps = [
        "First, examine the current index.html file to understand its structure",
        "List and read all HTML files in the chapters directory to extract chapter information",
        "Parse chapter titles from each HTML file",
        "Update index.html with correct chapter links and titles",
    ]
    sync_todos_to_definition_of_done(
        dod,
        [
            {
                "content": step,
                "active_form": f"Working on: {step}",
                "status": "pending",
            }
            for step in steps
        ],
    )
    # Replay the tool calls that complete the first three tracked items.
    tool_calls = [
        ("read-index", "read", {"file_path": "/tmp/fortran/index.html"}),
        ("glob-chapters", "glob", {"path": "/tmp/fortran/chapters", "pattern": "*.html"}),
        ("read-chapter", "read", {"file_path": "/tmp/fortran/chapters/01-introduction.html"}),
    ]
    for call_id, tool_name, arguments in tool_calls:
        advance_todos_from_tool_call(
            dod,
            ToolCall(id=call_id, name=tool_name, arguments=arguments),
        )

    assessment = assess_completion_follow_through_with_provenance(
        task="Update /tmp/fortran/index.html so every chapter link is correct.",
        response="I'll update the index.html file with the correct chapter links and titles.",
        actions_taken=[
            "read: {'file_path': '/tmp/fortran/index.html'}",
            "glob: {'path': '/tmp/fortran/chapters', 'pattern': '*.html'}",
            "read: {'file_path': '/tmp/fortran/chapters/01-introduction.html'}",
        ],
        dod=dod,
    )

    assert assessment.check.missing_evidence[0] == (
        "completion of tracked work items "
        "(Update index.html with correct chapter links and titles)"
    )
    assert assessment.check.suggested_next_steps[0] == (
        "Complete the tracked item: Update index.html with correct chapter links and titles"
    )
| 412 | |
| 413 | |
@pytest.mark.asyncio
async def test_completion_policy_stops_for_text_loop_using_runtime_context(
    temp_dir: Path,
) -> None:
    """A detected text loop yields a bailout decision plus error/response events."""
    context = build_context(
        temp_dir,
        safeguards=FakeSafeguards(text_loop=(True, "assistant repeated the same summary")),
    )
    summary = TurnSummary(final_response="")
    captured: list = []

    async def capture(event) -> None:
        captured.append(event)

    decision = await CompletionPolicy(context).maybe_stop_for_text_loop(
        content="Same summary again.",
        emit=capture,
        summary=summary,
    )

    assert decision.should_stop is True
    assert decision.decision_code == "text_loop_bailout"
    assert decision.decision_summary == "stopped after detecting a repeated text loop"
    assert summary.final_response == (
        "I stopped because I was repeating myself and couldn't make further progress."
    )
    assert summary.assistant_messages[-1].role == Role.ASSISTANT
    assert context.session.messages[-1].content == summary.final_response
    assert captured[0].type == "error"
    assert captured[1].type == "response"
| 447 | |
| 448 | |
@pytest.mark.asyncio
async def test_completion_policy_requests_continuation_using_runtime_context(
    temp_dir: Path,
) -> None:
    """An incomplete non-mutating answer triggers exactly one continuation nudge."""
    context = build_context(temp_dir, safeguards=FakeSafeguards())
    captured: list = []

    async def capture(event) -> None:
        captured.append(event)

    decision = await CompletionPolicy(context).maybe_continue_for_completion(
        content="I can handle that.",
        response_content="I can handle that.",
        task="Create the file and verify it works.",
        actions_taken=[],
        continuation_count=0,
        emit=capture,
    )

    expected_missing = [
        "showing the requested work was actually carried out",
        "showing the result was run or verified",
    ]
    assert decision.should_continue is True
    assert decision.decision_code == "premature_completion_nudge"
    assert decision.decision_summary == (
        "requested one continuation because the non-mutating response looked incomplete"
    )
    assert decision.completion_check is not None
    assert decision.completion_check.missing_evidence == expected_missing
    # The assistant turn and the nudge should have been appended to the session.
    assert context.session.messages[-2] == Message(
        role=Role.ASSISTANT,
        content="I can handle that.",
    )
    assert context.session.messages[-1].role == Role.USER
    assert "verify it works" in context.session.messages[-1].content.lower()
    assert captured[0].type == "completion_check"
    assert captured[0].completion_check is not None
    assert captured[0].completion_check.missing_evidence == expected_missing
    assert [item.status for item in decision.evidence_provenance] == [
        EvidenceProvenanceStatus.MISSING.value,
        EvidenceProvenanceStatus.MISSING.value,
    ]
| 498 | |
| 499 | |
@pytest.mark.asyncio
async def test_completion_policy_accepts_passed_verification_from_dod(
    temp_dir: Path,
) -> None:
    """Passing DoD verification evidence is accepted without emitting events."""
    context = build_context(temp_dir, safeguards=FakeSafeguards())
    done = create_definition_of_done("Run pytest -q and make sure it works.")
    done.verification_commands = ["pytest -q"]
    done.evidence = [
        VerificationEvidence(command="pytest -q", passed=True, stdout="342 passed", kind="test")
    ]
    done.last_verification_result = "passed"
    captured: list = []

    async def capture(event) -> None:
        captured.append(event)

    decision = await CompletionPolicy(context).maybe_continue_for_completion(
        content="The tests passed.",
        response_content="The tests passed.",
        task="Run pytest -q and make sure it works.",
        actions_taken=[],
        continuation_count=0,
        emit=capture,
        dod=done,
    )

    assert decision.should_continue is False
    assert decision.should_finalize is False
    assert decision.decision_code == "completion_response_accepted"
    assert decision.completion_check is not None
    assert decision.completion_check.missing_evidence == []
    assert captured == []
    assert [item.summary for item in decision.evidence_provenance] == [
        "verification passed for `pytest -q`"
    ]
| 544 | |
| 545 | |
@pytest.mark.asyncio
async def test_completion_policy_finalizes_with_concrete_failed_verification_gap(
    temp_dir: Path,
) -> None:
    """Exhausted budget plus failed verification yields a concrete stop message.

    With max_continuation_prompts=1 already consumed, the policy must finalize
    (not continue) and name the observed failing `pytest -q` attempt in both
    the decision summary and the final response.
    """
    context = build_context(
        temp_dir,
        safeguards=FakeSafeguards(),
        max_continuation_prompts=1,
    )
    policy = CompletionPolicy(context)
    # DoD carries concrete failing evidence for attempt 2.
    dod = create_definition_of_done("Run pytest -q and make sure it works.")
    dod.verification_commands = ["pytest -q"]
    dod.evidence = [
        VerificationEvidence(
            command="pytest -q",
            passed=False,
            stderr="1 failed",
            kind="test",
        )
    ]
    dod.last_verification_result = "failed"
    dod.verification_attempt_counter = 2
    dod.active_verification_attempt_id = verification_attempt_id(2)
    dod.active_verification_attempt_number = 2
    events = []

    async def emit(event) -> None:
        events.append(event)

    decision = await policy.maybe_continue_for_completion(
        content="The tests are done.",
        response_content="The tests are done.",
        task="Run pytest -q and make sure it works.",
        actions_taken=[],
        continuation_count=1,
        emit=emit,
        dod=dod,
    )

    assert decision.should_continue is False
    assert decision.should_finalize is True
    assert decision.decision_code == "continuation_budget_exhausted"
    assert decision.decision_summary == (
        "stopped because the continuation budget was exhausted while observed "
        "verification still showed verification failed for `pytest -q` "
        "[1 failed; attempt 2]"
    )
    assert decision.completion_check is not None
    assert decision.completion_check.missing_evidence == [
        "a passing verification result from `pytest -q` (current verification is still failing)"
    ]
    assert decision.final_response == (
        "I stopped because the continuation budget was exhausted and observed "
        "verification still showed: verification failed for `pytest -q` "
        "[1 failed; attempt 2]."
    )
    assert events[0].type == "completion_check"
    assert [item.status for item in decision.evidence_provenance] == [
        EvidenceProvenanceStatus.CONTRADICTS.value
    ]
    assert [item.status for item in decision.verification_observations] == [
        VerificationObservationStatus.FAILED.value
    ]
    assert decision.verification_observations[0].attempt_number == 2
| 610 | |
| 611 | |
@pytest.mark.asyncio
async def test_completion_policy_uses_missing_observed_verification_when_budget_is_exhausted(
    temp_dir: Path,
) -> None:
    """Budget exhaustion with no observed result reports a MISSING observation.

    The DoD claims a failed result but holds no evidence entries, so the
    policy must describe attempt 3 as producing no observed result.
    """
    context = build_context(
        temp_dir,
        safeguards=FakeSafeguards(),
        max_continuation_prompts=1,
    )
    policy = CompletionPolicy(context)
    dod = create_definition_of_done("Run pytest -q and make sure it works.")
    dod.verification_commands = ["pytest -q"]
    dod.last_verification_result = "failed"
    dod.verification_attempt_counter = 3
    dod.active_verification_attempt_id = verification_attempt_id(3)
    dod.active_verification_attempt_number = 3
    events = []

    async def emit(event) -> None:
        events.append(event)

    decision = await policy.maybe_continue_for_completion(
        content="The tests are done.",
        response_content="The tests are done.",
        task="Run pytest -q and make sure it works.",
        actions_taken=[],
        continuation_count=1,
        emit=emit,
        dod=dod,
    )

    assert decision.should_continue is False
    assert decision.should_finalize is True
    assert decision.decision_code == "continuation_budget_exhausted"
    assert decision.decision_summary == (
        "stopped because the continuation budget was exhausted while observed "
        "verification still showed verification did not produce an observed "
        "result for `pytest -q` [attempt 3]"
    )
    assert decision.final_response == (
        "I stopped because the continuation budget was exhausted and observed "
        "verification still showed: verification did not produce an observed "
        "result for `pytest -q` [attempt 3]."
    )
    assert [item.status for item in decision.verification_observations] == [
        VerificationObservationStatus.MISSING.value
    ]
    assert decision.verification_observations[0].attempt_number == 3
    assert events[0].type == "completion_check"
| 661 | |
| 662 | |
@pytest.mark.asyncio
async def test_completion_policy_uses_pending_observed_verification_when_budget_is_exhausted(
    temp_dir: Path,
) -> None:
    """Budget exhaustion while verification is pending cites the pending attempt.

    Attempt 4 never completed, so the stop summary and final response must
    describe the observed verification as pending.
    """
    context = build_context(
        temp_dir,
        safeguards=FakeSafeguards(),
        max_continuation_prompts=1,
    )
    policy = CompletionPolicy(context)
    dod = create_definition_of_done("Run pytest -q and make sure it works.")
    dod.verification_commands = ["pytest -q"]
    dod.last_verification_result = "pending"
    dod.verification_attempt_counter = 4
    dod.active_verification_attempt_id = verification_attempt_id(4)
    dod.active_verification_attempt_number = 4
    events = []

    async def emit(event) -> None:
        events.append(event)

    decision = await policy.maybe_continue_for_completion(
        content="Verification is underway.",
        response_content="Verification is underway.",
        task="Run pytest -q and make sure it works.",
        actions_taken=["write: README.md"],
        continuation_count=1,
        emit=emit,
        dod=dod,
    )

    assert decision.should_continue is False
    assert decision.should_finalize is True
    assert decision.decision_code == "continuation_budget_exhausted"
    assert decision.decision_summary == (
        "stopped because the continuation budget was exhausted while observed "
        "verification still showed verification pending for `pytest -q` [attempt 4]"
    )
    assert decision.final_response == (
        "I stopped because the continuation budget was exhausted and observed "
        "verification still showed: verification pending for `pytest -q` [attempt 4]."
    )
    assert [item.status for item in decision.verification_observations] == [
        VerificationObservationStatus.PENDING.value
    ]
    assert decision.verification_observations[0].attempt_number == 4
    assert events[0].type == "completion_check"
| 710 | |
| 711 | |
@pytest.mark.asyncio
async def test_completion_policy_uses_stale_observed_verification_when_budget_is_exhausted(
    temp_dir: Path,
) -> None:
    """Budget exhaustion with a stale result describes the superseded attempt.

    The stale observation is projected as attempt 1 superseded by attempt 2,
    and the stop messages must spell out that transition.
    """
    context = build_context(
        temp_dir,
        safeguards=FakeSafeguards(),
        max_continuation_prompts=1,
    )
    policy = CompletionPolicy(context)
    dod = create_definition_of_done("Run pytest -q and make sure it works.")
    dod.verification_commands = ["pytest -q"]
    dod.last_verification_result = "stale"
    dod.verification_attempt_counter = 2
    dod.active_verification_attempt_id = verification_attempt_id(2)
    dod.active_verification_attempt_number = 2
    events = []

    async def emit(event) -> None:
        events.append(event)

    decision = await policy.maybe_continue_for_completion(
        content="The tests were already handled.",
        response_content="The tests were already handled.",
        task="Run pytest -q and make sure it works.",
        actions_taken=["write: README.md"],
        continuation_count=1,
        emit=emit,
        dod=dod,
    )

    assert decision.should_continue is False
    assert decision.should_finalize is True
    assert decision.decision_code == "continuation_budget_exhausted"
    assert decision.decision_summary == (
        "stopped because the continuation budget was exhausted while observed "
        "verification still showed verification became stale for `pytest -q` "
        "after new mutating work [attempt 1 -> attempt 2]"
    )
    assert decision.final_response == (
        "I stopped because the continuation budget was exhausted and observed "
        "verification still showed: verification became stale for `pytest -q` "
        "after new mutating work [attempt 1 -> attempt 2]."
    )
    assert [item.status for item in decision.verification_observations] == [
        VerificationObservationStatus.STALE.value
    ]
    assert decision.verification_observations[0].attempt_number == 1
    assert decision.verification_observations[0].supersedes_attempt_id == (
        verification_attempt_id(2)
    )
    assert events[0].type == "completion_check"
| 764 | |
| 765 | |
@pytest.mark.asyncio
async def test_completion_policy_finalizes_when_budget_is_exhausted(
    temp_dir: Path,
) -> None:
    """With the continuation budget spent, the policy finalizes and lists the gap."""
    context = build_context(
        temp_dir,
        safeguards=FakeSafeguards(),
        max_continuation_prompts=1,
    )
    captured: list = []

    async def capture(event) -> None:
        captured.append(event)

    decision = await CompletionPolicy(context).maybe_continue_for_completion(
        content="I looked into it.",
        response_content="I looked into it.",
        task="Fix the README heading.",
        actions_taken=[],
        continuation_count=1,
        emit=capture,
    )

    assert decision.should_continue is False
    assert decision.should_finalize is True
    assert decision.decision_code == "continuation_budget_exhausted"
    assert decision.completion_check is not None
    assert decision.completion_check.missing_evidence == [
        "showing the requested work was actually carried out"
    ]
    assert "Missing evidence" in decision.final_response
    assert decision.verification_observations == []
    assert captured[0].type == "completion_check"