loader Public

Watch 0 Fork 0 Star 0

Python · 21126 bytes Raw Blame History

  
        1
        """Tests for finalization helpers on RuntimeContext."""
      
        2
        
        3
        from __future__ import annotations
      
        4
        
        5
        from pathlib import Path
      
        6
        from types import SimpleNamespace
      
        7
        
        8
        import pytest
      
        9
        
        10
        from loader.llm.base import Message, Role, ToolCall
      
        11
        from loader.runtime.completion_trace import CompletionTraceEntry
      
        12
        from loader.runtime.context import RuntimeContext
      
        13
        from loader.runtime.dod import DefinitionOfDoneStore, create_definition_of_done
      
        14
        from loader.runtime.events import TurnSummary
      
        15
        from loader.runtime.executor import ToolExecutionOutcome, ToolExecutionState
      
        16
        from loader.runtime.finalization import TurnFinalizer
      
        17
        from loader.runtime.permissions import (
      
        18
            PermissionMode,
      
        19
            build_permission_policy,
      
        20
            load_permission_rules,
      
        21
        )
      
        22
        from loader.runtime.tracing import RuntimeTracer
      
        23
        from loader.runtime.verification_observations import VerificationObservationStatus
      
        24
        from loader.tools.base import ToolResult as RegistryToolResult
      
        25
        from loader.tools.base import create_default_registry
      
        26
        from tests.helpers.runtime_harness import ScriptedBackend
      
        27
        
        28
        
        29
        class FakeSession:
            """In-memory stand-in for a runtime session used by the finalizer tests.

            It records appended messages, workflow timeline entries and usage
            reports in plain lists so tests can assert on them afterwards.
            """

            def __init__(self) -> None:
                """Seed the session with fixed completion-decision state."""
                self.messages: list[Message] = []
                self.session_id = "session-test-123"
                # Every record_turn_usage() call is logged here as a payload dict.
                self.recorded_calls: list[dict[str, object]] = []
                self.last_completion_decision_code = "verification_passed"
                self.last_completion_decision_summary = (
                    "accepted the response after verification evidence passed"
                )
                self.completion_trace = [
                    CompletionTraceEntry(
                        stage="definition_of_done",
                        outcome="complete",
                        decision_code="verification_passed",
                        decision_summary="accepted the response after verification evidence passed",
                    )
                ]
                self.last_turn_transition_summary = (
                    "completion -> finalize [terminal] Finalizing completed turn"
                )
                self.workflow_timeline: list[object] = []

            def append(self, message: Message) -> None:
                """Store ``message`` at the end of ``self.messages``."""
                self.messages.append(message)

            def append_workflow_timeline_entry(self, entry: object) -> None:
                """Store ``entry`` at the end of ``self.workflow_timeline``."""
                self.workflow_timeline.append(entry)

            def record_turn_usage(
                self,
                usage: dict[str, int],
                *,
                tool_calls: int,
                iterations: int,
            ) -> dict[str, int]:
                """Log the reported usage and return a fixed-shape cumulative dict.

                A shallow copy of ``usage`` is stored so later mutation by the
                caller does not alter the recorded payload. The returned dict
                always reports ``turns`` as 1 and echoes the given counters.
                """
                payload = {
                    "usage": dict(usage),
                    "tool_calls": tool_calls,
                    "iterations": iterations,
                }
                self.recorded_calls.append(payload)
                return {"turns": 1, "tool_calls": tool_calls, "iterations": iterations}
      
        71
        
        72
        
        73
        class FakeCodeFilter:
            """Code-filter stub whose ``reset`` does nothing."""

            def reset(self) -> None:
                """No-op; always returns ``None``."""
                return None
      
        76
        
        77
        
        78
        class FakeSafeguards:
            """Safeguards stub: passes content through and never reports a loop."""

            def __init__(self) -> None:
                """Attach placeholder collaborators and a no-op code filter."""
                self.action_tracker = object()
                self.validator = object()
                self.code_filter = FakeCodeFilter()

            def filter_stream_chunk(self, content: str) -> str:
                """Return ``content`` unchanged."""
                return content

            def filter_complete_content(self, content: str) -> str:
                """Return ``content`` unchanged."""
                return content

            def should_steer(self) -> bool:
                """Always ``False``."""
                return False

            def get_steering_message(self) -> str | None:
                """Always ``None``."""
                return None

            def record_response(self, content: str) -> None:
                """Ignore ``content``; nothing is recorded."""
                return None

            def detect_text_loop(self, content: str) -> tuple[bool, str]:
                """Always return ``(False, "")``."""
                return False, ""

            def detect_loop(self) -> tuple[bool, str]:
                """Always return ``(False, "")``."""
                return False, ""
      
        104
        
        105
        
        106
        class FakeExecutor:
            """Executor stub that replays a pre-queued list of outcomes in order."""

            def __init__(self, outcomes: list[ToolExecutionOutcome]) -> None:
                """Copy ``outcomes`` so the caller's list is never mutated."""
                self._outcomes = list(outcomes)

            async def execute_tool_call(self, tool_call: ToolCall, **_: object) -> ToolExecutionOutcome:
                """Return the next queued outcome, ignoring ``tool_call`` and kwargs.

                Raises:
                    AssertionError: if the queue is empty, i.e. the code under
                        test ran more tool calls than the test queued.
                """
                if not self._outcomes:
                    raise AssertionError("No fake verification outcome queued")
                return self._outcomes.pop(0)
      
        114
        
        115
        
        116
        class RecordingExecutor:
            """Executor stub that logs each command and always reports success."""

            def __init__(self) -> None:
                """Start with an empty command log."""
                self.commands: list[str] = []

            async def execute_tool_call(self, tool_call: ToolCall, **_: object) -> ToolExecutionOutcome:
                """Record the call's ``command`` argument and return an "ok" outcome.

                A missing ``command`` argument is recorded as the empty string.
                """
                command = str(tool_call.arguments.get("command", ""))
                self.commands.append(command)
                return tool_outcome(
                    tool_call=tool_call,
                    output="ok",
                    is_error=False,
                    exit_code=0,
                    stdout="ok",
                )
      
        130
        
        131
        
        132
        def build_context(temp_dir: Path, session: FakeSession) -> RuntimeContext:
            """Build a ``RuntimeContext`` rooted at ``temp_dir`` for finalizer tests.

            The context uses the default tool registry, a workspace-write
            permission policy built from the rules loaded for ``temp_dir``, a
            ``ScriptedBackend``, ``FakeSafeguards`` and the given fake ``session``.
            """
            registry = create_default_registry(temp_dir)
            registry.configure_workspace_root(temp_dir)
            rule_status = load_permission_rules(temp_dir)
            policy = build_permission_policy(
                active_mode=PermissionMode.WORKSPACE_WRITE,
                workspace_root=temp_dir,
                tool_requirements=registry.get_tool_requirements(),
                rules=rule_status.rules,
            )
            # SimpleNamespace objects stand in for the real config types; only
            # the attributes listed here are available to the code under test.
            reasoning = SimpleNamespace(
                rollback=False,
                show_rollback_plan=False,
                completion_check=True,
                use_quick_completion=True,
                max_continuation_prompts=5,
                self_critique=False,
                confidence_scoring=False,
                min_confidence_for_action=3,
                verification=False,
            )
            config = SimpleNamespace(
                force_react=False,
                verification_retry_budget=3,
                reasoning=reasoning,
            )
            return RuntimeContext(
                project_root=temp_dir,
                backend=ScriptedBackend(),
                registry=registry,
                session=session,  # type: ignore[arg-type]
                config=config,
                capability_profile=SimpleNamespace(supports_native_tools=True),  # type: ignore[arg-type]
                project_context=None,
                permission_policy=policy,
                permission_config_status=rule_status,
                workflow_mode="execute",
                safeguards=FakeSafeguards(),
            )
      
        169
        
        170
        
        171
        def tool_outcome(
            *,
            tool_call: ToolCall,
            output: str,
            is_error: bool,
            exit_code: int,
            stdout: str = "",
            stderr: str = "",
        ) -> ToolExecutionOutcome:
            """Build an ``EXECUTED`` ``ToolExecutionOutcome`` for ``tool_call``.

            ``output`` is used for the tool-result message (display and result
            content), the event content, the result output and the registry
            result. ``exit_code``, ``stdout`` and ``stderr`` are placed in the
            registry result's metadata.
            """
            message = Message.tool_result_message(
                tool_call_id=tool_call.id,
                display_content=output,
                result_content=output,
                is_error=is_error,
            )
            registry_result = RegistryToolResult(
                output=output,
                is_error=is_error,
                metadata={
                    "exit_code": exit_code,
                    "stdout": stdout,
                    "stderr": stderr,
                },
            )
            return ToolExecutionOutcome(
                tool_call=tool_call,
                state=ToolExecutionState.EXECUTED,
                message=message,
                event_content=output,
                is_error=is_error,
                result_output=output,
                registry_result=registry_result,
            )
      
        202
        
        203
        
        204
        async def _noop_set_workflow_mode(mode, dod, emit, summary) -> None:
      
        205
            return None
      
        206
        
        207
        
        208
        def test_turn_finalizer_finalize_summary_uses_runtime_context(
            temp_dir: Path,
            monkeypatch: pytest.MonkeyPatch,
        ) -> None:
            """``finalize_summary`` fills the summary from session and tracer state.

            Checks the session id, the cumulative usage returned by the session,
            the usage payload handed to ``record_turn_usage``, that the memory
            store capture hook ran, and that the completion decision and trace
            are copied from the session.
            """
            session = FakeSession()
            context = build_context(temp_dir, session)
            tracer = RuntimeTracer()
            tracer.record("turn.completed", reason="done")
            finalizer = TurnFinalizer(
                context,
                tracer,
                DefinitionOfDoneStore(temp_dir),
                set_workflow_mode=_noop_set_workflow_mode,
            )
            dod = create_definition_of_done("Finish the task")
            dod.status = "done"
            summary = TurnSummary(
                final_response="All set.",
                definition_of_done=dod,
                iterations=2,
                usage={"prompt_tokens": 10},
                tool_result_messages=[Message(role=Role.TOOL, content="tool output")],
            )
            captured: dict[str, str] = {}

            def capture_definition_of_done(self, summary_text: str) -> None:
                captured["summary"] = summary_text

            # Replace the MemoryStore method so the test can observe the call.
            monkeypatch.setattr(
                "loader.runtime.finalization.MemoryStore.capture_definition_of_done",
                capture_definition_of_done,
            )

            final_summary = finalizer.finalize_summary(summary)

            assert final_summary.session_id == "session-test-123"
            assert final_summary.cumulative_usage == {"turns": 1, "tool_calls": 1, "iterations": 2}
            assert session.recorded_calls == [
                {
                    "usage": {"prompt_tokens": 10, "tool_calls": 1, "iterations": 2},
                    "tool_calls": 1,
                    "iterations": 2,
                }
            ]
            assert "summary" in captured
            assert final_summary.trace
            assert final_summary.completion_decision_code == "verification_passed"
            assert final_summary.completion_decision_summary == (
                "accepted the response after verification evidence passed"
            )
            assert [entry.decision_code for entry in final_summary.completion_trace] == [
                "verification_passed"
            ]
      
        261
        
        262
        
        263
        @pytest.mark.asyncio
        async def test_turn_finalizer_records_skipped_verification_observation(
            temp_dir: Path,
        ) -> None:
            """A definition of done with no mutating actions yields a skipped check.

            The gate is run with an executor that has no queued outcomes; the
            test expects a single SKIPPED observation on both the result and the
            last workflow timeline entry, and a ``dod_status`` "done" event.
            """
            session = FakeSession()
            context = build_context(temp_dir, session)
            finalizer = TurnFinalizer(
                context,
                RuntimeTracer(),
                DefinitionOfDoneStore(temp_dir),
                set_workflow_mode=_noop_set_workflow_mode,
            )
            dod = create_definition_of_done("Explain Loader's clarify loop.")
            summary = TurnSummary(final_response="")
            events = []

            async def capture(event) -> None:
                events.append(event)

            result = await finalizer.run_definition_of_done_gate(
                dod=dod,
                candidate_response="Loader uses a bounded clarify loop before execution.",
                emit=capture,
                summary=summary,
                executor=FakeExecutor([]),  # type: ignore[arg-type]
            )

            assert result.should_continue is False
            assert result.reason_code == "non_mutating_response_accepted"
            assert [item.status for item in result.verification_observations] == [
                VerificationObservationStatus.SKIPPED.value
            ]
            assert [item.summary for item in result.verification_observations] == [
                "verification was skipped because no mutating work required checks"
            ]
            assert summary.verification_status == "skipped"
            assert session.workflow_timeline[-1].kind == "verify_skip"
            assert [item.status for item in session.workflow_timeline[-1].verification_observations] == [
                VerificationObservationStatus.SKIPPED.value
            ]
            assert any(event.type == "dod_status" and event.dod_status == "done" for event in events)
      
        304
        
        305
        
        306
        @pytest.mark.asyncio
        async def test_turn_finalizer_records_passed_verification_observation(
            temp_dir: Path,
        ) -> None:
            """A successful verification command is recorded as a PASSED observation.

            The definition of done carries one mutating action and one
            verification command; the fake executor returns a zero-exit outcome.
            The test expects the last two timeline entries to be a PENDING entry
            followed by a PASSED ``verify_observation`` entry for attempt 1.
            """
            session = FakeSession()
            context = build_context(temp_dir, session)
            finalizer = TurnFinalizer(
                context,
                RuntimeTracer(),
                DefinitionOfDoneStore(temp_dir),
                set_workflow_mode=_noop_set_workflow_mode,
            )
            dod = create_definition_of_done("Update the runtime tests.")
            dod.mutating_actions.append("write")
            dod.verification_commands = ["uv run pytest -q"]
            summary = TurnSummary(final_response="")
            tool_call = ToolCall(
                id="verify-1-1",
                name="bash",
                arguments={"command": "uv run pytest -q", "cwd": str(temp_dir)},
            )
            passing_outcome = tool_outcome(
                tool_call=tool_call,
                output="219 passed",
                is_error=False,
                exit_code=0,
                stdout="219 passed",
            )

            async def capture(event) -> None:
                return None

            result = await finalizer.run_definition_of_done_gate(
                dod=dod,
                candidate_response="Updated the runtime tests.",
                emit=capture,
                summary=summary,
                executor=FakeExecutor([passing_outcome]),  # type: ignore[arg-type]
            )

            assert result.should_continue is False
            assert result.reason_code == "verification_passed"
            assert [item.status for item in result.verification_observations] == [
                VerificationObservationStatus.PASSED.value
            ]
            assert result.verification_observations[0].attempt_id == "verification-attempt-1"
            assert result.verification_observations[0].attempt_number == 1
            assert result.verification_observations[0].command == "uv run pytest -q"
            assert result.verification_observations[0].detail == "219 passed"
            assert summary.verification_status == "passed"
            assert [entry.reason_code for entry in session.workflow_timeline[-2:]] == [
                "verification_pending",
                "verification_command_passed",
            ]
            assert [item.status for item in session.workflow_timeline[-2].verification_observations] == [
                VerificationObservationStatus.PENDING.value
            ]
            assert (
                session.workflow_timeline[-2].verification_observations[0].attempt_id
                == "verification-attempt-1"
            )
            assert session.workflow_timeline[-2].verification_observations[0].command == (
                "uv run pytest -q"
            )
            assert session.workflow_timeline[-1].kind == "verify_observation"
            assert session.workflow_timeline[-1].reason_code == "verification_command_passed"
            assert [item.status for item in session.workflow_timeline[-1].verification_observations] == [
                VerificationObservationStatus.PASSED.value
            ]
      
        378
        
        379
        
        380
        @pytest.mark.asyncio
        async def test_turn_finalizer_appends_runtime_semantic_verifier_to_planned_commands(
            temp_dir: Path,
        ) -> None:
            """The gate runs the planned command plus a ``python3`` heredoc command.

            A small HTML site (one chapter page and an index linking to it) is
            written to ``temp_dir``. With a recording executor, the test expects
            both the planned ``grep`` command and a command starting with
            ``python3 - <<'PY'`` to have been executed.
            """
            chapters = temp_dir / "chapters"
            chapters.mkdir()
            (chapters / "01-introduction.html").write_text(
                "<h1>Chapter 1: Introduction to Fortran</h1>\n"
            )
            index = temp_dir / "index.html"
            index.write_text(
                "\n".join(
                    [
                        '<ul class="chapter-list">',
                        '  <li><a href="chapters/01-introduction.html">Chapter 1: Introduction to Fortran</a></li>',
                        "</ul>",
                    ]
                )
            )

            session = FakeSession()
            context = build_context(temp_dir, session)
            finalizer = TurnFinalizer(
                context,
                RuntimeTracer(),
                DefinitionOfDoneStore(temp_dir),
                set_workflow_mode=_noop_set_workflow_mode,
            )
            dod = create_definition_of_done(
                "Update index.html so the table of contents links and chapter titles are correct."
            )
            dod.mutating_actions.append("edit")
            dod.touched_files.append(str(index))
            dod.verification_commands = ['grep -n "href=" index.html']
            summary = TurnSummary(final_response="")
            executor = RecordingExecutor()

            async def capture(event) -> None:
                return None

            result = await finalizer.run_definition_of_done_gate(
                dod=dod,
                candidate_response="Updated the index.html links.",
                emit=capture,
                summary=summary,
                executor=executor,  # type: ignore[arg-type]
            )

            assert result.should_continue is False
            # executor.commands is a list[str], so membership is equivalent to
            # the any(command == ...) form.
            assert 'grep -n "href=" index.html' in executor.commands
            assert any(command.startswith("python3 - <<'PY'") for command in executor.commands)
            assert (
                session.workflow_timeline[-1].verification_observations[0].attempt_id
                == "verification-attempt-1"
            )
      
        435
        
        436
        
        437
        @pytest.mark.asyncio
        async def test_turn_finalizer_records_missing_verification_observation(
            temp_dir: Path,
        ) -> None:
            """A mutating turn without verification commands is sent back for re-entry.

            The definition of done has a mutating action but no verification
            commands. The test expects a MISSING observation for attempt 1, a
            failed verification status, and a trailing USER message starting
            with ``[DEFINITION OF DONE CHECK FAILED]``.
            """
            session = FakeSession()
            context = build_context(temp_dir, session)
            finalizer = TurnFinalizer(
                context,
                RuntimeTracer(),
                DefinitionOfDoneStore(temp_dir),
                set_workflow_mode=_noop_set_workflow_mode,
            )
            dod = create_definition_of_done("Edit the loader bootstrap.")
            dod.mutating_actions.append("edit")
            summary = TurnSummary(final_response="")

            async def capture(event) -> None:
                return None

            result = await finalizer.run_definition_of_done_gate(
                dod=dod,
                candidate_response="Updated the bootstrap code.",
                emit=capture,
                summary=summary,
                executor=FakeExecutor([]),  # type: ignore[arg-type]
            )

            assert result.should_continue is True
            assert result.reason_code == "verification_failed_reentry"
            assert [item.status for item in result.verification_observations] == [
                VerificationObservationStatus.MISSING.value
            ]
            assert result.verification_observations[0].attempt_id == "verification-attempt-1"
            assert result.verification_observations[0].attempt_number == 1
            assert [item.summary for item in result.verification_observations] == [
                "verification commands were still missing at execution time"
            ]
            assert summary.verification_status == "failed"
            assert session.workflow_timeline[-1].kind == "verify_observation"
            assert session.workflow_timeline[-1].reason_code == "verification_commands_missing"
            assert [item.status for item in session.workflow_timeline[-1].verification_observations] == [
                VerificationObservationStatus.MISSING.value
            ]
            assert (
                session.workflow_timeline[-1].verification_observations[0].attempt_id
                == "verification-attempt-1"
            )
            assert session.messages[-1].role == Role.USER
            assert session.messages[-1].content.startswith("[DEFINITION OF DONE CHECK FAILED]")
      
        486
        
        487
        
        488
        @pytest.mark.asyncio
        async def test_turn_finalizer_does_not_reverify_without_new_changes(
            temp_dir: Path,
        ) -> None:
            """A previously failed verification is not re-run when nothing changed.

            The definition of done is pre-seeded with a "failed" last result and
            a verification signature string built from its current line count,
            touched file and action count. The test expects the gate to request
            continuation without executing any command and to append a message
            starting with ``[DEFINITION OF DONE CHECK STILL FAILING]``.
            """
            session = FakeSession()
            context = build_context(temp_dir, session)
            finalizer = TurnFinalizer(
                context,
                RuntimeTracer(),
                DefinitionOfDoneStore(temp_dir),
                set_workflow_mode=_noop_set_workflow_mode,
            )
            index = temp_dir / "index.html"
            index.write_text("<ul></ul>\n")
            dod = create_definition_of_done("Fix the chapter list in index.html.")
            dod.mutating_actions.append("edit")
            dod.touched_files.append(str(index))
            dod.line_changes = 12
            dod.last_verification_result = "failed"
            # NOTE(review): presumably mirrors the signature format the finalizer
            # computes internally; confirm against loader.runtime.finalization.
            dod.last_verification_signature = (
                f"lines={dod.line_changes};touched={index};actions=1;commands="
            )
            dod.evidence = []
            summary = TurnSummary(final_response="")
            executor = RecordingExecutor()

            async def capture(event) -> None:
                return None

            result = await finalizer.run_definition_of_done_gate(
                dod=dod,
                candidate_response="I checked the file again.",
                emit=capture,
                summary=summary,
                executor=executor,  # type: ignore[arg-type]
            )

            assert result.should_continue is True
            assert result.reason_code == "verification_failed_no_new_changes"
            assert executor.commands == []
            assert summary.verification_status == "failed"
            assert session.messages[-1].content.startswith("[DEFINITION OF DONE CHECK STILL FAILING]")
      
        530
        
        531
        
        532
        @pytest.mark.asyncio
        async def test_turn_finalizer_accepts_missing_optional_html5validator_when_semantic_check_passes(
            temp_dir: Path,
            monkeypatch: pytest.MonkeyPatch,
        ) -> None:
            """A missing ``html5validator`` binary is skipped when the semantic check passes.

            Two verification commands are queued: a semantic check that exits 0 and
            an ``html5validator`` run that exits 127 ("command not found").  The
            gate is expected to accept the turn, mark the second evidence item as
            skipped, and mention the skip in the final response.
            """
            session = FakeSession()
            context = build_context(temp_dir, session)
            finalizer = TurnFinalizer(
                context,
                RuntimeTracer(),
                DefinitionOfDoneStore(temp_dir),
                set_workflow_mode=_noop_set_workflow_mode,
            )
            dod = create_definition_of_done(
                "Update index.html so the table of contents links and chapter titles are correct."
            )
            dod.mutating_actions.append("edit")
            dod.touched_files.append(str(temp_dir / "index.html"))
            dod.verification_commands = [
                "python3 - <<'PY'\nprint('semantic ok')\nPY",
                "html5validator --root /tmp/fortran-qwen-recovery-check/",
            ]
            summary = TurnSummary(final_response="")
            semantic_call = ToolCall(
                id="verify-1-1",
                name="bash",
                arguments={"command": dod.verification_commands[0], "cwd": str(temp_dir)},
            )
            html5validator_call = ToolCall(
                id="verify-1-2",
                name="bash",
                arguments={"command": dod.verification_commands[1], "cwd": str(temp_dir)},
            )

            async def capture(event) -> None:
                # Events are irrelevant to this test; swallow them.
                return None

            # Stub out command derivation so only the two commands above are in play.
            monkeypatch.setattr(
                "loader.runtime.finalization.derive_verification_commands",
                lambda *args, **kwargs: [],
            )

            result = await finalizer.run_definition_of_done_gate(
                dod=dod,
                candidate_response="Updated the chapter links and titles.",
                emit=capture,
                summary=summary,
                executor=FakeExecutor(
                    [
                        tool_outcome(
                            tool_call=semantic_call,
                            output="semantic ok",
                            is_error=False,
                            exit_code=0,
                            stdout="semantic ok",
                        ),
                        tool_outcome(
                            tool_call=html5validator_call,
                            output="/bin/sh: html5validator: command not found",
                            is_error=True,
                            exit_code=127,
                            stderr="/bin/sh: html5validator: command not found",
                        ),
                    ]
                ),  # type: ignore[arg-type]
            )

            assert result.should_continue is False
            assert result.reason_code == "verification_passed"
            assert summary.verification_status == "passed"
            assert dod.status == "done"
            assert dod.last_verification_result == "passed"
            assert [item.passed for item in dod.evidence] == [True, False]
            assert [item.skipped for item in dod.evidence] == [False, True]
            assert "SKIP" in result.final_response
            assert "html5validator" in result.final_response
            assert session.workflow_timeline[-2].reason_code == "verification_command_passed"
            assert session.workflow_timeline[-1].reason_code == "verification_command_skipped"
            assert [item.status for item in session.workflow_timeline[-1].verification_observations] == [
                VerificationObservationStatus.SKIPPED.value
            ]

1	"""Tests for finalization helpers on RuntimeContext."""
2
3	from __future__ import annotations
4
5	from pathlib import Path
6	from types import SimpleNamespace
7
8	import pytest
9
10	from loader.llm.base import Message, Role, ToolCall
11	from loader.runtime.completion_trace import CompletionTraceEntry
12	from loader.runtime.context import RuntimeContext
13	from loader.runtime.dod import DefinitionOfDoneStore, create_definition_of_done
14	from loader.runtime.events import TurnSummary
15	from loader.runtime.executor import ToolExecutionOutcome, ToolExecutionState
16	from loader.runtime.finalization import TurnFinalizer
17	from loader.runtime.permissions import (
18	PermissionMode,
19	build_permission_policy,
20	load_permission_rules,
21	)
22	from loader.runtime.tracing import RuntimeTracer
23	from loader.runtime.verification_observations import VerificationObservationStatus
24	from loader.tools.base import ToolResult as RegistryToolResult
25	from loader.tools.base import create_default_registry
26	from tests.helpers.runtime_harness import ScriptedBackend
27
28
class FakeSession:
    """In-memory session double used by the finalizer tests.

    It starts out with a completion state that already reads
    ``verification_passed`` and records every message, workflow-timeline
    entry, and usage call it receives so tests can assert on them.
    """

    def __init__(self) -> None:
        self.messages: list[Message] = []
        self.session_id = "session-test-123"
        # One payload per ``record_turn_usage`` call, in call order.
        self.recorded_calls: list[dict[str, object]] = []
        self.last_completion_decision_code = "verification_passed"
        self.last_completion_decision_summary = (
            "accepted the response after verification evidence passed"
        )
        self.completion_trace = [
            CompletionTraceEntry(
                stage="definition_of_done",
                outcome="complete",
                decision_code="verification_passed",
                decision_summary="accepted the response after verification evidence passed",
            )
        ]
        self.last_turn_transition_summary = (
            "completion -> finalize [terminal] Finalizing completed turn"
        )
        self.workflow_timeline = []

    def append(self, message: Message) -> None:
        """Store *message* at the end of ``messages``."""
        self.messages.append(message)

    def append_workflow_timeline_entry(self, entry) -> None:
        """Store *entry* at the end of ``workflow_timeline``."""
        self.workflow_timeline.append(entry)

    def record_turn_usage(
        self,
        usage: dict[str, int],
        *,
        tool_calls: int,
        iterations: int,
    ) -> dict[str, int]:
        """Record one usage call and return fixed single-turn totals.

        A copy of *usage* is stored, so later mutation by the caller does not
        change what was recorded.
        """
        payload = {
            "usage": dict(usage),
            "tool_calls": tool_calls,
            "iterations": iterations,
        }
        self.recorded_calls.append(payload)
        return {"turns": 1, "tool_calls": tool_calls, "iterations": iterations}
71
72
class FakeCodeFilter:
    """Code-filter double whose ``reset`` does nothing."""

    def reset(self) -> None:
        """No-op; there is no state to clear."""
        return None
76
77
class FakeSafeguards:
    """Safeguards double that never filters, steers, or reports a loop."""

    def __init__(self) -> None:
        # Opaque placeholders; only ``code_filter`` has any behaviour.
        self.action_tracker = object()
        self.validator = object()
        self.code_filter = FakeCodeFilter()

    def filter_stream_chunk(self, content: str) -> str:
        """Return *content* unchanged."""
        return content

    def filter_complete_content(self, content: str) -> str:
        """Return *content* unchanged."""
        return content

    def should_steer(self) -> bool:
        """Always report that no steering is needed."""
        return False

    def get_steering_message(self) -> str | None:
        """Always return ``None``: there is never a steering message."""
        return None

    def record_response(self, content: str) -> None:
        """Ignore *content*."""
        return None

    def detect_text_loop(self, content: str) -> tuple[bool, str]:
        """Always report no loop, with an empty description."""
        return False, ""

    def detect_loop(self) -> tuple[bool, str]:
        """Always report no loop, with an empty description."""
        return False, ""
104
105
class FakeExecutor:
    """Executor double that replays a fixed queue of outcomes in order."""

    def __init__(self, outcomes: list[ToolExecutionOutcome]) -> None:
        # Copy so the caller's list is not consumed by ``execute_tool_call``.
        self._outcomes = list(outcomes)

    async def execute_tool_call(self, tool_call: ToolCall, **_: object) -> ToolExecutionOutcome:
        """Return the next queued outcome, ignoring *tool_call* and keyword extras.

        Raises:
            AssertionError: if the queue is empty, i.e. the code under test ran
                more tool calls than the test scripted.
        """
        if not self._outcomes:
            raise AssertionError("No fake verification outcome queued")
        return self._outcomes.pop(0)
114
115
class RecordingExecutor:
    """Executor double that logs each command and always reports success."""

    def __init__(self) -> None:
        # Commands seen so far, in execution order.
        self.commands: list[str] = []

    async def execute_tool_call(self, tool_call: ToolCall, **_: object) -> ToolExecutionOutcome:
        """Record the call's ``command`` argument and return an ``ok`` outcome.

        A call without a ``command`` argument is recorded as an empty string.
        """
        command = str(tool_call.arguments.get("command", ""))
        self.commands.append(command)
        return tool_outcome(
            tool_call=tool_call,
            output="ok",
            is_error=False,
            exit_code=0,
            stdout="ok",
        )
130
131
def build_context(temp_dir: Path, session: FakeSession) -> RuntimeContext:
    """Build a ``RuntimeContext`` rooted at *temp_dir* around the fake *session*.

    The context uses the default tool registry, a workspace-write permission
    policy built from the rules loaded from *temp_dir*, a ``ScriptedBackend``,
    ``FakeSafeguards``, and a ``SimpleNamespace`` config with a verification
    retry budget of 3.
    """
    registry = create_default_registry(temp_dir)
    registry.configure_workspace_root(temp_dir)
    rule_status = load_permission_rules(temp_dir)
    policy = build_permission_policy(
        active_mode=PermissionMode.WORKSPACE_WRITE,
        workspace_root=temp_dir,
        tool_requirements=registry.get_tool_requirements(),
        rules=rule_status.rules,
    )
    return RuntimeContext(
        project_root=temp_dir,
        backend=ScriptedBackend(),
        registry=registry,
        session=session,  # type: ignore[arg-type]
        config=SimpleNamespace(
            force_react=False,
            verification_retry_budget=3,
            reasoning=SimpleNamespace(
                rollback=False,
                show_rollback_plan=False,
                completion_check=True,
                use_quick_completion=True,
                max_continuation_prompts=5,
                self_critique=False,
                confidence_scoring=False,
                min_confidence_for_action=3,
                verification=False,
            ),
        ),
        capability_profile=SimpleNamespace(supports_native_tools=True),  # type: ignore[arg-type]
        project_context=None,
        permission_policy=policy,
        permission_config_status=rule_status,
        workflow_mode="execute",
        safeguards=FakeSafeguards(),
    )
169
170
def tool_outcome(
    *,
    tool_call: ToolCall,
    output: str,
    is_error: bool,
    exit_code: int,
    stdout: str = "",
    stderr: str = "",
) -> ToolExecutionOutcome:
    """Build an ``EXECUTED`` ``ToolExecutionOutcome`` for *tool_call*.

    *output* is used for the tool-result message, the event content, the
    result output, and the registry result alike.  *exit_code*, *stdout*, and
    *stderr* are placed in the registry result's metadata.
    """
    return ToolExecutionOutcome(
        tool_call=tool_call,
        state=ToolExecutionState.EXECUTED,
        message=Message.tool_result_message(
            tool_call_id=tool_call.id,
            display_content=output,
            result_content=output,
            is_error=is_error,
        ),
        event_content=output,
        is_error=is_error,
        result_output=output,
        registry_result=RegistryToolResult(
            output=output,
            is_error=is_error,
            metadata={
                "exit_code": exit_code,
                "stdout": stdout,
                "stderr": stderr,
            },
        ),
    )
202
203
204	async def _noop_set_workflow_mode(mode, dod, emit, summary) -> None:
205	return None
206
207
def test_turn_finalizer_finalize_summary_uses_runtime_context(
    temp_dir: Path,
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """``finalize_summary`` fills the summary from the context's session.

    Checks the session id, the cumulative usage returned by the session, the
    usage payload passed to it, the trace, and the completion decision fields.
    """
    session = FakeSession()
    context = build_context(temp_dir, session)
    tracer = RuntimeTracer()
    tracer.record("turn.completed", reason="done")
    finalizer = TurnFinalizer(
        context,
        tracer,
        DefinitionOfDoneStore(temp_dir),
        set_workflow_mode=_noop_set_workflow_mode,
    )
    dod = create_definition_of_done("Finish the task")
    dod.status = "done"
    summary = TurnSummary(
        final_response="All set.",
        definition_of_done=dod,
        iterations=2,
        usage={"prompt_tokens": 10},
        tool_result_messages=[Message(role=Role.TOOL, content="tool output")],
    )
    captured: dict[str, str] = {}

    def capture_definition_of_done(self, summary_text: str) -> None:
        # Replacement for the memory-store method: just remember the text.
        captured["summary"] = summary_text

    monkeypatch.setattr(
        "loader.runtime.finalization.MemoryStore.capture_definition_of_done",
        capture_definition_of_done,
    )

    final_summary = finalizer.finalize_summary(summary)

    assert final_summary.session_id == "session-test-123"
    assert final_summary.cumulative_usage == {"turns": 1, "tool_calls": 1, "iterations": 2}
    assert session.recorded_calls == [
        {
            "usage": {"prompt_tokens": 10, "tool_calls": 1, "iterations": 2},
            "tool_calls": 1,
            "iterations": 2,
        }
    ]
    assert "summary" in captured
    assert final_summary.trace
    assert final_summary.completion_decision_code == "verification_passed"
    assert final_summary.completion_decision_summary == (
        "accepted the response after verification evidence passed"
    )
    assert [entry.decision_code for entry in final_summary.completion_trace] == [
        "verification_passed"
    ]
261
262
@pytest.mark.asyncio
async def test_turn_finalizer_records_skipped_verification_observation(
    temp_dir: Path,
) -> None:
    """A definition of done with no mutating actions is accepted with a SKIPPED observation."""
    session = FakeSession()
    context = build_context(temp_dir, session)
    finalizer = TurnFinalizer(
        context,
        RuntimeTracer(),
        DefinitionOfDoneStore(temp_dir),
        set_workflow_mode=_noop_set_workflow_mode,
    )
    dod = create_definition_of_done("Explain Loader's clarify loop.")
    summary = TurnSummary(final_response="")
    events = []

    async def capture(event) -> None:
        events.append(event)

    result = await finalizer.run_definition_of_done_gate(
        dod=dod,
        candidate_response="Loader uses a bounded clarify loop before execution.",
        emit=capture,
        summary=summary,
        # Empty queue: any attempt to run a command raises AssertionError.
        executor=FakeExecutor([]),  # type: ignore[arg-type]
    )

    assert result.should_continue is False
    assert result.reason_code == "non_mutating_response_accepted"
    assert [item.status for item in result.verification_observations] == [
        VerificationObservationStatus.SKIPPED.value
    ]
    assert [item.summary for item in result.verification_observations] == [
        "verification was skipped because no mutating work required checks"
    ]
    assert summary.verification_status == "skipped"
    assert session.workflow_timeline[-1].kind == "verify_skip"
    assert [item.status for item in session.workflow_timeline[-1].verification_observations] == [
        VerificationObservationStatus.SKIPPED.value
    ]
    assert any(event.type == "dod_status" and event.dod_status == "done" for event in events)
304
305
@pytest.mark.asyncio
async def test_turn_finalizer_records_passed_verification_observation(
    temp_dir: Path,
) -> None:
    """A passing verification command yields PENDING then PASSED timeline entries."""
    session = FakeSession()
    context = build_context(temp_dir, session)
    finalizer = TurnFinalizer(
        context,
        RuntimeTracer(),
        DefinitionOfDoneStore(temp_dir),
        set_workflow_mode=_noop_set_workflow_mode,
    )
    dod = create_definition_of_done("Update the runtime tests.")
    dod.mutating_actions.append("write")
    dod.verification_commands = ["uv run pytest -q"]
    summary = TurnSummary(final_response="")
    tool_call = ToolCall(
        id="verify-1-1",
        name="bash",
        arguments={"command": "uv run pytest -q", "cwd": str(temp_dir)},
    )

    async def capture(event) -> None:
        # Events are irrelevant to this test; swallow them.
        return None

    result = await finalizer.run_definition_of_done_gate(
        dod=dod,
        candidate_response="Updated the runtime tests.",
        emit=capture,
        summary=summary,
        executor=FakeExecutor(
            [
                tool_outcome(
                    tool_call=tool_call,
                    output="219 passed",
                    is_error=False,
                    exit_code=0,
                    stdout="219 passed",
                )
            ]
        ),  # type: ignore[arg-type]
    )

    assert result.should_continue is False
    assert result.reason_code == "verification_passed"
    assert [item.status for item in result.verification_observations] == [
        VerificationObservationStatus.PASSED.value
    ]
    assert result.verification_observations[0].attempt_id == "verification-attempt-1"
    assert result.verification_observations[0].attempt_number == 1
    assert result.verification_observations[0].command == "uv run pytest -q"
    assert result.verification_observations[0].detail == "219 passed"
    assert summary.verification_status == "passed"
    assert [entry.reason_code for entry in session.workflow_timeline[-2:]] == [
        "verification_pending",
        "verification_command_passed",
    ]
    assert [item.status for item in session.workflow_timeline[-2].verification_observations] == [
        VerificationObservationStatus.PENDING.value
    ]
    assert (
        session.workflow_timeline[-2].verification_observations[0].attempt_id
        == "verification-attempt-1"
    )
    assert session.workflow_timeline[-2].verification_observations[0].command == (
        "uv run pytest -q"
    )
    assert session.workflow_timeline[-1].kind == "verify_observation"
    assert session.workflow_timeline[-1].reason_code == "verification_command_passed"
    assert [item.status for item in session.workflow_timeline[-1].verification_observations] == [
        VerificationObservationStatus.PASSED.value
    ]
378
379
@pytest.mark.asyncio
async def test_turn_finalizer_appends_runtime_semantic_verifier_to_planned_commands(
    temp_dir: Path,
) -> None:
    """The gate runs the planned ``grep`` and also a ``python3`` heredoc command.

    Only ``grep`` is placed in ``dod.verification_commands``; the test expects
    the executor to additionally see a command starting with
    ``python3 - <<'PY'``.
    """
    chapters = temp_dir / "chapters"
    chapters.mkdir()
    (chapters / "01-introduction.html").write_text(
        "<h1>Chapter 1: Introduction to Fortran</h1>\n"
    )
    index = temp_dir / "index.html"
    index.write_text(
        "\n".join(
            [
                '<ul class="chapter-list">',
                ' <li><a href="chapters/01-introduction.html">Chapter 1: Introduction to Fortran</a></li>',
                "</ul>",
            ]
        )
    )

    session = FakeSession()
    context = build_context(temp_dir, session)
    finalizer = TurnFinalizer(
        context,
        RuntimeTracer(),
        DefinitionOfDoneStore(temp_dir),
        set_workflow_mode=_noop_set_workflow_mode,
    )
    dod = create_definition_of_done(
        "Update index.html so the table of contents links and chapter titles are correct."
    )
    dod.mutating_actions.append("edit")
    dod.touched_files.append(str(index))
    dod.verification_commands = ['grep -n "href=" index.html']
    summary = TurnSummary(final_response="")
    executor = RecordingExecutor()

    async def capture(event) -> None:
        # Events are irrelevant to this test; swallow them.
        return None

    result = await finalizer.run_definition_of_done_gate(
        dod=dod,
        candidate_response="Updated the index.html links.",
        emit=capture,
        summary=summary,
        executor=executor,  # type: ignore[arg-type]
    )

    assert result.should_continue is False
    assert any(command == 'grep -n "href=" index.html' for command in executor.commands)
    assert any(command.startswith("python3 - <<'PY'") for command in executor.commands)
    assert (
        session.workflow_timeline[-1].verification_observations[0].attempt_id
        == "verification-attempt-1"
    )
435
436
@pytest.mark.asyncio
async def test_turn_finalizer_records_missing_verification_observation(
    temp_dir: Path,
) -> None:
    """A mutating turn with no verification commands fails the gate with a MISSING observation.

    The gate is expected to ask for another iteration and append a user
    message starting with ``[DEFINITION OF DONE CHECK FAILED]``.
    """
    session = FakeSession()
    context = build_context(temp_dir, session)
    finalizer = TurnFinalizer(
        context,
        RuntimeTracer(),
        DefinitionOfDoneStore(temp_dir),
        set_workflow_mode=_noop_set_workflow_mode,
    )
    dod = create_definition_of_done("Edit the loader bootstrap.")
    dod.mutating_actions.append("edit")
    summary = TurnSummary(final_response="")

    async def capture(event) -> None:
        # Events are irrelevant to this test; swallow them.
        return None

    result = await finalizer.run_definition_of_done_gate(
        dod=dod,
        candidate_response="Updated the bootstrap code.",
        emit=capture,
        summary=summary,
        # Empty queue: any attempt to run a command raises AssertionError.
        executor=FakeExecutor([]),  # type: ignore[arg-type]
    )

    assert result.should_continue is True
    assert result.reason_code == "verification_failed_reentry"
    assert [item.status for item in result.verification_observations] == [
        VerificationObservationStatus.MISSING.value
    ]
    assert result.verification_observations[0].attempt_id == "verification-attempt-1"
    assert result.verification_observations[0].attempt_number == 1
    assert [item.summary for item in result.verification_observations] == [
        "verification commands were still missing at execution time"
    ]
    assert summary.verification_status == "failed"
    assert session.workflow_timeline[-1].kind == "verify_observation"
    assert session.workflow_timeline[-1].reason_code == "verification_commands_missing"
    assert [item.status for item in session.workflow_timeline[-1].verification_observations] == [
        VerificationObservationStatus.MISSING.value
    ]
    assert (
        session.workflow_timeline[-1].verification_observations[0].attempt_id
        == "verification-attempt-1"
    )
    assert session.messages[-1].role == Role.USER
    assert session.messages[-1].content.startswith("[DEFINITION OF DONE CHECK FAILED]")
486
487
@pytest.mark.asyncio
async def test_turn_finalizer_does_not_reverify_without_new_changes(
    temp_dir: Path,
) -> None:
    """A previously failed verification is not re-run when the stored signature still matches.

    The definition of done is seeded with a ``failed`` result and a signature
    built from its current line count and touched file.  The test expects no
    command to reach the executor and a ``STILL FAILING`` message to be
    appended to the session.
    """
    session = FakeSession()
    context = build_context(temp_dir, session)
    finalizer = TurnFinalizer(
        context,
        RuntimeTracer(),
        DefinitionOfDoneStore(temp_dir),
        set_workflow_mode=_noop_set_workflow_mode,
    )
    index = temp_dir / "index.html"
    index.write_text("<ul></ul>\n")
    dod = create_definition_of_done("Fix the chapter list in index.html.")
    dod.mutating_actions.append("edit")
    dod.touched_files.append(str(index))
    dod.line_changes = 12
    dod.last_verification_result = "failed"
    dod.last_verification_signature = (
        f"lines={dod.line_changes};touched={index};actions=1;commands="
    )
    dod.evidence = []
    summary = TurnSummary(final_response="")
    executor = RecordingExecutor()

    async def capture(event) -> None:
        # Events are irrelevant to this test; swallow them.
        return None

    result = await finalizer.run_definition_of_done_gate(
        dod=dod,
        candidate_response="I checked the file again.",
        emit=capture,
        summary=summary,
        executor=executor,  # type: ignore[arg-type]
    )

    assert result.should_continue is True
    assert result.reason_code == "verification_failed_no_new_changes"
    assert executor.commands == []
    assert summary.verification_status == "failed"
    assert session.messages[-1].content.startswith("[DEFINITION OF DONE CHECK STILL FAILING]")
530
531
@pytest.mark.asyncio
async def test_turn_finalizer_accepts_missing_optional_html5validator_when_semantic_check_passes(
    temp_dir: Path,
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """A missing ``html5validator`` binary is skipped when the semantic check passes.

    Two verification commands are queued: a semantic check that exits 0 and
    an ``html5validator`` run that exits 127 ("command not found").  The gate
    is expected to accept the turn, mark the second evidence item as skipped,
    and mention the skip in the final response.
    """
    session = FakeSession()
    context = build_context(temp_dir, session)
    finalizer = TurnFinalizer(
        context,
        RuntimeTracer(),
        DefinitionOfDoneStore(temp_dir),
        set_workflow_mode=_noop_set_workflow_mode,
    )
    dod = create_definition_of_done(
        "Update index.html so the table of contents links and chapter titles are correct."
    )
    dod.mutating_actions.append("edit")
    dod.touched_files.append(str(temp_dir / "index.html"))
    dod.verification_commands = [
        "python3 - <<'PY'\nprint('semantic ok')\nPY",
        "html5validator --root /tmp/fortran-qwen-recovery-check/",
    ]
    summary = TurnSummary(final_response="")
    semantic_call = ToolCall(
        id="verify-1-1",
        name="bash",
        arguments={"command": dod.verification_commands[0], "cwd": str(temp_dir)},
    )
    html5validator_call = ToolCall(
        id="verify-1-2",
        name="bash",
        arguments={"command": dod.verification_commands[1], "cwd": str(temp_dir)},
    )

    async def capture(event) -> None:
        # Events are irrelevant to this test; swallow them.
        return None

    # Stub out command derivation so only the two commands above are in play.
    # The stub takes *args/**kwargs so it accepts whatever call shape is used.
    monkeypatch.setattr(
        "loader.runtime.finalization.derive_verification_commands",
        lambda *args, **kwargs: [],
    )

    result = await finalizer.run_definition_of_done_gate(
        dod=dod,
        candidate_response="Updated the chapter links and titles.",
        emit=capture,
        summary=summary,
        executor=FakeExecutor(
            [
                tool_outcome(
                    tool_call=semantic_call,
                    output="semantic ok",
                    is_error=False,
                    exit_code=0,
                    stdout="semantic ok",
                ),
                tool_outcome(
                    tool_call=html5validator_call,
                    output="/bin/sh: html5validator: command not found",
                    is_error=True,
                    exit_code=127,
                    stderr="/bin/sh: html5validator: command not found",
                ),
            ]
        ),  # type: ignore[arg-type]
    )

    assert result.should_continue is False
    assert result.reason_code == "verification_passed"
    assert summary.verification_status == "passed"
    assert dod.status == "done"
    assert dod.last_verification_result == "passed"
    assert [item.passed for item in dod.evidence] == [True, False]
    assert [item.skipped for item in dod.evidence] == [False, True]
    assert "SKIP" in result.final_response
    assert "html5validator" in result.final_response
    assert session.workflow_timeline[-2].reason_code == "verification_command_passed"
    assert session.workflow_timeline[-1].reason_code == "verification_command_skipped"
    assert [item.status for item in session.workflow_timeline[-1].verification_observations] == [
        VerificationObservationStatus.SKIPPED.value
    ]