Python · 21126 bytes Raw Blame History
1 """Tests for finalization helpers on RuntimeContext."""
2
3 from __future__ import annotations
4
5 from pathlib import Path
6 from types import SimpleNamespace
7
8 import pytest
9
10 from loader.llm.base import Message, Role, ToolCall
11 from loader.runtime.completion_trace import CompletionTraceEntry
12 from loader.runtime.context import RuntimeContext
13 from loader.runtime.dod import DefinitionOfDoneStore, create_definition_of_done
14 from loader.runtime.events import TurnSummary
15 from loader.runtime.executor import ToolExecutionOutcome, ToolExecutionState
16 from loader.runtime.finalization import TurnFinalizer
17 from loader.runtime.permissions import (
18 PermissionMode,
19 build_permission_policy,
20 load_permission_rules,
21 )
22 from loader.runtime.tracing import RuntimeTracer
23 from loader.runtime.verification_observations import VerificationObservationStatus
24 from loader.tools.base import ToolResult as RegistryToolResult
25 from loader.tools.base import create_default_registry
26 from tests.helpers.runtime_harness import ScriptedBackend
27
28
class FakeSession:
    """In-memory stand-in for the runtime session used by these tests.

    A fresh instance carries a canned ``verification_passed`` completion
    decision plus a matching single-entry completion trace. Messages,
    workflow timeline entries, and usage payloads handed to it are kept in
    plain lists so a test can inspect them afterwards.
    """

    def __init__(self) -> None:
        decision_code = "verification_passed"
        decision_summary = "accepted the response after verification evidence passed"

        self.messages: list[Message] = []
        self.session_id = "session-test-123"
        self.recorded_calls: list[dict[str, object]] = []

        # The "last decision" fields and the trace entry share the same values.
        self.last_completion_decision_code = decision_code
        self.last_completion_decision_summary = decision_summary
        self.completion_trace = [
            CompletionTraceEntry(
                stage="definition_of_done",
                outcome="complete",
                decision_code=decision_code,
                decision_summary=decision_summary,
            )
        ]
        self.last_turn_transition_summary = (
            "completion -> finalize [terminal] Finalizing completed turn"
        )
        self.workflow_timeline = []

    def append(self, message: Message) -> None:
        """Store ``message`` at the end of ``self.messages``."""
        self.messages.append(message)

    def append_workflow_timeline_entry(self, entry) -> None:
        """Store ``entry`` at the end of ``self.workflow_timeline``."""
        self.workflow_timeline.append(entry)

    def record_turn_usage(
        self,
        usage: dict[str, int],
        *,
        tool_calls: int,
        iterations: int,
    ) -> dict[str, int]:
        """Remember one usage report and return a fixed-shape cumulative total.

        The call is logged in ``self.recorded_calls`` with a copy of ``usage``
        so later mutation by the caller does not change the record. The
        returned mapping always reports ``turns`` as 1 and echoes the given
        ``tool_calls`` and ``iterations``.
        """
        self.recorded_calls.append(
            {
                "usage": dict(usage),
                "tool_calls": tool_calls,
                "iterations": iterations,
            }
        )
        return {"turns": 1, "tool_calls": tool_calls, "iterations": iterations}
71
72
class FakeCodeFilter:
    """Code-filter stub whose only method does nothing."""

    def reset(self) -> None:
        """Accept a reset request and do nothing."""
76
77
class FakeSafeguards:
    """Safeguards stub that passes content through and never intervenes.

    Filters return their input unchanged, steering is never requested, and
    both loop detectors always report that no loop was found.
    """

    def __init__(self) -> None:
        # Opaque placeholders; nothing in this class inspects them.
        self.action_tracker = object()
        self.validator = object()
        self.code_filter = FakeCodeFilter()

    def filter_stream_chunk(self, content: str) -> str:
        """Return the streamed chunk exactly as received."""
        return content

    def filter_complete_content(self, content: str) -> str:
        """Return the complete content exactly as received."""
        return content

    def should_steer(self) -> bool:
        """Report that no steering is needed."""
        return False

    def get_steering_message(self) -> str | None:
        """Return ``None``: this stub has no steering message to offer."""
        return None

    def record_response(self, content: str) -> None:
        """Ignore the response; nothing is recorded."""

    def detect_text_loop(self, content: str) -> tuple[bool, str]:
        """Report "no loop" with an empty explanation, whatever the content."""
        return (False, "")

    def detect_loop(self) -> tuple[bool, str]:
        """Report "no loop" with an empty explanation."""
        return (False, "")
104
105
class FakeExecutor:
    """Executor stub that replays pre-queued outcomes in FIFO order."""

    def __init__(self, outcomes: list[ToolExecutionOutcome]) -> None:
        # Private copy, so draining the queue never touches the caller's list.
        self._outcomes = [*outcomes]

    async def execute_tool_call(self, tool_call: ToolCall, **_: object) -> ToolExecutionOutcome:
        """Hand back the oldest queued outcome, ignoring ``tool_call``.

        Raises:
            AssertionError: when the queue is already empty.
        """
        if self._outcomes:
            return self._outcomes.pop(0)
        raise AssertionError("No fake verification outcome queued")
114
115
class RecordingExecutor:
    """Executor stub that logs each requested command and always succeeds."""

    def __init__(self) -> None:
        self.commands: list[str] = []

    async def execute_tool_call(self, tool_call: ToolCall, **_: object) -> ToolExecutionOutcome:
        """Log the call's ``command`` argument and return a passing outcome.

        A call without a ``command`` argument is logged as an empty string.
        The returned outcome reports ``"ok"`` output with exit code 0.
        """
        requested = tool_call.arguments.get("command", "")
        self.commands.append(str(requested))
        success = tool_outcome(
            tool_call=tool_call,
            output="ok",
            is_error=False,
            exit_code=0,
            stdout="ok",
        )
        return success
130
131
def build_context(temp_dir: Path, session: FakeSession) -> RuntimeContext:
    """Assemble a ``RuntimeContext`` rooted at ``temp_dir`` around ``session``.

    The context uses the default tool registry, a workspace-write permission
    policy built from the rules loaded for ``temp_dir``, a ``ScriptedBackend``,
    ``FakeSafeguards``, and a namespace-based config with a verification retry
    budget of 3. The workflow mode is ``"execute"``.
    """
    tool_registry = create_default_registry(temp_dir)
    tool_registry.configure_workspace_root(temp_dir)

    loaded_rules = load_permission_rules(temp_dir)
    workspace_policy = build_permission_policy(
        active_mode=PermissionMode.WORKSPACE_WRITE,
        workspace_root=temp_dir,
        tool_requirements=tool_registry.get_tool_requirements(),
        rules=loaded_rules.rules,
    )

    reasoning_settings = SimpleNamespace(
        rollback=False,
        show_rollback_plan=False,
        completion_check=True,
        use_quick_completion=True,
        max_continuation_prompts=5,
        self_critique=False,
        confidence_scoring=False,
        min_confidence_for_action=3,
        verification=False,
    )
    runtime_config = SimpleNamespace(
        force_react=False,
        verification_retry_budget=3,
        reasoning=reasoning_settings,
    )
    native_tool_profile = SimpleNamespace(supports_native_tools=True)

    return RuntimeContext(
        project_root=temp_dir,
        backend=ScriptedBackend(),
        registry=tool_registry,
        session=session,  # type: ignore[arg-type]
        config=runtime_config,
        capability_profile=native_tool_profile,  # type: ignore[arg-type]
        project_context=None,
        permission_policy=workspace_policy,
        permission_config_status=loaded_rules,
        workflow_mode="execute",
        safeguards=FakeSafeguards(),
    )
169
170
def tool_outcome(
    *,
    tool_call: ToolCall,
    output: str,
    is_error: bool,
    exit_code: int,
    stdout: str = "",
    stderr: str = "",
) -> ToolExecutionOutcome:
    """Build an ``EXECUTED`` outcome for ``tool_call`` from plain values.

    ``output`` is used for the tool-result message (display and result
    content), the event content, the result output, and the registry result.
    ``exit_code``, ``stdout`` and ``stderr`` are stored in the registry
    result's metadata.
    """
    result_message = Message.tool_result_message(
        tool_call_id=tool_call.id,
        display_content=output,
        result_content=output,
        is_error=is_error,
    )
    process_metadata = {
        "exit_code": exit_code,
        "stdout": stdout,
        "stderr": stderr,
    }
    registry_result = RegistryToolResult(
        output=output,
        is_error=is_error,
        metadata=process_metadata,
    )
    return ToolExecutionOutcome(
        tool_call=tool_call,
        state=ToolExecutionState.EXECUTED,
        message=result_message,
        event_content=output,
        is_error=is_error,
        result_output=output,
        registry_result=registry_result,
    )
202
203
async def _noop_set_workflow_mode(mode, dod, emit, summary) -> None:
    """Accept a workflow-mode change request and do nothing with it."""
206
207
def test_turn_finalizer_finalize_summary_uses_runtime_context(
    temp_dir: Path,
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """``finalize_summary`` fills the summary from the context's session.

    Checks the session id, the usage recorded on the session, the memory-store
    capture hook being invoked, and the completion decision/trace copied onto
    the final summary.
    """
    fake_session = FakeSession()
    runtime_context = build_context(temp_dir, fake_session)

    runtime_tracer = RuntimeTracer()
    runtime_tracer.record("turn.completed", reason="done")

    finalizer = TurnFinalizer(
        runtime_context,
        runtime_tracer,
        DefinitionOfDoneStore(temp_dir),
        set_workflow_mode=_noop_set_workflow_mode,
    )

    finished_dod = create_definition_of_done("Finish the task")
    finished_dod.status = "done"
    turn_summary = TurnSummary(
        final_response="All set.",
        definition_of_done=finished_dod,
        iterations=2,
        usage={"prompt_tokens": 10},
        tool_result_messages=[Message(role=Role.TOOL, content="tool output")],
    )

    # Swap the memory-store capture for a recorder so the call is observable.
    memory_captures: dict[str, str] = {}

    def record_capture(self, summary_text: str) -> None:
        memory_captures["summary"] = summary_text

    monkeypatch.setattr(
        "loader.runtime.finalization.MemoryStore.capture_definition_of_done",
        record_capture,
    )

    finalized = finalizer.finalize_summary(turn_summary)

    expected_decision_summary = "accepted the response after verification evidence passed"
    expected_recorded_call = {
        "usage": {"prompt_tokens": 10, "tool_calls": 1, "iterations": 2},
        "tool_calls": 1,
        "iterations": 2,
    }

    assert finalized.session_id == "session-test-123"
    assert finalized.cumulative_usage == {"turns": 1, "tool_calls": 1, "iterations": 2}
    assert fake_session.recorded_calls == [expected_recorded_call]
    assert "summary" in memory_captures
    assert finalized.trace
    assert finalized.completion_decision_code == "verification_passed"
    assert finalized.completion_decision_summary == expected_decision_summary
    trace_codes = [entry.decision_code for entry in finalized.completion_trace]
    assert trace_codes == ["verification_passed"]
261
262
@pytest.mark.asyncio
async def test_turn_finalizer_records_skipped_verification_observation(
    temp_dir: Path,
) -> None:
    """A definition of done with no mutating actions skips verification.

    The gate is expected to accept the response, report one ``SKIPPED``
    observation, append a ``verify_skip`` timeline entry, and emit a
    ``dod_status`` event whose status is ``done``.
    """
    fake_session = FakeSession()
    finalizer = TurnFinalizer(
        build_context(temp_dir, fake_session),
        RuntimeTracer(),
        DefinitionOfDoneStore(temp_dir),
        set_workflow_mode=_noop_set_workflow_mode,
    )
    read_only_dod = create_definition_of_done("Explain Loader's clarify loop.")
    turn_summary = TurnSummary(final_response="")
    emitted = []

    async def collect(event) -> None:
        emitted.append(event)

    result = await finalizer.run_definition_of_done_gate(
        dod=read_only_dod,
        candidate_response="Loader uses a bounded clarify loop before execution.",
        emit=collect,
        summary=turn_summary,
        executor=FakeExecutor([]),  # type: ignore[arg-type]
    )

    skipped = [VerificationObservationStatus.SKIPPED.value]

    assert result.should_continue is False
    assert result.reason_code == "non_mutating_response_accepted"
    assert [item.status for item in result.verification_observations] == skipped
    assert [item.summary for item in result.verification_observations] == [
        "verification was skipped because no mutating work required checks"
    ]
    assert turn_summary.verification_status == "skipped"

    assert fake_session.workflow_timeline[-1].kind == "verify_skip"
    timeline_statuses = [
        item.status
        for item in fake_session.workflow_timeline[-1].verification_observations
    ]
    assert timeline_statuses == skipped

    assert any(
        event.type == "dod_status" and event.dod_status == "done"
        for event in emitted
    )
304
305
@pytest.mark.asyncio
async def test_turn_finalizer_records_passed_verification_observation(
    temp_dir: Path,
) -> None:
    """A passing verification command yields a ``PASSED`` observation.

    The definition of done has one mutating action and one verification
    command; the fake executor answers it with exit code 0. The gate should
    stop, and the last two timeline entries should be the pending entry
    followed by the passed observation for attempt 1.
    """
    verify_command = "uv run pytest -q"

    fake_session = FakeSession()
    finalizer = TurnFinalizer(
        build_context(temp_dir, fake_session),
        RuntimeTracer(),
        DefinitionOfDoneStore(temp_dir),
        set_workflow_mode=_noop_set_workflow_mode,
    )
    mutating_dod = create_definition_of_done("Update the runtime tests.")
    mutating_dod.mutating_actions.append("write")
    mutating_dod.verification_commands = [verify_command]
    turn_summary = TurnSummary(final_response="")
    verify_call = ToolCall(
        id="verify-1-1",
        name="bash",
        arguments={"command": verify_command, "cwd": str(temp_dir)},
    )
    passing_outcome = tool_outcome(
        tool_call=verify_call,
        output="219 passed",
        is_error=False,
        exit_code=0,
        stdout="219 passed",
    )

    async def discard(event) -> None:
        return None

    result = await finalizer.run_definition_of_done_gate(
        dod=mutating_dod,
        candidate_response="Updated the runtime tests.",
        emit=discard,
        summary=turn_summary,
        executor=FakeExecutor([passing_outcome]),  # type: ignore[arg-type]
    )

    passed = [VerificationObservationStatus.PASSED.value]

    # Gate result.
    assert result.should_continue is False
    assert result.reason_code == "verification_passed"
    assert [item.status for item in result.verification_observations] == passed
    observation = result.verification_observations[0]
    assert observation.attempt_id == "verification-attempt-1"
    assert observation.attempt_number == 1
    assert observation.command == verify_command
    assert observation.detail == "219 passed"
    assert turn_summary.verification_status == "passed"

    # Timeline tail: pending entry, then the passed observation.
    assert [entry.reason_code for entry in fake_session.workflow_timeline[-2:]] == [
        "verification_pending",
        "verification_command_passed",
    ]
    pending_entry, passed_entry = fake_session.workflow_timeline[-2:]

    assert [item.status for item in pending_entry.verification_observations] == [
        VerificationObservationStatus.PENDING.value
    ]
    pending_observation = pending_entry.verification_observations[0]
    assert pending_observation.attempt_id == "verification-attempt-1"
    assert pending_observation.command == verify_command

    assert passed_entry.kind == "verify_observation"
    assert passed_entry.reason_code == "verification_command_passed"
    assert [item.status for item in passed_entry.verification_observations] == passed
378
379
@pytest.mark.asyncio
async def test_turn_finalizer_appends_runtime_semantic_verifier_to_planned_commands(
    temp_dir: Path,
) -> None:
    """The gate runs the planned command plus a ``python3`` heredoc command.

    A small site (one chapter page and an index linking to it) is written to
    ``temp_dir``. The definition of done plans only a ``grep``; the recording
    executor must additionally see a command starting with
    ``python3 - <<'PY'`` (presumably the runtime semantic verifier named in
    the test title).
    """
    chapter_dir = temp_dir / "chapters"
    chapter_dir.mkdir()
    chapter_page = chapter_dir / "01-introduction.html"
    chapter_page.write_text("<h1>Chapter 1: Introduction to Fortran</h1>\n")

    index_page = temp_dir / "index.html"
    toc_lines = (
        '<ul class="chapter-list">',
        ' <li><a href="chapters/01-introduction.html">Chapter 1: Introduction to Fortran</a></li>',
        "</ul>",
    )
    index_page.write_text("\n".join(toc_lines))

    planned_command = 'grep -n "href=" index.html'

    fake_session = FakeSession()
    finalizer = TurnFinalizer(
        build_context(temp_dir, fake_session),
        RuntimeTracer(),
        DefinitionOfDoneStore(temp_dir),
        set_workflow_mode=_noop_set_workflow_mode,
    )
    toc_dod = create_definition_of_done(
        "Update index.html so the table of contents links and chapter titles are correct."
    )
    toc_dod.mutating_actions.append("edit")
    toc_dod.touched_files.append(str(index_page))
    toc_dod.verification_commands = [planned_command]
    turn_summary = TurnSummary(final_response="")
    recorder = RecordingExecutor()

    async def discard(event) -> None:
        return None

    result = await finalizer.run_definition_of_done_gate(
        dod=toc_dod,
        candidate_response="Updated the index.html links.",
        emit=discard,
        summary=turn_summary,
        executor=recorder,  # type: ignore[arg-type]
    )

    assert result.should_continue is False
    assert planned_command in recorder.commands
    assert any(command.startswith("python3 - <<'PY'") for command in recorder.commands)

    last_observation = fake_session.workflow_timeline[-1].verification_observations[0]
    assert last_observation.attempt_id == "verification-attempt-1"
435
436
@pytest.mark.asyncio
async def test_turn_finalizer_records_missing_verification_observation(
    temp_dir: Path,
) -> None:
    """Mutating work with no verification commands fails the gate.

    The gate should ask to continue, report one ``MISSING`` observation for
    attempt 1, mark the summary as failed, and append a user message that
    starts with the definition-of-done failure banner.
    """
    fake_session = FakeSession()
    finalizer = TurnFinalizer(
        build_context(temp_dir, fake_session),
        RuntimeTracer(),
        DefinitionOfDoneStore(temp_dir),
        set_workflow_mode=_noop_set_workflow_mode,
    )
    unverified_dod = create_definition_of_done("Edit the loader bootstrap.")
    unverified_dod.mutating_actions.append("edit")
    turn_summary = TurnSummary(final_response="")

    async def discard(event) -> None:
        return None

    result = await finalizer.run_definition_of_done_gate(
        dod=unverified_dod,
        candidate_response="Updated the bootstrap code.",
        emit=discard,
        summary=turn_summary,
        executor=FakeExecutor([]),  # type: ignore[arg-type]
    )

    missing = [VerificationObservationStatus.MISSING.value]

    # Gate result.
    assert result.should_continue is True
    assert result.reason_code == "verification_failed_reentry"
    assert [item.status for item in result.verification_observations] == missing
    first_observation = result.verification_observations[0]
    assert first_observation.attempt_id == "verification-attempt-1"
    assert first_observation.attempt_number == 1
    assert [item.summary for item in result.verification_observations] == [
        "verification commands were still missing at execution time"
    ]
    assert turn_summary.verification_status == "failed"

    # Timeline tail.
    last_entry = fake_session.workflow_timeline[-1]
    assert last_entry.kind == "verify_observation"
    assert last_entry.reason_code == "verification_commands_missing"
    assert [item.status for item in last_entry.verification_observations] == missing
    assert last_entry.verification_observations[0].attempt_id == "verification-attempt-1"

    # Re-entry prompt appended to the session.
    reentry_message = fake_session.messages[-1]
    assert reentry_message.role == Role.USER
    assert reentry_message.content.startswith("[DEFINITION OF DONE CHECK FAILED]")
486
487
@pytest.mark.asyncio
async def test_turn_finalizer_does_not_reverify_without_new_changes(
    temp_dir: Path,
) -> None:
    """A failed verification is not re-run when its signature is unchanged.

    The definition of done is pre-seeded with a failed result and a
    signature built from its current line changes and touched file. The gate
    should ask to continue without issuing any command to the executor.
    """
    fake_session = FakeSession()
    finalizer = TurnFinalizer(
        build_context(temp_dir, fake_session),
        RuntimeTracer(),
        DefinitionOfDoneStore(temp_dir),
        set_workflow_mode=_noop_set_workflow_mode,
    )

    index_page = temp_dir / "index.html"
    index_page.write_text("<ul></ul>\n")

    stale_dod = create_definition_of_done("Fix the chapter list in index.html.")
    stale_dod.mutating_actions.append("edit")
    stale_dod.touched_files.append(str(index_page))
    stale_dod.line_changes = 12
    # Seed the previous attempt: failed, with a signature matching this state.
    stale_dod.last_verification_result = "failed"
    stale_dod.last_verification_signature = (
        f"lines={stale_dod.line_changes};touched={index_page};actions=1;commands="
    )
    stale_dod.evidence = []

    turn_summary = TurnSummary(final_response="")
    recorder = RecordingExecutor()

    async def discard(event) -> None:
        return None

    result = await finalizer.run_definition_of_done_gate(
        dod=stale_dod,
        candidate_response="I checked the file again.",
        emit=discard,
        summary=turn_summary,
        executor=recorder,  # type: ignore[arg-type]
    )

    assert result.should_continue is True
    assert result.reason_code == "verification_failed_no_new_changes"
    assert recorder.commands == []
    assert turn_summary.verification_status == "failed"
    latest_message = fake_session.messages[-1]
    assert latest_message.content.startswith("[DEFINITION OF DONE CHECK STILL FAILING]")
530
531
@pytest.mark.asyncio
async def test_turn_finalizer_accepts_missing_optional_html5validator_when_semantic_check_passes(
    temp_dir: Path,
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """A missing ``html5validator`` is skipped when the other check passes.

    Two verification commands are planned: a ``python3`` heredoc that
    succeeds and an ``html5validator`` call answered with exit code 127
    ("command not found"). The gate should still pass, record the second
    evidence item as skipped, and mention the skip in the final response.
    """
    fake_session = FakeSession()
    finalizer = TurnFinalizer(
        build_context(temp_dir, fake_session),
        RuntimeTracer(),
        DefinitionOfDoneStore(temp_dir),
        set_workflow_mode=_noop_set_workflow_mode,
    )
    toc_dod = create_definition_of_done(
        "Update index.html so the table of contents links and chapter titles are correct."
    )
    toc_dod.mutating_actions.append("edit")
    toc_dod.touched_files.append(str(temp_dir / "index.html"))
    toc_dod.verification_commands = [
        "python3 - <<'PY'\nprint('semantic ok')\nPY",
        "html5validator --root /tmp/fortran-qwen-recovery-check/",
    ]
    turn_summary = TurnSummary(final_response="")

    working_dir = str(temp_dir)
    semantic_call = ToolCall(
        id="verify-1-1",
        name="bash",
        arguments={"command": toc_dod.verification_commands[0], "cwd": working_dir},
    )
    validator_call = ToolCall(
        id="verify-1-2",
        name="bash",
        arguments={"command": toc_dod.verification_commands[1], "cwd": working_dir},
    )

    async def discard(event) -> None:
        return None

    def no_derived_commands(*args, **kwargs):
        # Replacement for derive_verification_commands: always an empty list.
        return []

    monkeypatch.setattr(
        "loader.runtime.finalization.derive_verification_commands",
        no_derived_commands,
    )

    missing_tool_message = "/bin/sh: html5validator: command not found"
    queued_outcomes = [
        tool_outcome(
            tool_call=semantic_call,
            output="semantic ok",
            is_error=False,
            exit_code=0,
            stdout="semantic ok",
        ),
        tool_outcome(
            tool_call=validator_call,
            output=missing_tool_message,
            is_error=True,
            exit_code=127,
            stderr=missing_tool_message,
        ),
    ]

    result = await finalizer.run_definition_of_done_gate(
        dod=toc_dod,
        candidate_response="Updated the chapter links and titles.",
        emit=discard,
        summary=turn_summary,
        executor=FakeExecutor(queued_outcomes),  # type: ignore[arg-type]
    )

    # Gate result and definition-of-done state.
    assert result.should_continue is False
    assert result.reason_code == "verification_passed"
    assert turn_summary.verification_status == "passed"
    assert toc_dod.status == "done"
    assert toc_dod.last_verification_result == "passed"
    assert [(item.passed, item.skipped) for item in toc_dod.evidence] == [
        (True, False),
        (False, True),
    ]
    assert "SKIP" in result.final_response
    assert "html5validator" in result.final_response

    # Timeline tail: the passed command, then the skipped one.
    passed_entry = fake_session.workflow_timeline[-2]
    skipped_entry = fake_session.workflow_timeline[-1]
    assert passed_entry.reason_code == "verification_command_passed"
    assert skipped_entry.reason_code == "verification_command_skipped"
    assert [item.status for item in skipped_entry.verification_observations] == [
        VerificationObservationStatus.SKIPPED.value
    ]
612 ]