Python · 42568 bytes Raw Blame History
1 """Tests for finalization helpers on RuntimeContext."""
2
3 from __future__ import annotations
4
5 from pathlib import Path
6 from types import SimpleNamespace
7
8 import pytest
9
10 from loader.llm.base import Message, Role, ToolCall
11 from loader.runtime.completion_trace import CompletionTraceEntry
12 from loader.runtime.context import RuntimeContext
13 from loader.runtime.dod import (
14 DefinitionOfDoneStore,
15 VerificationEvidence,
16 create_definition_of_done,
17 )
18 from loader.runtime.events import TurnSummary
19 from loader.runtime.executor import ToolExecutionOutcome, ToolExecutionState
20 from loader.runtime.finalization import (
21 TurnFinalizer,
22 _build_verification_repair_guidance,
23 )
24 from loader.runtime.permissions import (
25 PermissionMode,
26 build_permission_policy,
27 load_permission_rules,
28 )
29 from loader.runtime.repair_focus import extract_active_repair_context
30 from loader.runtime.tracing import RuntimeTracer
31 from loader.runtime.verification_observations import VerificationObservationStatus
32 from loader.tools.base import ToolResult as RegistryToolResult
33 from loader.tools.base import create_default_registry
34 from tests.helpers.runtime_harness import ScriptedBackend
35
36
class FakeSession:
    """In-memory stand-in for the runtime session used by these tests.

    Appended messages, workflow-timeline entries and turn-usage payloads are
    collected in plain lists.  A new instance starts with a
    ``verification_passed`` completion decision and one matching
    completion-trace entry.
    """

    def __init__(self) -> None:
        decision_code = "verification_passed"
        decision_summary = "accepted the response after verification evidence passed"

        self.messages: list[Message] = []
        self.session_id = "session-test-123"
        self.recorded_calls: list[dict[str, object]] = []
        self.last_completion_decision_code = decision_code
        self.last_completion_decision_summary = decision_summary
        self.completion_trace = [
            CompletionTraceEntry(
                stage="definition_of_done",
                outcome="complete",
                decision_code=decision_code,
                decision_summary=decision_summary,
            )
        ]
        self.last_turn_transition_summary = (
            "completion -> finalize [terminal] Finalizing completed turn"
        )
        self.workflow_timeline = []

    def append(self, message: Message) -> None:
        """Add *message* to the end of ``messages``."""
        self.messages.append(message)

    def append_workflow_timeline_entry(self, entry) -> None:
        """Add *entry* to the end of ``workflow_timeline``."""
        self.workflow_timeline.append(entry)

    def record_turn_usage(
        self,
        usage: dict[str, int],
        *,
        tool_calls: int,
        iterations: int,
    ) -> dict[str, int]:
        """Remember one usage report and return a fixed cumulative total.

        A shallow copy of *usage* is stored together with the two counters in
        ``recorded_calls``.  The returned mapping always reports ``turns`` as 1
        and echoes *tool_calls* and *iterations*.
        """
        self.recorded_calls.append(
            {
                "usage": {**usage},
                "tool_calls": tool_calls,
                "iterations": iterations,
            }
        )
        return {"turns": 1, "tool_calls": tool_calls, "iterations": iterations}
79
80
class FakeCodeFilter:
    """Code-filter stub whose ``reset`` is a no-op."""

    def reset(self) -> None:
        """Do nothing and return ``None``."""
84
85
class FakeSafeguards:
    """Safeguards stub: content passes through unchanged and nothing is flagged."""

    def __init__(self) -> None:
        # Opaque placeholders; the stub itself never inspects them.
        self.action_tracker = object()
        self.validator = object()
        self.code_filter = FakeCodeFilter()

    def filter_stream_chunk(self, content: str) -> str:
        """Return the streamed chunk untouched."""
        return content

    def filter_complete_content(self, content: str) -> str:
        """Return the complete content untouched."""
        return content

    def should_steer(self) -> bool:
        """Always report that no steering is needed."""
        return False

    def get_steering_message(self) -> str | None:
        """Always report that there is no steering message."""
        return None

    def record_response(self, content: str) -> None:
        """Ignore the response."""

    def detect_text_loop(self, content: str) -> tuple[bool, str]:
        """Always report "no loop" with an empty description."""
        return (False, "")

    def detect_loop(self) -> tuple[bool, str]:
        """Always report "no loop" with an empty description."""
        return (False, "")
112
113
class FakeExecutor:
    """Executor stub that replays a fixed queue of outcomes in order."""

    def __init__(self, outcomes: list[ToolExecutionOutcome]) -> None:
        # Copy so that draining the queue never mutates the caller's list.
        self._outcomes = list(outcomes)

    async def execute_tool_call(self, tool_call: ToolCall, **_: object) -> ToolExecutionOutcome:
        """Return the next queued outcome, ignoring *tool_call* and keyword args.

        Raises:
            AssertionError: if the queue is already empty.
        """
        if self._outcomes:
            return self._outcomes.pop(0)
        raise AssertionError("No fake verification outcome queued")
122
123
class RecordingExecutor:
    """Executor stub that logs each command and always reports success."""

    def __init__(self) -> None:
        self.commands: list[str] = []

    async def execute_tool_call(self, tool_call: ToolCall, **_: object) -> ToolExecutionOutcome:
        """Record the call's ``command`` argument and return an ``ok`` outcome.

        A missing ``command`` argument is recorded as an empty string.
        """
        self.commands.append(str(tool_call.arguments.get("command", "")))
        return tool_outcome(
            tool_call=tool_call,
            output="ok",
            is_error=False,
            exit_code=0,
            stdout="ok",
        )
138
139
class SelectiveRecordingExecutor:
    """Executor stub that logs each command and fails those matching a substring."""

    def __init__(self, failing_match: str) -> None:
        self.commands: list[str] = []
        self.failing_match = failing_match

    async def execute_tool_call(self, tool_call: ToolCall, **_: object) -> ToolExecutionOutcome:
        """Record the command and fail it when it contains ``failing_match``.

        Failing commands yield exit code 1 with ``failed`` as output and stderr;
        every other command yields exit code 0 with ``ok`` as output and stdout.
        """
        command = str(tool_call.arguments.get("command", ""))
        self.commands.append(command)
        if self.failing_match in command:
            return tool_outcome(
                tool_call=tool_call,
                output="failed",
                is_error=True,
                exit_code=1,
                stdout="",
                stderr="failed",
            )
        return tool_outcome(
            tool_call=tool_call,
            output="ok",
            is_error=False,
            exit_code=0,
            stdout="ok",
            stderr="",
        )
157
158
def build_context(temp_dir: Path, session: FakeSession) -> RuntimeContext:
    """Assemble a ``RuntimeContext`` rooted at *temp_dir* for these tests.

    The context combines the default tool registry, a ``WORKSPACE_WRITE``
    permission policy built from the rules loaded for *temp_dir*, a scripted
    backend, fake safeguards, the given *session*, and a ``SimpleNamespace``
    configuration.
    """
    tool_registry = create_default_registry(temp_dir)
    tool_registry.configure_workspace_root(temp_dir)

    permission_status = load_permission_rules(temp_dir)
    permission_policy = build_permission_policy(
        active_mode=PermissionMode.WORKSPACE_WRITE,
        workspace_root=temp_dir,
        tool_requirements=tool_registry.get_tool_requirements(),
        rules=permission_status.rules,
    )

    reasoning_settings = SimpleNamespace(
        rollback=False,
        show_rollback_plan=False,
        completion_check=True,
        use_quick_completion=True,
        max_continuation_prompts=5,
        self_critique=False,
        confidence_scoring=False,
        min_confidence_for_action=3,
        verification=False,
    )
    runtime_config = SimpleNamespace(
        force_react=False,
        verification_retry_budget=3,
        reasoning=reasoning_settings,
    )

    return RuntimeContext(
        project_root=temp_dir,
        backend=ScriptedBackend(),
        registry=tool_registry,
        session=session,  # type: ignore[arg-type]
        config=runtime_config,
        capability_profile=SimpleNamespace(supports_native_tools=True),  # type: ignore[arg-type]
        project_context=None,
        permission_policy=permission_policy,
        permission_config_status=permission_status,
        workflow_mode="execute",
        safeguards=FakeSafeguards(),
    )
196
197
def tool_outcome(
    *,
    tool_call: ToolCall,
    output: str,
    is_error: bool,
    exit_code: int,
    stdout: str = "",
    stderr: str = "",
) -> ToolExecutionOutcome:
    """Build an ``EXECUTED`` outcome for *tool_call* carrying the given result.

    *output* is used as the display content, result content, event content and
    result output.  *exit_code*, *stdout* and *stderr* are stored in the
    registry result's metadata.
    """
    result_message = Message.tool_result_message(
        tool_call_id=tool_call.id,
        display_content=output,
        result_content=output,
        is_error=is_error,
    )
    registry_result = RegistryToolResult(
        output=output,
        is_error=is_error,
        metadata={
            "exit_code": exit_code,
            "stdout": stdout,
            "stderr": stderr,
        },
    )
    return ToolExecutionOutcome(
        tool_call=tool_call,
        state=ToolExecutionState.EXECUTED,
        message=result_message,
        event_content=output,
        is_error=is_error,
        result_output=output,
        registry_result=registry_result,
    )
229
230
async def _noop_set_workflow_mode(mode, dod, emit, summary) -> None:
    """Workflow-mode callback that ignores its arguments and does nothing."""
233
234
def test_turn_finalizer_finalize_summary_uses_runtime_context(
    temp_dir: Path,
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """Check the fields produced by ``TurnFinalizer.finalize_summary``.

    The finalized summary must carry the fake session's id, cumulative usage,
    completion decision and completion trace; the turn usage must be recorded
    on the session; and the patched memory-store capture hook must be called.
    """
    fake_session = FakeSession()
    runtime_context = build_context(temp_dir, fake_session)
    runtime_tracer = RuntimeTracer()
    runtime_tracer.record("turn.completed", reason="done")
    turn_finalizer = TurnFinalizer(
        runtime_context,
        runtime_tracer,
        DefinitionOfDoneStore(temp_dir),
        set_workflow_mode=_noop_set_workflow_mode,
    )

    finished_dod = create_definition_of_done("Finish the task")
    finished_dod.status = "done"
    turn_summary = TurnSummary(
        final_response="All set.",
        definition_of_done=finished_dod,
        iterations=2,
        usage={"prompt_tokens": 10},
        tool_result_messages=[Message(role=Role.TOOL, content="tool output")],
    )

    captured: dict[str, str] = {}

    def capture_definition_of_done(self, summary_text: str) -> None:
        # Replacement for the MemoryStore method; only remembers the text.
        captured["summary"] = summary_text

    monkeypatch.setattr(
        "loader.runtime.finalization.MemoryStore.capture_definition_of_done",
        capture_definition_of_done,
    )

    final_summary = turn_finalizer.finalize_summary(turn_summary)

    expected_recorded_usage = {"prompt_tokens": 10, "tool_calls": 1, "iterations": 2}
    expected_decision_summary = "accepted the response after verification evidence passed"

    assert final_summary.session_id == "session-test-123"
    assert final_summary.cumulative_usage == {"turns": 1, "tool_calls": 1, "iterations": 2}
    assert fake_session.recorded_calls == [
        {
            "usage": expected_recorded_usage,
            "tool_calls": 1,
            "iterations": 2,
        }
    ]
    assert "summary" in captured
    assert final_summary.trace
    assert final_summary.completion_decision_code == "verification_passed"
    assert final_summary.completion_decision_summary == expected_decision_summary
    assert [entry.decision_code for entry in final_summary.completion_trace] == [
        "verification_passed"
    ]
288
289
def test_verification_repair_guidance_uses_existing_artifacts_as_source_of_truth(
    temp_dir: Path,
) -> None:
    """Repair guidance should name the chapter files that already exist.

    The failed evidence reports index links to chapter files that are not on
    disk, while the implementation plan lists four existing chapters.  The
    guidance must mention the "source of truth" instruction and the paths of
    chapters one, two and four.
    """
    guide_root = temp_dir / "guides" / "nginx"
    chapters = guide_root / "chapters"
    chapters.mkdir(parents=True)
    index_path = guide_root / "index.html"
    chapter_one, chapter_two, chapter_three, chapter_four = (
        chapters / filename
        for filename in (
            "01-getting-started.html",
            "02-installation.html",
            "03-first-website.html",
            "04-configuration-basics.html",
        )
    )
    existing_artifacts = (index_path, chapter_one, chapter_two, chapter_three, chapter_four)
    for artifact in existing_artifacts:
        artifact.write_text("<html></html>\n")

    plan_lines = [
        "# Implementation Plan",
        "",
        "## File Changes",
        f"- `{guide_root}/`",
        f"- `{chapters}/`",
        f"- `{index_path}`",
        f"- `{chapter_one}`",
        f"- `{chapter_two}`",
        f"- `{chapter_three}`",
        f"- `{chapter_four}`",
        "",
    ]
    implementation_plan = temp_dir / "implementation.md"
    implementation_plan.write_text("\n".join(plan_lines))

    # Link targets reported as missing by the failed verification run.
    missing_intro = chapters / "01-introduction.html"
    missing_server_blocks = chapters / "04-server-blocks.html"
    failure_output = (
        "Missing local HTML links:\n"
        f"{index_path}:chapters/01-introduction.html -> {missing_intro}\n"
        f"{index_path}:chapters/04-server-blocks.html -> {missing_server_blocks}\n"
    )

    dod = create_definition_of_done("Repair the nginx guide index.")
    dod.implementation_plan = str(implementation_plan)
    dod.evidence = [
        VerificationEvidence(
            command="verify-links",
            passed=False,
            output=failure_output,
        )
    ]

    guidance = _build_verification_repair_guidance(
        dod,
        project_root=temp_dir,
    )

    assert "Use the existing artifact files as the source of truth" in guidance
    for expected_chapter in (chapter_one, chapter_two, chapter_four):
        assert str(expected_chapter) in guidance
347
348
def test_verification_repair_guidance_does_not_create_out_of_scope_link_target(
    temp_dir: Path,
) -> None:
    """Repair guidance must not tell the model to create a file outside the plan.

    The failed evidence reports a broken ``../index.html`` link whose target
    sits above the planned guide directory.  The guidance must flag it as out
    of scope, and the repair context extracted from the guidance must allow
    the guide's index but not the parent index.
    """
    guide_root = temp_dir / "guides" / "nginx"
    chapters = guide_root / "chapters"
    chapters.mkdir(parents=True)
    index_path = guide_root / "index.html"
    chapter_one = chapters / "01-introduction.html"
    index_path.write_text('<a href="../index.html">All guides</a>\n')
    chapter_one.write_text('<a href="../index.html">Back</a>\n')
    # Never created: this is the out-of-scope link target.
    parent_index = temp_dir / "guides" / "index.html"

    plan_lines = [
        "# Implementation Plan",
        "",
        "## File Changes",
        f"- `{guide_root}/`",
        f"- `{chapters}/`",
        f"- `{index_path}`",
        f"- `{chapter_one}`",
        "",
    ]
    implementation_plan = temp_dir / "implementation.md"
    implementation_plan.write_text("\n".join(plan_lines))

    failure_output = (
        "Missing local HTML links:\n"
        f"{index_path}:../index.html -> {parent_index}\n"
    )
    dod = create_definition_of_done("Create the nginx guide under guides/nginx.")
    dod.implementation_plan = str(implementation_plan)
    dod.touched_files.extend([str(index_path), str(chapter_one)])
    dod.evidence = [
        VerificationEvidence(
            command="verify-links",
            passed=False,
            output=failure_output,
        )
    ]

    guidance = _build_verification_repair_guidance(
        dod,
        project_root=temp_dir,
    )
    guidance_message = Message(role=Role.USER, content=guidance)
    repair = extract_active_repair_context([guidance_message])

    assert "outside the requested artifact scope" in guidance
    assert "do not create that outside file" in guidance
    assert f"create `{parent_index}`" not in guidance
    assert repair is not None
    assert str(parent_index.resolve(strict=False)) not in repair.allowed_paths
    assert str(index_path.resolve(strict=False)) in repair.allowed_paths
403
404
@pytest.mark.asyncio
async def test_turn_finalizer_records_skipped_verification_observation(
    temp_dir: Path,
) -> None:
    """A response with no mutating work is accepted with a SKIPPED observation.

    The gate result, the turn summary, the definition of done, the session's
    workflow timeline and the emitted events must all reflect the skip.
    """
    fake_session = FakeSession()
    runtime_context = build_context(temp_dir, fake_session)
    turn_finalizer = TurnFinalizer(
        runtime_context,
        RuntimeTracer(),
        DefinitionOfDoneStore(temp_dir),
        set_workflow_mode=_noop_set_workflow_mode,
    )
    dod = create_definition_of_done("Explain Loader's clarify loop.")
    turn_summary = TurnSummary(final_response="")
    emitted_events = []

    async def capture(event) -> None:
        emitted_events.append(event)

    result = await turn_finalizer.run_definition_of_done_gate(
        dod=dod,
        candidate_response="Loader uses a bounded clarify loop before execution.",
        emit=capture,
        summary=turn_summary,
        executor=FakeExecutor([]),  # type: ignore[arg-type]
    )

    skipped = VerificationObservationStatus.SKIPPED.value
    generic_item = "Complete the requested work"

    assert result.should_continue is False
    assert result.reason_code == "non_mutating_response_accepted"
    assert [item.status for item in result.verification_observations] == [skipped]
    assert [item.summary for item in result.verification_observations] == [
        "verification was skipped because no mutating work required checks"
    ]
    assert turn_summary.verification_status == "skipped"
    assert generic_item not in dod.pending_items
    assert generic_item in dod.completed_items
    assert fake_session.workflow_timeline[-1].kind == "verify_skip"
    assert [
        item.status
        for item in fake_session.workflow_timeline[-1].verification_observations
    ] == [skipped]
    assert any(
        emitted.type == "dod_status" and emitted.dod_status == "done"
        for emitted in emitted_events
    )
448
449
@pytest.mark.asyncio
async def test_turn_finalizer_accepts_noop_completion_with_task_restatement_todo(
    temp_dir: Path,
) -> None:
    """A no-edit answer is accepted when a pending item just restates the task.

    The pending list holds the task text itself plus the generic completion
    item; the gate must stop with ``non_mutating_response_accepted``.
    """
    fake_session = FakeSession()
    runtime_context = build_context(temp_dir, fake_session)
    turn_finalizer = TurnFinalizer(
        runtime_context,
        RuntimeTracer(),
        DefinitionOfDoneStore(temp_dir),
        set_workflow_mode=_noop_set_workflow_mode,
    )
    task = (
        "Have a look at ~/Loader/guides/fortran/index.html, then "
        "~/Loader/guides/fortran/chapters. The table of contents links in "
        "index.html are inaccurate and the href’s are wrong. Let’s update the "
        "links and their link texts to be correct."
    )
    dod = create_definition_of_done(task)
    dod.pending_items = [task, "Complete the requested work"]
    turn_summary = TurnSummary(final_response="")

    async def discard_event(event) -> None:
        """Swallow emitted events; this test does not inspect them."""

    result = await turn_finalizer.run_definition_of_done_gate(
        dod=dod,
        candidate_response="The table of contents is already correct, so no edit is needed.",
        emit=discard_event,
        summary=turn_summary,
        executor=FakeExecutor([]),  # type: ignore[arg-type]
    )

    assert result.should_continue is False
    assert result.reason_code == "non_mutating_response_accepted"
485
486
@pytest.mark.asyncio
async def test_turn_finalizer_records_passed_verification_observation(
    temp_dir: Path,
) -> None:
    """A passing verification command yields a PASSED observation.

    The last two timeline entries must be the pending entry followed by the
    passed entry, both tagged with the first verification attempt.
    """
    fake_session = FakeSession()
    runtime_context = build_context(temp_dir, fake_session)
    turn_finalizer = TurnFinalizer(
        runtime_context,
        RuntimeTracer(),
        DefinitionOfDoneStore(temp_dir),
        set_workflow_mode=_noop_set_workflow_mode,
    )
    verify_command = "uv run pytest -q"
    dod = create_definition_of_done("Update the runtime tests.")
    dod.mutating_actions.append("write")
    dod.verification_commands = ["uv run pytest -q"]
    turn_summary = TurnSummary(final_response="")
    verify_call = ToolCall(
        id="verify-1-1",
        name="bash",
        arguments={"command": "uv run pytest -q", "cwd": str(temp_dir)},
    )
    passing_outcome = tool_outcome(
        tool_call=verify_call,
        output="219 passed",
        is_error=False,
        exit_code=0,
        stdout="219 passed",
    )

    async def discard_event(event) -> None:
        """Swallow emitted events; this test does not inspect them."""

    result = await turn_finalizer.run_definition_of_done_gate(
        dod=dod,
        candidate_response="Updated the runtime tests.",
        emit=discard_event,
        summary=turn_summary,
        executor=FakeExecutor([passing_outcome]),  # type: ignore[arg-type]
    )

    passed = VerificationObservationStatus.PASSED.value
    pending = VerificationObservationStatus.PENDING.value
    first_attempt = "verification-attempt-1"

    assert result.should_continue is False
    assert result.reason_code == "verification_passed"
    assert [item.status for item in result.verification_observations] == [passed]
    assert result.verification_observations[0].attempt_id == first_attempt
    assert result.verification_observations[0].attempt_number == 1
    assert result.verification_observations[0].command == verify_command
    assert result.verification_observations[0].detail == "219 passed"
    assert turn_summary.verification_status == "passed"

    timeline = fake_session.workflow_timeline
    assert [entry.reason_code for entry in timeline[-2:]] == [
        "verification_pending",
        "verification_command_passed",
    ]
    assert [item.status for item in timeline[-2].verification_observations] == [pending]
    assert timeline[-2].verification_observations[0].attempt_id == first_attempt
    assert timeline[-2].verification_observations[0].command == verify_command
    assert timeline[-1].kind == "verify_observation"
    assert timeline[-1].reason_code == "verification_command_passed"
    assert [item.status for item in timeline[-1].verification_observations] == [passed]
559
560
@pytest.mark.asyncio
async def test_turn_finalizer_appends_runtime_semantic_verifier_to_planned_commands(
    temp_dir: Path,
) -> None:
    """The planned command runs alongside a runtime-generated Python check.

    After an edit to ``index.html``, the executor must see both the planned
    ``grep`` command and a command starting with a ``python3`` heredoc.
    """
    chapters = temp_dir / "chapters"
    chapters.mkdir()
    (chapters / "01-introduction.html").write_text(
        "<h1>Chapter 1: Introduction to Fortran</h1>\n"
    )
    index_lines = [
        '<ul class="chapter-list">',
        ' <li><a href="chapters/01-introduction.html">Chapter 1: Introduction to Fortran</a></li>',
        "</ul>",
    ]
    index = temp_dir / "index.html"
    index.write_text("\n".join(index_lines))

    fake_session = FakeSession()
    runtime_context = build_context(temp_dir, fake_session)
    turn_finalizer = TurnFinalizer(
        runtime_context,
        RuntimeTracer(),
        DefinitionOfDoneStore(temp_dir),
        set_workflow_mode=_noop_set_workflow_mode,
    )
    planned_command = 'grep -n "href=" index.html'
    dod = create_definition_of_done(
        "Update index.html so the table of contents links and chapter titles are correct."
    )
    dod.mutating_actions.append("edit")
    dod.touched_files.append(str(index))
    dod.verification_commands = [planned_command]
    turn_summary = TurnSummary(final_response="")
    recording_executor = RecordingExecutor()

    async def discard_event(event) -> None:
        """Swallow emitted events; this test does not inspect them."""

    result = await turn_finalizer.run_definition_of_done_gate(
        dod=dod,
        candidate_response="Updated the index.html links.",
        emit=discard_event,
        summary=turn_summary,
        executor=recording_executor,  # type: ignore[arg-type]
    )

    assert result.should_continue is False
    assert planned_command in recording_executor.commands
    assert any(
        executed.startswith("python3 - <<'PY'")
        for executed in recording_executor.commands
    )
    last_entry = fake_session.workflow_timeline[-1]
    assert last_entry.verification_observations[0].attempt_id == "verification-attempt-1"
616
617
@pytest.mark.asyncio
async def test_turn_finalizer_does_not_append_repo_defaults_to_external_verification_plan(
    temp_dir: Path,
) -> None:
    """Only the planned commands run when the touched file is outside the repo.

    The workspace contains ``pyproject.toml`` and ``package.json``, but the
    touched file lives in a sibling directory of *temp_dir*.  The executor
    must see exactly the two planned commands and nothing else.

    The sibling directory is outside the ``temp_dir`` fixture, so it is removed
    in a ``finally`` block to avoid leaving artifacts behind between runs.
    """
    import shutil  # local import: only needed to clean up the external directory

    (temp_dir / "pyproject.toml").write_text("[project]\nname='loader'\n")
    (temp_dir / "package.json").write_text("{}\n")
    external_root = temp_dir.parent / "external-nginx-guide"
    external_root.mkdir(exist_ok=True)
    try:
        external_index = external_root / "index.html"
        external_index.write_text("<html></html>\n")

        session = FakeSession()
        context = build_context(temp_dir, session)
        finalizer = TurnFinalizer(
            context,
            RuntimeTracer(),
            DefinitionOfDoneStore(temp_dir),
            set_workflow_mode=_noop_set_workflow_mode,
        )
        planned_commands = [
            f"ls -la {external_root}",
            f'grep -n "html" {external_index}',
        ]
        dod = create_definition_of_done("Create an external nginx guide.")
        dod.mutating_actions.append("write")
        dod.touched_files.append(str(external_index))
        # Hand the DoD its own copy so the expected list below stays pristine.
        dod.verification_commands = list(planned_commands)
        summary = TurnSummary(final_response="")
        executor = RecordingExecutor()

        async def capture(event) -> None:
            return None

        result = await finalizer.run_definition_of_done_gate(
            dod=dod,
            candidate_response="Created the external nginx guide.",
            emit=capture,
            summary=summary,
            executor=executor,  # type: ignore[arg-type]
        )

        assert result.should_continue is False
        assert executor.commands == planned_commands
    finally:
        # The directory lives outside the temp fixture; always remove it.
        shutil.rmtree(external_root, ignore_errors=True)
663
664
@pytest.mark.asyncio
async def test_turn_finalizer_filters_reference_side_verification_commands(
    temp_dir: Path,
) -> None:
    """Verification commands aimed at the reference directory are not executed.

    The verification plan lists an ``ls`` for the planned nginx guide and one
    for the fortran reference directory.  Some executed command must mention
    the guide root, and none may mention the reference root.
    """
    guide_root = temp_dir / "Loader" / "guides" / "nginx"
    chapters = guide_root / "chapters"
    chapters.mkdir(parents=True)
    index_path = guide_root / "index.html"
    chapter_one = chapters / "01-introduction.html"
    index_path.write_text("<html><body><h1>Guide</h1></body></html>\n")
    chapter_one.write_text("<html><body><h1>Intro</h1></body></html>\n")

    reference_root = temp_dir / "Loader" / "guides" / "fortran"
    reference_root.mkdir(parents=True)

    implementation_lines = [
        "# Implementation Plan",
        "",
        "## File Changes",
        f"- `{guide_root}`",
        f"- `{chapters}`",
        f"- `{index_path}`",
        f"- `{chapter_one}`",
        "",
    ]
    implementation_plan = temp_dir / "implementation.md"
    implementation_plan.write_text("\n".join(implementation_lines))

    verification_lines = [
        "# Verification Plan",
        "",
        "## Verification Commands",
        "```bash",
        f"ls -la {guide_root}",
        f"ls -la {reference_root}",
        "```",
        "",
    ]
    verification_plan = temp_dir / "verification.md"
    verification_plan.write_text("\n".join(verification_lines))

    fake_session = FakeSession()
    runtime_context = build_context(temp_dir, fake_session)
    turn_finalizer = TurnFinalizer(
        runtime_context,
        RuntimeTracer(),
        DefinitionOfDoneStore(temp_dir),
        set_workflow_mode=_noop_set_workflow_mode,
    )
    dod = create_definition_of_done("Create an nginx guide from an external reference.")
    dod.mutating_actions.append("write")
    dod.touched_files.extend([str(index_path), str(chapter_one)])
    dod.implementation_plan = str(implementation_plan)
    dod.verification_plan = str(verification_plan)
    turn_summary = TurnSummary(final_response="")
    recording_executor = RecordingExecutor()

    async def discard_event(event) -> None:
        """Swallow emitted events; this test does not inspect them."""

    result = await turn_finalizer.run_definition_of_done_gate(
        dod=dod,
        candidate_response="Created the nginx guide.",
        emit=discard_event,
        summary=turn_summary,
        executor=recording_executor,  # type: ignore[arg-type]
    )

    executed = recording_executor.commands
    assert result.should_continue is False
    assert any(str(guide_root) in command for command in executed)
    assert not any(str(reference_root) in command for command in executed)
741
742
@pytest.mark.asyncio
async def test_turn_finalizer_blocks_completion_when_planned_artifacts_are_missing(
    temp_dir: Path,
) -> None:
    """The gate refuses to finish while a planned file is absent from disk.

    The plan lists three files but only the index and the first chapter are
    written.  The gate must continue without running any verification command
    and must append a message naming the missing chapter.
    """
    docs = temp_dir / "docs"
    chapters = docs / "chapters"
    chapters.mkdir(parents=True)
    index = docs / "index.html"
    first = chapters / "01-intro.html"
    second = chapters / "02-installation.html"  # planned but never written

    index_links = [
        '<a href="chapters/01-intro.html">Intro</a>',
        '<a href="chapters/02-installation.html">Installation</a>',
    ]
    index.write_text("\n".join(index_links))
    first.write_text("<h1>Intro</h1>\n")

    plan_lines = [
        "# Implementation Plan",
        "",
        "## File Changes",
        f"- `{index}`",
        f"- `{first}`",
        f"- `{second}`",
    ]
    implementation_plan = temp_dir / "implementation.md"
    implementation_plan.write_text("\n".join(plan_lines))

    fake_session = FakeSession()
    runtime_context = build_context(temp_dir, fake_session)
    turn_finalizer = TurnFinalizer(
        runtime_context,
        RuntimeTracer(),
        DefinitionOfDoneStore(temp_dir),
        set_workflow_mode=_noop_set_workflow_mode,
    )
    dod = create_definition_of_done("Create a small multi-page HTML guide.")
    dod.mutating_actions.append("write")
    dod.touched_files.extend([str(index), str(first)])
    dod.implementation_plan = str(implementation_plan)
    dod.verification_commands = [f"ls -la {docs}"]
    turn_summary = TurnSummary(final_response="")
    recording_executor = RecordingExecutor()

    async def discard_event(event) -> None:
        """Swallow emitted events; this test does not inspect them."""

    result = await turn_finalizer.run_definition_of_done_gate(
        dod=dod,
        candidate_response="Finished the guide.",
        emit=discard_event,
        summary=turn_summary,
        executor=recording_executor,  # type: ignore[arg-type]
    )

    generic_item = "Complete the requested work"
    assert result.should_continue is True
    assert result.reason_code == "planned_artifacts_missing_continue"
    assert recording_executor.commands == []
    assert dod.status == "draft"
    assert generic_item in dod.pending_items
    assert generic_item not in dod.completed_items
    assert fake_session.messages[-1].content.startswith("[PLANNED ARTIFACTS STILL MISSING]")
    assert "`02-installation.html`" in fake_session.messages[-1].content
811
812
@pytest.mark.asyncio
async def test_turn_finalizer_records_missing_verification_observation(
    temp_dir: Path,
) -> None:
    """Mutating work with no verification commands yields a MISSING observation.

    The gate must ask to continue, mark verification as failed, log the
    observation on the timeline and append a user-role failure message.
    """
    fake_session = FakeSession()
    runtime_context = build_context(temp_dir, fake_session)
    turn_finalizer = TurnFinalizer(
        runtime_context,
        RuntimeTracer(),
        DefinitionOfDoneStore(temp_dir),
        set_workflow_mode=_noop_set_workflow_mode,
    )
    dod = create_definition_of_done("Edit the loader bootstrap.")
    dod.mutating_actions.append("edit")
    turn_summary = TurnSummary(final_response="")

    async def discard_event(event) -> None:
        """Swallow emitted events; this test does not inspect them."""

    result = await turn_finalizer.run_definition_of_done_gate(
        dod=dod,
        candidate_response="Updated the bootstrap code.",
        emit=discard_event,
        summary=turn_summary,
        executor=FakeExecutor([]),  # type: ignore[arg-type]
    )

    missing = VerificationObservationStatus.MISSING.value
    first_attempt = "verification-attempt-1"

    assert result.should_continue is True
    assert result.reason_code == "verification_failed_reentry"
    assert [item.status for item in result.verification_observations] == [missing]
    assert result.verification_observations[0].attempt_id == first_attempt
    assert result.verification_observations[0].attempt_number == 1
    assert [item.summary for item in result.verification_observations] == [
        "verification commands were still missing at execution time"
    ]
    assert turn_summary.verification_status == "failed"

    timeline = fake_session.workflow_timeline
    assert timeline[-1].kind == "verify_observation"
    assert timeline[-1].reason_code == "verification_commands_missing"
    assert [item.status for item in timeline[-1].verification_observations] == [missing]
    assert timeline[-1].verification_observations[0].attempt_id == first_attempt

    assert fake_session.messages[-1].role == Role.USER
    assert fake_session.messages[-1].content.startswith("[DEFINITION OF DONE CHECK FAILED]")
862
863
@pytest.mark.asyncio
async def test_turn_finalizer_ignores_unplanned_expansion_pending_items_once_plan_exists(
    temp_dir: Path,
) -> None:
    """A pending item for a file outside the plan does not block completion.

    Every planned file exists, but the pending list still asks for
    ``07-performance-tuning.html``, which the plan never mentions.  The gate
    must stop with ``non_mutating_response_accepted``.
    """
    fake_session = FakeSession()
    runtime_context = build_context(temp_dir, fake_session)
    turn_finalizer = TurnFinalizer(
        runtime_context,
        RuntimeTracer(),
        DefinitionOfDoneStore(temp_dir),
        set_workflow_mode=_noop_set_workflow_mode,
    )

    docs = temp_dir / "guides" / "nginx"
    chapters = docs / "chapters"
    docs.mkdir(parents=True)
    chapters.mkdir()
    index = docs / "index.html"
    first = chapters / "01-getting-started.html"
    second = chapters / "02-installation.html"
    index.write_text("<html></html>\n")
    first.write_text("<h1>One</h1>\n")
    second.write_text("<h1>Two</h1>\n")

    plan_lines = [
        "# Implementation Plan",
        "",
        "## File Changes",
        f"- `{docs}/`",
        f"- `{chapters}/`",
        f"- `{index}`",
        f"- `{first}`",
        f"- `{second}`",
        "",
    ]
    implementation_plan = temp_dir / "implementation.md"
    implementation_plan.write_text("\n".join(plan_lines))

    dod = create_definition_of_done("Create a small multi-page HTML guide.")
    dod.implementation_plan = str(implementation_plan)
    dod.pending_items = [
        "Create 07-performance-tuning.html",
        "Complete the requested work",
    ]
    turn_summary = TurnSummary(final_response="")

    async def discard_event(event) -> None:
        """Swallow emitted events; this test does not inspect them."""

    result = await turn_finalizer.run_definition_of_done_gate(
        dod=dod,
        candidate_response="Finished the guide.",
        emit=discard_event,
        summary=turn_summary,
        executor=FakeExecutor([]),  # type: ignore[arg-type]
    )

    assert result.should_continue is False
    assert result.reason_code == "non_mutating_response_accepted"
926
927
@pytest.mark.asyncio
async def test_turn_finalizer_verification_failure_reentry_points_at_concrete_repair(
    temp_dir: Path,
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """A failed link check re-enters with guidance naming the file to repair.

    The verification output reports a broken ``../styles.css`` link.  Both the
    queued steering message and the appended session message must name the
    broken file and the missing target, and must tell the model not to
    restart discovery.
    """
    fake_session = FakeSession()
    runtime_context = build_context(temp_dir, fake_session)
    queued_messages: list[str] = []
    runtime_context.queue_steering_message_callback = queued_messages.append
    turn_finalizer = TurnFinalizer(
        runtime_context,
        RuntimeTracer(),
        DefinitionOfDoneStore(temp_dir),
        set_workflow_mode=_noop_set_workflow_mode,
    )

    nginx_root = temp_dir / "guides" / "nginx"
    broken_file = temp_dir / "guides" / "nginx" / "chapters" / "05-advanced-configurations.html"
    broken_file.parent.mkdir(parents=True, exist_ok=True)
    broken_file.write_text('<link rel="stylesheet" href="../styles.css">\n')
    missing_target = nginx_root / "styles.css"

    dod = create_definition_of_done("Create the nginx guide.")
    dod.mutating_actions.append("write")
    dod.touched_files.append(str(broken_file))
    dod.verification_commands = ["python3 verify_links.py"]
    turn_summary = TurnSummary(final_response="")
    verify_call = ToolCall(
        id="verify-1-1",
        name="bash",
        arguments={"command": dod.verification_commands[0], "cwd": str(temp_dir)},
    )
    failure_output = (
        "Missing local HTML links:\n"
        f"{broken_file}:../styles.css -> {missing_target}\n"
    )
    failing_outcome = tool_outcome(
        tool_call=verify_call,
        output=failure_output,
        is_error=True,
        exit_code=1,
        stdout=failure_output,
    )

    async def discard_event(event) -> None:
        """Swallow emitted events; this test does not inspect them."""

    # Replace command derivation with a stub that returns no commands.
    monkeypatch.setattr(
        "loader.runtime.finalization.derive_verification_commands",
        lambda *args, **kwargs: [],
    )

    result = await turn_finalizer.run_definition_of_done_gate(
        dod=dod,
        candidate_response="The guide is complete.",
        emit=discard_event,
        summary=turn_summary,
        executor=FakeExecutor([failing_outcome]),  # type: ignore[arg-type]
    )

    assert result.should_continue is True
    assert result.reason_code == "verification_failed_reentry"

    assert queued_messages
    steering_text = queued_messages[-1]
    assert str(broken_file) in steering_text
    assert "../styles.css" in steering_text
    assert str(missing_target) in steering_text
    assert "Do not restart discovery or reread unrelated references." in steering_text

    reentry_text = fake_session.messages[-1].content
    assert reentry_text.startswith("[DEFINITION OF DONE CHECK FAILED]")
    assert f"Immediate next step: edit `{broken_file}`." in reentry_text
    assert f"create `{missing_target}`" in reentry_text
    assert (
        "Do not reread unrelated reference materials or restart discovery"
        in reentry_text
    )
1002
1003
@pytest.mark.asyncio
async def test_turn_finalizer_verification_failure_reentry_prioritizes_missing_planned_outputs(
    temp_dir: Path,
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """Failed verification should steer toward creating the missing chapter files.

    The index page links to three chapters but only the first one is written
    to disk. When the link check reports the two missing targets, the re-entry
    guidance must tell the model to write those files and must not ask for an
    edit of the existing index.
    """
    session = FakeSession()
    context = build_context(temp_dir, session)
    queued_messages: list[str] = []
    context.queue_steering_message_callback = queued_messages.append
    finalizer = TurnFinalizer(
        context,
        RuntimeTracer(),
        DefinitionOfDoneStore(temp_dir),
        set_workflow_mode=_noop_set_workflow_mode,
    )

    # Lay out the guide: an index referencing three chapters, one of which exists.
    guide_root = temp_dir / "guides" / "nginx"
    chapters = guide_root / "chapters"
    chapters.mkdir(parents=True, exist_ok=True)
    index = guide_root / "index.html"
    first, second, third = (
        chapters / chapter_name
        for chapter_name in (
            "01-installation.html",
            "02-configuration.html",
            "03-basic-usage.html",
        )
    )
    toc_links = (
        '<a href="chapters/01-installation.html">Installation</a>',
        '<a href="chapters/02-configuration.html">Configuration</a>',
        '<a href="chapters/03-basic-usage.html">Basic Usage</a>',
    )
    index.write_text("\n".join(toc_links))
    first.write_text("<h1>Installation</h1>\n")

    # The implementation plan the definition of done points at.
    implementation_plan = temp_dir / "implementation.md"
    plan_lines = [
        "# Implementation Plan",
        "",
        "## File Changes",
        f"- `{guide_root}/`",
        f"- `{chapters}/`",
        f"- `{index}`",
        f"- `{first}`",
        "",
    ]
    implementation_plan.write_text("\n".join(plan_lines))

    dod = create_definition_of_done("Create the nginx guide.")
    dod.mutating_actions.append("write")
    dod.touched_files.extend(str(path) for path in (index, first))
    dod.implementation_plan = str(implementation_plan)
    dod.verification_commands = ["python3 verify_links.py"]
    summary = TurnSummary(final_response="")

    verify_command = dod.verification_commands[0]
    verify_call = ToolCall(
        id="verify-1-1",
        name="bash",
        arguments={"command": verify_command, "cwd": str(temp_dir)},
    )
    # The guidance is asserted against resolved paths, while the simulated
    # checker output below uses the paths exactly as constructed here.
    missing_second = str(second.resolve(strict=False))
    missing_third = str(third.resolve(strict=False))
    failure_output = "".join(
        [
            "Missing local HTML links:\n",
            f"{index}:chapters/02-configuration.html -> {second}\n",
            f"{index}:chapters/03-basic-usage.html -> {third}\n",
        ]
    )

    async def discard_event(event) -> None:
        """Ignore every emitted runtime event."""

    def derive_nothing(*_args, **_kwargs):
        """Stand-in for command derivation that never yields extra commands."""
        return []

    monkeypatch.setattr(
        "loader.runtime.finalization.derive_verification_commands",
        derive_nothing,
    )

    failed_link_check = tool_outcome(
        tool_call=verify_call,
        output=failure_output,
        is_error=True,
        exit_code=1,
        stdout=failure_output,
    )
    result = await finalizer.run_definition_of_done_gate(
        dod=dod,
        candidate_response="The guide is complete.",
        emit=discard_event,
        summary=summary,
        executor=FakeExecutor([failed_link_check]),  # type: ignore[arg-type]
    )

    assert result.should_continue is True
    assert result.reason_code == "verification_failed_reentry"

    # Steering message queued for the next turn.
    assert queued_messages
    steering = queued_messages[-1]
    assert missing_second in steering
    assert "Do not rewrite the existing aggregate files" in steering

    # Failure notice appended to the session transcript.
    failure_notice = session.messages[-1].content
    assert failure_notice.startswith("[DEFINITION OF DONE CHECK FAILED]")
    assert f"Immediate next step: write `{missing_second}`." in failure_notice
    assert f"creating missing planned artifact `{missing_second}`" in failure_notice
    assert f"creating missing planned artifact `{missing_third}`" in failure_notice
    assert f"Immediate next step: edit `{index}`." not in failure_notice
    assert "Do not rewrite existing aggregate files" in failure_notice
1113
1114
@pytest.mark.asyncio
async def test_turn_finalizer_does_not_reverify_without_new_changes(
    temp_dir: Path,
) -> None:
    """A previously failed verification is not re-run when nothing has changed.

    The definition of done already records a failed verification together
    with a verification signature. The gate is expected to keep the turn
    going, report the failure as still outstanding, and issue no commands to
    the executor.
    """
    session = FakeSession()
    context = build_context(temp_dir, session)
    finalizer = TurnFinalizer(
        context,
        RuntimeTracer(),
        DefinitionOfDoneStore(temp_dir),
        set_workflow_mode=_noop_set_workflow_mode,
    )

    index = temp_dir / "index.html"
    index.write_text("<ul></ul>\n")

    dod = create_definition_of_done("Fix the chapter list in index.html.")
    dod.mutating_actions.append("edit")
    dod.touched_files.append(str(index))
    dod.line_changes = 12
    dod.last_verification_result = "failed"
    # Presumably mirrors the signature the finalizer derives for this exact
    # state, so the gate sees no new changes -- confirm against finalization.
    recorded_signature = f"lines={dod.line_changes};touched={index};actions=1;commands="
    dod.last_verification_signature = recorded_signature
    dod.evidence = []

    summary = TurnSummary(final_response="")
    executor = RecordingExecutor()

    async def discard_event(event) -> None:
        """Ignore every emitted runtime event."""

    result = await finalizer.run_definition_of_done_gate(
        dod=dod,
        candidate_response="I checked the file again.",
        emit=discard_event,
        summary=summary,
        executor=executor,  # type: ignore[arg-type]
    )

    assert result.should_continue is True
    assert result.reason_code == "verification_failed_no_new_changes"
    # No verification command may have reached the executor.
    assert executor.commands == []
    assert summary.verification_status == "failed"
    latest_message = session.messages[-1].content
    assert latest_message.startswith("[DEFINITION OF DONE CHECK STILL FAILING]")
1157
1158
@pytest.mark.asyncio
async def test_turn_finalizer_accepts_missing_optional_html5validator_when_semantic_check_passes(
    temp_dir: Path,
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """A missing ``html5validator`` binary is skipped rather than treated as a failure.

    Two verification commands are configured: a semantic check that succeeds
    and an ``html5validator`` run that exits 127 with "command not found".
    The gate must accept the turn, record the second command as skipped, and
    mention the skip in the final response.
    """
    session = FakeSession()
    context = build_context(temp_dir, session)
    finalizer = TurnFinalizer(
        context,
        RuntimeTracer(),
        DefinitionOfDoneStore(temp_dir),
        set_workflow_mode=_noop_set_workflow_mode,
    )

    dod = create_definition_of_done(
        "Update index.html so the table of contents links and chapter titles are correct."
    )
    dod.mutating_actions.append("edit")
    dod.touched_files.append(str(temp_dir / "index.html"))
    dod.verification_commands = [
        "python3 - <<'PY'\nprint('semantic ok')\nPY",
        "html5validator --root /tmp/fortran-qwen-recovery-check/",
    ]
    summary = TurnSummary(final_response="")

    working_dir = str(temp_dir)
    semantic_call = ToolCall(
        id="verify-1-1",
        name="bash",
        arguments={"command": dod.verification_commands[0], "cwd": working_dir},
    )
    html5validator_call = ToolCall(
        id="verify-1-2",
        name="bash",
        arguments={"command": dod.verification_commands[1], "cwd": working_dir},
    )

    async def discard_event(event) -> None:
        """Ignore every emitted runtime event."""

    def derive_nothing(*_args, **_kwargs):
        """Stand-in for command derivation that never yields extra commands."""
        return []

    monkeypatch.setattr(
        "loader.runtime.finalization.derive_verification_commands",
        derive_nothing,
    )

    semantic_passed = tool_outcome(
        tool_call=semantic_call,
        output="semantic ok",
        is_error=False,
        exit_code=0,
        stdout="semantic ok",
    )
    validator_missing = tool_outcome(
        tool_call=html5validator_call,
        output="/bin/sh: html5validator: command not found",
        is_error=True,
        exit_code=127,
        stderr="/bin/sh: html5validator: command not found",
    )
    scripted_executor = FakeExecutor([semantic_passed, validator_missing])

    result = await finalizer.run_definition_of_done_gate(
        dod=dod,
        candidate_response="Updated the chapter links and titles.",
        emit=discard_event,
        summary=summary,
        executor=scripted_executor,  # type: ignore[arg-type]
    )

    # Overall outcome: the turn is accepted as verified.
    assert result.should_continue is False
    assert result.reason_code == "verification_passed"
    assert summary.verification_status == "passed"
    assert dod.status == "done"
    assert dod.last_verification_result == "passed"

    # Per-command evidence: first passed, second did not pass but was skipped.
    assert [evidence.passed for evidence in dod.evidence] == [True, False]
    assert [evidence.skipped for evidence in dod.evidence] == [False, True]
    assert "SKIP" in result.final_response
    assert "html5validator" in result.final_response

    # The last two timeline entries record the pass followed by the skip.
    timeline = session.workflow_timeline
    assert timeline[-2].reason_code == "verification_command_passed"
    assert timeline[-1].reason_code == "verification_command_skipped"
    skipped_statuses = [
        observation.status for observation in timeline[-1].verification_observations
    ]
    assert skipped_statuses == [VerificationObservationStatus.SKIPPED.value]
1239 ]