Python · 46211 bytes Raw Blame History
1 """Tests for finalization helpers on RuntimeContext."""
2
3 from __future__ import annotations
4
5 from pathlib import Path
6 from types import SimpleNamespace
7
8 import pytest
9
10 from loader.llm.base import Message, Role, ToolCall
11 from loader.runtime.completion_trace import CompletionTraceEntry
12 from loader.runtime.context import RuntimeContext
13 from loader.runtime.dod import (
14 DefinitionOfDoneStore,
15 VerificationEvidence,
16 create_definition_of_done,
17 )
18 from loader.runtime.events import TurnSummary
19 from loader.runtime.executor import ToolExecutionOutcome, ToolExecutionState
20 from loader.runtime.finalization import (
21 TurnFinalizer,
22 _build_verification_repair_guidance,
23 )
24 from loader.runtime.permissions import (
25 PermissionMode,
26 build_permission_policy,
27 load_permission_rules,
28 )
29 from loader.runtime.repair_focus import extract_active_repair_context
30 from loader.runtime.tracing import RuntimeTracer
31 from loader.runtime.verification_observations import VerificationObservationStatus
32 from loader.tools.base import ToolResult as RegistryToolResult
33 from loader.tools.base import create_default_registry
34 from tests.helpers.runtime_harness import ScriptedBackend
35
36
class FakeSession:
    """In-memory session double that records what the finalizer writes to it.

    It starts out with a completed ``verification_passed`` decision so tests
    can check that finalization copies that state into the turn summary.
    """

    def __init__(self) -> None:
        # The same decision code/summary is exposed both as the "last
        # decision" attributes and as the single completion-trace entry.
        decision_code = "verification_passed"
        decision_summary = "accepted the response after verification evidence passed"

        self.messages: list[Message] = []
        self.session_id = "session-test-123"
        self.recorded_calls: list[dict[str, object]] = []
        self.last_completion_decision_code = decision_code
        self.last_completion_decision_summary = decision_summary
        self.completion_trace = [
            CompletionTraceEntry(
                stage="definition_of_done",
                outcome="complete",
                decision_code=decision_code,
                decision_summary=decision_summary,
            )
        ]
        self.last_turn_transition_summary = (
            "completion -> finalize [terminal] Finalizing completed turn"
        )
        self.workflow_timeline = []

    def append(self, message: Message) -> None:
        """Store ``message`` at the end of the recorded conversation."""
        self.messages.append(message)

    def append_workflow_timeline_entry(self, entry) -> None:
        """Store ``entry`` at the end of the recorded workflow timeline."""
        self.workflow_timeline.append(entry)

    def record_turn_usage(
        self,
        usage: dict[str, int],
        *,
        tool_calls: int,
        iterations: int,
    ) -> dict[str, int]:
        """Log the call arguments and return a fixed-shape cumulative usage dict.

        ``usage`` is copied before it is stored, so later mutation by the
        caller does not change what was recorded.
        """
        self.recorded_calls.append(
            {
                "usage": dict(usage),
                "tool_calls": tool_calls,
                "iterations": iterations,
            }
        )
        cumulative = {"turns": 1, "tool_calls": tool_calls, "iterations": iterations}
        return cumulative
79
80
class FakeCodeFilter:
    """Code-filter double whose ``reset`` does nothing."""

    def reset(self) -> None:
        """Do nothing; this double keeps no state."""
        return
84
85
class FakeSafeguards:
    """Pass-through safeguards double: never filters, steers, or reports a loop."""

    def __init__(self) -> None:
        # Opaque placeholders; nothing in this class inspects them.
        self.action_tracker = object()
        self.validator = object()
        self.code_filter = FakeCodeFilter()

    def filter_stream_chunk(self, content: str) -> str:
        """Return the streamed chunk unchanged."""
        return content

    def filter_complete_content(self, content: str) -> str:
        """Return the complete content unchanged."""
        return content

    def should_steer(self) -> bool:
        """Always report that no steering is needed."""
        return False

    def get_steering_message(self) -> str | None:
        """Always report that there is no steering message."""
        return None

    def record_response(self, content: str) -> None:
        """Ignore the response; nothing is recorded."""
        return

    def detect_text_loop(self, content: str) -> tuple[bool, str]:
        """Always report ``(False, "")``, i.e. no text loop."""
        return (False, "")

    def detect_loop(self) -> tuple[bool, str]:
        """Always report ``(False, "")``, i.e. no action loop."""
        return (False, "")
112
113
class FakeExecutor:
    """Executor double that replays a pre-queued list of outcomes in order."""

    def __init__(self, outcomes: list[ToolExecutionOutcome]) -> None:
        # Copy so that consuming the queue never mutates the caller's list.
        self._outcomes = list(outcomes)

    async def execute_tool_call(self, tool_call: ToolCall, **_: object) -> ToolExecutionOutcome:
        """Return the next queued outcome, ignoring the call and keyword arguments.

        Raises:
            AssertionError: if no outcome is left in the queue.
        """
        if self._outcomes:
            return self._outcomes.pop(0)
        raise AssertionError("No fake verification outcome queued")
122
123
class RecordingExecutor:
    """Executor double that logs every command and always reports success."""

    def __init__(self) -> None:
        self.commands: list[str] = []

    async def execute_tool_call(self, tool_call: ToolCall, **_: object) -> ToolExecutionOutcome:
        """Record the call's ``command`` argument and return a passing outcome.

        A missing ``command`` argument is recorded as the empty string.
        """
        self.commands.append(str(tool_call.arguments.get("command", "")))
        success = tool_outcome(
            tool_call=tool_call,
            output="ok",
            is_error=False,
            exit_code=0,
            stdout="ok",
        )
        return success
138
139
class SelectiveRecordingExecutor:
    """Executor double that logs commands and fails those containing a marker."""

    def __init__(self, failing_match: str) -> None:
        self.commands: list[str] = []
        # Any command containing this substring is reported as a failure.
        self.failing_match = failing_match

    async def execute_tool_call(self, tool_call: ToolCall, **_: object) -> ToolExecutionOutcome:
        """Record the command; fail it when it contains ``failing_match``.

        A missing ``command`` argument is recorded as the empty string.
        """
        command = str(tool_call.arguments.get("command", ""))
        self.commands.append(command)
        if self.failing_match in command:
            return tool_outcome(
                tool_call=tool_call,
                output="failed",
                is_error=True,
                exit_code=1,
                stdout="",
                stderr="failed",
            )
        return tool_outcome(
            tool_call=tool_call,
            output="ok",
            is_error=False,
            exit_code=0,
            stdout="ok",
            stderr="",
        )
157
158
def build_context(temp_dir: Path, session: FakeSession) -> RuntimeContext:
    """Assemble a RuntimeContext rooted at ``temp_dir`` around the fake session.

    The context uses the default tool registry, a WORKSPACE_WRITE permission
    policy built from the rules loaded for ``temp_dir``, a scripted backend,
    and pass-through safeguards.
    """
    registry = create_default_registry(temp_dir)
    registry.configure_workspace_root(temp_dir)
    rule_status = load_permission_rules(temp_dir)
    policy = build_permission_policy(
        active_mode=PermissionMode.WORKSPACE_WRITE,
        workspace_root=temp_dir,
        tool_requirements=registry.get_tool_requirements(),
        rules=rule_status.rules,
    )

    # Plain namespaces stand in for the real config objects.
    reasoning_config = SimpleNamespace(
        rollback=False,
        show_rollback_plan=False,
        completion_check=True,
        use_quick_completion=True,
        max_continuation_prompts=5,
        self_critique=False,
        confidence_scoring=False,
        min_confidence_for_action=3,
        verification=False,
    )
    runtime_config = SimpleNamespace(
        force_react=False,
        verification_retry_budget=3,
        reasoning=reasoning_config,
    )

    return RuntimeContext(
        project_root=temp_dir,
        backend=ScriptedBackend(),
        registry=registry,
        session=session,  # type: ignore[arg-type]
        config=runtime_config,
        capability_profile=SimpleNamespace(supports_native_tools=True),  # type: ignore[arg-type]
        project_context=None,
        permission_policy=policy,
        permission_config_status=rule_status,
        workflow_mode="execute",
        safeguards=FakeSafeguards(),
    )
196
197
def tool_outcome(
    *,
    tool_call: ToolCall,
    output: str,
    is_error: bool,
    exit_code: int,
    stdout: str = "",
    stderr: str = "",
) -> ToolExecutionOutcome:
    """Build an EXECUTED ToolExecutionOutcome for ``tool_call``.

    ``output`` is used for the display content, result content, event content
    and registry output alike; ``exit_code``, ``stdout`` and ``stderr`` go into
    the registry result's metadata.
    """
    result_message = Message.tool_result_message(
        tool_call_id=tool_call.id,
        display_content=output,
        result_content=output,
        is_error=is_error,
    )
    registry_result = RegistryToolResult(
        output=output,
        is_error=is_error,
        metadata={
            "exit_code": exit_code,
            "stdout": stdout,
            "stderr": stderr,
        },
    )
    return ToolExecutionOutcome(
        tool_call=tool_call,
        state=ToolExecutionState.EXECUTED,
        message=result_message,
        event_content=output,
        is_error=is_error,
        result_output=output,
        registry_result=registry_result,
    )
229
230
231 async def _noop_set_workflow_mode(mode, dod, emit, summary) -> None:
232 return None
233
234
def test_turn_finalizer_finalize_summary_uses_runtime_context(
    temp_dir: Path,
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """finalize_summary fills the summary from the session, tracer and memory store."""
    fake_session = FakeSession()
    runtime_context = build_context(temp_dir, fake_session)
    runtime_tracer = RuntimeTracer()
    runtime_tracer.record("turn.completed", reason="done")
    finalizer = TurnFinalizer(
        runtime_context,
        runtime_tracer,
        DefinitionOfDoneStore(temp_dir),
        set_workflow_mode=_noop_set_workflow_mode,
    )
    finished_dod = create_definition_of_done("Finish the task")
    finished_dod.status = "done"
    turn_summary = TurnSummary(
        final_response="All set.",
        definition_of_done=finished_dod,
        iterations=2,
        usage={"prompt_tokens": 10},
        tool_result_messages=[Message(role=Role.TOOL, content="tool output")],
    )
    memory_captures: dict[str, str] = {}

    def record_memory_capture(self, summary_text: str) -> None:
        # Replaces the real memory-store write so the test can observe the call.
        memory_captures["summary"] = summary_text

    monkeypatch.setattr(
        "loader.runtime.finalization.MemoryStore.capture_definition_of_done",
        record_memory_capture,
    )

    finalized = finalizer.finalize_summary(turn_summary)

    expected_recorded_call = {
        "usage": {"prompt_tokens": 10, "tool_calls": 1, "iterations": 2},
        "tool_calls": 1,
        "iterations": 2,
    }
    assert finalized.session_id == "session-test-123"
    assert finalized.cumulative_usage == {"turns": 1, "tool_calls": 1, "iterations": 2}
    assert fake_session.recorded_calls == [expected_recorded_call]
    assert "summary" in memory_captures
    assert finalized.trace
    assert finalized.completion_decision_code == "verification_passed"
    assert finalized.completion_decision_summary == (
        "accepted the response after verification evidence passed"
    )
    trace_codes = [entry.decision_code for entry in finalized.completion_trace]
    assert trace_codes == ["verification_passed"]
288
289
def test_verification_repair_guidance_uses_existing_artifacts_as_source_of_truth(
    temp_dir: Path,
) -> None:
    """Guidance for missing links should point at the chapter files that already exist."""
    guide_root = temp_dir / "guides" / "nginx"
    chapters = guide_root / "chapters"
    chapters.mkdir(parents=True)
    index_path = guide_root / "index.html"
    chapter_one = chapters / "01-getting-started.html"
    chapter_two = chapters / "02-installation.html"
    chapter_three = chapters / "03-first-website.html"
    chapter_four = chapters / "04-configuration-basics.html"
    existing_artifacts = (index_path, chapter_one, chapter_two, chapter_three, chapter_four)

    for artifact in existing_artifacts:
        artifact.write_text("<html></html>\n")

    # The plan lists both directories (with a trailing slash) and every file.
    plan_lines = [
        "# Implementation Plan",
        "",
        "## File Changes",
        f"- `{guide_root}/`",
        f"- `{chapters}/`",
    ]
    plan_lines.extend(f"- `{artifact}`" for artifact in existing_artifacts)
    plan_lines.append("")
    implementation_plan = temp_dir / "implementation.md"
    implementation_plan.write_text("\n".join(plan_lines))

    # The index links to chapter names that do not exist on disk.
    missing_links_report = (
        "Missing local HTML links:\n"
        f"{index_path}:chapters/01-introduction.html -> {chapters / '01-introduction.html'}\n"
        f"{index_path}:chapters/04-server-blocks.html -> {chapters / '04-server-blocks.html'}\n"
    )
    dod = create_definition_of_done("Repair the nginx guide index.")
    dod.implementation_plan = str(implementation_plan)
    dod.evidence = [
        VerificationEvidence(
            command="verify-links",
            passed=False,
            output=missing_links_report,
        )
    ]

    guidance = _build_verification_repair_guidance(
        dod,
        project_root=temp_dir,
    )

    assert "Use the existing artifact files as the source of truth" in guidance
    assert str(chapter_one) in guidance
    assert str(chapter_two) in guidance
    assert str(chapter_four) in guidance
347
348
def test_verification_repair_guidance_does_not_create_out_of_scope_link_target(
    temp_dir: Path,
) -> None:
    """A missing link target outside the planned artifacts must not become a repair path."""
    guide_root = temp_dir / "guides" / "nginx"
    chapters = guide_root / "chapters"
    chapters.mkdir(parents=True)
    index_path = guide_root / "index.html"
    chapter_one = chapters / "01-introduction.html"
    index_path.write_text('<a href="../index.html">All guides</a>\n')
    chapter_one.write_text('<a href="../index.html">Back</a>\n')
    # Never created: it sits one level above the planned guide directory.
    parent_index = temp_dir / "guides" / "index.html"

    plan_text = "\n".join(
        [
            "# Implementation Plan",
            "",
            "## File Changes",
            f"- `{guide_root}/`",
            f"- `{chapters}/`",
            f"- `{index_path}`",
            f"- `{chapter_one}`",
            "",
        ]
    )
    implementation_plan = temp_dir / "implementation.md"
    implementation_plan.write_text(plan_text)

    dod = create_definition_of_done("Create the nginx guide under guides/nginx.")
    dod.implementation_plan = str(implementation_plan)
    dod.touched_files.extend([str(index_path), str(chapter_one)])
    link_failure = VerificationEvidence(
        command="verify-links",
        passed=False,
        output=(
            "Missing local HTML links:\n"
            f"{index_path}:../index.html -> {parent_index}\n"
        ),
    )
    dod.evidence = [link_failure]

    guidance = _build_verification_repair_guidance(
        dod,
        project_root=temp_dir,
    )
    guidance_message = Message(role=Role.USER, content=guidance)
    repair = extract_active_repair_context([guidance_message])

    assert "outside the requested artifact scope" in guidance
    assert "do not create that outside file" in guidance
    assert f"create `{parent_index}`" not in guidance
    assert repair is not None
    resolved_parent = str(parent_index.resolve(strict=False))
    resolved_index = str(index_path.resolve(strict=False))
    assert resolved_parent not in repair.allowed_paths
    assert resolved_index in repair.allowed_paths
403
404
def test_verification_repair_guidance_replaces_stale_focus_for_html_quality_issue(
    temp_dir: Path,
) -> None:
    """New quality guidance should supersede an earlier repair focus on another file."""
    nginx_root = temp_dir / "guides" / "nginx"
    stale_index = nginx_root / "index.html"
    stale_index.parent.mkdir(parents=True)
    stale_index.write_text("<h1>Index</h1>\n")
    first_chapter = nginx_root / "chapters" / "01-introduction.html"
    third_chapter = nginx_root / "chapters" / "03-configuration.html"
    first_chapter.parent.mkdir(parents=True)
    first_chapter.write_text("<h1>Intro</h1>\n")
    third_chapter.write_text("<h1>Config</h1>\n")

    # An older repair message that still points at the index file.
    stale_focus_text = (
        "Repair focus:\n"
        f"- Fix the broken local reference `../index.html` in `{stale_index}`.\n"
        f"- Immediate next step: edit `{stale_index}`.\n"
    )
    stale_message = Message(role=Role.USER, content=stale_focus_text)

    quality_report = (
        "HTML guide content quality issues:\n"
        f"{first_chapter}: insufficient structured content (13 blocks, expected at least 18)\n"
        f"{third_chapter}: thin content (1505 text chars, expected at least 1758)\n"
    )
    dod = create_definition_of_done("Create an equally thorough HTML guide.")
    dod.evidence = [
        VerificationEvidence(
            command="quality",
            passed=False,
            output=quality_report,
        )
    ]

    guidance = _build_verification_repair_guidance(
        dod,
        project_root=temp_dir,
    )
    conversation = [stale_message, Message(role=Role.USER, content=guidance)]
    repair = extract_active_repair_context(conversation)

    assert guidance.startswith("Repair focus:")
    assert f"Immediate next step: edit `{first_chapter}` with a substantial" in guidance
    assert "Repair every listed quality target in order before any final answer" in guidance
    assert "HTML guide content quality issues" not in guidance
    assert repair is not None
    assert repair.artifact_path == str(first_chapter.resolve(strict=False))
    allowed = repair.allowed_paths
    assert str(stale_index.resolve(strict=False)) not in allowed
    assert str(third_chapter.resolve(strict=False)) in allowed
453
454
def test_verification_repair_guidance_keeps_multi_file_quality_worklist(
    temp_dir: Path,
) -> None:
    """Guidance for eight thin chapters should list the first and last as work items."""
    chapters = temp_dir / "guides" / "nginx" / "chapters"
    chapters.mkdir(parents=True)

    chapter_paths = []
    for number in range(1, 9):
        chapter = chapters / f"{number:02d}-chapter-{number}.html"
        chapter.write_text(f"<h1>{chapter.stem}</h1>\n")
        chapter_paths.append(chapter)
    first_chapter, last_chapter = chapter_paths[0], chapter_paths[-1]

    issue_lines = [
        f"{chapter}: thin content (200 text chars, expected at least 1758)"
        for chapter in chapter_paths
    ]
    dod = create_definition_of_done("Create an equally thorough HTML guide.")
    dod.evidence = [
        VerificationEvidence(
            command="quality",
            passed=False,
            output="HTML guide content quality issues:\n" + "\n".join(issue_lines),
        )
    ]

    guidance = _build_verification_repair_guidance(
        dod,
        project_root=temp_dir,
    )
    repair = extract_active_repair_context(
        [Message(role=Role.USER, content=guidance)]
    )

    assert f"Improve `{first_chapter}`: thin content" in guidance
    assert f"Improve `{last_chapter}`: thin content" in guidance
    assert "add enough concrete prose" in guidance
    assert "do not stop after touching only the first file" in guidance
    assert repair is not None
    assert repair.artifact_path == str(first_chapter.resolve(strict=False))
    assert str(last_chapter.resolve(strict=False)) in repair.allowed_paths
496
497
@pytest.mark.asyncio
async def test_turn_finalizer_records_skipped_verification_observation(
    temp_dir: Path,
) -> None:
    """A response with no mutating work is accepted with a SKIPPED observation."""
    session = FakeSession()
    finalizer = TurnFinalizer(
        build_context(temp_dir, session),
        RuntimeTracer(),
        DefinitionOfDoneStore(temp_dir),
        set_workflow_mode=_noop_set_workflow_mode,
    )
    dod = create_definition_of_done("Explain Loader's clarify loop.")
    summary = TurnSummary(final_response="")
    emitted_events = []

    async def collect_event(event) -> None:
        emitted_events.append(event)

    result = await finalizer.run_definition_of_done_gate(
        dod=dod,
        candidate_response="Loader uses a bounded clarify loop before execution.",
        emit=collect_event,
        summary=summary,
        executor=FakeExecutor([]),  # type: ignore[arg-type]
    )

    skipped = VerificationObservationStatus.SKIPPED.value
    assert result.should_continue is False
    assert result.reason_code == "non_mutating_response_accepted"
    assert [item.status for item in result.verification_observations] == [skipped]
    assert [item.summary for item in result.verification_observations] == [
        "verification was skipped because no mutating work required checks"
    ]
    assert summary.verification_status == "skipped"
    assert "Complete the requested work" not in dod.pending_items
    assert "Complete the requested work" in dod.completed_items
    latest_entry = session.workflow_timeline[-1]
    assert latest_entry.kind == "verify_skip"
    assert [item.status for item in latest_entry.verification_observations] == [skipped]
    assert any(
        event.type == "dod_status" and event.dod_status == "done"
        for event in emitted_events
    )
541
542
@pytest.mark.asyncio
async def test_turn_finalizer_accepts_noop_completion_with_task_restatement_todo(
    temp_dir: Path,
) -> None:
    """A pending item that merely restates the task must not block a no-op completion."""
    session = FakeSession()
    finalizer = TurnFinalizer(
        build_context(temp_dir, session),
        RuntimeTracer(),
        DefinitionOfDoneStore(temp_dir),
        set_workflow_mode=_noop_set_workflow_mode,
    )
    task = (
        "Have a look at ~/Loader/guides/fortran/index.html, then "
        "~/Loader/guides/fortran/chapters. The table of contents links in "
        "index.html are inaccurate and the href’s are wrong. Let’s update the "
        "links and their link texts to be correct."
    )
    dod = create_definition_of_done(task)
    # The first pending item is the task text itself, verbatim.
    dod.pending_items = [task, "Complete the requested work"]
    summary = TurnSummary(final_response="")

    async def discard_event(event) -> None:
        return None

    result = await finalizer.run_definition_of_done_gate(
        dod=dod,
        candidate_response="The table of contents is already correct, so no edit is needed.",
        emit=discard_event,
        summary=summary,
        executor=FakeExecutor([]),  # type: ignore[arg-type]
    )

    assert result.should_continue is False
    assert result.reason_code == "non_mutating_response_accepted"
578
579
@pytest.mark.asyncio
async def test_turn_finalizer_records_passed_verification_observation(
    temp_dir: Path,
) -> None:
    """A passing verification command yields PENDING then PASSED timeline observations."""
    verify_command = "uv run pytest -q"
    attempt_id = "verification-attempt-1"

    session = FakeSession()
    finalizer = TurnFinalizer(
        build_context(temp_dir, session),
        RuntimeTracer(),
        DefinitionOfDoneStore(temp_dir),
        set_workflow_mode=_noop_set_workflow_mode,
    )
    dod = create_definition_of_done("Update the runtime tests.")
    dod.mutating_actions.append("write")
    dod.verification_commands = [verify_command]
    summary = TurnSummary(final_response="")
    tool_call = ToolCall(
        id="verify-1-1",
        name="bash",
        arguments={"command": verify_command, "cwd": str(temp_dir)},
    )
    # The single outcome the fake executor will hand back.
    passing_outcome = tool_outcome(
        tool_call=tool_call,
        output="219 passed",
        is_error=False,
        exit_code=0,
        stdout="219 passed",
    )

    async def discard_event(event) -> None:
        return None

    result = await finalizer.run_definition_of_done_gate(
        dod=dod,
        candidate_response="Updated the runtime tests.",
        emit=discard_event,
        summary=summary,
        executor=FakeExecutor([passing_outcome]),  # type: ignore[arg-type]
    )

    passed = VerificationObservationStatus.PASSED.value
    pending = VerificationObservationStatus.PENDING.value

    assert result.should_continue is False
    assert result.reason_code == "verification_passed"
    assert [item.status for item in result.verification_observations] == [passed]
    first_observation = result.verification_observations[0]
    assert first_observation.attempt_id == attempt_id
    assert first_observation.attempt_number == 1
    assert first_observation.command == verify_command
    assert first_observation.detail == "219 passed"
    assert summary.verification_status == "passed"

    assert [entry.reason_code for entry in session.workflow_timeline[-2:]] == [
        "verification_pending",
        "verification_command_passed",
    ]
    pending_entry = session.workflow_timeline[-2]
    passed_entry = session.workflow_timeline[-1]
    assert [item.status for item in pending_entry.verification_observations] == [pending]
    assert pending_entry.verification_observations[0].attempt_id == attempt_id
    assert pending_entry.verification_observations[0].command == verify_command
    assert passed_entry.kind == "verify_observation"
    assert passed_entry.reason_code == "verification_command_passed"
    assert [item.status for item in passed_entry.verification_observations] == [passed]
652
653
@pytest.mark.asyncio
async def test_turn_finalizer_appends_runtime_semantic_verifier_to_planned_commands(
    temp_dir: Path,
) -> None:
    """The gate runs the planned grep and also a ``python3 - <<'PY'`` heredoc command."""
    planned_grep = 'grep -n "href=" index.html'

    chapters = temp_dir / "chapters"
    chapters.mkdir()
    (chapters / "01-introduction.html").write_text(
        "<h1>Chapter 1: Introduction to Fortran</h1>\n"
    )
    index_markup = "\n".join(
        [
            '<ul class="chapter-list">',
            ' <li><a href="chapters/01-introduction.html">Chapter 1: Introduction to Fortran</a></li>',
            "</ul>",
        ]
    )
    index = temp_dir / "index.html"
    index.write_text(index_markup)

    session = FakeSession()
    finalizer = TurnFinalizer(
        build_context(temp_dir, session),
        RuntimeTracer(),
        DefinitionOfDoneStore(temp_dir),
        set_workflow_mode=_noop_set_workflow_mode,
    )
    dod = create_definition_of_done(
        "Update index.html so the table of contents links and chapter titles are correct."
    )
    dod.mutating_actions.append("edit")
    dod.touched_files.append(str(index))
    dod.verification_commands = [planned_grep]
    summary = TurnSummary(final_response="")
    executor = RecordingExecutor()

    async def discard_event(event) -> None:
        return None

    result = await finalizer.run_definition_of_done_gate(
        dod=dod,
        candidate_response="Updated the index.html links.",
        emit=discard_event,
        summary=summary,
        executor=executor,  # type: ignore[arg-type]
    )

    assert result.should_continue is False
    assert planned_grep in executor.commands
    heredoc_commands = [
        command for command in executor.commands if command.startswith("python3 - <<'PY'")
    ]
    assert heredoc_commands
    latest_observation = session.workflow_timeline[-1].verification_observations[0]
    assert latest_observation.attempt_id == "verification-attempt-1"
709
710
@pytest.mark.asyncio
async def test_turn_finalizer_does_not_append_repo_defaults_to_external_verification_plan(
    temp_dir: Path,
) -> None:
    """Only the planned commands run when the touched file lives outside the project.

    The project root contains ``pyproject.toml`` and ``package.json``, yet the
    executor must see exactly the two planned commands and nothing else.

    The external directory is a sibling of ``temp_dir``, so it is outside the
    per-test temp directory. It is removed in a ``finally`` block so the test
    leaves nothing behind between runs.
    """
    import shutil  # local import: only this test needs it

    (temp_dir / "pyproject.toml").write_text("[project]\nname='loader'\n")
    (temp_dir / "package.json").write_text("{}\n")
    external_root = temp_dir.parent / "external-nginx-guide"
    external_root.mkdir(exist_ok=True)
    try:
        external_index = external_root / "index.html"
        external_index.write_text("<html></html>\n")

        session = FakeSession()
        context = build_context(temp_dir, session)
        finalizer = TurnFinalizer(
            context,
            RuntimeTracer(),
            DefinitionOfDoneStore(temp_dir),
            set_workflow_mode=_noop_set_workflow_mode,
        )
        planned_commands = [
            f"ls -la {external_root}",
            f'grep -n "html" {external_index}',
        ]
        dod = create_definition_of_done("Create an external nginx guide.")
        dod.mutating_actions.append("write")
        dod.touched_files.append(str(external_index))
        # Hand over a copy so the expectation below cannot be changed by the gate.
        dod.verification_commands = list(planned_commands)
        summary = TurnSummary(final_response="")
        executor = RecordingExecutor()

        async def capture(event) -> None:
            return None

        result = await finalizer.run_definition_of_done_gate(
            dod=dod,
            candidate_response="Created the external nginx guide.",
            emit=capture,
            summary=summary,
            executor=executor,  # type: ignore[arg-type]
        )

        assert result.should_continue is False
        assert executor.commands == planned_commands
    finally:
        # Clean up even when an assertion fails.
        shutil.rmtree(external_root, ignore_errors=True)
756
757
@pytest.mark.asyncio
async def test_turn_finalizer_filters_reference_side_verification_commands(
    temp_dir: Path,
) -> None:
    """Planned commands that target the reference guide directory must not be executed."""
    guides = temp_dir / "Loader" / "guides"
    guide_root = guides / "nginx"
    chapters = guide_root / "chapters"
    chapters.mkdir(parents=True)
    index_path = guide_root / "index.html"
    chapter_one = chapters / "01-introduction.html"
    index_path.write_text("<html><body><h1>Guide</h1></body></html>\n")
    chapter_one.write_text("<html><body><h1>Intro</h1></body></html>\n")

    # The reference guide exists on disk but is not in the implementation plan.
    reference_root = guides / "fortran"
    reference_root.mkdir(parents=True)

    implementation_lines = [
        "# Implementation Plan",
        "",
        "## File Changes",
        f"- `{guide_root}`",
        f"- `{chapters}`",
        f"- `{index_path}`",
        f"- `{chapter_one}`",
        "",
    ]
    implementation_plan = temp_dir / "implementation.md"
    implementation_plan.write_text("\n".join(implementation_lines))

    verification_lines = [
        "# Verification Plan",
        "",
        "## Verification Commands",
        "```bash",
        f"ls -la {guide_root}",
        f"ls -la {reference_root}",
        "```",
        "",
    ]
    verification_plan = temp_dir / "verification.md"
    verification_plan.write_text("\n".join(verification_lines))

    session = FakeSession()
    finalizer = TurnFinalizer(
        build_context(temp_dir, session),
        RuntimeTracer(),
        DefinitionOfDoneStore(temp_dir),
        set_workflow_mode=_noop_set_workflow_mode,
    )
    dod = create_definition_of_done("Create an nginx guide from an external reference.")
    dod.mutating_actions.append("write")
    dod.touched_files.extend([str(index_path), str(chapter_one)])
    dod.implementation_plan = str(implementation_plan)
    dod.verification_plan = str(verification_plan)
    summary = TurnSummary(final_response="")
    executor = RecordingExecutor()

    async def discard_event(event) -> None:
        return None

    result = await finalizer.run_definition_of_done_gate(
        dod=dod,
        candidate_response="Created the nginx guide.",
        emit=discard_event,
        summary=summary,
        executor=executor,  # type: ignore[arg-type]
    )

    guide_marker = str(guide_root)
    reference_marker = str(reference_root)
    assert result.should_continue is False
    assert any(guide_marker in command for command in executor.commands)
    assert not any(reference_marker in command for command in executor.commands)
834
835
@pytest.mark.asyncio
async def test_turn_finalizer_blocks_completion_when_planned_artifacts_are_missing(
    temp_dir: Path,
) -> None:
    """The gate asks to continue, without running commands, if a planned file is absent."""
    docs = temp_dir / "docs"
    chapters = docs / "chapters"
    chapters.mkdir(parents=True)
    index = docs / "index.html"
    first = chapters / "01-intro.html"
    # Planned and linked from the index, but deliberately never written.
    second = chapters / "02-installation.html"

    index_links = [
        '<a href="chapters/01-intro.html">Intro</a>',
        '<a href="chapters/02-installation.html">Installation</a>',
    ]
    index.write_text("\n".join(index_links))
    first.write_text("<h1>Intro</h1>\n")

    plan_lines = ["# Implementation Plan", "", "## File Changes"]
    plan_lines += [f"- `{artifact}`" for artifact in (index, first, second)]
    implementation_plan = temp_dir / "implementation.md"
    implementation_plan.write_text("\n".join(plan_lines))

    session = FakeSession()
    finalizer = TurnFinalizer(
        build_context(temp_dir, session),
        RuntimeTracer(),
        DefinitionOfDoneStore(temp_dir),
        set_workflow_mode=_noop_set_workflow_mode,
    )
    dod = create_definition_of_done("Create a small multi-page HTML guide.")
    dod.mutating_actions.append("write")
    dod.touched_files.extend([str(index), str(first)])
    dod.implementation_plan = str(implementation_plan)
    dod.verification_commands = [f"ls -la {docs}"]
    summary = TurnSummary(final_response="")
    executor = RecordingExecutor()

    async def discard_event(event) -> None:
        return None

    result = await finalizer.run_definition_of_done_gate(
        dod=dod,
        candidate_response="Finished the guide.",
        emit=discard_event,
        summary=summary,
        executor=executor,  # type: ignore[arg-type]
    )

    assert result.should_continue is True
    assert result.reason_code == "planned_artifacts_missing_continue"
    assert executor.commands == []
    assert dod.status == "draft"
    assert "Complete the requested work" in dod.pending_items
    assert "Complete the requested work" not in dod.completed_items
    reentry_prompt = session.messages[-1].content
    assert reentry_prompt.startswith("[PLANNED ARTIFACTS STILL MISSING]")
    assert "`02-installation.html`" in reentry_prompt
904
905
@pytest.mark.asyncio
async def test_turn_finalizer_records_missing_verification_observation(
    temp_dir: Path,
) -> None:
    """Mutating work with no verification commands yields a MISSING observation and re-entry."""
    attempt_id = "verification-attempt-1"

    session = FakeSession()
    finalizer = TurnFinalizer(
        build_context(temp_dir, session),
        RuntimeTracer(),
        DefinitionOfDoneStore(temp_dir),
        set_workflow_mode=_noop_set_workflow_mode,
    )
    dod = create_definition_of_done("Edit the loader bootstrap.")
    dod.mutating_actions.append("edit")
    summary = TurnSummary(final_response="")

    async def discard_event(event) -> None:
        return None

    result = await finalizer.run_definition_of_done_gate(
        dod=dod,
        candidate_response="Updated the bootstrap code.",
        emit=discard_event,
        summary=summary,
        executor=FakeExecutor([]),  # type: ignore[arg-type]
    )

    missing = VerificationObservationStatus.MISSING.value

    assert result.should_continue is True
    assert result.reason_code == "verification_failed_reentry"
    assert [item.status for item in result.verification_observations] == [missing]
    first_observation = result.verification_observations[0]
    assert first_observation.attempt_id == attempt_id
    assert first_observation.attempt_number == 1
    assert [item.summary for item in result.verification_observations] == [
        "verification commands were still missing at execution time"
    ]
    assert summary.verification_status == "failed"

    latest_entry = session.workflow_timeline[-1]
    assert latest_entry.kind == "verify_observation"
    assert latest_entry.reason_code == "verification_commands_missing"
    assert [item.status for item in latest_entry.verification_observations] == [missing]
    assert latest_entry.verification_observations[0].attempt_id == attempt_id

    reentry_message = session.messages[-1]
    assert reentry_message.role == Role.USER
    assert reentry_message.content.startswith("[DEFINITION OF DONE CHECK FAILED]")
955
956
@pytest.mark.asyncio
async def test_turn_finalizer_ignores_unplanned_expansion_pending_items_once_plan_exists(
    temp_dir: Path,
) -> None:
    """A pending item for a file absent from the plan must not block completion."""
    session = FakeSession()
    finalizer = TurnFinalizer(
        build_context(temp_dir, session),
        RuntimeTracer(),
        DefinitionOfDoneStore(temp_dir),
        set_workflow_mode=_noop_set_workflow_mode,
    )

    docs = temp_dir / "guides" / "nginx"
    chapters = docs / "chapters"
    docs.mkdir(parents=True)
    chapters.mkdir()
    index = docs / "index.html"
    first = chapters / "01-getting-started.html"
    second = chapters / "02-installation.html"
    # Every planned artifact exists on disk.
    for artifact, markup in (
        (index, "<html></html>\n"),
        (first, "<h1>One</h1>\n"),
        (second, "<h1>Two</h1>\n"),
    ):
        artifact.write_text(markup)

    plan_lines = [
        "# Implementation Plan",
        "",
        "## File Changes",
        f"- `{docs}/`",
        f"- `{chapters}/`",
        f"- `{index}`",
        f"- `{first}`",
        f"- `{second}`",
        "",
    ]
    implementation_plan = temp_dir / "implementation.md"
    implementation_plan.write_text("\n".join(plan_lines))

    dod = create_definition_of_done("Create a small multi-page HTML guide.")
    dod.implementation_plan = str(implementation_plan)
    # The first pending item names a file that is not in the plan above.
    dod.pending_items = [
        "Create 07-performance-tuning.html",
        "Complete the requested work",
    ]
    summary = TurnSummary(final_response="")

    async def discard_event(event) -> None:
        return None

    result = await finalizer.run_definition_of_done_gate(
        dod=dod,
        candidate_response="Finished the guide.",
        emit=discard_event,
        summary=summary,
        executor=FakeExecutor([]),  # type: ignore[arg-type]
    )

    assert result.should_continue is False
    assert result.reason_code == "non_mutating_response_accepted"
1019
1020
@pytest.mark.asyncio
async def test_turn_finalizer_verification_failure_reentry_points_at_concrete_repair(
    temp_dir: Path,
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """A failed link check produces re-entry guidance naming the exact repair.

    The scripted verification command fails with output that maps a
    stylesheet link in one chapter file to a target path that was never
    created. Both the queued steering message and the appended session message
    are expected to name that file and target and to discourage restarting
    discovery.
    """

    async def _ignore_event(event) -> None:
        return None

    def _no_derived_commands(*args, **kwargs):
        # Patched in below so only the commands set on the DoD are in play.
        return []

    steering_log: list[str] = []
    fake_session = FakeSession()
    context = build_context(temp_dir, fake_session)
    context.queue_steering_message_callback = steering_log.append
    gate = TurnFinalizer(
        context,
        RuntimeTracer(),
        DefinitionOfDoneStore(temp_dir),
        set_workflow_mode=_noop_set_workflow_mode,
    )

    guide_dir = temp_dir / "guides" / "nginx"
    chapter_dir = guide_dir / "chapters"
    broken_file = chapter_dir / "05-advanced-configurations.html"
    chapter_dir.mkdir(parents=True, exist_ok=True)
    broken_file.write_text('<link rel="stylesheet" href="../styles.css">\n')
    # Deliberately never created: it is the target the link check reports.
    missing_target = guide_dir / "styles.css"

    dod = create_definition_of_done("Create the nginx guide.")
    dod.mutating_actions.append("write")
    dod.touched_files.append(str(broken_file))
    dod.verification_commands = ["python3 verify_links.py"]
    turn_summary = TurnSummary(final_response="")

    link_report = (
        f"Missing local HTML links:\n{broken_file}:../styles.css -> {missing_target}\n"
    )
    failing_check = tool_outcome(
        tool_call=ToolCall(
            id="verify-1-1",
            name="bash",
            arguments={"command": dod.verification_commands[0], "cwd": str(temp_dir)},
        ),
        output=link_report,
        is_error=True,
        exit_code=1,
        stdout=link_report,
    )

    monkeypatch.setattr(
        "loader.runtime.finalization.derive_verification_commands",
        _no_derived_commands,
    )
    scripted_executor = FakeExecutor([failing_check])

    gate_result = await gate.run_definition_of_done_gate(
        dod=dod,
        candidate_response="The guide is complete.",
        emit=_ignore_event,
        summary=turn_summary,
        executor=scripted_executor,  # type: ignore[arg-type]
    )

    assert gate_result.should_continue is True
    assert gate_result.reason_code == "verification_failed_reentry"

    assert steering_log
    steering_text = steering_log[-1]
    for expected_fragment in (
        str(broken_file),
        "../styles.css",
        str(missing_target),
        "Do not restart discovery or reread unrelated references.",
    ):
        assert expected_fragment in steering_text

    reentry_text = fake_session.messages[-1].content
    assert reentry_text.startswith("[DEFINITION OF DONE CHECK FAILED]")
    for expected_fragment in (
        f"Immediate next step: edit `{broken_file}`.",
        f"create `{missing_target}`",
        "Do not reread unrelated reference materials or restart discovery",
    ):
        assert expected_fragment in reentry_text
1095
1096
@pytest.mark.asyncio
async def test_turn_finalizer_verification_failure_reentry_prioritizes_missing_planned_outputs(
    temp_dir: Path,
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """Missing link targets are reported as artifacts to write, not index edits.

    ``index.html`` links to three chapters but only the first exists on disk.
    The scripted verification output reports the other two as missing link
    targets, using unresolved paths. The guidance is expected to name the
    resolved paths of the missing chapters as artifacts to create and must not
    tell the model to edit ``index.html``.
    """

    async def _ignore_event(event) -> None:
        return None

    def _no_derived_commands(*args, **kwargs):
        # Patched in below so only the commands set on the DoD are in play.
        return []

    steering_log: list[str] = []
    fake_session = FakeSession()
    context = build_context(temp_dir, fake_session)
    context.queue_steering_message_callback = steering_log.append
    gate = TurnFinalizer(
        context,
        RuntimeTracer(),
        DefinitionOfDoneStore(temp_dir),
        set_workflow_mode=_noop_set_workflow_mode,
    )

    guide_root = temp_dir / "guides" / "nginx"
    chapter_dir = guide_root / "chapters"
    chapter_dir.mkdir(parents=True, exist_ok=True)
    index_page = guide_root / "index.html"
    existing_chapter = chapter_dir / "01-installation.html"
    # These two chapters are linked from the index but never written to disk.
    absent_second = chapter_dir / "02-configuration.html"
    absent_third = chapter_dir / "03-basic-usage.html"

    index_links = [
        '<a href="chapters/01-installation.html">Installation</a>',
        '<a href="chapters/02-configuration.html">Configuration</a>',
        '<a href="chapters/03-basic-usage.html">Basic Usage</a>',
    ]
    index_page.write_text("\n".join(index_links))
    existing_chapter.write_text("<h1>Installation</h1>\n")

    plan_lines = [
        "# Implementation Plan",
        "",
        "## File Changes",
        f"- `{guide_root}/`",
        f"- `{chapter_dir}/`",
        f"- `{index_page}`",
        f"- `{existing_chapter}`",
        "",
    ]
    plan_path = temp_dir / "implementation.md"
    plan_path.write_text("\n".join(plan_lines))

    dod = create_definition_of_done("Create the nginx guide.")
    dod.mutating_actions.append("write")
    dod.touched_files.extend([str(index_page), str(existing_chapter)])
    dod.implementation_plan = str(plan_path)
    dod.verification_commands = ["python3 verify_links.py"]
    turn_summary = TurnSummary(final_response="")

    # Assertions compare against resolved paths; the tool output below keeps
    # the unresolved ones.
    normalized_second = str(absent_second.resolve(strict=False))
    normalized_third = str(absent_third.resolve(strict=False))
    link_report = (
        "Missing local HTML links:\n"
        f"{index_page}:chapters/02-configuration.html -> {absent_second}\n"
        f"{index_page}:chapters/03-basic-usage.html -> {absent_third}\n"
    )
    failing_check = tool_outcome(
        tool_call=ToolCall(
            id="verify-1-1",
            name="bash",
            arguments={"command": dod.verification_commands[0], "cwd": str(temp_dir)},
        ),
        output=link_report,
        is_error=True,
        exit_code=1,
        stdout=link_report,
    )

    monkeypatch.setattr(
        "loader.runtime.finalization.derive_verification_commands",
        _no_derived_commands,
    )
    scripted_executor = FakeExecutor([failing_check])

    gate_result = await gate.run_definition_of_done_gate(
        dod=dod,
        candidate_response="The guide is complete.",
        emit=_ignore_event,
        summary=turn_summary,
        executor=scripted_executor,  # type: ignore[arg-type]
    )

    assert gate_result.should_continue is True
    assert gate_result.reason_code == "verification_failed_reentry"

    assert steering_log
    steering_text = steering_log[-1]
    assert normalized_second in steering_text
    assert "Do not rewrite the existing aggregate files" in steering_text

    reentry_text = fake_session.messages[-1].content
    assert reentry_text.startswith("[DEFINITION OF DONE CHECK FAILED]")
    assert f"Immediate next step: write `{normalized_second}`." in reentry_text
    assert f"creating missing planned artifact `{normalized_second}`" in reentry_text
    assert f"creating missing planned artifact `{normalized_third}`" in reentry_text
    assert f"Immediate next step: edit `{index_page}`." not in reentry_text
    assert "Do not rewrite existing aggregate files" in reentry_text
1206
1207
@pytest.mark.asyncio
async def test_turn_finalizer_does_not_reverify_without_new_changes(
    temp_dir: Path,
) -> None:
    """A previously failed verification is not re-run on an unchanged DoD.

    The DoD is seeded with a failed verification result and a signature string
    built from its current line-change count and touched file. The gate is
    expected to keep the turn going with ``verification_failed_no_new_changes``
    without sending any command to the recording executor.
    """

    async def _ignore_event(event) -> None:
        return None

    fake_session = FakeSession()
    gate = TurnFinalizer(
        build_context(temp_dir, fake_session),
        RuntimeTracer(),
        DefinitionOfDoneStore(temp_dir),
        set_workflow_mode=_noop_set_workflow_mode,
    )

    index_page = temp_dir / "index.html"
    index_page.write_text("<ul></ul>\n")

    dod = create_definition_of_done("Fix the chapter list in index.html.")
    dod.mutating_actions.append("edit")
    dod.touched_files.append(str(index_page))
    dod.line_changes = 12
    dod.last_verification_result = "failed"
    dod.last_verification_signature = (
        f"lines={dod.line_changes};touched={index_page};actions=1;commands="
    )
    dod.evidence = []

    turn_summary = TurnSummary(final_response="")
    recorder = RecordingExecutor()

    gate_result = await gate.run_definition_of_done_gate(
        dod=dod,
        candidate_response="I checked the file again.",
        emit=_ignore_event,
        summary=turn_summary,
        executor=recorder,  # type: ignore[arg-type]
    )

    assert gate_result.should_continue is True
    assert gate_result.reason_code == "verification_failed_no_new_changes"
    # No verification command may have reached the executor.
    assert recorder.commands == []
    assert turn_summary.verification_status == "failed"
    latest_message = fake_session.messages[-1].content
    assert latest_message.startswith("[DEFINITION OF DONE CHECK STILL FAILING]")
1250
1251
@pytest.mark.asyncio
async def test_turn_finalizer_accepts_missing_optional_html5validator_when_semantic_check_passes(
    temp_dir: Path,
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """An unavailable ``html5validator`` is skipped instead of failing the gate.

    Two verification commands are scripted: a semantic check that exits 0 and
    an ``html5validator`` run that exits 127 with "command not found". The gate
    is expected to report ``verification_passed``, record the second command as
    skipped evidence, and mention the skip in the final response.
    """

    async def _ignore_event(event) -> None:
        return None

    def _no_derived_commands(*args, **kwargs):
        # Patched in below so only the commands set on the DoD are in play.
        return []

    fake_session = FakeSession()
    gate = TurnFinalizer(
        build_context(temp_dir, fake_session),
        RuntimeTracer(),
        DefinitionOfDoneStore(temp_dir),
        set_workflow_mode=_noop_set_workflow_mode,
    )

    dod = create_definition_of_done(
        "Update index.html so the table of contents links and chapter titles are correct."
    )
    dod.mutating_actions.append("edit")
    dod.touched_files.append(str(temp_dir / "index.html"))
    dod.verification_commands = [
        "python3 - <<'PY'\nprint('semantic ok')\nPY",
        "html5validator --root /tmp/fortran-qwen-recovery-check/",
    ]
    turn_summary = TurnSummary(final_response="")

    working_dir = str(temp_dir)
    semantic_pass = tool_outcome(
        tool_call=ToolCall(
            id="verify-1-1",
            name="bash",
            arguments={"command": dod.verification_commands[0], "cwd": working_dir},
        ),
        output="semantic ok",
        is_error=False,
        exit_code=0,
        stdout="semantic ok",
    )
    validator_missing = tool_outcome(
        tool_call=ToolCall(
            id="verify-1-2",
            name="bash",
            arguments={"command": dod.verification_commands[1], "cwd": working_dir},
        ),
        output="/bin/sh: html5validator: command not found",
        is_error=True,
        exit_code=127,
        stderr="/bin/sh: html5validator: command not found",
    )

    monkeypatch.setattr(
        "loader.runtime.finalization.derive_verification_commands",
        _no_derived_commands,
    )
    scripted_executor = FakeExecutor([semantic_pass, validator_missing])

    gate_result = await gate.run_definition_of_done_gate(
        dod=dod,
        candidate_response="Updated the chapter links and titles.",
        emit=_ignore_event,
        summary=turn_summary,
        executor=scripted_executor,  # type: ignore[arg-type]
    )

    assert gate_result.should_continue is False
    assert gate_result.reason_code == "verification_passed"
    assert turn_summary.verification_status == "passed"
    assert dod.status == "done"
    assert dod.last_verification_result == "passed"

    # Evidence order follows command order: semantic check, then the validator.
    passed_flags = [entry.passed for entry in dod.evidence]
    skipped_flags = [entry.skipped for entry in dod.evidence]
    assert passed_flags == [True, False]
    assert skipped_flags == [False, True]

    assert "SKIP" in gate_result.final_response
    assert "html5validator" in gate_result.final_response

    timeline = fake_session.workflow_timeline
    assert timeline[-2].reason_code == "verification_command_passed"
    assert timeline[-1].reason_code == "verification_command_skipped"
    last_statuses = [
        observation.status for observation in timeline[-1].verification_observations
    ]
    assert last_statuses == [VerificationObservationStatus.SKIPPED.value]
1332 ]