Python · 40532 bytes Raw Blame History
1 """Tests for finalization helpers on RuntimeContext."""
2
3 from __future__ import annotations
4
5 from pathlib import Path
6 from types import SimpleNamespace
7
8 import pytest
9
10 from loader.llm.base import Message, Role, ToolCall
11 from loader.runtime.completion_trace import CompletionTraceEntry
12 from loader.runtime.context import RuntimeContext
13 from loader.runtime.dod import (
14 DefinitionOfDoneStore,
15 VerificationEvidence,
16 create_definition_of_done,
17 )
18 from loader.runtime.events import TurnSummary
19 from loader.runtime.executor import ToolExecutionOutcome, ToolExecutionState
20 from loader.runtime.finalization import (
21 TurnFinalizer,
22 _build_verification_repair_guidance,
23 )
24 from loader.runtime.permissions import (
25 PermissionMode,
26 build_permission_policy,
27 load_permission_rules,
28 )
29 from loader.runtime.tracing import RuntimeTracer
30 from loader.runtime.verification_observations import VerificationObservationStatus
31 from loader.tools.base import ToolResult as RegistryToolResult
32 from loader.tools.base import create_default_registry
33 from tests.helpers.runtime_harness import ScriptedBackend
34
35
class FakeSession:
    """In-memory session double for the finalization tests.

    A new instance already carries a completion decision, a one-entry
    completion trace, and a turn-transition summary. Messages, workflow
    timeline entries, and usage reports handed to it are stored on the
    instance so tests can inspect them afterwards.
    """

    def __init__(self) -> None:
        decision_code = "verification_passed"
        decision_summary = "accepted the response after verification evidence passed"

        self.messages: list[Message] = []
        self.session_id = "session-test-123"
        self.recorded_calls: list[dict[str, object]] = []
        self.last_completion_decision_code = decision_code
        self.last_completion_decision_summary = decision_summary
        # The single trace entry mirrors the decision fields set just above.
        self.completion_trace = [
            CompletionTraceEntry(
                stage="definition_of_done",
                outcome="complete",
                decision_code=decision_code,
                decision_summary=decision_summary,
            )
        ]
        self.last_turn_transition_summary = (
            "completion -> finalize [terminal] Finalizing completed turn"
        )
        self.workflow_timeline = []

    def append(self, message: Message) -> None:
        """Add ``message`` to the end of the recorded transcript."""
        self.messages.append(message)

    def append_workflow_timeline_entry(self, entry) -> None:
        """Add ``entry`` to the end of the recorded workflow timeline."""
        self.workflow_timeline.append(entry)

    def record_turn_usage(
        self,
        usage: dict[str, int],
        *,
        tool_calls: int,
        iterations: int,
    ) -> dict[str, int]:
        """Log one usage report and return canned cumulative totals.

        A copy of ``usage`` is stored, so later changes to the caller's dict do
        not alter the recorded call. The returned totals always report a single
        turn and echo ``tool_calls`` and ``iterations`` back.
        """
        self.recorded_calls.append(
            {
                "usage": dict(usage),
                "tool_calls": tool_calls,
                "iterations": iterations,
            }
        )
        return dict(turns=1, tool_calls=tool_calls, iterations=iterations)
78
79
class FakeCodeFilter:
    """Code-filter double whose ``reset`` does nothing."""

    def reset(self) -> None:
        """Do nothing; the fake keeps no state to clear."""
83
84
class FakeSafeguards:
    """Safeguards double that passes content through and never intervenes.

    Filters return their input unchanged, steering is never requested, and
    both loop detectors always report that no loop was found.
    """

    def __init__(self) -> None:
        self.code_filter = FakeCodeFilter()
        # Opaque placeholders: these tests only need the attributes to exist.
        self.action_tracker = object()
        self.validator = object()

    def filter_stream_chunk(self, content: str) -> str:
        """Return the streamed chunk unchanged."""
        return content

    def filter_complete_content(self, content: str) -> str:
        """Return the complete content unchanged."""
        return content

    def should_steer(self) -> bool:
        """Report that no steering is needed."""
        return False

    def get_steering_message(self) -> str | None:
        """Return ``None``; the fake never has a steering message."""
        return None

    def record_response(self, content: str) -> None:
        """Accept a response and discard it."""

    def detect_text_loop(self, content: str) -> tuple[bool, str]:
        """Return ``(False, "")``: no text loop is ever reported."""
        return (False, "")

    def detect_loop(self) -> tuple[bool, str]:
        """Return ``(False, "")``: no action loop is ever reported."""
        return (False, "")
111
112
class FakeExecutor:
    """Executor double that replays pre-queued outcomes in first-in, first-out order."""

    def __init__(self, outcomes: list[ToolExecutionOutcome]) -> None:
        # Work on a private copy so the caller's list is left untouched.
        self._outcomes = [*outcomes]

    async def execute_tool_call(self, tool_call: ToolCall, **_: object) -> ToolExecutionOutcome:
        """Return the next queued outcome, ignoring the call and any keyword options.

        Raises:
            AssertionError: if every queued outcome has already been consumed.
        """
        if self._outcomes:
            return self._outcomes.pop(0)
        raise AssertionError("No fake verification outcome queued")
121
122
class RecordingExecutor:
    """Executor double that logs every requested command and always succeeds."""

    def __init__(self) -> None:
        self.commands: list[str] = []

    async def execute_tool_call(self, tool_call: ToolCall, **_: object) -> ToolExecutionOutcome:
        """Record the call's ``command`` argument and return a successful outcome.

        A missing ``command`` argument is recorded as an empty string. The
        outcome always has output and stdout ``"ok"`` and exit code 0.
        """
        requested = tool_call.arguments.get("command", "")
        self.commands.append(str(requested))
        success_text = "ok"
        return tool_outcome(
            tool_call=tool_call,
            output=success_text,
            is_error=False,
            exit_code=0,
            stdout=success_text,
        )
137
138
class SelectiveRecordingExecutor:
    """Executor double that fails only commands containing a chosen substring."""

    def __init__(self, failing_match: str) -> None:
        self.failing_match = failing_match
        self.commands: list[str] = []

    async def execute_tool_call(self, tool_call: ToolCall, **_: object) -> ToolExecutionOutcome:
        """Record the command, then fail it if it contains ``failing_match``.

        A matching command yields an error outcome with exit code 1 and
        ``"failed"`` as output and stderr. Any other command yields a
        successful outcome with exit code 0 and ``"ok"`` as output and stdout.
        """
        command = str(tool_call.arguments.get("command", ""))
        self.commands.append(command)
        if self.failing_match in command:
            return tool_outcome(
                tool_call=tool_call,
                output="failed",
                is_error=True,
                exit_code=1,
                stdout="",
                stderr="failed",
            )
        return tool_outcome(
            tool_call=tool_call,
            output="ok",
            is_error=False,
            exit_code=0,
            stdout="ok",
            stderr="",
        )
156
157
def build_context(temp_dir: Path, session: FakeSession) -> RuntimeContext:
    """Assemble a ``RuntimeContext`` rooted at ``temp_dir`` for finalization tests.

    The context uses the default tool registry, a workspace-write permission
    policy built from the rules loaded for ``temp_dir``, a scripted backend,
    the supplied fake session, and pass-through safeguards.
    """
    registry = create_default_registry(temp_dir)
    registry.configure_workspace_root(temp_dir)

    rule_status = load_permission_rules(temp_dir)
    permission_policy = build_permission_policy(
        active_mode=PermissionMode.WORKSPACE_WRITE,
        workspace_root=temp_dir,
        tool_requirements=registry.get_tool_requirements(),
        rules=rule_status.rules,
    )

    # Plain namespaces stand in for the real config objects.
    reasoning_settings = SimpleNamespace(
        rollback=False,
        show_rollback_plan=False,
        completion_check=True,
        use_quick_completion=True,
        max_continuation_prompts=5,
        self_critique=False,
        confidence_scoring=False,
        min_confidence_for_action=3,
        verification=False,
    )
    runtime_config = SimpleNamespace(
        force_react=False,
        verification_retry_budget=3,
        reasoning=reasoning_settings,
    )

    return RuntimeContext(
        project_root=temp_dir,
        backend=ScriptedBackend(),
        registry=registry,
        session=session,  # type: ignore[arg-type]
        config=runtime_config,
        capability_profile=SimpleNamespace(supports_native_tools=True),  # type: ignore[arg-type]
        project_context=None,
        permission_policy=permission_policy,
        permission_config_status=rule_status,
        workflow_mode="execute",
        safeguards=FakeSafeguards(),
    )
195
196
def tool_outcome(
    *,
    tool_call: ToolCall,
    output: str,
    is_error: bool,
    exit_code: int,
    stdout: str = "",
    stderr: str = "",
) -> ToolExecutionOutcome:
    """Build an executed ``ToolExecutionOutcome`` for ``tool_call``.

    ``output`` is used as the display content, result content, event content,
    and registry output. ``exit_code``, ``stdout``, and ``stderr`` are placed
    in the registry result's metadata.
    """
    result_message = Message.tool_result_message(
        tool_call_id=tool_call.id,
        display_content=output,
        result_content=output,
        is_error=is_error,
    )
    registry_result = RegistryToolResult(
        output=output,
        is_error=is_error,
        metadata=dict(exit_code=exit_code, stdout=stdout, stderr=stderr),
    )
    return ToolExecutionOutcome(
        tool_call=tool_call,
        state=ToolExecutionState.EXECUTED,
        message=result_message,
        event_content=output,
        is_error=is_error,
        result_output=output,
        registry_result=registry_result,
    )
228
229
async def _noop_set_workflow_mode(mode, dod, emit, summary) -> None:
    """Accept a workflow-mode change request and do nothing with it."""
232
233
def test_turn_finalizer_finalize_summary_uses_runtime_context(
    temp_dir: Path,
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """Check that ``finalize_summary`` fills the summary from the runtime context.

    The memory store's ``capture_definition_of_done`` is replaced with a
    recorder so the test can confirm it was invoked. The finalized summary must
    carry the fake session's id, usage totals, trace, and completion decision.
    """
    fake_session = FakeSession()
    runtime_context = build_context(temp_dir, fake_session)

    tracer = RuntimeTracer()
    tracer.record("turn.completed", reason="done")
    finalizer = TurnFinalizer(
        runtime_context,
        tracer,
        DefinitionOfDoneStore(temp_dir),
        set_workflow_mode=_noop_set_workflow_mode,
    )

    finished_dod = create_definition_of_done("Finish the task")
    finished_dod.status = "done"
    turn_summary = TurnSummary(
        final_response="All set.",
        definition_of_done=finished_dod,
        iterations=2,
        usage={"prompt_tokens": 10},
        tool_result_messages=[Message(role=Role.TOOL, content="tool output")],
    )

    captured: dict[str, str] = {}

    def capture_definition_of_done(self, summary_text: str) -> None:
        captured["summary"] = summary_text

    monkeypatch.setattr(
        "loader.runtime.finalization.MemoryStore.capture_definition_of_done",
        capture_definition_of_done,
    )

    final_summary = finalizer.finalize_summary(turn_summary)

    # Session identity and the usage totals reported by the fake session.
    assert final_summary.session_id == "session-test-123"
    assert final_summary.cumulative_usage == {"turns": 1, "tool_calls": 1, "iterations": 2}
    expected_usage_call = {
        "usage": {"prompt_tokens": 10, "tool_calls": 1, "iterations": 2},
        "tool_calls": 1,
        "iterations": 2,
    }
    assert fake_session.recorded_calls == [expected_usage_call]

    # Memory capture ran and the trace is non-empty.
    assert "summary" in captured
    assert final_summary.trace

    # Completion decision data comes from the session.
    assert final_summary.completion_decision_code == "verification_passed"
    assert final_summary.completion_decision_summary == (
        "accepted the response after verification evidence passed"
    )
    decision_codes = [entry.decision_code for entry in final_summary.completion_trace]
    assert decision_codes == ["verification_passed"]
287
288
def test_verification_repair_guidance_uses_existing_artifacts_as_source_of_truth(
    temp_dir: Path,
) -> None:
    """Repair guidance should point at files that exist, not at missing link targets.

    An index and four chapter files exist on disk and are listed in the
    implementation plan. The failed evidence reports links to two chapter
    names that do not exist. The guidance must name existing chapter files.
    """
    guide_root = temp_dir / "guides" / "nginx"
    chapters = guide_root / "chapters"
    chapters.mkdir(parents=True)

    index_path = guide_root / "index.html"
    chapter_one, chapter_two, chapter_three, chapter_four = (
        chapters / file_name
        for file_name in (
            "01-getting-started.html",
            "02-installation.html",
            "03-first-website.html",
            "04-configuration-basics.html",
        )
    )
    existing_pages = (index_path, chapter_one, chapter_two, chapter_three, chapter_four)
    for page in existing_pages:
        page.write_text("<html></html>\n")

    plan_lines = [
        "# Implementation Plan",
        "",
        "## File Changes",
        f"- `{guide_root}/`",
        f"- `{chapters}/`",
        *(f"- `{page}`" for page in existing_pages),
        "",
    ]
    implementation_plan = temp_dir / "implementation.md"
    implementation_plan.write_text("\n".join(plan_lines))

    # Link targets named by the failed check; neither file exists on disk.
    missing_intro = chapters / "01-introduction.html"
    missing_server_blocks = chapters / "04-server-blocks.html"
    failed_evidence = VerificationEvidence(
        command="verify-links",
        passed=False,
        output=(
            "Missing local HTML links:\n"
            f"{index_path}:chapters/01-introduction.html -> {missing_intro}\n"
            f"{index_path}:chapters/04-server-blocks.html -> {missing_server_blocks}\n"
        ),
    )

    dod = create_definition_of_done("Repair the nginx guide index.")
    dod.implementation_plan = str(implementation_plan)
    dod.evidence = [failed_evidence]

    guidance = _build_verification_repair_guidance(
        dod,
        project_root=temp_dir,
    )

    assert "Use the existing artifact files as the source of truth" in guidance
    assert str(chapter_one) in guidance
    assert str(chapter_two) in guidance
    assert str(chapter_four) in guidance
346
347
@pytest.mark.asyncio
async def test_turn_finalizer_records_skipped_verification_observation(
    temp_dir: Path,
) -> None:
    """A response with no mutating actions is accepted with a skipped observation.

    The gate must not ask to continue, must record one SKIPPED observation on
    both the result and the last timeline entry, must move the default work
    item to completed, and must emit a ``dod_status`` event with status done.
    """
    skipped_status = VerificationObservationStatus.SKIPPED.value
    work_item = "Complete the requested work"

    session = FakeSession()
    finalizer = TurnFinalizer(
        build_context(temp_dir, session),
        RuntimeTracer(),
        DefinitionOfDoneStore(temp_dir),
        set_workflow_mode=_noop_set_workflow_mode,
    )
    dod = create_definition_of_done("Explain Loader's clarify loop.")
    summary = TurnSummary(final_response="")
    emitted = []

    async def record_event(event) -> None:
        emitted.append(event)

    result = await finalizer.run_definition_of_done_gate(
        dod=dod,
        candidate_response="Loader uses a bounded clarify loop before execution.",
        emit=record_event,
        summary=summary,
        executor=FakeExecutor([]),  # type: ignore[arg-type]
    )

    assert result.should_continue is False
    assert result.reason_code == "non_mutating_response_accepted"

    observations = result.verification_observations
    assert [observation.status for observation in observations] == [skipped_status]
    assert [observation.summary for observation in observations] == [
        "verification was skipped because no mutating work required checks"
    ]
    assert summary.verification_status == "skipped"

    assert work_item not in dod.pending_items
    assert work_item in dod.completed_items

    assert session.workflow_timeline[-1].kind == "verify_skip"
    last_entry_statuses = [
        observation.status
        for observation in session.workflow_timeline[-1].verification_observations
    ]
    assert last_entry_statuses == [skipped_status]
    assert any(
        event.type == "dod_status" and event.dod_status == "done" for event in emitted
    )
391
392
@pytest.mark.asyncio
async def test_turn_finalizer_accepts_noop_completion_with_task_restatement_todo(
    temp_dir: Path,
) -> None:
    """A no-edit answer is accepted even when a pending item restates the task.

    The pending list holds the task text itself plus the default work item.
    With no mutating actions, the gate must still accept the response.
    """
    session = FakeSession()
    finalizer = TurnFinalizer(
        build_context(temp_dir, session),
        RuntimeTracer(),
        DefinitionOfDoneStore(temp_dir),
        set_workflow_mode=_noop_set_workflow_mode,
    )

    task = (
        "Have a look at ~/Loader/guides/fortran/index.html, then "
        "~/Loader/guides/fortran/chapters. The table of contents links in "
        "index.html are inaccurate and the href’s are wrong. Let’s update the "
        "links and their link texts to be correct."
    )
    dod = create_definition_of_done(task)
    dod.pending_items = [task, "Complete the requested work"]

    async def discard_event(event) -> None:
        return None

    result = await finalizer.run_definition_of_done_gate(
        dod=dod,
        candidate_response="The table of contents is already correct, so no edit is needed.",
        emit=discard_event,
        summary=TurnSummary(final_response=""),
        executor=FakeExecutor([]),  # type: ignore[arg-type]
    )

    assert result.should_continue is False
    assert result.reason_code == "non_mutating_response_accepted"
428
429
@pytest.mark.asyncio
async def test_turn_finalizer_records_passed_verification_observation(
    temp_dir: Path,
) -> None:
    """A passing verification command yields a PASSED observation for attempt 1.

    The last two timeline entries must be the pending record followed by the
    passed observation, both tagged with the first attempt id.
    """
    verify_command = "uv run pytest -q"
    first_attempt_id = "verification-attempt-1"
    passed_status = VerificationObservationStatus.PASSED.value

    session = FakeSession()
    finalizer = TurnFinalizer(
        build_context(temp_dir, session),
        RuntimeTracer(),
        DefinitionOfDoneStore(temp_dir),
        set_workflow_mode=_noop_set_workflow_mode,
    )

    dod = create_definition_of_done("Update the runtime tests.")
    dod.mutating_actions.append("write")
    dod.verification_commands = [verify_command]
    summary = TurnSummary(final_response="")

    verify_call = ToolCall(
        id="verify-1-1",
        name="bash",
        arguments={"command": verify_command, "cwd": str(temp_dir)},
    )
    passing_outcome = tool_outcome(
        tool_call=verify_call,
        output="219 passed",
        is_error=False,
        exit_code=0,
        stdout="219 passed",
    )

    async def discard_event(event) -> None:
        return None

    result = await finalizer.run_definition_of_done_gate(
        dod=dod,
        candidate_response="Updated the runtime tests.",
        emit=discard_event,
        summary=summary,
        executor=FakeExecutor([passing_outcome]),  # type: ignore[arg-type]
    )

    # Gate result.
    assert result.should_continue is False
    assert result.reason_code == "verification_passed"
    assert [item.status for item in result.verification_observations] == [passed_status]
    first_observation = result.verification_observations[0]
    assert first_observation.attempt_id == first_attempt_id
    assert first_observation.attempt_number == 1
    assert first_observation.command == verify_command
    assert first_observation.detail == "219 passed"
    assert summary.verification_status == "passed"

    # Timeline: pending entry, then the passed observation.
    timeline = session.workflow_timeline
    assert [entry.reason_code for entry in timeline[-2:]] == [
        "verification_pending",
        "verification_command_passed",
    ]
    pending_entry = timeline[-2]
    assert [item.status for item in pending_entry.verification_observations] == [
        VerificationObservationStatus.PENDING.value
    ]
    assert pending_entry.verification_observations[0].attempt_id == first_attempt_id
    assert pending_entry.verification_observations[0].command == verify_command

    passed_entry = timeline[-1]
    assert passed_entry.kind == "verify_observation"
    assert passed_entry.reason_code == "verification_command_passed"
    assert [item.status for item in passed_entry.verification_observations] == [
        passed_status
    ]
502
503
@pytest.mark.asyncio
async def test_turn_finalizer_appends_runtime_semantic_verifier_to_planned_commands(
    temp_dir: Path,
) -> None:
    """The planned grep runs, and so does an extra ``python3`` heredoc command.

    The workspace holds an index page linking to one chapter. After the gate
    runs, the executed commands must include both the planned command and one
    that starts with ``python3 - <<'PY'``.
    """
    planned_command = 'grep -n "href=" index.html'

    chapters = temp_dir / "chapters"
    chapters.mkdir()
    intro_page = chapters / "01-introduction.html"
    intro_page.write_text("<h1>Chapter 1: Introduction to Fortran</h1>\n")

    index = temp_dir / "index.html"
    index_lines = (
        '<ul class="chapter-list">',
        ' <li><a href="chapters/01-introduction.html">Chapter 1: Introduction to Fortran</a></li>',
        "</ul>",
    )
    index.write_text("\n".join(index_lines))

    session = FakeSession()
    finalizer = TurnFinalizer(
        build_context(temp_dir, session),
        RuntimeTracer(),
        DefinitionOfDoneStore(temp_dir),
        set_workflow_mode=_noop_set_workflow_mode,
    )

    dod = create_definition_of_done(
        "Update index.html so the table of contents links and chapter titles are correct."
    )
    dod.mutating_actions.append("edit")
    dod.touched_files.append(str(index))
    dod.verification_commands = [planned_command]
    executor = RecordingExecutor()

    async def discard_event(event) -> None:
        return None

    result = await finalizer.run_definition_of_done_gate(
        dod=dod,
        candidate_response="Updated the index.html links.",
        emit=discard_event,
        summary=TurnSummary(final_response=""),
        executor=executor,  # type: ignore[arg-type]
    )

    assert result.should_continue is False
    assert planned_command in executor.commands
    assert any(command.startswith("python3 - <<'PY'") for command in executor.commands)
    last_observations = session.workflow_timeline[-1].verification_observations
    assert last_observations[0].attempt_id == "verification-attempt-1"
559
560
@pytest.mark.asyncio
async def test_turn_finalizer_does_not_append_repo_defaults_to_external_verification_plan(
    temp_dir: Path,
) -> None:
    """Only the planned commands run when the touched file lives outside the project.

    The project root contains ``pyproject.toml`` and ``package.json``, while
    the touched file sits in a sibling directory of ``temp_dir``. The executed
    commands must be exactly the two planned ones.

    The external directory is created with ``tempfile.TemporaryDirectory`` so
    it has a unique name and is removed when the test ends. A fixed-name
    directory next to ``temp_dir`` would outlive the fixture and could collide
    with other runs.
    """
    import tempfile  # local import: only this test needs a self-cleaning directory

    (temp_dir / "pyproject.toml").write_text("[project]\nname='loader'\n")
    (temp_dir / "package.json").write_text("{}\n")

    with tempfile.TemporaryDirectory(
        prefix="external-nginx-guide-",
        dir=temp_dir.parent,
    ) as external_dir:
        # Sibling of temp_dir, so the path lies outside the project root.
        external_root = Path(external_dir)
        external_index = external_root / "index.html"
        external_index.write_text("<html></html>\n")

        session = FakeSession()
        context = build_context(temp_dir, session)
        finalizer = TurnFinalizer(
            context,
            RuntimeTracer(),
            DefinitionOfDoneStore(temp_dir),
            set_workflow_mode=_noop_set_workflow_mode,
        )

        planned_commands = [
            f"ls -la {external_root}",
            f'grep -n "html" {external_index}',
        ]
        dod = create_definition_of_done("Create an external nginx guide.")
        dod.mutating_actions.append("write")
        dod.touched_files.append(str(external_index))
        # Hand over a copy so the expected list cannot be altered by the gate.
        dod.verification_commands = list(planned_commands)
        summary = TurnSummary(final_response="")
        executor = RecordingExecutor()

        async def capture(event) -> None:
            return None

        result = await finalizer.run_definition_of_done_gate(
            dod=dod,
            candidate_response="Created the external nginx guide.",
            emit=capture,
            summary=summary,
            executor=executor,  # type: ignore[arg-type]
        )

        assert result.should_continue is False
        assert executor.commands == planned_commands
606
607
@pytest.mark.asyncio
async def test_turn_finalizer_filters_reference_side_verification_commands(
    temp_dir: Path,
) -> None:
    """Verification commands aimed at the reference directory are not executed.

    The verification plan lists an ``ls`` for the nginx guide (covered by the
    implementation plan) and one for the fortran reference directory (not in
    the plan). Only commands mentioning the guide root may reach the executor.
    """
    guides_dir = temp_dir / "Loader" / "guides"

    guide_root = guides_dir / "nginx"
    chapters = guide_root / "chapters"
    chapters.mkdir(parents=True)
    index_path = guide_root / "index.html"
    chapter_one = chapters / "01-introduction.html"
    index_path.write_text("<html><body><h1>Guide</h1></body></html>\n")
    chapter_one.write_text("<html><body><h1>Intro</h1></body></html>\n")

    reference_root = guides_dir / "fortran"
    reference_root.mkdir(parents=True)

    implementation_lines = (
        "# Implementation Plan",
        "",
        "## File Changes",
        f"- `{guide_root}`",
        f"- `{chapters}`",
        f"- `{index_path}`",
        f"- `{chapter_one}`",
        "",
    )
    implementation_plan = temp_dir / "implementation.md"
    implementation_plan.write_text("\n".join(implementation_lines))

    verification_lines = (
        "# Verification Plan",
        "",
        "## Verification Commands",
        "```bash",
        f"ls -la {guide_root}",
        f"ls -la {reference_root}",
        "```",
        "",
    )
    verification_plan = temp_dir / "verification.md"
    verification_plan.write_text("\n".join(verification_lines))

    session = FakeSession()
    finalizer = TurnFinalizer(
        build_context(temp_dir, session),
        RuntimeTracer(),
        DefinitionOfDoneStore(temp_dir),
        set_workflow_mode=_noop_set_workflow_mode,
    )

    dod = create_definition_of_done("Create an nginx guide from an external reference.")
    dod.mutating_actions.append("write")
    dod.touched_files.extend([str(index_path), str(chapter_one)])
    dod.implementation_plan = str(implementation_plan)
    dod.verification_plan = str(verification_plan)
    executor = RecordingExecutor()

    async def discard_event(event) -> None:
        return None

    result = await finalizer.run_definition_of_done_gate(
        dod=dod,
        candidate_response="Created the nginx guide.",
        emit=discard_event,
        summary=TurnSummary(final_response=""),
        executor=executor,  # type: ignore[arg-type]
    )

    assert result.should_continue is False
    guide_text = str(guide_root)
    reference_text = str(reference_root)
    assert any(guide_text in command for command in executor.commands)
    assert all(reference_text not in command for command in executor.commands)
684
685
@pytest.mark.asyncio
async def test_turn_finalizer_blocks_completion_when_planned_artifacts_are_missing(
    temp_dir: Path,
) -> None:
    """Completion is refused while a planned file has not been written.

    The plan lists an index and two chapters, but only the index and the first
    chapter exist. The gate must ask to continue without running any
    verification command and must name the missing chapter in its message.
    """
    work_item = "Complete the requested work"

    docs = temp_dir / "docs"
    chapters = docs / "chapters"
    chapters.mkdir(parents=True)
    index = docs / "index.html"
    written_chapter = chapters / "01-intro.html"
    unwritten_chapter = chapters / "02-installation.html"

    index_links = (
        '<a href="chapters/01-intro.html">Intro</a>',
        '<a href="chapters/02-installation.html">Installation</a>',
    )
    index.write_text("\n".join(index_links))
    written_chapter.write_text("<h1>Intro</h1>\n")

    plan_lines = (
        "# Implementation Plan",
        "",
        "## File Changes",
        f"- `{index}`",
        f"- `{written_chapter}`",
        f"- `{unwritten_chapter}`",
    )
    implementation_plan = temp_dir / "implementation.md"
    implementation_plan.write_text("\n".join(plan_lines))

    session = FakeSession()
    finalizer = TurnFinalizer(
        build_context(temp_dir, session),
        RuntimeTracer(),
        DefinitionOfDoneStore(temp_dir),
        set_workflow_mode=_noop_set_workflow_mode,
    )

    dod = create_definition_of_done("Create a small multi-page HTML guide.")
    dod.mutating_actions.append("write")
    dod.touched_files.extend([str(index), str(written_chapter)])
    dod.implementation_plan = str(implementation_plan)
    dod.verification_commands = [f"ls -la {docs}"]
    executor = RecordingExecutor()

    async def discard_event(event) -> None:
        return None

    result = await finalizer.run_definition_of_done_gate(
        dod=dod,
        candidate_response="Finished the guide.",
        emit=discard_event,
        summary=TurnSummary(final_response=""),
        executor=executor,  # type: ignore[arg-type]
    )

    assert result.should_continue is True
    assert result.reason_code == "planned_artifacts_missing_continue"
    # Verification must not start while planned files are still missing.
    assert executor.commands == []
    assert dod.status == "draft"
    assert work_item in dod.pending_items
    assert work_item not in dod.completed_items

    reentry_text = session.messages[-1].content
    assert reentry_text.startswith("[PLANNED ARTIFACTS STILL MISSING]")
    assert "`02-installation.html`" in reentry_text
754
755
@pytest.mark.asyncio
async def test_turn_finalizer_records_missing_verification_observation(
    temp_dir: Path,
) -> None:
    """A mutating turn with no verification commands yields a MISSING observation.

    The gate must ask to continue, mark verification as failed, log the
    observation on the last timeline entry, and append a user message that
    starts with the definition-of-done failure banner.
    """
    missing_status = VerificationObservationStatus.MISSING.value
    first_attempt_id = "verification-attempt-1"

    session = FakeSession()
    finalizer = TurnFinalizer(
        build_context(temp_dir, session),
        RuntimeTracer(),
        DefinitionOfDoneStore(temp_dir),
        set_workflow_mode=_noop_set_workflow_mode,
    )
    dod = create_definition_of_done("Edit the loader bootstrap.")
    dod.mutating_actions.append("edit")
    summary = TurnSummary(final_response="")

    async def discard_event(event) -> None:
        return None

    result = await finalizer.run_definition_of_done_gate(
        dod=dod,
        candidate_response="Updated the bootstrap code.",
        emit=discard_event,
        summary=summary,
        executor=FakeExecutor([]),  # type: ignore[arg-type]
    )

    # Gate result.
    assert result.should_continue is True
    assert result.reason_code == "verification_failed_reentry"
    observations = result.verification_observations
    assert [item.status for item in observations] == [missing_status]
    assert observations[0].attempt_id == first_attempt_id
    assert observations[0].attempt_number == 1
    assert [item.summary for item in observations] == [
        "verification commands were still missing at execution time"
    ]
    assert summary.verification_status == "failed"

    # Last timeline entry.
    last_entry = session.workflow_timeline[-1]
    assert last_entry.kind == "verify_observation"
    assert last_entry.reason_code == "verification_commands_missing"
    assert [item.status for item in last_entry.verification_observations] == [
        missing_status
    ]
    assert last_entry.verification_observations[0].attempt_id == first_attempt_id

    # Re-entry prompt appended to the session.
    reentry_message = session.messages[-1]
    assert reentry_message.role == Role.USER
    assert reentry_message.content.startswith("[DEFINITION OF DONE CHECK FAILED]")
805
806
@pytest.mark.asyncio
async def test_turn_finalizer_ignores_unplanned_expansion_pending_items_once_plan_exists(
    temp_dir: Path,
) -> None:
    """A pending item for a file outside the plan does not block acceptance.

    Every file in the implementation plan exists on disk. The pending list
    also names a chapter that the plan never mentions. With no mutating
    actions, the gate must still accept the response.
    """
    session = FakeSession()
    finalizer = TurnFinalizer(
        build_context(temp_dir, session),
        RuntimeTracer(),
        DefinitionOfDoneStore(temp_dir),
        set_workflow_mode=_noop_set_workflow_mode,
    )

    docs = temp_dir / "guides" / "nginx"
    chapters = docs / "chapters"
    docs.mkdir(parents=True)
    chapters.mkdir()

    index = docs / "index.html"
    chapter_one = chapters / "01-getting-started.html"
    chapter_two = chapters / "02-installation.html"
    index.write_text("<html></html>\n")
    chapter_one.write_text("<h1>One</h1>\n")
    chapter_two.write_text("<h1>Two</h1>\n")

    plan_lines = (
        "# Implementation Plan",
        "",
        "## File Changes",
        f"- `{docs}/`",
        f"- `{chapters}/`",
        f"- `{index}`",
        f"- `{chapter_one}`",
        f"- `{chapter_two}`",
        "",
    )
    implementation_plan = temp_dir / "implementation.md"
    implementation_plan.write_text("\n".join(plan_lines))

    dod = create_definition_of_done("Create a small multi-page HTML guide.")
    dod.implementation_plan = str(implementation_plan)
    dod.pending_items = [
        "Create 07-performance-tuning.html",  # not part of the plan above
        "Complete the requested work",
    ]

    async def discard_event(event) -> None:
        return None

    result = await finalizer.run_definition_of_done_gate(
        dod=dod,
        candidate_response="Finished the guide.",
        emit=discard_event,
        summary=TurnSummary(final_response=""),
        executor=FakeExecutor([]),  # type: ignore[arg-type]
    )

    assert result.should_continue is False
    assert result.reason_code == "non_mutating_response_accepted"
869
870
@pytest.mark.asyncio
async def test_turn_finalizer_verification_failure_reentry_points_at_concrete_repair(
    temp_dir: Path,
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """A failed link check steers the next turn at the broken file and its target.

    The verification command reports a stylesheet link whose target does not
    exist. Both the queued steering message and the re-entry prompt must name
    the broken file and the missing target, and must discourage restarting
    discovery. Derived verification commands are patched to an empty list.
    """
    session = FakeSession()
    context = build_context(temp_dir, session)
    steering_log: list[str] = []
    context.queue_steering_message_callback = steering_log.append
    finalizer = TurnFinalizer(
        context,
        RuntimeTracer(),
        DefinitionOfDoneStore(temp_dir),
        set_workflow_mode=_noop_set_workflow_mode,
    )

    guide_root = temp_dir / "guides" / "nginx"
    broken_file = guide_root / "chapters" / "05-advanced-configurations.html"
    broken_file.parent.mkdir(parents=True, exist_ok=True)
    broken_file.write_text('<link rel="stylesheet" href="../styles.css">\n')
    missing_target = guide_root / "styles.css"

    dod = create_definition_of_done("Create the nginx guide.")
    dod.mutating_actions.append("write")
    dod.touched_files.append(str(broken_file))
    dod.verification_commands = ["python3 verify_links.py"]

    verify_call = ToolCall(
        id="verify-1-1",
        name="bash",
        arguments={"command": dod.verification_commands[0], "cwd": str(temp_dir)},
    )
    failure_output = (
        "Missing local HTML links:\n"
        f"{broken_file}:../styles.css -> {missing_target}\n"
    )
    failing_outcome = tool_outcome(
        tool_call=verify_call,
        output=failure_output,
        is_error=True,
        exit_code=1,
        stdout=failure_output,
    )

    async def discard_event(event) -> None:
        return None

    def no_derived_commands(*args, **kwargs):
        return []

    monkeypatch.setattr(
        "loader.runtime.finalization.derive_verification_commands",
        no_derived_commands,
    )

    result = await finalizer.run_definition_of_done_gate(
        dod=dod,
        candidate_response="The guide is complete.",
        emit=discard_event,
        summary=TurnSummary(final_response=""),
        executor=FakeExecutor([failing_outcome]),  # type: ignore[arg-type]
    )

    assert result.should_continue is True
    assert result.reason_code == "verification_failed_reentry"

    # Steering message queued through the context callback.
    assert steering_log
    steering_text = steering_log[-1]
    assert str(broken_file) in steering_text
    assert "../styles.css" in steering_text
    assert str(missing_target) in steering_text
    assert "Do not restart discovery or reread unrelated references." in steering_text

    # Re-entry prompt appended to the session.
    reentry_text = session.messages[-1].content
    assert reentry_text.startswith("[DEFINITION OF DONE CHECK FAILED]")
    assert f"Immediate next step: edit `{broken_file}`." in reentry_text
    assert f"create `{missing_target}`" in reentry_text
    assert (
        "Do not reread unrelated reference materials or restart discovery"
        in reentry_text
    )
945
946
@pytest.mark.asyncio
async def test_turn_finalizer_verification_failure_reentry_prioritizes_missing_planned_outputs(
    temp_dir: Path,
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """Re-entry tells the model to write the missing chapters, not edit the index.

    The index links to three chapters but only the first exists. The failed
    link check reports the other two. The re-entry prompt must make writing
    the second chapter the immediate next step, list both missing chapters,
    and must not suggest editing the index. Derived verification commands are
    patched to an empty list.
    """
    session = FakeSession()
    context = build_context(temp_dir, session)
    steering_log: list[str] = []
    context.queue_steering_message_callback = steering_log.append
    finalizer = TurnFinalizer(
        context,
        RuntimeTracer(),
        DefinitionOfDoneStore(temp_dir),
        set_workflow_mode=_noop_set_workflow_mode,
    )

    guide_root = temp_dir / "guides" / "nginx"
    chapters = guide_root / "chapters"
    chapters.mkdir(parents=True, exist_ok=True)
    index = guide_root / "index.html"
    existing_chapter = chapters / "01-installation.html"
    missing_second = chapters / "02-configuration.html"
    missing_third = chapters / "03-basic-usage.html"

    index_links = (
        '<a href="chapters/01-installation.html">Installation</a>',
        '<a href="chapters/02-configuration.html">Configuration</a>',
        '<a href="chapters/03-basic-usage.html">Basic Usage</a>',
    )
    index.write_text("\n".join(index_links))
    existing_chapter.write_text("<h1>Installation</h1>\n")

    plan_lines = (
        "# Implementation Plan",
        "",
        "## File Changes",
        f"- `{guide_root}/`",
        f"- `{chapters}/`",
        f"- `{index}`",
        f"- `{existing_chapter}`",
        "",
    )
    implementation_plan = temp_dir / "implementation.md"
    implementation_plan.write_text("\n".join(plan_lines))

    dod = create_definition_of_done("Create the nginx guide.")
    dod.mutating_actions.append("write")
    dod.touched_files.extend([str(index), str(existing_chapter)])
    dod.implementation_plan = str(implementation_plan)
    dod.verification_commands = ["python3 verify_links.py"]

    verify_call = ToolCall(
        id="verify-1-1",
        name="bash",
        arguments={"command": dod.verification_commands[0], "cwd": str(temp_dir)},
    )
    # The assertions compare against resolved paths; the tool output uses the raw ones.
    normalized_second = str(missing_second.resolve(strict=False))
    normalized_third = str(missing_third.resolve(strict=False))
    failure_output = (
        "Missing local HTML links:\n"
        f"{index}:chapters/02-configuration.html -> {missing_second}\n"
        f"{index}:chapters/03-basic-usage.html -> {missing_third}\n"
    )
    failing_outcome = tool_outcome(
        tool_call=verify_call,
        output=failure_output,
        is_error=True,
        exit_code=1,
        stdout=failure_output,
    )

    async def discard_event(event) -> None:
        return None

    def no_derived_commands(*args, **kwargs):
        return []

    monkeypatch.setattr(
        "loader.runtime.finalization.derive_verification_commands",
        no_derived_commands,
    )

    result = await finalizer.run_definition_of_done_gate(
        dod=dod,
        candidate_response="The guide is complete.",
        emit=discard_event,
        summary=TurnSummary(final_response=""),
        executor=FakeExecutor([failing_outcome]),  # type: ignore[arg-type]
    )

    assert result.should_continue is True
    assert result.reason_code == "verification_failed_reentry"

    # Steering message queued through the context callback.
    assert steering_log
    steering_text = steering_log[-1]
    assert normalized_second in steering_text
    assert "Do not rewrite the existing aggregate files" in steering_text

    # Re-entry prompt appended to the session.
    reentry_text = session.messages[-1].content
    assert reentry_text.startswith("[DEFINITION OF DONE CHECK FAILED]")
    assert f"Immediate next step: write `{normalized_second}`." in reentry_text
    assert f"creating missing planned artifact `{normalized_second}`" in reentry_text
    assert f"creating missing planned artifact `{normalized_third}`" in reentry_text
    assert f"Immediate next step: edit `{index}`." not in reentry_text
    assert "Do not rewrite existing aggregate files" in reentry_text
1056
1057
@pytest.mark.asyncio
async def test_turn_finalizer_does_not_reverify_without_new_changes(
    temp_dir: Path,
) -> None:
    """A previously failed DoD with an unchanged signature should not re-run verification.

    The DoD is seeded with a "failed" verification result and a stored
    signature. The gate is expected to continue with the
    ``verification_failed_no_new_changes`` reason, leave the recording executor
    without any commands, and append a "still failing" session message.
    """
    session = FakeSession()
    runtime_context = build_context(temp_dir, session)
    turn_finalizer = TurnFinalizer(
        runtime_context,
        RuntimeTracer(),
        DefinitionOfDoneStore(temp_dir),
        set_workflow_mode=_noop_set_workflow_mode,
    )

    index = temp_dir / "index.html"
    index.write_text("<ul></ul>\n")

    dod = create_definition_of_done("Fix the chapter list in index.html.")
    dod.mutating_actions.append("edit")
    dod.touched_files.append(str(index))
    dod.line_changes = 12
    dod.last_verification_result = "failed"
    # NOTE(review): presumably this matches the signature the finalizer computes
    # for the state above (one action, no commands) - confirm against finalization.
    dod.last_verification_signature = (
        f"lines={dod.line_changes};touched={index};actions=1;commands="
    )
    dod.evidence = []

    turn_summary = TurnSummary(final_response="")
    recording_executor = RecordingExecutor()

    async def discard_event(event) -> None:
        # Emitted events are irrelevant to this test.
        return None

    gate_result = await turn_finalizer.run_definition_of_done_gate(
        dod=dod,
        candidate_response="I checked the file again.",
        emit=discard_event,
        summary=turn_summary,
        executor=recording_executor,  # type: ignore[arg-type]
    )

    assert gate_result.should_continue is True
    assert gate_result.reason_code == "verification_failed_no_new_changes"
    # Nothing was dispatched to the executor.
    assert recording_executor.commands == []
    assert turn_summary.verification_status == "failed"
    latest_message = session.messages[-1].content
    assert latest_message.startswith("[DEFINITION OF DONE CHECK STILL FAILING]")
1100
1101
@pytest.mark.asyncio
async def test_turn_finalizer_accepts_missing_optional_html5validator_when_semantic_check_passes(
    temp_dir: Path,
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """A "command not found" html5validator run should be skipped, not treated as a failure.

    Two verification commands are scripted: a semantic check that exits 0 and
    an ``html5validator`` invocation that exits 127 with "command not found".
    The gate is expected to pass verification, mark the DoD done, record the
    second evidence item as skipped, and mention the skip in the final response.
    """
    session = FakeSession()
    context = build_context(temp_dir, session)
    finalizer = TurnFinalizer(
        context,
        RuntimeTracer(),
        DefinitionOfDoneStore(temp_dir),
        set_workflow_mode=_noop_set_workflow_mode,
    )

    dod = create_definition_of_done(
        "Update index.html so the table of contents links and chapter titles are correct."
    )
    dod.mutating_actions.append("edit")
    dod.touched_files.append(str(temp_dir / "index.html"))
    dod.verification_commands = [
        "python3 - <<'PY'\nprint('semantic ok')\nPY",
        "html5validator --root /tmp/fortran-qwen-recovery-check/",
    ]
    summary = TurnSummary(final_response="")

    # Read the commands back from the DoD so the scripted calls mirror what is stored there.
    semantic_command = dod.verification_commands[0]
    validator_command = dod.verification_commands[1]
    working_dir = str(temp_dir)
    semantic_check = ToolCall(
        id="verify-1-1",
        name="bash",
        arguments={"command": semantic_command, "cwd": working_dir},
    )
    validator_check = ToolCall(
        id="verify-1-2",
        name="bash",
        arguments={"command": validator_command, "cwd": working_dir},
    )

    async def discard_event(event) -> None:
        # Emitted events are irrelevant to this test.
        return None

    # Stub out command derivation so only the two configured commands are in play.
    monkeypatch.setattr(
        "loader.runtime.finalization.derive_verification_commands",
        lambda *args, **kwargs: [],
    )

    semantic_passed = tool_outcome(
        tool_call=semantic_check,
        output="semantic ok",
        is_error=False,
        exit_code=0,
        stdout="semantic ok",
    )
    # Exit code 127 with "command not found": the validator binary is not installed.
    validator_missing = tool_outcome(
        tool_call=validator_check,
        output="/bin/sh: html5validator: command not found",
        is_error=True,
        exit_code=127,
        stderr="/bin/sh: html5validator: command not found",
    )
    executor = FakeExecutor([semantic_passed, validator_missing])

    result = await finalizer.run_definition_of_done_gate(
        dod=dod,
        candidate_response="Updated the chapter links and titles.",
        emit=discard_event,
        summary=summary,
        executor=executor,  # type: ignore[arg-type]
    )

    # Overall verdict: verification passes despite the missing optional tool.
    assert result.should_continue is False
    assert result.reason_code == "verification_passed"
    assert summary.verification_status == "passed"
    assert dod.status == "done"
    assert dod.last_verification_result == "passed"

    # Per-command evidence: first passed, second recorded as skipped.
    assert [evidence.passed for evidence in dod.evidence] == [True, False]
    assert [evidence.skipped for evidence in dod.evidence] == [False, True]
    assert "SKIP" in result.final_response
    assert "html5validator" in result.final_response

    # Timeline: the last two entries correspond to the pass and the skip, in that order.
    assert session.workflow_timeline[-2].reason_code == "verification_command_passed"
    assert session.workflow_timeline[-1].reason_code == "verification_command_skipped"
    skipped_observations = session.workflow_timeline[-1].verification_observations
    assert [observation.status for observation in skipped_observations] == [
        VerificationObservationStatus.SKIPPED.value
    ]