Python · 51306 bytes Raw Blame History
1 """Tests for finalization helpers on RuntimeContext."""
2
3 from __future__ import annotations
4
5 from pathlib import Path
6 from types import SimpleNamespace
7
8 import pytest
9
10 from loader.llm.base import Message, Role, ToolCall
11 from loader.runtime.completion_trace import CompletionTraceEntry
12 from loader.runtime.context import RuntimeContext
13 from loader.runtime.dod import (
14 DefinitionOfDoneStore,
15 VerificationEvidence,
16 create_definition_of_done,
17 )
18 from loader.runtime.events import TurnSummary
19 from loader.runtime.executor import ToolExecutionOutcome, ToolExecutionState
20 from loader.runtime.finalization import (
21 TurnFinalizer,
22 _build_verification_repair_guidance,
23 )
24 from loader.runtime.permissions import (
25 PermissionMode,
26 build_permission_policy,
27 load_permission_rules,
28 )
29 from loader.runtime.repair_focus import extract_active_repair_context
30 from loader.runtime.tracing import RuntimeTracer
31 from loader.runtime.verification_observations import VerificationObservationStatus
32 from loader.tools.base import ToolResult as RegistryToolResult
33 from loader.tools.base import create_default_registry
34 from tests.helpers.runtime_harness import ScriptedBackend
35
36
class FakeSession:
    """In-memory session double used by the finalizer tests.

    It exposes the attributes the tests read back (messages, session id,
    completion decision/trace, workflow timeline) and records every
    ``record_turn_usage`` call in ``recorded_calls``.
    """

    def __init__(self) -> None:
        decision_code = "verification_passed"
        decision_summary = "accepted the response after verification evidence passed"

        self.session_id = "session-test-123"
        self.messages: list[Message] = []
        self.recorded_calls: list[dict[str, object]] = []
        self.workflow_timeline = []
        self.last_completion_decision_code = decision_code
        self.last_completion_decision_summary = decision_summary
        self.last_turn_transition_summary = (
            "completion -> finalize [terminal] Finalizing completed turn"
        )
        # A single pre-seeded trace entry mirroring the decision fields above.
        self.completion_trace = [
            CompletionTraceEntry(
                stage="definition_of_done",
                outcome="complete",
                decision_code=decision_code,
                decision_summary=decision_summary,
            )
        ]

    def append(self, message: Message) -> None:
        """Store *message* at the end of ``messages``."""
        self.messages.append(message)

    def append_workflow_timeline_entry(self, entry) -> None:
        """Store *entry* at the end of ``workflow_timeline``."""
        self.workflow_timeline.append(entry)

    def record_turn_usage(
        self,
        usage: dict[str, int],
        *,
        tool_calls: int,
        iterations: int,
    ) -> dict[str, int]:
        """Record one usage report and return fixed cumulative totals.

        A shallow copy of *usage* is stored so later mutation by the caller
        does not alter what was recorded. The returned mapping always reports
        ``turns == 1`` and echoes *tool_calls* and *iterations*.
        """
        self.recorded_calls.append(
            {
                "usage": dict(usage),
                "tool_calls": tool_calls,
                "iterations": iterations,
            }
        )
        return dict(turns=1, tool_calls=tool_calls, iterations=iterations)
79
80
class FakeCodeFilter:
    """Code-filter double whose ``reset`` is a no-op."""

    def reset(self) -> None:
        """Do nothing; returns ``None``."""
        return
84
85
class FakeSafeguards:
    """Safeguards double: content passes through unchanged and nothing is flagged."""

    def __init__(self) -> None:
        # Opaque placeholders; this class never calls into them.
        self.action_tracker = object()
        self.validator = object()
        self.code_filter = FakeCodeFilter()

    def filter_stream_chunk(self, content: str) -> str:
        """Return *content* unchanged."""
        return content

    def filter_complete_content(self, content: str) -> str:
        """Return *content* unchanged."""
        return content

    def should_steer(self) -> bool:
        """Always report that no steering is needed."""
        return False

    def get_steering_message(self) -> str | None:
        """Always report that there is no steering message."""
        return None

    def record_response(self, content: str) -> None:
        """Ignore *content*."""
        return

    def detect_text_loop(self, content: str) -> tuple[bool, str]:
        """Always report "no loop" with an empty description."""
        return (False, "")

    def detect_loop(self) -> tuple[bool, str]:
        """Always report "no loop" with an empty description."""
        return (False, "")
112
113
class FakeExecutor:
    """Executor double that replays a pre-queued list of outcomes in order."""

    def __init__(self, outcomes: list[ToolExecutionOutcome]) -> None:
        # Copy so the caller's list is not consumed as outcomes are popped.
        self._outcomes = [*outcomes]

    async def execute_tool_call(self, tool_call: ToolCall, **_: object) -> ToolExecutionOutcome:
        """Return the next queued outcome, ignoring *tool_call* and keyword arguments.

        Raises:
            AssertionError: if no outcome is left in the queue.
        """
        if self._outcomes:
            return self._outcomes.pop(0)
        raise AssertionError("No fake verification outcome queued")
122
123
class RecordingExecutor:
    """Executor double that logs each command and always reports success."""

    def __init__(self) -> None:
        self.commands: list[str] = []

    async def execute_tool_call(self, tool_call: ToolCall, **_: object) -> ToolExecutionOutcome:
        """Record the call's ``command`` argument and return a passing outcome.

        A missing ``command`` argument is recorded as an empty string.
        """
        self.commands.append(str(tool_call.arguments.get("command", "")))
        return tool_outcome(
            tool_call=tool_call,
            output="ok",
            is_error=False,
            exit_code=0,
            stdout="ok",
        )
138
139
class SelectiveRecordingExecutor:
    """Executor double that logs commands and fails those containing a marker."""

    def __init__(self, failing_match: str) -> None:
        self.commands: list[str] = []
        self.failing_match = failing_match

    async def execute_tool_call(self, tool_call: ToolCall, **_: object) -> ToolExecutionOutcome:
        """Record the command; fail it when it contains ``failing_match``.

        Failing commands yield exit code 1 with ``"failed"`` on stderr; all
        others yield exit code 0 with ``"ok"`` on stdout.
        """
        command = str(tool_call.arguments.get("command", ""))
        self.commands.append(command)
        if self.failing_match in command:
            return tool_outcome(
                tool_call=tool_call,
                output="failed",
                is_error=True,
                exit_code=1,
                stdout="",
                stderr="failed",
            )
        return tool_outcome(
            tool_call=tool_call,
            output="ok",
            is_error=False,
            exit_code=0,
            stdout="ok",
            stderr="",
        )
157
158
def build_context(temp_dir: Path, session: FakeSession) -> RuntimeContext:
    """Assemble a ``RuntimeContext`` rooted at *temp_dir* around *session*.

    The context uses the default tool registry, a workspace-write permission
    policy built from the rules loaded for *temp_dir*, a ``ScriptedBackend``,
    ``FakeSafeguards``, and a ``SimpleNamespace`` standing in for the config.
    """
    registry = create_default_registry(temp_dir)
    registry.configure_workspace_root(temp_dir)

    rule_status = load_permission_rules(temp_dir)
    permission_policy = build_permission_policy(
        active_mode=PermissionMode.WORKSPACE_WRITE,
        workspace_root=temp_dir,
        tool_requirements=registry.get_tool_requirements(),
        rules=rule_status.rules,
    )

    reasoning_settings = SimpleNamespace(
        rollback=False,
        show_rollback_plan=False,
        completion_check=True,
        use_quick_completion=True,
        max_continuation_prompts=5,
        self_critique=False,
        confidence_scoring=False,
        min_confidence_for_action=3,
        verification=False,
    )
    runtime_config = SimpleNamespace(
        force_react=False,
        verification_retry_budget=3,
        reasoning=reasoning_settings,
    )
    capability_profile = SimpleNamespace(supports_native_tools=True)

    return RuntimeContext(
        project_root=temp_dir,
        backend=ScriptedBackend(),
        registry=registry,
        session=session,  # type: ignore[arg-type]
        config=runtime_config,
        capability_profile=capability_profile,  # type: ignore[arg-type]
        project_context=None,
        permission_policy=permission_policy,
        permission_config_status=rule_status,
        workflow_mode="execute",
        safeguards=FakeSafeguards(),
    )
196
197
def tool_outcome(
    *,
    tool_call: ToolCall,
    output: str,
    is_error: bool,
    exit_code: int,
    stdout: str = "",
    stderr: str = "",
) -> ToolExecutionOutcome:
    """Build an ``EXECUTED`` ``ToolExecutionOutcome`` for *tool_call*.

    *output* is used for the tool-result message, the event content, the
    result output, and the registry result; *exit_code*, *stdout*, and
    *stderr* go into the registry result's metadata.
    """
    result_message = Message.tool_result_message(
        tool_call_id=tool_call.id,
        display_content=output,
        result_content=output,
        is_error=is_error,
    )
    registry_result = RegistryToolResult(
        output=output,
        is_error=is_error,
        metadata={
            "exit_code": exit_code,
            "stdout": stdout,
            "stderr": stderr,
        },
    )
    return ToolExecutionOutcome(
        tool_call=tool_call,
        state=ToolExecutionState.EXECUTED,
        message=result_message,
        event_content=output,
        is_error=is_error,
        result_output=output,
        registry_result=registry_result,
    )
229
230
231 async def _noop_set_workflow_mode(mode, dod, emit, summary) -> None:
232 return None
233
234
def test_turn_finalizer_finalize_summary_uses_runtime_context(
    temp_dir: Path,
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """``finalize_summary`` reports the session id, usage, trace, and completion decision."""
    fake_session = FakeSession()
    runtime_context = build_context(temp_dir, fake_session)
    runtime_tracer = RuntimeTracer()
    runtime_tracer.record("turn.completed", reason="done")
    turn_finalizer = TurnFinalizer(
        runtime_context,
        runtime_tracer,
        DefinitionOfDoneStore(temp_dir),
        set_workflow_mode=_noop_set_workflow_mode,
    )

    done_dod = create_definition_of_done("Finish the task")
    done_dod.status = "done"
    tool_message = Message(role=Role.TOOL, content="tool output")
    turn_summary = TurnSummary(
        final_response="All set.",
        definition_of_done=done_dod,
        iterations=2,
        usage={"prompt_tokens": 10},
        tool_result_messages=[tool_message],
    )

    # Swap in a recording stand-in for MemoryStore.capture_definition_of_done
    # so the test can tell whether finalization invoked it.
    captured: dict[str, str] = {}

    def capture_definition_of_done(self, summary_text: str) -> None:
        captured["summary"] = summary_text

    monkeypatch.setattr(
        "loader.runtime.finalization.MemoryStore.capture_definition_of_done",
        capture_definition_of_done,
    )

    result = turn_finalizer.finalize_summary(turn_summary)

    expected_recorded_call = {
        "usage": {"prompt_tokens": 10, "tool_calls": 1, "iterations": 2},
        "tool_calls": 1,
        "iterations": 2,
    }
    expected_decision_summary = "accepted the response after verification evidence passed"

    assert result.session_id == "session-test-123"
    assert result.cumulative_usage == {"turns": 1, "tool_calls": 1, "iterations": 2}
    assert fake_session.recorded_calls == [expected_recorded_call]
    assert "summary" in captured
    assert result.trace
    assert result.completion_decision_code == "verification_passed"
    assert result.completion_decision_summary == expected_decision_summary
    decision_codes = [entry.decision_code for entry in result.completion_trace]
    assert decision_codes == ["verification_passed"]
288
289
def test_verification_repair_guidance_uses_existing_artifacts_as_source_of_truth(
    temp_dir: Path,
) -> None:
    """Guidance for broken links names the chapter files that actually exist on disk."""
    guide_root = temp_dir / "guides" / "nginx"
    chapters = guide_root / "chapters"
    chapters.mkdir(parents=True)

    index_path = guide_root / "index.html"
    chapter_one = chapters / "01-getting-started.html"
    chapter_two = chapters / "02-installation.html"
    chapter_three = chapters / "03-first-website.html"
    chapter_four = chapters / "04-configuration-basics.html"
    artifact_files = (index_path, chapter_one, chapter_two, chapter_three, chapter_four)
    for artifact in artifact_files:
        artifact.write_text("<html></html>\n")

    plan_lines = [
        "# Implementation Plan",
        "",
        "## File Changes",
        f"- `{guide_root}/`",
        f"- `{chapters}/`",
        *(f"- `{artifact}`" for artifact in artifact_files),
        "",
    ]
    implementation_plan = temp_dir / "implementation.md"
    implementation_plan.write_text("\n".join(plan_lines))

    # The index links to chapter names that were never written to disk.
    missing_targets = ("01-introduction.html", "04-server-blocks.html")
    failure_output = "Missing local HTML links:\n" + "".join(
        f"{index_path}:chapters/{name} -> {chapters / name}\n"
        for name in missing_targets
    )

    dod = create_definition_of_done("Repair the nginx guide index.")
    dod.implementation_plan = str(implementation_plan)
    dod.evidence = [
        VerificationEvidence(
            command="verify-links",
            passed=False,
            output=failure_output,
        )
    ]

    guidance = _build_verification_repair_guidance(dod, project_root=temp_dir)

    assert "Use the existing artifact files as the source of truth" in guidance
    for existing_chapter in (chapter_one, chapter_two, chapter_four):
        assert str(existing_chapter) in guidance
347
348
def test_verification_repair_guidance_does_not_create_out_of_scope_link_target(
    temp_dir: Path,
) -> None:
    """A missing link target outside the planned files is not offered for creation."""
    guide_root = temp_dir / "guides" / "nginx"
    chapters = guide_root / "chapters"
    chapters.mkdir(parents=True)

    index_path = guide_root / "index.html"
    chapter_one = chapters / "01-introduction.html"
    index_path.write_text('<a href="../index.html">All guides</a>\n')
    chapter_one.write_text('<a href="../index.html">Back</a>\n')
    # The link target lives one level above the guide and is not in the plan.
    parent_index = temp_dir / "guides" / "index.html"

    plan_lines = [
        "# Implementation Plan",
        "",
        "## File Changes",
        f"- `{guide_root}/`",
        f"- `{chapters}/`",
        f"- `{index_path}`",
        f"- `{chapter_one}`",
        "",
    ]
    implementation_plan = temp_dir / "implementation.md"
    implementation_plan.write_text("\n".join(plan_lines))

    dod = create_definition_of_done("Create the nginx guide under guides/nginx.")
    dod.implementation_plan = str(implementation_plan)
    dod.touched_files.extend([str(index_path), str(chapter_one)])
    failure_output = (
        "Missing local HTML links:\n"
        + f"{index_path}:../index.html -> {parent_index}\n"
    )
    dod.evidence = [
        VerificationEvidence(
            command="verify-links",
            passed=False,
            output=failure_output,
        )
    ]

    guidance = _build_verification_repair_guidance(dod, project_root=temp_dir)
    guidance_message = Message(role=Role.USER, content=guidance)
    repair = extract_active_repair_context([guidance_message])

    assert "outside the requested artifact scope" in guidance
    assert "do not create that outside file" in guidance
    assert f"create `{parent_index}`" not in guidance
    assert repair is not None
    allowed = repair.allowed_paths
    assert str(parent_index.resolve(strict=False)) not in allowed
    assert str(index_path.resolve(strict=False)) in allowed
403
404
def test_verification_repair_guidance_replaces_stale_focus_for_html_quality_issue(
    temp_dir: Path,
) -> None:
    """New quality guidance supersedes an earlier repair-focus message about another file."""
    guide_root = temp_dir / "guides" / "nginx"
    chapters_dir = guide_root / "chapters"
    chapters_dir.mkdir(parents=True)

    stale_index = guide_root / "index.html"
    stale_index.write_text("<h1>Index</h1>\n")
    first_chapter = chapters_dir / "01-introduction.html"
    third_chapter = chapters_dir / "03-configuration.html"
    first_chapter.write_text("<h1>Intro</h1>\n")
    third_chapter.write_text("<h1>Config</h1>\n")

    # An older repair-focus message that points at the index file.
    stale_message = Message(
        role=Role.USER,
        content="".join(
            [
                "Repair focus:\n",
                f"- Fix the broken local reference `../index.html` in `{stale_index}`.\n",
                f"- Immediate next step: edit `{stale_index}`.\n",
            ]
        ),
    )

    quality_output = "".join(
        [
            "HTML guide content quality issues:\n",
            f"{first_chapter}: insufficient structured content (13 blocks, expected at least 18)\n",
            f"{third_chapter}: thin content (1505 text chars, expected at least 1758)\n",
        ]
    )
    dod = create_definition_of_done("Create an equally thorough HTML guide.")
    dod.evidence = [
        VerificationEvidence(command="quality", passed=False, output=quality_output)
    ]

    guidance = _build_verification_repair_guidance(dod, project_root=temp_dir)
    conversation = [stale_message, Message(role=Role.USER, content=guidance)]
    repair = extract_active_repair_context(conversation)

    assert guidance.startswith("Repair focus:")
    assert f"Immediate next step: edit `{first_chapter}` with a substantial" in guidance
    assert "Repair every listed quality target in order before any final answer" in guidance
    assert "HTML guide content quality issues" not in guidance
    assert repair is not None
    assert repair.artifact_path == str(first_chapter.resolve(strict=False))
    allowed = repair.allowed_paths
    assert str(stale_index.resolve(strict=False)) not in allowed
    assert str(third_chapter.resolve(strict=False)) in allowed
453
454
def test_verification_repair_guidance_prioritizes_structural_html_quality_issue(
    temp_dir: Path,
) -> None:
    """A duplicate-closing-tag report yields "replace the whole file" guidance."""
    chapters_dir = temp_dir / "guides" / "nginx" / "chapters"
    chapters_dir.mkdir(parents=True)
    chapter = chapters_dir / "08-troubleshooting.html"
    # Content trailing after the closing </html> tag.
    chapter.write_text(
        "<!DOCTYPE html><html><body><h1>Troubleshooting</h1></body></html>\n"
        + "<p>Trailing content.</p>\n"
    )

    quality_output = (
        "HTML guide content quality issues:\n"
        + f"{chapter}: expected exactly one closing </html> tag (found 2)\n"
    )
    dod = create_definition_of_done("Create an equally thorough HTML guide.")
    dod.evidence = [
        VerificationEvidence(command="quality", passed=False, output=quality_output)
    ]

    guidance = _build_verification_repair_guidance(dod, project_root=temp_dir)
    repair = extract_active_repair_context(
        [Message(role=Role.USER, content=guidance)]
    )

    expected_fragments = (
        f"Improve `{chapter}`: expected exactly one closing </html> tag",
        f"Immediate next step: replace `{chapter}` with one complete",
        "replace the malformed file with one complete valid HTML document",
        "do not append more content after an existing closing tag",
    )
    for fragment in expected_fragments:
        assert fragment in guidance
    assert repair is not None
    assert repair.artifact_path == str(chapter.resolve(strict=False))
488
489
def test_verification_repair_guidance_keeps_multi_file_quality_worklist(
    temp_dir: Path,
) -> None:
    """With eight thin chapters reported, guidance still covers the first and the last."""
    chapters = temp_dir / "guides" / "nginx" / "chapters"
    chapters.mkdir(parents=True)

    chapter_paths = []
    for number in range(1, 9):
        chapter = chapters / f"{number:02d}-chapter-{number}.html"
        chapter.write_text(f"<h1>{chapter.stem}</h1>\n")
        chapter_paths.append(chapter)

    issue_lines = [
        f"{chapter}: thin content (200 text chars, expected at least 1758)"
        for chapter in chapter_paths
    ]
    dod = create_definition_of_done("Create an equally thorough HTML guide.")
    dod.evidence = [
        VerificationEvidence(
            command="quality",
            passed=False,
            output="HTML guide content quality issues:\n" + "\n".join(issue_lines),
        )
    ]

    guidance = _build_verification_repair_guidance(dod, project_root=temp_dir)
    repair = extract_active_repair_context(
        [Message(role=Role.USER, content=guidance)]
    )

    first_chapter, last_chapter = chapter_paths[0], chapter_paths[-1]
    expected_fragments = (
        f"Improve `{first_chapter}`: thin content",
        f"Improve `{last_chapter}`: thin content",
        "add enough concrete prose",
        "bounded append-style",
        "avoid whole-file rewrites",
        "not table-of-contents inflation",
        "do not add duplicate navigation entries",
        "do not stop after touching only the first file",
    )
    for fragment in expected_fragments:
        assert fragment in guidance
    assert repair is not None
    assert repair.artifact_path == str(first_chapter.resolve(strict=False))
    assert str(last_chapter.resolve(strict=False)) in repair.allowed_paths
535
536
def test_verification_repair_guidance_keeps_quality_targets_with_link_repairs(
    temp_dir: Path,
) -> None:
    """Guidance combines a broken-link fix with a separate content-quality target."""
    guide = temp_dir / "guides" / "nginx"
    chapters = guide / "chapters"
    chapters.mkdir(parents=True)

    broken_asset_page = chapters / "07-performance.html"
    thin_page = chapters / "06-security.html"
    broken_asset_page.write_text('<link rel="stylesheet" href="../styles.css">\n')
    thin_page.write_text("<h1>Security</h1>\n")

    link_failure = VerificationEvidence(
        command="links",
        passed=False,
        output=(
            "Missing local HTML links:\n"
            + f"{broken_asset_page}:../styles.css -> {guide / 'styles.css'}\n"
        ),
    )
    quality_failure = VerificationEvidence(
        command="quality",
        passed=False,
        output=(
            "HTML guide content quality issues:\n"
            + f"{thin_page}: thin content (1348 text chars, expected at least 1758)\n"
        ),
    )
    dod = create_definition_of_done("Create an equally thorough HTML guide.")
    dod.evidence = [link_failure, quality_failure]

    guidance = _build_verification_repair_guidance(dod, project_root=temp_dir)
    repair = extract_active_repair_context(
        [Message(role=Role.USER, content=guidance)]
    )

    expected_fragments = (
        f"Fix the broken local reference `../styles.css` in `{broken_asset_page}`",
        f"Improve `{thin_page}`: thin content",
        "continue with the listed content-quality targets",
        "do not declare completion while any listed quality target remains",
    )
    for fragment in expected_fragments:
        assert fragment in guidance
    assert repair is not None
    assert repair.artifact_path == str(broken_asset_page.resolve(strict=False))
    assert str(thin_page.resolve(strict=False)) in repair.allowed_paths
582
583
@pytest.mark.asyncio
async def test_turn_finalizer_records_skipped_verification_observation(
    temp_dir: Path,
) -> None:
    """With no mutating actions recorded, the gate accepts the reply and logs a skip."""
    session = FakeSession()
    finalizer = TurnFinalizer(
        build_context(temp_dir, session),
        RuntimeTracer(),
        DefinitionOfDoneStore(temp_dir),
        set_workflow_mode=_noop_set_workflow_mode,
    )
    dod = create_definition_of_done("Explain Loader's clarify loop.")
    summary = TurnSummary(final_response="")
    emitted_events = []

    async def capture(event) -> None:
        emitted_events.append(event)

    result = await finalizer.run_definition_of_done_gate(
        dod=dod,
        candidate_response="Loader uses a bounded clarify loop before execution.",
        emit=capture,
        summary=summary,
        executor=FakeExecutor([]),  # type: ignore[arg-type]
    )

    skipped = VerificationObservationStatus.SKIPPED.value
    work_item = "Complete the requested work"

    assert result.should_continue is False
    assert result.reason_code == "non_mutating_response_accepted"
    assert [obs.status for obs in result.verification_observations] == [skipped]
    assert [obs.summary for obs in result.verification_observations] == [
        "verification was skipped because no mutating work required checks"
    ]
    assert summary.verification_status == "skipped"
    assert work_item not in dod.pending_items
    assert work_item in dod.completed_items

    latest_entry = session.workflow_timeline[-1]
    assert latest_entry.kind == "verify_skip"
    assert [obs.status for obs in latest_entry.verification_observations] == [skipped]
    assert any(
        event.type == "dod_status" and event.dod_status == "done"
        for event in emitted_events
    )
627
628
@pytest.mark.asyncio
async def test_turn_finalizer_accepts_noop_completion_with_task_restatement_todo(
    temp_dir: Path,
) -> None:
    """A pending item that restates the task does not block a no-edit reply."""
    session = FakeSession()
    finalizer = TurnFinalizer(
        build_context(temp_dir, session),
        RuntimeTracer(),
        DefinitionOfDoneStore(temp_dir),
        set_workflow_mode=_noop_set_workflow_mode,
    )
    task = (
        "Have a look at ~/Loader/guides/fortran/index.html, then "
        "~/Loader/guides/fortran/chapters. The table of contents links in "
        "index.html are inaccurate and the href’s are wrong. Let’s update the "
        "links and their link texts to be correct."
    )
    dod = create_definition_of_done(task)
    # The pending list holds the task text itself plus the generic work item.
    dod.pending_items = [task, "Complete the requested work"]
    summary = TurnSummary(final_response="")

    async def capture(event) -> None:
        return None

    result = await finalizer.run_definition_of_done_gate(
        dod=dod,
        candidate_response="The table of contents is already correct, so no edit is needed.",
        emit=capture,
        summary=summary,
        executor=FakeExecutor([]),  # type: ignore[arg-type]
    )

    assert result.should_continue is False
    assert result.reason_code == "non_mutating_response_accepted"
664
665
@pytest.mark.asyncio
async def test_turn_finalizer_records_passed_verification_observation(
    temp_dir: Path,
) -> None:
    """A passing verification command is recorded as pending, then passed."""
    verify_command = "uv run pytest -q"
    attempt_id = "verification-attempt-1"

    session = FakeSession()
    finalizer = TurnFinalizer(
        build_context(temp_dir, session),
        RuntimeTracer(),
        DefinitionOfDoneStore(temp_dir),
        set_workflow_mode=_noop_set_workflow_mode,
    )
    dod = create_definition_of_done("Update the runtime tests.")
    dod.mutating_actions.append("write")
    dod.verification_commands = [verify_command]
    summary = TurnSummary(final_response="")

    # Queue exactly one successful outcome for the fake executor to replay.
    tool_call = ToolCall(
        id="verify-1-1",
        name="bash",
        arguments={"command": verify_command, "cwd": str(temp_dir)},
    )
    queued_outcome = tool_outcome(
        tool_call=tool_call,
        output="219 passed",
        is_error=False,
        exit_code=0,
        stdout="219 passed",
    )

    async def capture(event) -> None:
        return None

    result = await finalizer.run_definition_of_done_gate(
        dod=dod,
        candidate_response="Updated the runtime tests.",
        emit=capture,
        summary=summary,
        executor=FakeExecutor([queued_outcome]),  # type: ignore[arg-type]
    )

    passed = VerificationObservationStatus.PASSED.value
    pending = VerificationObservationStatus.PENDING.value

    assert result.should_continue is False
    assert result.reason_code == "verification_passed"
    assert [obs.status for obs in result.verification_observations] == [passed]
    first_observation = result.verification_observations[0]
    assert first_observation.attempt_id == attempt_id
    assert first_observation.attempt_number == 1
    assert first_observation.command == verify_command
    assert first_observation.detail == "219 passed"
    assert summary.verification_status == "passed"

    last_two_reasons = [entry.reason_code for entry in session.workflow_timeline[-2:]]
    assert last_two_reasons == ["verification_pending", "verification_command_passed"]

    pending_entry = session.workflow_timeline[-2]
    assert [obs.status for obs in pending_entry.verification_observations] == [pending]
    assert pending_entry.verification_observations[0].attempt_id == attempt_id
    assert pending_entry.verification_observations[0].command == verify_command

    passed_entry = session.workflow_timeline[-1]
    assert passed_entry.kind == "verify_observation"
    assert passed_entry.reason_code == "verification_command_passed"
    assert [obs.status for obs in passed_entry.verification_observations] == [passed]
738
739
@pytest.mark.asyncio
async def test_turn_finalizer_appends_runtime_semantic_verifier_to_planned_commands(
    temp_dir: Path,
) -> None:
    """The gate runs the planned grep plus an extra ``python3`` heredoc command."""
    planned_command = 'grep -n "href=" index.html'

    chapters = temp_dir / "chapters"
    chapters.mkdir()
    (chapters / "01-introduction.html").write_text(
        "<h1>Chapter 1: Introduction to Fortran</h1>\n"
    )
    index = temp_dir / "index.html"
    index_lines = [
        '<ul class="chapter-list">',
        ' <li><a href="chapters/01-introduction.html">Chapter 1: Introduction to Fortran</a></li>',
        "</ul>",
    ]
    index.write_text("\n".join(index_lines))

    session = FakeSession()
    finalizer = TurnFinalizer(
        build_context(temp_dir, session),
        RuntimeTracer(),
        DefinitionOfDoneStore(temp_dir),
        set_workflow_mode=_noop_set_workflow_mode,
    )
    dod = create_definition_of_done(
        "Update index.html so the table of contents links and chapter titles are correct."
    )
    dod.mutating_actions.append("edit")
    dod.touched_files.append(str(index))
    dod.verification_commands = [planned_command]
    summary = TurnSummary(final_response="")
    executor = RecordingExecutor()

    async def capture(event) -> None:
        return None

    result = await finalizer.run_definition_of_done_gate(
        dod=dod,
        candidate_response="Updated the index.html links.",
        emit=capture,
        summary=summary,
        executor=executor,  # type: ignore[arg-type]
    )

    assert result.should_continue is False
    assert planned_command in executor.commands
    assert any(
        recorded.startswith("python3 - <<'PY'") for recorded in executor.commands
    )
    latest_observation = session.workflow_timeline[-1].verification_observations[0]
    assert latest_observation.attempt_id == "verification-attempt-1"
795
796
@pytest.mark.asyncio
async def test_turn_finalizer_does_not_append_repo_defaults_to_external_verification_plan(
    temp_dir: Path,
) -> None:
    """Planned commands for an artifact outside the project root run unchanged.

    The project root holds ``pyproject.toml`` and ``package.json``, while the
    touched file lives in a sibling directory of *temp_dir*. The test asserts
    that the executor receives exactly the two planned commands.

    The external directory gets a unique name and is removed when the test
    finishes, so nothing leaks outside *temp_dir* and repeated or parallel
    runs cannot collide on a shared fixed path.
    """
    import tempfile

    (temp_dir / "pyproject.toml").write_text("[project]\nname='loader'\n")
    (temp_dir / "package.json").write_text("{}\n")

    with tempfile.TemporaryDirectory(
        prefix="external-nginx-guide-",
        dir=temp_dir.parent,
    ) as external_dir:
        external_root = Path(external_dir)
        external_index = external_root / "index.html"
        external_index.write_text("<html></html>\n")
        planned_commands = [
            f"ls -la {external_root}",
            f'grep -n "html" {external_index}',
        ]

        session = FakeSession()
        context = build_context(temp_dir, session)
        finalizer = TurnFinalizer(
            context,
            RuntimeTracer(),
            DefinitionOfDoneStore(temp_dir),
            set_workflow_mode=_noop_set_workflow_mode,
        )
        dod = create_definition_of_done("Create an external nginx guide.")
        dod.mutating_actions.append("write")
        dod.touched_files.append(str(external_index))
        # Hand over a copy so the expected list cannot be mutated by the gate.
        dod.verification_commands = list(planned_commands)
        summary = TurnSummary(final_response="")
        executor = RecordingExecutor()

        async def capture(event) -> None:
            return None

        result = await finalizer.run_definition_of_done_gate(
            dod=dod,
            candidate_response="Created the external nginx guide.",
            emit=capture,
            summary=summary,
            executor=executor,  # type: ignore[arg-type]
        )

    assert result.should_continue is False
    assert executor.commands == planned_commands
842
843
@pytest.mark.asyncio
async def test_turn_finalizer_filters_reference_side_verification_commands(
    temp_dir: Path,
) -> None:
    """Verification-plan commands aimed at the reference directory are not executed."""
    guides_dir = temp_dir / "Loader" / "guides"
    guide_root = guides_dir / "nginx"
    chapters = guide_root / "chapters"
    chapters.mkdir(parents=True)
    index_path = guide_root / "index.html"
    chapter_one = chapters / "01-introduction.html"
    index_path.write_text("<html><body><h1>Guide</h1></body></html>\n")
    chapter_one.write_text("<html><body><h1>Intro</h1></body></html>\n")

    # Sibling guide that appears in the verification plan but not in the
    # implementation plan's file changes.
    reference_root = guides_dir / "fortran"
    reference_root.mkdir(parents=True)

    implementation_lines = [
        "# Implementation Plan",
        "",
        "## File Changes",
        f"- `{guide_root}`",
        f"- `{chapters}`",
        f"- `{index_path}`",
        f"- `{chapter_one}`",
        "",
    ]
    implementation_plan = temp_dir / "implementation.md"
    implementation_plan.write_text("\n".join(implementation_lines))

    verification_lines = [
        "# Verification Plan",
        "",
        "## Verification Commands",
        "```bash",
        f"ls -la {guide_root}",
        f"ls -la {reference_root}",
        "```",
        "",
    ]
    verification_plan = temp_dir / "verification.md"
    verification_plan.write_text("\n".join(verification_lines))

    session = FakeSession()
    finalizer = TurnFinalizer(
        build_context(temp_dir, session),
        RuntimeTracer(),
        DefinitionOfDoneStore(temp_dir),
        set_workflow_mode=_noop_set_workflow_mode,
    )
    dod = create_definition_of_done("Create an nginx guide from an external reference.")
    dod.mutating_actions.append("write")
    dod.touched_files.extend([str(index_path), str(chapter_one)])
    dod.implementation_plan = str(implementation_plan)
    dod.verification_plan = str(verification_plan)
    summary = TurnSummary(final_response="")
    executor = RecordingExecutor()

    async def capture(event) -> None:
        return None

    result = await finalizer.run_definition_of_done_gate(
        dod=dod,
        candidate_response="Created the nginx guide.",
        emit=capture,
        summary=summary,
        executor=executor,  # type: ignore[arg-type]
    )

    guide_text = str(guide_root)
    reference_text = str(reference_root)
    assert result.should_continue is False
    assert any(guide_text in recorded for recorded in executor.commands)
    assert all(reference_text not in recorded for recorded in executor.commands)
920
921
@pytest.mark.asyncio
async def test_turn_finalizer_blocks_completion_when_planned_artifacts_are_missing(
    temp_dir: Path,
) -> None:
    """A planned file that was never written keeps the turn going without verification."""
    docs = temp_dir / "docs"
    chapters = docs / "chapters"
    chapters.mkdir(parents=True)

    index = docs / "index.html"
    first = chapters / "01-intro.html"
    # Listed in the plan and linked from the index, but never created on disk.
    second = chapters / "02-installation.html"

    index_links = [
        '<a href="chapters/01-intro.html">Intro</a>',
        '<a href="chapters/02-installation.html">Installation</a>',
    ]
    index.write_text("\n".join(index_links))
    first.write_text("<h1>Intro</h1>\n")

    plan_lines = [
        "# Implementation Plan",
        "",
        "## File Changes",
        f"- `{index}`",
        f"- `{first}`",
        f"- `{second}`",
    ]
    implementation_plan = temp_dir / "implementation.md"
    implementation_plan.write_text("\n".join(plan_lines))

    session = FakeSession()
    finalizer = TurnFinalizer(
        build_context(temp_dir, session),
        RuntimeTracer(),
        DefinitionOfDoneStore(temp_dir),
        set_workflow_mode=_noop_set_workflow_mode,
    )
    dod = create_definition_of_done("Create a small multi-page HTML guide.")
    dod.mutating_actions.append("write")
    dod.touched_files.extend([str(index), str(first)])
    dod.implementation_plan = str(implementation_plan)
    dod.verification_commands = [f"ls -la {docs}"]
    summary = TurnSummary(final_response="")
    executor = RecordingExecutor()

    async def capture(event) -> None:
        return None

    result = await finalizer.run_definition_of_done_gate(
        dod=dod,
        candidate_response="Finished the guide.",
        emit=capture,
        summary=summary,
        executor=executor,  # type: ignore[arg-type]
    )

    work_item = "Complete the requested work"
    assert result.should_continue is True
    assert result.reason_code == "planned_artifacts_missing_continue"
    assert executor.commands == []
    assert dod.status == "draft"
    assert work_item in dod.pending_items
    assert work_item not in dod.completed_items
    reminder = session.messages[-1].content
    assert reminder.startswith("[PLANNED ARTIFACTS STILL MISSING]")
    assert "`02-installation.html`" in reminder
990
991
@pytest.mark.asyncio
async def test_turn_finalizer_records_missing_verification_observation(
    temp_dir: Path,
) -> None:
    """A mutating turn with no verification commands yields a MISSING observation.

    The definition of done records an ``edit`` action but no verification
    commands. The gate result and the last workflow timeline entry are both
    expected to carry a single observation with MISSING status for attempt 1.
    """
    session = FakeSession()
    finalizer = TurnFinalizer(
        build_context(temp_dir, session),
        RuntimeTracer(),
        DefinitionOfDoneStore(temp_dir),
        set_workflow_mode=_noop_set_workflow_mode,
    )
    dod = create_definition_of_done("Edit the loader bootstrap.")
    dod.mutating_actions.append("edit")
    turn_summary = TurnSummary(final_response="")

    async def drop_event(event) -> None:
        """Discard every emitted event; this test does not inspect them."""

    gate_result = await finalizer.run_definition_of_done_gate(
        dod=dod,
        candidate_response="Updated the bootstrap code.",
        emit=drop_event,
        summary=turn_summary,
        executor=FakeExecutor([]),  # type: ignore[arg-type]
    )

    expected_statuses = [VerificationObservationStatus.MISSING.value]

    assert gate_result.should_continue is True
    assert gate_result.reason_code == "verification_failed_reentry"

    # Observation attached to the gate result itself.
    assert [obs.status for obs in gate_result.verification_observations] == expected_statuses
    first_observation = gate_result.verification_observations[0]
    assert first_observation.attempt_id == "verification-attempt-1"
    assert first_observation.attempt_number == 1
    assert [obs.summary for obs in gate_result.verification_observations] == [
        "verification commands were still missing at execution time"
    ]
    assert turn_summary.verification_status == "failed"

    # The same observation must be mirrored on the newest timeline entry.
    latest_entry = session.workflow_timeline[-1]
    assert latest_entry.kind == "verify_observation"
    assert latest_entry.reason_code == "verification_commands_missing"
    assert [obs.status for obs in latest_entry.verification_observations] == expected_statuses
    assert latest_entry.verification_observations[0].attempt_id == "verification-attempt-1"

    last_message = session.messages[-1]
    assert last_message.role == Role.USER
    assert last_message.content.startswith("[DEFINITION OF DONE CHECK FAILED]")
1041
1042
@pytest.mark.asyncio
async def test_turn_finalizer_ignores_unplanned_expansion_pending_items_once_plan_exists(
    temp_dir: Path,
) -> None:
    """Pending items outside the plan do not block a turn whose plan is satisfied.

    Every file listed in the implementation plan exists on disk, while the
    pending list still names a chapter that the plan never mentions. The gate
    is expected to stop with ``non_mutating_response_accepted``.
    """
    session = FakeSession()
    finalizer = TurnFinalizer(
        build_context(temp_dir, session),
        RuntimeTracer(),
        DefinitionOfDoneStore(temp_dir),
        set_workflow_mode=_noop_set_workflow_mode,
    )

    guide_dir = temp_dir / "guides" / "nginx"
    chapters_dir = guide_dir / "chapters"
    guide_dir.mkdir(parents=True)
    chapters_dir.mkdir()

    index_page = guide_dir / "index.html"
    chapter_one = chapters_dir / "01-getting-started.html"
    chapter_two = chapters_dir / "02-installation.html"
    index_page.write_text("<html></html>\n")
    chapter_one.write_text("<h1>One</h1>\n")
    chapter_two.write_text("<h1>Two</h1>\n")

    plan_path = temp_dir / "implementation.md"
    plan_lines = [
        "# Implementation Plan",
        "",
        "## File Changes",
        f"- `{guide_dir}/`",
        f"- `{chapters_dir}/`",
        f"- `{index_page}`",
        f"- `{chapter_one}`",
        f"- `{chapter_two}`",
        "",
    ]
    plan_path.write_text("\n".join(plan_lines))

    dod = create_definition_of_done("Create a small multi-page HTML guide.")
    dod.implementation_plan = str(plan_path)
    # The first pending item names a file that appears nowhere in the plan.
    dod.pending_items = [
        "Create 07-performance-tuning.html",
        "Complete the requested work",
    ]
    turn_summary = TurnSummary(final_response="")

    async def drop_event(event) -> None:
        """Discard every emitted event; this test does not inspect them."""

    gate_result = await finalizer.run_definition_of_done_gate(
        dod=dod,
        candidate_response="Finished the guide.",
        emit=drop_event,
        summary=turn_summary,
        executor=FakeExecutor([]),  # type: ignore[arg-type]
    )

    assert gate_result.should_continue is False
    assert gate_result.reason_code == "non_mutating_response_accepted"
1105
1106
@pytest.mark.asyncio
async def test_turn_finalizer_verification_failure_reentry_points_at_concrete_repair(
    temp_dir: Path,
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """A failed link check produces guidance naming the broken file and target.

    The scripted verification output reports one chapter linking to a missing
    stylesheet. Both the queued steering message and the follow-up session
    message are expected to mention the concrete file and the missing target.
    """
    session = FakeSession()
    context = build_context(temp_dir, session)
    steering_messages: list[str] = []
    context.queue_steering_message_callback = steering_messages.append
    finalizer = TurnFinalizer(
        context,
        RuntimeTracer(),
        DefinitionOfDoneStore(temp_dir),
        set_workflow_mode=_noop_set_workflow_mode,
    )

    guide_dir = temp_dir / "guides" / "nginx"
    broken_file = guide_dir / "chapters" / "05-advanced-configurations.html"
    broken_file.parent.mkdir(parents=True, exist_ok=True)
    broken_file.write_text('<link rel="stylesheet" href="../styles.css">\n')
    # Referenced by the chapter above but never created.
    missing_target = guide_dir / "styles.css"

    dod = create_definition_of_done("Create the nginx guide.")
    dod.mutating_actions.append("write")
    dod.touched_files.append(str(broken_file))
    dod.verification_commands = ["python3 verify_links.py"]
    turn_summary = TurnSummary(final_response="")

    verify_command = dod.verification_commands[0]
    verify_call = ToolCall(
        id="verify-1-1",
        name="bash",
        arguments={"command": verify_command, "cwd": str(temp_dir)},
    )
    failure_output = (
        f"Missing local HTML links:\n{broken_file}:../styles.css -> {missing_target}\n"
    )
    scripted_executor = FakeExecutor(
        [
            tool_outcome(
                tool_call=verify_call,
                output=failure_output,
                is_error=True,
                exit_code=1,
                stdout=failure_output,
            )
        ]
    )

    async def drop_event(event) -> None:
        """Discard every emitted event; this test does not inspect them."""

    def no_derived_commands(*args, **kwargs):
        """Stand-in for derive_verification_commands that always returns nothing."""
        return []

    monkeypatch.setattr(
        "loader.runtime.finalization.derive_verification_commands",
        no_derived_commands,
    )

    gate_result = await finalizer.run_definition_of_done_gate(
        dod=dod,
        candidate_response="The guide is complete.",
        emit=drop_event,
        summary=turn_summary,
        executor=scripted_executor,  # type: ignore[arg-type]
    )

    assert gate_result.should_continue is True
    assert gate_result.reason_code == "verification_failed_reentry"

    # Steering message queued through the context callback.
    assert steering_messages
    steering = steering_messages[-1]
    assert str(broken_file) in steering
    assert "../styles.css" in steering
    assert str(missing_target) in steering
    assert "Do not restart discovery or reread unrelated references." in steering

    # Follow-up message appended to the session transcript.
    followup = session.messages[-1].content
    assert followup.startswith("[DEFINITION OF DONE CHECK FAILED]")
    assert f"Immediate next step: edit `{broken_file}`." in followup
    assert f"create `{missing_target}`" in followup
    assert "Do not reread unrelated reference materials or restart discovery" in followup
1181
1182
@pytest.mark.asyncio
async def test_turn_finalizer_verification_failure_reentry_prioritizes_missing_planned_outputs(
    temp_dir: Path,
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """Repair guidance targets the missing chapters rather than the index page.

    The index links to three chapters but only the first exists. The scripted
    link check reports the other two as missing, and the guidance is expected
    to say to write them instead of editing the index.
    """
    session = FakeSession()
    context = build_context(temp_dir, session)
    steering_messages: list[str] = []
    context.queue_steering_message_callback = steering_messages.append
    finalizer = TurnFinalizer(
        context,
        RuntimeTracer(),
        DefinitionOfDoneStore(temp_dir),
        set_workflow_mode=_noop_set_workflow_mode,
    )

    guide_dir = temp_dir / "guides" / "nginx"
    chapters_dir = guide_dir / "chapters"
    chapters_dir.mkdir(parents=True, exist_ok=True)
    index_page = guide_dir / "index.html"
    installation_page = chapters_dir / "01-installation.html"
    # The next two chapters are linked from the index but never written.
    configuration_page = chapters_dir / "02-configuration.html"
    usage_page = chapters_dir / "03-basic-usage.html"

    index_links = [
        '<a href="chapters/01-installation.html">Installation</a>',
        '<a href="chapters/02-configuration.html">Configuration</a>',
        '<a href="chapters/03-basic-usage.html">Basic Usage</a>',
    ]
    index_page.write_text("\n".join(index_links))
    installation_page.write_text("<h1>Installation</h1>\n")

    plan_path = temp_dir / "implementation.md"
    plan_lines = [
        "# Implementation Plan",
        "",
        "## File Changes",
        f"- `{guide_dir}/`",
        f"- `{chapters_dir}/`",
        f"- `{index_page}`",
        f"- `{installation_page}`",
        "",
    ]
    plan_path.write_text("\n".join(plan_lines))

    dod = create_definition_of_done("Create the nginx guide.")
    dod.mutating_actions.append("write")
    dod.touched_files.extend([str(index_page), str(installation_page)])
    dod.implementation_plan = str(plan_path)
    dod.verification_commands = ["python3 verify_links.py"]
    turn_summary = TurnSummary(final_response="")

    verify_command = dod.verification_commands[0]
    verify_call = ToolCall(
        id="verify-1-1",
        name="bash",
        arguments={"command": verify_command, "cwd": str(temp_dir)},
    )
    # The assertions below compare against the resolved form of each path.
    normalized_second = str(configuration_page.resolve(strict=False))
    normalized_third = str(usage_page.resolve(strict=False))
    failure_output = (
        "Missing local HTML links:\n"
        f"{index_page}:chapters/02-configuration.html -> {configuration_page}\n"
        f"{index_page}:chapters/03-basic-usage.html -> {usage_page}\n"
    )
    scripted_executor = FakeExecutor(
        [
            tool_outcome(
                tool_call=verify_call,
                output=failure_output,
                is_error=True,
                exit_code=1,
                stdout=failure_output,
            )
        ]
    )

    async def drop_event(event) -> None:
        """Discard every emitted event; this test does not inspect them."""

    def no_derived_commands(*args, **kwargs):
        """Stand-in for derive_verification_commands that always returns nothing."""
        return []

    monkeypatch.setattr(
        "loader.runtime.finalization.derive_verification_commands",
        no_derived_commands,
    )

    gate_result = await finalizer.run_definition_of_done_gate(
        dod=dod,
        candidate_response="The guide is complete.",
        emit=drop_event,
        summary=turn_summary,
        executor=scripted_executor,  # type: ignore[arg-type]
    )

    assert gate_result.should_continue is True
    assert gate_result.reason_code == "verification_failed_reentry"

    # Steering message queued through the context callback.
    assert steering_messages
    steering = steering_messages[-1]
    assert normalized_second in steering
    assert "Do not rewrite the existing aggregate files" in steering

    # Follow-up message appended to the session transcript.
    followup = session.messages[-1].content
    assert followup.startswith("[DEFINITION OF DONE CHECK FAILED]")
    assert f"Immediate next step: write `{normalized_second}`." in followup
    assert f"creating missing planned artifact `{normalized_second}`" in followup
    assert f"creating missing planned artifact `{normalized_third}`" in followup
    assert f"Immediate next step: edit `{index_page}`." not in followup
    assert "Do not rewrite existing aggregate files" in followup
1292
1293
@pytest.mark.asyncio
async def test_turn_finalizer_does_not_reverify_without_new_changes(
    temp_dir: Path,
) -> None:
    """No commands are executed when the last failed signature is unchanged.

    The definition of done is seeded with a failed verification result and a
    signature built from its current line count and touched file. The gate is
    expected to return ``verification_failed_no_new_changes`` and leave the
    recording executor untouched.
    """
    session = FakeSession()
    finalizer = TurnFinalizer(
        build_context(temp_dir, session),
        RuntimeTracer(),
        DefinitionOfDoneStore(temp_dir),
        set_workflow_mode=_noop_set_workflow_mode,
    )

    index_page = temp_dir / "index.html"
    index_page.write_text("<ul></ul>\n")

    dod = create_definition_of_done("Fix the chapter list in index.html.")
    dod.mutating_actions.append("edit")
    dod.touched_files.append(str(index_page))
    dod.line_changes = 12
    dod.last_verification_result = "failed"
    # Seeded from the state set just above so it describes the current changes.
    stale_signature = f"lines={dod.line_changes};touched={index_page};actions=1;commands="
    dod.last_verification_signature = stale_signature
    dod.evidence = []
    turn_summary = TurnSummary(final_response="")
    recorder = RecordingExecutor()

    async def drop_event(event) -> None:
        """Discard every emitted event; this test does not inspect them."""

    gate_result = await finalizer.run_definition_of_done_gate(
        dod=dod,
        candidate_response="I checked the file again.",
        emit=drop_event,
        summary=turn_summary,
        executor=recorder,  # type: ignore[arg-type]
    )

    assert gate_result.should_continue is True
    assert gate_result.reason_code == "verification_failed_no_new_changes"
    assert recorder.commands == []
    assert turn_summary.verification_status == "failed"
    followup = session.messages[-1].content
    assert followup.startswith("[DEFINITION OF DONE CHECK STILL FAILING]")
1336
1337
@pytest.mark.asyncio
async def test_turn_finalizer_extends_retry_budget_when_failures_change(
    temp_dir: Path,
) -> None:
    """Verification still runs after the budget is spent if the failure differs.

    ``retry_count`` starts equal to ``retry_budget`` and the stored signatures
    describe an older state. The gate is expected to run the verification
    command, return ``verification_failed_reentry``, and leave ``retry_count``
    at 1 with status ``fixing``.
    """
    session = FakeSession()
    finalizer = TurnFinalizer(
        build_context(temp_dir, session),
        RuntimeTracer(),
        DefinitionOfDoneStore(temp_dir),
        set_workflow_mode=_noop_set_workflow_mode,
    )

    chapter_file = temp_dir / "chapter.html"
    chapter_file.write_text("<h1>Chapter</h1>\n")

    dod = create_definition_of_done("Expand the generated chapter.")
    # Start with the retry budget already used up.
    dod.retry_count = dod.retry_budget
    dod.mutating_actions.append("patch")
    dod.touched_files.append(str(chapter_file))
    dod.line_changes = 20
    dod.last_verification_result = "failed"
    # Both stored signatures describe an earlier state than the one set above.
    dod.last_verification_signature = "lines=10;touched=chapter.html;actions=1;commands="
    dod.last_failed_verification_issue_signature = "old failing artifact set"
    dod.verification_commands = ["python check_quality.py"]
    turn_summary = TurnSummary(final_response="")
    recorder = SelectiveRecordingExecutor("check_quality.py")

    async def drop_event(event) -> None:
        """Discard every emitted event; this test does not inspect them."""

    gate_result = await finalizer.run_definition_of_done_gate(
        dod=dod,
        candidate_response="I expanded one failing file.",
        emit=drop_event,
        summary=turn_summary,
        executor=recorder,  # type: ignore[arg-type]
    )

    assert gate_result.should_continue is True
    assert gate_result.reason_code == "verification_failed_reentry"
    assert dod.retry_count == 1
    assert dod.status == "fixing"
    assert "python check_quality.py" in recorder.commands
    followup = session.messages[-1].content
    assert followup.startswith("[DEFINITION OF DONE CHECK FAILED]")
1381
1382
@pytest.mark.asyncio
async def test_turn_finalizer_accepts_missing_optional_html5validator_when_semantic_check_passes(
    temp_dir: Path,
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """A missing ``html5validator`` binary is skipped instead of failing the gate.

    Two verification commands are scripted: a semantic check that exits 0 and an
    ``html5validator`` run that exits 127 with "command not found". The gate is
    expected to report ``verification_passed`` and record the second command as
    skipped evidence.
    """
    session = FakeSession()
    finalizer = TurnFinalizer(
        build_context(temp_dir, session),
        RuntimeTracer(),
        DefinitionOfDoneStore(temp_dir),
        set_workflow_mode=_noop_set_workflow_mode,
    )

    dod = create_definition_of_done(
        "Update index.html so the table of contents links and chapter titles are correct."
    )
    dod.mutating_actions.append("edit")
    dod.touched_files.append(str(temp_dir / "index.html"))
    dod.verification_commands = [
        "python3 - <<'PY'\nprint('semantic ok')\nPY",
        "html5validator --root /tmp/fortran-qwen-recovery-check/",
    ]
    turn_summary = TurnSummary(final_response="")

    working_dir = str(temp_dir)
    semantic_call = ToolCall(
        id="verify-1-1",
        name="bash",
        arguments={"command": dod.verification_commands[0], "cwd": working_dir},
    )
    html5validator_call = ToolCall(
        id="verify-1-2",
        name="bash",
        arguments={"command": dod.verification_commands[1], "cwd": working_dir},
    )

    semantic_outcome = tool_outcome(
        tool_call=semantic_call,
        output="semantic ok",
        is_error=False,
        exit_code=0,
        stdout="semantic ok",
    )
    # Exit code 127 with "command not found": the validator is not installed.
    missing_binary_outcome = tool_outcome(
        tool_call=html5validator_call,
        output="/bin/sh: html5validator: command not found",
        is_error=True,
        exit_code=127,
        stderr="/bin/sh: html5validator: command not found",
    )
    scripted_executor = FakeExecutor([semantic_outcome, missing_binary_outcome])

    async def drop_event(event) -> None:
        """Discard every emitted event; this test does not inspect them."""

    def no_derived_commands(*args, **kwargs):
        """Stand-in for derive_verification_commands that always returns nothing."""
        return []

    monkeypatch.setattr(
        "loader.runtime.finalization.derive_verification_commands",
        no_derived_commands,
    )

    gate_result = await finalizer.run_definition_of_done_gate(
        dod=dod,
        candidate_response="Updated the chapter links and titles.",
        emit=drop_event,
        summary=turn_summary,
        executor=scripted_executor,  # type: ignore[arg-type]
    )

    assert gate_result.should_continue is False
    assert gate_result.reason_code == "verification_passed"
    assert turn_summary.verification_status == "passed"
    assert dod.status == "done"
    assert dod.last_verification_result == "passed"

    # Evidence keeps one entry per command, in command order.
    assert [entry.passed for entry in dod.evidence] == [True, False]
    assert [entry.skipped for entry in dod.evidence] == [False, True]
    assert "SKIP" in gate_result.final_response
    assert "html5validator" in gate_result.final_response

    # The last two timeline entries correspond to the two commands, in order.
    timeline = session.workflow_timeline
    assert timeline[-2].reason_code == "verification_command_passed"
    assert timeline[-1].reason_code == "verification_command_skipped"
    assert [obs.status for obs in timeline[-1].verification_observations] == [
        VerificationObservationStatus.SKIPPED.value
    ]