"""Tests for completion-policy helpers."""

from __future__ import annotations

from pathlib import Path
from types import SimpleNamespace

import pytest

from loader.llm.base import Message, Role, ToolCall
from loader.runtime.completion_policy import CompletionPolicy
from loader.runtime.context import RuntimeContext
from loader.runtime.dod import VerificationEvidence, create_definition_of_done
from loader.runtime.events import TurnSummary
from loader.runtime.evidence_provenance import EvidenceProvenanceStatus
from loader.runtime.permissions import (
    PermissionMode,
    build_permission_policy,
    load_permission_rules,
)
from loader.runtime.task_completion import (
    assess_completion_follow_through,
    assess_completion_follow_through_with_provenance,
    detect_premature_completion,
    get_continuation_prompt,
)
from loader.runtime.verification_observations import (
    VerificationObservationStatus,
    verification_attempt_id,
)
from loader.runtime.workflow import (
    advance_todos_from_tool_call,
    sync_todos_to_definition_of_done,
)
from loader.tools.base import create_default_registry
from tests.helpers.runtime_harness import ScriptedBackend


class FakeCodeFilter:
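    """Code-filter stand-in whose only behavior is a no-op ``reset``."""
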
    def reset(self) -> None:
        return None


class FakeSafeguards:
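    """Safeguards double that records responses, never steers, and reports a preconfigured text-loop result."""
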
    def __init__(self, *, text_loop: tuple[bool, str] = (False, "")) -> None:
        self.action_tracker = object()
        self.validator = object()
        self.code_filter = FakeCodeFilter()
        self._text_loop = text_loop
        self.recorded: list[str] = []

    def filter_stream_chunk(self, content: str) -> str:
        return content

    def filter_complete_content(self, content: str) -> str:
        return content

    def should_steer(self) -> bool:
        return False

    def get_steering_message(self) -> str | None:
        return None

    def record_response(self, content: str) -> None:
        self.recorded.append(content)

    def detect_text_loop(self, content: str) -> tuple[bool, str]:
        return self._text_loop

    def detect_loop(self) -> tuple[bool, str]:
        return False, ""


class FakeSession:
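    """In-memory session stub that simply collects appended messages."""
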
    def __init__(self) -> None:
        self.messages: list[Message] = []

    def append(self, message: Message) -> None:
        self.messages.append(message)


def build_context(
    temp_dir: Path,
    *,
    safeguards: FakeSafeguards,
    max_continuation_prompts: int = 5,
    use_quick_completion: bool = True,
) -> RuntimeContext:
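    """Build a RuntimeContext around a real tool registry and permission policy plus the test fakes."""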
    registry = create_default_registry(temp_dir)
    registry.configure_workspace_root(temp_dir)
    rule_status = load_permission_rules(temp_dir)
    policy = build_permission_policy(
        active_mode=PermissionMode.WORKSPACE_WRITE,
        workspace_root=temp_dir,
        tool_requirements=registry.get_tool_requirements(),
        rules=rule_status.rules,
    )
    return RuntimeContext(
        project_root=temp_dir,
        backend=ScriptedBackend(),
        registry=registry,
        session=FakeSession(),  # type: ignore[arg-type]
        config=SimpleNamespace(
            force_react=False,
            reasoning=SimpleNamespace(
                max_continuation_prompts=max_continuation_prompts,
                use_quick_completion=use_quick_completion,
            ),
        ),
        capability_profile=SimpleNamespace(supports_native_tools=True),  # type: ignore[arg-type]
        project_context=None,
        permission_policy=policy,
        permission_config_status=rule_status,
        workflow_mode="execute",
        safeguards=safeguards,
    )


def test_completion_policy_finalize_response_text_keeps_original_response() -> None:
    response = CompletionPolicy.finalize_response_text(
        content="Inspected the file successfully.",
        actions_taken=["read: README.md"],
    )

    assert response == "Inspected the file successfully."


def test_detect_premature_completion_respects_explicit_done_without_actions() -> None:
    assert detect_premature_completion(
        "Explain how Loader works.",
        "Done.",
        [],
    ) is False


def test_get_continuation_prompt_surfaces_missing_verification_steps() -> None:
    prompt = get_continuation_prompt(
        "Create the script and test that it works.",
        ["write: script.py"],
        "The script has been created.",
    )

    assert "Continue with" in prompt
    assert "run the relevant tests" in prompt.lower() or "verify" in prompt.lower()


def test_assess_completion_follow_through_tracks_missing_evidence() -> None:
    check = assess_completion_follow_through(
        task="Create the script and test that it works.",
        response="The script has been created.",
        actions_taken=["write: script.py"],
    )

    assert check.is_complete is False
    assert "showing the requested work was actually carried out" in check.required_evidence
    assert "showing the result was run or verified" in check.required_evidence
    assert check.missing_evidence == ["showing the result was run or verified"]
    assert check.suggested_next_steps == [
        "Execute what you created or run the relevant tests now"
    ]


def test_assess_completion_follow_through_accepts_informational_tasks() -> None:
    check = assess_completion_follow_through(
        task="Explain how Loader's workflow timeline works.",
        response="Loader records workflow decisions and policy events in a timeline.",
        actions_taken=[],
    )

    assert check.is_complete is True
    assert check.required_evidence == []
    assert check.missing_evidence == []


def test_assess_completion_follow_through_uses_passing_verification_evidence() -> None:
    dod = create_definition_of_done("Run pytest -q and make sure it works.")
    dod.verification_commands = ["pytest -q"]
    dod.evidence = [
        VerificationEvidence(
            command="pytest -q",
            passed=True,
            stdout="342 passed",
            kind="test",
        )
    ]
    dod.last_verification_result = "passed"

    check = assess_completion_follow_through(
        task="Run pytest -q and make sure it works.",
        response="The test suite passed.",
        actions_taken=[],
        dod=dod,
    )

    assert check.is_complete is True
    assert check.missing_evidence == []
    assert "verified: pytest -q" in check.accomplished


def test_assess_completion_follow_through_surfaces_failing_verification() -> None:
    dod = create_definition_of_done("Run pytest -q and make sure it works.")
    dod.verification_commands = ["pytest -q"]
    dod.evidence = [
        VerificationEvidence(
            command="pytest -q",
            passed=False,
            stderr="1 failed",
            kind="test",
        )
    ]
    dod.last_verification_result = "failed"

    check = assess_completion_follow_through(
        task="Run pytest -q and make sure it works.",
        response="The tests are done.",
        actions_taken=[],
        dod=dod,
    )

    assert check.is_complete is False
    assert check.missing_evidence == [
        "a passing verification result from `pytest -q` (current verification is still failing)"
    ]
    assert check.suggested_next_steps == [
        "Fix the failing `pytest -q` result and rerun it"
    ]


def test_assess_completion_follow_through_surfaces_planned_verification() -> None:
    dod = create_definition_of_done("Run pytest -q and make sure it works.")
    dod.verification_commands = ["pytest -q"]
    dod.last_verification_result = "planned"

    check = assess_completion_follow_through(
        task="Run pytest -q and make sure it works.",
        response="The tests are next.",
        actions_taken=["write: README.md"],
        dod=dod,
    )

    assert check.is_complete is False
    assert check.missing_evidence == [
        "a passing verification result from `pytest -q` (verification is planned but has not run yet)"
    ]
    assert check.suggested_next_steps == ["Run the planned verification `pytest -q` now"]


def test_assess_completion_follow_through_surfaces_pending_verification() -> None:
    dod = create_definition_of_done("Run pytest -q and make sure it works.")
    dod.verification_commands = ["pytest -q"]
    dod.last_verification_result = "pending"

    check = assess_completion_follow_through(
        task="Run pytest -q and make sure it works.",
        response="Verification is underway.",
        actions_taken=["write: README.md"],
        dod=dod,
    )

    assert check.is_complete is False
    assert check.missing_evidence == [
        "a completed passing verification result from `pytest -q` (verification is still pending)"
    ]
    assert check.suggested_next_steps == [
        "Finish running `pytest -q` and capture the result"
    ]


def test_assess_completion_follow_through_requires_fresh_verification_when_stale() -> None:
    dod = create_definition_of_done("Run pytest -q and make sure it works.")
    dod.verification_commands = ["pytest -q"]
    dod.last_verification_result = "stale"
    dod.verification_attempt_counter = 2
    dod.active_verification_attempt_id = verification_attempt_id(2)
    dod.active_verification_attempt_number = 2

    check = assess_completion_follow_through(
        task="Run pytest -q and make sure it works.",
        response="The tests were already handled.",
        actions_taken=["write: README.md"],
        dod=dod,
    )

    assert check.is_complete is False
    assert check.missing_evidence == [
        "a fresh passing verification result from `pytest -q` (previous verification became stale after new mutating work)"
    ]
    assert check.suggested_next_steps == [
        "Rerun `pytest -q` now that the implementation changed again"
    ]


def test_completion_assessment_projects_superseded_verification_attempt_for_stale_result() -> None:
    dod = create_definition_of_done("Run pytest -q and make sure it works.")
    dod.verification_commands = ["pytest -q"]
    dod.last_verification_result = "stale"
    dod.verification_attempt_counter = 2
    dod.active_verification_attempt_id = verification_attempt_id(2)
    dod.active_verification_attempt_number = 2

    assessment = assess_completion_follow_through_with_provenance(
        task="Run pytest -q and make sure it works.",
        response="The tests were already handled.",
        actions_taken=["write: README.md"],
        dod=dod,
    )

    assert [item.status for item in assessment.verification_observations] == [
        VerificationObservationStatus.STALE.value
    ]
    assert assessment.verification_observations[0].attempt_id == verification_attempt_id(1)
    assert assessment.verification_observations[0].attempt_number == 1
    assert assessment.verification_observations[0].supersedes_attempt_id == (
        verification_attempt_id(2)
    )


def test_completion_assessment_attaches_typed_verification_provenance() -> None:
    dod = create_definition_of_done("Run pytest -q and make sure it works.")
    dod.verification_commands = ["pytest -q"]
    dod.evidence = [
        VerificationEvidence(
            command="pytest -q",
            passed=False,
            stderr="1 failed",
            kind="test",
        )
    ]
    dod.last_verification_result = "failed"

    assessment = assess_completion_follow_through_with_provenance(
        task="Run pytest -q and make sure it works.",
        response="The tests are done.",
        actions_taken=[],
        dod=dod,
    )

    assert assessment.check.is_complete is False
    assert [item.status for item in assessment.evidence_provenance] == [
        EvidenceProvenanceStatus.CONTRADICTS.value
    ]
    assert assessment.evidence_provenance[0].summary == "verification failed for `pytest -q`"


def test_completion_assessment_uses_advanced_todo_progress_for_next_step() -> None:
    dod = create_definition_of_done("Fix the chapter links in index.html.")
    sync_todos_to_definition_of_done(
        dod,
        [
            {
                "content": "First, examine the current index.html file to understand its structure",
                "active_form": "Working on: First, examine the current index.html file to understand its structure",
                "status": "pending",
            },
            {
                "content": "List and read all HTML files in the chapters directory to extract chapter information",
                "active_form": "Working on: List and read all HTML files in the chapters directory to extract chapter information",
                "status": "pending",
            },
            {
                "content": "Parse chapter titles from each HTML file",
                "active_form": "Working on: Parse chapter titles from each HTML file",
                "status": "pending",
            },
            {
                "content": "Update index.html with correct chapter links and titles",
                "active_form": "Working on: Update index.html with correct chapter links and titles",
                "status": "pending",
            },
        ],
    )
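    # Three investigative tool calls advance the earlier todos; the final
    # "Update index.html" item is expected to remain open.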
    advance_todos_from_tool_call(
        dod,
        ToolCall(
            id="read-index",
            name="read",
            arguments={"file_path": "/tmp/fortran/index.html"},
        ),
    )
    advance_todos_from_tool_call(
        dod,
        ToolCall(
            id="glob-chapters",
            name="glob",
            arguments={"path": "/tmp/fortran/chapters", "pattern": "*.html"},
        ),
    )
    advance_todos_from_tool_call(
        dod,
        ToolCall(
            id="read-chapter",
            name="read",
            arguments={"file_path": "/tmp/fortran/chapters/01-introduction.html"},
        ),
    )

    assessment = assess_completion_follow_through_with_provenance(
        task="Update /tmp/fortran/index.html so every chapter link is correct.",
        response="I'll update the index.html file with the correct chapter links and titles.",
        actions_taken=[
            "read: {'file_path': '/tmp/fortran/index.html'}",
            "glob: {'path': '/tmp/fortran/chapters', 'pattern': '*.html'}",
            "read: {'file_path': '/tmp/fortran/chapters/01-introduction.html'}",
        ],
        dod=dod,
    )

    assert assessment.check.missing_evidence[0] == (
        "completion of tracked work items "
        "(Update index.html with correct chapter links and titles)"
    )
    assert assessment.check.suggested_next_steps[0] == (
        "Complete the tracked item: Update index.html with correct chapter links and titles"
    )


@pytest.mark.asyncio
async def test_completion_policy_stops_for_text_loop_using_runtime_context(
    temp_dir: Path,
) -> None:
    context = build_context(
        temp_dir,
        safeguards=FakeSafeguards(text_loop=(True, "assistant repeated the same summary")),
    )
    policy = CompletionPolicy(context)
    summary = TurnSummary(final_response="")
    events = []

    async def emit(event) -> None:
        events.append(event)

    decision = await policy.maybe_stop_for_text_loop(
        content="Same summary again.",
        emit=emit,
        summary=summary,
    )

    assert decision.should_stop is True
    assert decision.decision_code == "text_loop_bailout"
    assert decision.decision_summary == (
        "stopped after detecting a repeated text loop"
    )
    assert summary.final_response == (
        "I stopped because I was repeating myself and couldn't make further progress."
    )
    assert summary.assistant_messages[-1].role == Role.ASSISTANT
    assert context.session.messages[-1].content == summary.final_response
    assert events[0].type == "error"
    assert events[1].type == "response"


@pytest.mark.asyncio
async def test_completion_policy_requests_continuation_using_runtime_context(
    temp_dir: Path,
) -> None:
    context = build_context(
        temp_dir,
        safeguards=FakeSafeguards(),
    )
    policy = CompletionPolicy(context)
    events = []

    async def emit(event) -> None:
        events.append(event)

    decision = await policy.maybe_continue_for_completion(
        content="I can handle that.",
        response_content="I can handle that.",
        task="Create the file and verify it works.",
        actions_taken=[],
        continuation_count=0,
        emit=emit,
    )

    assert decision.should_continue is True
    assert decision.decision_code == "premature_completion_nudge"
    assert decision.decision_summary == (
        "requested one continuation because the non-mutating response looked incomplete"
    )
    assert decision.completion_check is not None
    assert decision.completion_check.missing_evidence == [
        "showing the requested work was actually carried out",
        "showing the result was run or verified",
    ]
    assert context.session.messages[-2] == Message(
        role=Role.ASSISTANT,
        content="I can handle that.",
    )
    assert context.session.messages[-1].role == Role.USER
    assert "verify it works" in context.session.messages[-1].content.lower()
    assert events[0].type == "completion_check"
    assert events[0].completion_check is not None
    assert events[0].completion_check.missing_evidence == [
        "showing the requested work was actually carried out",
        "showing the result was run or verified",
    ]
    assert [item.status for item in decision.evidence_provenance] == [
        EvidenceProvenanceStatus.MISSING.value,
        EvidenceProvenanceStatus.MISSING.value,
    ]


@pytest.mark.asyncio
async def test_completion_policy_accepts_passed_verification_from_dod(
    temp_dir: Path,
) -> None:
    context = build_context(
        temp_dir,
        safeguards=FakeSafeguards(),
    )
    policy = CompletionPolicy(context)
    dod = create_definition_of_done("Run pytest -q and make sure it works.")
    dod.verification_commands = ["pytest -q"]
    dod.evidence = [
        VerificationEvidence(
            command="pytest -q",
            passed=True,
            stdout="342 passed",
            kind="test",
        )
    ]
    dod.last_verification_result = "passed"
    events = []

    async def emit(event) -> None:
        events.append(event)

    decision = await policy.maybe_continue_for_completion(
        content="The tests passed.",
        response_content="The tests passed.",
        task="Run pytest -q and make sure it works.",
        actions_taken=[],
        continuation_count=0,
        emit=emit,
        dod=dod,
    )

    assert decision.should_continue is False
    assert decision.should_finalize is False
    assert decision.decision_code == "completion_response_accepted"
    assert decision.completion_check is not None
    assert decision.completion_check.missing_evidence == []
    assert events == []
    assert [item.summary for item in decision.evidence_provenance] == [
        "verification passed for `pytest -q`"
    ]


@pytest.mark.asyncio
async def test_completion_policy_finalizes_with_concrete_failed_verification_gap(
    temp_dir: Path,
) -> None:
    context = build_context(
        temp_dir,
        safeguards=FakeSafeguards(),
        max_continuation_prompts=1,
    )
    policy = CompletionPolicy(context)
    dod = create_definition_of_done("Run pytest -q and make sure it works.")
    dod.verification_commands = ["pytest -q"]
    dod.evidence = [
        VerificationEvidence(
            command="pytest -q",
            passed=False,
            stderr="1 failed",
            kind="test",
        )
    ]
    dod.last_verification_result = "failed"
    dod.verification_attempt_counter = 2
    dod.active_verification_attempt_id = verification_attempt_id(2)
    dod.active_verification_attempt_number = 2
    events = []

    async def emit(event) -> None:
        events.append(event)

    decision = await policy.maybe_continue_for_completion(
        content="The tests are done.",
        response_content="The tests are done.",
        task="Run pytest -q and make sure it works.",
        actions_taken=[],
        continuation_count=1,
        emit=emit,
        dod=dod,
    )

    assert decision.should_continue is False
    assert decision.should_finalize is True
    assert decision.decision_code == "continuation_budget_exhausted"
    assert decision.decision_summary == (
        "stopped because the continuation budget was exhausted while observed "
        "verification still showed verification failed for `pytest -q` "
        "[1 failed; attempt 2]"
    )
    assert decision.completion_check is not None
    assert decision.completion_check.missing_evidence == [
        "a passing verification result from `pytest -q` (current verification is still failing)"
    ]
    assert decision.final_response == (
        "I stopped because the continuation budget was exhausted and observed "
        "verification still showed: verification failed for `pytest -q` "
        "[1 failed; attempt 2]."
    )
    assert events[0].type == "completion_check"
    assert [item.status for item in decision.evidence_provenance] == [
        EvidenceProvenanceStatus.CONTRADICTS.value
    ]
    assert [item.status for item in decision.verification_observations] == [
        VerificationObservationStatus.FAILED.value
    ]
    assert decision.verification_observations[0].attempt_number == 2


@pytest.mark.asyncio
async def test_completion_policy_uses_missing_observed_verification_when_budget_is_exhausted(
    temp_dir: Path,
) -> None:
    context = build_context(
        temp_dir,
        safeguards=FakeSafeguards(),
        max_continuation_prompts=1,
    )
    policy = CompletionPolicy(context)
    dod = create_definition_of_done("Run pytest -q and make sure it works.")
    dod.verification_commands = ["pytest -q"]
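    # The result is marked "failed" but no VerificationEvidence is recorded, so the
    # observed verification should surface as a missing observation.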
    dod.last_verification_result = "failed"
    dod.verification_attempt_counter = 3
    dod.active_verification_attempt_id = verification_attempt_id(3)
    dod.active_verification_attempt_number = 3
    events = []

    async def emit(event) -> None:
        events.append(event)

    decision = await policy.maybe_continue_for_completion(
        content="The tests are done.",
        response_content="The tests are done.",
        task="Run pytest -q and make sure it works.",
        actions_taken=[],
        continuation_count=1,
        emit=emit,
        dod=dod,
    )

    assert decision.should_continue is False
    assert decision.should_finalize is True
    assert decision.decision_code == "continuation_budget_exhausted"
    assert decision.decision_summary == (
        "stopped because the continuation budget was exhausted while observed "
        "verification still showed verification did not produce an observed "
        "result for `pytest -q` [attempt 3]"
    )
    assert decision.final_response == (
        "I stopped because the continuation budget was exhausted and observed "
        "verification still showed: verification did not produce an observed "
        "result for `pytest -q` [attempt 3]."
    )
    assert [item.status for item in decision.verification_observations] == [
        VerificationObservationStatus.MISSING.value
    ]
    assert decision.verification_observations[0].attempt_number == 3
    assert events[0].type == "completion_check"


@pytest.mark.asyncio
async def test_completion_policy_uses_pending_observed_verification_when_budget_is_exhausted(
    temp_dir: Path,
) -> None:
    context = build_context(
        temp_dir,
        safeguards=FakeSafeguards(),
        max_continuation_prompts=1,
    )
    policy = CompletionPolicy(context)
    dod = create_definition_of_done("Run pytest -q and make sure it works.")
    dod.verification_commands = ["pytest -q"]
    dod.last_verification_result = "pending"
    dod.verification_attempt_counter = 4
    dod.active_verification_attempt_id = verification_attempt_id(4)
    dod.active_verification_attempt_number = 4
    events = []

    async def emit(event) -> None:
        events.append(event)

    decision = await policy.maybe_continue_for_completion(
        content="Verification is underway.",
        response_content="Verification is underway.",
        task="Run pytest -q and make sure it works.",
        actions_taken=["write: README.md"],
        continuation_count=1,
        emit=emit,
        dod=dod,
    )

    assert decision.should_continue is False
    assert decision.should_finalize is True
    assert decision.decision_code == "continuation_budget_exhausted"
    assert decision.decision_summary == (
        "stopped because the continuation budget was exhausted while observed "
        "verification still showed verification pending for `pytest -q` [attempt 4]"
    )
    assert decision.final_response == (
        "I stopped because the continuation budget was exhausted and observed "
        "verification still showed: verification pending for `pytest -q` [attempt 4]."
    )
    assert [item.status for item in decision.verification_observations] == [
        VerificationObservationStatus.PENDING.value
    ]
    assert decision.verification_observations[0].attempt_number == 4
    assert events[0].type == "completion_check"


@pytest.mark.asyncio
async def test_completion_policy_uses_stale_observed_verification_when_budget_is_exhausted(
    temp_dir: Path,
) -> None:
    context = build_context(
        temp_dir,
        safeguards=FakeSafeguards(),
        max_continuation_prompts=1,
    )
    policy = CompletionPolicy(context)
    dod = create_definition_of_done("Run pytest -q and make sure it works.")
    dod.verification_commands = ["pytest -q"]
    dod.last_verification_result = "stale"
    dod.verification_attempt_counter = 2
    dod.active_verification_attempt_id = verification_attempt_id(2)
    dod.active_verification_attempt_number = 2
    events = []

    async def emit(event) -> None:
        events.append(event)

    decision = await policy.maybe_continue_for_completion(
        content="The tests were already handled.",
        response_content="The tests were already handled.",
        task="Run pytest -q and make sure it works.",
        actions_taken=["write: README.md"],
        continuation_count=1,
        emit=emit,
        dod=dod,
    )

    assert decision.should_continue is False
    assert decision.should_finalize is True
    assert decision.decision_code == "continuation_budget_exhausted"
    assert decision.decision_summary == (
        "stopped because the continuation budget was exhausted while observed "
        "verification still showed verification became stale for `pytest -q` "
        "after new mutating work [attempt 1 -> attempt 2]"
    )
    assert decision.final_response == (
        "I stopped because the continuation budget was exhausted and observed "
        "verification still showed: verification became stale for `pytest -q` "
        "after new mutating work [attempt 1 -> attempt 2]."
    )
    assert [item.status for item in decision.verification_observations] == [
        VerificationObservationStatus.STALE.value
    ]
    assert decision.verification_observations[0].attempt_number == 1
    assert decision.verification_observations[0].supersedes_attempt_id == (
        verification_attempt_id(2)
    )
    assert events[0].type == "completion_check"


@pytest.mark.asyncio
async def test_completion_policy_finalizes_when_budget_is_exhausted(
    temp_dir: Path,
) -> None:
    context = build_context(
        temp_dir,
        safeguards=FakeSafeguards(),
        max_continuation_prompts=1,
    )
    policy = CompletionPolicy(context)
    events = []

    async def emit(event) -> None:
        events.append(event)

    decision = await policy.maybe_continue_for_completion(
        content="I looked into it.",
        response_content="I looked into it.",
        task="Fix the README heading.",
        actions_taken=[],
        continuation_count=1,
        emit=emit,
    )

    assert decision.should_continue is False
    assert decision.should_finalize is True
    assert decision.decision_code == "continuation_budget_exhausted"
    assert decision.completion_check is not None
    assert decision.completion_check.missing_evidence == [
        "showing the requested work was actually carried out"
    ]
    assert "Missing evidence" in decision.final_response
    assert decision.verification_observations == []
    assert events[0].type == "completion_check"