1 """Tests for completion-policy helpers."""
2
3 from __future__ import annotations
4
5 from pathlib import Path
6 from types import SimpleNamespace
7
8 import pytest
9
10 from loader.llm.base import Message, Role
11 from loader.runtime.completion_policy import CompletionPolicy
12 from loader.runtime.context import RuntimeContext
13 from loader.runtime.dod import VerificationEvidence, create_definition_of_done
14 from loader.runtime.events import TurnSummary
15 from loader.runtime.evidence_provenance import EvidenceProvenanceStatus
16 from loader.runtime.permissions import (
17 PermissionMode,
18 build_permission_policy,
19 load_permission_rules,
20 )
21 from loader.runtime.task_completion import (
22 assess_completion_follow_through,
23 assess_completion_follow_through_with_provenance,
24 detect_premature_completion,
25 get_continuation_prompt,
26 )
27 from loader.runtime.verification_observations import (
28 VerificationObservationStatus,
29 verification_attempt_id,
30 )
31 from loader.tools.base import create_default_registry
32 from tests.helpers.runtime_harness import ScriptedBackend
33
34
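# Test doubles used by build_context below: lightweight stand-ins for the
# safeguards stack and session store so the tests never need the real runtime
# collaborators.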
class FakeCodeFilter:
    def reset(self) -> None:
        return None


class FakeSafeguards:
    def __init__(self, *, text_loop: tuple[bool, str] = (False, "")) -> None:
        self.action_tracker = object()
        self.validator = object()
        self.code_filter = FakeCodeFilter()
        self._text_loop = text_loop
        self.recorded: list[str] = []

    def filter_stream_chunk(self, content: str) -> str:
        return content

    def filter_complete_content(self, content: str) -> str:
        return content

    def should_steer(self) -> bool:
        return False

    def get_steering_message(self) -> str | None:
        return None

    def record_response(self, content: str) -> None:
        self.recorded.append(content)

    def detect_text_loop(self, content: str) -> tuple[bool, str]:
        return self._text_loop

    def detect_loop(self) -> tuple[bool, str]:
        return False, ""


class FakeSession:
    def __init__(self) -> None:
        self.messages: list[Message] = []

    def append(self, message: Message) -> None:
        self.messages.append(message)


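# Builds a RuntimeContext around the fakes above, with a real tool registry
# and permission policy rooted at the temporary workspace.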
def build_context(
    temp_dir: Path,
    *,
    safeguards: FakeSafeguards,
    max_continuation_prompts: int = 5,
    use_quick_completion: bool = True,
) -> RuntimeContext:
    registry = create_default_registry(temp_dir)
    registry.configure_workspace_root(temp_dir)
    rule_status = load_permission_rules(temp_dir)
    policy = build_permission_policy(
        active_mode=PermissionMode.WORKSPACE_WRITE,
        workspace_root=temp_dir,
        tool_requirements=registry.get_tool_requirements(),
        rules=rule_status.rules,
    )
    return RuntimeContext(
        project_root=temp_dir,
        backend=ScriptedBackend(),
        registry=registry,
        session=FakeSession(),  # type: ignore[arg-type]
        config=SimpleNamespace(
            force_react=False,
            reasoning=SimpleNamespace(
                max_continuation_prompts=max_continuation_prompts,
                use_quick_completion=use_quick_completion,
            ),
        ),
        capability_profile=SimpleNamespace(supports_native_tools=True),  # type: ignore[arg-type]
        project_context=None,
        permission_policy=policy,
        permission_config_status=rule_status,
        workflow_mode="execute",
        safeguards=safeguards,
    )


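# Synchronous checks for the completion helpers themselves:
# detect_premature_completion, get_continuation_prompt,
# assess_completion_follow_through, and the provenance-aware variant.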
def test_completion_policy_finalize_response_text_keeps_original_response() -> None:
    response = CompletionPolicy.finalize_response_text(
        content="Inspected the file successfully.",
        actions_taken=["read: README.md"],
    )

    assert response == "Inspected the file successfully."


def test_detect_premature_completion_respects_explicit_done_without_actions() -> None:
    assert detect_premature_completion(
        "Explain how Loader works.",
        "Done.",
        [],
    ) is False


def test_get_continuation_prompt_surfaces_missing_verification_steps() -> None:
    prompt = get_continuation_prompt(
        "Create the script and test that it works.",
        ["write: script.py"],
        "The script has been created.",
    )

    assert "Continue with" in prompt
    assert "run the relevant tests" in prompt.lower() or "verify" in prompt.lower()


def test_assess_completion_follow_through_tracks_missing_evidence() -> None:
    check = assess_completion_follow_through(
        task="Create the script and test that it works.",
        response="The script has been created.",
        actions_taken=["write: script.py"],
    )

    assert check.is_complete is False
    assert "showing the requested work was actually carried out" in check.required_evidence
    assert "showing the result was run or verified" in check.required_evidence
    assert check.missing_evidence == ["showing the result was run or verified"]
    assert check.suggested_next_steps == [
        "Execute what you created or run the relevant tests now"
    ]


def test_assess_completion_follow_through_accepts_informational_tasks() -> None:
    check = assess_completion_follow_through(
        task="Explain how Loader's workflow timeline works.",
        response="Loader records workflow decisions and policy events in a timeline.",
        actions_taken=[],
    )

    assert check.is_complete is True
    assert check.required_evidence == []
    assert check.missing_evidence == []


def test_assess_completion_follow_through_uses_passing_verification_evidence() -> None:
    dod = create_definition_of_done("Run pytest -q and make sure it works.")
    dod.verification_commands = ["pytest -q"]
    dod.evidence = [
        VerificationEvidence(
            command="pytest -q",
            passed=True,
            stdout="342 passed",
            kind="test",
        )
    ]
    dod.last_verification_result = "passed"

    check = assess_completion_follow_through(
        task="Run pytest -q and make sure it works.",
        response="The test suite passed.",
        actions_taken=[],
        dod=dod,
    )

    assert check.is_complete is True
    assert check.missing_evidence == []
    assert "verified: pytest -q" in check.accomplished


def test_assess_completion_follow_through_surfaces_failing_verification() -> None:
    dod = create_definition_of_done("Run pytest -q and make sure it works.")
    dod.verification_commands = ["pytest -q"]
    dod.evidence = [
        VerificationEvidence(
            command="pytest -q",
            passed=False,
            stderr="1 failed",
            kind="test",
        )
    ]
    dod.last_verification_result = "failed"

    check = assess_completion_follow_through(
        task="Run pytest -q and make sure it works.",
        response="The tests are done.",
        actions_taken=[],
        dod=dod,
    )

    assert check.is_complete is False
    assert check.missing_evidence == [
        "a passing verification result from `pytest -q` (current verification is still failing)"
    ]
    assert check.suggested_next_steps == [
        "Fix the failing `pytest -q` result and rerun it"
    ]


def test_assess_completion_follow_through_surfaces_planned_verification() -> None:
    dod = create_definition_of_done("Run pytest -q and make sure it works.")
    dod.verification_commands = ["pytest -q"]
    dod.last_verification_result = "planned"

    check = assess_completion_follow_through(
        task="Run pytest -q and make sure it works.",
        response="The tests are next.",
        actions_taken=["write: README.md"],
        dod=dod,
    )

    assert check.is_complete is False
    assert check.missing_evidence == [
        "a passing verification result from `pytest -q` (verification is planned but has not run yet)"
    ]
    assert check.suggested_next_steps == ["Run the planned verification `pytest -q` now"]


def test_assess_completion_follow_through_surfaces_pending_verification() -> None:
    dod = create_definition_of_done("Run pytest -q and make sure it works.")
    dod.verification_commands = ["pytest -q"]
    dod.last_verification_result = "pending"

    check = assess_completion_follow_through(
        task="Run pytest -q and make sure it works.",
        response="Verification is underway.",
        actions_taken=["write: README.md"],
        dod=dod,
    )

    assert check.is_complete is False
    assert check.missing_evidence == [
        "a completed passing verification result from `pytest -q` (verification is still pending)"
    ]
    assert check.suggested_next_steps == [
        "Finish running `pytest -q` and capture the result"
    ]


def test_assess_completion_follow_through_requires_fresh_verification_when_stale() -> None:
    dod = create_definition_of_done("Run pytest -q and make sure it works.")
    dod.verification_commands = ["pytest -q"]
    dod.last_verification_result = "stale"
    dod.verification_attempt_counter = 2
    dod.active_verification_attempt_id = verification_attempt_id(2)
    dod.active_verification_attempt_number = 2

    check = assess_completion_follow_through(
        task="Run pytest -q and make sure it works.",
        response="The tests were already handled.",
        actions_taken=["write: README.md"],
        dod=dod,
    )

    assert check.is_complete is False
    assert check.missing_evidence == [
        "a fresh passing verification result from `pytest -q` (previous verification became stale after new mutating work)"
    ]
    assert check.suggested_next_steps == [
        "Rerun `pytest -q` now that the implementation changed again"
    ]


def test_completion_assessment_projects_superseded_verification_attempt_for_stale_result() -> None:
    dod = create_definition_of_done("Run pytest -q and make sure it works.")
    dod.verification_commands = ["pytest -q"]
    dod.last_verification_result = "stale"
    dod.verification_attempt_counter = 2
    dod.active_verification_attempt_id = verification_attempt_id(2)
    dod.active_verification_attempt_number = 2

    assessment = assess_completion_follow_through_with_provenance(
        task="Run pytest -q and make sure it works.",
        response="The tests were already handled.",
        actions_taken=["write: README.md"],
        dod=dod,
    )

    assert [item.status for item in assessment.verification_observations] == [
        VerificationObservationStatus.STALE.value
    ]
    assert assessment.verification_observations[0].attempt_id == verification_attempt_id(1)
    assert assessment.verification_observations[0].attempt_number == 1
    assert assessment.verification_observations[0].supersedes_attempt_id == (
        verification_attempt_id(2)
    )


def test_completion_assessment_attaches_typed_verification_provenance() -> None:
    dod = create_definition_of_done("Run pytest -q and make sure it works.")
    dod.verification_commands = ["pytest -q"]
    dod.evidence = [
        VerificationEvidence(
            command="pytest -q",
            passed=False,
            stderr="1 failed",
            kind="test",
        )
    ]
    dod.last_verification_result = "failed"

    assessment = assess_completion_follow_through_with_provenance(
        task="Run pytest -q and make sure it works.",
        response="The tests are done.",
        actions_taken=[],
        dod=dod,
    )

    assert assessment.check.is_complete is False
    assert [item.status for item in assessment.evidence_provenance] == [
        EvidenceProvenanceStatus.CONTRADICTS.value
    ]
    assert assessment.evidence_provenance[0].summary == "verification failed for `pytest -q`"


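# Async tests that drive CompletionPolicy end to end through a RuntimeContext
# built by build_context, collecting emitted events via a local callback.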
@pytest.mark.asyncio
async def test_completion_policy_stops_for_text_loop_using_runtime_context(
    temp_dir: Path,
) -> None:
    context = build_context(
        temp_dir,
        safeguards=FakeSafeguards(text_loop=(True, "assistant repeated the same summary")),
    )
    policy = CompletionPolicy(context)
    summary = TurnSummary(final_response="")
    events = []

    async def emit(event) -> None:
        events.append(event)

    decision = await policy.maybe_stop_for_text_loop(
        content="Same summary again.",
        emit=emit,
        summary=summary,
    )

    assert decision.should_stop is True
    assert decision.decision_code == "text_loop_bailout"
    assert decision.decision_summary == (
        "stopped after detecting a repeated text loop"
    )
    assert summary.final_response == (
        "I stopped because I was repeating myself and couldn't make further progress."
    )
    assert summary.assistant_messages[-1].role == Role.ASSISTANT
    assert context.session.messages[-1].content == summary.final_response
    assert events[0].type == "error"
    assert events[1].type == "response"


@pytest.mark.asyncio
async def test_completion_policy_requests_continuation_using_runtime_context(
    temp_dir: Path,
) -> None:
    context = build_context(
        temp_dir,
        safeguards=FakeSafeguards(),
    )
    policy = CompletionPolicy(context)
    events = []

    async def emit(event) -> None:
        events.append(event)

    decision = await policy.maybe_continue_for_completion(
        content="I can handle that.",
        response_content="I can handle that.",
        task="Create the file and verify it works.",
        actions_taken=[],
        continuation_count=0,
        emit=emit,
    )

    assert decision.should_continue is True
    assert decision.decision_code == "premature_completion_nudge"
    assert decision.decision_summary == (
        "requested one continuation because the non-mutating response looked incomplete"
    )
    assert decision.completion_check is not None
    assert decision.completion_check.missing_evidence == [
        "showing the requested work was actually carried out",
        "showing the result was run or verified",
    ]
    assert context.session.messages[-2] == Message(
        role=Role.ASSISTANT,
        content="I can handle that.",
    )
    assert context.session.messages[-1].role == Role.USER
    assert "verify it works" in context.session.messages[-1].content.lower()
    assert events[0].type == "completion_check"
    assert events[0].completion_check is not None
    assert events[0].completion_check.missing_evidence == [
        "showing the requested work was actually carried out",
        "showing the result was run or verified",
    ]
    assert [item.status for item in decision.evidence_provenance] == [
        EvidenceProvenanceStatus.MISSING.value,
        EvidenceProvenanceStatus.MISSING.value,
    ]


@pytest.mark.asyncio
async def test_completion_policy_accepts_passed_verification_from_dod(
    temp_dir: Path,
) -> None:
    context = build_context(
        temp_dir,
        safeguards=FakeSafeguards(),
    )
    policy = CompletionPolicy(context)
    dod = create_definition_of_done("Run pytest -q and make sure it works.")
    dod.verification_commands = ["pytest -q"]
    dod.evidence = [
        VerificationEvidence(
            command="pytest -q",
            passed=True,
            stdout="342 passed",
            kind="test",
        )
    ]
    dod.last_verification_result = "passed"
    events = []

    async def emit(event) -> None:
        events.append(event)

    decision = await policy.maybe_continue_for_completion(
        content="The tests passed.",
        response_content="The tests passed.",
        task="Run pytest -q and make sure it works.",
        actions_taken=[],
        continuation_count=0,
        emit=emit,
        dod=dod,
    )

    assert decision.should_continue is False
    assert decision.should_finalize is False
    assert decision.decision_code == "completion_response_accepted"
    assert decision.completion_check is not None
    assert decision.completion_check.missing_evidence == []
    assert events == []
    assert [item.summary for item in decision.evidence_provenance] == [
        "verification passed for `pytest -q`"
    ]


@pytest.mark.asyncio
async def test_completion_policy_finalizes_with_concrete_failed_verification_gap(
    temp_dir: Path,
) -> None:
    context = build_context(
        temp_dir,
        safeguards=FakeSafeguards(),
        max_continuation_prompts=1,
    )
    policy = CompletionPolicy(context)
    dod = create_definition_of_done("Run pytest -q and make sure it works.")
    dod.verification_commands = ["pytest -q"]
    dod.evidence = [
        VerificationEvidence(
            command="pytest -q",
            passed=False,
            stderr="1 failed",
            kind="test",
        )
    ]
    dod.last_verification_result = "failed"
    dod.verification_attempt_counter = 2
    dod.active_verification_attempt_id = verification_attempt_id(2)
    dod.active_verification_attempt_number = 2
    events = []

    async def emit(event) -> None:
        events.append(event)

    decision = await policy.maybe_continue_for_completion(
        content="The tests are done.",
        response_content="The tests are done.",
        task="Run pytest -q and make sure it works.",
        actions_taken=[],
        continuation_count=1,
        emit=emit,
        dod=dod,
    )

    assert decision.should_continue is False
    assert decision.should_finalize is True
    assert decision.decision_code == "continuation_budget_exhausted"
    assert decision.decision_summary == (
        "stopped because the continuation budget was exhausted while observed "
        "verification still showed verification failed for `pytest -q` "
        "[1 failed; attempt 2]"
    )
    assert decision.completion_check is not None
    assert decision.completion_check.missing_evidence == [
        "a passing verification result from `pytest -q` (current verification is still failing)"
    ]
    assert decision.final_response == (
        "I stopped because the continuation budget was exhausted and observed "
        "verification still showed: verification failed for `pytest -q` "
        "[1 failed; attempt 2]."
    )
    assert events[0].type == "completion_check"
    assert [item.status for item in decision.evidence_provenance] == [
        EvidenceProvenanceStatus.CONTRADICTS.value
    ]
    assert [item.status for item in decision.verification_observations] == [
        VerificationObservationStatus.FAILED.value
    ]
    assert decision.verification_observations[0].attempt_number == 2


@pytest.mark.asyncio
async def test_completion_policy_uses_missing_observed_verification_when_budget_is_exhausted(
    temp_dir: Path,
) -> None:
    context = build_context(
        temp_dir,
        safeguards=FakeSafeguards(),
        max_continuation_prompts=1,
    )
    policy = CompletionPolicy(context)
    dod = create_definition_of_done("Run pytest -q and make sure it works.")
    dod.verification_commands = ["pytest -q"]
    dod.last_verification_result = "failed"
    dod.verification_attempt_counter = 3
    dod.active_verification_attempt_id = verification_attempt_id(3)
    dod.active_verification_attempt_number = 3
    events = []

    async def emit(event) -> None:
        events.append(event)

    decision = await policy.maybe_continue_for_completion(
        content="The tests are done.",
        response_content="The tests are done.",
        task="Run pytest -q and make sure it works.",
        actions_taken=[],
        continuation_count=1,
        emit=emit,
        dod=dod,
    )

    assert decision.should_continue is False
    assert decision.should_finalize is True
    assert decision.decision_code == "continuation_budget_exhausted"
    assert decision.decision_summary == (
        "stopped because the continuation budget was exhausted while observed "
        "verification still showed verification did not produce an observed "
        "result for `pytest -q` [attempt 3]"
    )
    assert decision.final_response == (
        "I stopped because the continuation budget was exhausted and observed "
        "verification still showed: verification did not produce an observed "
        "result for `pytest -q` [attempt 3]."
    )
    assert [item.status for item in decision.verification_observations] == [
        VerificationObservationStatus.MISSING.value
    ]
    assert decision.verification_observations[0].attempt_number == 3
    assert events[0].type == "completion_check"


@pytest.mark.asyncio
async def test_completion_policy_uses_pending_observed_verification_when_budget_is_exhausted(
    temp_dir: Path,
) -> None:
    context = build_context(
        temp_dir,
        safeguards=FakeSafeguards(),
        max_continuation_prompts=1,
    )
    policy = CompletionPolicy(context)
    dod = create_definition_of_done("Run pytest -q and make sure it works.")
    dod.verification_commands = ["pytest -q"]
    dod.last_verification_result = "pending"
    dod.verification_attempt_counter = 4
    dod.active_verification_attempt_id = verification_attempt_id(4)
    dod.active_verification_attempt_number = 4
    events = []

    async def emit(event) -> None:
        events.append(event)

    decision = await policy.maybe_continue_for_completion(
        content="Verification is underway.",
        response_content="Verification is underway.",
        task="Run pytest -q and make sure it works.",
        actions_taken=["write: README.md"],
        continuation_count=1,
        emit=emit,
        dod=dod,
    )

    assert decision.should_continue is False
    assert decision.should_finalize is True
    assert decision.decision_code == "continuation_budget_exhausted"
    assert decision.decision_summary == (
        "stopped because the continuation budget was exhausted while observed "
        "verification still showed verification pending for `pytest -q` [attempt 4]"
    )
    assert decision.final_response == (
        "I stopped because the continuation budget was exhausted and observed "
        "verification still showed: verification pending for `pytest -q` [attempt 4]."
    )
    assert [item.status for item in decision.verification_observations] == [
        VerificationObservationStatus.PENDING.value
    ]
    assert decision.verification_observations[0].attempt_number == 4
    assert events[0].type == "completion_check"


@pytest.mark.asyncio
async def test_completion_policy_uses_stale_observed_verification_when_budget_is_exhausted(
    temp_dir: Path,
) -> None:
    context = build_context(
        temp_dir,
        safeguards=FakeSafeguards(),
        max_continuation_prompts=1,
    )
    policy = CompletionPolicy(context)
    dod = create_definition_of_done("Run pytest -q and make sure it works.")
    dod.verification_commands = ["pytest -q"]
    dod.last_verification_result = "stale"
    dod.verification_attempt_counter = 2
    dod.active_verification_attempt_id = verification_attempt_id(2)
    dod.active_verification_attempt_number = 2
    events = []

    async def emit(event) -> None:
        events.append(event)

    decision = await policy.maybe_continue_for_completion(
        content="The tests were already handled.",
        response_content="The tests were already handled.",
        task="Run pytest -q and make sure it works.",
        actions_taken=["write: README.md"],
        continuation_count=1,
        emit=emit,
        dod=dod,
    )

    assert decision.should_continue is False
    assert decision.should_finalize is True
    assert decision.decision_code == "continuation_budget_exhausted"
    assert decision.decision_summary == (
        "stopped because the continuation budget was exhausted while observed "
        "verification still showed verification became stale for `pytest -q` "
        "after new mutating work [attempt 1 -> attempt 2]"
    )
    assert decision.final_response == (
        "I stopped because the continuation budget was exhausted and observed "
        "verification still showed: verification became stale for `pytest -q` "
        "after new mutating work [attempt 1 -> attempt 2]."
    )
    assert [item.status for item in decision.verification_observations] == [
        VerificationObservationStatus.STALE.value
    ]
    assert decision.verification_observations[0].attempt_number == 1
    assert decision.verification_observations[0].supersedes_attempt_id == (
        verification_attempt_id(2)
    )
    assert events[0].type == "completion_check"


@pytest.mark.asyncio
async def test_completion_policy_finalizes_when_budget_is_exhausted(
    temp_dir: Path,
) -> None:
    context = build_context(
        temp_dir,
        safeguards=FakeSafeguards(),
        max_continuation_prompts=1,
    )
    policy = CompletionPolicy(context)
    events = []

    async def emit(event) -> None:
        events.append(event)

    decision = await policy.maybe_continue_for_completion(
        content="I looked into it.",
        response_content="I looked into it.",
        task="Fix the README heading.",
        actions_taken=[],
        continuation_count=1,
        emit=emit,
    )

    assert decision.should_continue is False
    assert decision.should_finalize is True
    assert decision.decision_code == "continuation_budget_exhausted"
    assert decision.completion_check is not None
    assert decision.completion_check.missing_evidence == [
        "showing the requested work was actually carried out"
    ]
    assert "Missing evidence" in decision.final_response
    assert decision.verification_observations == []
    assert events[0].type == "completion_check"