tenseleyflow/loader / 803dde9

Browse files

Make completion policy attempt-aware

Authored by espadonne
SHA
803dde9c8ddcaf0e0275f0c330edaaf53e7683fd
Parents
15460af
Tree
af93e54

3 changed files

Status | File | + | -
M src/loader/runtime/completion_policy.py 12 2
M src/loader/runtime/task_completion.py 87 0
M tests/test_completion_policy.py 61 9
src/loader/runtime/completion_policy.py (modified)
@@ -12,7 +12,11 @@ from .events import AgentEvent, TurnSummary
1212
 from .evidence_provenance import EvidenceProvenance
1313
 from .reasoning_types import TaskCompletionCheck
1414
 from .task_completion import assess_completion_follow_through_with_provenance
15
-from .verification_observations import VerificationObservation, VerificationObservationStatus
15
+from .verification_observations import (
16
+    VerificationObservation,
17
+    VerificationObservationStatus,
18
+    describe_verification_attempt,
19
+)
1620
 
1721
 EventSink = Callable[[AgentEvent], Awaitable[None]]
1822
 
@@ -278,8 +282,14 @@ class CompletionPolicy:
278282
     @staticmethod
279283
     def _render_observation(entry: VerificationObservation) -> str:
280284
         summary = entry.summary.strip()
285
+        details: list[str] = []
281286
         if entry.detail:
282287
             detail = entry.detail.strip()
283288
             if detail and detail not in summary:
284
-                return f"{summary} [{detail}]"
289
+                details.append(detail)
290
+        attempt = describe_verification_attempt(entry)
291
+        if attempt and attempt not in summary:
292
+            details.append(attempt)
293
+        if details:
294
+            return f"{summary} [{'; '.join(details)}]"
285295
         return summary
src/loader/runtime/task_completion.py (modified)
@@ -9,8 +9,10 @@ from .dod import DefinitionOfDone
99
 from .evidence_provenance import EvidenceProvenance, EvidenceProvenanceStatus
1010
 from .reasoning_types import TaskCompletionCheck
1111
 from .verification_observations import (
12
+    VerificationAttempt,
1213
     VerificationObservation,
1314
     VerificationObservationStatus,
15
+    verification_attempt_id,
1416
 )
1517
 
1618
 _ACTION_VERBS = ("create", "write", "make", "edit", "fix", "add", "delete", "run")
@@ -1033,6 +1035,41 @@ def _verification_provenance(
10331035
     ]
10341036
 
10351037
 
1038
+def _active_verification_attempt(
1039
+    dod: DefinitionOfDone,
1040
+) -> VerificationAttempt | None:
1041
+    if (
1042
+        dod.active_verification_attempt_id
1043
+        and dod.active_verification_attempt_number is not None
1044
+    ):
1045
+        return VerificationAttempt(
1046
+            attempt_id=dod.active_verification_attempt_id,
1047
+            attempt_number=dod.active_verification_attempt_number,
1048
+        )
1049
+    if dod.active_verification_attempt_number is not None:
1050
+        return VerificationAttempt(
1051
+            attempt_id=verification_attempt_id(
1052
+                dod.active_verification_attempt_number
1053
+            ),
1054
+            attempt_number=dod.active_verification_attempt_number,
1055
+        )
1056
+    return None
1057
+
1058
+
1059
+def _stale_verification_attempt(
1060
+    dod: DefinitionOfDone,
1061
+) -> VerificationAttempt | None:
1062
+    active_attempt = _active_verification_attempt(dod)
1063
+    if active_attempt is None or active_attempt.attempt_number <= 1:
1064
+        return None
1065
+    stale_attempt_number = active_attempt.attempt_number - 1
1066
+    return VerificationAttempt(
1067
+        attempt_id=verification_attempt_id(stale_attempt_number),
1068
+        attempt_number=stale_attempt_number,
1069
+        supersedes_attempt_id=active_attempt.attempt_id,
1070
+    )
1071
+
1072
+
10361073
 def _observed_completion_verification(
10371074
     *,
10381075
     dod: DefinitionOfDone | None,
@@ -1042,6 +1079,7 @@ def _observed_completion_verification(
10421079
     if dod is None or not requires_verification:
10431080
         return []
10441081
 
1082
+    active_attempt = _active_verification_attempt(dod)
10451083
     observations: list[VerificationObservation] = []
10461084
     observed_commands: set[str] = set()
10471085
     for evidence in dod.evidence:
@@ -1068,6 +1106,10 @@ def _observed_completion_verification(
10681106
                 kind=evidence.kind,
10691107
                 exit_code=evidence.exit_code,
10701108
                 detail=_verification_detail(evidence),
1109
+                attempt_id=active_attempt.attempt_id if active_attempt else None,
1110
+                attempt_number=(
1111
+                    active_attempt.attempt_number if active_attempt else None
1112
+                ),
10711113
             )
10721114
         )
10731115
 
@@ -1080,6 +1122,10 @@ def _observed_completion_verification(
10801122
                     status=VerificationObservationStatus.MISSING.value,
10811123
                     summary=f"verification did not produce an observed result for `{command}`",
10821124
                     command=command,
1125
+                    attempt_id=active_attempt.attempt_id if active_attempt else None,
1126
+                    attempt_number=(
1127
+                        active_attempt.attempt_number if active_attempt else None
1128
+                    ),
10831129
                 )
10841130
             )
10851131
         return observations
@@ -1091,12 +1137,20 @@ def _observed_completion_verification(
10911137
                     status=VerificationObservationStatus.PENDING.value,
10921138
                     summary=f"verification pending for `{verification_command}`",
10931139
                     command=verification_command,
1140
+                    attempt_id=active_attempt.attempt_id if active_attempt else None,
1141
+                    attempt_number=(
1142
+                        active_attempt.attempt_number if active_attempt else None
1143
+                    ),
10941144
                 )
10951145
             ]
10961146
         return [
10971147
             VerificationObservation(
10981148
                 status=VerificationObservationStatus.PENDING.value,
10991149
                 summary="verification is pending for the active command set",
1150
+                attempt_id=active_attempt.attempt_id if active_attempt else None,
1151
+                attempt_number=(
1152
+                    active_attempt.attempt_number if active_attempt else None
1153
+                ),
11001154
             )
11011155
         ]
11021156
 
@@ -1107,16 +1161,25 @@ def _observed_completion_verification(
11071161
                     status=VerificationObservationStatus.PLANNED.value,
11081162
                     summary=f"verification planned for `{verification_command}`",
11091163
                     command=verification_command,
1164
+                    attempt_id=active_attempt.attempt_id if active_attempt else None,
1165
+                    attempt_number=(
1166
+                        active_attempt.attempt_number if active_attempt else None
1167
+                    ),
11101168
                 )
11111169
             ]
11121170
         return [
11131171
             VerificationObservation(
11141172
                 status=VerificationObservationStatus.PLANNED.value,
11151173
                 summary="verification is planned but has not run yet",
1174
+                attempt_id=active_attempt.attempt_id if active_attempt else None,
1175
+                attempt_number=(
1176
+                    active_attempt.attempt_number if active_attempt else None
1177
+                ),
11161178
             )
11171179
         ]
11181180
 
11191181
     if dod.last_verification_result == VerificationObservationStatus.STALE.value:
1182
+        stale_attempt = _stale_verification_attempt(dod)
11201183
         if verification_command:
11211184
             return [
11221185
                 VerificationObservation(
@@ -1126,12 +1189,32 @@ def _observed_completion_verification(
11261189
                         f"`{verification_command}` after new mutating work"
11271190
                     ),
11281191
                     command=verification_command,
1192
+                    attempt_id=(
1193
+                        stale_attempt.attempt_id if stale_attempt else None
1194
+                    ),
1195
+                    attempt_number=(
1196
+                        stale_attempt.attempt_number if stale_attempt else None
1197
+                    ),
1198
+                    supersedes_attempt_id=(
1199
+                        stale_attempt.supersedes_attempt_id
1200
+                        if stale_attempt
1201
+                        else None
1202
+                    ),
11291203
                 )
11301204
             ]
11311205
         return [
11321206
             VerificationObservation(
11331207
                 status=VerificationObservationStatus.STALE.value,
11341208
                 summary="previous verification became stale after new mutating work",
1209
+                attempt_id=stale_attempt.attempt_id if stale_attempt else None,
1210
+                attempt_number=(
1211
+                    stale_attempt.attempt_number if stale_attempt else None
1212
+                ),
1213
+                supersedes_attempt_id=(
1214
+                    stale_attempt.supersedes_attempt_id
1215
+                    if stale_attempt
1216
+                    else None
1217
+                ),
11351218
             )
11361219
         ]
11371220
 
@@ -1144,6 +1227,10 @@ def _observed_completion_verification(
11441227
                     f"`{verification_command}`"
11451228
                 ),
11461229
                 command=verification_command,
1230
+                attempt_id=active_attempt.attempt_id if active_attempt else None,
1231
+                attempt_number=(
1232
+                    active_attempt.attempt_number if active_attempt else None
1233
+                ),
11471234
             )
11481235
         ]
11491236
     return []
tests/test_completion_policy.py (modified)
@@ -24,7 +24,10 @@ from loader.runtime.task_completion import (
2424
     detect_premature_completion,
2525
     get_continuation_prompt,
2626
 )
27
-from loader.runtime.verification_observations import VerificationObservationStatus
27
+from loader.runtime.verification_observations import (
28
+    VerificationObservationStatus,
29
+    verification_attempt_id,
30
+)
2831
 from loader.tools.base import create_default_registry
2932
 from tests.helpers.runtime_harness import ScriptedBackend
3033
 
@@ -263,6 +266,9 @@ def test_assess_completion_follow_through_requires_fresh_verification_when_stale
263266
     dod = create_definition_of_done("Run pytest -q and make sure it works.")
264267
     dod.verification_commands = ["pytest -q"]
265268
     dod.last_verification_result = "stale"
269
+    dod.verification_attempt_counter = 2
270
+    dod.active_verification_attempt_id = verification_attempt_id(2)
271
+    dod.active_verification_attempt_number = 2
266272
 
267273
     check = assess_completion_follow_through(
268274
         task="Run pytest -q and make sure it works.",
@@ -280,6 +286,31 @@ def test_assess_completion_follow_through_requires_fresh_verification_when_stale
280286
     ]
281287
 
282288
 
289
+def test_completion_assessment_projects_superseded_verification_attempt_for_stale_result() -> None:
290
+    dod = create_definition_of_done("Run pytest -q and make sure it works.")
291
+    dod.verification_commands = ["pytest -q"]
292
+    dod.last_verification_result = "stale"
293
+    dod.verification_attempt_counter = 2
294
+    dod.active_verification_attempt_id = verification_attempt_id(2)
295
+    dod.active_verification_attempt_number = 2
296
+
297
+    assessment = assess_completion_follow_through_with_provenance(
298
+        task="Run pytest -q and make sure it works.",
299
+        response="The tests were already handled.",
300
+        actions_taken=["write: README.md"],
301
+        dod=dod,
302
+    )
303
+
304
+    assert [item.status for item in assessment.verification_observations] == [
305
+        VerificationObservationStatus.STALE.value
306
+    ]
307
+    assert assessment.verification_observations[0].attempt_id == verification_attempt_id(1)
308
+    assert assessment.verification_observations[0].attempt_number == 1
309
+    assert assessment.verification_observations[0].supersedes_attempt_id == (
310
+        verification_attempt_id(2)
311
+    )
312
+
313
+
283314
 def test_completion_assessment_attaches_typed_verification_provenance() -> None:
284315
     dod = create_definition_of_done("Run pytest -q and make sure it works.")
285316
     dod.verification_commands = ["pytest -q"]
@@ -460,6 +491,9 @@ async def test_completion_policy_finalizes_with_concrete_failed_verification_gap
460491
         )
461492
     ]
462493
     dod.last_verification_result = "failed"
494
+    dod.verification_attempt_counter = 2
495
+    dod.active_verification_attempt_id = verification_attempt_id(2)
496
+    dod.active_verification_attempt_number = 2
463497
     events = []
464498
 
465499
     async def emit(event) -> None:
@@ -480,7 +514,8 @@ async def test_completion_policy_finalizes_with_concrete_failed_verification_gap
480514
     assert decision.decision_code == "continuation_budget_exhausted"
481515
     assert decision.decision_summary == (
482516
         "stopped because the continuation budget was exhausted while observed "
483
-        "verification still showed verification failed for `pytest -q` [1 failed]"
517
+        "verification still showed verification failed for `pytest -q` "
518
+        "[1 failed; attempt 2]"
484519
     )
485520
     assert decision.completion_check is not None
486521
     assert decision.completion_check.missing_evidence == [
@@ -488,7 +523,8 @@ async def test_completion_policy_finalizes_with_concrete_failed_verification_gap
488523
     ]
489524
     assert decision.final_response == (
490525
         "I stopped because the continuation budget was exhausted and observed "
491
-        "verification still showed: verification failed for `pytest -q` [1 failed]."
526
+        "verification still showed: verification failed for `pytest -q` "
527
+        "[1 failed; attempt 2]."
492528
     )
493529
     assert events[0].type == "completion_check"
494530
     assert [item.status for item in decision.evidence_provenance] == [
@@ -497,6 +533,7 @@ async def test_completion_policy_finalizes_with_concrete_failed_verification_gap
497533
     assert [item.status for item in decision.verification_observations] == [
498534
         VerificationObservationStatus.FAILED.value
499535
     ]
536
+    assert decision.verification_observations[0].attempt_number == 2
500537
 
501538
 
502539
 @pytest.mark.asyncio
@@ -512,6 +549,9 @@ async def test_completion_policy_uses_missing_observed_verification_when_budget_
512549
     dod = create_definition_of_done("Run pytest -q and make sure it works.")
513550
     dod.verification_commands = ["pytest -q"]
514551
     dod.last_verification_result = "failed"
552
+    dod.verification_attempt_counter = 3
553
+    dod.active_verification_attempt_id = verification_attempt_id(3)
554
+    dod.active_verification_attempt_number = 3
515555
     events = []
516556
 
517557
     async def emit(event) -> None:
@@ -533,16 +573,17 @@ async def test_completion_policy_uses_missing_observed_verification_when_budget_
533573
     assert decision.decision_summary == (
534574
         "stopped because the continuation budget was exhausted while observed "
535575
         "verification still showed verification did not produce an observed "
536
-        "result for `pytest -q`"
576
+        "result for `pytest -q` [attempt 3]"
537577
     )
538578
     assert decision.final_response == (
539579
         "I stopped because the continuation budget was exhausted and observed "
540580
         "verification still showed: verification did not produce an observed "
541
-        "result for `pytest -q`."
581
+        "result for `pytest -q` [attempt 3]."
542582
     )
543583
     assert [item.status for item in decision.verification_observations] == [
544584
         VerificationObservationStatus.MISSING.value
545585
     ]
586
+    assert decision.verification_observations[0].attempt_number == 3
546587
     assert events[0].type == "completion_check"
547588
 
548589
 
@@ -559,6 +600,9 @@ async def test_completion_policy_uses_pending_observed_verification_when_budget_
559600
     dod = create_definition_of_done("Run pytest -q and make sure it works.")
560601
     dod.verification_commands = ["pytest -q"]
561602
     dod.last_verification_result = "pending"
603
+    dod.verification_attempt_counter = 4
604
+    dod.active_verification_attempt_id = verification_attempt_id(4)
605
+    dod.active_verification_attempt_number = 4
562606
     events = []
563607
 
564608
     async def emit(event) -> None:
@@ -579,15 +623,16 @@ async def test_completion_policy_uses_pending_observed_verification_when_budget_
579623
     assert decision.decision_code == "continuation_budget_exhausted"
580624
     assert decision.decision_summary == (
581625
         "stopped because the continuation budget was exhausted while observed "
582
-        "verification still showed verification pending for `pytest -q`"
626
+        "verification still showed verification pending for `pytest -q` [attempt 4]"
583627
     )
584628
     assert decision.final_response == (
585629
         "I stopped because the continuation budget was exhausted and observed "
586
-        "verification still showed: verification pending for `pytest -q`."
630
+        "verification still showed: verification pending for `pytest -q` [attempt 4]."
587631
     )
588632
     assert [item.status for item in decision.verification_observations] == [
589633
         VerificationObservationStatus.PENDING.value
590634
     ]
635
+    assert decision.verification_observations[0].attempt_number == 4
591636
     assert events[0].type == "completion_check"
592637
 
593638
 
@@ -604,6 +649,9 @@ async def test_completion_policy_uses_stale_observed_verification_when_budget_is
604649
     dod = create_definition_of_done("Run pytest -q and make sure it works.")
605650
     dod.verification_commands = ["pytest -q"]
606651
     dod.last_verification_result = "stale"
652
+    dod.verification_attempt_counter = 2
653
+    dod.active_verification_attempt_id = verification_attempt_id(2)
654
+    dod.active_verification_attempt_number = 2
607655
     events = []
608656
 
609657
     async def emit(event) -> None:
@@ -625,16 +673,20 @@ async def test_completion_policy_uses_stale_observed_verification_when_budget_is
625673
     assert decision.decision_summary == (
626674
         "stopped because the continuation budget was exhausted while observed "
627675
         "verification still showed verification became stale for `pytest -q` "
628
-        "after new mutating work"
676
+        "after new mutating work [attempt 1 -> attempt 2]"
629677
     )
630678
     assert decision.final_response == (
631679
         "I stopped because the continuation budget was exhausted and observed "
632680
         "verification still showed: verification became stale for `pytest -q` "
633
-        "after new mutating work."
681
+        "after new mutating work [attempt 1 -> attempt 2]."
634682
     )
635683
     assert [item.status for item in decision.verification_observations] == [
636684
         VerificationObservationStatus.STALE.value
637685
     ]
686
+    assert decision.verification_observations[0].attempt_number == 1
687
+    assert decision.verification_observations[0].supersedes_attempt_id == (
688
+        verification_attempt_id(2)
689
+    )
638690
     assert events[0].type == "completion_check"
639691
 
640692