tenseleyflow/loader / eabd30b

Browse files

Track repair progress in verification retries

Authored by mfwolffe <wolffemf@dukes.jmu.edu>
SHA
eabd30b108d131d2a72a451ead95f1091df9df34
Parents
549ed74
Tree
305ee5c

4 changed files

StatusFile+-
M src/loader/runtime/dod.py 43 5
M src/loader/runtime/finalization.py 40 0
M tests/test_dod.py 31 0
M tests/test_finalization.py 45 0
src/loader/runtime/dod.pymodified
@@ -11,6 +11,7 @@ from pathlib import Path
1111
 from typing import Any, Literal
1212
 
1313
 from ..llm.base import Message, ToolCall
14
+from ..tools.fs_safety import StructuredPatchHunk, coerce_structured_patch_payload
1415
 from ..tools.shell_tools import BashTool
1516
 from .verification_observations import VerificationAttempt, verification_attempt_id
1617
 
@@ -157,6 +158,7 @@ class DefinitionOfDone:
157158
     storage_path: str | None = None
158159
     last_verification_result: str | None = None
159160
     last_verification_signature: str | None = None
161
+    last_failed_verification_issue_signature: str | None = None
160162
     verification_attempt_counter: int = 0
161163
     active_verification_attempt_id: str | None = None
162164
     active_verification_attempt_number: int | None = None
@@ -196,6 +198,9 @@ class DefinitionOfDone:
196198
             storage_path=data.get("storage_path"),
197199
             last_verification_result=data.get("last_verification_result"),
198200
             last_verification_signature=data.get("last_verification_signature"),
201
+            last_failed_verification_issue_signature=data.get(
202
+                "last_failed_verification_issue_signature"
203
+            ),
199204
             verification_attempt_counter=int(data.get("verification_attempt_counter", 0)),
200205
             active_verification_attempt_id=data.get("active_verification_attempt_id"),
201206
             active_verification_attempt_number=(
@@ -286,11 +291,8 @@ def record_successful_tool_call(
286291
         file_path = _resolve_touched_path(tool_call.arguments.get("file_path", ""))
287292
         if file_path:
288293
             _append_unique(dod.touched_files, file_path)
289
-        for hunk in tool_call.arguments.get("hunks", []):
290
-            if not isinstance(hunk, dict):
291
-                continue
292
-            old_lines = int(hunk.get("old_lines", 0))
293
-            new_lines = int(hunk.get("new_lines", 0))
294
+        for hunk in _coerce_patch_hunks_for_accounting(tool_call.arguments):
295
+            old_lines, new_lines = _patch_hunk_line_counts(hunk)
294296
             dod.line_changes += max(old_lines, new_lines)
295297
     elif tool_call.name == "bash":
296298
         command = str(tool_call.arguments.get("command", "")).strip()
@@ -691,6 +693,42 @@ def _count_lines(content: str) -> int:
691693
     return content.count("\n") + 1
692694
 
693695
 
696
def _coerce_patch_hunks_for_accounting(
    arguments: dict[str, object],
) -> list[dict[str, object] | StructuredPatchHunk]:
    """Return the patch hunks carried by a tool call's arguments.

    The ``hunks``, ``structured_patch`` and ``structuredPatch`` keys are tried
    in that order. Each value is passed through
    ``coerce_structured_patch_payload`` and the first non-empty result is
    returned. An empty list is returned when no key yields any hunks.
    """
    candidate_keys = ("hunks", "structured_patch", "structuredPatch")
    coerced_payloads = (
        coerce_structured_patch_payload(arguments.get(name))
        for name in candidate_keys
    )
    # The generator is lazy, so a later key is only coerced when every
    # earlier key produced nothing.
    return next((found for found in coerced_payloads if found), [])
704
+
705
+
706
def _patch_hunk_line_counts(
    hunk: dict[str, object] | StructuredPatchHunk,
) -> tuple[int, int]:
    """Return ``(old_lines, new_lines)`` for one patch hunk.

    A ``StructuredPatchHunk`` supplies both counts directly from its
    ``old_lines`` / ``new_lines`` attributes (returned as-is). For a mapping,
    ``new_lines`` may be either a count or a list of replacement lines:

    * count: both values are coerced to ints, falling back to 0;
    * list: the new count is the list length, and when ``old_lines`` is not
      positive the old count is derived from the inclusive
      ``old_start``..``old_end`` range instead.

    Counts taken from a mapping are clamped so they are never negative.
    """
    if isinstance(hunk, StructuredPatchHunk):
        return hunk.old_lines, hunk.new_lines

    removed = _coerce_int(hunk.get("old_lines"), default=0)
    raw_new = hunk.get("new_lines")

    if not isinstance(raw_new, list):
        added = _coerce_int(raw_new, default=0)
        return max(0, removed), max(0, added)

    added = len(raw_new)
    first_old = _coerce_int(hunk.get("old_start"), default=0)
    # A missing old_end defaults to one before old_start, i.e. an empty range.
    last_old = _coerce_int(hunk.get("old_end"), default=first_old - 1)
    if removed <= 0 and first_old > 0:
        removed = max(0, last_old - first_old + 1)
    return max(0, removed), max(0, added)
723
+
724
+
725
+def _coerce_int(value: object, *, default: int) -> int:
726
+    try:
727
+        return int(value)  # type: ignore[arg-type]
728
+    except (TypeError, ValueError):
729
+        return default
730
+
731
+
694732
 def _is_verification_command(command: str) -> bool:
695733
     command_lower = command.lower()
696734
     signals = (
src/loader/runtime/finalization.pymodified
@@ -461,6 +461,8 @@ class TurnFinalizer:
461461
             dod.pending_items = []
462462
             dod.status = "done"
463463
             dod.last_verification_result = "passed"
464
+            dod.last_failed_verification_issue_signature = None
465
+            dod.retry_count = 0
464466
             dod.confidence = "high"
465467
             summary.verification_status = "passed"
466468
             summary.definition_of_done = dod
@@ -485,6 +487,11 @@ class TurnFinalizer:
485487
             )
486488
 
487489
         dod.last_verification_result = "failed"
490
+        failure_issue_signature = _verification_failure_issue_signature(dod)
491
+        if _verification_failure_issue_changed(dod, failure_issue_signature):
492
+            dod.retry_count = 0
493
+        if failure_issue_signature:
494
+            dod.last_failed_verification_issue_signature = failure_issue_signature
488495
         summary.verification_status = "failed"
489496
         summary.definition_of_done = dod
490497
         failed_provenance = _verification_result_provenance(dod, passed=False)
@@ -1115,6 +1122,39 @@ def _verification_state_signature(dod: DefinitionOfDone) -> str:
11151122
     )
11161123
 
11171124
 
1125
+def _verification_failure_issue_signature(dod: DefinitionOfDone) -> str:
1126
+    parts: list[str] = []
1127
+    for evidence in dod.evidence:
1128
+        if evidence.passed or evidence.skipped:
1129
+            continue
1130
+        detail = "\n".join(
1131
+            str(value).strip()
1132
+            for value in (
1133
+                evidence.command,
1134
+                evidence.stdout,
1135
+                evidence.stderr,
1136
+                evidence.output,
1137
+            )
1138
+            if str(value).strip()
1139
+        )
1140
+        normalized_lines = [
1141
+            " ".join(line.strip().split())
1142
+            for line in detail.splitlines()
1143
+            if line.strip()
1144
+        ]
1145
+        if normalized_lines:
1146
+            parts.append("\n".join(normalized_lines))
1147
+    return "\n---\n".join(sorted(parts))
1148
+
1149
+
1150
+def _verification_failure_issue_changed(
1151
+    dod: DefinitionOfDone,
1152
+    failure_issue_signature: str,
1153
+) -> bool:
1154
+    previous = dod.last_failed_verification_issue_signature
1155
+    return bool(previous and failure_issue_signature and previous != failure_issue_signature)
1156
+
1157
+
11181158
 def _normalize_pending_statement(value: str) -> str:
11191159
     return " ".join(value.strip().lower().split())
11201160
 
tests/test_dod.pymodified
@@ -1,5 +1,6 @@
11
 """Tests for definition-of-done state and persistence."""
22
 
3
+import json
34
 import subprocess
45
 from pathlib import Path
56
 
@@ -112,6 +113,36 @@ def test_record_successful_tool_call_preserves_absolute_path_string(tmp_path: Pa
112113
     assert dod.touched_files == [str(absolute_path)]
113114
 
114115
 
116
def test_record_successful_tool_call_counts_json_string_patch_hunks(
    tmp_path: Path,
) -> None:
    """A patch whose ``hunks`` argument is a JSON string still counts lines.

    The single hunk replaces 2 old lines with 8 new ones, so the recorded
    line change is the larger of the two counts.
    """
    html_file = tmp_path / "chapter.html"
    single_hunk = {
        "old_start": 10,
        "old_lines": 2,
        "new_start": 10,
        "new_lines": 8,
        "lines": ["-old", "-body", "+new", "+expanded"],
    }
    # NOTE(review): the slice drops the closing "]" so the payload is
    # truncated JSON; presumably this exercises lenient parsing in the
    # coercion helper. Confirm the truncation is intentional.
    serialized_hunks = json.dumps([single_hunk])[:-1]
    patch_call = ToolCall(
        id="patch-1",
        name="patch",
        arguments={"file_path": str(html_file), "hunks": serialized_hunks},
    )
    dod = create_definition_of_done("Patch generated HTML content.")

    record_successful_tool_call(dod, patch_call)

    assert dod.touched_files == [str(html_file)]
    assert dod.line_changes == 8
144
+
145
+
115146
 def test_derive_verification_commands_adds_semantic_html_toc_check(tmp_path: Path) -> None:
116147
     chapters = tmp_path / "chapters"
117148
     chapters.mkdir()
tests/test_finalization.pymodified
@@ -1300,6 +1300,51 @@ async def test_turn_finalizer_does_not_reverify_without_new_changes(
13001300
     assert session.messages[-1].content.startswith("[DEFINITION OF DONE CHECK STILL FAILING]")
13011301
 
13021302
 
1303
@pytest.mark.asyncio
async def test_turn_finalizer_extends_retry_budget_when_failures_change(
    temp_dir: Path,
) -> None:
    """A changed failure signature lets the gate retry past a spent budget.

    The definition of done starts with ``retry_count`` equal to its budget
    and a stale issue signature. After the gate runs, the turn is expected
    to continue with ``retry_count`` back at 1 and status ``"fixing"``.
    """
    fake_session = FakeSession()
    finalizer = TurnFinalizer(
        build_context(temp_dir, fake_session),
        RuntimeTracer(),
        DefinitionOfDoneStore(temp_dir),
        set_workflow_mode=_noop_set_workflow_mode,
    )

    chapter_path = temp_dir / "chapter.html"
    chapter_path.write_text("<h1>Chapter</h1>\n")

    dod = create_definition_of_done("Expand the generated chapter.")
    # Start with the retry budget already used up.
    dod.retry_count = dod.retry_budget
    dod.mutating_actions.append("patch")
    dod.touched_files.append(str(chapter_path))
    dod.line_changes = 20
    # Record an earlier failed verification with a different issue signature.
    dod.last_verification_result = "failed"
    dod.last_verification_signature = "lines=10;touched=chapter.html;actions=1;commands="
    dod.last_failed_verification_issue_signature = "old failing artifact set"
    dod.verification_commands = ["python check_quality.py"]

    turn_summary = TurnSummary(final_response="")
    recording_executor = SelectiveRecordingExecutor("check_quality.py")

    async def discard_event(event) -> None:
        return None

    gate_result = await finalizer.run_definition_of_done_gate(
        dod=dod,
        candidate_response="I expanded one failing file.",
        emit=discard_event,
        summary=turn_summary,
        executor=recording_executor,  # type: ignore[arg-type]
    )

    assert gate_result.should_continue is True
    assert gate_result.reason_code == "verification_failed_reentry"
    assert dod.retry_count == 1
    assert dod.status == "fixing"
    assert "python check_quality.py" in recording_executor.commands
    assert fake_session.messages[-1].content.startswith("[DEFINITION OF DONE CHECK FAILED]")
1346
+
1347
+
13031348
 @pytest.mark.asyncio
13041349
 async def test_turn_finalizer_accepts_missing_optional_html5validator_when_semantic_check_passes(
13051350
     temp_dir: Path,