Prioritize concrete missing outputs
- SHA
8056705e83c8772c93189c34988f92e8adf90cfb- Parents
-
09f576c - Tree
d28b7a9
8056705
8056705e83c8772c93189c34988f92e8adf90cfb09f576c
d28b7a9| Status | File | + | - |
|---|---|---|---|
| M |
src/loader/runtime/tool_batches.py
|
42 | 0 |
| M |
src/loader/runtime/workflow.py
|
15 | 0 |
| M |
tests/test_tool_batches.py
|
136 | 1 |
| M |
tests/test_workflow.py
|
36 | 0 |
src/loader/runtime/tool_batches.pymodified@@ -350,8 +350,10 @@ class ToolBatchRunner: | ||
| 350 | 350 | max_items=2, |
| 351 | 351 | ) |
| 352 | 352 | if _should_prioritize_missing_artifact( |
| 353 | + dod=dod, | |
| 353 | 354 | next_pending=next_pending, |
| 354 | 355 | missing_artifact=missing_artifact, |
| 356 | + project_root=self.context.project_root, | |
| 355 | 357 | ): |
| 356 | 358 | prefix = "Reuse the earlier observation instead of repeating it. " |
| 357 | 359 | if confirmed_facts: |
@@ -802,8 +804,10 @@ class ToolBatchRunner: | ||
| 802 | 804 | if not completed_label or not next_pending or next_pending == completed_label: |
| 803 | 805 | return |
| 804 | 806 | if _should_prioritize_missing_artifact( |
| 807 | + dod=dod, | |
| 805 | 808 | next_pending=next_pending, |
| 806 | 809 | missing_artifact=missing_artifact, |
| 810 | + project_root=self.context.project_root, | |
| 807 | 811 | ): |
| 808 | 812 | if not has_artifact_progress: |
| 809 | 813 | compact_handoff = _compact_missing_artifact_handoff( |
@@ -1102,6 +1106,7 @@ class ToolBatchRunner: | ||
| 1102 | 1106 | and not _todo_is_mutation_step(next_pending) |
| 1103 | 1107 | and not _todo_is_consistency_review_step(next_pending) |
| 1104 | 1108 | and not _should_prioritize_missing_artifact( |
| 1109 | + dod=dod, | |
| 1105 | 1110 | next_pending=next_pending, |
| 1106 | 1111 | missing_artifact=( |
| 1107 | 1112 | missing_artifact |
@@ -1111,6 +1116,7 @@ class ToolBatchRunner: | ||
| 1111 | 1116 | ) |
| 1112 | 1117 | else None |
| 1113 | 1118 | ), |
| 1119 | + project_root=self.context.project_root, | |
| 1114 | 1120 | ) |
| 1115 | 1121 | ): |
| 1116 | 1122 | self.context.queue_steering_message( |
@@ -1143,18 +1149,54 @@ def _todo_is_consistency_review_step(item: str) -> bool: | ||
| 1143 | 1149 | |
| 1144 | 1150 | def _should_prioritize_missing_artifact( |
| 1145 | 1151 | *, |
| 1152 | + dod: DefinitionOfDone, | |
| 1146 | 1153 | next_pending: str | None, |
| 1147 | 1154 | missing_artifact: tuple[Path, bool] | None, |
| 1155 | + project_root: Path, | |
| 1148 | 1156 | ) -> bool: |
| 1149 | 1157 | if missing_artifact is None: |
| 1150 | 1158 | return False |
| 1151 | 1159 | if not next_pending: |
| 1152 | 1160 | return True |
| 1161 | + if _pending_todo_conflicts_with_missing_artifact( | |
| 1162 | + dod, | |
| 1163 | + item=next_pending, | |
| 1164 | + missing_artifact=missing_artifact, | |
| 1165 | + project_root=project_root, | |
| 1166 | + ): | |
| 1167 | + return True | |
| 1153 | 1168 | if _todo_is_consistency_review_step(next_pending): |
| 1154 | 1169 | return True |
| 1155 | 1170 | return not _todo_is_mutation_step(next_pending) |
| 1156 | 1171 | |
| 1157 | 1172 | |
| 1173 | +def _pending_todo_conflicts_with_missing_artifact( | |
| 1174 | + dod: DefinitionOfDone, | |
| 1175 | + *, | |
| 1176 | + item: str, | |
| 1177 | + missing_artifact: tuple[Path, bool], | |
| 1178 | + project_root: Path, | |
| 1179 | +) -> bool: | |
| 1180 | + text = item.strip().lower() | |
| 1181 | + if not text or item in _TODO_NUDGE_EXCLUDED_ITEMS: | |
| 1182 | + return False | |
| 1183 | + | |
| 1184 | + target, expect_directory = missing_artifact | |
| 1185 | + inferred_target = infer_pending_todo_output_target( | |
| 1186 | + dod, | |
| 1187 | + item, | |
| 1188 | + project_root=project_root, | |
| 1189 | + ) | |
| 1190 | + if inferred_target is None: | |
| 1191 | + return not expect_directory and _todo_is_mutation_step(item) | |
| 1192 | + | |
| 1193 | + inferred_target = inferred_target.resolve(strict=False) | |
| 1194 | + target = target.resolve(strict=False) | |
| 1195 | + if expect_directory: | |
| 1196 | + return target != inferred_target and target not in inferred_target.parents | |
| 1197 | + return inferred_target != target | |
| 1198 | + | |
| 1199 | + | |
| 1158 | 1200 | def _next_missing_planned_artifact( |
| 1159 | 1201 | dod: DefinitionOfDone, |
| 1160 | 1202 | *, |
src/loader/runtime/workflow.pymodified@@ -181,6 +181,10 @@ _ARTIFACT_SET_COMPLETION_HINTS = ( | ||
| 181 | 181 | "formatted", |
| 182 | 182 | "formatting", |
| 183 | 183 | "review", |
| 184 | + "style", | |
| 185 | + "same style", | |
| 186 | + "same structure", | |
| 187 | + "follow the same", | |
| 184 | 188 | ) |
| 185 | 189 | _BROAD_SETUP_HINTS = ( |
| 186 | 190 | "directory structure", |
@@ -1236,6 +1240,17 @@ def _todo_progress_score(item: str, tool_call: ToolCall) -> int: | ||
| 1236 | 1240 | ) |
| 1237 | 1241 | ): |
| 1238 | 1242 | return 0 |
| 1243 | + if ( | |
| 1244 | + is_discovery_tool | |
| 1245 | + and _todo_requires_complete_artifact_set(text) | |
| 1246 | + and not ( | |
| 1247 | + _contains_any(text, _READ_STEP_HINTS) | |
| 1248 | + or _contains_any(text, _SEARCH_STEP_HINTS) | |
| 1249 | + or _contains_any(text, _PARSE_STEP_HINTS) | |
| 1250 | + or _contains_any(text, _VERIFY_STEP_HINTS) | |
| 1251 | + ) | |
| 1252 | + ): | |
| 1253 | + return 0 | |
| 1239 | 1254 | |
| 1240 | 1255 | if basename and basename in text: |
| 1241 | 1256 | score += 3 |
tests/test_tool_batches.pymodified@@ -688,7 +688,7 @@ async def test_tool_batch_runner_queues_duplicate_observation_nudge( | ||
| 688 | 688 | |
| 689 | 689 | assert len(queued_messages) == 1 |
| 690 | 690 | assert "Reuse the earlier observation instead of repeating it." in queued_messages[0] |
| 691 | - assert "Continue with the next pending item: `Create the remaining chapter files`." in queued_messages[0] | |
| 691 | + assert "A declared output artifact is still missing." in queued_messages[0] | |
| 692 | 692 | assert "Resume by creating `04-variables.html` now." in queued_messages[0] |
| 693 | 693 | assert f"Prefer one `write` call for `{temp_dir / 'chapters' / '04-variables.html'}` instead of more rereads." in queued_messages[0] |
| 694 | 694 | |
@@ -1303,6 +1303,139 @@ async def test_tool_batch_runner_duplicate_reference_read_prefers_next_pending_t | ||
| 1303 | 1303 | assert "Update `" not in queued_messages[0] |
| 1304 | 1304 | |
| 1305 | 1305 | |
| 1306 | +@pytest.mark.asyncio | |
| 1307 | +async def test_tool_batch_runner_successful_reference_read_prioritizes_concrete_missing_artifact( | |
| 1308 | + temp_dir: Path, | |
| 1309 | +) -> None: | |
| 1310 | + async def assess_confidence( | |
| 1311 | + tool_name: str, | |
| 1312 | + tool_args: dict, | |
| 1313 | + context: str, | |
| 1314 | + ) -> ConfidenceAssessment: | |
| 1315 | + raise AssertionError("Confidence scoring should be disabled in this scenario") | |
| 1316 | + | |
| 1317 | + async def verify_action( | |
| 1318 | + tool_name: str, | |
| 1319 | + tool_args: dict, | |
| 1320 | + result: str, | |
| 1321 | + expected: str = "", | |
| 1322 | + ) -> ActionVerification: | |
| 1323 | + raise AssertionError("Verification should not run for this scenario") | |
| 1324 | + | |
| 1325 | + guide_root = temp_dir / "Loader" / "guides" / "nginx" | |
| 1326 | + chapters = guide_root / "chapters" | |
| 1327 | + chapters.mkdir(parents=True) | |
| 1328 | + chapter_one = chapters / "01-introduction.html" | |
| 1329 | + chapter_one.write_text("<html></html>\n") | |
| 1330 | + index_path = guide_root / "index.html" | |
| 1331 | + | |
| 1332 | + reference = temp_dir / "Loader" / "guides" / "fortran" / "index.html" | |
| 1333 | + reference.parent.mkdir(parents=True, exist_ok=True) | |
| 1334 | + reference.write_text("<h1>Fortran Beginner's Guide</h1>\n") | |
| 1335 | + | |
| 1336 | + implementation_plan = temp_dir / "implementation.md" | |
| 1337 | + implementation_plan.write_text( | |
| 1338 | + "\n".join( | |
| 1339 | + [ | |
| 1340 | + "# Implementation Plan", | |
| 1341 | + "", | |
| 1342 | + "## File Changes", | |
| 1343 | + f"- `{guide_root}/`", | |
| 1344 | + f"- `{chapters}/`", | |
| 1345 | + f"- `{index_path}`", | |
| 1346 | + f"- `{chapter_one}`", | |
| 1347 | + f"- `{chapters / '02-installation.html'}`", | |
| 1348 | + "", | |
| 1349 | + ] | |
| 1350 | + ) | |
| 1351 | + ) | |
| 1352 | + | |
| 1353 | + context = build_context( | |
| 1354 | + temp_dir=temp_dir, | |
| 1355 | + messages=[], | |
| 1356 | + safeguards=FakeSafeguards(), | |
| 1357 | + assess_confidence=assess_confidence, | |
| 1358 | + verify_action=verify_action, | |
| 1359 | + auto_recover=False, | |
| 1360 | + ) | |
| 1361 | + queued_messages: list[str] = [] | |
| 1362 | + context.queue_steering_message_callback = queued_messages.append | |
| 1363 | + runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir)) | |
| 1364 | + dod = create_definition_of_done("Create a multi-file nginx guide.") | |
| 1365 | + dod.implementation_plan = str(implementation_plan) | |
| 1366 | + dod.touched_files.append(str(chapter_one)) | |
| 1367 | + sync_todos_to_definition_of_done( | |
| 1368 | + dod, | |
| 1369 | + [ | |
| 1370 | + { | |
| 1371 | + "content": "Examine the existing Fortran guide structure to understand the format and cadence", | |
| 1372 | + "active_form": "Working on: Examine the existing Fortran guide structure to understand the format and cadence", | |
| 1373 | + "status": "pending", | |
| 1374 | + }, | |
| 1375 | + { | |
| 1376 | + "content": "Create each chapter file with appropriate content", | |
| 1377 | + "active_form": "Working on: Create each chapter file with appropriate content", | |
| 1378 | + "status": "pending", | |
| 1379 | + }, | |
| 1380 | + { | |
| 1381 | + "content": "Ensure all files follow the same structure and style as the Fortran guide", | |
| 1382 | + "active_form": "Working on: Ensure all files follow the same structure and style as the Fortran guide", | |
| 1383 | + "status": "pending", | |
| 1384 | + }, | |
| 1385 | + ], | |
| 1386 | + ) | |
| 1387 | + tool_call = ToolCall( | |
| 1388 | + id="read-reference-index", | |
| 1389 | + name="read", | |
| 1390 | + arguments={"file_path": str(reference)}, | |
| 1391 | + ) | |
| 1392 | + read_output = "Observation [read]: Result: <h1>Fortran Beginner's Guide</h1>\n" | |
| 1393 | + executor = FakeExecutor( | |
| 1394 | + [ | |
| 1395 | + ToolExecutionOutcome( | |
| 1396 | + tool_call=tool_call, | |
| 1397 | + state=ToolExecutionState.EXECUTED, | |
| 1398 | + message=Message.tool_result_message( | |
| 1399 | + tool_call_id=tool_call.id, | |
| 1400 | + display_content=read_output, | |
| 1401 | + result_content=read_output, | |
| 1402 | + ), | |
| 1403 | + event_content=read_output, | |
| 1404 | + is_error=False, | |
| 1405 | + result_output=read_output, | |
| 1406 | + ) | |
| 1407 | + ] | |
| 1408 | + ) | |
| 1409 | + | |
| 1410 | + summary = TurnSummary(final_response="") | |
| 1411 | + await runner.execute_batch( | |
| 1412 | + tool_calls=[tool_call], | |
| 1413 | + tool_source="assistant", | |
| 1414 | + pending_tool_calls_seen=set(), | |
| 1415 | + emit=_noop_emit, | |
| 1416 | + summary=summary, | |
| 1417 | + dod=dod, | |
| 1418 | + executor=executor, # type: ignore[arg-type] | |
| 1419 | + on_confirmation=None, | |
| 1420 | + on_user_question=None, | |
| 1421 | + emit_confirmation=None, | |
| 1422 | + consecutive_errors=0, | |
| 1423 | + ) | |
| 1424 | + | |
| 1425 | + assert queued_messages | |
| 1426 | + assert any( | |
| 1427 | + "Confirmed progress: `Examine the existing Fortran guide structure to understand the format and cadence`" | |
| 1428 | + in message | |
| 1429 | + for message in queued_messages | |
| 1430 | + ) | |
| 1431 | + assert any("Resume by creating `index.html` now." in message for message in queued_messages) | |
| 1432 | + assert not any( | |
| 1433 | + "Continue with the next pending item: `Create each chapter file with appropriate content`" | |
| 1434 | + in message | |
| 1435 | + for message in queued_messages | |
| 1436 | + ) | |
| 1437 | + | |
| 1438 | + | |
| 1306 | 1439 | @pytest.mark.asyncio |
| 1307 | 1440 | async def test_tool_batch_runner_duplicate_read_ignores_unplanned_expansion_after_plan_complete( |
| 1308 | 1441 | temp_dir: Path, |
@@ -1808,8 +1941,10 @@ async def test_duplicate_observation_nudge_prioritizes_missing_artifact_over_rev | ||
| 1808 | 1941 | ], |
| 1809 | 1942 | ) |
| 1810 | 1943 | assert tool_batches_should_prioritize_missing_artifact( |
| 1944 | + dod=dod, | |
| 1811 | 1945 | next_pending=dod.pending_items[0], |
| 1812 | 1946 | missing_artifact=(chapters / "06-ssl-configuration.html", False), |
| 1947 | + project_root=temp_dir, | |
| 1813 | 1948 | ) |
| 1814 | 1949 | |
| 1815 | 1950 | tool_call = ToolCall( |
tests/test_workflow.pymodified@@ -1024,6 +1024,42 @@ def test_advance_todos_from_tool_call_does_not_complete_linking_step_from_glob() | ||
| 1024 | 1024 | assert "Link all chapters together properly in the index file" in dod.pending_items |
| 1025 | 1025 | |
| 1026 | 1026 | |
| 1027 | +def test_advance_todos_from_tool_call_does_not_complete_aggregate_style_step_from_reference_read() -> None: | |
| 1028 | + dod = create_definition_of_done("Create a multi-file nginx guide.") | |
| 1029 | + sync_todos_to_definition_of_done( | |
| 1030 | + dod, | |
| 1031 | + [ | |
| 1032 | + { | |
| 1033 | + "content": "Create each chapter file with appropriate content", | |
| 1034 | + "active_form": "Working on: Create each chapter file with appropriate content", | |
| 1035 | + "status": "pending", | |
| 1036 | + }, | |
| 1037 | + { | |
| 1038 | + "content": "Ensure all files follow the same structure and style as the Fortran guide", | |
| 1039 | + "active_form": "Working on: Ensure all files follow the same structure and style as the Fortran guide", | |
| 1040 | + "status": "pending", | |
| 1041 | + }, | |
| 1042 | + ], | |
| 1043 | + ) | |
| 1044 | + | |
| 1045 | + assert ( | |
| 1046 | + advance_todos_from_tool_call( | |
| 1047 | + dod, | |
| 1048 | + ToolCall( | |
| 1049 | + id="read-reference-index", | |
| 1050 | + name="read", | |
| 1051 | + arguments={"file_path": "~/Loader/guides/fortran/index.html"}, | |
| 1052 | + ), | |
| 1053 | + ) | |
| 1054 | + is False | |
| 1055 | + ) | |
| 1056 | + assert "Create each chapter file with appropriate content" in dod.pending_items | |
| 1057 | + assert ( | |
| 1058 | + "Ensure all files follow the same structure and style as the Fortran guide" | |
| 1059 | + in dod.pending_items | |
| 1060 | + ) | |
| 1061 | + | |
| 1062 | + | |
| 1027 | 1063 | def test_sync_todos_to_definition_of_done_keeps_linking_step_pending_while_artifacts_missing( |
| 1028 | 1064 | temp_dir: Path, |
| 1029 | 1065 | ) -> None: |