Strengthen qwen progress and verification handoffs

- SHA: 297e213d29e61587549386472439e606500e8b69
- Parents: 918941f
- Tree: 7288310

| Status | File | + | - |
|---|---|---|---|
| M | src/loader/runtime/artifact_invalidation.py | 40 | 3 |
| M | src/loader/runtime/dod.py | 28 | 0 |
| M | src/loader/runtime/finalization.py | 1 | 0 |
| M | src/loader/runtime/tool_batches.py | 34 | 1 |
| M | src/loader/runtime/workflow.py | 170 | 1 |
| M | src/loader/runtime/workflow_lanes.py | 44 | 0 |
| M | src/loader/runtime/workflow_recovery.py | 19 | 0 |
| M | tests/test_artifact_invalidation.py | 25 | 0 |
| M | tests/test_dod.py | 23 | 0 |
| M | tests/test_finalization.py | 47 | 0 |
| M | tests/test_tool_batches.py | 95 | 0 |
| M | tests/test_workflow.py | 151 | 0 |
| M | tests/test_workflow_runtime.py | 15 | 0 |
src/loader/runtime/artifact_invalidation.py (modified) @@ -49,13 +49,18 @@ class ArtifactInvalidationAssessor:
| 49 | 49 | unexpected_paths = [ |
| 50 | 50 | name |
| 51 | 51 | for path in touched_files |
| 52 | - if (name := _path_name(path)) and name.lower() not in plan_text | |
| 52 | + if (name := _path_name(path)) and not _text_covers_path_reference(plan_text, path) | |
| 53 | 53 | ] |
| 54 | 54 | confirmed_touchpoints = [ |
| 55 | 55 | name |
| 56 | 56 | for path in touched_files |
| 57 | 57 | if (name := _path_name(path)) |
| 58 | 58 | ] |
| 59 | + confirmed_touchpoint_keys = { | |
| 60 | + _path_reference_identity(path) | |
| 61 | + for path in touched_files | |
| 62 | + if _path_reference_identity(path) | |
| 63 | + } | |
| 59 | 64 | inferred_touchpoints = [ |
| 60 | 65 | item |
| 61 | 66 | for item in _extract_path_mentions( |
@@ -63,7 +68,7 @@ class ArtifactInvalidationAssessor: | ||
| 63 | 68 | implementation_text, |
| 64 | 69 | verification_text, |
| 65 | 70 | ) |
| 66 | - if _path_name(item) not in confirmed_touchpoints | |
| 71 | + if _path_reference_identity(item) not in confirmed_touchpoint_keys | |
| 67 | 72 | ] |
| 68 | 73 | stale_plan = False |
| 69 | 74 | stale_brief = False |
@@ -147,7 +152,11 @@ class ArtifactInvalidationAssessor: | ||
| 147 | 152 | ) |
| 148 | 153 | |
| 149 | 154 | out_of_brief_paths = [ |
| 150 | - name for name in unexpected_paths if name.lower() not in brief_text | |
| 155 | + name | |
| 156 | + for path in touched_files | |
| 157 | + if (name := _path_name(path)) | |
| 158 | + and name in unexpected_paths | |
| 159 | + and not _text_covers_path_reference(brief_text, path) | |
| 151 | 160 | ] |
| 152 | 161 | if out_of_brief_paths: |
| 153 | 162 | stale_brief = True |
@@ -200,6 +209,34 @@ def _path_name(path: str) -> str: | ||
| 200 | 209 | return normalized.rsplit("/", maxsplit=1)[-1].strip() |
| 201 | 210 | |
| 202 | 211 | |
| 212 | +def _path_reference_identity(path: str) -> str: | |
| 213 | + normalized = _path_name(path) | |
| 214 | + if not normalized: | |
| 215 | + return "" | |
| 216 | + return _canonical_path_reference(normalized) | |
| 217 | + | |
| 218 | + | |
| 219 | +def _text_covers_path_reference(text: str, path: str) -> bool: | |
| 220 | + normalized_text = text.lower() | |
| 221 | + candidates = [candidate for candidate in (str(path).strip(), _path_name(path)) if candidate] | |
| 222 | + | |
| 223 | + for candidate in candidates: | |
| 224 | + if candidate.lower() in normalized_text: | |
| 225 | + return True | |
| 226 | + | |
| 227 | + canonical_text = _canonical_path_reference(text) | |
| 228 | + return any( | |
| 229 | + canonical_candidate and canonical_candidate in canonical_text | |
| 230 | + for canonical_candidate in (_canonical_path_reference(candidate) for candidate in candidates) | |
| 231 | + ) | |
| 232 | + | |
| 233 | + | |
| 234 | +def _canonical_path_reference(value: str) -> str: | |
| 235 | + normalized = value.lower().strip() | |
| 236 | + normalized = re.sub(r"[^a-z0-9]+", " ", normalized) | |
| 237 | + return " ".join(normalized.split()) | |
| 238 | + | |
| 239 | + | |
| 203 | 240 | def _text_covers_requirement(text: str, requirement: str) -> bool: |
| 204 | 241 | normalized_text = text.lower() |
| 205 | 242 | normalized_requirement = requirement.lower() |
src/loader/runtime/dod.py (modified) @@ -208,6 +208,7 @@ def derive_verification_commands(
| 208 | 208 | *, |
| 209 | 209 | project_root: Path, |
| 210 | 210 | task_statement: str, |
| 211 | + supplement_existing: bool = False, | |
| 211 | 212 | ) -> list[str]: |
| 212 | 213 | """Generate verification commands from execution history and project shape.""" |
| 213 | 214 | |
@@ -234,6 +235,8 @@ def derive_verification_commands( | ||
| 234 | 235 | |
| 235 | 236 | if commands: |
| 236 | 237 | return commands |
| 238 | + if supplement_existing: | |
| 239 | + return commands | |
| 237 | 240 | |
| 238 | 241 | if dod.task_size == "small": |
| 239 | 242 | for path_str in dod.touched_files[:3]: |
@@ -245,6 +248,11 @@ def derive_verification_commands( | ||
| 245 | 248 | commands, |
| 246 | 249 | f"python -m py_compile {shlex.quote(str(effective_path))}", |
| 247 | 250 | ) |
| 251 | + elif _uses_external_artifacts_only(dod, project_root=project_root): | |
| 252 | + for path_str in dod.touched_files[:3]: | |
| 253 | + path = Path(path_str) | |
| 254 | + effective_path = path if path.is_absolute() else (project_root / path) | |
| 255 | + _append_unique(commands, f"test -f {shlex.quote(str(effective_path))}") | |
| 248 | 256 | else: |
| 249 | 257 | if (project_root / "pyproject.toml").exists(): |
| 250 | 258 | _append_unique(commands, "uv run pytest -q") |
@@ -407,6 +415,26 @@ def _append_unique(items: list[str], value: str) -> None: | ||
| 407 | 415 | items.append(value) |
| 408 | 416 | |
| 409 | 417 | |
| 418 | +def _uses_external_artifacts_only(dod: DefinitionOfDone, *, project_root: Path) -> bool: | |
| 419 | + touched = [Path(path) for path in dod.touched_files if str(path).strip()] | |
| 420 | + if not touched: | |
| 421 | + return False | |
| 422 | + try: | |
| 423 | + root = project_root.resolve() | |
| 424 | + except FileNotFoundError: | |
| 425 | + root = project_root | |
| 426 | + external = [path for path in touched if not _path_is_within_root(path, root)] | |
| 427 | + return bool(external) and len(external) == len(touched) | |
| 428 | + | |
| 429 | + | |
| 430 | +def _path_is_within_root(path: Path, root: Path) -> bool: | |
| 431 | + try: | |
| 432 | + path.resolve().relative_to(root) | |
| 433 | + return True | |
| 434 | + except ValueError: | |
| 435 | + return False | |
| 436 | + | |
| 437 | + | |
| 410 | 438 | def synthesize_todo_items(dod: DefinitionOfDone) -> list[dict[str, str]]: |
| 411 | 439 | """Build a todo item list from the current DoD state. |
| 412 | 440 | |
src/loader/runtime/finalization.py (modified) @@ -294,6 +294,7 @@ class TurnFinalizer:
| 294 | 294 | dod, |
| 295 | 295 | project_root=self.context.project_root, |
| 296 | 296 | task_statement=dod.task_statement, |
| 297 | + supplement_existing=True, | |
| 297 | 298 | ): |
| 298 | 299 | if command not in dod.verification_commands: |
| 299 | 300 | dod.verification_commands.append(command) |
src/loader/runtime/tool_batches.py (modified) @@ -46,6 +46,19 @@ _TODO_NUDGE_EXCLUDED_ITEMS = {
| 46 | 46 | "Complete the requested work", |
| 47 | 47 | _VERIFY_ITEM, |
| 48 | 48 | } |
| 49 | +_MUTATION_TODO_HINTS = ( | |
| 50 | + "create", | |
| 51 | + "update", | |
| 52 | + "edit", | |
| 53 | + "write", | |
| 54 | + "fix", | |
| 55 | + "modify", | |
| 56 | + "change", | |
| 57 | + "patch", | |
| 58 | + "replace", | |
| 59 | + "correct", | |
| 60 | + "rewrite", | |
| 61 | +) | |
| 49 | 62 | |
| 50 | 63 | |
| 51 | 64 | @dataclass |
@@ -290,18 +303,26 @@ class ToolBatchRunner: | ||
| 290 | 303 | max_items=2, |
| 291 | 304 | ) |
| 292 | 305 | if next_pending and not html_toc_rule.task_targets_html_toc(current_task): |
| 306 | + mutation_suffix = "" | |
| 307 | + if _todo_is_mutation_step(next_pending): | |
| 308 | + mutation_suffix = ( | |
| 309 | + " You already have enough evidence for that step, so stop gathering " | |
| 310 | + "more reference material and perform the change now." | |
| 311 | + ) | |
| 293 | 312 | if confirmed_facts: |
| 294 | 313 | self.context.queue_steering_message( |
| 295 | 314 | "Reuse the earlier observation instead of repeating it. " |
| 296 | 315 | f"Confirmed facts: {confirmed_facts}. " |
| 297 | 316 | f"Continue with the next pending item: `{next_pending}`. " |
| 298 | 317 | "Only gather more evidence if a specific fact required for that step is still unknown." |
| 318 | + + mutation_suffix | |
| 299 | 319 | ) |
| 300 | 320 | else: |
| 301 | 321 | self.context.queue_steering_message( |
| 302 | 322 | "Reuse the earlier observation instead of repeating it. " |
| 303 | 323 | f"Continue with the next pending item: `{next_pending}`. " |
| 304 | 324 | "Only gather more evidence if a specific fact required for that step is still unknown." |
| 325 | + + mutation_suffix | |
| 305 | 326 | ) |
| 306 | 327 | return |
| 307 | 328 | |
@@ -752,10 +773,17 @@ class ToolBatchRunner: | ||
| 752 | 773 | if not completed_label or not next_pending or next_pending == completed_label: |
| 753 | 774 | return |
| 754 | 775 | |
| 776 | + mutation_suffix = "" | |
| 777 | + if _todo_is_mutation_step(next_pending): | |
| 778 | + mutation_suffix = ( | |
| 779 | + " You already have enough evidence for that step, so stop gathering " | |
| 780 | + "more reference material and perform the change now." | |
| 781 | + ) | |
| 782 | + | |
| 755 | 783 | self.context.queue_steering_message( |
| 756 | 784 | f"Confirmed progress: `{completed_label}` is now satisfied by the successful " |
| 757 | 785 | f"`{tool_call.name}` result. Continue with the next pending item: " |
| 758 | - f"`{next_pending}` instead of rereading the same evidence." | |
| 786 | + f"`{next_pending}` instead of rereading the same evidence.{mutation_suffix}" | |
| 759 | 787 | ) |
| 760 | 788 | |
| 761 | 789 | |
@@ -795,6 +823,11 @@ def _mark_verification_stale( | ||
| 795 | 823 | dod.pending_items.append(_VERIFY_ITEM) |
| 796 | 824 | |
| 797 | 825 | |
| 826 | +def _todo_is_mutation_step(label: str) -> bool: | |
| 827 | + lowered = label.lower() | |
| 828 | + return any(token in lowered for token in _MUTATION_TODO_HINTS) | |
| 829 | + | |
| 830 | + | |
| 798 | 831 | def _mark_verification_planned( |
| 799 | 832 | *, |
| 800 | 833 | context: RuntimeContext, |
src/loader/runtime/workflow.py (modified) @@ -125,6 +125,75 @@ _VERIFY_STEP_HINTS = (
| 125 | 125 | "confirm", |
| 126 | 126 | "check", |
| 127 | 127 | ) |
| 128 | +_AGGREGATE_TODO_HINTS = ( | |
| 129 | + "each ", | |
| 130 | + "all ", | |
| 131 | + "every ", | |
| 132 | + "sequence", | |
| 133 | + "multiple ", | |
| 134 | + "across ", | |
| 135 | + "consistently", | |
| 136 | + "properly linked", | |
| 137 | + "directory structure", | |
| 138 | +) | |
| 139 | +_ACTIONABLE_STEP_VERBS = { | |
| 140 | + "add", | |
| 141 | + "apply", | |
| 142 | + "build", | |
| 143 | + "check", | |
| 144 | + "confirm", | |
| 145 | + "create", | |
| 146 | + "document", | |
| 147 | + "edit", | |
| 148 | + "ensure", | |
| 149 | + "fix", | |
| 150 | + "implement", | |
| 151 | + "inspect", | |
| 152 | + "list", | |
| 153 | + "move", | |
| 154 | + "parse", | |
| 155 | + "patch", | |
| 156 | + "read", | |
| 157 | + "refactor", | |
| 158 | + "remove", | |
| 159 | + "rename", | |
| 160 | + "reorder", | |
| 161 | + "rerun", | |
| 162 | + "re-run", | |
| 163 | + "review", | |
| 164 | + "run", | |
| 165 | + "search", | |
| 166 | + "test", | |
| 167 | + "update", | |
| 168 | + "validate", | |
| 169 | + "verify", | |
| 170 | + "write", | |
| 171 | +} | |
| 172 | +_RETROSPECTIVE_STEP_VERBS = { | |
| 173 | + "added", | |
| 174 | + "applied", | |
| 175 | + "built", | |
| 176 | + "checked", | |
| 177 | + "completed", | |
| 178 | + "confirmed", | |
| 179 | + "created", | |
| 180 | + "edited", | |
| 181 | + "ensured", | |
| 182 | + "examined", | |
| 183 | + "generated", | |
| 184 | + "implemented", | |
| 185 | + "inspected", | |
| 186 | + "listed", | |
| 187 | + "looked", | |
| 188 | + "parsed", | |
| 189 | + "patched", | |
| 190 | + "read", | |
| 191 | + "reviewed", | |
| 192 | + "updated", | |
| 193 | + "validated", | |
| 194 | + "verified", | |
| 195 | + "wrote", | |
| 196 | +} | |
| 128 | 197 | _TASK_COVERAGE_STOP_WORDS = { |
| 129 | 198 | "the", |
| 130 | 199 | "and", |
@@ -491,6 +560,41 @@ class PlanningArtifacts: | ||
| 491 | 560 | implementation_steps=list(self.implementation_steps), |
| 492 | 561 | ) |
| 493 | 562 | |
| 563 | + def with_progress_context( | |
| 564 | + self, | |
| 565 | + *, | |
| 566 | + touched_files: list[str], | |
| 567 | + completed_items: list[str], | |
| 568 | + ) -> PlanningArtifacts: | |
| 569 | + """Return one copy that preserves already-confirmed execution progress.""" | |
| 570 | + | |
| 571 | + progress_items: list[str] = [] | |
| 572 | + for raw_path in touched_files: | |
| 573 | + path_text = str(raw_path).strip() | |
| 574 | + if not path_text: | |
| 575 | + continue | |
| 576 | + progress_items.append(f"Already touched during execution: `{path_text}`.") | |
| 577 | + for raw_item in completed_items: | |
| 578 | + item = str(raw_item).strip() | |
| 579 | + if not item or item in _SPECIAL_TODO_ITEMS: | |
| 580 | + continue | |
| 581 | + progress_items.append(f"Already completed during execution: {item}.") | |
| 582 | + | |
| 583 | + if not progress_items: | |
| 584 | + return self | |
| 585 | + | |
| 586 | + return PlanningArtifacts( | |
| 587 | + implementation_markdown=_replace_markdown_section_items( | |
| 588 | + self.implementation_markdown, | |
| 589 | + "Confirmed Progress", | |
| 590 | + list(dict.fromkeys(progress_items)), | |
| 591 | + ), | |
| 592 | + verification_markdown=self.verification_markdown, | |
| 593 | + verification_commands=list(self.verification_commands), | |
| 594 | + acceptance_criteria=list(self.acceptance_criteria), | |
| 595 | + implementation_steps=list(self.implementation_steps), | |
| 596 | + ) | |
| 597 | + | |
| 494 | 598 | |
| 495 | 599 | class WorkflowArtifactStore: |
| 496 | 600 | """Persist briefs and plans under `.loader/`.""" |
@@ -627,6 +731,15 @@ def merge_refreshed_todos_with_existing_scope( | ||
| 627 | 731 | and item not in _SPECIAL_TODO_ITEMS |
| 628 | 732 | and _task_text_covers_requirement(task_statement, item) |
| 629 | 733 | ] |
| 734 | + refreshed_candidates = [ | |
| 735 | + item.strip() | |
| 736 | + for item in refreshed_steps | |
| 737 | + if item.strip() | |
| 738 | + and ( | |
| 739 | + not (grounded_completed or grounded_pending) | |
| 740 | + or _looks_actionable_refresh_step(item) | |
| 741 | + ) | |
| 742 | + ] | |
| 630 | 743 | |
| 631 | 744 | todos: list[dict[str, str]] = [] |
| 632 | 745 | seen: set[str] = set() |
@@ -641,7 +754,7 @@ def merge_refreshed_todos_with_existing_scope( | ||
| 641 | 754 | "status": "completed", |
| 642 | 755 | } |
| 643 | 756 | ) |
| 644 | - for item in [*grounded_pending, *refreshed_steps]: | |
| 757 | + for item in [*grounded_pending, *refreshed_candidates]: | |
| 645 | 758 | label = item.strip() |
| 646 | 759 | if not label or label in seen: |
| 647 | 760 | continue |
@@ -740,7 +853,14 @@ def _todo_progress_score(item: str, tool_call: ToolCall) -> int: | ||
| 740 | 853 | elif _looks_like_read_command(command): |
| 741 | 854 | if _contains_any(text, _READ_STEP_HINTS): |
| 742 | 855 | score += 2 |
| 856 | + elif _looks_like_fs_mutation_command(command): | |
| 857 | + if _contains_any(text, _MUTATION_STEP_HINTS): | |
| 858 | + score += 3 | |
| 859 | + if "directory" in text and "mkdir" in command: | |
| 860 | + score += 2 | |
| 743 | 861 | elif name in {"write", "edit", "patch"}: |
| 862 | + if _todo_describes_aggregate_mutation(text) and basename and basename not in text: | |
| 863 | + return 0 | |
| 744 | 864 | if _contains_any(text, _MUTATION_STEP_HINTS): |
| 745 | 865 | score += 3 |
| 746 | 866 | |
@@ -753,6 +873,13 @@ def _contains_any(text: str, candidates: tuple[str, ...]) -> bool: | ||
| 753 | 873 | return any(candidate in text for candidate in candidates) |
| 754 | 874 | |
| 755 | 875 | |
| 876 | +def _todo_describes_aggregate_mutation(text: str) -> bool: | |
| 877 | + return _contains_any(text, _AGGREGATE_TODO_HINTS) and _contains_any( | |
| 878 | + text, | |
| 879 | + _MUTATION_STEP_HINTS, | |
| 880 | + ) | |
| 881 | + | |
| 882 | + | |
| 756 | 883 | def _looks_like_search_command(command: str) -> bool: |
| 757 | 884 | return any(token in command for token in (" ls", "ls ", "find ", "rg ", "grep ", "glob ")) |
| 758 | 885 | |
@@ -781,6 +908,27 @@ def _looks_like_verification_command(command: str) -> bool: | ||
| 781 | 908 | ) |
| 782 | 909 | |
| 783 | 910 | |
| 911 | +def _looks_like_fs_mutation_command(command: str) -> bool: | |
| 912 | + stripped = command.strip() | |
| 913 | + return any( | |
| 914 | + stripped.startswith(prefix) | |
| 915 | + for prefix in ( | |
| 916 | + "mkdir ", | |
| 917 | + "mkdir\t", | |
| 918 | + "touch ", | |
| 919 | + "touch\t", | |
| 920 | + "cp ", | |
| 921 | + "cp\t", | |
| 922 | + "mv ", | |
| 923 | + "mv\t", | |
| 924 | + "ln ", | |
| 925 | + "ln\t", | |
| 926 | + "install ", | |
| 927 | + "install\t", | |
| 928 | + ) | |
| 929 | + ) | |
| 930 | + | |
| 931 | + | |
| 784 | 932 | def extract_verification_commands_from_markdown(markdown: str) -> list[str]: |
| 785 | 933 | """Extract verification commands from a verification-plan markdown document.""" |
| 786 | 934 | |
@@ -1057,6 +1205,27 @@ def _requirement_describes_output_scope(requirement: str) -> bool: | ||
| 1057 | 1205 | ) |
| 1058 | 1206 | |
| 1059 | 1207 | |
| 1208 | +def _looks_actionable_refresh_step(step: str) -> bool: | |
| 1209 | + normalized = step.strip() | |
| 1210 | + if not normalized: | |
| 1211 | + return False | |
| 1212 | + if re.fullmatch(r"(?:[\w.-]+/)*[\w.-]+\.[A-Za-z0-9]+", normalized): | |
| 1213 | + return False | |
| 1214 | + | |
| 1215 | + lowered = normalized.lower() | |
| 1216 | + lowered = re.sub(r"^(?:first|next|then|finally|afterward|afterwards)\b[,:]?\s*", "", lowered) | |
| 1217 | + first_word_match = re.match(r"^[a-z-]+", lowered) | |
| 1218 | + if first_word_match is None: | |
| 1219 | + return False | |
| 1220 | + | |
| 1221 | + first_word = first_word_match.group(0) | |
| 1222 | + if first_word in _RETROSPECTIVE_STEP_VERBS: | |
| 1223 | + return False | |
| 1224 | + if first_word in _ACTIONABLE_STEP_VERBS: | |
| 1225 | + return True | |
| 1226 | + return False | |
| 1227 | + | |
| 1228 | + | |
| 1060 | 1229 | def _mark_explicit_section(brief: ClarifyBrief, section: str) -> None: |
| 1061 | 1230 | if section in brief.explicit_sections: |
| 1062 | 1231 | return |
src/loader/runtime/workflow_lanes.py (modified) @@ -208,6 +208,10 @@ class WorkflowLaneRunner:
| 208 | 208 | refreshed_acceptance_criteria=list(artifacts.acceptance_criteria), |
| 209 | 209 | ) |
| 210 | 210 | artifacts = artifacts.with_acceptance_criteria(preserved_acceptance) |
| 211 | + artifacts = artifacts.with_progress_context( | |
| 212 | + touched_files=list(dod.touched_files), | |
| 213 | + completed_items=list(dod.completed_items), | |
| 214 | + ) | |
| 211 | 215 | implementation_path, verification_path = self.artifact_store.write_plan( |
| 212 | 216 | task, |
| 213 | 217 | artifacts, |
@@ -610,6 +614,41 @@ class WorkflowLaneRunner: | ||
| 610 | 614 | |
| 611 | 615 | refresh_block = "" |
| 612 | 616 | if refresh_reasons: |
| 617 | + progress_lines: list[str] = [] | |
| 618 | + touched = [str(path).strip() for path in dod.touched_files if str(path).strip()] | |
| 619 | + completed = [ | |
| 620 | + item.strip() | |
| 621 | + for item in dod.completed_items | |
| 622 | + if item.strip() | |
| 623 | + and item not in {"Complete the requested work", "Collect verification evidence"} | |
| 624 | + ] | |
| 625 | + pending = [ | |
| 626 | + item.strip() | |
| 627 | + for item in dod.pending_items | |
| 628 | + if item.strip() | |
| 629 | + and item not in {"Complete the requested work", "Collect verification evidence"} | |
| 630 | + ] | |
| 631 | + if touched: | |
| 632 | + progress_lines.extend( | |
| 633 | + [ | |
| 634 | + "Already touched during execution:", | |
| 635 | + *[f"- {item}" for item in touched[:12]], | |
| 636 | + ] | |
| 637 | + ) | |
| 638 | + if completed: | |
| 639 | + progress_lines.extend( | |
| 640 | + [ | |
| 641 | + "Already completed work:", | |
| 642 | + *[f"- {item}" for item in completed[:12]], | |
| 643 | + ] | |
| 644 | + ) | |
| 645 | + if pending: | |
| 646 | + progress_lines.extend( | |
| 647 | + [ | |
| 648 | + "Still pending:", | |
| 649 | + *[f"- {item}" for item in pending[:12]], | |
| 650 | + ] | |
| 651 | + ) | |
| 613 | 652 | refresh_block = ( |
| 614 | 653 | "Refresh the existing planning artifacts instead of creating a fresh plan " |
| 615 | 654 | "from scratch.\n" |
@@ -619,6 +658,11 @@ class WorkflowLaneRunner: | ||
| 619 | 658 | "artifact.\n" |
| 620 | 659 | "Use the current task state and these recovery reasons:\n" |
| 621 | 660 | + "\n".join(f"- {item}" for item in refresh_reasons) |
| 661 | + + ( | |
| 662 | + ("\n\nCurrent execution progress:\n" + "\n".join(progress_lines)) | |
| 663 | + if progress_lines | |
| 664 | + else "" | |
| 665 | + ) | |
| 622 | 666 | + "\n\n" |
| 623 | 667 | ) |
| 624 | 668 | |
src/loader/runtime/workflow_recovery.py (modified) @@ -29,6 +29,10 @@ UserQuestionHandler = Callable[[str, list[str] | None], Awaitable[str]] | None
| 29 | 29 | WorkflowModeSetter = Callable[..., Awaitable[None]] |
| 30 | 30 | TimelineAppender = Callable[..., None] |
| 31 | 31 | BridgeAppender = Callable[[DefinitionOfDone], None] |
| 32 | +_RECOVERY_TODO_EXCLUDED_ITEMS = { | |
| 33 | + "Complete the requested work", | |
| 34 | + "Collect verification evidence", | |
| 35 | +} | |
| 32 | 36 | |
| 33 | 37 | |
| 34 | 38 | class WorkflowRecoveryController: |
@@ -186,6 +190,21 @@ class WorkflowRecoveryController: | ||
| 186 | 190 | summary=summary, |
| 187 | 191 | ) |
| 188 | 192 | self.append_execute_bridge(dod) |
| 193 | + next_pending = next( | |
| 194 | + ( | |
| 195 | + item | |
| 196 | + for item in dod.pending_items | |
| 197 | + if item not in _RECOVERY_TODO_EXCLUDED_ITEMS | |
| 198 | + ), | |
| 199 | + None, | |
| 200 | + ) | |
| 201 | + if next_pending: | |
| 202 | + self.context.queue_steering_message( | |
| 203 | + "Plan refresh preserved the progress already made. " | |
| 204 | + f"Reuse the existing files and confirmed facts, then continue with the next " | |
| 205 | + f"pending item: `{next_pending}`. " | |
| 206 | + "Do not restart from initial discovery unless a specific missing fact blocks that step." | |
| 207 | + ) | |
| 189 | 208 | return True |
| 190 | 209 | |
| 191 | 210 | async def _run_clarify_reentry_for_drift( |
tests/test_artifact_invalidation.py (modified) @@ -67,3 +67,28 @@ def test_artifact_invalidation_can_force_full_replan_when_brief_and_plan_drift()
| 67 | 67 | for item in freshness.evidence |
| 68 | 68 | ) |
| 69 | 69 | assert freshness.evidence_summary |
| 70 | + | |
| 71 | + | |
| 72 | +def test_artifact_invalidation_treats_path_separator_variants_as_same_touchpoint() -> None: | |
| 73 | + assessor = ArtifactInvalidationAssessor() | |
| 74 | + | |
| 75 | + freshness = assessor.assess( | |
| 76 | + task_statement="Build a multi-file nginx guide.", | |
| 77 | + clarify_text=None, | |
| 78 | + implementation_text=( | |
| 79 | + "# Implementation Plan\n" | |
| 80 | + "- Create 01-getting-started.html in the chapters directory.\n" | |
| 81 | + ), | |
| 82 | + verification_text=( | |
| 83 | + "# Verification Plan\n" | |
| 84 | + "## Acceptance Criteria\n" | |
| 85 | + "- 01-getting-started.html exists.\n" | |
| 86 | + ), | |
| 87 | + acceptance_criteria=["01-getting-started.html exists."], | |
| 88 | + touched_files=["/tmp/chapters/01_getting_started.html"], | |
| 89 | + last_verification_result=None, | |
| 90 | + ) | |
| 91 | + | |
| 92 | + assert freshness.stale_plan is False | |
| 93 | + assert freshness.stale_brief is False | |
| 94 | + assert "touched_files_outside_plan" not in freshness.reason_codes | |
tests/test_dod.py (modified) @@ -143,6 +143,29 @@ def test_derive_verification_commands_adds_semantic_html_toc_check(tmp_path: Pat
| 143 | 143 | assert not any(command == f"test -f {index}" for command in commands) |
| 144 | 144 | |
| 145 | 145 | |
| 146 | +def test_derive_verification_commands_avoids_repo_defaults_for_external_artifacts( | |
| 147 | + tmp_path: Path, | |
| 148 | +) -> None: | |
| 149 | + (tmp_path / "pyproject.toml").write_text("[project]\nname='loader'\n") | |
| 150 | + (tmp_path / "package.json").write_text("{}\n") | |
| 151 | + external_root = tmp_path.parent / "external-guide" | |
| 152 | + external_root.mkdir(exist_ok=True) | |
| 153 | + external_index = external_root / "index.html" | |
| 154 | + external_index.write_text("<html></html>\n") | |
| 155 | + | |
| 156 | + dod = create_definition_of_done("Create an external nginx guide.") | |
| 157 | + dod.task_size = "standard" | |
| 158 | + dod.touched_files = [str(external_index)] | |
| 159 | + | |
| 160 | + commands = derive_verification_commands( | |
| 161 | + dod, | |
| 162 | + project_root=tmp_path, | |
| 163 | + task_statement=dod.task_statement, | |
| 164 | + ) | |
| 165 | + | |
| 166 | + assert commands == [f"test -f {external_index}"] | |
| 167 | + | |
| 168 | + | |
| 146 | 169 | def test_build_verification_summary_keeps_concrete_missing_link_details() -> None: |
| 147 | 170 | summary = build_verification_summary( |
| 148 | 171 | [ |
tests/test_finalization.py (modified) @@ -434,6 +434,53 @@ async def test_turn_finalizer_appends_runtime_semantic_verifier_to_planned_comma
| 434 | 434 | ) |
| 435 | 435 | |
| 436 | 436 | |
| 437 | +@pytest.mark.asyncio | |
| 438 | +async def test_turn_finalizer_does_not_append_repo_defaults_to_external_verification_plan( | |
| 439 | + temp_dir: Path, | |
| 440 | +) -> None: | |
| 441 | + (temp_dir / "pyproject.toml").write_text("[project]\nname='loader'\n") | |
| 442 | + (temp_dir / "package.json").write_text("{}\n") | |
| 443 | + external_root = temp_dir.parent / "external-nginx-guide" | |
| 444 | + external_root.mkdir(exist_ok=True) | |
| 445 | + external_index = external_root / "index.html" | |
| 446 | + external_index.write_text("<html></html>\n") | |
| 447 | + | |
| 448 | + session = FakeSession() | |
| 449 | + context = build_context(temp_dir, session) | |
| 450 | + finalizer = TurnFinalizer( | |
| 451 | + context, | |
| 452 | + RuntimeTracer(), | |
| 453 | + DefinitionOfDoneStore(temp_dir), | |
| 454 | + set_workflow_mode=_noop_set_workflow_mode, | |
| 455 | + ) | |
| 456 | + dod = create_definition_of_done("Create an external nginx guide.") | |
| 457 | + dod.mutating_actions.append("write") | |
| 458 | + dod.touched_files.append(str(external_index)) | |
| 459 | + dod.verification_commands = [ | |
| 460 | + f"ls -la {external_root}", | |
| 461 | + f"grep -n \"html\" {external_index}", | |
| 462 | + ] | |
| 463 | + summary = TurnSummary(final_response="") | |
| 464 | + executor = RecordingExecutor() | |
| 465 | + | |
| 466 | + async def capture(event) -> None: | |
| 467 | + return None | |
| 468 | + | |
| 469 | + result = await finalizer.run_definition_of_done_gate( | |
| 470 | + dod=dod, | |
| 471 | + candidate_response="Created the external nginx guide.", | |
| 472 | + emit=capture, | |
| 473 | + summary=summary, | |
| 474 | + executor=executor, # type: ignore[arg-type] | |
| 475 | + ) | |
| 476 | + | |
| 477 | + assert result.should_continue is False | |
| 478 | + assert executor.commands == [ | |
| 479 | + f"ls -la {external_root}", | |
| 480 | + f'grep -n "html" {external_index}', | |
| 481 | + ] | |
| 482 | + | |
| 483 | + | |
| 437 | 484 | @pytest.mark.asyncio |
| 438 | 485 | async def test_turn_finalizer_records_missing_verification_observation( |
| 439 | 486 | temp_dir: Path, |
tests/test_tool_batches.py (modified) @@ -1041,6 +1041,10 @@ async def test_tool_batch_runner_queues_next_pending_todo_after_discovery_progre
| 1041 | 1041 | in message |
| 1042 | 1042 | for message in queued_messages |
| 1043 | 1043 | ) |
| 1044 | + assert any( | |
| 1045 | + "stop gathering more reference material and perform the change now" in message | |
| 1046 | + for message in queued_messages | |
| 1047 | + ) | |
| 1044 | 1048 | |
| 1045 | 1049 | |
| 1046 | 1050 | @pytest.mark.asyncio |
@@ -1161,6 +1165,97 @@ async def test_tool_batch_runner_duplicate_reference_read_prefers_next_pending_t | ||
| 1161 | 1165 | assert "Update `" not in queued_messages[0] |
| 1162 | 1166 | |
| 1163 | 1167 | |
| 1168 | +@pytest.mark.asyncio | |
| 1169 | +async def test_tool_batch_runner_observation_handoff_pushes_mutation_step( | |
| 1170 | + temp_dir: Path, | |
| 1171 | +) -> None: | |
| 1172 | + async def assess_confidence( | |
| 1173 | + tool_name: str, | |
| 1174 | + tool_args: dict, | |
| 1175 | + context: str, | |
| 1176 | + ) -> ConfidenceAssessment: | |
| 1177 | + raise AssertionError("Confidence scoring should be disabled in this scenario") | |
| 1178 | + | |
| 1179 | + async def verify_action( | |
| 1180 | + tool_name: str, | |
| 1181 | + tool_args: dict, | |
| 1182 | + result: str, | |
| 1183 | + expected: str = "", | |
| 1184 | + ) -> ActionVerification: | |
| 1185 | + raise AssertionError("Verification should not run for this scenario") | |
| 1186 | + | |
| 1187 | + reference = temp_dir / "fortran" / "index.html" | |
| 1188 | + reference.parent.mkdir(parents=True) | |
| 1189 | + reference.write_text("<h1>Fortran Beginner's Guide</h1>\n") | |
| 1190 | + | |
| 1191 | + context = build_context( | |
| 1192 | + temp_dir=temp_dir, | |
| 1193 | + messages=[], | |
| 1194 | + safeguards=FakeSafeguards(), | |
| 1195 | + assess_confidence=assess_confidence, | |
| 1196 | + verify_action=verify_action, | |
| 1197 | + auto_recover=False, | |
| 1198 | + ) | |
| 1199 | + queued_messages: list[str] = [] | |
| 1200 | + context.queue_steering_message_callback = queued_messages.append | |
| 1201 | + runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir)) | |
| 1202 | + dod = create_definition_of_done("Create a multi-file nginx guide.") | |
| 1203 | + sync_todos_to_definition_of_done( | |
| 1204 | + dod, | |
| 1205 | + [ | |
| 1206 | + { | |
| 1207 | + "content": "Examine the existing Fortran guide structure to understand the cadence and format", | |
| 1208 | + "active_form": "Working on: Examine the existing Fortran guide structure to understand the cadence and format", | |
| 1209 | + "status": "pending", | |
| 1210 | + }, | |
| 1211 | + { | |
| 1212 | + "content": "Create the nginx index.html file", | |
| 1213 | + "active_form": "Working on: Create the nginx index.html file", | |
| 1214 | + "status": "pending", | |
| 1215 | + }, | |
| 1216 | + ], | |
| 1217 | + ) | |
| 1218 | + tool_call = ToolCall( | |
| 1219 | + id="read-reference", | |
| 1220 | + name="read", | |
| 1221 | + arguments={"file_path": str(reference)}, | |
| 1222 | + ) | |
| 1223 | + executor = FakeExecutor( | |
| 1224 | + [ | |
| 1225 | + tool_outcome( | |
| 1226 | + tool_call=tool_call, | |
| 1227 | + output="<h1>Fortran Beginner's Guide</h1>\n", | |
| 1228 | + is_error=False, | |
| 1229 | + ) | |
| 1230 | + ] | |
| 1231 | + ) | |
| 1232 | + | |
| 1233 | + summary = TurnSummary(final_response="") | |
| 1234 | + await runner.execute_batch( | |
| 1235 | + tool_calls=[tool_call], | |
| 1236 | + tool_source="assistant", | |
| 1237 | + pending_tool_calls_seen=set(), | |
| 1238 | + emit=_noop_emit, | |
| 1239 | + summary=summary, | |
| 1240 | + dod=dod, | |
| 1241 | + executor=executor, # type: ignore[arg-type] | |
| 1242 | + on_confirmation=None, | |
| 1243 | + on_user_question=None, | |
| 1244 | + emit_confirmation=None, | |
| 1245 | + consecutive_errors=0, | |
| 1246 | + ) | |
| 1247 | + | |
| 1248 | + assert any( | |
| 1249 | + "Continue with the next pending item: `Create the nginx index.html file`" | |
| 1250 | + in message | |
| 1251 | + for message in queued_messages | |
| 1252 | + ) | |
| 1253 | + assert any( | |
| 1254 | + "stop gathering more reference material and perform the change now" in message | |
| 1255 | + for message in queued_messages | |
| 1256 | + ) | |
| 1257 | + | |
| 1258 | + | |
| 1164 | 1259 | @pytest.mark.asyncio |
| 1165 | 1260 | async def test_tool_batch_runner_hands_off_noop_toc_edit_when_file_is_already_valid( |
| 1166 | 1261 | temp_dir: Path, |
tests/test_workflow.py (modified) @@ -345,6 +345,47 @@ def test_planning_artifacts_with_acceptance_criteria_rewrites_verification_markd
| 345 | 345 | ) |
| 346 | 346 | |
| 347 | 347 | |
| 348 | +def test_planning_artifacts_with_progress_context_records_touched_and_completed_work() -> None: | |
| 349 | + artifacts = PlanningArtifacts.from_model_output( | |
| 350 | + "\n".join( | |
| 351 | + [ | |
| 352 | + "# Implementation Plan", | |
| 353 | + "", | |
| 354 | + "## Execution Order", | |
| 355 | + "1. Create the guide files.", | |
| 356 | + "", | |
| 357 | + "<<<VERIFICATION>>>", | |
| 358 | + "", | |
| 359 | + "# Verification Plan", | |
| 360 | + "", | |
| 361 | + "## Acceptance Criteria", | |
| 362 | + "- At least one chapter file exists.", | |
| 363 | + "", | |
| 364 | + "## Verification Commands", | |
| 365 | + "- `find chapters -name \"*.html\" | wc -l`", | |
| 366 | + ] | |
| 367 | + ), | |
| 368 | + task_statement="Create a thorough nginx guide.", | |
| 369 | + ) | |
| 370 | + | |
| 371 | + updated = artifacts.with_progress_context( | |
| 372 | + touched_files=["/tmp/nginx/index.html"], | |
| 373 | + completed_items=[ | |
| 374 | + "Create the guide scaffold", | |
| 375 | + "Collect verification evidence", | |
| 376 | + ], | |
| 377 | + ) | |
| 378 | + | |
| 379 | + assert "## Confirmed Progress" in updated.implementation_markdown | |
| 380 | + assert "Already touched during execution: `/tmp/nginx/index.html`." in ( | |
| 381 | + updated.implementation_markdown | |
| 382 | + ) | |
| 383 | + assert "Already completed during execution: Create the guide scaffold." in ( | |
| 384 | + updated.implementation_markdown | |
| 385 | + ) | |
| 386 | + assert "Collect verification evidence" not in updated.implementation_markdown | |
| 387 | + | |
| 388 | + | |
| 348 | 389 | def test_merge_refreshed_todos_with_existing_scope_keeps_grounded_progress() -> None: |
| 349 | 390 | task = ( |
| 350 | 391 | "Create an equally thorough nginx guide with index.html plus chapter files " |
@@ -371,6 +412,48 @@ def test_merge_refreshed_todos_with_existing_scope_keeps_grounded_progress() -> | ||
| 371 | 412 | ) |
| 372 | 413 | |
| 373 | 414 | |
def test_merge_refreshed_todos_with_existing_scope_filters_retro_refresh_noise() -> None:
    """Retrospective refresh output must not be re-added as new todo items.

    Existing pending work stays pending, a genuinely forward-looking refreshed
    step becomes pending, and past-tense summaries plus bare file listings are
    dropped entirely.
    """
    task = (
        "Create an equally thorough nginx guide with index.html plus chapter files "
        "covering getting started, installation, first website setup, configs, and "
        "advanced topics."
    )
    pending = [
        "Create each chapter file in sequence, following the same structure as the Fortran guide",
        "Ensure all files are properly linked and formatted consistently",
    ]
    completed = [
        "First, examine the existing Fortran guide structure to understand the format and cadence",
        "Create the directory structure for the new nginx guide",
        "Create the main index.html file",
    ]
    refreshed = [
        "First examined the existing Fortran guide structure to understand format and cadence",
        "Created the main index.html file with navigation",
        "Created chapter files in sequence:",
        "01-getting-started.html",
        "02-installation.html",
        "03-first-website.html",
        "04-configuring.html",
        "All files properly linked with navigation between chapters",
        "Verify the final navigation links across the guide",
    ]

    todos = merge_refreshed_todos_with_existing_scope(
        task,
        existing_pending_items=pending,
        existing_completed_items=completed,
        refreshed_steps=refreshed,
    )

    status_by_content = {}
    for item in todos:
        status_by_content[item["content"]] = item["status"]

    assert status_by_content[pending[0]] == "pending"
    assert status_by_content[pending[1]] == "pending"
    assert status_by_content["Verify the final navigation links across the guide"] == "pending"
    assert "Created chapter files in sequence:" not in status_by_content
    assert "04-configuring.html" not in status_by_content
| 456 | + | |
| 374 | 457 | def test_workflow_artifact_store_and_bridge_round_trip(tmp_path: Path) -> None: |
| 375 | 458 | store = WorkflowArtifactStore(tmp_path) |
| 376 | 459 | brief = ClarifyBrief.fallback( |
@@ -528,3 +611,71 @@ def test_advance_todos_from_tool_call_tracks_plan_progress() -> None: | ||
| 528 | 611 | ), |
| 529 | 612 | ) |
| 530 | 613 | assert "Verify the updated index.html file is properly formatted" in dod.completed_items |
| 614 | + | |
| 615 | + | |
def test_advance_todos_from_tool_call_keeps_aggregate_mutation_steps_pending() -> None:
    """Writing one chapter file must not complete an aggregate multi-file step."""
    dod = create_definition_of_done("Create a multi-file nginx guide.")
    aggregate_step = (
        "Create each chapter file in sequence, following the same structure as the Fortran guide"
    )
    linking_step = "Ensure all files are properly linked and formatted consistently"
    sync_todos_to_definition_of_done(
        dod,
        [
            {
                "content": aggregate_step,
                "active_form": f"Working on: {aggregate_step}",
                "status": "pending",
            },
            {
                "content": linking_step,
                "active_form": f"Working on: {linking_step}",
                "status": "pending",
            },
        ],
    )

    single_write = ToolCall(
        id="write-one-chapter",
        name="write",
        arguments={
            "file_path": "/tmp/nginx/chapters/01-getting-started.html",
            "content": "<html></html>",
        },
    )
    advanced = advance_todos_from_tool_call(dod, single_write)

    assert advanced is False
    assert aggregate_step in dod.pending_items
| 652 | + | |
| 653 | + | |
def test_advance_todos_from_tool_call_tracks_bash_directory_creation_progress() -> None:
    """A bash mkdir call should complete only the matching directory-structure todo."""
    dod = create_definition_of_done("Create a multi-file nginx guide.")
    directory_step = "Create the nginx directory structure"
    index_step = "Create index.html for nginx guide"
    sync_todos_to_definition_of_done(
        dod,
        [
            {
                "content": directory_step,
                "active_form": f"Working on: {directory_step}",
                "status": "pending",
            },
            {
                "content": index_step,
                "active_form": f"Working on: {index_step}",
                "status": "pending",
            },
        ],
    )

    mkdir_call = ToolCall(
        id="mkdir-nginx",
        name="bash",
        arguments={"command": "mkdir -p ~/Loader/guides/nginx/chapters"},
    )

    assert advance_todos_from_tool_call(dod, mkdir_call)
    assert directory_step in dod.completed_items
    assert index_step in dod.pending_items
tests/test_workflow_runtime.pymodified@@ -1438,6 +1438,21 @@ async def test_stale_plan_artifacts_trigger_targeted_plan_refresh( | ||
| 1438 | 1438 | entry.reason_code == "plan_refresh_completed" |
| 1439 | 1439 | for entry in run.agent.last_turn_summary.workflow_timeline |
| 1440 | 1440 | ) |
| 1441 | + refresh_prompt = next( | |
| 1442 | + invocation.messages[-1].content | |
| 1443 | + for invocation in backend.invocations | |
| 1444 | + if "Refresh the existing planning artifacts instead of creating a fresh plan from scratch." | |
| 1445 | + in invocation.messages[-1].content | |
| 1446 | + ) | |
| 1447 | + assert "Current execution progress:" in refresh_prompt | |
| 1448 | + assert "Already touched during execution:" in refresh_prompt | |
| 1449 | + assert f"- {target}" in refresh_prompt | |
| 1450 | + assert any( | |
| 1451 | + "Plan refresh preserved the progress already made." in message.content | |
| 1452 | + and "Do not restart from initial discovery" in message.content | |
| 1453 | + for invocation in backend.invocations | |
| 1454 | + for message in invocation.messages | |
| 1455 | + ) | |
| 1441 | 1456 | |
| 1442 | 1457 | |
| 1443 | 1458 | @pytest.mark.asyncio |