Strengthen qwen progress and verification handoffs
- SHA: 297e213d29e61587549386472439e606500e8b69 (short: 297e213)
- Parents: 918941f
- Tree: 7288310

| Status | File | + | - |
|---|---|---|---|
| M | src/loader/runtime/artifact_invalidation.py | 40 | 3 |
| M | src/loader/runtime/dod.py | 28 | 0 |
| M | src/loader/runtime/finalization.py | 1 | 0 |
| M | src/loader/runtime/tool_batches.py | 34 | 1 |
| M | src/loader/runtime/workflow.py | 170 | 1 |
| M | src/loader/runtime/workflow_lanes.py | 44 | 0 |
| M | src/loader/runtime/workflow_recovery.py | 19 | 0 |
| M | tests/test_artifact_invalidation.py | 25 | 0 |
| M | tests/test_dod.py | 23 | 0 |
| M | tests/test_finalization.py | 47 | 0 |
| M | tests/test_tool_batches.py | 95 | 0 |
| M | tests/test_workflow.py | 151 | 0 |
| M | tests/test_workflow_runtime.py | 15 | 0 |
src/loader/runtime/artifact_invalidation.py (modified)
@@ -49,13 +49,18 @@ class ArtifactInvalidationAssessor: | |||
| 49 | unexpected_paths = [ | 49 | unexpected_paths = [ |
| 50 | name | 50 | name |
| 51 | for path in touched_files | 51 | for path in touched_files |
| 52 | - if (name := _path_name(path)) and name.lower() not in plan_text | 52 | + if (name := _path_name(path)) and not _text_covers_path_reference(plan_text, path) |
| 53 | ] | 53 | ] |
| 54 | confirmed_touchpoints = [ | 54 | confirmed_touchpoints = [ |
| 55 | name | 55 | name |
| 56 | for path in touched_files | 56 | for path in touched_files |
| 57 | if (name := _path_name(path)) | 57 | if (name := _path_name(path)) |
| 58 | ] | 58 | ] |
| 59 | + confirmed_touchpoint_keys = { | ||
| 60 | + _path_reference_identity(path) | ||
| 61 | + for path in touched_files | ||
| 62 | + if _path_reference_identity(path) | ||
| 63 | + } | ||
| 59 | inferred_touchpoints = [ | 64 | inferred_touchpoints = [ |
| 60 | item | 65 | item |
| 61 | for item in _extract_path_mentions( | 66 | for item in _extract_path_mentions( |
@@ -63,7 +68,7 @@ class ArtifactInvalidationAssessor: | |||
| 63 | implementation_text, | 68 | implementation_text, |
| 64 | verification_text, | 69 | verification_text, |
| 65 | ) | 70 | ) |
| 66 | - if _path_name(item) not in confirmed_touchpoints | 71 | + if _path_reference_identity(item) not in confirmed_touchpoint_keys |
| 67 | ] | 72 | ] |
| 68 | stale_plan = False | 73 | stale_plan = False |
| 69 | stale_brief = False | 74 | stale_brief = False |
@@ -147,7 +152,11 @@ class ArtifactInvalidationAssessor: | |||
| 147 | ) | 152 | ) |
| 148 | 153 | ||
| 149 | out_of_brief_paths = [ | 154 | out_of_brief_paths = [ |
| 150 | - name for name in unexpected_paths if name.lower() not in brief_text | 155 | + name |
| 156 | + for path in touched_files | ||
| 157 | + if (name := _path_name(path)) | ||
| 158 | + and name in unexpected_paths | ||
| 159 | + and not _text_covers_path_reference(brief_text, path) | ||
| 151 | ] | 160 | ] |
| 152 | if out_of_brief_paths: | 161 | if out_of_brief_paths: |
| 153 | stale_brief = True | 162 | stale_brief = True |
@@ -200,6 +209,34 @@ def _path_name(path: str) -> str: | |||
| 200 | return normalized.rsplit("/", maxsplit=1)[-1].strip() | 209 | return normalized.rsplit("/", maxsplit=1)[-1].strip() |
| 201 | 210 | ||
| 202 | 211 | ||
| 212 | +def _path_reference_identity(path: str) -> str: | ||
| 213 | + normalized = _path_name(path) | ||
| 214 | + if not normalized: | ||
| 215 | + return "" | ||
| 216 | + return _canonical_path_reference(normalized) | ||
| 217 | + | ||
| 218 | + | ||
| 219 | +def _text_covers_path_reference(text: str, path: str) -> bool: | ||
| 220 | + normalized_text = text.lower() | ||
| 221 | + candidates = [candidate for candidate in (str(path).strip(), _path_name(path)) if candidate] | ||
| 222 | + | ||
| 223 | + for candidate in candidates: | ||
| 224 | + if candidate.lower() in normalized_text: | ||
| 225 | + return True | ||
| 226 | + | ||
| 227 | + canonical_text = _canonical_path_reference(text) | ||
| 228 | + return any( | ||
| 229 | + canonical_candidate and canonical_candidate in canonical_text | ||
| 230 | + for canonical_candidate in (_canonical_path_reference(candidate) for candidate in candidates) | ||
| 231 | + ) | ||
| 232 | + | ||
| 233 | + | ||
| 234 | +def _canonical_path_reference(value: str) -> str: | ||
| 235 | + normalized = value.lower().strip() | ||
| 236 | + normalized = re.sub(r"[^a-z0-9]+", " ", normalized) | ||
| 237 | + return " ".join(normalized.split()) | ||
| 238 | + | ||
| 239 | + | ||
| 203 | def _text_covers_requirement(text: str, requirement: str) -> bool: | 240 | def _text_covers_requirement(text: str, requirement: str) -> bool: |
| 204 | normalized_text = text.lower() | 241 | normalized_text = text.lower() |
| 205 | normalized_requirement = requirement.lower() | 242 | normalized_requirement = requirement.lower() |
src/loader/runtime/dod.py (modified)
@@ -208,6 +208,7 @@ def derive_verification_commands( | |||
| 208 | *, | 208 | *, |
| 209 | project_root: Path, | 209 | project_root: Path, |
| 210 | task_statement: str, | 210 | task_statement: str, |
| 211 | + supplement_existing: bool = False, | ||
| 211 | ) -> list[str]: | 212 | ) -> list[str]: |
| 212 | """Generate verification commands from execution history and project shape.""" | 213 | """Generate verification commands from execution history and project shape.""" |
| 213 | 214 | ||
@@ -234,6 +235,8 @@ def derive_verification_commands( | |||
| 234 | 235 | ||
| 235 | if commands: | 236 | if commands: |
| 236 | return commands | 237 | return commands |
| 238 | + if supplement_existing: | ||
| 239 | + return commands | ||
| 237 | 240 | ||
| 238 | if dod.task_size == "small": | 241 | if dod.task_size == "small": |
| 239 | for path_str in dod.touched_files[:3]: | 242 | for path_str in dod.touched_files[:3]: |
@@ -245,6 +248,11 @@ def derive_verification_commands( | |||
| 245 | commands, | 248 | commands, |
| 246 | f"python -m py_compile {shlex.quote(str(effective_path))}", | 249 | f"python -m py_compile {shlex.quote(str(effective_path))}", |
| 247 | ) | 250 | ) |
| 251 | + elif _uses_external_artifacts_only(dod, project_root=project_root): | ||
| 252 | + for path_str in dod.touched_files[:3]: | ||
| 253 | + path = Path(path_str) | ||
| 254 | + effective_path = path if path.is_absolute() else (project_root / path) | ||
| 255 | + _append_unique(commands, f"test -f {shlex.quote(str(effective_path))}") | ||
| 248 | else: | 256 | else: |
| 249 | if (project_root / "pyproject.toml").exists(): | 257 | if (project_root / "pyproject.toml").exists(): |
| 250 | _append_unique(commands, "uv run pytest -q") | 258 | _append_unique(commands, "uv run pytest -q") |
@@ -407,6 +415,26 @@ def _append_unique(items: list[str], value: str) -> None: | |||
| 407 | items.append(value) | 415 | items.append(value) |
| 408 | 416 | ||
| 409 | 417 | ||
| 418 | +def _uses_external_artifacts_only(dod: DefinitionOfDone, *, project_root: Path) -> bool: | ||
| 419 | + touched = [Path(path) for path in dod.touched_files if str(path).strip()] | ||
| 420 | + if not touched: | ||
| 421 | + return False | ||
| 422 | + try: | ||
| 423 | + root = project_root.resolve() | ||
| 424 | + except FileNotFoundError: | ||
| 425 | + root = project_root | ||
| 426 | + external = [path for path in touched if not _path_is_within_root(path, root)] | ||
| 427 | + return bool(external) and len(external) == len(touched) | ||
| 428 | + | ||
| 429 | + | ||
| 430 | +def _path_is_within_root(path: Path, root: Path) -> bool: | ||
| 431 | + try: | ||
| 432 | + path.resolve().relative_to(root) | ||
| 433 | + return True | ||
| 434 | + except ValueError: | ||
| 435 | + return False | ||
| 436 | + | ||
| 437 | + | ||
| 410 | def synthesize_todo_items(dod: DefinitionOfDone) -> list[dict[str, str]]: | 438 | def synthesize_todo_items(dod: DefinitionOfDone) -> list[dict[str, str]]: |
| 411 | """Build a todo item list from the current DoD state. | 439 | """Build a todo item list from the current DoD state. |
| 412 | 440 | ||
src/loader/runtime/finalization.py (modified)
@@ -294,6 +294,7 @@ class TurnFinalizer: | |||
| 294 | dod, | 294 | dod, |
| 295 | project_root=self.context.project_root, | 295 | project_root=self.context.project_root, |
| 296 | task_statement=dod.task_statement, | 296 | task_statement=dod.task_statement, |
| 297 | + supplement_existing=True, | ||
| 297 | ): | 298 | ): |
| 298 | if command not in dod.verification_commands: | 299 | if command not in dod.verification_commands: |
| 299 | dod.verification_commands.append(command) | 300 | dod.verification_commands.append(command) |
src/loader/runtime/tool_batches.py (modified)
@@ -46,6 +46,19 @@ _TODO_NUDGE_EXCLUDED_ITEMS = { | |||
| 46 | "Complete the requested work", | 46 | "Complete the requested work", |
| 47 | _VERIFY_ITEM, | 47 | _VERIFY_ITEM, |
| 48 | } | 48 | } |
| 49 | +_MUTATION_TODO_HINTS = ( | ||
| 50 | + "create", | ||
| 51 | + "update", | ||
| 52 | + "edit", | ||
| 53 | + "write", | ||
| 54 | + "fix", | ||
| 55 | + "modify", | ||
| 56 | + "change", | ||
| 57 | + "patch", | ||
| 58 | + "replace", | ||
| 59 | + "correct", | ||
| 60 | + "rewrite", | ||
| 61 | +) | ||
| 49 | 62 | ||
| 50 | 63 | ||
| 51 | @dataclass | 64 | @dataclass |
@@ -290,18 +303,26 @@ class ToolBatchRunner: | |||
| 290 | max_items=2, | 303 | max_items=2, |
| 291 | ) | 304 | ) |
| 292 | if next_pending and not html_toc_rule.task_targets_html_toc(current_task): | 305 | if next_pending and not html_toc_rule.task_targets_html_toc(current_task): |
| 306 | + mutation_suffix = "" | ||
| 307 | + if _todo_is_mutation_step(next_pending): | ||
| 308 | + mutation_suffix = ( | ||
| 309 | + " You already have enough evidence for that step, so stop gathering " | ||
| 310 | + "more reference material and perform the change now." | ||
| 311 | + ) | ||
| 293 | if confirmed_facts: | 312 | if confirmed_facts: |
| 294 | self.context.queue_steering_message( | 313 | self.context.queue_steering_message( |
| 295 | "Reuse the earlier observation instead of repeating it. " | 314 | "Reuse the earlier observation instead of repeating it. " |
| 296 | f"Confirmed facts: {confirmed_facts}. " | 315 | f"Confirmed facts: {confirmed_facts}. " |
| 297 | f"Continue with the next pending item: `{next_pending}`. " | 316 | f"Continue with the next pending item: `{next_pending}`. " |
| 298 | "Only gather more evidence if a specific fact required for that step is still unknown." | 317 | "Only gather more evidence if a specific fact required for that step is still unknown." |
| 318 | + + mutation_suffix | ||
| 299 | ) | 319 | ) |
| 300 | else: | 320 | else: |
| 301 | self.context.queue_steering_message( | 321 | self.context.queue_steering_message( |
| 302 | "Reuse the earlier observation instead of repeating it. " | 322 | "Reuse the earlier observation instead of repeating it. " |
| 303 | f"Continue with the next pending item: `{next_pending}`. " | 323 | f"Continue with the next pending item: `{next_pending}`. " |
| 304 | "Only gather more evidence if a specific fact required for that step is still unknown." | 324 | "Only gather more evidence if a specific fact required for that step is still unknown." |
| 325 | + + mutation_suffix | ||
| 305 | ) | 326 | ) |
| 306 | return | 327 | return |
| 307 | 328 | ||
@@ -752,10 +773,17 @@ class ToolBatchRunner: | |||
| 752 | if not completed_label or not next_pending or next_pending == completed_label: | 773 | if not completed_label or not next_pending or next_pending == completed_label: |
| 753 | return | 774 | return |
| 754 | 775 | ||
| 776 | + mutation_suffix = "" | ||
| 777 | + if _todo_is_mutation_step(next_pending): | ||
| 778 | + mutation_suffix = ( | ||
| 779 | + " You already have enough evidence for that step, so stop gathering " | ||
| 780 | + "more reference material and perform the change now." | ||
| 781 | + ) | ||
| 782 | + | ||
| 755 | self.context.queue_steering_message( | 783 | self.context.queue_steering_message( |
| 756 | f"Confirmed progress: `{completed_label}` is now satisfied by the successful " | 784 | f"Confirmed progress: `{completed_label}` is now satisfied by the successful " |
| 757 | f"`{tool_call.name}` result. Continue with the next pending item: " | 785 | f"`{tool_call.name}` result. Continue with the next pending item: " |
| 758 | - f"`{next_pending}` instead of rereading the same evidence." | 786 | + f"`{next_pending}` instead of rereading the same evidence.{mutation_suffix}" |
| 759 | ) | 787 | ) |
| 760 | 788 | ||
| 761 | 789 | ||
@@ -795,6 +823,11 @@ def _mark_verification_stale( | |||
| 795 | dod.pending_items.append(_VERIFY_ITEM) | 823 | dod.pending_items.append(_VERIFY_ITEM) |
| 796 | 824 | ||
| 797 | 825 | ||
| 826 | +def _todo_is_mutation_step(label: str) -> bool: | ||
| 827 | + lowered = label.lower() | ||
| 828 | + return any(token in lowered for token in _MUTATION_TODO_HINTS) | ||
| 829 | + | ||
| 830 | + | ||
| 798 | def _mark_verification_planned( | 831 | def _mark_verification_planned( |
| 799 | *, | 832 | *, |
| 800 | context: RuntimeContext, | 833 | context: RuntimeContext, |
src/loader/runtime/workflow.py (modified)
@@ -125,6 +125,75 @@ _VERIFY_STEP_HINTS = ( | |||
| 125 | "confirm", | 125 | "confirm", |
| 126 | "check", | 126 | "check", |
| 127 | ) | 127 | ) |
| 128 | +_AGGREGATE_TODO_HINTS = ( | ||
| 129 | + "each ", | ||
| 130 | + "all ", | ||
| 131 | + "every ", | ||
| 132 | + "sequence", | ||
| 133 | + "multiple ", | ||
| 134 | + "across ", | ||
| 135 | + "consistently", | ||
| 136 | + "properly linked", | ||
| 137 | + "directory structure", | ||
| 138 | +) | ||
| 139 | +_ACTIONABLE_STEP_VERBS = { | ||
| 140 | + "add", | ||
| 141 | + "apply", | ||
| 142 | + "build", | ||
| 143 | + "check", | ||
| 144 | + "confirm", | ||
| 145 | + "create", | ||
| 146 | + "document", | ||
| 147 | + "edit", | ||
| 148 | + "ensure", | ||
| 149 | + "fix", | ||
| 150 | + "implement", | ||
| 151 | + "inspect", | ||
| 152 | + "list", | ||
| 153 | + "move", | ||
| 154 | + "parse", | ||
| 155 | + "patch", | ||
| 156 | + "read", | ||
| 157 | + "refactor", | ||
| 158 | + "remove", | ||
| 159 | + "rename", | ||
| 160 | + "reorder", | ||
| 161 | + "rerun", | ||
| 162 | + "re-run", | ||
| 163 | + "review", | ||
| 164 | + "run", | ||
| 165 | + "search", | ||
| 166 | + "test", | ||
| 167 | + "update", | ||
| 168 | + "validate", | ||
| 169 | + "verify", | ||
| 170 | + "write", | ||
| 171 | +} | ||
| 172 | +_RETROSPECTIVE_STEP_VERBS = { | ||
| 173 | + "added", | ||
| 174 | + "applied", | ||
| 175 | + "built", | ||
| 176 | + "checked", | ||
| 177 | + "completed", | ||
| 178 | + "confirmed", | ||
| 179 | + "created", | ||
| 180 | + "edited", | ||
| 181 | + "ensured", | ||
| 182 | + "examined", | ||
| 183 | + "generated", | ||
| 184 | + "implemented", | ||
| 185 | + "inspected", | ||
| 186 | + "listed", | ||
| 187 | + "looked", | ||
| 188 | + "parsed", | ||
| 189 | + "patched", | ||
| 190 | + "read", | ||
| 191 | + "reviewed", | ||
| 192 | + "updated", | ||
| 193 | + "validated", | ||
| 194 | + "verified", | ||
| 195 | + "wrote", | ||
| 196 | +} | ||
| 128 | _TASK_COVERAGE_STOP_WORDS = { | 197 | _TASK_COVERAGE_STOP_WORDS = { |
| 129 | "the", | 198 | "the", |
| 130 | "and", | 199 | "and", |
@@ -491,6 +560,41 @@ class PlanningArtifacts: | |||
| 491 | implementation_steps=list(self.implementation_steps), | 560 | implementation_steps=list(self.implementation_steps), |
| 492 | ) | 561 | ) |
| 493 | 562 | ||
| 563 | + def with_progress_context( | ||
| 564 | + self, | ||
| 565 | + *, | ||
| 566 | + touched_files: list[str], | ||
| 567 | + completed_items: list[str], | ||
| 568 | + ) -> PlanningArtifacts: | ||
| 569 | + """Return one copy that preserves already-confirmed execution progress.""" | ||
| 570 | + | ||
| 571 | + progress_items: list[str] = [] | ||
| 572 | + for raw_path in touched_files: | ||
| 573 | + path_text = str(raw_path).strip() | ||
| 574 | + if not path_text: | ||
| 575 | + continue | ||
| 576 | + progress_items.append(f"Already touched during execution: `{path_text}`.") | ||
| 577 | + for raw_item in completed_items: | ||
| 578 | + item = str(raw_item).strip() | ||
| 579 | + if not item or item in _SPECIAL_TODO_ITEMS: | ||
| 580 | + continue | ||
| 581 | + progress_items.append(f"Already completed during execution: {item}.") | ||
| 582 | + | ||
| 583 | + if not progress_items: | ||
| 584 | + return self | ||
| 585 | + | ||
| 586 | + return PlanningArtifacts( | ||
| 587 | + implementation_markdown=_replace_markdown_section_items( | ||
| 588 | + self.implementation_markdown, | ||
| 589 | + "Confirmed Progress", | ||
| 590 | + list(dict.fromkeys(progress_items)), | ||
| 591 | + ), | ||
| 592 | + verification_markdown=self.verification_markdown, | ||
| 593 | + verification_commands=list(self.verification_commands), | ||
| 594 | + acceptance_criteria=list(self.acceptance_criteria), | ||
| 595 | + implementation_steps=list(self.implementation_steps), | ||
| 596 | + ) | ||
| 597 | + | ||
| 494 | 598 | ||
| 495 | class WorkflowArtifactStore: | 599 | class WorkflowArtifactStore: |
| 496 | """Persist briefs and plans under `.loader/`.""" | 600 | """Persist briefs and plans under `.loader/`.""" |
@@ -627,6 +731,15 @@ def merge_refreshed_todos_with_existing_scope( | |||
| 627 | and item not in _SPECIAL_TODO_ITEMS | 731 | and item not in _SPECIAL_TODO_ITEMS |
| 628 | and _task_text_covers_requirement(task_statement, item) | 732 | and _task_text_covers_requirement(task_statement, item) |
| 629 | ] | 733 | ] |
| 734 | + refreshed_candidates = [ | ||
| 735 | + item.strip() | ||
| 736 | + for item in refreshed_steps | ||
| 737 | + if item.strip() | ||
| 738 | + and ( | ||
| 739 | + not (grounded_completed or grounded_pending) | ||
| 740 | + or _looks_actionable_refresh_step(item) | ||
| 741 | + ) | ||
| 742 | + ] | ||
| 630 | 743 | ||
| 631 | todos: list[dict[str, str]] = [] | 744 | todos: list[dict[str, str]] = [] |
| 632 | seen: set[str] = set() | 745 | seen: set[str] = set() |
@@ -641,7 +754,7 @@ def merge_refreshed_todos_with_existing_scope( | |||
| 641 | "status": "completed", | 754 | "status": "completed", |
| 642 | } | 755 | } |
| 643 | ) | 756 | ) |
| 644 | - for item in [*grounded_pending, *refreshed_steps]: | 757 | + for item in [*grounded_pending, *refreshed_candidates]: |
| 645 | label = item.strip() | 758 | label = item.strip() |
| 646 | if not label or label in seen: | 759 | if not label or label in seen: |
| 647 | continue | 760 | continue |
@@ -740,7 +853,14 @@ def _todo_progress_score(item: str, tool_call: ToolCall) -> int: | |||
| 740 | elif _looks_like_read_command(command): | 853 | elif _looks_like_read_command(command): |
| 741 | if _contains_any(text, _READ_STEP_HINTS): | 854 | if _contains_any(text, _READ_STEP_HINTS): |
| 742 | score += 2 | 855 | score += 2 |
| 856 | + elif _looks_like_fs_mutation_command(command): | ||
| 857 | + if _contains_any(text, _MUTATION_STEP_HINTS): | ||
| 858 | + score += 3 | ||
| 859 | + if "directory" in text and "mkdir" in command: | ||
| 860 | + score += 2 | ||
| 743 | elif name in {"write", "edit", "patch"}: | 861 | elif name in {"write", "edit", "patch"}: |
| 862 | + if _todo_describes_aggregate_mutation(text) and basename and basename not in text: | ||
| 863 | + return 0 | ||
| 744 | if _contains_any(text, _MUTATION_STEP_HINTS): | 864 | if _contains_any(text, _MUTATION_STEP_HINTS): |
| 745 | score += 3 | 865 | score += 3 |
| 746 | 866 | ||
@@ -753,6 +873,13 @@ def _contains_any(text: str, candidates: tuple[str, ...]) -> bool: | |||
| 753 | return any(candidate in text for candidate in candidates) | 873 | return any(candidate in text for candidate in candidates) |
| 754 | 874 | ||
| 755 | 875 | ||
| 876 | +def _todo_describes_aggregate_mutation(text: str) -> bool: | ||
| 877 | + return _contains_any(text, _AGGREGATE_TODO_HINTS) and _contains_any( | ||
| 878 | + text, | ||
| 879 | + _MUTATION_STEP_HINTS, | ||
| 880 | + ) | ||
| 881 | + | ||
| 882 | + | ||
| 756 | def _looks_like_search_command(command: str) -> bool: | 883 | def _looks_like_search_command(command: str) -> bool: |
| 757 | return any(token in command for token in (" ls", "ls ", "find ", "rg ", "grep ", "glob ")) | 884 | return any(token in command for token in (" ls", "ls ", "find ", "rg ", "grep ", "glob ")) |
| 758 | 885 | ||
@@ -781,6 +908,27 @@ def _looks_like_verification_command(command: str) -> bool: | |||
| 781 | ) | 908 | ) |
| 782 | 909 | ||
| 783 | 910 | ||
| 911 | +def _looks_like_fs_mutation_command(command: str) -> bool: | ||
| 912 | + stripped = command.strip() | ||
| 913 | + return any( | ||
| 914 | + stripped.startswith(prefix) | ||
| 915 | + for prefix in ( | ||
| 916 | + "mkdir ", | ||
| 917 | + "mkdir\t", | ||
| 918 | + "touch ", | ||
| 919 | + "touch\t", | ||
| 920 | + "cp ", | ||
| 921 | + "cp\t", | ||
| 922 | + "mv ", | ||
| 923 | + "mv\t", | ||
| 924 | + "ln ", | ||
| 925 | + "ln\t", | ||
| 926 | + "install ", | ||
| 927 | + "install\t", | ||
| 928 | + ) | ||
| 929 | + ) | ||
| 930 | + | ||
| 931 | + | ||
| 784 | def extract_verification_commands_from_markdown(markdown: str) -> list[str]: | 932 | def extract_verification_commands_from_markdown(markdown: str) -> list[str]: |
| 785 | """Extract verification commands from a verification-plan markdown document.""" | 933 | """Extract verification commands from a verification-plan markdown document.""" |
| 786 | 934 | ||
@@ -1057,6 +1205,27 @@ def _requirement_describes_output_scope(requirement: str) -> bool: | |||
| 1057 | ) | 1205 | ) |
| 1058 | 1206 | ||
| 1059 | 1207 | ||
| 1208 | +def _looks_actionable_refresh_step(step: str) -> bool: | ||
| 1209 | + normalized = step.strip() | ||
| 1210 | + if not normalized: | ||
| 1211 | + return False | ||
| 1212 | + if re.fullmatch(r"(?:[\w.-]+/)*[\w.-]+\.[A-Za-z0-9]+", normalized): | ||
| 1213 | + return False | ||
| 1214 | + | ||
| 1215 | + lowered = normalized.lower() | ||
| 1216 | + lowered = re.sub(r"^(?:first|next|then|finally|afterward|afterwards)\b[,:]?\s*", "", lowered) | ||
| 1217 | + first_word_match = re.match(r"^[a-z-]+", lowered) | ||
| 1218 | + if first_word_match is None: | ||
| 1219 | + return False | ||
| 1220 | + | ||
| 1221 | + first_word = first_word_match.group(0) | ||
| 1222 | + if first_word in _RETROSPECTIVE_STEP_VERBS: | ||
| 1223 | + return False | ||
| 1224 | + if first_word in _ACTIONABLE_STEP_VERBS: | ||
| 1225 | + return True | ||
| 1226 | + return False | ||
| 1227 | + | ||
| 1228 | + | ||
| 1060 | def _mark_explicit_section(brief: ClarifyBrief, section: str) -> None: | 1229 | def _mark_explicit_section(brief: ClarifyBrief, section: str) -> None: |
| 1061 | if section in brief.explicit_sections: | 1230 | if section in brief.explicit_sections: |
| 1062 | return | 1231 | return |
src/loader/runtime/workflow_lanes.py (modified)
@@ -208,6 +208,10 @@ class WorkflowLaneRunner: | |||
| 208 | refreshed_acceptance_criteria=list(artifacts.acceptance_criteria), | 208 | refreshed_acceptance_criteria=list(artifacts.acceptance_criteria), |
| 209 | ) | 209 | ) |
| 210 | artifacts = artifacts.with_acceptance_criteria(preserved_acceptance) | 210 | artifacts = artifacts.with_acceptance_criteria(preserved_acceptance) |
| 211 | + artifacts = artifacts.with_progress_context( | ||
| 212 | + touched_files=list(dod.touched_files), | ||
| 213 | + completed_items=list(dod.completed_items), | ||
| 214 | + ) | ||
| 211 | implementation_path, verification_path = self.artifact_store.write_plan( | 215 | implementation_path, verification_path = self.artifact_store.write_plan( |
| 212 | task, | 216 | task, |
| 213 | artifacts, | 217 | artifacts, |
@@ -610,6 +614,41 @@ class WorkflowLaneRunner: | |||
| 610 | 614 | ||
| 611 | refresh_block = "" | 615 | refresh_block = "" |
| 612 | if refresh_reasons: | 616 | if refresh_reasons: |
| 617 | + progress_lines: list[str] = [] | ||
| 618 | + touched = [str(path).strip() for path in dod.touched_files if str(path).strip()] | ||
| 619 | + completed = [ | ||
| 620 | + item.strip() | ||
| 621 | + for item in dod.completed_items | ||
| 622 | + if item.strip() | ||
| 623 | + and item not in {"Complete the requested work", "Collect verification evidence"} | ||
| 624 | + ] | ||
| 625 | + pending = [ | ||
| 626 | + item.strip() | ||
| 627 | + for item in dod.pending_items | ||
| 628 | + if item.strip() | ||
| 629 | + and item not in {"Complete the requested work", "Collect verification evidence"} | ||
| 630 | + ] | ||
| 631 | + if touched: | ||
| 632 | + progress_lines.extend( | ||
| 633 | + [ | ||
| 634 | + "Already touched during execution:", | ||
| 635 | + *[f"- {item}" for item in touched[:12]], | ||
| 636 | + ] | ||
| 637 | + ) | ||
| 638 | + if completed: | ||
| 639 | + progress_lines.extend( | ||
| 640 | + [ | ||
| 641 | + "Already completed work:", | ||
| 642 | + *[f"- {item}" for item in completed[:12]], | ||
| 643 | + ] | ||
| 644 | + ) | ||
| 645 | + if pending: | ||
| 646 | + progress_lines.extend( | ||
| 647 | + [ | ||
| 648 | + "Still pending:", | ||
| 649 | + *[f"- {item}" for item in pending[:12]], | ||
| 650 | + ] | ||
| 651 | + ) | ||
| 613 | refresh_block = ( | 652 | refresh_block = ( |
| 614 | "Refresh the existing planning artifacts instead of creating a fresh plan " | 653 | "Refresh the existing planning artifacts instead of creating a fresh plan " |
| 615 | "from scratch.\n" | 654 | "from scratch.\n" |
@@ -619,6 +658,11 @@ class WorkflowLaneRunner: | |||
| 619 | "artifact.\n" | 658 | "artifact.\n" |
| 620 | "Use the current task state and these recovery reasons:\n" | 659 | "Use the current task state and these recovery reasons:\n" |
| 621 | + "\n".join(f"- {item}" for item in refresh_reasons) | 660 | + "\n".join(f"- {item}" for item in refresh_reasons) |
| 661 | + + ( | ||
| 662 | + ("\n\nCurrent execution progress:\n" + "\n".join(progress_lines)) | ||
| 663 | + if progress_lines | ||
| 664 | + else "" | ||
| 665 | + ) | ||
| 622 | + "\n\n" | 666 | + "\n\n" |
| 623 | ) | 667 | ) |
| 624 | 668 | ||
src/loader/runtime/workflow_recovery.py (modified)
@@ -29,6 +29,10 @@ UserQuestionHandler = Callable[[str, list[str] | None], Awaitable[str]] | None | |||
| 29 | WorkflowModeSetter = Callable[..., Awaitable[None]] | 29 | WorkflowModeSetter = Callable[..., Awaitable[None]] |
| 30 | TimelineAppender = Callable[..., None] | 30 | TimelineAppender = Callable[..., None] |
| 31 | BridgeAppender = Callable[[DefinitionOfDone], None] | 31 | BridgeAppender = Callable[[DefinitionOfDone], None] |
| 32 | +_RECOVERY_TODO_EXCLUDED_ITEMS = { | ||
| 33 | + "Complete the requested work", | ||
| 34 | + "Collect verification evidence", | ||
| 35 | +} | ||
| 32 | 36 | ||
| 33 | 37 | ||
| 34 | class WorkflowRecoveryController: | 38 | class WorkflowRecoveryController: |
@@ -186,6 +190,21 @@ class WorkflowRecoveryController: | |||
| 186 | summary=summary, | 190 | summary=summary, |
| 187 | ) | 191 | ) |
| 188 | self.append_execute_bridge(dod) | 192 | self.append_execute_bridge(dod) |
| 193 | + next_pending = next( | ||
| 194 | + ( | ||
| 195 | + item | ||
| 196 | + for item in dod.pending_items | ||
| 197 | + if item not in _RECOVERY_TODO_EXCLUDED_ITEMS | ||
| 198 | + ), | ||
| 199 | + None, | ||
| 200 | + ) | ||
| 201 | + if next_pending: | ||
| 202 | + self.context.queue_steering_message( | ||
| 203 | + "Plan refresh preserved the progress already made. " | ||
| 204 | + f"Reuse the existing files and confirmed facts, then continue with the next " | ||
| 205 | + f"pending item: `{next_pending}`. " | ||
| 206 | + "Do not restart from initial discovery unless a specific missing fact blocks that step." | ||
| 207 | + ) | ||
| 189 | return True | 208 | return True |
| 190 | 209 | ||
| 191 | async def _run_clarify_reentry_for_drift( | 210 | async def _run_clarify_reentry_for_drift( |
tests/test_artifact_invalidation.py (modified)
@@ -67,3 +67,28 @@ def test_artifact_invalidation_can_force_full_replan_when_brief_and_plan_drift() | |||
| 67 | for item in freshness.evidence | 67 | for item in freshness.evidence |
| 68 | ) | 68 | ) |
| 69 | assert freshness.evidence_summary | 69 | assert freshness.evidence_summary |
| 70 | + | ||
| 71 | + | ||
| 72 | +def test_artifact_invalidation_treats_path_separator_variants_as_same_touchpoint() -> None: | ||
| 73 | + assessor = ArtifactInvalidationAssessor() | ||
| 74 | + | ||
| 75 | + freshness = assessor.assess( | ||
| 76 | + task_statement="Build a multi-file nginx guide.", | ||
| 77 | + clarify_text=None, | ||
| 78 | + implementation_text=( | ||
| 79 | + "# Implementation Plan\n" | ||
| 80 | + "- Create 01-getting-started.html in the chapters directory.\n" | ||
| 81 | + ), | ||
| 82 | + verification_text=( | ||
| 83 | + "# Verification Plan\n" | ||
| 84 | + "## Acceptance Criteria\n" | ||
| 85 | + "- 01-getting-started.html exists.\n" | ||
| 86 | + ), | ||
| 87 | + acceptance_criteria=["01-getting-started.html exists."], | ||
| 88 | + touched_files=["/tmp/chapters/01_getting_started.html"], | ||
| 89 | + last_verification_result=None, | ||
| 90 | + ) | ||
| 91 | + | ||
| 92 | + assert freshness.stale_plan is False | ||
| 93 | + assert freshness.stale_brief is False | ||
| 94 | + assert "touched_files_outside_plan" not in freshness.reason_codes | ||
tests/test_dod.py (modified)
@@ -143,6 +143,29 @@ def test_derive_verification_commands_adds_semantic_html_toc_check(tmp_path: Pat | |||
| 143 | assert not any(command == f"test -f {index}" for command in commands) | 143 | assert not any(command == f"test -f {index}" for command in commands) |
| 144 | 144 | ||
| 145 | 145 | ||
| 146 | +def test_derive_verification_commands_avoids_repo_defaults_for_external_artifacts( | ||
| 147 | + tmp_path: Path, | ||
| 148 | +) -> None: | ||
| 149 | + (tmp_path / "pyproject.toml").write_text("[project]\nname='loader'\n") | ||
| 150 | + (tmp_path / "package.json").write_text("{}\n") | ||
| 151 | + external_root = tmp_path.parent / "external-guide" | ||
| 152 | + external_root.mkdir(exist_ok=True) | ||
| 153 | + external_index = external_root / "index.html" | ||
| 154 | + external_index.write_text("<html></html>\n") | ||
| 155 | + | ||
| 156 | + dod = create_definition_of_done("Create an external nginx guide.") | ||
| 157 | + dod.task_size = "standard" | ||
| 158 | + dod.touched_files = [str(external_index)] | ||
| 159 | + | ||
| 160 | + commands = derive_verification_commands( | ||
| 161 | + dod, | ||
| 162 | + project_root=tmp_path, | ||
| 163 | + task_statement=dod.task_statement, | ||
| 164 | + ) | ||
| 165 | + | ||
| 166 | + assert commands == [f"test -f {external_index}"] | ||
| 167 | + | ||
| 168 | + | ||
| 146 | def test_build_verification_summary_keeps_concrete_missing_link_details() -> None: | 169 | def test_build_verification_summary_keeps_concrete_missing_link_details() -> None: |
| 147 | summary = build_verification_summary( | 170 | summary = build_verification_summary( |
| 148 | [ | 171 | [ |
tests/test_finalization.py (modified)
@@ -434,6 +434,53 @@ async def test_turn_finalizer_appends_runtime_semantic_verifier_to_planned_comma | |||
| 434 | ) | 434 | ) |
| 435 | 435 | ||
| 436 | 436 | ||
| 437 | +@pytest.mark.asyncio | ||
| 438 | +async def test_turn_finalizer_does_not_append_repo_defaults_to_external_verification_plan( | ||
| 439 | + temp_dir: Path, | ||
| 440 | +) -> None: | ||
| 441 | + (temp_dir / "pyproject.toml").write_text("[project]\nname='loader'\n") | ||
| 442 | + (temp_dir / "package.json").write_text("{}\n") | ||
| 443 | + external_root = temp_dir.parent / "external-nginx-guide" | ||
| 444 | + external_root.mkdir(exist_ok=True) | ||
| 445 | + external_index = external_root / "index.html" | ||
| 446 | + external_index.write_text("<html></html>\n") | ||
| 447 | + | ||
| 448 | + session = FakeSession() | ||
| 449 | + context = build_context(temp_dir, session) | ||
| 450 | + finalizer = TurnFinalizer( | ||
| 451 | + context, | ||
| 452 | + RuntimeTracer(), | ||
| 453 | + DefinitionOfDoneStore(temp_dir), | ||
| 454 | + set_workflow_mode=_noop_set_workflow_mode, | ||
| 455 | + ) | ||
| 456 | + dod = create_definition_of_done("Create an external nginx guide.") | ||
| 457 | + dod.mutating_actions.append("write") | ||
| 458 | + dod.touched_files.append(str(external_index)) | ||
| 459 | + dod.verification_commands = [ | ||
| 460 | + f"ls -la {external_root}", | ||
| 461 | + f"grep -n \"html\" {external_index}", | ||
| 462 | + ] | ||
| 463 | + summary = TurnSummary(final_response="") | ||
| 464 | + executor = RecordingExecutor() | ||
| 465 | + | ||
| 466 | + async def capture(event) -> None: | ||
| 467 | + return None | ||
| 468 | + | ||
| 469 | + result = await finalizer.run_definition_of_done_gate( | ||
| 470 | + dod=dod, | ||
| 471 | + candidate_response="Created the external nginx guide.", | ||
| 472 | + emit=capture, | ||
| 473 | + summary=summary, | ||
| 474 | + executor=executor, # type: ignore[arg-type] | ||
| 475 | + ) | ||
| 476 | + | ||
| 477 | + assert result.should_continue is False | ||
| 478 | + assert executor.commands == [ | ||
| 479 | + f"ls -la {external_root}", | ||
| 480 | + f'grep -n "html" {external_index}', | ||
| 481 | + ] | ||
| 482 | + | ||
| 483 | + | ||
| 437 | @pytest.mark.asyncio | 484 | @pytest.mark.asyncio |
| 438 | async def test_turn_finalizer_records_missing_verification_observation( | 485 | async def test_turn_finalizer_records_missing_verification_observation( |
| 439 | temp_dir: Path, | 486 | temp_dir: Path, |
tests/test_tool_batches.py (modified)
@@ -1041,6 +1041,10 @@ async def test_tool_batch_runner_queues_next_pending_todo_after_discovery_progre | |||
| 1041 | in message | 1041 | in message |
| 1042 | for message in queued_messages | 1042 | for message in queued_messages |
| 1043 | ) | 1043 | ) |
| 1044 | + assert any( | ||
| 1045 | + "stop gathering more reference material and perform the change now" in message | ||
| 1046 | + for message in queued_messages | ||
| 1047 | + ) | ||
| 1044 | 1048 | ||
| 1045 | 1049 | ||
| 1046 | @pytest.mark.asyncio | 1050 | @pytest.mark.asyncio |
@@ -1161,6 +1165,97 @@ async def test_tool_batch_runner_duplicate_reference_read_prefers_next_pending_t | |||
| 1161 | assert "Update `" not in queued_messages[0] | 1165 | assert "Update `" not in queued_messages[0] |
| 1162 | 1166 | ||
| 1163 | 1167 | ||
| 1168 | +@pytest.mark.asyncio | ||
| 1169 | +async def test_tool_batch_runner_observation_handoff_pushes_mutation_step( | ||
| 1170 | + temp_dir: Path, | ||
| 1171 | +) -> None: | ||
| 1172 | + async def assess_confidence( | ||
| 1173 | + tool_name: str, | ||
| 1174 | + tool_args: dict, | ||
| 1175 | + context: str, | ||
| 1176 | + ) -> ConfidenceAssessment: | ||
| 1177 | + raise AssertionError("Confidence scoring should be disabled in this scenario") | ||
| 1178 | + | ||
| 1179 | + async def verify_action( | ||
| 1180 | + tool_name: str, | ||
| 1181 | + tool_args: dict, | ||
| 1182 | + result: str, | ||
| 1183 | + expected: str = "", | ||
| 1184 | + ) -> ActionVerification: | ||
| 1185 | + raise AssertionError("Verification should not run for this scenario") | ||
| 1186 | + | ||
| 1187 | + reference = temp_dir / "fortran" / "index.html" | ||
| 1188 | + reference.parent.mkdir(parents=True) | ||
| 1189 | + reference.write_text("<h1>Fortran Beginner's Guide</h1>\n") | ||
| 1190 | + | ||
| 1191 | + context = build_context( | ||
| 1192 | + temp_dir=temp_dir, | ||
| 1193 | + messages=[], | ||
| 1194 | + safeguards=FakeSafeguards(), | ||
| 1195 | + assess_confidence=assess_confidence, | ||
| 1196 | + verify_action=verify_action, | ||
| 1197 | + auto_recover=False, | ||
| 1198 | + ) | ||
| 1199 | + queued_messages: list[str] = [] | ||
| 1200 | + context.queue_steering_message_callback = queued_messages.append | ||
| 1201 | + runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir)) | ||
| 1202 | + dod = create_definition_of_done("Create a multi-file nginx guide.") | ||
| 1203 | + sync_todos_to_definition_of_done( | ||
| 1204 | + dod, | ||
| 1205 | + [ | ||
| 1206 | + { | ||
| 1207 | + "content": "Examine the existing Fortran guide structure to understand the cadence and format", | ||
| 1208 | + "active_form": "Working on: Examine the existing Fortran guide structure to understand the cadence and format", | ||
| 1209 | + "status": "pending", | ||
| 1210 | + }, | ||
| 1211 | + { | ||
| 1212 | + "content": "Create the nginx index.html file", | ||
| 1213 | + "active_form": "Working on: Create the nginx index.html file", | ||
| 1214 | + "status": "pending", | ||
| 1215 | + }, | ||
| 1216 | + ], | ||
| 1217 | + ) | ||
| 1218 | + tool_call = ToolCall( | ||
| 1219 | + id="read-reference", | ||
| 1220 | + name="read", | ||
| 1221 | + arguments={"file_path": str(reference)}, | ||
| 1222 | + ) | ||
| 1223 | + executor = FakeExecutor( | ||
| 1224 | + [ | ||
| 1225 | + tool_outcome( | ||
| 1226 | + tool_call=tool_call, | ||
| 1227 | + output="<h1>Fortran Beginner's Guide</h1>\n", | ||
| 1228 | + is_error=False, | ||
| 1229 | + ) | ||
| 1230 | + ] | ||
| 1231 | + ) | ||
| 1232 | + | ||
| 1233 | + summary = TurnSummary(final_response="") | ||
| 1234 | + await runner.execute_batch( | ||
| 1235 | + tool_calls=[tool_call], | ||
| 1236 | + tool_source="assistant", | ||
| 1237 | + pending_tool_calls_seen=set(), | ||
| 1238 | + emit=_noop_emit, | ||
| 1239 | + summary=summary, | ||
| 1240 | + dod=dod, | ||
| 1241 | + executor=executor, # type: ignore[arg-type] | ||
| 1242 | + on_confirmation=None, | ||
| 1243 | + on_user_question=None, | ||
| 1244 | + emit_confirmation=None, | ||
| 1245 | + consecutive_errors=0, | ||
| 1246 | + ) | ||
| 1247 | + | ||
| 1248 | + assert any( | ||
| 1249 | + "Continue with the next pending item: `Create the nginx index.html file`" | ||
| 1250 | + in message | ||
| 1251 | + for message in queued_messages | ||
| 1252 | + ) | ||
| 1253 | + assert any( | ||
| 1254 | + "stop gathering more reference material and perform the change now" in message | ||
| 1255 | + for message in queued_messages | ||
| 1256 | + ) | ||
| 1257 | + | ||
| 1258 | + | ||
| 1164 | @pytest.mark.asyncio | 1259 | @pytest.mark.asyncio |
| 1165 | async def test_tool_batch_runner_hands_off_noop_toc_edit_when_file_is_already_valid( | 1260 | async def test_tool_batch_runner_hands_off_noop_toc_edit_when_file_is_already_valid( |
| 1166 | temp_dir: Path, | 1261 | temp_dir: Path, |
tests/test_workflow.pymodified@@ -345,6 +345,47 @@ def test_planning_artifacts_with_acceptance_criteria_rewrites_verification_markd | |||
| 345 | ) | 345 | ) |
| 346 | 346 | ||
| 347 | 347 | ||
| 348 | +def test_planning_artifacts_with_progress_context_records_touched_and_completed_work() -> None: | ||
| 349 | + artifacts = PlanningArtifacts.from_model_output( | ||
| 350 | + "\n".join( | ||
| 351 | + [ | ||
| 352 | + "# Implementation Plan", | ||
| 353 | + "", | ||
| 354 | + "## Execution Order", | ||
| 355 | + "1. Create the guide files.", | ||
| 356 | + "", | ||
| 357 | + "<<<VERIFICATION>>>", | ||
| 358 | + "", | ||
| 359 | + "# Verification Plan", | ||
| 360 | + "", | ||
| 361 | + "## Acceptance Criteria", | ||
| 362 | + "- At least one chapter file exists.", | ||
| 363 | + "", | ||
| 364 | + "## Verification Commands", | ||
| 365 | + "- `find chapters -name \"*.html\" | wc -l`", | ||
| 366 | + ] | ||
| 367 | + ), | ||
| 368 | + task_statement="Create a thorough nginx guide.", | ||
| 369 | + ) | ||
| 370 | + | ||
| 371 | + updated = artifacts.with_progress_context( | ||
| 372 | + touched_files=["/tmp/nginx/index.html"], | ||
| 373 | + completed_items=[ | ||
| 374 | + "Create the guide scaffold", | ||
| 375 | + "Collect verification evidence", | ||
| 376 | + ], | ||
| 377 | + ) | ||
| 378 | + | ||
| 379 | + assert "## Confirmed Progress" in updated.implementation_markdown | ||
| 380 | + assert "Already touched during execution: `/tmp/nginx/index.html`." in ( | ||
| 381 | + updated.implementation_markdown | ||
| 382 | + ) | ||
| 383 | + assert "Already completed during execution: Create the guide scaffold." in ( | ||
| 384 | + updated.implementation_markdown | ||
| 385 | + ) | ||
| 386 | + assert "Collect verification evidence" not in updated.implementation_markdown | ||
| 387 | + | ||
| 388 | + | ||
| 348 | def test_merge_refreshed_todos_with_existing_scope_keeps_grounded_progress() -> None: | 389 | def test_merge_refreshed_todos_with_existing_scope_keeps_grounded_progress() -> None: |
| 349 | task = ( | 390 | task = ( |
| 350 | "Create an equally thorough nginx guide with index.html plus chapter files " | 391 | "Create an equally thorough nginx guide with index.html plus chapter files " |
@@ -371,6 +412,48 @@ def test_merge_refreshed_todos_with_existing_scope_keeps_grounded_progress() -> | |||
| 371 | ) | 412 | ) |
| 372 | 413 | ||
| 373 | 414 | ||
| 415 | +def test_merge_refreshed_todos_with_existing_scope_filters_retro_refresh_noise() -> None: | ||
| 416 | + task = ( | ||
| 417 | + "Create an equally thorough nginx guide with index.html plus chapter files " | ||
| 418 | + "covering getting started, installation, first website setup, configs, and " | ||
| 419 | + "advanced topics." | ||
| 420 | + ) | ||
| 421 | + | ||
| 422 | + todos = merge_refreshed_todos_with_existing_scope( | ||
| 423 | + task, | ||
| 424 | + existing_pending_items=[ | ||
| 425 | + "Create each chapter file in sequence, following the same structure as the Fortran guide", | ||
| 426 | + "Ensure all files are properly linked and formatted consistently", | ||
| 427 | + ], | ||
| 428 | + existing_completed_items=[ | ||
| 429 | + "First, examine the existing Fortran guide structure to understand the format and cadence", | ||
| 430 | + "Create the directory structure for the new nginx guide", | ||
| 431 | + "Create the main index.html file", | ||
| 432 | + ], | ||
| 433 | + refreshed_steps=[ | ||
| 434 | + "First examined the existing Fortran guide structure to understand format and cadence", | ||
| 435 | + "Created the main index.html file with navigation", | ||
| 436 | + "Created chapter files in sequence:", | ||
| 437 | + "01-getting-started.html", | ||
| 438 | + "02-installation.html", | ||
| 439 | + "03-first-website.html", | ||
| 440 | + "04-configuring.html", | ||
| 441 | + "All files properly linked with navigation between chapters", | ||
| 442 | + "Verify the final navigation links across the guide", | ||
| 443 | + ], | ||
| 444 | + ) | ||
| 445 | + | ||
| 446 | + labels = {item["content"]: item["status"] for item in todos} | ||
| 447 | + assert ( | ||
| 448 | + labels["Create each chapter file in sequence, following the same structure as the Fortran guide"] | ||
| 449 | + == "pending" | ||
| 450 | + ) | ||
| 451 | + assert labels["Ensure all files are properly linked and formatted consistently"] == "pending" | ||
| 452 | + assert labels["Verify the final navigation links across the guide"] == "pending" | ||
| 453 | + assert "Created chapter files in sequence:" not in labels | ||
| 454 | + assert "04-configuring.html" not in labels | ||
| 455 | + | ||
| 456 | + | ||
| 374 | def test_workflow_artifact_store_and_bridge_round_trip(tmp_path: Path) -> None: | 457 | def test_workflow_artifact_store_and_bridge_round_trip(tmp_path: Path) -> None: |
| 375 | store = WorkflowArtifactStore(tmp_path) | 458 | store = WorkflowArtifactStore(tmp_path) |
| 376 | brief = ClarifyBrief.fallback( | 459 | brief = ClarifyBrief.fallback( |
@@ -528,3 +611,71 @@ def test_advance_todos_from_tool_call_tracks_plan_progress() -> None: | |||
| 528 | ), | 611 | ), |
| 529 | ) | 612 | ) |
| 530 | assert "Verify the updated index.html file is properly formatted" in dod.completed_items | 613 | assert "Verify the updated index.html file is properly formatted" in dod.completed_items |
| 614 | + | ||
| 615 | + | ||
| 616 | +def test_advance_todos_from_tool_call_keeps_aggregate_mutation_steps_pending() -> None: | ||
| 617 | + dod = create_definition_of_done("Create a multi-file nginx guide.") | ||
| 618 | + sync_todos_to_definition_of_done( | ||
| 619 | + dod, | ||
| 620 | + [ | ||
| 621 | + { | ||
| 622 | + "content": "Create each chapter file in sequence, following the same structure as the Fortran guide", | ||
| 623 | + "active_form": "Working on: Create each chapter file in sequence, following the same structure as the Fortran guide", | ||
| 624 | + "status": "pending", | ||
| 625 | + }, | ||
| 626 | + { | ||
| 627 | + "content": "Ensure all files are properly linked and formatted consistently", | ||
| 628 | + "active_form": "Working on: Ensure all files are properly linked and formatted consistently", | ||
| 629 | + "status": "pending", | ||
| 630 | + }, | ||
| 631 | + ], | ||
| 632 | + ) | ||
| 633 | + | ||
| 634 | + assert ( | ||
| 635 | + advance_todos_from_tool_call( | ||
| 636 | + dod, | ||
| 637 | + ToolCall( | ||
| 638 | + id="write-one-chapter", | ||
| 639 | + name="write", | ||
| 640 | + arguments={ | ||
| 641 | + "file_path": "/tmp/nginx/chapters/01-getting-started.html", | ||
| 642 | + "content": "<html></html>", | ||
| 643 | + }, | ||
| 644 | + ), | ||
| 645 | + ) | ||
| 646 | + is False | ||
| 647 | + ) | ||
| 648 | + assert ( | ||
| 649 | + "Create each chapter file in sequence, following the same structure as the Fortran guide" | ||
| 650 | + in dod.pending_items | ||
| 651 | + ) | ||
| 652 | + | ||
| 653 | + | ||
| 654 | +def test_advance_todos_from_tool_call_tracks_bash_directory_creation_progress() -> None: | ||
| 655 | + dod = create_definition_of_done("Create a multi-file nginx guide.") | ||
| 656 | + sync_todos_to_definition_of_done( | ||
| 657 | + dod, | ||
| 658 | + [ | ||
| 659 | + { | ||
| 660 | + "content": "Create the nginx directory structure", | ||
| 661 | + "active_form": "Working on: Create the nginx directory structure", | ||
| 662 | + "status": "pending", | ||
| 663 | + }, | ||
| 664 | + { | ||
| 665 | + "content": "Create index.html for nginx guide", | ||
| 666 | + "active_form": "Working on: Create index.html for nginx guide", | ||
| 667 | + "status": "pending", | ||
| 668 | + }, | ||
| 669 | + ], | ||
| 670 | + ) | ||
| 671 | + | ||
| 672 | + assert advance_todos_from_tool_call( | ||
| 673 | + dod, | ||
| 674 | + ToolCall( | ||
| 675 | + id="mkdir-nginx", | ||
| 676 | + name="bash", | ||
| 677 | + arguments={"command": "mkdir -p ~/Loader/guides/nginx/chapters"}, | ||
| 678 | + ), | ||
| 679 | + ) | ||
| 680 | + assert "Create the nginx directory structure" in dod.completed_items | ||
| 681 | + assert "Create index.html for nginx guide" in dod.pending_items | ||
tests/test_workflow_runtime.pymodified@@ -1438,6 +1438,21 @@ async def test_stale_plan_artifacts_trigger_targeted_plan_refresh( | |||
| 1438 | entry.reason_code == "plan_refresh_completed" | 1438 | entry.reason_code == "plan_refresh_completed" |
| 1439 | for entry in run.agent.last_turn_summary.workflow_timeline | 1439 | for entry in run.agent.last_turn_summary.workflow_timeline |
| 1440 | ) | 1440 | ) |
| 1441 | + refresh_prompt = next( | ||
| 1442 | + invocation.messages[-1].content | ||
| 1443 | + for invocation in backend.invocations | ||
| 1444 | + if "Refresh the existing planning artifacts instead of creating a fresh plan from scratch." | ||
| 1445 | + in invocation.messages[-1].content | ||
| 1446 | + ) | ||
| 1447 | + assert "Current execution progress:" in refresh_prompt | ||
| 1448 | + assert "Already touched during execution:" in refresh_prompt | ||
| 1449 | + assert f"- {target}" in refresh_prompt | ||
| 1450 | + assert any( | ||
| 1451 | + "Plan refresh preserved the progress already made." in message.content | ||
| 1452 | + and "Do not restart from initial discovery" in message.content | ||
| 1453 | + for invocation in backend.invocations | ||
| 1454 | + for message in invocation.messages | ||
| 1455 | + ) | ||
| 1441 | 1456 | ||
| 1442 | 1457 | ||
| 1443 | @pytest.mark.asyncio | 1458 | @pytest.mark.asyncio |