Harden qwen recovery loops and verification
- SHA: 3a703a1e283382d4405282386f066a805123c57f (short: 3a703a1)
- Parents: f0d4490
- Tree: 132f2bd

| Status | File | + | - |
|---|---|---|---|
| M | src/loader/runtime/finalization.py | 8 | 0 |
| M | src/loader/runtime/recovery.py | 8 | 6 |
| M | src/loader/runtime/safeguard_services.py | 30 | 7 |
| M | src/loader/runtime/tool_batch_recovery.py | 107 | 2 |
| M | src/loader/runtime/tool_batches.py | 43 | 0 |
| M | src/loader/runtime/workflow.py | 222 | 2 |
| M | src/loader/tools/file_tools.py | 27 | 10 |
| M | src/loader/tools/fs_safety.py | 62 | 0 |
| M | src/loader/utils/file_mutations.py | 19 | 1 |
| M | tests/test_completion_policy.py | 74 | 1 |
| M | tests/test_expanded_tools.py | 37 | 0 |
| M | tests/test_finalization.py | 69 | 1 |
| M | tests/test_runtime_harness.py | 100 | 0 |
| M | tests/test_safeguard_services.py | 15 | 1 |
| M | tests/test_tool_batch_policies.py | 62 | 0 |
| M | tests/test_tool_batches.py | 102 | 0 |
| M | tests/test_workflow.py | 112 | 0 |

src/loader/runtime/finalization.py (modified)
@@ -231,6 +231,14 @@ class TurnFinalizer: | ||
| 231 | 231 | project_root=self.context.project_root, |
| 232 | 232 | task_statement=dod.task_statement, |
| 233 | 233 | ) |
| 234 | + else: | |
| 235 | + for command in derive_verification_commands( | |
| 236 | + dod, | |
| 237 | + project_root=self.context.project_root, | |
| 238 | + task_statement=dod.task_statement, | |
| 239 | + ): | |
| 240 | + if command not in dod.verification_commands: | |
| 241 | + dod.verification_commands.append(command) | |
| 234 | 242 | |
| 235 | 243 | await self.set_workflow_mode( |
| 236 | 244 | ModeDecision.transition( |

src/loader/runtime/recovery.py (modified)
@@ -690,15 +690,17 @@ The command failed. You MUST analyze the error and take a DIFFERENT action. | ||
| 690 | 690 | {hints} |
| 691 | 691 | |
| 692 | 692 | ## CRITICAL RULES: |
| 693 | -1. **INVESTIGATE FIRST** - Read config files, list directories, check what exists | |
| 694 | -2. **DO NOT** just retry the same command with slight variations | |
| 695 | -3. **DO NOT** try `npm start` then `npm run start` - these are the same thing! | |
| 696 | -4. **READ THE ERROR** - It usually tells you exactly what's wrong | |
| 697 | -5. If the error says "missing script: start", read package.json to see what scripts exist | |
| 693 | +1. Start from the error and the state you already know | |
| 694 | +2. Investigate only if a specific fact is still missing | |
| 695 | +3. If you already have enough confirmed evidence, apply the fix instead of rereading the same files | |
| 696 | +4. **DO NOT** just retry the same command with slight variations | |
| 697 | +5. **DO NOT** try `npm start` then `npm run start` - these are the same thing! | |
| 698 | +6. **READ THE ERROR** - It usually tells you exactly what's wrong | |
| 699 | +7. If the error says "missing script: start", read package.json to see what scripts exist | |
| 698 | 700 | |
| 699 | 701 | ## Current attempt: {attempt_count}/{max_retries} |
| 700 | 702 | |
| 701 | -**Your next action should gather information OR try a fundamentally different approach.** | |
| 703 | +**Your next action should either gather the missing information OR apply the fix using confirmed findings.** | |
| 702 | 704 | What will you do?""" |
| 703 | 705 | |
| 704 | 706 | |

src/loader/runtime/safeguard_services.py (modified)
@@ -76,6 +76,11 @@ class ActionTracker: | ||
| 76 | 76 | sig = str(hash(str(hunks))) |
| 77 | 77 | return sig in self._files_edited.get(norm_path, []) |
| 78 | 78 | |
| 79 | + def would_duplicate_raw_patch(self, file_path: str, patch_text: str) -> bool: | |
| 80 | + norm_path = self._normalize_path(file_path) | |
| 81 | + sig = str(hash(patch_text)) | |
| 82 | + return sig in self._files_edited.get(norm_path, []) | |
| 83 | + | |
| 79 | 84 | def would_duplicate_command(self, command: str) -> bool: |
| 80 | 85 | norm_cmd = self._normalize_command(command) |
| 81 | 86 | return norm_cmd in self._commands_run |
@@ -123,8 +128,12 @@ class ActionTracker: | ||
| 123 | 128 | elif tool_name == "patch": |
| 124 | 129 | file_path = arguments.get("file_path", "") |
| 125 | 130 | hunks = arguments.get("hunks", []) |
| 126 | - if isinstance(hunks, list) and self.would_duplicate_patch(file_path, hunks): | |
| 131 | + raw_patch = arguments.get("patch") or arguments.get("diff") or arguments.get("patch_text") | |
| 132 | + if isinstance(hunks, list) and hunks and self.would_duplicate_patch(file_path, hunks): | |
| 127 | 133 | return True, f"Same patch already applied to: {file_path}" |
| 134 | + if isinstance(raw_patch, str) and raw_patch.strip(): | |
| 135 | + if self.would_duplicate_raw_patch(file_path, raw_patch): | |
| 136 | + return True, f"Same patch already applied to: {file_path}" | |
| 128 | 137 | |
| 129 | 138 | elif tool_name == "read": |
| 130 | 139 | read_key = self._make_read_key(arguments) |
@@ -135,7 +144,8 @@ class ActionTracker: | ||
| 135 | 144 | ( |
| 136 | 145 | "Already read " |
| 137 | 146 | f"{str(arguments.get('file_path', '')).strip()} " |
| 138 | - "recently without any intervening changes" | |
| 147 | + "recently without any intervening changes; " | |
| 148 | + "reuse the earlier read result instead of rereading" | |
| 139 | 149 | ), |
| 140 | 150 | repeat_threshold=self.READ_REPEAT_THRESHOLD, |
| 141 | 151 | ) |
@@ -148,7 +158,10 @@ class ActionTracker: | ||
| 148 | 158 | duplicate, reason = self._check_recent_observation( |
| 149 | 159 | self._recent_searches, |
| 150 | 160 | observation_key, |
| 151 | - "Already ran the same search recently without any intervening changes", | |
| 161 | + ( | |
| 162 | + "Already ran the same search recently without any intervening " | |
| 163 | + "changes; reuse the earlier search result instead of rerunning it" | |
| 164 | + ), | |
| 152 | 165 | repeat_threshold=self.SEARCH_REPEAT_THRESHOLD, |
| 153 | 166 | ) |
| 154 | 167 | if duplicate: |
@@ -160,7 +173,10 @@ class ActionTracker: | ||
| 160 | 173 | duplicate, reason = self._check_recent_observation( |
| 161 | 174 | self._recent_bash_observations, |
| 162 | 175 | self._normalize_command(command), |
| 163 | - "Already ran the same read-only shell probe recently without any intervening changes", | |
| 176 | + ( | |
| 177 | + "Already ran the same read-only shell probe recently without any " | |
| 178 | + "intervening changes; reuse the earlier shell output instead of rerunning it" | |
| 179 | + ), | |
| 164 | 180 | repeat_threshold=self.BASH_OBSERVATION_REPEAT_THRESHOLD, |
| 165 | 181 | ) |
| 166 | 182 | if duplicate: |
@@ -196,7 +212,11 @@ class ActionTracker: | ||
| 196 | 212 | file_path = arguments.get("file_path", "") |
| 197 | 213 | hunks = arguments.get("hunks", []) |
| 198 | 214 | if file_path: |
| 199 | - self.record_edit(file_path, str(hunks), "structured_patch") | |
| 215 | + raw_patch = arguments.get("patch") or arguments.get("diff") or arguments.get("patch_text") | |
| 216 | + if isinstance(hunks, list) and hunks: | |
| 217 | + self.record_edit(file_path, str(hunks), "structured_patch") | |
| 218 | + elif isinstance(raw_patch, str) and raw_patch.strip(): | |
| 219 | + self.record_edit(file_path, raw_patch, "raw_patch") | |
| 200 | 220 | self._note_mutation() |
| 201 | 221 | |
| 202 | 222 | elif tool_name == "read": |
@@ -592,6 +612,7 @@ class PreActionValidator: | ||
| 592 | 612 | def _validate_patch(self, arguments: dict) -> ValidationResult: |
| 593 | 613 | file_path = arguments.get("file_path", "") |
| 594 | 614 | hunks = arguments.get("hunks", []) |
| 615 | + raw_patch = arguments.get("patch") or arguments.get("diff") or arguments.get("patch_text") | |
| 595 | 616 | |
| 596 | 617 | if not file_path or not str(file_path).strip(): |
| 597 | 618 | return ValidationResult( |
@@ -605,11 +626,13 @@ class PreActionValidator: | ||
| 605 | 626 | if not path_result.valid: |
| 606 | 627 | return path_result |
| 607 | 628 | |
| 608 | - if not isinstance(hunks, list) or not hunks: | |
| 629 | + has_hunks = isinstance(hunks, list) and bool(hunks) | |
| 630 | + has_raw_patch = isinstance(raw_patch, str) and bool(raw_patch.strip()) | |
| 631 | + if not has_hunks and not has_raw_patch: | |
| 609 | 632 | return ValidationResult( |
| 610 | 633 | valid=False, |
| 611 | 634 | reason="Patch hunks are missing", |
| 612 | - suggestion="Provide one or more structured patch hunks", | |
| 635 | + suggestion="Provide structured patch hunks or a unified diff patch string", | |
| 613 | 636 | severity="error", |
| 614 | 637 | ) |
| 615 | 638 | |

src/loader/runtime/tool_batch_recovery.py (modified)
@@ -3,6 +3,9 @@ | ||
| 3 | 3 | from __future__ import annotations |
| 4 | 4 | |
| 5 | 5 | from collections.abc import Awaitable, Callable |
| 6 | +from difflib import SequenceMatcher | |
| 7 | +from pathlib import Path | |
| 8 | +import re | |
| 6 | 9 | |
| 7 | 10 | from ..llm.base import Message, ToolCall |
| 8 | 11 | from .compaction import infer_preferred_next_step, summarize_confirmed_facts |
@@ -82,7 +85,11 @@ class ToolBatchRecoveryController: | ||
| 82 | 85 | tool_call.arguments, |
| 83 | 86 | outcome.result_output, |
| 84 | 87 | ) |
| 85 | - recovery_prompt = self._augment_recovery_prompt(recovery_prompt) | |
| 88 | + recovery_prompt = self._augment_recovery_prompt( | |
| 89 | + recovery_prompt, | |
| 90 | + tool_call=tool_call, | |
| 91 | + outcome=outcome, | |
| 92 | + ) | |
| 86 | 93 | return Message.tool_result_message( |
| 87 | 94 | tool_call_id=tool_call.id, |
| 88 | 95 | display_content=recovery_prompt, |
@@ -106,7 +113,13 @@ class ToolBatchRecoveryController: | ||
| 106 | 113 | is_error=True, |
| 107 | 114 | ) |
| 108 | 115 | |
| 109 | - def _augment_recovery_prompt(self, prompt: str) -> str: | |
| 116 | + def _augment_recovery_prompt( | |
| 117 | + self, | |
| 118 | + prompt: str, | |
| 119 | + *, | |
| 120 | + tool_call: ToolCall, | |
| 121 | + outcome: ToolExecutionOutcome, | |
| 122 | + ) -> str: | |
| 110 | 123 | """Append transcript-aware recovery guidance when recent facts exist.""" |
| 111 | 124 | |
| 112 | 125 | session = self.context.session |
@@ -116,6 +129,7 @@ class ToolBatchRecoveryController: | ||
| 116 | 129 | session.messages, |
| 117 | 130 | current_task=current_task, |
| 118 | 131 | ) |
| 132 | + actionable_known_state = bool(confirmed_facts and preferred_next_step) | |
| 119 | 133 | if not confirmed_facts and not preferred_next_step and not current_task: |
| 120 | 134 | return prompt |
| 121 | 135 | |
@@ -130,4 +144,95 @@ class ToolBatchRecoveryController: | ||
| 130 | 144 | "- Preserve progress: do not restart by rereading already-confirmed files " |
| 131 | 145 | "unless you need genuinely new evidence." |
| 132 | 146 | ) |
| 147 | + if actionable_known_state: | |
| 148 | + lines.extend( | |
| 149 | + [ | |
| 150 | + "", | |
| 151 | + "## ACTION BIAS FOR THIS RECOVERY", | |
| 152 | + "- The confirmed findings above are already enough to keep moving.", | |
| 153 | + "- Prefer edit/write/patch on the target file over rereading the same files.", | |
| 154 | + "- Only inspect one more file if a specific filename, href, or title is still unknown.", | |
| 155 | + "- Treat the preferred next step as the default path forward.", | |
| 156 | + ] | |
| 157 | + ) | |
| 158 | + candidate_lines = self._file_not_found_candidate_lines(tool_call, outcome) | |
| 159 | + if candidate_lines: | |
| 160 | + lines.extend(["", "## LIKELY FILE CANDIDATES", *candidate_lines]) | |
| 133 | 161 | return "\n".join(lines) |
| 162 | + | |
| 163 | + def _file_not_found_candidate_lines( | |
| 164 | + self, | |
| 165 | + tool_call: ToolCall, | |
| 166 | + outcome: ToolExecutionOutcome, | |
| 167 | + ) -> list[str]: | |
| 168 | + if tool_call.name not in {"read", "write", "edit", "patch"}: | |
| 169 | + return [] | |
| 170 | + if "not found" not in outcome.result_output.lower(): | |
| 171 | + return [] | |
| 172 | + | |
| 173 | + missing_path = self._canonicalize_path( | |
| 174 | + str( | |
| 175 | + tool_call.arguments.get("file_path") | |
| 176 | + or tool_call.arguments.get("path") | |
| 177 | + or "" | |
| 178 | + ).strip() | |
| 179 | + ) | |
| 180 | + if not missing_path: | |
| 181 | + return [] | |
| 182 | + | |
| 183 | + candidates = self._rank_known_file_candidates(missing_path) | |
| 184 | + if not candidates: | |
| 185 | + return [] | |
| 186 | + | |
| 187 | + names = ", ".join(f"`{Path(candidate).name}`" for candidate in candidates[:3]) | |
| 188 | + return [ | |
| 189 | + f"- Requested file does not exist: `{missing_path}`", | |
| 190 | + f"- Closest known files in the same directory: {names}", | |
| 191 | + "- Prefer one of those exact filenames instead of retrying the missing path.", | |
| 192 | + ] | |
| 193 | + | |
| 194 | + def _rank_known_file_candidates(self, missing_path: str) -> list[str]: | |
| 195 | + missing_parent = str(Path(missing_path).parent) | |
| 196 | + missing_name = Path(missing_path).name | |
| 197 | + missing_prefix = missing_name.split("-", 1)[0] | |
| 198 | + | |
| 199 | + ranked: list[tuple[float, str]] = [] | |
| 200 | + seen: set[str] = set() | |
| 201 | + for candidate in self._known_file_paths(): | |
| 202 | + if candidate == missing_path: | |
| 203 | + continue | |
| 204 | + if str(Path(candidate).parent) != missing_parent: | |
| 205 | + continue | |
| 206 | + name = Path(candidate).name | |
| 207 | + if name in seen: | |
| 208 | + continue | |
| 209 | + seen.add(name) | |
| 210 | + | |
| 211 | + score = SequenceMatcher(None, missing_name, name).ratio() | |
| 212 | + if missing_prefix and name.startswith(f"{missing_prefix}-"): | |
| 213 | + score += 1.0 | |
| 214 | + ranked.append((score, candidate)) | |
| 215 | + | |
| 216 | + ranked.sort(key=lambda item: (-item[0], item[1])) | |
| 217 | + return [candidate for _, candidate in ranked] | |
| 218 | + | |
| 219 | + def _known_file_paths(self) -> list[str]: | |
| 220 | + pattern = re.compile(r"(?:~|/)[^\s`\"']+\.html") | |
| 221 | + discovered: list[str] = [] | |
| 222 | + seen: set[str] = set() | |
| 223 | + for message in self.context.session.messages: | |
| 224 | + for raw_path in pattern.findall(message.content): | |
| 225 | + candidate = self._canonicalize_path(raw_path) | |
| 226 | + if not candidate or candidate in seen: | |
| 227 | + continue | |
| 228 | + seen.add(candidate) | |
| 229 | + discovered.append(candidate) | |
| 230 | + return discovered | |
| 231 | + | |
| 232 | + def _canonicalize_path(self, raw_path: str) -> str: | |
| 233 | + if not raw_path: | |
| 234 | + return "" | |
| 235 | + try: | |
| 236 | + return str(Path(raw_path).expanduser().resolve(strict=False)) | |
| 237 | + except (OSError, RuntimeError, ValueError): | |
| 238 | + return str(Path(raw_path).expanduser()) | |

src/loader/runtime/tool_batches.py (modified)
@@ -31,6 +31,8 @@ from .verification_observations import ( | ||
| 31 | 31 | VerificationObservationStatus, |
| 32 | 32 | ) |
| 33 | 33 | from .workflow import sync_todos_to_definition_of_done |
| 34 | +from .workflow import advance_todos_from_tool_call | |
| 35 | +from .compaction import infer_preferred_next_step | |
| 34 | 36 | |
| 35 | 37 | EventSink = Callable[[AgentEvent], Awaitable[None]] |
| 36 | 38 | ConfirmationHandler = ( |
@@ -215,6 +217,8 @@ class ToolBatchRunner: | ||
| 215 | 217 | # otherwise the model operates blind and loops. |
| 216 | 218 | self.context.session.append(outcome.message) |
| 217 | 219 | summary.tool_result_messages.append(outcome.message) |
| 220 | + if outcome.state == ToolExecutionState.DUPLICATE: | |
| 221 | + self._queue_duplicate_observation_nudge(tool_call) | |
| 218 | 222 | |
| 219 | 223 | should_continue = await self.verification_gate.should_continue( |
| 220 | 224 | tool_call=tool_call, |
@@ -247,6 +251,43 @@ class ToolBatchRunner: | ||
| 247 | 251 | |
| 248 | 252 | return result |
| 249 | 253 | |
| 254 | + def _queue_duplicate_observation_nudge(self, tool_call: ToolCall) -> None: | |
| 255 | + """Queue a concrete next-step nudge after duplicate observational actions.""" | |
| 256 | + | |
| 257 | + if tool_call.name not in {"read", "glob", "grep", "bash"}: | |
| 258 | + return | |
| 259 | + | |
| 260 | + current_task = getattr(self.context.session, "current_task", None) | |
| 261 | + preferred_next_step = infer_preferred_next_step( | |
| 262 | + self.context.session.messages, | |
| 263 | + current_task=current_task, | |
| 264 | + ) | |
| 265 | + if preferred_next_step: | |
| 266 | + self.context.queue_steering_message( | |
| 267 | + "Reuse the earlier observation instead of repeating it. " | |
| 268 | + f"{preferred_next_step} " | |
| 269 | + "Only gather more evidence if a specific filename, href, or title is still unknown." | |
| 270 | + ) | |
| 271 | + return | |
| 272 | + | |
| 273 | + target_path = str( | |
| 274 | + tool_call.arguments.get("file_path") | |
| 275 | + or tool_call.arguments.get("path") | |
| 276 | + or "" | |
| 277 | + ).strip() | |
| 278 | + if target_path: | |
| 279 | + self.context.queue_steering_message( | |
| 280 | + "Reuse the earlier observation instead of repeating it. " | |
| 281 | + f"Use the current contents of `{target_path}` and take a different next step. " | |
| 282 | + "Only gather more evidence if a specific filename, href, or title is still unknown." | |
| 283 | + ) | |
| 284 | + return | |
| 285 | + | |
| 286 | + self.context.queue_steering_message( | |
| 287 | + "Reuse the earlier observation instead of repeating it. " | |
| 288 | + "Choose a different next step that makes progress." | |
| 289 | + ) | |
| 290 | + | |
| 250 | 291 | async def _record_successful_execution( |
| 251 | 292 | self, |
| 252 | 293 | *, |
@@ -279,6 +320,8 @@ class ToolBatchRunner: | ||
| 279 | 320 | new_todos = outcome.registry_result.metadata.get("new_todos", []) |
| 280 | 321 | if isinstance(new_todos, list): |
| 281 | 322 | sync_todos_to_definition_of_done(dod, new_todos) |
| 323 | + else: | |
| 324 | + advance_todos_from_tool_call(dod, tool_call) | |
| 282 | 325 | self.dod_store.save(dod) |
| 283 | 326 | recovery_context = self.context.recovery_context |
| 284 | 327 | if recovery_context is not None: |

src/loader/runtime/workflow.py (modified)
@@ -8,6 +8,7 @@ from datetime import UTC, datetime | ||
| 8 | 8 | from pathlib import Path |
| 9 | 9 | from typing import ClassVar |
| 10 | 10 | |
| 11 | +from ..llm.base import ToolCall | |
| 11 | 12 | from .clarify_grounding import ClarifyGrounding |
| 12 | 13 | from .dod import slugify |
| 13 | 14 | from .workflow_policy import ( |
@@ -43,6 +44,7 @@ __all__ = [ | ||
| 43 | 44 | "WorkflowSignalPacket", |
| 44 | 45 | "WorkflowTimelineEntry", |
| 45 | 46 | "WorkflowTimelineEntryKind", |
| 47 | + "advance_todos_from_tool_call", | |
| 46 | 48 | "build_execute_bridge", |
| 47 | 49 | "enrich_clarify_brief_with_grounding", |
| 48 | 50 | "extract_verification_commands_from_markdown", |
@@ -67,6 +69,64 @@ _GENERIC_CONSTRAINTS = { | ||
| 67 | 69 | _GENERIC_ASSUMPTIONS = { |
| 68 | 70 | "Unspecified details stay unchanged unless evidence says otherwise.", |
| 69 | 71 | } |
| 72 | +_SPECIAL_TODO_ITEMS = { | |
| 73 | + "Complete the requested work", | |
| 74 | + "Collect verification evidence", | |
| 75 | +} | |
| 76 | +_READ_STEP_HINTS = ( | |
| 77 | + "read", | |
| 78 | + "examine", | |
| 79 | + "inspect", | |
| 80 | + "review", | |
| 81 | + "check", | |
| 82 | + "look at", | |
| 83 | + "look through", | |
| 84 | + "open", | |
| 85 | + "understand", | |
| 86 | + "study", | |
| 87 | +) | |
| 88 | +_SEARCH_STEP_HINTS = ( | |
| 89 | + "list", | |
| 90 | + "find", | |
| 91 | + "search", | |
| 92 | + "scan", | |
| 93 | + "discover", | |
| 94 | + "locate", | |
| 95 | + "enumerate", | |
| 96 | + "gather", | |
| 97 | +) | |
| 98 | +_PARSE_STEP_HINTS = ( | |
| 99 | + "parse", | |
| 100 | + "extract", | |
| 101 | + "identify", | |
| 102 | + "map", | |
| 103 | + "determine", | |
| 104 | +) | |
| 105 | +_MUTATION_STEP_HINTS = ( | |
| 106 | + "update", | |
| 107 | + "edit", | |
| 108 | + "write", | |
| 109 | + "fix", | |
| 110 | + "modify", | |
| 111 | + "change", | |
| 112 | + "patch", | |
| 113 | + "replace", | |
| 114 | + "correct", | |
| 115 | + "rewrite", | |
| 116 | +) | |
| 117 | +_VERIFY_STEP_HINTS = ( | |
| 118 | + "verify", | |
| 119 | + "validation", | |
| 120 | + "validate", | |
| 121 | + "test", | |
| 122 | + "confirm", | |
| 123 | + "check", | |
| 124 | +) | |
| 125 | +_SHELL_COMMAND_START = re.compile( | |
| 126 | + r"(?<![\w/.-])(" | |
| 127 | + r"ls|grep|pytest|uv|python3?|html5validator|cargo|npm|node|mypy|ruff|find|git|cat|sed|head|tail" | |
| 128 | + r")\b" | |
| 129 | +) | |
| 70 | 130 | |
| 71 | 131 | _SECTION_ALIASES = { |
| 72 | 132 | "task statement": "task_statement", |
@@ -486,6 +546,116 @@ def sync_todos_to_definition_of_done( | ||
| 486 | 546 | dod.completed_items = list(dict.fromkeys(completed + special_completed)) |
| 487 | 547 | |
| 488 | 548 | |
| 549 | +def advance_todos_from_tool_call(dod, tool_call: ToolCall) -> bool: | |
| 550 | + """Advance the best-matching pending todo from a successful tool call.""" | |
| 551 | + | |
| 552 | + best_index: int | None = None | |
| 553 | + best_score = 0 | |
| 554 | + | |
| 555 | + for index, item in enumerate(dod.pending_items): | |
| 556 | + label = item.strip() | |
| 557 | + if not label or label in _SPECIAL_TODO_ITEMS: | |
| 558 | + continue | |
| 559 | + score = _todo_progress_score(label, tool_call) | |
| 560 | + if score > best_score: | |
| 561 | + best_index = index | |
| 562 | + best_score = score | |
| 563 | + | |
| 564 | + if best_index is None or best_score <= 0: | |
| 565 | + return False | |
| 566 | + | |
| 567 | + completed = dod.pending_items.pop(best_index) | |
| 568 | + if completed not in dod.completed_items: | |
| 569 | + dod.completed_items.append(completed) | |
| 570 | + return True | |
| 571 | + | |
| 572 | + | |
| 573 | +def _todo_progress_score(item: str, tool_call: ToolCall) -> int: | |
| 574 | + text = item.lower() | |
| 575 | + name = tool_call.name | |
| 576 | + file_path = str(tool_call.arguments.get("file_path", "")).strip().lower() | |
| 577 | + path = str(tool_call.arguments.get("path", "")).strip().lower() | |
| 578 | + pattern = str(tool_call.arguments.get("pattern", "")).strip().lower() | |
| 579 | + command = str(tool_call.arguments.get("command", "")).strip().lower() | |
| 580 | + combined = " ".join(part for part in (file_path, path, pattern, command) if part) | |
| 581 | + | |
| 582 | + path_hint = file_path or path | |
| 583 | + basename = Path(path_hint).name.lower() if path_hint else "" | |
| 584 | + parent = Path(path_hint).parent.name.lower() if path_hint else "" | |
| 585 | + | |
| 586 | + score = 0 | |
| 587 | + if basename and basename in text: | |
| 588 | + score += 3 | |
| 589 | + if parent and parent not in {"", "."} and parent in text: | |
| 590 | + score += 2 | |
| 591 | + if "index" in text and "index" in combined: | |
| 592 | + score += 2 | |
| 593 | + if "chapter" in text and ("chapter" in basename or "chapters" in combined): | |
| 594 | + score += 1 | |
| 595 | + if "html" in text and ".html" in combined: | |
| 596 | + score += 1 | |
| 597 | + | |
| 598 | + if name == "read": | |
| 599 | + if _contains_any(text, _READ_STEP_HINTS): | |
| 600 | + score += 2 | |
| 601 | + if _contains_any(text, _PARSE_STEP_HINTS) and ".html" in combined: | |
| 602 | + score += 1 | |
| 603 | + elif name in {"glob", "grep"}: | |
| 604 | + if _contains_any(text, _SEARCH_STEP_HINTS): | |
| 605 | + score += 2 | |
| 606 | + if name == "glob" and _contains_any(text, _READ_STEP_HINTS) and ".html" in combined: | |
| 607 | + score += 1 | |
| 608 | + elif name == "bash": | |
| 609 | + if _looks_like_verification_command(command): | |
| 610 | + if _contains_any(text, _VERIFY_STEP_HINTS): | |
| 611 | + score += 3 | |
| 612 | + elif _looks_like_search_command(command): | |
| 613 | + if _contains_any(text, _SEARCH_STEP_HINTS): | |
| 614 | + score += 2 | |
| 615 | + elif _looks_like_read_command(command): | |
| 616 | + if _contains_any(text, _READ_STEP_HINTS): | |
| 617 | + score += 2 | |
| 618 | + elif name in {"write", "edit", "patch"}: | |
| 619 | + if _contains_any(text, _MUTATION_STEP_HINTS): | |
| 620 | + score += 3 | |
| 621 | + | |
| 622 | + if name in {"write", "edit", "patch"} and _contains_any(text, _VERIFY_STEP_HINTS): | |
| 623 | + return 0 | |
| 624 | + return score | |
| 625 | + | |
| 626 | + | |
| 627 | +def _contains_any(text: str, candidates: tuple[str, ...]) -> bool: | |
| 628 | + return any(candidate in text for candidate in candidates) | |
| 629 | + | |
| 630 | + | |
| 631 | +def _looks_like_search_command(command: str) -> bool: | |
| 632 | + return any(token in command for token in (" ls", "ls ", "find ", "rg ", "grep ", "glob ")) | |
| 633 | + | |
| 634 | + | |
| 635 | +def _looks_like_read_command(command: str) -> bool: | |
| 636 | + return any(token in command for token in ("cat ", "sed ", "head ", "tail ")) | |
| 637 | + | |
| 638 | + | |
| 639 | +def _looks_like_verification_command(command: str) -> bool: | |
| 640 | + return any( | |
| 641 | + token in command | |
| 642 | + for token in ( | |
| 643 | + "pytest", | |
| 644 | + "unittest", | |
| 645 | + " test", | |
| 646 | + " check", | |
| 647 | + " verify", | |
| 648 | + "html5validator", | |
| 649 | + "mypy", | |
| 650 | + "ruff", | |
| 651 | + "lint", | |
| 652 | + "grep ", | |
| 653 | + "diff ", | |
| 654 | + "cmp ", | |
| 655 | + ) | |
| 656 | + ) | |
| 657 | + | |
| 658 | + | |
| 489 | 659 | def extract_verification_commands_from_markdown(markdown: str) -> list[str]: |
| 490 | 660 | """Extract verification commands from a verification-plan markdown document.""" |
| 491 | 661 | |
@@ -686,11 +856,61 @@ def _mark_explicit_section(brief: ClarifyBrief, section: str) -> None: | ||
| 686 | 856 | def _extract_commands(items: list[str]) -> list[str]: |
| 687 | 857 | commands: list[str] = [] |
| 688 | 858 | for item in items: |
| 689 | - match = re.match(r"^`(.+)`$", item) | |
| 690 | - commands.append((match.group(1) if match else item).strip()) | |
| 859 | + text = item.strip() | |
| 860 | + if not text: | |
| 861 | + continue | |
| 862 | + | |
| 863 | + # Code fences often contain shell comments plus the actual command lines. | |
| 864 | + if "```" in text: | |
| 865 | + text = text.replace("```bash", "```").replace("```sh", "```") | |
| 866 | + if "\n" not in text: | |
| 867 | + commands.extend(_extract_collapsed_shell_commands(text)) | |
| 868 | + continue | |
| 869 | + | |
| 870 | + lines = text.splitlines() if "\n" in text or "```" in text else [text] | |
| 871 | + for line in lines: | |
| 872 | + candidate = line.strip() | |
| 873 | + if not candidate or candidate.startswith("```"): | |
| 874 | + continue | |
| 875 | + candidate = re.sub(r"^-\s+", "", candidate) | |
| 876 | + match = re.match(r"^`(.+)`$", candidate) | |
| 877 | + candidate = (match.group(1) if match else candidate).strip() | |
| 878 | + if candidate.startswith("#"): | |
| 879 | + candidate = _extract_shell_command_from_text(candidate) | |
| 880 | + if not candidate: | |
| 881 | + continue | |
| 882 | + if candidate: | |
| 883 | + commands.append(candidate) | |
| 691 | 884 | return [command for command in commands if command] |
| 692 | 885 | |
| 693 | 886 | |
| 887 | +def _extract_collapsed_shell_commands(text: str) -> list[str]: | |
| 888 | + stripped = re.sub(r"```(?:\w+)?", "", text).strip() | |
| 889 | + if not stripped: | |
| 890 | + return [] | |
| 891 | + | |
| 892 | + matches = list(_SHELL_COMMAND_START.finditer(stripped)) | |
| 893 | + if not matches: | |
| 894 | + extracted = _extract_shell_command_from_text(stripped) | |
| 895 | + return [extracted] if extracted else [] | |
| 896 | + | |
| 897 | + commands: list[str] = [] | |
| 898 | + for index, match in enumerate(matches): | |
| 899 | + start = match.start() | |
| 900 | + end = matches[index + 1].start() if index + 1 < len(matches) else len(stripped) | |
| 901 | + candidate = stripped[start:end].strip() | |
| 902 | + if candidate: | |
| 903 | + commands.append(candidate) | |
| 904 | + return commands | |
| 905 | + | |
| 906 | + | |
| 907 | +def _extract_shell_command_from_text(text: str) -> str: | |
| 908 | + match = _SHELL_COMMAND_START.search(text) | |
| 909 | + if match is None: | |
| 910 | + return "" | |
| 911 | + return text[match.start():].strip() | |
| 912 | + | |
| 913 | + | |
| 694 | 914 | def _has_concrete_anchor(task: str) -> bool: |
| 695 | 915 | return any( |
| 696 | 916 | re.search(pattern, task) |

src/loader/tools/file_tools.py (modified)
@@ -13,6 +13,7 @@ from .fs_safety import ( | ||
| 13 | 13 | ensure_safe_to_read, |
| 14 | 14 | ensure_safe_to_write, |
| 15 | 15 | make_structured_patch, |
| 16 | + parse_unified_diff_patch, | |
| 16 | 17 | resolve_workspace_path, |
| 17 | 18 | ) |
| 18 | 19 | |
@@ -447,7 +448,8 @@ class PatchTool(Tool): | ||
| 447 | 448 | def description(self) -> str: |
| 448 | 449 | return ( |
| 449 | 450 | "Apply structured patch hunks to a file. Prefer this for larger " |
| 450 | - "or multi-line edits where exact old/new string replacement is brittle." | |
| 451 | + "or multi-line edits where exact old/new string replacement is brittle. " | |
| 452 | + "A raw unified diff string is also accepted via `patch`." | |
| 451 | 453 | ) |
| 452 | 454 | |
| 453 | 455 | @property |
@@ -483,8 +485,15 @@ class PatchTool(Tool): | ||
| 483 | 485 | ], |
| 484 | 486 | }, |
| 485 | 487 | }, |
| 488 | + "patch": { | |
| 489 | + "type": "string", | |
| 490 | + "description": ( | |
| 491 | + "Optional unified diff patch string. Loader will parse this " | |
| 492 | + "into structured hunks when possible." | |
| 493 | + ), | |
| 494 | + }, | |
| 486 | 495 | }, |
| 487 | - "required": ["file_path", "hunks"], | |
| 496 | + "required": ["file_path"], | |
| 488 | 497 | } |
| 489 | 498 | |
| 490 | 499 | @property |
@@ -505,7 +514,8 @@ class PatchTool(Tool): | ||
| 505 | 514 | async def execute( |
| 506 | 515 | self, |
| 507 | 516 | file_path: str, |
| 508 | - hunks: list[dict[str, Any]], | |
| 517 | + hunks: list[dict[str, Any]] | None = None, | |
| 518 | + patch: str | None = None, | |
| 509 | 519 | **kwargs: Any, |
| 510 | 520 | ) -> ToolResult: |
| 511 | 521 | kwargs.pop("_skip_confirmation", None) |
@@ -544,13 +554,20 @@ class PatchTool(Tool): | ||
| 544 | 554 | ensure_safe_to_read(path) |
| 545 | 555 | original_content = await asyncio.to_thread(path.read_text) |
| 546 | 556 | original_lines = original_content.splitlines() |
| 547 | - parsed_hunks = [ | |
| 548 | - StructuredPatchHunk.from_dict_with_original( | |
| 549 | - hunk, | |
| 550 | - original_lines=original_lines, | |
| 551 | - ) | |
| 552 | - for hunk in hunks | |
| 553 | - ] | |
| 557 | + raw_patch = patch or kwargs.get("diff") or kwargs.get("patch_text") | |
| 558 | + parsed_hunks: list[StructuredPatchHunk] | |
| 559 | + if hunks: | |
| 560 | + parsed_hunks = [ | |
| 561 | + StructuredPatchHunk.from_dict_with_original( | |
| 562 | + hunk, | |
| 563 | + original_lines=original_lines, | |
| 564 | + ) | |
| 565 | + for hunk in hunks | |
| 566 | + ] | |
| 567 | + elif isinstance(raw_patch, str) and raw_patch.strip(): | |
| 568 | + parsed_hunks = parse_unified_diff_patch(raw_patch) | |
| 569 | + else: | |
| 570 | + parsed_hunks = [] | |
| 554 | 571 | if not parsed_hunks: |
| 555 | 572 | raise ValueError("hunks must not be empty") |
| 556 | 573 | updated_content = apply_structured_patch(original_content, parsed_hunks) |

src/loader/tools/fs_safety.py (modified)
@@ -4,6 +4,7 @@ from __future__ import annotations | ||
| 4 | 4 | |
| 5 | 5 | from dataclasses import asdict, dataclass |
| 6 | 6 | from pathlib import Path |
| 7 | +import re | |
| 7 | 8 | |
| 8 | 9 | MAX_READ_SIZE = 10 * 1024 * 1024 |
| 9 | 10 | MAX_WRITE_SIZE = 10 * 1024 * 1024 |
@@ -233,3 +234,64 @@ def _expect_patch_line( | ||
| 233 | 234 | "structured patch context mismatch: " |
| 234 | 235 | f"expected {expected!r}, found {actual!r}" |
| 235 | 236 | ) |
| 237 | + | |
| 238 | + | |
| 239 | +_UNIFIED_DIFF_HUNK_RE = re.compile( | |
| 240 | + r"^@@ -(?P<old_start>\d+)(?:,(?P<old_lines>\d+))? " | |
| 241 | + r"\+(?P<new_start>\d+)(?:,(?P<new_lines>\d+))? @@" | |
| 242 | +) | |
| 243 | + | |
| 244 | + | |
| 245 | +def parse_unified_diff_patch(patch_text: str) -> list[StructuredPatchHunk]: | |
| 246 | + """Parse a unified diff string into structured patch hunks.""" | |
| 247 | + | |
| 248 | + if not str(patch_text).strip(): | |
| 249 | + raise ValueError("patch text is empty") | |
| 250 | + | |
| 251 | + hunks: list[StructuredPatchHunk] = [] | |
| 252 | + current_hunk: StructuredPatchHunk | None = None | |
| 253 | + | |
| 254 | + for raw_line in str(patch_text).splitlines(): | |
| 255 | + if raw_line.startswith(("--- ", "+++ ")): | |
| 256 | + continue | |
| 257 | + if raw_line.startswith("@@"): | |
| 258 | + match = _UNIFIED_DIFF_HUNK_RE.match(raw_line) | |
| 259 | + if match is None: | |
| 260 | + raise ValueError( | |
| 261 | + "patch text contains an invalid unified-diff hunk header" | |
| 262 | + ) | |
| 263 | + if current_hunk is not None: | |
| 264 | + hunks.append(current_hunk) | |
| 265 | + current_hunk = StructuredPatchHunk( | |
| 266 | + old_start=int(match.group("old_start")), | |
| 267 | + old_lines=int(match.group("old_lines") or 1), | |
| 268 | + new_start=int(match.group("new_start")), | |
| 269 | + new_lines=int(match.group("new_lines") or 1), | |
| 270 | + lines=[], | |
| 271 | + ) | |
| 272 | + continue | |
| 273 | + | |
| 274 | + if raw_line == r"\ No newline at end of file": | |
| 275 | + continue | |
| 276 | + | |
| 277 | + if current_hunk is None: | |
| 278 | + if not raw_line.strip(): | |
| 279 | + continue | |
| 280 | + raise ValueError( | |
| 281 | + "patch text must include at least one unified-diff hunk header" | |
| 282 | + ) | |
| 283 | + | |
| 284 | + prefix = raw_line[:1] | |
| 285 | + if prefix not in {" ", "+", "-"}: | |
| 286 | + raise ValueError( | |
| 287 | + "patch text contains a diff line without a valid prefix" | |
| 288 | + ) | |
| 289 | + current_hunk.lines.append(raw_line) | |
| 290 | + | |
| 291 | + if current_hunk is not None: | |
| 292 | + hunks.append(current_hunk) | |
| 293 | + | |
| 294 | + if not hunks: | |
| 295 | + raise ValueError("patch text must include at least one unified-diff hunk") | |
| 296 | + | |
| 297 | + return hunks | |
src/loader/utils/file_mutations.py modified @@ -11,7 +11,11 @@ from rich.console import Group | ||
| 11 | 11 | from rich.panel import Panel |
| 12 | 12 | from rich.text import Text |
| 13 | 13 | |
| 14 | -from ..tools.fs_safety import StructuredPatchHunk, make_structured_patch | |
| 14 | +from ..tools.fs_safety import ( | |
| 15 | + StructuredPatchHunk, | |
| 16 | + make_structured_patch, | |
| 17 | + parse_unified_diff_patch, | |
| 18 | +) | |
| 15 | 19 | |
| 16 | 20 | FILE_MUTATION_TOOLS = {"write", "edit", "patch"} |
| 17 | 21 | DIFF_TRUNCATION_NOTICE = "truncated for display; full result preserved in session" |
@@ -98,6 +102,8 @@ def build_file_mutation_preview( | ||
| 98 | 102 | structured_patch = _coerce_patch_hunks(info.get("hunks")) or _coerce_patch_hunks( |
| 99 | 103 | args.get("hunks") |
| 100 | 104 | ) |
| 105 | + if not structured_patch and tool_name == "patch": | |
| 106 | + structured_patch = _coerce_raw_patch_hunks(info) or _coerce_raw_patch_hunks(args) | |
| 101 | 107 | |
| 102 | 108 | old_text = _extract_old_text(tool_name, info) or _extract_old_text(tool_name, args) |
| 103 | 109 | new_text = _extract_new_text(tool_name, info) or _extract_new_text(tool_name, args) |
@@ -192,6 +198,18 @@ def _coerce_patch_hunks(value: Any) -> list[StructuredPatchHunk]: | ||
| 192 | 198 | return hunks |
| 193 | 199 | |
| 194 | 200 | |
| 201 | +def _coerce_raw_patch_hunks(payload: dict[str, Any]) -> list[StructuredPatchHunk]: | |
| 202 | + for key in ("patch", "diff", "patch_text"): | |
| 203 | + value = payload.get(key) | |
| 204 | + if not isinstance(value, str) or not value.strip(): | |
| 205 | + continue | |
| 206 | + try: | |
| 207 | + return parse_unified_diff_patch(value) | |
| 208 | + except ValueError: | |
| 209 | + continue | |
| 210 | + return [] | |
| 211 | + | |
| 212 | + | |
| 195 | 213 | def _extract_file_path(payload: dict[str, Any]) -> str | None: |
| 196 | 214 | for key in ("file_path", "filePath", "path", "filename", "file"): |
| 197 | 215 | value = payload.get(key) |
tests/test_completion_policy.py modified @@ -7,7 +7,7 @@ from types import SimpleNamespace | ||
| 7 | 7 | |
| 8 | 8 | import pytest |
| 9 | 9 | |
| 10 | -from loader.llm.base import Message, Role | |
| 10 | +from loader.llm.base import Message, Role, ToolCall | |
| 11 | 11 | from loader.runtime.completion_policy import CompletionPolicy |
| 12 | 12 | from loader.runtime.context import RuntimeContext |
| 13 | 13 | from loader.runtime.dod import VerificationEvidence, create_definition_of_done |
@@ -24,6 +24,7 @@ from loader.runtime.task_completion import ( | ||
| 24 | 24 | detect_premature_completion, |
| 25 | 25 | get_continuation_prompt, |
| 26 | 26 | ) |
| 27 | +from loader.runtime.workflow import advance_todos_from_tool_call, sync_todos_to_definition_of_done | |
| 27 | 28 | from loader.runtime.verification_observations import ( |
| 28 | 29 | VerificationObservationStatus, |
| 29 | 30 | verification_attempt_id, |
@@ -338,6 +339,78 @@ def test_completion_assessment_attaches_typed_verification_provenance() -> None: | ||
| 338 | 339 | assert assessment.evidence_provenance[0].summary == "verification failed for `pytest -q`" |
| 339 | 340 | |
| 340 | 341 | |
| 342 | +def test_completion_assessment_uses_advanced_todo_progress_for_next_step() -> None: | |
| 343 | + dod = create_definition_of_done("Fix the chapter links in index.html.") | |
| 344 | + sync_todos_to_definition_of_done( | |
| 345 | + dod, | |
| 346 | + [ | |
| 347 | + { | |
| 348 | + "content": "First, examine the current index.html file to understand its structure", | |
| 349 | + "active_form": "Working on: First, examine the current index.html file to understand its structure", | |
| 350 | + "status": "pending", | |
| 351 | + }, | |
| 352 | + { | |
| 353 | + "content": "List and read all HTML files in the chapters directory to extract chapter information", | |
| 354 | + "active_form": "Working on: List and read all HTML files in the chapters directory to extract chapter information", | |
| 355 | + "status": "pending", | |
| 356 | + }, | |
| 357 | + { | |
| 358 | + "content": "Parse chapter titles from each HTML file", | |
| 359 | + "active_form": "Working on: Parse chapter titles from each HTML file", | |
| 360 | + "status": "pending", | |
| 361 | + }, | |
| 362 | + { | |
| 363 | + "content": "Update index.html with correct chapter links and titles", | |
| 364 | + "active_form": "Working on: Update index.html with correct chapter links and titles", | |
| 365 | + "status": "pending", | |
| 366 | + }, | |
| 367 | + ], | |
| 368 | + ) | |
| 369 | + advance_todos_from_tool_call( | |
| 370 | + dod, | |
| 371 | + ToolCall( | |
| 372 | + id="read-index", | |
| 373 | + name="read", | |
| 374 | + arguments={"file_path": "/tmp/fortran/index.html"}, | |
| 375 | + ), | |
| 376 | + ) | |
| 377 | + advance_todos_from_tool_call( | |
| 378 | + dod, | |
| 379 | + ToolCall( | |
| 380 | + id="glob-chapters", | |
| 381 | + name="glob", | |
| 382 | + arguments={"path": "/tmp/fortran/chapters", "pattern": "*.html"}, | |
| 383 | + ), | |
| 384 | + ) | |
| 385 | + advance_todos_from_tool_call( | |
| 386 | + dod, | |
| 387 | + ToolCall( | |
| 388 | + id="read-chapter", | |
| 389 | + name="read", | |
| 390 | + arguments={"file_path": "/tmp/fortran/chapters/01-introduction.html"}, | |
| 391 | + ), | |
| 392 | + ) | |
| 393 | + | |
| 394 | + assessment = assess_completion_follow_through_with_provenance( | |
| 395 | + task="Update /tmp/fortran/index.html so every chapter link is correct.", | |
| 396 | + response="I'll update the index.html file with the correct chapter links and titles.", | |
| 397 | + actions_taken=[ | |
| 398 | + "read: {'file_path': '/tmp/fortran/index.html'}", | |
| 399 | + "glob: {'path': '/tmp/fortran/chapters', 'pattern': '*.html'}", | |
| 400 | + "read: {'file_path': '/tmp/fortran/chapters/01-introduction.html'}", | |
| 401 | + ], | |
| 402 | + dod=dod, | |
| 403 | + ) | |
| 404 | + | |
| 405 | + assert assessment.check.missing_evidence[0] == ( | |
| 406 | + "completion of tracked work items " | |
| 407 | + "(Update index.html with correct chapter links and titles)" | |
| 408 | + ) | |
| 409 | + assert assessment.check.suggested_next_steps[0] == ( | |
| 410 | + "Complete the tracked item: Update index.html with correct chapter links and titles" | |
| 411 | + ) | |
| 412 | + | |
| 413 | + | |
| 341 | 414 | @pytest.mark.asyncio |
| 342 | 415 | async def test_completion_policy_stops_for_text_loop_using_runtime_context( |
| 343 | 416 | temp_dir: Path, |
tests/test_expanded_tools.py modified @@ -87,6 +87,43 @@ async def test_patch_tool_accepts_replacement_block_hunks(temp_dir: Path) -> Non | ||
| 87 | 87 | assert result.metadata["structured_patch"] |
| 88 | 88 | |
| 89 | 89 | |
| 90 | +@pytest.mark.asyncio | |
| 91 | +async def test_patch_tool_accepts_unified_diff_string(temp_dir: Path) -> None: | |
| 92 | + target = temp_dir / "sample.txt" | |
| 93 | + target.write_text("alpha\nbeta\ngamma\n") | |
| 94 | + tool = PatchTool(workspace_root=temp_dir) | |
| 95 | + | |
| 96 | + result = await tool.execute( | |
| 97 | + file_path=str(target), | |
| 98 | + patch=( | |
| 99 | + "--- a/sample.txt\n" | |
| 100 | + "+++ b/sample.txt\n" | |
| 101 | + "@@ -2,1 +2,1 @@\n" | |
| 102 | + "-beta\n" | |
| 103 | + "+beta updated\n" | |
| 104 | + ), | |
| 105 | + ) | |
| 106 | + | |
| 107 | + assert result.is_error is False | |
| 108 | + assert target.read_text() == "alpha\nbeta updated\ngamma\n" | |
| 109 | + assert result.metadata["structured_patch"] | |
| 110 | + | |
| 111 | + | |
| 112 | +@pytest.mark.asyncio | |
| 113 | +async def test_patch_tool_rejects_invalid_unified_diff_string(temp_dir: Path) -> None: | |
| 114 | + target = temp_dir / "sample.txt" | |
| 115 | + target.write_text("alpha\nbeta\ngamma\n") | |
| 116 | + tool = PatchTool(workspace_root=temp_dir) | |
| 117 | + | |
| 118 | + result = await tool.execute( | |
| 119 | + file_path=str(target), | |
| 120 | + patch="--- a/sample.txt\n+++ b/sample.txt\n@@ ...\n", | |
| 121 | + ) | |
| 122 | + | |
| 123 | + assert result.is_error is True | |
| 124 | + assert "invalid unified-diff hunk header" in result.output | |
| 125 | + | |
| 126 | + | |
| 90 | 127 | @pytest.mark.asyncio |
| 91 | 128 | async def test_git_tool_inspects_read_only_repo_state(temp_dir: Path) -> None: |
| 92 | 129 | subprocess.run(["git", "init", "--quiet"], cwd=temp_dir, check=True) |
tests/test_finalization.py modified @@ -113,6 +113,22 @@ class FakeExecutor: | ||
| 113 | 113 | return self._outcomes.pop(0) |
| 114 | 114 | |
| 115 | 115 | |
| 116 | +class RecordingExecutor: | |
| 117 | + def __init__(self) -> None: | |
| 118 | + self.commands: list[str] = [] | |
| 119 | + | |
| 120 | + async def execute_tool_call(self, tool_call: ToolCall, **_: object) -> ToolExecutionOutcome: | |
| 121 | + command = str(tool_call.arguments.get("command", "")) | |
| 122 | + self.commands.append(command) | |
| 123 | + return tool_outcome( | |
| 124 | + tool_call=tool_call, | |
| 125 | + output="ok", | |
| 126 | + is_error=False, | |
| 127 | + exit_code=0, | |
| 128 | + stdout="ok", | |
| 129 | + ) | |
| 130 | + | |
| 131 | + | |
| 116 | 132 | def build_context(temp_dir: Path, session: FakeSession) -> RuntimeContext: |
| 117 | 133 | registry = create_default_registry(temp_dir) |
| 118 | 134 | registry.configure_workspace_root(temp_dir) |
@@ -301,7 +317,6 @@ async def test_turn_finalizer_records_passed_verification_observation( | ||
| 301 | 317 | ) |
| 302 | 318 | dod = create_definition_of_done("Update the runtime tests.") |
| 303 | 319 | dod.mutating_actions.append("write") |
| 304 | - dod.touched_files.append(str(temp_dir / "tests" / "test_runtime.py")) | |
| 305 | 320 | dod.verification_commands = ["uv run pytest -q"] |
| 306 | 321 | summary = TurnSummary(final_response="") |
| 307 | 322 | tool_call = ToolCall( |
@@ -360,6 +375,59 @@ async def test_turn_finalizer_records_passed_verification_observation( | ||
| 360 | 375 | assert [item.status for item in session.workflow_timeline[-1].verification_observations] == [ |
| 361 | 376 | VerificationObservationStatus.PASSED.value |
| 362 | 377 | ] |
| 378 | + | |
| 379 | + | |
| 380 | +@pytest.mark.asyncio | |
| 381 | +async def test_turn_finalizer_appends_runtime_semantic_verifier_to_planned_commands( | |
| 382 | + temp_dir: Path, | |
| 383 | +) -> None: | |
| 384 | + chapters = temp_dir / "chapters" | |
| 385 | + chapters.mkdir() | |
| 386 | + (chapters / "01-introduction.html").write_text( | |
| 387 | + "<h1>Chapter 1: Introduction to Fortran</h1>\n" | |
| 388 | + ) | |
| 389 | + index = temp_dir / "index.html" | |
| 390 | + index.write_text( | |
| 391 | + "\n".join( | |
| 392 | + [ | |
| 393 | + '<ul class="chapter-list">', | |
| 394 | + ' <li><a href="chapters/01-introduction.html">Chapter 1: Introduction to Fortran</a></li>', | |
| 395 | + "</ul>", | |
| 396 | + ] | |
| 397 | + ) | |
| 398 | + ) | |
| 399 | + | |
| 400 | + session = FakeSession() | |
| 401 | + context = build_context(temp_dir, session) | |
| 402 | + finalizer = TurnFinalizer( | |
| 403 | + context, | |
| 404 | + RuntimeTracer(), | |
| 405 | + DefinitionOfDoneStore(temp_dir), | |
| 406 | + set_workflow_mode=_noop_set_workflow_mode, | |
| 407 | + ) | |
| 408 | + dod = create_definition_of_done( | |
| 409 | + "Update index.html so the table of contents links and chapter titles are correct." | |
| 410 | + ) | |
| 411 | + dod.mutating_actions.append("edit") | |
| 412 | + dod.touched_files.append(str(index)) | |
| 413 | + dod.verification_commands = ['grep -n "href=" index.html'] | |
| 414 | + summary = TurnSummary(final_response="") | |
| 415 | + executor = RecordingExecutor() | |
| 416 | + | |
| 417 | + async def capture(event) -> None: | |
| 418 | + return None | |
| 419 | + | |
| 420 | + result = await finalizer.run_definition_of_done_gate( | |
| 421 | + dod=dod, | |
| 422 | + candidate_response="Updated the index.html links.", | |
| 423 | + emit=capture, | |
| 424 | + summary=summary, | |
| 425 | + executor=executor, # type: ignore[arg-type] | |
| 426 | + ) | |
| 427 | + | |
| 428 | + assert result.should_continue is False | |
| 429 | + assert any(command == 'grep -n "href=" index.html' for command in executor.commands) | |
| 430 | + assert any(command.startswith("/usr/bin/python3 - <<'PY'") for command in executor.commands) | |
| 363 | 431 | assert ( |
| 364 | 432 | session.workflow_timeline[-1].verification_observations[0].attempt_id |
| 365 | 433 | == "verification-attempt-1" |
tests/test_runtime_harness.py modified @@ -901,6 +901,46 @@ async def test_raw_json_patch_tool_call_fallback(temp_dir: Path) -> None: | ||
| 901 | 901 | assert "Patched sample.txt." in run.response |
| 902 | 902 | |
| 903 | 903 | |
| 904 | +@pytest.mark.asyncio | |
| 905 | +async def test_native_patch_tool_accepts_unified_diff_string(temp_dir: Path) -> None: | |
| 906 | + target = temp_dir / "sample.txt" | |
| 907 | + target.write_text("alpha\nbeta\ngamma\n") | |
| 908 | + | |
| 909 | + backend = ScriptedBackend( | |
| 910 | + completions=[ | |
| 911 | + native_tool_response( | |
| 912 | + ToolCall( | |
| 913 | + id="patch-1", | |
| 914 | + name="patch", | |
| 915 | + arguments={ | |
| 916 | + "file_path": str(target), | |
| 917 | + "patch": ( | |
| 918 | + "--- a/sample.txt\n" | |
| 919 | + "+++ b/sample.txt\n" | |
| 920 | + "@@ -2,1 +2,1 @@\n" | |
| 921 | + "-beta\n" | |
| 922 | + "+beta updated\n" | |
| 923 | + ), | |
| 924 | + }, | |
| 925 | + ), | |
| 926 | + content="I'll patch the file directly.", | |
| 927 | + ), | |
| 928 | + final_response("Patched sample.txt."), | |
| 929 | + ] | |
| 930 | + ) | |
| 931 | + | |
| 932 | + run = await run_scenario( | |
| 933 | + "Update sample.txt.", | |
| 934 | + backend, | |
| 935 | + config=non_streaming_config(), | |
| 936 | + project_root=temp_dir, | |
| 937 | + ) | |
| 938 | + | |
| 939 | + assert tool_event_names(run) == ["patch"] | |
| 940 | + assert target.read_text() == "alpha\nbeta updated\ngamma\n" | |
| 941 | + assert "Patched sample.txt." in run.response | |
| 942 | + | |
| 943 | + | |
| 904 | 944 | @pytest.mark.asyncio |
| 905 | 945 | async def test_raw_json_ask_user_question_tool_call_fallback(temp_dir: Path) -> None: |
| 906 | 946 | raw_json = json.dumps( |
@@ -1766,6 +1806,66 @@ async def test_duplicate_read_is_skipped_without_intervening_mutation( | ||
| 1766 | 1806 | assert "existing file contents" in run.response |
| 1767 | 1807 | |
| 1768 | 1808 | |
| 1809 | +@pytest.mark.asyncio | |
| 1810 | +async def test_duplicate_observation_queues_steering_to_reuse_prior_evidence( | |
| 1811 | + temp_dir: Path, | |
| 1812 | +) -> None: | |
| 1813 | + chapters = temp_dir / "chapters" | |
| 1814 | + chapters.mkdir() | |
| 1815 | + (chapters / "01-introduction.html").write_text("<h1>Chapter 1: Introduction to Fortran</h1>\n") | |
| 1816 | + (chapters / "02-setup.html").write_text("<h1>Chapter 2: Setting Up Fortran</h1>\n") | |
| 1817 | + index_file = temp_dir / "index.html" | |
| 1818 | + index_file.write_text("broken table of contents\n") | |
| 1819 | + | |
| 1820 | + backend = ScriptedBackend( | |
| 1821 | + completions=[ | |
| 1822 | + native_tool_response( | |
| 1823 | + ToolCall( | |
| 1824 | + id="glob-1", | |
| 1825 | + name="glob", | |
| 1826 | + arguments={"path": str(chapters), "pattern": "*.html"}, | |
| 1827 | + ), | |
| 1828 | + content="I'll inspect the chapter inventory first.", | |
| 1829 | + ), | |
| 1830 | + native_tool_response( | |
| 1831 | + ToolCall( | |
| 1832 | + id="read-1", | |
| 1833 | + name="read", | |
| 1834 | + arguments={"file_path": str(index_file)}, | |
| 1835 | + ), | |
| 1836 | + content="I'll inspect the index next.", | |
| 1837 | + ), | |
| 1838 | + native_tool_response( | |
| 1839 | + ToolCall( | |
| 1840 | + id="read-2", | |
| 1841 | + name="read", | |
| 1842 | + arguments={"file_path": str(index_file)}, | |
| 1843 | + ), | |
| 1844 | + content="I'll reopen the index.", | |
| 1845 | + ), | |
| 1846 | + final_response("I'll reuse the earlier evidence and patch the index next."), | |
| 1847 | + ] | |
| 1848 | + ) | |
| 1849 | + | |
| 1850 | + run = await run_scenario( | |
| 1851 | + "Update index.html so the table of contents links are correct.", | |
| 1852 | + backend, | |
| 1853 | + config=non_streaming_config(), | |
| 1854 | + project_root=temp_dir, | |
| 1855 | + ) | |
| 1856 | + | |
| 1857 | + messages = tool_result_messages(run) | |
| 1858 | + steering_messages = [ | |
| 1859 | + event.content | |
| 1860 | + for event in run.events | |
| 1861 | + if event.type == "steering" and event.content | |
| 1862 | + ] | |
| 1863 | + | |
| 1864 | + assert any("reuse the earlier read result instead of rereading" in message for message in messages) | |
| 1865 | + assert any("Reuse the earlier observation instead of repeating it." in message for message in steering_messages) | |
| 1866 | + assert any("index.html" in message for message in steering_messages) | |
| 1867 | + | |
| 1868 | + | |
| 1769 | 1869 | @pytest.mark.asyncio |
| 1770 | 1870 | async def test_interleaved_reread_is_allowed_once_without_intervening_mutation( |
| 1771 | 1871 | temp_dir: Path, |
tests/test_safeguard_services.py modified @@ -158,11 +158,25 @@ def test_pre_action_validator_blocks_patch_without_hunks() -> None: | ||
| 158 | 158 | assert result == ValidationResult( |
| 159 | 159 | valid=False, |
| 160 | 160 | reason="Patch hunks are missing", |
| 161 | - suggestion="Provide one or more structured patch hunks", | |
| 161 | + suggestion="Provide structured patch hunks or a unified diff patch string", | |
| 162 | 162 | severity="error", |
| 163 | 163 | ) |
| 164 | 164 | |
| 165 | 165 | |
| 166 | +def test_pre_action_validator_allows_patch_string_without_hunks() -> None: | |
| 167 | + validator = PreActionValidator() | |
| 168 | + | |
| 169 | + result = validator.validate( | |
| 170 | + "patch", | |
| 171 | + { | |
| 172 | + "file_path": "notes.txt", | |
| 173 | + "patch": "--- a/notes.txt\n+++ b/notes.txt\n@@ -1,1 +1,1 @@\n-old\n+new\n", | |
| 174 | + }, | |
| 175 | + ) | |
| 176 | + | |
| 177 | + assert result == ValidationResult(valid=True) | |
| 178 | + | |
| 179 | + | |
| 166 | 180 | def test_runtime_safeguards_wrap_runtime_owned_services() -> None: |
| 167 | 181 | safeguards = RuntimeSafeguards() |
| 168 | 182 | |
tests/test_tool_batch_policies.py modified @@ -349,12 +349,74 @@ async def test_tool_batch_recovery_controller_includes_known_state_for_missing_f | ||
| 349 | 349 | |
| 350 | 350 | assert follow_up is not None |
| 351 | 351 | assert "## CONTINUE FROM KNOWN STATE" in follow_up.content |
| 352 | + assert "apply the fix using confirmed findings" in follow_up.content | |
| 353 | + assert "## ACTION BIAS FOR THIS RECOVERY" in follow_up.content | |
| 354 | + assert "Prefer edit/write/patch on the target file" in follow_up.content | |
| 352 | 355 | assert "04-variables.html" in follow_up.content |
| 353 | 356 | assert "02-basic-syntax.html -> 02-setup.html" in follow_up.content |
| 354 | 357 | assert "`~/Loader/guides/fortran/index.html`" in follow_up.content |
| 355 | 358 | assert any(event.type == "recovery" for event in events) |
| 356 | 359 | |
| 357 | 360 | |
| 361 | +@pytest.mark.asyncio | |
| 362 | +async def test_tool_batch_recovery_controller_suggests_known_sibling_files( | |
| 363 | + temp_dir: Path, | |
| 364 | +) -> None: | |
| 365 | + async def assess_confidence(tool_name: str, tool_args: dict, context: str) -> ConfidenceAssessment: | |
| 366 | + raise AssertionError("Confidence should not run here") | |
| 367 | + | |
| 368 | + async def verify_action(tool_name: str, tool_args: dict, result: str, expected: str = "") -> ActionVerification: | |
| 369 | + raise AssertionError("Verification should not run here") | |
| 370 | + | |
| 371 | + messages = [ | |
| 372 | + Message( | |
| 373 | + role=Role.TOOL, | |
| 374 | + content=( | |
| 375 | + "Observation [glob]: Result: " | |
| 376 | + "/private/tmp/fortran-qwen-recovery-check/chapters/01-introduction.html\n" | |
| 377 | + "/private/tmp/fortran-qwen-recovery-check/chapters/02-setup.html\n" | |
| 378 | + "/private/tmp/fortran-qwen-recovery-check/chapters/03-basics.html\n" | |
| 379 | + "/private/tmp/fortran-qwen-recovery-check/chapters/04-variables.html\n" | |
| 380 | + "/private/tmp/fortran-qwen-recovery-check/chapters/05-input-output.html" | |
| 381 | + ), | |
| 382 | + tool_results=[], | |
| 383 | + ), | |
| 384 | + ] | |
| 385 | + context = build_context( | |
| 386 | + temp_dir=temp_dir, | |
| 387 | + messages=messages, | |
| 388 | + assess_confidence=assess_confidence, | |
| 389 | + verify_action=verify_action, | |
| 390 | + ) | |
| 391 | + controller = ToolBatchRecoveryController(context) | |
| 392 | + tool_call = ToolCall( | |
| 393 | + id="read-missing", | |
| 394 | + name="read", | |
| 395 | + arguments={"file_path": "/tmp/fortran-qwen-recovery-check/chapters/04-data-types.html"}, | |
| 396 | + ) | |
| 397 | + outcome = tool_outcome( | |
| 398 | + tool_call=tool_call, | |
| 399 | + output="File not found: /tmp/fortran-qwen-recovery-check/chapters/04-data-types.html", | |
| 400 | + is_error=True, | |
| 401 | + ) | |
| 402 | + | |
| 403 | + events: list[AgentEvent] = [] | |
| 404 | + | |
| 405 | + async def emit(event: AgentEvent) -> None: | |
| 406 | + events.append(event) | |
| 407 | + | |
| 408 | + follow_up = await controller.build_follow_up( | |
| 409 | + tool_call=tool_call, | |
| 410 | + outcome=outcome, | |
| 411 | + emit=emit, | |
| 412 | + ) | |
| 413 | + | |
| 414 | + assert follow_up is not None | |
| 415 | + assert "## LIKELY FILE CANDIDATES" in follow_up.content | |
| 416 | + assert "`04-variables.html`" in follow_up.content | |
| 417 | + assert "instead of retrying the missing path" in follow_up.content | |
| 418 | + | |
| 419 | + | |
| 358 | 420 | @pytest.mark.asyncio |
| 359 | 421 | async def test_tool_batch_recovery_controller_reuses_context_for_related_missing_files( |
| 360 | 422 | temp_dir: Path, |
tests/test_tool_batches.py modified @@ -540,6 +540,108 @@ async def test_tool_batch_runner_clears_recovery_context_after_successful_mutati | ||
| 540 | 540 | assert context.recovery_context is None |
| 541 | 541 | |
| 542 | 542 | |
| 543 | +@pytest.mark.asyncio | |
| 544 | +async def test_tool_batch_runner_queues_duplicate_observation_nudge( | |
| 545 | + temp_dir: Path, | |
| 546 | +) -> None: | |
| 547 | + async def assess_confidence( | |
| 548 | + tool_name: str, | |
| 549 | + tool_args: dict, | |
| 550 | + context: str, | |
| 551 | + ) -> ConfidenceAssessment: | |
| 552 | + raise AssertionError("Confidence scoring should be disabled in this scenario") | |
| 553 | + | |
| 554 | + async def verify_action( | |
| 555 | + tool_name: str, | |
| 556 | + tool_args: dict, | |
| 557 | + result: str, | |
| 558 | + expected: str = "", | |
| 559 | + ) -> ActionVerification: | |
| 560 | + raise AssertionError("Verification should not run for this scenario") | |
| 561 | + | |
| 562 | + messages = [ | |
| 563 | + Message( | |
| 564 | + role=Role.TOOL, | |
| 565 | + content=( | |
| 566 | + "Observation [glob]: Result: " | |
| 567 | + f"{temp_dir}/chapters/01-introduction.html\n" | |
| 568 | + f"{temp_dir}/chapters/02-setup.html\n" | |
| 569 | + f"{temp_dir}/chapters/03-basics.html" | |
| 570 | + ), | |
| 571 | + tool_results=[], | |
| 572 | + ), | |
| 573 | + Message( | |
| 574 | + role=Role.ASSISTANT, | |
| 575 | + content="I should update the index now.", | |
| 576 | + tool_calls=[ | |
| 577 | + ToolCall( | |
| 578 | + id="read-index", | |
| 579 | + name="read", | |
| 580 | + arguments={"file_path": str(temp_dir / 'index.html')}, | |
| 581 | + ) | |
| 582 | + ], | |
| 583 | + ), | |
| 584 | + ] | |
| 585 | + context = build_context( | |
| 586 | + temp_dir=temp_dir, | |
| 587 | + messages=messages, | |
| 588 | + safeguards=FakeSafeguards(), | |
| 589 | + assess_confidence=assess_confidence, | |
| 590 | + verify_action=verify_action, | |
| 591 | + auto_recover=False, | |
| 592 | + ) | |
| 593 | + context.session.current_task = ( | |
| 594 | + f"Update {temp_dir / 'index.html'} with the right chapter links." | |
| 595 | + ) | |
| 596 | + queued_messages: list[str] = [] | |
| 597 | + context.queue_steering_message_callback = queued_messages.append | |
| 598 | + runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir)) | |
| 599 | + tool_call = ToolCall( | |
| 600 | + id="read-dup", | |
| 601 | + name="read", | |
| 602 | + arguments={"file_path": str(temp_dir / "index.html")}, | |
| 603 | + ) | |
| 604 | + duplicate_message = ( | |
| 605 | + "[Skipped - duplicate action: Already read " | |
| 606 | + f"{temp_dir / 'index.html'} recently without any intervening changes; " | |
| 607 | + "reuse the earlier read result instead of rereading]" | |
| 608 | + ) | |
| 609 | + executor = FakeExecutor( | |
| 610 | + [ | |
| 611 | + ToolExecutionOutcome( | |
| 612 | + tool_call=tool_call, | |
| 613 | + state=ToolExecutionState.DUPLICATE, | |
| 614 | + message=Message.tool_result_message( | |
| 615 | + tool_call_id=tool_call.id, | |
| 616 | + display_content=duplicate_message, | |
| 617 | + result_content=duplicate_message, | |
| 618 | + ), | |
| 619 | + event_content=duplicate_message, | |
| 620 | + is_error=False, | |
| 621 | + result_output=duplicate_message, | |
| 622 | + ) | |
| 623 | + ] | |
| 624 | + ) | |
| 625 | + | |
| 626 | + await runner.execute_batch( | |
| 627 | + tool_calls=[tool_call], | |
| 628 | + tool_source="assistant", | |
| 629 | + pending_tool_calls_seen=set(), | |
| 630 | + emit=_noop_emit, | |
| 631 | + summary=TurnSummary(final_response=""), | |
| 632 | + dod=create_definition_of_done("Fix the chapter links"), | |
| 633 | + executor=executor, # type: ignore[arg-type] | |
| 634 | + on_confirmation=None, | |
| 635 | + on_user_question=None, | |
| 636 | + emit_confirmation=None, | |
| 637 | + consecutive_errors=0, | |
| 638 | + ) | |
| 639 | + | |
| 640 | + assert len(queued_messages) == 1 | |
| 641 | + assert "Reuse the earlier observation instead of repeating it." in queued_messages[0] | |
| 642 | + assert "index.html" in queued_messages[0] | |
| 643 | + | |
| 644 | + | |
| 543 | 645 | async def _noop_emit(event: AgentEvent) -> None: |
| 544 | 646 | return None |
| 545 | 647 | |
tests/test_workflow.py modified @@ -4,6 +4,7 @@ from __future__ import annotations | ||
| 4 | 4 | |
| 5 | 5 | from pathlib import Path |
| 6 | 6 | |
| 7 | +from loader.llm.base import ToolCall | |
| 7 | 8 | from loader.runtime.clarify_grounding import ClarifyGrounding, ClarifyRepoFact |
| 8 | 9 | from loader.runtime.dod import DefinitionOfDoneStore, create_definition_of_done |
| 9 | 10 | from loader.runtime.workflow import ( |
@@ -12,6 +13,7 @@ from loader.runtime.workflow import ( | ||
| 12 | 13 | PlanningArtifacts, |
| 13 | 14 | WorkflowArtifactStore, |
| 14 | 15 | WorkflowMode, |
| 16 | + advance_todos_from_tool_call, | |
| 15 | 17 | build_execute_bridge, |
| 16 | 18 | enrich_clarify_brief_with_grounding, |
| 17 | 19 | extract_verification_commands_from_markdown, |
@@ -183,6 +185,26 @@ def test_planning_artifacts_recover_embedded_verification_from_legacy_separator( | ||
| 183 | 185 | ] |
| 184 | 186 | |
| 185 | 187 | |
| 188 | +def test_extract_verification_commands_from_markdown_splits_code_blocks() -> None: | |
| 189 | + markdown = "\n".join( | |
| 190 | + [ | |
| 191 | + "# Verification Plan", | |
| 192 | + "", | |
| 193 | + "## Verification Commands", | |
| 194 | + "```bash", | |
| 195 | + "# Check chapter files", | |
| 196 | + "ls chapters", | |
| 197 | + "grep -n \"href=\" index.html", | |
| 198 | + "```", | |
| 199 | + ] | |
| 200 | + ) | |
| 201 | + | |
| 202 | + assert extract_verification_commands_from_markdown(markdown) == [ | |
| 203 | + "ls chapters", | |
| 204 | + 'grep -n "href=" index.html', | |
| 205 | + ] | |
| 206 | + | |
| 207 | + | |
| 186 | 208 | def test_workflow_artifact_store_and_bridge_round_trip(tmp_path: Path) -> None: |
| 187 | 209 | store = WorkflowArtifactStore(tmp_path) |
| 188 | 210 | brief = ClarifyBrief.fallback( |
@@ -250,3 +272,93 @@ def test_sync_todos_to_definition_of_done_preserves_runtime_items() -> None: | ||
| 250 | 272 | assert "Writing router" in dod.pending_items |
| 251 | 273 | assert "Collect verification evidence" in dod.pending_items |
| 252 | 274 | assert "Update tests" in dod.completed_items |
| 275 | + | |
| 276 | + | |
| 277 | +def test_advance_todos_from_tool_call_tracks_plan_progress() -> None: | |
| 278 | + dod = create_definition_of_done("Fix the chapter links in index.html.") | |
| 279 | + sync_todos_to_definition_of_done( | |
| 280 | + dod, | |
| 281 | + [ | |
| 282 | + { | |
| 283 | + "content": "First, examine the current index.html file to understand its structure", | |
| 284 | + "active_form": "Working on: First, examine the current index.html file to understand its structure", | |
| 285 | + "status": "pending", | |
| 286 | + }, | |
| 287 | + { | |
| 288 | + "content": "List and read all HTML files in the chapters directory to extract chapter information", | |
| 289 | + "active_form": "Working on: List and read all HTML files in the chapters directory to extract chapter information", | |
| 290 | + "status": "pending", | |
| 291 | + }, | |
| 292 | + { | |
| 293 | + "content": "Parse chapter titles from each HTML file", | |
| 294 | + "active_form": "Working on: Parse chapter titles from each HTML file", | |
| 295 | + "status": "pending", | |
| 296 | + }, | |
| 297 | + { | |
| 298 | + "content": "Update index.html with correct chapter links and titles", | |
| 299 | + "active_form": "Working on: Update index.html with correct chapter links and titles", | |
| 300 | + "status": "pending", | |
| 301 | + }, | |
| 302 | + { | |
| 303 | + "content": "Verify the updated index.html file is properly formatted", | |
| 304 | + "active_form": "Working on: Verify the updated index.html file is properly formatted", | |
| 305 | + "status": "pending", | |
| 306 | + }, | |
| 307 | + ], | |
| 308 | + ) | |
| 309 | + | |
| 310 | + assert advance_todos_from_tool_call( | |
| 311 | + dod, | |
| 312 | + ToolCall( | |
| 313 | + id="read-index", | |
| 314 | + name="read", | |
| 315 | + arguments={"file_path": "/tmp/fortran/index.html"}, | |
| 316 | + ), | |
| 317 | + ) | |
| 318 | + assert ( | |
| 319 | + "First, examine the current index.html file to understand its structure" | |
| 320 | + in dod.completed_items | |
| 321 | + ) | |
| 322 | + | |
| 323 | + assert advance_todos_from_tool_call( | |
| 324 | + dod, | |
| 325 | + ToolCall( | |
| 326 | + id="glob-chapters", | |
| 327 | + name="glob", | |
| 328 | + arguments={"path": "/tmp/fortran/chapters", "pattern": "*.html"}, | |
| 329 | + ), | |
| 330 | + ) | |
| 331 | + assert ( | |
| 332 | + "List and read all HTML files in the chapters directory to extract chapter information" | |
| 333 | + in dod.completed_items | |
| 334 | + ) | |
| 335 | + | |
| 336 | + assert advance_todos_from_tool_call( | |
| 337 | + dod, | |
| 338 | + ToolCall( | |
| 339 | + id="read-chapter", | |
| 340 | + name="read", | |
| 341 | + arguments={"file_path": "/tmp/fortran/chapters/01-introduction.html"}, | |
| 342 | + ), | |
| 343 | + ) | |
| 344 | + assert "Parse chapter titles from each HTML file" in dod.completed_items | |
| 345 | + | |
| 346 | + assert advance_todos_from_tool_call( | |
| 347 | + dod, | |
| 348 | + ToolCall( | |
| 349 | + id="patch-index", | |
| 350 | + name="patch", | |
| 351 | + arguments={"file_path": "/tmp/fortran/index.html", "hunks": []}, | |
| 352 | + ), | |
| 353 | + ) | |
| 354 | + assert "Update index.html with correct chapter links and titles" in dod.completed_items | |
| 355 | + | |
| 356 | + assert advance_todos_from_tool_call( | |
| 357 | + dod, | |
| 358 | + ToolCall( | |
| 359 | + id="verify-index", | |
| 360 | + name="bash", | |
| 361 | + arguments={"command": "grep -o 'href=\"[^\"]*\"' /tmp/fortran/index.html"}, | |
| 362 | + ), | |
| 363 | + ) | |
| 364 | + assert "Verify the updated index.html file is properly formatted" in dod.completed_items | |