Harden qwen recovery loops and verification
- SHA: 3a703a1e283382d4405282386f066a805123c57f
- Parents: f0d4490
- Tree: 132f2bd

| Status | File | + | - |
|---|---|---|---|
| M | src/loader/runtime/finalization.py | 8 | 0 |
| M | src/loader/runtime/recovery.py | 8 | 6 |
| M | src/loader/runtime/safeguard_services.py | 30 | 7 |
| M | src/loader/runtime/tool_batch_recovery.py | 107 | 2 |
| M | src/loader/runtime/tool_batches.py | 43 | 0 |
| M | src/loader/runtime/workflow.py | 222 | 2 |
| M | src/loader/tools/file_tools.py | 27 | 10 |
| M | src/loader/tools/fs_safety.py | 62 | 0 |
| M | src/loader/utils/file_mutations.py | 19 | 1 |
| M | tests/test_completion_policy.py | 74 | 1 |
| M | tests/test_expanded_tools.py | 37 | 0 |
| M | tests/test_finalization.py | 69 | 1 |
| M | tests/test_runtime_harness.py | 100 | 0 |
| M | tests/test_safeguard_services.py | 15 | 1 |
| M | tests/test_tool_batch_policies.py | 62 | 0 |
| M | tests/test_tool_batches.py | 102 | 0 |
| M | tests/test_workflow.py | 112 | 0 |
`src/loader/runtime/finalization.py` (modified)

```diff
@@ -231,6 +231,14 @@ class TurnFinalizer:
                 project_root=self.context.project_root,
                 task_statement=dod.task_statement,
             )
+        else:
+            for command in derive_verification_commands(
+                dod,
+                project_root=self.context.project_root,
+                task_statement=dod.task_statement,
+            ):
+                if command not in dod.verification_commands:
+                    dod.verification_commands.append(command)
 
         await self.set_workflow_mode(
             ModeDecision.transition(
```
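The behavioral change is easy to miss in the hunk above: previously, verification commands were derived only when the plan had none, while the new `else` branch merges freshly derived commands into an existing plan without duplicating entries. A minimal sketch of the merge idiom (`merge_commands` is an illustrative name, not the module's API):

```python
def merge_commands(planned: list[str], derived: list[str]) -> list[str]:
    # Keep the existing plan's order; append only commands not already present.
    for command in derived:
        if command not in planned:
            planned.append(command)
    return planned

assert merge_commands(["pytest -q"], ["pytest -q", "ruff check ."]) == [
    "pytest -q",
    "ruff check .",
]
```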
`src/loader/runtime/recovery.py` (modified)

```diff
@@ -690,15 +690,17 @@ The command failed. You MUST analyze the error and take a DIFFERENT action.
 {hints}
 
 ## CRITICAL RULES:
-1. **INVESTIGATE FIRST** - Read config files, list directories, check what exists
-2. **DO NOT** just retry the same command with slight variations
-3. **DO NOT** try `npm start` then `npm run start` - these are the same thing!
-4. **READ THE ERROR** - It usually tells you exactly what's wrong
-5. If the error says "missing script: start", read package.json to see what scripts exist
+1. Start from the error and the state you already know
+2. Investigate only if a specific fact is still missing
+3. If you already have enough confirmed evidence, apply the fix instead of rereading the same files
+4. **DO NOT** just retry the same command with slight variations
+5. **DO NOT** try `npm start` then `npm run start` - these are the same thing!
+6. **READ THE ERROR** - It usually tells you exactly what's wrong
+7. If the error says "missing script: start", read package.json to see what scripts exist
 
 ## Current attempt: {attempt_count}/{max_retries}
 
-**Your next action should gather information OR try a fundamentally different approach.**
+**Your next action should either gather the missing information OR apply the fix using confirmed findings.**
 What will you do?"""
 
 
```
`src/loader/runtime/safeguard_services.py` (modified)

```diff
@@ -76,6 +76,11 @@ class ActionTracker:
         sig = str(hash(str(hunks)))
         return sig in self._files_edited.get(norm_path, [])
 
+    def would_duplicate_raw_patch(self, file_path: str, patch_text: str) -> bool:
+        norm_path = self._normalize_path(file_path)
+        sig = str(hash(patch_text))
+        return sig in self._files_edited.get(norm_path, [])
+
     def would_duplicate_command(self, command: str) -> bool:
         norm_cmd = self._normalize_command(command)
         return norm_cmd in self._commands_run
@@ -123,8 +128,12 @@ class ActionTracker:
         elif tool_name == "patch":
             file_path = arguments.get("file_path", "")
             hunks = arguments.get("hunks", [])
-            if isinstance(hunks, list) and self.would_duplicate_patch(file_path, hunks):
+            raw_patch = arguments.get("patch") or arguments.get("diff") or arguments.get("patch_text")
+            if isinstance(hunks, list) and hunks and self.would_duplicate_patch(file_path, hunks):
                 return True, f"Same patch already applied to: {file_path}"
+            if isinstance(raw_patch, str) and raw_patch.strip():
+                if self.would_duplicate_raw_patch(file_path, raw_patch):
+                    return True, f"Same patch already applied to: {file_path}"
 
         elif tool_name == "read":
             read_key = self._make_read_key(arguments)
@@ -135,7 +144,8 @@ class ActionTracker:
                 (
                     "Already read "
                     f"{str(arguments.get('file_path', '')).strip()} "
-                    "recently without any intervening changes"
+                    "recently without any intervening changes; "
+                    "reuse the earlier read result instead of rereading"
                 ),
                 repeat_threshold=self.READ_REPEAT_THRESHOLD,
             )
@@ -148,7 +158,10 @@ class ActionTracker:
             duplicate, reason = self._check_recent_observation(
                 self._recent_searches,
                 observation_key,
-                "Already ran the same search recently without any intervening changes",
+                (
+                    "Already ran the same search recently without any intervening "
+                    "changes; reuse the earlier search result instead of rerunning it"
+                ),
                 repeat_threshold=self.SEARCH_REPEAT_THRESHOLD,
             )
             if duplicate:
@@ -160,7 +173,10 @@ class ActionTracker:
             duplicate, reason = self._check_recent_observation(
                 self._recent_bash_observations,
                 self._normalize_command(command),
-                "Already ran the same read-only shell probe recently without any intervening changes",
+                (
+                    "Already ran the same read-only shell probe recently without any "
+                    "intervening changes; reuse the earlier shell output instead of rerunning it"
+                ),
                 repeat_threshold=self.BASH_OBSERVATION_REPEAT_THRESHOLD,
             )
             if duplicate:
@@ -196,7 +212,11 @@ class ActionTracker:
             file_path = arguments.get("file_path", "")
             hunks = arguments.get("hunks", [])
             if file_path:
-                self.record_edit(file_path, str(hunks), "structured_patch")
+                raw_patch = arguments.get("patch") or arguments.get("diff") or arguments.get("patch_text")
+                if isinstance(hunks, list) and hunks:
+                    self.record_edit(file_path, str(hunks), "structured_patch")
+                elif isinstance(raw_patch, str) and raw_patch.strip():
+                    self.record_edit(file_path, raw_patch, "raw_patch")
             self._note_mutation()
 
         elif tool_name == "read":
@@ -592,6 +612,7 @@ class PreActionValidator:
     def _validate_patch(self, arguments: dict) -> ValidationResult:
         file_path = arguments.get("file_path", "")
         hunks = arguments.get("hunks", [])
+        raw_patch = arguments.get("patch") or arguments.get("diff") or arguments.get("patch_text")
 
         if not file_path or not str(file_path).strip():
             return ValidationResult(
@@ -605,11 +626,13 @@ class PreActionValidator:
         if not path_result.valid:
             return path_result
 
-        if not isinstance(hunks, list) or not hunks:
+        has_hunks = isinstance(hunks, list) and bool(hunks)
+        has_raw_patch = isinstance(raw_patch, str) and bool(raw_patch.strip())
+        if not has_hunks and not has_raw_patch:
             return ValidationResult(
                 valid=False,
                 reason="Patch hunks are missing",
-                suggestion="Provide one or more structured patch hunks",
+                suggestion="Provide structured patch hunks or a unified diff patch string",
                 severity="error",
             )
 
```
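For orientation, a minimal standalone sketch of the signature scheme the tracker uses: structured hunks and raw patch strings both collapse to a `hash()` signature keyed by file path, so either form of the same patch trips the duplicate check. `DuplicateTracker` and its method names are illustrative, not the module's API; note that `hash()` of equal strings is stable within one process, which is all a per-session tracker needs.

```python
class DuplicateTracker:
    """Illustrative duplicate detector keyed by (path, payload signature)."""

    def __init__(self) -> None:
        self._files_edited: dict[str, list[str]] = {}

    def record(self, file_path: str, payload: str) -> None:
        # str(hunks) and the raw patch text both funnel through here.
        self._files_edited.setdefault(file_path, []).append(str(hash(payload)))

    def would_duplicate(self, file_path: str, payload: str) -> bool:
        return str(hash(payload)) in self._files_edited.get(file_path, [])

tracker = DuplicateTracker()
tracker.record("a.py", "@@ -1 +1 @@\n-x\n+y")
assert tracker.would_duplicate("a.py", "@@ -1 +1 @@\n-x\n+y")
```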
`src/loader/runtime/tool_batch_recovery.py` (modified)

```diff
@@ -3,6 +3,9 @@
 from __future__ import annotations
 
 from collections.abc import Awaitable, Callable
+from difflib import SequenceMatcher
+from pathlib import Path
+import re
 
 from ..llm.base import Message, ToolCall
 from .compaction import infer_preferred_next_step, summarize_confirmed_facts
@@ -82,7 +85,11 @@ class ToolBatchRecoveryController:
                 tool_call.arguments,
                 outcome.result_output,
             )
-            recovery_prompt = self._augment_recovery_prompt(recovery_prompt)
+            recovery_prompt = self._augment_recovery_prompt(
+                recovery_prompt,
+                tool_call=tool_call,
+                outcome=outcome,
+            )
             return Message.tool_result_message(
                 tool_call_id=tool_call.id,
                 display_content=recovery_prompt,
@@ -106,7 +113,13 @@ class ToolBatchRecoveryController:
                 is_error=True,
             )
 
-    def _augment_recovery_prompt(self, prompt: str) -> str:
+    def _augment_recovery_prompt(
+        self,
+        prompt: str,
+        *,
+        tool_call: ToolCall,
+        outcome: ToolExecutionOutcome,
+    ) -> str:
         """Append transcript-aware recovery guidance when recent facts exist."""
 
         session = self.context.session
@@ -116,6 +129,7 @@ class ToolBatchRecoveryController:
             session.messages,
             current_task=current_task,
         )
+        actionable_known_state = bool(confirmed_facts and preferred_next_step)
         if not confirmed_facts and not preferred_next_step and not current_task:
             return prompt
 
@@ -130,4 +144,95 @@ class ToolBatchRecoveryController:
             "- Preserve progress: do not restart by rereading already-confirmed files "
             "unless you need genuinely new evidence."
         )
+        if actionable_known_state:
+            lines.extend(
+                [
+                    "",
+                    "## ACTION BIAS FOR THIS RECOVERY",
+                    "- The confirmed findings above are already enough to keep moving.",
+                    "- Prefer edit/write/patch on the target file over rereading the same files.",
+                    "- Only inspect one more file if a specific filename, href, or title is still unknown.",
+                    "- Treat the preferred next step as the default path forward.",
+                ]
+            )
+        candidate_lines = self._file_not_found_candidate_lines(tool_call, outcome)
+        if candidate_lines:
+            lines.extend(["", "## LIKELY FILE CANDIDATES", *candidate_lines])
         return "\n".join(lines)
+
+    def _file_not_found_candidate_lines(
+        self,
+        tool_call: ToolCall,
+        outcome: ToolExecutionOutcome,
+    ) -> list[str]:
+        if tool_call.name not in {"read", "write", "edit", "patch"}:
+            return []
+        if "not found" not in outcome.result_output.lower():
+            return []
+
+        missing_path = self._canonicalize_path(
+            str(
+                tool_call.arguments.get("file_path")
+                or tool_call.arguments.get("path")
+                or ""
+            ).strip()
+        )
+        if not missing_path:
+            return []
+
+        candidates = self._rank_known_file_candidates(missing_path)
+        if not candidates:
+            return []
+
+        names = ", ".join(f"`{Path(candidate).name}`" for candidate in candidates[:3])
+        return [
+            f"- Requested file does not exist: `{missing_path}`",
+            f"- Closest known files in the same directory: {names}",
+            "- Prefer one of those exact filenames instead of retrying the missing path.",
+        ]
+
+    def _rank_known_file_candidates(self, missing_path: str) -> list[str]:
+        missing_parent = str(Path(missing_path).parent)
+        missing_name = Path(missing_path).name
+        missing_prefix = missing_name.split("-", 1)[0]
+
+        ranked: list[tuple[float, str]] = []
+        seen: set[str] = set()
+        for candidate in self._known_file_paths():
+            if candidate == missing_path:
+                continue
+            if str(Path(candidate).parent) != missing_parent:
+                continue
+            name = Path(candidate).name
+            if name in seen:
+                continue
+            seen.add(name)
+
+            score = SequenceMatcher(None, missing_name, name).ratio()
+            if missing_prefix and name.startswith(f"{missing_prefix}-"):
+                score += 1.0
+            ranked.append((score, candidate))
+
+        ranked.sort(key=lambda item: (-item[0], item[1]))
+        return [candidate for _, candidate in ranked]
+
+    def _known_file_paths(self) -> list[str]:
+        pattern = re.compile(r"(?:~|/)[^\s`\"']+\.html")
+        discovered: list[str] = []
+        seen: set[str] = set()
+        for message in self.context.session.messages:
+            for raw_path in pattern.findall(message.content):
+                candidate = self._canonicalize_path(raw_path)
+                if not candidate or candidate in seen:
+                    continue
+                seen.add(candidate)
+                discovered.append(candidate)
+        return discovered
+
+    def _canonicalize_path(self, raw_path: str) -> str:
+        if not raw_path:
+            return ""
+        try:
+            return str(Path(raw_path).expanduser().resolve(strict=False))
+        except (OSError, RuntimeError, ValueError):
+            return str(Path(raw_path).expanduser())
```
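The ranking heuristic in `_rank_known_file_candidates` is simple enough to check by hand: same-directory candidates are scored by `SequenceMatcher` similarity on file names, plus a 1.0 bonus when the candidate shares the missing name's leading `prefix-` segment. A self-contained sketch with an illustrative `rank_candidates` helper:

```python
from difflib import SequenceMatcher
from pathlib import Path

def rank_candidates(missing: str, known: list[str]) -> list[str]:
    missing_name = Path(missing).name
    prefix = missing_name.split("-", 1)[0]
    ranked: list[tuple[float, str]] = []
    for candidate in known:
        if Path(candidate).parent != Path(missing).parent:
            continue  # only suggest files from the same directory
        name = Path(candidate).name
        score = SequenceMatcher(None, missing_name, name).ratio()
        if prefix and name.startswith(f"{prefix}-"):
            score += 1.0  # strong boost for a shared numeric/slug prefix
        ranked.append((score, candidate))
    ranked.sort(key=lambda item: (-item[0], item[1]))
    return [candidate for _, candidate in ranked]

print(rank_candidates(
    "/docs/chapters/01-intro.html",
    ["/docs/chapters/02-setup.html", "/docs/chapters/01-introduction.html"],
))
# ['/docs/chapters/01-introduction.html', '/docs/chapters/02-setup.html']
```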
`src/loader/runtime/tool_batches.py` (modified)

```diff
@@ -31,6 +31,8 @@ from .verification_observations import (
     VerificationObservationStatus,
 )
 from .workflow import sync_todos_to_definition_of_done
+from .workflow import advance_todos_from_tool_call
+from .compaction import infer_preferred_next_step
 
 EventSink = Callable[[AgentEvent], Awaitable[None]]
 ConfirmationHandler = (
@@ -215,6 +217,8 @@ class ToolBatchRunner:
             # otherwise the model operates blind and loops.
             self.context.session.append(outcome.message)
             summary.tool_result_messages.append(outcome.message)
+            if outcome.state == ToolExecutionState.DUPLICATE:
+                self._queue_duplicate_observation_nudge(tool_call)
 
             should_continue = await self.verification_gate.should_continue(
                 tool_call=tool_call,
@@ -247,6 +251,43 @@ class ToolBatchRunner:
 
         return result
 
+    def _queue_duplicate_observation_nudge(self, tool_call: ToolCall) -> None:
+        """Queue a concrete next-step nudge after duplicate observational actions."""
+
+        if tool_call.name not in {"read", "glob", "grep", "bash"}:
+            return
+
+        current_task = getattr(self.context.session, "current_task", None)
+        preferred_next_step = infer_preferred_next_step(
+            self.context.session.messages,
+            current_task=current_task,
+        )
+        if preferred_next_step:
+            self.context.queue_steering_message(
+                "Reuse the earlier observation instead of repeating it. "
+                f"{preferred_next_step} "
+                "Only gather more evidence if a specific filename, href, or title is still unknown."
+            )
+            return
+
+        target_path = str(
+            tool_call.arguments.get("file_path")
+            or tool_call.arguments.get("path")
+            or ""
+        ).strip()
+        if target_path:
+            self.context.queue_steering_message(
+                "Reuse the earlier observation instead of repeating it. "
+                f"Use the current contents of `{target_path}` and take a different next step. "
+                "Only gather more evidence if a specific filename, href, or title is still unknown."
+            )
+            return
+
+        self.context.queue_steering_message(
+            "Reuse the earlier observation instead of repeating it. "
+            "Choose a different next step that makes progress."
+        )
+
     async def _record_successful_execution(
         self,
         *,
@@ -279,6 +320,8 @@ class ToolBatchRunner:
             new_todos = outcome.registry_result.metadata.get("new_todos", [])
             if isinstance(new_todos, list):
                 sync_todos_to_definition_of_done(dod, new_todos)
+            else:
+                advance_todos_from_tool_call(dod, tool_call)
             self.dod_store.save(dod)
             recovery_context = self.context.recovery_context
             if recovery_context is not None:
```
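The nudge has a three-tier fallback: a transcript-inferred next step when one exists, otherwise a path-specific reminder, otherwise a generic redirect. A compressed sketch of that ordering, with `infer_next_step` standing in for the runtime's `infer_preferred_next_step` hook:

```python
from collections.abc import Callable

OBSERVATIONAL_TOOLS = {"read", "glob", "grep", "bash"}
PREFIX = "Reuse the earlier observation instead of repeating it. "

def build_nudge(
    tool_name: str,
    args: dict,
    infer_next_step: Callable[[], str | None],
) -> str | None:
    if tool_name not in OBSERVATIONAL_TOOLS:
        return None  # mutating tools never get the duplicate-observation nudge
    next_step = infer_next_step()
    if next_step:
        return PREFIX + next_step
    target = str(args.get("file_path") or args.get("path") or "").strip()
    if target:
        return PREFIX + f"Use the current contents of `{target}` and take a different next step."
    return PREFIX + "Choose a different next step that makes progress."
```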
`src/loader/runtime/workflow.py` (modified)

```diff
@@ -8,6 +8,7 @@ from datetime import UTC, datetime
 from pathlib import Path
 from typing import ClassVar
 
+from ..llm.base import ToolCall
 from .clarify_grounding import ClarifyGrounding
 from .dod import slugify
 from .workflow_policy import (
@@ -43,6 +44,7 @@ __all__ = [
     "WorkflowSignalPacket",
     "WorkflowTimelineEntry",
     "WorkflowTimelineEntryKind",
+    "advance_todos_from_tool_call",
     "build_execute_bridge",
     "enrich_clarify_brief_with_grounding",
     "extract_verification_commands_from_markdown",
@@ -67,6 +69,64 @@ _GENERIC_CONSTRAINTS = {
 _GENERIC_ASSUMPTIONS = {
     "Unspecified details stay unchanged unless evidence says otherwise.",
 }
+_SPECIAL_TODO_ITEMS = {
+    "Complete the requested work",
+    "Collect verification evidence",
+}
+_READ_STEP_HINTS = (
+    "read",
+    "examine",
+    "inspect",
+    "review",
+    "check",
+    "look at",
+    "look through",
+    "open",
+    "understand",
+    "study",
+)
+_SEARCH_STEP_HINTS = (
+    "list",
+    "find",
+    "search",
+    "scan",
+    "discover",
+    "locate",
+    "enumerate",
+    "gather",
+)
+_PARSE_STEP_HINTS = (
+    "parse",
+    "extract",
+    "identify",
+    "map",
+    "determine",
+)
+_MUTATION_STEP_HINTS = (
+    "update",
+    "edit",
+    "write",
+    "fix",
+    "modify",
+    "change",
+    "patch",
+    "replace",
+    "correct",
+    "rewrite",
+)
+_VERIFY_STEP_HINTS = (
+    "verify",
+    "validation",
+    "validate",
+    "test",
+    "confirm",
+    "check",
+)
+_SHELL_COMMAND_START = re.compile(
+    r"(?<![\w/.-])("
+    r"ls|grep|pytest|uv|python3?|html5validator|cargo|npm|node|mypy|ruff|find|git|cat|sed|head|tail"
+    r")\b"
+)
 
 _SECTION_ALIASES = {
     "task statement": "task_statement",
@@ -486,6 +546,116 @@ def sync_todos_to_definition_of_done(
     dod.completed_items = list(dict.fromkeys(completed + special_completed))
 
 
+def advance_todos_from_tool_call(dod, tool_call: ToolCall) -> bool:
+    """Advance the best-matching pending todo from a successful tool call."""
+
+    best_index: int | None = None
+    best_score = 0
+
+    for index, item in enumerate(dod.pending_items):
+        label = item.strip()
+        if not label or label in _SPECIAL_TODO_ITEMS:
+            continue
+        score = _todo_progress_score(label, tool_call)
+        if score > best_score:
+            best_index = index
+            best_score = score
+
+    if best_index is None or best_score <= 0:
+        return False
+
+    completed = dod.pending_items.pop(best_index)
+    if completed not in dod.completed_items:
+        dod.completed_items.append(completed)
+    return True
+
+
+def _todo_progress_score(item: str, tool_call: ToolCall) -> int:
+    text = item.lower()
+    name = tool_call.name
+    file_path = str(tool_call.arguments.get("file_path", "")).strip().lower()
+    path = str(tool_call.arguments.get("path", "")).strip().lower()
+    pattern = str(tool_call.arguments.get("pattern", "")).strip().lower()
+    command = str(tool_call.arguments.get("command", "")).strip().lower()
+    combined = " ".join(part for part in (file_path, path, pattern, command) if part)
+
+    path_hint = file_path or path
+    basename = Path(path_hint).name.lower() if path_hint else ""
+    parent = Path(path_hint).parent.name.lower() if path_hint else ""
+
+    score = 0
+    if basename and basename in text:
+        score += 3
+    if parent and parent not in {"", "."} and parent in text:
+        score += 2
+    if "index" in text and "index" in combined:
+        score += 2
+    if "chapter" in text and ("chapter" in basename or "chapters" in combined):
+        score += 1
+    if "html" in text and ".html" in combined:
+        score += 1
+
+    if name == "read":
+        if _contains_any(text, _READ_STEP_HINTS):
+            score += 2
+        if _contains_any(text, _PARSE_STEP_HINTS) and ".html" in combined:
+            score += 1
+    elif name in {"glob", "grep"}:
+        if _contains_any(text, _SEARCH_STEP_HINTS):
+            score += 2
+        if name == "glob" and _contains_any(text, _READ_STEP_HINTS) and ".html" in combined:
+            score += 1
+    elif name == "bash":
+        if _looks_like_verification_command(command):
+            if _contains_any(text, _VERIFY_STEP_HINTS):
+                score += 3
+        elif _looks_like_search_command(command):
+            if _contains_any(text, _SEARCH_STEP_HINTS):
+                score += 2
+        elif _looks_like_read_command(command):
+            if _contains_any(text, _READ_STEP_HINTS):
+                score += 2
+    elif name in {"write", "edit", "patch"}:
+        if _contains_any(text, _MUTATION_STEP_HINTS):
+            score += 3
+
+    if name in {"write", "edit", "patch"} and _contains_any(text, _VERIFY_STEP_HINTS):
+        return 0
+    return score
+
+
+def _contains_any(text: str, candidates: tuple[str, ...]) -> bool:
+    return any(candidate in text for candidate in candidates)
+
+
+def _looks_like_search_command(command: str) -> bool:
+    return any(token in command for token in (" ls", "ls ", "find ", "rg ", "grep ", "glob "))
+
+
+def _looks_like_read_command(command: str) -> bool:
+    return any(token in command for token in ("cat ", "sed ", "head ", "tail "))
+
+
+def _looks_like_verification_command(command: str) -> bool:
+    return any(
+        token in command
+        for token in (
+            "pytest",
+            "unittest",
+            " test",
+            " check",
+            " verify",
+            "html5validator",
+            "mypy",
+            "ruff",
+            "lint",
+            "grep ",
+            "diff ",
+            "cmp ",
+        )
+    )
+
+
 def extract_verification_commands_from_markdown(markdown: str) -> list[str]:
     """Extract verification commands from a verification-plan markdown document."""
 
@@ -686,11 +856,61 @@ def _mark_explicit_section(brief: ClarifyBrief, section: str) -> None:
 def _extract_commands(items: list[str]) -> list[str]:
     commands: list[str] = []
     for item in items:
-        match = re.match(r"^`(.+)`$", item)
-        commands.append((match.group(1) if match else item).strip())
+        text = item.strip()
+        if not text:
+            continue
+
+        # Code fences often contain shell comments plus the actual command lines.
+        if "```" in text:
+            text = text.replace("```bash", "```").replace("```sh", "```")
+            if "\n" not in text:
+                commands.extend(_extract_collapsed_shell_commands(text))
+                continue
+
+        lines = text.splitlines() if "\n" in text or "```" in text else [text]
+        for line in lines:
+            candidate = line.strip()
+            if not candidate or candidate.startswith("```"):
+                continue
+            candidate = re.sub(r"^-\s+", "", candidate)
+            match = re.match(r"^`(.+)`$", candidate)
+            candidate = (match.group(1) if match else candidate).strip()
+            if candidate.startswith("#"):
+                candidate = _extract_shell_command_from_text(candidate)
+                if not candidate:
+                    continue
+            if candidate:
+                commands.append(candidate)
     return [command for command in commands if command]
 
 
+def _extract_collapsed_shell_commands(text: str) -> list[str]:
+    stripped = re.sub(r"```(?:\w+)?", "", text).strip()
+    if not stripped:
+        return []
+
+    matches = list(_SHELL_COMMAND_START.finditer(stripped))
+    if not matches:
+        extracted = _extract_shell_command_from_text(stripped)
+        return [extracted] if extracted else []
+
+    commands: list[str] = []
+    for index, match in enumerate(matches):
+        start = match.start()
+        end = matches[index + 1].start() if index + 1 < len(matches) else len(stripped)
+        candidate = stripped[start:end].strip()
+        if candidate:
+            commands.append(candidate)
+    return commands
+
+
+def _extract_shell_command_from_text(text: str) -> str:
+    match = _SHELL_COMMAND_START.search(text)
+    if match is None:
+        return ""
+    return text[match.start():].strip()
+
+
 def _has_concrete_anchor(task: str) -> bool:
     return any(
         re.search(pattern, task)
```
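The subtle part of the new `_extract_commands` path is the collapsed-fence case: when a Markdown code fence arrives flattened onto a single line, `_SHELL_COMMAND_START` locates each command's leading token and the text is sliced between consecutive matches. A runnable approximation using the same regex:

````python
import re

SHELL_START = re.compile(
    r"(?<![\w/.-])("
    r"ls|grep|pytest|uv|python3?|html5validator|cargo|npm|node|mypy|ruff|find|git|cat|sed|head|tail"
    r")\b"
)

def split_collapsed_commands(text: str) -> list[str]:
    stripped = re.sub(r"```(?:\w+)?", "", text).strip()  # drop flattened fences
    matches = list(SHELL_START.finditer(stripped))
    commands = []
    for index, match in enumerate(matches):
        end = matches[index + 1].start() if index + 1 < len(matches) else len(stripped)
        commands.append(stripped[match.start():end].strip())
    return [command for command in commands if command]

print(split_collapsed_commands("```bash ls chapters/ grep -n href index.html ```"))
# ['ls chapters/', 'grep -n href index.html']
````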
`src/loader/tools/file_tools.py` (modified)

```diff
@@ -13,6 +13,7 @@ from .fs_safety import (
     ensure_safe_to_read,
     ensure_safe_to_write,
     make_structured_patch,
+    parse_unified_diff_patch,
     resolve_workspace_path,
 )
 
@@ -447,7 +448,8 @@ class PatchTool(Tool):
     def description(self) -> str:
         return (
             "Apply structured patch hunks to a file. Prefer this for larger "
-            "or multi-line edits where exact old/new string replacement is brittle."
+            "or multi-line edits where exact old/new string replacement is brittle. "
+            "A raw unified diff string is also accepted via `patch`."
         )
 
     @property
@@ -483,8 +485,15 @@ class PatchTool(Tool):
                     ],
                 },
             },
+            "patch": {
+                "type": "string",
+                "description": (
+                    "Optional unified diff patch string. Loader will parse this "
+                    "into structured hunks when possible."
+                ),
+            },
         },
-        "required": ["file_path", "hunks"],
+        "required": ["file_path"],
     }
 
     @property
@@ -505,7 +514,8 @@ class PatchTool(Tool):
     async def execute(
         self,
         file_path: str,
-        hunks: list[dict[str, Any]],
+        hunks: list[dict[str, Any]] | None = None,
+        patch: str | None = None,
         **kwargs: Any,
     ) -> ToolResult:
         kwargs.pop("_skip_confirmation", None)
@@ -544,13 +554,20 @@ class PatchTool(Tool):
         ensure_safe_to_read(path)
         original_content = await asyncio.to_thread(path.read_text)
         original_lines = original_content.splitlines()
-        parsed_hunks = [
-            StructuredPatchHunk.from_dict_with_original(
-                hunk,
-                original_lines=original_lines,
-            )
-            for hunk in hunks
-        ]
+        raw_patch = patch or kwargs.get("diff") or kwargs.get("patch_text")
+        parsed_hunks: list[StructuredPatchHunk]
+        if hunks:
+            parsed_hunks = [
+                StructuredPatchHunk.from_dict_with_original(
+                    hunk,
+                    original_lines=original_lines,
+                )
+                for hunk in hunks
+            ]
+        elif isinstance(raw_patch, str) and raw_patch.strip():
+            parsed_hunks = parse_unified_diff_patch(raw_patch)
+        else:
+            parsed_hunks = []
         if not parsed_hunks:
             raise ValueError("hunks must not be empty")
         updated_content = apply_structured_patch(original_content, parsed_hunks)
```
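With the loosened schema, a model can now call the tool with just `file_path` plus a `patch` string; the alternate `diff`/`patch_text` keys are also scooped from `**kwargs`. A hedged example of the raw-diff argument shape, mirroring the new tests further down:

```python
raw_call_arguments = {
    "file_path": "sample.txt",
    # Unified diff instead of structured hunks; parsed via parse_unified_diff_patch.
    "patch": (
        "--- a/sample.txt\n"
        "+++ b/sample.txt\n"
        "@@ -2,1 +2,1 @@\n"
        "-beta\n"
        "+beta updated\n"
    ),
}
```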
`src/loader/tools/fs_safety.py` (modified)

```diff
@@ -4,6 +4,7 @@ from __future__ import annotations
 
 from dataclasses import asdict, dataclass
 from pathlib import Path
+import re
 
 MAX_READ_SIZE = 10 * 1024 * 1024
 MAX_WRITE_SIZE = 10 * 1024 * 1024
@@ -233,3 +234,64 @@ def _expect_patch_line(
         "structured patch context mismatch: "
         f"expected {expected!r}, found {actual!r}"
     )
+
+
+_UNIFIED_DIFF_HUNK_RE = re.compile(
+    r"^@@ -(?P<old_start>\d+)(?:,(?P<old_lines>\d+))? "
+    r"\+(?P<new_start>\d+)(?:,(?P<new_lines>\d+))? @@"
+)
+
+
+def parse_unified_diff_patch(patch_text: str) -> list[StructuredPatchHunk]:
+    """Parse a unified diff string into structured patch hunks."""
+
+    if not str(patch_text).strip():
+        raise ValueError("patch text is empty")
+
+    hunks: list[StructuredPatchHunk] = []
+    current_hunk: StructuredPatchHunk | None = None
+
+    for raw_line in str(patch_text).splitlines():
+        if raw_line.startswith(("--- ", "+++ ")):
+            continue
+        if raw_line.startswith("@@"):
+            match = _UNIFIED_DIFF_HUNK_RE.match(raw_line)
+            if match is None:
+                raise ValueError(
+                    "patch text contains an invalid unified-diff hunk header"
+                )
+            if current_hunk is not None:
+                hunks.append(current_hunk)
+            current_hunk = StructuredPatchHunk(
+                old_start=int(match.group("old_start")),
+                old_lines=int(match.group("old_lines") or 1),
+                new_start=int(match.group("new_start")),
+                new_lines=int(match.group("new_lines") or 1),
+                lines=[],
+            )
+            continue
+
+        if raw_line == r"\ No newline at end of file":
+            continue
+
+        if current_hunk is None:
+            if not raw_line.strip():
+                continue
+            raise ValueError(
+                "patch text must include at least one unified-diff hunk header"
+            )
+
+        prefix = raw_line[:1]
+        if prefix not in {" ", "+", "-"}:
+            raise ValueError(
+                "patch text contains a diff line without a valid prefix"
+            )
+        current_hunk.lines.append(raw_line)
+
+    if current_hunk is not None:
+        hunks.append(current_hunk)
+
+    if not hunks:
+        raise ValueError("patch text must include at least one unified-diff hunk")
+
+    return hunks
```
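One convention worth noting in the parser: omitted lengths in a hunk header default to 1, matching unified-diff shorthand (`@@ -2 +2 @@` means one old line and one new line). A quick check against the same regex:

```python
import re

HUNK_RE = re.compile(
    r"^@@ -(?P<old_start>\d+)(?:,(?P<old_lines>\d+))? "
    r"\+(?P<new_start>\d+)(?:,(?P<new_lines>\d+))? @@"
)

match = HUNK_RE.match("@@ -2 +2 @@")
assert match is not None
assert int(match.group("old_lines") or 1) == 1  # omitted count falls back to 1
assert int(match.group("new_start")) == 2
```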
`src/loader/utils/file_mutations.py` (modified)

```diff
@@ -11,7 +11,11 @@ from rich.console import Group
 from rich.panel import Panel
 from rich.text import Text
 
-from ..tools.fs_safety import StructuredPatchHunk, make_structured_patch
+from ..tools.fs_safety import (
+    StructuredPatchHunk,
+    make_structured_patch,
+    parse_unified_diff_patch,
+)
 
 FILE_MUTATION_TOOLS = {"write", "edit", "patch"}
 DIFF_TRUNCATION_NOTICE = "truncated for display; full result preserved in session"
@@ -98,6 +102,8 @@ def build_file_mutation_preview(
     structured_patch = _coerce_patch_hunks(info.get("hunks")) or _coerce_patch_hunks(
         args.get("hunks")
     )
+    if not structured_patch and tool_name == "patch":
+        structured_patch = _coerce_raw_patch_hunks(info) or _coerce_raw_patch_hunks(args)
 
     old_text = _extract_old_text(tool_name, info) or _extract_old_text(tool_name, args)
     new_text = _extract_new_text(tool_name, info) or _extract_new_text(tool_name, args)
@@ -192,6 +198,18 @@ def _coerce_patch_hunks(value: Any) -> list[StructuredPatchHunk]:
     return hunks
 
 
+def _coerce_raw_patch_hunks(payload: dict[str, Any]) -> list[StructuredPatchHunk]:
+    for key in ("patch", "diff", "patch_text"):
+        value = payload.get(key)
+        if not isinstance(value, str) or not value.strip():
+            continue
+        try:
+            return parse_unified_diff_patch(value)
+        except ValueError:
+            continue
+    return []
+
+
 def _extract_file_path(payload: dict[str, Any]) -> str | None:
     for key in ("file_path", "filePath", "path", "filename", "file"):
         value = payload.get(key)
```
`tests/test_completion_policy.py` (modified)

```diff
@@ -7,7 +7,7 @@ from types import SimpleNamespace
 
 import pytest
 
-from loader.llm.base import Message, Role
+from loader.llm.base import Message, Role, ToolCall
 from loader.runtime.completion_policy import CompletionPolicy
 from loader.runtime.context import RuntimeContext
 from loader.runtime.dod import VerificationEvidence, create_definition_of_done
@@ -24,6 +24,7 @@ from loader.runtime.task_completion import (
     detect_premature_completion,
     get_continuation_prompt,
 )
+from loader.runtime.workflow import advance_todos_from_tool_call, sync_todos_to_definition_of_done
 from loader.runtime.verification_observations import (
     VerificationObservationStatus,
     verification_attempt_id,
@@ -338,6 +339,78 @@ def test_completion_assessment_attaches_typed_verification_provenance() -> None:
     assert assessment.evidence_provenance[0].summary == "verification failed for `pytest -q`"
 
 
+def test_completion_assessment_uses_advanced_todo_progress_for_next_step() -> None:
+    dod = create_definition_of_done("Fix the chapter links in index.html.")
+    sync_todos_to_definition_of_done(
+        dod,
+        [
+            {
+                "content": "First, examine the current index.html file to understand its structure",
+                "active_form": "Working on: First, examine the current index.html file to understand its structure",
+                "status": "pending",
+            },
+            {
+                "content": "List and read all HTML files in the chapters directory to extract chapter information",
+                "active_form": "Working on: List and read all HTML files in the chapters directory to extract chapter information",
+                "status": "pending",
+            },
+            {
+                "content": "Parse chapter titles from each HTML file",
+                "active_form": "Working on: Parse chapter titles from each HTML file",
+                "status": "pending",
+            },
+            {
+                "content": "Update index.html with correct chapter links and titles",
+                "active_form": "Working on: Update index.html with correct chapter links and titles",
+                "status": "pending",
+            },
+        ],
+    )
+    advance_todos_from_tool_call(
+        dod,
+        ToolCall(
+            id="read-index",
+            name="read",
+            arguments={"file_path": "/tmp/fortran/index.html"},
+        ),
+    )
+    advance_todos_from_tool_call(
+        dod,
+        ToolCall(
+            id="glob-chapters",
+            name="glob",
+            arguments={"path": "/tmp/fortran/chapters", "pattern": "*.html"},
+        ),
+    )
+    advance_todos_from_tool_call(
+        dod,
+        ToolCall(
+            id="read-chapter",
+            name="read",
+            arguments={"file_path": "/tmp/fortran/chapters/01-introduction.html"},
+        ),
+    )
+
+    assessment = assess_completion_follow_through_with_provenance(
+        task="Update /tmp/fortran/index.html so every chapter link is correct.",
+        response="I'll update the index.html file with the correct chapter links and titles.",
+        actions_taken=[
+            "read: {'file_path': '/tmp/fortran/index.html'}",
+            "glob: {'path': '/tmp/fortran/chapters', 'pattern': '*.html'}",
+            "read: {'file_path': '/tmp/fortran/chapters/01-introduction.html'}",
+        ],
+        dod=dod,
+    )
+
+    assert assessment.check.missing_evidence[0] == (
+        "completion of tracked work items "
+        "(Update index.html with correct chapter links and titles)"
+    )
+    assert assessment.check.suggested_next_steps[0] == (
+        "Complete the tracked item: Update index.html with correct chapter links and titles"
+    )
+
+
 @pytest.mark.asyncio
 async def test_completion_policy_stops_for_text_loop_using_runtime_context(
     temp_dir: Path,
```
`tests/test_expanded_tools.py` (modified)

```diff
@@ -87,6 +87,43 @@ async def test_patch_tool_accepts_replacement_block_hunks(temp_dir: Path) -> None:
     assert result.metadata["structured_patch"]
 
 
+@pytest.mark.asyncio
+async def test_patch_tool_accepts_unified_diff_string(temp_dir: Path) -> None:
+    target = temp_dir / "sample.txt"
+    target.write_text("alpha\nbeta\ngamma\n")
+    tool = PatchTool(workspace_root=temp_dir)
+
+    result = await tool.execute(
+        file_path=str(target),
+        patch=(
+            "--- a/sample.txt\n"
+            "+++ b/sample.txt\n"
+            "@@ -2,1 +2,1 @@\n"
+            "-beta\n"
+            "+beta updated\n"
+        ),
+    )
+
+    assert result.is_error is False
+    assert target.read_text() == "alpha\nbeta updated\ngamma\n"
+    assert result.metadata["structured_patch"]
+
+
+@pytest.mark.asyncio
+async def test_patch_tool_rejects_invalid_unified_diff_string(temp_dir: Path) -> None:
+    target = temp_dir / "sample.txt"
+    target.write_text("alpha\nbeta\ngamma\n")
+    tool = PatchTool(workspace_root=temp_dir)
+
+    result = await tool.execute(
+        file_path=str(target),
+        patch="--- a/sample.txt\n+++ b/sample.txt\n@@ ...\n",
+    )
+
+    assert result.is_error is True
+    assert "invalid unified-diff hunk header" in result.output
+
+
 @pytest.mark.asyncio
 async def test_git_tool_inspects_read_only_repo_state(temp_dir: Path) -> None:
     subprocess.run(["git", "init", "--quiet"], cwd=temp_dir, check=True)
```
`tests/test_finalization.py` (modified)

```diff
@@ -113,6 +113,22 @@ class FakeExecutor:
         return self._outcomes.pop(0)
 
 
+class RecordingExecutor:
+    def __init__(self) -> None:
+        self.commands: list[str] = []
+
+    async def execute_tool_call(self, tool_call: ToolCall, **_: object) -> ToolExecutionOutcome:
+        command = str(tool_call.arguments.get("command", ""))
+        self.commands.append(command)
+        return tool_outcome(
+            tool_call=tool_call,
+            output="ok",
+            is_error=False,
+            exit_code=0,
+            stdout="ok",
+        )
+
+
 def build_context(temp_dir: Path, session: FakeSession) -> RuntimeContext:
     registry = create_default_registry(temp_dir)
     registry.configure_workspace_root(temp_dir)
@@ -301,7 +317,6 @@ async def test_turn_finalizer_records_passed_verification_observation(
     )
     dod = create_definition_of_done("Update the runtime tests.")
     dod.mutating_actions.append("write")
-    dod.touched_files.append(str(temp_dir / "tests" / "test_runtime.py"))
     dod.verification_commands = ["uv run pytest -q"]
     summary = TurnSummary(final_response="")
     tool_call = ToolCall(
@@ -360,6 +375,59 @@ async def test_turn_finalizer_records_passed_verification_observation(
     assert [item.status for item in session.workflow_timeline[-1].verification_observations] == [
         VerificationObservationStatus.PASSED.value
     ]
+
+
+@pytest.mark.asyncio
+async def test_turn_finalizer_appends_runtime_semantic_verifier_to_planned_commands(
+    temp_dir: Path,
+) -> None:
+    chapters = temp_dir / "chapters"
+    chapters.mkdir()
+    (chapters / "01-introduction.html").write_text(
+        "<h1>Chapter 1: Introduction to Fortran</h1>\n"
+    )
+    index = temp_dir / "index.html"
+    index.write_text(
+        "\n".join(
+            [
+                '<ul class="chapter-list">',
+                ' <li><a href="chapters/01-introduction.html">Chapter 1: Introduction to Fortran</a></li>',
+                "</ul>",
+            ]
+        )
+    )
+
+    session = FakeSession()
+    context = build_context(temp_dir, session)
+    finalizer = TurnFinalizer(
+        context,
+        RuntimeTracer(),
+        DefinitionOfDoneStore(temp_dir),
+        set_workflow_mode=_noop_set_workflow_mode,
+    )
+    dod = create_definition_of_done(
+        "Update index.html so the table of contents links and chapter titles are correct."
+    )
+    dod.mutating_actions.append("edit")
+    dod.touched_files.append(str(index))
+    dod.verification_commands = ['grep -n "href=" index.html']
+    summary = TurnSummary(final_response="")
+    executor = RecordingExecutor()
+
+    async def capture(event) -> None:
+        return None
+
+    result = await finalizer.run_definition_of_done_gate(
+        dod=dod,
+        candidate_response="Updated the index.html links.",
+        emit=capture,
+        summary=summary,
+        executor=executor,  # type: ignore[arg-type]
+    )
+
+    assert result.should_continue is False
+    assert any(command == 'grep -n "href=" index.html' for command in executor.commands)
+    assert any(command.startswith("/usr/bin/python3 - <<'PY'") for command in executor.commands)
     assert (
         session.workflow_timeline[-1].verification_observations[0].attempt_id
         == "verification-attempt-1"
```
tests/test_runtime_harness.pymodified@@ -901,6 +901,46 @@ async def test_raw_json_patch_tool_call_fallback(temp_dir: Path) -> None: | |||
| 901 | assert "Patched sample.txt." in run.response | 901 | assert "Patched sample.txt." in run.response |
| 902 | 902 | ||
| 903 | 903 | ||
| 904 | +@pytest.mark.asyncio | ||
| 905 | +async def test_native_patch_tool_accepts_unified_diff_string(temp_dir: Path) -> None: | ||
| 906 | + target = temp_dir / "sample.txt" | ||
| 907 | + target.write_text("alpha\nbeta\ngamma\n") | ||
| 908 | + | ||
| 909 | + backend = ScriptedBackend( | ||
| 910 | + completions=[ | ||
| 911 | + native_tool_response( | ||
| 912 | + ToolCall( | ||
| 913 | + id="patch-1", | ||
| 914 | + name="patch", | ||
| 915 | + arguments={ | ||
| 916 | + "file_path": str(target), | ||
| 917 | + "patch": ( | ||
| 918 | + "--- a/sample.txt\n" | ||
| 919 | + "+++ b/sample.txt\n" | ||
| 920 | + "@@ -2,1 +2,1 @@\n" | ||
| 921 | + "-beta\n" | ||
| 922 | + "+beta updated\n" | ||
| 923 | + ), | ||
| 924 | + }, | ||
| 925 | + ), | ||
| 926 | + content="I'll patch the file directly.", | ||
| 927 | + ), | ||
| 928 | + final_response("Patched sample.txt."), | ||
| 929 | + ] | ||
| 930 | + ) | ||
| 931 | + | ||
| 932 | + run = await run_scenario( | ||
| 933 | + "Update sample.txt.", | ||
| 934 | + backend, | ||
| 935 | + config=non_streaming_config(), | ||
| 936 | + project_root=temp_dir, | ||
| 937 | + ) | ||
| 938 | + | ||
| 939 | + assert tool_event_names(run) == ["patch"] | ||
| 940 | + assert target.read_text() == "alpha\nbeta updated\ngamma\n" | ||
| 941 | + assert "Patched sample.txt." in run.response | ||
| 942 | + | ||
| 943 | + | ||
| 904 | @pytest.mark.asyncio | 944 | @pytest.mark.asyncio |
| 905 | async def test_raw_json_ask_user_question_tool_call_fallback(temp_dir: Path) -> None: | 945 | async def test_raw_json_ask_user_question_tool_call_fallback(temp_dir: Path) -> None: |
| 906 | raw_json = json.dumps( | 946 | raw_json = json.dumps( |
@@ -1766,6 +1806,66 @@ async def test_duplicate_read_is_skipped_without_intervening_mutation( | |||
| 1766 | assert "existing file contents" in run.response | 1806 | assert "existing file contents" in run.response |
| 1767 | 1807 | ||
| 1768 | 1808 | ||
| 1809 | +@pytest.mark.asyncio | ||
| 1810 | +async def test_duplicate_observation_queues_steering_to_reuse_prior_evidence( | ||
| 1811 | + temp_dir: Path, | ||
| 1812 | +) -> None: | ||
| 1813 | + chapters = temp_dir / "chapters" | ||
| 1814 | + chapters.mkdir() | ||
| 1815 | + (chapters / "01-introduction.html").write_text("<h1>Chapter 1: Introduction to Fortran</h1>\n") | ||
| 1816 | + (chapters / "02-setup.html").write_text("<h1>Chapter 2: Setting Up Fortran</h1>\n") | ||
| 1817 | + index_file = temp_dir / "index.html" | ||
| 1818 | + index_file.write_text("broken table of contents\n") | ||
| 1819 | + | ||
| 1820 | + backend = ScriptedBackend( | ||
| 1821 | + completions=[ | ||
| 1822 | + native_tool_response( | ||
| 1823 | + ToolCall( | ||
| 1824 | + id="glob-1", | ||
| 1825 | + name="glob", | ||
| 1826 | + arguments={"path": str(chapters), "pattern": "*.html"}, | ||
| 1827 | + ), | ||
| 1828 | + content="I'll inspect the chapter inventory first.", | ||
| 1829 | + ), | ||
| 1830 | + native_tool_response( | ||
| 1831 | + ToolCall( | ||
| 1832 | + id="read-1", | ||
| 1833 | + name="read", | ||
| 1834 | + arguments={"file_path": str(index_file)}, | ||
| 1835 | + ), | ||
| 1836 | + content="I'll inspect the index next.", | ||
| 1837 | + ), | ||
| 1838 | + native_tool_response( | ||
| 1839 | + ToolCall( | ||
| 1840 | + id="read-2", | ||
| 1841 | + name="read", | ||
| 1842 | + arguments={"file_path": str(index_file)}, | ||
| 1843 | + ), | ||
| 1844 | + content="I'll reopen the index.", | ||
| 1845 | + ), | ||
| 1846 | + final_response("I'll reuse the earlier evidence and patch the index next."), | ||
| 1847 | + ] | ||
| 1848 | + ) | ||
| 1849 | + | ||
| 1850 | + run = await run_scenario( | ||
| 1851 | + "Update index.html so the table of contents links are correct.", | ||
| 1852 | + backend, | ||
| 1853 | + config=non_streaming_config(), | ||
| 1854 | + project_root=temp_dir, | ||
| 1855 | + ) | ||
| 1856 | + | ||
| 1857 | + messages = tool_result_messages(run) | ||
| 1858 | + steering_messages = [ | ||
| 1859 | + event.content | ||
| 1860 | + for event in run.events | ||
| 1861 | + if event.type == "steering" and event.content | ||
| 1862 | + ] | ||
| 1863 | + | ||
| 1864 | + assert any("reuse the earlier read result instead of rereading" in message for message in messages) | ||
| 1865 | + assert any("Reuse the earlier observation instead of repeating it." in message for message in steering_messages) | ||
| 1866 | + assert any("index.html" in message for message in steering_messages) | ||
| 1867 | + | ||
| 1868 | + | ||
| 1769 | @pytest.mark.asyncio | 1869 | @pytest.mark.asyncio |
| 1770 | async def test_interleaved_reread_is_allowed_once_without_intervening_mutation( | 1870 | async def test_interleaved_reread_is_allowed_once_without_intervening_mutation( |
| 1771 | temp_dir: Path, | 1871 | temp_dir: Path, |
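The duplicate-observation scenario expects the second identical `read` to be skipped with a reuse note in the tool result plus a steering nudge naming the file. The minimal bookkeeping that implies is a set of seen paths that resets whenever a mutation lands; a sketch, with all names hypothetical:

```python
# Hypothetical dedup state: paths read since the last successful mutation.
seen_reads: set[str] = set()


def classify_read(path: str) -> tuple[bool, str | None]:
    """Return (should_execute, steering_nudge_or_None)."""
    if path in seen_reads:
        nudge = (
            "Reuse the earlier observation instead of repeating it. "
            f"You already read {path} with no intervening changes."
        )
        return False, nudge  # skip the tool call, queue the nudge
    seen_reads.add(path)
    return True, None


def on_successful_mutation() -> None:
    seen_reads.clear()  # contents may have changed; rereads are useful again
```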
tests/test_safeguard_services.py (modified)
@@ -158,11 +158,25 @@ def test_pre_action_validator_blocks_patch_without_hunks() -> None: | |||
| 158 | assert result == ValidationResult( | 158 | assert result == ValidationResult( |
| 159 | valid=False, | 159 | valid=False, |
| 160 | reason="Patch hunks are missing", | 160 | reason="Patch hunks are missing", |
| 161 | - suggestion="Provide one or more structured patch hunks", | 161 | + suggestion="Provide structured patch hunks or a unified diff patch string", |
| 162 | severity="error", | 162 | severity="error", |
| 163 | ) | 163 | ) |
| 164 | 164 | ||
| 165 | 165 | ||
| 166 | +def test_pre_action_validator_allows_patch_string_without_hunks() -> None: | ||
| 167 | + validator = PreActionValidator() | ||
| 168 | + | ||
| 169 | + result = validator.validate( | ||
| 170 | + "patch", | ||
| 171 | + { | ||
| 172 | + "file_path": "notes.txt", | ||
| 173 | + "patch": "--- a/notes.txt\n+++ b/notes.txt\n@@ -1,1 +1,1 @@\n-old\n+new\n", | ||
| 174 | + }, | ||
| 175 | + ) | ||
| 176 | + | ||
| 177 | + assert result == ValidationResult(valid=True) | ||
| 178 | + | ||
| 179 | + | ||
| 166 | def test_runtime_safeguards_wrap_runtime_owned_services() -> None: | 180 | def test_runtime_safeguards_wrap_runtime_owned_services() -> None: |
| 167 | safeguards = RuntimeSafeguards() | 181 | safeguards = RuntimeSafeguards() |
| 168 | 182 | ||
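Taken together, the two validator tests specify the rule: reject a `patch` call that carries neither structured hunks nor a patch string, and accept either form. A hypothetical mirror of that rule (the real `PreActionValidator` lives in safeguard_services and may check more):

```python
# Hypothetical mirror of the rule these two tests pin down.
from dataclasses import dataclass


@dataclass
class ValidationResult:
    valid: bool
    reason: str | None = None
    suggestion: str | None = None
    severity: str | None = None


def validate_patch(args: dict) -> ValidationResult:
    patch = args.get("patch")
    if args.get("hunks") or (isinstance(patch, str) and patch.strip()):
        return ValidationResult(valid=True)
    return ValidationResult(
        valid=False,
        reason="Patch hunks are missing",
        suggestion="Provide structured patch hunks or a unified diff patch string",
        severity="error",
    )
```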
tests/test_tool_batch_policies.py (modified)
@@ -349,12 +349,74 @@ async def test_tool_batch_recovery_controller_includes_known_state_for_missing_f | |||
| 349 | 349 | ||
| 350 | assert follow_up is not None | 350 | assert follow_up is not None |
| 351 | assert "## CONTINUE FROM KNOWN STATE" in follow_up.content | 351 | assert "## CONTINUE FROM KNOWN STATE" in follow_up.content |
| 352 | + assert "apply the fix using confirmed findings" in follow_up.content | ||
| 353 | + assert "## ACTION BIAS FOR THIS RECOVERY" in follow_up.content | ||
| 354 | + assert "Prefer edit/write/patch on the target file" in follow_up.content | ||
| 352 | assert "04-variables.html" in follow_up.content | 355 | assert "04-variables.html" in follow_up.content |
| 353 | assert "02-basic-syntax.html -> 02-setup.html" in follow_up.content | 356 | assert "02-basic-syntax.html -> 02-setup.html" in follow_up.content |
| 354 | assert "`~/Loader/guides/fortran/index.html`" in follow_up.content | 357 | assert "`~/Loader/guides/fortran/index.html`" in follow_up.content |
| 355 | assert any(event.type == "recovery" for event in events) | 358 | assert any(event.type == "recovery" for event in events) |
| 356 | 359 | ||
| 357 | 360 | ||
| 361 | +@pytest.mark.asyncio | ||
| 362 | +async def test_tool_batch_recovery_controller_suggests_known_sibling_files( | ||
| 363 | + temp_dir: Path, | ||
| 364 | +) -> None: | ||
| 365 | + async def assess_confidence(tool_name: str, tool_args: dict, context: str) -> ConfidenceAssessment: | ||
| 366 | + raise AssertionError("Confidence should not run here") | ||
| 367 | + | ||
| 368 | + async def verify_action(tool_name: str, tool_args: dict, result: str, expected: str = "") -> ActionVerification: | ||
| 369 | + raise AssertionError("Verification should not run here") | ||
| 370 | + | ||
| 371 | + messages = [ | ||
| 372 | + Message( | ||
| 373 | + role=Role.TOOL, | ||
| 374 | + content=( | ||
| 375 | + "Observation [glob]: Result: " | ||
| 376 | + "/private/tmp/fortran-qwen-recovery-check/chapters/01-introduction.html\n" | ||
| 377 | + "/private/tmp/fortran-qwen-recovery-check/chapters/02-setup.html\n" | ||
| 378 | + "/private/tmp/fortran-qwen-recovery-check/chapters/03-basics.html\n" | ||
| 379 | + "/private/tmp/fortran-qwen-recovery-check/chapters/04-variables.html\n" | ||
| 380 | + "/private/tmp/fortran-qwen-recovery-check/chapters/05-input-output.html" | ||
| 381 | + ), | ||
| 382 | + tool_results=[], | ||
| 383 | + ), | ||
| 384 | + ] | ||
| 385 | + context = build_context( | ||
| 386 | + temp_dir=temp_dir, | ||
| 387 | + messages=messages, | ||
| 388 | + assess_confidence=assess_confidence, | ||
| 389 | + verify_action=verify_action, | ||
| 390 | + ) | ||
| 391 | + controller = ToolBatchRecoveryController(context) | ||
| 392 | + tool_call = ToolCall( | ||
| 393 | + id="read-missing", | ||
| 394 | + name="read", | ||
| 395 | + arguments={"file_path": "/tmp/fortran-qwen-recovery-check/chapters/04-data-types.html"}, | ||
| 396 | + ) | ||
| 397 | + outcome = tool_outcome( | ||
| 398 | + tool_call=tool_call, | ||
| 399 | + output="File not found: /tmp/fortran-qwen-recovery-check/chapters/04-data-types.html", | ||
| 400 | + is_error=True, | ||
| 401 | + ) | ||
| 402 | + | ||
| 403 | + events: list[AgentEvent] = [] | ||
| 404 | + | ||
| 405 | + async def emit(event: AgentEvent) -> None: | ||
| 406 | + events.append(event) | ||
| 407 | + | ||
| 408 | + follow_up = await controller.build_follow_up( | ||
| 409 | + tool_call=tool_call, | ||
| 410 | + outcome=outcome, | ||
| 411 | + emit=emit, | ||
| 412 | + ) | ||
| 413 | + | ||
| 414 | + assert follow_up is not None | ||
| 415 | + assert "## LIKELY FILE CANDIDATES" in follow_up.content | ||
| 416 | + assert "`04-variables.html`" in follow_up.content | ||
| 417 | + assert "instead of retrying the missing path" in follow_up.content | ||
| 418 | + | ||
| 419 | + | ||
| 358 | @pytest.mark.asyncio | 420 | @pytest.mark.asyncio |
| 359 | async def test_tool_batch_recovery_controller_reuses_context_for_related_missing_files( | 421 | async def test_tool_batch_recovery_controller_reuses_context_for_related_missing_files( |
| 360 | temp_dir: Path, | 422 | temp_dir: Path, |
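The sibling-candidate test seeds an earlier `glob` observation and expects the recovery follow-up to propose `04-variables.html` for the missing `04-data-types.html`. One plausible ranking heuristic is fuzzy filename matching; a sketch with `difflib` (the controller's actual heuristic is not shown in this diff):

```python
# Sketch: rank known sibling filenames against the missing one with difflib.
import difflib
from pathlib import PurePosixPath


def likely_candidates(missing_path: str, known_paths: list[str], limit: int = 3) -> list[str]:
    target = PurePosixPath(missing_path).name  # e.g. "04-data-types.html"
    names = [PurePosixPath(path).name for path in known_paths]
    return difflib.get_close_matches(target, names, n=limit, cutoff=0.4)
```

On the fixture above the shared `04-` prefix and `.html` suffix should rank `04-variables.html` at the top, which matches the candidate the assertion expects.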
tests/test_tool_batches.py (modified)
@@ -540,6 +540,108 @@ async def test_tool_batch_runner_clears_recovery_context_after_successful_mutati | |||
| 540 | assert context.recovery_context is None | 540 | assert context.recovery_context is None |
| 541 | 541 | ||
| 542 | 542 | ||
| 543 | +@pytest.mark.asyncio | ||
| 544 | +async def test_tool_batch_runner_queues_duplicate_observation_nudge( | ||
| 545 | + temp_dir: Path, | ||
| 546 | +) -> None: | ||
| 547 | + async def assess_confidence( | ||
| 548 | + tool_name: str, | ||
| 549 | + tool_args: dict, | ||
| 550 | + context: str, | ||
| 551 | + ) -> ConfidenceAssessment: | ||
| 552 | + raise AssertionError("Confidence scoring should be disabled in this scenario") | ||
| 553 | + | ||
| 554 | + async def verify_action( | ||
| 555 | + tool_name: str, | ||
| 556 | + tool_args: dict, | ||
| 557 | + result: str, | ||
| 558 | + expected: str = "", | ||
| 559 | + ) -> ActionVerification: | ||
| 560 | + raise AssertionError("Verification should not run for this scenario") | ||
| 561 | + | ||
| 562 | + messages = [ | ||
| 563 | + Message( | ||
| 564 | + role=Role.TOOL, | ||
| 565 | + content=( | ||
| 566 | + "Observation [glob]: Result: " | ||
| 567 | + f"{temp_dir}/chapters/01-introduction.html\n" | ||
| 568 | + f"{temp_dir}/chapters/02-setup.html\n" | ||
| 569 | + f"{temp_dir}/chapters/03-basics.html" | ||
| 570 | + ), | ||
| 571 | + tool_results=[], | ||
| 572 | + ), | ||
| 573 | + Message( | ||
| 574 | + role=Role.ASSISTANT, | ||
| 575 | + content="I should update the index now.", | ||
| 576 | + tool_calls=[ | ||
| 577 | + ToolCall( | ||
| 578 | + id="read-index", | ||
| 579 | + name="read", | ||
| 580 | + arguments={"file_path": str(temp_dir / 'index.html')}, | ||
| 581 | + ) | ||
| 582 | + ], | ||
| 583 | + ), | ||
| 584 | + ] | ||
| 585 | + context = build_context( | ||
| 586 | + temp_dir=temp_dir, | ||
| 587 | + messages=messages, | ||
| 588 | + safeguards=FakeSafeguards(), | ||
| 589 | + assess_confidence=assess_confidence, | ||
| 590 | + verify_action=verify_action, | ||
| 591 | + auto_recover=False, | ||
| 592 | + ) | ||
| 593 | + context.session.current_task = ( | ||
| 594 | + f"Update {temp_dir / 'index.html'} with the right chapter links." | ||
| 595 | + ) | ||
| 596 | + queued_messages: list[str] = [] | ||
| 597 | + context.queue_steering_message_callback = queued_messages.append | ||
| 598 | + runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir)) | ||
| 599 | + tool_call = ToolCall( | ||
| 600 | + id="read-dup", | ||
| 601 | + name="read", | ||
| 602 | + arguments={"file_path": str(temp_dir / "index.html")}, | ||
| 603 | + ) | ||
| 604 | + duplicate_message = ( | ||
| 605 | + "[Skipped - duplicate action: Already read " | ||
| 606 | + f"{temp_dir / 'index.html'} recently without any intervening changes; " | ||
| 607 | + "reuse the earlier read result instead of rereading]" | ||
| 608 | + ) | ||
| 609 | + executor = FakeExecutor( | ||
| 610 | + [ | ||
| 611 | + ToolExecutionOutcome( | ||
| 612 | + tool_call=tool_call, | ||
| 613 | + state=ToolExecutionState.DUPLICATE, | ||
| 614 | + message=Message.tool_result_message( | ||
| 615 | + tool_call_id=tool_call.id, | ||
| 616 | + display_content=duplicate_message, | ||
| 617 | + result_content=duplicate_message, | ||
| 618 | + ), | ||
| 619 | + event_content=duplicate_message, | ||
| 620 | + is_error=False, | ||
| 621 | + result_output=duplicate_message, | ||
| 622 | + ) | ||
| 623 | + ] | ||
| 624 | + ) | ||
| 625 | + | ||
| 626 | + await runner.execute_batch( | ||
| 627 | + tool_calls=[tool_call], | ||
| 628 | + tool_source="assistant", | ||
| 629 | + pending_tool_calls_seen=set(), | ||
| 630 | + emit=_noop_emit, | ||
| 631 | + summary=TurnSummary(final_response=""), | ||
| 632 | + dod=create_definition_of_done("Fix the chapter links"), | ||
| 633 | + executor=executor, # type: ignore[arg-type] | ||
| 634 | + on_confirmation=None, | ||
| 635 | + on_user_question=None, | ||
| 636 | + emit_confirmation=None, | ||
| 637 | + consecutive_errors=0, | ||
| 638 | + ) | ||
| 639 | + | ||
| 640 | + assert len(queued_messages) == 1 | ||
| 641 | + assert "Reuse the earlier observation instead of repeating it." in queued_messages[0] | ||
| 642 | + assert "index.html" in queued_messages[0] | ||
| 643 | + | ||
| 644 | + | ||
| 543 | async def _noop_emit(event: AgentEvent) -> None: | 645 | async def _noop_emit(event: AgentEvent) -> None: |
| 544 | return None | 646 | return None |
| 545 | 647 | ||
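The runner test injects a pre-baked `DUPLICATE` outcome and asserts that exactly one message reaches `queue_steering_message_callback`. A sketch of just that dispatch step, with the enum shape and callback wiring taken from the test and the wording illustrative:

```python
# Sketch of the queueing step only; the COMPLETED member is an assumption
# standing in for whatever non-duplicate states the runtime defines.
from enum import Enum, auto
from typing import Callable


class ToolExecutionState(Enum):
    COMPLETED = auto()
    DUPLICATE = auto()


def maybe_queue_duplicate_nudge(
    state: ToolExecutionState,
    file_path: str,
    queue_steering_message: Callable[[str], None],
) -> None:
    if state is ToolExecutionState.DUPLICATE:
        queue_steering_message(
            "Reuse the earlier observation instead of repeating it. "
            f"The earlier read of {file_path} still holds."
        )
```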
tests/test_workflow.py (modified)
@@ -4,6 +4,7 @@ from __future__ import annotations | |||
| 4 | 4 | ||
| 5 | from pathlib import Path | 5 | from pathlib import Path |
| 6 | 6 | ||
| 7 | +from loader.llm.base import ToolCall | ||
| 7 | from loader.runtime.clarify_grounding import ClarifyGrounding, ClarifyRepoFact | 8 | from loader.runtime.clarify_grounding import ClarifyGrounding, ClarifyRepoFact |
| 8 | from loader.runtime.dod import DefinitionOfDoneStore, create_definition_of_done | 9 | from loader.runtime.dod import DefinitionOfDoneStore, create_definition_of_done |
| 9 | from loader.runtime.workflow import ( | 10 | from loader.runtime.workflow import ( |
@@ -12,6 +13,7 @@ from loader.runtime.workflow import ( | |||
| 12 | PlanningArtifacts, | 13 | PlanningArtifacts, |
| 13 | WorkflowArtifactStore, | 14 | WorkflowArtifactStore, |
| 14 | WorkflowMode, | 15 | WorkflowMode, |
| 16 | + advance_todos_from_tool_call, | ||
| 15 | build_execute_bridge, | 17 | build_execute_bridge, |
| 16 | enrich_clarify_brief_with_grounding, | 18 | enrich_clarify_brief_with_grounding, |
| 17 | extract_verification_commands_from_markdown, | 19 | extract_verification_commands_from_markdown, |
@@ -183,6 +185,26 @@ def test_planning_artifacts_recover_embedded_verification_from_legacy_separator( | |||
| 183 | ] | 185 | ] |
| 184 | 186 | ||
| 185 | 187 | ||
| 188 | +def test_extract_verification_commands_from_markdown_splits_code_blocks() -> None: | ||
| 189 | + markdown = "\n".join( | ||
| 190 | + [ | ||
| 191 | + "# Verification Plan", | ||
| 192 | + "", | ||
| 193 | + "## Verification Commands", | ||
| 194 | + "```bash", | ||
| 195 | + "# Check chapter files", | ||
| 196 | + "ls chapters", | ||
| 197 | + "grep -n \"href=\" index.html", | ||
| 198 | + "```", | ||
| 199 | + ] | ||
| 200 | + ) | ||
| 201 | + | ||
| 202 | + assert extract_verification_commands_from_markdown(markdown) == [ | ||
| 203 | + "ls chapters", | ||
| 204 | + 'grep -n "href=" index.html', | ||
| 205 | + ] | ||
| 206 | + | ||
| 207 | + | ||
| 186 | def test_workflow_artifact_store_and_bridge_round_trip(tmp_path: Path) -> None: | 208 | def test_workflow_artifact_store_and_bridge_round_trip(tmp_path: Path) -> None: |
| 187 | store = WorkflowArtifactStore(tmp_path) | 209 | store = WorkflowArtifactStore(tmp_path) |
| 188 | brief = ClarifyBrief.fallback( | 210 | brief = ClarifyBrief.fallback( |
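The new extraction test fixes the contract for fenced blocks: every non-comment line inside the bash fence becomes its own command. A sketch reproducing exactly that behavior (the real parser also handles the legacy separator format covered by the neighboring test):

```python
# Sketch: fenced lines become commands, "#" comment lines are dropped.
def extract_commands(markdown: str) -> list[str]:
    commands: list[str] = []
    in_fence = False
    for line in markdown.splitlines():
        stripped = line.strip()
        if stripped.startswith("```"):
            in_fence = not in_fence
            continue
        if in_fence and stripped and not stripped.startswith("#"):
            commands.append(stripped)
    return commands
```

Run against the test's markdown, this yields `["ls chapters", 'grep -n "href=" index.html']`, the expected list.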
@@ -250,3 +272,93 @@ def test_sync_todos_to_definition_of_done_preserves_runtime_items() -> None: | |||
| 250 | assert "Writing router" in dod.pending_items | 272 | assert "Writing router" in dod.pending_items |
| 251 | assert "Collect verification evidence" in dod.pending_items | 273 | assert "Collect verification evidence" in dod.pending_items |
| 252 | assert "Update tests" in dod.completed_items | 274 | assert "Update tests" in dod.completed_items |
| 275 | + | ||
| 276 | + | ||
| 277 | +def test_advance_todos_from_tool_call_tracks_plan_progress() -> None: | ||
| 278 | + dod = create_definition_of_done("Fix the chapter links in index.html.") | ||
| 279 | + sync_todos_to_definition_of_done( | ||
| 280 | + dod, | ||
| 281 | + [ | ||
| 282 | + { | ||
| 283 | + "content": "First, examine the current index.html file to understand its structure", | ||
| 284 | + "active_form": "Working on: First, examine the current index.html file to understand its structure", | ||
| 285 | + "status": "pending", | ||
| 286 | + }, | ||
| 287 | + { | ||
| 288 | + "content": "List and read all HTML files in the chapters directory to extract chapter information", | ||
| 289 | + "active_form": "Working on: List and read all HTML files in the chapters directory to extract chapter information", | ||
| 290 | + "status": "pending", | ||
| 291 | + }, | ||
| 292 | + { | ||
| 293 | + "content": "Parse chapter titles from each HTML file", | ||
| 294 | + "active_form": "Working on: Parse chapter titles from each HTML file", | ||
| 295 | + "status": "pending", | ||
| 296 | + }, | ||
| 297 | + { | ||
| 298 | + "content": "Update index.html with correct chapter links and titles", | ||
| 299 | + "active_form": "Working on: Update index.html with correct chapter links and titles", | ||
| 300 | + "status": "pending", | ||
| 301 | + }, | ||
| 302 | + { | ||
| 303 | + "content": "Verify the updated index.html file is properly formatted", | ||
| 304 | + "active_form": "Working on: Verify the updated index.html file is properly formatted", | ||
| 305 | + "status": "pending", | ||
| 306 | + }, | ||
| 307 | + ], | ||
| 308 | + ) | ||
| 309 | + | ||
| 310 | + assert advance_todos_from_tool_call( | ||
| 311 | + dod, | ||
| 312 | + ToolCall( | ||
| 313 | + id="read-index", | ||
| 314 | + name="read", | ||
| 315 | + arguments={"file_path": "/tmp/fortran/index.html"}, | ||
| 316 | + ), | ||
| 317 | + ) | ||
| 318 | + assert ( | ||
| 319 | + "First, examine the current index.html file to understand its structure" | ||
| 320 | + in dod.completed_items | ||
| 321 | + ) | ||
| 322 | + | ||
| 323 | + assert advance_todos_from_tool_call( | ||
| 324 | + dod, | ||
| 325 | + ToolCall( | ||
| 326 | + id="glob-chapters", | ||
| 327 | + name="glob", | ||
| 328 | + arguments={"path": "/tmp/fortran/chapters", "pattern": "*.html"}, | ||
| 329 | + ), | ||
| 330 | + ) | ||
| 331 | + assert ( | ||
| 332 | + "List and read all HTML files in the chapters directory to extract chapter information" | ||
| 333 | + in dod.completed_items | ||
| 334 | + ) | ||
| 335 | + | ||
| 336 | + assert advance_todos_from_tool_call( | ||
| 337 | + dod, | ||
| 338 | + ToolCall( | ||
| 339 | + id="read-chapter", | ||
| 340 | + name="read", | ||
| 341 | + arguments={"file_path": "/tmp/fortran/chapters/01-introduction.html"}, | ||
| 342 | + ), | ||
| 343 | + ) | ||
| 344 | + assert "Parse chapter titles from each HTML file" in dod.completed_items | ||
| 345 | + | ||
| 346 | + assert advance_todos_from_tool_call( | ||
| 347 | + dod, | ||
| 348 | + ToolCall( | ||
| 349 | + id="patch-index", | ||
| 350 | + name="patch", | ||
| 351 | + arguments={"file_path": "/tmp/fortran/index.html", "hunks": []}, | ||
| 352 | + ), | ||
| 353 | + ) | ||
| 354 | + assert "Update index.html with correct chapter links and titles" in dod.completed_items | ||
| 355 | + | ||
| 356 | + assert advance_todos_from_tool_call( | ||
| 357 | + dod, | ||
| 358 | + ToolCall( | ||
| 359 | + id="verify-index", | ||
| 360 | + name="bash", | ||
| 361 | + arguments={"command": "grep -o 'href=\"[^\"]*\"' /tmp/fortran/index.html"}, | ||
| 362 | + ), | ||
| 363 | + ) | ||
| 364 | + assert "Verify the updated index.html file is properly formatted" in dod.completed_items | ||