Refine qwen recovery loops and verification
- SHA: f0d44903e030e62ea940f004e583842d18ab991c
- Parents: 03423c5
- Tree: dde0785

| Status | File | + | - |
|---|---|---|---|
| M |
src/loader/runtime/dod.py
|
83 | 0 |
| M |
src/loader/runtime/finalization.py
|
9 | 0 |
| M |
src/loader/runtime/hooks.py
|
30 | 0 |
| M |
src/loader/runtime/safeguard_services.py
|
73 | 22 |
| M |
src/loader/runtime/workflow.py
|
41 | 2 |
| M |
tests/test_dod.py
|
36 | 0 |
| M |
tests/test_permissions.py
|
39 | 0 |
| M |
tests/test_runtime_harness.py
|
53 | 0 |
| M |
tests/test_safeguard_services.py
|
43 | 0 |
| M |
tests/test_workflow.py
|
33 | 0 |
| M |
tests/test_workflow_runtime.py
|
79 | 0 |
src/loader/runtime/dod.py (modified) @@ -208,6 +208,11 @@ def derive_verification_commands(
| 208 | 208 | """Generate verification commands from execution history and project shape.""" |
| 209 | 209 | |
| 210 | 210 | commands: list[str] = [] |
| 211 | + semantic_command = _derive_html_toc_verification_command( | |
| 212 | + dod, | |
| 213 | + project_root=project_root, | |
| 214 | + task_statement=task_statement, | |
| 215 | + ) | |
| 211 | 216 | |
| 212 | 217 | explicit = [cmd for cmd in dod.successful_commands if _is_verification_command(cmd)] |
| 213 | 218 | for command in explicit: |
@@ -220,6 +225,9 @@ def derive_verification_commands( | ||
| 220 | 225 | if path.suffix == ".py": |
| 221 | 226 | _append_unique(commands, f"python {shlex.quote(path.name)}") |
| 222 | 227 | |
| 228 | + if semantic_command: | |
| 229 | + _append_unique(commands, semantic_command) | |
| 230 | + | |
| 223 | 231 | if commands: |
| 224 | 232 | return commands |
| 225 | 233 | |
@@ -465,6 +473,81 @@ def _extract_files_from_bash(command: str) -> list[str]: | ||
| 465 | 473 | return [] |
| 466 | 474 | |
| 467 | 475 | |
| 476 | +def _derive_html_toc_verification_command( | |
| 477 | + dod: DefinitionOfDone, | |
| 478 | + *, | |
| 479 | + project_root: Path, | |
| 480 | + task_statement: str, | |
| 481 | +) -> str | None: | |
| 482 | + task_hints = " ".join([task_statement, *dod.acceptance_criteria]).lower() | |
| 483 | + if not any( | |
| 484 | + hint in task_hints | |
| 485 | + for hint in ("href", "link", "links", "table of contents", "chapter title") | |
| 486 | + ): | |
| 487 | + return None | |
| 488 | + | |
| 489 | + for path_str in dod.touched_files: | |
| 490 | + path = Path(path_str) | |
| 491 | + effective_path = path if path.is_absolute() else (project_root / path) | |
| 492 | + if effective_path.name != "index.html" or effective_path.suffix != ".html": | |
| 493 | + continue | |
| 494 | + if not (effective_path.parent / "chapters").is_dir(): | |
| 495 | + continue | |
| 496 | + return _build_html_toc_verification_command(effective_path) | |
| 497 | + return None | |
| 498 | + | |
| 499 | + | |
| 500 | +def _build_html_toc_verification_command(index_path: Path) -> str: | |
| 501 | + path_literal = repr(str(index_path)) | |
| 502 | + return "\n".join( | |
| 503 | + [ | |
| 504 | + "/usr/bin/python3 - <<'PY'", | |
| 505 | + "from pathlib import Path", | |
| 506 | + "import re", | |
| 507 | + "import sys", | |
| 508 | + "", | |
| 509 | + f"index = Path({path_literal}).expanduser()", | |
| 510 | + "root = index.parent", | |
| 511 | + "text = index.read_text()", | |
| 512 | + "section_match = re.search(r'<ul class=\"chapter-list\">(.*?)</ul>', text, re.S)", | |
| 513 | + "if section_match is None:", | |
| 514 | + " print('Missing chapter-list table of contents', file=sys.stderr)", | |
| 515 | + " raise SystemExit(1)", | |
| 516 | + "links = re.findall(r'<a href=\"([^\"]+)\">([^<]+)</a>', section_match.group(1))", | |
| 517 | + "if not links:", | |
| 518 | + " print('No chapter links found in table of contents', file=sys.stderr)", | |
| 519 | + " raise SystemExit(1)", | |
| 520 | + "", | |
| 521 | + "missing = []", | |
| 522 | + "mismatched = []", | |
| 523 | + "for href, label in links:", | |
| 524 | + " target = (root / href).resolve()", | |
| 525 | + " if not target.exists():", | |
| 526 | + " missing.append(f'{href} -> missing')", | |
| 527 | + " continue", | |
| 528 | + " body = target.read_text()", | |
| 529 | + " match = re.search(r'<h1>(.*?)</h1>', body, re.S)", | |
| 530 | + " title = match.group(1).strip() if match else ''", | |
| 531 | + " if title and label.strip() != title:", | |
| 532 | + " mismatched.append(f'{href} -> {label.strip()} != {title}')", | |
| 533 | + "", | |
| 534 | + "if missing or mismatched:", | |
| 535 | + " if missing:", | |
| 536 | + " print('Missing links:', file=sys.stderr)", | |
| 537 | + " for item in missing:", | |
| 538 | + " print(item, file=sys.stderr)", | |
| 539 | + " if mismatched:", | |
| 540 | + " print('Title mismatches:', file=sys.stderr)", | |
| 541 | + " for item in mismatched:", | |
| 542 | + " print(item, file=sys.stderr)", | |
| 543 | + " raise SystemExit(1)", | |
| 544 | + "", | |
| 545 | + "print(f'validated {len(links)} toc links in {index.name}')", | |
| 546 | + "PY", | |
| 547 | + ] | |
| 548 | + ) | |
| 549 | + | |
| 550 | + | |
| 468 | 551 | def _first_non_empty_line(text: str) -> str: |
| 469 | 552 | for line in text.splitlines(): |
| 470 | 553 | stripped = line.strip() |
src/loader/runtime/finalization.py (modified) @@ -216,6 +216,15 @@ class TurnFinalizer:
| 216 | 216 | Path(dod.verification_plan).read_text() |
| 217 | 217 | ) |
| 218 | 218 | |
| 219 | + if ( | |
| 220 | + not dod.verification_commands | |
| 221 | + and dod.implementation_plan | |
| 222 | + and Path(dod.implementation_plan).exists() | |
| 223 | + ): | |
| 224 | + dod.verification_commands = extract_verification_commands_from_markdown( | |
| 225 | + Path(dod.implementation_plan).read_text() | |
| 226 | + ) | |
| 227 | + | |
| 219 | 228 | if not dod.verification_commands: |
| 220 | 229 | dod.verification_commands = derive_verification_commands( |
| 221 | 230 | dod, |
src/loader/runtime/hooks.py (modified) @@ -131,6 +131,35 @@ class FilePathAliasHook(BaseToolHook):
| 131 | 131 | return HookResult() |
| 132 | 132 | |
| 133 | 133 | |
| 134 | +class SearchPathAliasHook(BaseToolHook): | |
| 135 | + """Normalize common search-path aliases before validation and execution.""" | |
| 136 | + | |
| 137 | + _SEARCH_TOOLS = frozenset({"glob", "grep"}) | |
| 138 | + _ALIASES = ("directory", "dir", "folder") | |
| 139 | + | |
| 140 | + async def pre_tool_use(self, context: HookContext) -> HookResult: | |
| 141 | + if context.tool_call.name not in self._SEARCH_TOOLS: | |
| 142 | + return HookResult() | |
| 143 | + | |
| 144 | + arguments = context.tool_call.arguments | |
| 145 | + path = str(arguments.get("path", "")).strip() | |
| 146 | + if path: | |
| 147 | + return HookResult() | |
| 148 | + | |
| 149 | + for alias in self._ALIASES: | |
| 150 | + candidate = arguments.get(alias) | |
| 151 | + if not str(candidate or "").strip(): | |
| 152 | + continue | |
| 153 | + | |
| 154 | + updated_arguments = dict(arguments) | |
| 155 | + updated_arguments["path"] = candidate | |
| 156 | + for cleanup_key in self._ALIASES: | |
| 157 | + updated_arguments.pop(cleanup_key, None) | |
| 158 | + return HookResult(updated_arguments=updated_arguments) | |
| 159 | + | |
| 160 | + return HookResult() | |
| 161 | + | |
| 162 | + | |
| 134 | 163 | class HookManager: |
| 135 | 164 | """Runs tool hooks across Loader's three lifecycle events.""" |
| 136 | 165 | |
@@ -327,6 +356,7 @@ def build_default_tool_hooks( | ||
| 327 | 356 | return HookManager( |
| 328 | 357 | [ |
| 329 | 358 | FilePathAliasHook(), |
| 359 | + SearchPathAliasHook(), | |
| 330 | 360 | DuplicateActionHook(action_tracker), |
| 331 | 361 | ActionValidationHook(validator), |
| 332 | 362 | RollbackTrackingHook(registry, rollback_plan), |
src/loader/runtime/safeguard_services.py (modified) @@ -16,6 +16,9 @@ class ActionTracker:
| 16 | 16 | LOOP_REPEAT_THRESHOLD = 2 |
| 17 | 17 | MAX_RESPONSE_HISTORY = 5 |
| 18 | 18 | OBSERVATION_REPEAT_WINDOW = 8 |
| 19 | + READ_REPEAT_THRESHOLD = 3 | |
| 20 | + SEARCH_REPEAT_THRESHOLD = 2 | |
| 21 | + BASH_OBSERVATION_REPEAT_THRESHOLD = 2 | |
| 19 | 22 | |
| 20 | 23 | def __init__(self) -> None: |
| 21 | 24 | self._file_writes: dict[str, list[str]] = {} |
@@ -26,9 +29,9 @@ class ActionTracker: | ||
| 26 | 29 | self._response_history: list[str] = [] |
| 27 | 30 | self._action_index = 0 |
| 28 | 31 | self._mutation_epoch = 0 |
| 29 | - self._recent_reads: dict[str, tuple[int, int]] = {} | |
| 30 | - self._recent_searches: dict[str, tuple[int, int]] = {} | |
| 31 | - self._recent_bash_observations: dict[str, tuple[int, int]] = {} | |
| 32 | + self._recent_reads: dict[str, tuple[int, int, int]] = {} | |
| 33 | + self._recent_searches: dict[str, tuple[int, int, int]] = {} | |
| 34 | + self._recent_bash_observations: dict[str, tuple[int, int, int]] = {} | |
| 32 | 35 | |
| 33 | 36 | def reset(self) -> None: |
| 34 | 37 | self._file_writes.clear() |
@@ -124,12 +127,17 @@ class ActionTracker: | ||
| 124 | 127 | return True, f"Same patch already applied to: {file_path}" |
| 125 | 128 | |
| 126 | 129 | elif tool_name == "read": |
| 127 | - file_path = str(arguments.get("file_path", "")).strip() | |
| 128 | - if file_path: | |
| 130 | + read_key = self._make_read_key(arguments) | |
| 131 | + if read_key: | |
| 129 | 132 | duplicate, reason = self._check_recent_observation( |
| 130 | 133 | self._recent_reads, |
| 131 | - self._normalize_path(file_path), | |
| 132 | - f"Already read {file_path} recently without any intervening changes", | |
| 134 | + read_key, | |
| 135 | + ( | |
| 136 | + "Already read " | |
| 137 | + f"{str(arguments.get('file_path', '')).strip()} " | |
| 138 | + "recently without any intervening changes" | |
| 139 | + ), | |
| 140 | + repeat_threshold=self.READ_REPEAT_THRESHOLD, | |
| 133 | 141 | ) |
| 134 | 142 | if duplicate: |
| 135 | 143 | return True, reason |
@@ -141,6 +149,7 @@ class ActionTracker: | ||
| 141 | 149 | self._recent_searches, |
| 142 | 150 | observation_key, |
| 143 | 151 | "Already ran the same search recently without any intervening changes", |
| 152 | + repeat_threshold=self.SEARCH_REPEAT_THRESHOLD, | |
| 144 | 153 | ) |
| 145 | 154 | if duplicate: |
| 146 | 155 | return True, reason |
@@ -152,6 +161,7 @@ class ActionTracker: | ||
| 152 | 161 | self._recent_bash_observations, |
| 153 | 162 | self._normalize_command(command), |
| 154 | 163 | "Already ran the same read-only shell probe recently without any intervening changes", |
| 164 | + repeat_threshold=self.BASH_OBSERVATION_REPEAT_THRESHOLD, | |
| 155 | 165 | ) |
| 156 | 166 | if duplicate: |
| 157 | 167 | return True, reason |
@@ -190,19 +200,19 @@ class ActionTracker: | ||
| 190 | 200 | self._note_mutation() |
| 191 | 201 | |
| 192 | 202 | elif tool_name == "read": |
| 193 | - file_path = str(arguments.get("file_path", "")).strip() | |
| 194 | - if file_path: | |
| 195 | - self._recent_reads[self._normalize_path(file_path)] = ( | |
| 196 | - self._mutation_epoch, | |
| 197 | - self._action_index, | |
| 203 | + read_key = self._make_read_key(arguments) | |
| 204 | + if read_key: | |
| 205 | + self._record_observation( | |
| 206 | + self._recent_reads, | |
| 207 | + read_key, | |
| 198 | 208 | ) |
| 199 | 209 | |
| 200 | 210 | elif tool_name in {"glob", "grep"}: |
| 201 | 211 | observation_key = self._make_search_key(tool_name, arguments) |
| 202 | 212 | if observation_key: |
| 203 | - self._recent_searches[observation_key] = ( | |
| 204 | - self._mutation_epoch, | |
| 205 | - self._action_index, | |
| 213 | + self._record_observation( | |
| 214 | + self._recent_searches, | |
| 215 | + observation_key, | |
| 206 | 216 | ) |
| 207 | 217 | |
| 208 | 218 | elif tool_name == "bash": |
@@ -212,9 +222,9 @@ class ActionTracker: | ||
| 212 | 222 | if self._is_mutating_bash(command): |
| 213 | 223 | self._note_mutation() |
| 214 | 224 | elif self._is_observational_bash(command): |
| 215 | - self._recent_bash_observations[self._normalize_command(command)] = ( | |
| 216 | - self._mutation_epoch, | |
| 217 | - self._action_index, | |
| 225 | + self._record_observation( | |
| 226 | + self._recent_bash_observations, | |
| 227 | + self._normalize_command(command), | |
| 218 | 228 | ) |
| 219 | 229 | |
| 220 | 230 | def detect_loop(self) -> tuple[bool, str]: |
@@ -299,20 +309,49 @@ class ActionTracker: | ||
| 299 | 309 | |
| 300 | 310 | def _check_recent_observation( |
| 301 | 311 | self, |
| 302 | - cache: dict[str, tuple[int, int]], | |
| 312 | + cache: dict[str, tuple[int, int, int]], | |
| 303 | 313 | key: str, |
| 304 | 314 | reason: str, |
| 315 | + *, | |
| 316 | + repeat_threshold: int, | |
| 305 | 317 | ) -> tuple[bool, str]: |
| 306 | 318 | last_seen = cache.get(key) |
| 307 | 319 | if last_seen is None: |
| 308 | 320 | return False, "" |
| 309 | 321 | |
| 310 | - last_epoch, last_index = last_seen | |
| 322 | + last_epoch, last_index, repeat_count = last_seen | |
| 311 | 323 | if last_epoch != self._mutation_epoch: |
| 312 | 324 | return False, "" |
| 313 | - if (self._action_index - last_index) > self.OBSERVATION_REPEAT_WINDOW: | |
| 325 | + gap = self._action_index - last_index | |
| 326 | + if gap > self.OBSERVATION_REPEAT_WINDOW: | |
| 314 | 327 | return False, "" |
| 315 | - return True, reason | |
| 328 | + if gap <= 0: | |
| 329 | + return True, reason | |
| 330 | + if repeat_count >= repeat_threshold: | |
| 331 | + return True, reason | |
| 332 | + return False, "" | |
| 333 | + | |
| 334 | + def _record_observation( | |
| 335 | + self, | |
| 336 | + cache: dict[str, tuple[int, int, int]], | |
| 337 | + key: str, | |
| 338 | + ) -> None: | |
| 339 | + last_seen = cache.get(key) | |
| 340 | + if last_seen is None: | |
| 341 | + cache[key] = (self._mutation_epoch, self._action_index, 1) | |
| 342 | + return | |
| 343 | + | |
| 344 | + last_epoch, last_index, repeat_count = last_seen | |
| 345 | + gap = self._action_index - last_index | |
| 346 | + if last_epoch != self._mutation_epoch or gap > self.OBSERVATION_REPEAT_WINDOW: | |
| 347 | + cache[key] = (self._mutation_epoch, self._action_index, 1) | |
| 348 | + return | |
| 349 | + | |
| 350 | + cache[key] = ( | |
| 351 | + self._mutation_epoch, | |
| 352 | + self._action_index, | |
| 353 | + repeat_count + 1, | |
| 354 | + ) | |
| 316 | 355 | |
| 317 | 356 | def _make_search_key(self, tool_name: str, arguments: dict) -> str | None: |
| 318 | 357 | pattern = str(arguments.get("pattern", "")).strip() |
@@ -322,6 +361,18 @@ class ActionTracker: | ||
| 322 | 361 | normalized_path = self._normalize_path(path) if path else "" |
| 323 | 362 | return f"{tool_name}:{normalized_path}:{pattern}" |
| 324 | 363 | |
| 364 | + def _make_read_key(self, arguments: dict) -> str | None: | |
| 365 | + file_path = str(arguments.get("file_path", "")).strip() | |
| 366 | + if not file_path: | |
| 367 | + return None | |
| 368 | + offset = str(arguments.get("offset", "")).strip() | |
| 369 | + limit = str(arguments.get("limit", "")).strip() | |
| 370 | + return ( | |
| 371 | + f"{self._normalize_path(file_path)}" | |
| 372 | + f":offset={offset or 'full'}" | |
| 373 | + f":limit={limit or 'all'}" | |
| 374 | + ) | |
| 375 | + | |
| 325 | 376 | def _is_observational_bash(self, command: str) -> bool: |
| 326 | 377 | norm_cmd = self._normalize_command(command) |
| 327 | 378 | if not norm_cmd: |
src/loader/runtime/workflow.py (modified) @@ -52,6 +52,10 @@ __all__ = [
| 52 | 52 | ] |
| 53 | 53 | |
| 54 | 54 | VERIFICATION_SEPARATOR = "<<<VERIFICATION>>>" |
| 55 | +_VERIFICATION_SEPARATORS = ( | |
| 56 | + VERIFICATION_SEPARATOR, | |
| 57 | + "<<VERIFICATION>>", | |
| 58 | +) | |
| 55 | 59 | _GENERIC_TOUCHPOINTS = { |
| 56 | 60 | "Determine the concrete files during execution.", |
| 57 | 61 | "Identify exact files during planning or execution.", |
@@ -518,12 +522,47 @@ def build_execute_bridge( | ||
| 518 | 522 | |
| 519 | 523 | |
| 520 | 524 | def _split_plan_output(model_output: str) -> tuple[str, str]: |
| 521 | - if VERIFICATION_SEPARATOR in model_output: | |
| 522 | - implementation, verification = model_output.split(VERIFICATION_SEPARATOR, maxsplit=1) | |
| 525 | + for separator in _VERIFICATION_SEPARATORS: | |
| 526 | + if separator not in model_output: | |
| 527 | + continue | |
| 528 | + implementation, verification = model_output.split(separator, maxsplit=1) | |
| 529 | + split = _split_embedded_verification_heading( | |
| 530 | + implementation.strip(), | |
| 531 | + fallback_verification=verification.strip(), | |
| 532 | + ) | |
| 533 | + if split is not None: | |
| 534 | + return split | |
| 523 | 535 | return implementation.strip(), verification.strip() |
| 536 | + | |
| 537 | + split = _split_embedded_verification_heading(model_output.strip()) | |
| 538 | + if split is not None: | |
| 539 | + return split | |
| 524 | 540 | return model_output.strip(), "" |
| 525 | 541 | |
| 526 | 542 | |
| 543 | +def _split_embedded_verification_heading( | |
| 544 | + implementation_markdown: str, | |
| 545 | + *, | |
| 546 | + fallback_verification: str = "", | |
| 547 | +) -> tuple[str, str] | None: | |
| 548 | + match = re.search(r"(?m)^#\s+Verification Plan\s*$", implementation_markdown) | |
| 549 | + if match is None: | |
| 550 | + if fallback_verification.strip(): | |
| 551 | + return implementation_markdown, fallback_verification.strip() | |
| 552 | + return None | |
| 553 | + | |
| 554 | + implementation = implementation_markdown[:match.start()].rstrip() | |
| 555 | + verification = implementation_markdown[match.start():].strip() | |
| 556 | + if not implementation: | |
| 557 | + implementation = implementation_markdown.strip() | |
| 558 | + verification = fallback_verification.strip() | |
| 559 | + if not verification: | |
| 560 | + verification = fallback_verification.strip() | |
| 561 | + if not implementation or not verification: | |
| 562 | + return None | |
| 563 | + return implementation, verification | |
| 564 | + | |
| 565 | + | |
| 527 | 566 | def _ensure_heading(markdown: str, heading: str) -> str: |
| 528 | 567 | stripped = markdown.strip() |
| 529 | 568 | if not stripped: |
tests/test_dod.py (modified) @@ -103,3 +103,39 @@ def test_record_successful_tool_call_preserves_absolute_path_string(tmp_path: Path
| 103 | 103 | ) |
| 104 | 104 | |
| 105 | 105 | assert dod.touched_files == [str(absolute_path)] |
| 106 | + | |
| 107 | + | |
| 108 | +def test_derive_verification_commands_adds_semantic_html_toc_check(tmp_path: Path) -> None: | |
| 109 | + chapters = tmp_path / "chapters" | |
| 110 | + chapters.mkdir() | |
| 111 | + (chapters / "01-introduction.html").write_text( | |
| 112 | + "<h1>Chapter 1: Introduction to Fortran</h1>\n" | |
| 113 | + ) | |
| 114 | + index = tmp_path / "index.html" | |
| 115 | + index.write_text( | |
| 116 | + "\n".join( | |
| 117 | + [ | |
| 118 | + '<ul class="chapter-list">', | |
| 119 | + ' <li><a href="chapters/01-introduction.html">Chapter 1: Introduction to Fortran</a></li>', | |
| 120 | + "</ul>", | |
| 121 | + ] | |
| 122 | + ) | |
| 123 | + ) | |
| 124 | + | |
| 125 | + dod = create_definition_of_done( | |
| 126 | + "Update index.html so the table of contents links and hrefs are correct." | |
| 127 | + ) | |
| 128 | + dod.acceptance_criteria = [ | |
| 129 | + "All table of contents links in index.html point to existing chapter files.", | |
| 130 | + "All link texts match the actual chapter titles.", | |
| 131 | + ] | |
| 132 | + dod.touched_files = [str(index)] | |
| 133 | + | |
| 134 | + commands = derive_verification_commands( | |
| 135 | + dod, | |
| 136 | + project_root=tmp_path, | |
| 137 | + task_statement=dod.task_statement, | |
| 138 | + ) | |
| 139 | + | |
| 140 | + assert any(command.startswith("/usr/bin/python3 - <<'PY'") for command in commands) | |
| 141 | + assert not any(command == f"test -f {index}" for command in commands) | |
tests/test_permissions.py (modified) @@ -15,6 +15,7 @@ from loader.runtime.hooks import (
| 15 | 15 | HookContext, |
| 16 | 16 | HookManager, |
| 17 | 17 | HookResult, |
| 18 | + SearchPathAliasHook, | |
| 18 | 19 | ) |
| 19 | 20 | from loader.runtime.permissions import ( |
| 20 | 21 | PermissionMode, |
@@ -342,3 +343,41 @@ async def test_file_path_alias_hook_canonicalizes_common_aliases( | ||
| 342 | 343 | assert result.updated_arguments["file_path"] == expected_path |
| 343 | 344 | for alias in ("file", "filepath", "filePath", "filename", "path"): |
| 344 | 345 | assert alias not in result.updated_arguments |
| 346 | + | |
| 347 | + | |
| 348 | +@pytest.mark.asyncio | |
| 349 | +@pytest.mark.parametrize( | |
| 350 | + ("tool_name", "arguments", "expected_path"), | |
| 351 | + [ | |
| 352 | + ("glob", {"pattern": "*.html", "directory": "chapters"}, "chapters"), | |
| 353 | + ("grep", {"pattern": "alpha", "dir": "src"}, "src"), | |
| 354 | + ], | |
| 355 | +) | |
| 356 | +async def test_search_path_alias_hook_canonicalizes_common_aliases( | |
| 357 | + temp_dir: Path, | |
| 358 | + tool_name: str, | |
| 359 | + arguments: dict[str, object], | |
| 360 | + expected_path: str, | |
| 361 | +) -> None: | |
| 362 | + registry = create_default_registry(temp_dir) | |
| 363 | + policy = build_permission_policy( | |
| 364 | + active_mode=PermissionMode.WORKSPACE_WRITE, | |
| 365 | + workspace_root=temp_dir, | |
| 366 | + tool_requirements=registry.get_tool_requirements(), | |
| 367 | + ) | |
| 368 | + hook = SearchPathAliasHook() | |
| 369 | + | |
| 370 | + result = await hook.pre_tool_use( | |
| 371 | + HookContext( | |
| 372 | + tool_call=ToolCall(id=f"{tool_name}-1", name=tool_name, arguments=arguments), | |
| 373 | + tool=registry.get(tool_name), | |
| 374 | + registry=registry, | |
| 375 | + permission_policy=policy, | |
| 376 | + source="native", | |
| 377 | + ) | |
| 378 | + ) | |
| 379 | + | |
| 380 | + assert result.updated_arguments is not None | |
| 381 | + assert result.updated_arguments["path"] == expected_path | |
| 382 | + for alias in ("directory", "dir", "folder"): | |
| 383 | + assert alias not in result.updated_arguments | |
tests/test_runtime_harness.py (modified) @@ -1766,6 +1766,59 @@ async def test_duplicate_read_is_skipped_without_intervening_mutation(
| 1766 | 1766 | assert "existing file contents" in run.response |
| 1767 | 1767 | |
| 1768 | 1768 | |
| 1769 | +@pytest.mark.asyncio | |
| 1770 | +async def test_interleaved_reread_is_allowed_once_without_intervening_mutation( | |
| 1771 | + temp_dir: Path, | |
| 1772 | +) -> None: | |
| 1773 | + index_file = temp_dir / "index.html" | |
| 1774 | + chapter_file = temp_dir / "chapter-1.html" | |
| 1775 | + index_file.write_text("table of contents\n") | |
| 1776 | + chapter_file.write_text("chapter body\n") | |
| 1777 | + | |
| 1778 | + backend = ScriptedBackend( | |
| 1779 | + completions=[ | |
| 1780 | + native_tool_response( | |
| 1781 | + ToolCall( | |
| 1782 | + id="read-1", | |
| 1783 | + name="read", | |
| 1784 | + arguments={"file_path": str(index_file)}, | |
| 1785 | + ), | |
| 1786 | + content="I'll inspect the index first.", | |
| 1787 | + ), | |
| 1788 | + native_tool_response( | |
| 1789 | + ToolCall( | |
| 1790 | + id="read-2", | |
| 1791 | + name="read", | |
| 1792 | + arguments={"file_path": str(chapter_file)}, | |
| 1793 | + ), | |
| 1794 | + content="I'll inspect the chapter next.", | |
| 1795 | + ), | |
| 1796 | + native_tool_response( | |
| 1797 | + ToolCall( | |
| 1798 | + id="read-3", | |
| 1799 | + name="read", | |
| 1800 | + arguments={"file_path": str(index_file)}, | |
| 1801 | + ), | |
| 1802 | + content="I'll reopen the index to reconcile the findings.", | |
| 1803 | + ), | |
| 1804 | + final_response("I re-opened the index after checking the chapter."), | |
| 1805 | + ] | |
| 1806 | + ) | |
| 1807 | + | |
| 1808 | + run = await run_scenario( | |
| 1809 | + "Inspect the index, inspect a chapter, then return to the index.", | |
| 1810 | + backend, | |
| 1811 | + config=non_streaming_config(), | |
| 1812 | + project_root=temp_dir, | |
| 1813 | + ) | |
| 1814 | + | |
| 1815 | + assert tool_event_names(run) == ["read", "read", "read"] | |
| 1816 | + messages = tool_result_messages(run) | |
| 1817 | + assert not any("Skipped - duplicate action" in message for message in messages) | |
| 1818 | + assert sum("table of contents" in message for message in messages) == 2 | |
| 1819 | + assert any("chapter body" in message for message in messages) | |
| 1820 | + | |
| 1821 | + | |
| 1769 | 1822 | @pytest.mark.asyncio |
| 1770 | 1823 | async def test_repeated_bash_probe_is_allowed_after_mutation( |
| 1771 | 1824 | temp_dir: Path, |
tests/test_safeguard_services.py (modified) @@ -88,6 +88,49 @@ def test_action_tracker_blocks_repeated_read_without_changes(tmp_path) -> None:
| 88 | 88 | assert str(file_path) in reason |
| 89 | 89 | |
| 90 | 90 | |
| 91 | +def test_action_tracker_allows_one_interleaved_reread_without_changes(tmp_path) -> None: | |
| 92 | + tracker = ActionTracker() | |
| 93 | + index_path = tmp_path / "index.html" | |
| 94 | + chapter_path = tmp_path / "chapter-1.html" | |
| 95 | + | |
| 96 | + tracker.record_tool_call("read", {"file_path": str(index_path)}) | |
| 97 | + tracker.record_tool_call("read", {"file_path": str(chapter_path)}) | |
| 98 | + | |
| 99 | + assert tracker.check_tool_call("read", {"file_path": str(index_path)}) == (False, "") | |
| 100 | + | |
| 101 | + | |
| 102 | +def test_action_tracker_allows_reading_a_different_slice_of_the_same_file(tmp_path) -> None: | |
| 103 | + tracker = ActionTracker() | |
| 104 | + index_path = tmp_path / "index.html" | |
| 105 | + | |
| 106 | + tracker.record_tool_call("read", {"file_path": str(index_path)}) | |
| 107 | + | |
| 108 | + assert tracker.check_tool_call( | |
| 109 | + "read", | |
| 110 | + {"file_path": str(index_path), "offset": 1, "limit": 50}, | |
| 111 | + ) == (False, "") | |
| 112 | + | |
| 113 | + | |
| 114 | +def test_action_tracker_blocks_fourth_interleaved_reread_without_changes(tmp_path) -> None: | |
| 115 | + tracker = ActionTracker() | |
| 116 | + index_path = tmp_path / "index.html" | |
| 117 | + chapter_a = tmp_path / "chapter-1.html" | |
| 118 | + chapter_b = tmp_path / "chapter-2.html" | |
| 119 | + chapter_c = tmp_path / "chapter-3.html" | |
| 120 | + | |
| 121 | + tracker.record_tool_call("read", {"file_path": str(index_path)}) | |
| 122 | + tracker.record_tool_call("read", {"file_path": str(chapter_a)}) | |
| 123 | + tracker.record_tool_call("read", {"file_path": str(index_path)}) | |
| 124 | + tracker.record_tool_call("read", {"file_path": str(chapter_b)}) | |
| 125 | + tracker.record_tool_call("read", {"file_path": str(index_path)}) | |
| 126 | + tracker.record_tool_call("read", {"file_path": str(chapter_c)}) | |
| 127 | + | |
| 128 | + is_duplicate, reason = tracker.check_tool_call("read", {"file_path": str(index_path)}) | |
| 129 | + | |
| 130 | + assert is_duplicate is True | |
| 131 | + assert str(index_path) in reason | |
| 132 | + | |
| 133 | + | |
| 91 | 134 | def test_action_tracker_allows_repeated_read_after_mutation(tmp_path) -> None: |
| 92 | 135 | tracker = ActionTracker() |
| 93 | 136 | file_path = tmp_path / "index.html" |
tests/test_workflow.py (modified) @@ -150,6 +150,39 @@ def test_planning_artifacts_round_trip_and_extract_commands() -> None:
| 150 | 150 | ] |
| 151 | 151 | |
| 152 | 152 | |
| 153 | +def test_planning_artifacts_recover_embedded_verification_from_legacy_separator() -> None: | |
| 154 | + artifacts = PlanningArtifacts.from_model_output( | |
| 155 | + "\n".join( | |
| 156 | + [ | |
| 157 | + "# Implementation Plan", | |
| 158 | + "", | |
| 159 | + "## Execution Order", | |
| 160 | + "1. Inspect index.html.", | |
| 161 | + "2. Fix the chapter links.", | |
| 162 | + "", | |
| 163 | + "# Verification Plan", | |
| 164 | + "", | |
| 165 | + "## Acceptance Criteria", | |
| 166 | + "- All chapter links point to real files.", | |
| 167 | + "", | |
| 168 | + "## Verification Commands", | |
| 169 | + "- `grep -o 'href=\"[^\"]*\"' index.html`", | |
| 170 | + "- `ls chapters`", | |
| 171 | + "", | |
| 172 | + "<<VERIFICATION>>", | |
| 173 | + ] | |
| 174 | + ), | |
| 175 | + task_statement="Fix the broken chapter links in index.html.", | |
| 176 | + ) | |
| 177 | + | |
| 178 | + assert "## Verification Commands" not in artifacts.implementation_markdown | |
| 179 | + assert "## Verification Commands" in artifacts.verification_markdown | |
| 180 | + assert artifacts.verification_commands == [ | |
| 181 | + "grep -o 'href=\"[^\"]*\"' index.html", | |
| 182 | + "ls chapters", | |
| 183 | + ] | |
| 184 | + | |
| 185 | + | |
| 153 | 186 | def test_workflow_artifact_store_and_bridge_round_trip(tmp_path: Path) -> None: |
| 154 | 187 | store = WorkflowArtifactStore(tmp_path) |
| 155 | 188 | brief = ClarifyBrief.fallback( |
tests/test_workflow_runtime.py (modified) @@ -72,6 +72,16 @@ def artifact_kinds(run) -> list[str]:
| 72 | 72 | ] |
| 73 | 73 | |
| 74 | 74 | |
| 75 | +def verification_commands(run) -> list[str]: | |
| 76 | + """Return verification-phase bash commands.""" | |
| 77 | + | |
| 78 | + return [ | |
| 79 | + str((event.tool_args or {}).get("command", "")) | |
| 80 | + for event in run.events | |
| 81 | + if event.type == "tool_call" and event.phase == "verification" | |
| 82 | + ] | |
| 83 | + | |
| 84 | + | |
| 75 | 85 | def workflow_timeline_kinds(run) -> list[str]: |
| 76 | 86 | assert run.agent.last_turn_summary is not None |
| 77 | 87 | return [entry.kind for entry in run.agent.last_turn_summary.workflow_timeline] |
@@ -1247,6 +1257,75 @@ async def test_verify_failure_returns_to_execute_without_retriggering_plan( | ||
| 1247 | 1257 | assert "fixed output" in target.read_text() |
| 1248 | 1258 | |
| 1249 | 1259 | |
| 1260 | +@pytest.mark.asyncio | |
| 1261 | +async def test_plan_mode_recovers_verification_commands_from_legacy_separator( | |
| 1262 | + temp_dir: Path, | |
| 1263 | +) -> None: | |
| 1264 | + target = temp_dir / "planned.txt" | |
| 1265 | + backend = ScriptedBackend( | |
| 1266 | + completions=[ | |
| 1267 | + CompletionResponse( | |
| 1268 | + content="\n".join( | |
| 1269 | + [ | |
| 1270 | + "# Implementation Plan", | |
| 1271 | + "", | |
| 1272 | + "## File Changes", | |
| 1273 | + f"- Create {target.name} in the workspace root.", | |
| 1274 | + "", | |
| 1275 | + "## Execution Order", | |
| 1276 | + f"1. Write {target.name}.", | |
| 1277 | + "2. Verify the file exists.", | |
| 1278 | + "", | |
| 1279 | + "## Risks", | |
| 1280 | + "- Losing the verification commands during parsing.", | |
| 1281 | + "", | |
| 1282 | + "# Verification Plan", | |
| 1283 | + "", | |
| 1284 | + "## Acceptance Criteria", | |
| 1285 | + f"- {target.name} exists in the workspace root.", | |
| 1286 | + "", | |
| 1287 | + "## Verification Commands", | |
| 1288 | + f"- `test -f {target}`", | |
| 1289 | + "", | |
| 1290 | + "## Notes", | |
| 1291 | + "- This simulates a legacy separator emitted after the plan body.", | |
| 1292 | + "", | |
| 1293 | + "<<VERIFICATION>>", | |
| 1294 | + ] | |
| 1295 | + ) | |
| 1296 | + ), | |
| 1297 | + CompletionResponse( | |
| 1298 | + content="I'll create the planned artifact.", | |
| 1299 | + tool_calls=[ | |
| 1300 | + ToolCall( | |
| 1301 | + id="write-1", | |
| 1302 | + name="write", | |
| 1303 | + arguments={ | |
| 1304 | + "file_path": str(target), | |
| 1305 | + "content": "planned output\n", | |
| 1306 | + }, | |
| 1307 | + ) | |
| 1308 | + ], | |
| 1309 | + ), | |
| 1310 | + CompletionResponse(content="The planned artifact is in place."), | |
| 1311 | + ] | |
| 1312 | + ) | |
| 1313 | + | |
| 1314 | + run = await run_scenario( | |
| 1315 | + "Implement a persistent workflow mode router with clarify artifacts, " | |
| 1316 | + "planning artifacts, and verification-plan wiring in the runtime.", | |
| 1317 | + backend, | |
| 1318 | + config=non_streaming_config(), | |
| 1319 | + project_root=temp_dir, | |
| 1320 | + ) | |
| 1321 | + | |
| 1322 | + dod = run.agent.last_turn_summary.definition_of_done | |
| 1323 | + assert dod is not None | |
| 1324 | + assert dod.verification_commands == [f"test -f {target}"] | |
| 1325 | + assert verification_commands(run) == [f"test -f {target}"] | |
| 1326 | + assert Path(dod.verification_plan).read_text().count("## Verification Commands") == 1 | |
| 1327 | + | |
| 1328 | + | |
| 1250 | 1329 | @pytest.mark.asyncio |
| 1251 | 1330 | async def test_stale_plan_artifacts_trigger_targeted_plan_refresh( |
| 1252 | 1331 | temp_dir: Path, |