Strengthen qwen recovery and repair flow

SHA: df5639d514ff7b3f567a54307ba0f5d0bd3e03df
Parent: 297e213
Tree: a3395ea
src/loader/runtime/artifact_invalidation.py (modified)
@@ -34,6 +34,8 @@ class ArtifactInvalidationAssessor:
| 34 | 34 | acceptance_criteria: list[str], |
| 35 | 35 | touched_files: list[str], |
| 36 | 36 | last_verification_result: str | None, |
| 37 | + retry_count: int = 0, | |
| 38 | + planned_artifacts_complete: bool = False, | |
| 37 | 39 | ) -> ArtifactFreshness: |
| 38 | 40 | """Return stale-artifact state and the recommended recovery strategy.""" |
| 39 | 41 | |
@@ -46,10 +48,12 @@ class ArtifactInvalidationAssessor: | ||
| 46 | 48 | reason_codes: list[str] = [] |
| 47 | 49 | evidence: list[ArtifactEvidence] = [] |
| 48 | 50 | |
| 51 | + allow_repair_local_touchpoints = planned_artifacts_complete and retry_count > 0 | |
| 49 | 52 | unexpected_paths = [ |
| 50 | 53 | name |
| 51 | 54 | for path in touched_files |
| 52 | - if (name := _path_name(path)) and not _text_covers_path_reference(plan_text, path) | |
| 55 | + if (name := _path_name(path)) | |
| 56 | + and not _text_covers_path_reference(plan_text, path) | |
| 53 | 57 | ] |
| 54 | 58 | confirmed_touchpoints = [ |
| 55 | 59 | name |
@@ -86,13 +90,21 @@ class ArtifactInvalidationAssessor: | ||
| 86 | 90 | f"Persisted artifacts still point at `{item}`.", |
| 87 | 91 | ) |
| 88 | 92 | |
| 89 | - if unexpected_paths: | |
| 93 | + if unexpected_paths and not allow_repair_local_touchpoints: | |
| 90 | 94 | stale_plan = True |
| 91 | 95 | reason_codes.append("touched_files_outside_plan") |
| 92 | 96 | reasons.append( |
| 93 | 97 | "Touched files outside the current plan: " |
| 94 | 98 | + ", ".join(dict.fromkeys(unexpected_paths)) |
| 95 | 99 | ) |
| 100 | + elif unexpected_paths: | |
| 101 | + for item in dict.fromkeys(unexpected_paths): | |
| 102 | + _append_evidence( | |
| 103 | + evidence, | |
| 104 | + ArtifactEvidenceKind.CONFIRMED_TOUCHPOINT, | |
| 105 | + "Verification repair touched supplemental file " | |
| 106 | + f"`{item}` after the originally planned artifacts were complete.", | |
| 107 | + ) | |
| 96 | 108 | |
| 97 | 109 | acceptance_anchors = [ |
| 98 | 110 | item |
src/loader/runtime/compaction.py (modified)
@@ -8,7 +8,6 @@ from dataclasses import dataclass
| 8 | 8 | from pathlib import Path |
| 9 | 9 | |
| 10 | 10 | from ..llm.base import Message, Role, ToolCall |
| 11 | -from .semantic_rules import html_toc as html_toc_rule | |
| 12 | 11 | |
| 13 | 12 | DEFAULT_AUTO_COMPACTION_INPUT_TOKENS_THRESHOLD = 100_000 |
| 14 | 13 | MIN_AUTO_COMPACTION_INPUT_TOKENS_THRESHOLD = 12_000 |
@@ -337,19 +336,7 @@ def infer_preferred_next_step( | ||
| 337 | 336 | current_task=current_task, |
| 338 | 337 | focus_path=focus_path, |
| 339 | 338 | ) |
| 340 | - has_confirmed_titles = _summarize_html_title_discovery(relevant_messages) is not None | |
| 341 | - verification_gap = _summarize_latest_html_verification_gap(relevant_messages) | |
| 342 | 339 | if target_path: |
| 343 | - if verification_gap: | |
| 344 | - return ( | |
| 345 | - f"Update `{target_path}` to fix the specific verification failures " | |
| 346 | - f"({verification_gap}) instead of restarting discovery." | |
| 347 | - ) | |
| 348 | - if has_confirmed_titles: | |
| 349 | - return ( | |
| 350 | - f"Update `{target_path}` using the confirmed chapter file/title pairs " | |
| 351 | - "instead of rereading files." | |
| 352 | - ) | |
| 353 | 340 | return ( |
| 354 | 341 | f"Update `{target_path}` using the confirmed findings instead of " |
| 355 | 342 | "restarting earlier discovery steps." |
@@ -440,27 +427,6 @@ def _collect_confirmed_facts(messages: list[Message]) -> list[str]: | ||
| 440 | 427 | if explicit_mapping_fact: |
| 441 | 428 | facts.append(explicit_mapping_fact) |
| 442 | 429 | |
| 443 | - verification_gap_fact = _collect_html_verification_gap_fact( | |
| 444 | - messages, | |
| 445 | - tool_calls_by_id=tool_calls_by_id, | |
| 446 | - ) | |
| 447 | - if verification_gap_fact: | |
| 448 | - facts.append(verification_gap_fact) | |
| 449 | - | |
| 450 | - title_fact = _summarize_html_title_discovery( | |
| 451 | - messages, | |
| 452 | - tool_calls_by_id=tool_calls_by_id, | |
| 453 | - ) | |
| 454 | - if title_fact: | |
| 455 | - facts.append(title_fact) | |
| 456 | - | |
| 457 | - file_fact = _collect_html_file_discovery_fact( | |
| 458 | - messages, | |
| 459 | - tool_calls_by_id=tool_calls_by_id, | |
| 460 | - ) | |
| 461 | - if file_fact: | |
| 462 | - facts.append(file_fact) | |
| 463 | - | |
| 464 | 430 | return facts |
| 465 | 431 | |
| 466 | 432 | |
@@ -529,162 +495,6 @@ def _summarize_html_mappings(payload: str) -> str | None: | ||
| 529 | 495 | return f"Filename mappings confirmed: {preview}" |
| 530 | 496 | |
| 531 | 497 | |
| 532 | -def _summarize_html_title_discovery( | |
| 533 | - messages: list[Message], | |
| 534 | - *, | |
| 535 | - max_pairs: int = 4, | |
| 536 | - tool_calls_by_id: dict[str, ToolCall] | None = None, | |
| 537 | -) -> str | None: | |
| 538 | - if tool_calls_by_id is None: | |
| 539 | - tool_calls_by_id = { | |
| 540 | - tool_call.id: tool_call | |
| 541 | - for message in messages | |
| 542 | - for tool_call in message.tool_calls | |
| 543 | - } | |
| 544 | - | |
| 545 | - confirmed_pairs: list[str] = [] | |
| 546 | - for message in messages: | |
| 547 | - if message.role != Role.TOOL or _is_compacted_context_message(message.content): | |
| 548 | - continue | |
| 549 | - if any(result.is_error for result in message.tool_results): | |
| 550 | - continue | |
| 551 | - | |
| 552 | - tool_call = next( | |
| 553 | - ( | |
| 554 | - tool_calls_by_id.get(result.tool_call_id) | |
| 555 | - for result in message.tool_results | |
| 556 | - if result.tool_call_id in tool_calls_by_id | |
| 557 | - ), | |
| 558 | - None, | |
| 559 | - ) | |
| 560 | - if tool_call is None or tool_call.name != "read": | |
| 561 | - continue | |
| 562 | - | |
| 563 | - raw_path = tool_call.arguments.get("file_path") | |
| 564 | - if not isinstance(raw_path, str): | |
| 565 | - continue | |
| 566 | - normalized_path = _normalize_path_candidate(raw_path) or raw_path | |
| 567 | - if html_toc_rule.is_html_toc_index_path(normalized_path) or "/chapters/" not in normalized_path: | |
| 568 | - continue | |
| 569 | - | |
| 570 | - payload = "\n".join( | |
| 571 | - result.content.strip() | |
| 572 | - for result in message.tool_results | |
| 573 | - if result.content.strip() | |
| 574 | - ) or message.content | |
| 575 | - title = html_toc_rule.extract_html_title_from_text(payload) | |
| 576 | - if not title: | |
| 577 | - continue | |
| 578 | - | |
| 579 | - pair = f"{Path(normalized_path).name} = {title}" | |
| 580 | - if pair not in confirmed_pairs: | |
| 581 | - confirmed_pairs.append(pair) | |
| 582 | - | |
| 583 | - if not confirmed_pairs: | |
| 584 | - return None | |
| 585 | - | |
| 586 | - preview = ", ".join(confirmed_pairs[:max_pairs]) | |
| 587 | - if len(confirmed_pairs) > max_pairs: | |
| 588 | - preview += ", ..." | |
| 589 | - return f"Chapter titles confirmed: {preview}" | |
| 590 | - | |
| 591 | - | |
| 592 | -def _collect_html_file_discovery_fact( | |
| 593 | - messages: list[Message], | |
| 594 | - *, | |
| 595 | - tool_calls_by_id: dict[str, ToolCall], | |
| 596 | -) -> str | None: | |
| 597 | - filenames: list[str] = [] | |
| 598 | - for message in messages: | |
| 599 | - if message.role != Role.TOOL or _is_compacted_context_message(message.content): | |
| 600 | - continue | |
| 601 | - if any(result.is_error for result in message.tool_results): | |
| 602 | - continue | |
| 603 | - | |
| 604 | - tool_name = _resolve_tool_name( | |
| 605 | - message, | |
| 606 | - tool_calls_by_id=tool_calls_by_id, | |
| 607 | - ) | |
| 608 | - if tool_name not in {"glob", "bash"}: | |
| 609 | - continue | |
| 610 | - | |
| 611 | - payload = "\n".join( | |
| 612 | - result.content.strip() | |
| 613 | - for result in message.tool_results | |
| 614 | - if result.content.strip() | |
| 615 | - ) or message.content | |
| 616 | - matches = re.findall(r"([A-Za-z0-9_.-]+\.html)", payload) | |
| 617 | - for name in matches: | |
| 618 | - if name not in filenames: | |
| 619 | - filenames.append(name) | |
| 620 | - | |
| 621 | - if len(filenames) < 3: | |
| 622 | - return None | |
| 623 | - | |
| 624 | - preview = ", ".join(filenames[:6]) | |
| 625 | - if len(filenames) > 6: | |
| 626 | - preview += ", ..." | |
| 627 | - return f"Existing files include {preview}" | |
| 628 | - | |
| 629 | - | |
| 630 | -def _collect_html_verification_gap_fact( | |
| 631 | - messages: list[Message], | |
| 632 | - *, | |
| 633 | - tool_calls_by_id: dict[str, ToolCall], | |
| 634 | -) -> str | None: | |
| 635 | - gap = _summarize_latest_html_verification_gap( | |
| 636 | - messages, | |
| 637 | - tool_calls_by_id=tool_calls_by_id, | |
| 638 | - ) | |
| 639 | - if not gap: | |
| 640 | - return None | |
| 641 | - return f"Verification gaps: {gap}" | |
| 642 | - | |
| 643 | - | |
| 644 | -def _summarize_latest_html_verification_gap( | |
| 645 | - messages: list[Message], | |
| 646 | - *, | |
| 647 | - max_items: int = 2, | |
| 648 | - tool_calls_by_id: dict[str, ToolCall] | None = None, | |
| 649 | -) -> str | None: | |
| 650 | - if tool_calls_by_id is None: | |
| 651 | - tool_calls_by_id = { | |
| 652 | - tool_call.id: tool_call | |
| 653 | - for message in messages | |
| 654 | - for tool_call in message.tool_calls | |
| 655 | - } | |
| 656 | - | |
| 657 | - for message in reversed(messages): | |
| 658 | - if message.role != Role.TOOL or _is_compacted_context_message(message.content): | |
| 659 | - continue | |
| 660 | - if not any(result.is_error for result in message.tool_results): | |
| 661 | - continue | |
| 662 | - tool_name = _resolve_tool_name( | |
| 663 | - message, | |
| 664 | - tool_calls_by_id=tool_calls_by_id, | |
| 665 | - ) | |
| 666 | - if tool_name != "bash": | |
| 667 | - continue | |
| 668 | - | |
| 669 | - payload = "\n".join( | |
| 670 | - result.content.strip() | |
| 671 | - for result in message.tool_results | |
| 672 | - if result.content.strip() | |
| 673 | - ) or message.content | |
| 674 | - gap = html_toc_rule.summarize_html_toc_verification_gap( | |
| 675 | - payload, | |
| 676 | - max_items=max_items, | |
| 677 | - ) | |
| 678 | - if gap: | |
| 679 | - return gap | |
| 680 | - | |
| 681 | - return None | |
| 682 | - | |
| 683 | - | |
| 684 | -def _summarize_html_file_discovery(payload: str) -> str | None: | |
| 685 | - return html_toc_rule.summarize_html_file_discovery(payload) | |
| 686 | - | |
| 687 | - | |
| 688 | 498 | def _resolve_tool_name( |
| 689 | 499 | message: Message, |
| 690 | 500 | *, |
@@ -710,9 +520,6 @@ def _choose_target_path( | ||
| 710 | 520 | if focus_path: |
| 711 | 521 | normalized_focus = _normalize_path_candidate(focus_path) |
| 712 | 522 | if normalized_focus: |
| 713 | - resolved_focus = html_toc_rule.resolve_html_toc_index_path(normalized_focus) | |
| 714 | - if resolved_focus is not None: | |
| 715 | - return str(resolved_focus) | |
| 716 | 523 | return normalized_focus |
| 717 | 524 | |
| 718 | 525 | candidates: Counter[str] = Counter() |
@@ -727,9 +534,9 @@ def _choose_target_path( | ||
| 727 | 534 | if not normalized: |
| 728 | 535 | continue |
| 729 | 536 | path_name = Path(normalized).name |
| 730 | - if html_toc_rule.is_html_toc_index_path(normalized): | |
| 537 | + if path_name == "index.html": | |
| 731 | 538 | candidates[normalized] += 10 |
| 732 | - elif path_name.endswith(".html") and "/chapters/" not in normalized: | |
| 539 | + elif "." in path_name: | |
| 733 | 540 | candidates[normalized] += 4 |
| 734 | 541 | |
| 735 | 542 | if candidates: |
@@ -738,9 +545,6 @@ def _choose_target_path( | ||
| 738 | 545 | if not current_task: |
| 739 | 546 | return None |
| 740 | 547 | current_task_paths = extract_key_files([Message(role=Role.USER, content=current_task)], limit=3) |
| 741 | - for path in current_task_paths: | |
| 742 | - if html_toc_rule.is_html_toc_index_path(path): | |
| 743 | - return path | |
| 744 | 548 | return current_task_paths[0] if current_task_paths else None |
| 745 | 549 | |
| 746 | 550 | |
@@ -770,14 +574,7 @@ def _focus_path_anchors(focus_path: str) -> tuple[str, ...]: | ||
| 770 | 574 | ) |
| 771 | 575 | focus = Path(normalized_focus).expanduser() |
| 772 | 576 | anchors = {str(focus)} |
| 773 | - | |
| 774 | - resolved_index = html_toc_rule.resolve_html_toc_index_path(focus) | |
| 775 | - if resolved_index is not None: | |
| 776 | - anchors.add(str(resolved_index)) | |
| 777 | - anchors.add(str(resolved_index.parent)) | |
| 778 | - anchors.add(str(resolved_index.parent / "chapters")) | |
| 779 | - else: | |
| 780 | - anchors.add(str(focus.parent)) | |
| 577 | + anchors.add(str(focus.parent)) | |
| 781 | 578 | |
| 782 | 579 | return tuple(anchor for anchor in anchors if anchor) |
| 783 | 580 | |
src/loader/runtime/dod.py (modified)
@@ -12,7 +12,6 @@ from typing import Any, Literal
| 12 | 12 | |
| 13 | 13 | from ..llm.base import ToolCall |
| 14 | 14 | from ..tools.shell_tools import BashTool |
| 15 | -from .semantic_rules import html_toc as html_toc_rule | |
| 16 | 15 | from .verification_observations import VerificationAttempt, verification_attempt_id |
| 17 | 16 | |
| 18 | 17 | TaskSize = Literal["small", "standard", "large"] |
@@ -20,6 +19,38 @@ DoDStatus = Literal["draft", "in_progress", "verifying", "fixing", "done", "fail | ||
| 20 | 19 | VerificationConfidence = Literal["high", "medium", "low"] |
| 21 | 20 | VerificationKind = Literal["test", "typecheck", "lint", "build", "smoke", "runtime", "manual"] |
| 22 | 21 | |
| 22 | +_DIRECTORY_CONTENT_HINTS = ( | |
| 23 | + "file", | |
| 24 | + "files", | |
| 25 | + "chapter", | |
| 26 | + "chapters", | |
| 27 | + "page", | |
| 28 | + "pages", | |
| 29 | + "test", | |
| 30 | + "tests", | |
| 31 | + "artifact", | |
| 32 | + "artifacts", | |
| 33 | + "document", | |
| 34 | + "documents", | |
| 35 | + "content", | |
| 36 | + "entry", | |
| 37 | + "entries", | |
| 38 | +) | |
| 39 | +_DIRECTORY_MUTATION_HINTS = ( | |
| 40 | + "create", | |
| 41 | + "creating", | |
| 42 | + "generate", | |
| 43 | + "generating", | |
| 44 | + "write", | |
| 45 | + "writing", | |
| 46 | + "add", | |
| 47 | + "adding", | |
| 48 | + "build", | |
| 49 | + "building", | |
| 50 | + "populate", | |
| 51 | + "populating", | |
| 52 | +) | |
| 53 | + | |
| 23 | 54 | |
| 24 | 55 | @dataclass |
| 25 | 56 | class VerificationEvidence: |
@@ -213,10 +244,13 @@ def derive_verification_commands( | ||
| 213 | 244 | """Generate verification commands from execution history and project shape.""" |
| 214 | 245 | |
| 215 | 246 | commands: list[str] = [] |
| 216 | - semantic_command = _derive_html_toc_verification_command( | |
| 247 | + html_link_command = _derive_local_html_link_verification_command( | |
| 248 | + dod, | |
| 249 | + project_root=project_root, | |
| 250 | + ) | |
| 251 | + planned_artifact_targets = collect_planned_artifact_targets( | |
| 217 | 252 | dod, |
| 218 | 253 | project_root=project_root, |
| 219 | - task_statement=task_statement, | |
| 220 | 254 | ) |
| 221 | 255 | |
| 222 | 256 | explicit = [cmd for cmd in dod.successful_commands if _is_verification_command(cmd)] |
@@ -230,8 +264,10 @@ def derive_verification_commands( | ||
| 230 | 264 | if path.suffix == ".py": |
| 231 | 265 | _append_unique(commands, f"python {shlex.quote(path.name)}") |
| 232 | 266 | |
| 233 | - if semantic_command: | |
| 234 | - _append_unique(commands, semantic_command) | |
| 267 | + if html_link_command: | |
| 268 | + _append_unique(commands, html_link_command) | |
| 269 | + for command in _build_planned_artifact_verification_commands(planned_artifact_targets): | |
| 270 | + _append_unique(commands, command) | |
| 235 | 271 | |
| 236 | 272 | if commands: |
| 237 | 273 | return commands |
@@ -512,30 +548,490 @@ def _extract_files_from_bash(command: str) -> list[str]: | ||
| 512 | 548 | return [] |
| 513 | 549 | |
| 514 | 550 | |
| 515 | -def _derive_html_toc_verification_command( | |
| 551 | +def _derive_local_html_link_verification_command( | |
| 516 | 552 | dod: DefinitionOfDone, |
| 517 | 553 | *, |
| 518 | 554 | project_root: Path, |
| 519 | - task_statement: str, | |
| 520 | 555 | ) -> str | None: |
| 521 | - task_hints = " ".join([task_statement, *dod.acceptance_criteria]).lower() | |
| 522 | - if not html_toc_rule.task_targets_html_toc(task_hints): | |
| 523 | - return None | |
| 524 | - | |
| 556 | + html_paths: list[Path] = [] | |
| 525 | 557 | for path_str in dod.touched_files: |
| 526 | 558 | path = Path(path_str) |
| 527 | 559 | effective_path = path if path.is_absolute() else (project_root / path) |
| 528 | - command = html_toc_rule.build_html_toc_verification_command(effective_path) | |
| 529 | - if command: | |
| 530 | - return command | |
| 560 | + if effective_path.suffix.lower() != ".html" or not effective_path.exists(): | |
| 561 | + continue | |
| 562 | + html_paths.append(effective_path) | |
| 563 | + | |
| 564 | + unique_paths = list(dict.fromkeys(str(path) for path in html_paths)) | |
| 565 | + resolved_paths = [Path(path) for path in unique_paths] | |
| 566 | + if not resolved_paths: | |
| 567 | + return None | |
| 568 | + if not any(_html_file_contains_local_links(path) for path in resolved_paths): | |
| 569 | + return None | |
| 570 | + return _build_local_html_link_verification_command(resolved_paths) | |
| 571 | + | |
| 572 | + | |
| 573 | +def collect_planned_artifact_targets( | |
| 574 | + dod: DefinitionOfDone, | |
| 575 | + *, | |
| 576 | + project_root: Path, | |
| 577 | + max_paths: int | None = None, | |
| 578 | +) -> list[tuple[Path, bool]]: | |
| 579 | + if not dod.implementation_plan: | |
| 580 | + return [] | |
| 581 | + | |
| 582 | + plan_path = Path(dod.implementation_plan) | |
| 583 | + if not plan_path.exists(): | |
| 584 | + return [] | |
| 585 | + | |
| 586 | + markdown = plan_path.read_text() | |
| 587 | + file_change_lines = _extract_markdown_section_lines(markdown, "File Changes") | |
| 588 | + candidates = _extract_planned_path_literals(file_change_lines or markdown.splitlines()) | |
| 589 | + if not candidates: | |
| 590 | + confirmed_progress_lines = _extract_markdown_section_lines( | |
| 591 | + markdown, | |
| 592 | + "Confirmed Progress", | |
| 593 | + ) | |
| 594 | + candidates = _extract_planned_path_literals(confirmed_progress_lines) | |
| 595 | + targets: list[tuple[Path, bool]] = [] | |
| 596 | + seen: set[tuple[str, bool]] = set() | |
| 597 | + | |
| 598 | + selected_candidates = candidates if max_paths is None else candidates[:max_paths] | |
| 599 | + for raw_path in selected_candidates: | |
| 600 | + effective_path = _resolve_planned_artifact_path(raw_path, project_root=project_root) | |
| 601 | + if effective_path is None: | |
| 602 | + continue | |
| 603 | + expect_directory = raw_path.endswith("/") | |
| 604 | + if not expect_directory and not effective_path.suffix: | |
| 605 | + continue | |
| 606 | + key = (str(effective_path), expect_directory) | |
| 607 | + if key in seen: | |
| 608 | + continue | |
| 609 | + seen.add(key) | |
| 610 | + targets.append((effective_path, expect_directory)) | |
| 611 | + return targets | |
| 612 | + | |
| 613 | + | |
| 614 | +def all_planned_artifacts_exist( | |
| 615 | + dod: DefinitionOfDone, | |
| 616 | + *, | |
| 617 | + project_root: Path, | |
| 618 | + max_paths: int | None = None, | |
| 619 | +) -> bool: | |
| 620 | + targets = collect_planned_artifact_targets( | |
| 621 | + dod, | |
| 622 | + project_root=project_root, | |
| 623 | + max_paths=max_paths, | |
| 624 | + ) | |
| 625 | + if not targets: | |
| 626 | + return False | |
| 627 | + if not all( | |
| 628 | + planned_artifact_target_satisfied( | |
| 629 | + dod, | |
| 630 | + target=target, | |
| 631 | + expect_directory=expect_directory, | |
| 632 | + project_root=project_root, | |
| 633 | + ) | |
| 634 | + for target, expect_directory in targets | |
| 635 | + ): | |
| 636 | + return False | |
| 637 | + return not _planned_html_outputs_have_missing_local_links( | |
| 638 | + dod, | |
| 639 | + project_root=project_root, | |
| 640 | + targets=targets, | |
| 641 | + ) | |
| 642 | + | |
| 643 | + | |
| 644 | +def planned_artifact_target_satisfied( | |
| 645 | + dod: DefinitionOfDone, | |
| 646 | + *, | |
| 647 | + target: Path, | |
| 648 | + expect_directory: bool, | |
| 649 | + project_root: Path, | |
| 650 | +) -> bool: | |
| 651 | + """Return whether one planned file or directory target is substantively satisfied.""" | |
| 652 | + | |
| 653 | + if not expect_directory: | |
| 654 | + return target.is_file() | |
| 655 | + if not target.is_dir(): | |
| 656 | + return False | |
| 657 | + if not planned_directory_requires_generated_files( | |
| 658 | + dod, | |
| 659 | + target=target, | |
| 660 | + project_root=project_root, | |
| 661 | + ): | |
| 662 | + return True | |
| 663 | + return _directory_contains_files(target) | |
| 664 | + | |
| 665 | + | |
| 666 | +def infer_next_declared_html_output_file( | |
| 667 | + *, | |
| 668 | + target: Path, | |
| 669 | + project_root: Path, | |
| 670 | +) -> Path | None: | |
| 671 | + """Return the first missing HTML file already declared within an output directory.""" | |
| 672 | + | |
| 673 | + missing_targets = collect_missing_declared_html_output_files( | |
| 674 | + target=target, | |
| 675 | + project_root=project_root, | |
| 676 | + ) | |
| 677 | + return missing_targets[0] if missing_targets else None | |
| 678 | + | |
| 679 | + | |
| 680 | +def collect_missing_declared_html_output_files( | |
| 681 | + *, | |
| 682 | + target: Path, | |
| 683 | + project_root: Path, | |
| 684 | +) -> tuple[Path, ...]: | |
| 685 | + """Return missing HTML outputs already declared within the current artifact graph.""" | |
| 686 | + | |
| 687 | + normalized_target = target.resolve(strict=False) | |
| 688 | + artifact_root = _resolve_declared_html_artifact_root( | |
| 689 | + normalized_target, | |
| 690 | + project_root=project_root.resolve(strict=False), | |
| 691 | + ) | |
| 692 | + if artifact_root is None: | |
| 693 | + return () | |
| 694 | + | |
| 695 | + html_files = [path for path in sorted(artifact_root.rglob("*.html")) if path.is_file()] | |
| 696 | + if not html_files: | |
| 697 | + return () | |
| 698 | + | |
| 699 | + missing_targets: list[Path] = [] | |
| 700 | + seen: set[str] = set() | |
| 701 | + for html_file in html_files: | |
| 702 | + try: | |
| 703 | + content = html_file.read_text() | |
| 704 | + except OSError: | |
| 705 | + continue | |
| 706 | + for resolved_target in _iter_local_html_targets(html_file, content): | |
| 707 | + if resolved_target.exists(): | |
| 708 | + continue | |
| 709 | + if resolved_target.suffix.lower() not in {".html", ".htm"}: | |
| 710 | + continue | |
| 711 | + try: | |
| 712 | + resolved_target.relative_to(artifact_root) | |
| 713 | + resolved_target.relative_to(normalized_target) | |
| 714 | + except ValueError: | |
| 715 | + continue | |
| 716 | + key = str(resolved_target) | |
| 717 | + if key in seen: | |
| 718 | + continue | |
| 719 | + seen.add(key) | |
| 720 | + missing_targets.append(resolved_target) | |
| 721 | + return tuple(missing_targets) | |
| 722 | + | |
| 723 | + | |
| 724 | +def _build_planned_artifact_verification_commands( | |
| 725 | + targets: list[tuple[Path, bool]], | |
| 726 | +) -> list[str]: | |
| 727 | + commands: list[str] = [] | |
| 728 | + for effective_path, expect_directory in targets: | |
| 729 | + command = ( | |
| 730 | + f"test -d {shlex.quote(str(effective_path))}" | |
| 731 | + if expect_directory | |
| 732 | + else f"test -f {shlex.quote(str(effective_path))}" | |
| 733 | + ) | |
| 734 | + _append_unique(commands, command) | |
| 735 | + return commands | |
| 736 | + | |
| 737 | + | |
| 738 | +def _extract_markdown_section_lines(markdown: str, heading: str) -> list[str]: | |
| 739 | + current_heading: str | None = None | |
| 740 | + collected: list[str] = [] | |
| 741 | + for line in markdown.splitlines(): | |
| 742 | + stripped = line.strip() | |
| 743 | + if stripped.startswith("## "): | |
| 744 | + current_heading = stripped[3:].strip().lower() | |
| 745 | + continue | |
| 746 | + if current_heading == heading.lower(): | |
| 747 | + collected.append(line) | |
| 748 | + return collected | |
| 749 | + | |
| 750 | + | |
| 751 | +def _extract_planned_path_literals(lines: list[str]) -> list[str]: | |
| 752 | + paths: list[str] = [] | |
| 753 | + seen: set[str] = set() | |
| 754 | + | |
| 755 | + for line in lines: | |
| 756 | + candidates = re.findall(r"`([^`]+)`", line) | |
| 757 | + if not candidates: | |
| 758 | + stripped = line.strip() | |
| 759 | + stripped = re.sub(r"^[-*+]\s+", "", stripped) | |
| 760 | + stripped = re.sub(r"^\d+[.)]\s+", "", stripped) | |
| 761 | + stripped = stripped.strip("`'\",.:;()[]{}") | |
| 762 | + candidates = [stripped] if _looks_like_path_literal(stripped) else [] | |
| 763 | + for candidate in candidates: | |
| 764 | + normalized = candidate.strip("`'\",.:;()[]{}") | |
| 765 | + if not _looks_like_path_literal(normalized) or normalized in seen: | |
| 766 | + continue | |
| 767 | + seen.add(normalized) | |
| 768 | + paths.append(normalized) | |
| 769 | + return paths | |
| 770 | + | |
| 771 | + | |
| 772 | +def _resolve_declared_html_artifact_root( | |
| 773 | + target: Path, | |
| 774 | + *, | |
| 775 | + project_root: Path, | |
| 776 | +) -> Path | None: | |
| 777 | + for candidate in [target, *target.parents]: | |
| 778 | + if (candidate / "index.html").is_file(): | |
| 779 | + return candidate | |
| 780 | + if candidate == project_root or candidate == candidate.parent: | |
| 781 | + break | |
| 782 | + | |
| 783 | + fallback = target if target.exists() else target.parent | |
| 784 | + if fallback.exists(): | |
| 785 | + return fallback | |
| 531 | 786 | return None |
| 532 | 787 | |
| 533 | 788 | |
| 534 | -def _build_html_toc_verification_command(index_path: Path) -> str: | |
| 535 | - command = html_toc_rule.build_html_toc_verification_command(index_path) | |
| 536 | - if command is None: | |
| 537 | - raise ValueError(f"{index_path} is not a valid HTML TOC target") | |
| 538 | - return command | |
| 789 | +def _iter_local_html_targets(file_path: Path, content: str) -> list[Path]: | |
| 790 | + pattern = re.compile(r'href\s*=\s*["\']([^"\']+)["\']', re.IGNORECASE) | |
| 791 | + targets: list[Path] = [] | |
| 792 | + seen: set[str] = set() | |
| 793 | + for href in pattern.findall(content): | |
| 794 | + candidate = href.strip() | |
| 795 | + if not _is_local_html_link_target(candidate): | |
| 796 | + continue | |
| 797 | + resolved = (file_path.parent / candidate).resolve(strict=False) | |
| 798 | + key = str(resolved) | |
| 799 | + if key in seen: | |
| 800 | + continue | |
| 801 | + seen.add(key) | |
| 802 | + targets.append(resolved) | |
| 803 | + return targets | |
| 804 | + | |
| 805 | + | |
| 806 | +def _is_local_html_link_target(href: str) -> bool: | |
| 807 | + candidate = href.strip() | |
| 808 | + if not candidate or candidate.startswith(("#", "http://", "https://", "mailto:")): | |
| 809 | + return False | |
| 810 | + if "?" in candidate: | |
| 811 | + candidate = candidate.split("?", 1)[0] | |
| 812 | + if "#" in candidate: | |
| 813 | + candidate = candidate.split("#", 1)[0] | |
| 814 | + return Path(candidate).suffix.lower() in {".html", ".htm"} | |
| 815 | + | |
| 816 | + | |
| 817 | +def _looks_like_path_literal(value: str) -> bool: | |
| 818 | + if not value or " " in value: | |
| 819 | + return False | |
| 820 | + if value.startswith(("http://", "https://")): | |
| 821 | + return False | |
| 822 | + return ( | |
| 823 | + value.startswith(("~/", "./", "../", "/")) | |
| 824 | + or "/" in value | |
| 825 | + or value.endswith("/") | |
| 826 | + ) | |
| 827 | + | |
| 828 | + | |
| 829 | +def _resolve_planned_artifact_path( | |
| 830 | + raw_path: str, | |
| 831 | + *, | |
| 832 | + project_root: Path, | |
| 833 | +) -> Path | None: | |
| 834 | + text = raw_path.strip() | |
| 835 | + if not text: | |
| 836 | + return None | |
| 837 | + path = Path(text).expanduser() | |
| 838 | + if path.is_absolute(): | |
| 839 | + return path | |
| 840 | + return project_root / path | |
| 841 | + | |
| 842 | + | |
| 843 | +def planned_directory_requires_generated_files( | |
| 844 | + dod: DefinitionOfDone, | |
| 845 | + *, | |
| 846 | + target: Path, | |
| 847 | + project_root: Path, | |
| 848 | +) -> bool: | |
| 849 | + """Return whether a planned directory is expected to contain generated files.""" | |
| 850 | + | |
| 851 | + plan_path = Path(dod.implementation_plan) if dod.implementation_plan else None | |
| 852 | + if plan_path is not None and plan_path.exists(): | |
| 853 | + markdown = plan_path.read_text() | |
| 854 | + file_change_lines = _extract_markdown_section_lines(markdown, "File Changes") | |
| 855 | + if any( | |
| 856 | + _line_describes_directory_contents(line, target=target, project_root=project_root) | |
| 857 | + for line in file_change_lines | |
| 858 | + ): | |
| 859 | + return True | |
| 860 | + | |
| 861 | + execution_lines = _extract_markdown_section_lines(markdown, "Execution Order") | |
| 862 | + if any( | |
| 863 | + _line_mentions_directory_generation(line, target=target) | |
| 864 | + for line in execution_lines | |
| 865 | + ): | |
| 866 | + return True | |
| 867 | + | |
| 868 | + todo_lines = [*dod.pending_items, *dod.completed_items] | |
| 869 | + return any( | |
| 870 | + _line_mentions_directory_generation(line, target=target) | |
| 871 | + for line in todo_lines | |
| 872 | + ) | |
| 873 | + | |
| 874 | + | |
| 875 | +def _line_describes_directory_contents( | |
| 876 | + line: str, | |
| 877 | + *, | |
| 878 | + target: Path, | |
| 879 | + project_root: Path, | |
| 880 | +) -> bool: | |
| 881 | + lowered = line.lower() | |
| 882 | + if not any(hint in lowered for hint in _DIRECTORY_CONTENT_HINTS): | |
| 883 | + return False | |
| 884 | + | |
| 885 | + target_text = str(target) | |
| 886 | + relative_target = str(target.relative_to(project_root)) if target.is_relative_to(project_root) else "" | |
| 887 | + if target_text in line or relative_target and relative_target in line: | |
| 888 | + return True | |
| 889 | + return _line_mentions_directory_generation(line, target=target) | |
| 890 | + | |
| 891 | + | |
| 892 | +def _line_mentions_directory_generation(line: str, *, target: Path) -> bool: | |
| 893 | + lowered = line.lower() | |
| 894 | + if not any(hint in lowered for hint in _DIRECTORY_CONTENT_HINTS): | |
| 895 | + return False | |
| 896 | + if not any(hint in lowered for hint in _DIRECTORY_MUTATION_HINTS) and "directory for" not in lowered: | |
| 897 | + return False | |
| 898 | + directory_tokens = _directory_tokens(target) | |
| 899 | + return any(token in lowered for token in directory_tokens) | |
| 900 | + | |
| 901 | + | |
| 902 | +def _directory_tokens(target: Path) -> set[str]: | |
| 903 | + tokens: set[str] = set() | |
| 904 | + for raw_token in re.split(r"[^a-z0-9]+", target.name.lower()): | |
| 905 | + token = raw_token.strip() | |
| 906 | + if len(token) < 2: | |
| 907 | + continue | |
| 908 | + tokens.add(token) | |
| 909 | + if token.endswith("ies") and len(token) > 3: | |
| 910 | + tokens.add(f"{token[:-3]}y") | |
| 911 | + elif token.endswith("s") and len(token) > 3: | |
| 912 | + tokens.add(token[:-1]) | |
| 913 | + return tokens | |
| 914 | + | |
| 915 | + | |
| 916 | +def _directory_contains_files(target: Path) -> bool: | |
| 917 | + try: | |
| 918 | + return any(child.is_file() for child in target.rglob("*")) | |
| 919 | + except OSError: | |
| 920 | + return False | |
| 921 | + | |
| 922 | + | |
| 923 | +def _html_file_contains_local_links(path: Path) -> bool: | |
| 924 | + pattern = re.compile(r'href\s*=\s*["\']([^"\']+)["\']', re.IGNORECASE) | |
| 925 | + try: | |
| 926 | + text = path.read_text() | |
| 927 | + except OSError: | |
| 928 | + return False | |
| 929 | + return any(_is_local_html_link_target(href) for href in pattern.findall(text)) | |
| 930 | + | |
| 931 | + | |
def _planned_html_outputs_have_missing_local_links(
    dod: DefinitionOfDone,
    *,
    project_root: Path,
    targets: list[tuple[Path, bool]],
) -> bool:
    """Return True when any touched or planned HTML file has a broken local link.

    Candidate files come from two places: HTML paths recorded in
    ``dod.touched_files`` (resolved against *project_root* when relative) and
    planned file targets (directory targets are skipped).  Only files that
    exist on disk are scanned, and each file is checked at most once.
    """
    candidates: list[Path] = []
    for raw_path in dod.touched_files:
        touched = Path(raw_path)
        if not touched.is_absolute():
            touched = project_root / touched
        if touched.suffix.lower() == ".html" and touched.exists():
            candidates.append(touched)

    for target, expect_directory in targets:
        if expect_directory:
            continue
        if target.suffix.lower() == ".html" and target.exists():
            candidates.append(target)

    checked: set[str] = set()
    for candidate in candidates:
        key = str(candidate)
        if key in checked:
            continue
        checked.add(key)
        if _html_file_has_missing_local_links(candidate):
            return True
    return False
| 960 | + | |
| 961 | + | |
def _html_file_has_missing_local_links(path: Path) -> bool:
    """Return True when *path* contains a local href whose target does not exist.

    Fragment (``#...``) and query (``?...``) suffixes are stripped before the
    href is resolved against the HTML file's directory.  Unreadable files
    report no missing links.
    """
    href_re = re.compile(r'href\s*=\s*["\']([^"\']+)["\']', re.IGNORECASE)
    try:
        markup = path.read_text()
    except OSError:
        return False
    for raw_href in href_re.findall(markup):
        href = raw_href.strip()
        if not _is_local_html_link_target(href):
            continue
        relative = href.split("#", 1)[0].split("?", 1)[0].strip()
        if not relative:
            continue
        resolved = (path.parent / relative).resolve()
        if not resolved.exists():
            return True
    return False
| 978 | + | |
| 979 | + | |
| 980 | +def _is_local_html_link_target(href: str) -> bool: | |
| 981 | + target = href.strip() | |
| 982 | + if not target: | |
| 983 | + return False | |
| 984 | + if target.startswith(("#", "mailto:", "tel:", "javascript:")): | |
| 985 | + return False | |
| 986 | + if "://" in target: | |
| 987 | + return False | |
| 988 | + target = target.split("#", 1)[0].split("?", 1)[0].strip() | |
| 989 | + return bool(target) | |
| 990 | + | |
| 991 | + | |
def _build_local_html_link_verification_command(paths: list[Path]) -> str:
    """Build a self-contained ``python3`` heredoc command that checks local hrefs.

    The emitted script scans each existing HTML file in *paths*, skips
    fragment/mailto/tel/javascript hrefs and absolute URLs, resolves every
    remaining href against its HTML file's directory, and on any miss prints a
    "Missing local HTML links:" report and exits non-zero.  That exact report
    header and the ``file:href -> resolved`` line shape are parsed back by
    `_extract_missing_local_html_links`, so they must not change.
    NOTE(review): ``data:`` URIs are not filtered here — presumably the same
    gap as the host-side filter; confirm before tightening either one.
    """
    # repr() keeps the embedded list literal valid even when paths contain
    # quotes or spaces.
    serialized_paths = ", ".join(repr(str(path)) for path in paths)
    return "\n".join(
        [
            "python3 - <<'PY'",
            "from pathlib import Path",
            "import re",
            "",
            f"paths = [{serialized_paths}]",
            # Raw-string + adjacent-literal split keeps the quoting of the
            # embedded regex readable at this nesting depth.
            (
                r"pattern = re.compile(r'href\s*=\s*[\"\\\']([^\"\\\']+)[\"\\\']', "
                "re.IGNORECASE)"
            ),
            "checked = 0",
            "missing = []",
            "for raw_path in paths:",
            "    html_path = Path(raw_path)",
            "    if not html_path.exists():",
            "        continue",
            "    text = html_path.read_text()",
            "    for href in pattern.findall(text):",
            "        target = href.strip()",
            "        if not target:",
            "            continue",
            "        if target.startswith((\"#\", \"mailto:\", \"tel:\", \"javascript:\")):",
            "            continue",
            "        if \"://\" in target:",
            "            continue",
            "        target = target.split(\"#\", 1)[0].split(\"?\", 1)[0].strip()",
            "        if not target:",
            "            continue",
            "        checked += 1",
            "        resolved = (html_path.parent / target).resolve()",
            "        if not resolved.exists():",
            "            missing.append(f\"{html_path}:{href} -> {resolved}\")",
            "if missing:",
            "    print(\"Missing local HTML links:\")",
            "    print(\"\\n\".join(missing))",
            "    raise SystemExit(1)",
            "print(f\"Checked {checked} local HTML links across {len(paths)} file(s).\")",
            "PY",
        ]
    )
| 539 | 1035 | |
| 540 | 1036 | |
| 541 | 1037 | def _first_non_empty_line(text: str) -> str: |
src/loader/runtime/explore.pymodified@@ -109,6 +109,7 @@ class ExploreRuntime: | ||
| 109 | 109 | registry=self.registry, |
| 110 | 110 | rollback_plan=None, |
| 111 | 111 | workspace_root=self.context.project_root, |
| 112 | + session=self.context.session, | |
| 112 | 113 | ), |
| 113 | 114 | ) |
| 114 | 115 | |
src/loader/runtime/finalization.pymodified@@ -14,6 +14,7 @@ from .dod import ( | ||
| 14 | 14 | DefinitionOfDoneStore, |
| 15 | 15 | VerificationEvidence, |
| 16 | 16 | build_verification_summary, |
| 17 | + collect_planned_artifact_targets, | |
| 17 | 18 | derive_verification_commands, |
| 18 | 19 | ensure_active_verification_attempt, |
| 19 | 20 | synthesize_todo_items, |
@@ -28,7 +29,6 @@ from .executor import ToolExecutor | ||
| 28 | 29 | from .logging import get_runtime_logger |
| 29 | 30 | from .memory import MemoryStore |
| 30 | 31 | from .policy_timeline import append_verification_timeline_entry |
| 31 | -from .semantic_rules import html_toc as html_toc_rule | |
| 32 | 32 | from .session import normalize_usage |
| 33 | 33 | from .tracing import RuntimeTracer |
| 34 | 34 | from .verification_observations import ( |
@@ -41,6 +41,7 @@ from .workflow import ( | ||
| 41 | 41 | WorkflowMode, |
| 42 | 42 | WorkflowTimelineEntry, |
| 43 | 43 | WorkflowTimelineEntryKind, |
| 44 | + effective_pending_todo_items, | |
| 44 | 45 | extract_verification_commands_from_markdown, |
| 45 | 46 | ) |
| 46 | 47 | |
@@ -98,13 +99,20 @@ class TurnFinalizer: | ||
| 98 | 99 | """Gate completion on DoD state and verification evidence.""" |
| 99 | 100 | |
| 100 | 101 | implementation_item = "Complete the requested work" |
| 101 | - if implementation_item in dod.pending_items: | |
| 102 | - dod.pending_items.remove(implementation_item) | |
| 103 | - dod.completed_items.append(implementation_item) | |
| 102 | + verification_item = "Collect verification evidence" | |
| 104 | 103 | |
| 105 | 104 | tracked_pending_items = [ |
| 106 | - item for item in dod.pending_items if item != "Collect verification evidence" | |
| 105 | + item | |
| 106 | + for item in effective_pending_todo_items( | |
| 107 | + dod, | |
| 108 | + project_root=self.context.project_root, | |
| 109 | + ) | |
| 110 | + if item not in {implementation_item, verification_item} | |
| 107 | 111 | ] |
| 112 | + missing_planned_artifacts = _missing_planned_artifact_labels( | |
| 113 | + dod, | |
| 114 | + project_root=self.context.project_root, | |
| 115 | + ) | |
| 108 | 116 | |
| 109 | 117 | mutating_paths = [path for path in dod.touched_files if path] |
| 110 | 118 | requires_verification = bool(mutating_paths or dod.mutating_actions) |
@@ -115,6 +123,60 @@ class TurnFinalizer: | ||
| 115 | 123 | reason=f"files={mutating_paths[:3]}, actions={len(dod.mutating_actions)}" |
| 116 | 124 | if requires_verification else None, |
| 117 | 125 | ) |
| 126 | + if missing_planned_artifacts: | |
| 127 | + recovery_nudge = _build_missing_artifact_recovery_nudge( | |
| 128 | + _first_missing_planned_artifact( | |
| 129 | + dod, | |
| 130 | + project_root=self.context.project_root, | |
| 131 | + ) | |
| 132 | + ) | |
| 133 | + if recovery_nudge: | |
| 134 | + self.context.queue_steering_message(recovery_nudge) | |
| 135 | + missing_provenance = [ | |
| 136 | + EvidenceProvenance( | |
| 137 | + category="tracked_work", | |
| 138 | + source="dod.implementation_plan", | |
| 139 | + summary=f"planned artifact still missing: {label}", | |
| 140 | + status=EvidenceProvenanceStatus.MISSING.value, | |
| 141 | + subject=label, | |
| 142 | + ) | |
| 143 | + for label in missing_planned_artifacts | |
| 144 | + ] | |
| 145 | + missing_text = "\n".join( | |
| 146 | + f"- {label}" for label in missing_planned_artifacts[:8] | |
| 147 | + ) | |
| 148 | + pending_text = "" | |
| 149 | + if tracked_pending_items: | |
| 150 | + pending_text = ( | |
| 151 | + "\nRemaining tracked work:\n" | |
| 152 | + + "\n".join(f"- {item}" for item in tracked_pending_items[:6]) | |
| 153 | + ) | |
| 154 | + self.dod_store.save(dod) | |
| 155 | + await self.emit_dod_status(emit, dod) | |
| 156 | + self.context.session.append( | |
| 157 | + Message( | |
| 158 | + role=Role.USER, | |
| 159 | + content=( | |
| 160 | + "[PLANNED ARTIFACTS STILL MISSING]\n" | |
| 161 | + "The explicit implementation plan is not complete yet. " | |
| 162 | + "Do not move to verification or final confirmation.\n\n" | |
| 163 | + "Missing planned artifacts:\n" | |
| 164 | + f"{missing_text}" | |
| 165 | + f"{pending_text}\n\n" | |
| 166 | + "Continue by creating or updating the missing planned artifacts." | |
| 167 | + ), | |
| 168 | + ) | |
| 169 | + ) | |
| 170 | + return CompletionGateResult( | |
| 171 | + should_continue=True, | |
| 172 | + reason_code="planned_artifacts_missing_continue", | |
| 173 | + reason_summary=( | |
| 174 | + "continued because explicitly planned artifacts were still missing " | |
| 175 | + "before verification" | |
| 176 | + ), | |
| 177 | + final_response="", | |
| 178 | + evidence_provenance=missing_provenance, | |
| 179 | + ) | |
| 118 | 180 | if tracked_pending_items and not requires_verification: |
| 119 | 181 | pending_provenance = [ |
| 120 | 182 | EvidenceProvenance( |
@@ -149,6 +211,10 @@ class TurnFinalizer: | ||
| 149 | 211 | ) |
| 150 | 212 | |
| 151 | 213 | if not requires_verification: |
| 214 | + if implementation_item in dod.pending_items: | |
| 215 | + dod.pending_items.remove(implementation_item) | |
| 216 | + if implementation_item not in dod.completed_items: | |
| 217 | + dod.completed_items.append(implementation_item) | |
| 152 | 218 | skip_provenance = [ |
| 153 | 219 | EvidenceProvenance( |
| 154 | 220 | category="verification", |
@@ -240,9 +306,15 @@ class TurnFinalizer: | ||
| 240 | 306 | f"Task: {dod.task_statement}\n" |
| 241 | 307 | "No new file changes were made since the last failed verification.\n\n" |
| 242 | 308 | f"{build_verification_summary(dod.evidence)}\n\n" |
| 243 | - f"{_build_verification_repair_guidance(dod)}\n\n" | |
| 309 | + f"{_build_verification_repair_guidance(dod, project_root=self.context.project_root)}\n\n" | |
| 244 | 310 | "Apply a concrete edit or patch before trying to finish again." |
| 245 | 311 | ) |
| 312 | + recovery_nudge = _build_verification_failure_recovery_nudge( | |
| 313 | + dod, | |
| 314 | + project_root=self.context.project_root, | |
| 315 | + ) | |
| 316 | + if recovery_nudge: | |
| 317 | + self.context.queue_steering_message(recovery_nudge) | |
| 246 | 318 | self.context.session.append(Message(role=Role.USER, content=repair_prompt)) |
| 247 | 319 | return CompletionGateResult( |
| 248 | 320 | should_continue=True, |
@@ -407,6 +479,12 @@ class TurnFinalizer: | ||
| 407 | 479 | dod.confidence = "medium" |
| 408 | 480 | self.dod_store.save(dod) |
| 409 | 481 | await self.emit_dod_status(emit, dod) |
| 482 | + recovery_nudge = _build_verification_failure_recovery_nudge( | |
| 483 | + dod, | |
| 484 | + project_root=self.context.project_root, | |
| 485 | + ) | |
| 486 | + if recovery_nudge: | |
| 487 | + self.context.queue_steering_message(recovery_nudge) | |
| 410 | 488 | await self.set_workflow_mode( |
| 411 | 489 | ModeDecision.transition( |
| 412 | 490 | WorkflowMode.EXECUTE, |
@@ -424,7 +502,7 @@ class TurnFinalizer: | ||
| 424 | 502 | f"Attempt: {dod.retry_count}/{dod.retry_budget}\n" |
| 425 | 503 | f"Pending items: {', '.join(dod.pending_items)}\n\n" |
| 426 | 504 | f"{build_verification_summary(dod.evidence)}\n\n" |
| 427 | - f"{_build_verification_repair_guidance(dod)}\n\n" | |
| 505 | + f"{_build_verification_repair_guidance(dod, project_root=self.context.project_root)}\n\n" | |
| 428 | 506 | "Fix the failures above, then finish the task again." |
| 429 | 507 | ) |
| 430 | 508 | self.context.session.append(Message(role=Role.USER, content=failure_prompt)) |
@@ -710,6 +788,72 @@ def _verification_result_provenance( | ||
| 710 | 788 | return entries |
| 711 | 789 | |
| 712 | 790 | |
def _missing_planned_artifact_labels(
    dod: DefinitionOfDone,
    *,
    project_root: Path,
) -> list[str]:
    """Return backtick-quoted labels for planned artifacts not yet on disk.

    Directory targets are labelled with a trailing slash.  At most twelve
    planned targets are inspected (``collect_planned_artifact_targets``).
    """
    missing: list[str] = []
    planned = collect_planned_artifact_targets(
        dod,
        project_root=project_root,
        max_paths=12,
    )
    for target, expect_directory in planned:
        present = target.is_dir() if expect_directory else target.is_file()
        if present:
            continue
        display = target.name or str(target)
        if expect_directory and not display.endswith("/"):
            display = f"{display}/"
        missing.append(f"`{display}`")
    return missing
| 810 | + | |
| 811 | + | |
def _first_missing_planned_artifact(
    dod: DefinitionOfDone,
    *,
    project_root: Path,
) -> tuple[Path, bool] | None:
    """Return the first planned artifact target that does not exist yet.

    The boolean mirrors ``collect_planned_artifact_targets`` and is True when
    the target is expected to be a directory.  ``None`` means every planned
    artifact is already present.
    """
    planned = collect_planned_artifact_targets(
        dod,
        project_root=project_root,
        max_paths=12,
    )
    for target, expect_directory in planned:
        present = target.is_dir() if expect_directory else target.is_file()
        if not present:
            return (target, expect_directory)
    return None
| 826 | + | |
| 827 | + | |
| 828 | +def _build_missing_artifact_recovery_nudge( | |
| 829 | + missing_artifact: tuple[Path, bool] | None, | |
| 830 | +) -> str | None: | |
| 831 | + if missing_artifact is None: | |
| 832 | + return None | |
| 833 | + | |
| 834 | + target, expect_directory = missing_artifact | |
| 835 | + label = target.name or str(target) | |
| 836 | + if expect_directory and not label.endswith("/"): | |
| 837 | + label += "/" | |
| 838 | + | |
| 839 | + if expect_directory: | |
| 840 | + return ( | |
| 841 | + "Your prior completion claim was incorrect because " | |
| 842 | + f"`{label}` does not exist yet. Do not summarize, mark completion, or " | |
| 843 | + "write bookkeeping notes yet. Your next response should be one concrete " | |
| 844 | + f"tool call that creates `{target}`. If a specific missing fact blocks " | |
| 845 | + "that step, ask one precise question." | |
| 846 | + ) | |
| 847 | + | |
| 848 | + return ( | |
| 849 | + "Your prior completion claim was incorrect because " | |
| 850 | + f"`{label}` does not exist yet. Do not summarize, mark completion, or " | |
| 851 | + "write bookkeeping notes yet. Your next response should be one concrete " | |
| 852 | + f"`write` or `edit`-style tool call that creates or updates `{target}`. " | |
| 853 | + "If a specific missing fact blocks that step, ask one precise question." | |
| 854 | + ) | |
| 855 | + | |
| 856 | + | |
| 713 | 857 | def _verification_result_observations( |
| 714 | 858 | dod: DefinitionOfDone, |
| 715 | 859 | *, |
@@ -938,49 +1082,254 @@ def _verification_state_signature(dod: DefinitionOfDone) -> str: | ||
| 938 | 1082 | ) |
| 939 | 1083 | |
| 940 | 1084 | |
def _build_verification_repair_guidance(
    dod: DefinitionOfDone,
    *,
    project_root: Path,
) -> str:
    """Render the "Repair focus" section of the verification-failure prompt.

    Combines parsed repair targets (broken local HTML links) with generic fix
    items extracted from the evidence.  With a concrete primary target the
    guidance names the exact file to edit, the expected path, and any existing
    sibling artifacts to use as the source of truth; otherwise it tells the
    model to reuse the recorded failures instead of restarting discovery.
    """
    repair_targets = _extract_verification_repair_targets(dod.evidence)
    fixes = _extract_verification_repairs(
        dod.evidence,
        repair_targets=repair_targets,
    )
    repair_source_paths = _existing_repair_source_paths(
        dod,
        repair_targets=repair_targets,
        project_root=project_root,
    )
    # No structured targets and no textual fixes: fall back to generic advice.
    if not fixes and not repair_targets:
        return (
            "Use the failed verification evidence directly, avoid rereading unrelated "
            "files, and fix the target file before retrying."
        )

    lines = ["Repair focus:"]
    lines.extend(f"- {item}" for item in fixes)
    # The first parsed target drives the concrete "edit this file" guidance.
    primary_target = repair_targets[0] if repair_targets else None
    if primary_target is not None:
        lines.extend(
            [
                f"- Immediate next step: edit `{primary_target.artifact_path}`.",
                "- If the broken reference should remain, create "
                f"`{primary_target.expected_path}`; otherwise remove or replace "
                f"`{primary_target.failing_reference}`.",
                # Splat an optional single bullet listing up to six on-disk
                # source files, with an ellipsis when more exist.
                *(
                    [
                        "- Use the existing artifact files as the source of truth while "
                        "repairing this file: "
                        + ", ".join(f"`{path}`" for path in repair_source_paths[:6])
                        + (", ..." if len(repair_source_paths) > 6 else "")
                    ]
                    if repair_source_paths
                    else []
                ),
                "- Do not reread unrelated reference materials or restart discovery "
                "while this concrete repair target is unresolved.",
            ]
        )
    else:
        lines.append(
            "- Reuse these exact failures instead of restarting discovery from earlier "
            "chapters."
        )
    return "\n".join(lines)
| 956 | 1136 | |
| 957 | 1137 | |
def _extract_verification_repairs(
    evidence_items: list[VerificationEvidence],
    *,
    repair_targets: list[VerificationRepairTarget] | None = None,
) -> list[str]:
    """Turn verification evidence into an ordered, deduplicated fix list.

    Structured targets produce "Fix the broken local reference ..." items
    first; report lines that cannot be matched to a known target fall back to
    a generic "Fix the missing local HTML link ..." item.
    """
    fixes: list[str] = []
    # NOTE(review): `repair_targets or ...` treats an explicitly-passed empty
    # list the same as None and re-extracts from evidence — confirm intended.
    target_map = {
        (target.artifact_path, target.failing_reference, target.expected_path): target
        for target in (repair_targets or _extract_verification_repair_targets(evidence_items))
    }
    for target in target_map.values():
        item = (
            f"Fix the broken local reference `{target.failing_reference}` in "
            f"`{target.artifact_path}`."
        )
        if item not in fixes:
            fixes.append(item)
    for evidence in evidence_items:
        # Scan every output stream of every evidence item for report lines.
        for candidate in (evidence.stderr, evidence.output, evidence.stdout):
            for problem in _extract_missing_local_html_links(str(candidate)):
                parsed = _parse_missing_local_html_link(problem)
                if parsed is not None:
                    key = (
                        parsed.artifact_path,
                        parsed.failing_reference,
                        parsed.expected_path,
                    )
                    # Already covered by a structured fix above; skip the
                    # generic duplicate.
                    if key in target_map:
                        continue
                # Unparseable or unmatched problems still get a generic item.
                item = (
                    "Fix the missing local HTML link "
                    f"`{problem}` in the edited artifact set."
                )
                if item not in fixes:
                    fixes.append(item)
    return fixes
| 982 | 1174 | |
| 983 | 1175 | |
@dataclass(frozen=True)
class VerificationRepairTarget:
    """Structured repair target extracted from failed verification evidence.

    Built by `_parse_missing_local_html_link` from one
    ``artifact:href -> expected`` line of a "Missing local HTML links:" report.
    """

    # Path of the HTML artifact that contains the broken reference.
    artifact_path: str
    # Raw href value inside the artifact that failed to resolve.
    failing_reference: str
    # Filesystem path the reference was expected to resolve to.
    expected_path: str
| 1183 | + | |
| 1184 | + | |
def _build_verification_failure_recovery_nudge(
    dod: DefinitionOfDone,
    *,
    project_root: Path,
) -> str | None:
    """Build a steering nudge for the turn following a failed verification.

    With a parsed repair target the nudge names the exact artifact and broken
    reference (plus any existing sibling source files).  With only generic
    repair items a shorter "reuse the evidence" nudge is returned.  ``None``
    means no usable repair signal was found in the evidence.
    """
    targets = _extract_verification_repair_targets(dod.evidence)
    source_paths = _existing_repair_source_paths(
        dod,
        repair_targets=targets,
        project_root=project_root,
    )
    if targets:
        primary = targets[0]
        source_hint = ""
        if source_paths:
            preview = ", ".join(f"`{path}`" for path in source_paths[:4])
            if len(source_paths) > 4:
                preview += ", ..."
            source_hint = (
                " Use the existing artifact files already on disk as the source of truth: "
                f"{preview}."
            )
        return (
            "Verification already identified the concrete repair target. "
            "Do not restart discovery or reread unrelated references. "
            "Your next response should be one concrete `edit` or `write`-style tool "
            f"call that updates `{primary.artifact_path}` to repair "
            f"`{primary.failing_reference}`. "
            f"If that reference should stay, create `{primary.expected_path}`; "
            "otherwise remove or replace the broken local reference."
            f"{source_hint}"
        )

    if not _extract_verification_repairs(dod.evidence, repair_targets=targets):
        return None
    return (
        "Verification already identified a concrete failure in the active artifact set. "
        "Reuse that evidence directly, apply one concrete edit or patch, and do not "
        "restart discovery unless a specific missing fact blocks the repair."
    )
| 1226 | + | |
| 1227 | + | |
def _existing_repair_source_paths(
    dod: DefinitionOfDone,
    *,
    repair_targets: list[VerificationRepairTarget],
    project_root: Path,
) -> list[str]:
    """List existing planned artifact files adjacent to the repair targets.

    Only plain-file planned targets whose parent directory matches a repair
    target's artifact or expected-path directory qualify.  Results are
    resolved, deduplicated, and returned in planned-target order; an empty
    *repair_targets* list yields no paths.
    """
    if not repair_targets:
        return []

    relevant_dirs: set[Path] = set()
    for repair in repair_targets:
        if str(repair.expected_path).strip():
            relevant_dirs.add(Path(repair.expected_path).parent.resolve(strict=False))
        if str(repair.artifact_path).strip():
            relevant_dirs.add(Path(repair.artifact_path).parent.resolve(strict=False))

    # dict preserves insertion order while deduplicating resolved paths.
    ordered: dict[str, None] = {}
    for target, expect_directory in collect_planned_artifact_targets(
        dod,
        project_root=project_root,
        max_paths=24,
    ):
        if expect_directory or not target.is_file():
            continue
        resolved = target.resolve(strict=False)
        if resolved.parent in relevant_dirs:
            ordered.setdefault(str(resolved), None)
    return list(ordered)
| 1266 | + | |
| 1267 | + | |
def _extract_verification_repair_targets(
    evidence_items: list[VerificationEvidence],
) -> list[VerificationRepairTarget]:
    """Parse unique repair targets out of the verification evidence streams.

    Scans stderr, output, and stdout of every evidence item for
    "Missing local HTML links:" report lines, keeping the first occurrence of
    each (artifact, reference, expected) triple in discovery order.
    """
    seen: set[tuple[str, str, str]] = set()
    targets: list[VerificationRepairTarget] = []
    for evidence in evidence_items:
        for stream in (evidence.stderr, evidence.output, evidence.stdout):
            for problem in _extract_missing_local_html_links(str(stream)):
                parsed = _parse_missing_local_html_link(problem)
                if parsed is None:
                    continue
                key = (
                    parsed.artifact_path,
                    parsed.failing_reference,
                    parsed.expected_path,
                )
                if key not in seen:
                    seen.add(key)
                    targets.append(parsed)
    return targets
| 1289 | + | |
| 1290 | + | |
def _parse_missing_local_html_link(problem: str) -> VerificationRepairTarget | None:
    """Parse one ``artifact:href -> expected`` report line into a repair target.

    Returns ``None`` when the line does not follow that shape.  The
    artifact/reference split happens at the LAST colon; NOTE(review): an href
    that itself contains ":" would be split at the wrong spot — the upstream
    report filters scheme-style hrefs, so this is assumed not to occur.
    """
    if " -> " not in problem:
        return None
    left, _, expected = problem.partition(" -> ")
    left = left.strip()
    expected = expected.strip()
    if not left or not expected or ":" not in left:
        return None
    artifact, _, reference = left.rpartition(":")
    artifact = artifact.strip()
    reference = reference.strip()
    if not artifact or not reference:
        return None
    return VerificationRepairTarget(
        artifact_path=artifact,
        failing_reference=reference,
        expected_path=expected,
    )
| 1309 | + | |
| 1310 | + | |
| 1311 | +def _extract_missing_local_html_links(text: str) -> list[str]: | |
| 1312 | + if "Missing local HTML links:" not in text: | |
| 1313 | + return [] | |
| 1314 | + | |
| 1315 | + problems: list[str] = [] | |
| 1316 | + capture = False | |
| 1317 | + for raw_line in text.splitlines(): | |
| 1318 | + line = raw_line.strip() | |
| 1319 | + if not line: | |
| 1320 | + continue | |
| 1321 | + if line == "Missing local HTML links:": | |
| 1322 | + capture = True | |
| 1323 | + continue | |
| 1324 | + if not capture: | |
| 1325 | + continue | |
| 1326 | + if " -> " not in line: | |
| 1327 | + continue | |
| 1328 | + if line not in problems: | |
| 1329 | + problems.append(line) | |
| 1330 | + return problems | |
| 1331 | + | |
| 1332 | + | |
| 984 | 1333 | def _classify_verification_kind(command: str) -> str: |
| 985 | 1334 | """Classify the verification command into a summary kind.""" |
| 986 | 1335 | |
src/loader/runtime/hooks.pymodified@@ -2,6 +2,7 @@ | ||
| 2 | 2 | |
| 3 | 3 | from __future__ import annotations |
| 4 | 4 | |
| 5 | +import shlex | |
| 5 | 6 | from collections.abc import Iterable |
| 6 | 7 | from dataclasses import dataclass, field |
| 7 | 8 | from enum import StrEnum |
@@ -11,10 +12,27 @@ from typing import Any, Protocol | ||
| 11 | 12 | from ..llm.base import ToolCall |
| 12 | 13 | from ..tools.base import Tool, ToolRegistry |
| 13 | 14 | from ..tools.base import ToolResult as RegistryToolResult |
| 15 | +from .dod import ( | |
| 16 | + DefinitionOfDoneStore, | |
| 17 | + all_planned_artifacts_exist, | |
| 18 | + collect_missing_declared_html_output_files, | |
| 19 | + collect_planned_artifact_targets, | |
| 20 | + planned_artifact_target_satisfied, | |
| 21 | +) | |
| 14 | 22 | from .memory import MemoryStore |
| 15 | 23 | from .permissions import PermissionOverride, PermissionPolicy |
| 24 | +from .repair_focus import ( | |
| 25 | + extract_active_repair_context, | |
| 26 | + normalize_repair_path, | |
| 27 | + path_matches_allowed_paths, | |
| 28 | + path_within_allowed_roots, | |
| 29 | +) | |
| 16 | 30 | from .rollback import RollbackPlan, create_rollback_plan_for_action, is_destructive_tool |
| 17 | -from .safeguard_services import ActionTracker, PreActionValidator | |
| 31 | +from .safeguard_services import ( | |
| 32 | + ActionTracker, | |
| 33 | + PreActionValidator, | |
| 34 | + extract_shell_text_rewrite_target, | |
| 35 | +) | |
| 18 | 36 | |
| 19 | 37 | |
| 20 | 38 | class HookEvent(StrEnum): |
@@ -204,13 +222,21 @@ class RelativePathContextHook(BaseToolHook): | ||
| 204 | 222 | |
| 205 | 223 | arguments = context.tool_call.arguments |
| 206 | 224 | raw_path = str(arguments.get(argument_key, "")).strip() |
| 207 | - if not raw_path or raw_path.startswith(("/", "~")): | |
| 225 | + if not raw_path: | |
| 208 | 226 | return HookResult() |
| 209 | 227 | |
| 210 | - resolved = self._resolve_recent_context_path( | |
| 211 | - raw_path, | |
| 212 | - require_existing=True, | |
| 213 | - ) | |
| 228 | + require_existing = context.tool_call.name in {"read", "glob", "grep", "edit", "patch"} | |
| 229 | + resolved: str | None = None | |
| 230 | + if raw_path.startswith("/"): | |
| 231 | + resolved = self._resolve_workspace_mirror_path( | |
| 232 | + raw_path, | |
| 233 | + require_existing=require_existing, | |
| 234 | + ) | |
| 235 | + elif not raw_path.startswith("~"): | |
| 236 | + resolved = self._resolve_recent_context_path( | |
| 237 | + raw_path, | |
| 238 | + require_existing=require_existing, | |
| 239 | + ) | |
| 214 | 240 | if resolved is None: |
| 215 | 241 | return HookResult() |
| 216 | 242 | |
@@ -245,6 +271,551 @@ class RelativePathContextHook(BaseToolHook): | ||
| 245 | 271 | return str(candidate) |
| 246 | 272 | return None |
| 247 | 273 | |
| 274 | + def _resolve_workspace_mirror_path( | |
| 275 | + self, | |
| 276 | + raw_path: str, | |
| 277 | + *, | |
| 278 | + require_existing: bool, | |
| 279 | + ) -> str | None: | |
| 280 | + candidate = Path(raw_path).expanduser() | |
| 281 | + try: | |
| 282 | + resolved = candidate.resolve(strict=False) | |
| 283 | + except Exception: | |
| 284 | + resolved = candidate | |
| 285 | + | |
| 286 | + try: | |
| 287 | + relative = resolved.relative_to(self.workspace_root) | |
| 288 | + except ValueError: | |
| 289 | + return None | |
| 290 | + if not relative.parts: | |
| 291 | + return None | |
| 292 | + | |
| 293 | + anchor = relative.parts[0] | |
| 294 | + for base_dir in self.action_tracker.recent_path_contexts(): | |
| 295 | + base_path = Path(base_dir).expanduser() | |
| 296 | + try: | |
| 297 | + resolved_base = base_path.resolve(strict=False) | |
| 298 | + except Exception: | |
| 299 | + resolved_base = base_path | |
| 300 | + if resolved_base == self.workspace_root: | |
| 301 | + continue | |
| 302 | + try: | |
| 303 | + resolved_base.relative_to(self.workspace_root) | |
| 304 | + continue | |
| 305 | + except ValueError: | |
| 306 | + pass | |
| 307 | + | |
| 308 | + try: | |
| 309 | + anchor_index = resolved_base.parts.index(anchor) | |
| 310 | + except ValueError: | |
| 311 | + continue | |
| 312 | + if anchor_index <= 0: | |
| 313 | + continue | |
| 314 | + | |
| 315 | + anchor_root = Path(*resolved_base.parts[: anchor_index + 1]) | |
| 316 | + remapped = Path(*resolved_base.parts[:anchor_index]).joinpath(*relative.parts) | |
| 317 | + if remapped == resolved: | |
| 318 | + continue | |
| 319 | + if require_existing: | |
| 320 | + if remapped.exists(): | |
| 321 | + return str(remapped) | |
| 322 | + continue | |
| 323 | + if remapped.exists() or remapped.parent.exists() or anchor_root.exists(): | |
| 324 | + return str(remapped) | |
| 325 | + return None | |
| 326 | + | |
| 327 | + | |
| 328 | +_OBSERVATION_TOOLS = frozenset({"read", "glob", "grep", "bash"}) | |
| 329 | +_MUTATION_TOOLS = frozenset({"write", "edit", "patch", "bash"}) | |
| 330 | +_READ_ONLY_BASH_PREFIXES = frozenset( | |
| 331 | + {"ls", "pwd", "find", "stat", "cat", "head", "tail", "rg", "grep"} | |
| 332 | +) | |
| 333 | +_MUTATING_BASH_FRAGMENTS = ( | |
| 334 | + " >", | |
| 335 | + ">>", | |
| 336 | + "| tee", | |
| 337 | + "touch ", | |
| 338 | + "mkdir ", | |
| 339 | + "rm ", | |
| 340 | + "mv ", | |
| 341 | + "cp ", | |
| 342 | + "sed -i", | |
| 343 | + "perl -pi", | |
| 344 | + "git add", | |
| 345 | + "git commit", | |
| 346 | + "git apply", | |
| 347 | +) | |
| 348 | + | |
| 349 | + | |
| 350 | +def _extract_observation_paths(tool_call: ToolCall) -> list[str]: | |
| 351 | + arguments = tool_call.arguments | |
| 352 | + if tool_call.name == "read": | |
| 353 | + file_path = str(arguments.get("file_path", "")).strip() | |
| 354 | + return [file_path] if file_path else [] | |
| 355 | + | |
| 356 | + if tool_call.name in {"glob", "grep"}: | |
| 357 | + candidates: list[str] = [] | |
| 358 | + search_path = str(arguments.get("path", "")).strip() | |
| 359 | + if search_path: | |
| 360 | + anchored_path = _derive_search_anchor(search_path, str(arguments.get("pattern", "")).strip()) | |
| 361 | + candidates.append(anchored_path or search_path) | |
| 362 | + pattern = str(arguments.get("pattern", "")).strip() | |
| 363 | + if not search_path and pattern.startswith(("/", "~")): | |
| 364 | + candidates.append(str(Path(pattern).expanduser().parent)) | |
| 365 | + return candidates | |
| 366 | + | |
| 367 | + command = str(arguments.get("command", "")).strip() | |
| 368 | + if not _is_read_only_bash(command): | |
| 369 | + return [] | |
| 370 | + return _extract_bash_paths(command) | |
| 371 | + | |
| 372 | + | |
| 373 | +def _is_read_only_bash(command: str) -> bool: | |
| 374 | + normalized = " ".join(command.split()) | |
| 375 | + if not normalized: | |
| 376 | + return False | |
| 377 | + if extract_shell_text_rewrite_target(normalized) is not None: | |
| 378 | + return False | |
| 379 | + if any(fragment in normalized for fragment in _MUTATING_BASH_FRAGMENTS): | |
| 380 | + return False | |
| 381 | + try: | |
| 382 | + argv = shlex.split(normalized) | |
| 383 | + except ValueError: | |
| 384 | + return False | |
| 385 | + if not argv: | |
| 386 | + return False | |
| 387 | + return argv[0] in _READ_ONLY_BASH_PREFIXES | |
| 388 | + | |
| 389 | + | |
| 390 | +def _extract_bash_paths(command: str) -> list[str]: | |
| 391 | + try: | |
| 392 | + argv = shlex.split(command) | |
| 393 | + except ValueError: | |
| 394 | + return [] | |
| 395 | + observed: list[str] = [] | |
| 396 | + for token in argv[1:]: | |
| 397 | + candidate = token.strip() | |
| 398 | + if not candidate or candidate.startswith("-"): | |
| 399 | + continue | |
| 400 | + if candidate.startswith(("/", "~")): | |
| 401 | + observed.append(candidate) | |
| 402 | + return observed | |
| 403 | + | |
| 404 | + | |
| 405 | +def _derive_search_anchor(search_path: str, pattern: str) -> str: | |
| 406 | + normalized_search_path = str(search_path or "").strip() | |
| 407 | + normalized_pattern = str(pattern or "").strip() | |
| 408 | + if not normalized_search_path or not normalized_pattern: | |
| 409 | + return normalized_search_path | |
| 410 | + | |
| 411 | + literal_segments: list[str] = [] | |
| 412 | + for segment in normalized_pattern.split("/"): | |
| 413 | + cleaned = segment.strip() | |
| 414 | + if not cleaned or cleaned == ".": | |
| 415 | + continue | |
| 416 | + if any(token in cleaned for token in ("*", "?", "[")): | |
| 417 | + continue | |
| 418 | + literal_segments.append(cleaned) | |
| 419 | + | |
| 420 | + if not literal_segments: | |
| 421 | + return normalized_search_path | |
| 422 | + | |
| 423 | + if "." in literal_segments[-1]: | |
| 424 | + literal_segments = literal_segments[:-1] | |
| 425 | + if not literal_segments: | |
| 426 | + return normalized_search_path | |
| 427 | + | |
| 428 | + try: | |
| 429 | + anchored = Path(normalized_search_path).expanduser().joinpath(*literal_segments) | |
| 430 | + except (OSError, RuntimeError, ValueError): | |
| 431 | + return normalized_search_path | |
| 432 | + return str(anchored) | |
| 433 | + | |
| 434 | + | |
| 435 | +def _extract_mutation_paths(tool_call: ToolCall) -> list[str]: | |
| 436 | + arguments = tool_call.arguments | |
| 437 | + if tool_call.name in {"write", "edit", "patch"}: | |
| 438 | + file_path = str(arguments.get("file_path", "")).strip() | |
| 439 | + return [file_path] if file_path else [] | |
| 440 | + | |
| 441 | + if tool_call.name != "bash": | |
| 442 | + return [] | |
| 443 | + | |
| 444 | + command = str(arguments.get("command", "")).strip() | |
| 445 | + if not command or not _is_mutating_bash(command): | |
| 446 | + return [] | |
| 447 | + target = extract_shell_text_rewrite_target(command) | |
| 448 | + return [target] if target else [] | |
| 449 | + | |
| 450 | + | |
| 451 | +def _is_mutating_bash(command: str) -> bool: | |
| 452 | + normalized = " ".join(command.split()) | |
| 453 | + if not normalized: | |
| 454 | + return False | |
| 455 | + if extract_shell_text_rewrite_target(normalized) is not None: | |
| 456 | + return True | |
| 457 | + if any(fragment in normalized for fragment in _MUTATING_BASH_FRAGMENTS): | |
| 458 | + return True | |
| 459 | + try: | |
| 460 | + argv = shlex.split(normalized) | |
| 461 | + except ValueError: | |
| 462 | + return False | |
| 463 | + if not argv: | |
| 464 | + return False | |
| 465 | + return argv[0] in {"touch", "mkdir", "rm", "mv", "cp", "chmod", "chown"} | |
| 466 | + | |
| 467 | + | |
| 468 | +def _repair_declared_output_paths(repair: Any, *, project_root: Path) -> set[str]: | |
| 469 | + declared_outputs: set[str] = set() | |
| 470 | + for root in getattr(repair, "allowed_roots", ()) or (): | |
| 471 | + normalized_root = normalize_repair_path(root) | |
| 472 | + if not normalized_root: | |
| 473 | + continue | |
| 474 | + for path in collect_missing_declared_html_output_files( | |
| 475 | + target=Path(normalized_root), | |
| 476 | + project_root=project_root, | |
| 477 | + ): | |
| 478 | + declared_outputs.add(normalize_repair_path(str(path))) | |
| 479 | + return declared_outputs | |
| 480 | + | |
| 481 | + | |
| 482 | +class ActiveRepairScopeHook(BaseToolHook): | |
| 483 | + """Keep fix-mode observations anchored to the active artifact set.""" | |
| 484 | + | |
| 485 | + def __init__( | |
| 486 | + self, | |
| 487 | + *, | |
| 488 | + dod_store: DefinitionOfDoneStore, | |
| 489 | + project_root: Path, | |
| 490 | + session: Any, | |
| 491 | + ) -> None: | |
| 492 | + self.dod_store = dod_store | |
| 493 | + self.project_root = project_root | |
| 494 | + self.session = session | |
| 495 | + | |
| 496 | + async def pre_tool_use(self, context: HookContext) -> HookResult: | |
| 497 | + if context.tool_call.name not in _OBSERVATION_TOOLS: | |
| 498 | + return HookResult() | |
| 499 | + if context.source == "verification": | |
| 500 | + return HookResult() | |
| 501 | + | |
| 502 | + repair = self._active_repair_context() | |
| 503 | + if repair is None: | |
| 504 | + return HookResult() | |
| 505 | + | |
| 506 | + observed_paths = _extract_observation_paths(context.tool_call) | |
| 507 | + if not observed_paths: | |
| 508 | + return HookResult() | |
| 509 | + declared_output_paths = _repair_declared_output_paths( | |
| 510 | + repair, | |
| 511 | + project_root=self.project_root, | |
| 512 | + ) | |
| 513 | + if repair.allowed_paths: | |
| 514 | + if all(path_matches_allowed_paths(path, repair.allowed_paths) for path in observed_paths): | |
| 515 | + return HookResult() | |
| 516 | + if declared_output_paths and all( | |
| 517 | + normalize_repair_path(path) in declared_output_paths | |
| 518 | + for path in observed_paths | |
| 519 | + ): | |
| 520 | + return HookResult() | |
| 521 | + if context.tool_call.name in {"glob", "grep", "bash"} and repair.allowed_roots: | |
| 522 | + if all(path_within_allowed_roots(path, repair.allowed_roots) for path in observed_paths): | |
| 523 | + return HookResult() | |
| 524 | + | |
| 525 | + allowed_preview = ", ".join(f"`{path}`" for path in repair.allowed_paths[:3]) | |
| 526 | + if len(repair.allowed_paths) > 3: | |
| 527 | + allowed_preview += ", ..." | |
| 528 | + declared_preview = ", ".join( | |
| 529 | + f"`{Path(path).name or path}`" | |
| 530 | + for path in sorted(declared_output_paths)[:3] | |
| 531 | + ) | |
| 532 | + if len(declared_output_paths) > 3: | |
| 533 | + declared_preview += ", ..." | |
| 534 | + suggestion_suffix = ( | |
| 535 | + f" Declared sibling outputs currently allowed inside this repair set include: {declared_preview}." | |
| 536 | + if declared_preview | |
| 537 | + else "" | |
| 538 | + ) | |
| 539 | + return HookResult( | |
| 540 | + decision=HookDecision.DENY, | |
| 541 | + message=( | |
| 542 | + "[Blocked - active repair scope: verification already identified " | |
| 543 | + f"`{repair.artifact_path}` as the current repair target. " | |
| 544 | + "Stay on the concrete repair files until that repair passes.] " | |
| 545 | + "Suggestion: inspect or edit only " | |
| 546 | + f"{allowed_preview} and do not reopen unrelated reference materials." | |
| 547 | + f"{suggestion_suffix}" | |
| 548 | + ), | |
| 549 | + terminal_state="blocked", | |
| 550 | + ) | |
| 551 | + | |
| 552 | + if not repair.allowed_roots: | |
| 553 | + return HookResult() | |
| 554 | + if all(path_within_allowed_roots(path, repair.allowed_roots) for path in observed_paths): | |
| 555 | + return HookResult() | |
| 556 | + | |
| 557 | + roots_preview = ", ".join(f"`{root}`" for root in repair.allowed_roots[:2]) | |
| 558 | + if len(repair.allowed_roots) > 2: | |
| 559 | + roots_preview += ", ..." | |
| 560 | + return HookResult( | |
| 561 | + decision=HookDecision.DENY, | |
| 562 | + message=( | |
| 563 | + "[Blocked - active repair scope: verification already identified " | |
| 564 | + f"`{repair.artifact_path}` as the current repair target. " | |
| 565 | + "Stay inside the current artifact set until that repair passes.] " | |
| 566 | + "Suggestion: inspect or edit files under " | |
| 567 | + f"{roots_preview} and do not reopen unrelated reference materials." | |
| 568 | + ), | |
| 569 | + terminal_state="blocked", | |
| 570 | + ) | |
| 571 | + | |
| 572 | + def _active_repair_context(self): | |
| 573 | + dod_path = getattr(self.session, "active_dod_path", None) | |
| 574 | + if not dod_path: | |
| 575 | + return None | |
| 576 | + path = Path(str(dod_path)) | |
| 577 | + if not path.exists(): | |
| 578 | + return None | |
| 579 | + dod = self.dod_store.load(path) | |
| 580 | + if dod.status == "done": | |
| 581 | + return None | |
| 582 | + return extract_active_repair_context(getattr(self.session, "messages", [])) | |
| 583 | + | |
| 584 | + | |
| 585 | +class ActiveRepairMutationScopeHook(BaseToolHook): | |
| 586 | + """Keep repair-phase mutations pinned to the concrete repair targets.""" | |
| 587 | + | |
| 588 | + def __init__( | |
| 589 | + self, | |
| 590 | + *, | |
| 591 | + dod_store: DefinitionOfDoneStore, | |
| 592 | + project_root: Path, | |
| 593 | + session: Any, | |
| 594 | + ) -> None: | |
| 595 | + self.dod_store = dod_store | |
| 596 | + self.project_root = project_root | |
| 597 | + self.session = session | |
| 598 | + | |
| 599 | + async def pre_tool_use(self, context: HookContext) -> HookResult: | |
| 600 | + if context.tool_call.name not in _MUTATION_TOOLS: | |
| 601 | + return HookResult() | |
| 602 | + if context.source == "verification": | |
| 603 | + return HookResult() | |
| 604 | + | |
| 605 | + repair = self._active_repair_context() | |
| 606 | + if repair is None or not repair.allowed_paths: | |
| 607 | + return HookResult() | |
| 608 | + allowed_paths = {normalize_repair_path(path) for path in repair.allowed_paths} | |
| 609 | + | |
| 610 | + mutation_paths = _extract_mutation_paths(context.tool_call) | |
| 611 | + if not mutation_paths: | |
| 612 | + if context.tool_call.name == "bash" and _is_mutating_bash( | |
| 613 | + str(context.tool_call.arguments.get("command", "")).strip() | |
| 614 | + ): | |
| 615 | + return HookResult( | |
| 616 | + decision=HookDecision.DENY, | |
| 617 | + message=( | |
| 618 | + "[Blocked - active repair mutation scope: the current repair already " | |
| 619 | + f"identifies `{repair.artifact_path}` as the concrete target.] " | |
| 620 | + "Suggestion: use write/edit/patch directly on one of the active repair " | |
| 621 | + "files instead of a broad shell mutation." | |
| 622 | + ), | |
| 623 | + terminal_state="blocked", | |
| 624 | + ) | |
| 625 | + return HookResult() | |
| 626 | + normalized_mutation_paths = [ | |
| 627 | + normalize_repair_path(path) for path in mutation_paths if str(path).strip() | |
| 628 | + ] | |
| 629 | + allowed_declared_outputs = _repair_declared_output_paths( | |
| 630 | + repair, | |
| 631 | + project_root=self.project_root, | |
| 632 | + ) | |
| 633 | + | |
| 634 | + if normalized_mutation_paths and all( | |
| 635 | + path in allowed_paths for path in normalized_mutation_paths | |
| 636 | + ): | |
| 637 | + return HookResult() | |
| 638 | + if normalized_mutation_paths and all( | |
| 639 | + path in allowed_paths or path in allowed_declared_outputs | |
| 640 | + for path in normalized_mutation_paths | |
| 641 | + ): | |
| 642 | + return HookResult() | |
| 643 | + | |
| 644 | + allowed_preview = ", ".join(f"`{path}`" for path in repair.allowed_paths[:3]) | |
| 645 | + if len(repair.allowed_paths) > 3: | |
| 646 | + allowed_preview += ", ..." | |
| 647 | + declared_preview = ", ".join( | |
| 648 | + f"`{Path(path).name or path}`" | |
| 649 | + for path in sorted(allowed_declared_outputs)[:3] | |
| 650 | + ) | |
| 651 | + if len(allowed_declared_outputs) > 3: | |
| 652 | + declared_preview += ", ..." | |
| 653 | + suggestion_suffix = ( | |
| 654 | + f" Declared sibling outputs currently allowed inside this repair set include: {declared_preview}." | |
| 655 | + if declared_preview | |
| 656 | + else "" | |
| 657 | + ) | |
| 658 | + return HookResult( | |
| 659 | + decision=HookDecision.DENY, | |
| 660 | + message=( | |
| 661 | + "[Blocked - active repair mutation scope: verification already identified " | |
| 662 | + f"`{repair.artifact_path}` as the current repair target.] Suggestion: keep " | |
| 663 | + f"mutations on the active repair files only: {allowed_preview}." | |
| 664 | + f"{suggestion_suffix}" | |
| 665 | + ), | |
| 666 | + terminal_state="blocked", | |
| 667 | + ) | |
| 668 | + | |
| 669 | + def _active_repair_context(self): | |
| 670 | + dod_path = getattr(self.session, "active_dod_path", None) | |
| 671 | + if not dod_path: | |
| 672 | + return None | |
| 673 | + path = Path(str(dod_path)) | |
| 674 | + if not path.exists(): | |
| 675 | + return None | |
| 676 | + dod = self.dod_store.load(path) | |
| 677 | + if dod.status == "done": | |
| 678 | + return None | |
| 679 | + return extract_active_repair_context(getattr(self.session, "messages", [])) | |
| 680 | + | |
| 681 | +class LateReferenceDriftHook(BaseToolHook): | |
| 682 | + """Block reopening old reference paths once planned artifacts are well underway.""" | |
| 683 | + | |
| 684 | + _MIN_COMPLETED_FILES = 3 | |
| 685 | + | |
| 686 | + def __init__(self, *, dod_store: DefinitionOfDoneStore, project_root: Path, session: Any) -> None: | |
| 687 | + self.dod_store = dod_store | |
| 688 | + self.project_root = project_root | |
| 689 | + self.session = session | |
| 690 | + | |
| 691 | + async def pre_tool_use(self, context: HookContext) -> HookResult: | |
| 692 | + if context.tool_call.name not in _OBSERVATION_TOOLS: | |
| 693 | + return HookResult() | |
| 694 | + | |
| 695 | + completed_scope = self._completed_artifact_scope() | |
| 696 | + if completed_scope is not None: | |
| 697 | + observed_paths = _extract_observation_paths(context.tool_call) | |
| 698 | + if not observed_paths: | |
| 699 | + return HookResult() | |
| 700 | + if all(path_within_allowed_roots(path, completed_scope) for path in observed_paths): | |
| 701 | + return HookResult() | |
| 702 | + | |
| 703 | + roots_preview = ", ".join(f"`{root}`" for root in completed_scope[:2]) | |
| 704 | + if len(completed_scope) > 2: | |
| 705 | + roots_preview += ", ..." | |
| 706 | + return HookResult( | |
| 707 | + decision=HookDecision.DENY, | |
| 708 | + message=( | |
| 709 | + "[Blocked - completed artifact set scope: all explicitly planned artifacts " | |
| 710 | + "already exist.] Suggestion: stay within the current output roots under " | |
| 711 | + f"{roots_preview} and use those files as the source of truth instead of " | |
| 712 | + "reopening earlier reference materials." | |
| 713 | + ), | |
| 714 | + terminal_state="blocked", | |
| 715 | + ) | |
| 716 | + | |
| 717 | + late_stage = self._late_stage_missing_artifact() | |
| 718 | + if late_stage is None: | |
| 719 | + return HookResult() | |
| 720 | + missing_artifact, planned_roots = late_stage | |
| 721 | + observed_paths = _extract_observation_paths(context.tool_call) | |
| 722 | + if not observed_paths: | |
| 723 | + return HookResult() | |
| 724 | + if all(path_within_allowed_roots(path, planned_roots) for path in observed_paths): | |
| 725 | + return HookResult() | |
| 726 | + | |
| 727 | + roots_preview = ", ".join(f"`{root}`" for root in planned_roots[:2]) | |
| 728 | + if len(planned_roots) > 2: | |
| 729 | + roots_preview += ", ..." | |
| 730 | + return HookResult( | |
| 731 | + decision=HookDecision.DENY, | |
| 732 | + message=( | |
| 733 | + "[Blocked - late reference drift: several planned artifacts already exist and " | |
| 734 | + f"`{missing_artifact}` is still missing.] Suggestion: finish the next missing " | |
| 735 | + f"artifact inside {roots_preview} before reopening earlier reference materials." | |
| 736 | + ), | |
| 737 | + terminal_state="blocked", | |
| 738 | + ) | |
| 739 | + | |
| 740 | + def _late_stage_missing_artifact(self) -> tuple[str, tuple[str, ...]] | None: | |
| 741 | + dod_path = getattr(self.session, "active_dod_path", None) | |
| 742 | + if not dod_path: | |
| 743 | + return None | |
| 744 | + path = Path(str(dod_path)) | |
| 745 | + if not path.exists(): | |
| 746 | + return None | |
| 747 | + dod = self.dod_store.load(path) | |
| 748 | + if dod.status == "done": | |
| 749 | + return None | |
| 750 | + | |
| 751 | + planned_targets = collect_planned_artifact_targets( | |
| 752 | + dod, | |
| 753 | + project_root=self.project_root, | |
| 754 | + ) | |
| 755 | + if not planned_targets: | |
| 756 | + return None | |
| 757 | + | |
| 758 | + missing_label = "" | |
| 759 | + completed_files = 0 | |
| 760 | + planned_roots: list[str] = [] | |
| 761 | + seen_roots: set[str] = set() | |
| 762 | + for target, expect_directory in planned_targets: | |
| 763 | + satisfied = planned_artifact_target_satisfied( | |
| 764 | + dod, | |
| 765 | + target=target, | |
| 766 | + expect_directory=expect_directory, | |
| 767 | + project_root=self.project_root, | |
| 768 | + ) | |
| 769 | + if not expect_directory: | |
| 770 | + if satisfied: | |
| 771 | + completed_files += 1 | |
| 772 | + elif not missing_label: | |
| 773 | + missing_label = str(target) | |
| 774 | + root = str(target.parent) | |
| 775 | + else: | |
| 776 | + if not satisfied and not missing_label: | |
| 777 | + missing_label = str(target) | |
| 778 | + root = str(target) | |
| 779 | + if root not in seen_roots: | |
| 780 | + planned_roots.append(root) | |
| 781 | + seen_roots.add(root) | |
| 782 | + | |
| 783 | + if not missing_label: | |
| 784 | + return None | |
| 785 | + if completed_files < self._MIN_COMPLETED_FILES: | |
| 786 | + return None | |
| 787 | + return missing_label, tuple(planned_roots) | |
| 788 | + | |
| 789 | + def _completed_artifact_scope(self) -> tuple[str, ...] | None: | |
| 790 | + dod_path = getattr(self.session, "active_dod_path", None) | |
| 791 | + if not dod_path: | |
| 792 | + return None | |
| 793 | + path = Path(str(dod_path)) | |
| 794 | + if not path.exists(): | |
| 795 | + return None | |
| 796 | + dod = self.dod_store.load(path) | |
| 797 | + if dod.status in {"done", "fixing"}: | |
| 798 | + return None | |
| 799 | + | |
| 800 | + planned_targets = collect_planned_artifact_targets( | |
| 801 | + dod, | |
| 802 | + project_root=self.project_root, | |
| 803 | + ) | |
| 804 | + if not planned_targets: | |
| 805 | + return None | |
| 806 | + if not all_planned_artifacts_exist(dod, project_root=self.project_root): | |
| 807 | + return None | |
| 808 | + | |
| 809 | + planned_roots: list[str] = [] | |
| 810 | + seen_roots: set[str] = set() | |
| 811 | + for target, expect_directory in planned_targets: | |
| 812 | + root = str(target if expect_directory else target.parent) | |
| 813 | + if root in seen_roots: | |
| 814 | + continue | |
| 815 | + seen_roots.add(root) | |
| 816 | + planned_roots.append(root) | |
| 817 | + return tuple(planned_roots) | |
| 818 | + | |
| 248 | 819 | |
| 249 | 820 | class HookManager: |
| 250 | 821 | """Runs tool hooks across Loader's three lifecycle events.""" |
@@ -437,6 +1008,7 @@ def build_default_tool_hooks( | ||
| 437 | 1008 | registry: ToolRegistry, |
| 438 | 1009 | rollback_plan: RollbackPlan | None, |
| 439 | 1010 | workspace_root: Path, |
| 1011 | + session: Any, | |
| 440 | 1012 | ) -> HookManager: |
| 441 | 1013 | """Build Loader's default tool hook stack for one runtime turn.""" |
| 442 | 1014 | |
@@ -445,6 +1017,21 @@ def build_default_tool_hooks( | ||
| 445 | 1017 | FilePathAliasHook(), |
| 446 | 1018 | SearchPathAliasHook(), |
| 447 | 1019 | RelativePathContextHook(action_tracker, workspace_root), |
| 1020 | + ActiveRepairScopeHook( | |
| 1021 | + dod_store=DefinitionOfDoneStore(workspace_root), | |
| 1022 | + project_root=workspace_root, | |
| 1023 | + session=session, | |
| 1024 | + ), | |
| 1025 | + ActiveRepairMutationScopeHook( | |
| 1026 | + dod_store=DefinitionOfDoneStore(workspace_root), | |
| 1027 | + project_root=workspace_root, | |
| 1028 | + session=session, | |
| 1029 | + ), | |
| 1030 | + LateReferenceDriftHook( | |
| 1031 | + dod_store=DefinitionOfDoneStore(workspace_root), | |
| 1032 | + project_root=workspace_root, | |
| 1033 | + session=session, | |
| 1034 | + ), | |
| 448 | 1035 | DuplicateActionHook(action_tracker), |
| 449 | 1036 | ActionValidationHook(validator), |
| 450 | 1037 | RollbackTrackingHook(registry, rollback_plan), |
src/loader/runtime/repair.pymodified@@ -2,11 +2,67 @@ | ||
| 2 | 2 | |
| 3 | 3 | from __future__ import annotations |
| 4 | 4 | |
| 5 | +import re | |
| 5 | 6 | from dataclasses import dataclass, field |
| 7 | +from pathlib import Path | |
| 6 | 8 | |
| 7 | 9 | from ..llm.base import ToolCall |
| 8 | 10 | from .context import RuntimeContext |
| 11 | +from .dod import ( | |
| 12 | + DefinitionOfDone, | |
| 13 | + collect_planned_artifact_targets, | |
| 14 | + infer_next_declared_html_output_file, | |
| 15 | + planned_artifact_target_satisfied, | |
| 16 | +) | |
| 9 | 17 | from .parsing import parse_tool_calls |
| 18 | +from .workflow import effective_pending_todo_items, reconcile_aggregate_completion_steps | |
| 19 | + | |
| 20 | +_SPECIAL_DOD_ITEMS = { | |
| 21 | + "Complete the requested work", | |
| 22 | + "Collect verification evidence", | |
| 23 | +} | |
| 24 | +_LATE_STAGE_EMPTY_RETRY_EXTRA = 2 | |
| 25 | +_WORKING_NOTE_TOOL_NAMES = ( | |
| 26 | + "notepad_write_working", | |
| 27 | + "notepad_append", | |
| 28 | + "notepad_write_priority", | |
| 29 | + "notepad_write_manual", | |
| 30 | +) | |
| 31 | +_MUTATION_TODO_HINTS = ( | |
| 32 | + "create", | |
| 33 | + "creating", | |
| 34 | + "update", | |
| 35 | + "updating", | |
| 36 | + "edit", | |
| 37 | + "editing", | |
| 38 | + "write", | |
| 39 | + "writing", | |
| 40 | + "fix", | |
| 41 | + "fixing", | |
| 42 | + "modify", | |
| 43 | + "modifying", | |
| 44 | + "change", | |
| 45 | + "changing", | |
| 46 | + "patch", | |
| 47 | + "patching", | |
| 48 | + "replace", | |
| 49 | + "replacing", | |
| 50 | + "correct", | |
| 51 | + "correcting", | |
| 52 | + "rewrite", | |
| 53 | + "rewriting", | |
| 54 | +) | |
| 55 | +_CONSISTENCY_REVIEW_HINTS = ( | |
| 56 | + "consistent", | |
| 57 | + "consistently", | |
| 58 | + "formatted", | |
| 59 | + "link", | |
| 60 | + "linked", | |
| 61 | + "navigation", | |
| 62 | + "work properly", | |
| 63 | + "all files", | |
| 64 | + "every file", | |
| 65 | +) | |
| 10 | 66 | |
| 11 | 67 | |
| 12 | 68 | @dataclass(slots=True) |
@@ -52,29 +108,37 @@ class ResponseRepairer: | ||
| 52 | 108 | original_task: str | None, |
| 53 | 109 | empty_retry_count: int, |
| 54 | 110 | max_empty_retries: int, |
| 111 | + dod: DefinitionOfDone | None = None, | |
| 55 | 112 | ) -> EmptyResponseDecision: |
| 56 | 113 | """Return the next action when the assistant responds with empty content.""" |
| 57 | 114 | |
| 58 | - _ = task, original_task, max_empty_retries | |
| 59 | - if empty_retry_count == 1: | |
| 115 | + _ = task, original_task | |
| 116 | + effective_max_empty_retries = self._effective_max_empty_retries( | |
| 117 | + dod, | |
| 118 | + base_max_empty_retries=max_empty_retries, | |
| 119 | + ) | |
| 120 | + if empty_retry_count <= effective_max_empty_retries: | |
| 60 | 121 | return EmptyResponseDecision( |
| 61 | 122 | should_continue=True, |
| 62 | 123 | reason_code="empty_response_retry", |
| 63 | - reason_summary="retried after the assistant returned an empty response", | |
| 64 | - retry_message=( | |
| 65 | - "[EMPTY ASSISTANT RESPONSE]\n" | |
| 66 | - "Your last response was empty. Respond directly to the task " | |
| 67 | - "or call tools if needed. Do not return an empty response." | |
| 124 | + reason_summary=( | |
| 125 | + "retried after the assistant returned an empty response" | |
| 126 | + ), | |
| 127 | + retry_message=self._build_empty_response_retry_message( | |
| 128 | + dod, | |
| 129 | + retry_number=empty_retry_count, | |
| 130 | + max_empty_retries=effective_max_empty_retries, | |
| 68 | 131 | ), |
| 69 | 132 | ) |
| 70 | 133 | |
| 71 | 134 | return EmptyResponseDecision( |
| 72 | 135 | should_continue=False, |
| 73 | 136 | reason_code="empty_response_retry_exhausted", |
| 74 | - reason_summary="stopped after the assistant returned empty responses twice", | |
| 137 | + reason_summary="stopped after the assistant returned empty responses repeatedly", | |
| 75 | 138 | final_response=( |
| 76 | - "I didn't get a usable response from the model after retrying once. " | |
| 77 | - "Please try again or switch to a different backend/model." | |
| 139 | + "I didn't get a usable response from the model after " | |
| 140 | + f"retrying {effective_max_empty_retries} times. Please try again or " | |
| 141 | + "switch to a different backend/model." | |
| 78 | 142 | ), |
| 79 | 143 | failure="assistant returned empty output repeatedly", |
| 80 | 144 | ) |
@@ -167,3 +231,383 @@ class ResponseRepairer: | ||
| 167 | 231 | allowed_tool_names=allowed_tool_names, |
| 168 | 232 | ) |
| 169 | 233 | return parsed.tool_calls |
| 234 | + | |
| 235 | + def _build_empty_response_retry_message( | |
| 236 | + self, | |
| 237 | + dod: DefinitionOfDone | None, | |
| 238 | + *, | |
| 239 | + retry_number: int, | |
| 240 | + max_empty_retries: int, | |
| 241 | + ) -> str: | |
| 242 | + progress_lines: list[str] = [] | |
| 243 | + if dod is not None: | |
| 244 | + reconcile_aggregate_completion_steps( | |
| 245 | + dod, | |
| 246 | + project_root=self.context.project_root, | |
| 247 | + ) | |
| 248 | + latest_working_note = self._latest_working_note() | |
| 249 | + if latest_working_note: | |
| 250 | + progress_lines.append( | |
| 251 | + "Latest working note: " + latest_working_note | |
| 252 | + ) | |
| 253 | + | |
| 254 | + planned_lines = self._planned_artifact_progress_lines(dod) | |
| 255 | + progress_lines.extend(planned_lines) | |
| 256 | + progress_lines.extend( | |
| 257 | + self._next_step_resume_lines( | |
| 258 | + dod, | |
| 259 | + retry_number=retry_number, | |
| 260 | + ) | |
| 261 | + ) | |
| 262 | + | |
| 263 | + touched = [ | |
| 264 | + f"`{Path(path).name or path}`" | |
| 265 | + for path in dod.touched_files[-3:] | |
| 266 | + if str(path).strip() | |
| 267 | + ] | |
| 268 | + if touched: | |
| 269 | + progress_lines.append( | |
| 270 | + "Confirmed touched files: " + ", ".join(touched) | |
| 271 | + ) | |
| 272 | + | |
| 273 | + completed = [ | |
| 274 | + item | |
| 275 | + for item in dod.completed_items | |
| 276 | + if item not in _SPECIAL_DOD_ITEMS | |
| 277 | + ] | |
| 278 | + if completed: | |
| 279 | + progress_lines.append( | |
| 280 | + "Confirmed completed work: " + "; ".join(completed[-2:]) | |
| 281 | + ) | |
| 282 | + | |
| 283 | + next_pending = next( | |
| 284 | + ( | |
| 285 | + item | |
| 286 | + for item in dod.pending_items | |
| 287 | + if item not in _SPECIAL_DOD_ITEMS | |
| 288 | + ), | |
| 289 | + None, | |
| 290 | + ) | |
| 291 | + if next_pending: | |
| 292 | + progress_lines.append(f"Next pending item: {next_pending}") | |
| 293 | + todo_refresh = self._todo_refresh_retry_line(dod) | |
| 294 | + if todo_refresh: | |
| 295 | + progress_lines.append(todo_refresh) | |
| 296 | + | |
| 297 | + if not progress_lines: | |
| 298 | + return ( | |
| 299 | + "[EMPTY ASSISTANT RESPONSE]\n" | |
| 300 | + f"Your last response was empty (retry {retry_number}/{max_empty_retries}). " | |
| 301 | + "Respond directly to the task " | |
| 302 | + "or call tools if needed. Do not return an empty response." | |
| 303 | + ) | |
| 304 | + | |
| 305 | + return "\n".join( | |
| 306 | + [ | |
| 307 | + "[EMPTY ASSISTANT RESPONSE]", | |
| 308 | + ( | |
| 309 | + "Your last response was empty " | |
| 310 | + f"(retry {retry_number}/{max_empty_retries}). Continue from the " | |
| 311 | + "confirmed progress below instead of restarting." | |
| 312 | + ), | |
| 313 | + *[f"- {line}" for line in progress_lines], | |
| 314 | + "", | |
| 315 | + "Respond directly to the task or call tools if needed. Do not return an empty response.", | |
| 316 | + ] | |
| 317 | + ) | |
| 318 | + | |
| 319 | + def _todo_refresh_retry_line(self, dod: DefinitionOfDone) -> str | None: | |
| 320 | + non_special_pending = [ | |
| 321 | + item for item in dod.pending_items if item not in _SPECIAL_DOD_ITEMS | |
| 322 | + ] | |
| 323 | + non_special_completed = [ | |
| 324 | + item for item in dod.completed_items if item not in _SPECIAL_DOD_ITEMS | |
| 325 | + ] | |
| 326 | + if len(dod.touched_files) < 2 and (len(non_special_pending) + len(non_special_completed)) < 3: | |
| 327 | + return None | |
| 328 | + return ( | |
| 329 | + "If the tracked steps are stale, refresh `TodoWrite` alongside the next " | |
| 330 | + "concrete mutation instead of spending a full turn on bookkeeping alone." | |
| 331 | + ) | |
| 332 | + | |
| 333 | + def _effective_max_empty_retries( | |
| 334 | + self, | |
| 335 | + dod: DefinitionOfDone | None, | |
| 336 | + *, | |
| 337 | + base_max_empty_retries: int, | |
| 338 | + ) -> int: | |
| 339 | + if dod is None: | |
| 340 | + return base_max_empty_retries | |
| 341 | + completed_artifacts, missing_artifacts = self._planned_artifact_counts(dod) | |
| 342 | + if completed_artifacts < 3 or missing_artifacts == 0: | |
| 343 | + return base_max_empty_retries | |
| 344 | + return base_max_empty_retries + _LATE_STAGE_EMPTY_RETRY_EXTRA | |
| 345 | + | |
| 346 | + def _planned_artifact_counts(self, dod: DefinitionOfDone) -> tuple[int, int]: | |
| 347 | + completed = 0 | |
| 348 | + missing = 0 | |
| 349 | + for target, expect_directory in collect_planned_artifact_targets( | |
| 350 | + dod, | |
| 351 | + project_root=self.context.project_root, | |
| 352 | + max_paths=12, | |
| 353 | + ): | |
| 354 | + if planned_artifact_target_satisfied( | |
| 355 | + dod, | |
| 356 | + target=target, | |
| 357 | + expect_directory=expect_directory, | |
| 358 | + project_root=self.context.project_root, | |
| 359 | + ): | |
| 360 | + completed += 1 | |
| 361 | + else: | |
| 362 | + missing += 1 | |
| 363 | + return completed, missing | |
| 364 | + | |
| 365 | + def _planned_artifact_progress_lines(self, dod: DefinitionOfDone) -> list[str]: | |
| 366 | + targets = collect_planned_artifact_targets( | |
| 367 | + dod, | |
| 368 | + project_root=self.context.project_root, | |
| 369 | + max_paths=12, | |
| 370 | + ) | |
| 371 | + if not targets: | |
| 372 | + return [] | |
| 373 | + | |
| 374 | + missing_labels = [ | |
| 375 | + self._format_artifact_label(target, expect_directory=expect_directory) | |
| 376 | + for target, expect_directory in targets | |
| 377 | + if not planned_artifact_target_satisfied( | |
| 378 | + dod, | |
| 379 | + target=target, | |
| 380 | + expect_directory=expect_directory, | |
| 381 | + project_root=self.context.project_root, | |
| 382 | + ) | |
| 383 | + ] | |
| 384 | + if not missing_labels: | |
| 385 | + return [] | |
| 386 | + | |
| 387 | + lines = [f"Next missing planned artifact: {missing_labels[0]}"] | |
| 388 | + first_missing_target, first_missing_is_directory = next( | |
| 389 | + ( | |
| 390 | + (target, expect_directory) | |
| 391 | + for target, expect_directory in targets | |
| 392 | + if not planned_artifact_target_satisfied( | |
| 393 | + dod, | |
| 394 | + target=target, | |
| 395 | + expect_directory=expect_directory, | |
| 396 | + project_root=self.context.project_root, | |
| 397 | + ) | |
| 398 | + ), | |
| 399 | + (None, False), | |
| 400 | + ) | |
| 401 | + if first_missing_target is not None and first_missing_is_directory: | |
| 402 | + next_output_file = infer_next_declared_html_output_file( | |
| 403 | + target=first_missing_target, | |
| 404 | + project_root=self.context.project_root, | |
| 405 | + ) | |
| 406 | + if next_output_file is not None: | |
| 407 | + lines.append( | |
| 408 | + "Next declared output under " | |
| 409 | + f"{self._format_artifact_label(first_missing_target, expect_directory=True)}: " | |
| 410 | + f"{self._format_artifact_label(next_output_file, expect_directory=False)}" | |
| 411 | + ) | |
| 412 | + if len(missing_labels) > 1: | |
| 413 | + preview = ", ".join(missing_labels[:3]) | |
| 414 | + if len(missing_labels) > 3: | |
| 415 | + preview += ", ..." | |
| 416 | + lines.append("Remaining planned artifacts: " + preview) | |
| 417 | + return lines | |
| 418 | + | |
    def _next_step_resume_lines(
        self,
        dod: DefinitionOfDone,
        *,
        retry_number: int,
    ) -> list[str]:
        """Return concrete "resume here" instructions for the retry message.

        Branch priority (each branch returns immediately):
        1. No artifacts created yet and the next pending todo is a
           non-mutation, non-review step -> steer toward that discovery step.
        2. Otherwise, the first unsatisfied planned artifact target drives a
           tailored mutation instruction (directory with an inferable next
           output file, existing directory, missing directory, or plain file).
        Returns [] when every planned artifact target is satisfied.
        """
        completed_artifacts, _ = self._planned_artifact_counts(dod)
        # First non-special pending todo item, if any.
        next_pending = next(
            (
                item
                for item in effective_pending_todo_items(
                    dod,
                    project_root=self.context.project_root,
                )
                if item not in _SPECIAL_DOD_ITEMS
            ),
            None,
        )
        # Branch 1: nothing built yet and the pending step is pure discovery.
        if (
            completed_artifacts == 0
            and next_pending
            and not _todo_is_mutation_step(next_pending)
            and not _todo_is_consistency_review_step(next_pending)
        ):
            lines = [f"Resume with this exact next step: advance `{next_pending}`."]
            lines.append(
                "Make the next response one concrete evidence-gathering tool call that "
                "directly advances that step."
            )
            lines.append(
                "Do not jump ahead to later artifact creation, verification, or a "
                "completion summary until that discovery step is satisfied."
            )
            # Harder wording once the model has already burned >= 2 retries.
            if retry_number >= 2:
                lines.append(
                    "Do not restart from scratch or return another working note; emit the "
                    "next evidence-gathering tool call now."
                )
            else:
                lines.append(
                    "Do not restart from scratch unless one specific missing fact blocks "
                    "that discovery step."
                )
            return lines

        # Branch 2: walk planned targets in order; the first unsatisfied one
        # produces the instructions and terminates the loop via return.
        for target, expect_directory in collect_planned_artifact_targets(
            dod,
            project_root=self.context.project_root,
            max_paths=12,
        ):
            if planned_artifact_target_satisfied(
                dod,
                target=target,
                expect_directory=expect_directory,
                project_root=self.context.project_root,
            ):
                continue
            label = self._format_artifact_label(
                target,
                expect_directory=expect_directory,
            )
            if expect_directory:
                # Directory target: prefer naming the exact next declared
                # output file inside it when one can be inferred.
                next_output_file = infer_next_declared_html_output_file(
                    target=target,
                    project_root=self.context.project_root,
                )
                if next_output_file is not None:
                    next_output_label = self._format_artifact_label(
                        next_output_file,
                        expect_directory=False,
                    )
                    if next_pending and _todo_is_mutation_step(next_pending):
                        lines = [
                            "Resume with this exact next step: continue "
                            f"`{next_pending}` by creating {next_output_label}."
                        ]
                    else:
                        lines = [
                            "Resume with this exact next step: create "
                            f"{next_output_label}."
                        ]
                    lines.append(
                        f"It is the next missing declared output under {label}."
                    )
                    lines.append(
                        f"Prefer one `write` call for `{next_output_file}` before more research."
                    )
                    if not next_output_file.parent.exists():
                        # Pre-empt a wasted turn on mkdir: `write` creates parents.
                        lines.append(
                            "The `write` tool can create that file's parent directories "
                            "automatically, so do the write in one step instead of stopping "
                            "for a separate mkdir."
                        )
                    if retry_number >= 2:
                        lines.append(
                            "Do not restart discovery; emit the next mutation tool call now."
                        )
                    else:
                        lines.append(
                            "Do not restart discovery unless one specific missing fact blocks this step."
                        )
                    return lines
            if expect_directory and target.is_dir():
                # Directory exists but is not yet satisfied: ask for the next
                # output file inside it (no specific file could be inferred).
                if next_pending and _todo_is_mutation_step(next_pending):
                    lines = [
                        "Resume with this exact next step: continue "
                        f"`{next_pending}` by creating the next output file under {label}."
                    ]
                else:
                    lines = [
                        "Resume with this exact next step: create the next output file "
                        f"under {label}."
                    ]
                lines.append(
                    f"Prefer one concrete `write` call for a file inside `{target}` before more research."
                )
            else:
                # Missing directory, or a plain file target.
                lines = [f"Resume with this exact next step: create {label}."]
                if expect_directory and not target.is_dir():
                    lines.append(
                        f"Prefer one concrete directory-creation step for `{target}` before more research."
                    )
                elif not expect_directory:
                    lines.append(
                        f"Prefer one `write` call for `{target}` before any more reference reads."
                    )
                    if not target.parent.exists():
                        lines.append(
                            "The `write` tool can create that file's parent directories "
                            "automatically, so do the write in one step instead of stopping "
                            "for a separate mkdir."
                        )
            # Shared tail for the non-inferred-output branches above.
            lines.append(
                "Shape the next response as one concrete `write(file_path=..., "
                "content=...)` tool call for that exact path."
            )
            if completed_artifacts >= 3:
                lines.append(
                    "Follow the same one-file-at-a-time mutation pattern that already "
                    "created the confirmed planned artifacts."
                )
            lines.append(
                "Your next response should be the concrete mutation tool call itself, "
                "not TodoWrite alone, verification, or a completion summary."
            )
            if retry_number >= 2:
                lines.append(
                    "Do not restart discovery; emit the next mutation tool call now."
                )
            else:
                lines.append(
                    "Do not restart discovery unless one specific missing fact blocks this step."
                )
            return lines
        return []
| 574 | + | |
| 575 | + @staticmethod | |
| 576 | + def _format_artifact_label(path: Path, *, expect_directory: bool) -> str: | |
| 577 | + label = path.name or str(path) | |
| 578 | + if expect_directory and not label.endswith("/"): | |
| 579 | + label += "/" | |
| 580 | + return f"`{label}`" | |
| 581 | + | |
| 582 | + def _latest_working_note(self) -> str | None: | |
| 583 | + messages = list(getattr(self.context.session, "messages", []) or []) | |
| 584 | + for message in reversed(messages): | |
| 585 | + content = str(getattr(message, "content", "") or "").strip() | |
| 586 | + if not content: | |
| 587 | + continue | |
| 588 | + for tool_name in _WORKING_NOTE_TOOL_NAMES: | |
| 589 | + prefix = f"Observation [{tool_name}]: Result:" | |
| 590 | + if prefix not in content: | |
| 591 | + continue | |
| 592 | + note = content.split(prefix, 1)[1].strip() | |
| 593 | + if not note: | |
| 594 | + continue | |
| 595 | + first_line = next( | |
| 596 | + (line.strip() for line in note.splitlines() if line.strip()), | |
| 597 | + "", | |
| 598 | + ) | |
| 599 | + if not first_line: | |
| 600 | + continue | |
| 601 | + first_line = re.sub(r"^-\s*\[[^\]]+\]\s*", "", first_line).strip() | |
| 602 | + return first_line or None | |
| 603 | + return None | |
| 604 | + | |
| 605 | + | |
def _todo_is_mutation_step(label: str) -> bool:
    """Return whether the todo label describes an artifact-writing step."""
    folded = label.lower()
    return any(hint in folded for hint in _MUTATION_TODO_HINTS)
| 609 | + | |
| 610 | + | |
def _todo_is_consistency_review_step(label: str) -> bool:
    """Return whether the todo label describes a consistency-review step."""
    folded = label.lower()
    return any(hint in folded for hint in _CONSISTENCY_REVIEW_HINTS)
src/loader/runtime/repair_focus.pyadded@@ -0,0 +1,132 @@ | ||
| 1 | +"""Shared helpers for extracting and enforcing active repair focus.""" | |
| 2 | + | |
| 3 | +from __future__ import annotations | |
| 4 | + | |
| 5 | +import re | |
| 6 | +from dataclasses import dataclass | |
| 7 | +from os import sep | |
| 8 | +from pathlib import Path | |
| 9 | + | |
| 10 | +from ..llm.base import Message | |
| 11 | + | |
| 12 | + | |
@dataclass(frozen=True)
class ActiveRepairContext:
    """Concrete repair focus extracted from recent verification feedback."""

    # Normalized path from the "Immediate next step: edit `...`" bullet
    # ("" when no such bullet was found).
    artifact_path: str
    # The captured "- ..." bullet lines of the "Repair focus:" section, verbatim.
    # NOTE(review): a mutable list field inside a frozen dataclass — instances
    # are not deeply immutable or safely hashable; tuple[str, ...] would be
    # stricter, but changing it would alter the public field type.
    repair_lines: list[str]
    # Normalized absolute paths mentioned in the bullets, existing files first.
    allowed_paths: tuple[str, ...]
    # Collapsed parent directories of allowed_paths (no root nested in another).
    allowed_roots: tuple[str, ...]
| 21 | + | |
| 22 | + | |
def extract_active_repair_context(
    messages: list[Message],
) -> ActiveRepairContext | None:
    """Return the most recent concrete repair target from session history.

    Scans messages newest-first for a line that is exactly ``Repair focus:``
    and captures the ``- `` bullet lines that follow it.  The first message
    that yields at least one bullet wins; returns None when no message does.
    """

    for message in reversed(messages):
        content = str(getattr(message, "content", "") or "")
        # Cheap containment check before the line-by-line parse.
        if "Repair focus:" not in content:
            continue

        repair_lines: list[str] = []
        artifact_path = ""
        absolute_paths: list[str] = []
        capture = False
        for raw_line in content.splitlines():
            line = raw_line.strip()
            if not capture:
                # Capture starts only at an exact (stripped) "Repair focus:" line.
                if line == "Repair focus:":
                    capture = True
                continue
            if not line:
                # A blank line ends the bullet list once bullets were seen.
                if repair_lines:
                    break
                continue
            if not line.startswith("- "):
                # A non-bullet line likewise terminates an in-progress list.
                if repair_lines:
                    break
                continue

            repair_lines.append(line)
            # First "Immediate next step: edit `...`" bullet fixes the artifact.
            if not artifact_path:
                match = re.search(r"Immediate next step: edit `([^`]+)`", line)
                if match:
                    artifact_path = normalize_repair_path(match.group(1))

            # Collect every backticked absolute (or ~-relative) path, deduped
            # in first-seen order.
            for candidate in re.findall(r"`([^`]+)`", line):
                if not candidate.startswith(("/", "~")):
                    continue
                normalized = normalize_repair_path(candidate)
                if normalized not in absolute_paths:
                    absolute_paths.append(normalized)

        if repair_lines:
            if artifact_path:
                # The named artifact always belongs to the allowed set.
                if artifact_path not in absolute_paths:
                    absolute_paths.insert(0, artifact_path)
            # Existing paths sort before missing ones (False < True).
            allowed_paths = tuple(
                sorted(
                    absolute_paths,
                    key=lambda item: (not Path(item).exists(), item),
                )
            )
            allowed_roots = _collapse_roots(_path_roots(set(absolute_paths)))
            return ActiveRepairContext(
                artifact_path=artifact_path,
                repair_lines=repair_lines,
                allowed_paths=allowed_paths,
                allowed_roots=allowed_roots,
            )
    return None
| 83 | + | |
| 84 | + | |
def path_within_allowed_roots(path: str, allowed_roots: tuple[str, ...]) -> bool:
    """Return whether the normalized path stays within the repair artifact set.

    A path qualifies when it equals one of the (normalized, non-blank) roots
    or sits anywhere beneath one of them.
    """

    candidate = normalize_repair_path(path)
    for raw_root in allowed_roots:
        if not str(raw_root).strip():
            continue
        root = normalize_repair_path(raw_root)
        if candidate == root or candidate.startswith(f"{root}{sep}"):
            return True
    return False
| 96 | + | |
| 97 | + | |
def path_matches_allowed_paths(path: str, allowed_paths: tuple[str, ...]) -> bool:
    """Return whether the normalized path matches one concrete repair file."""

    candidate = normalize_repair_path(path)
    for raw_allowed in allowed_paths:
        if not str(raw_allowed).strip():
            continue
        if normalize_repair_path(raw_allowed) == candidate:
            return True
    return False
| 106 | + | |
| 107 | + | |
def normalize_repair_path(raw_path: str) -> str:
    """Return an absolute, user-expanded form of *raw_path* ("" when blank).

    Falls back to the merely expanded form when resolution fails (e.g. a
    symlink loop or an invalid path on the current platform).
    """
    candidate = str(raw_path or "").strip()
    if not candidate:
        return ""
    expanded = Path(candidate).expanduser()
    try:
        return str(expanded.resolve(strict=False))
    except (OSError, RuntimeError, ValueError):
        return str(expanded)
| 116 | + | |
| 117 | + | |
| 118 | +def _path_roots(paths: set[str]) -> set[str]: | |
| 119 | + roots: set[str] = set() | |
| 120 | + for raw_path in paths: | |
| 121 | + path = Path(raw_path) | |
| 122 | + roots.add(str(path.parent)) | |
| 123 | + return roots | |
| 124 | + | |
| 125 | + | |
| 126 | +def _collapse_roots(roots: set[str]) -> tuple[str, ...]: | |
| 127 | + collapsed: list[str] = [] | |
| 128 | + for root in sorted(roots, key=lambda item: (len(item), item)): | |
| 129 | + if any(root == candidate or root.startswith(f"{candidate}{sep}") for candidate in collapsed): | |
| 130 | + continue | |
| 131 | + collapsed.append(root) | |
| 132 | + return tuple(collapsed) | |
src/loader/runtime/safeguard_services.pymodified@@ -8,8 +8,6 @@ from dataclasses import dataclass | ||
| 8 | 8 | from difflib import get_close_matches |
| 9 | 9 | from pathlib import Path |
| 10 | 10 | |
| 11 | -from .semantic_rules import html_toc as html_toc_rule | |
| 12 | - | |
| 13 | 11 | TEXT_REWRITE_SUFFIXES = frozenset( |
| 14 | 12 | { |
| 15 | 13 | ".c", |
@@ -145,7 +143,6 @@ class ActionTracker: | ||
| 145 | 143 | READ_REPEAT_THRESHOLD = 3 |
| 146 | 144 | SEARCH_REPEAT_THRESHOLD = 2 |
| 147 | 145 | BASH_OBSERVATION_REPEAT_THRESHOLD = 2 |
| 148 | - HTML_CHAPTER_EVIDENCE_THRESHOLD = 3 | |
| 149 | 146 | RECENT_PATH_CONTEXT_LIMIT = 12 |
| 150 | 147 | |
| 151 | 148 | def __init__(self) -> None: |
@@ -160,10 +157,7 @@ class ActionTracker: | ||
| 160 | 157 | self._recent_reads: dict[str, tuple[int, int, int]] = {} |
| 161 | 158 | self._recent_searches: dict[str, tuple[int, int, int]] = {} |
| 162 | 159 | self._recent_bash_observations: dict[str, tuple[int, int, int]] = {} |
| 163 | - self._recent_html_directory_reads: dict[str, tuple[int, set[str]]] = {} | |
| 164 | 160 | self._recent_path_contexts: list[str] = [] |
| 165 | - self._validated_html_tocs: dict[str, int] = {} | |
| 166 | - self._verified_html_inventory_dirs: set[str] = set() | |
| 167 | 161 | |
| 168 | 162 | def reset(self) -> None: |
| 169 | 163 | self._file_writes.clear() |
@@ -177,10 +171,7 @@ class ActionTracker: | ||
| 177 | 171 | self._recent_reads.clear() |
| 178 | 172 | self._recent_searches.clear() |
| 179 | 173 | self._recent_bash_observations.clear() |
| 180 | - self._recent_html_directory_reads.clear() | |
| 181 | 174 | self._recent_path_contexts.clear() |
| 182 | - self._validated_html_tocs.clear() | |
| 183 | - self._verified_html_inventory_dirs.clear() | |
| 184 | 175 | |
| 185 | 176 | def _normalize_path(self, path: str) -> str: |
| 186 | 177 | expanded = Path(path).expanduser() |
@@ -250,22 +241,6 @@ class ActionTracker: | ||
| 250 | 241 | def recent_path_contexts(self) -> list[str]: |
| 251 | 242 | return list(self._recent_path_contexts) |
| 252 | 243 | |
| 253 | - def note_validated_html_toc(self, index_path: str) -> None: | |
| 254 | - """Record that one index currently satisfies the semantic chapter-link check.""" | |
| 255 | - | |
| 256 | - normalized = self._normalize_path(index_path) | |
| 257 | - if not html_toc_rule.is_html_toc_index_path(normalized): | |
| 258 | - return | |
| 259 | - self._validated_html_tocs[normalized] = self._mutation_epoch | |
| 260 | - | |
| 261 | - def note_verified_html_inventory(self, index_path: str) -> None: | |
| 262 | - """Record that one sibling chapter inventory is already known exactly.""" | |
| 263 | - | |
| 264 | - normalized = self._normalize_path(index_path) | |
| 265 | - path = Path(normalized) | |
| 266 | - chapters_dir = path if html_toc_rule.is_html_toc_chapters_dir(path) else path.parent / "chapters" | |
| 267 | - self._verified_html_inventory_dirs.add(self._normalize_path(str(chapters_dir))) | |
| 268 | - | |
| 269 | 244 | def check_tool_call(self, tool_name: str, arguments: dict) -> tuple[bool, str]: |
| 270 | 245 | if tool_name == "write": |
| 271 | 246 | file_path = arguments.get("file_path", "") |
@@ -291,28 +266,8 @@ class ActionTracker: | ||
| 291 | 266 | return True, f"Same patch already applied to: {file_path}" |
| 292 | 267 | |
| 293 | 268 | elif tool_name == "read": |
| 294 | - inventory_duplicate, inventory_reason = self._check_verified_html_inventory_observation( | |
| 295 | - tool_name, | |
| 296 | - arguments, | |
| 297 | - ) | |
| 298 | - if inventory_duplicate: | |
| 299 | - return True, inventory_reason | |
| 300 | - validated_duplicate, validated_reason = self._check_validated_html_toc_observation( | |
| 301 | - tool_name, | |
| 302 | - arguments, | |
| 303 | - ) | |
| 304 | - if validated_duplicate: | |
| 305 | - return True, validated_reason | |
| 306 | 269 | read_key = self._make_read_key(arguments) |
| 307 | 270 | if read_key: |
| 308 | - sufficiency_duplicate, sufficiency_reason = ( | |
| 309 | - self._check_html_observation_sufficiency( | |
| 310 | - tool_name, | |
| 311 | - arguments, | |
| 312 | - ) | |
| 313 | - ) | |
| 314 | - if sufficiency_duplicate: | |
| 315 | - return True, sufficiency_reason | |
| 316 | 271 | duplicate, reason = self._check_recent_observation( |
| 317 | 272 | self._recent_reads, |
| 318 | 273 | read_key, |
@@ -328,28 +283,8 @@ class ActionTracker: | ||
| 328 | 283 | return True, reason |
| 329 | 284 | |
| 330 | 285 | elif tool_name in {"glob", "grep"}: |
| 331 | - inventory_duplicate, inventory_reason = self._check_verified_html_inventory_observation( | |
| 332 | - tool_name, | |
| 333 | - arguments, | |
| 334 | - ) | |
| 335 | - if inventory_duplicate: | |
| 336 | - return True, inventory_reason | |
| 337 | - validated_duplicate, validated_reason = self._check_validated_html_toc_observation( | |
| 338 | - tool_name, | |
| 339 | - arguments, | |
| 340 | - ) | |
| 341 | - if validated_duplicate: | |
| 342 | - return True, validated_reason | |
| 343 | 286 | observation_key = self._make_search_key(tool_name, arguments) |
| 344 | 287 | if observation_key: |
| 345 | - sufficiency_duplicate, sufficiency_reason = ( | |
| 346 | - self._check_html_observation_sufficiency( | |
| 347 | - tool_name, | |
| 348 | - arguments, | |
| 349 | - ) | |
| 350 | - ) | |
| 351 | - if sufficiency_duplicate: | |
| 352 | - return True, sufficiency_reason | |
| 353 | 288 | duplicate, reason = self._check_recent_observation( |
| 354 | 289 | self._recent_searches, |
| 355 | 290 | observation_key, |
@@ -365,18 +300,6 @@ class ActionTracker: | ||
| 365 | 300 | elif tool_name == "bash": |
| 366 | 301 | command = str(arguments.get("command", "")).strip() |
| 367 | 302 | if self._is_observational_bash(command): |
| 368 | - inventory_duplicate, inventory_reason = self._check_verified_html_inventory_observation( | |
| 369 | - tool_name, | |
| 370 | - arguments, | |
| 371 | - ) | |
| 372 | - if inventory_duplicate: | |
| 373 | - return True, inventory_reason | |
| 374 | - validated_duplicate, validated_reason = self._check_validated_html_toc_observation( | |
| 375 | - tool_name, | |
| 376 | - arguments, | |
| 377 | - ) | |
| 378 | - if validated_duplicate: | |
| 379 | - return True, validated_reason | |
| 380 | 303 | duplicate, reason = self._check_recent_observation( |
| 381 | 304 | self._recent_bash_observations, |
| 382 | 305 | self._normalize_command(command), |
@@ -406,7 +329,6 @@ class ActionTracker: | ||
| 406 | 329 | if file_path: |
| 407 | 330 | self.record_file_create(file_path, content) |
| 408 | 331 | self._record_path_context(file_path) |
| 409 | - self._clear_verified_html_inventory_for_path(file_path) | |
| 410 | 332 | self._note_mutation() |
| 411 | 333 | |
| 412 | 334 | elif tool_name == "edit": |
@@ -416,7 +338,6 @@ class ActionTracker: | ||
| 416 | 338 | if file_path: |
| 417 | 339 | self.record_edit(file_path, old_string, new_string) |
| 418 | 340 | self._record_path_context(file_path) |
| 419 | - self._clear_verified_html_inventory_for_path(file_path) | |
| 420 | 341 | self._note_mutation() |
| 421 | 342 | |
| 422 | 343 | elif tool_name == "patch": |
@@ -429,7 +350,6 @@ class ActionTracker: | ||
| 429 | 350 | elif isinstance(raw_patch, str) and raw_patch.strip(): |
| 430 | 351 | self.record_edit(file_path, raw_patch, "raw_patch") |
| 431 | 352 | self._record_path_context(file_path) |
| 432 | - self._clear_verified_html_inventory_for_path(file_path) | |
| 433 | 353 | self._note_mutation() |
| 434 | 354 | |
| 435 | 355 | elif tool_name == "read": |
@@ -442,7 +362,6 @@ class ActionTracker: | ||
| 442 | 362 | file_path = str(arguments.get("file_path", "")).strip() |
| 443 | 363 | if file_path: |
| 444 | 364 | self._record_path_context(file_path) |
| 445 | - self._record_html_directory_read(arguments) | |
| 446 | 365 | |
| 447 | 366 | elif tool_name in {"glob", "grep"}: |
| 448 | 367 | observation_key = self._make_search_key(tool_name, arguments) |
@@ -460,9 +379,6 @@ class ActionTracker: | ||
| 460 | 379 | if command: |
| 461 | 380 | self.record_command(command) |
| 462 | 381 | if self._is_mutating_bash(command): |
| 463 | - target = extract_shell_text_rewrite_target(command) | |
| 464 | - if target: | |
| 465 | - self._clear_verified_html_inventory_for_path(target) | |
| 466 | 382 | self._note_mutation() |
| 467 | 383 | elif self._is_observational_bash(command): |
| 468 | 384 | self._record_observation( |
@@ -678,230 +594,6 @@ class ActionTracker: | ||
| 678 | 594 | if len(self._recent_path_contexts) > self.RECENT_PATH_CONTEXT_LIMIT: |
| 679 | 595 | del self._recent_path_contexts[self.RECENT_PATH_CONTEXT_LIMIT :] |
| 680 | 596 | |
| 681 | - def _record_html_directory_read(self, arguments: dict) -> None: | |
| 682 | - file_path = str(arguments.get("file_path", "")).strip() | |
| 683 | - if not file_path: | |
| 684 | - return | |
| 685 | - normalized_path = self._normalize_path(file_path) | |
| 686 | - path = Path(normalized_path) | |
| 687 | - if not html_toc_rule.is_html_toc_chapter_file(path): | |
| 688 | - return | |
| 689 | - | |
| 690 | - directory = str(path.parent) | |
| 691 | - last_seen = self._recent_html_directory_reads.get(directory) | |
| 692 | - if last_seen is None or last_seen[0] != self._mutation_epoch: | |
| 693 | - self._recent_html_directory_reads[directory] = ( | |
| 694 | - self._mutation_epoch, | |
| 695 | - {path.name}, | |
| 696 | - ) | |
| 697 | - return | |
| 698 | - | |
| 699 | - _, seen_files = last_seen | |
| 700 | - updated = set(seen_files) | |
| 701 | - updated.add(path.name) | |
| 702 | - self._recent_html_directory_reads[directory] = ( | |
| 703 | - self._mutation_epoch, | |
| 704 | - updated, | |
| 705 | - ) | |
| 706 | - | |
| 707 | - def _check_html_observation_sufficiency( | |
| 708 | - self, | |
| 709 | - tool_name: str, | |
| 710 | - arguments: dict, | |
| 711 | - ) -> tuple[bool, str]: | |
| 712 | - if tool_name == "read": | |
| 713 | - file_path = str(arguments.get("file_path", "")).strip() | |
| 714 | - if not file_path: | |
| 715 | - return False, "" | |
| 716 | - normalized_path = self._normalize_path(file_path) | |
| 717 | - path = Path(normalized_path) | |
| 718 | - if not html_toc_rule.is_html_toc_index_path(path): | |
| 719 | - return False, "" | |
| 720 | - chapters_dir = str(path.parent / "chapters") | |
| 721 | - chapter_count = self._chapter_evidence_count(chapters_dir) | |
| 722 | - if chapter_count < self.HTML_CHAPTER_EVIDENCE_THRESHOLD: | |
| 723 | - return False, "" | |
| 724 | - read_key = self._make_read_key(arguments) | |
| 725 | - if read_key is None: | |
| 726 | - return False, "" | |
| 727 | - last_seen = self._recent_reads.get(read_key) | |
| 728 | - if last_seen is None: | |
| 729 | - return False, "" | |
| 730 | - _, _, repeat_count = last_seen | |
| 731 | - if repeat_count < 2: | |
| 732 | - return False, "" | |
| 733 | - return ( | |
| 734 | - True, | |
| 735 | - "Already confirmed multiple linked chapter files in " | |
| 736 | - f"{html_toc_rule.describe_html_toc_chapters_dir(path)}; reuse that file/title " | |
| 737 | - f"evidence and update {html_toc_rule.describe_html_toc_target(path)} instead of " | |
| 738 | - "rereading it", | |
| 739 | - ) | |
| 740 | - | |
| 741 | - if tool_name in {"glob", "grep"}: | |
| 742 | - search_path = str(arguments.get("path", "")).strip() | |
| 743 | - if not search_path: | |
| 744 | - return False, "" | |
| 745 | - normalized_path = self._normalize_path(search_path) | |
| 746 | - path = Path(normalized_path) | |
| 747 | - if not html_toc_rule.is_html_toc_chapters_dir(path): | |
| 748 | - return False, "" | |
| 749 | - chapter_count = self._chapter_evidence_count(str(path)) | |
| 750 | - if chapter_count < self.HTML_CHAPTER_EVIDENCE_THRESHOLD: | |
| 751 | - return False, "" | |
| 752 | - observation_key = self._make_search_key(tool_name, arguments) | |
| 753 | - if observation_key is None or observation_key not in self._recent_searches: | |
| 754 | - return False, "" | |
| 755 | - return ( | |
| 756 | - True, | |
| 757 | - "Already confirmed multiple linked chapter files in " | |
| 758 | - f"{html_toc_rule.describe_html_toc_chapters_dir(path)}; reuse that filename/title " | |
| 759 | - f"evidence and update {html_toc_rule.describe_html_toc_target(path)} instead of " | |
| 760 | - "rerunning the directory search", | |
| 761 | - ) | |
| 762 | - | |
| 763 | - return False, "" | |
| 764 | - | |
| 765 | - def _chapter_evidence_count(self, directory: str) -> int: | |
| 766 | - last_seen = self._recent_html_directory_reads.get(directory) | |
| 767 | - if last_seen is None: | |
| 768 | - return 0 | |
| 769 | - last_epoch, seen_files = last_seen | |
| 770 | - if last_epoch != self._mutation_epoch: | |
| 771 | - return 0 | |
| 772 | - return len(seen_files) | |
| 773 | - | |
| 774 | - def _check_validated_html_toc_observation( | |
| 775 | - self, | |
| 776 | - tool_name: str, | |
| 777 | - arguments: dict, | |
| 778 | - ) -> tuple[bool, str]: | |
| 779 | - related_paths = self._validated_html_related_paths(tool_name, arguments) | |
| 780 | - if not related_paths: | |
| 781 | - return False, "" | |
| 782 | - | |
| 783 | - for path in related_paths: | |
| 784 | - if self._matches_validated_html_toc(path): | |
| 785 | - return ( | |
| 786 | - True, | |
| 787 | - html_toc_rule.build_validated_html_toc_observation_reason(path), | |
| 788 | - ) | |
| 789 | - return False, "" | |
| 790 | - | |
| 791 | - def _check_verified_html_inventory_observation( | |
| 792 | - self, | |
| 793 | - tool_name: str, | |
| 794 | - arguments: dict, | |
| 795 | - ) -> tuple[bool, str]: | |
| 796 | - related_paths = self._verified_inventory_related_paths(tool_name, arguments) | |
| 797 | - if not related_paths: | |
| 798 | - return False, "" | |
| 799 | - | |
| 800 | - for path in related_paths: | |
| 801 | - if self._matches_verified_html_inventory(path): | |
| 802 | - return ( | |
| 803 | - True, | |
| 804 | - html_toc_rule.build_verified_html_inventory_observation_reason(path), | |
| 805 | - ) | |
| 806 | - return False, "" | |
| 807 | - | |
| 808 | - def _validated_html_related_paths( | |
| 809 | - self, | |
| 810 | - tool_name: str, | |
| 811 | - arguments: dict, | |
| 812 | - ) -> list[str]: | |
| 813 | - if tool_name == "read": | |
| 814 | - file_path = str(arguments.get("file_path", "")).strip() | |
| 815 | - return [self._normalize_path(file_path)] if file_path else [] | |
| 816 | - | |
| 817 | - if tool_name in {"glob", "grep"}: | |
| 818 | - search_path = str(arguments.get("path", "")).strip() | |
| 819 | - return [self._normalize_path(search_path)] if search_path else [] | |
| 820 | - | |
| 821 | - if tool_name == "bash": | |
| 822 | - command = str(arguments.get("command", "")).strip() | |
| 823 | - if not command: | |
| 824 | - return [] | |
| 825 | - return self._extract_observational_bash_paths(command) | |
| 826 | - | |
| 827 | - return [] | |
| 828 | - | |
| 829 | - def _verified_inventory_related_paths( | |
| 830 | - self, | |
| 831 | - tool_name: str, | |
| 832 | - arguments: dict, | |
| 833 | - ) -> list[str]: | |
| 834 | - if tool_name == "read": | |
| 835 | - file_path = str(arguments.get("file_path", "")).strip() | |
| 836 | - return [self._normalize_path(file_path)] if file_path else [] | |
| 837 | - | |
| 838 | - if tool_name in {"glob", "grep"}: | |
| 839 | - search_path = str(arguments.get("path", "")).strip() | |
| 840 | - return [self._normalize_path(search_path)] if search_path else [] | |
| 841 | - | |
| 842 | - if tool_name == "bash": | |
| 843 | - command = str(arguments.get("command", "")).strip() | |
| 844 | - if not command: | |
| 845 | - return [] | |
| 846 | - return self._extract_observational_bash_paths(command) | |
| 847 | - | |
| 848 | - return [] | |
| 849 | - | |
| 850 | - def _matches_validated_html_toc(self, path: str) -> bool: | |
| 851 | - normalized = self._normalize_path(path) | |
| 852 | - candidate = Path(normalized) | |
| 853 | - for index_path, epoch in self._validated_html_tocs.items(): | |
| 854 | - if epoch != self._mutation_epoch: | |
| 855 | - continue | |
| 856 | - index = Path(index_path) | |
| 857 | - chapters = Path(self._normalize_path(str(index.parent / "chapters"))) | |
| 858 | - if candidate == index or candidate == chapters: | |
| 859 | - return True | |
| 860 | - if candidate.parent == chapters: | |
| 861 | - return True | |
| 862 | - return False | |
| 863 | - | |
| 864 | - def _matches_verified_html_inventory(self, path: str) -> bool: | |
| 865 | - normalized = self._normalize_path(path) | |
| 866 | - candidate = Path(normalized) | |
| 867 | - for directory in self._verified_html_inventory_dirs: | |
| 868 | - chapters = Path(directory) | |
| 869 | - if candidate == chapters or candidate.parent == chapters: | |
| 870 | - return True | |
| 871 | - return False | |
| 872 | - | |
| 873 | - def _clear_verified_html_inventory_for_path(self, path_value: str) -> None: | |
| 874 | - normalized = self._normalize_path(path_value) | |
| 875 | - candidate = Path(normalized) | |
| 876 | - stale: set[str] = set() | |
| 877 | - for directory in self._verified_html_inventory_dirs: | |
| 878 | - chapters = Path(directory) | |
| 879 | - if candidate == chapters or candidate.parent == chapters: | |
| 880 | - stale.add(directory) | |
| 881 | - self._verified_html_inventory_dirs.difference_update(stale) | |
| 882 | - | |
| 883 | - def _extract_observational_bash_paths(self, command: str) -> list[str]: | |
| 884 | - norm_cmd = self._normalize_command(command) | |
| 885 | - try: | |
| 886 | - argv = shlex.split(norm_cmd) | |
| 887 | - except ValueError: | |
| 888 | - return [] | |
| 889 | - if not argv: | |
| 890 | - return [] | |
| 891 | - | |
| 892 | - paths: list[str] = [] | |
| 893 | - for token in argv[1:]: | |
| 894 | - candidate = _strip_shell_token(token) | |
| 895 | - if not candidate or candidate.startswith("-"): | |
| 896 | - continue | |
| 897 | - if any(marker in candidate for marker in ("/", "~")) or Path(candidate).suffix == ".html": | |
| 898 | - paths.append(self._normalize_path(candidate)) | |
| 899 | - continue | |
| 900 | - if candidate.rstrip("/").endswith("chapters"): | |
| 901 | - paths.append(self._normalize_path(candidate)) | |
| 902 | - return paths | |
| 903 | - | |
| 904 | - | |
| 905 | 597 | @dataclass |
| 906 | 598 | class ValidationResult: |
| 907 | 599 | """Result of pre-action validation.""" |
@@ -1023,6 +715,10 @@ class PreActionValidator: | ||
| 1023 | 715 | if not path_result.valid: |
| 1024 | 716 | return path_result |
| 1025 | 717 | |
| 718 | + sibling_result = self._validate_numbered_sibling_conflict(str(file_path)) | |
| 719 | + if not sibling_result.valid: | |
| 720 | + return sibling_result | |
| 721 | + | |
| 1026 | 722 | if content is None or (isinstance(content, str) and not content.strip()): |
| 1027 | 723 | return ValidationResult( |
| 1028 | 724 | valid=True, |
@@ -1040,6 +736,13 @@ class PreActionValidator: | ||
| 1040 | 736 | severity="block", |
| 1041 | 737 | ) |
| 1042 | 738 | |
| 739 | + html_declared_target_result = self._validate_html_declared_target_set( | |
| 740 | + str(file_path), | |
| 741 | + str(content), | |
| 742 | + ) | |
| 743 | + if not html_declared_target_result.valid: | |
| 744 | + return html_declared_target_result | |
| 745 | + | |
| 1043 | 746 | return ValidationResult(valid=True) |
| 1044 | 747 | |
| 1045 | 748 | def _validate_edit(self, arguments: dict) -> ValidationResult: |
@@ -1087,6 +790,13 @@ class PreActionValidator: | ||
| 1087 | 790 | if not html_index_result.valid: |
| 1088 | 791 | return html_index_result |
| 1089 | 792 | |
| 793 | + html_declared_target_result = self._validate_html_declared_target_set( | |
| 794 | + str(file_path), | |
| 795 | + str(new_string), | |
| 796 | + ) | |
| 797 | + if not html_declared_target_result.valid: | |
| 798 | + return html_declared_target_result | |
| 799 | + | |
| 1090 | 800 | return ValidationResult(valid=True) |
| 1091 | 801 | |
| 1092 | 802 | def _validate_patch(self, arguments: dict) -> ValidationResult: |
@@ -1106,6 +816,10 @@ class PreActionValidator: | ||
| 1106 | 816 | if not path_result.valid: |
| 1107 | 817 | return path_result |
| 1108 | 818 | |
| 819 | + sibling_result = self._validate_numbered_sibling_conflict(str(file_path)) | |
| 820 | + if not sibling_result.valid: | |
| 821 | + return sibling_result | |
| 822 | + | |
| 1109 | 823 | has_hunks = isinstance(hunks, list) and bool(hunks) |
| 1110 | 824 | has_raw_patch = isinstance(raw_patch, str) and bool(raw_patch.strip()) |
| 1111 | 825 | if not has_hunks and not has_raw_patch: |
@@ -1118,6 +832,42 @@ class PreActionValidator: | ||
| 1118 | 832 | |
| 1119 | 833 | return ValidationResult(valid=True) |
| 1120 | 834 | |
| 835 | + def _validate_numbered_sibling_conflict(self, file_path: str) -> ValidationResult: | |
| 836 | + path = Path(file_path).expanduser() | |
| 837 | + if path.exists() or not path.suffix or not path.parent.exists(): | |
| 838 | + return ValidationResult(valid=True) | |
| 839 | + | |
| 840 | + prefix_match = re.match(r"^(\d+)[-_]", path.name) | |
| 841 | + if prefix_match is None: | |
| 842 | + return ValidationResult(valid=True) | |
| 843 | + | |
| 844 | + prefix = prefix_match.group(1) | |
| 845 | + siblings = sorted( | |
| 846 | + candidate | |
| 847 | + for candidate in path.parent.iterdir() | |
| 848 | + if ( | |
| 849 | + candidate.is_file() | |
| 850 | + and candidate.suffix == path.suffix | |
| 851 | + and candidate.name != path.name | |
| 852 | + and re.match(rf"^{re.escape(prefix)}[-_]", candidate.name) | |
| 853 | + ) | |
| 854 | + ) | |
| 855 | + if not siblings: | |
| 856 | + return ValidationResult(valid=True) | |
| 857 | + | |
| 858 | + preview = ", ".join(candidate.name for candidate in siblings[:3]) | |
| 859 | + if len(siblings) > 3: | |
| 860 | + preview += ", ..." | |
| 861 | + return ValidationResult( | |
| 862 | + valid=False, | |
| 863 | + reason="New file conflicts with an existing numbered sibling", | |
| 864 | + suggestion=( | |
| 865 | + f"Reuse the confirmed numbered file in `{path.parent}` instead of " | |
| 866 | + f"creating an alternate filename for step {prefix}, for example: {preview}" | |
| 867 | + ), | |
| 868 | + severity="error", | |
| 869 | + ) | |
| 870 | + | |
| 1121 | 871 | def _validate_read(self, arguments: dict) -> ValidationResult: |
| 1122 | 872 | file_path = arguments.get("file_path", "") |
| 1123 | 873 | |
@@ -1129,7 +879,19 @@ class PreActionValidator: | ||
| 1129 | 879 | severity="error", |
| 1130 | 880 | ) |
| 1131 | 881 | |
| 1132 | - return self._validate_path(file_path) | |
| 882 | + path_result = self._validate_path(file_path) | |
| 883 | + if not path_result.valid: | |
| 884 | + return path_result | |
| 885 | + | |
| 886 | + sibling_result = self._validate_numbered_sibling_conflict(str(file_path)) | |
| 887 | + if not sibling_result.valid: | |
| 888 | + return ValidationResult( | |
| 889 | + valid=False, | |
| 890 | + reason="Read target conflicts with an existing numbered sibling", | |
| 891 | + suggestion=sibling_result.suggestion, | |
| 892 | + severity="error", | |
| 893 | + ) | |
| 894 | + return path_result | |
| 1133 | 895 | |
| 1134 | 896 | def _validate_search(self, tool_name: str, arguments: dict) -> ValidationResult: |
| 1135 | 897 | pattern = arguments.get("pattern", "") |
@@ -1150,7 +912,7 @@ class PreActionValidator: | ||
| 1150 | 912 | content: str, |
| 1151 | 913 | ) -> ValidationResult: |
| 1152 | 914 | normalized = Path(file_path).expanduser() |
| 1153 | - if not html_toc_rule.is_html_toc_index_path(normalized) or "<a " not in content: | |
| 915 | + if normalized.suffix.lower() != ".html" or "<a " not in content: | |
| 1154 | 916 | return ValidationResult(valid=True) |
| 1155 | 917 | |
| 1156 | 918 | link_pairs = re.findall(r'<a\s+href="([^"]+)">([^<]+)</a>', content) |
@@ -1159,65 +921,147 @@ class PreActionValidator: | ||
| 1159 | 921 | |
| 1160 | 922 | root = normalized.parent |
| 1161 | 923 | missing: list[str] = [] |
| 1162 | - mismatched: list[str] = [] | |
| 1163 | - for href, label in link_pairs: | |
| 924 | + for href, _label in link_pairs: | |
| 925 | + target_text = href.strip() | |
| 926 | + if not target_text or target_text.startswith(("#", "mailto:", "tel:", "javascript:")): | |
| 927 | + continue | |
| 928 | + if "://" in target_text: | |
| 929 | + continue | |
| 1164 | 930 | target = (root / href).resolve(strict=False) |
| 1165 | 931 | if not target.exists(): |
| 1166 | 932 | if href not in missing: |
| 1167 | 933 | missing.append(href) |
| 1168 | - continue | |
| 1169 | - | |
| 1170 | - title = html_toc_rule.read_html_title(target) | |
| 1171 | - if title and label.strip() != title: | |
| 1172 | - if href not in mismatched: | |
| 1173 | - mismatched.append(href) | |
| 1174 | 934 | |
| 1175 | 935 | if missing: |
| 1176 | - suggestions = self._suggest_existing_html_targets(root, missing) | |
| 1177 | - preview_items = [ | |
| 1178 | - html_toc_rule.format_html_inventory_entry(root, root / suggestion) | |
| 1179 | - for suggestion in suggestions | |
| 1180 | - ] | |
| 1181 | - if not preview_items: | |
| 1182 | - preview_items = missing | |
| 1183 | - preview = ", ".join(preview_items[:3]) | |
| 1184 | - if len(preview_items) > 3: | |
| 936 | + preview = ", ".join(missing[:3]) | |
| 937 | + if len(missing) > 3: | |
| 1185 | 938 | preview += ", ..." |
| 1186 | 939 | return ValidationResult( |
| 1187 | 940 | valid=False, |
| 1188 | - reason="Edited TOC references chapter files that do not exist", | |
| 1189 | - suggestion=( | |
| 1190 | - f"Use only existing chapter href/title pairs from beside " | |
| 1191 | - f"{html_toc_rule.describe_html_toc_target(normalized)}, for example: " | |
| 1192 | - f"{preview}" | |
| 1193 | - ), | |
| 1194 | - severity="error", | |
| 1195 | - ) | |
| 1196 | - | |
| 1197 | - if mismatched: | |
| 1198 | - exact_entries = [ | |
| 1199 | - html_toc_rule.format_html_inventory_entry(root, (root / href).resolve(strict=False)) | |
| 1200 | - for href in mismatched | |
| 1201 | - if (root / href).resolve(strict=False).exists() | |
| 1202 | - ] | |
| 1203 | - if not exact_entries: | |
| 1204 | - exact_entries = mismatched | |
| 1205 | - preview = "; ".join(exact_entries[:2]) | |
| 1206 | - if len(exact_entries) > 2: | |
| 1207 | - preview += "; ..." | |
| 1208 | - return ValidationResult( | |
| 1209 | - valid=False, | |
| 1210 | - reason="Edited TOC labels do not match the linked chapter titles", | |
| 941 | + reason="Edited HTML links point to files that do not exist", | |
| 1211 | 942 | suggestion=( |
| 1212 | - f"Copy the exact href/title pair from the linked HTML file for " | |
| 1213 | - f"{html_toc_rule.describe_html_toc_target(normalized)}, for example: " | |
| 1214 | - f"{preview}" | |
| 943 | + "Use only existing local targets for href values and avoid " | |
| 944 | + f"introducing missing links, for example fix: {preview}" | |
| 1215 | 945 | ), |
| 1216 | 946 | severity="error", |
| 1217 | 947 | ) |
| 1218 | 948 | |
| 1219 | 949 | return ValidationResult(valid=True) |
| 1220 | 950 | |
| 951 | + def _validate_html_declared_target_set( | |
| 952 | + self, | |
| 953 | + file_path: str, | |
| 954 | + content: str, | |
| 955 | + ) -> ValidationResult: | |
| 956 | + normalized = Path(file_path).expanduser() | |
| 957 | + if normalized.suffix.lower() != ".html" or normalized.name.lower() == "index.html": | |
| 958 | + return ValidationResult(valid=True) | |
| 959 | + | |
| 960 | + local_targets = self._collect_local_html_targets(normalized, content) | |
| 961 | + if not local_targets: | |
| 962 | + return ValidationResult(valid=True) | |
| 963 | + | |
| 964 | + root = self._resolve_html_artifact_root(normalized) | |
| 965 | + existing_html_files = [ | |
| 966 | + path | |
| 967 | + for path in root.rglob("*.html") | |
| 968 | + if path.is_file() and path != normalized | |
| 969 | + ] | |
| 970 | + if not existing_html_files: | |
| 971 | + return ValidationResult(valid=True) | |
| 972 | + | |
| 973 | + declared_targets = self._collect_declared_html_targets(root, existing_html_files) | |
| 974 | + undeclared_missing: list[str] = [] | |
| 975 | + for href, resolved in local_targets: | |
| 976 | + if resolved.exists(): | |
| 977 | + continue | |
| 978 | + relative_target = self._relative_html_target(root, resolved) | |
| 979 | + if relative_target is None: | |
| 980 | + continue | |
| 981 | + if relative_target not in declared_targets and href not in undeclared_missing: | |
| 982 | + undeclared_missing.append(href) | |
| 983 | + | |
| 984 | + if not undeclared_missing: | |
| 985 | + return ValidationResult(valid=True) | |
| 986 | + | |
| 987 | + preview = ", ".join(undeclared_missing[:3]) | |
| 988 | + if len(undeclared_missing) > 3: | |
| 989 | + preview += ", ..." | |
| 990 | + declared_preview = ", ".join(sorted(declared_targets)[:3]) | |
| 991 | + suggestion = ( | |
| 992 | + "Keep non-root HTML pages within the current declared local-link set and " | |
| 993 | + f"avoid introducing new missing sibling targets, for example fix: {preview}" | |
| 994 | + ) | |
| 995 | + if declared_preview: | |
| 996 | + suggestion += f". Already-declared local targets include: {declared_preview}" | |
| 997 | + return ValidationResult( | |
| 998 | + valid=False, | |
| 999 | + reason="HTML page introduces new local targets outside the current declared artifact set", | |
| 1000 | + suggestion=suggestion, | |
| 1001 | + severity="error", | |
| 1002 | + ) | |
| 1003 | + | |
| 1004 | + def _collect_local_html_targets( | |
| 1005 | + self, | |
| 1006 | + file_path: Path, | |
| 1007 | + content: str, | |
| 1008 | + ) -> list[tuple[str, Path]]: | |
| 1009 | + pattern = re.compile(r'href\s*=\s*["\']([^"\']+)["\']', re.IGNORECASE) | |
| 1010 | + targets: list[tuple[str, Path]] = [] | |
| 1011 | + seen: set[str] = set() | |
| 1012 | + for href in pattern.findall(content): | |
| 1013 | + target_text = href.strip() | |
| 1014 | + if not self._is_local_html_link_target(target_text): | |
| 1015 | + continue | |
| 1016 | + resolved = (file_path.parent / target_text).resolve(strict=False) | |
| 1017 | + key = f"{target_text}::{resolved}" | |
| 1018 | + if key in seen: | |
| 1019 | + continue | |
| 1020 | + seen.add(key) | |
| 1021 | + targets.append((target_text, resolved)) | |
| 1022 | + return targets | |
| 1023 | + | |
| 1024 | + def _collect_declared_html_targets( | |
| 1025 | + self, | |
| 1026 | + root: Path, | |
| 1027 | + html_files: list[Path], | |
| 1028 | + ) -> set[str]: | |
| 1029 | + declared: set[str] = set() | |
| 1030 | + for html_file in html_files: | |
| 1031 | + try: | |
| 1032 | + text = html_file.read_text() | |
| 1033 | + except OSError: | |
| 1034 | + continue | |
| 1035 | + for _href, resolved in self._collect_local_html_targets(html_file, text): | |
| 1036 | + relative_target = self._relative_html_target(root, resolved) | |
| 1037 | + if relative_target is not None: | |
| 1038 | + declared.add(relative_target) | |
| 1039 | + return declared | |
| 1040 | + | |
| 1041 | + def _resolve_html_artifact_root(self, file_path: Path) -> Path: | |
| 1042 | + for candidate in [file_path.parent, *file_path.parents]: | |
| 1043 | + if (candidate / "index.html").exists(): | |
| 1044 | + return candidate | |
| 1045 | + return file_path.parent | |
| 1046 | + | |
| 1047 | + def _relative_html_target(self, root: Path, target: Path) -> str | None: | |
| 1048 | + try: | |
| 1049 | + return str(target.relative_to(root)) | |
| 1050 | + except ValueError: | |
| 1051 | + return None | |
| 1052 | + | |
| 1053 | + @staticmethod | |
| 1054 | + def _is_local_html_link_target(href: str) -> bool: | |
| 1055 | + target = href.strip() | |
| 1056 | + if not target: | |
| 1057 | + return False | |
| 1058 | + if target.startswith(("#", "mailto:", "tel:", "javascript:")): | |
| 1059 | + return False | |
| 1060 | + if "://" in target: | |
| 1061 | + return False | |
| 1062 | + normalized = target.split("#", 1)[0].split("?", 1)[0].strip().lower() | |
| 1063 | + return normalized.endswith(".html") | |
| 1064 | + | |
| 1221 | 1065 | def _suggest_existing_html_targets(self, root: Path, missing: list[str]) -> list[str]: |
| 1222 | 1066 | available_by_directory: dict[Path, list[str]] = {} |
| 1223 | 1067 | suggestions: list[str] = [] |
src/loader/runtime/tool_batch_recovery.pymodified@@ -6,6 +6,7 @@ import re | ||
| 6 | 6 | from collections.abc import Awaitable, Callable |
| 7 | 7 | from difflib import SequenceMatcher |
| 8 | 8 | from pathlib import Path |
| 9 | +from typing import Any | |
| 9 | 10 | |
| 10 | 11 | from ..llm.base import Message, Role, ToolCall |
| 11 | 12 | from .compaction import ( |
@@ -17,7 +18,7 @@ from .context import RuntimeContext | ||
| 17 | 18 | from .events import AgentEvent |
| 18 | 19 | from .executor import ToolExecutionOutcome |
| 19 | 20 | from .recovery import RecoveryContext, format_failure_message, format_recovery_prompt |
| 20 | -from .semantic_rules import html_toc as html_toc_rule | |
| 21 | +from .repair_focus import ActiveRepairContext, extract_active_repair_context | |
| 21 | 22 | |
| 22 | 23 | EventSink = Callable[[AgentEvent], Awaitable[None]] |
| 23 | 24 | |
@@ -59,7 +60,9 @@ class ToolBatchRecoveryController: | ||
| 59 | 60 | type="error", |
| 60 | 61 | content=( |
| 61 | 62 | "Loop detected: already tried a similar command. " |
| 62 | - "Try a DIFFERENT approach (e.g., read a config file first)." | |
| 63 | + "Try a different next step using the files and facts you already have " | |
| 64 | + "(for example, make the specific edit, verify the current result, or " | |
| 65 | + "inspect one concrete unresolved target)." | |
| 63 | 66 | ), |
| 64 | 67 | tool_name=tool_call.name, |
| 65 | 68 | ) |
@@ -129,21 +132,71 @@ class ToolBatchRecoveryController: | ||
| 129 | 132 | |
| 130 | 133 | session = self.context.session |
| 131 | 134 | current_task = getattr(session, "current_task", None) |
| 132 | - focus_path = self._preferred_focus_path( | |
| 133 | - tool_call=tool_call, | |
| 134 | - current_task=current_task, | |
| 135 | - ) | |
| 135 | + active_repair = self._active_repair_context() | |
| 136 | + effective_task = current_task | |
| 137 | + if active_repair is not None and active_repair.artifact_path: | |
| 138 | + effective_task = ( | |
| 139 | + "Repair the current artifact using the failed verification evidence: " | |
| 140 | + f"{active_repair.artifact_path}" | |
| 141 | + ) | |
| 142 | + focus_path = active_repair.artifact_path | |
| 143 | + preferred_next_step = ( | |
| 144 | + f"Update `{active_repair.artifact_path}` to resolve the current " | |
| 145 | + "verification failures." | |
| 146 | + ) | |
| 147 | + else: | |
| 148 | + focus_path = self._preferred_focus_path( | |
| 149 | + tool_call=tool_call, | |
| 150 | + current_task=current_task, | |
| 151 | + ) | |
| 152 | + preferred_next_step = infer_preferred_next_step( | |
| 153 | + session.messages, | |
| 154 | + current_task=effective_task, | |
| 155 | + focus_path=focus_path or None, | |
| 156 | + ) | |
| 136 | 157 | confirmed_facts = summarize_confirmed_facts(session.messages) |
| 137 | - preferred_next_step = infer_preferred_next_step( | |
| 138 | - session.messages, | |
| 139 | - current_task=current_task, | |
| 140 | - focus_path=focus_path or None, | |
| 141 | - ) | |
| 142 | - actionable_known_state = bool(confirmed_facts and preferred_next_step) | |
| 143 | 158 | lines = [prompt] |
| 144 | - if confirmed_facts or preferred_next_step or current_task: | |
| 159 | + candidate_lines = self._file_not_found_candidate_lines( | |
| 160 | + tool_call, | |
| 161 | + outcome, | |
| 162 | + active_repair=active_repair, | |
| 163 | + ) | |
| 164 | + actionable_known_state = bool( | |
| 165 | + active_repair or current_task or confirmed_facts or preferred_next_step or candidate_lines | |
| 166 | + ) | |
| 167 | + if active_repair is not None: | |
| 168 | + lines.extend(["", "## ACTIVE REPAIR TARGET"]) | |
| 169 | + lines.append( | |
| 170 | + "- Verification already failed on the current artifact set. " | |
| 171 | + "Stay on this repair until the broken local references are fixed." | |
| 172 | + ) | |
| 173 | + lines.extend(active_repair.repair_lines) | |
| 174 | + drifted_path = self._canonicalize_path( | |
| 175 | + str( | |
| 176 | + tool_call.arguments.get("file_path") | |
| 177 | + or tool_call.arguments.get("path") | |
| 178 | + or "" | |
| 179 | + ).strip() | |
| 180 | + ) | |
| 181 | + if ( | |
| 182 | + drifted_path | |
| 183 | + and active_repair.artifact_path | |
| 184 | + and drifted_path != active_repair.artifact_path | |
| 185 | + ): | |
| 186 | + lines.append( | |
| 187 | + f"- The failed tool call drifted to `{drifted_path}`. " | |
| 188 | + f"Return to `{active_repair.artifact_path}` instead of reopening " | |
| 189 | + "the original discovery task." | |
| 190 | + ) | |
| 191 | + lines.append( | |
| 192 | + "- Treat this repair as higher priority than the original discovery " | |
| 193 | + "prompt until verification passes." | |
| 194 | + ) | |
| 195 | + if active_repair or confirmed_facts or preferred_next_step or current_task: | |
| 145 | 196 | lines.extend(["", "## CONTINUE FROM KNOWN STATE"]) |
| 146 | - if current_task: | |
| 197 | + if active_repair is not None and active_repair.artifact_path: | |
| 198 | + lines.append(f"- Active repair target: `{active_repair.artifact_path}`") | |
| 199 | + elif current_task: | |
| 147 | 200 | lines.append(f"- Current task: {current_task}") |
| 148 | 201 | if confirmed_facts: |
| 149 | 202 | lines.append(f"- Confirmed facts: {confirmed_facts}") |
@@ -153,18 +206,28 @@ class ToolBatchRecoveryController: | ||
| 153 | 206 | "- Preserve progress: do not restart by rereading already-confirmed files " |
| 154 | 207 | "unless you need genuinely new evidence." |
| 155 | 208 | ) |
| 209 | + if active_repair is not None: | |
| 210 | + lines.append( | |
| 211 | + "- Do not go back to the original reference guide or invent alternate " | |
| 212 | + "paths while this repair target is unresolved." | |
| 213 | + ) | |
| 156 | 214 | if actionable_known_state: |
| 215 | + target_line = ( | |
| 216 | + f"- Prefer edit/write/patch on `{active_repair.artifact_path}` over " | |
| 217 | + "rereading the same files." | |
| 218 | + if active_repair is not None and active_repair.artifact_path | |
| 219 | + else "- Prefer edit/write/patch on the target file over rereading the same files." | |
| 220 | + ) | |
| 157 | 221 | lines.extend( |
| 158 | 222 | [ |
| 159 | 223 | "", |
| 160 | 224 | "## ACTION BIAS FOR THIS RECOVERY", |
| 161 | 225 | "- The confirmed findings above are already enough to keep moving.", |
| 162 | - "- Prefer edit/write/patch on the target file over rereading the same files.", | |
| 226 | + target_line, | |
| 163 | 227 | "- Only inspect one more file if a specific filename, href, or title is still unknown.", |
| 164 | 228 | "- Treat the preferred next step as the default path forward.", |
| 165 | 229 | ] |
| 166 | 230 | ) |
| 167 | - candidate_lines = self._file_not_found_candidate_lines(tool_call, outcome) | |
| 168 | 231 | if candidate_lines: |
| 169 | 232 | lines.extend(["", "## LIKELY FILE CANDIDATES", *candidate_lines]) |
| 170 | 233 | target_excerpt_lines = self._target_excerpt_lines(tool_call) |
@@ -229,6 +292,8 @@ class ToolBatchRecoveryController: | ||
| 229 | 292 | self, |
| 230 | 293 | tool_call: ToolCall, |
| 231 | 294 | outcome: ToolExecutionOutcome, |
| 295 | + *, | |
| 296 | + active_repair: ActiveRepairContext | None = None, | |
| 232 | 297 | ) -> list[str]: |
| 233 | 298 | if tool_call.name not in {"read", "write", "edit", "patch"}: |
| 234 | 299 | return [] |
@@ -247,14 +312,26 @@ class ToolBatchRecoveryController: | ||
| 247 | 312 | |
| 248 | 313 | candidates = self._rank_known_file_candidates(missing_path) |
| 249 | 314 | if not candidates: |
| 315 | + if active_repair is not None and active_repair.artifact_path: | |
| 316 | + return [ | |
| 317 | + f"- Requested file does not exist: `{missing_path}`", | |
| 318 | + f"- Active repair target is `{active_repair.artifact_path}`.", | |
| 319 | + "- Repair the known target instead of inventing a new path.", | |
| 320 | + ] | |
| 250 | 321 | return [] |
| 251 | 322 | |
| 252 | 323 | names = ", ".join(self._describe_candidate(candidate) for candidate in candidates[:3]) |
| 253 | - return [ | |
| 324 | + lines = [ | |
| 254 | 325 | f"- Requested file does not exist: `{missing_path}`", |
| 255 | 326 | f"- Closest known files in the same directory: {names}", |
| 256 | 327 | "- Prefer one of those exact filenames instead of retrying the missing path.", |
| 257 | 328 | ] |
| 329 | + if active_repair is not None and active_repair.artifact_path: | |
| 330 | + lines.append( | |
| 331 | + f"- Keep the repair centered on `{active_repair.artifact_path}` rather than " | |
| 332 | + "switching back to broad discovery." | |
| 333 | + ) | |
| 334 | + return lines | |
| 258 | 335 | |
| 259 | 336 | def _rank_known_file_candidates(self, missing_path: str) -> list[str]: |
| 260 | 337 | missing_parent = str(Path(missing_path).parent) |
@@ -316,51 +393,261 @@ class ToolBatchRecoveryController: | ||
| 316 | 393 | |
| 317 | 394 | def _describe_candidate(self, candidate: str) -> str: |
| 318 | 395 | path = Path(candidate) |
| 319 | - label = f"`{path.name}`" | |
| 320 | - if path.suffix == ".html": | |
| 321 | - title = html_toc_rule.read_html_title(path) | |
| 322 | - if title: | |
| 323 | - return f"{label} = {title}" | |
| 324 | - return label | |
| 396 | + return f"`{path.name}`" | |
| 325 | 397 | |
| 326 | 398 | def _target_excerpt_lines(self, tool_call: ToolCall) -> list[str]: |
| 327 | - file_path = str( | |
| 399 | + if tool_call.name not in {"edit", "patch"}: | |
| 400 | + return [] | |
| 401 | + | |
| 402 | + raw_path = str( | |
| 328 | 403 | tool_call.arguments.get("file_path") |
| 329 | 404 | or tool_call.arguments.get("path") |
| 330 | 405 | or "" |
| 331 | 406 | ).strip() |
| 332 | - if not file_path: | |
| 407 | + target_path = self._canonicalize_path(raw_path) | |
| 408 | + if not target_path: | |
| 333 | 409 | return [] |
| 334 | - current_task = getattr(self.context.session, "current_task", None) | |
| 335 | - if not html_toc_rule.task_targets_html_toc(current_task): | |
| 410 | + | |
| 411 | + path = Path(target_path) | |
| 412 | + if not path.is_file(): | |
| 336 | 413 | return [] |
| 337 | 414 | |
| 338 | - inventory = html_toc_rule.summarize_html_inventory(file_path, limit=12) | |
| 339 | - excerpt = html_toc_rule.extract_html_toc_excerpt(file_path) | |
| 340 | - if not inventory and not excerpt: | |
| 415 | + try: | |
| 416 | + content = path.read_text() | |
| 417 | + except Exception: | |
| 341 | 418 | return [] |
| 342 | 419 | |
| 343 | - lines: list[str] = [] | |
| 344 | - if inventory: | |
| 345 | - lines.append(f"- Verified chapter inventory: {inventory}") | |
| 346 | - if excerpt: | |
| 347 | - lines.append("- Current TOC block:") | |
| 348 | - lines.extend(f" {line}" for line in excerpt.splitlines()) | |
| 349 | - replacement = html_toc_rule.build_html_toc_replacement_block(file_path) | |
| 350 | - if replacement: | |
| 351 | - lines.append("- Suggested replacement block:") | |
| 352 | - lines.extend(f" {line}" for line in replacement.splitlines()) | |
| 353 | - if excerpt and replacement: | |
| 354 | - lines.append("- Exact edit guidance:") | |
| 355 | - lines.append(f" file_path: {file_path}") | |
| 356 | - lines.append(" old_string: use the Current TOC block above exactly") | |
| 357 | - lines.append(" new_string: use the Suggested replacement block above exactly") | |
| 358 | - lines.append(" Do not rewrite the whole file.") | |
| 359 | - edit_template = html_toc_rule.build_html_toc_edit_call_template(file_path) | |
| 360 | - if edit_template: | |
| 361 | - lines.append("- Suggested edit call:") | |
| 362 | - lines.extend(f" {line}" for line in edit_template.splitlines()) | |
| 363 | - return lines | |
| 420 | + file_lines = content.splitlines() | |
| 421 | + if not file_lines: | |
| 422 | + return [ | |
| 423 | + f"- Target file: `{target_path}`", | |
| 424 | + "- The file is currently empty.", | |
| 425 | + "- Use the exact on-disk state above when preparing the next mutation.", | |
| 426 | + ] | |
| 427 | + | |
| 428 | + start, end, label = self._excerpt_window_for_tool_call( | |
| 429 | + file_lines=file_lines, | |
| 430 | + content=content, | |
| 431 | + tool_call=tool_call, | |
| 432 | + ) | |
| 433 | + excerpt = self._format_excerpt_lines(file_lines, start, end) | |
| 434 | + if not excerpt: | |
| 435 | + return [] | |
| 436 | + | |
| 437 | + return [ | |
| 438 | + f"- Target file: `{target_path}`", | |
| 439 | + f"- {label}", | |
| 440 | + *excerpt, | |
| 441 | + "- Use the exact on-disk text above when preparing the next mutation.", | |
| 442 | + "- If several adjacent lines are wrong, replace the containing block in one edit instead of retrying a smaller substitution.", | |
| 443 | + ] | |
| 444 | + | |
| 445 | + def _excerpt_window_for_tool_call( | |
| 446 | + self, | |
| 447 | + *, | |
| 448 | + file_lines: list[str], | |
| 449 | + content: str, | |
| 450 | + tool_call: ToolCall, | |
| 451 | + ) -> tuple[int, int, str]: | |
| 452 | + if tool_call.name == "edit": | |
| 453 | + window = self._edit_excerpt_window( | |
| 454 | + file_lines=file_lines, | |
| 455 | + content=content, | |
| 456 | + arguments=tool_call.arguments, | |
| 457 | + ) | |
| 458 | + if window is not None: | |
| 459 | + return window | |
| 460 | + if tool_call.name == "patch": | |
| 461 | + window = self._patch_excerpt_window( | |
| 462 | + file_lines=file_lines, | |
| 463 | + arguments=tool_call.arguments, | |
| 464 | + ) | |
| 465 | + if window is not None: | |
| 466 | + return window | |
| 467 | + return self._bounded_window( | |
| 468 | + file_lines=file_lines, | |
| 469 | + start=0, | |
| 470 | + length=min(10, len(file_lines)), | |
| 471 | + label="Current file contents:", | |
| 472 | + ) | |
| 473 | + | |
| 474 | + def _edit_excerpt_window( | |
| 475 | + self, | |
| 476 | + *, | |
| 477 | + file_lines: list[str], | |
| 478 | + content: str, | |
| 479 | + arguments: dict[str, Any], | |
| 480 | + ) -> tuple[int, int, str] | None: | |
| 481 | + old_string = str(arguments.get("old_string") or "") | |
| 482 | + new_string = str(arguments.get("new_string") or "") | |
| 483 | + | |
| 484 | + if old_string: | |
| 485 | + exact_window = self._exact_string_window( | |
| 486 | + content=content, | |
| 487 | + file_lines=file_lines, | |
| 488 | + needle=old_string, | |
| 489 | + label="Current file contents for the requested edit:", | |
| 490 | + ) | |
| 491 | + if exact_window is not None: | |
| 492 | + return exact_window | |
| 493 | + | |
| 494 | + anchor = old_string or new_string | |
| 495 | + approximate_window = self._approximate_string_window( | |
| 496 | + file_lines=file_lines, | |
| 497 | + needle=anchor, | |
| 498 | + label="Closest on-disk block to the requested edit:", | |
| 499 | + ) | |
| 500 | + if approximate_window is not None: | |
| 501 | + return approximate_window | |
| 502 | + return None | |
| 503 | + | |
| 504 | + def _patch_excerpt_window( | |
| 505 | + self, | |
| 506 | + *, | |
| 507 | + file_lines: list[str], | |
| 508 | + arguments: dict[str, Any], | |
| 509 | + ) -> tuple[int, int, str] | None: | |
| 510 | + hunks = arguments.get("hunks") | |
| 511 | + if not isinstance(hunks, list) or not hunks: | |
| 512 | + return None | |
| 513 | + | |
| 514 | + first_hunk = hunks[0] | |
| 515 | + if not isinstance(first_hunk, dict): | |
| 516 | + return None | |
| 517 | + | |
| 518 | + anchor_lines: list[str] = [] | |
| 519 | + raw_lines = first_hunk.get("lines") | |
| 520 | + if isinstance(raw_lines, list): | |
| 521 | + for raw_line in raw_lines: | |
| 522 | + if not isinstance(raw_line, str) or not raw_line: | |
| 523 | + continue | |
| 524 | + if raw_line[0] in {" ", "-"}: | |
| 525 | + anchor_lines.append(raw_line[1:]) | |
| 526 | + | |
| 527 | + anchor = "\n".join(anchor_lines).strip() | |
| 528 | + approximate_window = self._approximate_string_window( | |
| 529 | + file_lines=file_lines, | |
| 530 | + needle=anchor, | |
| 531 | + label="Closest on-disk block to the requested patch:", | |
| 532 | + ) | |
| 533 | + if approximate_window is not None: | |
| 534 | + return approximate_window | |
| 535 | + | |
| 536 | + old_start = first_hunk.get("old_start", 1) | |
| 537 | + old_lines = first_hunk.get("old_lines", len(anchor_lines) or 1) | |
| 538 | + try: | |
| 539 | + start = max(0, int(old_start) - 1) | |
| 540 | + except (TypeError, ValueError): | |
| 541 | + start = 0 | |
| 542 | + try: | |
| 543 | + length = max(1, int(old_lines)) | |
| 544 | + except (TypeError, ValueError): | |
| 545 | + length = max(1, len(anchor_lines) or 1) | |
| 546 | + return self._bounded_window( | |
| 547 | + file_lines=file_lines, | |
| 548 | + start=start, | |
| 549 | + length=length, | |
| 550 | + label="Current file contents near the requested patch location:", | |
| 551 | + ) | |
| 552 | + | |
| 553 | + def _exact_string_window( | |
| 554 | + self, | |
| 555 | + *, | |
| 556 | + content: str, | |
| 557 | + file_lines: list[str], | |
| 558 | + needle: str, | |
| 559 | + label: str, | |
| 560 | + ) -> tuple[int, int, str] | None: | |
| 561 | + if not needle: | |
| 562 | + return None | |
| 563 | + index = content.find(needle) | |
| 564 | + if index == -1: | |
| 565 | + return None | |
| 566 | + start_line = content[:index].count("\n") | |
| 567 | + block_length = max(1, len(needle.splitlines())) | |
| 568 | + return self._bounded_window( | |
| 569 | + file_lines=file_lines, | |
| 570 | + start=start_line, | |
| 571 | + length=block_length, | |
| 572 | + label=label, | |
| 573 | + ) | |
| 574 | + | |
| 575 | + def _approximate_string_window( | |
| 576 | + self, | |
| 577 | + *, | |
| 578 | + file_lines: list[str], | |
| 579 | + needle: str, | |
| 580 | + label: str, | |
| 581 | + ) -> tuple[int, int, str] | None: | |
| 582 | + normalized_needle = self._normalize_match_text(needle) | |
| 583 | + if not normalized_needle: | |
| 584 | + return None | |
| 585 | + | |
| 586 | + needle_lines = [line for line in needle.splitlines() if line.strip()] | |
| 587 | + if not needle_lines: | |
| 588 | + needle_lines = [needle.strip()] | |
| 589 | + | |
| 590 | + min_window = 1 | |
| 591 | + max_window = min(len(file_lines), max(1, len(needle_lines) + 2)) | |
| 592 | + best_score = 0.0 | |
| 593 | + best_start = 0 | |
| 594 | + best_length = min(max_window, max(1, len(needle_lines))) | |
| 595 | + for window_length in range(min_window, max_window + 1): | |
| 596 | + for start in range(0, len(file_lines) - window_length + 1): | |
| 597 | + candidate = "\n".join(file_lines[start : start + window_length]) | |
| 598 | + score = SequenceMatcher( | |
| 599 | + None, | |
| 600 | + normalized_needle, | |
| 601 | + self._normalize_match_text(candidate), | |
| 602 | + ).ratio() | |
| 603 | + if score > best_score: | |
| 604 | + best_score = score | |
| 605 | + best_start = start | |
| 606 | + best_length = window_length | |
| 607 | + | |
| 608 | + if best_score < 0.25: | |
| 609 | + return None | |
| 610 | + | |
| 611 | + return self._bounded_window( | |
| 612 | + file_lines=file_lines, | |
| 613 | + start=best_start, | |
| 614 | + length=best_length, | |
| 615 | + label=label, | |
| 616 | + ) | |
| 617 | + | |
| 618 | + def _bounded_window( | |
| 619 | + self, | |
| 620 | + *, | |
| 621 | + file_lines: list[str], | |
| 622 | + start: int, | |
| 623 | + length: int, | |
| 624 | + label: str, | |
| 625 | + ) -> tuple[int, int, str]: | |
| 626 | + context_before = 2 | |
| 627 | + context_after = 2 | |
| 628 | + start_index = max(0, start - context_before) | |
| 629 | + end_index = min(len(file_lines), start + max(1, length) + context_after) | |
| 630 | + return start_index, end_index, label | |
| 631 | + | |
| 632 | + def _format_excerpt_lines( | |
| 633 | + self, | |
| 634 | + file_lines: list[str], | |
| 635 | + start: int, | |
| 636 | + end: int, | |
| 637 | + ) -> list[str]: | |
| 638 | + if start >= end: | |
| 639 | + return [] | |
| 640 | + width = len(str(end)) | |
| 641 | + return [ | |
| 642 | + f" {line_number:>{width}} | {file_lines[line_number - 1]}" | |
| 643 | + for line_number in range(start + 1, end + 1) | |
| 644 | + ] | |
| 645 | + | |
| 646 | + def _normalize_match_text(self, text: str) -> str: | |
| 647 | + return " ".join(str(text or "").split()) | |
| 648 | + | |
| 649 | + def _active_repair_context(self) -> ActiveRepairContext | None: | |
| 650 | + return extract_active_repair_context(self.context.session.messages) | |
| 364 | 651 | |
| 365 | 652 | def _canonicalize_path(self, raw_path: str) -> str: |
| 366 | 653 | if not raw_path: |
src/loader/runtime/tool_batches.pymodified@@ -7,16 +7,20 @@ from dataclasses import dataclass, field | ||
| 7 | 7 | from pathlib import Path |
| 8 | 8 | from typing import Any |
| 9 | 9 | |
| 10 | -from ..llm.base import Role, ToolCall | |
| 10 | +from ..llm.base import ToolCall | |
| 11 | 11 | from .compaction import infer_preferred_next_step, summarize_confirmed_facts |
| 12 | 12 | from .context import RuntimeContext |
| 13 | 13 | from .dod import ( |
| 14 | 14 | DefinitionOfDone, |
| 15 | 15 | DefinitionOfDoneStore, |
| 16 | + all_planned_artifacts_exist, | |
| 16 | 17 | begin_new_verification_attempt, |
| 18 | + collect_planned_artifact_targets, | |
| 17 | 19 | derive_verification_commands, |
| 18 | 20 | ensure_active_verification_attempt, |
| 21 | + infer_next_declared_html_output_file, | |
| 19 | 22 | is_state_mutating_tool_call, |
| 23 | + planned_artifact_target_satisfied, | |
| 20 | 24 | record_successful_tool_call, |
| 21 | 25 | synthesize_todo_items, |
| 22 | 26 | ) |
@@ -25,15 +29,20 @@ from .evidence_provenance import EvidenceProvenance, EvidenceProvenanceStatus | ||
| 25 | 29 | from .executor import ToolExecutionState, ToolExecutor |
| 26 | 30 | from .logging import get_runtime_logger |
| 27 | 31 | from .policy_timeline import append_verification_timeline_entry |
| 32 | +from .repair_focus import extract_active_repair_context | |
| 28 | 33 | from .safeguard_services import extract_shell_text_rewrite_target |
| 29 | -from .semantic_rules import html_toc as html_toc_rule | |
| 30 | 34 | from .tool_batch_checks import ToolBatchConfidenceGate, ToolBatchVerificationGate |
| 31 | 35 | from .tool_batch_recovery import ToolBatchRecoveryController |
| 32 | 36 | from .verification_observations import ( |
| 33 | 37 | VerificationObservation, |
| 34 | 38 | VerificationObservationStatus, |
| 35 | 39 | ) |
| 36 | -from .workflow import advance_todos_from_tool_call, sync_todos_to_definition_of_done | |
| 40 | +from .workflow import ( | |
| 41 | + advance_todos_from_tool_call, | |
| 42 | + effective_pending_todo_items, | |
| 43 | + reconcile_aggregate_completion_steps, | |
| 44 | + sync_todos_to_definition_of_done, | |
| 45 | +) | |
| 37 | 46 | |
| 38 | 47 | EventSink = Callable[[AgentEvent], Awaitable[None]] |
| 39 | 48 | ConfirmationHandler = ( |
@@ -48,17 +57,46 @@ _TODO_NUDGE_EXCLUDED_ITEMS = { | ||
| 48 | 57 | } |
| 49 | 58 | _MUTATION_TODO_HINTS = ( |
| 50 | 59 | "create", |
| 60 | + "creating", | |
| 51 | 61 | "update", |
| 62 | + "updating", | |
| 52 | 63 | "edit", |
| 64 | + "editing", | |
| 53 | 65 | "write", |
| 66 | + "writing", | |
| 54 | 67 | "fix", |
| 68 | + "fixing", | |
| 55 | 69 | "modify", |
| 70 | + "modifying", | |
| 56 | 71 | "change", |
| 72 | + "changing", | |
| 57 | 73 | "patch", |
| 74 | + "patching", | |
| 58 | 75 | "replace", |
| 76 | + "replacing", | |
| 59 | 77 | "correct", |
| 78 | + "correcting", | |
| 60 | 79 | "rewrite", |
| 80 | + "rewriting", | |
| 81 | +) | |
| 82 | +_CONSISTENCY_REVIEW_HINTS = ( | |
| 83 | + "consistent", | |
| 84 | + "consistently", | |
| 85 | + "formatted", | |
| 86 | + "link", | |
| 87 | + "linked", | |
| 88 | + "navigation", | |
| 89 | + "work properly", | |
| 90 | + "all files", | |
| 91 | + "every file", | |
| 92 | + "ensure", | |
| 61 | 93 | ) |
| 94 | +_BOOKKEEPING_NOTE_TOOL_NAMES = { | |
| 95 | + "notepad_write_working", | |
| 96 | + "notepad_append", | |
| 97 | + "notepad_write_priority", | |
| 98 | + "notepad_write_manual", | |
| 99 | +} | |
| 62 | 100 | |
| 63 | 101 | |
| 64 | 102 | @dataclass |
@@ -88,7 +126,6 @@ class ToolBatchRunner: | ||
| 88 | 126 | self.confidence_gate = confidence_gate or ToolBatchConfidenceGate(context) |
| 89 | 127 | self.recovery_controller = recovery_controller or ToolBatchRecoveryController(context) |
| 90 | 128 | self.verification_gate = verification_gate or ToolBatchVerificationGate(context) |
| 91 | - self._inventory_hint_targets: set[str] = set() | |
| 92 | 129 | |
| 93 | 130 | async def execute_batch( |
| 94 | 131 | self, |
@@ -205,10 +242,6 @@ class ToolBatchRunner: | ||
| 205 | 242 | if label: |
| 206 | 243 | completed_labels.append(label) |
| 207 | 244 | await _emit_batch_todos() |
| 208 | - self._annotate_verified_html_inventory(executed_tool_call, outcome) | |
| 209 | - self._queue_verified_html_inventory_nudge(executed_tool_call) | |
| 210 | - self._annotate_validated_html_toc_completion(executed_tool_call, outcome) | |
| 211 | - self._queue_validated_html_toc_completion_nudge(executed_tool_call) | |
| 212 | 245 | if loop_response is not None: |
| 213 | 246 | result.halted = True |
| 214 | 247 | result.final_response = loop_response |
@@ -244,6 +277,16 @@ class ToolBatchRunner: | ||
| 244 | 277 | if outcome.state == ToolExecutionState.DUPLICATE: |
| 245 | 278 | self._queue_duplicate_observation_nudge(tool_call, dod=dod) |
| 246 | 279 | elif outcome.state == ToolExecutionState.BLOCKED: |
| 280 | + self._queue_blocked_active_repair_nudge(outcome.event_content) | |
| 281 | + self._queue_blocked_active_repair_mutation_nudge(outcome.event_content) | |
| 282 | + self._queue_blocked_completed_artifact_scope_nudge( | |
| 283 | + outcome.event_content, | |
| 284 | + dod=dod, | |
| 285 | + ) | |
| 286 | + self._queue_blocked_late_reference_drift_nudge( | |
| 287 | + outcome.event_content, | |
| 288 | + dod=dod, | |
| 289 | + ) | |
| 247 | 290 | self._queue_blocked_shell_rewrite_nudge(tool_call) |
| 248 | 291 | self._queue_blocked_html_edit_nudge(tool_call, outcome.event_content) |
| 249 | 292 | |
@@ -290,10 +333,17 @@ class ToolBatchRunner: | ||
| 290 | 333 | return |
| 291 | 334 | |
| 292 | 335 | current_task = getattr(self.context.session, "current_task", None) |
| 336 | + missing_artifact = _next_missing_planned_artifact( | |
| 337 | + dod, | |
| 338 | + project_root=self.context.project_root, | |
| 339 | + ) | |
| 293 | 340 | next_pending = next( |
| 294 | 341 | ( |
| 295 | 342 | item |
| 296 | - for item in dod.pending_items | |
| 343 | + for item in effective_pending_todo_items( | |
| 344 | + dod, | |
| 345 | + project_root=self.context.project_root, | |
| 346 | + ) | |
| 297 | 347 | if item not in _TODO_NUDGE_EXCLUDED_ITEMS |
| 298 | 348 | ), |
| 299 | 349 | None, |
@@ -302,13 +352,35 @@ class ToolBatchRunner: | ||
| 302 | 352 | self.context.session.messages, |
| 303 | 353 | max_items=2, |
| 304 | 354 | ) |
| 305 | - if next_pending and not html_toc_rule.task_targets_html_toc(current_task): | |
| 355 | + if _should_prioritize_missing_artifact( | |
| 356 | + next_pending=next_pending, | |
| 357 | + missing_artifact=missing_artifact, | |
| 358 | + ): | |
| 359 | + prefix = "Reuse the earlier observation instead of repeating it. " | |
| 360 | + if confirmed_facts: | |
| 361 | + prefix += f"Confirmed facts: {confirmed_facts}. " | |
| 362 | + self.context.queue_steering_message( | |
| 363 | + prefix | |
| 364 | + + "An explicitly planned artifact is still missing." | |
| 365 | + + _missing_artifact_resume_suffix( | |
| 366 | + missing_artifact, | |
| 367 | + project_root=self.context.project_root, | |
| 368 | + ) | |
| 369 | + + " Do not switch into review or consistency-check mode until the missing artifact exists." | |
| 370 | + ) | |
| 371 | + return | |
| 372 | + if next_pending: | |
| 306 | 373 | mutation_suffix = "" |
| 307 | 374 | if _todo_is_mutation_step(next_pending): |
| 308 | - mutation_suffix = ( | |
| 309 | - " You already have enough evidence for that step, so stop gathering " | |
| 310 | - "more reference material and perform the change now." | |
| 375 | + mutation_suffix = _missing_artifact_resume_suffix( | |
| 376 | + missing_artifact, | |
| 377 | + project_root=self.context.project_root, | |
| 311 | 378 | ) |
| 379 | + if not mutation_suffix: | |
| 380 | + mutation_suffix = ( | |
| 381 | + " You already have enough evidence for that step, so stop gathering " | |
| 382 | + "more reference material and perform the change now." | |
| 383 | + ) | |
| 312 | 384 | if confirmed_facts: |
| 313 | 385 | self.context.queue_steering_message( |
| 314 | 386 | "Reuse the earlier observation instead of repeating it. " |
@@ -326,6 +398,37 @@ class ToolBatchRunner: | ||
| 326 | 398 | ) |
| 327 | 399 | return |
| 328 | 400 | |
| 401 | + if missing_artifact is not None: | |
| 402 | + self.context.queue_steering_message( | |
| 403 | + "Reuse the earlier observation instead of repeating it. " | |
| 404 | + + _missing_artifact_resume_suffix( | |
| 405 | + missing_artifact, | |
| 406 | + project_root=self.context.project_root, | |
| 407 | + ).strip() | |
| 408 | + ) | |
| 409 | + return | |
| 410 | + | |
| 411 | + if all_planned_artifacts_exist(dod, project_root=self.context.project_root): | |
| 412 | + verification_commands = dod.verification_commands or derive_verification_commands( | |
| 413 | + dod, | |
| 414 | + project_root=self.context.project_root, | |
| 415 | + task_statement=current_task, | |
| 416 | + supplement_existing=True, | |
| 417 | + ) | |
| 418 | + verification_suffix = ( | |
| 419 | + "Move to verification or final confirmation using the files already on disk." | |
| 420 | + if verification_commands | |
| 421 | + else "Finish the current review using the files already on disk." | |
| 422 | + ) | |
| 423 | + self.context.queue_steering_message( | |
| 424 | + "Reuse the earlier observation instead of repeating it. " | |
| 425 | + "All explicitly planned artifacts already exist. " | |
| 426 | + "Use the current task artifacts as the source of truth and do not reopen " | |
| 427 | + "reference materials unless one specific gap is still unknown. " | |
| 428 | + + verification_suffix | |
| 429 | + ) | |
| 430 | + return | |
| 431 | + | |
| 329 | 432 | preferred_next_step = infer_preferred_next_step( |
| 330 | 433 | self.context.session.messages, |
| 331 | 434 | current_task=current_task, |
@@ -401,277 +504,159 @@ class ToolBatchRunner: | ||
| 401 | 504 | f"Apply the change to `{target}` with edit/patch/write." |
| 402 | 505 | ) |
| 403 | 506 | |
| 404 | - def _queue_blocked_html_edit_nudge(self, tool_call: ToolCall, event_content: str) -> None: | |
| 405 | - """Steer blocked TOC edits back to the confirmed chapter inventory.""" | |
| 406 | - | |
| 407 | - if tool_call.name not in {"edit", "patch"}: | |
| 408 | - return | |
| 409 | - if not self._targets_html_toc_task(): | |
| 410 | - return | |
| 411 | - | |
| 412 | - target_path = str(tool_call.arguments.get("file_path", "")).strip() | |
| 413 | - if not html_toc_rule.is_html_toc_index_path(target_path): | |
| 414 | - return | |
| 507 | + def _queue_blocked_active_repair_nudge(self, event_content: str) -> None: | |
| 508 | + """Reinforce active repair focus after an out-of-scope blocked observation.""" | |
| 415 | 509 | |
| 416 | - validation = html_toc_rule.validate_html_toc(target_path) | |
| 417 | - if ( | |
| 418 | - "old_string and new_string are identical" in event_content | |
| 419 | - and validation is not None | |
| 420 | - and validation.valid | |
| 421 | - ): | |
| 422 | - action_tracker = getattr(self.context.safeguards, "action_tracker", None) | |
| 423 | - note_validated = getattr(action_tracker, "note_validated_html_toc", None) | |
| 424 | - if callable(note_validated): | |
| 425 | - note_validated(target_path) | |
| 426 | - target_label = html_toc_rule.describe_html_toc_target(target_path) | |
| 427 | - self.context.queue_steering_message( | |
| 428 | - f"The HTML table-of-contents target {target_label} already matches the " | |
| 429 | - "validated replacement block. " | |
| 430 | - f"Semantic verification preview: validated {validation.link_count} linked " | |
| 431 | - "entries. " | |
| 432 | - "Do not call `edit`, `patch`, or reread the same TOC again. Briefly state " | |
| 433 | - f"that {target_label} is already updated so Loader can continue the " | |
| 434 | - "verification gate or finish the task." | |
| 435 | - ) | |
| 510 | + if "[Blocked - active repair scope:" not in event_content: | |
| 436 | 511 | return |
| 437 | 512 | |
| 438 | - current_task = getattr(self.context.session, "current_task", None) | |
| 439 | - confirmed_facts = summarize_confirmed_facts( | |
| 440 | - self.context.session.messages, | |
| 441 | - max_items=2, | |
| 442 | - focus_path=target_path, | |
| 443 | - ) | |
| 444 | - preferred_next_step = infer_preferred_next_step( | |
| 445 | - self.context.session.messages, | |
| 446 | - current_task=current_task, | |
| 447 | - focus_path=target_path, | |
| 448 | - ) | |
| 449 | - verified_inventory = html_toc_rule.summarize_html_inventory(target_path, limit=12) | |
| 450 | - current_excerpt = html_toc_rule.extract_html_toc_excerpt(target_path) | |
| 451 | - suggested_replacement = html_toc_rule.build_html_toc_replacement_block(target_path) | |
| 452 | - suggested_call = html_toc_rule.build_html_toc_edit_call_template(target_path) | |
| 453 | - target_label = html_toc_rule.describe_html_toc_target(target_path) | |
| 454 | - excerpt_suffix = ( | |
| 455 | - f"\nCurrent TOC block:\n{current_excerpt}" | |
| 456 | - if current_excerpt | |
| 457 | - else "" | |
| 458 | - ) | |
| 459 | - replacement_suffix = ( | |
| 460 | - f"\nSuggested replacement block:\n{suggested_replacement}" | |
| 461 | - if suggested_replacement | |
| 462 | - else "" | |
| 463 | - ) | |
| 464 | - call_suffix = ( | |
| 465 | - f"\nSuggested edit call:\n{suggested_call}" | |
| 466 | - if suggested_call | |
| 467 | - else "" | |
| 468 | - ) | |
| 469 | - | |
| 470 | - if preferred_next_step and confirmed_facts and verified_inventory: | |
| 471 | - self.context.queue_steering_message( | |
| 472 | - f"Use the current TOC target contents plus the verified sibling inventory for " | |
| 473 | - f"{target_label} instead of guessing. " | |
| 474 | - f"Confirmed facts: {confirmed_facts}. " | |
| 475 | - f"Known chapter inventory: {verified_inventory}. " | |
| 476 | - f"{preferred_next_step} " | |
| 477 | - f"Apply those exact href/title pairs in {target_label}. " | |
| 478 | - "Do not rewrite the whole document. For `edit`, set `old_string` to the " | |
| 479 | - "current TOC block above exactly and set `new_string` to the suggested " | |
| 480 | - "replacement block below exactly." | |
| 481 | - f"{excerpt_suffix}" | |
| 482 | - f"{replacement_suffix}" | |
| 483 | - f"{call_suffix}" | |
| 484 | - ) | |
| 513 | + repair = extract_active_repair_context(self.context.session.messages) | |
| 514 | + if repair is None: | |
| 485 | 515 | return |
| 486 | 516 | |
| 487 | - if verified_inventory: | |
| 517 | + if repair.allowed_paths: | |
| 518 | + allowed_preview = ", ".join(f"`{path}`" for path in repair.allowed_paths[:3]) | |
| 519 | + if len(repair.allowed_paths) > 3: | |
| 520 | + allowed_preview += ", ..." | |
| 488 | 521 | self.context.queue_steering_message( |
| 489 | - f"Use the current TOC target contents plus the verified sibling inventory for " | |
| 490 | - f"{target_label} instead of guessing. " | |
| 491 | - f"Known chapter inventory: {verified_inventory}. " | |
| 492 | - f"Apply those exact href/title pairs in {target_label}. " | |
| 493 | - "Do not rewrite the whole document. For `edit`, set `old_string` to the " | |
| 494 | - "current TOC block above exactly and set `new_string` to the suggested " | |
| 495 | - "replacement block below exactly." | |
| 496 | - f"{excerpt_suffix}" | |
| 497 | - f"{replacement_suffix}" | |
| 498 | - f"{call_suffix}" | |
| 522 | + "Verification already identified the active repair target. " | |
| 523 | + f"Stay on the concrete repair files {allowed_preview} " | |
| 524 | + f"and repair `{repair.artifact_path}` directly. " | |
| 525 | + "Do not reopen unrelated reference materials while this repair target is unresolved." | |
| 499 | 526 | ) |
| 500 | 527 | return |
| 501 | 528 | |
| 529 | + roots_preview = ", ".join(f"`{root}`" for root in repair.allowed_roots[:2]) | |
| 530 | + if len(repair.allowed_roots) > 2: | |
| 531 | + roots_preview += ", ..." | |
| 502 | 532 | self.context.queue_steering_message( |
| 503 | - f"Use the current TOC target contents when retrying the edit for {target_label} " | |
| 504 | - "instead of guessing. " | |
| 505 | - f"{excerpt_suffix}".strip() | |
| 533 | + "Verification already identified the active repair target. " | |
| 534 | + f"Stay within the current artifact set under {roots_preview} " | |
| 535 | + f"and repair `{repair.artifact_path}` directly. " | |
| 536 | + "Do not reopen unrelated reference materials while this repair target is unresolved." | |
| 506 | 537 | ) |
| 507 | 538 | |
| 508 | - def _queue_verified_html_inventory_nudge(self, tool_call: ToolCall) -> None: | |
| 509 | - """Proactively hand off verified chapter inventory after sibling discovery.""" | |
| 510 | - | |
| 511 | - if tool_call.name != "glob": | |
| 512 | - return | |
| 513 | - | |
| 514 | - chapters_path = str(tool_call.arguments.get("path", "")).strip() | |
| 515 | - if not chapters_path.endswith("chapters"): | |
| 516 | - return | |
| 517 | - | |
| 518 | - index_path = str(Path(chapters_path).expanduser().parent / "index.html") | |
| 519 | - if index_path in self._inventory_hint_targets: | |
| 520 | - return | |
| 539 | + def _queue_blocked_active_repair_mutation_nudge(self, event_content: str) -> None: | |
| 540 | + """Keep repair-phase mutations pinned to the named repair files.""" | |
| 521 | 541 | |
| 522 | - if not self._targets_html_toc_task(): | |
| 542 | + if "[Blocked - active repair mutation scope:" not in event_content: | |
| 523 | 543 | return |
| 524 | 544 | |
| 525 | - verified_inventory = html_toc_rule.summarize_html_inventory(index_path, limit=12) | |
| 526 | - if not verified_inventory: | |
| 545 | + repair = extract_active_repair_context(self.context.session.messages) | |
| 546 | + if repair is None or not repair.allowed_paths: | |
| 527 | 547 | return |
| 528 | 548 | |
| 529 | - self._inventory_hint_targets.add(index_path) | |
| 530 | - target_label = html_toc_rule.describe_html_toc_target(index_path) | |
| 531 | - chapters_label = html_toc_rule.describe_html_toc_chapters_dir(index_path) | |
| 549 | + allowed_preview = ", ".join(f"`{path}`" for path in repair.allowed_paths[:3]) | |
| 550 | + if len(repair.allowed_paths) > 3: | |
| 551 | + allowed_preview += ", ..." | |
| 532 | 552 | self.context.queue_steering_message( |
| 533 | - f"You already have the verified sibling inventory needed for {target_label}. " | |
| 534 | - f"Known chapter inventory: {verified_inventory}. " | |
| 535 | - f"Update {target_label} using those exact href/title pairs instead of rereading " | |
| 536 | - f"files in {chapters_label} unless one specific title is still unknown." | |
| 553 | + "Verification already identified the concrete repair files. " | |
| 554 | + f"Keep mutations pinned to {allowed_preview} " | |
| 555 | + f"and repair `{repair.artifact_path}` before widening the change set." | |
| 537 | 556 | ) |
| 538 | 557 | |
| 539 | - def _annotate_verified_html_inventory(self, tool_call: ToolCall, outcome) -> None: | |
| 540 | - """Attach verified chapter inventory directly to a successful discovery result.""" | |
| 541 | - | |
| 542 | - if tool_call.name != "glob": | |
| 543 | - return | |
| 544 | - | |
| 545 | - chapters_path = str(tool_call.arguments.get("path", "")).strip() | |
| 546 | - if not chapters_path.endswith("chapters"): | |
| 547 | - return | |
| 548 | - | |
| 549 | - if not self._targets_html_toc_task(): | |
| 550 | - return | |
| 551 | - | |
| 552 | - index_path = str(Path(chapters_path).expanduser().parent / "index.html") | |
| 553 | - verified_inventory = html_toc_rule.summarize_html_inventory(index_path, limit=12) | |
| 554 | - if not verified_inventory: | |
| 555 | - return | |
| 556 | - | |
| 557 | - action_tracker = getattr(self.context.safeguards, "action_tracker", None) | |
| 558 | - note_inventory = getattr(action_tracker, "note_verified_html_inventory", None) | |
| 559 | - if callable(note_inventory): | |
| 560 | - note_inventory(index_path) | |
| 561 | - | |
| 562 | - note = f"Verified chapter inventory: {verified_inventory}" | |
| 563 | - merged_event = outcome.event_content | |
| 564 | - if note not in merged_event: | |
| 565 | - merged_event = f"{note}\n{merged_event}".strip() | |
| 566 | - outcome.event_content = merged_event | |
| 567 | - outcome.result_output = merged_event | |
| 568 | - outcome.message.content = f"{note}\n{outcome.message.content}".strip() | |
| 569 | - if outcome.message.tool_results: | |
| 570 | - outcome.message.tool_results[0].content = merged_event | |
| 571 | - | |
| 572 | - def _annotate_validated_html_toc_completion(self, tool_call: ToolCall, outcome) -> None: | |
| 573 | - """Attach semantic TOC validation evidence to a successful mutating result.""" | |
    def _queue_blocked_late_reference_drift_nudge(
        self,
        event_content: str,
        *,
        dod: DefinitionOfDone,
    ) -> None:
        """Reinforce missing-artifact progress after late-stage reference drift is blocked.

        Only fires when the tool event carries the late-reference-drift block
        marker AND at least one explicitly planned artifact is still missing;
        otherwise the turn is left untouched.
        """

        # Marker is emitted by the safeguard that blocked the reread; its
        # absence means this event is unrelated to reference drift.
        if "[Blocked - late reference drift:" not in event_content:
            return

        missing_artifact = _next_missing_planned_artifact(
            dod,
            project_root=self.context.project_root,
        )
        if missing_artifact is None:
            return

        # Collect the unique parent roots of every planned artifact so the
        # steering message can name where output is allowed to land.
        planned_roots: list[str] = []
        seen_roots: set[str] = set()
        for target, expect_directory in collect_planned_artifact_targets(
            dod,
            project_root=self.context.project_root,
        ):
            # Directory targets are roots themselves; file targets contribute
            # their parent directory.
            root = str(target if expect_directory else target.parent)
            if root in seen_roots:
                continue
            seen_roots.add(root)
            planned_roots.append(root)

        # Preview at most two roots; elide the rest to keep the nudge short.
        roots_preview = ", ".join(f"`{root}`" for root in planned_roots[:2])
        if len(planned_roots) > 2:
            roots_preview += ", ..."
        self.context.queue_steering_message(
            "Late-stage reference rereads are no longer helping. "
            "One explicitly planned artifact is still missing."
            + _missing_artifact_resume_suffix(
                missing_artifact,
                project_root=self.context.project_root,
            )
            + f" Stay within the current output roots under {roots_preview}"
            + " and finish that artifact before reopening older reference materials."
        )
| 594 | - merged_event = outcome.event_content | |
| 595 | - if note not in merged_event: | |
| 596 | - merged_event = f"{merged_event}\n{note}".strip() | |
| 597 | - outcome.event_content = merged_event | |
| 598 | - outcome.result_output = merged_event | |
| 599 | - outcome.message.content = f"{outcome.message.content}\n{note}".strip() | |
| 600 | - if outcome.message.tool_results: | |
| 601 | - outcome.message.tool_results[0].content = merged_event | |
| 602 | 601 | |
| 603 | - def _queue_validated_html_toc_completion_nudge(self, tool_call: ToolCall) -> None: | |
| 604 | - """Push the next model turn toward finishing once the TOC already validates.""" | |
    def _queue_blocked_completed_artifact_scope_nudge(
        self,
        event_content: str,
        *,
        dod: DefinitionOfDone,
    ) -> None:
        """Keep post-build review anchored to the generated artifact set.

        Runs only after a tool call was blocked for leaving the completed
        artifact set's scope; steers the next turn back onto the files that
        already exist instead of older reference materials.
        """

        # Marker emitted by the scope safeguard; bail out on unrelated events.
        if "[Blocked - completed artifact set scope:" not in event_content:
            return

        # Unique parent roots of the planned artifacts, in first-seen order.
        planned_roots: list[str] = []
        seen_roots: set[str] = set()
        for target, expect_directory in collect_planned_artifact_targets(
            dod,
            project_root=self.context.project_root,
        ):
            root = str(target if expect_directory else target.parent)
            if root in seen_roots:
                continue
            seen_roots.add(root)
            planned_roots.append(root)

        # First actionable pending todo, skipping bookkeeping-style items.
        next_pending = next(
            (
                item
                for item in effective_pending_todo_items(
                    dod,
                    project_root=self.context.project_root,
                )
                if item not in _TODO_NUDGE_EXCLUDED_ITEMS
            ),
            None,
        )
        roots_preview = ", ".join(f"`{root}`" for root in planned_roots[:2])
        if len(planned_roots) > 2:
            roots_preview += ", ..."
        if next_pending and _todo_is_consistency_review_step(next_pending):
            # A review step is pending: point the model at it explicitly.
            self.context.queue_steering_message(
                "All explicitly planned artifacts already exist. "
                f"Stay within the current output roots under {roots_preview} and continue "
                f"with `{next_pending}` using the generated files as the source of truth. "
                "Do not reopen earlier reference materials."
            )
            return

        # No review step pending: push straight toward verification/closure.
        self.context.queue_steering_message(
            "All explicitly planned artifacts already exist. "
            f"Stay within the current output roots under {roots_preview} "
            "and move to verification or final confirmation using the generated files. "
            "Do not reopen earlier reference materials."
        )
| 654 | + | |
| 655 | + def _queue_blocked_html_edit_nudge(self, tool_call: ToolCall, event_content: str) -> None: | |
| 656 | + """Keep blocked edit feedback generic; avoid task-class-specific steering.""" | |
| 657 | + | |
| 658 | + _ = tool_call, event_content | |
| 659 | + return | |
| 675 | 660 | |
| 676 | 661 | async def _record_successful_execution( |
| 677 | 662 | self, |
@@ -704,15 +689,36 @@ class ToolBatchRunner: | ||
| 704 | 689 | if tool_call.name == "TodoWrite" and outcome.registry_result is not None: |
| 705 | 690 | new_todos = outcome.registry_result.metadata.get("new_todos", []) |
| 706 | 691 | if isinstance(new_todos, list): |
| 707 | - sync_todos_to_definition_of_done(dod, new_todos) | |
| 692 | + sync_todos_to_definition_of_done( | |
| 693 | + dod, | |
| 694 | + new_todos, | |
| 695 | + project_root=self.context.project_root, | |
| 696 | + ) | |
| 697 | + self._queue_todowrite_resume_nudge(dod=dod) | |
| 708 | 698 | else: |
| 709 | 699 | pending_before = list(dod.pending_items) |
| 710 | 700 | if advance_todos_from_tool_call(dod, tool_call): |
| 701 | + reconcile_aggregate_completion_steps( | |
| 702 | + dod, | |
| 703 | + project_root=self.context.project_root, | |
| 704 | + ) | |
| 711 | 705 | self._queue_next_pending_todo_nudge( |
| 712 | 706 | tool_call=tool_call, |
| 713 | 707 | pending_before=pending_before, |
| 714 | 708 | dod=dod, |
| 715 | 709 | ) |
| 710 | + self._queue_bookkeeping_resume_nudge( | |
| 711 | + tool_call=tool_call, | |
| 712 | + dod=dod, | |
| 713 | + ) | |
| 714 | + self._queue_missing_artifact_progress_nudge( | |
| 715 | + tool_call=tool_call, | |
| 716 | + dod=dod, | |
| 717 | + ) | |
| 718 | + self._queue_planned_artifact_handoff_nudge( | |
| 719 | + tool_call=tool_call, | |
| 720 | + dod=dod, | |
| 721 | + ) | |
| 716 | 722 | self.dod_store.save(dod) |
| 717 | 723 | recovery_context = self.context.recovery_context |
| 718 | 724 | if recovery_context is not None: |
@@ -765,7 +771,10 @@ class ToolBatchRunner: | ||
| 765 | 771 | next_pending = next( |
| 766 | 772 | ( |
| 767 | 773 | item |
| 768 | - for item in dod.pending_items | |
| 774 | + for item in effective_pending_todo_items( | |
| 775 | + dod, | |
| 776 | + project_root=self.context.project_root, | |
| 777 | + ) | |
| 769 | 778 | if item not in _TODO_NUDGE_EXCLUDED_ITEMS |
| 770 | 779 | ), |
| 771 | 780 | None, |
@@ -773,12 +782,36 @@ class ToolBatchRunner: | ||
| 773 | 782 | if not completed_label or not next_pending or next_pending == completed_label: |
| 774 | 783 | return |
| 775 | 784 | |
| 785 | + missing_artifact = _next_missing_planned_artifact( | |
| 786 | + dod, | |
| 787 | + project_root=self.context.project_root, | |
| 788 | + ) | |
| 789 | + if _should_prioritize_missing_artifact( | |
| 790 | + next_pending=next_pending, | |
| 791 | + missing_artifact=missing_artifact, | |
| 792 | + ): | |
| 793 | + self.context.queue_steering_message( | |
| 794 | + f"Confirmed progress: `{completed_label}` is now satisfied by the successful " | |
| 795 | + f"`{tool_call.name}` result. One explicitly planned artifact is still missing." | |
| 796 | + + _missing_artifact_resume_suffix( | |
| 797 | + missing_artifact, | |
| 798 | + project_root=self.context.project_root, | |
| 799 | + ) | |
| 800 | + + " Do not switch into review or consistency-check mode until the missing artifact exists." | |
| 801 | + ) | |
| 802 | + return | |
| 803 | + | |
| 776 | 804 | mutation_suffix = "" |
| 777 | 805 | if _todo_is_mutation_step(next_pending): |
| 778 | - mutation_suffix = ( | |
| 779 | - " You already have enough evidence for that step, so stop gathering " | |
| 780 | - "more reference material and perform the change now." | |
| 806 | + mutation_suffix = _missing_artifact_resume_suffix( | |
| 807 | + missing_artifact, | |
| 808 | + project_root=self.context.project_root, | |
| 781 | 809 | ) |
| 810 | + if not mutation_suffix: | |
| 811 | + mutation_suffix = ( | |
| 812 | + " You already have enough evidence for that step, so stop gathering " | |
| 813 | + "more reference material and perform the change now." | |
| 814 | + ) | |
| 782 | 815 | |
| 783 | 816 | self.context.queue_steering_message( |
| 784 | 817 | f"Confirmed progress: `{completed_label}` is now satisfied by the successful " |
@@ -786,6 +819,375 @@ class ToolBatchRunner: | ||
| 786 | 819 | f"`{next_pending}` instead of rereading the same evidence.{mutation_suffix}" |
| 787 | 820 | ) |
| 788 | 821 | |
    def _queue_planned_artifact_handoff_nudge(
        self,
        *,
        tool_call: ToolCall,
        dod: DefinitionOfDone,
    ) -> None:
        """After a mutation completes the planned artifact set, steer toward review/verification."""

        # Only a state-mutating call can have just completed the artifact set.
        if not is_state_mutating_tool_call(tool_call):
            return
        if not all_planned_artifacts_exist(dod, project_root=self.context.project_root):
            return

        # First actionable pending todo, skipping bookkeeping-style items.
        next_pending = next(
            (
                item
                for item in effective_pending_todo_items(
                    dod,
                    project_root=self.context.project_root,
                )
                if item not in _TODO_NUDGE_EXCLUDED_ITEMS
            ),
            None,
        )
        # Fall back to derived commands when none were declared explicitly.
        verification_commands = dod.verification_commands or derive_verification_commands(
            dod,
            project_root=self.context.project_root,
            task_statement=getattr(self.context.session, "current_task", "") or "",
            supplement_existing=True,
        )

        if next_pending and _todo_is_consistency_review_step(next_pending):
            verification_suffix = (
                " Move to verification once no specific mismatch remains."
                if verification_commands
                else " Avoid another full reread unless one specific inconsistency is still unknown."
            )
            self.context.queue_steering_message(
                "All explicitly planned artifacts now exist. "
                f"Continue with the next pending item: `{next_pending}`. "
                "Use the files already on disk as the source of truth instead of restarting "
                "discovery or inventing alternate filenames."
                + verification_suffix
            )
            return

        # Without a review step, only nudge when a verification path exists;
        # otherwise stay silent rather than push an unverifiable handoff.
        if verification_commands:
            self.context.queue_steering_message(
                "All explicitly planned artifacts now exist. "
                "Do not expand the artifact set or restart discovery unless a specific gap is "
                "still known. Move to verification or final confirmation using the files that "
                "already exist."
            )
| 873 | + | |
    def _queue_missing_artifact_progress_nudge(
        self,
        *,
        tool_call: ToolCall,
        dod: DefinitionOfDone,
    ) -> None:
        """After a mutation, keep the model pointed at the next missing planned artifact."""

        if not is_state_mutating_tool_call(tool_call):
            return
        missing_artifact = _next_missing_planned_artifact(
            dod,
            project_root=self.context.project_root,
        )
        if missing_artifact is None:
            return

        # Label of the mutation just confirmed (file basename or shell command).
        current_label = _current_mutation_label(tool_call)
        todo_refresh = _todo_refresh_guidance(
            dod,
            project_root=self.context.project_root,
        )
        self.context.queue_steering_message(
            f"Confirmed progress: {current_label} is now recorded."
            " One explicitly planned artifact is still missing."
            + _missing_artifact_resume_suffix(
                missing_artifact,
                project_root=self.context.project_root,
            )
            + todo_refresh
            + " Do not move to verification, final confirmation, or TodoWrite-only "
            "bookkeeping until that artifact exists."
            + " Do not spend another turn on working notes or rediscovery alone."
        )
| 906 | + | |
    def _queue_todowrite_resume_nudge(
        self,
        *,
        dod: DefinitionOfDone,
    ) -> None:
        """After a TodoWrite sync, queue the steering message that resumes real work.

        Branches on whether a planned artifact is still missing and on the
        shape of the next pending todo (mutation vs. consistency review),
        producing at most one steering message per call.
        """

        missing_artifact = _next_missing_planned_artifact(
            dod,
            project_root=self.context.project_root,
        )
        # First actionable pending todo, skipping bookkeeping-style items.
        next_pending = next(
            (
                item
                for item in effective_pending_todo_items(
                    dod,
                    project_root=self.context.project_root,
                )
                if item not in _TODO_NUDGE_EXCLUDED_ITEMS
            ),
            None,
        )
        if missing_artifact is None:
            # No artifact gap: route by the kind of the next pending step.
            if next_pending and _todo_is_mutation_step(next_pending):
                self.context.queue_steering_message(
                    "Todo tracking is updated. Continue with the next pending item: "
                    f"`{next_pending}`. Use the current output files as the source of "
                    "truth, and do not reopen reference materials unless one specific "
                    "fact required for that step is still unknown. Perform the mutation "
                    "now instead of spending another turn on planning, rereads, or "
                    "verification."
                )
                return

            if (
                next_pending
                and _todo_is_consistency_review_step(next_pending)
                and not all_planned_artifacts_exist(
                    dod,
                    project_root=self.context.project_root,
                )
            ):
                self.context.queue_steering_message(
                    "Todo tracking is updated. Continue with the next pending item: "
                    f"`{next_pending}`. Use the current output files as the source of "
                    "truth, and do not reopen reference materials unless one specific "
                    "mismatch is still unknown."
                )
                return

            # Stay silent when artifacts are incomplete but nothing above matched.
            if not all_planned_artifacts_exist(dod, project_root=self.context.project_root):
                return

            # Fall back to derived commands when none were declared explicitly.
            verification_commands = dod.verification_commands or derive_verification_commands(
                dod,
                project_root=self.context.project_root,
                task_statement=getattr(self.context.session, "current_task", "") or "",
                supplement_existing=True,
            )
            if next_pending and _todo_is_consistency_review_step(next_pending):
                verification_suffix = (
                    " Move to verification once no specific mismatch remains."
                    if verification_commands
                    else " Finish the targeted consistency pass without reopening reference materials."
                )
                self.context.queue_steering_message(
                    "Todo tracking is updated. All explicitly planned artifacts now exist. "
                    f"Continue with the next pending item: `{next_pending}`. "
                    "Use the current output files as the source of truth, and do not restart "
                    "early discovery or reopen reference materials."
                    + verification_suffix
                )
                return

            verification_suffix = (
                " Move to verification or final confirmation using the files already on disk."
                if verification_commands
                else " Finish the task using the files already on disk."
            )
            self.context.queue_steering_message(
                "Todo tracking is updated. All explicitly planned artifacts now exist. "
                "Do not restart discovery, reopen reference materials, or spend another turn "
                "on TodoWrite alone."
                + verification_suffix
            )
            return

        # A planned artifact is still missing: anchor the next turn on it.
        todo_refresh = _todo_refresh_guidance(
            dod,
            project_root=self.context.project_root,
        )
        next_pending_suffix = (
            f" Continue with the next pending item: `{next_pending}`."
            if next_pending
            else ""
        )
        self.context.queue_steering_message(
            "Todo tracking is updated. An explicitly planned artifact is still missing."
            + next_pending_suffix
            + _missing_artifact_resume_suffix(
                missing_artifact,
                project_root=self.context.project_root,
            )
            + todo_refresh
            + " Do not spend the next turn on TodoWrite alone, bookkeeping notes, "
            "verification, or final confirmation until that artifact exists."
        )
| 1012 | + | |
    def _queue_bookkeeping_resume_nudge(
        self,
        *,
        tool_call: ToolCall,
        dod: DefinitionOfDone,
    ) -> None:
        """After a note-taking tool call, redirect toward the missing planned artifact."""

        # Only react to bookkeeping/note tools; real work is handled elsewhere.
        if tool_call.name not in _BOOKKEEPING_NOTE_TOOL_NAMES:
            return

        missing_artifact = _next_missing_planned_artifact(
            dod,
            project_root=self.context.project_root,
        )
        if missing_artifact is None:
            return

        # First actionable pending todo, skipping bookkeeping-style items.
        next_pending = next(
            (
                item
                for item in effective_pending_todo_items(
                    dod,
                    project_root=self.context.project_root,
                )
                if item not in _TODO_NUDGE_EXCLUDED_ITEMS
            ),
            None,
        )
        todo_refresh = _todo_refresh_guidance(
            dod,
            project_root=self.context.project_root,
        )
        if (
            next_pending
            and not _todo_is_mutation_step(next_pending)
            and not _todo_is_consistency_review_step(next_pending)
        ):
            # The pending step is evidence gathering: ask for one concrete
            # tool call toward it rather than more notes.
            self.context.queue_steering_message(
                "Bookkeeping note is recorded. Continue with the next pending item: "
                f"`{next_pending}`. Make your next response one concrete evidence-gathering "
                "tool call that advances that step, not another bookkeeping-only turn."
                + todo_refresh
                + " Do not jump ahead to later artifact creation, verification, or final "
                "confirmation until that step is satisfied."
            )
            return

        self.context.queue_steering_message(
            "Bookkeeping note is recorded. An explicitly planned artifact is still missing."
            + _missing_artifact_resume_suffix(
                missing_artifact,
                project_root=self.context.project_root,
            )
            + todo_refresh
            + " Do not spend the next turn on additional notes, rediscovery, "
            "verification, or final confirmation until that artifact exists."
        )
| 1069 | + | |
| 1070 | + | |
def _todo_is_consistency_review_step(item: str) -> bool:
    """True when a todo item reads like a consistency/review pass."""
    lowered = item.lower()
    return any(marker in lowered for marker in _CONSISTENCY_REVIEW_HINTS)
| 1074 | + | |
| 1075 | + | |
| 1076 | +def _should_prioritize_missing_artifact( | |
| 1077 | + *, | |
| 1078 | + next_pending: str | None, | |
| 1079 | + missing_artifact: tuple[Path, bool] | None, | |
| 1080 | +) -> bool: | |
| 1081 | + if missing_artifact is None: | |
| 1082 | + return False | |
| 1083 | + if not next_pending: | |
| 1084 | + return True | |
| 1085 | + if _todo_is_consistency_review_step(next_pending): | |
| 1086 | + return True | |
| 1087 | + return not _todo_is_mutation_step(next_pending) | |
| 1088 | + | |
| 1089 | + | |
def _next_missing_planned_artifact(
    dod: DefinitionOfDone,
    *,
    project_root: Path,
) -> tuple[Path, bool] | None:
    """Return the first planned artifact target that is not yet satisfied.

    Scans at most 12 planned targets in declaration order and yields the
    first `(target, expect_directory)` pair whose artifact is absent, or
    None when everything planned already exists.
    """
    planned = collect_planned_artifact_targets(
        dod,
        project_root=project_root,
        max_paths=12,
    )
    for target, expect_directory in planned:
        satisfied = planned_artifact_target_satisfied(
            dod,
            target=target,
            expect_directory=expect_directory,
            project_root=project_root,
        )
        if satisfied:
            continue
        return target, expect_directory
    return None
| 1108 | + | |
| 1109 | + | |
def _missing_artifact_resume_suffix(
    missing_artifact: tuple[Path, bool] | None,
    *,
    project_root: Path,
) -> str:
    """Build the steering-message suffix that tells the model how to resume.

    Returns "" when nothing is missing. Otherwise names the missing file or
    directory and prescribes one concrete mutation (usually a single `write`
    call) as the next action.
    """

    if missing_artifact is None:
        return ""

    target, expect_directory = missing_artifact
    label = target.name or str(target)
    # Directory targets are rendered with a trailing slash for clarity.
    if expect_directory and not label.endswith("/"):
        label += "/"
    if expect_directory:
        next_output_file = infer_next_declared_html_output_file(
            target=target,
            project_root=project_root,
        )
        if next_output_file is not None:
            # A specific declared output is known: name it exactly.
            guidance = (
                f" Resume by creating `{next_output_file.name}` now. It is the next missing "
                f"declared output under `{label}`. Prefer one `write` call for "
                f"`{next_output_file}` instead of more rereads."
            )
            if not next_output_file.parent.exists():
                guidance += (
                    " The `write` tool can create that file's parent directories automatically,"
                    " so do the write in one step instead of stopping for a separate mkdir."
                )
            guidance += (
                " Make your next response the concrete mutation tool call itself, not another"
                " bookkeeping-only turn."
            )
            return guidance
        if target.is_dir():
            # Directory already exists: the gap is a file inside it.
            return (
                f" Resume by creating the next output file under `{label}` now. Prefer one "
                f"concrete `write` call for a file inside `{target}` instead of more rereads."
                " Make your next response the concrete mutation tool call itself, not another"
                " bookkeeping-only turn."
            )
        # The directory itself is missing: ask for its creation first.
        return (
            f" Resume by creating `{label}` now. Prefer one concrete directory-creation "
            f"step for `{target}` instead of more rereads."
        )
    # File target: one write call, with a reminder that parents auto-create.
    guidance = (
        f" Resume by creating `{label}` now. Prefer one `write` call for `{target}` "
        "instead of more rereads."
    )
    if not target.parent.exists():
        guidance += (
            " The `write` tool can create that file's parent directories automatically,"
            " so do the write in one step instead of stopping for a separate mkdir."
        )
    guidance += (
        " Make your next response the concrete mutation tool call itself, not another"
        " bookkeeping-only turn."
    )
    return guidance
| 1168 | + | |
| 1169 | + | |
def _todo_refresh_guidance(
    dod: DefinitionOfDone,
    *,
    project_root: Path | None = None,
) -> str:
    """Suffix asking for an in-step TodoWrite refresh once the task has real footprint.

    Returns "" while the task footprint is still tiny (fewer than two touched
    files and fewer than three tracked non-special steps).
    """
    excluded = _TODO_NUDGE_EXCLUDED_ITEMS
    pending = [
        item
        for item in effective_pending_todo_items(dod, project_root=project_root)
        if item not in excluded
    ]
    completed = [item for item in dod.completed_items if item not in excluded]
    tracked_steps = len(pending) + len(completed)
    # Skip the reminder while there is not yet enough state worth syncing.
    if len(dod.touched_files) < 2 and tracked_steps < 3:
        return ""
    return (
        " If the tracked steps no longer match the confirmed progress, refresh `TodoWrite` "
        "in the same response as the next concrete step instead of spending a full turn on "
        "bookkeeping alone."
    )
| 1190 | + | |
| 789 | 1191 | |
| 790 | 1192 | def _mark_verification_stale( |
| 791 | 1193 | *, |
@@ -953,6 +1355,18 @@ def _stale_verification_detail(tool_call: ToolCall) -> str: | ||
| 953 | 1355 | return f"{tool_call.name} changed the workspace" |
| 954 | 1356 | |
| 955 | 1357 | |
| 1358 | +def _current_mutation_label(tool_call: ToolCall) -> str: | |
| 1359 | + if tool_call.name in {"write", "edit", "patch"}: | |
| 1360 | + file_path = str(tool_call.arguments.get("file_path", "")).strip() | |
| 1361 | + if file_path: | |
| 1362 | + return f"`{Path(file_path).name or file_path}`" | |
| 1363 | + if tool_call.name == "bash": | |
| 1364 | + command = str(tool_call.arguments.get("command", "")).strip() | |
| 1365 | + if command: | |
| 1366 | + return f"`{command}`" | |
| 1367 | + return f"the successful `{tool_call.name}` result" | |
| 1368 | + | |
| 1369 | + | |
| 956 | 1370 | def _tool_call_label(tool_call: ToolCall) -> str: |
| 957 | 1371 | """Human-readable label for one tool call.""" |
| 958 | 1372 | name = tool_call.name |
src/loader/runtime/turn_completion.pymodified@@ -230,10 +230,6 @@ class TurnCompletionController: | ||
| 230 | 230 | actions_taken=actions_taken, |
| 231 | 231 | ) |
| 232 | 232 | |
| 233 | - final_message = Message(role=Role.ASSISTANT, content=response_content) | |
| 234 | - self.context.session.append(final_message) | |
| 235 | - summary.assistant_messages.append(final_message) | |
| 236 | - | |
| 237 | 233 | gate_result = await self.finalizer.run_definition_of_done_gate( |
| 238 | 234 | dod=dod, |
| 239 | 235 | candidate_response=final_response, |
@@ -261,6 +257,9 @@ class TurnCompletionController: | ||
| 261 | 257 | continuation_count=continuation_count, |
| 262 | 258 | ) |
| 263 | 259 | final_response = gate_result.final_response |
| 260 | + final_message = Message(role=Role.ASSISTANT, content=response_content) | |
| 261 | + self.context.session.append(final_message) | |
| 262 | + summary.assistant_messages.append(final_message) | |
| 264 | 263 | self._record_completion_decision( |
| 265 | 264 | summary=summary, |
| 266 | 265 | decision_code=gate_result.reason_code, |
src/loader/runtime/turn_iteration.pymodified@@ -135,9 +135,11 @@ class TurnIterationController: | ||
| 135 | 135 | extracted_iterations=extracted_iterations, |
| 136 | 136 | continuation_count=continuation_count, |
| 137 | 137 | consecutive_errors=consecutive_errors, |
| 138 | + dod=dod, | |
| 138 | 139 | emit=emit, |
| 139 | 140 | summary=summary, |
| 140 | 141 | ) |
| 142 | + reset_empty_retry_count = 0 | |
| 141 | 143 | |
| 142 | 144 | analysis = self.repairer.analyze_response( |
| 143 | 145 | content=assistant_turn.content, |
@@ -196,7 +198,7 @@ class TurnIterationController: | ||
| 196 | 198 | return TurnIterationDecision( |
| 197 | 199 | action=TurnIterationAction.CONTINUE, |
| 198 | 200 | continuation_count=route_decision.continuation_count, |
| 199 | - empty_retry_count=empty_retry_count, | |
| 201 | + empty_retry_count=reset_empty_retry_count, | |
| 200 | 202 | extracted_iterations=extracted_iterations, |
| 201 | 203 | consecutive_errors=route_decision.consecutive_errors, |
| 202 | 204 | new_actions_taken=route_decision.new_actions_taken, |
@@ -205,7 +207,7 @@ class TurnIterationController: | ||
| 205 | 207 | return TurnIterationDecision( |
| 206 | 208 | action=TurnIterationAction.FINALIZE, |
| 207 | 209 | continuation_count=route_decision.continuation_count, |
| 208 | - empty_retry_count=empty_retry_count, | |
| 210 | + empty_retry_count=reset_empty_retry_count, | |
| 209 | 211 | extracted_iterations=extracted_iterations, |
| 210 | 212 | consecutive_errors=route_decision.consecutive_errors, |
| 211 | 213 | new_actions_taken=route_decision.new_actions_taken, |
@@ -215,7 +217,7 @@ class TurnIterationController: | ||
| 215 | 217 | return TurnIterationDecision( |
| 216 | 218 | action=TurnIterationAction.COMPLETE, |
| 217 | 219 | continuation_count=route_decision.continuation_count, |
| 218 | - empty_retry_count=empty_retry_count, | |
| 220 | + empty_retry_count=reset_empty_retry_count, | |
| 219 | 221 | extracted_iterations=extracted_iterations, |
| 220 | 222 | consecutive_errors=route_decision.consecutive_errors, |
| 221 | 223 | new_actions_taken=route_decision.new_actions_taken, |
@@ -231,6 +233,7 @@ class TurnIterationController: | ||
| 231 | 233 | extracted_iterations: int, |
| 232 | 234 | continuation_count: int, |
| 233 | 235 | consecutive_errors: int, |
| 236 | + dod: DefinitionOfDone, | |
| 234 | 237 | emit: EventSink, |
| 235 | 238 | summary: TurnSummary, |
| 236 | 239 | ) -> TurnIterationDecision: |
@@ -247,6 +250,7 @@ class TurnIterationController: | ||
| 247 | 250 | original_task=original_task, |
| 248 | 251 | empty_retry_count=next_empty_retry_count, |
| 249 | 252 | max_empty_retries=max_empty_retries, |
| 253 | + dod=dod, | |
| 250 | 254 | ) |
| 251 | 255 | if empty_decision.should_continue and empty_decision.retry_message: |
| 252 | 256 | if empty_decision.reason_code and empty_decision.reason_summary: |
@@ -289,9 +293,11 @@ class TurnIterationController: | ||
| 289 | 293 | ) |
| 290 | 294 | await emit(AgentEvent(type="response", content=final_response)) |
| 291 | 295 | return TurnIterationDecision( |
| 292 | - action=TurnIterationAction.COMPLETE, | |
| 296 | + action=TurnIterationAction.FINALIZE, | |
| 293 | 297 | continuation_count=continuation_count, |
| 294 | 298 | empty_retry_count=next_empty_retry_count, |
| 295 | 299 | extracted_iterations=extracted_iterations, |
| 296 | 300 | consecutive_errors=consecutive_errors, |
| 301 | + finalize_reason_code=empty_decision.reason_code, | |
| 302 | + finalize_reason_summary=empty_decision.reason_summary, | |
| 297 | 303 | ) |
src/loader/runtime/turn_loop.pymodified@@ -40,7 +40,7 @@ class TurnLoopState: | ||
| 40 | 40 | empty_retry_count: int = 0 |
| 41 | 41 | extracted_iterations: int = 0 |
| 42 | 42 | consecutive_errors: int = 0 |
| 43 | - max_empty_retries: int = 5 | |
| 43 | + max_empty_retries: int = 2 | |
| 44 | 44 | max_extracted_iterations: int = 3 |
| 45 | 45 | |
| 46 | 46 | |
src/loader/runtime/turn_preparation.pymodified@@ -160,6 +160,7 @@ class TurnPreparationController: | ||
| 160 | 160 | registry=self.context.registry, |
| 161 | 161 | rollback_plan=rollback_plan, |
| 162 | 162 | workspace_root=self.context.project_root, |
| 163 | + session=self.context.session, | |
| 163 | 164 | ), |
| 164 | 165 | ) |
| 165 | 166 | return executor, rollback_plan |
src/loader/runtime/workflow.pymodified@@ -10,7 +10,12 @@ from typing import ClassVar | ||
| 10 | 10 | |
| 11 | 11 | from ..llm.base import ToolCall |
| 12 | 12 | from .clarify_grounding import ClarifyGrounding |
| 13 | -from .dod import slugify | |
| 13 | +from .dod import ( | |
| 14 | + all_planned_artifacts_exist, | |
| 15 | + collect_planned_artifact_targets, | |
| 16 | + planned_artifact_target_satisfied, | |
| 17 | + slugify, | |
| 18 | +) | |
| 14 | 19 | from .workflow_policy import ( |
| 15 | 20 | ArtifactEvidence, |
| 16 | 21 | ArtifactEvidenceKind, |
@@ -46,12 +51,14 @@ __all__ = [ | ||
| 46 | 51 | "WorkflowTimelineEntryKind", |
| 47 | 52 | "advance_todos_from_tool_call", |
| 48 | 53 | "build_execute_bridge", |
| 54 | + "effective_pending_todo_items", | |
| 49 | 55 | "enrich_clarify_brief_with_grounding", |
| 50 | 56 | "extract_verification_commands_from_markdown", |
| 51 | 57 | "load_brief", |
| 52 | 58 | "load_planning_artifacts", |
| 53 | 59 | "merge_refreshed_todos_with_existing_scope", |
| 54 | 60 | "preserve_task_grounded_acceptance_criteria", |
| 61 | + "reconcile_aggregate_completion_steps", | |
| 55 | 62 | "sync_todos_to_definition_of_done", |
| 56 | 63 | ] |
| 57 | 64 | |
@@ -106,16 +113,35 @@ _PARSE_STEP_HINTS = ( | ||
| 106 | 113 | ) |
| 107 | 114 | _MUTATION_STEP_HINTS = ( |
| 108 | 115 | "create", |
| 116 | + "creating", | |
| 109 | 117 | "update", |
| 118 | + "updating", | |
| 110 | 119 | "edit", |
| 120 | + "editing", | |
| 111 | 121 | "write", |
| 122 | + "writing", | |
| 112 | 123 | "fix", |
| 124 | + "fixing", | |
| 113 | 125 | "modify", |
| 126 | + "modifying", | |
| 114 | 127 | "change", |
| 128 | + "changing", | |
| 115 | 129 | "patch", |
| 130 | + "patching", | |
| 116 | 131 | "replace", |
| 132 | + "replacing", | |
| 117 | 133 | "correct", |
| 134 | + "correcting", | |
| 118 | 135 | "rewrite", |
| 136 | + "rewriting", | |
| 137 | +) | |
| 138 | +_CREATION_STEP_HINTS = ( | |
| 139 | + "create", | |
| 140 | + "creating", | |
| 141 | + "generate", | |
| 142 | + "generating", | |
| 143 | + "scaffold", | |
| 144 | + "scaffolding", | |
| 119 | 145 | ) |
| 120 | 146 | _VERIFY_STEP_HINTS = ( |
| 121 | 147 | "verify", |
@@ -136,6 +162,20 @@ _AGGREGATE_TODO_HINTS = ( | ||
| 136 | 162 | "properly linked", |
| 137 | 163 | "directory structure", |
| 138 | 164 | ) |
| 165 | +_ARTIFACT_SET_COMPLETION_HINTS = ( | |
| 166 | + "link", | |
| 167 | + "links", | |
| 168 | + "linked", | |
| 169 | + "navigation", | |
| 170 | + "consistency", | |
| 171 | + "consistent", | |
| 172 | + "formatted", | |
| 173 | + "formatting", | |
| 174 | + "review", | |
| 175 | +) | |
| 176 | +_TODO_FILE_CANDIDATE_PATTERN = re.compile( | |
| 177 | + r"(?:[A-Za-z0-9_.-]+/)*[A-Za-z0-9_.-]+\.[A-Za-z0-9]+" | |
| 178 | +) | |
| 139 | 179 | _ACTIONABLE_STEP_VERBS = { |
| 140 | 180 | "add", |
| 141 | 181 | "apply", |
@@ -560,6 +600,25 @@ class PlanningArtifacts: | ||
| 560 | 600 | implementation_steps=list(self.implementation_steps), |
| 561 | 601 | ) |
| 562 | 602 | |
| 603 | + def with_file_changes(self, file_changes: list[str]) -> PlanningArtifacts: | |
| 604 | + """Return one copy with a rewritten file-changes section.""" | |
| 605 | + | |
| 606 | + normalized = [item.strip() for item in file_changes if item.strip()] | |
| 607 | + if not normalized: | |
| 608 | + return self | |
| 609 | + | |
| 610 | + return PlanningArtifacts( | |
| 611 | + implementation_markdown=_replace_markdown_section_items( | |
| 612 | + self.implementation_markdown, | |
| 613 | + "File Changes", | |
| 614 | + normalized, | |
| 615 | + ), | |
| 616 | + verification_markdown=self.verification_markdown, | |
| 617 | + verification_commands=list(self.verification_commands), | |
| 618 | + acceptance_criteria=list(self.acceptance_criteria), | |
| 619 | + implementation_steps=list(self.implementation_steps), | |
| 620 | + ) | |
| 621 | + | |
| 563 | 622 | def with_progress_context( |
| 564 | 623 | self, |
| 565 | 624 | *, |
@@ -650,6 +709,8 @@ def load_planning_artifacts( | ||
| 650 | 709 | def sync_todos_to_definition_of_done( |
| 651 | 710 | dod, |
| 652 | 711 | todos: list[dict[str, str]], |
| 712 | + *, | |
| 713 | + project_root: Path | None = None, | |
| 653 | 714 | ) -> None: |
| 654 | 715 | """Reflect todo state into DoD pending/completed items.""" |
| 655 | 716 | |
@@ -671,24 +732,100 @@ def sync_todos_to_definition_of_done( | ||
| 671 | 732 | "Collect verification evidence", |
| 672 | 733 | } |
| 673 | 734 | ] |
| 735 | + existing_completed = { | |
| 736 | + item.strip() | |
| 737 | + for item in dod.completed_items | |
| 738 | + if item.strip() and item not in _SPECIAL_TODO_ITEMS | |
| 739 | + } | |
| 674 | 740 | |
| 675 | 741 | pending: list[str] = [] |
| 676 | 742 | completed: list[str] = [] |
| 677 | 743 | for item in todos: |
| 678 | 744 | status = str(item.get("status", "")).strip().lower() |
| 679 | - label = str( | |
| 680 | - item.get("active_form") if status == "in_progress" else item.get("content", "") | |
| 681 | - ).strip() | |
| 682 | - if not label: | |
| 745 | + content = str(item.get("content", "")).strip() | |
| 746 | + active_form = str(item.get("active_form", "")).strip() | |
| 747 | + label = active_form if status == "in_progress" else content | |
| 748 | + if not label and not content: | |
| 749 | + continue | |
| 750 | + # Treat exact todo items as monotonic. If a successful tool call already | |
| 751 | + # marked the same todo complete, a stale TodoWrite snapshot should not | |
| 752 | + # regress it back to pending / in progress. | |
| 753 | + if status != "completed" and ( | |
| 754 | + content in existing_completed or active_form in existing_completed | |
| 755 | + ): | |
| 756 | + completed.append(content or active_form or label) | |
| 683 | 757 | continue |
| 684 | 758 | if status == "completed": |
| 685 | - completed.append(str(item.get("content", label)).strip()) | |
| 759 | + completed.append(content or label) | |
| 686 | 760 | else: |
| 687 | 761 | pending.append(label) |
| 688 | 762 | |
| 689 | 763 | dod.pending_items = list(dict.fromkeys(pending + special_pending)) |
| 690 | 764 | dod.completed_items = list(dict.fromkeys(completed + special_completed)) |
| 691 | 765 | |
| 766 | + if project_root is not None: | |
| 767 | + _reopen_aggregate_completion_steps_for_missing_artifacts( | |
| 768 | + dod, | |
| 769 | + project_root=project_root, | |
| 770 | + ) | |
| 771 | + _reopen_directory_content_steps_for_incomplete_artifacts( | |
| 772 | + dod, | |
| 773 | + project_root=project_root, | |
| 774 | + ) | |
| 775 | + dod.pending_items = effective_pending_todo_items( | |
| 776 | + dod, | |
| 777 | + project_root=project_root, | |
| 778 | + ) | |
| 779 | + | |
| 780 | + | |
| 781 | +def effective_pending_todo_items( | |
| 782 | + dod, | |
| 783 | + *, | |
| 784 | + project_root: Path | None = None, | |
| 785 | +) -> list[str]: | |
| 786 | + """Return pending todo items after filtering stale artifact-expansion drift.""" | |
| 787 | + | |
| 788 | + pending_items = [item for item in dod.pending_items if item.strip()] | |
| 789 | + if not pending_items or project_root is None or dod.status == "fixing": | |
| 790 | + return pending_items | |
| 791 | + | |
| 792 | + planned_targets = collect_planned_artifact_targets( | |
| 793 | + dod, | |
| 794 | + project_root=project_root, | |
| 795 | + max_paths=24, | |
| 796 | + ) | |
| 797 | + if not planned_targets: | |
| 798 | + return pending_items | |
| 799 | + if not all_planned_artifacts_exist(dod, project_root=project_root, max_paths=24): | |
| 800 | + return pending_items | |
| 801 | + | |
| 802 | + planned_files = { | |
| 803 | + target.name.lower() | |
| 804 | + for target, expect_directory in planned_targets | |
| 805 | + if not expect_directory | |
| 806 | + } | |
| 807 | + if not planned_files: | |
| 808 | + return pending_items | |
| 809 | + | |
| 810 | + filtered_items = [ | |
| 811 | + item | |
| 812 | + for item in pending_items | |
| 813 | + if not _todo_targets_unplanned_artifact(item, planned_files) | |
| 814 | + ] | |
| 815 | + filtered_items = [ | |
| 816 | + item | |
| 817 | + for item in filtered_items | |
| 818 | + if not _todo_describes_stale_creation_after_artifacts_exist( | |
| 819 | + item, | |
| 820 | + planned_files, | |
| 821 | + ) | |
| 822 | + ] | |
| 823 | + return [ | |
| 824 | + item | |
| 825 | + for item in filtered_items | |
| 826 | + if not _todo_describes_stale_discovery_after_artifacts_exist(item) | |
| 827 | + ] | |
| 828 | + | |
| 692 | 829 | |
| 693 | 830 | def preserve_task_grounded_acceptance_criteria( |
| 694 | 831 | task_statement: str, |
@@ -714,6 +851,7 @@ def merge_refreshed_todos_with_existing_scope( | ||
| 714 | 851 | existing_pending_items: list[str], |
| 715 | 852 | existing_completed_items: list[str], |
| 716 | 853 | refreshed_steps: list[str], |
| 854 | + planned_files: set[str] | None = None, | |
| 717 | 855 | ) -> list[dict[str, str]]: |
| 718 | 856 | """Merge one refreshed plan with task-grounded todo scope already in flight.""" |
| 719 | 857 | |
@@ -740,6 +878,12 @@ def merge_refreshed_todos_with_existing_scope( | ||
| 740 | 878 | or _looks_actionable_refresh_step(item) |
| 741 | 879 | ) |
| 742 | 880 | ] |
| 881 | + if planned_files: | |
| 882 | + refreshed_candidates = [ | |
| 883 | + item | |
| 884 | + for item in refreshed_candidates | |
| 885 | + if not _todo_targets_unplanned_artifact(item, planned_files) | |
| 886 | + ] | |
| 743 | 887 | |
| 744 | 888 | todos: list[dict[str, str]] = [] |
| 745 | 889 | seen: set[str] = set() |
@@ -839,6 +983,12 @@ def _todo_progress_score(item: str, tool_call: ToolCall) -> int: | ||
| 839 | 983 | if _contains_any(text, _PARSE_STEP_HINTS) and ".html" in combined: |
| 840 | 984 | score += 1 |
| 841 | 985 | elif name in {"glob", "grep"}: |
| 986 | + if not ( | |
| 987 | + _contains_any(text, _SEARCH_STEP_HINTS) | |
| 988 | + or _contains_any(text, _READ_STEP_HINTS) | |
| 989 | + or _contains_any(text, _PARSE_STEP_HINTS) | |
| 990 | + ): | |
| 991 | + return 0 | |
| 842 | 992 | if _contains_any(text, _SEARCH_STEP_HINTS): |
| 843 | 993 | score += 2 |
| 844 | 994 | if name == "glob" and _contains_any(text, _READ_STEP_HINTS) and ".html" in combined: |
@@ -874,12 +1024,231 @@ def _contains_any(text: str, candidates: tuple[str, ...]) -> bool: | ||
| 874 | 1024 | |
| 875 | 1025 | |
| 876 | 1026 | def _todo_describes_aggregate_mutation(text: str) -> bool: |
| 877 | - return _contains_any(text, _AGGREGATE_TODO_HINTS) and _contains_any( | |
| 1027 | + return ( | |
| 1028 | + _contains_any(text, _AGGREGATE_TODO_HINTS) | |
| 1029 | + or _todo_mentions_plural_output_set(text) | |
| 1030 | + ) and _contains_any( | |
| 878 | 1031 | text, |
| 879 | 1032 | _MUTATION_STEP_HINTS, |
| 880 | 1033 | ) |
| 881 | 1034 | |
| 882 | 1035 | |
| 1036 | +def _todo_requires_complete_artifact_set(text: str) -> bool: | |
| 1037 | + return ( | |
| 1038 | + _contains_any(text, _AGGREGATE_TODO_HINTS) | |
| 1039 | + or _todo_mentions_plural_output_set(text) | |
| 1040 | + ) and _contains_any( | |
| 1041 | + text, | |
| 1042 | + _ARTIFACT_SET_COMPLETION_HINTS, | |
| 1043 | + ) | |
| 1044 | + | |
| 1045 | + | |
| 1046 | +def _todo_mentions_plural_output_set(text: str) -> bool: | |
| 1047 | + if _TODO_FILE_CANDIDATE_PATTERN.search(text): | |
| 1048 | + return False | |
| 1049 | + return any( | |
| 1050 | + phrase in text | |
| 1051 | + for phrase in ( | |
| 1052 | + "chapter files", | |
| 1053 | + "all chapters", | |
| 1054 | + "chapters", | |
| 1055 | + "files following", | |
| 1056 | + "files with", | |
| 1057 | + "output files", | |
| 1058 | + "artifacts", | |
| 1059 | + "documents", | |
| 1060 | + "sections", | |
| 1061 | + "pages", | |
| 1062 | + ) | |
| 1063 | + ) | |
| 1064 | + | |
| 1065 | + | |
| 1066 | +def _todo_targets_unplanned_artifact(item: str, planned_files: set[str]) -> bool: | |
| 1067 | + if item in _SPECIAL_TODO_ITEMS: | |
| 1068 | + return False | |
| 1069 | + | |
| 1070 | + text = item.strip().lower() | |
| 1071 | + if not text or not _contains_any(text, _MUTATION_STEP_HINTS): | |
| 1072 | + return False | |
| 1073 | + | |
| 1074 | + candidates = { | |
| 1075 | + Path(match).name.lower() | |
| 1076 | + for match in _TODO_FILE_CANDIDATE_PATTERN.findall(text) | |
| 1077 | + } | |
| 1078 | + if not candidates: | |
| 1079 | + return False | |
| 1080 | + | |
| 1081 | + return candidates.isdisjoint(planned_files) | |
| 1082 | + | |
| 1083 | + | |
| 1084 | +def _todo_describes_stale_discovery_after_artifacts_exist(item: str) -> bool: | |
| 1085 | + text = item.strip().lower() | |
| 1086 | + if not text or item in _SPECIAL_TODO_ITEMS: | |
| 1087 | + return False | |
| 1088 | + if _contains_any(text, _VERIFY_STEP_HINTS): | |
| 1089 | + return False | |
| 1090 | + if _contains_any(text, _MUTATION_STEP_HINTS): | |
| 1091 | + return False | |
| 1092 | + if _contains_any(text, _ARTIFACT_SET_COMPLETION_HINTS): | |
| 1093 | + return False | |
| 1094 | + return ( | |
| 1095 | + _contains_any(text, _READ_STEP_HINTS) | |
| 1096 | + or _contains_any(text, _SEARCH_STEP_HINTS) | |
| 1097 | + or _contains_any(text, _PARSE_STEP_HINTS) | |
| 1098 | + ) | |
| 1099 | + | |
| 1100 | + | |
| 1101 | +def _todo_describes_stale_creation_after_artifacts_exist( | |
| 1102 | + item: str, | |
| 1103 | + planned_files: set[str], | |
| 1104 | +) -> bool: | |
| 1105 | + text = item.strip().lower() | |
| 1106 | + if not text or item in _SPECIAL_TODO_ITEMS: | |
| 1107 | + return False | |
| 1108 | + if _contains_any(text, _VERIFY_STEP_HINTS): | |
| 1109 | + return False | |
| 1110 | + if not _contains_any(text, _CREATION_STEP_HINTS): | |
| 1111 | + return False | |
| 1112 | + candidates = { | |
| 1113 | + Path(match).name.lower() | |
| 1114 | + for match in _TODO_FILE_CANDIDATE_PATTERN.findall(text) | |
| 1115 | + } | |
| 1116 | + if not candidates: | |
| 1117 | + return False | |
| 1118 | + return not candidates.isdisjoint(planned_files) | |
| 1119 | + | |
| 1120 | + | |
| 1121 | +def _todo_describes_directory_content_creation( | |
| 1122 | + item: str, | |
| 1123 | + directories: list[Path], | |
| 1124 | +) -> bool: | |
| 1125 | + text = item.strip().lower() | |
| 1126 | + if not text or item in _SPECIAL_TODO_ITEMS: | |
| 1127 | + return False | |
| 1128 | + if not _contains_any(text, _CREATION_STEP_HINTS): | |
| 1129 | + return False | |
| 1130 | + if not any( | |
| 1131 | + token in text | |
| 1132 | + for token in ( | |
| 1133 | + "file", | |
| 1134 | + "files", | |
| 1135 | + "chapter", | |
| 1136 | + "chapters", | |
| 1137 | + "page", | |
| 1138 | + "pages", | |
| 1139 | + "artifact", | |
| 1140 | + "artifacts", | |
| 1141 | + "content", | |
| 1142 | + "test", | |
| 1143 | + "tests", | |
| 1144 | + ) | |
| 1145 | + ): | |
| 1146 | + return False | |
| 1147 | + | |
| 1148 | + for directory in directories: | |
| 1149 | + name = directory.name.lower() | |
| 1150 | + tokens = {name} | |
| 1151 | + if name.endswith("ies") and len(name) > 3: | |
| 1152 | + tokens.add(f"{name[:-3]}y") | |
| 1153 | + elif name.endswith("s") and len(name) > 3: | |
| 1154 | + tokens.add(name[:-1]) | |
| 1155 | + if any(token in text for token in tokens): | |
| 1156 | + return True | |
| 1157 | + return False | |
| 1158 | + | |
| 1159 | + | |
| 1160 | +def _reopen_aggregate_completion_steps_for_missing_artifacts( | |
| 1161 | + dod, | |
| 1162 | + *, | |
| 1163 | + project_root: Path, | |
| 1164 | +) -> None: | |
| 1165 | + planned_targets = collect_planned_artifact_targets( | |
| 1166 | + dod, | |
| 1167 | + project_root=project_root, | |
| 1168 | + max_paths=12, | |
| 1169 | + ) | |
| 1170 | + if not planned_targets: | |
| 1171 | + return | |
| 1172 | + | |
| 1173 | + if all_planned_artifacts_exist(dod, project_root=project_root, max_paths=12): | |
| 1174 | + return | |
| 1175 | + | |
| 1176 | + retained_completed: list[str] = [] | |
| 1177 | + reopened_pending: list[str] = [] | |
| 1178 | + for item in dod.completed_items: | |
| 1179 | + text = item.strip().lower() | |
| 1180 | + if item in _SPECIAL_TODO_ITEMS or not _todo_requires_complete_artifact_set(text): | |
| 1181 | + retained_completed.append(item) | |
| 1182 | + continue | |
| 1183 | + reopened_pending.append(item) | |
| 1184 | + | |
| 1185 | + if not reopened_pending: | |
| 1186 | + return | |
| 1187 | + | |
| 1188 | + dod.completed_items = retained_completed | |
| 1189 | + dod.pending_items = list(dict.fromkeys(dod.pending_items + reopened_pending)) | |
| 1190 | + | |
| 1191 | + | |
| 1192 | +def _reopen_directory_content_steps_for_incomplete_artifacts( | |
| 1193 | + dod, | |
| 1194 | + *, | |
| 1195 | + project_root: Path, | |
| 1196 | +) -> None: | |
| 1197 | + planned_targets = collect_planned_artifact_targets( | |
| 1198 | + dod, | |
| 1199 | + project_root=project_root, | |
| 1200 | + max_paths=12, | |
| 1201 | + ) | |
| 1202 | + if not planned_targets: | |
| 1203 | + return | |
| 1204 | + | |
| 1205 | + incomplete_directories = [ | |
| 1206 | + target | |
| 1207 | + for target, expect_directory in planned_targets | |
| 1208 | + if expect_directory | |
| 1209 | + and not planned_artifact_target_satisfied( | |
| 1210 | + dod, | |
| 1211 | + target=target, | |
| 1212 | + expect_directory=True, | |
| 1213 | + project_root=project_root, | |
| 1214 | + ) | |
| 1215 | + ] | |
| 1216 | + if not incomplete_directories: | |
| 1217 | + return | |
| 1218 | + | |
| 1219 | + retained_completed: list[str] = [] | |
| 1220 | + reopened_pending: list[str] = [] | |
| 1221 | + for item in dod.completed_items: | |
| 1222 | + if item in _SPECIAL_TODO_ITEMS: | |
| 1223 | + retained_completed.append(item) | |
| 1224 | + continue | |
| 1225 | + if _todo_describes_directory_content_creation(item, incomplete_directories): | |
| 1226 | + reopened_pending.append(item) | |
| 1227 | + continue | |
| 1228 | + retained_completed.append(item) | |
| 1229 | + | |
| 1230 | + if not reopened_pending: | |
| 1231 | + return | |
| 1232 | + | |
| 1233 | + dod.completed_items = retained_completed | |
| 1234 | + dod.pending_items = list(dict.fromkeys(dod.pending_items + reopened_pending)) | |
| 1235 | + | |
| 1236 | + | |
| 1237 | +def reconcile_aggregate_completion_steps( | |
| 1238 | + dod, | |
| 1239 | + *, | |
| 1240 | + project_root: Path | None, | |
| 1241 | +) -> None: | |
| 1242 | + """Reopen aggregate completion steps when planned artifacts are still missing.""" | |
| 1243 | + | |
| 1244 | + if project_root is None: | |
| 1245 | + return | |
| 1246 | + _reopen_aggregate_completion_steps_for_missing_artifacts( | |
| 1247 | + dod, | |
| 1248 | + project_root=project_root, | |
| 1249 | + ) | |
| 1250 | + | |
| 1251 | + | |
| 883 | 1252 | def _looks_like_search_command(command: str) -> bool: |
| 884 | 1253 | return any(token in command for token in (" ls", "ls ", "find ", "rg ", "grep ", "glob ")) |
| 885 | 1254 | |
src/loader/runtime/workflow_lanes.pymodified@@ -22,7 +22,7 @@ from .clarify_strategy import ( | ||
| 22 | 22 | describe_clarify_stage, |
| 23 | 23 | ) |
| 24 | 24 | from .context import RuntimeContext |
| 25 | -from .dod import DefinitionOfDone, DefinitionOfDoneStore | |
| 25 | +from .dod import DefinitionOfDone, DefinitionOfDoneStore, collect_planned_artifact_targets | |
| 26 | 26 | from .events import AgentEvent, TurnSummary |
| 27 | 27 | from .executor import ToolExecutor |
| 28 | 28 | from .workflow import ( |
@@ -208,6 +208,12 @@ class WorkflowLaneRunner: | ||
| 208 | 208 | refreshed_acceptance_criteria=list(artifacts.acceptance_criteria), |
| 209 | 209 | ) |
| 210 | 210 | artifacts = artifacts.with_acceptance_criteria(preserved_acceptance) |
| 211 | + preserved_file_changes = _preserved_file_change_items( | |
| 212 | + dod, | |
| 213 | + project_root=self.context.project_root, | |
| 214 | + ) | |
| 215 | + if preserved_file_changes: | |
| 216 | + artifacts = artifacts.with_file_changes(preserved_file_changes) | |
| 211 | 217 | artifacts = artifacts.with_progress_context( |
| 212 | 218 | touched_files=list(dod.touched_files), |
| 213 | 219 | completed_items=list(dod.completed_items), |
@@ -309,11 +315,16 @@ class WorkflowLaneRunner: | ||
| 309 | 315 | assert executor is not None |
| 310 | 316 | |
| 311 | 317 | if preserve_existing_scope: |
| 318 | + planned_files = _planned_file_names_for_refresh( | |
| 319 | + dod, | |
| 320 | + project_root=self.context.project_root, | |
| 321 | + ) | |
| 312 | 322 | todos = merge_refreshed_todos_with_existing_scope( |
| 313 | 323 | task_statement, |
| 314 | 324 | existing_pending_items=list(dod.pending_items), |
| 315 | 325 | existing_completed_items=list(dod.completed_items), |
| 316 | 326 | refreshed_steps=list(artifacts.implementation_steps[:8]), |
| 327 | + planned_files=planned_files, | |
| 317 | 328 | ) |
| 318 | 329 | else: |
| 319 | 330 | todos = [ |
@@ -369,7 +380,11 @@ class WorkflowLaneRunner: | ||
| 369 | 380 | if outcome.registry_result is not None: |
| 370 | 381 | new_todos = outcome.registry_result.metadata.get("new_todos", []) |
| 371 | 382 | if isinstance(new_todos, list): |
| 372 | - sync_todos_to_definition_of_done(dod, new_todos) | |
| 383 | + sync_todos_to_definition_of_done( | |
| 384 | + dod, | |
| 385 | + new_todos, | |
| 386 | + project_root=self.context.project_root, | |
| 387 | + ) | |
| 373 | 388 | self.dod_store.save(dod) |
| 374 | 389 | |
| 375 | 390 | async def _run_clarify_round( |
@@ -720,3 +735,37 @@ class WorkflowLaneRunner: | ||
| 720 | 735 | decision_boundaries=list(brief.decision_boundaries), |
| 721 | 736 | likely_touchpoints=list(brief.likely_touchpoints), |
| 722 | 737 | ) |
| 738 | + | |
| 739 | + | |
| 740 | +def _preserved_file_change_items( | |
| 741 | + dod: DefinitionOfDone, | |
| 742 | + *, | |
| 743 | + project_root: Path, | |
| 744 | +) -> list[str]: | |
| 745 | + items: list[str] = [] | |
| 746 | + for target, expect_directory in collect_planned_artifact_targets( | |
| 747 | + dod, | |
| 748 | + project_root=project_root, | |
| 749 | + max_paths=24, | |
| 750 | + ): | |
| 751 | + path_text = str(target) | |
| 752 | + if expect_directory and not path_text.endswith("/"): | |
| 753 | + path_text += "/" | |
| 754 | + items.append(f"`{path_text}`") | |
| 755 | + return items | |
| 756 | + | |
| 757 | + | |
| 758 | +def _planned_file_names_for_refresh( | |
| 759 | + dod: DefinitionOfDone, | |
| 760 | + *, | |
| 761 | + project_root: Path, | |
| 762 | +) -> set[str]: | |
| 763 | + return { | |
| 764 | + target.name.lower() | |
| 765 | + for target, expect_directory in collect_planned_artifact_targets( | |
| 766 | + dod, | |
| 767 | + project_root=project_root, | |
| 768 | + max_paths=24, | |
| 769 | + ) | |
| 770 | + if not expect_directory | |
| 771 | + } | |
src/loader/runtime/workflow_recovery.pymodified@@ -10,7 +10,7 @@ from .artifact_invalidation import ( | ||
| 10 | 10 | WorkflowRecoveryStrategy, |
| 11 | 11 | ) |
| 12 | 12 | from .context import RuntimeContext |
| 13 | -from .dod import DefinitionOfDone | |
| 13 | +from .dod import DefinitionOfDone, collect_planned_artifact_targets | |
| 14 | 14 | from .events import AgentEvent, TurnSummary |
| 15 | 15 | from .executor import ToolExecutor |
| 16 | 16 | from .workflow import ( |
@@ -128,6 +128,10 @@ class WorkflowRecoveryController: | ||
| 128 | 128 | def plan_freshness(self, dod: DefinitionOfDone) -> ArtifactFreshness: |
| 129 | 129 | """Assess whether the persisted workflow artifacts are stale.""" |
| 130 | 130 | |
| 131 | + planned_artifacts_complete = not _first_missing_planned_artifact( | |
| 132 | + dod, | |
| 133 | + project_root=self.context.project_root, | |
| 134 | + ) | |
| 131 | 135 | return self.artifact_invalidation.assess( |
| 132 | 136 | task_statement=dod.task_statement, |
| 133 | 137 | clarify_text=self._artifact_text(dod.clarify_brief), |
@@ -136,6 +140,8 @@ class WorkflowRecoveryController: | ||
| 136 | 140 | acceptance_criteria=list(dod.acceptance_criteria), |
| 137 | 141 | touched_files=list(dod.touched_files), |
| 138 | 142 | last_verification_result=dod.last_verification_result, |
| 143 | + retry_count=dod.retry_count, | |
| 144 | + planned_artifacts_complete=planned_artifacts_complete, | |
| 139 | 145 | ) |
| 140 | 146 | |
| 141 | 147 | async def _run_plan_refresh_reentry( |
@@ -198,6 +204,25 @@ class WorkflowRecoveryController: | ||
| 198 | 204 | ), |
| 199 | 205 | None, |
| 200 | 206 | ) |
| 207 | + missing_artifact = _first_missing_planned_artifact( | |
| 208 | + dod, | |
| 209 | + project_root=self.context.project_root, | |
| 210 | + ) | |
| 211 | + if _should_prioritize_missing_artifact( | |
| 212 | + next_pending=next_pending, | |
| 213 | + missing_artifact=missing_artifact, | |
| 214 | + ): | |
| 215 | + target, expect_directory = missing_artifact | |
| 216 | + label = target.name or str(target) | |
| 217 | + if expect_directory and not label.endswith("/"): | |
| 218 | + label += "/" | |
| 219 | + self.context.queue_steering_message( | |
| 220 | + "Plan refresh preserved the progress already made. " | |
| 221 | + "Reuse the existing files and confirmed facts, then resume by creating " | |
| 222 | + f"`{label}`. Prefer one concrete mutation step for `{target}` before " | |
| 223 | + "any more review or consistency-check work." | |
| 224 | + ) | |
| 225 | + return True | |
| 201 | 226 | if next_pending: |
| 202 | 227 | self.context.queue_steering_message( |
| 203 | 228 | "Plan refresh preserved the progress already made. " |
@@ -350,3 +375,62 @@ class WorkflowRecoveryController: | ||
| 350 | 375 | @staticmethod |
| 351 | 376 | def _recovery_evidence_summary(freshness: ArtifactFreshness) -> list[str]: |
| 352 | 377 | return list(freshness.evidence_summary) |
| 378 | + | |
| 379 | + | |
| 380 | +def _first_missing_planned_artifact( | |
| 381 | + dod: DefinitionOfDone, | |
| 382 | + *, | |
| 383 | + project_root: Path, | |
| 384 | +) -> tuple[Path, bool] | None: | |
| 385 | + for target, expect_directory in collect_planned_artifact_targets( | |
| 386 | + dod, | |
| 387 | + project_root=project_root, | |
| 388 | + max_paths=12, | |
| 389 | + ): | |
| 390 | + exists = target.is_dir() if expect_directory else target.is_file() | |
| 391 | + if not exists: | |
| 392 | + return target, expect_directory | |
| 393 | + return None | |
| 394 | + | |
| 395 | + | |
| 396 | +def _should_prioritize_missing_artifact( | |
| 397 | + *, | |
| 398 | + next_pending: str | None, | |
| 399 | + missing_artifact: tuple[Path, bool] | None, | |
| 400 | +) -> bool: | |
| 401 | + if missing_artifact is None: | |
| 402 | + return False | |
| 403 | + if not next_pending: | |
| 404 | + return True | |
| 405 | + lowered = next_pending.lower() | |
| 406 | + if any( | |
| 407 | + hint in lowered | |
| 408 | + for hint in ( | |
| 409 | + "verify", | |
| 410 | + "validation", | |
| 411 | + "validate", | |
| 412 | + "review", | |
| 413 | + "consistent", | |
| 414 | + "consistently", | |
| 415 | + "linked", | |
| 416 | + "format", | |
| 417 | + "formatted", | |
| 418 | + ) | |
| 419 | + ): | |
| 420 | + return True | |
| 421 | + return not any( | |
| 422 | + hint in lowered | |
| 423 | + for hint in ( | |
| 424 | + "create", | |
| 425 | + "update", | |
| 426 | + "edit", | |
| 427 | + "write", | |
| 428 | + "fix", | |
| 429 | + "modify", | |
| 430 | + "change", | |
| 431 | + "patch", | |
| 432 | + "replace", | |
| 433 | + "correct", | |
| 434 | + "rewrite", | |
| 435 | + ) | |
| 436 | + ) | |
src/loader/tools/workflow_tools.pymodified@@ -117,6 +117,7 @@ class TodoWriteTool(Tool): | ||
| 117 | 117 | |
| 118 | 118 | store_path = self._store_path() |
| 119 | 119 | old_todos = await asyncio.to_thread(self._read_existing_items, store_path) |
| 120 | + items = self._merge_partial_update(old_todos, items) | |
| 120 | 121 | |
| 121 | 122 | all_done = all(item.status == "completed" for item in items) |
| 122 | 123 | persisted_items = [] if all_done else [item.to_dict() for item in items] |
@@ -144,6 +145,29 @@ class TodoWriteTool(Tool): | ||
| 144 | 145 | metadata=payload, |
| 145 | 146 | ) |
| 146 | 147 | |
| 148 | + def _merge_partial_update( | |
| 149 | + self, | |
| 150 | + old_todos: list[dict[str, Any]], | |
| 151 | + items: list[TodoItem], | |
| 152 | + ) -> list[TodoItem]: | |
| 153 | + """Preserve omitted todos when the model sends a narrow status update.""" | |
| 154 | + | |
| 155 | + old_items = [TodoItem.from_dict(item) for item in old_todos if isinstance(item, dict)] | |
| 156 | + if not old_items or len(items) >= len(old_items): | |
| 157 | + return items | |
| 158 | + | |
| 159 | + old_by_content = {item.content: item for item in old_items if item.content} | |
| 160 | + if not old_by_content: | |
| 161 | + return items | |
| 162 | + if not all(item.content in old_by_content for item in items): | |
| 163 | + return items | |
| 164 | + | |
| 165 | + updates = {item.content: item for item in items} | |
| 166 | + merged: list[TodoItem] = [] | |
| 167 | + for old_item in old_items: | |
| 168 | + merged.append(updates.get(old_item.content, old_item)) | |
| 169 | + return merged | |
| 170 | + | |
| 147 | 171 | def _store_path(self) -> Path: |
| 148 | 172 | return active_todo_store_path(self.workspace_root or Path.cwd()) |
| 149 | 173 | |
tests/test_artifact_invalidation.pymodified@@ -92,3 +92,49 @@ def test_artifact_invalidation_treats_path_separator_variants_as_same_touchpoint | ||
| 92 | 92 | assert freshness.stale_plan is False |
| 93 | 93 | assert freshness.stale_brief is False |
| 94 | 94 | assert "touched_files_outside_plan" not in freshness.reason_codes |
| 95 | + | |
| 96 | + | |
def test_artifact_invalidation_allows_supplemental_repair_files_after_failed_verification() -> None:
    """A repair retry may touch helper files (styles.css) without staling the plan."""
    plan_text = (
        "# Implementation Plan\n"
        "- Create index.html.\n"
        "- Create 01-getting-started.html.\n"
        "- Create 02-installation.html.\n"
    )
    verification_plan = (
        "# Verification Plan\n"
        "## Acceptance Criteria\n"
        "- index.html exists.\n"
        "- 01-getting-started.html exists.\n"
        "- 02-installation.html exists.\n"
    )
    criteria = [
        "index.html exists.",
        "01-getting-started.html exists.",
        "02-installation.html exists.",
    ]
    touched = [
        "/tmp/guides/nginx/index.html",
        "/tmp/guides/nginx/chapters/01-getting-started.html",
        "/tmp/guides/nginx/chapters/02-installation.html",
        "/tmp/guides/nginx/styles.css",
    ]

    result = ArtifactInvalidationAssessor().assess(
        task_statement="Build a multi-file nginx guide.",
        clarify_text=None,
        implementation_text=plan_text,
        verification_text=verification_plan,
        acceptance_criteria=criteria,
        touched_files=touched,
        last_verification_result="planned",
        retry_count=1,
        planned_artifacts_complete=True,
    )

    assert result.stale_plan is False
    assert result.stale_brief is False
    assert result.recovery_strategy == WorkflowRecoveryStrategy.NONE.value
    assert "touched_files_outside_plan" not in result.reason_codes
    # The extra styles.css should be recorded as a confirmed touchpoint instead.
    assert any(
        entry.kind == ArtifactEvidenceKind.CONFIRMED_TOUCHPOINT.value
        and "styles.css" in entry.summary
        for entry in result.evidence
    )
tests/test_compaction.pymodified@@ -149,12 +149,12 @@ def test_build_session_summary_preserves_confirmed_facts_and_next_step() -> None | ||
| 149 | 149 | |
| 150 | 150 | assert "Confirmed facts:" in summary |
| 151 | 151 | assert "02-basic-syntax.html -> 02-setup.html" in summary |
| 152 | - assert "02-setup.html = Chapter 2: Setting Up Fortran" in summary | |
| 152 | + assert "02-setup.html = Chapter 2: Setting Up Fortran" not in summary | |
| 153 | 153 | assert "Preferred next step:" in summary |
| 154 | 154 | assert "`~/Loader/guides/fortran/index.html`" in summary |
| 155 | 155 | |
| 156 | 156 | |
| 157 | -def test_summarize_confirmed_facts_extracts_chapter_titles_from_read_results() -> None: | |
| 157 | +def test_summarize_confirmed_facts_ignores_reference_chapter_title_reads() -> None: | |
| 158 | 158 | messages = [ |
| 159 | 159 | Message( |
| 160 | 160 | role=Role.ASSISTANT, |
@@ -186,10 +186,7 @@ def test_summarize_confirmed_facts_extracts_chapter_titles_from_read_results() - | ||
| 186 | 186 | |
| 187 | 187 | confirmed_facts = summarize_confirmed_facts(messages, max_items=2) |
| 188 | 188 | |
| 189 | - assert confirmed_facts is not None | |
| 190 | - assert "Chapter titles confirmed:" in confirmed_facts | |
| 191 | - assert "01-introduction.html = Chapter 1: Introduction to Fortran" in confirmed_facts | |
| 192 | - assert "02-setup.html = Chapter 2: Setting Up Fortran" in confirmed_facts | |
| 189 | + assert confirmed_facts is None | |
| 193 | 190 | |
| 194 | 191 | |
| 195 | 192 | def test_infer_preferred_next_step_uses_confirmed_chapter_pairs() -> None: |
@@ -222,10 +219,7 @@ def test_infer_preferred_next_step_uses_confirmed_chapter_pairs() -> None: | ||
| 222 | 219 | current_task="Update /tmp/fortran/index.html so the chapter list matches the real files.", |
| 223 | 220 | ) |
| 224 | 221 | |
| 225 | - assert next_step == ( | |
| 226 | - "Update `/tmp/fortran/index.html` using the confirmed chapter file/title pairs " | |
| 227 | - "instead of rereading files." | |
| 228 | - ) | |
| 222 | + assert next_step is None | |
| 229 | 223 | |
| 230 | 224 | |
| 231 | 225 | def test_infer_preferred_next_step_uses_latest_verification_gap() -> None: |
@@ -278,13 +272,8 @@ def test_infer_preferred_next_step_uses_latest_verification_gap() -> None: | ||
| 278 | 272 | current_task="Update /tmp/fortran/index.html so the chapter list matches the real files.", |
| 279 | 273 | ) |
| 280 | 274 | |
| 281 | - assert confirmed_facts is not None | |
| 282 | - assert "Verification gaps: missing TOC links chapters/05-control-structures.html" in confirmed_facts | |
| 283 | - assert next_step == ( | |
| 284 | - "Update `/tmp/fortran/index.html` to fix the specific verification failures " | |
| 285 | - "(missing TOC links chapters/05-control-structures.html, " | |
| 286 | - "chapters/06-input-output.html) instead of restarting discovery." | |
| 287 | - ) | |
| 275 | + assert confirmed_facts is None | |
| 276 | + assert next_step is None | |
| 288 | 277 | |
| 289 | 278 | |
| 290 | 279 | def test_compact_session_messages_uses_single_continuation_instruction_block() -> None: |
tests/test_dod.pymodified@@ -6,8 +6,10 @@ from loader.llm.base import ToolCall | ||
| 6 | 6 | from loader.runtime.dod import ( |
| 7 | 7 | DefinitionOfDoneStore, |
| 8 | 8 | VerificationEvidence, |
| 9 | + all_planned_artifacts_exist, | |
| 9 | 10 | begin_new_verification_attempt, |
| 10 | 11 | build_verification_summary, |
| 12 | + collect_planned_artifact_targets, | |
| 11 | 13 | create_definition_of_done, |
| 12 | 14 | derive_verification_commands, |
| 13 | 15 | determine_task_size, |
@@ -166,6 +168,172 @@ def test_derive_verification_commands_avoids_repo_defaults_for_external_artifact | ||
| 166 | 168 | assert commands == [f"test -f {external_index}"] |
| 167 | 169 | |
| 168 | 170 | |
def test_derive_verification_commands_adds_generic_local_html_link_check(
    tmp_path: Path,
) -> None:
    """Touched HTML files should trigger a generic local-link verification step."""
    docs_dir = tmp_path / "docs"
    docs_dir.mkdir()
    index_page = docs_dir / "index.html"
    index_page.write_text('<a href="chapters/01-intro.html">Intro</a>\n')

    dod = create_definition_of_done("Create a small multi-page HTML guide.")
    dod.touched_files = [str(index_page)]

    commands = derive_verification_commands(
        dod,
        project_root=tmp_path,
        task_statement=dod.task_statement,
        supplement_existing=True,
    )

    assert any("Missing local HTML links:" in entry for entry in commands)
| 190 | + | |
| 191 | + | |
def test_derive_verification_commands_adds_planned_artifact_existence_checks(
    tmp_path: Path,
) -> None:
    """Every file/directory listed in the plan should get an existence check."""
    plan_path = tmp_path / "implementation.md"
    plan_lines = [
        "# Implementation Plan",
        "",
        "## File Changes",
        "- `docs/index.html`",
        "- `docs/chapters/01-intro.html`",
        "- `docs/chapters/02-installation.html`",
        "- `docs/chapters/`",
    ]
    plan_path.write_text("\n".join(plan_lines))

    dod = create_definition_of_done("Create a multi-page HTML guide.")
    dod.implementation_plan = str(plan_path)

    commands = derive_verification_commands(
        dod,
        project_root=tmp_path,
        task_statement=dod.task_statement,
        supplement_existing=True,
    )

    planned_files = (
        "docs/index.html",
        "docs/chapters/01-intro.html",
        "docs/chapters/02-installation.html",
    )
    for relative in planned_files:
        assert f"test -f {tmp_path / relative}" in commands
    assert f"test -d {tmp_path / 'docs/chapters'}" in commands
| 224 | + | |
| 225 | + | |
def test_collect_planned_artifact_targets_ignores_prose_path_fragments_in_refreshed_plan(
    tmp_path: Path,
) -> None:
    """Narrative bullets that merely mention file names must not become targets."""
    plan_path = tmp_path / "implementation.md"
    confirmed_index = tmp_path / "external" / "guides" / "nginx" / "index.html"
    confirmed_index.parent.mkdir(parents=True)
    confirmed_index.write_text("<html></html>\n")
    plan_lines = [
        "# Implementation Plan",
        "",
        "## File Changes",
        "- Created main index.html file with proper structure and navigation",
        "- Created the nginx guide directory structure (chapters/)",
        "- Created the first chapter file (01-introduction.html) with appropriate content",
        "",
        "## Confirmed Progress",
        f"- Already touched during execution: `{confirmed_index}`.",
    ]
    plan_path.write_text("\n".join(plan_lines))

    dod = create_definition_of_done("Create an external nginx guide.")
    dod.implementation_plan = str(plan_path)

    targets = collect_planned_artifact_targets(dod, project_root=tmp_path)

    # Prose fragments must not be resolved against the project root.
    assert (tmp_path / "chapters", True) not in targets
    assert (tmp_path / "01-introduction.html", False) not in targets
    assert targets == [(confirmed_index, False)]
| 257 | + | |
| 258 | + | |
def test_all_planned_artifacts_exist_requires_file_contents_for_planned_output_directory(
    tmp_path: Path,
) -> None:
    """A planned output directory only counts as done once it holds real files."""
    plan_path = tmp_path / "implementation.md"
    plan_lines = [
        "# Implementation Plan",
        "",
        "## File Changes",
        f"- `{tmp_path / 'guide' / 'index.html'}`",
        f"- `{tmp_path / 'guide' / 'chapters'}/` (directory for chapter files)",
        "",
        "## Execution Order",
        "- Create chapter files with appropriate content",
    ]
    plan_path.write_text("\n".join(plan_lines))

    guide_dir = tmp_path / "guide"
    chapters_dir = guide_dir / "chapters"
    guide_dir.mkdir()
    chapters_dir.mkdir()
    (guide_dir / "index.html").write_text("<html></html>\n")

    dod = create_definition_of_done("Create a multi-file guide with chapters.")
    dod.implementation_plan = str(plan_path)
    dod.completed_items = ["Create chapter files with appropriate content"]

    # The chapters/ directory exists but is empty, so the plan is unmet.
    assert all_planned_artifacts_exist(dod, project_root=tmp_path) is False

    (chapters_dir / "01-getting-started.html").write_text("<h1>Intro</h1>\n")

    assert all_planned_artifacts_exist(dod, project_root=tmp_path) is True
| 293 | + | |
| 294 | + | |
def test_all_planned_artifacts_exist_stays_false_while_touched_html_links_missing(
    tmp_path: Path,
) -> None:
    """Dangling local links in touched HTML keep the artifact set incomplete."""
    plan_path = tmp_path / "implementation.md"
    plan_lines = [
        "# Implementation Plan",
        "",
        "## File Changes",
        f"- `{tmp_path / 'guide' / 'index.html'}`",
        f"- `{tmp_path / 'guide' / 'chapters'}/` (directory for chapter files)",
        "",
        "## Execution Order",
        "- Create chapter files with appropriate content",
    ]
    plan_path.write_text("\n".join(plan_lines))

    guide_dir = tmp_path / "guide"
    chapters_dir = guide_dir / "chapters"
    guide_dir.mkdir()
    chapters_dir.mkdir()
    index_page = guide_dir / "index.html"
    index_page.write_text(
        '<a href="chapters/01-introduction.html">Intro</a>\n'
        '<a href="chapters/02-setup.html">Setup</a>\n'
    )
    (chapters_dir / "01-introduction.html").write_text("<h1>Intro</h1>\n")

    dod = create_definition_of_done("Create a multi-file guide with chapters.")
    dod.implementation_plan = str(plan_path)
    dod.touched_files = [str(index_page), str(chapters_dir / "01-introduction.html")]
    dod.completed_items = ["Create chapter files with appropriate content"]

    # 02-setup.html is linked from the index but missing on disk.
    assert all_planned_artifacts_exist(dod, project_root=tmp_path) is False

    (chapters_dir / "02-setup.html").write_text("<h1>Setup</h1>\n")

    assert all_planned_artifacts_exist(dod, project_root=tmp_path) is True
| 335 | + | |
| 336 | + | |
| 169 | 337 | def test_build_verification_summary_keeps_concrete_missing_link_details() -> None: |
| 170 | 338 | summary = build_verification_summary( |
| 171 | 339 | [ |
tests/test_finalization.pymodified@@ -10,10 +10,17 @@ import pytest | ||
| 10 | 10 | from loader.llm.base import Message, Role, ToolCall |
| 11 | 11 | from loader.runtime.completion_trace import CompletionTraceEntry |
| 12 | 12 | from loader.runtime.context import RuntimeContext |
| 13 | -from loader.runtime.dod import DefinitionOfDoneStore, create_definition_of_done | |
| 13 | +from loader.runtime.dod import ( | |
| 14 | + DefinitionOfDoneStore, | |
| 15 | + VerificationEvidence, | |
| 16 | + create_definition_of_done, | |
| 17 | +) | |
| 14 | 18 | from loader.runtime.events import TurnSummary |
| 15 | 19 | from loader.runtime.executor import ToolExecutionOutcome, ToolExecutionState |
| 16 | -from loader.runtime.finalization import TurnFinalizer | |
| 20 | +from loader.runtime.finalization import ( | |
| 21 | + TurnFinalizer, | |
| 22 | + _build_verification_repair_guidance, | |
| 23 | +) | |
| 17 | 24 | from loader.runtime.permissions import ( |
| 18 | 25 | PermissionMode, |
| 19 | 26 | build_permission_policy, |
@@ -129,6 +136,25 @@ class RecordingExecutor: | ||
| 129 | 136 | ) |
| 130 | 137 | |
| 131 | 138 | |
class SelectiveRecordingExecutor:
    """Test double that records commands and fails only those matching a substring."""

    def __init__(self, failing_match: str) -> None:
        self.commands: list[str] = []
        self.failing_match = failing_match

    async def execute_tool_call(self, tool_call: ToolCall, **_: object) -> ToolExecutionOutcome:
        command = str(tool_call.arguments.get("command", ""))
        self.commands.append(command)
        if self.failing_match in command:
            # Simulate a verification command that fails.
            return tool_outcome(
                tool_call=tool_call,
                output="failed",
                is_error=True,
                exit_code=1,
                stdout="",
                stderr="failed",
            )
        return tool_outcome(
            tool_call=tool_call,
            output="ok",
            is_error=False,
            exit_code=0,
            stdout="ok",
            stderr="",
        )
| 157 | + | |
| 132 | 158 | def build_context(temp_dir: Path, session: FakeSession) -> RuntimeContext: |
| 133 | 159 | registry = create_default_registry(temp_dir) |
| 134 | 160 | registry.configure_workspace_root(temp_dir) |
@@ -260,6 +286,65 @@ def test_turn_finalizer_finalize_summary_uses_runtime_context( | ||
| 260 | 286 | ] |
| 261 | 287 | |
| 262 | 288 | |
def test_verification_repair_guidance_uses_existing_artifacts_as_source_of_truth(
    temp_dir: Path,
) -> None:
    """Repair guidance should point at the real files, not the stale link names."""
    guide_dir = temp_dir / "guides" / "nginx"
    chapters_dir = guide_dir / "chapters"
    chapters_dir.mkdir(parents=True)
    index_page = guide_dir / "index.html"
    chapter_pages = [
        chapters_dir / "01-getting-started.html",
        chapters_dir / "02-installation.html",
        chapters_dir / "03-first-website.html",
        chapters_dir / "04-configuration-basics.html",
    ]
    for page in [index_page, *chapter_pages]:
        page.write_text("<html></html>\n")

    plan_path = temp_dir / "implementation.md"
    plan_lines = [
        "# Implementation Plan",
        "",
        "## File Changes",
        f"- `{guide_dir}/`",
        f"- `{chapters_dir}/`",
        f"- `{index_page}`",
    ]
    plan_lines.extend(f"- `{page}`" for page in chapter_pages)
    plan_lines.append("")
    plan_path.write_text("\n".join(plan_lines))

    dod = create_definition_of_done("Repair the nginx guide index.")
    dod.implementation_plan = str(plan_path)
    # The failed check references link targets that do not exist on disk.
    dod.evidence = [
        VerificationEvidence(
            command="verify-links",
            passed=False,
            output=(
                "Missing local HTML links:\n"
                f"{index_page}:chapters/01-introduction.html -> {chapters_dir / '01-introduction.html'}\n"
                f"{index_page}:chapters/04-server-blocks.html -> {chapters_dir / '04-server-blocks.html'}\n"
            ),
        )
    ]

    guidance = _build_verification_repair_guidance(
        dod,
        project_root=temp_dir,
    )

    assert "Use the existing artifact files as the source of truth" in guidance
    assert str(chapter_pages[0]) in guidance
    assert str(chapter_pages[1]) in guidance
    assert str(chapter_pages[3]) in guidance
| 346 | + | |
| 347 | + | |
| 263 | 348 | @pytest.mark.asyncio |
| 264 | 349 | async def test_turn_finalizer_records_skipped_verification_observation( |
| 265 | 350 | temp_dir: Path, |
@@ -296,6 +381,8 @@ async def test_turn_finalizer_records_skipped_verification_observation( | ||
| 296 | 381 | "verification was skipped because no mutating work required checks" |
| 297 | 382 | ] |
| 298 | 383 | assert summary.verification_status == "skipped" |
| 384 | + assert "Complete the requested work" not in dod.pending_items | |
| 385 | + assert "Complete the requested work" in dod.completed_items | |
| 299 | 386 | assert session.workflow_timeline[-1].kind == "verify_skip" |
| 300 | 387 | assert [item.status for item in session.workflow_timeline[-1].verification_observations] == [ |
| 301 | 388 | VerificationObservationStatus.SKIPPED.value |
@@ -481,6 +568,76 @@ async def test_turn_finalizer_does_not_append_repo_defaults_to_external_verifica | ||
| 481 | 568 | ] |
| 482 | 569 | |
| 483 | 570 | |
@pytest.mark.asyncio
async def test_turn_finalizer_blocks_completion_when_planned_artifacts_are_missing(
    temp_dir: Path,
) -> None:
    """The gate must re-enter work (without verifying) while planned files are absent."""
    docs_dir = temp_dir / "docs"
    chapters_dir = docs_dir / "chapters"
    chapters_dir.mkdir(parents=True)
    index_page = docs_dir / "index.html"
    intro_page = chapters_dir / "01-intro.html"
    install_page = chapters_dir / "02-installation.html"
    index_page.write_text(
        "\n".join(
            [
                '<a href="chapters/01-intro.html">Intro</a>',
                '<a href="chapters/02-installation.html">Installation</a>',
            ]
        )
    )
    # Only the first chapter exists; 02-installation.html stays missing.
    intro_page.write_text("<h1>Intro</h1>\n")
    plan_path = temp_dir / "implementation.md"
    plan_path.write_text(
        "\n".join(
            [
                "# Implementation Plan",
                "",
                "## File Changes",
                f"- `{index_page}`",
                f"- `{intro_page}`",
                f"- `{install_page}`",
            ]
        )
    )

    session = FakeSession()
    finalizer = TurnFinalizer(
        build_context(temp_dir, session),
        RuntimeTracer(),
        DefinitionOfDoneStore(temp_dir),
        set_workflow_mode=_noop_set_workflow_mode,
    )
    dod = create_definition_of_done("Create a small multi-page HTML guide.")
    dod.mutating_actions.append("write")
    dod.touched_files.extend([str(index_page), str(intro_page)])
    dod.implementation_plan = str(plan_path)
    dod.verification_commands = [f"ls -la {docs_dir}"]
    executor = RecordingExecutor()

    async def capture(event) -> None:
        return None

    result = await finalizer.run_definition_of_done_gate(
        dod=dod,
        candidate_response="Finished the guide.",
        emit=capture,
        summary=TurnSummary(final_response=""),
        executor=executor,  # type: ignore[arg-type]
    )

    assert result.should_continue is True
    assert result.reason_code == "planned_artifacts_missing_continue"
    # No verification command should have run while artifacts were missing.
    assert executor.commands == []
    assert dod.status == "draft"
    assert "Complete the requested work" in dod.pending_items
    assert "Complete the requested work" not in dod.completed_items
    last_message = session.messages[-1].content
    assert last_message.startswith("[PLANNED ARTIFACTS STILL MISSING]")
    assert "`02-installation.html`" in last_message
| 639 | + | |
| 640 | + | |
| 484 | 641 | @pytest.mark.asyncio |
| 485 | 642 | async def test_turn_finalizer_records_missing_verification_observation( |
| 486 | 643 | temp_dir: Path, |
@@ -532,6 +689,146 @@ async def test_turn_finalizer_records_missing_verification_observation( | ||
| 532 | 689 | assert session.messages[-1].content.startswith("[DEFINITION OF DONE CHECK FAILED]") |
| 533 | 690 | |
| 534 | 691 | |
@pytest.mark.asyncio
async def test_turn_finalizer_ignores_unplanned_expansion_pending_items_once_plan_exists(
    temp_dir: Path,
) -> None:
    """Leftover expansion todos must not block acceptance once the plan is satisfied."""
    session = FakeSession()
    finalizer = TurnFinalizer(
        build_context(temp_dir, session),
        RuntimeTracer(),
        DefinitionOfDoneStore(temp_dir),
        set_workflow_mode=_noop_set_workflow_mode,
    )

    guide_dir = temp_dir / "guides" / "nginx"
    chapters_dir = guide_dir / "chapters"
    guide_dir.mkdir(parents=True)
    chapters_dir.mkdir()
    index_page = guide_dir / "index.html"
    chapter_one = chapters_dir / "01-getting-started.html"
    chapter_two = chapters_dir / "02-installation.html"
    index_page.write_text("<html></html>\n")
    chapter_one.write_text("<h1>One</h1>\n")
    chapter_two.write_text("<h1>Two</h1>\n")

    plan_path = temp_dir / "implementation.md"
    plan_path.write_text(
        "\n".join(
            [
                "# Implementation Plan",
                "",
                "## File Changes",
                f"- `{guide_dir}/`",
                f"- `{chapters_dir}/`",
                f"- `{index_page}`",
                f"- `{chapter_one}`",
                f"- `{chapter_two}`",
                "",
            ]
        )
    )

    dod = create_definition_of_done("Create a small multi-page HTML guide.")
    dod.implementation_plan = str(plan_path)
    # The first pending item is an unplanned expansion beyond the plan.
    dod.pending_items = [
        "Create 07-performance-tuning.html",
        "Complete the requested work",
    ]

    async def capture(event) -> None:
        return None

    result = await finalizer.run_definition_of_done_gate(
        dod=dod,
        candidate_response="Finished the guide.",
        emit=capture,
        summary=TurnSummary(final_response=""),
        executor=FakeExecutor([]),  # type: ignore[arg-type]
    )

    assert result.should_continue is False
    assert result.reason_code == "non_mutating_response_accepted"
| 754 | + | |
| 755 | + | |
@pytest.mark.asyncio
async def test_turn_finalizer_verification_failure_reentry_points_at_concrete_repair(
    temp_dir: Path,
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """A failed link check should steer the model straight to the concrete fix."""
    session = FakeSession()
    context = build_context(temp_dir, session)
    queued_messages: list[str] = []
    context.queue_steering_message_callback = queued_messages.append
    finalizer = TurnFinalizer(
        context,
        RuntimeTracer(),
        DefinitionOfDoneStore(temp_dir),
        set_workflow_mode=_noop_set_workflow_mode,
    )
    broken_page = temp_dir / "guides" / "nginx" / "chapters" / "05-advanced-configurations.html"
    broken_page.parent.mkdir(parents=True, exist_ok=True)
    broken_page.write_text('<link rel="stylesheet" href="../styles.css">\n')
    missing_stylesheet = temp_dir / "guides" / "nginx" / "styles.css"
    dod = create_definition_of_done("Create the nginx guide.")
    dod.mutating_actions.append("write")
    dod.touched_files.append(str(broken_page))
    dod.verification_commands = ["python3 verify_links.py"]
    verify_call = ToolCall(
        id="verify-1-1",
        name="bash",
        arguments={"command": dod.verification_commands[0], "cwd": str(temp_dir)},
    )
    failure_output = (
        "Missing local HTML links:\n"
        f"{broken_page}:../styles.css -> {missing_stylesheet}\n"
    )
    failing_executor = FakeExecutor(
        [
            tool_outcome(
                tool_call=verify_call,
                output=failure_output,
                is_error=True,
                exit_code=1,
                stdout=failure_output,
            )
        ]
    )

    async def capture(event) -> None:
        return None

    # Suppress supplemental command derivation so only the failing check runs.
    monkeypatch.setattr(
        "loader.runtime.finalization.derive_verification_commands",
        lambda *args, **kwargs: [],
    )

    result = await finalizer.run_definition_of_done_gate(
        dod=dod,
        candidate_response="The guide is complete.",
        emit=capture,
        summary=TurnSummary(final_response=""),
        executor=failing_executor,  # type: ignore[arg-type]
    )

    assert result.should_continue is True
    assert result.reason_code == "verification_failed_reentry"
    assert queued_messages
    steering = queued_messages[-1]
    assert str(broken_page) in steering
    assert "../styles.css" in steering
    assert str(missing_stylesheet) in steering
    assert "Do not restart discovery or reread unrelated references." in steering
    failure_note = session.messages[-1].content
    assert failure_note.startswith("[DEFINITION OF DONE CHECK FAILED]")
    assert f"Immediate next step: edit `{broken_page}`." in failure_note
    assert f"create `{missing_stylesheet}`" in failure_note
    assert (
        "Do not reread unrelated reference materials or restart discovery"
        in failure_note
    )
| 830 | + | |
| 831 | + | |
| 535 | 832 | @pytest.mark.asyncio |
| 536 | 833 | async def test_turn_finalizer_does_not_reverify_without_new_changes( |
| 537 | 834 | temp_dir: Path, |
tests/test_permissions.pymodified@@ -6,15 +6,20 @@ from pathlib import Path | ||
| 6 | 6 | |
| 7 | 7 | import pytest |
| 8 | 8 | |
| 9 | -from loader.llm.base import ToolCall | |
| 9 | +from loader.llm.base import Message, Role, ToolCall | |
| 10 | +from loader.runtime.dod import DefinitionOfDoneStore, create_definition_of_done | |
| 10 | 11 | from loader.runtime.executor import ToolExecutionState, ToolExecutor |
| 11 | 12 | from loader.runtime.hooks import ( |
| 13 | + ActiveRepairMutationScopeHook, | |
| 14 | + ActiveRepairScopeHook, | |
| 12 | 15 | BaseToolHook, |
| 13 | 16 | FilePathAliasHook, |
| 14 | - HookDecision, | |
| 15 | 17 | HookContext, |
| 18 | + HookDecision, | |
| 16 | 19 | HookManager, |
| 17 | 20 | HookResult, |
| 21 | + LateReferenceDriftHook, | |
| 22 | + RelativePathContextHook, | |
| 18 | 23 | SearchPathAliasHook, |
| 19 | 24 | ) |
| 20 | 25 | from loader.runtime.permissions import ( |
@@ -24,6 +29,7 @@ from loader.runtime.permissions import ( | ||
| 24 | 29 | PermissionRuleSet, |
| 25 | 30 | build_permission_policy, |
| 26 | 31 | ) |
| 32 | +from loader.runtime.safeguard_services import ActionTracker | |
| 27 | 33 | from loader.runtime.tracing import RuntimeTracer |
| 28 | 34 | from loader.tools.base import create_default_registry |
| 29 | 35 | |
@@ -413,3 +419,1007 @@ async def test_search_path_alias_hook_splits_full_glob_pattern( | ||
| 413 | 419 | assert result.updated_arguments is not None |
| 414 | 420 | assert result.updated_arguments["path"] == str(chapters) |
| 415 | 421 | assert result.updated_arguments["pattern"] == "*.html" |
| 422 | + | |
| 423 | + | |
| 424 | +@pytest.mark.asyncio | |
| 425 | +async def test_relative_path_context_hook_remaps_workspace_mirror_of_external_root( | |
| 426 | + temp_dir: Path, | |
| 427 | +) -> None: | |
| 428 | + workspace_root = temp_dir / "workspace" | |
| 429 | + workspace_root.mkdir() | |
| 430 | + external_root = temp_dir / "external-home" | |
| 431 | + external_fortran = external_root / "Loader" / "guides" / "fortran" | |
| 432 | + external_fortran.mkdir(parents=True) | |
| 433 | + (external_fortran / "index.html").write_text("<html></html>\n") | |
| 434 | + (external_root / "Loader" / "guides").mkdir(exist_ok=True) | |
| 435 | + | |
| 436 | + registry = create_default_registry(workspace_root) | |
| 437 | + policy = build_permission_policy( | |
| 438 | + active_mode=PermissionMode.WORKSPACE_WRITE, | |
| 439 | + workspace_root=workspace_root, | |
| 440 | + tool_requirements=registry.get_tool_requirements(), | |
| 441 | + ) | |
| 442 | + action_tracker = ActionTracker() | |
| 443 | + action_tracker.record_tool_call( | |
| 444 | + "read", | |
| 445 | + {"file_path": str(external_fortran / "index.html")}, | |
| 446 | + ) | |
| 447 | + hook = RelativePathContextHook(action_tracker, workspace_root) | |
| 448 | + | |
| 449 | + mirrored_workspace_path = workspace_root / "Loader" / "guides" / "nginx" / "index.html" | |
| 450 | + expected_external_path = external_root / "Loader" / "guides" / "nginx" / "index.html" | |
| 451 | + | |
| 452 | + result = await hook.pre_tool_use( | |
| 453 | + HookContext( | |
| 454 | + tool_call=ToolCall( | |
| 455 | + id="write-1", | |
| 456 | + name="write", | |
| 457 | + arguments={ | |
| 458 | + "file_path": str(mirrored_workspace_path), | |
| 459 | + "content": "<html></html>\n", | |
| 460 | + }, | |
| 461 | + ), | |
| 462 | + tool=registry.get("write"), | |
| 463 | + registry=registry, | |
| 464 | + permission_policy=policy, | |
| 465 | + source="native", | |
| 466 | + ) | |
| 467 | + ) | |
| 468 | + | |
| 469 | + assert result.updated_arguments is not None | |
| 470 | + assert Path(result.updated_arguments["file_path"]).resolve() == expected_external_path.resolve() | |
| 471 | + | |
| 472 | + | |
| 473 | +class FakeSession: | |
| 474 | + def __init__(self, *, active_dod_path: str, messages: list[Message]) -> None: | |
| 475 | + self.active_dod_path = active_dod_path | |
| 476 | + self.messages = messages | |
| 477 | + | |
| 478 | + | |
| 479 | +@pytest.mark.asyncio | |
| 480 | +async def test_active_repair_scope_hook_blocks_reference_reads_while_fixing( | |
| 481 | + temp_dir: Path, | |
| 482 | +) -> None: | |
| 483 | + registry = create_default_registry(temp_dir) | |
| 484 | + policy = build_permission_policy( | |
| 485 | + active_mode=PermissionMode.WORKSPACE_WRITE, | |
| 486 | + workspace_root=temp_dir, | |
| 487 | + tool_requirements=registry.get_tool_requirements(), | |
| 488 | + ) | |
| 489 | + dod_store = DefinitionOfDoneStore(temp_dir) | |
| 490 | + dod = create_definition_of_done("Repair the active artifact set") | |
| 491 | + dod.status = "fixing" | |
| 492 | + dod_path = dod_store.save(dod) | |
| 493 | + repair_target = temp_dir / "guide" / "index.html" | |
| 494 | + session = FakeSession( | |
| 495 | + active_dod_path=str(dod_path), | |
| 496 | + messages=[ | |
| 497 | + Message( | |
| 498 | + role=Role.ASSISTANT, | |
| 499 | + content=( | |
| 500 | + "Repair focus:\n" | |
| 501 | + f"- Fix the broken local reference `chapters/01-introduction.html` in `{repair_target}`.\n" | |
| 502 | + f"- Immediate next step: edit `{repair_target}`.\n" | |
| 503 | + f"- If the broken reference should remain, create `{temp_dir / 'guide' / 'chapters' / '01-introduction.html'}`; otherwise remove or replace `chapters/01-introduction.html`.\n" | |
| 504 | + ), | |
| 505 | + ) | |
| 506 | + ], | |
| 507 | + ) | |
| 508 | + hook = ActiveRepairScopeHook( | |
| 509 | + dod_store=dod_store, | |
| 510 | + project_root=temp_dir, | |
| 511 | + session=session, | |
| 512 | + ) | |
| 513 | + | |
| 514 | + result = await hook.pre_tool_use( | |
| 515 | + HookContext( | |
| 516 | + tool_call=ToolCall( | |
| 517 | + id="read-1", | |
| 518 | + name="read", | |
| 519 | + arguments={"file_path": str(temp_dir / "reference" / "index.html")}, | |
| 520 | + ), | |
| 521 | + tool=registry.get("read"), | |
| 522 | + registry=registry, | |
| 523 | + permission_policy=policy, | |
| 524 | + source="native", | |
| 525 | + ) | |
| 526 | + ) | |
| 527 | + | |
| 528 | + assert result.decision == HookDecision.DENY | |
| 529 | + assert result.terminal_state == "blocked" | |
| 530 | + assert result.message is not None | |
| 531 | + assert "active repair scope" in result.message | |
| 532 | + assert str(repair_target) in result.message | |
| 533 | + | |
| 534 | + | |
| 535 | +@pytest.mark.asyncio | |
| 536 | +async def test_active_repair_scope_hook_allows_reads_inside_active_artifact_set( | |
| 537 | + temp_dir: Path, | |
| 538 | +) -> None: | |
| 539 | + registry = create_default_registry(temp_dir) | |
| 540 | + policy = build_permission_policy( | |
| 541 | + active_mode=PermissionMode.WORKSPACE_WRITE, | |
| 542 | + workspace_root=temp_dir, | |
| 543 | + tool_requirements=registry.get_tool_requirements(), | |
| 544 | + ) | |
| 545 | + dod_store = DefinitionOfDoneStore(temp_dir) | |
| 546 | + dod = create_definition_of_done("Repair the active artifact set") | |
| 547 | + dod.status = "fixing" | |
| 548 | + dod_path = dod_store.save(dod) | |
| 549 | + repair_target = temp_dir / "guide" / "index.html" | |
| 550 | + chapter_path = temp_dir / "guide" / "chapters" / "01-getting-started.html" | |
| 551 | + session = FakeSession( | |
| 552 | + active_dod_path=str(dod_path), | |
| 553 | + messages=[ | |
| 554 | + Message( | |
| 555 | + role=Role.ASSISTANT, | |
| 556 | + content=( | |
| 557 | + "Repair focus:\n" | |
| 558 | + f"- Fix the broken local reference `chapters/01-getting-started.html` in `{repair_target}`.\n" | |
| 559 | + f"- Fix the broken local reference `../styles.css` in `{chapter_path}`.\n" | |
| 560 | + f"- Immediate next step: edit `{repair_target}`.\n" | |
| 561 | + f"- If the broken reference should remain, create `{chapter_path}`; otherwise remove or replace `chapters/01-getting-started.html`.\n" | |
| 562 | + ), | |
| 563 | + ) | |
| 564 | + ], | |
| 565 | + ) | |
| 566 | + hook = ActiveRepairScopeHook( | |
| 567 | + dod_store=dod_store, | |
| 568 | + project_root=temp_dir, | |
| 569 | + session=session, | |
| 570 | + ) | |
| 571 | + | |
| 572 | + result = await hook.pre_tool_use( | |
| 573 | + HookContext( | |
| 574 | + tool_call=ToolCall( | |
| 575 | + id="read-1", | |
| 576 | + name="read", | |
| 577 | + arguments={"file_path": str(chapter_path)}, | |
| 578 | + ), | |
| 579 | + tool=registry.get("read"), | |
| 580 | + registry=registry, | |
| 581 | + permission_policy=policy, | |
| 582 | + source="native", | |
| 583 | + ) | |
| 584 | + ) | |
| 585 | + | |
| 586 | + assert result.decision == HookDecision.CONTINUE | |
| 587 | + | |
| 588 | + | |
| 589 | +@pytest.mark.asyncio | |
| 590 | +async def test_active_repair_scope_hook_allows_verification_source_outside_repair_target( | |
| 591 | + temp_dir: Path, | |
| 592 | +) -> None: | |
| 593 | + registry = create_default_registry(temp_dir) | |
| 594 | + policy = build_permission_policy( | |
| 595 | + active_mode=PermissionMode.WORKSPACE_WRITE, | |
| 596 | + workspace_root=temp_dir, | |
| 597 | + tool_requirements=registry.get_tool_requirements(), | |
| 598 | + ) | |
| 599 | + dod_store = DefinitionOfDoneStore(temp_dir) | |
| 600 | + dod = create_definition_of_done("Repair the active artifact set") | |
| 601 | + dod.status = "in_progress" | |
| 602 | + dod_path = dod_store.save(dod) | |
| 603 | + repair_target = temp_dir / "guide" / "chapters" / "06-troubleshooting.html" | |
| 604 | + session = FakeSession( | |
| 605 | + active_dod_path=str(dod_path), | |
| 606 | + messages=[ | |
| 607 | + Message( | |
| 608 | + role=Role.ASSISTANT, | |
| 609 | + content=( | |
| 610 | + "Repair focus:\n" | |
| 611 | + f"- Fix the broken local reference `01-introduction.html` in `{repair_target}`.\n" | |
| 612 | + f"- Immediate next step: edit `{repair_target}`.\n" | |
| 613 | + "- Do not reread unrelated reference materials or restart discovery while this concrete repair target is unresolved.\n" | |
| 614 | + ), | |
| 615 | + ) | |
| 616 | + ], | |
| 617 | + ) | |
| 618 | + hook = ActiveRepairScopeHook( | |
| 619 | + dod_store=dod_store, | |
| 620 | + project_root=temp_dir, | |
| 621 | + session=session, | |
| 622 | + ) | |
| 623 | + | |
| 624 | + result = await hook.pre_tool_use( | |
| 625 | + HookContext( | |
| 626 | + tool_call=ToolCall( | |
| 627 | + id="verify-1", | |
| 628 | + name="read", | |
| 629 | + arguments={"file_path": str(temp_dir / "guide" / "index.html")}, | |
| 630 | + ), | |
| 631 | + tool=registry.get("read"), | |
| 632 | + registry=registry, | |
| 633 | + permission_policy=policy, | |
| 634 | + source="verification", | |
| 635 | + ) | |
| 636 | + ) | |
| 637 | + | |
| 638 | + assert result.decision == HookDecision.CONTINUE | |
| 639 | + | |
| 640 | + | |
| 641 | +@pytest.mark.asyncio | |
| 642 | +async def test_active_repair_scope_hook_blocks_local_rereads_outside_concrete_repair_files( | |
| 643 | + temp_dir: Path, | |
| 644 | +) -> None: | |
| 645 | + registry = create_default_registry(temp_dir) | |
| 646 | + policy = build_permission_policy( | |
| 647 | + active_mode=PermissionMode.WORKSPACE_WRITE, | |
| 648 | + workspace_root=temp_dir, | |
| 649 | + tool_requirements=registry.get_tool_requirements(), | |
| 650 | + ) | |
| 651 | + dod_store = DefinitionOfDoneStore(temp_dir) | |
| 652 | + dod = create_definition_of_done("Repair the active artifact set") | |
| 653 | + dod.status = "in_progress" | |
| 654 | + dod_path = dod_store.save(dod) | |
| 655 | + repair_target = temp_dir / "guide" / "chapters" / "05-advanced-configurations.html" | |
| 656 | + stylesheet = temp_dir / "guide" / "styles.css" | |
| 657 | + other_chapter = temp_dir / "guide" / "chapters" / "01-getting-started.html" | |
| 658 | + session = FakeSession( | |
| 659 | + active_dod_path=str(dod_path), | |
| 660 | + messages=[ | |
| 661 | + Message( | |
| 662 | + role=Role.ASSISTANT, | |
| 663 | + content=( | |
| 664 | + "Repair focus:\n" | |
| 665 | + f"- Fix the broken local reference `../styles.css` in `{repair_target}`.\n" | |
| 666 | + f"- Fix the broken local reference `../styles.css` in `{temp_dir / 'guide' / 'chapters' / '06-troubleshooting.html'}`.\n" | |
| 667 | + f"- Immediate next step: edit `{repair_target}`.\n" | |
| 668 | + f"- If the broken reference should remain, create `{stylesheet}`; otherwise remove or replace `../styles.css`.\n" | |
| 669 | + "- Do not reread unrelated reference materials or restart discovery while this concrete repair target is unresolved.\n" | |
| 670 | + ), | |
| 671 | + ) | |
| 672 | + ], | |
| 673 | + ) | |
| 674 | + hook = ActiveRepairScopeHook( | |
| 675 | + dod_store=dod_store, | |
| 676 | + project_root=temp_dir, | |
| 677 | + session=session, | |
| 678 | + ) | |
| 679 | + | |
| 680 | + result = await hook.pre_tool_use( | |
| 681 | + HookContext( | |
| 682 | + tool_call=ToolCall( | |
| 683 | + id="read-1", | |
| 684 | + name="read", | |
| 685 | + arguments={"file_path": str(other_chapter)}, | |
| 686 | + ), | |
| 687 | + tool=registry.get("read"), | |
| 688 | + registry=registry, | |
| 689 | + permission_policy=policy, | |
| 690 | + source="native", | |
| 691 | + ) | |
| 692 | + ) | |
| 693 | + | |
| 694 | + assert result.decision == HookDecision.DENY | |
| 695 | + assert result.terminal_state == "blocked" | |
| 696 | + assert result.message is not None | |
| 697 | + assert "active repair scope" in result.message | |
| 698 | + assert str(repair_target) in result.message | |
| 699 | + assert str(stylesheet) in result.message | |
| 700 | + | |
| 701 | + | |
| 702 | +@pytest.mark.asyncio | |
| 703 | +async def test_active_repair_scope_hook_allows_scoped_glob_within_active_artifact_roots( | |
| 704 | + temp_dir: Path, | |
| 705 | +) -> None: | |
| 706 | + registry = create_default_registry(temp_dir) | |
| 707 | + policy = build_permission_policy( | |
| 708 | + active_mode=PermissionMode.WORKSPACE_WRITE, | |
| 709 | + workspace_root=temp_dir, | |
| 710 | + tool_requirements=registry.get_tool_requirements(), | |
| 711 | + ) | |
| 712 | + dod_store = DefinitionOfDoneStore(temp_dir) | |
| 713 | + dod = create_definition_of_done("Repair the active artifact set") | |
| 714 | + dod.status = "in_progress" | |
| 715 | + dod_path = dod_store.save(dod) | |
| 716 | + repair_target = temp_dir / "guide" / "index.html" | |
| 717 | + guide_root = temp_dir / "guide" | |
| 718 | + session = FakeSession( | |
| 719 | + active_dod_path=str(dod_path), | |
| 720 | + messages=[ | |
| 721 | + Message( | |
| 722 | + role=Role.ASSISTANT, | |
| 723 | + content=( | |
| 724 | + "Repair focus:\n" | |
| 725 | + f"- Fix the broken local reference `chapters/troubleshooting.html` in `{repair_target}`.\n" | |
| 726 | + f"- Immediate next step: edit `{repair_target}`.\n" | |
| 727 | + f"- If the broken reference should remain, create `{guide_root / 'chapters' / 'troubleshooting.html'}`; otherwise remove or replace `chapters/troubleshooting.html`.\n" | |
| 728 | + "- Use the existing artifact files as the source of truth while repairing this file: " | |
| 729 | + f"`{guide_root / 'chapters' / 'introduction.html'}`, `{guide_root / 'chapters' / 'installation.html'}`, `{guide_root / 'chapters' / 'configuration.html'}`.\n" | |
| 730 | + "- Do not reread unrelated reference materials or restart discovery while this concrete repair target is unresolved.\n" | |
| 731 | + ), | |
| 732 | + ) | |
| 733 | + ], | |
| 734 | + ) | |
| 735 | + hook = ActiveRepairScopeHook( | |
| 736 | + dod_store=dod_store, | |
| 737 | + project_root=temp_dir, | |
| 738 | + session=session, | |
| 739 | + ) | |
| 740 | + | |
| 741 | + result = await hook.pre_tool_use( | |
| 742 | + HookContext( | |
| 743 | + tool_call=ToolCall( | |
| 744 | + id="glob-1", | |
| 745 | + name="glob", | |
| 746 | + arguments={ | |
| 747 | + "path": str(temp_dir), | |
| 748 | + "pattern": "**/guide/chapters/*.html", | |
| 749 | + }, | |
| 750 | + ), | |
| 751 | + tool=registry.get("glob"), | |
| 752 | + registry=registry, | |
| 753 | + permission_policy=policy, | |
| 754 | + source="native", | |
| 755 | + ) | |
| 756 | + ) | |
| 757 | + | |
| 758 | + assert result.decision == HookDecision.CONTINUE | |
| 759 | + | |
| 760 | + | |
| 761 | +@pytest.mark.asyncio | |
| 762 | +async def test_active_repair_scope_hook_allows_declared_missing_sibling_reads( | |
| 763 | + temp_dir: Path, | |
| 764 | +) -> None: | |
| 765 | + registry = create_default_registry(temp_dir) | |
| 766 | + policy = build_permission_policy( | |
| 767 | + active_mode=PermissionMode.WORKSPACE_WRITE, | |
| 768 | + workspace_root=temp_dir, | |
| 769 | + tool_requirements=registry.get_tool_requirements(), | |
| 770 | + ) | |
| 771 | + dod_store = DefinitionOfDoneStore(temp_dir) | |
| 772 | + dod = create_definition_of_done("Repair the active artifact set") | |
| 773 | + dod.status = "in_progress" | |
| 774 | + dod_path = dod_store.save(dod) | |
| 775 | + guide_root = temp_dir / "guide" | |
| 776 | + chapters = guide_root / "chapters" | |
| 777 | + chapters.mkdir(parents=True) | |
| 778 | + repair_target = guide_root / "index.html" | |
| 779 | + existing_chapter = chapters / "overview.html" | |
| 780 | + next_chapter = chapters / "installation.html" | |
| 781 | + repair_target.write_text( | |
| 782 | + "\n".join( | |
| 783 | + [ | |
| 784 | + "<html>", | |
| 785 | + '<a href="chapters/overview.html">Overview</a>', | |
| 786 | + '<a href="chapters/installation.html">Installation</a>', | |
| 787 | + "</html>", | |
| 788 | + ] | |
| 789 | + ) | |
| 790 | + + "\n" | |
| 791 | + ) | |
| 792 | + existing_chapter.write_text("<h1>Overview</h1>\n") | |
| 793 | + | |
| 794 | + session = FakeSession( | |
| 795 | + active_dod_path=str(dod_path), | |
| 796 | + messages=[ | |
| 797 | + Message( | |
| 798 | + role=Role.ASSISTANT, | |
| 799 | + content=( | |
| 800 | + "Repair focus:\n" | |
| 801 | + f"- Fix the broken local reference `chapters/overview.html` in `{repair_target}`.\n" | |
| 802 | + f"- Immediate next step: edit `{repair_target}`.\n" | |
| 803 | + f"- If the broken reference should remain, create `{existing_chapter}`; otherwise remove or replace `chapters/overview.html`.\n" | |
| 804 | + "- Use the existing artifact files as the source of truth while repairing this file: " | |
| 805 | + f"`{existing_chapter}`.\n" | |
| 806 | + "- Do not reread unrelated reference materials or restart discovery while this concrete repair target is unresolved.\n" | |
| 807 | + ), | |
| 808 | + ) | |
| 809 | + ], | |
| 810 | + ) | |
| 811 | + hook = ActiveRepairScopeHook( | |
| 812 | + dod_store=dod_store, | |
| 813 | + project_root=temp_dir, | |
| 814 | + session=session, | |
| 815 | + ) | |
| 816 | + | |
| 817 | + result = await hook.pre_tool_use( | |
| 818 | + HookContext( | |
| 819 | + tool_call=ToolCall( | |
| 820 | + id="read-allowed-sibling", | |
| 821 | + name="read", | |
| 822 | + arguments={"file_path": str(next_chapter)}, | |
| 823 | + ), | |
| 824 | + tool=registry.get("read"), | |
| 825 | + registry=registry, | |
| 826 | + permission_policy=policy, | |
| 827 | + source="native", | |
| 828 | + ) | |
| 829 | + ) | |
| 830 | + | |
| 831 | + assert result.decision == HookDecision.CONTINUE | |
| 832 | + | |
| 833 | + | |
| 834 | +@pytest.mark.asyncio | |
| 835 | +async def test_active_repair_scope_hook_blocks_reference_reads_during_in_progress_repair( | |
| 836 | + temp_dir: Path, | |
| 837 | +) -> None: | |
| 838 | + registry = create_default_registry(temp_dir) | |
| 839 | + policy = build_permission_policy( | |
| 840 | + active_mode=PermissionMode.WORKSPACE_WRITE, | |
| 841 | + workspace_root=temp_dir, | |
| 842 | + tool_requirements=registry.get_tool_requirements(), | |
| 843 | + ) | |
| 844 | + dod_store = DefinitionOfDoneStore(temp_dir) | |
| 845 | + dod = create_definition_of_done("Repair the active artifact set") | |
| 846 | + dod.status = "in_progress" | |
| 847 | + dod_path = dod_store.save(dod) | |
| 848 | + repair_target = temp_dir / "guide" / "chapters" / "05-advanced-configurations.html" | |
| 849 | + session = FakeSession( | |
| 850 | + active_dod_path=str(dod_path), | |
| 851 | + messages=[ | |
| 852 | + Message( | |
| 853 | + role=Role.ASSISTANT, | |
| 854 | + content=( | |
| 855 | + "Repair focus:\n" | |
| 856 | + f"- Fix the broken local reference `../styles.css` in `{repair_target}`.\n" | |
| 857 | + f"- Immediate next step: edit `{repair_target}`.\n" | |
| 858 | + f"- If the broken reference should remain, create `{temp_dir / 'guide' / 'styles.css'}`; otherwise remove or replace `../styles.css`.\n" | |
| 859 | + "- Do not reread unrelated reference materials or restart discovery while this concrete repair target is unresolved.\n" | |
| 860 | + ), | |
| 861 | + ) | |
| 862 | + ], | |
| 863 | + ) | |
| 864 | + hook = ActiveRepairScopeHook( | |
| 865 | + dod_store=dod_store, | |
| 866 | + project_root=temp_dir, | |
| 867 | + session=session, | |
| 868 | + ) | |
| 869 | + | |
| 870 | + result = await hook.pre_tool_use( | |
| 871 | + HookContext( | |
| 872 | + tool_call=ToolCall( | |
| 873 | + id="read-1", | |
| 874 | + name="read", | |
| 875 | + arguments={"file_path": str(temp_dir / "reference" / "index.html")}, | |
| 876 | + ), | |
| 877 | + tool=registry.get("read"), | |
| 878 | + registry=registry, | |
| 879 | + permission_policy=policy, | |
| 880 | + source="native", | |
| 881 | + ) | |
| 882 | + ) | |
| 883 | + | |
| 884 | + assert result.decision == HookDecision.DENY | |
| 885 | + assert result.terminal_state == "blocked" | |
| 886 | + assert result.message is not None | |
| 887 | + assert "active repair scope" in result.message | |
| 888 | + | |
| 889 | + | |
| 890 | +@pytest.mark.asyncio | |
| 891 | +async def test_active_repair_mutation_scope_hook_blocks_writes_outside_named_repair_files( | |
| 892 | + temp_dir: Path, | |
| 893 | +) -> None: | |
| 894 | + registry = create_default_registry(temp_dir) | |
| 895 | + policy = build_permission_policy( | |
| 896 | + active_mode=PermissionMode.WORKSPACE_WRITE, | |
| 897 | + workspace_root=temp_dir, | |
| 898 | + tool_requirements=registry.get_tool_requirements(), | |
| 899 | + ) | |
| 900 | + dod_store = DefinitionOfDoneStore(temp_dir) | |
| 901 | + dod = create_definition_of_done("Repair the active artifact set") | |
| 902 | + dod.status = "in_progress" | |
| 903 | + dod_path = dod_store.save(dod) | |
| 904 | + repair_target = temp_dir / "guide" / "chapters" / "05-advanced-configurations.html" | |
| 905 | + chapter_path = temp_dir / "guide" / "chapters" / "01-getting-started.html" | |
| 906 | + session = FakeSession( | |
| 907 | + active_dod_path=str(dod_path), | |
| 908 | + messages=[ | |
| 909 | + Message( | |
| 910 | + role=Role.ASSISTANT, | |
| 911 | + content=( | |
| 912 | + "Repair focus:\n" | |
| 913 | + f"- Fix the broken local reference `../styles.css` in `{repair_target}`.\n" | |
| 914 | + f"- Immediate next step: edit `{repair_target}`.\n" | |
| 915 | + f"- If the broken reference should remain, create `{temp_dir / 'guide' / 'styles.css'}`; otherwise remove or replace `../styles.css`.\n" | |
| 916 | + "- Do not reread unrelated reference materials or restart discovery while this concrete repair target is unresolved.\n" | |
| 917 | + ), | |
| 918 | + ) | |
| 919 | + ], | |
| 920 | + ) | |
| 921 | + hook = ActiveRepairMutationScopeHook( | |
| 922 | + dod_store=dod_store, | |
| 923 | + project_root=temp_dir, | |
| 924 | + session=session, | |
| 925 | + ) | |
| 926 | + | |
| 927 | + result = await hook.pre_tool_use( | |
| 928 | + HookContext( | |
| 929 | + tool_call=ToolCall( | |
| 930 | + id="edit-1", | |
| 931 | + name="edit", | |
| 932 | + arguments={"file_path": str(chapter_path), "old_string": "old", "new_string": "new"}, | |
| 933 | + ), | |
| 934 | + tool=registry.get("edit"), | |
| 935 | + registry=registry, | |
| 936 | + permission_policy=policy, | |
| 937 | + source="native", | |
| 938 | + ) | |
| 939 | + ) | |
| 940 | + | |
| 941 | + assert result.decision == HookDecision.DENY | |
| 942 | + assert result.terminal_state == "blocked" | |
| 943 | + assert result.message is not None | |
| 944 | + assert "active repair mutation scope" in result.message | |
| 945 | + assert str(repair_target) in result.message | |
| 946 | + | |
| 947 | + | |
| 948 | +@pytest.mark.asyncio | |
| 949 | +async def test_active_repair_mutation_scope_hook_allows_expected_repair_file_writes( | |
| 950 | + temp_dir: Path, | |
| 951 | +) -> None: | |
| 952 | + registry = create_default_registry(temp_dir) | |
| 953 | + policy = build_permission_policy( | |
| 954 | + active_mode=PermissionMode.WORKSPACE_WRITE, | |
| 955 | + workspace_root=temp_dir, | |
| 956 | + tool_requirements=registry.get_tool_requirements(), | |
| 957 | + ) | |
| 958 | + dod_store = DefinitionOfDoneStore(temp_dir) | |
| 959 | + dod = create_definition_of_done("Repair the active artifact set") | |
| 960 | + dod.status = "in_progress" | |
| 961 | + dod_path = dod_store.save(dod) | |
| 962 | + repair_target = temp_dir / "guide" / "chapters" / "05-advanced-configurations.html" | |
| 963 | + stylesheet = temp_dir / "guide" / "styles.css" | |
| 964 | + session = FakeSession( | |
| 965 | + active_dod_path=str(dod_path), | |
| 966 | + messages=[ | |
| 967 | + Message( | |
| 968 | + role=Role.ASSISTANT, | |
| 969 | + content=( | |
| 970 | + "Repair focus:\n" | |
| 971 | + f"- Fix the broken local reference `../styles.css` in `{repair_target}`.\n" | |
| 972 | + f"- Immediate next step: edit `{repair_target}`.\n" | |
| 973 | + f"- If the broken reference should remain, create `{stylesheet}`; otherwise remove or replace `../styles.css`.\n" | |
| 974 | + "- Do not reread unrelated reference materials or restart discovery while this concrete repair target is unresolved.\n" | |
| 975 | + ), | |
| 976 | + ) | |
| 977 | + ], | |
| 978 | + ) | |
| 979 | + hook = ActiveRepairMutationScopeHook( | |
| 980 | + dod_store=dod_store, | |
| 981 | + project_root=temp_dir, | |
| 982 | + session=session, | |
| 983 | + ) | |
| 984 | + | |
| 985 | + result = await hook.pre_tool_use( | |
| 986 | + HookContext( | |
| 987 | + tool_call=ToolCall( | |
| 988 | + id="write-1", | |
| 989 | + name="write", | |
| 990 | + arguments={"file_path": str(stylesheet), "content": "body { color: #222; }\n"}, | |
| 991 | + ), | |
| 992 | + tool=registry.get("write"), | |
| 993 | + registry=registry, | |
| 994 | + permission_policy=policy, | |
| 995 | + source="native", | |
| 996 | + ) | |
| 997 | + ) | |
| 998 | + | |
| 999 | + assert result.decision == HookDecision.CONTINUE | |
| 1000 | + | |
| 1001 | + | |
| 1002 | +@pytest.mark.asyncio | |
| 1003 | +async def test_active_repair_mutation_scope_hook_allows_declared_missing_sibling_outputs( | |
| 1004 | + temp_dir: Path, | |
| 1005 | +) -> None: | |
| 1006 | + registry = create_default_registry(temp_dir) | |
| 1007 | + policy = build_permission_policy( | |
| 1008 | + active_mode=PermissionMode.WORKSPACE_WRITE, | |
| 1009 | + workspace_root=temp_dir, | |
| 1010 | + tool_requirements=registry.get_tool_requirements(), | |
| 1011 | + ) | |
| 1012 | + dod_store = DefinitionOfDoneStore(temp_dir) | |
| 1013 | + dod = create_definition_of_done("Repair the active artifact set") | |
| 1014 | + dod.status = "in_progress" | |
| 1015 | + dod_path = dod_store.save(dod) | |
| 1016 | + guide_root = temp_dir / "guide" | |
| 1017 | + chapters = guide_root / "chapters" | |
| 1018 | + chapters.mkdir(parents=True) | |
| 1019 | + repair_target = guide_root / "index.html" | |
| 1020 | + existing_chapter = chapters / "01-introduction.html" | |
| 1021 | + next_chapter = chapters / "02-installation.html" | |
| 1022 | + repair_target.write_text( | |
| 1023 | + "\n".join( | |
| 1024 | + [ | |
| 1025 | + "<html>", | |
| 1026 | + '<a href="chapters/01-introduction.html">Introduction</a>', | |
| 1027 | + '<a href="chapters/02-installation.html">Installation</a>', | |
| 1028 | + "</html>", | |
| 1029 | + ] | |
| 1030 | + ) | |
| 1031 | + + "\n" | |
| 1032 | + ) | |
| 1033 | + existing_chapter.write_text("<h1>Introduction</h1>\n") | |
| 1034 | + | |
| 1035 | + session = FakeSession( | |
| 1036 | + active_dod_path=str(dod_path), | |
| 1037 | + messages=[ | |
| 1038 | + Message( | |
| 1039 | + role=Role.ASSISTANT, | |
| 1040 | + content=( | |
| 1041 | + "Repair focus:\n" | |
| 1042 | + f"- Fix the broken local reference `chapters/01-introduction.html` in `{repair_target}`.\n" | |
| 1043 | + f"- Immediate next step: edit `{repair_target}`.\n" | |
| 1044 | + f"- If the broken reference should remain, create `{existing_chapter}`; otherwise remove or replace `chapters/01-introduction.html`.\n" | |
| 1045 | + "- Use the existing artifact files as the source of truth while repairing this file: " | |
| 1046 | + f"`{existing_chapter}`.\n" | |
| 1047 | + "- Do not reread unrelated reference materials or restart discovery while this concrete repair target is unresolved.\n" | |
| 1048 | + ), | |
| 1049 | + ) | |
| 1050 | + ], | |
| 1051 | + ) | |
| 1052 | + hook = ActiveRepairMutationScopeHook( | |
| 1053 | + dod_store=dod_store, | |
| 1054 | + project_root=temp_dir, | |
| 1055 | + session=session, | |
| 1056 | + ) | |
| 1057 | + | |
| 1058 | + result = await hook.pre_tool_use( | |
| 1059 | + HookContext( | |
| 1060 | + tool_call=ToolCall( | |
| 1061 | + id="write-2", | |
| 1062 | + name="write", | |
| 1063 | + arguments={"file_path": str(next_chapter), "content": "<h1>Installation</h1>\n"}, | |
| 1064 | + ), | |
| 1065 | + tool=registry.get("write"), | |
| 1066 | + registry=registry, | |
| 1067 | + permission_policy=policy, | |
| 1068 | + source="native", | |
| 1069 | + ) | |
| 1070 | + ) | |
| 1071 | + | |
| 1072 | + assert result.decision == HookDecision.CONTINUE | |
| 1073 | + | |
| 1074 | + | |
| 1075 | +@pytest.mark.asyncio | |
| 1076 | +async def test_active_repair_mutation_scope_hook_blocks_broad_mutating_bash( | |
| 1077 | + temp_dir: Path, | |
| 1078 | +) -> None: | |
| 1079 | + registry = create_default_registry(temp_dir) | |
| 1080 | + policy = build_permission_policy( | |
| 1081 | + active_mode=PermissionMode.WORKSPACE_WRITE, | |
| 1082 | + workspace_root=temp_dir, | |
| 1083 | + tool_requirements=registry.get_tool_requirements(), | |
| 1084 | + ) | |
| 1085 | + dod_store = DefinitionOfDoneStore(temp_dir) | |
| 1086 | + dod = create_definition_of_done("Repair the active artifact set") | |
| 1087 | + dod.status = "in_progress" | |
| 1088 | + dod_path = dod_store.save(dod) | |
| 1089 | + repair_target = temp_dir / "guide" / "chapters" / "05-advanced-configurations.html" | |
| 1090 | + session = FakeSession( | |
| 1091 | + active_dod_path=str(dod_path), | |
| 1092 | + messages=[ | |
| 1093 | + Message( | |
| 1094 | + role=Role.ASSISTANT, | |
| 1095 | + content=( | |
| 1096 | + "Repair focus:\n" | |
| 1097 | + f"- Fix the broken local reference `../styles.css` in `{repair_target}`.\n" | |
| 1098 | + f"- Immediate next step: edit `{repair_target}`.\n" | |
| 1099 | + f"- If the broken reference should remain, create `{temp_dir / 'guide' / 'styles.css'}`; otherwise remove or replace `../styles.css`.\n" | |
| 1100 | + "- Do not reread unrelated reference materials or restart discovery while this concrete repair target is unresolved.\n" | |
| 1101 | + ), | |
| 1102 | + ) | |
| 1103 | + ], | |
| 1104 | + ) | |
| 1105 | + hook = ActiveRepairMutationScopeHook( | |
| 1106 | + dod_store=dod_store, | |
| 1107 | + project_root=temp_dir, | |
| 1108 | + session=session, | |
| 1109 | + ) | |
| 1110 | + | |
| 1111 | + result = await hook.pre_tool_use( | |
| 1112 | + HookContext( | |
| 1113 | + tool_call=ToolCall( | |
| 1114 | + id="bash-1", | |
| 1115 | + name="bash", | |
| 1116 | + arguments={"command": f"mkdir -p {temp_dir / 'guide' / 'assets'}"}, | |
| 1117 | + ), | |
| 1118 | + tool=registry.get("bash"), | |
| 1119 | + registry=registry, | |
| 1120 | + permission_policy=policy, | |
| 1121 | + source="native", | |
| 1122 | + ) | |
| 1123 | + ) | |
| 1124 | + | |
| 1125 | + assert result.decision == HookDecision.DENY | |
| 1126 | + assert result.terminal_state == "blocked" | |
| 1127 | + assert result.message is not None | |
| 1128 | + assert "active repair mutation scope" in result.message | |
| 1129 | + assert str(repair_target) in result.message | |
| 1130 | + | |
| 1131 | + | |
| 1132 | +@pytest.mark.asyncio | |
| 1133 | +async def test_late_reference_drift_hook_blocks_out_of_scope_reference_reads( | |
| 1134 | + temp_dir: Path, | |
| 1135 | +) -> None: | |
| 1136 | + registry = create_default_registry(temp_dir) | |
| 1137 | + policy = build_permission_policy( | |
| 1138 | + active_mode=PermissionMode.WORKSPACE_WRITE, | |
| 1139 | + workspace_root=temp_dir, | |
| 1140 | + tool_requirements=registry.get_tool_requirements(), | |
| 1141 | + ) | |
| 1142 | + dod_store = DefinitionOfDoneStore(temp_dir) | |
| 1143 | + dod = create_definition_of_done("Create a multi-file guide from a reference") | |
| 1144 | + dod.status = "in_progress" | |
| 1145 | + plan_path = temp_dir / "implementation.md" | |
| 1146 | + plan_path.write_text( | |
| 1147 | + "# File Changes\n" | |
| 1148 | + "- `guide/index.html`\n" | |
| 1149 | + "- `guide/chapters/01-getting-started.html`\n" | |
| 1150 | + "- `guide/chapters/02-installation.html`\n" | |
| 1151 | + "- `guide/chapters/03-first-website.html`\n" | |
| 1152 | + ) | |
| 1153 | + dod.implementation_plan = str(plan_path) | |
| 1154 | + dod_path = dod_store.save(dod) | |
| 1155 | + guide_dir = temp_dir / "guide" / "chapters" | |
| 1156 | + guide_dir.mkdir(parents=True, exist_ok=True) | |
| 1157 | + (temp_dir / "guide" / "index.html").write_text("index") | |
| 1158 | + (guide_dir / "01-getting-started.html").write_text("one") | |
| 1159 | + (guide_dir / "02-installation.html").write_text("two") | |
| 1160 | + session = FakeSession(active_dod_path=str(dod_path), messages=[]) | |
| 1161 | + hook = LateReferenceDriftHook( | |
| 1162 | + dod_store=dod_store, | |
| 1163 | + project_root=temp_dir, | |
| 1164 | + session=session, | |
| 1165 | + ) | |
| 1166 | + | |
| 1167 | + result = await hook.pre_tool_use( | |
| 1168 | + HookContext( | |
| 1169 | + tool_call=ToolCall( | |
| 1170 | + id="read-1", | |
| 1171 | + name="read", | |
| 1172 | + arguments={"file_path": str(temp_dir / "reference" / "index.html")}, | |
| 1173 | + ), | |
| 1174 | + tool=registry.get("read"), | |
| 1175 | + registry=registry, | |
| 1176 | + permission_policy=policy, | |
| 1177 | + source="native", | |
| 1178 | + ) | |
| 1179 | + ) | |
| 1180 | + | |
| 1181 | + assert result.decision == HookDecision.DENY | |
| 1182 | + assert result.terminal_state == "blocked" | |
| 1183 | + assert result.message is not None | |
| 1184 | + assert "late reference drift" in result.message | |
| 1185 | + assert "03-first-website.html" in result.message | |
| 1186 | + | |
| 1187 | + | |
| 1188 | +@pytest.mark.asyncio | |
| 1189 | +async def test_late_reference_drift_hook_allows_reads_inside_planned_artifact_set( | |
| 1190 | + temp_dir: Path, | |
| 1191 | +) -> None: | |
| 1192 | + registry = create_default_registry(temp_dir) | |
| 1193 | + policy = build_permission_policy( | |
| 1194 | + active_mode=PermissionMode.WORKSPACE_WRITE, | |
| 1195 | + workspace_root=temp_dir, | |
| 1196 | + tool_requirements=registry.get_tool_requirements(), | |
| 1197 | + ) | |
| 1198 | + dod_store = DefinitionOfDoneStore(temp_dir) | |
| 1199 | + dod = create_definition_of_done("Create a multi-file guide from a reference") | |
| 1200 | + dod.status = "in_progress" | |
| 1201 | + plan_path = temp_dir / "implementation.md" | |
| 1202 | + plan_path.write_text( | |
| 1203 | + "# File Changes\n" | |
| 1204 | + "- `guide/index.html`\n" | |
| 1205 | + "- `guide/chapters/01-getting-started.html`\n" | |
| 1206 | + "- `guide/chapters/02-installation.html`\n" | |
| 1207 | + "- `guide/chapters/03-first-website.html`\n" | |
| 1208 | + ) | |
| 1209 | + dod.implementation_plan = str(plan_path) | |
| 1210 | + dod_path = dod_store.save(dod) | |
| 1211 | + guide_dir = temp_dir / "guide" / "chapters" | |
| 1212 | + guide_dir.mkdir(parents=True, exist_ok=True) | |
| 1213 | + target = guide_dir / "02-installation.html" | |
| 1214 | + (temp_dir / "guide" / "index.html").write_text("index") | |
| 1215 | + (guide_dir / "01-getting-started.html").write_text("one") | |
| 1216 | + target.write_text("two") | |
| 1217 | + session = FakeSession(active_dod_path=str(dod_path), messages=[]) | |
| 1218 | + hook = LateReferenceDriftHook( | |
| 1219 | + dod_store=dod_store, | |
| 1220 | + project_root=temp_dir, | |
| 1221 | + session=session, | |
| 1222 | + ) | |
| 1223 | + | |
| 1224 | + result = await hook.pre_tool_use( | |
| 1225 | + HookContext( | |
| 1226 | + tool_call=ToolCall( | |
| 1227 | + id="read-1", | |
| 1228 | + name="read", | |
| 1229 | + arguments={"file_path": str(target)}, | |
| 1230 | + ), | |
| 1231 | + tool=registry.get("read"), | |
| 1232 | + registry=registry, | |
| 1233 | + permission_policy=policy, | |
| 1234 | + source="native", | |
| 1235 | + ) | |
| 1236 | + ) | |
| 1237 | + | |
| 1238 | + assert result.decision == HookDecision.CONTINUE | |
| 1239 | + | |
| 1240 | + | |
| 1241 | +@pytest.mark.asyncio | |
| 1242 | +async def test_late_reference_drift_hook_blocks_reference_reads_after_artifacts_exist( | |
| 1243 | + temp_dir: Path, | |
| 1244 | +) -> None: | |
| 1245 | + registry = create_default_registry(temp_dir) | |
| 1246 | + policy = build_permission_policy( | |
| 1247 | + active_mode=PermissionMode.WORKSPACE_WRITE, | |
| 1248 | + workspace_root=temp_dir, | |
| 1249 | + tool_requirements=registry.get_tool_requirements(), | |
| 1250 | + ) | |
| 1251 | + dod_store = DefinitionOfDoneStore(temp_dir) | |
| 1252 | + dod = create_definition_of_done("Create a multi-file guide from a reference") | |
| 1253 | + dod.status = "in_progress" | |
| 1254 | + plan_path = temp_dir / "implementation.md" | |
| 1255 | + plan_path.write_text( | |
| 1256 | + "\n".join( | |
| 1257 | + [ | |
| 1258 | + "# Implementation Plan", | |
| 1259 | + "", | |
| 1260 | + "## File Changes", | |
| 1261 | + f"- `{temp_dir / 'guide'}`", | |
| 1262 | + f"- `{temp_dir / 'guide' / 'chapters'}`", | |
| 1263 | + f"- `{temp_dir / 'guide' / 'index.html'}`", | |
| 1264 | + f"- `{temp_dir / 'guide' / 'chapters' / '01-getting-started.html'}`", | |
| 1265 | + f"- `{temp_dir / 'guide' / 'chapters' / '02-installation.html'}`", | |
| 1266 | + "", | |
| 1267 | + ] | |
| 1268 | + ) | |
| 1269 | + ) | |
| 1270 | + dod.implementation_plan = str(plan_path) | |
| 1271 | + guide_dir = temp_dir / "guide" / "chapters" | |
| 1272 | + guide_dir.mkdir(parents=True, exist_ok=True) | |
| 1273 | + (temp_dir / "guide" / "index.html").write_text("index") | |
| 1274 | + (guide_dir / "01-getting-started.html").write_text("one") | |
| 1275 | + (guide_dir / "02-installation.html").write_text("two") | |
| 1276 | + dod_path = dod_store.save(dod) | |
| 1277 | + session = FakeSession(active_dod_path=str(dod_path), messages=[]) | |
| 1278 | + hook = LateReferenceDriftHook( | |
| 1279 | + dod_store=dod_store, | |
| 1280 | + project_root=temp_dir, | |
| 1281 | + session=session, | |
| 1282 | + ) | |
| 1283 | + | |
| 1284 | + result = await hook.pre_tool_use( | |
| 1285 | + HookContext( | |
| 1286 | + tool_call=ToolCall( | |
| 1287 | + id="read-1", | |
| 1288 | + name="read", | |
| 1289 | + arguments={"file_path": str(temp_dir / "reference" / "index.html")}, | |
| 1290 | + ), | |
| 1291 | + tool=registry.get("read"), | |
| 1292 | + registry=registry, | |
| 1293 | + permission_policy=policy, | |
| 1294 | + source="native", | |
| 1295 | + ) | |
| 1296 | + ) | |
| 1297 | + | |
| 1298 | + assert result.decision == HookDecision.DENY | |
| 1299 | + assert result.terminal_state == "blocked" | |
| 1300 | + assert result.message is not None | |
| 1301 | + assert "completed artifact set scope" in result.message | |
| 1302 | + assert str(temp_dir / "guide") in result.message | |
| 1303 | + | |
| 1304 | + | |
| 1305 | +@pytest.mark.asyncio | |
| 1306 | +async def test_late_reference_drift_hook_does_not_treat_empty_output_dir_as_complete_artifact_set( | |
| 1307 | + temp_dir: Path, | |
| 1308 | +) -> None: | |
| 1309 | + registry = create_default_registry(temp_dir) | |
| 1310 | + policy = build_permission_policy( | |
| 1311 | + active_mode=PermissionMode.WORKSPACE_WRITE, | |
| 1312 | + workspace_root=temp_dir, | |
| 1313 | + tool_requirements=registry.get_tool_requirements(), | |
| 1314 | + ) | |
| 1315 | + dod_store = DefinitionOfDoneStore(temp_dir) | |
| 1316 | + dod = create_definition_of_done("Create a multi-file guide from a reference") | |
| 1317 | + dod.status = "in_progress" | |
| 1318 | + dod.completed_items = ["Create chapter files with appropriate content"] | |
| 1319 | + plan_path = temp_dir / "implementation.md" | |
| 1320 | + plan_path.write_text( | |
| 1321 | + "\n".join( | |
| 1322 | + [ | |
| 1323 | + "# Implementation Plan", | |
| 1324 | + "", | |
| 1325 | + "## File Changes", | |
| 1326 | + f"- `{temp_dir / 'guide' / 'index.html'}`", | |
| 1327 | + f"- `{temp_dir / 'guide' / 'chapters'}/` (directory for chapter files)", | |
| 1328 | + "", | |
| 1329 | + "## Execution Order", | |
| 1330 | + "- Create chapter files with appropriate content", | |
| 1331 | + ] | |
| 1332 | + ) | |
| 1333 | + ) | |
| 1334 | + dod.implementation_plan = str(plan_path) | |
| 1335 | + guide_dir = temp_dir / "guide" / "chapters" | |
| 1336 | + guide_dir.mkdir(parents=True, exist_ok=True) | |
| 1337 | + (temp_dir / "guide" / "index.html").write_text("index") | |
| 1338 | + dod_path = dod_store.save(dod) | |
| 1339 | + session = FakeSession(active_dod_path=str(dod_path), messages=[]) | |
| 1340 | + hook = LateReferenceDriftHook( | |
| 1341 | + dod_store=dod_store, | |
| 1342 | + project_root=temp_dir, | |
| 1343 | + session=session, | |
| 1344 | + ) | |
| 1345 | + | |
| 1346 | + result = await hook.pre_tool_use( | |
| 1347 | + HookContext( | |
| 1348 | + tool_call=ToolCall( | |
| 1349 | + id="read-1", | |
| 1350 | + name="read", | |
| 1351 | + arguments={"file_path": str(temp_dir / "reference" / "index.html")}, | |
| 1352 | + ), | |
| 1353 | + tool=registry.get("read"), | |
| 1354 | + registry=registry, | |
| 1355 | + permission_policy=policy, | |
| 1356 | + source="native", | |
| 1357 | + ) | |
| 1358 | + ) | |
| 1359 | + | |
| 1360 | + assert result.decision == HookDecision.CONTINUE | |
| 1361 | + | |
| 1362 | + | |
| 1363 | +@pytest.mark.asyncio | |
| 1364 | +async def test_late_reference_drift_hook_does_not_block_when_html_outputs_still_link_to_missing_files( | |
| 1365 | + temp_dir: Path, | |
| 1366 | +) -> None: | |
| 1367 | + registry = create_default_registry(temp_dir) | |
| 1368 | + policy = build_permission_policy( | |
| 1369 | + active_mode=PermissionMode.WORKSPACE_WRITE, | |
| 1370 | + workspace_root=temp_dir, | |
| 1371 | + tool_requirements=registry.get_tool_requirements(), | |
| 1372 | + ) | |
| 1373 | + dod_store = DefinitionOfDoneStore(temp_dir) | |
| 1374 | + dod = create_definition_of_done("Create a multi-file guide from a reference") | |
| 1375 | + dod.status = "in_progress" | |
| 1376 | + dod.completed_items = ["Create chapter files with appropriate content"] | |
| 1377 | + plan_path = temp_dir / "implementation.md" | |
| 1378 | + plan_path.write_text( | |
| 1379 | + "\n".join( | |
| 1380 | + [ | |
| 1381 | + "# Implementation Plan", | |
| 1382 | + "", | |
| 1383 | + "## File Changes", | |
| 1384 | + f"- `{temp_dir / 'guide' / 'index.html'}`", | |
| 1385 | + f"- `{temp_dir / 'guide' / 'chapters'}/` (directory for chapter files)", | |
| 1386 | + "", | |
| 1387 | + "## Execution Order", | |
| 1388 | + "- Create chapter files with appropriate content", | |
| 1389 | + ] | |
| 1390 | + ) | |
| 1391 | + ) | |
| 1392 | + dod.implementation_plan = str(plan_path) | |
| 1393 | + guide_dir = temp_dir / "guide" | |
| 1394 | + chapters = guide_dir / "chapters" | |
| 1395 | + chapters.mkdir(parents=True, exist_ok=True) | |
| 1396 | + index = guide_dir / "index.html" | |
| 1397 | + index.write_text( | |
| 1398 | + '<a href="chapters/01-getting-started.html">One</a>\n' | |
| 1399 | + '<a href="chapters/02-installation.html">Two</a>\n' | |
| 1400 | + ) | |
| 1401 | + (chapters / "01-getting-started.html").write_text("one") | |
| 1402 | + dod.touched_files = [str(index), str(chapters / "01-getting-started.html")] | |
| 1403 | + dod_path = dod_store.save(dod) | |
| 1404 | + session = FakeSession(active_dod_path=str(dod_path), messages=[]) | |
| 1405 | + hook = LateReferenceDriftHook( | |
| 1406 | + dod_store=dod_store, | |
| 1407 | + project_root=temp_dir, | |
| 1408 | + session=session, | |
| 1409 | + ) | |
| 1410 | + | |
| 1411 | + result = await hook.pre_tool_use( | |
| 1412 | + HookContext( | |
| 1413 | + tool_call=ToolCall( | |
| 1414 | + id="read-1", | |
| 1415 | + name="read", | |
| 1416 | + arguments={"file_path": str(temp_dir / "reference" / "index.html")}, | |
| 1417 | + ), | |
| 1418 | + tool=registry.get("read"), | |
| 1419 | + registry=registry, | |
| 1420 | + permission_policy=policy, | |
| 1421 | + source="native", | |
| 1422 | + ) | |
| 1423 | + ) | |
| 1424 | + | |
| 1425 | + assert result.decision == HookDecision.CONTINUE | |
tests/test_repair.pymodified@@ -8,6 +8,7 @@ from types import SimpleNamespace | ||
| 8 | 8 | |
| 9 | 9 | from loader.llm.base import ToolCall |
| 10 | 10 | from loader.runtime.context import RuntimeContext |
| 11 | +from loader.runtime.dod import create_definition_of_done | |
| 11 | 12 | from loader.runtime.permissions import ( |
| 12 | 13 | PermissionMode, |
| 13 | 14 | build_permission_policy, |
@@ -201,3 +202,569 @@ def test_response_repairer_fails_honestly_when_raw_tool_budget_is_exhausted( | ||
| 201 | 202 | ) |
| 202 | 203 | assert analysis.failure == "raw-text tool recovery budget exhausted" |
| 203 | 204 | assert "Let me know if you'd like me to continue" not in analysis.final_response |
| 205 | + | |
| 206 | + | |
| 207 | +def test_empty_response_retry_message_surfaces_missing_planned_artifacts_and_working_note( | |
| 208 | + temp_dir: Path, | |
| 209 | +) -> None: | |
| 210 | + context = build_context( | |
| 211 | + temp_dir=temp_dir, | |
| 212 | + use_react=False, | |
| 213 | + ) | |
| 214 | + repairer = ResponseRepairer(context) | |
| 215 | + implementation_plan = temp_dir / "implementation.md" | |
| 216 | + implementation_plan.write_text( | |
| 217 | + "\n".join( | |
| 218 | + [ | |
| 219 | + "# Implementation Plan", | |
| 220 | + "", | |
| 221 | + "## File Changes", | |
| 222 | + f"- `{temp_dir / 'guides' / 'nginx' / 'index.html'}`", | |
| 223 | + f"- `{temp_dir / 'guides' / 'nginx' / 'chapters' / '01-getting-started.html'}`", | |
| 224 | + f"- `{temp_dir / 'guides' / 'nginx' / 'chapters' / '02-installation.html'}`", | |
| 225 | + "", | |
| 226 | + ] | |
| 227 | + ) | |
| 228 | + ) | |
| 229 | + first_artifact = temp_dir / "guides" / "nginx" / "index.html" | |
| 230 | + first_artifact.parent.mkdir(parents=True) | |
| 231 | + first_artifact.write_text("<html></html>\n") | |
| 232 | + | |
| 233 | + dod = create_definition_of_done("Create a multi-file nginx guide.") | |
| 234 | + dod.implementation_plan = str(implementation_plan) | |
| 235 | + dod.touched_files.append(str(first_artifact)) | |
| 236 | + dod.completed_items.append("Create the main index.html file") | |
| 237 | + dod.pending_items.append("Create each chapter file in sequence") | |
| 238 | + | |
| 239 | + context.session.append( | |
| 240 | + SimpleNamespace( | |
| 241 | + role="tool", | |
| 242 | + content=( | |
| 243 | + "Observation [notepad_write_working]: Result: " | |
| 244 | + "- [2026-04-21T19:17:34Z] Creating fifth chapter file: Advanced configurations" | |
| 245 | + ), | |
| 246 | + ) | |
| 247 | + ) | |
| 248 | + | |
| 249 | + decision = repairer.handle_empty_response( | |
| 250 | + task="Create a multi-file nginx guide.", | |
| 251 | + original_task=None, | |
| 252 | + empty_retry_count=1, | |
| 253 | + max_empty_retries=2, | |
| 254 | + dod=dod, | |
| 255 | + ) | |
| 256 | + | |
| 257 | + assert decision.should_continue is True | |
| 258 | + assert decision.retry_message is not None | |
| 259 | + assert "Latest working note: Creating fifth chapter file: Advanced configurations" in decision.retry_message | |
| 260 | + assert "Next missing planned artifact: `01-getting-started.html`" in decision.retry_message | |
| 261 | + assert "Remaining planned artifacts: `01-getting-started.html`, `02-installation.html`" in decision.retry_message | |
| 262 | + assert "Resume with this exact next step: create `01-getting-started.html`." in decision.retry_message | |
| 263 | + assert f"Prefer one `write` call for `{temp_dir / 'guides' / 'nginx' / 'chapters' / '01-getting-started.html'}` before any more reference reads." in decision.retry_message | |
| 264 | + assert ( | |
| 265 | + "Shape the next response as one concrete `write(file_path=..., content=...)` " | |
| 266 | + "tool call for that exact path." | |
| 267 | + in decision.retry_message | |
| 268 | + ) | |
| 269 | + assert ( | |
| 270 | + "Your next response should be the concrete mutation tool call itself, " | |
| 271 | + "not TodoWrite alone, verification, or a completion summary." | |
| 272 | + in decision.retry_message | |
| 273 | + ) | |
| 274 | + assert "Do not restart discovery unless one specific missing fact blocks this step." in decision.retry_message | |
| 275 | + | |
| 276 | + | |
| 277 | +def test_empty_response_retry_mentions_write_can_create_missing_parent_directories( | |
| 278 | + temp_dir: Path, | |
| 279 | +) -> None: | |
| 280 | + context = build_context( | |
| 281 | + temp_dir=temp_dir, | |
| 282 | + use_react=False, | |
| 283 | + ) | |
| 284 | + repairer = ResponseRepairer(context) | |
| 285 | + | |
| 286 | + guide_root = temp_dir / "guides" / "nginx" | |
| 287 | + index_path = guide_root / "index.html" | |
| 288 | + | |
| 289 | + implementation_plan = temp_dir / "implementation.md" | |
| 290 | + implementation_plan.write_text( | |
| 291 | + "\n".join( | |
| 292 | + [ | |
| 293 | + "# Implementation Plan", | |
| 294 | + "", | |
| 295 | + "## File Changes", | |
| 296 | + f"- `{index_path}`", | |
| 297 | + "", | |
| 298 | + ] | |
| 299 | + ) | |
| 300 | + ) | |
| 301 | + | |
| 302 | + dod = create_definition_of_done("Create a multi-file nginx guide.") | |
| 303 | + dod.implementation_plan = str(implementation_plan) | |
| 304 | + dod.pending_items.extend( | |
| 305 | + [ | |
| 306 | + "Create nginx guide directory structure", | |
| 307 | + "Write main index.html for nginx guide", | |
| 308 | + ] | |
| 309 | + ) | |
| 310 | + | |
| 311 | + decision = repairer.handle_empty_response( | |
| 312 | + task="Create a multi-file nginx guide.", | |
| 313 | + original_task=None, | |
| 314 | + empty_retry_count=1, | |
| 315 | + max_empty_retries=2, | |
| 316 | + dod=dod, | |
| 317 | + ) | |
| 318 | + | |
| 319 | + assert decision.should_continue is True | |
| 320 | + assert decision.retry_message is not None | |
| 321 | + assert "Resume with this exact next step: create `index.html`." in decision.retry_message | |
| 322 | + assert ( | |
| 323 | + "The `write` tool can create that file's parent directories automatically" | |
| 324 | + in decision.retry_message | |
| 325 | + ) | |
| 326 | + assert ( | |
| 327 | + "Shape the next response as one concrete `write(file_path=..., content=...)` " | |
| 328 | + "tool call for that exact path." | |
| 329 | + in decision.retry_message | |
| 330 | + ) | |
| 331 | + | |
| 332 | + | |
| 333 | +def test_empty_response_retry_respects_discovery_first_pending_step( | |
| 334 | + temp_dir: Path, | |
| 335 | +) -> None: | |
| 336 | + context = build_context( | |
| 337 | + temp_dir=temp_dir, | |
| 338 | + use_react=False, | |
| 339 | + ) | |
| 340 | + repairer = ResponseRepairer(context) | |
| 341 | + | |
| 342 | + implementation_plan = temp_dir / "implementation.md" | |
| 343 | + implementation_plan.write_text( | |
| 344 | + "\n".join( | |
| 345 | + [ | |
| 346 | + "# Implementation Plan", | |
| 347 | + "", | |
| 348 | + "## File Changes", | |
| 349 | + f"- `{temp_dir / 'guides' / 'nginx' / 'index.html'}`", | |
| 350 | + f"- `{temp_dir / 'guides' / 'nginx' / 'chapters'}`", | |
| 351 | + "", | |
| 352 | + ] | |
| 353 | + ) | |
| 354 | + ) | |
| 355 | + | |
| 356 | + dod = create_definition_of_done("Create a multi-file nginx guide.") | |
| 357 | + dod.implementation_plan = str(implementation_plan) | |
| 358 | + dod.pending_items.extend( | |
| 359 | + [ | |
| 360 | + "First, examine the existing fortran guide structure and content to understand the format", | |
| 361 | + "Create the nginx directory structure", | |
| 362 | + "Develop the main index.html file for the nginx guide", | |
| 363 | + ] | |
| 364 | + ) | |
| 365 | + | |
| 366 | + context.session.append( | |
| 367 | + SimpleNamespace( | |
| 368 | + role="tool", | |
| 369 | + content=( | |
| 370 | + "Observation [notepad_write_working]: Result: " | |
| 371 | + "- [2026-04-22T22:42:18Z] Analyzing the fortran guide structure before creating nginx guide" | |
| 372 | + ), | |
| 373 | + ) | |
| 374 | + ) | |
| 375 | + | |
| 376 | + decision = repairer.handle_empty_response( | |
| 377 | + task="Create a multi-file nginx guide.", | |
| 378 | + original_task=None, | |
| 379 | + empty_retry_count=1, | |
| 380 | + max_empty_retries=2, | |
| 381 | + dod=dod, | |
| 382 | + ) | |
| 383 | + | |
| 384 | + assert decision.should_continue is True | |
| 385 | + assert decision.retry_message is not None | |
| 386 | + assert ( | |
| 387 | + "Resume with this exact next step: advance `First, examine the existing fortran guide structure and content to understand the format`." | |
| 388 | + in decision.retry_message | |
| 389 | + ) | |
| 390 | + assert "one concrete evidence-gathering tool call" in decision.retry_message | |
| 391 | + assert "Resume with this exact next step: create `index.html`." not in decision.retry_message | |
| 392 | + | |
| 393 | + | |
| 394 | +def test_empty_response_retry_budget_extends_for_late_stage_multi_artifact_progress( | |
| 395 | + temp_dir: Path, | |
| 396 | +) -> None: | |
| 397 | + context = build_context( | |
| 398 | + temp_dir=temp_dir, | |
| 399 | + use_react=False, | |
| 400 | + ) | |
| 401 | + repairer = ResponseRepairer(context) | |
| 402 | + | |
| 403 | + guide_root = temp_dir / "guides" / "nginx" | |
| 404 | + chapters = guide_root / "chapters" | |
| 405 | + chapters.mkdir(parents=True) | |
| 406 | + index_path = guide_root / "index.html" | |
| 407 | + chapter_one = chapters / "01-getting-started.html" | |
| 408 | + chapter_two = chapters / "02-installation.html" | |
| 409 | + chapter_three = chapters / "03-first-website.html" | |
| 410 | + chapter_four = chapters / "04-configuration-basics.html" | |
| 411 | + index_path.write_text("<html></html>\n") | |
| 412 | + chapter_one.write_text("<h1>One</h1>\n") | |
| 413 | + chapter_two.write_text("<h1>Two</h1>\n") | |
| 414 | + chapter_three.write_text("<h1>Three</h1>\n") | |
| 415 | + | |
| 416 | + implementation_plan = temp_dir / "implementation.md" | |
| 417 | + implementation_plan.write_text( | |
| 418 | + "\n".join( | |
| 419 | + [ | |
| 420 | + "# Implementation Plan", | |
| 421 | + "", | |
| 422 | + "## File Changes", | |
| 423 | + f"- `{guide_root}/`", | |
| 424 | + f"- `{chapters}/`", | |
| 425 | + f"- `{index_path}`", | |
| 426 | + f"- `{chapter_one}`", | |
| 427 | + f"- `{chapter_two}`", | |
| 428 | + f"- `{chapter_three}`", | |
| 429 | + f"- `{chapter_four}`", | |
| 430 | + "", | |
| 431 | + ] | |
| 432 | + ) | |
| 433 | + ) | |
| 434 | + | |
| 435 | + dod = create_definition_of_done("Create a multi-file nginx guide.") | |
| 436 | + dod.implementation_plan = str(implementation_plan) | |
| 437 | + dod.touched_files.extend( | |
| 438 | + [str(index_path), str(chapter_one), str(chapter_two), str(chapter_three)] | |
| 439 | + ) | |
| 440 | + dod.completed_items.extend( | |
| 441 | + [ | |
| 442 | + "Create the directory structure for the new nginx guide", | |
| 443 | + "Create the main index.html file with proper structure", | |
| 444 | + ] | |
| 445 | + ) | |
| 446 | + dod.pending_items.append("Create each chapter file in sequence") | |
| 447 | + | |
| 448 | + decision = repairer.handle_empty_response( | |
| 449 | + task="Create a multi-file nginx guide.", | |
| 450 | + original_task=None, | |
| 451 | + empty_retry_count=3, | |
| 452 | + max_empty_retries=2, | |
| 453 | + dod=dod, | |
| 454 | + ) | |
| 455 | + | |
| 456 | + assert decision.should_continue is True | |
| 457 | + assert decision.retry_message is not None | |
| 458 | + assert "retry 3/4" in decision.retry_message | |
| 459 | + assert "Follow the same one-file-at-a-time mutation pattern" in decision.retry_message | |
| 460 | + | |
| 461 | + | |
| 462 | +def test_empty_response_retry_points_at_next_output_file_when_planned_directory_is_empty( | |
| 463 | + temp_dir: Path, | |
| 464 | +) -> None: | |
| 465 | + context = build_context( | |
| 466 | + temp_dir=temp_dir, | |
| 467 | + use_react=False, | |
| 468 | + ) | |
| 469 | + repairer = ResponseRepairer(context) | |
| 470 | + | |
| 471 | + guide_root = temp_dir / "guides" / "nginx" | |
| 472 | + chapters = guide_root / "chapters" | |
| 473 | + chapters.mkdir(parents=True) | |
| 474 | + index_path = guide_root / "index.html" | |
| 475 | + index_path.write_text("<html></html>\n") | |
| 476 | + | |
| 477 | + implementation_plan = temp_dir / "implementation.md" | |
| 478 | + implementation_plan.write_text( | |
| 479 | + "\n".join( | |
| 480 | + [ | |
| 481 | + "# Implementation Plan", | |
| 482 | + "", | |
| 483 | + "## File Changes", | |
| 484 | + f"- `{guide_root}/`", | |
| 485 | + f"- `{chapters}/`", | |
| 486 | + f"- `{index_path}`", | |
| 487 | + "", | |
| 488 | + ] | |
| 489 | + ) | |
| 490 | + ) | |
| 491 | + | |
| 492 | + dod = create_definition_of_done("Create a multi-file nginx guide.") | |
| 493 | + dod.implementation_plan = str(implementation_plan) | |
| 494 | + dod.touched_files.append(str(index_path)) | |
| 495 | + dod.pending_items.append("Write the introduction chapter") | |
| 496 | + | |
| 497 | + decision = repairer.handle_empty_response( | |
| 498 | + task="Create a multi-file nginx guide.", | |
| 499 | + original_task=None, | |
| 500 | + empty_retry_count=1, | |
| 501 | + max_empty_retries=2, | |
| 502 | + dod=dod, | |
| 503 | + ) | |
| 504 | + | |
| 505 | + assert decision.should_continue is True | |
| 506 | + assert decision.retry_message is not None | |
| 507 | + assert "Next missing planned artifact: `chapters/`" in decision.retry_message | |
| 508 | + assert ( | |
| 509 | + "Resume with this exact next step: continue `Write the introduction chapter` " | |
| 510 | + "by creating the next output file under `chapters/`." | |
| 511 | + in decision.retry_message | |
| 512 | + ) | |
| 513 | + assert ( | |
| 514 | + f"Prefer one concrete `write` call for a file inside `{chapters}` before more research." | |
| 515 | + in decision.retry_message | |
| 516 | + ) | |
| 517 | + | |
| 518 | + | |
| 519 | +def test_empty_response_retry_points_at_declared_child_file_within_incomplete_output_directory( | |
| 520 | + temp_dir: Path, | |
| 521 | +) -> None: | |
| 522 | + context = build_context( | |
| 523 | + temp_dir=temp_dir, | |
| 524 | + use_react=False, | |
| 525 | + ) | |
| 526 | + repairer = ResponseRepairer(context) | |
| 527 | + | |
| 528 | + guide_root = temp_dir / "guides" / "nginx" | |
| 529 | + chapters = guide_root / "chapters" | |
| 530 | + chapters.mkdir(parents=True) | |
| 531 | + index_path = guide_root / "index.html" | |
| 532 | + index_path.write_text( | |
| 533 | + "\n".join( | |
| 534 | + [ | |
| 535 | + "<html>", | |
| 536 | + '<a href="chapters/introduction.html">Introduction</a>', | |
| 537 | + '<a href="chapters/installation.html">Installation</a>', | |
| 538 | + "</html>", | |
| 539 | + ] | |
| 540 | + ) | |
| 541 | + + "\n" | |
| 542 | + ) | |
| 543 | + | |
| 544 | + implementation_plan = temp_dir / "implementation.md" | |
| 545 | + implementation_plan.write_text( | |
| 546 | + "\n".join( | |
| 547 | + [ | |
| 548 | + "# Implementation Plan", | |
| 549 | + "", | |
| 550 | + "## File Changes", | |
| 551 | + f"- `{guide_root}/`", | |
| 552 | + f"- `{chapters}/`", | |
| 553 | + f"- `{index_path}`", | |
| 554 | + "", | |
| 555 | + ] | |
| 556 | + ) | |
| 557 | + ) | |
| 558 | + | |
| 559 | + dod = create_definition_of_done("Create a multi-file nginx guide.") | |
| 560 | + dod.implementation_plan = str(implementation_plan) | |
| 561 | + dod.touched_files.append(str(index_path)) | |
| 562 | + dod.pending_items.append("Write the introduction chapter") | |
| 563 | + | |
| 564 | + decision = repairer.handle_empty_response( | |
| 565 | + task="Create a multi-file nginx guide.", | |
| 566 | + original_task=None, | |
| 567 | + empty_retry_count=1, | |
| 568 | + max_empty_retries=2, | |
| 569 | + dod=dod, | |
| 570 | + ) | |
| 571 | + | |
| 572 | + assert decision.should_continue is True | |
| 573 | + assert decision.retry_message is not None | |
| 574 | + assert "Next missing planned artifact: `chapters/`" in decision.retry_message | |
| 575 | + assert "Next declared output under `chapters/`: `introduction.html`" in decision.retry_message | |
| 576 | + assert ( | |
| 577 | + "Resume with this exact next step: continue `Write the introduction chapter` " | |
| 578 | + "by creating `introduction.html`." | |
| 579 | + in decision.retry_message | |
| 580 | + ) | |
| 581 | + assert "It is the next missing declared output under `chapters/`." in decision.retry_message | |
| 582 | + assert "Prefer one `write` call for `" in decision.retry_message | |
| 583 | + assert "introduction.html` before more research." in decision.retry_message | |
| 584 | + | |
| 585 | + | |
| 586 | +def test_empty_response_retry_fails_after_extended_late_stage_budget_is_exhausted( | |
| 587 | + temp_dir: Path, | |
| 588 | +) -> None: | |
| 589 | + context = build_context( | |
| 590 | + temp_dir=temp_dir, | |
| 591 | + use_react=False, | |
| 592 | + ) | |
| 593 | + repairer = ResponseRepairer(context) | |
| 594 | + | |
| 595 | + guide_root = temp_dir / "guides" / "nginx" | |
| 596 | + chapters = guide_root / "chapters" | |
| 597 | + chapters.mkdir(parents=True) | |
| 598 | + index_path = guide_root / "index.html" | |
| 599 | + chapter_one = chapters / "01-getting-started.html" | |
| 600 | + chapter_two = chapters / "02-installation.html" | |
| 601 | + chapter_three = chapters / "03-first-website.html" | |
| 602 | + chapter_four = chapters / "04-configuration-basics.html" | |
| 603 | + index_path.write_text("<html></html>\n") | |
| 604 | + chapter_one.write_text("<h1>One</h1>\n") | |
| 605 | + chapter_two.write_text("<h1>Two</h1>\n") | |
| 606 | + chapter_three.write_text("<h1>Three</h1>\n") | |
| 607 | + | |
| 608 | + implementation_plan = temp_dir / "implementation.md" | |
| 609 | + implementation_plan.write_text( | |
| 610 | + "\n".join( | |
| 611 | + [ | |
| 612 | + "# Implementation Plan", | |
| 613 | + "", | |
| 614 | + "## File Changes", | |
| 615 | + f"- `{guide_root}/`", | |
| 616 | + f"- `{chapters}/`", | |
| 617 | + f"- `{index_path}`", | |
| 618 | + f"- `{chapter_one}`", | |
| 619 | + f"- `{chapter_two}`", | |
| 620 | + f"- `{chapter_three}`", | |
| 621 | + f"- `{chapter_four}`", | |
| 622 | + "", | |
| 623 | + ] | |
| 624 | + ) | |
| 625 | + ) | |
| 626 | + | |
| 627 | + dod = create_definition_of_done("Create a multi-file nginx guide.") | |
| 628 | + dod.implementation_plan = str(implementation_plan) | |
| 629 | + dod.touched_files.extend( | |
| 630 | + [str(index_path), str(chapter_one), str(chapter_two), str(chapter_three)] | |
| 631 | + ) | |
| 632 | + dod.completed_items.extend( | |
| 633 | + [ | |
| 634 | + "Create the directory structure for the new nginx guide", | |
| 635 | + "Create the main index.html file with proper structure", | |
| 636 | + ] | |
| 637 | + ) | |
| 638 | + dod.pending_items.append("Create each chapter file in sequence") | |
| 639 | + | |
| 640 | + decision = repairer.handle_empty_response( | |
| 641 | + task="Create a multi-file nginx guide.", | |
| 642 | + original_task=None, | |
| 643 | + empty_retry_count=5, | |
| 644 | + max_empty_retries=2, | |
| 645 | + dod=dod, | |
| 646 | + ) | |
| 647 | + | |
| 648 | + assert decision.should_continue is False | |
| 649 | + assert decision.final_response is not None | |
| 650 | + assert "retrying 4 times" in decision.final_response | |
| 651 | + | |
| 652 | + | |
| 653 | +def test_empty_response_retry_mentions_todowrite_when_progress_has_outpaced_tracking( | |
| 654 | + temp_dir: Path, | |
| 655 | +) -> None: | |
| 656 | + context = build_context( | |
| 657 | + temp_dir=temp_dir, | |
| 658 | + use_react=False, | |
| 659 | + ) | |
| 660 | + repairer = ResponseRepairer(context) | |
| 661 | + | |
| 662 | + guide_root = temp_dir / "guides" / "nginx" | |
| 663 | + chapters = guide_root / "chapters" | |
| 664 | + chapters.mkdir(parents=True) | |
| 665 | + implementation_plan = temp_dir / "implementation.md" | |
| 666 | + implementation_plan.write_text( | |
| 667 | + "\n".join( | |
| 668 | + [ | |
| 669 | + "# Implementation Plan", | |
| 670 | + "", | |
| 671 | + "## File Changes", | |
| 672 | + f"- `{guide_root / 'index.html'}`", | |
| 673 | + f"- `{chapters / '01-getting-started.html'}`", | |
| 674 | + f"- `{chapters / '02-installation.html'}`", | |
| 675 | + "", | |
| 676 | + ] | |
| 677 | + ) | |
| 678 | + ) | |
| 679 | + | |
| 680 | + dod = create_definition_of_done("Create a multi-file nginx guide.") | |
| 681 | + dod.implementation_plan = str(implementation_plan) | |
| 682 | + dod.touched_files.extend( | |
| 683 | + [ | |
| 684 | + str(guide_root / "index.html"), | |
| 685 | + str(chapters / "01-getting-started.html"), | |
| 686 | + ] | |
| 687 | + ) | |
| 688 | + dod.completed_items.extend( | |
| 689 | + [ | |
| 690 | + "Create the directory structure for the new nginx guide", | |
| 691 | + "Create the main index.html file with proper structure", | |
| 692 | + ] | |
| 693 | + ) | |
| 694 | + dod.pending_items.append("Create each chapter file in sequence") | |
| 695 | + | |
| 696 | + decision = repairer.handle_empty_response( | |
| 697 | + task="Create a multi-file nginx guide.", | |
| 698 | + original_task=None, | |
| 699 | + empty_retry_count=1, | |
| 700 | + max_empty_retries=2, | |
| 701 | + dod=dod, | |
| 702 | + ) | |
| 703 | + | |
| 704 | + assert decision.retry_message is not None | |
| 705 | + assert ( | |
| 706 | + "refresh `TodoWrite` alongside the next concrete mutation" | |
| 707 | + in decision.retry_message | |
| 708 | + ) | |
| 709 | + | |
| 710 | + | |
| 711 | +def test_empty_response_retry_omits_stale_aggregate_completed_work_when_artifacts_missing( | |
| 712 | + temp_dir: Path, | |
| 713 | +) -> None: | |
| 714 | + context = build_context( | |
| 715 | + temp_dir=temp_dir, | |
| 716 | + use_react=False, | |
| 717 | + ) | |
| 718 | + repairer = ResponseRepairer(context) | |
| 719 | + | |
| 720 | + guide_root = temp_dir / "guides" / "nginx" | |
| 721 | + chapters = guide_root / "chapters" | |
| 722 | + chapters.mkdir(parents=True) | |
| 723 | + index_path = guide_root / "index.html" | |
| 724 | + chapter_one = chapters / "01-getting-started.html" | |
| 725 | + chapter_two = chapters / "02-installation.html" | |
| 726 | + chapter_three = chapters / "03-first-website.html" | |
| 727 | + index_path.write_text("<html></html>\n") | |
| 728 | + chapter_one.write_text("<h1>One</h1>\n") | |
| 729 | + chapter_two.write_text("<h1>Two</h1>\n") | |
| 730 | + | |
| 731 | + implementation_plan = temp_dir / "implementation.md" | |
| 732 | + implementation_plan.write_text( | |
| 733 | + "\n".join( | |
| 734 | + [ | |
| 735 | + "# Implementation Plan", | |
| 736 | + "", | |
| 737 | + "## File Changes", | |
| 738 | + f"- `{guide_root}/`", | |
| 739 | + f"- `{chapters}/`", | |
| 740 | + f"- `{index_path}`", | |
| 741 | + f"- `{chapter_one}`", | |
| 742 | + f"- `{chapter_two}`", | |
| 743 | + f"- `{chapter_three}`", | |
| 744 | + "", | |
| 745 | + ] | |
| 746 | + ) | |
| 747 | + ) | |
| 748 | + | |
| 749 | + dod = create_definition_of_done("Create a multi-file nginx guide.") | |
| 750 | + dod.implementation_plan = str(implementation_plan) | |
| 751 | + dod.touched_files.extend([str(index_path), str(chapter_one), str(chapter_two)]) | |
| 752 | + dod.completed_items.extend( | |
| 753 | + [ | |
| 754 | + "Create the main index.html file with proper structure", | |
| 755 | + "Link all chapters together properly", | |
| 756 | + ] | |
| 757 | + ) | |
| 758 | + dod.pending_items.append("Create each chapter file in sequence") | |
| 759 | + | |
| 760 | + decision = repairer.handle_empty_response( | |
| 761 | + task="Create a multi-file nginx guide.", | |
| 762 | + original_task=None, | |
| 763 | + empty_retry_count=1, | |
| 764 | + max_empty_retries=2, | |
| 765 | + dod=dod, | |
| 766 | + ) | |
| 767 | + | |
| 768 | + assert decision.retry_message is not None | |
| 769 | + assert "Link all chapters together properly" not in decision.retry_message | |
| 770 | + assert "Create the main index.html file with proper structure" in decision.retry_message | |
tests/test_runtime_harness.pymodified@@ -2020,26 +2020,8 @@ async def test_blocked_html_index_edit_queues_inventory_reuse_steering( | ||
| 2020 | 2020 | if event.type == "steering" and event.content |
| 2021 | 2021 | ] |
| 2022 | 2022 | |
| 2023 | - assert any("TOC references chapter files that do not exist" in message for message in messages) | |
| 2024 | - assert any( | |
| 2025 | - "Use the current TOC target contents plus the verified sibling inventory" in message | |
| 2026 | - for message in steering_messages | |
| 2027 | - ) | |
| 2028 | - assert any(str(index_file) in message for message in steering_messages) | |
| 2029 | - assert any( | |
| 2030 | - "chapters/05-input-output.html = Chapter 5: Input and Output" in message | |
| 2031 | - for message in steering_messages | |
| 2032 | - ) | |
| 2033 | - assert any("<ul class=\"chapter-list\">" in message for message in steering_messages) | |
| 2034 | - assert any("Suggested replacement block:" in message for message in steering_messages) | |
| 2035 | - assert any("Do not rewrite the whole document." in message for message in steering_messages) | |
| 2036 | - assert any("set `old_string` to the current TOC block above exactly" in message for message in steering_messages) | |
| 2037 | - assert any("Suggested edit call:" in message for message in steering_messages) | |
| 2038 | - assert any('old_string="""' in message for message in steering_messages) | |
| 2039 | - assert any( | |
| 2040 | - '<li><a href="chapters/05-input-output.html">Chapter 5: Input and Output</a></li>' in message | |
| 2041 | - for message in steering_messages | |
| 2042 | - ) | |
| 2023 | + assert any("Edited HTML links point to files that do not exist" in message for message in messages) | |
| 2024 | + assert steering_messages == [] | |
| 2043 | 2025 | |
| 2044 | 2026 | |
| 2045 | 2027 | @pytest.mark.asyncio |
@@ -2080,15 +2062,7 @@ async def test_full_path_glob_pattern_still_injects_verified_html_inventory( | ||
| 2080 | 2062 | |
| 2081 | 2063 | assert tool_event_names(run) == ["glob"] |
| 2082 | 2064 | messages = tool_result_messages(run) |
| 2083 | - assert any( | |
| 2084 | - "Verified chapter inventory: chapters/01-introduction.html = Chapter 1: Introduction to Fortran" | |
| 2085 | - in message | |
| 2086 | - for message in messages | |
| 2087 | - ) | |
| 2088 | - assert any( | |
| 2089 | - "chapters/02-setup.html = Chapter 2: Setting Up Fortran" in message | |
| 2090 | - for message in messages | |
| 2091 | - ) | |
| 2065 | + assert all("Verified chapter inventory:" not in message for message in messages) | |
| 2092 | 2066 | |
| 2093 | 2067 | |
| 2094 | 2068 | @pytest.mark.asyncio |
@@ -2136,16 +2110,8 @@ async def test_verified_html_inventory_blocks_redundant_chapter_reread( | ||
| 2136 | 2110 | ) |
| 2137 | 2111 | |
| 2138 | 2112 | messages = tool_result_messages(run) |
| 2139 | - assert any( | |
| 2140 | - "Verified chapter inventory: chapters/01-introduction.html = Chapter 1: Introduction to Fortran" | |
| 2141 | - in message | |
| 2142 | - for message in messages | |
| 2143 | - ) | |
| 2144 | - assert any( | |
| 2145 | - "verified sibling chapter inventory" | |
| 2146 | - in message | |
| 2147 | - for message in messages | |
| 2148 | - ) | |
| 2113 | + assert all("Verified chapter inventory:" not in message for message in messages) | |
| 2114 | + assert all("verified sibling chapter inventory" not in message for message in messages) | |
| 2149 | 2115 | |
| 2150 | 2116 | |
| 2151 | 2117 | @pytest.mark.asyncio |
@@ -2235,24 +2201,12 @@ async def test_successful_html_toc_edit_blocks_post_success_reread_and_steers_to | ||
| 2235 | 2201 | if event.type == "steering" and event.content |
| 2236 | 2202 | ] |
| 2237 | 2203 | |
| 2238 | - assert any( | |
| 2239 | - "Semantic verification preview: validated 2 toc links in index.html" | |
| 2240 | - in message | |
| 2241 | - for message in messages | |
| 2242 | - ) | |
| 2243 | - assert any( | |
| 2244 | - "already passed semantic link validation" in message | |
| 2204 | + assert all( | |
| 2205 | + "Semantic verification preview:" not in message | |
| 2245 | 2206 | for message in messages |
| 2246 | 2207 | ) |
| 2247 | - assert any( | |
| 2248 | - "already satisfies the verified link/title constraints" in message | |
| 2249 | - for message in steering_messages | |
| 2250 | - ) | |
| 2251 | - assert any( | |
| 2252 | - "Do not reread" in message and "chapters" in message | |
| 2253 | - for message in steering_messages | |
| 2254 | - ) | |
| 2255 | - assert "validated 2 toc links in index.html" in run.response | |
| 2208 | + assert steering_messages == [] | |
| 2209 | + assert "updated index.html" in run.response.lower() | |
| 2256 | 2210 | |
| 2257 | 2211 | |
| 2258 | 2212 | @pytest.mark.asyncio |
@@ -2325,17 +2279,11 @@ async def test_exact_prompt_finishes_when_index_toc_is_already_correct( | ||
| 2325 | 2279 | if event.type == "steering" and event.content |
| 2326 | 2280 | ] |
| 2327 | 2281 | |
| 2328 | - assert any( | |
| 2329 | - "Semantic verification preview: validated 2 toc links in index.html" | |
| 2330 | - in message | |
| 2282 | + assert all( | |
| 2283 | + "Semantic verification preview:" not in message | |
| 2331 | 2284 | for message in messages |
| 2332 | 2285 | ) |
| 2333 | - assert any( | |
| 2334 | - "No TOC edit is required unless you can point to one specific incorrect href or title" | |
| 2335 | - in message | |
| 2336 | - for message in steering_messages | |
| 2337 | - ) | |
| 2338 | - assert any(str(index_file) in message for message in steering_messages) | |
| 2286 | + assert steering_messages == [] | |
| 2339 | 2287 | assert ( |
| 2340 | 2288 | sum( |
| 2341 | 2289 | 1 |
tests/test_runtime_repair_flows.pymodified@@ -99,6 +99,111 @@ async def test_empty_response_retry_injects_honest_user_reminder_and_recovers( | ||
| 99 | 99 | ) |
| 100 | 100 | |
| 101 | 101 | |
| 102 | +@pytest.mark.asyncio | |
| 103 | +async def test_empty_response_retry_carries_forward_confirmed_progress( | |
| 104 | + temp_dir: Path, | |
| 105 | +) -> None: | |
| 106 | + target = temp_dir / "hello.py" | |
| 107 | + backend = ScriptedBackend( | |
| 108 | + completions=[ | |
| 109 | + CompletionResponse( | |
| 110 | + content="I'll create the file now.", | |
| 111 | + tool_calls=[ | |
| 112 | + ToolCall( | |
| 113 | + id="write-1", | |
| 114 | + name="write", | |
| 115 | + arguments={ | |
| 116 | + "file_path": str(target), | |
| 117 | + "content": "print('hello')\n", | |
| 118 | + }, | |
| 119 | + ) | |
| 120 | + ], | |
| 121 | + ), | |
| 122 | + CompletionResponse(content=""), | |
| 123 | + CompletionResponse(content="Recovered after the empty response."), | |
| 124 | + ] | |
| 125 | + ) | |
| 126 | + | |
| 127 | + run = await run_scenario( | |
| 128 | + "Create hello.py with a greeting.", | |
| 129 | + backend, | |
| 130 | + config=non_streaming_config(), | |
| 131 | + project_root=temp_dir, | |
| 132 | + ) | |
| 133 | + | |
| 134 | + assert "Recovered after the empty response." in run.response | |
| 135 | + retry_messages = [ | |
| 136 | + message.content | |
| 137 | + for message in backend.invocations[2].messages | |
| 138 | + if message.role == Role.USER and "[EMPTY ASSISTANT RESPONSE]" in message.content | |
| 139 | + ] | |
| 140 | + assert retry_messages | |
| 141 | + assert "retry 1/2" in retry_messages[0] | |
| 142 | + assert "Continue from the confirmed progress below instead of restarting." in retry_messages[0] | |
| 143 | + assert "hello.py" in retry_messages[0] | |
| 144 | + | |
| 145 | + | |
| 146 | +@pytest.mark.asyncio | |
| 147 | +async def test_empty_response_retry_budget_resets_after_successful_turn( | |
| 148 | + temp_dir: Path, | |
| 149 | +) -> None: | |
| 150 | + first = temp_dir / "one.txt" | |
| 151 | + second = temp_dir / "two.txt" | |
| 152 | + backend = ScriptedBackend( | |
| 153 | + completions=[ | |
| 154 | + CompletionResponse(content=""), | |
| 155 | + CompletionResponse( | |
| 156 | + content="I'll create the first file now.", | |
| 157 | + tool_calls=[ | |
| 158 | + ToolCall( | |
| 159 | + id="write-1", | |
| 160 | + name="write", | |
| 161 | + arguments={ | |
| 162 | + "file_path": str(first), | |
| 163 | + "content": "one\n", | |
| 164 | + }, | |
| 165 | + ) | |
| 166 | + ], | |
| 167 | + ), | |
| 168 | + CompletionResponse(content=""), | |
| 169 | + CompletionResponse( | |
| 170 | + content="I'll create the second file now.", | |
| 171 | + tool_calls=[ | |
| 172 | + ToolCall( | |
| 173 | + id="write-2", | |
| 174 | + name="write", | |
| 175 | + arguments={ | |
| 176 | + "file_path": str(second), | |
| 177 | + "content": "two\n", | |
| 178 | + }, | |
| 179 | + ) | |
| 180 | + ], | |
| 181 | + ), | |
| 182 | + CompletionResponse(content="Both files are created."), | |
| 183 | + ] | |
| 184 | + ) | |
| 185 | + | |
| 186 | + run = await run_scenario( | |
| 187 | + "Create one.txt and two.txt.", | |
| 188 | + backend, | |
| 189 | + config=non_streaming_config(), | |
| 190 | + project_root=temp_dir, | |
| 191 | + ) | |
| 192 | + | |
| 193 | + assert run.response.startswith("Both files are created.") | |
| 194 | + retry_messages: list[str] = [] | |
| 195 | + for invocation in backend.invocations: | |
| 196 | + for message in invocation.messages: | |
| 197 | + if message.role != Role.USER or "[EMPTY ASSISTANT RESPONSE]" not in message.content: | |
| 198 | + continue | |
| 199 | + if retry_messages and retry_messages[-1] == message.content: | |
| 200 | + continue | |
| 201 | + retry_messages.append(message.content) | |
| 202 | + assert len(retry_messages) >= 2 | |
| 203 | + assert all("retry 2/2" not in message for message in retry_messages) | |
| 204 | + assert sum("retry 1/2" in message for message in retry_messages) >= 2 | |
| 205 | + | |
| 206 | + | |
| 102 | 207 | @pytest.mark.asyncio |
| 103 | 208 | async def test_repeated_empty_responses_fail_honestly_after_one_retry( |
| 104 | 209 | temp_dir: Path, |
@@ -107,6 +212,7 @@ async def test_repeated_empty_responses_fail_honestly_after_one_retry( | ||
| 107 | 212 | completions=[ |
| 108 | 213 | CompletionResponse(content=""), |
| 109 | 214 | CompletionResponse(content=""), |
| 215 | + CompletionResponse(content=""), | |
| 110 | 216 | ] |
| 111 | 217 | ) |
| 112 | 218 | |
@@ -119,17 +225,22 @@ async def test_repeated_empty_responses_fail_honestly_after_one_retry( | ||
| 119 | 225 | |
| 120 | 226 | assert tool_event_names(run) == [] |
| 121 | 227 | assert run.response == ( |
| 122 | - "I didn't get a usable response from the model after retrying once. " | |
| 228 | + "I didn't get a usable response from the model after retrying 2 times. " | |
| 123 | 229 | "Please try again or switch to a different backend/model." |
| 124 | 230 | ) |
| 125 | - assert len(backend.invocations) == 2 | |
| 126 | - assert [entry.kind for entry in run.agent.last_turn_summary.workflow_timeline[-2:]] == [ | |
| 231 | + assert len(backend.invocations) == 3 | |
| 232 | + assert [entry.kind for entry in run.agent.last_turn_summary.workflow_timeline[-3:]] == [ | |
| 233 | + "repair_retry", | |
| 127 | 234 | "repair_retry", |
| 128 | 235 | "repair_fail", |
| 129 | 236 | ] |
| 130 | 237 | assert run.agent.last_turn_summary.workflow_timeline[-1].reason_code == ( |
| 131 | 238 | "empty_response_retry_exhausted" |
| 132 | 239 | ) |
| 240 | + assert run.agent.session.last_turn_transition_kind == "terminal" | |
| 241 | + assert run.agent.session.last_turn_transition_reason_code == ( | |
| 242 | + "empty_response_retry_exhausted" | |
| 243 | + ) | |
| 133 | 244 | |
| 134 | 245 | |
| 135 | 246 | @pytest.mark.asyncio |
tests/test_safeguard_services.pymodified@@ -16,8 +16,6 @@ from loader.runtime.safeguards import RuntimeSafeguards | ||
| 16 | 16 | from loader.runtime.semantic_rules.html_toc import ( |
| 17 | 17 | build_html_toc_edit_call_template, |
| 18 | 18 | build_html_toc_replacement_block, |
| 19 | - build_validated_html_toc_observation_reason, | |
| 20 | - build_verified_html_inventory_observation_reason, | |
| 21 | 19 | format_html_inventory_entry, |
| 22 | 20 | task_targets_html_toc, |
| 23 | 21 | validate_html_toc, |
@@ -214,88 +212,6 @@ def test_action_tracker_blocks_repeated_read_without_changes(tmp_path) -> None: | ||
| 214 | 212 | assert str(file_path) in reason |
| 215 | 213 | |
| 216 | 214 | |
| 217 | -def test_action_tracker_blocks_post_validation_html_rereads_until_new_mutation(tmp_path) -> None: | |
| 218 | - tracker = ActionTracker() | |
| 219 | - chapters = tmp_path / "chapters" | |
| 220 | - chapters.mkdir() | |
| 221 | - chapter_path = chapters / "01-introduction.html" | |
| 222 | - chapter_path.write_text("<h1>Chapter 1: Introduction to Fortran</h1>\n") | |
| 223 | - index_path = tmp_path / "index.html" | |
| 224 | - index_path.write_text( | |
| 225 | - '<ul class="chapter-list">\n' | |
| 226 | - ' <li><a href="chapters/01-introduction.html">Chapter 1: Introduction to Fortran</a></li>\n' | |
| 227 | - "</ul>\n" | |
| 228 | - ) | |
| 229 | - | |
| 230 | - tracker.note_validated_html_toc(str(index_path)) | |
| 231 | - | |
| 232 | - assert tracker.check_tool_call("read", {"file_path": str(index_path)}) == ( | |
| 233 | - True, | |
| 234 | - build_validated_html_toc_observation_reason(index_path), | |
| 235 | - ) | |
| 236 | - assert tracker.check_tool_call("read", {"file_path": str(chapter_path)}) == ( | |
| 237 | - True, | |
| 238 | - build_validated_html_toc_observation_reason(chapter_path), | |
| 239 | - ) | |
| 240 | - assert tracker.check_tool_call( | |
| 241 | - "glob", | |
| 242 | - {"path": str(chapters), "pattern": "*.html"}, | |
| 243 | - ) == ( | |
| 244 | - True, | |
| 245 | - build_validated_html_toc_observation_reason(chapters), | |
| 246 | - ) | |
| 247 | - assert tracker.check_tool_call( | |
| 248 | - "bash", | |
| 249 | - {"command": f"cat {index_path}"}, | |
| 250 | - ) == ( | |
| 251 | - True, | |
| 252 | - build_validated_html_toc_observation_reason(index_path), | |
| 253 | - ) | |
| 254 | - | |
| 255 | - tracker.record_tool_call( | |
| 256 | - "edit", | |
| 257 | - { | |
| 258 | - "file_path": str(index_path), | |
| 259 | - "old_string": "Chapter 1", | |
| 260 | - "new_string": "Chapter One", | |
| 261 | - }, | |
| 262 | - ) | |
| 263 | - | |
| 264 | - assert tracker.check_tool_call("read", {"file_path": str(index_path)}) == (False, "") | |
| 265 | - | |
| 266 | - | |
| 267 | -def test_action_tracker_blocks_chapter_rereads_after_verified_inventory(tmp_path) -> None: | |
| 268 | - tracker = ActionTracker() | |
| 269 | - chapters = tmp_path / "chapters" | |
| 270 | - chapters.mkdir() | |
| 271 | - chapter_path = chapters / "01-introduction.html" | |
| 272 | - chapter_path.write_text("<h1>Chapter 1: Introduction to Fortran</h1>\n") | |
| 273 | - index_path = tmp_path / "index.html" | |
| 274 | - index_path.write_text("<ul></ul>\n") | |
| 275 | - | |
| 276 | - tracker.note_verified_html_inventory(str(index_path)) | |
| 277 | - | |
| 278 | - assert tracker.check_tool_call("read", {"file_path": str(index_path)}) == (False, "") | |
| 279 | - assert tracker.check_tool_call("read", {"file_path": str(chapter_path)}) == ( | |
| 280 | - True, | |
| 281 | - build_verified_html_inventory_observation_reason(chapter_path), | |
| 282 | - ) | |
| 283 | - assert tracker.check_tool_call( | |
| 284 | - "glob", | |
| 285 | - {"path": str(chapters), "pattern": "*.html"}, | |
| 286 | - ) == ( | |
| 287 | - True, | |
| 288 | - build_verified_html_inventory_observation_reason(chapters), | |
| 289 | - ) | |
| 290 | - assert tracker.check_tool_call( | |
| 291 | - "bash", | |
| 292 | - {"command": f"head -20 {chapter_path}"}, | |
| 293 | - ) == ( | |
| 294 | - True, | |
| 295 | - build_verified_html_inventory_observation_reason(chapter_path), | |
| 296 | - ) | |
| 297 | - | |
| 298 | - | |
| 299 | 215 | def test_action_tracker_allows_one_interleaved_reread_without_changes(tmp_path) -> None: |
| 300 | 216 | tracker = ActionTracker() |
| 301 | 217 | index_path = tmp_path / "index.html" |
@@ -372,7 +288,7 @@ def test_action_tracker_blocks_second_target_index_reread_after_chapter_discover | ||
| 372 | 288 | is_duplicate, reason = tracker.check_tool_call("read", {"file_path": str(index_path)}) |
| 373 | 289 | |
| 374 | 290 | assert is_duplicate is True |
| 375 | - assert "reuse that file/title evidence" in reason | |
| 291 | + assert "reuse the earlier read result instead of rereading" in reason | |
| 376 | 292 | |
| 377 | 293 | |
| 378 | 294 | def test_action_tracker_blocks_repeated_chapter_directory_search_once_titles_are_known( |
@@ -383,14 +299,12 @@ def test_action_tracker_blocks_repeated_chapter_directory_search_once_titles_are | ||
| 383 | 299 | search_args = {"pattern": "*.html", "path": str(chapters)} |
| 384 | 300 | |
| 385 | 301 | tracker.record_tool_call("glob", search_args) |
| 386 | - tracker.record_tool_call("read", {"file_path": str(chapters / "01-introduction.html")}) | |
| 387 | - tracker.record_tool_call("read", {"file_path": str(chapters / "02-setup.html")}) | |
| 388 | - tracker.record_tool_call("read", {"file_path": str(chapters / "03-basics.html")}) | |
| 302 | + tracker.record_tool_call("glob", search_args) | |
| 389 | 303 | |
| 390 | 304 | is_duplicate, reason = tracker.check_tool_call("glob", search_args) |
| 391 | 305 | |
| 392 | 306 | assert is_duplicate is True |
| 393 | - assert "reuse that filename/title evidence" in reason | |
| 307 | + assert "reuse the earlier search result instead of rerunning it" in reason | |
| 394 | 308 | |
| 395 | 309 | |
| 396 | 310 | def test_action_tracker_allows_repeated_read_after_mutation(tmp_path) -> None: |
@@ -490,8 +404,8 @@ def test_pre_action_validator_blocks_index_edit_with_missing_chapter_href(tmp_pa | ||
| 490 | 404 | ) |
| 491 | 405 | |
| 492 | 406 | assert result.valid is False |
| 493 | - assert result.reason == "Edited TOC references chapter files that do not exist" | |
| 494 | - assert "chapters/05-input-output.html = Chapter 5: Input and Output" in result.suggestion | |
| 407 | + assert result.reason == "Edited HTML links point to files that do not exist" | |
| 408 | + assert "chapters/05-control-structures.html" in result.suggestion | |
| 495 | 409 | |
| 496 | 410 | |
| 497 | 411 | def test_pre_action_validator_blocks_index_edit_with_title_mismatch(tmp_path) -> None: |
@@ -512,12 +426,111 @@ def test_pre_action_validator_blocks_index_edit_with_title_mismatch(tmp_path) -> | ||
| 512 | 426 | }, |
| 513 | 427 | ) |
| 514 | 428 | |
| 429 | + assert result.valid is True | |
| 430 | + | |
| 431 | + | |
| 432 | +def test_pre_action_validator_allows_chapter_write_with_future_target_declared_by_index( | |
| 433 | + tmp_path: Path, | |
| 434 | +) -> None: | |
| 435 | + validator = PreActionValidator() | |
| 436 | + guide = tmp_path / "guide" | |
| 437 | + chapters = guide / "chapters" | |
| 438 | + chapters.mkdir(parents=True) | |
| 439 | + (guide / "index.html").write_text( | |
| 440 | + "\n".join( | |
| 441 | + [ | |
| 442 | + '<a href="chapters/introduction.html">Introduction</a>', | |
| 443 | + '<a href="chapters/installation.html">Installation</a>', | |
| 444 | + "", | |
| 445 | + ] | |
| 446 | + ) | |
| 447 | + ) | |
| 448 | + | |
| 449 | + result = validator.validate( | |
| 450 | + "write", | |
| 451 | + { | |
| 452 | + "file_path": str(chapters / "introduction.html"), | |
| 453 | + "content": '<a href="installation.html">Next</a>\n', | |
| 454 | + }, | |
| 455 | + ) | |
| 456 | + | |
| 457 | + assert result.valid is True | |
| 458 | + | |
| 459 | + | |
| 460 | +def test_pre_action_validator_blocks_chapter_write_with_undeclared_missing_sibling( | |
| 461 | + tmp_path: Path, | |
| 462 | +) -> None: | |
| 463 | + validator = PreActionValidator() | |
| 464 | + guide = tmp_path / "guide" | |
| 465 | + chapters = guide / "chapters" | |
| 466 | + chapters.mkdir(parents=True) | |
| 467 | + (guide / "index.html").write_text( | |
| 468 | + "\n".join( | |
| 469 | + [ | |
| 470 | + '<a href="chapters/introduction.html">Introduction</a>', | |
| 471 | + '<a href="chapters/installation.html">Installation</a>', | |
| 472 | + '<a href="chapters/configuration.html">Configuration</a>', | |
| 473 | + '<a href="chapters/usage.html">Usage</a>', | |
| 474 | + '<a href="chapters/troubleshooting.html">Troubleshooting</a>', | |
| 475 | + "", | |
| 476 | + ] | |
| 477 | + ) | |
| 478 | + ) | |
| 479 | + (chapters / "introduction.html").write_text('<a href="installation.html">Next</a>\n') | |
| 480 | + (chapters / "installation.html").write_text('<a href="configuration.html">Next</a>\n') | |
| 481 | + (chapters / "configuration.html").write_text('<a href="usage.html">Next</a>\n') | |
| 482 | + | |
| 483 | + result = validator.validate( | |
| 484 | + "write", | |
| 485 | + { | |
| 486 | + "file_path": str(chapters / "usage.html"), | |
| 487 | + "content": '<a href="advanced.html">Next</a>\n', | |
| 488 | + }, | |
| 489 | + ) | |
| 490 | + | |
| 515 | 491 | assert result.valid is False |
| 516 | - assert result.reason == "Edited TOC labels do not match the linked chapter titles" | |
| 517 | 492 | assert ( |
| 518 | - "chapters/12-troubleshooting-tips.html = Chapter 12: Troubleshooting and Tips" | |
| 519 | - in result.suggestion | |
| 493 | + result.reason | |
| 494 | + == "HTML page introduces new local targets outside the current declared artifact set" | |
| 520 | 495 | ) |
| 496 | + assert "advanced.html" in result.suggestion | |
| 497 | + | |
| 498 | + | |
| 499 | +def test_pre_action_validator_blocks_missing_numbered_read_with_existing_sibling( | |
| 500 | + tmp_path: Path, | |
| 501 | +) -> None: | |
| 502 | + validator = PreActionValidator() | |
| 503 | + chapters = tmp_path / "chapters" | |
| 504 | + chapters.mkdir() | |
| 505 | + (chapters / "01-getting-started.html").write_text("<h1>Getting Started</h1>\n") | |
| 506 | + | |
| 507 | + result = validator.validate( | |
| 508 | + "read", | |
| 509 | + {"file_path": str(chapters / "01-introduction.html")}, | |
| 510 | + ) | |
| 511 | + | |
| 512 | + assert result.valid is False | |
| 513 | + assert result.reason == "Read target conflicts with an existing numbered sibling" | |
| 514 | + assert "01-getting-started.html" in result.suggestion | |
| 515 | + | |
| 516 | + | |
| 517 | +def test_pre_action_validator_blocks_new_numbered_sibling_drift(tmp_path) -> None: | |
| 518 | + validator = PreActionValidator() | |
| 519 | + chapters = tmp_path / "chapters" | |
| 520 | + chapters.mkdir() | |
| 521 | + (chapters / "01-getting-started.html").write_text("<h1>Getting Started</h1>\n") | |
| 522 | + | |
| 523 | + result = validator.validate( | |
| 524 | + "write", | |
| 525 | + { | |
| 526 | + "file_path": str(chapters / "01-intro.html"), | |
| 527 | + "content": "<h1>Intro</h1>\n", | |
| 528 | + }, | |
| 529 | + ) | |
| 530 | + | |
| 531 | + assert result.valid is False | |
| 532 | + assert result.reason == "New file conflicts with an existing numbered sibling" | |
| 533 | + assert "01-getting-started.html" in result.suggestion | |
| 521 | 534 | |
| 522 | 535 | |
| 523 | 536 | def test_format_html_inventory_entry_handles_tmp_alias_paths() -> None: |
tests/test_tool_batch_policies.pymodified@@ -373,8 +373,6 @@ async def test_tool_batch_recovery_controller_includes_known_state_for_missing_f | ||
| 373 | 373 | assert "Prefer edit/write/patch on the target file" in follow_up.content |
| 374 | 374 | assert "04-variables.html" in follow_up.content |
| 375 | 375 | assert "02-basic-syntax.html -> 02-setup.html" in follow_up.content |
| 376 | - assert "02-setup.html = Chapter 2: Setting Up Fortran" in follow_up.content | |
| 377 | - assert "/Users/mfwolffe/Loader/guides/fortran/index.html" in follow_up.content | |
| 378 | 376 | assert any(event.type == "recovery" for event in events) |
| 379 | 377 | |
| 380 | 378 | |
@@ -430,7 +428,6 @@ async def test_tool_batch_recovery_controller_suggests_known_sibling_files( | ||
| 430 | 428 | assert follow_up is not None |
| 431 | 429 | assert "## LIKELY FILE CANDIDATES" in follow_up.content |
| 432 | 430 | assert "`04-variables.html`" in follow_up.content |
| 433 | - assert "Chapter 4: Variables and Data Types" in follow_up.content | |
| 434 | 431 | assert "instead of retrying the missing path" in follow_up.content |
| 435 | 432 | |
| 436 | 433 | |
@@ -506,17 +503,79 @@ async def test_tool_batch_recovery_controller_includes_current_html_target_excer | ||
| 506 | 503 | |
| 507 | 504 | assert follow_up is not None |
| 508 | 505 | assert "## CURRENT TARGET EXCERPT" in follow_up.content |
| 509 | - assert "Verified chapter inventory:" in follow_up.content | |
| 510 | - assert "<ul class=\"chapter-list\">" in follow_up.content | |
| 511 | - assert "chapters/02-setup.html = Chapter 2: Setting Up Your Environment" in follow_up.content | |
| 512 | - assert "Suggested replacement block:" in follow_up.content | |
| 513 | - assert '<li><a href="chapters/02-setup.html">Chapter 2: Setting Up Your Environment</a></li>' in follow_up.content | |
| 514 | - assert "Exact edit guidance:" in follow_up.content | |
| 515 | - assert "old_string: use the Current TOC block above exactly" in follow_up.content | |
| 516 | - assert "new_string: use the Suggested replacement block above exactly" in follow_up.content | |
| 517 | - assert "Do not rewrite the whole file." in follow_up.content | |
| 518 | - assert "Suggested edit call:" in follow_up.content | |
| 519 | - assert 'old_string="""' in follow_up.content | |
| 506 | + assert "- Target file:" in follow_up.content | |
| 507 | + assert "index.html" in follow_up.content | |
| 508 | + assert ( | |
| 509 | + "Closest on-disk block to the requested patch:" in follow_up.content | |
| 510 | + or "Current file contents near the requested patch location:" in follow_up.content | |
| 511 | + ) | |
| 512 | + assert '1 | <h2>Table of Contents</h2>' in follow_up.content | |
| 513 | + assert ( | |
| 514 | + '3 | <li><a href="chapters/01-introduction.html">Chapter 1: Introduction to Fortran</a></li>' | |
| 515 | + in follow_up.content | |
| 516 | + ) | |
| 517 | + assert "Use the exact on-disk text above" in follow_up.content | |
| 518 | + assert "Verified chapter inventory:" not in follow_up.content | |
| 519 | + | |
| 520 | + | |
| 521 | +@pytest.mark.asyncio | |
| 522 | +async def test_tool_batch_recovery_controller_includes_current_target_excerpt_for_edit_mismatch( | |
| 523 | + temp_dir: Path, | |
| 524 | +) -> None: | |
| 525 | + async def assess_confidence(tool_name: str, tool_args: dict, context: str) -> ConfidenceAssessment: | |
| 526 | + raise AssertionError("Confidence should not run here") | |
| 527 | + | |
| 528 | + async def verify_action(tool_name: str, tool_args: dict, result: str, expected: str = "") -> ActionVerification: | |
| 529 | + raise AssertionError("Verification should not run here") | |
| 530 | + | |
| 531 | + guide = temp_dir / "guide.md" | |
| 532 | + guide.write_text( | |
| 533 | + "# Loader Guide\n" | |
| 534 | + "\n" | |
| 535 | + "## Overview\n" | |
| 536 | + "Loader helps agentic coding workflows.\n" | |
| 537 | + "\n" | |
| 538 | + "## Status\n" | |
| 539 | + "The runtime is stable.\n" | |
| 540 | + ) | |
| 541 | + | |
| 542 | + context = build_context( | |
| 543 | + temp_dir=temp_dir, | |
| 544 | + messages=[], | |
| 545 | + assess_confidence=assess_confidence, | |
| 546 | + verify_action=verify_action, | |
| 547 | + ) | |
| 548 | + context.session.current_task = "Update guide.md to mention the runtime is resilient." | |
| 549 | + controller = ToolBatchRecoveryController(context) | |
| 550 | + tool_call = ToolCall( | |
| 551 | + id="edit-guide", | |
| 552 | + name="edit", | |
| 553 | + arguments={ | |
| 554 | + "file_path": str(guide), | |
| 555 | + "old_string": "## Runtime\nThe runtime is stable.\n", | |
| 556 | + "new_string": "## Runtime\nThe runtime is resilient.\n", | |
| 557 | + }, | |
| 558 | + ) | |
| 559 | + outcome = tool_outcome( | |
| 560 | + tool_call=tool_call, | |
| 561 | + output="old_string not found in file. Make sure it matches exactly.", | |
| 562 | + is_error=True, | |
| 563 | + ) | |
| 564 | + | |
| 565 | + follow_up = await controller.build_follow_up( | |
| 566 | + tool_call=tool_call, | |
| 567 | + outcome=outcome, | |
| 568 | + emit=lambda event: _noop_emit(event), | |
| 569 | + ) | |
| 570 | + | |
| 571 | + assert follow_up is not None | |
| 572 | + assert "## CURRENT TARGET EXCERPT" in follow_up.content | |
| 573 | + assert "- Target file:" in follow_up.content | |
| 574 | + assert "guide.md" in follow_up.content | |
| 575 | + assert "Closest on-disk block to the requested edit:" in follow_up.content | |
| 576 | + assert "6 | ## Status" in follow_up.content | |
| 577 | + assert "7 | The runtime is stable." in follow_up.content | |
| 578 | + assert "replace the containing block in one edit" in follow_up.content | |
| 520 | 579 | |
| 521 | 580 | |
| 522 | 581 | @pytest.mark.asyncio |
@@ -610,6 +669,94 @@ async def test_tool_batch_recovery_controller_scopes_known_state_to_active_targe | ||
| 610 | 669 | ) not in follow_up.content |
| 611 | 670 | |
| 612 | 671 | |
| 672 | +@pytest.mark.asyncio | |
| 673 | +async def test_tool_batch_recovery_controller_prioritizes_active_verification_repair_target( | |
| 674 | + temp_dir: Path, | |
| 675 | +) -> None: | |
| 676 | + async def assess_confidence( | |
| 677 | + tool_name: str, | |
| 678 | + tool_args: dict, | |
| 679 | + context: str, | |
| 680 | + ) -> ConfidenceAssessment: | |
| 681 | + raise AssertionError("Confidence should not run here") | |
| 682 | + | |
| 683 | + async def verify_action( | |
| 684 | + tool_name: str, | |
| 685 | + tool_args: dict, | |
| 686 | + result: str, | |
| 687 | + expected: str = "", | |
| 688 | + ) -> ActionVerification: | |
| 689 | + raise AssertionError("Verification should not run here") | |
| 690 | + | |
| 691 | + nginx_root = temp_dir / "Loader" / "guides" / "nginx" | |
| 692 | + chapters = nginx_root / "chapters" | |
| 693 | + chapters.mkdir(parents=True) | |
| 694 | + index = nginx_root / "index.html" | |
| 695 | + index.write_text( | |
| 696 | + "<ul>\n" | |
| 697 | + ' <li><a href="chapters/01-introduction.html">Introduction</a></li>\n' | |
| 698 | + "</ul>\n" | |
| 699 | + ) | |
| 700 | + (chapters / "01-getting-started.html").write_text("<h1>Getting Started</h1>\n") | |
| 701 | + | |
| 702 | + repair_message = ( | |
| 703 | + "[DEFINITION OF DONE CHECK FAILED]\n" | |
| 704 | + "Repair focus:\n" | |
| 705 | + f"- Fix the broken local reference `chapters/01-introduction.html` in `{index}`.\n" | |
| 706 | + f"- Immediate next step: edit `{index}`.\n" | |
| 707 | + f"- If the broken reference should remain, create `{chapters / '01-introduction.html'}`; " | |
| 708 | + "otherwise remove or replace `chapters/01-introduction.html`.\n" | |
| 709 | + "- Do not reread unrelated reference materials or restart discovery while this " | |
| 710 | + "concrete repair target is unresolved.\n" | |
| 711 | + ) | |
| 712 | + | |
| 713 | + context = build_context( | |
| 714 | + temp_dir=temp_dir, | |
| 715 | + messages=[ | |
| 716 | + Message(role=Role.USER, content=repair_message), | |
| 717 | + Message( | |
| 718 | + role=Role.TOOL, | |
| 719 | + content=( | |
| 720 | + "Observation [glob]: Result: " | |
| 721 | + f"{chapters / '01-getting-started.html'}" | |
| 722 | + ), | |
| 723 | + ), | |
| 724 | + ], | |
| 725 | + assess_confidence=assess_confidence, | |
| 726 | + verify_action=verify_action, | |
| 727 | + ) | |
| 728 | + context.session.current_task = ( # type: ignore[attr-defined] | |
| 729 | + "Have a look at ~/Loader/guides/fortran and chapters/ within. Get a feel " | |
| 730 | + "for the structure and cadence of the guide. We are going to make an all " | |
| 731 | + "new equally thorough guide on how to use the nginx tool." | |
| 732 | + ) | |
| 733 | + controller = ToolBatchRecoveryController(context) | |
| 734 | + tool_call = ToolCall( | |
| 735 | + id="read-bad-path", | |
| 736 | + name="read", | |
| 737 | + arguments={"path": "~/nginx-guide/chapter1.html"}, | |
| 738 | + ) | |
| 739 | + outcome = tool_outcome( | |
| 740 | + tool_call=tool_call, | |
| 741 | + output="File not found: ~/nginx-guide/chapter1.html", | |
| 742 | + is_error=True, | |
| 743 | + ) | |
| 744 | + | |
| 745 | + follow_up = await controller.build_follow_up( | |
| 746 | + tool_call=tool_call, | |
| 747 | + outcome=outcome, | |
| 748 | + emit=lambda event: _noop_emit(event), | |
| 749 | + ) | |
| 750 | + | |
| 751 | + assert follow_up is not None | |
| 752 | + assert "## ACTIVE REPAIR TARGET" in follow_up.content | |
| 753 | + assert str(index) in follow_up.content | |
| 754 | + assert "chapters/01-introduction.html" in follow_up.content | |
| 755 | + assert "Do not go back to the original reference guide" in follow_up.content | |
| 756 | + assert "Current task: Have a look at ~/Loader/guides/fortran" not in follow_up.content | |
| 757 | + assert "~/nginx-guide/chapter1.html" in follow_up.content | |
| 758 | + | |
| 759 | + | |
| 613 | 760 | @pytest.mark.asyncio |
| 614 | 761 | async def test_tool_batch_recovery_controller_reuses_context_for_related_missing_files( |
| 615 | 762 | temp_dir: Path, |
@@ -671,6 +818,71 @@ async def test_tool_batch_recovery_controller_reuses_context_for_related_missing | ||
| 671 | 818 | assert "02-basic-syntax.html" in follow_up.content |
| 672 | 819 | |
| 673 | 820 | |
| 821 | +@pytest.mark.asyncio | |
| 822 | +async def test_tool_batch_recovery_controller_uses_generic_loop_guidance( | |
| 823 | + temp_dir: Path, | |
| 824 | +) -> None: | |
| 825 | + async def assess_confidence( | |
| 826 | + tool_name: str, | |
| 827 | + tool_args: dict, | |
| 828 | + context: str, | |
| 829 | + ) -> ConfidenceAssessment: | |
| 830 | + raise AssertionError("Confidence should not run here") | |
| 831 | + | |
| 832 | + async def verify_action( | |
| 833 | + tool_name: str, | |
| 834 | + tool_args: dict, | |
| 835 | + result: str, | |
| 836 | + expected: str = "", | |
| 837 | + ) -> ActionVerification: | |
| 838 | + raise AssertionError("Verification should not run here") | |
| 839 | + | |
| 840 | + existing = RecoveryContext( | |
| 841 | + original_tool="read", | |
| 842 | + original_args={"file_path": "~/Loader/guides/nginx/chapters/01-introduction.html"}, | |
| 843 | + max_retries=3, | |
| 844 | + ) | |
| 845 | + existing.add_attempt( | |
| 846 | + "read", | |
| 847 | + {"file_path": "~/Loader/guides/nginx/chapters/01-introduction.html"}, | |
| 848 | + "File not found: ~/Loader/guides/nginx/chapters/01-introduction.html", | |
| 849 | + ) | |
| 850 | + context = build_context( | |
| 851 | + temp_dir=temp_dir, | |
| 852 | + messages=[], | |
| 853 | + assess_confidence=assess_confidence, | |
| 854 | + verify_action=verify_action, | |
| 855 | + recovery_context=existing, | |
| 856 | + ) | |
| 857 | + controller = ToolBatchRecoveryController(context) | |
| 858 | + tool_call = ToolCall( | |
| 859 | + id="read-missing-repeat", | |
| 860 | + name="read", | |
| 861 | + arguments={"file_path": "~/Loader/guides/nginx/chapters/01-introduction.html"}, | |
| 862 | + ) | |
| 863 | + outcome = tool_outcome( | |
| 864 | + tool_call=tool_call, | |
| 865 | + output="File not found: ~/Loader/guides/nginx/chapters/01-introduction.html", | |
| 866 | + is_error=True, | |
| 867 | + ) | |
| 868 | + events: list[AgentEvent] = [] | |
| 869 | + | |
| 870 | + async def emit(event: AgentEvent) -> None: | |
| 871 | + events.append(event) | |
| 872 | + | |
| 873 | + follow_up = await controller.build_follow_up( | |
| 874 | + tool_call=tool_call, | |
| 875 | + outcome=outcome, | |
| 876 | + emit=emit, | |
| 877 | + ) | |
| 878 | + | |
| 879 | + assert follow_up is not None | |
| 880 | + assert any(event.type == "error" for event in events) | |
| 881 | + error_event = next(event for event in events if event.type == "error") | |
| 882 | + assert "read a config file first" not in error_event.content | |
| 883 | + assert "verify the current result" in error_event.content | |
| 884 | + | |
| 885 | + | |
| 674 | 886 | @pytest.mark.asyncio |
| 675 | 887 | async def test_tool_batch_recovery_controller_resets_context_for_unrelated_failures( |
| 676 | 888 | temp_dir: Path, |
tests/test_tool_batches.pymodified@@ -27,7 +27,12 @@ from loader.runtime.reasoning_types import ( | ||
| 27 | 27 | ConfidenceLevel, |
| 28 | 28 | ) |
| 29 | 29 | from loader.runtime.recovery import RecoveryContext |
| 30 | -from loader.runtime.tool_batches import ToolBatchRunner | |
| 30 | +from loader.runtime.tool_batches import ( | |
| 31 | + ToolBatchRunner, | |
| 32 | +) | |
| 33 | +from loader.runtime.tool_batches import ( | |
| 34 | + _should_prioritize_missing_artifact as tool_batches_should_prioritize_missing_artifact, | |
| 35 | +) | |
| 31 | 36 | from loader.runtime.workflow import sync_todos_to_definition_of_done |
| 32 | 37 | from loader.tools.base import ToolResult as RegistryToolResult |
| 33 | 38 | from loader.tools.base import create_default_registry |
@@ -610,6 +615,26 @@ async def test_tool_batch_runner_queues_duplicate_observation_nudge( | ||
| 610 | 615 | verify_action=verify_action, |
| 611 | 616 | auto_recover=False, |
| 612 | 617 | ) |
| 618 | + (temp_dir / "chapters").mkdir() | |
| 619 | + (temp_dir / "index.html").write_text("<ul></ul>\n") | |
| 620 | + (temp_dir / "chapters" / "01-introduction.html").write_text("<h1>Intro</h1>\n") | |
| 621 | + (temp_dir / "chapters" / "02-setup.html").write_text("<h1>Setup</h1>\n") | |
| 622 | + (temp_dir / "chapters" / "03-basics.html").write_text("<h1>Basics</h1>\n") | |
| 623 | + implementation_plan = temp_dir / "implementation.md" | |
| 624 | + implementation_plan.write_text( | |
| 625 | + "\n".join( | |
| 626 | + [ | |
| 627 | + "# Implementation Plan", | |
| 628 | + "", | |
| 629 | + "## File Changes", | |
| 630 | + f"- `{temp_dir / 'index.html'}`", | |
| 631 | + f"- `{temp_dir / 'chapters' / '01-introduction.html'}`", | |
| 632 | + f"- `{temp_dir / 'chapters' / '02-setup.html'}`", | |
| 633 | + f"- `{temp_dir / 'chapters' / '03-basics.html'}`", | |
| 634 | + f"- `{temp_dir / 'chapters' / '04-variables.html'}`", | |
| 635 | + ] | |
| 636 | + ) | |
| 637 | + ) | |
| 613 | 638 | context.session.current_task = ( |
| 614 | 639 | f"Update {temp_dir / 'index.html'} with the right chapter links." |
| 615 | 640 | ) |
@@ -644,13 +669,16 @@ async def test_tool_batch_runner_queues_duplicate_observation_nudge( | ||
| 644 | 669 | ) |
| 645 | 670 | |
| 646 | 671 | summary = TurnSummary(final_response="") |
| 672 | + dod = create_definition_of_done("Fix the chapter links") | |
| 673 | + dod.implementation_plan = str(implementation_plan) | |
| 674 | + dod.pending_items.append("Create the remaining chapter files") | |
| 647 | 675 | await runner.execute_batch( |
| 648 | 676 | tool_calls=[tool_call], |
| 649 | 677 | tool_source="assistant", |
| 650 | 678 | pending_tool_calls_seen=set(), |
| 651 | 679 | emit=_noop_emit, |
| 652 | 680 | summary=summary, |
| 653 | - dod=create_definition_of_done("Fix the chapter links"), | |
| 681 | + dod=dod, | |
| 654 | 682 | executor=executor, # type: ignore[arg-type] |
| 655 | 683 | on_confirmation=None, |
| 656 | 684 | on_user_question=None, |
@@ -660,8 +688,128 @@ async def test_tool_batch_runner_queues_duplicate_observation_nudge( | ||
| 660 | 688 | |
| 661 | 689 | assert len(queued_messages) == 1 |
| 662 | 690 | assert "Reuse the earlier observation instead of repeating it." in queued_messages[0] |
| 663 | - assert "01-introduction.html = Chapter 1: Introduction to Fortran" in queued_messages[0] | |
| 664 | - assert "index.html" in queued_messages[0] | |
| 691 | + assert "Continue with the next pending item: `Create the remaining chapter files`." in queued_messages[0] | |
| 692 | + assert "Resume by creating `04-variables.html` now." in queued_messages[0] | |
| 693 | + assert f"Prefer one `write` call for `{temp_dir / 'chapters' / '04-variables.html'}` instead of more rereads." in queued_messages[0] | |
| 694 | + | |
| 695 | + | |
| 696 | +@pytest.mark.asyncio | |
| 697 | +async def test_tool_batch_runner_todo_write_does_not_regress_completed_file_todo( | |
| 698 | + temp_dir: Path, | |
| 699 | +) -> None: | |
| 700 | + async def assess_confidence( | |
| 701 | + tool_name: str, | |
| 702 | + tool_args: dict, | |
| 703 | + context: str, | |
| 704 | + ) -> ConfidenceAssessment: | |
| 705 | + raise AssertionError("Confidence scoring should not run for this scenario") | |
| 706 | + | |
| 707 | + async def verify_action( | |
| 708 | + tool_name: str, | |
| 709 | + tool_args: dict, | |
| 710 | + result: str, | |
| 711 | + expected: str = "", | |
| 712 | + ) -> ActionVerification: | |
| 713 | + raise AssertionError("Verification should not run for this scenario") | |
| 714 | + | |
| 715 | + context = build_context( | |
| 716 | + temp_dir=temp_dir, | |
| 717 | + messages=[], | |
| 718 | + safeguards=FakeSafeguards(), | |
| 719 | + assess_confidence=assess_confidence, | |
| 720 | + verify_action=verify_action, | |
| 721 | + auto_recover=False, | |
| 722 | + ) | |
| 723 | + runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir)) | |
| 724 | + dod = create_definition_of_done("Create a multi-file nginx guide.") | |
| 725 | + sync_todos_to_definition_of_done( | |
| 726 | + dod, | |
| 727 | + [ | |
| 728 | + { | |
| 729 | + "content": "Create 03-first-website.html", | |
| 730 | + "active_form": "Creating 03-first-website.html", | |
| 731 | + "status": "pending", | |
| 732 | + }, | |
| 733 | + { | |
| 734 | + "content": "Create 04-configuration-basics.html", | |
| 735 | + "active_form": "Creating 04-configuration-basics.html", | |
| 736 | + "status": "pending", | |
| 737 | + }, | |
| 738 | + ], | |
| 739 | + ) | |
| 740 | + | |
| 741 | + chapter_path = temp_dir / "guides" / "nginx" / "chapters" / "03-first-website.html" | |
| 742 | + chapter_path.parent.mkdir(parents=True) | |
| 743 | + write_call = ToolCall( | |
| 744 | + id="write-ch3", | |
| 745 | + name="write", | |
| 746 | + arguments={"file_path": str(chapter_path), "content": "<html></html>\n"}, | |
| 747 | + ) | |
| 748 | + stale_todo_call = ToolCall( | |
| 749 | + id="todo-stale", | |
| 750 | + name="TodoWrite", | |
| 751 | + arguments={ | |
| 752 | + "todos": [ | |
| 753 | + { | |
| 754 | + "content": "Create 03-first-website.html", | |
| 755 | + "active_form": "Creating 03-first-website.html", | |
| 756 | + "status": "pending", | |
| 757 | + }, | |
| 758 | + { | |
| 759 | + "content": "Create 04-configuration-basics.html", | |
| 760 | + "active_form": "Creating 04-configuration-basics.html", | |
| 761 | + "status": "pending", | |
| 762 | + }, | |
| 763 | + ] | |
| 764 | + }, | |
| 765 | + ) | |
| 766 | + executor = FakeExecutor( | |
| 767 | + [ | |
| 768 | + tool_outcome( | |
| 769 | + tool_call=write_call, | |
| 770 | + output=f"Successfully wrote {chapter_path}", | |
| 771 | + is_error=False, | |
| 772 | + ), | |
| 773 | + tool_outcome( | |
| 774 | + tool_call=stale_todo_call, | |
| 775 | + output="Todos updated", | |
| 776 | + is_error=False, | |
| 777 | + metadata={ | |
| 778 | + "new_todos": [ | |
| 779 | + { | |
| 780 | + "content": "Create 03-first-website.html", | |
| 781 | + "active_form": "Creating 03-first-website.html", | |
| 782 | + "status": "pending", | |
| 783 | + }, | |
| 784 | + { | |
| 785 | + "content": "Create 04-configuration-basics.html", | |
| 786 | + "active_form": "Creating 04-configuration-basics.html", | |
| 787 | + "status": "pending", | |
| 788 | + }, | |
| 789 | + ] | |
| 790 | + }, | |
| 791 | + ), | |
| 792 | + ] | |
| 793 | + ) | |
| 794 | + | |
| 795 | + summary = TurnSummary(final_response="") | |
| 796 | + await runner.execute_batch( | |
| 797 | + tool_calls=[write_call, stale_todo_call], | |
| 798 | + tool_source="assistant", | |
| 799 | + pending_tool_calls_seen=set(), | |
| 800 | + emit=_noop_emit, | |
| 801 | + summary=summary, | |
| 802 | + dod=dod, | |
| 803 | + executor=executor, # type: ignore[arg-type] | |
| 804 | + on_confirmation=None, | |
| 805 | + on_user_question=None, | |
| 806 | + emit_confirmation=None, | |
| 807 | + consecutive_errors=0, | |
| 808 | + ) | |
| 809 | + | |
| 810 | + assert "Create 03-first-website.html" in dod.completed_items | |
| 811 | + assert "Create 03-first-website.html" not in dod.pending_items | |
| 812 | + assert "Create 04-configuration-basics.html" in dod.pending_items | |
| 665 | 813 | |
| 666 | 814 | |
| 667 | 815 | @pytest.mark.asyncio |
@@ -742,15 +890,9 @@ async def test_tool_batch_runner_proactively_queues_verified_html_inventory( | ||
| 742 | 890 | consecutive_errors=0, |
| 743 | 891 | ) |
| 744 | 892 | |
| 745 | - assert len(queued_messages) == 1 | |
| 746 | - assert "verified sibling inventory" in queued_messages[0] | |
| 747 | - assert "chapters/01-introduction.html = Chapter 1: Introduction to Fortran" in queued_messages[0] | |
| 748 | - assert str(temp_dir / "index.html") in queued_messages[0] | |
| 893 | + assert queued_messages == [] | |
| 749 | 894 | assert len(summary.tool_result_messages) == 1 |
| 750 | - assert ( | |
| 751 | - "Verified chapter inventory: chapters/01-introduction.html = Chapter 1: Introduction to Fortran" | |
| 752 | - in summary.tool_result_messages[0].content | |
| 753 | - ) | |
| 895 | + assert "Verified chapter inventory:" not in summary.tool_result_messages[0].content | |
| 754 | 896 | |
| 755 | 897 | |
| 756 | 898 | @pytest.mark.asyncio |
@@ -845,15 +987,11 @@ async def test_tool_batch_runner_marks_validated_html_toc_completion_after_succe | ||
| 845 | 987 | consecutive_errors=0, |
| 846 | 988 | ) |
| 847 | 989 | |
| 848 | - assert any( | |
| 849 | - "Semantic verification preview: validated 2 toc links in index.html" | |
| 850 | - in message.content | |
| 990 | + assert all( | |
| 991 | + "Semantic verification preview:" not in message.content | |
| 851 | 992 | for message in summary.tool_result_messages |
| 852 | 993 | ) |
| 853 | - assert len(queued_messages) == 1 | |
| 854 | - assert "already satisfies the verified link/title constraints" in queued_messages[0] | |
| 855 | - assert f"`{index_path}`" in queued_messages[0] | |
| 856 | - assert f"`{chapters}`" in queued_messages[0] | |
| 994 | + assert queued_messages == [] | |
| 857 | 995 | |
| 858 | 996 | |
| 859 | 997 | @pytest.mark.asyncio |
@@ -1166,7 +1304,7 @@ async def test_tool_batch_runner_duplicate_reference_read_prefers_next_pending_t | ||
| 1166 | 1304 | |
| 1167 | 1305 | |
| 1168 | 1306 | @pytest.mark.asyncio |
| 1169 | -async def test_tool_batch_runner_observation_handoff_pushes_mutation_step( | |
| 1307 | +async def test_tool_batch_runner_duplicate_read_ignores_unplanned_expansion_after_plan_complete( | |
| 1170 | 1308 | temp_dir: Path, |
| 1171 | 1309 | ) -> None: |
| 1172 | 1310 | async def assess_confidence( |
@@ -1174,7 +1312,7 @@ async def test_tool_batch_runner_observation_handoff_pushes_mutation_step( | ||
| 1174 | 1312 | tool_args: dict, |
| 1175 | 1313 | context: str, |
| 1176 | 1314 | ) -> ConfidenceAssessment: |
| 1177 | - raise AssertionError("Confidence scoring should be disabled in this scenario") | |
| 1315 | + raise AssertionError("Confidence scoring should not run for this scenario") | |
| 1178 | 1316 | |
| 1179 | 1317 | async def verify_action( |
| 1180 | 1318 | tool_name: str, |
@@ -1184,9 +1322,33 @@ async def test_tool_batch_runner_observation_handoff_pushes_mutation_step( | ||
| 1184 | 1322 | ) -> ActionVerification: |
| 1185 | 1323 | raise AssertionError("Verification should not run for this scenario") |
| 1186 | 1324 | |
| 1187 | - reference = temp_dir / "fortran" / "index.html" | |
| 1188 | - reference.parent.mkdir(parents=True) | |
| 1189 | - reference.write_text("<h1>Fortran Beginner's Guide</h1>\n") | |
| 1325 | + guide_root = temp_dir / "guides" / "nginx" | |
| 1326 | + chapters = guide_root / "chapters" | |
| 1327 | + guide_root.mkdir(parents=True) | |
| 1328 | + chapters.mkdir() | |
| 1329 | + index_path = guide_root / "index.html" | |
| 1330 | + chapter_one = chapters / "01-getting-started.html" | |
| 1331 | + chapter_two = chapters / "02-installation.html" | |
| 1332 | + index_path.write_text("<html></html>\n") | |
| 1333 | + chapter_one.write_text("<h1>One</h1>\n") | |
| 1334 | + chapter_two.write_text("<h1>Two</h1>\n") | |
| 1335 | + | |
| 1336 | + implementation_plan = temp_dir / "implementation.md" | |
| 1337 | + implementation_plan.write_text( | |
| 1338 | + "\n".join( | |
| 1339 | + [ | |
| 1340 | + "# Implementation Plan", | |
| 1341 | + "", | |
| 1342 | + "## File Changes", | |
| 1343 | + f"- `{guide_root}/`", | |
| 1344 | + f"- `{chapters}/`", | |
| 1345 | + f"- `{index_path}`", | |
| 1346 | + f"- `{chapter_one}`", | |
| 1347 | + f"- `{chapter_two}`", | |
| 1348 | + "", | |
| 1349 | + ] | |
| 1350 | + ) | |
| 1351 | + ) | |
| 1190 | 1352 | |
| 1191 | 1353 | context = build_context( |
| 1192 | 1354 | temp_dir=temp_dir, |
@@ -1200,32 +1362,36 @@ async def test_tool_batch_runner_observation_handoff_pushes_mutation_step( | ||
| 1200 | 1362 | context.queue_steering_message_callback = queued_messages.append |
| 1201 | 1363 | runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir)) |
| 1202 | 1364 | dod = create_definition_of_done("Create a multi-file nginx guide.") |
| 1203 | - sync_todos_to_definition_of_done( | |
| 1204 | - dod, | |
| 1205 | - [ | |
| 1206 | - { | |
| 1207 | - "content": "Examine the existing Fortran guide structure to understand the cadence and format", | |
| 1208 | - "active_form": "Working on: Examine the existing Fortran guide structure to understand the cadence and format", | |
| 1209 | - "status": "pending", | |
| 1210 | - }, | |
| 1211 | - { | |
| 1212 | - "content": "Create the nginx index.html file", | |
| 1213 | - "active_form": "Working on: Create the nginx index.html file", | |
| 1214 | - "status": "pending", | |
| 1215 | - }, | |
| 1216 | - ], | |
| 1217 | - ) | |
| 1365 | + dod.implementation_plan = str(implementation_plan) | |
| 1366 | + dod.pending_items = [ | |
| 1367 | + "Create 07-performance-tuning.html", | |
| 1368 | + "Verify all guide files are linked and complete", | |
| 1369 | + "Complete the requested work", | |
| 1370 | + ] | |
| 1371 | + | |
| 1218 | 1372 | tool_call = ToolCall( |
| 1219 | - id="read-reference", | |
| 1373 | + id="read-dup", | |
| 1220 | 1374 | name="read", |
| 1221 | - arguments={"file_path": str(reference)}, | |
| 1375 | + arguments={"file_path": str(chapter_one)}, | |
| 1376 | + ) | |
| 1377 | + duplicate_message = ( | |
| 1378 | + "[Skipped - duplicate action: Already read " | |
| 1379 | + f"{chapter_one} recently without any intervening changes; " | |
| 1380 | + "reuse the earlier read result instead of rereading]" | |
| 1222 | 1381 | ) |
| 1223 | 1382 | executor = FakeExecutor( |
| 1224 | 1383 | [ |
| 1225 | - tool_outcome( | |
| 1384 | + ToolExecutionOutcome( | |
| 1226 | 1385 | tool_call=tool_call, |
| 1227 | - output="<h1>Fortran Beginner's Guide</h1>\n", | |
| 1386 | + state=ToolExecutionState.DUPLICATE, | |
| 1387 | + message=Message.tool_result_message( | |
| 1388 | + tool_call_id=tool_call.id, | |
| 1389 | + display_content=duplicate_message, | |
| 1390 | + result_content=duplicate_message, | |
| 1391 | + ), | |
| 1392 | + event_content=duplicate_message, | |
| 1228 | 1393 | is_error=False, |
| 1394 | + result_output=duplicate_message, | |
| 1229 | 1395 | ) |
| 1230 | 1396 | ] |
| 1231 | 1397 | ) |
@@ -1245,19 +1411,13 @@ async def test_tool_batch_runner_observation_handoff_pushes_mutation_step( | ||
| 1245 | 1411 | consecutive_errors=0, |
| 1246 | 1412 | ) |
| 1247 | 1413 | |
| 1248 | - assert any( | |
| 1249 | - "Continue with the next pending item: `Create the nginx index.html file`" | |
| 1250 | - in message | |
| 1251 | - for message in queued_messages | |
| 1252 | - ) | |
| 1253 | - assert any( | |
| 1254 | - "stop gathering more reference material and perform the change now" in message | |
| 1255 | - for message in queued_messages | |
| 1256 | - ) | |
| 1414 | + assert len(queued_messages) == 1 | |
| 1415 | + assert "Verify all guide files are linked and complete" in queued_messages[0] | |
| 1416 | + assert "Create 07-performance-tuning.html" not in queued_messages[0] | |
| 1257 | 1417 | |
| 1258 | 1418 | |
| 1259 | 1419 | @pytest.mark.asyncio |
| 1260 | -async def test_tool_batch_runner_hands_off_noop_toc_edit_when_file_is_already_valid( | |
| 1420 | +async def test_tool_batch_runner_duplicate_read_after_plan_complete_pushes_verification_handoff( | |
| 1261 | 1421 | temp_dir: Path, |
| 1262 | 1422 | ) -> None: |
| 1263 | 1423 | async def assess_confidence( |
@@ -1265,7 +1425,7 @@ async def test_tool_batch_runner_hands_off_noop_toc_edit_when_file_is_already_va | ||
| 1265 | 1425 | tool_args: dict, |
| 1266 | 1426 | context: str, |
| 1267 | 1427 | ) -> ConfidenceAssessment: |
| 1268 | - raise AssertionError("Confidence scoring should not run in this scenario") | |
| 1428 | + raise AssertionError("Confidence scoring should not run for this scenario") | |
| 1269 | 1429 | |
| 1270 | 1430 | async def verify_action( |
| 1271 | 1431 | tool_name: str, |
@@ -1273,31 +1433,35 @@ async def test_tool_batch_runner_hands_off_noop_toc_edit_when_file_is_already_va | ||
| 1273 | 1433 | result: str, |
| 1274 | 1434 | expected: str = "", |
| 1275 | 1435 | ) -> ActionVerification: |
| 1276 | - raise AssertionError("Verification should not run in this scenario") | |
| 1436 | + raise AssertionError("Verification should not run for this scenario") | |
| 1277 | 1437 | |
| 1278 | - prompt = ( | |
| 1279 | - "Have a look at ~/Loader/guides/fortran/index.html, then " | |
| 1280 | - "~/Loader/guides/fortran/chapters. The table of contents links in " | |
| 1281 | - "index.html are inaccurate and the href’s are wrong. Let’s update the " | |
| 1282 | - "links and their link texts to be correct." | |
| 1283 | - ) | |
| 1284 | - chapters = temp_dir / "chapters" | |
| 1438 | + guide_root = temp_dir / "guides" / "nginx" | |
| 1439 | + chapters = guide_root / "chapters" | |
| 1440 | + guide_root.mkdir(parents=True) | |
| 1285 | 1441 | chapters.mkdir() |
| 1286 | - (chapters / "01-introduction.html").write_text( | |
| 1287 | - "<h1>Chapter 1: Introduction to Fortran</h1>\n" | |
| 1288 | - ) | |
| 1289 | - (chapters / "02-setup.html").write_text( | |
| 1290 | - "<h1>Chapter 2: Setting Up Your Environment</h1>\n" | |
| 1291 | - ) | |
| 1292 | - current_block = ( | |
| 1293 | - "<h2>Table of Contents</h2>\n" | |
| 1294 | - ' <ul class="chapter-list">\n' | |
| 1295 | - ' <li><a href="chapters/01-introduction.html">Chapter 1: Introduction to Fortran</a></li>\n' | |
| 1296 | - ' <li><a href="chapters/02-setup.html">Chapter 2: Setting Up Your Environment</a></li>\n' | |
| 1297 | - " </ul>\n" | |
| 1442 | + index_path = guide_root / "index.html" | |
| 1443 | + chapter_one = chapters / "01-getting-started.html" | |
| 1444 | + chapter_two = chapters / "02-installation.html" | |
| 1445 | + index_path.write_text("<html></html>\n") | |
| 1446 | + chapter_one.write_text("<h1>One</h1>\n") | |
| 1447 | + chapter_two.write_text("<h1>Two</h1>\n") | |
| 1448 | + | |
| 1449 | + implementation_plan = temp_dir / "implementation.md" | |
| 1450 | + implementation_plan.write_text( | |
| 1451 | + "\n".join( | |
| 1452 | + [ | |
| 1453 | + "# Implementation Plan", | |
| 1454 | + "", | |
| 1455 | + "## File Changes", | |
| 1456 | + f"- `{guide_root}/`", | |
| 1457 | + f"- `{chapters}/`", | |
| 1458 | + f"- `{index_path}`", | |
| 1459 | + f"- `{chapter_one}`", | |
| 1460 | + f"- `{chapter_two}`", | |
| 1461 | + "", | |
| 1462 | + ] | |
| 1463 | + ) | |
| 1298 | 1464 | ) |
| 1299 | - index_path = temp_dir / "index.html" | |
| 1300 | - index_path.write_text(current_block) | |
| 1301 | 1465 | |
| 1302 | 1466 | context = build_context( |
| 1303 | 1467 | temp_dir=temp_dir, |
@@ -1307,40 +1471,52 @@ async def test_tool_batch_runner_hands_off_noop_toc_edit_when_file_is_already_va | ||
| 1307 | 1471 | verify_action=verify_action, |
| 1308 | 1472 | auto_recover=False, |
| 1309 | 1473 | ) |
| 1310 | - context.session.current_task = prompt # type: ignore[attr-defined] | |
| 1311 | 1474 | queued_messages: list[str] = [] |
| 1312 | 1475 | context.queue_steering_message_callback = queued_messages.append |
| 1313 | 1476 | runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir)) |
| 1477 | + dod = create_definition_of_done("Create a multi-file nginx guide.") | |
| 1478 | + dod.implementation_plan = str(implementation_plan) | |
| 1479 | + dod.verification_commands = [f"ls -la {guide_root}"] | |
| 1480 | + dod.pending_items = [ | |
| 1481 | + "Create 07-performance-tuning.html", | |
| 1482 | + "Complete the requested work", | |
| 1483 | + ] | |
| 1484 | + | |
| 1314 | 1485 | tool_call = ToolCall( |
| 1315 | - id="edit-1", | |
| 1316 | - name="edit", | |
| 1317 | - arguments={ | |
| 1318 | - "file_path": str(index_path), | |
| 1319 | - "old_string": current_block, | |
| 1320 | - "new_string": current_block, | |
| 1321 | - }, | |
| 1486 | + id="read-dup", | |
| 1487 | + name="read", | |
| 1488 | + arguments={"file_path": str(chapter_one)}, | |
| 1489 | + ) | |
| 1490 | + duplicate_message = ( | |
| 1491 | + "[Skipped - duplicate action: Already read " | |
| 1492 | + f"{chapter_one} recently without any intervening changes; " | |
| 1493 | + "reuse the earlier read result instead of rereading]" | |
| 1322 | 1494 | ) |
| 1323 | 1495 | executor = FakeExecutor( |
| 1324 | 1496 | [ |
| 1325 | - tool_outcome( | |
| 1497 | + ToolExecutionOutcome( | |
| 1326 | 1498 | tool_call=tool_call, |
| 1327 | - output=( | |
| 1328 | - "[Blocked - old_string and new_string are identical - no change " | |
| 1329 | - "would occur] Suggestion: Provide different old and new strings" | |
| 1499 | + state=ToolExecutionState.DUPLICATE, | |
| 1500 | + message=Message.tool_result_message( | |
| 1501 | + tool_call_id=tool_call.id, | |
| 1502 | + display_content=duplicate_message, | |
| 1503 | + result_content=duplicate_message, | |
| 1330 | 1504 | ), |
| 1331 | - is_error=True, | |
| 1332 | - state=ToolExecutionState.BLOCKED, | |
| 1505 | + event_content=duplicate_message, | |
| 1506 | + is_error=False, | |
| 1507 | + result_output=duplicate_message, | |
| 1333 | 1508 | ) |
| 1334 | 1509 | ] |
| 1335 | 1510 | ) |
| 1336 | 1511 | |
| 1512 | + summary = TurnSummary(final_response="") | |
| 1337 | 1513 | await runner.execute_batch( |
| 1338 | 1514 | tool_calls=[tool_call], |
| 1339 | 1515 | tool_source="assistant", |
| 1340 | 1516 | pending_tool_calls_seen=set(), |
| 1341 | 1517 | emit=_noop_emit, |
| 1342 | - summary=TurnSummary(final_response=""), | |
| 1343 | - dod=create_definition_of_done(prompt), | |
| 1518 | + summary=summary, | |
| 1519 | + dod=dod, | |
| 1344 | 1520 | executor=executor, # type: ignore[arg-type] |
| 1345 | 1521 | on_confirmation=None, |
| 1346 | 1522 | on_user_question=None, |
@@ -1349,18 +1525,13 @@ async def test_tool_batch_runner_hands_off_noop_toc_edit_when_file_is_already_va | ||
| 1349 | 1525 | ) |
| 1350 | 1526 | |
| 1351 | 1527 | assert len(queued_messages) == 1 |
| 1352 | - assert "already matches the validated replacement block" in queued_messages[0] | |
| 1353 | - assert "validated 2 linked entries" in queued_messages[0] | |
| 1354 | - assert f"`{index_path}`" in queued_messages[0] | |
| 1355 | - assert "Do not call `edit`, `patch`, or reread the same TOC again" in queued_messages[0] | |
| 1356 | - | |
| 1357 | - | |
| 1358 | -async def _noop_emit(event: AgentEvent) -> None: | |
| 1359 | - return None | |
| 1528 | + assert "All explicitly planned artifacts already exist." in queued_messages[0] | |
| 1529 | + assert "Move to verification or final confirmation using the files already on disk." in queued_messages[0] | |
| 1530 | + assert "Create 07-performance-tuning.html" not in queued_messages[0] | |
| 1360 | 1531 | |
| 1361 | 1532 | |
| 1362 | 1533 | @pytest.mark.asyncio |
| 1363 | -async def test_tool_batch_runner_marks_verification_planned_after_new_mutation( | |
| 1534 | +async def test_tool_batch_runner_duplicate_read_after_plan_complete_ignores_stale_creation_todos( | |
| 1364 | 1535 | temp_dir: Path, |
| 1365 | 1536 | ) -> None: |
| 1366 | 1537 | async def assess_confidence( |
@@ -1368,7 +1539,7 @@ async def test_tool_batch_runner_marks_verification_planned_after_new_mutation( | ||
| 1368 | 1539 | tool_args: dict, |
| 1369 | 1540 | context: str, |
| 1370 | 1541 | ) -> ConfidenceAssessment: |
| 1371 | - raise AssertionError("Confidence scoring should be disabled in this scenario") | |
| 1542 | + raise AssertionError("Confidence scoring should not run for this scenario") | |
| 1372 | 1543 | |
| 1373 | 1544 | async def verify_action( |
| 1374 | 1545 | tool_name: str, |
@@ -1378,34 +1549,87 @@ async def test_tool_batch_runner_marks_verification_planned_after_new_mutation( | ||
| 1378 | 1549 | ) -> ActionVerification: |
| 1379 | 1550 | raise AssertionError("Verification should not run for this scenario") |
| 1380 | 1551 | |
| 1552 | + guide_root = temp_dir / "guides" / "nginx" | |
| 1553 | + chapters = guide_root / "chapters" | |
| 1554 | + guide_root.mkdir(parents=True) | |
| 1555 | + chapters.mkdir() | |
| 1556 | + index_path = guide_root / "index.html" | |
| 1557 | + chapter_one = chapters / "01-getting-started.html" | |
| 1558 | + chapter_two = chapters / "02-installation.html" | |
| 1559 | + index_path.write_text("<html></html>\n") | |
| 1560 | + chapter_one.write_text("<h1>One</h1>\n") | |
| 1561 | + chapter_two.write_text("<h1>Two</h1>\n") | |
| 1562 | + | |
| 1563 | + implementation_plan = temp_dir / "implementation.md" | |
| 1564 | + implementation_plan.write_text( | |
| 1565 | + "\n".join( | |
| 1566 | + [ | |
| 1567 | + "# Implementation Plan", | |
| 1568 | + "", | |
| 1569 | + "## File Changes", | |
| 1570 | + f"- `{guide_root}/`", | |
| 1571 | + f"- `{chapters}/`", | |
| 1572 | + f"- `{index_path}`", | |
| 1573 | + f"- `{chapter_one}`", | |
| 1574 | + f"- `{chapter_two}`", | |
| 1575 | + "", | |
| 1576 | + ] | |
| 1577 | + ) | |
| 1578 | + ) | |
| 1579 | + | |
| 1381 | 1580 | context = build_context( |
| 1382 | 1581 | temp_dir=temp_dir, |
| 1383 | 1582 | messages=[], |
| 1384 | 1583 | safeguards=FakeSafeguards(), |
| 1385 | 1584 | assess_confidence=assess_confidence, |
| 1386 | 1585 | verify_action=verify_action, |
| 1586 | + auto_recover=False, | |
| 1387 | 1587 | ) |
| 1588 | + queued_messages: list[str] = [] | |
| 1589 | + context.queue_steering_message_callback = queued_messages.append | |
| 1388 | 1590 | runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir)) |
| 1591 | + dod = create_definition_of_done("Create a multi-file nginx guide.") | |
| 1592 | + dod.implementation_plan = str(implementation_plan) | |
| 1593 | + dod.verification_commands = [f"ls -la {guide_root}"] | |
| 1594 | + dod.pending_items = [ | |
| 1595 | + "Create 01-getting-started.html", | |
| 1596 | + "Creating 02-installation.html", | |
| 1597 | + "Complete the requested work", | |
| 1598 | + ] | |
| 1599 | + | |
| 1389 | 1600 | tool_call = ToolCall( |
| 1390 | - id="write-1", | |
| 1391 | - name="write", | |
| 1392 | - arguments={"file_path": str(temp_dir / "README.md"), "content": "updated\n"}, | |
| 1601 | + id="read-dup-built-stale", | |
| 1602 | + name="read", | |
| 1603 | + arguments={"file_path": str(chapter_one)}, | |
| 1604 | + ) | |
| 1605 | + duplicate_message = ( | |
| 1606 | + "[Skipped - duplicate action: Already read " | |
| 1607 | + f"{chapter_one} recently without any intervening changes; " | |
| 1608 | + "reuse the earlier read result instead of rereading]" | |
| 1393 | 1609 | ) |
| 1394 | 1610 | executor = FakeExecutor( |
| 1395 | - [tool_outcome(tool_call=tool_call, output="wrote file", is_error=False)] | |
| 1611 | + [ | |
| 1612 | + ToolExecutionOutcome( | |
| 1613 | + tool_call=tool_call, | |
| 1614 | + state=ToolExecutionState.DUPLICATE, | |
| 1615 | + message=Message.tool_result_message( | |
| 1616 | + tool_call_id=tool_call.id, | |
| 1617 | + display_content=duplicate_message, | |
| 1618 | + result_content=duplicate_message, | |
| 1619 | + ), | |
| 1620 | + event_content=duplicate_message, | |
| 1621 | + is_error=False, | |
| 1622 | + result_output=duplicate_message, | |
| 1623 | + ) | |
| 1624 | + ] | |
| 1396 | 1625 | ) |
| 1397 | - summary = TurnSummary(final_response="") | |
| 1398 | - dod = create_definition_of_done("Update README and verify it still works.") | |
| 1399 | - events: list[AgentEvent] = [] | |
| 1400 | - | |
| 1401 | - async def emit(event: AgentEvent) -> None: | |
| 1402 | - events.append(event) | |
| 1403 | 1626 | |
| 1627 | + summary = TurnSummary(final_response="") | |
| 1404 | 1628 | await runner.execute_batch( |
| 1405 | 1629 | tool_calls=[tool_call], |
| 1406 | 1630 | tool_source="assistant", |
| 1407 | 1631 | pending_tool_calls_seen=set(), |
| 1408 | - emit=emit, | |
| 1632 | + emit=_noop_emit, | |
| 1409 | 1633 | summary=summary, |
| 1410 | 1634 | dod=dod, |
| 1411 | 1635 | executor=executor, # type: ignore[arg-type] |
@@ -1415,10 +1639,1500 @@ async def test_tool_batch_runner_marks_verification_planned_after_new_mutation( | ||
| 1415 | 1639 | consecutive_errors=0, |
| 1416 | 1640 | ) |
| 1417 | 1641 | |
| 1418 | - assert dod.last_verification_result == "planned" | |
| 1419 | - assert dod.verification_commands | |
| 1420 | - assert "Collect verification evidence" in dod.pending_items | |
| 1421 | - assert dod.active_verification_attempt_id == "verification-attempt-1" | |
| 1642 | + assert len(queued_messages) == 1 | |
| 1643 | + assert "All explicitly planned artifacts already exist." in queued_messages[0] | |
| 1644 | + assert "Move to verification or final confirmation using the files already on disk." in queued_messages[0] | |
| 1645 | + assert "Create 01-getting-started.html" not in queued_messages[0] | |
| 1646 | + assert "Creating 02-installation.html" not in queued_messages[0] | |
| 1647 | + | |
| 1648 | + | |
| 1649 | +@pytest.mark.asyncio | |
| 1650 | +async def test_tool_batch_runner_observation_handoff_pushes_mutation_step( | |
| 1651 | + temp_dir: Path, | |
| 1652 | +) -> None: | |
| 1653 | + async def assess_confidence( | |
| 1654 | + tool_name: str, | |
| 1655 | + tool_args: dict, | |
| 1656 | + context: str, | |
| 1657 | + ) -> ConfidenceAssessment: | |
| 1658 | + raise AssertionError("Confidence scoring should be disabled in this scenario") | |
| 1659 | + | |
| 1660 | + async def verify_action( | |
| 1661 | + tool_name: str, | |
| 1662 | + tool_args: dict, | |
| 1663 | + result: str, | |
| 1664 | + expected: str = "", | |
| 1665 | + ) -> ActionVerification: | |
| 1666 | + raise AssertionError("Verification should not run for this scenario") | |
| 1667 | + | |
| 1668 | + reference = temp_dir / "fortran" / "index.html" | |
| 1669 | + reference.parent.mkdir(parents=True) | |
| 1670 | + reference.write_text("<h1>Fortran Beginner's Guide</h1>\n") | |
| 1671 | + | |
| 1672 | + context = build_context( | |
| 1673 | + temp_dir=temp_dir, | |
| 1674 | + messages=[], | |
| 1675 | + safeguards=FakeSafeguards(), | |
| 1676 | + assess_confidence=assess_confidence, | |
| 1677 | + verify_action=verify_action, | |
| 1678 | + auto_recover=False, | |
| 1679 | + ) | |
| 1680 | + queued_messages: list[str] = [] | |
| 1681 | + context.queue_steering_message_callback = queued_messages.append | |
| 1682 | + runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir)) | |
| 1683 | + dod = create_definition_of_done("Create a multi-file nginx guide.") | |
| 1684 | + sync_todos_to_definition_of_done( | |
| 1685 | + dod, | |
| 1686 | + [ | |
| 1687 | + { | |
| 1688 | + "content": "Examine the existing Fortran guide structure to understand the cadence and format", | |
| 1689 | + "active_form": "Working on: Examine the existing Fortran guide structure to understand the cadence and format", | |
| 1690 | + "status": "pending", | |
| 1691 | + }, | |
| 1692 | + { | |
| 1693 | + "content": "Create the nginx index.html file", | |
| 1694 | + "active_form": "Working on: Create the nginx index.html file", | |
| 1695 | + "status": "pending", | |
| 1696 | + }, | |
| 1697 | + ], | |
| 1698 | + ) | |
| 1699 | + tool_call = ToolCall( | |
| 1700 | + id="read-reference", | |
| 1701 | + name="read", | |
| 1702 | + arguments={"file_path": str(reference)}, | |
| 1703 | + ) | |
| 1704 | + executor = FakeExecutor( | |
| 1705 | + [ | |
| 1706 | + tool_outcome( | |
| 1707 | + tool_call=tool_call, | |
| 1708 | + output="<h1>Fortran Beginner's Guide</h1>\n", | |
| 1709 | + is_error=False, | |
| 1710 | + ) | |
| 1711 | + ] | |
| 1712 | + ) | |
| 1713 | + | |
| 1714 | + summary = TurnSummary(final_response="") | |
| 1715 | + await runner.execute_batch( | |
| 1716 | + tool_calls=[tool_call], | |
| 1717 | + tool_source="assistant", | |
| 1718 | + pending_tool_calls_seen=set(), | |
| 1719 | + emit=_noop_emit, | |
| 1720 | + summary=summary, | |
| 1721 | + dod=dod, | |
| 1722 | + executor=executor, # type: ignore[arg-type] | |
| 1723 | + on_confirmation=None, | |
| 1724 | + on_user_question=None, | |
| 1725 | + emit_confirmation=None, | |
| 1726 | + consecutive_errors=0, | |
| 1727 | + ) | |
| 1728 | + | |
| 1729 | + assert any( | |
| 1730 | + "Continue with the next pending item: `Create the nginx index.html file`" | |
| 1731 | + in message | |
| 1732 | + for message in queued_messages | |
| 1733 | + ) | |
| 1734 | + assert any( | |
| 1735 | + "stop gathering more reference material and perform the change now" in message | |
| 1736 | + for message in queued_messages | |
| 1737 | + ) | |
| 1738 | + | |
| 1739 | + | |
| 1740 | +@pytest.mark.asyncio | |
| 1741 | +async def test_duplicate_observation_nudge_prioritizes_missing_artifact_over_review( | |
| 1742 | + temp_dir: Path, | |
| 1743 | +) -> None: | |
| 1744 | + async def assess_confidence( | |
| 1745 | + tool_name: str, | |
| 1746 | + tool_args: dict, | |
| 1747 | + context: str, | |
| 1748 | + ) -> ConfidenceAssessment: | |
| 1749 | + raise AssertionError("Confidence scoring should be disabled in this scenario") | |
| 1750 | + | |
| 1751 | + async def verify_action( | |
| 1752 | + tool_name: str, | |
| 1753 | + tool_args: dict, | |
| 1754 | + result: str, | |
| 1755 | + expected: str = "", | |
| 1756 | + ) -> ActionVerification: | |
| 1757 | + raise AssertionError("Verification should not run for this scenario") | |
| 1758 | + | |
| 1759 | + guide_root = temp_dir / "guides" / "nginx" | |
| 1760 | + chapters = guide_root / "chapters" | |
| 1761 | + chapters.mkdir(parents=True) | |
| 1762 | + index_path = guide_root / "index.html" | |
| 1763 | + chapter_one = chapters / "01-getting-started.html" | |
| 1764 | + chapter_one.write_text("<h1>One</h1>\n") | |
| 1765 | + index_path.write_text("<a href=\"chapters/01-getting-started.html\">One</a>\n") | |
| 1766 | + | |
| 1767 | + implementation_plan = temp_dir / "implementation.md" | |
| 1768 | + implementation_plan.write_text( | |
| 1769 | + "\n".join( | |
| 1770 | + [ | |
| 1771 | + "# Implementation Plan", | |
| 1772 | + "", | |
| 1773 | + "## File Changes", | |
| 1774 | + f"- `{index_path}`", | |
| 1775 | + f"- `{chapter_one}`", | |
| 1776 | + f"- `{chapters / '06-ssl-configuration.html'}`", | |
| 1777 | + "", | |
| 1778 | + ] | |
| 1779 | + ) | |
| 1780 | + ) | |
| 1781 | + | |
| 1782 | + context = build_context( | |
| 1783 | + temp_dir=temp_dir, | |
| 1784 | + messages=[], | |
| 1785 | + safeguards=FakeSafeguards(), | |
| 1786 | + assess_confidence=assess_confidence, | |
| 1787 | + verify_action=verify_action, | |
| 1788 | + auto_recover=False, | |
| 1789 | + ) | |
| 1790 | + queued_messages: list[str] = [] | |
| 1791 | + context.queue_steering_message_callback = queued_messages.append | |
| 1792 | + runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir)) | |
| 1793 | + dod = create_definition_of_done("Create a multi-file nginx guide.") | |
| 1794 | + dod.implementation_plan = str(implementation_plan) | |
| 1795 | + sync_todos_to_definition_of_done( | |
| 1796 | + dod, | |
| 1797 | + [ | |
| 1798 | + { | |
| 1799 | + "content": "Ensure all files are properly linked and formatted consistently", | |
| 1800 | + "active_form": "Working on: Ensure all files are properly linked and formatted consistently", | |
| 1801 | + "status": "pending", | |
| 1802 | + }, | |
| 1803 | + { | |
| 1804 | + "content": "Create the final chapter (06-ssl-configuration.html)", | |
| 1805 | + "active_form": "Working on: Create the final chapter (06-ssl-configuration.html)", | |
| 1806 | + "status": "pending", | |
| 1807 | + }, | |
| 1808 | + ], | |
| 1809 | + ) | |
| 1810 | + assert tool_batches_should_prioritize_missing_artifact( | |
| 1811 | + next_pending=dod.pending_items[0], | |
| 1812 | + missing_artifact=(chapters / "06-ssl-configuration.html", False), | |
| 1813 | + ) | |
| 1814 | + | |
| 1815 | + tool_call = ToolCall( | |
| 1816 | + id="dup-read", | |
| 1817 | + name="read", | |
| 1818 | + arguments={"file_path": str(index_path)}, | |
| 1819 | + ) | |
| 1820 | + runner._queue_duplicate_observation_nudge(tool_call, dod=dod) # type: ignore[attr-defined] | |
| 1821 | + | |
| 1822 | + assert queued_messages | |
| 1823 | + message = queued_messages[-1] | |
| 1824 | + assert "06-ssl-configuration.html" in message | |
| 1825 | + assert "Do not switch into review or consistency-check mode" in message | |
| 1826 | + assert ( | |
| 1827 | + "Continue with the next pending item: `Ensure all files are properly linked and formatted consistently`" | |
| 1828 | + not in message | |
| 1829 | + ) | |
| 1830 | + | |
| 1831 | + | |
| 1832 | +@pytest.mark.asyncio | |
| 1833 | +async def test_tool_batch_runner_hands_off_to_verification_once_planned_artifacts_exist( | |
| 1834 | + temp_dir: Path, | |
| 1835 | +) -> None: | |
| 1836 | + async def assess_confidence( | |
| 1837 | + tool_name: str, | |
| 1838 | + tool_args: dict, | |
| 1839 | + context: str, | |
| 1840 | + ) -> ConfidenceAssessment: | |
| 1841 | + raise AssertionError("Confidence scoring should be disabled in this scenario") | |
| 1842 | + | |
| 1843 | + async def verify_action( | |
| 1844 | + tool_name: str, | |
| 1845 | + tool_args: dict, | |
| 1846 | + result: str, | |
| 1847 | + expected: str = "", | |
| 1848 | + ) -> ActionVerification: | |
| 1849 | + raise AssertionError("Verification should not run for this scenario") | |
| 1850 | + | |
| 1851 | + guide_root = temp_dir / "guides" / "nginx" | |
| 1852 | + chapters = guide_root / "chapters" | |
| 1853 | + chapters.mkdir(parents=True) | |
| 1854 | + index_path = guide_root / "index.html" | |
| 1855 | + chapter_one = chapters / "01-getting-started.html" | |
| 1856 | + chapter_two = chapters / "02-installation.html" | |
| 1857 | + index_path.write_text("<a href=\"chapters/01-getting-started.html\">One</a>\n") | |
| 1858 | + chapter_one.write_text("<h1>One</h1>\n") | |
| 1859 | + chapter_two.write_text("<h1>Two</h1>\n") | |
| 1860 | + | |
| 1861 | + implementation_plan = temp_dir / "implementation.md" | |
| 1862 | + implementation_plan.write_text( | |
| 1863 | + "\n".join( | |
| 1864 | + [ | |
| 1865 | + "# Implementation Plan", | |
| 1866 | + "", | |
| 1867 | + "## File Changes", | |
| 1868 | + f"- `{chapters}/`", | |
| 1869 | + f"- `{index_path}`", | |
| 1870 | + f"- `{chapter_one}`", | |
| 1871 | + f"- `{chapter_two}`", | |
| 1872 | + "", | |
| 1873 | + ] | |
| 1874 | + ) | |
| 1875 | + ) | |
| 1876 | + | |
| 1877 | + context = build_context( | |
| 1878 | + temp_dir=temp_dir, | |
| 1879 | + messages=[], | |
| 1880 | + safeguards=FakeSafeguards(), | |
| 1881 | + assess_confidence=assess_confidence, | |
| 1882 | + verify_action=verify_action, | |
| 1883 | + auto_recover=False, | |
| 1884 | + ) | |
| 1885 | + queued_messages: list[str] = [] | |
| 1886 | + context.queue_steering_message_callback = queued_messages.append | |
| 1887 | + runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir)) | |
| 1888 | + dod = create_definition_of_done("Create a multi-file nginx guide.") | |
| 1889 | + dod.implementation_plan = str(implementation_plan) | |
| 1890 | + sync_todos_to_definition_of_done( | |
| 1891 | + dod, | |
| 1892 | + [ | |
| 1893 | + { | |
| 1894 | + "content": "Create the guide files", | |
| 1895 | + "active_form": "Working on: Create the guide files", | |
| 1896 | + "status": "completed", | |
| 1897 | + }, | |
| 1898 | + { | |
| 1899 | + "content": "Ensure all files are properly linked and formatted consistently", | |
| 1900 | + "active_form": "Working on: Ensure all files are properly linked and formatted consistently", | |
| 1901 | + "status": "pending", | |
| 1902 | + }, | |
| 1903 | + ], | |
| 1904 | + ) | |
| 1905 | + tool_call = ToolCall( | |
| 1906 | + id="write-final", | |
| 1907 | + name="write", | |
| 1908 | + arguments={ | |
| 1909 | + "file_path": str(chapter_two), | |
| 1910 | + "content": "<h1>Two</h1>\n", | |
| 1911 | + }, | |
| 1912 | + ) | |
| 1913 | + executor = FakeExecutor( | |
| 1914 | + [ | |
| 1915 | + tool_outcome( | |
| 1916 | + tool_call=tool_call, | |
| 1917 | + output=f"Successfully wrote {chapter_two}", | |
| 1918 | + is_error=False, | |
| 1919 | + ) | |
| 1920 | + ] | |
| 1921 | + ) | |
| 1922 | + | |
| 1923 | + summary = TurnSummary(final_response="") | |
| 1924 | + await runner.execute_batch( | |
| 1925 | + tool_calls=[tool_call], | |
| 1926 | + tool_source="assistant", | |
| 1927 | + pending_tool_calls_seen=set(), | |
| 1928 | + emit=_noop_emit, | |
| 1929 | + summary=summary, | |
| 1930 | + dod=dod, | |
| 1931 | + executor=executor, # type: ignore[arg-type] | |
| 1932 | + on_confirmation=None, | |
| 1933 | + on_user_question=None, | |
| 1934 | + emit_confirmation=None, | |
| 1935 | + consecutive_errors=0, | |
| 1936 | + ) | |
| 1937 | + | |
| 1938 | + assert any( | |
| 1939 | + "All explicitly planned artifacts now exist." in message | |
| 1940 | + for message in queued_messages | |
| 1941 | + ) | |
| 1942 | + assert any( | |
| 1943 | + "Ensure all files are properly linked and formatted consistently" in message | |
| 1944 | + for message in queued_messages | |
| 1945 | + ) | |
| 1946 | + assert any( | |
| 1947 | + "Move to verification once no specific mismatch remains." in message | |
| 1948 | + for message in queued_messages | |
| 1949 | + ) | |
| 1950 | + | |
| 1951 | + | |
| 1952 | +@pytest.mark.asyncio | |
| 1953 | +async def test_tool_batch_runner_mutation_handoff_points_at_next_missing_artifact( | |
| 1954 | + temp_dir: Path, | |
| 1955 | +) -> None: | |
| 1956 | + async def assess_confidence( | |
| 1957 | + tool_name: str, | |
| 1958 | + tool_args: dict, | |
| 1959 | + context: str, | |
| 1960 | + ) -> ConfidenceAssessment: | |
| 1961 | + raise AssertionError("Confidence scoring should not run in this scenario") | |
| 1962 | + | |
| 1963 | + async def verify_action( | |
| 1964 | + tool_name: str, | |
| 1965 | + tool_args: dict, | |
| 1966 | + result: str, | |
| 1967 | + expected: str = "", | |
| 1968 | + ) -> ActionVerification: | |
| 1969 | + raise AssertionError("Verification should not run in this scenario") | |
| 1970 | + | |
| 1971 | + guide_root = temp_dir / "guides" / "nginx" | |
| 1972 | + chapters = guide_root / "chapters" | |
| 1973 | + guide_root.mkdir(parents=True) | |
| 1974 | + chapters.mkdir() | |
| 1975 | + index_path = guide_root / "index.html" | |
| 1976 | + index_path.write_text("<html></html>\n") | |
| 1977 | + chapter_one = chapters / "01-getting-started.html" | |
| 1978 | + chapter_two = chapters / "02-installation.html" | |
| 1979 | + implementation_plan = temp_dir / "implementation.md" | |
| 1980 | + implementation_plan.write_text( | |
| 1981 | + "\n".join( | |
| 1982 | + [ | |
| 1983 | + "# Implementation Plan", | |
| 1984 | + "", | |
| 1985 | + "## File Changes", | |
| 1986 | + f"- `{guide_root}/`", | |
| 1987 | + f"- `{index_path}`", | |
| 1988 | + f"- `{chapter_one}`", | |
| 1989 | + f"- `{chapter_two}`", | |
| 1990 | + "", | |
| 1991 | + ] | |
| 1992 | + ) | |
| 1993 | + ) | |
| 1994 | + | |
| 1995 | + context = build_context( | |
| 1996 | + temp_dir=temp_dir, | |
| 1997 | + messages=[], | |
| 1998 | + safeguards=FakeSafeguards(), | |
| 1999 | + assess_confidence=assess_confidence, | |
| 2000 | + verify_action=verify_action, | |
| 2001 | + auto_recover=False, | |
| 2002 | + ) | |
| 2003 | + queued_messages: list[str] = [] | |
| 2004 | + context.queue_steering_message_callback = queued_messages.append | |
| 2005 | + runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir)) | |
| 2006 | + dod = create_definition_of_done("Create a multi-file nginx guide.") | |
| 2007 | + dod.implementation_plan = str(implementation_plan) | |
| 2008 | + sync_todos_to_definition_of_done( | |
| 2009 | + dod, | |
| 2010 | + [ | |
| 2011 | + { | |
| 2012 | + "content": "Create the main index.html file with proper structure", | |
| 2013 | + "active_form": "Working on: Create the main index.html file with proper structure", | |
| 2014 | + "status": "pending", | |
| 2015 | + }, | |
| 2016 | + { | |
| 2017 | + "content": "Create each chapter file in sequence, following the established pattern", | |
| 2018 | + "active_form": "Working on: Create each chapter file in sequence, following the established pattern", | |
| 2019 | + "status": "pending", | |
| 2020 | + }, | |
| 2021 | + { | |
| 2022 | + "content": "Ensure all files are properly linked and formatted consistently", | |
| 2023 | + "active_form": "Working on: Ensure all files are properly linked and formatted consistently", | |
| 2024 | + "status": "pending", | |
| 2025 | + }, | |
| 2026 | + ], | |
| 2027 | + ) | |
| 2028 | + tool_call = ToolCall( | |
| 2029 | + id="write-index", | |
| 2030 | + name="write", | |
| 2031 | + arguments={"file_path": str(index_path), "content": "<html></html>\n"}, | |
| 2032 | + ) | |
| 2033 | + executor = FakeExecutor( | |
| 2034 | + [tool_outcome(tool_call=tool_call, output=f"Successfully wrote {index_path}", is_error=False)] | |
| 2035 | + ) | |
| 2036 | + | |
| 2037 | + summary = TurnSummary(final_response="") | |
| 2038 | + await runner.execute_batch( | |
| 2039 | + tool_calls=[tool_call], | |
| 2040 | + tool_source="assistant", | |
| 2041 | + pending_tool_calls_seen=set(), | |
| 2042 | + emit=_noop_emit, | |
| 2043 | + summary=summary, | |
| 2044 | + dod=dod, | |
| 2045 | + executor=executor, # type: ignore[arg-type] | |
| 2046 | + on_confirmation=None, | |
| 2047 | + on_user_question=None, | |
| 2048 | + emit_confirmation=None, | |
| 2049 | + consecutive_errors=0, | |
| 2050 | + ) | |
| 2051 | + | |
| 2052 | + assert queued_messages | |
| 2053 | + message = queued_messages[-1] | |
| 2054 | + assert "Resume by creating `01-getting-started.html` now." in message | |
| 2055 | + assert "refresh `TodoWrite`" in message | |
| 2056 | + assert "Do not move to verification, final confirmation, or TodoWrite-only bookkeeping" in message | |
| 2057 | + assert "Do not spend another turn on working notes or rediscovery alone." in message | |
| 2058 | + | |
| 2059 | + | |
| 2060 | +@pytest.mark.asyncio | |
| 2061 | +async def test_tool_batch_runner_large_plan_does_not_claim_completion_early( | |
| 2062 | + temp_dir: Path, | |
| 2063 | +) -> None: | |
| 2064 | + async def assess_confidence( | |
| 2065 | + tool_name: str, | |
| 2066 | + tool_args: dict, | |
| 2067 | + context: str, | |
| 2068 | + ) -> ConfidenceAssessment: | |
| 2069 | + raise AssertionError("Confidence scoring should not run in this scenario") | |
| 2070 | + | |
| 2071 | + async def verify_action( | |
| 2072 | + tool_name: str, | |
| 2073 | + tool_args: dict, | |
| 2074 | + result: str, | |
| 2075 | + expected: str = "", | |
| 2076 | + ) -> ActionVerification: | |
| 2077 | + raise AssertionError("Verification should not run in this scenario") | |
| 2078 | + | |
| 2079 | + guide_root = temp_dir / "guides" / "nginx" | |
| 2080 | + chapters = guide_root / "chapters" | |
| 2081 | + guide_root.mkdir(parents=True) | |
| 2082 | + chapters.mkdir() | |
| 2083 | + index_path = guide_root / "index.html" | |
| 2084 | + index_path.write_text("<html></html>\n") | |
| 2085 | + | |
| 2086 | + chapter_paths = [ | |
| 2087 | + chapters / "01-getting-started.html", | |
| 2088 | + chapters / "02-installation.html", | |
| 2089 | + chapters / "03-first-website.html", | |
| 2090 | + chapters / "04-configuration-basics.html", | |
| 2091 | + chapters / "05-advanced-configurations.html", | |
| 2092 | + chapters / "06-performance-tuning.html", | |
| 2093 | + chapters / "07-security-best-practices.html", | |
| 2094 | + ] | |
| 2095 | + for chapter in chapter_paths[:4]: | |
| 2096 | + chapter.write_text(f"<h1>{chapter.stem}</h1>\n") | |
| 2097 | + chapter_paths[4].write_text("<h1>Advanced configurations</h1>\n") | |
| 2098 | + | |
| 2099 | + implementation_plan = temp_dir / "implementation.md" | |
| 2100 | + implementation_plan.write_text( | |
| 2101 | + "\n".join( | |
| 2102 | + [ | |
| 2103 | + "# Implementation Plan", | |
| 2104 | + "", | |
| 2105 | + "## File Changes", | |
| 2106 | + f"- `{guide_root}/`", | |
| 2107 | + f"- `{chapters}/`", | |
| 2108 | + f"- `{index_path}`", | |
| 2109 | + *[f"- `{path}`" for path in chapter_paths], | |
| 2110 | + "", | |
| 2111 | + ] | |
| 2112 | + ) | |
| 2113 | + ) | |
| 2114 | + | |
| 2115 | + context = build_context( | |
| 2116 | + temp_dir=temp_dir, | |
| 2117 | + messages=[], | |
| 2118 | + safeguards=FakeSafeguards(), | |
| 2119 | + assess_confidence=assess_confidence, | |
| 2120 | + verify_action=verify_action, | |
| 2121 | + auto_recover=False, | |
| 2122 | + ) | |
| 2123 | + queued_messages: list[str] = [] | |
| 2124 | + context.queue_steering_message_callback = queued_messages.append | |
| 2125 | + runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir)) | |
| 2126 | + dod = create_definition_of_done("Create a thorough nginx guide.") | |
| 2127 | + dod.implementation_plan = str(implementation_plan) | |
| 2128 | + sync_todos_to_definition_of_done( | |
| 2129 | + dod, | |
| 2130 | + [ | |
| 2131 | + { | |
| 2132 | + "content": "Create the nginx guide artifacts", | |
| 2133 | + "active_form": "Creating nginx guide artifacts", | |
| 2134 | + "status": "pending", | |
| 2135 | + }, | |
| 2136 | + { | |
| 2137 | + "content": "Verify all guide files are linked and complete", | |
| 2138 | + "active_form": "Verifying guide linkage and completeness", | |
| 2139 | + "status": "pending", | |
| 2140 | + }, | |
| 2141 | + ], | |
| 2142 | + ) | |
| 2143 | + tool_call = ToolCall( | |
| 2144 | + id="write-chapter-05", | |
| 2145 | + name="write", | |
| 2146 | + arguments={ | |
| 2147 | + "file_path": str(chapter_paths[4]), | |
| 2148 | + "content": "<h1>Advanced configurations</h1>\n", | |
| 2149 | + }, | |
| 2150 | + ) | |
| 2151 | + executor = FakeExecutor( | |
| 2152 | + [ | |
| 2153 | + tool_outcome( | |
| 2154 | + tool_call=tool_call, | |
| 2155 | + output=f"Successfully wrote {chapter_paths[4]}", | |
| 2156 | + is_error=False, | |
| 2157 | + ) | |
| 2158 | + ] | |
| 2159 | + ) | |
| 2160 | + | |
| 2161 | + summary = TurnSummary(final_response="") | |
| 2162 | + await runner.execute_batch( | |
| 2163 | + tool_calls=[tool_call], | |
| 2164 | + tool_source="assistant", | |
| 2165 | + pending_tool_calls_seen=set(), | |
| 2166 | + emit=_noop_emit, | |
| 2167 | + summary=summary, | |
| 2168 | + dod=dod, | |
| 2169 | + executor=executor, # type: ignore[arg-type] | |
| 2170 | + on_confirmation=None, | |
| 2171 | + on_user_question=None, | |
| 2172 | + emit_confirmation=None, | |
| 2173 | + consecutive_errors=0, | |
| 2174 | + ) | |
| 2175 | + | |
| 2176 | + assert any( | |
| 2177 | + "Resume by creating `06-performance-tuning.html` now." in message | |
| 2178 | + for message in queued_messages | |
| 2179 | + ) | |
| 2180 | + assert not any( | |
| 2181 | + "All explicitly planned artifacts now exist." in message | |
| 2182 | + for message in queued_messages | |
| 2183 | + ) | |
| 2184 | + | |
| 2185 | + | |
| 2186 | +@pytest.mark.asyncio | |
| 2187 | +async def test_tool_batch_runner_todowrite_with_missing_artifact_requeues_exact_resume_step( | |
| 2188 | + temp_dir: Path, | |
| 2189 | +) -> None: | |
| 2190 | + async def assess_confidence( | |
| 2191 | + tool_name: str, | |
| 2192 | + tool_args: dict, | |
| 2193 | + context: str, | |
| 2194 | + ) -> ConfidenceAssessment: | |
| 2195 | + raise AssertionError("Confidence scoring should not run in this scenario") | |
| 2196 | + | |
| 2197 | + async def verify_action( | |
| 2198 | + tool_name: str, | |
| 2199 | + tool_args: dict, | |
| 2200 | + result: str, | |
| 2201 | + expected: str = "", | |
| 2202 | + ) -> ActionVerification: | |
| 2203 | + raise AssertionError("Verification should not run in this scenario") | |
| 2204 | + | |
| 2205 | + guide_root = temp_dir / "guides" / "nginx" | |
| 2206 | + chapters = guide_root / "chapters" | |
| 2207 | + guide_root.mkdir(parents=True) | |
| 2208 | + chapters.mkdir() | |
| 2209 | + index_path = guide_root / "index.html" | |
| 2210 | + index_path.write_text("<html></html>\n") | |
| 2211 | + chapter_one = chapters / "01-getting-started.html" | |
| 2212 | + chapter_two = chapters / "02-installation.html" | |
| 2213 | + chapter_one.write_text("<h1>One</h1>\n") | |
| 2214 | + | |
| 2215 | + implementation_plan = temp_dir / "implementation.md" | |
| 2216 | + implementation_plan.write_text( | |
| 2217 | + "\n".join( | |
| 2218 | + [ | |
| 2219 | + "# Implementation Plan", | |
| 2220 | + "", | |
| 2221 | + "## File Changes", | |
| 2222 | + f"- `{guide_root}/`", | |
| 2223 | + f"- `{chapters}/`", | |
| 2224 | + f"- `{index_path}`", | |
| 2225 | + f"- `{chapter_one}`", | |
| 2226 | + f"- `{chapter_two}`", | |
| 2227 | + "", | |
| 2228 | + ] | |
| 2229 | + ) | |
| 2230 | + ) | |
| 2231 | + | |
| 2232 | + context = build_context( | |
| 2233 | + temp_dir=temp_dir, | |
| 2234 | + messages=[], | |
| 2235 | + safeguards=FakeSafeguards(), | |
| 2236 | + assess_confidence=assess_confidence, | |
| 2237 | + verify_action=verify_action, | |
| 2238 | + auto_recover=False, | |
| 2239 | + ) | |
| 2240 | + queued_messages: list[str] = [] | |
| 2241 | + context.queue_steering_message_callback = queued_messages.append | |
| 2242 | + runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir)) | |
| 2243 | + dod = create_definition_of_done("Create a multi-file nginx guide.") | |
| 2244 | + dod.implementation_plan = str(implementation_plan) | |
| 2245 | + sync_todos_to_definition_of_done( | |
| 2246 | + dod, | |
| 2247 | + [ | |
| 2248 | + { | |
| 2249 | + "content": "Create 01-getting-started.html", | |
| 2250 | + "active_form": "Creating 01-getting-started.html", | |
| 2251 | + "status": "completed", | |
| 2252 | + }, | |
| 2253 | + { | |
| 2254 | + "content": "Create 02-installation.html", | |
| 2255 | + "active_form": "Creating 02-installation.html", | |
| 2256 | + "status": "pending", | |
| 2257 | + }, | |
| 2258 | + ], | |
| 2259 | + ) | |
| 2260 | + dod.touched_files.extend([str(index_path), str(chapter_one)]) | |
| 2261 | + | |
| 2262 | + tool_call = ToolCall( | |
| 2263 | + id="todo-only", | |
| 2264 | + name="TodoWrite", | |
| 2265 | + arguments={ | |
| 2266 | + "todos": [ | |
| 2267 | + { | |
| 2268 | + "content": "Create 01-getting-started.html", | |
| 2269 | + "active_form": "Creating 01-getting-started.html", | |
| 2270 | + "status": "completed", | |
| 2271 | + }, | |
| 2272 | + { | |
| 2273 | + "content": "Create 02-installation.html", | |
| 2274 | + "active_form": "Creating 02-installation.html", | |
| 2275 | + "status": "pending", | |
| 2276 | + }, | |
| 2277 | + ] | |
| 2278 | + }, | |
| 2279 | + ) | |
| 2280 | + executor = FakeExecutor( | |
| 2281 | + [ | |
| 2282 | + tool_outcome( | |
| 2283 | + tool_call=tool_call, | |
| 2284 | + output="Todos updated", | |
| 2285 | + is_error=False, | |
| 2286 | + metadata={ | |
| 2287 | + "new_todos": [ | |
| 2288 | + { | |
| 2289 | + "content": "Create 01-getting-started.html", | |
| 2290 | + "active_form": "Creating 01-getting-started.html", | |
| 2291 | + "status": "completed", | |
| 2292 | + }, | |
| 2293 | + { | |
| 2294 | + "content": "Create 02-installation.html", | |
| 2295 | + "active_form": "Creating 02-installation.html", | |
| 2296 | + "status": "pending", | |
| 2297 | + }, | |
| 2298 | + ] | |
| 2299 | + }, | |
| 2300 | + ) | |
| 2301 | + ] | |
| 2302 | + ) | |
| 2303 | + | |
| 2304 | + summary = TurnSummary(final_response="") | |
| 2305 | + await runner.execute_batch( | |
| 2306 | + tool_calls=[tool_call], | |
| 2307 | + tool_source="assistant", | |
| 2308 | + pending_tool_calls_seen=set(), | |
| 2309 | + emit=_noop_emit, | |
| 2310 | + summary=summary, | |
| 2311 | + dod=dod, | |
| 2312 | + executor=executor, # type: ignore[arg-type] | |
| 2313 | + on_confirmation=None, | |
| 2314 | + on_user_question=None, | |
| 2315 | + emit_confirmation=None, | |
| 2316 | + consecutive_errors=0, | |
| 2317 | + ) | |
| 2318 | + | |
| 2319 | + assert queued_messages | |
| 2320 | + message = queued_messages[-1] | |
| 2321 | + assert "Todo tracking is updated. An explicitly planned artifact is still missing." in message | |
| 2322 | + assert "Resume by creating `02-installation.html` now." in message | |
| 2323 | + assert "refresh `TodoWrite`" in message | |
| 2324 | + assert "Do not spend the next turn on TodoWrite alone" in message | |
| 2325 | + | |
| 2326 | + | |
@pytest.mark.asyncio
async def test_tool_batch_runner_todowrite_after_artifacts_exist_pushes_verification_handoff(
    temp_dir: Path,
) -> None:
    """When every artifact declared in the implementation plan already exists on
    disk, a TodoWrite-only turn should queue a steering message that hands off
    to verification instead of requeuing a mutation.
    """

    # Confidence scoring and action verification must not be invoked on this
    # path; failing loudly via AssertionError catches regressions.
    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should not run in this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run in this scenario")

    # Create ALL planned artifacts up front: index plus both chapters.
    guide_root = temp_dir / "guides" / "nginx"
    chapters = guide_root / "chapters"
    guide_root.mkdir(parents=True)
    chapters.mkdir()
    index_path = guide_root / "index.html"
    chapter_one = chapters / "01-getting-started.html"
    chapter_two = chapters / "02-installation.html"
    index_path.write_text("<html></html>\n")
    chapter_one.write_text("<h1>One</h1>\n")
    chapter_two.write_text("<h1>Two</h1>\n")

    # Plan explicitly lists every path created above, so nothing is "missing".
    implementation_plan = temp_dir / "implementation.md"
    implementation_plan.write_text(
        "\n".join(
            [
                "# Implementation Plan",
                "",
                "## File Changes",
                f"- `{guide_root}/`",
                f"- `{chapters}/`",
                f"- `{index_path}`",
                f"- `{chapter_one}`",
                f"- `{chapter_two}`",
                "",
            ]
        )
    )

    context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )
    # Capture queued steering messages for the final assertions.
    queued_messages: list[str] = []
    context.queue_steering_message_callback = queued_messages.append
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))
    dod = create_definition_of_done("Create a multi-file nginx guide.")
    dod.implementation_plan = str(implementation_plan)
    dod.verification_commands = [f"ls -la {guide_root}"]
    sync_todos_to_definition_of_done(
        dod,
        [
            {
                "content": "First, examine the existing Fortran guide structure to understand the format and content organization",
                "active_form": "Working on: First, examine the existing Fortran guide structure to understand the format and content organization",
                "status": "pending",
            },
            {
                "content": "Verify all guide files are linked and complete",
                "active_form": "Working on: Verify all guide files are linked and complete",
                "status": "pending",
            },
        ],
        project_root=temp_dir,
    )

    # The only tool call in the batch is a bookkeeping-style TodoWrite.
    tool_call = ToolCall(
        id="todo-only",
        name="TodoWrite",
        arguments={
            "todos": [
                {
                    "content": "First, examine the existing Fortran guide structure to understand the format and content organization",
                    "active_form": "Working on: First, examine the existing Fortran guide structure to understand the format and content organization",
                    "status": "pending",
                },
                {
                    "content": "Verify all guide files are linked and complete",
                    "active_form": "Working on: Verify all guide files are linked and complete",
                    "status": "pending",
                },
            ]
        },
    )
    executor = FakeExecutor(
        [
            tool_outcome(
                tool_call=tool_call,
                output="Todos updated",
                is_error=False,
                metadata={
                    "new_todos": [
                        {
                            "content": "First, examine the existing Fortran guide structure to understand the format and content organization",
                            "active_form": "Working on: First, examine the existing Fortran guide structure to understand the format and content organization",
                            "status": "pending",
                        },
                        {
                            "content": "Verify all guide files are linked and complete",
                            "active_form": "Working on: Verify all guide files are linked and complete",
                            "status": "pending",
                        },
                    ]
                },
            )
        ]
    )

    summary = TurnSummary(final_response="")
    await runner.execute_batch(
        tool_calls=[tool_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=summary,
        dod=dod,
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    # Steering message must point at the pending verification todo, not at a
    # discovery step ("Fortran guide structure" must be absent).
    assert queued_messages
    message = queued_messages[-1]
    assert "Todo tracking is updated. All explicitly planned artifacts now exist." in message
    assert "Verify all guide files are linked and complete" in message
    assert "Move to verification once no specific mismatch remains." in message
    assert "reopen reference materials" in message
    assert "Fortran guide structure" not in message
| 2469 | + | |
| 2470 | + | |
@pytest.mark.asyncio
async def test_tool_batch_runner_todowrite_with_existing_output_roots_requeues_next_mutation(
    temp_dir: Path,
) -> None:
    """The planned output roots exist but `index.html` links to a chapter file
    that was never created; a TodoWrite-only turn should queue a steering
    message naming `01-introduction.html` as the concrete next mutation.
    """

    # Neither confidence scoring nor verification may run in this scenario.
    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should not run in this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run in this scenario")

    guide_root = temp_dir / "guides" / "nginx"
    chapters = guide_root / "chapters"
    guide_root.mkdir(parents=True)
    chapters.mkdir()
    index_path = guide_root / "index.html"
    # The index references chapters/01-introduction.html, which does NOT exist.
    index_path.write_text(
        "\n".join(
            [
                "<!DOCTYPE html>",
                "<html>",
                "<body>",
                '<a href="chapters/01-introduction.html">Introduction</a>',
                "</body>",
                "</html>",
                "",
            ]
        )
    )

    implementation_plan = temp_dir / "implementation.md"
    implementation_plan.write_text(
        "\n".join(
            [
                "# Implementation Plan",
                "",
                "## File Changes",
                f"- `{guide_root}/`",
                f"- `{chapters}/`",
                f"- `{index_path}`",
                "",
            ]
        )
    )

    context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )
    queued_messages: list[str] = []
    context.queue_steering_message_callback = queued_messages.append
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))
    dod = create_definition_of_done("Create a multi-file nginx guide.")
    dod.implementation_plan = str(implementation_plan)
    dod.touched_files.append(str(index_path))
    # Two completed todos plus one pending ("Write the introduction chapter"),
    # which the steering message is expected to continue with.
    sync_todos_to_definition_of_done(
        dod,
        [
            {
                "content": "Examine the existing Fortran guide structure",
                "active_form": "Examining the existing Fortran guide structure",
                "status": "completed",
            },
            {
                "content": "Create the nginx directory structure",
                "active_form": "Creating the nginx directory structure",
                "status": "completed",
            },
            {
                "content": "Write the introduction chapter",
                "active_form": "Writing the introduction chapter",
                "status": "pending",
            },
        ],
        project_root=temp_dir,
    )

    # TodoWrite merely restates the same todo list — no real progress.
    tool_call = ToolCall(
        id="todo-next-mutation",
        name="TodoWrite",
        arguments={
            "todos": [
                {
                    "content": "Examine the existing Fortran guide structure",
                    "active_form": "Examining the existing Fortran guide structure",
                    "status": "completed",
                },
                {
                    "content": "Create the nginx directory structure",
                    "active_form": "Creating the nginx directory structure",
                    "status": "completed",
                },
                {
                    "content": "Write the introduction chapter",
                    "active_form": "Writing the introduction chapter",
                    "status": "pending",
                },
            ]
        },
    )
    executor = FakeExecutor(
        [
            tool_outcome(
                tool_call=tool_call,
                output="Todos updated",
                is_error=False,
                metadata={
                    "new_todos": [
                        {
                            "content": "Examine the existing Fortran guide structure",
                            "active_form": "Examining the existing Fortran guide structure",
                            "status": "completed",
                        },
                        {
                            "content": "Create the nginx directory structure",
                            "active_form": "Creating the nginx directory structure",
                            "status": "completed",
                        },
                        {
                            "content": "Write the introduction chapter",
                            "active_form": "Writing the introduction chapter",
                            "status": "pending",
                        },
                    ]
                },
            )
        ]
    )

    summary = TurnSummary(final_response="")
    await runner.execute_batch(
        tool_calls=[tool_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=summary,
        dod=dod,
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    # The steering message must name the missing declared output and steer the
    # next turn toward a single `write` mutation rather than more TodoWrite.
    assert queued_messages
    message = queued_messages[-1]
    assert "Todo tracking is updated. An explicitly planned artifact is still missing." in message
    assert "Continue with the next pending item: `Write the introduction chapter`." in message
    assert "Resume by creating `01-introduction.html` now." in message
    assert "It is the next missing declared output under `chapters/`." in message
    assert "Prefer one `write` call for `" in message
    assert "01-introduction.html` instead of more rereads." in message
    assert "Do not spend the next turn on TodoWrite alone" in message
| 2636 | + | |
| 2637 | + | |
@pytest.mark.asyncio
async def test_tool_batch_runner_todowrite_with_declared_child_targets_names_next_missing_file(
    temp_dir: Path,
) -> None:
    """`index.html` declares two child chapter links but neither chapter file
    exists; the steering message should name the first missing child target
    (`introduction.html`) as the resume step.
    """

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should not run in this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run in this scenario")

    guide_root = temp_dir / "guides" / "nginx"
    chapters = guide_root / "chapters"
    guide_root.mkdir(parents=True)
    chapters.mkdir()
    index_path = guide_root / "index.html"
    # Two hrefs into chapters/ with no corresponding files on disk.
    index_path.write_text(
        "\n".join(
            [
                "<html>",
                '<a href="chapters/introduction.html">Introduction</a>',
                '<a href="chapters/installation.html">Installation</a>',
                "</html>",
            ]
        )
        + "\n"
    )

    implementation_plan = temp_dir / "implementation.md"
    implementation_plan.write_text(
        "\n".join(
            [
                "# Implementation Plan",
                "",
                "## File Changes",
                f"- `{guide_root}/`",
                f"- `{chapters}/`",
                f"- `{index_path}`",
                "",
            ]
        )
    )

    # Pending items are set directly on the DoD here (no todo sync helper).
    dod = create_definition_of_done("Create a multi-file nginx guide.")
    dod.implementation_plan = str(implementation_plan)
    dod.pending_items = [
        "Write the introduction chapter",
        "Complete the requested work",
    ]
    dod.touched_files.append(str(index_path))

    queued_messages: list[str] = []
    context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )
    context.queue_steering_message_callback = queued_messages.append
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))

    tool_call = ToolCall(
        id="todo-1",
        name="TodoWrite",
        arguments={
            "todos": [
                {
                    "content": "Write the introduction chapter",
                    # NOTE(review): camelCase "activeForm" here vs "active_form"
                    # everywhere else in this file — presumably exercises key
                    # normalization of TodoWrite arguments; confirm intentional.
                    "activeForm": "Writing the introduction chapter",
                    "status": "pending",
                }
            ]
        },
    )
    executor = FakeExecutor(
        [
            tool_outcome(
                tool_call=tool_call,
                output="Todos updated",
                is_error=False,
                metadata={
                    "new_todos": [
                        {
                            "content": "Write the introduction chapter",
                            "active_form": "Writing the introduction chapter",
                            "status": "pending",
                        }
                    ]
                },
            )
        ]
    )

    summary = TurnSummary(final_response="")
    await runner.execute_batch(
        tool_calls=[tool_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=summary,
        dod=dod,
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert queued_messages
    message = queued_messages[-1]
    assert "Todo tracking is updated. An explicitly planned artifact is still missing." in message
    assert "Continue with the next pending item: `Write the introduction chapter`." in message
    assert "Resume by creating `introduction.html` now." in message
    assert "It is the next missing declared output under `chapters/`." in message
    assert "Prefer one `write` call for `" in message
    assert "introduction.html` instead of more rereads." in message
    assert "Do not spend the next turn on TodoWrite alone" in message
| 2765 | + | |
| 2766 | + | |
@pytest.mark.asyncio
async def test_tool_batch_runner_bookkeeping_note_with_missing_artifact_requeues_resume_step(
    temp_dir: Path,
) -> None:
    """A `notepad_write_working` bookkeeping note while a planned artifact
    (`02-installation.html`) is still missing should queue a steering message
    demanding the concrete mutation next turn.
    """

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should not run in this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run in this scenario")

    guide_root = temp_dir / "guides" / "nginx"
    chapters = guide_root / "chapters"
    guide_root.mkdir(parents=True)
    chapters.mkdir()
    index_path = guide_root / "index.html"
    chapter_one = chapters / "01-getting-started.html"
    chapter_two = chapters / "02-installation.html"
    index_path.write_text("<html></html>\n")
    # chapter_two is declared in the plan below but deliberately NOT written.
    chapter_one.write_text("<h1>One</h1>\n")

    implementation_plan = temp_dir / "implementation.md"
    implementation_plan.write_text(
        "\n".join(
            [
                "# Implementation Plan",
                "",
                "## File Changes",
                f"- `{guide_root}/`",
                f"- `{chapters}/`",
                f"- `{index_path}`",
                f"- `{chapter_one}`",
                f"- `{chapter_two}`",
                "",
            ]
        )
    )

    context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )
    queued_messages: list[str] = []
    context.queue_steering_message_callback = queued_messages.append
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))
    dod = create_definition_of_done("Create a multi-file nginx guide.")
    dod.implementation_plan = str(implementation_plan)
    sync_todos_to_definition_of_done(
        dod,
        [
            {
                "content": "Create 01-getting-started.html",
                "active_form": "Creating 01-getting-started.html",
                "status": "completed",
            },
            {
                "content": "Create 02-installation.html",
                "active_form": "Creating 02-installation.html",
                "status": "pending",
            },
        ],
        project_root=temp_dir,
    )
    dod.touched_files.extend([str(index_path), str(chapter_one)])

    # The turn produces only a working note — no mutation.
    tool_call = ToolCall(
        id="working-note",
        name="notepad_write_working",
        arguments={"content": "Creating the second chapter file: Installation"},
    )
    executor = FakeExecutor(
        [
            tool_outcome(
                tool_call=tool_call,
                output="Working note recorded",
                is_error=False,
            )
        ]
    )

    summary = TurnSummary(final_response="")
    await runner.execute_batch(
        tool_calls=[tool_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=summary,
        dod=dod,
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert queued_messages
    message = queued_messages[-1]
    assert "Bookkeeping note is recorded. An explicitly planned artifact is still missing." in message
    assert "Resume by creating `02-installation.html` now." in message
    assert "Make your next response the concrete mutation tool call itself" in message
    assert "refresh `TodoWrite`" in message
    assert "Do not spend the next turn on additional notes, rediscovery, verification, or final confirmation" in message
| 2881 | + | |
| 2882 | + | |
@pytest.mark.asyncio
async def test_tool_batch_runner_working_note_respects_discovery_first_pending_step(
    temp_dir: Path,
) -> None:
    """When the first pending item is a discovery step ("examine ..."), a
    working note should steer toward one evidence-gathering call — not jump
    ahead to creating `index.html`.
    """

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run in this scenario")

    # Plan declares outputs that do not exist yet; no files are created here.
    implementation_plan = temp_dir / "implementation.md"
    implementation_plan.write_text(
        "\n".join(
            [
                "# Implementation Plan",
                "",
                "## File Changes",
                f"- `{temp_dir / 'guides' / 'nginx' / 'index.html'}`",
                f"- `{temp_dir / 'guides' / 'nginx' / 'chapters'}`",
                "",
            ]
        )
    )

    context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )
    queued_messages: list[str] = []
    context.queue_steering_message_callback = queued_messages.append
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))
    dod = create_definition_of_done("Create a multi-file nginx guide.")
    dod.implementation_plan = str(implementation_plan)
    # Discovery item comes first, before any mutation items.
    dod.pending_items.extend(
        [
            "First, examine the existing fortran guide structure and content to understand the format",
            "Create the nginx directory structure",
            "Develop the main index.html file for the nginx guide",
        ]
    )

    tool_call = ToolCall(
        id="working-note",
        name="notepad_write_working",
        arguments={"content": "Analyzing the fortran guide structure before creating nginx guide"},
    )
    executor = FakeExecutor(
        [
            tool_outcome(
                tool_call=tool_call,
                output="Working note recorded",
                is_error=False,
            )
        ]
    )

    summary = TurnSummary(final_response="")
    await runner.execute_batch(
        tool_calls=[tool_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=summary,
        dod=dod,
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert queued_messages
    message = queued_messages[-1]
    assert (
        "Continue with the next pending item: `First, examine the existing fortran guide structure and content to understand the format`."
        in message
    )
    assert "one concrete evidence-gathering tool call" in message
    # Must NOT skip discovery and push straight to index creation.
    assert "Resume by creating `index.html` now." not in message
| 2975 | + | |
| 2976 | + | |
@pytest.mark.asyncio
async def test_tool_batch_runner_hands_off_noop_toc_edit_when_file_is_already_valid(
    temp_dir: Path,
) -> None:
    """A no-op `edit` (old_string == new_string) blocked by the executor on an
    already-correct table of contents must NOT produce any steering message —
    the runner should hand off quietly.
    """

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should not run in this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run in this scenario")

    prompt = (
        "Have a look at ~/Loader/guides/fortran/index.html, then "
        "~/Loader/guides/fortran/chapters. The table of contents links in "
        "index.html are inaccurate and the href’s are wrong. Let’s update the "
        "links and their link texts to be correct."
    )
    # On-disk chapters match the hrefs/link texts in the TOC below, so the
    # file is already valid despite what the user prompt claims.
    chapters = temp_dir / "chapters"
    chapters.mkdir()
    (chapters / "01-introduction.html").write_text(
        "<h1>Chapter 1: Introduction to Fortran</h1>\n"
    )
    (chapters / "02-setup.html").write_text(
        "<h1>Chapter 2: Setting Up Your Environment</h1>\n"
    )
    current_block = (
        "<h2>Table of Contents</h2>\n"
        '    <ul class="chapter-list">\n'
        '        <li><a href="chapters/01-introduction.html">Chapter 1: Introduction to Fortran</a></li>\n'
        '        <li><a href="chapters/02-setup.html">Chapter 2: Setting Up Your Environment</a></li>\n'
        "    </ul>\n"
    )
    index_path = temp_dir / "index.html"
    index_path.write_text(current_block)

    context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )
    context.session.current_task = prompt  # type: ignore[attr-defined]
    queued_messages: list[str] = []
    context.queue_steering_message_callback = queued_messages.append
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))
    # Identical old_string/new_string — the executor blocks it as a no-op.
    tool_call = ToolCall(
        id="edit-1",
        name="edit",
        arguments={
            "file_path": str(index_path),
            "old_string": current_block,
            "new_string": current_block,
        },
    )
    executor = FakeExecutor(
        [
            tool_outcome(
                tool_call=tool_call,
                output=(
                    "[Blocked - old_string and new_string are identical - no change "
                    "would occur] Suggestion: Provide different old and new strings"
                ),
                is_error=True,
                state=ToolExecutionState.BLOCKED,
            )
        ]
    )

    await runner.execute_batch(
        tool_calls=[tool_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=_noop_emit,
        summary=TurnSummary(final_response=""),
        dod=create_definition_of_done(prompt),
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    # No steering message at all: the blocked no-op should not trigger nudges.
    assert queued_messages == []
| 3070 | + | |
| 3071 | + | |
async def _noop_emit(event: AgentEvent) -> None:
    """Event sink used by tests that do not inspect emitted events; discards
    every event and yields ``None``."""
    del event  # intentionally unused; implicit None return matches the contract
| 3074 | + | |
| 3075 | + | |
@pytest.mark.asyncio
async def test_tool_batch_runner_marks_verification_planned_after_new_mutation(
    temp_dir: Path,
) -> None:
    """A successful `write` mutation should flip the DoD into the "planned"
    verification state, seed verification commands/pending items, and append a
    `verification_planned` entry to the workflow timeline.
    """

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        raise AssertionError("Verification should not run for this scenario")

    context = build_context(
        temp_dir=temp_dir,
        messages=[],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
    )
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))
    tool_call = ToolCall(
        id="write-1",
        name="write",
        arguments={"file_path": str(temp_dir / "README.md"), "content": "updated\n"},
    )
    executor = FakeExecutor(
        [tool_outcome(tool_call=tool_call, output="wrote file", is_error=False)]
    )
    summary = TurnSummary(final_response="")
    dod = create_definition_of_done("Update README and verify it still works.")
    # Collect emitted events (not asserted here, but kept for debuggability).
    events: list[AgentEvent] = []

    async def emit(event: AgentEvent) -> None:
        events.append(event)

    await runner.execute_batch(
        tool_calls=[tool_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=emit,
        summary=summary,
        dod=dod,
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    # First mutation of the session -> attempt id/number start at 1.
    assert dod.last_verification_result == "planned"
    assert dod.verification_commands
    assert "Collect verification evidence" in dod.pending_items
    assert dod.active_verification_attempt_id == "verification-attempt-1"
    assert dod.active_verification_attempt_number == 1
    assert summary.workflow_timeline[-1].reason_code == "verification_planned"
    assert summary.workflow_timeline[-1].policy_outcome == "planned"
@@ -1526,3 +3240,238 @@ async def test_tool_batch_runner_marks_passed_verification_stale_after_new_mutat | ||
| 1526 | 3240 | summary.workflow_timeline[-1].verification_observations[0].command |
| 1527 | 3241 | == "uv run pytest -q" |
| 1528 | 3242 | ) |
| 3243 | + | |
| 3244 | + | |
| 3245 | +def test_tool_batch_runner_blocked_active_repair_nudge_uses_repair_scope(temp_dir: Path) -> None: | |
| 3246 | + async def assess_confidence( | |
| 3247 | + tool_name: str, | |
| 3248 | + tool_args: dict, | |
| 3249 | + context: str, | |
| 3250 | + ) -> ConfidenceAssessment: | |
| 3251 | + raise AssertionError("Confidence scoring should be disabled in this scenario") | |
| 3252 | + | |
| 3253 | + async def verify_action( | |
| 3254 | + tool_name: str, | |
| 3255 | + tool_args: dict, | |
| 3256 | + result: str, | |
| 3257 | + expected: str = "", | |
| 3258 | + ) -> ActionVerification: | |
| 3259 | + raise AssertionError("Verification should not run in this scenario") | |
| 3260 | + | |
| 3261 | + repair_target = temp_dir / "guide" / "index.html" | |
| 3262 | + context = build_context( | |
| 3263 | + temp_dir=temp_dir, | |
| 3264 | + messages=[ | |
| 3265 | + Message( | |
| 3266 | + role=Role.ASSISTANT, | |
| 3267 | + content=( | |
| 3268 | + "Repair focus:\n" | |
| 3269 | + f"- Fix the broken local reference `chapters/01-getting-started.html` in `{repair_target}`.\n" | |
| 3270 | + f"- Immediate next step: edit `{repair_target}`.\n" | |
| 3271 | + f"- If the broken reference should remain, create `{temp_dir / 'guide' / 'chapters' / '01-getting-started.html'}`; otherwise remove or replace `chapters/01-getting-started.html`.\n" | |
| 3272 | + ), | |
| 3273 | + ) | |
| 3274 | + ], | |
| 3275 | + safeguards=FakeSafeguards(), | |
| 3276 | + assess_confidence=assess_confidence, | |
| 3277 | + verify_action=verify_action, | |
| 3278 | + ) | |
| 3279 | + queued: list[str] = [] | |
| 3280 | + context.queue_steering_message_callback = queued.append | |
| 3281 | + runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir)) | |
| 3282 | + | |
| 3283 | + runner._queue_blocked_active_repair_nudge( | |
| 3284 | + "[Blocked - active repair scope: verification already identified the repair target.]" | |
| 3285 | + ) | |
| 3286 | + | |
| 3287 | + assert queued | |
| 3288 | + assert str(repair_target) in queued[0] | |
| 3289 | + assert str(temp_dir / "guide" / "chapters" / "01-getting-started.html") in queued[0] | |
| 3290 | + assert "Do not reopen unrelated reference materials" in queued[0] | |
| 3291 | + | |
| 3292 | + | |
| 3293 | +def test_tool_batch_runner_blocked_active_repair_mutation_nudge_uses_allowed_paths( | |
| 3294 | + temp_dir: Path, | |
| 3295 | +) -> None: | |
| 3296 | + async def assess_confidence( | |
| 3297 | + tool_name: str, | |
| 3298 | + tool_args: dict, | |
| 3299 | + context: str, | |
| 3300 | + ) -> ConfidenceAssessment: | |
| 3301 | + raise AssertionError("Confidence scoring should be disabled in this scenario") | |
| 3302 | + | |
| 3303 | + async def verify_action( | |
| 3304 | + tool_name: str, | |
| 3305 | + tool_args: dict, | |
| 3306 | + result: str, | |
| 3307 | + expected: str = "", | |
| 3308 | + ) -> ActionVerification: | |
| 3309 | + raise AssertionError("Verification should not run in this scenario") | |
| 3310 | + | |
| 3311 | + repair_target = temp_dir / "guide" / "chapters" / "05-advanced-configurations.html" | |
| 3312 | + stylesheet = temp_dir / "guide" / "styles.css" | |
| 3313 | + context = build_context( | |
| 3314 | + temp_dir=temp_dir, | |
| 3315 | + messages=[ | |
| 3316 | + Message( | |
| 3317 | + role=Role.ASSISTANT, | |
| 3318 | + content=( | |
| 3319 | + "Repair focus:\n" | |
| 3320 | + f"- Fix the broken local reference `../styles.css` in `{repair_target}`.\n" | |
| 3321 | + f"- Immediate next step: edit `{repair_target}`.\n" | |
| 3322 | + f"- If the broken reference should remain, create `{stylesheet}`; otherwise remove or replace `../styles.css`.\n" | |
| 3323 | + ), | |
| 3324 | + ) | |
| 3325 | + ], | |
| 3326 | + safeguards=FakeSafeguards(), | |
| 3327 | + assess_confidence=assess_confidence, | |
| 3328 | + verify_action=verify_action, | |
| 3329 | + ) | |
| 3330 | + queued: list[str] = [] | |
| 3331 | + context.queue_steering_message_callback = queued.append | |
| 3332 | + runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir)) | |
| 3333 | + | |
| 3334 | + runner._queue_blocked_active_repair_mutation_nudge( | |
| 3335 | + "[Blocked - active repair mutation scope: verification already identified the repair target.]" | |
| 3336 | + ) | |
| 3337 | + | |
| 3338 | + assert queued | |
| 3339 | + assert str(repair_target) in queued[0] | |
| 3340 | + assert str(stylesheet) in queued[0] | |
| 3341 | + assert "before widening the change set" in queued[0] | |
| 3342 | + | |
| 3343 | + | |
| 3344 | +def test_tool_batch_runner_blocked_late_reference_drift_nudge_points_to_missing_artifact( | |
| 3345 | + temp_dir: Path, | |
| 3346 | +) -> None: | |
| 3347 | + async def assess_confidence( | |
| 3348 | + tool_name: str, | |
| 3349 | + tool_args: dict, | |
| 3350 | + context: str, | |
| 3351 | + ) -> ConfidenceAssessment: | |
| 3352 | + raise AssertionError("Confidence scoring should be disabled in this scenario") | |
| 3353 | + | |
| 3354 | + async def verify_action( | |
| 3355 | + tool_name: str, | |
| 3356 | + tool_args: dict, | |
| 3357 | + result: str, | |
| 3358 | + expected: str = "", | |
| 3359 | + ) -> ActionVerification: | |
| 3360 | + raise AssertionError("Verification should not run in this scenario") | |
| 3361 | + | |
| 3362 | + context = build_context( | |
| 3363 | + temp_dir=temp_dir, | |
| 3364 | + messages=[], | |
| 3365 | + safeguards=FakeSafeguards(), | |
| 3366 | + assess_confidence=assess_confidence, | |
| 3367 | + verify_action=verify_action, | |
| 3368 | + ) | |
| 3369 | + queued: list[str] = [] | |
| 3370 | + context.queue_steering_message_callback = queued.append | |
| 3371 | + store = DefinitionOfDoneStore(temp_dir) | |
| 3372 | + dod = create_definition_of_done("Create a multi-file guide from a reference") | |
| 3373 | + plan_path = temp_dir / "implementation.md" | |
| 3374 | + plan_path.write_text( | |
| 3375 | + "# File Changes\n" | |
| 3376 | + "- `guide/index.html`\n" | |
| 3377 | + "- `guide/chapters/01-getting-started.html`\n" | |
| 3378 | + "- `guide/chapters/02-installation.html`\n" | |
| 3379 | + "- `guide/chapters/03-first-website.html`\n" | |
| 3380 | + ) | |
| 3381 | + dod.implementation_plan = str(plan_path) | |
| 3382 | + (temp_dir / "guide" / "chapters").mkdir(parents=True, exist_ok=True) | |
| 3383 | + (temp_dir / "guide" / "index.html").write_text("index") | |
| 3384 | + (temp_dir / "guide" / "chapters" / "01-getting-started.html").write_text("one") | |
| 3385 | + (temp_dir / "guide" / "chapters" / "02-installation.html").write_text("two") | |
| 3386 | + runner = ToolBatchRunner(context, store) | |
| 3387 | + | |
| 3388 | + runner._queue_blocked_late_reference_drift_nudge( | |
| 3389 | + "[Blocked - late reference drift: several planned artifacts already exist.]", | |
| 3390 | + dod=dod, | |
| 3391 | + ) | |
| 3392 | + | |
| 3393 | + assert queued | |
| 3394 | + assert "03-first-website.html" in queued[0] | |
| 3395 | + assert "older reference materials" in queued[0] | |
| 3396 | + | |
| 3397 | + | |
| 3398 | +def test_tool_batch_runner_blocked_completed_artifact_scope_nudge_prefers_verification( | |
| 3399 | + temp_dir: Path, | |
| 3400 | +) -> None: | |
| 3401 | + async def assess_confidence( | |
| 3402 | + tool_name: str, | |
| 3403 | + tool_args: dict, | |
| 3404 | + context: str, | |
| 3405 | + ) -> ConfidenceAssessment: | |
| 3406 | + raise AssertionError("Confidence scoring should be disabled in this scenario") | |
| 3407 | + | |
| 3408 | + async def verify_action( | |
| 3409 | + tool_name: str, | |
| 3410 | + tool_args: dict, | |
| 3411 | + result: str, | |
| 3412 | + expected: str = "", | |
| 3413 | + ) -> ActionVerification: | |
| 3414 | + raise AssertionError("Verification should not run in this scenario") | |
| 3415 | + | |
| 3416 | + guide_root = temp_dir / "guide" | |
| 3417 | + chapters = guide_root / "chapters" | |
| 3418 | + guide_root.mkdir(parents=True) | |
| 3419 | + chapters.mkdir() | |
| 3420 | + index_path = guide_root / "index.html" | |
| 3421 | + chapter_one = chapters / "01-getting-started.html" | |
| 3422 | + chapter_two = chapters / "02-installation.html" | |
| 3423 | + index_path.write_text("index") | |
| 3424 | + chapter_one.write_text("one") | |
| 3425 | + chapter_two.write_text("two") | |
| 3426 | + | |
| 3427 | + implementation_plan = temp_dir / "implementation.md" | |
| 3428 | + implementation_plan.write_text( | |
| 3429 | + "\n".join( | |
| 3430 | + [ | |
| 3431 | + "# Implementation Plan", | |
| 3432 | + "", | |
| 3433 | + "## File Changes", | |
| 3434 | + f"- `{guide_root}`", | |
| 3435 | + f"- `{chapters}`", | |
| 3436 | + f"- `{index_path}`", | |
| 3437 | + f"- `{chapter_one}`", | |
| 3438 | + f"- `{chapter_two}`", | |
| 3439 | + "", | |
| 3440 | + ] | |
| 3441 | + ) | |
| 3442 | + ) | |
| 3443 | + | |
| 3444 | + context = build_context( | |
| 3445 | + temp_dir=temp_dir, | |
| 3446 | + messages=[], | |
| 3447 | + safeguards=FakeSafeguards(), | |
| 3448 | + assess_confidence=assess_confidence, | |
| 3449 | + verify_action=verify_action, | |
| 3450 | + ) | |
| 3451 | + queued: list[str] = [] | |
| 3452 | + context.queue_steering_message_callback = queued.append | |
| 3453 | + runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir)) | |
| 3454 | + dod = create_definition_of_done("Create a multi-file guide from a reference") | |
| 3455 | + dod.implementation_plan = str(implementation_plan) | |
| 3456 | + dod.verification_commands = [f"ls -la {guide_root}"] | |
| 3457 | + sync_todos_to_definition_of_done( | |
| 3458 | + dod, | |
| 3459 | + [ | |
| 3460 | + { | |
| 3461 | + "content": "Verify all guide files are linked and complete", | |
| 3462 | + "active_form": "Working on: Verify all guide files are linked and complete", | |
| 3463 | + "status": "pending", | |
| 3464 | + } | |
| 3465 | + ], | |
| 3466 | + project_root=temp_dir, | |
| 3467 | + ) | |
| 3468 | + | |
| 3469 | + runner._queue_blocked_completed_artifact_scope_nudge( | |
| 3470 | + "[Blocked - completed artifact set scope: all explicitly planned artifacts already exist.]", | |
| 3471 | + dod=dod, | |
| 3472 | + ) | |
| 3473 | + | |
| 3474 | + assert queued | |
| 3475 | + assert "All explicitly planned artifacts already exist." in queued[0] | |
| 3476 | + assert "Verify all guide files are linked and complete" in queued[0] | |
| 3477 | + assert "Do not reopen earlier reference materials." in queued[0] | |
tests/test_turn_completion.pymodified@@ -186,6 +186,103 @@ async def test_turn_completion_marks_non_mutating_response_done( | ||
| 186 | 186 | ) |
| 187 | 187 | |
| 188 | 188 | |
| 189 | +@pytest.mark.asyncio | |
| 190 | +async def test_turn_completion_blocks_false_completion_without_preserving_it( | |
| 191 | + temp_dir: Path, | |
| 192 | +) -> None: | |
| 193 | + backend = ScriptedBackend() | |
| 194 | + agent = Agent( | |
| 195 | + backend=backend, | |
| 196 | + config=non_streaming_config(), | |
| 197 | + project_root=temp_dir, | |
| 198 | + ) | |
| 199 | + runtime = ConversationRuntime(agent) | |
| 200 | + events = [] | |
| 201 | + | |
| 202 | + async def capture(event) -> None: | |
| 203 | + events.append(event) | |
| 204 | + | |
| 205 | + prepared = await runtime.turn_preparation.prepare( | |
| 206 | + task=( | |
| 207 | + "Create a multi-file nginx guide under ~/Loader/guides/nginx " | |
| 208 | + "with an index and chapter files." | |
| 209 | + ), | |
| 210 | + emit=capture, | |
| 211 | + requested_mode="execute", | |
| 212 | + original_task=None, | |
| 213 | + on_user_question=None, | |
| 214 | + ) | |
| 215 | + await runtime.phase_tracker.enter( | |
| 216 | + TurnPhase.ASSISTANT, | |
| 217 | + capture, | |
| 218 | + detail="Requesting assistant response", | |
| 219 | + reason_code="request_assistant_response", | |
| 220 | + ) | |
| 221 | + | |
| 222 | + implementation_plan = temp_dir / "implementation.md" | |
| 223 | + implementation_plan.write_text( | |
| 224 | + "# Implementation Plan\n\n" | |
| 225 | + "## File Changes\n\n" | |
| 226 | + "1. Create main index.html file:\n" | |
| 227 | + " - `index.html`\n\n" | |
| 228 | + "2. Create chapter files:\n" | |
| 229 | + " - `chapters/01-getting-started.html`\n" | |
| 230 | + " - `chapters/06-troubleshooting.html`\n" | |
| 231 | + ) | |
| 232 | + chapters_dir = temp_dir / "chapters" | |
| 233 | + chapters_dir.mkdir() | |
| 234 | + (chapters_dir / "01-getting-started.html").write_text("<h1>Getting Started</h1>\n") | |
| 235 | + (temp_dir / "index.html").write_text("<h1>NGINX Guide</h1>\n") | |
| 236 | + | |
| 237 | + prepared.definition_of_done.implementation_plan = str(implementation_plan) | |
| 238 | + prepared.definition_of_done.mutating_actions.append("write") | |
| 239 | + prepared.definition_of_done.touched_files.extend( | |
| 240 | + [ | |
| 241 | + str(temp_dir / "index.html"), | |
| 242 | + str(chapters_dir / "01-getting-started.html"), | |
| 243 | + ] | |
| 244 | + ) | |
| 245 | + | |
| 246 | + queued_messages: list[str] = [] | |
| 247 | + runtime.context.queue_steering_message_callback = queued_messages.append | |
| 248 | + | |
| 249 | + completion_claim = ( | |
| 250 | + "I've successfully completed the NGINX guide with all planned files " | |
| 251 | + "and verified everything is done." | |
| 252 | + ) | |
| 253 | + decision = await runtime.turn_completion.handle_text_response( | |
| 254 | + content=completion_claim, | |
| 255 | + response_content=completion_claim, | |
| 256 | + task=prepared.task, | |
| 257 | + effective_task=prepared.effective_task, | |
| 258 | + iterations=1, | |
| 259 | + max_iterations=agent.config.max_iterations, | |
| 260 | + actions_taken=[], | |
| 261 | + continuation_count=0, | |
| 262 | + dod=prepared.definition_of_done, | |
| 263 | + emit=capture, | |
| 264 | + summary=prepared.summary, | |
| 265 | + executor=prepared.executor, | |
| 266 | + rollback_plan=prepared.rollback_plan, | |
| 267 | + ) | |
| 268 | + | |
| 269 | + assert decision.action == TurnCompletionAction.CONTINUE | |
| 270 | + assert prepared.summary.assistant_messages == [] | |
| 271 | + assert not any( | |
| 272 | + message.role.value == "assistant" and message.content == completion_claim | |
| 273 | + for message in agent.session.messages | |
| 274 | + ) | |
| 275 | + assert agent.session.messages[-1].role.value == "user" | |
| 276 | + assert agent.session.messages[-1].content.startswith( | |
| 277 | + "[PLANNED ARTIFACTS STILL MISSING]" | |
| 278 | + ) | |
| 279 | + assert "`06-troubleshooting.html`" in agent.session.messages[-1].content | |
| 280 | + assert queued_messages | |
| 281 | + assert "06-troubleshooting.html" in queued_messages[-1] | |
| 282 | + assert "Do not summarize, mark completion, or write bookkeeping notes yet" in queued_messages[-1] | |
| 283 | + assert not any(event.type == "response" for event in events) | |
| 284 | + | |
| 285 | + | |
| 189 | 286 | @pytest.mark.asyncio |
| 190 | 287 | async def test_turn_completion_handles_fake_tool_narration_without_reroute( |
| 191 | 288 | temp_dir: Path, |
tests/test_workflow.pymodified@@ -15,10 +15,12 @@ from loader.runtime.workflow import ( | ||
| 15 | 15 | WorkflowMode, |
| 16 | 16 | advance_todos_from_tool_call, |
| 17 | 17 | build_execute_bridge, |
| 18 | + effective_pending_todo_items, | |
| 18 | 19 | enrich_clarify_brief_with_grounding, |
| 19 | 20 | extract_verification_commands_from_markdown, |
| 20 | 21 | merge_refreshed_todos_with_existing_scope, |
| 21 | 22 | preserve_task_grounded_acceptance_criteria, |
| 23 | + reconcile_aggregate_completion_steps, | |
| 22 | 24 | sync_todos_to_definition_of_done, |
| 23 | 25 | ) |
| 24 | 26 | |
@@ -454,6 +456,167 @@ def test_merge_refreshed_todos_with_existing_scope_filters_retro_refresh_noise() | ||
| 454 | 456 | assert "04-configuring.html" not in labels |
| 455 | 457 | |
| 456 | 458 | |
| 459 | +def test_merge_refreshed_todos_with_existing_scope_drops_unplanned_filename_expansion() -> None: | |
| 460 | + task = ( | |
| 461 | + "Create an equally thorough nginx guide with index.html plus chapter files " | |
| 462 | + "covering getting started, installation, configuration, usage, and troubleshooting." | |
| 463 | + ) | |
| 464 | + | |
| 465 | + todos = merge_refreshed_todos_with_existing_scope( | |
| 466 | + task, | |
| 467 | + existing_pending_items=[ | |
| 468 | + "Create chapter files with appropriate content structure", | |
| 469 | + ], | |
| 470 | + existing_completed_items=[ | |
| 471 | + "Create the nginx guide directory structure", | |
| 472 | + "Create introduction.html", | |
| 473 | + ], | |
| 474 | + refreshed_steps=[ | |
| 475 | + "Create optimization.html", | |
| 476 | + "Create security.html", | |
| 477 | + "Ensure consistent chapter navigation", | |
| 478 | + ], | |
| 479 | + planned_files={ | |
| 480 | + "index.html", | |
| 481 | + "introduction.html", | |
| 482 | + "installation.html", | |
| 483 | + "configuration.html", | |
| 484 | + "usage.html", | |
| 485 | + "troubleshooting.html", | |
| 486 | + }, | |
| 487 | + ) | |
| 488 | + | |
| 489 | + labels = {item["content"]: item["status"] for item in todos} | |
| 490 | + assert "Create chapter files with appropriate content structure" in labels | |
| 491 | + assert "Ensure consistent chapter navigation" in labels | |
| 492 | + assert "Create optimization.html" not in labels | |
| 493 | + assert "Create security.html" not in labels | |
| 494 | + | |
| 495 | + | |
| 496 | +def test_planning_artifacts_with_file_changes_replaces_file_change_section() -> None: | |
| 497 | + artifacts = PlanningArtifacts( | |
| 498 | + implementation_markdown="\n".join( | |
| 499 | + [ | |
| 500 | + "# Implementation Plan", | |
| 501 | + "", | |
| 502 | + "## File Changes", | |
| 503 | + "- `old.txt`", | |
| 504 | + "", | |
| 505 | + "## Execution Order", | |
| 506 | + "- Do the work", | |
| 507 | + "", | |
| 508 | + ] | |
| 509 | + ) | |
| 510 | + + "\n", | |
| 511 | + verification_markdown="# Verification Plan\n", | |
| 512 | + verification_commands=[], | |
| 513 | + acceptance_criteria=["task"], | |
| 514 | + implementation_steps=["Do the work"], | |
| 515 | + ) | |
| 516 | + | |
| 517 | + updated = artifacts.with_file_changes( | |
| 518 | + ["`guides/nginx/index.html`", "`guides/nginx/chapters/`"] | |
| 519 | + ) | |
| 520 | + | |
| 521 | + assert "`old.txt`" not in updated.implementation_markdown | |
| 522 | + assert "`guides/nginx/index.html`" in updated.implementation_markdown | |
| 523 | + assert "`guides/nginx/chapters/`" in updated.implementation_markdown | |
| 524 | + | |
| 525 | + | |
| 526 | +def test_effective_pending_todo_items_filters_stale_discovery_after_artifacts_exist( | |
| 527 | + temp_dir: Path, | |
| 528 | +) -> None: | |
| 529 | + guide_root = temp_dir / "guides" / "nginx" | |
| 530 | + chapters = guide_root / "chapters" | |
| 531 | + guide_root.mkdir(parents=True) | |
| 532 | + chapters.mkdir() | |
| 533 | + index_path = guide_root / "index.html" | |
| 534 | + chapter_one = chapters / "01-getting-started.html" | |
| 535 | + chapter_two = chapters / "02-installation.html" | |
| 536 | + index_path.write_text("<html></html>\n") | |
| 537 | + chapter_one.write_text("<h1>One</h1>\n") | |
| 538 | + chapter_two.write_text("<h1>Two</h1>\n") | |
| 539 | + | |
| 540 | + implementation_plan = temp_dir / "implementation.md" | |
| 541 | + implementation_plan.write_text( | |
| 542 | + "\n".join( | |
| 543 | + [ | |
| 544 | + "# Implementation Plan", | |
| 545 | + "", | |
| 546 | + "## File Changes", | |
| 547 | + f"- `{guide_root}/`", | |
| 548 | + f"- `{chapters}/`", | |
| 549 | + f"- `{index_path}`", | |
| 550 | + f"- `{chapter_one}`", | |
| 551 | + f"- `{chapter_two}`", | |
| 552 | + "", | |
| 553 | + ] | |
| 554 | + ) | |
| 555 | + ) | |
| 556 | + | |
| 557 | + dod = create_definition_of_done("Create a multi-file nginx guide.") | |
| 558 | + dod.implementation_plan = str(implementation_plan) | |
| 559 | + dod.pending_items = [ | |
| 560 | + "First, examine the existing Fortran guide structure to understand the format and content organization", | |
| 561 | + "Verify all guide files are linked and complete", | |
| 562 | + "Complete the requested work", | |
| 563 | + ] | |
| 564 | + | |
| 565 | + pending = effective_pending_todo_items(dod, project_root=temp_dir) | |
| 566 | + | |
| 567 | + assert "Verify all guide files are linked and complete" in pending | |
| 568 | + assert "Complete the requested work" in pending | |
| 569 | + assert not any("Fortran guide structure" in item for item in pending) | |
| 570 | + | |
| 571 | + | |
| 572 | +def test_effective_pending_todo_items_filters_stale_creation_steps_after_artifacts_exist( | |
| 573 | + temp_dir: Path, | |
| 574 | +) -> None: | |
| 575 | + guide_root = temp_dir / "guides" / "nginx" | |
| 576 | + chapters = guide_root / "chapters" | |
| 577 | + guide_root.mkdir(parents=True) | |
| 578 | + chapters.mkdir() | |
| 579 | + index_path = guide_root / "index.html" | |
| 580 | + chapter_one = chapters / "01-getting-started.html" | |
| 581 | + chapter_two = chapters / "02-installation.html" | |
| 582 | + index_path.write_text("<html></html>\n") | |
| 583 | + chapter_one.write_text("<h1>One</h1>\n") | |
| 584 | + chapter_two.write_text("<h1>Two</h1>\n") | |
| 585 | + | |
| 586 | + implementation_plan = temp_dir / "implementation.md" | |
| 587 | + implementation_plan.write_text( | |
| 588 | + "\n".join( | |
| 589 | + [ | |
| 590 | + "# Implementation Plan", | |
| 591 | + "", | |
| 592 | + "## File Changes", | |
| 593 | + f"- `{guide_root}/`", | |
| 594 | + f"- `{chapters}/`", | |
| 595 | + f"- `{index_path}`", | |
| 596 | + f"- `{chapter_one}`", | |
| 597 | + f"- `{chapter_two}`", | |
| 598 | + "", | |
| 599 | + ] | |
| 600 | + ) | |
| 601 | + ) | |
| 602 | + | |
| 603 | + dod = create_definition_of_done("Create a multi-file nginx guide.") | |
| 604 | + dod.implementation_plan = str(implementation_plan) | |
| 605 | + dod.pending_items = [ | |
| 606 | + "Create 01-getting-started.html", | |
| 607 | + "Creating 02-installation.html", | |
| 608 | + "Verify all guide files are linked and complete", | |
| 609 | + "Complete the requested work", | |
| 610 | + ] | |
| 611 | + | |
| 612 | + pending = effective_pending_todo_items(dod, project_root=temp_dir) | |
| 613 | + | |
| 614 | + assert "Verify all guide files are linked and complete" in pending | |
| 615 | + assert "Complete the requested work" in pending | |
| 616 | + assert "Create 01-getting-started.html" not in pending | |
| 617 | + assert "Creating 02-installation.html" not in pending | |
| 618 | + | |
| 619 | + | |
| 457 | 620 | def test_workflow_artifact_store_and_bridge_round_trip(tmp_path: Path) -> None: |
| 458 | 621 | store = WorkflowArtifactStore(tmp_path) |
| 459 | 622 | brief = ClarifyBrief.fallback( |
@@ -523,6 +686,58 @@ def test_sync_todos_to_definition_of_done_preserves_runtime_items() -> None: | ||
| 523 | 686 | assert "Update tests" in dod.completed_items |
| 524 | 687 | |
| 525 | 688 | |
| 689 | +def test_sync_todos_to_definition_of_done_keeps_completed_items_monotonic() -> None: | |
| 690 | + dod = create_definition_of_done("Create a multi-file nginx guide.") | |
| 691 | + sync_todos_to_definition_of_done( | |
| 692 | + dod, | |
| 693 | + [ | |
| 694 | + { | |
| 695 | + "content": "Create 03-first-website.html", | |
| 696 | + "active_form": "Creating 03-first-website.html", | |
| 697 | + "status": "pending", | |
| 698 | + }, | |
| 699 | + { | |
| 700 | + "content": "Create 04-configuration-basics.html", | |
| 701 | + "active_form": "Creating 04-configuration-basics.html", | |
| 702 | + "status": "pending", | |
| 703 | + }, | |
| 704 | + ], | |
| 705 | + ) | |
| 706 | + | |
| 707 | + assert advance_todos_from_tool_call( | |
| 708 | + dod, | |
| 709 | + ToolCall( | |
| 710 | + id="write-third-chapter", | |
| 711 | + name="write", | |
| 712 | + arguments={ | |
| 713 | + "file_path": "/tmp/nginx/chapters/03-first-website.html", | |
| 714 | + "content": "<html></html>", | |
| 715 | + }, | |
| 716 | + ), | |
| 717 | + ) | |
| 718 | + assert "Create 03-first-website.html" in dod.completed_items | |
| 719 | + | |
| 720 | + sync_todos_to_definition_of_done( | |
| 721 | + dod, | |
| 722 | + [ | |
| 723 | + { | |
| 724 | + "content": "Create 03-first-website.html", | |
| 725 | + "active_form": "Creating 03-first-website.html", | |
| 726 | + "status": "pending", | |
| 727 | + }, | |
| 728 | + { | |
| 729 | + "content": "Create 04-configuration-basics.html", | |
| 730 | + "active_form": "Creating 04-configuration-basics.html", | |
| 731 | + "status": "pending", | |
| 732 | + }, | |
| 733 | + ], | |
| 734 | + ) | |
| 735 | + | |
| 736 | + assert "Create 03-first-website.html" in dod.completed_items | |
| 737 | + assert "Create 03-first-website.html" not in dod.pending_items | |
| 738 | + assert "Create 04-configuration-basics.html" in dod.pending_items | |
| 739 | + | |
| 740 | + | |
| 526 | 741 | def test_advance_todos_from_tool_call_tracks_plan_progress() -> None: |
| 527 | 742 | dod = create_definition_of_done("Fix the chapter links in index.html.") |
| 528 | 743 | sync_todos_to_definition_of_done( |
@@ -651,6 +866,41 @@ def test_advance_todos_from_tool_call_keeps_aggregate_mutation_steps_pending() - | ||
| 651 | 866 | ) |
| 652 | 867 | |
| 653 | 868 | |
| 869 | +def test_advance_todos_from_tool_call_keeps_plural_chapter_creation_step_pending() -> None: | |
| 870 | + dod = create_definition_of_done("Create a multi-file nginx guide.") | |
| 871 | + sync_todos_to_definition_of_done( | |
| 872 | + dod, | |
| 873 | + [ | |
| 874 | + { | |
| 875 | + "content": "Create chapter files following the established pattern", | |
| 876 | + "active_form": "Working on: Create chapter files following the established pattern", | |
| 877 | + "status": "pending", | |
| 878 | + }, | |
| 879 | + { | |
| 880 | + "content": "Ensure consistency with existing guide formatting and content style", | |
| 881 | + "active_form": "Working on: Ensure consistency with existing guide formatting and content style", | |
| 882 | + "status": "pending", | |
| 883 | + }, | |
| 884 | + ], | |
| 885 | + ) | |
| 886 | + | |
| 887 | + assert ( | |
| 888 | + advance_todos_from_tool_call( | |
| 889 | + dod, | |
| 890 | + ToolCall( | |
| 891 | + id="write-one-chapter", | |
| 892 | + name="write", | |
| 893 | + arguments={ | |
| 894 | + "file_path": "/tmp/nginx/chapters/01-overview.html", | |
| 895 | + "content": "<html></html>", | |
| 896 | + }, | |
| 897 | + ), | |
| 898 | + ) | |
| 899 | + is False | |
| 900 | + ) | |
| 901 | + assert "Create chapter files following the established pattern" in dod.pending_items | |
| 902 | + | |
| 903 | + | |
| 654 | 904 | def test_advance_todos_from_tool_call_tracks_bash_directory_creation_progress() -> None: |
| 655 | 905 | dod = create_definition_of_done("Create a multi-file nginx guide.") |
| 656 | 906 | sync_todos_to_definition_of_done( |
@@ -679,3 +929,283 @@ def test_advance_todos_from_tool_call_tracks_bash_directory_creation_progress() | ||
| 679 | 929 | ) |
| 680 | 930 | assert "Create the nginx directory structure" in dod.completed_items |
| 681 | 931 | assert "Create index.html for nginx guide" in dod.pending_items |
| 932 | + | |
| 933 | + | |
def test_advance_todos_from_tool_call_does_not_complete_linking_step_from_glob() -> None:
    """A read-only glob over reference material never advances the linking step."""
    linking_step = "Link all chapters together properly in the index file"
    dod = create_definition_of_done("Create a multi-file nginx guide.")
    sync_todos_to_definition_of_done(
        dod,
        [
            {
                "content": linking_step,
                "active_form": "Working on: Link all chapters together properly in the index file",
                "status": "pending",
            },
        ],
    )

    reference_glob = ToolCall(
        id="glob-reference-chapters",
        name="glob",
        arguments={"path": "~/Loader", "pattern": "**/fortran/chapters/*"},
    )
    advanced = advance_todos_from_tool_call(dod, reference_glob)

    assert advanced is False
    assert linking_step in dod.pending_items
| 959 | + | |
| 960 | + | |
def test_sync_todos_to_definition_of_done_keeps_linking_step_pending_while_artifacts_missing(
    temp_dir: Path,
) -> None:
    """The linking step cannot stay completed while a planned chapter file is absent."""
    guide_root = temp_dir / "guides" / "nginx"
    chapters = guide_root / "chapters"
    chapters.mkdir(parents=True)
    index_path = guide_root / "index.html"
    chapter_one = chapters / "01-getting-started.html"
    chapter_two = chapters / "02-installation.html"  # planned but intentionally never written
    index_path.write_text("<html></html>\n")
    chapter_one.write_text("<h1>One</h1>\n")

    plan_lines = [
        "# Implementation Plan",
        "",
        "## File Changes",
        f"- `{guide_root}/`",
        f"- `{chapters}/`",
        f"- `{index_path}`",
        f"- `{chapter_one}`",
        f"- `{chapter_two}`",
        "",
    ]
    implementation_plan = temp_dir / "implementation.md"
    implementation_plan.write_text("\n".join(plan_lines))

    linking_step = "Link all chapters together properly in the index file"
    dod = create_definition_of_done("Create a multi-file nginx guide.")
    dod.implementation_plan = str(implementation_plan)
    sync_todos_to_definition_of_done(
        dod,
        [
            {
                "content": "Create 01-getting-started.html chapter file",
                "active_form": "Creating 01-getting-started.html chapter file",
                "status": "completed",
            },
            {
                "content": linking_step,
                "active_form": "Linking chapters in the index file",
                "status": "completed",
            },
            {
                "content": "Create 02-installation.html chapter file",
                "active_form": "Creating 02-installation.html chapter file",
                "status": "pending",
            },
        ],
        project_root=temp_dir,
    )

    assert linking_step in dod.pending_items
    assert linking_step not in dod.completed_items
| 1017 | + | |
| 1018 | + | |
def test_sync_todos_to_definition_of_done_allows_linking_step_when_artifacts_exist(
    temp_dir: Path,
) -> None:
    """With every planned artifact on disk, a completed linking step stays completed."""
    guide_root = temp_dir / "guides" / "nginx"
    chapters = guide_root / "chapters"
    chapters.mkdir(parents=True)
    index_path = guide_root / "index.html"
    chapter_one = chapters / "01-getting-started.html"
    chapter_two = chapters / "02-installation.html"
    index_path.write_text("<html></html>\n")
    chapter_one.write_text("<h1>One</h1>\n")
    chapter_two.write_text("<h1>Two</h1>\n")

    plan_lines = [
        "# Implementation Plan",
        "",
        "## File Changes",
        f"- `{guide_root}/`",
        f"- `{chapters}/`",
        f"- `{index_path}`",
        f"- `{chapter_one}`",
        f"- `{chapter_two}`",
        "",
    ]
    implementation_plan = temp_dir / "implementation.md"
    implementation_plan.write_text("\n".join(plan_lines))

    linking_step = "Link all chapters together properly in the index file"
    dod = create_definition_of_done("Create a multi-file nginx guide.")
    dod.implementation_plan = str(implementation_plan)
    sync_todos_to_definition_of_done(
        dod,
        [
            {
                "content": linking_step,
                "active_form": "Linking chapters in the index file",
                "status": "completed",
            },
        ],
        project_root=temp_dir,
    )

    assert linking_step in dod.completed_items
| 1065 | + | |
| 1066 | + | |
def test_sync_todos_to_definition_of_done_reopens_directory_content_step_when_output_dir_is_empty(
    temp_dir: Path,
) -> None:
    """A 'create chapter files' step is reopened while the chapters directory is empty."""
    guide_root = temp_dir / "guides" / "nginx"
    chapters = guide_root / "chapters"
    chapters.mkdir(parents=True)  # directory exists but is deliberately left empty
    index_path = guide_root / "index.html"
    index_path.write_text("<html></html>\n")

    plan_lines = [
        "# Implementation Plan",
        "",
        "## File Changes",
        f"- `{index_path}`",
        f"- `{chapters}/` (directory for chapter files)",
        "",
        "## Execution Order",
        "- Create chapter files with appropriate content",
    ]
    implementation_plan = temp_dir / "implementation.md"
    implementation_plan.write_text("\n".join(plan_lines))

    content_step = "Create chapter files with appropriate content"
    dod = create_definition_of_done("Create an equally thorough nginx guide with chapters.")
    dod.implementation_plan = str(implementation_plan)
    sync_todos_to_definition_of_done(
        dod,
        [
            {
                "content": content_step,
                "active_form": "Creating chapter files with appropriate content",
                "status": "completed",
            },
        ],
        project_root=temp_dir,
    )

    assert content_step in dod.pending_items
    assert content_step not in dod.completed_items
| 1109 | + | |
| 1110 | + | |
def test_reconcile_aggregate_completion_steps_reopens_linking_step_when_artifacts_missing(
    temp_dir: Path,
) -> None:
    """An aggregate linking step is moved back to pending while a planned chapter is absent."""
    guide_root = temp_dir / "guides" / "nginx"
    chapters = guide_root / "chapters"
    chapters.mkdir(parents=True)
    index_path = guide_root / "index.html"
    chapter_one = chapters / "01-getting-started.html"
    chapter_two = chapters / "02-installation.html"
    chapter_three = chapters / "03-first-website.html"  # planned but never written
    index_path.write_text("<html></html>\n")
    chapter_one.write_text("<h1>One</h1>\n")
    chapter_two.write_text("<h1>Two</h1>\n")

    plan_lines = [
        "# Implementation Plan",
        "",
        "## File Changes",
        f"- `{guide_root}/`",
        f"- `{chapters}/`",
        f"- `{index_path}`",
        f"- `{chapter_one}`",
        f"- `{chapter_two}`",
        f"- `{chapter_three}`",
        "",
    ]
    implementation_plan = temp_dir / "implementation.md"
    implementation_plan.write_text("\n".join(plan_lines))

    aggregate_step = "Link all chapters together properly"
    dod = create_definition_of_done("Create a multi-file nginx guide.")
    dod.implementation_plan = str(implementation_plan)
    dod.completed_items.append(aggregate_step)

    reconcile_aggregate_completion_steps(dod, project_root=temp_dir)

    assert aggregate_step not in dod.completed_items
    assert aggregate_step in dod.pending_items
| 1152 | + | |
| 1153 | + | |
def test_sync_todos_to_definition_of_done_drops_unplanned_artifact_expansion_after_plan_complete(
    temp_dir: Path,
) -> None:
    """Once every planned artifact exists, an unplanned expansion todo must be dropped.

    Fix: the original negative assertion checked the *active_form* string
    ("Creating 07-performance-tuning.html") against ``pending_items``, but
    sibling tests show pending/completed items store todo *content* strings,
    so that check could never fail. Assert the content form too so the test
    actually guards the drop behavior; the active_form check is kept for
    backward compatibility.
    """
    guide_root = temp_dir / "guides" / "nginx"
    chapters = guide_root / "chapters"
    guide_root.mkdir(parents=True)
    chapters.mkdir()
    index_path = guide_root / "index.html"
    chapter_one = chapters / "01-getting-started.html"
    chapter_two = chapters / "02-installation.html"
    index_path.write_text("<html></html>\n")
    chapter_one.write_text("<h1>One</h1>\n")
    chapter_two.write_text("<h1>Two</h1>\n")

    implementation_plan = temp_dir / "implementation.md"
    implementation_plan.write_text(
        "\n".join(
            [
                "# Implementation Plan",
                "",
                "## File Changes",
                f"- `{guide_root}/`",
                f"- `{chapters}/`",
                f"- `{index_path}`",
                f"- `{chapter_one}`",
                f"- `{chapter_two}`",
                "",
            ]
        )
    )

    dod = create_definition_of_done("Create a multi-file nginx guide.")
    dod.implementation_plan = str(implementation_plan)
    sync_todos_to_definition_of_done(
        dod,
        [
            {
                "content": "Create 01-getting-started.html",
                "active_form": "Creating 01-getting-started.html",
                "status": "completed",
            },
            {
                "content": "Create 02-installation.html",
                "active_form": "Creating 02-installation.html",
                "status": "completed",
            },
            {
                "content": "Create 07-performance-tuning.html",
                "active_form": "Creating 07-performance-tuning.html",
                "status": "in_progress",
            },
        ],
        project_root=temp_dir,
    )

    # Content form is what pending_items stores; this is the assertion that
    # can actually fail if the unplanned todo is retained.
    assert "Create 07-performance-tuning.html" not in dod.pending_items
    assert "Creating 07-performance-tuning.html" not in dod.pending_items
    assert "Create 01-getting-started.html" in dod.completed_items
    assert "Create 02-installation.html" in dod.completed_items
tests/test_workflow_recovery.pyadded@@ -0,0 +1,20 @@ | ||
| 1 | +"""Focused tests for workflow recovery priority rules.""" | |
| 2 | + | |
| 3 | +from __future__ import annotations | |
| 4 | + | |
| 5 | +from pathlib import Path | |
| 6 | + | |
| 7 | +from loader.runtime.workflow_recovery import _should_prioritize_missing_artifact | |
| 8 | + | |
| 9 | + | |
def test_workflow_recovery_prioritizes_missing_artifact_over_review_step() -> None:
    """A missing planned artifact outranks a generic review step, not its own creation step."""
    missing_artifact = (Path("/tmp/guide/06-ssl-configuration.html"), False)
    review_step = "Ensure all files are properly linked and formatted consistently"
    creation_step = "Create the final chapter (06-ssl-configuration.html)"

    assert _should_prioritize_missing_artifact(
        next_pending=review_step,
        missing_artifact=missing_artifact,
    )
    assert not _should_prioritize_missing_artifact(
        next_pending=creation_step,
        missing_artifact=missing_artifact,
    )
tests/test_workflow_tools.pymodified@@ -43,6 +43,65 @@ async def test_todo_write_persists_and_returns_previous_state(tmp_path: Path) -> | ||
| 43 | 43 | assert json.loads(store_path.read_text()) == [] |
| 44 | 44 | |
| 45 | 45 | |
@pytest.mark.asyncio
async def test_todo_write_merges_partial_status_updates_with_existing_scope(
    tmp_path: Path,
) -> None:
    """A partial status update for one todo is merged into the previously stored list."""

    def todo(content: str, active_form: str, status: str) -> dict[str, str]:
        # Small factory to keep the todo payloads readable and identical in shape.
        return {"content": content, "active_form": active_form, "status": status}

    tool = TodoWriteTool(tmp_path)
    initial = await tool.execute(
        todos=[
            todo("Create nginx index", "Creating nginx index", "completed"),
            todo("Create chapter files", "Creating chapter files", "in_progress"),
            todo("Verify links", "Verifying links", "pending"),
        ]
    )
    partial = await tool.execute(
        todos=[todo("Create chapter files", "Creating chapter files", "completed")]
    )

    assert initial.is_error is False
    assert partial.is_error is False
    initial_payload = json.loads(initial.output)
    partial_payload = json.loads(partial.output)
    assert partial_payload["old_todos"] == initial_payload["new_todos"]
    assert partial_payload["new_todos"] == [
        todo("Create nginx index", "Creating nginx index", "completed"),
        todo("Create chapter files", "Creating chapter files", "completed"),
        todo("Verify links", "Verifying links", "pending"),
    ]
| 104 | + | |
| 46 | 105 | @pytest.mark.asyncio |
| 47 | 106 | async def test_todo_write_rejects_invalid_payloads_and_sets_verification_nudge( |
| 48 | 107 | tmp_path: Path, |