Name concrete TodoWrite targets
- SHA: 6d1c5ec23dbc5fc901c34bff3b791ad0f8cfb258
- Parents: 67175cc
- Tree: dc0e0f5
6d1c5ec
6d1c5ec23dbc5fc901c34bff3b791ad0f8cfb258
67175cc
dc0e0f5

| Status | File | + | - |
|---|---|---|---|
| M | src/loader/runtime/repair.py | 3 | 158 |
| M | src/loader/runtime/tool_batches.py | 28 | 0 |
| M | src/loader/runtime/workflow.py | 172 | 0 |
| M | tests/test_runtime_repair_flows.py | 102 | 0 |
| M | tests/test_tool_batches.py | 132 | 0 |

src/loader/runtime/repair.py (modified)
@@ -16,6 +16,7 @@ from .dod import ( | ||
| 16 | 16 | ) |
| 17 | 17 | from .parsing import parse_tool_calls |
| 18 | 18 | from .workflow import ( |
| 19 | + infer_pending_todo_output_target, | |
| 19 | 20 | preferred_pending_todo_item, |
| 20 | 21 | reconcile_aggregate_completion_steps, |
| 21 | 22 | todo_file_candidates, |
@@ -680,112 +681,11 @@ class ResponseRepairer: | ||
| 680 | 681 | dod: DefinitionOfDone, |
| 681 | 682 | item: str, |
| 682 | 683 | ) -> Path | None: |
| 683 | - candidates = todo_file_candidates(item) | |
| 684 | - if not candidates: | |
| 685 | - return self._infer_pending_item_target_from_html_graph(dod, item) | |
| 686 | - | |
| 687 | - planned_targets = collect_planned_artifact_targets( | |
| 688 | - dod, | |
| 689 | - project_root=self.context.project_root, | |
| 690 | - max_paths=12, | |
| 691 | - ) | |
| 692 | - planned_files = { | |
| 693 | - target.name.lower(): target | |
| 694 | - for target, expect_directory in planned_targets | |
| 695 | - if not expect_directory | |
| 696 | - } | |
| 697 | - planned_directories = [ | |
| 698 | - target | |
| 699 | - for target, expect_directory in planned_targets | |
| 700 | - if expect_directory | |
| 701 | - ] | |
| 702 | - touched_paths = [ | |
| 703 | - Path(path) | |
| 704 | - for path in dod.touched_files | |
| 705 | - if str(path).strip() | |
| 706 | - ] | |
| 707 | - | |
| 708 | - for candidate in candidates: | |
| 709 | - candidate_str = str(candidate) | |
| 710 | - if candidate.is_absolute() or candidate_str.startswith("~"): | |
| 711 | - return Path(candidate_str).expanduser() | |
| 712 | - | |
| 713 | - planned_match = planned_files.get(candidate.name.lower()) | |
| 714 | - if planned_match is not None: | |
| 715 | - return planned_match | |
| 716 | - | |
| 717 | - for touched in reversed(touched_paths): | |
| 718 | - if touched.name.lower() == candidate.name.lower(): | |
| 719 | - continue | |
| 720 | - if candidate.suffix and touched.suffix.lower() != candidate.suffix.lower(): | |
| 721 | - continue | |
| 722 | - return touched.parent / candidate.name | |
| 723 | - | |
| 724 | - for directory in planned_directories: | |
| 725 | - return directory / candidate.name | |
| 726 | - | |
| 727 | - return None | |
| 728 | - | |
| 729 | - def _infer_pending_item_target_from_html_graph( | |
| 730 | - self, | |
| 731 | - dod: DefinitionOfDone, | |
| 732 | - item: str, | |
| 733 | - ) -> Path | None: | |
| 734 | - target_label = _normalize_pending_output_label(item) | |
| 735 | - if not target_label: | |
| 736 | - return None | |
| 737 | - | |
| 738 | - html_files = self._pending_item_html_sources(dod) | |
| 739 | - matches: list[tuple[int, bool, Path]] = [] | |
| 740 | - for html_file in html_files: | |
| 741 | - try: | |
| 742 | - content = html_file.read_text() | |
| 743 | - except OSError: | |
| 744 | - continue | |
| 745 | - for href, link_text in _iter_local_html_links(html_file, content): | |
| 746 | - resolved = (html_file.parent / href).resolve(strict=False) | |
| 747 | - score = _pending_output_link_match_score( | |
| 748 | - target_label, | |
| 749 | - _normalize_pending_output_label(link_text), | |
| 750 | - ) | |
| 751 | - if score <= 0: | |
| 752 | - continue | |
| 753 | - matches.append((score, not resolved.exists(), resolved)) | |
| 754 | - | |
| 755 | - if not matches: | |
| 756 | - return None | |
| 757 | - matches.sort(key=lambda item: (item[0], item[1], str(item[2])), reverse=True) | |
| 758 | - return matches[0][2] | |
| 759 | - | |
| 760 | - def _pending_item_html_sources(self, dod: DefinitionOfDone) -> list[Path]: | |
| 761 | - planned_targets = collect_planned_artifact_targets( | |
| 684 | + return infer_pending_todo_output_target( | |
| 762 | 685 | dod, |
| 686 | + item, | |
| 763 | 687 | project_root=self.context.project_root, |
| 764 | - max_paths=12, | |
| 765 | 688 | ) |
| 766 | - html_sources: list[Path] = [] | |
| 767 | - seen: set[str] = set() | |
| 768 | - | |
| 769 | - for raw_path in dod.touched_files: | |
| 770 | - path = Path(raw_path).expanduser().resolve(strict=False) | |
| 771 | - if path.suffix.lower() not in {".html", ".htm"}: | |
| 772 | - continue | |
| 773 | - key = str(path) | |
| 774 | - if key in seen: | |
| 775 | - continue | |
| 776 | - seen.add(key) | |
| 777 | - html_sources.append(path) | |
| 778 | - | |
| 779 | - for target, expect_directory in planned_targets: | |
| 780 | - if expect_directory or target.suffix.lower() not in {".html", ".htm"}: | |
| 781 | - continue | |
| 782 | - key = str(target) | |
| 783 | - if key in seen: | |
| 784 | - continue | |
| 785 | - seen.add(key) | |
| 786 | - html_sources.append(target) | |
| 787 | - | |
| 788 | - return html_sources | |
| 789 | 689 | |
| 790 | 690 | def _preferred_resume_pending_item( |
| 791 | 691 | self, |
@@ -855,58 +755,3 @@ def _todo_is_mutation_step(label: str) -> bool: | ||
| 855 | 755 | def _todo_is_consistency_review_step(label: str) -> bool: |
| 856 | 756 | lowered = label.lower() |
| 857 | 757 | return any(token in lowered for token in _CONSISTENCY_REVIEW_HINTS) |
| 858 | - | |
| 859 | - | |
| 860 | -def _normalize_pending_output_label(value: str) -> str: | |
| 861 | - text = " ".join(str(value).strip().split()).lower() | |
| 862 | - if not text: | |
| 863 | - return "" | |
| 864 | - text = re.sub( | |
| 865 | - r"^(?:working on:\s*)?(?:create|creating|write|writing|build|building|develop|developing)\s+", | |
| 866 | - "", | |
| 867 | - text, | |
| 868 | - ) | |
| 869 | - text = re.sub(r"\bfor nginx guide\b", "", text) | |
| 870 | - text = re.sub(r"[^a-z0-9]+", " ", text) | |
| 871 | - return " ".join(text.split()) | |
| 872 | - | |
| 873 | - | |
| 874 | -def _pending_output_link_match_score(todo_label: str, link_label: str) -> int: | |
| 875 | - if not todo_label or not link_label: | |
| 876 | - return 0 | |
| 877 | - if todo_label == link_label: | |
| 878 | - return 3 | |
| 879 | - if todo_label in link_label or link_label in todo_label: | |
| 880 | - return 2 | |
| 881 | - todo_tokens = {token for token in todo_label.split() if len(token) > 2} | |
| 882 | - link_tokens = {token for token in link_label.split() if len(token) > 2} | |
| 883 | - if not todo_tokens or not link_tokens: | |
| 884 | - return 0 | |
| 885 | - overlap = todo_tokens & link_tokens | |
| 886 | - if len(overlap) >= min(3, len(todo_tokens), len(link_tokens)): | |
| 887 | - return 1 | |
| 888 | - return 0 | |
| 889 | - | |
| 890 | - | |
| 891 | -def _iter_local_html_links(file_path: Path, content: str) -> list[tuple[str, str]]: | |
| 892 | - pattern = re.compile( | |
| 893 | - r"<a\b[^>]*href\s*=\s*[\"']([^\"']+)[\"'][^>]*>(.*?)</a>", | |
| 894 | - re.IGNORECASE | re.DOTALL, | |
| 895 | - ) | |
| 896 | - links: list[tuple[str, str]] = [] | |
| 897 | - seen: set[tuple[str, str]] = set() | |
| 898 | - for href, inner_html in pattern.findall(content): | |
| 899 | - target = href.strip() | |
| 900 | - if not target or target.startswith(("#", "http://", "https://", "mailto:")): | |
| 901 | - continue | |
| 902 | - trimmed_target = target.split("?", 1)[0].split("#", 1)[0] | |
| 903 | - if Path(trimmed_target).suffix.lower() not in {".html", ".htm"}: | |
| 904 | - continue | |
| 905 | - label = re.sub(r"<[^>]+>", " ", inner_html) | |
| 906 | - label = " ".join(label.split()) | |
| 907 | - key = (trimmed_target, label) | |
| 908 | - if key in seen: | |
| 909 | - continue | |
| 910 | - seen.add(key) | |
| 911 | - links.append((trimmed_target, label)) | |
| 912 | - return links | |
src/loader/runtime/tool_batches.py (modified)
@@ -40,6 +40,7 @@ from .verification_observations import ( | ||
| 40 | 40 | from .workflow import ( |
| 41 | 41 | advance_todos_from_tool_call, |
| 42 | 42 | effective_pending_todo_items, |
| 43 | + infer_pending_todo_output_target, | |
| 43 | 44 | preferred_pending_todo_item, |
| 44 | 45 | reconcile_aggregate_completion_steps, |
| 45 | 46 | sync_todos_to_definition_of_done, |
@@ -937,6 +938,33 @@ class ToolBatchRunner: | ||
| 937 | 938 | ) |
| 938 | 939 | if missing_artifact is None: |
| 939 | 940 | if next_pending and _todo_is_mutation_step(next_pending): |
| 941 | + pending_target = infer_pending_todo_output_target( | |
| 942 | + dod, | |
| 943 | + next_pending, | |
| 944 | + project_root=self.context.project_root, | |
| 945 | + ) | |
| 946 | + if pending_target is not None: | |
| 947 | + concrete_message = ( | |
| 948 | + "Todo tracking is updated. Continue with the next pending item: " | |
| 949 | + f"`{next_pending}`. Resume by creating `{pending_target.name}` now. " | |
| 950 | + f"Prefer one `write` call for `{pending_target}` instead of more rereads. " | |
| 951 | + ) | |
| 952 | + if not pending_target.parent.exists(): | |
| 953 | + concrete_message += ( | |
| 954 | + "The `write` tool can create that file's parent directories " | |
| 955 | + "automatically, so do the write in one step instead of stopping " | |
| 956 | + "for a separate mkdir. " | |
| 957 | + ) | |
| 958 | + concrete_message += ( | |
| 959 | + "Use the current output files as the source of truth, and do not " | |
| 960 | + "reopen reference materials unless one specific fact required for " | |
| 961 | + "that step is still unknown. Make your next response the concrete " | |
| 962 | + "mutation tool call itself, not another bookkeeping-only turn. " | |
| 963 | + "Perform the mutation now instead of spending another turn on " | |
| 964 | + "planning, rereads, or verification." | |
| 965 | + ) | |
| 966 | + self.context.queue_steering_message(concrete_message) | |
| 967 | + return | |
| 940 | 968 | self.context.queue_steering_message( |
| 941 | 969 | "Todo tracking is updated. Continue with the next pending item: " |
| 942 | 970 | f"`{next_pending}`. Use the current output files as the source of " |
src/loader/runtime/workflow.py (modified)
@@ -54,6 +54,7 @@ __all__ = [ | ||
| 54 | 54 | "effective_pending_todo_items", |
| 55 | 55 | "enrich_clarify_brief_with_grounding", |
| 56 | 56 | "extract_verification_commands_from_markdown", |
| 57 | + "infer_pending_todo_output_target", | |
| 57 | 58 | "load_brief", |
| 58 | 59 | "load_planning_artifacts", |
| 59 | 60 | "merge_refreshed_todos_with_existing_scope", |
@@ -887,6 +888,87 @@ def todo_file_candidates(item: str) -> list[Path]: | ||
| 887 | 888 | return candidates |
| 888 | 889 | |
| 889 | 890 | |
| 891 | +def infer_pending_todo_output_target( | |
| 892 | + dod, | |
| 893 | + item: str, | |
| 894 | + *, | |
| 895 | + project_root: Path | None = None, | |
| 896 | +) -> Path | None: | |
| 897 | + """Infer the concrete file path a pending todo is asking the model to mutate.""" | |
| 898 | + | |
| 899 | + root = project_root or Path.cwd() | |
| 900 | + candidates = todo_file_candidates(item) | |
| 901 | + planned_targets = collect_planned_artifact_targets( | |
| 902 | + dod, | |
| 903 | + project_root=root, | |
| 904 | + max_paths=12, | |
| 905 | + ) | |
| 906 | + | |
| 907 | + if candidates: | |
| 908 | + planned_files = { | |
| 909 | + target.name.lower(): target | |
| 910 | + for target, expect_directory in planned_targets | |
| 911 | + if not expect_directory | |
| 912 | + } | |
| 913 | + planned_directories = [ | |
| 914 | + target | |
| 915 | + for target, expect_directory in planned_targets | |
| 916 | + if expect_directory | |
| 917 | + ] | |
| 918 | + touched_paths = [ | |
| 919 | + Path(path) | |
| 920 | + for path in dod.touched_files | |
| 921 | + if str(path).strip() | |
| 922 | + ] | |
| 923 | + | |
| 924 | + for candidate in candidates: | |
| 925 | + candidate_str = str(candidate) | |
| 926 | + if candidate.is_absolute() or candidate_str.startswith("~"): | |
| 927 | + return Path(candidate_str).expanduser() | |
| 928 | + | |
| 929 | + planned_match = planned_files.get(candidate.name.lower()) | |
| 930 | + if planned_match is not None: | |
| 931 | + return planned_match | |
| 932 | + | |
| 933 | + for touched in reversed(touched_paths): | |
| 934 | + if touched.name.lower() == candidate.name.lower(): | |
| 935 | + continue | |
| 936 | + if candidate.suffix and touched.suffix.lower() != candidate.suffix.lower(): | |
| 937 | + continue | |
| 938 | + return touched.parent / candidate.name | |
| 939 | + | |
| 940 | + for directory in planned_directories: | |
| 941 | + return directory / candidate.name | |
| 942 | + | |
| 943 | + target_label = _normalize_pending_output_label(item) | |
| 944 | + if not target_label: | |
| 945 | + return None | |
| 946 | + | |
| 947 | + matches: list[tuple[int, bool, Path]] = [] | |
| 948 | + for html_file in _pending_item_html_sources( | |
| 949 | + dod, | |
| 950 | + project_root=root, | |
| 951 | + ): | |
| 952 | + try: | |
| 953 | + content = html_file.read_text() | |
| 954 | + except OSError: | |
| 955 | + continue | |
| 956 | + for href, link_text in _iter_local_html_links(content): | |
| 957 | + resolved = (html_file.parent / href).resolve(strict=False) | |
| 958 | + score = _pending_output_link_match_score( | |
| 959 | + target_label, | |
| 960 | + _normalize_pending_output_label(link_text), | |
| 961 | + ) | |
| 962 | + if score <= 0: | |
| 963 | + continue | |
| 964 | + matches.append((score, not resolved.exists(), resolved)) | |
| 965 | + | |
| 966 | + if not matches: | |
| 967 | + return None | |
| 968 | + matches.sort(key=lambda item: (item[0], item[1], str(item[2])), reverse=True) | |
| 969 | + return matches[0][2] | |
| 970 | + | |
| 971 | + | |
| 890 | 972 | def preserve_task_grounded_acceptance_criteria( |
| 891 | 973 | task_statement: str, |
| 892 | 974 | *, |
@@ -905,6 +987,96 @@ def preserve_task_grounded_acceptance_criteria( | ||
| 905 | 987 | return list(dict.fromkeys([*grounded_existing, *refreshed_acceptance_criteria])) |
| 906 | 988 | |
| 907 | 989 | |
| 990 | +def _pending_item_html_sources( | |
| 991 | + dod, | |
| 992 | + *, | |
| 993 | + project_root: Path, | |
| 994 | +) -> list[Path]: | |
| 995 | + planned_targets = collect_planned_artifact_targets( | |
| 996 | + dod, | |
| 997 | + project_root=project_root, | |
| 998 | + max_paths=12, | |
| 999 | + ) | |
| 1000 | + html_sources: list[Path] = [] | |
| 1001 | + seen: set[str] = set() | |
| 1002 | + | |
| 1003 | + for raw_path in dod.touched_files: | |
| 1004 | + path = Path(raw_path).expanduser().resolve(strict=False) | |
| 1005 | + if path.suffix.lower() not in {".html", ".htm"}: | |
| 1006 | + continue | |
| 1007 | + key = str(path) | |
| 1008 | + if key in seen: | |
| 1009 | + continue | |
| 1010 | + seen.add(key) | |
| 1011 | + html_sources.append(path) | |
| 1012 | + | |
| 1013 | + for target, expect_directory in planned_targets: | |
| 1014 | + if expect_directory or target.suffix.lower() not in {".html", ".htm"}: | |
| 1015 | + continue | |
| 1016 | + key = str(target) | |
| 1017 | + if key in seen: | |
| 1018 | + continue | |
| 1019 | + seen.add(key) | |
| 1020 | + html_sources.append(target) | |
| 1021 | + | |
| 1022 | + return html_sources | |
| 1023 | + | |
| 1024 | + | |
| 1025 | +def _normalize_pending_output_label(value: str) -> str: | |
| 1026 | + text = " ".join(str(value).strip().split()).lower() | |
| 1027 | + if not text: | |
| 1028 | + return "" | |
| 1029 | + text = re.sub( | |
| 1030 | + r"^(?:working on:\s*)?(?:create|creating|write|writing|build|building|develop|developing)\s+", | |
| 1031 | + "", | |
| 1032 | + text, | |
| 1033 | + ) | |
| 1034 | + text = re.sub(r"\bfor nginx guide\b", "", text) | |
| 1035 | + text = re.sub(r"[^a-z0-9]+", " ", text) | |
| 1036 | + return " ".join(text.split()) | |
| 1037 | + | |
| 1038 | + | |
| 1039 | +def _pending_output_link_match_score(todo_label: str, link_label: str) -> int: | |
| 1040 | + if not todo_label or not link_label: | |
| 1041 | + return 0 | |
| 1042 | + if todo_label == link_label: | |
| 1043 | + return 3 | |
| 1044 | + if todo_label in link_label or link_label in todo_label: | |
| 1045 | + return 2 | |
| 1046 | + todo_tokens = {token for token in todo_label.split() if len(token) > 2} | |
| 1047 | + link_tokens = {token for token in link_label.split() if len(token) > 2} | |
| 1048 | + if not todo_tokens or not link_tokens: | |
| 1049 | + return 0 | |
| 1050 | + overlap = todo_tokens & link_tokens | |
| 1051 | + if len(overlap) >= min(3, len(todo_tokens), len(link_tokens)): | |
| 1052 | + return 1 | |
| 1053 | + return 0 | |
| 1054 | + | |
| 1055 | + | |
| 1056 | +def _iter_local_html_links(content: str) -> list[tuple[str, str]]: | |
| 1057 | + pattern = re.compile( | |
| 1058 | + r"<a\b[^>]*href\s*=\s*[\"']([^\"']+)[\"'][^>]*>(.*?)</a>", | |
| 1059 | + re.IGNORECASE | re.DOTALL, | |
| 1060 | + ) | |
| 1061 | + links: list[tuple[str, str]] = [] | |
| 1062 | + seen: set[tuple[str, str]] = set() | |
| 1063 | + for href, inner_html in pattern.findall(content): | |
| 1064 | + target = href.strip() | |
| 1065 | + if not target or target.startswith(("#", "http://", "https://", "mailto:")): | |
| 1066 | + continue | |
| 1067 | + trimmed_target = target.split("?", 1)[0].split("#", 1)[0] | |
| 1068 | + if Path(trimmed_target).suffix.lower() not in {".html", ".htm"}: | |
| 1069 | + continue | |
| 1070 | + label = re.sub(r"<[^>]+>", " ", inner_html) | |
| 1071 | + label = " ".join(label.split()) | |
| 1072 | + key = (trimmed_target, label) | |
| 1073 | + if key in seen: | |
| 1074 | + continue | |
| 1075 | + seen.add(key) | |
| 1076 | + links.append((trimmed_target, label)) | |
| 1077 | + return links | |
| 1078 | + | |
| 1079 | + | |
| 908 | 1080 | def merge_refreshed_todos_with_existing_scope( |
| 909 | 1081 | task_statement: str, |
| 910 | 1082 | *, |
tests/test_runtime_repair_flows.py (modified)
@@ -204,6 +204,108 @@ async def test_empty_response_retry_budget_resets_after_successful_turn( | ||
| 204 | 204 | assert sum("retry 1/2" in message for message in retry_messages) >= 2 |
| 205 | 205 | |
| 206 | 206 | |
| 207 | +@pytest.mark.asyncio | |
| 208 | +async def test_empty_response_retry_budget_resets_after_todowrite_turn( | |
| 209 | + temp_dir: Path, | |
| 210 | +) -> None: | |
| 211 | + first = temp_dir / "index.html" | |
| 212 | + second = temp_dir / "chapters" / "01-introduction.html" | |
| 213 | + backend = ScriptedBackend( | |
| 214 | + completions=[ | |
| 215 | + CompletionResponse(content=""), | |
| 216 | + CompletionResponse( | |
| 217 | + content="I'll create the guide index now.", | |
| 218 | + tool_calls=[ | |
| 219 | + ToolCall( | |
| 220 | + id="write-1", | |
| 221 | + name="write", | |
| 222 | + arguments={ | |
| 223 | + "file_path": str(first), | |
| 224 | + "content": "<html></html>\n", | |
| 225 | + }, | |
| 226 | + ) | |
| 227 | + ], | |
| 228 | + ), | |
| 229 | + CompletionResponse( | |
| 230 | + content="I'll create the first chapter now.", | |
| 231 | + tool_calls=[ | |
| 232 | + ToolCall( | |
| 233 | + id="write-2", | |
| 234 | + name="write", | |
| 235 | + arguments={ | |
| 236 | + "file_path": str(second), | |
| 237 | + "content": "<html></html>\n", | |
| 238 | + }, | |
| 239 | + ) | |
| 240 | + ], | |
| 241 | + ), | |
| 242 | + CompletionResponse( | |
| 243 | + content="I'll update the task list now.", | |
| 244 | + tool_calls=[ | |
| 245 | + ToolCall( | |
| 246 | + id="todo-1", | |
| 247 | + name="TodoWrite", | |
| 248 | + arguments={ | |
| 249 | + "todos": [ | |
| 250 | + { | |
| 251 | + "content": "Create index.html", | |
| 252 | + "status": "completed", | |
| 253 | + "active_form": "Creating index.html", | |
| 254 | + }, | |
| 255 | + { | |
| 256 | + "content": "Create 01-introduction.html", | |
| 257 | + "status": "completed", | |
| 258 | + "active_form": "Creating 01-introduction.html", | |
| 259 | + }, | |
| 260 | + { | |
| 261 | + "content": "Create 02-installation.html", | |
| 262 | + "status": "pending", | |
| 263 | + "active_form": "Creating 02-installation.html", | |
| 264 | + }, | |
| 265 | + ] | |
| 266 | + }, | |
| 267 | + ) | |
| 268 | + ], | |
| 269 | + ), | |
| 270 | + CompletionResponse(content=""), | |
| 271 | + CompletionResponse( | |
| 272 | + content="I'll create the second chapter now.", | |
| 273 | + tool_calls=[ | |
| 274 | + ToolCall( | |
| 275 | + id="write-3", | |
| 276 | + name="write", | |
| 277 | + arguments={ | |
| 278 | + "file_path": str(temp_dir / "chapters" / "02-installation.html"), | |
| 279 | + "content": "<html></html>\n", | |
| 280 | + }, | |
| 281 | + ) | |
| 282 | + ], | |
| 283 | + ), | |
| 284 | + CompletionResponse(content="The guide files are created."), | |
| 285 | + ] | |
| 286 | + ) | |
| 287 | + | |
| 288 | + run = await run_scenario( | |
| 289 | + "Create a small nginx guide.", | |
| 290 | + backend, | |
| 291 | + config=non_streaming_config(), | |
| 292 | + project_root=temp_dir, | |
| 293 | + ) | |
| 294 | + | |
| 295 | + assert run.response.startswith("The guide files are created.") | |
| 296 | + retry_messages: list[str] = [] | |
| 297 | + for invocation in backend.invocations: | |
| 298 | + for message in invocation.messages: | |
| 299 | + if message.role != Role.USER or "[EMPTY ASSISTANT RESPONSE]" not in message.content: | |
| 300 | + continue | |
| 301 | + if retry_messages and retry_messages[-1] == message.content: | |
| 302 | + continue | |
| 303 | + retry_messages.append(message.content) | |
| 304 | + assert len(retry_messages) >= 2 | |
| 305 | + assert all("retry 2/2" not in message for message in retry_messages) | |
| 306 | + assert sum("retry 1/2" in message for message in retry_messages) >= 2 | |
| 307 | + | |
| 308 | + | |
| 207 | 309 | @pytest.mark.asyncio |
| 208 | 310 | async def test_repeated_empty_responses_fail_honestly_after_one_retry( |
| 209 | 311 | temp_dir: Path, |
tests/test_tool_batches.py (modified)
@@ -2884,6 +2884,138 @@ async def test_tool_batch_runner_todowrite_with_declared_child_targets_names_nex | ||
| 2884 | 2884 | assert "Do not spend the next turn on TodoWrite alone" in message |
| 2885 | 2885 | |
| 2886 | 2886 | |
| 2887 | +@pytest.mark.asyncio | |
| 2888 | +async def test_tool_batch_runner_todowrite_names_concrete_pending_file_after_artifacts_exist( | |
| 2889 | + temp_dir: Path, | |
| 2890 | +) -> None: | |
| 2891 | + async def assess_confidence( | |
| 2892 | + tool_name: str, | |
| 2893 | + tool_args: dict, | |
| 2894 | + context: str, | |
| 2895 | + ) -> ConfidenceAssessment: | |
| 2896 | + raise AssertionError("Confidence scoring should not run in this scenario") | |
| 2897 | + | |
| 2898 | + async def verify_action( | |
| 2899 | + tool_name: str, | |
| 2900 | + tool_args: dict, | |
| 2901 | + result: str, | |
| 2902 | + expected: str = "", | |
| 2903 | + ) -> ActionVerification: | |
| 2904 | + raise AssertionError("Verification should not run in this scenario") | |
| 2905 | + | |
| 2906 | + guide_root = temp_dir / "guides" / "nginx" | |
| 2907 | + chapters = guide_root / "chapters" | |
| 2908 | + guide_root.mkdir(parents=True) | |
| 2909 | + chapters.mkdir() | |
| 2910 | + index_path = guide_root / "index.html" | |
| 2911 | + chapter_one = chapters / "01-introduction.html" | |
| 2912 | + index_path.write_text( | |
| 2913 | + "\n".join( | |
| 2914 | + [ | |
| 2915 | + "<html>", | |
| 2916 | + '<a href="chapters/01-introduction.html">Chapter 1: Introduction to NGINX Tool</a>', | |
| 2917 | + '<a href="chapters/02-installation.html">Chapter 2: Installation and Setup</a>', | |
| 2918 | + "</html>", | |
| 2919 | + ] | |
| 2920 | + ) | |
| 2921 | + + "\n" | |
| 2922 | + ) | |
| 2923 | + chapter_one.write_text("<html></html>\n") | |
| 2924 | + | |
| 2925 | + implementation_plan = temp_dir / "implementation.md" | |
| 2926 | + implementation_plan.write_text( | |
| 2927 | + "\n".join( | |
| 2928 | + [ | |
| 2929 | + "# Implementation Plan", | |
| 2930 | + "", | |
| 2931 | + "## File Changes", | |
| 2932 | + f"- `{guide_root}/`", | |
| 2933 | + f"- `{chapters}/`", | |
| 2934 | + f"- `{index_path}`", | |
| 2935 | + "", | |
| 2936 | + ] | |
| 2937 | + ) | |
| 2938 | + ) | |
| 2939 | + | |
| 2940 | + dod = create_definition_of_done("Create a multi-file nginx guide.") | |
| 2941 | + dod.implementation_plan = str(implementation_plan) | |
| 2942 | + dod.pending_items = [ | |
| 2943 | + "Creating Chapter 2: Installation and Setup", | |
| 2944 | + "Complete the requested work", | |
| 2945 | + ] | |
| 2946 | + dod.touched_files.extend([str(index_path), str(chapter_one)]) | |
| 2947 | + | |
| 2948 | + queued_messages: list[str] = [] | |
| 2949 | + context = build_context( | |
| 2950 | + temp_dir=temp_dir, | |
| 2951 | + messages=[], | |
| 2952 | + safeguards=FakeSafeguards(), | |
| 2953 | + assess_confidence=assess_confidence, | |
| 2954 | + verify_action=verify_action, | |
| 2955 | + auto_recover=False, | |
| 2956 | + ) | |
| 2957 | + context.queue_steering_message_callback = queued_messages.append | |
| 2958 | + runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir)) | |
| 2959 | + | |
| 2960 | + tool_call = ToolCall( | |
| 2961 | + id="todo-1", | |
| 2962 | + name="TodoWrite", | |
| 2963 | + arguments={ | |
| 2964 | + "todos": [ | |
| 2965 | + { | |
| 2966 | + "content": "Creating Chapter 2: Installation and Setup", | |
| 2967 | + "activeForm": "Creating Chapter 2: Installation and Setup", | |
| 2968 | + "status": "pending", | |
| 2969 | + } | |
| 2970 | + ] | |
| 2971 | + }, | |
| 2972 | + ) | |
| 2973 | + executor = FakeExecutor( | |
| 2974 | + [ | |
| 2975 | + tool_outcome( | |
| 2976 | + tool_call=tool_call, | |
| 2977 | + output="Todos updated", | |
| 2978 | + is_error=False, | |
| 2979 | + metadata={ | |
| 2980 | + "new_todos": [ | |
| 2981 | + { | |
| 2982 | + "content": "Creating Chapter 2: Installation and Setup", | |
| 2983 | + "active_form": "Creating Chapter 2: Installation and Setup", | |
| 2984 | + "status": "pending", | |
| 2985 | + } | |
| 2986 | + ] | |
| 2987 | + }, | |
| 2988 | + ) | |
| 2989 | + ] | |
| 2990 | + ) | |
| 2991 | + | |
| 2992 | + summary = TurnSummary(final_response="") | |
| 2993 | + await runner.execute_batch( | |
| 2994 | + tool_calls=[tool_call], | |
| 2995 | + tool_source="assistant", | |
| 2996 | + pending_tool_calls_seen=set(), | |
| 2997 | + emit=_noop_emit, | |
| 2998 | + summary=summary, | |
| 2999 | + dod=dod, | |
| 3000 | + executor=executor, # type: ignore[arg-type] | |
| 3001 | + on_confirmation=None, | |
| 3002 | + on_user_question=None, | |
| 3003 | + emit_confirmation=None, | |
| 3004 | + consecutive_errors=0, | |
| 3005 | + ) | |
| 3006 | + | |
| 3007 | + assert queued_messages | |
| 3008 | + message = queued_messages[-1] | |
| 3009 | + assert "Todo tracking is updated. Continue with the next pending item: `Creating Chapter 2: Installation and Setup`." in message | |
| 3010 | + assert "Resume by creating `02-installation.html` now." in message | |
| 3011 | + assert ( | |
| 3012 | + f"Prefer one `write` call for `{(chapters / '02-installation.html').resolve(strict=False)}` " | |
| 3013 | + "instead of more rereads." | |
| 3014 | + in message | |
| 3015 | + ) | |
| 3016 | + assert "Make your next response the concrete mutation tool call itself" in message | |
| 3017 | + | |
| 3018 | + | |
| 2887 | 3019 | @pytest.mark.asyncio |
| 2888 | 3020 | async def test_tool_batch_runner_todowrite_uses_observed_sibling_pattern_for_next_file( |
| 2889 | 3021 | temp_dir: Path, |