@@ -1830,7 +1830,7 @@ async def test_tool_batch_runner_duplicate_read_after_plan_complete_pushes_verif |
| 1830 | 1830 | ) |
| 1831 | 1831 | |
| 1832 | 1832 | assert len(persistent_messages) == 1 |
| 1833 | | - assert "All explicitly planned artifacts already exist." in persistent_messages[0] |
| 1833 | + assert "All explicitly planned artifacts already exist on disk." in persistent_messages[0] |
| 1834 | 1834 | assert ( |
| 1835 | 1835 | "Move to verification or final confirmation using the files already on disk." |
| 1836 | 1836 | in persistent_messages[0] |
@@ -1951,7 +1951,7 @@ async def test_tool_batch_runner_duplicate_read_after_plan_complete_ignores_stal |
| 1951 | 1951 | ) |
| 1952 | 1952 | |
| 1953 | 1953 | assert len(persistent_messages) == 1 |
| 1954 | | - assert "All explicitly planned artifacts already exist." in persistent_messages[0] |
| 1954 | + assert "All explicitly planned artifacts already exist on disk." in persistent_messages[0] |
| 1955 | 1955 | assert ( |
| 1956 | 1956 | "Move to verification or final confirmation using the files already on disk." |
| 1957 | 1957 | in persistent_messages[0] |
@@ -3148,7 +3148,7 @@ async def test_tool_batch_runner_hands_off_to_verification_once_planned_artifact |
| 3148 | 3148 | ) |
| 3149 | 3149 | |
| 3150 | 3150 | assert any( |
| 3151 | | - "All explicitly planned artifacts now exist." in message |
| 3151 | + "All explicitly planned artifacts now exist on disk." in message |
| 3152 | 3152 | for message in persistent_messages |
| 3153 | 3153 | ) |
| 3154 | 3154 | assert any( |
@@ -3395,7 +3395,7 @@ async def test_tool_batch_runner_large_plan_does_not_claim_completion_early( |
| 3395 | 3395 | for message in ephemeral_messages |
| 3396 | 3396 | ) |
| 3397 | 3397 | assert not any( |
| 3398 | | - "All explicitly planned artifacts now exist." in message |
| 3398 | + "All explicitly planned artifacts now exist on disk." in message |
| 3399 | 3399 | for message in ephemeral_messages |
| 3400 | 3400 | ) |
| 3401 | 3401 | |
@@ -3802,13 +3802,149 @@ async def test_tool_batch_runner_todowrite_after_artifacts_exist_pushes_verifica |
| 3802 | 3802 | |
| 3803 | 3803 | assert queued_messages |
| 3804 | 3804 | message = queued_messages[-1] |
| 3805 | | - assert "Todo tracking is updated. All explicitly planned artifacts now exist." in message |
| 3805 | + assert "Todo tracking is updated. All explicitly planned artifacts now exist on disk." in message |
| 3806 | 3806 | assert "Verify all guide files are linked and complete" in message |
| 3807 | 3807 | assert "Move to verification once no specific mismatch remains." in message |
| 3808 | 3808 | assert "reopen reference materials" in message |
| 3809 | 3809 | assert "Fortran guide structure" not in message |
| 3810 | 3810 | |
| 3811 | 3811 | |
| 3812 | +@pytest.mark.asyncio |
| 3813 | +async def test_tool_batch_runner_todowrite_after_outputs_exist_but_links_missing_still_handoffs_to_verify( |
| 3814 | + temp_dir: Path, |
| 3815 | +) -> None: |
| 3816 | + async def assess_confidence( |
| 3817 | + tool_name: str, |
| 3818 | + tool_args: dict, |
| 3819 | + context: str, |
| 3820 | + ) -> ConfidenceAssessment: |
| 3821 | + raise AssertionError("Confidence scoring should not run for this scenario") |
| 3822 | + |
| 3823 | + async def verify_action( |
| 3824 | + tool_name: str, |
| 3825 | + tool_args: dict, |
| 3826 | + result: str, |
| 3827 | + expected: str = "", |
| 3828 | + ) -> ActionVerification: |
| 3829 | + raise AssertionError("Verification should not run for this scenario") |
| 3830 | + |
| 3831 | + guide_root = temp_dir / "guides" / "nginx" |
| 3832 | + chapters = guide_root / "chapters" |
| 3833 | + guide_root.mkdir(parents=True) |
| 3834 | + chapters.mkdir() |
| 3835 | + index_path = guide_root / "index.html" |
| 3836 | + chapter_one = chapters / "01-introduction.html" |
| 3837 | + chapter_two = chapters / "02-installation.html" |
| 3838 | + index_path.write_text( |
| 3839 | + "\n".join( |
| 3840 | + [ |
| 3841 | + '<a href="chapters/01-introduction.html">Intro</a>', |
| 3842 | + '<a href="chapters/02-installation.html">Install</a>', |
| 3843 | + '<a href="../index.html">Back</a>', |
| 3844 | + "", |
| 3845 | + ] |
| 3846 | + ) |
| 3847 | + ) |
| 3848 | + chapter_one.write_text("<html></html>\n") |
| 3849 | + chapter_two.write_text("<html></html>\n") |
| 3850 | + |
| 3851 | + implementation_plan = temp_dir / "implementation.md" |
| 3852 | + implementation_plan.write_text( |
| 3853 | + "\n".join( |
| 3854 | + [ |
| 3855 | + "# Implementation Plan", |
| 3856 | + "", |
| 3857 | + "## File Changes", |
| 3858 | + f"- `{guide_root}/`", |
| 3859 | + f"- `{chapters}/`", |
| 3860 | + f"- `{index_path}`", |
| 3861 | + f"- `{chapter_one}`", |
| 3862 | + f"- `{chapter_two}`", |
| 3863 | + "", |
| 3864 | + ] |
| 3865 | + ) |
| 3866 | + ) |
| 3867 | + |
| 3868 | + context = build_context( |
| 3869 | + temp_dir=temp_dir, |
| 3870 | + messages=[], |
| 3871 | + safeguards=FakeSafeguards(), |
| 3872 | + assess_confidence=assess_confidence, |
| 3873 | + verify_action=verify_action, |
| 3874 | + auto_recover=False, |
| 3875 | + ) |
| 3876 | + queued_messages: list[str] = [] |
| 3877 | + context.queue_steering_message_callback = queued_messages.append |
| 3878 | + runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir)) |
| 3879 | + dod = create_definition_of_done("Create a multi-file nginx guide.") |
| 3880 | + dod.implementation_plan = str(implementation_plan) |
| 3881 | + dod.verification_commands = [f"ls -la {guide_root}"] |
| 3882 | + sync_todos_to_definition_of_done( |
| 3883 | + dod, |
| 3884 | + [ |
| 3885 | + { |
| 3886 | + "content": "Create chapter files following the established pattern", |
| 3887 | + "active_form": "Creating chapter files", |
| 3888 | + "status": "in_progress", |
| 3889 | + } |
| 3890 | + ], |
| 3891 | + project_root=temp_dir, |
| 3892 | + ) |
| 3893 | + |
| 3894 | + tool_call = ToolCall( |
| 3895 | + id="todo-post-build", |
| 3896 | + name="TodoWrite", |
| 3897 | + arguments={ |
| 3898 | + "todos": [ |
| 3899 | + { |
| 3900 | + "content": "Create chapter files following the established pattern", |
| 3901 | + "active_form": "Creating chapter files", |
| 3902 | + "status": "in_progress", |
| 3903 | + } |
| 3904 | + ] |
| 3905 | + }, |
| 3906 | + ) |
| 3907 | + executor = FakeExecutor( |
| 3908 | + [ |
| 3909 | + tool_outcome( |
| 3910 | + tool_call=tool_call, |
| 3911 | + output="Todos updated", |
| 3912 | + is_error=False, |
| 3913 | + metadata={ |
| 3914 | + "new_todos": [ |
| 3915 | + { |
| 3916 | + "content": "Create chapter files following the established pattern", |
| 3917 | + "active_form": "Creating chapter files", |
| 3918 | + "status": "in_progress", |
| 3919 | + } |
| 3920 | + ] |
| 3921 | + }, |
| 3922 | + ) |
| 3923 | + ] |
| 3924 | + ) |
| 3925 | + |
| 3926 | + summary = TurnSummary(final_response="") |
| 3927 | + await runner.execute_batch( |
| 3928 | + tool_calls=[tool_call], |
| 3929 | + tool_source="assistant", |
| 3930 | + pending_tool_calls_seen=set(), |
| 3931 | + emit=_noop_emit, |
| 3932 | + summary=summary, |
| 3933 | + dod=dod, |
| 3934 | + executor=executor, # type: ignore[arg-type] |
| 3935 | + on_confirmation=None, |
| 3936 | + on_user_question=None, |
| 3937 | + emit_confirmation=None, |
| 3938 | + consecutive_errors=0, |
| 3939 | + ) |
| 3940 | + |
| 3941 | + assert queued_messages |
| 3942 | + message = queued_messages[-1] |
| 3943 | + assert "Todo tracking is updated. All explicitly planned artifacts now exist on disk." in message |
| 3944 | + assert "Repair or verify the current files instead of expanding the artifact set." in message |
| 3945 | + assert "Move to verification or final confirmation using the files already on disk." in message |
| 3946 | + |
| 3947 | + |
| 3812 | 3948 | @pytest.mark.asyncio |
| 3813 | 3949 | async def test_tool_batch_runner_todowrite_with_existing_output_roots_requeues_next_mutation( |
| 3814 | 3950 | temp_dir: Path, |
@@ -5849,6 +5985,7 @@ def test_tool_batch_runner_blocked_html_declared_file_creation_nudge_points_to_r |
| 5849 | 5985 | queued: list[str] = [] |
| 5850 | 5986 | context.queue_steering_message_callback = queued.append |
| 5851 | 5987 | runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir)) |
| 5988 | + dod = create_definition_of_done("Create a guide.") |
| 5852 | 5989 | |
| 5853 | 5990 | target = temp_dir / "guide" / "chapters" / "troubleshooting.html" |
| 5854 | 5991 | runner._queue_blocked_html_declared_file_creation_nudge( |
@@ -5865,6 +6002,7 @@ def test_tool_batch_runner_blocked_html_declared_file_creation_nudge_points_to_r |
| 5865 | 6002 | "Already-declared local targets include: chapters/advanced-topics.html, " |
| 5866 | 6003 | "chapters/basic-usage.html, chapters/configuration.html" |
| 5867 | 6004 | ), |
| 6005 | + dod=dod, |
| 5868 | 6006 | ) |
| 5869 | 6007 | |
| 5870 | 6008 | assert queued |
@@ -5874,6 +6012,181 @@ def test_tool_batch_runner_blocked_html_declared_file_creation_nudge_points_to_r |
| 5874 | 6012 | assert "retry the file creation" in queued[0] |
| 5875 | 6013 | |
| 5876 | 6014 | |
| 6015 | +def test_tool_batch_runner_blocked_html_declared_file_creation_after_outputs_exist_prefers_verify( |
| 6016 | + temp_dir: Path, |
| 6017 | +) -> None: |
| 6018 | + async def assess_confidence( |
| 6019 | + tool_name: str, |
| 6020 | + tool_args: dict, |
| 6021 | + context: str, |
| 6022 | + ) -> ConfidenceAssessment: |
| 6023 | + raise AssertionError("Confidence scoring should not run in this scenario") |
| 6024 | + |
| 6025 | + async def verify_action( |
| 6026 | + tool_name: str, |
| 6027 | + tool_args: dict, |
| 6028 | + result: str, |
| 6029 | + expected: str = "", |
| 6030 | + ) -> ActionVerification: |
| 6031 | + raise AssertionError("Verification should not run in this scenario") |
| 6032 | + |
| 6033 | + guide = temp_dir / "guide" |
| 6034 | + chapters = guide / "chapters" |
| 6035 | + guide.mkdir() |
| 6036 | + chapters.mkdir() |
| 6037 | + index = guide / "index.html" |
| 6038 | + index.write_text( |
| 6039 | + "\n".join( |
| 6040 | + [ |
| 6041 | + '<a href="chapters/01-introduction.html">Intro</a>', |
| 6042 | + '<a href="chapters/02-installation.html">Install</a>', |
| 6043 | + '<a href="../index.html">Back</a>', |
| 6044 | + "", |
| 6045 | + ] |
| 6046 | + ) |
| 6047 | + ) |
| 6048 | + (chapters / "01-introduction.html").write_text("<html></html>\n") |
| 6049 | + (chapters / "02-installation.html").write_text("<html></html>\n") |
| 6050 | + |
| 6051 | + implementation_plan = temp_dir / "implementation.md" |
| 6052 | + implementation_plan.write_text( |
| 6053 | + "\n".join( |
| 6054 | + [ |
| 6055 | + "# Implementation Plan", |
| 6056 | + "", |
| 6057 | + "## File Changes", |
| 6058 | + f"- `{index}`", |
| 6059 | + f"- `{chapters / '01-introduction.html'}`", |
| 6060 | + f"- `{chapters / '02-installation.html'}`", |
| 6061 | + "", |
| 6062 | + ] |
| 6063 | + ) |
| 6064 | + ) |
| 6065 | + |
| 6066 | + context = build_context( |
| 6067 | + temp_dir=temp_dir, |
| 6068 | + messages=[], |
| 6069 | + safeguards=FakeSafeguards(), |
| 6070 | + assess_confidence=assess_confidence, |
| 6071 | + verify_action=verify_action, |
| 6072 | + ) |
| 6073 | + queued: list[str] = [] |
| 6074 | + context.queue_steering_message_callback = queued.append |
| 6075 | + runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir)) |
| 6076 | + dod = create_definition_of_done("Create a guide.") |
| 6077 | + dod.implementation_plan = str(implementation_plan) |
| 6078 | + dod.verification_commands = [f"ls -la {guide}"] |
| 6079 | + dod.touched_files = [str(index), str(chapters / "01-introduction.html"), str(chapters / "02-installation.html")] |
| 6080 | + |
| 6081 | + target = guide / "chapters" / "08-advanced-configuration.html" |
| 6082 | + runner._queue_blocked_html_declared_file_creation_nudge( |
| 6083 | + ToolCall( |
| 6084 | + id="write-extra", |
| 6085 | + name="write", |
| 6086 | + arguments={"file_path": str(target)}, |
| 6087 | + ), |
| 6088 | + ( |
| 6089 | + "[Blocked - HTML file creation falls outside the current declared artifact set] " |
| 6090 | + "Suggestion: Keep new non-root HTML files within the root-declared artifact set and " |
| 6091 | + f"update the guide root `{index.resolve(strict=False)}` before creating undeclared sibling pages, " |
| 6092 | + "for example: chapters/08-advanced-configuration.html." |
| 6093 | + ), |
| 6094 | + dod=dod, |
| 6095 | + ) |
| 6096 | + |
| 6097 | + assert queued |
| 6098 | + assert "All explicitly planned artifacts already exist on disk." in queued[0] |
| 6099 | + assert "Do not expand the output set with `chapters/08-advanced-configuration.html`." in queued[0] |
| 6100 | + assert "Move to verification or final confirmation using the files already on disk." in queued[0] |
| 6101 | + assert "update the guide root" not in queued[0] |
| 6102 | + |
| 6103 | + |
| 6104 | +def test_tool_batch_runner_blocked_html_missing_target_after_outputs_exist_prefers_verify( |
| 6105 | + temp_dir: Path, |
| 6106 | +) -> None: |
| 6107 | + async def assess_confidence( |
| 6108 | + tool_name: str, |
| 6109 | + tool_args: dict, |
| 6110 | + context: str, |
| 6111 | + ) -> ConfidenceAssessment: |
| 6112 | + raise AssertionError("Confidence scoring should not run in this scenario") |
| 6113 | + |
| 6114 | + async def verify_action( |
| 6115 | + tool_name: str, |
| 6116 | + tool_args: dict, |
| 6117 | + result: str, |
| 6118 | + expected: str = "", |
| 6119 | + ) -> ActionVerification: |
| 6120 | + raise AssertionError("Verification should not run in this scenario") |
| 6121 | + |
| 6122 | + guide = temp_dir / "guide" |
| 6123 | + chapters = guide / "chapters" |
| 6124 | + guide.mkdir() |
| 6125 | + chapters.mkdir() |
| 6126 | + index = guide / "index.html" |
| 6127 | + index.write_text( |
| 6128 | + "\n".join( |
| 6129 | + [ |
| 6130 | + '<a href="chapters/01-introduction.html">Intro</a>', |
| 6131 | + '<a href="chapters/02-installation.html">Install</a>', |
| 6132 | + '<a href="../index.html">Back</a>', |
| 6133 | + "", |
| 6134 | + ] |
| 6135 | + ) |
| 6136 | + ) |
| 6137 | + (chapters / "01-introduction.html").write_text("<html></html>\n") |
| 6138 | + (chapters / "02-installation.html").write_text("<html></html>\n") |
| 6139 | + |
| 6140 | + implementation_plan = temp_dir / "implementation.md" |
| 6141 | + implementation_plan.write_text( |
| 6142 | + "\n".join( |
| 6143 | + [ |
| 6144 | + "# Implementation Plan", |
| 6145 | + "", |
| 6146 | + "## File Changes", |
| 6147 | + f"- `{index}`", |
| 6148 | + f"- `{chapters / '01-introduction.html'}`", |
| 6149 | + f"- `{chapters / '02-installation.html'}`", |
| 6150 | + "", |
| 6151 | + ] |
| 6152 | + ) |
| 6153 | + ) |
| 6154 | + |
| 6155 | + context = build_context( |
| 6156 | + temp_dir=temp_dir, |
| 6157 | + messages=[], |
| 6158 | + safeguards=FakeSafeguards(), |
| 6159 | + assess_confidence=assess_confidence, |
| 6160 | + verify_action=verify_action, |
| 6161 | + ) |
| 6162 | + queued: list[str] = [] |
| 6163 | + context.queue_steering_message_callback = queued.append |
| 6164 | + runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir)) |
| 6165 | + dod = create_definition_of_done("Create a guide.") |
| 6166 | + dod.implementation_plan = str(implementation_plan) |
| 6167 | + dod.verification_commands = [f"ls -la {guide}"] |
| 6168 | + dod.touched_files = [str(index), str(chapters / "01-introduction.html"), str(chapters / "02-installation.html")] |
| 6169 | + |
| 6170 | + runner._queue_blocked_html_missing_target_nudge( |
| 6171 | + ToolCall( |
| 6172 | + id="edit-root", |
| 6173 | + name="edit", |
| 6174 | + arguments={"file_path": str(index)}, |
| 6175 | + ), |
| 6176 | + ( |
| 6177 | + "[Blocked - Edited HTML links point to files that do not exist] " |
| 6178 | + "Suggestion: Use only existing local targets for href values and avoid introducing missing links, " |
| 6179 | + "for example fix: chapters/08-advanced-configuration.html" |
| 6180 | + ), |
| 6181 | + dod=dod, |
| 6182 | + ) |
| 6183 | + |
| 6184 | + assert queued |
| 6185 | + assert "All explicitly planned artifacts already exist on disk." in queued[0] |
| 6186 | + assert "Do not introduce new local-link targets beyond the current output set." in queued[0] |
| 6187 | + assert "Repair the existing generated files instead of expanding the guide." in queued[0] |
| 6188 | + |
| 6189 | + |
| 5877 | 6190 | @pytest.mark.asyncio |
| 5878 | 6191 | async def test_tool_batch_runner_blocked_empty_file_path_nudges_concrete_next_artifact( |
| 5879 | 6192 | temp_dir: Path, |