@@ -4276,6 +4276,151 @@ async def test_tool_batch_runner_todowrite_after_outputs_exist_but_links_missing |
| 4276 | assert context.workflow_mode == "verify" | 4276 | assert context.workflow_mode == "verify" |
| 4277 | | 4277 | |
| 4278 | | 4278 | |
@pytest.mark.asyncio
async def test_tool_batch_runner_todowrite_during_quality_repair_requires_mutation(
    temp_dir: Path,
) -> None:
    """Completing a todo via TodoWrite must not end an active quality repair.

    The conversation carries a "Repair focus" message for the introduction
    chapter and the context starts in ``verify`` mode. After a TodoWrite
    batch marks the only todo as completed, the test expects a steering
    message that repeats the repair focus, the workflow mode to be
    ``execute``, the batch not to be halted, and no final response emitted.
    """

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        # Reaching this stub means the runner took an unexpected path.
        raise AssertionError("Confidence scoring should not run for this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        # Reaching this stub means the runner took an unexpected path.
        raise AssertionError("Verification should not run for this scenario")

    def todo_entry(status: str) -> dict:
        # Build a fresh dict per use so no todo payload is shared between calls.
        return {
            "content": "Expand generated chapters to satisfy quality verification",
            "active_form": "Expanding generated chapters",
            "status": status,
        }

    # Lay out a minimal guide on disk: an index page plus one stub chapter.
    guide_root = temp_dir / "guides" / "nginx"
    chapters_dir = guide_root / "chapters"
    chapters_dir.mkdir(parents=True)
    index_page = guide_root / "index.html"
    intro_chapter = chapters_dir / "01-introduction.html"
    for stub_page in (index_page, intro_chapter):
        stub_page.write_text("<html></html>\n")

    # The plan lists both directories and both files under "File Changes".
    plan_lines = [
        "# Implementation Plan",
        "",
        "## File Changes",
        f"- `{guide_root}/`",
        f"- `{chapters_dir}/`",
        f"- `{index_page}`",
        f"- `{intro_chapter}`",
        "",
    ]
    plan_path = temp_dir / "implementation.md"
    plan_path.write_text("\n".join(plan_lines))

    repair_brief = (
        "Repair focus:\n"
        f"- Improve `{intro_chapter}`: thin content (409 text chars, expected at least 1758).\n"
        f"- Improve `{intro_chapter}`: insufficient structured content (6 blocks, expected at least 18).\n"
        f"- Immediate next step: edit `{intro_chapter}`.\n"
    )
    context = build_context(
        temp_dir=temp_dir,
        messages=[Message(role=Role.USER, content=repair_brief)],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
        auto_recover=False,
    )
    context.set_workflow_mode("verify")
    steering_log: list[str] = []
    response_log: list[str] = []
    context.queue_steering_message_callback = steering_log.append
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))

    dod = create_definition_of_done("Create a multi-file nginx guide.")
    dod.implementation_plan = str(plan_path)
    dod.verification_commands = [f"ls -la {guide_root}"]
    sync_todos_to_definition_of_done(
        dod,
        [todo_entry("in_progress")],
        project_root=temp_dir,
    )

    # The assistant tries to flip the todo to "completed" without editing files.
    todo_call = ToolCall(
        id="todo-quality",
        name="TodoWrite",
        arguments={"todos": [todo_entry("completed")]},
    )
    executor = FakeExecutor(
        [
            tool_outcome(
                tool_call=todo_call,
                output="Todos updated",
                is_error=False,
                metadata={"new_todos": [todo_entry("completed")]},
            )
        ]
    )

    async def emit(event: AgentEvent) -> None:
        # Only "response" events are recorded; everything else is ignored.
        if event.type != "response":
            return
        response_log.append(str(event.content))

    summary = TurnSummary(final_response="")
    batch_result = await runner.execute_batch(
        tool_calls=[todo_call],
        tool_source="assistant",
        pending_tool_calls_seen=set(),
        emit=emit,
        summary=summary,
        dod=dod,
        executor=executor,  # type: ignore[arg-type]
        on_confirmation=None,
        on_user_question=None,
        emit_confirmation=None,
        consecutive_errors=0,
    )

    assert steering_log
    nudge = steering_log[-1]
    required_fragments = (
        "verification still has an active HTML content-quality repair",
        "TodoWrite cannot satisfy that verifier",
        f"Immediate next step: edit `{intro_chapter.resolve(strict=False)}`",
        "thin content",
    )
    for fragment in required_fragments:
        assert fragment in nudge
    assert "Finish with a final response now" not in nudge
    assert context.workflow_mode == "execute"
    assert batch_result.halted is False
    assert summary.final_response == ""
    assert not response_log
| | 4422 | + |
| | 4423 | + |
| 4279 | @pytest.mark.asyncio | 4424 | @pytest.mark.asyncio |
| 4280 | async def test_tool_batch_runner_preempts_post_build_audit_after_todowrite_verify_handoff( | 4425 | async def test_tool_batch_runner_preempts_post_build_audit_after_todowrite_verify_handoff( |
| 4281 | temp_dir: Path, | 4426 | temp_dir: Path, |
@@ -6491,6 +6636,98 @@ def test_tool_batch_runner_blocked_noop_edit_after_full_build_prefers_verificati |
| 6491 | assert "replace the surrounding block" not in queued[0] | 6636 | assert "replace the surrounding block" not in queued[0] |
| 6492 | | 6637 | |
| 6493 | | 6638 | |
def test_tool_batch_runner_blocked_noop_edit_keeps_quality_repair_active_after_full_build(
    temp_dir: Path,
) -> None:
    """A blocked no-op edit must keep the content-quality repair in focus.

    Every planned file exists and is recorded as touched, and the
    conversation carries a "Repair focus" message for the second chapter.
    The test calls ``_queue_blocked_html_edit_nudge`` directly with an edit
    whose old and new strings are identical. It expects the queued nudge to
    restate the repair focus, without the TodoWrite wording and without
    telling the agent to finish.
    """

    async def assess_confidence(
        tool_name: str,
        tool_args: dict,
        context: str,
    ) -> ConfidenceAssessment:
        # Reaching this stub means the runner took an unexpected path.
        raise AssertionError("Confidence scoring should be disabled in this scenario")

    async def verify_action(
        tool_name: str,
        tool_args: dict,
        result: str,
        expected: str = "",
    ) -> ActionVerification:
        # Reaching this stub means the runner took an unexpected path.
        raise AssertionError("Verification should not run in this scenario")

    # Lay out the guide: an index page plus two stub chapters.
    guide_root = temp_dir / "guide"
    chapters_dir = guide_root / "chapters"
    chapters_dir.mkdir(parents=True)
    index_page = guide_root / "index.html"
    intro_chapter = chapters_dir / "01-introduction.html"
    install_chapter = chapters_dir / "02-installation.html"
    planned_pages = [index_page, intro_chapter, install_chapter]
    for stub_page in planned_pages:
        stub_page.write_text("<html></html>\n")

    # The plan lists exactly the three pages under "File Changes".
    plan_lines = ["# Implementation Plan", "", "## File Changes"]
    plan_lines.extend(f"- `{page}`" for page in planned_pages)
    plan_lines.append("")
    plan_path = temp_dir / "implementation.md"
    plan_path.write_text("\n".join(plan_lines))

    repair_brief = (
        "Repair focus:\n"
        f"- Improve `{install_chapter}`: thin content (504 text chars, expected at least 1758).\n"
        f"- Improve `{install_chapter}`: insufficient structured content (6 blocks, expected at least 18).\n"
        f"- Immediate next step: edit `{install_chapter}`.\n"
    )
    context = build_context(
        temp_dir=temp_dir,
        messages=[Message(role=Role.USER, content=repair_brief)],
        safeguards=FakeSafeguards(),
        assess_confidence=assess_confidence,
        verify_action=verify_action,
    )
    queued: list[str] = []
    context.queue_steering_message_callback = queued.append
    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))

    dod = create_definition_of_done("Create a multi-file guide.")
    dod.implementation_plan = str(plan_path)
    dod.touched_files.extend([str(page) for page in planned_pages])
    dod.verification_commands = [f"ls -la {guide_root}"]

    # An edit whose replacement equals its target, paired with the block notice.
    noop_edit = ToolCall(
        id="edit-1",
        name="edit",
        arguments={
            "file_path": str(install_chapter),
            "old_string": "same",
            "new_string": "same",
        },
    )
    blocked_output = (
        "[Blocked - old_string and new_string are identical - no change would occur] "
        "Suggestion: Provide different old and new strings"
    )
    runner._queue_blocked_html_edit_nudge(noop_edit, blocked_output, dod=dod)

    assert queued
    first_nudge = queued[0]
    for fragment in (
        "active content-quality repair is not complete",
        "Repair focus:",
        f"Immediate next step: edit `{install_chapter}`",
        "thin content",
    ):
        assert fragment in first_nudge
    for forbidden in (
        "TodoWrite cannot satisfy",
        "Finish with a final response now",
    ):
        assert forbidden not in first_nudge
| | 6729 | + |
| | 6730 | + |
| 6494 | async def _noop_emit(event: AgentEvent) -> None: | 6731 | async def _noop_emit(event: AgentEvent) -> None: |
| 6495 | return None | 6732 | return None |
| 6496 | | 6733 | |