`c10dbd1`

Refine repair loop steering

Authored by mfwolffe <wolffemf@dukes.jmu.edu> 1 week ago

SHA: c10dbd184e4fba56433bf8f81f7aca5dd0217ea0
Parents: b210a18
Tree: 15f7763

4 changed files

Status	File	+	-
M	`src/loader/runtime/hooks.py`	4	5
M	`src/loader/runtime/tool_batches.py`	57	0
M	`tests/test_permissions.py`	9	18
M	`tests/test_tool_batches.py`	104	0

src/loader/runtime/hooks.py (modified)

      async def pre_tool_use(self, context: HookContext) -> HookResult:
          if context.tool_call.name not in _OBSERVATION_TOOLS:
              return HookResult()
 +        if context.source == "verification":
 +            return HookResult()
          completed_scope = self._completed_artifact_scope()
          if completed_scope is not None:
                  terminal_state="blocked",
+             )
 -        if context.source == "verification":
 -            return HookResult()
+-
          late_stage = self._late_stage_missing_artifact()
          if late_stage is None:
              return HookResult()
          return False
      async def post_tool_use(self, context: HookContext) -> HookResult:
 +        if context.source == "verification":
 +            return HookResult()
          if _tool_call_is_effective_mutation(context.tool_call):
              self._reset_completed_scope_state()
              return HookResult()
          completed_scope = self._completed_artifact_scope()
          if completed_scope is None:
 -            if context.source == "verification":
 -                return HookResult()
              self._reset_completed_scope_state()
              return HookResult()

src/loader/runtime/tool_batches.py (modified)

              self.context.session.messages,
              max_items=2,
+         )
 +        edit_mismatch_target = _recent_edit_string_mismatch_target(
 +            self.context.recovery_context,
 +        )
 +        if edit_mismatch_target and _tool_call_targets_path(tool_call, edit_mismatch_target):
 +            self.context.queue_steering_message(
 +                "Reuse the earlier observation instead of repeating it. "
 +                f"The last edit on `{edit_mismatch_target}` failed because `old_string` "
 +                "did not exactly match the current file. Use the already-read contents "
 +                "as the source of truth and send one concrete mutation now: `edit` with "
 +                "an exact `old_string` copied from that file, `patch`, or `write` with "
 +                "the complete replacement content if the change rewrites most of the file. "
 +                "Do not read the same file again first."
 +            )
 +            return
          if _should_prioritize_missing_artifact(
              dod=dod,
              next_pending=next_pending,
+     )
 +def _recent_edit_string_mismatch_target(recovery_context: RecoveryContext | None) -> str:
 +    """Return the active edit target when recovery is from an old_string miss."""
++
 +    if recovery_context is None:
 +        return ""
 +    for attempt in reversed(recovery_context.attempts):
 +        if attempt.tool_name != "edit":
 +            continue
 +        if "old_string not found" not in str(attempt.error or "").lower():
 +            continue
 +        target = str(
 +            attempt.arguments.get("file_path")
 +            or attempt.arguments.get("path")
 +            or ""
 +        ).strip()
 +        if target:
 +            return target
 +    return ""
++
++
 +def _tool_call_targets_path(tool_call: ToolCall, target: str) -> bool:
 +    if not target:
 +        return False
 +    candidate = str(
 +        tool_call.arguments.get("file_path")
 +        or tool_call.arguments.get("path")
 +        or ""
 +    ).strip()
 +    if not candidate and tool_call.name == "bash":
 +        paths = _extract_bash_paths(
 +            str(tool_call.arguments.get("command") or "").strip(),
 +        )
 +        candidate = paths[0] if paths else ""
 +    if not candidate:
 +        return False
 +    try:
 +        return Path(candidate).expanduser().resolve(strict=False) == Path(
 +            target
 +        ).expanduser().resolve(strict=False)
 +    except (OSError, RuntimeError, ValueError):
 +        return candidate == target
++
++
  def _active_repair_focus_preview(repair_lines: list[str], *, max_lines: int = 4) -> str:
      """Compact repair-focus bullets for steering after no-op mutations."""

tests/test_permissions.py (modified)

  @pytest.mark.asyncio
 -async def test_late_reference_drift_hook_blocks_verification_reference_reads_after_artifacts_exist(
 +async def test_late_reference_drift_hook_allows_verification_reference_reads_after_artifacts_exist(
      temp_dir: Path,
  ) -> None:
      registry = create_default_registry(temp_dir)
+         )
+     )
 -    assert result.decision == HookDecision.DENY
 -    assert result.terminal_state == "blocked"
 -    assert result.message is not None
 -    assert "completed artifact set scope" in result.message
 +    assert result.decision == HookDecision.CONTINUE
  @pytest.mark.asyncio
  @pytest.mark.asyncio
 -async def test_late_reference_drift_hook_blocks_excessive_post_build_self_audits_during_verification(
 +async def test_late_reference_drift_hook_allows_post_build_self_audits_during_verification(
      temp_dir: Path,
  ) -> None:
      registry = create_default_registry(temp_dir)
          assert result.decision == HookDecision.CONTINUE
          await hook.post_tool_use(context)
 -    blocked = await hook.pre_tool_use(make_context(5))
 +    result = await hook.pre_tool_use(make_context(5))
 -    assert blocked.decision == HookDecision.DENY
 -    assert blocked.terminal_state == "blocked"
 -    assert blocked.message is not None
 -    assert "post-build audit loop" in blocked.message
 +    assert result.decision == HookDecision.CONTINUE
  @pytest.mark.asyncio
              tool=registry.get("bash"),
              registry=registry,
              permission_policy=policy,
 -            source="verification",
 +            source="native",
+         )
+     )
  @pytest.mark.asyncio
 -async def test_late_reference_drift_hook_blocks_relative_bash_post_build_audit_loop(
 +async def test_late_reference_drift_hook_allows_relative_bash_post_build_audit_loop_during_verification(
      temp_dir: Path,
  ) -> None:
      registry = create_default_registry(temp_dir)
          assert result.decision == HookDecision.CONTINUE
          await hook.post_tool_use(context)
 -    blocked = await hook.pre_tool_use(make_context(5))
 +    result = await hook.pre_tool_use(make_context(5))
 -    assert blocked.decision == HookDecision.DENY
 -    assert blocked.terminal_state == "blocked"
 -    assert blocked.message is not None
 -    assert "post-build audit loop" in blocked.message
 +    assert result.decision == HookDecision.CONTINUE
  @pytest.mark.asyncio

tests/test_tool_batches.py (modified)

      assert ephemeral_messages == []
 +@pytest.mark.asyncio
 +async def test_tool_batch_runner_duplicate_read_after_edit_mismatch_steers_to_mutation(
 +    temp_dir: Path,
 +) -> None:
 +    async def assess_confidence(
 +        tool_name: str,
 +        tool_args: dict,
 +        context: str,
 +    ) -> ConfidenceAssessment:
 +        raise AssertionError("Confidence scoring should not run for this scenario")
++
 +    async def verify_action(
 +        tool_name: str,
 +        tool_args: dict,
 +        result: str,
 +        expected: str = "",
 +    ) -> ActionVerification:
 +        raise AssertionError("Verification should not run for this scenario")
++
 +    target = temp_dir / "guide" / "chapters" / "02-installation.html"
 +    target.parent.mkdir(parents=True)
 +    target.write_text(
 +        "<h1>Chapter 2: Installation Guide</h1>\n"
 +        "<p>This chapter is still too thin.</p>\n"
 +    )
 +    recovery_context = RecoveryContext(
 +        original_tool="edit",
 +        original_args={
 +            "file_path": str(target),
 +            "old_string": "<h1>Installation</h1>",
 +            "new_string": "<h1>Installation</h1><p>Expanded.</p>",
 +        },
 +        max_retries=2,
 +    )
 +    recovery_context.add_attempt(
 +        "edit",
 +        {
 +            "file_path": str(target),
 +            "old_string": "<h1>Installation</h1>",
 +            "new_string": "<h1>Installation</h1><p>Expanded.</p>",
 +        },
 +        "old_string not found in file. Make sure it matches exactly.",
 +    )
 +    context = build_context(
 +        temp_dir=temp_dir,
 +        messages=[],
 +        safeguards=FakeSafeguards(),
 +        assess_confidence=assess_confidence,
 +        verify_action=verify_action,
 +        recovery_context=recovery_context,
 +        auto_recover=False,
 +    )
 +    persistent_messages: list[str] = []
 +    context.queue_steering_message_callback = persistent_messages.append
 +    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))
 +    tool_call = ToolCall(
 +        id="read-dup-after-edit-miss",
 +        name="read",
 +        arguments={"file_path": str(target)},
 +    )
 +    duplicate_message = (
 +        "[Skipped - duplicate action: Already read "
 +        f"{target} recently without any intervening changes; "
 +        "reuse the earlier read result instead of rereading]"
 +    )
 +    executor = FakeExecutor(
 +        [
 +            ToolExecutionOutcome(
 +                tool_call=tool_call,
 +                state=ToolExecutionState.DUPLICATE,
 +                message=Message.tool_result_message(
 +                    tool_call_id=tool_call.id,
 +                    display_content=duplicate_message,
 +                    result_content=duplicate_message,
 +                ),
 +                event_content=duplicate_message,
 +                is_error=False,
 +                result_output=duplicate_message,
 +            )
 +        ]
 +    )
 +    dod = create_definition_of_done("Expand thin generated guide chapters.")
++
 +    await runner.execute_batch(
 +        tool_calls=[tool_call],
 +        tool_source="assistant",
 +        pending_tool_calls_seen=set(),
 +        emit=_noop_emit,
 +        summary=TurnSummary(final_response=""),
 +        dod=dod,
 +        executor=executor,  # type: ignore[arg-type]
 +        on_confirmation=None,
 +        on_user_question=None,
 +        emit_confirmation=None,
 +        consecutive_errors=0,
 +    )
++
 +    assert len(persistent_messages) == 1
 +    assert "last edit" in persistent_messages[0]
 +    assert "`old_string` did not exactly match" in persistent_messages[0]
 +    assert "send one concrete mutation now" in persistent_messages[0]
 +    assert "`write` with the complete replacement content" in persistent_messages[0]
++
++
  @pytest.mark.asyncio
  async def test_tool_batch_runner_todo_write_does_not_regress_completed_file_todo(
      temp_dir: Path,