`37acacf`

Prefer verification after full build

Authored by mfwolffe <wolffemf@dukes.jmu.edu> 1 week ago

SHA: 37acacf2230e2fd35289c3b4815b9e2c18a0d3f7
Parents: b3febff
Tree: 10ba7e1

4 changed files

Status	File	+	-
M	`src/loader/runtime/dod.py`	25	5
M	`src/loader/runtime/tool_batches.py`	80	15
M	`tests/test_dod.py`	2	0
M	`tests/test_tool_batches.py`	318	5

src/loader/runtime/dod.py (modified)

      *,
      project_root: Path,
      max_paths: int | None = None,
 +) -> bool:
 +    if not all_planned_artifact_outputs_exist(
 +        dod,
 +        project_root=project_root,
 +        max_paths=max_paths,
 +    ):
 +        return False
 +    targets = collect_planned_artifact_targets(
 +        dod,
 +        project_root=project_root,
 +        max_paths=max_paths,
 +    )
 +    return not _planned_html_outputs_have_missing_local_links(
 +        dod,
 +        project_root=project_root,
 +        targets=targets,
 +    )
++
++
 +def all_planned_artifact_outputs_exist(
 +    dod: DefinitionOfDone,
 +    *,
 +    project_root: Path,
 +    max_paths: int | None = None,
  ) -> bool:
      targets = collect_planned_artifact_targets(
          dod,
          project_root=project_root,
      ):
          return False
 -    return not _planned_html_outputs_have_missing_local_links(
 -        dod,
 -        project_root=project_root,
 -        targets=targets,
 -    )
 +    return True
  def planned_artifact_target_satisfied(

src/loader/runtime/tool_batches.py (modified)

  from .dod import (
      DefinitionOfDone,
      DefinitionOfDoneStore,
 +    all_planned_artifact_outputs_exist,
      all_planned_artifacts_exist,
      begin_new_verification_attempt,
      collect_planned_artifact_targets,
                  self._queue_blocked_html_declared_file_creation_nudge(
                      tool_call,
                      outcome.event_content,
 +                    dod=dod,
+                 )
                  self._queue_blocked_html_declared_target_nudge(
                      tool_call,
                      outcome.event_content,
+                 )
 +                self._queue_blocked_html_missing_target_nudge(
 +                    tool_call,
 +                    outcome.event_content,
 +                    dod=dod,
 +                )
                  self._queue_blocked_active_repair_nudge(outcome.event_content)
                  self._queue_blocked_active_repair_mutation_nudge(outcome.event_content)
                  self._queue_blocked_completed_artifact_scope_nudge(
+             )
              return
 -        if all_planned_artifacts_exist(dod, project_root=self.context.project_root):
 +        if all_planned_artifact_outputs_exist(dod, project_root=self.context.project_root):
              verification_commands = dod.verification_commands or derive_verification_commands(
                  dod,
                  project_root=self.context.project_root,
+             )
              self.context.queue_steering_message(
                  "Reuse the earlier observation instead of repeating it. "
 -                "All explicitly planned artifacts already exist. "
 +                "All explicitly planned artifacts already exist on disk. "
                  "Use the current task artifacts as the source of truth and do not reopen "
                  "reference materials unless one specific gap is still unknown. "
 +                "If anything is still wrong, repair the current files instead of expanding the artifact set. "
                  + verification_suffix
+             )
              return
              return
          if extract_active_repair_context(self.context.session.messages) is not None:
              return
 -        if not all_planned_artifacts_exist(dod, project_root=self.context.project_root):
 +        if not all_planned_artifact_outputs_exist(dod, project_root=self.context.project_root):
              return
          observed_paths = _extract_observation_paths(tool_call)
          self,
          tool_call: ToolCall,
          event_content: str,
 +        *,
 +        dod: DefinitionOfDone,
      ) -> None:
          """Steer blocked undeclared HTML file creation back through the root guide."""
          except ValueError:
              relative_target = target_path.name
 +        if all_planned_artifact_outputs_exist(dod, project_root=self.context.project_root):
 +            verification_commands = dod.verification_commands or derive_verification_commands(
 +                dod,
 +                project_root=self.context.project_root,
 +                task_statement=getattr(self.context.session, "current_task", "") or "",
 +                supplement_existing=True,
 +            )
 +            verification_suffix = (
 +                " Move to verification or final confirmation using the files already on disk."
 +                if verification_commands
 +                else " Finish the task using the files already on disk."
 +            )
 +            self.context.queue_steering_message(
 +                "All explicitly planned artifacts already exist on disk. "
 +                f"Do not expand the output set with `{relative_target}`. "
 +                "Use the current generated files as the source of truth and repair or verify them instead."
 +                + verification_suffix
 +            )
 +            return
++
          guidance = (
              "That new HTML file is outside the current root-declared artifact set. "
              f"Before creating `{relative_target}`, update `{root_index}` so the guide root "
+         )
          self.context.queue_steering_message(guidance)
 +    def _queue_blocked_html_missing_target_nudge(
 +        self,
 +        tool_call: ToolCall,
 +        event_content: str,
 +        *,
 +        dod: DefinitionOfDone,
 +    ) -> None:
 +        """Turn post-build missing-link expansions into verify/repair handoffs."""
++
 +        if tool_call.name not in {"write", "edit", "patch"}:
 +            return
 +        if "Edited HTML links point to files that do not exist" not in event_content:
 +            return
 +        if not all_planned_artifact_outputs_exist(dod, project_root=self.context.project_root):
 +            return
++
 +        verification_commands = dod.verification_commands or derive_verification_commands(
 +            dod,
 +            project_root=self.context.project_root,
 +            task_statement=getattr(self.context.session, "current_task", "") or "",
 +            supplement_existing=True,
 +        )
 +        verification_suffix = (
 +            " Move to verification or final confirmation using the files already on disk."
 +            if verification_commands
 +            else " Finish the task using the files already on disk."
 +        )
 +        self.context.queue_steering_message(
 +            "All explicitly planned artifacts already exist on disk. "
 +            "Do not introduce new local-link targets beyond the current output set. "
 +            "Repair the existing generated files instead of expanding the guide."
 +            + verification_suffix
 +        )
++
      def _queue_blocked_invalid_mutation_nudge(
          self,
          tool_call: ToolCall,
      ) -> None:
          if not is_state_mutating_tool_call(tool_call):
              return
 -        if not all_planned_artifacts_exist(dod, project_root=self.context.project_root):
 +        if not all_planned_artifact_outputs_exist(dod, project_root=self.context.project_root):
              return
          next_pending = preferred_pending_todo_item(
                  else " Avoid another full reread unless one specific inconsistency is still unknown."
+             )
              self.context.queue_steering_message(
 -                "All explicitly planned artifacts now exist. "
 +                "All explicitly planned artifacts now exist on disk. "
                  f"Continue with the next pending item: `{next_pending}`. "
                  "Use the files already on disk as the source of truth instead of restarting "
                  "discovery or inventing alternate filenames."
          if verification_commands:
              self.context.queue_steering_message(
 -                "All explicitly planned artifacts now exist. "
 +                "All explicitly planned artifacts now exist on disk. "
                  "Do not expand the artifact set or restart discovery unless a specific gap is "
                  "still known. Move to verification or final confirmation using the files that "
                  "already exist."
              next_pending=next_pending,
              project_root=self.context.project_root,
+         )
 +        outputs_exist = all_planned_artifact_outputs_exist(
 +            dod,
 +            project_root=self.context.project_root,
 +        )
          if missing_artifact is None:
 -            if next_pending and _todo_is_mutation_step(next_pending):
 +            if next_pending and _todo_is_mutation_step(next_pending) and not outputs_exist:
                  pending_target = infer_pending_todo_output_target(
                      dod,
                      next_pending,
              if (
                  next_pending
                  and _todo_is_consistency_review_step(next_pending)
 -                and not all_planned_artifacts_exist(
 -                    dod,
 -                    project_root=self.context.project_root,
 -                )
 +                and not outputs_exist
              ):
                  self.context.queue_ephemeral_steering_message(
                      "Todo tracking is updated. Continue with the next pending item: "
+                 )
                  return
 -            if not all_planned_artifacts_exist(dod, project_root=self.context.project_root):
 +            if not outputs_exist:
                  return
              verification_commands = dod.verification_commands or derive_verification_commands(
                      else " Finish the targeted consistency pass without reopening reference materials."
+                 )
                  self.context.queue_ephemeral_steering_message(
 -                    "Todo tracking is updated. All explicitly planned artifacts now exist. "
 +                    "Todo tracking is updated. All explicitly planned artifacts now exist on disk. "
                      f"Continue with the next pending item: `{next_pending}`. "
                      "Use the current output files as the source of truth, and do not restart "
                      "early discovery or reopen reference materials."
                  else " Finish the task using the files already on disk."
+             )
              self.context.queue_ephemeral_steering_message(
 -                "Todo tracking is updated. All explicitly planned artifacts now exist. "
 +                "Todo tracking is updated. All explicitly planned artifacts now exist on disk. "
                  "Do not restart discovery, reopen reference materials, or spend another turn "
 -                "on TodoWrite alone."
 +                "on TodoWrite alone. Repair or verify the current files instead of expanding the artifact set."
                  + verification_suffix
+             )
              return

tests/test_dod.py (modified)

  from loader.runtime.dod import (
      DefinitionOfDoneStore,
      VerificationEvidence,
 +    all_planned_artifact_outputs_exist,
      all_planned_artifacts_exist,
      begin_new_verification_attempt,
      build_verification_summary,
      dod.completed_items = ["Create chapter files with appropriate content"]
      assert all_planned_artifacts_exist(dod, project_root=tmp_path) is False
 +    assert all_planned_artifact_outputs_exist(dod, project_root=tmp_path) is True
      (chapters / "02-setup.html").write_text("<h1>Setup</h1>\n")

tests/test_tool_batches.py (modified)

+     )
      assert len(persistent_messages) == 1
 -    assert "All explicitly planned artifacts already exist." in persistent_messages[0]
 +    assert "All explicitly planned artifacts already exist on disk." in persistent_messages[0]
      assert (
          "Move to verification or final confirmation using the files already on disk."
          in persistent_messages[0]
+     )
      assert len(persistent_messages) == 1
 -    assert "All explicitly planned artifacts already exist." in persistent_messages[0]
 +    assert "All explicitly planned artifacts already exist on disk." in persistent_messages[0]
      assert (
          "Move to verification or final confirmation using the files already on disk."
          in persistent_messages[0]
+     )
      assert any(
 -        "All explicitly planned artifacts now exist." in message
 +        "All explicitly planned artifacts now exist on disk." in message
          for message in persistent_messages
+     )
      assert any(
          for message in ephemeral_messages
+     )
      assert not any(
 -        "All explicitly planned artifacts now exist." in message
 +        "All explicitly planned artifacts now exist on disk." in message
          for message in ephemeral_messages
+     )
      assert queued_messages
      message = queued_messages[-1]
 -    assert "Todo tracking is updated. All explicitly planned artifacts now exist." in message
 +    assert "Todo tracking is updated. All explicitly planned artifacts now exist on disk." in message
      assert "Verify all guide files are linked and complete" in message
      assert "Move to verification once no specific mismatch remains." in message
      assert "reopen reference materials" in message
      assert "Fortran guide structure" not in message
 +@pytest.mark.asyncio
 +async def test_tool_batch_runner_todowrite_after_outputs_exist_but_links_missing_still_handoffs_to_verify(
 +    temp_dir: Path,
 +) -> None:
 +    async def assess_confidence(
 +        tool_name: str,
 +        tool_args: dict,
 +        context: str,
 +    ) -> ConfidenceAssessment:
 +        raise AssertionError("Confidence scoring should not run for this scenario")
++
 +    async def verify_action(
 +        tool_name: str,
 +        tool_args: dict,
 +        result: str,
 +        expected: str = "",
 +    ) -> ActionVerification:
 +        raise AssertionError("Verification should not run for this scenario")
++
 +    guide_root = temp_dir / "guides" / "nginx"
 +    chapters = guide_root / "chapters"
 +    guide_root.mkdir(parents=True)
 +    chapters.mkdir()
 +    index_path = guide_root / "index.html"
 +    chapter_one = chapters / "01-introduction.html"
 +    chapter_two = chapters / "02-installation.html"
 +    index_path.write_text(
 +        "\n".join(
 +            [
 +                '<a href="chapters/01-introduction.html">Intro</a>',
 +                '<a href="chapters/02-installation.html">Install</a>',
 +                '<a href="../index.html">Back</a>',
 +                "",
 +            ]
 +        )
 +    )
 +    chapter_one.write_text("<html></html>\n")
 +    chapter_two.write_text("<html></html>\n")
++
 +    implementation_plan = temp_dir / "implementation.md"
 +    implementation_plan.write_text(
 +        "\n".join(
 +            [
 +                "# Implementation Plan",
 +                "",
 +                "## File Changes",
 +                f"- `{guide_root}/`",
 +                f"- `{chapters}/`",
 +                f"- `{index_path}`",
 +                f"- `{chapter_one}`",
 +                f"- `{chapter_two}`",
 +                "",
 +            ]
 +        )
 +    )
++
 +    context = build_context(
 +        temp_dir=temp_dir,
 +        messages=[],
 +        safeguards=FakeSafeguards(),
 +        assess_confidence=assess_confidence,
 +        verify_action=verify_action,
 +        auto_recover=False,
 +    )
 +    queued_messages: list[str] = []
 +    context.queue_steering_message_callback = queued_messages.append
 +    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))
 +    dod = create_definition_of_done("Create a multi-file nginx guide.")
 +    dod.implementation_plan = str(implementation_plan)
 +    dod.verification_commands = [f"ls -la {guide_root}"]
 +    sync_todos_to_definition_of_done(
 +        dod,
 +        [
 +            {
 +                "content": "Create chapter files following the established pattern",
 +                "active_form": "Creating chapter files",
 +                "status": "in_progress",
 +            }
 +        ],
 +        project_root=temp_dir,
 +    )
++
 +    tool_call = ToolCall(
 +        id="todo-post-build",
 +        name="TodoWrite",
 +        arguments={
 +            "todos": [
 +                {
 +                    "content": "Create chapter files following the established pattern",
 +                    "active_form": "Creating chapter files",
 +                    "status": "in_progress",
 +                }
 +            ]
 +        },
 +    )
 +    executor = FakeExecutor(
 +        [
 +            tool_outcome(
 +                tool_call=tool_call,
 +                output="Todos updated",
 +                is_error=False,
 +                metadata={
 +                    "new_todos": [
 +                        {
 +                            "content": "Create chapter files following the established pattern",
 +                            "active_form": "Creating chapter files",
 +                            "status": "in_progress",
 +                        }
 +                    ]
 +                },
 +            )
 +        ]
 +    )
++
 +    summary = TurnSummary(final_response="")
 +    await runner.execute_batch(
 +        tool_calls=[tool_call],
 +        tool_source="assistant",
 +        pending_tool_calls_seen=set(),
 +        emit=_noop_emit,
 +        summary=summary,
 +        dod=dod,
 +        executor=executor,  # type: ignore[arg-type]
 +        on_confirmation=None,
 +        on_user_question=None,
 +        emit_confirmation=None,
 +        consecutive_errors=0,
 +    )
++
 +    assert queued_messages
 +    message = queued_messages[-1]
 +    assert "Todo tracking is updated. All explicitly planned artifacts now exist on disk." in message
 +    assert "Repair or verify the current files instead of expanding the artifact set." in message
 +    assert "Move to verification or final confirmation using the files already on disk." in message
++
++
  @pytest.mark.asyncio
  async def test_tool_batch_runner_todowrite_with_existing_output_roots_requeues_next_mutation(
      temp_dir: Path,
      queued: list[str] = []
      context.queue_steering_message_callback = queued.append
      runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))
 +    dod = create_definition_of_done("Create a guide.")
      target = temp_dir / "guide" / "chapters" / "troubleshooting.html"
      runner._queue_blocked_html_declared_file_creation_nudge(
              "Already-declared local targets include: chapters/advanced-topics.html, "
              "chapters/basic-usage.html, chapters/configuration.html"
          ),
 +        dod=dod,
+     )
      assert queued
      assert "retry the file creation" in queued[0]
 +def test_tool_batch_runner_blocked_html_declared_file_creation_after_outputs_exist_prefers_verify(
 +    temp_dir: Path,
 +) -> None:
 +    async def assess_confidence(
 +        tool_name: str,
 +        tool_args: dict,
 +        context: str,
 +    ) -> ConfidenceAssessment:
 +        raise AssertionError("Confidence scoring should not run in this scenario")
++
 +    async def verify_action(
 +        tool_name: str,
 +        tool_args: dict,
 +        result: str,
 +        expected: str = "",
 +    ) -> ActionVerification:
 +        raise AssertionError("Verification should not run in this scenario")
++
 +    guide = temp_dir / "guide"
 +    chapters = guide / "chapters"
 +    guide.mkdir()
 +    chapters.mkdir()
 +    index = guide / "index.html"
 +    index.write_text(
 +        "\n".join(
 +            [
 +                '<a href="chapters/01-introduction.html">Intro</a>',
 +                '<a href="chapters/02-installation.html">Install</a>',
 +                '<a href="../index.html">Back</a>',
 +                "",
 +            ]
 +        )
 +    )
 +    (chapters / "01-introduction.html").write_text("<html></html>\n")
 +    (chapters / "02-installation.html").write_text("<html></html>\n")
++
 +    implementation_plan = temp_dir / "implementation.md"
 +    implementation_plan.write_text(
 +        "\n".join(
 +            [
 +                "# Implementation Plan",
 +                "",
 +                "## File Changes",
 +                f"- `{index}`",
 +                f"- `{chapters / '01-introduction.html'}`",
 +                f"- `{chapters / '02-installation.html'}`",
 +                "",
 +            ]
 +        )
 +    )
++
 +    context = build_context(
 +        temp_dir=temp_dir,
 +        messages=[],
 +        safeguards=FakeSafeguards(),
 +        assess_confidence=assess_confidence,
 +        verify_action=verify_action,
 +    )
 +    queued: list[str] = []
 +    context.queue_steering_message_callback = queued.append
 +    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))
 +    dod = create_definition_of_done("Create a guide.")
 +    dod.implementation_plan = str(implementation_plan)
 +    dod.verification_commands = [f"ls -la {guide}"]
 +    dod.touched_files = [str(index), str(chapters / "01-introduction.html"), str(chapters / "02-installation.html")]
++
 +    target = guide / "chapters" / "08-advanced-configuration.html"
 +    runner._queue_blocked_html_declared_file_creation_nudge(
 +        ToolCall(
 +            id="write-extra",
 +            name="write",
 +            arguments={"file_path": str(target)},
 +        ),
 +        (
 +            "[Blocked - HTML file creation falls outside the current declared artifact set] "
 +            "Suggestion: Keep new non-root HTML files within the root-declared artifact set and "
 +            f"update the guide root `{index.resolve(strict=False)}` before creating undeclared sibling pages, "
 +            "for example: chapters/08-advanced-configuration.html."
 +        ),
 +        dod=dod,
 +    )
++
 +    assert queued
 +    assert "All explicitly planned artifacts already exist on disk." in queued[0]
 +    assert "Do not expand the output set with `chapters/08-advanced-configuration.html`." in queued[0]
 +    assert "Move to verification or final confirmation using the files already on disk." in queued[0]
 +    assert "update the guide root" not in queued[0]
++
++
 +def test_tool_batch_runner_blocked_html_missing_target_after_outputs_exist_prefers_verify(
 +    temp_dir: Path,
 +) -> None:
 +    async def assess_confidence(
 +        tool_name: str,
 +        tool_args: dict,
 +        context: str,
 +    ) -> ConfidenceAssessment:
 +        raise AssertionError("Confidence scoring should not run in this scenario")
++
 +    async def verify_action(
 +        tool_name: str,
 +        tool_args: dict,
 +        result: str,
 +        expected: str = "",
 +    ) -> ActionVerification:
 +        raise AssertionError("Verification should not run in this scenario")
++
 +    guide = temp_dir / "guide"
 +    chapters = guide / "chapters"
 +    guide.mkdir()
 +    chapters.mkdir()
 +    index = guide / "index.html"
 +    index.write_text(
 +        "\n".join(
 +            [
 +                '<a href="chapters/01-introduction.html">Intro</a>',
 +                '<a href="chapters/02-installation.html">Install</a>',
 +                '<a href="../index.html">Back</a>',
 +                "",
 +            ]
 +        )
 +    )
 +    (chapters / "01-introduction.html").write_text("<html></html>\n")
 +    (chapters / "02-installation.html").write_text("<html></html>\n")
++
 +    implementation_plan = temp_dir / "implementation.md"
 +    implementation_plan.write_text(
 +        "\n".join(
 +            [
 +                "# Implementation Plan",
 +                "",
 +                "## File Changes",
 +                f"- `{index}`",
 +                f"- `{chapters / '01-introduction.html'}`",
 +                f"- `{chapters / '02-installation.html'}`",
 +                "",
 +            ]
 +        )
 +    )
++
 +    context = build_context(
 +        temp_dir=temp_dir,
 +        messages=[],
 +        safeguards=FakeSafeguards(),
 +        assess_confidence=assess_confidence,
 +        verify_action=verify_action,
 +    )
 +    queued: list[str] = []
 +    context.queue_steering_message_callback = queued.append
 +    runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir))
 +    dod = create_definition_of_done("Create a guide.")
 +    dod.implementation_plan = str(implementation_plan)
 +    dod.verification_commands = [f"ls -la {guide}"]
 +    dod.touched_files = [str(index), str(chapters / "01-introduction.html"), str(chapters / "02-installation.html")]
++
 +    runner._queue_blocked_html_missing_target_nudge(
 +        ToolCall(
 +            id="edit-root",
 +            name="edit",
 +            arguments={"file_path": str(index)},
 +        ),
 +        (
 +            "[Blocked - Edited HTML links point to files that do not exist] "
 +            "Suggestion: Use only existing local targets for href values and avoid introducing missing links, "
 +            "for example fix: chapters/08-advanced-configuration.html"
 +        ),
 +        dod=dod,
 +    )
++
 +    assert queued
 +    assert "All explicitly planned artifacts already exist on disk." in queued[0]
 +    assert "Do not introduce new local-link targets beyond the current output set." in queued[0]
 +    assert "Repair the existing generated files instead of expanding the guide." in queued[0]
++
++
  @pytest.mark.asyncio
  async def test_tool_batch_runner_blocked_empty_file_path_nudges_concrete_next_artifact(
      temp_dir: Path,