Expand runtime parity coverage for Sprint 01
- SHA: ea386abc531480cb55c15c5c19f11baeaeec3d5a (short: ea386ab)
- Parents: 02dd95a
- Tree: a6b6954

| Status | File | + | - |
|---|---|---|---|
| M | .docs/PARITY.md | 15 | 12 |
| M | tests/test_runtime_harness.py | 109 | 0 |

.docs/PARITY.md — modified

@@ -1,8 +1,8 @@ | ||
| 1 | -# Loader Runtime Baseline | |
| 1 | +# Loader Runtime Parity Checkpoint | |
| 2 | 2 | |
| 3 | 3 | Date: 2026-04-06 |
| 4 | 4 | |
| 5 | -This file is the Sprint 00 baseline for Loader's current runtime behavior. It is intentionally narrow and operational: what the loop can do today, what is flaky, what is out of scope, and what scenarios we now measure with deterministic tests. | |
| 5 | +This file tracks the current deterministic runtime baseline for Loader. It stays intentionally narrow and operational: what the runtime can do today, what remains weak, and what scenarios we measure with repeatable tests. | |
| 6 | 6 | |
| 7 | 7 | ## Supported today |
| 8 | 8 | |
@@ -11,19 +11,20 @@ This file is the Sprint 00 baseline for Loader's current runtime behavior. It is | ||
| 11 | 11 | - confirmation callbacks for destructive `write` and `bash` actions |
| 12 | 12 | - raw JSON fallback when the model emits tool syntax in plain text |
| 13 | 13 | - heuristic completion nudges when the model stops before finishing a simple actionable task |
| 14 | +- typed `TurnSummary` output for completed turns, including trace events and tool-result messages | |
| 15 | +- unified tool execution for native and extracted tool calls through `runtime.executor.ToolExecutor` | |
| 16 | +- typed tool-result messages backed by `Message.tool_results` | |
| 14 | 17 | |
| 15 | 18 | ## Known weak spots |
| 16 | 19 | |
| 17 | -- the main runtime still lives in one large loop at [`src/loader/agent/loop.py`](../src/loader/agent/loop.py) | |
| 18 | -- duplicate suppression and pre-validation still try to construct `Message(..., tool_call_id=...)`, which is a known broken contract until Sprint 01 lands | |
| 19 | -- extracted raw-text tool execution duplicates the main tool execution path | |
| 20 | +- the core turn loop moved into [`src/loader/runtime/conversation.py`](../src/loader/runtime/conversation.py), but it is still much larger and more heuristic-heavy than the reference runtime in `refs/claw-code` | |
| 21 | +- planning, decomposition, and several helper behaviors still live in [`src/loader/agent/loop.py`](../src/loader/agent/loop.py), so ownership is cleaner than Sprint 00 but not fully simplified yet | |
| 20 | 22 | - completion is still heuristic, not evidence-backed |
| 21 | 23 | - permissions are confirmation-based, not policy-based |
| 22 | 24 | |
| 23 | 25 | ## Out of scope in the current baseline |
| 24 | 26 | |
| 25 | -- typed turn engine / unified executor | |
| 26 | -- permission modes | |
| 27 | +- permission modes / policy engine | |
| 27 | 28 | - persisted sessions / memory / `.loader/` runtime state |
| 28 | 29 | - mode router, clarify, or planning artifacts |
| 29 | 30 | - doctor / status / session product surfaces |
@@ -42,18 +43,20 @@ The auditable manifest lives at [`tests/fixtures/runtime_parity_manifest.json`]( | ||
| 42 | 43 | - `bash_confirmation_prompt_denied`: green |
| 43 | 44 | - `raw_json_tool_call_fallback`: green |
| 44 | 45 | - `completion_check_continuation`: green |
| 45 | -- `tool_result_contract_regression`: intentionally red in Sprint 00 | |
| 46 | +- `tool_result_contract_regression`: green | |
| 47 | +- `turn_summary_smoke_for_multi_tool_turn`: green | |
| 48 | +- `native_and_raw_tool_paths_share_executor_trace`: green | |
| 46 | 49 | |
| 47 | 50 | ## Verification snapshot |
| 48 | 51 | |
| 49 | 52 | As of 2026-04-06: |
| 50 | 53 | |
| 51 | -- `uv run pytest`: 70 passed, 1 failed | |
| 52 | -- the single failing test is `tests/test_runtime_harness.py::test_tool_result_contract_regression` | |
| 53 | -- that regression proves both broken branches currently raise `TypeError: Message.__init__() got an unexpected keyword argument 'tool_call_id'` | |
| 54 | +- `uv run pytest -q`: 78 passed | |
| 55 | +- `tests/test_runtime_harness.py` is fully green, including the original contract regression | |
| 56 | +- native and extracted tool calls now record the same executor trace events, with source-specific metadata | |
| 54 | 57 | |
| 55 | 58 | ## Definition of honesty |
| 56 | 59 | |
| 57 | 60 | - If a scenario is green here, it should have deterministic automated coverage. |
| 58 | 61 | - If a scenario is flaky or broken, it should be called out here before we claim parity work is done. |
| 59 | -- Sprint 01 should turn the intentional red regression green by fixing the tool-result message contract, not by weakening the test. | |
| 62 | +- Sprint 01 turned the original `tool_call_id` regression green by fixing the message contract, not by weakening the test. | |

tests/test_runtime_harness.py — modified

@@ -72,6 +72,14 @@ def tool_result_messages(run) -> list[str]: | ||
| 72 | 72 | return [event.content for event in run.events if event.type == "tool_result"] |
| 73 | 73 | |
| 74 | 74 | |
| 75 | +def trace_event_names(run) -> list[str]: | |
| 76 | + """Return recorded runtime trace event names.""" | |
| 77 | + | |
| 78 | + summary = run.agent.last_turn_summary | |
| 79 | + assert summary is not None | |
| 80 | + return [event.name for event in summary.trace] | |
| 81 | + | |
| 82 | + | |
| 75 | 83 | @pytest.mark.asyncio |
| 76 | 84 | async def test_runtime_parity_manifest_matches_implemented_cases() -> None: |
| 77 | 85 | manifest_names = [entry["name"] for entry in load_manifest()] |
@@ -161,6 +169,42 @@ async def test_multi_tool_turn_roundtrip(temp_dir: Path) -> None: | ||
| 161 | 169 | assert "two parity lines" in run.response |
| 162 | 170 | |
| 163 | 171 | |
| 172 | +@pytest.mark.asyncio | |
| 173 | +async def test_turn_summary_smoke_for_multi_tool_turn(temp_dir: Path) -> None: | |
| 174 | + fixture = temp_dir / "fixture.txt" | |
| 175 | + fixture.write_text("alpha parity line\nbeta line\ngamma parity line\n") | |
| 176 | + | |
| 177 | + backend = ScriptedBackend( | |
| 178 | + completions=[ | |
| 179 | + native_tool_response( | |
| 180 | + ToolCall(id="read-1", name="read", arguments={"file_path": str(fixture)}), | |
| 181 | + ToolCall( | |
| 182 | + id="grep-1", | |
| 183 | + name="grep", | |
| 184 | + arguments={"pattern": "parity", "path": str(fixture)}, | |
| 185 | + ), | |
| 186 | + content="I'll inspect the file and count parity matches.", | |
| 187 | + ), | |
| 188 | + final_response("The file has two parity lines, including alpha parity line."), | |
| 189 | + ] | |
| 190 | + ) | |
| 191 | + | |
| 192 | + run = await run_scenario( | |
| 193 | + "Inspect the fixture and find parity lines.", | |
| 194 | + backend, | |
| 195 | + config=non_streaming_config(), | |
| 196 | + project_root=temp_dir, | |
| 197 | + ) | |
| 198 | + | |
| 199 | + summary = run.agent.last_turn_summary | |
| 200 | + assert summary is not None | |
| 201 | + assert summary.final_response == run.response | |
| 202 | + assert summary.iterations == 2 | |
| 203 | + assert len(summary.assistant_messages) == 2 | |
| 204 | + assert len(summary.tool_result_messages) == 2 | |
| 205 | + assert "assistant.tool_batch" in trace_event_names(run) | |
| 206 | + | |
| 207 | + | |
| 164 | 208 | @pytest.mark.asyncio |
| 165 | 209 | async def test_write_file_allowed(temp_dir: Path) -> None: |
| 166 | 210 | target = temp_dir / "allowed.txt" |
@@ -355,6 +399,71 @@ async def test_raw_json_tool_call_fallback(temp_dir: Path) -> None: | ||
| 355 | 399 | assert "Recovered the raw JSON tool call" in run.response |
| 356 | 400 | |
| 357 | 401 | |
| 402 | +@pytest.mark.asyncio | |
| 403 | +async def test_native_and_raw_tool_paths_share_executor_trace(temp_dir: Path) -> None: | |
| 404 | + native_fixture = temp_dir / "native.txt" | |
| 405 | + native_fixture.write_text("native parity line\n") | |
| 406 | + native_backend = ScriptedBackend( | |
| 407 | + completions=[ | |
| 408 | + native_tool_response( | |
| 409 | + ToolCall(id="read-1", name="read", arguments={"file_path": str(native_fixture)}), | |
| 410 | + content="I'll inspect the native tool result.", | |
| 411 | + ), | |
| 412 | + final_response("Native read complete."), | |
| 413 | + ] | |
| 414 | + ) | |
| 415 | + native_run = await run_scenario( | |
| 416 | + "Read native.txt.", | |
| 417 | + native_backend, | |
| 418 | + config=non_streaming_config(), | |
| 419 | + project_root=temp_dir, | |
| 420 | + ) | |
| 421 | + | |
| 422 | + raw_fixture = temp_dir / "raw.txt" | |
| 423 | + raw_fixture.write_text("raw parity line\n") | |
| 424 | + raw_json = f'{{"name": "read", "arguments": {{"file_path": "{raw_fixture}"}}}}' | |
| 425 | + raw_backend = ScriptedBackend( | |
| 426 | + streams=[ | |
| 427 | + [ | |
| 428 | + StreamChunk(content=raw_json[:20], is_done=False), | |
| 429 | + StreamChunk(content=raw_json[20:], full_content=raw_json, is_done=True), | |
| 430 | + ], | |
| 431 | + [ | |
| 432 | + StreamChunk( | |
| 433 | + content="Raw read complete.", | |
| 434 | + full_content="Raw read complete.", | |
| 435 | + is_done=True, | |
| 436 | + ) | |
| 437 | + ], | |
| 438 | + ] | |
| 439 | + ) | |
| 440 | + raw_run = await run_scenario( | |
| 441 | + "Read raw.txt.", | |
| 442 | + raw_backend, | |
| 443 | + config=AgentConfig(auto_context=False, max_iterations=8), | |
| 444 | + project_root=temp_dir, | |
| 445 | + ) | |
| 446 | + | |
| 447 | + for run in (native_run, raw_run): | |
| 448 | + names = trace_event_names(run) | |
| 449 | + assert "assistant.tool_batch" in names | |
| 450 | + assert "tool.received" in names | |
| 451 | + assert "tool.executed" in names | |
| 452 | + | |
| 453 | + native_summary = native_run.agent.last_turn_summary | |
| 454 | + raw_summary = raw_run.agent.last_turn_summary | |
| 455 | + assert native_summary is not None | |
| 456 | + assert raw_summary is not None | |
| 457 | + assert any( | |
| 458 | + event.name == "tool.received" and event.data["source"] == "native" | |
| 459 | + for event in native_summary.trace | |
| 460 | + ) | |
| 461 | + assert any( | |
| 462 | + event.name == "tool.received" and event.data["source"] == "raw_text" | |
| 463 | + for event in raw_summary.trace | |
| 464 | + ) | |
| 465 | + | |
| 466 | + | |
| 358 | 467 | @pytest.mark.asyncio |
| 359 | 468 | async def test_completion_check_continuation( |
| 360 | 469 | temp_dir: Path, |