tenseleyflow/loader / ea386ab

Browse files

Expand runtime parity coverage for Sprint 01

Authored by espadonne
SHA
ea386abc531480cb55c15c5c19f11baeaeec3d5a
Parents
02dd95a
Tree
a6b6954

2 changed files

StatusFile+-
M .docs/PARITY.md 15 12
M tests/test_runtime_harness.py 109 0
.docs/PARITY.mdmodified
@@ -1,8 +1,8 @@
1
-# Loader Runtime Baseline
1
+# Loader Runtime Parity Checkpoint
22
 
33
 Date: 2026-04-06
44
 
5
-This file is the Sprint 00 baseline for Loader's current runtime behavior. It is intentionally narrow and operational: what the loop can do today, what is flaky, what is out of scope, and what scenarios we now measure with deterministic tests.
5
+This file tracks the current deterministic runtime baseline for Loader. It stays intentionally narrow and operational: what the runtime can do today, what remains weak, and what scenarios we measure with repeatable tests.
66
 
77
 ## Supported today
88
 
@@ -11,19 +11,20 @@ This file is the Sprint 00 baseline for Loader's current runtime behavior. It is
1111
 - confirmation callbacks for destructive `write` and `bash` actions
1212
 - raw JSON fallback when the model emits tool syntax in plain text
1313
 - heuristic completion nudges when the model stops before finishing a simple actionable task
14
+- typed `TurnSummary` output for completed turns, including trace events and tool-result messages
15
+- unified tool execution for native and extracted tool calls through `runtime.executor.ToolExecutor`
16
+- typed tool-result messages backed by `Message.tool_results`
1417
 
1518
 ## Known weak spots
1619
 
17
-- the main runtime still lives in one large loop at [`src/loader/agent/loop.py`](../src/loader/agent/loop.py)
18
-- duplicate suppression and pre-validation still try to construct `Message(..., tool_call_id=...)`, which is a known broken contract until Sprint 01 lands
19
-- extracted raw-text tool execution duplicates the main tool execution path
20
+- the core turn loop moved into [`src/loader/runtime/conversation.py`](../src/loader/runtime/conversation.py), but it is still much larger and more heuristic-heavy than the reference runtime in `refs/claw-code`
21
+- planning, decomposition, and several helper behaviors still live in [`src/loader/agent/loop.py`](../src/loader/agent/loop.py), so ownership is cleaner than in Sprint 00 but not fully simplified yet
2022
 - completion is still heuristic, not evidence-backed
2123
 - permissions are confirmation-based, not policy-based
2224
 
2325
 ## Out of scope in the current baseline
2426
 
25
-- typed turn engine / unified executor
26
-- permission modes
27
+- permission modes / policy engine
2728
 - persisted sessions / memory / `.loader/` runtime state
2829
 - mode router, clarify, or planning artifacts
2930
 - doctor / status / session product surfaces
@@ -42,18 +43,20 @@ The auditable manifest lives at [`tests/fixtures/runtime_parity_manifest.json`](
4243
 - `bash_confirmation_prompt_denied`: green
4344
 - `raw_json_tool_call_fallback`: green
4445
 - `completion_check_continuation`: green
45
-- `tool_result_contract_regression`: intentionally red in Sprint 00
46
+- `tool_result_contract_regression`: green
47
+- `turn_summary_smoke_for_multi_tool_turn`: green
48
+- `native_and_raw_tool_paths_share_executor_trace`: green
4649
 
4750
 ## Verification snapshot
4851
 
4952
 As of 2026-04-06:
5053
 
51
-- `uv run pytest`: 70 passed, 1 failed
52
-- the single failing test is `tests/test_runtime_harness.py::test_tool_result_contract_regression`
53
-- that regression proves both broken branches currently raise `TypeError: Message.__init__() got an unexpected keyword argument 'tool_call_id'`
54
+- `uv run pytest -q`: 78 passed
55
+- `tests/test_runtime_harness.py` is fully green, including the original contract regression
56
+- native and extracted tool calls now record the same executor trace events, with source-specific metadata
5457
 
5558
 ## Definition of honesty
5659
 
5760
 - If a scenario is green here, it should have deterministic automated coverage.
5861
 - If a scenario is flaky or broken, it should be called out here before we claim parity work is done.
59
-- Sprint 01 should turn the intentional red regression green by fixing the tool-result message contract, not by weakening the test.
62
+- Sprint 01 turned the original `tool_call_id` regression green by fixing the message contract, not by weakening the test.
tests/test_runtime_harness.pymodified
@@ -72,6 +72,14 @@ def tool_result_messages(run) -> list[str]:
7272
     return [event.content for event in run.events if event.type == "tool_result"]
7373
 
7474
 
75
+def trace_event_names(run) -> list[str]:
76
+    """Return recorded runtime trace event names."""
77
+
78
+    summary = run.agent.last_turn_summary
79
+    assert summary is not None
80
+    return [event.name for event in summary.trace]
81
+
82
+
7583
 @pytest.mark.asyncio
7684
 async def test_runtime_parity_manifest_matches_implemented_cases() -> None:
7785
     manifest_names = [entry["name"] for entry in load_manifest()]
@@ -161,6 +169,42 @@ async def test_multi_tool_turn_roundtrip(temp_dir: Path) -> None:
161169
     assert "two parity lines" in run.response
162170
 
163171
 
172
+@pytest.mark.asyncio
173
+async def test_turn_summary_smoke_for_multi_tool_turn(temp_dir: Path) -> None:
174
+    fixture = temp_dir / "fixture.txt"
175
+    fixture.write_text("alpha parity line\nbeta line\ngamma parity line\n")
176
+
177
+    backend = ScriptedBackend(
178
+        completions=[
179
+            native_tool_response(
180
+                ToolCall(id="read-1", name="read", arguments={"file_path": str(fixture)}),
181
+                ToolCall(
182
+                    id="grep-1",
183
+                    name="grep",
184
+                    arguments={"pattern": "parity", "path": str(fixture)},
185
+                ),
186
+                content="I'll inspect the file and count parity matches.",
187
+            ),
188
+            final_response("The file has two parity lines, including alpha parity line."),
189
+        ]
190
+    )
191
+
192
+    run = await run_scenario(
193
+        "Inspect the fixture and find parity lines.",
194
+        backend,
195
+        config=non_streaming_config(),
196
+        project_root=temp_dir,
197
+    )
198
+
199
+    summary = run.agent.last_turn_summary
200
+    assert summary is not None
201
+    assert summary.final_response == run.response
202
+    assert summary.iterations == 2
203
+    assert len(summary.assistant_messages) == 2
204
+    assert len(summary.tool_result_messages) == 2
205
+    assert "assistant.tool_batch" in trace_event_names(run)
206
+
207
+
164208
 @pytest.mark.asyncio
165209
 async def test_write_file_allowed(temp_dir: Path) -> None:
166210
     target = temp_dir / "allowed.txt"
@@ -355,6 +399,71 @@ async def test_raw_json_tool_call_fallback(temp_dir: Path) -> None:
355399
     assert "Recovered the raw JSON tool call" in run.response
356400
 
357401
 
402
+@pytest.mark.asyncio
403
+async def test_native_and_raw_tool_paths_share_executor_trace(temp_dir: Path) -> None:
404
+    native_fixture = temp_dir / "native.txt"
405
+    native_fixture.write_text("native parity line\n")
406
+    native_backend = ScriptedBackend(
407
+        completions=[
408
+            native_tool_response(
409
+                ToolCall(id="read-1", name="read", arguments={"file_path": str(native_fixture)}),
410
+                content="I'll inspect the native tool result.",
411
+            ),
412
+            final_response("Native read complete."),
413
+        ]
414
+    )
415
+    native_run = await run_scenario(
416
+        "Read native.txt.",
417
+        native_backend,
418
+        config=non_streaming_config(),
419
+        project_root=temp_dir,
420
+    )
421
+
422
+    raw_fixture = temp_dir / "raw.txt"
423
+    raw_fixture.write_text("raw parity line\n")
424
+    raw_json = f'{{"name": "read", "arguments": {{"file_path": "{raw_fixture}"}}}}'
425
+    raw_backend = ScriptedBackend(
426
+        streams=[
427
+            [
428
+                StreamChunk(content=raw_json[:20], is_done=False),
429
+                StreamChunk(content=raw_json[20:], full_content=raw_json, is_done=True),
430
+            ],
431
+            [
432
+                StreamChunk(
433
+                    content="Raw read complete.",
434
+                    full_content="Raw read complete.",
435
+                    is_done=True,
436
+                )
437
+            ],
438
+        ]
439
+    )
440
+    raw_run = await run_scenario(
441
+        "Read raw.txt.",
442
+        raw_backend,
443
+        config=AgentConfig(auto_context=False, max_iterations=8),
444
+        project_root=temp_dir,
445
+    )
446
+
447
+    for run in (native_run, raw_run):
448
+        names = trace_event_names(run)
449
+        assert "assistant.tool_batch" in names
450
+        assert "tool.received" in names
451
+        assert "tool.executed" in names
452
+
453
+    native_summary = native_run.agent.last_turn_summary
454
+    raw_summary = raw_run.agent.last_turn_summary
455
+    assert native_summary is not None
456
+    assert raw_summary is not None
457
+    assert any(
458
+        event.name == "tool.received" and event.data["source"] == "native"
459
+        for event in native_summary.trace
460
+    )
461
+    assert any(
462
+        event.name == "tool.received" and event.data["source"] == "raw_text"
463
+        for event in raw_summary.trace
464
+    )
465
+
466
+
358467
 @pytest.mark.asyncio
359468
 async def test_completion_check_continuation(
360469
     temp_dir: Path,