Python · 16857 bytes Raw Blame History
1 """Deterministic coverage for current runtime repair behavior."""
2
3 from __future__ import annotations
4
5 from pathlib import Path
6
7 import pytest
8
9 from loader.agent.loop import AgentConfig
10 from loader.llm.base import CompletionResponse, Role, ToolCall
11 from tests.helpers.runtime_harness import ScriptedBackend, run_scenario
12
13
def non_streaming_config() -> AgentConfig:
    """Build the deterministic agent configuration shared by the repair-flow tests.

    Streaming and automatic context are switched off, the loop is capped at
    eight iterations, and the reasoning completion check is disabled.
    """
    deterministic = AgentConfig(max_iterations=8, stream=False, auto_context=False)
    deterministic.reasoning.completion_check = False
    return deterministic
20
21
def tool_event_names(run) -> list[str]:
    """Collect, in event order, the tool names of non-verification tool calls.

    An event is kept only when its ``type`` is ``"tool_call"``, its
    ``tool_name`` is truthy, and its ``phase`` is not ``"verification"``.
    """
    names: list[str] = []
    for event in run.events:
        if event.type != "tool_call":
            continue
        if not event.tool_name:
            continue
        if event.phase == "verification":
            continue
        names.append(event.tool_name)
    return names
30
31
@pytest.mark.asyncio
async def test_first_turn_action_prompt_does_not_inject_prefill_message(
    temp_dir: Path,
) -> None:
    """The first backend invocation must not contain an assistant ``"["`` message."""
    scripted_reply = CompletionResponse(content="I can help with that.")
    backend = ScriptedBackend(completions=[scripted_reply])

    await run_scenario(
        "Create allowed.txt with a greeting.",
        backend,
        config=non_streaming_config(),
        project_root=temp_dir,
    )

    first_request = backend.invocations[0].messages
    prefill_messages = [
        message
        for message in first_request
        if message.role == Role.ASSISTANT and message.content == "["
    ]
    assert not prefill_messages
51
52
@pytest.mark.asyncio
async def test_empty_response_retry_injects_honest_user_reminder_and_recovers(
    temp_dir: Path,
) -> None:
    """An empty first completion leads to a retry reminder and a recovered turn.

    The scripted backend answers with an empty completion, then a ``read``
    tool call, then a final message. The test checks that exactly one
    ``read`` tool event ran, that the workflow timeline records a
    ``repair_retry`` (stage ``empty_response``) followed by
    ``completion_complete``, and that the second invocation carries a user
    message containing ``[EMPTY ASSISTANT RESPONSE]``.
    """
    fixture = temp_dir / "fixture.txt"
    # Pin the encoding so the fixture bytes do not depend on the host locale.
    fixture.write_text("repair baseline\n", encoding="utf-8")
    backend = ScriptedBackend(
        completions=[
            CompletionResponse(content=""),
            CompletionResponse(
                content="I'll inspect the file now.",
                tool_calls=[
                    ToolCall(
                        id="read-1",
                        name="read",
                        arguments={"file_path": str(fixture)},
                    )
                ],
            ),
            CompletionResponse(content="Recovered after the empty response."),
        ]
    )

    run = await run_scenario(
        "Read the fixture file.",
        backend,
        config=non_streaming_config(),
        project_root=temp_dir,
    )

    assert tool_event_names(run) == ["read"]
    assert "Recovered after the empty response." in run.response

    # Only repair/completion policy entries matter for this scenario.
    policy_entries = [
        entry
        for entry in run.agent.last_turn_summary.workflow_timeline
        if entry.kind.startswith(("repair_", "completion_"))
    ]
    assert [entry.kind for entry in policy_entries] == [
        "repair_retry",
        "completion_complete",
    ]
    assert policy_entries[0].policy_stage == "empty_response"

    # The retry (second invocation) must include the reminder as a user message.
    assert any(
        message.role == Role.USER
        and "[EMPTY ASSISTANT RESPONSE]" in message.content
        for message in backend.invocations[1].messages
    )
100
101
@pytest.mark.asyncio
async def test_empty_response_retry_carries_forward_confirmed_progress(
    temp_dir: Path,
) -> None:
    """The retry reminder sent after a mid-task empty reply cites prior progress.

    The script writes ``hello.py``, then returns an empty completion, then
    recovers. The reminder found in the third invocation must be labelled
    ``retry 1/2``, tell the model to continue from confirmed progress, and
    mention ``hello.py``.
    """
    target = temp_dir / "hello.py"
    write_call = ToolCall(
        id="write-1",
        name="write",
        arguments={
            "file_path": str(target),
            "content": "print('hello')\n",
        },
    )
    backend = ScriptedBackend(
        completions=[
            CompletionResponse(
                content="I'll create the file now.",
                tool_calls=[write_call],
            ),
            CompletionResponse(content=""),
            CompletionResponse(content="Recovered after the empty response."),
        ]
    )

    run = await run_scenario(
        "Create hello.py with a greeting.",
        backend,
        config=non_streaming_config(),
        project_root=temp_dir,
    )

    assert "Recovered after the empty response." in run.response

    reminders = [
        message.content
        for message in backend.invocations[2].messages
        if message.role == Role.USER and "[EMPTY ASSISTANT RESPONSE]" in message.content
    ]
    assert reminders
    first_reminder = reminders[0]
    assert "retry 1/2" in first_reminder
    assert (
        "Continue from the confirmed progress below instead of restarting."
        in first_reminder
    )
    assert "hello.py" in first_reminder
144
145
@pytest.mark.asyncio
async def test_empty_response_retry_budget_resets_after_successful_turn(
    temp_dir: Path,
) -> None:
    """Two separate empty replies, each followed by a write, both count as retry 1/2.

    The script alternates empty completion / write tool call twice and then
    finishes. Across all invocations, the distinct consecutive reminders
    must number at least two, none may say ``retry 2/2``, and at least two
    must say ``retry 1/2``.
    """
    first = temp_dir / "one.txt"
    second = temp_dir / "two.txt"

    write_first = CompletionResponse(
        content="I'll create the first file now.",
        tool_calls=[
            ToolCall(
                id="write-1",
                name="write",
                arguments={"file_path": str(first), "content": "one\n"},
            )
        ],
    )
    write_second = CompletionResponse(
        content="I'll create the second file now.",
        tool_calls=[
            ToolCall(
                id="write-2",
                name="write",
                arguments={"file_path": str(second), "content": "two\n"},
            )
        ],
    )
    backend = ScriptedBackend(
        completions=[
            CompletionResponse(content=""),
            write_first,
            CompletionResponse(content=""),
            write_second,
            CompletionResponse(content="Both files are created."),
        ]
    )

    run = await run_scenario(
        "Create one.txt and two.txt.",
        backend,
        config=non_streaming_config(),
        project_root=temp_dir,
    )

    assert run.response.startswith("Both files are created.")

    # Gather reminders across every invocation, collapsing back-to-back repeats
    # of the same reminder text.
    reminders: list[str] = []
    for invocation in backend.invocations:
        for message in invocation.messages:
            is_reminder = (
                message.role == Role.USER
                and "[EMPTY ASSISTANT RESPONSE]" in message.content
            )
            if not is_reminder:
                continue
            if not reminders or reminders[-1] != message.content:
                reminders.append(message.content)

    assert len(reminders) >= 2
    assert not any("retry 2/2" in reminder for reminder in reminders)
    assert sum("retry 1/2" in reminder for reminder in reminders) >= 2
205
206
@pytest.mark.asyncio
async def test_empty_response_retry_replaces_prior_user_interruption_handoff(
    temp_dir: Path,
) -> None:
    """The retry invocation carries exactly one steering message: the empty-response one.

    After a write and an empty completion, the third invocation is scanned
    for user messages containing any of the steering markers. Only a single
    message may match, it must start with ``[EMPTY ASSISTANT RESPONSE]``,
    and it must not contain ``[USER INTERRUPTION]:``.
    """
    first = temp_dir / "index.html"
    second = temp_dir / "chapters" / "01-introduction.html"

    index_step = CompletionResponse(
        content="I'll create the guide index now.",
        tool_calls=[
            ToolCall(
                id="write-1",
                name="write",
                arguments={
                    "file_path": str(first),
                    "content": "<html><a href=\"chapters/01-introduction.html\">Intro</a></html>\n",
                },
            )
        ],
    )
    chapter_step = CompletionResponse(
        content="I'll create the chapter now.",
        tool_calls=[
            ToolCall(
                id="write-2",
                name="write",
                arguments={
                    "file_path": str(second),
                    "content": "<html></html>\n",
                },
            )
        ],
    )
    backend = ScriptedBackend(
        completions=[
            index_step,
            CompletionResponse(content=""),
            chapter_step,
            CompletionResponse(content="Done."),
        ]
    )

    run = await run_scenario(
        "Create index.html and a first chapter file.",
        backend,
        config=non_streaming_config(),
        project_root=temp_dir,
    )

    assert run.response.startswith("Done.")

    steering_markers = (
        "[EMPTY ASSISTANT RESPONSE]",
        "[USER INTERRUPTION]:",
        "[CONTINUE CURRENT STEP]",
    )
    steering = [
        message.content
        for message in backend.invocations[2].messages
        if message.role == Role.USER
        and any(marker in message.content for marker in steering_markers)
    ]
    assert len(steering) == 1
    only_message = steering[0]
    assert only_message.startswith("[EMPTY ASSISTANT RESPONSE]")
    assert "[USER INTERRUPTION]:" not in only_message
268
269
@pytest.mark.asyncio
async def test_empty_response_retry_budget_resets_after_todowrite_turn(
    temp_dir: Path,
) -> None:
    """An empty reply after a ``TodoWrite`` turn is again reported as retry 1/2.

    Script: empty completion, two writes, a ``TodoWrite`` call, another
    empty completion, a third write, and a final message. Across all
    invocations, the distinct consecutive reminders must number at least
    two, none may say ``retry 2/2``, and at least two must say ``retry 1/2``.
    """

    def write_step(note: str, call_id: str, path: Path) -> CompletionResponse:
        # Every scripted write in this scenario stores the same placeholder page.
        return CompletionResponse(
            content=note,
            tool_calls=[
                ToolCall(
                    id=call_id,
                    name="write",
                    arguments={
                        "file_path": str(path),
                        "content": "<html></html>\n",
                    },
                )
            ],
        )

    first = temp_dir / "index.html"
    second = temp_dir / "chapters" / "01-introduction.html"
    third = temp_dir / "chapters" / "02-installation.html"

    todos = [
        {
            "content": "Create index.html",
            "status": "completed",
            "active_form": "Creating index.html",
        },
        {
            "content": "Create 01-introduction.html",
            "status": "completed",
            "active_form": "Creating 01-introduction.html",
        },
        {
            "content": "Create 02-installation.html",
            "status": "pending",
            "active_form": "Creating 02-installation.html",
        },
    ]
    todo_step = CompletionResponse(
        content="I'll update the task list now.",
        tool_calls=[
            ToolCall(
                id="todo-1",
                name="TodoWrite",
                arguments={"todos": todos},
            )
        ],
    )

    backend = ScriptedBackend(
        completions=[
            CompletionResponse(content=""),
            write_step("I'll create the guide index now.", "write-1", first),
            write_step("I'll create the first chapter now.", "write-2", second),
            todo_step,
            CompletionResponse(content=""),
            write_step("I'll create the second chapter now.", "write-3", third),
            CompletionResponse(content="The guide files are created."),
        ]
    )

    run = await run_scenario(
        "Create a small nginx guide.",
        backend,
        config=non_streaming_config(),
        project_root=temp_dir,
    )

    assert run.response.startswith("The guide files are created.")

    # Gather reminders across every invocation, collapsing back-to-back repeats
    # of the same reminder text.
    reminders: list[str] = []
    for invocation in backend.invocations:
        for message in invocation.messages:
            is_reminder = (
                message.role == Role.USER
                and "[EMPTY ASSISTANT RESPONSE]" in message.content
            )
            if not is_reminder:
                continue
            if not reminders or reminders[-1] != message.content:
                reminders.append(message.content)

    assert len(reminders) >= 2
    assert not any("retry 2/2" in reminder for reminder in reminders)
    assert sum("retry 1/2" in reminder for reminder in reminders) >= 2
370
371
@pytest.mark.asyncio
async def test_repeated_empty_responses_fail_honestly_after_one_retry(
    temp_dir: Path,
) -> None:
    """Three empty completions in a row end the turn with an explicit failure.

    Note that, despite "one_retry" in the test name, the assertions expect
    two ``repair_retry`` timeline entries, three backend invocations, and a
    failure message that reports "retrying 2 times".
    """
    backend = ScriptedBackend(
        completions=[CompletionResponse(content="") for _ in range(3)]
    )

    run = await run_scenario(
        "Read the fixture file.",
        backend,
        config=non_streaming_config(),
        project_root=temp_dir,
    )

    expected_response = (
        "I didn't get a usable response from the model after retrying 2 times. "
        "Please try again or switch to a different backend/model."
    )
    assert tool_event_names(run) == []
    assert run.response == expected_response
    assert len(backend.invocations) == 3

    timeline_tail = run.agent.last_turn_summary.workflow_timeline[-3:]
    assert [entry.kind for entry in timeline_tail] == [
        "repair_retry",
        "repair_retry",
        "repair_fail",
    ]
    final_entry = run.agent.last_turn_summary.workflow_timeline[-1]
    assert final_entry.reason_code == "empty_response_retry_exhausted"

    # The session must also record the exhausted retry as a terminal transition.
    assert run.agent.session.last_turn_transition_kind == "terminal"
    assert (
        run.agent.session.last_turn_transition_reason_code
        == "empty_response_retry_exhausted"
    )
409
410
@pytest.mark.asyncio
async def test_empty_response_retries_replace_prior_retry_message_within_same_episode(
    temp_dir: Path,
) -> None:
    """After two consecutive empty replies only the latest reminder is sent.

    The third invocation must contain exactly one user message with the
    ``[EMPTY ASSISTANT RESPONSE]`` marker, and it must be labelled
    ``retry 2/2``.
    """
    target = temp_dir / "three.txt"
    write_step = CompletionResponse(
        content="I'll create the file now.",
        tool_calls=[
            ToolCall(
                id="write-1",
                name="write",
                arguments={
                    "file_path": str(target),
                    "content": "three\n",
                },
            )
        ],
    )
    backend = ScriptedBackend(
        completions=[
            CompletionResponse(content=""),
            CompletionResponse(content=""),
            write_step,
            CompletionResponse(content="Done."),
        ]
    )

    run = await run_scenario(
        "Create three.txt.",
        backend,
        config=non_streaming_config(),
        project_root=temp_dir,
    )

    assert run.response.startswith("Done.")

    reminders_in_third_call = [
        message.content
        for message in backend.invocations[2].messages
        if message.role == Role.USER and "[EMPTY ASSISTANT RESPONSE]" in message.content
    ]
    assert len(reminders_in_third_call) == 1
    (only_reminder,) = reminders_in_third_call
    assert "retry 2/2" in only_reminder
451 assert "retry 2/2" in third_invocation_retry_messages[0]
452
453
@pytest.mark.asyncio
async def test_raw_text_tool_recovery_budget_fails_honestly(
    temp_dir: Path,
) -> None:
    """Repeated raw-text tool calls are recovered three times, then fail explicitly.

    The backend emits four completions whose content is a JSON-looking
    ``read`` call written as plain text. The test expects three ``read``
    tool events, three ``repair_retry`` timeline entries followed by a
    ``repair_fail`` with reason ``raw_text_tool_recovery_exhausted``, and
    the exact failure message (with no "continue" offer in it).
    """
    for name in ("one.txt", "two.txt", "three.txt", "four.txt"):
        # Pin the encoding so the fixture bytes do not depend on the host locale.
        (temp_dir / name).write_text(f"{name}\n", encoding="utf-8")

    backend = ScriptedBackend(
        completions=[
            CompletionResponse(
                content='{"name": "read", "arguments": {"file_path": "one.txt"}}'
            ),
            CompletionResponse(
                content='{"name": "read", "arguments": {"file_path": "two.txt"}}'
            ),
            CompletionResponse(
                content='{"name": "read", "arguments": {"file_path": "three.txt"}}'
            ),
            CompletionResponse(
                content='{"name": "read", "arguments": {"file_path": "four.txt"}}'
            ),
        ]
    )

    run = await run_scenario(
        "Inspect the text fixtures.",
        backend,
        config=non_streaming_config(),
        project_root=temp_dir,
    )

    assert tool_event_names(run) == ["read", "read", "read"]
    assert run.response == (
        "I couldn't safely continue because the model kept emitting raw-text "
        "tool calls instead of proper tool invocations. Please try again or "
        "switch to a different backend/model."
    )
    assert [entry.kind for entry in run.agent.last_turn_summary.workflow_timeline[-4:]] == [
        "repair_retry",
        "repair_retry",
        "repair_retry",
        "repair_fail",
    ]
    assert run.agent.last_turn_summary.workflow_timeline[-1].reason_code == (
        "raw_text_tool_recovery_exhausted"
    )
    assert "Let me know if you'd like me to continue" not in run.response
500 assert "Let me know if you'd like me to continue" not in run.response