loader Public

Watch 0 Fork 0 Star 0

Python · 16857 bytes Raw Blame History

  
        1
        """Deterministic coverage for current runtime repair behavior."""
      
        2
        
        3
        from __future__ import annotations
      
        4
        
        5
        from pathlib import Path
      
        6
        
        7
        import pytest
      
        8
        
        9
        from loader.agent.loop import AgentConfig
      
        10
        from loader.llm.base import CompletionResponse, Role, ToolCall
      
        11
        from tests.helpers.runtime_harness import ScriptedBackend, run_scenario
      
        12
        
        13
        
        14
        def non_streaming_config() -> AgentConfig:
            """Build the agent configuration shared by the repair-flow tests.

            Auto-context and streaming are switched off, the loop is capped at
            eight iterations, and the reasoning completion check is disabled.
            """
            settings = AgentConfig(
                auto_context=False,
                stream=False,
                max_iterations=8,
            )
            settings.reasoning.completion_check = False
            return settings
      
        20
        
        21
        
        22
        def tool_event_names(run) -> list[str]:
            """Collect, in order, the tool names of tool-call events outside verification.

            An event is kept when its ``type`` is ``"tool_call"``, its ``tool_name``
            is truthy, and its ``phase`` is not ``"verification"``.
            """
            names = []
            for event in run.events:
                if event.type != "tool_call":
                    continue
                if not event.tool_name:
                    continue
                if event.phase == "verification":
                    continue
                names.append(event.tool_name)
            return names
      
        30
        
        31
        
        32
        @pytest.mark.asyncio
        async def test_first_turn_action_prompt_does_not_inject_prefill_message(
            temp_dir: Path,
        ) -> None:
            """The first model invocation must not contain an assistant ``[`` message."""
            scripted = ScriptedBackend(
                completions=[CompletionResponse(content="I can help with that.")]
            )
            await run_scenario(
                "Create allowed.txt with a greeting.",
                scripted,
                config=non_streaming_config(),
                project_root=temp_dir,
            )

            first_prompt = scripted.invocations[0].messages
            for message in first_prompt:
                is_prefill = message.role == Role.ASSISTANT and message.content == "["
                assert not is_prefill
      
        51
        
        52
        
        53
        @pytest.mark.asyncio
        async def test_empty_response_retry_injects_honest_user_reminder_and_recovers(
            temp_dir: Path,
        ) -> None:
            """An empty first completion is retried with a user reminder, then the run recovers.

            Scripted completions: an empty response, a ``read`` tool call, and a
            final answer.
            """
            fixture = temp_dir / "fixture.txt"
            fixture.write_text("repair baseline\n")

            read_fixture = ToolCall(
                id="read-1",
                name="read",
                arguments={"file_path": str(fixture)},
            )
            scripted = ScriptedBackend(
                completions=[
                    CompletionResponse(content=""),
                    CompletionResponse(
                        content="I'll inspect the file now.",
                        tool_calls=[read_fixture],
                    ),
                    CompletionResponse(content="Recovered after the empty response."),
                ]
            )

            outcome = await run_scenario(
                "Read the fixture file.",
                scripted,
                config=non_streaming_config(),
                project_root=temp_dir,
            )

            assert tool_event_names(outcome) == ["read"]
            assert "Recovered after the empty response." in outcome.response

            # Only repair and completion policy decisions matter for this check.
            policy_prefixes = ("repair_", "completion_")
            timeline = outcome.agent.last_turn_summary.workflow_timeline
            policy_entries = [
                entry for entry in timeline if entry.kind.startswith(policy_prefixes)
            ]
            observed_kinds = [entry.kind for entry in policy_entries]
            assert observed_kinds == [
                "repair_retry",
                "completion_complete",
            ]
            assert policy_entries[0].policy_stage == "empty_response"

            # The second invocation is the retry; it must carry the reminder.
            retry_prompt = scripted.invocations[1].messages
            assert any(
                "[EMPTY ASSISTANT RESPONSE]" in message.content
                for message in retry_prompt
                if message.role == Role.USER
            )
      
        100
        
        101
        
        102
        @pytest.mark.asyncio
        async def test_empty_response_retry_carries_forward_confirmed_progress(
            temp_dir: Path,
        ) -> None:
            """A retry issued after a ``write`` call must mention the file already written.

            Scripted completions: a ``write`` tool call, an empty response, and a
            final answer.
            """
            target = temp_dir / "hello.py"
            write_hello = ToolCall(
                id="write-1",
                name="write",
                arguments={
                    "file_path": str(target),
                    "content": "print('hello')\n",
                },
            )
            scripted = ScriptedBackend(
                completions=[
                    CompletionResponse(
                        content="I'll create the file now.",
                        tool_calls=[write_hello],
                    ),
                    CompletionResponse(content=""),
                    CompletionResponse(content="Recovered after the empty response."),
                ]
            )

            outcome = await run_scenario(
                "Create hello.py with a greeting.",
                scripted,
                config=non_streaming_config(),
                project_root=temp_dir,
            )

            assert "Recovered after the empty response." in outcome.response

            # The third invocation is the one sent after the empty response.
            reminders = [
                message.content
                for message in scripted.invocations[2].messages
                if message.role == Role.USER and "[EMPTY ASSISTANT RESPONSE]" in message.content
            ]
            assert reminders
            first_reminder = reminders[0]
            for expected_fragment in (
                "retry 1/2",
                "Continue from the confirmed progress below instead of restarting.",
                "hello.py",
            ):
                assert expected_fragment in first_reminder
      
        144
        
        145
        
        146
        @pytest.mark.asyncio
        async def test_empty_response_retry_budget_resets_after_successful_turn(
            temp_dir: Path,
        ) -> None:
            """Empty responses separated by a successful write each count as retry 1/2.

            Scripted completions: empty, ``write`` one.txt, empty, ``write`` two.txt,
            final answer. No reminder may be labelled ``retry 2/2``.
            """
            first = temp_dir / "one.txt"
            second = temp_dir / "two.txt"

            write_first = ToolCall(
                id="write-1",
                name="write",
                arguments={
                    "file_path": str(first),
                    "content": "one\n",
                },
            )
            write_second = ToolCall(
                id="write-2",
                name="write",
                arguments={
                    "file_path": str(second),
                    "content": "two\n",
                },
            )
            scripted = ScriptedBackend(
                completions=[
                    CompletionResponse(content=""),
                    CompletionResponse(
                        content="I'll create the first file now.",
                        tool_calls=[write_first],
                    ),
                    CompletionResponse(content=""),
                    CompletionResponse(
                        content="I'll create the second file now.",
                        tool_calls=[write_second],
                    ),
                    CompletionResponse(content="Both files are created."),
                ]
            )

            outcome = await run_scenario(
                "Create one.txt and two.txt.",
                scripted,
                config=non_streaming_config(),
                project_root=temp_dir,
            )

            assert outcome.response.startswith("Both files are created.")

            # Gather reminder texts across every invocation, collapsing a reminder
            # that is identical to the one collected immediately before it.
            reminders: list[str] = []
            for invocation in scripted.invocations:
                for message in invocation.messages:
                    is_reminder = (
                        message.role == Role.USER
                        and "[EMPTY ASSISTANT RESPONSE]" in message.content
                    )
                    if not is_reminder:
                        continue
                    repeats_previous = bool(reminders) and reminders[-1] == message.content
                    if not repeats_previous:
                        reminders.append(message.content)

            assert len(reminders) >= 2
            assert not any("retry 2/2" in text for text in reminders)
            first_attempt_count = sum("retry 1/2" in text for text in reminders)
            assert first_attempt_count >= 2
      
        205
        
        206
        
        207
        @pytest.mark.asyncio
        async def test_empty_response_retry_replaces_prior_user_interruption_handoff(
            temp_dir: Path,
        ) -> None:
            """The retry prompt holds exactly one steering message: the empty-response reminder.

            Scripted completions: ``write`` index.html, empty, ``write`` the chapter
            file, final answer. The third invocation is inspected.
            """
            first = temp_dir / "index.html"
            second = temp_dir / "chapters" / "01-introduction.html"

            write_index = ToolCall(
                id="write-1",
                name="write",
                arguments={
                    "file_path": str(first),
                    "content": "<html><a href=\"chapters/01-introduction.html\">Intro</a></html>\n",
                },
            )
            write_chapter = ToolCall(
                id="write-2",
                name="write",
                arguments={
                    "file_path": str(second),
                    "content": "<html></html>\n",
                },
            )
            scripted = ScriptedBackend(
                completions=[
                    CompletionResponse(
                        content="I'll create the guide index now.",
                        tool_calls=[write_index],
                    ),
                    CompletionResponse(content=""),
                    CompletionResponse(
                        content="I'll create the chapter now.",
                        tool_calls=[write_chapter],
                    ),
                    CompletionResponse(content="Done."),
                ]
            )

            outcome = await run_scenario(
                "Create index.html and a first chapter file.",
                scripted,
                config=non_streaming_config(),
                project_root=temp_dir,
            )

            assert outcome.response.startswith("Done.")

            # Any user message carrying one of these markers counts as steering.
            steering_markers = (
                "[EMPTY ASSISTANT RESPONSE]",
                "[USER INTERRUPTION]:",
                "[CONTINUE CURRENT STEP]",
            )
            retry_prompt = scripted.invocations[2].messages
            steering = [
                message.content
                for message in retry_prompt
                if message.role == Role.USER
                and any(marker in message.content for marker in steering_markers)
            ]

            assert len(steering) == 1
            only_message = steering[0]
            assert only_message.startswith("[EMPTY ASSISTANT RESPONSE]")
            assert "[USER INTERRUPTION]:" not in only_message
      
        268
        
        269
        
        270
        @pytest.mark.asyncio
        async def test_empty_response_retry_budget_resets_after_todowrite_turn(
            temp_dir: Path,
        ) -> None:
            """Empty responses separated by write and TodoWrite turns each count as retry 1/2.

            Scripted completions: empty, two ``write`` calls, a ``TodoWrite`` call,
            empty, a third ``write`` call, final answer. No reminder may be labelled
            ``retry 2/2``.
            """
            first = temp_dir / "index.html"
            second = temp_dir / "chapters" / "01-introduction.html"
            third = temp_dir / "chapters" / "02-installation.html"

            write_index = ToolCall(
                id="write-1",
                name="write",
                arguments={
                    "file_path": str(first),
                    "content": "<html></html>\n",
                },
            )
            write_intro = ToolCall(
                id="write-2",
                name="write",
                arguments={
                    "file_path": str(second),
                    "content": "<html></html>\n",
                },
            )
            todos = [
                {
                    "content": "Create index.html",
                    "status": "completed",
                    "active_form": "Creating index.html",
                },
                {
                    "content": "Create 01-introduction.html",
                    "status": "completed",
                    "active_form": "Creating 01-introduction.html",
                },
                {
                    "content": "Create 02-installation.html",
                    "status": "pending",
                    "active_form": "Creating 02-installation.html",
                },
            ]
            update_todos = ToolCall(
                id="todo-1",
                name="TodoWrite",
                arguments={"todos": todos},
            )
            write_install = ToolCall(
                id="write-3",
                name="write",
                arguments={
                    "file_path": str(third),
                    "content": "<html></html>\n",
                },
            )

            scripted = ScriptedBackend(
                completions=[
                    CompletionResponse(content=""),
                    CompletionResponse(
                        content="I'll create the guide index now.",
                        tool_calls=[write_index],
                    ),
                    CompletionResponse(
                        content="I'll create the first chapter now.",
                        tool_calls=[write_intro],
                    ),
                    CompletionResponse(
                        content="I'll update the task list now.",
                        tool_calls=[update_todos],
                    ),
                    CompletionResponse(content=""),
                    CompletionResponse(
                        content="I'll create the second chapter now.",
                        tool_calls=[write_install],
                    ),
                    CompletionResponse(content="The guide files are created."),
                ]
            )

            outcome = await run_scenario(
                "Create a small nginx guide.",
                scripted,
                config=non_streaming_config(),
                project_root=temp_dir,
            )

            assert outcome.response.startswith("The guide files are created.")

            # Gather reminder texts across every invocation, collapsing a reminder
            # that is identical to the one collected immediately before it.
            reminders: list[str] = []
            for invocation in scripted.invocations:
                for message in invocation.messages:
                    is_reminder = (
                        message.role == Role.USER
                        and "[EMPTY ASSISTANT RESPONSE]" in message.content
                    )
                    if not is_reminder:
                        continue
                    repeats_previous = bool(reminders) and reminders[-1] == message.content
                    if not repeats_previous:
                        reminders.append(message.content)

            assert len(reminders) >= 2
            assert not any("retry 2/2" in text for text in reminders)
            first_attempt_count = sum("retry 1/2" in text for text in reminders)
            assert first_attempt_count >= 2
      
        370
        
        371
        
        372
        @pytest.mark.asyncio
        async def test_repeated_empty_responses_fail_honestly_after_one_retry(
            temp_dir: Path,
        ) -> None:
            """Three empty completions in a row end the turn with an explicit failure.

            NOTE(review): despite the "one_retry" in the name, the assertions below
            expect two retries (three backend invocations, two ``repair_retry``
            timeline entries, and a message that says "retrying 2 times").
            """
            scripted = ScriptedBackend(
                completions=[CompletionResponse(content="") for _ in range(3)]
            )

            outcome = await run_scenario(
                "Read the fixture file.",
                scripted,
                config=non_streaming_config(),
                project_root=temp_dir,
            )

            assert tool_event_names(outcome) == []
            expected_response = (
                "I didn't get a usable response from the model after retrying 2 times. "
                "Please try again or switch to a different backend/model."
            )
            assert outcome.response == expected_response
            assert len(scripted.invocations) == 3

            timeline = outcome.agent.last_turn_summary.workflow_timeline
            closing_kinds = [entry.kind for entry in timeline[-3:]]
            assert closing_kinds == [
                "repair_retry",
                "repair_retry",
                "repair_fail",
            ]
            assert timeline[-1].reason_code == "empty_response_retry_exhausted"

            session = outcome.agent.session
            assert session.last_turn_transition_kind == "terminal"
            assert session.last_turn_transition_reason_code == "empty_response_retry_exhausted"
      
        409
        
        410
        
        411
        @pytest.mark.asyncio
        async def test_empty_response_retries_replace_prior_retry_message_within_same_episode(
            temp_dir: Path,
        ) -> None:
            """Two consecutive empty responses leave one reminder, labelled retry 2/2.

            Scripted completions: empty, empty, ``write`` three.txt, final answer.
            The third invocation is inspected.
            """
            target = temp_dir / "three.txt"
            write_three = ToolCall(
                id="write-1",
                name="write",
                arguments={
                    "file_path": str(target),
                    "content": "three\n",
                },
            )
            scripted = ScriptedBackend(
                completions=[
                    CompletionResponse(content=""),
                    CompletionResponse(content=""),
                    CompletionResponse(
                        content="I'll create the file now.",
                        tool_calls=[write_three],
                    ),
                    CompletionResponse(content="Done."),
                ]
            )

            outcome = await run_scenario(
                "Create three.txt.",
                scripted,
                config=non_streaming_config(),
                project_root=temp_dir,
            )

            assert outcome.response.startswith("Done.")

            third_prompt = scripted.invocations[2].messages
            reminders = [
                message.content
                for message in third_prompt
                if message.role == Role.USER and "[EMPTY ASSISTANT RESPONSE]" in message.content
            ]
            assert len(reminders) == 1
            (only_reminder,) = reminders
            assert "retry 2/2" in only_reminder
      
        452
        
        453
        
        454
        @pytest.mark.asyncio
        async def test_raw_text_tool_recovery_budget_fails_honestly(
            temp_dir: Path,
        ) -> None:
            """Four raw-text tool calls in a row end the turn with an explicit failure.

            Every scripted completion is a JSON tool call emitted as plain content.
            The test expects three ``read`` events, three ``repair_retry`` entries
            followed by ``repair_fail``, and no offer to continue in the response.
            """
            fixture_names = ("one.txt", "two.txt", "three.txt", "four.txt")
            for name in fixture_names:
                (temp_dir / name).write_text(f"{name}\n")

            raw_tool_calls = [
                '{"name": "read", "arguments": {"file_path": "one.txt"}}',
                '{"name": "read", "arguments": {"file_path": "two.txt"}}',
                '{"name": "read", "arguments": {"file_path": "three.txt"}}',
                '{"name": "read", "arguments": {"file_path": "four.txt"}}',
            ]
            scripted = ScriptedBackend(
                completions=[CompletionResponse(content=raw) for raw in raw_tool_calls]
            )

            outcome = await run_scenario(
                "Inspect the text fixtures.",
                scripted,
                config=non_streaming_config(),
                project_root=temp_dir,
            )

            assert tool_event_names(outcome) == ["read", "read", "read"]
            expected_response = (
                "I couldn't safely continue because the model kept emitting raw-text "
                "tool calls instead of proper tool invocations. Please try again or "
                "switch to a different backend/model."
            )
            assert outcome.response == expected_response

            timeline = outcome.agent.last_turn_summary.workflow_timeline
            closing_kinds = [entry.kind for entry in timeline[-4:]]
            assert closing_kinds == [
                "repair_retry",
                "repair_retry",
                "repair_retry",
                "repair_fail",
            ]
            assert timeline[-1].reason_code == "raw_text_tool_recovery_exhausted"
            assert "Let me know if you'd like me to continue" not in outcome.response

1	"""Deterministic coverage for current runtime repair behavior."""
2
3	from __future__ import annotations
4
5	from pathlib import Path
6
7	import pytest
8
9	from loader.agent.loop import AgentConfig
10	from loader.llm.base import CompletionResponse, Role, ToolCall
11	from tests.helpers.runtime_harness import ScriptedBackend, run_scenario
12
13
def non_streaming_config() -> AgentConfig:
    """Build the agent configuration shared by the repair-flow tests.

    Auto-context and streaming are switched off, the loop is capped at
    eight iterations, and the reasoning completion check is disabled.
    """
    settings = AgentConfig(
        auto_context=False,
        stream=False,
        max_iterations=8,
    )
    settings.reasoning.completion_check = False
    return settings
20
21
def tool_event_names(run) -> list[str]:
    """Collect, in order, the tool names of tool-call events outside verification.

    An event is kept when its ``type`` is ``"tool_call"``, its ``tool_name``
    is truthy, and its ``phase`` is not ``"verification"``.
    """
    names = []
    for event in run.events:
        if event.type != "tool_call":
            continue
        if not event.tool_name:
            continue
        if event.phase == "verification":
            continue
        names.append(event.tool_name)
    return names
30
31
@pytest.mark.asyncio
async def test_first_turn_action_prompt_does_not_inject_prefill_message(
    temp_dir: Path,
) -> None:
    """The first model invocation must not contain an assistant ``[`` message."""
    scripted = ScriptedBackend(
        completions=[CompletionResponse(content="I can help with that.")]
    )
    await run_scenario(
        "Create allowed.txt with a greeting.",
        scripted,
        config=non_streaming_config(),
        project_root=temp_dir,
    )

    first_prompt = scripted.invocations[0].messages
    for message in first_prompt:
        is_prefill = message.role == Role.ASSISTANT and message.content == "["
        assert not is_prefill
51
52
@pytest.mark.asyncio
async def test_empty_response_retry_injects_honest_user_reminder_and_recovers(
    temp_dir: Path,
) -> None:
    """An empty first completion is retried with a user reminder, then the run recovers.

    Scripted completions: an empty response, a ``read`` tool call, and a
    final answer.
    """
    fixture = temp_dir / "fixture.txt"
    fixture.write_text("repair baseline\n")

    read_fixture = ToolCall(
        id="read-1",
        name="read",
        arguments={"file_path": str(fixture)},
    )
    scripted = ScriptedBackend(
        completions=[
            CompletionResponse(content=""),
            CompletionResponse(
                content="I'll inspect the file now.",
                tool_calls=[read_fixture],
            ),
            CompletionResponse(content="Recovered after the empty response."),
        ]
    )

    outcome = await run_scenario(
        "Read the fixture file.",
        scripted,
        config=non_streaming_config(),
        project_root=temp_dir,
    )

    assert tool_event_names(outcome) == ["read"]
    assert "Recovered after the empty response." in outcome.response

    # Only repair and completion policy decisions matter for this check.
    policy_prefixes = ("repair_", "completion_")
    timeline = outcome.agent.last_turn_summary.workflow_timeline
    policy_entries = [
        entry for entry in timeline if entry.kind.startswith(policy_prefixes)
    ]
    observed_kinds = [entry.kind for entry in policy_entries]
    assert observed_kinds == [
        "repair_retry",
        "completion_complete",
    ]
    assert policy_entries[0].policy_stage == "empty_response"

    # The second invocation is the retry; it must carry the reminder.
    retry_prompt = scripted.invocations[1].messages
    assert any(
        "[EMPTY ASSISTANT RESPONSE]" in message.content
        for message in retry_prompt
        if message.role == Role.USER
    )
100
101
@pytest.mark.asyncio
async def test_empty_response_retry_carries_forward_confirmed_progress(
    temp_dir: Path,
) -> None:
    """Check the retry reminder sent after a write followed by an empty reply.

    The script is: a ``write`` tool call for ``hello.py``, an empty
    completion, then a final text answer. The third backend invocation must
    include a user ``[EMPTY ASSISTANT RESPONSE]`` message that mentions
    ``retry 1/2``, the continue-from-progress instruction, and ``hello.py``.
    """

    hello_path = temp_dir / "hello.py"

    write_reply = CompletionResponse(
        content="I'll create the file now.",
        tool_calls=[
            ToolCall(
                id="write-1",
                name="write",
                arguments={
                    "file_path": str(hello_path),
                    "content": "print('hello')\n",
                },
            )
        ],
    )
    empty_reply = CompletionResponse(content="")
    final_reply = CompletionResponse(content="Recovered after the empty response.")
    scripted = ScriptedBackend(completions=[write_reply, empty_reply, final_reply])

    result = await run_scenario(
        "Create hello.py with a greeting.",
        scripted,
        config=non_streaming_config(),
        project_root=temp_dir,
    )

    assert "Recovered after the empty response." in result.response

    reminders = [
        message.content
        for message in scripted.invocations[2].messages
        if message.role == Role.USER and "[EMPTY ASSISTANT RESPONSE]" in message.content
    ]
    assert reminders
    first_reminder = reminders[0]
    assert "retry 1/2" in first_reminder
    assert "Continue from the confirmed progress below instead of restarting." in first_reminder
    assert "hello.py" in first_reminder
144
145
@pytest.mark.asyncio
async def test_empty_response_retry_budget_resets_after_successful_turn(
    temp_dir: Path,
) -> None:
    """Each empty completion that follows a successful write is labelled retry 1/2.

    The script alternates empty completions with ``write`` tool calls for
    ``one.txt`` and ``two.txt`` and ends with a final text answer. Distinct
    consecutive ``[EMPTY ASSISTANT RESPONSE]`` user messages are gathered
    across all invocations. There must be at least two, none labelled
    ``retry 2/2`` and at least two labelled ``retry 1/2``.
    """

    path_one = temp_dir / "one.txt"
    path_two = temp_dir / "two.txt"

    scripted = ScriptedBackend(
        completions=[
            CompletionResponse(content=""),
            CompletionResponse(
                content="I'll create the first file now.",
                tool_calls=[
                    ToolCall(
                        id="write-1",
                        name="write",
                        arguments={"file_path": str(path_one), "content": "one\n"},
                    )
                ],
            ),
            CompletionResponse(content=""),
            CompletionResponse(
                content="I'll create the second file now.",
                tool_calls=[
                    ToolCall(
                        id="write-2",
                        name="write",
                        arguments={"file_path": str(path_two), "content": "two\n"},
                    )
                ],
            ),
            CompletionResponse(content="Both files are created."),
        ]
    )

    result = await run_scenario(
        "Create one.txt and two.txt.",
        scripted,
        config=non_streaming_config(),
        project_root=temp_dir,
    )

    assert result.response.startswith("Both files are created.")

    # Later invocations replay earlier history, so skip a reminder that is
    # identical to the one recorded immediately before it.
    reminders: list[str] = []
    for invocation in scripted.invocations:
        for message in invocation.messages:
            is_reminder = (
                message.role == Role.USER
                and "[EMPTY ASSISTANT RESPONSE]" in message.content
            )
            if is_reminder and (not reminders or reminders[-1] != message.content):
                reminders.append(message.content)

    assert len(reminders) >= 2
    assert not any("retry 2/2" in text for text in reminders)
    first_attempt_reminders = [text for text in reminders if "retry 1/2" in text]
    assert len(first_attempt_reminders) >= 2
205
206
@pytest.mark.asyncio
async def test_empty_response_retry_replaces_prior_user_interruption_handoff(
    temp_dir: Path,
) -> None:
    """The retry invocation carries exactly one steering message: the reminder.

    The script is: write ``index.html``, an empty completion, write the first
    chapter, then ``Done.``. Among the third invocation's user messages that
    contain any of the three steering markers, exactly one must exist. It
    must start with ``[EMPTY ASSISTANT RESPONSE]`` and contain no
    ``[USER INTERRUPTION]:`` text.
    """

    index_path = temp_dir / "index.html"
    chapter_path = temp_dir / "chapters" / "01-introduction.html"

    index_reply = CompletionResponse(
        content="I'll create the guide index now.",
        tool_calls=[
            ToolCall(
                id="write-1",
                name="write",
                arguments={
                    "file_path": str(index_path),
                    "content": "<html><a href=\"chapters/01-introduction.html\">Intro</a></html>\n",
                },
            )
        ],
    )
    empty_reply = CompletionResponse(content="")
    chapter_reply = CompletionResponse(
        content="I'll create the chapter now.",
        tool_calls=[
            ToolCall(
                id="write-2",
                name="write",
                arguments={
                    "file_path": str(chapter_path),
                    "content": "<html></html>\n",
                },
            )
        ],
    )
    done_reply = CompletionResponse(content="Done.")
    scripted = ScriptedBackend(
        completions=[index_reply, empty_reply, chapter_reply, done_reply]
    )

    result = await run_scenario(
        "Create index.html and a first chapter file.",
        scripted,
        config=non_streaming_config(),
        project_root=temp_dir,
    )

    assert result.response.startswith("Done.")

    steering_markers = (
        "[EMPTY ASSISTANT RESPONSE]",
        "[USER INTERRUPTION]:",
        "[CONTINUE CURRENT STEP]",
    )
    steering_texts = [
        message.content
        for message in scripted.invocations[2].messages
        if message.role == Role.USER
        and any(marker in message.content for marker in steering_markers)
    ]
    assert len(steering_texts) == 1
    only_steering = steering_texts[0]
    assert only_steering.startswith("[EMPTY ASSISTANT RESPONSE]")
    assert "[USER INTERRUPTION]:" not in only_steering
268
269
@pytest.mark.asyncio
async def test_empty_response_retry_budget_resets_after_todowrite_turn(
    temp_dir: Path,
) -> None:
    """An empty completion after a ``TodoWrite`` turn is labelled retry 1/2 again.

    The script is: an empty completion, two ``write`` calls, a ``TodoWrite``
    call, another empty completion, a third ``write`` call, then a final
    answer. Distinct consecutive ``[EMPTY ASSISTANT RESPONSE]`` user messages
    are gathered across all invocations. There must be at least two, none
    labelled ``retry 2/2`` and at least two labelled ``retry 1/2``.
    """

    index_path = temp_dir / "index.html"
    intro_path = temp_dir / "chapters" / "01-introduction.html"

    todo_items = [
        {
            "content": "Create index.html",
            "status": "completed",
            "active_form": "Creating index.html",
        },
        {
            "content": "Create 01-introduction.html",
            "status": "completed",
            "active_form": "Creating 01-introduction.html",
        },
        {
            "content": "Create 02-installation.html",
            "status": "pending",
            "active_form": "Creating 02-installation.html",
        },
    ]

    scripted = ScriptedBackend(
        completions=[
            CompletionResponse(content=""),
            CompletionResponse(
                content="I'll create the guide index now.",
                tool_calls=[
                    ToolCall(
                        id="write-1",
                        name="write",
                        arguments={
                            "file_path": str(index_path),
                            "content": "<html></html>\n",
                        },
                    )
                ],
            ),
            CompletionResponse(
                content="I'll create the first chapter now.",
                tool_calls=[
                    ToolCall(
                        id="write-2",
                        name="write",
                        arguments={
                            "file_path": str(intro_path),
                            "content": "<html></html>\n",
                        },
                    )
                ],
            ),
            CompletionResponse(
                content="I'll update the task list now.",
                tool_calls=[
                    ToolCall(
                        id="todo-1",
                        name="TodoWrite",
                        arguments={"todos": todo_items},
                    )
                ],
            ),
            CompletionResponse(content=""),
            CompletionResponse(
                content="I'll create the second chapter now.",
                tool_calls=[
                    ToolCall(
                        id="write-3",
                        name="write",
                        arguments={
                            "file_path": str(temp_dir / "chapters" / "02-installation.html"),
                            "content": "<html></html>\n",
                        },
                    )
                ],
            ),
            CompletionResponse(content="The guide files are created."),
        ]
    )

    result = await run_scenario(
        "Create a small nginx guide.",
        scripted,
        config=non_streaming_config(),
        project_root=temp_dir,
    )

    assert result.response.startswith("The guide files are created.")

    # Later invocations replay earlier history, so skip a reminder that is
    # identical to the one recorded immediately before it.
    reminders: list[str] = []
    for invocation in scripted.invocations:
        for message in invocation.messages:
            is_reminder = (
                message.role == Role.USER
                and "[EMPTY ASSISTANT RESPONSE]" in message.content
            )
            if is_reminder and (not reminders or reminders[-1] != message.content):
                reminders.append(message.content)

    assert len(reminders) >= 2
    assert not any("retry 2/2" in text for text in reminders)
    first_attempt_reminders = [text for text in reminders if "retry 1/2" in text]
    assert len(first_attempt_reminders) >= 2
370
371
@pytest.mark.asyncio
async def test_repeated_empty_responses_fail_honestly_after_one_retry(
    temp_dir: Path,
) -> None:
    """Three empty completions in a row end the turn with an explicit failure.

    The test checks that no tool events occur, the exact failure message is
    returned, the backend is invoked three times, the timeline ends with two
    ``repair_retry`` entries followed by ``repair_fail``, and the session
    records a terminal transition with ``empty_response_retry_exhausted``.

    NOTE(review): the test name says "one retry", but the assertions below
    pin two ``repair_retry`` entries and a "retrying 2 times" message.
    """

    scripted = ScriptedBackend(
        completions=[CompletionResponse(content="") for _ in range(3)]
    )

    result = await run_scenario(
        "Read the fixture file.",
        scripted,
        config=non_streaming_config(),
        project_root=temp_dir,
    )

    expected_failure = (
        "I didn't get a usable response from the model after retrying 2 times. "
        "Please try again or switch to a different backend/model."
    )
    assert tool_event_names(result) == []
    assert result.response == expected_failure
    assert len(scripted.invocations) == 3

    timeline = result.agent.last_turn_summary.workflow_timeline
    trailing_kinds = [item.kind for item in timeline[-3:]]
    assert trailing_kinds == ["repair_retry", "repair_retry", "repair_fail"]
    assert timeline[-1].reason_code == "empty_response_retry_exhausted"

    session = result.agent.session
    assert session.last_turn_transition_kind == "terminal"
    assert session.last_turn_transition_reason_code == "empty_response_retry_exhausted"
409
410
@pytest.mark.asyncio
async def test_empty_response_retries_replace_prior_retry_message_within_same_episode(
    temp_dir: Path,
) -> None:
    """After two empty completions in a row, only the latest reminder is sent.

    The script is: two empty completions, a ``write`` call for
    ``three.txt``, then ``Done.``. The third invocation must contain exactly
    one user ``[EMPTY ASSISTANT RESPONSE]`` message, labelled ``retry 2/2``.
    """

    target_path = temp_dir / "three.txt"

    write_reply = CompletionResponse(
        content="I'll create the file now.",
        tool_calls=[
            ToolCall(
                id="write-1",
                name="write",
                arguments={
                    "file_path": str(target_path),
                    "content": "three\n",
                },
            )
        ],
    )
    scripted = ScriptedBackend(
        completions=[
            CompletionResponse(content=""),
            CompletionResponse(content=""),
            write_reply,
            CompletionResponse(content="Done."),
        ]
    )

    result = await run_scenario(
        "Create three.txt.",
        scripted,
        config=non_streaming_config(),
        project_root=temp_dir,
    )

    assert result.response.startswith("Done.")

    reminders = [
        message.content
        for message in scripted.invocations[2].messages
        if message.role == Role.USER and "[EMPTY ASSISTANT RESPONSE]" in message.content
    ]
    assert len(reminders) == 1
    assert "retry 2/2" in reminders[0]
452
453
@pytest.mark.asyncio
async def test_raw_text_tool_recovery_budget_fails_honestly(
    temp_dir: Path,
) -> None:
    """Four raw-text tool calls in a row end the turn with an explicit failure.

    Each scripted completion is a JSON tool call emitted as plain text. The
    test checks that three ``read`` tool events are recorded, the exact
    failure message is returned, the timeline ends with three
    ``repair_retry`` entries followed by ``repair_fail``, and the final
    reason code is ``raw_text_tool_recovery_exhausted``.
    """

    fixture_names = ("one.txt", "two.txt", "three.txt", "four.txt")
    for name in fixture_names:
        (temp_dir / name).write_text(f"{name}\n")

    raw_text_calls = [
        '{"name": "read", "arguments": {"file_path": "one.txt"}}',
        '{"name": "read", "arguments": {"file_path": "two.txt"}}',
        '{"name": "read", "arguments": {"file_path": "three.txt"}}',
        '{"name": "read", "arguments": {"file_path": "four.txt"}}',
    ]
    scripted = ScriptedBackend(
        completions=[CompletionResponse(content=text) for text in raw_text_calls]
    )

    result = await run_scenario(
        "Inspect the text fixtures.",
        scripted,
        config=non_streaming_config(),
        project_root=temp_dir,
    )

    expected_failure = (
        "I couldn't safely continue because the model kept emitting raw-text "
        "tool calls instead of proper tool invocations. Please try again or "
        "switch to a different backend/model."
    )
    assert tool_event_names(result) == ["read", "read", "read"]
    assert result.response == expected_failure

    timeline = result.agent.last_turn_summary.workflow_timeline
    trailing_kinds = [item.kind for item in timeline[-4:]]
    assert trailing_kinds == [
        "repair_retry",
        "repair_retry",
        "repair_retry",
        "repair_fail",
    ]
    assert timeline[-1].reason_code == "raw_text_tool_recovery_exhausted"
    assert "Let me know if you'd like me to continue" not in result.response