Python · 23810 bytes Raw Blame History
1 """Tests for persisted session state and resume support."""
2
3 from __future__ import annotations
4
5 import json
6 from pathlib import Path
7
8 import pytest
9
10 from loader.agent.loop import Agent, AgentConfig, ReasoningConfig
11 from loader.llm.base import CompletionResponse, Message, Role, ToolCall
12 from loader.runtime.completion_trace import CompletionTraceEntry
13 from loader.runtime.evidence_provenance import EvidenceProvenance
14 from loader.runtime.prompt_history import PromptSnapshot
15 from loader.runtime.runtime_handle import RuntimeHandle
16 from loader.runtime.session import ConversationSession
17 from loader.runtime.workflow_ledger import WorkflowLedger, WorkflowLedgerItem
18 from loader.runtime.workflow_policy import WorkflowTimelineEntry
19 from tests.helpers.runtime_harness import ScriptedBackend
20
21
def _dummy_system() -> Message:
    """Build the placeholder system message used as a session's system-message factory."""
    placeholder = Message(role=Role.SYSTEM, content="system")
    return placeholder
24
25
26 def _dummy_few_shots() -> list[Message]:
27 return []
28
29
@pytest.mark.asyncio
async def test_session_persists_and_resumes_across_agent_restart(temp_dir: Path) -> None:
    """A second agent can resume, by id, the session a first agent wrote to disk."""
    task = "Create hello.txt in the workspace root."
    write_call = ToolCall(
        id="write-1",
        name="write",
        arguments={
            "file_path": str(temp_dir / "hello.txt"),
            "content": "hello\n",
        },
    )
    # Two scripted turns: one that issues the write tool call, one that wraps up.
    scripted_turns = [
        CompletionResponse(
            content="I'll create the file.",
            tool_calls=[write_call],
            usage={"prompt_tokens": 12, "completion_tokens": 5},
        ),
        CompletionResponse(
            content="The file is written.",
            usage={"prompt_tokens": 10, "completion_tokens": 4},
        ),
    ]
    config = AgentConfig(auto_context=False, stream=False)
    first_agent = Agent(
        backend=ScriptedBackend(completions=scripted_turns),
        config=config,
        project_root=temp_dir,
    )

    response = await first_agent.run(task)

    assert response.startswith("The file is written.")
    session_id = first_agent.session.session_id
    assert first_agent.session.storage_path.exists()

    # The second agent gets an empty script, so everything asserted below has
    # to come from the resumed session rather than from new completions.
    resumed_agent = Agent(
        backend=ScriptedBackend(completions=[]),
        config=config,
        project_root=temp_dir,
    )

    assert resumed_agent.resume_session(session_id) is True
    assert resumed_agent.session.session_id == session_id
    assert resumed_agent._current_task == task
    assert resumed_agent.active_permission_mode == "workspace-write"
    assert resumed_agent.workflow_mode == first_agent.workflow_mode

    restored_summary = resumed_agent.last_turn_summary
    assert restored_summary is not None
    restored_dod = restored_summary.definition_of_done
    assert restored_dod is not None
    assert restored_dod.task_statement == task

    # The original user prompt must still be present in the restored history.
    restored_user_prompts = [
        message.content
        for message in resumed_agent.messages
        if message.role == Role.USER
    ]
    assert task in restored_user_prompts
84
85
def test_agent_clear_history_rebuilds_a_fresh_runtime_session(temp_dir: Path) -> None:
    """clear_history() yields a new session id and resets the runtime state set beforehand."""
    config = AgentConfig(auto_context=False, stream=False)
    agent = Agent(backend=ScriptedBackend(), config=config, project_root=temp_dir)
    stale_session_id = agent.session.session_id

    # Dirty each piece of state that the assertions below expect to be reset.
    agent.current_task = "Keep runtime state tidy."
    agent.prompt_format = "native"
    agent.prompt_sections = ["Runtime Config", "Workflow Context"]
    agent.set_workflow_mode("clarify")
    agent.queue_steering_message("Stay in runtime.")

    agent.clear_history()

    # A different id shows the session object was replaced, not just emptied.
    assert agent.session.session_id != stale_session_id
    assert agent.current_task is None
    assert agent.workflow_mode == "execute"
    assert agent.prompt_format is None
    assert agent.prompt_sections == []
    assert agent.messages == []
    assert agent.last_turn_summary is None
    assert agent.drain_steering_messages() == []
109
110
def test_session_rotation_kicks_in_at_size_cap(temp_dir: Path) -> None:
    """With a 250-byte rotation cap, six long appends leave a ``.1.json`` file on disk."""
    session = ConversationSession(
        system_message_factory=_dummy_system,
        few_shot_factory=_dummy_few_shots,
        project_root=temp_dir,
        rotate_after_bytes=250,
    )

    # Each message body carries 120 filler characters on top of its prefix.
    filler = "x" * 120
    for index in range(6):
        session.append(Message(role=Role.USER, content=f"Message {index}: {filler}"))

    live_file = session.storage_path
    assert live_file.exists()
    # Expected rotated file: the live path with its suffix replaced by ".1.json".
    assert live_file.with_suffix(".1.json").exists()
129
130
def test_session_compaction_persists_summary_and_recent_messages(temp_dir: Path) -> None:
    """maybe_compact() puts a compaction marker first and keeps the last four messages."""
    keep_last = 4
    transcript = [
        (Role.USER, "Kick off runtime audit"),
        (Role.ASSISTANT, "Initial findings"),
        (Role.USER, "Focus on sessions"),
        (Role.ASSISTANT, "Compaction design drafted"),
        (Role.USER, "Preserve the latest four messages"),
        (Role.ASSISTANT, "Ready to compact"),
    ]
    session = ConversationSession(
        system_message_factory=_dummy_system,
        few_shot_factory=_dummy_few_shots,
        project_root=temp_dir,
        messages=[Message(role=role, content=text) for role, text in transcript],
        auto_compaction_input_tokens_threshold=1,
        compaction_keep_last_messages=keep_last,
    )

    result = session.maybe_compact()

    assert result is not None
    assert session.compaction is not None
    assert session.storage_path.exists()
    assert session.messages[0].content.startswith("[COMPACTED CONTEXT]")

    # The surviving tail must be the final `keep_last` transcript entries, in order.
    surviving_tail = [message.content for message in session.messages[-keep_last:]]
    assert surviving_tail == [text for _, text in transcript[-keep_last:]]
160
161
def test_build_request_messages_omits_large_mutation_tool_calls_from_history(
    temp_dir: Path,
) -> None:
    """Request messages drop bulky write/edit tool calls; the stored history keeps them."""
    large_html = f"<html>{'x' * 400}</html>"
    old_block = "old\n" * 120
    new_block = "new\n" * 120

    write_call = ToolCall(
        id="write-1",
        name="write",
        arguments={
            "file_path": str(temp_dir / "guides" / "nginx" / "index.html"),
            "content": large_html,
        },
    )
    edit_call = ToolCall(
        id="edit-1",
        name="edit",
        arguments={
            "file_path": str(temp_dir / "README.md"),
            "old_string": old_block,
            "new_string": new_block,
        },
    )
    history = [
        Message(role=Role.USER, content="Create the guide."),
        Message(
            role=Role.ASSISTANT,
            content="I'll write the first files now.",
            tool_calls=[write_call, edit_call],
        ),
    ]
    session = ConversationSession(
        system_message_factory=_dummy_system,
        few_shot_factory=_dummy_few_shots,
        project_root=temp_dir,
        messages=history,
    )

    request_messages = session.build_request_messages()

    # Index 2 of the request is asserted to be the assistant turn, minus tool calls.
    outbound_assistant = request_messages[2]
    assert outbound_assistant.tool_calls == []
    assert outbound_assistant.content == "I'll write the first files now."

    # The session's own copy of that turn must still hold the full payloads.
    stored_calls = session.messages[1].tool_calls
    assert stored_calls[0].arguments["content"] == large_html
    assert stored_calls[1].arguments["old_string"] == old_block
    assert stored_calls[1].arguments["new_string"] == new_block
207
208
def test_session_persists_permission_policy_metadata(temp_dir: Path) -> None:
    """Runtime-state updates, a timeline entry and a trace entry all survive a reload."""
    # Values that are written once and then compared against the reloaded session.
    rules_source = str(temp_dir / ".loader" / "permission-rules.json")
    updated_rule_counts = {"allow": 2, "deny": 1, "ask": 4}
    updated_sections = ["Runtime Config", "Workflow Context", "Project Context"]
    route_reason_code = "task_is_complex"
    route_reason_summary = "task looks complex enough to benefit from a persisted plan"
    reentry_code = "verification_failed_reentry"
    reentry_summary = (
        "continued after verification failed and the runtime re-entered execute mode"
    )
    transition_summary = "completion -> finalize [terminal] Finalizing completed turn"
    pytest_evidence = "verification contradiction: pytest still failed"
    open_question = "Scope is still broad."

    session = ConversationSession(
        system_message_factory=_dummy_system,
        few_shot_factory=_dummy_few_shots,
        project_root=temp_dir,
        permission_mode="prompt",
        permission_prompting_enabled=True,
        permission_rule_counts={"allow": 1, "deny": 2, "ask": 3},
        permission_rules_source=rules_source,
        prompt_format="react",
        prompt_sections=["Runtime Config", "Workflow Context"],
    )

    # Overwrite the constructor values; the reload must reflect these, not the originals.
    # Fresh copies are passed so the session never shares a container with the expectations.
    session.update_runtime_state(
        current_task="Inspect permission history",
        runtime_owner_type="RuntimeHandle",
        permission_mode="allow",
        permission_prompting_enabled=True,
        permission_rule_counts=dict(updated_rule_counts),
        permission_rules_source=rules_source,
        prompt_format="native",
        prompt_sections=list(updated_sections),
        workflow_reason_code=route_reason_code,
        workflow_reason_summary=route_reason_summary,
        workflow_decision_kind="initial_route",
        workflow_ambiguity_score=0.2,
        workflow_complexity_score=0.6,
        workflow_scheduled_next_mode="execute",
        last_completion_decision_code=reentry_code,
        last_completion_decision_summary=reentry_summary,
        last_turn_transition_summary=transition_summary,
        last_turn_transition_kind="terminal",
        last_turn_transition_reason_code="turn_complete",
    )
    session.append_workflow_timeline_entry(
        WorkflowTimelineEntry(
            timestamp="2026-04-07T12:00:00Z",
            kind="route",
            mode="plan",
            reason_code=route_reason_code,
            summary="plan: workflow pressure favors a persisted plan before execution",
            decision_kind="initial_route",
            route_score=0.72,
            runner_up_mode="clarify",
            runner_up_score=0.61,
            scheduled_next_mode="execute",
            unresolved_questions=[open_question],
            prompt_format="native",
            prompt_sections=list(updated_sections),
        )
    )
    session.append_completion_trace_entry(
        CompletionTraceEntry(
            stage="definition_of_done",
            outcome="continue",
            decision_code=reentry_code,
            decision_summary=reentry_summary,
            evidence_summary=[pytest_evidence],
        )
    )

    reloaded = ConversationSession.load(
        project_root=temp_dir,
        system_message_factory=_dummy_system,
        few_shot_factory=_dummy_few_shots,
        session_id=session.session_id,
    )

    assert reloaded is not None

    # Permission policy metadata.
    assert reloaded.permission_mode == "allow"
    assert reloaded.permission_prompting_enabled is True
    assert reloaded.permission_rule_counts == updated_rule_counts
    assert reloaded.permission_rules_source == rules_source

    # Runtime owner and prompt shape.
    assert reloaded.runtime_owner_type == "RuntimeHandle"
    assert reloaded.runtime_owner_path == "runtime-handle"
    assert reloaded.prompt_format == "native"
    assert reloaded.prompt_sections == updated_sections

    # Workflow routing decision.
    assert reloaded.workflow_reason_code == route_reason_code
    assert reloaded.workflow_reason_summary == route_reason_summary
    assert reloaded.workflow_decision_kind == "initial_route"
    assert reloaded.workflow_ambiguity_score == pytest.approx(0.2)
    assert reloaded.workflow_complexity_score == pytest.approx(0.6)
    assert reloaded.workflow_scheduled_next_mode == "execute"

    # Completion decision and its trace.
    assert reloaded.last_completion_decision_code == reentry_code
    assert reloaded.last_completion_decision_summary == reentry_summary
    assert [entry.decision_code for entry in reloaded.completion_trace] == [reentry_code]
    assert reloaded.completion_trace[0].evidence_summary == [pytest_evidence]

    # Last turn transition.
    assert reloaded.last_turn_transition_summary == transition_summary
    assert reloaded.last_turn_transition_kind == "terminal"
    assert reloaded.last_turn_transition_reason_code == "turn_complete"

    # Workflow timeline.
    assert len(reloaded.workflow_timeline) == 1
    route_entry = reloaded.workflow_timeline[0]
    assert route_entry.mode == "plan"
    assert route_entry.route_score == pytest.approx(0.72)
    assert route_entry.unresolved_questions == [open_question]
326
327
def test_resume_session_updates_runtime_owner_metadata(temp_dir: Path) -> None:
    """After a RuntimeHandle resumes a session, the stored owner metadata names the handle."""
    # An Agent creates and persists the session first.
    seed_agent = Agent(
        backend=ScriptedBackend(),
        config=AgentConfig(auto_context=False, stream=False),
        project_root=temp_dir,
    )
    seed_agent.session.persist()
    session_id = seed_agent.session.session_id

    # A RuntimeHandle then resumes the same session by id.
    handle = RuntimeHandle(
        backend=ScriptedBackend(),
        config=AgentConfig(auto_context=False, stream=False),
        project_root=temp_dir,
    )
    resumed = handle.resume_session(session_id)
    assert resumed is True

    # Load independently of the handle so the assertions see what load() returns.
    reloaded = ConversationSession.load(
        project_root=temp_dir,
        system_message_factory=_dummy_system,
        few_shot_factory=_dummy_few_shots,
        session_id=session_id,
    )

    assert reloaded is not None
    assert reloaded.runtime_owner_type == "RuntimeHandle"
    assert reloaded.runtime_owner_path == "runtime-handle"
355
356
def test_session_prefers_canonical_workflow_timeline_for_completion_trace(
    temp_dir: Path,
) -> None:
    """The reloaded completion trace mirrors the timeline entries, not the stale trace entry."""
    budget_code = "continuation_budget_exhausted"
    budget_summary = (
        "stopped because the continuation budget was exhausted while "
        "follow-through evidence was still missing"
    )
    follow_through_evidence = "showing the requested work was actually carried out"
    # (timestamp, kind, reason_code, summary, policy_outcome) per timeline entry.
    timeline_script = [
        (
            "2026-04-09T12:00:00Z",
            "completion_check",
            "premature_completion_nudge",
            (
                "completion: requested one continuation because the non-mutating "
                "response looked incomplete"
            ),
            "continue",
        ),
        (
            "2026-04-09T12:01:00Z",
            "completion_finalize",
            budget_code,
            f"completion: {budget_summary}",
            "finalize",
        ),
    ]

    session = ConversationSession(
        system_message_factory=_dummy_system,
        few_shot_factory=_dummy_few_shots,
        project_root=temp_dir,
    )

    session.update_runtime_state(
        current_task="Explain why the turn stopped",
        last_completion_decision_code=budget_code,
        last_completion_decision_summary=budget_summary,
    )
    # This trace entry is expected to be absent from both the file and the reload.
    session.append_completion_trace_entry(
        CompletionTraceEntry(
            stage="definition_of_done",
            outcome="complete",
            decision_code="stale_completion_trace",
            decision_summary="this legacy trace entry should be ignored",
        )
    )
    for timestamp, kind, reason_code, summary, policy_outcome in timeline_script:
        session.append_workflow_timeline_entry(
            WorkflowTimelineEntry(
                timestamp=timestamp,
                kind=kind,
                mode="execute",
                reason_code=reason_code,
                summary=summary,
                decision_kind="forced",
                policy_stage="continuation_check",
                policy_outcome=policy_outcome,
                evidence_summary=[follow_through_evidence],
            )
        )

    # The persisted JSON must not contain a top-level completion trace key.
    persisted = json.loads(session.storage_path.read_text())
    assert "completion_trace" not in persisted

    reloaded = ConversationSession.load(
        project_root=temp_dir,
        system_message_factory=_dummy_system,
        few_shot_factory=_dummy_few_shots,
        session_id=session.session_id,
    )

    assert reloaded is not None
    reloaded_codes = [entry.decision_code for entry in reloaded.completion_trace]
    assert reloaded_codes == ["premature_completion_nudge", budget_code]

    final_entry = reloaded.completion_trace[-1]
    assert final_entry.stage == "continuation_check"
    assert final_entry.outcome == "finalize"
    assert final_entry.evidence_summary == [follow_through_evidence]
435
436
def test_session_projects_live_completion_trace_from_workflow_timeline(
    temp_dir: Path,
) -> None:
    """Without a reload, completion_trace already reflects the appended timeline entries."""
    budget_code = "continuation_budget_exhausted"
    missing_evidence = "a passing verification result from `pytest -q`"
    provenance_summary = "verification evidence was still missing for `pytest -q`"

    session = ConversationSession(
        system_message_factory=_dummy_system,
        few_shot_factory=_dummy_few_shots,
        project_root=temp_dir,
    )

    accepted_entry = WorkflowTimelineEntry(
        timestamp="2026-04-09T12:00:00Z",
        kind="completion_check",
        mode="execute",
        reason_code="completion_response_accepted",
        summary="completion: accepted the response because follow-through evidence was present",
        decision_kind="forced",
        policy_stage="continuation_check",
        policy_outcome="accept",
    )
    session.append_workflow_timeline_entry(accepted_entry)

    # Only this second entry carries evidence details.
    finalize_entry = WorkflowTimelineEntry(
        timestamp="2026-04-09T12:01:00Z",
        kind="completion_finalize",
        mode="execute",
        reason_code=budget_code,
        summary="completion: stopped because verification evidence was still missing",
        decision_kind="forced",
        policy_stage="continuation_check",
        policy_outcome="finalize",
        evidence_summary=[missing_evidence],
        evidence_provenance=[
            EvidenceProvenance(
                category="verification",
                source="dod.verification_commands",
                summary=provenance_summary,
                status="missing",
                subject="pytest -q",
            )
        ],
    )
    session.append_workflow_timeline_entry(finalize_entry)

    session.update_runtime_state(
        last_completion_decision_code=budget_code,
        last_completion_decision_summary=(
            "stopped because verification evidence was still missing"
        ),
    )

    live_codes = [entry.decision_code for entry in session.completion_trace]
    assert live_codes == ["completion_response_accepted", budget_code]

    final_entry = session.completion_trace[-1]
    assert final_entry.stage == "continuation_check"
    assert final_entry.outcome == "finalize"
    assert final_entry.evidence_summary == [missing_evidence]
    assert [item.summary for item in final_entry.evidence_provenance] == [provenance_summary]
499
500
def test_session_persists_workflow_ledger_state(temp_dir: Path) -> None:
    """A workflow ledger stored on the session is returned intact by a later load()."""
    boundary_text = "Escalate before broad UX changes."
    contradicted_assumption = WorkflowLedgerItem(
        text="notes.txt stays out of scope unless clarified otherwise.",
        status="contradicted",
        introduced_phase="clarify",
        updated_phase="recovery",
        evidence=["Clarify scope assumed `notes.txt` stayed out of scope."],
    )
    changed_anchor = WorkflowLedgerItem(
        text="notes.txt exists in the workspace root.",
        status="changed",
        introduced_phase="clarify",
        updated_phase="recovery",
    )
    tracked_boundary = WorkflowLedgerItem(
        text=boundary_text,
        status="tracked",
        introduced_phase="clarify",
    )

    session = ConversationSession(
        system_message_factory=_dummy_system,
        few_shot_factory=_dummy_few_shots,
        project_root=temp_dir,
    )
    session.update_workflow_ledger(
        WorkflowLedger(
            assumptions=[contradicted_assumption],
            acceptance_anchors=[changed_anchor],
            decision_boundaries=[tracked_boundary],
        )
    )

    reloaded = ConversationSession.load(
        project_root=temp_dir,
        system_message_factory=_dummy_system,
        few_shot_factory=_dummy_few_shots,
        session_id=session.session_id,
    )

    assert reloaded is not None
    ledger = reloaded.workflow_ledger
    # One field is spot-checked per ledger section (two for the assumption).
    assert ledger.assumptions[0].status == "contradicted"
    assert ledger.assumptions[0].updated_phase == "recovery"
    assert ledger.acceptance_anchors[0].status == "changed"
    assert ledger.decision_boundaries[0].text == boundary_text
551
552
def test_session_persists_prompt_history_state(temp_dir: Path) -> None:
    """Two appended prompt snapshots come back from load() in their original order."""
    task = "Tighten Loader workflow behavior"
    plan_snapshot = PromptSnapshot(
        timestamp="2026-04-07T14:00:00Z",
        workflow_mode="plan",
        permission_mode="prompt",
        current_task=task,
        prompt_format="native",
        prompt_sections=["Runtime Config", "Workflow Context", "Mode Guidance"],
        content="# Introduction\nplan around planned.txt\n",
    )
    execute_snapshot = PromptSnapshot(
        timestamp="2026-04-07T14:02:00Z",
        workflow_mode="execute",
        permission_mode="prompt",
        current_task=task,
        prompt_format="native",
        prompt_sections=[
            "Runtime Config",
            "Workflow Context",
            "Mode Guidance",
            "Project Context",
        ],
        content="# Introduction\nexecute around notes.txt\n# Project Context\npython\n",
    )

    session = ConversationSession(
        system_message_factory=_dummy_system,
        few_shot_factory=_dummy_few_shots,
        project_root=temp_dir,
    )
    # Append order matters: the assertions check first == plan, last == execute.
    for snapshot in (plan_snapshot, execute_snapshot):
        session.append_prompt_snapshot(snapshot)

    reloaded = ConversationSession.load(
        project_root=temp_dir,
        system_message_factory=_dummy_system,
        few_shot_factory=_dummy_few_shots,
        session_id=session.session_id,
    )

    assert reloaded is not None
    history = reloaded.prompt_history
    assert len(history) == 2
    assert history[0].workflow_mode == "plan"
    assert history[-1].workflow_mode == "execute"
    assert "notes.txt" in history[-1].content
600
601
@pytest.mark.asyncio
async def test_turn_summary_usage_rolls_up_into_session_totals(temp_dir: Path) -> None:
    """A single turn's token usage shows up in both per-turn and cumulative usage."""
    prompt_tokens, completion_tokens = 9, 3
    only_reply = CompletionResponse(
        content="Here's the answer.",
        usage={"prompt_tokens": prompt_tokens, "completion_tokens": completion_tokens},
    )
    config = AgentConfig(
        auto_context=False,
        stream=False,
        reasoning=ReasoningConfig(completion_check=False),
    )
    agent = Agent(
        backend=ScriptedBackend(completions=[only_reply]),
        config=config,
        project_root=temp_dir,
    )

    await agent.run("Write a short release-note style summary of what Loader does well.")

    summary = agent.last_turn_summary
    assert summary is not None

    # Backend usage is keyed prompt_/completion_tokens; the summary reports input_/output_tokens.
    assert summary.usage["input_tokens"] == prompt_tokens
    assert summary.usage["output_tokens"] == completion_tokens

    # With exactly one turn, the cumulative totals equal that turn's usage.
    assert summary.cumulative_usage["input_tokens"] == prompt_tokens
    assert summary.cumulative_usage["output_tokens"] == completion_tokens
    assert summary.cumulative_usage["turns"] == 1