Raise num_ctx to 16K and lower compaction threshold to 12K to prevent context overflow in multi-turn sessions
- SHA: ac30b951de2b0d99220b3582db92b77063278371
- Parent: 984bfec
- Tree: d66c5ad
| Status | File | + | - |
|---|---|---|---|
| M | src/loader/agent/loop.py | 1 | 1 |
| M | src/loader/llm/ollama.py | 1 | 1 |
src/loader/agent/loop.py (modified):

```diff
@@ -89,7 +89,7 @@ class AgentConfig:
     workflow_mode_override: str | None = None
     stream: bool = True  # Stream LLM responses for real-time output
     session_rotate_after_bytes: int = 256 * 1024
-    session_auto_compaction_input_tokens_threshold: int = 100_000
+    session_auto_compaction_input_tokens_threshold: int = 12_000  # ~75% of default 16K context
     session_compaction_keep_last_messages: int = 4

     # Reasoning stages configuration
```
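How the lowered threshold plays out at runtime: once the estimated input tokens for a session cross `session_auto_compaction_input_tokens_threshold`, the history is compacted down to a summary plus the last `session_compaction_keep_last_messages` turns. A minimal sketch of that gate follows; `estimate_input_tokens` (a rough 4-chars-per-token heuristic) and the placeholder summarizer are assumptions for illustration, not the actual loop.py internals, which this diff does not show.

```python
from dataclasses import dataclass


@dataclass
class AgentConfig:
    # Only the two fields this commit touches; the real class has more.
    session_auto_compaction_input_tokens_threshold: int = 12_000
    session_compaction_keep_last_messages: int = 4


def estimate_input_tokens(messages: list[dict]) -> int:
    # Assumed heuristic: roughly 4 characters per token.
    return sum(len(m["content"]) for m in messages) // 4


def maybe_compact(messages: list[dict], config: AgentConfig) -> list[dict]:
    """Fold older turns into one summary message once the estimate crosses the threshold."""
    if estimate_input_tokens(messages) < config.session_auto_compaction_input_tokens_threshold:
        return messages  # under budget; leave the history alone
    keep = config.session_compaction_keep_last_messages
    # Placeholder summarizer; the real compaction presumably does something smarter.
    summary = " | ".join(m["content"][:80] for m in messages[:-keep])
    return [{"role": "system", "content": f"[compacted] {summary}"}, *messages[-keep:]]
```

With the threshold at 12_000, about 75% of the new 16,384-token window, compaction fires while roughly 4K tokens of headroom remain for the model's reply, instead of only after the context has already overflowed.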
src/loader/llm/ollama.py (modified):

```diff
@@ -26,7 +26,7 @@ class OllamaBackend(LLMBackend):
         base_url: str = "http://localhost:11434",
         timeout: float | None = None,
         force_react: bool = False,
-        num_ctx: int = 8192,  # Reasonable context, not too slow
+        num_ctx: int = 16384,  # 16K context; most models support 32K+
         num_gpu: int = -1,  # Use all GPU layers by default (fast)
     ):
         self.model = model
```
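On the Ollama side, `num_ctx` is a per-request runtime option, not a server flag: the backend forwards it in the `options` object of the chat payload. A minimal non-streaming sketch, assuming the `requests` library; the helper name `ollama_chat` is illustrative, since this diff only shows the constructor defaults and not the rest of `OllamaBackend`.

```python
import requests


def ollama_chat(model: str, messages: list[dict],
                base_url: str = "http://localhost:11434",
                num_ctx: int = 16384, num_gpu: int = -1) -> str:
    """POST /api/chat, forwarding num_ctx and num_gpu as Ollama runtime options."""
    resp = requests.post(
        f"{base_url}/api/chat",
        json={
            "model": model,
            "messages": messages,  # [{"role": "user", "content": "..."}, ...]
            "stream": False,
            # Defaults mirror the new constructor values in the diff above.
            "options": {"num_ctx": num_ctx, "num_gpu": num_gpu},
        },
        timeout=120,
    )
    resp.raise_for_status()
    return resp.json()["message"]["content"]
```

A larger `num_ctx` also means a larger KV cache, so memory use and latency grow with the window; pairing the 16K window with a 12K compaction threshold keeps multi-turn sessions inside the window rather than growing it indefinitely.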