tenseleyflow/parrot / 08b88f7

optimize llm significantly

Authored by mfwolffe <wolffemf@dukes.jmu.edu>
SHA: 08b88f7be56c9ea6269062159442d62de7c44536
Parents: c3e075e
Tree: c7bb2a8

3 changed files

Status  File                        Added  Removed
M       cmd/mock.go                     3        2
M       internal/config/config.go      1        1
M       internal/llm/ollama.go         29       9
cmd/mock.go (modified)
@@ -197,8 +197,9 @@ func generateSmartResponse(cmdType, command, exitCode string) (string, *config.C
 	// Build context-aware prompt with personality
 	prompt := prompts.BuildPrompt(cmdType, command, exitCode, cfg.General.Personality)
 	
-	// Use a reasonable timeout for LLM responses (10 seconds max)
-	maxTimeout := 10 * time.Second
+	// Use a reasonable timeout for LLM responses (6 seconds max)
+	// With optimized Ollama options, responses should be under 2 seconds when warm
+	maxTimeout := 6 * time.Second
 	ctx, cancel := context.WithTimeout(context.Background(), maxTimeout)
 	defer cancel()
 	
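A minimal, self-contained sketch of how the tightened 6-second deadline behaves. The generate stub below stands in for OllamaClient.Generate (whose context-taking signature appears later in this diff); everything else is illustrative, not code from the repository.

package main

import (
	"context"
	"fmt"
	"time"
)

// generate stands in for OllamaClient.Generate: it blocks until the "model"
// responds or the caller's deadline expires.
func generate(ctx context.Context, prompt string) (string, error) {
	select {
	case <-time.After(8 * time.Second): // pretend the model is cold and slow
		return "your shell history is a cry for help", nil
	case <-ctx.Done():
		return "", ctx.Err()
	}
}

func main() {
	// Mirrors cmd/mock.go after this commit: a hard 6-second budget for the LLM call.
	ctx, cancel := context.WithTimeout(context.Background(), 6*time.Second)
	defer cancel()

	if _, err := generate(ctx, "roast this failed command"); err != nil {
		fmt.Println("LLM timed out, fall back to a canned response:", err)
	}
}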
internal/config/config.go (modified)
@@ -60,7 +60,7 @@ func DefaultConfig() *Config {
 			Provider: "ollama",
 			Endpoint: "http://127.0.0.1:11434",
 			Model:    "llama3.2:3b",
-			Timeout:  15, // Adequate time for local LLM processing (increased for slower systems)
+			Timeout:  5,  // 5 seconds with optimized generation options should be plenty
 		},
 		General: GeneralConfig{
 			Personality:  "savage",
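If the Timeout field is interpreted as seconds when the HTTP client is built (an assumption; that wiring is not shown in this diff), the new default caps each request to the Ollama endpoint at five seconds. A sketch of that reading, with the struct name and client setup invented for illustration:

package main

import (
	"fmt"
	"net/http"
	"time"
)

// LLMConfig mirrors the fields visible in DefaultConfig; the struct name and
// the client wiring below are assumptions, not code from this repository.
type LLMConfig struct {
	Provider string
	Endpoint string
	Model    string
	Timeout  int // seconds
}

func main() {
	cfg := LLMConfig{
		Provider: "ollama",
		Endpoint: "http://127.0.0.1:11434",
		Model:    "llama3.2:3b",
		Timeout:  5,
	}
	client := &http.Client{Timeout: time.Duration(cfg.Timeout) * time.Second}
	fmt.Printf("requests to %s abort after %v\n", cfg.Endpoint, client.Timeout)
}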
internal/llm/ollama.go (modified)
@@ -17,9 +17,18 @@ type OllamaClient struct {
 }
 
 type GenerateRequest struct {
-	Model  string `json:"model"`
-	Prompt string `json:"prompt"`
-	Stream bool   `json:"stream"`
+	Model     string           `json:"model"`
+	Prompt    string           `json:"prompt"`
+	Stream    bool             `json:"stream"`
+	KeepAlive string           `json:"keep_alive,omitempty"`
+	Options   *GenerateOptions `json:"options,omitempty"`
+}
+
+// GenerateOptions controls Ollama generation behavior for speed optimization
+type GenerateOptions struct {
+	NumPredict  int     `json:"num_predict,omitempty"` // Max tokens to generate (60 is plenty for insults)
+	NumCtx      int     `json:"num_ctx,omitempty"`     // Context window size (512 is enough for small prompts)
+	Temperature float64 `json:"temperature,omitempty"` // Creativity (0.8 for variety)
 }
 
 type GenerateResponse struct {
@@ -51,9 +60,15 @@ func (c *OllamaClient) Generate(ctx context.Context, prompt string) (string, err
 	}
 
 	req := GenerateRequest{
-		Model:  c.Model,
-		Prompt: prompt,
-		Stream: false,
+		Model:     c.Model,
+		Prompt:    prompt,
+		Stream:    false,
+		KeepAlive: "10m", // Keep model loaded for 10 minutes to avoid cold starts
+		Options: &GenerateOptions{
+			NumPredict:  60,  // Limit output tokens (insults are short)
+			NumCtx:      512, // Small context window (prompts are ~500 chars)
+			Temperature: 0.8, // Good creativity for variety
+		},
 	}
 
 	reqBody, err := json.Marshal(req)
@@ -120,9 +135,14 @@ func (c *OllamaClient) WarmupModel() error {
 	}
 
 	req := GenerateRequest{
-		Model:  c.Model,
-		Prompt: "test", // Minimal prompt to load model
-		Stream: false,
+		Model:     c.Model,
+		Prompt:    "Say OK", // Minimal prompt to load model
+		Stream:    false,
+		KeepAlive: "10m",
+		Options: &GenerateOptions{
+			NumPredict: 5,   // Minimal output for warmup
+			NumCtx:     256, // Minimal context
+		},
 	}
 
 	reqBody, err := json.Marshal(req)
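A self-contained sketch of the JSON body the optimized Generate path now marshals. The two structs are copied from this diff; the sample prompt and values are illustrative. Ollama's generate endpoint accepts keep_alive plus num_predict, num_ctx, and temperature under options, which is what keeps warm responses short and fast.

package main

import (
	"encoding/json"
	"fmt"
)

// GenerateOptions and GenerateRequest mirror internal/llm/ollama.go after this commit.
type GenerateOptions struct {
	NumPredict  int     `json:"num_predict,omitempty"`
	NumCtx      int     `json:"num_ctx,omitempty"`
	Temperature float64 `json:"temperature,omitempty"`
}

type GenerateRequest struct {
	Model     string           `json:"model"`
	Prompt    string           `json:"prompt"`
	Stream    bool             `json:"stream"`
	KeepAlive string           `json:"keep_alive,omitempty"`
	Options   *GenerateOptions `json:"options,omitempty"`
}

func main() {
	req := GenerateRequest{
		Model:     "llama3.2:3b",
		Prompt:    "Roast me for typing 'gti status'", // example prompt, not from the repo
		Stream:    false,
		KeepAlive: "10m",
		Options:   &GenerateOptions{NumPredict: 60, NumCtx: 512, Temperature: 0.8},
	}
	body, _ := json.MarshalIndent(req, "", "  ")
	// This body is what gets POSTed to <endpoint>/api/generate.
	fmt.Println(string(body))
}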