tenseleyflow/parrot / 08b88f7

optimize llm significantly

Authored by mfwolffe <wolffemf@dukes.jmu.edu>
SHA: 08b88f7be56c9ea6269062159442d62de7c44536
Parents: c3e075e
Tree: c7bb2a8

3 changed files

Status  File                        Added  Removed
M       cmd/mock.go                     3        2
M       internal/config/config.go      1        1
M       internal/llm/ollama.go         29       9
cmd/mock.go (modified)
@@ -197,8 +197,9 @@ func generateSmartResponse(cmdType, command, exitCode string) (string, *config.C
 	// Build context-aware prompt with personality
 	prompt := prompts.BuildPrompt(cmdType, command, exitCode, cfg.General.Personality)
 	
-	// Use a reasonable timeout for LLM responses (10 seconds max)
-	maxTimeout := 10 * time.Second
+	// Use a reasonable timeout for LLM responses (6 seconds max)
+	// With optimized Ollama options, responses should be under 2 seconds when warm
+	maxTimeout := 6 * time.Second
 	ctx, cancel := context.WithTimeout(context.Background(), maxTimeout)
 	defer cancel()
 	
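A minimal, self-contained sketch of how the tightened 6-second deadline behaves. The generate stub below stands in for OllamaClient.Generate (whose context-taking signature appears later in this diff); everything else is illustrative, not code from the repository.

package main

import (
	"context"
	"fmt"
	"time"
)

// generate stands in for OllamaClient.Generate: it blocks until the "model"
// responds or the caller's deadline expires.
func generate(ctx context.Context, prompt string) (string, error) {
	select {
	case <-time.After(8 * time.Second): // pretend the model is cold and slow
		return "your shell history is a cry for help", nil
	case <-ctx.Done():
		return "", ctx.Err()
	}
}

func main() {
	// Mirrors cmd/mock.go after this commit: a hard 6-second budget for the LLM call.
	ctx, cancel := context.WithTimeout(context.Background(), 6*time.Second)
	defer cancel()

	if _, err := generate(ctx, "roast this failed command"); err != nil {
		fmt.Println("LLM timed out, fall back to a canned response:", err)
	}
}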
internal/config/config.go (modified)
@@ -60,7 +60,7 @@ func DefaultConfig() *Config {
 			Provider: "ollama",
 			Endpoint: "http://127.0.0.1:11434",
 			Model:    "llama3.2:3b",
-			Timeout:  15, // Adequate time for local LLM processing (increased for slower systems)
+			Timeout:  5,  // 5 seconds with optimized generation options should be plenty
 		},
 		General: GeneralConfig{
 			Personality:  "savage",
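If the Timeout field is interpreted as seconds when the HTTP client is built (an assumption; that wiring is not shown in this diff), the new default caps each request to the Ollama endpoint at five seconds. A sketch of that reading, with the struct name and client setup invented for illustration:

package main

import (
	"fmt"
	"net/http"
	"time"
)

// LLMConfig mirrors the fields visible in DefaultConfig; the struct name and
// the client wiring below are assumptions, not code from this repository.
type LLMConfig struct {
	Provider string
	Endpoint string
	Model    string
	Timeout  int // seconds
}

func main() {
	cfg := LLMConfig{
		Provider: "ollama",
		Endpoint: "http://127.0.0.1:11434",
		Model:    "llama3.2:3b",
		Timeout:  5,
	}
	client := &http.Client{Timeout: time.Duration(cfg.Timeout) * time.Second}
	fmt.Printf("requests to %s abort after %v\n", cfg.Endpoint, client.Timeout)
}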
internal/llm/ollama.go (modified)
@@ -17,9 +17,18 @@ type OllamaClient struct {
 }
 
 type GenerateRequest struct {
-	Model  string `json:"model"`
-	Prompt string `json:"prompt"`
-	Stream bool   `json:"stream"`
+	Model     string           `json:"model"`
+	Prompt    string           `json:"prompt"`
+	Stream    bool             `json:"stream"`
+	KeepAlive string           `json:"keep_alive,omitempty"`
+	Options   *GenerateOptions `json:"options,omitempty"`
+}
+
+// GenerateOptions controls Ollama generation behavior for speed optimization
+type GenerateOptions struct {
+	NumPredict  int     `json:"num_predict,omitempty"` // Max tokens to generate (60 is plenty for insults)
+	NumCtx      int     `json:"num_ctx,omitempty"`     // Context window size (512 is enough for small prompts)
+	Temperature float64 `json:"temperature,omitempty"` // Creativity (0.8 for variety)
 }
 
 type GenerateResponse struct {
@@ -51,9 +60,15 @@ func (c *OllamaClient) Generate(ctx context.Context, prompt string) (string, err
 	}
 
 	req := GenerateRequest{
-		Model:  c.Model,
-		Prompt: prompt,
-		Stream: false,
+		Model:     c.Model,
+		Prompt:    prompt,
+		Stream:    false,
+		KeepAlive: "10m", // Keep model loaded for 10 minutes to avoid cold starts
+		Options: &GenerateOptions{
+			NumPredict:  60,  // Limit output tokens (insults are short)
+			NumCtx:      512, // Small context window (prompts are ~500 chars)
+			Temperature: 0.8, // Good creativity for variety
+		},
 	}
 
 	reqBody, err := json.Marshal(req)
@@ -120,9 +135,14 @@ func (c *OllamaClient) WarmupModel() error {
 	}
 
 	req := GenerateRequest{
-		Model:  c.Model,
-		Prompt: "test", // Minimal prompt to load model
-		Stream: false,
+		Model:     c.Model,
+		Prompt:    "Say OK", // Minimal prompt to load model
+		Stream:    false,
+		KeepAlive: "10m",
+		Options: &GenerateOptions{
+			NumPredict: 5,   // Minimal output for warmup
+			NumCtx:     256, // Minimal context
+		},
 	}
 
 	reqBody, err := json.Marshal(req)
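A self-contained sketch of the JSON body the optimized Generate path now marshals. The two structs are copied from this diff; the sample prompt and values are illustrative. Ollama's generate endpoint accepts keep_alive plus num_predict, num_ctx, and temperature under options, which is what keeps warm responses short and fast.

package main

import (
	"encoding/json"
	"fmt"
)

// GenerateOptions and GenerateRequest mirror internal/llm/ollama.go after this commit.
type GenerateOptions struct {
	NumPredict  int     `json:"num_predict,omitempty"`
	NumCtx      int     `json:"num_ctx,omitempty"`
	Temperature float64 `json:"temperature,omitempty"`
}

type GenerateRequest struct {
	Model     string           `json:"model"`
	Prompt    string           `json:"prompt"`
	Stream    bool             `json:"stream"`
	KeepAlive string           `json:"keep_alive,omitempty"`
	Options   *GenerateOptions `json:"options,omitempty"`
}

func main() {
	req := GenerateRequest{
		Model:     "llama3.2:3b",
		Prompt:    "Roast me for typing 'gti status'", // example prompt, not from the repo
		Stream:    false,
		KeepAlive: "10m",
		Options:   &GenerateOptions{NumPredict: 60, NumCtx: 512, Temperature: 0.8},
	}
	body, _ := json.MarshalIndent(req, "", "  ")
	// This body is what gets POSTed to <endpoint>/api/generate.
	fmt.Println(string(body))
}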