Go · 17123 bytes Raw Blame History
1 package llm
2
3 import (
4 "fmt"
5 "math"
6 "time"
7 )
8
// BenchmarkSample represents a real command failure with expected outputs.
//
// Each sample pairs the raw failure (command line, exit code, stderr) with
// the context that is handed to the system under test, plus the reference
// data (gold insults and tags) that the relevance scoring compares against.
type BenchmarkSample struct {
	ID          string               // Sample identifier, e.g. "git-001"
	Command     string               // Full command line that failed
	ExitCode    int                  // Exit status of the failed command
	Stderr      string               // Captured standard-error text
	Context     SmartFallbackContext // Context passed to the system being evaluated
	Category    string               // "git", "npm", "docker", etc.
	Description string               // Short human-readable summary of the scenario
	GoldInsults []string             // Human-written example insults
	Tags        []string             // Expected tags for this scenario
}
21
// BenchmarkResults contains evaluation metrics aggregated over one run of a
// benchmark, together with the per-sample scores they were derived from.
type BenchmarkResults struct {
	SystemName     string        // Label of the evaluated system
	TotalSamples   int           // Number of samples in the benchmark
	AvgRelevance   float64       // Mean of the per-sample relevance scores
	AvgLatency     time.Duration // Mean time spent generating one insult
	AvgConfidence  float64       // Mean of the per-sample confidence values
	DiversityScore float64       // Unique generated insults divided by total scores
	FallbackRate   float64       // Fraction of samples classified as fallback output
	MemoryUsageKB  int           // NOTE(review): never assigned in the code visible in this file
	DetailedScores []SampleScore // One entry per evaluated sample
}
34
// SampleScore contains the evaluation of a single benchmark sample: the text
// that was generated for it and the metrics recorded for that generation.
type SampleScore struct {
	SampleID        string        // ID of the BenchmarkSample that was evaluated
	GeneratedInsult string        // Text returned by the system for this sample
	Relevance       float64       // 0-1: How relevant to the error
	Latency         time.Duration // Time taken by the generation call
	Confidence      float64       // NOTE(review): EvaluateSystem fills a fixed placeholder here
	NoveltyScore    float64       // NOTE(review): EvaluateSystem always stores 1.0 here
	Method          string        // "semantic", "tag", "markov", "ensemble"
}
45
// Benchmark is a named collection of samples used for systematic evaluation
// of an insult-generation system.
type Benchmark struct {
	Name    string            // Human-readable name of the benchmark
	Samples []BenchmarkSample // Scenarios that are run during evaluation
}
51
52 // NewBenchmark creates a comprehensive benchmark dataset
53 func NewBenchmark() *Benchmark {
54 return &Benchmark{
55 Name: "Parrot Insult Quality Benchmark v1.0",
56 Samples: createBenchmarkSamples(),
57 }
58 }
59
60 // createBenchmarkSamples creates a comprehensive test dataset
61 func createBenchmarkSamples() []BenchmarkSample {
62 samples := []BenchmarkSample{}
63
64 // Git failures
65 samples = append(samples, BenchmarkSample{
66 ID: "git-001",
67 Command: "git push origin main",
68 ExitCode: 1,
69 Stderr: "error: failed to push some refs\nTo github.com:user/repo.git\n ! [rejected] main -> main (fetch first)",
70 Context: SmartFallbackContext{
71 CommandType: "git",
72 Command: "git",
73 Subcommand: "push",
74 GitBranch: "main",
75 ErrorPattern: "permission_denied",
76 IsRepeatedFailure: false,
77 },
78 Category: "git",
79 Description: "Git push rejected on main branch",
80 GoldInsults: []string{
81 "Push rejected. Did you forget to pull first?",
82 "The remote has standards. Your code doesn't meet them.",
83 },
84 Tags: []string{"git", "push", "main_branch"},
85 })
86
87 samples = append(samples, BenchmarkSample{
88 ID: "git-002",
89 Command: "git merge feature/new-ui",
90 ExitCode: 1,
91 Stderr: "CONFLICT (content): Merge conflict in src/app.js\nAutomatic merge failed; fix conflicts and then commit the result.",
92 Context: SmartFallbackContext{
93 CommandType: "git",
94 Command: "git",
95 Subcommand: "merge",
96 GitBranch: "main",
97 ErrorPattern: "merge_conflict",
98 IsRepeatedFailure: false,
99 },
100 Category: "git",
101 Description: "Merge conflict",
102 GoldInsults: []string{
103 "Merge conflict. Maybe communicate with your team?",
104 "<<<<<<< HEAD is not a valid merge resolution strategy",
105 },
106 Tags: []string{"git", "merge", "merge_conflict"},
107 })
108
109 samples = append(samples, BenchmarkSample{
110 ID: "git-003",
111 Command: "git push --force origin main",
112 ExitCode: 1,
113 Stderr: "error: refusing to update checked out branch: refs/heads/main",
114 Context: SmartFallbackContext{
115 CommandType: "git",
116 Command: "git",
117 Subcommand: "push",
118 GitBranch: "main",
119 ErrorPattern: "permission_denied",
120 IsRepeatedFailure: true,
121 TimeOfDay: 2,
122 },
123 Category: "git",
124 Description: "Force push to main at 2 AM (repeated failure)",
125 GoldInsults: []string{
126 "Force pushing to main at 2 AM? Bold strategy.",
127 "--force won't force competence into you",
128 },
129 Tags: []string{"git", "push", "main_branch", "late_night", "repeated"},
130 })
131
132 // NPM failures
133 samples = append(samples, BenchmarkSample{
134 ID: "npm-001",
135 Command: "npm install",
136 ExitCode: 1,
137 Stderr: "npm ERR! code ENOENT\nnpm ERR! syscall open\nnpm ERR! path /home/user/project/package.json\nnpm ERR! errno -2",
138 Context: SmartFallbackContext{
139 CommandType: "nodejs",
140 Command: "npm",
141 Subcommand: "install",
142 ProjectType: "node",
143 ErrorPattern: "not_found",
144 },
145 Category: "npm",
146 Description: "Missing package.json",
147 GoldInsults: []string{
148 "package.json not found. Neither is your organizational skill.",
149 "Are you in the right directory? Rhetorical question.",
150 },
151 Tags: []string{"npm", "install", "not_found"},
152 })
153
154 samples = append(samples, BenchmarkSample{
155 ID: "npm-002",
156 Command: "npm install typescript --save-dev",
157 ExitCode: 1,
158 Stderr: "npm ERR! code ERESOLVE\nnpm ERR! ERESOLVE unable to resolve dependency tree\nnpm ERR! peer dep missing: react@^18.0.0",
159 Context: SmartFallbackContext{
160 CommandType: "nodejs",
161 Command: "npm",
162 Subcommand: "install",
163 ProjectType: "node",
164 ErrorPattern: "dependency",
165 },
166 Category: "npm",
167 Description: "Dependency resolution failure",
168 GoldInsults: []string{
169 "Dependency hell. You're everyone's least favorite dependency.",
170 "ERESOLVE: Can't resolve your incompetence either",
171 },
172 Tags: []string{"npm", "install", "dependency"},
173 })
174
175 samples = append(samples, BenchmarkSample{
176 ID: "npm-003",
177 Command: "npm test",
178 ExitCode: 1,
179 Stderr: "FAIL src/components/App.test.js\n ● App › renders correctly\n expect(received).toEqual(expected)\n Expected: true\n Received: false",
180 Context: SmartFallbackContext{
181 CommandType: "nodejs",
182 Command: "npm",
183 Subcommand: "test",
184 ProjectType: "node",
185 ErrorPattern: "test_failure",
186 IsCI: true,
187 CIProvider: "github",
188 },
189 Category: "npm",
190 Description: "Test failure in CI",
191 GoldInsults: []string{
192 "Tests failed. Shocking absolutely no one who read your code",
193 "Did you test this before committing? Oh wait, that's what CI is for",
194 },
195 Tags: []string{"npm", "test", "test_failure", "ci"},
196 })
197
198 // Docker failures
199 samples = append(samples, BenchmarkSample{
200 ID: "docker-001",
201 Command: "docker build -t myapp .",
202 ExitCode: 1,
203 Stderr: "Step 5/10 : RUN npm install\nERROR [5/10] RUN npm install\nfailed to solve with frontend dockerfile.v0",
204 Context: SmartFallbackContext{
205 CommandType: "docker",
206 Command: "docker",
207 Subcommand: "build",
208 HasDockerfile: true,
209 ErrorPattern: "build_failure",
210 },
211 Category: "docker",
212 Description: "Docker build failure",
213 GoldInsults: []string{
214 "Docker build failed. Can't containerize disaster.",
215 "FROM scratch. You are scratch.",
216 },
217 Tags: []string{"docker", "build", "build_failure"},
218 })
219
220 samples = append(samples, BenchmarkSample{
221 ID: "docker-002",
222 Command: "docker run -p 3000:3000 myapp",
223 ExitCode: 125,
224 Stderr: "docker: Error response from daemon: driver failed programming external connectivity on endpoint\nError starting userland proxy: listen tcp4 0.0.0.0:3000: bind: address already in use.",
225 Context: SmartFallbackContext{
226 CommandType: "docker",
227 Command: "docker",
228 Subcommand: "run",
229 ErrorPattern: "port_in_use",
230 NumericArgs: []int{3000},
231 },
232 Category: "docker",
233 Description: "Port already in use",
234 GoldInsults: []string{
235 "Port 3000 already in use. By someone competent, probably.",
236 "Port conflict. Your existence is a conflict.",
237 },
238 Tags: []string{"docker", "run", "network"},
239 })
240
241 // Python failures
242 samples = append(samples, BenchmarkSample{
243 ID: "python-001",
244 Command: "python app.py",
245 ExitCode: 1,
246 Stderr: "Traceback (most recent call last):\n File \"app.py\", line 5, in <module>\n import requests\nModuleNotFoundError: No module named 'requests'",
247 Context: SmartFallbackContext{
248 CommandType: "python",
249 Command: "python",
250 ProjectType: "python",
251 ErrorPattern: "dependency",
252 FileExtensions: []string{".py"},
253 },
254 Category: "python",
255 Description: "Missing Python module",
256 GoldInsults: []string{
257 "ModuleNotFoundError: Module 'brain' not found",
258 "Did you activate your venv? Don't answer, I know you didn't",
259 },
260 Tags: []string{"python", "dependency"},
261 })
262
263 samples = append(samples, BenchmarkSample{
264 ID: "python-002",
265 Command: "python script.py",
266 ExitCode: 1,
267 Stderr: " File \"script.py\", line 15\n if x == 5\nSyntaxError: invalid syntax",
268 Context: SmartFallbackContext{
269 CommandType: "python",
270 Command: "python",
271 ProjectType: "python",
272 ErrorPattern: "syntax_error",
273 FileExtensions: []string{".py"},
274 },
275 Category: "python",
276 Description: "Python syntax error",
277 GoldInsults: []string{
278 "SyntaxError: Invalid syntax, invalid developer",
279 "Python is trying to tell you something. Maybe listen for once?",
280 },
281 Tags: []string{"python", "syntax"},
282 })
283
284 // Rust failures
285 samples = append(samples, BenchmarkSample{
286 ID: "rust-001",
287 Command: "cargo build",
288 ExitCode: 101,
289 Stderr: "error[E0502]: cannot borrow `x` as mutable because it is also borrowed as immutable\n --> src/main.rs:10:5",
290 Context: SmartFallbackContext{
291 CommandType: "rust",
292 Command: "cargo",
293 Subcommand: "build",
294 ProjectType: "rust",
295 ErrorPattern: "borrow_checker",
296 },
297 Category: "rust",
298 Description: "Borrow checker error",
299 GoldInsults: []string{
300 "Borrow checker says no. And honestly, it has a point.",
301 "Fighting the borrow checker? The borrow checker always wins.",
302 },
303 Tags: []string{"rust", "build", "borrow_checker"},
304 })
305
306 // Permission errors
307 samples = append(samples, BenchmarkSample{
308 ID: "perm-001",
309 Command: "chmod 777 /etc/passwd",
310 ExitCode: 1,
311 Stderr: "chmod: changing permissions of '/etc/passwd': Operation not permitted",
312 Context: SmartFallbackContext{
313 Command: "chmod",
314 ErrorPattern: "permission_denied",
315 NumericArgs: []int{777},
316 },
317 Category: "permission",
318 Description: "Permission denied with chmod 777",
319 GoldInsults: []string{
320 "chmod 777 isn't the answer this time, though I admire your optimism",
321 "777: Jackpot of incompetence",
322 },
323 Tags: []string{"permission", "chmod"},
324 })
325
326 // Late night scenarios
327 samples = append(samples, BenchmarkSample{
328 ID: "time-001",
329 Command: "make build",
330 ExitCode: 2,
331 Stderr: "make: *** [Makefile:15: build] Error 2",
332 Context: SmartFallbackContext{
333 Command: "make",
334 ErrorPattern: "build_failure",
335 TimeOfDay: 3,
336 HasMakefile: true,
337 },
338 Category: "build",
339 Description: "Build failure at 3 AM",
340 GoldInsults: []string{
341 "It's 3 AM. The bugs aren't the only thing that needs fixing",
342 "Late night debugging? Tomorrow-you is going to hate today-you",
343 },
344 Tags: []string{"build", "late_night"},
345 })
346
347 return samples
348 }
349
350 // EvaluateSystem runs the benchmark against a system
351 func (b *Benchmark) EvaluateSystem(system *EnsembleSystem) BenchmarkResults {
352 results := BenchmarkResults{
353 SystemName: "Ensemble ML System",
354 TotalSamples: len(b.Samples),
355 DetailedScores: make([]SampleScore, 0, len(b.Samples)),
356 }
357
358 var totalRelevance float64
359 var totalLatency time.Duration
360 var totalConfidence float64
361 var fallbackCount int
362
363 for _, sample := range b.Samples {
364 start := time.Now()
365 insult := system.GenerateInsult(&sample.Context, "sarcastic")
366 latency := time.Since(start)
367
368 // Calculate relevance score
369 relevance := calculateRelevanceScore(sample, insult)
370
371 // Determine if it was a Markov fallback
372 isFallback := len(insult) > 0 && !containsInsult(system.database.Insults, insult)
373
374 if isFallback {
375 fallbackCount++
376 }
377
378 score := SampleScore{
379 SampleID: sample.ID,
380 GeneratedInsult: insult,
381 Relevance: relevance,
382 Latency: latency,
383 Confidence: 0.75, // Placeholder
384 NoveltyScore: 1.0,
385 Method: determineMethod(isFallback),
386 }
387
388 results.DetailedScores = append(results.DetailedScores, score)
389
390 totalRelevance += relevance
391 totalLatency += latency
392 totalConfidence += score.Confidence
393 }
394
395 results.AvgRelevance = totalRelevance / float64(len(b.Samples))
396 results.AvgLatency = totalLatency / time.Duration(len(b.Samples))
397 results.AvgConfidence = totalConfidence / float64(len(b.Samples))
398 results.FallbackRate = float64(fallbackCount) / float64(len(b.Samples))
399 results.DiversityScore = calculateDiversityScore(results.DetailedScores)
400
401 return results
402 }
403
404 // calculateRelevanceScore measures how relevant the insult is to the error
405 func calculateRelevanceScore(sample BenchmarkSample, insult string) float64 {
406 score := 0.0
407
408 // Check for keyword matches
409 keywords := extractKeywords(sample)
410 for _, keyword := range keywords {
411 if containsWord(insult, keyword) {
412 score += 0.2
413 }
414 }
415
416 // Check for tag matches
417 for _, tag := range sample.Tags {
418 if containsWord(insult, tag) {
419 score += 0.15
420 }
421 }
422
423 // Check similarity to gold insults
424 if len(sample.GoldInsults) > 0 {
425 maxSimilarity := 0.0
426 for _, gold := range sample.GoldInsults {
427 sim := simpleStringSimilarity(insult, gold)
428 if sim > maxSimilarity {
429 maxSimilarity = sim
430 }
431 }
432 score += maxSimilarity * 0.3
433 }
434
435 return math.Min(1.0, score)
436 }
437
438 // extractKeywords extracts key terms from sample
439 func extractKeywords(sample BenchmarkSample) []string {
440 keywords := []string{
441 sample.Context.Command,
442 sample.Context.Subcommand,
443 sample.Context.CommandType,
444 sample.Context.ErrorPattern,
445 }
446
447 if sample.Context.GitBranch != "" {
448 keywords = append(keywords, sample.Context.GitBranch)
449 }
450
451 if sample.Context.ProjectType != "" {
452 keywords = append(keywords, sample.Context.ProjectType)
453 }
454
455 return keywords
456 }
457
458 // containsWord checks if text contains word (case-insensitive)
459 func containsWord(text, word string) bool {
460 textLower := toLower(text)
461 wordLower := toLower(word)
462 return contains(textLower, wordLower)
463 }
464
465 // simpleStringSimilarity calculates basic string similarity
466 func simpleStringSimilarity(s1, s2 string) float64 {
467 // Simple word overlap metric
468 words1 := splitWords(toLower(s1))
469 words2 := splitWords(toLower(s2))
470
471 if len(words1) == 0 || len(words2) == 0 {
472 return 0.0
473 }
474
475 matches := 0
476 for _, w1 := range words1 {
477 for _, w2 := range words2 {
478 if w1 == w2 && len(w1) > 2 { // Skip short words
479 matches++
480 break
481 }
482 }
483 }
484
485 return float64(matches) / float64(max(len(words1), len(words2)))
486 }
487
488 // calculateDiversityScore measures insult variety
489 func calculateDiversityScore(scores []SampleScore) float64 {
490 if len(scores) < 2 {
491 return 1.0
492 }
493
494 // Count unique insults
495 unique := make(map[string]bool)
496 for _, score := range scores {
497 unique[score.GeneratedInsult] = true
498 }
499
500 return float64(len(unique)) / float64(len(scores))
501 }
502
503 // containsInsult checks if insult exists in database
504 func containsInsult(insults []TaggedInsult, target string) bool {
505 for _, insult := range insults {
506 if insult.Text == target {
507 return true
508 }
509 }
510 return false
511 }
512
// determineMethod maps the fallback flag to a method label: "markov" for
// fallback output, "ensemble" otherwise.
func determineMethod(isFallback bool) string {
	method := "ensemble"
	if isFallback {
		method = "markov"
	}
	return method
}
520
521 // PrintResults outputs benchmark results
522 func (r *BenchmarkResults) Print() {
523 fmt.Println("╔═══════════════════════════════════════════════════════════╗")
524 fmt.Printf("║ Benchmark Results: %-38s ║\n", r.SystemName)
525 fmt.Println("╠═══════════════════════════════════════════════════════════╣")
526 fmt.Printf("║ Total Samples: %-41d ║\n", r.TotalSamples)
527 fmt.Printf("║ Avg Relevance: %-41.3f ║\n", r.AvgRelevance)
528 fmt.Printf("║ Avg Latency: %-41s ║\n", r.AvgLatency)
529 fmt.Printf("║ Avg Confidence: %-41.3f ║\n", r.AvgConfidence)
530 fmt.Printf("║ Diversity Score: %-41.3f ║\n", r.DiversityScore)
531 fmt.Printf("║ Fallback Rate: %-40.1f%% ║\n", r.FallbackRate*100)
532 fmt.Println("╚═══════════════════════════════════════════════════════════╝")
533 }
534
535 // Helper functions
// toLower returns s with the ASCII letters 'A'-'Z' converted to lowercase.
// All other runes, including non-ASCII letters, are copied unchanged.
//
// The result is built in a pre-sized rune buffer rather than by repeated
// string concatenation, which copied the whole prefix on every rune.
func toLower(s string) string {
	// len(s) is the byte length, an upper bound on the rune count.
	out := make([]rune, 0, len(s))
	for _, r := range s {
		if r >= 'A' && r <= 'Z' {
			r += 'a' - 'A'
		}
		out = append(out, r)
	}
	return string(out)
}
547
548 func contains(s, substr string) bool {
549 return len(s) >= len(substr) && findSubstring(s, substr) >= 0
550 }
551
// findSubstring returns the byte index of the first occurrence of substr in
// s, or -1 when there is none. An empty substr is found at index 0.
func findSubstring(s, substr string) int {
	width := len(substr)
	for start, last := 0, len(s)-width; start <= last; start++ {
		if s[start:start+width] == substr {
			return start
		}
	}
	return -1
}
560
// splitWords breaks s into maximal runs of the characters a-z and 0-9. Every
// other rune, including uppercase letters, acts as a separator. It returns
// nil when s contains no such run.
func splitWords(s string) []string {
	var words []string

	// start is the byte offset where the current word began, or -1 when the
	// scan is between words. Word characters are all single-byte, so slicing
	// s by these offsets yields exactly the accumulated characters.
	start := -1
	for i, r := range s {
		isWordChar := (r >= 'a' && r <= 'z') || (r >= '0' && r <= '9')
		switch {
		case isWordChar && start < 0:
			start = i
		case !isWordChar && start >= 0:
			words = append(words, s[start:i])
			start = -1
		}
	}

	if start >= 0 {
		words = append(words, s[start:])
	}

	return words
}
582
// max returns the larger of a and b.
func max(a, b int) int {
	if b >= a {
		return b
	}
	return a
}
589