parrot Public

Watch 0 Fork 0 Star 0

Go · 17123 bytes Raw Blame History

  
        1
        package llm
      
        2
        
        3
        import (
      
        4
        	"fmt"
      
        5
        	"math"
      
        6
        	"time"
      
        7
        )
      
        8
        
        9
        // BenchmarkSample represents a real command failure with expected outputs
      
        10
        type BenchmarkSample struct {
      
        11
        	ID          string
      
        12
        	Command     string
      
        13
        	ExitCode    int
      
        14
        	Stderr      string
      
        15
        	Context     SmartFallbackContext
      
        16
        	Category    string // "git", "npm", "docker", etc.
      
        17
        	Description string
      
        18
        	GoldInsults []string // Human-written example insults
      
        19
        	Tags        []string // Expected tags for this scenario
      
        20
        }
      
        21
        
        22
        // BenchmarkResults contains evaluation metrics
      
        23
        type BenchmarkResults struct {
      
        24
        	SystemName      string
      
        25
        	TotalSamples    int
      
        26
        	AvgRelevance    float64
      
        27
        	AvgLatency      time.Duration
      
        28
        	AvgConfidence   float64
      
        29
        	DiversityScore  float64
      
        30
        	FallbackRate    float64
      
        31
        	MemoryUsageKB   int
      
        32
        	DetailedScores  []SampleScore
      
        33
        }
      
        34
        
        35
        // SampleScore is the evaluation record for one benchmark sample.
        type SampleScore struct {
        	SampleID        string        // ID of the sample that was evaluated
        	GeneratedInsult string        // text produced by the system under test
        	Relevance       float64       // 0-1: how relevant the insult is to the error
        	Latency         time.Duration // time taken to generate the insult
        	Confidence      float64       // confidence attributed to the result
        	NoveltyScore    float64       // novelty attributed to the result
        	Method          string        // "semantic", "tag", "markov", "ensemble"
        }
      
        45
        
        46
        // Benchmark framework for systematic evaluation
      
        47
        type Benchmark struct {
      
        48
        	Name    string
      
        49
        	Samples []BenchmarkSample
      
        50
        }
      
        51
        
        52
        // NewBenchmark creates a comprehensive benchmark dataset
      
        53
        func NewBenchmark() *Benchmark {
      
        54
        	return &Benchmark{
      
        55
        		Name:    "Parrot Insult Quality Benchmark v1.0",
      
        56
        		Samples: createBenchmarkSamples(),
      
        57
        	}
      
        58
        }
      
        59
        
        60
        // createBenchmarkSamples creates a comprehensive test dataset
      
        61
        func createBenchmarkSamples() []BenchmarkSample {
      
        62
        	samples := []BenchmarkSample{}
      
        63
        
        64
        	// Git failures
      
        65
        	samples = append(samples, BenchmarkSample{
      
        66
        		ID:       "git-001",
      
        67
        		Command:  "git push origin main",
      
        68
        		ExitCode: 1,
      
        69
        		Stderr:   "error: failed to push some refs\nTo github.com:user/repo.git\n ! [rejected] main -> main (fetch first)",
      
        70
        		Context: SmartFallbackContext{
      
        71
        			CommandType:       "git",
      
        72
        			Command:           "git",
      
        73
        			Subcommand:        "push",
      
        74
        			GitBranch:         "main",
      
        75
        			ErrorPattern:      "permission_denied",
      
        76
        			IsRepeatedFailure: false,
      
        77
        		},
      
        78
        		Category:    "git",
      
        79
        		Description: "Git push rejected on main branch",
      
        80
        		GoldInsults: []string{
      
        81
        			"Push rejected. Did you forget to pull first?",
      
        82
        			"The remote has standards. Your code doesn't meet them.",
      
        83
        		},
      
        84
        		Tags: []string{"git", "push", "main_branch"},
      
        85
        	})
      
        86
        
        87
        	samples = append(samples, BenchmarkSample{
      
        88
        		ID:       "git-002",
      
        89
        		Command:  "git merge feature/new-ui",
      
        90
        		ExitCode: 1,
      
        91
        		Stderr:   "CONFLICT (content): Merge conflict in src/app.js\nAutomatic merge failed; fix conflicts and then commit the result.",
      
        92
        		Context: SmartFallbackContext{
      
        93
        			CommandType:       "git",
      
        94
        			Command:           "git",
      
        95
        			Subcommand:        "merge",
      
        96
        			GitBranch:         "main",
      
        97
        			ErrorPattern:      "merge_conflict",
      
        98
        			IsRepeatedFailure: false,
      
        99
        		},
      
        100
        		Category:    "git",
      
        101
        		Description: "Merge conflict",
      
        102
        		GoldInsults: []string{
      
        103
        			"Merge conflict. Maybe communicate with your team?",
      
        104
        			"<<<<<<< HEAD is not a valid merge resolution strategy",
      
        105
        		},
      
        106
        		Tags: []string{"git", "merge", "merge_conflict"},
      
        107
        	})
      
        108
        
        109
        	samples = append(samples, BenchmarkSample{
      
        110
        		ID:       "git-003",
      
        111
        		Command:  "git push --force origin main",
      
        112
        		ExitCode: 1,
      
        113
        		Stderr:   "error: refusing to update checked out branch: refs/heads/main",
      
        114
        		Context: SmartFallbackContext{
      
        115
        			CommandType:       "git",
      
        116
        			Command:           "git",
      
        117
        			Subcommand:        "push",
      
        118
        			GitBranch:         "main",
      
        119
        			ErrorPattern:      "permission_denied",
      
        120
        			IsRepeatedFailure: true,
      
        121
        			TimeOfDay:         2,
      
        122
        		},
      
        123
        		Category:    "git",
      
        124
        		Description: "Force push to main at 2 AM (repeated failure)",
      
        125
        		GoldInsults: []string{
      
        126
        			"Force pushing to main at 2 AM? Bold strategy.",
      
        127
        			"--force won't force competence into you",
      
        128
        		},
      
        129
        		Tags: []string{"git", "push", "main_branch", "late_night", "repeated"},
      
        130
        	})
      
        131
        
        132
        	// NPM failures
      
        133
        	samples = append(samples, BenchmarkSample{
      
        134
        		ID:       "npm-001",
      
        135
        		Command:  "npm install",
      
        136
        		ExitCode: 1,
      
        137
        		Stderr:   "npm ERR! code ENOENT\nnpm ERR! syscall open\nnpm ERR! path /home/user/project/package.json\nnpm ERR! errno -2",
      
        138
        		Context: SmartFallbackContext{
      
        139
        			CommandType:  "nodejs",
      
        140
        			Command:      "npm",
      
        141
        			Subcommand:   "install",
      
        142
        			ProjectType:  "node",
      
        143
        			ErrorPattern: "not_found",
      
        144
        		},
      
        145
        		Category:    "npm",
      
        146
        		Description: "Missing package.json",
      
        147
        		GoldInsults: []string{
      
        148
        			"package.json not found. Neither is your organizational skill.",
      
        149
        			"Are you in the right directory? Rhetorical question.",
      
        150
        		},
      
        151
        		Tags: []string{"npm", "install", "not_found"},
      
        152
        	})
      
        153
        
        154
        	samples = append(samples, BenchmarkSample{
      
        155
        		ID:       "npm-002",
      
        156
        		Command:  "npm install typescript --save-dev",
      
        157
        		ExitCode: 1,
      
        158
        		Stderr:   "npm ERR! code ERESOLVE\nnpm ERR! ERESOLVE unable to resolve dependency tree\nnpm ERR! peer dep missing: react@^18.0.0",
      
        159
        		Context: SmartFallbackContext{
      
        160
        			CommandType:  "nodejs",
      
        161
        			Command:      "npm",
      
        162
        			Subcommand:   "install",
      
        163
        			ProjectType:  "node",
      
        164
        			ErrorPattern: "dependency",
      
        165
        		},
      
        166
        		Category:    "npm",
      
        167
        		Description: "Dependency resolution failure",
      
        168
        		GoldInsults: []string{
      
        169
        			"Dependency hell. You're everyone's least favorite dependency.",
      
        170
        			"ERESOLVE: Can't resolve your incompetence either",
      
        171
        		},
      
        172
        		Tags: []string{"npm", "install", "dependency"},
      
        173
        	})
      
        174
        
        175
        	samples = append(samples, BenchmarkSample{
      
        176
        		ID:       "npm-003",
      
        177
        		Command:  "npm test",
      
        178
        		ExitCode: 1,
      
        179
        		Stderr:   "FAIL src/components/App.test.js\n  ● App › renders correctly\n    expect(received).toEqual(expected)\n    Expected: true\n    Received: false",
      
        180
        		Context: SmartFallbackContext{
      
        181
        			CommandType:  "nodejs",
      
        182
        			Command:      "npm",
      
        183
        			Subcommand:   "test",
      
        184
        			ProjectType:  "node",
      
        185
        			ErrorPattern: "test_failure",
      
        186
        			IsCI:         true,
      
        187
        			CIProvider:   "github",
      
        188
        		},
      
        189
        		Category:    "npm",
      
        190
        		Description: "Test failure in CI",
      
        191
        		GoldInsults: []string{
      
        192
        			"Tests failed. Shocking absolutely no one who read your code",
      
        193
        			"Did you test this before committing? Oh wait, that's what CI is for",
      
        194
        		},
      
        195
        		Tags: []string{"npm", "test", "test_failure", "ci"},
      
        196
        	})
      
        197
        
        198
        	// Docker failures
      
        199
        	samples = append(samples, BenchmarkSample{
      
        200
        		ID:       "docker-001",
      
        201
        		Command:  "docker build -t myapp .",
      
        202
        		ExitCode: 1,
      
        203
        		Stderr:   "Step 5/10 : RUN npm install\nERROR [5/10] RUN npm install\nfailed to solve with frontend dockerfile.v0",
      
        204
        		Context: SmartFallbackContext{
      
        205
        			CommandType:    "docker",
      
        206
        			Command:        "docker",
      
        207
        			Subcommand:     "build",
      
        208
        			HasDockerfile:  true,
      
        209
        			ErrorPattern:   "build_failure",
      
        210
        		},
      
        211
        		Category:    "docker",
      
        212
        		Description: "Docker build failure",
      
        213
        		GoldInsults: []string{
      
        214
        			"Docker build failed. Can't containerize disaster.",
      
        215
        			"FROM scratch. You are scratch.",
      
        216
        		},
      
        217
        		Tags: []string{"docker", "build", "build_failure"},
      
        218
        	})
      
        219
        
        220
        	samples = append(samples, BenchmarkSample{
      
        221
        		ID:       "docker-002",
      
        222
        		Command:  "docker run -p 3000:3000 myapp",
      
        223
        		ExitCode: 125,
      
        224
        		Stderr:   "docker: Error response from daemon: driver failed programming external connectivity on endpoint\nError starting userland proxy: listen tcp4 0.0.0.0:3000: bind: address already in use.",
      
        225
        		Context: SmartFallbackContext{
      
        226
        			CommandType:  "docker",
      
        227
        			Command:      "docker",
      
        228
        			Subcommand:   "run",
      
        229
        			ErrorPattern: "port_in_use",
      
        230
        			NumericArgs:  []int{3000},
      
        231
        		},
      
        232
        		Category:    "docker",
      
        233
        		Description: "Port already in use",
      
        234
        		GoldInsults: []string{
      
        235
        			"Port 3000 already in use. By someone competent, probably.",
      
        236
        			"Port conflict. Your existence is a conflict.",
      
        237
        		},
      
        238
        		Tags: []string{"docker", "run", "network"},
      
        239
        	})
      
        240
        
        241
        	// Python failures
      
        242
        	samples = append(samples, BenchmarkSample{
      
        243
        		ID:       "python-001",
      
        244
        		Command:  "python app.py",
      
        245
        		ExitCode: 1,
      
        246
        		Stderr:   "Traceback (most recent call last):\n  File \"app.py\", line 5, in <module>\n    import requests\nModuleNotFoundError: No module named 'requests'",
      
        247
        		Context: SmartFallbackContext{
      
        248
        			CommandType:  "python",
      
        249
        			Command:      "python",
      
        250
        			ProjectType:  "python",
      
        251
        			ErrorPattern: "dependency",
      
        252
        			FileExtensions: []string{".py"},
      
        253
        		},
      
        254
        		Category:    "python",
      
        255
        		Description: "Missing Python module",
      
        256
        		GoldInsults: []string{
      
        257
        			"ModuleNotFoundError: Module 'brain' not found",
      
        258
        			"Did you activate your venv? Don't answer, I know you didn't",
      
        259
        		},
      
        260
        		Tags: []string{"python", "dependency"},
      
        261
        	})
      
        262
        
        263
        	samples = append(samples, BenchmarkSample{
      
        264
        		ID:       "python-002",
      
        265
        		Command:  "python script.py",
      
        266
        		ExitCode: 1,
      
        267
        		Stderr:   "  File \"script.py\", line 15\n    if x == 5\nSyntaxError: invalid syntax",
      
        268
        		Context: SmartFallbackContext{
      
        269
        			CommandType:    "python",
      
        270
        			Command:        "python",
      
        271
        			ProjectType:    "python",
      
        272
        			ErrorPattern:   "syntax_error",
      
        273
        			FileExtensions: []string{".py"},
      
        274
        		},
      
        275
        		Category:    "python",
      
        276
        		Description: "Python syntax error",
      
        277
        		GoldInsults: []string{
      
        278
        			"SyntaxError: Invalid syntax, invalid developer",
      
        279
        			"Python is trying to tell you something. Maybe listen for once?",
      
        280
        		},
      
        281
        		Tags: []string{"python", "syntax"},
      
        282
        	})
      
        283
        
        284
        	// Rust failures
      
        285
        	samples = append(samples, BenchmarkSample{
      
        286
        		ID:       "rust-001",
      
        287
        		Command:  "cargo build",
      
        288
        		ExitCode: 101,
      
        289
        		Stderr:   "error[E0502]: cannot borrow `x` as mutable because it is also borrowed as immutable\n  --> src/main.rs:10:5",
      
        290
        		Context: SmartFallbackContext{
      
        291
        			CommandType:  "rust",
      
        292
        			Command:      "cargo",
      
        293
        			Subcommand:   "build",
      
        294
        			ProjectType:  "rust",
      
        295
        			ErrorPattern: "borrow_checker",
      
        296
        		},
      
        297
        		Category:    "rust",
      
        298
        		Description: "Borrow checker error",
      
        299
        		GoldInsults: []string{
      
        300
        			"Borrow checker says no. And honestly, it has a point.",
      
        301
        			"Fighting the borrow checker? The borrow checker always wins.",
      
        302
        		},
      
        303
        		Tags: []string{"rust", "build", "borrow_checker"},
      
        304
        	})
      
        305
        
        306
        	// Permission errors
      
        307
        	samples = append(samples, BenchmarkSample{
      
        308
        		ID:       "perm-001",
      
        309
        		Command:  "chmod 777 /etc/passwd",
      
        310
        		ExitCode: 1,
      
        311
        		Stderr:   "chmod: changing permissions of '/etc/passwd': Operation not permitted",
      
        312
        		Context: SmartFallbackContext{
      
        313
        			Command:      "chmod",
      
        314
        			ErrorPattern: "permission_denied",
      
        315
        			NumericArgs:  []int{777},
      
        316
        		},
      
        317
        		Category:    "permission",
      
        318
        		Description: "Permission denied with chmod 777",
      
        319
        		GoldInsults: []string{
      
        320
        			"chmod 777 isn't the answer this time, though I admire your optimism",
      
        321
        			"777: Jackpot of incompetence",
      
        322
        		},
      
        323
        		Tags: []string{"permission", "chmod"},
      
        324
        	})
      
        325
        
        326
        	// Late night scenarios
      
        327
        	samples = append(samples, BenchmarkSample{
      
        328
        		ID:       "time-001",
      
        329
        		Command:  "make build",
      
        330
        		ExitCode: 2,
      
        331
        		Stderr:   "make: *** [Makefile:15: build] Error 2",
      
        332
        		Context: SmartFallbackContext{
      
        333
        			Command:      "make",
      
        334
        			ErrorPattern: "build_failure",
      
        335
        			TimeOfDay:    3,
      
        336
        			HasMakefile:  true,
      
        337
        		},
      
        338
        		Category:    "build",
      
        339
        		Description: "Build failure at 3 AM",
      
        340
        		GoldInsults: []string{
      
        341
        			"It's 3 AM. The bugs aren't the only thing that needs fixing",
      
        342
        			"Late night debugging? Tomorrow-you is going to hate today-you",
      
        343
        		},
      
        344
        		Tags: []string{"build", "late_night"},
      
        345
        	})
      
        346
        
        347
        	return samples
      
        348
        }
      
        349
        
        350
        // EvaluateSystem runs the benchmark against a system
      
        351
        func (b *Benchmark) EvaluateSystem(system *EnsembleSystem) BenchmarkResults {
      
        352
        	results := BenchmarkResults{
      
        353
        		SystemName:     "Ensemble ML System",
      
        354
        		TotalSamples:   len(b.Samples),
      
        355
        		DetailedScores: make([]SampleScore, 0, len(b.Samples)),
      
        356
        	}
      
        357
        
        358
        	var totalRelevance float64
      
        359
        	var totalLatency time.Duration
      
        360
        	var totalConfidence float64
      
        361
        	var fallbackCount int
      
        362
        
        363
        	for _, sample := range b.Samples {
      
        364
        		start := time.Now()
      
        365
        		insult := system.GenerateInsult(&sample.Context, "sarcastic")
      
        366
        		latency := time.Since(start)
      
        367
        
        368
        		// Calculate relevance score
      
        369
        		relevance := calculateRelevanceScore(sample, insult)
      
        370
        
        371
        		// Determine if it was a Markov fallback
      
        372
        		isFallback := len(insult) > 0 && !containsInsult(system.database.Insults, insult)
      
        373
        
        374
        		if isFallback {
      
        375
        			fallbackCount++
      
        376
        		}
      
        377
        
        378
        		score := SampleScore{
      
        379
        			SampleID:        sample.ID,
      
        380
        			GeneratedInsult: insult,
      
        381
        			Relevance:       relevance,
      
        382
        			Latency:         latency,
      
        383
        			Confidence:      0.75, // Placeholder
      
        384
        			NoveltyScore:    1.0,
      
        385
        			Method:          determineMethod(isFallback),
      
        386
        		}
      
        387
        
        388
        		results.DetailedScores = append(results.DetailedScores, score)
      
        389
        
        390
        		totalRelevance += relevance
      
        391
        		totalLatency += latency
      
        392
        		totalConfidence += score.Confidence
      
        393
        	}
      
        394
        
        395
        	results.AvgRelevance = totalRelevance / float64(len(b.Samples))
      
        396
        	results.AvgLatency = totalLatency / time.Duration(len(b.Samples))
      
        397
        	results.AvgConfidence = totalConfidence / float64(len(b.Samples))
      
        398
        	results.FallbackRate = float64(fallbackCount) / float64(len(b.Samples))
      
        399
        	results.DiversityScore = calculateDiversityScore(results.DetailedScores)
      
        400
        
        401
        	return results
      
        402
        }
      
        403
        
        404
        // calculateRelevanceScore measures how relevant the insult is to the error
      
        405
        func calculateRelevanceScore(sample BenchmarkSample, insult string) float64 {
      
        406
        	score := 0.0
      
        407
        
        408
        	// Check for keyword matches
      
        409
        	keywords := extractKeywords(sample)
      
        410
        	for _, keyword := range keywords {
      
        411
        		if containsWord(insult, keyword) {
      
        412
        			score += 0.2
      
        413
        		}
      
        414
        	}
      
        415
        
        416
        	// Check for tag matches
      
        417
        	for _, tag := range sample.Tags {
      
        418
        		if containsWord(insult, tag) {
      
        419
        			score += 0.15
      
        420
        		}
      
        421
        	}
      
        422
        
        423
        	// Check similarity to gold insults
      
        424
        	if len(sample.GoldInsults) > 0 {
      
        425
        		maxSimilarity := 0.0
      
        426
        		for _, gold := range sample.GoldInsults {
      
        427
        			sim := simpleStringSimilarity(insult, gold)
      
        428
        			if sim > maxSimilarity {
      
        429
        				maxSimilarity = sim
      
        430
        			}
      
        431
        		}
      
        432
        		score += maxSimilarity * 0.3
      
        433
        	}
      
        434
        
        435
        	return math.Min(1.0, score)
      
        436
        }
      
        437
        
        438
        // extractKeywords extracts key terms from sample
      
        439
        func extractKeywords(sample BenchmarkSample) []string {
      
        440
        	keywords := []string{
      
        441
        		sample.Context.Command,
      
        442
        		sample.Context.Subcommand,
      
        443
        		sample.Context.CommandType,
      
        444
        		sample.Context.ErrorPattern,
      
        445
        	}
      
        446
        
        447
        	if sample.Context.GitBranch != "" {
      
        448
        		keywords = append(keywords, sample.Context.GitBranch)
      
        449
        	}
      
        450
        
        451
        	if sample.Context.ProjectType != "" {
      
        452
        		keywords = append(keywords, sample.Context.ProjectType)
      
        453
        	}
      
        454
        
        455
        	return keywords
      
        456
        }
      
        457
        
        458
        // containsWord checks if text contains word (case-insensitive)
      
        459
        func containsWord(text, word string) bool {
      
        460
        	textLower := toLower(text)
      
        461
        	wordLower := toLower(word)
      
        462
        	return contains(textLower, wordLower)
      
        463
        }
      
        464
        
        465
        // simpleStringSimilarity calculates basic string similarity
      
        466
        func simpleStringSimilarity(s1, s2 string) float64 {
      
        467
        	// Simple word overlap metric
      
        468
        	words1 := splitWords(toLower(s1))
      
        469
        	words2 := splitWords(toLower(s2))
      
        470
        
        471
        	if len(words1) == 0 || len(words2) == 0 {
      
        472
        		return 0.0
      
        473
        	}
      
        474
        
        475
        	matches := 0
      
        476
        	for _, w1 := range words1 {
      
        477
        		for _, w2 := range words2 {
      
        478
        			if w1 == w2 && len(w1) > 2 { // Skip short words
      
        479
        				matches++
      
        480
        				break
      
        481
        			}
      
        482
        		}
      
        483
        	}
      
        484
        
        485
        	return float64(matches) / float64(max(len(words1), len(words2)))
      
        486
        }
      
        487
        
        488
        // calculateDiversityScore measures insult variety
      
        489
        func calculateDiversityScore(scores []SampleScore) float64 {
      
        490
        	if len(scores) < 2 {
      
        491
        		return 1.0
      
        492
        	}
      
        493
        
        494
        	// Count unique insults
      
        495
        	unique := make(map[string]bool)
      
        496
        	for _, score := range scores {
      
        497
        		unique[score.GeneratedInsult] = true
      
        498
        	}
      
        499
        
        500
        	return float64(len(unique)) / float64(len(scores))
      
        501
        }
      
        502
        
        503
        // containsInsult checks if insult exists in database
      
        504
        func containsInsult(insults []TaggedInsult, target string) bool {
      
        505
        	for _, insult := range insults {
      
        506
        		if insult.Text == target {
      
        507
        			return true
      
        508
        		}
      
        509
        	}
      
        510
        	return false
      
        511
        }
      
        512
        
        513
        // determineMethod maps the fallback flag to a method label: "markov" for a
        // fallback, "ensemble" otherwise.
        func determineMethod(isFallback bool) string {
        	method := "ensemble"
        	if isFallback {
        		method = "markov"
        	}
        	return method
        }
      
        520
        
        521
        // PrintResults outputs benchmark results
      
        522
        func (r *BenchmarkResults) Print() {
      
        523
        	fmt.Println("╔═══════════════════════════════════════════════════════════╗")
      
        524
        	fmt.Printf("║ Benchmark Results: %-38s ║\n", r.SystemName)
      
        525
        	fmt.Println("╠═══════════════════════════════════════════════════════════╣")
      
        526
        	fmt.Printf("║ Total Samples:     %-41d ║\n", r.TotalSamples)
      
        527
        	fmt.Printf("║ Avg Relevance:     %-41.3f ║\n", r.AvgRelevance)
      
        528
        	fmt.Printf("║ Avg Latency:       %-41s ║\n", r.AvgLatency)
      
        529
        	fmt.Printf("║ Avg Confidence:    %-41.3f ║\n", r.AvgConfidence)
      
        530
        	fmt.Printf("║ Diversity Score:   %-41.3f ║\n", r.DiversityScore)
      
        531
        	fmt.Printf("║ Fallback Rate:     %-40.1f%% ║\n", r.FallbackRate*100)
      
        532
        	fmt.Println("╚═══════════════════════════════════════════════════════════╝")
      
        533
        }
      
        534
        
        535
        // Helper functions

        // toLower returns s with the ASCII letters 'A'-'Z' converted to lower case.
        // All other bytes, including multi-byte UTF-8 sequences, are copied
        // unchanged.
        //
        // The conversion works on a single byte buffer instead of appending to a
        // string one rune at a time, which reallocated the result on every
        // character. Operating on bytes is safe because bytes in the 'A'-'Z' range
        // never occur inside a multi-byte UTF-8 sequence.
        func toLower(s string) string {
        	buf := []byte(s)
        	for i, c := range buf {
        		if c >= 'A' && c <= 'Z' {
        			buf[i] = c + ('a' - 'A')
        		}
        	}
        	return string(buf)
        }
      
        547
        
        548
        func contains(s, substr string) bool {
      
        549
        	return len(s) >= len(substr) && findSubstring(s, substr) >= 0
      
        550
        }
      
        551
        
        552
        // findSubstring returns the byte index of the first occurrence of substr in
        // s, or -1 when there is none. An empty substr is found at index 0.
        func findSubstring(s, substr string) int {
        	width := len(substr)
        	// last is the final start position at which substr still fits; it is
        	// negative when substr is longer than s, so the loop never runs.
        	last := len(s) - width
        	for start := 0; start <= last; start++ {
        		if s[start:start+width] == substr {
        			return start
        		}
        	}
        	return -1
        }
      
        560
        
        561
        // splitWords breaks s into maximal runs of the characters a-z and 0-9; every
        // other character, upper-case letters included, acts as a separator. It
        // returns nil when s contains no such run.
        func splitWords(s string) []string {
        	var words []string

        	// begin is the byte offset of the word being scanned, or -1 between
        	// words. Word characters are single-byte, so slicing s by these offsets
        	// yields exactly the characters that were scanned.
        	begin := -1
        	for i, r := range s {
        		wordChar := (r >= 'a' && r <= 'z') || (r >= '0' && r <= '9')
        		switch {
        		case wordChar && begin < 0:
        			begin = i
        		case !wordChar && begin >= 0:
        			words = append(words, s[begin:i])
        			begin = -1
        		}
        	}
        	if begin >= 0 {
        		words = append(words, s[begin:])
        	}

        	return words
        }
      
        582
        
        583
        // max returns the larger of a and b.
        func max(a, b int) int {
        	if b >= a {
        		return b
        	}
        	return a
        }
      
        589

1	package llm
2
3	import (
4	"fmt"
5	"math"
6	"time"
7	)
8
9	// BenchmarkSample represents a real command failure with expected outputs
10	type BenchmarkSample struct {
11	ID string
12	Command string
13	ExitCode int
14	Stderr string
15	Context SmartFallbackContext
16	Category string // "git", "npm", "docker", etc.
17	Description string
18	GoldInsults []string // Human-written example insults
19	Tags []string // Expected tags for this scenario
20	}
21
22	// BenchmarkResults contains evaluation metrics
23	type BenchmarkResults struct {
24	SystemName string
25	TotalSamples int
26	AvgRelevance float64
27	AvgLatency time.Duration
28	AvgConfidence float64
29	DiversityScore float64
30	FallbackRate float64
31	MemoryUsageKB int
32	DetailedScores []SampleScore
33	}
34
// SampleScore is the evaluation outcome for a single benchmark sample.
type SampleScore struct {
	// SampleID is the ID of the BenchmarkSample that was scored.
	SampleID string
	// GeneratedInsult is the text produced by the system under test.
	GeneratedInsult string
	// Relevance is in the range 0-1: how relevant the insult is to the error.
	Relevance float64
	// Latency is the time it took to generate the insult.
	Latency time.Duration
	// Confidence is the confidence value attached to the insult.
	Confidence float64
	// NoveltyScore is the novelty value attached to the insult.
	NoveltyScore float64
	// Method names the generator: "semantic", "tag", "markov", "ensemble".
	Method string
}
45
46	// Benchmark framework for systematic evaluation
47	type Benchmark struct {
48	Name string
49	Samples []BenchmarkSample
50	}
51
52	// NewBenchmark creates a comprehensive benchmark dataset
53	func NewBenchmark() *Benchmark {
54	return &Benchmark{
55	Name: "Parrot Insult Quality Benchmark v1.0",
56	Samples: createBenchmarkSamples(),
57	}
58	}
59
60	// createBenchmarkSamples creates a comprehensive test dataset
61	func createBenchmarkSamples() []BenchmarkSample {
62	samples := []BenchmarkSample{}
63
64	// Git failures
65	samples = append(samples, BenchmarkSample{
66	ID: "git-001",
67	Command: "git push origin main",
68	ExitCode: 1,
69	Stderr: "error: failed to push some refs\nTo github.com:user/repo.git\n ! [rejected] main -> main (fetch first)",
70	Context: SmartFallbackContext{
71	CommandType: "git",
72	Command: "git",
73	Subcommand: "push",
74	GitBranch: "main",
75	ErrorPattern: "permission_denied",
76	IsRepeatedFailure: false,
77	},
78	Category: "git",
79	Description: "Git push rejected on main branch",
80	GoldInsults: []string{
81	"Push rejected. Did you forget to pull first?",
82	"The remote has standards. Your code doesn't meet them.",
83	},
84	Tags: []string{"git", "push", "main_branch"},
85	})
86
87	samples = append(samples, BenchmarkSample{
88	ID: "git-002",
89	Command: "git merge feature/new-ui",
90	ExitCode: 1,
91	Stderr: "CONFLICT (content): Merge conflict in src/app.js\nAutomatic merge failed; fix conflicts and then commit the result.",
92	Context: SmartFallbackContext{
93	CommandType: "git",
94	Command: "git",
95	Subcommand: "merge",
96	GitBranch: "main",
97	ErrorPattern: "merge_conflict",
98	IsRepeatedFailure: false,
99	},
100	Category: "git",
101	Description: "Merge conflict",
102	GoldInsults: []string{
103	"Merge conflict. Maybe communicate with your team?",
104	"<<<<<<< HEAD is not a valid merge resolution strategy",
105	},
106	Tags: []string{"git", "merge", "merge_conflict"},
107	})
108
109	samples = append(samples, BenchmarkSample{
110	ID: "git-003",
111	Command: "git push --force origin main",
112	ExitCode: 1,
113	Stderr: "error: refusing to update checked out branch: refs/heads/main",
114	Context: SmartFallbackContext{
115	CommandType: "git",
116	Command: "git",
117	Subcommand: "push",
118	GitBranch: "main",
119	ErrorPattern: "permission_denied",
120	IsRepeatedFailure: true,
121	TimeOfDay: 2,
122	},
123	Category: "git",
124	Description: "Force push to main at 2 AM (repeated failure)",
125	GoldInsults: []string{
126	"Force pushing to main at 2 AM? Bold strategy.",
127	"--force won't force competence into you",
128	},
129	Tags: []string{"git", "push", "main_branch", "late_night", "repeated"},
130	})
131
132	// NPM failures
133	samples = append(samples, BenchmarkSample{
134	ID: "npm-001",
135	Command: "npm install",
136	ExitCode: 1,
137	Stderr: "npm ERR! code ENOENT\nnpm ERR! syscall open\nnpm ERR! path /home/user/project/package.json\nnpm ERR! errno -2",
138	Context: SmartFallbackContext{
139	CommandType: "nodejs",
140	Command: "npm",
141	Subcommand: "install",
142	ProjectType: "node",
143	ErrorPattern: "not_found",
144	},
145	Category: "npm",
146	Description: "Missing package.json",
147	GoldInsults: []string{
148	"package.json not found. Neither is your organizational skill.",
149	"Are you in the right directory? Rhetorical question.",
150	},
151	Tags: []string{"npm", "install", "not_found"},
152	})
153
154	samples = append(samples, BenchmarkSample{
155	ID: "npm-002",
156	Command: "npm install typescript --save-dev",
157	ExitCode: 1,
158	Stderr: "npm ERR! code ERESOLVE\nnpm ERR! ERESOLVE unable to resolve dependency tree\nnpm ERR! peer dep missing: react@^18.0.0",
159	Context: SmartFallbackContext{
160	CommandType: "nodejs",
161	Command: "npm",
162	Subcommand: "install",
163	ProjectType: "node",
164	ErrorPattern: "dependency",
165	},
166	Category: "npm",
167	Description: "Dependency resolution failure",
168	GoldInsults: []string{
169	"Dependency hell. You're everyone's least favorite dependency.",
170	"ERESOLVE: Can't resolve your incompetence either",
171	},
172	Tags: []string{"npm", "install", "dependency"},
173	})
174
175	samples = append(samples, BenchmarkSample{
176	ID: "npm-003",
177	Command: "npm test",
178	ExitCode: 1,
179	Stderr: "FAIL src/components/App.test.js\n ● App › renders correctly\n expect(received).toEqual(expected)\n Expected: true\n Received: false",
180	Context: SmartFallbackContext{
181	CommandType: "nodejs",
182	Command: "npm",
183	Subcommand: "test",
184	ProjectType: "node",
185	ErrorPattern: "test_failure",
186	IsCI: true,
187	CIProvider: "github",
188	},
189	Category: "npm",
190	Description: "Test failure in CI",
191	GoldInsults: []string{
192	"Tests failed. Shocking absolutely no one who read your code",
193	"Did you test this before committing? Oh wait, that's what CI is for",
194	},
195	Tags: []string{"npm", "test", "test_failure", "ci"},
196	})
197
198	// Docker failures
199	samples = append(samples, BenchmarkSample{
200	ID: "docker-001",
201	Command: "docker build -t myapp .",
202	ExitCode: 1,
203	Stderr: "Step 5/10 : RUN npm install\nERROR [5/10] RUN npm install\nfailed to solve with frontend dockerfile.v0",
204	Context: SmartFallbackContext{
205	CommandType: "docker",
206	Command: "docker",
207	Subcommand: "build",
208	HasDockerfile: true,
209	ErrorPattern: "build_failure",
210	},
211	Category: "docker",
212	Description: "Docker build failure",
213	GoldInsults: []string{
214	"Docker build failed. Can't containerize disaster.",
215	"FROM scratch. You are scratch.",
216	},
217	Tags: []string{"docker", "build", "build_failure"},
218	})
219
220	samples = append(samples, BenchmarkSample{
221	ID: "docker-002",
222	Command: "docker run -p 3000:3000 myapp",
223	ExitCode: 125,
224	Stderr: "docker: Error response from daemon: driver failed programming external connectivity on endpoint\nError starting userland proxy: listen tcp4 0.0.0.0:3000: bind: address already in use.",
225	Context: SmartFallbackContext{
226	CommandType: "docker",
227	Command: "docker",
228	Subcommand: "run",
229	ErrorPattern: "port_in_use",
230	NumericArgs: []int{3000},
231	},
232	Category: "docker",
233	Description: "Port already in use",
234	GoldInsults: []string{
235	"Port 3000 already in use. By someone competent, probably.",
236	"Port conflict. Your existence is a conflict.",
237	},
238	Tags: []string{"docker", "run", "network"},
239	})
240
241	// Python failures
242	samples = append(samples, BenchmarkSample{
243	ID: "python-001",
244	Command: "python app.py",
245	ExitCode: 1,
246	Stderr: "Traceback (most recent call last):\n File \"app.py\", line 5, in <module>\n import requests\nModuleNotFoundError: No module named 'requests'",
247	Context: SmartFallbackContext{
248	CommandType: "python",
249	Command: "python",
250	ProjectType: "python",
251	ErrorPattern: "dependency",
252	FileExtensions: []string{".py"},
253	},
254	Category: "python",
255	Description: "Missing Python module",
256	GoldInsults: []string{
257	"ModuleNotFoundError: Module 'brain' not found",
258	"Did you activate your venv? Don't answer, I know you didn't",
259	},
260	Tags: []string{"python", "dependency"},
261	})
262
263	samples = append(samples, BenchmarkSample{
264	ID: "python-002",
265	Command: "python script.py",
266	ExitCode: 1,
267	Stderr: " File \"script.py\", line 15\n if x == 5\nSyntaxError: invalid syntax",
268	Context: SmartFallbackContext{
269	CommandType: "python",
270	Command: "python",
271	ProjectType: "python",
272	ErrorPattern: "syntax_error",
273	FileExtensions: []string{".py"},
274	},
275	Category: "python",
276	Description: "Python syntax error",
277	GoldInsults: []string{
278	"SyntaxError: Invalid syntax, invalid developer",
279	"Python is trying to tell you something. Maybe listen for once?",
280	},
281	Tags: []string{"python", "syntax"},
282	})
283
284	// Rust failures
285	samples = append(samples, BenchmarkSample{
286	ID: "rust-001",
287	Command: "cargo build",
288	ExitCode: 101,
289	Stderr: "error[E0502]: cannot borrow `x` as mutable because it is also borrowed as immutable\n --> src/main.rs:10:5",
290	Context: SmartFallbackContext{
291	CommandType: "rust",
292	Command: "cargo",
293	Subcommand: "build",
294	ProjectType: "rust",
295	ErrorPattern: "borrow_checker",
296	},
297	Category: "rust",
298	Description: "Borrow checker error",
299	GoldInsults: []string{
300	"Borrow checker says no. And honestly, it has a point.",
301	"Fighting the borrow checker? The borrow checker always wins.",
302	},
303	Tags: []string{"rust", "build", "borrow_checker"},
304	})
305
306	// Permission errors
307	samples = append(samples, BenchmarkSample{
308	ID: "perm-001",
309	Command: "chmod 777 /etc/passwd",
310	ExitCode: 1,
311	Stderr: "chmod: changing permissions of '/etc/passwd': Operation not permitted",
312	Context: SmartFallbackContext{
313	Command: "chmod",
314	ErrorPattern: "permission_denied",
315	NumericArgs: []int{777},
316	},
317	Category: "permission",
318	Description: "Permission denied with chmod 777",
319	GoldInsults: []string{
320	"chmod 777 isn't the answer this time, though I admire your optimism",
321	"777: Jackpot of incompetence",
322	},
323	Tags: []string{"permission", "chmod"},
324	})
325
326	// Late night scenarios
327	samples = append(samples, BenchmarkSample{
328	ID: "time-001",
329	Command: "make build",
330	ExitCode: 2,
331	Stderr: "make: *** [Makefile:15: build] Error 2",
332	Context: SmartFallbackContext{
333	Command: "make",
334	ErrorPattern: "build_failure",
335	TimeOfDay: 3,
336	HasMakefile: true,
337	},
338	Category: "build",
339	Description: "Build failure at 3 AM",
340	GoldInsults: []string{
341	"It's 3 AM. The bugs aren't the only thing that needs fixing",
342	"Late night debugging? Tomorrow-you is going to hate today-you",
343	},
344	Tags: []string{"build", "late_night"},
345	})
346
347	return samples
348	}
349
350	// EvaluateSystem runs the benchmark against a system
351	func (b Benchmark) EvaluateSystem(system EnsembleSystem) BenchmarkResults {
352	results := BenchmarkResults{
353	SystemName: "Ensemble ML System",
354	TotalSamples: len(b.Samples),
355	DetailedScores: make([]SampleScore, 0, len(b.Samples)),
356	}
357
358	var totalRelevance float64
359	var totalLatency time.Duration
360	var totalConfidence float64
361	var fallbackCount int
362
363	for _, sample := range b.Samples {
364	start := time.Now()
365	insult := system.GenerateInsult(&sample.Context, "sarcastic")
366	latency := time.Since(start)
367
368	// Calculate relevance score
369	relevance := calculateRelevanceScore(sample, insult)
370
371	// Determine if it was a Markov fallback
372	isFallback := len(insult) > 0 && !containsInsult(system.database.Insults, insult)
373
374	if isFallback {
375	fallbackCount++
376	}
377
378	score := SampleScore{
379	SampleID: sample.ID,
380	GeneratedInsult: insult,
381	Relevance: relevance,
382	Latency: latency,
383	Confidence: 0.75, // Placeholder
384	NoveltyScore: 1.0,
385	Method: determineMethod(isFallback),
386	}
387
388	results.DetailedScores = append(results.DetailedScores, score)
389
390	totalRelevance += relevance
391	totalLatency += latency
392	totalConfidence += score.Confidence
393	}
394
395	results.AvgRelevance = totalRelevance / float64(len(b.Samples))
396	results.AvgLatency = totalLatency / time.Duration(len(b.Samples))
397	results.AvgConfidence = totalConfidence / float64(len(b.Samples))
398	results.FallbackRate = float64(fallbackCount) / float64(len(b.Samples))
399	results.DiversityScore = calculateDiversityScore(results.DetailedScores)
400
401	return results
402	}
403
404	// calculateRelevanceScore measures how relevant the insult is to the error
405	func calculateRelevanceScore(sample BenchmarkSample, insult string) float64 {
406	score := 0.0
407
408	// Check for keyword matches
409	keywords := extractKeywords(sample)
410	for _, keyword := range keywords {
411	if containsWord(insult, keyword) {
412	score += 0.2
413	}
414	}
415
416	// Check for tag matches
417	for _, tag := range sample.Tags {
418	if containsWord(insult, tag) {
419	score += 0.15
420	}
421	}
422
423	// Check similarity to gold insults
424	if len(sample.GoldInsults) > 0 {
425	maxSimilarity := 0.0
426	for _, gold := range sample.GoldInsults {
427	sim := simpleStringSimilarity(insult, gold)
428	if sim > maxSimilarity {
429	maxSimilarity = sim
430	}
431	}
432	score += maxSimilarity * 0.3
433	}
434
435	return math.Min(1.0, score)
436	}
437
438	// extractKeywords extracts key terms from sample
439	func extractKeywords(sample BenchmarkSample) []string {
440	keywords := []string{
441	sample.Context.Command,
442	sample.Context.Subcommand,
443	sample.Context.CommandType,
444	sample.Context.ErrorPattern,
445	}
446
447	if sample.Context.GitBranch != "" {
448	keywords = append(keywords, sample.Context.GitBranch)
449	}
450
451	if sample.Context.ProjectType != "" {
452	keywords = append(keywords, sample.Context.ProjectType)
453	}
454
455	return keywords
456	}
457
458	// containsWord checks if text contains word (case-insensitive)
459	func containsWord(text, word string) bool {
460	textLower := toLower(text)
461	wordLower := toLower(word)
462	return contains(textLower, wordLower)
463	}
464
465	// simpleStringSimilarity calculates basic string similarity
466	func simpleStringSimilarity(s1, s2 string) float64 {
467	// Simple word overlap metric
468	words1 := splitWords(toLower(s1))
469	words2 := splitWords(toLower(s2))
470
471	if len(words1) == 0 \|\| len(words2) == 0 {
472	return 0.0
473	}
474
475	matches := 0
476	for _, w1 := range words1 {
477	for _, w2 := range words2 {
478	if w1 == w2 && len(w1) > 2 { // Skip short words
479	matches++
480	break
481	}
482	}
483	}
484
485	return float64(matches) / float64(max(len(words1), len(words2)))
486	}
487
488	// calculateDiversityScore measures insult variety
489	func calculateDiversityScore(scores []SampleScore) float64 {
490	if len(scores) < 2 {
491	return 1.0
492	}
493
494	// Count unique insults
495	unique := make(map[string]bool)
496	for _, score := range scores {
497	unique[score.GeneratedInsult] = true
498	}
499
500	return float64(len(unique)) / float64(len(scores))
501	}
502
503	// containsInsult checks if insult exists in database
504	func containsInsult(insults []TaggedInsult, target string) bool {
505	for _, insult := range insults {
506	if insult.Text == target {
507	return true
508	}
509	}
510	return false
511	}
512
// determineMethod names the generation method: "markov" when the insult was
// classified as a fallback, "ensemble" otherwise.
func determineMethod(isFallback bool) string {
	method := "ensemble"
	if isFallback {
		method = "markov"
	}
	return method
}
520
521	// PrintResults outputs benchmark results
522	func (r *BenchmarkResults) Print() {
523	fmt.Println("╔═══════════════════════════════════════════════════════════╗")
524	fmt.Printf("║ Benchmark Results: %-38s ║\n", r.SystemName)
525	fmt.Println("╠═══════════════════════════════════════════════════════════╣")
526	fmt.Printf("║ Total Samples: %-41d ║\n", r.TotalSamples)
527	fmt.Printf("║ Avg Relevance: %-41.3f ║\n", r.AvgRelevance)
528	fmt.Printf("║ Avg Latency: %-41s ║\n", r.AvgLatency)
529	fmt.Printf("║ Avg Confidence: %-41.3f ║\n", r.AvgConfidence)
530	fmt.Printf("║ Diversity Score: %-41.3f ║\n", r.DiversityScore)
531	fmt.Printf("║ Fallback Rate: %-40.1f%% ║\n", r.FallbackRate*100)
532	fmt.Println("╚═══════════════════════════════════════════════════════════╝")
533	}
534
535	// Helper functions
// toLower returns s with the ASCII letters 'A'-'Z' converted to lowercase.
// All other runes, including non-ASCII uppercase letters, are left unchanged.
//
// The string is converted to a rune slice once and edited in place rather
// than concatenated rune by rune, which would copy the growing result on
// every iteration.
func toLower(s string) string {
	runes := []rune(s)
	for i, r := range runes {
		if r >= 'A' && r <= 'Z' {
			runes[i] = r + ('a' - 'A')
		}
	}
	return string(runes)
}
547
548	func contains(s, substr string) bool {
549	return len(s) >= len(substr) && findSubstring(s, substr) >= 0
550	}
551
// findSubstring returns the byte offset of the first occurrence of substr
// in s, or -1 when substr does not occur. An empty substr matches at 0.
func findSubstring(s, substr string) int {
	width := len(substr)
	// Slide a window of len(substr) bytes across s.
	for start, end := 0, width; end <= len(s); start, end = start+1, end+1 {
		if s[start:end] == substr {
			return start
		}
	}
	return -1
}
560
// splitWords breaks s into maximal runs of ASCII lowercase letters and
// digits. Every other character (including uppercase letters and any
// non-ASCII text) acts as a separator, so callers are expected to lowercase
// the input first. It returns nil when s contains no words.
func splitWords(s string) []string {
	var words []string

	// start is the byte offset where the current word began, or -1 while
	// scanning separators. Word characters are all single-byte ASCII, so
	// slicing by byte offset avoids rebuilding each word one rune at a time.
	start := -1
	for i := 0; i < len(s); i++ {
		c := s[i]
		if (c >= 'a' && c <= 'z') || (c >= '0' && c <= '9') {
			if start < 0 {
				start = i
			}
			continue
		}
		if start >= 0 {
			words = append(words, s[start:i])
			start = -1
		}
	}

	// Flush a word that runs to the end of the input.
	if start >= 0 {
		words = append(words, s[start:])
	}

	return words
}
582
// max returns the larger of the two integers a and b.
func max(a, b int) int {
	if b >= a {
		return b
	}
	return a
}
589