| 1 | package main |
| 2 | |
| 3 | import ( |
| 4 | "fmt" |
| 5 | "parrot/internal/llm" |
| 6 | ) |
| 7 | |
| 8 | func main() { |
| 9 | fmt.Println("Parrot Insult System Benchmark") |
| 10 | fmt.Println("================================") |
| 11 | |
| 12 | // Create benchmark |
| 13 | benchmark := llm.NewBenchmark() |
| 14 | |
| 15 | fmt.Printf("Loading benchmark with %d samples...\n\n", len(benchmark.Samples)) |
| 16 | |
| 17 | // Initialize ensemble system |
| 18 | db := llm.NewInsultDatabase() |
| 19 | scorer := llm.NewInsultScorer(db) |
| 20 | hist := llm.NewInsultHistory(20) |
| 21 | ensemble := llm.NewEnsembleSystem(db, scorer, hist) |
| 22 | |
| 23 | fmt.Println("Training ensemble system...") |
| 24 | ensemble.Train() |
| 25 | fmt.Println("Training complete!") |
| 26 | |
| 27 | // Run benchmark |
| 28 | fmt.Println("Running benchmark...") |
| 29 | results := benchmark.EvaluateSystem(ensemble) |
| 30 | |
| 31 | // Print results |
| 32 | fmt.Println() |
| 33 | results.Print() |
| 34 | |
| 35 | // Print detailed sample results |
| 36 | fmt.Println("\nDetailed Sample Results:") |
| 37 | fmt.Println("========================") |
| 38 | |
| 39 | for i, score := range results.DetailedScores { |
| 40 | if i >= 10 { // Show first 10 |
| 41 | fmt.Printf("... and %d more samples\n", len(results.DetailedScores)-10) |
| 42 | break |
| 43 | } |
| 44 | |
| 45 | sample := benchmark.Samples[i] |
| 46 | fmt.Printf("Sample: %s (%s)\n", sample.ID, sample.Description) |
| 47 | fmt.Printf(" Command: %s\n", sample.Command) |
| 48 | fmt.Printf(" Generated: %s\n", score.GeneratedInsult) |
| 49 | fmt.Printf(" Relevance: %.3f | Latency: %v | Method: %s\n", |
| 50 | score.Relevance, score.Latency, score.Method) |
| 51 | fmt.Println() |
| 52 | } |
| 53 | |
| 54 | // Summary statistics |
| 55 | fmt.Println("\nAnalysis:") |
| 56 | fmt.Println("=========") |
| 57 | |
| 58 | if results.AvgRelevance < 0.6 { |
| 59 | fmt.Println("⚠️ Low relevance score - need better context matching") |
| 60 | } else if results.AvgRelevance < 0.75 { |
| 61 | fmt.Println("⚡ Moderate relevance - room for improvement") |
| 62 | } else { |
| 63 | fmt.Println("✅ Good relevance scores!") |
| 64 | } |
| 65 | |
| 66 | if results.FallbackRate > 0.3 { |
| 67 | fmt.Println("⚠️ High Markov fallback rate - database may need expansion") |
| 68 | } else { |
| 69 | fmt.Println("✅ Low fallback rate - good database coverage") |
| 70 | } |
| 71 | |
| 72 | if results.DiversityScore < 0.8 { |
| 73 | fmt.Println("⚠️ Low diversity - seeing too many similar insults") |
| 74 | } else { |
| 75 | fmt.Println("✅ Good diversity in selections") |
| 76 | } |
| 77 | |
| 78 | fmt.Println("\nBenchmark complete!") |
| 79 | } |
| 80 |