Go · 2116 bytes Raw Blame History
1 package main
2
3 import (
4 "fmt"
5 "parrot/internal/llm"
6 )
7
8 func main() {
9 fmt.Println("Parrot Insult System Benchmark")
10 fmt.Println("================================")
11
12 // Create benchmark
13 benchmark := llm.NewBenchmark()
14
15 fmt.Printf("Loading benchmark with %d samples...\n\n", len(benchmark.Samples))
16
17 // Initialize ensemble system
18 db := llm.NewInsultDatabase()
19 scorer := llm.NewInsultScorer(db)
20 hist := llm.NewInsultHistory(20)
21 ensemble := llm.NewEnsembleSystem(db, scorer, hist)
22
23 fmt.Println("Training ensemble system...")
24 ensemble.Train()
25 fmt.Println("Training complete!")
26
27 // Run benchmark
28 fmt.Println("Running benchmark...")
29 results := benchmark.EvaluateSystem(ensemble)
30
31 // Print results
32 fmt.Println()
33 results.Print()
34
35 // Print detailed sample results
36 fmt.Println("\nDetailed Sample Results:")
37 fmt.Println("========================")
38
39 for i, score := range results.DetailedScores {
40 if i >= 10 { // Show first 10
41 fmt.Printf("... and %d more samples\n", len(results.DetailedScores)-10)
42 break
43 }
44
45 sample := benchmark.Samples[i]
46 fmt.Printf("Sample: %s (%s)\n", sample.ID, sample.Description)
47 fmt.Printf(" Command: %s\n", sample.Command)
48 fmt.Printf(" Generated: %s\n", score.GeneratedInsult)
49 fmt.Printf(" Relevance: %.3f | Latency: %v | Method: %s\n",
50 score.Relevance, score.Latency, score.Method)
51 fmt.Println()
52 }
53
54 // Summary statistics
55 fmt.Println("\nAnalysis:")
56 fmt.Println("=========")
57
58 if results.AvgRelevance < 0.6 {
59 fmt.Println("⚠️ Low relevance score - need better context matching")
60 } else if results.AvgRelevance < 0.75 {
61 fmt.Println("⚡ Moderate relevance - room for improvement")
62 } else {
63 fmt.Println("✅ Good relevance scores!")
64 }
65
66 if results.FallbackRate > 0.3 {
67 fmt.Println("⚠️ High Markov fallback rate - database may need expansion")
68 } else {
69 fmt.Println("✅ Low fallback rate - good database coverage")
70 }
71
72 if results.DiversityScore < 0.8 {
73 fmt.Println("⚠️ Low diversity - seeing too many similar insults")
74 } else {
75 fmt.Println("✅ Good diversity in selections")
76 }
77
78 fmt.Println("\nBenchmark complete!")
79 }
80