| 1 | package llm |
| 2 | |
import (
	"math/rand"
	"strconv"
	"strings"
	"sync"
	"time"
)
| 9 | |
// MarkovGenerator generates novel insults using Markov chains.
// mu guards chains, starters, and rng, which are shared across calls.
// NOTE(review): rng (*rand.Rand) is not safe for concurrent use, yet some
// methods touch it while holding only a read lock — verify locking discipline.
type MarkovGenerator struct {
	mu        sync.RWMutex
	chains    map[string]map[string]int // state -> next_word -> count
	starters  []string                  // possible starting words
	order     int                       // n-gram order (2 = bigram)
	minLength int                       // minimum generated text length
	maxLength int                       // maximum generated text length
	rng       *rand.Rand                // source of randomness for generation
}
| 20 | |
| 21 | // NewMarkovGenerator creates a new Markov chain generator |
| 22 | func NewMarkovGenerator(order int) *MarkovGenerator { |
| 23 | return &MarkovGenerator{ |
| 24 | chains: make(map[string]map[string]int), |
| 25 | starters: make([]string, 0), |
| 26 | order: order, |
| 27 | minLength: 30, // Minimum 30 characters |
| 28 | maxLength: 150, // Maximum 150 characters |
| 29 | rng: rand.New(rand.NewSource(time.Now().UnixNano())), |
| 30 | } |
| 31 | } |
| 32 | |
| 33 | // Train trains the Markov chain on a corpus of insults |
| 34 | func (mg *MarkovGenerator) Train(insults []string) { |
| 35 | mg.mu.Lock() |
| 36 | defer mg.mu.Unlock() |
| 37 | |
| 38 | for _, insult := range insults { |
| 39 | mg.trainOnTextUnlocked(insult) |
| 40 | } |
| 41 | } |
| 42 | |
| 43 | // trainOnTextUnlocked trains on a single text (caller must hold lock) |
| 44 | func (mg *MarkovGenerator) trainOnTextUnlocked(text string) { |
| 45 | words := mg.tokenize(text) |
| 46 | if len(words) < mg.order+1 { |
| 47 | return |
| 48 | } |
| 49 | |
| 50 | // Add first state as starter |
| 51 | state := strings.Join(words[:mg.order], " ") |
| 52 | mg.starters = append(mg.starters, state) |
| 53 | |
| 54 | // Build chain |
| 55 | for i := 0; i < len(words)-mg.order; i++ { |
| 56 | state := strings.Join(words[i:i+mg.order], " ") |
| 57 | nextWord := words[i+mg.order] |
| 58 | |
| 59 | if _, exists := mg.chains[state]; !exists { |
| 60 | mg.chains[state] = make(map[string]int) |
| 61 | } |
| 62 | |
| 63 | mg.chains[state][nextWord]++ |
| 64 | } |
| 65 | } |
| 66 | |
| 67 | // tokenize splits text into words |
| 68 | func (mg *MarkovGenerator) tokenize(text string) []string { |
| 69 | // Split on spaces and punctuation, but keep punctuation |
| 70 | var words []string |
| 71 | var currentWord strings.Builder |
| 72 | |
| 73 | for _, r := range text { |
| 74 | if r == ' ' || r == '\n' || r == '\t' { |
| 75 | if currentWord.Len() > 0 { |
| 76 | words = append(words, currentWord.String()) |
| 77 | currentWord.Reset() |
| 78 | } |
| 79 | } else if r == '.' || r == '!' || r == '?' || r == ',' || r == ':' || r == ';' { |
| 80 | if currentWord.Len() > 0 { |
| 81 | words = append(words, currentWord.String()) |
| 82 | currentWord.Reset() |
| 83 | } |
| 84 | words = append(words, string(r)) |
| 85 | } else { |
| 86 | currentWord.WriteRune(r) |
| 87 | } |
| 88 | } |
| 89 | |
| 90 | if currentWord.Len() > 0 { |
| 91 | words = append(words, currentWord.String()) |
| 92 | } |
| 93 | |
| 94 | return words |
| 95 | } |
| 96 | |
| 97 | // Generate generates a novel insult |
| 98 | func (mg *MarkovGenerator) Generate() string { |
| 99 | mg.mu.RLock() |
| 100 | defer mg.mu.RUnlock() |
| 101 | |
| 102 | if len(mg.starters) == 0 || len(mg.chains) == 0 { |
| 103 | return "" // Not trained yet |
| 104 | } |
| 105 | |
| 106 | // Pick a random starting state |
| 107 | state := mg.starters[mg.rng.Intn(len(mg.starters))] |
| 108 | words := strings.Split(state, " ") |
| 109 | |
| 110 | // Generate until we hit max length or a terminal state |
| 111 | attempts := 0 |
| 112 | maxAttempts := 100 |
| 113 | |
| 114 | for len(strings.Join(words, " ")) < mg.maxLength && attempts < maxAttempts { |
| 115 | attempts++ |
| 116 | |
| 117 | // Get next word choices |
| 118 | nextWords := mg.chains[state] |
| 119 | if len(nextWords) == 0 { |
| 120 | break // Terminal state |
| 121 | } |
| 122 | |
| 123 | // Choose next word based on frequency |
| 124 | nextWord := mg.weightedChoice(nextWords) |
| 125 | words = append(words, nextWord) |
| 126 | |
| 127 | // Update state |
| 128 | if len(words) >= mg.order { |
| 129 | state = strings.Join(words[len(words)-mg.order:], " ") |
| 130 | } |
| 131 | |
| 132 | // Stop at sentence endings if we've generated enough |
| 133 | if (nextWord == "." || nextWord == "!" || nextWord == "?") && |
| 134 | len(strings.Join(words, " ")) >= mg.minLength { |
| 135 | break |
| 136 | } |
| 137 | } |
| 138 | |
| 139 | // Reconstruct text with proper spacing |
| 140 | return mg.reconstructText(words) |
| 141 | } |
| 142 | |
| 143 | // weightedChoice selects a word based on frequency weights |
| 144 | func (mg *MarkovGenerator) weightedChoice(choices map[string]int) string { |
| 145 | // Calculate total weight |
| 146 | totalWeight := 0 |
| 147 | for _, count := range choices { |
| 148 | totalWeight += count |
| 149 | } |
| 150 | |
| 151 | // Random selection |
| 152 | r := mg.rng.Intn(totalWeight) |
| 153 | cumulative := 0 |
| 154 | |
| 155 | for word, count := range choices { |
| 156 | cumulative += count |
| 157 | if r < cumulative { |
| 158 | return word |
| 159 | } |
| 160 | } |
| 161 | |
| 162 | // Fallback (shouldn't reach here) |
| 163 | for word := range choices { |
| 164 | return word |
| 165 | } |
| 166 | |
| 167 | return "" |
| 168 | } |
| 169 | |
| 170 | // reconstructText reconstructs text with proper spacing around punctuation |
| 171 | func (mg *MarkovGenerator) reconstructText(words []string) string { |
| 172 | var result strings.Builder |
| 173 | |
| 174 | for i, word := range words { |
| 175 | // Don't add space before punctuation |
| 176 | if i > 0 && !mg.isPunctuation(word) { |
| 177 | result.WriteString(" ") |
| 178 | } |
| 179 | |
| 180 | result.WriteString(word) |
| 181 | } |
| 182 | |
| 183 | return result.String() |
| 184 | } |
| 185 | |
| 186 | // isPunctuation checks if a word is punctuation |
| 187 | func (mg *MarkovGenerator) isPunctuation(word string) bool { |
| 188 | return word == "." || word == "!" || word == "?" || |
| 189 | word == "," || word == ":" || word == ";" || |
| 190 | word == "(" || word == ")" |
| 191 | } |
| 192 | |
| 193 | // GenerateContextual generates an insult with context hints |
| 194 | func (mg *MarkovGenerator) GenerateContextual(seedWords []string) string { |
| 195 | mg.mu.RLock() |
| 196 | defer mg.mu.RUnlock() |
| 197 | |
| 198 | if len(mg.chains) == 0 { |
| 199 | return "" |
| 200 | } |
| 201 | |
| 202 | // Find states that contain any of the seed words |
| 203 | var matchingStarters []string |
| 204 | for _, starter := range mg.starters { |
| 205 | for _, seed := range seedWords { |
| 206 | if strings.Contains(strings.ToLower(starter), strings.ToLower(seed)) { |
| 207 | matchingStarters = append(matchingStarters, starter) |
| 208 | break |
| 209 | } |
| 210 | } |
| 211 | } |
| 212 | |
| 213 | // If we found matching starters, use them; otherwise use any starter |
| 214 | if len(matchingStarters) == 0 { |
| 215 | matchingStarters = mg.starters |
| 216 | } |
| 217 | |
| 218 | // Pick a random matching starter |
| 219 | state := matchingStarters[mg.rng.Intn(len(matchingStarters))] |
| 220 | words := strings.Split(state, " ") |
| 221 | |
| 222 | // Generate as normal |
| 223 | attempts := 0 |
| 224 | maxAttempts := 100 |
| 225 | |
| 226 | for len(strings.Join(words, " ")) < mg.maxLength && attempts < maxAttempts { |
| 227 | attempts++ |
| 228 | |
| 229 | nextWords := mg.chains[state] |
| 230 | if len(nextWords) == 0 { |
| 231 | break |
| 232 | } |
| 233 | |
| 234 | nextWord := mg.weightedChoice(nextWords) |
| 235 | words = append(words, nextWord) |
| 236 | |
| 237 | if len(words) >= mg.order { |
| 238 | state = strings.Join(words[len(words)-mg.order:], " ") |
| 239 | } |
| 240 | |
| 241 | if (nextWord == "." || nextWord == "!" || nextWord == "?") && |
| 242 | len(strings.Join(words, " ")) >= mg.minLength { |
| 243 | break |
| 244 | } |
| 245 | } |
| 246 | |
| 247 | return mg.reconstructText(words) |
| 248 | } |
| 249 | |
| 250 | // GenerateWithTemplate generates using a template with variable slots |
| 251 | func (mg *MarkovGenerator) GenerateWithTemplate(template string, variables map[string]string) string { |
| 252 | result := template |
| 253 | |
| 254 | for key, value := range variables { |
| 255 | placeholder := "{" + key + "}" |
| 256 | result = strings.ReplaceAll(result, placeholder, value) |
| 257 | } |
| 258 | |
| 259 | // Fill remaining slots with Markov-generated content |
| 260 | if strings.Contains(result, "{random}") { |
| 261 | generated := mg.Generate() |
| 262 | result = strings.ReplaceAll(result, "{random}", generated) |
| 263 | } |
| 264 | |
| 265 | return result |
| 266 | } |
| 267 | |
| 268 | // Blend creates a hybrid insult by blending Markov generation with templates |
| 269 | func (mg *MarkovGenerator) Blend(ctx *SmartFallbackContext) string { |
| 270 | // Extract key terms from the context |
| 271 | seedWords := []string{} |
| 272 | |
| 273 | // Add command type |
| 274 | if ctx.CommandType != "" { |
| 275 | seedWords = append(seedWords, ctx.CommandType) |
| 276 | } |
| 277 | |
| 278 | // Add command |
| 279 | if ctx.Command != "" { |
| 280 | seedWords = append(seedWords, ctx.Command) |
| 281 | } |
| 282 | |
| 283 | // Add error pattern |
| 284 | if ctx.ErrorPattern != "" { |
| 285 | seedWords = append(seedWords, strings.ReplaceAll(ctx.ErrorPattern, "_", " ")) |
| 286 | } |
| 287 | |
| 288 | // Generate contextual insult |
| 289 | generated := mg.GenerateContextual(seedWords) |
| 290 | |
| 291 | // Post-process: ensure it's not too similar to training data |
| 292 | if mg.tooSimilarToTraining(generated) { |
| 293 | // Try again with different seed |
| 294 | return mg.Generate() |
| 295 | } |
| 296 | |
| 297 | return generated |
| 298 | } |
| 299 | |
// tooSimilarToTraining checks if generated text is too close to training data.
// Simple heuristic: a very short output likely reproduced a single training
// fragment verbatim (also catches the "" returned when the model is untrained).
// NOTE(review): despite the comment below, no consecutive-word overlap check
// is implemented — only the length test. Consider adding one if needed.
func (mg *MarkovGenerator) tooSimilarToTraining(text string) bool {
	// Simple heuristic: if the text is very short or contains many consecutive
	// words from a single training example, it's too similar
	return len(text) < mg.minLength
}
| 306 | |
| 307 | // HybridGenerate combines Markov with template system for best results |
| 308 | func (mg *MarkovGenerator) HybridGenerate( |
| 309 | ctx *SmartFallbackContext, |
| 310 | templates []string, |
| 311 | ) string { |
| 312 | // 50% chance to use pure Markov, 50% template + Markov |
| 313 | if mg.rng.Float64() < 0.5 { |
| 314 | return mg.Blend(ctx) |
| 315 | } |
| 316 | |
| 317 | // Pick a random template |
| 318 | if len(templates) == 0 { |
| 319 | return mg.Blend(ctx) |
| 320 | } |
| 321 | |
| 322 | template := templates[mg.rng.Intn(len(templates))] |
| 323 | |
| 324 | // Fill template variables |
| 325 | variables := map[string]string{ |
| 326 | "command": ctx.Command, |
| 327 | "commandType": ctx.CommandType, |
| 328 | "exitCode": string(rune(ctx.ExitCode)), |
| 329 | "error": ctx.ErrorPattern, |
| 330 | } |
| 331 | |
| 332 | return mg.GenerateWithTemplate(template, variables) |
| 333 | } |
| 334 | |
| 335 | // GetStats returns statistics about the trained model |
| 336 | func (mg *MarkovGenerator) GetStats() map[string]interface{} { |
| 337 | return map[string]interface{}{ |
| 338 | "states": len(mg.chains), |
| 339 | "starters": len(mg.starters), |
| 340 | "order": mg.order, |
| 341 | "vocabulary": mg.countVocabulary(), |
| 342 | "avg_choices": mg.averageChoices(), |
| 343 | } |
| 344 | } |
| 345 | |
| 346 | func (mg *MarkovGenerator) countVocabulary() int { |
| 347 | vocab := make(map[string]bool) |
| 348 | for state := range mg.chains { |
| 349 | words := strings.Split(state, " ") |
| 350 | for _, word := range words { |
| 351 | vocab[word] = true |
| 352 | } |
| 353 | } |
| 354 | return len(vocab) |
| 355 | } |
| 356 | |
| 357 | func (mg *MarkovGenerator) averageChoices() float64 { |
| 358 | if len(mg.chains) == 0 { |
| 359 | return 0 |
| 360 | } |
| 361 | |
| 362 | total := 0 |
| 363 | for _, choices := range mg.chains { |
| 364 | total += len(choices) |
| 365 | } |
| 366 | |
| 367 | return float64(total) / float64(len(mg.chains)) |
| 368 | } |
| 369 |