// Markov-chain insult generation for package llm.
1 package llm
2
import (
	"math/rand"
	"strconv"
	"strings"
	"sync"
	"time"
)
9
// MarkovGenerator generates novel insults using Markov chains trained
// on a corpus of example texts. mu guards chains, starters, and rng;
// note that *rand.Rand is not safe for concurrent use, so any method
// that draws random numbers must hold the write lock.
type MarkovGenerator struct {
	mu        sync.RWMutex
	chains    map[string]map[string]int // state -> next_word -> count
	starters  []string                  // possible starting states (n-grams)
	order     int                       // n-gram order (2 = bigram)
	minLength int                       // minimum generated text length
	maxLength int                       // maximum generated text length
	rng       *rand.Rand
}

// NewMarkovGenerator creates a new Markov chain generator with the
// given n-gram order. Orders below 1 are clamped to 1: an order of 0
// would produce empty states and a degenerate chain.
func NewMarkovGenerator(order int) *MarkovGenerator {
	if order < 1 {
		order = 1
	}
	return &MarkovGenerator{
		chains:    make(map[string]map[string]int),
		starters:  make([]string, 0),
		order:     order,
		minLength: 30,  // Minimum 30 characters
		maxLength: 150, // Maximum 150 characters
		rng:       rand.New(rand.NewSource(time.Now().UnixNano())),
	}
}
32
33 // Train trains the Markov chain on a corpus of insults
34 func (mg *MarkovGenerator) Train(insults []string) {
35 mg.mu.Lock()
36 defer mg.mu.Unlock()
37
38 for _, insult := range insults {
39 mg.trainOnTextUnlocked(insult)
40 }
41 }
42
43 // trainOnTextUnlocked trains on a single text (caller must hold lock)
44 func (mg *MarkovGenerator) trainOnTextUnlocked(text string) {
45 words := mg.tokenize(text)
46 if len(words) < mg.order+1 {
47 return
48 }
49
50 // Add first state as starter
51 state := strings.Join(words[:mg.order], " ")
52 mg.starters = append(mg.starters, state)
53
54 // Build chain
55 for i := 0; i < len(words)-mg.order; i++ {
56 state := strings.Join(words[i:i+mg.order], " ")
57 nextWord := words[i+mg.order]
58
59 if _, exists := mg.chains[state]; !exists {
60 mg.chains[state] = make(map[string]int)
61 }
62
63 mg.chains[state][nextWord]++
64 }
65 }
66
67 // tokenize splits text into words
68 func (mg *MarkovGenerator) tokenize(text string) []string {
69 // Split on spaces and punctuation, but keep punctuation
70 var words []string
71 var currentWord strings.Builder
72
73 for _, r := range text {
74 if r == ' ' || r == '\n' || r == '\t' {
75 if currentWord.Len() > 0 {
76 words = append(words, currentWord.String())
77 currentWord.Reset()
78 }
79 } else if r == '.' || r == '!' || r == '?' || r == ',' || r == ':' || r == ';' {
80 if currentWord.Len() > 0 {
81 words = append(words, currentWord.String())
82 currentWord.Reset()
83 }
84 words = append(words, string(r))
85 } else {
86 currentWord.WriteRune(r)
87 }
88 }
89
90 if currentWord.Len() > 0 {
91 words = append(words, currentWord.String())
92 }
93
94 return words
95 }
96
97 // Generate generates a novel insult
98 func (mg *MarkovGenerator) Generate() string {
99 mg.mu.RLock()
100 defer mg.mu.RUnlock()
101
102 if len(mg.starters) == 0 || len(mg.chains) == 0 {
103 return "" // Not trained yet
104 }
105
106 // Pick a random starting state
107 state := mg.starters[mg.rng.Intn(len(mg.starters))]
108 words := strings.Split(state, " ")
109
110 // Generate until we hit max length or a terminal state
111 attempts := 0
112 maxAttempts := 100
113
114 for len(strings.Join(words, " ")) < mg.maxLength && attempts < maxAttempts {
115 attempts++
116
117 // Get next word choices
118 nextWords := mg.chains[state]
119 if len(nextWords) == 0 {
120 break // Terminal state
121 }
122
123 // Choose next word based on frequency
124 nextWord := mg.weightedChoice(nextWords)
125 words = append(words, nextWord)
126
127 // Update state
128 if len(words) >= mg.order {
129 state = strings.Join(words[len(words)-mg.order:], " ")
130 }
131
132 // Stop at sentence endings if we've generated enough
133 if (nextWord == "." || nextWord == "!" || nextWord == "?") &&
134 len(strings.Join(words, " ")) >= mg.minLength {
135 break
136 }
137 }
138
139 // Reconstruct text with proper spacing
140 return mg.reconstructText(words)
141 }
142
143 // weightedChoice selects a word based on frequency weights
144 func (mg *MarkovGenerator) weightedChoice(choices map[string]int) string {
145 // Calculate total weight
146 totalWeight := 0
147 for _, count := range choices {
148 totalWeight += count
149 }
150
151 // Random selection
152 r := mg.rng.Intn(totalWeight)
153 cumulative := 0
154
155 for word, count := range choices {
156 cumulative += count
157 if r < cumulative {
158 return word
159 }
160 }
161
162 // Fallback (shouldn't reach here)
163 for word := range choices {
164 return word
165 }
166
167 return ""
168 }
169
170 // reconstructText reconstructs text with proper spacing around punctuation
171 func (mg *MarkovGenerator) reconstructText(words []string) string {
172 var result strings.Builder
173
174 for i, word := range words {
175 // Don't add space before punctuation
176 if i > 0 && !mg.isPunctuation(word) {
177 result.WriteString(" ")
178 }
179
180 result.WriteString(word)
181 }
182
183 return result.String()
184 }
185
186 // isPunctuation checks if a word is punctuation
187 func (mg *MarkovGenerator) isPunctuation(word string) bool {
188 return word == "." || word == "!" || word == "?" ||
189 word == "," || word == ":" || word == ";" ||
190 word == "(" || word == ")"
191 }
192
193 // GenerateContextual generates an insult with context hints
194 func (mg *MarkovGenerator) GenerateContextual(seedWords []string) string {
195 mg.mu.RLock()
196 defer mg.mu.RUnlock()
197
198 if len(mg.chains) == 0 {
199 return ""
200 }
201
202 // Find states that contain any of the seed words
203 var matchingStarters []string
204 for _, starter := range mg.starters {
205 for _, seed := range seedWords {
206 if strings.Contains(strings.ToLower(starter), strings.ToLower(seed)) {
207 matchingStarters = append(matchingStarters, starter)
208 break
209 }
210 }
211 }
212
213 // If we found matching starters, use them; otherwise use any starter
214 if len(matchingStarters) == 0 {
215 matchingStarters = mg.starters
216 }
217
218 // Pick a random matching starter
219 state := matchingStarters[mg.rng.Intn(len(matchingStarters))]
220 words := strings.Split(state, " ")
221
222 // Generate as normal
223 attempts := 0
224 maxAttempts := 100
225
226 for len(strings.Join(words, " ")) < mg.maxLength && attempts < maxAttempts {
227 attempts++
228
229 nextWords := mg.chains[state]
230 if len(nextWords) == 0 {
231 break
232 }
233
234 nextWord := mg.weightedChoice(nextWords)
235 words = append(words, nextWord)
236
237 if len(words) >= mg.order {
238 state = strings.Join(words[len(words)-mg.order:], " ")
239 }
240
241 if (nextWord == "." || nextWord == "!" || nextWord == "?") &&
242 len(strings.Join(words, " ")) >= mg.minLength {
243 break
244 }
245 }
246
247 return mg.reconstructText(words)
248 }
249
250 // GenerateWithTemplate generates using a template with variable slots
251 func (mg *MarkovGenerator) GenerateWithTemplate(template string, variables map[string]string) string {
252 result := template
253
254 for key, value := range variables {
255 placeholder := "{" + key + "}"
256 result = strings.ReplaceAll(result, placeholder, value)
257 }
258
259 // Fill remaining slots with Markov-generated content
260 if strings.Contains(result, "{random}") {
261 generated := mg.Generate()
262 result = strings.ReplaceAll(result, "{random}", generated)
263 }
264
265 return result
266 }
267
268 // Blend creates a hybrid insult by blending Markov generation with templates
269 func (mg *MarkovGenerator) Blend(ctx *SmartFallbackContext) string {
270 // Extract key terms from the context
271 seedWords := []string{}
272
273 // Add command type
274 if ctx.CommandType != "" {
275 seedWords = append(seedWords, ctx.CommandType)
276 }
277
278 // Add command
279 if ctx.Command != "" {
280 seedWords = append(seedWords, ctx.Command)
281 }
282
283 // Add error pattern
284 if ctx.ErrorPattern != "" {
285 seedWords = append(seedWords, strings.ReplaceAll(ctx.ErrorPattern, "_", " "))
286 }
287
288 // Generate contextual insult
289 generated := mg.GenerateContextual(seedWords)
290
291 // Post-process: ensure it's not too similar to training data
292 if mg.tooSimilarToTraining(generated) {
293 // Try again with different seed
294 return mg.Generate()
295 }
296
297 return generated
298 }
299
// tooSimilarToTraining checks if generated text is too close to training data.
// Current heuristic: anything shorter than minLength is rejected, on the
// assumption that very short outputs tend to be verbatim corpus fragments.
// NOTE(review): despite the name, no comparison against the actual training
// corpus is performed here — only the length check below.
func (mg *MarkovGenerator) tooSimilarToTraining(text string) bool {
	// Simple heuristic: if the text is very short or contains many consecutive
	// words from a single training example, it's too similar
	return len(text) < mg.minLength
}
306
307 // HybridGenerate combines Markov with template system for best results
308 func (mg *MarkovGenerator) HybridGenerate(
309 ctx *SmartFallbackContext,
310 templates []string,
311 ) string {
312 // 50% chance to use pure Markov, 50% template + Markov
313 if mg.rng.Float64() < 0.5 {
314 return mg.Blend(ctx)
315 }
316
317 // Pick a random template
318 if len(templates) == 0 {
319 return mg.Blend(ctx)
320 }
321
322 template := templates[mg.rng.Intn(len(templates))]
323
324 // Fill template variables
325 variables := map[string]string{
326 "command": ctx.Command,
327 "commandType": ctx.CommandType,
328 "exitCode": string(rune(ctx.ExitCode)),
329 "error": ctx.ErrorPattern,
330 }
331
332 return mg.GenerateWithTemplate(template, variables)
333 }
334
335 // GetStats returns statistics about the trained model
336 func (mg *MarkovGenerator) GetStats() map[string]interface{} {
337 return map[string]interface{}{
338 "states": len(mg.chains),
339 "starters": len(mg.starters),
340 "order": mg.order,
341 "vocabulary": mg.countVocabulary(),
342 "avg_choices": mg.averageChoices(),
343 }
344 }
345
346 func (mg *MarkovGenerator) countVocabulary() int {
347 vocab := make(map[string]bool)
348 for state := range mg.chains {
349 words := strings.Split(state, " ")
350 for _, word := range words {
351 vocab[word] = true
352 }
353 }
354 return len(vocab)
355 }
356
357 func (mg *MarkovGenerator) averageChoices() float64 {
358 if len(mg.chains) == 0 {
359 return 0
360 }
361
362 total := 0
363 for _, choices := range mg.chains {
364 total += len(choices)
365 }
366
367 return float64(total) / float64(len(mg.chains))
368 }
369