tenseleyflow/parrot / 3c407d8

Browse files

fix race conditions in ML engines causing data corruption

Add sync.RWMutex protection to prevent concurrent map access:
- tfidf_engine.go: protect vocabulary and idf maps
- bm25_engine.go: protect vocabulary and idf maps
- markov_generator.go: protect chains and starters maps
- ensemble_system.go: prevent concurrent Train() calls

This fixes the bug where dates appeared in command_frequency
instead of actual commands, and eliminates 'concurrent map
read and map write' panics.
Authored by mfwolffe <wolffemf@dukes.jmu.edu>
SHA
3c407d83a24359d0715da2f19ae0328ea0b1cdc5
Parents
08b88f7
Tree
38c0833

4 changed files

Status | File | + | -
M internal/llm/bm25_engine.go 8 0
M internal/llm/ensemble_system.go 10 3
M internal/llm/markov_generator.go 14 3
M internal/llm/tfidf_engine.go 8 0
internal/llm/bm25_engine.go (modified)
@@ -2,11 +2,13 @@ package llm
22
 
33
 import (
44
 	"math"
5
+	"sync"
56
 )
67
 
78
 // BM25Engine implements BM25 ranking algorithm (superior to basic TF-IDF)
89
 // BM25 is the industry standard for text search and ranking
910
 type BM25Engine struct {
11
+	mu            sync.RWMutex
1012
 	vocabulary    map[string]int      // word -> index
1113
 	idf           map[string]float64  // word -> inverse document frequency
1214
 	docLengths    []int               // document lengths
@@ -44,6 +46,9 @@ func (engine *BM25Engine) SetParameters(k1, b float64) {
4446
 
4547
 // BuildCorpus builds the BM25 corpus from documents
4648
 func (engine *BM25Engine) BuildCorpus(documents []string) {
49
+	engine.mu.Lock()
50
+	defer engine.mu.Unlock()
51
+
4752
 	// First pass: extract terms and calculate document frequencies
4853
 	documentFreq := make(map[string]int)
4954
 	engine.docLengths = make([]int, len(documents))
@@ -128,6 +133,9 @@ func (engine *BM25Engine) tokenize(text string) []string {
128133
 
129134
 // Score calculates BM25 score for a query against a document
130135
 func (engine *BM25Engine) Score(query string, document string) float64 {
136
+	engine.mu.RLock()
137
+	defer engine.mu.RUnlock()
138
+
131139
 	queryTerms := engine.extractNGrams(query)
132140
 	docTerms := engine.extractNGrams(document)
133141
 
internal/llm/ensemble_system.go (modified)
@@ -3,10 +3,12 @@ package llm
33
 import (
44
 	"math"
55
 	"sort"
6
+	"sync"
67
 )
78
 
89
 // EnsembleSystem combines multiple ML techniques for optimal insult selection
910
 type EnsembleSystem struct {
11
+	mu               sync.RWMutex
1012
 	tfidfEngine      *TFIDFEngine
1113
 	bm25Engine       *BM25Engine  // NEW: Industry-standard BM25 ranking
1214
 	markovGen        *MarkovGenerator
@@ -72,9 +74,13 @@ func NewEnsembleSystem(db *InsultDatabase, scorer *InsultScorer, hist *InsultHis
7274
 
7375
 // Train trains all ML components on the insult database
7476
 func (es *EnsembleSystem) Train() {
77
+	es.mu.Lock()
7578
 	if es.trained {
79
+		es.mu.Unlock()
7680
 		return // Already trained
7781
 	}
82
+	es.trained = true // Mark as training to prevent concurrent attempts
83
+	es.mu.Unlock()
7884
 
7985
 	// Collect all insult texts
8086
 	insults := make([]string, 0, len(es.database.Insults))
@@ -90,8 +96,6 @@ func (es *EnsembleSystem) Train() {
9096
 
9197
 	// Train Markov generator
9298
 	es.markovGen.Train(insults)
93
-
94
-	es.trained = true
9599
 }
96100
 
97101
 // GenerateInsult generates the best possible insult using ensemble methods
@@ -100,7 +104,10 @@ func (es *EnsembleSystem) GenerateInsult(
100104
 	personality string,
101105
 ) string {
102106
 	// Ensure training is done
103
-	if !es.trained {
107
+	es.mu.RLock()
108
+	trained := es.trained
109
+	es.mu.RUnlock()
110
+	if !trained {
104111
 		es.Train()
105112
 	}
106113
 
internal/llm/markov_generator.go (modified)
@@ -3,11 +3,13 @@ package llm
33
 import (
44
 	"math/rand"
55
 	"strings"
6
+	"sync"
67
 	"time"
78
 )
89
 
910
 // MarkovGenerator generates novel insults using Markov chains
1011
 type MarkovGenerator struct {
12
+	mu          sync.RWMutex
1113
 	chains      map[string]map[string]int // state -> next_word -> count
1214
 	starters    []string                   // possible starting words
1315
 	order       int                        // n-gram order (2 = bigram)
@@ -30,13 +32,16 @@ func NewMarkovGenerator(order int) *MarkovGenerator {
3032
 
3133
 // Train trains the Markov chain on a corpus of insults
3234
 func (mg *MarkovGenerator) Train(insults []string) {
35
+	mg.mu.Lock()
36
+	defer mg.mu.Unlock()
37
+
3338
 	for _, insult := range insults {
34
-		mg.trainOnText(insult)
39
+		mg.trainOnTextUnlocked(insult)
3540
 	}
3641
 }
3742
 
38
-// trainOnText trains on a single text
39
-func (mg *MarkovGenerator) trainOnText(text string) {
43
+// trainOnTextUnlocked trains on a single text (caller must hold lock)
44
+func (mg *MarkovGenerator) trainOnTextUnlocked(text string) {
4045
 	words := mg.tokenize(text)
4146
 	if len(words) < mg.order+1 {
4247
 		return
@@ -91,6 +96,9 @@ func (mg *MarkovGenerator) tokenize(text string) []string {
9196
 
9297
 // Generate generates a novel insult
9398
 func (mg *MarkovGenerator) Generate() string {
99
+	mg.mu.RLock()
100
+	defer mg.mu.RUnlock()
101
+
94102
 	if len(mg.starters) == 0 || len(mg.chains) == 0 {
95103
 		return "" // Not trained yet
96104
 	}
@@ -184,6 +192,9 @@ func (mg *MarkovGenerator) isPunctuation(word string) bool {
184192
 
185193
 // GenerateContextual generates an insult with context hints
186194
 func (mg *MarkovGenerator) GenerateContextual(seedWords []string) string {
195
+	mg.mu.RLock()
196
+	defer mg.mu.RUnlock()
197
+
187198
 	if len(mg.chains) == 0 {
188199
 		return ""
189200
 	}
internal/llm/tfidf_engine.go (modified)
@@ -3,11 +3,13 @@ package llm
33
 import (
44
 	"math"
55
 	"strings"
6
+	"sync"
67
 	"unicode"
78
 )
89
 
910
 // TFIDFEngine implements semantic similarity using TF-IDF vectors
1011
 type TFIDFEngine struct {
12
+	mu             sync.RWMutex
1113
 	vocabulary     map[string]int    // word -> index
1214
 	idf            map[string]float64 // word -> inverse document frequency
1315
 	documentCount  int
@@ -32,6 +34,9 @@ func NewTFIDFEngine() *TFIDFEngine {
3234
 
3335
 // BuildCorpus builds the TF-IDF corpus from a collection of documents
3436
 func (engine *TFIDFEngine) BuildCorpus(documents []string) {
37
+	engine.mu.Lock()
38
+	defer engine.mu.Unlock()
39
+
3540
 	// First pass: build vocabulary and count document frequencies
3641
 	documentFreq := make(map[string]int)
3742
 
@@ -113,6 +118,9 @@ func (engine *TFIDFEngine) tokenize(text string) []string {
113118
 
114119
 // Vectorize converts text to TF-IDF vector
115120
 func (engine *TFIDFEngine) Vectorize(text string) map[string]float64 {
121
+	engine.mu.RLock()
122
+	defer engine.mu.RUnlock()
123
+
116124
 	vector := make(map[string]float64)
117125
 	tokens := engine.extractNGrams(text)
118126