| 1 | package llm |
| 2 | |
import (
	"math"
	"sort"
	"strings"
	"sync"
	"unicode"
	"unicode/utf8"
)
| 9 | |
// TFIDFEngine implements semantic similarity using TF-IDF vectors.
// Call BuildCorpus before Vectorize so IDF weights are populated; until
// then every term falls back to the default IDF used in Vectorize.
// All exported methods take mu, so the engine is safe for concurrent use.
type TFIDFEngine struct {
	mu            sync.RWMutex
	vocabulary    map[string]int     // word -> index (assigned in corpus order)
	idf           map[string]float64 // word -> inverse document frequency
	documentCount int                // number of documents in the last built corpus
	ngramRange    [2]int             // min and max n-gram size, inclusive
}
| 18 | |
// Document represents a text document together with its precomputed
// TF-IDF vector (as produced by TFIDFEngine.Vectorize).
type Document struct {
	Text   string
	Vector map[string]float64 // sparse vector: term -> TF-IDF weight
}
| 24 | |
| 25 | // NewTFIDFEngine creates a new TF-IDF engine |
| 26 | func NewTFIDFEngine() *TFIDFEngine { |
| 27 | return &TFIDFEngine{ |
| 28 | vocabulary: make(map[string]int), |
| 29 | idf: make(map[string]float64), |
| 30 | documentCount: 0, |
| 31 | ngramRange: [2]int{1, 3}, // unigrams, bigrams, trigrams |
| 32 | } |
| 33 | } |
| 34 | |
| 35 | // BuildCorpus builds the TF-IDF corpus from a collection of documents |
| 36 | func (engine *TFIDFEngine) BuildCorpus(documents []string) { |
| 37 | engine.mu.Lock() |
| 38 | defer engine.mu.Unlock() |
| 39 | |
| 40 | // First pass: build vocabulary and count document frequencies |
| 41 | documentFreq := make(map[string]int) |
| 42 | |
| 43 | for _, doc := range documents { |
| 44 | tokens := engine.extractNGrams(doc) |
| 45 | seen := make(map[string]bool) |
| 46 | |
| 47 | for _, token := range tokens { |
| 48 | if !seen[token] { |
| 49 | documentFreq[token]++ |
| 50 | seen[token] = true |
| 51 | } |
| 52 | |
| 53 | if _, exists := engine.vocabulary[token]; !exists { |
| 54 | engine.vocabulary[token] = len(engine.vocabulary) |
| 55 | } |
| 56 | } |
| 57 | } |
| 58 | |
| 59 | engine.documentCount = len(documents) |
| 60 | |
| 61 | // Calculate IDF for each term |
| 62 | for term, docFreq := range documentFreq { |
| 63 | // IDF = log(N / df) where N is total docs, df is docs containing term |
| 64 | engine.idf[term] = math.Log(float64(engine.documentCount) / float64(docFreq)) |
| 65 | } |
| 66 | } |
| 67 | |
| 68 | // extractNGrams extracts n-grams from text |
| 69 | func (engine *TFIDFEngine) extractNGrams(text string) []string { |
| 70 | text = strings.ToLower(text) |
| 71 | words := engine.tokenize(text) |
| 72 | |
| 73 | var ngrams []string |
| 74 | |
| 75 | // Generate n-grams for all sizes in range |
| 76 | for n := engine.ngramRange[0]; n <= engine.ngramRange[1]; n++ { |
| 77 | if n > len(words) { |
| 78 | break |
| 79 | } |
| 80 | |
| 81 | for i := 0; i <= len(words)-n; i++ { |
| 82 | ngram := strings.Join(words[i:i+n], " ") |
| 83 | ngrams = append(ngrams, ngram) |
| 84 | } |
| 85 | } |
| 86 | |
| 87 | return ngrams |
| 88 | } |
| 89 | |
| 90 | // tokenize splits text into words |
| 91 | func (engine *TFIDFEngine) tokenize(text string) []string { |
| 92 | var words []string |
| 93 | var currentWord strings.Builder |
| 94 | |
| 95 | for _, r := range text { |
| 96 | if unicode.IsLetter(r) || unicode.IsNumber(r) || r == '-' || r == '_' { |
| 97 | currentWord.WriteRune(r) |
| 98 | } else { |
| 99 | if currentWord.Len() > 0 { |
| 100 | word := currentWord.String() |
| 101 | if len(word) > 1 { // Skip single characters |
| 102 | words = append(words, word) |
| 103 | } |
| 104 | currentWord.Reset() |
| 105 | } |
| 106 | } |
| 107 | } |
| 108 | |
| 109 | if currentWord.Len() > 0 { |
| 110 | word := currentWord.String() |
| 111 | if len(word) > 1 { |
| 112 | words = append(words, word) |
| 113 | } |
| 114 | } |
| 115 | |
| 116 | return words |
| 117 | } |
| 118 | |
| 119 | // Vectorize converts text to TF-IDF vector |
| 120 | func (engine *TFIDFEngine) Vectorize(text string) map[string]float64 { |
| 121 | engine.mu.RLock() |
| 122 | defer engine.mu.RUnlock() |
| 123 | |
| 124 | vector := make(map[string]float64) |
| 125 | tokens := engine.extractNGrams(text) |
| 126 | |
| 127 | // Calculate term frequencies |
| 128 | termFreq := make(map[string]int) |
| 129 | for _, token := range tokens { |
| 130 | termFreq[token]++ |
| 131 | } |
| 132 | |
| 133 | // Calculate TF-IDF for each term |
| 134 | totalTerms := len(tokens) |
| 135 | for term, freq := range termFreq { |
| 136 | // TF = freq / total_terms |
| 137 | tf := float64(freq) / float64(totalTerms) |
| 138 | |
| 139 | // Get IDF (use 1.0 if term not in vocabulary - rare term) |
| 140 | idf := 1.0 |
| 141 | if val, exists := engine.idf[term]; exists { |
| 142 | idf = val |
| 143 | } |
| 144 | |
| 145 | // TF-IDF = TF * IDF |
| 146 | vector[term] = tf * idf |
| 147 | } |
| 148 | |
| 149 | // Normalize vector |
| 150 | return engine.normalizeVector(vector) |
| 151 | } |
| 152 | |
| 153 | // normalizeVector normalizes a vector to unit length |
| 154 | func (engine *TFIDFEngine) normalizeVector(vector map[string]float64) map[string]float64 { |
| 155 | // Calculate magnitude |
| 156 | var sumSquares float64 |
| 157 | for _, value := range vector { |
| 158 | sumSquares += value * value |
| 159 | } |
| 160 | magnitude := math.Sqrt(sumSquares) |
| 161 | |
| 162 | if magnitude == 0 { |
| 163 | return vector |
| 164 | } |
| 165 | |
| 166 | // Normalize |
| 167 | normalized := make(map[string]float64) |
| 168 | for term, value := range vector { |
| 169 | normalized[term] = value / magnitude |
| 170 | } |
| 171 | |
| 172 | return normalized |
| 173 | } |
| 174 | |
| 175 | // CosineSimilarity calculates cosine similarity between two vectors |
| 176 | func (engine *TFIDFEngine) CosineSimilarity(vec1, vec2 map[string]float64) float64 { |
| 177 | // Calculate dot product |
| 178 | var dotProduct float64 |
| 179 | for term, val1 := range vec1 { |
| 180 | if val2, exists := vec2[term]; exists { |
| 181 | dotProduct += val1 * val2 |
| 182 | } |
| 183 | } |
| 184 | |
| 185 | // Vectors are already normalized, so similarity = dot product |
| 186 | return dotProduct |
| 187 | } |
| 188 | |
| 189 | // FindMostSimilar finds the most similar documents to a query |
| 190 | func (engine *TFIDFEngine) FindMostSimilar( |
| 191 | query string, |
| 192 | documents []Document, |
| 193 | topK int, |
| 194 | ) []SimilarityScore { |
| 195 | queryVec := engine.Vectorize(query) |
| 196 | |
| 197 | scores := make([]SimilarityScore, 0, len(documents)) |
| 198 | for i, doc := range documents { |
| 199 | similarity := engine.CosineSimilarity(queryVec, doc.Vector) |
| 200 | scores = append(scores, SimilarityScore{ |
| 201 | Index: i, |
| 202 | Similarity: similarity, |
| 203 | Text: doc.Text, |
| 204 | }) |
| 205 | } |
| 206 | |
| 207 | // Sort by similarity descending |
| 208 | sortSimilarityScores(scores) |
| 209 | |
| 210 | // Return top K |
| 211 | if len(scores) > topK { |
| 212 | scores = scores[:topK] |
| 213 | } |
| 214 | |
| 215 | return scores |
| 216 | } |
| 217 | |
// SimilarityScore represents a similarity score for a document.
type SimilarityScore struct {
	Index      int
	Similarity float64
	Text       string
}

// sortSimilarityScores sorts scores in place by descending similarity.
// The sort is stable — equal similarities keep their relative order,
// matching the previous bubble sort's tie behavior — but runs in
// O(n log n) instead of O(n^2).
func sortSimilarityScores(scores []SimilarityScore) {
	sort.SliceStable(scores, func(i, j int) bool {
		return scores[i].Similarity > scores[j].Similarity
	})
}
| 237 | |
| 238 | // ExtractKeyPhrases extracts important phrases from text using TF-IDF |
| 239 | func (engine *TFIDFEngine) ExtractKeyPhrases(text string, topN int) []string { |
| 240 | vector := engine.Vectorize(text) |
| 241 | |
| 242 | // Convert to sorted list |
| 243 | type termScore struct { |
| 244 | term string |
| 245 | score float64 |
| 246 | } |
| 247 | |
| 248 | scores := make([]termScore, 0, len(vector)) |
| 249 | for term, score := range vector { |
| 250 | scores = append(scores, termScore{term, score}) |
| 251 | } |
| 252 | |
| 253 | // Sort by score descending |
| 254 | for i := 0; i < len(scores)-1; i++ { |
| 255 | for j := 0; j < len(scores)-i-1; j++ { |
| 256 | if scores[j].score < scores[j+1].score { |
| 257 | scores[j], scores[j+1] = scores[j+1], scores[j] |
| 258 | } |
| 259 | } |
| 260 | } |
| 261 | |
| 262 | // Extract top N terms |
| 263 | result := make([]string, 0, topN) |
| 264 | for i := 0; i < topN && i < len(scores); i++ { |
| 265 | result = append(result, scores[i].term) |
| 266 | } |
| 267 | |
| 268 | return result |
| 269 | } |
| 270 | |
| 271 | // CalculateSemanticScore calculates semantic similarity between command and insult |
| 272 | func (engine *TFIDFEngine) CalculateSemanticScore( |
| 273 | command string, |
| 274 | insult string, |
| 275 | ) float64 { |
| 276 | cmdVec := engine.Vectorize(command) |
| 277 | insultVec := engine.Vectorize(insult) |
| 278 | |
| 279 | return engine.CosineSimilarity(cmdVec, insultVec) |
| 280 | } |
| 281 |