Go · 6740 bytes Raw Blame History
1 package llm
2
import (
	"math"
	"sort"
	"strings"
	"sync"
	"unicode"
)
9
// TFIDFEngine implements semantic similarity using TF-IDF vectors.
// Corpus state (vocabulary, idf, documentCount) is guarded by mu, so the
// exported methods can be used concurrently once a corpus is built.
type TFIDFEngine struct {
	mu            sync.RWMutex
	vocabulary    map[string]int     // word -> index
	idf           map[string]float64 // word -> inverse document frequency
	documentCount int
	ngramRange    [2]int // min and max n-gram size
}

// Document represents a text document with its TF-IDF vector.
type Document struct {
	Text   string
	Vector map[string]float64 // sparse vector representation
}

// NewTFIDFEngine creates a new TF-IDF engine with empty corpus state and
// an n-gram range of 1..3 (unigrams through trigrams).
func NewTFIDFEngine() *TFIDFEngine {
	engine := &TFIDFEngine{}
	engine.vocabulary = make(map[string]int)
	engine.idf = make(map[string]float64)
	engine.ngramRange = [2]int{1, 3}
	return engine
}
34
35 // BuildCorpus builds the TF-IDF corpus from a collection of documents
36 func (engine *TFIDFEngine) BuildCorpus(documents []string) {
37 engine.mu.Lock()
38 defer engine.mu.Unlock()
39
40 // First pass: build vocabulary and count document frequencies
41 documentFreq := make(map[string]int)
42
43 for _, doc := range documents {
44 tokens := engine.extractNGrams(doc)
45 seen := make(map[string]bool)
46
47 for _, token := range tokens {
48 if !seen[token] {
49 documentFreq[token]++
50 seen[token] = true
51 }
52
53 if _, exists := engine.vocabulary[token]; !exists {
54 engine.vocabulary[token] = len(engine.vocabulary)
55 }
56 }
57 }
58
59 engine.documentCount = len(documents)
60
61 // Calculate IDF for each term
62 for term, docFreq := range documentFreq {
63 // IDF = log(N / df) where N is total docs, df is docs containing term
64 engine.idf[term] = math.Log(float64(engine.documentCount) / float64(docFreq))
65 }
66 }
67
68 // extractNGrams extracts n-grams from text
69 func (engine *TFIDFEngine) extractNGrams(text string) []string {
70 text = strings.ToLower(text)
71 words := engine.tokenize(text)
72
73 var ngrams []string
74
75 // Generate n-grams for all sizes in range
76 for n := engine.ngramRange[0]; n <= engine.ngramRange[1]; n++ {
77 if n > len(words) {
78 break
79 }
80
81 for i := 0; i <= len(words)-n; i++ {
82 ngram := strings.Join(words[i:i+n], " ")
83 ngrams = append(ngrams, ngram)
84 }
85 }
86
87 return ngrams
88 }
89
90 // tokenize splits text into words
91 func (engine *TFIDFEngine) tokenize(text string) []string {
92 var words []string
93 var currentWord strings.Builder
94
95 for _, r := range text {
96 if unicode.IsLetter(r) || unicode.IsNumber(r) || r == '-' || r == '_' {
97 currentWord.WriteRune(r)
98 } else {
99 if currentWord.Len() > 0 {
100 word := currentWord.String()
101 if len(word) > 1 { // Skip single characters
102 words = append(words, word)
103 }
104 currentWord.Reset()
105 }
106 }
107 }
108
109 if currentWord.Len() > 0 {
110 word := currentWord.String()
111 if len(word) > 1 {
112 words = append(words, word)
113 }
114 }
115
116 return words
117 }
118
119 // Vectorize converts text to TF-IDF vector
120 func (engine *TFIDFEngine) Vectorize(text string) map[string]float64 {
121 engine.mu.RLock()
122 defer engine.mu.RUnlock()
123
124 vector := make(map[string]float64)
125 tokens := engine.extractNGrams(text)
126
127 // Calculate term frequencies
128 termFreq := make(map[string]int)
129 for _, token := range tokens {
130 termFreq[token]++
131 }
132
133 // Calculate TF-IDF for each term
134 totalTerms := len(tokens)
135 for term, freq := range termFreq {
136 // TF = freq / total_terms
137 tf := float64(freq) / float64(totalTerms)
138
139 // Get IDF (use 1.0 if term not in vocabulary - rare term)
140 idf := 1.0
141 if val, exists := engine.idf[term]; exists {
142 idf = val
143 }
144
145 // TF-IDF = TF * IDF
146 vector[term] = tf * idf
147 }
148
149 // Normalize vector
150 return engine.normalizeVector(vector)
151 }
152
153 // normalizeVector normalizes a vector to unit length
154 func (engine *TFIDFEngine) normalizeVector(vector map[string]float64) map[string]float64 {
155 // Calculate magnitude
156 var sumSquares float64
157 for _, value := range vector {
158 sumSquares += value * value
159 }
160 magnitude := math.Sqrt(sumSquares)
161
162 if magnitude == 0 {
163 return vector
164 }
165
166 // Normalize
167 normalized := make(map[string]float64)
168 for term, value := range vector {
169 normalized[term] = value / magnitude
170 }
171
172 return normalized
173 }
174
175 // CosineSimilarity calculates cosine similarity between two vectors
176 func (engine *TFIDFEngine) CosineSimilarity(vec1, vec2 map[string]float64) float64 {
177 // Calculate dot product
178 var dotProduct float64
179 for term, val1 := range vec1 {
180 if val2, exists := vec2[term]; exists {
181 dotProduct += val1 * val2
182 }
183 }
184
185 // Vectors are already normalized, so similarity = dot product
186 return dotProduct
187 }
188
189 // FindMostSimilar finds the most similar documents to a query
190 func (engine *TFIDFEngine) FindMostSimilar(
191 query string,
192 documents []Document,
193 topK int,
194 ) []SimilarityScore {
195 queryVec := engine.Vectorize(query)
196
197 scores := make([]SimilarityScore, 0, len(documents))
198 for i, doc := range documents {
199 similarity := engine.CosineSimilarity(queryVec, doc.Vector)
200 scores = append(scores, SimilarityScore{
201 Index: i,
202 Similarity: similarity,
203 Text: doc.Text,
204 })
205 }
206
207 // Sort by similarity descending
208 sortSimilarityScores(scores)
209
210 // Return top K
211 if len(scores) > topK {
212 scores = scores[:topK]
213 }
214
215 return scores
216 }
217
// SimilarityScore represents a similarity score for a document.
type SimilarityScore struct {
	Index      int     // position of the document in the input slice
	Similarity float64 // cosine similarity against the query vector
	Text       string  // original document text
}

// sortSimilarityScores sorts scores in place, highest similarity first.
// The sort is stable, preserving the tie ordering of the previous
// bubble-sort implementation (its strict-< swap never reordered equal
// elements), while running in O(n log n) instead of O(n^2).
func sortSimilarityScores(scores []SimilarityScore) {
	sort.SliceStable(scores, func(i, j int) bool {
		return scores[i].Similarity > scores[j].Similarity
	})
}
237
238 // ExtractKeyPhrases extracts important phrases from text using TF-IDF
239 func (engine *TFIDFEngine) ExtractKeyPhrases(text string, topN int) []string {
240 vector := engine.Vectorize(text)
241
242 // Convert to sorted list
243 type termScore struct {
244 term string
245 score float64
246 }
247
248 scores := make([]termScore, 0, len(vector))
249 for term, score := range vector {
250 scores = append(scores, termScore{term, score})
251 }
252
253 // Sort by score descending
254 for i := 0; i < len(scores)-1; i++ {
255 for j := 0; j < len(scores)-i-1; j++ {
256 if scores[j].score < scores[j+1].score {
257 scores[j], scores[j+1] = scores[j+1], scores[j]
258 }
259 }
260 }
261
262 // Extract top N terms
263 result := make([]string, 0, topN)
264 for i := 0; i < topN && i < len(scores); i++ {
265 result = append(result, scores[i].term)
266 }
267
268 return result
269 }
270
271 // CalculateSemanticScore calculates semantic similarity between command and insult
272 func (engine *TFIDFEngine) CalculateSemanticScore(
273 command string,
274 insult string,
275 ) float64 {
276 cmdVec := engine.Vectorize(command)
277 insultVec := engine.Vectorize(insult)
278
279 return engine.CosineSimilarity(cmdVec, insultVec)
280 }
281