| 1 | package llm |
| 2 | |
import (
	"math"
	"sort"
	"strings"
	"sync"
	"unicode"
	"unicode/utf8"
)
| 9 | |
// TFIDFEngine implements semantic similarity using TF-IDF vectors.
// Call BuildCorpus before Vectorize so IDF weights are populated; until
// then every term falls back to the default IDF used in Vectorize.
// All exported methods take mu, so the engine is safe for concurrent use.
type TFIDFEngine struct {
	mu            sync.RWMutex
	vocabulary    map[string]int     // word -> index (assigned in corpus order)
	idf           map[string]float64 // word -> inverse document frequency
	documentCount int                // number of documents in the last built corpus
	ngramRange    [2]int             // min and max n-gram size, inclusive
}
| 18 | |
// Document represents a text document together with its precomputed
// TF-IDF vector (as produced by TFIDFEngine.Vectorize).
type Document struct {
	Text   string
	Vector map[string]float64 // sparse vector: term -> TF-IDF weight
}
| 24 | |
| 25 | // NewTFIDFEngine creates a new TF-IDF engine |
| 26 | func NewTFIDFEngine() *TFIDFEngine { |
| 27 | return &TFIDFEngine{ |
| 28 | vocabulary: make(map[string]int), |
| 29 | idf: make(map[string]float64), |
| 30 | documentCount: 0, |
| 31 | ngramRange: [2]int{1, 3}, // unigrams, bigrams, trigrams |
| 32 | } |
| 33 | } |
| 34 | |
| 35 | // BuildCorpus builds the TF-IDF corpus from a collection of documents |
| 36 | func (engine *TFIDFEngine) BuildCorpus(documents []string) { |
| 37 | engine.mu.Lock() |
| 38 | defer engine.mu.Unlock() |
| 39 | |
| 40 | // First pass: build vocabulary and count document frequencies |
| 41 | documentFreq := make(map[string]int) |
| 42 | |
| 43 | for _, doc := range documents { |
| 44 | tokens := engine.extractNGrams(doc) |
| 45 | seen := make(map[string]bool) |
| 46 | |
| 47 | for _, token := range tokens { |
| 48 | if !seen[token] { |
| 49 | documentFreq[token]++ |
| 50 | seen[token] = true |
| 51 | } |
| 52 | |
| 53 | if _, exists := engine.vocabulary[token]; !exists { |
| 54 | engine.vocabulary[token] = len(engine.vocabulary) |
| 55 | } |
| 56 | } |
| 57 | } |
| 58 | |
| 59 | engine.documentCount = len(documents) |
| 60 | |
| 61 | // Calculate IDF for each term |
| 62 | for term, docFreq := range documentFreq { |
| 63 | // IDF = log(N / df) where N is total docs, df is docs containing term |
| 64 | engine.idf[term] = math.Log(float64(engine.documentCount) / float64(docFreq)) |
| 65 | } |
| 66 | } |
| 67 | |
| 68 | // extractNGrams extracts n-grams from text |
| 69 | func (engine *TFIDFEngine) extractNGrams(text string) []string { |
| 70 | text = strings.ToLower(text) |
| 71 | words := engine.tokenize(text) |
| 72 | |
| 73 | var ngrams []string |
| 74 | |
| 75 | // Generate n-grams for all sizes in range |
| 76 | for n := engine.ngramRange[0]; n <= engine.ngramRange[1]; n++ { |
| 77 | if n > len(words) { |
| 78 | break |
| 79 | } |
| 80 | |
| 81 | for i := 0; i <= len(words)-n; i++ { |
| 82 | ngram := strings.Join(words[i:i+n], " ") |
| 83 | ngrams = append(ngrams, ngram) |
| 84 | } |
| 85 | } |
| 86 | |
| 87 | return ngrams |
| 88 | } |
| 89 | |
| 90 | // tokenize splits text into words |
| 91 | func (engine *TFIDFEngine) tokenize(text string) []string { |
| 92 | var words []string |
| 93 | var currentWord strings.Builder |
| 94 | |
| 95 | for _, r := range text { |
| 96 | if unicode.IsLetter(r) || unicode.IsNumber(r) || r == '-' || r == '_' { |
| 97 | currentWord.WriteRune(r) |
| 98 | } else { |
| 99 | if currentWord.Len() > 0 { |
| 100 | word := currentWord.String() |
| 101 | if len(word) > 1 { // Skip single characters |
| 102 | words = append(words, word) |
| 103 | } |
| 104 | currentWord.Reset() |
| 105 | } |
| 106 | } |
| 107 | } |
| 108 | |
| 109 | if currentWord.Len() > 0 { |
| 110 | word := currentWord.String() |
| 111 | if len(word) > 1 { |
| 112 | words = append(words, word) |
| 113 | } |
| 114 | } |
| 115 | |
| 116 | return words |
| 117 | } |
| 118 | |
| 119 | // Vectorize converts text to TF-IDF vector |
| 120 | func (engine *TFIDFEngine) Vectorize(text string) map[string]float64 { |
| 121 | engine.mu.RLock() |
| 122 | defer engine.mu.RUnlock() |
| 123 | |
| 124 | vector := make(map[string]float64) |
| 125 | tokens := engine.extractNGrams(text) |
| 126 | |
| 127 | // Calculate term frequencies |
| 128 | termFreq := make(map[string]int) |
| 129 | for _, token := range tokens { |
| 130 | termFreq[token]++ |
| 131 | } |
| 132 | |
| 133 | // Calculate TF-IDF for each term |
| 134 | totalTerms := len(tokens) |
| 135 | for term, freq := range termFreq { |
| 136 | // TF = freq / total_terms |
| 137 | tf := float64(freq) / float64(totalTerms) |
| 138 | |
| 139 | // Get IDF (use 1.0 if term not in vocabulary - rare term) |
| 140 | idf := 1.0 |
| 141 | if val, exists := engine.idf[term]; exists { |
| 142 | idf = val |
| 143 | } |
| 144 | |
| 145 | // TF-IDF = TF * IDF |
| 146 | vector[term] = tf * idf |
| 147 | } |
| 148 | |
| 149 | // Normalize vector |
| 150 | return engine.normalizeVector(vector) |
| 151 | } |
| 152 | |
| 153 | // normalizeVector normalizes a vector to unit length |
| 154 | func (engine *TFIDFEngine) normalizeVector(vector map[string]float64) map[string]float64 { |
| 155 | // Calculate magnitude |
| 156 | var sumSquares float64 |
| 157 | for _, value := range vector { |
| 158 | sumSquares += value * value |
| 159 | } |
| 160 | magnitude := math.Sqrt(sumSquares) |
| 161 | |
| 162 | if magnitude == 0 { |
| 163 | return vector |
| 164 | } |
| 165 | |
| 166 | // Normalize |
| 167 | normalized := make(map[string]float64) |
| 168 | for term, value := range vector { |
| 169 | normalized[term] = value / magnitude |
| 170 | } |
| 171 | |
| 172 | return normalized |
| 173 | } |
| 174 | |
| 175 | // CosineSimilarity calculates cosine similarity between two vectors |
| 176 | func (engine *TFIDFEngine) CosineSimilarity(vec1, vec2 map[string]float64) float64 { |
| 177 | // Calculate dot product |
| 178 | var dotProduct float64 |
| 179 | for term, val1 := range vec1 { |
| 180 | if val2, exists := vec2[term]; exists { |
| 181 | dotProduct += val1 * val2 |
| 182 | } |
| 183 | } |
| 184 | |
| 185 | // Vectors are already normalized, so similarity = dot product |
| 186 | return dotProduct |
| 187 | } |
| 188 | |
| 189 | // FindMostSimilar finds the most similar documents to a query |
| 190 | func (engine *TFIDFEngine) FindMostSimilar( |
| 191 | query string, |
| 192 | documents []Document, |
| 193 | topK int, |
| 194 | ) []SimilarityScore { |
| 195 | queryVec := engine.Vectorize(query) |
| 196 | |
| 197 | scores := make([]SimilarityScore, 0, len(documents)) |
| 198 | for i, doc := range documents { |
| 199 | similarity := engine.CosineSimilarity(queryVec, doc.Vector) |
| 200 | scores = append(scores, SimilarityScore{ |
| 201 | Index: i, |
| 202 | Similarity: similarity, |
| 203 | Text: doc.Text, |
| 204 | }) |
| 205 | } |
| 206 | |
| 207 | // Sort by similarity descending |
| 208 | sortSimilarityScores(scores) |
| 209 | |
| 210 | // Return top K |
| 211 | if len(scores) > topK { |
| 212 | scores = scores[:topK] |
| 213 | } |
| 214 | |
| 215 | return scores |
| 216 | } |
| 217 | |
// SimilarityScore represents a similarity score for a document.
type SimilarityScore struct {
	Index      int
	Similarity float64
	Text       string
}

// sortSimilarityScores sorts scores in place by descending similarity.
// The sort is stable — equal similarities keep their relative order,
// matching the previous bubble sort's tie behavior — but runs in
// O(n log n) instead of O(n^2).
func sortSimilarityScores(scores []SimilarityScore) {
	sort.SliceStable(scores, func(i, j int) bool {
		return scores[i].Similarity > scores[j].Similarity
	})
}
| 237 | |
| 238 | // ExtractKeyPhrases extracts important phrases from text using TF-IDF |
| 239 | func (engine *TFIDFEngine) ExtractKeyPhrases(text string, topN int) []string { |
| 240 | vector := engine.Vectorize(text) |
| 241 | |
| 242 | // Convert to sorted list |
| 243 | type termScore struct { |
| 244 | term string |
| 245 | score float64 |
| 246 | } |
| 247 | |
| 248 | scores := make([]termScore, 0, len(vector)) |
| 249 | for term, score := range vector { |
| 250 | scores = append(scores, termScore{term, score}) |
| 251 | } |
| 252 | |
| 253 | // Sort by score descending |
| 254 | for i := 0; i < len(scores)-1; i++ { |
| 255 | for j := 0; j < len(scores)-i-1; j++ { |
| 256 | if scores[j].score < scores[j+1].score { |
| 257 | scores[j], scores[j+1] = scores[j+1], scores[j] |
| 258 | } |
| 259 | } |
| 260 | } |
| 261 | |
| 262 | // Extract top N terms |
| 263 | result := make([]string, 0, topN) |
| 264 | for i := 0; i < topN && i < len(scores); i++ { |
| 265 | result = append(result, scores[i].term) |
| 266 | } |
| 267 | |
| 268 | return result |
| 269 | } |
| 270 | |
| 271 | // CalculateSemanticScore calculates semantic similarity between command and insult |
| 272 | func (engine *TFIDFEngine) CalculateSemanticScore( |
| 273 | command string, |
| 274 | insult string, |
| 275 | ) float64 { |
| 276 | cmdVec := engine.Vectorize(command) |
| 277 | insultVec := engine.Vectorize(insult) |
| 278 | |
| 279 | return engine.CosineSimilarity(cmdVec, insultVec) |
| 280 | } |
| 281 |