Python · 10534 bytes Raw Blame History
1 """
2 Evaluation and comparison tools for hybrid models
3
4 Compares:
5 - Pure Markov generation
6 - Pure LSTM generation
7 - Hybrid ensemble generation
8
9 Metrics:
10 - Phonotactic quality (consonant/vowel balance)
11 - Diversity (unique characters, patterns)
12 - Corpus similarity (how "on-theme" words are)
13 - Human preference (subjective, requires annotation)
14 """
15
16 import numpy as np
17 from typing import List, Dict, Tuple
18 from collections import Counter
19 import logging
20
21 logger = logging.getLogger(__name__)
22
23
class WordQualityMetrics:
    """
    Character-level heuristics for scoring generated words

    Every metric works on a single word and only looks at which characters
    are vowels (a, e, i, o, u) and which are consonants (the remaining
    ASCII letters, 'y' included). Any other character counts as neither.
    """

    def __init__(self):
        lower_vowels = 'aeiou'
        lower_consonants = 'bcdfghjklmnpqrstvwxyz'
        # Both cases are stored so raw (non-lowercased) input can be tested.
        self.vowels = set(lower_vowels + lower_vowels.upper())
        self.consonants = set(lower_consonants + lower_consonants.upper())

    def _longest_run(self, word: str, alphabet) -> int:
        """Length of the longest unbroken run of `alphabet` members in the lowercased word"""
        longest = 0
        run = 0
        for ch in word.lower():
            # Any character outside the alphabet breaks the current run.
            run = run + 1 if ch in alphabet else 0
            if run > longest:
                longest = run
        return longest

    def vowel_consonant_ratio(self, word: str) -> float:
        """
        Number of vowels divided by number of consonants

        Returns 1.0 when the word contains no consonants at all (this
        includes the empty word), which avoids a division by zero.
        Ideal ratio is around 0.4-0.6 for English-like words.
        """
        n_vowels = 0
        n_consonants = 0
        for ch in word:
            if ch in self.vowels:
                n_vowels += 1
            if ch in self.consonants:
                n_consonants += 1

        return n_vowels / n_consonants if n_consonants else 1.0

    def max_consecutive_consonants(self, word: str) -> int:
        """
        Longest run of consecutive consonants

        English rarely has >3 consecutive consonants
        """
        return self._longest_run(word, self.consonants)

    def max_consecutive_vowels(self, word: str) -> int:
        """Longest run of consecutive vowels"""
        return self._longest_run(word, self.vowels)

    def character_diversity(self, word: str) -> float:
        """
        Distinct (case-insensitive) characters divided by word length

        Returns 0.0 for an empty word. Higher means more varied, which is
        not always better.
        """
        if word:
            distinct = set(word.lower())
            return len(distinct) / len(word)
        return 0.0

    def bigram_diversity(self, word: str) -> float:
        """
        Distinct adjacent character pairs divided by total pairs

        Returns 0.0 for words shorter than two characters. Low values
        indicate repeated patterns.
        """
        lowered = word.lower()
        pair_count = len(lowered) - 1
        if pair_count < 1:
            return 0.0

        distinct_pairs = {lowered[i:i + 2] for i in range(pair_count)}
        return len(distinct_pairs) / pair_count

    def pronounceability_score(self, word: str) -> float:
        """
        Heuristic pronounceability score (0-1)

        Weighted blend of four sub-scores:
        - vowel/consonant ratio closeness to 0.5 (weight 0.3)
        - consonant runs, penalized beyond 3 (weight 0.3)
        - vowel runs, penalized beyond 2 (weight 0.2)
        - character diversity, saturating at 0.5 (weight 0.2)

        Words shorter than two characters score 0.0.
        """
        if not (word and len(word) >= 2):
            return 0.0

        # Distance from the ideal ratio, capped so the score bottoms out at 0.
        ratio_gap = abs(self.vowel_consonant_ratio(word) - 0.5)
        vc_score = 1.0 - min(ratio_gap, 0.5) / 0.5

        # Each extra consonant past three costs 0.2.
        cons_run = self.max_consecutive_consonants(word)
        if cons_run > 3:
            cons_score = max(0, 1.0 - (cons_run - 3) * 0.2)
        else:
            cons_score = 1.0

        # Each extra vowel past two costs 0.3.
        vow_run = self.max_consecutive_vowels(word)
        if vow_run > 2:
            vow_score = max(0, 1.0 - (vow_run - 2) * 0.3)
        else:
            vow_score = 1.0

        # Diversity of 0.5 or more earns full marks.
        div_score = min(self.character_diversity(word) * 2, 1.0)

        return (vc_score * 0.3 + cons_score * 0.3 + vow_score * 0.2 + div_score * 0.2)

    def evaluate_word(self, word: str) -> Dict:
        """Run every metric on `word` and collect the results in one dictionary"""
        report = {'word': word, 'length': len(word)}
        report['vc_ratio'] = self.vowel_consonant_ratio(word)
        report['max_cons_streak'] = self.max_consecutive_consonants(word)
        report['max_vow_streak'] = self.max_consecutive_vowels(word)
        report['char_diversity'] = self.character_diversity(word)
        report['bigram_diversity'] = self.bigram_diversity(word)
        report['pronounceability'] = self.pronounceability_score(word)
        return report
145
146
def compare_generation_methods(markov_instance, hybrid_model,
                               num_samples: int = 100,
                               temperature: float = 1.0,
                               max_length: int = 10) -> Dict:
    """
    Sample words from both generators and summarize their quality metrics

    Args:
        markov_instance: Pure Markov model; its genny(max_length=...,
            temperature=...) method is called once per sample
        hybrid_model: Hybrid Markov-LSTM model; its generate(max_length=...,
            temperature=...) method must return a (word, metadata) pair
        num_samples: Number of words to generate per method
        temperature: Generation temperature passed to both models
        max_length: Maximum word length passed to both models

    Returns:
        Dictionary with aggregated statistics under 'markov' and 'hybrid'
        (empty dict if a method produced no non-empty words) and the first
        20 raw words of each method under 'markov_words' / 'hybrid_words'
    """
    scorer = WordQualityMetrics()
    generation_kwargs = {'max_length': max_length, 'temperature': temperature}

    logger.info(f"Generating {num_samples} words with each method...")

    markov_words, hybrid_words = [], []
    for _ in range(num_samples):
        # The two models are sampled alternately, one word each per round.
        markov_words.append(markov_instance.genny(**generation_kwargs))

        word, _metadata = hybrid_model.generate(**generation_kwargs)
        hybrid_words.append(word)

    def summarize(evaluations):
        """Average every per-word metric and count distinct words"""
        if not evaluations:
            return {}

        def mean_of(field):
            return np.mean([entry[field] for entry in evaluations])

        distinct = len({entry['word'] for entry in evaluations})
        return {
            'avg_length': mean_of('length'),
            'avg_vc_ratio': mean_of('vc_ratio'),
            'avg_max_cons_streak': mean_of('max_cons_streak'),
            'avg_max_vow_streak': mean_of('max_vow_streak'),
            'avg_char_diversity': mean_of('char_diversity'),
            'avg_bigram_diversity': mean_of('bigram_diversity'),
            'avg_pronounceability': mean_of('pronounceability'),
            'unique_words': distinct,
            'unique_ratio': distinct / len(evaluations),
        }

    # Falsy results (e.g. empty strings) are left out of the statistics
    # but still appear in the raw sample lists returned below.
    markov_summary = summarize(
        [scorer.evaluate_word(w) for w in markov_words if w]
    )
    hybrid_summary = summarize(
        [scorer.evaluate_word(w) for w in hybrid_words if w]
    )

    return {
        'markov': markov_summary,
        'hybrid': hybrid_summary,
        'markov_words': markov_words[:20],
        'hybrid_words': hybrid_words[:20],
    }
214
215
def print_comparison_report(comparison: Dict, corpus_name: str = "Unknown"):
    """
    Pretty-print comparison report

    Args:
        comparison: Dictionary holding the statistic dictionaries 'markov'
            and 'hybrid' plus the sample word lists 'markov_words' and
            'hybrid_words'. Either statistic dictionary may be empty;
            missing metrics are shown as 0.
        corpus_name: Label printed in the report header
    """
    print(f"\n{'='*70}")
    print(f" Generation Comparison: {corpus_name}")
    print(f"{'='*70}\n")

    markov_stats = comparison['markov']
    hybrid_stats = comparison['hybrid']

    # (display name, statistics key, format for the value columns)
    metrics_to_compare = [
        ('Average Length', 'avg_length', '{:.2f}'),
        ('V/C Ratio', 'avg_vc_ratio', '{:.2f}'),
        ('Max Consonant Streak', 'avg_max_cons_streak', '{:.2f}'),
        ('Max Vowel Streak', 'avg_max_vow_streak', '{:.2f}'),
        ('Character Diversity', 'avg_char_diversity', '{:.2f}'),
        ('Bigram Diversity', 'avg_bigram_diversity', '{:.2f}'),
        ('Pronounceability', 'avg_pronounceability', '{:.2f}'),
        ('Unique Words', 'unique_words', '{:d}'),
        ('Unique Ratio', 'unique_ratio', '{:.2%}'),
    ]

    print(f"{'Metric':<25} {'Markov':>15} {'Hybrid':>15} {'Difference':>15}")
    print(f"{'-'*70}")

    for name, key, fmt in metrics_to_compare:
        markov_val = markov_stats.get(key, 0)
        hybrid_val = hybrid_stats.get(key, 0)
        diff = hybrid_val - markov_val

        # A missing metric falls back to the int 0 while the other side may
        # be a float mean, so the integer format is only safe when BOTH
        # values are ints; '+d' raises ValueError on a float difference.
        if isinstance(markov_val, int) and isinstance(hybrid_val, int):
            diff_str = f"{diff:+d}"
        else:
            diff_str = f"{diff:+.2f}"

        print(f"{name:<25} {fmt.format(markov_val):>15} {fmt.format(hybrid_val):>15} {diff_str:>15}")

    # Sample words
    print(f"\n{'='*70}")
    print(f" Sample Words")
    print(f"{'='*70}\n")

    print(f"{'Markov':<35} {'Hybrid':<35}")
    print(f"{'-'*70}")

    for markov_word, hybrid_word in zip(comparison['markov_words'][:10],
                                        comparison['hybrid_words'][:10]):
        # Sample lists can contain falsy entries such as None, which do not
        # accept an alignment format spec; show them as blank cells.
        left = markov_word or ''
        right = hybrid_word or ''
        print(f"{left:<35} {right:<35}")

    print(f"\n{'='*70}\n")
269
270
def analyze_hybrid_contributions(hybrid_model, num_samples: int = 20,
                                 max_length: int = 10) -> Dict:
    """
    Average the per-generation metadata reported by the hybrid model

    Args:
        hybrid_model: Model whose generate(max_length=...) returns a
            (word, metadata) pair; metadata is read with .get(), so any of
            the averaged keys may be absent and then counts as 0
        num_samples: Number of generations to run
        max_length: Maximum word length passed to the model

    Returns:
        Dictionary with the mean 'avg_lstm_confidence',
        'avg_markov_influence' and 'avg_lstm_influence' over all runs, plus
        the first five metadata dictionaries under 'samples'
    """
    traces = []
    for _ in range(num_samples):
        # Only the metadata is needed here; the generated word is discarded.
        _word, trace = hybrid_model.generate(max_length=max_length)
        traces.append(trace)

    averaged_keys = (
        'avg_lstm_confidence',
        'avg_markov_influence',
        'avg_lstm_influence',
    )
    summary = {
        key: np.mean([trace.get(key, 0) for trace in traces])
        for key in averaged_keys
    }
    summary['samples'] = traces[:5]  # Keep some samples for inspection
    return summary
296
297
def print_contribution_analysis(analysis: Dict):
    """
    Print hybrid contribution analysis

    Args:
        analysis: Dictionary with the overall 'avg_lstm_confidence',
            'avg_markov_influence' and 'avg_lstm_influence' values and a
            'samples' list of per-generation metadata dictionaries. Every
            per-sample field is optional: a missing 'characters' entry
            prints as an empty string and missing averages print as 0.
    """
    print(f"\n{'='*70}")
    print(f" Hybrid Model Contribution Analysis")
    print(f"{'='*70}\n")

    print(f"Average LSTM Confidence: {analysis['avg_lstm_confidence']:.2%}")
    print(f"Average Markov Influence: {analysis['avg_markov_influence']:.2%}")
    print(f"Average LSTM Influence: {analysis['avg_lstm_influence']:.2%}")

    print(f"\n{'='*70}")
    print(f" Sample Generation Traces")
    print(f"{'='*70}\n")

    for i, sample in enumerate(analysis['samples'], 1):
        # Read 'characters' with a default like the other per-sample fields,
        # so one incomplete metadata dict does not abort the whole report.
        characters = ''.join(sample.get('characters', []))

        print(f"Sample {i}:")
        print(f" Characters: {characters}")
        print(f" Avg LSTM confidence: {sample.get('avg_lstm_confidence', 0):.2%}")
        print(f" Avg Markov influence: {sample.get('avg_markov_influence', 0):.2%}")
        print(f" Avg LSTM influence: {sample.get('avg_lstm_influence', 0):.2%}")
        print()