//! Network Health Early Warning System
//!
//! Monitors overall network health and provides early warnings for potential issues.
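//!
//! A minimal usage sketch (hypothetical call site; assumes a Tokio runtime and
//! that this module's types are in scope):
//!
//! ```ignore
//! let mut monitor = NetworkHealthMonitor::new();
//! let report = monitor.perform_health_check().await;
//! println!("network health: {:.2}", report.overall_health_score);
//! for alert in monitor.get_critical_alerts() {
//!     eprintln!("CRITICAL: {}", alert.message);
//! }
//! ```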

use serde::{Deserialize, Serialize};
use std::collections::{HashMap, VecDeque};
use std::time::{SystemTime, UNIX_EPOCH}; // for unique, time-based alert ids
use tokio::time::Duration;

#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct NetworkHealthReport {
    pub timestamp: crate::SerializableInstant,
    pub overall_health_score: f32, // 0.0 to 1.0
    pub critical_alerts: Vec<HealthAlert>,
    pub warnings: Vec<HealthAlert>,
    pub network_metrics: GlobalNetworkMetrics,
    pub regional_health: HashMap<String, RegionalHealth>,
    pub trend_analysis: HealthTrend,
    pub risk_assessment: RiskAssessment,
}

#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct HealthAlert {
    pub id: String,
    pub severity: AlertSeverity,
    pub alert_type: AlertType,
    pub message: String,
    pub affected_nodes: Vec<String>,
    pub affected_regions: Vec<String>,
    pub first_detected: crate::SerializableInstant,
    pub estimated_impact: ImpactAssessment,
    pub recommended_actions: Vec<String>,
}

#[derive(Debug, Clone, Serialize, Deserialize)]
pub enum AlertSeverity {
    Critical, // Immediate action required
    High,     // Action required within 1 hour
    Medium,   // Action required within 4 hours
    Low,      // Monitor and plan
    Info,     // Informational only
}

#[derive(Debug, Clone, Serialize, Deserialize)]
pub enum AlertType {
    NodeFailures,
    NetworkPartition,
    StorageCapacity,
    PerformanceDegradation,
    SecurityThreat,
    DataIntegrity,
    ConnectivityIssues,
    ResourceExhaustion,
    GeographicDisturbance,
    SystemOverload,
}

#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ImpactAssessment {
    pub affected_data_percentage: f32,
    pub performance_impact: f32,
    pub availability_risk: f32,
    pub estimated_users_affected: u32,
    pub data_at_risk: u64, // bytes
}

#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct GlobalNetworkMetrics {
    pub total_nodes: u32,
    pub healthy_nodes: u32,
    pub unhealthy_nodes: u32,
    pub offline_nodes: u32,
    pub average_uptime: f32,
    pub network_latency_p50: Duration,
    pub network_latency_p95: Duration,
    pub total_storage_capacity: u64,
    pub used_storage_capacity: u64,
    pub data_redundancy_level: f32,
    pub throughput_mbps: f32,
    pub error_rate: f32,
}

#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct RegionalHealth {
    pub region: String,
    pub health_score: f32,
    pub node_count: u32,
    pub healthy_nodes: u32,
    pub average_latency: Duration,
    pub storage_utilization: f32,
    pub connectivity_status: ConnectivityStatus,
    pub risk_factors: Vec<RegionalRiskFactor>,
}

#[derive(Debug, Clone, Serialize, Deserialize)]
pub enum ConnectivityStatus {
    Excellent, // All connections stable
    Good,      // Minor connectivity issues
    Degraded,  // Noticeable connectivity problems
    Poor,      // Significant connectivity issues
    Critical,  // Major connectivity failures
}

#[derive(Debug, Clone, Serialize, Deserialize)]
pub enum RegionalRiskFactor {
    HighLatency,
    NodeConcentration,
    InfrastructureIssues,
    NetworkCongestion,
    GeographicEvents,
    RegulatoryChanges,
}

#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct HealthTrend {
    pub direction: TrendDirection,
    pub confidence: f32,
    pub time_window: Duration,
    pub key_indicators: Vec<TrendIndicator>,
    pub predicted_issues: Vec<PredictedIssue>,
}

#[derive(Debug, Clone, Serialize, Deserialize)]
pub enum TrendDirection {
    StronglyImproving,
    Improving,
    Stable,
    Declining,
    StronglyDeclining,
}

#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct TrendIndicator {
    pub metric: String,
    pub current_value: f32,
    pub trend_direction: TrendDirection,
    pub rate_of_change: f32,
    pub significance: f32,
}

#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct PredictedIssue {
    pub issue_type: AlertType,
    pub probability: f32,
    pub predicted_time: crate::SerializableInstant,
    pub potential_impact: ImpactAssessment,
    pub prevention_actions: Vec<String>,
}

#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct RiskAssessment {
    pub overall_risk_level: RiskLevel,
    pub data_loss_risk: f32,
    pub availability_risk: f32,
    pub performance_risk: f32,
    pub security_risk: f32,
    pub mitigation_effectiveness: f32,
    pub risk_factors: Vec<NetworkRiskFactor>,
}

#[derive(Debug, Clone, Serialize, Deserialize)]
pub enum RiskLevel {
    VeryLow,
    Low,
    Medium,
    High,
    Critical,
}

#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct NetworkRiskFactor {
    pub factor_type: String,
    pub severity: f32,
    pub likelihood: f32,
    pub impact_scope: String,
    pub mitigation_options: Vec<String>,
}

pub struct NetworkHealthMonitor {
    health_history: VecDeque<NetworkHealthReport>,
    active_alerts: HashMap<String, HealthAlert>,
    node_health_cache: HashMap<String, NodeHealthStatus>,
    regional_monitors: HashMap<String, RegionalMonitor>,
    alert_thresholds: AlertThresholds,
    predictive_models: HashMap<String, HealthPredictionModel>,
}

#[derive(Debug, Clone)]
struct NodeHealthStatus {
    node_id: String,
    last_seen: crate::SerializableInstant,
    health_score: f32,
    metrics: NodeMetrics,
    status: NodeStatus,
}

#[derive(Debug, Clone)]
struct NodeMetrics {
    cpu_usage: f32,
    memory_usage: f32,
    disk_usage: f32,
    network_latency: Duration,
    error_count: u32,
    uptime: Duration,
}

#[derive(Debug, Clone)]
enum NodeStatus {
    Healthy,
    Warning,
    Critical,
    Offline,
    Unknown,
}

struct RegionalMonitor {
    region: String,
    nodes: Vec<String>,
    health_score_history: VecDeque<f32>,
    connectivity_matrix: HashMap<String, HashMap<String, Duration>>,
    last_health_check: crate::SerializableInstant,
}

#[derive(Debug, Clone)]
struct AlertThresholds {
    node_failure_threshold: f32,
    network_latency_threshold: Duration,
    storage_utilization_threshold: f32,
    error_rate_threshold: f32,
    uptime_threshold: f32,
    redundancy_threshold: f32,
}

struct HealthPredictionModel {
    model_type: String,
    weights: Vec<f32>,
    accuracy: f32,
    training_data: VecDeque<HealthDataPoint>,
    last_prediction: Option<PredictedIssue>,
}

#[derive(Debug, Clone)]
struct HealthDataPoint {
    timestamp: crate::SerializableInstant,
    metrics: Vec<f32>,
    outcome: Option<AlertType>,
}

impl NetworkHealthMonitor {
    pub fn new() -> Self {
        Self {
            health_history: VecDeque::with_capacity(1440), // 24 hours of minute-by-minute data
            active_alerts: HashMap::new(),
            node_health_cache: HashMap::new(),
            regional_monitors: HashMap::new(),
            alert_thresholds: AlertThresholds::default(),
            predictive_models: HashMap::new(),
        }
    }

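    /// Runs one full monitoring pass: refreshes node state, computes global
    /// and regional metrics, analyzes trends, scores risk, and emits alerts.
    /// The resulting report is appended to the 24-hour rolling history.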
    pub async fn perform_health_check(&mut self) -> NetworkHealthReport {
        let timestamp = crate::SerializableInstant::now();

        // Update node health status
        self.update_node_health_status().await;

        // Calculate global metrics
        let network_metrics = self.calculate_global_metrics().await;

        // Assess regional health
        let regional_health = self.assess_regional_health().await;

        // Analyze trends
        let trend_analysis = self.analyze_health_trends();

        // Assess risks
        let risk_assessment = self.assess_network_risks(&network_metrics, &regional_health);

        // Calculate overall health score
        let overall_health_score =
            self.calculate_overall_health_score(&network_metrics, &regional_health, &risk_assessment);

        // Generate alerts
        let (critical_alerts, warnings) = self
            .generate_health_alerts(&network_metrics, &regional_health, &risk_assessment)
            .await;

        let report = NetworkHealthReport {
            timestamp,
            overall_health_score,
            critical_alerts,
            warnings,
            network_metrics,
            regional_health,
            trend_analysis,
            risk_assessment,
        };

        // Store in history, keeping at most 24 hours of samples
        self.health_history.push_back(report.clone());
        if self.health_history.len() > 1440 {
            self.health_history.pop_front();
        }

        // Update predictive models
        self.update_predictive_models(&report).await;

        report
    }

    pub async fn get_current_health_status(&self) -> Option<NetworkHealthReport> {
        self.health_history.back().cloned()
    }

    pub fn get_active_alerts(&self) -> Vec<&HealthAlert> {
        self.active_alerts.values().collect()
    }

    pub fn get_critical_alerts(&self) -> Vec<&HealthAlert> {
        self.active_alerts.values()
            .filter(|alert| matches!(alert.severity, AlertSeverity::Critical))
            .collect()
    }

    pub async fn predict_future_issues(&self, time_horizon: Duration) -> Vec<PredictedIssue> {
        let mut predictions = Vec::new();

        for model in self.predictive_models.values() {
            if let Some(prediction) = self.run_prediction_model(model, time_horizon).await {
                predictions.push(prediction);
            }
        }

        // Sort by probability-weighted impact, highest first; `total_cmp` is
        // NaN-safe, unlike the panic-prone `partial_cmp(..).unwrap()`
        predictions.sort_by(|a, b| {
            let score_a = a.probability * a.potential_impact.availability_risk;
            let score_b = b.probability * b.potential_impact.availability_risk;
            score_b.total_cmp(&score_a)
        });

        predictions
    }

    async fn update_node_health_status(&mut self) {
        // Placeholder: In reality, this would collect metrics from all nodes
        let now = crate::SerializableInstant::now();

        for node_id in ["node1", "node2", "node3"].iter() {
            let health_status = NodeHealthStatus {
                node_id: node_id.to_string(),
                last_seen: now,
                // Synthetic score; placeholder until real node metrics arrive
                health_score: 0.85 + (now.elapsed().as_secs() as f32 % 100.0) / 1000.0,
                metrics: NodeMetrics {
                    cpu_usage: 0.4,
                    memory_usage: 0.6,
                    disk_usage: 0.3,
                    network_latency: Duration::from_millis(50),
                    error_count: 2,
                    uptime: Duration::from_secs(86400 * 30), // 30 days
                },
                status: NodeStatus::Healthy,
            };

            self.node_health_cache.insert(node_id.to_string(), health_status);
        }
    }

    async fn calculate_global_metrics(&self) -> GlobalNetworkMetrics {
        let total_nodes = self.node_health_cache.len() as u32;
        let healthy_nodes = self.node_health_cache.values()
            .filter(|node| matches!(node.status, NodeStatus::Healthy))
            .count() as u32;
        let unhealthy_nodes = self.node_health_cache.values()
            .filter(|node| matches!(node.status, NodeStatus::Warning | NodeStatus::Critical))
            .count() as u32;
        let offline_nodes = self.node_health_cache.values()
            .filter(|node| matches!(node.status, NodeStatus::Offline))
            .count() as u32;

        // Placeholder: uses the health score as a proxy for uptime until real
        // uptime telemetry is wired in
        let average_uptime = if !self.node_health_cache.is_empty() {
            self.node_health_cache.values()
                .map(|node| node.health_score)
                .sum::<f32>() / total_nodes as f32
        } else {
            0.0
        };

        let latencies: Vec<_> = self.node_health_cache.values()
            .map(|node| node.metrics.network_latency.as_millis() as f32)
            .collect();

        let network_latency_p50 = Duration::from_millis(
            self.calculate_percentile(&latencies, 0.5) as u64
        );
        let network_latency_p95 = Duration::from_millis(
            self.calculate_percentile(&latencies, 0.95) as u64
        );

        let total_storage_capacity = 100 * 1024 * 1024 * 1024u64; // assume 100 GB per node
        let used_storage_capacity = (total_storage_capacity as f32 * 0.4) as u64; // 40% used

        let error_rate = self.node_health_cache.values()
            .map(|node| node.metrics.error_count as f32)
            .sum::<f32>() / (total_nodes as f32).max(1.0) / 1000.0;

        GlobalNetworkMetrics {
            total_nodes,
            healthy_nodes,
            unhealthy_nodes,
            offline_nodes,
            average_uptime,
            network_latency_p50,
            network_latency_p95,
            // Scale the per-node figures up to network-wide totals
            total_storage_capacity: total_storage_capacity * total_nodes as u64,
            used_storage_capacity: used_storage_capacity * total_nodes as u64,
            data_redundancy_level: 2.5, // Average redundancy factor
            throughput_mbps: 150.0,
            error_rate,
        }
    }

    // Only reads `self`, so `&self` suffices
    async fn assess_regional_health(&self) -> HashMap<String, RegionalHealth> {
        let mut regional_health = HashMap::new();

        let regions = vec!["us-east", "us-west", "europe", "asia-pacific"];

        for region in regions {
            let nodes_in_region: Vec<_> = self.node_health_cache.keys()
                .filter(|_| true) // Placeholder: filter by region
                .take(2)
                .cloned()
                .collect();

            let node_count = nodes_in_region.len() as u32;
            let healthy_nodes = nodes_in_region.iter()
                .filter(|node_id| {
                    if let Some(node) = self.node_health_cache.get(*node_id) {
                        matches!(node.status, NodeStatus::Healthy)
                    } else {
                        false
                    }
                })
                .count() as u32;

            let average_latency = if !nodes_in_region.is_empty() {
                let total_latency: u128 = nodes_in_region.iter()
                    .filter_map(|node_id| self.node_health_cache.get(node_id))
                    .map(|node| node.metrics.network_latency.as_millis())
                    .sum();
                Duration::from_millis((total_latency / nodes_in_region.len() as u128) as u64)
            } else {
                Duration::from_millis(0)
            };

            let health_score = if node_count > 0 {
                healthy_nodes as f32 / node_count as f32
            } else {
                0.0
            };

            let connectivity_status = if health_score > 0.9 {
                ConnectivityStatus::Excellent
            } else if health_score > 0.8 {
                ConnectivityStatus::Good
            } else if health_score > 0.6 {
                ConnectivityStatus::Degraded
            } else if health_score > 0.3 {
                ConnectivityStatus::Poor
            } else {
                ConnectivityStatus::Critical
            };

            let regional = RegionalHealth {
                region: region.to_string(),
                health_score,
                node_count,
                healthy_nodes,
                average_latency,
                storage_utilization: 0.4, // 40% utilized
                connectivity_status,
                risk_factors: self.identify_regional_risk_factors(region, health_score),
            };

            regional_health.insert(region.to_string(), regional);
        }

        regional_health
    }

    fn analyze_health_trends(&self) -> HealthTrend {
        if self.health_history.len() < 5 {
            return HealthTrend::default();
        }

        // Overall scores from the last hour of reports, newest first
        let recent_scores: Vec<_> = self.health_history.iter()
            .rev()
            .take(60)
            .map(|report| report.overall_health_score)
            .collect();

        let trend_direction = self.calculate_trend_direction(&recent_scores);
        let confidence = self.calculate_trend_confidence(&recent_scores);

        let key_indicators = vec![
            TrendIndicator {
                metric: "Overall Health".to_string(),
                current_value: recent_scores.first().copied().unwrap_or(0.0),
                trend_direction: trend_direction.clone(),
                rate_of_change: self.calculate_rate_of_change(&recent_scores),
                significance: 0.9,
            },
            TrendIndicator {
                metric: "Node Availability".to_string(),
                current_value: 0.95,
                trend_direction: TrendDirection::Stable,
                rate_of_change: 0.001,
                significance: 0.8,
            },
        ];

        let predicted_issues = self.generate_trend_predictions(&recent_scores);

        HealthTrend {
            direction: trend_direction,
            confidence,
            time_window: Duration::from_secs(3600),
            key_indicators,
            predicted_issues,
        }
    }

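    /// Scores four risk dimensions (data loss, availability, performance,
    /// security) on 0.0..=1.0, then buckets their unweighted mean into a
    /// `RiskLevel`.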
    fn assess_network_risks(
        &self,
        metrics: &GlobalNetworkMetrics,
        _regional_health: &HashMap<String, RegionalHealth>,
    ) -> RiskAssessment {
        let data_loss_risk = if metrics.data_redundancy_level < 2.0 { 0.8 }
            else if metrics.data_redundancy_level < 2.5 { 0.4 }
            else { 0.1 };

        // max(1) guards the division while the node cache is still empty
        let availability_risk = 1.0 - (metrics.healthy_nodes as f32 / metrics.total_nodes.max(1) as f32);

        let performance_risk = if metrics.network_latency_p95 > Duration::from_millis(1000) { 0.7 }
            else if metrics.network_latency_p95 > Duration::from_millis(500) { 0.4 }
            else { 0.1 };

        let security_risk = metrics.error_rate * 10.0;

        let overall_risk_score = (data_loss_risk + availability_risk + performance_risk + security_risk) / 4.0;
        let overall_risk_level = if overall_risk_score > 0.8 { RiskLevel::Critical }
            else if overall_risk_score > 0.6 { RiskLevel::High }
            else if overall_risk_score > 0.4 { RiskLevel::Medium }
            else if overall_risk_score > 0.2 { RiskLevel::Low }
            else { RiskLevel::VeryLow };

        let risk_factors = vec![
            NetworkRiskFactor {
                factor_type: "Node Concentration".to_string(),
                severity: 0.3,
                likelihood: 0.4,
                impact_scope: "Regional availability".to_string(),
                mitigation_options: vec!["Increase geographic distribution".to_string()],
            },
        ];

        RiskAssessment {
            overall_risk_level,
            data_loss_risk,
            availability_risk,
            performance_risk,
            security_risk,
            mitigation_effectiveness: 0.7,
            risk_factors,
        }
    }

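    /// Weighted blend of the component scores: availability 40%, latency 30%,
    /// regional average 20%, inverse availability risk 10%.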
    fn calculate_overall_health_score(
        &self,
        metrics: &GlobalNetworkMetrics,
        regional_health: &HashMap<String, RegionalHealth>,
        risk_assessment: &RiskAssessment,
    ) -> f32 {
        let availability_score = metrics.healthy_nodes as f32 / metrics.total_nodes.max(1) as f32;
        let performance_score = if metrics.network_latency_p95 < Duration::from_millis(200) { 1.0 }
            else if metrics.network_latency_p95 < Duration::from_millis(500) { 0.8 }
            else if metrics.network_latency_p95 < Duration::from_millis(1000) { 0.6 }
            else { 0.3 };

        let regional_score = if regional_health.is_empty() { 0.5 } else {
            regional_health.values().map(|r| r.health_score).sum::<f32>() / regional_health.len() as f32
        };

        let risk_score = 1.0 - risk_assessment.availability_risk;

        availability_score * 0.4 + performance_score * 0.3 + regional_score * 0.2 + risk_score * 0.1
    }

    async fn generate_health_alerts(
        &mut self,
        metrics: &GlobalNetworkMetrics,
        regional_health: &HashMap<String, RegionalHealth>,
        _risk_assessment: &RiskAssessment,
    ) -> (Vec<HealthAlert>, Vec<HealthAlert>) {
        let mut critical_alerts = Vec::new();
        let mut warnings = Vec::new();

        // Unix-epoch seconds make alert ids unique across runs; calling
        // `elapsed()` on a freshly created instant was always ~0
        let epoch_secs = SystemTime::now()
            .duration_since(UNIX_EPOCH)
            .map(|d| d.as_secs())
            .unwrap_or(0);

        // Check for critical node failures
        if metrics.offline_nodes > metrics.total_nodes / 4 {
            let alert = HealthAlert {
                id: format!("critical_node_failures_{}", epoch_secs),
                severity: AlertSeverity::Critical,
                alert_type: AlertType::NodeFailures,
                message: format!(
                    "{} nodes are offline ({}% of network)",
                    metrics.offline_nodes,
                    (metrics.offline_nodes as f32 / metrics.total_nodes.max(1) as f32 * 100.0) as u32
                ),
                affected_nodes: vec!["multiple".to_string()],
                affected_regions: regional_health.keys().cloned().collect(),
                first_detected: crate::SerializableInstant::now(),
                estimated_impact: ImpactAssessment {
                    affected_data_percentage: metrics.offline_nodes as f32 / metrics.total_nodes.max(1) as f32,
                    performance_impact: 0.8,
                    availability_risk: 0.9,
                    estimated_users_affected: 10000,
                    data_at_risk: metrics.used_storage_capacity / 4,
                },
                recommended_actions: vec![
                    "Investigate node failures immediately".to_string(),
                    "Activate emergency replication".to_string(),
                    "Contact affected regions".to_string(),
                ],
            };
            critical_alerts.push(alert);
        }

        // Check storage capacity; max(1) avoids a division by zero on an empty network
        let storage_utilization =
            metrics.used_storage_capacity as f32 / metrics.total_storage_capacity.max(1) as f32;
        if storage_utilization > 0.9 {
            let alert = HealthAlert {
                id: format!("storage_capacity_{}", epoch_secs),
                severity: AlertSeverity::High,
                alert_type: AlertType::StorageCapacity,
                message: format!("Network storage is {}% full", (storage_utilization * 100.0) as u32),
                affected_nodes: vec!["all".to_string()],
                affected_regions: regional_health.keys().cloned().collect(),
                first_detected: crate::SerializableInstant::now(),
                estimated_impact: ImpactAssessment {
                    affected_data_percentage: 1.0,
                    performance_impact: 0.6,
                    availability_risk: 0.4,
                    estimated_users_affected: 50000,
                    data_at_risk: metrics.used_storage_capacity,
                },
                recommended_actions: vec![
                    "Add storage capacity".to_string(),
                    "Implement data cleanup policies".to_string(),
                    "Scale up storage nodes".to_string(),
                ],
            };
            warnings.push(alert);
        }

        // Check network performance
        if metrics.network_latency_p95 > Duration::from_millis(1000) {
            let alert = HealthAlert {
                id: format!("network_latency_{}", epoch_secs),
                severity: AlertSeverity::Medium,
                alert_type: AlertType::PerformanceDegradation,
                message: format!(
                    "Network latency is high: {}ms (95th percentile)",
                    metrics.network_latency_p95.as_millis()
                ),
                affected_nodes: vec!["multiple".to_string()],
                affected_regions: regional_health.keys().cloned().collect(),
                first_detected: crate::SerializableInstant::now(),
                estimated_impact: ImpactAssessment {
                    affected_data_percentage: 0.0,
                    performance_impact: 0.7,
                    availability_risk: 0.2,
                    estimated_users_affected: 25000,
                    data_at_risk: 0,
                },
                recommended_actions: vec![
                    "Investigate network congestion".to_string(),
                    "Optimize routing".to_string(),
                    "Check regional connectivity".to_string(),
                ],
            };
            warnings.push(alert);
        }

        // Record in the active-alert table so the alert getters can see them;
        // this is why the method takes `&mut self`
        for alert in critical_alerts.iter().chain(warnings.iter()) {
            self.active_alerts.insert(alert.id.clone(), alert.clone());
        }

        (critical_alerts, warnings)
    }

    async fn update_predictive_models(&mut self, report: &NetworkHealthReport) {
        // Update models based on new health report data
        let data_point = HealthDataPoint {
            timestamp: report.timestamp,
            metrics: vec![
                report.overall_health_score,
                report.network_metrics.healthy_nodes as f32
                    / report.network_metrics.total_nodes.max(1) as f32,
                report.network_metrics.network_latency_p95.as_millis() as f32 / 1000.0,
                report.network_metrics.error_rate,
            ],
            outcome: None, // Would be populated when actual issues occur
        };

        for model in self.predictive_models.values_mut() {
            model.training_data.push_back(data_point.clone());
            if model.training_data.len() > 1000 {
                model.training_data.pop_front();
            }
        }
    }

    async fn run_prediction_model(&self, model: &HealthPredictionModel, time_horizon: Duration) -> Option<PredictedIssue> {
        if model.training_data.len() < 10 {
            return None;
        }

        // Simple prediction based on recent trends (samples newest-first)
        let recent_health: Vec<_> = model.training_data.iter()
            .rev()
            .take(10)
            .map(|dp| dp.metrics[0])
            .collect();

        let trend = self.calculate_rate_of_change(&recent_health);
        let current_health = recent_health.first().copied().unwrap_or(0.5);

        if trend < -0.01 && current_health < 0.7 {
            Some(PredictedIssue {
                issue_type: AlertType::PerformanceDegradation,
                probability: 0.6,
                predicted_time: crate::SerializableInstant::now() + time_horizon,
                potential_impact: ImpactAssessment {
                    affected_data_percentage: 0.3,
                    performance_impact: 0.5,
                    availability_risk: 0.3,
                    estimated_users_affected: 15000,
                    data_at_risk: 1024 * 1024 * 1024, // 1GB
                },
                prevention_actions: vec![
                    "Increase monitoring frequency".to_string(),
                    "Prepare additional resources".to_string(),
                ],
            })
        } else {
            None
        }
    }

    fn identify_regional_risk_factors(&self, _region: &str, health_score: f32) -> Vec<RegionalRiskFactor> {
        let mut factors = Vec::new();

        if health_score < 0.7 {
            factors.push(RegionalRiskFactor::InfrastructureIssues);
        }

        factors
    }

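    /// Nearest-rank percentile: sorts a copy and picks the element at index
    /// `round(percentile * (len - 1))`. Returns 0.0 for an empty slice.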
    fn calculate_percentile(&self, values: &[f32], percentile: f32) -> f32 {
        if values.is_empty() {
            return 0.0;
        }

        let mut sorted_values = values.to_vec();
        // total_cmp is NaN-safe, unlike partial_cmp(..).unwrap()
        sorted_values.sort_by(|a, b| a.total_cmp(b));

        let index = (percentile * (sorted_values.len() - 1) as f32).round() as usize;
        sorted_values[index.min(sorted_values.len() - 1)]
    }

    fn calculate_trend_direction(&self, values: &[f32]) -> TrendDirection {
        if values.len() < 2 {
            return TrendDirection::Stable;
        }

        let slope = self.calculate_rate_of_change(values);

        if slope > 0.05 { TrendDirection::StronglyImproving }
        else if slope > 0.02 { TrendDirection::Improving }
        else if slope > -0.02 { TrendDirection::Stable }
        else if slope > -0.05 { TrendDirection::Declining }
        else { TrendDirection::StronglyDeclining }
    }

    fn calculate_trend_confidence(&self, values: &[f32]) -> f32 {
        if values.len() < 3 {
            return 0.1;
        }

        let mean = values.iter().sum::<f32>() / values.len() as f32;
        let variance = values.iter()
            .map(|&x| (x - mean).powi(2))
            .sum::<f32>() / values.len() as f32;

        // Low variance -> high confidence, squashed into (0, 1]
        1.0 / (1.0 + variance * 10.0)
    }

    /// Average per-step change. Callers build their slices newest-first, so
    /// `values.first()` is the most recent sample and `values.last()` the oldest.
    fn calculate_rate_of_change(&self, values: &[f32]) -> f32 {
        if values.len() < 2 {
            return 0.0;
        }

        let newest = values.first().copied().unwrap_or(0.0);
        let oldest = values.last().copied().unwrap_or(0.0);

        // n samples span n - 1 steps
        (newest - oldest) / (values.len() - 1) as f32
    }

    fn generate_trend_predictions(&self, _values: &[f32]) -> Vec<PredictedIssue> {
        // Placeholder: Would generate predictions based on trend analysis
        Vec::new()
    }
}
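
// `new()` takes no arguments, so the conventional `Default` impl simply
// delegates to it (this also satisfies Clippy's `new_without_default` lint).
impl Default for NetworkHealthMonitor {
    fn default() -> Self {
        Self::new()
    }
}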

impl Default for AlertThresholds {
    fn default() -> Self {
        Self {
            node_failure_threshold: 0.1, // 10% node failures trigger alert
            network_latency_threshold: Duration::from_millis(500),
            storage_utilization_threshold: 0.85, // 85% storage usage
            error_rate_threshold: 0.05, // 5% error rate
            uptime_threshold: 0.95, // 95% uptime required
            redundancy_threshold: 2.0, // Minimum 2x redundancy
        }
    }
}

impl Default for HealthTrend {
    fn default() -> Self {
        Self {
            direction: TrendDirection::Stable,
            confidence: 0.5,
            time_window: Duration::from_secs(3600),
            key_indicators: Vec::new(),
            predicted_issues: Vec::new(),
        }
    }
}
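
#[cfg(test)]
mod tests {
    use super::*;

    // Sketch tests for the pure helpers; the expected values mirror the
    // thresholds hard-coded above and would need updating if those change.

    #[test]
    fn percentile_uses_nearest_rank() {
        let monitor = NetworkHealthMonitor::new();
        let values = [10.0, 20.0, 30.0, 40.0, 50.0];
        assert_eq!(monitor.calculate_percentile(&values, 0.5), 30.0);
        assert_eq!(monitor.calculate_percentile(&values, 0.95), 50.0);
        assert_eq!(monitor.calculate_percentile(&[], 0.5), 0.0);
    }

    #[test]
    fn trend_direction_follows_slope_sign() {
        let monitor = NetworkHealthMonitor::new();
        // Slices are newest-first, matching how callers build them.
        let improving = [0.9, 0.8, 0.7, 0.6];
        let declining = [0.6, 0.7, 0.8, 0.9];
        assert!(matches!(
            monitor.calculate_trend_direction(&improving),
            TrendDirection::Improving | TrendDirection::StronglyImproving
        ));
        assert!(matches!(
            monitor.calculate_trend_direction(&declining),
            TrendDirection::Declining | TrendDirection::StronglyDeclining
        ));
    }
}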