//! Network Health Early Warning System
//!
//! Monitors overall network health and provides early warnings for potential issues

use serde::{Deserialize, Serialize};
use std::collections::{HashMap, VecDeque};
use tokio::time::Duration;

/// Point-in-time snapshot of overall network health, produced by
/// `NetworkHealthMonitor::perform_health_check`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct NetworkHealthReport {
    /// When this report was generated.
    pub timestamp: crate::SerializableInstant,
    /// Composite score blending availability, performance, regional health,
    /// and risk (see `calculate_overall_health_score`).
    pub overall_health_score: f32, // 0.0 to 1.0
    /// Alerts that require immediate attention.
    pub critical_alerts: Vec<HealthAlert>,
    /// Lower-severity alerts worth monitoring.
    pub warnings: Vec<HealthAlert>,
    /// Aggregated network-wide metrics for this cycle.
    pub network_metrics: GlobalNetworkMetrics,
    /// Per-region health, keyed by region name (e.g. "us-east").
    pub regional_health: HashMap<String, RegionalHealth>,
    /// Direction and confidence of the recent health trend.
    pub trend_analysis: HealthTrend,
    /// Overall risk evaluation derived from the metrics above.
    pub risk_assessment: RiskAssessment,
}

/// A single health alert raised by the monitor.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct HealthAlert {
    /// Unique identifier (generated from the alert kind plus a timestamp).
    pub id: String,
    /// How urgently the alert must be acted on.
    pub severity: AlertSeverity,
    /// What category of problem this alert describes.
    pub alert_type: AlertType,
    /// Human-readable description of the condition.
    pub message: String,
    /// Node identifiers affected (may be a placeholder such as "multiple").
    pub affected_nodes: Vec<String>,
    /// Region names affected by the condition.
    pub affected_regions: Vec<String>,
    /// When the condition was first observed.
    pub first_detected: crate::SerializableInstant,
    /// Estimated blast radius of the condition.
    pub estimated_impact: ImpactAssessment,
    /// Suggested remediation steps, in priority order.
    pub recommended_actions: Vec<String>,
}

/// Urgency level of a [`HealthAlert`], from most to least urgent.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub enum AlertSeverity {
    Critical, // Immediate action required
    High,     // Action required within 1 hour
    Medium,   // Action required within 4 hours
    Low,      // Monitor and plan
    Info,     // Informational only
}

/// Category of a health alert or predicted issue.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub enum AlertType {
    /// Raised when a significant fraction of nodes goes offline.
    NodeFailures,
    NetworkPartition,
    /// Raised when storage utilization nears capacity.
    StorageCapacity,
    /// Raised on high latency; also used by the trend-based predictor.
    PerformanceDegradation,
    SecurityThreat,
    DataIntegrity,
    ConnectivityIssues,
    ResourceExhaustion,
    GeographicDisturbance,
    SystemOverload,
}

/// Estimated blast radius of an alert or predicted issue.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ImpactAssessment {
    /// Fraction of stored data affected (0.0 to 1.0).
    pub affected_data_percentage: f32,
    /// Expected performance degradation (0.0 to 1.0).
    pub performance_impact: f32,
    /// Likelihood of availability loss (0.0 to 1.0).
    pub availability_risk: f32,
    /// Rough headcount of users impacted.
    pub estimated_users_affected: u32,
    pub data_at_risk: u64, // bytes
}

/// Network-wide metrics aggregated from the per-node health cache.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct GlobalNetworkMetrics {
    /// Total nodes known to the monitor.
    pub total_nodes: u32,
    /// Nodes currently in `Healthy` status.
    pub healthy_nodes: u32,
    /// Nodes in `Warning` or `Critical` status.
    pub unhealthy_nodes: u32,
    /// Nodes in `Offline` status.
    pub offline_nodes: u32,
    /// NOTE(review): populated from node `health_score` values in
    /// `calculate_global_metrics`, not from uptime — confirm intended metric.
    pub average_uptime: f32,
    /// Median network latency across nodes.
    pub network_latency_p50: Duration,
    /// 95th-percentile network latency across nodes.
    pub network_latency_p95: Duration,
    /// Total storage capacity across all nodes, in bytes.
    pub total_storage_capacity: u64,
    /// Used storage across all nodes, in bytes.
    pub used_storage_capacity: u64,
    /// Average replication factor of stored data.
    pub data_redundancy_level: f32,
    /// Aggregate throughput in megabits per second.
    pub throughput_mbps: f32,
    /// Normalized error rate (derived from per-node error counts).
    pub error_rate: f32,
}

/// Health summary for one geographic region.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct RegionalHealth {
    /// Region name (e.g. "us-east").
    pub region: String,
    /// Fraction of the region's nodes that are healthy (0.0 to 1.0).
    pub health_score: f32,
    /// Total nodes assigned to the region.
    pub node_count: u32,
    /// Nodes in the region currently healthy.
    pub healthy_nodes: u32,
    /// Mean network latency across the region's nodes.
    pub average_latency: Duration,
    /// Fraction of regional storage in use (0.0 to 1.0).
    pub storage_utilization: f32,
    /// Qualitative connectivity rating derived from `health_score`.
    pub connectivity_status: ConnectivityStatus,
    /// Risk factors identified for this region.
    pub risk_factors: Vec<RegionalRiskFactor>,
}

/// Qualitative connectivity rating for a region,
/// derived from its health score in `assess_regional_health`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub enum ConnectivityStatus {
    Excellent, // All connections stable
    Good,      // Minor connectivity issues
    Degraded,  // Noticeable connectivity problems
    Poor,      // Significant connectivity issues
    Critical,  // Major connectivity failures
}

/// Risk factors that can be attributed to a specific region.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub enum RegionalRiskFactor {
    HighLatency,
    NodeConcentration,
    /// Assigned when a region's health score drops below 0.7
    /// (see `identify_regional_risk_factors`).
    InfrastructureIssues,
    NetworkCongestion,
    GeographicEvents,
    RegulatoryChanges,
}

/// Trend analysis over recent health history,
/// produced by `analyze_health_trends`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct HealthTrend {
    /// Overall direction of the health trajectory.
    pub direction: TrendDirection,
    /// Confidence in the trend estimate (0.0 to 1.0; lower variance → higher).
    pub confidence: f32,
    /// Window of history the trend was computed over.
    pub time_window: Duration,
    /// Individual metrics contributing to the trend.
    pub key_indicators: Vec<TrendIndicator>,
    /// Issues anticipated if the trend continues.
    pub predicted_issues: Vec<PredictedIssue>,
}

/// Direction of a metric's trajectory, bucketed by slope
/// (see `calculate_trend_direction` for the thresholds).
#[derive(Debug, Clone, Serialize, Deserialize)]
pub enum TrendDirection {
    StronglyImproving,
    Improving,
    Stable,
    Declining,
    StronglyDeclining,
}

/// One metric's contribution to the overall health trend.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct TrendIndicator {
    /// Display name of the metric (e.g. "Overall Health").
    pub metric: String,
    /// Latest observed value of the metric.
    pub current_value: f32,
    /// Direction this metric is moving.
    pub trend_direction: TrendDirection,
    /// Average per-sample change of the metric.
    pub rate_of_change: f32,
    /// Weight of this indicator in the overall trend (0.0 to 1.0).
    pub significance: f32,
}

/// A future problem anticipated by a predictive model.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct PredictedIssue {
    /// Category of the anticipated problem.
    pub issue_type: AlertType,
    /// Estimated probability of occurrence (0.0 to 1.0).
    pub probability: f32,
    /// When the issue is expected to materialize.
    pub predicted_time: crate::SerializableInstant,
    /// Estimated blast radius should the issue occur.
    pub potential_impact: ImpactAssessment,
    /// Actions that could prevent or mitigate the issue.
    pub prevention_actions: Vec<String>,
}

/// Network-wide risk evaluation, produced by `assess_network_risks`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct RiskAssessment {
    /// Bucketed overall risk (average of the component risks below).
    pub overall_risk_level: RiskLevel,
    /// Risk of data loss, driven by the redundancy level (0.0 to 1.0).
    pub data_loss_risk: f32,
    /// Risk from unhealthy/offline nodes (0.0 to 1.0).
    pub availability_risk: f32,
    /// Risk from high latency (0.0 to 1.0).
    pub performance_risk: f32,
    /// Risk derived from the network error rate (0.0 to 1.0).
    pub security_risk: f32,
    /// Estimated effectiveness of current mitigations (0.0 to 1.0).
    pub mitigation_effectiveness: f32,
    /// Individual contributing risk factors.
    pub risk_factors: Vec<NetworkRiskFactor>,
}

/// Bucketed overall risk level, from least to most severe
/// (thresholds applied in `assess_network_risks`).
#[derive(Debug, Clone, Serialize, Deserialize)]
pub enum RiskLevel {
    VeryLow,
    Low,
    Medium,
    High,
    Critical,
}

/// A single named risk factor contributing to the overall assessment.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct NetworkRiskFactor {
    /// Short label for the factor (e.g. "Node Concentration").
    pub factor_type: String,
    /// How bad the impact would be (0.0 to 1.0).
    pub severity: f32,
    /// How likely the factor is to materialize (0.0 to 1.0).
    pub likelihood: f32,
    /// Description of what the factor would affect.
    pub impact_scope: String,
    /// Available mitigation strategies.
    pub mitigation_options: Vec<String>,
}

/// Stateful monitor that aggregates node health, raises alerts,
/// and feeds predictive models. Drive it via `perform_health_check`.
pub struct NetworkHealthMonitor {
    /// Bounded history of reports (capacity 1440 — 24 h at one per minute).
    health_history: VecDeque<NetworkHealthReport>,
    /// Currently active alerts, keyed by alert id.
    active_alerts: HashMap<String, HealthAlert>,
    /// Latest known status per node, keyed by node id.
    node_health_cache: HashMap<String, NodeHealthStatus>,
    /// Per-region monitoring state, keyed by region name.
    /// NOTE(review): never populated in the visible code — confirm wiring.
    regional_monitors: HashMap<String, RegionalMonitor>,
    /// Thresholds for alert generation.
    /// NOTE(review): not consulted in the visible code; alert checks use
    /// hard-coded limits — confirm intended use.
    alert_thresholds: AlertThresholds,
    /// Predictive models, keyed by model name.
    predictive_models: HashMap<String, HealthPredictionModel>,
}

#[derive(Debug, Clone)] |
| 188 |
struct NodeHealthStatus { |
| 189 |
node_id: String, |
| 190 |
last_seen: crate::SerializableInstant, |
| 191 |
health_score: f32, |
| 192 |
metrics: NodeMetrics, |
| 193 |
status: NodeStatus, |
| 194 |
} |
| 195 |
|
| 196 |
#[derive(Debug, Clone)] |
| 197 |
struct NodeMetrics { |
| 198 |
cpu_usage: f32, |
| 199 |
memory_usage: f32, |
| 200 |
disk_usage: f32, |
| 201 |
network_latency: Duration, |
| 202 |
error_count: u32, |
| 203 |
uptime: Duration, |
| 204 |
} |
| 205 |
|
| 206 |
#[derive(Debug, Clone)] |
| 207 |
enum NodeStatus { |
| 208 |
Healthy, |
| 209 |
Warning, |
| 210 |
Critical, |
| 211 |
Offline, |
| 212 |
Unknown, |
| 213 |
} |
| 214 |
|
| 215 |
/// Per-region monitoring state.
/// NOTE(review): declared but never constructed in the visible code —
/// confirm whether regional monitoring is wired up elsewhere.
struct RegionalMonitor {
    // Region name this monitor covers.
    region: String,
    // Node ids assigned to the region.
    nodes: Vec<String>,
    // Rolling history of regional health scores.
    health_score_history: VecDeque<f32>,
    // Pairwise node latency: source node id -> (target node id -> latency).
    connectivity_matrix: HashMap<String, HashMap<String, Duration>>,
    // When the region was last checked.
    last_health_check: crate::SerializableInstant,
}

#[derive(Debug, Clone)] |
| 224 |
struct AlertThresholds { |
| 225 |
node_failure_threshold: f32, |
| 226 |
network_latency_threshold: Duration, |
| 227 |
storage_utilization_threshold: f32, |
| 228 |
error_rate_threshold: f32, |
| 229 |
uptime_threshold: f32, |
| 230 |
redundancy_threshold: f32, |
| 231 |
} |
| 232 |
|
| 233 |
/// A simple predictive model fed by health-check data points.
struct HealthPredictionModel {
    // Label identifying the modeling approach.
    model_type: String,
    // Model weights.
    // NOTE(review): not read by the visible prediction code — confirm use.
    weights: Vec<f32>,
    // Historical prediction accuracy (0.0 to 1.0).
    accuracy: f32,
    // Rolling training window (bounded to 1000 points in
    // `update_predictive_models`).
    training_data: VecDeque<HealthDataPoint>,
    // Most recent prediction, if any.
    last_prediction: Option<PredictedIssue>,
}

#[derive(Debug, Clone)] |
| 242 |
struct HealthDataPoint { |
| 243 |
timestamp: crate::SerializableInstant, |
| 244 |
metrics: Vec<f32>, |
| 245 |
outcome: Option<AlertType>, |
| 246 |
} |
| 247 |
|
| 248 |
impl NetworkHealthMonitor { |
| 249 |
pub fn new() -> Self { |
| 250 |
Self { |
| 251 |
health_history: VecDeque::with_capacity(1440), // 24 hours of minute-by-minute data |
| 252 |
active_alerts: HashMap::new(), |
| 253 |
node_health_cache: HashMap::new(), |
| 254 |
regional_monitors: HashMap::new(), |
| 255 |
alert_thresholds: AlertThresholds::default(), |
| 256 |
predictive_models: HashMap::new(), |
| 257 |
} |
| 258 |
} |
| 259 |
|
| 260 |
pub async fn perform_health_check(&mut self) -> NetworkHealthReport { |
| 261 |
let timestamp = crate::SerializableInstant::now(); |
| 262 |
|
| 263 |
// Update node health status |
| 264 |
self.update_node_health_status().await; |
| 265 |
|
| 266 |
// Calculate global metrics |
| 267 |
let network_metrics = self.calculate_global_metrics().await; |
| 268 |
|
| 269 |
// Assess regional health |
| 270 |
let regional_health = self.assess_regional_health().await; |
| 271 |
|
| 272 |
// Analyze trends |
| 273 |
let trend_analysis = self.analyze_health_trends(); |
| 274 |
|
| 275 |
// Assess risks |
| 276 |
let risk_assessment = self.assess_network_risks(&network_metrics, ®ional_health); |
| 277 |
|
| 278 |
// Calculate overall health score |
| 279 |
let overall_health_score = self.calculate_overall_health_score(&network_metrics, ®ional_health, &risk_assessment); |
| 280 |
|
| 281 |
// Generate alerts |
| 282 |
let (critical_alerts, warnings) = self.generate_health_alerts(&network_metrics, ®ional_health, &risk_assessment).await; |
| 283 |
|
| 284 |
let report = NetworkHealthReport { |
| 285 |
timestamp, |
| 286 |
overall_health_score, |
| 287 |
critical_alerts, |
| 288 |
warnings, |
| 289 |
network_metrics, |
| 290 |
regional_health, |
| 291 |
trend_analysis, |
| 292 |
risk_assessment, |
| 293 |
}; |
| 294 |
|
| 295 |
// Store in history |
| 296 |
self.health_history.push_back(report.clone()); |
| 297 |
if self.health_history.len() > 1440 { |
| 298 |
self.health_history.pop_front(); |
| 299 |
} |
| 300 |
|
| 301 |
// Update predictive models |
| 302 |
self.update_predictive_models(&report).await; |
| 303 |
|
| 304 |
report |
| 305 |
} |
| 306 |
|
| 307 |
pub async fn get_current_health_status(&self) -> Option<NetworkHealthReport> { |
| 308 |
self.health_history.back().cloned() |
| 309 |
} |
| 310 |
|
| 311 |
pub fn get_active_alerts(&self) -> Vec<&HealthAlert> { |
| 312 |
self.active_alerts.values().collect() |
| 313 |
} |
| 314 |
|
| 315 |
pub fn get_critical_alerts(&self) -> Vec<&HealthAlert> { |
| 316 |
self.active_alerts.values() |
| 317 |
.filter(|alert| matches!(alert.severity, AlertSeverity::Critical)) |
| 318 |
.collect() |
| 319 |
} |
| 320 |
|
| 321 |
pub async fn predict_future_issues(&self, time_horizon: Duration) -> Vec<PredictedIssue> { |
| 322 |
let mut predictions = Vec::new(); |
| 323 |
|
| 324 |
for model in self.predictive_models.values() { |
| 325 |
if let Some(prediction) = self.run_prediction_model(model, time_horizon).await { |
| 326 |
predictions.push(prediction); |
| 327 |
} |
| 328 |
} |
| 329 |
|
| 330 |
// Sort by probability and impact |
| 331 |
predictions.sort_by(|a, b| { |
| 332 |
let score_a = a.probability * a.potential_impact.availability_risk; |
| 333 |
let score_b = b.probability * b.potential_impact.availability_risk; |
| 334 |
score_b.partial_cmp(&score_a).unwrap() |
| 335 |
}); |
| 336 |
|
| 337 |
predictions |
| 338 |
} |
| 339 |
|
| 340 |
async fn update_node_health_status(&mut self) { |
| 341 |
// Placeholder: In reality, this would collect metrics from all nodes |
| 342 |
let now = crate::SerializableInstant::now(); |
| 343 |
|
| 344 |
for node_id in ["node1", "node2", "node3"].iter() { |
| 345 |
let health_status = NodeHealthStatus { |
| 346 |
node_id: node_id.to_string(), |
| 347 |
last_seen: now, |
| 348 |
health_score: 0.85 + (now.elapsed().as_secs() as f32 % 100.0) / 1000.0, |
| 349 |
metrics: NodeMetrics { |
| 350 |
cpu_usage: 0.4, |
| 351 |
memory_usage: 0.6, |
| 352 |
disk_usage: 0.3, |
| 353 |
network_latency: Duration::from_millis(50), |
| 354 |
error_count: 2, |
| 355 |
uptime: Duration::from_secs(86400 * 30), // 30 days |
| 356 |
}, |
| 357 |
status: NodeStatus::Healthy, |
| 358 |
}; |
| 359 |
|
| 360 |
self.node_health_cache.insert(node_id.to_string(), health_status); |
| 361 |
} |
| 362 |
} |
| 363 |
|
| 364 |
async fn calculate_global_metrics(&self) -> GlobalNetworkMetrics { |
| 365 |
let total_nodes = self.node_health_cache.len() as u32; |
| 366 |
let healthy_nodes = self.node_health_cache.values() |
| 367 |
.filter(|node| matches!(node.status, NodeStatus::Healthy)) |
| 368 |
.count() as u32; |
| 369 |
let unhealthy_nodes = self.node_health_cache.values() |
| 370 |
.filter(|node| matches!(node.status, NodeStatus::Warning | NodeStatus::Critical)) |
| 371 |
.count() as u32; |
| 372 |
let offline_nodes = self.node_health_cache.values() |
| 373 |
.filter(|node| matches!(node.status, NodeStatus::Offline)) |
| 374 |
.count() as u32; |
| 375 |
|
| 376 |
let average_uptime = if !self.node_health_cache.is_empty() { |
| 377 |
self.node_health_cache.values() |
| 378 |
.map(|node| node.health_score) |
| 379 |
.sum::<f32>() / total_nodes as f32 |
| 380 |
} else { |
| 381 |
0.0 |
| 382 |
}; |
| 383 |
|
| 384 |
let latencies: Vec<_> = self.node_health_cache.values() |
| 385 |
.map(|node| node.metrics.network_latency.as_millis() as f32) |
| 386 |
.collect(); |
| 387 |
|
| 388 |
let network_latency_p50 = Duration::from_millis( |
| 389 |
self.calculate_percentile(&latencies, 0.5) as u64 |
| 390 |
); |
| 391 |
let network_latency_p95 = Duration::from_millis( |
| 392 |
self.calculate_percentile(&latencies, 0.95) as u64 |
| 393 |
); |
| 394 |
|
| 395 |
let total_storage_capacity = 100 * 1024 * 1024 * 1024u64; // 100GB per node |
| 396 |
let used_storage_capacity = (total_storage_capacity as f32 * 0.4) as u64; // 40% used |
| 397 |
|
| 398 |
let error_rate = self.node_health_cache.values() |
| 399 |
.map(|node| node.metrics.error_count as f32) |
| 400 |
.sum::<f32>() / (total_nodes as f32).max(1.0) / 1000.0; |
| 401 |
|
| 402 |
GlobalNetworkMetrics { |
| 403 |
total_nodes, |
| 404 |
healthy_nodes, |
| 405 |
unhealthy_nodes, |
| 406 |
offline_nodes, |
| 407 |
average_uptime, |
| 408 |
network_latency_p50, |
| 409 |
network_latency_p95, |
| 410 |
total_storage_capacity: total_storage_capacity * total_nodes as u64, |
| 411 |
used_storage_capacity: used_storage_capacity * total_nodes as u64, |
| 412 |
data_redundancy_level: 2.5, // Average redundancy factor |
| 413 |
throughput_mbps: 150.0, |
| 414 |
error_rate, |
| 415 |
} |
| 416 |
} |
| 417 |
|
| 418 |
async fn assess_regional_health(&mut self) -> HashMap<String, RegionalHealth> { |
| 419 |
let mut regional_health = HashMap::new(); |
| 420 |
|
| 421 |
let regions = vec!["us-east", "us-west", "europe", "asia-pacific"]; |
| 422 |
|
| 423 |
for region in regions { |
| 424 |
let nodes_in_region: Vec<_> = self.node_health_cache.keys() |
| 425 |
.filter(|_| true) // Placeholder: filter by region |
| 426 |
.take(2) |
| 427 |
.cloned() |
| 428 |
.collect(); |
| 429 |
|
| 430 |
let node_count = nodes_in_region.len() as u32; |
| 431 |
let healthy_nodes = nodes_in_region.iter() |
| 432 |
.filter(|node_id| { |
| 433 |
if let Some(node) = self.node_health_cache.get(*node_id) { |
| 434 |
matches!(node.status, NodeStatus::Healthy) |
| 435 |
} else { |
| 436 |
false |
| 437 |
} |
| 438 |
}) |
| 439 |
.count() as u32; |
| 440 |
|
| 441 |
let average_latency = if !nodes_in_region.is_empty() { |
| 442 |
let total_latency: u128 = nodes_in_region.iter() |
| 443 |
.filter_map(|node_id| self.node_health_cache.get(node_id)) |
| 444 |
.map(|node| node.metrics.network_latency.as_millis()) |
| 445 |
.sum(); |
| 446 |
Duration::from_millis((total_latency / nodes_in_region.len() as u128) as u64) |
| 447 |
} else { |
| 448 |
Duration::from_millis(0) |
| 449 |
}; |
| 450 |
|
| 451 |
let health_score = if node_count > 0 { |
| 452 |
healthy_nodes as f32 / node_count as f32 |
| 453 |
} else { |
| 454 |
0.0 |
| 455 |
}; |
| 456 |
|
| 457 |
let connectivity_status = if health_score > 0.9 { |
| 458 |
ConnectivityStatus::Excellent |
| 459 |
} else if health_score > 0.8 { |
| 460 |
ConnectivityStatus::Good |
| 461 |
} else if health_score > 0.6 { |
| 462 |
ConnectivityStatus::Degraded |
| 463 |
} else if health_score > 0.3 { |
| 464 |
ConnectivityStatus::Poor |
| 465 |
} else { |
| 466 |
ConnectivityStatus::Critical |
| 467 |
}; |
| 468 |
|
| 469 |
let regional = RegionalHealth { |
| 470 |
region: region.to_string(), |
| 471 |
health_score, |
| 472 |
node_count, |
| 473 |
healthy_nodes, |
| 474 |
average_latency, |
| 475 |
storage_utilization: 0.4, // 40% utilized |
| 476 |
connectivity_status, |
| 477 |
risk_factors: self.identify_regional_risk_factors(region, health_score), |
| 478 |
}; |
| 479 |
|
| 480 |
regional_health.insert(region.to_string(), regional); |
| 481 |
} |
| 482 |
|
| 483 |
regional_health |
| 484 |
} |
| 485 |
|
| 486 |
fn analyze_health_trends(&self) -> HealthTrend { |
| 487 |
if self.health_history.len() < 5 { |
| 488 |
return HealthTrend::default(); |
| 489 |
} |
| 490 |
|
| 491 |
let recent_scores: Vec<_> = self.health_history.iter() |
| 492 |
.rev() |
| 493 |
.take(60) // Last hour |
| 494 |
.map(|report| report.overall_health_score) |
| 495 |
.collect(); |
| 496 |
|
| 497 |
let trend_direction = self.calculate_trend_direction(&recent_scores); |
| 498 |
let confidence = self.calculate_trend_confidence(&recent_scores); |
| 499 |
|
| 500 |
let key_indicators = vec![ |
| 501 |
TrendIndicator { |
| 502 |
metric: "Overall Health".to_string(), |
| 503 |
current_value: recent_scores.first().copied().unwrap_or(0.0), |
| 504 |
trend_direction: trend_direction.clone(), |
| 505 |
rate_of_change: self.calculate_rate_of_change(&recent_scores), |
| 506 |
significance: 0.9, |
| 507 |
}, |
| 508 |
TrendIndicator { |
| 509 |
metric: "Node Availability".to_string(), |
| 510 |
current_value: 0.95, |
| 511 |
trend_direction: TrendDirection::Stable, |
| 512 |
rate_of_change: 0.001, |
| 513 |
significance: 0.8, |
| 514 |
}, |
| 515 |
]; |
| 516 |
|
| 517 |
let predicted_issues = self.generate_trend_predictions(&recent_scores); |
| 518 |
|
| 519 |
HealthTrend { |
| 520 |
direction: trend_direction, |
| 521 |
confidence, |
| 522 |
time_window: Duration::from_secs(3600), |
| 523 |
key_indicators, |
| 524 |
predicted_issues, |
| 525 |
} |
| 526 |
} |
| 527 |
|
| 528 |
fn assess_network_risks( |
| 529 |
&self, |
| 530 |
metrics: &GlobalNetworkMetrics, |
| 531 |
regional_health: &HashMap<String, RegionalHealth> |
| 532 |
) -> RiskAssessment { |
| 533 |
let data_loss_risk = if metrics.data_redundancy_level < 2.0 { 0.8 } |
| 534 |
else if metrics.data_redundancy_level < 2.5 { 0.4 } |
| 535 |
else { 0.1 }; |
| 536 |
|
| 537 |
let availability_risk = 1.0 - (metrics.healthy_nodes as f32 / metrics.total_nodes as f32); |
| 538 |
|
| 539 |
let performance_risk = if metrics.network_latency_p95 > Duration::from_millis(1000) { 0.7 } |
| 540 |
else if metrics.network_latency_p95 > Duration::from_millis(500) { 0.4 } |
| 541 |
else { 0.1 }; |
| 542 |
|
| 543 |
let security_risk = metrics.error_rate * 10.0; |
| 544 |
|
| 545 |
let overall_risk_score = (data_loss_risk + availability_risk + performance_risk + security_risk) / 4.0; |
| 546 |
let overall_risk_level = if overall_risk_score > 0.8 { RiskLevel::Critical } |
| 547 |
else if overall_risk_score > 0.6 { RiskLevel::High } |
| 548 |
else if overall_risk_score > 0.4 { RiskLevel::Medium } |
| 549 |
else if overall_risk_score > 0.2 { RiskLevel::Low } |
| 550 |
else { RiskLevel::VeryLow }; |
| 551 |
|
| 552 |
let risk_factors = vec![ |
| 553 |
NetworkRiskFactor { |
| 554 |
factor_type: "Node Concentration".to_string(), |
| 555 |
severity: 0.3, |
| 556 |
likelihood: 0.4, |
| 557 |
impact_scope: "Regional availability".to_string(), |
| 558 |
mitigation_options: vec!["Increase geographic distribution".to_string()], |
| 559 |
}, |
| 560 |
]; |
| 561 |
|
| 562 |
RiskAssessment { |
| 563 |
overall_risk_level, |
| 564 |
data_loss_risk, |
| 565 |
availability_risk, |
| 566 |
performance_risk, |
| 567 |
security_risk, |
| 568 |
mitigation_effectiveness: 0.7, |
| 569 |
risk_factors, |
| 570 |
} |
| 571 |
} |
| 572 |
|
| 573 |
fn calculate_overall_health_score( |
| 574 |
&self, |
| 575 |
metrics: &GlobalNetworkMetrics, |
| 576 |
regional_health: &HashMap<String, RegionalHealth>, |
| 577 |
risk_assessment: &RiskAssessment, |
| 578 |
) -> f32 { |
| 579 |
let availability_score = metrics.healthy_nodes as f32 / metrics.total_nodes as f32; |
| 580 |
let performance_score = if metrics.network_latency_p95 < Duration::from_millis(200) { 1.0 } |
| 581 |
else if metrics.network_latency_p95 < Duration::from_millis(500) { 0.8 } |
| 582 |
else if metrics.network_latency_p95 < Duration::from_millis(1000) { 0.6 } |
| 583 |
else { 0.3 }; |
| 584 |
|
| 585 |
let regional_score = if regional_health.is_empty() { 0.5 } else { |
| 586 |
regional_health.values().map(|r| r.health_score).sum::<f32>() / regional_health.len() as f32 |
| 587 |
}; |
| 588 |
|
| 589 |
let risk_score = 1.0 - risk_assessment.availability_risk; |
| 590 |
|
| 591 |
(availability_score * 0.4 + performance_score * 0.3 + regional_score * 0.2 + risk_score * 0.1) |
| 592 |
} |
| 593 |
|
| 594 |
async fn generate_health_alerts( |
| 595 |
&mut self, |
| 596 |
metrics: &GlobalNetworkMetrics, |
| 597 |
regional_health: &HashMap<String, RegionalHealth>, |
| 598 |
risk_assessment: &RiskAssessment, |
| 599 |
) -> (Vec<HealthAlert>, Vec<HealthAlert>) { |
| 600 |
let mut critical_alerts = Vec::new(); |
| 601 |
let mut warnings = Vec::new(); |
| 602 |
|
| 603 |
// Check for critical node failures |
| 604 |
if metrics.offline_nodes > metrics.total_nodes / 4 { |
| 605 |
let alert = HealthAlert { |
| 606 |
id: format!("critical_node_failures_{}", crate::SerializableInstant::now().elapsed().as_secs()), |
| 607 |
severity: AlertSeverity::Critical, |
| 608 |
alert_type: AlertType::NodeFailures, |
| 609 |
message: format!("{} nodes are offline ({}% of network)", metrics.offline_nodes, |
| 610 |
(metrics.offline_nodes as f32 / metrics.total_nodes as f32 * 100.0) as u32), |
| 611 |
affected_nodes: vec!["multiple".to_string()], |
| 612 |
affected_regions: regional_health.keys().cloned().collect(), |
| 613 |
first_detected: crate::SerializableInstant::now(), |
| 614 |
estimated_impact: ImpactAssessment { |
| 615 |
affected_data_percentage: metrics.offline_nodes as f32 / metrics.total_nodes as f32, |
| 616 |
performance_impact: 0.8, |
| 617 |
availability_risk: 0.9, |
| 618 |
estimated_users_affected: 10000, |
| 619 |
data_at_risk: metrics.used_storage_capacity / 4, |
| 620 |
}, |
| 621 |
recommended_actions: vec![ |
| 622 |
"Investigate node failures immediately".to_string(), |
| 623 |
"Activate emergency replication".to_string(), |
| 624 |
"Contact affected regions".to_string(), |
| 625 |
], |
| 626 |
}; |
| 627 |
critical_alerts.push(alert); |
| 628 |
} |
| 629 |
|
| 630 |
// Check storage capacity |
| 631 |
let storage_utilization = metrics.used_storage_capacity as f32 / metrics.total_storage_capacity as f32; |
| 632 |
if storage_utilization > 0.9 { |
| 633 |
let alert = HealthAlert { |
| 634 |
id: format!("storage_capacity_{}", crate::SerializableInstant::now().elapsed().as_secs()), |
| 635 |
severity: AlertSeverity::High, |
| 636 |
alert_type: AlertType::StorageCapacity, |
| 637 |
message: format!("Network storage is {}% full", (storage_utilization * 100.0) as u32), |
| 638 |
affected_nodes: vec!["all".to_string()], |
| 639 |
affected_regions: regional_health.keys().cloned().collect(), |
| 640 |
first_detected: crate::SerializableInstant::now(), |
| 641 |
estimated_impact: ImpactAssessment { |
| 642 |
affected_data_percentage: 1.0, |
| 643 |
performance_impact: 0.6, |
| 644 |
availability_risk: 0.4, |
| 645 |
estimated_users_affected: 50000, |
| 646 |
data_at_risk: metrics.used_storage_capacity, |
| 647 |
}, |
| 648 |
recommended_actions: vec![ |
| 649 |
"Add storage capacity".to_string(), |
| 650 |
"Implement data cleanup policies".to_string(), |
| 651 |
"Scale up storage nodes".to_string(), |
| 652 |
], |
| 653 |
}; |
| 654 |
warnings.push(alert); |
| 655 |
} |
| 656 |
|
| 657 |
// Check network performance |
| 658 |
if metrics.network_latency_p95 > Duration::from_millis(1000) { |
| 659 |
let alert = HealthAlert { |
| 660 |
id: format!("network_latency_{}", crate::SerializableInstant::now().elapsed().as_secs()), |
| 661 |
severity: AlertSeverity::Medium, |
| 662 |
alert_type: AlertType::PerformanceDegradation, |
| 663 |
message: format!("Network latency is high: {}ms (95th percentile)", |
| 664 |
metrics.network_latency_p95.as_millis()), |
| 665 |
affected_nodes: vec!["multiple".to_string()], |
| 666 |
affected_regions: regional_health.keys().cloned().collect(), |
| 667 |
first_detected: crate::SerializableInstant::now(), |
| 668 |
estimated_impact: ImpactAssessment { |
| 669 |
affected_data_percentage: 0.0, |
| 670 |
performance_impact: 0.7, |
| 671 |
availability_risk: 0.2, |
| 672 |
estimated_users_affected: 25000, |
| 673 |
data_at_risk: 0, |
| 674 |
}, |
| 675 |
recommended_actions: vec![ |
| 676 |
"Investigate network congestion".to_string(), |
| 677 |
"Optimize routing".to_string(), |
| 678 |
"Check regional connectivity".to_string(), |
| 679 |
], |
| 680 |
}; |
| 681 |
warnings.push(alert); |
| 682 |
} |
| 683 |
|
| 684 |
(critical_alerts, warnings) |
| 685 |
} |
| 686 |
|
| 687 |
async fn update_predictive_models(&mut self, report: &NetworkHealthReport) { |
| 688 |
// Update models based on new health report data |
| 689 |
let data_point = HealthDataPoint { |
| 690 |
timestamp: report.timestamp, |
| 691 |
metrics: vec![ |
| 692 |
report.overall_health_score, |
| 693 |
report.network_metrics.healthy_nodes as f32 / report.network_metrics.total_nodes as f32, |
| 694 |
report.network_metrics.network_latency_p95.as_millis() as f32 / 1000.0, |
| 695 |
report.network_metrics.error_rate, |
| 696 |
], |
| 697 |
outcome: None, // Would be populated when actual issues occur |
| 698 |
}; |
| 699 |
|
| 700 |
for model in self.predictive_models.values_mut() { |
| 701 |
model.training_data.push_back(data_point.clone()); |
| 702 |
if model.training_data.len() > 1000 { |
| 703 |
model.training_data.pop_front(); |
| 704 |
} |
| 705 |
} |
| 706 |
} |
| 707 |
|
| 708 |
async fn run_prediction_model(&self, model: &HealthPredictionModel, time_horizon: Duration) -> Option<PredictedIssue> { |
| 709 |
if model.training_data.len() < 10 { |
| 710 |
return None; |
| 711 |
} |
| 712 |
|
| 713 |
// Simple prediction based on recent trends |
| 714 |
let recent_health: Vec<_> = model.training_data.iter() |
| 715 |
.rev() |
| 716 |
.take(10) |
| 717 |
.map(|dp| dp.metrics[0]) |
| 718 |
.collect(); |
| 719 |
|
| 720 |
let trend = self.calculate_rate_of_change(&recent_health); |
| 721 |
let current_health = recent_health.first().copied().unwrap_or(0.5); |
| 722 |
|
| 723 |
if trend < -0.01 && current_health < 0.7 { |
| 724 |
Some(PredictedIssue { |
| 725 |
issue_type: AlertType::PerformanceDegradation, |
| 726 |
probability: 0.6, |
| 727 |
predicted_time: crate::SerializableInstant::now() + time_horizon, |
| 728 |
potential_impact: ImpactAssessment { |
| 729 |
affected_data_percentage: 0.3, |
| 730 |
performance_impact: 0.5, |
| 731 |
availability_risk: 0.3, |
| 732 |
estimated_users_affected: 15000, |
| 733 |
data_at_risk: 1024 * 1024 * 1024, // 1GB |
| 734 |
}, |
| 735 |
prevention_actions: vec![ |
| 736 |
"Increase monitoring frequency".to_string(), |
| 737 |
"Prepare additional resources".to_string(), |
| 738 |
], |
| 739 |
}) |
| 740 |
} else { |
| 741 |
None |
| 742 |
} |
| 743 |
} |
| 744 |
|
| 745 |
fn identify_regional_risk_factors(&self, _region: &str, health_score: f32) -> Vec<RegionalRiskFactor> { |
| 746 |
let mut factors = Vec::new(); |
| 747 |
|
| 748 |
if health_score < 0.7 { |
| 749 |
factors.push(RegionalRiskFactor::InfrastructureIssues); |
| 750 |
} |
| 751 |
|
| 752 |
factors |
| 753 |
} |
| 754 |
|
| 755 |
fn calculate_percentile(&self, values: &[f32], percentile: f32) -> f32 { |
| 756 |
if values.is_empty() { |
| 757 |
return 0.0; |
| 758 |
} |
| 759 |
|
| 760 |
let mut sorted_values = values.to_vec(); |
| 761 |
sorted_values.sort_by(|a, b| a.partial_cmp(b).unwrap()); |
| 762 |
|
| 763 |
let index = (percentile * (sorted_values.len() - 1) as f32).round() as usize; |
| 764 |
sorted_values[index.min(sorted_values.len() - 1)] |
| 765 |
} |
| 766 |
|
| 767 |
fn calculate_trend_direction(&self, values: &[f32]) -> TrendDirection { |
| 768 |
if values.len() < 2 { |
| 769 |
return TrendDirection::Stable; |
| 770 |
} |
| 771 |
|
| 772 |
let slope = self.calculate_rate_of_change(values); |
| 773 |
|
| 774 |
if slope > 0.05 { TrendDirection::StronglyImproving } |
| 775 |
else if slope > 0.02 { TrendDirection::Improving } |
| 776 |
else if slope > -0.02 { TrendDirection::Stable } |
| 777 |
else if slope > -0.05 { TrendDirection::Declining } |
| 778 |
else { TrendDirection::StronglyDeclining } |
| 779 |
} |
| 780 |
|
| 781 |
fn calculate_trend_confidence(&self, values: &[f32]) -> f32 { |
| 782 |
if values.len() < 3 { |
| 783 |
return 0.1; |
| 784 |
} |
| 785 |
|
| 786 |
let mean = values.iter().sum::<f32>() / values.len() as f32; |
| 787 |
let variance = values.iter() |
| 788 |
.map(|&x| (x - mean).powi(2)) |
| 789 |
.sum::<f32>() / values.len() as f32; |
| 790 |
|
| 791 |
1.0 / (1.0 + variance * 10.0) |
| 792 |
} |
| 793 |
|
| 794 |
fn calculate_rate_of_change(&self, values: &[f32]) -> f32 { |
| 795 |
if values.len() < 2 { |
| 796 |
return 0.0; |
| 797 |
} |
| 798 |
|
| 799 |
let first = values.last().copied().unwrap_or(0.0); |
| 800 |
let last = values.first().copied().unwrap_or(0.0); |
| 801 |
|
| 802 |
(last - first) / values.len() as f32 |
| 803 |
} |
| 804 |
|
| 805 |
fn generate_trend_predictions(&self, _values: &[f32]) -> Vec<PredictedIssue> { |
| 806 |
// Placeholder: Would generate predictions based on trend analysis |
| 807 |
Vec::new() |
| 808 |
} |
| 809 |
} |
| 810 |
|
| 811 |
impl Default for AlertThresholds { |
| 812 |
fn default() -> Self { |
| 813 |
Self { |
| 814 |
node_failure_threshold: 0.1, // 10% node failures trigger alert |
| 815 |
network_latency_threshold: Duration::from_millis(500), |
| 816 |
storage_utilization_threshold: 0.85, // 85% storage usage |
| 817 |
error_rate_threshold: 0.05, // 5% error rate |
| 818 |
uptime_threshold: 0.95, // 95% uptime required |
| 819 |
redundancy_threshold: 2.0, // Minimum 2x redundancy |
| 820 |
} |
| 821 |
} |
| 822 |
} |
| 823 |
|
| 824 |
impl Default for HealthTrend { |
| 825 |
fn default() -> Self { |
| 826 |
Self { |
| 827 |
direction: TrendDirection::Stable, |
| 828 |
confidence: 0.5, |
| 829 |
time_window: Duration::from_secs(3600), |
| 830 |
key_indicators: Vec::new(), |
| 831 |
predicted_issues: Vec::new(), |
| 832 |
} |
| 833 |
} |
| 834 |
} |