| 1 |
//! Predictive Replication Module |
| 2 |
//! |
| 3 |
//! Machine learning-based node failure prediction and proactive data migration |
| 4 |
|
| 5 |
use serde::{Deserialize, Serialize}; |
| 6 |
use std::collections::HashMap; |
| 7 |
use tokio::time::Duration; |
| 8 |
|
| 9 |
/// One telemetry snapshot for a single storage node — the record the failure
/// predictor trains on and scores.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct NodeMetrics {
    /// Identifier of the node this snapshot describes.
    pub node_id: String,
    /// Availability of the node; compared against fractional thresholds
    /// (e.g. `< 0.95`), so presumably a 0.0–1.0 fraction — TODO confirm producers.
    pub uptime_percentage: f32,
    /// Round-trip latency of the node's recent responses; above 1 s it is
    /// flagged as a high-latency risk factor.
    pub response_latency: Duration,
    /// Storage fill level; above 0.9 it is flagged as storage exhaustion,
    /// so presumably a 0.0–1.0 fraction.
    pub storage_usage: f32,
    /// Share of the node's bandwidth currently in use.
    pub bandwidth_utilization: f32,
    /// Observed error rate; above 0.05 it is treated as frequent disconnections.
    pub error_rate: f32,
    /// When the node last failed, if it ever has.
    pub last_failure: Option<crate::SerializableInstant>,
    /// Physical-machine health indicators.
    pub hardware_health: HardwareHealth,
    /// Location-related risk indicators.
    pub geographic_risk: GeographicRisk,
    /// Connectivity-quality indicators.
    pub network_stability: NetworkStability,
}
| 22 |
|
| 23 |
/// Physical-machine health indicators for a node.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct HardwareHealth {
    /// CPU temperature; scored against 70/80/90 thresholds, so presumably
    /// degrees Celsius — TODO confirm the unit at the producer.
    pub cpu_temperature: f32,
    /// Disk health score; below 0.7 counts as hardware deterioration.
    pub disk_health_score: f32,
    /// Count of detected memory errors; counts above 5 and 10 progressively
    /// lower the hardware score.
    pub memory_errors: u32,
    /// Power-supply stability score, averaged directly into the hardware score.
    pub power_stability: f32,
}
| 30 |
|
| 31 |
/// Location-related risk indicators for a node's site.
///
/// All four fields are averaged into a single geographic score; the two
/// "risk" oriented values are inverted first where needed (see
/// `calculate_geographic_risk_score`).
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct GeographicRisk {
    /// Likelihood of natural disasters at the site; above 0.6 it is flagged
    /// as geographic instability.
    pub natural_disaster_risk: f32,
    /// Political stability; below 0.4 it is flagged as geographic instability.
    pub political_stability: f32,
    /// Quality of local infrastructure (higher is better).
    pub infrastructure_quality: f32,
    /// Redundancy of network connectivity at the site (higher is better).
    pub connectivity_redundancy: f32,
}
| 38 |
|
| 39 |
/// Connectivity-quality indicators for a node.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct NetworkStability {
    /// Number of recent connection drops; counts above 5 and 10 progressively
    /// lower the stability score.
    pub connection_drops: u32,
    /// Current peer count; fewer than 3 or 8 peers lowers the score.
    pub peer_count: u32,
    /// Routing efficiency (higher is better), averaged into the score as-is.
    pub routing_efficiency: f32,
    /// Network congestion level; above 0.8 it is flagged as a congestion
    /// risk factor.
    pub congestion_level: f32,
}
| 46 |
|
| 47 |
/// Output of `MLPredictor::predict_node_failure`: the assessed failure risk
/// for one node plus suggested mitigations.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct FailurePrediction {
    /// Node this prediction applies to.
    pub node_id: String,
    /// Model output in [0.0, 1.0]; predictions below 0.1 are suppressed
    /// entirely by the predictor.
    pub failure_probability: f32,
    /// Rough failure ETA bucketed by probability (1 h / 2 h / 4 h);
    /// always `Some` for emitted predictions in the current implementation.
    pub predicted_failure_time: Option<crate::SerializableInstant>,
    /// Model accuracy discounted by feature spread — a trust indicator for
    /// `failure_probability`.
    pub confidence_score: f32,
    /// Threshold-based signals that contributed to the risk.
    pub risk_factors: Vec<RiskFactor>,
    /// Deduplicated, sorted list of suggested mitigations.
    pub recommended_actions: Vec<RecommendedAction>,
}
| 56 |
|
| 57 |
#[derive(Debug, Clone, Serialize, Deserialize)] |
| 58 |
pub enum RiskFactor { |
| 59 |
HighLatency, |
| 60 |
FrequentDisconnections, |
| 61 |
StorageExhaustion, |
| 62 |
HardwareDeterioration, |
| 63 |
NetworkCongestion, |
| 64 |
GeographicInstability, |
| 65 |
PerformanceDegradation, |
| 66 |
} |
| 67 |
|
| 68 |
/// Mitigations suggested alongside a failure prediction.
///
/// `PartialEq`/`Eq`/`PartialOrd`/`Ord` are required: `generate_recommendations`
/// sorts and dedups the action list.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq, PartialOrd, Ord)]
pub enum RecommendedAction {
    /// Evacuate the node's chunks right away (probability > 0.8).
    MigrateChunksImmediately,
    /// Add replicas elsewhere before the node degrades further.
    IncreaseRedundancy,
    /// Plan maintenance for the node.
    ScheduleMaintenance,
    /// Shed load from the node (storage exhaustion / congestion).
    ReduceLoad,
    /// Watch the node's metrics more closely.
    MonitorClosely,
    /// Prepare failover paths for the node's data.
    PrepareFailover,
}
| 77 |
|
| 78 |
/// Logistic-regression-style predictor of node failures, trained online from
/// the incoming telemetry stream.
pub struct MLPredictor {
    /// Recent metric samples per node id (trimmed in `update_node_metrics`).
    node_history: HashMap<String, Vec<NodeMetrics>>,
    /// One trained model per node id, created/refreshed by `retrain_model`.
    prediction_models: HashMap<String, PredictionModel>,
    /// Global feature weighting.
    /// NOTE(review): initialized in `new()` but never read by the prediction
    /// or training path — dead field or reserved for future use? Verify.
    feature_weights: FeatureWeights,
    /// Labelled (features, failed?) examples shared across all node models.
    training_data: Vec<TrainingExample>,
}
| 84 |
|
| 85 |
/// A per-node logistic model: `sigmoid(weights · features + bias)`.
#[derive(Debug, Clone)]
struct PredictionModel {
    /// One weight per feature produced by `extract_features` (8 of them).
    weights: Vec<f32>,
    /// Intercept term.
    bias: f32,
    /// Fraction of training examples classified correctly at threshold 0.5.
    /// NOTE(review): measured on the training set itself, not a held-out set.
    accuracy: f32,
    /// When this model was last (re)trained.
    last_updated: crate::SerializableInstant,
}
| 92 |
|
| 93 |
/// Relative importance of each feature family.
///
/// NOTE(review): currently only constructed via `Default` and stored on
/// `MLPredictor`; no code in this file reads the individual weights.
#[derive(Debug, Clone)]
struct FeatureWeights {
    // Availability-related signals (largest default weights).
    uptime: f32,
    latency: f32,
    // Resource-pressure signals.
    storage: f32,
    bandwidth: f32,
    error_rate: f32,
    // Environmental signals (smallest default weights).
    hardware_health: f32,
    geographic_risk: f32,
    network_stability: f32,
}
| 104 |
|
| 105 |
/// One labelled observation for model fitting.
#[derive(Debug, Clone)]
struct TrainingExample {
    /// Feature vector from `extract_features` (expected length 8).
    features: Vec<f32>,
    /// Label: true if the node failed right after these features were observed.
    outcome: bool,
    /// When the example was recorded.
    timestamp: crate::SerializableInstant,
}
| 111 |
|
| 112 |
impl MLPredictor { |
| 113 |
pub fn new() -> Self { |
| 114 |
Self { |
| 115 |
node_history: HashMap::new(), |
| 116 |
prediction_models: HashMap::new(), |
| 117 |
feature_weights: FeatureWeights::default(), |
| 118 |
training_data: Vec::new(), |
| 119 |
} |
| 120 |
} |
| 121 |
|
| 122 |
pub async fn update_node_metrics(&mut self, metrics: NodeMetrics) { |
| 123 |
let node_id = metrics.node_id.clone(); |
| 124 |
let history = self.node_history.entry(node_id.clone()).or_insert_with(Vec::new); |
| 125 |
|
| 126 |
history.push(metrics.clone()); |
| 127 |
|
| 128 |
// Keep only last 1000 data points per node |
| 129 |
if history.len() > 1000 { |
| 130 |
history.drain(0..history.len() - 1000); |
| 131 |
} |
| 132 |
|
| 133 |
// Update training data based on actual failures |
| 134 |
if let Some(last_metrics) = history.get(history.len().saturating_sub(2)) { |
| 135 |
if self.detect_failure_transition(last_metrics, &metrics) { |
| 136 |
let features = self.extract_features(last_metrics); |
| 137 |
self.training_data.push(TrainingExample { |
| 138 |
features, |
| 139 |
outcome: true, |
| 140 |
timestamp: crate::SerializableInstant::now(), |
| 141 |
}); |
| 142 |
} |
| 143 |
} |
| 144 |
|
| 145 |
// Retrain model periodically |
| 146 |
if history.len() % 100 == 0 { |
| 147 |
self.retrain_model(&node_id).await; |
| 148 |
} |
| 149 |
} |
| 150 |
|
| 151 |
pub async fn predict_node_failure(&self, node_id: &str) -> Option<FailurePrediction> { |
| 152 |
let history = self.node_history.get(node_id)?; |
| 153 |
let latest_metrics = history.last()?; |
| 154 |
let model = self.prediction_models.get(node_id)?; |
| 155 |
|
| 156 |
let features = self.extract_features(latest_metrics); |
| 157 |
let failure_probability = self.calculate_failure_probability(&features, model); |
| 158 |
|
| 159 |
if failure_probability < 0.1 { |
| 160 |
return None; // Low risk, no prediction needed |
| 161 |
} |
| 162 |
|
| 163 |
let confidence_score = self.calculate_confidence(&features, model); |
| 164 |
let risk_factors = self.identify_risk_factors(latest_metrics); |
| 165 |
let recommended_actions = self.generate_recommendations(failure_probability, &risk_factors); |
| 166 |
|
| 167 |
let predicted_failure_time = if failure_probability > 0.7 { |
| 168 |
Some(crate::SerializableInstant::now() + Duration::from_secs(3600)) // 1 hour |
| 169 |
} else if failure_probability > 0.5 { |
| 170 |
Some(crate::SerializableInstant::now() + Duration::from_secs(7200)) // 2 hours |
| 171 |
} else { |
| 172 |
Some(crate::SerializableInstant::now() + Duration::from_secs(14400)) // 4 hours |
| 173 |
}; |
| 174 |
|
| 175 |
Some(FailurePrediction { |
| 176 |
node_id: node_id.to_string(), |
| 177 |
failure_probability, |
| 178 |
predicted_failure_time, |
| 179 |
confidence_score, |
| 180 |
risk_factors, |
| 181 |
recommended_actions, |
| 182 |
}) |
| 183 |
} |
| 184 |
|
| 185 |
pub async fn get_high_risk_nodes(&self) -> Vec<FailurePrediction> { |
| 186 |
let mut high_risk = Vec::new(); |
| 187 |
|
| 188 |
for node_id in self.node_history.keys() { |
| 189 |
if let Some(prediction) = self.predict_node_failure(node_id).await { |
| 190 |
if prediction.failure_probability > 0.3 { |
| 191 |
high_risk.push(prediction); |
| 192 |
} |
| 193 |
} |
| 194 |
} |
| 195 |
|
| 196 |
// Sort by failure probability (highest first) |
| 197 |
high_risk.sort_by(|a, b| b.failure_probability.partial_cmp(&a.failure_probability).unwrap()); |
| 198 |
high_risk |
| 199 |
} |
| 200 |
|
| 201 |
fn extract_features(&self, metrics: &NodeMetrics) -> Vec<f32> { |
| 202 |
vec![ |
| 203 |
metrics.uptime_percentage, |
| 204 |
metrics.response_latency.as_millis() as f32, |
| 205 |
metrics.storage_usage, |
| 206 |
metrics.bandwidth_utilization, |
| 207 |
metrics.error_rate, |
| 208 |
self.calculate_hardware_score(&metrics.hardware_health), |
| 209 |
self.calculate_geographic_risk_score(&metrics.geographic_risk), |
| 210 |
self.calculate_network_stability_score(&metrics.network_stability), |
| 211 |
] |
| 212 |
} |
| 213 |
|
| 214 |
fn calculate_failure_probability(&self, features: &[f32], model: &PredictionModel) -> f32 { |
| 215 |
let mut score = model.bias; |
| 216 |
for (feature, weight) in features.iter().zip(model.weights.iter()) { |
| 217 |
score += feature * weight; |
| 218 |
} |
| 219 |
|
| 220 |
// Sigmoid activation |
| 221 |
1.0 / (1.0 + (-score).exp()) |
| 222 |
} |
| 223 |
|
| 224 |
fn calculate_confidence(&self, features: &[f32], model: &PredictionModel) -> f32 { |
| 225 |
// Confidence based on feature consistency and model accuracy |
| 226 |
let feature_variance = self.calculate_feature_variance(features); |
| 227 |
let base_confidence = model.accuracy; |
| 228 |
|
| 229 |
// Higher variance reduces confidence |
| 230 |
base_confidence * (1.0 - feature_variance.min(0.5)) |
| 231 |
} |
| 232 |
|
| 233 |
fn calculate_feature_variance(&self, features: &[f32]) -> f32 { |
| 234 |
if features.is_empty() { |
| 235 |
return 0.0; |
| 236 |
} |
| 237 |
|
| 238 |
let mean: f32 = features.iter().sum::<f32>() / features.len() as f32; |
| 239 |
let variance: f32 = features.iter() |
| 240 |
.map(|x| (x - mean).powi(2)) |
| 241 |
.sum::<f32>() / features.len() as f32; |
| 242 |
|
| 243 |
variance.sqrt() |
| 244 |
} |
| 245 |
|
| 246 |
fn identify_risk_factors(&self, metrics: &NodeMetrics) -> Vec<RiskFactor> { |
| 247 |
let mut factors = Vec::new(); |
| 248 |
|
| 249 |
if metrics.response_latency > Duration::from_millis(1000) { |
| 250 |
factors.push(RiskFactor::HighLatency); |
| 251 |
} |
| 252 |
|
| 253 |
if metrics.error_rate > 0.05 { |
| 254 |
factors.push(RiskFactor::FrequentDisconnections); |
| 255 |
} |
| 256 |
|
| 257 |
if metrics.storage_usage > 0.9 { |
| 258 |
factors.push(RiskFactor::StorageExhaustion); |
| 259 |
} |
| 260 |
|
| 261 |
if metrics.hardware_health.disk_health_score < 0.7 |
| 262 |
|| metrics.hardware_health.cpu_temperature > 80.0 { |
| 263 |
factors.push(RiskFactor::HardwareDeterioration); |
| 264 |
} |
| 265 |
|
| 266 |
if metrics.network_stability.congestion_level > 0.8 { |
| 267 |
factors.push(RiskFactor::NetworkCongestion); |
| 268 |
} |
| 269 |
|
| 270 |
if metrics.geographic_risk.natural_disaster_risk > 0.6 |
| 271 |
|| metrics.geographic_risk.political_stability < 0.4 { |
| 272 |
factors.push(RiskFactor::GeographicInstability); |
| 273 |
} |
| 274 |
|
| 275 |
if metrics.uptime_percentage < 0.95 && metrics.bandwidth_utilization < 0.3 { |
| 276 |
factors.push(RiskFactor::PerformanceDegradation); |
| 277 |
} |
| 278 |
|
| 279 |
factors |
| 280 |
} |
| 281 |
|
| 282 |
fn generate_recommendations( |
| 283 |
&self, |
| 284 |
failure_probability: f32, |
| 285 |
risk_factors: &[RiskFactor] |
| 286 |
) -> Vec<RecommendedAction> { |
| 287 |
let mut actions = Vec::new(); |
| 288 |
|
| 289 |
if failure_probability > 0.8 { |
| 290 |
actions.push(RecommendedAction::MigrateChunksImmediately); |
| 291 |
actions.push(RecommendedAction::PrepareFailover); |
| 292 |
} else if failure_probability > 0.6 { |
| 293 |
actions.push(RecommendedAction::IncreaseRedundancy); |
| 294 |
actions.push(RecommendedAction::MonitorClosely); |
| 295 |
} else if failure_probability > 0.4 { |
| 296 |
actions.push(RecommendedAction::ScheduleMaintenance); |
| 297 |
} |
| 298 |
|
| 299 |
for factor in risk_factors { |
| 300 |
match factor { |
| 301 |
RiskFactor::StorageExhaustion => { |
| 302 |
actions.push(RecommendedAction::ReduceLoad); |
| 303 |
} |
| 304 |
RiskFactor::HardwareDeterioration => { |
| 305 |
actions.push(RecommendedAction::ScheduleMaintenance); |
| 306 |
} |
| 307 |
RiskFactor::NetworkCongestion => { |
| 308 |
actions.push(RecommendedAction::ReduceLoad); |
| 309 |
} |
| 310 |
_ => {} |
| 311 |
} |
| 312 |
} |
| 313 |
|
| 314 |
actions.sort(); |
| 315 |
actions.dedup(); |
| 316 |
actions |
| 317 |
} |
| 318 |
|
| 319 |
async fn retrain_model(&mut self, node_id: &str) { |
| 320 |
if let Some(history) = self.node_history.get(node_id) { |
| 321 |
if history.len() < 50 { |
| 322 |
return; // Need more data |
| 323 |
} |
| 324 |
|
| 325 |
let mut model = PredictionModel { |
| 326 |
weights: vec![0.1; 8], // Initialize with small weights |
| 327 |
bias: 0.0, |
| 328 |
accuracy: 0.5, |
| 329 |
last_updated: crate::SerializableInstant::now(), |
| 330 |
}; |
| 331 |
|
| 332 |
// Simple gradient descent training |
| 333 |
let learning_rate = 0.01; |
| 334 |
let epochs = 100; |
| 335 |
|
| 336 |
for _ in 0..epochs { |
| 337 |
for example in &self.training_data { |
| 338 |
if example.features.len() == 8 { |
| 339 |
let prediction = self.calculate_failure_probability(&example.features, &model); |
| 340 |
let target = if example.outcome { 1.0 } else { 0.0 }; |
| 341 |
let error = prediction - target; |
| 342 |
|
| 343 |
// Update weights |
| 344 |
for (i, feature) in example.features.iter().enumerate() { |
| 345 |
model.weights[i] -= learning_rate * error * feature; |
| 346 |
} |
| 347 |
model.bias -= learning_rate * error; |
| 348 |
} |
| 349 |
} |
| 350 |
} |
| 351 |
|
| 352 |
// Calculate accuracy on validation set |
| 353 |
let mut correct = 0; |
| 354 |
let mut total = 0; |
| 355 |
for example in &self.training_data { |
| 356 |
if example.features.len() == 8 { |
| 357 |
let prediction = self.calculate_failure_probability(&example.features, &model); |
| 358 |
let predicted_outcome = prediction > 0.5; |
| 359 |
if predicted_outcome == example.outcome { |
| 360 |
correct += 1; |
| 361 |
} |
| 362 |
total += 1; |
| 363 |
} |
| 364 |
} |
| 365 |
|
| 366 |
if total > 0 { |
| 367 |
model.accuracy = correct as f32 / total as f32; |
| 368 |
} |
| 369 |
|
| 370 |
self.prediction_models.insert(node_id.to_string(), model); |
| 371 |
} |
| 372 |
} |
| 373 |
|
| 374 |
fn detect_failure_transition(&self, previous: &NodeMetrics, current: &NodeMetrics) -> bool { |
| 375 |
// Detect if node has failed based on metrics change |
| 376 |
let uptime_drop = previous.uptime_percentage - current.uptime_percentage; |
| 377 |
let latency_spike = current.response_latency.as_millis() as f32 / |
| 378 |
previous.response_latency.as_millis() as f32; |
| 379 |
let error_increase = current.error_rate / previous.error_rate.max(0.001); |
| 380 |
|
| 381 |
uptime_drop > 0.2 || latency_spike > 2.0 || error_increase > 3.0 |
| 382 |
} |
| 383 |
|
| 384 |
fn calculate_hardware_score(&self, health: &HardwareHealth) -> f32 { |
| 385 |
let temp_score = if health.cpu_temperature > 90.0 { 0.0 } |
| 386 |
else if health.cpu_temperature > 80.0 { 0.3 } |
| 387 |
else if health.cpu_temperature > 70.0 { 0.7 } |
| 388 |
else { 1.0 }; |
| 389 |
|
| 390 |
let disk_score = health.disk_health_score; |
| 391 |
let memory_score = if health.memory_errors > 10 { 0.2 } |
| 392 |
else if health.memory_errors > 5 { 0.6 } |
| 393 |
else { 1.0 }; |
| 394 |
let power_score = health.power_stability; |
| 395 |
|
| 396 |
(temp_score + disk_score + memory_score + power_score) / 4.0 |
| 397 |
} |
| 398 |
|
| 399 |
fn calculate_geographic_risk_score(&self, risk: &GeographicRisk) -> f32 { |
| 400 |
let disaster_score = 1.0 - risk.natural_disaster_risk; |
| 401 |
let political_score = risk.political_stability; |
| 402 |
let infrastructure_score = risk.infrastructure_quality; |
| 403 |
let connectivity_score = risk.connectivity_redundancy; |
| 404 |
|
| 405 |
(disaster_score + political_score + infrastructure_score + connectivity_score) / 4.0 |
| 406 |
} |
| 407 |
|
| 408 |
fn calculate_network_stability_score(&self, stability: &NetworkStability) -> f32 { |
| 409 |
let connection_score = if stability.connection_drops > 10 { 0.2 } |
| 410 |
else if stability.connection_drops > 5 { 0.6 } |
| 411 |
else { 1.0 }; |
| 412 |
|
| 413 |
let peer_score = if stability.peer_count < 3 { 0.3 } |
| 414 |
else if stability.peer_count < 8 { 0.7 } |
| 415 |
else { 1.0 }; |
| 416 |
|
| 417 |
let routing_score = stability.routing_efficiency; |
| 418 |
let congestion_score = 1.0 - stability.congestion_level; |
| 419 |
|
| 420 |
(connection_score + peer_score + routing_score + congestion_score) / 4.0 |
| 421 |
} |
| 422 |
} |
| 423 |
|
| 424 |
impl Default for FeatureWeights { |
| 425 |
fn default() -> Self { |
| 426 |
Self { |
| 427 |
uptime: 0.25, |
| 428 |
latency: 0.20, |
| 429 |
storage: 0.15, |
| 430 |
bandwidth: 0.10, |
| 431 |
error_rate: 0.15, |
| 432 |
hardware_health: 0.05, |
| 433 |
geographic_risk: 0.05, |
| 434 |
network_stability: 0.05, |
| 435 |
} |
| 436 |
} |
| 437 |
} |
| 438 |
|
| 439 |
/// Drives the predict-then-migrate loop: asks the predictor for at-risk
/// nodes and schedules chunk migrations away from them.
pub struct ProactiveReplicationManager {
    /// Failure predictor fed by node telemetry.
    predictor: MLPredictor,
    /// Tracks in-flight migrations, bandwidth budget and target capacities.
    migration_scheduler: MigrationScheduler,
    /// Pending migration tasks, kept sorted highest-priority first.
    chunk_priority_queue: Vec<ChunkMigrationTask>,
}
| 444 |
|
| 445 |
/// A single planned chunk move away from at-risk node(s).
#[derive(Debug, Clone)]
struct ChunkMigrationTask {
    /// Chunk to move.
    chunk_id: String,
    /// Nodes the chunk is being evacuated from.
    source_nodes: Vec<String>,
    /// Replacement nodes chosen by `select_migration_targets`.
    target_nodes: Vec<String>,
    /// 1–10, higher is more urgent (10 = emergency evacuation).
    priority: u8,
    /// Latest acceptable completion time; used as a sort tie-breaker.
    deadline: crate::SerializableInstant,
    /// Expected transfer duration, also used for bandwidth estimation.
    estimated_transfer_time: Duration,
}
| 454 |
|
| 455 |
/// Bookkeeping for in-flight migrations and the resources they may consume.
struct MigrationScheduler {
    /// In-flight migrations keyed by chunk id (also used to prevent
    /// double-migrating the same chunk).
    active_migrations: HashMap<String, MigrationProgress>,
    /// Bandwidth split between user traffic and migrations.
    bandwidth_budget: BandwidthBudget,
    /// Known capacity/reliability per candidate target node.
    node_capacities: HashMap<String, NodeCapacity>,
}
| 460 |
|
| 461 |
/// Progress record for one in-flight chunk migration.
#[derive(Debug, Clone)]
struct MigrationProgress {
    /// The task being executed.
    task: ChunkMigrationTask,
    /// Bytes moved so far.
    bytes_transferred: u64,
    /// Total bytes to move (currently a fixed 1 MiB estimate at start).
    total_bytes: u64,
    /// When the transfer started.
    start_time: crate::SerializableInstant,
    /// Start time plus the task's estimated transfer duration.
    estimated_completion: crate::SerializableInstant,
}
| 469 |
|
| 470 |
/// Bandwidth accounting, all in bytes per second.
///
/// Invariant (as constructed in `MigrationScheduler::new`):
/// `reserved_for_users + available_for_migration == total_available`.
#[derive(Debug, Clone)]
struct BandwidthBudget {
    /// Total link capacity.
    total_available: u64,
    /// Portion reserved for user-facing traffic.
    reserved_for_users: u64,
    /// Portion migrations may consume per scheduling cycle.
    available_for_migration: u64,
    /// Currently consumed migration bandwidth.
    /// NOTE(review): never updated in this file — verify who maintains it.
    current_usage: u64,
}
| 477 |
|
| 478 |
/// Capacity/health snapshot of a candidate migration target.
#[derive(Debug, Clone)]
struct NodeCapacity {
    /// Free storage in bytes.
    storage_available: u64,
    /// Node link capacity in bytes per second.
    bandwidth_capacity: u64,
    /// Current load; nodes above 0.8 are not selected as targets and above
    /// 0.85 block migration start.
    current_load: f32,
    /// Historical reliability; targets require > 0.7 and are ranked by it.
    reliability_score: f32,
}
| 485 |
|
| 486 |
impl ProactiveReplicationManager { |
| 487 |
pub fn new() -> Self { |
| 488 |
Self { |
| 489 |
predictor: MLPredictor::new(), |
| 490 |
migration_scheduler: MigrationScheduler::new(), |
| 491 |
chunk_priority_queue: Vec::new(), |
| 492 |
} |
| 493 |
} |
| 494 |
|
| 495 |
pub async fn analyze_and_migrate(&mut self) -> Result<(), Box<dyn std::error::Error>> { |
| 496 |
// Get high-risk nodes |
| 497 |
let high_risk_nodes = self.predictor.get_high_risk_nodes().await; |
| 498 |
|
| 499 |
for prediction in high_risk_nodes { |
| 500 |
if prediction.failure_probability > 0.5 { |
| 501 |
self.schedule_emergency_migration(&prediction.node_id, prediction.failure_probability).await?; |
| 502 |
} else if prediction.failure_probability > 0.3 { |
| 503 |
self.schedule_preemptive_migration(&prediction.node_id, prediction.failure_probability).await?; |
| 504 |
} |
| 505 |
} |
| 506 |
|
| 507 |
// Execute scheduled migrations |
| 508 |
self.execute_migration_queue().await?; |
| 509 |
|
| 510 |
Ok(()) |
| 511 |
} |
| 512 |
|
| 513 |
async fn schedule_emergency_migration(&mut self, node_id: &str, risk: f32) -> Result<(), Box<dyn std::error::Error>> { |
| 514 |
let chunks = self.get_chunks_on_node(node_id).await?; |
| 515 |
|
| 516 |
for chunk_id in chunks { |
| 517 |
let task = ChunkMigrationTask { |
| 518 |
chunk_id, |
| 519 |
source_nodes: vec![node_id.to_string()], |
| 520 |
target_nodes: self.select_migration_targets(2, Some(node_id)).await?, |
| 521 |
priority: 10, // Highest priority |
| 522 |
deadline: crate::SerializableInstant::now() + Duration::from_secs(1800), // 30 minutes |
| 523 |
estimated_transfer_time: Duration::from_secs(300), // 5 minutes estimate |
| 524 |
}; |
| 525 |
|
| 526 |
self.chunk_priority_queue.push(task); |
| 527 |
} |
| 528 |
|
| 529 |
// Sort by priority and deadline |
| 530 |
self.chunk_priority_queue.sort_by(|a, b| { |
| 531 |
b.priority.cmp(&a.priority) |
| 532 |
.then_with(|| a.deadline.cmp(&b.deadline)) |
| 533 |
}); |
| 534 |
|
| 535 |
Ok(()) |
| 536 |
} |
| 537 |
|
| 538 |
async fn schedule_preemptive_migration(&mut self, node_id: &str, risk: f32) -> Result<(), Box<dyn std::error::Error>> { |
| 539 |
let chunks = self.get_chunks_on_node(node_id).await?; |
| 540 |
let priority = ((risk - 0.3) * 20.0) as u8 + 3; // Priority 3-7 |
| 541 |
|
| 542 |
for chunk_id in chunks { |
| 543 |
let task = ChunkMigrationTask { |
| 544 |
chunk_id, |
| 545 |
source_nodes: vec![node_id.to_string()], |
| 546 |
target_nodes: self.select_migration_targets(1, Some(node_id)).await?, |
| 547 |
priority, |
| 548 |
deadline: crate::SerializableInstant::now() + Duration::from_secs(7200), // 2 hours |
| 549 |
estimated_transfer_time: Duration::from_secs(600), // 10 minutes estimate |
| 550 |
}; |
| 551 |
|
| 552 |
self.chunk_priority_queue.push(task); |
| 553 |
} |
| 554 |
|
| 555 |
self.chunk_priority_queue.sort_by(|a, b| { |
| 556 |
b.priority.cmp(&a.priority) |
| 557 |
.then_with(|| a.deadline.cmp(&b.deadline)) |
| 558 |
}); |
| 559 |
|
| 560 |
Ok(()) |
| 561 |
} |
| 562 |
|
| 563 |
async fn execute_migration_queue(&mut self) -> Result<(), Box<dyn std::error::Error>> { |
| 564 |
let available_bandwidth = self.migration_scheduler.bandwidth_budget.available_for_migration; |
| 565 |
let mut current_bandwidth_usage = 0u64; |
| 566 |
|
| 567 |
while let Some(task) = self.chunk_priority_queue.pop() { |
| 568 |
if current_bandwidth_usage + self.estimate_bandwidth_usage(&task) > available_bandwidth { |
| 569 |
// Put task back and wait for next cycle |
| 570 |
self.chunk_priority_queue.push(task); |
| 571 |
break; |
| 572 |
} |
| 573 |
|
| 574 |
if self.can_start_migration(&task).await { |
| 575 |
self.start_migration(task).await?; |
| 576 |
current_bandwidth_usage += self.estimate_bandwidth_usage(&self.chunk_priority_queue.last().unwrap()); |
| 577 |
} |
| 578 |
} |
| 579 |
|
| 580 |
Ok(()) |
| 581 |
} |
| 582 |
|
| 583 |
async fn get_chunks_on_node(&self, _node_id: &str) -> Result<Vec<String>, Box<dyn std::error::Error>> { |
| 584 |
// Placeholder: In reality, this would query the chunk index |
| 585 |
Ok(vec!["chunk_1".to_string(), "chunk_2".to_string()]) |
| 586 |
} |
| 587 |
|
| 588 |
async fn select_migration_targets(&self, count: usize, avoid_node: Option<&str>) -> Result<Vec<String>, Box<dyn std::error::Error>> { |
| 589 |
let mut candidates: Vec<_> = self.migration_scheduler.node_capacities.iter() |
| 590 |
.filter(|(node_id, capacity)| { |
| 591 |
if let Some(avoid) = avoid_node { |
| 592 |
*node_id != avoid && capacity.current_load < 0.8 && capacity.reliability_score > 0.7 |
| 593 |
} else { |
| 594 |
capacity.current_load < 0.8 && capacity.reliability_score > 0.7 |
| 595 |
} |
| 596 |
}) |
| 597 |
.collect(); |
| 598 |
|
| 599 |
candidates.sort_by(|a, b| b.1.reliability_score.partial_cmp(&a.1.reliability_score).unwrap()); |
| 600 |
|
| 601 |
Ok(candidates.into_iter() |
| 602 |
.take(count) |
| 603 |
.map(|(node_id, _)| node_id.clone()) |
| 604 |
.collect()) |
| 605 |
} |
| 606 |
|
| 607 |
async fn can_start_migration(&self, task: &ChunkMigrationTask) -> bool { |
| 608 |
// Check if target nodes have capacity |
| 609 |
for target_node in &task.target_nodes { |
| 610 |
if let Some(capacity) = self.migration_scheduler.node_capacities.get(target_node) { |
| 611 |
if capacity.current_load > 0.85 { |
| 612 |
return false; |
| 613 |
} |
| 614 |
} |
| 615 |
} |
| 616 |
|
| 617 |
// Check if we're not already migrating this chunk |
| 618 |
!self.migration_scheduler.active_migrations.contains_key(&task.chunk_id) |
| 619 |
} |
| 620 |
|
| 621 |
async fn start_migration(&mut self, task: ChunkMigrationTask) -> Result<(), Box<dyn std::error::Error>> { |
| 622 |
let progress = MigrationProgress { |
| 623 |
task: task.clone(), |
| 624 |
bytes_transferred: 0, |
| 625 |
total_bytes: 1024 * 1024, // 1MB estimate |
| 626 |
start_time: crate::SerializableInstant::now(), |
| 627 |
estimated_completion: crate::SerializableInstant::now() + task.estimated_transfer_time, |
| 628 |
}; |
| 629 |
|
| 630 |
let chunk_id = task.chunk_id.clone(); |
| 631 |
self.migration_scheduler.active_migrations.insert(task.chunk_id, progress); |
| 632 |
|
| 633 |
// Placeholder: In reality, this would initiate the actual transfer |
| 634 |
println!("Starting migration of chunk {} from {:?} to {:?}", |
| 635 |
chunk_id, task.source_nodes, task.target_nodes); |
| 636 |
|
| 637 |
Ok(()) |
| 638 |
} |
| 639 |
|
| 640 |
fn estimate_bandwidth_usage(&self, task: &ChunkMigrationTask) -> u64 { |
| 641 |
// Estimate based on chunk size and transfer time |
| 642 |
let chunk_size = 1024 * 1024; // 1MB estimate |
| 643 |
let transfer_duration = task.estimated_transfer_time.as_secs().max(1); |
| 644 |
chunk_size / transfer_duration |
| 645 |
} |
| 646 |
} |
| 647 |
|
| 648 |
impl MigrationScheduler { |
| 649 |
fn new() -> Self { |
| 650 |
Self { |
| 651 |
active_migrations: HashMap::new(), |
| 652 |
bandwidth_budget: BandwidthBudget { |
| 653 |
total_available: 100 * 1024 * 1024, // 100 MB/s |
| 654 |
reserved_for_users: 70 * 1024 * 1024, // 70 MB/s for users |
| 655 |
available_for_migration: 30 * 1024 * 1024, // 30 MB/s for migration |
| 656 |
current_usage: 0, |
| 657 |
}, |
| 658 |
node_capacities: HashMap::new(), |
| 659 |
} |
| 660 |
} |
| 661 |
} |