@@ -4,6 +4,7 @@ |
| 4 | 4 | |
| 5 | 5 | use std::path::PathBuf; |
| 6 | 6 | use std::process::{Child, Command}; |
| 7 | +use std::time::{Duration, Instant}; |
| 7 | 8 | |
| 8 | 9 | /// RGBA color for border configuration. |
| 9 | 10 | #[derive(Debug, Clone, Copy)] |
@@ -68,8 +69,22 @@ pub struct BorderManager { |
| 68 | 69 | pub unfocused_color: BorderColor, |
| 69 | 70 | pub radius: f64, |
| 70 | 71 | child: Option<Child>, |
| 72 | + /// Timestamps of recent automatic respawns (oldest first). Used to |
| 73 | + /// rate-limit the watchdog so a permanently-broken ers backs off |
| 74 | + /// instead of fork-bombing the system. |
| 75 | + recent_restarts: Vec<Instant>, |
| 76 | + /// When set, suppress further respawn attempts until this time. |
| 77 | + backoff_until: Option<Instant>, |
| 78 | + /// Last time `health_check` ran try_wait on the child. Throttles the |
| 79 | + /// poll to ~1Hz from the 50ms run-loop tick. |
| 80 | + last_health_check: Option<Instant>, |
| 71 | 81 | } |
| 72 | 82 | |
/// Length of the sliding window over which automatic ers respawns are counted.
const RESTART_WINDOW: Duration = Duration::from_secs(60);
/// Number of automatic respawns tolerated inside one `RESTART_WINDOW` before
/// the watchdog stops respawning and starts a cooldown.
const RESTART_BUDGET: usize = 5;
/// Length of the cooldown (five minutes) that begins once the respawn budget
/// has been used up.
const BACKOFF_DURATION: Duration = Duration::from_secs(5 * 60);
/// Minimum spacing between two child-process polls made by `health_check`.
const HEALTH_CHECK_INTERVAL: Duration = Duration::from_secs(1);
| 87 | + |
| 73 | 88 | impl BorderManager { |
| 74 | 89 | #[allow(clippy::new_without_default)] |
| 75 | 90 | pub fn new() -> Self { |
@@ -79,6 +94,9 @@ impl BorderManager { |
| 79 | 94 | unfocused_color: BorderColor::from_hex("#2d2d2d"), |
| 80 | 95 | radius: 10.0, |
| 81 | 96 | child: None, |
| 97 | + recent_restarts: Vec::new(), |
| 98 | + backoff_until: None, |
| 99 | + last_health_check: None, |
| 82 | 100 | } |
| 83 | 101 | } |
| 84 | 102 | |
@@ -139,8 +157,81 @@ impl BorderManager { |
| 139 | 157 | self.child = None; |
| 140 | 158 | } |
| 141 | 159 | |
| 142 | | - /// Restart ers with current settings (used on config reload). |
| 160 | + /// Restart ers with current settings (used on config reload). Resets |
| 161 | + /// the watchdog's backoff so a deliberate user reload always tries |
| 162 | + /// to spawn fresh. |
| 143 | 163 | pub fn restart(&mut self) { |
| 164 | + self.recent_restarts.clear(); |
| 165 | + self.backoff_until = None; |
| 166 | + self.spawn(); |
| 167 | + } |
| 168 | + |
| 169 | + /// Periodically reap the ers child and respawn it if it died, so a |
| 170 | + /// crash inside the renderer doesn't strand tarmac with no borders. |
| 171 | + /// Throttled internally so callers can invoke from the run-loop tick. |
| 172 | + /// Restarts are budgeted: more than `RESTART_BUDGET` restarts inside |
| 173 | + /// `RESTART_WINDOW` triggers a `BACKOFF_DURATION` cooldown, after |
| 174 | + /// which the watchdog tries once more. |
| 175 | + pub fn health_check(&mut self) { |
| 176 | + if !self.is_enabled() { |
| 177 | + return; |
| 178 | + } |
| 179 | + let now = Instant::now(); |
| 180 | + if let Some(prev) = self.last_health_check |
| 181 | + && now.duration_since(prev) < HEALTH_CHECK_INTERVAL |
| 182 | + { |
| 183 | + return; |
| 184 | + } |
| 185 | + self.last_health_check = Some(now); |
| 186 | + |
| 187 | + let died = match self.child.as_mut() { |
| 188 | + Some(child) => match child.try_wait() { |
| 189 | + Ok(Some(status)) => Some(status), |
| 190 | + Ok(None) => None, |
| 191 | + Err(e) => { |
| 192 | + tracing::warn!(err = %e, "ers try_wait failed"); |
| 193 | + None |
| 194 | + } |
| 195 | + }, |
| 196 | + None => None, |
| 197 | + }; |
| 198 | + |
| 199 | + let Some(status) = died else { |
| 200 | + return; |
| 201 | + }; |
| 202 | + // Drop the dead handle so kill()/spawn() don't try to wait on it |
| 203 | + // again. |
| 204 | + self.child = None; |
| 205 | + |
| 206 | + if let Some(until) = self.backoff_until |
| 207 | + && now < until |
| 208 | + { |
| 209 | + tracing::warn!( |
| 210 | + ?status, |
| 211 | + remaining_secs = (until - now).as_secs(), |
| 212 | + "ers exited; respawn suppressed by backoff" |
| 213 | + ); |
| 214 | + return; |
| 215 | + } |
| 216 | + self.backoff_until = None; |
| 217 | + |
| 218 | + // Sliding-window rate limit. |
| 219 | + let cutoff = now - RESTART_WINDOW; |
| 220 | + self.recent_restarts.retain(|t| *t >= cutoff); |
| 221 | + if self.recent_restarts.len() >= RESTART_BUDGET { |
| 222 | + tracing::error!( |
| 223 | + ?status, |
| 224 | + budget = RESTART_BUDGET, |
| 225 | + window_secs = RESTART_WINDOW.as_secs(), |
| 226 | + "ers crashed too many times; backing off" |
| 227 | + ); |
| 228 | + self.backoff_until = Some(now + BACKOFF_DURATION); |
| 229 | + self.recent_restarts.clear(); |
| 230 | + return; |
| 231 | + } |
| 232 | + |
| 233 | + tracing::warn!(?status, "ers exited unexpectedly; respawning"); |
| 234 | + self.recent_restarts.push(now); |
| 144 | 235 | self.spawn(); |
| 145 | 236 | } |
| 146 | 237 | |