gardesk/tarmac / d8fc3ba

Browse files

Respawn crashed ers from BorderManager watchdog

When ers exits unexpectedly the user is left with no borders and no
indication anything went wrong, since BorderManager just stores the
child handle and never reaps it. Add a health_check called from the
50ms poll-timer tick (throttled to ~1Hz) that try_waits on the child,
respawns on death, and rate-limits to 5 restarts per minute followed
by a 5-minute backoff so a permanently-broken ers doesn't fork-bomb.
Authored by mfwolffe <wolffemf@dukes.jmu.edu>
SHA
d8fc3ba4a7614f7da69f3bd52629c55e54db6a3f
Parents
102c4cf
Tree
7102f77

2 changed files

StatusFile+-
M tarmac/src/main.rs 1 0
M tarmac/src/platform/border.rs 92 1
tarmac/src/main.rsmodified
@@ -428,6 +428,7 @@ unsafe extern "C" fn poll_timer_callback(_timer: *const c_void) {
428428
     WM_STATE.with(|s| {
429429
         if let Some(state) = s.borrow_mut().as_mut() {
430430
             state.process_events();
431
+            state.borders.health_check();
431432
         }
432433
     });
433434
 
tarmac/src/platform/border.rsmodified
@@ -4,6 +4,7 @@
44
 
55
 use std::path::PathBuf;
66
 use std::process::{Child, Command};
7
+use std::time::{Duration, Instant};
78
 
89
 /// RGBA color for border configuration.
910
 #[derive(Debug, Clone, Copy)]
@@ -68,8 +69,22 @@ pub struct BorderManager {
6869
     pub unfocused_color: BorderColor,
6970
     pub radius: f64,
7071
     child: Option<Child>,
72
+    /// Timestamps of recent automatic respawns (oldest first). Used to
73
+    /// rate-limit the watchdog so a permanently-broken ers backs off
74
+    /// instead of fork-bombing the system.
75
+    recent_restarts: Vec<Instant>,
76
+    /// When set, suppress further respawn attempts until this time.
77
+    backoff_until: Option<Instant>,
78
+    /// Last time `health_check` ran try_wait on the child. Throttles the
79
+    /// poll to ~1Hz from the 50ms run-loop tick.
80
+    last_health_check: Option<Instant>,
7181
 }
7282
 
83
+const RESTART_WINDOW: Duration = Duration::from_secs(60); // sliding window over which health_check counts automatic respawns
84
+const RESTART_BUDGET: usize = 5; // respawns allowed inside RESTART_WINDOW before health_check backs off
85
+const BACKOFF_DURATION: Duration = Duration::from_secs(300); // cooldown armed by health_check once the budget is exhausted
86
+const HEALTH_CHECK_INTERVAL: Duration = Duration::from_secs(1); // minimum spacing between try_wait polls in health_check
87
+
7388
 impl BorderManager {
7489
     #[allow(clippy::new_without_default)]
7590
     pub fn new() -> Self {
@@ -79,6 +94,9 @@ impl BorderManager {
7994
             unfocused_color: BorderColor::from_hex("#2d2d2d"),
8095
             radius: 10.0,
8196
             child: None,
97
+            recent_restarts: Vec::new(),
98
+            backoff_until: None,
99
+            last_health_check: None,
82100
         }
83101
     }
84102
 
@@ -139,8 +157,81 @@ impl BorderManager {
139157
         self.child = None;
140158
     }
141159
 
142
-    /// Restart ers with current settings (used on config reload).
160
+    /// Restart ers with current settings (used on config reload). Resets
161
+    /// the watchdog's backoff so a deliberate user reload always tries
162
+    /// to spawn fresh.
143163
     pub fn restart(&mut self) {
164
+        self.recent_restarts.clear();
165
+        self.backoff_until = None;
166
+        self.spawn();
167
+    }
168
+
169
+    /// Periodically reap the ers child and respawn it if it died, so a
170
+    /// crash inside the renderer doesn't strand tarmac with no borders.
171
+    /// Throttled internally so callers can invoke from the run-loop tick.
172
+    /// Restarts are budgeted: more than `RESTART_BUDGET` restarts inside
173
+    /// `RESTART_WINDOW` triggers a `BACKOFF_DURATION` cooldown, after
174
+    /// which the watchdog tries once more.
175
+    pub fn health_check(&mut self) {
176
+        if !self.is_enabled() {
177
+            return;
178
+        }
179
+        let now = Instant::now();
180
+        if let Some(prev) = self.last_health_check
181
+            && now.duration_since(prev) < HEALTH_CHECK_INTERVAL
182
+        {
183
+            return;
184
+        }
185
+        self.last_health_check = Some(now);
186
+
187
+        let died = match self.child.as_mut() {
188
+            Some(child) => match child.try_wait() {
189
+                Ok(Some(status)) => Some(status),
190
+                Ok(None) => None,
191
+                Err(e) => {
192
+                    tracing::warn!(err = %e, "ers try_wait failed");
193
+                    None
194
+                }
195
+            },
196
+            None => None,
197
+        };
198
+
199
+        let Some(status) = died else {
200
+            return;
201
+        };
202
+        // Drop the dead handle so kill()/spawn() don't try to wait on it
203
+        // again.
204
+        self.child = None;
205
+
206
+        if let Some(until) = self.backoff_until
207
+            && now < until
208
+        {
209
+            tracing::warn!(
210
+                ?status,
211
+                remaining_secs = (until - now).as_secs(),
212
+                "ers exited; respawn suppressed by backoff"
213
+            );
214
+            return;
215
+        }
216
+        self.backoff_until = None;
217
+
218
+        // Sliding-window rate limit.
219
+        let cutoff = now - RESTART_WINDOW;
220
+        self.recent_restarts.retain(|t| *t >= cutoff);
221
+        if self.recent_restarts.len() >= RESTART_BUDGET {
222
+            tracing::error!(
223
+                ?status,
224
+                budget = RESTART_BUDGET,
225
+                window_secs = RESTART_WINDOW.as_secs(),
226
+                "ers crashed too many times; backing off"
227
+            );
228
+            self.backoff_until = Some(now + BACKOFF_DURATION);
229
+            self.recent_restarts.clear();
230
+            return;
231
+        }
232
+
233
+        tracing::warn!(?status, "ers exited unexpectedly; respawning");
234
+        self.recent_restarts.push(now);
144235
         self.spawn();
145236
     }
146237