gardesk/garwarp / 9e7714d

Browse files

fallback on store load failure

Authored by mfwolffe <wolffemf@dukes.jmu.edu>
SHA
9e7714d8ef4d085fbb8b610bd4075b69fd2be7ae
Parents
1881604
Tree
48c741f

1 changed file

Status | File | + | -
M garwarp/src/daemon.rs 88 5
garwarp/src/daemon.rs (modified)
@@ -1,8 +1,9 @@
1
 use std::fs;
1
 use std::fs;
2
 use std::io::{self, BufRead, BufReader, Read, Write};
2
 use std::io::{self, BufRead, BufReader, Read, Write};
3
 use std::os::unix::net::{UnixListener, UnixStream};
3
 use std::os::unix::net::{UnixListener, UnixStream};
4
+use std::path::Path;
4
 use std::thread;
5
 use std::thread;
5
-use std::time::{Duration, Instant};
6
+use std::time::{Duration, Instant, SystemTime, UNIX_EPOCH};
6
 
7
 
7
 use garwarp_ipc::{
8
 use garwarp_ipc::{
8
     ControlRequest, ControlResponse, HealthStatus, RequestTransitionTarget, StatusResponse,
9
     ControlRequest, ControlResponse, HealthStatus, RequestTransitionTarget, StatusResponse,
@@ -35,8 +36,8 @@ pub fn run() -> io::Result<()> {
35
 
36
 
36
     logging::info("daemon_starting");
37
     logging::info("daemon_starting");
37
 
38
 
38
-    let (requests, recovered_ids) =
39
+    let (requests, recovered_ids, startup_degraded) =
39
-        load_registry_with_recovery(&paths.request_store, config.request_timeout)?;
40
+        load_registry_with_fallback(&paths.request_store, config.request_timeout);
40
     if !recovered_ids.is_empty() {
41
     if !recovered_ids.is_empty() {
41
         logging::warn(&format!(
42
         logging::warn(&format!(
42
             "request_recovery_expired count={}",
43
             "request_recovery_expired count={}",
@@ -45,7 +46,11 @@ pub fn run() -> io::Result<()> {
45
     }
46
     }
46
 
47
 
47
     let mut state = DaemonState {
48
     let mut state = DaemonState {
48
-        health: HealthStatus::Healthy,
49
+        health: if startup_degraded {
50
+            HealthStatus::Degraded
51
+        } else {
52
+            HealthStatus::Healthy
53
+        },
49
         requests,
54
         requests,
50
         running: true,
55
         running: true,
51
     };
56
     };
@@ -337,6 +342,48 @@ fn load_registry_with_recovery(
337
     Ok((registry, expired))
342
     Ok((registry, expired))
338
 }
343
 }
339
 
344
 
345
+fn load_registry_with_fallback(
346
+    request_store_path: &Path,
347
+    timeout: Duration,
348
+) -> (RequestRegistry, Vec<String>, bool) {
349
+    match load_registry_with_recovery(request_store_path, timeout) {
350
+        Ok((registry, recovered_ids)) => (registry, recovered_ids, false),
351
+        Err(error) => {
352
+            logging::warn(&format!("request_store_load_failed error={error}"));
353
+            match quarantine_request_store(request_store_path) {
354
+                Ok(Some(path)) => logging::warn(&format!(
355
+                    "request_store_quarantined path={}",
356
+                    path.display()
357
+                )),
358
+                Ok(None) => {}
359
+                Err(error) => {
360
+                    logging::warn(&format!("request_store_quarantine_failed error={error}"))
361
+                }
362
+            }
363
+            (RequestRegistry::new(timeout), Vec::new(), true)
364
+        }
365
+    }
366
+}
367
+
/// Moves the request store at `path` aside so it no longer blocks startup.
///
/// The file is renamed within its own directory to
/// `<file name>.corrupt-<nanoseconds since the Unix epoch>`. If `path` has no
/// final component, `requests.state` is used as the base name; if it has no
/// parent, the current directory (`.`) is used.
///
/// # Returns
///
/// * `Ok(Some(new_path))` - the store was renamed to `new_path`.
/// * `Ok(None)` - the rename reported `NotFound`, i.e. there was nothing to move.
///
/// # Errors
///
/// Returns any rename error other than `NotFound` unchanged.
fn quarantine_request_store(path: &Path) -> io::Result<Option<std::path::PathBuf>> {
    let parent = path.parent().unwrap_or_else(|| Path::new("."));

    // Build the new name as an `OsString` so that a non-UTF-8 file name is kept
    // verbatim instead of being replaced by the generic fallback name.
    let mut quarantined_name = path.file_name().map_or_else(
        || std::ffi::OsString::from("requests.state"),
        |name| name.to_os_string(),
    );

    // A clock set before the epoch yields a `0` suffix rather than an error.
    let nanos = SystemTime::now()
        .duration_since(UNIX_EPOCH)
        .map_or(0, |elapsed| elapsed.as_nanos());
    quarantined_name.push(format!(".corrupt-{nanos}"));

    let quarantined = parent.join(quarantined_name);

    // Rename directly rather than probing with `exists()` first: the probe races
    // with other processes and reports `false` for any metadata error, which
    // would silently skip the quarantine.
    match fs::rename(path, &quarantined) {
        Ok(()) => Ok(Some(quarantined)),
        Err(error) if error.kind() == io::ErrorKind::NotFound => Ok(None),
        Err(error) => Err(error),
    }
}
386
+
340
 fn persist_registry_state(path: &std::path::Path, registry: &RequestRegistry) {
387
 fn persist_registry_state(path: &std::path::Path, registry: &RequestRegistry) {
341
     if let Err(error) = request_store::persist_registry(path, registry) {
388
     if let Err(error) = request_store::persist_registry(path, registry) {
342
         logging::warn(&format!("request_store_write_failed error={error}"));
389
         logging::warn(&format!("request_store_write_failed error={error}"));
@@ -346,7 +393,8 @@ fn persist_registry_state(path: &std::path::Path, registry: &RequestRegistry) {
346
 #[cfg(test)]
393
 #[cfg(test)]
347
 mod tests {
394
 mod tests {
348
     use super::{
395
     use super::{
349
-        DaemonState, MAX_CONTROL_LINE_BYTES, handle_connection, load_registry_with_recovery,
396
+        DaemonState, MAX_CONTROL_LINE_BYTES, handle_connection, load_registry_with_fallback,
397
+        load_registry_with_recovery,
350
     };
398
     };
351
     use garwarp_ipc::{ControlResponse, HealthStatus};
399
     use garwarp_ipc::{ControlResponse, HealthStatus};
352
     use std::fs;
400
     use std::fs;
@@ -1100,4 +1148,39 @@ mod tests {
1100
 
1148
 
1101
         let _ = fs::remove_file(path);
1149
         let _ = fs::remove_file(path);
1102
     }
1150
     }
1151
+
/// Writes a store with an unparseable `state=bogus` record, loads it through
/// `load_registry_with_fallback`, and checks that the result is an empty,
/// degraded registry and that the original file has been replaced by a sibling
/// named `<file name>.corrupt-*`.
#[test]
fn invalid_store_load_uses_empty_registry_and_quarantines_file() {
    let path = unique_temp_file();
    fs::write(&path, "id=req-1\tsender=:1.2\tstate=bogus\n")
        .expect("invalid store should be written");

    let parent = path
        .parent()
        .expect("temp file should have parent")
        .to_path_buf();
    let file_name = path
        .file_name()
        .expect("temp file should have name")
        .to_string_lossy()
        .to_string();
    // Every quarantined copy of this store starts with this prefix.
    let quarantine_prefix = format!("{file_name}.corrupt-");

    let (registry, recovered_ids, degraded) =
        load_registry_with_fallback(&path, Duration::from_secs(5));
    assert!(degraded);
    assert!(recovered_ids.is_empty());
    assert_eq!(registry.total_count(), 0);
    assert!(!path.exists());

    // Scan the directory for the first entry carrying the quarantine prefix;
    // unreadable entries are skipped.
    let mut quarantined = None;
    let entries = fs::read_dir(&parent).expect("parent dir should be readable");
    for entry in entries.filter_map(Result::ok) {
        let name = entry.file_name().to_string_lossy().to_string();
        if name.starts_with(&quarantine_prefix) {
            quarantined = Some(entry.path());
            break;
        }
    }
    let quarantined = quarantined.expect("quarantined store should exist");

    // Clean up so repeated runs do not accumulate quarantined files.
    fs::remove_file(quarantined).expect("quarantined store should be removed");
}
1103
 }
1186
 }