tenseleyflow/shithub / b3ed4a0

Browse files

verify-wal-archive: only flag failures newer than the most recent success

failed_count in pg_stat_archiver is cumulative — a non-zero count
is fine if the failures pre-date the most recent success (e.g.,
after fixing a misconfigured archive_command). Only the case
where last_failed_time > last_archived_time AND that failure is
recent (< 10 min) is genuine ongoing breakage.
Authored by espadonne
SHA
b3ed4a017085a24067bfa8dd26f06cf4e51cad38
Parents
211ece3
Tree
b2686b1

1 changed file

Status File + -
M deploy/postgres/verify-wal-archive.sh 12 4
deploy/postgres/verify-wal-archive.sh modified
@@ -57,8 +57,16 @@ fi
57
 if (( SECS_SINCE > 300 )); then
57
 if (( SECS_SINCE > 300 )); then
58
         fail "pg_stat_archiver last_archived_time is ${SECS_SINCE}s ago (>300s); archiver may be wedged"
58
         fail "pg_stat_archiver last_archived_time is ${SECS_SINCE}s ago (>300s); archiver may be wedged"
59
 fi
59
 fi
60
-if (( FAILED_COUNT > 0 )); then
60
+# failed_count is cumulative since the last pg_stat_reset_shared('archiver')
61
-        fail "pg_stat_archiver.failed_count=$FAILED_COUNT (last_failed_time epoch=$LAST_FAILED_TIME); inspect journalctl -u postgresql@16-main"
61
+# — a non-zero count is fine if the failures pre-date the most recent
62
+# success. We only flag when the most recent FAILURE is newer than the
63
+# most recent SUCCESS (genuine ongoing breakage) AND that failure is
64
+# recent enough to still be relevant.
65
+if (( FAILED_COUNT > 0 && LAST_FAILED_TIME > LAST_TIME )); then
66
+        SECS_SINCE_FAIL="$((NOW - LAST_FAILED_TIME))"
67
+        if (( SECS_SINCE_FAIL < 600 )); then
68
+                fail "pg_stat_archiver: most recent failure (${SECS_SINCE_FAIL}s ago) is newer than most recent success; archive_command is broken — inspect journalctl -u postgresql@16-main"
69
+        fi
62
 fi
70
 fi
63
 
71
 
64
 # 3. The most-recent segment is visible in Spaces. We list today's
72
 # 3. The most-recent segment is visible in Spaces. We list today's
@@ -67,14 +75,14 @@ fi
67
 # means rclone reported success but the bucket lost the object —
75
 # means rclone reported success but the bucket lost the object —
68
 # rare but worth flagging.
76
 # rare but worth flagging.
69
 TODAY="$(date -u +%Y/%m/%d)"
77
 TODAY="$(date -u +%Y/%m/%d)"
70
-COUNT="$(rclone --config /root/.config/rclone/rclone.conf --s3-no-check-bucket \
78
+COUNT="$(rclone --config /etc/rclone-shithub.conf --s3-no-check-bucket \
71
         lsf "spaces-prod:shithub-wal/$TODAY/" 2>/dev/null | wc -l)"
79
         lsf "spaces-prod:shithub-wal/$TODAY/" 2>/dev/null | wc -l)"
72
 if (( COUNT == 0 )); then
80
 if (( COUNT == 0 )); then
73
         # Edge case: it's a few minutes after UTC midnight and today's
81
         # Edge case: it's a few minutes after UTC midnight and today's
74
         # prefix is genuinely empty. Look at yesterday too.
82
         # prefix is genuinely empty. Look at yesterday too.
75
         YDAY="$(date -u -d 'yesterday' +%Y/%m/%d 2>/dev/null || \
83
         YDAY="$(date -u -d 'yesterday' +%Y/%m/%d 2>/dev/null || \
76
                 date -u -v-1d +%Y/%m/%d)"
84
                 date -u -v-1d +%Y/%m/%d)"
77
-        YCOUNT="$(rclone --config /root/.config/rclone/rclone.conf --s3-no-check-bucket \
85
+        YCOUNT="$(rclone --config /etc/rclone-shithub.conf --s3-no-check-bucket \
78
                 lsf "spaces-prod:shithub-wal/$YDAY/" 2>/dev/null | wc -l)"
86
                 lsf "spaces-prod:shithub-wal/$YDAY/" 2>/dev/null | wc -l)"
79
         if (( YCOUNT == 0 )); then
87
         if (( YCOUNT == 0 )); then
80
                 fail "no WAL segments visible in spaces-prod:shithub-wal/$TODAY or /$YDAY despite recent archive success"
88
                 fail "no WAL segments visible in spaces-prod:shithub-wal/$TODAY or /$YDAY despite recent archive success"