tenseleyflow/shithub / b3ed4a0

Browse files

verify-wal-archive: only flag failures newer than the most recent success

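failed_count in pg_stat_archiver is cumulative since the last
pg_stat_reset_shared('archiver'), so a non-zero count is fine if the
failures pre-date the most recent success (e.g., after fixing a
misconfigured archive_command). Only the case where
last_failed_time > last_archived_time AND that failure is recent
(< 10 min) is genuine ongoing breakage, so only that case is flagged.

This change also switches the rclone config path used for the Spaces
listing from /root/.config/rclone/rclone.conf to /etc/rclone-shithub.conf.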
Authored by espadonne
SHA
b3ed4a017085a24067bfa8dd26f06cf4e51cad38
Parents
211ece3
Tree
b2686b1

1 changed file

Status | File | + | -
M deploy/postgres/verify-wal-archive.sh 12 4
deploy/postgres/verify-wal-archive.sh (modified)
@@ -57,8 +57,16 @@ fi
5757
 if (( SECS_SINCE > 300 )); then
5858
         fail "pg_stat_archiver last_archived_time is ${SECS_SINCE}s ago (>300s); archiver may be wedged"
5959
 fi
60
-if (( FAILED_COUNT > 0 )); then
61
-        fail "pg_stat_archiver.failed_count=$FAILED_COUNT (last_failed_time epoch=$LAST_FAILED_TIME); inspect journalctl -u postgresql@16-main"
60
+# failed_count is cumulative since the last pg_stat_reset_shared('archiver')
61
+# — a non-zero count is fine if the failures pre-date the most recent
62
+# success. We only flag when the most recent FAILURE is newer than the
63
+# most recent SUCCESS (genuine ongoing breakage) AND that failure is
64
+# recent enough to still be relevant.
65
+if (( FAILED_COUNT > 0 && LAST_FAILED_TIME > LAST_TIME )); then
66
+        SECS_SINCE_FAIL="$((NOW - LAST_FAILED_TIME))"
67
+        if (( SECS_SINCE_FAIL < 600 )); then
68
+                fail "pg_stat_archiver: most recent failure (${SECS_SINCE_FAIL}s ago) is newer than most recent success; archive_command is broken — inspect journalctl -u postgresql@16-main"
69
+        fi
6270
 fi
6371
 
6472
 # 3. The most-recent segment is visible in Spaces. We list today's
@@ -67,14 +75,14 @@ fi
6775
 # means rclone reported success but the bucket lost the object —
6876
 # rare but worth flagging.
6977
 TODAY="$(date -u +%Y/%m/%d)"
70
-COUNT="$(rclone --config /root/.config/rclone/rclone.conf --s3-no-check-bucket \
78
+COUNT="$(rclone --config /etc/rclone-shithub.conf --s3-no-check-bucket \
7179
         lsf "spaces-prod:shithub-wal/$TODAY/" 2>/dev/null | wc -l)"
7280
 if (( COUNT == 0 )); then
7381
         # Edge case: it's a few minutes after UTC midnight and today's
7482
         # prefix is genuinely empty. Look at yesterday too.
7583
         YDAY="$(date -u -d 'yesterday' +%Y/%m/%d 2>/dev/null || \
7684
                 date -u -v-1d +%Y/%m/%d)"
77
-        YCOUNT="$(rclone --config /root/.config/rclone/rclone.conf --s3-no-check-bucket \
85
+        YCOUNT="$(rclone --config /etc/rclone-shithub.conf --s3-no-check-bucket \
7886
                 lsf "spaces-prod:shithub-wal/$YDAY/" 2>/dev/null | wc -l)"
7987
         if (( YCOUNT == 0 )); then
8088
                 fail "no WAL segments visible in spaces-prod:shithub-wal/$TODAY or /$YDAY despite recent archive success"