tenseleyflow/shithub / 9e3b7ec

Browse files

ops(pg): hourly WAL-archive health check, journal-tagged on failure

Authored by espadonne
SHA
9e3b7ec6d5b8709fd5fa04706c21c1f9df6874b1
Parents
ef7f7ab
Tree
c8e17b4

1 changed file

StatusFile+-
A deploy/postgres/verify-wal-archive.sh 86 0
deploy/postgres/verify-wal-archive.shadded
@@ -0,0 +1,86 @@
1
+#!/usr/bin/env bash
2
+# SPDX-License-Identifier: AGPL-3.0-or-later
3
+#
4
+# Hourly health check for WAL archiving. Asserts the chain is end-
5
+# to-end alive: Postgres is running with archive_mode=on, the
6
+# archiver process has reported a recent success, failed_count is
7
+# zero, and a recent segment is actually visible in Spaces.
8
+#
9
+# Silent on success; emits to /var/log/shithub/wal-archive.log AND
10
+# `journalctl -t shithub-wal-archive` (warning priority) on any
11
+# failure. Same shape as shithub-aide-check so the operator's
12
+# muscle memory carries over.
13
+#
14
+# Schedule: hourly via cron (installed by ansible/roles/postgres).
15
+
16
# Shell options: referencing an unset variable is an error, and a pipeline
# reports the failure of any of its stages. `-e` is not enabled.
set -o nounset
set -o pipefail

# Failure log; create its parent directory if it is missing.
LOG='/var/log/shithub/wal-archive.log'
mkdir -p "${LOG%/*}"
20
+
21
# Print the current UTC time as an ISO-8601 timestamp (YYYY-MM-DDTHH:MM:SSZ).
ts() {
        date -u '+%Y-%m-%dT%H:%M:%SZ'
}
22
+
23
#######################################
# Report a health-check failure and terminate the script.
# Globals:   LOG (read) - file the failure line is appended to
# Arguments: $* - words of the failure message, joined with spaces
# Outputs:   appends "[<ts>] FAIL: <msg>" to $LOG and sends <msg> to the
#            system log under the tag shithub-wal-archive at warning
#            priority
# Returns:   never returns; exits the script with status 1
#######################################
fail() {
        local msg="$*"
        # printf rather than echo so a message that starts with '-' or
        # contains backslashes is written verbatim.
        printf '[%s] FAIL: %s\n' "$(ts)" "$msg" >> "$LOG"
        if command -v systemd-cat >/dev/null 2>&1; then
                printf '%s\n' "$msg" | systemd-cat -t shithub-wal-archive -p warning
        else
                # No systemd-cat on this host: fall back to logger so the
                # alert still reaches the system log under the same tag.
                logger -t shithub-wal-archive -p user.warning -- "$msg"
        fi
        exit 1
}
31
+
32
# 1. Postgres up + archive_mode=on.
#
# psql's stdout and stderr are captured separately and its exit status is
# checked. Folding stderr into the value would turn any notice psql prints
# while still succeeding (for example a "could not change directory"
# warning when the caller's cwd is unreadable to the postgres user) into a
# false "archive_mode is not on" alarm.
#
# Globals: ARCHIVE_MODE (written) - value reported by SHOW archive_mode.
# Calls fail (which exits 1) when psql fails or the value is not "on".
check_archive_mode() {
        local err_file err rc
        err_file="$(mktemp)" || fail "mktemp failed; cannot check archive_mode"
        # -X: skip any psqlrc so the output format cannot be altered by it.
        ARCHIVE_MODE="$(sudo -u postgres psql -X -tAc 'SHOW archive_mode' 2>"$err_file")"
        rc=$?
        # Flatten stderr to one line so it fits in a single log entry.
        err="$(tr '\n' ' ' <"$err_file")"
        err="${err% }"
        rm -f -- "$err_file"

        if (( rc != 0 )); then
                fail "cannot query Postgres archive_mode (psql exit $rc): ${err:-no stderr output}"
        fi
        if [[ "$ARCHIVE_MODE" != "on" ]]; then
                fail "Postgres archive_mode is '$ARCHIVE_MODE', expected 'on'"
        fi
}

check_archive_mode
37
+
38
# 2. pg_stat_archiver — last_archived_time recent, failed_count==0.
# archive_timeout is 60s; allow 5x slack to absorb a missed kick.
#
# The single result row is split on '|' with IFS='|'. A non-whitespace
# separator keeps empty fields in place, so an empty last_archived_wal
# cannot shift the remaining columns one position to the left.
# The numeric columns are validated before they reach arithmetic.
#
# NOTE(review): failed_count looks cumulative since the last statistics
# reset, so one past failure keeps this check red until the archiver
# stats are reset — presumably intended; confirm.
#
# Globals (written): LAST_WAL, LAST_TIME, FAILED_COUNT, LAST_FAILED_TIME,
#                    NOW, SECS_SINCE.
# Calls fail (which exits 1) on any problem.
check_archiver_stats() {
        local row rc
        row="$(
                sudo -u postgres psql -X -tAF '|' -c "
                        SELECT
                          COALESCE(last_archived_wal, ''),
                          COALESCE(EXTRACT(EPOCH FROM last_archived_time)::bigint::text, '0'),
                          failed_count,
                          COALESCE(EXTRACT(EPOCH FROM last_failed_time)::bigint::text, '0')
                        FROM pg_stat_archiver;
                "
        )"
        rc=$?
        if (( rc != 0 )); then
                fail "cannot query pg_stat_archiver (psql exit $rc)"
        fi

        IFS='|' read -r LAST_WAL LAST_TIME FAILED_COUNT LAST_FAILED_TIME <<<"$row"
        if [[ ! "$LAST_TIME" =~ ^[0-9]+$ || ! "$FAILED_COUNT" =~ ^[0-9]+$ ]]; then
                fail "unexpected pg_stat_archiver output: '$row'"
        fi

        if [[ -z "$LAST_WAL" || "$LAST_TIME" == "0" ]]; then
                fail "pg_stat_archiver has never reported a successful archive"
        fi

        NOW="$(date +%s)"
        SECS_SINCE="$((NOW - LAST_TIME))"
        if (( SECS_SINCE > 300 )); then
                fail "pg_stat_archiver last_archived_time is ${SECS_SINCE}s ago (>300s); archiver may be wedged"
        fi
        if (( FAILED_COUNT > 0 )); then
                fail "pg_stat_archiver.failed_count=$FAILED_COUNT (last_failed_time epoch=$LAST_FAILED_TIME); inspect journalctl -u postgresql@16-main"
        fi
}

check_archiver_stats
63
+
64
# 3. The most-recent segment is visible in Spaces. We list today's
# prefix (the archive script paths under YYYY/MM/DD/) and assert
# the count is non-zero. A zero count with a recent last_archived_time
# means rclone reported success but the bucket lost the object —
# rare but worth flagging.
#
# A listing that could not be performed at all (network, credentials,
# missing rclone) is reported as its own failure instead of being counted
# as "zero segments".

# Print the number of entries under spaces-prod:shithub-wal/<prefix>/.
# Arguments: $1 - date prefix, YYYY/MM/DD
# Returns:   0 with the count on stdout; rclone's exit status (nothing on
#            stdout) when the listing itself failed.
count_wal_segments() {
        local prefix="$1" listing rc
        # stderr stays suppressed so the script remains silent; the exit
        # status carries the error instead.
        listing="$(rclone --config /root/.config/rclone/rclone.conf --s3-no-check-bucket \
                lsf "spaces-prod:shithub-wal/$prefix/" 2>/dev/null)"
        rc=$?
        # rclone documents exit code 3 as "directory not found"; treated
        # here as an empty prefix rather than an error.
        if (( rc != 0 && rc != 3 )); then
                return "$rc"
        fi
        if [[ -z "$listing" ]]; then
                printf '0\n'
        else
                # $(( )) strips the padding some wc implementations emit.
                printf '%s\n' "$(( $(printf '%s\n' "$listing" | wc -l) ))"
        fi
}

# Globals (written): TODAY, COUNT, and, when today's prefix is empty,
#                    YDAY and YCOUNT.
# Calls fail (which exits 1) when listing fails or both prefixes are empty.
check_spaces_segments() {
        local rc
        TODAY="$(date -u +%Y/%m/%d)"
        COUNT="$(count_wal_segments "$TODAY")"
        rc=$?
        if (( rc != 0 )); then
                fail "cannot list spaces-prod:shithub-wal/$TODAY/ (rclone exit $rc)"
        fi
        if (( COUNT == 0 )); then
                # Edge case: it's a few minutes after UTC midnight and today's
                # prefix is genuinely empty. Look at yesterday too.
                # GNU date first, BSD date syntax as the fallback.
                YDAY="$(date -u -d 'yesterday' +%Y/%m/%d 2>/dev/null || \
                        date -u -v-1d +%Y/%m/%d)"
                YCOUNT="$(count_wal_segments "$YDAY")"
                rc=$?
                if (( rc != 0 )); then
                        fail "cannot list spaces-prod:shithub-wal/$YDAY/ (rclone exit $rc)"
                fi
                if (( YCOUNT == 0 )); then
                        fail "no WAL segments visible in spaces-prod:shithub-wal/$TODAY or /$YDAY despite recent archive success"
                fi
        fi
}

check_spaces_segments
83
+
84
# Heartbeat: reached only when no check above called fail, so the file
# holds the UTC time of the last fully clean run. The timestamp comes
# from ts so it has the same format as the log lines. A heartbeat that
# cannot be written is reported instead of being silently dropped.
ts > /var/run/shithub-wal-archive.last-clean \
        || fail "could not write heartbeat /var/run/shithub-wal-archive.last-clean"
exit 0