Bash · 3914 bytes Raw Blame History
1 #!/usr/bin/env bash
2 # SPDX-License-Identifier: AGPL-3.0-or-later
3 #
# Hourly health check for WAL archiving. Asserts the chain is alive
# end to end: Postgres is running with archive_mode=on, the archiver
# process has reported a recent success, no archive failure is newer
# than that success, and a recent segment is actually visible in Spaces.
8 #
9 # Silent on success; emits to /var/log/shithub/wal-archive.log AND
10 # `journalctl -t shithub-wal-archive` (warning priority) on any
11 # failure. Same shape as shithub-aide-check so the operator's
12 # muscle memory carries over.
13 #
14 # Schedule: hourly via cron (installed by ansible/roles/postgres).
15
# Shell options: referencing an unset variable is an error, and a
# pipeline fails if any stage fails. Note that -e is not enabled; the
# checks below report problems by calling fail explicitly.
set -u -o pipefail

# Failure log. Its parent directory is (re)created on every run.
LOG=/var/log/shithub/wal-archive.log
mkdir -p "${LOG%/*}"
20
# Print the current UTC time as YYYY-MM-DDTHH:MM:SSZ.
# %F and %T are the date(1) shorthands for %Y-%m-%d and %H:%M:%S.
ts() {
  date -u '+%FT%TZ'
}
22
# Report a failed check and abort the script.
# Arguments: all arguments are joined with spaces into one message.
# Globals:   LOG (read) - file the timestamped FAIL line is appended to.
# Outputs:   one line appended to $LOG; the bare message is sent to the
#            journal under tag shithub-wal-archive at warning priority.
# Returns:   never returns; exits the script with status 1.
fail() {
  local reason
  reason="$*"
  printf '[%s] FAIL: %s\n' "$(ts)" "$reason" >> "$LOG"
  printf '%s\n' "$reason" \
    | systemd-cat -t shithub-wal-archive -p warning
  exit 1
}
31
# 1. Postgres must answer and report archive_mode=on.
#    psql's stderr is folded into the captured value, so when the query
#    cannot run, the error text is what shows up in the failure message.
ARCHIVE_MODE="$(sudo -u postgres psql -tAc 'SHOW archive_mode' 2>&1)"
[[ "$ARCHIVE_MODE" == "on" ]] \
  || fail "Postgres archive_mode is '$ARCHIVE_MODE', expected 'on'"
37
# 2. pg_stat_archiver — the last success must be recent, and must not be
#    followed by a newer (still recent) failure.
#    archive_timeout is 60s; allow 5x slack to absorb a missed kick.

# Split one '|'-separated pg_stat_archiver row into the globals LAST_WAL,
# LAST_TIME, FAILED_COUNT and LAST_FAILED_TIME.
# Arguments: $1 - the row as printed by `psql -tAF '|'`.
# The split is done on '|' itself rather than by translating '|' to
# spaces: whitespace splitting drops an empty leading field
# (last_archived_wal is '' until the first successful archive) and would
# shift every later column one position to the left.
parse_archiver_row() {
  IFS='|' read -r LAST_WAL LAST_TIME FAILED_COUNT LAST_FAILED_TIME <<< "${1-}"
}

ARCHIVER_ROW="$(
  sudo -u postgres psql -tAF '|' -c "
    SELECT
      COALESCE(last_archived_wal, ''),
      COALESCE(EXTRACT(EPOCH FROM last_archived_time)::bigint::text, '0'),
      failed_count,
      COALESCE(EXTRACT(EPOCH FROM last_failed_time)::bigint::text, '0')
    FROM pg_stat_archiver;
  "
)"
parse_archiver_row "$ARCHIVER_ROW"

# A failed or empty query must not be mistaken for "never archived", and
# nothing but plain digits may reach the arithmetic below.
if [[ ! "$LAST_TIME" =~ ^[0-9]+$ \
   || ! "$FAILED_COUNT" =~ ^[0-9]+$ \
   || ! "$LAST_FAILED_TIME" =~ ^[0-9]+$ ]]; then
  fail "could not read pg_stat_archiver (psql returned '$ARCHIVER_ROW')"
fi

if [[ -z "$LAST_WAL" || "$LAST_TIME" == "0" ]]; then
  fail "pg_stat_archiver has never reported a successful archive"
fi

NOW="$(date +%s)"
SECS_SINCE="$((NOW - LAST_TIME))"
if (( SECS_SINCE > 300 )); then
  fail "pg_stat_archiver last_archived_time is ${SECS_SINCE}s ago (>300s); archiver may be wedged"
fi

# failed_count is cumulative since the last pg_stat_reset_shared('archiver')
# — a non-zero count is fine if the failures pre-date the most recent
# success. We only flag when the most recent FAILURE is newer than the
# most recent SUCCESS (genuine ongoing breakage) AND that failure is
# recent enough to still be relevant.
if (( FAILED_COUNT > 0 && LAST_FAILED_TIME > LAST_TIME )); then
  SECS_SINCE_FAIL="$((NOW - LAST_FAILED_TIME))"
  if (( SECS_SINCE_FAIL < 600 )); then
    fail "pg_stat_archiver: most recent failure (${SECS_SINCE_FAIL}s ago) is newer than most recent success; archive_command is broken — inspect journalctl -u postgresql@16-main"
  fi
fi
71
# 3. At least one segment must be visible in Spaces. The archive script
#    writes under YYYY/MM/DD/, so today's prefix is listed and must be
#    non-empty. An empty listing alongside a recent last_archived_time
#    means rclone reported success but the bucket lost the object —
#    rare but worth flagging.

# Print how many entries rclone lists under the given date prefix.
# Arguments: $1 - date prefix in YYYY/MM/DD form.
# rclone's stderr is discarded, so a listing that errors out also
# counts as zero entries.
count_wal_objects() {
  rclone --config /etc/rclone-shithub.conf --s3-no-check-bucket \
    lsf "spaces-prod:shithub-wal/$1/" 2>/dev/null | wc -l
}

TODAY="$(date -u +%Y/%m/%d)"
COUNT="$(count_wal_objects "$TODAY")"
if (( COUNT == 0 )); then
  # Shortly after UTC midnight today's prefix can be legitimately empty,
  # so yesterday's prefix is consulted before giving up. GNU date syntax
  # is tried first, then the BSD form.
  YDAY="$(date -u -d 'yesterday' +%Y/%m/%d 2>/dev/null || \
    date -u -v-1d +%Y/%m/%d)"
  YCOUNT="$(count_wal_objects "$YDAY")"
  if (( YCOUNT == 0 )); then
    fail "no WAL segments visible in spaces-prod:shithub-wal/$TODAY or /$YDAY despite recent archive success"
  fi
fi
91
# Heartbeat: every check passed, so record the current UTC time in the
# last-clean marker file and exit successfully.
HEARTBEAT_FILE=/var/run/shithub-wal-archive.last-clean
date -u '+%FT%TZ' > "$HEARTBEAT_FILE"
exit 0