| 1 | #!/usr/bin/env bash |
| 2 | # SPDX-License-Identifier: AGPL-3.0-or-later |
| 3 | # |
# Hourly health check for WAL archiving. Asserts the chain is end-
# to-end alive: Postgres is running with archive_mode=on, the
# archiver process has reported a recent success, no archive failure
# is newer than that success (a non-zero cumulative failed_count alone
# is tolerated), and a recent segment is actually visible in Spaces.
| 8 | # |
| 9 | # Silent on success; emits to /var/log/shithub/wal-archive.log AND |
| 10 | # `journalctl -t shithub-wal-archive` (warning priority) on any |
| 11 | # failure. Same shape as shithub-aide-check so the operator's |
| 12 | # muscle memory carries over. |
| 13 | # |
| 14 | # Schedule: hourly via cron (installed by ansible/roles/postgres). |
| 15 | |
# Shell options: referencing an unset variable is an error, and a pipeline
# fails if any stage fails. errexit (-e) is not enabled.
set -u
set -o pipefail

# Failure log; make sure its parent directory exists.
LOG=/var/log/shithub/wal-archive.log
mkdir -p "${LOG%/*}"
| 20 | |
# Print the current UTC time as YYYY-MM-DDTHH:MM:SSZ on stdout.
ts() {
  date -u '+%Y-%m-%dT%H:%M:%SZ'
}
| 22 | |
#######################################
# Report a failed check and abort the run.
# Globals:   LOG (read) - file the failure line is appended to
# Arguments: $* - failure message (all arguments joined into one string)
# Outputs:   appends "[<ts>] FAIL: <message>" to $LOG and sends the bare
#            message to the journal with tag shithub-wal-archive at
#            warning priority
# Returns:   does not return; exits the script with status 1
#######################################
fail() {
  local reason="$*"
  printf '[%s] FAIL: %s\n' "$(ts)" "$reason" >> "$LOG"
  systemd-cat -t shithub-wal-archive -p warning <<< "$reason"
  exit 1
}
| 31 | |
# 1. Postgres up + archive_mode=on.
#    stdout and stderr are captured separately: the value compared against
#    "on" must be exactly what psql printed, so a warning on stderr from
#    sudo/psql during an otherwise successful query cannot raise a false
#    alarm. stderr is only used as the diagnostic when psql printed no
#    value at all (Postgres down, sudo refused, ...).
PSQL_ERR_FILE="$(mktemp)" || fail "mktemp failed; cannot capture psql diagnostics"
ARCHIVE_MODE="$(sudo -u postgres psql -tAc 'SHOW archive_mode' 2>"$PSQL_ERR_FILE")"
PSQL_ERR="$(<"$PSQL_ERR_FILE")"
rm -f -- "$PSQL_ERR_FILE"
if [[ "$ARCHIVE_MODE" != "on" ]]; then
  fail "Postgres archive_mode is '${ARCHIVE_MODE:-$PSQL_ERR}', expected 'on'"
fi
| 37 | |
# 2. pg_stat_archiver — a recent success, and no failure newer than it.
#    archive_timeout is 60s; allow 5x slack to absorb a missed kick.
#
#    psql prints one '|'-separated row. It is split with IFS='|' rather
#    than by translating '|' to spaces: whitespace splitting drops an empty
#    leading field, and last_archived_wal IS empty until the first
#    successful archive, which would shift every later column one slot
#    left (failed_count would be read as last_archived_time).
IFS='|' read -r LAST_WAL LAST_TIME FAILED_COUNT LAST_FAILED_TIME < <(
  sudo -u postgres psql -tAF '|' -c "
    SELECT
      COALESCE(last_archived_wal, ''),
      COALESCE(EXTRACT(EPOCH FROM last_archived_time)::bigint::text, '0'),
      failed_count,
      COALESCE(EXTRACT(EPOCH FROM last_failed_time)::bigint::text, '0')
    FROM pg_stat_archiver;
  "
)

# The three numeric columns feed shell arithmetic below. If psql failed
# (empty row) or printed something unexpected, say so explicitly instead
# of tripping an arithmetic syntax error or reporting a misleading cause.
for STAT_FIELD in "$LAST_TIME" "$FAILED_COUNT" "$LAST_FAILED_TIME"; do
  if [[ ! "$STAT_FIELD" =~ ^[0-9]+$ ]]; then
    fail "could not read pg_stat_archiver (got '${LAST_WAL}|${LAST_TIME}|${FAILED_COUNT}|${LAST_FAILED_TIME}')"
  fi
done

# Epoch 0 / empty WAL name are the COALESCE defaults for "no success yet".
if [[ -z "$LAST_WAL" || "$LAST_TIME" == "0" ]]; then
  fail "pg_stat_archiver has never reported a successful archive"
fi

NOW="$(date +%s)"
SECS_SINCE="$((NOW - LAST_TIME))"
if (( SECS_SINCE > 300 )); then
  fail "pg_stat_archiver last_archived_time is ${SECS_SINCE}s ago (>300s); archiver may be wedged"
fi

# failed_count is cumulative since the last pg_stat_reset_shared('archiver')
# — a non-zero count is fine if the failures pre-date the most recent
# success. We only flag when the most recent FAILURE is newer than the
# most recent SUCCESS (genuine ongoing breakage) AND that failure is
# recent enough to still be relevant. (With the 300s success window above,
# a failure newer than the last success is necessarily under 600s old; the
# explicit age check is kept so the two thresholds can be tuned separately.)
if (( FAILED_COUNT > 0 && LAST_FAILED_TIME > LAST_TIME )); then
  SECS_SINCE_FAIL="$((NOW - LAST_FAILED_TIME))"
  if (( SECS_SINCE_FAIL < 600 )); then
    fail "pg_stat_archiver: most recent failure (${SECS_SINCE_FAIL}s ago) is newer than most recent success; archive_command is broken — inspect journalctl -u postgresql@16-main"
  fi
fi
| 71 | |
# 3. The most-recent segment is visible in Spaces. We list today's
#    prefix (the archive script paths under YYYY/MM/DD/) and assert
#    the count is non-zero. A zero count with a recent last_archived_time
#    means rclone reported success but the bucket lost the object —
#    rare but worth flagging.

# Print the number of entries rclone lists under the given date prefix.
# Arguments: $1 - prefix in YYYY/MM/DD form
# NOTE(review): rclone's stderr is discarded and its exit status is not
# inspected, so a listing error and a genuinely empty prefix both print 0.
count_wal_objects() {
  rclone --config /etc/rclone-shithub.conf --s3-no-check-bucket \
    lsf "spaces-prod:shithub-wal/$1/" 2>/dev/null | wc -l
}

TODAY="$(date -u +%Y/%m/%d)"
COUNT="$(count_wal_objects "$TODAY")"
if (( COUNT == 0 )); then
  # Edge case: it's a few minutes after UTC midnight and today's
  # prefix is genuinely empty. Look at yesterday too.
  # GNU date is tried first; the -v-1d form is the BSD fallback.
  YDAY="$(date -u -d 'yesterday' +%Y/%m/%d 2>/dev/null || \
    date -u -v-1d +%Y/%m/%d)"
  YCOUNT="$(count_wal_objects "$YDAY")"
  (( YCOUNT != 0 )) \
    || fail "no WAL segments visible in spaces-prod:shithub-wal/$TODAY or /$YDAY despite recent archive success"
fi
| 91 | |
# Heartbeat: every check passed, so record the UTC time of this clean run
# and finish successfully.
HEARTBEAT_FILE=/var/run/shithub-wal-archive.last-clean
date -u '+%Y-%m-%dT%H:%M:%SZ' > "$HEARTBEAT_FILE"
exit 0