@@ -0,0 +1,86 @@ |
| 1 | +#!/usr/bin/env bash |
| 2 | +# SPDX-License-Identifier: AGPL-3.0-or-later |
| 3 | +# |
| 4 | +# Hourly health check for WAL archiving. Asserts the chain is end- |
| 5 | +# to-end alive: Postgres is running with archive_mode=on, the |
| 6 | +# archiver process has reported a recent success, failed_count is |
| 7 | +# zero, and a recent segment is actually visible in Spaces. |
| 8 | +# |
| 9 | +# Silent on success; emits to /var/log/shithub/wal-archive.log AND |
| 10 | +# `journalctl -t shithub-wal-archive` (warning priority) on any |
| 11 | +# failure. Same shape as shithub-aide-check so the operator's |
| 12 | +# muscle memory carries over. |
| 13 | +# |
| 14 | +# Schedule: hourly via cron (installed by ansible/roles/postgres). |
| 15 | + |
# Shell options: unset variables are errors and a pipeline reports the
# failure of any stage. errexit (-e) is not enabled in this script.
set -u
set -o pipefail

# Failure log; make sure its parent directory exists before anything
# tries to append to it.
LOG=/var/log/shithub/wal-archive.log
mkdir -p "${LOG%/*}"
| 20 | + |
# Print the current UTC time as YYYY-MM-DDTHH:MM:SSZ on stdout.
# (%F and %T are the strftime shorthands for %Y-%m-%d and %H:%M:%S.)
ts() {
  date -u '+%FT%TZ'
}
| 22 | + |
#######################################
# Report a failed health check and terminate the script.
# Globals:   LOG (read) - file the timestamped FAIL line is appended to
# Arguments: all arguments are joined with spaces into one message
# Outputs:   appends "[<ts>] FAIL: <message>" to $LOG and sends the
#            bare message to the journal (tag shithub-wal-archive,
#            priority warning) via systemd-cat
# Returns:   never returns; exits the script with status 1
#######################################
fail() {
  local reason="$*"
  printf '[%s] FAIL: %s\n' "$(ts)" "$reason" >> "$LOG"
  systemd-cat -t shithub-wal-archive -p warning <<< "$reason"
  exit 1
}
| 31 | + |
# 1. Postgres up + archive_mode=on.
#    Only psql's stdout is compared against 'on'. stderr is kept out of
#    the value so that a harmless warning (for example psql's "could not
#    change directory" notice when the cwd is unreadable to the postgres
#    user) cannot turn a healthy 'on' into a false failure.
if ! ARCHIVE_MODE="$(sudo -u postgres psql -tAc 'SHOW archive_mode' 2>/dev/null)"; then
  # The query itself failed (server down, sudo/psql problem). Re-run once
  # purely to collect the diagnostic text; stdout is discarded.
  PSQL_ERR="$(sudo -u postgres psql -tAc 'SHOW archive_mode' 2>&1 >/dev/null)"
  fail "Postgres archive_mode query failed: ${PSQL_ERR:-psql exited non-zero}"
fi
if [[ "$ARCHIVE_MODE" != "on" ]]; then
  fail "Postgres archive_mode is '$ARCHIVE_MODE', expected 'on'"
fi
| 37 | + |
# 2. pg_stat_archiver — last_archived_time recent, failed_count==0.
#    archive_timeout is 60s; allow 5x slack to absorb a missed kick.
#
#    The row comes back as one '|'-separated line:
#      last_archived_wal|last_archived_epoch|failed_count|last_failed_epoch
#    NULL timestamps are mapped to '0' and a NULL WAL name to '' by the
#    COALESCEs below.
if ! ARCHIVER_ROW="$(
  sudo -u postgres psql -tAF '|' -c "
    SELECT
      COALESCE(last_archived_wal, ''),
      COALESCE(EXTRACT(EPOCH FROM last_archived_time)::bigint::text, '0'),
      failed_count,
      COALESCE(EXTRACT(EPOCH FROM last_failed_time)::bigint::text, '0')
    FROM pg_stat_archiver;
  "
)"; then
  fail "could not query pg_stat_archiver (psql exited non-zero)"
fi

# Split on '|' itself. With a non-whitespace IFS, an empty leading field
# (no WAL archived yet) is preserved instead of being stripped, so the
# remaining columns cannot shift into the wrong variables.
IFS='|' read -r LAST_WAL LAST_TIME FAILED_COUNT LAST_FAILED_TIME <<< "$ARCHIVER_ROW"

# Refuse to do arithmetic on anything that is not a plain integer
# (empty output, an error string, an unexpected row shape).
if [[ ! "$LAST_TIME" =~ ^[0-9]+$ || ! "$FAILED_COUNT" =~ ^[0-9]+$ ]]; then
  fail "unexpected pg_stat_archiver output: '$ARCHIVER_ROW'"
fi

if [[ -z "$LAST_WAL" || "$LAST_TIME" == "0" ]]; then
  fail "pg_stat_archiver has never reported a successful archive"
fi

NOW="$(date +%s)"
SECS_SINCE="$((NOW - LAST_TIME))"
if (( SECS_SINCE > 300 )); then
  fail "pg_stat_archiver last_archived_time is ${SECS_SINCE}s ago (>300s); archiver may be wedged"
fi

# NOTE(review): failed_count is a running counter in pg_stat_archiver, so
# presumably one past failure keeps this check red until the archiver
# stats are reset — confirm that is the intended operator workflow.
if (( FAILED_COUNT > 0 )); then
  fail "pg_stat_archiver.failed_count=$FAILED_COUNT (last_failed_time epoch=$LAST_FAILED_TIME); inspect journalctl -u postgresql@16-main"
fi
| 63 | + |
# 3. At least one segment is visible in Spaces under a recent prefix.
#    We list today's prefix (the archive script paths under YYYY/MM/DD/)
#    and assert the count is non-zero. A zero count with a recent
#    last_archived_time means rclone reported success but the bucket
#    lost the object — rare but worth flagging. This does not look for
#    one specific segment name, only for a non-empty prefix.

#######################################
# Count the objects rclone lists under one date prefix of the WAL bucket.
# Arguments: $1 - date prefix in YYYY/MM/DD form
# Outputs:   the number of listed entries on stdout (0 for an empty or
#            missing prefix)
# Returns:   0 when the listing worked; otherwise rclone's exit status,
#            so callers can tell "could not list" from "nothing there"
#######################################
wal_prefix_count() {
  local prefix="$1" listing rc=0
  listing="$(rclone --config /root/.config/rclone/rclone.conf --s3-no-check-bucket \
    lsf "spaces-prod:shithub-wal/$prefix/" 2>/dev/null)" || rc=$?
  # rclone exit status 3 is "directory not found": an empty prefix,
  # not a listing failure.
  if (( rc != 0 && rc != 3 )); then
    return "$rc"
  fi
  if [[ -z "$listing" ]]; then
    printf '0\n'
  else
    wc -l <<< "$listing"
  fi
}

TODAY="$(date -u +%Y/%m/%d)"
COUNT="$(wal_prefix_count "$TODAY")" \
  || fail "rclone lsf failed for spaces-prod:shithub-wal/$TODAY/ (exit $?)"
if (( COUNT == 0 )); then
  # Edge case: it's a few minutes after UTC midnight and today's
  # prefix is genuinely empty. Look at yesterday too.
  # (GNU date syntax first, BSD date syntax as the fallback.)
  YDAY="$(date -u -d 'yesterday' +%Y/%m/%d 2>/dev/null || \
    date -u -v-1d +%Y/%m/%d)"
  YCOUNT="$(wal_prefix_count "$YDAY")" \
    || fail "rclone lsf failed for spaces-prod:shithub-wal/$YDAY/ (exit $?)"
  if (( YCOUNT == 0 )); then
    fail "no WAL segments visible in spaces-prod:shithub-wal/$TODAY or /$YDAY despite recent archive success"
  fi
fi
| 83 | + |
# Heartbeat: overwrite the marker file with the current UTC timestamp.
# This point is reached only when no check above called fail, since
# fail exits the script.
HEARTBEAT=/var/run/shithub-wal-archive.last-clean
printf '%s\n' "$(date -u +%Y-%m-%dT%H:%M:%SZ)" > "$HEARTBEAT"
exit 0