#!/usr/bin/env bash
# SPDX-License-Identifier: AGPL-3.0-or-later
#
# Restore drill — exercises the recovery path end-to-end so that
# we know our backups actually restore. Run quarterly (the calendar
# entry is in runbooks/backups.md). The script:
#
#   1. Pulls the latest daily dump from Spaces (or uses an explicit
#      --dump path).
#   2. Spins up an empty Postgres in a temp data directory.
#   3. pg_restores into the temp instance.
#   4. Runs smoke-queries.sql to confirm row counts and integrity.
#   5. Tears the temp instance down (pass --keep to preserve the
#      work directory for inspection).
#
# Environment:
#   SHITHUB_BACKUP_BUCKET   rclone remote:bucket that holds the dumps
#                           (default: spaces-prod:shithub-backups)
#   SHITHUB_RESTORE_PGPORT  port of the temp instance (default: 55432)
#
# Exits non-zero on any failure. Output is appended to
# /var/log/shithub/restore-drill.log so the on-call can review.

set -euo pipefail

# Usage: <script> [--dump PATH] [--keep]
#   --dump PATH  restore this dump instead of fetching the latest one
#   --keep       keep the work directory after the run
DUMP=""
KEEP=0

#######################################
# Parse the command line into DUMP and KEEP.
# Globals:   DUMP (written), KEEP (written)
# Arguments: the script's arguments ("$@")
# Outputs:   usage errors on stderr
# Returns:   0 on success; exits 2 on an unknown argument or when
#            --dump is given without a value
#######################################
parse_args() {
  while [[ $# -gt 0 ]]; do
    case "$1" in
      --dump)
        # Without this guard `set -u` aborts on "$2" with a cryptic
        # "unbound variable" message and status 1 instead of a usage
        # error with status 2.
        if [[ $# -lt 2 ]]; then
          echo "--dump requires a path argument" >&2
          exit 2
        fi
        DUMP="$2"
        shift 2
        ;;
      --keep)
        KEEP=1
        shift
        ;;
      *)
        echo "unknown arg: $1" >&2
        exit 2
        ;;
    esac
  done
}
parse_args "$@"

# Configuration. BUCKET and PGPORT can be overridden from the environment.
BUCKET="${SHITHUB_BACKUP_BUCKET:-spaces-prod:shithub-backups}"
PGPORT="${SHITHUB_RESTORE_PGPORT:-55432}"
LOG="/var/log/shithub/restore-drill.log"

# PGPORT is written verbatim into postgresql.conf, so reject anything
# that is not a plain number up front instead of failing at server start.
if [[ ! "$PGPORT" =~ ^[0-9]+$ ]]; then
  echo "invalid SHITHUB_RESTORE_PGPORT: $PGPORT" >&2
  exit 2
fi

# Create the log directory BEFORE the work directory: no cleanup handler
# is installed yet at this point, so if mkdir failed after mktemp had
# already run, the temp directory would be left behind.
mkdir -p "$(dirname "$LOG")"

WORK="$(mktemp -d -t shithub-restore-XXXXXX)"
PGDATA="$WORK/pgdata"

#######################################
# Print the current UTC time as YYYY-MM-DDTHH:MM:SSZ.
#######################################
ts() {
  date -u '+%Y-%m-%dT%H:%M:%SZ'
}

#######################################
# Write a timestamped status line to stdout and append it to $LOG.
# Globals:   LOG (read)
# Arguments: message words, joined via "$*"
#######################################
say() {
  printf '[%s] %s\n' "$(ts)" "$*" \
    | tee -a "$LOG"
}

#######################################
# EXIT handler: stop the temp Postgres if it is running, record a
# failure in the log, then remove (or keep) the work directory.
# Globals:   PGDATA, WORK, KEEP (read)
# Outputs:   status lines via say
# Returns:   nothing meaningful; it never calls exit, so the script
#            keeps the status it was already exiting with
#######################################
cleanup() {
  # Must stay the first statement: $? is the status the script is
  # exiting with.
  local rc=$?

  # Every step below is deliberately best-effort (|| true): a failing
  # cleanup step must not abort the rest of the handler under set -e.
  if [[ -f "$PGDATA/postmaster.pid" ]]; then
    pg_ctl -D "$PGDATA" stop -m fast >/dev/null 2>&1 || true
  fi

  # An abort via set -e would otherwise leave no trace in the log the
  # on-call reviews; the last line would just be e.g. "restoring...".
  if ((rc != 0)); then
    say "FAIL: restore drill exited with status $rc" || true
  fi

  if [[ "$KEEP" -eq 0 ]]; then
    # ${WORK:?} refuses to expand to an empty path.
    rm -rf -- "${WORK:?}" || true
  else
    say "kept work dir: $WORK" || true
  fi
}
trap cleanup EXIT

say "restore drill start (work=$WORK port=$PGPORT)"

# Resolve the dump to restore. An explicit --dump path wins; otherwise
# the newest object under $BUCKET/daily/ is downloaded into $WORK.
RCLONE=(rclone --config /root/.config/rclone/rclone.conf)
if [[ -z "$DUMP" ]]; then
  # "Newest" means last in byte-wise sort order, so this presumably
  # relies on date-stamped object names — TODO confirm against the
  # backup job. LC_ALL=C keeps the order independent of the locale.
  # With pipefail a failing rclone makes the whole substitution fail.
  if ! LATEST="$("${RCLONE[@]}" lsf "$BUCKET/daily/" --recursive --files-only \
    | LC_ALL=C sort | tail -n 1)"; then
    say "FAIL: could not list $BUCKET/daily/"
    exit 1
  fi
  if [[ -z "$LATEST" ]]; then
    say "FAIL: no dumps found in $BUCKET/daily/"
    exit 1
  fi
  DUMP="$WORK/$(basename -- "$LATEST")"
  say "fetching $LATEST"
  if ! "${RCLONE[@]}" copyto "$BUCKET/daily/$LATEST" "$DUMP"; then
    say "FAIL: could not fetch $BUCKET/daily/$LATEST"
    exit 1
  fi
elif [[ ! -e "$DUMP" ]]; then
  # Catch a mistyped --dump path now rather than after initdb.
  say "FAIL: dump not found: $DUMP"
  exit 1
fi
say "using dump: $DUMP"

# Locate the smoke queries before the (slow) restore so that a missing
# file fails the drill immediately instead of at the very end.
SMOKE_SQL="$(dirname -- "$0")/smoke-queries.sql"
if [[ ! -r "$SMOKE_SQL" ]]; then
  say "FAIL: smoke queries not readable: $SMOKE_SQL"
  exit 1
fi

#######################################
# Run a command with stdout and stderr copied to the terminal and
# appended to $LOG, so tool output (restore errors, smoke-query
# results) is available to whoever reviews the log.
# Globals:   LOG (read)
# Arguments: the command and its arguments
# Returns:   under `set -o pipefail`, non-zero if the command fails
#######################################
run_logged() {
  "$@" 2>&1 | tee -a "$LOG"
}

# initdb + start.
# NOTE(review): PostgreSQL's initdb and server refuse to run as root,
# while the rclone config used by this script lives under /root —
# confirm which user the drill is meant to run as.
initdb -D "$PGDATA" -U postgres --auth=trust --no-locale --encoding=UTF8 >/dev/null
{
  echo "port = $PGPORT"
  # The instance uses trust auth and holds a copy of production data,
  # so do not open a TCP listener at all. Every client below connects
  # through the Unix socket in $WORK.
  echo "listen_addresses = ''"
  echo "unix_socket_directories = '$WORK'"
} >>"$PGDATA/postgresql.conf"
pg_ctl -D "$PGDATA" -l "$WORK/pg.log" -w start

# Restore.
createdb -h "$WORK" -p "$PGPORT" -U postgres shithub
say "restoring..."
run_logged pg_restore --host="$WORK" --port="$PGPORT" --username=postgres \
  --dbname=shithub --no-owner --no-privileges --jobs=4 "$DUMP"

# Smoke checks. -X keeps the caller's ~/.psqlrc from affecting the run.
say "running smoke queries"
run_logged psql -X -h "$WORK" -p "$PGPORT" -U postgres -d shithub \
  -v ON_ERROR_STOP=1 -f "$SMOKE_SQL"

say "restore drill OK"