| 1 | #!/usr/bin/env bash |
| 2 | # SPDX-License-Identifier: AGPL-3.0-or-later |
| 3 | # |
| 4 | # Restore drill — exercises the recovery path end-to-end so that |
| 5 | # we know our backups actually restore. Run quarterly (the calendar |
| 6 | # entry is in runbooks/backups.md). The script: |
| 7 | # |
| 8 | # 1. Spins up an empty Postgres in a temp data directory. |
| 9 | # 2. Pulls the latest daily dump from Spaces (or an explicit |
| 10 | # --dump path). |
| 11 | # 3. pg_restores into the temp instance. |
| 12 | # 4. Runs smoke-queries.sql to confirm row counts and integrity. |
| 13 | # 5. Tears the temp instance down. |
| 14 | # |
| 15 | # Exits non-zero on any failure. Output is appended to |
| 16 | # /var/log/shithub/restore-drill.log so the on-call can review. |
| 17 | |
# Strict mode: abort on a failing command, on use of an unset variable,
# and when any stage of a pipeline fails.
set -euo pipefail

# Command-line state, filled in by parse_args.
DUMP=""   # path given with --dump; empty means "fetch the latest daily dump"
KEEP=0    # set to 1 by --keep: the work directory is left in place on exit

#######################################
# Parse the command-line flags.
# Globals:   DUMP (written), KEEP (written)
# Arguments: the script's arguments; accepts "--dump PATH" and "--keep"
# Outputs:   usage errors on stderr
# Returns:   0 on success; exits 2 on an unknown flag or when --dump
#            has no value
#######################################
parse_args() {
  while [[ $# -gt 0 ]]; do
    case "$1" in
      --dump)
        # Without this check a trailing --dump dies on "$2" with an opaque
        # "unbound variable" error (set -u) instead of a usage error.
        if [[ $# -lt 2 ]]; then
          echo "missing value for --dump" >&2
          exit 2
        fi
        DUMP="$2"
        shift 2
        ;;
      --keep)
        KEEP=1
        shift
        ;;
      *)
        echo "unknown arg: $1" >&2
        exit 2
        ;;
    esac
  done
}

parse_args "$@"
| 29 | |
# Settings for the drill.
#   LOG    - file the drill log lines are appended to
#   BUCKET - rclone remote:path that holds the backups
#            (override with SHITHUB_BACKUP_BUCKET)
#   PGPORT - port of the throwaway Postgres instance
#            (override with SHITHUB_RESTORE_PGPORT)
LOG="/var/log/shithub/restore-drill.log"
BUCKET="${SHITHUB_BACKUP_BUCKET:-spaces-prod:shithub-backups}"
PGPORT="${SHITHUB_RESTORE_PGPORT:-55432}"

# Fresh scratch directory for this run; the data directory lives inside it.
WORK="$(mktemp -d -t shithub-restore-XXXXXX)"
PGDATA="${WORK}/pgdata"

# Make sure the log file's parent directory exists.
mkdir -p "${LOG%/*}"
| 36 | |
#######################################
# Print the current UTC time as an ISO-8601 timestamp,
# e.g. 2024-01-31T23:59:59Z.
#######################################
ts() {
  date -u '+%Y-%m-%dT%H:%M:%SZ'
}

#######################################
# Emit one timestamped log line.
# Globals:   LOG (read) - file the line is appended to
# Arguments: the message words (combined with "$*")
# Outputs:   "[<timestamp>] <message>" on stdout, also appended to LOG
#######################################
say() {
  local entry
  printf -v entry '[%s] %s' "$(ts)" "$*"
  printf '%s\n' "$entry" | tee -a "$LOG"
}
| 39 | |
#######################################
# EXIT handler: stop the throwaway Postgres instance, record a failed
# run in the log, and remove the work directory unless --keep was given.
# Globals:   PGDATA, WORK, KEEP (read)
# Outputs:   log lines via say
#######################################
cleanup() {
  # Must be the first statement: $? still holds the status the script is
  # exiting with.
  local status=$?

  # Only try to stop the server when a pid file shows one was started.
  if [[ -f "$PGDATA/postmaster.pid" ]]; then
    # Best effort: a server that is already gone must not fail the cleanup.
    pg_ctl -D "$PGDATA" stop -m fast >/dev/null 2>&1 || true
  fi

  # A command failing under set -e ends the script without any message of
  # its own, so leave a trace in the log. Best effort, so that a logging
  # problem cannot get in the way of the rest of the cleanup.
  if [[ "$status" -ne 0 ]]; then
    say "FAIL: restore drill aborted (exit status $status)" || true
  fi

  if [[ "$KEEP" -eq 0 ]]; then
    # ${WORK:?} refuses to expand an empty path; "--" ends option parsing.
    rm -rf -- "${WORK:?}"
  else
    say "kept work dir: $WORK"
  fi
}
trap cleanup EXIT
| 51 | |
say "restore drill start (work=$WORK port=$PGPORT)"

# 1. Resolve dump path.
if [[ -z "$DUMP" ]]; then
  # No --dump given: fetch the newest daily dump from the bucket.
  RCLONE_CONF="/root/.config/rclone/rclone.conf"

  # The last name in sorted order is taken as the newest dump.
  # NOTE(review): this assumes object names sort chronologically — confirm
  # against the backup job's naming scheme.
  # LC_ALL=C keeps the ordering bytewise and independent of the caller's
  # locale.
  LATEST="$(rclone --config "$RCLONE_CONF" \
    lsf "$BUCKET/daily/" --recursive --files-only \
    | LC_ALL=C sort | tail -n 1)"
  if [[ -z "$LATEST" ]]; then
    say "FAIL: no dumps found in $BUCKET/daily/"
    exit 1
  fi

  # Download into the work directory under the object's base name.
  DUMP="$WORK/$(basename "$LATEST")"
  say "fetching $LATEST"
  rclone --config "$RCLONE_CONF" copyto "$BUCKET/daily/$LATEST" "$DUMP"
fi

# Fail fast, before a Postgres instance is initialised and started, when
# the dump (for example a mistyped --dump path) is not there to restore.
if [[ ! -r "$DUMP" ]]; then
  say "FAIL: dump not found or unreadable: $DUMP"
  exit 1
fi
say "using dump: $DUMP"
| 69 | |
# 2. initdb + start.
# NOTE(review): initdb and pg_ctl refuse to run as root, while the dump
# fetch reads an rclone config under /root — confirm which user runs the
# drill.
initdb -D "$PGDATA" -U postgres --auth=trust --no-locale --encoding=UTF8 >/dev/null

# Settings appended to the generated postgresql.conf:
#   port                    - drill port; also part of the socket file name
#   unix_socket_directories - keep the socket inside the work directory
#   listen_addresses        - empty disables TCP/IP. With --auth=trust any
#                             connection is accepted without a password;
#                             the default (localhost) would let every local
#                             user reach the restored data over TCP. The
#                             drill itself only connects through the socket.
{
  printf 'port = %s\n' "$PGPORT"
  printf "unix_socket_directories = '%s'\n" "$WORK"
  printf "listen_addresses = ''\n"
} >>"$PGDATA/postgresql.conf"

# -w waits until the server has finished starting; the server log goes to
# pg.log in the work directory.
pg_ctl -D "$PGDATA" -l "$WORK/pg.log" -w start
| 75 | |
# 3. Restore.
# Connection flags shared by both client tools: the Unix socket in the
# work directory, the drill port, and the postgres role created by initdb.
conn=(--host="$WORK" --port="$PGPORT" --username=postgres)

# Create the empty target database, then load the dump into it with four
# parallel jobs, ignoring the ownership and privileges recorded in the dump.
createdb "${conn[@]}" shithub
say "restoring..."
pg_restore "${conn[@]}" --dbname=shithub \
  --no-owner --no-privileges --jobs=4 "$DUMP"
| 81 | |
# 4. Smoke checks.
say "running smoke queries"

# smoke-queries.sql is looked up in the directory of this script.
smoke_sql="$(dirname "$0")/smoke-queries.sql"

# --no-psqlrc: do not read the invoking user's or the system-wide psqlrc,
#   which could change settings (ON_ERROR_STOP included) and make the
#   drill depend on who runs it.
# ON_ERROR_STOP=1: psql stops at the first failing statement and exits
#   non-zero, which set -e turns into a failed drill.
psql --no-psqlrc -h "$WORK" -p "$PGPORT" -U postgres -d shithub \
  -v ON_ERROR_STOP=1 -f "$smoke_sql"

say "restore drill OK"