#!/usr/bin/env bash
# SPDX-License-Identifier: AGPL-3.0-or-later
#
# Restore drill — exercises the recovery path end-to-end so that
# we know our backups actually restore. Run quarterly (the calendar
# entry is in runbooks/backups.md). The script:
#
#   1. Spins up an empty Postgres in a temp data directory.
#   2. Pulls the latest daily dump from Spaces (or uses an explicit
#      --dump path).
#   3. pg_restores into the temp instance.
#   4. Runs smoke-queries.sql to confirm row counts and integrity.
#   5. Tears the temp instance down (pass --keep to retain the
#      work directory for inspection).
#
# Exits non-zero on any failure. Timestamped status messages are
# appended to /var/log/shithub/restore-drill.log so the on-call can
# review.

# Strict mode: abort on a failing command, on use of an unset variable,
# and on a failure in any stage of a pipeline.
set -euo pipefail

# Command-line state, filled in by parse_args:
#   DUMP - path of a local dump to restore; empty means "fetch the latest".
#   KEEP - 1 to leave the work directory in place on exit, 0 to remove it.
DUMP=""
KEEP=0

#######################################
# Parse the command line into DUMP and KEEP.
# Globals:   DUMP (written), KEEP (written)
# Arguments: the script's arguments: [--dump PATH] [--keep]
# Outputs:   usage errors on stderr
# Returns:   0 on success; exits 2 on an unknown argument or when
#            --dump is given without a value
#######################################
parse_args() {
  while [[ $# -gt 0 ]]; do
    case "$1" in
      --dump)
        # Without this check, `set -u` aborts on "$2" with an opaque
        # "unbound variable" message when --dump is the last argument.
        if [[ $# -lt 2 ]]; then
          echo "--dump requires a path argument" >&2
          exit 2
        fi
        DUMP="$2"
        shift 2
        ;;
      --keep)
        KEEP=1
        shift
        ;;
      *)
        echo "unknown arg: $1" >&2
        exit 2
        ;;
    esac
  done
}
parse_args "$@"

# Runtime configuration. Each SHITHUB_* environment variable is optional
# and overrides the default shown here.
#   BUCKET - rclone remote:path that holds the backups.
#   PGPORT - port number of the throwaway Postgres instance.
#   LOG    - file that status messages are appended to.
BUCKET="${SHITHUB_BACKUP_BUCKET:-spaces-prod:shithub-backups}"
PGPORT="${SHITHUB_RESTORE_PGPORT:-55432}"
LOG="${SHITHUB_RESTORE_LOG:-/var/log/shithub/restore-drill.log}"

# Create the log directory before allocating the scratch directory: the
# EXIT trap that removes the scratch directory is only installed further
# down, so a failure here must not leave a stray temp dir behind.
mkdir -p "$(dirname "$LOG")"

# Scratch directory for this run; the temp cluster's data directory
# lives inside it.
WORK="$(mktemp -d -t shithub-restore-XXXXXX)"
PGDATA="$WORK/pgdata"

# ts prints the current UTC time as an ISO-8601 timestamp,
# e.g. 2024-01-31T12:34:56Z.
ts() {
  date -u +%Y-%m-%dT%H:%M:%SZ
}

# say joins its arguments into one line, prefixes it with a timestamp,
# prints it to stdout and appends the same line to $LOG.
say() {
  printf '[%s] %s\n' "$(ts)" "$*" | tee -a "$LOG"
}

#######################################
# EXIT handler: stop the temp Postgres if it is still running, record a
# failed run in the log, and remove the work directory unless --keep
# was given.
# Globals:   PGDATA, WORK, KEEP (read)
# Arguments: none
# Outputs:   status lines via say
#######################################
cleanup() {
  # Must be the first statement: on entry $? still holds the status of
  # the last command run before the handler was invoked.
  local rc=$?

  if [[ -f "$PGDATA/postmaster.pid" ]]; then
    # Best effort: a failed stop must not prevent the rest of the cleanup.
    pg_ctl -D "$PGDATA" stop -m fast >/dev/null 2>&1 || true
  fi

  # A command that fails under `set -e` ends the script without any
  # message of its own, so leave a FAIL line for whoever reads the log.
  # Guarded with `|| true` so a logging problem cannot skip the removal.
  if [[ "$rc" -ne 0 ]]; then
    say "FAIL: restore drill aborted (exit status $rc)" || true
  fi

  if [[ "$KEEP" -eq 0 ]]; then
    # ${WORK:?} refuses to run rm -rf with an empty path.
    rm -rf -- "${WORK:?}"
  else
    say "kept work dir: $WORK"
  fi
}
trap cleanup EXIT

say "restore drill start (work=$WORK port=$PGPORT)"

# 1. Resolve dump path.
# rclone configuration file; set SHITHUB_RCLONE_CONFIG to use another one.
# NOTE(review): the default lives under /root, yet initdb refuses to run
# as root — confirm which user this drill is expected to run as.
RCLONE_CONF="${SHITHUB_RCLONE_CONFIG:-/root/.config/rclone/rclone.conf}"

if [[ -z "$DUMP" ]]; then
  # No --dump given: take the last entry under daily/ in byte order.
  # LC_ALL=C keeps that order independent of the caller's locale.
  # Assumes object names sort chronologically (e.g. date-stamped) —
  # TODO confirm against the backup job's naming scheme.
  LATEST="$(rclone --config "$RCLONE_CONF" \
    lsf "$BUCKET/daily/" --recursive --files-only \
    | LC_ALL=C sort | tail -n 1)"
  if [[ -z "$LATEST" ]]; then
    say "FAIL: no dumps found in $BUCKET/daily/"
    exit 1
  fi
  # Download into the work directory so cleanup removes it with the rest.
  DUMP="$WORK/$(basename "$LATEST")"
  say "fetching $LATEST"
  rclone --config "$RCLONE_CONF" \
    copyto "$BUCKET/daily/$LATEST" "$DUMP"
elif [[ ! -e "$DUMP" ]]; then
  # Fail before building a cluster that pg_restore could never fill.
  say "FAIL: dump not found: $DUMP"
  exit 1
fi
say "using dump: $DUMP"

# 2. initdb + start.
# Fresh cluster with superuser "postgres" and trust (password-less)
# authentication.
initdb -D "$PGDATA" -U postgres --auth=trust --no-locale --encoding=UTF8 >/dev/null

# Settings appended to postgresql.conf override the earlier defaults.
{
  printf 'port = %s\n' "$PGPORT"
  printf "unix_socket_directories = '%s'\\n" "$WORK"
  # With trust auth anyone who can reach the server is a superuser, so
  # do not listen on TCP at all. Every client call in this script
  # connects through the socket in $WORK, which mktemp -d creates
  # accessible to its owner only.
  printf "listen_addresses = ''\\n"
} >> "$PGDATA/postgresql.conf"

# -w blocks until the server accepts connections; its log goes to the
# work directory.
pg_ctl -D "$PGDATA" -l "$WORK/pg.log" -w start

# 3. Restore.
# The smoke queries sit next to this script. Check for them before the
# restore so a missing file does not surface only after the restore has
# already run.
SMOKE_SQL="$(dirname "$0")/smoke-queries.sql"
if [[ ! -r "$SMOKE_SQL" ]]; then
  say "FAIL: smoke queries not readable: $SMOKE_SQL"
  exit 1
fi

# All clients connect through the Unix socket in $WORK.
createdb -h "$WORK" -p "$PGPORT" -U postgres shithub
say "restoring..."
# --no-owner / --no-privileges: the roles named in the dump do not exist
# in this throwaway cluster.
pg_restore --host="$WORK" --port="$PGPORT" --username=postgres \
  --dbname=shithub --no-owner --no-privileges --jobs=4 "$DUMP"

# 4. Smoke checks.
say "running smoke queries"
# ON_ERROR_STOP=1 makes psql exit non-zero on the first failing
# statement, which aborts the drill under `set -e`.
psql -h "$WORK" -p "$PGPORT" -U postgres -d shithub \
  -v ON_ERROR_STOP=1 -f "$SMOKE_SQL"

say "restore drill OK"