Bash · 4064 bytes Raw Blame History
1 #!/usr/bin/env bash
2 # SPDX-License-Identifier: AGPL-3.0-or-later
3 #
4 # Restore drill — exercises the recovery path end-to-end so that
5 # we know our backups actually restore. Run quarterly (the calendar
6 # entry is in runbooks/backups.md). The script:
7 #
8 # 1. Spins up an empty Postgres in a temp data directory.
9 # 2. Pulls the latest daily dump from Spaces (or an explicit
10 # --dump path).
11 # 3. pg_restores into the temp instance.
12 # 4. Runs smoke-queries.sql to confirm row counts and integrity.
13 # 5. Tears the temp instance down.
14 #
# Must run as root: the script chowns files to postgres, switches user
# with sudo -u postgres, and writes under /var/log/shithub. (rclone is
# given its config explicitly via --config /etc/rclone-shithub.conf.)
# The server-side Postgres tools are invoked under sudo -u postgres
# because Postgres refuses to start as root, and the WORK directory
# is chowned to postgres so the daemon and pg_restore can read the
# dump and write to the data dir.
20 #
21 # Exits non-zero on any failure. Output is appended to
22 # /var/log/shithub/restore-drill.log so the on-call can review.
23
set -euo pipefail

# Command-line state, filled in by parse_args:
#   DUMP - path to an existing dump file (--dump PATH); empty means
#          "fetch the latest daily dump from the bucket".
#   KEEP - 1 when --keep was given (leave the work dir behind), else 0.
DUMP=""
KEEP=0

#######################################
# Parse the script's command-line flags.
# Globals:   DUMP (written), KEEP (written)
# Arguments: the script's own arguments ("$@")
# Outputs:   usage errors on stderr
# Returns:   0 on success; exits 2 on an unknown flag or when --dump
#            is given without a value
#######################################
parse_args() {
  while [[ $# -gt 0 ]]; do
    case "$1" in
      --dump)
        # Without this check `set -u` aborts on "$2" with a bare
        # "unbound variable" message instead of a usage error.
        if [[ $# -lt 2 ]]; then
          echo "--dump requires a path argument" >&2
          exit 2
        fi
        DUMP="$2"
        shift 2
        ;;
      --keep)
        KEEP=1
        shift
        ;;
      *)
        echo "unknown arg: $1" >&2
        exit 2
        ;;
    esac
  done
}

parse_args "$@"
35
# Server-side tools (initdb, pg_ctl) live under the versioned bindir
# on Debian/Ubuntu and aren't on root's PATH. Pick the highest version
# that ships initdb so we don't break when the cluster bumps majors.
#
# Candidates come from a plain glob rather than an `ls | sort` pipeline:
# under `set -eo pipefail` a failing `ls` (nothing matched) would abort
# the script inside the assignment, before the "fatal:" message below
# could be printed.
PG_BIN=""
pg_bindirs=()
for pg_bindir in /usr/lib/postgresql/*/bin; do
  # An unmatched glob is left as the literal pattern; the -x test
  # rejects that, as well as bindirs that hold no initdb.
  if [[ -x "$pg_bindir/initdb" ]]; then
    pg_bindirs+=("$pg_bindir")
  fi
done
if [[ ${#pg_bindirs[@]} -gt 0 ]]; then
  # sort -V compares the version component numerically (9 < 14 < 16).
  PG_BIN="$(printf '%s\n' "${pg_bindirs[@]}" | sort -V | tail -n 1)"
fi
unset pg_bindirs pg_bindir
if [[ -z "$PG_BIN" ]]; then
  echo "fatal: no postgres server bindir under /usr/lib/postgresql/*/bin" >&2
  exit 2
fi
44
# rclone remote:path holding the backups; SHITHUB_BACKUP_BUCKET overrides.
BUCKET=${SHITHUB_BACKUP_BUCKET:-spaces-prod:shithub-backups}

# Scratch directory for the drill (data dir, socket, fetched dump).
# Handed to postgres so the tools run under sudo -u postgres can use it.
WORK=$(mktemp -d -t shithub-restore-XXXXXX)
chown postgres:postgres "$WORK"
PGDATA=$WORK/pgdata

# Port of the throwaway instance; SHITHUB_RESTORE_PGPORT overrides.
PGPORT=${SHITHUB_RESTORE_PGPORT:-55432}

# Drill log; make sure its directory exists before the first write.
LOG=/var/log/shithub/restore-drill.log
mkdir -p "${LOG%/*}"
52
# ts: print the current UTC time as YYYY-MM-DDTHH:MM:SSZ on stdout.
ts() {
  date -u '+%FT%TZ'
}
# say MESSAGE...: print "[<timestamp from ts>] MESSAGE" on stdout and
# append the same line to the file named by $LOG.
say() {
  printf '%s\n' "[$(ts)] $*" | tee -a "$LOG"
}
55
#######################################
# EXIT handler: stop the temporary Postgres if it is running, record a
# failed run in the log, then remove the work directory (or keep it
# when --keep was given).
# Globals:   PGDATA, PG_BIN, WORK, KEEP (read)
# Outputs:   log lines via say
#######################################
cleanup() {
  # Must be the first statement: on entry $? still holds the status the
  # script is exiting with.
  local rc=$?
  if [[ -f "$PGDATA/postmaster.pid" ]]; then
    # Best effort: a failed stop must not prevent the rest of cleanup.
    sudo -u postgres "$PG_BIN/pg_ctl" -D "$PGDATA" stop -m fast >/dev/null 2>&1 || true
  fi
  if [[ "$rc" -ne 0 ]]; then
    # Leave an explicit verdict in the log; a run aborted by `set -e`
    # would otherwise end without any FAIL line.
    say "FAIL: restore drill exited with status $rc" || true
  fi
  if [[ "$KEEP" -eq 0 ]]; then
    # ${WORK:?} refuses to expand to an empty path.
    rm -rf -- "${WORK:?}"
  else
    say "kept work dir: $WORK"
  fi
}
trap cleanup EXIT
67
say "restore drill start (work=$WORK port=$PGPORT pg=$PG_BIN)"

# 1. Resolve dump path. --s3-no-check-bucket: scoped Spaces keys lack
# GetBucketLocation; the actual GET works fine.
rclone_args=(--config /etc/rclone-shithub.conf --s3-no-check-bucket)
if [[ -z "$DUMP" ]]; then
  # "Latest" is the last path in sorted order. LC_ALL=C makes that
  # ordering byte-wise, independent of the caller's locale.
  LATEST="$(rclone "${rclone_args[@]}" \
    lsf "$BUCKET/daily/" --recursive --files-only \
    | LC_ALL=C sort | tail -n 1)"
  if [[ -z "$LATEST" ]]; then
    say "FAIL: no dumps found in $BUCKET/daily/"
    exit 1
  fi
  DUMP="$WORK/$(basename -- "$LATEST")"
  say "fetching $LATEST"
  rclone "${rclone_args[@]}" copyto "$BUCKET/daily/$LATEST" "$DUMP"
fi
# Fail with a clear message (and a log line) if the dump is not there,
# e.g. a mistyped --dump path, rather than with a chown error.
if [[ ! -e "$DUMP" ]]; then
  say "FAIL: dump not found: $DUMP"
  exit 1
fi
# pg_restore runs as postgres and must be able to read the dump.
# NOTE: this also re-owns a caller-supplied --dump file.
chown postgres:postgres "$DUMP"
say "using dump: $DUMP"
87
# 2. initdb + start. Run as postgres because the server refuses root.
sudo -u postgres "$PG_BIN/initdb" -D "$PGDATA" -U postgres --auth=trust --no-locale --encoding=UTF8 >/dev/null
# --auth=trust means anyone who can reach the server is a superuser on
# the restored data, so keep the instance off TCP entirely
# (listen_addresses = '') and reachable only through the socket in
# $WORK. The port is still needed: it names the socket file, and every
# client below connects with the socket directory as its host.
{
  printf 'port = %s\n' "$PGPORT"
  printf "listen_addresses = ''\n"
  printf "unix_socket_directories = '%s'\n" "$WORK"
} | sudo -u postgres tee -a "$PGDATA/postgresql.conf" >/dev/null
# -w: block until the server accepts connections (or startup fails).
sudo -u postgres "$PG_BIN/pg_ctl" -D "$PGDATA" -l "$WORK/pg.log" -w start >/dev/null
95
# 3. Restore. Both client tools talk to the temp instance through the
# unix socket in $WORK; pg_restore's output goes to the drill log.
pg_conn=(--host="$WORK" --port="$PGPORT" --username=postgres)
sudo -u postgres createdb "${pg_conn[@]}" shithub
say "restoring..."
sudo -u postgres pg_restore "${pg_conn[@]}" \
  --dbname=shithub --no-owner --no-privileges --jobs=4 "$DUMP" \
  >>"$LOG" 2>&1
102
# 4. Smoke checks. Copy the .sql into WORK so the postgres user can read it
# (the script lives under /root which is mode 0700).
cp -- "$(dirname -- "$0")/smoke-queries.sql" "$WORK/smoke.sql"
chown postgres:postgres "$WORK/smoke.sql"
say "running smoke queries"
# -X: skip psqlrc files, so a start-up file cannot override
# ON_ERROR_STOP or otherwise change how the smoke script runs.
# ON_ERROR_STOP=1 makes psql exit non-zero at the first failing
# statement, which fails the drill under `set -e`.
sudo -u postgres psql -X -h "$WORK" -p "$PGPORT" -U postgres -d shithub \
  -v ON_ERROR_STOP=1 -f "$WORK/smoke.sql" >>"$LOG" 2>&1

say "restore drill OK"