| 1 | #!/usr/bin/env bash |
| 2 | # SPDX-License-Identifier: AGPL-3.0-or-later |
| 3 | # |
| 4 | # Restore drill — exercises the recovery path end-to-end so that |
| 5 | # we know our backups actually restore. Run quarterly (the calendar |
| 6 | # entry is in runbooks/backups.md). The script: |
| 7 | # |
| 8 | # 1. Spins up an empty Postgres in a temp data directory. |
| 9 | # 2. Pulls the latest daily dump from Spaces (or an explicit |
| 10 | # --dump path). |
| 11 | # 3. pg_restores into the temp instance. |
| 12 | # 4. Runs smoke-queries.sql to confirm row counts and integrity. |
| 13 | # 5. Tears the temp instance down. |
| 14 | # |
# Must run as root: the script chowns its work files to postgres, runs
# tools via sudo -u postgres and writes under /var/log/shithub (rclone
# is given --config /etc/rclone-shithub.conf explicitly). The
# server-side Postgres tools are invoked under sudo -u postgres
# because Postgres refuses to start as root, and the WORK directory
# is chowned to postgres so the daemon and pg_restore can read the
# dump and write to the data dir.
| 20 | # |
| 21 | # Exits non-zero on any failure. Output is appended to |
| 22 | # /var/log/shithub/restore-drill.log so the on-call can review. |
| 23 | |
set -euo pipefail

# Command-line state, filled in by parse_args:
#   DUMP - path given with --dump; empty means "fetch the latest daily dump".
#   KEEP - 1 when --keep was passed (the work dir is kept on exit), else 0.
DUMP=""
KEEP=0

# parse_args ARG...
# Parses the script's options into the globals DUMP and KEEP.
#   --dump PATH   restore from PATH instead of downloading a dump
#   --keep        keep the work directory on exit
# Exits 2 on an unknown argument or when --dump has no value.
parse_args() {
  while [[ $# -gt 0 ]]; do
    case "$1" in
      --dump)
        # Without this guard a trailing --dump trips `set -u` on "$2" and
        # dies with a bare "unbound variable" instead of a usage error.
        if [[ $# -lt 2 ]]; then
          echo "missing value for --dump" >&2
          exit 2
        fi
        DUMP="$2"
        shift 2
        ;;
      --keep)
        KEEP=1
        shift
        ;;
      *)
        echo "unknown arg: $1" >&2
        exit 2
        ;;
    esac
  done
}
parse_args "$@"
| 35 | |
# Server-side tools (initdb, pg_ctl) live under the versioned bindir
# on Debian/Ubuntu and aren't on root's PATH. Pick the highest version
# present so we don't break when the cluster bumps majors.
#
# The candidates are collected with a glob rather than `ls | sort | tail`:
# under `set -o pipefail` a failing `ls` (no match) makes the whole
# assignment fail, and `set -e` then aborts the script before the
# "fatal" message below is ever printed.
PG_BIN=""
shopt -s nullglob # an unmatched glob must expand to nothing, not itself
pg_bindirs=(/usr/lib/postgresql/*/bin)
shopt -u nullglob
if [[ ${#pg_bindirs[@]} -gt 0 ]]; then
  # sort -V compares version numbers, so .../9/bin sorts before .../16/bin.
  PG_BIN="$(printf '%s\n' "${pg_bindirs[@]}" | sort -V | tail -n 1)"
fi
if [[ -z "$PG_BIN" || ! -x "$PG_BIN/initdb" ]]; then
  echo "fatal: no postgres server bindir under /usr/lib/postgresql/*/bin" >&2
  exit 2
fi
| 44 | |
# Run-wide settings.
#   BUCKET - rclone remote:path holding the dumps (env SHITHUB_BACKUP_BUCKET).
#   LOG    - file the drill's output is appended to.
#   PGPORT - port of the throwaway instance (env SHITHUB_RESTORE_PGPORT).
#   WORK   - fresh temp dir for the dump, socket, server log and data dir.
#   PGDATA - data directory of the throwaway instance, inside WORK.
BUCKET="${SHITHUB_BACKUP_BUCKET:-spaces-prod:shithub-backups}"
LOG="/var/log/shithub/restore-drill.log"
PGPORT="${SHITHUB_RESTORE_PGPORT:-55432}"
WORK="$(mktemp -d -t shithub-restore-XXXXXX)"
# Provisional EXIT trap: if chown or mkdir below fails (e.g. not running
# as root), the temp dir would otherwise be left behind. A later
# `trap ... EXIT` simply replaces this one.
trap 'rm -rf -- "$WORK"' EXIT
chown postgres:postgres "$WORK"
PGDATA="$WORK/pgdata"
mkdir -p "$(dirname "$LOG")"
| 52 | |
# ts: print the current time in UTC as an ISO-8601 timestamp with a
# trailing Z, e.g. 2024-01-02T03:04:05Z.
ts() {
  date -u '+%Y-%m-%dT%H:%M:%SZ'
}
# say WORD...: emit one "[timestamp] message" line, where the message is
# all arguments joined as "$*". The line goes to stdout and is also
# appended to $LOG (tee -a).
say() {
  tee -a "$LOG" <<< "[$(ts)] $*"
}
| 55 | |
# cleanup: EXIT handler.
#   1. Stops the temp Postgres instance if its postmaster.pid exists.
#   2. If the script is exiting non-zero, records that in the log, so a
#      run aborted by `set -e` does not just stop mid-log.
#   3. Removes $WORK, unless --keep was given (KEEP != 0), in which case
#      the directory's path is logged instead.
# Globals read: PGDATA, PG_BIN, KEEP, WORK (and LOG, via say).
cleanup() {
  # Must be the first statement: the status the script is exiting with.
  local rc=$?
  if [[ -f "$PGDATA/postmaster.pid" ]]; then
    # Best effort; a server that is already gone must not mask the result.
    sudo -u postgres "$PG_BIN/pg_ctl" -D "$PGDATA" stop -m fast >/dev/null 2>&1 || true
  fi
  if [[ "$rc" -ne 0 ]]; then
    # `|| true`: a logging failure must not stop the removal below.
    say "FAIL: restore drill aborted (exit $rc)" || true
  fi
  if [[ "$KEEP" -eq 0 ]]; then
    # ${WORK:?} refuses to run rm -rf with an empty path.
    rm -rf -- "${WORK:?}"
  else
    say "kept work dir: $WORK" || true
  fi
}
trap cleanup EXIT
| 67 | |
say "restore drill start (work=$WORK port=$PGPORT pg=$PG_BIN)"

# 1. Resolve dump path. --s3-no-check-bucket: scoped Spaces keys lack
#    GetBucketLocation; the actual GET works fine.
rclone_base=(rclone --config /etc/rclone-shithub.conf --s3-no-check-bucket)
if [[ -z "$DUMP" ]]; then
  # No --dump given: take the object whose path sorts last under daily/
  # (assumes dump names sort chronologically). LC_ALL=C keeps the order
  # byte-wise and independent of the caller's locale. The explicit `if !`
  # is needed because a failed listing would otherwise abort through
  # set -e/pipefail without leaving a FAIL line in the log.
  if ! LATEST="$("${rclone_base[@]}" lsf "$BUCKET/daily/" --recursive --files-only \
    | LC_ALL=C sort | tail -n 1)"; then
    say "FAIL: could not list $BUCKET/daily/"
    exit 1
  fi
  if [[ -z "$LATEST" ]]; then
    say "FAIL: no dumps found in $BUCKET/daily/"
    exit 1
  fi
  DUMP="$WORK/$(basename -- "$LATEST")"
  say "fetching $LATEST"
  if ! "${rclone_base[@]}" copyto "$BUCKET/daily/$LATEST" "$DUMP"; then
    say "FAIL: could not fetch $BUCKET/daily/$LATEST"
    exit 1
  fi
elif [[ ! -e "$DUMP" ]]; then
  # Explicit --dump that does not exist: say so instead of dying in chown.
  say "FAIL: dump not found: $DUMP"
  exit 1
fi
# NOTE: for an explicit --dump this changes the owner of the caller's file.
chown postgres:postgres "$DUMP"
say "using dump: $DUMP"
| 87 | |
# 2. initdb + start. Run as postgres because the server refuses root.
sudo -u postgres "$PG_BIN/initdb" -D "$PGDATA" -U postgres \
  --auth=trust --no-locale --encoding=UTF8 >/dev/null
# --auth=trust lets anyone who can reach the instance in as superuser, so
# keep it off TCP entirely (listen_addresses = ''): the only way in is the
# Unix socket inside WORK, which is what every client in this script uses
# (-h "$WORK"). PGPORT still matters because it is part of the socket
# file name.
{
  printf '%s\n' "port = $PGPORT"
  printf '%s\n' "listen_addresses = ''"
  printf '%s\n' "unix_socket_directories = '$WORK'"
} | sudo -u postgres tee -a "$PGDATA/postgresql.conf" >/dev/null
# -w blocks until the server accepts connections; server output goes to
# $WORK/pg.log.
sudo -u postgres "$PG_BIN/pg_ctl" -D "$PGDATA" -l "$WORK/pg.log" -w start >/dev/null
| 95 | |
# 3. Restore. Create the target database over the Unix socket in WORK,
#    then load the dump into it; pg_restore's stdout and stderr are
#    appended to the log.
restore_args=(
  --host="$WORK"
  --port="$PGPORT"
  --username=postgres
  --dbname=shithub
  --no-owner
  --no-privileges
  --jobs=4
)
sudo -u postgres createdb -h "$WORK" -p "$PGPORT" -U postgres shithub
say "restoring..."
sudo -u postgres pg_restore "${restore_args[@]}" "$DUMP" >> "$LOG" 2>&1
| 102 | |
# 4. Smoke checks. Copy the .sql into WORK so the postgres user can read it
#    (the script lives under /root which is mode 0700).
cp -- "$(dirname -- "$0")/smoke-queries.sql" "$WORK/smoke.sql"
chown postgres:postgres "$WORK/smoke.sql"
say "running smoke queries"
# -X: do not read any psqlrc start-up file, so local psql settings cannot
# change how the checks run. ON_ERROR_STOP=1: the first failing statement
# makes psql exit non-zero, which fails the drill through set -e.
sudo -u postgres psql -X -h "$WORK" -p "$PGPORT" -U postgres -d shithub \
  -v ON_ERROR_STOP=1 -f "$WORK/smoke.sql" >> "$LOG" 2>&1

say "restore drill OK"