tenseleyflow/shithub / 195ca8f

Browse files

S37: restore-drill (run.sh + smoke queries)

Authored by espadonne
SHA
195ca8fe804a7eeb725e6ae69aecd7e9e396fca3
Parents
4340aea
Tree
2adf9cb

2 changed files

StatusFile+-
A deploy/restore-drill/run.sh 87 0
A deploy/restore-drill/smoke-queries.sql 43 0
deploy/restore-drill/run.sh added
@@ -0,0 +1,87 @@
#!/usr/bin/env bash
# SPDX-License-Identifier: AGPL-3.0-or-later
#
# Restore drill — exercises the recovery path end-to-end so that
# we know our backups actually restore. Run quarterly (the calendar
# entry is in runbooks/backups.md). The script:
#
#   1. Resolves the dump: pulls the latest daily dump from Spaces,
#      or uses an explicit --dump path.
#   2. Spins up an empty Postgres in a temp data directory.
#   3. pg_restores into the temp instance.
#   4. Runs smoke-queries.sql to confirm row counts and integrity.
#   5. Stops the temp instance and removes its work directory
#      (--keep preserves the directory for inspection).
#
# Usage: run.sh [--dump PATH] [--keep]
#
# Exits non-zero on any failure. Timestamped progress messages are
# appended to /var/log/shithub/restore-drill.log so the on-call can
# review.

set -euo pipefail

DUMP=""
KEEP=0

# Parse command-line flags into the DUMP and KEEP globals.
# Globals:   DUMP (written), KEEP (written)
# Arguments: the script's argument vector
# Exits:     2 on an unknown flag or when --dump has no value
parse_args() {
  while [[ $# -gt 0 ]]; do
    case "$1" in
      --dump)
        # Check the arity explicitly: under `set -u` a bare "$2" would
        # abort with an opaque "unbound variable" message instead of a
        # usage error.
        if [[ $# -lt 2 ]]; then
          echo "--dump requires a path argument" >&2
          exit 2
        fi
        DUMP="$2"
        shift 2
        ;;
      --keep)
        KEEP=1
        shift
        ;;
      *)
        echo "unknown arg: $1" >&2
        exit 2
        ;;
    esac
  done
}

parse_args "$@"
# Configuration. BUCKET and PGPORT may be overridden from the environment.
BUCKET="${SHITHUB_BACKUP_BUCKET:-spaces-prod:shithub-backups}"
PGPORT="${SHITHUB_RESTORE_PGPORT:-55432}"
LOG="/var/log/shithub/restore-drill.log"

# PGPORT is later appended verbatim to postgresql.conf, so refuse anything
# that is not a plain TCP port number.
if [[ ! "$PGPORT" =~ ^[1-9][0-9]{0,4}$ ]] || (( PGPORT > 65535 )); then
  echo "invalid SHITHUB_RESTORE_PGPORT: $PGPORT" >&2
  exit 2
fi

# Create the log directory BEFORE the scratch directory: the EXIT trap is
# not installed yet at this point, so a failure after mktemp would leak
# the temp dir.
mkdir -p "$(dirname "$LOG")"

WORK="$(mktemp -d -t shithub-restore-XXXXXX)"
PGDATA="$WORK/pgdata"

# Print the current UTC time as an ISO-8601 timestamp.
ts() { date -u +%Y-%m-%dT%H:%M:%SZ; }

# Print a timestamped message to stdout and append the same line to $LOG.
# Arguments: the message words (joined with spaces)
say() { printf '[%s] %s\n' "$(ts)" "$*" | tee -a "$LOG"; }
# EXIT handler: record a failing exit in the log, stop the scratch Postgres
# if it is running, and remove the work dir unless --keep was given.
# Globals: PGDATA, WORK, KEEP (read)
# Every step is best-effort so that a problem during cleanup cannot replace
# the status the script was already exiting with.
cleanup() {
  local rc=$?   # must come first: the status the script is exiting with

  if [[ "$rc" -ne 0 ]]; then
    # Without this line a failed drill leaves no trace in the log file.
    say "FAIL: restore drill exited with status $rc" || true
  fi

  if [[ -f "$PGDATA/postmaster.pid" ]]; then
    # Try a fast shutdown first; fall back to an immediate one so that no
    # server is left running on a directory that is about to be deleted.
    pg_ctl -D "$PGDATA" stop -m fast >/dev/null 2>&1 \
      || pg_ctl -D "$PGDATA" stop -m immediate >/dev/null 2>&1 \
      || true
  fi

  if [[ "$KEEP" -eq 0 ]]; then
    # ${WORK:?} aborts rather than expanding to an empty path.
    rm -rf -- "${WORK:?}" || true
  else
    say "kept work dir: $WORK" || true
  fi
}
trap cleanup EXIT
say "restore drill start (work=$WORK port=$PGPORT)"

# Check the smoke-query file before doing any expensive work, so a missing
# file fails in seconds rather than after a full restore.
SMOKE_SQL="$(dirname "$0")/smoke-queries.sql"
if [[ ! -r "$SMOKE_SQL" ]]; then
  say "FAIL: smoke queries not readable: $SMOKE_SQL"
  exit 1
fi

# Shared rclone invocation prefix (binary + config file).
RCLONE=(rclone --config /root/.config/rclone/rclone.conf)

# 1. Resolve dump path.
if [[ -z "$DUMP" ]]; then
  # Picks the lexicographically last path under daily/; LC_ALL=C keeps the
  # ordering independent of the caller's locale.
  # NOTE(review): assumes dump names sort chronologically — confirm against
  # the backup job's naming scheme.
  LATEST="$("${RCLONE[@]}" lsf "$BUCKET/daily/" --recursive --files-only \
              | LC_ALL=C sort | tail -n 1)"
  if [[ -z "$LATEST" ]]; then
    say "FAIL: no dumps found in $BUCKET/daily/"
    exit 1
  fi
  DUMP="$WORK/$(basename -- "$LATEST")"
  say "fetching $LATEST"
  "${RCLONE[@]}" copyto "$BUCKET/daily/$LATEST" "$DUMP"
elif [[ ! -e "$DUMP" ]]; then
  # An explicit --dump path that does not exist should fail here, not
  # after initdb and server start.
  say "FAIL: dump not found: $DUMP"
  exit 1
fi
say "using dump: $DUMP"

# 2. initdb + start.
initdb -D "$PGDATA" -U postgres --auth=trust --no-locale --encoding=UTF8 >/dev/null
echo "port = $PGPORT" >> "$PGDATA/postgresql.conf"
# Put the Unix socket inside the work dir; the clients below reach the
# server by passing that directory as the host.
echo "unix_socket_directories = '$WORK'" >> "$PGDATA/postgresql.conf"
pg_ctl -D "$PGDATA" -l "$WORK/pg.log" -w start

# 3. Restore.
createdb -h "$WORK" -p "$PGPORT" -U postgres shithub
say "restoring..."
pg_restore --host="$WORK" --port="$PGPORT" --username=postgres \
           --dbname=shithub --no-owner --no-privileges --jobs=4 "$DUMP"

# 4. Smoke checks. ON_ERROR_STOP makes psql exit non-zero on the first
# failing statement, which aborts the drill under `set -e`.
say "running smoke queries"
psql -h "$WORK" -p "$PGPORT" -U postgres -d shithub \
     -v ON_ERROR_STOP=1 -f "$SMOKE_SQL"

say "restore drill OK"
deploy/restore-drill/smoke-queries.sql added
@@ -0,0 +1,43 @@
-- SPDX-License-Identifier: AGPL-3.0-or-later
--
-- Smoke checks for the restore drill. Each \echo line is the human
-- name of the check. We use psql's ON_ERROR_STOP=1 so any failed
-- assertion exits non-zero and the drill reports failure.
--
-- Keep these checks coarse: confirm core tables are non-empty,
-- sanity-check the primary foreign-key relationships, ensure the
-- migrations table exists. Don't compare counts to fixed numbers —
-- counts grow over time. Compare to internal invariants instead.

-- Assertion helper. It lives in pg_temp, so it is private to this session
-- and leaves nothing behind in the restored database. Raising a named
-- exception (rather than forcing a division by zero) makes the psql error
-- say which invariant was violated.
CREATE FUNCTION pg_temp.drill_assert(passed boolean, failure text)
RETURNS boolean
LANGUAGE plpgsql
AS $$
BEGIN
  IF passed IS NOT TRUE THEN
    RAISE EXCEPTION 'restore drill check failed: %', failure;
  END IF;
  RETURN true;
END
$$;

\echo === schema_migrations exists and has rows ===
SELECT pg_temp.drill_assert(COUNT(*) > 0, 'schema_migrations is empty') AS ok
  FROM schema_migrations;

\echo === core tables non-empty ===
SELECT pg_temp.drill_assert(COUNT(*) > 0, 'users is empty') AS ok FROM users;
SELECT pg_temp.drill_assert(COUNT(*) > 0, 'repos is empty') AS ok FROM repos;

\echo === every repo has a real owner ===
-- NOTE(review): owner_id is matched against both users.id and orgs.id. If
-- those two id spaces can overlap, a missing user could be masked by an
-- org with the same id — confirm against the schema.
SELECT pg_temp.drill_assert(
         COUNT(*) = 0,
         format('%s repos have no matching user or org owner', COUNT(*))
       ) AS orphan_repos
  FROM repos r
  LEFT JOIN users u ON u.id = r.owner_id
  LEFT JOIN orgs  o ON o.id = r.owner_id
  WHERE u.id IS NULL AND o.id IS NULL;

\echo === every push_event references a real repo ===
SELECT pg_temp.drill_assert(
         COUNT(*) = 0,
         format('%s push_events reference a missing repo', COUNT(*))
       ) AS orphan_push_events
  FROM push_events pe
  LEFT JOIN repos r ON r.id = pe.repo_id
  WHERE r.id IS NULL;

\echo === every issue belongs to a repo ===
SELECT pg_temp.drill_assert(
         COUNT(*) = 0,
         format('%s issues reference a missing repo', COUNT(*))
       ) AS orphan_issues
  FROM issues i
  LEFT JOIN repos r ON r.id = i.repo_id
  WHERE r.id IS NULL;

\echo === auth_audit_log columns intact ===
-- Fails only if the table or one of the named columns is missing; an
-- empty table still passes.
SELECT actor_user_id, action, occurred_at FROM auth_audit_log LIMIT 1;

\echo === migrations applied through latest known ===
-- Informational: prints the newest recorded version for the operator to
-- eyeball. Nothing is compared against it here.
SELECT version FROM schema_migrations ORDER BY version DESC LIMIT 1;