tenseleyflow/shithub / c824190

Browse files

cutover: idempotent provisioner for DO droplet + uptime alerts

Authored by espadonne
SHA
c824190db3d14a3c7c641a1a455a6fd55d0191f7
Parents
c689e3c
Tree
80ab0d2

1 changed file

StatusFile+-
A deploy/cutover/provision-do-alerts.sh 113 0
deploy/cutover/provision-do-alerts.shadded
@@ -0,0 +1,113 @@
1
+#!/usr/bin/env bash
2
+# SPDX-License-Identifier: AGPL-3.0-or-later
3
+#
4
+# Idempotent re-provisioner for the DigitalOcean monitoring alerts
5
+# that today aren't otherwise codified anywhere — they live in DO
6
+# account state. Run from the operator laptop after a fresh account
7
+# (or as a sanity-restore check) to ensure the canonical set exists.
8
+#
9
+# What this script provisions:
10
+#
11
+#   Droplet alerts (resource-utilization, scoped to shithub-app):
12
+#     - CPU > 80% for 10m
13
+#     - Memory > 90% for 10m
14
+#     - Disk > 80% for 10m
15
+#     - Load1 > 4 for 10m
16
+#
17
+#   Uptime check (https://shithub.sh from us_east + eu_west) plus:
18
+#     - down_global → page on full outage (not single-region blip)
19
+#     - ssl_expiry < 14d → catch a Caddy auto-renew failure with buffer
20
+#     - latency p95 > 2s
21
+#
22
+# Idempotency: each create is gated by a list-and-match on description
23
+# (for resource alerts) or name (for uptime check + uptime alerts).
24
+# Re-running is a no-op once everything's in place.
25
+#
26
+# Prereqs:
27
+#   - doctl installed + authenticated
28
+#   - jq
29
+#   - The DO account email is verified (alerts only allow verified
30
+#     team-member emails; new emails are rejected with "email is not
31
+#     verified"). Confirm via `doctl account get`.
32
+
33
set -euo pipefail

# Operator-overridable configuration (environment wins over defaults).
DROPLET_NAME="${DROPLET_NAME:-shithub-app}"
UPTIME_TARGET="${UPTIME_TARGET:-https://shithub.sh}"
UPTIME_NAME="${UPTIME_NAME:-shithub.sh}"

# Check the toolchain BEFORE anything shells out to it. The EMAIL default
# below runs `doctl ... | jq`; under `set -e` + pipefail a missing binary
# would otherwise abort with a bare "command not found" and this friendly
# message would never be printed.
if ! command -v doctl >/dev/null 2>&1 || ! command -v jq >/dev/null 2>&1; then
        echo "fatal: doctl + jq required" >&2
        exit 2
fi

# Alert recipient: ALERT_EMAIL if set and non-empty, else the email that
# `doctl account get` reports. `// empty` keeps a missing field from
# turning into the literal string "null".
EMAIL="${ALERT_EMAIL:-}"
if [[ -z "$EMAIL" ]]; then
        EMAIL="$(doctl account get --output json | jq -r '.email // empty')"
fi
if [[ -z "$EMAIL" ]]; then
        echo "fatal: could not determine alert email; set ALERT_EMAIL" >&2
        exit 2
fi

# Resolve the droplet name to its numeric ID (first match wins). awk reads
# the whole listing instead of exiting early so that doctl never gets a
# SIGPIPE, which pipefail would turn into a script abort on long lists.
DROPLET_ID="$(doctl compute droplet list --format ID,Name --no-header \
        | awk -v n="$DROPLET_NAME" '$2==n && !seen {print $1; seen=1}')"
if [[ -z "$DROPLET_ID" ]]; then
        echo "fatal: no droplet named $DROPLET_NAME" >&2
        exit 2
fi
echo "droplet: $DROPLET_NAME ($DROPLET_ID)" >&2
echo "email:   $EMAIL" >&2
53
+
54
+# ── Droplet resource alerts ────────────────────────────────────────
55
#######################################
# Ensure one droplet resource-alert policy exists; create it when absent.
# Globals:
#   DROPLET_ID (read) - droplet the alert is attached to
#   EMAIL      (read) - recipient passed to --emails
# Arguments:
#   $1 - description; used as the idempotency key
#   $2 - metric type, passed to --type
#   $3 - threshold, passed to --value (comparison is always GreaterThan)
#   $4 - evaluation window, passed to --window
# Outputs:
#   "exists: ..." or "create: ..." progress line on stderr
# Returns:
#   0 when the alert already exists or was created; otherwise the failing
#   command's status.
#######################################
ensure_resource_alert() {
        local desc="$1" type="$2" value="$3" window="$4"
        local existing
        # Match on description AND on this droplet's ID. A description-only
        # match would treat a policy left over from a previous droplet of
        # the same name as "already provisioned" and skip the new droplet.
        # Policies whose JSON carries no entities field fall back to the
        # description-only match. Multiple hits are joined onto one line.
        existing="$(doctl monitoring alert list --output json \
                | jq -r --arg d "$desc" --arg id "$DROPLET_ID" '
                        [ .[]?
                          | select(.description == $d)
                          | select(.entities == null
                                   or any(.entities[]; tostring == $id))
                          | .uuid
                        ] | join(",")')"
        if [[ -n "$existing" ]]; then
                echo "  exists: $desc ($existing)" >&2
                return
        fi
        echo "  create: $desc" >&2
        doctl monitoring alert create \
                --type "$type" --compare GreaterThan --value "$value" --window "$window" \
                --entities "$DROPLET_ID" --emails "$EMAIL" \
                --description "$desc" >/dev/null
}
70
+
71
# Canonical droplet resource alerts. Arguments per call:
# description, metric type, threshold, window.
printf '%s\n' "resource alerts:" >&2
ensure_resource_alert \
        "${DROPLET_NAME} CPU > 80% for 10m" \
        v1/insights/droplet/cpu 80 10m
ensure_resource_alert \
        "${DROPLET_NAME} memory > 90% for 10m" \
        v1/insights/droplet/memory_utilization_percent 90 10m
ensure_resource_alert \
        "${DROPLET_NAME} disk > 80% for 10m" \
        v1/insights/droplet/disk_utilization_percent 80 10m
ensure_resource_alert \
        "${DROPLET_NAME} load1 > 4 for 10m" \
        v1/insights/droplet/load_1 4 10m
76
+
77
+# ── Uptime check ──────────────────────────────────────────────────
78
+echo "uptime check:" >&2
79
+CHECK_ID="$(doctl monitoring uptime list --output json \
80
+        | jq -r --arg n "$UPTIME_NAME" '.[] | select(.name == $n) | .id')"
81
+if [[ -z "$CHECK_ID" ]]; then
82
+        echo "  create: $UPTIME_NAME → $UPTIME_TARGET" >&2
83
+        CHECK_ID="$(doctl monitoring uptime create "$UPTIME_NAME" \
84
+                --target "$UPTIME_TARGET" --type https \
85
+                --regions us_east,eu_west --enabled true \
86
+                --output json | jq -r '.[0].id')"
87
+else
88
+        echo "  exists: $UPTIME_NAME ($CHECK_ID)" >&2
89
+fi
90
+
91
+# ── Uptime alerts ─────────────────────────────────────────────────
92
#######################################
# Ensure one alert exists on the uptime check; create it when absent.
# Globals:
#   CHECK_ID (read) - id of the uptime check the alert belongs to
#   EMAIL    (read) - recipient passed to --emails
# Arguments:
#   $1 - alert name; used as the idempotency key within the check
#   $2 - alert type, passed to --type
#   $3 - threshold, passed to --threshold
#   $4 - comparison, passed to --comparison
#   $5 - optional period, passed to --period (default: 5m)
# Outputs:
#   "exists: ..." or "create: ..." progress line on stderr
# Returns:
#   0 when the alert already exists or was created; 2 when CHECK_ID is
#   unusable; otherwise the failing command's status.
#######################################
ensure_uptime_alert() {
        local name="$1" type="$2" threshold="$3" comparison="$4"
        local period="${5:-5m}"
        local existing
        # An empty or literal "null" id means the check lookup/create did not
        # yield a usable id; stop here rather than pass it on to doctl.
        if [[ -z "${CHECK_ID:-}" || "$CHECK_ID" == "null" ]]; then
                echo "fatal: no uptime check id; cannot ensure alert: $name" >&2
                return 2
        fi
        # Multiple same-named alerts are joined onto one line for the log.
        existing="$(doctl monitoring uptime alert list "$CHECK_ID" --output json \
                | jq -r --arg n "$name" \
                        '[.[]? | select(.name == $n) | .id] | join(",")')"
        if [[ -n "$existing" ]]; then
                echo "  exists: $name ($existing)" >&2
                return
        fi
        echo "  create: $name" >&2
        doctl monitoring uptime alert create "$CHECK_ID" \
                --name "$name" --type "$type" --period "$period" \
                --threshold "$threshold" --comparison "$comparison" \
                --emails "$EMAIL" >/dev/null
}
107
+
108
# Canonical alerts on the uptime check. Arguments per call:
# name, alert type, threshold, comparison.
printf '%s\n' "uptime alerts:" >&2
ensure_uptime_alert \
        "${UPTIME_NAME} down (all regions)" \
        down_global 0 greater_than
ensure_uptime_alert \
        "${UPTIME_NAME} SSL cert expiring < 14 days" \
        ssl_expiry 14 less_than
ensure_uptime_alert \
        "${UPTIME_NAME} latency p95 > 2s" \
        latency 2000 greater_than

printf '%s\n' "done." >&2