Bash · 5101 bytes Raw Blame History
1 #!/usr/bin/env bash
2 # SPDX-License-Identifier: AGPL-3.0-or-later
3 #
4 # Idempotent re-provisioner for the DigitalOcean monitoring alerts
5 # that today aren't otherwise codified anywhere — they live in DO
6 # account state. Run from the operator laptop after a fresh account
7 # (or as a sanity-restore check) to ensure the canonical set exists.
8 #
9 # What this script provisions:
10 #
11 # Droplet alerts (resource-utilization, scoped to shithub-app):
12 # - CPU > 80% for 10m
13 # - Memory > 90% for 10m
14 # - Disk > 80% for 10m
15 # - Load1 > 4 for 10m
16 #
17 # Uptime check (https://shithub.sh from us_east + eu_west) plus:
18 # - down_global → page on full outage (not single-region blip)
19 # - ssl_expiry < 14d → catch a Caddy auto-renew failure with buffer
20 # - latency p95 > 2s
21 #
22 # Idempotency: each create is gated by a list-and-match on description
23 # (for resource alerts) or name (for uptime check + uptime alerts).
24 # Re-running is a no-op once everything's in place.
25 #
26 # Prereqs:
27 # - doctl installed + authenticated
28 # - jq
29 # - The DO account email is verified (alerts only allow verified
30 # team-member emails; new emails are rejected with "email is not
31 # verified"). Confirm via `doctl account get`.
32
33 set -euo pipefail
34
35 DROPLET_NAME="${DROPLET_NAME:-shithub-app}"
36 EMAIL="${ALERT_EMAIL:-$(doctl account get --output json | jq -r '.email')}"
37 UPTIME_TARGET="${UPTIME_TARGET:-https://shithub.sh}"
38 UPTIME_NAME="${UPTIME_NAME:-shithub.sh}"
39
40 if ! command -v doctl >/dev/null 2>&1 || ! command -v jq >/dev/null 2>&1; then
41 echo "fatal: doctl + jq required" >&2
42 exit 2
43 fi
44
45 DROPLET_ID="$(doctl compute droplet list --format ID,Name --no-header \
46 | awk -v n="$DROPLET_NAME" '$2==n {print $1; exit}')"
47 if [[ -z "$DROPLET_ID" ]]; then
48 echo "fatal: no droplet named $DROPLET_NAME" >&2
49 exit 2
50 fi
51 echo "droplet: $DROPLET_NAME ($DROPLET_ID)" >&2
52 echo "email: $EMAIL" >&2
53
54 # ── Droplet resource alerts ────────────────────────────────────────
55 ensure_resource_alert() {
56 local desc="$1" type="$2" value="$3" window="$4"
57 local existing
58 existing="$(doctl monitoring alert list --output json \
59 | jq -r --arg d "$desc" '.[] | select(.description == $d) | .uuid')"
60 if [[ -n "$existing" ]]; then
61 echo " exists: $desc ($existing)" >&2
62 return
63 fi
64 echo " create: $desc" >&2
65 doctl monitoring alert create \
66 --type "$type" --compare GreaterThan --value "$value" --window "$window" \
67 --entities "$DROPLET_ID" --emails "$EMAIL" \
68 --description "$desc" >/dev/null
69 }
70
71 echo "resource alerts:" >&2
72 ensure_resource_alert "$DROPLET_NAME CPU > 80% for 10m" v1/insights/droplet/cpu 80 10m
73 ensure_resource_alert "$DROPLET_NAME memory > 90% for 10m" v1/insights/droplet/memory_utilization_percent 90 10m
74 ensure_resource_alert "$DROPLET_NAME disk > 80% for 10m" v1/insights/droplet/disk_utilization_percent 80 10m
75 ensure_resource_alert "$DROPLET_NAME load1 > 4 for 10m" v1/insights/droplet/load_1 4 10m
76
77 # ── Uptime check ──────────────────────────────────────────────────
78 echo "uptime check:" >&2
79 CHECK_ID="$(doctl monitoring uptime list --output json \
80 | jq -r --arg n "$UPTIME_NAME" '.[] | select(.name == $n) | .id')"
81 if [[ -z "$CHECK_ID" ]]; then
82 echo " create: $UPTIME_NAME$UPTIME_TARGET" >&2
83 CHECK_ID="$(doctl monitoring uptime create "$UPTIME_NAME" \
84 --target "$UPTIME_TARGET" --type https \
85 --regions us_east,eu_west --enabled true \
86 --output json | jq -r '.[0].id')"
87 else
88 echo " exists: $UPTIME_NAME ($CHECK_ID)" >&2
89 fi
90
91 # ── Uptime alerts ─────────────────────────────────────────────────
92 ensure_uptime_alert() {
93 local name="$1" type="$2" threshold="$3" comparison="$4"
94 local existing
95 existing="$(doctl monitoring uptime alert list "$CHECK_ID" --output json \
96 | jq -r --arg n "$name" '.[] | select(.name == $n) | .id')"
97 if [[ -n "$existing" ]]; then
98 echo " exists: $name ($existing)" >&2
99 return
100 fi
101 echo " create: $name" >&2
102 doctl monitoring uptime alert create "$CHECK_ID" \
103 --name "$name" --type "$type" --period 5m \
104 --threshold "$threshold" --comparison "$comparison" \
105 --emails "$EMAIL" >/dev/null
106 }
107
108 echo "uptime alerts:" >&2
109 ensure_uptime_alert "$UPTIME_NAME down (all regions)" down_global 0 greater_than
110 ensure_uptime_alert "$UPTIME_NAME SSL cert expiring < 14 days" ssl_expiry 14 less_than
111 ensure_uptime_alert "$UPTIME_NAME latency p95 > 2s" latency 2000 greater_than
112
113 echo "done." >&2