| 1 | #!/usr/bin/env bash |
| 2 | # SPDX-License-Identifier: AGPL-3.0-or-later |
| 3 | # |
| 4 | # Idempotent re-provisioner for the DigitalOcean monitoring alerts |
| 5 | # that today aren't otherwise codified anywhere — they live in DO |
| 6 | # account state. Run from the operator laptop after a fresh account |
| 7 | # (or as a sanity-restore check) to ensure the canonical set exists. |
| 8 | # |
| 9 | # What this script provisions: |
| 10 | # |
| 11 | # Droplet alerts (resource-utilization, scoped to shithub-app): |
| 12 | # - CPU > 80% for 10m |
| 13 | # - Memory > 90% for 10m |
| 14 | # - Disk > 80% for 10m |
| 15 | # - Load1 > 4 for 10m |
| 16 | # |
| 17 | # Uptime check (https://shithub.sh from us_east + eu_west) plus: |
| 18 | # - down_global → page on full outage (not single-region blip) |
| 19 | # - ssl_expiry < 14d → catch a Caddy auto-renew failure with buffer |
| 20 | # - latency p95 > 2s |
| 21 | # |
| 22 | # Idempotency: each create is gated by a list-and-match on description |
| 23 | # (for resource alerts) or name (for uptime check + uptime alerts). |
| 24 | # Re-running is a no-op once everything's in place. |
| 25 | # |
| 26 | # Prereqs: |
| 27 | # - doctl installed + authenticated |
| 28 | # - jq |
| 29 | # - The DO account email is verified (alerts only allow verified |
| 30 | # team-member emails; new emails are rejected with "email is not |
| 31 | # verified"). Confirm via `doctl account get`. |
| 32 | |
set -euo pipefail

# Verify the external tools before anything else: the EMAIL default below
# already shells out to doctl + jq, so checking later would let a missing
# tool abort the script with a bare "command not found" instead of this
# message.
if ! command -v doctl >/dev/null 2>&1 || ! command -v jq >/dev/null 2>&1; then
  echo "fatal: doctl + jq required" >&2
  exit 2
fi

# Operator-overridable settings; a non-empty environment value wins over
# the default.
DROPLET_NAME="${DROPLET_NAME:-shithub-app}"
UPTIME_TARGET="${UPTIME_TARGET:-https://shithub.sh}"
UPTIME_NAME="${UPTIME_NAME:-shithub.sh}"

# Alert recipient: ALERT_EMAIL when set, otherwise the email reported by
# `doctl account get`. `// empty` turns a missing/null field into an empty
# string so the literal text "null" is never used as an address.
EMAIL="${ALERT_EMAIL:-}"
if [[ -z "$EMAIL" ]]; then
  EMAIL="$(doctl account get --output json | jq -r '.email // empty')"
fi
if [[ -z "$EMAIL" ]]; then
  echo "fatal: could not determine alert email; set ALERT_EMAIL" >&2
  exit 2
fi

# Resolve the droplet name to its ID (first match wins). awk reads the whole
# listing instead of exiting early, so doctl never writes into a closed pipe
# and trips pipefail on a long listing.
DROPLET_ID="$(doctl compute droplet list --format ID,Name --no-header \
  | awk -v n="$DROPLET_NAME" '$2==n && !seen {print $1; seen=1}')"
if [[ -z "$DROPLET_ID" ]]; then
  echo "fatal: no droplet named $DROPLET_NAME" >&2
  exit 2
fi
echo "droplet: $DROPLET_NAME ($DROPLET_ID)" >&2
echo "email: $EMAIL" >&2
| 54 | # ── Droplet resource alerts ──────────────────────────────────────── |
#######################################
# Ensure a droplet resource alert policy with the given description exists.
# Lists the current alert policies and creates a new one only when none
# carries exactly this description, so a re-run creates nothing new.
# Globals:   DROPLET_ID (read), EMAIL (read)
# Arguments: $1 - description (also the idempotency key)
#            $2 - metric type, e.g. v1/insights/droplet/cpu
#            $3 - threshold value
#            $4 - evaluation window, e.g. 10m
#            $5 - comparison operator (optional; default GreaterThan)
# Outputs:   progress lines on stderr
# Returns:   0 if the alert already exists or was created; 2 if DROPLET_ID
#            or EMAIL is empty; otherwise the failing command's status
#######################################
ensure_resource_alert() {
  local desc="$1" type="$2" value="$3" window="$4"
  local compare="${5:-GreaterThan}"
  local existing

  # Refuse to talk to the API without a target droplet and a recipient.
  if [[ -z "${DROPLET_ID:-}" || -z "${EMAIL:-}" ]]; then
    echo "fatal: DROPLET_ID and EMAIL must be set before creating alerts" >&2
    return 2
  fi

  # `.[]?` yields nothing (instead of erroring) if the listing is JSON null.
  existing="$(doctl monitoring alert list --output json \
    | jq -r --arg d "$desc" '.[]? | select(.description == $d) | .uuid')" \
    || return
  if [[ -n "$existing" ]]; then
    echo " exists: $desc ($existing)" >&2
    return 0
  fi

  echo " create: $desc" >&2
  # The created policy is printed by doctl on stdout; only the status matters.
  doctl monitoring alert create \
    --type "$type" --compare "$compare" --value "$value" --window "$window" \
    --entities "$DROPLET_ID" --emails "$EMAIL" \
    --description "$desc" >/dev/null
}
| 70 | |
# Announce the section, then make sure each canonical droplet alert exists.
# Arguments per call: description, metric type, threshold, window.
printf '%s\n' "resource alerts:" >&2
ensure_resource_alert "${DROPLET_NAME} CPU > 80% for 10m" \
  "v1/insights/droplet/cpu" 80 10m
ensure_resource_alert "${DROPLET_NAME} memory > 90% for 10m" \
  "v1/insights/droplet/memory_utilization_percent" 90 10m
ensure_resource_alert "${DROPLET_NAME} disk > 80% for 10m" \
  "v1/insights/droplet/disk_utilization_percent" 80 10m
ensure_resource_alert "${DROPLET_NAME} load1 > 4 for 10m" \
  "v1/insights/droplet/load_1" 4 10m
| 76 | |
| 77 | # ── Uptime check ────────────────────────────────────────────────── |
echo "uptime check:" >&2
# Look the check up by name. Collecting matches and taking element 0 keeps
# CHECK_ID a single ID even if several checks share the name; `.[]?` and
# `// empty` turn a null listing or a missing id into an empty string.
CHECK_ID="$(doctl monitoring uptime list --output json \
  | jq -r --arg n "$UPTIME_NAME" \
      '[.[]? | select(.name == $n) | .id][0] // empty')"
if [[ -z "$CHECK_ID" ]]; then
  echo " create: $UPTIME_NAME → $UPTIME_TARGET" >&2
  # NOTE(review): --enabled is assumed to be a boolean flag in doctl, in which
  # case a space-separated "true" would be taken as an extra positional
  # argument. The `=` form is valid for either flag kind — confirm against
  # the doctl reference.
  # The jq filter accepts the created check as either a one-element array or
  # a bare object.
  CHECK_ID="$(doctl monitoring uptime create "$UPTIME_NAME" \
    --target "$UPTIME_TARGET" --type https \
    --regions us_east,eu_west --enabled=true \
    --output json \
    | jq -r '(if type == "array" then .[0] else . end) | .id // empty')"
  # Stop here rather than hand an empty ID to the uptime alert calls below.
  if [[ -z "$CHECK_ID" ]]; then
    echo "fatal: uptime check create returned no id" >&2
    exit 1
  fi
else
  echo " exists: $UPTIME_NAME ($CHECK_ID)" >&2
fi
| 90 | |
| 91 | # ── Uptime alerts ───────────────────────────────────────────────── |
#######################################
# Ensure an alert with the given name exists on the uptime check CHECK_ID.
# Lists the check's alerts and creates a new one only when none carries
# exactly this name, so a re-run creates nothing new.
# Globals:   CHECK_ID (read), EMAIL (read)
# Arguments: $1 - alert name (also the idempotency key)
#            $2 - alert type, e.g. down_global, ssl_expiry, latency
#            $3 - threshold
#            $4 - comparison, e.g. greater_than or less_than
#            $5 - evaluation period (optional; default 5m)
# Outputs:   progress lines on stderr
# Returns:   0 if the alert already exists or was created; 2 if CHECK_ID is
#            empty or the literal "null"; otherwise the failing command's
#            status
#######################################
ensure_uptime_alert() {
  local name="$1" type="$2" threshold="$3" comparison="$4"
  local period="${5:-5m}"
  local existing

  # `jq -r` prints the text "null" for a missing id, so treat that the same
  # as an empty CHECK_ID instead of sending it to the API.
  if [[ -z "${CHECK_ID:-}" || "$CHECK_ID" == "null" ]]; then
    echo "fatal: CHECK_ID is not set to a valid uptime check id" >&2
    return 2
  fi

  # `.[]?` yields nothing (instead of erroring) if the listing is JSON null.
  existing="$(doctl monitoring uptime alert list "$CHECK_ID" --output json \
    | jq -r --arg n "$name" '.[]? | select(.name == $n) | .id')" \
    || return
  if [[ -n "$existing" ]]; then
    echo " exists: $name ($existing)" >&2
    return 0
  fi

  echo " create: $name" >&2
  # The created alert is printed by doctl on stdout; only the status matters.
  doctl monitoring uptime alert create "$CHECK_ID" \
    --name "$name" --type "$type" --period "$period" \
    --threshold "$threshold" --comparison "$comparison" \
    --emails "$EMAIL" >/dev/null
}
| 107 | |
# Announce the section, then make sure each canonical uptime alert exists.
# Arguments per call: alert name, alert type, threshold, comparison.
printf '%s\n' "uptime alerts:" >&2
ensure_uptime_alert "${UPTIME_NAME} down (all regions)" \
  "down_global" 0 "greater_than"
ensure_uptime_alert "${UPTIME_NAME} SSL cert expiring < 14 days" \
  "ssl_expiry" 14 "less_than"
ensure_uptime_alert "${UPTIME_NAME} latency p95 > 2s" \
  "latency" 2000 "greater_than"

# Reached only when every step above succeeded (the script runs under set -e).
printf '%s\n' "done." >&2