@@ -0,0 +1,113 @@ |
| | 1 | +#!/usr/bin/env bash |
| | 2 | +# SPDX-License-Identifier: AGPL-3.0-or-later |
| | 3 | +# |
| | 4 | +# Idempotent re-provisioner for the DigitalOcean monitoring alerts |
| | 5 | +# that today aren't otherwise codified anywhere — they live in DO |
| | 6 | +# account state. Run from the operator laptop after a fresh account |
| | 7 | +# (or as a sanity-restore check) to ensure the canonical set exists. |
| | 8 | +# |
| | 9 | +# What this script provisions: |
| | 10 | +# |
| | 11 | +# Droplet alerts (resource-utilization, scoped to shithub-app): |
| | 12 | +# - CPU > 80% for 10m |
| | 13 | +# - Memory > 90% for 10m |
| | 14 | +# - Disk > 80% for 10m |
| | 15 | +# - Load1 > 4 for 10m |
| | 16 | +# |
| | 17 | +# Uptime check (https://shithub.sh from us_east + eu_west) plus: |
| | 18 | +# - down_global → page on full outage (not single-region blip) |
| | 19 | +# - ssl_expiry < 14d → catch a Caddy auto-renew failure with buffer |
| | 20 | +# - latency p95 > 2s |
| | 21 | +# |
| | 22 | +# Idempotency: each create is gated by a list-and-match on description |
| | 23 | +# (for resource alerts) or name (for uptime check + uptime alerts). |
| | 24 | +# Re-running is a no-op once everything's in place. |
| | 25 | +# |
| | 26 | +# Prereqs: |
| | 27 | +# - doctl installed + authenticated |
| | 28 | +# - jq |
| | 29 | +# - The DO account email is verified (alerts only allow verified |
| | 30 | +# team-member emails; new emails are rejected with "email is not |
| | 31 | +# verified"). Confirm via `doctl account get`. |
| | 32 | + |
set -euo pipefail

# Verify tooling first. The EMAIL default below shells out to doctl and jq,
# so under `set -e` a missing binary would otherwise abort the script with a
# bare "command not found" before this message could ever be printed.
if ! command -v doctl >/dev/null 2>&1 || ! command -v jq >/dev/null 2>&1; then
  echo "fatal: doctl + jq required" >&2
  exit 2
fi

# Operator-overridable settings: an environment variable wins over the default.
# ALERT_EMAIL, when unset or empty, falls back to the DO account email.
DROPLET_NAME="${DROPLET_NAME:-shithub-app}"
EMAIL="${ALERT_EMAIL:-$(doctl account get --output json | jq -r '.email // empty')}"
UPTIME_TARGET="${UPTIME_TARGET:-https://shithub.sh}"
UPTIME_NAME="${UPTIME_NAME:-shithub.sh}"

# `.email // empty` turns a missing/null field into an empty string instead of
# the literal "null", so a bad lookup is caught here rather than at create time.
if [[ -z "$EMAIL" ]]; then
  echo "fatal: could not determine alert email (set ALERT_EMAIL)" >&2
  exit 2
fi

# Resolve the droplet name to its ID. awk consumes the whole listing instead of
# exiting on the first hit: an early exit can close the pipe under doctl, and
# `pipefail` would then report the resulting SIGPIPE as a failure.
DROPLET_ID="$(doctl compute droplet list --format ID,Name --no-header \
  | awk -v n="$DROPLET_NAME" '$2 == n && !seen++ { print $1 }')"
if [[ -z "$DROPLET_ID" ]]; then
  echo "fatal: no droplet named $DROPLET_NAME" >&2
  exit 2
fi
echo "droplet: $DROPLET_NAME ($DROPLET_ID)" >&2
echo "email: $EMAIL" >&2
| | 53 | + |
# ── Droplet resource alerts ────────────────────────────────────────
#######################################
# Ensure a droplet resource alert policy exists, creating it when absent.
# Globals:
#   DROPLET_ID (read) - droplet the new policy is scoped to
#   EMAIL      (read) - recipient passed to --emails
# Arguments:
#   $1 - description; also the idempotency key
#   $2 - metric type, e.g. v1/insights/droplet/cpu
#   $3 - threshold; the policy is created with --compare GreaterThan
#   $4 - evaluation window, e.g. 10m
# Outputs:
#   Progress lines on stderr; doctl's create output is discarded.
# Returns:
#   0 if a matching policy exists or was created; otherwise the failing
#   command's status.
#######################################
ensure_resource_alert() {
  local desc="$1" type="$2" value="$3" window="$4"
  local existing
  # `(. // [])` makes a null listing behave like an empty one; a bare `.[]`
  # errors on null, which under `set -e`/`pipefail` would abort the whole run.
  # NOTE(review): matching is by description only, so a policy carrying this
  # description but scoped to a different droplet ID still counts as existing.
  existing="$(doctl monitoring alert list --output json \
    | jq -r --arg d "$desc" '(. // [])[] | select(.description == $d) | .uuid')"
  if [[ -n "$existing" ]]; then
    echo " exists: $desc ($existing)" >&2
    return
  fi
  echo " create: $desc" >&2
  doctl monitoring alert create \
    --type "$type" --compare GreaterThan --value "$value" --window "$window" \
    --entities "$DROPLET_ID" --emails "$EMAIL" \
    --description "$desc" >/dev/null
}
| | 70 | + |
# Canonical droplet alert set, one "label|metric|threshold|window" spec per
# entry. The label is prefixed with the droplet name to form the description
# that ensure_resource_alert uses as its idempotency key.
printf '%s\n' "resource alerts:" >&2
resource_alert_specs=(
  "CPU > 80% for 10m|v1/insights/droplet/cpu|80|10m"
  "memory > 90% for 10m|v1/insights/droplet/memory_utilization_percent|90|10m"
  "disk > 80% for 10m|v1/insights/droplet/disk_utilization_percent|80|10m"
  "load1 > 4 for 10m|v1/insights/droplet/load_1|4|10m"
)
for spec in "${resource_alert_specs[@]}"; do
  IFS='|' read -r label metric limit span <<<"$spec"
  ensure_resource_alert "$DROPLET_NAME $label" "$metric" "$limit" "$span"
done
| | 76 | + |
# ── Uptime check ──────────────────────────────────────────────────
# Find the uptime check named $UPTIME_NAME, creating it if absent, and leave
# its ID in CHECK_ID for the uptime alerts that follow.
echo "uptime check:" >&2
# `(. // [])` tolerates a null listing; collecting matches and taking the first
# keeps CHECK_ID a single ID even when several checks share the name.
CHECK_ID="$(doctl monitoring uptime list --output json \
  | jq -r --arg n "$UPTIME_NAME" \
      '[(. // [])[] | select(.name == $n) | .id] | first // empty')"
if [[ -z "$CHECK_ID" ]]; then
  echo " create: $UPTIME_NAME → $UPTIME_TARGET" >&2
  # Accept either an array or a bare object from `create`; `// empty` prevents
  # the literal string "null" from being stored as the ID.
  CHECK_ID="$(doctl monitoring uptime create "$UPTIME_NAME" \
    --target "$UPTIME_TARGET" --type https \
    --regions us_east,eu_west --enabled true \
    --output json \
    | jq -r '(if type == "array" then .[0] else . end) | .id // empty')"
else
  echo " exists: $UPTIME_NAME ($CHECK_ID)" >&2
fi

# Stop here rather than let later `uptime alert` calls run against no check.
if [[ -z "$CHECK_ID" ]]; then
  echo "fatal: could not determine uptime check id for $UPTIME_NAME" >&2
  exit 2
fi
| | 90 | + |
# ── Uptime alerts ─────────────────────────────────────────────────
#######################################
# Ensure an alert exists on the uptime check, creating it when absent.
# Globals:
#   CHECK_ID (read) - uptime check the alert belongs to
#   EMAIL    (read) - recipient passed to --emails
# Arguments:
#   $1 - alert name; also the idempotency key
#   $2 - alert type, e.g. down_global, ssl_expiry, latency
#   $3 - threshold
#   $4 - comparison, e.g. greater_than or less_than
#   $5 - optional period passed to --period (default: 5m)
# Outputs:
#   Progress lines on stderr; doctl's create output is discarded.
# Returns:
#   0 if a matching alert exists or was created; 2 if CHECK_ID is unusable;
#   otherwise the failing command's status.
#######################################
ensure_uptime_alert() {
  local name="$1" type="$2" threshold="$3" comparison="$4" period="${5:-5m}"
  local existing
  # An empty CHECK_ID, or the literal "null" that `jq -r` prints for a missing
  # id, would otherwise be sent to the API as if it were a real check.
  if [[ -z "${CHECK_ID:-}" || "$CHECK_ID" == "null" ]]; then
    echo "fatal: CHECK_ID is not set; cannot manage uptime alert: $name" >&2
    return 2
  fi
  # `(. // [])` makes a null listing behave like an empty one; a bare `.[]`
  # errors on null, which under `set -e`/`pipefail` would abort the whole run.
  existing="$(doctl monitoring uptime alert list "$CHECK_ID" --output json \
    | jq -r --arg n "$name" '(. // [])[] | select(.name == $n) | .id')"
  if [[ -n "$existing" ]]; then
    echo " exists: $name ($existing)" >&2
    return
  fi
  echo " create: $name" >&2
  doctl monitoring uptime alert create "$CHECK_ID" \
    --name "$name" --type "$type" --period "$period" \
    --threshold "$threshold" --comparison "$comparison" \
    --emails "$EMAIL" >/dev/null
}
| | 107 | + |
# Canonical uptime alert set, one "label|type|threshold|comparison" spec per
# entry. The label is prefixed with the uptime check name to form the alert
# name that ensure_uptime_alert uses as its idempotency key.
printf '%s\n' "uptime alerts:" >&2
uptime_alert_specs=(
  "down (all regions)|down_global|0|greater_than"
  "SSL cert expiring < 14 days|ssl_expiry|14|less_than"
  "latency p95 > 2s|latency|2000|greater_than"
)
for spec in "${uptime_alert_specs[@]}"; do
  IFS='|' read -r label kind limit cmp <<<"$spec"
  ensure_uptime_alert "$UPTIME_NAME $label" "$kind" "$limit" "$cmp"
done

printf '%s\n' "done." >&2