S37: monitoring — prometheus config + alert rules
- SHA: ae13c492e23b8ea8a5d1fdae1141d15bef2b6648
- Parents: 1e8c8b7
- Tree: c2732bb

| Status | File | + | - |
|---|---|---|---|
| A | deploy/monitoring/prometheus/prometheus.yml | 62 | 0 |
| A | deploy/monitoring/prometheus/rules.yml | 84 | 0 |
deploy/monitoring/prometheus/prometheus.yml (added) @@ -0,0 +1,62 @@

```yaml
# Prometheus config for the shithub monitoring host. All scrape
# targets are reached over the WireGuard mesh (10.50.0.0/24) — never
# the public interface.
#
# Cardinality budget: per the S36 perf pass, we keep label cardinality
# bounded by NOT labelling on user_id, repo_id, or path. Route metrics
# bucket by route name (set by the chi router); HTTP status is a
# small enum.

global:
  scrape_interval: 15s
  evaluation_interval: 30s
  external_labels:
    cluster: shithub-prod

rule_files:
  - "/etc/prometheus/rules/*.yml"

alerting:
  alertmanagers:
    - static_configs:
        - targets: ["10.50.0.10:9093"]

scrape_configs:
  - job_name: shithubd-web
    metrics_path: /metrics
    static_configs:
      - targets:
          - "10.50.0.20:8080"
          - "10.50.0.21:8080"
        labels:
          service: shithubd
          tier: web

  - job_name: shithubd-worker
    metrics_path: /metrics
    static_configs:
      - targets: ["10.50.0.20:8081"]
        labels:
          service: shithubd
          tier: worker

  - job_name: postgres
    static_configs:
      - targets: ["10.50.0.30:9187"]
        labels:
          service: postgres

  - job_name: caddy
    static_configs:
      - targets: ["10.50.0.20:2019", "10.50.0.21:2019"]
        labels:
          service: caddy

  - job_name: node
    static_configs:
      - targets:
          - "10.50.0.20:9100"
          - "10.50.0.21:9100"
          - "10.50.0.30:9100"
        labels:
          service: node-exporter
```
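The cardinality comment above leans on route-pattern labels set by the chi router. As a rough illustration of what that looks like on the app side, the sketch below records the `http_request_duration_seconds` histogram that the rules file below queries by `(route, le)`. It assumes shithubd is a Go service on chi v5 with the official prometheus client; only the metric and label names come from this commit, the middleware itself is illustrative, not shithubd's actual code.

```go
// Illustrative route-labelled HTTP metrics for the web tier.
// Assumption: chi v5 + github.com/prometheus/client_golang; only the
// metric/label names are taken from this commit's config.
package main

import (
	"log"
	"net/http"
	"strconv"
	"time"

	"github.com/go-chi/chi/v5"
	"github.com/go-chi/chi/v5/middleware"
	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/promauto"
	"github.com/prometheus/client_golang/prometheus/promhttp"
)

// The histogram HighRequestLatencyP95 in rules.yml aggregates by (route, le).
var httpDuration = promauto.NewHistogramVec(
	prometheus.HistogramOpts{
		Name:    "http_request_duration_seconds",
		Help:    "HTTP request latency by chi route pattern.",
		Buckets: prometheus.DefBuckets,
	},
	// route is the chi pattern and status a small enum; never user_id,
	// repo_id, or raw path, per the cardinality budget.
	[]string{"route", "status"},
)

func instrument(next http.Handler) http.Handler {
	return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		ww := middleware.NewWrapResponseWriter(w, r.ProtoMajor)
		start := time.Now()
		next.ServeHTTP(ww, r)

		// RoutePattern yields "/repos/{owner}/{name}", not the raw URL,
		// so every hit on a route lands in the same series.
		route := chi.RouteContext(r.Context()).RoutePattern()
		if route == "" {
			route = "unmatched" // collapse 404 scans into one series
		}
		httpDuration.WithLabelValues(route, strconv.Itoa(ww.Status())).
			Observe(time.Since(start).Seconds())
	})
}

func main() {
	r := chi.NewRouter()
	r.Use(instrument)
	r.Handle("/metrics", promhttp.Handler()) // scraped at :8080 per prometheus.yml
	log.Fatal(http.ListenAndServe(":8080", r))
}
```

The `unmatched` fallback matters: without it, random 404 scans would mint one series per probed path, which is exactly the cardinality failure mode the config comment warns about.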
deploy/monitoring/prometheus/rules.yml (added) @@ -0,0 +1,84 @@

```yaml
# Alert rules for shithubd. Keep these short and signal-heavy —
# every alert here should map to a runbook in docs/internal/
# runbooks/incidents.md by name.

groups:
  - name: shithubd-availability
    interval: 30s
    rules:
      - alert: ShithubdWebDown
        expr: up{job="shithubd-web"} == 0
        for: 2m
        labels: {severity: page}
        annotations:
          summary: "shithubd web {{ $labels.instance }} is down"
          runbook: "runbooks/incidents.md#shithubd-down"

      - alert: ShithubdWorkerDown
        expr: up{job="shithubd-worker"} == 0
        for: 5m
        labels: {severity: page}
        annotations:
          summary: "shithubd worker is down"
          runbook: "runbooks/incidents.md#worker-down"

      - alert: PostgresDown
        expr: up{job="postgres"} == 0
        for: 1m
        labels: {severity: page}
        annotations:
          summary: "postgres is down — site cannot serve writes"
          runbook: "runbooks/incidents.md#postgres-down"

  - name: shithubd-latency
    interval: 30s
    rules:
      - alert: HighRequestLatencyP95
        expr: |
          histogram_quantile(0.95,
            sum(rate(http_request_duration_seconds_bucket[5m])) by (route, le)
          ) > 1.5
        for: 10m
        labels: {severity: ticket}
        annotations:
          summary: "p95 latency on {{ $labels.route }} > 1.5s"

      - alert: HighDBQueryRate
        expr: |
          sum(rate(pg_stat_statements_calls_total[5m])) > 5000
        for: 10m
        labels: {severity: ticket}
        annotations:
          summary: "DB call rate sustained > 5k/s — possible N+1 regression"

  - name: shithubd-jobs
    interval: 30s
    rules:
      - alert: JobBacklogGrowing
        expr: shithubd_job_queue_depth > 5000
        for: 15m
        labels: {severity: ticket}
        annotations:
          summary: "job queue depth > 5k — worker cannot keep up"
          runbook: "runbooks/incidents.md#job-backlog"

      - alert: WebhookDeliveryFailing
        expr: |
          sum(rate(shithubd_webhook_deliveries_total{result="failure"}[15m]))
          /
          sum(rate(shithubd_webhook_deliveries_total[15m])) > 0.5
        for: 30m
        labels: {severity: ticket}
        annotations:
          summary: "webhook failure rate > 50% sustained"

  - name: shithubd-backups
    interval: 5m
    rules:
      - alert: BackupOverdue
        expr: time() - shithubd_backup_last_success_seconds > 60 * 60 * 30
        for: 0m
        labels: {severity: page}
        annotations:
          summary: "no successful backup in > 30h"
          runbook: "runbooks/backups.md#missed-backup"
```
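For reference, here is a hedged sketch of the app-side instrumentation the jobs and backups groups assume exists. The three metric names are taken straight from the rule expressions above; `RecordDelivery` and `MarkBackupSuccess` are hypothetical helpers, named only to show where the writes would happen, and are not claimed to match shithubd's real code.

```go
// Illustrative worker-tier metrics. Metric names come from rules.yml;
// the helper functions are hypothetical.
package metrics

import (
	"time"

	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/promauto"
)

var (
	// Gauge watched by JobBacklogGrowing (> 5000 for 15m opens a ticket).
	JobQueueDepth = promauto.NewGauge(prometheus.GaugeOpts{
		Name: "shithubd_job_queue_depth",
		Help: "Jobs currently waiting in the queue.",
	})

	// Counter behind WebhookDeliveryFailing; result stays a two-value
	// enum (success|failure) so the failure-ratio query divides cleanly.
	WebhookDeliveries = promauto.NewCounterVec(prometheus.CounterOpts{
		Name: "shithubd_webhook_deliveries_total",
		Help: "Webhook delivery attempts by result.",
	}, []string{"result"})

	// Gauge behind BackupOverdue: time() minus this value > 30h pages.
	BackupLastSuccess = promauto.NewGauge(prometheus.GaugeOpts{
		Name: "shithubd_backup_last_success_seconds",
		Help: "Unix timestamp of the last successful backup.",
	})
)

// RecordDelivery counts one webhook delivery attempt (hypothetical helper).
func RecordDelivery(ok bool) {
	result := "failure"
	if ok {
		result = "success"
	}
	WebhookDeliveries.WithLabelValues(result).Inc()
}

// MarkBackupSuccess stamps the gauge after a verified backup upload
// (hypothetical helper).
func MarkBackupSuccess() {
	BackupLastSuccess.Set(float64(time.Now().Unix()))
}
```

Both files are cheap to validate before they land on the host: `promtool check config deploy/monitoring/prometheus/prometheus.yml` and `promtool check rules deploy/monitoring/prometheus/rules.yml` catch YAML and PromQL syntax errors in CI.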