tenseleyflow/shithub / ae13c49

Browse files

S37: monitoring — prometheus config + alert rules

Authored by espadonne
SHA
ae13c492e23b8ea8a5d1fdae1141d15bef2b6648
Parents
1e8c8b7
Tree
c2732bb

2 changed files

StatusFile+-
A deploy/monitoring/prometheus/prometheus.yml 62 0
A deploy/monitoring/prometheus/rules.yml 84 0
deploy/monitoring/prometheus/prometheus.ymladded
@@ -0,0 +1,62 @@
---
# Prometheus config for the shithub monitoring host. All scrape
# targets are reached over the WireGuard mesh (10.50.0.0/24) — never
# the public interface.
#
# Cardinality budget: per the S36 perf pass, we keep label cardinality
# bounded by NOT labelling on user_id, repo_id, or path. Route metrics
# bucket by route name (set by the chi router); HTTP status is a
# small enum.

global:
  scrape_interval: 15s
  evaluation_interval: 30s
  external_labels:
    cluster: shithub-prod

# Rule files live beside this config on the monitoring host.
rule_files:
  - "/etc/prometheus/rules/*.yml"

# Alertmanager runs on the monitoring box itself (mesh address).
alerting:
  alertmanagers:
    - static_configs:
        - targets: ["10.50.0.10:9093"]

scrape_configs:
  # Application web tier — two replicas behind Caddy.
  - job_name: shithubd-web
    metrics_path: /metrics
    static_configs:
      - targets:
          - "10.50.0.20:8080"
          - "10.50.0.21:8080"
        labels:
          service: shithubd
          tier: web

  # Background job worker — single instance on the primary app host.
  - job_name: shithubd-worker
    metrics_path: /metrics
    static_configs:
      - targets: ["10.50.0.20:8081"]
        labels:
          service: shithubd
          tier: worker

  # postgres_exporter on the DB host (default port 9187).
  - job_name: postgres
    static_configs:
      - targets: ["10.50.0.30:9187"]
        labels:
          service: postgres

  # Caddy admin/metrics endpoint on both web hosts.
  - job_name: caddy
    static_configs:
      - targets: ["10.50.0.20:2019", "10.50.0.21:2019"]
        labels:
          service: caddy

  # node_exporter on every host in the mesh.
  - job_name: node
    static_configs:
      - targets:
          - "10.50.0.20:9100"
          - "10.50.0.21:9100"
          - "10.50.0.30:9100"
        labels:
          service: node-exporter
deploy/monitoring/prometheus/rules.ymladded
@@ -0,0 +1,84 @@
---
# Alert rules for shithubd. Keep these short and signal-heavy —
# every alert here should name its runbook under
# docs/internal/runbooks/.

groups:
  - name: shithubd-availability
    interval: 30s
    rules:
      - alert: ShithubdWebDown
        expr: up{job="shithubd-web"} == 0
        for: 2m
        labels:
          severity: page
        annotations:
          summary: "shithubd web {{ $labels.instance }} is down"
          runbook: "runbooks/incidents.md#shithubd-down"

      - alert: ShithubdWorkerDown
        expr: up{job="shithubd-worker"} == 0
        for: 5m
        labels:
          severity: page
        annotations:
          summary: "shithubd worker is down"
          runbook: "runbooks/incidents.md#worker-down"

      - alert: PostgresDown
        expr: up{job="postgres"} == 0
        for: 1m
        labels:
          severity: page
        annotations:
          summary: "postgres is down — site cannot serve writes"
          runbook: "runbooks/incidents.md#postgres-down"

  - name: shithubd-latency
    interval: 30s
    rules:
      - alert: HighRequestLatencyP95
        expr: |
          histogram_quantile(0.95,
            sum(rate(http_request_duration_seconds_bucket[5m])) by (route, le)
          ) > 1.5
        for: 10m
        labels:
          severity: ticket
        annotations:
          summary: "p95 latency on {{ $labels.route }} > 1.5s"
          # NOTE(review): anchor assumed — confirm it exists in incidents.md.
          runbook: "runbooks/incidents.md#high-latency"

      - alert: HighDBQueryRate
        expr: |
          sum(rate(pg_stat_statements_calls_total[5m])) > 5000
        for: 10m
        labels:
          severity: ticket
        annotations:
          summary: "DB call rate sustained > 5k/s — possible N+1 regression"
          # NOTE(review): anchor assumed — confirm it exists in incidents.md.
          runbook: "runbooks/incidents.md#high-db-query-rate"

  - name: shithubd-jobs
    interval: 30s
    rules:
      - alert: JobBacklogGrowing
        expr: shithubd_job_queue_depth > 5000
        for: 15m
        labels:
          severity: ticket
        annotations:
          summary: "job queue depth > 5k — worker cannot keep up"
          runbook: "runbooks/incidents.md#job-backlog"

      - alert: WebhookDeliveryFailing
        # sum() both sides to aggregate away the result label before
        # dividing. With a plain per-series division, default one-to-one
        # vector matching pairs the result="failure" numerator only with
        # the result="failure" denominator series, so the ratio is 1
        # wherever it is nonzero and the > 0.5 threshold fires on ANY
        # failure, not on a 50% failure rate.
        expr: |
          sum(rate(shithubd_webhook_deliveries_total{result="failure"}[15m]))
            /
          sum(rate(shithubd_webhook_deliveries_total[15m])) > 0.5
        for: 30m
        labels:
          severity: ticket
        annotations:
          summary: "webhook failure rate > 50% sustained"
          # NOTE(review): anchor assumed — confirm it exists in incidents.md.
          runbook: "runbooks/incidents.md#webhook-failures"

  - name: shithubd-backups
    interval: 5m
    rules:
      - alert: BackupOverdue
        # 60 * 60 * 30 seconds = 30 hours: one daily backup plus slack.
        expr: time() - shithubd_backup_last_success_seconds > 60 * 60 * 30
        for: 0m
        labels:
          severity: page
        annotations:
          summary: "no successful backup in > 30h"
          runbook: "runbooks/backups.md#missed-backup"