YAML · 2646 bytes Raw Blame History
1 # Alert rules for shithubd. Keep these short and signal-heavy —
2 # every alert here should map to a runbook in docs/internal/
3 # runbooks/incidents.md by name.
4
groups:
# Availability: page when a scrape target of a core job reports up == 0.
# Rules in this group are evaluated every 30s.
- name: shithubd-availability
  interval: 30s
  rules:
  # A shithubd-web target has had up == 0 for 2 minutes.
  - alert: ShithubdWebDown
    expr: up{job="shithubd-web"} == 0
    for: 2m
    labels:
      severity: page
    annotations:
      summary: "shithubd web {{ $labels.instance }} is down"
      runbook: "runbooks/incidents.md#shithubd-down"

  # A shithubd-worker target has had up == 0 for 5 minutes.
  - alert: ShithubdWorkerDown
    expr: up{job="shithubd-worker"} == 0
    for: 5m
    labels:
      severity: page
    annotations:
      summary: "shithubd worker is down"
      runbook: "runbooks/incidents.md#worker-down"

  # A postgres target has had up == 0 for 1 minute.
  - alert: PostgresDown
    expr: up{job="postgres"} == 0
    for: 1m
    labels:
      severity: page
    annotations:
      summary: "postgres is down — site cannot serve writes"
      runbook: "runbooks/incidents.md#postgres-down"

# Latency / load: ticket-level signals, evaluated every 30s.
# NOTE(review): neither rule below carries a `runbook` annotation, although
# the file header asks for every alert to map to a runbook — confirm whether
# matching runbook sections exist and link them.
- name: shithubd-latency
  interval: 30s
  rules:
  # Per-route 95th percentile of http_request_duration_seconds, computed
  # from 5m bucket rates, has been above 1.5 for 10 minutes.
  - alert: HighRequestLatencyP95
    expr: |
      histogram_quantile(0.95,
        sum(rate(http_request_duration_seconds_bucket[5m])) by (route, le)
      ) > 1.5
    for: 10m
    labels:
      severity: ticket
    annotations:
      summary: "p95 latency on {{ $labels.route }} > 1.5s"

  # Total 5m rate of pg_stat_statements calls has been above 5000/s for
  # 10 minutes.
  - alert: HighDBQueryRate
    expr: |
      sum(rate(pg_stat_statements_calls_total[5m])) > 5000
    for: 10m
    labels:
      severity: ticket
    annotations:
      summary: "DB call rate sustained > 5k/s — possible N+1 regression"

# Background jobs: queue backlog and outbound webhook delivery health.
# Rules in this group are evaluated every 30s.
- name: shithubd-jobs
  interval: 30s
  rules:
  # shithubd_job_queue_depth has stayed above 5000 for 15 minutes.
  - alert: JobBacklogGrowing
    expr: shithubd_job_queue_depth > 5000
    for: 15m
    labels:
      severity: ticket
    annotations:
      summary: "job queue depth > 5k — worker cannot keep up"
      runbook: "runbooks/incidents.md#job-backlog"

  # More than half of all webhook deliveries (15m rate) have failed, for
  # 30 minutes.
  # Both sides are aggregated with sum() so failures are divided by ALL
  # deliveries. Without aggregation the `/` operator matches series on
  # identical label sets, so each result="failure" series would only be
  # divided by itself (ratio 1) and the alert would fire on any sustained
  # failure regardless of how many deliveries succeed.
  # NOTE(review): no `runbook` annotation here — link one once the matching
  # runbook section exists.
  - alert: WebhookDeliveryFailing
    expr: |
      sum(rate(shithubd_webhook_deliveries_total{result="failure"}[15m]))
      /
      sum(rate(shithubd_webhook_deliveries_total[15m])) > 0.5
    for: 30m
    labels:
      severity: ticket
    annotations:
      summary: "webhook failure rate > 50% sustained"

# Backups: evaluated every 5 minutes.
- name: shithubd-backups
  interval: 5m
  rules:
  # Pages as soon as (for: 0m) the gap between now and
  # shithubd_backup_last_success_seconds exceeds 60 * 60 * 30 s (30 hours).
  # NOTE(review): presumably the metric is the Unix timestamp of the last
  # successful backup — confirm against the exporter.
  # NOTE(review): if the metric is absent the expression returns nothing
  # and no alert fires; confirm whether missing data should page as well.
  - alert: BackupOverdue
    expr: time() - shithubd_backup_last_success_seconds > 60 * 60 * 30
    for: 0m
    labels:
      severity: page
    annotations:
      summary: "no successful backup in > 30h"
      runbook: "runbooks/backups.md#missed-backup"