YAML · 4459 bytes Raw Blame History
# Alert rules for shithubd. Keep these short and signal-heavy —
# every alert here should map, by name, to a runbook entry in
# docs/internal/runbooks/incidents.md.
4
5 groups:
# Availability: page-severity alerts driven by the `up` series of each job.
# NOTE(review): `up{...} == 0` only matches targets Prometheus is still
# scraping. A target that drops out of service discovery produces no `up`
# series, so none of these rules would fire for it; consider an additional
# absent(up{job="..."}) rule if that case matters here.
- name: shithubd-availability
  interval: 30s
  rules:
  # Web process: any instance reporting up == 0 for 2 minutes.
  - alert: ShithubdWebDown
    expr: 'up{job="shithubd-web"} == 0'
    for: 2m
    labels:
      severity: page
    annotations:
      summary: 'shithubd web {{ $labels.instance }} is down'
      runbook: 'runbooks/incidents.md#shithubd-down'

  # Worker process: longer hold (5 minutes) than the web alert.
  - alert: ShithubdWorkerDown
    expr: 'up{job="shithubd-worker"} == 0'
    for: 5m
    labels:
      severity: page
    annotations:
      summary: 'shithubd worker is down'
      runbook: 'runbooks/incidents.md#worker-down'

  # Postgres: shortest hold (1 minute) of the three.
  - alert: PostgresDown
    expr: 'up{job="postgres"} == 0'
    for: 1m
    labels:
      severity: page
    annotations:
      summary: 'postgres is down — site cannot serve writes'
      runbook: 'runbooks/incidents.md#postgres-down'
32
# Latency and load: ticket-severity signals, each held for 10 minutes.
# TODO(review): neither alert in this group carries a `runbook` annotation,
# although the file header asks for one on every alert.
- name: shithubd-latency
  interval: 30s
  rules:
  # p95 of http_request_duration_seconds per route, from 5m bucket rates;
  # fires when a route's p95 stays above 1.5 seconds.
  - alert: HighRequestLatencyP95
    expr: |
      histogram_quantile(0.95,
        sum(rate(http_request_duration_seconds_bucket[5m])) by (route, le)
      ) > 1.5
    for: 10m
    labels:
      severity: ticket
    annotations:
      summary: 'p95 latency on {{ $labels.route }} > 1.5s'

  # Total per-second rate of pg_stat_statements calls across all series.
  - alert: HighDBQueryRate
    expr: |
      sum(rate(pg_stat_statements_calls_total[5m])) > 5000
    for: 10m
    labels:
      severity: ticket
    annotations:
      summary: 'DB call rate sustained > 5k/s — possible N+1 regression'
53
# Background jobs: queue depth and webhook delivery health (ticket severity).
- name: shithubd-jobs
  interval: 30s
  rules:
  # Queue depth above 5000 for 15 minutes.
  - alert: JobBacklogGrowing
    expr: shithubd_job_queue_depth > 5000
    for: 15m
    labels:
      severity: ticket
    annotations:
      summary: 'job queue depth > 5k — worker cannot keep up'
      runbook: 'runbooks/incidents.md#job-backlog'

  # Failure ratio = failed deliveries / all deliveries over 15m.
  # Both sides are aggregated with sum() on purpose: a bare
  # `rate(x{result="failure"}) / rate(x)` is matched one-to-one on identical
  # label sets, so each failure series would only ever be divided by itself
  # (ratio 1) and the success rate would be ignored entirely.
  # With no deliveries at all the ratio is empty or NaN, which does not fire.
  # TODO(review): no `runbook` annotation yet, although the file header asks
  # for one on every alert.
  - alert: WebhookDeliveryFailing
    expr: |
      sum(rate(shithubd_webhook_deliveries_total{result="failure"}[15m]))
      /
      sum(rate(shithubd_webhook_deliveries_total[15m])) > 0.5
    for: 30m
    labels:
      severity: ticket
    annotations:
      summary: 'webhook failure rate > 50% sustained'
74
# Actions: runner liveness, queue depth, run-duration regression, log scrubbing.
# NOTE(review): metrics in this group use the `shithub_` prefix while the
# other groups use `shithubd_` — confirm against the exporter.
- name: shithubd-actions
  interval: 30s
  rules:
  # Heartbeat age above 60s for 5 minutes, ignoring runners whose status
  # label is "offline".
  - alert: ActionsRunnerHeartbeatStale
    expr: 'shithub_actions_runner_heartbeat_age_seconds{status!="offline"} > 60'
    for: 5m
    labels:
      severity: page
    annotations:
      summary: 'Actions runner {{ $labels.runner }} heartbeat stale for > 60s'
      runbook: 'runbooks/incidents.md#actions-runner-heartbeat-stale'

  # More than 100 queued jobs for 10 minutes.
  - alert: ActionsQueueDepthHigh
    expr: 'shithub_actions_queue_depth{resource="jobs"} > 100'
    for: 10m
    labels:
      severity: ticket
    annotations:
      summary: 'Actions queued jobs > 100 for 10m'
      runbook: 'runbooks/incidents.md#actions-queue-depth-high'

  # Current p99 run duration (30m window) compared with 1.5x the same
  # quantile computed over the window 24 hours earlier.
  - alert: ActionsRunDurationP99Regressed
    expr: |
      histogram_quantile(0.99,
        sum(rate(shithub_actions_run_duration_seconds_bucket[30m])) by (le)
      )
      >
      1.5 *
      histogram_quantile(0.99,
        sum(rate(shithub_actions_run_duration_seconds_bucket[30m] offset 24h)) by (le)
      )
    for: 15m
    labels:
      severity: ticket
    annotations:
      summary: 'Actions run duration p99 regressed by > 50% versus 24h ago'
      runbook: 'runbooks/incidents.md#actions-run-duration-p99-regressed'

  # Log bytes are flowing (> 1048576 bytes/s) but the replacement counter is
  # flat. `or vector(0)` supplies a zero when the replacement counter has no
  # series at all; without it the right-hand side would be empty, `and` would
  # return nothing, and the alert could never fire in that case.
  - alert: ActionsLogScrubberPossiblyMissing
    expr: |
      sum(rate(shithub_actions_log_chunk_bytes_total{location="server"}[15m])) > 1048576
      and
      (
        sum(rate(shithub_actions_log_scrub_replacements_total{location="server"}[15m]))
        or vector(0)
      ) == 0
    for: 30m
    labels:
      severity: ticket
    annotations:
      summary: 'Actions logs are flowing but no server-side secret masks have matched'
      runbook: 'runbooks/incidents.md#actions-log-scrubber-possibly-missing'
120
# Backups: evaluated every 5 minutes; pages as soon as the condition holds
# because the hold time is 0m.
- name: shithubd-backups
  interval: 5m
  rules:
  # 60 * 60 * 30 seconds = 30 hours.
  # NOTE(review): presumably shithubd_backup_last_success_seconds is a Unix
  # timestamp of the last successful backup — confirm. If that series is
  # missing, the expression returns nothing and this alert stays silent.
  - alert: BackupOverdue
    expr: 'time() - shithubd_backup_last_success_seconds > 60 * 60 * 30'
    for: 0m
    labels:
      severity: page
    annotations:
      summary: 'no successful backup in > 30h'
      runbook: 'runbooks/backups.md#missed-backup'
131