tenseleyflow/shithub / 2ab8d07

Browse files

monitoring: add Actions dashboard and alerts

Authored by mfwolffe <wolffemf@dukes.jmu.edu>
SHA
2ab8d07593a622dd38994590cb5e1967b59babb7
Parents
11badfd
Tree
2190307

4 changed files

StatusFile+-
A deploy/monitoring/grafana/dashboards/actions.json 247 0
M deploy/monitoring/prometheus/rules.yml 46 0
M docs/internal/runbooks/incidents.md 64 0
M docs/internal/runbooks/observability.md 14 4
deploy/monitoring/grafana/dashboards/actions.jsonadded
@@ -0,0 +1,247 @@
1
+{
2
+  "uid": "shithubd-actions",
3
+  "title": "shithubd - Actions",
4
+  "tags": ["shithubd", "actions"],
5
+  "timezone": "browser",
6
+  "schemaVersion": 39,
7
+  "version": 1,
8
+  "refresh": "30s",
9
+  "time": {"from": "now-6h", "to": "now"},
10
+  "templating": {
11
+    "list": [
12
+      {
13
+        "name": "instance",
14
+        "type": "query",
15
+        "datasource": "Prometheus",
16
+        "query": "label_values(up{job=\"shithubd-web\"}, instance)",
17
+        "includeAll": true,
18
+        "multi": true
19
+      }
20
+    ]
21
+  },
22
+  "panels": [
23
+    {
24
+      "id": 1,
25
+      "type": "stat",
26
+      "title": "Queued jobs",
27
+      "gridPos": {"x": 0, "y": 0, "w": 4, "h": 4},
28
+      "targets": [{"expr": "sum(shithub_actions_queue_depth{resource=\"jobs\",instance=~\"$instance\"})", "refId": "A"}],
29
+      "fieldConfig": {
30
+        "defaults": {
31
+          "thresholds": {
32
+            "mode": "absolute",
33
+            "steps": [
34
+              {"color": "green", "value": null},
35
+              {"color": "yellow", "value": 50},
36
+              {"color": "red", "value": 100}
37
+            ]
38
+          }
39
+        }
40
+      }
41
+    },
42
+    {
43
+      "id": 2,
44
+      "type": "stat",
45
+      "title": "Running jobs",
46
+      "gridPos": {"x": 4, "y": 0, "w": 4, "h": 4},
47
+      "targets": [{"expr": "sum(shithub_actions_active{resource=\"jobs\",instance=~\"$instance\"})", "refId": "A"}]
48
+    },
49
+    {
50
+      "id": 3,
51
+      "type": "stat",
52
+      "title": "Stale runners",
53
+      "gridPos": {"x": 8, "y": 0, "w": 4, "h": 4},
54
+      "targets": [{"expr": "count(shithub_actions_runner_heartbeat_age_seconds{status!=\"offline\",instance=~\"$instance\"} > 60) or vector(0)", "refId": "A"}],
55
+      "fieldConfig": {
56
+        "defaults": {
57
+          "thresholds": {
58
+            "mode": "absolute",
59
+            "steps": [
60
+              {"color": "green", "value": null},
61
+              {"color": "red", "value": 1}
62
+            ]
63
+          }
64
+        }
65
+      }
66
+    },
67
+    {
68
+      "id": 4,
69
+      "type": "stat",
70
+      "title": "Log MB/day",
71
+      "gridPos": {"x": 12, "y": 0, "w": 4, "h": 4},
72
+      "targets": [{"expr": "sum(increase(shithub_actions_log_chunk_bytes_total{instance=~\"$instance\"}[24h])) / 1024 / 1024", "refId": "A"}]
73
+    },
74
+    {
75
+      "id": 5,
76
+      "type": "stat",
77
+      "title": "Run p99",
78
+      "gridPos": {"x": 16, "y": 0, "w": 4, "h": 4},
79
+      "targets": [{"expr": "histogram_quantile(0.99, sum(rate(shithub_actions_run_duration_seconds_bucket{instance=~\"$instance\"}[30m])) by (le))", "refId": "A"}],
80
+      "fieldConfig": {"defaults": {"unit": "s"}}
81
+    },
82
+    {
83
+      "id": 6,
84
+      "type": "stat",
85
+      "title": "Storage MB",
86
+      "gridPos": {"x": 20, "y": 0, "w": 4, "h": 4},
87
+      "targets": [{"expr": "sum(shithub_actions_storage_bytes{instance=~\"$instance\"}) / 1024 / 1024", "refId": "A"}]
88
+    },
89
+    {
90
+      "id": 7,
91
+      "type": "timeseries",
92
+      "title": "Queue depth",
93
+      "gridPos": {"x": 0, "y": 4, "w": 12, "h": 8},
94
+      "targets": [
95
+        {
96
+          "expr": "sum(shithub_actions_queue_depth{instance=~\"$instance\"}) by (resource)",
97
+          "legendFormat": "{{resource}} queued",
98
+          "refId": "A"
99
+        }
100
+      ]
101
+    },
102
+    {
103
+      "id": 8,
104
+      "type": "timeseries",
105
+      "title": "Active runs and jobs",
106
+      "gridPos": {"x": 12, "y": 4, "w": 12, "h": 8},
107
+      "targets": [
108
+        {
109
+          "expr": "sum(shithub_actions_active{instance=~\"$instance\"}) by (resource)",
110
+          "legendFormat": "{{resource}} active",
111
+          "refId": "A"
112
+        }
113
+      ]
114
+    },
115
+    {
116
+      "id": 9,
117
+      "type": "timeseries",
118
+      "title": "Run duration p95 and p99",
119
+      "gridPos": {"x": 0, "y": 12, "w": 12, "h": 8},
120
+      "targets": [
121
+        {
122
+          "expr": "histogram_quantile(0.95, sum(rate(shithub_actions_run_duration_seconds_bucket{instance=~\"$instance\"}[15m])) by (le, event))",
123
+          "legendFormat": "p95 {{event}}",
124
+          "refId": "A"
125
+        },
126
+        {
127
+          "expr": "histogram_quantile(0.99, sum(rate(shithub_actions_run_duration_seconds_bucket{instance=~\"$instance\"}[15m])) by (le, event))",
128
+          "legendFormat": "p99 {{event}}",
129
+          "refId": "B"
130
+        }
131
+      ],
132
+      "fieldConfig": {"defaults": {"unit": "s"}}
133
+    },
134
+    {
135
+      "id": 10,
136
+      "type": "timeseries",
137
+      "title": "Runner heartbeat age",
138
+      "gridPos": {"x": 12, "y": 12, "w": 12, "h": 8},
139
+      "targets": [
140
+        {
141
+          "expr": "shithub_actions_runner_heartbeat_age_seconds{instance=~\"$instance\"}",
142
+          "legendFormat": "{{runner}} {{status}}",
143
+          "refId": "A"
144
+        }
145
+      ],
146
+      "fieldConfig": {"defaults": {"unit": "s"}}
147
+    },
148
+    {
149
+      "id": 11,
150
+      "type": "timeseries",
151
+      "title": "Runs per minute",
152
+      "gridPos": {"x": 0, "y": 20, "w": 12, "h": 8},
153
+      "targets": [
154
+        {
155
+          "expr": "sum(rate(shithub_actions_runs_enqueued_total{result=\"fresh\",instance=~\"$instance\"}[5m])) * 60",
156
+          "legendFormat": "enqueued",
157
+          "refId": "A"
158
+        },
159
+        {
160
+          "expr": "sum(rate(shithub_actions_runs_completed_total{instance=~\"$instance\"}[5m])) * 60",
161
+          "legendFormat": "completed",
162
+          "refId": "B"
163
+        }
164
+      ]
165
+    },
166
+    {
167
+      "id": 12,
168
+      "type": "timeseries",
169
+      "title": "Run conclusions",
170
+      "gridPos": {"x": 12, "y": 20, "w": 12, "h": 8},
171
+      "targets": [
172
+        {
173
+          "expr": "sum(rate(shithub_actions_runs_completed_total{instance=~\"$instance\"}[15m])) by (conclusion)",
174
+          "legendFormat": "{{conclusion}}",
175
+          "refId": "A"
176
+        }
177
+      ]
178
+    },
179
+    {
180
+      "id": 13,
181
+      "type": "timeseries",
182
+      "title": "Step outcomes",
183
+      "gridPos": {"x": 0, "y": 28, "w": 12, "h": 8},
184
+      "targets": [
185
+        {
186
+          "expr": "sum(rate(shithub_actions_steps_completed_total{instance=~\"$instance\"}[15m])) by (step_type, conclusion)",
187
+          "legendFormat": "{{step_type}} {{conclusion}}",
188
+          "refId": "A"
189
+        }
190
+      ]
191
+    },
192
+    {
193
+      "id": 14,
194
+      "type": "timeseries",
195
+      "title": "Log throughput",
196
+      "gridPos": {"x": 12, "y": 28, "w": 12, "h": 8},
197
+      "targets": [
198
+        {
199
+          "expr": "sum(rate(shithub_actions_log_chunk_bytes_total{instance=~\"$instance\"}[5m])) by (location)",
200
+          "legendFormat": "{{location}} bytes/sec",
201
+          "refId": "A"
202
+        },
203
+        {
204
+          "expr": "sum(rate(shithub_actions_log_chunks_total{instance=~\"$instance\"}[5m])) by (location)",
205
+          "legendFormat": "{{location}} chunks/sec",
206
+          "refId": "B"
207
+        }
208
+      ]
209
+    },
210
+    {
211
+      "id": 15,
212
+      "type": "timeseries",
213
+      "title": "Actions storage",
214
+      "gridPos": {"x": 0, "y": 36, "w": 12, "h": 8},
215
+      "targets": [
216
+        {
217
+          "expr": "sum(shithub_actions_storage_bytes{instance=~\"$instance\"}) by (kind)",
218
+          "legendFormat": "{{kind}} bytes",
219
+          "refId": "A"
220
+        },
221
+        {
222
+          "expr": "sum(shithub_actions_storage_objects{instance=~\"$instance\"}) by (kind)",
223
+          "legendFormat": "{{kind}} objects",
224
+          "refId": "B"
225
+        }
226
+      ]
227
+    },
228
+    {
229
+      "id": 16,
230
+      "type": "timeseries",
231
+      "title": "Cancellations and retention",
232
+      "gridPos": {"x": 12, "y": 36, "w": 12, "h": 8},
233
+      "targets": [
234
+        {
235
+          "expr": "sum(rate(shithub_actions_jobs_cancelled_total{instance=~\"$instance\"}[15m])) by (reason)",
236
+          "legendFormat": "cancel {{reason}}",
237
+          "refId": "A"
238
+        },
239
+        {
240
+          "expr": "sum(rate(shithub_actions_runs_pruned_total{instance=~\"$instance\"}[1h])) by (kind)",
241
+          "legendFormat": "pruned {{kind}}",
242
+          "refId": "B"
243
+        }
244
+      ]
245
+    }
246
+  ]
247
+}
deploy/monitoring/prometheus/rules.ymlmodified
@@ -72,6 +72,52 @@ groups:
7272
         annotations:
7373
           summary: "webhook failure rate > 50% sustained"
7474
 
75
+  - name: shithubd-actions
76
+    interval: 30s
77
+    rules:
78
+      - alert: ActionsRunnerHeartbeatStale
79
+        expr: shithub_actions_runner_heartbeat_age_seconds{status!="offline"} > 60
80
+        for: 5m
81
+        labels: {severity: page}
82
+        annotations:
83
+          summary: "Actions runner {{ $labels.runner }} heartbeat stale for > 60s"
84
+          runbook: "runbooks/incidents.md#actions-runner-heartbeat-stale"
85
+
86
+      - alert: ActionsQueueDepthHigh
87
+        expr: shithub_actions_queue_depth{resource="jobs"} > 100
88
+        for: 10m
89
+        labels: {severity: ticket}
90
+        annotations:
91
+          summary: "Actions queued jobs > 100 for 10m"
92
+          runbook: "runbooks/incidents.md#actions-queue-depth-high"
93
+
94
+      - alert: ActionsRunDurationP99Regressed
95
+        expr: |
96
+          histogram_quantile(0.99,
97
+            sum(rate(shithub_actions_run_duration_seconds_bucket[30m])) by (le)
98
+          )
99
+            >
100
+          1.5 *
101
+          histogram_quantile(0.99,
102
+            sum(rate(shithub_actions_run_duration_seconds_bucket[30m] offset 24h)) by (le)
103
+          )
104
+        for: 15m
105
+        labels: {severity: ticket}
106
+        annotations:
107
+          summary: "Actions run duration p99 regressed by > 50% versus 24h ago"
108
+          runbook: "runbooks/incidents.md#actions-run-duration-p99-regressed"
109
+
110
+      - alert: ActionsLogScrubberPossiblyMissing
111
+        expr: |
112
+          sum(rate(shithub_actions_log_chunk_bytes_total{location="server"}[15m])) > 1048576
113
+            and
114
+          sum(rate(shithub_actions_log_scrub_replacements_total{location="server"}[15m])) == 0
115
+        for: 30m
116
+        labels: {severity: ticket}
117
+        annotations:
118
+          summary: "Actions logs are flowing but no server-side secret masks have matched"
119
+          runbook: "runbooks/incidents.md#actions-log-scrubber-possibly-missing"
120
+
75121
   - name: shithubd-backups
76122
     interval: 5m
77123
     rules:
docs/internal/runbooks/incidents.mdmodified
@@ -72,3 +72,67 @@ write; reads through cache may still appear to work briefly.
7272
    pattern lets multiple workers coexist safely).
7373
 4. To purge a poison job: mark it `failed` (don't delete — we want
7474
    the audit trail).
75
+
76
+## actions-runner-heartbeat-stale
77
+
78
+**Symptom:** `shithub_actions_runner_heartbeat_age_seconds{status!="offline"} >
79
+60` for 5m. Actions jobs can remain queued even while the runner appears
80
+registered.
81
+
82
+1. Identify the runner from the alert label.
83
+2. On the runner host: `systemctl status shithubd-runner` and
84
+   `journalctl -u shithubd-runner -n 200 --no-pager`.
85
+3. On the app host: `shithubd admin actions runner list` and confirm the
86
+   runner labels still match queued jobs.
87
+4. If the runner is wedged, restart `shithubd-runner`. If it cannot
88
+   authenticate, rotate the runner token and redeploy the service env.
89
+5. Record whether the stale heartbeat happened during a deploy, network
90
+   partition, token rotation, or runner engine failure.
91
+
92
+## actions-queue-depth-high
93
+
94
+**Symptom:** `shithub_actions_queue_depth{resource="jobs"} > 100` for 10m.
95
+
96
+1. Check runner availability:
97
+   `shithubd admin actions runner list`.
98
+2. Compare queued labels with runner labels. A workflow using an unsupported
99
+   `runs-on` value will sit queued until a compatible runner exists.
100
+3. Inspect web and worker logs for trigger storms, claim errors, and DB pool
101
+   saturation.
102
+4. If legitimate load exceeds capacity, add runners or raise capacity on idle
103
+   runner hosts. If one repository dominates, cancel or throttle that workload.
104
+5. After mitigation, watch `shithub_actions_queue_depth` drain and confirm
105
+   `shithub_actions_active` does not flatline.
106
+
107
+## actions-run-duration-p99-regressed
108
+
109
+**Symptom:** Actions p99 duration over 30m is >50% above the same window 24h
110
+ago.
111
+
112
+1. Split by event in the Actions dashboard. A single event type usually points
113
+   to one workflow shape rather than runner infrastructure.
114
+2. Compare `shithub_actions_active` and runner capacity. High duration with
115
+   low active jobs suggests slow jobs; high duration with saturated active jobs
116
+   suggests insufficient runner capacity.
117
+3. Check runner host CPU, memory, disk, and Docker/engine logs.
118
+4. If the regression started with a deploy, review runner API, log streaming,
119
+   checkout, and container execution changes first.
120
+5. Capture representative slow run IDs and their step durations before
121
+   canceling or pruning anything.
122
+
123
+## actions-log-scrubber-possibly-missing
124
+
125
+**Symptom:** server-side Actions log bytes are flowing for 30m, but
126
+`shithub_actions_log_scrub_replacements_total{location="server"}` remains zero.
127
+
128
+This is a warning, not proof of leaked secrets. Some periods legitimately have
129
+no secret-bearing logs.
130
+
131
+1. Confirm secrets or variables with sensitive values exist for workloads that
132
+   ran during the window.
133
+2. Trigger a controlled workflow that echoes a known test secret value and
134
+   verify the rendered logs contain `***`, not plaintext.
135
+3. Check that runner claims happened after the secret was created or rotated;
136
+   mask snapshots are captured at claim time.
137
+4. If the controlled workflow is not masked, stop affected runners, rotate the
138
+   exposed secret, and open a security incident.
docs/internal/runbooks/observability.mdmodified
@@ -35,6 +35,11 @@ push-only. No Prometheus or Grafana running locally.
3535
 | `shithub_db_pool_acquired` | gauge | Active Postgres connections. Approaching `shithub_db_pool_total` = saturation. |
3636
 | `shithub_db_pool_acquire_wait_seconds_total` | counter | Cumulative wait time. Sudden derivative climb = pool too small. |
3737
 | `shithub_panics_total` | counter | Recovered panics. Should be 0 in steady state. |
38
+| `shithub_actions_queue_depth` | gauge | Queued Actions runs/jobs. Sustained job depth means runners cannot keep up. |
39
+| `shithub_actions_active` | gauge | Running Actions runs/jobs. Use with capacity to distinguish slow jobs from lack of runners. |
40
+| `shithub_actions_runner_heartbeat_age_seconds` | gauge | Seconds since each runner heartbeat. >60s sustained means the runner is stale. |
41
+| `shithub_actions_run_duration_seconds` | histogram | Terminal Actions run duration by event and conclusion. |
42
+| `shithub_actions_log_chunk_bytes_total` | counter | Accepted Actions log bytes, used for throughput and scrubber health alerts. |
3843
 
3944
 ## Operator setup (one-time)
4045
 
@@ -132,10 +137,15 @@ Within a minute, metrics start landing. From the Cloud portal:
132137
 
133138
 ## Building a starter dashboard
134139
 
135
-Cloud → Dashboards → New → import. Use the panels below as a
136
-spine. (We're not committing the dashboard JSON to the repo yet
137
-because Grafana's UUIDs are stack-specific; doc the queries and
138
-let the operator import once.)
140
+Cloud → Dashboards → New → import. Dashboard JSON is committed under:
141
+
142
+```text
143
+deploy/monitoring/grafana/dashboards/shithubd-overview.json
144
+deploy/monitoring/grafana/dashboards/actions.json
145
+```
146
+
147
+Use the panels below as fallback queries if the dashboard import UI drifts or
148
+you need to rebuild by hand.
139149
 
140150
 | Panel | Query |
141151
 |---|---|