tenseleyflow/shithub / ddf7161

Browse files

bench: add Actions load harness

Authored by mfwolffe <wolffemf@dukes.jmu.edu>
SHA
ddf716118541410a39c054569144ef162549ca2d
Parents
2ab8d07
Tree
d1785fe

2 changed files

StatusFile+-
A bench/k6/actions-load.js 190 0
M docs/internal/runbooks/actions.md 48 0
bench/k6/actions-load.jsadded
@@ -0,0 +1,190 @@
1
+import encoding from "k6/encoding";
2
+import http from "k6/http";
3
+import { Counter, Rate, Trend } from "k6/metrics";
4
+import { fail, sleep } from "k6";
5
+
6
// --- Environment configuration ----------------------------------------------
// All knobs come from k6's __ENV map. Numeric values are validated up front so
// a typo fails the run immediately instead of sending NaN in request payloads.

// Base URL of the target instance; trailing slashes are stripped so the path
// concatenation below never produces double slashes.
const baseURL = (__ENV.SHITHUB_BASE_URL || "").replace(/\/+$/, "");

// Comma-separated runner registration tokens; each VU picks one round-robin.
const runnerTokens = (__ENV.SHITHUB_RUNNER_TOKENS || "")
  .split(",")
  .map((token) => token.trim())
  .filter(Boolean);

// Labels advertised on every heartbeat so queued jobs can match this runner.
const runnerLabels = (__ENV.SHITHUB_RUNNER_LABELS || "self-hosted,linux,ubuntu-latest")
  .split(",")
  .map((label) => label.trim())
  .filter(Boolean);

// Parse a numeric env knob, using `fallback` when unset or empty.
// Throws on malformed or non-positive values (fail fast, like the
// required-variable checks below) rather than letting NaN propagate.
function positiveNumberEnv(name, fallback, parse) {
  const raw = __ENV[name];
  const value = parse(raw || fallback);
  if (!Number.isFinite(value) || value <= 0) {
    throw new Error(`${name} must be a positive number, got ${JSON.stringify(raw)}`);
  }
  return value;
}

const runnerCapacity = positiveNumberEnv("SHITHUB_RUNNER_CAPACITY", "17", (s) => parseInt(s, 10));
// Log chunk size is capped at 512 KiB to keep request bodies bounded.
const logBytes = Math.min(
  positiveNumberEnv("SHITHUB_ACTIONS_LOG_BYTES", "4096", (s) => parseInt(s, 10)),
  512 * 1024,
);
const idleSleepSeconds = positiveNumberEnv("SHITHUB_ACTIONS_IDLE_SLEEP", "20", parseFloat);

if (!baseURL) {
  throw new Error("SHITHUB_BASE_URL is required");
}
if (runnerTokens.length === 0) {
  throw new Error("SHITHUB_RUNNER_TOKENS must contain at least one runner registration token");
}
25
+
26
// Steady-state load: N virtual users, each emulating one runner poll loop.
const scenarioVUs = parseInt(__ENV.SHITHUB_ACTIONS_VUS || "50", 10);
const scenarioDuration = __ENV.SHITHUB_ACTIONS_DURATION || "10m";

// k6 execution plan. Thresholds abort with a non-zero exit when any API error
// or job failure occurs, the log-append p99 exceeds 5s, or fewer than 95% of
// claimed jobs finish successfully.
export const options = {
  scenarios: {
    actions_jobs: {
      executor: "constant-vus",
      vus: scenarioVUs,
      duration: scenarioDuration,
    },
  },
  thresholds: {
    api_errors: ["count==0"],
    job_failures: ["count==0"],
    log_append_duration: ["p(99)<5000"],
    successful_job_rate: ["rate>0.95"],
  },
};
41
+
42
// Custom metrics; the Counter/Rate/Trend names must match the threshold keys
// declared in `options` above.
const apiErrors = new Counter("api_errors"); // unexpected status codes or bad JSON
const jobFailures = new Counter("job_failures"); // jobs that threw mid-execution
const claimedJobs = new Counter("claimed_jobs"); // heartbeats that handed us a job
const completedJobs = new Counter("completed_jobs");
const noJobHeartbeats = new Counter("no_job_heartbeats"); // empty-queue heartbeats
const successfulJobRate = new Rate("successful_job_rate");
const logAppendDuration = new Trend("log_append_duration"); // ms per log append
49
+
50
// One VU iteration: heartbeat as a pseudo-runner; run the claimed job to
// completion if one was assigned, otherwise back off with jitter so VUs do
// not poll in lockstep.
export default function () {
  // Round-robin the registration tokens across VUs (__VU is 1-indexed).
  const registrationToken = runnerTokens[(__VU - 1) % runnerTokens.length];

  const claim = heartbeat(registrationToken);
  if (claim) {
    claimedJobs.add(1);
    executeClaim(claim);
    return;
  }

  // Empty queue: sleep between 1x and 2x the configured idle interval.
  sleep(idleSleepSeconds + Math.random() * idleSleepSeconds);
}
61
+
62
// POST a runner heartbeat advertising our labels and capacity.
// Returns the claim payload ({ job, token }) on HTTP 200, or null on
// HTTP 204 (no work available). Any other status is counted as an API
// error and fails the iteration.
function heartbeat(registrationToken) {
  const payload = JSON.stringify({ labels: runnerLabels, capacity: runnerCapacity });
  const res = http.post(
    `${baseURL}/api/v1/runners/heartbeat`,
    payload,
    jsonParams(registrationToken),
  );

  switch (res.status) {
    case 204:
      noJobHeartbeats.add(1);
      return null;
    case 200:
      return parseJSON(res, "heartbeat claim");
    default:
      apiErrors.add(1);
      fail(`heartbeat returned ${res.status}: ${res.body}`);
  }
}
78
+
79
// Drive a claimed job through its lifecycle: job running -> each step
// running/completed -> job completed. Every API response returns a rotated
// `next_token` that must be threaded into the following request. If the
// server cancels the job mid-stream, appendLogs reports it and we stop
// early (cancellation still counts toward the success rate).
function executeClaim(claim) {
  const { job } = claim;
  let token = claim.token;

  try {
    token = postJob(job.id, "status", token, { status: "running" }, 200).next_token;

    for (const step of job.steps || []) {
      token = postJob(job.id, `steps/${step.id}/status`, token, { status: "running" }, 200).next_token;

      if (step.run) {
        const { token: rotated, cancelled } = appendLogs(job, step, token);
        token = rotated;
        if (cancelled) {
          // appendLogs already posted the cancelled statuses upstream.
          successfulJobRate.add(true);
          return;
        }
      }

      token = postJob(
        job.id,
        `steps/${step.id}/status`,
        token,
        { status: "completed", conclusion: "success" },
        200,
      ).next_token;
    }

    postJob(job.id, "status", token, { status: "completed", conclusion: "success" }, 200);
    completedJobs.add(1);
    successfulJobRate.add(true);
  } catch (err) {
    jobFailures.add(1);
    successfulJobRate.add(false);
    throw err;
  }
}
113
+
114
// Emit three sequential log chunks for one step, rotating the job token
// after every request. After the second chunk (seq === 1) the harness polls
// cancel-check; if the server reports a cancellation, the step and job are
// marked cancelled and the caller is told to stop.
// Returns { token, cancelled } with the most recent token.
function appendLogs(job, step, token) {
  let current = token;

  for (let seq = 0; seq < 3; seq += 1) {
    const payload = JSON.stringify({
      seq,
      step_id: step.id,
      chunk: encoding.b64encode(logChunk(job, step, seq), "std"),
    });
    const res = http.post(
      `${baseURL}/api/v1/jobs/${job.id}/logs`,
      payload,
      jsonParams(current),
    );
    logAppendDuration.add(res.timings.duration);

    if (res.status !== 202) {
      apiErrors.add(1);
      fail(`log append returned ${res.status}: ${res.body}`);
    }
    current = parseJSON(res, "log append").next_token;

    // Mid-stream cancellation probe, once per step.
    if (seq === 1) {
      const cancel = postJob(job.id, "cancel-check", current, {}, 200);
      current = cancel.next_token;
      if (cancel.cancelled) {
        current = postJob(
          job.id,
          `steps/${step.id}/status`,
          current,
          { status: "cancelled", conclusion: "cancelled" },
          200,
        ).next_token;
        postJob(job.id, "status", current, { status: "cancelled", conclusion: "cancelled" }, 200);
        return { token: current, cancelled: true };
      }
    }
  }

  return { token: current, cancelled: false };
}
152
+
153
// POST `body` (JSON-encoded) to /api/v1/jobs/{jobID}/{path} using the
// current job token. Counts an api_error and fails the iteration unless
// the response status equals `expectedStatus`; otherwise returns the
// parsed JSON response.
function postJob(jobID, path, token, body, expectedStatus) {
  const url = `${baseURL}/api/v1/jobs/${jobID}/${path}`;
  const res = http.post(url, JSON.stringify(body), jsonParams(token));

  if (res.status !== expectedStatus) {
    apiErrors.add(1);
    fail(`${path} returned ${res.status}: ${res.body}`);
  }
  return parseJSON(res, path);
}
165
+
166
// Decode a response body as JSON. A malformed body is treated as an API
// error and fails the iteration (k6's fail() throws, so callers never see
// an undefined return from the error path).
function parseJSON(res, name) {
  let parsed;
  try {
    parsed = res.json();
  } catch (err) {
    apiErrors.add(1);
    fail(`${name} returned invalid JSON: ${err}`);
  }
  return parsed;
}
174
+
175
// Build the shared k6 request params: bearer authentication for the given
// token plus a JSON content type.
function jsonParams(token) {
  const headers = {
    Authorization: `Bearer ${token}`,
    "Content-Type": "application/json",
  };
  return { headers };
}
183
+
184
// Produce a deterministic log chunk of exactly `logBytes` characters: a
// header identifying vu/iter/run/job/step/seq, dot-padded up to the
// configured size, or truncated when the header alone already exceeds it.
function logChunk(job, step, seq) {
  const header = `vu=${__VU} iter=${__ITER} run=${job.run_id} job=${job.id} step=${step.id} seq=${seq}\n`;
  if (header.length >= logBytes) {
    return header.slice(0, logBytes);
  }
  return header.padEnd(logBytes, ".");
}
docs/internal/runbooks/actions.mdmodified
@@ -157,12 +157,60 @@ shithubd admin actions runner list
157157
 
158158
 Important metrics:
159159
 
160
+- `shithub_actions_queue_depth{resource="runs|jobs"}`
161
+- `shithub_actions_active{resource="runs|jobs"}`
162
+- `shithub_actions_runner_heartbeat_age_seconds{runner,status}`
163
+- `shithub_actions_runner_capacity{runner,status}`
160164
 - `shithub_actions_runner_heartbeats_total{result="claimed|no_job"}`
161165
 - `shithub_actions_runner_jwt_total{result="issued|rejected|replay"}`
166
+- `shithub_actions_runs_completed_total{event,conclusion}`
167
+- `shithub_actions_run_duration_seconds{event,conclusion}`
168
+- `shithub_actions_steps_completed_total{step_type,conclusion}`
162169
 - `shithub_actions_jobs_cancelled_total{reason="user|concurrency|timeout"}`
163170
 - `shithub_actions_log_scrub_replacements_total{location="server"}`
171
+- `shithub_actions_log_chunks_total{location="server"}`
172
+- `shithub_actions_log_chunk_bytes_total{location="server"}`
173
+- `shithub_actions_storage_objects{kind="artifacts|step_logs|hot_log_chunks"}`
174
+- `shithub_actions_storage_bytes{kind="artifacts|step_logs|hot_log_chunks"}`
164175
 - `shithub_actions_step_timeouts_total`
165176
 
177
+The committed dashboard JSON lives at:
178
+
179
+```text
180
+deploy/monitoring/grafana/dashboards/actions.json
181
+```
182
+
183
+## Load harness
184
+
185
+`bench/k6/actions-load.js` exercises the runner HTTP API under concurrent job
186
+claims. It does not create workflow runs itself; seed the queue first with
187
+pushes or workflow dispatches that produce jobs matching the runner labels.
188
+
189
+Required environment:
190
+
191
+```sh
192
+SHITHUB_BASE_URL=https://shithub.example.test \
193
+SHITHUB_RUNNER_TOKENS=token-a,token-b,token-c \
194
+k6 run bench/k6/actions-load.js
195
+```
196
+
197
+Useful knobs:
198
+
199
+- `SHITHUB_ACTIONS_VUS=50` controls concurrent virtual users.
200
+- `SHITHUB_ACTIONS_DURATION=10m` controls the steady-state window.
201
+- `SHITHUB_RUNNER_LABELS=self-hosted,linux,ubuntu-latest` sets heartbeat
202
+  labels.
203
+- `SHITHUB_RUNNER_CAPACITY=17` keeps three runners near the 50-concurrent
204
+  target.
205
+- `SHITHUB_ACTIONS_LOG_BYTES=4096` controls emitted log chunk size.
206
+
207
+Healthy run expectations:
208
+
209
+- queued jobs drain without unbounded `shithub_actions_queue_depth` growth;
210
+- runner heartbeats keep advancing and no runner deadlocks;
211
+- log append p99 stays below five seconds;
212
+- retention metrics catch up after the retention sweep.
213
+
166214
 ## Emergency cancel
167215
 
168216
 Start with a dry run: