tenseleyflow/shithub / 3ff1cc1

Browse files

docs/actions: document runner API and counters (S41c)

Authored by mfwolffe <wolffemf@dukes.jmu.edu>
SHA
3ff1cc1d300e60980cdd7e8479d2eac7aa5d536e
Parents
b70f1f2
Tree
c18337f

7 changed files

StatusFile+-
M cmd/shithubd/admin_runner.go 2 0
A docs/internal/actions-runner-api.md 111 0
M docs/internal/actions-schema.md 9 1
M docs/internal/index.md 2 0
A docs/internal/runbooks/actions-runner.md 84 0
M internal/infra/metrics/metrics.go 23 0
M internal/web/handlers/api/runners.go 9 0
cmd/shithubd/admin_runner.gomodified
@@ -20,6 +20,7 @@ import (
2020
 	actionsdb "github.com/tenseleyFlow/shithub/internal/actions/sqlc"
2121
 	"github.com/tenseleyFlow/shithub/internal/infra/config"
2222
 	"github.com/tenseleyFlow/shithub/internal/infra/db"
23
+	"github.com/tenseleyFlow/shithub/internal/infra/metrics"
2324
 )
2425
 
2526
 func newAdminRunnerCmd() *cobra.Command {
@@ -102,6 +103,7 @@ func newAdminRunnerRegisterCmd() *cobra.Command {
102103
 				return fmt.Errorf("admin runner register: commit: %w", err)
103104
 			}
104105
 			committed = true
106
+			metrics.ActionsRunnerRegistrationsTotal.Inc()
105107
 
106108
 			_, _ = fmt.Fprintf(cmd.OutOrStdout(),
107109
 				"runner registered\nid: %d\nname: %s\nlabels: %s\ncapacity: %d\ntoken: %s\n\nStore this token now; shithub never shows it again.\n",
docs/internal/actions-runner-api.mdadded
@@ -0,0 +1,111 @@
1
+# Actions runner API
2
+
3
+The runner-facing HTTP surface lives in
4
+`internal/web/handlers/api/runners.go`. It is mounted under `/api/v1`
5
+in the CSRF-exempt API group, but it does not use PAT auth. Runners
6
+authenticate first with a long-lived registration token and then with
7
+short-lived per-job JWTs.
8
+
9
+## Auth model
10
+
11
+Operators register a runner with:
12
+
13
+```sh
14
+shithubd admin runner register --name runner-1 --labels self-hosted,linux,ubuntu-latest
15
+```
16
+
17
+The command inserts a `workflow_runners` row, stores only the token's SHA-256 hash in
18
+`runner_tokens`, and prints the 32-byte hex token once.
19
+
20
+`POST /api/v1/runners/heartbeat` accepts:
21
+
22
+```http
23
+Authorization: Bearer <registration-token>
24
+```
25
+
26
+When a queued job matches the runner labels and capacity is available,
27
+the response includes a job payload and a 15-minute job JWT. That JWT
28
+has claims:
29
+
30
+```json
31
+{"sub":"runner:<id>","job_id":1,"run_id":1,"repo_id":1,"exp":0,"jti":"..."}
32
+```
33
+
34
+The signing key is derived from `auth.totp_key_b64` with HKDF label
35
+`actions-runner-jwt-v1`; the raw TOTP/secretbox key is not used
36
+directly for JWT signing.
37
+
38
+Job JWTs are single-use. Every job endpoint verifies the signature and
39
+expiry, checks that the path job belongs to the claimed runner/run, and
40
+then inserts `jti` into `runner_jwt_used`. A replay returns 401. To
41
+support multi-step runner flows, successful non-terminal job endpoints
42
+return `next_token` and `next_token_expires_at`.
43
+
44
+## Endpoints
45
+
46
+`POST /api/v1/runners/heartbeat`
47
+
48
+Request body:
49
+
50
+```json
51
+{"labels":["ubuntu-latest","linux"],"capacity":1}
52
+```
53
+
54
+Returns 204 when no matching job is claimable. Returns 200 with
55
+`token`, `expires_at`, and `job` when a job is claimed. Capacity is
56
+enforced server-side by counting current `workflow_jobs.status =
57
+'running'` rows for the runner while holding a row lock on the runner.
58
+
59
+`POST /api/v1/jobs/{id}/logs`
60
+
61
+Auth: job JWT. Body:
62
+
63
+```json
64
+{"seq":0,"chunk":"aGVsbG8K","step_id":123}
65
+```
66
+
67
+`step_id` is optional for the S41c curl smoke path; when omitted the
68
+first step in the job receives the chunk. Chunks are base64-decoded,
69
+capped at 512 KiB raw, and appended to `workflow_step_log_chunks`.
70
+Duplicate `(step_id, seq)` inserts are accepted as idempotent retries.
71
+
72
+`POST /api/v1/jobs/{id}/status`
73
+
74
+Auth: job JWT. Body:
75
+
76
+```json
77
+{"status":"completed","conclusion":"success"}
78
+```
79
+
80
+Valid transitions are `queued|running -> running|completed|cancelled`.
81
+Completed jobs require a valid check conclusion. The handler updates
82
+`workflow_jobs`, rolls up `workflow_runs`, and best-effort updates the
83
+matching `check_runs` row created by the trigger pipeline.
84
+
85
+`POST /api/v1/jobs/{id}/artifacts/upload`
86
+
87
+Auth: job JWT. Body:
88
+
89
+```json
90
+{"name":"test-results.tgz","size_bytes":12345}
91
+```
92
+
93
+Creates a `workflow_artifacts` row and returns a pre-signed S3 PUT URL.
94
+The object key is `actions/runs/<run_id>/artifacts/<name>`.
95
+
96
+`POST /api/v1/jobs/{id}/cancel-check`
97
+
98
+Auth: job JWT. Returns:
99
+
100
+```json
101
+{"cancelled":false,"next_token":"..."}
102
+```
103
+
104
+The boolean mirrors `workflow_jobs.cancel_requested`; the actual cancel
105
+request UI lands later in S41g.
106
+
107
+## Metrics
108
+
109
+- `shithub_actions_runner_registrations_total`
110
+- `shithub_actions_runner_heartbeats_total{result="claimed|no_job"}`
111
+- `shithub_actions_runner_jwt_total{result="issued|rejected|replay"}`
docs/internal/actions-schema.mdmodified
@@ -12,7 +12,7 @@ without churning under them.
1212
 
1313
 ## SQL schema
1414
 
15
-Migrations 0042–0049, in dependency order:
15
+Migrations 0042–0052, in dependency order:
1616
 
1717
 | #     | Table                       | Purpose                                                       |
1818
 | ----- | --------------------------- | ------------------------------------------------------------- |
@@ -24,6 +24,9 @@ Migrations 0042–0049, in dependency order:
2424
 | 0047  | `workflow_step_log_chunks`  | Hot-path append log buffer (concatenated to blob on finalize) |
2525
 | 0048  | `workflow_artifacts`        | Per-run artifact metadata (90-day default expiry)             |
2626
 | 0049  | `actions_variables`         | Non-secret per-repo/org config (Forgejo parity)               |
27
+| 0050  | `workflow_steps.step_with`  | Parsed `with:` inputs for magic `uses:` aliases               |
28
+| 0051  | `workflow_runs.trigger_event_id` | Trigger idempotency for retries/admin replays            |
29
+| 0052  | `runner_jwt_used`           | Single-use replay gate for runner job JWTs                    |
2730
 
2831
 A few load-bearing choices, called out so they're easy to spot in a
2932
 later schema diff:
@@ -56,6 +59,11 @@ later schema diff:
5659
 - **`actions_variables`** — non-secret, plaintext, scoped exactly
5760
   like secrets (per-repo or per-org, never both on the same row).
5861
   Forgejo has the same split; we mirror it for parity.
62
+- **`runner_jwt_used`** — primary-keyed by JWT `jti`. Job endpoints
63
+  insert into this table during auth; zero inserted rows means replay
64
+  and the API returns 401. JWTs are signed with HMAC-SHA256 using an HKDF
65
+  subkey derived from `auth.totp_key_b64` with label
66
+  `actions-runner-jwt-v1`.
5967
 
6068
 The `version` and `run_index` patterns are the two pieces I'd point
6169
 out to a future maintainer first. Both are cheap to add now and
docs/internal/index.mdmodified
@@ -55,6 +55,8 @@ site.
5555
   [pr-review.md](./pr-review.md)
5656
 - [branch-protection.md](./branch-protection.md),
5757
   [checks.md](./checks.md)
58
+- [actions-schema.md](./actions-schema.md),
59
+  [actions-runner-api.md](./actions-runner-api.md)
5860
 - [orgs.md](./orgs.md), [teams.md](./teams.md)
5961
 - [notifications.md](./notifications.md)
6062
 - [search.md](./search.md), [markdown.md](./markdown.md)
docs/internal/runbooks/actions-runner.mdadded
@@ -0,0 +1,84 @@
1
+# Actions runner smoke runbook
2
+
3
+This runbook drives one queued Actions job with curl. It is for S41c
4
+operator validation before the real `shithubd-runner` binary lands.
5
+
6
+Prereqs:
7
+
8
+- Database migrations are current through `0052_runner_jwt_used.sql`.
9
+- `SHITHUB_TOTP_KEY` or `auth.totp_key_b64` is set on the web process.
10
+- Object storage is configured if testing artifact upload.
11
+- A repo has a workflow under `.shithub/workflows/*.yml` with
12
+  `runs-on: ubuntu-latest`, and a push/dispatch has enqueued a run.
13
+
14
+Register a runner:
15
+
16
+```sh
17
+shithubd admin runner register \
18
+  --name runner-1 \
19
+  --labels self-hosted,linux,ubuntu-latest \
20
+  --capacity 1
21
+```
22
+
23
+Save the printed token:
24
+
25
+```sh
26
+export RUNNER_TOKEN='<printed-token>'
27
+export BASE='https://shithub.example'
28
+```
29
+
30
+Claim a job:
31
+
32
+```sh
33
+curl -fsS "$BASE/api/v1/runners/heartbeat" \
34
+  -H "Authorization: Bearer $RUNNER_TOKEN" \
35
+  -H "Content-Type: application/json" \
36
+  -d '{"labels":["self-hosted","linux","ubuntu-latest"],"capacity":1}' \
37
+  | tee /tmp/shithub-claim.json
38
+```
39
+
40
+Extract the job token and id:
41
+
42
+```sh
43
+export JOB_ID="$(jq -r '.job.id' /tmp/shithub-claim.json)"
44
+export JOB_TOKEN="$(jq -r '.token' /tmp/shithub-claim.json)"
45
+```
46
+
47
+Append a log chunk:
48
+
49
+```sh
50
+curl -fsS "$BASE/api/v1/jobs/$JOB_ID/logs" \
51
+  -H "Authorization: Bearer $JOB_TOKEN" \
52
+  -H "Content-Type: application/json" \
53
+  -d "{\"seq\":0,\"chunk\":\"$(printf 'hello from curl\n' | base64)\"}" \
54
+  | tee /tmp/shithub-log.json
55
+
56
+export JOB_TOKEN="$(jq -r '.next_token' /tmp/shithub-log.json)"
57
+```
58
+
59
+Complete the job:
60
+
61
+```sh
62
+curl -fsS "$BASE/api/v1/jobs/$JOB_ID/status" \
63
+  -H "Authorization: Bearer $JOB_TOKEN" \
64
+  -H "Content-Type: application/json" \
65
+  -d '{"status":"completed","conclusion":"success"}'
66
+```
67
+
68
+Replay check: reusing the log response's `next_token` after the status call has consumed it must fail with
69
+401 because its `jti` is already present in `runner_jwt_used`.
70
+
71
+```sh
72
+curl -i "$BASE/api/v1/jobs/$JOB_ID/status" \
73
+  -H "Authorization: Bearer $(jq -r '.next_token' /tmp/shithub-log.json)" \
74
+  -H "Content-Type: application/json" \
75
+  -d '{"status":"running"}'
76
+```
77
+
78
+Expected results:
79
+
80
+- `workflow_jobs.status = completed` and conclusion `success`.
81
+- The parent `workflow_runs` row rolls up to completed/success when all
82
+  jobs are terminal.
83
+- The PR Checks tab shows the matching check run as success.
84
+- `/metrics` includes runner registration, heartbeat, and JWT counters.
internal/infra/metrics/metrics.gomodified
@@ -129,6 +129,26 @@ var (
129129
 			Buckets: prometheus.ExponentialBuckets(0.005, 2.0, 12),
130130
 		},
131131
 	)
132
+	ActionsRunnerRegistrationsTotal = prometheus.NewCounter(
133
+		prometheus.CounterOpts{
134
+			Name: "shithub_actions_runner_registrations_total",
135
+			Help: "Total Actions runners registered through operator tooling.",
136
+		},
137
+	)
138
+	ActionsRunnerHeartbeatsTotal = prometheus.NewCounterVec(
139
+		prometheus.CounterOpts{
140
+			Name: "shithub_actions_runner_heartbeats_total",
141
+			Help: "Total runner heartbeats by result (claimed, no_job).",
142
+		},
143
+		[]string{"result"},
144
+	)
145
+	ActionsRunnerJWTTotal = prometheus.NewCounterVec(
146
+		prometheus.CounterOpts{
147
+			Name: "shithub_actions_runner_jwt_total",
148
+			Help: "Total runner job JWT outcomes by result (issued, rejected, replay).",
149
+		},
150
+		[]string{"result"},
151
+	)
132152
 )
133153
 
134154
 func init() {
@@ -146,6 +166,9 @@ func init() {
146166
 		WorkerInFlight,
147167
 		ActionsRunsEnqueuedTotal,
148168
 		ActionsTriggerMatchDurationSeconds,
169
+		ActionsRunnerRegistrationsTotal,
170
+		ActionsRunnerHeartbeatsTotal,
171
+		ActionsRunnerJWTTotal,
149172
 	)
150173
 }
151174
 
internal/web/handlers/api/runners.gomodified
@@ -25,6 +25,7 @@ import (
2525
 	"github.com/tenseleyFlow/shithub/internal/auth/runnerjwt"
2626
 	"github.com/tenseleyFlow/shithub/internal/checks"
2727
 	checksdb "github.com/tenseleyFlow/shithub/internal/checks/sqlc"
28
+	"github.com/tenseleyFlow/shithub/internal/infra/metrics"
2829
 	"github.com/tenseleyFlow/shithub/internal/ratelimit"
2930
 )
3031
 
@@ -92,6 +93,7 @@ func (h *Handlers) runnerHeartbeat(w http.ResponseWriter, r *http.Request) {
9293
 		return
9394
 	}
9495
 	if !claimed {
96
+		metrics.ActionsRunnerHeartbeatsTotal.WithLabelValues("no_job").Inc()
9597
 		w.WriteHeader(http.StatusNoContent)
9698
 		return
9799
 	}
@@ -107,6 +109,8 @@ func (h *Handlers) runnerHeartbeat(w http.ResponseWriter, r *http.Request) {
107109
 		writeAPIError(w, http.StatusInternalServerError, "runner token mint failed")
108110
 		return
109111
 	}
112
+	metrics.ActionsRunnerHeartbeatsTotal.WithLabelValues("claimed").Inc()
113
+	metrics.ActionsRunnerJWTTotal.WithLabelValues("issued").Inc()
110114
 	writeJSON(w, http.StatusOK, presentRunnerClaim(job, steps, token, time.Unix(claims.Exp, 0)))
111115
 }
112116
 
@@ -260,6 +264,7 @@ func (h *Handlers) authenticateRunnerJob(w http.ResponseWriter, r *http.Request)
260264
 	}
261265
 	claims, err := h.d.RunnerJWT.Verify(strings.TrimSpace(strings.TrimPrefix(authz, prefix)))
262266
 	if err != nil {
267
+		metrics.ActionsRunnerJWTTotal.WithLabelValues("rejected").Inc()
263268
 		writeAPIError(w, http.StatusUnauthorized, "job token invalid")
264269
 		return runnerJobAuth{}, false
265270
 	}
@@ -269,6 +274,7 @@ func (h *Handlers) authenticateRunnerJob(w http.ResponseWriter, r *http.Request)
269274
 	}
270275
 	runnerID, err := claims.RunnerID()
271276
 	if err != nil {
277
+		metrics.ActionsRunnerJWTTotal.WithLabelValues("rejected").Inc()
272278
 		writeAPIError(w, http.StatusUnauthorized, "job token invalid")
273279
 		return runnerJobAuth{}, false
274280
 	}
@@ -287,8 +293,10 @@ func (h *Handlers) authenticateRunnerJob(w http.ResponseWriter, r *http.Request)
287293
 	}
288294
 	if err := runnerjwt.Consume(r.Context(), h.d.Pool, claims); err != nil {
289295
 		if errors.Is(err, runnerjwt.ErrReplay) {
296
+			metrics.ActionsRunnerJWTTotal.WithLabelValues("replay").Inc()
290297
 			writeAPIError(w, http.StatusUnauthorized, "job token replayed")
291298
 		} else {
299
+			metrics.ActionsRunnerJWTTotal.WithLabelValues("rejected").Inc()
292300
 			h.d.Logger.ErrorContext(r.Context(), "runner jwt consume failed", "job_id", pathJobID, "error", err)
293301
 			writeAPIError(w, http.StatusUnauthorized, "job token invalid")
294302
 		}
@@ -695,6 +703,7 @@ func (h *Handlers) writeNextTokenResponse(
695703
 		writeAPIError(w, http.StatusInternalServerError, "runner token mint failed")
696704
 		return
697705
 	}
706
+	metrics.ActionsRunnerJWTTotal.WithLabelValues("issued").Inc()
698707
 	body["next_token"] = token
699708
 	body["next_token_expires_at"] = time.Unix(claims.Exp, 0).UTC().Format(time.RFC3339)
700709
 	writeJSON(w, status, body)