Go · 12533 bytes Raw Blame History
1 // SPDX-License-Identifier: AGPL-3.0-or-later
2
3 // Package metrics owns the Prometheus registry. Standard metrics are
4 // instantiated up front; per-package metrics register against the same
5 // shared registry.
6 package metrics
7
8 import (
9 "crypto/subtle"
10 "net/http"
11
12 "github.com/prometheus/client_golang/prometheus"
13 "github.com/prometheus/client_golang/prometheus/collectors"
14 "github.com/prometheus/client_golang/prometheus/promhttp"
15 )
16
17 // Registry is the project-wide Prometheus registry. Subpackages register
18 // their collectors against this so /metrics has a single source.
19 var Registry = prometheus.NewRegistry()
20
21 // Standard process / Go runtime metrics.
22 func init() {
23 Registry.MustRegister(
24 collectors.NewGoCollector(),
25 collectors.NewProcessCollector(collectors.ProcessCollectorOpts{}),
26 )
27 }
28
29 // HTTP request metrics. Wired by the HTTP middleware.
30 var (
31 HTTPRequestsTotal = prometheus.NewCounterVec(
32 prometheus.CounterOpts{
33 Name: "shithub_http_requests_total",
34 Help: "Total HTTP requests by route, method, and status.",
35 },
36 []string{"route", "method", "status"},
37 )
38 HTTPRequestDuration = prometheus.NewHistogramVec(
39 prometheus.HistogramOpts{
40 Name: "shithub_http_request_duration_seconds",
41 Help: "HTTP request duration distribution by route and method.",
42 Buckets: prometheus.ExponentialBuckets(0.001, 2.5, 12),
43 },
44 []string{"route", "method"},
45 )
46 HTTPInFlight = prometheus.NewGauge(
47 prometheus.GaugeOpts{
48 Name: "shithub_http_in_flight",
49 Help: "Number of HTTP requests currently in flight.",
50 },
51 )
52 PanicsTotal = prometheus.NewCounter(
53 prometheus.CounterOpts{
54 Name: "shithub_panics_total",
55 Help: "Total panics caught by the recover middleware.",
56 },
57 )
58 )
59
60 // DB pool metrics. Updated periodically by an observer goroutine that the
61 // caller starts via Observe(pool, interval).
62 var (
63 DBConnsAcquired = prometheus.NewGauge(
64 prometheus.GaugeOpts{
65 Name: "shithub_db_pool_acquired",
66 Help: "Postgres connections currently checked out of the pool.",
67 },
68 )
69 DBConnsIdle = prometheus.NewGauge(
70 prometheus.GaugeOpts{
71 Name: "shithub_db_pool_idle",
72 Help: "Postgres connections currently idle in the pool.",
73 },
74 )
75 DBConnsTotal = prometheus.NewGauge(
76 prometheus.GaugeOpts{
77 Name: "shithub_db_pool_total",
78 Help: "Postgres connections currently held by the pool.",
79 },
80 )
81 DBAcquireWaitDurationTotal = prometheus.NewCounter(
82 prometheus.CounterOpts{
83 Name: "shithub_db_pool_acquire_wait_seconds_total",
84 Help: "Cumulative time clients spent waiting to acquire a Postgres connection.",
85 },
86 )
87 )
88
89 // Worker metrics. The pool updates these on every dispatch.
90 var (
91 WorkerJobsProcessedTotal = prometheus.NewCounterVec(
92 prometheus.CounterOpts{
93 Name: "shithub_worker_jobs_processed_total",
94 Help: "Worker jobs processed by kind and outcome (ok, retry, failed, poison).",
95 },
96 []string{"kind", "outcome"},
97 )
98 WorkerJobDurationSeconds = prometheus.NewHistogramVec(
99 prometheus.HistogramOpts{
100 Name: "shithub_worker_job_duration_seconds",
101 Help: "Worker handler latency by kind.",
102 Buckets: prometheus.ExponentialBuckets(0.005, 2.5, 12),
103 },
104 []string{"kind"},
105 )
106 WorkerInFlight = prometheus.NewGaugeVec(
107 prometheus.GaugeOpts{
108 Name: "shithub_worker_in_flight",
109 Help: "Worker handler invocations currently in flight by kind.",
110 },
111 []string{"kind"},
112 )
113 )
114
115 // Actions trigger pipeline metrics (S41b). Incremented from
116 // internal/actions/trigger.
117 var (
118 ActionsRunsEnqueuedTotal = prometheus.NewCounterVec(
119 prometheus.CounterOpts{
120 Name: "shithub_actions_runs_enqueued_total",
121 Help: "Total workflow runs enqueued by triggering event kind. Result is 'fresh' for new runs or 'already_exists' when ON CONFLICT noop'd.",
122 },
123 []string{"event", "result"},
124 )
125 ActionsTriggerMatchDurationSeconds = prometheus.NewHistogram(
126 prometheus.HistogramOpts{
127 Name: "shithub_actions_trigger_match_duration_seconds",
128 Help: "Wall-clock time spent in the trigger handler discovering + parsing + matching workflows for one triggering event.",
129 Buckets: prometheus.ExponentialBuckets(0.005, 2.0, 12),
130 },
131 )
132 ActionsRunnerRegistrationsTotal = prometheus.NewCounter(
133 prometheus.CounterOpts{
134 Name: "shithub_actions_runner_registrations_total",
135 Help: "Total Actions runners registered through operator tooling.",
136 },
137 )
138 ActionsRunnerHeartbeatsTotal = prometheus.NewCounterVec(
139 prometheus.CounterOpts{
140 Name: "shithub_actions_runner_heartbeats_total",
141 Help: "Total runner heartbeats by result (claimed, no_job, rejected).",
142 },
143 []string{"result"},
144 )
145 ActionsRunnerJWTTotal = prometheus.NewCounterVec(
146 prometheus.CounterOpts{
147 Name: "shithub_actions_runner_jwt_total",
148 Help: "Total runner job JWT outcomes by result (issued, rejected, replay).",
149 },
150 []string{"result"},
151 )
152 ActionsJobsCancelledTotal = prometheus.NewCounterVec(
153 prometheus.CounterOpts{
154 Name: "shithub_actions_jobs_cancelled_total",
155 Help: "Total Actions job cancellation requests by reason (user, concurrency, timeout).",
156 },
157 []string{"reason"},
158 )
159 ActionsRunsCompletedTotal = prometheus.NewCounterVec(
160 prometheus.CounterOpts{
161 Name: "shithub_actions_runs_completed_total",
162 Help: "Total terminal Actions workflow runs by event kind and conclusion.",
163 },
164 []string{"event", "conclusion"},
165 )
166 ActionsRunDurationSeconds = prometheus.NewHistogramVec(
167 prometheus.HistogramOpts{
168 Name: "shithub_actions_run_duration_seconds",
169 Help: "Actions workflow run duration from started_at or created_at to completed_at, by event kind and conclusion.",
170 Buckets: prometheus.ExponentialBuckets(1, 2.5, 12),
171 },
172 []string{"event", "conclusion"},
173 )
174 ActionsStepsCompletedTotal = prometheus.NewCounterVec(
175 prometheus.CounterOpts{
176 Name: "shithub_actions_steps_completed_total",
177 Help: "Total terminal Actions steps by bounded step type and conclusion.",
178 },
179 []string{"step_type", "conclusion"},
180 )
181 ActionsConcurrencyQueuedTotal = prometheus.NewCounter(
182 prometheus.CounterOpts{
183 Name: "shithub_actions_concurrency_queued_total",
184 Help: "Total Actions workflow runs queued behind an older active run in the same concurrency group.",
185 },
186 )
187 ActionsLogScrubReplacementsTotal = prometheus.NewCounterVec(
188 prometheus.CounterOpts{
189 Name: "shithub_actions_log_scrub_replacements_total",
190 Help: "Total exact secret-value replacements performed on Actions log chunks.",
191 },
192 []string{"location"},
193 )
194 ActionsLogChunksTotal = prometheus.NewCounterVec(
195 prometheus.CounterOpts{
196 Name: "shithub_actions_log_chunks_total",
197 Help: "Total Actions log chunks accepted by location.",
198 },
199 []string{"location"},
200 )
201 ActionsLogChunkBytesTotal = prometheus.NewCounterVec(
202 prometheus.CounterOpts{
203 Name: "shithub_actions_log_chunk_bytes_total",
204 Help: "Total Actions log chunk bytes accepted by location before durable storage.",
205 },
206 []string{"location"},
207 )
208 ActionsRunsPrunedTotal = prometheus.NewCounterVec(
209 prometheus.CounterOpts{
210 Name: "shithub_actions_runs_pruned_total",
211 Help: "Total Actions retention deletions by kind (chunks, blobs, runs, jwt_used).",
212 },
213 []string{"kind"},
214 )
215 ActionsStepTimeoutsTotal = prometheus.NewCounter(
216 prometheus.CounterOpts{
217 Name: "shithub_actions_step_timeouts_total",
218 Help: "Total Actions steps reported as timed out by runners.",
219 },
220 )
221 ActionsQueueDepth = prometheus.NewGaugeVec(
222 prometheus.GaugeOpts{
223 Name: "shithub_actions_queue_depth",
224 Help: "Current queued Actions workflow items by resource (runs, jobs).",
225 },
226 []string{"resource"},
227 )
228 ActionsQueueDepthByLabels = prometheus.NewGaugeVec(
229 prometheus.GaugeOpts{
230 Name: "shithub_actions_queue_depth_by_labels",
231 Help: "Current queued Actions jobs by exact runs-on label expression.",
232 },
233 []string{"labels"},
234 )
235 ActionsActive = prometheus.NewGaugeVec(
236 prometheus.GaugeOpts{
237 Name: "shithub_actions_active",
238 Help: "Current running Actions workflow items by resource (runs, jobs).",
239 },
240 []string{"resource"},
241 )
242 ActionsJobClaimLatencySeconds = prometheus.NewHistogram(
243 prometheus.HistogramOpts{
244 Name: "shithub_actions_job_claim_latency_seconds",
245 Help: "Seconds between job enqueue and runner claim.",
246 Buckets: prometheus.ExponentialBuckets(0.1, 2.5, 12),
247 },
248 )
249 ActionsRunnerHeartbeatAgeSeconds = prometheus.NewGaugeVec(
250 prometheus.GaugeOpts{
251 Name: "shithub_actions_runner_heartbeat_age_seconds",
252 Help: "Seconds since each registered Actions runner last heartbeated. Offline runners that never heartbeated are omitted.",
253 },
254 []string{"runner", "status"},
255 )
256 ActionsRunnerOnline = prometheus.NewGaugeVec(
257 prometheus.GaugeOpts{
258 Name: "shithub_actions_runner_online",
259 Help: "Current runner online state by runner (1 online, 0 unavailable).",
260 },
261 []string{"runner"},
262 )
263 ActionsRunnerStaleTotal = prometheus.NewGauge(
264 prometheus.GaugeOpts{
265 Name: "shithub_actions_runner_stale_total",
266 Help: "Current count of non-revoked runners whose heartbeat is past the stale threshold.",
267 },
268 )
269 ActionsRunnerDraining = prometheus.NewGaugeVec(
270 prometheus.GaugeOpts{
271 Name: "shithub_actions_runner_draining",
272 Help: "Current runner drain state by runner (1 draining, 0 not draining).",
273 },
274 []string{"runner"},
275 )
276 ActionsRunnerCapacity = prometheus.NewGaugeVec(
277 prometheus.GaugeOpts{
278 Name: "shithub_actions_runner_capacity",
279 Help: "Configured Actions runner capacity by runner and status.",
280 },
281 []string{"runner", "status"},
282 )
283 ActionsRunnerRevocationsTotal = prometheus.NewCounter(
284 prometheus.CounterOpts{
285 Name: "shithub_actions_runner_revocations_total",
286 Help: "Total Actions runner hard revocations performed by operator tooling.",
287 },
288 )
289 ActionsStorageObjects = prometheus.NewGaugeVec(
290 prometheus.GaugeOpts{
291 Name: "shithub_actions_storage_objects",
292 Help: "Current durable Actions storage object count by kind.",
293 },
294 []string{"kind"},
295 )
296 ActionsStorageBytes = prometheus.NewGaugeVec(
297 prometheus.GaugeOpts{
298 Name: "shithub_actions_storage_bytes",
299 Help: "Current durable Actions storage byte count by kind.",
300 },
301 []string{"kind"},
302 )
303 )
304
305 func init() {
306 Registry.MustRegister(
307 HTTPRequestsTotal,
308 HTTPRequestDuration,
309 HTTPInFlight,
310 PanicsTotal,
311 DBConnsAcquired,
312 DBConnsIdle,
313 DBConnsTotal,
314 DBAcquireWaitDurationTotal,
315 WorkerJobsProcessedTotal,
316 WorkerJobDurationSeconds,
317 WorkerInFlight,
318 ActionsRunsEnqueuedTotal,
319 ActionsTriggerMatchDurationSeconds,
320 ActionsRunnerRegistrationsTotal,
321 ActionsRunnerHeartbeatsTotal,
322 ActionsRunnerJWTTotal,
323 ActionsJobsCancelledTotal,
324 ActionsRunsCompletedTotal,
325 ActionsRunDurationSeconds,
326 ActionsStepsCompletedTotal,
327 ActionsConcurrencyQueuedTotal,
328 ActionsLogScrubReplacementsTotal,
329 ActionsLogChunksTotal,
330 ActionsLogChunkBytesTotal,
331 ActionsRunsPrunedTotal,
332 ActionsStepTimeoutsTotal,
333 ActionsQueueDepth,
334 ActionsQueueDepthByLabels,
335 ActionsActive,
336 ActionsJobClaimLatencySeconds,
337 ActionsRunnerHeartbeatAgeSeconds,
338 ActionsRunnerOnline,
339 ActionsRunnerStaleTotal,
340 ActionsRunnerDraining,
341 ActionsRunnerCapacity,
342 ActionsRunnerRevocationsTotal,
343 ActionsStorageObjects,
344 ActionsStorageBytes,
345 )
346 }
347
348 // Handler returns the /metrics HTTP handler. When user/pass is set, the
349 // handler enforces HTTP Basic auth; otherwise it serves unauthenticated
350 // (S35 will tighten the policy).
351 //
352 // DisableCompression: promhttp gzips responses when the scraper sends
353 // Accept-Encoding: gzip. Alloy 1.16's Prometheus scraper advertises gzip
354 // but mishandles the Content-Encoding: gzip response (parses raw 0x1f
355 // magic byte as text, scrape fails with up=0). Bypass at the source —
356 // /metrics payload is small enough that wire savings are irrelevant.
357 // Skipping the chi Compress middleware on this route (handlers.go) is
358 // also necessary but not sufficient; promhttp does its own gzip layer.
359 func Handler(user, pass string) http.Handler {
360 h := promhttp.HandlerFor(Registry, promhttp.HandlerOpts{
361 Registry: Registry,
362 DisableCompression: true,
363 })
364 if user == "" && pass == "" {
365 return h
366 }
367 expectedUser := []byte(user)
368 expectedPass := []byte(pass)
369 return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
370 gotUser, gotPass, ok := r.BasicAuth()
371 if !ok ||
372 subtle.ConstantTimeCompare([]byte(gotUser), expectedUser) != 1 ||
373 subtle.ConstantTimeCompare([]byte(gotPass), expectedPass) != 1 {
374 w.Header().Set("WWW-Authenticate", `Basic realm="metrics"`)
375 http.Error(w, "unauthorized", http.StatusUnauthorized)
376 return
377 }
378 h.ServeHTTP(w, r)
379 })
380 }
381