| 1 | // SPDX-License-Identifier: AGPL-3.0-or-later |
| 2 | |
| 3 | // Package metrics owns the Prometheus registry. Standard metrics are |
| 4 | // instantiated up front; per-package metrics register against the same |
| 5 | // shared registry. |
| 6 | package metrics |
| 7 | |
import (
	"crypto/sha256"
	"crypto/subtle"
	"net/http"

	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/collectors"
	"github.com/prometheus/client_golang/prometheus/promhttp"
)
| 16 | |
// Registry is the project-wide Prometheus registry. Subpackages register
// their collectors against this so /metrics has a single source.
//
// A dedicated registry (rather than prometheus.DefaultRegisterer) means
// only collectors explicitly registered here appear on /metrics.
var Registry = prometheus.NewRegistry()
| 20 | |
| 21 | // Standard process / Go runtime metrics. |
| 22 | func init() { |
| 23 | Registry.MustRegister( |
| 24 | collectors.NewGoCollector(), |
| 25 | collectors.NewProcessCollector(collectors.ProcessCollectorOpts{}), |
| 26 | ) |
| 27 | } |
| 28 | |
// HTTP request metrics. Wired by the HTTP middleware.
var (
	// HTTPRequestsTotal counts completed requests, labeled by matched
	// route pattern, HTTP method, and response status code.
	HTTPRequestsTotal = prometheus.NewCounterVec(
		prometheus.CounterOpts{
			Name: "shithub_http_requests_total",
			Help: "Total HTTP requests by route, method, and status.",
		},
		[]string{"route", "method", "status"},
	)
	// HTTPRequestDuration observes per-request latency. Buckets span
	// roughly 1ms to ~60s (0.001 * 2.5^11), covering fast cache hits
	// through slow git operations.
	HTTPRequestDuration = prometheus.NewHistogramVec(
		prometheus.HistogramOpts{
			Name:    "shithub_http_request_duration_seconds",
			Help:    "HTTP request duration distribution by route and method.",
			Buckets: prometheus.ExponentialBuckets(0.001, 2.5, 12),
		},
		[]string{"route", "method"},
	)
	// HTTPInFlight is incremented on request entry and decremented on
	// completion by the middleware.
	HTTPInFlight = prometheus.NewGauge(
		prometheus.GaugeOpts{
			Name: "shithub_http_in_flight",
			Help: "Number of HTTP requests currently in flight.",
		},
	)
	// PanicsTotal counts handler panics intercepted by the recover
	// middleware; any nonzero rate warrants investigation.
	PanicsTotal = prometheus.NewCounter(
		prometheus.CounterOpts{
			Name: "shithub_panics_total",
			Help: "Total panics caught by the recover middleware.",
		},
	)
)
| 59 | |
// DB pool metrics. Updated periodically by an observer goroutine that the
// caller starts via Observe(pool, interval). These are gauges sampled from
// pool stats, not event counters, so values between samples are unseen.
var (
	// DBConnsAcquired: connections currently checked out (busy).
	DBConnsAcquired = prometheus.NewGauge(
		prometheus.GaugeOpts{
			Name: "shithub_db_pool_acquired",
			Help: "Postgres connections currently checked out of the pool.",
		},
	)
	// DBConnsIdle: connections open but unused.
	DBConnsIdle = prometheus.NewGauge(
		prometheus.GaugeOpts{
			Name: "shithub_db_pool_idle",
			Help: "Postgres connections currently idle in the pool.",
		},
	)
	// DBConnsTotal: all connections the pool currently holds
	// (acquired + idle + any being constructed).
	DBConnsTotal = prometheus.NewGauge(
		prometheus.GaugeOpts{
			Name: "shithub_db_pool_total",
			Help: "Postgres connections currently held by the pool.",
		},
	)
	// DBAcquireWaitDurationTotal is a monotonically increasing total of
	// acquire-wait seconds; use rate() over it to detect pool exhaustion.
	DBAcquireWaitDurationTotal = prometheus.NewCounter(
		prometheus.CounterOpts{
			Name: "shithub_db_pool_acquire_wait_seconds_total",
			Help: "Cumulative time clients spent waiting to acquire a Postgres connection.",
		},
	)
)
| 88 | |
// Worker metrics. The pool updates these on every dispatch.
var (
	// WorkerJobsProcessedTotal counts finished jobs by job kind and
	// terminal outcome (ok, retry, failed, poison).
	WorkerJobsProcessedTotal = prometheus.NewCounterVec(
		prometheus.CounterOpts{
			Name: "shithub_worker_jobs_processed_total",
			Help: "Worker jobs processed by kind and outcome (ok, retry, failed, poison).",
		},
		[]string{"kind", "outcome"},
	)
	// WorkerJobDurationSeconds observes handler latency per kind. Buckets
	// span roughly 5ms to ~5min (0.005 * 2.5^11) — worker jobs are slower
	// than HTTP requests, hence the higher floor than HTTPRequestDuration.
	WorkerJobDurationSeconds = prometheus.NewHistogramVec(
		prometheus.HistogramOpts{
			Name:    "shithub_worker_job_duration_seconds",
			Help:    "Worker handler latency by kind.",
			Buckets: prometheus.ExponentialBuckets(0.005, 2.5, 12),
		},
		[]string{"kind"},
	)
	// WorkerInFlight tracks concurrently executing handlers per kind.
	WorkerInFlight = prometheus.NewGaugeVec(
		prometheus.GaugeOpts{
			Name: "shithub_worker_in_flight",
			Help: "Worker handler invocations currently in flight by kind.",
		},
		[]string{"kind"},
	)
)
| 114 | |
// Actions trigger pipeline metrics (S41b). Incremented from
// internal/actions/trigger.
var (
	// ActionsRunsEnqueuedTotal counts enqueue attempts; "result" separates
	// fresh inserts from idempotent replays (ON CONFLICT no-op).
	ActionsRunsEnqueuedTotal = prometheus.NewCounterVec(
		prometheus.CounterOpts{
			Name: "shithub_actions_runs_enqueued_total",
			Help: "Total workflow runs enqueued by triggering event kind. Result is 'fresh' for new runs or 'already_exists' when ON CONFLICT noop'd.",
		},
		[]string{"event", "result"},
	)
	// ActionsTriggerMatchDurationSeconds times one full trigger evaluation
	// (discover + parse + match). Buckets span 5ms to ~10s (0.005 * 2^11).
	ActionsTriggerMatchDurationSeconds = prometheus.NewHistogram(
		prometheus.HistogramOpts{
			Name:    "shithub_actions_trigger_match_duration_seconds",
			Help:    "Wall-clock time spent in the trigger handler discovering + parsing + matching workflows for one triggering event.",
			Buckets: prometheus.ExponentialBuckets(0.005, 2.0, 12),
		},
	)
	// ActionsRunnerRegistrationsTotal counts runner registrations via
	// operator tooling.
	ActionsRunnerRegistrationsTotal = prometheus.NewCounter(
		prometheus.CounterOpts{
			Name: "shithub_actions_runner_registrations_total",
			Help: "Total Actions runners registered through operator tooling.",
		},
	)
	// ActionsRunnerHeartbeatsTotal counts heartbeat polls; "claimed" means
	// a job was handed out, "no_job" means the queue was empty.
	ActionsRunnerHeartbeatsTotal = prometheus.NewCounterVec(
		prometheus.CounterOpts{
			Name: "shithub_actions_runner_heartbeats_total",
			Help: "Total runner heartbeats by result (claimed, no_job).",
		},
		[]string{"result"},
	)
	// ActionsRunnerJWTTotal tracks job-token issuance and rejection;
	// "replay" flags a token presented more than once.
	ActionsRunnerJWTTotal = prometheus.NewCounterVec(
		prometheus.CounterOpts{
			Name: "shithub_actions_runner_jwt_total",
			Help: "Total runner job JWT outcomes by result (issued, rejected, replay).",
		},
		[]string{"result"},
	)
	// ActionsJobsCancelledTotal counts cancellation requests by origin.
	ActionsJobsCancelledTotal = prometheus.NewCounterVec(
		prometheus.CounterOpts{
			Name: "shithub_actions_jobs_cancelled_total",
			Help: "Total Actions job cancellation requests by reason (user, concurrency, timeout).",
		},
		[]string{"reason"},
	)
	// ActionsConcurrencyQueuedTotal counts runs parked behind an older
	// active run in the same concurrency group.
	ActionsConcurrencyQueuedTotal = prometheus.NewCounter(
		prometheus.CounterOpts{
			Name: "shithub_actions_concurrency_queued_total",
			Help: "Total Actions workflow runs queued behind an older active run in the same concurrency group.",
		},
	)
	// ActionsLogScrubReplacementsTotal counts exact-match secret redactions
	// applied to log chunks, labeled by where the match occurred.
	ActionsLogScrubReplacementsTotal = prometheus.NewCounterVec(
		prometheus.CounterOpts{
			Name: "shithub_actions_log_scrub_replacements_total",
			Help: "Total exact secret-value replacements performed on Actions log chunks.",
		},
		[]string{"location"},
	)
	// ActionsRunsPrunedTotal counts rows removed by the retention job,
	// broken down by table/artifact kind.
	ActionsRunsPrunedTotal = prometheus.NewCounterVec(
		prometheus.CounterOpts{
			Name: "shithub_actions_runs_pruned_total",
			Help: "Total Actions retention deletions by kind (chunks, blobs, runs, jwt_used).",
		},
		[]string{"kind"},
	)
	// ActionsStepTimeoutsTotal counts runner-reported step timeouts.
	ActionsStepTimeoutsTotal = prometheus.NewCounter(
		prometheus.CounterOpts{
			Name: "shithub_actions_step_timeouts_total",
			Help: "Total Actions steps reported as timed out by runners.",
		},
	)
)
| 186 | |
| 187 | func init() { |
| 188 | Registry.MustRegister( |
| 189 | HTTPRequestsTotal, |
| 190 | HTTPRequestDuration, |
| 191 | HTTPInFlight, |
| 192 | PanicsTotal, |
| 193 | DBConnsAcquired, |
| 194 | DBConnsIdle, |
| 195 | DBConnsTotal, |
| 196 | DBAcquireWaitDurationTotal, |
| 197 | WorkerJobsProcessedTotal, |
| 198 | WorkerJobDurationSeconds, |
| 199 | WorkerInFlight, |
| 200 | ActionsRunsEnqueuedTotal, |
| 201 | ActionsTriggerMatchDurationSeconds, |
| 202 | ActionsRunnerRegistrationsTotal, |
| 203 | ActionsRunnerHeartbeatsTotal, |
| 204 | ActionsRunnerJWTTotal, |
| 205 | ActionsJobsCancelledTotal, |
| 206 | ActionsConcurrencyQueuedTotal, |
| 207 | ActionsLogScrubReplacementsTotal, |
| 208 | ActionsRunsPrunedTotal, |
| 209 | ActionsStepTimeoutsTotal, |
| 210 | ) |
| 211 | } |
| 212 | |
| 213 | // Handler returns the /metrics HTTP handler. When user/pass is set, the |
| 214 | // handler enforces HTTP Basic auth; otherwise it serves unauthenticated |
| 215 | // (S35 will tighten the policy). |
| 216 | // |
| 217 | // DisableCompression: promhttp gzips responses when the scraper sends |
| 218 | // Accept-Encoding: gzip. Alloy 1.16's Prometheus scraper advertises gzip |
| 219 | // but mishandles the Content-Encoding: gzip response (parses raw 0x1f |
| 220 | // magic byte as text, scrape fails with up=0). Bypass at the source — |
| 221 | // /metrics payload is small enough that wire savings are irrelevant. |
| 222 | // Skipping the chi Compress middleware on this route (handlers.go) is |
| 223 | // also necessary but not sufficient; promhttp does its own gzip layer. |
| 224 | func Handler(user, pass string) http.Handler { |
| 225 | h := promhttp.HandlerFor(Registry, promhttp.HandlerOpts{ |
| 226 | Registry: Registry, |
| 227 | DisableCompression: true, |
| 228 | }) |
| 229 | if user == "" && pass == "" { |
| 230 | return h |
| 231 | } |
| 232 | expectedUser := []byte(user) |
| 233 | expectedPass := []byte(pass) |
| 234 | return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { |
| 235 | gotUser, gotPass, ok := r.BasicAuth() |
| 236 | if !ok || |
| 237 | subtle.ConstantTimeCompare([]byte(gotUser), expectedUser) != 1 || |
| 238 | subtle.ConstantTimeCompare([]byte(gotPass), expectedPass) != 1 { |
| 239 | w.Header().Set("WWW-Authenticate", `Basic realm="metrics"`) |
| 240 | http.Error(w, "unauthorized", http.StatusUnauthorized) |
| 241 | return |
| 242 | } |
| 243 | h.ServeHTTP(w, r) |
| 244 | }) |
| 245 | } |
| 246 |