Go · 7967 bytes Raw Blame History
1 // SPDX-License-Identifier: AGPL-3.0-or-later
2
3 // Package metrics owns the Prometheus registry. Standard metrics are
4 // instantiated up front; per-package metrics register against the same
5 // shared registry.
6 package metrics
7
import (
	"crypto/sha256"
	"crypto/subtle"
	"net/http"

	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/collectors"
	"github.com/prometheus/client_golang/prometheus/promhttp"
)
16
17 // Registry is the project-wide Prometheus registry. Subpackages register
18 // their collectors against this so /metrics has a single source.
19 var Registry = prometheus.NewRegistry()
20
21 // Standard process / Go runtime metrics.
22 func init() {
23 Registry.MustRegister(
24 collectors.NewGoCollector(),
25 collectors.NewProcessCollector(collectors.ProcessCollectorOpts{}),
26 )
27 }
28
29 // HTTP request metrics. Wired by the HTTP middleware.
30 var (
31 HTTPRequestsTotal = prometheus.NewCounterVec(
32 prometheus.CounterOpts{
33 Name: "shithub_http_requests_total",
34 Help: "Total HTTP requests by route, method, and status.",
35 },
36 []string{"route", "method", "status"},
37 )
38 HTTPRequestDuration = prometheus.NewHistogramVec(
39 prometheus.HistogramOpts{
40 Name: "shithub_http_request_duration_seconds",
41 Help: "HTTP request duration distribution by route and method.",
42 Buckets: prometheus.ExponentialBuckets(0.001, 2.5, 12),
43 },
44 []string{"route", "method"},
45 )
46 HTTPInFlight = prometheus.NewGauge(
47 prometheus.GaugeOpts{
48 Name: "shithub_http_in_flight",
49 Help: "Number of HTTP requests currently in flight.",
50 },
51 )
52 PanicsTotal = prometheus.NewCounter(
53 prometheus.CounterOpts{
54 Name: "shithub_panics_total",
55 Help: "Total panics caught by the recover middleware.",
56 },
57 )
58 )
59
60 // DB pool metrics. Updated periodically by an observer goroutine that the
61 // caller starts via Observe(pool, interval).
62 var (
63 DBConnsAcquired = prometheus.NewGauge(
64 prometheus.GaugeOpts{
65 Name: "shithub_db_pool_acquired",
66 Help: "Postgres connections currently checked out of the pool.",
67 },
68 )
69 DBConnsIdle = prometheus.NewGauge(
70 prometheus.GaugeOpts{
71 Name: "shithub_db_pool_idle",
72 Help: "Postgres connections currently idle in the pool.",
73 },
74 )
75 DBConnsTotal = prometheus.NewGauge(
76 prometheus.GaugeOpts{
77 Name: "shithub_db_pool_total",
78 Help: "Postgres connections currently held by the pool.",
79 },
80 )
81 DBAcquireWaitDurationTotal = prometheus.NewCounter(
82 prometheus.CounterOpts{
83 Name: "shithub_db_pool_acquire_wait_seconds_total",
84 Help: "Cumulative time clients spent waiting to acquire a Postgres connection.",
85 },
86 )
87 )
88
89 // Worker metrics. The pool updates these on every dispatch.
90 var (
91 WorkerJobsProcessedTotal = prometheus.NewCounterVec(
92 prometheus.CounterOpts{
93 Name: "shithub_worker_jobs_processed_total",
94 Help: "Worker jobs processed by kind and outcome (ok, retry, failed, poison).",
95 },
96 []string{"kind", "outcome"},
97 )
98 WorkerJobDurationSeconds = prometheus.NewHistogramVec(
99 prometheus.HistogramOpts{
100 Name: "shithub_worker_job_duration_seconds",
101 Help: "Worker handler latency by kind.",
102 Buckets: prometheus.ExponentialBuckets(0.005, 2.5, 12),
103 },
104 []string{"kind"},
105 )
106 WorkerInFlight = prometheus.NewGaugeVec(
107 prometheus.GaugeOpts{
108 Name: "shithub_worker_in_flight",
109 Help: "Worker handler invocations currently in flight by kind.",
110 },
111 []string{"kind"},
112 )
113 )
114
115 // Actions trigger pipeline metrics (S41b). Incremented from
116 // internal/actions/trigger.
117 var (
118 ActionsRunsEnqueuedTotal = prometheus.NewCounterVec(
119 prometheus.CounterOpts{
120 Name: "shithub_actions_runs_enqueued_total",
121 Help: "Total workflow runs enqueued by triggering event kind. Result is 'fresh' for new runs or 'already_exists' when ON CONFLICT noop'd.",
122 },
123 []string{"event", "result"},
124 )
125 ActionsTriggerMatchDurationSeconds = prometheus.NewHistogram(
126 prometheus.HistogramOpts{
127 Name: "shithub_actions_trigger_match_duration_seconds",
128 Help: "Wall-clock time spent in the trigger handler discovering + parsing + matching workflows for one triggering event.",
129 Buckets: prometheus.ExponentialBuckets(0.005, 2.0, 12),
130 },
131 )
132 ActionsRunnerRegistrationsTotal = prometheus.NewCounter(
133 prometheus.CounterOpts{
134 Name: "shithub_actions_runner_registrations_total",
135 Help: "Total Actions runners registered through operator tooling.",
136 },
137 )
138 ActionsRunnerHeartbeatsTotal = prometheus.NewCounterVec(
139 prometheus.CounterOpts{
140 Name: "shithub_actions_runner_heartbeats_total",
141 Help: "Total runner heartbeats by result (claimed, no_job).",
142 },
143 []string{"result"},
144 )
145 ActionsRunnerJWTTotal = prometheus.NewCounterVec(
146 prometheus.CounterOpts{
147 Name: "shithub_actions_runner_jwt_total",
148 Help: "Total runner job JWT outcomes by result (issued, rejected, replay).",
149 },
150 []string{"result"},
151 )
152 ActionsJobsCancelledTotal = prometheus.NewCounterVec(
153 prometheus.CounterOpts{
154 Name: "shithub_actions_jobs_cancelled_total",
155 Help: "Total Actions job cancellation requests by reason (user, concurrency, timeout).",
156 },
157 []string{"reason"},
158 )
159 ActionsConcurrencyQueuedTotal = prometheus.NewCounter(
160 prometheus.CounterOpts{
161 Name: "shithub_actions_concurrency_queued_total",
162 Help: "Total Actions workflow runs queued behind an older active run in the same concurrency group.",
163 },
164 )
165 ActionsLogScrubReplacementsTotal = prometheus.NewCounterVec(
166 prometheus.CounterOpts{
167 Name: "shithub_actions_log_scrub_replacements_total",
168 Help: "Total exact secret-value replacements performed on Actions log chunks.",
169 },
170 []string{"location"},
171 )
172 ActionsRunsPrunedTotal = prometheus.NewCounterVec(
173 prometheus.CounterOpts{
174 Name: "shithub_actions_runs_pruned_total",
175 Help: "Total Actions retention deletions by kind (chunks, blobs, runs, jwt_used).",
176 },
177 []string{"kind"},
178 )
179 ActionsStepTimeoutsTotal = prometheus.NewCounter(
180 prometheus.CounterOpts{
181 Name: "shithub_actions_step_timeouts_total",
182 Help: "Total Actions steps reported as timed out by runners.",
183 },
184 )
185 )
186
187 func init() {
188 Registry.MustRegister(
189 HTTPRequestsTotal,
190 HTTPRequestDuration,
191 HTTPInFlight,
192 PanicsTotal,
193 DBConnsAcquired,
194 DBConnsIdle,
195 DBConnsTotal,
196 DBAcquireWaitDurationTotal,
197 WorkerJobsProcessedTotal,
198 WorkerJobDurationSeconds,
199 WorkerInFlight,
200 ActionsRunsEnqueuedTotal,
201 ActionsTriggerMatchDurationSeconds,
202 ActionsRunnerRegistrationsTotal,
203 ActionsRunnerHeartbeatsTotal,
204 ActionsRunnerJWTTotal,
205 ActionsJobsCancelledTotal,
206 ActionsConcurrencyQueuedTotal,
207 ActionsLogScrubReplacementsTotal,
208 ActionsRunsPrunedTotal,
209 ActionsStepTimeoutsTotal,
210 )
211 }
212
213 // Handler returns the /metrics HTTP handler. When user/pass is set, the
214 // handler enforces HTTP Basic auth; otherwise it serves unauthenticated
215 // (S35 will tighten the policy).
216 //
217 // DisableCompression: promhttp gzips responses when the scraper sends
218 // Accept-Encoding: gzip. Alloy 1.16's Prometheus scraper advertises gzip
219 // but mishandles the Content-Encoding: gzip response (parses raw 0x1f
220 // magic byte as text, scrape fails with up=0). Bypass at the source —
221 // /metrics payload is small enough that wire savings are irrelevant.
222 // Skipping the chi Compress middleware on this route (handlers.go) is
223 // also necessary but not sufficient; promhttp does its own gzip layer.
224 func Handler(user, pass string) http.Handler {
225 h := promhttp.HandlerFor(Registry, promhttp.HandlerOpts{
226 Registry: Registry,
227 DisableCompression: true,
228 })
229 if user == "" && pass == "" {
230 return h
231 }
232 expectedUser := []byte(user)
233 expectedPass := []byte(pass)
234 return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
235 gotUser, gotPass, ok := r.BasicAuth()
236 if !ok ||
237 subtle.ConstantTimeCompare([]byte(gotUser), expectedUser) != 1 ||
238 subtle.ConstantTimeCompare([]byte(gotPass), expectedPass) != 1 {
239 w.Header().Set("WWW-Authenticate", `Basic realm="metrics"`)
240 http.Error(w, "unauthorized", http.StatusUnauthorized)
241 return
242 }
243 h.ServeHTTP(w, r)
244 })
245 }
246