Go · 7459 bytes Raw Blame History
1 // SPDX-License-Identifier: AGPL-3.0-or-later
2
3 // Package metrics owns the Prometheus registry. Standard metrics are
4 // instantiated up front; per-package metrics register against the same
5 // shared registry.
6 package metrics
7
8 import (
9 "crypto/subtle"
10 "net/http"
11
12 "github.com/prometheus/client_golang/prometheus"
13 "github.com/prometheus/client_golang/prometheus/collectors"
14 "github.com/prometheus/client_golang/prometheus/promhttp"
15 )
16
17 // Registry is the project-wide Prometheus registry. Subpackages register
18 // their collectors against this so /metrics has a single source.
19 var Registry = prometheus.NewRegistry()
20
21 // Standard process / Go runtime metrics.
22 func init() {
23 Registry.MustRegister(
24 collectors.NewGoCollector(),
25 collectors.NewProcessCollector(collectors.ProcessCollectorOpts{}),
26 )
27 }
28
29 // HTTP request metrics. Wired by the HTTP middleware.
30 var (
31 HTTPRequestsTotal = prometheus.NewCounterVec(
32 prometheus.CounterOpts{
33 Name: "shithub_http_requests_total",
34 Help: "Total HTTP requests by route, method, and status.",
35 },
36 []string{"route", "method", "status"},
37 )
38 HTTPRequestDuration = prometheus.NewHistogramVec(
39 prometheus.HistogramOpts{
40 Name: "shithub_http_request_duration_seconds",
41 Help: "HTTP request duration distribution by route and method.",
42 Buckets: prometheus.ExponentialBuckets(0.001, 2.5, 12),
43 },
44 []string{"route", "method"},
45 )
46 HTTPInFlight = prometheus.NewGauge(
47 prometheus.GaugeOpts{
48 Name: "shithub_http_in_flight",
49 Help: "Number of HTTP requests currently in flight.",
50 },
51 )
52 PanicsTotal = prometheus.NewCounter(
53 prometheus.CounterOpts{
54 Name: "shithub_panics_total",
55 Help: "Total panics caught by the recover middleware.",
56 },
57 )
58 )
59
60 // DB pool metrics. Updated periodically by an observer goroutine that the
61 // caller starts via Observe(pool, interval).
62 var (
63 DBConnsAcquired = prometheus.NewGauge(
64 prometheus.GaugeOpts{
65 Name: "shithub_db_pool_acquired",
66 Help: "Postgres connections currently checked out of the pool.",
67 },
68 )
69 DBConnsIdle = prometheus.NewGauge(
70 prometheus.GaugeOpts{
71 Name: "shithub_db_pool_idle",
72 Help: "Postgres connections currently idle in the pool.",
73 },
74 )
75 DBConnsTotal = prometheus.NewGauge(
76 prometheus.GaugeOpts{
77 Name: "shithub_db_pool_total",
78 Help: "Postgres connections currently held by the pool.",
79 },
80 )
81 DBAcquireWaitDurationTotal = prometheus.NewCounter(
82 prometheus.CounterOpts{
83 Name: "shithub_db_pool_acquire_wait_seconds_total",
84 Help: "Cumulative time clients spent waiting to acquire a Postgres connection.",
85 },
86 )
87 )
88
89 // Worker metrics. The pool updates these on every dispatch.
90 var (
91 WorkerJobsProcessedTotal = prometheus.NewCounterVec(
92 prometheus.CounterOpts{
93 Name: "shithub_worker_jobs_processed_total",
94 Help: "Worker jobs processed by kind and outcome (ok, retry, failed, poison).",
95 },
96 []string{"kind", "outcome"},
97 )
98 WorkerJobDurationSeconds = prometheus.NewHistogramVec(
99 prometheus.HistogramOpts{
100 Name: "shithub_worker_job_duration_seconds",
101 Help: "Worker handler latency by kind.",
102 Buckets: prometheus.ExponentialBuckets(0.005, 2.5, 12),
103 },
104 []string{"kind"},
105 )
106 WorkerInFlight = prometheus.NewGaugeVec(
107 prometheus.GaugeOpts{
108 Name: "shithub_worker_in_flight",
109 Help: "Worker handler invocations currently in flight by kind.",
110 },
111 []string{"kind"},
112 )
113 )
114
115 // Actions trigger pipeline metrics (S41b). Incremented from
116 // internal/actions/trigger.
117 var (
118 ActionsRunsEnqueuedTotal = prometheus.NewCounterVec(
119 prometheus.CounterOpts{
120 Name: "shithub_actions_runs_enqueued_total",
121 Help: "Total workflow runs enqueued by triggering event kind. Result is 'fresh' for new runs or 'already_exists' when ON CONFLICT noop'd.",
122 },
123 []string{"event", "result"},
124 )
125 ActionsTriggerMatchDurationSeconds = prometheus.NewHistogram(
126 prometheus.HistogramOpts{
127 Name: "shithub_actions_trigger_match_duration_seconds",
128 Help: "Wall-clock time spent in the trigger handler discovering + parsing + matching workflows for one triggering event.",
129 Buckets: prometheus.ExponentialBuckets(0.005, 2.0, 12),
130 },
131 )
132 ActionsRunnerRegistrationsTotal = prometheus.NewCounter(
133 prometheus.CounterOpts{
134 Name: "shithub_actions_runner_registrations_total",
135 Help: "Total Actions runners registered through operator tooling.",
136 },
137 )
138 ActionsRunnerHeartbeatsTotal = prometheus.NewCounterVec(
139 prometheus.CounterOpts{
140 Name: "shithub_actions_runner_heartbeats_total",
141 Help: "Total runner heartbeats by result (claimed, no_job).",
142 },
143 []string{"result"},
144 )
145 ActionsRunnerJWTTotal = prometheus.NewCounterVec(
146 prometheus.CounterOpts{
147 Name: "shithub_actions_runner_jwt_total",
148 Help: "Total runner job JWT outcomes by result (issued, rejected, replay).",
149 },
150 []string{"result"},
151 )
152 ActionsJobsCancelledTotal = prometheus.NewCounterVec(
153 prometheus.CounterOpts{
154 Name: "shithub_actions_jobs_cancelled_total",
155 Help: "Total Actions job cancellation requests by reason (user, concurrency, timeout).",
156 },
157 []string{"reason"},
158 )
159 ActionsLogScrubReplacementsTotal = prometheus.NewCounterVec(
160 prometheus.CounterOpts{
161 Name: "shithub_actions_log_scrub_replacements_total",
162 Help: "Total exact secret-value replacements performed on Actions log chunks.",
163 },
164 []string{"location"},
165 )
166 ActionsRunsPrunedTotal = prometheus.NewCounterVec(
167 prometheus.CounterOpts{
168 Name: "shithub_actions_runs_pruned_total",
169 Help: "Total Actions retention deletions by kind (chunks, blobs, runs, jwt_used).",
170 },
171 []string{"kind"},
172 )
173 )
174
175 func init() {
176 Registry.MustRegister(
177 HTTPRequestsTotal,
178 HTTPRequestDuration,
179 HTTPInFlight,
180 PanicsTotal,
181 DBConnsAcquired,
182 DBConnsIdle,
183 DBConnsTotal,
184 DBAcquireWaitDurationTotal,
185 WorkerJobsProcessedTotal,
186 WorkerJobDurationSeconds,
187 WorkerInFlight,
188 ActionsRunsEnqueuedTotal,
189 ActionsTriggerMatchDurationSeconds,
190 ActionsRunnerRegistrationsTotal,
191 ActionsRunnerHeartbeatsTotal,
192 ActionsRunnerJWTTotal,
193 ActionsJobsCancelledTotal,
194 ActionsLogScrubReplacementsTotal,
195 ActionsRunsPrunedTotal,
196 )
197 }
198
199 // Handler returns the /metrics HTTP handler. When user/pass is set, the
200 // handler enforces HTTP Basic auth; otherwise it serves unauthenticated
201 // (S35 will tighten the policy).
202 //
203 // DisableCompression: promhttp gzips responses when the scraper sends
204 // Accept-Encoding: gzip. Alloy 1.16's Prometheus scraper advertises gzip
205 // but mishandles the Content-Encoding: gzip response (parses raw 0x1f
206 // magic byte as text, scrape fails with up=0). Bypass at the source —
207 // /metrics payload is small enough that wire savings are irrelevant.
208 // Skipping the chi Compress middleware on this route (handlers.go) is
209 // also necessary but not sufficient; promhttp does its own gzip layer.
210 func Handler(user, pass string) http.Handler {
211 h := promhttp.HandlerFor(Registry, promhttp.HandlerOpts{
212 Registry: Registry,
213 DisableCompression: true,
214 })
215 if user == "" && pass == "" {
216 return h
217 }
218 expectedUser := []byte(user)
219 expectedPass := []byte(pass)
220 return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
221 gotUser, gotPass, ok := r.BasicAuth()
222 if !ok ||
223 subtle.ConstantTimeCompare([]byte(gotUser), expectedUser) != 1 ||
224 subtle.ConstantTimeCompare([]byte(gotPass), expectedPass) != 1 {
225 w.Header().Set("WWW-Authenticate", `Basic realm="metrics"`)
226 http.Error(w, "unauthorized", http.StatusUnauthorized)
227 return
228 }
229 h.ServeHTTP(w, r)
230 })
231 }
232