tenseleyflow/shithub / 4340aea

Browse files

S37: monitoring — alertmanager + loki + grafana dashboard

Authored by espadonne
SHA
4340aeadd19c310641a4f8e5797821dcdfde503c
Parents
ae13c49
Tree
df566bf

3 changed files

Status  File  +  -
A deploy/monitoring/alertmanager/alertmanager.yml 40 0
A deploy/monitoring/grafana/dashboards/shithubd-overview.json 116 0
A deploy/monitoring/loki/loki-config.yaml 56 0
deploy/monitoring/alertmanager/alertmanager.yml added
@@ -0,0 +1,40 @@
1
+# Alertmanager — routes pages to the operator's pager and ticket-
2
+# severity alerts to email. The pager webhook URL is read from a
3
+# file fed by the operator's secret store; never commit a real one here.
4
+
5
+global:
6
+  resolve_timeout: 5m
7
+
8
+route:
9
+  receiver: tickets
10
+  group_by: [alertname, service]
11
+  group_wait: 30s
12
+  group_interval: 5m
13
+  repeat_interval: 6h
14
+  routes:
15
+    - matchers: [severity="page"]
16
+      receiver: pager
17
+      group_wait: 10s
18
+      repeat_interval: 1h
19
+
20
+receivers:
21
+  - name: tickets
22
+    email_configs:
23
+      - to: ops@shithub.example
24
+        from: alertmanager@shithub.example
25
+        smarthost: smtp.shithub.example:587
26
+        auth_username: alertmanager@shithub.example
27
+        auth_password_file: /etc/alertmanager/smtp.password
28
+        require_tls: true
29
+        send_resolved: true
30
+
31
+  - name: pager
32
+    webhook_configs:
33
+      - url_file: /etc/alertmanager/pager.url
34
+        send_resolved: true
35
+
36
+inhibit_rules:
37
+  # If a cluster's whole web tier is down, suppress its per-route latency noise.
38
+  - source_matchers: [alertname="ShithubdWebDown"]
39
+    target_matchers: [alertname="HighRequestLatencyP95"]
40
+    equal: [cluster]
deploy/monitoring/grafana/dashboards/shithubd-overview.json added
@@ -0,0 +1,116 @@
1
+{
2
+  "uid": "shithubd-overview",
3
+  "title": "shithubd — overview",
4
+  "tags": ["shithubd"],
5
+  "timezone": "browser",
6
+  "schemaVersion": 39,
7
+  "version": 1,
8
+  "refresh": "30s",
9
+  "time": {"from": "now-6h", "to": "now"},
10
+  "templating": {
11
+    "list": [
12
+      {
13
+        "name": "instance",
14
+        "type": "query",
15
+        "datasource": "Prometheus",
16
+        "query": "label_values(up{job=\"shithubd-web\"}, instance)",
17
+        "includeAll": true,
18
+        "multi": true
19
+      }
20
+    ]
21
+  },
22
+  "panels": [
23
+    {
24
+      "id": 1,
25
+      "type": "stat",
26
+      "title": "Web up",
27
+      "gridPos": {"x": 0, "y": 0, "w": 4, "h": 4},
28
+      "targets": [{"expr": "sum(up{job=\"shithubd-web\"})", "refId": "A"}]
29
+    },
30
+    {
31
+      "id": 2,
32
+      "type": "stat",
33
+      "title": "Worker up",
34
+      "gridPos": {"x": 4, "y": 0, "w": 4, "h": 4},
35
+      "targets": [{"expr": "sum(up{job=\"shithubd-worker\"})", "refId": "A"}]
36
+    },
37
+    {
38
+      "id": 3,
39
+      "type": "stat",
40
+      "title": "Postgres up",
41
+      "gridPos": {"x": 8, "y": 0, "w": 4, "h": 4},
42
+      "targets": [{"expr": "up{job=\"postgres\"}", "refId": "A"}]
43
+    },
44
+    {
45
+      "id": 4,
46
+      "type": "stat",
47
+      "title": "Job queue depth",
48
+      "gridPos": {"x": 12, "y": 0, "w": 4, "h": 4},
49
+      "targets": [{"expr": "shithubd_job_queue_depth", "refId": "A"}],
50
+      "fieldConfig": {
51
+        "defaults": {
52
+          "thresholds": {
53
+            "mode": "absolute",
54
+            "steps": [
55
+              {"color": "green", "value": null},
56
+              {"color": "yellow", "value": 1000},
57
+              {"color": "red", "value": 5000}
58
+            ]
59
+          }
60
+        }
61
+      }
62
+    },
63
+    {
64
+      "id": 5,
65
+      "type": "timeseries",
66
+      "title": "Request rate by route",
67
+      "gridPos": {"x": 0, "y": 4, "w": 12, "h": 8},
68
+      "targets": [
69
+        {
70
+          "expr": "sum(rate(http_requests_total{instance=~\"$instance\"}[1m])) by (route)",
71
+          "legendFormat": "{{route}}",
72
+          "refId": "A"
73
+        }
74
+      ]
75
+    },
76
+    {
77
+      "id": 6,
78
+      "type": "timeseries",
79
+      "title": "p95 latency by route",
80
+      "gridPos": {"x": 12, "y": 4, "w": 12, "h": 8},
81
+      "targets": [
82
+        {
83
+          "expr": "histogram_quantile(0.95, sum(rate(http_request_duration_seconds_bucket{instance=~\"$instance\"}[5m])) by (route, le))",
84
+          "legendFormat": "{{route}}",
85
+          "refId": "A"
86
+        }
87
+      ]
88
+    },
89
+    {
90
+      "id": 7,
91
+      "type": "timeseries",
92
+      "title": "DB calls/sec",
93
+      "gridPos": {"x": 0, "y": 12, "w": 12, "h": 8},
94
+      "targets": [
95
+        {
96
+          "expr": "sum(rate(pg_stat_statements_calls_total[1m]))",
97
+          "legendFormat": "calls/sec",
98
+          "refId": "A"
99
+        }
100
+      ]
101
+    },
102
+    {
103
+      "id": 8,
104
+      "type": "timeseries",
105
+      "title": "Webhook deliveries (success vs failure)",
106
+      "gridPos": {"x": 12, "y": 12, "w": 12, "h": 8},
107
+      "targets": [
108
+        {
109
+          "expr": "sum(rate(shithubd_webhook_deliveries_total[1m])) by (result)",
110
+          "legendFormat": "{{result}}",
111
+          "refId": "A"
112
+        }
113
+      ]
114
+    }
115
+  ]
116
+}
deploy/monitoring/loki/loki-config.yaml added
@@ -0,0 +1,56 @@
1
+# Loki — single-binary mode on the monitoring host. Promtail on
2
+# each app host tails /var/log/shithubd/*.log and ships here over
3
+# the wg0 mesh. Retention is short (limits_config.retention_period);
4
+# long-term log archival is the OS journal + Spaces backup.
5
+
6
+auth_enabled: false
7
+
8
+server:
9
+  http_listen_port: 3100
10
+  grpc_listen_port: 9096
11
+
12
+common:
13
+  instance_addr: 10.50.0.10
14
+  path_prefix: /var/lib/loki
15
+  storage:
16
+    filesystem:
17
+      chunks_directory: /var/lib/loki/chunks
18
+      rules_directory: /var/lib/loki/rules
19
+  replication_factor: 1
20
+  ring:
21
+    kvstore:
22
+      store: inmemory
23
+
24
+schema_config:
25
+  configs:
26
+    - from: 2026-01-01
27
+      store: tsdb
28
+      object_store: filesystem
29
+      schema: v13
30
+      index:
31
+        prefix: index_
32
+        period: 24h
33
+
34
+limits_config:
35
+  retention_period: 168h
36
+  ingestion_rate_mb: 8
37
+  ingestion_burst_size_mb: 16
38
+  max_streams_per_user: 5000
39
+
40
+ruler:
41
+  storage:
42
+    type: local
43
+    local:
44
+      directory: /etc/loki/rules
45
+  rule_path: /tmp/loki-rules
46
+  alertmanager_url: http://10.50.0.10:9093
47
+  ring:
48
+    kvstore:
49
+      store: inmemory
50
+  enable_api: true
51
+
52
+compactor:
53
+  working_directory: /var/lib/loki/compactor
54
+  retention_enabled: true
55
+  retention_delete_delay: 2h
56
+  delete_request_store: filesystem