S37: monitoring — alertmanager + loki + grafana dashboard
- SHA: 4340aeadd19c310641a4f8e5797821dcdfde503c
- Parents: ae13c49
- Tree: df566bf

| Status | File | + | - |
|---|---|---|---|
| A | deploy/monitoring/alertmanager/alertmanager.yml | 40 | 0 |
| A | deploy/monitoring/grafana/dashboards/shithubd-overview.json | 116 | 0 |
| A | deploy/monitoring/loki/loki-config.yaml | 56 | 0 |
deploy/monitoring/alertmanager/alertmanager.yml (added) @@ -0,0 +1,40 @@
| 1 | +# Alertmanager — routes pages to the operator's pager and ticket- | |
| 2 | +# severity alerts to email. Webhook URL is templated from the | |
| 3 | +# operator's secret store; never commit a real one here. | |
| 4 | + | |
| 5 | +global: | |
| 6 | + resolve_timeout: 5m | |
| 7 | + | |
| 8 | +route: | |
| 9 | + receiver: tickets | |
| 10 | + group_by: [alertname, service] | |
| 11 | + group_wait: 30s | |
| 12 | + group_interval: 5m | |
| 13 | + repeat_interval: 6h | |
| 14 | + routes: | |
| 15 | + - matchers: [severity="page"] | |
| 16 | + receiver: pager | |
| 17 | + group_wait: 10s | |
| 18 | + repeat_interval: 1h | |
| 19 | + | |
| 20 | +receivers: | |
| 21 | + - name: tickets | |
| 22 | + email_configs: | |
| 23 | + - to: ops@shithub.example | |
| 24 | + from: alertmanager@shithub.example | |
| 25 | + smarthost: smtp.shithub.example:587 | |
| 26 | + auth_username: alertmanager@shithub.example | |
| 27 | + auth_password_file: /etc/alertmanager/smtp.password | |
| 28 | + require_tls: true | |
| 29 | + send_resolved: true | |
| 30 | + | |
| 31 | + - name: pager | |
| 32 | + webhook_configs: | |
| 33 | + - url_file: /etc/alertmanager/pager.url | |
| 34 | + send_resolved: true | |
| 35 | + | |
| 36 | +inhibit_rules: | |
| 37 | + # If the whole web tier is down, suppress per-route latency noise. | |
| 38 | + - source_matchers: [alertname="ShithubdWebDown"] | |
| 39 | + target_matchers: [alertname="HighRequestLatencyP95"] | |
| 40 | + equal: [cluster] | |
deploy/monitoring/grafana/dashboards/shithubd-overview.json (added) @@ -0,0 +1,116 @@
| 1 | +{ | |
| 2 | + "uid": "shithubd-overview", | |
| 3 | + "title": "shithubd — overview", | |
| 4 | + "tags": ["shithubd"], | |
| 5 | + "timezone": "browser", | |
| 6 | + "schemaVersion": 39, | |
| 7 | + "version": 1, | |
| 8 | + "refresh": "30s", | |
| 9 | + "time": {"from": "now-6h", "to": "now"}, | |
| 10 | + "templating": { | |
| 11 | + "list": [ | |
| 12 | + { | |
| 13 | + "name": "instance", | |
| 14 | + "type": "query", | |
| 15 | + "datasource": "Prometheus", | |
| 16 | + "query": "label_values(up{job=\"shithubd-web\"}, instance)", | |
| 17 | + "includeAll": true, | |
| 18 | + "multi": true | |
| 19 | + } | |
| 20 | + ] | |
| 21 | + }, | |
| 22 | + "panels": [ | |
| 23 | + { | |
| 24 | + "id": 1, | |
| 25 | + "type": "stat", | |
| 26 | + "title": "Web up", | |
| 27 | + "gridPos": {"x": 0, "y": 0, "w": 4, "h": 4}, | |
| 28 | + "targets": [{"expr": "sum(up{job=\"shithubd-web\"})", "refId": "A"}] | |
| 29 | + }, | |
| 30 | + { | |
| 31 | + "id": 2, | |
| 32 | + "type": "stat", | |
| 33 | + "title": "Worker up", | |
| 34 | + "gridPos": {"x": 4, "y": 0, "w": 4, "h": 4}, | |
| 35 | + "targets": [{"expr": "sum(up{job=\"shithubd-worker\"})", "refId": "A"}] | |
| 36 | + }, | |
| 37 | + { | |
| 38 | + "id": 3, | |
| 39 | + "type": "stat", | |
| 40 | + "title": "Postgres up", | |
| 41 | + "gridPos": {"x": 8, "y": 0, "w": 4, "h": 4}, | |
| 42 | + "targets": [{"expr": "up{job=\"postgres\"}", "refId": "A"}] | |
| 43 | + }, | |
| 44 | + { | |
| 45 | + "id": 4, | |
| 46 | + "type": "stat", | |
| 47 | + "title": "Job queue depth", | |
| 48 | + "gridPos": {"x": 12, "y": 0, "w": 4, "h": 4}, | |
| 49 | + "targets": [{"expr": "shithubd_job_queue_depth", "refId": "A"}], | |
| 50 | + "fieldConfig": { | |
| 51 | + "defaults": { | |
| 52 | + "thresholds": { | |
| 53 | + "mode": "absolute", | |
| 54 | + "steps": [ | |
| 55 | + {"color": "green", "value": null}, | |
| 56 | + {"color": "yellow", "value": 1000}, | |
| 57 | + {"color": "red", "value": 5000} | |
| 58 | + ] | |
| 59 | + } | |
| 60 | + } | |
| 61 | + } | |
| 62 | + }, | |
| 63 | + { | |
| 64 | + "id": 5, | |
| 65 | + "type": "timeseries", | |
| 66 | + "title": "Request rate by route", | |
| 67 | + "gridPos": {"x": 0, "y": 4, "w": 12, "h": 8}, | |
| 68 | + "targets": [ | |
| 69 | + { | |
| 70 | + "expr": "sum(rate(http_requests_total{instance=~\"$instance\"}[1m])) by (route)", | |
| 71 | + "legendFormat": "{{route}}", | |
| 72 | + "refId": "A" | |
| 73 | + } | |
| 74 | + ] | |
| 75 | + }, | |
| 76 | + { | |
| 77 | + "id": 6, | |
| 78 | + "type": "timeseries", | |
| 79 | + "title": "p95 latency by route", | |
| 80 | + "gridPos": {"x": 12, "y": 4, "w": 12, "h": 8}, | |
| 81 | + "targets": [ | |
| 82 | + { | |
| 83 | + "expr": "histogram_quantile(0.95, sum(rate(http_request_duration_seconds_bucket{instance=~\"$instance\"}[5m])) by (route, le))", | |
| 84 | + "legendFormat": "{{route}}", | |
| 85 | + "refId": "A" | |
| 86 | + } | |
| 87 | + ] | |
| 88 | + }, | |
| 89 | + { | |
| 90 | + "id": 7, | |
| 91 | + "type": "timeseries", | |
| 92 | + "title": "DB calls/sec", | |
| 93 | + "gridPos": {"x": 0, "y": 12, "w": 12, "h": 8}, | |
| 94 | + "targets": [ | |
| 95 | + { | |
| 96 | + "expr": "sum(rate(pg_stat_statements_calls_total[1m]))", | |
| 97 | + "legendFormat": "calls/sec", | |
| 98 | + "refId": "A" | |
| 99 | + } | |
| 100 | + ] | |
| 101 | + }, | |
| 102 | + { | |
| 103 | + "id": 8, | |
| 104 | + "type": "timeseries", | |
| 105 | + "title": "Webhook deliveries (success vs failure)", | |
| 106 | + "gridPos": {"x": 12, "y": 12, "w": 12, "h": 8}, | |
| 107 | + "targets": [ | |
| 108 | + { | |
| 109 | + "expr": "sum(rate(shithubd_webhook_deliveries_total[1m])) by (result)", | |
| 110 | + "legendFormat": "{{result}}", | |
| 111 | + "refId": "A" | |
| 112 | + } | |
| 113 | + ] | |
| 114 | + } | |
| 115 | + ] | |
| 116 | +} | |
deploy/monitoring/loki/loki-config.yaml (added) @@ -0,0 +1,56 @@
| 1 | +# Loki — single-binary mode on the monitoring host. Promtail on | |
| 2 | +# each app host tails /var/log/shithubd/*.log and ships here over | |
| 3 | +# the wg0 mesh. Retention is short; long-term log archival is the | |
| 4 | +# OS journal + Spaces backup. | |
| 5 | + | |
| 6 | +auth_enabled: false | |
| 7 | + | |
| 8 | +server: | |
| 9 | + http_listen_port: 3100 | |
| 10 | + grpc_listen_port: 9096 | |
| 11 | + | |
| 12 | +common: | |
| 13 | + instance_addr: 10.50.0.10 | |
| 14 | + path_prefix: /var/lib/loki | |
| 15 | + storage: | |
| 16 | + filesystem: | |
| 17 | + chunks_directory: /var/lib/loki/chunks | |
| 18 | + rules_directory: /var/lib/loki/rules | |
| 19 | + replication_factor: 1 | |
| 20 | + ring: | |
| 21 | + kvstore: | |
| 22 | + store: inmemory | |
| 23 | + | |
| 24 | +schema_config: | |
| 25 | + configs: | |
| 26 | + - from: 2026-01-01 | |
| 27 | + store: tsdb | |
| 28 | + object_store: filesystem | |
| 29 | + schema: v13 | |
| 30 | + index: | |
| 31 | + prefix: index_ | |
| 32 | + period: 24h | |
| 33 | + | |
| 34 | +limits_config: | |
| 35 | + retention_period: 168h | |
| 36 | + ingestion_rate_mb: 8 | |
| 37 | + ingestion_burst_size_mb: 16 | |
| 38 | + max_streams_per_user: 5000 | |
| 39 | + | |
| 40 | +ruler: | |
| 41 | + storage: | |
| 42 | + type: local | |
| 43 | + local: | |
| 44 | + directory: /etc/loki/rules | |
| 45 | + rule_path: /tmp/loki-rules | |
| 46 | + alertmanager_url: http://10.50.0.10:9093 | |
| 47 | + ring: | |
| 48 | + kvstore: | |
| 49 | + store: inmemory | |
| 50 | + enable_api: true | |
| 51 | + | |
| 52 | +compactor: | |
| 53 | + working_directory: /var/lib/loki/compactor | |
| 54 | + retention_enabled: true | |
| 55 | + retention_delete_delay: 2h | |
| 56 | + delete_request_store: filesystem | |