S37: monitoring — alertmanager + loki + grafana dashboard
- SHA: 4340aeadd19c310641a4f8e5797821dcdfde503c (short: 4340aea)
- Parents: ae13c49
- Tree: df566bf

| Status | File | + | - |
|---|---|---|---|
| A | deploy/monitoring/alertmanager/alertmanager.yml | 40 | 0 |
| A | deploy/monitoring/grafana/dashboards/shithubd-overview.json | 116 | 0 |
| A | deploy/monitoring/loki/loki-config.yaml | 56 | 0 |

deploy/monitoring/alertmanager/alertmanager.yml added @@ -0,0 +1,40 @@ | |||
| 1 | +# Alertmanager — routes pages to the operator's pager and ticket- | ||
| 2 | +# severity alerts to email. Webhook URL is templated from the | ||
| 3 | +# operator's secret store; never commit a real one here. | ||
| 4 | + | ||
| 5 | +global: | ||
| 6 | + resolve_timeout: 5m | ||
| 7 | + | ||
| 8 | +route: | ||
| 9 | + receiver: tickets | ||
| 10 | + group_by: [alertname, service] | ||
| 11 | + group_wait: 30s | ||
| 12 | + group_interval: 5m | ||
| 13 | + repeat_interval: 6h | ||
| 14 | + routes: | ||
| 15 | + - matchers: [severity="page"] | ||
| 16 | + receiver: pager | ||
| 17 | + group_wait: 10s | ||
| 18 | + repeat_interval: 1h | ||
| 19 | + | ||
| 20 | +receivers: | ||
| 21 | + - name: tickets | ||
| 22 | + email_configs: | ||
| 23 | + - to: ops@shithub.example | ||
| 24 | + from: alertmanager@shithub.example | ||
| 25 | + smarthost: smtp.shithub.example:587 | ||
| 26 | + auth_username: alertmanager@shithub.example | ||
| 27 | + auth_password_file: /etc/alertmanager/smtp.password | ||
| 28 | + require_tls: true | ||
| 29 | + send_resolved: true | ||
| 30 | + | ||
| 31 | + - name: pager | ||
| 32 | + webhook_configs: | ||
| 33 | + - url_file: /etc/alertmanager/pager.url | ||
| 34 | + send_resolved: true | ||
| 35 | + | ||
| 36 | +inhibit_rules: | ||
| 37 | + # If the whole web tier is down, suppress per-route latency noise. | ||
| 38 | + - source_matchers: [alertname="ShithubdWebDown"] | ||
| 39 | + target_matchers: [alertname="HighRequestLatencyP95"] | ||
| 40 | + equal: [cluster] | ||
deploy/monitoring/grafana/dashboards/shithubd-overview.json added @@ -0,0 +1,116 @@ | |||
| 1 | +{ | ||
| 2 | + "uid": "shithubd-overview", | ||
| 3 | + "title": "shithubd — overview", | ||
| 4 | + "tags": ["shithubd"], | ||
| 5 | + "timezone": "browser", | ||
| 6 | + "schemaVersion": 39, | ||
| 7 | + "version": 1, | ||
| 8 | + "refresh": "30s", | ||
| 9 | + "time": {"from": "now-6h", "to": "now"}, | ||
| 10 | + "templating": { | ||
| 11 | + "list": [ | ||
| 12 | + { | ||
| 13 | + "name": "instance", | ||
| 14 | + "type": "query", | ||
| 15 | + "datasource": "Prometheus", | ||
| 16 | + "query": "label_values(up{job=\"shithubd-web\"}, instance)", | ||
| 17 | + "includeAll": true, | ||
| 18 | + "multi": true | ||
| 19 | + } | ||
| 20 | + ] | ||
| 21 | + }, | ||
| 22 | + "panels": [ | ||
| 23 | + { | ||
| 24 | + "id": 1, | ||
| 25 | + "type": "stat", | ||
| 26 | + "title": "Web up", | ||
| 27 | + "gridPos": {"x": 0, "y": 0, "w": 4, "h": 4}, | ||
| 28 | + "targets": [{"expr": "sum(up{job=\"shithubd-web\"})", "refId": "A"}] | ||
| 29 | + }, | ||
| 30 | + { | ||
| 31 | + "id": 2, | ||
| 32 | + "type": "stat", | ||
| 33 | + "title": "Worker up", | ||
| 34 | + "gridPos": {"x": 4, "y": 0, "w": 4, "h": 4}, | ||
| 35 | + "targets": [{"expr": "sum(up{job=\"shithubd-worker\"})", "refId": "A"}] | ||
| 36 | + }, | ||
| 37 | + { | ||
| 38 | + "id": 3, | ||
| 39 | + "type": "stat", | ||
| 40 | + "title": "Postgres up", | ||
| 41 | + "gridPos": {"x": 8, "y": 0, "w": 4, "h": 4}, | ||
| 42 | + "targets": [{"expr": "up{job=\"postgres\"}", "refId": "A"}] | ||
| 43 | + }, | ||
| 44 | + { | ||
| 45 | + "id": 4, | ||
| 46 | + "type": "stat", | ||
| 47 | + "title": "Job queue depth", | ||
| 48 | + "gridPos": {"x": 12, "y": 0, "w": 4, "h": 4}, | ||
| 49 | + "targets": [{"expr": "shithubd_job_queue_depth", "refId": "A"}], | ||
| 50 | + "fieldConfig": { | ||
| 51 | + "defaults": { | ||
| 52 | + "thresholds": { | ||
| 53 | + "mode": "absolute", | ||
| 54 | + "steps": [ | ||
| 55 | + {"color": "green", "value": null}, | ||
| 56 | + {"color": "yellow", "value": 1000}, | ||
| 57 | + {"color": "red", "value": 5000} | ||
| 58 | + ] | ||
| 59 | + } | ||
| 60 | + } | ||
| 61 | + } | ||
| 62 | + }, | ||
| 63 | + { | ||
| 64 | + "id": 5, | ||
| 65 | + "type": "timeseries", | ||
| 66 | + "title": "Request rate by route", | ||
| 67 | + "gridPos": {"x": 0, "y": 4, "w": 12, "h": 8}, | ||
| 68 | + "targets": [ | ||
| 69 | + { | ||
| 70 | + "expr": "sum(rate(http_requests_total{instance=~\"$instance\"}[1m])) by (route)", | ||
| 71 | + "legendFormat": "{{route}}", | ||
| 72 | + "refId": "A" | ||
| 73 | + } | ||
| 74 | + ] | ||
| 75 | + }, | ||
| 76 | + { | ||
| 77 | + "id": 6, | ||
| 78 | + "type": "timeseries", | ||
| 79 | + "title": "p95 latency by route", | ||
| 80 | + "gridPos": {"x": 12, "y": 4, "w": 12, "h": 8}, | ||
| 81 | + "targets": [ | ||
| 82 | + { | ||
| 83 | + "expr": "histogram_quantile(0.95, sum(rate(http_request_duration_seconds_bucket{instance=~\"$instance\"}[5m])) by (route, le))", | ||
| 84 | + "legendFormat": "{{route}}", | ||
| 85 | + "refId": "A" | ||
| 86 | + } | ||
| 87 | + ] | ||
| 88 | + }, | ||
| 89 | + { | ||
| 90 | + "id": 7, | ||
| 91 | + "type": "timeseries", | ||
| 92 | + "title": "DB calls/sec", | ||
| 93 | + "gridPos": {"x": 0, "y": 12, "w": 12, "h": 8}, | ||
| 94 | + "targets": [ | ||
| 95 | + { | ||
| 96 | + "expr": "sum(rate(pg_stat_statements_calls_total[1m]))", | ||
| 97 | + "legendFormat": "calls/sec", | ||
| 98 | + "refId": "A" | ||
| 99 | + } | ||
| 100 | + ] | ||
| 101 | + }, | ||
| 102 | + { | ||
| 103 | + "id": 8, | ||
| 104 | + "type": "timeseries", | ||
| 105 | + "title": "Webhook deliveries (success vs failure)", | ||
| 106 | + "gridPos": {"x": 12, "y": 12, "w": 12, "h": 8}, | ||
| 107 | + "targets": [ | ||
| 108 | + { | ||
| 109 | + "expr": "sum(rate(shithubd_webhook_deliveries_total[1m])) by (result)", | ||
| 110 | + "legendFormat": "{{result}}", | ||
| 111 | + "refId": "A" | ||
| 112 | + } | ||
| 113 | + ] | ||
| 114 | + } | ||
| 115 | + ] | ||
| 116 | +} | ||
deploy/monitoring/loki/loki-config.yaml added @@ -0,0 +1,56 @@ | |||
| 1 | +# Loki — single-binary mode on the monitoring host. Promtail on | ||
| 2 | +# each app host tails /var/log/shithubd/*.log and ships here over | ||
| 3 | +# the wg0 mesh. Retention is short; long-term log archival is the | ||
| 4 | +# OS journal + Spaces backup. | ||
| 5 | + | ||
| 6 | +auth_enabled: false | ||
| 7 | + | ||
| 8 | +server: | ||
| 9 | + http_listen_port: 3100 | ||
| 10 | + grpc_listen_port: 9096 | ||
| 11 | + | ||
| 12 | +common: | ||
| 13 | + instance_addr: 10.50.0.10 | ||
| 14 | + path_prefix: /var/lib/loki | ||
| 15 | + storage: | ||
| 16 | + filesystem: | ||
| 17 | + chunks_directory: /var/lib/loki/chunks | ||
| 18 | + rules_directory: /var/lib/loki/rules | ||
| 19 | + replication_factor: 1 | ||
| 20 | + ring: | ||
| 21 | + kvstore: | ||
| 22 | + store: inmemory | ||
| 23 | + | ||
| 24 | +schema_config: | ||
| 25 | + configs: | ||
| 26 | + - from: 2026-01-01 | ||
| 27 | + store: tsdb | ||
| 28 | + object_store: filesystem | ||
| 29 | + schema: v13 | ||
| 30 | + index: | ||
| 31 | + prefix: index_ | ||
| 32 | + period: 24h | ||
| 33 | + | ||
| 34 | +limits_config: | ||
| 35 | + retention_period: 168h | ||
| 36 | + ingestion_rate_mb: 8 | ||
| 37 | + ingestion_burst_size_mb: 16 | ||
| 38 | + max_streams_per_user: 5000 | ||
| 39 | + | ||
| 40 | +ruler: | ||
| 41 | + storage: | ||
| 42 | + type: local | ||
| 43 | + local: | ||
| 44 | + directory: /etc/loki/rules | ||
| 45 | + rule_path: /tmp/loki-rules | ||
| 46 | + alertmanager_url: http://10.50.0.10:9093 | ||
| 47 | + ring: | ||
| 48 | + kvstore: | ||
| 49 | + store: inmemory | ||
| 50 | + enable_api: true | ||
| 51 | + | ||
| 52 | +compactor: | ||
| 53 | + working_directory: /var/lib/loki/compactor | ||
| 54 | + retention_enabled: true | ||
| 55 | + retention_delete_delay: 2h | ||
| 56 | + delete_request_store: filesystem | ||