tenseleyflow/shithub / 2ab8d07

Browse files

monitoring: add Actions dashboard and alerts

Authored by mfwolffe <wolffemf@dukes.jmu.edu>
SHA
2ab8d07593a622dd38994590cb5e1967b59babb7
Parents
11badfd
Tree
2190307

4 changed files

StatusFile+-
A deploy/monitoring/grafana/dashboards/actions.json 247 0
M deploy/monitoring/prometheus/rules.yml 46 0
M docs/internal/runbooks/incidents.md 64 0
M docs/internal/runbooks/observability.md 14 4
deploy/monitoring/grafana/dashboards/actions.jsonadded
@@ -0,0 +1,247 @@
1
+{
2
+  "uid": "shithubd-actions",
3
+  "title": "shithubd - Actions",
4
+  "tags": ["shithubd", "actions"],
5
+  "timezone": "browser",
6
+  "schemaVersion": 39,
7
+  "version": 1,
8
+  "refresh": "30s",
9
+  "time": {"from": "now-6h", "to": "now"},
10
+  "templating": {
11
+    "list": [
12
+      {
13
+        "name": "instance",
14
+        "type": "query",
15
+        "datasource": "Prometheus",
16
+        "query": "label_values(up{job=\"shithubd-web\"}, instance)",
17
+        "includeAll": true,
18
+        "multi": true
19
+      }
20
+    ]
21
+  },
22
+  "panels": [
23
+    {
24
+      "id": 1,
25
+      "type": "stat",
26
+      "title": "Queued jobs",
27
+      "gridPos": {"x": 0, "y": 0, "w": 4, "h": 4},
28
+      "targets": [{"expr": "sum(shithub_actions_queue_depth{resource=\"jobs\",instance=~\"$instance\"})", "refId": "A"}],
29
+      "fieldConfig": {
30
+        "defaults": {
31
+          "thresholds": {
32
+            "mode": "absolute",
33
+            "steps": [
34
+              {"color": "green", "value": null},
35
+              {"color": "yellow", "value": 50},
36
+              {"color": "red", "value": 100}
37
+            ]
38
+          }
39
+        }
40
+      }
41
+    },
42
+    {
43
+      "id": 2,
44
+      "type": "stat",
45
+      "title": "Running jobs",
46
+      "gridPos": {"x": 4, "y": 0, "w": 4, "h": 4},
47
+      "targets": [{"expr": "sum(shithub_actions_active{resource=\"jobs\",instance=~\"$instance\"})", "refId": "A"}]
48
+    },
49
+    {
50
+      "id": 3,
51
+      "type": "stat",
52
+      "title": "Stale runners",
53
+      "gridPos": {"x": 8, "y": 0, "w": 4, "h": 4},
54
+      "targets": [{"expr": "count(shithub_actions_runner_heartbeat_age_seconds{status!=\"offline\",instance=~\"$instance\"} > 60) or vector(0)", "refId": "A"}],
55
+      "fieldConfig": {
56
+        "defaults": {
57
+          "thresholds": {
58
+            "mode": "absolute",
59
+            "steps": [
60
+              {"color": "green", "value": null},
61
+              {"color": "red", "value": 1}
62
+            ]
63
+          }
64
+        }
65
+      }
66
+    },
67
+    {
68
+      "id": 4,
69
+      "type": "stat",
70
+      "title": "Log MB/day",
71
+      "gridPos": {"x": 12, "y": 0, "w": 4, "h": 4},
72
+      "targets": [{"expr": "sum(increase(shithub_actions_log_chunk_bytes_total{instance=~\"$instance\"}[24h])) / 1024 / 1024", "refId": "A"}]
73
+    },
74
+    {
75
+      "id": 5,
76
+      "type": "stat",
77
+      "title": "Run p99",
78
+      "gridPos": {"x": 16, "y": 0, "w": 4, "h": 4},
79
+      "targets": [{"expr": "histogram_quantile(0.99, sum(rate(shithub_actions_run_duration_seconds_bucket{instance=~\"$instance\"}[30m])) by (le))", "refId": "A"}],
80
+      "fieldConfig": {"defaults": {"unit": "s"}}
81
+    },
82
+    {
83
+      "id": 6,
84
+      "type": "stat",
85
+      "title": "Storage MB",
86
+      "gridPos": {"x": 20, "y": 0, "w": 4, "h": 4},
87
+      "targets": [{"expr": "sum(shithub_actions_storage_bytes{instance=~\"$instance\"}) / 1024 / 1024", "refId": "A"}]
88
+    },
89
+    {
90
+      "id": 7,
91
+      "type": "timeseries",
92
+      "title": "Queue depth",
93
+      "gridPos": {"x": 0, "y": 4, "w": 12, "h": 8},
94
+      "targets": [
95
+        {
96
+          "expr": "sum(shithub_actions_queue_depth{instance=~\"$instance\"}) by (resource)",
97
+          "legendFormat": "{{resource}} queued",
98
+          "refId": "A"
99
+        }
100
+      ]
101
+    },
102
+    {
103
+      "id": 8,
104
+      "type": "timeseries",
105
+      "title": "Active runs and jobs",
106
+      "gridPos": {"x": 12, "y": 4, "w": 12, "h": 8},
107
+      "targets": [
108
+        {
109
+          "expr": "sum(shithub_actions_active{instance=~\"$instance\"}) by (resource)",
110
+          "legendFormat": "{{resource}} active",
111
+          "refId": "A"
112
+        }
113
+      ]
114
+    },
115
+    {
116
+      "id": 9,
117
+      "type": "timeseries",
118
+      "title": "Run duration p95 and p99",
119
+      "gridPos": {"x": 0, "y": 12, "w": 12, "h": 8},
120
+      "targets": [
121
+        {
122
+          "expr": "histogram_quantile(0.95, sum(rate(shithub_actions_run_duration_seconds_bucket{instance=~\"$instance\"}[15m])) by (le, event))",
123
+          "legendFormat": "p95 {{event}}",
124
+          "refId": "A"
125
+        },
126
+        {
127
+          "expr": "histogram_quantile(0.99, sum(rate(shithub_actions_run_duration_seconds_bucket{instance=~\"$instance\"}[15m])) by (le, event))",
128
+          "legendFormat": "p99 {{event}}",
129
+          "refId": "B"
130
+        }
131
+      ],
132
+      "fieldConfig": {"defaults": {"unit": "s"}}
133
+    },
134
+    {
135
+      "id": 10,
136
+      "type": "timeseries",
137
+      "title": "Runner heartbeat age",
138
+      "gridPos": {"x": 12, "y": 12, "w": 12, "h": 8},
139
+      "targets": [
140
+        {
141
+          "expr": "shithub_actions_runner_heartbeat_age_seconds{instance=~\"$instance\"}",
142
+          "legendFormat": "{{runner}} {{status}}",
143
+          "refId": "A"
144
+        }
145
+      ],
146
+      "fieldConfig": {"defaults": {"unit": "s"}}
147
+    },
148
+    {
149
+      "id": 11,
150
+      "type": "timeseries",
151
+      "title": "Runs per minute",
152
+      "gridPos": {"x": 0, "y": 20, "w": 12, "h": 8},
153
+      "targets": [
154
+        {
155
+          "expr": "sum(rate(shithub_actions_runs_enqueued_total{result=\"fresh\",instance=~\"$instance\"}[5m])) * 60",
156
+          "legendFormat": "enqueued",
157
+          "refId": "A"
158
+        },
159
+        {
160
+          "expr": "sum(rate(shithub_actions_runs_completed_total{instance=~\"$instance\"}[5m])) * 60",
161
+          "legendFormat": "completed",
162
+          "refId": "B"
163
+        }
164
+      ]
165
+    },
166
+    {
167
+      "id": 12,
168
+      "type": "timeseries",
169
+      "title": "Run conclusions",
170
+      "gridPos": {"x": 12, "y": 20, "w": 12, "h": 8},
171
+      "targets": [
172
+        {
173
+          "expr": "sum(rate(shithub_actions_runs_completed_total{instance=~\"$instance\"}[15m])) by (conclusion)",
174
+          "legendFormat": "{{conclusion}}",
175
+          "refId": "A"
176
+        }
177
+      ]
178
+    },
179
+    {
180
+      "id": 13,
181
+      "type": "timeseries",
182
+      "title": "Step outcomes",
183
+      "gridPos": {"x": 0, "y": 28, "w": 12, "h": 8},
184
+      "targets": [
185
+        {
186
+          "expr": "sum(rate(shithub_actions_steps_completed_total{instance=~\"$instance\"}[15m])) by (step_type, conclusion)",
187
+          "legendFormat": "{{step_type}} {{conclusion}}",
188
+          "refId": "A"
189
+        }
190
+      ]
191
+    },
192
+    {
193
+      "id": 14,
194
+      "type": "timeseries",
195
+      "title": "Log throughput",
196
+      "gridPos": {"x": 12, "y": 28, "w": 12, "h": 8},
197
+      "targets": [
198
+        {
199
+          "expr": "sum(rate(shithub_actions_log_chunk_bytes_total{instance=~\"$instance\"}[5m])) by (location)",
200
+          "legendFormat": "{{location}} bytes/sec",
201
+          "refId": "A"
202
+        },
203
+        {
204
+          "expr": "sum(rate(shithub_actions_log_chunks_total{instance=~\"$instance\"}[5m])) by (location)",
205
+          "legendFormat": "{{location}} chunks/sec",
206
+          "refId": "B"
207
+        }
208
+      ]
209
+    },
210
+    {
211
+      "id": 15,
212
+      "type": "timeseries",
213
+      "title": "Actions storage",
214
+      "gridPos": {"x": 0, "y": 36, "w": 12, "h": 8},
215
+      "targets": [
216
+        {
217
+          "expr": "sum(shithub_actions_storage_bytes{instance=~\"$instance\"}) by (kind)",
218
+          "legendFormat": "{{kind}} bytes",
219
+          "refId": "A"
220
+        },
221
+        {
222
+          "expr": "sum(shithub_actions_storage_objects{instance=~\"$instance\"}) by (kind)",
223
+          "legendFormat": "{{kind}} objects",
224
+          "refId": "B"
225
+        }
226
+      ]
227
+    },
228
+    {
229
+      "id": 16,
230
+      "type": "timeseries",
231
+      "title": "Cancellations and retention",
232
+      "gridPos": {"x": 12, "y": 36, "w": 12, "h": 8},
233
+      "targets": [
234
+        {
235
+          "expr": "sum(rate(shithub_actions_jobs_cancelled_total{instance=~\"$instance\"}[15m])) by (reason)",
236
+          "legendFormat": "cancel {{reason}}",
237
+          "refId": "A"
238
+        },
239
+        {
240
+          "expr": "sum(rate(shithub_actions_runs_pruned_total{instance=~\"$instance\"}[1h])) by (kind)",
241
+          "legendFormat": "pruned {{kind}}",
242
+          "refId": "B"
243
+        }
244
+      ]
245
+    }
246
+  ]
247
+}
deploy/monitoring/prometheus/rules.ymlmodified
@@ -72,6 +72,52 @@ groups:
7272
         annotations:
7373
           summary: "webhook failure rate > 50% sustained"
7474
 
75
+  - name: shithubd-actions
76
+    interval: 30s
77
+    rules:
78
+      - alert: ActionsRunnerHeartbeatStale
79
+        expr: shithub_actions_runner_heartbeat_age_seconds{status!="offline"} > 60
80
+        for: 5m
81
+        labels: {severity: page}
82
+        annotations:
83
+          summary: "Actions runner {{ $labels.runner }} heartbeat stale for > 60s"
84
+          runbook: "runbooks/incidents.md#actions-runner-heartbeat-stale"
85
+
86
+      - alert: ActionsQueueDepthHigh
87
+        expr: shithub_actions_queue_depth{resource="jobs"} > 100
88
+        for: 10m
89
+        labels: {severity: ticket}
90
+        annotations:
91
+          summary: "Actions queued jobs > 100 for 10m"
92
+          runbook: "runbooks/incidents.md#actions-queue-depth-high"
93
+
94
+      - alert: ActionsRunDurationP99Regressed
95
+        expr: |
96
+          histogram_quantile(0.99,
97
+            sum(rate(shithub_actions_run_duration_seconds_bucket[30m])) by (le)
98
+          )
99
+            >
100
+          1.5 *
101
+          histogram_quantile(0.99,
102
+            sum(rate(shithub_actions_run_duration_seconds_bucket[30m] offset 24h)) by (le)
103
+          )
104
+        for: 15m
105
+        labels: {severity: ticket}
106
+        annotations:
107
+          summary: "Actions run duration p99 regressed by > 50% versus 24h ago"
108
+          runbook: "runbooks/incidents.md#actions-run-duration-p99-regressed"
109
+
110
+      - alert: ActionsLogScrubberPossiblyMissing
111
+        expr: |
112
+          sum(rate(shithub_actions_log_chunk_bytes_total{location="server"}[15m])) > 1048576
113
+            and
114
+          sum(rate(shithub_actions_log_scrub_replacements_total{location="server"}[15m])) == 0
115
+        for: 30m
116
+        labels: {severity: ticket}
117
+        annotations:
118
+          summary: "Actions logs are flowing but no server-side secret masks have matched"
119
+          runbook: "runbooks/incidents.md#actions-log-scrubber-possibly-missing"
120
+
75121
   - name: shithubd-backups
76122
     interval: 5m
77123
     rules:
docs/internal/runbooks/incidents.mdmodified
@@ -72,3 +72,67 @@ write; reads through cache may still appear to work briefly.
7272
    pattern lets multiple workers coexist safely).
7373
 4. To purge a poison job: mark it `failed` (don't delete — we want
7474
    the audit trail).
75
+
76
+## actions-runner-heartbeat-stale
77
+
78
+**Symptom:** `shithub_actions_runner_heartbeat_age_seconds{status!="offline"} >
79
+60` for 5m. Actions jobs can remain queued even while the runner appears
80
+registered.
81
+
82
+1. Identify the runner from the alert label.
83
+2. On the runner host: `systemctl status shithubd-runner` and
84
+   `journalctl -u shithubd-runner -n 200 --no-pager`.
85
+3. On the app host: `shithubd admin actions runner list` and confirm the
86
+   runner labels still match queued jobs.
87
+4. If the runner is wedged, restart `shithubd-runner`. If it cannot
88
+   authenticate, rotate the runner token and redeploy the service env.
89
+5. Record whether the stale heartbeat happened during a deploy, network
90
+   partition, token rotation, or runner engine failure.
91
+
92
+## actions-queue-depth-high
93
+
94
+**Symptom:** `shithub_actions_queue_depth{resource="jobs"} > 100` for 10m.
95
+
96
+1. Check runner availability:
97
+   `shithubd admin actions runner list`.
98
+2. Compare queued labels with runner labels. A workflow using an unsupported
99
+   `runs-on` value will sit queued until a compatible runner exists.
100
+3. Inspect web and worker logs for trigger storms, claim errors, and DB pool
101
+   saturation.
102
+4. If legitimate load exceeds capacity, add runners or raise capacity on idle
103
+   runner hosts. If one repository dominates, cancel or throttle that workload.
104
+5. After mitigation, watch `shithub_actions_queue_depth` drain and confirm
105
+   `shithub_actions_active` does not flatline.
106
+
107
+## actions-run-duration-p99-regressed
108
+
109
+**Symptom:** Actions p99 duration over 30m is >50% above the same window 24h
110
+ago.
111
+
112
+1. Split by event in the Actions dashboard. A single event type usually points
113
+   to one workflow shape rather than runner infrastructure.
114
+2. Compare `shithub_actions_active` and runner capacity. High duration with
115
+   low active jobs suggests slow jobs; high duration with saturated active jobs
116
+   suggests insufficient runner capacity.
117
+3. Check runner host CPU, memory, disk, and Docker/engine logs.
118
+4. If the regression started with a deploy, review runner API, log streaming,
119
+   checkout, and container execution changes first.
120
+5. Capture representative slow run IDs and their step durations before
121
+   canceling or pruning anything.
122
+
123
+## actions-log-scrubber-possibly-missing
124
+
125
+**Symptom:** server-side Actions log bytes are flowing for 30m, but
126
+`shithub_actions_log_scrub_replacements_total{location="server"}` remains zero.
127
+
128
+This is a warning, not proof of leaked secrets. Some periods legitimately have
129
+no secret-bearing logs.
130
+
131
+1. Confirm secrets or variables with sensitive values exist for workloads that
132
+   ran during the window.
133
+2. Trigger a controlled workflow that echoes a known test secret value and
134
+   verify the rendered logs contain `***`, not plaintext.
135
+3. Check that runner claims happened after the secret was created or rotated;
136
+   mask snapshots are captured at claim time.
137
+4. If the controlled workflow is not masked, stop affected runners, rotate the
138
+   exposed secret, and open a security incident.
docs/internal/runbooks/observability.mdmodified
@@ -35,6 +35,11 @@ push-only. No Prometheus or Grafana running locally.
3535
 | `shithub_db_pool_acquired` | gauge | Active Postgres connections. Approaching `shithub_db_pool_total` = saturation. |
3636
 | `shithub_db_pool_acquire_wait_seconds_total` | counter | Cumulative wait time. Sudden derivative climb = pool too small. |
3737
 | `shithub_panics_total` | counter | Recovered panics. Should be 0 in steady state. |
38
+| `shithub_actions_queue_depth` | gauge | Queued Actions runs/jobs. Sustained job depth means runners cannot keep up. |
39
+| `shithub_actions_active` | gauge | Running Actions runs/jobs. Use with capacity to distinguish slow jobs from lack of runners. |
40
+| `shithub_actions_runner_heartbeat_age_seconds` | gauge | Seconds since each runner heartbeat. >60s sustained means the runner is stale. |
41
+| `shithub_actions_run_duration_seconds` | histogram | Terminal Actions run duration by event and conclusion. |
42
+| `shithub_actions_log_chunk_bytes_total` | counter | Accepted Actions log bytes, used for throughput and scrubber health alerts. |
3843
 
3944
 ## Operator setup (one-time)
4045
 
@@ -132,10 +137,15 @@ Within a minute, metrics start landing. From the Cloud portal:
132137
 
133138
 ## Building a starter dashboard
134139
 
135
-Cloud → Dashboards → New → import. Use the panels below as a
136
-spine. (We're not committing the dashboard JSON to the repo yet
137
-because Grafana's UUIDs are stack-specific; doc the queries and
138
-let the operator import once.)
140
+Cloud → Dashboards → New → import. Dashboard JSON is committed under:
141
+
142
+```text
143
+deploy/monitoring/grafana/dashboards/shithubd-overview.json
144
+deploy/monitoring/grafana/dashboards/actions.json
145
+```
146
+
147
+Use the panels below as fallback queries if the dashboard import UI drifts or
148
+you need to rebuild by hand.
139149
 
140150
 | Panel | Query |
141151
 |---|---|