---
# Alert rules for shithubd. Keep these short and signal-heavy —
# every alert here should map to a runbook in docs/internal/
# runbooks/incidents.md by name.
groups:
  - name: shithubd-availability
    interval: 30s
    rules:
      # Scrape-target liveness for the three core components. All page:
      # each of these means the product is (partly) down right now.
      - alert: ShithubdWebDown
        expr: up{job="shithubd-web"} == 0
        for: 2m
        labels:
          severity: page
        annotations:
          summary: "shithubd web {{ $labels.instance }} is down"
          runbook: "runbooks/incidents.md#shithubd-down"

      - alert: ShithubdWorkerDown
        expr: up{job="shithubd-worker"} == 0
        for: 5m
        labels:
          severity: page
        annotations:
          summary: "shithubd worker is down"
          runbook: "runbooks/incidents.md#worker-down"

      - alert: PostgresDown
        expr: up{job="postgres"} == 0
        for: 1m
        labels:
          severity: page
        annotations:
          summary: "postgres is down — site cannot serve writes"
          runbook: "runbooks/incidents.md#postgres-down"

  - name: shithubd-latency
    interval: 30s
    rules:
      # Per-route p95 over a 5m rate window; 10m sustain filters blips.
      - alert: HighRequestLatencyP95
        expr: |
          histogram_quantile(0.95,
            sum(rate(http_request_duration_seconds_bucket[5m])) by (route, le)
          ) > 1.5
        for: 10m
        labels:
          severity: ticket
        annotations:
          summary: "p95 latency on {{ $labels.route }} > 1.5s"
          # TODO(review): header comment requires a runbook link per alert —
          # none present here; add one once the anchor exists.

      - alert: HighDBQueryRate
        expr: |
          sum(rate(pg_stat_statements_calls_total[5m])) > 5000
        for: 10m
        labels:
          severity: ticket
        annotations:
          summary: "DB call rate sustained > 5k/s — possible N+1 regression"
          # TODO(review): missing runbook link (required by file header).

  - name: shithubd-jobs
    interval: 30s
    rules:
      - alert: JobBacklogGrowing
        expr: shithubd_job_queue_depth > 5000
        for: 15m
        labels:
          severity: ticket
        annotations:
          summary: "job queue depth > 5k — worker cannot keep up"
          runbook: "runbooks/incidents.md#job-backlog"

      # Failure ratio of webhook deliveries over a 15m window.
      - alert: WebhookDeliveryFailing
        expr: |
          rate(shithubd_webhook_deliveries_total{result="failure"}[15m])
            /
          rate(shithubd_webhook_deliveries_total[15m]) > 0.5
        for: 30m
        labels:
          severity: ticket
        annotations:
          summary: "webhook failure rate > 50% sustained"
          # TODO(review): missing runbook link (required by file header).

  - name: shithubd-actions
    interval: 30s
    # NOTE(review): metrics in this group use the `shithub_` prefix while
    # the rest of the file uses `shithubd_` — presumably the Actions
    # subsystem exports under its own namespace; confirm before "fixing".
    rules:
      # status!="offline" excludes runners deliberately taken out of
      # rotation so they don't page.
      - alert: ActionsRunnerHeartbeatStale
        expr: shithub_actions_runner_heartbeat_age_seconds{status!="offline"} > 60
        for: 5m
        labels:
          severity: page
        annotations:
          summary: "Actions runner {{ $labels.runner }} heartbeat stale for > 60s"
          runbook: "runbooks/incidents.md#actions-runner-heartbeat-stale"

      - alert: ActionsQueueDepthHigh
        expr: shithub_actions_queue_depth{resource="jobs"} > 100
        for: 10m
        labels:
          severity: ticket
        annotations:
          summary: "Actions queued jobs > 100 for 10m"
          runbook: "runbooks/incidents.md#actions-queue-depth-high"

      # Current p99 run duration vs the same window 24h earlier; fires on
      # a >50% regression, sustained 15m.
      - alert: ActionsRunDurationP99Regressed
        expr: |
          histogram_quantile(0.99,
            sum(rate(shithub_actions_run_duration_seconds_bucket[30m])) by (le)
          )
          > 1.5 * histogram_quantile(0.99,
            sum(rate(shithub_actions_run_duration_seconds_bucket[30m] offset 24h)) by (le)
          )
        for: 15m
        labels:
          severity: ticket
        annotations:
          summary: "Actions run duration p99 regressed by > 50% versus 24h ago"
          runbook: "runbooks/incidents.md#actions-run-duration-p99-regressed"

      # Canary for the secret scrubber: meaningful log volume (>1 MiB/s
      # sustained, 1048576 bytes) with zero scrub replacements suggests
      # the masking stage is not running.
      - alert: ActionsLogScrubberPossiblyMissing
        expr: |
          sum(rate(shithub_actions_log_chunk_bytes_total{location="server"}[15m])) > 1048576
          and
          sum(rate(shithub_actions_log_scrub_replacements_total{location="server"}[15m])) == 0
        for: 30m
        labels:
          severity: ticket
        annotations:
          summary: "Actions logs are flowing but no server-side secret masks have matched"
          runbook: "runbooks/incidents.md#actions-log-scrubber-possibly-missing"

  - name: shithubd-backups
    interval: 5m
    rules:
      # 30h threshold (60 * 60 * 30 seconds) gives a nightly backup a few
      # hours of slack before paging. `for: 0m` fires immediately.
      - alert: BackupOverdue
        expr: time() - shithubd_backup_last_success_seconds > 60 * 60 * 30
        for: 0m
        labels:
          severity: page
        annotations:
          summary: "no successful backup in > 30h"
          runbook: "runbooks/backups.md#missed-backup"