runner: enforce workflow job timeouts
Authored by mfwolffe <wolffemf@dukes.jmu.edu>

- SHA: f23d1f1f413d17fe5af9fa3264d10ea6bd1d829f
- Parents: 69bbad7
- Tree: cfa483f
| Status | File | + | - |
|---|---|---|---|
| M | internal/infra/metrics/metrics.go | 7 | 0 |
| M | internal/runner/engine/docker.go | 44 | 8 |
| M | internal/runner/engine/docker_test.go | 86 | 0 |
| M | internal/runner/engine/types.go | 5 | 0 |
| M | internal/web/handlers/api/runners.go | 11 | 0 |
| M | internal/web/handlers/api/runners_test.go | 82 | 0 |
internal/infra/metrics/metrics.go (modified)

@@ -170,6 +170,12 @@ var (
 		},
 		[]string{"kind"},
 	)
+	ActionsStepTimeoutsTotal = prometheus.NewCounter(
+		prometheus.CounterOpts{
+			Name: "shithub_actions_step_timeouts_total",
+			Help: "Total Actions steps reported as timed out by runners.",
+		},
+	)
 )

 func init() {
@@ -193,6 +199,7 @@ func init() {
 		ActionsJobsCancelledTotal,
 		ActionsLogScrubReplacementsTotal,
 		ActionsRunsPrunedTotal,
+		ActionsStepTimeoutsTotal,
 	)
 }

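Note on the metric above: ActionsStepTimeoutsTotal is a plain, unlabelled counter. The sketch below is a minimal, self-contained illustration of the same create/register/increment pattern, plus the read-back trick the new test helper in runners_test.go uses (writing the counter into a client_model protobuf). It is not part of this change; the metric name `example_step_timeouts_total` is a placeholder.

```go
package main

import (
	"fmt"

	"github.com/prometheus/client_golang/prometheus"
	dto "github.com/prometheus/client_model/go"
)

// A plain counter, analogous to ActionsStepTimeoutsTotal in the diff.
var stepTimeouts = prometheus.NewCounter(prometheus.CounterOpts{
	Name: "example_step_timeouts_total",
	Help: "Total steps reported as timed out.",
})

func main() {
	// Registration normally happens once, e.g. in an init() block.
	prometheus.MustRegister(stepTimeouts)

	stepTimeouts.Inc()

	// Reading the current value back in a test: a counter can write its
	// state into a client_model Metric protobuf.
	var m dto.Metric
	if err := stepTimeouts.Write(&m); err != nil {
		panic(err)
	}
	fmt.Println(m.Counter.GetValue()) // 1
}
```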
internal/runner/engine/docker.go (modified)

@@ -71,6 +71,7 @@ type DockerConfig struct {
 	LogChunkBytes    int
 	LogFlushInterval time.Duration
 	StepLogLimit     int64
+	TimeoutMinute    time.Duration
 	Stdout           io.Writer
 	Stderr           io.Writer
 	Runner           CommandRunner
@@ -100,6 +101,9 @@ func NewDocker(cfg DockerConfig) *Docker {
 	if cfg.StepLogLimit <= 0 {
 		cfg.StepLogLimit = 10 * 1024 * 1024
 	}
+	if cfg.TimeoutMinute <= 0 {
+		cfg.TimeoutMinute = time.Minute
+	}
 	if cfg.Stdout == nil {
 		cfg.Stdout = io.Discard
 	}
@@ -136,7 +140,7 @@ func (d *Docker) Execute(ctx context.Context, job Job) (Outcome, error) {
 	defer d.closeEventStream(job.ID)
 	if job.TimeoutMinutes > 0 {
 		var cancel context.CancelFunc
-		ctx, cancel = context.WithTimeout(ctx, time.Duration(job.TimeoutMinutes)*time.Minute)
+		ctx, cancel = context.WithTimeoutCause(ctx, time.Duration(job.TimeoutMinutes)*d.cfg.TimeoutMinute, ErrJobTimedOut)
 		defer cancel()
 	}
 	if err := os.MkdirAll(job.WorkspaceDir, 0o700); err != nil {
@@ -156,12 +160,12 @@ func (d *Docker) Execute(ctx context.Context, job Job) (Outcome, error) {
 				CompletedAt: stepCompleted,
 			}
 			outcome.StepOutcomes = append(outcome.StepOutcomes, stepOutcome)
-			if emitErr := d.emitStepOutcome(ctx, job.ID, stepOutcome); emitErr != nil {
+			if emitErr := d.emitStepOutcomeAfterRun(ctx, job.ID, stepOutcome); emitErr != nil {
 				outcome.Conclusion = conclusionForError(emitErr)
 				outcome.CompletedAt = time.Now().UTC()
 				return outcome, emitErr
 			}
-			if step.ContinueOnError {
+			if step.ContinueOnError && !errors.Is(err, ErrJobTimedOut) {
 				continue
 			}
 			outcome.Conclusion = conclusionForError(err)
@@ -176,7 +180,7 @@ func (d *Docker) Execute(ctx context.Context, job Job) (Outcome, error) {
 			CompletedAt: time.Now().UTC(),
 		}
 		outcome.StepOutcomes = append(outcome.StepOutcomes, stepOutcome)
-		if err := d.emitStepOutcome(ctx, job.ID, stepOutcome); err != nil {
+		if err := d.emitStepOutcomeAfterRun(ctx, job.ID, stepOutcome); err != nil {
 			outcome.Conclusion = conclusionForError(err)
 			outcome.CompletedAt = time.Now().UTC()
 			return outcome, err
@@ -204,6 +208,15 @@ func (d *Docker) executeStep(ctx context.Context, job Job, step Step) error {
 	out := io.MultiWriter(d.cfg.Stdout, writer)
 	errOut := io.MultiWriter(d.cfg.Stderr, writer)
 	if err := d.cfg.Runner.Run(ctx, d.cfg.Binary, invocation.args, invocation.env, out, errOut); err != nil {
+		if isJobTimeout(ctx, err) {
+			killCtx, cancel := context.WithTimeout(context.WithoutCancel(ctx), 5*time.Second)
+			killErr := d.killContainer(killCtx, invocation.containerName)
+			cancel()
+			if killErr != nil {
+				err = errors.Join(err, killErr)
+			}
+			err = fmt.Errorf("%w: %w", ErrJobTimedOut, err)
+		}
 		d.logStep(ctx, "runner step completed", job, step, invocation, conclusionForError(err))
 		if closeErr := writer.Close(); closeErr != nil {
 			return fmt.Errorf("runner engine: step %q failed: %w", stepLabel(step), errors.Join(err, closeErr))
@@ -376,7 +389,11 @@ func (d *Docker) Cancel(ctx context.Context, jobID int64) error {
 	}
 	killCtx, cancel := context.WithTimeout(ctx, 5*time.Second)
 	defer cancel()
-	if err := d.cfg.Runner.Run(killCtx, d.cfg.Binary, []string{"kill", name}, nil, d.cfg.Stdout, d.cfg.Stderr); err != nil {
+	return d.killContainer(killCtx, name)
+}
+
+func (d *Docker) killContainer(ctx context.Context, name string) error {
+	if err := d.cfg.Runner.Run(ctx, d.cfg.Binary, []string{"kill", name}, nil, d.cfg.Stdout, d.cfg.Stderr); err != nil {
 		return fmt.Errorf("runner engine: kill container %s: %w", name, err)
 	}
 	return nil
@@ -477,6 +494,15 @@ func (d *Docker) emitStepOutcome(ctx context.Context, jobID int64, step StepOutcome) error {
 	}
 }

+func (d *Docker) emitStepOutcomeAfterRun(ctx context.Context, jobID int64, step StepOutcome) error {
+	if ctx.Err() == nil {
+		return d.emitStepOutcome(ctx, jobID, step)
+	}
+	emitCtx, cancel := context.WithTimeout(context.WithoutCancel(ctx), 5*time.Second)
+	defer cancel()
+	return d.emitStepOutcome(emitCtx, jobID, step)
+}
+
 func (d *Docker) newStepLogWriter(ctx context.Context, jobID, stepID int64, jobMasks []string) *stepLogWriter {
 	w := &stepLogWriter{
 		ctx: ctx,
@@ -634,15 +660,25 @@ func (w *stepLogWriter) emitChunkLocked(chunk []byte) error {
 }

 func conclusionForError(err error) string {
+	if errors.Is(err, ErrJobTimedOut) {
+		return ConclusionTimedOut
+	}
 	if errors.Is(err, context.Canceled) {
 		return ConclusionCancelled
 	}
-	if errors.Is(err, context.DeadlineExceeded) {
-		return ConclusionTimedOut
-	}
 	return ConclusionFailure
 }

+func isJobTimeout(ctx context.Context, err error) bool {
+	if errors.Is(err, ErrJobTimedOut) {
+		return true
+	}
+	if !errors.Is(err, context.DeadlineExceeded) {
+		return false
+	}
+	return errors.Is(context.Cause(ctx), ErrJobTimedOut)
+}
+
 func containerWorkdir(wd string) (string, error) {
 	wd = strings.TrimSpace(wd)
 	if wd == "" {
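For readers not yet on the Go 1.20/1.21 context APIs this change leans on: `context.WithTimeoutCause` tags the deadline with a sentinel error, `context.Cause` recovers that sentinel so a job timeout can be told apart from any other `DeadlineExceeded`, and `context.WithoutCancel` yields a context that survives the expired parent so cleanup (the container kill, the final step-outcome emit) can still run under its own short deadline. Below is a minimal standalone sketch of that pattern, not the project's code; `errTimedOut` and `runStep` are placeholders.

```go
package main

import (
	"context"
	"errors"
	"fmt"
	"time"
)

// errTimedOut is a placeholder sentinel, analogous to ErrJobTimedOut in the diff.
var errTimedOut = errors.New("job timed out")

// runStep stands in for a container run that blocks until the context ends.
func runStep(ctx context.Context) error {
	<-ctx.Done()
	return ctx.Err() // context.DeadlineExceeded when the timeout fires
}

func main() {
	// Tag the job deadline with the sentinel cause.
	ctx, cancel := context.WithTimeoutCause(context.Background(), 10*time.Millisecond, errTimedOut)
	defer cancel()

	err := runStep(ctx)

	// A bare DeadlineExceeded only says *some* deadline fired; the cause
	// tells us whether it was the job timeout rather than another deadline.
	if errors.Is(err, context.DeadlineExceeded) && errors.Is(context.Cause(ctx), errTimedOut) {
		// Double %w wrapping (Go 1.20+) keeps both errors matchable.
		err = fmt.Errorf("%w: %w", errTimedOut, err)
	}
	fmt.Println(errors.Is(err, errTimedOut)) // true

	// Cleanup after the deadline needs a context that is not already dead:
	// WithoutCancel detaches from the expired parent, and a fresh short
	// timeout bounds the cleanup work (e.g. docker kill, final status emit).
	cleanupCtx, stop := context.WithTimeout(context.WithoutCancel(ctx), 5*time.Second)
	defer stop()
	_ = cleanupCtx
}
```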
internal/runner/engine/docker_test.go (modified)

@@ -74,6 +74,35 @@ func (r *cancellableRunner) Run(ctx context.Context, _ string, args []string, _
 	}
 }

+type timeoutRunner struct {
+	started   chan struct{}
+	killed    chan struct{}
+	killArgs  []string
+	startOnce sync.Once
+	killOnce  sync.Once
+	mu        sync.Mutex
+}
+
+func newTimeoutRunner() *timeoutRunner {
+	return &timeoutRunner{
+		started: make(chan struct{}),
+		killed:  make(chan struct{}),
+	}
+}
+
+func (r *timeoutRunner) Run(ctx context.Context, _ string, args []string, _ []string, _, _ io.Writer) error {
+	if len(args) > 0 && args[0] == "kill" {
+		r.mu.Lock()
+		r.killArgs = append([]string{}, args...)
+		r.mu.Unlock()
+		r.killOnce.Do(func() { close(r.killed) })
+		return nil
+	}
+	r.startOnce.Do(func() { close(r.started) })
+	<-ctx.Done()
+	return ctx.Err()
+}
+
 func TestDockerExecute_BuildsResourceCappedRunCommand(t *testing.T) {
 	t.Parallel()
 	rec := &recordingRunner{}
@@ -436,6 +465,63 @@ func TestDockerCancelKillsActiveContainer(t *testing.T) {
 	}
 }

+func TestDockerExecute_TimeoutKillsActiveContainerAndReportsTimedOut(t *testing.T) {
+	t.Parallel()
+	rec := newTimeoutRunner()
+	d := NewDocker(DockerConfig{
+		DefaultImage:     "runner-image",
+		Network:          "bridge",
+		Memory:           "2g",
+		CPUs:             "2",
+		Runner:           rec,
+		TimeoutMinute:    time.Millisecond,
+		LogChunkBytes:    4,
+		StepLogLimit:     1024,
+		LogFlushInterval: time.Hour,
+	})
+	events, err := d.StreamEvents(t.Context(), 99)
+	if err != nil {
+		t.Fatalf("StreamEvents: %v", err)
+	}
+	out, err := d.Execute(t.Context(), Job{
+		ID:             99,
+		TimeoutMinutes: 1,
+		WorkspaceDir:   t.TempDir(),
+		Steps:          []Step{{ID: 123, Run: "sleep 600", ContinueOnError: true}},
+	})
+	if !errors.Is(err, ErrJobTimedOut) {
+		t.Fatalf("Execute error: got %v, want ErrJobTimedOut", err)
+	}
+	if out.Conclusion != ConclusionTimedOut {
+		t.Fatalf("Conclusion: %q", out.Conclusion)
+	}
+	if len(out.StepOutcomes) != 1 ||
+		out.StepOutcomes[0].StepID != 123 ||
+		out.StepOutcomes[0].Status != "completed" ||
+		out.StepOutcomes[0].Conclusion != ConclusionTimedOut {
+		t.Fatalf("StepOutcomes: %#v", out.StepOutcomes)
+	}
+	select {
+	case <-rec.killed:
+	case <-time.After(time.Second):
+		t.Fatal("timeout did not kill active container")
+	}
+	rec.mu.Lock()
+	killArgs := append([]string{}, rec.killArgs...)
+	rec.mu.Unlock()
+	want := []string{"kill", "shithub-job-99-step-123"}
+	if !reflect.DeepEqual(killArgs, want) {
+		t.Fatalf("kill args: got %#v want %#v", killArgs, want)
+	}
+	var got []Event
+	for event := range events {
+		got = append(got, event)
+	}
+	if len(got) != 1 || got[0].Step == nil || got[0].Step.Conclusion != ConclusionTimedOut {
+		t.Fatalf("timeout step event: %#v", got)
+	}
+}
+
 func TestDockerExecute_FailureMapsToFailureConclusion(t *testing.T) {
 	t.Parallel()
 	d := NewDocker(DockerConfig{
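A side note on the `TimeoutMinute` field this test exercises: it acts as a scalable time unit. Production leaves it unset and `NewDocker` defaults it to a real minute, while the test sets it to a millisecond so a `TimeoutMinutes: 1` deadline fires almost instantly without a fake clock. A tiny sketch of the idea under those assumptions (names hypothetical):

```go
package main

import (
	"fmt"
	"time"
)

// Config exposes how long one "minute" lasts; zero means a real minute.
type Config struct {
	TimeoutMinute time.Duration
}

// jobDeadline converts a workflow-level timeout-minutes value into a
// concrete duration using the configured unit.
func jobDeadline(cfg Config, timeoutMinutes int) time.Duration {
	unit := cfg.TimeoutMinute
	if unit <= 0 {
		unit = time.Minute // production default
	}
	return time.Duration(timeoutMinutes) * unit
}

func main() {
	fmt.Println(jobDeadline(Config{}, 5))                                // 5m0s
	fmt.Println(jobDeadline(Config{TimeoutMinute: time.Millisecond}, 5)) // 5ms, as in tests
}
```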
internal/runner/engine/types.go (modified)

@@ -6,6 +6,7 @@ package engine
 import (
 	"context"
 	"encoding/json"
+	"errors"
 	"time"
 )

@@ -16,6 +17,10 @@
 	ConclusionTimedOut = "timed_out"
 )

+// ErrJobTimedOut marks an execution failure caused by the workflow job's
+// timeout-minutes deadline, not by runner shutdown or user cancellation.
+var ErrJobTimedOut = errors.New("runner engine: job timed out")
+
 type Engine interface {
 	Execute(ctx context.Context, job Job) (Outcome, error)
 	StreamLogs(ctx context.Context, jobID int64) (<-chan LogChunk, error)
internal/web/handlers/api/runners.go (modified)

@@ -459,6 +459,7 @@ func (h *Handlers) runnerStepStatus(w http.ResponseWriter, r *http.Request) {
 		writeAPIError(w, http.StatusInternalServerError, "step status update failed")
 		return
 	}
+	recordStepTimeout(step, updated)
 	h.writeNextTokenResponse(w, r, http.StatusOK, auth, map[string]any{
 		"status":     string(updated.Status),
 		"conclusion": nullableConclusion(updated.Conclusion),
@@ -622,6 +623,16 @@ func validWorkflowStepTransition(from, to actionsdb.WorkflowStepStatus) bool {
 	}
 }

+func recordStepTimeout(before, after actionsdb.WorkflowStep) {
+	if !after.Conclusion.Valid || after.Conclusion.CheckConclusion != actionsdb.CheckConclusionTimedOut {
+		return
+	}
+	if before.Conclusion.Valid && before.Conclusion.CheckConclusion == actionsdb.CheckConclusionTimedOut {
+		return
+	}
+	metrics.ActionsStepTimeoutsTotal.Inc()
+}
+
 func (h *Handlers) applyStepStatus(
 	ctx context.Context,
 	step actionsdb.WorkflowStep,
internal/web/handlers/api/runners_test.go (modified)

@@ -19,6 +19,7 @@ import (
 	"github.com/go-chi/chi/v5"
 	"github.com/jackc/pgx/v5/pgtype"
 	"github.com/jackc/pgx/v5/pgxpool"
+	dto "github.com/prometheus/client_model/go"

 	"github.com/tenseleyFlow/shithub/internal/actions/finalize"
 	"github.com/tenseleyFlow/shithub/internal/actions/runnertoken"
@@ -29,6 +30,7 @@ import (
 	"github.com/tenseleyFlow/shithub/internal/auth/pat"
 	"github.com/tenseleyFlow/shithub/internal/auth/runnerjwt"
 	"github.com/tenseleyFlow/shithub/internal/auth/secretbox"
+	"github.com/tenseleyFlow/shithub/internal/infra/metrics"
 	"github.com/tenseleyFlow/shithub/internal/infra/storage"
 	repogit "github.com/tenseleyFlow/shithub/internal/repos/git"
 	reposdb "github.com/tenseleyFlow/shithub/internal/repos/sqlc"
@@ -448,6 +450,74 @@ func TestRunnerStepStatusEnqueuesFinalizeWorker(t *testing.T) {
 	}
 }

+func TestRunnerStepStatusRecordsTimeoutMetricOnce(t *testing.T) {
+	pool := dbtest.NewTestDB(t)
+	logger := slog.New(slog.NewTextHandler(io.Discard, nil))
+	repoID, userID := setupRunnerAPIRepo(t, pool)
+	enqueueRunnerAPIRun(t, pool, logger, repoID, userID)
+	token, _ := registerRunnerForTest(t, pool, []string{"ubuntu-latest"}, 1)
+	router := newRunnerAPIRouter(t, pool, logger, runnerAPISigner(t, time.Now()))
+
+	req := httptest.NewRequest(http.MethodPost, "/api/v1/runners/heartbeat",
+		strings.NewReader(`{"labels":["ubuntu-latest"],"capacity":1}`))
+	req.Header.Set("Authorization", "Bearer "+token)
+	rr := httptest.NewRecorder()
+	router.ServeHTTP(rr, req)
+	if rr.Code != http.StatusOK {
+		t.Fatalf("heartbeat status: got %d, want 200; body=%s", rr.Code, rr.Body.String())
+	}
+	var claim struct {
+		Token string `json:"token"`
+		Job   struct {
+			ID    int64 `json:"id"`
+			Steps []struct {
+				ID int64 `json:"id"`
+			} `json:"steps"`
+		} `json:"job"`
+	}
+	if err := json.Unmarshal(rr.Body.Bytes(), &claim); err != nil {
+		t.Fatalf("decode claim: %v", err)
+	}
+	if len(claim.Job.Steps) == 0 {
+		t.Fatalf("claim steps: %+v", claim.Job.Steps)
+	}
+	stepID := claim.Job.Steps[0].ID
+	before := actionsStepTimeoutsValue(t)
+
+	req = httptest.NewRequest(http.MethodPost, fmt.Sprintf("/api/v1/jobs/%d/steps/%d/status", claim.Job.ID, stepID),
+		strings.NewReader(`{"status":"completed","conclusion":"timed_out"}`))
+	req.Header.Set("Authorization", "Bearer "+claim.Token)
+	rr = httptest.NewRecorder()
+	router.ServeHTTP(rr, req)
+	if rr.Code != http.StatusOK {
+		t.Fatalf("step status: got %d, want 200; body=%s", rr.Code, rr.Body.String())
+	}
+	var statusResp struct {
+		NextToken string `json:"next_token"`
+	}
+	if err := json.Unmarshal(rr.Body.Bytes(), &statusResp); err != nil {
+		t.Fatalf("decode status response: %v", err)
+	}
+	if statusResp.NextToken == "" {
+		t.Fatalf("missing next token: %s", rr.Body.String())
+	}
+	if got := actionsStepTimeoutsValue(t); got != before+1 {
+		t.Fatalf("timeout metric after first report: got %v, want %v", got, before+1)
+	}
+
+	req = httptest.NewRequest(http.MethodPost, fmt.Sprintf("/api/v1/jobs/%d/steps/%d/status", claim.Job.ID, stepID),
+		strings.NewReader(`{"status":"completed","conclusion":"timed_out"}`))
+	req.Header.Set("Authorization", "Bearer "+statusResp.NextToken)
+	rr = httptest.NewRecorder()
+	router.ServeHTTP(rr, req)
+	if rr.Code != http.StatusOK {
+		t.Fatalf("duplicate step status: got %d, want 200; body=%s", rr.Code, rr.Body.String())
+	}
+	if got := actionsStepTimeoutsValue(t); got != before+1 {
+		t.Fatalf("timeout metric after duplicate report: got %v, want still %v", got, before+1)
+	}
+}
+
 func TestWorkflowJobCancelAPIRequestsCancellation(t *testing.T) {
 	ctx := context.Background()
 	pool := dbtest.NewTestDB(t)
@@ -667,6 +737,18 @@ func newRunnerAPIRouterWithSecretBox(
 	return r
 }

+func actionsStepTimeoutsValue(t *testing.T) float64 {
+	t.Helper()
+	var metric dto.Metric
+	if err := metrics.ActionsStepTimeoutsTotal.Write(&metric); err != nil {
+		t.Fatalf("read timeout metric: %v", err)
+	}
+	if metric.Counter == nil {
+		return 0
+	}
+	return metric.Counter.GetValue()
+}
+
 const runnerAPIOldWorkflow = `name: CI
 on: push
 jobs: