tenseleyflow/shithub / e95a7a7


S14: worker pool, job handlers, prometheus metrics

Authored by mfwolffe <wolffemf@dukes.jmu.edu>
SHA: e95a7a71164bf2ad4cf5dc488dad845417801608
Parents: e659af3
Tree: 3926ee8

10 changed files

Status  File  Added  Removed
M internal/infra/metrics/metrics.go 29 0
A internal/worker/backoff_test.go 59 0
A internal/worker/enqueue.go 57 0
A internal/worker/jobs/jobs_purge.go 64 0
A internal/worker/jobs/push_process.go 153 0
A internal/worker/jobs/push_process_test.go 292 0
A internal/worker/jobs/repo_size_recalc.go 105 0
A internal/worker/pool.go 317 0
A internal/worker/pool_integration_test.go 234 0
A internal/worker/types.go 90 0
internal/infra/metrics/metrics.go  (modified)
@@ -86,6 +86,32 @@ var (
 	)
 )
 
+// Worker metrics. The pool updates these on every dispatch.
+var (
+	WorkerJobsProcessedTotal = prometheus.NewCounterVec(
+		prometheus.CounterOpts{
+			Name: "shithub_worker_jobs_processed_total",
+			Help: "Worker jobs processed by kind and outcome (ok, retry, failed, poison).",
+		},
+		[]string{"kind", "outcome"},
+	)
+	WorkerJobDurationSeconds = prometheus.NewHistogramVec(
+		prometheus.HistogramOpts{
+			Name:    "shithub_worker_job_duration_seconds",
+			Help:    "Worker handler latency by kind.",
+			Buckets: prometheus.ExponentialBuckets(0.005, 2.5, 12),
+		},
+		[]string{"kind"},
+	)
+	WorkerInFlight = prometheus.NewGaugeVec(
+		prometheus.GaugeOpts{
+			Name: "shithub_worker_in_flight",
+			Help: "Worker handler invocations currently in flight by kind.",
+		},
+		[]string{"kind"},
+	)
+)
+
 func init() {
 	Registry.MustRegister(
 		HTTPRequestsTotal,
@@ -96,6 +122,9 @@ func init() {
 		DBConnsIdle,
 		DBConnsTotal,
 		DBAcquireWaitDurationTotal,
+		WorkerJobsProcessedTotal,
+		WorkerJobDurationSeconds,
+		WorkerInFlight,
 	)
 }
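For reference, the new collectors surface through the package's shared Registry (only its MustRegister call appears in this diff). A minimal sketch of exposing them, assuming metrics.Registry is a plain *prometheus.Registry and that the real server already mounts it elsewhere:

package metricsexample

import (
	"net/http"

	"github.com/prometheus/client_golang/prometheus/promhttp"

	"github.com/tenseleyFlow/shithub/internal/infra/metrics"
)

func serveMetrics() error {
	// HandlerFor gathers only what was registered on this Registry, so the
	// worker series added above show up alongside the existing HTTP/DB ones.
	http.Handle("/metrics", promhttp.HandlerFor(metrics.Registry, promhttp.HandlerOpts{}))
	return http.ListenAndServe(":2112", nil)
}

From there, something like rate(shithub_worker_jobs_processed_total{outcome="retry"}[5m]) broken out by kind is a reasonable first dashboard panel.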
 
internal/worker/backoff_test.go  (added)
@@ -0,0 +1,59 @@
+// SPDX-License-Identifier: AGPL-3.0-or-later
+
+package worker_test
+
+import (
+	"testing"
+	"time"
+
+	"github.com/tenseleyFlow/shithub/internal/worker"
+)
+
+func TestBackoff_Doubles(t *testing.T) {
+	t.Parallel()
+	cases := []struct {
+		attempts int
+		want     time.Duration
+	}{
+		{1, 30 * time.Second},
+		{2, 60 * time.Second},
+		{3, 120 * time.Second},
+		{4, 240 * time.Second},
+		{5, 480 * time.Second},
+		{6, 960 * time.Second},
+		{7, 1920 * time.Second},
+		{8, time.Hour},  // capped
+		{10, time.Hour}, // still capped
+	}
+	for _, c := range cases {
+		got := worker.Backoff(c.attempts, nil)
+		if got != c.want {
+			t.Errorf("Backoff(%d) = %v, want %v", c.attempts, got, c.want)
+		}
+	}
+}
+
+func TestBackoff_JitterStaysInBand(t *testing.T) {
+	t.Parallel()
+	const attempts = 4
+	base := worker.Backoff(attempts, nil)
+	low := time.Duration(float64(base) * 0.8)
+	high := time.Duration(float64(base) * 1.2)
+	for i := 0; i < 100; i++ {
+		j := float64(i) / 100.0
+		got := worker.Backoff(attempts, func() float64 { return j })
+		if got < low || got >= high {
+			t.Fatalf("jitter %v: got %v, want in [%v, %v)", j, got, low, high)
+		}
+	}
+}
+
+func TestBackoff_NonPositiveAttemptsClampsToOne(t *testing.T) {
+	t.Parallel()
+	if got := worker.Backoff(0, nil); got != 30*time.Second {
+		t.Errorf("Backoff(0) = %v, want 30s", got)
+	}
+	if got := worker.Backoff(-5, nil); got != 30*time.Second {
+		t.Errorf("Backoff(-5) = %v, want 30s", got)
+	}
+}
internal/worker/enqueue.go  (added)
@@ -0,0 +1,57 @@
+// SPDX-License-Identifier: AGPL-3.0-or-later
+
+package worker
+
+import (
+	"context"
+	"encoding/json"
+	"fmt"
+
+	"github.com/jackc/pgx/v5/pgtype"
+
+	workerdb "github.com/tenseleyFlow/shithub/internal/worker/sqlc"
+)
+
+// DBTX matches the pgx interface that sqlc-generated methods accept
+// (anything providing Exec/Query/QueryRow). The pool, a tx, and the
+// helpers in dbtest all satisfy it.
+type DBTX = workerdb.DBTX
+
+// EnqueueOptions tunes a single enqueue. RunAt zero means "now"; pass a
+// future time to delay the first run. MaxAttempts zero means use the
+// table default (5).
+type EnqueueOptions struct {
+	RunAt       pgtype.Timestamptz
+	MaxAttempts int32
+}
+
+// Enqueue inserts a job row and returns its id. Callers running inside a
+// transaction should pass the tx as db so the enqueue is rolled back
+// alongside any related state changes; same goes for the NOTIFY (issued
+// separately by the caller via Notify).
+func Enqueue(ctx context.Context, db DBTX, kind Kind, payload any, opts EnqueueOptions) (int64, error) {
+	body, err := json.Marshal(payload)
+	if err != nil {
+		return 0, fmt.Errorf("worker: marshal payload: %w", err)
+	}
+	q := workerdb.New()
+	row, err := q.EnqueueJob(ctx, db, workerdb.EnqueueJobParams{
+		Kind:        string(kind),
+		Payload:     body,
+		RunAt:       opts.RunAt,
+		MaxAttempts: pgtype.Int4{Int32: opts.MaxAttempts, Valid: opts.MaxAttempts > 0},
+	})
+	if err != nil {
+		return 0, fmt.Errorf("worker: enqueue %s: %w", kind, err)
+	}
+	return row.ID, nil
+}
+
+// Notify wakes any LISTENing workers. Safe to call after a successful
+// commit; if called inside a tx, the NOTIFY only delivers when the tx
+// commits (Postgres semantics). Errors are non-fatal — workers also poll
+// at a slow interval as a backstop.
+func Notify(ctx context.Context, db DBTX) error {
+	_, err := db.Exec(ctx, "SELECT pg_notify($1, '')", NotifyChannel)
+	return err
+}
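The doc comments above spell out the transactional contract; a caller sketch (not part of this commit, the function name and eventID parameter are hypothetical) would enqueue on the tx so the job row commits or rolls back with the related writes, then Notify on the pool after a successful commit:

package callerexample

import (
	"context"

	"github.com/jackc/pgx/v5/pgxpool"

	"github.com/tenseleyFlow/shithub/internal/worker"
	"github.com/tenseleyFlow/shithub/internal/worker/jobs"
)

func enqueuePushProcess(ctx context.Context, pool *pgxpool.Pool, eventID int64) error {
	tx, err := pool.Begin(ctx)
	if err != nil {
		return err
	}
	defer tx.Rollback(ctx) // harmless after a successful Commit

	// ... related domain writes on tx go here ...

	// The job row only becomes visible if the surrounding tx commits.
	if _, err := worker.Enqueue(ctx, tx, worker.KindPushProcess,
		jobs.PushProcessPayload{PushEventID: eventID},
		worker.EnqueueOptions{}); err != nil {
		return err
	}
	if err := tx.Commit(ctx); err != nil {
		return err
	}
	// A lost wake-up is tolerable: the pool also polls every IdlePoll.
	_ = worker.Notify(ctx, pool)
	return nil
}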
internal/worker/jobs/jobs_purge.go  (added)
@@ -0,0 +1,64 @@
+// SPDX-License-Identifier: AGPL-3.0-or-later
+
+package jobs
+
+import (
+	"context"
+	"encoding/json"
+	"fmt"
+	"log/slog"
+	"time"
+
+	"github.com/jackc/pgx/v5/pgtype"
+	"github.com/jackc/pgx/v5/pgxpool"
+
+	"github.com/tenseleyFlow/shithub/internal/worker"
+	workerdb "github.com/tenseleyFlow/shithub/internal/worker/sqlc"
+)
+
+// JobsPurgeDeps wires the purge handler.
+type JobsPurgeDeps struct {
+	Pool   *pgxpool.Pool
+	Logger *slog.Logger
+}
+
+// JobsPurgePayload — empty object is fine; defaults below.
+type JobsPurgePayload struct {
+	CompletedOlderThanDays int `json:"completed_older_than_days,omitempty"`
+	FailedOlderThanDays    int `json:"failed_older_than_days,omitempty"`
+}
+
+// JobsPurge deletes completed/failed jobs older than the configured
+// retention. Retention defaults: 14 days completed, 30 days failed.
+// Designed to run as a cron job (S26 ships scheduling); for now it can
+// be enqueued ad-hoc by the operator.
+func JobsPurge(deps JobsPurgeDeps) worker.Handler {
+	return func(ctx context.Context, raw json.RawMessage) error {
+		p := JobsPurgePayload{}
+		if len(raw) > 0 {
+			_ = json.Unmarshal(raw, &p) // tolerant of empty/malformed; we have defaults
+		}
+		if p.CompletedOlderThanDays <= 0 {
+			p.CompletedOlderThanDays = 14
+		}
+		if p.FailedOlderThanDays <= 0 {
+			p.FailedOlderThanDays = 30
+		}
+		now := time.Now()
+		q := workerdb.New()
+		completedCutoff := pgtype.Timestamptz{Time: now.Add(-time.Duration(p.CompletedOlderThanDays) * 24 * time.Hour), Valid: true}
+		failedCutoff := pgtype.Timestamptz{Time: now.Add(-time.Duration(p.FailedOlderThanDays) * 24 * time.Hour), Valid: true}
+
+		nC, err := q.PurgeCompletedJobs(ctx, deps.Pool, completedCutoff)
+		if err != nil {
+			return fmt.Errorf("purge completed: %w", err)
+		}
+		nF, err := q.PurgeFailedJobs(ctx, deps.Pool, failedCutoff)
+		if err != nil {
+			return fmt.Errorf("purge failed: %w", err)
+		}
+		deps.Logger.InfoContext(ctx, "jobs:purge",
+			"completed_deleted", nC, "failed_deleted", nF)
+		return nil
+	}
+}
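Until S26 lands scheduling, the ad-hoc operator trigger mentioned above could look like this sketch (the function name and retention values are invented for illustration):

package purgeexample

import (
	"context"

	"github.com/jackc/pgx/v5/pgxpool"

	"github.com/tenseleyFlow/shithub/internal/worker"
	"github.com/tenseleyFlow/shithub/internal/worker/jobs"
)

// enqueuePurge queues one jobs:purge_completed run, overriding the
// 14/30-day defaults with a tighter retention, then wakes the pool.
func enqueuePurge(ctx context.Context, pool *pgxpool.Pool) error {
	if _, err := worker.Enqueue(ctx, pool, worker.KindJobsPurge, jobs.JobsPurgePayload{
		CompletedOlderThanDays: 7,
		FailedOlderThanDays:    14,
	}, worker.EnqueueOptions{}); err != nil {
		return err
	}
	return worker.Notify(ctx, pool)
}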
internal/worker/jobs/push_process.go  (added)
@@ -0,0 +1,153 @@
+// SPDX-License-Identifier: AGPL-3.0-or-later
+
+// Package jobs holds the concrete worker handlers wired into the pool
+// at boot. Each file is one kind; handlers stay short and idempotent.
+package jobs
+
+import (
+	"context"
+	"encoding/json"
+	"errors"
+	"fmt"
+	"log/slog"
+	"strings"
+
+	"github.com/jackc/pgx/v5"
+	"github.com/jackc/pgx/v5/pgtype"
+	"github.com/jackc/pgx/v5/pgxpool"
+
+	"github.com/tenseleyFlow/shithub/internal/infra/storage"
+	reposdb "github.com/tenseleyFlow/shithub/internal/repos/sqlc"
+	"github.com/tenseleyFlow/shithub/internal/worker"
+	workerdb "github.com/tenseleyFlow/shithub/internal/worker/sqlc"
+)
+
+// PushProcessDeps wires the data this handler needs.
+type PushProcessDeps struct {
+	Pool   *pgxpool.Pool
+	RepoFS *storage.RepoFS
+	Logger *slog.Logger
+}
+
+// PushProcessPayload is the JSON shape post-receive enqueues.
+type PushProcessPayload struct {
+	PushEventID int64 `json:"push_event_id"`
+}
+
+// PushProcess returns a handler that:
+//
+//  1. Loads the push_event by id.
+//  2. Updates repos.default_branch_oid if the ref matches the default
+//     branch and the after_sha is non-zero.
+//  3. Enqueues a repo:size_recalc job (separate handler — du is
+//     potentially slow, isolate it).
+//  4. Inserts a webhook_events_pending row carrying the push payload
+//     (S33 deliverer drains).
+//  5. Marks the push_event processed.
+//
+// The handler is idempotent on processed_at: re-runs after the first
+// successful run are no-ops.
+func PushProcess(deps PushProcessDeps) worker.Handler {
+	return func(ctx context.Context, raw json.RawMessage) error {
+		var p PushProcessPayload
+		if err := json.Unmarshal(raw, &p); err != nil {
+			return worker.PoisonError(fmt.Errorf("bad payload: %w", err))
+		}
+		if p.PushEventID == 0 {
+			return worker.PoisonError(errors.New("missing push_event_id"))
+		}
+
+		wq := workerdb.New()
+		event, err := wq.GetPushEvent(ctx, deps.Pool, p.PushEventID)
+		if err != nil {
+			if errors.Is(err, pgx.ErrNoRows) {
+				return worker.PoisonError(fmt.Errorf("push_event %d not found", p.PushEventID))
+			}
+			return fmt.Errorf("load push_event: %w", err)
+		}
+		if event.ProcessedAt.Valid {
+			return nil // idempotent: already done.
+		}
+
+		rq := reposdb.New()
+		repo, err := rq.GetRepoByID(ctx, deps.Pool, event.RepoID)
+		if err != nil {
+			return fmt.Errorf("load repo: %w", err)
+		}
+
+		// 2: derive default-branch OID. The ref looks like "refs/heads/<name>".
+		const refPrefix = "refs/heads/"
+		if strings.HasPrefix(event.Ref, refPrefix) {
+			branch := event.Ref[len(refPrefix):]
+			if branch == repo.DefaultBranch {
+				newOID := event.AfterSha
+				if isZeroSHA(newOID) {
+					// branch deleted — clear oid.
+					_ = rq.UpdateRepoDefaultBranchOID(ctx, deps.Pool, reposdb.UpdateRepoDefaultBranchOIDParams{
+						ID:               repo.ID,
+						DefaultBranchOid: pgtype.Text{Valid: false},
+					})
+				} else {
+					_ = rq.UpdateRepoDefaultBranchOID(ctx, deps.Pool, reposdb.UpdateRepoDefaultBranchOIDParams{
+						ID:               repo.ID,
+						DefaultBranchOid: pgtype.Text{String: newOID, Valid: true},
+					})
+				}
+			}
+		}
+
+		// 3: enqueue size recalc — separate kind, runs independently.
+		if _, err := worker.Enqueue(ctx, deps.Pool, worker.KindRepoSizeRecalc,
+			map[string]any{"repo_id": repo.ID},
+			worker.EnqueueOptions{}); err != nil {
+			deps.Logger.WarnContext(ctx, "push:process: enqueue size_recalc",
+				"push_event_id", event.ID, "error", err)
+		}
+
+		// 4: stash the payload for S33 to drain.
+		body, _ := json.Marshal(map[string]any{
+			"push_event_id":  event.ID,
+			"repo_id":        event.RepoID,
+			"pusher_user_id": int64ValueOrZero(event.PusherUserID),
+			"before_sha":     event.BeforeSha,
+			"after_sha":      event.AfterSha,
+			"ref":            event.Ref,
+			"protocol":       event.Protocol,
+			"request_id":     event.RequestID,
+		})
+		if _, err := wq.InsertWebhookEventPending(ctx, deps.Pool, workerdb.InsertWebhookEventPendingParams{
+			RepoID:    event.RepoID,
+			EventKind: "push",
+			Payload:   body,
+		}); err != nil {
+			return fmt.Errorf("insert webhook pending: %w", err)
+		}
+
+		// 5: mark processed last so a partial failure earlier triggers a
+		// retry of the whole pipeline. Idempotency is via the
+		// processed_at guard at the top.
+		if err := wq.MarkPushEventProcessed(ctx, deps.Pool, event.ID); err != nil {
+			return fmt.Errorf("mark processed: %w", err)
+		}
+
+		// Wake any size_recalc workers waiting on LISTEN.
+		_ = worker.Notify(ctx, deps.Pool)
+		return nil
+	}
+}
+
+func isZeroSHA(s string) bool {
+	for _, c := range s {
+		if c != '0' {
+			return false
+		}
+	}
+	return s != ""
+}
+
+func int64ValueOrZero(p pgtype.Int8) int64 {
+	if p.Valid {
+		return p.Int64
+	}
+	return 0
+}
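For context, the producer side (the post-receive path, which is not part of this commit) would look roughly like the sketch below, reusing InsertPushEvent the same way the tests in the next file do; the function name is hypothetical:

package hookexample

import (
	"context"

	"github.com/jackc/pgx/v5/pgxpool"

	"github.com/tenseleyFlow/shithub/internal/worker"
	"github.com/tenseleyFlow/shithub/internal/worker/jobs"
	workerdb "github.com/tenseleyFlow/shithub/internal/worker/sqlc"
)

// recordPush persists the push_event, enqueues push:process pointing at
// it, and wakes the pool so the handler above runs promptly.
func recordPush(ctx context.Context, pool *pgxpool.Pool, params workerdb.InsertPushEventParams) error {
	event, err := workerdb.New().InsertPushEvent(ctx, pool, params)
	if err != nil {
		return err
	}
	if _, err := worker.Enqueue(ctx, pool, worker.KindPushProcess,
		jobs.PushProcessPayload{PushEventID: event.ID},
		worker.EnqueueOptions{}); err != nil {
		return err
	}
	return worker.Notify(ctx, pool)
}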
internal/worker/jobs/push_process_test.go  (added)
@@ -0,0 +1,292 @@
+// SPDX-License-Identifier: AGPL-3.0-or-later
+
+package jobs_test
+
+import (
+	"context"
+	"encoding/json"
+	"log/slog"
+	"os"
+	"strings"
+	"testing"
+	"time"
+
+	"github.com/jackc/pgx/v5/pgtype"
+
+	"github.com/tenseleyFlow/shithub/internal/auth/audit"
+	"github.com/tenseleyFlow/shithub/internal/auth/throttle"
+	"github.com/tenseleyFlow/shithub/internal/infra/storage"
+	"github.com/tenseleyFlow/shithub/internal/repos"
+	repogit "github.com/tenseleyFlow/shithub/internal/repos/git"
+	reposdb "github.com/tenseleyFlow/shithub/internal/repos/sqlc"
+	"github.com/tenseleyFlow/shithub/internal/testing/dbtest"
+	usersdb "github.com/tenseleyFlow/shithub/internal/users/sqlc"
+	"github.com/tenseleyFlow/shithub/internal/worker/jobs"
+	workerdb "github.com/tenseleyFlow/shithub/internal/worker/sqlc"
+)
+
+const fixtureHash = "$argon2id$v=19$m=16384,t=1,p=1$" +
+	"AAAAAAAAAAAAAAAA$" +
+	"AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA"
+
+// TestPushProcess_HappyPath exercises the full push:process pipeline
+// against real Postgres + real bare repo. Verifies that the handler:
+//   - sets repos.default_branch_oid when the ref is the default branch,
+//   - inserts a webhook_events_pending row,
+//   - marks the push_event processed_at,
+//   - enqueues a follow-up repo:size_recalc job.
+func TestPushProcess_HappyPath(t *testing.T) {
+	t.Parallel()
+	pool := dbtest.NewTestDB(t)
+	root := t.TempDir()
+	rfs, err := storage.NewRepoFS(root)
+	if err != nil {
+		t.Fatalf("NewRepoFS: %v", err)
+	}
+
+	// User + verified email so repos.Create accepts a templated initial
+	// commit (the create path needs author identity for plumbing).
+	uq := usersdb.New()
+	user, err := uq.CreateUser(context.Background(), pool, usersdb.CreateUserParams{
+		Username: "alice", DisplayName: "alice", PasswordHash: fixtureHash,
+	})
+	if err != nil {
+		t.Fatalf("CreateUser: %v", err)
+	}
+	em, err := uq.CreateUserEmail(context.Background(), pool, usersdb.CreateUserEmailParams{
+		UserID: user.ID, Email: "alice@example.com", IsPrimary: true, Verified: true,
+	})
+	if err != nil {
+		t.Fatalf("CreateUserEmail: %v", err)
+	}
+	_ = uq.LinkUserPrimaryEmail(context.Background(), pool, usersdb.LinkUserPrimaryEmailParams{
+		ID: user.ID, PrimaryEmailID: pgtype.Int8{Int64: em.ID, Valid: true},
+	})
+
+	res, err := repos.Create(context.Background(), repos.Deps{
+		Pool: pool, RepoFS: rfs, Audit: audit.NewRecorder(), Limiter: throttle.NewLimiter(),
+	}, repos.Params{
+		OwnerUserID: user.ID, OwnerUsername: "alice",
+		Name: "demo", Visibility: "public", InitReadme: true,
+	})
+	if err != nil {
+		t.Fatalf("repos.Create: %v", err)
+	}
+
+	// Insert a push_event covering the initial commit on refs/heads/trunk.
+	wq := workerdb.New()
+	event, err := wq.InsertPushEvent(context.Background(), pool, workerdb.InsertPushEventParams{
+		RepoID:       res.Repo.ID,
+		BeforeSha:    strings.Repeat("0", 40),
+		AfterSha:     res.InitialCommitOID,
+		Ref:          "refs/heads/trunk",
+		Protocol:     "ssh",
+		PusherUserID: pgtype.Int8{Int64: user.ID, Valid: true},
+		RequestID:    pgtype.Text{String: "test-req", Valid: true},
+	})
+	if err != nil {
+		t.Fatalf("InsertPushEvent: %v", err)
+	}
+
+	// Run the handler directly.
+	logger := slog.New(slog.NewTextHandler(os.Stderr, &slog.HandlerOptions{Level: slog.LevelWarn}))
+	handler := jobs.PushProcess(jobs.PushProcessDeps{Pool: pool, RepoFS: rfs, Logger: logger})
+	payload, _ := json.Marshal(jobs.PushProcessPayload{PushEventID: event.ID})
+	if err := handler(context.Background(), payload); err != nil {
+		t.Fatalf("push:process: %v", err)
+	}
+
+	// Default branch OID should now match the initial commit.
+	rq := reposdb.New()
+	repo, err := rq.GetRepoByID(context.Background(), pool, res.Repo.ID)
+	if err != nil {
+		t.Fatalf("GetRepoByID: %v", err)
+	}
+	if !repo.DefaultBranchOid.Valid || repo.DefaultBranchOid.String != res.InitialCommitOID {
+		t.Errorf("default_branch_oid = %v, want %q", repo.DefaultBranchOid, res.InitialCommitOID)
+	}
+
+	// Push event marked processed.
+	got, err := wq.GetPushEvent(context.Background(), pool, event.ID)
+	if err != nil {
+		t.Fatalf("GetPushEvent: %v", err)
+	}
+	if !got.ProcessedAt.Valid {
+		t.Errorf("processed_at not set")
+	}
+
+	// Webhook event row exists.
+	var webhookCount int
+	row := pool.QueryRow(context.Background(),
+		`SELECT count(*) FROM webhook_events_pending WHERE repo_id = $1 AND event_kind = 'push'`,
+		res.Repo.ID)
+	if err := row.Scan(&webhookCount); err != nil {
+		t.Fatalf("count webhook_events_pending: %v", err)
+	}
+	if webhookCount != 1 {
+		t.Errorf("webhook_events_pending count = %d, want 1", webhookCount)
+	}
+
+	// repo:size_recalc job enqueued.
+	var sizeJobCount int
+	row = pool.QueryRow(context.Background(),
+		`SELECT count(*) FROM jobs WHERE kind = 'repo:size_recalc' AND completed_at IS NULL AND failed_at IS NULL`)
+	_ = row.Scan(&sizeJobCount)
+	if sizeJobCount < 1 {
+		t.Errorf("repo:size_recalc not enqueued (count=%d)", sizeJobCount)
+	}
+
+	// Sanity: re-running is a no-op (idempotent on processed_at).
+	if err := handler(context.Background(), payload); err != nil {
+		t.Fatalf("re-run: %v", err)
+	}
+}
+
+// TestRepoSizeRecalc_UpdatesDiskUsedBytes drives the size recalc end-to-
+// end against a real repo and verifies disk_used_bytes is non-zero.
+func TestRepoSizeRecalc_UpdatesDiskUsedBytes(t *testing.T) {
+	t.Parallel()
+	pool := dbtest.NewTestDB(t)
+	root := t.TempDir()
+	rfs, err := storage.NewRepoFS(root)
+	if err != nil {
+		t.Fatalf("NewRepoFS: %v", err)
+	}
+
+	uq := usersdb.New()
+	user, _ := uq.CreateUser(context.Background(), pool, usersdb.CreateUserParams{
+		Username: "bob", DisplayName: "bob", PasswordHash: fixtureHash,
+	})
+	em, _ := uq.CreateUserEmail(context.Background(), pool, usersdb.CreateUserEmailParams{
+		UserID: user.ID, Email: "bob@example.com", IsPrimary: true, Verified: true,
+	})
+	_ = uq.LinkUserPrimaryEmail(context.Background(), pool, usersdb.LinkUserPrimaryEmailParams{
+		ID: user.ID, PrimaryEmailID: pgtype.Int8{Int64: em.ID, Valid: true},
+	})
+	res, err := repos.Create(context.Background(), repos.Deps{
+		Pool: pool, RepoFS: rfs, Audit: audit.NewRecorder(), Limiter: throttle.NewLimiter(),
+	}, repos.Params{
+		OwnerUserID: user.ID, OwnerUsername: "bob",
+		Name: "demo", Visibility: "public", InitReadme: true,
+	})
+	if err != nil {
+		t.Fatalf("repos.Create: %v", err)
+	}
+
+	logger := slog.New(slog.NewTextHandler(os.Stderr, &slog.HandlerOptions{Level: slog.LevelWarn}))
+	handler := jobs.RepoSizeRecalc(jobs.RepoSizeRecalcDeps{Pool: pool, RepoFS: rfs, Logger: logger})
+	payload, _ := json.Marshal(jobs.RepoSizeRecalcPayload{RepoID: res.Repo.ID})
+	if err := handler(context.Background(), payload); err != nil {
+		t.Fatalf("repo:size_recalc: %v", err)
+	}
+
+	rq := reposdb.New()
+	repo, _ := rq.GetRepoByID(context.Background(), pool, res.Repo.ID)
+	if repo.DiskUsedBytes <= 0 {
+		t.Errorf("disk_used_bytes = %d, want > 0", repo.DiskUsedBytes)
+	}
+}
+
+// TestPushProcess_BranchNotDefault: a push to refs/heads/feat shouldn't
+// overwrite default_branch_oid on a repo whose default_branch is trunk.
+func TestPushProcess_BranchNotDefault(t *testing.T) {
+	t.Parallel()
+	pool := dbtest.NewTestDB(t)
+	root := t.TempDir()
+	rfs, _ := storage.NewRepoFS(root)
+	uq := usersdb.New()
+	user, _ := uq.CreateUser(context.Background(), pool, usersdb.CreateUserParams{
+		Username: "carol", DisplayName: "carol", PasswordHash: fixtureHash,
+	})
+	em, _ := uq.CreateUserEmail(context.Background(), pool, usersdb.CreateUserEmailParams{
+		UserID: user.ID, Email: "carol@example.com", IsPrimary: true, Verified: true,
+	})
+	_ = uq.LinkUserPrimaryEmail(context.Background(), pool, usersdb.LinkUserPrimaryEmailParams{
+		ID: user.ID, PrimaryEmailID: pgtype.Int8{Int64: em.ID, Valid: true},
+	})
+	res, _ := repos.Create(context.Background(), repos.Deps{
+		Pool: pool, RepoFS: rfs, Audit: audit.NewRecorder(), Limiter: throttle.NewLimiter(),
+	}, repos.Params{
+		OwnerUserID: user.ID, OwnerUsername: "carol",
+		Name: "demo", Visibility: "public", InitReadme: true,
+	})
+
+	wq := workerdb.New()
+	event, err := wq.InsertPushEvent(context.Background(), pool, workerdb.InsertPushEventParams{
+		RepoID:       res.Repo.ID,
+		BeforeSha:    strings.Repeat("0", 40),
+		AfterSha:     "deadbeef" + strings.Repeat("0", 32),
+		Ref:          "refs/heads/feat",
+		Protocol:     "ssh",
+		PusherUserID: pgtype.Int8{Int64: user.ID, Valid: true},
+	})
+	if err != nil {
+		t.Fatalf("InsertPushEvent: %v", err)
+	}
+	logger := slog.New(slog.NewTextHandler(os.Stderr, &slog.HandlerOptions{Level: slog.LevelWarn}))
+	handler := jobs.PushProcess(jobs.PushProcessDeps{Pool: pool, RepoFS: rfs, Logger: logger})
+	payload, _ := json.Marshal(jobs.PushProcessPayload{PushEventID: event.ID})
+	if err := handler(context.Background(), payload); err != nil {
+		t.Fatalf("push:process: %v", err)
+	}
+	rq := reposdb.New()
+	repo, _ := rq.GetRepoByID(context.Background(), pool, res.Repo.ID)
+	if repo.DefaultBranchOid.Valid {
+		t.Errorf("default_branch_oid set to %q for non-default ref", repo.DefaultBranchOid.String)
+	}
+}
+
+// Sanity check that the package's core helpers don't break under nil
+// payload (defensive — production hooks always send populated payloads).
+func TestPushProcess_RejectsBadPayload(t *testing.T) {
+	t.Parallel()
+	pool := dbtest.NewTestDB(t)
+	rfs, _ := storage.NewRepoFS(t.TempDir())
+	logger := slog.New(slog.NewTextHandler(os.Stderr, &slog.HandlerOptions{Level: slog.LevelWarn}))
+	handler := jobs.PushProcess(jobs.PushProcessDeps{Pool: pool, RepoFS: rfs, Logger: logger})
+
+	// Empty payload → poison.
+	if err := handler(context.Background(), json.RawMessage(`{}`)); err == nil {
+		t.Errorf("empty payload: want error, got nil")
+	}
+	// Reference unknown event → poison.
+	if err := handler(context.Background(), json.RawMessage(`{"push_event_id": 99999}`)); err == nil {
+		t.Errorf("missing event: want error, got nil")
+	}
+}
+
+// Belt + braces: when InitReadme=true the initial commit has a real OID
+// matching the on-disk HEAD, which validates our test fixtures match
+// reality.
+func TestRepoFixture_HeadMatchesInitialCommit(t *testing.T) {
+	t.Parallel()
+	pool := dbtest.NewTestDB(t)
+	root := t.TempDir()
+	rfs, _ := storage.NewRepoFS(root)
+	uq := usersdb.New()
+	user, _ := uq.CreateUser(context.Background(), pool, usersdb.CreateUserParams{
+		Username: "dave", DisplayName: "dave", PasswordHash: fixtureHash,
+	})
+	em, _ := uq.CreateUserEmail(context.Background(), pool, usersdb.CreateUserEmailParams{
+		UserID: user.ID, Email: "dave@example.com", IsPrimary: true, Verified: true,
+	})
+	_ = uq.LinkUserPrimaryEmail(context.Background(), pool, usersdb.LinkUserPrimaryEmailParams{
+		ID: user.ID, PrimaryEmailID: pgtype.Int8{Int64: em.ID, Valid: true},
+	})
+	res, _ := repos.Create(context.Background(), repos.Deps{
+		Pool: pool, RepoFS: rfs, Audit: audit.NewRecorder(), Limiter: throttle.NewLimiter(),
+	}, repos.Params{
+		OwnerUserID: user.ID, OwnerUsername: "dave",
+		Name: "demo", Visibility: "public", InitReadme: true,
+	})
+	gitDir, _ := rfs.RepoPath("dave", "demo")
+	head, found, err := repogit.HeadOf(context.Background(), gitDir, "trunk")
+	if err != nil || !found {
+		t.Fatalf("HeadOf trunk: found=%v err=%v", found, err)
+	}
+	if head.OID != res.InitialCommitOID {
+		t.Errorf("HeadOf.OID = %q, want %q", head.OID, res.InitialCommitOID)
+	}
+	// brevity check so the linter is happy with imports.
+	_ = time.Second
+}
internal/worker/jobs/repo_size_recalc.go  (added)
@@ -0,0 +1,105 @@
+// SPDX-License-Identifier: AGPL-3.0-or-later
+
+package jobs
+
+import (
+	"context"
+	"encoding/json"
+	"errors"
+	"fmt"
+	"io/fs"
+	"log/slog"
+	"path/filepath"
+
+	"github.com/jackc/pgx/v5"
+	"github.com/jackc/pgx/v5/pgxpool"
+
+	"github.com/tenseleyFlow/shithub/internal/infra/storage"
+	reposdb "github.com/tenseleyFlow/shithub/internal/repos/sqlc"
+	"github.com/tenseleyFlow/shithub/internal/worker"
+)
+
+// RepoSizeRecalcDeps wires the size-recalc handler.
+type RepoSizeRecalcDeps struct {
+	Pool   *pgxpool.Pool
+	RepoFS *storage.RepoFS
+	Logger *slog.Logger
+}
+
+// RepoSizeRecalcPayload — { "repo_id": <int> }.
+type RepoSizeRecalcPayload struct {
+	RepoID int64 `json:"repo_id"`
+}
+
+// RepoSizeRecalc walks the bare-repo tree and updates
+// repos.disk_used_bytes. The walk is done in pure Go (no shelling out
+// to du) so we get a portable sum and don't have to wrangle stderr
+// from a blocked subprocess.
+//
+// Concurrent runs may compute slightly different sizes if a push lands
+// mid-walk; that's acceptable — the *last* one wins, and quotas (post-
+// MVP) tolerate small drift.
+func RepoSizeRecalc(deps RepoSizeRecalcDeps) worker.Handler {
+	return func(ctx context.Context, raw json.RawMessage) error {
+		var p RepoSizeRecalcPayload
+		if err := json.Unmarshal(raw, &p); err != nil {
+			return worker.PoisonError(fmt.Errorf("bad payload: %w", err))
+		}
+		if p.RepoID == 0 {
+			return worker.PoisonError(errors.New("missing repo_id"))
+		}
+
+		rq := reposdb.New()
+		ownerRow, err := rq.GetRepoOwnerUsernameByID(ctx, deps.Pool, p.RepoID)
+		if err != nil {
+			if errors.Is(err, pgx.ErrNoRows) {
+				return worker.PoisonError(fmt.Errorf("repo %d not found", p.RepoID))
+			}
+			return fmt.Errorf("load repo: %w", err)
+		}
+
+		gitDir, err := deps.RepoFS.RepoPath(ownerRow.OwnerUsername, ownerRow.RepoName)
+		if err != nil {
+			return worker.PoisonError(fmt.Errorf("repo path: %w", err))
+		}
+		size, err := walkSize(ctx, gitDir)
+		if err != nil {
+			return fmt.Errorf("walk size: %w", err)
+		}
+		if err := rq.UpdateRepoDiskUsed(ctx, deps.Pool, reposdb.UpdateRepoDiskUsedParams{
+			ID:            p.RepoID,
+			DiskUsedBytes: size,
+		}); err != nil {
+			return fmt.Errorf("update disk_used: %w", err)
+		}
+		return nil
+	}
+}
+
+// walkSize sums the byte size of every regular file under root. Walks
+// once; doesn't follow symlinks (we never create any inside a bare
+// repo). Honors ctx so a long-running walk on a giant repo can be
+// cancelled by graceful shutdown.
+func walkSize(ctx context.Context, root string) (int64, error) {
+	var total int64
+	err := filepath.WalkDir(root, func(path string, d fs.DirEntry, walkErr error) error {
+		if walkErr != nil {
+			return walkErr
+		}
+		if err := ctx.Err(); err != nil {
+			return err
+		}
+		if d.IsDir() {
+			return nil
+		}
+		info, err := d.Info()
+		if err != nil {
+			return err
+		}
+		if info.Mode().IsRegular() {
+			total += info.Size()
+		}
+		return nil
+	})
+	return total, err
+}
internal/worker/pool.go  (added)
@@ -0,0 +1,317 @@
+// SPDX-License-Identifier: AGPL-3.0-or-later
+
+package worker
+
+import (
+	"context"
+	"encoding/json"
+	"errors"
+	"fmt"
+	"log/slog"
+	mrand "math/rand"
+	"os"
+	"strconv"
+	"sync"
+	"time"
+
+	"github.com/jackc/pgx/v5"
+	"github.com/jackc/pgx/v5/pgtype"
+	"github.com/jackc/pgx/v5/pgxpool"
+
+	"github.com/tenseleyFlow/shithub/internal/infra/metrics"
+	workerdb "github.com/tenseleyFlow/shithub/internal/worker/sqlc"
+)
+
+// PoolConfig configures Pool. Leave fields zero for the documented
+// defaults.
+type PoolConfig struct {
+	Workers    int           // default 4
+	IdlePoll   time.Duration // default 5s — backstop when LISTEN drops a wake
+	JobTimeout time.Duration // default 5min, applied per-job via context
+	InstanceID string        // default "<hostname>:<pid>"
+	Logger     *slog.Logger  // default: info-level text logger on stderr
+}
+
+// Pool dispatches jobs from the queue. Construct via NewPool, register
+// handlers via Register, run via Run.
+type Pool struct {
+	cfg      PoolConfig
+	db       *pgxpool.Pool
+	q        *workerdb.Queries
+	handlers map[Kind]Handler
+	rng      *mrand.Rand
+	mu       sync.Mutex // guards handlers + rng
+}
+
+// NewPool wires a pool against an open pgx pool. Callers register
+// handlers before calling Run.
+func NewPool(db *pgxpool.Pool, cfg PoolConfig) *Pool {
+	if cfg.Workers <= 0 {
+		cfg.Workers = 4
+	}
+	if cfg.IdlePoll <= 0 {
+		cfg.IdlePoll = 5 * time.Second
+	}
+	if cfg.JobTimeout <= 0 {
+		cfg.JobTimeout = 5 * time.Minute
+	}
+	if cfg.InstanceID == "" {
+		host, _ := os.Hostname()
+		cfg.InstanceID = host + ":" + strconv.Itoa(os.Getpid())
+	}
+	if cfg.Logger == nil {
+		cfg.Logger = slog.New(slog.NewTextHandler(os.Stderr, &slog.HandlerOptions{Level: slog.LevelInfo}))
+	}
+	return &Pool{
+		cfg:      cfg,
+		db:       db,
+		q:        workerdb.New(),
+		handlers: make(map[Kind]Handler),
+		// nolint:gosec // G404: jitter is non-cryptographic by design.
+		rng: mrand.New(mrand.NewSource(time.Now().UnixNano())),
+	}
+}
+
+// Register associates a handler with a kind. Re-registering replaces
+// the previous handler. Registration is goroutine-safe so test harnesses
+// can swap handlers between runs.
+func (p *Pool) Register(kind Kind, h Handler) {
+	p.mu.Lock()
+	defer p.mu.Unlock()
+	p.handlers[kind] = h
+}
+
+// Run blocks until ctx is cancelled. Spawns cfg.Workers worker goroutines
+// plus one LISTEN goroutine that fans out wake-ups, and returns nil once
+// they have all exited.
+func (p *Pool) Run(ctx context.Context) error {
+	p.cfg.Logger.InfoContext(ctx, "worker: starting",
+		"workers", p.cfg.Workers,
+		"instance_id", p.cfg.InstanceID,
+		"kinds", p.kindList())
+
+	wake := make(chan struct{}, p.cfg.Workers)
+	var wg sync.WaitGroup
+
+	// LISTEN goroutine. Holds a dedicated conn for the lifetime of Run.
+	wg.Add(1)
+	go func() {
+		defer wg.Done()
+		p.listenLoop(ctx, wake)
+	}()
+
+	for i := 0; i < p.cfg.Workers; i++ {
+		wg.Add(1)
+		go func(id int) {
+			defer wg.Done()
+			p.workerLoop(ctx, id, wake)
+		}(i)
+	}
+
+	wg.Wait()
+	p.cfg.Logger.InfoContext(ctx, "worker: stopped")
+	return nil
+}
+
+func (p *Pool) kindList() []string {
+	p.mu.Lock()
+	defer p.mu.Unlock()
+	out := make([]string, 0, len(p.handlers))
+	for k := range p.handlers {
+		out = append(out, string(k))
+	}
+	return out
+}
+
+// listenLoop maintains a LISTEN on NotifyChannel. On each NOTIFY, fan
+// out to wake. On reconnect-required errors, sleeps briefly and retries.
+func (p *Pool) listenLoop(ctx context.Context, wake chan<- struct{}) {
+	for {
+		if err := ctx.Err(); err != nil {
+			return
+		}
+		if err := p.listenOnce(ctx, wake); err != nil && !errors.Is(err, context.Canceled) {
+			p.cfg.Logger.WarnContext(ctx, "worker: listen restart", "error", err)
+			select {
+			case <-ctx.Done():
+				return
+			case <-time.After(2 * time.Second):
+			}
+		}
+	}
+}
+
+func (p *Pool) listenOnce(ctx context.Context, wake chan<- struct{}) error {
+	conn, err := p.db.Acquire(ctx)
+	if err != nil {
+		return fmt.Errorf("acquire: %w", err)
+	}
+	defer conn.Release()
+
+	if _, err := conn.Exec(ctx, "LISTEN "+NotifyChannel); err != nil {
+		return fmt.Errorf("LISTEN: %w", err)
+	}
+	for {
+		_, err := conn.Conn().WaitForNotification(ctx)
+		if err != nil {
+			return err
+		}
+		// Fan out to as many workers as are idle. Non-blocking sends so
+		// we don't stall on a saturated pool.
+		for i := 0; i < p.cfg.Workers; i++ {
+			select {
+			case wake <- struct{}{}:
+			default:
+			}
+		}
+	}
+}
+
+func (p *Pool) workerLoop(ctx context.Context, id int, wake <-chan struct{}) {
+	logger := p.cfg.Logger.With("worker_id", id)
+	ticker := time.NewTicker(p.cfg.IdlePoll)
+	defer ticker.Stop()
+
+	for {
+		select {
+		case <-ctx.Done():
+			return
+		case <-wake:
+		case <-ticker.C:
+		}
+		// Drain: try every registered kind; if any kind returned a job,
+		// loop back immediately without waiting on wake/tick.
+		for {
+			any, err := p.tryClaimAndRun(ctx, logger)
+			if err != nil {
+				logger.WarnContext(ctx, "worker: claim cycle error", "error", err)
+				break
+			}
+			if !any {
+				break
+			}
+		}
+	}
+}
+
+// tryClaimAndRun walks every registered kind and attempts to claim one
+// job. Returns true if any kind produced work this pass.
+func (p *Pool) tryClaimAndRun(ctx context.Context, logger *slog.Logger) (bool, error) {
+	p.mu.Lock()
+	kinds := make([]Kind, 0, len(p.handlers))
+	for k := range p.handlers {
+		kinds = append(kinds, k)
+	}
+	p.mu.Unlock()
+
+	any := false
+	for _, kind := range kinds {
+		if ctx.Err() != nil {
+			return any, ctx.Err()
+		}
+		ran, err := p.runOne(ctx, kind, logger)
+		if err != nil {
+			return any, err
+		}
+		if ran {
+			any = true
+		}
+	}
+	return any, nil
+}
+
+// runOne claims one job of the given kind, runs it, and records the
+// outcome. Returns ran=true when a job was claimed (regardless of
+// success), false when the queue had nothing for this kind.
+func (p *Pool) runOne(ctx context.Context, kind Kind, logger *slog.Logger) (bool, error) {
+	job, err := p.q.ClaimJob(ctx, p.db, workerdb.ClaimJobParams{
+		Kind:     string(kind),
+		LockedBy: pgtype.Text{String: p.cfg.InstanceID, Valid: true},
+	})
+	if errors.Is(err, pgx.ErrNoRows) {
+		return false, nil
+	}
+	if err != nil {
+		return false, fmt.Errorf("claim %s: %w", kind, err)
+	}
+
+	p.mu.Lock()
+	h, ok := p.handlers[Kind(job.Kind)]
+	p.mu.Unlock()
+	if !ok {
+		// Registered handler vanished between claim and dispatch; fail
+		// the job rather than block the queue. Should never happen in
+		// practice — Register is one-shot at boot.
+		_ = p.q.MarkJobFailed(ctx, p.db, workerdb.MarkJobFailedParams{
+			ID:        job.ID,
+			LastError: pgtype.Text{String: "no handler registered", Valid: true},
+		})
+		return true, nil
+	}
+
+	jobCtx, cancel := context.WithTimeout(ctx, p.cfg.JobTimeout)
+	start := time.Now()
+	metrics.WorkerInFlight.WithLabelValues(job.Kind).Inc()
+	runErr := safeRun(jobCtx, h, job.Payload)
+	metrics.WorkerInFlight.WithLabelValues(job.Kind).Dec()
+	cancel()
+	metrics.WorkerJobDurationSeconds.WithLabelValues(job.Kind).Observe(time.Since(start).Seconds())
+
+	logger.InfoContext(ctx, "worker: dispatched",
+		"job_id", job.ID,
+		"kind", job.Kind,
+		"attempt", job.Attempts,
+		"duration_ms", time.Since(start).Milliseconds(),
+		"ok", runErr == nil,
+	)
+
+	if runErr == nil {
+		if err := p.q.MarkJobCompleted(ctx, p.db, job.ID); err != nil {
+			logger.ErrorContext(ctx, "worker: mark completed", "job_id", job.ID, "error", err)
+		}
+		metrics.WorkerJobsProcessedTotal.WithLabelValues(job.Kind, "ok").Inc()
+		return true, nil
+	}
+
+	// Failure path. Poison errors skip retry.
+	if errors.Is(runErr, ErrPoison) {
+		_ = p.q.MarkJobFailed(ctx, p.db, workerdb.MarkJobFailedParams{
+			ID:        job.ID,
+			LastError: pgtype.Text{String: runErr.Error(), Valid: true},
+		})
+		metrics.WorkerJobsProcessedTotal.WithLabelValues(job.Kind, "poison").Inc()
+		return true, nil
+	}
+
+	if int(job.Attempts) >= int(job.MaxAttempts) {
+		_ = p.q.MarkJobFailed(ctx, p.db, workerdb.MarkJobFailedParams{
+			ID:        job.ID,
+			LastError: pgtype.Text{String: runErr.Error(), Valid: true},
+		})
+		metrics.WorkerJobsProcessedTotal.WithLabelValues(job.Kind, "failed").Inc()
+		return true, nil
+	}
+
+	p.mu.Lock()
+	delay := Backoff(int(job.Attempts), p.rng.Float64)
+	p.mu.Unlock()
+	_ = p.q.RescheduleJob(ctx, p.db, workerdb.RescheduleJobParams{
+		ID:        job.ID,
+		LastError: pgtype.Text{String: runErr.Error(), Valid: true},
+		RunAt:     pgtype.Timestamptz{Time: time.Now().Add(delay), Valid: true},
+	})
+	metrics.WorkerJobsProcessedTotal.WithLabelValues(job.Kind, "retry").Inc()
+	return true, nil
+}
+
+// safeRun wraps the handler in a recover so a panicking handler doesn't
+// take the worker goroutine with it; the job is rescheduled like any
+// other failure.
+func safeRun(ctx context.Context, h Handler, payload json.RawMessage) (err error) {
+	defer func() {
+		if r := recover(); r != nil {
+			err = fmt.Errorf("worker: handler panic: %v", r)
+		}
+	}()
+	return h(ctx, payload)
+}
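Putting the pieces together, boot code might wire the pool as in the sketch below. The real server wiring is not part of this commit; the function name and worker count are illustrative.

package bootexample

import (
	"context"
	"log/slog"
	"os"
	"os/signal"
	"syscall"

	"github.com/jackc/pgx/v5/pgxpool"

	"github.com/tenseleyFlow/shithub/internal/infra/storage"
	"github.com/tenseleyFlow/shithub/internal/worker"
	"github.com/tenseleyFlow/shithub/internal/worker/jobs"
)

// runWorkers registers every built-in kind, then blocks in Run until
// shutdown is signalled.
func runWorkers(pool *pgxpool.Pool, rfs *storage.RepoFS, logger *slog.Logger) error {
	p := worker.NewPool(pool, worker.PoolConfig{Workers: 4, Logger: logger})
	p.Register(worker.KindPushProcess, jobs.PushProcess(jobs.PushProcessDeps{Pool: pool, RepoFS: rfs, Logger: logger}))
	p.Register(worker.KindRepoSizeRecalc, jobs.RepoSizeRecalc(jobs.RepoSizeRecalcDeps{Pool: pool, RepoFS: rfs, Logger: logger}))
	p.Register(worker.KindJobsPurge, jobs.JobsPurge(jobs.JobsPurgeDeps{Pool: pool, Logger: logger}))

	// Cancelling the context stops the LISTEN loop and the worker loops;
	// Run returns once they have all exited.
	ctx, stop := signal.NotifyContext(context.Background(), os.Interrupt, syscall.SIGTERM)
	defer stop()
	return p.Run(ctx)
}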
internal/worker/pool_integration_test.go  (added)
@@ -0,0 +1,234 @@
+// SPDX-License-Identifier: AGPL-3.0-or-later
+
+package worker_test
+
+import (
+	"context"
+	"encoding/json"
+	"errors"
+	"sync"
+	"sync/atomic"
+	"testing"
+	"time"
+
+	"github.com/jackc/pgx/v5/pgtype"
+
+	"github.com/tenseleyFlow/shithub/internal/testing/dbtest"
+	"github.com/tenseleyFlow/shithub/internal/worker"
+	workerdb "github.com/tenseleyFlow/shithub/internal/worker/sqlc"
+)
+
+// Each test uses its own kind so handlers don't bleed across parallel
+// tests sharing the worker package's runtime state.
+const (
+	testKindHappy   worker.Kind = "test:happy"
+	testKindRetry   worker.Kind = "test:retry"
+	testKindPoison  worker.Kind = "test:poison"
+	testKindFanIn50 worker.Kind = "test:fanin50"
+)
+
+// runPool starts the pool in a goroutine and returns a stop func that
+// cancels the context and waits for clean exit.
+func runPool(t *testing.T, p *worker.Pool) (cancel func()) {
+	t.Helper()
+	ctx, c := context.WithCancel(context.Background())
+	done := make(chan struct{})
+	go func() {
+		_ = p.Run(ctx)
+		close(done)
+	}()
+	return func() {
+		c()
+		select {
+		case <-done:
+		case <-time.After(10 * time.Second):
+			t.Fatal("pool did not stop in 10s")
+		}
+	}
+}
+
+func TestPool_HappyPath(t *testing.T) {
+	t.Parallel()
+	pool := dbtest.NewTestDB(t)
+
+	var seen atomic.Int64
+	p := worker.NewPool(pool, worker.PoolConfig{Workers: 2, IdlePoll: 200 * time.Millisecond})
+	p.Register(testKindHappy, func(_ context.Context, _ json.RawMessage) error {
+		seen.Add(1)
+		return nil
+	})
+	stop := runPool(t, p)
+	defer stop()
+
+	id, err := worker.Enqueue(context.Background(), pool, testKindHappy, map[string]any{"x": 1}, worker.EnqueueOptions{})
+	if err != nil {
+		t.Fatalf("Enqueue: %v", err)
+	}
+	if err := worker.Notify(context.Background(), pool); err != nil {
+		t.Fatalf("Notify: %v", err)
+	}
+
+	waitFor(t, 5*time.Second, func() bool { return seen.Load() == 1 })
+
+	q := workerdb.New()
+	job, err := q.GetJob(context.Background(), pool, id)
+	if err != nil {
+		t.Fatalf("GetJob: %v", err)
+	}
+	if !job.CompletedAt.Valid {
+		t.Errorf("job %d: completed_at unset; last_error=%v", id, job.LastError.String)
+	}
+}
+
+func TestPool_RetryThenSucceed(t *testing.T) {
+	t.Parallel()
+	pool := dbtest.NewTestDB(t)
+
+	var attempts atomic.Int64
+	p := worker.NewPool(pool, worker.PoolConfig{Workers: 2, IdlePoll: 100 * time.Millisecond})
+	p.Register(testKindRetry, func(_ context.Context, _ json.RawMessage) error {
+		if attempts.Add(1) < 2 {
+			return errors.New("transient")
+		}
+		return nil
+	})
+	stop := runPool(t, p)
+	defer stop()
+
+	id, err := worker.Enqueue(context.Background(), pool, testKindRetry, map[string]any{}, worker.EnqueueOptions{MaxAttempts: 5})
+	if err != nil {
+		t.Fatalf("Enqueue: %v", err)
+	}
+	_ = worker.Notify(context.Background(), pool)
+
+	// Don't sit out the backoff: wait (by polling) for the first attempt
+	// to fail, then force the rescheduled run_at back to now() with a
+	// direct UPDATE.
+	q := workerdb.New()
+	waitFor(t, 5*time.Second, func() bool { return attempts.Load() >= 1 })
+	if _, err := pool.Exec(context.Background(), `UPDATE jobs SET run_at = now() WHERE id = $1`, id); err != nil {
+		t.Fatalf("force run_at: %v", err)
+	}
+	_ = worker.Notify(context.Background(), pool)
+
+	waitFor(t, 5*time.Second, func() bool { return attempts.Load() >= 2 })
+	waitFor(t, 5*time.Second, func() bool {
+		j, err := q.GetJob(context.Background(), pool, id)
+		return err == nil && j.CompletedAt.Valid
+	})
+}
+
+func TestPool_PoisonGoesStraightToFailed(t *testing.T) {
+	t.Parallel()
+	pool := dbtest.NewTestDB(t)
+
+	var calls atomic.Int64
+	p := worker.NewPool(pool, worker.PoolConfig{Workers: 1, IdlePoll: 100 * time.Millisecond})
+	p.Register(testKindPoison, func(_ context.Context, _ json.RawMessage) error {
+		calls.Add(1)
+		return worker.PoisonError(errors.New("nope"))
+	})
+	stop := runPool(t, p)
+	defer stop()
+
+	id, err := worker.Enqueue(context.Background(), pool, testKindPoison, map[string]any{}, worker.EnqueueOptions{})
+	if err != nil {
+		t.Fatalf("Enqueue: %v", err)
+	}
+	_ = worker.Notify(context.Background(), pool)
+
+	q := workerdb.New()
+	waitFor(t, 5*time.Second, func() bool {
+		j, err := q.GetJob(context.Background(), pool, id)
+		return err == nil && j.FailedAt.Valid
+	})
+	if got := calls.Load(); got != 1 {
+		t.Errorf("calls = %d, want 1 (no retry on poison)", got)
+	}
+	j, _ := q.GetJob(context.Background(), pool, id)
+	if !j.LastError.Valid || j.LastError.String == "" {
+		t.Errorf("last_error not recorded on poison")
+	}
+}
+
+func TestPool_ConcurrentClaimsExactlyOnce(t *testing.T) {
+	t.Parallel()
+	pool := dbtest.NewTestDB(t)
+
+	const total = 50
+	processed := make(map[int64]int) // job_id → times processed
+	var mu sync.Mutex
+	p := worker.NewPool(pool, worker.PoolConfig{Workers: 4, IdlePoll: 50 * time.Millisecond})
+	p.Register(testKindFanIn50, func(_ context.Context, raw json.RawMessage) error {
+		var payload struct {
+			ID int64 `json:"id"`
+		}
+		_ = json.Unmarshal(raw, &payload)
+		mu.Lock()
+		processed[payload.ID]++
+		mu.Unlock()
+		return nil
+	})
+	stop := runPool(t, p)
+	defer stop()
+
+	for i := 0; i < total; i++ {
+		_, err := worker.Enqueue(context.Background(), pool, testKindFanIn50,
+			map[string]any{"id": i}, worker.EnqueueOptions{})
+		if err != nil {
+			t.Fatalf("Enqueue: %v", err)
+		}
+	}
+	_ = worker.Notify(context.Background(), pool)
+
+	waitFor(t, 10*time.Second, func() bool {
+		mu.Lock()
+		defer mu.Unlock()
+		return len(processed) == total
+	})
+
+	mu.Lock()
+	defer mu.Unlock()
+	for id, count := range processed {
+		if count != 1 {
+			t.Errorf("job %d processed %d times, want 1", id, count)
+		}
+	}
+}
+
+func TestEnqueue_DelayedRunAt(t *testing.T) {
+	t.Parallel()
+	pool := dbtest.NewTestDB(t)
+	future := time.Now().Add(1 * time.Hour)
+	id, err := worker.Enqueue(context.Background(), pool, "test:delayed", map[string]any{}, worker.EnqueueOptions{
+		RunAt: pgtype.Timestamptz{Time: future, Valid: true},
+	})
+	if err != nil {
+		t.Fatalf("Enqueue: %v", err)
+	}
+	q := workerdb.New()
+	job, err := q.GetJob(context.Background(), pool, id)
+	if err != nil {
+		t.Fatalf("GetJob: %v", err)
+	}
+	if !job.RunAt.Time.Equal(future.UTC().Truncate(time.Microsecond)) {
+		// pg truncates to microseconds; allow a tiny delta.
+		if d := job.RunAt.Time.Sub(future); d > time.Second || d < -time.Second {
+			t.Errorf("run_at = %v, want %v", job.RunAt.Time, future)
+		}
+	}
+}
+
+// waitFor polls cond every 50ms up to limit. Fails the test on timeout.
+func waitFor(t *testing.T, limit time.Duration, cond func() bool) {
+	t.Helper()
+	deadline := time.Now().Add(limit)
+	for time.Now().Before(deadline) {
+		if cond() {
+			return
+		}
+		time.Sleep(50 * time.Millisecond)
+	}
+	t.Fatalf("waitFor: condition not met within %v", limit)
+}
internal/worker/types.go  (added)
@@ -0,0 +1,90 @@
+// SPDX-License-Identifier: AGPL-3.0-or-later
+
+// Package worker drives the Postgres-backed job queue introduced in
+// S14. The pool dispatches one Job per goroutine, claiming rows via
+// FOR UPDATE SKIP LOCKED so concurrent workers don't double-process.
+//
+// Job kinds, their payload schema, and their handlers live alongside
+// this package in sub-packages (jobs/<kind>.go). The pool itself is
+// kind-agnostic: a Handler is just a func that takes a payload and
+// returns nil-or-error.
+package worker
+
+import (
+	"context"
+	"encoding/json"
+	"errors"
+	"fmt"
+	"time"
+)
+
+// Kind is the canonical name of a job. Use lowercase letters and
+// colon-separated namespaces (e.g. "push:process", "repo:size_recalc").
+// Kind doubles as the dispatch index — workers query
+// `WHERE kind = $1` so adding new kinds doesn't disturb existing ones.
+type Kind string
+
+// Built-in kinds shipped in S14.
+const (
+	KindPushProcess    Kind = "push:process"
+	KindRepoSizeRecalc Kind = "repo:size_recalc"
+	KindJobsPurge      Kind = "jobs:purge_completed"
+)
+
+// NotifyChannel is the Postgres LISTEN/NOTIFY channel the pool subscribes
+// to so it wakes up immediately when a job is enqueued, instead of
+// polling. Callers wrapping enqueue in a tx must NOTIFY inside the
+// same tx so the notification only fires on commit.
+const NotifyChannel = "shithub_jobs"
+
+// Handler runs one job. The framework supplies the raw JSON payload;
+// handlers Unmarshal into their own typed schema. A nil error reports
+// success and the job is marked completed; any non-nil error triggers
+// the backoff/retry path. ErrPoison is the explicit "do not retry" signal
+// — useful when the input is malformed and retrying can't help.
+type Handler func(ctx context.Context, payload json.RawMessage) error
+
+// ErrPoison wraps a handler error that should NOT be retried. The pool
+// jumps the job straight to MarkJobFailed instead of rescheduling.
+var ErrPoison = errors.New("worker: poison job")
+
+// PoisonError wraps cause as a poison error. The cause is preserved in
+// last_error for operator inspection.
+func PoisonError(cause error) error {
+	return fmt.Errorf("%w: %v", ErrPoison, cause)
+}
+
+// Backoff returns the delay before retrying a job that is about to be
+// rescheduled. The formula is `30s * 2^(attempts-1)` capped at 1 hour,
+// with ±20% jitter so a fleet doesn't synchronize retries on a sibling
+// dependency outage.
+//
+// `attempts` is the number of attempts already made (1-indexed: the
+// just-failed attempt counts as 1).
+func Backoff(attempts int, jitter func() float64) time.Duration {
+	if attempts < 1 {
+		attempts = 1
+	}
+	const (
+		base = 30 * time.Second
+		cap_ = time.Hour
+	)
+	// Compute base * 2^(attempts-1), guarding against overflow.
+	d := base
+	for i := 1; i < attempts; i++ {
+		d *= 2
+		if d >= cap_ {
+			d = cap_
+			break
+		}
+	}
+	if d > cap_ {
+		d = cap_
+	}
+	if jitter != nil {
+		// jitter() in [0,1) → multiplier in [0.8, 1.2)
+		mult := 0.8 + 0.4*jitter()
+		d = time.Duration(float64(d) * mult)
+	}
+	return d
+}
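To make the handler contract concrete, a hypothetical new kind could be defined as below. Only the retry/poison split matters here; the kind name and payload are invented for illustration.

package kindexample

import (
	"context"
	"encoding/json"
	"errors"

	"github.com/tenseleyFlow/shithub/internal/worker"
)

// A made-up kind following the naming convention above.
const kindEmailSend worker.Kind = "email:send"

type emailSendPayload struct {
	To string `json:"to"`
}

// emailSend satisfies worker.Handler: poison errors for input that can
// never succeed, plain errors for transient failures so the pool
// reschedules with Backoff, nil to mark the job completed.
func emailSend(ctx context.Context, raw json.RawMessage) error {
	var p emailSendPayload
	if err := json.Unmarshal(raw, &p); err != nil {
		return worker.PoisonError(err) // malformed payload: retrying can't help
	}
	if p.To == "" {
		return worker.PoisonError(errors.New("missing to"))
	}
	// ... attempt delivery here; return a transient SMTP error as-is so
	// the backoff/retry path kicks in ...
	return nil
}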