tenseleyflow/shithub / fe289bb

Browse files

search: cache tab-count badges per-(query, viewer) (SR2 H5)

Pre-fix: every GET /search render fired 5 FTS count-only queries
(one per tab kind) on top of the active tab's own search — 6 total.
With no rate limit (SR2 H4) on top, a single client could hammer
the DB hard.

Now searchTabs consults a small TTL'd LRU keyed on (canonical
query, viewer user_id) before doing the 5 count calls. Cache miss
is single-flighted via lru.Group so a thundering herd on a hot key
collapses to one upstream wave.

Cache details:
- 30s TTL — long enough to absorb dashboard-style auto-fire from
a typing user, short enough that a recent push surfaces in the
badge within a minute.
- 1024-entry LRU cap — bounded memory under arbitrary query
rotation.
- Key: (canonicalizeQuery(parsed), viewer.ID-or-0). Anonymous
viewers share one slot per query (their visibility = public-only,
so the count is identical for all anons). Authed viewers each get
their own slot because what they can read differs.
- canonicalizeQuery folds whitespace + casing across Text, Phrase,
RepoFilter, StateFilter, AuthorFilter so q='Foo' and q='foo '
hit the same slot. Distinct filters do NOT collide — verified by
TestCanonicalizeQuery_DistinctFiltersDistinct.

The active tab's actual result rows are NOT cached (only the
count-only badge calls are). Per-page renders therefore drop from
6 queries to 1 on cache hit, 6 on cache miss.
Authored by mfwolffe <wolffemf@dukes.jmu.edu>
SHA
fe289bb03c940c8c12b4eae0569d209f277ba159
Parents
06c48ad
Tree
54a1a42

3 changed files

StatusFile+-
M internal/web/handlers/search/search.go 112 13
A internal/web/handlers/search/tabs_cache.go 88 0
A internal/web/handlers/search/tabs_cache_test.go 132 0
internal/web/handlers/search/search.gomodified
@@ -6,15 +6,18 @@
66
 package search
77
 
88
 import (
9
+	"context"
910
 	"errors"
1011
 	"log/slog"
1112
 	"net/http"
1213
 	"net/url"
14
+	"time"
1315
 
1416
 	"github.com/go-chi/chi/v5"
1517
 	"github.com/jackc/pgx/v5/pgxpool"
1618
 
1719
 	"github.com/tenseleyFlow/shithub/internal/auth/policy"
20
+	"github.com/tenseleyFlow/shithub/internal/ratelimit"
1821
 	srch "github.com/tenseleyFlow/shithub/internal/search"
1922
 	"github.com/tenseleyFlow/shithub/internal/web/middleware"
2023
 	"github.com/tenseleyFlow/shithub/internal/web/render"
@@ -25,11 +28,17 @@ type Deps struct {
2528
 	Logger *slog.Logger
2629
 	Render *render.Renderer
2730
 	Pool   *pgxpool.Pool
31
+	// Limiter, when non-nil, gates /search per-(viewer or IP). Audit
32
+	// 2026-05-10 H4: search renders amplify FTS cost 5×–6× per
33
+	// request, so without a limiter a single client can hammer the
34
+	// DB. Optional in tests; required in production wiring.
35
+	Limiter *ratelimit.Limiter
2836
 }
2937
 
3038
// Handlers is the registered handler set. Construct via New.
type Handlers struct {
	d         Deps
	tabsCache *tabsCache // constructed by New; caches per-(query, viewer) tab counts
}
3443
 
3544
 // New constructs the handler set, validating Deps.
@@ -40,15 +49,53 @@ func New(d Deps) (*Handlers, error) {
4049
 	if d.Pool == nil {
4150
 		return nil, errors.New("search: nil Pool")
4251
 	}
43
-	return &Handlers{d: d}, nil
52
+	return &Handlers{d: d, tabsCache: newTabsCache()}, nil
4453
 }
4554
 
46
-// Mount registers /search and /search/quick.
55
// SearchRateLimitPolicy is the per-(viewer or IP) limit applied to
// /search and /search/quick. 60/min is generous for human use
// (typical browse rate is well under this) but low enough to blunt
// a query-rotation attack that bypasses the tab-count cache (audit
// 2026-05-10 H4+H5). Surfaced as a var so tests can tighten it.
var SearchRateLimitPolicy = ratelimit.Policy{
	Scope:  "search",
	Max:    60,
	Window: 1 * time.Minute,
}
65
+
66
+// Mount registers /search and /search/quick. When d.Limiter is set,
67
+// both routes go through the rate-limit middleware before reaching
68
+// the handlers — protects the FTS path from query-rotation attacks
69
+// that the tab-counts cache alone can't absorb.
4770
 func (h *Handlers) Mount(r chi.Router) {
71
+	if h.d.Limiter != nil {
72
+		r.Group(func(r chi.Router) {
73
+			r.Use(h.d.Limiter.Middleware(SearchRateLimitPolicy, searchRateLimitKey))
74
+			r.Get("/search", h.results)
75
+			r.Get("/search/quick", h.quick)
76
+		})
77
+		return
78
+	}
4879
 	r.Get("/search", h.results)
4980
 	r.Get("/search/quick", h.quick)
5081
 }
5182
 
83
// searchRateLimitKey picks the per-request rate-limit bucket key.
// Authed users key on user_id (so an attacker can't bypass the limit
// by hopping accounts they don't control); anonymous users key on the
// trusted client IP. X-Forwarded-For is trusted only because
// middleware.RealIP has already vetted it at the global stack level.
func searchRateLimitKey(r *http.Request) string {
	viewer := middleware.CurrentUserFromContext(r.Context())
	if !viewer.IsAnonymous() {
		// NOTE(review): int(viewer.ID) truncates on 32-bit platforms if
		// ID is an int64 — confirm intString's parameter type, or use
		// strconv.FormatInt(int64(viewer.ID), 10) directly.
		return "u:" + intString(int(viewer.ID))
	}
	if ip, ok := ratelimit.ClientIP(r, true); ok {
		return "ip:" + ip.String()
	}
	// NOTE(review): empty key — confirm whether the limiter treats ""
	// as "skip limiting" or pools every unidentifiable client into one
	// shared bucket; either way the choice should be deliberate.
	return ""
}
98
+
5299
// deps bundles the handler's pool and logger into the search
// package's Deps for the count/search calls.
func (h *Handlers) deps() srch.Deps {
	return srch.Deps{Pool: h.d.Pool, Logger: h.d.Logger}
}
@@ -194,29 +241,81 @@ func (h *Handlers) searchTabs(r *http.Request, actor policy.Actor, parsed srch.P
194241
 		return tabs
195242
 	}
196243
 
197
-	deps := h.deps()
244
+	// Counts are cached per-(query, viewer) for tabsCacheTTL. The
245
+	// active-tab's actual result rows are NOT cached here — only the
246
+	// 5 count-only badge calls that pre-fix were the dominant cost
247
+	// (audit 2026-05-10 H5). Single-flighted via lru.Group so a
248
+	// thundering-herd on the same key doesn't spawn N waves.
249
+	key := tabsCacheKey{q: canonicalizeQuery(parsed), userID: actorUserID(actor)}
250
+	cached, err := h.tabsCache.g.Do(r.Context(), key, func(ctx context.Context) ([]searchTab, error) {
251
+		return h.computeTabCounts(ctx, actor, parsed), nil
252
+	})
253
+	if err != nil {
254
+		// Group.Do never caches errors and our fetch returns nil; this
255
+		// path is unreachable today but kept for defensiveness.
256
+		h.d.Logger.ErrorContext(r.Context(), "search tabs cache", "error", err)
257
+		cached = h.computeTabCounts(r.Context(), actor, parsed)
258
+	}
259
+	// Merge cached counts into the freshly-built (Selected/Href-aware)
260
+	// tabs slice. The cached value carries Counts and the same Key
261
+	// ordering; everything else is per-request and not cached.
198262
 	for i := range tabs {
263
+		for j := range cached {
264
+			if cached[j].Key == tabs[i].Key {
265
+				tabs[i].Count = cached[j].Count
266
+				break
267
+			}
268
+		}
269
+	}
270
+	return tabs
271
+}
272
+
273
+// computeTabCounts is the cache miss path: 5 FTS count-only queries.
274
+// Returned slice carries (Key, Count) only — Selected/Href/Label/
275
+// Icon are per-request and applied by the caller.
276
+func (h *Handlers) computeTabCounts(ctx context.Context, actor policy.Actor, parsed srch.ParsedQuery) []searchTab {
277
+	deps := h.deps()
278
+	out := []searchTab{
279
+		{Key: "code"},
280
+		{Key: "repositories"},
281
+		{Key: "issues"},
282
+		{Key: "pullrequests"},
283
+		{Key: "users"},
284
+	}
285
+	for i := range out {
199286
 		var total int64
200287
 		var err error
201
-		switch tabs[i].Key {
288
+		switch out[i].Key {
202289
 		case "repositories":
203
-			_, total, err = srch.SearchRepos(r.Context(), deps, actor, parsed, 0, 0)
290
+			_, total, err = srch.SearchRepos(ctx, deps, actor, parsed, 0, 0)
204291
 		case "code":
205
-			_, total, err = srch.SearchCode(r.Context(), deps, actor, parsed, 0, 0)
292
+			_, total, err = srch.SearchCode(ctx, deps, actor, parsed, 0, 0)
206293
 		case "issues":
207
-			_, total, err = srch.SearchIssues(r.Context(), deps, actor, parsed, "issue", 0, 0)
294
+			_, total, err = srch.SearchIssues(ctx, deps, actor, parsed, "issue", 0, 0)
208295
 		case "pullrequests":
209
-			_, total, err = srch.SearchIssues(r.Context(), deps, actor, parsed, "pr", 0, 0)
296
+			_, total, err = srch.SearchIssues(ctx, deps, actor, parsed, "pr", 0, 0)
210297
 		case "users":
211
-			_, total, err = srch.SearchUsers(r.Context(), deps, parsed, 0, 0)
298
+			_, total, err = srch.SearchUsers(ctx, deps, parsed, 0, 0)
212299
 		}
213300
 		if err != nil && !errors.Is(err, srch.ErrEmptyQuery) {
214
-			h.d.Logger.ErrorContext(r.Context(), "search tab count", "tab", tabs[i].Key, "error", err)
301
+			h.d.Logger.ErrorContext(ctx, "search tab count", "tab", out[i].Key, "error", err)
215302
 			continue
216303
 		}
217
-		tabs[i].Count = total
304
+		out[i].Count = total
218305
 	}
219
-	return tabs
306
+	return out
307
+}
308
+
309
+// actorUserID returns 0 for anonymous, the user_id otherwise. Used
310
+// as the (anon vs each-authed-user) discriminant in the tabs cache
311
+// key — anonymous viewers all see the same public-only result set
312
+// so they share a slot; authed viewers see private results based
313
+// on their collab roles, so each gets their own.
314
+func actorUserID(a policy.Actor) int64 {
315
+	if a.IsAnonymous {
316
+		return 0
317
+	}
318
+	return a.UserID
220319
 }
221320
 
222321
 func searchHref(q, tab string, page int) string {
internal/web/handlers/search/tabs_cache.goadded
@@ -0,0 +1,88 @@
1
+// SPDX-License-Identifier: AGPL-3.0-or-later
2
+
3
+package search
4
+
5
+import (
6
+	"fmt"
7
+	"strings"
8
+	"time"
9
+
10
+	"github.com/tenseleyFlow/shithub/internal/cache/lru"
11
+	srch "github.com/tenseleyFlow/shithub/internal/search"
12
+)
13
+
14
// tabsCacheKey is the (canonical query, viewer) pair every distinct
// tab-count slice maps to. Anonymous viewers are folded to userID 0
// and so share one cache slot per query (their visibility is the
// same: public-only). Authed viewers each get their own slot because
// what they can read differs (private repos they collaborate on,
// etc.) and the tab counts must reflect that — sharing the slice
// across viewers would leak the existence of private results.
//
// The query component comes from canonicalizeQuery, which lowercases
// and collapses whitespace so q="foo" and q="FOO " land in the same
// slot while distinct parsed filters stay distinct.
type tabsCacheKey struct {
	q      string // canonical query string (see canonicalizeQuery)
	userID int64  // viewer user_id; 0 for anonymous
}
31
+
32
// tabsCacheTTL is short enough that stale counts can't mislead an
// operator triaging a recent push (fresh counts surface within 30s),
// long enough to absorb the dashboard-style "user types in the
// search box, browser auto-fires repeatedly" pattern. tabsCacheSize
// bounds memory under arbitrary query rotation.
const (
	tabsCacheTTL  = 30 * time.Second
	tabsCacheSize = 1024
)
40
+
41
// tabsCache wraps a small TTL'd LRU around the per-(query, viewer)
// tab-count slice the searchTabs renderer needs. Pre-fix the
// renderer fired 5 count-only FTS queries on EVERY GET /search
// render — six DB queries per page, since the active tab also runs
// its own search. With the cache, a hot query's steady-state cost is
// one lookup plus the active tab's (uncached) result query.
//
// Lookups are single-flighted via lru.Group so a thundering herd on
// the same (query, viewer) key collapses into one upstream wave.
type tabsCache struct {
	g *lru.Group[tabsCacheKey, []searchTab]
}
54
+
55
+func newTabsCache() *tabsCache {
56
+	c := lru.NewWithTTL[tabsCacheKey, []searchTab](tabsCacheSize, tabsCacheTTL)
57
+	g := lru.NewGroup(c, func(k tabsCacheKey) string {
58
+		return fmt.Sprintf("%d|%s", k.userID, k.q)
59
+	})
60
+	return &tabsCache{g: g}
61
+}
62
+
63
+// canonicalizeQuery returns a stable string key for ParsedQuery.
64
+// Two raw queries that parse identically produce the same key.
65
+func canonicalizeQuery(p srch.ParsedQuery) string {
66
+	var b strings.Builder
67
+	b.WriteString("t=")
68
+	b.WriteString(strings.ToLower(strings.Join(strings.Fields(p.Text), " ")))
69
+	if p.Phrase != "" {
70
+		b.WriteString("|p=")
71
+		b.WriteString(strings.ToLower(p.Phrase))
72
+	}
73
+	if p.RepoFilter != nil {
74
+		b.WriteString("|r=")
75
+		b.WriteString(strings.ToLower(p.RepoFilter.Owner))
76
+		b.WriteString("/")
77
+		b.WriteString(strings.ToLower(p.RepoFilter.Name))
78
+	}
79
+	if p.StateFilter != "" {
80
+		b.WriteString("|s=")
81
+		b.WriteString(strings.ToLower(p.StateFilter))
82
+	}
83
+	if p.AuthorFilter != "" {
84
+		b.WriteString("|a=")
85
+		b.WriteString(strings.ToLower(p.AuthorFilter))
86
+	}
87
+	return b.String()
88
+}
internal/web/handlers/search/tabs_cache_test.goadded
@@ -0,0 +1,132 @@
1
+// SPDX-License-Identifier: AGPL-3.0-or-later
2
+
3
+package search
4
+
5
+import (
6
+	"context"
7
+	"testing"
8
+
9
+	srch "github.com/tenseleyFlow/shithub/internal/search"
10
+)
11
+
12
+// TestCanonicalizeQuery_StableAcrossWhitespaceAndCase pins the cache
13
+// key contract: equivalent ParsedQuery values produce equal cache
14
+// keys regardless of original whitespace or letter case. Without
15
+// this, the cache hit-rate would collapse on common query variants.
16
+func TestCanonicalizeQuery_StableAcrossWhitespaceAndCase(t *testing.T) {
17
+	t.Parallel()
18
+
19
+	cases := []struct {
20
+		name string
21
+		a, b srch.ParsedQuery
22
+	}{
23
+		{
24
+			"casing",
25
+			srch.ParsedQuery{Text: "FooBar"},
26
+			srch.ParsedQuery{Text: "foobar"},
27
+		},
28
+		{
29
+			"whitespace",
30
+			srch.ParsedQuery{Text: "foo  bar"},
31
+			srch.ParsedQuery{Text: " foo bar "},
32
+		},
33
+		{
34
+			"phrase casing",
35
+			srch.ParsedQuery{Text: "x", Phrase: "Hello World"},
36
+			srch.ParsedQuery{Text: "x", Phrase: "hello world"},
37
+		},
38
+		{
39
+			"repo filter casing",
40
+			srch.ParsedQuery{Text: "x", RepoFilter: &srch.RepoFilter{Owner: "Alice", Name: "Repo"}},
41
+			srch.ParsedQuery{Text: "x", RepoFilter: &srch.RepoFilter{Owner: "alice", Name: "repo"}},
42
+		},
43
+		{
44
+			"state casing",
45
+			srch.ParsedQuery{Text: "x", StateFilter: "OPEN"},
46
+			srch.ParsedQuery{Text: "x", StateFilter: "open"},
47
+		},
48
+	}
49
+	for _, tc := range cases {
50
+		t.Run(tc.name, func(t *testing.T) {
51
+			t.Parallel()
52
+			ka := canonicalizeQuery(tc.a)
53
+			kb := canonicalizeQuery(tc.b)
54
+			if ka != kb {
55
+				t.Fatalf("canonicalizeQuery diverged:\n a=%q\n b=%q", ka, kb)
56
+			}
57
+		})
58
+	}
59
+}
60
+
61
+// TestCanonicalizeQuery_DistinctFiltersDistinct pins that DIFFERENT
62
+// queries don't collide. Same Text but different filters MUST produce
63
+// different keys — otherwise a viewer with no access to a private
64
+// `repo:secret` could see its result count via cache pollution from
65
+// a separate query.
66
+func TestCanonicalizeQuery_DistinctFiltersDistinct(t *testing.T) {
67
+	t.Parallel()
68
+
69
+	base := srch.ParsedQuery{Text: "foo"}
70
+	variants := []srch.ParsedQuery{
71
+		{Text: "foo", Phrase: "exact"},
72
+		{Text: "foo", RepoFilter: &srch.RepoFilter{Owner: "a", Name: "b"}},
73
+		{Text: "foo", StateFilter: "open"},
74
+		{Text: "foo", AuthorFilter: "alice"},
75
+	}
76
+	baseKey := canonicalizeQuery(base)
77
+	for i, v := range variants {
78
+		got := canonicalizeQuery(v)
79
+		if got == baseKey {
80
+			t.Errorf("variant %d collides with base: %q", i, got)
81
+		}
82
+	}
83
+}
84
+
85
+// TestTabsCache_HitOnSameKey pins the cache contract:
86
+// repeated lookups for the same (query, viewer) within the TTL hit
87
+// the cache and the fetcher runs at most once.
88
+func TestTabsCache_HitOnSameKey(t *testing.T) {
89
+	t.Parallel()
90
+
91
+	cache := newTabsCache()
92
+	key := tabsCacheKey{q: "t=foo", userID: 7}
93
+
94
+	calls := 0
95
+	want := []searchTab{{Key: "code", Count: 42}}
96
+	for i := 0; i < 5; i++ {
97
+		got, err := cache.g.Do(context.Background(), key, func(_ context.Context) ([]searchTab, error) {
98
+			calls++
99
+			return want, nil
100
+		})
101
+		if err != nil {
102
+			t.Fatalf("Do: %v", err)
103
+		}
104
+		if len(got) != 1 || got[0].Count != 42 {
105
+			t.Fatalf("got = %+v", got)
106
+		}
107
+	}
108
+	if calls != 1 {
109
+		t.Fatalf("fetcher invoked %d times; want 1", calls)
110
+	}
111
+}
112
+
113
+// TestTabsCache_DistinctKeysIsolated pins the per-viewer isolation
114
+// invariant: Alice's count must not leak to Bob's render even when
115
+// the canonicalized query matches.
116
+func TestTabsCache_DistinctKeysIsolated(t *testing.T) {
117
+	t.Parallel()
118
+
119
+	cache := newTabsCache()
120
+	alice := tabsCacheKey{q: "t=foo", userID: 1}
121
+	bob := tabsCacheKey{q: "t=foo", userID: 2}
122
+
123
+	a, _ := cache.g.Do(context.Background(), alice, func(_ context.Context) ([]searchTab, error) {
124
+		return []searchTab{{Key: "repositories", Count: 10}}, nil
125
+	})
126
+	b, _ := cache.g.Do(context.Background(), bob, func(_ context.Context) ([]searchTab, error) {
127
+		return []searchTab{{Key: "repositories", Count: 99}}, nil
128
+	})
129
+	if a[0].Count == b[0].Count {
130
+		t.Fatalf("alice and bob got the same count — visibility leak")
131
+	}
132
+}