MySQL · 7164 bytes Raw Blame History
1 -- SPDX-License-Identifier: AGPL-3.0-or-later
2 --
3 -- S28 search index tables for repos, issues, users.
4 --
5 -- Each table is keyed 1:1 with its source row and holds a tsvector
6 -- maintained by AFTER triggers on the source. We use a separate
7 -- table rather than a generated column on the source so:
8 -- 1. Existing query buckets don't have to change shape.
9 -- 2. The tsv column doesn't bloat every row read in unrelated
10 -- contexts (e.g. the repo list page would otherwise pay the
11 -- cost of pulling the tsv on every paint).
12 --
13 -- The tsv config is `english` everywhere — good enough for v1.
14 -- Multi-language content is post-MVP (would need per-document
15 -- language detection and a config switch).
16 --
17 -- `unaccent` is composed via the dictionary chain so accent stripping
18 -- happens before stemming; the search-side tsquery uses the same
19 -- chain so "café" matches "cafe".
20 --
21 -- Code search lives in 0032 — splitting because it's the bulkier
22 -- of the two and ships with its own worker job.
23
24 -- +goose Up
25
26 -- Build a custom dictionary chain that runs `unaccent` first, then
27 -- english stemming. The same config is used by both index-side and
28 -- query-side calls so accents normalize consistently on both sides.
29 -- Lazy: skip if it already exists (e.g. on a re-up after partial
30 -- failure).
31 -- +goose StatementBegin
32 DO $$
33 BEGIN
34 IF NOT EXISTS (SELECT 1 FROM pg_ts_config WHERE cfgname = 'shithub_search') THEN
35 CREATE TEXT SEARCH CONFIGURATION shithub_search (COPY = pg_catalog.english);
36 ALTER TEXT SEARCH CONFIGURATION shithub_search
37 ALTER MAPPING FOR hword, hword_part, word
38 WITH unaccent, english_stem;
39 END IF;
40 END $$;
41 -- +goose StatementEnd
42
43 -- ─── repos ─────────────────────────────────────────────────────────
44
45 CREATE TABLE repos_search (
46 repo_id bigint PRIMARY KEY REFERENCES repos(id) ON DELETE CASCADE,
47 tsv tsvector NOT NULL
48 );
49
50 CREATE INDEX repos_search_tsv_idx ON repos_search USING GIN (tsv);
51
52 -- +goose StatementBegin
53 CREATE OR REPLACE FUNCTION tg_repos_search_upsert() RETURNS trigger
54 LANGUAGE plpgsql AS $$
55 BEGIN
56 INSERT INTO repos_search (repo_id, tsv) VALUES (
57 NEW.id,
58 setweight(to_tsvector('shithub_search', coalesce(NEW.name::text, '')), 'A') ||
59 setweight(to_tsvector('shithub_search', coalesce(NEW.description, '')), 'B')
60 )
61 ON CONFLICT (repo_id) DO UPDATE
62 SET tsv = EXCLUDED.tsv;
63 RETURN NEW;
64 END;
65 $$;
66 -- +goose StatementEnd
67
68 CREATE TRIGGER repos_search_upsert
69 AFTER INSERT OR UPDATE OF name, description ON repos
70 FOR EACH ROW EXECUTE FUNCTION tg_repos_search_upsert();
71
72 -- Backfill any existing rows.
73 INSERT INTO repos_search (repo_id, tsv)
74 SELECT id,
75 setweight(to_tsvector('shithub_search', coalesce(name::text, '')), 'A') ||
76 setweight(to_tsvector('shithub_search', coalesce(description, '')), 'B')
77 FROM repos
78 ON CONFLICT (repo_id) DO NOTHING;
79
80 -- ─── issues ────────────────────────────────────────────────────────
81
82 CREATE TABLE issues_search (
83 issue_id bigint PRIMARY KEY REFERENCES issues(id) ON DELETE CASCADE,
84 repo_id bigint NOT NULL REFERENCES repos(id) ON DELETE CASCADE,
85 kind issue_kind NOT NULL,
86 state issue_state NOT NULL,
87 author_user_id bigint REFERENCES users(id) ON DELETE SET NULL,
88 tsv tsvector NOT NULL
89 );
90
91 CREATE INDEX issues_search_tsv_idx ON issues_search USING GIN (tsv);
92 CREATE INDEX issues_search_repo_idx ON issues_search (repo_id);
93 CREATE INDEX issues_search_state_idx ON issues_search (state);
94 CREATE INDEX issues_search_author_idx ON issues_search (author_user_id) WHERE author_user_id IS NOT NULL;
95
96 -- +goose StatementBegin
97 CREATE OR REPLACE FUNCTION tg_issues_search_upsert() RETURNS trigger
98 LANGUAGE plpgsql AS $$
99 BEGIN
100 INSERT INTO issues_search (issue_id, repo_id, kind, state, author_user_id, tsv) VALUES (
101 NEW.id, NEW.repo_id, NEW.kind, NEW.state, NEW.author_user_id,
102 setweight(to_tsvector('shithub_search', coalesce(NEW.title, '')), 'A') ||
103 setweight(to_tsvector('shithub_search', coalesce(NEW.body, '')), 'B')
104 )
105 ON CONFLICT (issue_id) DO UPDATE
106 SET repo_id = EXCLUDED.repo_id,
107 kind = EXCLUDED.kind,
108 state = EXCLUDED.state,
109 author_user_id = EXCLUDED.author_user_id,
110 tsv = EXCLUDED.tsv;
111 RETURN NEW;
112 END;
113 $$;
114 -- +goose StatementEnd
115
116 CREATE TRIGGER issues_search_upsert
117 AFTER INSERT OR UPDATE OF title, body, state, kind ON issues
118 FOR EACH ROW EXECUTE FUNCTION tg_issues_search_upsert();
119
120 INSERT INTO issues_search (issue_id, repo_id, kind, state, author_user_id, tsv)
121 SELECT id, repo_id, kind, state, author_user_id,
122 setweight(to_tsvector('shithub_search', coalesce(title, '')), 'A') ||
123 setweight(to_tsvector('shithub_search', coalesce(body, '')), 'B')
124 FROM issues
125 ON CONFLICT (issue_id) DO NOTHING;
126
127 -- ─── users ─────────────────────────────────────────────────────────
128
129 CREATE TABLE users_search (
130 user_id bigint PRIMARY KEY REFERENCES users(id) ON DELETE CASCADE,
131 tsv tsvector NOT NULL
132 );
133
134 CREATE INDEX users_search_tsv_idx ON users_search USING GIN (tsv);
135
136 -- +goose StatementBegin
137 CREATE OR REPLACE FUNCTION tg_users_search_upsert() RETURNS trigger
138 LANGUAGE plpgsql AS $$
139 BEGIN
140 INSERT INTO users_search (user_id, tsv) VALUES (
141 NEW.id,
142 setweight(to_tsvector('shithub_search', coalesce(NEW.username::text, '')), 'A') ||
143 setweight(to_tsvector('shithub_search', coalesce(NEW.display_name, '')), 'B') ||
144 setweight(to_tsvector('shithub_search', coalesce(NEW.bio, '')), 'C')
145 )
146 ON CONFLICT (user_id) DO UPDATE
147 SET tsv = EXCLUDED.tsv;
148 RETURN NEW;
149 END;
150 $$;
151 -- +goose StatementEnd
152
153 CREATE TRIGGER users_search_upsert
154 AFTER INSERT OR UPDATE OF username, display_name, bio ON users
155 FOR EACH ROW EXECUTE FUNCTION tg_users_search_upsert();
156
157 INSERT INTO users_search (user_id, tsv)
158 SELECT id,
159 setweight(to_tsvector('shithub_search', coalesce(username::text, '')), 'A') ||
160 setweight(to_tsvector('shithub_search', coalesce(display_name, '')), 'B') ||
161 setweight(to_tsvector('shithub_search', coalesce(bio, '')), 'C')
162 FROM users
163 ON CONFLICT (user_id) DO NOTHING;
164
165 -- +goose Down
166 DROP TRIGGER IF EXISTS users_search_upsert ON users;
167 DROP FUNCTION IF EXISTS tg_users_search_upsert();
168 DROP TABLE IF EXISTS users_search;
169
170 DROP TRIGGER IF EXISTS issues_search_upsert ON issues;
171 DROP FUNCTION IF EXISTS tg_issues_search_upsert();
172 DROP TABLE IF EXISTS issues_search;
173
174 DROP TRIGGER IF EXISTS repos_search_upsert ON repos;
175 DROP FUNCTION IF EXISTS tg_repos_search_upsert();
176 DROP TABLE IF EXISTS repos_search;
177
178 DROP TEXT SEARCH CONFIGURATION IF EXISTS shithub_search;
179