tenseleyflow/shithub / 8ed6f76

Browse files

S28: migrations 0030-0032 — extensions + search index tables + triggers

Authored by mfwolffe <wolffemf@dukes.jmu.edu>
SHA
8ed6f7600bccdd49e125723ed1ee10b3194d6bcb
Parents
b6e3156
Tree
00d551a

3 changed files

Status | File | + | -
A internal/migrationsfs/migrations/0030_search_extensions.sql 20 0
A internal/migrationsfs/migrations/0031_search_indexes.sql 178 0
A internal/migrationsfs/migrations/0032_code_search.sql 66 0
internal/migrationsfs/migrations/0030_search_extensions.sqladded
@@ -0,0 +1,20 @@
-- SPDX-License-Identifier: AGPL-3.0-or-later
--
-- Postgres extensions backing S28 search:
--
--   pg_trgm   — trigram similarity, used for code-identifier and
--               substring matching in the places the FTS tokenizer
--               falls short (camelCase, snake_case, mixed-language
--               code).
--   unaccent  — strips Latin diacritics so "café" matches "cafe"
--               when searching human names.
--
-- Both ship with PostgreSQL contrib; no external search server is
-- required.

-- +goose Up
CREATE EXTENSION IF NOT EXISTS pg_trgm;
CREATE EXTENSION IF NOT EXISTS unaccent;

-- +goose Down
-- Intentionally a no-op: later migrations may already depend on
-- these extensions, and DROP EXTENSION cascades to dependent
-- objects. Leaving them installed costs one catalog row apiece.
internal/migrationsfs/migrations/0031_search_indexes.sqladded
@@ -0,0 +1,178 @@
-- SPDX-License-Identifier: AGPL-3.0-or-later
--
-- S28 search index tables for repos, issues, and users.
--
-- Each table is keyed 1:1 with its source row and carries a tsvector
-- kept current by AFTER triggers on the source table. A side table
-- (rather than a generated column on the source) means:
--   1. Existing query buckets keep their shape.
--   2. Unrelated reads of the source row don't pay for the tsv
--      column (e.g. the repo list page never pulls it).
--
-- All tsvectors use the `english` config — good enough for v1.
-- Multi-language content is post-MVP (needs per-document language
-- detection plus a config switch).
--
-- `unaccent` sits first in the dictionary chain so accents are
-- stripped before stemming; the query side uses the same config,
-- which is how "café" ends up matching "cafe".
--
-- Code search is in 0032 — split out because it is the bulkier of
-- the two and ships with its own worker job.

-- +goose Up

-- Custom dictionary chain: `unaccent` first, then english stemming.
-- Both to_tsvector and to_tsquery callers use this one config so
-- normalization is identical on the index and query sides.
-- Guarded so a re-up after a partial failure doesn't error on an
-- existing configuration.
-- +goose StatementBegin
DO $$
BEGIN
    IF NOT EXISTS (SELECT 1 FROM pg_ts_config WHERE cfgname = 'shithub_search') THEN
        CREATE TEXT SEARCH CONFIGURATION shithub_search (COPY = pg_catalog.english);
        ALTER TEXT SEARCH CONFIGURATION shithub_search
            ALTER MAPPING FOR hword, hword_part, word
            WITH unaccent, english_stem;
    END IF;
END $$;
-- +goose StatementEnd
-- ─── repos ─────────────────────────────────────────────────────────

-- 1:1 with repos; the row disappears with its repo via the FK.
CREATE TABLE repos_search (
    repo_id   bigint   PRIMARY KEY REFERENCES repos(id) ON DELETE CASCADE,
    tsv       tsvector NOT NULL
);

CREATE INDEX repos_search_tsv_idx ON repos_search USING GIN (tsv);

-- Keeps repos_search in step with repos. Weight A on the name and
-- B on the description so name hits rank above description hits.
-- +goose StatementBegin
CREATE OR REPLACE FUNCTION tg_repos_search_upsert() RETURNS trigger
    LANGUAGE plpgsql AS $$
DECLARE
    doc tsvector;
BEGIN
    doc := setweight(to_tsvector('shithub_search', coalesce(NEW.name::text, '')), 'A')
        || setweight(to_tsvector('shithub_search', coalesce(NEW.description, '')), 'B');

    INSERT INTO repos_search (repo_id, tsv)
    VALUES (NEW.id, doc)
    ON CONFLICT (repo_id) DO UPDATE SET tsv = EXCLUDED.tsv;

    RETURN NEW;
END;
$$;
-- +goose StatementEnd

-- Only name/description feed the tsvector, so only those columns
-- need to fire the trigger.
CREATE TRIGGER repos_search_upsert
    AFTER INSERT OR UPDATE OF name, description ON repos
    FOR EACH ROW EXECUTE FUNCTION tg_repos_search_upsert();

-- Backfill rows that predate this migration; skip any already done.
INSERT INTO repos_search (repo_id, tsv)
SELECT id,
       setweight(to_tsvector('shithub_search', coalesce(name::text, '')), 'A') ||
       setweight(to_tsvector('shithub_search', coalesce(description, '')), 'B')
FROM repos
ON CONFLICT (repo_id) DO NOTHING;
-- ─── issues ────────────────────────────────────────────────────────

-- 1:1 with issues. repo_id/kind/state/author_user_id are
-- denormalized here so search queries can filter without joining
-- back to issues.
CREATE TABLE issues_search (
    issue_id        bigint   PRIMARY KEY REFERENCES issues(id) ON DELETE CASCADE,
    repo_id         bigint   NOT NULL REFERENCES repos(id) ON DELETE CASCADE,
    kind            issue_kind NOT NULL,
    state           issue_state NOT NULL,
    author_user_id  bigint   REFERENCES users(id) ON DELETE SET NULL,
    tsv             tsvector NOT NULL
);

CREATE INDEX issues_search_tsv_idx       ON issues_search USING GIN (tsv);
CREATE INDEX issues_search_repo_idx      ON issues_search (repo_id);
CREATE INDEX issues_search_state_idx     ON issues_search (state);
CREATE INDEX issues_search_author_idx    ON issues_search (author_user_id) WHERE author_user_id IS NOT NULL;

-- Mirrors the issue row into issues_search. Title is weighted A,
-- body B, so title hits rank above body hits.
-- +goose StatementBegin
CREATE OR REPLACE FUNCTION tg_issues_search_upsert() RETURNS trigger
    LANGUAGE plpgsql AS $$
BEGIN
    INSERT INTO issues_search (issue_id, repo_id, kind, state, author_user_id, tsv) VALUES (
        NEW.id, NEW.repo_id, NEW.kind, NEW.state, NEW.author_user_id,
        setweight(to_tsvector('shithub_search', coalesce(NEW.title, '')), 'A') ||
        setweight(to_tsvector('shithub_search', coalesce(NEW.body, '')), 'B')
    )
    ON CONFLICT (issue_id) DO UPDATE
        SET repo_id        = EXCLUDED.repo_id,
            kind           = EXCLUDED.kind,
            state          = EXCLUDED.state,
            author_user_id = EXCLUDED.author_user_id,
            tsv            = EXCLUDED.tsv;
    RETURN NEW;
END;
$$;
-- +goose StatementEnd

-- Fire on every column the search row is derived from. repo_id and
-- author_user_id are in the list because they are denormalized into
-- issues_search; without them an issue transfer or author change
-- would leave the search row's filter columns stale.
CREATE TRIGGER issues_search_upsert
    AFTER INSERT OR UPDATE OF title, body, state, kind, repo_id, author_user_id ON issues
    FOR EACH ROW EXECUTE FUNCTION tg_issues_search_upsert();

-- Backfill rows that predate this migration; skip any already done.
INSERT INTO issues_search (issue_id, repo_id, kind, state, author_user_id, tsv)
SELECT id, repo_id, kind, state, author_user_id,
       setweight(to_tsvector('shithub_search', coalesce(title, '')), 'A') ||
       setweight(to_tsvector('shithub_search', coalesce(body, '')), 'B')
FROM issues
ON CONFLICT (issue_id) DO NOTHING;
126
+
127
+-- ─── users ─────────────────────────────────────────────────────────
128
+
129
+CREATE TABLE users_search (
130
+    user_id bigint   PRIMARY KEY REFERENCES users(id) ON DELETE CASCADE,
131
+    tsv     tsvector NOT NULL
132
+);
133
+
134
+CREATE INDEX users_search_tsv_idx ON users_search USING GIN (tsv);
135
+
136
+-- +goose StatementBegin
137
+CREATE OR REPLACE FUNCTION tg_users_search_upsert() RETURNS trigger
138
+    LANGUAGE plpgsql AS $$
139
+BEGIN
140
+    INSERT INTO users_search (user_id, tsv) VALUES (
141
+        NEW.id,
142
+        setweight(to_tsvector('shithub_search', coalesce(NEW.username::text, '')), 'A') ||
143
+        setweight(to_tsvector('shithub_search', coalesce(NEW.display_name, '')), 'B') ||
144
+        setweight(to_tsvector('shithub_search', coalesce(NEW.bio, '')), 'C')
145
+    )
146
+    ON CONFLICT (user_id) DO UPDATE
147
+        SET tsv = EXCLUDED.tsv;
148
+    RETURN NEW;
149
+END;
150
+$$;
151
+-- +goose StatementEnd
152
+
153
+CREATE TRIGGER users_search_upsert
154
+    AFTER INSERT OR UPDATE OF username, display_name, bio ON users
155
+    FOR EACH ROW EXECUTE FUNCTION tg_users_search_upsert();
156
+
157
+INSERT INTO users_search (user_id, tsv)
158
+SELECT id,
159
+       setweight(to_tsvector('shithub_search', coalesce(username::text, '')), 'A') ||
160
+       setweight(to_tsvector('shithub_search', coalesce(display_name, '')), 'B') ||
161
+       setweight(to_tsvector('shithub_search', coalesce(bio, '')), 'C')
162
+FROM users
163
+ON CONFLICT (user_id) DO NOTHING;
-- +goose Down
-- Per entity: trigger first (it references the function), then the
-- function, then the table. Entities themselves are independent.
DROP TRIGGER IF EXISTS users_search_upsert ON users;
DROP FUNCTION IF EXISTS tg_users_search_upsert();
DROP TABLE IF EXISTS users_search;

DROP TRIGGER IF EXISTS issues_search_upsert ON issues;
DROP FUNCTION IF EXISTS tg_issues_search_upsert();
DROP TABLE IF EXISTS issues_search;

DROP TRIGGER IF EXISTS repos_search_upsert ON repos;
DROP FUNCTION IF EXISTS tg_repos_search_upsert();
DROP TABLE IF EXISTS repos_search;

-- Last: nothing references the config once the trigger functions
-- are gone.
DROP TEXT SEARCH CONFIGURATION IF EXISTS shithub_search;
internal/migrationsfs/migrations/0032_code_search.sqladded
@@ -0,0 +1,66 @@
-- SPDX-License-Identifier: AGPL-3.0-or-later
--
-- S28 code search index.
--
-- Two tables, each scoped to a repo's default branch. The branch is
-- named explicitly in `ref_name` rather than implied, so the worker
-- can index additional refs later without a schema change:
--
--   code_search_paths   — one row per (repo, ref, path) with a
--                         tsvector over the path string. Populated
--                         for every file regardless of size (cheap).
--   code_search_content — one row per (repo, ref, path) with a
--                         tsvector over the file contents plus a
--                         raw-text column for trigram matching of
--                         camel-/snake-case identifiers that the
--                         FTS tokenizer mangles. Skipped for files
--                         over 256 KiB or non-text files.
--
-- The `repo:index_code` worker job rewrites both tables in one
-- transaction (delete-then-insert), so readers never observe a
-- partially built index. That atomic-swap logic lives in the
-- worker, not in this schema.
--
-- `repos.last_indexed_oid` is the reconciler's drift detector: if
-- default_branch_oid has moved past last_indexed_oid, a re-index is
-- due.

-- +goose Up

ALTER TABLE repos
    ADD COLUMN last_indexed_oid text;

CREATE TABLE code_search_paths (
    repo_id  bigint   NOT NULL REFERENCES repos(id) ON DELETE CASCADE,
    ref_name text     NOT NULL,
    path     text     NOT NULL,
    tsv      tsvector NOT NULL,
    PRIMARY KEY (repo_id, ref_name, path)
);

CREATE INDEX code_search_paths_tsv_idx
    ON code_search_paths USING GIN (tsv);

CREATE INDEX code_search_paths_path_trgm_idx
    ON code_search_paths USING GIN (path gin_trgm_ops);

CREATE TABLE code_search_content (
    repo_id     bigint   NOT NULL REFERENCES repos(id) ON DELETE CASCADE,
    ref_name    text     NOT NULL,
    path        text     NOT NULL,
    content_tsv tsvector NOT NULL,
    content_trgm text    NOT NULL,
    PRIMARY KEY (repo_id, ref_name, path)
);

CREATE INDEX code_search_content_tsv_idx
    ON code_search_content USING GIN (content_tsv);

-- Trigram index over the raw content for substring and identifier
-- matches. The column holds the (truncated) raw text and pg_trgm
-- indexes it; the worker truncates to 64 KiB so pg_trgm row sizes
-- stay bounded.
CREATE INDEX code_search_content_trgm_idx
    ON code_search_content USING GIN (content_trgm gin_trgm_ops);

-- +goose Down
DROP TABLE IF EXISTS code_search_content;
DROP TABLE IF EXISTS code_search_paths;
ALTER TABLE repos DROP COLUMN IF EXISTS last_indexed_oid;