MySQL · 2498 bytes Raw Blame History
1 -- SPDX-License-Identifier: AGPL-3.0-or-later
2 --
3 -- S28 code search index.
4 --
5 -- Two tables, both scoped to a repo's default branch (named in
6 -- `ref_name` so we don't lock in "default" semantics — the worker
7 -- can index a different ref later if we expand v1):
8 --
9 -- code_search_paths — per-(repo, ref, path), tsvector on the
10 -- path string. Always populated regardless
11 -- of file size (cheap).
12 -- code_search_content — per-(repo, ref, path), tsvector on file
13 -- contents AND a trigram column for camel-
14 -- /snake-case identifier substring matches
15 -- that the FTS tokenizer mangles. Skipped
16 -- when the file is > 256 KiB or non-text.
17 --
18 -- Both tables are rewritten by the `repo:index_code` worker job in
19 -- a single tx (delete-then-insert) so readers never see a partial
20 -- index. The atomic-swap shape lives in the worker, not here.
21 --
22 -- `last_indexed_oid` on `repos` lets the reconciler detect drift
23 -- (default_branch_oid moved but last_indexed_oid didn't catch up).
24
25 -- +goose Up
26
27 ALTER TABLE repos
28 ADD COLUMN last_indexed_oid text;
29
30 CREATE TABLE code_search_paths (
31 repo_id bigint NOT NULL REFERENCES repos(id) ON DELETE CASCADE,
32 ref_name text NOT NULL,
33 path text NOT NULL,
34 tsv tsvector NOT NULL,
35 PRIMARY KEY (repo_id, ref_name, path)
36 );
37
38 CREATE INDEX code_search_paths_tsv_idx
39 ON code_search_paths USING GIN (tsv);
40
41 CREATE INDEX code_search_paths_path_trgm_idx
42 ON code_search_paths USING GIN (path gin_trgm_ops);
43
44 CREATE TABLE code_search_content (
45 repo_id bigint NOT NULL REFERENCES repos(id) ON DELETE CASCADE,
46 ref_name text NOT NULL,
47 path text NOT NULL,
48 content_tsv tsvector NOT NULL,
49 content_trgm text NOT NULL,
50 PRIMARY KEY (repo_id, ref_name, path)
51 );
52
53 CREATE INDEX code_search_content_tsv_idx
54 ON code_search_content USING GIN (content_tsv);
55
56 -- Trigram on content for substring + identifier matches. The
57 -- column carries the (truncated) raw text; pg_trgm builds the
58 -- index off it. Truncate to 64 KiB at the worker layer to keep
59 -- pg_trgm rows bounded.
60 CREATE INDEX code_search_content_trgm_idx
61 ON code_search_content USING GIN (content_trgm gin_trgm_ops);
62
63 -- +goose Down
64 DROP TABLE IF EXISTS code_search_content;
65 DROP TABLE IF EXISTS code_search_paths;
66 ALTER TABLE repos DROP COLUMN IF EXISTS last_indexed_oid;
67