| 1 | -- SPDX-License-Identifier: AGPL-3.0-or-later |
| 2 | -- |
| 3 | -- S28 code search index. |
| 4 | -- |
| 5 | -- Two tables, both scoped to a repo's default branch (named in |
| 6 | -- `ref_name` so we don't lock in "default" semantics — the worker |
| 7 | -- can index a different ref later if we expand v1): |
| 8 | -- |
| 9 | -- code_search_paths — per-(repo, ref, path), tsvector on the |
| 10 | -- path string. Always populated regardless |
| 11 | -- of file size (cheap). |
| 12 | -- code_search_content — per-(repo, ref, path), tsvector on file |
| 13 | -- contents AND a trigram column for camel- |
| 14 | -- /snake-case identifier substring matches |
| 15 | -- that the FTS tokenizer mangles. Skipped |
| 16 | -- when the file is > 256 KiB or non-text. |
| 17 | -- |
| 18 | -- Both tables are rewritten by the `repo:index_code` worker job in |
| 19 | -- a single tx (delete-then-insert) so readers never see a partial |
| 20 | -- index. The atomic-swap shape lives in the worker, not here. |
| 21 | -- |
| 22 | -- `last_indexed_oid` on `repos` lets the reconciler detect drift |
| 23 | -- (default_branch_oid moved but last_indexed_oid didn't catch up). |
| 24 | |
| 25 | -- +goose Up |
| 26 | |
| 27 | ALTER TABLE repos |
| 28 | ADD COLUMN last_indexed_oid text; |
| 29 | |
| 30 | CREATE TABLE code_search_paths ( |
| 31 | repo_id bigint NOT NULL REFERENCES repos(id) ON DELETE CASCADE, |
| 32 | ref_name text NOT NULL, |
| 33 | path text NOT NULL, |
| 34 | tsv tsvector NOT NULL, |
| 35 | PRIMARY KEY (repo_id, ref_name, path) |
| 36 | ); |
| 37 | |
| 38 | CREATE INDEX code_search_paths_tsv_idx |
| 39 | ON code_search_paths USING GIN (tsv); |
| 40 | |
| 41 | CREATE INDEX code_search_paths_path_trgm_idx |
| 42 | ON code_search_paths USING GIN (path gin_trgm_ops); |
| 43 | |
| 44 | CREATE TABLE code_search_content ( |
| 45 | repo_id bigint NOT NULL REFERENCES repos(id) ON DELETE CASCADE, |
| 46 | ref_name text NOT NULL, |
| 47 | path text NOT NULL, |
| 48 | content_tsv tsvector NOT NULL, |
| 49 | content_trgm text NOT NULL, |
| 50 | PRIMARY KEY (repo_id, ref_name, path) |
| 51 | ); |
| 52 | |
| 53 | CREATE INDEX code_search_content_tsv_idx |
| 54 | ON code_search_content USING GIN (content_tsv); |
| 55 | |
| 56 | -- Trigram on content for substring + identifier matches. The |
| 57 | -- column carries the (truncated) raw text; pg_trgm builds the |
| 58 | -- index off it. Truncate to 64 KiB at the worker layer to keep |
| 59 | -- pg_trgm rows bounded. |
| 60 | CREATE INDEX code_search_content_trgm_idx |
| 61 | ON code_search_content USING GIN (content_trgm gin_trgm_ops); |
| 62 | |
| 63 | -- +goose Down |
| 64 | DROP TABLE IF EXISTS code_search_content; |
| 65 | DROP TABLE IF EXISTS code_search_paths; |
| 66 | ALTER TABLE repos DROP COLUMN IF EXISTS last_indexed_oid; |
| 67 |