shithub Public

Watch 1 Fork 0 Star 0

MySQL · 2498 bytes Raw Blame History

  
        1
        -- SPDX-License-Identifier: AGPL-3.0-or-later
      
        2
        --
      
        3
        -- S28 code search index.
      
        4
        --
      
        5
        -- Two tables, both scoped to a repo's default branch (named in
      
        6
        -- `ref_name` so we don't lock in "default" semantics — the worker
      
        7
        -- can index a different ref later if we expand v1):
      
        8
        --
      
        9
        --   code_search_paths   — per-(repo, ref, path), tsvector on the
      
        10
        --                         path string. Always populated regardless
      
        11
        --                         of file size (cheap).
      
        12
        --   code_search_content — per-(repo, ref, path), tsvector on file
      
        13
        --                         contents AND a trigram column for camel-
      
        14
        --                         /snake-case identifier substring matches
      
        15
        --                         that the FTS tokenizer mangles. Skipped
      
        16
        --                         when the file is > 256 KiB or non-text.
      
        17
        --
      
        18
        -- Both tables are rewritten by the `repo:index_code` worker job in
      
        19
        -- a single tx (delete-then-insert) so readers never see a partial
      
        20
        -- index. The atomic-swap shape lives in the worker, not here.
      
        21
        --
      
        22
        -- `last_indexed_oid` on `repos` lets the reconciler detect drift
      
        23
        -- (default_branch_oid moved but last_indexed_oid didn't catch up).
      
        24
        
        25
        -- +goose Up
      
        26
        
        27
        ALTER TABLE repos
      
        28
            ADD COLUMN last_indexed_oid text;
      
        29
        
        30
        CREATE TABLE code_search_paths (
      
        31
            repo_id  bigint   NOT NULL REFERENCES repos(id) ON DELETE CASCADE,
      
        32
            ref_name text     NOT NULL,
      
        33
            path     text     NOT NULL,
      
        34
            tsv      tsvector NOT NULL,
      
        35
            PRIMARY KEY (repo_id, ref_name, path)
      
        36
        );
      
        37
        
        38
        CREATE INDEX code_search_paths_tsv_idx
      
        39
            ON code_search_paths USING GIN (tsv);
      
        40
        
        41
        CREATE INDEX code_search_paths_path_trgm_idx
      
        42
            ON code_search_paths USING GIN (path gin_trgm_ops);
      
        43
        
        44
        CREATE TABLE code_search_content (
      
        45
            repo_id     bigint   NOT NULL REFERENCES repos(id) ON DELETE CASCADE,
      
        46
            ref_name    text     NOT NULL,
      
        47
            path        text     NOT NULL,
      
        48
            content_tsv tsvector NOT NULL,
      
        49
            content_trgm text    NOT NULL,
      
        50
            PRIMARY KEY (repo_id, ref_name, path)
      
        51
        );
      
        52
        
        53
        CREATE INDEX code_search_content_tsv_idx
      
        54
            ON code_search_content USING GIN (content_tsv);
      
        55
        
        56
        -- Trigram on content for substring + identifier matches. The
      
        57
        -- column carries the (truncated) raw text; pg_trgm builds the
      
        58
        -- index off it. Truncate to 64 KiB at the worker layer to keep
      
        59
        -- pg_trgm rows bounded.
      
        60
        CREATE INDEX code_search_content_trgm_idx
      
        61
            ON code_search_content USING GIN (content_trgm gin_trgm_ops);
      
        62
        
        63
        -- +goose Down
      
        64
        DROP TABLE IF EXISTS code_search_content;
      
        65
        DROP TABLE IF EXISTS code_search_paths;
      
        66
        ALTER TABLE repos DROP COLUMN IF EXISTS last_indexed_oid;
      
        67

1	-- SPDX-License-Identifier: AGPL-3.0-or-later
2	--
3	-- S28 code search index.
4	--
5	-- Two tables, both scoped to a repo's default branch (named in
6	-- `ref_name` so we don't lock in "default" semantics — the worker
7	-- can index a different ref later if we expand v1):
8	--
9	-- code_search_paths — per-(repo, ref, path), tsvector on the
10	-- path string. Always populated regardless
11	-- of file size (cheap).
12	-- code_search_content — per-(repo, ref, path), tsvector on file
13	-- contents AND a trigram column for camel-
14	-- /snake-case identifier substring matches
15	-- that the FTS tokenizer mangles. Skipped
16	-- when the file is > 256 KiB or non-text.
17	--
18	-- Both tables are rewritten by the `repo:index_code` worker job in
19	-- a single tx (delete-then-insert) so readers never see a partial
20	-- index. The atomic-swap shape lives in the worker, not here.
21	--
22	-- `last_indexed_oid` on `repos` lets the reconciler detect drift
23	-- (default_branch_oid moved but last_indexed_oid didn't catch up).
24
25	-- +goose Up
26
27	ALTER TABLE repos
28	ADD COLUMN last_indexed_oid text;
29
30	CREATE TABLE code_search_paths (
31	repo_id bigint NOT NULL REFERENCES repos(id) ON DELETE CASCADE,
32	ref_name text NOT NULL,
33	path text NOT NULL,
34	tsv tsvector NOT NULL,
35	PRIMARY KEY (repo_id, ref_name, path)
36	);
37
38	CREATE INDEX code_search_paths_tsv_idx
39	ON code_search_paths USING GIN (tsv);
40
41	CREATE INDEX code_search_paths_path_trgm_idx
42	ON code_search_paths USING GIN (path gin_trgm_ops);
43
44	CREATE TABLE code_search_content (
45	repo_id bigint NOT NULL REFERENCES repos(id) ON DELETE CASCADE,
46	ref_name text NOT NULL,
47	path text NOT NULL,
48	content_tsv tsvector NOT NULL,
49	content_trgm text NOT NULL,
50	PRIMARY KEY (repo_id, ref_name, path)
51	);
52
53	CREATE INDEX code_search_content_tsv_idx
54	ON code_search_content USING GIN (content_tsv);
55
56	-- Trigram on content for substring + identifier matches. The
57	-- column carries the (truncated) raw text; pg_trgm builds the
58	-- index off it. Truncate to 64 KiB at the worker layer to keep
59	-- pg_trgm rows bounded.
60	CREATE INDEX code_search_content_trgm_idx
61	ON code_search_content USING GIN (content_trgm gin_trgm_ops);
62
63	-- +goose Down
64	DROP TABLE IF EXISTS code_search_content;
65	DROP TABLE IF EXISTS code_search_paths;
66	ALTER TABLE repos DROP COLUMN IF EXISTS last_indexed_oid;
67