| 1 | // SPDX-License-Identifier: AGPL-3.0-or-later |
| 2 | |
| 3 | // Package expr is the strict-allowlist expression evaluator for |
| 4 | // `${{ … }}` blocks in workflow files. |
| 5 | // |
| 6 | // The evaluator is intentionally tiny: |
| 7 | // - Allowed namespaces: secrets, env, vars, shithub.event, shithub.run_id, |
| 8 | // shithub.sha, shithub.ref, shithub.actor. |
| 9 | // - Allowed functions: contains, startsWith, endsWith, |
| 10 | // success(), failure(), always(), cancelled(). |
// - Operators: && || ! == != only — the grammar has no other
//   operators (no arithmetic, no concat operator) in v1.
| 13 | // |
| 14 | // Anything outside that set is an evaluation error. This is the load- |
| 15 | // bearing security surface — the more we accept, the more attack |
| 16 | // surface we open. Future expansion goes through a reviewer-required |
| 17 | // note in the commit message (per the campaign §"Risks": "block any |
| 18 | // S41 PR that adds an evaluator function without a security note"). |
| 19 | // |
| 20 | // Every produced Value carries a Tainted bool. References that |
| 21 | // resolve into the shithub.event.* namespace are tagged Tainted=true; |
| 22 | // taint propagates through string concatenation, comparisons (the |
| 23 | // boolean output isn't tainted, but the comparison operands' values |
| 24 | // are checked), and function returns. |
| 25 | package expr |
| 26 | |
| 27 | import ( |
| 28 | "fmt" |
| 29 | "strings" |
| 30 | "unicode" |
| 31 | "unicode/utf8" |
| 32 | ) |
| 33 | |
// TokenKind classifies a lexed token.
type TokenKind int

const (
	TokInvalid TokenKind = iota
	TokIdent   // foo, secrets, shithub
	TokDot     // .
	TokLParen  // (
	TokRParen  // )
	TokComma   // ,
	TokString  // 'literal' (single-quoted only — GHA convention)
	TokBool    // true | false
	TokNull    // null
	TokAnd     // &&
	TokOr      // ||
	TokNot     // !
	TokEq      // ==
	TokNe      // !=
	TokEOF
)

// Token is a single lexed unit. Pos is the byte offset in the original
// source (useful for diagnostic spans).
type Token struct {
	Kind  TokenKind
	Value string
	Pos   int
}

// tokenNames holds the human-readable name for each kind, indexed by
// the TokenKind value. TokInvalid deliberately has no entry; any kind
// without a name renders as "invalid".
var tokenNames = [...]string{
	TokIdent:  "identifier",
	TokDot:    ".",
	TokLParen: "(",
	TokRParen: ")",
	TokComma:  ",",
	TokString: "string literal",
	TokBool:   "boolean",
	TokNull:   "null",
	TokAnd:    "&&",
	TokOr:     "||",
	TokNot:    "!",
	TokEq:     "==",
	TokNe:     "!=",
	TokEOF:    "end of input",
}

// String renders the kind as the human-readable name used in parser
// diagnostics ("identifier", "&&", "end of input", …). Out-of-range or
// invalid kinds render as "invalid".
func (k TokenKind) String() string {
	if k > TokInvalid && int(k) < len(tokenNames) {
		if name := tokenNames[k]; name != "" {
			return name
		}
	}
	return "invalid"
}
| 96 | |
| 97 | // Lex returns the token stream for src or an error on the first lexical |
| 98 | // problem. Whitespace is skipped silently. The lexer doesn't strip the |
| 99 | // surrounding `${{ … }}` — the caller does that before calling Lex. |
| 100 | // |
| 101 | // The main loop walks runes (not bytes) so multi-byte UTF-8 characters |
| 102 | // in identifiers — e.g., a Greek-letter variable name in an env-bag |
| 103 | // key — produce correct identifier boundaries instead of mangled |
| 104 | // "unexpected character" errors at continuation bytes. |
| 105 | func Lex(src string) ([]Token, error) { |
| 106 | var out []Token |
| 107 | i := 0 |
| 108 | for i < len(src) { |
| 109 | r, size := utf8.DecodeRuneInString(src[i:]) |
| 110 | if r == utf8.RuneError && size <= 1 { |
| 111 | return nil, fmt.Errorf("expr: invalid UTF-8 at offset %d", i) |
| 112 | } |
| 113 | switch { |
| 114 | case r == ' ' || r == '\t' || r == '\n' || r == '\r': |
| 115 | i += size |
| 116 | case r == '.': |
| 117 | out = append(out, Token{Kind: TokDot, Value: ".", Pos: i}) |
| 118 | i += size |
| 119 | case r == '(': |
| 120 | out = append(out, Token{Kind: TokLParen, Value: "(", Pos: i}) |
| 121 | i += size |
| 122 | case r == ')': |
| 123 | out = append(out, Token{Kind: TokRParen, Value: ")", Pos: i}) |
| 124 | i += size |
| 125 | case r == ',': |
| 126 | out = append(out, Token{Kind: TokComma, Value: ",", Pos: i}) |
| 127 | i += size |
| 128 | case r == '\'': |
| 129 | tok, n, err := lexString(src[i:], i) |
| 130 | if err != nil { |
| 131 | return nil, err |
| 132 | } |
| 133 | out = append(out, tok) |
| 134 | i += n |
| 135 | case r == '&': |
| 136 | if i+1 < len(src) && src[i+1] == '&' { |
| 137 | out = append(out, Token{Kind: TokAnd, Value: "&&", Pos: i}) |
| 138 | i += 2 |
| 139 | } else { |
| 140 | return nil, fmt.Errorf("expr: stray '&' at offset %d (expected '&&')", i) |
| 141 | } |
| 142 | case r == '|': |
| 143 | if i+1 < len(src) && src[i+1] == '|' { |
| 144 | out = append(out, Token{Kind: TokOr, Value: "||", Pos: i}) |
| 145 | i += 2 |
| 146 | } else { |
| 147 | return nil, fmt.Errorf("expr: stray '|' at offset %d (expected '||')", i) |
| 148 | } |
| 149 | case r == '!': |
| 150 | if i+1 < len(src) && src[i+1] == '=' { |
| 151 | out = append(out, Token{Kind: TokNe, Value: "!=", Pos: i}) |
| 152 | i += 2 |
| 153 | } else { |
| 154 | out = append(out, Token{Kind: TokNot, Value: "!", Pos: i}) |
| 155 | i += size |
| 156 | } |
| 157 | case r == '=': |
| 158 | if i+1 < len(src) && src[i+1] == '=' { |
| 159 | out = append(out, Token{Kind: TokEq, Value: "==", Pos: i}) |
| 160 | i += 2 |
| 161 | } else { |
| 162 | return nil, fmt.Errorf("expr: stray '=' at offset %d (expected '==')", i) |
| 163 | } |
| 164 | case isIdentStart(r): |
| 165 | tok, n := lexIdent(src[i:], i) |
| 166 | out = append(out, tok) |
| 167 | i += n |
| 168 | default: |
| 169 | return nil, fmt.Errorf("expr: unexpected character %q at offset %d", r, i) |
| 170 | } |
| 171 | } |
| 172 | out = append(out, Token{Kind: TokEOF, Pos: i}) |
| 173 | return out, nil |
| 174 | } |
| 175 | |
| 176 | func lexString(src string, basePos int) (Token, int, error) { |
| 177 | if len(src) < 2 { |
| 178 | return Token{}, 0, fmt.Errorf("expr: unterminated string at offset %d", basePos) |
| 179 | } |
| 180 | // Walk until matching '. GHA expressions do NOT support backslash |
| 181 | // escapes; the only escape is doubling the quote: '' produces '. |
| 182 | var b strings.Builder |
| 183 | i := 1 // skip opening ' |
| 184 | for i < len(src) { |
| 185 | c := src[i] |
| 186 | if c == '\'' { |
| 187 | if i+1 < len(src) && src[i+1] == '\'' { |
| 188 | b.WriteByte('\'') |
| 189 | i += 2 |
| 190 | continue |
| 191 | } |
| 192 | return Token{Kind: TokString, Value: b.String(), Pos: basePos}, i + 1, nil |
| 193 | } |
| 194 | b.WriteByte(c) |
| 195 | i++ |
| 196 | } |
| 197 | return Token{}, 0, fmt.Errorf("expr: unterminated string at offset %d", basePos) |
| 198 | } |
| 199 | |
| 200 | func lexIdent(src string, basePos int) (Token, int) { |
| 201 | i := 0 |
| 202 | for i < len(src) { |
| 203 | r, size := utf8.DecodeRuneInString(src[i:]) |
| 204 | if r == utf8.RuneError { |
| 205 | break |
| 206 | } |
| 207 | if !isIdentChar(r) { |
| 208 | break |
| 209 | } |
| 210 | i += size |
| 211 | } |
| 212 | v := src[:i] |
| 213 | switch v { |
| 214 | case "true", "false": |
| 215 | return Token{Kind: TokBool, Value: v, Pos: basePos}, i |
| 216 | case "null": |
| 217 | return Token{Kind: TokNull, Value: v, Pos: basePos}, i |
| 218 | } |
| 219 | return Token{Kind: TokIdent, Value: v, Pos: basePos}, i |
| 220 | } |
| 221 | |
// isIdentStart reports whether r may begin an identifier: any Unicode
// letter, or an underscore.
func isIdentStart(r rune) bool {
	if r == '_' {
		return true
	}
	return unicode.IsLetter(r)
}
| 225 | |
// isIdentChar reports whether r may continue an identifier: any
// Unicode letter or digit, or an underscore.
func isIdentChar(r rune) bool {
	switch {
	case r == '_':
		return true
	case unicode.IsDigit(r):
		return true
	default:
		return unicode.IsLetter(r)
	}
}
| 229 |