tenseleyflow/shithub / 058db02

Browse files

actions/expr: rune-aware lexer (S41a-M1)

Before this fix, the lexer iterated byte-by-byte and called
unicode.IsLetter on a rune cast from a single byte — an approach that
only handles ASCII + Latin-1 correctly. Multi-byte UTF-8 sequences
either fed it leading bytes that happened to test as letters
(false-positive identifier characters) or fell into the default arm
with a confusing 'unexpected character' error pointing at a
continuation byte.

This change replaces the main loop and lexIdent with
utf8.DecodeRuneInString walks; isIdentStart/isIdentChar now take a
rune. Operators built from single-byte ASCII characters (==, !=, &&,
||) keep their fast path, since those bytes can never collide with
UTF-8 continuation bytes (which are always >= 0x80).

Surfaces invalid UTF-8 with a precise 'invalid UTF-8 at offset N'
error instead of letting it through.

This is a quality fix, not a security one: the byte-level lexer
already failed closed (it rejected malformed input). The change closes
that door consistently and gives expression authors correct
diagnostics.
Authored by mfwolffe <wolffemf@dukes.jmu.edu>
SHA
058db020eb4f77f7829b2cc46fb28aa2c73a1e2f
Parents
9c71d87
Tree
6146048

1 changed file

StatusFile+-
M internal/actions/expr/lex.go 41 25
internal/actions/expr/lex.gomodified
@@ -28,6 +28,7 @@ import (
2828
 	"fmt"
2929
 	"strings"
3030
 	"unicode"
31
+	"unicode/utf8"
3132
 )
3233
 
3334
 // TokenKind classifies a lexed token.
@@ -96,68 +97,76 @@ func (k TokenKind) String() string {
9697
 // Lex returns the token stream for src or an error on the first lexical
9798
 // problem. Whitespace is skipped silently. The lexer doesn't strip the
9899
 // surrounding `${{ … }}` — the caller does that before calling Lex.
100
+//
101
+// The main loop walks runes (not bytes) so multi-byte UTF-8 characters
102
+// in identifiers — e.g., a Greek-letter variable name in an env-bag
103
+// key — produce correct identifier boundaries instead of mangled
104
+// "unexpected character" errors at continuation bytes.
99105
 func Lex(src string) ([]Token, error) {
100106
 	var out []Token
101107
 	i := 0
102108
 	for i < len(src) {
103
-		c := src[i]
109
+		r, size := utf8.DecodeRuneInString(src[i:])
110
+		if r == utf8.RuneError && size <= 1 {
111
+			return nil, fmt.Errorf("expr: invalid UTF-8 at offset %d", i)
112
+		}
104113
 		switch {
105
-		case c == ' ' || c == '\t' || c == '\n' || c == '\r':
106
-			i++
107
-		case c == '.':
114
+		case r == ' ' || r == '\t' || r == '\n' || r == '\r':
115
+			i += size
116
+		case r == '.':
108117
 			out = append(out, Token{Kind: TokDot, Value: ".", Pos: i})
109
-			i++
110
-		case c == '(':
118
+			i += size
119
+		case r == '(':
111120
 			out = append(out, Token{Kind: TokLParen, Value: "(", Pos: i})
112
-			i++
113
-		case c == ')':
121
+			i += size
122
+		case r == ')':
114123
 			out = append(out, Token{Kind: TokRParen, Value: ")", Pos: i})
115
-			i++
116
-		case c == ',':
124
+			i += size
125
+		case r == ',':
117126
 			out = append(out, Token{Kind: TokComma, Value: ",", Pos: i})
118
-			i++
119
-		case c == '\'':
127
+			i += size
128
+		case r == '\'':
120129
 			tok, n, err := lexString(src[i:], i)
121130
 			if err != nil {
122131
 				return nil, err
123132
 			}
124133
 			out = append(out, tok)
125134
 			i += n
126
-		case c == '&':
135
+		case r == '&':
127136
 			if i+1 < len(src) && src[i+1] == '&' {
128137
 				out = append(out, Token{Kind: TokAnd, Value: "&&", Pos: i})
129138
 				i += 2
130139
 			} else {
131140
 				return nil, fmt.Errorf("expr: stray '&' at offset %d (expected '&&')", i)
132141
 			}
133
-		case c == '|':
142
+		case r == '|':
134143
 			if i+1 < len(src) && src[i+1] == '|' {
135144
 				out = append(out, Token{Kind: TokOr, Value: "||", Pos: i})
136145
 				i += 2
137146
 			} else {
138147
 				return nil, fmt.Errorf("expr: stray '|' at offset %d (expected '||')", i)
139148
 			}
140
-		case c == '!':
149
+		case r == '!':
141150
 			if i+1 < len(src) && src[i+1] == '=' {
142151
 				out = append(out, Token{Kind: TokNe, Value: "!=", Pos: i})
143152
 				i += 2
144153
 			} else {
145154
 				out = append(out, Token{Kind: TokNot, Value: "!", Pos: i})
146
-				i++
155
+				i += size
147156
 			}
148
-		case c == '=':
157
+		case r == '=':
149158
 			if i+1 < len(src) && src[i+1] == '=' {
150159
 				out = append(out, Token{Kind: TokEq, Value: "==", Pos: i})
151160
 				i += 2
152161
 			} else {
153162
 				return nil, fmt.Errorf("expr: stray '=' at offset %d (expected '==')", i)
154163
 			}
155
-		case isIdentStart(c):
164
+		case isIdentStart(r):
156165
 			tok, n := lexIdent(src[i:], i)
157166
 			out = append(out, tok)
158167
 			i += n
159168
 		default:
160
-			return nil, fmt.Errorf("expr: unexpected character %q at offset %d", c, i)
169
+			return nil, fmt.Errorf("expr: unexpected character %q at offset %d", r, i)
161170
 		}
162171
 	}
163172
 	out = append(out, Token{Kind: TokEOF, Pos: i})
@@ -190,8 +199,15 @@ func lexString(src string, basePos int) (Token, int, error) {
190199
 
191200
 func lexIdent(src string, basePos int) (Token, int) {
192201
 	i := 0
193
-	for i < len(src) && isIdentChar(src[i]) {
194
-		i++
202
+	for i < len(src) {
203
+		r, size := utf8.DecodeRuneInString(src[i:])
204
+		if r == utf8.RuneError {
205
+			break
206
+		}
207
+		if !isIdentChar(r) {
208
+			break
209
+		}
210
+		i += size
195211
 	}
196212
 	v := src[:i]
197213
 	switch v {
@@ -203,10 +219,10 @@ func lexIdent(src string, basePos int) (Token, int) {
203219
 	return Token{Kind: TokIdent, Value: v, Pos: basePos}, i
204220
 }
205221
 
206
-func isIdentStart(c byte) bool {
207
-	return unicode.IsLetter(rune(c)) || c == '_'
222
+func isIdentStart(r rune) bool {
223
+	return unicode.IsLetter(r) || r == '_'
208224
 }
209225
 
210
-func isIdentChar(c byte) bool {
211
-	return unicode.IsLetter(rune(c)) || unicode.IsDigit(rune(c)) || c == '_'
226
+func isIdentChar(r rune) bool {
227
+	return unicode.IsLetter(r) || unicode.IsDigit(r) || r == '_'
212228
 }