@@ -28,6 +28,7 @@ import ( |
| 28 | 28 | "fmt" |
| 29 | 29 | "strings" |
| 30 | 30 | "unicode" |
| 31 | + "unicode/utf8" |
| 31 | 32 | ) |
| 32 | 33 | |
| 33 | 34 | // TokenKind classifies a lexed token. |
@@ -96,68 +97,76 @@ func (k TokenKind) String() string { |
| 96 | 97 | // Lex returns the token stream for src or an error on the first lexical |
| 97 | 98 | // problem. Whitespace is skipped silently. The lexer doesn't strip the |
| 98 | 99 | // surrounding `${{ … }}` — the caller does that before calling Lex. |
| 100 | +// |
| 101 | +// The main loop walks runes (not bytes) so multi-byte UTF-8 characters |
| 102 | +// in identifiers — e.g., a Greek-letter variable name in an env-bag |
| 103 | +// key — produce correct identifier boundaries instead of mangled |
| 104 | +// "unexpected character" errors at continuation bytes. |
| 99 | 105 | func Lex(src string) ([]Token, error) { |
| 100 | 106 | var out []Token |
| 101 | 107 | i := 0 |
| 102 | 108 | for i < len(src) { |
| 103 | | - c := src[i] |
| 109 | + r, size := utf8.DecodeRuneInString(src[i:]) |
| 110 | + if r == utf8.RuneError && size <= 1 { // size <= 1 distinguishes a bad encoding from a literal U+FFFD |
| 111 | + return nil, fmt.Errorf("expr: invalid UTF-8 at offset %d", i) |
| 112 | + } |
| 104 | 113 | switch { |
| 105 | | - case c == ' ' || c == '\t' || c == '\n' || c == '\r': |
| 106 | | - i++ |
| 107 | | - case c == '.': |
| 114 | + case r == ' ' || r == '\t' || r == '\n' || r == '\r': |
| 115 | + i += size |
| 116 | + case r == '.': |
| 108 | 117 | out = append(out, Token{Kind: TokDot, Value: ".", Pos: i}) |
| 109 | | - i++ |
| 110 | | - case c == '(': |
| 118 | + i += size |
| 119 | + case r == '(': |
| 111 | 120 | out = append(out, Token{Kind: TokLParen, Value: "(", Pos: i}) |
| 112 | | - i++ |
| 113 | | - case c == ')': |
| 121 | + i += size |
| 122 | + case r == ')': |
| 114 | 123 | out = append(out, Token{Kind: TokRParen, Value: ")", Pos: i}) |
| 115 | | - i++ |
| 116 | | - case c == ',': |
| 124 | + i += size |
| 125 | + case r == ',': |
| 117 | 126 | out = append(out, Token{Kind: TokComma, Value: ",", Pos: i}) |
| 118 | | - i++ |
| 119 | | - case c == '\'': |
| 127 | + i += size |
| 128 | + case r == '\'': |
| 120 | 129 | tok, n, err := lexString(src[i:], i) |
| 121 | 130 | if err != nil { |
| 122 | 131 | return nil, err |
| 123 | 132 | } |
| 124 | 133 | out = append(out, tok) |
| 125 | 134 | i += n |
| 126 | | - case c == '&': |
| 135 | + case r == '&': |
| 127 | 136 | if i+1 < len(src) && src[i+1] == '&' { |
| 128 | 137 | out = append(out, Token{Kind: TokAnd, Value: "&&", Pos: i}) |
| 129 | 138 | i += 2 |
| 130 | 139 | } else { |
| 131 | 140 | return nil, fmt.Errorf("expr: stray '&' at offset %d (expected '&&')", i) |
| 132 | 141 | } |
| 133 | | - case c == '|': |
| 142 | + case r == '|': |
| 134 | 143 | if i+1 < len(src) && src[i+1] == '|' { |
| 135 | 144 | out = append(out, Token{Kind: TokOr, Value: "||", Pos: i}) |
| 136 | 145 | i += 2 |
| 137 | 146 | } else { |
| 138 | 147 | return nil, fmt.Errorf("expr: stray '|' at offset %d (expected '||')", i) |
| 139 | 148 | } |
| 140 | | - case c == '!': |
| 149 | + case r == '!': |
| 141 | 150 | if i+1 < len(src) && src[i+1] == '=' { |
| 142 | 151 | out = append(out, Token{Kind: TokNe, Value: "!=", Pos: i}) |
| 143 | 152 | i += 2 |
| 144 | 153 | } else { |
| 145 | 154 | out = append(out, Token{Kind: TokNot, Value: "!", Pos: i}) |
| 146 | | - i++ |
| 155 | + i += size |
| 147 | 156 | } |
| 148 | | - case c == '=': |
| 157 | + case r == '=': |
| 149 | 158 | if i+1 < len(src) && src[i+1] == '=' { |
| 150 | 159 | out = append(out, Token{Kind: TokEq, Value: "==", Pos: i}) |
| 151 | 160 | i += 2 |
| 152 | 161 | } else { |
| 153 | 162 | return nil, fmt.Errorf("expr: stray '=' at offset %d (expected '==')", i) |
| 154 | 163 | } |
| 155 | | - case isIdentStart(c): |
| 164 | + case isIdentStart(r): |
| 156 | 165 | tok, n := lexIdent(src[i:], i) |
| 157 | 166 | out = append(out, tok) |
| 158 | 167 | i += n |
| 159 | 168 | default: |
| 160 | | - return nil, fmt.Errorf("expr: unexpected character %q at offset %d", c, i) |
| 169 | + return nil, fmt.Errorf("expr: unexpected character %q at offset %d", r, i) |
| 161 | 170 | } |
| 162 | 171 | } |
| 163 | 172 | out = append(out, Token{Kind: TokEOF, Pos: i}) |
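Taken together, the new loop means a multi-byte identifier lexes as a single token instead of failing at its continuation bytes. Below is a minimal regression-test sketch, not part of this diff: the package name `expr` and the Greek identifier are assumptions, while `Lex`, `Token`, and `TokIdent` come from the code above.

```go
package expr // assumed package name, inferred from the "expr:" error prefix

import "testing"

// Sketch only: exercises the rune-based loop with a multi-byte identifier.
func TestLexMultibyteIdentifier(t *testing.T) {
	toks, err := Lex("env.περιβάλλον == 'prod'")
	if err != nil {
		// The old byte-based loop failed on this input with an
		// "unexpected character" error at a continuation byte of 'π'.
		t.Fatalf("Lex: %v", err)
	}
	// Expected stream: env, '.', περιβάλλον, '==', 'prod', EOF.
	if got := len(toks); got != 6 {
		t.Fatalf("got %d tokens, want 6", got)
	}
	if toks[2].Kind != TokIdent || toks[2].Value != "περιβάλλον" {
		t.Fatalf("token 2 = %v %q, want the full identifier as one TokIdent",
			toks[2].Kind, toks[2].Value)
	}
}
```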
@@ -190,8 +199,15 @@ func lexString(src string, basePos int) (Token, int, error) { |
| 190 | 199 | |
| 191 | 200 | func lexIdent(src string, basePos int) (Token, int) { |
| 192 | 201 | i := 0 |
| 193 | | - for i < len(src) && isIdentChar(src[i]) { |
| 194 | | - i++ |
| 202 | + for i < len(src) { |
| 203 | + r, size := utf8.DecodeRuneInString(src[i:]) |
| 204 | + if r == utf8.RuneError { // bad byte or literal U+FFFD; the main loop reports it on the next iteration |
| 205 | + break |
| 206 | + } |
| 207 | + if !isIdentChar(r) { |
| 208 | + break |
| 209 | + } |
| 210 | + i += size |
| 195 | 211 | } |
| 196 | 212 | v := src[:i] |
| 197 | 213 | switch v { |
@@ -203,10 +219,10 @@ func lexIdent(src string, basePos int) (Token, int) { |
| 203 | 219 | return Token{Kind: TokIdent, Value: v, Pos: basePos}, i |
| 204 | 220 | } |
| 205 | 221 | |
| 206 | | -func isIdentStart(c byte) bool { |
| 207 | | - return unicode.IsLetter(rune(c)) || c == '_' |
| 222 | +func isIdentStart(r rune) bool { |
| 223 | + return unicode.IsLetter(r) || r == '_' |
| 208 | 224 | } |
| 209 | 225 | |
| 210 | | -func isIdentChar(c byte) bool { |
| 211 | | - return unicode.IsLetter(rune(c)) || unicode.IsDigit(rune(c)) || c == '_' |
| 226 | +func isIdentChar(r rune) bool { |
| 227 | + return unicode.IsLetter(r) || unicode.IsDigit(r) || r == '_' |
| 212 | 228 | } |
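For context on why the predicates now take a `rune`, this tiny standalone program (standard library only, independent of the package) contrasts classifying a raw byte with classifying the decoded rune:

```go
package main

import (
	"fmt"
	"unicode"
	"unicode/utf8"
)

func main() {
	const s = "πρ" // two two-byte runes

	// Byte-at-a-time, as the old isIdentChar effectively did: the
	// continuation byte of 'π' widens to U+0080, a control character,
	// so scanning stopped in the middle of the character.
	fmt.Println(unicode.IsLetter(rune(s[1]))) // false

	// Rune-at-a-time, as the new loop does: decode first, then classify.
	r, size := utf8.DecodeRuneInString(s)
	fmt.Println(unicode.IsLetter(r), size) // true 2
}
```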