| 1 | !> Lexical analyzer module for FORTBITE |
| 2 | !> |
| 3 | !> Tokenizes mathematical expressions into a stream of tokens for parsing. |
| 4 | !> Handles numbers, identifiers, operators, and special syntax like :: and := |
| 5 | module fortbite_lexer_m |
| 6 | use fortbite_types_m, only: token_t, TOKEN_EOF, TOKEN_NUMBER, TOKEN_IDENTIFIER, & |
| 7 | TOKEN_OPERATOR, TOKEN_LPAREN, TOKEN_RPAREN, & |
| 8 | TOKEN_LBRACKET, TOKEN_RBRACKET, TOKEN_SEMICOLON, & |
| 9 | TOKEN_COMMA, TOKEN_ASSIGN, TOKEN_PRECISION |
| 10 | use iso_fortran_env, only: real64 |
| 11 | implicit none |
| 12 | private |
| 13 | |
| 14 | public :: tokenize, free_tokens, print_tokens |
| 15 | |
! Lexer state: a cursor over the input string being tokenized
type :: lexer_state_t
character(len=:), allocatable :: input   ! expression text (trimmed/adjusted by tokenize)
integer :: position                      ! 1-based index of current_char within input
integer :: length                        ! len_trim(input), cached at initialization
character :: current_char                ! character at position, or char(0) once past the end
end type lexer_state_t
| 23 | |
| 24 | contains |
| 25 | |
!> Tokenize a mathematical expression.
!>
!> Returns an allocatable array of tokens, always terminated by a
!> TOKEN_EOF entry and trimmed to the exact number of tokens produced.
!> Empty (or all-blank) input yields a single EOF token.
function tokenize(expression) result(tokens)
   character(len=*), intent(in) :: expression
   type(token_t), allocatable :: tokens(:)

   type(lexer_state_t) :: lexer
   type(token_t) :: current_token
   integer :: token_count, capacity

   ! Initialize lexer state on the trimmed input
   lexer%input = trim(adjustl(expression))
   lexer%length = len_trim(lexer%input)
   lexer%position = 1

   if (lexer%length == 0) then
      ! Fix: %position was previously left undefined on this path
      allocate(tokens(1))
      tokens(1)%token_type = TOKEN_EOF
      tokens(1)%position = 1
      return
   end if

   lexer%current_char = lexer%input(1:1)

   ! Grow-on-demand token array; heuristic initial capacity
   capacity = max(16, lexer%length / 2)
   allocate(tokens(capacity))
   token_count = 0

   ! Main tokenization loop
   do
      call skip_whitespace(lexer)
      if (lexer%current_char == char(0) .or. lexer%position > lexer%length) exit

      current_token = next_token(lexer)

      ! Double capacity when full
      if (token_count >= capacity) then
         capacity = capacity * 2
         call resize_token_array(tokens, capacity)
      end if

      token_count = token_count + 1
      tokens(token_count) = current_token

      if (current_token%token_type == TOKEN_EOF) exit
   end do

   ! Guarantee EOF termination of the token stream
   if (token_count == 0 .or. tokens(token_count)%token_type /= TOKEN_EOF) then
      if (token_count >= capacity) then
         capacity = capacity + 1
         call resize_token_array(tokens, capacity)
      end if
      token_count = token_count + 1
      tokens(token_count)%token_type = TOKEN_EOF
      tokens(token_count)%position = lexer%position
   end if

   ! Shrink the array to the actual token count
   call resize_token_array(tokens, token_count)
end function tokenize
| 86 | |
!> Get the next token from the input stream.
!>
!> Skips leading whitespace, then dispatches on the current character:
!> digits and '.' go to read_number, letters and '_' to read_identifier,
!> brackets/semicolon/comma become single-character tokens, ':' goes to
!> read_colon_operator (handles ':=' and '::'), '*' to
!> read_power_or_multiply (handles '**'), and the remaining operator
!> characters to read_operator.
!>
!> NOTE(review): token%position is assigned below before the select, but
!> the read_* branches assign the whole token result, replacing that
!> value -- verify each read_* helper sets token%position itself.
function next_token(lexer) result(token)
type(lexer_state_t), intent(inout) :: lexer
type(token_t) :: token

call skip_whitespace(lexer)

! Past the end of input: emit an EOF token
if (lexer%position > lexer%length) then
token%token_type = TOKEN_EOF
token%position = lexer%position
return
end if

token%position = lexer%position

select case (lexer%current_char)
case ('0':'9', '.')
token = read_number(lexer)
case ('a':'z', 'A':'Z', '_')
token = read_identifier(lexer)
case ('(')
token%token_type = TOKEN_LPAREN
token%text = '('
call advance(lexer)
case (')')
token%token_type = TOKEN_RPAREN
token%text = ')'
call advance(lexer)
case ('[')
token%token_type = TOKEN_LBRACKET
token%text = '['
call advance(lexer)
case (']')
token%token_type = TOKEN_RBRACKET
token%text = ']'
call advance(lexer)
case (';')
token%token_type = TOKEN_SEMICOLON
token%text = ';'
call advance(lexer)
case (',')
token%token_type = TOKEN_COMMA
token%text = ','
call advance(lexer)
case (':')
token = read_colon_operator(lexer)
case ('+', '-', '/', '^', '=', '<', '>', '!')
token = read_operator(lexer)
case ('*')
token = read_power_or_multiply(lexer)
case default
! Unknown character - create error token
! NOTE(review): reusing TOKEN_EOF as an error marker makes the
! tokenize loop stop at the first unknown character; a dedicated
! TOKEN_ERROR kind would be clearer -- confirm intended behavior.
token%token_type = TOKEN_EOF ! We'll use this as error for now
token%text = lexer%current_char
call advance(lexer)
end select
end function next_token
| 144 | |
!> Read a numeric token: an integer or real literal with optional
!> fractional part and optional e/E exponent with sign (e.g. "42",
!> "3.14", ".5", "1.2e-3"). A lone '.' not followed by a digit is
!> returned as an operator token instead.
function read_number(lexer) result(token)
   type(lexer_state_t), intent(inout) :: lexer
   type(token_t) :: token

   integer :: start_pos
   logical :: has_dot, has_exp

   start_pos = lexer%position
   ! Fix: record the token's start position (was never set, leaving
   ! token%position undefined for number tokens)
   token%position = start_pos
   has_dot = .false.
   has_exp = .false.

   ! Leading decimal point is only a number if a digit follows (".5")
   if (lexer%current_char == '.') then
      has_dot = .true.
      call advance(lexer)
      if (.not. is_digit(lexer%current_char)) then
         ! Just a dot, not a number
         token%token_type = TOKEN_OPERATOR
         token%text = '.'
         return
      end if
   end if

   ! Integer part
   do while (is_digit(lexer%current_char))
      call advance(lexer)
   end do

   ! Fractional part (only if no dot was consumed yet)
   if (lexer%current_char == '.' .and. .not. has_dot) then
      has_dot = .true.
      call advance(lexer)
      do while (is_digit(lexer%current_char))
         call advance(lexer)
      end do
   end if

   ! Scientific-notation exponent with optional sign
   if ((lexer%current_char == 'e' .or. lexer%current_char == 'E') .and. .not. has_exp) then
      has_exp = .true.
      call advance(lexer)
      if (lexer%current_char == '+' .or. lexer%current_char == '-') then
         call advance(lexer)
      end if
      do while (is_digit(lexer%current_char))
         call advance(lexer)
      end do
   end if

   token%token_type = TOKEN_NUMBER
   token%text = lexer%input(start_pos:lexer%position-1)
end function read_number
| 198 | |
!> Read an identifier token: a run of letters, digits, and underscores
!> starting at the current character.
function read_identifier(lexer) result(token)
   type(lexer_state_t), intent(inout) :: lexer
   type(token_t) :: token

   integer :: start_pos

   start_pos = lexer%position
   ! Fix: record the token's start position (was never set, leaving
   ! token%position undefined for identifier tokens)
   token%position = start_pos

   ! Consume alphanumeric characters and underscores
   do while (is_alphanumeric(lexer%current_char))
      call advance(lexer)
   end do

   token%token_type = TOKEN_IDENTIFIER
   token%text = lexer%input(start_pos:lexer%position-1)
end function read_identifier
| 216 | |
!> Read colon-based operators: ':=' (assignment), '::' (precision),
!> or a bare ':' (returned as a generic operator token).
function read_colon_operator(lexer) result(token)
   type(lexer_state_t), intent(inout) :: lexer
   type(token_t) :: token

   integer :: start_pos

   start_pos = lexer%position
   ! Fix: start_pos was assigned but unused, and token%position was
   ! never set for colon tokens
   token%position = start_pos
   call advance(lexer) ! Skip first ':'

   if (lexer%current_char == '=') then
      ! Assignment operator :=
      call advance(lexer)
      token%token_type = TOKEN_ASSIGN
      token%text = ':='
   else if (lexer%current_char == ':') then
      ! Precision operator ::
      call advance(lexer)
      token%token_type = TOKEN_PRECISION
      token%text = '::'
   else
      ! Just a colon (should be an error in math expressions)
      token%token_type = TOKEN_OPERATOR
      token%text = ':'
   end if
end function read_colon_operator
| 243 | |
!> Read a mathematical or relational operator token.
!>
!> Handles single-character operators (+ - / ^ = < > !) and the
!> two-character forms ==, <=, >=, !=.
!>
!> Note: '*' never reaches this routine -- next_token dispatches it to
!> read_power_or_multiply -- so the former '*'/'**' branch here was
!> unreachable and has been removed.
function read_operator(lexer) result(token)
   type(lexer_state_t), intent(inout) :: lexer
   type(token_t) :: token

   ! Fix: record the token's start position (was never set, leaving
   ! token%position undefined for operator tokens)
   token%position = lexer%position

   select case (lexer%current_char)
   case ('+', '-', '/', '^')
      token%text = lexer%current_char
      call advance(lexer)
   case ('=')
      call advance(lexer)
      if (lexer%current_char == '=') then
         call advance(lexer)
         token%text = '=='
      else
         token%text = '='
      end if
   case ('<')
      call advance(lexer)
      if (lexer%current_char == '=') then
         call advance(lexer)
         token%text = '<='
      else
         token%text = '<'
      end if
   case ('>')
      call advance(lexer)
      if (lexer%current_char == '=') then
         call advance(lexer)
         token%text = '>='
      else
         token%text = '>'
      end if
   case ('!')
      call advance(lexer)
      if (lexer%current_char == '=') then
         call advance(lexer)
         token%text = '!='
      else
         token%text = '!'
      end if
   case default
      token%text = lexer%current_char
      call advance(lexer)
   end select

   token%token_type = TOKEN_OPERATOR
end function read_operator
| 305 | |
!> Distinguish '*' (multiply) from '**' (power); both are returned as
!> TOKEN_OPERATOR with text '*' or '**'.
function read_power_or_multiply(lexer) result(token)
   type(lexer_state_t), intent(inout) :: lexer
   type(token_t) :: token

   ! Fix: record the token's start position before consuming (was never
   ! set, leaving token%position undefined for '*'/'**' tokens)
   token%position = lexer%position
   call advance(lexer) ! Skip first '*'

   if (lexer%current_char == '*') then
      ! Power operator **
      call advance(lexer)
      token%token_type = TOKEN_OPERATOR
      token%text = '**'
   else
      ! Multiply operator *
      token%token_type = TOKEN_OPERATOR
      token%text = '*'
   end if
end function read_power_or_multiply
| 324 | |
!> Advance the cursor past any run of whitespace characters, stopping
!> at the first non-whitespace character or the end of input.
subroutine skip_whitespace(lexer)
   type(lexer_state_t), intent(inout) :: lexer

   do
      if (lexer%position > lexer%length) exit
      if (.not. is_whitespace(lexer%current_char)) exit
      call advance(lexer)
   end do
end subroutine skip_whitespace
| 333 | |
!> Move the cursor one character forward, updating current_char.
!> Past the end of input, current_char becomes char(0) as an EOF marker.
subroutine advance(lexer)
   type(lexer_state_t), intent(inout) :: lexer

   lexer%position = lexer%position + 1
   if (lexer%position > lexer%length) then
      lexer%current_char = char(0) ! EOF marker
   else
      lexer%current_char = lexer%input(lexer%position:lexer%position)
   end if
end subroutine advance
| 345 | |
!> True when c is a decimal digit ('0'..'9').
logical function is_digit(c)
   character, intent(in) :: c
   is_digit = index('0123456789', c) /= 0
end function is_digit
| 351 | |
!> True when c is a letter (a-z, A-Z) or an underscore.
logical function is_alpha(c)
   character, intent(in) :: c
   select case (c)
   case ('a':'z', 'A':'Z', '_')
      is_alpha = .true.
   case default
      is_alpha = .false.
   end select
end function is_alpha
| 357 | |
!> True when c is a letter, digit, or underscore (identifier character).
logical function is_alphanumeric(c)
   character, intent(in) :: c
   is_alphanumeric = (c >= '0' .and. c <= '9') .or. &
                     (c >= 'a' .and. c <= 'z') .or. &
                     (c >= 'A' .and. c <= 'Z') .or. c == '_'
end function is_alphanumeric
| 363 | |
!> True when c is a space, tab, line feed, or carriage return.
logical function is_whitespace(c)
   character, intent(in) :: c
   select case (iachar(c))
   case (32, 9, 10, 13) ! space, tab, LF, CR
      is_whitespace = .true.
   case default
      is_whitespace = .false.
   end select
end function is_whitespace
| 369 | |
!> Resize a token array to new_size, preserving the first
!> min(old_size, new_size) entries.
!>
!> Improvement: uses move_alloc (F2003) so the data is copied once into
!> the new buffer and then transferred without a second allocation and
!> copy-back, unlike the previous deallocate/reallocate/copy sequence.
subroutine resize_token_array(tokens, new_size)
   type(token_t), allocatable, intent(inout) :: tokens(:)
   integer, intent(in) :: new_size

   type(token_t), allocatable :: temp(:)
   integer :: keep

   ! Unallocated input: just allocate at the requested size
   if (.not. allocated(tokens)) then
      allocate(tokens(new_size))
      return
   end if

   if (new_size == size(tokens)) return

   keep = min(size(tokens), new_size)
   allocate(temp(new_size))
   temp(1:keep) = tokens(1:keep)
   ! Transfer temp's allocation into tokens without copying
   call move_alloc(temp, tokens)
end subroutine resize_token_array
| 400 | |
!> Release the token array's memory, if it is allocated.
subroutine free_tokens(tokens)
   type(token_t), allocatable, intent(inout) :: tokens(:)

   if (.not. allocated(tokens)) return
   deallocate(tokens)
end subroutine free_tokens
| 407 | |
!> Print tokens for debugging, one per line, stopping at the first EOF
!> token in the array.
!>
!> Fix: get_token_type_name returns character(len=12), so the name was
!> previously printed with trailing blanks inside the output; trim it.
subroutine print_tokens(tokens)
   type(token_t), intent(in) :: tokens(:)
   integer :: i

   write(*, '(A)') 'Tokens:'
   do i = 1, size(tokens)
      if (tokens(i)%token_type == TOKEN_EOF) then
         write(*, '(A,I0,A)') ' [', i, '] EOF'
         exit
      end if
      write(*, '(A,I0,A,A,A,A,A,I0)') ' [', i, '] ', &
         trim(get_token_type_name(tokens(i)%token_type)), ' "', &
         tokens(i)%text, '" @', tokens(i)%position
   end do
end subroutine print_tokens
| 424 | |
!> Map a token-type code to its human-readable name.
!> Unrecognized codes yield 'UNKNOWN'.
function get_token_type_name(token_type) result(name)
   integer, intent(in) :: token_type
   character(len=12) :: name

   ! Default covers any code not listed below
   name = 'UNKNOWN'

   select case (token_type)
   case (TOKEN_EOF)
      name = 'EOF'
   case (TOKEN_NUMBER)
      name = 'NUMBER'
   case (TOKEN_IDENTIFIER)
      name = 'IDENTIFIER'
   case (TOKEN_OPERATOR)
      name = 'OPERATOR'
   case (TOKEN_LPAREN)
      name = 'LPAREN'
   case (TOKEN_RPAREN)
      name = 'RPAREN'
   case (TOKEN_LBRACKET)
      name = 'LBRACKET'
   case (TOKEN_RBRACKET)
      name = 'RBRACKET'
   case (TOKEN_SEMICOLON)
      name = 'SEMICOLON'
   case (TOKEN_COMMA)
      name = 'COMMA'
   case (TOKEN_ASSIGN)
      name = 'ASSIGN'
   case (TOKEN_PRECISION)
      name = 'PRECISION'
   end select
end function get_token_type_name
| 459 | |
| 460 | end module fortbite_lexer_m |