! fortbite_lexer.f90 — lexical analyzer for FORTBITE (14883 bytes)
1 !> Lexical analyzer module for FORTBITE
2 !>
3 !> Tokenizes mathematical expressions into a stream of tokens for parsing.
4 !> Handles numbers, identifiers, operators, and special syntax like :: and :=
5 module fortbite_lexer_m
6 use fortbite_types_m, only: token_t, TOKEN_EOF, TOKEN_NUMBER, TOKEN_IDENTIFIER, &
7 TOKEN_OPERATOR, TOKEN_LPAREN, TOKEN_RPAREN, &
8 TOKEN_LBRACKET, TOKEN_RBRACKET, TOKEN_SEMICOLON, &
9 TOKEN_COMMA, TOKEN_ASSIGN, TOKEN_PRECISION
10 use iso_fortran_env, only: real64
11 implicit none
12 private
13
14 public :: tokenize, free_tokens, print_tokens
15
! Lexer state: the input text plus the current scan cursor.
type :: lexer_state_t
character(len=:), allocatable :: input   ! trimmed expression being scanned
integer :: position                      ! 1-based index of current_char within input
integer :: length                        ! cached len_trim(input)
character :: current_char                ! character at position, or char(0) past the end
end type lexer_state_t
23
24 contains
25
!> Tokenize a mathematical expression into an EOF-terminated token array.
!>
!> @param expression  raw expression text; leading/trailing blanks are ignored
!> @return tokens     allocated array whose last entry is always TOKEN_EOF
function tokenize(expression) result(tokens)
    character(len=*), intent(in) :: expression
    type(token_t), allocatable :: tokens(:)

    type(lexer_state_t) :: lexer
    type(token_t) :: current_token
    integer :: token_count, capacity

    ! Initialize lexer state on a trimmed copy of the input
    lexer%input = trim(adjustl(expression))
    lexer%length = len_trim(lexer%input)
    lexer%position = 1

    if (lexer%length == 0) then
        ! Empty input: a single EOF token
        allocate(tokens(1))
        tokens(1)%token_type = TOKEN_EOF
        tokens(1)%position = 1   ! fix: position was previously left unset
        return
    end if

    lexer%current_char = lexer%input(1:1)

    ! Token array grows geometrically as needed
    capacity = max(16, lexer%length / 2)
    allocate(tokens(capacity))
    token_count = 0

    ! Main tokenization loop
    do
        call skip_whitespace(lexer)
        if (lexer%current_char == char(0) .or. lexer%position > lexer%length) exit

        current_token = next_token(lexer)

        ! Double the buffer when full
        if (token_count >= capacity) then
            capacity = capacity * 2
            call resize_token_array(tokens, capacity)
        end if

        token_count = token_count + 1
        tokens(token_count) = current_token

        ! next_token emits TOKEN_EOF for unrecognized characters; stop there
        if (current_token%token_type == TOKEN_EOF) exit
    end do

    ! Guarantee the stream is terminated with an EOF token
    if (token_count == 0 .or. tokens(token_count)%token_type /= TOKEN_EOF) then
        if (token_count >= capacity) then
            capacity = capacity + 1
            call resize_token_array(tokens, capacity)
        end if
        token_count = token_count + 1
        tokens(token_count)%token_type = TOKEN_EOF
        tokens(token_count)%position = lexer%position
    end if

    ! Shrink to the exact number of tokens produced
    call resize_token_array(tokens, token_count)
end function tokenize
86
!> Get the next token from the input stream.
!> The token's position field always points at its first character.
function next_token(lexer) result(token)
    type(lexer_state_t), intent(inout) :: lexer
    type(token_t) :: token

    integer :: start_pos

    call skip_whitespace(lexer)

    if (lexer%position > lexer%length) then
        token%token_type = TOKEN_EOF
        token%position = lexer%position
        return
    end if

    start_pos = lexer%position

    select case (lexer%current_char)
    case ('0':'9', '.')
        token = read_number(lexer)
    case ('a':'z', 'A':'Z', '_')
        token = read_identifier(lexer)
    case ('(')
        token%token_type = TOKEN_LPAREN
        token%text = '('
        call advance(lexer)
    case (')')
        token%token_type = TOKEN_RPAREN
        token%text = ')'
        call advance(lexer)
    case ('[')
        token%token_type = TOKEN_LBRACKET
        token%text = '['
        call advance(lexer)
    case (']')
        token%token_type = TOKEN_RBRACKET
        token%text = ']'
        call advance(lexer)
    case (';')
        token%token_type = TOKEN_SEMICOLON
        token%text = ';'
        call advance(lexer)
    case (',')
        token%token_type = TOKEN_COMMA
        token%text = ','
        call advance(lexer)
    case (':')
        token = read_colon_operator(lexer)
    case ('+', '-', '/', '^', '=', '<', '>', '!')
        token = read_operator(lexer)
    case ('*')
        token = read_power_or_multiply(lexer)
    case default
        ! Unknown character - create error token
        token%token_type = TOKEN_EOF ! We'll use this as error for now
        token%text = lexer%current_char
        call advance(lexer)
    end select

    ! Fix: the read_* helpers return a fresh token_t, so assigning the
    ! position BEFORE the dispatch (as before) was overwritten by the
    ! whole-structure assignment.  Set it after, for every token.
    token%position = start_pos
end function next_token
144
!> Read a numeric token: integer, real (leading/trailing dot allowed),
!> or scientific notation with e/E and optional sign.
!> A bare '.' with no adjacent digits is returned as an operator token.
function read_number(lexer) result(token)
    type(lexer_state_t), intent(inout) :: lexer
    type(token_t) :: token

    integer :: start_pos
    logical :: has_dot, has_exp

    start_pos = lexer%position
    token%position = start_pos   ! fix: position was never recorded before
    has_dot = .false.
    has_exp = .false.

    ! Handle a leading decimal point (".5"-style literals)
    if (lexer%current_char == '.') then
        has_dot = .true.
        call advance(lexer)
        if (.not. is_digit(lexer%current_char)) then
            ! Just a dot, not a number
            token%token_type = TOKEN_OPERATOR
            token%text = '.'
            return
        end if
    end if

    ! Integer part
    do while (is_digit(lexer%current_char))
        call advance(lexer)
    end do

    ! Fractional part
    if (lexer%current_char == '.' .and. .not. has_dot) then
        has_dot = .true.
        call advance(lexer)
        do while (is_digit(lexer%current_char))
            call advance(lexer)
        end do
    end if

    ! Exponent part: e/E with optional sign
    if ((lexer%current_char == 'e' .or. lexer%current_char == 'E') .and. .not. has_exp) then
        has_exp = .true.
        call advance(lexer)
        if (lexer%current_char == '+' .or. lexer%current_char == '-') then
            call advance(lexer)
        end if
        do while (is_digit(lexer%current_char))
            call advance(lexer)
        end do
    end if

    token%token_type = TOKEN_NUMBER
    token%text = lexer%input(start_pos:lexer%position-1)
end function read_number
198
!> Read an identifier: letters, digits, and underscores
!> (the dispatcher guarantees the first character is not a digit).
function read_identifier(lexer) result(token)
    type(lexer_state_t), intent(inout) :: lexer
    type(token_t) :: token

    integer :: start_pos

    start_pos = lexer%position
    token%position = start_pos   ! fix: position was never recorded before

    ! Consume alphanumeric characters and underscores
    do while (is_alphanumeric(lexer%current_char))
        call advance(lexer)
    end do

    token%token_type = TOKEN_IDENTIFIER
    token%text = lexer%input(start_pos:lexer%position-1)
end function read_identifier
216
!> Read colon-based operators: ':=' (assignment), '::' (precision),
!> or a lone ':' (returned as a generic operator for the parser to judge).
function read_colon_operator(lexer) result(token)
    type(lexer_state_t), intent(inout) :: lexer
    type(token_t) :: token

    ! fix: start_pos used to be computed but never stored on the token
    token%position = lexer%position
    call advance(lexer) ! consume the first ':'

    if (lexer%current_char == '=') then
        ! Assignment operator :=
        call advance(lexer)
        token%token_type = TOKEN_ASSIGN
        token%text = ':='
    else if (lexer%current_char == ':') then
        ! Precision operator ::
        call advance(lexer)
        token%token_type = TOKEN_PRECISION
        token%text = '::'
    else
        ! Just a colon (should be an error in math expressions)
        token%token_type = TOKEN_OPERATOR
        token%text = ':'
    end if
end function read_colon_operator
243
!> Read a mathematical or comparison operator, including the
!> two-character forms **, ==, <=, >=, and !=.
function read_operator(lexer) result(token)
    type(lexer_state_t), intent(inout) :: lexer
    type(token_t) :: token

    integer :: start_pos

    start_pos = lexer%position
    token%position = start_pos   ! fix: position was never recorded before

    select case (lexer%current_char)
    case ('+', '-', '/', '^')
        token%text = lexer%current_char
        call advance(lexer)
    case ('*')
        ! NOTE: next_token currently routes '*' to read_power_or_multiply,
        ! so this branch only runs if a caller invokes read_operator directly.
        call advance(lexer)
        if (lexer%current_char == '*') then
            ! Power operator **
            call advance(lexer)
            token%text = '**'
        else
            token%text = '*'
        end if
    case ('=')
        call advance(lexer)
        if (lexer%current_char == '=') then
            call advance(lexer)
            token%text = '=='
        else
            token%text = '='
        end if
    case ('<')
        call advance(lexer)
        if (lexer%current_char == '=') then
            call advance(lexer)
            token%text = '<='
        else
            token%text = '<'
        end if
    case ('>')
        call advance(lexer)
        if (lexer%current_char == '=') then
            call advance(lexer)
            token%text = '>='
        else
            token%text = '>'
        end if
    case ('!')
        call advance(lexer)
        if (lexer%current_char == '=') then
            call advance(lexer)
            token%text = '!='
        else
            token%text = '!'
        end if
    case default
        token%text = lexer%current_char
        call advance(lexer)
    end select

    token%token_type = TOKEN_OPERATOR
end function read_operator
305
!> Distinguish '*' (multiply) from '**' (power).
function read_power_or_multiply(lexer) result(token)
    type(lexer_state_t), intent(inout) :: lexer
    type(token_t) :: token

    token%position = lexer%position   ! fix: position was never recorded before
    call advance(lexer) ! consume the first '*'

    token%token_type = TOKEN_OPERATOR
    if (lexer%current_char == '*') then
        ! Power operator **
        call advance(lexer)
        token%text = '**'
    else
        ! Multiply operator *
        token%text = '*'
    end if
end function read_power_or_multiply
324
!> Advance the cursor past any run of whitespace characters.
subroutine skip_whitespace(lexer)
    type(lexer_state_t), intent(inout) :: lexer

    do
        if (lexer%position > lexer%length) exit
        if (.not. is_whitespace(lexer%current_char)) exit
        call advance(lexer)
    end do
end subroutine skip_whitespace
333
!> Move the cursor one character forward; past the end of input,
!> current_char becomes char(0) as an EOF sentinel.
subroutine advance(lexer)
    type(lexer_state_t), intent(inout) :: lexer

    lexer%position = lexer%position + 1
    if (lexer%position > lexer%length) then
        lexer%current_char = char(0) ! EOF marker
    else
        lexer%current_char = lexer%input(lexer%position:lexer%position)
    end if
end subroutine advance
345
!> True when c is one of the ASCII digits '0'..'9'.
logical function is_digit(c)
    character, intent(in) :: c
    is_digit = index('0123456789', c) > 0
end function is_digit
351
!> True for letters a-z / A-Z and underscore (identifier-start characters).
logical function is_alpha(c)
    character, intent(in) :: c
    select case (c)
    case ('a':'z', 'A':'Z', '_')
        is_alpha = .true.
    case default
        is_alpha = .false.
    end select
end function is_alpha
357
!> True when c may appear inside an identifier: letter, digit, or '_'.
logical function is_alphanumeric(c)
    character, intent(in) :: c
    is_alphanumeric = is_digit(c) .or. is_alpha(c)
end function is_alphanumeric
363
!> True for blank, horizontal tab, line feed, or carriage return.
logical function is_whitespace(c)
    character, intent(in) :: c
    select case (ichar(c))
    case (32, 9, 10, 13)
        is_whitespace = .true.
    case default
        is_whitespace = .false.
    end select
end function is_whitespace
369
!> Resize an allocatable token array to new_size, preserving the first
!> min(old_size, new_size) elements.
!> Uses move_alloc so the surviving data is copied exactly once, instead
!> of the previous copy-out / copy-back double pass.
subroutine resize_token_array(tokens, new_size)
    type(token_t), allocatable, intent(inout) :: tokens(:)
    integer, intent(in) :: new_size

    type(token_t), allocatable :: temp(:)
    integer :: keep

    ! Unallocated input: just allocate to the requested size
    if (.not. allocated(tokens)) then
        allocate(tokens(new_size))
        return
    end if

    ! Already the right size: nothing to do
    if (new_size == size(tokens)) return

    keep = min(size(tokens), new_size)
    allocate(temp(new_size))
    temp(1:keep) = tokens(1:keep)
    call move_alloc(temp, tokens)   ! frees old tokens, no second copy
end subroutine resize_token_array
400
!> Release the token array's storage, if any is allocated.
subroutine free_tokens(tokens)
    type(token_t), allocatable, intent(inout) :: tokens(:)

    if (allocated(tokens)) then
        deallocate(tokens)
    end if
end subroutine free_tokens
407
!> Print the token stream to stdout for debugging.
!> Stops at the first EOF token (arrays from tokenize are EOF-terminated).
subroutine print_tokens(tokens)
    type(token_t), intent(in) :: tokens(:)
    integer :: i

    write(*, '(A)') 'Tokens:'
    do i = 1, size(tokens)
        if (tokens(i)%token_type == TOKEN_EOF) then
            write(*, '(A,I0,A)') ' [', i, '] EOF'
            exit
        end if
        ! fix: trim the fixed-width (len=12) type name so the opening
        ! quote is not pushed away by trailing blanks
        write(*, '(A,I0,A,A,A,A,A,I0)') ' [', i, '] ', &
            trim(get_token_type_name(tokens(i)%token_type)), ' "', &
            tokens(i)%text, '" @', tokens(i)%position
    end do
end subroutine print_tokens
424
!> Map a token-type code to a human-readable label (blank-padded to 12).
function get_token_type_name(token_type) result(name)
    integer, intent(in) :: token_type
    character(len=12) :: name

    select case (token_type)
    case (TOKEN_EOF);        name = 'EOF'
    case (TOKEN_NUMBER);     name = 'NUMBER'
    case (TOKEN_IDENTIFIER); name = 'IDENTIFIER'
    case (TOKEN_OPERATOR);   name = 'OPERATOR'
    case (TOKEN_LPAREN);     name = 'LPAREN'
    case (TOKEN_RPAREN);     name = 'RPAREN'
    case (TOKEN_LBRACKET);   name = 'LBRACKET'
    case (TOKEN_RBRACKET);   name = 'RBRACKET'
    case (TOKEN_SEMICOLON);  name = 'SEMICOLON'
    case (TOKEN_COMMA);      name = 'COMMA'
    case (TOKEN_ASSIGN);     name = 'ASSIGN'
    case (TOKEN_PRECISION);  name = 'PRECISION'
    case default;            name = 'UNKNOWN'
    end select
end function get_token_type_name
459
460 end module fortbite_lexer_m