! lexer.f90 — shell input lexer (grammar-aware parser, phase 1)
1 ! =====================================
2 ! Lexer Module - Phase 1 of Grammar-Aware Parser
3 ! =====================================
4 ! Tokenizes shell input into meaningful units
5 ! Part of the parser rewrite project
6 !
7 ! Status: PHASE 1 - Full implementation
8 ! Author: Parser Rewrite Team
9 ! Created: 2025-11-05
10
11 module lexer
12 use iso_fortran_env
13 use shell_types
14 use shell_types, only: QUOTE_NONE, QUOTE_SINGLE, QUOTE_DOUBLE
15 implicit none
16 private
17
18 ! Public interface
19 public :: tokenize
20 public :: next_token
21 public :: peek_token
22 public :: is_keyword
23 public :: is_operator
24
25 ! Lexer state enumeration
26 integer, parameter :: LEX_NORMAL = 1
27 integer, parameter :: LEX_IN_SINGLE_QUOTE = 2
28 integer, parameter :: LEX_IN_DOUBLE_QUOTE = 3
29 integer, parameter :: LEX_IN_WORD = 4
30 integer, parameter :: LEX_IN_OPERATOR = 5
31 integer, parameter :: LEX_IN_DOLLAR_SINGLE_QUOTE = 6
32
33 ! Context tracking for [[ ]] test expressions
34 ! Inside [[ ]], && || < > are test operators, not shell operators
35 logical :: in_double_bracket_context = .false.
36
37 contains
38
39 ! =====================================
40 ! Character Classification Helpers
41 ! =====================================
42
43 pure function is_whitespace(ch) result(is_ws)
44 character(len=1), intent(in) :: ch
45 logical :: is_ws
46 is_ws = (ch == ' ' .or. ch == char(9) .or. ch == char(13)) ! space, tab, CR
47 end function is_whitespace
48
49 pure function is_operator_start(ch) result(is_op)
50 character(len=1), intent(in) :: ch
51 logical :: is_op
52 is_op = (ch == '|' .or. ch == '&' .or. ch == ';' .or. &
53 ch == '<' .or. ch == '>' .or. ch == '(' .or. ch == ')')
54 end function is_operator_start
55
56 pure function is_word_char(ch) result(is_wc)
57 character(len=1), intent(in) :: ch
58 logical :: is_wc
59 ! Word characters: anything that's not whitespace, operator, or special
60 is_wc = .not. (is_whitespace(ch) .or. is_operator_start(ch) .or. &
61 ch == char(10) .or. ch == '#' .or. ch == '"' .or. &
62 ch == "'" .or. ch == '\')
63 end function is_word_char
64
65 ! =====================================
66 ! Operator Recognition
67 ! =====================================
68
69 function is_operator(str) result(is_op)
70 character(len=*), intent(in) :: str
71 logical :: is_op
72
73 select case(trim(str))
74 ! Logical operators
75 case('&&', '||')
76 is_op = .true.
77 ! Pipe and background
78 case('|', '&')
79 is_op = .true.
80 ! Separators
81 case(';', ';;')
82 is_op = .true.
83 ! Redirections
84 case('<', '>', '>>', '<>', '>&', '<&', '>|', '<<', '<<-', '<<<')
85 is_op = .true.
86 ! Grouping
87 case('(', ')', '{', '}')
88 is_op = .true.
89 case default
90 is_op = .false.
91 end select
92 end function is_operator
93
94 ! =====================================
95 ! is_keyword - Check if word is a shell keyword
96 ! =====================================
97 function is_keyword(word) result(is_kw)
98 character(len=*), intent(in) :: word
99 logical :: is_kw
100
101 select case(trim(word))
102 ! Control flow keywords
103 case('if', 'then', 'else', 'elif', 'fi')
104 is_kw = .true.
105 case('for', 'in', 'do', 'done')
106 is_kw = .true.
107 case('while', 'until')
108 is_kw = .true.
109 case('case', 'esac')
110 is_kw = .true.
111 ! Other keywords
112 case('function', 'select', 'time', 'coproc')
113 is_kw = .true.
114 case('{', '}')
115 is_kw = .true.
116 case('!') ! Negation operator (context-dependent)
117 is_kw = .true.
118 case default
119 is_kw = .false.
120 end select
121 end function is_keyword
122
123 ! =====================================
124 ! tokenize - Main entry point for lexical analysis
125 ! =====================================
126 subroutine tokenize(input, tokens, num_tokens)
127 character(len=*), intent(in) :: input
128 type(token_t), intent(out) :: tokens(:)
129 integer, intent(out) :: num_tokens
130
131 integer :: pos, input_len, state, token_start
132 character(len=1) :: ch, next_ch
133 character(len=MAX_TOKEN_LEN) :: current_token
134 integer :: token_len, paren_depth
135 logical :: in_escape, continuing_word, token_has_quoted_part
136
137 num_tokens = 0
138 pos = 1
139 input_len = len_trim(input)
140 state = LEX_NORMAL
141 token_start = 1
142 current_token = ''
143 token_len = 0
144 in_escape = .false.
145 paren_depth = 0
146 continuing_word = .false.
147 token_has_quoted_part = .false.
148 in_double_bracket_context = .false.
149
150 do while (pos <= input_len .and. num_tokens < size(tokens))
151 ch = input(pos:pos)
152
153 ! Get next character for lookahead (if available)
154 if (pos < input_len) then
155 next_ch = input(pos+1:pos+1)
156 else
157 next_ch = ' '
158 end if
159
160 select case(state)
161
162 ! ============ NORMAL STATE ============
163 case(LEX_NORMAL)
164
165 ! Skip whitespace
166 if (is_whitespace(ch)) then
167 pos = pos + 1
168 cycle
169 end if
170
171 ! Newline - significant token
172 if (ch == char(10)) then
173 call add_token(tokens, num_tokens, TOKEN_NEWLINE, char(10), pos, pos, .false.)
174 pos = pos + 1
175 cycle
176 end if
177
178 ! Comments: # to end of line
179 if (ch == '#') then
180 ! Skip until newline or end of input
181 do while (pos <= input_len .and. input(pos:pos) /= char(10))
182 pos = pos + 1
183 end do
184 cycle
185 end if
186
187 ! Single quote: literal string
188 if (ch == "'") then
189 state = LEX_IN_SINGLE_QUOTE
190 ! Only reset token if we're NOT continuing a word
191 if (.not. continuing_word) then
192 token_start = pos
193 token_len = 0
194 current_token = ''
195 end if
196 pos = pos + 1
197 cycle
198 end if
199
200 ! Double quote: expandable string
201 if (ch == '"') then
202 state = LEX_IN_DOUBLE_QUOTE
203 ! Only reset token if we're NOT continuing a word
204 if (.not. continuing_word) then
205 token_start = pos
206 token_len = 0
207 current_token = ''
208 end if
209 pos = pos + 1
210 cycle
211 end if
212
213 ! Backslash escape
214 if (ch == '\') then
215 if (pos < input_len) then
216 ! Start a word token with the escaped character
217 state = LEX_IN_WORD
218 token_start = pos
219 in_escape = .true. ! Mark this token as escaped
220 ! For characters that would trigger expansion ($, `, etc), preserve backslash
221 ! so the expansion phase knows not to expand them
222 if (next_ch == '$' .or. next_ch == '`') then
223 token_len = 2
224 current_token(1:2) = '\' // next_ch
225 else
226 token_len = 1
227 current_token = next_ch
228 end if
229 pos = pos + 2 ! Skip backslash and next char
230 cycle
231 end if
232 end if
233
234 ! Multi-character operators
235 ! Inside [[ ]], treat & | < > ( ) as word characters (test operators)
236 if (in_double_bracket_context .and. &
237 (ch == '&' .or. ch == '|' .or. ch == '<' .or. ch == '>' .or. &
238 ch == '(' .or. ch == ')')) then
239 state = LEX_IN_WORD
240 token_start = pos
241 token_len = 1
242 current_token = ch
243 pos = pos + 1
244 cycle
245 end if
246 if (is_operator_start(ch)) then
247 state = LEX_IN_OPERATOR
248 token_start = pos
249 token_len = 1
250 current_token = ch
251 pos = pos + 1
252 cycle
253 end if
254
255 ! Check for $'...' ANSI-C quoting
256 if (ch == '$' .and. pos < input_len .and. next_ch == "'") then
257 state = LEX_IN_DOLLAR_SINGLE_QUOTE
258 token_start = pos
259 token_len = 0
260 current_token = ''
261 continuing_word = .false.
262 token_has_quoted_part = .true.
263 pos = pos + 2 ! Skip $'
264 cycle
265 end if
266
267 ! Check for $( or $(( - these should be kept in word tokens for expansion
268 if (ch == '$' .and. pos < input_len .and. next_ch == '(') then
269 ! This is command substitution or arithmetic - include in word
270 state = LEX_IN_WORD
271 token_start = pos
272 token_len = 2
273 current_token = '$('
274 paren_depth = 1 ! Track that we're inside $(
275 pos = pos + 2
276 cycle
277 end if
278
279 ! Check for ${ - parameter expansion should be kept in word tokens
280 if (ch == '$' .and. pos < input_len .and. next_ch == '{') then
281 ! This is parameter expansion - include in word
282 state = LEX_IN_WORD
283 token_start = pos
284 token_len = 2
285 current_token = '${'
286 paren_depth = 1 ! Track that we're inside ${
287 pos = pos + 2
288 cycle
289 end if
290
291 ! Check for $' - ANSI-C quoting
292 if (ch == '$' .and. pos < input_len .and. next_ch == "'") then
293 state = LEX_IN_WORD
294 token_start = pos
295 token_len = 0
296 current_token = ''
297 token_has_quoted_part = .true.
298 pos = pos + 2 ! Skip $'
299 call process_ansi_c_quote(input, pos, input_len, current_token, token_len)
300 cycle
301 end if
302
303 ! Assignment detection: VAR=value
304 ! (This is complex - we'll detect it as WORD and let parser handle it)
305
306 ! Start of word
307 state = LEX_IN_WORD
308 token_start = pos
309 token_len = 1
310 current_token = ch
311 pos = pos + 1
312
313 ! ============ SINGLE QUOTE STATE ============
314 case(LEX_IN_SINGLE_QUOTE)
315 if (ch == "'") then
316 ! End of single-quoted string
317 ! Add sentinel char(3) to mark end of single-quoted literal
318 if (token_len < MAX_TOKEN_LEN) then
319 token_len = token_len + 1
320 current_token(token_len:token_len) = char(3)
321 end if
322 pos = pos + 1 ! Move past closing quote
323 ! Check if next character continues the word (adjacent quote, word char, or escape)
324 if (pos <= input_len) then
325 next_ch = input(pos:pos)
326 if (next_ch == "'" .or. next_ch == '"') then
327 ! Adjacent quote follows - continue building this token
328 state = LEX_IN_WORD
329 continuing_word = .false.
330 cycle
331 else if (next_ch == '\') then
332 ! Backslash escape follows - continue building this token
333 state = LEX_IN_WORD
334 continuing_word = .false.
335 cycle
336 else if (is_word_char(next_ch)) then
337 ! Word character follows - continue building this token
338 state = LEX_IN_WORD
339 continuing_word = .false.
340 cycle
341 end if
342 end if
343 ! No adjacent quote or word char - finalize token
344 if (continuing_word) then
345 ! We're building a multi-part word - go back to LEX_IN_WORD
346 state = LEX_IN_WORD
347 continuing_word = .false.
348 else
349 ! Standalone quoted string - emit token
350 call add_token(tokens, num_tokens, TOKEN_WORD, current_token(1:token_len), &
351 token_start, pos-1, .true., quote_type=QUOTE_SINGLE)
352 state = LEX_NORMAL
353 end if
354 else
355 ! Add character to token (everything is literal)
356 if (token_len < MAX_TOKEN_LEN) then
357 token_len = token_len + 1
358 current_token(token_len:token_len) = ch
359 end if
360 pos = pos + 1
361 end if
362
363 ! ============ DOLLAR SINGLE QUOTE STATE ($'...') ============
364 case(LEX_IN_DOLLAR_SINGLE_QUOTE)
365 if (ch == "'") then
366 ! End of $'...' string — add sentinels to mark as quoted
367 if (token_len < MAX_TOKEN_LEN) then
368 token_len = token_len + 1
369 current_token(token_len:token_len) = char(3) ! end sentinel
370 end if
371 pos = pos + 1
372 ! Check if next character continues the word
373 if (pos <= input_len) then
374 next_ch = input(pos:pos)
375 if (next_ch == "'" .or. next_ch == '"' .or. next_ch == '\' .or. &
376 is_word_char(next_ch)) then
377 state = LEX_IN_WORD
378 continuing_word = .false.
379 cycle
380 end if
381 end if
382 if (continuing_word) then
383 state = LEX_IN_WORD
384 continuing_word = .false.
385 else
386 call add_token(tokens, num_tokens, TOKEN_WORD, current_token(1:token_len), &
387 token_start, pos-1, .true., quote_type=QUOTE_SINGLE)
388 state = LEX_NORMAL
389 end if
390 else if (ch == '\' .and. pos < input_len) then
391 ! Escape sequences in $'...'
392 next_ch = input(pos+1:pos+1)
393 if (token_len < MAX_TOKEN_LEN) then
394 select case(next_ch)
395 case('a')
396 token_len = token_len + 1
397 current_token(token_len:token_len) = char(7) ! bell
398 case('b')
399 token_len = token_len + 1
400 current_token(token_len:token_len) = char(8) ! backspace
401 case('e', 'E')
402 token_len = token_len + 1
403 current_token(token_len:token_len) = char(27) ! escape
404 case('f')
405 token_len = token_len + 1
406 current_token(token_len:token_len) = char(12) ! form feed
407 case('n')
408 token_len = token_len + 1
409 current_token(token_len:token_len) = char(10) ! newline
410 case('r')
411 token_len = token_len + 1
412 current_token(token_len:token_len) = char(13) ! carriage return
413 case('t')
414 token_len = token_len + 1
415 current_token(token_len:token_len) = char(9) ! tab
416 case('v')
417 token_len = token_len + 1
418 current_token(token_len:token_len) = char(11) ! vertical tab
419 case('\')
420 token_len = token_len + 1
421 current_token(token_len:token_len) = '\'
422 case("'")
423 token_len = token_len + 1
424 current_token(token_len:token_len) = "'"
425 case('"')
426 token_len = token_len + 1
427 current_token(token_len:token_len) = '"'
428 case('x')
429 ! Hex escape: \xHH (up to 2 hex digits)
430 block
431 integer :: hval, hdigits
432 character :: hch
433 hval = 0; hdigits = 0
434 pos = pos + 2 ! skip \x
435 do while (pos <= input_len .and. hdigits < 2)
436 hch = input(pos:pos)
437 if (hch >= '0' .and. hch <= '9') then
438 hval = hval * 16 + (ichar(hch) - ichar('0'))
439 else if (hch >= 'a' .and. hch <= 'f') then
440 hval = hval * 16 + (ichar(hch) - ichar('a') + 10)
441 else if (hch >= 'A' .and. hch <= 'F') then
442 hval = hval * 16 + (ichar(hch) - ichar('A') + 10)
443 else
444 exit
445 end if
446 pos = pos + 1
447 hdigits = hdigits + 1
448 end do
449 if (hdigits > 0 .and. hval <= 255) then
450 token_len = token_len + 1
451 current_token(token_len:token_len) = char(hval)
452 end if
453 cycle ! pos already advanced past hex digits
454 end block
455 case('0', '1', '2', '3', '4', '5', '6', '7')
456 ! Octal escape: \nnn (up to 3 octal digits)
457 block
458 integer :: oval, odigits
459 character :: och
460 oval = 0; odigits = 0
461 pos = pos + 1 ! skip backslash only
462 do while (pos <= input_len .and. odigits < 3)
463 och = input(pos:pos)
464 if (och >= '0' .and. och <= '7') then
465 oval = oval * 8 + (ichar(och) - ichar('0'))
466 else
467 exit
468 end if
469 pos = pos + 1
470 odigits = odigits + 1
471 end do
472 if (odigits > 0 .and. oval <= 255) then
473 token_len = token_len + 1
474 current_token(token_len:token_len) = char(oval)
475 end if
476 cycle ! pos already advanced past octal digits
477 end block
478 case default
479 ! Unknown escape — keep both chars
480 token_len = token_len + 1
481 current_token(token_len:token_len) = ch
482 token_len = token_len + 1
483 current_token(token_len:token_len) = next_ch
484 end select
485 end if
486 pos = pos + 2
487 else
488 ! Regular character — add literally
489 if (token_len < MAX_TOKEN_LEN) then
490 token_len = token_len + 1
491 current_token(token_len:token_len) = ch
492 end if
493 pos = pos + 1
494 end if
495
496 ! ============ DOUBLE QUOTE STATE ============
497 case(LEX_IN_DOUBLE_QUOTE)
498 if (ch == '\' .and. pos < input_len) then
499 ! Backslash escape in double quotes (only for $, `, ", \, newline)
500 if (next_ch == '$' .or. next_ch == '`') then
501 ! For \$ and \` - keep BOTH chars so expansion can see the escape
502 if (token_len < MAX_TOKEN_LEN - 1) then
503 token_len = token_len + 1
504 current_token(token_len:token_len) = ch
505 token_len = token_len + 1
506 current_token(token_len:token_len) = next_ch
507 end if
508 pos = pos + 2
509 else if (next_ch == '"' .or. next_ch == '\' .or. next_ch == char(10)) then
510 ! For \" and \\ and \newline - add only escaped character
511 if (token_len < MAX_TOKEN_LEN) then
512 token_len = token_len + 1
513 current_token(token_len:token_len) = next_ch
514 end if
515 pos = pos + 2
516 else
517 ! Backslash is literal
518 if (token_len < MAX_TOKEN_LEN) then
519 token_len = token_len + 1
520 current_token(token_len:token_len) = ch
521 end if
522 pos = pos + 1
523 end if
524 else if (ch == '$' .and. pos < input_len .and. next_ch == '(') then
525 ! Command substitution inside double quotes - need to find matching )
526 ! while ignoring quotes inside $()
527 if (token_len < MAX_TOKEN_LEN - 1) then
528 token_len = token_len + 1
529 current_token(token_len:token_len) = '$'
530 token_len = token_len + 1
531 current_token(token_len:token_len) = '('
532 end if
533 pos = pos + 2
534 paren_depth = 1
535 ! Scan to find matching ), respecting nested parens and quotes
536 do while (pos <= input_len .and. paren_depth > 0)
537 ch = input(pos:pos)
538 if (ch == '"') then
539 ! Skip double-quoted string inside command substitution
540 if (token_len < MAX_TOKEN_LEN) then
541 token_len = token_len + 1
542 current_token(token_len:token_len) = ch
543 end if
544 pos = pos + 1
545 do while (pos <= input_len)
546 ch = input(pos:pos)
547 if (ch == '\' .and. pos < input_len) then
548 ! Skip escaped char
549 if (token_len < MAX_TOKEN_LEN - 1) then
550 token_len = token_len + 1
551 current_token(token_len:token_len) = ch
552 token_len = token_len + 1
553 current_token(token_len:token_len) = input(pos+1:pos+1)
554 end if
555 pos = pos + 2
556 else if (ch == '"') then
557 if (token_len < MAX_TOKEN_LEN) then
558 token_len = token_len + 1
559 current_token(token_len:token_len) = ch
560 end if
561 pos = pos + 1
562 exit
563 else
564 if (token_len < MAX_TOKEN_LEN) then
565 token_len = token_len + 1
566 current_token(token_len:token_len) = ch
567 end if
568 pos = pos + 1
569 end if
570 end do
571 else if (ch == "'") then
572 ! Skip single-quoted string
573 if (token_len < MAX_TOKEN_LEN) then
574 token_len = token_len + 1
575 current_token(token_len:token_len) = ch
576 end if
577 pos = pos + 1
578 do while (pos <= input_len .and. input(pos:pos) /= "'")
579 if (token_len < MAX_TOKEN_LEN) then
580 token_len = token_len + 1
581 current_token(token_len:token_len) = input(pos:pos)
582 end if
583 pos = pos + 1
584 end do
585 if (pos <= input_len) then
586 if (token_len < MAX_TOKEN_LEN) then
587 token_len = token_len + 1
588 current_token(token_len:token_len) = "'"
589 end if
590 pos = pos + 1
591 end if
592 else if (ch == '(') then
593 paren_depth = paren_depth + 1
594 if (token_len < MAX_TOKEN_LEN) then
595 token_len = token_len + 1
596 current_token(token_len:token_len) = ch
597 end if
598 pos = pos + 1
599 else if (ch == ')') then
600 paren_depth = paren_depth - 1
601 if (token_len < MAX_TOKEN_LEN) then
602 token_len = token_len + 1
603 current_token(token_len:token_len) = ch
604 end if
605 pos = pos + 1
606 else
607 if (token_len < MAX_TOKEN_LEN) then
608 token_len = token_len + 1
609 current_token(token_len:token_len) = ch
610 end if
611 pos = pos + 1
612 end if
613 end do
614 else if (ch == '"') then
615 ! End of double-quoted string
616 pos = pos + 1 ! Move past closing quote
617 ! Check if next character continues the word (adjacent quote, word char, or escape)
618 if (pos <= input_len) then
619 next_ch = input(pos:pos)
620 if (next_ch == "'" .or. next_ch == '"') then
621 ! Adjacent quote follows - continue building this token
622 ! Add sentinel to mark quote boundary (so expansion knows where quoted part ends)
623 if (token_len < MAX_TOKEN_LEN) then
624 token_len = token_len + 1
625 current_token(token_len:token_len) = char(1) ! ASCII SOH as sentinel
626 end if
627 state = LEX_IN_WORD
628 continuing_word = .false.
629 cycle
630 else if (next_ch == '\') then
631 ! Backslash escape follows - continue building this token
632 ! Add sentinel to mark quote boundary
633 if (token_len < MAX_TOKEN_LEN) then
634 token_len = token_len + 1
635 current_token(token_len:token_len) = char(1) ! ASCII SOH as sentinel
636 end if
637 state = LEX_IN_WORD
638 continuing_word = .false.
639 cycle
640 else if (is_word_char(next_ch)) then
641 ! Word character follows - continue building this token
642 ! Add sentinel to mark quote boundary (so expansion knows where quoted part ends)
643 if (token_len < MAX_TOKEN_LEN) then
644 token_len = token_len + 1
645 current_token(token_len:token_len) = char(1) ! ASCII SOH as sentinel
646 end if
647 state = LEX_IN_WORD
648 continuing_word = .false.
649 cycle
650 end if
651 end if
652 ! No adjacent quote or word char - finalize token
653 if (continuing_word) then
654 ! We're building a multi-part word - go back to LEX_IN_WORD
655 state = LEX_IN_WORD
656 continuing_word = .false.
657 else
658 ! Standalone quoted string - emit token
659 call add_token(tokens, num_tokens, TOKEN_WORD, current_token(1:token_len), &
660 token_start, pos-1, .true., quote_type=QUOTE_DOUBLE)
661 state = LEX_NORMAL
662 end if
663 else
664 ! Add character to token
665 if (token_len < MAX_TOKEN_LEN) then
666 token_len = token_len + 1
667 current_token(token_len:token_len) = ch
668 end if
669 pos = pos + 1
670 end if
671
672 ! ============ WORD STATE ============
673 case(LEX_IN_WORD)
674 ! Check if we're inside $() - if so, keep EVERYTHING including spaces
675 ! IMPORTANT: Also check paren_depth > 0 to ensure we're actually inside the $()
676 if (index(current_token(1:token_len), '$(') > 0 .and. paren_depth > 0) then
677 ! Inside command substitution - track paren depth
678 if (ch == '(') then
679 paren_depth = paren_depth + 1
680 if (token_len < MAX_TOKEN_LEN) then
681 token_len = token_len + 1
682 current_token(token_len:token_len) = ch
683 end if
684 pos = pos + 1
685 else if (ch == ')') then
686 paren_depth = paren_depth - 1
687 if (token_len < MAX_TOKEN_LEN) then
688 token_len = token_len + 1
689 current_token(token_len:token_len) = ch
690 end if
691 pos = pos + 1
692 ! If paren_depth hits 0, we closed the $(...)
693 if (paren_depth == 0) then
694 ! End of command substitution - finish token
695 call add_word_or_keyword(tokens, num_tokens, current_token(1:token_len), &
696 token_start, pos-1, token_has_quoted_part, in_escape)
697 state = LEX_NORMAL
698 in_escape = .false.
699 token_has_quoted_part = .false.
700 end if
701 else
702 ! Inside $() - keep EVERYTHING including spaces
703 if (token_len < MAX_TOKEN_LEN) then
704 token_len = token_len + 1
705 current_token(token_len:token_len) = ch
706 end if
707 pos = pos + 1
708 end if
709 ! Check if we're inside ${ - if so, keep EVERYTHING until closing }
710 ! IMPORTANT: Also check paren_depth > 0 to ensure we're actually inside the ${}
711 else if (index(current_token(1:token_len), '${') > 0 .and. paren_depth > 0) then
712 ! Inside parameter expansion - track brace depth
713 if (ch == '{') then
714 paren_depth = paren_depth + 1
715 if (token_len < MAX_TOKEN_LEN) then
716 token_len = token_len + 1
717 current_token(token_len:token_len) = ch
718 end if
719 pos = pos + 1
720 else if (ch == '}') then
721 paren_depth = paren_depth - 1
722 if (token_len < MAX_TOKEN_LEN) then
723 token_len = token_len + 1
724 current_token(token_len:token_len) = ch
725 end if
726 pos = pos + 1
727 ! If paren_depth hits 0, we closed the ${...}
728 if (paren_depth == 0) then
729 ! Check if next character continues the word (e.g., ${A}${B})
730 ! Don't end token if next char is $ or other word character
731 if (pos <= input_len) then
732 next_ch = input(pos:pos)
733 ! If next character starts a new expansion or is alphanumeric, continue token
734 if (next_ch == '$' .or. next_ch == '{' .or. &
735 (next_ch >= 'a' .and. next_ch <= 'z') .or. &
736 (next_ch >= 'A' .and. next_ch <= 'Z') .or. &
737 (next_ch >= '0' .and. next_ch <= '9') .or. &
738 next_ch == '_' .or. next_ch == '-' .or. next_ch == '.') then
739 ! Continue building the same token - don't end it yet
740 ! state stays LEX_WORD
741 else
742 ! End of parameter expansion - finish token
743 call add_word_or_keyword(tokens, num_tokens, current_token(1:token_len), &
744 token_start, pos-1, token_has_quoted_part, in_escape)
745 state = LEX_NORMAL
746 in_escape = .false.
747 token_has_quoted_part = .false.
748 end if
749 else
750 ! End of input - finish token
751 call add_word_or_keyword(tokens, num_tokens, current_token(1:token_len), &
752 token_start, pos-1, token_has_quoted_part, in_escape)
753 state = LEX_NORMAL
754 in_escape = .false.
755 token_has_quoted_part = .false.
756 end if
757 end if
758 else
759 ! Inside ${ - keep EVERYTHING including spaces
760 if (token_len < MAX_TOKEN_LEN) then
761 token_len = token_len + 1
762 current_token(token_len:token_len) = ch
763 end if
764 pos = pos + 1
765 end if
766 else if (ch == '\' .and. pos < input_len) then
767 ! Backslash escape in word
768 ! For expansion-triggering chars, preserve backslash
769 if (next_ch == '$' .or. next_ch == '`') then
770 if (token_len < MAX_TOKEN_LEN - 1) then
771 token_len = token_len + 1
772 current_token(token_len:token_len) = '\'
773 token_len = token_len + 1
774 current_token(token_len:token_len) = next_ch
775 end if
776 else
777 if (token_len < MAX_TOKEN_LEN) then
778 token_len = token_len + 1
779 current_token(token_len:token_len) = next_ch
780 end if
781 end if
782 pos = pos + 2
783 else if (ch == "'" .or. ch == '"') then
784 ! Check for $' (ANSI-C quoting) when last char in token is $
785 if (ch == "'" .and. token_len >= 1 .and. &
786 current_token(token_len:token_len) == '$') then
787 ! Remove trailing $ and process ANSI-C quoted string
788 token_len = token_len - 1
789 token_has_quoted_part = .true.
790 pos = pos + 1 ! Skip opening '
791 call process_ansi_c_quote(input, pos, input_len, current_token, token_len)
792 cycle
793 end if
794 ! Quote in middle of word - continue building the same token
795 ! Mark that we're continuing a word so quote handler doesn't reset the token
796 continuing_word = .true.
797 token_has_quoted_part = .true. ! Track that this word contains quoted content
798 ! Transition to appropriate quote state
799 if (ch == "'") then
800 ! Check for $' (ANSI-C quoting) — the $ is already in the token
801 if (token_len >= 1 .and. current_token(token_len:token_len) == '$') then
802 token_len = token_len - 1 ! Remove the $ from token
803 state = LEX_IN_DOLLAR_SINGLE_QUOTE
804 pos = pos + 1 ! Skip the opening quote
805 cycle
806 end if
807 ! Add sentinel char(2) to mark start of single-quoted literal (no expansion)
808 if (token_len < MAX_TOKEN_LEN) then
809 token_len = token_len + 1
810 current_token(token_len:token_len) = char(2)
811 end if
812 state = LEX_IN_SINGLE_QUOTE
813 else
814 state = LEX_IN_DOUBLE_QUOTE
815 end if
816 pos = pos + 1 ! Skip the opening quote
817 else if (ch == '#') then
818 ! # is normally comment, but in $# it's part of variable
819 ! Keep it if current token is just $
820 if (token_len == 1 .and. current_token(1:1) == '$') then
821 token_len = token_len + 1
822 current_token(token_len:token_len) = ch
823 pos = pos + 1
824 else
825 ! End word, let # start a comment
826 call add_word_or_keyword(tokens, num_tokens, current_token(1:token_len), &
827 token_start, pos-1, token_has_quoted_part, in_escape)
828 state = LEX_NORMAL
829 in_escape = .false.
830 token_has_quoted_part = .false.
831 end if
832 else if (ch == '$' .and. pos < input_len .and. next_ch == '(') then
833 ! $( for command/arithmetic substitution - keep in word
834 if (token_len < MAX_TOKEN_LEN - 1) then
835 token_len = token_len + 1
836 current_token(token_len:token_len) = ch
837 token_len = token_len + 1
838 current_token(token_len:token_len) = next_ch
839 paren_depth = 1 ! Track that we're inside $(
840 end if
841 pos = pos + 2
842 else if (ch == '$' .and. pos < input_len .and. next_ch == '{') then
843 ! ${ for parameter expansion - keep in word
844 if (token_len < MAX_TOKEN_LEN - 1) then
845 token_len = token_len + 1
846 current_token(token_len:token_len) = ch
847 token_len = token_len + 1
848 current_token(token_len:token_len) = next_ch
849 paren_depth = 1 ! Track that we're inside ${
850 end if
851 pos = pos + 2
852 else if ((ch >= '0' .and. ch <= '9') .or. ch == '+' .or. ch == '-' .or. &
853 ch == '*' .or. ch == '/' .or. ch == '%') then
854 ! Keep these chars in word (for variables and arithmetic)
855 if (token_len < MAX_TOKEN_LEN) then
856 token_len = token_len + 1
857 current_token(token_len:token_len) = ch
858 end if
859 pos = pos + 1
860 else if (ch == '(' .or. ch == ')') then
861 ! Inside [[ ]], keep parens as part of word (regex patterns, grouping)
862 if (in_double_bracket_context) then
863 if (token_len < MAX_TOKEN_LEN) then
864 token_len = token_len + 1
865 current_token(token_len:token_len) = ch
866 end if
867 pos = pos + 1
868 ! Parentheses: Keep ONLY if inside $(( or $(
869 ! Check if current token ends with $ (for x=$(cmd) or just $(cmd))
870 ! NOTE: Only for '(' - ')' after $ (like $$) should end the word
871 else if (ch == '(' .and. token_len >= 1 .and. current_token(token_len:token_len) == '$') then
872 ! Just added $, now seeing ( - this is $( substitution - keep both
873 if (token_len < MAX_TOKEN_LEN) then
874 token_len = token_len + 1
875 current_token(token_len:token_len) = ch
876 end if
877 pos = pos + 1
878 else if (token_len >= 2 .and. index(current_token(1:token_len), '$(') > 0) then
879 ! Already inside $(...) - keep parens
880 if (token_len < MAX_TOKEN_LEN) then
881 token_len = token_len + 1
882 current_token(token_len:token_len) = ch
883 end if
884 pos = pos + 1
885 else if (ch == '(' .and. token_len >= 1 .and. &
886 current_token(token_len:token_len) == '=') then
887 ! Array assignment: VAR=(...) - include the parenthesized content
888 ! Scan for matching ) respecting quotes and nested parens
889 if (token_len < MAX_TOKEN_LEN) then
890 token_len = token_len + 1
891 current_token(token_len:token_len) = '('
892 end if
893 pos = pos + 1
894 paren_depth = 1
895 do while (pos <= input_len .and. paren_depth > 0)
896 ch = input(pos:pos)
897 if (ch == '"') then
898 ! Skip double-quoted string
899 if (token_len < MAX_TOKEN_LEN) then
900 token_len = token_len + 1
901 current_token(token_len:token_len) = ch
902 end if
903 pos = pos + 1
904 do while (pos <= input_len .and. input(pos:pos) /= '"')
905 if (input(pos:pos) == '\' .and. pos < input_len) then
906 if (token_len < MAX_TOKEN_LEN - 1) then
907 token_len = token_len + 1
908 current_token(token_len:token_len) = input(pos:pos)
909 token_len = token_len + 1
910 current_token(token_len:token_len) = input(pos+1:pos+1)
911 end if
912 pos = pos + 2
913 else
914 if (token_len < MAX_TOKEN_LEN) then
915 token_len = token_len + 1
916 current_token(token_len:token_len) = input(pos:pos)
917 end if
918 pos = pos + 1
919 end if
920 end do
921 if (pos <= input_len) then
922 if (token_len < MAX_TOKEN_LEN) then
923 token_len = token_len + 1
924 current_token(token_len:token_len) = '"'
925 end if
926 pos = pos + 1
927 end if
928 else if (ch == "'") then
929 ! Skip single-quoted string
930 if (token_len < MAX_TOKEN_LEN) then
931 token_len = token_len + 1
932 current_token(token_len:token_len) = ch
933 end if
934 pos = pos + 1
935 do while (pos <= input_len .and. input(pos:pos) /= "'")
936 if (token_len < MAX_TOKEN_LEN) then
937 token_len = token_len + 1
938 current_token(token_len:token_len) = input(pos:pos)
939 end if
940 pos = pos + 1
941 end do
942 if (pos <= input_len) then
943 if (token_len < MAX_TOKEN_LEN) then
944 token_len = token_len + 1
945 current_token(token_len:token_len) = "'"
946 end if
947 pos = pos + 1
948 end if
949 else if (ch == '(') then
950 paren_depth = paren_depth + 1
951 if (token_len < MAX_TOKEN_LEN) then
952 token_len = token_len + 1
953 current_token(token_len:token_len) = ch
954 end if
955 pos = pos + 1
956 else if (ch == ')') then
957 paren_depth = paren_depth - 1
958 if (token_len < MAX_TOKEN_LEN) then
959 token_len = token_len + 1
960 current_token(token_len:token_len) = ch
961 end if
962 pos = pos + 1
963 else
964 if (token_len < MAX_TOKEN_LEN) then
965 token_len = token_len + 1
966 current_token(token_len:token_len) = ch
967 end if
968 pos = pos + 1
969 end if
970 end do
971 ! Token complete with closing )
972 call add_word_or_keyword(tokens, num_tokens, current_token(1:token_len), &
973 token_start, pos-1, token_has_quoted_part, in_escape)
974 state = LEX_NORMAL
975 in_escape = .false.
976 token_has_quoted_part = .false.
977 else
978 ! Not in substitution - end word, let paren be operator
979 call add_word_or_keyword(tokens, num_tokens, current_token(1:token_len), &
980 token_start, pos-1, token_has_quoted_part, in_escape)
981 state = LEX_NORMAL
982 in_escape = .false.
983 token_has_quoted_part = .false.
984 end if
985 else if (ch == '{' .or. ch == '}') then
986 ! Braces: Keep in word for brace expansion (e.g., {1,2,3} or file{a,b}.txt)
987 ! They're only command group operators when surrounded by whitespace
988 if (token_len < MAX_TOKEN_LEN) then
989 token_len = token_len + 1
990 current_token(token_len:token_len) = ch
991 end if
992 pos = pos + 1
993 else if (in_double_bracket_context .and. &
994 (ch == '&' .or. ch == '|' .or. ch == '<' .or. ch == '>' .or. &
995 ch == '(' .or. ch == ')')) then
996 ! Inside [[ ]], these are test operators, not shell operators
997 if (token_len < MAX_TOKEN_LEN) then
998 token_len = token_len + 1
999 current_token(token_len:token_len) = ch
1000 end if
1001 pos = pos + 1
1002 else if (is_word_char(ch)) then
1003 ! Continue word
1004 if (token_len < MAX_TOKEN_LEN) then
1005 token_len = token_len + 1
1006 current_token(token_len:token_len) = ch
1007 end if
1008 pos = pos + 1
1009 else
1010 ! End of word
1011 call add_word_or_keyword(tokens, num_tokens, current_token(1:token_len), &
1012 token_start, pos-1, token_has_quoted_part, in_escape)
1013 state = LEX_NORMAL
1014 in_escape = .false.
1015 token_has_quoted_part = .false.
1016 ! Don't increment pos, let NORMAL state handle this character
1017 end if
1018
1019 ! ============ OPERATOR STATE ============
1020 case(LEX_IN_OPERATOR)
1021 ! Try to match multi-character operators
1022 if (token_len == 1) then
1023 select case(current_token(1:1))
1024 case('&')
1025 if (ch == '&') then
1026 current_token(2:2) = ch
1027 token_len = 2
1028 call add_token(tokens, num_tokens, TOKEN_OPERATOR, '&&', token_start, pos, .false.)
1029 state = LEX_NORMAL
1030 pos = pos + 1
1031 else
1032 call add_token(tokens, num_tokens, TOKEN_OPERATOR, '&', token_start, pos-1, .false.)
1033 state = LEX_NORMAL
1034 end if
1035 case('|')
1036 if (ch == '|') then
1037 current_token(2:2) = ch
1038 token_len = 2
1039 call add_token(tokens, num_tokens, TOKEN_OPERATOR, '||', token_start, pos, .false.)
1040 state = LEX_NORMAL
1041 pos = pos + 1
1042 else
1043 call add_token(tokens, num_tokens, TOKEN_OPERATOR, '|', token_start, pos-1, .false.)
1044 state = LEX_NORMAL
1045 end if
1046 case('>')
1047 if (ch == '>') then
1048 current_token(2:2) = ch
1049 token_len = 2
1050 call add_token(tokens, num_tokens, TOKEN_REDIRECT, '>>', token_start, pos, .false.)
1051 state = LEX_NORMAL
1052 pos = pos + 1
1053 else if (ch == '&') then
1054 current_token(2:2) = ch
1055 token_len = 2
1056 call add_token(tokens, num_tokens, TOKEN_REDIRECT, '>&', token_start, pos, .false.)
1057 state = LEX_NORMAL
1058 pos = pos + 1
1059 else if (ch == '|') then
1060 current_token(2:2) = ch
1061 token_len = 2
1062 call add_token(tokens, num_tokens, TOKEN_REDIRECT, '>|', token_start, pos, .false.)
1063 state = LEX_NORMAL
1064 pos = pos + 1
1065 else
1066 call add_token(tokens, num_tokens, TOKEN_REDIRECT, '>', token_start, pos-1, .false.)
1067 state = LEX_NORMAL
1068 end if
1069 case('<')
1070 if (ch == '<') then
1071 ! Could be << or <<< or <<-
1072 if (pos < input_len .and. next_ch == '<') then
1073 current_token(2:3) = '<<'
1074 token_len = 3
1075 call add_token(tokens, num_tokens, TOKEN_REDIRECT, '<<<', token_start, pos+1, .false.)
1076 state = LEX_NORMAL
1077 pos = pos + 2
1078 else if (pos < input_len .and. next_ch == '-') then
1079 current_token(2:3) = '<-'
1080 token_len = 3
1081 call add_token(tokens, num_tokens, TOKEN_REDIRECT, '<<-', token_start, pos+1, .false.)
1082 state = LEX_NORMAL
1083 pos = pos + 2
1084 else
1085 current_token(2:2) = ch
1086 token_len = 2
1087 call add_token(tokens, num_tokens, TOKEN_REDIRECT, '<<', token_start, pos, .false.)
1088 state = LEX_NORMAL
1089 pos = pos + 1
1090 end if
1091 else if (ch == '>') then
1092 current_token(2:2) = ch
1093 token_len = 2
1094 call add_token(tokens, num_tokens, TOKEN_REDIRECT, '<>', token_start, pos, .false.)
1095 state = LEX_NORMAL
1096 pos = pos + 1
1097 else if (ch == '&') then
1098 current_token(2:2) = ch
1099 token_len = 2
1100 call add_token(tokens, num_tokens, TOKEN_REDIRECT, '<&', token_start, pos, .false.)
1101 state = LEX_NORMAL
1102 pos = pos + 1
1103 else
1104 call add_token(tokens, num_tokens, TOKEN_REDIRECT, '<', token_start, pos-1, .false.)
1105 state = LEX_NORMAL
1106 end if
1107 case(';')
1108 if (ch == ';') then
1109 current_token(2:2) = ch
1110 token_len = 2
1111 call add_token(tokens, num_tokens, TOKEN_OPERATOR, ';;', token_start, pos, .false.)
1112 state = LEX_NORMAL
1113 pos = pos + 1
1114 else
1115 call add_token(tokens, num_tokens, TOKEN_OPERATOR, ';', token_start, pos-1, .false.)
1116 state = LEX_NORMAL
1117 end if
1118 case('(', ')')
1119 call add_token(tokens, num_tokens, TOKEN_OPERATOR, current_token(1:1), &
1120 token_start, pos-1, .false.)
1121 state = LEX_NORMAL
1122 case default
1123 ! Single-character operator
1124 call add_token(tokens, num_tokens, TOKEN_OPERATOR, current_token(1:1), &
1125 token_start, pos-1, .false.)
1126 state = LEX_NORMAL
1127 end select
1128 else
1129 ! Multi-character operator complete
1130 state = LEX_NORMAL
1131 end if
1132
1133 end select
1134 end do
1135
1136 ! Flush any remaining token
1137 if (state == LEX_IN_WORD .and. token_len > 0) then
1138 call add_word_or_keyword(tokens, num_tokens, current_token(1:token_len), &
1139 token_start, input_len, token_has_quoted_part, in_escape)
1140 else if (state == LEX_IN_SINGLE_QUOTE .or. state == LEX_IN_DOUBLE_QUOTE) then
1141 ! Unterminated quote - add as word with error marker
1142 if (state == LEX_IN_SINGLE_QUOTE) then
1143 call add_token(tokens, num_tokens, TOKEN_WORD, current_token(1:token_len), &
1144 token_start, input_len, .true., quote_type=QUOTE_SINGLE)
1145 else
1146 call add_token(tokens, num_tokens, TOKEN_WORD, current_token(1:token_len), &
1147 token_start, input_len, .true., quote_type=QUOTE_DOUBLE)
1148 end if
1149 else if (state == LEX_IN_OPERATOR .and. token_len > 0) then
1150 ! Flush operator
1151 call add_token(tokens, num_tokens, TOKEN_OPERATOR, current_token(1:token_len), &
1152 token_start, input_len, .false.)
1153 end if
1154
1155 ! Add EOF token
1156 call add_token(tokens, num_tokens, TOKEN_EOF, '', input_len+1, input_len+1, .false.)
1157
1158 end subroutine tokenize
1159
1160 ! =====================================
1161 ! Helper: Add token to array
1162 ! =====================================
1163 subroutine add_token(tokens, num_tokens, tok_type, value, start_pos, end_pos, quoted, escaped, quote_type)
1164 use shell_types, only: QUOTE_NONE
1165 type(token_t), intent(inout) :: tokens(:)
1166 integer, intent(inout) :: num_tokens
1167 integer, intent(in) :: tok_type, start_pos, end_pos
1168 character(len=*), intent(in) :: value
1169 logical, intent(in) :: quoted
1170 logical, intent(in), optional :: escaped
1171 integer, intent(in), optional :: quote_type
1172
1173 if (num_tokens < size(tokens)) then
1174 num_tokens = num_tokens + 1
1175 tokens(num_tokens)%token_type = tok_type
1176 tokens(num_tokens)%value = value
1177 tokens(num_tokens)%value_length = len(value) ! Store actual content length
1178 tokens(num_tokens)%start_pos = start_pos
1179 tokens(num_tokens)%end_pos = end_pos
1180 tokens(num_tokens)%quoted = quoted
1181 if (present(escaped)) then
1182 tokens(num_tokens)%escaped = escaped
1183 else
1184 tokens(num_tokens)%escaped = .false.
1185 end if
1186 if (present(quote_type)) then
1187 tokens(num_tokens)%quote_type = quote_type
1188 else
1189 tokens(num_tokens)%quote_type = QUOTE_NONE
1190 end if
1191 end if
1192 end subroutine add_token
1193
1194 ! =====================================
1195 ! Helper: Add word or keyword token
1196 ! =====================================
1197 subroutine add_word_or_keyword(tokens, num_tokens, value, start_pos, end_pos, quoted, escaped)
1198 type(token_t), intent(inout) :: tokens(:)
1199 integer, intent(inout) :: num_tokens
1200 character(len=*), intent(in) :: value
1201 integer, intent(in) :: start_pos, end_pos
1202 logical, intent(in) :: quoted
1203 logical, intent(in), optional :: escaped
1204
1205 integer :: tok_type
1206
1207 ! Quoted strings are always words, never keywords
1208 if (quoted) then
1209 tok_type = TOKEN_WORD
1210 else if (is_keyword(value)) then
1211 tok_type = TOKEN_KEYWORD
1212 else
1213 tok_type = TOKEN_WORD
1214 end if
1215
1216 ! Track [[ ]] context: inside test expressions, operators become words
1217 if (.not. quoted) then
1218 if (trim(value) == '[[') in_double_bracket_context = .true.
1219 if (trim(value) == ']]') in_double_bracket_context = .false.
1220 end if
1221
1222 call add_token(tokens, num_tokens, tok_type, value, start_pos, end_pos, quoted, escaped)
1223 end subroutine add_word_or_keyword
1224
1225 ! =====================================
1226 ! next_token - Get next token from stream
1227 ! =====================================
1228 function next_token(tokens, pos) result(tok)
1229 type(token_t), intent(in) :: tokens(:)
1230 integer, intent(inout) :: pos
1231 type(token_t) :: tok
1232
1233 if (pos <= size(tokens) .and. pos > 0) then
1234 tok = tokens(pos)
1235 pos = pos + 1
1236 else
1237 ! Return EOF token
1238 tok%token_type = TOKEN_EOF
1239 tok%value = ''
1240 tok%start_pos = 0
1241 tok%end_pos = 0
1242 tok%quoted = .false.
1243 end if
1244 end function next_token
1245
1246 ! =====================================
1247 ! peek_token - Look ahead without consuming
1248 ! =====================================
1249 function peek_token(tokens, pos) result(tok)
1250 type(token_t), intent(in) :: tokens(:)
1251 integer, intent(in) :: pos
1252 type(token_t) :: tok
1253
1254 if (pos <= size(tokens) .and. pos > 0) then
1255 tok = tokens(pos)
1256 else
1257 ! Return EOF token
1258 tok%token_type = TOKEN_EOF
1259 tok%value = ''
1260 tok%start_pos = 0
1261 tok%end_pos = 0
1262 tok%quoted = .false.
1263 end if
1264 end function peek_token
1265
1266 ! =====================================
1267 ! process_ansi_c_quote - Handle $'...' ANSI-C quoting
1268 ! Reads characters from input starting at pos (after $'),
1269 ! processes escape sequences, appends to current_token.
1270 ! Wraps output in char(2)/char(3) sentinels to prevent expansion.
1271 ! =====================================
1272 subroutine process_ansi_c_quote(input, pos, input_len, current_token, token_len)
1273 character(len=*), intent(in) :: input
1274 integer, intent(inout) :: pos
1275 integer, intent(in) :: input_len
1276 character(len=*), intent(inout) :: current_token
1277 integer, intent(inout) :: token_len
1278
1279 character :: ch, esc_ch
1280 integer :: oct_val, hex_val, n_digits, i
1281
1282 ! Add sentinel to mark start of quoted content (no expansion)
1283 if (token_len < MAX_TOKEN_LEN) then
1284 token_len = token_len + 1
1285 current_token(token_len:token_len) = char(2)
1286 end if
1287
1288 do while (pos <= input_len)
1289 ch = input(pos:pos)
1290
1291 if (ch == "'") then
1292 ! Closing quote
1293 pos = pos + 1
1294 ! Add sentinel to mark end of quoted content
1295 if (token_len < MAX_TOKEN_LEN) then
1296 token_len = token_len + 1
1297 current_token(token_len:token_len) = char(3)
1298 end if
1299 return
1300 end if
1301
1302 if (ch == '\' .and. pos < input_len) then
1303 ! Escape sequence
1304 esc_ch = input(pos+1:pos+1)
1305 select case(esc_ch)
1306 case('a') ! Alert (bell)
1307 if (token_len < MAX_TOKEN_LEN) then
1308 token_len = token_len + 1
1309 current_token(token_len:token_len) = char(7)
1310 end if
1311 pos = pos + 2
1312 case('b') ! Backspace
1313 if (token_len < MAX_TOKEN_LEN) then
1314 token_len = token_len + 1
1315 current_token(token_len:token_len) = char(8)
1316 end if
1317 pos = pos + 2
1318 case('e', 'E') ! Escape
1319 if (token_len < MAX_TOKEN_LEN) then
1320 token_len = token_len + 1
1321 current_token(token_len:token_len) = char(27)
1322 end if
1323 pos = pos + 2
1324 case('f') ! Form feed
1325 if (token_len < MAX_TOKEN_LEN) then
1326 token_len = token_len + 1
1327 current_token(token_len:token_len) = char(12)
1328 end if
1329 pos = pos + 2
1330 case('n') ! Newline
1331 if (token_len < MAX_TOKEN_LEN) then
1332 token_len = token_len + 1
1333 current_token(token_len:token_len) = char(10)
1334 end if
1335 pos = pos + 2
1336 case('r') ! Carriage return
1337 if (token_len < MAX_TOKEN_LEN) then
1338 token_len = token_len + 1
1339 current_token(token_len:token_len) = char(13)
1340 end if
1341 pos = pos + 2
1342 case('t') ! Horizontal tab
1343 if (token_len < MAX_TOKEN_LEN) then
1344 token_len = token_len + 1
1345 current_token(token_len:token_len) = char(9)
1346 end if
1347 pos = pos + 2
1348 case('v') ! Vertical tab
1349 if (token_len < MAX_TOKEN_LEN) then
1350 token_len = token_len + 1
1351 current_token(token_len:token_len) = char(11)
1352 end if
1353 pos = pos + 2
1354 case('\') ! Literal backslash
1355 if (token_len < MAX_TOKEN_LEN) then
1356 token_len = token_len + 1
1357 current_token(token_len:token_len) = '\'
1358 end if
1359 pos = pos + 2
1360 case("'") ! Literal single quote
1361 if (token_len < MAX_TOKEN_LEN) then
1362 token_len = token_len + 1
1363 current_token(token_len:token_len) = "'"
1364 end if
1365 pos = pos + 2
1366 case('"') ! Literal double quote
1367 if (token_len < MAX_TOKEN_LEN) then
1368 token_len = token_len + 1
1369 current_token(token_len:token_len) = '"'
1370 end if
1371 pos = pos + 2
1372 case('0', '1', '2', '3', '4', '5', '6', '7')
1373 ! Octal: \nnn (up to 3 digits)
1374 oct_val = 0
1375 n_digits = 0
1376 pos = pos + 1 ! skip backslash
1377 do while (pos <= input_len .and. n_digits < 3)
1378 ch = input(pos:pos)
1379 if (ch >= '0' .and. ch <= '7') then
1380 oct_val = oct_val * 8 + (ichar(ch) - ichar('0'))
1381 pos = pos + 1
1382 n_digits = n_digits + 1
1383 else
1384 exit
1385 end if
1386 end do
1387 if (oct_val > 0 .and. oct_val <= 127 .and. token_len < MAX_TOKEN_LEN) then
1388 token_len = token_len + 1
1389 current_token(token_len:token_len) = char(oct_val)
1390 end if
1391 case('x')
1392 ! Hex: \xHH (up to 2 digits)
1393 hex_val = 0
1394 n_digits = 0
1395 pos = pos + 2 ! skip \x
1396 do while (pos <= input_len .and. n_digits < 2)
1397 ch = input(pos:pos)
1398 if (ch >= '0' .and. ch <= '9') then
1399 hex_val = hex_val * 16 + (ichar(ch) - ichar('0'))
1400 pos = pos + 1
1401 n_digits = n_digits + 1
1402 else if (ch >= 'a' .and. ch <= 'f') then
1403 hex_val = hex_val * 16 + (ichar(ch) - ichar('a') + 10)
1404 pos = pos + 1
1405 n_digits = n_digits + 1
1406 else if (ch >= 'A' .and. ch <= 'F') then
1407 hex_val = hex_val * 16 + (ichar(ch) - ichar('A') + 10)
1408 pos = pos + 1
1409 n_digits = n_digits + 1
1410 else
1411 exit
1412 end if
1413 end do
1414 if (hex_val > 0 .and. hex_val <= 127 .and. token_len < MAX_TOKEN_LEN) then
1415 token_len = token_len + 1
1416 current_token(token_len:token_len) = char(hex_val)
1417 end if
1418 case default
1419 ! Unknown escape: include backslash and character literally
1420 if (token_len < MAX_TOKEN_LEN - 1) then
1421 token_len = token_len + 1
1422 current_token(token_len:token_len) = '\'
1423 token_len = token_len + 1
1424 current_token(token_len:token_len) = esc_ch
1425 end if
1426 pos = pos + 2
1427 end select
1428 else
1429 ! Regular character
1430 if (token_len < MAX_TOKEN_LEN) then
1431 token_len = token_len + 1
1432 current_token(token_len:token_len) = ch
1433 end if
1434 pos = pos + 1
1435 end if
1436 end do
1437
1438 ! Unterminated $'...' - add sentinel anyway
1439 if (token_len < MAX_TOKEN_LEN) then
1440 token_len = token_len + 1
1441 current_token(token_len:token_len) = char(3)
1442 end if
1443 end subroutine process_ansi_c_quote
1444
1445 end module lexer
1446