ferp Public

Watch 0 Fork 1 Star 0

Fortran · 13562 bytes Raw Blame History

  
        1
        module regex_lexer
      
        2
          !> Regex pattern tokenizer for FERP
      
        3
          !> Handles both BRE (Basic) and ERE (Extended) regex dialects
      
        4
          use regex_types
      
        5
          use ferp_kinds, only: pattern_len
      
        6
          implicit none
      
        7
          private
      
        8
        
        9
          public :: tokenize
      
        10
        
        11
        contains
      
        12
        
        13
          subroutine tokenize(pattern, tokens, is_ere, ierr)
      
        14
            !> Tokenize a regex pattern
      
        15
            character(len=*), intent(in) :: pattern
      
        16
            type(token_list_t), intent(out) :: tokens
      
        17
            logical, intent(in) :: is_ere           ! True for ERE, false for BRE
      
        18
            integer, intent(out) :: ierr
      
        19
        
        20
            integer :: i, n
      
        21
            character(len=1) :: c, next_c
      
        22
            type(token_t) :: tok
      
        23
            logical :: in_bracket
      
        24
        
        25
            ierr = 0
      
        26
            call tokens%init()
      
        27
            n = pattern_len(pattern)  ! Use pattern_len to preserve whitespace patterns
      
        28
            i = 1
      
        29
            in_bracket = .false.
      
        30
        
        31
            do while (i <= n)
      
        32
              c = pattern(i:i)
      
        33
              next_c = ' '
      
        34
              if (i < n) next_c = pattern(i+1:i+1)
      
        35
        
        36
              tok = token_t()
      
        37
              tok%pos = i
      
        38
        
        39
              ! Handle character class separately
      
        40
              if (c == '[') then
      
        41
                call parse_char_class(pattern, i, n, tok, ierr)
      
        42
                if (ierr /= 0) return
      
        43
                call tokens%append(tok)
      
        44
                cycle
      
        45
              end if
      
        46
        
        47
              ! Handle escape sequences
      
        48
              if (c == '\') then
      
        49
                if (i >= n) then
      
        50
                  ! Trailing backslash - treat as literal
      
        51
                  tok%ttype = TOK_LITERAL
      
        52
                  tok%char_val = '\'
      
        53
                  i = i + 1
      
        54
                else
      
        55
                  call parse_escape(pattern, i, n, tok, is_ere, ierr)
      
        56
                  if (ierr /= 0) return
      
        57
                end if
      
        58
                call tokens%append(tok)
      
        59
                cycle
      
        60
              end if
      
        61
        
        62
              ! Handle metacharacters based on BRE vs ERE
      
        63
              select case (c)
      
        64
                case ('.')
      
        65
                  tok%ttype = TOK_DOT
      
        66
                  i = i + 1
      
        67
        
        68
                case ('*')
      
        69
                  tok%ttype = TOK_STAR
      
        70
                  i = i + 1
      
        71
        
        72
                case ('^')
      
        73
                  tok%ttype = TOK_CARET
      
        74
                  i = i + 1
      
        75
        
        76
                case ('$')
      
        77
                  tok%ttype = TOK_DOLLAR
      
        78
                  i = i + 1
      
        79
        
        80
                case ('+')
      
        81
                  if (is_ere) then
      
        82
                    tok%ttype = TOK_PLUS
      
        83
                  else
      
        84
                    tok%ttype = TOK_LITERAL
      
        85
                    tok%char_val = '+'
      
        86
                  end if
      
        87
                  i = i + 1
      
        88
        
        89
                case ('?')
      
        90
                  if (is_ere) then
      
        91
                    tok%ttype = TOK_QUESTION
      
        92
                  else
      
        93
                    tok%ttype = TOK_LITERAL
      
        94
                    tok%char_val = '?'
      
        95
                  end if
      
        96
                  i = i + 1
      
        97
        
        98
                case ('|')
      
        99
                  if (is_ere) then
      
        100
                    tok%ttype = TOK_PIPE
      
        101
                  else
      
        102
                    tok%ttype = TOK_LITERAL
      
        103
                    tok%char_val = '|'
      
        104
                  end if
      
        105
                  i = i + 1
      
        106
        
        107
                case ('(')
      
        108
                  if (is_ere) then
      
        109
                    tok%ttype = TOK_LPAREN
      
        110
                  else
      
        111
                    tok%ttype = TOK_LITERAL
      
        112
                    tok%char_val = '('
      
        113
                  end if
      
        114
                  i = i + 1
      
        115
        
        116
                case (')')
      
        117
                  if (is_ere) then
      
        118
                    tok%ttype = TOK_RPAREN
      
        119
                  else
      
        120
                    tok%ttype = TOK_LITERAL
      
        121
                    tok%char_val = ')'
      
        122
                  end if
      
        123
                  i = i + 1
      
        124
        
        125
                case ('{')
      
        126
                  if (is_ere) then
      
        127
                    tok%ttype = TOK_LBRACE
      
        128
                  else
      
        129
                    tok%ttype = TOK_LITERAL
      
        130
                    tok%char_val = '{'
      
        131
                  end if
      
        132
                  i = i + 1
      
        133
        
        134
                case ('}')
      
        135
                  if (is_ere) then
      
        136
                    tok%ttype = TOK_RBRACE
      
        137
                  else
      
        138
                    tok%ttype = TOK_LITERAL
      
        139
                    tok%char_val = '}'
      
        140
                  end if
      
        141
                  i = i + 1
      
        142
        
        143
                case default
      
        144
                  ! Literal character
      
        145
                  tok%ttype = TOK_LITERAL
      
        146
                  tok%char_val = c
      
        147
                  i = i + 1
      
        148
              end select
      
        149
        
        150
              call tokens%append(tok)
      
        151
            end do
      
        152
        
        153
            ! Add end token
      
        154
            tok = token_t()
      
        155
            tok%ttype = TOK_END
      
        156
            tok%pos = n + 1
      
        157
            call tokens%append(tok)
      
        158
        
        159
          end subroutine tokenize
      
        160
        
        161
          subroutine parse_escape(pattern, pos, n, tok, is_ere, ierr)
      
        162
            !> Parse an escape sequence starting at pos (which points to \)
      
        163
            character(len=*), intent(in) :: pattern
      
        164
            integer, intent(inout) :: pos
      
        165
            integer, intent(in) :: n
      
        166
            type(token_t), intent(out) :: tok
      
        167
            logical, intent(in) :: is_ere
      
        168
            integer, intent(out) :: ierr
      
        169
        
        170
            character(len=1) :: c
      
        171
            integer :: ref_num
      
        172
        
        173
            ierr = 0
      
        174
            pos = pos + 1  ! Skip the backslash
      
        175
            c = pattern(pos:pos)
      
        176
        
        177
            select case (c)
      
        178
              ! BRE special escapes (become metacharacters)
      
        179
              case ('(')
      
        180
                if (is_ere) then
      
        181
                  tok%ttype = TOK_LITERAL
      
        182
                  tok%char_val = '('
      
        183
                else
      
        184
                  tok%ttype = TOK_LPAREN
      
        185
                end if
      
        186
                pos = pos + 1
      
        187
        
        188
              case (')')
      
        189
                if (is_ere) then
      
        190
                  tok%ttype = TOK_LITERAL
      
        191
                  tok%char_val = ')'
      
        192
                else
      
        193
                  tok%ttype = TOK_RPAREN
      
        194
                end if
      
        195
                pos = pos + 1
      
        196
        
        197
              case ('{')
      
        198
                if (is_ere) then
      
        199
                  tok%ttype = TOK_LITERAL
      
        200
                  tok%char_val = '{'
      
        201
                else
      
        202
                  tok%ttype = TOK_LBRACE
      
        203
                end if
      
        204
                pos = pos + 1
      
        205
        
        206
              case ('}')
      
        207
                if (is_ere) then
      
        208
                  tok%ttype = TOK_LITERAL
      
        209
                  tok%char_val = '}'
      
        210
                else
      
        211
                  tok%ttype = TOK_RBRACE
      
        212
                end if
      
        213
                pos = pos + 1
      
        214
        
        215
              ! GNU extensions for BRE (also work in ERE)
      
        216
              case ('+')
      
        217
                if (.not. is_ere) then
      
        218
                  tok%ttype = TOK_PLUS
      
        219
                else
      
        220
                  tok%ttype = TOK_LITERAL
      
        221
                  tok%char_val = '+'
      
        222
                end if
      
        223
                pos = pos + 1
      
        224
        
        225
              case ('?')
      
        226
                if (.not. is_ere) then
      
        227
                  tok%ttype = TOK_QUESTION
      
        228
                else
      
        229
                  tok%ttype = TOK_LITERAL
      
        230
                  tok%char_val = '?'
      
        231
                end if
      
        232
                pos = pos + 1
      
        233
        
        234
              case ('|')
      
        235
                if (.not. is_ere) then
      
        236
                  tok%ttype = TOK_PIPE
      
        237
                else
      
        238
                  tok%ttype = TOK_LITERAL
      
        239
                  tok%char_val = '|'
      
        240
                end if
      
        241
                pos = pos + 1
      
        242
        
        243
              ! Backreferences \1-\9
      
        244
              case ('1', '2', '3', '4', '5', '6', '7', '8', '9')
      
        245
                tok%ttype = TOK_BACKREF
      
        246
                read(c, '(I1)') ref_num
      
        247
                tok%int_val = ref_num
      
        248
                pos = pos + 1
      
        249
        
        250
              ! Word boundaries
      
        251
              case ('<')
      
        252
                tok%ttype = TOK_WORD_BOUNDARY
      
        253
                tok%int_val = 1  ! word start
      
        254
                pos = pos + 1
      
        255
        
        256
              case ('>')
      
        257
                tok%ttype = TOK_WORD_BOUNDARY
      
        258
                tok%int_val = 2  ! word end
      
        259
                pos = pos + 1
      
        260
        
        261
              case ('b')
      
        262
                tok%ttype = TOK_WORD_BOUNDARY
      
        263
                tok%int_val = 3  ! word boundary (either)
      
        264
                pos = pos + 1
      
        265
        
        266
              case ('B')
      
        267
                tok%ttype = TOK_WORD_BOUNDARY
      
        268
                tok%int_val = 4  ! not word boundary
      
        269
                pos = pos + 1
      
        270
        
        271
              ! Character escapes
      
        272
              case ('n')
      
        273
                tok%ttype = TOK_LITERAL
      
        274
                tok%char_val = char(10)  ! newline
      
        275
                pos = pos + 1
      
        276
        
        277
              case ('t')
      
        278
                tok%ttype = TOK_LITERAL
      
        279
                tok%char_val = char(9)   ! tab
      
        280
                pos = pos + 1
      
        281
        
        282
              case ('r')
      
        283
                tok%ttype = TOK_LITERAL
      
        284
                tok%char_val = char(13)  ! carriage return
      
        285
                pos = pos + 1
      
        286
        
        287
              ! Escape metacharacters to make them literal
      
        288
              case ('.', '*', '^', '$', '[', ']', '\')
      
        289
                tok%ttype = TOK_LITERAL
      
        290
                tok%char_val = c
      
        291
                pos = pos + 1
      
        292
        
        293
              case default
      
        294
                ! Unknown escape - treat as literal
      
        295
                tok%ttype = TOK_LITERAL
      
        296
                tok%char_val = c
      
        297
                pos = pos + 1
      
        298
            end select
      
        299
        
        300
          end subroutine parse_escape
      
        301
        
        302
          subroutine parse_char_class(pattern, pos, n, tok, ierr)
      
        303
            !> Parse a character class [...] starting at pos (which points to [)
      
        304
            character(len=*), intent(in) :: pattern
      
        305
            integer, intent(inout) :: pos
      
        306
            integer, intent(in) :: n
      
        307
            type(token_t), intent(out) :: tok
      
        308
            integer, intent(out) :: ierr
      
        309
        
        310
            integer :: j, start_char, end_char
      
        311
            character(len=1) :: c, prev_c
      
        312
            logical :: negated, first
      
        313
        
        314
            ierr = 0
      
        315
            tok%ttype = TOK_LITERAL  ! Will be set properly at end
      
        316
            tok%char_class = .false.
      
        317
            tok%negated = .false.
      
        318
        
        319
            pos = pos + 1  ! Skip [
      
        320
        
        321
            if (pos > n) then
      
        322
              ierr = 1
      
        323
              return
      
        324
            end if
      
        325
        
        326
            ! Check for negation
      
        327
            negated = .false.
      
        328
            if (pattern(pos:pos) == '^') then
      
        329
              negated = .true.
      
        330
              pos = pos + 1
      
        331
            end if
      
        332
        
        333
            ! Handle ] at start (it's literal)
      
        334
            first = .true.
      
        335
            if (pos <= n .and. pattern(pos:pos) == ']') then
      
        336
              tok%char_class(ichar(']')) = .true.
      
        337
              pos = pos + 1
      
        338
              first = .false.
      
        339
            end if
      
        340
        
        341
            ! Handle - at start (it's literal)
      
        342
            if (pos <= n .and. pattern(pos:pos) == '-') then
      
        343
              tok%char_class(ichar('-')) = .true.
      
        344
              pos = pos + 1
      
        345
            end if
      
        346
        
        347
            prev_c = char(0)
      
        348
            do while (pos <= n)
      
        349
              c = pattern(pos:pos)
      
        350
        
        351
              if (c == ']') then
      
        352
                ! End of character class
      
        353
                pos = pos + 1
      
        354
                tok%negated = negated
      
        355
                tok%ttype = TOK_LBRACKET  ! Indicate this is a char class token
      
        356
                return
      
        357
              end if
      
        358
        
        359
              if (c == '-' .and. pos + 1 <= n .and. pattern(pos+1:pos+1) /= ']') then
      
        360
                ! Range: prev_c - next_c
      
        361
                if (prev_c /= char(0)) then
      
        362
                  pos = pos + 1
      
        363
                  if (pos > n) then
      
        364
                    ierr = 1
      
        365
                    return
      
        366
                  end if
      
        367
                  c = pattern(pos:pos)
      
        368
        
        369
                  ! Handle escape in range end
      
        370
                  if (c == '\' .and. pos + 1 <= n) then
      
        371
                    pos = pos + 1
      
        372
                    c = pattern(pos:pos)
      
        373
                  end if
      
        374
        
        375
                  start_char = ichar(prev_c)
      
        376
                  end_char = ichar(c)
      
        377
                  if (start_char > end_char) then
      
        378
                    ! Invalid range, but we'll be lenient
      
        379
                    tok%char_class(start_char) = .true.
      
        380
                    tok%char_class(ichar('-')) = .true.
      
        381
                    tok%char_class(end_char) = .true.
      
        382
                  else
      
        383
                    do j = start_char, end_char
      
        384
                      tok%char_class(j) = .true.
      
        385
                    end do
      
        386
                  end if
      
        387
                  prev_c = char(0)  ! Reset after range
      
        388
                  pos = pos + 1
      
        389
                  cycle
      
        390
                else
      
        391
                  ! - at start after ] or as first char
      
        392
                  tok%char_class(ichar('-')) = .true.
      
        393
                  pos = pos + 1
      
        394
                  cycle
      
        395
                end if
      
        396
              end if
      
        397
        
        398
              if (c == '\' .and. pos + 1 <= n) then
      
        399
                ! Escape sequence in character class
      
        400
                pos = pos + 1
      
        401
                c = pattern(pos:pos)
      
        402
                select case (c)
      
        403
                  case ('n')
      
        404
                    c = char(10)
      
        405
                  case ('t')
      
        406
                    c = char(9)
      
        407
                  case ('r')
      
        408
                    c = char(13)
      
        409
                  ! Otherwise take the character literally
      
        410
                end select
      
        411
              end if
      
        412
        
        413
              if (c == '[' .and. pos + 1 <= n .and. pattern(pos+1:pos+1) == ':') then
      
        414
                ! POSIX character class [:alpha:] etc
      
        415
                call parse_posix_class(pattern, pos, n, tok%char_class, ierr)
      
        416
                if (ierr /= 0) return
      
        417
                prev_c = char(0)
      
        418
                cycle
      
        419
              end if
      
        420
        
        421
              ! Regular character
      
        422
              tok%char_class(ichar(c)) = .true.
      
        423
              prev_c = c
      
        424
              pos = pos + 1
      
        425
            end do
      
        426
        
        427
            ! Unterminated character class
      
        428
            ierr = 1
      
        429
        
        430
          end subroutine parse_char_class
      
        431
        
        432
          subroutine parse_posix_class(pattern, pos, n, char_class, ierr)
      
        433
            !> Parse POSIX character class [:name:]
      
        434
            character(len=*), intent(in) :: pattern
      
        435
            integer, intent(inout) :: pos
      
        436
            integer, intent(in) :: n
      
        437
            logical, intent(inout) :: char_class(0:255)
      
        438
            integer, intent(out) :: ierr
      
        439
        
        440
            integer :: end_pos, j
      
        441
            character(len=16) :: class_name
      
        442
        
        443
            ierr = 0
      
        444
        
        445
            ! Find closing :]
      
        446
            end_pos = index(pattern(pos:n), ':]')
      
        447
            if (end_pos == 0) then
      
        448
              ierr = 1
      
        449
              return
      
        450
            end if
      
        451
            end_pos = pos + end_pos - 1  ! Adjust to absolute position (index is 1-based)
      
        452
        
        453
            ! Extract class name (skip [: at start, stop before :]
      
        454
            class_name = pattern(pos+2:end_pos-1)
      
        455
        
        456
            select case (trim(class_name))
      
        457
              case ('alnum')
      
        458
                do j = ichar('a'), ichar('z')
      
        459
                  char_class(j) = .true.
      
        460
                end do
      
        461
                do j = ichar('A'), ichar('Z')
      
        462
                  char_class(j) = .true.
      
        463
                end do
      
        464
                do j = ichar('0'), ichar('9')
      
        465
                  char_class(j) = .true.
      
        466
                end do
      
        467
        
        468
              case ('alpha')
      
        469
                do j = ichar('a'), ichar('z')
      
        470
                  char_class(j) = .true.
      
        471
                end do
      
        472
                do j = ichar('A'), ichar('Z')
      
        473
                  char_class(j) = .true.
      
        474
                end do
      
        475
        
        476
              case ('digit')
      
        477
                do j = ichar('0'), ichar('9')
      
        478
                  char_class(j) = .true.
      
        479
                end do
      
        480
        
        481
              case ('lower')
      
        482
                do j = ichar('a'), ichar('z')
      
        483
                  char_class(j) = .true.
      
        484
                end do
      
        485
        
        486
              case ('upper')
      
        487
                do j = ichar('A'), ichar('Z')
      
        488
                  char_class(j) = .true.
      
        489
                end do
      
        490
        
        491
              case ('space')
      
        492
                char_class(ichar(' ')) = .true.
      
        493
                char_class(9) = .true.   ! tab
      
        494
                char_class(10) = .true.  ! newline
      
        495
                char_class(11) = .true.  ! vertical tab
      
        496
                char_class(12) = .true.  ! form feed
      
        497
                char_class(13) = .true.  ! carriage return
      
        498
        
        499
              case ('blank')
      
        500
                char_class(ichar(' ')) = .true.
      
        501
                char_class(9) = .true.   ! tab
      
        502
        
        503
              case ('punct')
      
        504
                ! Punctuation characters
      
        505
                do j = 33, 47
      
        506
                  char_class(j) = .true.
      
        507
                end do
      
        508
                do j = 58, 64
      
        509
                  char_class(j) = .true.
      
        510
                end do
      
        511
                do j = 91, 96
      
        512
                  char_class(j) = .true.
      
        513
                end do
      
        514
                do j = 123, 126
      
        515
                  char_class(j) = .true.
      
        516
                end do
      
        517
        
        518
              case ('xdigit')
      
        519
                do j = ichar('0'), ichar('9')
      
        520
                  char_class(j) = .true.
      
        521
                end do
      
        522
                do j = ichar('a'), ichar('f')
      
        523
                  char_class(j) = .true.
      
        524
                end do
      
        525
                do j = ichar('A'), ichar('F')
      
        526
                  char_class(j) = .true.
      
        527
                end do
      
        528
        
        529
              case ('word')
      
        530
                ! GNU extension: word characters
      
        531
                do j = ichar('a'), ichar('z')
      
        532
                  char_class(j) = .true.
      
        533
                end do
      
        534
                do j = ichar('A'), ichar('Z')
      
        535
                  char_class(j) = .true.
      
        536
                end do
      
        537
                do j = ichar('0'), ichar('9')
      
        538
                  char_class(j) = .true.
      
        539
                end do
      
        540
                char_class(ichar('_')) = .true.
      
        541
        
        542
              case default
      
        543
                ! Unknown class - ignore silently
      
        544
            end select
      
        545
        
        546
            pos = end_pos + 2  ! Skip past :] (end_pos points to ':', so +2 to skip both)
      
        547
        
        548
          end subroutine parse_posix_class
      
        549
        
        550
        end module regex_lexer
      
        551

1	module regex_lexer
2	!> Regex pattern tokenizer for FERP
3	!> Handles both BRE (Basic) and ERE (Extended) regex dialects
4	use regex_types
5	use ferp_kinds, only: pattern_len
6	implicit none
7	private
8
9	public :: tokenize
10
11	contains
12
13	subroutine tokenize(pattern, tokens, is_ere, ierr)
14	!> Tokenize a regex pattern
15	character(len=*), intent(in) :: pattern
16	type(token_list_t), intent(out) :: tokens
17	logical, intent(in) :: is_ere ! True for ERE, false for BRE
18	integer, intent(out) :: ierr
19
20	integer :: i, n
21	character(len=1) :: c, next_c
22	type(token_t) :: tok
23	logical :: in_bracket
24
25	ierr = 0
26	call tokens%init()
27	n = pattern_len(pattern) ! Use pattern_len to preserve whitespace patterns
28	i = 1
29	in_bracket = .false.
30
31	do while (i <= n)
32	c = pattern(i:i)
33	next_c = ' '
34	if (i < n) next_c = pattern(i+1:i+1)
35
36	tok = token_t()
37	tok%pos = i
38
39	! Handle character class separately
40	if (c == '[') then
41	call parse_char_class(pattern, i, n, tok, ierr)
42	if (ierr /= 0) return
43	call tokens%append(tok)
44	cycle
45	end if
46
47	! Handle escape sequences
48	if (c == '\') then
49	if (i >= n) then
50	! Trailing backslash - treat as literal
51	tok%ttype = TOK_LITERAL
52	tok%char_val = '\'
53	i = i + 1
54	else
55	call parse_escape(pattern, i, n, tok, is_ere, ierr)
56	if (ierr /= 0) return
57	end if
58	call tokens%append(tok)
59	cycle
60	end if
61
62	! Handle metacharacters based on BRE vs ERE
63	select case (c)
64	case ('.')
65	tok%ttype = TOK_DOT
66	i = i + 1
67
68	case ('*')
69	tok%ttype = TOK_STAR
70	i = i + 1
71
72	case ('^')
73	tok%ttype = TOK_CARET
74	i = i + 1
75
76	case ('$')
77	tok%ttype = TOK_DOLLAR
78	i = i + 1
79
80	case ('+')
81	if (is_ere) then
82	tok%ttype = TOK_PLUS
83	else
84	tok%ttype = TOK_LITERAL
85	tok%char_val = '+'
86	end if
87	i = i + 1
88
89	case ('?')
90	if (is_ere) then
91	tok%ttype = TOK_QUESTION
92	else
93	tok%ttype = TOK_LITERAL
94	tok%char_val = '?'
95	end if
96	i = i + 1
97
98	case ('\|')
99	if (is_ere) then
100	tok%ttype = TOK_PIPE
101	else
102	tok%ttype = TOK_LITERAL
103	tok%char_val = '\|'
104	end if
105	i = i + 1
106
107	case ('(')
108	if (is_ere) then
109	tok%ttype = TOK_LPAREN
110	else
111	tok%ttype = TOK_LITERAL
112	tok%char_val = '('
113	end if
114	i = i + 1
115
116	case (')')
117	if (is_ere) then
118	tok%ttype = TOK_RPAREN
119	else
120	tok%ttype = TOK_LITERAL
121	tok%char_val = ')'
122	end if
123	i = i + 1
124
125	case ('{')
126	if (is_ere) then
127	tok%ttype = TOK_LBRACE
128	else
129	tok%ttype = TOK_LITERAL
130	tok%char_val = '{'
131	end if
132	i = i + 1
133
134	case ('}')
135	if (is_ere) then
136	tok%ttype = TOK_RBRACE
137	else
138	tok%ttype = TOK_LITERAL
139	tok%char_val = '}'
140	end if
141	i = i + 1
142
143	case default
144	! Literal character
145	tok%ttype = TOK_LITERAL
146	tok%char_val = c
147	i = i + 1
148	end select
149
150	call tokens%append(tok)
151	end do
152
153	! Add end token
154	tok = token_t()
155	tok%ttype = TOK_END
156	tok%pos = n + 1
157	call tokens%append(tok)
158
159	end subroutine tokenize
160
161	subroutine parse_escape(pattern, pos, n, tok, is_ere, ierr)
162	!> Parse an escape sequence starting at pos (which points to \)
163	character(len=*), intent(in) :: pattern
164	integer, intent(inout) :: pos
165	integer, intent(in) :: n
166	type(token_t), intent(out) :: tok
167	logical, intent(in) :: is_ere
168	integer, intent(out) :: ierr
169
170	character(len=1) :: c
171	integer :: ref_num
172
173	ierr = 0
174	pos = pos + 1 ! Skip the backslash
175	c = pattern(pos:pos)
176
177	select case (c)
178	! BRE special escapes (become metacharacters)
179	case ('(')
180	if (is_ere) then
181	tok%ttype = TOK_LITERAL
182	tok%char_val = '('
183	else
184	tok%ttype = TOK_LPAREN
185	end if
186	pos = pos + 1
187
188	case (')')
189	if (is_ere) then
190	tok%ttype = TOK_LITERAL
191	tok%char_val = ')'
192	else
193	tok%ttype = TOK_RPAREN
194	end if
195	pos = pos + 1
196
197	case ('{')
198	if (is_ere) then
199	tok%ttype = TOK_LITERAL
200	tok%char_val = '{'
201	else
202	tok%ttype = TOK_LBRACE
203	end if
204	pos = pos + 1
205
206	case ('}')
207	if (is_ere) then
208	tok%ttype = TOK_LITERAL
209	tok%char_val = '}'
210	else
211	tok%ttype = TOK_RBRACE
212	end if
213	pos = pos + 1
214
215	! GNU extensions for BRE (also work in ERE)
216	case ('+')
217	if (.not. is_ere) then
218	tok%ttype = TOK_PLUS
219	else
220	tok%ttype = TOK_LITERAL
221	tok%char_val = '+'
222	end if
223	pos = pos + 1
224
225	case ('?')
226	if (.not. is_ere) then
227	tok%ttype = TOK_QUESTION
228	else
229	tok%ttype = TOK_LITERAL
230	tok%char_val = '?'
231	end if
232	pos = pos + 1
233
234	case ('\|')
235	if (.not. is_ere) then
236	tok%ttype = TOK_PIPE
237	else
238	tok%ttype = TOK_LITERAL
239	tok%char_val = '\|'
240	end if
241	pos = pos + 1
242
243	! Backreferences \1-\9
244	case ('1', '2', '3', '4', '5', '6', '7', '8', '9')
245	tok%ttype = TOK_BACKREF
246	read(c, '(I1)') ref_num
247	tok%int_val = ref_num
248	pos = pos + 1
249
250	! Word boundaries
251	case ('<')
252	tok%ttype = TOK_WORD_BOUNDARY
253	tok%int_val = 1 ! word start
254	pos = pos + 1
255
256	case ('>')
257	tok%ttype = TOK_WORD_BOUNDARY
258	tok%int_val = 2 ! word end
259	pos = pos + 1
260
261	case ('b')
262	tok%ttype = TOK_WORD_BOUNDARY
263	tok%int_val = 3 ! word boundary (either)
264	pos = pos + 1
265
266	case ('B')
267	tok%ttype = TOK_WORD_BOUNDARY
268	tok%int_val = 4 ! not word boundary
269	pos = pos + 1
270
271	! Character escapes
272	case ('n')
273	tok%ttype = TOK_LITERAL
274	tok%char_val = char(10) ! newline
275	pos = pos + 1
276
277	case ('t')
278	tok%ttype = TOK_LITERAL
279	tok%char_val = char(9) ! tab
280	pos = pos + 1
281
282	case ('r')
283	tok%ttype = TOK_LITERAL
284	tok%char_val = char(13) ! carriage return
285	pos = pos + 1
286
287	! Escape metacharacters to make them literal
288	case ('.', '*', '^', '$', '[', ']', '\')
289	tok%ttype = TOK_LITERAL
290	tok%char_val = c
291	pos = pos + 1
292
293	case default
294	! Unknown escape - treat as literal
295	tok%ttype = TOK_LITERAL
296	tok%char_val = c
297	pos = pos + 1
298	end select
299
300	end subroutine parse_escape
301
302	subroutine parse_char_class(pattern, pos, n, tok, ierr)
303	!> Parse a character class [...] starting at pos (which points to [)
304	character(len=*), intent(in) :: pattern
305	integer, intent(inout) :: pos
306	integer, intent(in) :: n
307	type(token_t), intent(out) :: tok
308	integer, intent(out) :: ierr
309
310	integer :: j, start_char, end_char
311	character(len=1) :: c, prev_c
312	logical :: negated, first
313
314	ierr = 0
315	tok%ttype = TOK_LITERAL ! Will be set properly at end
316	tok%char_class = .false.
317	tok%negated = .false.
318
319	pos = pos + 1 ! Skip [
320
321	if (pos > n) then
322	ierr = 1
323	return
324	end if
325
326	! Check for negation
327	negated = .false.
328	if (pattern(pos:pos) == '^') then
329	negated = .true.
330	pos = pos + 1
331	end if
332
333	! Handle ] at start (it's literal)
334	first = .true.
335	if (pos <= n .and. pattern(pos:pos) == ']') then
336	tok%char_class(ichar(']')) = .true.
337	pos = pos + 1
338	first = .false.
339	end if
340
341	! Handle - at start (it's literal)
342	if (pos <= n .and. pattern(pos:pos) == '-') then
343	tok%char_class(ichar('-')) = .true.
344	pos = pos + 1
345	end if
346
347	prev_c = char(0)
348	do while (pos <= n)
349	c = pattern(pos:pos)
350
351	if (c == ']') then
352	! End of character class
353	pos = pos + 1
354	tok%negated = negated
355	tok%ttype = TOK_LBRACKET ! Indicate this is a char class token
356	return
357	end if
358
359	if (c == '-' .and. pos + 1 <= n .and. pattern(pos+1:pos+1) /= ']') then
360	! Range: prev_c - next_c
361	if (prev_c /= char(0)) then
362	pos = pos + 1
363	if (pos > n) then
364	ierr = 1
365	return
366	end if
367	c = pattern(pos:pos)
368
369	! Handle escape in range end
370	if (c == '\' .and. pos + 1 <= n) then
371	pos = pos + 1
372	c = pattern(pos:pos)
373	end if
374
375	start_char = ichar(prev_c)
376	end_char = ichar(c)
377	if (start_char > end_char) then
378	! Invalid range, but we'll be lenient
379	tok%char_class(start_char) = .true.
380	tok%char_class(ichar('-')) = .true.
381	tok%char_class(end_char) = .true.
382	else
383	do j = start_char, end_char
384	tok%char_class(j) = .true.
385	end do
386	end if
387	prev_c = char(0) ! Reset after range
388	pos = pos + 1
389	cycle
390	else
391	! - at start after ] or as first char
392	tok%char_class(ichar('-')) = .true.
393	pos = pos + 1
394	cycle
395	end if
396	end if
397
398	if (c == '\' .and. pos + 1 <= n) then
399	! Escape sequence in character class
400	pos = pos + 1
401	c = pattern(pos:pos)
402	select case (c)
403	case ('n')
404	c = char(10)
405	case ('t')
406	c = char(9)
407	case ('r')
408	c = char(13)
409	! Otherwise take the character literally
410	end select
411	end if
412
413	if (c == '[' .and. pos + 1 <= n .and. pattern(pos+1:pos+1) == ':') then
414	! POSIX character class [:alpha:] etc
415	call parse_posix_class(pattern, pos, n, tok%char_class, ierr)
416	if (ierr /= 0) return
417	prev_c = char(0)
418	cycle
419	end if
420
421	! Regular character
422	tok%char_class(ichar(c)) = .true.
423	prev_c = c
424	pos = pos + 1
425	end do
426
427	! Unterminated character class
428	ierr = 1
429
430	end subroutine parse_char_class
431
432	subroutine parse_posix_class(pattern, pos, n, char_class, ierr)
433	!> Parse POSIX character class [:name:]
434	character(len=*), intent(in) :: pattern
435	integer, intent(inout) :: pos
436	integer, intent(in) :: n
437	logical, intent(inout) :: char_class(0:255)
438	integer, intent(out) :: ierr
439
440	integer :: end_pos, j
441	character(len=16) :: class_name
442
443	ierr = 0
444
445	! Find closing :]
446	end_pos = index(pattern(pos:n), ':]')
447	if (end_pos == 0) then
448	ierr = 1
449	return
450	end if
451	end_pos = pos + end_pos - 1 ! Adjust to absolute position (index is 1-based)
452
453	! Extract class name (skip [: at start, stop before :]
454	class_name = pattern(pos+2:end_pos-1)
455
456	select case (trim(class_name))
457	case ('alnum')
458	do j = ichar('a'), ichar('z')
459	char_class(j) = .true.
460	end do
461	do j = ichar('A'), ichar('Z')
462	char_class(j) = .true.
463	end do
464	do j = ichar('0'), ichar('9')
465	char_class(j) = .true.
466	end do
467
468	case ('alpha')
469	do j = ichar('a'), ichar('z')
470	char_class(j) = .true.
471	end do
472	do j = ichar('A'), ichar('Z')
473	char_class(j) = .true.
474	end do
475
476	case ('digit')
477	do j = ichar('0'), ichar('9')
478	char_class(j) = .true.
479	end do
480
481	case ('lower')
482	do j = ichar('a'), ichar('z')
483	char_class(j) = .true.
484	end do
485
486	case ('upper')
487	do j = ichar('A'), ichar('Z')
488	char_class(j) = .true.
489	end do
490
491	case ('space')
492	char_class(ichar(' ')) = .true.
493	char_class(9) = .true. ! tab
494	char_class(10) = .true. ! newline
495	char_class(11) = .true. ! vertical tab
496	char_class(12) = .true. ! form feed
497	char_class(13) = .true. ! carriage return
498
499	case ('blank')
500	char_class(ichar(' ')) = .true.
501	char_class(9) = .true. ! tab
502
503	case ('punct')
504	! Punctuation characters
505	do j = 33, 47
506	char_class(j) = .true.
507	end do
508	do j = 58, 64
509	char_class(j) = .true.
510	end do
511	do j = 91, 96
512	char_class(j) = .true.
513	end do
514	do j = 123, 126
515	char_class(j) = .true.
516	end do
517
518	case ('xdigit')
519	do j = ichar('0'), ichar('9')
520	char_class(j) = .true.
521	end do
522	do j = ichar('a'), ichar('f')
523	char_class(j) = .true.
524	end do
525	do j = ichar('A'), ichar('F')
526	char_class(j) = .true.
527	end do
528
529	case ('word')
530	! GNU extension: word characters
531	do j = ichar('a'), ichar('z')
532	char_class(j) = .true.
533	end do
534	do j = ichar('A'), ichar('Z')
535	char_class(j) = .true.
536	end do
537	do j = ichar('0'), ichar('9')
538	char_class(j) = .true.
539	end do
540	char_class(ichar('_')) = .true.
541
542	case default
543	! Unknown class - ignore silently
544	end select
545
546	pos = end_pos + 2 ! Skip past :] (end_pos points to ':', so +2 to skip both)
547
548	end subroutine parse_posix_class
549
550	end module regex_lexer
551