| 1 | module regex_lexer |
| 2 | !> Regex pattern tokenizer for FERP |
| 3 | !> Handles both BRE (Basic) and ERE (Extended) regex dialects |
| 4 | use regex_types |
| 5 | use ferp_kinds, only: pattern_len |
| 6 | implicit none |
| 7 | private |
| 8 | |
| 9 | public :: tokenize |
| 10 | |
| 11 | contains |
| 12 | |
| 13 | subroutine tokenize(pattern, tokens, is_ere, ierr) |
| 14 | !> Tokenize a regex pattern |
| 15 | character(len=*), intent(in) :: pattern |
| 16 | type(token_list_t), intent(out) :: tokens |
| 17 | logical, intent(in) :: is_ere ! True for ERE, false for BRE |
| 18 | integer, intent(out) :: ierr |
| 19 | |
| 20 | integer :: i, n |
| 21 | character(len=1) :: c, next_c |
| 22 | type(token_t) :: tok |
| 23 | logical :: in_bracket |
| 24 | |
| 25 | ierr = 0 |
| 26 | call tokens%init() |
| 27 | n = pattern_len(pattern) ! Use pattern_len to preserve whitespace patterns |
| 28 | i = 1 |
| 29 | in_bracket = .false. |
| 30 | |
| 31 | do while (i <= n) |
| 32 | c = pattern(i:i) |
| 33 | next_c = ' ' |
| 34 | if (i < n) next_c = pattern(i+1:i+1) |
| 35 | |
| 36 | tok = token_t() |
| 37 | tok%pos = i |
| 38 | |
| 39 | ! Handle character class separately |
| 40 | if (c == '[') then |
| 41 | call parse_char_class(pattern, i, n, tok, ierr) |
| 42 | if (ierr /= 0) return |
| 43 | call tokens%append(tok) |
| 44 | cycle |
| 45 | end if |
| 46 | |
| 47 | ! Handle escape sequences |
| 48 | if (c == '\') then |
| 49 | if (i >= n) then |
| 50 | ! Trailing backslash - treat as literal |
| 51 | tok%ttype = TOK_LITERAL |
| 52 | tok%char_val = '\' |
| 53 | i = i + 1 |
| 54 | else |
| 55 | call parse_escape(pattern, i, n, tok, is_ere, ierr) |
| 56 | if (ierr /= 0) return |
| 57 | end if |
| 58 | call tokens%append(tok) |
| 59 | cycle |
| 60 | end if |
| 61 | |
| 62 | ! Handle metacharacters based on BRE vs ERE |
| 63 | select case (c) |
| 64 | case ('.') |
| 65 | tok%ttype = TOK_DOT |
| 66 | i = i + 1 |
| 67 | |
| 68 | case ('*') |
| 69 | tok%ttype = TOK_STAR |
| 70 | i = i + 1 |
| 71 | |
| 72 | case ('^') |
| 73 | tok%ttype = TOK_CARET |
| 74 | i = i + 1 |
| 75 | |
| 76 | case ('$') |
| 77 | tok%ttype = TOK_DOLLAR |
| 78 | i = i + 1 |
| 79 | |
| 80 | case ('+') |
| 81 | if (is_ere) then |
| 82 | tok%ttype = TOK_PLUS |
| 83 | else |
| 84 | tok%ttype = TOK_LITERAL |
| 85 | tok%char_val = '+' |
| 86 | end if |
| 87 | i = i + 1 |
| 88 | |
| 89 | case ('?') |
| 90 | if (is_ere) then |
| 91 | tok%ttype = TOK_QUESTION |
| 92 | else |
| 93 | tok%ttype = TOK_LITERAL |
| 94 | tok%char_val = '?' |
| 95 | end if |
| 96 | i = i + 1 |
| 97 | |
| 98 | case ('|') |
| 99 | if (is_ere) then |
| 100 | tok%ttype = TOK_PIPE |
| 101 | else |
| 102 | tok%ttype = TOK_LITERAL |
| 103 | tok%char_val = '|' |
| 104 | end if |
| 105 | i = i + 1 |
| 106 | |
| 107 | case ('(') |
| 108 | if (is_ere) then |
| 109 | tok%ttype = TOK_LPAREN |
| 110 | else |
| 111 | tok%ttype = TOK_LITERAL |
| 112 | tok%char_val = '(' |
| 113 | end if |
| 114 | i = i + 1 |
| 115 | |
| 116 | case (')') |
| 117 | if (is_ere) then |
| 118 | tok%ttype = TOK_RPAREN |
| 119 | else |
| 120 | tok%ttype = TOK_LITERAL |
| 121 | tok%char_val = ')' |
| 122 | end if |
| 123 | i = i + 1 |
| 124 | |
| 125 | case ('{') |
| 126 | if (is_ere) then |
| 127 | tok%ttype = TOK_LBRACE |
| 128 | else |
| 129 | tok%ttype = TOK_LITERAL |
| 130 | tok%char_val = '{' |
| 131 | end if |
| 132 | i = i + 1 |
| 133 | |
| 134 | case ('}') |
| 135 | if (is_ere) then |
| 136 | tok%ttype = TOK_RBRACE |
| 137 | else |
| 138 | tok%ttype = TOK_LITERAL |
| 139 | tok%char_val = '}' |
| 140 | end if |
| 141 | i = i + 1 |
| 142 | |
| 143 | case default |
| 144 | ! Literal character |
| 145 | tok%ttype = TOK_LITERAL |
| 146 | tok%char_val = c |
| 147 | i = i + 1 |
| 148 | end select |
| 149 | |
| 150 | call tokens%append(tok) |
| 151 | end do |
| 152 | |
| 153 | ! Add end token |
| 154 | tok = token_t() |
| 155 | tok%ttype = TOK_END |
| 156 | tok%pos = n + 1 |
| 157 | call tokens%append(tok) |
| 158 | |
| 159 | end subroutine tokenize |
| 160 | |
| 161 | subroutine parse_escape(pattern, pos, n, tok, is_ere, ierr) |
| 162 | !> Parse an escape sequence starting at pos (which points to \) |
| 163 | character(len=*), intent(in) :: pattern |
| 164 | integer, intent(inout) :: pos |
| 165 | integer, intent(in) :: n |
| 166 | type(token_t), intent(out) :: tok |
| 167 | logical, intent(in) :: is_ere |
| 168 | integer, intent(out) :: ierr |
| 169 | |
| 170 | character(len=1) :: c |
| 171 | integer :: ref_num |
| 172 | |
| 173 | ierr = 0 |
| 174 | pos = pos + 1 ! Skip the backslash |
| 175 | c = pattern(pos:pos) |
| 176 | |
| 177 | select case (c) |
| 178 | ! BRE special escapes (become metacharacters) |
| 179 | case ('(') |
| 180 | if (is_ere) then |
| 181 | tok%ttype = TOK_LITERAL |
| 182 | tok%char_val = '(' |
| 183 | else |
| 184 | tok%ttype = TOK_LPAREN |
| 185 | end if |
| 186 | pos = pos + 1 |
| 187 | |
| 188 | case (')') |
| 189 | if (is_ere) then |
| 190 | tok%ttype = TOK_LITERAL |
| 191 | tok%char_val = ')' |
| 192 | else |
| 193 | tok%ttype = TOK_RPAREN |
| 194 | end if |
| 195 | pos = pos + 1 |
| 196 | |
| 197 | case ('{') |
| 198 | if (is_ere) then |
| 199 | tok%ttype = TOK_LITERAL |
| 200 | tok%char_val = '{' |
| 201 | else |
| 202 | tok%ttype = TOK_LBRACE |
| 203 | end if |
| 204 | pos = pos + 1 |
| 205 | |
| 206 | case ('}') |
| 207 | if (is_ere) then |
| 208 | tok%ttype = TOK_LITERAL |
| 209 | tok%char_val = '}' |
| 210 | else |
| 211 | tok%ttype = TOK_RBRACE |
| 212 | end if |
| 213 | pos = pos + 1 |
| 214 | |
| 215 | ! GNU extensions for BRE (also work in ERE) |
| 216 | case ('+') |
| 217 | if (.not. is_ere) then |
| 218 | tok%ttype = TOK_PLUS |
| 219 | else |
| 220 | tok%ttype = TOK_LITERAL |
| 221 | tok%char_val = '+' |
| 222 | end if |
| 223 | pos = pos + 1 |
| 224 | |
| 225 | case ('?') |
| 226 | if (.not. is_ere) then |
| 227 | tok%ttype = TOK_QUESTION |
| 228 | else |
| 229 | tok%ttype = TOK_LITERAL |
| 230 | tok%char_val = '?' |
| 231 | end if |
| 232 | pos = pos + 1 |
| 233 | |
| 234 | case ('|') |
| 235 | if (.not. is_ere) then |
| 236 | tok%ttype = TOK_PIPE |
| 237 | else |
| 238 | tok%ttype = TOK_LITERAL |
| 239 | tok%char_val = '|' |
| 240 | end if |
| 241 | pos = pos + 1 |
| 242 | |
| 243 | ! Backreferences \1-\9 |
| 244 | case ('1', '2', '3', '4', '5', '6', '7', '8', '9') |
| 245 | tok%ttype = TOK_BACKREF |
| 246 | read(c, '(I1)') ref_num |
| 247 | tok%int_val = ref_num |
| 248 | pos = pos + 1 |
| 249 | |
| 250 | ! Word boundaries |
| 251 | case ('<') |
| 252 | tok%ttype = TOK_WORD_BOUNDARY |
| 253 | tok%int_val = 1 ! word start |
| 254 | pos = pos + 1 |
| 255 | |
| 256 | case ('>') |
| 257 | tok%ttype = TOK_WORD_BOUNDARY |
| 258 | tok%int_val = 2 ! word end |
| 259 | pos = pos + 1 |
| 260 | |
| 261 | case ('b') |
| 262 | tok%ttype = TOK_WORD_BOUNDARY |
| 263 | tok%int_val = 3 ! word boundary (either) |
| 264 | pos = pos + 1 |
| 265 | |
| 266 | case ('B') |
| 267 | tok%ttype = TOK_WORD_BOUNDARY |
| 268 | tok%int_val = 4 ! not word boundary |
| 269 | pos = pos + 1 |
| 270 | |
| 271 | ! Character escapes |
| 272 | case ('n') |
| 273 | tok%ttype = TOK_LITERAL |
| 274 | tok%char_val = char(10) ! newline |
| 275 | pos = pos + 1 |
| 276 | |
| 277 | case ('t') |
| 278 | tok%ttype = TOK_LITERAL |
| 279 | tok%char_val = char(9) ! tab |
| 280 | pos = pos + 1 |
| 281 | |
| 282 | case ('r') |
| 283 | tok%ttype = TOK_LITERAL |
| 284 | tok%char_val = char(13) ! carriage return |
| 285 | pos = pos + 1 |
| 286 | |
| 287 | ! Escape metacharacters to make them literal |
| 288 | case ('.', '*', '^', '$', '[', ']', '\') |
| 289 | tok%ttype = TOK_LITERAL |
| 290 | tok%char_val = c |
| 291 | pos = pos + 1 |
| 292 | |
| 293 | case default |
| 294 | ! Unknown escape - treat as literal |
| 295 | tok%ttype = TOK_LITERAL |
| 296 | tok%char_val = c |
| 297 | pos = pos + 1 |
| 298 | end select |
| 299 | |
| 300 | end subroutine parse_escape |
| 301 | |
| 302 | subroutine parse_char_class(pattern, pos, n, tok, ierr) |
| 303 | !> Parse a character class [...] starting at pos (which points to [) |
| 304 | character(len=*), intent(in) :: pattern |
| 305 | integer, intent(inout) :: pos |
| 306 | integer, intent(in) :: n |
| 307 | type(token_t), intent(out) :: tok |
| 308 | integer, intent(out) :: ierr |
| 309 | |
| 310 | integer :: j, start_char, end_char |
| 311 | character(len=1) :: c, prev_c |
| 312 | logical :: negated, first |
| 313 | |
| 314 | ierr = 0 |
| 315 | tok%ttype = TOK_LITERAL ! Will be set properly at end |
| 316 | tok%char_class = .false. |
| 317 | tok%negated = .false. |
| 318 | |
| 319 | pos = pos + 1 ! Skip [ |
| 320 | |
| 321 | if (pos > n) then |
| 322 | ierr = 1 |
| 323 | return |
| 324 | end if |
| 325 | |
| 326 | ! Check for negation |
| 327 | negated = .false. |
| 328 | if (pattern(pos:pos) == '^') then |
| 329 | negated = .true. |
| 330 | pos = pos + 1 |
| 331 | end if |
| 332 | |
| 333 | ! Handle ] at start (it's literal) |
| 334 | first = .true. |
| 335 | if (pos <= n .and. pattern(pos:pos) == ']') then |
| 336 | tok%char_class(ichar(']')) = .true. |
| 337 | pos = pos + 1 |
| 338 | first = .false. |
| 339 | end if |
| 340 | |
| 341 | ! Handle - at start (it's literal) |
| 342 | if (pos <= n .and. pattern(pos:pos) == '-') then |
| 343 | tok%char_class(ichar('-')) = .true. |
| 344 | pos = pos + 1 |
| 345 | end if |
| 346 | |
| 347 | prev_c = char(0) |
| 348 | do while (pos <= n) |
| 349 | c = pattern(pos:pos) |
| 350 | |
| 351 | if (c == ']') then |
| 352 | ! End of character class |
| 353 | pos = pos + 1 |
| 354 | tok%negated = negated |
| 355 | tok%ttype = TOK_LBRACKET ! Indicate this is a char class token |
| 356 | return |
| 357 | end if |
| 358 | |
| 359 | if (c == '-' .and. pos + 1 <= n .and. pattern(pos+1:pos+1) /= ']') then |
| 360 | ! Range: prev_c - next_c |
| 361 | if (prev_c /= char(0)) then |
| 362 | pos = pos + 1 |
| 363 | if (pos > n) then |
| 364 | ierr = 1 |
| 365 | return |
| 366 | end if |
| 367 | c = pattern(pos:pos) |
| 368 | |
| 369 | ! Handle escape in range end |
| 370 | if (c == '\' .and. pos + 1 <= n) then |
| 371 | pos = pos + 1 |
| 372 | c = pattern(pos:pos) |
| 373 | end if |
| 374 | |
| 375 | start_char = ichar(prev_c) |
| 376 | end_char = ichar(c) |
| 377 | if (start_char > end_char) then |
| 378 | ! Invalid range, but we'll be lenient |
| 379 | tok%char_class(start_char) = .true. |
| 380 | tok%char_class(ichar('-')) = .true. |
| 381 | tok%char_class(end_char) = .true. |
| 382 | else |
| 383 | do j = start_char, end_char |
| 384 | tok%char_class(j) = .true. |
| 385 | end do |
| 386 | end if |
| 387 | prev_c = char(0) ! Reset after range |
| 388 | pos = pos + 1 |
| 389 | cycle |
| 390 | else |
| 391 | ! - at start after ] or as first char |
| 392 | tok%char_class(ichar('-')) = .true. |
| 393 | pos = pos + 1 |
| 394 | cycle |
| 395 | end if |
| 396 | end if |
| 397 | |
| 398 | if (c == '\' .and. pos + 1 <= n) then |
| 399 | ! Escape sequence in character class |
| 400 | pos = pos + 1 |
| 401 | c = pattern(pos:pos) |
| 402 | select case (c) |
| 403 | case ('n') |
| 404 | c = char(10) |
| 405 | case ('t') |
| 406 | c = char(9) |
| 407 | case ('r') |
| 408 | c = char(13) |
| 409 | ! Otherwise take the character literally |
| 410 | end select |
| 411 | end if |
| 412 | |
| 413 | if (c == '[' .and. pos + 1 <= n .and. pattern(pos+1:pos+1) == ':') then |
| 414 | ! POSIX character class [:alpha:] etc |
| 415 | call parse_posix_class(pattern, pos, n, tok%char_class, ierr) |
| 416 | if (ierr /= 0) return |
| 417 | prev_c = char(0) |
| 418 | cycle |
| 419 | end if |
| 420 | |
| 421 | ! Regular character |
| 422 | tok%char_class(ichar(c)) = .true. |
| 423 | prev_c = c |
| 424 | pos = pos + 1 |
| 425 | end do |
| 426 | |
| 427 | ! Unterminated character class |
| 428 | ierr = 1 |
| 429 | |
| 430 | end subroutine parse_char_class |
| 431 | |
| 432 | subroutine parse_posix_class(pattern, pos, n, char_class, ierr) |
| 433 | !> Parse POSIX character class [:name:] |
| 434 | character(len=*), intent(in) :: pattern |
| 435 | integer, intent(inout) :: pos |
| 436 | integer, intent(in) :: n |
| 437 | logical, intent(inout) :: char_class(0:255) |
| 438 | integer, intent(out) :: ierr |
| 439 | |
| 440 | integer :: end_pos, j |
| 441 | character(len=16) :: class_name |
| 442 | |
| 443 | ierr = 0 |
| 444 | |
| 445 | ! Find closing :] |
| 446 | end_pos = index(pattern(pos:n), ':]') |
| 447 | if (end_pos == 0) then |
| 448 | ierr = 1 |
| 449 | return |
| 450 | end if |
| 451 | end_pos = pos + end_pos - 1 ! Adjust to absolute position (index is 1-based) |
| 452 | |
| 453 | ! Extract class name (skip [: at start, stop before :] |
| 454 | class_name = pattern(pos+2:end_pos-1) |
| 455 | |
| 456 | select case (trim(class_name)) |
| 457 | case ('alnum') |
| 458 | do j = ichar('a'), ichar('z') |
| 459 | char_class(j) = .true. |
| 460 | end do |
| 461 | do j = ichar('A'), ichar('Z') |
| 462 | char_class(j) = .true. |
| 463 | end do |
| 464 | do j = ichar('0'), ichar('9') |
| 465 | char_class(j) = .true. |
| 466 | end do |
| 467 | |
| 468 | case ('alpha') |
| 469 | do j = ichar('a'), ichar('z') |
| 470 | char_class(j) = .true. |
| 471 | end do |
| 472 | do j = ichar('A'), ichar('Z') |
| 473 | char_class(j) = .true. |
| 474 | end do |
| 475 | |
| 476 | case ('digit') |
| 477 | do j = ichar('0'), ichar('9') |
| 478 | char_class(j) = .true. |
| 479 | end do |
| 480 | |
| 481 | case ('lower') |
| 482 | do j = ichar('a'), ichar('z') |
| 483 | char_class(j) = .true. |
| 484 | end do |
| 485 | |
| 486 | case ('upper') |
| 487 | do j = ichar('A'), ichar('Z') |
| 488 | char_class(j) = .true. |
| 489 | end do |
| 490 | |
| 491 | case ('space') |
| 492 | char_class(ichar(' ')) = .true. |
| 493 | char_class(9) = .true. ! tab |
| 494 | char_class(10) = .true. ! newline |
| 495 | char_class(11) = .true. ! vertical tab |
| 496 | char_class(12) = .true. ! form feed |
| 497 | char_class(13) = .true. ! carriage return |
| 498 | |
| 499 | case ('blank') |
| 500 | char_class(ichar(' ')) = .true. |
| 501 | char_class(9) = .true. ! tab |
| 502 | |
| 503 | case ('punct') |
| 504 | ! Punctuation characters |
| 505 | do j = 33, 47 |
| 506 | char_class(j) = .true. |
| 507 | end do |
| 508 | do j = 58, 64 |
| 509 | char_class(j) = .true. |
| 510 | end do |
| 511 | do j = 91, 96 |
| 512 | char_class(j) = .true. |
| 513 | end do |
| 514 | do j = 123, 126 |
| 515 | char_class(j) = .true. |
| 516 | end do |
| 517 | |
| 518 | case ('xdigit') |
| 519 | do j = ichar('0'), ichar('9') |
| 520 | char_class(j) = .true. |
| 521 | end do |
| 522 | do j = ichar('a'), ichar('f') |
| 523 | char_class(j) = .true. |
| 524 | end do |
| 525 | do j = ichar('A'), ichar('F') |
| 526 | char_class(j) = .true. |
| 527 | end do |
| 528 | |
| 529 | case ('word') |
| 530 | ! GNU extension: word characters |
| 531 | do j = ichar('a'), ichar('z') |
| 532 | char_class(j) = .true. |
| 533 | end do |
| 534 | do j = ichar('A'), ichar('Z') |
| 535 | char_class(j) = .true. |
| 536 | end do |
| 537 | do j = ichar('0'), ichar('9') |
| 538 | char_class(j) = .true. |
| 539 | end do |
| 540 | char_class(ichar('_')) = .true. |
| 541 | |
| 542 | case default |
| 543 | ! Unknown class - ignore silently |
| 544 | end select |
| 545 | |
| 546 | pos = end_pos + 2 ! Skip past :] (end_pos points to ':', so +2 to skip both) |
| 547 | |
| 548 | end subroutine parse_posix_class |
| 549 | |
| 550 | end module regex_lexer |
| 551 |