| 1 | module regex_parser |
| 2 | !> Regex parser - converts token stream to AST |
| 3 | !> Uses recursive descent parsing with proper precedence: |
| 4 | !> alternation < concatenation < quantifier < atom |
| 5 | use regex_types |
| 6 | implicit none |
| 7 | private |
| 8 | |
| 9 | public :: parse, ast_pool_t |
| 10 | |
| 11 | !> AST node pool - stores all nodes with indices |
| 12 | type :: ast_pool_t |
| 13 | type(ast_node_t), allocatable :: nodes(:) |
| 14 | integer :: count = 0 |
| 15 | integer :: capacity = 0 |
| 16 | contains |
| 17 | procedure :: init => pool_init |
| 18 | procedure :: alloc => pool_alloc |
| 19 | procedure :: get => pool_get |
| 20 | procedure :: cleanup => pool_cleanup |
| 21 | end type ast_pool_t |
| 22 | |
| 23 | ! Module-level parsing state |
| 24 | type(token_list_t), pointer :: g_tokens => null() |
| 25 | integer :: g_pos = 0 |
| 26 | integer :: g_group_num = 0 |
| 27 | type(ast_pool_t), pointer :: g_pool => null() |
| 28 | |
| 29 | contains |
| 30 | |
| 31 | !--------------------------------------------------------------------------- |
| 32 | ! AST Pool Methods |
| 33 | !--------------------------------------------------------------------------- |
| 34 | subroutine pool_init(this, initial_capacity) |
| 35 | class(ast_pool_t), intent(inout) :: this |
| 36 | integer, intent(in), optional :: initial_capacity |
| 37 | integer :: cap |
| 38 | |
| 39 | cap = 64 |
| 40 | if (present(initial_capacity)) cap = initial_capacity |
| 41 | |
| 42 | if (allocated(this%nodes)) deallocate(this%nodes) |
| 43 | allocate(this%nodes(cap)) |
| 44 | this%count = 0 |
| 45 | this%capacity = cap |
| 46 | end subroutine pool_init |
| 47 | |
| 48 | function pool_alloc(this) result(idx) |
| 49 | class(ast_pool_t), intent(inout) :: this |
| 50 | integer :: idx |
| 51 | type(ast_node_t), allocatable :: temp(:) |
| 52 | |
| 53 | if (.not. allocated(this%nodes)) call this%init() |
| 54 | |
| 55 | if (this%count >= this%capacity) then |
| 56 | allocate(temp(this%capacity * 2)) |
| 57 | temp(1:this%count) = this%nodes(1:this%count) |
| 58 | call move_alloc(temp, this%nodes) |
| 59 | this%capacity = this%capacity * 2 |
| 60 | end if |
| 61 | |
| 62 | this%count = this%count + 1 |
| 63 | idx = this%count |
| 64 | this%nodes(idx) = ast_node_t() ! Initialize to default |
| 65 | end function pool_alloc |
| 66 | |
| 67 | function pool_get(this, idx) result(node) |
| 68 | class(ast_pool_t), intent(in) :: this |
| 69 | integer, intent(in) :: idx |
| 70 | type(ast_node_t) :: node |
| 71 | |
| 72 | if (idx >= 1 .and. idx <= this%count) then |
| 73 | node = this%nodes(idx) |
| 74 | else |
| 75 | node = ast_node_t() |
| 76 | end if |
| 77 | end function pool_get |
| 78 | |
| 79 | subroutine pool_cleanup(this) |
| 80 | class(ast_pool_t), intent(inout) :: this |
| 81 | if (allocated(this%nodes)) deallocate(this%nodes) |
| 82 | this%count = 0 |
| 83 | this%capacity = 0 |
| 84 | end subroutine pool_cleanup |
| 85 | |
| 86 | !--------------------------------------------------------------------------- |
| 87 | ! Main Parse Entry Point |
| 88 | !--------------------------------------------------------------------------- |
| 89 | subroutine parse(tokens, pool, root_idx, num_groups, ierr) |
| 90 | !> Parse tokens into an AST stored in pool |
| 91 | type(token_list_t), target, intent(in) :: tokens |
| 92 | type(ast_pool_t), target, intent(inout) :: pool |
| 93 | integer, intent(out) :: root_idx |
| 94 | integer, intent(out) :: num_groups |
| 95 | integer, intent(out) :: ierr |
| 96 | |
| 97 | ierr = 0 |
| 98 | root_idx = 0 |
| 99 | |
| 100 | ! Set up global state |
| 101 | g_tokens => tokens |
| 102 | g_pos = 1 |
| 103 | g_group_num = 0 |
| 104 | g_pool => pool |
| 105 | |
| 106 | call pool%init() |
| 107 | |
| 108 | ! Parse alternation (lowest precedence) |
| 109 | root_idx = parse_alternation(ierr) |
| 110 | if (ierr /= 0) return |
| 111 | |
| 112 | num_groups = g_group_num |
| 113 | |
| 114 | ! Verify we consumed all tokens |
| 115 | if (current_type() /= TOK_END) then |
| 116 | ierr = 1 |
| 117 | end if |
| 118 | |
| 119 | end subroutine parse |
| 120 | |
| 121 | !--------------------------------------------------------------------------- |
| 122 | ! Token Access Helpers |
| 123 | !--------------------------------------------------------------------------- |
| 124 | function current_token() result(tok) |
| 125 | type(token_t) :: tok |
| 126 | tok = g_tokens%get(g_pos) |
| 127 | end function current_token |
| 128 | |
| 129 | function current_type() result(ttype) |
| 130 | integer :: ttype |
| 131 | type(token_t) :: tok |
| 132 | tok = g_tokens%get(g_pos) |
| 133 | ttype = tok%ttype |
| 134 | end function current_type |
| 135 | |
| 136 | subroutine advance() |
| 137 | g_pos = g_pos + 1 |
| 138 | end subroutine advance |
| 139 | |
| 140 | !--------------------------------------------------------------------------- |
| 141 | ! Recursive Descent Parser |
| 142 | !--------------------------------------------------------------------------- |
| 143 | |
| 144 | recursive function parse_alternation(ierr) result(idx) |
| 145 | !> Parse: alternation = concat ('|' concat)* |
| 146 | integer, intent(out) :: ierr |
| 147 | integer :: idx |
| 148 | |
| 149 | integer :: left, right, alt_idx |
| 150 | |
| 151 | ierr = 0 |
| 152 | idx = 0 |
| 153 | left = parse_concat(ierr) |
| 154 | if (ierr /= 0) return |
| 155 | |
| 156 | idx = left |
| 157 | |
| 158 | do while (current_type() == TOK_PIPE) |
| 159 | call advance() ! consume | |
| 160 | |
| 161 | right = parse_concat(ierr) |
| 162 | if (ierr /= 0) return |
| 163 | |
| 164 | ! Create alternation node |
| 165 | alt_idx = g_pool%alloc() |
| 166 | g_pool%nodes(alt_idx)%ntype = AST_ALTERNATE |
| 167 | g_pool%nodes(alt_idx)%left = idx |
| 168 | g_pool%nodes(alt_idx)%right = right |
| 169 | |
| 170 | idx = alt_idx |
| 171 | end do |
| 172 | |
| 173 | end function parse_alternation |
| 174 | |
| 175 | recursive function parse_concat(ierr) result(idx) |
| 176 | !> Parse: concat = quantified+ |
| 177 | integer, intent(out) :: ierr |
| 178 | integer :: idx |
| 179 | |
| 180 | integer :: left, right, concat_idx, ttype |
| 181 | |
| 182 | ierr = 0 |
| 183 | idx = 0 |
| 184 | |
| 185 | ! Check if we have anything to parse |
| 186 | ttype = current_type() |
| 187 | if (ttype == TOK_END .or. ttype == TOK_PIPE .or. ttype == TOK_RPAREN) then |
| 188 | ! Empty - create empty literal (matches empty string) |
| 189 | idx = g_pool%alloc() |
| 190 | g_pool%nodes(idx)%ntype = AST_LITERAL |
| 191 | g_pool%nodes(idx)%char_val = char(0) ! Special empty marker |
| 192 | return |
| 193 | end if |
| 194 | |
| 195 | ! Parse first quantified expression |
| 196 | left = parse_quantified(ierr) |
| 197 | if (ierr /= 0) return |
| 198 | |
| 199 | idx = left |
| 200 | |
| 201 | ! Parse remaining quantified expressions |
| 202 | do |
| 203 | ttype = current_type() |
| 204 | if (ttype == TOK_END .or. ttype == TOK_PIPE .or. ttype == TOK_RPAREN) exit |
| 205 | |
| 206 | right = parse_quantified(ierr) |
| 207 | if (ierr /= 0) return |
| 208 | |
| 209 | ! Create concatenation node |
| 210 | concat_idx = g_pool%alloc() |
| 211 | g_pool%nodes(concat_idx)%ntype = AST_CONCAT |
| 212 | g_pool%nodes(concat_idx)%left = idx |
| 213 | g_pool%nodes(concat_idx)%right = right |
| 214 | |
| 215 | idx = concat_idx |
| 216 | end do |
| 217 | |
| 218 | end function parse_concat |
| 219 | |
| 220 | recursive function parse_quantified(ierr) result(idx) |
| 221 | !> Parse: quantified = atom quantifier? |
| 222 | integer, intent(out) :: ierr |
| 223 | integer :: idx |
| 224 | |
| 225 | integer :: atom_idx, quant_idx, ttype |
| 226 | integer :: min_rep, max_rep |
| 227 | |
| 228 | ierr = 0 |
| 229 | idx = 0 |
| 230 | |
| 231 | ! Parse the atom |
| 232 | atom_idx = parse_atom(ierr) |
| 233 | if (ierr /= 0) return |
| 234 | |
| 235 | idx = atom_idx |
| 236 | |
| 237 | ! Check for quantifier |
| 238 | ttype = current_type() |
| 239 | select case (ttype) |
| 240 | case (TOK_STAR) |
| 241 | min_rep = 0 |
| 242 | max_rep = -1 ! unlimited |
| 243 | call advance() |
| 244 | |
| 245 | case (TOK_PLUS) |
| 246 | min_rep = 1 |
| 247 | max_rep = -1 |
| 248 | call advance() |
| 249 | |
| 250 | case (TOK_QUESTION) |
| 251 | min_rep = 0 |
| 252 | max_rep = 1 |
| 253 | call advance() |
| 254 | |
| 255 | case (TOK_LBRACE) |
| 256 | call parse_brace_quantifier(min_rep, max_rep, ierr) |
| 257 | if (ierr /= 0) return |
| 258 | |
| 259 | case default |
| 260 | ! No quantifier |
| 261 | return |
| 262 | end select |
| 263 | |
| 264 | ! Create quantifier node |
| 265 | quant_idx = g_pool%alloc() |
| 266 | g_pool%nodes(quant_idx)%ntype = AST_QUANTIFIER |
| 267 | g_pool%nodes(quant_idx)%child = atom_idx |
| 268 | g_pool%nodes(quant_idx)%min_rep = min_rep |
| 269 | g_pool%nodes(quant_idx)%max_rep = max_rep |
| 270 | |
| 271 | idx = quant_idx |
| 272 | |
| 273 | end function parse_quantified |
| 274 | |
| 275 | subroutine parse_brace_quantifier(min_rep, max_rep, ierr) |
| 276 | !> Parse {n}, {n,}, or {n,m} |
| 277 | integer, intent(out) :: min_rep, max_rep, ierr |
| 278 | |
| 279 | type(token_t) :: tok |
| 280 | integer :: ttype, num |
| 281 | |
| 282 | ierr = 0 |
| 283 | min_rep = 0 |
| 284 | max_rep = 0 |
| 285 | |
| 286 | call advance() ! consume { |
| 287 | |
| 288 | ! Expect a number or } |
| 289 | ttype = current_type() |
| 290 | if (ttype == TOK_RBRACE) then |
| 291 | ! {} - treat as {0,} (match zero or more) |
| 292 | call advance() |
| 293 | max_rep = -1 |
| 294 | return |
| 295 | end if |
| 296 | |
| 297 | if (ttype /= TOK_LITERAL) then |
| 298 | ierr = 1 |
| 299 | return |
| 300 | end if |
| 301 | |
| 302 | ! Parse first number |
| 303 | tok = current_token() |
| 304 | if (.not. is_digit(tok%char_val)) then |
| 305 | ierr = 1 |
| 306 | return |
| 307 | end if |
| 308 | |
| 309 | num = 0 |
| 310 | do while (current_type() == TOK_LITERAL) |
| 311 | tok = current_token() |
| 312 | if (.not. is_digit(tok%char_val)) exit |
| 313 | num = num * 10 + (ichar(tok%char_val) - ichar('0')) |
| 314 | call advance() |
| 315 | end do |
| 316 | min_rep = num |
| 317 | |
| 318 | ttype = current_type() |
| 319 | if (ttype == TOK_RBRACE) then |
| 320 | ! {n} - exact count |
| 321 | call advance() |
| 322 | max_rep = min_rep |
| 323 | return |
| 324 | end if |
| 325 | |
| 326 | ! Expect comma |
| 327 | tok = current_token() |
| 328 | if (ttype /= TOK_LITERAL .or. tok%char_val /= ',') then |
| 329 | ierr = 1 |
| 330 | return |
| 331 | end if |
| 332 | call advance() |
| 333 | |
| 334 | ttype = current_type() |
| 335 | if (ttype == TOK_RBRACE) then |
| 336 | ! {n,} - at least n |
| 337 | call advance() |
| 338 | max_rep = -1 |
| 339 | return |
| 340 | end if |
| 341 | |
| 342 | ! Parse second number |
| 343 | if (ttype /= TOK_LITERAL) then |
| 344 | ierr = 1 |
| 345 | return |
| 346 | end if |
| 347 | |
| 348 | num = 0 |
| 349 | do while (current_type() == TOK_LITERAL) |
| 350 | tok = current_token() |
| 351 | if (.not. is_digit(tok%char_val)) exit |
| 352 | num = num * 10 + (ichar(tok%char_val) - ichar('0')) |
| 353 | call advance() |
| 354 | end do |
| 355 | max_rep = num |
| 356 | |
| 357 | ! Expect } |
| 358 | if (current_type() /= TOK_RBRACE) then |
| 359 | ierr = 1 |
| 360 | return |
| 361 | end if |
| 362 | call advance() |
| 363 | |
| 364 | end subroutine parse_brace_quantifier |
| 365 | |
| 366 | recursive function parse_atom(ierr) result(idx) |
| 367 | !> Parse: atom = literal | '.' | '[' class ']' | '(' regex ')' | anchor | backref |
| 368 | integer, intent(out) :: ierr |
| 369 | integer :: idx |
| 370 | |
| 371 | type(token_t) :: tok |
| 372 | integer :: ttype, group_idx |
| 373 | |
| 374 | ierr = 0 |
| 375 | idx = 0 |
| 376 | |
| 377 | tok = current_token() |
| 378 | ttype = tok%ttype |
| 379 | |
| 380 | select case (ttype) |
| 381 | case (TOK_LITERAL) |
| 382 | idx = g_pool%alloc() |
| 383 | g_pool%nodes(idx)%ntype = AST_LITERAL |
| 384 | g_pool%nodes(idx)%char_val = tok%char_val |
| 385 | call advance() |
| 386 | |
| 387 | case (TOK_DOT) |
| 388 | idx = g_pool%alloc() |
| 389 | g_pool%nodes(idx)%ntype = AST_DOT |
| 390 | call advance() |
| 391 | |
| 392 | case (TOK_LBRACKET) |
| 393 | ! Character class (already parsed by lexer) |
| 394 | idx = g_pool%alloc() |
| 395 | g_pool%nodes(idx)%ntype = AST_CHAR_CLASS |
| 396 | g_pool%nodes(idx)%char_class = tok%char_class |
| 397 | g_pool%nodes(idx)%negated = tok%negated |
| 398 | call advance() |
| 399 | |
| 400 | case (TOK_LPAREN) |
| 401 | call advance() ! consume ( |
| 402 | |
| 403 | g_group_num = g_group_num + 1 |
| 404 | group_idx = g_group_num |
| 405 | |
| 406 | ! Parse inner expression |
| 407 | idx = parse_alternation(ierr) |
| 408 | if (ierr /= 0) return |
| 409 | |
| 410 | ! Expect ) |
| 411 | if (current_type() /= TOK_RPAREN) then |
| 412 | ierr = 1 |
| 413 | return |
| 414 | end if |
| 415 | call advance() |
| 416 | |
| 417 | ! Wrap in group node |
| 418 | group_idx = g_pool%alloc() |
| 419 | g_pool%nodes(group_idx)%ntype = AST_GROUP |
| 420 | g_pool%nodes(group_idx)%child = idx |
| 421 | g_pool%nodes(group_idx)%group_num = g_group_num |
| 422 | |
| 423 | idx = group_idx |
| 424 | |
| 425 | case (TOK_CARET) |
| 426 | idx = g_pool%alloc() |
| 427 | g_pool%nodes(idx)%ntype = AST_ANCHOR |
| 428 | g_pool%nodes(idx)%anchor_type = 1 ! start |
| 429 | call advance() |
| 430 | |
| 431 | case (TOK_DOLLAR) |
| 432 | idx = g_pool%alloc() |
| 433 | g_pool%nodes(idx)%ntype = AST_ANCHOR |
| 434 | g_pool%nodes(idx)%anchor_type = 2 ! end |
| 435 | call advance() |
| 436 | |
| 437 | case (TOK_WORD_BOUNDARY) |
| 438 | idx = g_pool%alloc() |
| 439 | g_pool%nodes(idx)%ntype = AST_ANCHOR |
| 440 | g_pool%nodes(idx)%anchor_type = tok%int_val + 2 ! 3=word_start, 4=word_end, 5=boundary |
| 441 | call advance() |
| 442 | |
| 443 | case (TOK_BACKREF) |
| 444 | idx = g_pool%alloc() |
| 445 | g_pool%nodes(idx)%ntype = AST_BACKREF |
| 446 | g_pool%nodes(idx)%group_num = tok%int_val |
| 447 | call advance() |
| 448 | |
| 449 | case default |
| 450 | ! Unexpected token - error |
| 451 | ierr = 1 |
| 452 | end select |
| 453 | |
| 454 | end function parse_atom |
| 455 | |
| 456 | !--------------------------------------------------------------------------- |
| 457 | ! Utility Functions |
| 458 | !--------------------------------------------------------------------------- |
| 459 | pure function is_digit(c) result(res) |
| 460 | character(len=1), intent(in) :: c |
| 461 | logical :: res |
| 462 | res = (c >= '0' .and. c <= '9') |
| 463 | end function is_digit |
| 464 | |
| 465 | end module regex_parser |
| 466 |