@@ -61,6 +61,10 @@ module regex_optimizer |
| 61 | integer :: start_state = 0 ! Starting DFA state | 61 | integer :: start_state = 0 ! Starting DFA state |
| 62 | logical :: compiled = .false. ! DFA successfully compiled | 62 | logical :: compiled = .false. ! DFA successfully compiled |
| 63 | logical :: too_large = .false. ! DFA exceeded size limit | 63 | logical :: too_large = .false. ! DFA exceeded size limit |
| | 64 | + ! Character equivalence classes |
| | 65 | + integer :: char_to_class(0:255) = 0 ! Maps char code to class index |
| | 66 | + integer :: num_classes = 256 ! Number of equivalence classes |
| | 67 | + logical :: use_equiv_classes = .false. ! Using equivalence classes |
| 64 | end type compiled_dfa_t | 68 | end type compiled_dfa_t |
| 65 | | 69 | |
| 66 | !> Optimized NFA with precomputed data | 70 | !> Optimized NFA with precomputed data |
@@ -383,18 +387,125 @@ contains |
| 383 | end do | 387 | end do |
| 384 | end function has_anchor_transitions | 388 | end function has_anchor_transitions |
| 385 | | 389 | |
| | 390 | + !--------------------------------------------------------------------------- |
| | 391 | + ! Character Equivalence Classes |
| | 392 | + !--------------------------------------------------------------------------- |
| | 393 | + |
subroutine compute_equiv_classes(nfa, char_to_class, num_classes)
    !> Partition the 256 character codes into equivalence classes.
    !>
    !> Two codes end up in the same class only if, for every NFA state, they
    !> trigger exactly the same set of (state, target) character transitions.
    !> The DFA builder can then compute one transition per class instead of
    !> one per character code.
    !>
    !> The partition is computed exactly by successive refinement with
    !> character masks. No hashing is involved, so codes with different
    !> behaviour can never be merged by a collision, and overlapping
    !> transitions to the same target cannot cancel each other out.
    !>
    !> Every ASCII letter is additionally forced into a class of its own:
    !> the case-folding step of DFA compilation works on the class
    !> representative and relies on it being the letter itself.
    !>
    !> @param nfa            NFA whose character-consuming transitions are inspected
    !> @param char_to_class  On return, class index (0-based) of every code 0..255.
    !>                       Classes are numbered in order of their smallest member.
    !> @param num_classes    On return, number of classes (1..256)
    type(nfa_t), intent(in) :: nfa
    integer, intent(out) :: char_to_class(0:255)
    integer, intent(out) :: num_classes

    integer, parameter :: newline_code = 10   ! code excluded by TRANS_ANY
    integer, parameter :: case_offset = 32    ! distance between ASCII cases

    logical :: mask(0:255)       ! codes accepted by the current (state, target) pair
    integer :: remap(0:255)      ! provisional class id -> canonical class id
    integer :: state, i, j, c, dest, next_id

    ! Start from the coarsest partition: one class holding every code.
    char_to_class = 0
    num_classes = 1

    ! Refine by every distinct (state, target) pair.
    do state = 1, nfa%num_states
        associate (st => nfa%states(state))
            do i = 1, st%num_trans
                dest = st%trans(i)%target

                ! Each target is handled once, at its first transition.
                if (any(st%trans(1:i-1)%target == dest)) cycle

                ! Union of the codes accepted by all transitions state -> dest.
                mask = .false.
                do j = i, st%num_trans
                    if (st%trans(j)%target == dest) then
                        call add_transition_chars(st%trans(j), mask)
                    end if
                end do

                call refine_partition(mask)
            end do
        end associate
    end do

    ! Isolate every ASCII letter in a singleton class (see header comment).
    do c = ichar('a'), ichar('z')
        mask = .false.
        mask(c) = .true.
        call refine_partition(mask)

        mask = .false.
        mask(c - case_offset) = .true.
        call refine_partition(mask)
    end do

    ! Renumber so that class ids follow the order of first occurrence
    ! (class 0 contains code 0, and so on).
    remap = -1
    next_id = 0
    do c = 0, 255
        if (remap(char_to_class(c)) == -1) then
            remap(char_to_class(c)) = next_id
            next_id = next_id + 1
        end if
        char_to_class(c) = remap(char_to_class(c))
    end do

contains

    !> OR into `mask` the character codes consumed by transition `trans`.
    !> Transition types other than TRANS_CHAR, TRANS_CLASS and TRANS_ANY
    !> leave `mask` unchanged.
    subroutine add_transition_chars(trans, mask)
        type(nfa_transition_t), intent(in) :: trans
        logical, intent(inout) :: mask(0:255)
        integer :: code

        select case (trans%trans_type)
        case (TRANS_CHAR)
            code = ichar(trans%match_char)
            if (code >= 0 .and. code <= 255) mask(code) = .true.

        case (TRANS_CLASS)
            do code = 0, 255
                if (charclass_test(trans%char_bits, char(code))) mask(code) = .true.
            end do

        case (TRANS_ANY)
            ! Dot matches everything except newline.
            ! NOTE(review): must agree with the matcher's treatment of '.'.
            mask(:newline_code - 1) = .true.
            mask(newline_code + 1:) = .true.
        end select
    end subroutine add_transition_chars

    !> Split every class of the host's partition that has members both
    !> inside and outside `mask`; the inside members move to a new class.
    !> Updates the host variables `char_to_class` and `num_classes`.
    !> Class ids stay below 256 because every class is non-empty.
    subroutine refine_partition(mask)
        logical, intent(in) :: mask(0:255)
        logical :: has_outside(0:255)   ! class has a member not in mask
        integer :: split_id(0:255)      ! new id for the in-mask part, -1 if none yet
        integer :: code, k

        if (.not. any(mask)) return

        has_outside = .false.
        do code = 0, 255
            if (.not. mask(code)) has_outside(char_to_class(code)) = .true.
        end do

        split_id = -1
        do code = 0, 255
            if (.not. mask(code)) cycle
            k = char_to_class(code)
            if (.not. has_outside(k)) cycle   ! class lies entirely inside mask
            if (split_id(k) == -1) then
                split_id(k) = num_classes
                num_classes = num_classes + 1
            end if
            char_to_class(code) = split_id(k)
        end do
    end subroutine refine_partition

end subroutine compute_equiv_classes
| | 493 | + |
| 386 | !--------------------------------------------------------------------------- | 494 | !--------------------------------------------------------------------------- |
| 387 | ! DFA Compilation: Convert NFA to DFA for O(n) matching | 495 | ! DFA Compilation: Convert NFA to DFA for O(n) matching |
| 388 | !--------------------------------------------------------------------------- | 496 | !--------------------------------------------------------------------------- |
| 389 | | 497 | |
| 390 | subroutine compile_dfa(opt) | 498 | subroutine compile_dfa(opt) |
| 391 | !> Compile NFA to DFA using subset construction | 499 | !> Compile NFA to DFA using subset construction |
| 392 | - !> Creates DFA states lazily, stopping if too many states | 500 | + !> Uses character equivalence classes to reduce compilation time |
| 393 | type(optimized_nfa_t), intent(inout) :: opt | 501 | type(optimized_nfa_t), intent(inout) :: opt |
| 394 | | 502 | |
| 395 | type(state_set_t) :: start_set, next_set | 503 | type(state_set_t) :: start_set, next_set |
| 396 | integer :: worklist(MAX_DFA_STATES), work_head, work_tail | 504 | integer :: worklist(MAX_DFA_STATES), work_head, work_tail |
| 397 | integer :: dfa_idx, char_code, next_idx, old_num_states | 505 | integer :: dfa_idx, char_code, next_idx, old_num_states |
| | 506 | + integer :: class_idx, c |
| | 507 | + integer :: class_representative(0:255) ! One char per class |
| | 508 | + integer :: class_transitions(0:255) ! Computed transition per class |
| 398 | | 509 | |
| 399 | ! Allocate DFA states | 510 | ! Allocate DFA states |
| 400 | if (allocated(opt%dfa%states)) deallocate(opt%dfa%states) | 511 | if (allocated(opt%dfa%states)) deallocate(opt%dfa%states) |
@@ -404,6 +515,19 @@ contains |
| 404 | opt%dfa%too_large = .false. | 515 | opt%dfa%too_large = .false. |
| 405 | opt%use_dfa = .false. | 516 | opt%use_dfa = .false. |
| 406 | | 517 | |
| | 518 | + ! Compute character equivalence classes |
| | 519 | + call compute_equiv_classes(opt%nfa, opt%dfa%char_to_class, opt%dfa%num_classes) |
| | 520 | + opt%dfa%use_equiv_classes = (opt%dfa%num_classes < 256) |
| | 521 | + |
| | 522 | + ! Build representative character for each class |
| | 523 | + class_representative = -1 |
| | 524 | + do c = 0, 255 |
| | 525 | + class_idx = opt%dfa%char_to_class(c) |
| | 526 | + if (class_representative(class_idx) == -1) then |
| | 527 | + class_representative(class_idx) = c |
| | 528 | + end if |
| | 529 | + end do |
| | 530 | + |
| 407 | ! Compute start state: epsilon closure of NFA start | 531 | ! Compute start state: epsilon closure of NFA start |
| 408 | call start_set%clear() | 532 | call start_set%clear() |
| 409 | call compute_epsilon_closure_basic(opt%nfa, opt%nfa%start_state, start_set) | 533 | call compute_epsilon_closure_basic(opt%nfa, opt%nfa%start_state, start_set) |
@@ -427,10 +551,13 @@ contains |
| 427 | dfa_idx = worklist(work_head) | 551 | dfa_idx = worklist(work_head) |
| 428 | work_head = work_head + 1 | 552 | work_head = work_head + 1 |
| 429 | | 553 | |
| 430 | - ! Compute transitions for all 256 characters | 554 | + ! First, compute transitions for each equivalence class (not all 256 chars) |
| 431 | - ! For case-insensitive matching, we compute transitions for both cases | 555 | + class_transitions = DFA_DEAD_STATE |
| 432 | - ! and union them so 'a' and 'A' go to the same DFA state | 556 | + |
| 433 | - do char_code = 0, 255 | 557 | + do class_idx = 0, opt%dfa%num_classes - 1 |
| | 558 | + char_code = class_representative(class_idx) |
| | 559 | + if (char_code < 0) cycle |
| | 560 | + |
| 434 | call next_set%clear() | 561 | call next_set%clear() |
| 435 | | 562 | |
| 436 | ! Compute NFA transitions for this character | 563 | ! Compute NFA transitions for this character |
@@ -439,11 +566,9 @@ contains |
| 439 | | 566 | |
| 440 | ! For alphabetic characters, also compute transitions for opposite case | 567 | ! For alphabetic characters, also compute transitions for opposite case |
| 441 | if (char_code >= ichar('a') .and. char_code <= ichar('z')) then | 568 | if (char_code >= ichar('a') .and. char_code <= ichar('z')) then |
| 442 | - ! Also try uppercase | | |
| 443 | call compute_char_transitions_simple(opt%nfa, opt%dfa%states(dfa_idx)%nfa_states, & | 569 | call compute_char_transitions_simple(opt%nfa, opt%dfa%states(dfa_idx)%nfa_states, & |
| 444 | char(char_code - 32), next_set) | 570 | char(char_code - 32), next_set) |
| 445 | else if (char_code >= ichar('A') .and. char_code <= ichar('Z')) then | 571 | else if (char_code >= ichar('A') .and. char_code <= ichar('Z')) then |
| 446 | - ! Also try lowercase | | |
| 447 | call compute_char_transitions_simple(opt%nfa, opt%dfa%states(dfa_idx)%nfa_states, & | 572 | call compute_char_transitions_simple(opt%nfa, opt%dfa%states(dfa_idx)%nfa_states, & |
| 448 | char(char_code + 32), next_set) | 573 | char(char_code + 32), next_set) |
| 449 | end if | 574 | end if |
@@ -454,20 +579,19 @@ contains |
| 454 | end if | 579 | end if |
| 455 | | 580 | |
| 456 | if (next_set%is_empty()) then | 581 | if (next_set%is_empty()) then |
| 457 | - opt%dfa%states(dfa_idx)%transitions(char_code) = DFA_DEAD_STATE | 582 | + class_transitions(class_idx) = DFA_DEAD_STATE |
| 458 | else | 583 | else |
| 459 | ! Find or create DFA state for this NFA state set | 584 | ! Find or create DFA state for this NFA state set |
| 460 | old_num_states = opt%dfa%num_states | 585 | old_num_states = opt%dfa%num_states |
| 461 | next_idx = find_or_create_dfa_state(opt%dfa, next_set, opt%nfa) | 586 | next_idx = find_or_create_dfa_state(opt%dfa, next_set, opt%nfa) |
| 462 | | 587 | |
| 463 | if (next_idx == -1) then | 588 | if (next_idx == -1) then |
| 464 | - ! Too many DFA states - abort | | |
| 465 | opt%dfa%too_large = .true. | 589 | opt%dfa%too_large = .true. |
| 466 | opt%dfa%compiled = .false. | 590 | opt%dfa%compiled = .false. |
| 467 | return | 591 | return |
| 468 | end if | 592 | end if |
| 469 | | 593 | |
| 470 | - opt%dfa%states(dfa_idx)%transitions(char_code) = next_idx | 594 | + class_transitions(class_idx) = next_idx |
| 471 | | 595 | |
| 472 | ! Add new state to worklist only if it was just created | 596 | ! Add new state to worklist only if it was just created |
| 473 | if (opt%dfa%num_states > old_num_states) then | 597 | if (opt%dfa%num_states > old_num_states) then |
@@ -481,6 +605,12 @@ contains |
| 481 | end if | 605 | end if |
| 482 | end if | 606 | end if |
| 483 | end do | 607 | end do |
| | 608 | + |
| | 609 | + ! Now fill in the full 256-entry transition table from class transitions |
| | 610 | + do c = 0, 255 |
| | 611 | + class_idx = opt%dfa%char_to_class(c) |
| | 612 | + opt%dfa%states(dfa_idx)%transitions(c) = class_transitions(class_idx) |
| | 613 | + end do |
| 484 | end do | 614 | end do |
| 485 | | 615 | |
| 486 | ! Minimize DFA to reduce state count | 616 | ! Minimize DFA to reduce state count |