@@ -61,6 +61,10 @@ module regex_optimizer |
| 61 | 61 | integer :: start_state = 0 ! Starting DFA state |
| 62 | 62 | logical :: compiled = .false. ! DFA successfully compiled |
| 63 | 63 | logical :: too_large = .false. ! DFA exceeded size limit |
| 64 | + ! Character equivalence classes |
| 65 | + integer :: char_to_class(0:255) = 0 ! Maps char code to class index |
| 66 | + integer :: num_classes = 256 ! Number of equivalence classes |
| 67 | + logical :: use_equiv_classes = .false. ! Using equivalence classes |
| 64 | 68 | end type compiled_dfa_t |
| 65 | 69 | |
| 66 | 70 | !> Optimized NFA with precomputed data |
@@ -383,18 +387,125 @@ contains |
| 383 | 387 | end do |
| 384 | 388 | end function has_anchor_transitions |
| 385 | 389 | |
| 390 | + !--------------------------------------------------------------------------- |
| 391 | + ! Character Equivalence Classes |
| 392 | + !--------------------------------------------------------------------------- |
| 393 | + |
subroutine compute_equiv_classes(nfa, char_to_class, num_classes)
  !> Partition the 256 character codes into equivalence classes.
  !>
  !> Two character codes share a class only if they are matched by exactly
  !> the same set of character-consuming NFA transitions (TRANS_CHAR,
  !> TRANS_CLASS, TRANS_ANY). The partition is computed by exact refinement:
  !> each transition splits every class that contains both matching and
  !> non-matching characters. No hashing is involved, so characters with
  !> different behaviour can never be merged by a collision.
  !>
  !> Every ASCII letter is additionally placed in a class of its own, because
  !> the case-folding step in DFA compilation derives the opposite-case
  !> transition from the class representative.
  !>
  !> @param nfa            NFA whose transitions define the classes
  !> @param char_to_class  On return, class index (0-based, dense) for each
  !>                       character code 0..255
  !> @param num_classes    On return, number of classes (1..256); classes are
  !>                       numbered in order of their lowest character code
  type(nfa_t), intent(in) :: nfa
  integer, intent(out) :: char_to_class(0:255)
  integer, intent(out) :: num_classes

  integer, parameter :: newline_code = 10   ! TRANS_ANY matches everything but this
  integer, parameter :: case_offset = 32    ! ASCII distance between 'a' and 'A'

  integer :: class_of(0:255)    ! working (not yet dense) class id per char code
  integer :: renumber(0:255)    ! working class id -> final dense class index
  logical :: matches(0:255)     ! char codes accepted by the current transition
  integer :: n_raw              ! number of working class ids handed out so far
  integer :: state, i, c, lower
  logical :: consumes_char

  ! Start with a single class holding every character code.
  class_of = 0
  n_raw = 1

  ! Refine the partition once per character-consuming transition.
  do state = 1, nfa%num_states
    do i = 1, nfa%states(state)%num_trans
      matches = .false.
      consumes_char = .true.

      associate (trans => nfa%states(state)%trans(i))
        select case (trans%trans_type)
        case (TRANS_CHAR)
          ! Single character transition
          c = ichar(trans%match_char)
          if (c >= 0 .and. c <= 255) matches(c) = .true.

        case (TRANS_CLASS)
          ! Character class transition: every member of the class matches
          do c = 0, 255
            matches(c) = charclass_test(trans%char_bits, char(c))
          end do

        case (TRANS_ANY)
          ! Dot matches all except newline
          matches = .true.
          matches(newline_code) = .false.

        case default
          ! Other transition types are not keyed on a character here.
          consumes_char = .false.
        end select
      end associate

      if (consumes_char) call split_by_matches()
    end do
  end do

  ! Give every ASCII letter (both cases) a singleton class so that the
  ! representative of a letter's class is always that letter itself.
  do lower = ichar('a'), ichar('z')
    call isolate(lower)
    call isolate(lower - case_offset)
  end do

  ! Renumber classes densely in order of first occurrence, so that class 0
  ! contains character code 0 and indices run 0..num_classes-1.
  renumber = -1
  num_classes = 0
  do c = 0, 255
    if (renumber(class_of(c)) < 0) then
      renumber(class_of(c)) = num_classes
      num_classes = num_classes + 1
    end if
    char_to_class(c) = renumber(class_of(c))
  end do

contains

  !> Split each working class into its matching and non-matching parts.
  !> Classes are disjoint and never empty, so at most 256 ids are ever issued.
  subroutine split_by_matches()
    logical :: has_non_matching(0:255)
    integer :: split_to(0:255)   ! old class id -> id of its matching part
    integer :: ch, old_class

    has_non_matching = .false.
    do ch = 0, 255
      if (.not. matches(ch)) has_non_matching(class_of(ch)) = .true.
    end do

    split_to = -1
    do ch = 0, 255
      old_class = class_of(ch)
      ! A class that is entirely matching needs no split.
      if (matches(ch) .and. has_non_matching(old_class)) then
        if (split_to(old_class) < 0) then
          split_to(old_class) = n_raw
          n_raw = n_raw + 1
        end if
        class_of(ch) = split_to(old_class)
      end if
    end do
  end subroutine split_by_matches

  !> Move one character code into a class of its own if it shares one.
  subroutine isolate(code)
    integer, intent(in) :: code

    if (count(class_of == class_of(code)) > 1) then
      class_of(code) = n_raw
      n_raw = n_raw + 1
    end if
  end subroutine isolate

end subroutine compute_equiv_classes

| 493 | + |
| 386 | 494 | !--------------------------------------------------------------------------- |
| 387 | 495 | ! DFA Compilation: Convert NFA to DFA for O(n) matching |
| 388 | 496 | !--------------------------------------------------------------------------- |
| 389 | 497 | |
| 390 | 498 | subroutine compile_dfa(opt) |
| 391 | 499 | !> Compile NFA to DFA using subset construction |
| 392 | | - !> Creates DFA states lazily, stopping if too many states |
| 500 | + !> Uses character equivalence classes to reduce compilation time |
| 393 | 501 | type(optimized_nfa_t), intent(inout) :: opt |
| 394 | 502 | |
| 395 | 503 | type(state_set_t) :: start_set, next_set |
| 396 | 504 | integer :: worklist(MAX_DFA_STATES), work_head, work_tail |
| 397 | 505 | integer :: dfa_idx, char_code, next_idx, old_num_states |
| 506 | + integer :: class_idx, c |
| 507 | + integer :: class_representative(0:255) ! One char per class |
| 508 | + integer :: class_transitions(0:255) ! Computed transition per class |
| 398 | 509 | |
| 399 | 510 | ! Allocate DFA states |
| 400 | 511 | if (allocated(opt%dfa%states)) deallocate(opt%dfa%states) |
@@ -404,6 +515,19 @@ contains |
| 404 | 515 | opt%dfa%too_large = .false. |
| 405 | 516 | opt%use_dfa = .false. |
| 406 | 517 | |
| 518 | + ! Compute character equivalence classes |
| 519 | + call compute_equiv_classes(opt%nfa, opt%dfa%char_to_class, opt%dfa%num_classes) |
| 520 | + opt%dfa%use_equiv_classes = (opt%dfa%num_classes < 256) |
| 521 | + |
| 522 | + ! Build representative character for each class |
| 523 | + class_representative = -1 |
| 524 | + do c = 0, 255 |
| 525 | + class_idx = opt%dfa%char_to_class(c) |
| 526 | + if (class_representative(class_idx) == -1) then |
| 527 | + class_representative(class_idx) = c |
| 528 | + end if |
| 529 | + end do |
| 530 | + |
| 407 | 531 | ! Compute start state: epsilon closure of NFA start |
| 408 | 532 | call start_set%clear() |
| 409 | 533 | call compute_epsilon_closure_basic(opt%nfa, opt%nfa%start_state, start_set) |
@@ -427,10 +551,13 @@ contains |
| 427 | 551 | dfa_idx = worklist(work_head) |
| 428 | 552 | work_head = work_head + 1 |
| 429 | 553 | |
| 430 | | - ! Compute transitions for all 256 characters |
| 431 | | - ! For case-insensitive matching, we compute transitions for both cases |
| 432 | | - ! and union them so 'a' and 'A' go to the same DFA state |
| 433 | | - do char_code = 0, 255 |
| 554 | + ! First, compute transitions for each equivalence class (not all 256 chars) |
| 555 | + class_transitions = DFA_DEAD_STATE |
| 556 | + |
| 557 | + do class_idx = 0, opt%dfa%num_classes - 1 |
| 558 | + char_code = class_representative(class_idx) |
| 559 | + if (char_code < 0) cycle |
| 560 | + |
| 434 | 561 | call next_set%clear() |
| 435 | 562 | |
| 436 | 563 | ! Compute NFA transitions for this character |
@@ -439,11 +566,9 @@ contains |
| 439 | 566 | |
| 440 | 567 | ! For alphabetic characters, also compute transitions for opposite case |
| 441 | 568 | if (char_code >= ichar('a') .and. char_code <= ichar('z')) then |
| 442 | | - ! Also try uppercase |
| 443 | 569 | call compute_char_transitions_simple(opt%nfa, opt%dfa%states(dfa_idx)%nfa_states, & |
| 444 | 570 | char(char_code - 32), next_set) |
| 445 | 571 | else if (char_code >= ichar('A') .and. char_code <= ichar('Z')) then |
| 446 | | - ! Also try lowercase |
| 447 | 572 | call compute_char_transitions_simple(opt%nfa, opt%dfa%states(dfa_idx)%nfa_states, & |
| 448 | 573 | char(char_code + 32), next_set) |
| 449 | 574 | end if |
@@ -454,20 +579,19 @@ contains |
| 454 | 579 | end if |
| 455 | 580 | |
| 456 | 581 | if (next_set%is_empty()) then |
| 457 | | - opt%dfa%states(dfa_idx)%transitions(char_code) = DFA_DEAD_STATE |
| 582 | + class_transitions(class_idx) = DFA_DEAD_STATE |
| 458 | 583 | else |
| 459 | 584 | ! Find or create DFA state for this NFA state set |
| 460 | 585 | old_num_states = opt%dfa%num_states |
| 461 | 586 | next_idx = find_or_create_dfa_state(opt%dfa, next_set, opt%nfa) |
| 462 | 587 | |
| 463 | 588 | if (next_idx == -1) then |
| 464 | | - ! Too many DFA states - abort |
| 465 | 589 | opt%dfa%too_large = .true. |
| 466 | 590 | opt%dfa%compiled = .false. |
| 467 | 591 | return |
| 468 | 592 | end if |
| 469 | 593 | |
| 470 | | - opt%dfa%states(dfa_idx)%transitions(char_code) = next_idx |
| 594 | + class_transitions(class_idx) = next_idx |
| 471 | 595 | |
| 472 | 596 | ! Add new state to worklist only if it was just created |
| 473 | 597 | if (opt%dfa%num_states > old_num_states) then |
@@ -481,6 +605,12 @@ contains |
| 481 | 605 | end if |
| 482 | 606 | end if |
| 483 | 607 | end do |
| 608 | + |
| 609 | + ! Now fill in the full 256-entry transition table from class transitions |
| 610 | + do c = 0, 255 |
| 611 | + class_idx = opt%dfa%char_to_class(c) |
| 612 | + opt%dfa%states(dfa_idx)%transitions(c) = class_transitions(class_idx) |
| 613 | + end do |
| 484 | 614 | end do |
| 485 | 615 | |
| 486 | 616 | ! Minimize DFA to reduce state count |