module regex_optimizer !> Regex optimization module for FERP !> Provides optimized NFA matching with: !> - Literal prefix extraction for Boyer-Moore skip !> - Bit vector state sets for O(1) operations !> - Lazy DFA state caching !> - Anchored pattern fast paths !> - Aho-Corasick for alternation patterns !> - Bitwise character class matching use regex_types use regex_charclass use aho_corasick use ferp_kinds, only: pattern_len implicit none private public :: optimized_nfa_t public :: optimize_nfa, optimized_match, optimized_search public :: try_build_aho_corasick integer, parameter :: MAX_STATES = 1024 integer, parameter :: MAX_PREFIX_LEN = 64 integer, parameter :: DFA_CACHE_SIZE = 256 ! Cache recent state transitions integer, parameter :: MAX_DFA_STATES = 512 ! Max DFA states before fallback to NFA integer, parameter :: DFA_DEAD_STATE = 0 ! Special state: no match possible !> Bit vector for state sets - much faster than array lookup type :: state_set_t integer(8) :: bits(MAX_STATES / 64 + 1) = 0 integer :: count = 0 contains procedure :: clear => state_set_clear procedure :: add => state_set_add procedure :: contains => state_set_contains procedure :: is_empty => state_set_is_empty procedure :: copy_from => state_set_copy procedure :: hash => state_set_hash procedure :: equals => state_set_equals end type state_set_t !> DFA cache entry - caches (state_set_hash, char) -> next_states transitions type :: dfa_cache_entry_t integer(8) :: state_hash = 0 ! Hash of source state set integer :: char_code = -1 ! Character being matched type(state_set_t) :: next_states ! Resulting states after transition logical :: valid = .false. ! Entry is populated logical :: is_case_insensitive = .false. ! Case sensitivity flag end type dfa_cache_entry_t !> Full DFA state - precomputed transitions for all 256 characters type :: dfa_state_t integer :: transitions(0:255) = DFA_DEAD_STATE ! Next state for each byte type(state_set_t) :: nfa_states ! Corresponding NFA state set logical :: is_accept = .false. ! Is this an accepting state? integer(8) :: state_hash = 0 ! Hash for lookup end type dfa_state_t !> Compiled DFA for O(n) matching type :: compiled_dfa_t type(dfa_state_t), allocatable :: states(:) ! DFA states integer :: num_states = 0 ! Number of states built integer :: start_state = 0 ! Starting DFA state logical :: compiled = .false. ! DFA successfully compiled logical :: too_large = .false. ! DFA exceeded size limit ! Character equivalence classes integer :: char_to_class(0:255) = 0 ! Maps char code to class index integer :: num_classes = 256 ! Number of equivalence classes logical :: use_equiv_classes = .false. ! Using equivalence classes end type compiled_dfa_t !> Optimized NFA with precomputed data type :: optimized_nfa_t type(nfa_t) :: nfa ! Original NFA character(len=MAX_PREFIX_LEN) :: prefix = '' ! Literal prefix for quick skip integer :: prefix_len = 0 logical :: anchored_start = .false. ! Pattern starts with ^ logical :: anchored_end = .false. ! Pattern ends with $ integer :: skip_table(0:255) = 0 ! Boyer-Moore skip table for prefix type(state_set_t) :: start_closure ! Pre-computed start state epsilon closure type(state_set_t), allocatable :: epsilon_closures(:) ! Pre-computed epsilon closures per state type(dfa_cache_entry_t) :: dfa_cache(DFA_CACHE_SIZE) ! Lazy DFA cache type(compiled_dfa_t) :: dfa ! Full compiled DFA (if available) logical :: use_dfa = .false. ! Use DFA instead of NFA type(ac_automaton_t) :: ac ! Aho-Corasick automaton (for alternation) logical :: use_aho_corasick = .false. ! Use Aho-Corasick for matching logical :: has_backrefs = .false. ! Pattern contains backreferences logical :: optimized = .false. end type optimized_nfa_t contains !--------------------------------------------------------------------------- ! State Set Operations (Bit Vector) !--------------------------------------------------------------------------- subroutine state_set_clear(this) class(state_set_t), intent(inout) :: this this%bits = 0 this%count = 0 end subroutine state_set_clear subroutine state_set_add(this, state) class(state_set_t), intent(inout) :: this integer, intent(in) :: state integer :: word_idx, bit_idx integer(8) :: mask if (state < 1 .or. state > MAX_STATES) return word_idx = (state - 1) / 64 + 1 bit_idx = mod(state - 1, 64) mask = ishft(1_8, bit_idx) if (iand(this%bits(word_idx), mask) == 0) then this%bits(word_idx) = ior(this%bits(word_idx), mask) this%count = this%count + 1 end if end subroutine state_set_add function state_set_contains(this, state) result(found) class(state_set_t), intent(in) :: this integer, intent(in) :: state logical :: found integer :: word_idx, bit_idx integer(8) :: mask found = .false. if (state < 1 .or. state > MAX_STATES) return word_idx = (state - 1) / 64 + 1 bit_idx = mod(state - 1, 64) mask = ishft(1_8, bit_idx) found = iand(this%bits(word_idx), mask) /= 0 end function state_set_contains function state_set_is_empty(this) result(empty) class(state_set_t), intent(in) :: this logical :: empty empty = this%count == 0 end function state_set_is_empty subroutine state_set_copy(this, other) class(state_set_t), intent(inout) :: this type(state_set_t), intent(in) :: other this%bits = other%bits this%count = other%count end subroutine state_set_copy function state_set_hash(this) result(h) !> Compute a hash of the state set for cache lookup !> Uses FNV-1a style hashing on the bit words class(state_set_t), intent(in) :: this integer(8) :: h integer :: i integer(8), parameter :: FNV_OFFSET = int(Z'CBF29CE484222325', 8) integer(8), parameter :: FNV_PRIME = int(Z'100000001B3', 8) h = FNV_OFFSET do i = 1, size(this%bits) h = ieor(h, this%bits(i)) h = h * FNV_PRIME end do end function state_set_hash function state_set_equals(this, other) result(eq) !> Check if two state sets are identical class(state_set_t), intent(in) :: this type(state_set_t), intent(in) :: other logical :: eq integer :: i eq = .false. if (this%count /= other%count) return do i = 1, size(this%bits) if (this%bits(i) /= other%bits(i)) return end do eq = .true. end function state_set_equals !--------------------------------------------------------------------------- ! Optimization: Analyze NFA and extract optimizations !--------------------------------------------------------------------------- subroutine optimize_nfa(opt, nfa) type(optimized_nfa_t), intent(out) :: opt type(nfa_t), intent(in) :: nfa opt%nfa = nfa opt%prefix_len = 0 opt%prefix = '' opt%anchored_start = .false. opt%anchored_end = .false. opt%use_dfa = .false. opt%has_backrefs = .false. ! Detect backreferences in NFA opt%has_backrefs = has_backref_transitions(nfa) ! Extract literal prefix and detect anchors call extract_prefix_and_anchors(opt) ! Build Boyer-Moore skip table for prefix if (opt%prefix_len > 0) then call build_skip_table(opt%prefix, opt%prefix_len, opt%skip_table) end if ! Pre-compute start state epsilon closure (position-independent part) call precompute_start_closure(opt) ! Pre-compute epsilon closures for all states (for fast expansion) call precompute_all_epsilon_closures(opt) ! Clear DFA cache opt%dfa_cache%valid = .false. ! Try to compile full DFA for O(n) matching ! Only for patterns without any position-dependent transitions (anchors) ! and without backreferences (which require backtracking) if (.not. has_anchor_transitions(opt%nfa) .and. .not. opt%has_backrefs) then call compile_dfa(opt) ! DEBUG: Print DFA compilation result (uncomment for debugging) ! write(0,*) 'DFA compiled:', opt%use_dfa, 'states:', opt%dfa%num_states, 'too_large:', opt%dfa%too_large end if opt%optimized = .true. end subroutine optimize_nfa subroutine extract_prefix_and_anchors(opt) type(optimized_nfa_t), intent(inout) :: opt integer :: state, i, prefix_len type(nfa_transition_t) :: trans logical :: done prefix_len = 0 state = opt%nfa%start_state ! Check for start anchor if (state >= 1 .and. state <= opt%nfa%num_states) then do i = 1, opt%nfa%states(state)%num_trans trans = opt%nfa%states(state)%trans(i) if (trans%trans_type == TRANS_ANCHOR .and. trans%anchor_type == 1) then opt%anchored_start = .true. state = trans%target exit end if end do end if ! Extract literal prefix by following single-path character transitions done = .false. do while (.not. done .and. prefix_len < MAX_PREFIX_LEN) if (state < 1 .or. state > opt%nfa%num_states) exit if (opt%nfa%states(state)%num_trans /= 1) exit trans = opt%nfa%states(state)%trans(1) if (trans%trans_type == TRANS_CHAR) then prefix_len = prefix_len + 1 opt%prefix(prefix_len:prefix_len) = trans%match_char state = trans%target else if (trans%trans_type == TRANS_EPSILON) then ! Follow epsilon, but only if it's the only transition state = trans%target else done = .true. end if end do opt%prefix_len = prefix_len ! Check for end anchor (scan accept state's incoming transitions) ! This is approximate - just check if accept state has anchor transition if (opt%nfa%accept_state >= 1 .and. opt%nfa%accept_state <= opt%nfa%num_states) then ! Check states that point to accept state do state = 1, opt%nfa%num_states do i = 1, opt%nfa%states(state)%num_trans trans = opt%nfa%states(state)%trans(i) if (trans%target == opt%nfa%accept_state) then if (trans%trans_type == TRANS_ANCHOR .and. trans%anchor_type == 2) then opt%anchored_end = .true. end if end if end do end do end if end subroutine extract_prefix_and_anchors subroutine build_skip_table(prefix, prefix_len, skip_table) character(len=*), intent(in) :: prefix integer, intent(in) :: prefix_len integer, intent(out) :: skip_table(0:255) integer :: i, c ! Default skip is prefix length skip_table = prefix_len ! Set skip distances for characters in prefix do i = 1, prefix_len - 1 c = ichar(prefix(i:i)) skip_table(c) = prefix_len - i end do end subroutine build_skip_table subroutine precompute_start_closure(opt) type(optimized_nfa_t), intent(inout) :: opt ! Compute basic epsilon closure of start state ! (Some closures depend on position for anchors, handle those at runtime) call opt%start_closure%clear() call compute_epsilon_closure_basic(opt%nfa, opt%nfa%start_state, opt%start_closure) end subroutine precompute_start_closure subroutine compute_epsilon_closure_basic(nfa, start_state, result_set) type(nfa_t), intent(in) :: nfa integer, intent(in) :: start_state type(state_set_t), intent(inout) :: result_set integer :: stack(MAX_STATES), stack_top integer :: state, i, target type(nfa_transition_t) :: trans stack_top = 1 stack(1) = start_state do while (stack_top > 0) state = stack(stack_top) stack_top = stack_top - 1 if (state < 1 .or. state > nfa%num_states) cycle if (result_set%contains(state)) cycle call result_set%add(state) ! Follow epsilon transitions (not anchors - those are position-dependent) do i = 1, nfa%states(state)%num_trans trans = nfa%states(state)%trans(i) if (trans%trans_type == TRANS_EPSILON .and. trans%anchor_type >= 0) then target = trans%target if (target >= 1 .and. target <= nfa%num_states) then if (.not. result_set%contains(target)) then stack_top = stack_top + 1 if (stack_top <= MAX_STATES) stack(stack_top) = target end if end if end if end do end do end subroutine compute_epsilon_closure_basic subroutine precompute_all_epsilon_closures(opt) !> Pre-compute epsilon closure for every NFA state !> This allows O(1) closure lookup during matching instead of repeated traversal type(optimized_nfa_t), intent(inout) :: opt integer :: i, n n = opt%nfa%num_states if (n <= 0) return ! Allocate epsilon closures array if (allocated(opt%epsilon_closures)) deallocate(opt%epsilon_closures) allocate(opt%epsilon_closures(n)) ! Compute epsilon closure for each state do i = 1, n call opt%epsilon_closures(i)%clear() call compute_epsilon_closure_basic(opt%nfa, i, opt%epsilon_closures(i)) end do end subroutine precompute_all_epsilon_closures function has_anchor_transitions(nfa) result(has_anchors) !> Check if NFA has any anchor transitions (position-dependent) !> These include ^, $, \<, \>, \b, \B type(nfa_t), intent(in) :: nfa logical :: has_anchors integer :: state, i type(nfa_transition_t) :: trans has_anchors = .false. do state = 1, nfa%num_states do i = 1, nfa%states(state)%num_trans trans = nfa%states(state)%trans(i) if (trans%trans_type == TRANS_ANCHOR) then has_anchors = .true. return end if end do end do end function has_anchor_transitions function has_backref_transitions(nfa) result(has_backrefs) !> Check if NFA has any backreference transitions !> Backrefs are encoded as TRANS_EPSILON with negative anchor_type type(nfa_t), intent(in) :: nfa logical :: has_backrefs integer :: state, i type(nfa_transition_t) :: trans has_backrefs = .false. do state = 1, nfa%num_states do i = 1, nfa%states(state)%num_trans trans = nfa%states(state)%trans(i) if (trans%trans_type == TRANS_EPSILON .and. trans%anchor_type < 0) then has_backrefs = .true. return end if end do end do end function has_backref_transitions !--------------------------------------------------------------------------- ! Character Equivalence Classes !--------------------------------------------------------------------------- subroutine compute_equiv_classes(nfa, char_to_class, num_classes) !> Compute character equivalence classes from NFA transitions !> Characters with identical behavior across all NFA states belong to same class !> This reduces DFA transition table from 256 entries to num_classes entries type(nfa_t), intent(in) :: nfa integer, intent(out) :: char_to_class(0:255) integer, intent(out) :: num_classes ! Signature for each character: encodes which transitions it triggers ! We use a simple approach: hash the set of (state, target) pairs for each char integer(8) :: char_signature(0:255) integer :: state, i, c, target type(nfa_transition_t) :: trans integer(8) :: sig integer :: class_map(0:255) ! signature hash -> class index logical :: found ! Initialize all characters to have signature 0 (no transitions) char_signature = 0_8 ! Build signature for each character based on NFA transitions do state = 1, nfa%num_states do i = 1, nfa%states(state)%num_trans trans = nfa%states(state)%trans(i) target = trans%target select case (trans%trans_type) case (TRANS_CHAR) ! Single character transition c = ichar(trans%match_char) ! Add (state, target) to signature using FNV-1a-like hash char_signature(c) = ieor(char_signature(c), & int(state * 31 + target, 8) * 1099511628211_8) case (TRANS_CLASS) ! Character class transition - add to all matching chars do c = 0, 255 if (charclass_test(trans%char_bits, char(c))) then char_signature(c) = ieor(char_signature(c), & int(state * 31 + target, 8) * 1099511628211_8) end if end do case (TRANS_ANY) ! Dot matches all except newline do c = 0, 255 if (c /= 10) then ! Not newline char_signature(c) = ieor(char_signature(c), & int(state * 31 + target, 8) * 1099511628211_8) end if end do end select end do end do ! Force each alphabetic character to have a unique signature ! This ensures they get their own equivalence classes, so the case-folding ! code in DFA compilation works correctly (it relies on the class representative ! being alphabetic to compute transitions for both cases) do c = ichar('a'), ichar('z') ! Add unique value to each letter's signature to separate them from non-letters char_signature(c) = ieor(char_signature(c), int(c * 7919 + 1, 8)) char_signature(c - 32) = ieor(char_signature(c - 32), int((c - 32) * 7919 + 1, 8)) end do ! Now group characters by signature num_classes = 0 class_map = -1 char_to_class = 0 do c = 0, 255 sig = char_signature(c) ! Look for existing class with this signature found = .false. do i = 0, num_classes - 1 if (class_map(i) /= -1) then ! Check if any character in class i has same signature ! We stored the signature hash as a proxy if (char_signature(class_map(i)) == sig) then char_to_class(c) = i found = .true. exit end if end if end do if (.not. found) then ! Create new class char_to_class(c) = num_classes class_map(num_classes) = c ! Remember one char from this class num_classes = num_classes + 1 end if end do ! Ensure at least one class if (num_classes == 0) num_classes = 1 end subroutine compute_equiv_classes !--------------------------------------------------------------------------- ! DFA Compilation: Convert NFA to DFA for O(n) matching !--------------------------------------------------------------------------- subroutine compile_dfa(opt) !> Compile NFA to DFA using subset construction !> Uses character equivalence classes to reduce compilation time type(optimized_nfa_t), intent(inout) :: opt type(state_set_t) :: start_set, next_set integer :: worklist(MAX_DFA_STATES), work_head, work_tail integer :: dfa_idx, char_code, next_idx, old_num_states integer :: class_idx, c integer :: class_representative(0:255) ! One char per class integer :: class_transitions(0:255) ! Computed transition per class ! Allocate DFA states if (allocated(opt%dfa%states)) deallocate(opt%dfa%states) allocate(opt%dfa%states(MAX_DFA_STATES)) opt%dfa%num_states = 0 opt%dfa%compiled = .false. opt%dfa%too_large = .false. opt%use_dfa = .false. ! Compute character equivalence classes call compute_equiv_classes(opt%nfa, opt%dfa%char_to_class, opt%dfa%num_classes) opt%dfa%use_equiv_classes = (opt%dfa%num_classes < 256) ! Build representative character for each class class_representative = -1 do c = 0, 255 class_idx = opt%dfa%char_to_class(c) if (class_representative(class_idx) == -1) then class_representative(class_idx) = c end if end do ! Compute start state: epsilon closure of NFA start call start_set%clear() call compute_epsilon_closure_basic(opt%nfa, opt%nfa%start_state, start_set) if (start_set%is_empty()) return ! Create initial DFA state opt%dfa%num_states = 1 opt%dfa%states(1)%nfa_states = start_set opt%dfa%states(1)%state_hash = start_set%hash() opt%dfa%states(1)%is_accept = is_accepting_set(opt%nfa, start_set) opt%dfa%start_state = 1 ! Initialize worklist with start state work_head = 1 work_tail = 1 worklist(1) = 1 ! Process worklist: for each DFA state, compute transitions do while (work_head <= work_tail) dfa_idx = worklist(work_head) work_head = work_head + 1 ! First, compute transitions for each equivalence class (not all 256 chars) class_transitions = DFA_DEAD_STATE do class_idx = 0, opt%dfa%num_classes - 1 char_code = class_representative(class_idx) if (char_code < 0) cycle call next_set%clear() ! Compute NFA transitions for this character ! Note: DFA is case-sensitive. Case-insensitive matching uses NFA path. call compute_char_transitions_simple(opt%nfa, opt%dfa%states(dfa_idx)%nfa_states, & char(char_code), next_set) ! Compute epsilon closure of result if (.not. next_set%is_empty()) then call expand_epsilon_closure_simple(opt, next_set) end if if (next_set%is_empty()) then class_transitions(class_idx) = DFA_DEAD_STATE else ! Find or create DFA state for this NFA state set old_num_states = opt%dfa%num_states next_idx = find_or_create_dfa_state(opt%dfa, next_set, opt%nfa) if (next_idx == -1) then opt%dfa%too_large = .true. opt%dfa%compiled = .false. return end if class_transitions(class_idx) = next_idx ! Add new state to worklist only if it was just created if (opt%dfa%num_states > old_num_states) then work_tail = work_tail + 1 if (work_tail > MAX_DFA_STATES) then opt%dfa%too_large = .true. opt%dfa%compiled = .false. return end if worklist(work_tail) = next_idx end if end if end do ! Now fill in the full 256-entry transition table from class transitions do c = 0, 255 class_idx = opt%dfa%char_to_class(c) opt%dfa%states(dfa_idx)%transitions(c) = class_transitions(class_idx) end do end do ! Minimize DFA to reduce state count call minimize_dfa(opt%dfa) opt%dfa%compiled = .true. opt%use_dfa = .true. end subroutine compile_dfa function find_or_create_dfa_state(dfa, nfa_states, nfa) result(idx) !> Find existing DFA state for NFA state set, or create new one !> Returns -1 if DFA state limit exceeded type(compiled_dfa_t), intent(inout) :: dfa type(state_set_t), intent(in) :: nfa_states type(nfa_t), intent(in) :: nfa integer :: idx integer(8) :: h integer :: i h = nfa_states%hash() ! Search existing states do i = 1, dfa%num_states if (dfa%states(i)%state_hash == h .and. & dfa%states(i)%nfa_states%equals(nfa_states)) then idx = i return end if end do ! Create new state if (dfa%num_states >= MAX_DFA_STATES) then idx = -1 return end if dfa%num_states = dfa%num_states + 1 idx = dfa%num_states dfa%states(idx)%nfa_states = nfa_states dfa%states(idx)%state_hash = h dfa%states(idx)%is_accept = is_accepting_set(nfa, nfa_states) dfa%states(idx)%transitions = DFA_DEAD_STATE end function find_or_create_dfa_state subroutine minimize_dfa(dfa) !> Minimize DFA using Hopcroft's algorithm !> Merges equivalent states to reduce DFA size type(compiled_dfa_t), intent(inout) :: dfa integer :: num_states, num_partitions integer, allocatable :: partition(:) ! partition(state) = partition ID integer, allocatable :: part_size(:) ! Size of each partition integer, allocatable :: representative(:) ! Representative state for each partition integer, allocatable :: new_state_id(:) ! Mapping from old state to new state ID type(dfa_state_t), allocatable :: new_states(:) logical, allocatable :: in_worklist(:) ! Is partition in worklist? integer, allocatable :: worklist(:) ! Partitions to process integer :: work_head, work_tail integer :: i, c, state, target, part_id integer :: num_accept, num_reject integer :: old_part, new_part_id logical :: needs_split integer, allocatable :: split_marker(:) ! Which states go to partition A on char c integer :: new_num_states num_states = dfa%num_states if (num_states <= 1) return ! Nothing to minimize ! Allocate working arrays allocate(partition(num_states)) allocate(part_size(num_states)) allocate(representative(num_states)) allocate(new_state_id(num_states)) allocate(in_worklist(num_states)) allocate(worklist(num_states)) allocate(split_marker(num_states)) ! Initialize partitions: accepting states = partition 1, non-accepting = partition 2 partition = 0 part_size = 0 num_accept = 0 num_reject = 0 do i = 1, num_states if (dfa%states(i)%is_accept) then partition(i) = 1 num_accept = num_accept + 1 else partition(i) = 2 num_reject = num_reject + 1 end if end do part_size(1) = num_accept part_size(2) = num_reject num_partitions = 2 ! Handle edge case: all accepting or all rejecting if (num_accept == 0 .or. num_reject == 0) then num_partitions = 1 partition = 1 part_size(1) = num_states end if ! Initialize worklist with smaller partition (Hopcroft optimization) in_worklist = .false. work_head = 1 work_tail = 0 if (num_partitions == 2) then if (num_accept <= num_reject) then work_tail = 1 worklist(1) = 1 in_worklist(1) = .true. else work_tail = 1 worklist(1) = 2 in_worklist(2) = .true. end if end if ! Main refinement loop do while (work_head <= work_tail) part_id = worklist(work_head) work_head = work_head + 1 in_worklist(part_id) = .false. ! For each character, check if this partition splits others do c = 0, 255 ! Mark states that transition to partition part_id on character c split_marker = 0 do state = 1, num_states target = dfa%states(state)%transitions(c) if (target > 0 .and. target <= num_states) then if (partition(target) == part_id) then split_marker(state) = 1 end if end if end do ! Check each existing partition for splits do old_part = 1, num_partitions ! Count states in this partition that go to part_id vs don't num_accept = 0 ! Reuse: count going to part_id num_reject = 0 ! Reuse: count not going to part_id do state = 1, num_states if (partition(state) == old_part) then if (split_marker(state) == 1) then num_accept = num_accept + 1 else num_reject = num_reject + 1 end if end if end do ! If partition needs splitting (has both types) needs_split = (num_accept > 0 .and. num_reject > 0) if (needs_split) then ! Create new partition for the smaller group num_partitions = num_partitions + 1 new_part_id = num_partitions ! Move the smaller group to new partition if (num_accept <= num_reject) then ! Move states going to part_id to new partition do state = 1, num_states if (partition(state) == old_part .and. split_marker(state) == 1) then partition(state) = new_part_id end if end do part_size(new_part_id) = num_accept part_size(old_part) = num_reject else ! Move states NOT going to part_id to new partition do state = 1, num_states if (partition(state) == old_part .and. split_marker(state) == 0) then partition(state) = new_part_id end if end do part_size(new_part_id) = num_reject part_size(old_part) = num_accept end if ! Update worklist if (in_worklist(old_part)) then ! Both halves need to be in worklist work_tail = work_tail + 1 worklist(work_tail) = new_part_id in_worklist(new_part_id) = .true. else ! Add smaller partition to worklist if (part_size(new_part_id) <= part_size(old_part)) then work_tail = work_tail + 1 worklist(work_tail) = new_part_id in_worklist(new_part_id) = .true. else work_tail = work_tail + 1 worklist(work_tail) = old_part in_worklist(old_part) = .true. end if end if end if end do end do end do ! Check if minimization actually reduced states if (num_partitions >= num_states) then ! No reduction possible deallocate(partition, part_size, representative, new_state_id) deallocate(in_worklist, worklist, split_marker) return end if ! Find representative for each partition (lowest numbered state) representative = 0 do state = 1, num_states part_id = partition(state) if (representative(part_id) == 0) then representative(part_id) = state end if end do ! Build new state IDs (compact numbering) new_state_id = 0 new_num_states = 0 do part_id = 1, num_partitions if (representative(part_id) > 0) then new_num_states = new_num_states + 1 ! Map all states in this partition to new state ID do state = 1, num_states if (partition(state) == part_id) then new_state_id(state) = new_num_states end if end do end if end do ! Build minimized DFA allocate(new_states(new_num_states)) do part_id = 1, num_partitions state = representative(part_id) if (state == 0) cycle i = new_state_id(state) new_states(i)%is_accept = dfa%states(state)%is_accept new_states(i)%state_hash = dfa%states(state)%state_hash new_states(i)%nfa_states = dfa%states(state)%nfa_states ! Remap transitions do c = 0, 255 target = dfa%states(state)%transitions(c) if (target > 0 .and. target <= num_states) then new_states(i)%transitions(c) = new_state_id(target) else new_states(i)%transitions(c) = DFA_DEAD_STATE end if end do end do ! Update DFA with minimized version deallocate(dfa%states) allocate(dfa%states(new_num_states)) dfa%states = new_states dfa%start_state = new_state_id(dfa%start_state) dfa%num_states = new_num_states ! Cleanup deallocate(partition, part_size, representative, new_state_id) deallocate(in_worklist, worklist, split_marker, new_states) end subroutine minimize_dfa subroutine compute_char_transitions_simple(nfa, current, c, next_set) !> Compute character transitions without case folding (for DFA compilation) type(nfa_t), intent(in) :: nfa type(state_set_t), intent(in) :: current character(len=1), intent(in) :: c type(state_set_t), intent(inout) :: next_set integer :: state, word_idx, bit_idx, i integer(8) :: word, mask type(nfa_transition_t) :: trans do word_idx = 1, size(current%bits) word = current%bits(word_idx) if (word == 0) cycle do bit_idx = 0, 63 mask = ishft(1_8, bit_idx) if (iand(word, mask) /= 0) then state = (word_idx - 1) * 64 + bit_idx + 1 if (state > nfa%num_states) cycle do i = 1, nfa%states(state)%num_trans trans = nfa%states(state)%trans(i) select case (trans%trans_type) case (TRANS_CHAR) if (c == trans%match_char) then call next_set%add(trans%target) end if case (TRANS_CLASS) if (trans%char_class(ichar(c)) .neqv. trans%negated) then call next_set%add(trans%target) end if case (TRANS_ANY) if (c /= char(10)) then call next_set%add(trans%target) end if end select end do end if end do end do end subroutine compute_char_transitions_simple subroutine expand_epsilon_closure_simple(opt, state_set) !> Expand state set to include epsilon closure (in-place) !> Uses pre-computed closures for O(1) lookup per state type(optimized_nfa_t), intent(in) :: opt type(state_set_t), intent(inout) :: state_set integer :: word_idx, bit_idx, state, j integer(8) :: word, mask, original_bits(size(state_set%bits)) ! If no pre-computed closures, fall back to computing on-the-fly if (.not. allocated(opt%epsilon_closures)) then call expand_epsilon_closure_simple_fallback(opt%nfa, state_set) return end if ! Save original bits to avoid processing newly added states original_bits = state_set%bits ! Expand using pre-computed closures - just OR the bit vectors do word_idx = 1, size(original_bits) word = original_bits(word_idx) if (word == 0) cycle do bit_idx = 0, 63 mask = ishft(1_8, bit_idx) if (iand(word, mask) /= 0) then state = (word_idx - 1) * 64 + bit_idx + 1 if (state >= 1 .and. state <= opt%nfa%num_states) then ! Merge pre-computed epsilon closure using bitwise OR do j = 1, size(state_set%bits) state_set%bits(j) = ior(state_set%bits(j), opt%epsilon_closures(state)%bits(j)) end do end if end if end do end do end subroutine expand_epsilon_closure_simple subroutine expand_epsilon_closure_simple_fallback(nfa, state_set) !> Fallback: compute epsilon closure on-the-fly type(nfa_t), intent(in) :: nfa type(state_set_t), intent(inout) :: state_set type(state_set_t) :: result integer :: word_idx, bit_idx, state integer(8) :: word, mask call result%clear() do word_idx = 1, size(state_set%bits) word = state_set%bits(word_idx) if (word == 0) cycle do bit_idx = 0, 63 mask = ishft(1_8, bit_idx) if (iand(word, mask) /= 0) then state = (word_idx - 1) * 64 + bit_idx + 1 if (state <= nfa%num_states) then call compute_epsilon_closure_basic(nfa, state, result) end if end if end do end do call state_set%copy_from(result) end subroutine expand_epsilon_closure_simple_fallback !--------------------------------------------------------------------------- ! Optimized Search: Use prefix to skip positions !--------------------------------------------------------------------------- function optimized_search(opt, text, ignore_case) result(res) type(optimized_nfa_t), intent(inout) :: opt character(len=*), intent(in) :: text logical, intent(in) :: ignore_case type(match_result_t) :: res integer :: text_len, pos, skip type(match_result_t) :: try_res res%matched = .false. text_len = len_trim(text) ! Fast path: use Aho-Corasick for alternation patterns ! Only use AC if ignore_case setting matches what was compiled if (opt%use_aho_corasick) then if (ignore_case .eqv. opt%ac%ignore_case) then res = ac_optimized_search(opt%ac, text) return end if end if if (opt%nfa%num_states == 0) return ! Backtracking path: use backtracking matcher for patterns with backreferences if (opt%has_backrefs) then res = backtrack_search(opt%nfa, text, text_len, ignore_case) return end if ! Fast path: use DFA if available (O(n) matching) ! DFA is case-sensitive; case-insensitive matching falls through to NFA path if (opt%use_dfa .and. .not. ignore_case) then res = dfa_search(opt%dfa, text, text_len) return end if ! Fast path: anchored start - only try position 1 if (opt%anchored_start) then res = optimized_match(opt, text, 1, ignore_case) return end if ! Use prefix to skip positions (Boyer-Moore style) if (opt%prefix_len > 0 .and. .not. ignore_case) then pos = opt%prefix_len do while (pos <= text_len) ! Check if prefix matches at this position if (prefix_matches(text, pos - opt%prefix_len + 1, opt%prefix, opt%prefix_len)) then ! Try full NFA match from this position try_res = optimized_match(opt, text, pos - opt%prefix_len + 1, ignore_case) if (try_res%matched) then res = try_res return end if pos = pos + 1 else ! Skip based on mismatched character skip = opt%skip_table(ichar(text(pos:pos))) pos = pos + max(skip, 1) end if end do else ! No prefix optimization - try each position do pos = 1, text_len + 1 try_res = optimized_match(opt, text, pos, ignore_case) if (try_res%matched) then res = try_res return end if end do end if end function optimized_search function dfa_search(dfa, text, text_len) result(res) !> Fast O(n) DFA-based search !> Tries each starting position and returns first match type(compiled_dfa_t), intent(in) :: dfa character(len=*), intent(in) :: text integer, intent(in) :: text_len type(match_result_t) :: res integer :: start_pos type(match_result_t) :: try_res res%matched = .false. if (.not. dfa%compiled .or. dfa%num_states == 0) return ! Try each starting position do start_pos = 1, text_len + 1 try_res = dfa_match(dfa, text, text_len, start_pos) if (try_res%matched) then res = try_res return end if end do end function dfa_search function dfa_match(dfa, text, text_len, start_pos) result(res) !> O(n) DFA matching from a specific position !> Just follows transition table - no state set operations type(compiled_dfa_t), intent(in) :: dfa character(len=*), intent(in) :: text integer, intent(in) :: text_len, start_pos type(match_result_t) :: res integer :: state, pos, char_code res%matched = .false. if (.not. dfa%compiled) return state = dfa%start_state pos = start_pos ! Check if start state is accepting (empty match) if (dfa%states(state)%is_accept) then res%matched = .true. res%match_start = start_pos res%match_end = start_pos - 1 end if ! Process each character do while (pos <= text_len) char_code = ichar(text(pos:pos)) state = dfa%states(state)%transitions(char_code) if (state == DFA_DEAD_STATE) exit pos = pos + 1 ! Check for acceptance (greedy - find longest) if (dfa%states(state)%is_accept) then res%matched = .true. res%match_start = start_pos res%match_end = pos - 1 end if end do end function dfa_match function prefix_matches(text, pos, prefix, prefix_len) result(matches) character(len=*), intent(in) :: text integer, intent(in) :: pos, prefix_len character(len=*), intent(in) :: prefix logical :: matches integer :: i, text_len matches = .false. text_len = len_trim(text) if (pos < 1 .or. pos + prefix_len - 1 > text_len) return do i = 1, prefix_len if (text(pos+i-1:pos+i-1) /= prefix(i:i)) return end do matches = .true. end function prefix_matches !--------------------------------------------------------------------------- ! Optimized Match: Use bit vectors and caching !--------------------------------------------------------------------------- function optimized_match(opt, text, start_pos, ignore_case) result(res) type(optimized_nfa_t), intent(inout) :: opt character(len=*), intent(in) :: text integer, intent(in) :: start_pos logical, intent(in) :: ignore_case type(match_result_t) :: res type(state_set_t) :: current, next_set integer :: pos, text_len character(len=1) :: c res%matched = .false. text_len = len_trim(text) if (opt%nfa%num_states == 0) return ! Initialize with epsilon closure of start state (including position-dependent anchors) call current%clear() call compute_epsilon_closure_full(opt%nfa, opt%nfa%start_state, current, text, start_pos, text_len) ! Check if already accepting (empty pattern) if (is_accepting_set(opt%nfa, current)) then res%matched = .true. res%match_start = start_pos res%match_end = start_pos - 1 return end if ! Process each character pos = start_pos do while (pos <= text_len .and. .not. current%is_empty()) c = text(pos:pos) ! Compute next states with DFA caching call next_set%clear() call step_with_cache(opt, current, c, pos, text, text_len, ignore_case, next_set) ! Compute epsilon closure call current%clear() call expand_epsilon_closure(opt%nfa, next_set, current, text, pos + 1, text_len) pos = pos + 1 ! Check for acceptance (greedy - continue to find longest) if (is_accepting_set(opt%nfa, current)) then res%matched = .true. res%match_start = start_pos res%match_end = pos - 1 end if end do end function optimized_match subroutine compute_epsilon_closure_full(nfa, start_state, result_set, text, pos, text_len) type(nfa_t), intent(in) :: nfa integer, intent(in) :: start_state type(state_set_t), intent(inout) :: result_set character(len=*), intent(in) :: text integer, intent(in) :: pos, text_len integer :: stack(MAX_STATES), stack_top integer :: state, i, target type(nfa_transition_t) :: trans stack_top = 1 stack(1) = start_state do while (stack_top > 0) state = stack(stack_top) stack_top = stack_top - 1 if (state < 1 .or. state > nfa%num_states) cycle if (result_set%contains(state)) cycle call result_set%add(state) do i = 1, nfa%states(state)%num_trans trans = nfa%states(state)%trans(i) if (trans%trans_type == TRANS_EPSILON) then if (trans%anchor_type < 0) cycle ! Skip backrefs target = trans%target if (target >= 1 .and. target <= nfa%num_states) then if (.not. result_set%contains(target)) then stack_top = stack_top + 1 if (stack_top <= MAX_STATES) stack(stack_top) = target end if end if else if (trans%trans_type == TRANS_ANCHOR) then if (anchor_matches_opt(trans%anchor_type, text, pos, text_len)) then target = trans%target if (target >= 1 .and. target <= nfa%num_states) then if (.not. result_set%contains(target)) then stack_top = stack_top + 1 if (stack_top <= MAX_STATES) stack(stack_top) = target end if end if end if end if end do end do end subroutine compute_epsilon_closure_full subroutine expand_epsilon_closure(nfa, input_set, result_set, text, pos, text_len) type(nfa_t), intent(in) :: nfa type(state_set_t), intent(in) :: input_set type(state_set_t), intent(inout) :: result_set character(len=*), intent(in) :: text integer, intent(in) :: pos, text_len integer :: state, word_idx, bit_idx integer(8) :: word, mask ! Iterate through set bits do word_idx = 1, size(input_set%bits) word = input_set%bits(word_idx) if (word == 0) cycle do bit_idx = 0, 63 mask = ishft(1_8, bit_idx) if (iand(word, mask) /= 0) then state = (word_idx - 1) * 64 + bit_idx + 1 if (state <= nfa%num_states) then call compute_epsilon_closure_full(nfa, state, result_set, text, pos, text_len) end if end if end do end do end subroutine expand_epsilon_closure subroutine step_with_cache(opt, current, c, pos, text, text_len, ignore_case, next_set) !> Compute next states with DFA caching !> Cache key: (state_set_hash, char_code, ignore_case) !> This avoids recomputing transitions for repeated (state_set, char) pairs type(optimized_nfa_t), intent(inout) :: opt type(state_set_t), intent(in) :: current character(len=1), intent(in) :: c integer, intent(in) :: pos, text_len character(len=*), intent(in) :: text logical, intent(in) :: ignore_case type(state_set_t), intent(inout) :: next_set integer(8) :: state_hash integer :: cache_idx, char_code ! Compute cache key state_hash = current%hash() char_code = ichar(c) ! Compute cache index (combine hash with char code) cache_idx = int(mod(abs(ieor(state_hash, int(char_code, 8))), int(DFA_CACHE_SIZE, 8))) + 1 ! Check cache hit (using hash + char + case as key) ! Note: This may have rare hash collisions, but performance benefit outweighs risk if (opt%dfa_cache(cache_idx)%valid .and. & opt%dfa_cache(cache_idx)%state_hash == state_hash .and. & opt%dfa_cache(cache_idx)%char_code == char_code .and. & (opt%dfa_cache(cache_idx)%is_case_insensitive .eqv. ignore_case)) then ! Cache hit - copy cached result call next_set%copy_from(opt%dfa_cache(cache_idx)%next_states) return end if ! Cache miss - compute transitions call compute_char_transitions(opt%nfa, current, c, ignore_case, next_set) ! Store in cache opt%dfa_cache(cache_idx)%valid = .true. opt%dfa_cache(cache_idx)%state_hash = state_hash opt%dfa_cache(cache_idx)%char_code = char_code opt%dfa_cache(cache_idx)%is_case_insensitive = ignore_case call opt%dfa_cache(cache_idx)%next_states%copy_from(next_set) end subroutine step_with_cache subroutine compute_char_transitions(nfa, current, c, ignore_case, next_set) !> Compute character transitions without caching (called on cache miss) type(nfa_t), intent(in) :: nfa type(state_set_t), intent(in) :: current character(len=1), intent(in) :: c logical, intent(in) :: ignore_case type(state_set_t), intent(inout) :: next_set integer :: state, word_idx, bit_idx, i integer(8) :: word, mask type(nfa_transition_t) :: trans character(len=1) :: c_lower, match_lower ! Iterate through current states do word_idx = 1, size(current%bits) word = current%bits(word_idx) if (word == 0) cycle do bit_idx = 0, 63 mask = ishft(1_8, bit_idx) if (iand(word, mask) /= 0) then state = (word_idx - 1) * 64 + bit_idx + 1 if (state > nfa%num_states) cycle ! Process transitions from this state do i = 1, nfa%states(state)%num_trans trans = nfa%states(state)%trans(i) select case (trans%trans_type) case (TRANS_CHAR) if (ignore_case) then c_lower = to_lower_char(c) match_lower = to_lower_char(trans%match_char) if (c_lower == match_lower) then call next_set%add(trans%target) end if else if (c == trans%match_char) then call next_set%add(trans%target) end if end if case (TRANS_CLASS) ! Use fast bitwise character class test if (ignore_case) then if (charclass_test_case_insensitive(trans%char_bits, c)) then call next_set%add(trans%target) end if else if (charclass_test(trans%char_bits, c)) then call next_set%add(trans%target) end if end if case (TRANS_ANY) if (c /= char(10)) then call next_set%add(trans%target) end if end select end do end if end do end do end subroutine compute_char_transitions function is_accepting_set(nfa, states) result(res) type(nfa_t), intent(in) :: nfa type(state_set_t), intent(in) :: states logical :: res integer :: state, word_idx, bit_idx integer(8) :: word, mask res = .false. do word_idx = 1, size(states%bits) word = states%bits(word_idx) if (word == 0) cycle do bit_idx = 0, 63 mask = ishft(1_8, bit_idx) if (iand(word, mask) /= 0) then state = (word_idx - 1) * 64 + bit_idx + 1 if (state >= 1 .and. state <= nfa%num_states) then if (nfa%states(state)%is_accept) then res = .true. return end if end if end if end do end do end function is_accepting_set !--------------------------------------------------------------------------- ! Helper functions !--------------------------------------------------------------------------- function anchor_matches_opt(anchor_type, text, pos, text_len) result(matches) integer, intent(in) :: anchor_type character(len=*), intent(in) :: text integer, intent(in) :: pos, text_len logical :: matches logical :: at_start, at_end, prev_word, curr_word matches = .false. at_start = (pos == 1) .or. (pos < 1) at_end = (pos > text_len) select case (anchor_type) case (1) ! ^ if (at_start) then matches = .true. else if (pos > 1 .and. pos <= text_len + 1) then matches = (text(pos-1:pos-1) == char(10)) end if case (2) ! $ if (at_end) then matches = .true. else if (pos >= 1 .and. pos <= text_len) then matches = (text(pos:pos) == char(10)) end if case (3) ! \< prev_word = .false. curr_word = .false. if (pos > 1 .and. pos <= text_len + 1) prev_word = is_word_char_opt(text(pos-1:pos-1)) if (pos >= 1 .and. pos <= text_len) curr_word = is_word_char_opt(text(pos:pos)) matches = (.not. prev_word) .and. curr_word case (4) ! \> prev_word = .false. curr_word = .false. if (pos > 1 .and. pos <= text_len + 1) prev_word = is_word_char_opt(text(pos-1:pos-1)) if (pos >= 1 .and. pos <= text_len) curr_word = is_word_char_opt(text(pos:pos)) matches = prev_word .and. (.not. curr_word) case (5) ! \b prev_word = .false. curr_word = .false. if (pos > 1 .and. pos <= text_len + 1) prev_word = is_word_char_opt(text(pos-1:pos-1)) if (pos >= 1 .and. pos <= text_len) curr_word = is_word_char_opt(text(pos:pos)) matches = prev_word .neqv. curr_word case (6) ! \B prev_word = .false. curr_word = .false. if (pos > 1 .and. pos <= text_len + 1) prev_word = is_word_char_opt(text(pos-1:pos-1)) if (pos >= 1 .and. pos <= text_len) curr_word = is_word_char_opt(text(pos:pos)) matches = prev_word .eqv. curr_word end select end function anchor_matches_opt pure function is_word_char_opt(c) result(res) character(len=1), intent(in) :: c logical :: res integer :: ic ic = ichar(c) res = (ic >= ichar('a') .and. ic <= ichar('z')) .or. & (ic >= ichar('A') .and. ic <= ichar('Z')) .or. & (ic >= ichar('0') .and. ic <= ichar('9')) .or. & (c == '_') end function is_word_char_opt pure function to_lower_char(c) result(lower) character(len=1), intent(in) :: c character(len=1) :: lower integer :: ic ic = ichar(c) if (ic >= ichar('A') .and. ic <= ichar('Z')) then ! ASCII uppercase A-Z -> a-z lower = char(ic + 32) else if (ic >= 192 .and. ic <= 214) then ! Latin-1 uppercase À-Ö (192-214) -> à-ö (224-246) lower = char(ic + 32) else if (ic >= 216 .and. ic <= 222) then ! Latin-1 uppercase Ø-Þ (216-222) -> ø-þ (248-254) lower = char(ic + 32) else if (ic >= 128 .and. ic <= 150) then ! UTF-8 continuation byte for uppercase Latin Extended-A (U+00C0-U+00D6) ! When preceded by 0xC3, these represent À-Ö, fold to à-ö lower = char(ic + 32) else if (ic >= 152 .and. ic <= 158) then ! UTF-8 continuation byte for uppercase Latin Extended-A (U+00D8-U+00DE) ! When preceded by 0xC3, these represent Ø-Þ, fold to ø-þ lower = char(ic + 32) else lower = c end if end function to_lower_char function char_in_class_opt(c, char_class, negated, ignore_case) result(res) character(len=1), intent(in) :: c logical, intent(in) :: char_class(0:255) logical, intent(in) :: negated, ignore_case logical :: res integer :: ic character(len=1) :: c_lower, c_upper ic = ichar(c) res = char_class(ic) if (ignore_case .and. .not. res) then c_lower = to_lower_char(c) if (c_lower /= c) res = char_class(ichar(c_lower)) if (.not. res) then ic = ichar(c) if (ic >= ichar('a') .and. ic <= ichar('z')) then c_upper = char(ic - 32) res = char_class(ichar(c_upper)) end if end if end if if (negated) res = .not. res end function char_in_class_opt !--------------------------------------------------------------------------- ! Aho-Corasick Integration for Alternation Patterns !--------------------------------------------------------------------------- subroutine try_build_aho_corasick(opt, pattern, is_ere, ignore_case) !> Try to build Aho-Corasick automaton for simple alternation patterns !> Pattern like "foo|bar|baz" with only literal characters and | separators type(optimized_nfa_t), intent(inout) :: opt character(len=*), intent(in) :: pattern logical, intent(in) :: is_ere, ignore_case character(len=4096), allocatable :: alternatives(:) integer :: num_alternatives, ierr logical :: is_simple allocate(alternatives(1000)) opt%use_aho_corasick = .false. ! Check if pattern is simple alternation of literals call parse_simple_alternation(pattern, is_ere, alternatives, num_alternatives, is_simple) ! DEBUG (commented out for production) ! write(error_unit, '(A,I0,A,L1)') 'DEBUG AC: num_alt=', num_alternatives, ' is_simple=', is_simple if (.not. is_simple .or. num_alternatives < 2) return ! Build Aho-Corasick automaton call ac_build(opt%ac, alternatives, num_alternatives, ignore_case, ierr) if (ierr == 0) then opt%use_aho_corasick = .true. end if deallocate(alternatives) end subroutine try_build_aho_corasick subroutine parse_simple_alternation(pattern, is_ere, alternatives, num_alt, is_simple) !> Parse pattern to check if it's simple alternation of literals !> Returns the alternatives if so character(len=*), intent(in) :: pattern logical, intent(in) :: is_ere character(len=*), intent(out) :: alternatives(:) integer, intent(out) :: num_alt logical, intent(out) :: is_simple integer :: i, pat_len, alt_start, alt_len character(len=1) :: c, next_c logical :: in_escape is_simple = .true. num_alt = 0 pat_len = pattern_len(pattern) ! Use pattern_len to preserve whitespace patterns if (pat_len == 0) then is_simple = .false. return end if alt_start = 1 alt_len = 0 in_escape = .false. i = 1 do while (i <= pat_len) c = pattern(i:i) if (in_escape) then ! In ERE mode, \| is literal | ! In BRE mode, \| is alternation (GNU extension) if (c == '|' .and. .not. is_ere) then ! BRE alternation if (alt_len > 0) then num_alt = num_alt + 1 if (num_alt > size(alternatives)) then is_simple = .false. return end if alternatives(num_alt) = pattern(alt_start:alt_start+alt_len-1) ! Add null terminator to preserve exact length if (alt_len < len(alternatives(num_alt))) then alternatives(num_alt)(alt_len+1:alt_len+1) = char(0) end if else ! Empty alternative - still valid num_alt = num_alt + 1 alternatives(num_alt) = char(0) end if alt_start = i + 1 alt_len = 0 else if (c == '(' .or. c == ')' .or. c == '{' .or. c == '}' .or. & c == '<' .or. c == '>' .or. c == 'b' .or. c == 'B' .or. & c == 'd' .or. c == 'D' .or. c == 'w' .or. c == 'W' .or. & c == 's' .or. c == 'S' .or. c == '1' .or. c == '2' .or. & c == '3' .or. c == '4' .or. c == '5' .or. c == '6' .or. & c == '7' .or. c == '8' .or. c == '9') then ! Regex metacharacter - not simple is_simple = .false. return else ! Escaped literal character (e.g., \., \*, etc.) alt_len = alt_len + 1 end if in_escape = .false. i = i + 1 cycle end if if (c == '\') then in_escape = .true. i = i + 1 cycle end if ! Check for metacharacters if (is_ere) then ! ERE mode: | is alternation, . * + ? [ ] ^ $ ( ) { } are metacharacters if (c == '|') then ! Alternation separator if (alt_len > 0) then num_alt = num_alt + 1 if (num_alt > size(alternatives)) then is_simple = .false. return end if alternatives(num_alt) = pattern(alt_start:alt_start+alt_len-1) ! Add null terminator to preserve exact length if (alt_len < len(alternatives(num_alt))) then alternatives(num_alt)(alt_len+1:alt_len+1) = char(0) end if else num_alt = num_alt + 1 alternatives(num_alt) = char(0) end if alt_start = i + 1 alt_len = 0 i = i + 1 cycle else if (c == '.' .or. c == '*' .or. c == '+' .or. c == '?' .or. & c == '[' .or. c == ']' .or. c == '^' .or. c == '$' .or. & c == '(' .or. c == ')' .or. c == '{' .or. c == '}') then ! Metacharacter - not simple alternation is_simple = .false. return end if else ! BRE mode: only . * [ ] ^ $ are metacharacters ! | is literal, \| is alternation (GNU extension) if (c == '.' .or. c == '*' .or. c == '[' .or. c == ']' .or. & c == '^' .or. c == '$') then is_simple = .false. return end if end if ! Regular literal character alt_len = alt_len + 1 i = i + 1 end do ! Handle last alternative if (alt_len > 0 .or. num_alt > 0) then num_alt = num_alt + 1 if (num_alt > size(alternatives)) then is_simple = .false. return end if if (alt_len > 0) then alternatives(num_alt) = pattern(alt_start:alt_start+alt_len-1) ! Add null terminator to preserve exact length if (alt_len < len(alternatives(num_alt))) then alternatives(num_alt)(alt_len+1:alt_len+1) = char(0) end if else alternatives(num_alt) = char(0) end if end if ! Need at least 2 alternatives for Aho-Corasick to be useful if (num_alt < 2) then is_simple = .false. end if end subroutine parse_simple_alternation function ac_optimized_search(ac, text) result(res) !> Search using Aho-Corasick automaton type(ac_automaton_t), intent(in) :: ac character(len=*), intent(in) :: text type(match_result_t) :: res type(ac_match_t) :: ac_match res%matched = .false. ac_match = ac_search(ac, text) if (ac_match%matched) then res%matched = .true. res%match_start = ac_match%start_pos res%match_end = ac_match%end_pos end if end function ac_optimized_search !--------------------------------------------------------------------------- ! Backtracking Matcher for Backreferences !--------------------------------------------------------------------------- function backtrack_search(nfa, text, text_len, ignore_case) result(res) !> Search for pattern with backreferences using backtracking !> Tries each starting position until a match is found type(nfa_t), intent(in) :: nfa character(len=*), intent(in) :: text integer, intent(in) :: text_len logical, intent(in) :: ignore_case type(match_result_t) :: res integer :: start_pos type(match_result_t) :: try_res res%matched = .false. do start_pos = 1, text_len + 1 try_res = backtrack_match(nfa, text, text_len, start_pos, ignore_case) if (try_res%matched) then res = try_res return end if end do end function backtrack_search function backtrack_match(nfa, text, text_len, start_pos, ignore_case) result(res) !> Try to match NFA with backreferences starting at start_pos !> Uses recursive backtracking to track group captures type(nfa_t), intent(in) :: nfa character(len=*), intent(in) :: text integer, intent(in) :: text_len, start_pos logical, intent(in) :: ignore_case type(match_result_t) :: res integer :: group_starts(9), group_ends(9) integer :: best_end res%matched = .false. group_starts = 0 group_ends = 0 best_end = start_pos - 1 ! Try to match from the start state if (backtrack_from_state(nfa, nfa%start_state, text, text_len, start_pos, & ignore_case, group_starts, group_ends, best_end)) then res%matched = .true. res%match_start = start_pos res%match_end = best_end res%group_starts = group_starts res%group_ends = group_ends end if end function backtrack_match recursive function backtrack_from_state(nfa, state, text, text_len, pos, & ignore_case, group_starts, group_ends, best_end) result(matched) !> Recursive backtracking from a given NFA state !> Returns true if we can reach an accepting state type(nfa_t), intent(in) :: nfa integer, intent(in) :: state character(len=*), intent(in) :: text integer, intent(in) :: text_len, pos logical, intent(in) :: ignore_case integer, intent(inout) :: group_starts(9), group_ends(9) integer, intent(inout) :: best_end logical :: matched integer :: i, target, old_start, old_end integer :: backref_num, ref_start, ref_end, ref_len integer :: saved_starts(9), saved_ends(9) type(nfa_transition_t) :: trans character(len=1) :: c, c_lower, match_lower logical :: char_matches matched = .false. if (state < 1 .or. state > nfa%num_states) return ! Record group start if this state starts a group if (nfa%states(state)%group_start > 0 .and. nfa%states(state)%group_start <= 9) then old_start = group_starts(nfa%states(state)%group_start) group_starts(nfa%states(state)%group_start) = pos else old_start = 0 end if ! Record group end if this state ends a group ! This must be done BEFORE processing transitions so backrefs can see the captured text if (nfa%states(state)%group_end > 0 .and. nfa%states(state)%group_end <= 9) then old_end = group_ends(nfa%states(state)%group_end) group_ends(nfa%states(state)%group_end) = pos - 1 else old_end = 0 end if ! Check if this is an accepting state if (nfa%states(state)%is_accept) then matched = .true. if (pos - 1 > best_end) best_end = pos - 1 ! Continue to find longest match (greedy) end if ! Try each transition from this state do i = 1, nfa%states(state)%num_trans trans = nfa%states(state)%trans(i) target = trans%target select case (trans%trans_type) case (TRANS_EPSILON) ! Check for backreference (negative anchor_type) if (trans%anchor_type < 0) then backref_num = -trans%anchor_type if (backref_num >= 1 .and. backref_num <= 9) then ref_start = group_starts(backref_num) ref_end = group_ends(backref_num) ! If group hasn't been captured yet, backreference fails if (ref_start == 0 .or. ref_end == 0 .or. ref_end < ref_start) cycle ref_len = ref_end - ref_start + 1 ! Check if we have enough text remaining if (pos + ref_len - 1 > text_len) cycle ! Check if the text matches the captured group if (ignore_case) then if (.not. strings_equal_icase(text(pos:pos+ref_len-1), & text(ref_start:ref_end))) cycle else if (text(pos:pos+ref_len-1) /= text(ref_start:ref_end)) cycle end if ! Backref matches - continue from target with advanced position saved_starts = group_starts saved_ends = group_ends if (backtrack_from_state(nfa, target, text, text_len, pos + ref_len, & ignore_case, group_starts, group_ends, best_end)) then matched = .true. else group_starts = saved_starts group_ends = saved_ends end if end if else ! Regular epsilon transition saved_starts = group_starts saved_ends = group_ends if (backtrack_from_state(nfa, target, text, text_len, pos, & ignore_case, group_starts, group_ends, best_end)) then matched = .true. else group_starts = saved_starts group_ends = saved_ends end if end if case (TRANS_ANCHOR) ! Check if anchor matches at this position if (anchor_matches_opt(trans%anchor_type, text, pos, text_len)) then saved_starts = group_starts saved_ends = group_ends if (backtrack_from_state(nfa, target, text, text_len, pos, & ignore_case, group_starts, group_ends, best_end)) then matched = .true. else group_starts = saved_starts group_ends = saved_ends end if end if case (TRANS_CHAR) ! Character transition - need text available if (pos <= text_len) then c = text(pos:pos) char_matches = .false. if (ignore_case) then c_lower = to_lower_char(c) match_lower = to_lower_char(trans%match_char) char_matches = (c_lower == match_lower) else char_matches = (c == trans%match_char) end if if (char_matches) then saved_starts = group_starts saved_ends = group_ends if (backtrack_from_state(nfa, target, text, text_len, pos + 1, & ignore_case, group_starts, group_ends, best_end)) then matched = .true. else group_starts = saved_starts group_ends = saved_ends end if end if end if case (TRANS_CLASS) ! Character class transition if (pos <= text_len) then c = text(pos:pos) if (ignore_case) then if (charclass_test_case_insensitive(trans%char_bits, c)) then saved_starts = group_starts saved_ends = group_ends if (backtrack_from_state(nfa, target, text, text_len, pos + 1, & ignore_case, group_starts, group_ends, best_end)) then matched = .true. else group_starts = saved_starts group_ends = saved_ends end if end if else if (charclass_test(trans%char_bits, c)) then saved_starts = group_starts saved_ends = group_ends if (backtrack_from_state(nfa, target, text, text_len, pos + 1, & ignore_case, group_starts, group_ends, best_end)) then matched = .true. else group_starts = saved_starts group_ends = saved_ends end if end if end if end if case (TRANS_ANY) ! Dot matches any character except newline if (pos <= text_len) then if (text(pos:pos) /= char(10)) then saved_starts = group_starts saved_ends = group_ends if (backtrack_from_state(nfa, target, text, text_len, pos + 1, & ignore_case, group_starts, group_ends, best_end)) then matched = .true. else group_starts = saved_starts group_ends = saved_ends end if end if end if end select end do ! Restore group start and end if we didn't match if (.not. matched) then if (nfa%states(state)%group_start > 0 .and. nfa%states(state)%group_start <= 9) then group_starts(nfa%states(state)%group_start) = old_start end if if (nfa%states(state)%group_end > 0 .and. nfa%states(state)%group_end <= 9) then group_ends(nfa%states(state)%group_end) = old_end end if end if end function backtrack_from_state function strings_equal_icase(s1, s2) result(equal) !> Compare two strings case-insensitively character(len=*), intent(in) :: s1, s2 logical :: equal integer :: i, n equal = .false. n = len(s1) if (len(s2) /= n) return do i = 1, n if (to_lower_char(s1(i:i)) /= to_lower_char(s2(i:i))) return end do equal = .true. end function strings_equal_icase end module regex_optimizer