ferp Public

Watch 0 Fork 1 Star 0
Fortran · 71301 bytes Raw Blame History
  
        1
        module regex_optimizer
      
        2
          !> Regex optimization module for FERP
      
        3
          !> Provides optimized NFA matching with:
      
        4
          !>   - Literal prefix extraction for Boyer-Moore skip
      
        5
          !>   - Bit vector state sets for O(1) operations
      
        6
          !>   - Lazy DFA state caching
      
        7
          !>   - Anchored pattern fast paths
      
        8
          !>   - Aho-Corasick for alternation patterns
      
        9
          !>   - Bitwise character class matching
      
        10
          use regex_types
      
        11
          use regex_charclass
      
        12
          use aho_corasick
      
        13
          use ferp_kinds, only: pattern_len
      
        14
          implicit none
      
        15
          private
      
        16
        
        17
          public :: optimized_nfa_t
      
        18
          public :: optimize_nfa, optimized_match, optimized_search
      
        19
          public :: try_build_aho_corasick
      
        20
        
        21
          integer, parameter :: MAX_STATES = 1024
      
        22
          integer, parameter :: MAX_PREFIX_LEN = 64
      
        23
          integer, parameter :: DFA_CACHE_SIZE = 256  ! Cache recent state transitions
      
        24
          integer, parameter :: MAX_DFA_STATES = 512  ! Max DFA states before fallback to NFA
      
        25
          integer, parameter :: DFA_DEAD_STATE = 0    ! Special state: no match possible
      
        26
        
        27
          !> Bit vector for state sets - much faster than array lookup
      
        28
          type :: state_set_t
      
        29
            integer(8) :: bits(MAX_STATES / 64 + 1) = 0
      
        30
            integer :: count = 0
      
        31
          contains
      
        32
            procedure :: clear => state_set_clear
      
        33
            procedure :: add => state_set_add
      
        34
            procedure :: contains => state_set_contains
      
        35
            procedure :: is_empty => state_set_is_empty
      
        36
            procedure :: copy_from => state_set_copy
      
        37
            procedure :: hash => state_set_hash
      
        38
            procedure :: equals => state_set_equals
      
        39
          end type state_set_t
      
        40
        
        41
          !> DFA cache entry - caches (state_set_hash, char) -> next_states transitions
      
        42
          type :: dfa_cache_entry_t
      
        43
            integer(8) :: state_hash = 0        ! Hash of source state set
      
        44
            integer :: char_code = -1           ! Character being matched
      
        45
            type(state_set_t) :: next_states    ! Resulting states after transition
      
        46
            logical :: valid = .false.          ! Entry is populated
      
        47
            logical :: is_case_insensitive = .false.  ! Case sensitivity flag
      
        48
          end type dfa_cache_entry_t
      
        49
        
        50
          !> Full DFA state - precomputed transitions for all 256 characters
      
        51
          type :: dfa_state_t
      
        52
            integer :: transitions(0:255) = DFA_DEAD_STATE  ! Next state for each byte
      
        53
            type(state_set_t) :: nfa_states                  ! Corresponding NFA state set
      
        54
            logical :: is_accept = .false.                   ! Is this an accepting state?
      
        55
            integer(8) :: state_hash = 0                     ! Hash for lookup
      
        56
          end type dfa_state_t
      
        57
        
        58
          !> Compiled DFA for O(n) matching
      
        59
          type :: compiled_dfa_t
      
        60
            type(dfa_state_t), allocatable :: states(:)     ! DFA states
      
        61
            integer :: num_states = 0                        ! Number of states built
      
        62
            integer :: start_state = 0                       ! Starting DFA state
      
        63
            logical :: compiled = .false.                    ! DFA successfully compiled
      
        64
            logical :: too_large = .false.                   ! DFA exceeded size limit
      
        65
            ! Character equivalence classes
      
        66
            integer :: char_to_class(0:255) = 0             ! Maps char code to class index
      
        67
            integer :: num_classes = 256                     ! Number of equivalence classes
      
        68
            logical :: use_equiv_classes = .false.           ! Using equivalence classes
      
        69
          end type compiled_dfa_t
      
        70
        
        71
          !> Optimized NFA with precomputed data
      
        72
          type :: optimized_nfa_t
      
        73
            type(nfa_t) :: nfa                          ! Original NFA
      
        74
            character(len=MAX_PREFIX_LEN) :: prefix = '' ! Literal prefix for quick skip
      
        75
            integer :: prefix_len = 0
      
        76
            logical :: anchored_start = .false.          ! Pattern starts with ^
      
        77
            logical :: anchored_end = .false.            ! Pattern ends with $
      
        78
            integer :: skip_table(0:255) = 0             ! Boyer-Moore skip table for prefix
      
        79
            type(state_set_t) :: start_closure           ! Pre-computed start state epsilon closure
      
        80
            type(state_set_t), allocatable :: epsilon_closures(:)  ! Pre-computed epsilon closures per state
      
        81
            type(dfa_cache_entry_t) :: dfa_cache(DFA_CACHE_SIZE)  ! Lazy DFA cache
      
        82
            type(compiled_dfa_t) :: dfa                  ! Full compiled DFA (if available)
      
        83
            logical :: use_dfa = .false.                 ! Use DFA instead of NFA
      
        84
            type(ac_automaton_t) :: ac                   ! Aho-Corasick automaton (for alternation)
      
        85
            logical :: use_aho_corasick = .false.        ! Use Aho-Corasick for matching
      
        86
            logical :: has_backrefs = .false.            ! Pattern contains backreferences
      
        87
            logical :: optimized = .false.
      
        88
          end type optimized_nfa_t
      
        89
        
        90
        contains
      
        91
        
        92
          !---------------------------------------------------------------------------
      
        93
          ! State Set Operations (Bit Vector)
      
        94
          !---------------------------------------------------------------------------
      
        95
        
        96
          subroutine state_set_clear(this)
      
        97
            class(state_set_t), intent(inout) :: this
      
        98
            this%bits = 0
      
        99
            this%count = 0
      
        100
          end subroutine state_set_clear
      
        101
        
        102
          subroutine state_set_add(this, state)
      
        103
            class(state_set_t), intent(inout) :: this
      
        104
            integer, intent(in) :: state
      
        105
            integer :: word_idx, bit_idx
      
        106
            integer(8) :: mask
      
        107
        
        108
            if (state < 1 .or. state > MAX_STATES) return
      
        109
        
        110
            word_idx = (state - 1) / 64 + 1
      
        111
            bit_idx = mod(state - 1, 64)
      
        112
            mask = ishft(1_8, bit_idx)
      
        113
        
        114
            if (iand(this%bits(word_idx), mask) == 0) then
      
        115
              this%bits(word_idx) = ior(this%bits(word_idx), mask)
      
        116
              this%count = this%count + 1
      
        117
            end if
      
        118
          end subroutine state_set_add
      
        119
        
        120
          function state_set_contains(this, state) result(found)
      
        121
            class(state_set_t), intent(in) :: this
      
        122
            integer, intent(in) :: state
      
        123
            logical :: found
      
        124
            integer :: word_idx, bit_idx
      
        125
            integer(8) :: mask
      
        126
        
        127
            found = .false.
      
        128
            if (state < 1 .or. state > MAX_STATES) return
      
        129
        
        130
            word_idx = (state - 1) / 64 + 1
      
        131
            bit_idx = mod(state - 1, 64)
      
        132
            mask = ishft(1_8, bit_idx)
      
        133
        
        134
            found = iand(this%bits(word_idx), mask) /= 0
      
        135
          end function state_set_contains
      
        136
        
        137
          function state_set_is_empty(this) result(empty)
      
        138
            class(state_set_t), intent(in) :: this
      
        139
            logical :: empty
      
        140
            empty = this%count == 0
      
        141
          end function state_set_is_empty
      
        142
        
        143
          subroutine state_set_copy(this, other)
      
        144
            class(state_set_t), intent(inout) :: this
      
        145
            type(state_set_t), intent(in) :: other
      
        146
            this%bits = other%bits
      
        147
            this%count = other%count
      
        148
          end subroutine state_set_copy
      
        149
        
        150
          function state_set_hash(this) result(h)
      
        151
            !> Compute a hash of the state set for cache lookup
      
        152
            !> Uses FNV-1a style hashing on the bit words
      
        153
            class(state_set_t), intent(in) :: this
      
        154
            integer(8) :: h
      
        155
            integer :: i
      
        156
            integer(8), parameter :: FNV_OFFSET = int(Z'CBF29CE484222325', 8)
      
        157
            integer(8), parameter :: FNV_PRIME = int(Z'100000001B3', 8)
      
        158
        
        159
            h = FNV_OFFSET
      
        160
            do i = 1, size(this%bits)
      
        161
              h = ieor(h, this%bits(i))
      
        162
              h = h * FNV_PRIME
      
        163
            end do
      
        164
          end function state_set_hash
      
        165
        
        166
          function state_set_equals(this, other) result(eq)
      
        167
            !> Check if two state sets are identical
      
        168
            class(state_set_t), intent(in) :: this
      
        169
            type(state_set_t), intent(in) :: other
      
        170
            logical :: eq
      
        171
            integer :: i
      
        172
        
        173
            eq = .false.
      
        174
            if (this%count /= other%count) return
      
        175
        
        176
            do i = 1, size(this%bits)
      
        177
              if (this%bits(i) /= other%bits(i)) return
      
        178
            end do
      
        179
        
        180
            eq = .true.
      
        181
          end function state_set_equals
      
        182
        
        183
          !---------------------------------------------------------------------------
      
        184
          ! Optimization: Analyze NFA and extract optimizations
      
        185
          !---------------------------------------------------------------------------
      
        186
        
        187
          subroutine optimize_nfa(opt, nfa)
      
        188
            type(optimized_nfa_t), intent(out) :: opt
      
        189
            type(nfa_t), intent(in) :: nfa
      
        190
        
        191
            opt%nfa = nfa
      
        192
            opt%prefix_len = 0
      
        193
            opt%prefix = ''
      
        194
            opt%anchored_start = .false.
      
        195
            opt%anchored_end = .false.
      
        196
            opt%use_dfa = .false.
      
        197
            opt%has_backrefs = .false.
      
        198
        
        199
            ! Detect backreferences in NFA
      
        200
            opt%has_backrefs = has_backref_transitions(nfa)
      
        201
        
        202
            ! Extract literal prefix and detect anchors
      
        203
            call extract_prefix_and_anchors(opt)
      
        204
        
        205
            ! Build Boyer-Moore skip table for prefix
      
        206
            if (opt%prefix_len > 0) then
      
        207
              call build_skip_table(opt%prefix, opt%prefix_len, opt%skip_table)
      
        208
            end if
      
        209
        
        210
            ! Pre-compute start state epsilon closure (position-independent part)
      
        211
            call precompute_start_closure(opt)
      
        212
        
        213
            ! Pre-compute epsilon closures for all states (for fast expansion)
      
        214
            call precompute_all_epsilon_closures(opt)
      
        215
        
        216
            ! Clear DFA cache
      
        217
            opt%dfa_cache%valid = .false.
      
        218
        
        219
            ! Try to compile full DFA for O(n) matching
      
        220
            ! Only for patterns without any position-dependent transitions (anchors)
      
        221
            ! and without backreferences (which require backtracking)
      
        222
            if (.not. has_anchor_transitions(opt%nfa) .and. .not. opt%has_backrefs) then
      
        223
              call compile_dfa(opt)
      
        224
              ! DEBUG: Print DFA compilation result (uncomment for debugging)
      
        225
              ! write(0,*) 'DFA compiled:', opt%use_dfa, 'states:', opt%dfa%num_states, 'too_large:', opt%dfa%too_large
      
        226
            end if
      
        227
        
        228
            opt%optimized = .true.
      
        229
        
        230
          end subroutine optimize_nfa
      
        231
        
        232
          subroutine extract_prefix_and_anchors(opt)
      
        233
            type(optimized_nfa_t), intent(inout) :: opt
      
        234
            integer :: state, i, prefix_len
      
        235
            type(nfa_transition_t) :: trans
      
        236
            logical :: done
      
        237
        
        238
            prefix_len = 0
      
        239
            state = opt%nfa%start_state
      
        240
        
        241
            ! Check for start anchor
      
        242
            if (state >= 1 .and. state <= opt%nfa%num_states) then
      
        243
              do i = 1, opt%nfa%states(state)%num_trans
      
        244
                trans = opt%nfa%states(state)%trans(i)
      
        245
                if (trans%trans_type == TRANS_ANCHOR .and. trans%anchor_type == 1) then
      
        246
                  opt%anchored_start = .true.
      
        247
                  state = trans%target
      
        248
                  exit
      
        249
                end if
      
        250
              end do
      
        251
            end if
      
        252
        
        253
            ! Extract literal prefix by following single-path character transitions
      
        254
            done = .false.
      
        255
            do while (.not. done .and. prefix_len < MAX_PREFIX_LEN)
      
        256
              if (state < 1 .or. state > opt%nfa%num_states) exit
      
        257
              if (opt%nfa%states(state)%num_trans /= 1) exit
      
        258
        
        259
              trans = opt%nfa%states(state)%trans(1)
      
        260
        
        261
              if (trans%trans_type == TRANS_CHAR) then
      
        262
                prefix_len = prefix_len + 1
      
        263
                opt%prefix(prefix_len:prefix_len) = trans%match_char
      
        264
                state = trans%target
      
        265
              else if (trans%trans_type == TRANS_EPSILON) then
      
        266
                ! Follow epsilon, but only if it's the only transition
      
        267
                state = trans%target
      
        268
              else
      
        269
                done = .true.
      
        270
              end if
      
        271
            end do
      
        272
        
        273
            opt%prefix_len = prefix_len
      
        274
        
        275
            ! Check for end anchor (scan accept state's incoming transitions)
      
        276
            ! This is approximate - just check if accept state has anchor transition
      
        277
            if (opt%nfa%accept_state >= 1 .and. opt%nfa%accept_state <= opt%nfa%num_states) then
      
        278
              ! Check states that point to accept state
      
        279
              do state = 1, opt%nfa%num_states
      
        280
                do i = 1, opt%nfa%states(state)%num_trans
      
        281
                  trans = opt%nfa%states(state)%trans(i)
      
        282
                  if (trans%target == opt%nfa%accept_state) then
      
        283
                    if (trans%trans_type == TRANS_ANCHOR .and. trans%anchor_type == 2) then
      
        284
                      opt%anchored_end = .true.
      
        285
                    end if
      
        286
                  end if
      
        287
                end do
      
        288
              end do
      
        289
            end if
      
        290
        
        291
          end subroutine extract_prefix_and_anchors
      
        292
        
        293
          subroutine build_skip_table(prefix, prefix_len, skip_table)
      
        294
            character(len=*), intent(in) :: prefix
      
        295
            integer, intent(in) :: prefix_len
      
        296
            integer, intent(out) :: skip_table(0:255)
      
        297
            integer :: i, c
      
        298
        
        299
            ! Default skip is prefix length
      
        300
            skip_table = prefix_len
      
        301
        
        302
            ! Set skip distances for characters in prefix
      
        303
            do i = 1, prefix_len - 1
      
        304
              c = ichar(prefix(i:i))
      
        305
              skip_table(c) = prefix_len - i
      
        306
            end do
      
        307
          end subroutine build_skip_table
      
        308
        
        309
          subroutine precompute_start_closure(opt)
      
        310
            type(optimized_nfa_t), intent(inout) :: opt
      
        311
            ! Compute basic epsilon closure of start state
      
        312
            ! (Some closures depend on position for anchors, handle those at runtime)
      
        313
            call opt%start_closure%clear()
      
        314
            call compute_epsilon_closure_basic(opt%nfa, opt%nfa%start_state, opt%start_closure)
      
        315
          end subroutine precompute_start_closure
      
        316
        
        317
          subroutine compute_epsilon_closure_basic(nfa, start_state, result_set)
      
        318
            type(nfa_t), intent(in) :: nfa
      
        319
            integer, intent(in) :: start_state
      
        320
            type(state_set_t), intent(inout) :: result_set
      
        321
        
        322
            integer :: stack(MAX_STATES), stack_top
      
        323
            integer :: state, i, target
      
        324
            type(nfa_transition_t) :: trans
      
        325
        
        326
            stack_top = 1
      
        327
            stack(1) = start_state
      
        328
        
        329
            do while (stack_top > 0)
      
        330
              state = stack(stack_top)
      
        331
              stack_top = stack_top - 1
      
        332
        
        333
              if (state < 1 .or. state > nfa%num_states) cycle
      
        334
              if (result_set%contains(state)) cycle
      
        335
        
        336
              call result_set%add(state)
      
        337
        
        338
              ! Follow epsilon transitions (not anchors - those are position-dependent)
      
        339
              do i = 1, nfa%states(state)%num_trans
      
        340
                trans = nfa%states(state)%trans(i)
      
        341
                if (trans%trans_type == TRANS_EPSILON .and. trans%anchor_type >= 0) then
      
        342
                  target = trans%target
      
        343
                  if (target >= 1 .and. target <= nfa%num_states) then
      
        344
                    if (.not. result_set%contains(target)) then
      
        345
                      stack_top = stack_top + 1
      
        346
                      if (stack_top <= MAX_STATES) stack(stack_top) = target
      
        347
                    end if
      
        348
                  end if
      
        349
                end if
      
        350
              end do
      
        351
            end do
      
        352
          end subroutine compute_epsilon_closure_basic
      
        353
        
        354
          subroutine precompute_all_epsilon_closures(opt)
      
        355
            !> Pre-compute epsilon closure for every NFA state
      
        356
            !> This allows O(1) closure lookup during matching instead of repeated traversal
      
        357
            type(optimized_nfa_t), intent(inout) :: opt
      
        358
        
        359
            integer :: i, n
      
        360
        
        361
            n = opt%nfa%num_states
      
        362
            if (n <= 0) return
      
        363
        
        364
            ! Allocate epsilon closures array
      
        365
            if (allocated(opt%epsilon_closures)) deallocate(opt%epsilon_closures)
      
        366
            allocate(opt%epsilon_closures(n))
      
        367
        
        368
            ! Compute epsilon closure for each state
      
        369
            do i = 1, n
      
        370
              call opt%epsilon_closures(i)%clear()
      
        371
              call compute_epsilon_closure_basic(opt%nfa, i, opt%epsilon_closures(i))
      
        372
            end do
      
        373
          end subroutine precompute_all_epsilon_closures
      
        374
        
        375
          function has_anchor_transitions(nfa) result(has_anchors)
      
        376
            !> Check if NFA has any anchor transitions (position-dependent)
      
        377
            !> These include ^, $, \<, \>, \b, \B
      
        378
            type(nfa_t), intent(in) :: nfa
      
        379
            logical :: has_anchors
      
        380
        
        381
            integer :: state, i
      
        382
            type(nfa_transition_t) :: trans
      
        383
        
        384
            has_anchors = .false.
      
        385
        
        386
            do state = 1, nfa%num_states
      
        387
              do i = 1, nfa%states(state)%num_trans
      
        388
                trans = nfa%states(state)%trans(i)
      
        389
                if (trans%trans_type == TRANS_ANCHOR) then
      
        390
                  has_anchors = .true.
      
        391
                  return
      
        392
                end if
      
        393
              end do
      
        394
            end do
      
        395
          end function has_anchor_transitions
      
        396
        
        397
          function has_backref_transitions(nfa) result(has_backrefs)
      
        398
            !> Check if NFA has any backreference transitions
      
        399
            !> Backrefs are encoded as TRANS_EPSILON with negative anchor_type
      
        400
            type(nfa_t), intent(in) :: nfa
      
        401
            logical :: has_backrefs
      
        402
        
        403
            integer :: state, i
      
        404
            type(nfa_transition_t) :: trans
      
        405
        
        406
            has_backrefs = .false.
      
        407
        
        408
            do state = 1, nfa%num_states
      
        409
              do i = 1, nfa%states(state)%num_trans
      
        410
                trans = nfa%states(state)%trans(i)
      
        411
                if (trans%trans_type == TRANS_EPSILON .and. trans%anchor_type < 0) then
      
        412
                  has_backrefs = .true.
      
        413
                  return
      
        414
                end if
      
        415
              end do
      
        416
            end do
      
        417
          end function has_backref_transitions
      
        418
        
        419
          !---------------------------------------------------------------------------
      
        420
          ! Character Equivalence Classes
      
        421
          !---------------------------------------------------------------------------
      
        422
        
        423
          subroutine compute_equiv_classes(nfa, char_to_class, num_classes)
      
        424
            !> Compute character equivalence classes from NFA transitions
      
        425
            !> Characters with identical behavior across all NFA states belong to same class
      
        426
            !> This reduces DFA transition table from 256 entries to num_classes entries
      
        427
            type(nfa_t), intent(in) :: nfa
      
        428
            integer, intent(out) :: char_to_class(0:255)
      
        429
            integer, intent(out) :: num_classes
      
        430
        
        431
            ! Signature for each character: encodes which transitions it triggers
      
        432
            ! We use a simple approach: hash the set of (state, target) pairs for each char
      
        433
            integer(8) :: char_signature(0:255)
      
        434
            integer :: state, i, c, target
      
        435
            type(nfa_transition_t) :: trans
      
        436
            integer(8) :: sig
      
        437
            integer :: class_map(0:255)  ! signature hash -> class index
      
        438
            logical :: found
      
        439
        
        440
            ! Initialize all characters to have signature 0 (no transitions)
      
        441
            char_signature = 0_8
      
        442
        
        443
            ! Build signature for each character based on NFA transitions
      
        444
            do state = 1, nfa%num_states
      
        445
              do i = 1, nfa%states(state)%num_trans
      
        446
                trans = nfa%states(state)%trans(i)
      
        447
                target = trans%target
      
        448
        
        449
                select case (trans%trans_type)
      
        450
                  case (TRANS_CHAR)
      
        451
                    ! Single character transition
      
        452
                    c = ichar(trans%match_char)
      
        453
                    ! Add (state, target) to signature using FNV-1a-like hash
      
        454
                    char_signature(c) = ieor(char_signature(c), &
      
        455
                                             int(state * 31 + target, 8) * 1099511628211_8)
      
        456
        
        457
                  case (TRANS_CLASS)
      
        458
                    ! Character class transition - add to all matching chars
      
        459
                    do c = 0, 255
      
        460
                      if (charclass_test(trans%char_bits, char(c))) then
      
        461
                        char_signature(c) = ieor(char_signature(c), &
      
        462
                                                 int(state * 31 + target, 8) * 1099511628211_8)
      
        463
                      end if
      
        464
                    end do
      
        465
        
        466
                  case (TRANS_ANY)
      
        467
                    ! Dot matches all except newline
      
        468
                    do c = 0, 255
      
        469
                      if (c /= 10) then  ! Not newline
      
        470
                        char_signature(c) = ieor(char_signature(c), &
      
        471
                                                 int(state * 31 + target, 8) * 1099511628211_8)
      
        472
                      end if
      
        473
                    end do
      
        474
                end select
      
        475
              end do
      
        476
            end do
      
        477
        
        478
            ! Force each alphabetic character to have a unique signature
      
        479
            ! This ensures they get their own equivalence classes, so the case-folding
      
        480
            ! code in DFA compilation works correctly (it relies on the class representative
      
        481
            ! being alphabetic to compute transitions for both cases)
      
        482
            do c = ichar('a'), ichar('z')
      
        483
              ! Add unique value to each letter's signature to separate them from non-letters
      
        484
              char_signature(c) = ieor(char_signature(c), int(c * 7919 + 1, 8))
      
        485
              char_signature(c - 32) = ieor(char_signature(c - 32), int((c - 32) * 7919 + 1, 8))
      
        486
            end do
      
        487
        
        488
            ! Now group characters by signature
      
        489
            num_classes = 0
      
        490
            class_map = -1
      
        491
            char_to_class = 0
      
        492
        
        493
            do c = 0, 255
      
        494
              sig = char_signature(c)
      
        495
        
        496
              ! Look for existing class with this signature
      
        497
              found = .false.
      
        498
              do i = 0, num_classes - 1
      
        499
                if (class_map(i) /= -1) then
      
        500
                  ! Check if any character in class i has same signature
      
        501
                  ! We stored the signature hash as a proxy
      
        502
                  if (char_signature(class_map(i)) == sig) then
      
        503
                    char_to_class(c) = i
      
        504
                    found = .true.
      
        505
                    exit
      
        506
                  end if
      
        507
                end if
      
        508
              end do
      
        509
        
        510
              if (.not. found) then
      
        511
                ! Create new class
      
        512
                char_to_class(c) = num_classes
      
        513
                class_map(num_classes) = c  ! Remember one char from this class
      
        514
                num_classes = num_classes + 1
      
        515
              end if
      
        516
            end do
      
        517
        
        518
            ! Ensure at least one class
      
        519
            if (num_classes == 0) num_classes = 1
      
        520
        
        521
          end subroutine compute_equiv_classes
      
        522
        
        523
          !---------------------------------------------------------------------------
      
        524
          ! DFA Compilation: Convert NFA to DFA for O(n) matching
      
        525
          !---------------------------------------------------------------------------
      
        526
        
        527
          subroutine compile_dfa(opt)
      
        528
            !> Compile NFA to DFA using subset construction
      
        529
            !> Uses character equivalence classes to reduce compilation time
      
        530
            type(optimized_nfa_t), intent(inout) :: opt
      
        531
        
        532
            type(state_set_t) :: start_set, next_set
      
        533
            integer :: worklist(MAX_DFA_STATES), work_head, work_tail
      
        534
            integer :: dfa_idx, char_code, next_idx, old_num_states
      
        535
            integer :: class_idx, c
      
        536
            integer :: class_representative(0:255)  ! One char per class
      
        537
            integer :: class_transitions(0:255)     ! Computed transition per class
      
        538
        
        539
            ! Allocate DFA states
      
        540
            if (allocated(opt%dfa%states)) deallocate(opt%dfa%states)
      
        541
            allocate(opt%dfa%states(MAX_DFA_STATES))
      
        542
            opt%dfa%num_states = 0
      
        543
            opt%dfa%compiled = .false.
      
        544
            opt%dfa%too_large = .false.
      
        545
            opt%use_dfa = .false.
      
        546
        
        547
            ! Compute character equivalence classes
      
        548
            call compute_equiv_classes(opt%nfa, opt%dfa%char_to_class, opt%dfa%num_classes)
      
        549
            opt%dfa%use_equiv_classes = (opt%dfa%num_classes < 256)
      
        550
        
        551
            ! Build representative character for each class
      
        552
            class_representative = -1
      
        553
            do c = 0, 255
      
        554
              class_idx = opt%dfa%char_to_class(c)
      
        555
              if (class_representative(class_idx) == -1) then
      
        556
                class_representative(class_idx) = c
      
        557
              end if
      
        558
            end do
      
        559
        
        560
            ! Compute start state: epsilon closure of NFA start
      
        561
            call start_set%clear()
      
        562
            call compute_epsilon_closure_basic(opt%nfa, opt%nfa%start_state, start_set)
      
        563
        
        564
            if (start_set%is_empty()) return
      
        565
        
        566
            ! Create initial DFA state
      
        567
            opt%dfa%num_states = 1
      
        568
            opt%dfa%states(1)%nfa_states = start_set
      
        569
            opt%dfa%states(1)%state_hash = start_set%hash()
      
        570
            opt%dfa%states(1)%is_accept = is_accepting_set(opt%nfa, start_set)
      
        571
            opt%dfa%start_state = 1
      
        572
        
        573
            ! Initialize worklist with start state
      
        574
            work_head = 1
      
        575
            work_tail = 1
      
        576
            worklist(1) = 1
      
        577
        
        578
            ! Process worklist: for each DFA state, compute transitions
      
        579
            do while (work_head <= work_tail)
      
        580
              dfa_idx = worklist(work_head)
      
        581
              work_head = work_head + 1
      
        582
        
        583
              ! First, compute transitions for each equivalence class (not all 256 chars)
      
        584
              class_transitions = DFA_DEAD_STATE
      
        585
        
        586
              do class_idx = 0, opt%dfa%num_classes - 1
      
        587
                char_code = class_representative(class_idx)
      
        588
                if (char_code < 0) cycle
      
        589
        
        590
                call next_set%clear()
      
        591
        
        592
                ! Compute NFA transitions for this character
      
        593
                ! Note: DFA is case-sensitive. Case-insensitive matching uses NFA path.
      
        594
                call compute_char_transitions_simple(opt%nfa, opt%dfa%states(dfa_idx)%nfa_states, &
      
        595
                                                     char(char_code), next_set)
      
        596
        
        597
                ! Compute epsilon closure of result
      
        598
                if (.not. next_set%is_empty()) then
      
        599
                  call expand_epsilon_closure_simple(opt, next_set)
      
        600
                end if
      
        601
        
        602
                if (next_set%is_empty()) then
      
        603
                  class_transitions(class_idx) = DFA_DEAD_STATE
      
        604
                else
      
        605
                  ! Find or create DFA state for this NFA state set
      
        606
                  old_num_states = opt%dfa%num_states
      
        607
                  next_idx = find_or_create_dfa_state(opt%dfa, next_set, opt%nfa)
      
        608
        
        609
                  if (next_idx == -1) then
      
        610
                    opt%dfa%too_large = .true.
      
        611
                    opt%dfa%compiled = .false.
      
        612
                    return
      
        613
                  end if
      
        614
        
        615
                  class_transitions(class_idx) = next_idx
      
        616
        
        617
                  ! Add new state to worklist only if it was just created
      
        618
                  if (opt%dfa%num_states > old_num_states) then
      
        619
                    work_tail = work_tail + 1
      
        620
                    if (work_tail > MAX_DFA_STATES) then
      
        621
                      opt%dfa%too_large = .true.
      
        622
                      opt%dfa%compiled = .false.
      
        623
                      return
      
        624
                    end if
      
        625
                    worklist(work_tail) = next_idx
      
        626
                  end if
      
        627
                end if
      
        628
              end do
      
        629
        
        630
              ! Now fill in the full 256-entry transition table from class transitions
      
        631
              do c = 0, 255
      
        632
                class_idx = opt%dfa%char_to_class(c)
      
        633
                opt%dfa%states(dfa_idx)%transitions(c) = class_transitions(class_idx)
      
        634
              end do
      
        635
            end do
      
        636
        
        637
            ! Minimize DFA to reduce state count
      
        638
            call minimize_dfa(opt%dfa)
      
        639
        
        640
            opt%dfa%compiled = .true.
      
        641
            opt%use_dfa = .true.
      
        642
        
        643
          end subroutine compile_dfa
      
        644
        
        645
          function find_or_create_dfa_state(dfa, nfa_states, nfa) result(idx)
      
        646
            !> Find existing DFA state for NFA state set, or create new one
      
        647
            !> Returns -1 if DFA state limit exceeded
      
        648
            type(compiled_dfa_t), intent(inout) :: dfa
      
        649
            type(state_set_t), intent(in) :: nfa_states
      
        650
            type(nfa_t), intent(in) :: nfa
      
        651
            integer :: idx
      
        652
        
        653
            integer(8) :: h
      
        654
            integer :: i
      
        655
        
        656
            h = nfa_states%hash()
      
        657
        
        658
            ! Search existing states
      
        659
            do i = 1, dfa%num_states
      
        660
              if (dfa%states(i)%state_hash == h .and. &
      
        661
                  dfa%states(i)%nfa_states%equals(nfa_states)) then
      
        662
                idx = i
      
        663
                return
      
        664
              end if
      
        665
            end do
      
        666
        
        667
            ! Create new state
      
        668
            if (dfa%num_states >= MAX_DFA_STATES) then
      
        669
              idx = -1
      
        670
              return
      
        671
            end if
      
        672
        
        673
            dfa%num_states = dfa%num_states + 1
      
        674
            idx = dfa%num_states
      
        675
            dfa%states(idx)%nfa_states = nfa_states
      
        676
            dfa%states(idx)%state_hash = h
      
        677
            dfa%states(idx)%is_accept = is_accepting_set(nfa, nfa_states)
      
        678
            dfa%states(idx)%transitions = DFA_DEAD_STATE
      
        679
        
        680
          end function find_or_create_dfa_state
      
        681
        
        682
          subroutine minimize_dfa(dfa)
      
        683
            !> Minimize DFA using Hopcroft's algorithm
      
        684
            !> Merges equivalent states to reduce DFA size
      
        685
            type(compiled_dfa_t), intent(inout) :: dfa
      
        686
        
        687
            integer :: num_states, num_partitions
      
        688
            integer, allocatable :: partition(:)      ! partition(state) = partition ID
      
        689
            integer, allocatable :: part_size(:)      ! Size of each partition
      
        690
            integer, allocatable :: representative(:) ! Representative state for each partition
      
        691
            integer, allocatable :: new_state_id(:)   ! Mapping from old state to new state ID
      
        692
            type(dfa_state_t), allocatable :: new_states(:)
      
        693
        
        694
            logical, allocatable :: in_worklist(:)    ! Is partition in worklist?
      
        695
            integer, allocatable :: worklist(:)       ! Partitions to process
      
        696
            integer :: work_head, work_tail
      
        697
        
        698
            integer :: i, c, state, target, part_id
      
        699
            integer :: num_accept, num_reject
      
        700
            integer :: old_part, new_part_id
      
        701
            logical :: needs_split
      
        702
            integer, allocatable :: split_marker(:)   ! Which states go to partition A on char c
      
        703
            integer :: new_num_states
      
        704
        
        705
            num_states = dfa%num_states
      
        706
            if (num_states <= 1) return  ! Nothing to minimize
      
        707
        
        708
            ! Allocate working arrays
      
        709
            allocate(partition(num_states))
      
        710
            allocate(part_size(num_states))
      
        711
            allocate(representative(num_states))
      
        712
            allocate(new_state_id(num_states))
      
        713
            allocate(in_worklist(num_states))
      
        714
            allocate(worklist(num_states))
      
        715
            allocate(split_marker(num_states))
      
        716
        
        717
            ! Initialize partitions: accepting states = partition 1, non-accepting = partition 2
      
        718
            partition = 0
      
        719
            part_size = 0
      
        720
            num_accept = 0
      
        721
            num_reject = 0
      
        722
        
        723
            do i = 1, num_states
      
        724
              if (dfa%states(i)%is_accept) then
      
        725
                partition(i) = 1
      
        726
                num_accept = num_accept + 1
      
        727
              else
      
        728
                partition(i) = 2
      
        729
                num_reject = num_reject + 1
      
        730
              end if
      
        731
            end do
      
        732
        
        733
            part_size(1) = num_accept
      
        734
            part_size(2) = num_reject
      
        735
            num_partitions = 2
      
        736
        
        737
            ! Handle edge case: all accepting or all rejecting
      
        738
            if (num_accept == 0 .or. num_reject == 0) then
      
        739
              num_partitions = 1
      
        740
              partition = 1
      
        741
              part_size(1) = num_states
      
        742
            end if
      
        743
        
        744
            ! Initialize worklist with smaller partition (Hopcroft optimization)
      
        745
            in_worklist = .false.
      
        746
            work_head = 1
      
        747
            work_tail = 0
      
        748
        
        749
            if (num_partitions == 2) then
      
        750
              if (num_accept <= num_reject) then
      
        751
                work_tail = 1
      
        752
                worklist(1) = 1
      
        753
                in_worklist(1) = .true.
      
        754
              else
      
        755
                work_tail = 1
      
        756
                worklist(1) = 2
      
        757
                in_worklist(2) = .true.
      
        758
              end if
      
        759
            end if
      
        760
        
        761
            ! Main refinement loop
      
        762
            do while (work_head <= work_tail)
      
        763
              part_id = worklist(work_head)
      
        764
              work_head = work_head + 1
      
        765
              in_worklist(part_id) = .false.
      
        766
        
        767
              ! For each character, check if this partition splits others
      
        768
              do c = 0, 255
      
        769
                ! Mark states that transition to partition part_id on character c
      
        770
                split_marker = 0
      
        771
                do state = 1, num_states
      
        772
                  target = dfa%states(state)%transitions(c)
      
        773
                  if (target > 0 .and. target <= num_states) then
      
        774
                    if (partition(target) == part_id) then
      
        775
                      split_marker(state) = 1
      
        776
                    end if
      
        777
                  end if
      
        778
                end do
      
        779
        
        780
                ! Check each existing partition for splits
      
        781
                do old_part = 1, num_partitions
      
        782
                  ! Count states in this partition that go to part_id vs don't
      
        783
                  num_accept = 0  ! Reuse: count going to part_id
      
        784
                  num_reject = 0  ! Reuse: count not going to part_id
      
        785
        
        786
                  do state = 1, num_states
      
        787
                    if (partition(state) == old_part) then
      
        788
                      if (split_marker(state) == 1) then
      
        789
                        num_accept = num_accept + 1
      
        790
                      else
      
        791
                        num_reject = num_reject + 1
      
        792
                      end if
      
        793
                    end if
      
        794
                  end do
      
        795
        
        796
                  ! If partition needs splitting (has both types)
      
        797
                  needs_split = (num_accept > 0 .and. num_reject > 0)
      
        798
        
        799
                  if (needs_split) then
      
        800
                    ! Create new partition for the smaller group
      
        801
                    num_partitions = num_partitions + 1
      
        802
                    new_part_id = num_partitions
      
        803
        
        804
                    ! Move the smaller group to new partition
      
        805
                    if (num_accept <= num_reject) then
      
        806
                      ! Move states going to part_id to new partition
      
        807
                      do state = 1, num_states
      
        808
                        if (partition(state) == old_part .and. split_marker(state) == 1) then
      
        809
                          partition(state) = new_part_id
      
        810
                        end if
      
        811
                      end do
      
        812
                      part_size(new_part_id) = num_accept
      
        813
                      part_size(old_part) = num_reject
      
        814
                    else
      
        815
                      ! Move states NOT going to part_id to new partition
      
        816
                      do state = 1, num_states
      
        817
                        if (partition(state) == old_part .and. split_marker(state) == 0) then
      
        818
                          partition(state) = new_part_id
      
        819
                        end if
      
        820
                      end do
      
        821
                      part_size(new_part_id) = num_reject
      
        822
                      part_size(old_part) = num_accept
      
        823
                    end if
      
        824
        
        825
                    ! Update worklist
      
        826
                    if (in_worklist(old_part)) then
      
        827
                      ! Both halves need to be in worklist
      
        828
                      work_tail = work_tail + 1
      
        829
                      worklist(work_tail) = new_part_id
      
        830
                      in_worklist(new_part_id) = .true.
      
        831
                    else
      
        832
                      ! Add smaller partition to worklist
      
        833
                      if (part_size(new_part_id) <= part_size(old_part)) then
      
        834
                        work_tail = work_tail + 1
      
        835
                        worklist(work_tail) = new_part_id
      
        836
                        in_worklist(new_part_id) = .true.
      
        837
                      else
      
        838
                        work_tail = work_tail + 1
      
        839
                        worklist(work_tail) = old_part
      
        840
                        in_worklist(old_part) = .true.
      
        841
                      end if
      
        842
                    end if
      
        843
                  end if
      
        844
                end do
      
        845
              end do
      
        846
            end do
      
        847
        
        848
            ! Check if minimization actually reduced states
      
        849
            if (num_partitions >= num_states) then
      
        850
              ! No reduction possible
      
        851
              deallocate(partition, part_size, representative, new_state_id)
      
        852
              deallocate(in_worklist, worklist, split_marker)
      
        853
              return
      
        854
            end if
      
        855
        
        856
            ! Find representative for each partition (lowest numbered state)
      
        857
            representative = 0
      
        858
            do state = 1, num_states
      
        859
              part_id = partition(state)
      
        860
              if (representative(part_id) == 0) then
      
        861
                representative(part_id) = state
      
        862
              end if
      
        863
            end do
      
        864
        
        865
            ! Build new state IDs (compact numbering)
      
        866
            new_state_id = 0
      
        867
            new_num_states = 0
      
        868
            do part_id = 1, num_partitions
      
        869
              if (representative(part_id) > 0) then
      
        870
                new_num_states = new_num_states + 1
      
        871
                ! Map all states in this partition to new state ID
      
        872
                do state = 1, num_states
      
        873
                  if (partition(state) == part_id) then
      
        874
                    new_state_id(state) = new_num_states
      
        875
                  end if
      
        876
                end do
      
        877
              end if
      
        878
            end do
      
        879
        
        880
            ! Build minimized DFA
      
        881
            allocate(new_states(new_num_states))
      
        882
        
        883
            do part_id = 1, num_partitions
      
        884
              state = representative(part_id)
      
        885
              if (state == 0) cycle
      
        886
        
        887
              i = new_state_id(state)
      
        888
              new_states(i)%is_accept = dfa%states(state)%is_accept
      
        889
              new_states(i)%state_hash = dfa%states(state)%state_hash
      
        890
              new_states(i)%nfa_states = dfa%states(state)%nfa_states
      
        891
        
        892
              ! Remap transitions
      
        893
              do c = 0, 255
      
        894
                target = dfa%states(state)%transitions(c)
      
        895
                if (target > 0 .and. target <= num_states) then
      
        896
                  new_states(i)%transitions(c) = new_state_id(target)
      
        897
                else
      
        898
                  new_states(i)%transitions(c) = DFA_DEAD_STATE
      
        899
                end if
      
        900
              end do
      
        901
            end do
      
        902
        
        903
            ! Update DFA with minimized version
      
        904
            deallocate(dfa%states)
      
        905
            allocate(dfa%states(new_num_states))
      
        906
            dfa%states = new_states
      
        907
            dfa%start_state = new_state_id(dfa%start_state)
      
        908
            dfa%num_states = new_num_states
      
        909
        
        910
            ! Cleanup
      
        911
            deallocate(partition, part_size, representative, new_state_id)
      
        912
            deallocate(in_worklist, worklist, split_marker, new_states)
      
        913
        
        914
          end subroutine minimize_dfa
      
        915
        
        916
          subroutine compute_char_transitions_simple(nfa, current, c, next_set)
      
        917
            !> Compute character transitions without case folding (for DFA compilation)
      
        918
            type(nfa_t), intent(in) :: nfa
      
        919
            type(state_set_t), intent(in) :: current
      
        920
            character(len=1), intent(in) :: c
      
        921
            type(state_set_t), intent(inout) :: next_set
      
        922
        
        923
            integer :: state, word_idx, bit_idx, i
      
        924
            integer(8) :: word, mask
      
        925
            type(nfa_transition_t) :: trans
      
        926
        
        927
            do word_idx = 1, size(current%bits)
      
        928
              word = current%bits(word_idx)
      
        929
              if (word == 0) cycle
      
        930
        
        931
              do bit_idx = 0, 63
      
        932
                mask = ishft(1_8, bit_idx)
      
        933
                if (iand(word, mask) /= 0) then
      
        934
                  state = (word_idx - 1) * 64 + bit_idx + 1
      
        935
                  if (state > nfa%num_states) cycle
      
        936
        
        937
                  do i = 1, nfa%states(state)%num_trans
      
        938
                    trans = nfa%states(state)%trans(i)
      
        939
        
        940
                    select case (trans%trans_type)
      
        941
                      case (TRANS_CHAR)
      
        942
                        if (c == trans%match_char) then
      
        943
                          call next_set%add(trans%target)
      
        944
                        end if
      
        945
        
        946
                      case (TRANS_CLASS)
      
        947
                        if (trans%char_class(ichar(c)) .neqv. trans%negated) then
      
        948
                          call next_set%add(trans%target)
      
        949
                        end if
      
        950
        
        951
                      case (TRANS_ANY)
      
        952
                        if (c /= char(10)) then
      
        953
                          call next_set%add(trans%target)
      
        954
                        end if
      
        955
                    end select
      
        956
                  end do
      
        957
                end if
      
        958
              end do
      
        959
            end do
      
        960
          end subroutine compute_char_transitions_simple
      
        961
        
        962
          subroutine expand_epsilon_closure_simple(opt, state_set)
      
        963
            !> Expand state set to include epsilon closure (in-place)
      
        964
            !> Uses pre-computed closures for O(1) lookup per state
      
        965
            type(optimized_nfa_t), intent(in) :: opt
      
        966
            type(state_set_t), intent(inout) :: state_set
      
        967
        
        968
            integer :: word_idx, bit_idx, state, j
      
        969
            integer(8) :: word, mask, original_bits(size(state_set%bits))
      
        970
        
        971
            ! If no pre-computed closures, fall back to computing on-the-fly
      
        972
            if (.not. allocated(opt%epsilon_closures)) then
      
        973
              call expand_epsilon_closure_simple_fallback(opt%nfa, state_set)
      
        974
              return
      
        975
            end if
      
        976
        
        977
            ! Save original bits to avoid processing newly added states
      
        978
            original_bits = state_set%bits
      
        979
        
        980
            ! Expand using pre-computed closures - just OR the bit vectors
      
        981
            do word_idx = 1, size(original_bits)
      
        982
              word = original_bits(word_idx)
      
        983
              if (word == 0) cycle
      
        984
        
        985
              do bit_idx = 0, 63
      
        986
                mask = ishft(1_8, bit_idx)
      
        987
                if (iand(word, mask) /= 0) then
      
        988
                  state = (word_idx - 1) * 64 + bit_idx + 1
      
        989
                  if (state >= 1 .and. state <= opt%nfa%num_states) then
      
        990
                    ! Merge pre-computed epsilon closure using bitwise OR
      
        991
                    do j = 1, size(state_set%bits)
      
        992
                      state_set%bits(j) = ior(state_set%bits(j), opt%epsilon_closures(state)%bits(j))
      
        993
                    end do
      
        994
                  end if
      
        995
                end if
      
        996
              end do
      
        997
            end do
      
        998
          end subroutine expand_epsilon_closure_simple
      
        999
        
        1000
          subroutine expand_epsilon_closure_simple_fallback(nfa, state_set)
      
        1001
            !> Fallback: compute epsilon closure on-the-fly
      
        1002
            type(nfa_t), intent(in) :: nfa
      
        1003
            type(state_set_t), intent(inout) :: state_set
      
        1004
        
        1005
            type(state_set_t) :: result
      
        1006
            integer :: word_idx, bit_idx, state
      
        1007
            integer(8) :: word, mask
      
        1008
        
        1009
            call result%clear()
      
        1010
        
        1011
            do word_idx = 1, size(state_set%bits)
      
        1012
              word = state_set%bits(word_idx)
      
        1013
              if (word == 0) cycle
      
        1014
        
        1015
              do bit_idx = 0, 63
      
        1016
                mask = ishft(1_8, bit_idx)
      
        1017
                if (iand(word, mask) /= 0) then
      
        1018
                  state = (word_idx - 1) * 64 + bit_idx + 1
      
        1019
                  if (state <= nfa%num_states) then
      
        1020
                    call compute_epsilon_closure_basic(nfa, state, result)
      
        1021
                  end if
      
        1022
                end if
      
        1023
              end do
      
        1024
            end do
      
        1025
        
        1026
            call state_set%copy_from(result)
      
        1027
          end subroutine expand_epsilon_closure_simple_fallback
      
        1028
        
        1029
          !---------------------------------------------------------------------------
      
        1030
          ! Optimized Search: Use prefix to skip positions
      
        1031
          !---------------------------------------------------------------------------
      
        1032
        
        1033
          function optimized_search(opt, text, ignore_case) result(res)
      
        1034
            type(optimized_nfa_t), intent(inout) :: opt
      
        1035
            character(len=*), intent(in) :: text
      
        1036
            logical, intent(in) :: ignore_case
      
        1037
            type(match_result_t) :: res
      
        1038
        
        1039
            integer :: text_len, pos, skip
      
        1040
            type(match_result_t) :: try_res
      
        1041
        
        1042
            res%matched = .false.
      
        1043
            text_len = len_trim(text)
      
        1044
        
        1045
            ! Fast path: use Aho-Corasick for alternation patterns
      
        1046
            ! Only use AC if ignore_case setting matches what was compiled
      
        1047
            if (opt%use_aho_corasick) then
      
        1048
              if (ignore_case .eqv. opt%ac%ignore_case) then
      
        1049
                res = ac_optimized_search(opt%ac, text)
      
        1050
                return
      
        1051
              end if
      
        1052
            end if
      
        1053
        
        1054
            if (opt%nfa%num_states == 0) return
      
        1055
        
        1056
            ! Backtracking path: use backtracking matcher for patterns with backreferences
      
        1057
            if (opt%has_backrefs) then
      
        1058
              res = backtrack_search(opt%nfa, text, text_len, ignore_case)
      
        1059
              return
      
        1060
            end if
      
        1061
        
        1062
            ! Fast path: use DFA if available (O(n) matching)
      
        1063
            ! DFA is case-sensitive; case-insensitive matching falls through to NFA path
      
        1064
            if (opt%use_dfa .and. .not. ignore_case) then
      
        1065
              res = dfa_search(opt%dfa, text, text_len)
      
        1066
              return
      
        1067
            end if
      
        1068
        
        1069
            ! Fast path: anchored start - only try position 1
      
        1070
            if (opt%anchored_start) then
      
        1071
              res = optimized_match(opt, text, 1, ignore_case)
      
        1072
              return
      
        1073
            end if
      
        1074
        
        1075
            ! Use prefix to skip positions (Boyer-Moore style)
      
        1076
            if (opt%prefix_len > 0 .and. .not. ignore_case) then
      
        1077
              pos = opt%prefix_len
      
        1078
              do while (pos <= text_len)
      
        1079
                ! Check if prefix matches at this position
      
        1080
                if (prefix_matches(text, pos - opt%prefix_len + 1, opt%prefix, opt%prefix_len)) then
      
        1081
                  ! Try full NFA match from this position
      
        1082
                  try_res = optimized_match(opt, text, pos - opt%prefix_len + 1, ignore_case)
      
        1083
                  if (try_res%matched) then
      
        1084
                    res = try_res
      
        1085
                    return
      
        1086
                  end if
      
        1087
                  pos = pos + 1
      
        1088
                else
      
        1089
                  ! Skip based on mismatched character
      
        1090
                  skip = opt%skip_table(ichar(text(pos:pos)))
      
        1091
                  pos = pos + max(skip, 1)
      
        1092
                end if
      
        1093
              end do
      
        1094
            else
      
        1095
              ! No prefix optimization - try each position
      
        1096
              do pos = 1, text_len + 1
      
        1097
                try_res = optimized_match(opt, text, pos, ignore_case)
      
        1098
                if (try_res%matched) then
      
        1099
                  res = try_res
      
        1100
                  return
      
        1101
                end if
      
        1102
              end do
      
        1103
            end if
      
        1104
        
        1105
          end function optimized_search
      
        1106
        
        1107
          function dfa_search(dfa, text, text_len) result(res)
      
        1108
            !> Fast O(n) DFA-based search
      
        1109
            !> Tries each starting position and returns first match
      
        1110
            type(compiled_dfa_t), intent(in) :: dfa
      
        1111
            character(len=*), intent(in) :: text
      
        1112
            integer, intent(in) :: text_len
      
        1113
            type(match_result_t) :: res
      
        1114
        
        1115
            integer :: start_pos
      
        1116
            type(match_result_t) :: try_res
      
        1117
        
        1118
            res%matched = .false.
      
        1119
        
        1120
            if (.not. dfa%compiled .or. dfa%num_states == 0) return
      
        1121
        
        1122
            ! Try each starting position
      
        1123
            do start_pos = 1, text_len + 1
      
        1124
              try_res = dfa_match(dfa, text, text_len, start_pos)
      
        1125
              if (try_res%matched) then
      
        1126
                res = try_res
      
        1127
                return
      
        1128
              end if
      
        1129
            end do
      
        1130
        
        1131
          end function dfa_search
      
        1132
        
        1133
          function dfa_match(dfa, text, text_len, start_pos) result(res)
      
        1134
            !> O(n) DFA matching from a specific position
      
        1135
            !> Just follows transition table - no state set operations
      
        1136
            type(compiled_dfa_t), intent(in) :: dfa
      
        1137
            character(len=*), intent(in) :: text
      
        1138
            integer, intent(in) :: text_len, start_pos
      
        1139
            type(match_result_t) :: res
      
        1140
        
        1141
            integer :: state, pos, char_code
      
        1142
        
        1143
            res%matched = .false.
      
        1144
        
        1145
            if (.not. dfa%compiled) return
      
        1146
        
        1147
            state = dfa%start_state
      
        1148
            pos = start_pos
      
        1149
        
        1150
            ! Check if start state is accepting (empty match)
      
        1151
            if (dfa%states(state)%is_accept) then
      
        1152
              res%matched = .true.
      
        1153
              res%match_start = start_pos
      
        1154
              res%match_end = start_pos - 1
      
        1155
            end if
      
        1156
        
        1157
            ! Process each character
      
        1158
            do while (pos <= text_len)
      
        1159
              char_code = ichar(text(pos:pos))
      
        1160
              state = dfa%states(state)%transitions(char_code)
      
        1161
        
        1162
              if (state == DFA_DEAD_STATE) exit
      
        1163
        
        1164
              pos = pos + 1
      
        1165
        
        1166
              ! Check for acceptance (greedy - find longest)
      
        1167
              if (dfa%states(state)%is_accept) then
      
        1168
                res%matched = .true.
      
        1169
                res%match_start = start_pos
      
        1170
                res%match_end = pos - 1
      
        1171
              end if
      
        1172
            end do
      
        1173
        
        1174
          end function dfa_match
      
        1175
        
        1176
          function prefix_matches(text, pos, prefix, prefix_len) result(matches)
      
        1177
            character(len=*), intent(in) :: text
      
        1178
            integer, intent(in) :: pos, prefix_len
      
        1179
            character(len=*), intent(in) :: prefix
      
        1180
            logical :: matches
      
        1181
            integer :: i, text_len
      
        1182
        
        1183
            matches = .false.
      
        1184
            text_len = len_trim(text)
      
        1185
        
        1186
            if (pos < 1 .or. pos + prefix_len - 1 > text_len) return
      
        1187
        
        1188
            do i = 1, prefix_len
      
        1189
              if (text(pos+i-1:pos+i-1) /= prefix(i:i)) return
      
        1190
            end do
      
        1191
        
        1192
            matches = .true.
      
        1193
          end function prefix_matches
      
        1194
        
        1195
          !---------------------------------------------------------------------------
      
        1196
          ! Optimized Match: Use bit vectors and caching
      
        1197
          !---------------------------------------------------------------------------
      
        1198
        
        1199
          function optimized_match(opt, text, start_pos, ignore_case) result(res)
      
        1200
            type(optimized_nfa_t), intent(inout) :: opt
      
        1201
            character(len=*), intent(in) :: text
      
        1202
            integer, intent(in) :: start_pos
      
        1203
            logical, intent(in) :: ignore_case
      
        1204
            type(match_result_t) :: res
      
        1205
        
        1206
            type(state_set_t) :: current, next_set
      
        1207
            integer :: pos, text_len
      
        1208
            character(len=1) :: c
      
        1209
        
        1210
            res%matched = .false.
      
        1211
            text_len = len_trim(text)
      
        1212
        
        1213
            if (opt%nfa%num_states == 0) return
      
        1214
        
        1215
            ! Initialize with epsilon closure of start state (including position-dependent anchors)
      
        1216
            call current%clear()
      
        1217
            call compute_epsilon_closure_full(opt%nfa, opt%nfa%start_state, current, text, start_pos, text_len)
      
        1218
        
        1219
            ! Check if already accepting (empty pattern)
      
        1220
            if (is_accepting_set(opt%nfa, current)) then
      
        1221
              res%matched = .true.
      
        1222
              res%match_start = start_pos
      
        1223
              res%match_end = start_pos - 1
      
        1224
              return
      
        1225
            end if
      
        1226
        
        1227
            ! Process each character
      
        1228
            pos = start_pos
      
        1229
            do while (pos <= text_len .and. .not. current%is_empty())
      
        1230
              c = text(pos:pos)
      
        1231
        
        1232
              ! Compute next states with DFA caching
      
        1233
              call next_set%clear()
      
        1234
              call step_with_cache(opt, current, c, pos, text, text_len, ignore_case, next_set)
      
        1235
        
        1236
              ! Compute epsilon closure
      
        1237
              call current%clear()
      
        1238
              call expand_epsilon_closure(opt%nfa, next_set, current, text, pos + 1, text_len)
      
        1239
        
        1240
              pos = pos + 1
      
        1241
        
        1242
              ! Check for acceptance (greedy - continue to find longest)
      
        1243
              if (is_accepting_set(opt%nfa, current)) then
      
        1244
                res%matched = .true.
      
        1245
                res%match_start = start_pos
      
        1246
                res%match_end = pos - 1
      
        1247
              end if
      
        1248
            end do
      
        1249
        
        1250
          end function optimized_match
      
        1251
        
        1252
          subroutine compute_epsilon_closure_full(nfa, start_state, result_set, text, pos, text_len)
      
        1253
            type(nfa_t), intent(in) :: nfa
      
        1254
            integer, intent(in) :: start_state
      
        1255
            type(state_set_t), intent(inout) :: result_set
      
        1256
            character(len=*), intent(in) :: text
      
        1257
            integer, intent(in) :: pos, text_len
      
        1258
        
        1259
            integer :: stack(MAX_STATES), stack_top
      
        1260
            integer :: state, i, target
      
        1261
            type(nfa_transition_t) :: trans
      
        1262
        
        1263
            stack_top = 1
      
        1264
            stack(1) = start_state
      
        1265
        
        1266
            do while (stack_top > 0)
      
        1267
              state = stack(stack_top)
      
        1268
              stack_top = stack_top - 1
      
        1269
        
        1270
              if (state < 1 .or. state > nfa%num_states) cycle
      
        1271
              if (result_set%contains(state)) cycle
      
        1272
        
        1273
              call result_set%add(state)
      
        1274
        
        1275
              do i = 1, nfa%states(state)%num_trans
      
        1276
                trans = nfa%states(state)%trans(i)
      
        1277
        
        1278
                if (trans%trans_type == TRANS_EPSILON) then
      
        1279
                  if (trans%anchor_type < 0) cycle  ! Skip backrefs
      
        1280
                  target = trans%target
      
        1281
                  if (target >= 1 .and. target <= nfa%num_states) then
      
        1282
                    if (.not. result_set%contains(target)) then
      
        1283
                      stack_top = stack_top + 1
      
        1284
                      if (stack_top <= MAX_STATES) stack(stack_top) = target
      
        1285
                    end if
      
        1286
                  end if
      
        1287
        
        1288
                else if (trans%trans_type == TRANS_ANCHOR) then
      
        1289
                  if (anchor_matches_opt(trans%anchor_type, text, pos, text_len)) then
      
        1290
                    target = trans%target
      
        1291
                    if (target >= 1 .and. target <= nfa%num_states) then
      
        1292
                      if (.not. result_set%contains(target)) then
      
        1293
                        stack_top = stack_top + 1
      
        1294
                        if (stack_top <= MAX_STATES) stack(stack_top) = target
      
        1295
                      end if
      
        1296
                    end if
      
        1297
                  end if
      
        1298
                end if
      
        1299
              end do
      
        1300
            end do
      
        1301
          end subroutine compute_epsilon_closure_full
      
        1302
        
        1303
          subroutine expand_epsilon_closure(nfa, input_set, result_set, text, pos, text_len)
      
        1304
            type(nfa_t), intent(in) :: nfa
      
        1305
            type(state_set_t), intent(in) :: input_set
      
        1306
            type(state_set_t), intent(inout) :: result_set
      
        1307
            character(len=*), intent(in) :: text
      
        1308
            integer, intent(in) :: pos, text_len
      
        1309
        
        1310
            integer :: state, word_idx, bit_idx
      
        1311
            integer(8) :: word, mask
      
        1312
        
        1313
            ! Iterate through set bits
      
        1314
            do word_idx = 1, size(input_set%bits)
      
        1315
              word = input_set%bits(word_idx)
      
        1316
              if (word == 0) cycle
      
        1317
        
        1318
              do bit_idx = 0, 63
      
        1319
                mask = ishft(1_8, bit_idx)
      
        1320
                if (iand(word, mask) /= 0) then
      
        1321
                  state = (word_idx - 1) * 64 + bit_idx + 1
      
        1322
                  if (state <= nfa%num_states) then
      
        1323
                    call compute_epsilon_closure_full(nfa, state, result_set, text, pos, text_len)
      
        1324
                  end if
      
        1325
                end if
      
        1326
              end do
      
        1327
            end do
      
        1328
          end subroutine expand_epsilon_closure
      
        1329
        
        1330
          subroutine step_with_cache(opt, current, c, pos, text, text_len, ignore_case, next_set)
      
        1331
            !> Compute next states with DFA caching
      
        1332
            !> Cache key: (state_set_hash, char_code, ignore_case)
      
        1333
            !> This avoids recomputing transitions for repeated (state_set, char) pairs
      
        1334
            type(optimized_nfa_t), intent(inout) :: opt
      
        1335
            type(state_set_t), intent(in) :: current
      
        1336
            character(len=1), intent(in) :: c
      
        1337
            integer, intent(in) :: pos, text_len
      
        1338
            character(len=*), intent(in) :: text
      
        1339
            logical, intent(in) :: ignore_case
      
        1340
            type(state_set_t), intent(inout) :: next_set
      
        1341
        
        1342
            integer(8) :: state_hash
      
        1343
            integer :: cache_idx, char_code
      
        1344
        
        1345
            ! Compute cache key
      
        1346
            state_hash = current%hash()
      
        1347
            char_code = ichar(c)
      
        1348
        
        1349
            ! Compute cache index (combine hash with char code)
      
        1350
            cache_idx = int(mod(abs(ieor(state_hash, int(char_code, 8))), int(DFA_CACHE_SIZE, 8))) + 1
      
        1351
        
        1352
            ! Check cache hit (using hash + char + case as key)
      
        1353
            ! Note: This may have rare hash collisions, but performance benefit outweighs risk
      
        1354
            if (opt%dfa_cache(cache_idx)%valid .and. &
      
        1355
                opt%dfa_cache(cache_idx)%state_hash == state_hash .and. &
      
        1356
                opt%dfa_cache(cache_idx)%char_code == char_code .and. &
      
        1357
                (opt%dfa_cache(cache_idx)%is_case_insensitive .eqv. ignore_case)) then
      
        1358
              ! Cache hit - copy cached result
      
        1359
              call next_set%copy_from(opt%dfa_cache(cache_idx)%next_states)
      
        1360
              return
      
        1361
            end if
      
        1362
        
        1363
            ! Cache miss - compute transitions
      
        1364
            call compute_char_transitions(opt%nfa, current, c, ignore_case, next_set)
      
        1365
        
        1366
            ! Store in cache
      
        1367
            opt%dfa_cache(cache_idx)%valid = .true.
      
        1368
            opt%dfa_cache(cache_idx)%state_hash = state_hash
      
        1369
            opt%dfa_cache(cache_idx)%char_code = char_code
      
        1370
            opt%dfa_cache(cache_idx)%is_case_insensitive = ignore_case
      
        1371
            call opt%dfa_cache(cache_idx)%next_states%copy_from(next_set)
      
        1372
        
        1373
          end subroutine step_with_cache
      
        1374
        
        1375
          subroutine compute_char_transitions(nfa, current, c, ignore_case, next_set)
      
        1376
            !> Compute character transitions without caching (called on cache miss)
      
        1377
            type(nfa_t), intent(in) :: nfa
      
        1378
            type(state_set_t), intent(in) :: current
      
        1379
            character(len=1), intent(in) :: c
      
        1380
            logical, intent(in) :: ignore_case
      
        1381
            type(state_set_t), intent(inout) :: next_set
      
        1382
        
        1383
            integer :: state, word_idx, bit_idx, i
      
        1384
            integer(8) :: word, mask
      
        1385
            type(nfa_transition_t) :: trans
      
        1386
            character(len=1) :: c_lower, match_lower
      
        1387
        
        1388
            ! Iterate through current states
      
        1389
            do word_idx = 1, size(current%bits)
      
        1390
              word = current%bits(word_idx)
      
        1391
              if (word == 0) cycle
      
        1392
        
        1393
              do bit_idx = 0, 63
      
        1394
                mask = ishft(1_8, bit_idx)
      
        1395
                if (iand(word, mask) /= 0) then
      
        1396
                  state = (word_idx - 1) * 64 + bit_idx + 1
      
        1397
                  if (state > nfa%num_states) cycle
      
        1398
        
        1399
                  ! Process transitions from this state
      
        1400
                  do i = 1, nfa%states(state)%num_trans
      
        1401
                    trans = nfa%states(state)%trans(i)
      
        1402
        
        1403
                    select case (trans%trans_type)
      
        1404
                      case (TRANS_CHAR)
      
        1405
                        if (ignore_case) then
      
        1406
                          c_lower = to_lower_char(c)
      
        1407
                          match_lower = to_lower_char(trans%match_char)
      
        1408
                          if (c_lower == match_lower) then
      
        1409
                            call next_set%add(trans%target)
      
        1410
                          end if
      
        1411
                        else
      
        1412
                          if (c == trans%match_char) then
      
        1413
                            call next_set%add(trans%target)
      
        1414
                          end if
      
        1415
                        end if
      
        1416
        
        1417
                      case (TRANS_CLASS)
      
        1418
                        ! Use fast bitwise character class test
      
        1419
                        if (ignore_case) then
      
        1420
                          if (charclass_test_case_insensitive(trans%char_bits, c)) then
      
        1421
                            call next_set%add(trans%target)
      
        1422
                          end if
      
        1423
                        else
      
        1424
                          if (charclass_test(trans%char_bits, c)) then
      
        1425
                            call next_set%add(trans%target)
      
        1426
                          end if
      
        1427
                        end if
      
        1428
        
        1429
                      case (TRANS_ANY)
      
        1430
                        if (c /= char(10)) then
      
        1431
                          call next_set%add(trans%target)
      
        1432
                        end if
      
        1433
                    end select
      
        1434
                  end do
      
        1435
                end if
      
        1436
              end do
      
        1437
            end do
      
        1438
          end subroutine compute_char_transitions
      
        1439
        
        1440
          function is_accepting_set(nfa, states) result(res)
      
        1441
            type(nfa_t), intent(in) :: nfa
      
        1442
            type(state_set_t), intent(in) :: states
      
        1443
            logical :: res
      
        1444
        
        1445
            integer :: state, word_idx, bit_idx
      
        1446
            integer(8) :: word, mask
      
        1447
        
        1448
            res = .false.
      
        1449
        
        1450
            do word_idx = 1, size(states%bits)
      
        1451
              word = states%bits(word_idx)
      
        1452
              if (word == 0) cycle
      
        1453
        
        1454
              do bit_idx = 0, 63
      
        1455
                mask = ishft(1_8, bit_idx)
      
        1456
                if (iand(word, mask) /= 0) then
      
        1457
                  state = (word_idx - 1) * 64 + bit_idx + 1
      
        1458
                  if (state >= 1 .and. state <= nfa%num_states) then
      
        1459
                    if (nfa%states(state)%is_accept) then
      
        1460
                      res = .true.
      
        1461
                      return
      
        1462
                    end if
      
        1463
                  end if
      
        1464
                end if
      
        1465
              end do
      
        1466
            end do
      
        1467
          end function is_accepting_set
      
        1468
        
        1469
          !---------------------------------------------------------------------------
      
        1470
          ! Helper functions
      
        1471
          !---------------------------------------------------------------------------
      
        1472
        
        1473
          function anchor_matches_opt(anchor_type, text, pos, text_len) result(matches)
      
        1474
            integer, intent(in) :: anchor_type
      
        1475
            character(len=*), intent(in) :: text
      
        1476
            integer, intent(in) :: pos, text_len
      
        1477
            logical :: matches
      
        1478
        
        1479
            logical :: at_start, at_end, prev_word, curr_word
      
        1480
        
        1481
            matches = .false.
      
        1482
            at_start = (pos == 1) .or. (pos < 1)
      
        1483
            at_end = (pos > text_len)
      
        1484
        
        1485
            select case (anchor_type)
      
        1486
              case (1)  ! ^
      
        1487
                if (at_start) then
      
        1488
                  matches = .true.
      
        1489
                else if (pos > 1 .and. pos <= text_len + 1) then
      
        1490
                  matches = (text(pos-1:pos-1) == char(10))
      
        1491
                end if
      
        1492
        
        1493
              case (2)  ! $
      
        1494
                if (at_end) then
      
        1495
                  matches = .true.
      
        1496
                else if (pos >= 1 .and. pos <= text_len) then
      
        1497
                  matches = (text(pos:pos) == char(10))
      
        1498
                end if
      
        1499
        
        1500
              case (3)  ! \<
      
        1501
                prev_word = .false.
      
        1502
                curr_word = .false.
      
        1503
                if (pos > 1 .and. pos <= text_len + 1) prev_word = is_word_char_opt(text(pos-1:pos-1))
      
        1504
                if (pos >= 1 .and. pos <= text_len) curr_word = is_word_char_opt(text(pos:pos))
      
        1505
                matches = (.not. prev_word) .and. curr_word
      
        1506
        
        1507
              case (4)  ! \>
      
        1508
                prev_word = .false.
      
        1509
                curr_word = .false.
      
        1510
                if (pos > 1 .and. pos <= text_len + 1) prev_word = is_word_char_opt(text(pos-1:pos-1))
      
        1511
                if (pos >= 1 .and. pos <= text_len) curr_word = is_word_char_opt(text(pos:pos))
      
        1512
                matches = prev_word .and. (.not. curr_word)
      
        1513
        
        1514
              case (5)  ! \b
      
        1515
                prev_word = .false.
      
        1516
                curr_word = .false.
      
        1517
                if (pos > 1 .and. pos <= text_len + 1) prev_word = is_word_char_opt(text(pos-1:pos-1))
      
        1518
                if (pos >= 1 .and. pos <= text_len) curr_word = is_word_char_opt(text(pos:pos))
      
        1519
                matches = prev_word .neqv. curr_word
      
        1520
        
        1521
              case (6)  ! \B
      
        1522
                prev_word = .false.
      
        1523
                curr_word = .false.
      
        1524
                if (pos > 1 .and. pos <= text_len + 1) prev_word = is_word_char_opt(text(pos-1:pos-1))
      
        1525
                if (pos >= 1 .and. pos <= text_len) curr_word = is_word_char_opt(text(pos:pos))
      
        1526
                matches = prev_word .eqv. curr_word
      
        1527
            end select
      
        1528
          end function anchor_matches_opt
      
        1529
        
        1530
          pure function is_word_char_opt(c) result(res)
      
        1531
            character(len=1), intent(in) :: c
      
        1532
            logical :: res
      
        1533
            integer :: ic
      
        1534
            ic = ichar(c)
      
        1535
            res = (ic >= ichar('a') .and. ic <= ichar('z')) .or. &
      
        1536
                  (ic >= ichar('A') .and. ic <= ichar('Z')) .or. &
      
        1537
                  (ic >= ichar('0') .and. ic <= ichar('9')) .or. &
      
        1538
                  (c == '_')
      
        1539
          end function is_word_char_opt
      
        1540
        
        1541
          pure function to_lower_char(c) result(lower)
      
        1542
            character(len=1), intent(in) :: c
      
        1543
            character(len=1) :: lower
      
        1544
            integer :: ic
      
        1545
            ic = ichar(c)
      
        1546
            if (ic >= ichar('A') .and. ic <= ichar('Z')) then
      
        1547
              ! ASCII uppercase A-Z -> a-z
      
        1548
              lower = char(ic + 32)
      
        1549
            else if (ic >= 192 .and. ic <= 214) then
      
        1550
              ! Latin-1 uppercase À-Ö (192-214) -> à-ö (224-246)
      
        1551
              lower = char(ic + 32)
      
        1552
            else if (ic >= 216 .and. ic <= 222) then
      
        1553
              ! Latin-1 uppercase Ø-Þ (216-222) -> ø-þ (248-254)
      
        1554
              lower = char(ic + 32)
      
        1555
            else if (ic >= 128 .and. ic <= 150) then
      
        1556
              ! UTF-8 continuation byte for uppercase Latin Extended-A (U+00C0-U+00D6)
      
        1557
              ! When preceded by 0xC3, these represent À-Ö, fold to à-ö
      
        1558
              lower = char(ic + 32)
      
        1559
            else if (ic >= 152 .and. ic <= 158) then
      
        1560
              ! UTF-8 continuation byte for uppercase Latin Extended-A (U+00D8-U+00DE)
      
        1561
              ! When preceded by 0xC3, these represent Ø-Þ, fold to ø-þ
      
        1562
              lower = char(ic + 32)
      
        1563
            else
      
        1564
              lower = c
      
        1565
            end if
      
        1566
          end function to_lower_char
      
        1567
        
        1568
          function char_in_class_opt(c, char_class, negated, ignore_case) result(res)
      
        1569
            character(len=1), intent(in) :: c
      
        1570
            logical, intent(in) :: char_class(0:255)
      
        1571
            logical, intent(in) :: negated, ignore_case
      
        1572
            logical :: res
      
        1573
        
        1574
            integer :: ic
      
        1575
            character(len=1) :: c_lower, c_upper
      
        1576
        
        1577
            ic = ichar(c)
      
        1578
            res = char_class(ic)
      
        1579
        
        1580
            if (ignore_case .and. .not. res) then
      
        1581
              c_lower = to_lower_char(c)
      
        1582
              if (c_lower /= c) res = char_class(ichar(c_lower))
      
        1583
              if (.not. res) then
      
        1584
                ic = ichar(c)
      
        1585
                if (ic >= ichar('a') .and. ic <= ichar('z')) then
      
        1586
                  c_upper = char(ic - 32)
      
        1587
                  res = char_class(ichar(c_upper))
      
        1588
                end if
      
        1589
              end if
      
        1590
            end if
      
        1591
        
        1592
            if (negated) res = .not. res
      
        1593
          end function char_in_class_opt
      
        1594
        
        1595
          !---------------------------------------------------------------------------
      
        1596
          ! Aho-Corasick Integration for Alternation Patterns
      
        1597
          !---------------------------------------------------------------------------
      
        1598
        
        1599
          subroutine try_build_aho_corasick(opt, pattern, is_ere, ignore_case)
      
        1600
            !> Try to build Aho-Corasick automaton for simple alternation patterns
      
        1601
            !> Pattern like "foo|bar|baz" with only literal characters and | separators
      
        1602
            type(optimized_nfa_t), intent(inout) :: opt
      
        1603
            character(len=*), intent(in) :: pattern
      
        1604
            logical, intent(in) :: is_ere, ignore_case
      
        1605
        
        1606
            character(len=4096), allocatable :: alternatives(:)
      
        1607
            integer :: num_alternatives, ierr
      
        1608
            logical :: is_simple
      
        1609
        
        1610
            allocate(alternatives(1000))
      
        1611
        
        1612
            opt%use_aho_corasick = .false.
      
        1613
        
        1614
            ! Check if pattern is simple alternation of literals
      
        1615
            call parse_simple_alternation(pattern, is_ere, alternatives, num_alternatives, is_simple)
      
        1616
        
        1617
            ! DEBUG (commented out for production)
      
        1618
            ! write(error_unit, '(A,I0,A,L1)') 'DEBUG AC: num_alt=', num_alternatives, ' is_simple=', is_simple
      
        1619
        
        1620
            if (.not. is_simple .or. num_alternatives < 2) return
      
        1621
        
        1622
            ! Build Aho-Corasick automaton
      
        1623
            call ac_build(opt%ac, alternatives, num_alternatives, ignore_case, ierr)
      
        1624
        
        1625
            if (ierr == 0) then
      
        1626
              opt%use_aho_corasick = .true.
      
        1627
            end if
      
        1628
        
        1629
            deallocate(alternatives)
      
        1630
        
        1631
          end subroutine try_build_aho_corasick
      
        1632
        
        1633
          subroutine parse_simple_alternation(pattern, is_ere, alternatives, num_alt, is_simple)
      
        1634
            !> Parse pattern to check if it's simple alternation of literals
      
        1635
            !> Returns the alternatives if so
      
        1636
            character(len=*), intent(in) :: pattern
      
        1637
            logical, intent(in) :: is_ere
      
        1638
            character(len=*), intent(out) :: alternatives(:)
      
        1639
            integer, intent(out) :: num_alt
      
        1640
            logical, intent(out) :: is_simple
      
        1641
        
        1642
            integer :: i, pat_len, alt_start, alt_len
      
        1643
            character(len=1) :: c, next_c
      
        1644
            logical :: in_escape
      
        1645
        
        1646
            is_simple = .true.
      
        1647
            num_alt = 0
      
        1648
            pat_len = pattern_len(pattern)  ! Use pattern_len to preserve whitespace patterns
      
        1649
        
        1650
            if (pat_len == 0) then
      
        1651
              is_simple = .false.
      
        1652
              return
      
        1653
            end if
      
        1654
        
        1655
            alt_start = 1
      
        1656
            alt_len = 0
      
        1657
            in_escape = .false.
      
        1658
            i = 1
      
        1659
        
        1660
            do while (i <= pat_len)
      
        1661
              c = pattern(i:i)
      
        1662
        
        1663
              if (in_escape) then
      
        1664
                ! In ERE mode, \| is literal |
      
        1665
                ! In BRE mode, \| is alternation (GNU extension)
      
        1666
                if (c == '|' .and. .not. is_ere) then
      
        1667
                  ! BRE alternation
      
        1668
                  if (alt_len > 0) then
      
        1669
                    num_alt = num_alt + 1
      
        1670
                    if (num_alt > size(alternatives)) then
      
        1671
                      is_simple = .false.
      
        1672
                      return
      
        1673
                    end if
      
        1674
                    alternatives(num_alt) = pattern(alt_start:alt_start+alt_len-1)
      
        1675
                    ! Add null terminator to preserve exact length
      
        1676
                    if (alt_len < len(alternatives(num_alt))) then
      
        1677
                      alternatives(num_alt)(alt_len+1:alt_len+1) = char(0)
      
        1678
                    end if
      
        1679
                  else
      
        1680
                    ! Empty alternative - still valid
      
        1681
                    num_alt = num_alt + 1
      
        1682
                    alternatives(num_alt) = char(0)
      
        1683
                  end if
      
        1684
                  alt_start = i + 1
      
        1685
                  alt_len = 0
      
        1686
                else if (c == '(' .or. c == ')' .or. c == '{' .or. c == '}' .or. &
      
        1687
                         c == '<' .or. c == '>' .or. c == 'b' .or. c == 'B' .or. &
      
        1688
                         c == 'd' .or. c == 'D' .or. c == 'w' .or. c == 'W' .or. &
      
        1689
                         c == 's' .or. c == 'S' .or. c == '1' .or. c == '2' .or. &
      
        1690
                         c == '3' .or. c == '4' .or. c == '5' .or. c == '6' .or. &
      
        1691
                         c == '7' .or. c == '8' .or. c == '9') then
      
        1692
                  ! Regex metacharacter - not simple
      
        1693
                  is_simple = .false.
      
        1694
                  return
      
        1695
                else
      
        1696
                  ! Escaped literal character (e.g., \., \*, etc.)
      
        1697
                  alt_len = alt_len + 1
      
        1698
                end if
      
        1699
                in_escape = .false.
      
        1700
                i = i + 1
      
        1701
                cycle
      
        1702
              end if
      
        1703
        
        1704
              if (c == '\') then
      
        1705
                in_escape = .true.
      
        1706
                i = i + 1
      
        1707
                cycle
      
        1708
              end if
      
        1709
        
        1710
              ! Check for metacharacters
      
        1711
              if (is_ere) then
      
        1712
                ! ERE mode: | is alternation, . * + ? [ ] ^ $ ( ) { } are metacharacters
      
        1713
                if (c == '|') then
      
        1714
                  ! Alternation separator
      
        1715
                  if (alt_len > 0) then
      
        1716
                    num_alt = num_alt + 1
      
        1717
                    if (num_alt > size(alternatives)) then
      
        1718
                      is_simple = .false.
      
        1719
                      return
      
        1720
                    end if
      
        1721
                    alternatives(num_alt) = pattern(alt_start:alt_start+alt_len-1)
      
        1722
                    ! Add null terminator to preserve exact length
      
        1723
                    if (alt_len < len(alternatives(num_alt))) then
      
        1724
                      alternatives(num_alt)(alt_len+1:alt_len+1) = char(0)
      
        1725
                    end if
      
        1726
                  else
      
        1727
                    num_alt = num_alt + 1
      
        1728
                    alternatives(num_alt) = char(0)
      
        1729
                  end if
      
        1730
                  alt_start = i + 1
      
        1731
                  alt_len = 0
      
        1732
                  i = i + 1
      
        1733
                  cycle
      
        1734
                else if (c == '.' .or. c == '*' .or. c == '+' .or. c == '?' .or. &
      
        1735
                         c == '[' .or. c == ']' .or. c == '^' .or. c == '$' .or. &
      
        1736
                         c == '(' .or. c == ')' .or. c == '{' .or. c == '}') then
      
        1737
                  ! Metacharacter - not simple alternation
      
        1738
                  is_simple = .false.
      
        1739
                  return
      
        1740
                end if
      
        1741
              else
      
        1742
                ! BRE mode: only . * [ ] ^ $ are metacharacters
      
        1743
                ! | is literal, \| is alternation (GNU extension)
      
        1744
                if (c == '.' .or. c == '*' .or. c == '[' .or. c == ']' .or. &
      
        1745
                    c == '^' .or. c == '$') then
      
        1746
                  is_simple = .false.
      
        1747
                  return
      
        1748
                end if
      
        1749
              end if
      
        1750
        
        1751
              ! Regular literal character
      
        1752
              alt_len = alt_len + 1
      
        1753
              i = i + 1
      
        1754
            end do
      
        1755
        
        1756
            ! Handle last alternative
      
        1757
            if (alt_len > 0 .or. num_alt > 0) then
      
        1758
              num_alt = num_alt + 1
      
        1759
              if (num_alt > size(alternatives)) then
      
        1760
                is_simple = .false.
      
        1761
                return
      
        1762
              end if
      
        1763
              if (alt_len > 0) then
      
        1764
                alternatives(num_alt) = pattern(alt_start:alt_start+alt_len-1)
      
        1765
                ! Add null terminator to preserve exact length
      
        1766
                if (alt_len < len(alternatives(num_alt))) then
      
        1767
                  alternatives(num_alt)(alt_len+1:alt_len+1) = char(0)
      
        1768
                end if
      
        1769
              else
      
        1770
                alternatives(num_alt) = char(0)
      
        1771
              end if
      
        1772
            end if
      
        1773
        
        1774
            ! Need at least 2 alternatives for Aho-Corasick to be useful
      
        1775
            if (num_alt < 2) then
      
        1776
              is_simple = .false.
      
        1777
            end if
      
        1778
        
        1779
          end subroutine parse_simple_alternation
      
        1780
        
        1781
          function ac_optimized_search(ac, text) result(res)
      
        1782
            !> Search using Aho-Corasick automaton
      
        1783
            type(ac_automaton_t), intent(in) :: ac
      
        1784
            character(len=*), intent(in) :: text
      
        1785
            type(match_result_t) :: res
      
        1786
        
        1787
            type(ac_match_t) :: ac_match
      
        1788
        
        1789
            res%matched = .false.
      
        1790
        
        1791
            ac_match = ac_search(ac, text)
      
        1792
            if (ac_match%matched) then
      
        1793
              res%matched = .true.
      
        1794
              res%match_start = ac_match%start_pos
      
        1795
              res%match_end = ac_match%end_pos
      
        1796
            end if
      
        1797
        
        1798
          end function ac_optimized_search
      
        1799
        
        1800
          !---------------------------------------------------------------------------
      
        1801
          ! Backtracking Matcher for Backreferences
      
        1802
          !---------------------------------------------------------------------------
      
        1803
        
        1804
          function backtrack_search(nfa, text, text_len, ignore_case) result(res)
      
        1805
            !> Search for pattern with backreferences using backtracking
      
        1806
            !> Tries each starting position until a match is found
      
        1807
            type(nfa_t), intent(in) :: nfa
      
        1808
            character(len=*), intent(in) :: text
      
        1809
            integer, intent(in) :: text_len
      
        1810
            logical, intent(in) :: ignore_case
      
        1811
            type(match_result_t) :: res
      
        1812
        
        1813
            integer :: start_pos
      
        1814
            type(match_result_t) :: try_res
      
        1815
        
        1816
            res%matched = .false.
      
        1817
        
        1818
            do start_pos = 1, text_len + 1
      
        1819
              try_res = backtrack_match(nfa, text, text_len, start_pos, ignore_case)
      
        1820
              if (try_res%matched) then
      
        1821
                res = try_res
      
        1822
                return
      
        1823
              end if
      
        1824
            end do
      
        1825
        
        1826
          end function backtrack_search
      
        1827
        
        1828
          function backtrack_match(nfa, text, text_len, start_pos, ignore_case) result(res)
      
        1829
            !> Try to match NFA with backreferences starting at start_pos
      
        1830
            !> Uses recursive backtracking to track group captures
      
        1831
            type(nfa_t), intent(in) :: nfa
      
        1832
            character(len=*), intent(in) :: text
      
        1833
            integer, intent(in) :: text_len, start_pos
      
        1834
            logical, intent(in) :: ignore_case
      
        1835
            type(match_result_t) :: res
      
        1836
        
        1837
            integer :: group_starts(9), group_ends(9)
      
        1838
            integer :: best_end
      
        1839
        
        1840
            res%matched = .false.
      
        1841
            group_starts = 0
      
        1842
            group_ends = 0
      
        1843
            best_end = start_pos - 1
      
        1844
        
        1845
            ! Try to match from the start state
      
        1846
            if (backtrack_from_state(nfa, nfa%start_state, text, text_len, start_pos, &
      
        1847
                                     ignore_case, group_starts, group_ends, best_end)) then
      
        1848
              res%matched = .true.
      
        1849
              res%match_start = start_pos
      
        1850
              res%match_end = best_end
      
        1851
              res%group_starts = group_starts
      
        1852
              res%group_ends = group_ends
      
        1853
            end if
      
        1854
        
        1855
          end function backtrack_match
      
        1856
        
        1857
          recursive function backtrack_from_state(nfa, state, text, text_len, pos, &
      
        1858
                                                  ignore_case, group_starts, group_ends, best_end) result(matched)
      
        1859
            !> Recursive backtracking from a given NFA state
      
        1860
            !> Returns true if we can reach an accepting state
      
        1861
            type(nfa_t), intent(in) :: nfa
      
        1862
            integer, intent(in) :: state
      
        1863
            character(len=*), intent(in) :: text
      
        1864
            integer, intent(in) :: text_len, pos
      
        1865
            logical, intent(in) :: ignore_case
      
        1866
            integer, intent(inout) :: group_starts(9), group_ends(9)
      
        1867
            integer, intent(inout) :: best_end
      
        1868
            logical :: matched
      
        1869
        
        1870
            integer :: i, target, old_start, old_end
      
        1871
            integer :: backref_num, ref_start, ref_end, ref_len
      
        1872
            integer :: saved_starts(9), saved_ends(9)
      
        1873
            type(nfa_transition_t) :: trans
      
        1874
            character(len=1) :: c, c_lower, match_lower
      
        1875
            logical :: char_matches
      
        1876
        
        1877
            matched = .false.
      
        1878
        
        1879
            if (state < 1 .or. state > nfa%num_states) return
      
        1880
        
        1881
            ! Record group start if this state starts a group
      
        1882
            if (nfa%states(state)%group_start > 0 .and. nfa%states(state)%group_start <= 9) then
      
        1883
              old_start = group_starts(nfa%states(state)%group_start)
      
        1884
              group_starts(nfa%states(state)%group_start) = pos
      
        1885
            else
      
        1886
              old_start = 0
      
        1887
            end if
      
        1888
        
        1889
            ! Record group end if this state ends a group
      
        1890
            ! This must be done BEFORE processing transitions so backrefs can see the captured text
      
        1891
            if (nfa%states(state)%group_end > 0 .and. nfa%states(state)%group_end <= 9) then
      
        1892
              old_end = group_ends(nfa%states(state)%group_end)
      
        1893
              group_ends(nfa%states(state)%group_end) = pos - 1
      
        1894
            else
      
        1895
              old_end = 0
      
        1896
            end if
      
        1897
        
        1898
            ! Check if this is an accepting state
      
        1899
            if (nfa%states(state)%is_accept) then
      
        1900
              matched = .true.
      
        1901
              if (pos - 1 > best_end) best_end = pos - 1
      
        1902
              ! Continue to find longest match (greedy)
      
        1903
            end if
      
        1904
        
        1905
            ! Try each transition from this state
      
        1906
            do i = 1, nfa%states(state)%num_trans
      
        1907
              trans = nfa%states(state)%trans(i)
      
        1908
              target = trans%target
      
        1909
        
        1910
              select case (trans%trans_type)
      
        1911
                case (TRANS_EPSILON)
      
        1912
                  ! Check for backreference (negative anchor_type)
      
        1913
                  if (trans%anchor_type < 0) then
      
        1914
                    backref_num = -trans%anchor_type
      
        1915
                    if (backref_num >= 1 .and. backref_num <= 9) then
      
        1916
                      ref_start = group_starts(backref_num)
      
        1917
                      ref_end = group_ends(backref_num)
      
        1918
        
        1919
                      ! If group hasn't been captured yet, backreference fails
      
        1920
                      if (ref_start == 0 .or. ref_end == 0 .or. ref_end < ref_start) cycle
      
        1921
        
        1922
                      ref_len = ref_end - ref_start + 1
      
        1923
        
        1924
                      ! Check if we have enough text remaining
      
        1925
                      if (pos + ref_len - 1 > text_len) cycle
      
        1926
        
        1927
                      ! Check if the text matches the captured group
      
        1928
                      if (ignore_case) then
      
        1929
                        if (.not. strings_equal_icase(text(pos:pos+ref_len-1), &
      
        1930
                                                      text(ref_start:ref_end))) cycle
      
        1931
                      else
      
        1932
                        if (text(pos:pos+ref_len-1) /= text(ref_start:ref_end)) cycle
      
        1933
                      end if
      
        1934
        
        1935
                      ! Backref matches - continue from target with advanced position
      
        1936
                      saved_starts = group_starts
      
        1937
                      saved_ends = group_ends
      
        1938
                      if (backtrack_from_state(nfa, target, text, text_len, pos + ref_len, &
      
        1939
                                               ignore_case, group_starts, group_ends, best_end)) then
      
        1940
                        matched = .true.
      
        1941
                      else
      
        1942
                        group_starts = saved_starts
      
        1943
                        group_ends = saved_ends
      
        1944
                      end if
      
        1945
                    end if
      
        1946
                  else
      
        1947
                    ! Regular epsilon transition
      
        1948
                    saved_starts = group_starts
      
        1949
                    saved_ends = group_ends
      
        1950
                    if (backtrack_from_state(nfa, target, text, text_len, pos, &
      
        1951
                                             ignore_case, group_starts, group_ends, best_end)) then
      
        1952
                      matched = .true.
      
        1953
                    else
      
        1954
                      group_starts = saved_starts
      
        1955
                      group_ends = saved_ends
      
        1956
                    end if
      
        1957
                  end if
      
        1958
        
        1959
                case (TRANS_ANCHOR)
      
        1960
                  ! Check if anchor matches at this position
      
        1961
                  if (anchor_matches_opt(trans%anchor_type, text, pos, text_len)) then
      
        1962
                    saved_starts = group_starts
      
        1963
                    saved_ends = group_ends
      
        1964
                    if (backtrack_from_state(nfa, target, text, text_len, pos, &
      
        1965
                                             ignore_case, group_starts, group_ends, best_end)) then
      
        1966
                      matched = .true.
      
        1967
                    else
      
        1968
                      group_starts = saved_starts
      
        1969
                      group_ends = saved_ends
      
        1970
                    end if
      
        1971
                  end if
      
        1972
        
        1973
                case (TRANS_CHAR)
      
        1974
                  ! Character transition - need text available
      
        1975
                  if (pos <= text_len) then
      
        1976
                    c = text(pos:pos)
      
        1977
                    char_matches = .false.
      
        1978
        
        1979
                    if (ignore_case) then
      
        1980
                      c_lower = to_lower_char(c)
      
        1981
                      match_lower = to_lower_char(trans%match_char)
      
        1982
                      char_matches = (c_lower == match_lower)
      
        1983
                    else
      
        1984
                      char_matches = (c == trans%match_char)
      
        1985
                    end if
      
        1986
        
        1987
                    if (char_matches) then
      
        1988
                      saved_starts = group_starts
      
        1989
                      saved_ends = group_ends
      
        1990
                      if (backtrack_from_state(nfa, target, text, text_len, pos + 1, &
      
        1991
                                               ignore_case, group_starts, group_ends, best_end)) then
      
        1992
                        matched = .true.
      
        1993
                      else
      
        1994
                        group_starts = saved_starts
      
        1995
                        group_ends = saved_ends
      
        1996
                      end if
      
        1997
                    end if
      
        1998
                  end if
      
        1999
        
        2000
                case (TRANS_CLASS)
      
        2001
                  ! Character class transition
      
        2002
                  if (pos <= text_len) then
      
        2003
                    c = text(pos:pos)
      
        2004
                    if (ignore_case) then
      
        2005
                      if (charclass_test_case_insensitive(trans%char_bits, c)) then
      
        2006
                        saved_starts = group_starts
      
        2007
                        saved_ends = group_ends
      
        2008
                        if (backtrack_from_state(nfa, target, text, text_len, pos + 1, &
      
        2009
                                                 ignore_case, group_starts, group_ends, best_end)) then
      
        2010
                          matched = .true.
      
        2011
                        else
      
        2012
                          group_starts = saved_starts
      
        2013
                          group_ends = saved_ends
      
        2014
                        end if
      
        2015
                      end if
      
        2016
                    else
      
        2017
                      if (charclass_test(trans%char_bits, c)) then
      
        2018
                        saved_starts = group_starts
      
        2019
                        saved_ends = group_ends
      
        2020
                        if (backtrack_from_state(nfa, target, text, text_len, pos + 1, &
      
        2021
                                                 ignore_case, group_starts, group_ends, best_end)) then
      
        2022
                          matched = .true.
      
        2023
                        else
      
        2024
                          group_starts = saved_starts
      
        2025
                          group_ends = saved_ends
      
        2026
                        end if
      
        2027
                      end if
      
        2028
                    end if
      
        2029
                  end if
      
        2030
        
        2031
                case (TRANS_ANY)
      
        2032
                  ! Dot matches any character except newline
      
        2033
                  if (pos <= text_len) then
      
        2034
                    if (text(pos:pos) /= char(10)) then
      
        2035
                      saved_starts = group_starts
      
        2036
                      saved_ends = group_ends
      
        2037
                      if (backtrack_from_state(nfa, target, text, text_len, pos + 1, &
      
        2038
                                               ignore_case, group_starts, group_ends, best_end)) then
      
        2039
                        matched = .true.
      
        2040
                      else
      
        2041
                        group_starts = saved_starts
      
        2042
                        group_ends = saved_ends
      
        2043
                      end if
      
        2044
                    end if
      
        2045
                  end if
      
        2046
        
        2047
              end select
      
        2048
            end do
      
        2049
        
        2050
            ! Restore group start and end if we didn't match
      
        2051
            if (.not. matched) then
      
        2052
              if (nfa%states(state)%group_start > 0 .and. nfa%states(state)%group_start <= 9) then
      
        2053
                group_starts(nfa%states(state)%group_start) = old_start
      
        2054
              end if
      
        2055
              if (nfa%states(state)%group_end > 0 .and. nfa%states(state)%group_end <= 9) then
      
        2056
                group_ends(nfa%states(state)%group_end) = old_end
      
        2057
              end if
      
        2058
            end if
      
        2059
        
        2060
          end function backtrack_from_state
      
        2061
        
        2062
          function strings_equal_icase(s1, s2) result(equal)
      
        2063
            !> Compare two strings case-insensitively
      
        2064
            character(len=*), intent(in) :: s1, s2
      
        2065
            logical :: equal
      
        2066
            integer :: i, n
      
        2067
        
        2068
            equal = .false.
      
        2069
            n = len(s1)
      
        2070
            if (len(s2) /= n) return
      
        2071
        
        2072
            do i = 1, n
      
        2073
              if (to_lower_char(s1(i:i)) /= to_lower_char(s2(i:i))) return
      
        2074
            end do
      
        2075
        
        2076
            equal = .true.
      
        2077
          end function strings_equal_icase
      
        2078
        
        2079
        end module regex_optimizer
      
        2080