| 1 | module regex_optimizer |
| 2 | !> Regex optimization module for FERP |
| 3 | !> Provides optimized NFA matching with: |
| 4 | !> - Literal prefix extraction for Boyer-Moore skip |
| 5 | !> - Bit vector state sets for O(1) operations |
| 6 | !> - Lazy DFA state caching |
| 7 | !> - Anchored pattern fast paths |
| 8 | !> - Aho-Corasick for alternation patterns |
| 9 | !> - Bitwise character class matching |
| 10 | use regex_types |
| 11 | use regex_charclass |
| 12 | use aho_corasick |
| 13 | use ferp_kinds, only: pattern_len |
| 14 | implicit none |
| 15 | private |
| 16 | |
| 17 | public :: optimized_nfa_t |
| 18 | public :: optimize_nfa, optimized_match, optimized_search |
| 19 | public :: try_build_aho_corasick |
| 20 | |
| 21 | integer, parameter :: MAX_STATES = 1024 |
| 22 | integer, parameter :: MAX_PREFIX_LEN = 64 |
| 23 | integer, parameter :: DFA_CACHE_SIZE = 256 ! Cache recent state transitions |
| 24 | integer, parameter :: MAX_DFA_STATES = 512 ! Max DFA states before fallback to NFA |
| 25 | integer, parameter :: DFA_DEAD_STATE = 0 ! Special state: no match possible |
| 26 | |
| 27 | !> Bit vector for state sets - much faster than array lookup |
| 28 | type :: state_set_t |
| 29 | integer(8) :: bits(MAX_STATES / 64 + 1) = 0 |
| 30 | integer :: count = 0 |
| 31 | contains |
| 32 | procedure :: clear => state_set_clear |
| 33 | procedure :: add => state_set_add |
| 34 | procedure :: contains => state_set_contains |
| 35 | procedure :: is_empty => state_set_is_empty |
| 36 | procedure :: copy_from => state_set_copy |
| 37 | procedure :: hash => state_set_hash |
| 38 | procedure :: equals => state_set_equals |
| 39 | end type state_set_t |
| 40 | |
| 41 | !> DFA cache entry - caches (state_set_hash, char) -> next_states transitions |
| 42 | type :: dfa_cache_entry_t |
| 43 | integer(8) :: state_hash = 0 ! Hash of source state set |
| 44 | integer :: char_code = -1 ! Character being matched |
| 45 | type(state_set_t) :: next_states ! Resulting states after transition |
| 46 | logical :: valid = .false. ! Entry is populated |
| 47 | logical :: is_case_insensitive = .false. ! Case sensitivity flag |
| 48 | end type dfa_cache_entry_t |
| 49 | |
| 50 | !> Full DFA state - precomputed transitions for all 256 characters |
| 51 | type :: dfa_state_t |
| 52 | integer :: transitions(0:255) = DFA_DEAD_STATE ! Next state for each byte |
| 53 | type(state_set_t) :: nfa_states ! Corresponding NFA state set |
| 54 | logical :: is_accept = .false. ! Is this an accepting state? |
| 55 | integer(8) :: state_hash = 0 ! Hash for lookup |
| 56 | end type dfa_state_t |
| 57 | |
| 58 | !> Compiled DFA for O(n) matching |
| 59 | type :: compiled_dfa_t |
| 60 | type(dfa_state_t), allocatable :: states(:) ! DFA states |
| 61 | integer :: num_states = 0 ! Number of states built |
| 62 | integer :: start_state = 0 ! Starting DFA state |
| 63 | logical :: compiled = .false. ! DFA successfully compiled |
| 64 | logical :: too_large = .false. ! DFA exceeded size limit |
| 65 | ! Character equivalence classes |
| 66 | integer :: char_to_class(0:255) = 0 ! Maps char code to class index |
| 67 | integer :: num_classes = 256 ! Number of equivalence classes |
| 68 | logical :: use_equiv_classes = .false. ! Using equivalence classes |
| 69 | end type compiled_dfa_t |
| 70 | |
| 71 | !> Optimized NFA with precomputed data |
| 72 | type :: optimized_nfa_t |
| 73 | type(nfa_t) :: nfa ! Original NFA |
| 74 | character(len=MAX_PREFIX_LEN) :: prefix = '' ! Literal prefix for quick skip |
| 75 | integer :: prefix_len = 0 |
| 76 | logical :: anchored_start = .false. ! Pattern starts with ^ |
| 77 | logical :: anchored_end = .false. ! Pattern ends with $ |
| 78 | integer :: skip_table(0:255) = 0 ! Boyer-Moore skip table for prefix |
| 79 | type(state_set_t) :: start_closure ! Pre-computed start state epsilon closure |
| 80 | type(state_set_t), allocatable :: epsilon_closures(:) ! Pre-computed epsilon closures per state |
| 81 | type(dfa_cache_entry_t) :: dfa_cache(DFA_CACHE_SIZE) ! Lazy DFA cache |
| 82 | type(compiled_dfa_t) :: dfa ! Full compiled DFA (if available) |
| 83 | logical :: use_dfa = .false. ! Use DFA instead of NFA |
| 84 | type(ac_automaton_t) :: ac ! Aho-Corasick automaton (for alternation) |
| 85 | logical :: use_aho_corasick = .false. ! Use Aho-Corasick for matching |
| 86 | logical :: has_backrefs = .false. ! Pattern contains backreferences |
| 87 | logical :: optimized = .false. |
| 88 | end type optimized_nfa_t |
| 89 | |
| 90 | contains |
| 91 | |
| 92 | !--------------------------------------------------------------------------- |
| 93 | ! State Set Operations (Bit Vector) |
| 94 | !--------------------------------------------------------------------------- |
| 95 | |
| 96 | subroutine state_set_clear(this) |
| 97 | class(state_set_t), intent(inout) :: this |
| 98 | this%bits = 0 |
| 99 | this%count = 0 |
| 100 | end subroutine state_set_clear |
| 101 | |
| 102 | subroutine state_set_add(this, state) |
| 103 | class(state_set_t), intent(inout) :: this |
| 104 | integer, intent(in) :: state |
| 105 | integer :: word_idx, bit_idx |
| 106 | integer(8) :: mask |
| 107 | |
| 108 | if (state < 1 .or. state > MAX_STATES) return |
| 109 | |
| 110 | word_idx = (state - 1) / 64 + 1 |
| 111 | bit_idx = mod(state - 1, 64) |
| 112 | mask = ishft(1_8, bit_idx) |
| 113 | |
| 114 | if (iand(this%bits(word_idx), mask) == 0) then |
| 115 | this%bits(word_idx) = ior(this%bits(word_idx), mask) |
| 116 | this%count = this%count + 1 |
| 117 | end if |
| 118 | end subroutine state_set_add |
| 119 | |
| 120 | function state_set_contains(this, state) result(found) |
| 121 | class(state_set_t), intent(in) :: this |
| 122 | integer, intent(in) :: state |
| 123 | logical :: found |
| 124 | integer :: word_idx, bit_idx |
| 125 | integer(8) :: mask |
| 126 | |
| 127 | found = .false. |
| 128 | if (state < 1 .or. state > MAX_STATES) return |
| 129 | |
| 130 | word_idx = (state - 1) / 64 + 1 |
| 131 | bit_idx = mod(state - 1, 64) |
| 132 | mask = ishft(1_8, bit_idx) |
| 133 | |
| 134 | found = iand(this%bits(word_idx), mask) /= 0 |
| 135 | end function state_set_contains |
| 136 | |
| 137 | function state_set_is_empty(this) result(empty) |
| 138 | class(state_set_t), intent(in) :: this |
| 139 | logical :: empty |
| 140 | empty = this%count == 0 |
| 141 | end function state_set_is_empty |
| 142 | |
| 143 | subroutine state_set_copy(this, other) |
| 144 | class(state_set_t), intent(inout) :: this |
| 145 | type(state_set_t), intent(in) :: other |
| 146 | this%bits = other%bits |
| 147 | this%count = other%count |
| 148 | end subroutine state_set_copy |
| 149 | |
| 150 | function state_set_hash(this) result(h) |
| 151 | !> Compute a hash of the state set for cache lookup |
| 152 | !> Uses FNV-1a style hashing on the bit words |
| 153 | class(state_set_t), intent(in) :: this |
| 154 | integer(8) :: h |
| 155 | integer :: i |
| 156 | integer(8), parameter :: FNV_OFFSET = int(Z'CBF29CE484222325', 8) |
| 157 | integer(8), parameter :: FNV_PRIME = int(Z'100000001B3', 8) |
| 158 | |
| 159 | h = FNV_OFFSET |
| 160 | do i = 1, size(this%bits) |
| 161 | h = ieor(h, this%bits(i)) |
| 162 | h = h * FNV_PRIME |
| 163 | end do |
| 164 | end function state_set_hash |
| 165 | |
| 166 | function state_set_equals(this, other) result(eq) |
| 167 | !> Check if two state sets are identical |
| 168 | class(state_set_t), intent(in) :: this |
| 169 | type(state_set_t), intent(in) :: other |
| 170 | logical :: eq |
| 171 | integer :: i |
| 172 | |
| 173 | eq = .false. |
| 174 | if (this%count /= other%count) return |
| 175 | |
| 176 | do i = 1, size(this%bits) |
| 177 | if (this%bits(i) /= other%bits(i)) return |
| 178 | end do |
| 179 | |
| 180 | eq = .true. |
| 181 | end function state_set_equals |
| 182 | |
| 183 | !--------------------------------------------------------------------------- |
| 184 | ! Optimization: Analyze NFA and extract optimizations |
| 185 | !--------------------------------------------------------------------------- |
| 186 | |
| 187 | subroutine optimize_nfa(opt, nfa) |
| 188 | type(optimized_nfa_t), intent(out) :: opt |
| 189 | type(nfa_t), intent(in) :: nfa |
| 190 | |
| 191 | opt%nfa = nfa |
| 192 | opt%prefix_len = 0 |
| 193 | opt%prefix = '' |
| 194 | opt%anchored_start = .false. |
| 195 | opt%anchored_end = .false. |
| 196 | opt%use_dfa = .false. |
| 197 | opt%has_backrefs = .false. |
| 198 | |
| 199 | ! Detect backreferences in NFA |
| 200 | opt%has_backrefs = has_backref_transitions(nfa) |
| 201 | |
| 202 | ! Extract literal prefix and detect anchors |
| 203 | call extract_prefix_and_anchors(opt) |
| 204 | |
| 205 | ! Build Boyer-Moore skip table for prefix |
| 206 | if (opt%prefix_len > 0) then |
| 207 | call build_skip_table(opt%prefix, opt%prefix_len, opt%skip_table) |
| 208 | end if |
| 209 | |
| 210 | ! Pre-compute start state epsilon closure (position-independent part) |
| 211 | call precompute_start_closure(opt) |
| 212 | |
| 213 | ! Pre-compute epsilon closures for all states (for fast expansion) |
| 214 | call precompute_all_epsilon_closures(opt) |
| 215 | |
| 216 | ! Clear DFA cache |
| 217 | opt%dfa_cache%valid = .false. |
| 218 | |
| 219 | ! Try to compile full DFA for O(n) matching |
| 220 | ! Only for patterns without any position-dependent transitions (anchors) |
| 221 | ! and without backreferences (which require backtracking) |
| 222 | if (.not. has_anchor_transitions(opt%nfa) .and. .not. opt%has_backrefs) then |
| 223 | call compile_dfa(opt) |
| 224 | ! DEBUG: Print DFA compilation result (uncomment for debugging) |
| 225 | ! write(0,*) 'DFA compiled:', opt%use_dfa, 'states:', opt%dfa%num_states, 'too_large:', opt%dfa%too_large |
| 226 | end if |
| 227 | |
| 228 | opt%optimized = .true. |
| 229 | |
| 230 | end subroutine optimize_nfa |
| 231 | |
| 232 | subroutine extract_prefix_and_anchors(opt) |
| 233 | type(optimized_nfa_t), intent(inout) :: opt |
| 234 | integer :: state, i, prefix_len |
| 235 | type(nfa_transition_t) :: trans |
| 236 | logical :: done |
| 237 | |
| 238 | prefix_len = 0 |
| 239 | state = opt%nfa%start_state |
| 240 | |
| 241 | ! Check for start anchor |
| 242 | if (state >= 1 .and. state <= opt%nfa%num_states) then |
| 243 | do i = 1, opt%nfa%states(state)%num_trans |
| 244 | trans = opt%nfa%states(state)%trans(i) |
| 245 | if (trans%trans_type == TRANS_ANCHOR .and. trans%anchor_type == 1) then |
| 246 | opt%anchored_start = .true. |
| 247 | state = trans%target |
| 248 | exit |
| 249 | end if |
| 250 | end do |
| 251 | end if |
| 252 | |
| 253 | ! Extract literal prefix by following single-path character transitions |
| 254 | done = .false. |
| 255 | do while (.not. done .and. prefix_len < MAX_PREFIX_LEN) |
| 256 | if (state < 1 .or. state > opt%nfa%num_states) exit |
| 257 | if (opt%nfa%states(state)%num_trans /= 1) exit |
| 258 | |
| 259 | trans = opt%nfa%states(state)%trans(1) |
| 260 | |
| 261 | if (trans%trans_type == TRANS_CHAR) then |
| 262 | prefix_len = prefix_len + 1 |
| 263 | opt%prefix(prefix_len:prefix_len) = trans%match_char |
| 264 | state = trans%target |
| 265 | else if (trans%trans_type == TRANS_EPSILON) then |
| 266 | ! Follow epsilon, but only if it's the only transition |
| 267 | state = trans%target |
| 268 | else |
| 269 | done = .true. |
| 270 | end if |
| 271 | end do |
| 272 | |
| 273 | opt%prefix_len = prefix_len |
| 274 | |
| 275 | ! Check for end anchor (scan accept state's incoming transitions) |
| 276 | ! This is approximate - just check if accept state has anchor transition |
| 277 | if (opt%nfa%accept_state >= 1 .and. opt%nfa%accept_state <= opt%nfa%num_states) then |
| 278 | ! Check states that point to accept state |
| 279 | do state = 1, opt%nfa%num_states |
| 280 | do i = 1, opt%nfa%states(state)%num_trans |
| 281 | trans = opt%nfa%states(state)%trans(i) |
| 282 | if (trans%target == opt%nfa%accept_state) then |
| 283 | if (trans%trans_type == TRANS_ANCHOR .and. trans%anchor_type == 2) then |
| 284 | opt%anchored_end = .true. |
| 285 | end if |
| 286 | end if |
| 287 | end do |
| 288 | end do |
| 289 | end if |
| 290 | |
| 291 | end subroutine extract_prefix_and_anchors |
| 292 | |
| 293 | subroutine build_skip_table(prefix, prefix_len, skip_table) |
| 294 | character(len=*), intent(in) :: prefix |
| 295 | integer, intent(in) :: prefix_len |
| 296 | integer, intent(out) :: skip_table(0:255) |
| 297 | integer :: i, c |
| 298 | |
| 299 | ! Default skip is prefix length |
| 300 | skip_table = prefix_len |
| 301 | |
| 302 | ! Set skip distances for characters in prefix |
| 303 | do i = 1, prefix_len - 1 |
| 304 | c = ichar(prefix(i:i)) |
| 305 | skip_table(c) = prefix_len - i |
| 306 | end do |
| 307 | end subroutine build_skip_table |
| 308 | |
| 309 | subroutine precompute_start_closure(opt) |
| 310 | type(optimized_nfa_t), intent(inout) :: opt |
| 311 | ! Compute basic epsilon closure of start state |
| 312 | ! (Some closures depend on position for anchors, handle those at runtime) |
| 313 | call opt%start_closure%clear() |
| 314 | call compute_epsilon_closure_basic(opt%nfa, opt%nfa%start_state, opt%start_closure) |
| 315 | end subroutine precompute_start_closure |
| 316 | |
| 317 | subroutine compute_epsilon_closure_basic(nfa, start_state, result_set) |
| 318 | type(nfa_t), intent(in) :: nfa |
| 319 | integer, intent(in) :: start_state |
| 320 | type(state_set_t), intent(inout) :: result_set |
| 321 | |
| 322 | integer :: stack(MAX_STATES), stack_top |
| 323 | integer :: state, i, target |
| 324 | type(nfa_transition_t) :: trans |
| 325 | |
| 326 | stack_top = 1 |
| 327 | stack(1) = start_state |
| 328 | |
| 329 | do while (stack_top > 0) |
| 330 | state = stack(stack_top) |
| 331 | stack_top = stack_top - 1 |
| 332 | |
| 333 | if (state < 1 .or. state > nfa%num_states) cycle |
| 334 | if (result_set%contains(state)) cycle |
| 335 | |
| 336 | call result_set%add(state) |
| 337 | |
| 338 | ! Follow epsilon transitions (not anchors - those are position-dependent) |
| 339 | do i = 1, nfa%states(state)%num_trans |
| 340 | trans = nfa%states(state)%trans(i) |
| 341 | if (trans%trans_type == TRANS_EPSILON .and. trans%anchor_type >= 0) then |
| 342 | target = trans%target |
| 343 | if (target >= 1 .and. target <= nfa%num_states) then |
| 344 | if (.not. result_set%contains(target)) then |
| 345 | stack_top = stack_top + 1 |
| 346 | if (stack_top <= MAX_STATES) stack(stack_top) = target |
| 347 | end if |
| 348 | end if |
| 349 | end if |
| 350 | end do |
| 351 | end do |
| 352 | end subroutine compute_epsilon_closure_basic |
| 353 | |
| 354 | subroutine precompute_all_epsilon_closures(opt) |
| 355 | !> Pre-compute epsilon closure for every NFA state |
| 356 | !> This allows O(1) closure lookup during matching instead of repeated traversal |
| 357 | type(optimized_nfa_t), intent(inout) :: opt |
| 358 | |
| 359 | integer :: i, n |
| 360 | |
| 361 | n = opt%nfa%num_states |
| 362 | if (n <= 0) return |
| 363 | |
| 364 | ! Allocate epsilon closures array |
| 365 | if (allocated(opt%epsilon_closures)) deallocate(opt%epsilon_closures) |
| 366 | allocate(opt%epsilon_closures(n)) |
| 367 | |
| 368 | ! Compute epsilon closure for each state |
| 369 | do i = 1, n |
| 370 | call opt%epsilon_closures(i)%clear() |
| 371 | call compute_epsilon_closure_basic(opt%nfa, i, opt%epsilon_closures(i)) |
| 372 | end do |
| 373 | end subroutine precompute_all_epsilon_closures |
| 374 | |
| 375 | function has_anchor_transitions(nfa) result(has_anchors) |
| 376 | !> Check if NFA has any anchor transitions (position-dependent) |
| 377 | !> These include ^, $, \<, \>, \b, \B |
| 378 | type(nfa_t), intent(in) :: nfa |
| 379 | logical :: has_anchors |
| 380 | |
| 381 | integer :: state, i |
| 382 | type(nfa_transition_t) :: trans |
| 383 | |
| 384 | has_anchors = .false. |
| 385 | |
| 386 | do state = 1, nfa%num_states |
| 387 | do i = 1, nfa%states(state)%num_trans |
| 388 | trans = nfa%states(state)%trans(i) |
| 389 | if (trans%trans_type == TRANS_ANCHOR) then |
| 390 | has_anchors = .true. |
| 391 | return |
| 392 | end if |
| 393 | end do |
| 394 | end do |
| 395 | end function has_anchor_transitions |
| 396 | |
| 397 | function has_backref_transitions(nfa) result(has_backrefs) |
| 398 | !> Check if NFA has any backreference transitions |
| 399 | !> Backrefs are encoded as TRANS_EPSILON with negative anchor_type |
| 400 | type(nfa_t), intent(in) :: nfa |
| 401 | logical :: has_backrefs |
| 402 | |
| 403 | integer :: state, i |
| 404 | type(nfa_transition_t) :: trans |
| 405 | |
| 406 | has_backrefs = .false. |
| 407 | |
| 408 | do state = 1, nfa%num_states |
| 409 | do i = 1, nfa%states(state)%num_trans |
| 410 | trans = nfa%states(state)%trans(i) |
| 411 | if (trans%trans_type == TRANS_EPSILON .and. trans%anchor_type < 0) then |
| 412 | has_backrefs = .true. |
| 413 | return |
| 414 | end if |
| 415 | end do |
| 416 | end do |
| 417 | end function has_backref_transitions |
| 418 | |
| 419 | !--------------------------------------------------------------------------- |
| 420 | ! Character Equivalence Classes |
| 421 | !--------------------------------------------------------------------------- |
| 422 | |
| 423 | subroutine compute_equiv_classes(nfa, char_to_class, num_classes) |
| 424 | !> Compute character equivalence classes from NFA transitions |
| 425 | !> Characters with identical behavior across all NFA states belong to same class |
| 426 | !> This reduces DFA transition table from 256 entries to num_classes entries |
| 427 | type(nfa_t), intent(in) :: nfa |
| 428 | integer, intent(out) :: char_to_class(0:255) |
| 429 | integer, intent(out) :: num_classes |
| 430 | |
| 431 | ! Signature for each character: encodes which transitions it triggers |
| 432 | ! We use a simple approach: hash the set of (state, target) pairs for each char |
| 433 | integer(8) :: char_signature(0:255) |
| 434 | integer :: state, i, c, target |
| 435 | type(nfa_transition_t) :: trans |
| 436 | integer(8) :: sig |
| 437 | integer :: class_map(0:255) ! signature hash -> class index |
| 438 | logical :: found |
| 439 | |
| 440 | ! Initialize all characters to have signature 0 (no transitions) |
| 441 | char_signature = 0_8 |
| 442 | |
| 443 | ! Build signature for each character based on NFA transitions |
| 444 | do state = 1, nfa%num_states |
| 445 | do i = 1, nfa%states(state)%num_trans |
| 446 | trans = nfa%states(state)%trans(i) |
| 447 | target = trans%target |
| 448 | |
| 449 | select case (trans%trans_type) |
| 450 | case (TRANS_CHAR) |
| 451 | ! Single character transition |
| 452 | c = ichar(trans%match_char) |
| 453 | ! Add (state, target) to signature using FNV-1a-like hash |
| 454 | char_signature(c) = ieor(char_signature(c), & |
| 455 | int(state * 31 + target, 8) * 1099511628211_8) |
| 456 | |
| 457 | case (TRANS_CLASS) |
| 458 | ! Character class transition - add to all matching chars |
| 459 | do c = 0, 255 |
| 460 | if (charclass_test(trans%char_bits, char(c))) then |
| 461 | char_signature(c) = ieor(char_signature(c), & |
| 462 | int(state * 31 + target, 8) * 1099511628211_8) |
| 463 | end if |
| 464 | end do |
| 465 | |
| 466 | case (TRANS_ANY) |
| 467 | ! Dot matches all except newline |
| 468 | do c = 0, 255 |
| 469 | if (c /= 10) then ! Not newline |
| 470 | char_signature(c) = ieor(char_signature(c), & |
| 471 | int(state * 31 + target, 8) * 1099511628211_8) |
| 472 | end if |
| 473 | end do |
| 474 | end select |
| 475 | end do |
| 476 | end do |
| 477 | |
| 478 | ! Force each alphabetic character to have a unique signature |
| 479 | ! This ensures they get their own equivalence classes, so the case-folding |
| 480 | ! code in DFA compilation works correctly (it relies on the class representative |
| 481 | ! being alphabetic to compute transitions for both cases) |
| 482 | do c = ichar('a'), ichar('z') |
| 483 | ! Add unique value to each letter's signature to separate them from non-letters |
| 484 | char_signature(c) = ieor(char_signature(c), int(c * 7919 + 1, 8)) |
| 485 | char_signature(c - 32) = ieor(char_signature(c - 32), int((c - 32) * 7919 + 1, 8)) |
| 486 | end do |
| 487 | |
| 488 | ! Now group characters by signature |
| 489 | num_classes = 0 |
| 490 | class_map = -1 |
| 491 | char_to_class = 0 |
| 492 | |
| 493 | do c = 0, 255 |
| 494 | sig = char_signature(c) |
| 495 | |
| 496 | ! Look for existing class with this signature |
| 497 | found = .false. |
| 498 | do i = 0, num_classes - 1 |
| 499 | if (class_map(i) /= -1) then |
| 500 | ! Check if any character in class i has same signature |
| 501 | ! We stored the signature hash as a proxy |
| 502 | if (char_signature(class_map(i)) == sig) then |
| 503 | char_to_class(c) = i |
| 504 | found = .true. |
| 505 | exit |
| 506 | end if |
| 507 | end if |
| 508 | end do |
| 509 | |
| 510 | if (.not. found) then |
| 511 | ! Create new class |
| 512 | char_to_class(c) = num_classes |
| 513 | class_map(num_classes) = c ! Remember one char from this class |
| 514 | num_classes = num_classes + 1 |
| 515 | end if |
| 516 | end do |
| 517 | |
| 518 | ! Ensure at least one class |
| 519 | if (num_classes == 0) num_classes = 1 |
| 520 | |
| 521 | end subroutine compute_equiv_classes |
| 522 | |
| 523 | !--------------------------------------------------------------------------- |
| 524 | ! DFA Compilation: Convert NFA to DFA for O(n) matching |
| 525 | !--------------------------------------------------------------------------- |
| 526 | |
| 527 | subroutine compile_dfa(opt) |
| 528 | !> Compile NFA to DFA using subset construction |
| 529 | !> Uses character equivalence classes to reduce compilation time |
| 530 | type(optimized_nfa_t), intent(inout) :: opt |
| 531 | |
| 532 | type(state_set_t) :: start_set, next_set |
| 533 | integer :: worklist(MAX_DFA_STATES), work_head, work_tail |
| 534 | integer :: dfa_idx, char_code, next_idx, old_num_states |
| 535 | integer :: class_idx, c |
| 536 | integer :: class_representative(0:255) ! One char per class |
| 537 | integer :: class_transitions(0:255) ! Computed transition per class |
| 538 | |
| 539 | ! Allocate DFA states |
| 540 | if (allocated(opt%dfa%states)) deallocate(opt%dfa%states) |
| 541 | allocate(opt%dfa%states(MAX_DFA_STATES)) |
| 542 | opt%dfa%num_states = 0 |
| 543 | opt%dfa%compiled = .false. |
| 544 | opt%dfa%too_large = .false. |
| 545 | opt%use_dfa = .false. |
| 546 | |
| 547 | ! Compute character equivalence classes |
| 548 | call compute_equiv_classes(opt%nfa, opt%dfa%char_to_class, opt%dfa%num_classes) |
| 549 | opt%dfa%use_equiv_classes = (opt%dfa%num_classes < 256) |
| 550 | |
| 551 | ! Build representative character for each class |
| 552 | class_representative = -1 |
| 553 | do c = 0, 255 |
| 554 | class_idx = opt%dfa%char_to_class(c) |
| 555 | if (class_representative(class_idx) == -1) then |
| 556 | class_representative(class_idx) = c |
| 557 | end if |
| 558 | end do |
| 559 | |
| 560 | ! Compute start state: epsilon closure of NFA start |
| 561 | call start_set%clear() |
| 562 | call compute_epsilon_closure_basic(opt%nfa, opt%nfa%start_state, start_set) |
| 563 | |
| 564 | if (start_set%is_empty()) return |
| 565 | |
| 566 | ! Create initial DFA state |
| 567 | opt%dfa%num_states = 1 |
| 568 | opt%dfa%states(1)%nfa_states = start_set |
| 569 | opt%dfa%states(1)%state_hash = start_set%hash() |
| 570 | opt%dfa%states(1)%is_accept = is_accepting_set(opt%nfa, start_set) |
| 571 | opt%dfa%start_state = 1 |
| 572 | |
| 573 | ! Initialize worklist with start state |
| 574 | work_head = 1 |
| 575 | work_tail = 1 |
| 576 | worklist(1) = 1 |
| 577 | |
| 578 | ! Process worklist: for each DFA state, compute transitions |
| 579 | do while (work_head <= work_tail) |
| 580 | dfa_idx = worklist(work_head) |
| 581 | work_head = work_head + 1 |
| 582 | |
| 583 | ! First, compute transitions for each equivalence class (not all 256 chars) |
| 584 | class_transitions = DFA_DEAD_STATE |
| 585 | |
| 586 | do class_idx = 0, opt%dfa%num_classes - 1 |
| 587 | char_code = class_representative(class_idx) |
| 588 | if (char_code < 0) cycle |
| 589 | |
| 590 | call next_set%clear() |
| 591 | |
| 592 | ! Compute NFA transitions for this character |
| 593 | ! Note: DFA is case-sensitive. Case-insensitive matching uses NFA path. |
| 594 | call compute_char_transitions_simple(opt%nfa, opt%dfa%states(dfa_idx)%nfa_states, & |
| 595 | char(char_code), next_set) |
| 596 | |
| 597 | ! Compute epsilon closure of result |
| 598 | if (.not. next_set%is_empty()) then |
| 599 | call expand_epsilon_closure_simple(opt, next_set) |
| 600 | end if |
| 601 | |
| 602 | if (next_set%is_empty()) then |
| 603 | class_transitions(class_idx) = DFA_DEAD_STATE |
| 604 | else |
| 605 | ! Find or create DFA state for this NFA state set |
| 606 | old_num_states = opt%dfa%num_states |
| 607 | next_idx = find_or_create_dfa_state(opt%dfa, next_set, opt%nfa) |
| 608 | |
| 609 | if (next_idx == -1) then |
| 610 | opt%dfa%too_large = .true. |
| 611 | opt%dfa%compiled = .false. |
| 612 | return |
| 613 | end if |
| 614 | |
| 615 | class_transitions(class_idx) = next_idx |
| 616 | |
| 617 | ! Add new state to worklist only if it was just created |
| 618 | if (opt%dfa%num_states > old_num_states) then |
| 619 | work_tail = work_tail + 1 |
| 620 | if (work_tail > MAX_DFA_STATES) then |
| 621 | opt%dfa%too_large = .true. |
| 622 | opt%dfa%compiled = .false. |
| 623 | return |
| 624 | end if |
| 625 | worklist(work_tail) = next_idx |
| 626 | end if |
| 627 | end if |
| 628 | end do |
| 629 | |
| 630 | ! Now fill in the full 256-entry transition table from class transitions |
| 631 | do c = 0, 255 |
| 632 | class_idx = opt%dfa%char_to_class(c) |
| 633 | opt%dfa%states(dfa_idx)%transitions(c) = class_transitions(class_idx) |
| 634 | end do |
| 635 | end do |
| 636 | |
| 637 | ! Minimize DFA to reduce state count |
| 638 | call minimize_dfa(opt%dfa) |
| 639 | |
| 640 | opt%dfa%compiled = .true. |
| 641 | opt%use_dfa = .true. |
| 642 | |
| 643 | end subroutine compile_dfa |
| 644 | |
| 645 | function find_or_create_dfa_state(dfa, nfa_states, nfa) result(idx) |
| 646 | !> Find existing DFA state for NFA state set, or create new one |
| 647 | !> Returns -1 if DFA state limit exceeded |
| 648 | type(compiled_dfa_t), intent(inout) :: dfa |
| 649 | type(state_set_t), intent(in) :: nfa_states |
| 650 | type(nfa_t), intent(in) :: nfa |
| 651 | integer :: idx |
| 652 | |
| 653 | integer(8) :: h |
| 654 | integer :: i |
| 655 | |
| 656 | h = nfa_states%hash() |
| 657 | |
| 658 | ! Search existing states |
| 659 | do i = 1, dfa%num_states |
| 660 | if (dfa%states(i)%state_hash == h .and. & |
| 661 | dfa%states(i)%nfa_states%equals(nfa_states)) then |
| 662 | idx = i |
| 663 | return |
| 664 | end if |
| 665 | end do |
| 666 | |
| 667 | ! Create new state |
| 668 | if (dfa%num_states >= MAX_DFA_STATES) then |
| 669 | idx = -1 |
| 670 | return |
| 671 | end if |
| 672 | |
| 673 | dfa%num_states = dfa%num_states + 1 |
| 674 | idx = dfa%num_states |
| 675 | dfa%states(idx)%nfa_states = nfa_states |
| 676 | dfa%states(idx)%state_hash = h |
| 677 | dfa%states(idx)%is_accept = is_accepting_set(nfa, nfa_states) |
| 678 | dfa%states(idx)%transitions = DFA_DEAD_STATE |
| 679 | |
| 680 | end function find_or_create_dfa_state |
| 681 | |
| 682 | subroutine minimize_dfa(dfa) |
| 683 | !> Minimize DFA using Hopcroft's algorithm |
| 684 | !> Merges equivalent states to reduce DFA size |
| 685 | type(compiled_dfa_t), intent(inout) :: dfa |
| 686 | |
| 687 | integer :: num_states, num_partitions |
| 688 | integer, allocatable :: partition(:) ! partition(state) = partition ID |
| 689 | integer, allocatable :: part_size(:) ! Size of each partition |
| 690 | integer, allocatable :: representative(:) ! Representative state for each partition |
| 691 | integer, allocatable :: new_state_id(:) ! Mapping from old state to new state ID |
| 692 | type(dfa_state_t), allocatable :: new_states(:) |
| 693 | |
| 694 | logical, allocatable :: in_worklist(:) ! Is partition in worklist? |
| 695 | integer, allocatable :: worklist(:) ! Partitions to process |
| 696 | integer :: work_head, work_tail |
| 697 | |
| 698 | integer :: i, c, state, target, part_id |
| 699 | integer :: num_accept, num_reject |
| 700 | integer :: old_part, new_part_id |
| 701 | logical :: needs_split |
| 702 | integer, allocatable :: split_marker(:) ! Which states go to partition A on char c |
| 703 | integer :: new_num_states |
| 704 | |
| 705 | num_states = dfa%num_states |
| 706 | if (num_states <= 1) return ! Nothing to minimize |
| 707 | |
| 708 | ! Allocate working arrays |
| 709 | allocate(partition(num_states)) |
| 710 | allocate(part_size(num_states)) |
| 711 | allocate(representative(num_states)) |
| 712 | allocate(new_state_id(num_states)) |
| 713 | allocate(in_worklist(num_states)) |
| 714 | allocate(worklist(num_states)) |
| 715 | allocate(split_marker(num_states)) |
| 716 | |
| 717 | ! Initialize partitions: accepting states = partition 1, non-accepting = partition 2 |
| 718 | partition = 0 |
| 719 | part_size = 0 |
| 720 | num_accept = 0 |
| 721 | num_reject = 0 |
| 722 | |
| 723 | do i = 1, num_states |
| 724 | if (dfa%states(i)%is_accept) then |
| 725 | partition(i) = 1 |
| 726 | num_accept = num_accept + 1 |
| 727 | else |
| 728 | partition(i) = 2 |
| 729 | num_reject = num_reject + 1 |
| 730 | end if |
| 731 | end do |
| 732 | |
| 733 | part_size(1) = num_accept |
| 734 | part_size(2) = num_reject |
| 735 | num_partitions = 2 |
| 736 | |
| 737 | ! Handle edge case: all accepting or all rejecting |
| 738 | if (num_accept == 0 .or. num_reject == 0) then |
| 739 | num_partitions = 1 |
| 740 | partition = 1 |
| 741 | part_size(1) = num_states |
| 742 | end if |
| 743 | |
| 744 | ! Initialize worklist with smaller partition (Hopcroft optimization) |
| 745 | in_worklist = .false. |
| 746 | work_head = 1 |
| 747 | work_tail = 0 |
| 748 | |
| 749 | if (num_partitions == 2) then |
| 750 | if (num_accept <= num_reject) then |
| 751 | work_tail = 1 |
| 752 | worklist(1) = 1 |
| 753 | in_worklist(1) = .true. |
| 754 | else |
| 755 | work_tail = 1 |
| 756 | worklist(1) = 2 |
| 757 | in_worklist(2) = .true. |
| 758 | end if |
| 759 | end if |
| 760 | |
| 761 | ! Main refinement loop |
| 762 | do while (work_head <= work_tail) |
| 763 | part_id = worklist(work_head) |
| 764 | work_head = work_head + 1 |
| 765 | in_worklist(part_id) = .false. |
| 766 | |
| 767 | ! For each character, check if this partition splits others |
| 768 | do c = 0, 255 |
| 769 | ! Mark states that transition to partition part_id on character c |
| 770 | split_marker = 0 |
| 771 | do state = 1, num_states |
| 772 | target = dfa%states(state)%transitions(c) |
| 773 | if (target > 0 .and. target <= num_states) then |
| 774 | if (partition(target) == part_id) then |
| 775 | split_marker(state) = 1 |
| 776 | end if |
| 777 | end if |
| 778 | end do |
| 779 | |
| 780 | ! Check each existing partition for splits |
| 781 | do old_part = 1, num_partitions |
| 782 | ! Count states in this partition that go to part_id vs don't |
| 783 | num_accept = 0 ! Reuse: count going to part_id |
| 784 | num_reject = 0 ! Reuse: count not going to part_id |
| 785 | |
| 786 | do state = 1, num_states |
| 787 | if (partition(state) == old_part) then |
| 788 | if (split_marker(state) == 1) then |
| 789 | num_accept = num_accept + 1 |
| 790 | else |
| 791 | num_reject = num_reject + 1 |
| 792 | end if |
| 793 | end if |
| 794 | end do |
| 795 | |
| 796 | ! If partition needs splitting (has both types) |
| 797 | needs_split = (num_accept > 0 .and. num_reject > 0) |
| 798 | |
| 799 | if (needs_split) then |
| 800 | ! Create new partition for the smaller group |
| 801 | num_partitions = num_partitions + 1 |
| 802 | new_part_id = num_partitions |
| 803 | |
| 804 | ! Move the smaller group to new partition |
| 805 | if (num_accept <= num_reject) then |
| 806 | ! Move states going to part_id to new partition |
| 807 | do state = 1, num_states |
| 808 | if (partition(state) == old_part .and. split_marker(state) == 1) then |
| 809 | partition(state) = new_part_id |
| 810 | end if |
| 811 | end do |
| 812 | part_size(new_part_id) = num_accept |
| 813 | part_size(old_part) = num_reject |
| 814 | else |
| 815 | ! Move states NOT going to part_id to new partition |
| 816 | do state = 1, num_states |
| 817 | if (partition(state) == old_part .and. split_marker(state) == 0) then |
| 818 | partition(state) = new_part_id |
| 819 | end if |
| 820 | end do |
| 821 | part_size(new_part_id) = num_reject |
| 822 | part_size(old_part) = num_accept |
| 823 | end if |
| 824 | |
| 825 | ! Update worklist |
| 826 | if (in_worklist(old_part)) then |
| 827 | ! Both halves need to be in worklist |
| 828 | work_tail = work_tail + 1 |
| 829 | worklist(work_tail) = new_part_id |
| 830 | in_worklist(new_part_id) = .true. |
| 831 | else |
| 832 | ! Add smaller partition to worklist |
| 833 | if (part_size(new_part_id) <= part_size(old_part)) then |
| 834 | work_tail = work_tail + 1 |
| 835 | worklist(work_tail) = new_part_id |
| 836 | in_worklist(new_part_id) = .true. |
| 837 | else |
| 838 | work_tail = work_tail + 1 |
| 839 | worklist(work_tail) = old_part |
| 840 | in_worklist(old_part) = .true. |
| 841 | end if |
| 842 | end if |
| 843 | end if |
| 844 | end do |
| 845 | end do |
| 846 | end do |
| 847 | |
| 848 | ! Check if minimization actually reduced states |
| 849 | if (num_partitions >= num_states) then |
| 850 | ! No reduction possible |
| 851 | deallocate(partition, part_size, representative, new_state_id) |
| 852 | deallocate(in_worklist, worklist, split_marker) |
| 853 | return |
| 854 | end if |
| 855 | |
| 856 | ! Find representative for each partition (lowest numbered state) |
| 857 | representative = 0 |
| 858 | do state = 1, num_states |
| 859 | part_id = partition(state) |
| 860 | if (representative(part_id) == 0) then |
| 861 | representative(part_id) = state |
| 862 | end if |
| 863 | end do |
| 864 | |
| 865 | ! Build new state IDs (compact numbering) |
| 866 | new_state_id = 0 |
| 867 | new_num_states = 0 |
| 868 | do part_id = 1, num_partitions |
| 869 | if (representative(part_id) > 0) then |
| 870 | new_num_states = new_num_states + 1 |
| 871 | ! Map all states in this partition to new state ID |
| 872 | do state = 1, num_states |
| 873 | if (partition(state) == part_id) then |
| 874 | new_state_id(state) = new_num_states |
| 875 | end if |
| 876 | end do |
| 877 | end if |
| 878 | end do |
| 879 | |
| 880 | ! Build minimized DFA |
| 881 | allocate(new_states(new_num_states)) |
| 882 | |
| 883 | do part_id = 1, num_partitions |
| 884 | state = representative(part_id) |
| 885 | if (state == 0) cycle |
| 886 | |
| 887 | i = new_state_id(state) |
| 888 | new_states(i)%is_accept = dfa%states(state)%is_accept |
| 889 | new_states(i)%state_hash = dfa%states(state)%state_hash |
| 890 | new_states(i)%nfa_states = dfa%states(state)%nfa_states |
| 891 | |
| 892 | ! Remap transitions |
| 893 | do c = 0, 255 |
| 894 | target = dfa%states(state)%transitions(c) |
| 895 | if (target > 0 .and. target <= num_states) then |
| 896 | new_states(i)%transitions(c) = new_state_id(target) |
| 897 | else |
| 898 | new_states(i)%transitions(c) = DFA_DEAD_STATE |
| 899 | end if |
| 900 | end do |
| 901 | end do |
| 902 | |
| 903 | ! Update DFA with minimized version |
| 904 | deallocate(dfa%states) |
| 905 | allocate(dfa%states(new_num_states)) |
| 906 | dfa%states = new_states |
| 907 | dfa%start_state = new_state_id(dfa%start_state) |
| 908 | dfa%num_states = new_num_states |
| 909 | |
| 910 | ! Cleanup |
| 911 | deallocate(partition, part_size, representative, new_state_id) |
| 912 | deallocate(in_worklist, worklist, split_marker, new_states) |
| 913 | |
| 914 | end subroutine minimize_dfa |
| 915 | |
| 916 | subroutine compute_char_transitions_simple(nfa, current, c, next_set) |
| 917 | !> Compute character transitions without case folding (for DFA compilation) |
| 918 | type(nfa_t), intent(in) :: nfa |
| 919 | type(state_set_t), intent(in) :: current |
| 920 | character(len=1), intent(in) :: c |
| 921 | type(state_set_t), intent(inout) :: next_set |
| 922 | |
| 923 | integer :: state, word_idx, bit_idx, i |
| 924 | integer(8) :: word, mask |
| 925 | type(nfa_transition_t) :: trans |
| 926 | |
| 927 | do word_idx = 1, size(current%bits) |
| 928 | word = current%bits(word_idx) |
| 929 | if (word == 0) cycle |
| 930 | |
| 931 | do bit_idx = 0, 63 |
| 932 | mask = ishft(1_8, bit_idx) |
| 933 | if (iand(word, mask) /= 0) then |
| 934 | state = (word_idx - 1) * 64 + bit_idx + 1 |
| 935 | if (state > nfa%num_states) cycle |
| 936 | |
| 937 | do i = 1, nfa%states(state)%num_trans |
| 938 | trans = nfa%states(state)%trans(i) |
| 939 | |
| 940 | select case (trans%trans_type) |
| 941 | case (TRANS_CHAR) |
| 942 | if (c == trans%match_char) then |
| 943 | call next_set%add(trans%target) |
| 944 | end if |
| 945 | |
| 946 | case (TRANS_CLASS) |
| 947 | if (trans%char_class(ichar(c)) .neqv. trans%negated) then |
| 948 | call next_set%add(trans%target) |
| 949 | end if |
| 950 | |
| 951 | case (TRANS_ANY) |
| 952 | if (c /= char(10)) then |
| 953 | call next_set%add(trans%target) |
| 954 | end if |
| 955 | end select |
| 956 | end do |
| 957 | end if |
| 958 | end do |
| 959 | end do |
| 960 | end subroutine compute_char_transitions_simple |
| 961 | |
| 962 | subroutine expand_epsilon_closure_simple(opt, state_set) |
| 963 | !> Expand state set to include epsilon closure (in-place) |
| 964 | !> Uses pre-computed closures for O(1) lookup per state |
| 965 | type(optimized_nfa_t), intent(in) :: opt |
| 966 | type(state_set_t), intent(inout) :: state_set |
| 967 | |
| 968 | integer :: word_idx, bit_idx, state, j |
| 969 | integer(8) :: word, mask, original_bits(size(state_set%bits)) |
| 970 | |
| 971 | ! If no pre-computed closures, fall back to computing on-the-fly |
| 972 | if (.not. allocated(opt%epsilon_closures)) then |
| 973 | call expand_epsilon_closure_simple_fallback(opt%nfa, state_set) |
| 974 | return |
| 975 | end if |
| 976 | |
| 977 | ! Save original bits to avoid processing newly added states |
| 978 | original_bits = state_set%bits |
| 979 | |
| 980 | ! Expand using pre-computed closures - just OR the bit vectors |
| 981 | do word_idx = 1, size(original_bits) |
| 982 | word = original_bits(word_idx) |
| 983 | if (word == 0) cycle |
| 984 | |
| 985 | do bit_idx = 0, 63 |
| 986 | mask = ishft(1_8, bit_idx) |
| 987 | if (iand(word, mask) /= 0) then |
| 988 | state = (word_idx - 1) * 64 + bit_idx + 1 |
| 989 | if (state >= 1 .and. state <= opt%nfa%num_states) then |
| 990 | ! Merge pre-computed epsilon closure using bitwise OR |
| 991 | do j = 1, size(state_set%bits) |
| 992 | state_set%bits(j) = ior(state_set%bits(j), opt%epsilon_closures(state)%bits(j)) |
| 993 | end do |
| 994 | end if |
| 995 | end if |
| 996 | end do |
| 997 | end do |
| 998 | end subroutine expand_epsilon_closure_simple |
| 999 | |
| 1000 | subroutine expand_epsilon_closure_simple_fallback(nfa, state_set) |
| 1001 | !> Fallback: compute epsilon closure on-the-fly |
| 1002 | type(nfa_t), intent(in) :: nfa |
| 1003 | type(state_set_t), intent(inout) :: state_set |
| 1004 | |
| 1005 | type(state_set_t) :: result |
| 1006 | integer :: word_idx, bit_idx, state |
| 1007 | integer(8) :: word, mask |
| 1008 | |
| 1009 | call result%clear() |
| 1010 | |
| 1011 | do word_idx = 1, size(state_set%bits) |
| 1012 | word = state_set%bits(word_idx) |
| 1013 | if (word == 0) cycle |
| 1014 | |
| 1015 | do bit_idx = 0, 63 |
| 1016 | mask = ishft(1_8, bit_idx) |
| 1017 | if (iand(word, mask) /= 0) then |
| 1018 | state = (word_idx - 1) * 64 + bit_idx + 1 |
| 1019 | if (state <= nfa%num_states) then |
| 1020 | call compute_epsilon_closure_basic(nfa, state, result) |
| 1021 | end if |
| 1022 | end if |
| 1023 | end do |
| 1024 | end do |
| 1025 | |
| 1026 | call state_set%copy_from(result) |
| 1027 | end subroutine expand_epsilon_closure_simple_fallback |
| 1028 | |
| 1029 | !--------------------------------------------------------------------------- |
| 1030 | ! Optimized Search: Use prefix to skip positions |
| 1031 | !--------------------------------------------------------------------------- |
| 1032 | |
| 1033 | function optimized_search(opt, text, ignore_case) result(res) |
| 1034 | type(optimized_nfa_t), intent(inout) :: opt |
| 1035 | character(len=*), intent(in) :: text |
| 1036 | logical, intent(in) :: ignore_case |
| 1037 | type(match_result_t) :: res |
| 1038 | |
| 1039 | integer :: text_len, pos, skip |
| 1040 | type(match_result_t) :: try_res |
| 1041 | |
| 1042 | res%matched = .false. |
| 1043 | text_len = len_trim(text) |
| 1044 | |
| 1045 | ! Fast path: use Aho-Corasick for alternation patterns |
| 1046 | ! Only use AC if ignore_case setting matches what was compiled |
| 1047 | if (opt%use_aho_corasick) then |
| 1048 | if (ignore_case .eqv. opt%ac%ignore_case) then |
| 1049 | res = ac_optimized_search(opt%ac, text) |
| 1050 | return |
| 1051 | end if |
| 1052 | end if |
| 1053 | |
| 1054 | if (opt%nfa%num_states == 0) return |
| 1055 | |
| 1056 | ! Backtracking path: use backtracking matcher for patterns with backreferences |
| 1057 | if (opt%has_backrefs) then |
| 1058 | res = backtrack_search(opt%nfa, text, text_len, ignore_case) |
| 1059 | return |
| 1060 | end if |
| 1061 | |
| 1062 | ! Fast path: use DFA if available (O(n) matching) |
| 1063 | ! DFA is case-sensitive; case-insensitive matching falls through to NFA path |
| 1064 | if (opt%use_dfa .and. .not. ignore_case) then |
| 1065 | res = dfa_search(opt%dfa, text, text_len) |
| 1066 | return |
| 1067 | end if |
| 1068 | |
| 1069 | ! Fast path: anchored start - only try position 1 |
| 1070 | if (opt%anchored_start) then |
| 1071 | res = optimized_match(opt, text, 1, ignore_case) |
| 1072 | return |
| 1073 | end if |
| 1074 | |
| 1075 | ! Use prefix to skip positions (Boyer-Moore style) |
| 1076 | if (opt%prefix_len > 0 .and. .not. ignore_case) then |
| 1077 | pos = opt%prefix_len |
| 1078 | do while (pos <= text_len) |
| 1079 | ! Check if prefix matches at this position |
| 1080 | if (prefix_matches(text, pos - opt%prefix_len + 1, opt%prefix, opt%prefix_len)) then |
| 1081 | ! Try full NFA match from this position |
| 1082 | try_res = optimized_match(opt, text, pos - opt%prefix_len + 1, ignore_case) |
| 1083 | if (try_res%matched) then |
| 1084 | res = try_res |
| 1085 | return |
| 1086 | end if |
| 1087 | pos = pos + 1 |
| 1088 | else |
| 1089 | ! Skip based on mismatched character |
| 1090 | skip = opt%skip_table(ichar(text(pos:pos))) |
| 1091 | pos = pos + max(skip, 1) |
| 1092 | end if |
| 1093 | end do |
| 1094 | else |
| 1095 | ! No prefix optimization - try each position |
| 1096 | do pos = 1, text_len + 1 |
| 1097 | try_res = optimized_match(opt, text, pos, ignore_case) |
| 1098 | if (try_res%matched) then |
| 1099 | res = try_res |
| 1100 | return |
| 1101 | end if |
| 1102 | end do |
| 1103 | end if |
| 1104 | |
| 1105 | end function optimized_search |
| 1106 | |
| 1107 | function dfa_search(dfa, text, text_len) result(res) |
| 1108 | !> Fast O(n) DFA-based search |
| 1109 | !> Tries each starting position and returns first match |
| 1110 | type(compiled_dfa_t), intent(in) :: dfa |
| 1111 | character(len=*), intent(in) :: text |
| 1112 | integer, intent(in) :: text_len |
| 1113 | type(match_result_t) :: res |
| 1114 | |
| 1115 | integer :: start_pos |
| 1116 | type(match_result_t) :: try_res |
| 1117 | |
| 1118 | res%matched = .false. |
| 1119 | |
| 1120 | if (.not. dfa%compiled .or. dfa%num_states == 0) return |
| 1121 | |
| 1122 | ! Try each starting position |
| 1123 | do start_pos = 1, text_len + 1 |
| 1124 | try_res = dfa_match(dfa, text, text_len, start_pos) |
| 1125 | if (try_res%matched) then |
| 1126 | res = try_res |
| 1127 | return |
| 1128 | end if |
| 1129 | end do |
| 1130 | |
| 1131 | end function dfa_search |
| 1132 | |
| 1133 | function dfa_match(dfa, text, text_len, start_pos) result(res) |
| 1134 | !> O(n) DFA matching from a specific position |
| 1135 | !> Just follows transition table - no state set operations |
| 1136 | type(compiled_dfa_t), intent(in) :: dfa |
| 1137 | character(len=*), intent(in) :: text |
| 1138 | integer, intent(in) :: text_len, start_pos |
| 1139 | type(match_result_t) :: res |
| 1140 | |
| 1141 | integer :: state, pos, char_code |
| 1142 | |
| 1143 | res%matched = .false. |
| 1144 | |
| 1145 | if (.not. dfa%compiled) return |
| 1146 | |
| 1147 | state = dfa%start_state |
| 1148 | pos = start_pos |
| 1149 | |
| 1150 | ! Check if start state is accepting (empty match) |
| 1151 | if (dfa%states(state)%is_accept) then |
| 1152 | res%matched = .true. |
| 1153 | res%match_start = start_pos |
| 1154 | res%match_end = start_pos - 1 |
| 1155 | end if |
| 1156 | |
| 1157 | ! Process each character |
| 1158 | do while (pos <= text_len) |
| 1159 | char_code = ichar(text(pos:pos)) |
| 1160 | state = dfa%states(state)%transitions(char_code) |
| 1161 | |
| 1162 | if (state == DFA_DEAD_STATE) exit |
| 1163 | |
| 1164 | pos = pos + 1 |
| 1165 | |
| 1166 | ! Check for acceptance (greedy - find longest) |
| 1167 | if (dfa%states(state)%is_accept) then |
| 1168 | res%matched = .true. |
| 1169 | res%match_start = start_pos |
| 1170 | res%match_end = pos - 1 |
| 1171 | end if |
| 1172 | end do |
| 1173 | |
| 1174 | end function dfa_match |
| 1175 | |
| 1176 | function prefix_matches(text, pos, prefix, prefix_len) result(matches) |
| 1177 | character(len=*), intent(in) :: text |
| 1178 | integer, intent(in) :: pos, prefix_len |
| 1179 | character(len=*), intent(in) :: prefix |
| 1180 | logical :: matches |
| 1181 | integer :: i, text_len |
| 1182 | |
| 1183 | matches = .false. |
| 1184 | text_len = len_trim(text) |
| 1185 | |
| 1186 | if (pos < 1 .or. pos + prefix_len - 1 > text_len) return |
| 1187 | |
| 1188 | do i = 1, prefix_len |
| 1189 | if (text(pos+i-1:pos+i-1) /= prefix(i:i)) return |
| 1190 | end do |
| 1191 | |
| 1192 | matches = .true. |
| 1193 | end function prefix_matches |
| 1194 | |
| 1195 | !--------------------------------------------------------------------------- |
| 1196 | ! Optimized Match: Use bit vectors and caching |
| 1197 | !--------------------------------------------------------------------------- |
| 1198 | |
| 1199 | function optimized_match(opt, text, start_pos, ignore_case) result(res) |
| 1200 | type(optimized_nfa_t), intent(inout) :: opt |
| 1201 | character(len=*), intent(in) :: text |
| 1202 | integer, intent(in) :: start_pos |
| 1203 | logical, intent(in) :: ignore_case |
| 1204 | type(match_result_t) :: res |
| 1205 | |
| 1206 | type(state_set_t) :: current, next_set |
| 1207 | integer :: pos, text_len |
| 1208 | character(len=1) :: c |
| 1209 | |
| 1210 | res%matched = .false. |
| 1211 | text_len = len_trim(text) |
| 1212 | |
| 1213 | if (opt%nfa%num_states == 0) return |
| 1214 | |
| 1215 | ! Initialize with epsilon closure of start state (including position-dependent anchors) |
| 1216 | call current%clear() |
| 1217 | call compute_epsilon_closure_full(opt%nfa, opt%nfa%start_state, current, text, start_pos, text_len) |
| 1218 | |
| 1219 | ! Check if already accepting (empty pattern) |
| 1220 | if (is_accepting_set(opt%nfa, current)) then |
| 1221 | res%matched = .true. |
| 1222 | res%match_start = start_pos |
| 1223 | res%match_end = start_pos - 1 |
| 1224 | return |
| 1225 | end if |
| 1226 | |
| 1227 | ! Process each character |
| 1228 | pos = start_pos |
| 1229 | do while (pos <= text_len .and. .not. current%is_empty()) |
| 1230 | c = text(pos:pos) |
| 1231 | |
| 1232 | ! Compute next states with DFA caching |
| 1233 | call next_set%clear() |
| 1234 | call step_with_cache(opt, current, c, pos, text, text_len, ignore_case, next_set) |
| 1235 | |
| 1236 | ! Compute epsilon closure |
| 1237 | call current%clear() |
| 1238 | call expand_epsilon_closure(opt%nfa, next_set, current, text, pos + 1, text_len) |
| 1239 | |
| 1240 | pos = pos + 1 |
| 1241 | |
| 1242 | ! Check for acceptance (greedy - continue to find longest) |
| 1243 | if (is_accepting_set(opt%nfa, current)) then |
| 1244 | res%matched = .true. |
| 1245 | res%match_start = start_pos |
| 1246 | res%match_end = pos - 1 |
| 1247 | end if |
| 1248 | end do |
| 1249 | |
| 1250 | end function optimized_match |
| 1251 | |
| 1252 | subroutine compute_epsilon_closure_full(nfa, start_state, result_set, text, pos, text_len) |
| 1253 | type(nfa_t), intent(in) :: nfa |
| 1254 | integer, intent(in) :: start_state |
| 1255 | type(state_set_t), intent(inout) :: result_set |
| 1256 | character(len=*), intent(in) :: text |
| 1257 | integer, intent(in) :: pos, text_len |
| 1258 | |
| 1259 | integer :: stack(MAX_STATES), stack_top |
| 1260 | integer :: state, i, target |
| 1261 | type(nfa_transition_t) :: trans |
| 1262 | |
| 1263 | stack_top = 1 |
| 1264 | stack(1) = start_state |
| 1265 | |
| 1266 | do while (stack_top > 0) |
| 1267 | state = stack(stack_top) |
| 1268 | stack_top = stack_top - 1 |
| 1269 | |
| 1270 | if (state < 1 .or. state > nfa%num_states) cycle |
| 1271 | if (result_set%contains(state)) cycle |
| 1272 | |
| 1273 | call result_set%add(state) |
| 1274 | |
| 1275 | do i = 1, nfa%states(state)%num_trans |
| 1276 | trans = nfa%states(state)%trans(i) |
| 1277 | |
| 1278 | if (trans%trans_type == TRANS_EPSILON) then |
| 1279 | if (trans%anchor_type < 0) cycle ! Skip backrefs |
| 1280 | target = trans%target |
| 1281 | if (target >= 1 .and. target <= nfa%num_states) then |
| 1282 | if (.not. result_set%contains(target)) then |
| 1283 | stack_top = stack_top + 1 |
| 1284 | if (stack_top <= MAX_STATES) stack(stack_top) = target |
| 1285 | end if |
| 1286 | end if |
| 1287 | |
| 1288 | else if (trans%trans_type == TRANS_ANCHOR) then |
| 1289 | if (anchor_matches_opt(trans%anchor_type, text, pos, text_len)) then |
| 1290 | target = trans%target |
| 1291 | if (target >= 1 .and. target <= nfa%num_states) then |
| 1292 | if (.not. result_set%contains(target)) then |
| 1293 | stack_top = stack_top + 1 |
| 1294 | if (stack_top <= MAX_STATES) stack(stack_top) = target |
| 1295 | end if |
| 1296 | end if |
| 1297 | end if |
| 1298 | end if |
| 1299 | end do |
| 1300 | end do |
| 1301 | end subroutine compute_epsilon_closure_full |
| 1302 | |
| 1303 | subroutine expand_epsilon_closure(nfa, input_set, result_set, text, pos, text_len) |
| 1304 | type(nfa_t), intent(in) :: nfa |
| 1305 | type(state_set_t), intent(in) :: input_set |
| 1306 | type(state_set_t), intent(inout) :: result_set |
| 1307 | character(len=*), intent(in) :: text |
| 1308 | integer, intent(in) :: pos, text_len |
| 1309 | |
| 1310 | integer :: state, word_idx, bit_idx |
| 1311 | integer(8) :: word, mask |
| 1312 | |
| 1313 | ! Iterate through set bits |
| 1314 | do word_idx = 1, size(input_set%bits) |
| 1315 | word = input_set%bits(word_idx) |
| 1316 | if (word == 0) cycle |
| 1317 | |
| 1318 | do bit_idx = 0, 63 |
| 1319 | mask = ishft(1_8, bit_idx) |
| 1320 | if (iand(word, mask) /= 0) then |
| 1321 | state = (word_idx - 1) * 64 + bit_idx + 1 |
| 1322 | if (state <= nfa%num_states) then |
| 1323 | call compute_epsilon_closure_full(nfa, state, result_set, text, pos, text_len) |
| 1324 | end if |
| 1325 | end if |
| 1326 | end do |
| 1327 | end do |
| 1328 | end subroutine expand_epsilon_closure |
| 1329 | |
| 1330 | subroutine step_with_cache(opt, current, c, pos, text, text_len, ignore_case, next_set) |
| 1331 | !> Compute next states with DFA caching |
| 1332 | !> Cache key: (state_set_hash, char_code, ignore_case) |
| 1333 | !> This avoids recomputing transitions for repeated (state_set, char) pairs |
| 1334 | type(optimized_nfa_t), intent(inout) :: opt |
| 1335 | type(state_set_t), intent(in) :: current |
| 1336 | character(len=1), intent(in) :: c |
| 1337 | integer, intent(in) :: pos, text_len |
| 1338 | character(len=*), intent(in) :: text |
| 1339 | logical, intent(in) :: ignore_case |
| 1340 | type(state_set_t), intent(inout) :: next_set |
| 1341 | |
| 1342 | integer(8) :: state_hash |
| 1343 | integer :: cache_idx, char_code |
| 1344 | |
| 1345 | ! Compute cache key |
| 1346 | state_hash = current%hash() |
| 1347 | char_code = ichar(c) |
| 1348 | |
| 1349 | ! Compute cache index (combine hash with char code) |
| 1350 | cache_idx = int(mod(abs(ieor(state_hash, int(char_code, 8))), int(DFA_CACHE_SIZE, 8))) + 1 |
| 1351 | |
| 1352 | ! Check cache hit (using hash + char + case as key) |
| 1353 | ! Note: This may have rare hash collisions, but performance benefit outweighs risk |
| 1354 | if (opt%dfa_cache(cache_idx)%valid .and. & |
| 1355 | opt%dfa_cache(cache_idx)%state_hash == state_hash .and. & |
| 1356 | opt%dfa_cache(cache_idx)%char_code == char_code .and. & |
| 1357 | (opt%dfa_cache(cache_idx)%is_case_insensitive .eqv. ignore_case)) then |
| 1358 | ! Cache hit - copy cached result |
| 1359 | call next_set%copy_from(opt%dfa_cache(cache_idx)%next_states) |
| 1360 | return |
| 1361 | end if |
| 1362 | |
| 1363 | ! Cache miss - compute transitions |
| 1364 | call compute_char_transitions(opt%nfa, current, c, ignore_case, next_set) |
| 1365 | |
| 1366 | ! Store in cache |
| 1367 | opt%dfa_cache(cache_idx)%valid = .true. |
| 1368 | opt%dfa_cache(cache_idx)%state_hash = state_hash |
| 1369 | opt%dfa_cache(cache_idx)%char_code = char_code |
| 1370 | opt%dfa_cache(cache_idx)%is_case_insensitive = ignore_case |
| 1371 | call opt%dfa_cache(cache_idx)%next_states%copy_from(next_set) |
| 1372 | |
| 1373 | end subroutine step_with_cache |
| 1374 | |
| 1375 | subroutine compute_char_transitions(nfa, current, c, ignore_case, next_set) |
| 1376 | !> Compute character transitions without caching (called on cache miss) |
| 1377 | type(nfa_t), intent(in) :: nfa |
| 1378 | type(state_set_t), intent(in) :: current |
| 1379 | character(len=1), intent(in) :: c |
| 1380 | logical, intent(in) :: ignore_case |
| 1381 | type(state_set_t), intent(inout) :: next_set |
| 1382 | |
| 1383 | integer :: state, word_idx, bit_idx, i |
| 1384 | integer(8) :: word, mask |
| 1385 | type(nfa_transition_t) :: trans |
| 1386 | character(len=1) :: c_lower, match_lower |
| 1387 | |
| 1388 | ! Iterate through current states |
| 1389 | do word_idx = 1, size(current%bits) |
| 1390 | word = current%bits(word_idx) |
| 1391 | if (word == 0) cycle |
| 1392 | |
| 1393 | do bit_idx = 0, 63 |
| 1394 | mask = ishft(1_8, bit_idx) |
| 1395 | if (iand(word, mask) /= 0) then |
| 1396 | state = (word_idx - 1) * 64 + bit_idx + 1 |
| 1397 | if (state > nfa%num_states) cycle |
| 1398 | |
| 1399 | ! Process transitions from this state |
| 1400 | do i = 1, nfa%states(state)%num_trans |
| 1401 | trans = nfa%states(state)%trans(i) |
| 1402 | |
| 1403 | select case (trans%trans_type) |
| 1404 | case (TRANS_CHAR) |
| 1405 | if (ignore_case) then |
| 1406 | c_lower = to_lower_char(c) |
| 1407 | match_lower = to_lower_char(trans%match_char) |
| 1408 | if (c_lower == match_lower) then |
| 1409 | call next_set%add(trans%target) |
| 1410 | end if |
| 1411 | else |
| 1412 | if (c == trans%match_char) then |
| 1413 | call next_set%add(trans%target) |
| 1414 | end if |
| 1415 | end if |
| 1416 | |
| 1417 | case (TRANS_CLASS) |
| 1418 | ! Use fast bitwise character class test |
| 1419 | if (ignore_case) then |
| 1420 | if (charclass_test_case_insensitive(trans%char_bits, c)) then |
| 1421 | call next_set%add(trans%target) |
| 1422 | end if |
| 1423 | else |
| 1424 | if (charclass_test(trans%char_bits, c)) then |
| 1425 | call next_set%add(trans%target) |
| 1426 | end if |
| 1427 | end if |
| 1428 | |
| 1429 | case (TRANS_ANY) |
| 1430 | if (c /= char(10)) then |
| 1431 | call next_set%add(trans%target) |
| 1432 | end if |
| 1433 | end select |
| 1434 | end do |
| 1435 | end if |
| 1436 | end do |
| 1437 | end do |
| 1438 | end subroutine compute_char_transitions |
| 1439 | |
| 1440 | function is_accepting_set(nfa, states) result(res) |
| 1441 | type(nfa_t), intent(in) :: nfa |
| 1442 | type(state_set_t), intent(in) :: states |
| 1443 | logical :: res |
| 1444 | |
| 1445 | integer :: state, word_idx, bit_idx |
| 1446 | integer(8) :: word, mask |
| 1447 | |
| 1448 | res = .false. |
| 1449 | |
| 1450 | do word_idx = 1, size(states%bits) |
| 1451 | word = states%bits(word_idx) |
| 1452 | if (word == 0) cycle |
| 1453 | |
| 1454 | do bit_idx = 0, 63 |
| 1455 | mask = ishft(1_8, bit_idx) |
| 1456 | if (iand(word, mask) /= 0) then |
| 1457 | state = (word_idx - 1) * 64 + bit_idx + 1 |
| 1458 | if (state >= 1 .and. state <= nfa%num_states) then |
| 1459 | if (nfa%states(state)%is_accept) then |
| 1460 | res = .true. |
| 1461 | return |
| 1462 | end if |
| 1463 | end if |
| 1464 | end if |
| 1465 | end do |
| 1466 | end do |
| 1467 | end function is_accepting_set |
| 1468 | |
| 1469 | !--------------------------------------------------------------------------- |
| 1470 | ! Helper functions |
| 1471 | !--------------------------------------------------------------------------- |
| 1472 | |
| 1473 | function anchor_matches_opt(anchor_type, text, pos, text_len) result(matches) |
| 1474 | integer, intent(in) :: anchor_type |
| 1475 | character(len=*), intent(in) :: text |
| 1476 | integer, intent(in) :: pos, text_len |
| 1477 | logical :: matches |
| 1478 | |
| 1479 | logical :: at_start, at_end, prev_word, curr_word |
| 1480 | |
| 1481 | matches = .false. |
| 1482 | at_start = (pos == 1) .or. (pos < 1) |
| 1483 | at_end = (pos > text_len) |
| 1484 | |
| 1485 | select case (anchor_type) |
| 1486 | case (1) ! ^ |
| 1487 | if (at_start) then |
| 1488 | matches = .true. |
| 1489 | else if (pos > 1 .and. pos <= text_len + 1) then |
| 1490 | matches = (text(pos-1:pos-1) == char(10)) |
| 1491 | end if |
| 1492 | |
| 1493 | case (2) ! $ |
| 1494 | if (at_end) then |
| 1495 | matches = .true. |
| 1496 | else if (pos >= 1 .and. pos <= text_len) then |
| 1497 | matches = (text(pos:pos) == char(10)) |
| 1498 | end if |
| 1499 | |
| 1500 | case (3) ! \< |
| 1501 | prev_word = .false. |
| 1502 | curr_word = .false. |
| 1503 | if (pos > 1 .and. pos <= text_len + 1) prev_word = is_word_char_opt(text(pos-1:pos-1)) |
| 1504 | if (pos >= 1 .and. pos <= text_len) curr_word = is_word_char_opt(text(pos:pos)) |
| 1505 | matches = (.not. prev_word) .and. curr_word |
| 1506 | |
| 1507 | case (4) ! \> |
| 1508 | prev_word = .false. |
| 1509 | curr_word = .false. |
| 1510 | if (pos > 1 .and. pos <= text_len + 1) prev_word = is_word_char_opt(text(pos-1:pos-1)) |
| 1511 | if (pos >= 1 .and. pos <= text_len) curr_word = is_word_char_opt(text(pos:pos)) |
| 1512 | matches = prev_word .and. (.not. curr_word) |
| 1513 | |
| 1514 | case (5) ! \b |
| 1515 | prev_word = .false. |
| 1516 | curr_word = .false. |
| 1517 | if (pos > 1 .and. pos <= text_len + 1) prev_word = is_word_char_opt(text(pos-1:pos-1)) |
| 1518 | if (pos >= 1 .and. pos <= text_len) curr_word = is_word_char_opt(text(pos:pos)) |
| 1519 | matches = prev_word .neqv. curr_word |
| 1520 | |
| 1521 | case (6) ! \B |
| 1522 | prev_word = .false. |
| 1523 | curr_word = .false. |
| 1524 | if (pos > 1 .and. pos <= text_len + 1) prev_word = is_word_char_opt(text(pos-1:pos-1)) |
| 1525 | if (pos >= 1 .and. pos <= text_len) curr_word = is_word_char_opt(text(pos:pos)) |
| 1526 | matches = prev_word .eqv. curr_word |
| 1527 | end select |
| 1528 | end function anchor_matches_opt |
| 1529 | |
| 1530 | pure function is_word_char_opt(c) result(res) |
| 1531 | character(len=1), intent(in) :: c |
| 1532 | logical :: res |
| 1533 | integer :: ic |
| 1534 | ic = ichar(c) |
| 1535 | res = (ic >= ichar('a') .and. ic <= ichar('z')) .or. & |
| 1536 | (ic >= ichar('A') .and. ic <= ichar('Z')) .or. & |
| 1537 | (ic >= ichar('0') .and. ic <= ichar('9')) .or. & |
| 1538 | (c == '_') |
| 1539 | end function is_word_char_opt |
| 1540 | |
| 1541 | pure function to_lower_char(c) result(lower) |
| 1542 | character(len=1), intent(in) :: c |
| 1543 | character(len=1) :: lower |
| 1544 | integer :: ic |
| 1545 | ic = ichar(c) |
| 1546 | if (ic >= ichar('A') .and. ic <= ichar('Z')) then |
| 1547 | ! ASCII uppercase A-Z -> a-z |
| 1548 | lower = char(ic + 32) |
| 1549 | else if (ic >= 192 .and. ic <= 214) then |
| 1550 | ! Latin-1 uppercase À-Ö (192-214) -> à-ö (224-246) |
| 1551 | lower = char(ic + 32) |
| 1552 | else if (ic >= 216 .and. ic <= 222) then |
| 1553 | ! Latin-1 uppercase Ø-Þ (216-222) -> ø-þ (248-254) |
| 1554 | lower = char(ic + 32) |
| 1555 | else if (ic >= 128 .and. ic <= 150) then |
| 1556 | ! UTF-8 continuation byte for uppercase Latin Extended-A (U+00C0-U+00D6) |
| 1557 | ! When preceded by 0xC3, these represent À-Ö, fold to à-ö |
| 1558 | lower = char(ic + 32) |
| 1559 | else if (ic >= 152 .and. ic <= 158) then |
| 1560 | ! UTF-8 continuation byte for uppercase Latin Extended-A (U+00D8-U+00DE) |
| 1561 | ! When preceded by 0xC3, these represent Ø-Þ, fold to ø-þ |
| 1562 | lower = char(ic + 32) |
| 1563 | else |
| 1564 | lower = c |
| 1565 | end if |
| 1566 | end function to_lower_char |
| 1567 | |
| 1568 | function char_in_class_opt(c, char_class, negated, ignore_case) result(res) |
| 1569 | character(len=1), intent(in) :: c |
| 1570 | logical, intent(in) :: char_class(0:255) |
| 1571 | logical, intent(in) :: negated, ignore_case |
| 1572 | logical :: res |
| 1573 | |
| 1574 | integer :: ic |
| 1575 | character(len=1) :: c_lower, c_upper |
| 1576 | |
| 1577 | ic = ichar(c) |
| 1578 | res = char_class(ic) |
| 1579 | |
| 1580 | if (ignore_case .and. .not. res) then |
| 1581 | c_lower = to_lower_char(c) |
| 1582 | if (c_lower /= c) res = char_class(ichar(c_lower)) |
| 1583 | if (.not. res) then |
| 1584 | ic = ichar(c) |
| 1585 | if (ic >= ichar('a') .and. ic <= ichar('z')) then |
| 1586 | c_upper = char(ic - 32) |
| 1587 | res = char_class(ichar(c_upper)) |
| 1588 | end if |
| 1589 | end if |
| 1590 | end if |
| 1591 | |
| 1592 | if (negated) res = .not. res |
| 1593 | end function char_in_class_opt |
| 1594 | |
| 1595 | !--------------------------------------------------------------------------- |
| 1596 | ! Aho-Corasick Integration for Alternation Patterns |
| 1597 | !--------------------------------------------------------------------------- |
| 1598 | |
| 1599 | subroutine try_build_aho_corasick(opt, pattern, is_ere, ignore_case) |
| 1600 | !> Try to build Aho-Corasick automaton for simple alternation patterns |
| 1601 | !> Pattern like "foo|bar|baz" with only literal characters and | separators |
| 1602 | type(optimized_nfa_t), intent(inout) :: opt |
| 1603 | character(len=*), intent(in) :: pattern |
| 1604 | logical, intent(in) :: is_ere, ignore_case |
| 1605 | |
| 1606 | character(len=4096), allocatable :: alternatives(:) |
| 1607 | integer :: num_alternatives, ierr |
| 1608 | logical :: is_simple |
| 1609 | |
| 1610 | allocate(alternatives(1000)) |
| 1611 | |
| 1612 | opt%use_aho_corasick = .false. |
| 1613 | |
| 1614 | ! Check if pattern is simple alternation of literals |
| 1615 | call parse_simple_alternation(pattern, is_ere, alternatives, num_alternatives, is_simple) |
| 1616 | |
| 1617 | ! DEBUG (commented out for production) |
| 1618 | ! write(error_unit, '(A,I0,A,L1)') 'DEBUG AC: num_alt=', num_alternatives, ' is_simple=', is_simple |
| 1619 | |
| 1620 | if (.not. is_simple .or. num_alternatives < 2) return |
| 1621 | |
| 1622 | ! Build Aho-Corasick automaton |
| 1623 | call ac_build(opt%ac, alternatives, num_alternatives, ignore_case, ierr) |
| 1624 | |
| 1625 | if (ierr == 0) then |
| 1626 | opt%use_aho_corasick = .true. |
| 1627 | end if |
| 1628 | |
| 1629 | deallocate(alternatives) |
| 1630 | |
| 1631 | end subroutine try_build_aho_corasick |
| 1632 | |
| 1633 | subroutine parse_simple_alternation(pattern, is_ere, alternatives, num_alt, is_simple) |
| 1634 | !> Parse pattern to check if it's simple alternation of literals |
| 1635 | !> Returns the alternatives if so |
| 1636 | character(len=*), intent(in) :: pattern |
| 1637 | logical, intent(in) :: is_ere |
| 1638 | character(len=*), intent(out) :: alternatives(:) |
| 1639 | integer, intent(out) :: num_alt |
| 1640 | logical, intent(out) :: is_simple |
| 1641 | |
| 1642 | integer :: i, pat_len, alt_start, alt_len |
| 1643 | character(len=1) :: c, next_c |
| 1644 | logical :: in_escape |
| 1645 | |
| 1646 | is_simple = .true. |
| 1647 | num_alt = 0 |
| 1648 | pat_len = pattern_len(pattern) ! Use pattern_len to preserve whitespace patterns |
| 1649 | |
| 1650 | if (pat_len == 0) then |
| 1651 | is_simple = .false. |
| 1652 | return |
| 1653 | end if |
| 1654 | |
| 1655 | alt_start = 1 |
| 1656 | alt_len = 0 |
| 1657 | in_escape = .false. |
| 1658 | i = 1 |
| 1659 | |
| 1660 | do while (i <= pat_len) |
| 1661 | c = pattern(i:i) |
| 1662 | |
| 1663 | if (in_escape) then |
| 1664 | ! In ERE mode, \| is literal | |
| 1665 | ! In BRE mode, \| is alternation (GNU extension) |
| 1666 | if (c == '|' .and. .not. is_ere) then |
| 1667 | ! BRE alternation |
| 1668 | if (alt_len > 0) then |
| 1669 | num_alt = num_alt + 1 |
| 1670 | if (num_alt > size(alternatives)) then |
| 1671 | is_simple = .false. |
| 1672 | return |
| 1673 | end if |
| 1674 | alternatives(num_alt) = pattern(alt_start:alt_start+alt_len-1) |
| 1675 | ! Add null terminator to preserve exact length |
| 1676 | if (alt_len < len(alternatives(num_alt))) then |
| 1677 | alternatives(num_alt)(alt_len+1:alt_len+1) = char(0) |
| 1678 | end if |
| 1679 | else |
| 1680 | ! Empty alternative - still valid |
| 1681 | num_alt = num_alt + 1 |
| 1682 | alternatives(num_alt) = char(0) |
| 1683 | end if |
| 1684 | alt_start = i + 1 |
| 1685 | alt_len = 0 |
| 1686 | else if (c == '(' .or. c == ')' .or. c == '{' .or. c == '}' .or. & |
| 1687 | c == '<' .or. c == '>' .or. c == 'b' .or. c == 'B' .or. & |
| 1688 | c == 'd' .or. c == 'D' .or. c == 'w' .or. c == 'W' .or. & |
| 1689 | c == 's' .or. c == 'S' .or. c == '1' .or. c == '2' .or. & |
| 1690 | c == '3' .or. c == '4' .or. c == '5' .or. c == '6' .or. & |
| 1691 | c == '7' .or. c == '8' .or. c == '9') then |
| 1692 | ! Regex metacharacter - not simple |
| 1693 | is_simple = .false. |
| 1694 | return |
| 1695 | else |
| 1696 | ! Escaped literal character (e.g., \., \*, etc.) |
| 1697 | alt_len = alt_len + 1 |
| 1698 | end if |
| 1699 | in_escape = .false. |
| 1700 | i = i + 1 |
| 1701 | cycle |
| 1702 | end if |
| 1703 | |
| 1704 | if (c == '\') then |
| 1705 | in_escape = .true. |
| 1706 | i = i + 1 |
| 1707 | cycle |
| 1708 | end if |
| 1709 | |
| 1710 | ! Check for metacharacters |
| 1711 | if (is_ere) then |
| 1712 | ! ERE mode: | is alternation, . * + ? [ ] ^ $ ( ) { } are metacharacters |
| 1713 | if (c == '|') then |
| 1714 | ! Alternation separator |
| 1715 | if (alt_len > 0) then |
| 1716 | num_alt = num_alt + 1 |
| 1717 | if (num_alt > size(alternatives)) then |
| 1718 | is_simple = .false. |
| 1719 | return |
| 1720 | end if |
| 1721 | alternatives(num_alt) = pattern(alt_start:alt_start+alt_len-1) |
| 1722 | ! Add null terminator to preserve exact length |
| 1723 | if (alt_len < len(alternatives(num_alt))) then |
| 1724 | alternatives(num_alt)(alt_len+1:alt_len+1) = char(0) |
| 1725 | end if |
| 1726 | else |
| 1727 | num_alt = num_alt + 1 |
| 1728 | alternatives(num_alt) = char(0) |
| 1729 | end if |
| 1730 | alt_start = i + 1 |
| 1731 | alt_len = 0 |
| 1732 | i = i + 1 |
| 1733 | cycle |
| 1734 | else if (c == '.' .or. c == '*' .or. c == '+' .or. c == '?' .or. & |
| 1735 | c == '[' .or. c == ']' .or. c == '^' .or. c == '$' .or. & |
| 1736 | c == '(' .or. c == ')' .or. c == '{' .or. c == '}') then |
| 1737 | ! Metacharacter - not simple alternation |
| 1738 | is_simple = .false. |
| 1739 | return |
| 1740 | end if |
| 1741 | else |
| 1742 | ! BRE mode: only . * [ ] ^ $ are metacharacters |
| 1743 | ! | is literal, \| is alternation (GNU extension) |
| 1744 | if (c == '.' .or. c == '*' .or. c == '[' .or. c == ']' .or. & |
| 1745 | c == '^' .or. c == '$') then |
| 1746 | is_simple = .false. |
| 1747 | return |
| 1748 | end if |
| 1749 | end if |
| 1750 | |
| 1751 | ! Regular literal character |
| 1752 | alt_len = alt_len + 1 |
| 1753 | i = i + 1 |
| 1754 | end do |
| 1755 | |
| 1756 | ! Handle last alternative |
| 1757 | if (alt_len > 0 .or. num_alt > 0) then |
| 1758 | num_alt = num_alt + 1 |
| 1759 | if (num_alt > size(alternatives)) then |
| 1760 | is_simple = .false. |
| 1761 | return |
| 1762 | end if |
| 1763 | if (alt_len > 0) then |
| 1764 | alternatives(num_alt) = pattern(alt_start:alt_start+alt_len-1) |
| 1765 | ! Add null terminator to preserve exact length |
| 1766 | if (alt_len < len(alternatives(num_alt))) then |
| 1767 | alternatives(num_alt)(alt_len+1:alt_len+1) = char(0) |
| 1768 | end if |
| 1769 | else |
| 1770 | alternatives(num_alt) = char(0) |
| 1771 | end if |
| 1772 | end if |
| 1773 | |
| 1774 | ! Need at least 2 alternatives for Aho-Corasick to be useful |
| 1775 | if (num_alt < 2) then |
| 1776 | is_simple = .false. |
| 1777 | end if |
| 1778 | |
| 1779 | end subroutine parse_simple_alternation |
| 1780 | |
| 1781 | function ac_optimized_search(ac, text) result(res) |
| 1782 | !> Search using Aho-Corasick automaton |
| 1783 | type(ac_automaton_t), intent(in) :: ac |
| 1784 | character(len=*), intent(in) :: text |
| 1785 | type(match_result_t) :: res |
| 1786 | |
| 1787 | type(ac_match_t) :: ac_match |
| 1788 | |
| 1789 | res%matched = .false. |
| 1790 | |
| 1791 | ac_match = ac_search(ac, text) |
| 1792 | if (ac_match%matched) then |
| 1793 | res%matched = .true. |
| 1794 | res%match_start = ac_match%start_pos |
| 1795 | res%match_end = ac_match%end_pos |
| 1796 | end if |
| 1797 | |
| 1798 | end function ac_optimized_search |
| 1799 | |
| 1800 | !--------------------------------------------------------------------------- |
| 1801 | ! Backtracking Matcher for Backreferences |
| 1802 | !--------------------------------------------------------------------------- |
| 1803 | |
| 1804 | function backtrack_search(nfa, text, text_len, ignore_case) result(res) |
| 1805 | !> Search for pattern with backreferences using backtracking |
| 1806 | !> Tries each starting position until a match is found |
| 1807 | type(nfa_t), intent(in) :: nfa |
| 1808 | character(len=*), intent(in) :: text |
| 1809 | integer, intent(in) :: text_len |
| 1810 | logical, intent(in) :: ignore_case |
| 1811 | type(match_result_t) :: res |
| 1812 | |
| 1813 | integer :: start_pos |
| 1814 | type(match_result_t) :: try_res |
| 1815 | |
| 1816 | res%matched = .false. |
| 1817 | |
| 1818 | do start_pos = 1, text_len + 1 |
| 1819 | try_res = backtrack_match(nfa, text, text_len, start_pos, ignore_case) |
| 1820 | if (try_res%matched) then |
| 1821 | res = try_res |
| 1822 | return |
| 1823 | end if |
| 1824 | end do |
| 1825 | |
| 1826 | end function backtrack_search |
| 1827 | |
| 1828 | function backtrack_match(nfa, text, text_len, start_pos, ignore_case) result(res) |
| 1829 | !> Try to match NFA with backreferences starting at start_pos |
| 1830 | !> Uses recursive backtracking to track group captures |
| 1831 | type(nfa_t), intent(in) :: nfa |
| 1832 | character(len=*), intent(in) :: text |
| 1833 | integer, intent(in) :: text_len, start_pos |
| 1834 | logical, intent(in) :: ignore_case |
| 1835 | type(match_result_t) :: res |
| 1836 | |
| 1837 | integer :: group_starts(9), group_ends(9) |
| 1838 | integer :: best_end |
| 1839 | |
| 1840 | res%matched = .false. |
| 1841 | group_starts = 0 |
| 1842 | group_ends = 0 |
| 1843 | best_end = start_pos - 1 |
| 1844 | |
| 1845 | ! Try to match from the start state |
| 1846 | if (backtrack_from_state(nfa, nfa%start_state, text, text_len, start_pos, & |
| 1847 | ignore_case, group_starts, group_ends, best_end)) then |
| 1848 | res%matched = .true. |
| 1849 | res%match_start = start_pos |
| 1850 | res%match_end = best_end |
| 1851 | res%group_starts = group_starts |
| 1852 | res%group_ends = group_ends |
| 1853 | end if |
| 1854 | |
| 1855 | end function backtrack_match |
| 1856 | |
| 1857 | recursive function backtrack_from_state(nfa, state, text, text_len, pos, & |
| 1858 | ignore_case, group_starts, group_ends, best_end) result(matched) |
| 1859 | !> Recursive backtracking from a given NFA state |
| 1860 | !> Returns true if we can reach an accepting state |
| 1861 | type(nfa_t), intent(in) :: nfa |
| 1862 | integer, intent(in) :: state |
| 1863 | character(len=*), intent(in) :: text |
| 1864 | integer, intent(in) :: text_len, pos |
| 1865 | logical, intent(in) :: ignore_case |
| 1866 | integer, intent(inout) :: group_starts(9), group_ends(9) |
| 1867 | integer, intent(inout) :: best_end |
| 1868 | logical :: matched |
| 1869 | |
| 1870 | integer :: i, target, old_start, old_end |
| 1871 | integer :: backref_num, ref_start, ref_end, ref_len |
| 1872 | integer :: saved_starts(9), saved_ends(9) |
| 1873 | type(nfa_transition_t) :: trans |
| 1874 | character(len=1) :: c, c_lower, match_lower |
| 1875 | logical :: char_matches |
| 1876 | |
| 1877 | matched = .false. |
| 1878 | |
| 1879 | if (state < 1 .or. state > nfa%num_states) return |
| 1880 | |
| 1881 | ! Record group start if this state starts a group |
| 1882 | if (nfa%states(state)%group_start > 0 .and. nfa%states(state)%group_start <= 9) then |
| 1883 | old_start = group_starts(nfa%states(state)%group_start) |
| 1884 | group_starts(nfa%states(state)%group_start) = pos |
| 1885 | else |
| 1886 | old_start = 0 |
| 1887 | end if |
| 1888 | |
| 1889 | ! Record group end if this state ends a group |
| 1890 | ! This must be done BEFORE processing transitions so backrefs can see the captured text |
| 1891 | if (nfa%states(state)%group_end > 0 .and. nfa%states(state)%group_end <= 9) then |
| 1892 | old_end = group_ends(nfa%states(state)%group_end) |
| 1893 | group_ends(nfa%states(state)%group_end) = pos - 1 |
| 1894 | else |
| 1895 | old_end = 0 |
| 1896 | end if |
| 1897 | |
| 1898 | ! Check if this is an accepting state |
| 1899 | if (nfa%states(state)%is_accept) then |
| 1900 | matched = .true. |
| 1901 | if (pos - 1 > best_end) best_end = pos - 1 |
| 1902 | ! Continue to find longest match (greedy) |
| 1903 | end if |
| 1904 | |
| 1905 | ! Try each transition from this state |
| 1906 | do i = 1, nfa%states(state)%num_trans |
| 1907 | trans = nfa%states(state)%trans(i) |
| 1908 | target = trans%target |
| 1909 | |
| 1910 | select case (trans%trans_type) |
| 1911 | case (TRANS_EPSILON) |
| 1912 | ! Check for backreference (negative anchor_type) |
| 1913 | if (trans%anchor_type < 0) then |
| 1914 | backref_num = -trans%anchor_type |
| 1915 | if (backref_num >= 1 .and. backref_num <= 9) then |
| 1916 | ref_start = group_starts(backref_num) |
| 1917 | ref_end = group_ends(backref_num) |
| 1918 | |
| 1919 | ! If group hasn't been captured yet, backreference fails |
| 1920 | if (ref_start == 0 .or. ref_end == 0 .or. ref_end < ref_start) cycle |
| 1921 | |
| 1922 | ref_len = ref_end - ref_start + 1 |
| 1923 | |
| 1924 | ! Check if we have enough text remaining |
| 1925 | if (pos + ref_len - 1 > text_len) cycle |
| 1926 | |
| 1927 | ! Check if the text matches the captured group |
| 1928 | if (ignore_case) then |
| 1929 | if (.not. strings_equal_icase(text(pos:pos+ref_len-1), & |
| 1930 | text(ref_start:ref_end))) cycle |
| 1931 | else |
| 1932 | if (text(pos:pos+ref_len-1) /= text(ref_start:ref_end)) cycle |
| 1933 | end if |
| 1934 | |
| 1935 | ! Backref matches - continue from target with advanced position |
| 1936 | saved_starts = group_starts |
| 1937 | saved_ends = group_ends |
| 1938 | if (backtrack_from_state(nfa, target, text, text_len, pos + ref_len, & |
| 1939 | ignore_case, group_starts, group_ends, best_end)) then |
| 1940 | matched = .true. |
| 1941 | else |
| 1942 | group_starts = saved_starts |
| 1943 | group_ends = saved_ends |
| 1944 | end if |
| 1945 | end if |
| 1946 | else |
| 1947 | ! Regular epsilon transition |
| 1948 | saved_starts = group_starts |
| 1949 | saved_ends = group_ends |
| 1950 | if (backtrack_from_state(nfa, target, text, text_len, pos, & |
| 1951 | ignore_case, group_starts, group_ends, best_end)) then |
| 1952 | matched = .true. |
| 1953 | else |
| 1954 | group_starts = saved_starts |
| 1955 | group_ends = saved_ends |
| 1956 | end if |
| 1957 | end if |
| 1958 | |
| 1959 | case (TRANS_ANCHOR) |
| 1960 | ! Check if anchor matches at this position |
| 1961 | if (anchor_matches_opt(trans%anchor_type, text, pos, text_len)) then |
| 1962 | saved_starts = group_starts |
| 1963 | saved_ends = group_ends |
| 1964 | if (backtrack_from_state(nfa, target, text, text_len, pos, & |
| 1965 | ignore_case, group_starts, group_ends, best_end)) then |
| 1966 | matched = .true. |
| 1967 | else |
| 1968 | group_starts = saved_starts |
| 1969 | group_ends = saved_ends |
| 1970 | end if |
| 1971 | end if |
| 1972 | |
| 1973 | case (TRANS_CHAR) |
| 1974 | ! Character transition - need text available |
| 1975 | if (pos <= text_len) then |
| 1976 | c = text(pos:pos) |
| 1977 | char_matches = .false. |
| 1978 | |
| 1979 | if (ignore_case) then |
| 1980 | c_lower = to_lower_char(c) |
| 1981 | match_lower = to_lower_char(trans%match_char) |
| 1982 | char_matches = (c_lower == match_lower) |
| 1983 | else |
| 1984 | char_matches = (c == trans%match_char) |
| 1985 | end if |
| 1986 | |
| 1987 | if (char_matches) then |
| 1988 | saved_starts = group_starts |
| 1989 | saved_ends = group_ends |
| 1990 | if (backtrack_from_state(nfa, target, text, text_len, pos + 1, & |
| 1991 | ignore_case, group_starts, group_ends, best_end)) then |
| 1992 | matched = .true. |
| 1993 | else |
| 1994 | group_starts = saved_starts |
| 1995 | group_ends = saved_ends |
| 1996 | end if |
| 1997 | end if |
| 1998 | end if |
| 1999 | |
| 2000 | case (TRANS_CLASS) |
| 2001 | ! Character class transition |
| 2002 | if (pos <= text_len) then |
| 2003 | c = text(pos:pos) |
| 2004 | if (ignore_case) then |
| 2005 | if (charclass_test_case_insensitive(trans%char_bits, c)) then |
| 2006 | saved_starts = group_starts |
| 2007 | saved_ends = group_ends |
| 2008 | if (backtrack_from_state(nfa, target, text, text_len, pos + 1, & |
| 2009 | ignore_case, group_starts, group_ends, best_end)) then |
| 2010 | matched = .true. |
| 2011 | else |
| 2012 | group_starts = saved_starts |
| 2013 | group_ends = saved_ends |
| 2014 | end if |
| 2015 | end if |
| 2016 | else |
| 2017 | if (charclass_test(trans%char_bits, c)) then |
| 2018 | saved_starts = group_starts |
| 2019 | saved_ends = group_ends |
| 2020 | if (backtrack_from_state(nfa, target, text, text_len, pos + 1, & |
| 2021 | ignore_case, group_starts, group_ends, best_end)) then |
| 2022 | matched = .true. |
| 2023 | else |
| 2024 | group_starts = saved_starts |
| 2025 | group_ends = saved_ends |
| 2026 | end if |
| 2027 | end if |
| 2028 | end if |
| 2029 | end if |
| 2030 | |
| 2031 | case (TRANS_ANY) |
| 2032 | ! Dot matches any character except newline |
| 2033 | if (pos <= text_len) then |
| 2034 | if (text(pos:pos) /= char(10)) then |
| 2035 | saved_starts = group_starts |
| 2036 | saved_ends = group_ends |
| 2037 | if (backtrack_from_state(nfa, target, text, text_len, pos + 1, & |
| 2038 | ignore_case, group_starts, group_ends, best_end)) then |
| 2039 | matched = .true. |
| 2040 | else |
| 2041 | group_starts = saved_starts |
| 2042 | group_ends = saved_ends |
| 2043 | end if |
| 2044 | end if |
| 2045 | end if |
| 2046 | |
| 2047 | end select |
| 2048 | end do |
| 2049 | |
| 2050 | ! Restore group start and end if we didn't match |
| 2051 | if (.not. matched) then |
| 2052 | if (nfa%states(state)%group_start > 0 .and. nfa%states(state)%group_start <= 9) then |
| 2053 | group_starts(nfa%states(state)%group_start) = old_start |
| 2054 | end if |
| 2055 | if (nfa%states(state)%group_end > 0 .and. nfa%states(state)%group_end <= 9) then |
| 2056 | group_ends(nfa%states(state)%group_end) = old_end |
| 2057 | end if |
| 2058 | end if |
| 2059 | |
| 2060 | end function backtrack_from_state |
| 2061 | |
| 2062 | function strings_equal_icase(s1, s2) result(equal) |
| 2063 | !> Compare two strings case-insensitively |
| 2064 | character(len=*), intent(in) :: s1, s2 |
| 2065 | logical :: equal |
| 2066 | integer :: i, n |
| 2067 | |
| 2068 | equal = .false. |
| 2069 | n = len(s1) |
| 2070 | if (len(s2) /= n) return |
| 2071 | |
| 2072 | do i = 1, n |
| 2073 | if (to_lower_char(s1(i:i)) /= to_lower_char(s2(i:i))) return |
| 2074 | end do |
| 2075 | |
| 2076 | equal = .true. |
| 2077 | end function strings_equal_icase |
| 2078 | |
| 2079 | end module regex_optimizer |
| 2080 |