Fortran · 71301 bytes Raw Blame History
1 module regex_optimizer
2 !> Regex optimization module for FERP
3 !> Provides optimized NFA matching with:
4 !> - Literal prefix extraction for Boyer-Moore skip
5 !> - Bit vector state sets for O(1) operations
6 !> - Lazy DFA state caching
7 !> - Anchored pattern fast paths
8 !> - Aho-Corasick for alternation patterns
9 !> - Bitwise character class matching
10 use regex_types
11 use regex_charclass
12 use aho_corasick
13 use ferp_kinds, only: pattern_len
14 implicit none
15 private
16
17 public :: optimized_nfa_t
18 public :: optimize_nfa, optimized_match, optimized_search
19 public :: try_build_aho_corasick
20
21 integer, parameter :: MAX_STATES = 1024
22 integer, parameter :: MAX_PREFIX_LEN = 64
23 integer, parameter :: DFA_CACHE_SIZE = 256 ! Cache recent state transitions
24 integer, parameter :: MAX_DFA_STATES = 512 ! Max DFA states before fallback to NFA
25 integer, parameter :: DFA_DEAD_STATE = 0 ! Special state: no match possible
26
27 !> Bit vector for state sets - much faster than array lookup
28 type :: state_set_t
29 integer(8) :: bits(MAX_STATES / 64 + 1) = 0
30 integer :: count = 0
31 contains
32 procedure :: clear => state_set_clear
33 procedure :: add => state_set_add
34 procedure :: contains => state_set_contains
35 procedure :: is_empty => state_set_is_empty
36 procedure :: copy_from => state_set_copy
37 procedure :: hash => state_set_hash
38 procedure :: equals => state_set_equals
39 end type state_set_t
40
41 !> DFA cache entry - caches (state_set_hash, char) -> next_states transitions
42 type :: dfa_cache_entry_t
43 integer(8) :: state_hash = 0 ! Hash of source state set
44 integer :: char_code = -1 ! Character being matched
45 type(state_set_t) :: next_states ! Resulting states after transition
46 logical :: valid = .false. ! Entry is populated
47 logical :: is_case_insensitive = .false. ! Case sensitivity flag
48 end type dfa_cache_entry_t
49
50 !> Full DFA state - precomputed transitions for all 256 characters
51 type :: dfa_state_t
52 integer :: transitions(0:255) = DFA_DEAD_STATE ! Next state for each byte
53 type(state_set_t) :: nfa_states ! Corresponding NFA state set
54 logical :: is_accept = .false. ! Is this an accepting state?
55 integer(8) :: state_hash = 0 ! Hash for lookup
56 end type dfa_state_t
57
58 !> Compiled DFA for O(n) matching
59 type :: compiled_dfa_t
60 type(dfa_state_t), allocatable :: states(:) ! DFA states
61 integer :: num_states = 0 ! Number of states built
62 integer :: start_state = 0 ! Starting DFA state
63 logical :: compiled = .false. ! DFA successfully compiled
64 logical :: too_large = .false. ! DFA exceeded size limit
65 ! Character equivalence classes
66 integer :: char_to_class(0:255) = 0 ! Maps char code to class index
67 integer :: num_classes = 256 ! Number of equivalence classes
68 logical :: use_equiv_classes = .false. ! Using equivalence classes
69 end type compiled_dfa_t
70
71 !> Optimized NFA with precomputed data
72 type :: optimized_nfa_t
73 type(nfa_t) :: nfa ! Original NFA
74 character(len=MAX_PREFIX_LEN) :: prefix = '' ! Literal prefix for quick skip
75 integer :: prefix_len = 0
76 logical :: anchored_start = .false. ! Pattern starts with ^
77 logical :: anchored_end = .false. ! Pattern ends with $
78 integer :: skip_table(0:255) = 0 ! Boyer-Moore skip table for prefix
79 type(state_set_t) :: start_closure ! Pre-computed start state epsilon closure
80 type(state_set_t), allocatable :: epsilon_closures(:) ! Pre-computed epsilon closures per state
81 type(dfa_cache_entry_t) :: dfa_cache(DFA_CACHE_SIZE) ! Lazy DFA cache
82 type(compiled_dfa_t) :: dfa ! Full compiled DFA (if available)
83 logical :: use_dfa = .false. ! Use DFA instead of NFA
84 type(ac_automaton_t) :: ac ! Aho-Corasick automaton (for alternation)
85 logical :: use_aho_corasick = .false. ! Use Aho-Corasick for matching
86 logical :: has_backrefs = .false. ! Pattern contains backreferences
87 logical :: optimized = .false.
88 end type optimized_nfa_t
89
90 contains
91
92 !---------------------------------------------------------------------------
93 ! State Set Operations (Bit Vector)
94 !---------------------------------------------------------------------------
95
96 subroutine state_set_clear(this)
97 class(state_set_t), intent(inout) :: this
98 this%bits = 0
99 this%count = 0
100 end subroutine state_set_clear
101
102 subroutine state_set_add(this, state)
103 class(state_set_t), intent(inout) :: this
104 integer, intent(in) :: state
105 integer :: word_idx, bit_idx
106 integer(8) :: mask
107
108 if (state < 1 .or. state > MAX_STATES) return
109
110 word_idx = (state - 1) / 64 + 1
111 bit_idx = mod(state - 1, 64)
112 mask = ishft(1_8, bit_idx)
113
114 if (iand(this%bits(word_idx), mask) == 0) then
115 this%bits(word_idx) = ior(this%bits(word_idx), mask)
116 this%count = this%count + 1
117 end if
118 end subroutine state_set_add
119
120 function state_set_contains(this, state) result(found)
121 class(state_set_t), intent(in) :: this
122 integer, intent(in) :: state
123 logical :: found
124 integer :: word_idx, bit_idx
125 integer(8) :: mask
126
127 found = .false.
128 if (state < 1 .or. state > MAX_STATES) return
129
130 word_idx = (state - 1) / 64 + 1
131 bit_idx = mod(state - 1, 64)
132 mask = ishft(1_8, bit_idx)
133
134 found = iand(this%bits(word_idx), mask) /= 0
135 end function state_set_contains
136
137 function state_set_is_empty(this) result(empty)
138 class(state_set_t), intent(in) :: this
139 logical :: empty
140 empty = this%count == 0
141 end function state_set_is_empty
142
143 subroutine state_set_copy(this, other)
144 class(state_set_t), intent(inout) :: this
145 type(state_set_t), intent(in) :: other
146 this%bits = other%bits
147 this%count = other%count
148 end subroutine state_set_copy
149
150 function state_set_hash(this) result(h)
151 !> Compute a hash of the state set for cache lookup
152 !> Uses FNV-1a style hashing on the bit words
153 class(state_set_t), intent(in) :: this
154 integer(8) :: h
155 integer :: i
156 integer(8), parameter :: FNV_OFFSET = int(Z'CBF29CE484222325', 8)
157 integer(8), parameter :: FNV_PRIME = int(Z'100000001B3', 8)
158
159 h = FNV_OFFSET
160 do i = 1, size(this%bits)
161 h = ieor(h, this%bits(i))
162 h = h * FNV_PRIME
163 end do
164 end function state_set_hash
165
166 function state_set_equals(this, other) result(eq)
167 !> Check if two state sets are identical
168 class(state_set_t), intent(in) :: this
169 type(state_set_t), intent(in) :: other
170 logical :: eq
171 integer :: i
172
173 eq = .false.
174 if (this%count /= other%count) return
175
176 do i = 1, size(this%bits)
177 if (this%bits(i) /= other%bits(i)) return
178 end do
179
180 eq = .true.
181 end function state_set_equals
182
183 !---------------------------------------------------------------------------
184 ! Optimization: Analyze NFA and extract optimizations
185 !---------------------------------------------------------------------------
186
187 subroutine optimize_nfa(opt, nfa)
188 type(optimized_nfa_t), intent(out) :: opt
189 type(nfa_t), intent(in) :: nfa
190
191 opt%nfa = nfa
192 opt%prefix_len = 0
193 opt%prefix = ''
194 opt%anchored_start = .false.
195 opt%anchored_end = .false.
196 opt%use_dfa = .false.
197 opt%has_backrefs = .false.
198
199 ! Detect backreferences in NFA
200 opt%has_backrefs = has_backref_transitions(nfa)
201
202 ! Extract literal prefix and detect anchors
203 call extract_prefix_and_anchors(opt)
204
205 ! Build Boyer-Moore skip table for prefix
206 if (opt%prefix_len > 0) then
207 call build_skip_table(opt%prefix, opt%prefix_len, opt%skip_table)
208 end if
209
210 ! Pre-compute start state epsilon closure (position-independent part)
211 call precompute_start_closure(opt)
212
213 ! Pre-compute epsilon closures for all states (for fast expansion)
214 call precompute_all_epsilon_closures(opt)
215
216 ! Clear DFA cache
217 opt%dfa_cache%valid = .false.
218
219 ! Try to compile full DFA for O(n) matching
220 ! Only for patterns without any position-dependent transitions (anchors)
221 ! and without backreferences (which require backtracking)
222 if (.not. has_anchor_transitions(opt%nfa) .and. .not. opt%has_backrefs) then
223 call compile_dfa(opt)
224 ! DEBUG: Print DFA compilation result (uncomment for debugging)
225 ! write(0,*) 'DFA compiled:', opt%use_dfa, 'states:', opt%dfa%num_states, 'too_large:', opt%dfa%too_large
226 end if
227
228 opt%optimized = .true.
229
230 end subroutine optimize_nfa
231
232 subroutine extract_prefix_and_anchors(opt)
233 type(optimized_nfa_t), intent(inout) :: opt
234 integer :: state, i, prefix_len
235 type(nfa_transition_t) :: trans
236 logical :: done
237
238 prefix_len = 0
239 state = opt%nfa%start_state
240
241 ! Check for start anchor
242 if (state >= 1 .and. state <= opt%nfa%num_states) then
243 do i = 1, opt%nfa%states(state)%num_trans
244 trans = opt%nfa%states(state)%trans(i)
245 if (trans%trans_type == TRANS_ANCHOR .and. trans%anchor_type == 1) then
246 opt%anchored_start = .true.
247 state = trans%target
248 exit
249 end if
250 end do
251 end if
252
253 ! Extract literal prefix by following single-path character transitions
254 done = .false.
255 do while (.not. done .and. prefix_len < MAX_PREFIX_LEN)
256 if (state < 1 .or. state > opt%nfa%num_states) exit
257 if (opt%nfa%states(state)%num_trans /= 1) exit
258
259 trans = opt%nfa%states(state)%trans(1)
260
261 if (trans%trans_type == TRANS_CHAR) then
262 prefix_len = prefix_len + 1
263 opt%prefix(prefix_len:prefix_len) = trans%match_char
264 state = trans%target
265 else if (trans%trans_type == TRANS_EPSILON) then
266 ! Follow epsilon, but only if it's the only transition
267 state = trans%target
268 else
269 done = .true.
270 end if
271 end do
272
273 opt%prefix_len = prefix_len
274
275 ! Check for end anchor (scan accept state's incoming transitions)
276 ! This is approximate - just check if accept state has anchor transition
277 if (opt%nfa%accept_state >= 1 .and. opt%nfa%accept_state <= opt%nfa%num_states) then
278 ! Check states that point to accept state
279 do state = 1, opt%nfa%num_states
280 do i = 1, opt%nfa%states(state)%num_trans
281 trans = opt%nfa%states(state)%trans(i)
282 if (trans%target == opt%nfa%accept_state) then
283 if (trans%trans_type == TRANS_ANCHOR .and. trans%anchor_type == 2) then
284 opt%anchored_end = .true.
285 end if
286 end if
287 end do
288 end do
289 end if
290
291 end subroutine extract_prefix_and_anchors
292
293 subroutine build_skip_table(prefix, prefix_len, skip_table)
294 character(len=*), intent(in) :: prefix
295 integer, intent(in) :: prefix_len
296 integer, intent(out) :: skip_table(0:255)
297 integer :: i, c
298
299 ! Default skip is prefix length
300 skip_table = prefix_len
301
302 ! Set skip distances for characters in prefix
303 do i = 1, prefix_len - 1
304 c = ichar(prefix(i:i))
305 skip_table(c) = prefix_len - i
306 end do
307 end subroutine build_skip_table
308
309 subroutine precompute_start_closure(opt)
310 type(optimized_nfa_t), intent(inout) :: opt
311 ! Compute basic epsilon closure of start state
312 ! (Some closures depend on position for anchors, handle those at runtime)
313 call opt%start_closure%clear()
314 call compute_epsilon_closure_basic(opt%nfa, opt%nfa%start_state, opt%start_closure)
315 end subroutine precompute_start_closure
316
317 subroutine compute_epsilon_closure_basic(nfa, start_state, result_set)
318 type(nfa_t), intent(in) :: nfa
319 integer, intent(in) :: start_state
320 type(state_set_t), intent(inout) :: result_set
321
322 integer :: stack(MAX_STATES), stack_top
323 integer :: state, i, target
324 type(nfa_transition_t) :: trans
325
326 stack_top = 1
327 stack(1) = start_state
328
329 do while (stack_top > 0)
330 state = stack(stack_top)
331 stack_top = stack_top - 1
332
333 if (state < 1 .or. state > nfa%num_states) cycle
334 if (result_set%contains(state)) cycle
335
336 call result_set%add(state)
337
338 ! Follow epsilon transitions (not anchors - those are position-dependent)
339 do i = 1, nfa%states(state)%num_trans
340 trans = nfa%states(state)%trans(i)
341 if (trans%trans_type == TRANS_EPSILON .and. trans%anchor_type >= 0) then
342 target = trans%target
343 if (target >= 1 .and. target <= nfa%num_states) then
344 if (.not. result_set%contains(target)) then
345 stack_top = stack_top + 1
346 if (stack_top <= MAX_STATES) stack(stack_top) = target
347 end if
348 end if
349 end if
350 end do
351 end do
352 end subroutine compute_epsilon_closure_basic
353
354 subroutine precompute_all_epsilon_closures(opt)
355 !> Pre-compute epsilon closure for every NFA state
356 !> This allows O(1) closure lookup during matching instead of repeated traversal
357 type(optimized_nfa_t), intent(inout) :: opt
358
359 integer :: i, n
360
361 n = opt%nfa%num_states
362 if (n <= 0) return
363
364 ! Allocate epsilon closures array
365 if (allocated(opt%epsilon_closures)) deallocate(opt%epsilon_closures)
366 allocate(opt%epsilon_closures(n))
367
368 ! Compute epsilon closure for each state
369 do i = 1, n
370 call opt%epsilon_closures(i)%clear()
371 call compute_epsilon_closure_basic(opt%nfa, i, opt%epsilon_closures(i))
372 end do
373 end subroutine precompute_all_epsilon_closures
374
375 function has_anchor_transitions(nfa) result(has_anchors)
376 !> Check if NFA has any anchor transitions (position-dependent)
377 !> These include ^, $, \<, \>, \b, \B
378 type(nfa_t), intent(in) :: nfa
379 logical :: has_anchors
380
381 integer :: state, i
382 type(nfa_transition_t) :: trans
383
384 has_anchors = .false.
385
386 do state = 1, nfa%num_states
387 do i = 1, nfa%states(state)%num_trans
388 trans = nfa%states(state)%trans(i)
389 if (trans%trans_type == TRANS_ANCHOR) then
390 has_anchors = .true.
391 return
392 end if
393 end do
394 end do
395 end function has_anchor_transitions
396
397 function has_backref_transitions(nfa) result(has_backrefs)
398 !> Check if NFA has any backreference transitions
399 !> Backrefs are encoded as TRANS_EPSILON with negative anchor_type
400 type(nfa_t), intent(in) :: nfa
401 logical :: has_backrefs
402
403 integer :: state, i
404 type(nfa_transition_t) :: trans
405
406 has_backrefs = .false.
407
408 do state = 1, nfa%num_states
409 do i = 1, nfa%states(state)%num_trans
410 trans = nfa%states(state)%trans(i)
411 if (trans%trans_type == TRANS_EPSILON .and. trans%anchor_type < 0) then
412 has_backrefs = .true.
413 return
414 end if
415 end do
416 end do
417 end function has_backref_transitions
418
419 !---------------------------------------------------------------------------
420 ! Character Equivalence Classes
421 !---------------------------------------------------------------------------
422
423 subroutine compute_equiv_classes(nfa, char_to_class, num_classes)
424 !> Compute character equivalence classes from NFA transitions
425 !> Characters with identical behavior across all NFA states belong to same class
426 !> This reduces DFA transition table from 256 entries to num_classes entries
427 type(nfa_t), intent(in) :: nfa
428 integer, intent(out) :: char_to_class(0:255)
429 integer, intent(out) :: num_classes
430
431 ! Signature for each character: encodes which transitions it triggers
432 ! We use a simple approach: hash the set of (state, target) pairs for each char
433 integer(8) :: char_signature(0:255)
434 integer :: state, i, c, target
435 type(nfa_transition_t) :: trans
436 integer(8) :: sig
437 integer :: class_map(0:255) ! signature hash -> class index
438 logical :: found
439
440 ! Initialize all characters to have signature 0 (no transitions)
441 char_signature = 0_8
442
443 ! Build signature for each character based on NFA transitions
444 do state = 1, nfa%num_states
445 do i = 1, nfa%states(state)%num_trans
446 trans = nfa%states(state)%trans(i)
447 target = trans%target
448
449 select case (trans%trans_type)
450 case (TRANS_CHAR)
451 ! Single character transition
452 c = ichar(trans%match_char)
453 ! Add (state, target) to signature using FNV-1a-like hash
454 char_signature(c) = ieor(char_signature(c), &
455 int(state * 31 + target, 8) * 1099511628211_8)
456
457 case (TRANS_CLASS)
458 ! Character class transition - add to all matching chars
459 do c = 0, 255
460 if (charclass_test(trans%char_bits, char(c))) then
461 char_signature(c) = ieor(char_signature(c), &
462 int(state * 31 + target, 8) * 1099511628211_8)
463 end if
464 end do
465
466 case (TRANS_ANY)
467 ! Dot matches all except newline
468 do c = 0, 255
469 if (c /= 10) then ! Not newline
470 char_signature(c) = ieor(char_signature(c), &
471 int(state * 31 + target, 8) * 1099511628211_8)
472 end if
473 end do
474 end select
475 end do
476 end do
477
478 ! Force each alphabetic character to have a unique signature
479 ! This ensures they get their own equivalence classes, so the case-folding
480 ! code in DFA compilation works correctly (it relies on the class representative
481 ! being alphabetic to compute transitions for both cases)
482 do c = ichar('a'), ichar('z')
483 ! Add unique value to each letter's signature to separate them from non-letters
484 char_signature(c) = ieor(char_signature(c), int(c * 7919 + 1, 8))
485 char_signature(c - 32) = ieor(char_signature(c - 32), int((c - 32) * 7919 + 1, 8))
486 end do
487
488 ! Now group characters by signature
489 num_classes = 0
490 class_map = -1
491 char_to_class = 0
492
493 do c = 0, 255
494 sig = char_signature(c)
495
496 ! Look for existing class with this signature
497 found = .false.
498 do i = 0, num_classes - 1
499 if (class_map(i) /= -1) then
500 ! Check if any character in class i has same signature
501 ! We stored the signature hash as a proxy
502 if (char_signature(class_map(i)) == sig) then
503 char_to_class(c) = i
504 found = .true.
505 exit
506 end if
507 end if
508 end do
509
510 if (.not. found) then
511 ! Create new class
512 char_to_class(c) = num_classes
513 class_map(num_classes) = c ! Remember one char from this class
514 num_classes = num_classes + 1
515 end if
516 end do
517
518 ! Ensure at least one class
519 if (num_classes == 0) num_classes = 1
520
521 end subroutine compute_equiv_classes
522
523 !---------------------------------------------------------------------------
524 ! DFA Compilation: Convert NFA to DFA for O(n) matching
525 !---------------------------------------------------------------------------
526
527 subroutine compile_dfa(opt)
528 !> Compile NFA to DFA using subset construction
529 !> Uses character equivalence classes to reduce compilation time
530 type(optimized_nfa_t), intent(inout) :: opt
531
532 type(state_set_t) :: start_set, next_set
533 integer :: worklist(MAX_DFA_STATES), work_head, work_tail
534 integer :: dfa_idx, char_code, next_idx, old_num_states
535 integer :: class_idx, c
536 integer :: class_representative(0:255) ! One char per class
537 integer :: class_transitions(0:255) ! Computed transition per class
538
539 ! Allocate DFA states
540 if (allocated(opt%dfa%states)) deallocate(opt%dfa%states)
541 allocate(opt%dfa%states(MAX_DFA_STATES))
542 opt%dfa%num_states = 0
543 opt%dfa%compiled = .false.
544 opt%dfa%too_large = .false.
545 opt%use_dfa = .false.
546
547 ! Compute character equivalence classes
548 call compute_equiv_classes(opt%nfa, opt%dfa%char_to_class, opt%dfa%num_classes)
549 opt%dfa%use_equiv_classes = (opt%dfa%num_classes < 256)
550
551 ! Build representative character for each class
552 class_representative = -1
553 do c = 0, 255
554 class_idx = opt%dfa%char_to_class(c)
555 if (class_representative(class_idx) == -1) then
556 class_representative(class_idx) = c
557 end if
558 end do
559
560 ! Compute start state: epsilon closure of NFA start
561 call start_set%clear()
562 call compute_epsilon_closure_basic(opt%nfa, opt%nfa%start_state, start_set)
563
564 if (start_set%is_empty()) return
565
566 ! Create initial DFA state
567 opt%dfa%num_states = 1
568 opt%dfa%states(1)%nfa_states = start_set
569 opt%dfa%states(1)%state_hash = start_set%hash()
570 opt%dfa%states(1)%is_accept = is_accepting_set(opt%nfa, start_set)
571 opt%dfa%start_state = 1
572
573 ! Initialize worklist with start state
574 work_head = 1
575 work_tail = 1
576 worklist(1) = 1
577
578 ! Process worklist: for each DFA state, compute transitions
579 do while (work_head <= work_tail)
580 dfa_idx = worklist(work_head)
581 work_head = work_head + 1
582
583 ! First, compute transitions for each equivalence class (not all 256 chars)
584 class_transitions = DFA_DEAD_STATE
585
586 do class_idx = 0, opt%dfa%num_classes - 1
587 char_code = class_representative(class_idx)
588 if (char_code < 0) cycle
589
590 call next_set%clear()
591
592 ! Compute NFA transitions for this character
593 ! Note: DFA is case-sensitive. Case-insensitive matching uses NFA path.
594 call compute_char_transitions_simple(opt%nfa, opt%dfa%states(dfa_idx)%nfa_states, &
595 char(char_code), next_set)
596
597 ! Compute epsilon closure of result
598 if (.not. next_set%is_empty()) then
599 call expand_epsilon_closure_simple(opt, next_set)
600 end if
601
602 if (next_set%is_empty()) then
603 class_transitions(class_idx) = DFA_DEAD_STATE
604 else
605 ! Find or create DFA state for this NFA state set
606 old_num_states = opt%dfa%num_states
607 next_idx = find_or_create_dfa_state(opt%dfa, next_set, opt%nfa)
608
609 if (next_idx == -1) then
610 opt%dfa%too_large = .true.
611 opt%dfa%compiled = .false.
612 return
613 end if
614
615 class_transitions(class_idx) = next_idx
616
617 ! Add new state to worklist only if it was just created
618 if (opt%dfa%num_states > old_num_states) then
619 work_tail = work_tail + 1
620 if (work_tail > MAX_DFA_STATES) then
621 opt%dfa%too_large = .true.
622 opt%dfa%compiled = .false.
623 return
624 end if
625 worklist(work_tail) = next_idx
626 end if
627 end if
628 end do
629
630 ! Now fill in the full 256-entry transition table from class transitions
631 do c = 0, 255
632 class_idx = opt%dfa%char_to_class(c)
633 opt%dfa%states(dfa_idx)%transitions(c) = class_transitions(class_idx)
634 end do
635 end do
636
637 ! Minimize DFA to reduce state count
638 call minimize_dfa(opt%dfa)
639
640 opt%dfa%compiled = .true.
641 opt%use_dfa = .true.
642
643 end subroutine compile_dfa
644
645 function find_or_create_dfa_state(dfa, nfa_states, nfa) result(idx)
646 !> Find existing DFA state for NFA state set, or create new one
647 !> Returns -1 if DFA state limit exceeded
648 type(compiled_dfa_t), intent(inout) :: dfa
649 type(state_set_t), intent(in) :: nfa_states
650 type(nfa_t), intent(in) :: nfa
651 integer :: idx
652
653 integer(8) :: h
654 integer :: i
655
656 h = nfa_states%hash()
657
658 ! Search existing states
659 do i = 1, dfa%num_states
660 if (dfa%states(i)%state_hash == h .and. &
661 dfa%states(i)%nfa_states%equals(nfa_states)) then
662 idx = i
663 return
664 end if
665 end do
666
667 ! Create new state
668 if (dfa%num_states >= MAX_DFA_STATES) then
669 idx = -1
670 return
671 end if
672
673 dfa%num_states = dfa%num_states + 1
674 idx = dfa%num_states
675 dfa%states(idx)%nfa_states = nfa_states
676 dfa%states(idx)%state_hash = h
677 dfa%states(idx)%is_accept = is_accepting_set(nfa, nfa_states)
678 dfa%states(idx)%transitions = DFA_DEAD_STATE
679
680 end function find_or_create_dfa_state
681
682 subroutine minimize_dfa(dfa)
683 !> Minimize DFA using Hopcroft's algorithm
684 !> Merges equivalent states to reduce DFA size
685 type(compiled_dfa_t), intent(inout) :: dfa
686
687 integer :: num_states, num_partitions
688 integer, allocatable :: partition(:) ! partition(state) = partition ID
689 integer, allocatable :: part_size(:) ! Size of each partition
690 integer, allocatable :: representative(:) ! Representative state for each partition
691 integer, allocatable :: new_state_id(:) ! Mapping from old state to new state ID
692 type(dfa_state_t), allocatable :: new_states(:)
693
694 logical, allocatable :: in_worklist(:) ! Is partition in worklist?
695 integer, allocatable :: worklist(:) ! Partitions to process
696 integer :: work_head, work_tail
697
698 integer :: i, c, state, target, part_id
699 integer :: num_accept, num_reject
700 integer :: old_part, new_part_id
701 logical :: needs_split
702 integer, allocatable :: split_marker(:) ! Which states go to partition A on char c
703 integer :: new_num_states
704
705 num_states = dfa%num_states
706 if (num_states <= 1) return ! Nothing to minimize
707
708 ! Allocate working arrays
709 allocate(partition(num_states))
710 allocate(part_size(num_states))
711 allocate(representative(num_states))
712 allocate(new_state_id(num_states))
713 allocate(in_worklist(num_states))
714 allocate(worklist(num_states))
715 allocate(split_marker(num_states))
716
717 ! Initialize partitions: accepting states = partition 1, non-accepting = partition 2
718 partition = 0
719 part_size = 0
720 num_accept = 0
721 num_reject = 0
722
723 do i = 1, num_states
724 if (dfa%states(i)%is_accept) then
725 partition(i) = 1
726 num_accept = num_accept + 1
727 else
728 partition(i) = 2
729 num_reject = num_reject + 1
730 end if
731 end do
732
733 part_size(1) = num_accept
734 part_size(2) = num_reject
735 num_partitions = 2
736
737 ! Handle edge case: all accepting or all rejecting
738 if (num_accept == 0 .or. num_reject == 0) then
739 num_partitions = 1
740 partition = 1
741 part_size(1) = num_states
742 end if
743
744 ! Initialize worklist with smaller partition (Hopcroft optimization)
745 in_worklist = .false.
746 work_head = 1
747 work_tail = 0
748
749 if (num_partitions == 2) then
750 if (num_accept <= num_reject) then
751 work_tail = 1
752 worklist(1) = 1
753 in_worklist(1) = .true.
754 else
755 work_tail = 1
756 worklist(1) = 2
757 in_worklist(2) = .true.
758 end if
759 end if
760
761 ! Main refinement loop
762 do while (work_head <= work_tail)
763 part_id = worklist(work_head)
764 work_head = work_head + 1
765 in_worklist(part_id) = .false.
766
767 ! For each character, check if this partition splits others
768 do c = 0, 255
769 ! Mark states that transition to partition part_id on character c
770 split_marker = 0
771 do state = 1, num_states
772 target = dfa%states(state)%transitions(c)
773 if (target > 0 .and. target <= num_states) then
774 if (partition(target) == part_id) then
775 split_marker(state) = 1
776 end if
777 end if
778 end do
779
780 ! Check each existing partition for splits
781 do old_part = 1, num_partitions
782 ! Count states in this partition that go to part_id vs don't
783 num_accept = 0 ! Reuse: count going to part_id
784 num_reject = 0 ! Reuse: count not going to part_id
785
786 do state = 1, num_states
787 if (partition(state) == old_part) then
788 if (split_marker(state) == 1) then
789 num_accept = num_accept + 1
790 else
791 num_reject = num_reject + 1
792 end if
793 end if
794 end do
795
796 ! If partition needs splitting (has both types)
797 needs_split = (num_accept > 0 .and. num_reject > 0)
798
799 if (needs_split) then
800 ! Create new partition for the smaller group
801 num_partitions = num_partitions + 1
802 new_part_id = num_partitions
803
804 ! Move the smaller group to new partition
805 if (num_accept <= num_reject) then
806 ! Move states going to part_id to new partition
807 do state = 1, num_states
808 if (partition(state) == old_part .and. split_marker(state) == 1) then
809 partition(state) = new_part_id
810 end if
811 end do
812 part_size(new_part_id) = num_accept
813 part_size(old_part) = num_reject
814 else
815 ! Move states NOT going to part_id to new partition
816 do state = 1, num_states
817 if (partition(state) == old_part .and. split_marker(state) == 0) then
818 partition(state) = new_part_id
819 end if
820 end do
821 part_size(new_part_id) = num_reject
822 part_size(old_part) = num_accept
823 end if
824
825 ! Update worklist
826 if (in_worklist(old_part)) then
827 ! Both halves need to be in worklist
828 work_tail = work_tail + 1
829 worklist(work_tail) = new_part_id
830 in_worklist(new_part_id) = .true.
831 else
832 ! Add smaller partition to worklist
833 if (part_size(new_part_id) <= part_size(old_part)) then
834 work_tail = work_tail + 1
835 worklist(work_tail) = new_part_id
836 in_worklist(new_part_id) = .true.
837 else
838 work_tail = work_tail + 1
839 worklist(work_tail) = old_part
840 in_worklist(old_part) = .true.
841 end if
842 end if
843 end if
844 end do
845 end do
846 end do
847
848 ! Check if minimization actually reduced states
849 if (num_partitions >= num_states) then
850 ! No reduction possible
851 deallocate(partition, part_size, representative, new_state_id)
852 deallocate(in_worklist, worklist, split_marker)
853 return
854 end if
855
856 ! Find representative for each partition (lowest numbered state)
857 representative = 0
858 do state = 1, num_states
859 part_id = partition(state)
860 if (representative(part_id) == 0) then
861 representative(part_id) = state
862 end if
863 end do
864
865 ! Build new state IDs (compact numbering)
866 new_state_id = 0
867 new_num_states = 0
868 do part_id = 1, num_partitions
869 if (representative(part_id) > 0) then
870 new_num_states = new_num_states + 1
871 ! Map all states in this partition to new state ID
872 do state = 1, num_states
873 if (partition(state) == part_id) then
874 new_state_id(state) = new_num_states
875 end if
876 end do
877 end if
878 end do
879
880 ! Build minimized DFA
881 allocate(new_states(new_num_states))
882
883 do part_id = 1, num_partitions
884 state = representative(part_id)
885 if (state == 0) cycle
886
887 i = new_state_id(state)
888 new_states(i)%is_accept = dfa%states(state)%is_accept
889 new_states(i)%state_hash = dfa%states(state)%state_hash
890 new_states(i)%nfa_states = dfa%states(state)%nfa_states
891
892 ! Remap transitions
893 do c = 0, 255
894 target = dfa%states(state)%transitions(c)
895 if (target > 0 .and. target <= num_states) then
896 new_states(i)%transitions(c) = new_state_id(target)
897 else
898 new_states(i)%transitions(c) = DFA_DEAD_STATE
899 end if
900 end do
901 end do
902
903 ! Update DFA with minimized version
904 deallocate(dfa%states)
905 allocate(dfa%states(new_num_states))
906 dfa%states = new_states
907 dfa%start_state = new_state_id(dfa%start_state)
908 dfa%num_states = new_num_states
909
910 ! Cleanup
911 deallocate(partition, part_size, representative, new_state_id)
912 deallocate(in_worklist, worklist, split_marker, new_states)
913
914 end subroutine minimize_dfa
915
916 subroutine compute_char_transitions_simple(nfa, current, c, next_set)
917 !> Compute character transitions without case folding (for DFA compilation)
918 type(nfa_t), intent(in) :: nfa
919 type(state_set_t), intent(in) :: current
920 character(len=1), intent(in) :: c
921 type(state_set_t), intent(inout) :: next_set
922
923 integer :: state, word_idx, bit_idx, i
924 integer(8) :: word, mask
925 type(nfa_transition_t) :: trans
926
927 do word_idx = 1, size(current%bits)
928 word = current%bits(word_idx)
929 if (word == 0) cycle
930
931 do bit_idx = 0, 63
932 mask = ishft(1_8, bit_idx)
933 if (iand(word, mask) /= 0) then
934 state = (word_idx - 1) * 64 + bit_idx + 1
935 if (state > nfa%num_states) cycle
936
937 do i = 1, nfa%states(state)%num_trans
938 trans = nfa%states(state)%trans(i)
939
940 select case (trans%trans_type)
941 case (TRANS_CHAR)
942 if (c == trans%match_char) then
943 call next_set%add(trans%target)
944 end if
945
946 case (TRANS_CLASS)
947 if (trans%char_class(ichar(c)) .neqv. trans%negated) then
948 call next_set%add(trans%target)
949 end if
950
951 case (TRANS_ANY)
952 if (c /= char(10)) then
953 call next_set%add(trans%target)
954 end if
955 end select
956 end do
957 end if
958 end do
959 end do
960 end subroutine compute_char_transitions_simple
961
962 subroutine expand_epsilon_closure_simple(opt, state_set)
963 !> Expand state set to include epsilon closure (in-place)
964 !> Uses pre-computed closures for O(1) lookup per state
965 type(optimized_nfa_t), intent(in) :: opt
966 type(state_set_t), intent(inout) :: state_set
967
968 integer :: word_idx, bit_idx, state, j
969 integer(8) :: word, mask, original_bits(size(state_set%bits))
970
971 ! If no pre-computed closures, fall back to computing on-the-fly
972 if (.not. allocated(opt%epsilon_closures)) then
973 call expand_epsilon_closure_simple_fallback(opt%nfa, state_set)
974 return
975 end if
976
977 ! Save original bits to avoid processing newly added states
978 original_bits = state_set%bits
979
980 ! Expand using pre-computed closures - just OR the bit vectors
981 do word_idx = 1, size(original_bits)
982 word = original_bits(word_idx)
983 if (word == 0) cycle
984
985 do bit_idx = 0, 63
986 mask = ishft(1_8, bit_idx)
987 if (iand(word, mask) /= 0) then
988 state = (word_idx - 1) * 64 + bit_idx + 1
989 if (state >= 1 .and. state <= opt%nfa%num_states) then
990 ! Merge pre-computed epsilon closure using bitwise OR
991 do j = 1, size(state_set%bits)
992 state_set%bits(j) = ior(state_set%bits(j), opt%epsilon_closures(state)%bits(j))
993 end do
994 end if
995 end if
996 end do
997 end do
998 end subroutine expand_epsilon_closure_simple
999
1000 subroutine expand_epsilon_closure_simple_fallback(nfa, state_set)
1001 !> Fallback: compute epsilon closure on-the-fly
1002 type(nfa_t), intent(in) :: nfa
1003 type(state_set_t), intent(inout) :: state_set
1004
1005 type(state_set_t) :: result
1006 integer :: word_idx, bit_idx, state
1007 integer(8) :: word, mask
1008
1009 call result%clear()
1010
1011 do word_idx = 1, size(state_set%bits)
1012 word = state_set%bits(word_idx)
1013 if (word == 0) cycle
1014
1015 do bit_idx = 0, 63
1016 mask = ishft(1_8, bit_idx)
1017 if (iand(word, mask) /= 0) then
1018 state = (word_idx - 1) * 64 + bit_idx + 1
1019 if (state <= nfa%num_states) then
1020 call compute_epsilon_closure_basic(nfa, state, result)
1021 end if
1022 end if
1023 end do
1024 end do
1025
1026 call state_set%copy_from(result)
1027 end subroutine expand_epsilon_closure_simple_fallback
1028
1029 !---------------------------------------------------------------------------
1030 ! Optimized Search: Use prefix to skip positions
1031 !---------------------------------------------------------------------------
1032
1033 function optimized_search(opt, text, ignore_case) result(res)
1034 type(optimized_nfa_t), intent(inout) :: opt
1035 character(len=*), intent(in) :: text
1036 logical, intent(in) :: ignore_case
1037 type(match_result_t) :: res
1038
1039 integer :: text_len, pos, skip
1040 type(match_result_t) :: try_res
1041
1042 res%matched = .false.
1043 text_len = len_trim(text)
1044
1045 ! Fast path: use Aho-Corasick for alternation patterns
1046 ! Only use AC if ignore_case setting matches what was compiled
1047 if (opt%use_aho_corasick) then
1048 if (ignore_case .eqv. opt%ac%ignore_case) then
1049 res = ac_optimized_search(opt%ac, text)
1050 return
1051 end if
1052 end if
1053
1054 if (opt%nfa%num_states == 0) return
1055
1056 ! Backtracking path: use backtracking matcher for patterns with backreferences
1057 if (opt%has_backrefs) then
1058 res = backtrack_search(opt%nfa, text, text_len, ignore_case)
1059 return
1060 end if
1061
1062 ! Fast path: use DFA if available (O(n) matching)
1063 ! DFA is case-sensitive; case-insensitive matching falls through to NFA path
1064 if (opt%use_dfa .and. .not. ignore_case) then
1065 res = dfa_search(opt%dfa, text, text_len)
1066 return
1067 end if
1068
1069 ! Fast path: anchored start - only try position 1
1070 if (opt%anchored_start) then
1071 res = optimized_match(opt, text, 1, ignore_case)
1072 return
1073 end if
1074
1075 ! Use prefix to skip positions (Boyer-Moore style)
1076 if (opt%prefix_len > 0 .and. .not. ignore_case) then
1077 pos = opt%prefix_len
1078 do while (pos <= text_len)
1079 ! Check if prefix matches at this position
1080 if (prefix_matches(text, pos - opt%prefix_len + 1, opt%prefix, opt%prefix_len)) then
1081 ! Try full NFA match from this position
1082 try_res = optimized_match(opt, text, pos - opt%prefix_len + 1, ignore_case)
1083 if (try_res%matched) then
1084 res = try_res
1085 return
1086 end if
1087 pos = pos + 1
1088 else
1089 ! Skip based on mismatched character
1090 skip = opt%skip_table(ichar(text(pos:pos)))
1091 pos = pos + max(skip, 1)
1092 end if
1093 end do
1094 else
1095 ! No prefix optimization - try each position
1096 do pos = 1, text_len + 1
1097 try_res = optimized_match(opt, text, pos, ignore_case)
1098 if (try_res%matched) then
1099 res = try_res
1100 return
1101 end if
1102 end do
1103 end if
1104
1105 end function optimized_search
1106
1107 function dfa_search(dfa, text, text_len) result(res)
1108 !> Fast O(n) DFA-based search
1109 !> Tries each starting position and returns first match
1110 type(compiled_dfa_t), intent(in) :: dfa
1111 character(len=*), intent(in) :: text
1112 integer, intent(in) :: text_len
1113 type(match_result_t) :: res
1114
1115 integer :: start_pos
1116 type(match_result_t) :: try_res
1117
1118 res%matched = .false.
1119
1120 if (.not. dfa%compiled .or. dfa%num_states == 0) return
1121
1122 ! Try each starting position
1123 do start_pos = 1, text_len + 1
1124 try_res = dfa_match(dfa, text, text_len, start_pos)
1125 if (try_res%matched) then
1126 res = try_res
1127 return
1128 end if
1129 end do
1130
1131 end function dfa_search
1132
1133 function dfa_match(dfa, text, text_len, start_pos) result(res)
1134 !> O(n) DFA matching from a specific position
1135 !> Just follows transition table - no state set operations
1136 type(compiled_dfa_t), intent(in) :: dfa
1137 character(len=*), intent(in) :: text
1138 integer, intent(in) :: text_len, start_pos
1139 type(match_result_t) :: res
1140
1141 integer :: state, pos, char_code
1142
1143 res%matched = .false.
1144
1145 if (.not. dfa%compiled) return
1146
1147 state = dfa%start_state
1148 pos = start_pos
1149
1150 ! Check if start state is accepting (empty match)
1151 if (dfa%states(state)%is_accept) then
1152 res%matched = .true.
1153 res%match_start = start_pos
1154 res%match_end = start_pos - 1
1155 end if
1156
1157 ! Process each character
1158 do while (pos <= text_len)
1159 char_code = ichar(text(pos:pos))
1160 state = dfa%states(state)%transitions(char_code)
1161
1162 if (state == DFA_DEAD_STATE) exit
1163
1164 pos = pos + 1
1165
1166 ! Check for acceptance (greedy - find longest)
1167 if (dfa%states(state)%is_accept) then
1168 res%matched = .true.
1169 res%match_start = start_pos
1170 res%match_end = pos - 1
1171 end if
1172 end do
1173
1174 end function dfa_match
1175
1176 function prefix_matches(text, pos, prefix, prefix_len) result(matches)
1177 character(len=*), intent(in) :: text
1178 integer, intent(in) :: pos, prefix_len
1179 character(len=*), intent(in) :: prefix
1180 logical :: matches
1181 integer :: i, text_len
1182
1183 matches = .false.
1184 text_len = len_trim(text)
1185
1186 if (pos < 1 .or. pos + prefix_len - 1 > text_len) return
1187
1188 do i = 1, prefix_len
1189 if (text(pos+i-1:pos+i-1) /= prefix(i:i)) return
1190 end do
1191
1192 matches = .true.
1193 end function prefix_matches
1194
1195 !---------------------------------------------------------------------------
1196 ! Optimized Match: Use bit vectors and caching
1197 !---------------------------------------------------------------------------
1198
1199 function optimized_match(opt, text, start_pos, ignore_case) result(res)
1200 type(optimized_nfa_t), intent(inout) :: opt
1201 character(len=*), intent(in) :: text
1202 integer, intent(in) :: start_pos
1203 logical, intent(in) :: ignore_case
1204 type(match_result_t) :: res
1205
1206 type(state_set_t) :: current, next_set
1207 integer :: pos, text_len
1208 character(len=1) :: c
1209
1210 res%matched = .false.
1211 text_len = len_trim(text)
1212
1213 if (opt%nfa%num_states == 0) return
1214
1215 ! Initialize with epsilon closure of start state (including position-dependent anchors)
1216 call current%clear()
1217 call compute_epsilon_closure_full(opt%nfa, opt%nfa%start_state, current, text, start_pos, text_len)
1218
1219 ! Check if already accepting (empty pattern)
1220 if (is_accepting_set(opt%nfa, current)) then
1221 res%matched = .true.
1222 res%match_start = start_pos
1223 res%match_end = start_pos - 1
1224 return
1225 end if
1226
1227 ! Process each character
1228 pos = start_pos
1229 do while (pos <= text_len .and. .not. current%is_empty())
1230 c = text(pos:pos)
1231
1232 ! Compute next states with DFA caching
1233 call next_set%clear()
1234 call step_with_cache(opt, current, c, pos, text, text_len, ignore_case, next_set)
1235
1236 ! Compute epsilon closure
1237 call current%clear()
1238 call expand_epsilon_closure(opt%nfa, next_set, current, text, pos + 1, text_len)
1239
1240 pos = pos + 1
1241
1242 ! Check for acceptance (greedy - continue to find longest)
1243 if (is_accepting_set(opt%nfa, current)) then
1244 res%matched = .true.
1245 res%match_start = start_pos
1246 res%match_end = pos - 1
1247 end if
1248 end do
1249
1250 end function optimized_match
1251
1252 subroutine compute_epsilon_closure_full(nfa, start_state, result_set, text, pos, text_len)
1253 type(nfa_t), intent(in) :: nfa
1254 integer, intent(in) :: start_state
1255 type(state_set_t), intent(inout) :: result_set
1256 character(len=*), intent(in) :: text
1257 integer, intent(in) :: pos, text_len
1258
1259 integer :: stack(MAX_STATES), stack_top
1260 integer :: state, i, target
1261 type(nfa_transition_t) :: trans
1262
1263 stack_top = 1
1264 stack(1) = start_state
1265
1266 do while (stack_top > 0)
1267 state = stack(stack_top)
1268 stack_top = stack_top - 1
1269
1270 if (state < 1 .or. state > nfa%num_states) cycle
1271 if (result_set%contains(state)) cycle
1272
1273 call result_set%add(state)
1274
1275 do i = 1, nfa%states(state)%num_trans
1276 trans = nfa%states(state)%trans(i)
1277
1278 if (trans%trans_type == TRANS_EPSILON) then
1279 if (trans%anchor_type < 0) cycle ! Skip backrefs
1280 target = trans%target
1281 if (target >= 1 .and. target <= nfa%num_states) then
1282 if (.not. result_set%contains(target)) then
1283 stack_top = stack_top + 1
1284 if (stack_top <= MAX_STATES) stack(stack_top) = target
1285 end if
1286 end if
1287
1288 else if (trans%trans_type == TRANS_ANCHOR) then
1289 if (anchor_matches_opt(trans%anchor_type, text, pos, text_len)) then
1290 target = trans%target
1291 if (target >= 1 .and. target <= nfa%num_states) then
1292 if (.not. result_set%contains(target)) then
1293 stack_top = stack_top + 1
1294 if (stack_top <= MAX_STATES) stack(stack_top) = target
1295 end if
1296 end if
1297 end if
1298 end if
1299 end do
1300 end do
1301 end subroutine compute_epsilon_closure_full
1302
1303 subroutine expand_epsilon_closure(nfa, input_set, result_set, text, pos, text_len)
1304 type(nfa_t), intent(in) :: nfa
1305 type(state_set_t), intent(in) :: input_set
1306 type(state_set_t), intent(inout) :: result_set
1307 character(len=*), intent(in) :: text
1308 integer, intent(in) :: pos, text_len
1309
1310 integer :: state, word_idx, bit_idx
1311 integer(8) :: word, mask
1312
1313 ! Iterate through set bits
1314 do word_idx = 1, size(input_set%bits)
1315 word = input_set%bits(word_idx)
1316 if (word == 0) cycle
1317
1318 do bit_idx = 0, 63
1319 mask = ishft(1_8, bit_idx)
1320 if (iand(word, mask) /= 0) then
1321 state = (word_idx - 1) * 64 + bit_idx + 1
1322 if (state <= nfa%num_states) then
1323 call compute_epsilon_closure_full(nfa, state, result_set, text, pos, text_len)
1324 end if
1325 end if
1326 end do
1327 end do
1328 end subroutine expand_epsilon_closure
1329
1330 subroutine step_with_cache(opt, current, c, pos, text, text_len, ignore_case, next_set)
1331 !> Compute next states with DFA caching
1332 !> Cache key: (state_set_hash, char_code, ignore_case)
1333 !> This avoids recomputing transitions for repeated (state_set, char) pairs
1334 type(optimized_nfa_t), intent(inout) :: opt
1335 type(state_set_t), intent(in) :: current
1336 character(len=1), intent(in) :: c
1337 integer, intent(in) :: pos, text_len
1338 character(len=*), intent(in) :: text
1339 logical, intent(in) :: ignore_case
1340 type(state_set_t), intent(inout) :: next_set
1341
1342 integer(8) :: state_hash
1343 integer :: cache_idx, char_code
1344
1345 ! Compute cache key
1346 state_hash = current%hash()
1347 char_code = ichar(c)
1348
1349 ! Compute cache index (combine hash with char code)
1350 cache_idx = int(mod(abs(ieor(state_hash, int(char_code, 8))), int(DFA_CACHE_SIZE, 8))) + 1
1351
1352 ! Check cache hit (using hash + char + case as key)
1353 ! Note: This may have rare hash collisions, but performance benefit outweighs risk
1354 if (opt%dfa_cache(cache_idx)%valid .and. &
1355 opt%dfa_cache(cache_idx)%state_hash == state_hash .and. &
1356 opt%dfa_cache(cache_idx)%char_code == char_code .and. &
1357 (opt%dfa_cache(cache_idx)%is_case_insensitive .eqv. ignore_case)) then
1358 ! Cache hit - copy cached result
1359 call next_set%copy_from(opt%dfa_cache(cache_idx)%next_states)
1360 return
1361 end if
1362
1363 ! Cache miss - compute transitions
1364 call compute_char_transitions(opt%nfa, current, c, ignore_case, next_set)
1365
1366 ! Store in cache
1367 opt%dfa_cache(cache_idx)%valid = .true.
1368 opt%dfa_cache(cache_idx)%state_hash = state_hash
1369 opt%dfa_cache(cache_idx)%char_code = char_code
1370 opt%dfa_cache(cache_idx)%is_case_insensitive = ignore_case
1371 call opt%dfa_cache(cache_idx)%next_states%copy_from(next_set)
1372
1373 end subroutine step_with_cache
1374
1375 subroutine compute_char_transitions(nfa, current, c, ignore_case, next_set)
1376 !> Compute character transitions without caching (called on cache miss)
1377 type(nfa_t), intent(in) :: nfa
1378 type(state_set_t), intent(in) :: current
1379 character(len=1), intent(in) :: c
1380 logical, intent(in) :: ignore_case
1381 type(state_set_t), intent(inout) :: next_set
1382
1383 integer :: state, word_idx, bit_idx, i
1384 integer(8) :: word, mask
1385 type(nfa_transition_t) :: trans
1386 character(len=1) :: c_lower, match_lower
1387
1388 ! Iterate through current states
1389 do word_idx = 1, size(current%bits)
1390 word = current%bits(word_idx)
1391 if (word == 0) cycle
1392
1393 do bit_idx = 0, 63
1394 mask = ishft(1_8, bit_idx)
1395 if (iand(word, mask) /= 0) then
1396 state = (word_idx - 1) * 64 + bit_idx + 1
1397 if (state > nfa%num_states) cycle
1398
1399 ! Process transitions from this state
1400 do i = 1, nfa%states(state)%num_trans
1401 trans = nfa%states(state)%trans(i)
1402
1403 select case (trans%trans_type)
1404 case (TRANS_CHAR)
1405 if (ignore_case) then
1406 c_lower = to_lower_char(c)
1407 match_lower = to_lower_char(trans%match_char)
1408 if (c_lower == match_lower) then
1409 call next_set%add(trans%target)
1410 end if
1411 else
1412 if (c == trans%match_char) then
1413 call next_set%add(trans%target)
1414 end if
1415 end if
1416
1417 case (TRANS_CLASS)
1418 ! Use fast bitwise character class test
1419 if (ignore_case) then
1420 if (charclass_test_case_insensitive(trans%char_bits, c)) then
1421 call next_set%add(trans%target)
1422 end if
1423 else
1424 if (charclass_test(trans%char_bits, c)) then
1425 call next_set%add(trans%target)
1426 end if
1427 end if
1428
1429 case (TRANS_ANY)
1430 if (c /= char(10)) then
1431 call next_set%add(trans%target)
1432 end if
1433 end select
1434 end do
1435 end if
1436 end do
1437 end do
1438 end subroutine compute_char_transitions
1439
1440 function is_accepting_set(nfa, states) result(res)
1441 type(nfa_t), intent(in) :: nfa
1442 type(state_set_t), intent(in) :: states
1443 logical :: res
1444
1445 integer :: state, word_idx, bit_idx
1446 integer(8) :: word, mask
1447
1448 res = .false.
1449
1450 do word_idx = 1, size(states%bits)
1451 word = states%bits(word_idx)
1452 if (word == 0) cycle
1453
1454 do bit_idx = 0, 63
1455 mask = ishft(1_8, bit_idx)
1456 if (iand(word, mask) /= 0) then
1457 state = (word_idx - 1) * 64 + bit_idx + 1
1458 if (state >= 1 .and. state <= nfa%num_states) then
1459 if (nfa%states(state)%is_accept) then
1460 res = .true.
1461 return
1462 end if
1463 end if
1464 end if
1465 end do
1466 end do
1467 end function is_accepting_set
1468
1469 !---------------------------------------------------------------------------
1470 ! Helper functions
1471 !---------------------------------------------------------------------------
1472
1473 function anchor_matches_opt(anchor_type, text, pos, text_len) result(matches)
1474 integer, intent(in) :: anchor_type
1475 character(len=*), intent(in) :: text
1476 integer, intent(in) :: pos, text_len
1477 logical :: matches
1478
1479 logical :: at_start, at_end, prev_word, curr_word
1480
1481 matches = .false.
1482 at_start = (pos == 1) .or. (pos < 1)
1483 at_end = (pos > text_len)
1484
1485 select case (anchor_type)
1486 case (1) ! ^
1487 if (at_start) then
1488 matches = .true.
1489 else if (pos > 1 .and. pos <= text_len + 1) then
1490 matches = (text(pos-1:pos-1) == char(10))
1491 end if
1492
1493 case (2) ! $
1494 if (at_end) then
1495 matches = .true.
1496 else if (pos >= 1 .and. pos <= text_len) then
1497 matches = (text(pos:pos) == char(10))
1498 end if
1499
1500 case (3) ! \<
1501 prev_word = .false.
1502 curr_word = .false.
1503 if (pos > 1 .and. pos <= text_len + 1) prev_word = is_word_char_opt(text(pos-1:pos-1))
1504 if (pos >= 1 .and. pos <= text_len) curr_word = is_word_char_opt(text(pos:pos))
1505 matches = (.not. prev_word) .and. curr_word
1506
1507 case (4) ! \>
1508 prev_word = .false.
1509 curr_word = .false.
1510 if (pos > 1 .and. pos <= text_len + 1) prev_word = is_word_char_opt(text(pos-1:pos-1))
1511 if (pos >= 1 .and. pos <= text_len) curr_word = is_word_char_opt(text(pos:pos))
1512 matches = prev_word .and. (.not. curr_word)
1513
1514 case (5) ! \b
1515 prev_word = .false.
1516 curr_word = .false.
1517 if (pos > 1 .and. pos <= text_len + 1) prev_word = is_word_char_opt(text(pos-1:pos-1))
1518 if (pos >= 1 .and. pos <= text_len) curr_word = is_word_char_opt(text(pos:pos))
1519 matches = prev_word .neqv. curr_word
1520
1521 case (6) ! \B
1522 prev_word = .false.
1523 curr_word = .false.
1524 if (pos > 1 .and. pos <= text_len + 1) prev_word = is_word_char_opt(text(pos-1:pos-1))
1525 if (pos >= 1 .and. pos <= text_len) curr_word = is_word_char_opt(text(pos:pos))
1526 matches = prev_word .eqv. curr_word
1527 end select
1528 end function anchor_matches_opt
1529
1530 pure function is_word_char_opt(c) result(res)
1531 character(len=1), intent(in) :: c
1532 logical :: res
1533 integer :: ic
1534 ic = ichar(c)
1535 res = (ic >= ichar('a') .and. ic <= ichar('z')) .or. &
1536 (ic >= ichar('A') .and. ic <= ichar('Z')) .or. &
1537 (ic >= ichar('0') .and. ic <= ichar('9')) .or. &
1538 (c == '_')
1539 end function is_word_char_opt
1540
1541 pure function to_lower_char(c) result(lower)
1542 character(len=1), intent(in) :: c
1543 character(len=1) :: lower
1544 integer :: ic
1545 ic = ichar(c)
1546 if (ic >= ichar('A') .and. ic <= ichar('Z')) then
1547 ! ASCII uppercase A-Z -> a-z
1548 lower = char(ic + 32)
1549 else if (ic >= 192 .and. ic <= 214) then
1550 ! Latin-1 uppercase À-Ö (192-214) -> à-ö (224-246)
1551 lower = char(ic + 32)
1552 else if (ic >= 216 .and. ic <= 222) then
1553 ! Latin-1 uppercase Ø-Þ (216-222) -> ø-þ (248-254)
1554 lower = char(ic + 32)
1555 else if (ic >= 128 .and. ic <= 150) then
1556 ! UTF-8 continuation byte for uppercase Latin Extended-A (U+00C0-U+00D6)
1557 ! When preceded by 0xC3, these represent À-Ö, fold to à-ö
1558 lower = char(ic + 32)
1559 else if (ic >= 152 .and. ic <= 158) then
1560 ! UTF-8 continuation byte for uppercase Latin Extended-A (U+00D8-U+00DE)
1561 ! When preceded by 0xC3, these represent Ø-Þ, fold to ø-þ
1562 lower = char(ic + 32)
1563 else
1564 lower = c
1565 end if
1566 end function to_lower_char
1567
1568 function char_in_class_opt(c, char_class, negated, ignore_case) result(res)
1569 character(len=1), intent(in) :: c
1570 logical, intent(in) :: char_class(0:255)
1571 logical, intent(in) :: negated, ignore_case
1572 logical :: res
1573
1574 integer :: ic
1575 character(len=1) :: c_lower, c_upper
1576
1577 ic = ichar(c)
1578 res = char_class(ic)
1579
1580 if (ignore_case .and. .not. res) then
1581 c_lower = to_lower_char(c)
1582 if (c_lower /= c) res = char_class(ichar(c_lower))
1583 if (.not. res) then
1584 ic = ichar(c)
1585 if (ic >= ichar('a') .and. ic <= ichar('z')) then
1586 c_upper = char(ic - 32)
1587 res = char_class(ichar(c_upper))
1588 end if
1589 end if
1590 end if
1591
1592 if (negated) res = .not. res
1593 end function char_in_class_opt
1594
1595 !---------------------------------------------------------------------------
1596 ! Aho-Corasick Integration for Alternation Patterns
1597 !---------------------------------------------------------------------------
1598
1599 subroutine try_build_aho_corasick(opt, pattern, is_ere, ignore_case)
1600 !> Try to build Aho-Corasick automaton for simple alternation patterns
1601 !> Pattern like "foo|bar|baz" with only literal characters and | separators
1602 type(optimized_nfa_t), intent(inout) :: opt
1603 character(len=*), intent(in) :: pattern
1604 logical, intent(in) :: is_ere, ignore_case
1605
1606 character(len=4096), allocatable :: alternatives(:)
1607 integer :: num_alternatives, ierr
1608 logical :: is_simple
1609
1610 allocate(alternatives(1000))
1611
1612 opt%use_aho_corasick = .false.
1613
1614 ! Check if pattern is simple alternation of literals
1615 call parse_simple_alternation(pattern, is_ere, alternatives, num_alternatives, is_simple)
1616
1617 ! DEBUG (commented out for production)
1618 ! write(error_unit, '(A,I0,A,L1)') 'DEBUG AC: num_alt=', num_alternatives, ' is_simple=', is_simple
1619
1620 if (.not. is_simple .or. num_alternatives < 2) return
1621
1622 ! Build Aho-Corasick automaton
1623 call ac_build(opt%ac, alternatives, num_alternatives, ignore_case, ierr)
1624
1625 if (ierr == 0) then
1626 opt%use_aho_corasick = .true.
1627 end if
1628
1629 deallocate(alternatives)
1630
1631 end subroutine try_build_aho_corasick
1632
1633 subroutine parse_simple_alternation(pattern, is_ere, alternatives, num_alt, is_simple)
1634 !> Parse pattern to check if it's simple alternation of literals
1635 !> Returns the alternatives if so
1636 character(len=*), intent(in) :: pattern
1637 logical, intent(in) :: is_ere
1638 character(len=*), intent(out) :: alternatives(:)
1639 integer, intent(out) :: num_alt
1640 logical, intent(out) :: is_simple
1641
1642 integer :: i, pat_len, alt_start, alt_len
1643 character(len=1) :: c, next_c
1644 logical :: in_escape
1645
1646 is_simple = .true.
1647 num_alt = 0
1648 pat_len = pattern_len(pattern) ! Use pattern_len to preserve whitespace patterns
1649
1650 if (pat_len == 0) then
1651 is_simple = .false.
1652 return
1653 end if
1654
1655 alt_start = 1
1656 alt_len = 0
1657 in_escape = .false.
1658 i = 1
1659
1660 do while (i <= pat_len)
1661 c = pattern(i:i)
1662
1663 if (in_escape) then
1664 ! In ERE mode, \| is literal |
1665 ! In BRE mode, \| is alternation (GNU extension)
1666 if (c == '|' .and. .not. is_ere) then
1667 ! BRE alternation
1668 if (alt_len > 0) then
1669 num_alt = num_alt + 1
1670 if (num_alt > size(alternatives)) then
1671 is_simple = .false.
1672 return
1673 end if
1674 alternatives(num_alt) = pattern(alt_start:alt_start+alt_len-1)
1675 ! Add null terminator to preserve exact length
1676 if (alt_len < len(alternatives(num_alt))) then
1677 alternatives(num_alt)(alt_len+1:alt_len+1) = char(0)
1678 end if
1679 else
1680 ! Empty alternative - still valid
1681 num_alt = num_alt + 1
1682 alternatives(num_alt) = char(0)
1683 end if
1684 alt_start = i + 1
1685 alt_len = 0
1686 else if (c == '(' .or. c == ')' .or. c == '{' .or. c == '}' .or. &
1687 c == '<' .or. c == '>' .or. c == 'b' .or. c == 'B' .or. &
1688 c == 'd' .or. c == 'D' .or. c == 'w' .or. c == 'W' .or. &
1689 c == 's' .or. c == 'S' .or. c == '1' .or. c == '2' .or. &
1690 c == '3' .or. c == '4' .or. c == '5' .or. c == '6' .or. &
1691 c == '7' .or. c == '8' .or. c == '9') then
1692 ! Regex metacharacter - not simple
1693 is_simple = .false.
1694 return
1695 else
1696 ! Escaped literal character (e.g., \., \*, etc.)
1697 alt_len = alt_len + 1
1698 end if
1699 in_escape = .false.
1700 i = i + 1
1701 cycle
1702 end if
1703
1704 if (c == '\') then
1705 in_escape = .true.
1706 i = i + 1
1707 cycle
1708 end if
1709
1710 ! Check for metacharacters
1711 if (is_ere) then
1712 ! ERE mode: | is alternation, . * + ? [ ] ^ $ ( ) { } are metacharacters
1713 if (c == '|') then
1714 ! Alternation separator
1715 if (alt_len > 0) then
1716 num_alt = num_alt + 1
1717 if (num_alt > size(alternatives)) then
1718 is_simple = .false.
1719 return
1720 end if
1721 alternatives(num_alt) = pattern(alt_start:alt_start+alt_len-1)
1722 ! Add null terminator to preserve exact length
1723 if (alt_len < len(alternatives(num_alt))) then
1724 alternatives(num_alt)(alt_len+1:alt_len+1) = char(0)
1725 end if
1726 else
1727 num_alt = num_alt + 1
1728 alternatives(num_alt) = char(0)
1729 end if
1730 alt_start = i + 1
1731 alt_len = 0
1732 i = i + 1
1733 cycle
1734 else if (c == '.' .or. c == '*' .or. c == '+' .or. c == '?' .or. &
1735 c == '[' .or. c == ']' .or. c == '^' .or. c == '$' .or. &
1736 c == '(' .or. c == ')' .or. c == '{' .or. c == '}') then
1737 ! Metacharacter - not simple alternation
1738 is_simple = .false.
1739 return
1740 end if
1741 else
1742 ! BRE mode: only . * [ ] ^ $ are metacharacters
1743 ! | is literal, \| is alternation (GNU extension)
1744 if (c == '.' .or. c == '*' .or. c == '[' .or. c == ']' .or. &
1745 c == '^' .or. c == '$') then
1746 is_simple = .false.
1747 return
1748 end if
1749 end if
1750
1751 ! Regular literal character
1752 alt_len = alt_len + 1
1753 i = i + 1
1754 end do
1755
1756 ! Handle last alternative
1757 if (alt_len > 0 .or. num_alt > 0) then
1758 num_alt = num_alt + 1
1759 if (num_alt > size(alternatives)) then
1760 is_simple = .false.
1761 return
1762 end if
1763 if (alt_len > 0) then
1764 alternatives(num_alt) = pattern(alt_start:alt_start+alt_len-1)
1765 ! Add null terminator to preserve exact length
1766 if (alt_len < len(alternatives(num_alt))) then
1767 alternatives(num_alt)(alt_len+1:alt_len+1) = char(0)
1768 end if
1769 else
1770 alternatives(num_alt) = char(0)
1771 end if
1772 end if
1773
1774 ! Need at least 2 alternatives for Aho-Corasick to be useful
1775 if (num_alt < 2) then
1776 is_simple = .false.
1777 end if
1778
1779 end subroutine parse_simple_alternation
1780
1781 function ac_optimized_search(ac, text) result(res)
1782 !> Search using Aho-Corasick automaton
1783 type(ac_automaton_t), intent(in) :: ac
1784 character(len=*), intent(in) :: text
1785 type(match_result_t) :: res
1786
1787 type(ac_match_t) :: ac_match
1788
1789 res%matched = .false.
1790
1791 ac_match = ac_search(ac, text)
1792 if (ac_match%matched) then
1793 res%matched = .true.
1794 res%match_start = ac_match%start_pos
1795 res%match_end = ac_match%end_pos
1796 end if
1797
1798 end function ac_optimized_search
1799
1800 !---------------------------------------------------------------------------
1801 ! Backtracking Matcher for Backreferences
1802 !---------------------------------------------------------------------------
1803
1804 function backtrack_search(nfa, text, text_len, ignore_case) result(res)
1805 !> Search for pattern with backreferences using backtracking
1806 !> Tries each starting position until a match is found
1807 type(nfa_t), intent(in) :: nfa
1808 character(len=*), intent(in) :: text
1809 integer, intent(in) :: text_len
1810 logical, intent(in) :: ignore_case
1811 type(match_result_t) :: res
1812
1813 integer :: start_pos
1814 type(match_result_t) :: try_res
1815
1816 res%matched = .false.
1817
1818 do start_pos = 1, text_len + 1
1819 try_res = backtrack_match(nfa, text, text_len, start_pos, ignore_case)
1820 if (try_res%matched) then
1821 res = try_res
1822 return
1823 end if
1824 end do
1825
1826 end function backtrack_search
1827
1828 function backtrack_match(nfa, text, text_len, start_pos, ignore_case) result(res)
1829 !> Try to match NFA with backreferences starting at start_pos
1830 !> Uses recursive backtracking to track group captures
1831 type(nfa_t), intent(in) :: nfa
1832 character(len=*), intent(in) :: text
1833 integer, intent(in) :: text_len, start_pos
1834 logical, intent(in) :: ignore_case
1835 type(match_result_t) :: res
1836
1837 integer :: group_starts(9), group_ends(9)
1838 integer :: best_end
1839
1840 res%matched = .false.
1841 group_starts = 0
1842 group_ends = 0
1843 best_end = start_pos - 1
1844
1845 ! Try to match from the start state
1846 if (backtrack_from_state(nfa, nfa%start_state, text, text_len, start_pos, &
1847 ignore_case, group_starts, group_ends, best_end)) then
1848 res%matched = .true.
1849 res%match_start = start_pos
1850 res%match_end = best_end
1851 res%group_starts = group_starts
1852 res%group_ends = group_ends
1853 end if
1854
1855 end function backtrack_match
1856
1857 recursive function backtrack_from_state(nfa, state, text, text_len, pos, &
1858 ignore_case, group_starts, group_ends, best_end) result(matched)
1859 !> Recursive backtracking from a given NFA state
1860 !> Returns true if we can reach an accepting state
1861 type(nfa_t), intent(in) :: nfa
1862 integer, intent(in) :: state
1863 character(len=*), intent(in) :: text
1864 integer, intent(in) :: text_len, pos
1865 logical, intent(in) :: ignore_case
1866 integer, intent(inout) :: group_starts(9), group_ends(9)
1867 integer, intent(inout) :: best_end
1868 logical :: matched
1869
1870 integer :: i, target, old_start, old_end
1871 integer :: backref_num, ref_start, ref_end, ref_len
1872 integer :: saved_starts(9), saved_ends(9)
1873 type(nfa_transition_t) :: trans
1874 character(len=1) :: c, c_lower, match_lower
1875 logical :: char_matches
1876
1877 matched = .false.
1878
1879 if (state < 1 .or. state > nfa%num_states) return
1880
1881 ! Record group start if this state starts a group
1882 if (nfa%states(state)%group_start > 0 .and. nfa%states(state)%group_start <= 9) then
1883 old_start = group_starts(nfa%states(state)%group_start)
1884 group_starts(nfa%states(state)%group_start) = pos
1885 else
1886 old_start = 0
1887 end if
1888
1889 ! Record group end if this state ends a group
1890 ! This must be done BEFORE processing transitions so backrefs can see the captured text
1891 if (nfa%states(state)%group_end > 0 .and. nfa%states(state)%group_end <= 9) then
1892 old_end = group_ends(nfa%states(state)%group_end)
1893 group_ends(nfa%states(state)%group_end) = pos - 1
1894 else
1895 old_end = 0
1896 end if
1897
1898 ! Check if this is an accepting state
1899 if (nfa%states(state)%is_accept) then
1900 matched = .true.
1901 if (pos - 1 > best_end) best_end = pos - 1
1902 ! Continue to find longest match (greedy)
1903 end if
1904
1905 ! Try each transition from this state
1906 do i = 1, nfa%states(state)%num_trans
1907 trans = nfa%states(state)%trans(i)
1908 target = trans%target
1909
1910 select case (trans%trans_type)
1911 case (TRANS_EPSILON)
1912 ! Check for backreference (negative anchor_type)
1913 if (trans%anchor_type < 0) then
1914 backref_num = -trans%anchor_type
1915 if (backref_num >= 1 .and. backref_num <= 9) then
1916 ref_start = group_starts(backref_num)
1917 ref_end = group_ends(backref_num)
1918
1919 ! If group hasn't been captured yet, backreference fails
1920 if (ref_start == 0 .or. ref_end == 0 .or. ref_end < ref_start) cycle
1921
1922 ref_len = ref_end - ref_start + 1
1923
1924 ! Check if we have enough text remaining
1925 if (pos + ref_len - 1 > text_len) cycle
1926
1927 ! Check if the text matches the captured group
1928 if (ignore_case) then
1929 if (.not. strings_equal_icase(text(pos:pos+ref_len-1), &
1930 text(ref_start:ref_end))) cycle
1931 else
1932 if (text(pos:pos+ref_len-1) /= text(ref_start:ref_end)) cycle
1933 end if
1934
1935 ! Backref matches - continue from target with advanced position
1936 saved_starts = group_starts
1937 saved_ends = group_ends
1938 if (backtrack_from_state(nfa, target, text, text_len, pos + ref_len, &
1939 ignore_case, group_starts, group_ends, best_end)) then
1940 matched = .true.
1941 else
1942 group_starts = saved_starts
1943 group_ends = saved_ends
1944 end if
1945 end if
1946 else
1947 ! Regular epsilon transition
1948 saved_starts = group_starts
1949 saved_ends = group_ends
1950 if (backtrack_from_state(nfa, target, text, text_len, pos, &
1951 ignore_case, group_starts, group_ends, best_end)) then
1952 matched = .true.
1953 else
1954 group_starts = saved_starts
1955 group_ends = saved_ends
1956 end if
1957 end if
1958
1959 case (TRANS_ANCHOR)
1960 ! Check if anchor matches at this position
1961 if (anchor_matches_opt(trans%anchor_type, text, pos, text_len)) then
1962 saved_starts = group_starts
1963 saved_ends = group_ends
1964 if (backtrack_from_state(nfa, target, text, text_len, pos, &
1965 ignore_case, group_starts, group_ends, best_end)) then
1966 matched = .true.
1967 else
1968 group_starts = saved_starts
1969 group_ends = saved_ends
1970 end if
1971 end if
1972
1973 case (TRANS_CHAR)
1974 ! Character transition - need text available
1975 if (pos <= text_len) then
1976 c = text(pos:pos)
1977 char_matches = .false.
1978
1979 if (ignore_case) then
1980 c_lower = to_lower_char(c)
1981 match_lower = to_lower_char(trans%match_char)
1982 char_matches = (c_lower == match_lower)
1983 else
1984 char_matches = (c == trans%match_char)
1985 end if
1986
1987 if (char_matches) then
1988 saved_starts = group_starts
1989 saved_ends = group_ends
1990 if (backtrack_from_state(nfa, target, text, text_len, pos + 1, &
1991 ignore_case, group_starts, group_ends, best_end)) then
1992 matched = .true.
1993 else
1994 group_starts = saved_starts
1995 group_ends = saved_ends
1996 end if
1997 end if
1998 end if
1999
2000 case (TRANS_CLASS)
2001 ! Character class transition
2002 if (pos <= text_len) then
2003 c = text(pos:pos)
2004 if (ignore_case) then
2005 if (charclass_test_case_insensitive(trans%char_bits, c)) then
2006 saved_starts = group_starts
2007 saved_ends = group_ends
2008 if (backtrack_from_state(nfa, target, text, text_len, pos + 1, &
2009 ignore_case, group_starts, group_ends, best_end)) then
2010 matched = .true.
2011 else
2012 group_starts = saved_starts
2013 group_ends = saved_ends
2014 end if
2015 end if
2016 else
2017 if (charclass_test(trans%char_bits, c)) then
2018 saved_starts = group_starts
2019 saved_ends = group_ends
2020 if (backtrack_from_state(nfa, target, text, text_len, pos + 1, &
2021 ignore_case, group_starts, group_ends, best_end)) then
2022 matched = .true.
2023 else
2024 group_starts = saved_starts
2025 group_ends = saved_ends
2026 end if
2027 end if
2028 end if
2029 end if
2030
2031 case (TRANS_ANY)
2032 ! Dot matches any character except newline
2033 if (pos <= text_len) then
2034 if (text(pos:pos) /= char(10)) then
2035 saved_starts = group_starts
2036 saved_ends = group_ends
2037 if (backtrack_from_state(nfa, target, text, text_len, pos + 1, &
2038 ignore_case, group_starts, group_ends, best_end)) then
2039 matched = .true.
2040 else
2041 group_starts = saved_starts
2042 group_ends = saved_ends
2043 end if
2044 end if
2045 end if
2046
2047 end select
2048 end do
2049
2050 ! Restore group start and end if we didn't match
2051 if (.not. matched) then
2052 if (nfa%states(state)%group_start > 0 .and. nfa%states(state)%group_start <= 9) then
2053 group_starts(nfa%states(state)%group_start) = old_start
2054 end if
2055 if (nfa%states(state)%group_end > 0 .and. nfa%states(state)%group_end <= 9) then
2056 group_ends(nfa%states(state)%group_end) = old_end
2057 end if
2058 end if
2059
2060 end function backtrack_from_state
2061
2062 function strings_equal_icase(s1, s2) result(equal)
2063 !> Compare two strings case-insensitively
2064 character(len=*), intent(in) :: s1, s2
2065 logical :: equal
2066 integer :: i, n
2067
2068 equal = .false.
2069 n = len(s1)
2070 if (len(s2) /= n) return
2071
2072 do i = 1, n
2073 if (to_lower_char(s1(i:i)) /= to_lower_char(s2(i:i))) return
2074 end do
2075
2076 equal = .true.
2077 end function strings_equal_icase
2078
2079 end module regex_optimizer
2080