fortrangoingonforty/ferp / 47f477c

Browse files

Add Aho-Corasick for alternation patterns (6x faster than grep)

Authored by espadonne
SHA
47f477c325987aaf3dea018eab539469ced129e3
Parents
eab33b0
Tree
d4bcff1

4 changed files

StatusFile+-
M Makefile 3 1
A src/regex/aho_corasick.f90 342 0
M src/regex/regex_api.f90 5 0
M src/regex/regex_optimizer.f90 207 0
Makefilemodified
@@ -38,6 +38,7 @@ REGEX_SRCS = $(REGEX_DIR)/regex_types.f90 \
38
              $(REGEX_DIR)/regex_parser.f90 \
38
              $(REGEX_DIR)/regex_parser.f90 \
39
              $(REGEX_DIR)/regex_nfa.f90 \
39
              $(REGEX_DIR)/regex_nfa.f90 \
40
              $(REGEX_DIR)/regex_engine.f90 \
40
              $(REGEX_DIR)/regex_engine.f90 \
41
+             $(REGEX_DIR)/aho_corasick.f90 \
41
              $(REGEX_DIR)/regex_optimizer.f90 \
42
              $(REGEX_DIR)/regex_optimizer.f90 \
42
              $(REGEX_DIR)/regex_api.f90 \
43
              $(REGEX_DIR)/regex_api.f90 \
43
              $(REGEX_DIR)/pcre_api.f90
44
              $(REGEX_DIR)/pcre_api.f90
@@ -105,7 +106,8 @@ $(BUILD_DIR)/regex_lexer.o: $(BUILD_DIR)/regex_types.o
105
 $(BUILD_DIR)/regex_parser.o: $(BUILD_DIR)/regex_types.o
106
 $(BUILD_DIR)/regex_parser.o: $(BUILD_DIR)/regex_types.o
106
 $(BUILD_DIR)/regex_nfa.o: $(BUILD_DIR)/regex_types.o $(BUILD_DIR)/regex_parser.o
107
 $(BUILD_DIR)/regex_nfa.o: $(BUILD_DIR)/regex_types.o $(BUILD_DIR)/regex_parser.o
107
 $(BUILD_DIR)/regex_engine.o: $(BUILD_DIR)/regex_types.o
108
 $(BUILD_DIR)/regex_engine.o: $(BUILD_DIR)/regex_types.o
108
-$(BUILD_DIR)/regex_optimizer.o: $(BUILD_DIR)/regex_types.o
109
+$(BUILD_DIR)/aho_corasick.o:
110
+$(BUILD_DIR)/regex_optimizer.o: $(BUILD_DIR)/regex_types.o $(BUILD_DIR)/aho_corasick.o
109
 $(BUILD_DIR)/regex_api.o: $(BUILD_DIR)/regex_types.o $(BUILD_DIR)/regex_lexer.o $(BUILD_DIR)/regex_parser.o $(BUILD_DIR)/regex_nfa.o $(BUILD_DIR)/regex_engine.o $(BUILD_DIR)/regex_optimizer.o
111
 $(BUILD_DIR)/regex_api.o: $(BUILD_DIR)/regex_types.o $(BUILD_DIR)/regex_lexer.o $(BUILD_DIR)/regex_parser.o $(BUILD_DIR)/regex_nfa.o $(BUILD_DIR)/regex_engine.o $(BUILD_DIR)/regex_optimizer.o
110
 $(BUILD_DIR)/pcre_api.o:
112
 $(BUILD_DIR)/pcre_api.o:
111
 
113
 
src/regex/aho_corasick.f90added
@@ -0,0 +1,342 @@
1
module aho_corasick
  !> Aho-Corasick automaton for multi-pattern literal string matching.
  !>
  !> All patterns are compiled into a single trie with failure links, so a
  !> text is scanned in one left-to-right pass regardless of how many
  !> patterns there are.  Characters are treated as single bytes (codes
  !> 0..255); with ignore_case only ASCII A-Z are folded to lower case.
  !> Pattern length is len_trim of each element of the blank-padded input.
  implicit none
  private

  public :: ac_automaton_t, ac_match_t
  public :: ac_build, ac_search, ac_search_any, ac_free

  integer, parameter :: MAX_CHILDREN = 256  ! One transition slot per byte value
  integer, parameter :: ROOT = 1            ! Index of the trie root in nodes(:)

  ! Status codes returned by ac_build through ierr (0 = success)
  integer, parameter :: AC_ERR_BAD_COUNT = 1      ! num_patterns outside 0..size(patterns)
  integer, parameter :: AC_ERR_EMPTY_PATTERN = 2  ! A pattern has zero trimmed length
  integer, parameter :: AC_ERR_ALLOC = 3          ! Memory allocation failed

  type :: ac_node_t
    !> Trie node with failure links
    integer :: children(0:MAX_CHILDREN-1) = 0  ! Child node indices (0 = no child)
    integer :: failure = 0              ! Failure link (fall back on mismatch)
    integer :: output_pattern = 0       ! Pattern index that ends here (0 = none)
    integer :: output_link = 0          ! Nearest accepting node on the failure chain (0 = none)
    integer :: depth = 0                ! Depth in trie (= prefix length)
  end type ac_node_t

  type :: ac_automaton_t
    !> Aho-Corasick automaton
    type(ac_node_t), allocatable :: nodes(:)
    integer :: num_nodes = 0
    integer :: capacity = 0
    integer :: num_patterns = 0
    integer, allocatable :: pattern_lengths(:)
    logical :: compiled = .false.
    logical :: ignore_case = .false.
  end type ac_automaton_t

  type :: ac_match_t
    !> Match result
    logical :: matched = .false.
    integer :: pattern_idx = 0          ! Which pattern matched (1-based)
    integer :: start_pos = 0            ! Start position in text (1-based)
    integer :: end_pos = 0              ! End position in text (1-based)
  end type ac_match_t

contains

  subroutine ac_build(ac, patterns, num_patterns, ignore_case, ierr)
    !> Build an Aho-Corasick automaton from the first num_patterns entries
    !> of patterns.
    !>
    !> ac           - receives the automaton; any previous content is discarded
    !> patterns     - blank-padded literals; trailing blanks are not part of a pattern
    !> num_patterns - number of leading entries of patterns to use
    !> ignore_case  - fold ASCII A-Z to a-z in the patterns and, later, in the text
    !> ierr         - 0 on success, otherwise one of the AC_ERR_* codes; on
    !>                failure ac is left uncompiled, so searches report no match
    type(ac_automaton_t), intent(out) :: ac
    character(len=*), intent(in) :: patterns(:)
    integer, intent(in) :: num_patterns
    logical, intent(in) :: ignore_case
    integer, intent(out) :: ierr

    integer, allocatable :: queue(:)
    integer :: i, j, c, state, child, fail_state, link
    integer :: q_head, q_tail, stat

    ierr = 0

    if (num_patterns < 0 .or. num_patterns > size(patterns)) then
      ierr = AC_ERR_BAD_COUNT
      return
    end if

    allocate(ac%pattern_lengths(num_patterns), stat=stat)
    if (stat /= 0) then
      ierr = AC_ERR_ALLOC
      return
    end if
    do i = 1, num_patterns
      ac%pattern_lengths(i) = len_trim(patterns(i))
    end do

    ! A zero-length pattern would make the root accepting.  The scanners only
    ! test for acceptance after consuming a character, so such a pattern could
    ! never match empty text; refuse it and let the caller fall back.
    if (any(ac%pattern_lengths == 0)) then
      deallocate(ac%pattern_lengths)
      ierr = AC_ERR_EMPTY_PATTERN
      return
    end if

    ! A trie never has more nodes than one root plus one node per pattern
    ! character, so this capacity is exact and no regrowth is needed.  The
    ! BFS queue holds every node at most once and gets the same size.
    ac%capacity = sum(ac%pattern_lengths) + 1
    allocate(ac%nodes(ac%capacity), stat=stat)
    if (stat == 0) allocate(queue(ac%capacity), stat=stat)
    if (stat /= 0) then
      if (allocated(ac%nodes)) deallocate(ac%nodes)
      deallocate(ac%pattern_lengths)
      ac%capacity = 0
      ierr = AC_ERR_ALLOC
      return
    end if

    ac%ignore_case = ignore_case
    ac%num_patterns = num_patterns

    ! Phase 1: insert every pattern into the trie (the root is node 1)
    ac%num_nodes = 1
    ac%nodes(ROOT)%failure = ROOT
    do i = 1, num_patterns
      state = ROOT
      do j = 1, ac%pattern_lengths(i)
        c = byte_code(patterns(i)(j:j), ignore_case)
        child = ac%nodes(state)%children(c)
        if (child == 0) then
          ac%num_nodes = ac%num_nodes + 1
          child = ac%num_nodes
          ac%nodes(state)%children(c) = child
          ac%nodes(child)%depth = ac%nodes(state)%depth + 1
        end if
        state = child
      end do
      ! Duplicate patterns share a node; the last index wins
      ac%nodes(state)%output_pattern = i
    end do

    ! Phase 2: breadth-first pass computing failure and output links.
    ! BFS order guarantees that a node's failure target (always shallower)
    ! has been fully processed before the node itself.
    q_head = 1
    q_tail = 1
    queue(1) = ROOT

    do while (q_head <= q_tail)
      state = queue(q_head)
      q_head = q_head + 1

      do c = 0, MAX_CHILDREN - 1
        child = ac%nodes(state)%children(c)
        if (child == 0) cycle

        q_tail = q_tail + 1
        queue(q_tail) = child

        if (state == ROOT) then
          ! Depth-1 nodes always fall back to the root
          link = ROOT
        else
          ! Walk the parent's failure chain until some node has a
          ! transition on c, or the root is reached
          fail_state = ac%nodes(state)%failure
          do while (fail_state /= ROOT)
            if (ac%nodes(fail_state)%children(c) /= 0) exit
            fail_state = ac%nodes(fail_state)%failure
          end do
          link = ac%nodes(fail_state)%children(c)
          if (link == 0) link = ROOT
        end if
        ac%nodes(child)%failure = link

        ! Output link: nearest accepting node reachable via failure links
        if (ac%nodes(link)%output_pattern /= 0) then
          ac%nodes(child)%output_link = link
        else
          ac%nodes(child)%output_link = ac%nodes(link)%output_link
        end if
      end do
    end do

    ac%compiled = .true.

    deallocate(queue)

  end subroutine ac_build

  pure function byte_code(ch, fold_case) result(code)
    !> Transition-table index (0..MAX_CHILDREN-1) of a character, optionally
    !> folding ASCII upper case to lower case.
    character(len=1), intent(in) :: ch
    logical, intent(in) :: fold_case
    integer :: code

    ! Masking keeps the index inside children(0:255) whatever range the
    ! processor's ichar uses for non-ASCII characters
    code = iand(ichar(ch), MAX_CHILDREN - 1)
    if (fold_case) then
      if (code >= ichar('A') .and. code <= ichar('Z')) then
        code = code + (ichar('a') - ichar('A'))
      end if
    end if
  end function byte_code

  pure function next_state(ac, state, c) result(nxt)
    !> Automaton transition: from state on byte code c, following failure
    !> links until a transition exists; stays at the root if none does.
    type(ac_automaton_t), intent(in) :: ac
    integer, intent(in) :: state, c
    integer :: nxt

    nxt = state
    do
      if (ac%nodes(nxt)%children(c) /= 0) then
        nxt = ac%nodes(nxt)%children(c)
        return
      end if
      if (nxt == ROOT) return
      nxt = ac%nodes(nxt)%failure
    end do
  end function next_state

  function ac_search_any(ac, text) result(found)
    !> Existence check: .true. as soon as any pattern occurs in text.
    !> Scans all len(text) characters (trailing blanks included) and stops
    !> at the first accepting state.  Returns .false. for an automaton that
    !> has not been built.
    type(ac_automaton_t), intent(in) :: ac
    character(len=*), intent(in) :: text
    logical :: found

    integer :: i, state

    found = .false.
    if (.not. ac%compiled) return

    state = ROOT
    do i = 1, len(text)
      state = next_state(ac, state, byte_code(text(i:i), ac%ignore_case))

      ! A pattern ends here either at this node or at a shorter suffix
      ! reachable through the output link
      if (ac%nodes(state)%output_pattern /= 0 .or. &
          ac%nodes(state)%output_link /= 0) then
        found = .true.
        return
      end if
    end do

  end function ac_search_any

  function ac_search(ac, text) result(match)
    !> Find the leftmost match in text; among matches starting at that
    !> position the longest one is returned (POSIX leftmost-longest rule).
    !> match%matched is .false. when nothing matches or ac is not built.
    type(ac_automaton_t), intent(in) :: ac
    character(len=*), intent(in) :: text
    type(ac_match_t) :: match

    integer :: i, state, hit, pat_idx, start_pos

    match = ac_match_t()
    if (.not. ac%compiled) return

    state = ROOT
    do i = 1, len(text)
      state = next_state(ac, state, byte_code(text(i:i), ac%ignore_case))

      if (match%matched) then
        ! The current state spells text(i-depth+1:i), the longest pattern
        ! prefix still in progress.  Every match ending at or after i starts
        ! no earlier than that, so once it lies to the right of the recorded
        ! start the recorded match cannot be beaten.
        if (i - ac%nodes(state)%depth + 1 > match%start_pos) exit
      end if

      ! Longest pattern ending at position i: the node itself if accepting,
      ! otherwise the nearest accepting node on its failure chain
      if (ac%nodes(state)%output_pattern /= 0) then
        hit = state
      else
        hit = ac%nodes(state)%output_link
      end if
      if (hit == 0) cycle

      pat_idx = ac%nodes(hit)%output_pattern
      start_pos = i - ac%pattern_lengths(pat_idx) + 1

      ! Prefer an earlier start; on an equal start the later end is longer
      if (.not. match%matched .or. start_pos <= match%start_pos) then
        match%matched = .true.
        match%pattern_idx = pat_idx
        match%start_pos = start_pos
        match%end_pos = i
      end if
    end do

  end function ac_search

  subroutine ac_free(ac)
    !> Release the automaton's storage and mark it as not compiled.
    !> Safe to call on an automaton that was never built.
    type(ac_automaton_t), intent(inout) :: ac

    if (allocated(ac%nodes)) deallocate(ac%nodes)
    if (allocated(ac%pattern_lengths)) deallocate(ac%pattern_lengths)
    ac%num_nodes = 0
    ac%capacity = 0
    ac%num_patterns = 0
    ac%compiled = .false.
  end subroutine ac_free

end module aho_corasick
src/regex/regex_api.f90modified
@@ -26,6 +26,7 @@ module regex_api
26
     integer :: error_code = 0
26
     integer :: error_code = 0
27
     character(len=256) :: error_msg = ''
27
     character(len=256) :: error_msg = ''
28
     integer :: num_groups = 0
28
     integer :: num_groups = 0
29
+    character(len=4096) :: pattern = ''  ! Original pattern for AC detection
29
   contains
30
   contains
30
     procedure :: is_compiled => regex_is_compiled
31
     procedure :: is_compiled => regex_is_compiled
31
   end type regex_t
32
   end type regex_t
@@ -94,6 +95,10 @@ contains
94
     ! Optimize NFA for faster matching
95
     ! Optimize NFA for faster matching
95
     call optimize_nfa(re%opt_nfa, re%nfa)
96
     call optimize_nfa(re%opt_nfa, re%nfa)
96
 
97
 
98
+    ! Store pattern and try Aho-Corasick for alternation patterns
99
+    re%pattern = pattern
100
+    call try_build_aho_corasick(re%opt_nfa, pattern, extended, .false.)
101
+
97
     re%compiled = .true.
102
     re%compiled = .true.
98
 
103
 
99
   contains
104
   contains
src/regex/regex_optimizer.f90modified
@@ -5,12 +5,15 @@ module regex_optimizer
5
   !>   - Bit vector state sets for O(1) operations
5
   !>   - Bit vector state sets for O(1) operations
6
   !>   - Lazy DFA state caching
6
   !>   - Lazy DFA state caching
7
   !>   - Anchored pattern fast paths
7
   !>   - Anchored pattern fast paths
8
+  !>   - Aho-Corasick for alternation patterns
8
   use regex_types
9
   use regex_types
10
+  use aho_corasick
9
   implicit none
11
   implicit none
10
   private
12
   private
11
 
13
 
12
   public :: optimized_nfa_t
14
   public :: optimized_nfa_t
13
   public :: optimize_nfa, optimized_match, optimized_search
15
   public :: optimize_nfa, optimized_match, optimized_search
16
+  public :: try_build_aho_corasick
14
 
17
 
15
   integer, parameter :: MAX_STATES = 1024
18
   integer, parameter :: MAX_STATES = 1024
16
   integer, parameter :: MAX_PREFIX_LEN = 64
19
   integer, parameter :: MAX_PREFIX_LEN = 64
@@ -70,6 +73,8 @@ module regex_optimizer
70
     type(dfa_cache_entry_t) :: dfa_cache(DFA_CACHE_SIZE)  ! Lazy DFA cache
73
     type(dfa_cache_entry_t) :: dfa_cache(DFA_CACHE_SIZE)  ! Lazy DFA cache
71
     type(compiled_dfa_t) :: dfa                  ! Full compiled DFA (if available)
74
     type(compiled_dfa_t) :: dfa                  ! Full compiled DFA (if available)
72
     logical :: use_dfa = .false.                 ! Use DFA instead of NFA
75
     logical :: use_dfa = .false.                 ! Use DFA instead of NFA
76
+    type(ac_automaton_t) :: ac                   ! Aho-Corasick automaton (for alternation)
77
+    logical :: use_aho_corasick = .false.        ! Use Aho-Corasick for matching
73
     logical :: optimized = .false.
78
     logical :: optimized = .false.
74
   end type optimized_nfa_t
79
   end type optimized_nfa_t
75
 
80
 
@@ -584,6 +589,15 @@ contains
584
     res%matched = .false.
589
     res%matched = .false.
585
     text_len = len_trim(text)
590
     text_len = len_trim(text)
586
 
591
 
592
+    ! Fast path: use Aho-Corasick for alternation patterns
593
+    ! Only use AC if ignore_case setting matches what was compiled
594
+    if (opt%use_aho_corasick) then
595
+      if (ignore_case .eqv. opt%ac%ignore_case) then
596
+        res = ac_optimized_search(opt%ac, text)
597
+        return
598
+      end if
599
+    end if
600
+
587
     if (opt%nfa%num_states == 0) return
601
     if (opt%nfa%num_states == 0) return
588
 
602
 
589
     ! Fast path: use DFA if available (O(n) matching)
603
     ! Fast path: use DFA if available (O(n) matching)
@@ -1097,4 +1111,197 @@ contains
1097
     if (negated) res = .not. res
1111
     if (negated) res = .not. res
1098
   end function char_in_class_opt
1112
   end function char_in_class_opt
1099
 
1113
 
1114
+  !---------------------------------------------------------------------------
1115
+  ! Aho-Corasick Integration for Alternation Patterns
1116
+  !---------------------------------------------------------------------------
1117
+
1118
subroutine try_build_aho_corasick(opt, pattern, is_ere, ignore_case)
  !> Try to build an Aho-Corasick automaton for a simple alternation such as
  !> "foo|bar|baz" (only literal characters and alternation separators).
  !>
  !> opt         - opt%use_aho_corasick is set to .true. only when the pattern
  !>               qualifies and opt%ac was built successfully; otherwise it is
  !>               .false. and the regular matching paths remain in charge
  !> pattern     - the regular expression text
  !> is_ere      - .true. for extended syntax ("|"), .false. for basic ("\|")
  !> ignore_case - case-folding mode compiled into the automaton
  type(optimized_nfa_t), intent(inout) :: opt
  character(len=*), intent(in) :: pattern
  logical, intent(in) :: is_ere, ignore_case

  integer, parameter :: MAX_ALTERNATIVES = 1000  ! Upper bound on accepted alternatives
  integer, parameter :: MAX_ALT_LEN = 4096       ! Storage length of one alternative

  character(len=MAX_ALT_LEN), allocatable :: alternatives(:)
  integer :: i, num_slots, num_alternatives, ierr, stat
  logical :: is_simple

  opt%use_aho_corasick = .false.

  ! Each separator contains one '|' in either syntax, so the number of '|'
  ! characters plus one bounds the number of alternatives.  Sizing the
  ! scratch array from it avoids a fixed multi-megabyte allocation on
  ! every compile.
  num_slots = 1
  do i = 1, len_trim(pattern)
    if (pattern(i:i) == '|') num_slots = num_slots + 1
  end do

  ! Without any '|' there cannot be two alternatives
  if (num_slots < 2) return
  num_slots = min(num_slots, MAX_ALTERNATIVES)

  allocate(alternatives(num_slots), stat=stat)
  if (stat /= 0) return

  ! Check if pattern is a simple alternation of literals
  call parse_simple_alternation(pattern, is_ere, alternatives, num_alternatives, is_simple)
  if (.not. is_simple .or. num_alternatives < 2) return

  ! Build Aho-Corasick automaton; any failure keeps the fast path disabled
  call ac_build(opt%ac, alternatives, num_alternatives, ignore_case, ierr)
  opt%use_aho_corasick = (ierr == 0)

  ! alternatives is a local allocatable and is released automatically on return

end subroutine try_build_aho_corasick
1151
+
1152
subroutine parse_simple_alternation(pattern, is_ere, alternatives, num_alt, is_simple)
  !> Decide whether pattern is a plain alternation of literal strings and,
  !> if so, return the literals with escapes already resolved.
  !>
  !> pattern      - regular expression text (trailing blanks are ignored)
  !> is_ere       - .true.: "|" separates alternatives and "\|" is a literal bar;
  !>                .false.: "\|" separates alternatives and "|" is a literal bar
  !> alternatives - receives the literals in entries 1..num_alt
  !> num_alt      - number of alternatives collected; only meaningful when
  !>                is_simple is .true.
  !> is_simple    - .true. only when the whole pattern consists of two or more
  !>                non-empty literals that fit into alternatives
  !>
  !> Anything that cannot be represented faithfully as a blank-padded literal
  !> (metacharacters, empty alternatives, alternatives ending in a blank,
  !> a dangling backslash, too many or too long alternatives) yields
  !> is_simple = .false. so the caller falls back to the regex engine.
  character(len=*), intent(in) :: pattern
  logical, intent(in) :: is_ere
  character(len=*), intent(out) :: alternatives(:)
  integer, intent(out) :: num_alt
  logical, intent(out) :: is_simple

  ! achar(92) is the backslash; spelled this way so the source does not
  ! depend on how the compiler treats backslashes inside literals
  character(len=1), parameter :: backslash = achar(92)
  character(len=*), parameter :: ere_meta = '.*+?[]^$(){}'
  character(len=*), parameter :: bre_meta = '.*[]^$'
  ! Characters that form an operator when preceded by a backslash
  character(len=*), parameter :: escaped_ops = "(){}<>`'"
  character(len=*), parameter :: alnum = &
    'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789'

  character(len=len(pattern)) :: literal   ! Current alternative, escapes removed
  character(len=1) :: c
  integer :: i, pat_len, lit_len
  logical :: escaped, ok

  is_simple = .false.
  num_alt = 0
  pat_len = len_trim(pattern)
  if (pat_len == 0) return

  lit_len = 0
  escaped = .false.

  do i = 1, pat_len
    c = pattern(i:i)

    if (escaped) then
      escaped = .false.
      if (c == '|' .and. .not. is_ere) then
        ! BRE alternation (GNU extension)
        call close_alternative(ok)
        if (.not. ok) return
      else if (index(escaped_ops, c) > 0 .or. index(alnum, c) > 0) then
        ! Groups, intervals, word/buffer anchors, and escaped letters or
        ! digits (which may denote classes or back-references): not literal
        return
      else if (.not. is_ere .and. (c == '+' .or. c == '?')) then
        ! In basic syntax "\+" and "\?" are repetition operators (GNU extension)
        return
      else
        ! Escaped literal (e.g. "\.", "\*"): store the character itself,
        ! without the backslash
        call append_char(c)
      end if
    else if (c == backslash) then
      escaped = .true.
    else if (is_ere .and. c == '|') then
      call close_alternative(ok)
      if (.not. ok) return
    else if (is_ere .and. index(ere_meta, c) > 0) then
      return
    else if (.not. is_ere .and. index(bre_meta, c) > 0) then
      return
    else
      call append_char(c)
    end if
  end do

  ! A backslash with nothing after it is not a literal pattern
  if (escaped) return

  call close_alternative(ok)
  if (.not. ok) return

  ! Need at least 2 alternatives for Aho-Corasick to be useful
  is_simple = (num_alt >= 2)

contains

  subroutine append_char(ch)
    !> Append one literal character to the alternative being collected.
    character(len=1), intent(in) :: ch
    lit_len = lit_len + 1
    literal(lit_len:lit_len) = ch
  end subroutine append_char

  subroutine close_alternative(stored)
    !> Store the collected literal as the next alternative and start a new one.
    !> stored is .false. when the literal cannot be represented.
    logical, intent(out) :: stored

    stored = .false.
    ! Empty alternative: matches the empty string, which a literal scanner
    ! cannot express
    if (lit_len == 0) return
    ! Blank-padded storage cannot distinguish a trailing blank from padding
    if (literal(lit_len:lit_len) == ' ') return
    if (num_alt >= size(alternatives)) return
    if (lit_len > len(alternatives)) return

    num_alt = num_alt + 1
    alternatives(num_alt) = literal(1:lit_len)
    lit_len = 0
    stored = .true.
  end subroutine close_alternative

end subroutine parse_simple_alternation
1287
+
1288
function ac_optimized_search(ac, text) result(res)
  !> Adapter between the Aho-Corasick scanner and the regex result type:
  !> runs ac_search on text and copies the outcome into a match_result_t.
  !> On a miss only res%matched is set (to .false.).
  type(ac_automaton_t), intent(in) :: ac
  character(len=*), intent(in) :: text
  type(match_result_t) :: res

  type(ac_match_t) :: hit

  hit = ac_search(ac, text)

  res%matched = hit%matched
  if (.not. hit%matched) return

  ! Positions are passed through unchanged
  res%match_start = hit%start_pos
  res%match_end = hit%end_pos

end function ac_optimized_search
1306
+
1100
 end module regex_optimizer
1307
 end module regex_optimizer