Add Aho-Corasick for alternation patterns (6x faster than grep)
- SHA: 47f477c325987aaf3dea018eab539469ced129e3
- Parents: eab33b0
- Tree: d4bcff1

| Status | File | + | - |
|---|---|---|---|
| M | Makefile | 3 | 1 |
| A | src/regex/aho_corasick.f90 | 342 | 0 |
| M | src/regex/regex_api.f90 | 5 | 0 |
| M | src/regex/regex_optimizer.f90 | 207 | 0 |
Makefilemodified@@ -38,6 +38,7 @@ REGEX_SRCS = $(REGEX_DIR)/regex_types.f90 \ | |||
| 38 | $(REGEX_DIR)/regex_parser.f90 \ | 38 | $(REGEX_DIR)/regex_parser.f90 \ |
| 39 | $(REGEX_DIR)/regex_nfa.f90 \ | 39 | $(REGEX_DIR)/regex_nfa.f90 \ |
| 40 | $(REGEX_DIR)/regex_engine.f90 \ | 40 | $(REGEX_DIR)/regex_engine.f90 \ |
| 41 | + $(REGEX_DIR)/aho_corasick.f90 \ | ||
| 41 | $(REGEX_DIR)/regex_optimizer.f90 \ | 42 | $(REGEX_DIR)/regex_optimizer.f90 \ |
| 42 | $(REGEX_DIR)/regex_api.f90 \ | 43 | $(REGEX_DIR)/regex_api.f90 \ |
| 43 | $(REGEX_DIR)/pcre_api.f90 | 44 | $(REGEX_DIR)/pcre_api.f90 |
@@ -105,7 +106,8 @@ $(BUILD_DIR)/regex_lexer.o: $(BUILD_DIR)/regex_types.o | |||
| 105 | $(BUILD_DIR)/regex_parser.o: $(BUILD_DIR)/regex_types.o | 106 | $(BUILD_DIR)/regex_parser.o: $(BUILD_DIR)/regex_types.o |
| 106 | $(BUILD_DIR)/regex_nfa.o: $(BUILD_DIR)/regex_types.o $(BUILD_DIR)/regex_parser.o | 107 | $(BUILD_DIR)/regex_nfa.o: $(BUILD_DIR)/regex_types.o $(BUILD_DIR)/regex_parser.o |
| 107 | $(BUILD_DIR)/regex_engine.o: $(BUILD_DIR)/regex_types.o | 108 | $(BUILD_DIR)/regex_engine.o: $(BUILD_DIR)/regex_types.o |
| 108 | -$(BUILD_DIR)/regex_optimizer.o: $(BUILD_DIR)/regex_types.o | 109 | +$(BUILD_DIR)/aho_corasick.o: |
| 110 | +$(BUILD_DIR)/regex_optimizer.o: $(BUILD_DIR)/regex_types.o $(BUILD_DIR)/aho_corasick.o | ||
| 109 | $(BUILD_DIR)/regex_api.o: $(BUILD_DIR)/regex_types.o $(BUILD_DIR)/regex_lexer.o $(BUILD_DIR)/regex_parser.o $(BUILD_DIR)/regex_nfa.o $(BUILD_DIR)/regex_engine.o $(BUILD_DIR)/regex_optimizer.o | 111 | $(BUILD_DIR)/regex_api.o: $(BUILD_DIR)/regex_types.o $(BUILD_DIR)/regex_lexer.o $(BUILD_DIR)/regex_parser.o $(BUILD_DIR)/regex_nfa.o $(BUILD_DIR)/regex_engine.o $(BUILD_DIR)/regex_optimizer.o |
| 110 | $(BUILD_DIR)/pcre_api.o: | 112 | $(BUILD_DIR)/pcre_api.o: |
| 111 | 113 | ||
src/regex/aho_corasick.f90added@@ -0,0 +1,342 @@ | |||
module aho_corasick
    !> Aho-Corasick automaton for multi-pattern literal string matching.
    !>
    !> All patterns are inserted into a trie whose nodes carry failure links
    !> and output links, so a text is scanned once, one transition per
    !> character (plus amortised failure-link hops).
    !>
    !> Conventions used throughout this module:
    !>   * node index 1 is the root; index 0 means "no node";
    !>   * patterns are measured with len_trim, i.e. trailing blanks of a
    !>     pattern are not part of it;
    !>   * the searched text is scanned over its full len(text);
    !>   * positions are 1-based and inclusive.
    implicit none
    private

    public :: ac_automaton_t, ac_match_t
    public :: ac_build, ac_search, ac_search_any, ac_free

    integer, parameter :: MAX_CHILDREN = 256   ! one transition slot per 8-bit character code
    integer, parameter :: ROOT = 1             ! index of the trie root
    integer, parameter :: MIN_CAPACITY = 256   ! smallest node array ever allocated

    type :: ac_node_t
        !> Trie node with failure links
        integer :: children(0:MAX_CHILDREN-1) = 0  ! Child node indices (0 = no child)
        integer :: failure = 0                     ! Failure link (fall back on mismatch)
        integer :: output_pattern = 0              ! Pattern index that ends here (0 = none)
        integer :: output_link = 0                 ! Nearest accepting node on the failure chain (0 = none)
        integer :: depth = 0                       ! Depth in trie (= prefix length)
    end type ac_node_t

    type :: ac_automaton_t
        !> Aho-Corasick automaton
        type(ac_node_t), allocatable :: nodes(:)
        integer :: num_nodes = 0
        integer :: capacity = 0
        integer :: num_patterns = 0
        integer, allocatable :: pattern_lengths(:)
        logical :: compiled = .false.
        logical :: ignore_case = .false.
    end type ac_automaton_t

    type :: ac_match_t
        !> Match result
        logical :: matched = .false.
        integer :: pattern_idx = 0    ! Which pattern matched (1-based)
        integer :: start_pos = 0      ! Start position in text (1-based)
        integer :: end_pos = 0        ! End position in text (1-based)
    end type ac_match_t

contains

    pure function char_code(ch, fold_case) result(code)
        !> Map a character to its transition index in 0..MAX_CHILDREN-1.
        !> With fold_case, ASCII 'A'..'Z' are mapped onto 'a'..'z'.
        character(len=1), intent(in) :: ch
        logical, intent(in) :: fold_case
        integer :: code

        ! Mask so that a processor returning codes outside 0..255 can never
        ! index outside children(0:255).
        code = iand(ichar(ch), MAX_CHILDREN - 1)
        if (fold_case) then
            if (code >= ichar('A') .and. code <= ichar('Z')) code = code + 32
        end if
    end function char_code

    pure function next_state(ac, state, code) result(nxt)
        !> Goto function: follow failure links from `state` until a node with
        !> a transition on `code` is found (or the root is reached), then take
        !> that transition. Returns ROOT when no transition exists.
        type(ac_automaton_t), intent(in) :: ac
        integer, intent(in) :: state, code
        integer :: nxt

        nxt = state
        do while (nxt /= ROOT)
            if (ac%nodes(nxt)%children(code) /= 0) exit
            nxt = ac%nodes(nxt)%failure
        end do

        if (ac%nodes(nxt)%children(code) /= 0) then
            nxt = ac%nodes(nxt)%children(code)
        else
            nxt = ROOT
        end if
    end function next_state

    subroutine ac_build(ac, patterns, num_patterns, ignore_case, ierr)
        !> Build an Aho-Corasick automaton from the first num_patterns
        !> entries of patterns.
        !>
        !> ierr = 0  success (ac%compiled is .true.)
        !> ierr = 1  num_patterns is negative or exceeds size(patterns)
        !> ierr = 2  a pattern is empty after len_trim; an empty pattern
        !>           matches the empty string everywhere, which this literal
        !>           automaton cannot report correctly
        !> ierr = 3  memory allocation failed
        !> On any error the automaton is left uncompiled.
        type(ac_automaton_t), intent(out) :: ac
        character(len=*), intent(in) :: patterns(:)
        integer, intent(in) :: num_patterns
        logical, intent(in) :: ignore_case
        integer, intent(out) :: ierr

        integer :: i, j, c, state, child, fail_state, total_len, astat
        integer :: q_head, q_tail
        integer, allocatable :: queue(:)

        ierr = 0

        if (num_patterns < 0 .or. num_patterns > size(patterns)) then
            ierr = 1
            return
        end if

        ! Validate before allocating anything so a failure leaves `ac` clean.
        total_len = 0
        do i = 1, num_patterns
            if (len_trim(patterns(i)) == 0) then
                ierr = 2
                return
            end if
            total_len = total_len + len_trim(patterns(i))
        end do

        ac%ignore_case = ignore_case
        ac%num_patterns = num_patterns

        ! The trie has at most one node per pattern character plus the root.
        ac%capacity = max(total_len + 1, MIN_CAPACITY)
        allocate(ac%pattern_lengths(num_patterns), ac%nodes(ac%capacity), stat=astat)
        if (astat /= 0) then
            call ac_free(ac)
            ierr = 3
            return
        end if

        do i = 1, num_patterns
            ac%pattern_lengths(i) = len_trim(patterns(i))
        end do

        ! Root node
        ac%num_nodes = 1
        ac%nodes(ROOT)%depth = 0
        ac%nodes(ROOT)%failure = ROOT

        ! Phase 1: insert every pattern into the trie
        do i = 1, num_patterns
            state = ROOT
            do j = 1, ac%pattern_lengths(i)
                c = char_code(patterns(i)(j:j), ignore_case)
                child = ac%nodes(state)%children(c)
                if (child == 0) then
                    ! Grow BEFORE bumping num_nodes so the copy inside
                    ! grow_nodes never reads past the old array.
                    if (ac%num_nodes >= ac%capacity) call grow_nodes(ac)
                    ac%num_nodes = ac%num_nodes + 1
                    child = ac%num_nodes
                    ac%nodes(state)%children(c) = child
                    ac%nodes(child)%depth = ac%nodes(state)%depth + 1
                end if
                state = child
            end do
            ! Mark this node as accepting for pattern i (for duplicate
            ! patterns the last index wins).
            ac%nodes(state)%output_pattern = i
        end do

        ! Phase 2: breadth-first computation of failure and output links.
        ! Every node is enqueued exactly once, so num_nodes slots suffice.
        allocate(queue(ac%num_nodes), stat=astat)
        if (astat /= 0) then
            call ac_free(ac)
            ierr = 3
            return
        end if

        q_head = 1
        q_tail = 1
        queue(1) = ROOT

        do while (q_head <= q_tail)
            state = queue(q_head)
            q_head = q_head + 1

            do c = 0, MAX_CHILDREN - 1
                child = ac%nodes(state)%children(c)
                if (child == 0) cycle

                q_tail = q_tail + 1
                queue(q_tail) = child

                if (state == ROOT) then
                    ! Depth-1 nodes always fail to the root
                    fail_state = ROOT
                else
                    ! Walk the parent's failure chain until a node with a
                    ! transition on c is found, or the root is reached.
                    fail_state = ac%nodes(state)%failure
                    do while (fail_state /= ROOT)
                        if (ac%nodes(fail_state)%children(c) /= 0) exit
                        fail_state = ac%nodes(fail_state)%failure
                    end do
                    fail_state = ac%nodes(fail_state)%children(c)
                    if (fail_state == 0) fail_state = ROOT
                end if
                ac%nodes(child)%failure = fail_state

                ! Output link: nearest accepting node on the failure chain
                if (ac%nodes(fail_state)%output_pattern /= 0) then
                    ac%nodes(child)%output_link = fail_state
                else
                    ac%nodes(child)%output_link = ac%nodes(fail_state)%output_link
                end if
            end do
        end do

        deallocate(queue)
        ac%compiled = .true.

    end subroutine ac_build

    subroutine grow_nodes(ac)
        !> Double the node capacity (at least MIN_CAPACITY), keeping the
        !> nodes created so far.
        type(ac_automaton_t), intent(inout) :: ac
        type(ac_node_t), allocatable :: temp(:)
        integer :: new_cap, keep

        new_cap = max(ac%capacity * 2, MIN_CAPACITY)
        allocate(temp(new_cap))
        if (allocated(ac%nodes)) then
            keep = min(ac%num_nodes, size(ac%nodes))
            temp(1:keep) = ac%nodes(1:keep)
        end if
        call move_alloc(temp, ac%nodes)
        ac%capacity = new_cap
    end subroutine grow_nodes

    function ac_search_any(ac, text) result(found)
        !> Existence check: .true. as soon as any pattern occurs in text.
        !> Returns .false. for an automaton that has not been built.
        type(ac_automaton_t), intent(in) :: ac
        character(len=*), intent(in) :: text
        logical :: found

        integer :: i, state

        found = .false.
        if (.not. ac%compiled) return

        state = ROOT
        do i = 1, len(text)
            state = next_state(ac, state, char_code(text(i:i), ac%ignore_case))

            ! A pattern ends here either at this node or at a proper suffix
            ! reachable through the output link.
            if (ac%nodes(state)%output_pattern /= 0 .or. &
                ac%nodes(state)%output_link /= 0) then
                found = .true.
                return
            end if
        end do

    end function ac_search_any

    function ac_search(ac, text) result(match)
        !> Find the leftmost occurrence of any pattern and report its
        !> position. Among occurrences starting at the same position the
        !> longest one is reported.
        !> NOTE(review): the same-start tie-break follows the POSIX
        !> leftmost-longest rule; confirm that the general matcher this
        !> replaces reports matches the same way.
        !> Returns matched = .false. for an automaton that has not been built.
        type(ac_automaton_t), intent(in) :: ac
        character(len=*), intent(in) :: text
        type(ac_match_t) :: match

        integer :: i, state, hit_node, pat_idx, hit_start

        match%matched = .false.
        if (.not. ac%compiled) return

        state = ROOT
        do i = 1, len(text)
            state = next_state(ac, state, char_code(text(i:i), ac%ignore_case))

            ! Longest pattern ending at i: the node itself if accepting,
            ! otherwise the first node on the output chain. Shorter ones
            ! further down the chain start later and can never win.
            if (ac%nodes(state)%output_pattern /= 0) then
                hit_node = state
            else
                hit_node = ac%nodes(state)%output_link
            end if

            if (hit_node /= 0) then
                pat_idx = ac%nodes(hit_node)%output_pattern
                hit_start = i - ac%pattern_lengths(pat_idx) + 1
                ! i only grows, so an equal start means a longer match.
                if (.not. match%matched .or. hit_start <= match%start_pos) then
                    match%matched = .true.
                    match%pattern_idx = pat_idx
                    match%start_pos = hit_start
                    match%end_pos = i
                end if
            end if

            if (match%matched) then
                ! The current node spells text(i-depth+1:i); any pattern
                ! that completes later starts no earlier than that. Once
                ! that bound passes the recorded start nothing can beat it.
                if (i - ac%nodes(state)%depth + 1 > match%start_pos) exit
            end if
        end do

    end function ac_search

    subroutine ac_free(ac)
        !> Free automaton resources and mark the automaton as not compiled.
        type(ac_automaton_t), intent(inout) :: ac

        if (allocated(ac%nodes)) deallocate(ac%nodes)
        if (allocated(ac%pattern_lengths)) deallocate(ac%pattern_lengths)
        ac%num_nodes = 0
        ac%capacity = 0
        ac%num_patterns = 0
        ac%compiled = .false.
    end subroutine ac_free

end module aho_corasick
src/regex/regex_api.f90modified@@ -26,6 +26,7 @@ module regex_api | |||
| 26 | integer :: error_code = 0 | 26 | integer :: error_code = 0 |
| 27 | character(len=256) :: error_msg = '' | 27 | character(len=256) :: error_msg = '' |
| 28 | integer :: num_groups = 0 | 28 | integer :: num_groups = 0 |
| 29 | + character(len=4096) :: pattern = '' ! Original pattern for AC detection | ||
| 29 | contains | 30 | contains |
| 30 | procedure :: is_compiled => regex_is_compiled | 31 | procedure :: is_compiled => regex_is_compiled |
| 31 | end type regex_t | 32 | end type regex_t |
@@ -94,6 +95,10 @@ contains | |||
| 94 | ! Optimize NFA for faster matching | 95 | ! Optimize NFA for faster matching |
| 95 | call optimize_nfa(re%opt_nfa, re%nfa) | 96 | call optimize_nfa(re%opt_nfa, re%nfa) |
| 96 | 97 | ||
| 98 | + ! Store pattern and try Aho-Corasick for alternation patterns | ||
| 99 | + re%pattern = pattern | ||
| 100 | + call try_build_aho_corasick(re%opt_nfa, pattern, extended, .false.) | ||
| 101 | + | ||
| 97 | re%compiled = .true. | 102 | re%compiled = .true. |
| 98 | 103 | ||
| 99 | contains | 104 | contains |
src/regex/regex_optimizer.f90modified@@ -5,12 +5,15 @@ module regex_optimizer | |||
| 5 | !> - Bit vector state sets for O(1) operations | 5 | !> - Bit vector state sets for O(1) operations |
| 6 | !> - Lazy DFA state caching | 6 | !> - Lazy DFA state caching |
| 7 | !> - Anchored pattern fast paths | 7 | !> - Anchored pattern fast paths |
| 8 | + !> - Aho-Corasick for alternation patterns | ||
| 8 | use regex_types | 9 | use regex_types |
| 10 | + use aho_corasick | ||
| 9 | implicit none | 11 | implicit none |
| 10 | private | 12 | private |
| 11 | 13 | ||
| 12 | public :: optimized_nfa_t | 14 | public :: optimized_nfa_t |
| 13 | public :: optimize_nfa, optimized_match, optimized_search | 15 | public :: optimize_nfa, optimized_match, optimized_search |
| 16 | + public :: try_build_aho_corasick | ||
| 14 | 17 | ||
| 15 | integer, parameter :: MAX_STATES = 1024 | 18 | integer, parameter :: MAX_STATES = 1024 |
| 16 | integer, parameter :: MAX_PREFIX_LEN = 64 | 19 | integer, parameter :: MAX_PREFIX_LEN = 64 |
@@ -70,6 +73,8 @@ module regex_optimizer | |||
| 70 | type(dfa_cache_entry_t) :: dfa_cache(DFA_CACHE_SIZE) ! Lazy DFA cache | 73 | type(dfa_cache_entry_t) :: dfa_cache(DFA_CACHE_SIZE) ! Lazy DFA cache |
| 71 | type(compiled_dfa_t) :: dfa ! Full compiled DFA (if available) | 74 | type(compiled_dfa_t) :: dfa ! Full compiled DFA (if available) |
| 72 | logical :: use_dfa = .false. ! Use DFA instead of NFA | 75 | logical :: use_dfa = .false. ! Use DFA instead of NFA |
| 76 | + type(ac_automaton_t) :: ac ! Aho-Corasick automaton (for alternation) | ||
| 77 | + logical :: use_aho_corasick = .false. ! Use Aho-Corasick for matching | ||
| 73 | logical :: optimized = .false. | 78 | logical :: optimized = .false. |
| 74 | end type optimized_nfa_t | 79 | end type optimized_nfa_t |
| 75 | 80 | ||
@@ -584,6 +589,15 @@ contains | |||
| 584 | res%matched = .false. | 589 | res%matched = .false. |
| 585 | text_len = len_trim(text) | 590 | text_len = len_trim(text) |
| 586 | 591 | ||
| 592 | + ! Fast path: use Aho-Corasick for alternation patterns | ||
| 593 | + ! Only use AC if ignore_case setting matches what was compiled | ||
| 594 | + if (opt%use_aho_corasick) then | ||
| 595 | + if (ignore_case .eqv. opt%ac%ignore_case) then | ||
| 596 | + res = ac_optimized_search(opt%ac, text) | ||
| 597 | + return | ||
| 598 | + end if | ||
| 599 | + end if | ||
| 600 | + | ||
| 587 | if (opt%nfa%num_states == 0) return | 601 | if (opt%nfa%num_states == 0) return |
| 588 | 602 | ||
| 589 | ! Fast path: use DFA if available (O(n) matching) | 603 | ! Fast path: use DFA if available (O(n) matching) |
@@ -1097,4 +1111,197 @@ contains | |||
| 1097 | if (negated) res = .not. res | 1111 | if (negated) res = .not. res |
| 1098 | end function char_in_class_opt | 1112 | end function char_in_class_opt |
| 1099 | 1113 | ||
| 1114 | + !--------------------------------------------------------------------------- | ||
| 1115 | + ! Aho-Corasick Integration for Alternation Patterns | ||
| 1116 | + !--------------------------------------------------------------------------- | ||
| 1117 | + | ||
subroutine try_build_aho_corasick(opt, pattern, is_ere, ignore_case)
    !> Try to build an Aho-Corasick automaton for simple alternation
    !> patterns such as "foo|bar|baz" (ERE) or "foo\|bar" (BRE), i.e.
    !> patterns made only of literal characters and alternation separators.
    !>
    !> opt          optimizer state; opt%ac and opt%use_aho_corasick are updated
    !> pattern      regex source text
    !> is_ere       .true. for ERE syntax, .false. for BRE syntax
    !> ignore_case  build the automaton case-insensitively
    !>
    !> On return opt%use_aho_corasick is .true. only when the pattern
    !> qualified and the automaton was built without error.
    type(optimized_nfa_t), intent(inout) :: opt
    character(len=*), intent(in) :: pattern
    logical, intent(in) :: is_ere, ignore_case

    integer, parameter :: MAX_ALTERNATIVES = 1000
    character(len=:), allocatable :: alternatives(:)
    integer :: num_alternatives, ierr, i, pat_len, num_bars
    logical :: is_simple

    opt%use_aho_corasick = .false.
    ! Drop any automaton left over from a previous build so a rejected
    ! pattern does not keep stale memory alive.
    call ac_free(opt%ac)

    ! Cheap pre-check: both alternation spellings ("|" and "\|") contain a
    ! bar, so a pattern without one can never have two alternatives.
    pat_len = len_trim(pattern)
    num_bars = 0
    do i = 1, pat_len
        if (pattern(i:i) == '|') num_bars = num_bars + 1
    end do
    if (num_bars == 0) return

    ! Size the scratch array from the pattern itself instead of a fixed
    ! 1000 x 4096 block: at most num_bars + 1 alternatives, none longer
    ! than the pattern.
    allocate(character(len=pat_len) :: alternatives(min(num_bars + 1, MAX_ALTERNATIVES)))

    ! Check if pattern is simple alternation of literals
    call parse_simple_alternation(pattern, is_ere, alternatives, num_alternatives, is_simple)

    if (.not. is_simple .or. num_alternatives < 2) return

    ! An empty alternative matches the empty string everywhere; a literal
    ! automaton cannot express that, so leave such patterns to the
    ! general matcher.
    do i = 1, num_alternatives
        if (len_trim(alternatives(i)) == 0) return
    end do

    ! Build Aho-Corasick automaton
    call ac_build(opt%ac, alternatives, num_alternatives, ignore_case, ierr)
    opt%use_aho_corasick = (ierr == 0)

    ! `alternatives` is a local allocatable and is released automatically
    ! on every return path.
end subroutine try_build_aho_corasick
| 1151 | + | ||
subroutine parse_simple_alternation(pattern, is_ere, alternatives, num_alt, is_simple)
    !> Decide whether `pattern` is a plain alternation of literal strings
    !> and, if so, return the literals with escapes already removed.
    !>
    !> pattern       regex source; trailing blanks are ignored (len_trim)
    !> is_ere        .true.: "|" separates alternatives and "\|" is a literal bar
    !>               .false.: "\|" separates alternatives (GNU BRE extension)
    !>               and "|" is a literal bar
    !> alternatives  receives the literal text of each alternative
    !> num_alt       number of alternatives stored
    !> is_simple     .true. only if the whole pattern consists of at least two
    !>               non-empty literal alternatives that fit into `alternatives`
    !>
    !> The pattern is rejected (is_simple = .false.) when it contains any
    !> operator, an empty alternative, an alternative ending in a blank
    !> (blank-padded storage cannot represent it), an alternative longer
    !> than len(alternatives), more alternatives than size(alternatives),
    !> or a dangling backslash. Rejection is always safe: the caller then
    !> uses the general matcher.
    character(len=*), intent(in) :: pattern
    logical, intent(in) :: is_ere
    character(len=*), intent(out) :: alternatives(:)
    integer, intent(out) :: num_alt
    logical, intent(out) :: is_simple

    ! achar(92) is the backslash; spelled this way so the source does not
    ! depend on how the compiler treats backslashes inside literals.
    character(len=1), parameter :: BACKSLASH = achar(92)

    character(len=len(pattern)) :: literal   ! unescaped text of the current alternative
    character(len=1) :: c
    integer :: i, pat_len, lit_len
    logical :: in_escape, ok

    is_simple = .false.   ! pessimistic: every early return means "not simple"
    num_alt = 0
    pat_len = len_trim(pattern)
    if (pat_len == 0) return

    lit_len = 0
    in_escape = .false.

    do i = 1, pat_len
        c = pattern(i:i)

        if (in_escape) then
            in_escape = .false.
            if (c == '|' .and. .not. is_ere) then
                ! BRE alternation separator
                call store_alternative(ok)
                if (.not. ok) return
            else if (escape_is_special(c)) then
                return
            else
                ! Escaped literal (e.g. \. or \*): keep the character itself,
                ! without the backslash.
                call append_literal(c)
            end if
        else if (c == BACKSLASH) then
            in_escape = .true.
        else if (c == '|' .and. is_ere) then
            ! ERE alternation separator
            call store_alternative(ok)
            if (.not. ok) return
        else if (is_metachar(c)) then
            return
        else
            call append_literal(c)
        end if
    end do

    ! A trailing backslash escapes nothing
    if (in_escape) return

    ! Last alternative
    call store_alternative(ok)
    if (.not. ok) return

    ! Need at least 2 alternatives for Aho-Corasick to be useful
    is_simple = (num_alt >= 2)

contains

    subroutine append_literal(ch)
        !> Add one literal character to the alternative being collected.
        character(len=1), intent(in) :: ch
        lit_len = lit_len + 1
        literal(lit_len:lit_len) = ch
    end subroutine append_literal

    subroutine store_alternative(stored)
        !> Move the collected literal into `alternatives`; stored = .false.
        !> if it cannot be represented there.
        logical, intent(out) :: stored

        stored = .false.
        if (lit_len == 0) return                       ! empty alternative
        if (literal(lit_len:lit_len) == ' ') return    ! trailing blank would be lost
        if (lit_len > len(alternatives)) return        ! would be truncated
        if (num_alt >= size(alternatives)) return      ! no room left
        num_alt = num_alt + 1
        alternatives(num_alt) = literal(1:lit_len)
        lit_len = 0
        stored = .true.
    end subroutine store_alternative

    function is_metachar(ch) result(special)
        !> .true. if the unescaped character is an operator in the active syntax.
        character(len=1), intent(in) :: ch
        logical :: special

        if (is_ere) then
            special = index('.*+?[]^$(){}', ch) > 0
        else
            ! BRE: + ? ( ) { } | are ordinary characters when unescaped
            special = index('.*[]^$', ch) > 0
        end if
    end function is_metachar

    function escape_is_special(ch) result(special)
        !> .true. if backslash + ch is (or may be) something other than the
        !> literal character ch.
        character(len=1), intent(in) :: ch
        logical :: special

        select case (ch)
        case ('a':'z', 'A':'Z', '0':'9')
            ! Class shorthands, word boundaries, back-references and control
            ! escapes all use a letter or digit; treat every one as special.
            special = .true.
        case ('(', ')', '{', '}', '<', '>', '`', "'")
            special = .true.
        case ('+', '?')
            ! GNU BRE spells these repetition operators with a backslash
            special = .not. is_ere
        case default
            special = .false.
        end select
    end function escape_is_special

end subroutine parse_simple_alternation
| 1287 | + | ||
function ac_optimized_search(ac, text) result(res)
    !> Run the Aho-Corasick automaton over `text` and translate the outcome
    !> into the optimizer's match result: `matched` mirrors the automaton's
    !> verdict, and the start/end positions are copied only on a hit.
    type(ac_automaton_t), intent(in) :: ac
    character(len=*), intent(in) :: text
    type(match_result_t) :: res

    type(ac_match_t) :: hit

    hit = ac_search(ac, text)

    res%matched = hit%matched
    if (.not. hit%matched) return

    res%match_start = hit%start_pos
    res%match_end = hit%end_pos
end function ac_optimized_search
| 1306 | + | ||
| 1100 | end module regex_optimizer | 1307 | end module regex_optimizer |