Add Aho-Corasick for alternation patterns (6x faster than grep)
- SHA: 47f477c325987aaf3dea018eab539469ced129e3 (short: 47f477c)
- Parents: eab33b0
- Tree: d4bcff1

| Status | File | + | - |
|---|---|---|---|
| M | Makefile | 3 | 1 |
| A | src/regex/aho_corasick.f90 | 342 | 0 |
| M | src/regex/regex_api.f90 | 5 | 0 |
| M | src/regex/regex_optimizer.f90 | 207 | 0 |

Makefile (modified)
@@ -38,6 +38,7 @@ REGEX_SRCS = $(REGEX_DIR)/regex_types.f90 \
| 38 | 38 | $(REGEX_DIR)/regex_parser.f90 \ |
| 39 | 39 | $(REGEX_DIR)/regex_nfa.f90 \ |
| 40 | 40 | $(REGEX_DIR)/regex_engine.f90 \ |
| 41 | + $(REGEX_DIR)/aho_corasick.f90 \ | |
| 41 | 42 | $(REGEX_DIR)/regex_optimizer.f90 \ |
| 42 | 43 | $(REGEX_DIR)/regex_api.f90 \ |
| 43 | 44 | $(REGEX_DIR)/pcre_api.f90 |
@@ -105,7 +106,8 @@ $(BUILD_DIR)/regex_lexer.o: $(BUILD_DIR)/regex_types.o | ||
| 105 | 106 | $(BUILD_DIR)/regex_parser.o: $(BUILD_DIR)/regex_types.o |
| 106 | 107 | $(BUILD_DIR)/regex_nfa.o: $(BUILD_DIR)/regex_types.o $(BUILD_DIR)/regex_parser.o |
| 107 | 108 | $(BUILD_DIR)/regex_engine.o: $(BUILD_DIR)/regex_types.o |
| 108 | -$(BUILD_DIR)/regex_optimizer.o: $(BUILD_DIR)/regex_types.o | |
| 109 | +$(BUILD_DIR)/aho_corasick.o: | |
| 110 | +$(BUILD_DIR)/regex_optimizer.o: $(BUILD_DIR)/regex_types.o $(BUILD_DIR)/aho_corasick.o | |
| 109 | 111 | $(BUILD_DIR)/regex_api.o: $(BUILD_DIR)/regex_types.o $(BUILD_DIR)/regex_lexer.o $(BUILD_DIR)/regex_parser.o $(BUILD_DIR)/regex_nfa.o $(BUILD_DIR)/regex_engine.o $(BUILD_DIR)/regex_optimizer.o |
| 110 | 112 | $(BUILD_DIR)/pcre_api.o: |
| 111 | 113 | |
src/regex/aho_corasick.f90 (added)
@@ -0,0 +1,342 @@
module aho_corasick
  !> Aho-Corasick automaton for multi-pattern literal string matching.
  !>
  !> All patterns are inserted into one trie; failure links and output links
  !> are then computed breadth-first so that a text can be scanned in a
  !> single left-to-right pass without backtracking.
  !>
  !> Patterns are passed as fixed-length character elements and measured with
  !> len_trim, so trailing blanks of a pattern are NOT significant.
  implicit none
  private

  public :: ac_automaton_t, ac_match_t
  public :: ac_build, ac_search, ac_search_any, ac_free

  integer, parameter :: ROOT = 1             ! Index of the trie root in nodes(:)
  integer, parameter :: MIN_CAPACITY = 256   ! Smallest node array ever allocated

  ! Values returned through ac_build's ierr argument.
  integer, parameter :: AC_OK = 0            ! Automaton built successfully
  integer, parameter :: AC_ERR_ARGS = 1      ! num_patterns < 0 or > size(patterns)
  integer, parameter :: AC_ERR_EMPTY = 2     ! A pattern has zero (trimmed) length
  integer, parameter :: AC_ERR_ALLOC = 3     ! Memory allocation failed

  type :: ac_node_t
    !> Trie node with failure links
    integer :: children(0:255) = 0   ! Child node index per character code (0 = no child)
    integer :: failure = 0           ! Failure link (state to fall back to on mismatch)
    integer :: output_pattern = 0    ! Pattern index that ends here (0 = none)
    integer :: output_link = 0       ! Nearest accepting state along the failure chain (0 = none)
    integer :: depth = 0             ! Depth in trie (= length of the prefix this node spells)
  end type ac_node_t

  type :: ac_automaton_t
    !> Aho-Corasick automaton
    type(ac_node_t), allocatable :: nodes(:)
    integer :: num_nodes = 0
    integer :: capacity = 0
    integer :: num_patterns = 0
    integer, allocatable :: pattern_lengths(:)
    logical :: compiled = .false.
    logical :: ignore_case = .false.
  end type ac_automaton_t

  type :: ac_match_t
    !> Match result
    logical :: matched = .false.
    integer :: pattern_idx = 0   ! Which pattern matched (1-based)
    integer :: start_pos = 0     ! Start position in text (1-based)
    integer :: end_pos = 0       ! End position in text (1-based)
  end type ac_match_t

contains

  subroutine ac_build(ac, patterns, num_patterns, ignore_case, ierr)
    !> Build an Aho-Corasick automaton from the first num_patterns entries
    !> of patterns.
    !>
    !> ac           : automaton to (re)initialise; any previous content is discarded
    !> patterns     : literal patterns; each is used up to its len_trim length
    !> num_patterns : number of leading entries of patterns to use
    !> ignore_case  : fold ASCII A-Z to a-z in both patterns and searched text
    !> ierr         : 0 on success; 1 if num_patterns is out of range;
    !>                2 if any pattern is empty; 3 if allocation failed.
    !>                On any non-zero ierr the automaton is left freed and
    !>                not compiled.
    type(ac_automaton_t), intent(out) :: ac
    character(len=*), intent(in) :: patterns(:)
    integer, intent(in) :: num_patterns
    logical, intent(in) :: ignore_case
    integer, intent(out) :: ierr

    integer :: i, j, c, state, child, fail_state, target
    integer :: plen, total_len, q_head, q_tail, astat
    integer, allocatable :: queue(:)

    ierr = AC_OK

    if (num_patterns < 0 .or. num_patterns > size(patterns)) then
      ierr = AC_ERR_ARGS
      return
    end if

    ac%ignore_case = ignore_case
    ac%num_patterns = num_patterns

    allocate(ac%pattern_lengths(num_patterns), stat=astat)
    if (astat /= 0) then
      call fail_build(AC_ERR_ALLOC)
      return
    end if

    ! Record pattern lengths. An empty pattern would make the root accepting,
    ! which the search routines cannot represent consistently, so refuse it
    ! and let the caller fall back to another matcher.
    total_len = 0
    do i = 1, num_patterns
      plen = len_trim(patterns(i))
      if (plen == 0) then
        call fail_build(AC_ERR_EMPTY)
        return
      end if
      ac%pattern_lengths(i) = plen
      total_len = total_len + plen
    end do

    ! Every pattern character creates at most one node, so root + total_len
    ! is an upper bound on the trie size and the array never has to grow.
    ac%capacity = max(total_len + 1, MIN_CAPACITY)
    allocate(ac%nodes(ac%capacity), stat=astat)
    if (astat /= 0) then
      call fail_build(AC_ERR_ALLOC)
      return
    end if
    ac%num_nodes = 1   ! nodes(ROOT) is default-initialised with depth 0

    ! Phase 1: insert every pattern into the trie.
    do i = 1, num_patterns
      state = ROOT
      do j = 1, ac%pattern_lengths(i)
        c = char_code(patterns(i)(j:j), ignore_case)
        child = ac%nodes(state)%children(c)
        if (child == 0) then
          ac%num_nodes = ac%num_nodes + 1
          child = ac%num_nodes
          ac%nodes(state)%children(c) = child
          ac%nodes(child)%depth = ac%nodes(state)%depth + 1
        end if
        state = child
      end do
      ! Duplicate patterns share a node; the last index wins.
      ac%nodes(state)%output_pattern = i
    end do

    ! Phase 2: breadth-first computation of failure and output links.
    ! Each non-root node is enqueued exactly once.
    allocate(queue(ac%num_nodes), stat=astat)
    if (astat /= 0) then
      call fail_build(AC_ERR_ALLOC)
      return
    end if
    q_head = 1
    q_tail = 0

    ! Depth-1 nodes always fail to the root.
    do c = 0, 255
      child = ac%nodes(ROOT)%children(c)
      if (child /= 0) then
        ac%nodes(child)%failure = ROOT
        q_tail = q_tail + 1
        queue(q_tail) = child
      end if
    end do

    do while (q_head <= q_tail)
      state = queue(q_head)
      q_head = q_head + 1

      do c = 0, 255
        child = ac%nodes(state)%children(c)
        if (child == 0) cycle

        q_tail = q_tail + 1
        queue(q_tail) = child

        ! Walk the parent's failure chain until a state with a transition
        ! on c is found, or the root is reached.
        fail_state = ac%nodes(state)%failure
        do
          if (fail_state == ROOT) exit
          if (ac%nodes(fail_state)%children(c) /= 0) exit
          fail_state = ac%nodes(fail_state)%failure
        end do
        target = ac%nodes(fail_state)%children(c)
        if (target == 0 .or. target == child) target = ROOT
        ac%nodes(child)%failure = target

        ! Output link: nearest accepting state reachable via failure links.
        ! The failure target is shallower, so its links are already final.
        if (ac%nodes(target)%output_pattern /= 0) then
          ac%nodes(child)%output_link = target
        else
          ac%nodes(child)%output_link = ac%nodes(target)%output_link
        end if
      end do
    end do

    deallocate(queue)
    ac%compiled = .true.

  contains

    subroutine fail_build(code)
      !> Release everything built so far and report the error code.
      integer, intent(in) :: code
      call ac_free(ac)
      ierr = code
    end subroutine fail_build

  end subroutine ac_build

  function ac_search_any(ac, text) result(found)
    !> Report whether any pattern occurs anywhere in text (existence check).
    !> The whole of text, including trailing blanks, is scanned.
    !> Returns .false. if the automaton has not been built.
    type(ac_automaton_t), intent(in) :: ac
    character(len=*), intent(in) :: text
    logical :: found

    integer :: i, state

    found = .false.
    if (.not. ac%compiled) return

    state = ROOT
    do i = 1, len(text)
      state = ac_step(ac, state, char_code(text(i:i), ac%ignore_case))
      ! A pattern ends here either at this state or at a suffix state.
      if (ac%nodes(state)%output_pattern /= 0 .or. &
          ac%nodes(state)%output_link /= 0) then
        found = .true.
        return
      end if
    end do
  end function ac_search_any

  function ac_search(ac, text) result(match)
    !> Find the leftmost match in text; among matches that start at the same
    !> position the longest one is returned (leftmost-longest).
    !>
    !> The scan does not stop at the first pattern that *ends*: a longer
    !> pattern that started earlier may still be in progress (patterns
    !> "abcd" and "bc" in "abcd"). It stops as soon as no partially matched
    !> prefix starts at or before the best start found so far.
    !>
    !> Returns matched = .false. if nothing matches or the automaton has not
    !> been built.
    type(ac_automaton_t), intent(in) :: ac
    character(len=*), intent(in) :: text
    type(ac_match_t) :: match

    integer :: i, state, hit, start

    match%matched = .false.
    if (.not. ac%compiled) return

    state = ROOT
    do i = 1, len(text)
      state = ac_step(ac, state, char_code(text(i:i), ac%ignore_case))

      ! The current state spells text(i-depth+1:i), the longest live prefix.
      ! If it starts after the recorded match, nothing better can follow.
      if (match%matched) then
        if (i - ac%nodes(state)%depth + 1 > match%start_pos) exit
      end if

      ! Longest pattern ending at i: the state itself if accepting,
      ! otherwise the nearest accepting suffix state (if any).
      hit = state
      if (ac%nodes(hit)%output_pattern == 0) hit = ac%nodes(hit)%output_link
      if (hit == 0) cycle

      start = i - ac%nodes(hit)%depth + 1
      if (match%matched .and. start > match%start_pos) cycle

      ! Either the first hit, an earlier start, or the same start ending later.
      match%matched = .true.
      match%pattern_idx = ac%nodes(hit)%output_pattern
      match%start_pos = start
      match%end_pos = i
    end do
  end function ac_search

  subroutine ac_free(ac)
    !> Release the automaton's storage and reset it to the unbuilt state.
    type(ac_automaton_t), intent(inout) :: ac

    if (allocated(ac%nodes)) deallocate(ac%nodes)
    if (allocated(ac%pattern_lengths)) deallocate(ac%pattern_lengths)
    ac%num_nodes = 0
    ac%capacity = 0
    ac%num_patterns = 0
    ac%compiled = .false.
  end subroutine ac_free

  pure function ac_step(ac, state, c) result(next)
    !> Goto function: follow failure links from state until a transition on
    !> character code c exists, then take it; stay at the root if none does.
    type(ac_automaton_t), intent(in) :: ac
    integer, intent(in) :: state, c
    integer :: next

    next = state
    do
      if (ac%nodes(next)%children(c) /= 0) then
        next = ac%nodes(next)%children(c)
        return
      end if
      if (next == ROOT) return
      next = ac%nodes(next)%failure
    end do
  end function ac_step

  pure function char_code(ch, fold) result(code)
    !> Character code of ch as an index into children(0:255); when fold is
    !> true, ASCII upper-case letters are mapped to lower case.
    character(len=1), intent(in) :: ch
    logical, intent(in) :: fold
    integer :: code

    code = iand(ichar(ch), 255)   ! keep the index inside children(0:255)
    if (fold) then
      if (code >= ichar('A') .and. code <= ichar('Z')) code = code + 32
    end if
  end function char_code

end module aho_corasick
src/regex/regex_api.f90 (modified)
@@ -26,6 +26,7 @@ module regex_api
| 26 | 26 | integer :: error_code = 0 |
| 27 | 27 | character(len=256) :: error_msg = '' |
| 28 | 28 | integer :: num_groups = 0 |
| 29 | + character(len=4096) :: pattern = '' ! Original pattern for AC detection | |
| 29 | 30 | contains |
| 30 | 31 | procedure :: is_compiled => regex_is_compiled |
| 31 | 32 | end type regex_t |
@@ -94,6 +95,10 @@ contains | ||
| 94 | 95 | ! Optimize NFA for faster matching |
| 95 | 96 | call optimize_nfa(re%opt_nfa, re%nfa) |
| 96 | 97 | |
| 98 | + ! Store pattern and try Aho-Corasick for alternation patterns | |
| 99 | + re%pattern = pattern | |
| 100 | + call try_build_aho_corasick(re%opt_nfa, pattern, extended, .false.) | |
| 101 | + | |
| 97 | 102 | re%compiled = .true. |
| 98 | 103 | |
| 99 | 104 | contains |
src/regex/regex_optimizer.f90 (modified)
@@ -5,12 +5,15 @@ module regex_optimizer
| 5 | 5 | !> - Bit vector state sets for O(1) operations |
| 6 | 6 | !> - Lazy DFA state caching |
| 7 | 7 | !> - Anchored pattern fast paths |
| 8 | + !> - Aho-Corasick for alternation patterns | |
| 8 | 9 | use regex_types |
| 10 | + use aho_corasick | |
| 9 | 11 | implicit none |
| 10 | 12 | private |
| 11 | 13 | |
| 12 | 14 | public :: optimized_nfa_t |
| 13 | 15 | public :: optimize_nfa, optimized_match, optimized_search |
| 16 | + public :: try_build_aho_corasick | |
| 14 | 17 | |
| 15 | 18 | integer, parameter :: MAX_STATES = 1024 |
| 16 | 19 | integer, parameter :: MAX_PREFIX_LEN = 64 |
@@ -70,6 +73,8 @@ module regex_optimizer | ||
| 70 | 73 | type(dfa_cache_entry_t) :: dfa_cache(DFA_CACHE_SIZE) ! Lazy DFA cache |
| 71 | 74 | type(compiled_dfa_t) :: dfa ! Full compiled DFA (if available) |
| 72 | 75 | logical :: use_dfa = .false. ! Use DFA instead of NFA |
| 76 | + type(ac_automaton_t) :: ac ! Aho-Corasick automaton (for alternation) | |
| 77 | + logical :: use_aho_corasick = .false. ! Use Aho-Corasick for matching | |
| 73 | 78 | logical :: optimized = .false. |
| 74 | 79 | end type optimized_nfa_t |
| 75 | 80 | |
@@ -584,6 +589,15 @@ contains | ||
| 584 | 589 | res%matched = .false. |
| 585 | 590 | text_len = len_trim(text) |
| 586 | 591 | |
| 592 | + ! Fast path: use Aho-Corasick for alternation patterns | |
| 593 | + ! Only use AC if ignore_case setting matches what was compiled | |
| 594 | + if (opt%use_aho_corasick) then | |
| 595 | + if (ignore_case .eqv. opt%ac%ignore_case) then | |
| 596 | + res = ac_optimized_search(opt%ac, text) | |
| 597 | + return | |
| 598 | + end if | |
| 599 | + end if | |
| 600 | + | |
| 587 | 601 | if (opt%nfa%num_states == 0) return |
| 588 | 602 | |
| 589 | 603 | ! Fast path: use DFA if available (O(n) matching) |
@@ -1097,4 +1111,197 @@ contains | ||
| 1097 | 1111 | if (negated) res = .not. res |
| 1098 | 1112 | end function char_in_class_opt |
| 1099 | 1113 | |
| 1114 | + !--------------------------------------------------------------------------- | |
| 1115 | + ! Aho-Corasick Integration for Alternation Patterns | |
| 1116 | + !--------------------------------------------------------------------------- | |
| 1117 | + | |
subroutine try_build_aho_corasick(opt, pattern, is_ere, ignore_case)
  !> Try to build an Aho-Corasick automaton for a simple alternation such as
  !> "foo|bar|baz" (literal characters separated by alternation only).
  !>
  !> opt         : optimised NFA; opt%ac is (re)built and opt%use_aho_corasick
  !>               is set to .true. only when the automaton was built
  !> pattern     : regex source text
  !> is_ere      : .true. for ERE syntax ("|"), .false. for BRE ("\|")
  !> ignore_case : build the automaton for case-insensitive matching
  !>
  !> The scratch array is sized from the pattern itself: every alternation
  !> separator contains a '|' character, so count('|') + 1 bounds the number
  !> of alternatives and len_trim(pattern) bounds the length of each one.
  type(optimized_nfa_t), intent(inout) :: opt
  character(len=*), intent(in) :: pattern
  logical, intent(in) :: is_ere, ignore_case

  character(len=:), allocatable :: alternatives(:)
  integer :: num_alternatives, ierr, pat_len, max_alt, k, astat
  logical :: is_simple

  opt%use_aho_corasick = .false.

  pat_len = len_trim(pattern)
  if (pat_len == 0) return

  ! Upper bound on the number of alternatives.
  max_alt = 1
  do k = 1, pat_len
    if (pattern(k:k) == '|') max_alt = max_alt + 1
  end do
  ! Without any '|' there cannot be two alternatives; skip the parse.
  if (max_alt < 2) return

  allocate(character(len=pat_len) :: alternatives(max_alt), stat=astat)
  if (astat /= 0) return   ! Out of memory: keep using the regular engine

  ! Check if pattern is a simple alternation of literals
  call parse_simple_alternation(pattern, is_ere, alternatives, num_alternatives, is_simple)
  if (.not. is_simple .or. num_alternatives < 2) return

  ! Build the automaton; only switch the fast path on if that succeeded.
  call ac_build(opt%ac, alternatives, num_alternatives, ignore_case, ierr)
  opt%use_aho_corasick = (ierr == 0)

  ! alternatives is a local allocatable and is released automatically.
end subroutine try_build_aho_corasick
| 1151 | + | |
subroutine parse_simple_alternation(pattern, is_ere, alternatives, num_alt, is_simple)
  !> Decide whether pattern is a plain alternation of literal strings and,
  !> if so, return the literals with escape backslashes removed.
  !>
  !> pattern      : regex source text (trailing blanks are ignored)
  !> is_ere       : .true.  -> "|" separates alternatives, "\|" is a literal |
  !>                .false. -> "\|" separates alternatives (GNU BRE), "|" is literal
  !> alternatives : receives the literal alternatives in entries 1..num_alt
  !> num_alt      : number of alternatives stored
  !> is_simple    : .true. only if the whole pattern consists of at least two
  !>                non-empty literal alternatives
  !>
  !> The pattern is rejected (is_simple = .false.) when it contains
  !>   - an unescaped metacharacter of the selected syntax,
  !>   - a backslash followed by a letter, a digit or one of ( ) { } < >
  !>     (character classes, anchors, back-references, groups, intervals),
  !>   - in BRE mode, "\+" or "\?" (GNU repetition operators),
  !>   - a dangling backslash at the end,
  !>   - an empty alternative, or one ending in a blank (blank-padded storage
  !>     cannot represent it),
  !>   - more alternatives, or longer ones, than alternatives(:) can hold.
  character(len=*), intent(in) :: pattern
  logical, intent(in) :: is_ere
  character(len=*), intent(out) :: alternatives(:)
  integer, intent(out) :: num_alt
  logical, intent(out) :: is_simple

  character(len=*), parameter :: ALNUM = &
      'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789'
  character(len=len(pattern)) :: literal   ! unescaped text of the current alternative
  character(len=1) :: c
  integer :: i, pat_len, lit_len
  logical :: escaped, separator, reject, stored

  is_simple = .false.
  num_alt = 0
  pat_len = len_trim(pattern)
  if (pat_len == 0) return

  lit_len = 0
  i = 1
  do while (i <= pat_len)
    c = pattern(i:i)

    ! Consume a backslash together with the character it escapes.
    escaped = (c == '\')
    if (escaped) then
      if (i == pat_len) return   ! dangling backslash
      i = i + 1
      c = pattern(i:i)
    end if

    separator = .false.
    reject = .false.
    if (escaped) then
      if (c == '|') then
        ! \| is alternation in BRE (GNU extension) and a literal | in ERE.
        separator = .not. is_ere
      else
        reject = index(ALNUM, c) > 0 .or. index('(){}<>', c) > 0
        if (.not. is_ere) reject = reject .or. index('+?', c) > 0
      end if
    else if (is_ere) then
      separator = (c == '|')
      reject = index('.*+?[]^$(){}', c) > 0
    else
      ! BRE: an unescaped | + ? ( ) { } is an ordinary character.
      reject = index('.*[]^$', c) > 0
    end if
    if (reject) return

    if (separator) then
      call store_alternative(stored)
      if (.not. stored) return
    else
      lit_len = lit_len + 1
      literal(lit_len:lit_len) = c
    end if
    i = i + 1
  end do

  ! Text after the last separator (or the whole pattern if there was none).
  call store_alternative(stored)
  if (.not. stored) return

  ! Need at least 2 alternatives for Aho-Corasick to be useful
  is_simple = (num_alt >= 2)

contains

  subroutine store_alternative(ok)
    !> Append the pending literal to alternatives(:) and reset it.
    !> ok is .false. if the literal is unusable or does not fit.
    logical, intent(out) :: ok

    ok = .false.
    if (lit_len == 0) return
    if (literal(lit_len:lit_len) == ' ') return
    if (lit_len > len(alternatives)) return
    if (num_alt >= size(alternatives)) return

    num_alt = num_alt + 1
    alternatives(num_alt) = literal(1:lit_len)
    lit_len = 0
    ok = .true.
  end subroutine store_alternative

end subroutine parse_simple_alternation
| 1287 | + | |
function ac_optimized_search(ac, text) result(res)
  !> Adapter: run ac_search on text and translate its ac_match_t result
  !> into the optimizer's match_result_t.
  !> res%match_start / res%match_end are only assigned when a match exists.
  type(ac_automaton_t), intent(in) :: ac
  character(len=*), intent(in) :: text
  type(match_result_t) :: res

  type(ac_match_t) :: hit

  hit = ac_search(ac, text)

  res%matched = .false.
  if (.not. hit%matched) return

  ! Copy the 1-based inclusive span reported by the automaton.
  res%match_end = hit%end_pos
  res%match_start = hit%start_pos
  res%matched = .true.
end function ac_optimized_search
| 1306 | + | |
| 1100 | 1307 | end module regex_optimizer |