| 1 | module aho_corasick |
| 2 | !> Aho-Corasick automaton for multi-pattern string matching |
| 3 | !> Matches all patterns in a single pass O(n + m + z) |
| 4 | !> where n=text length, m=total pattern length, z=matches |
| 5 | use ferp_kinds, only: pattern_len |
| 6 | implicit none |
| 7 | private |
| 8 | |
| 9 | public :: ac_automaton_t, ac_match_t |
| 10 | public :: ac_build, ac_search, ac_search_any, ac_free |
| 11 | |
| 12 | integer, parameter :: MAX_CHILDREN = 256 ! ASCII character set |
| 13 | integer, parameter :: MAX_PATTERNS = 1000 |
| 14 | integer, parameter :: MAX_PATTERN_LEN = 4096 |
| 15 | |
| 16 | type :: ac_node_t |
| 17 | !> Trie node with failure links |
| 18 | integer :: children(0:255) = 0 ! Child node indices (0 = no child) |
| 19 | integer :: failure = 0 ! Failure link (fall back on mismatch) |
| 20 | integer :: output_pattern = 0 ! Pattern index that ends here (0 = none) |
| 21 | integer :: output_link = 0 ! Link to next output state |
| 22 | integer :: depth = 0 ! Depth in trie (= prefix length) |
| 23 | end type ac_node_t |
| 24 | |
| 25 | type :: ac_automaton_t |
| 26 | !> Aho-Corasick automaton |
| 27 | type(ac_node_t), allocatable :: nodes(:) |
| 28 | integer :: num_nodes = 0 |
| 29 | integer :: capacity = 0 |
| 30 | integer :: num_patterns = 0 |
| 31 | integer, allocatable :: pattern_lengths(:) |
| 32 | logical :: compiled = .false. |
| 33 | logical :: ignore_case = .false. |
| 34 | end type ac_automaton_t |
| 35 | |
| 36 | type :: ac_match_t |
| 37 | !> Match result |
| 38 | logical :: matched = .false. |
| 39 | integer :: pattern_idx = 0 ! Which pattern matched (1-based) |
| 40 | integer :: start_pos = 0 ! Start position in text (1-based) |
| 41 | integer :: end_pos = 0 ! End position in text (1-based) |
| 42 | end type ac_match_t |
| 43 | |
| 44 | contains |
| 45 | |
| 46 | subroutine ac_build(ac, patterns, num_patterns, ignore_case, ierr) |
| 47 | !> Build Aho-Corasick automaton from patterns |
| 48 | type(ac_automaton_t), intent(out) :: ac |
| 49 | character(len=*), intent(in) :: patterns(:) |
| 50 | integer, intent(in) :: num_patterns |
| 51 | logical, intent(in) :: ignore_case |
| 52 | integer, intent(out) :: ierr |
| 53 | |
| 54 | integer :: i, j, c, state, next_state, child |
| 55 | integer, allocatable :: queue(:) |
| 56 | integer :: q_head, q_tail |
| 57 | integer :: fail_state |
| 58 | character(len=1) :: ch |
| 59 | |
| 60 | ierr = 0 |
| 61 | |
| 62 | ! Allocate BFS queue |
| 63 | allocate(queue(MAX_PATTERNS * MAX_PATTERN_LEN)) |
| 64 | ac%ignore_case = ignore_case |
| 65 | ac%num_patterns = num_patterns |
| 66 | |
| 67 | ! Allocate pattern lengths (use pattern_len to preserve whitespace patterns) |
| 68 | allocate(ac%pattern_lengths(num_patterns)) |
| 69 | do i = 1, num_patterns |
| 70 | ac%pattern_lengths(i) = pattern_len(patterns(i)) |
| 71 | end do |
| 72 | |
| 73 | ! Initial capacity - estimate based on total pattern length |
| 74 | ac%capacity = 1 |
| 75 | do i = 1, num_patterns |
| 76 | ac%capacity = ac%capacity + pattern_len(patterns(i)) |
| 77 | end do |
| 78 | ac%capacity = max(ac%capacity, 256) |
| 79 | allocate(ac%nodes(ac%capacity)) |
| 80 | |
| 81 | ! Initialize root node (index 1) |
| 82 | ac%num_nodes = 1 |
| 83 | ac%nodes(1)%depth = 0 |
| 84 | |
| 85 | ! Phase 1: Build trie from patterns |
| 86 | do i = 1, num_patterns |
| 87 | state = 1 ! Start at root |
| 88 | do j = 1, pattern_len(patterns(i)) |
| 89 | ch = patterns(i)(j:j) |
| 90 | if (ignore_case) then |
| 91 | c = to_lower_code(ichar(ch)) |
| 92 | else |
| 93 | c = ichar(ch) |
| 94 | end if |
| 95 | |
| 96 | child = ac%nodes(state)%children(c) |
| 97 | if (child == 0) then |
| 98 | ! Create new node |
| 99 | ac%num_nodes = ac%num_nodes + 1 |
| 100 | if (ac%num_nodes > ac%capacity) then |
| 101 | call grow_nodes(ac) |
| 102 | end if |
| 103 | ac%nodes(state)%children(c) = ac%num_nodes |
| 104 | ac%nodes(ac%num_nodes)%depth = ac%nodes(state)%depth + 1 |
| 105 | child = ac%num_nodes |
| 106 | end if |
| 107 | state = child |
| 108 | end do |
| 109 | ! Mark this state as accepting for pattern i |
| 110 | ac%nodes(state)%output_pattern = i |
| 111 | end do |
| 112 | |
| 113 | ! Phase 2: Compute failure links using BFS |
| 114 | q_head = 1 |
| 115 | q_tail = 0 |
| 116 | |
| 117 | ! Initialize: depth-1 nodes fail to root |
| 118 | do c = 0, 255 |
| 119 | child = ac%nodes(1)%children(c) |
| 120 | if (child /= 0) then |
| 121 | ac%nodes(child)%failure = 1 ! Fail to root |
| 122 | q_tail = q_tail + 1 |
| 123 | queue(q_tail) = child |
| 124 | end if |
| 125 | end do |
| 126 | |
| 127 | ! BFS to compute failure links for deeper nodes |
| 128 | do while (q_head <= q_tail) |
| 129 | state = queue(q_head) |
| 130 | q_head = q_head + 1 |
| 131 | |
| 132 | do c = 0, 255 |
| 133 | child = ac%nodes(state)%children(c) |
| 134 | if (child /= 0) then |
| 135 | ! Add to queue |
| 136 | q_tail = q_tail + 1 |
| 137 | queue(q_tail) = child |
| 138 | |
| 139 | ! Compute failure link: follow parent's failure until we find |
| 140 | ! a state with a transition on c, or reach root |
| 141 | fail_state = ac%nodes(state)%failure |
| 142 | do while (fail_state > 1) |
| 143 | if (ac%nodes(fail_state)%children(c) /= 0) exit |
| 144 | fail_state = ac%nodes(fail_state)%failure |
| 145 | end do |
| 146 | |
| 147 | if (fail_state <= 1) then |
| 148 | ! At or beyond root |
| 149 | if (ac%nodes(1)%children(c) /= 0 .and. ac%nodes(1)%children(c) /= child) then |
| 150 | ac%nodes(child)%failure = ac%nodes(1)%children(c) |
| 151 | else |
| 152 | ac%nodes(child)%failure = 1 ! Fail to root |
| 153 | end if |
| 154 | else if (ac%nodes(fail_state)%children(c) == child) then |
| 155 | ! Would create self-loop, fail to root |
| 156 | ac%nodes(child)%failure = 1 |
| 157 | else |
| 158 | ac%nodes(child)%failure = ac%nodes(fail_state)%children(c) |
| 159 | end if |
| 160 | |
| 161 | ! Compute output link: chain of accepting states via failure links |
| 162 | if (ac%nodes(ac%nodes(child)%failure)%output_pattern /= 0) then |
| 163 | ac%nodes(child)%output_link = ac%nodes(child)%failure |
| 164 | else |
| 165 | ac%nodes(child)%output_link = ac%nodes(ac%nodes(child)%failure)%output_link |
| 166 | end if |
| 167 | end if |
| 168 | end do |
| 169 | end do |
| 170 | |
| 171 | ac%compiled = .true. |
| 172 | |
| 173 | deallocate(queue) |
| 174 | |
| 175 | contains |
| 176 | |
| 177 | function to_lower_code(c) result(lc) |
| 178 | integer, intent(in) :: c |
| 179 | integer :: lc |
| 180 | if (c >= ichar('A') .and. c <= ichar('Z')) then |
| 181 | lc = c + 32 |
| 182 | else |
| 183 | lc = c |
| 184 | end if |
| 185 | end function to_lower_code |
| 186 | |
| 187 | end subroutine ac_build |
| 188 | |
| 189 | subroutine grow_nodes(ac) |
| 190 | !> Double the node capacity |
| 191 | type(ac_automaton_t), intent(inout) :: ac |
| 192 | type(ac_node_t), allocatable :: temp(:) |
| 193 | integer :: new_cap |
| 194 | |
| 195 | new_cap = ac%capacity * 2 |
| 196 | allocate(temp(new_cap)) |
| 197 | temp(1:ac%num_nodes) = ac%nodes(1:ac%num_nodes) |
| 198 | call move_alloc(temp, ac%nodes) |
| 199 | ac%capacity = new_cap |
| 200 | end subroutine grow_nodes |
| 201 | |
| 202 | function ac_search_any(ac, text) result(found) |
| 203 | !> Search for any pattern match (fast path for existence check) |
| 204 | type(ac_automaton_t), intent(in) :: ac |
| 205 | character(len=*), intent(in) :: text |
| 206 | logical :: found |
| 207 | |
| 208 | integer :: i, c, state, next_state, text_len |
| 209 | |
| 210 | found = .false. |
| 211 | if (.not. ac%compiled) return |
| 212 | |
| 213 | text_len = len(text) |
| 214 | state = 1 ! Start at root |
| 215 | |
| 216 | do i = 1, text_len |
| 217 | if (ac%ignore_case) then |
| 218 | c = to_lower_code(ichar(text(i:i))) |
| 219 | else |
| 220 | c = ichar(text(i:i)) |
| 221 | end if |
| 222 | |
| 223 | ! Follow failure links until we find a transition or reach root |
| 224 | do while (state /= 1 .and. ac%nodes(state)%children(c) == 0) |
| 225 | state = ac%nodes(state)%failure |
| 226 | end do |
| 227 | |
| 228 | next_state = ac%nodes(state)%children(c) |
| 229 | if (next_state /= 0) then |
| 230 | state = next_state |
| 231 | else |
| 232 | state = 1 ! Stay at root if no transition |
| 233 | end if |
| 234 | |
| 235 | ! Check for match at current state or via output links |
| 236 | if (ac%nodes(state)%output_pattern /= 0) then |
| 237 | found = .true. |
| 238 | return |
| 239 | end if |
| 240 | if (ac%nodes(state)%output_link /= 0) then |
| 241 | found = .true. |
| 242 | return |
| 243 | end if |
| 244 | end do |
| 245 | |
| 246 | contains |
| 247 | |
| 248 | function to_lower_code(c) result(lc) |
| 249 | integer, intent(in) :: c |
| 250 | integer :: lc |
| 251 | if (c >= ichar('A') .and. c <= ichar('Z')) then |
| 252 | lc = c + 32 |
| 253 | else |
| 254 | lc = c |
| 255 | end if |
| 256 | end function to_lower_code |
| 257 | |
| 258 | end function ac_search_any |
| 259 | |
| 260 | function ac_search(ac, text) result(match) |
| 261 | !> Search for first pattern match with position info |
| 262 | type(ac_automaton_t), intent(in) :: ac |
| 263 | character(len=*), intent(in) :: text |
| 264 | type(ac_match_t) :: match |
| 265 | |
| 266 | integer :: i, c, state, next_state, text_len, pat_idx, out_state |
| 267 | |
| 268 | match%matched = .false. |
| 269 | if (.not. ac%compiled) return |
| 270 | |
| 271 | text_len = len(text) |
| 272 | state = 1 ! Start at root |
| 273 | |
| 274 | do i = 1, text_len |
| 275 | if (ac%ignore_case) then |
| 276 | c = to_lower_code(ichar(text(i:i))) |
| 277 | else |
| 278 | c = ichar(text(i:i)) |
| 279 | end if |
| 280 | |
| 281 | ! Follow failure links until we find a transition or reach root |
| 282 | do while (state /= 1 .and. ac%nodes(state)%children(c) == 0) |
| 283 | state = ac%nodes(state)%failure |
| 284 | end do |
| 285 | |
| 286 | next_state = ac%nodes(state)%children(c) |
| 287 | if (next_state /= 0) then |
| 288 | state = next_state |
| 289 | else |
| 290 | state = 1 |
| 291 | end if |
| 292 | |
| 293 | ! Check for match at current state |
| 294 | pat_idx = ac%nodes(state)%output_pattern |
| 295 | if (pat_idx /= 0) then |
| 296 | match%matched = .true. |
| 297 | match%pattern_idx = pat_idx |
| 298 | match%end_pos = i |
| 299 | match%start_pos = i - ac%pattern_lengths(pat_idx) + 1 |
| 300 | return |
| 301 | end if |
| 302 | |
| 303 | ! Check output links for overlapping patterns |
| 304 | out_state = ac%nodes(state)%output_link |
| 305 | if (out_state /= 0) then |
| 306 | pat_idx = ac%nodes(out_state)%output_pattern |
| 307 | if (pat_idx /= 0) then |
| 308 | match%matched = .true. |
| 309 | match%pattern_idx = pat_idx |
| 310 | match%end_pos = i |
| 311 | match%start_pos = i - ac%pattern_lengths(pat_idx) + 1 |
| 312 | return |
| 313 | end if |
| 314 | end if |
| 315 | end do |
| 316 | |
| 317 | contains |
| 318 | |
| 319 | function to_lower_code(c) result(lc) |
| 320 | integer, intent(in) :: c |
| 321 | integer :: lc |
| 322 | if (c >= ichar('A') .and. c <= ichar('Z')) then |
| 323 | lc = c + 32 |
| 324 | else |
| 325 | lc = c |
| 326 | end if |
| 327 | end function to_lower_code |
| 328 | |
| 329 | end function ac_search |
| 330 | |
| 331 | subroutine ac_free(ac) |
| 332 | !> Free automaton resources |
| 333 | type(ac_automaton_t), intent(inout) :: ac |
| 334 | |
| 335 | if (allocated(ac%nodes)) deallocate(ac%nodes) |
| 336 | if (allocated(ac%pattern_lengths)) deallocate(ac%pattern_lengths) |
| 337 | ac%num_nodes = 0 |
| 338 | ac%capacity = 0 |
| 339 | ac%num_patterns = 0 |
| 340 | ac%compiled = .false. |
| 341 | end subroutine ac_free |
| 342 | |
| 343 | end module aho_corasick |
| 344 |