Fortran · 10089 bytes Raw Blame History
1 module aho_corasick
2 !> Aho-Corasick automaton for multi-pattern string matching
3 !> Matches all patterns in a single pass O(n + m + z)
4 !> where n=text length, m=total pattern length, z=matches
5 use ferp_kinds, only: pattern_len
6 implicit none
7 private
8
9 public :: ac_automaton_t, ac_match_t
10 public :: ac_build, ac_search, ac_search_any, ac_free
11
12 integer, parameter :: MAX_CHILDREN = 256 ! ASCII character set
13 integer, parameter :: MAX_PATTERNS = 1000
14 integer, parameter :: MAX_PATTERN_LEN = 4096
15
16 type :: ac_node_t
17 !> Trie node with failure links
18 integer :: children(0:255) = 0 ! Child node indices (0 = no child)
19 integer :: failure = 0 ! Failure link (fall back on mismatch)
20 integer :: output_pattern = 0 ! Pattern index that ends here (0 = none)
21 integer :: output_link = 0 ! Link to next output state
22 integer :: depth = 0 ! Depth in trie (= prefix length)
23 end type ac_node_t
24
25 type :: ac_automaton_t
26 !> Aho-Corasick automaton
27 type(ac_node_t), allocatable :: nodes(:)
28 integer :: num_nodes = 0
29 integer :: capacity = 0
30 integer :: num_patterns = 0
31 integer, allocatable :: pattern_lengths(:)
32 logical :: compiled = .false.
33 logical :: ignore_case = .false.
34 end type ac_automaton_t
35
36 type :: ac_match_t
37 !> Match result
38 logical :: matched = .false.
39 integer :: pattern_idx = 0 ! Which pattern matched (1-based)
40 integer :: start_pos = 0 ! Start position in text (1-based)
41 integer :: end_pos = 0 ! End position in text (1-based)
42 end type ac_match_t
43
44 contains
45
46 subroutine ac_build(ac, patterns, num_patterns, ignore_case, ierr)
47 !> Build Aho-Corasick automaton from patterns
48 type(ac_automaton_t), intent(out) :: ac
49 character(len=*), intent(in) :: patterns(:)
50 integer, intent(in) :: num_patterns
51 logical, intent(in) :: ignore_case
52 integer, intent(out) :: ierr
53
54 integer :: i, j, c, state, next_state, child
55 integer, allocatable :: queue(:)
56 integer :: q_head, q_tail
57 integer :: fail_state
58 character(len=1) :: ch
59
60 ierr = 0
61
62 ! Allocate BFS queue
63 allocate(queue(MAX_PATTERNS * MAX_PATTERN_LEN))
64 ac%ignore_case = ignore_case
65 ac%num_patterns = num_patterns
66
67 ! Allocate pattern lengths (use pattern_len to preserve whitespace patterns)
68 allocate(ac%pattern_lengths(num_patterns))
69 do i = 1, num_patterns
70 ac%pattern_lengths(i) = pattern_len(patterns(i))
71 end do
72
73 ! Initial capacity - estimate based on total pattern length
74 ac%capacity = 1
75 do i = 1, num_patterns
76 ac%capacity = ac%capacity + pattern_len(patterns(i))
77 end do
78 ac%capacity = max(ac%capacity, 256)
79 allocate(ac%nodes(ac%capacity))
80
81 ! Initialize root node (index 1)
82 ac%num_nodes = 1
83 ac%nodes(1)%depth = 0
84
85 ! Phase 1: Build trie from patterns
86 do i = 1, num_patterns
87 state = 1 ! Start at root
88 do j = 1, pattern_len(patterns(i))
89 ch = patterns(i)(j:j)
90 if (ignore_case) then
91 c = to_lower_code(ichar(ch))
92 else
93 c = ichar(ch)
94 end if
95
96 child = ac%nodes(state)%children(c)
97 if (child == 0) then
98 ! Create new node
99 ac%num_nodes = ac%num_nodes + 1
100 if (ac%num_nodes > ac%capacity) then
101 call grow_nodes(ac)
102 end if
103 ac%nodes(state)%children(c) = ac%num_nodes
104 ac%nodes(ac%num_nodes)%depth = ac%nodes(state)%depth + 1
105 child = ac%num_nodes
106 end if
107 state = child
108 end do
109 ! Mark this state as accepting for pattern i
110 ac%nodes(state)%output_pattern = i
111 end do
112
113 ! Phase 2: Compute failure links using BFS
114 q_head = 1
115 q_tail = 0
116
117 ! Initialize: depth-1 nodes fail to root
118 do c = 0, 255
119 child = ac%nodes(1)%children(c)
120 if (child /= 0) then
121 ac%nodes(child)%failure = 1 ! Fail to root
122 q_tail = q_tail + 1
123 queue(q_tail) = child
124 end if
125 end do
126
127 ! BFS to compute failure links for deeper nodes
128 do while (q_head <= q_tail)
129 state = queue(q_head)
130 q_head = q_head + 1
131
132 do c = 0, 255
133 child = ac%nodes(state)%children(c)
134 if (child /= 0) then
135 ! Add to queue
136 q_tail = q_tail + 1
137 queue(q_tail) = child
138
139 ! Compute failure link: follow parent's failure until we find
140 ! a state with a transition on c, or reach root
141 fail_state = ac%nodes(state)%failure
142 do while (fail_state > 1)
143 if (ac%nodes(fail_state)%children(c) /= 0) exit
144 fail_state = ac%nodes(fail_state)%failure
145 end do
146
147 if (fail_state <= 1) then
148 ! At or beyond root
149 if (ac%nodes(1)%children(c) /= 0 .and. ac%nodes(1)%children(c) /= child) then
150 ac%nodes(child)%failure = ac%nodes(1)%children(c)
151 else
152 ac%nodes(child)%failure = 1 ! Fail to root
153 end if
154 else if (ac%nodes(fail_state)%children(c) == child) then
155 ! Would create self-loop, fail to root
156 ac%nodes(child)%failure = 1
157 else
158 ac%nodes(child)%failure = ac%nodes(fail_state)%children(c)
159 end if
160
161 ! Compute output link: chain of accepting states via failure links
162 if (ac%nodes(ac%nodes(child)%failure)%output_pattern /= 0) then
163 ac%nodes(child)%output_link = ac%nodes(child)%failure
164 else
165 ac%nodes(child)%output_link = ac%nodes(ac%nodes(child)%failure)%output_link
166 end if
167 end if
168 end do
169 end do
170
171 ac%compiled = .true.
172
173 deallocate(queue)
174
175 contains
176
177 function to_lower_code(c) result(lc)
178 integer, intent(in) :: c
179 integer :: lc
180 if (c >= ichar('A') .and. c <= ichar('Z')) then
181 lc = c + 32
182 else
183 lc = c
184 end if
185 end function to_lower_code
186
187 end subroutine ac_build
188
189 subroutine grow_nodes(ac)
190 !> Double the node capacity
191 type(ac_automaton_t), intent(inout) :: ac
192 type(ac_node_t), allocatable :: temp(:)
193 integer :: new_cap
194
195 new_cap = ac%capacity * 2
196 allocate(temp(new_cap))
197 temp(1:ac%num_nodes) = ac%nodes(1:ac%num_nodes)
198 call move_alloc(temp, ac%nodes)
199 ac%capacity = new_cap
200 end subroutine grow_nodes
201
202 function ac_search_any(ac, text) result(found)
203 !> Search for any pattern match (fast path for existence check)
204 type(ac_automaton_t), intent(in) :: ac
205 character(len=*), intent(in) :: text
206 logical :: found
207
208 integer :: i, c, state, next_state, text_len
209
210 found = .false.
211 if (.not. ac%compiled) return
212
213 text_len = len(text)
214 state = 1 ! Start at root
215
216 do i = 1, text_len
217 if (ac%ignore_case) then
218 c = to_lower_code(ichar(text(i:i)))
219 else
220 c = ichar(text(i:i))
221 end if
222
223 ! Follow failure links until we find a transition or reach root
224 do while (state /= 1 .and. ac%nodes(state)%children(c) == 0)
225 state = ac%nodes(state)%failure
226 end do
227
228 next_state = ac%nodes(state)%children(c)
229 if (next_state /= 0) then
230 state = next_state
231 else
232 state = 1 ! Stay at root if no transition
233 end if
234
235 ! Check for match at current state or via output links
236 if (ac%nodes(state)%output_pattern /= 0) then
237 found = .true.
238 return
239 end if
240 if (ac%nodes(state)%output_link /= 0) then
241 found = .true.
242 return
243 end if
244 end do
245
246 contains
247
248 function to_lower_code(c) result(lc)
249 integer, intent(in) :: c
250 integer :: lc
251 if (c >= ichar('A') .and. c <= ichar('Z')) then
252 lc = c + 32
253 else
254 lc = c
255 end if
256 end function to_lower_code
257
258 end function ac_search_any
259
260 function ac_search(ac, text) result(match)
261 !> Search for first pattern match with position info
262 type(ac_automaton_t), intent(in) :: ac
263 character(len=*), intent(in) :: text
264 type(ac_match_t) :: match
265
266 integer :: i, c, state, next_state, text_len, pat_idx, out_state
267
268 match%matched = .false.
269 if (.not. ac%compiled) return
270
271 text_len = len(text)
272 state = 1 ! Start at root
273
274 do i = 1, text_len
275 if (ac%ignore_case) then
276 c = to_lower_code(ichar(text(i:i)))
277 else
278 c = ichar(text(i:i))
279 end if
280
281 ! Follow failure links until we find a transition or reach root
282 do while (state /= 1 .and. ac%nodes(state)%children(c) == 0)
283 state = ac%nodes(state)%failure
284 end do
285
286 next_state = ac%nodes(state)%children(c)
287 if (next_state /= 0) then
288 state = next_state
289 else
290 state = 1
291 end if
292
293 ! Check for match at current state
294 pat_idx = ac%nodes(state)%output_pattern
295 if (pat_idx /= 0) then
296 match%matched = .true.
297 match%pattern_idx = pat_idx
298 match%end_pos = i
299 match%start_pos = i - ac%pattern_lengths(pat_idx) + 1
300 return
301 end if
302
303 ! Check output links for overlapping patterns
304 out_state = ac%nodes(state)%output_link
305 if (out_state /= 0) then
306 pat_idx = ac%nodes(out_state)%output_pattern
307 if (pat_idx /= 0) then
308 match%matched = .true.
309 match%pattern_idx = pat_idx
310 match%end_pos = i
311 match%start_pos = i - ac%pattern_lengths(pat_idx) + 1
312 return
313 end if
314 end if
315 end do
316
317 contains
318
319 function to_lower_code(c) result(lc)
320 integer, intent(in) :: c
321 integer :: lc
322 if (c >= ichar('A') .and. c <= ichar('Z')) then
323 lc = c + 32
324 else
325 lc = c
326 end if
327 end function to_lower_code
328
329 end function ac_search
330
331 subroutine ac_free(ac)
332 !> Free automaton resources
333 type(ac_automaton_t), intent(inout) :: ac
334
335 if (allocated(ac%nodes)) deallocate(ac%nodes)
336 if (allocated(ac%pattern_lengths)) deallocate(ac%pattern_lengths)
337 ac%num_nodes = 0
338 ac%capacity = 0
339 ac%num_patterns = 0
340 ac%compiled = .false.
341 end subroutine ac_free
342
343 end module aho_corasick
344