Fortran · 9994 bytes Raw Blame History
1 module aho_corasick
2 !> Aho-Corasick automaton for multi-pattern string matching
3 !> Matches all patterns in a single pass O(n + m + z)
4 !> where n=text length, m=total pattern length, z=matches
5 implicit none
6 private
7
8 public :: ac_automaton_t, ac_match_t
9 public :: ac_build, ac_search, ac_search_any, ac_free
10
11 integer, parameter :: MAX_CHILDREN = 256 ! ASCII character set
12 integer, parameter :: MAX_PATTERNS = 1000
13 integer, parameter :: MAX_PATTERN_LEN = 4096
14
15 type :: ac_node_t
16 !> Trie node with failure links
17 integer :: children(0:255) = 0 ! Child node indices (0 = no child)
18 integer :: failure = 0 ! Failure link (fall back on mismatch)
19 integer :: output_pattern = 0 ! Pattern index that ends here (0 = none)
20 integer :: output_link = 0 ! Link to next output state
21 integer :: depth = 0 ! Depth in trie (= prefix length)
22 end type ac_node_t
23
24 type :: ac_automaton_t
25 !> Aho-Corasick automaton
26 type(ac_node_t), allocatable :: nodes(:)
27 integer :: num_nodes = 0
28 integer :: capacity = 0
29 integer :: num_patterns = 0
30 integer, allocatable :: pattern_lengths(:)
31 logical :: compiled = .false.
32 logical :: ignore_case = .false.
33 end type ac_automaton_t
34
35 type :: ac_match_t
36 !> Match result
37 logical :: matched = .false.
38 integer :: pattern_idx = 0 ! Which pattern matched (1-based)
39 integer :: start_pos = 0 ! Start position in text (1-based)
40 integer :: end_pos = 0 ! End position in text (1-based)
41 end type ac_match_t
42
43 contains
44
45 subroutine ac_build(ac, patterns, num_patterns, ignore_case, ierr)
46 !> Build Aho-Corasick automaton from patterns
47 type(ac_automaton_t), intent(out) :: ac
48 character(len=*), intent(in) :: patterns(:)
49 integer, intent(in) :: num_patterns
50 logical, intent(in) :: ignore_case
51 integer, intent(out) :: ierr
52
53 integer :: i, j, c, state, next_state, child
54 integer, allocatable :: queue(:)
55 integer :: q_head, q_tail
56 integer :: fail_state
57 character(len=1) :: ch
58
59 ierr = 0
60
61 ! Allocate BFS queue
62 allocate(queue(MAX_PATTERNS * MAX_PATTERN_LEN))
63 ac%ignore_case = ignore_case
64 ac%num_patterns = num_patterns
65
66 ! Allocate pattern lengths
67 allocate(ac%pattern_lengths(num_patterns))
68 do i = 1, num_patterns
69 ac%pattern_lengths(i) = len_trim(patterns(i))
70 end do
71
72 ! Initial capacity - estimate based on total pattern length
73 ac%capacity = 1
74 do i = 1, num_patterns
75 ac%capacity = ac%capacity + len_trim(patterns(i))
76 end do
77 ac%capacity = max(ac%capacity, 256)
78 allocate(ac%nodes(ac%capacity))
79
80 ! Initialize root node (index 1)
81 ac%num_nodes = 1
82 ac%nodes(1)%depth = 0
83
84 ! Phase 1: Build trie from patterns
85 do i = 1, num_patterns
86 state = 1 ! Start at root
87 do j = 1, len_trim(patterns(i))
88 ch = patterns(i)(j:j)
89 if (ignore_case) then
90 c = to_lower_code(ichar(ch))
91 else
92 c = ichar(ch)
93 end if
94
95 child = ac%nodes(state)%children(c)
96 if (child == 0) then
97 ! Create new node
98 ac%num_nodes = ac%num_nodes + 1
99 if (ac%num_nodes > ac%capacity) then
100 call grow_nodes(ac)
101 end if
102 ac%nodes(state)%children(c) = ac%num_nodes
103 ac%nodes(ac%num_nodes)%depth = ac%nodes(state)%depth + 1
104 child = ac%num_nodes
105 end if
106 state = child
107 end do
108 ! Mark this state as accepting for pattern i
109 ac%nodes(state)%output_pattern = i
110 end do
111
112 ! Phase 2: Compute failure links using BFS
113 q_head = 1
114 q_tail = 0
115
116 ! Initialize: depth-1 nodes fail to root
117 do c = 0, 255
118 child = ac%nodes(1)%children(c)
119 if (child /= 0) then
120 ac%nodes(child)%failure = 1 ! Fail to root
121 q_tail = q_tail + 1
122 queue(q_tail) = child
123 end if
124 end do
125
126 ! BFS to compute failure links for deeper nodes
127 do while (q_head <= q_tail)
128 state = queue(q_head)
129 q_head = q_head + 1
130
131 do c = 0, 255
132 child = ac%nodes(state)%children(c)
133 if (child /= 0) then
134 ! Add to queue
135 q_tail = q_tail + 1
136 queue(q_tail) = child
137
138 ! Compute failure link: follow parent's failure until we find
139 ! a state with a transition on c, or reach root
140 fail_state = ac%nodes(state)%failure
141 do while (fail_state > 1)
142 if (ac%nodes(fail_state)%children(c) /= 0) exit
143 fail_state = ac%nodes(fail_state)%failure
144 end do
145
146 if (fail_state <= 1) then
147 ! At or beyond root
148 if (ac%nodes(1)%children(c) /= 0 .and. ac%nodes(1)%children(c) /= child) then
149 ac%nodes(child)%failure = ac%nodes(1)%children(c)
150 else
151 ac%nodes(child)%failure = 1 ! Fail to root
152 end if
153 else if (ac%nodes(fail_state)%children(c) == child) then
154 ! Would create self-loop, fail to root
155 ac%nodes(child)%failure = 1
156 else
157 ac%nodes(child)%failure = ac%nodes(fail_state)%children(c)
158 end if
159
160 ! Compute output link: chain of accepting states via failure links
161 if (ac%nodes(ac%nodes(child)%failure)%output_pattern /= 0) then
162 ac%nodes(child)%output_link = ac%nodes(child)%failure
163 else
164 ac%nodes(child)%output_link = ac%nodes(ac%nodes(child)%failure)%output_link
165 end if
166 end if
167 end do
168 end do
169
170 ac%compiled = .true.
171
172 deallocate(queue)
173
174 contains
175
176 function to_lower_code(c) result(lc)
177 integer, intent(in) :: c
178 integer :: lc
179 if (c >= ichar('A') .and. c <= ichar('Z')) then
180 lc = c + 32
181 else
182 lc = c
183 end if
184 end function to_lower_code
185
186 end subroutine ac_build
187
188 subroutine grow_nodes(ac)
189 !> Double the node capacity
190 type(ac_automaton_t), intent(inout) :: ac
191 type(ac_node_t), allocatable :: temp(:)
192 integer :: new_cap
193
194 new_cap = ac%capacity * 2
195 allocate(temp(new_cap))
196 temp(1:ac%num_nodes) = ac%nodes(1:ac%num_nodes)
197 call move_alloc(temp, ac%nodes)
198 ac%capacity = new_cap
199 end subroutine grow_nodes
200
201 function ac_search_any(ac, text) result(found)
202 !> Search for any pattern match (fast path for existence check)
203 type(ac_automaton_t), intent(in) :: ac
204 character(len=*), intent(in) :: text
205 logical :: found
206
207 integer :: i, c, state, next_state, text_len
208
209 found = .false.
210 if (.not. ac%compiled) return
211
212 text_len = len(text)
213 state = 1 ! Start at root
214
215 do i = 1, text_len
216 if (ac%ignore_case) then
217 c = to_lower_code(ichar(text(i:i)))
218 else
219 c = ichar(text(i:i))
220 end if
221
222 ! Follow failure links until we find a transition or reach root
223 do while (state /= 1 .and. ac%nodes(state)%children(c) == 0)
224 state = ac%nodes(state)%failure
225 end do
226
227 next_state = ac%nodes(state)%children(c)
228 if (next_state /= 0) then
229 state = next_state
230 else
231 state = 1 ! Stay at root if no transition
232 end if
233
234 ! Check for match at current state or via output links
235 if (ac%nodes(state)%output_pattern /= 0) then
236 found = .true.
237 return
238 end if
239 if (ac%nodes(state)%output_link /= 0) then
240 found = .true.
241 return
242 end if
243 end do
244
245 contains
246
247 function to_lower_code(c) result(lc)
248 integer, intent(in) :: c
249 integer :: lc
250 if (c >= ichar('A') .and. c <= ichar('Z')) then
251 lc = c + 32
252 else
253 lc = c
254 end if
255 end function to_lower_code
256
257 end function ac_search_any
258
259 function ac_search(ac, text) result(match)
260 !> Search for first pattern match with position info
261 type(ac_automaton_t), intent(in) :: ac
262 character(len=*), intent(in) :: text
263 type(ac_match_t) :: match
264
265 integer :: i, c, state, next_state, text_len, pat_idx, out_state
266
267 match%matched = .false.
268 if (.not. ac%compiled) return
269
270 text_len = len(text)
271 state = 1 ! Start at root
272
273 do i = 1, text_len
274 if (ac%ignore_case) then
275 c = to_lower_code(ichar(text(i:i)))
276 else
277 c = ichar(text(i:i))
278 end if
279
280 ! Follow failure links until we find a transition or reach root
281 do while (state /= 1 .and. ac%nodes(state)%children(c) == 0)
282 state = ac%nodes(state)%failure
283 end do
284
285 next_state = ac%nodes(state)%children(c)
286 if (next_state /= 0) then
287 state = next_state
288 else
289 state = 1
290 end if
291
292 ! Check for match at current state
293 pat_idx = ac%nodes(state)%output_pattern
294 if (pat_idx /= 0) then
295 match%matched = .true.
296 match%pattern_idx = pat_idx
297 match%end_pos = i
298 match%start_pos = i - ac%pattern_lengths(pat_idx) + 1
299 return
300 end if
301
302 ! Check output links for overlapping patterns
303 out_state = ac%nodes(state)%output_link
304 if (out_state /= 0) then
305 pat_idx = ac%nodes(out_state)%output_pattern
306 if (pat_idx /= 0) then
307 match%matched = .true.
308 match%pattern_idx = pat_idx
309 match%end_pos = i
310 match%start_pos = i - ac%pattern_lengths(pat_idx) + 1
311 return
312 end if
313 end if
314 end do
315
316 contains
317
318 function to_lower_code(c) result(lc)
319 integer, intent(in) :: c
320 integer :: lc
321 if (c >= ichar('A') .and. c <= ichar('Z')) then
322 lc = c + 32
323 else
324 lc = c
325 end if
326 end function to_lower_code
327
328 end function ac_search
329
330 subroutine ac_free(ac)
331 !> Free automaton resources
332 type(ac_automaton_t), intent(inout) :: ac
333
334 if (allocated(ac%nodes)) deallocate(ac%nodes)
335 if (allocated(ac%pattern_lengths)) deallocate(ac%pattern_lengths)
336 ac%num_nodes = 0
337 ac%capacity = 0
338 ac%num_patterns = 0
339 ac%compiled = .false.
340 end subroutine ac_free
341
342 end module aho_corasick
343