Fortran · 12043 bytes Raw Blame History
1 module regex_engine
2 !> NFA simulation engine using Thompson's algorithm
3 !> Tracks set of current states, processes input character by character
4 use regex_types
5 implicit none
6 private
7
8 public :: nfa_match, nfa_search
9
10 integer, parameter :: MAX_STATES = 1024
11
12 contains
13
14 function nfa_match(nfa, text, start_pos, ignore_case) result(res)
15 !> Try to match NFA starting at start_pos in text
16 !> Returns match result with positions if successful
17 type(nfa_t), intent(in) :: nfa
18 character(len=*), intent(in) :: text
19 integer, intent(in) :: start_pos
20 logical, intent(in) :: ignore_case
21 type(match_result_t) :: res
22
23 integer :: current(MAX_STATES), next_states(MAX_STATES)
24 integer :: num_current, num_next
25 integer :: i, pos, text_len
26 character(len=1) :: c
27
28 res%matched = .false.
29 text_len = len_trim(text)
30
31 if (nfa%num_states == 0) return
32
33 ! Initialize with epsilon closure of start state
34 num_current = 0
35 call epsilon_closure(nfa, nfa%start_state, current, num_current, &
36 text, start_pos, text_len)
37
38 ! Check if already accepting (empty pattern case)
39 if (is_accepting(nfa, current, num_current)) then
40 res%matched = .true.
41 res%match_start = start_pos
42 res%match_end = start_pos - 1 ! Empty match
43 return
44 end if
45
46 ! Process each character
47 pos = start_pos
48 do while (pos <= text_len .and. num_current > 0)
49 c = text(pos:pos)
50
51 ! Compute next states
52 num_next = 0
53 do i = 1, num_current
54 call step(nfa, current(i), c, pos, text, text_len, ignore_case, &
55 next_states, num_next)
56 end do
57
58 ! Compute epsilon closure of next states
59 num_current = 0
60 do i = 1, num_next
61 call epsilon_closure(nfa, next_states(i), current, num_current, &
62 text, pos + 1, text_len)
63 end do
64
65 pos = pos + 1
66
67 ! Check for acceptance
68 if (is_accepting(nfa, current, num_current)) then
69 res%matched = .true.
70 res%match_start = start_pos
71 res%match_end = pos - 1
72 ! Don't return yet - try to find longest match (greedy)
73 end if
74 end do
75
76 end function nfa_match
77
78 function nfa_search(nfa, text, ignore_case) result(res)
79 !> Search for pattern anywhere in text
80 type(nfa_t), intent(in) :: nfa
81 character(len=*), intent(in) :: text
82 logical, intent(in) :: ignore_case
83 type(match_result_t) :: res
84
85 integer :: i, text_len
86 type(match_result_t) :: try_res
87
88 res%matched = .false.
89 text_len = len_trim(text)
90
91 ! Try matching at each position
92 do i = 1, text_len + 1 ! +1 to handle empty string match at end
93 try_res = nfa_match(nfa, text, i, ignore_case)
94 if (try_res%matched) then
95 res = try_res
96 return
97 end if
98 end do
99
100 end function nfa_search
101
102 subroutine epsilon_closure(nfa, state, states, num_states, text, pos, text_len)
103 !> Compute epsilon closure of a state, adding to states array
104 type(nfa_t), intent(in) :: nfa
105 integer, intent(in) :: state
106 integer, intent(inout) :: states(MAX_STATES)
107 integer, intent(inout) :: num_states
108 character(len=*), intent(in) :: text
109 integer, intent(in) :: pos, text_len
110
111 integer :: stack(MAX_STATES), stack_top
112 integer :: current, i, target
113 type(nfa_transition_t) :: trans
114 logical :: visited(MAX_STATES)
115
116 visited = .false.
117 stack_top = 1
118 stack(1) = state
119
120 do while (stack_top > 0)
121 current = stack(stack_top)
122 stack_top = stack_top - 1
123
124 if (current < 1 .or. current > nfa%num_states) cycle
125 if (visited(current)) cycle
126 visited(current) = .true.
127
128 ! Add to result
129 if (num_states < MAX_STATES) then
130 num_states = num_states + 1
131 states(num_states) = current
132 end if
133
134 ! Follow epsilon transitions and anchors
135 do i = 1, nfa%states(current)%num_trans
136 trans = nfa%states(current)%trans(i)
137
138 if (trans%trans_type == TRANS_EPSILON) then
139 ! Check for backref marker (negative anchor_type)
140 if (trans%anchor_type < 0) then
141 ! Backref - skip for now (would need captured text)
142 cycle
143 end if
144
145 target = trans%target
146 if (target >= 1 .and. target <= nfa%num_states .and. .not. visited(target)) then
147 stack_top = stack_top + 1
148 stack(stack_top) = target
149 end if
150
151 else if (trans%trans_type == TRANS_ANCHOR) then
152 ! Check if anchor matches
153 if (anchor_matches(trans%anchor_type, text, pos, text_len)) then
154 target = trans%target
155 if (target >= 1 .and. target <= nfa%num_states .and. .not. visited(target)) then
156 stack_top = stack_top + 1
157 stack(stack_top) = target
158 end if
159 end if
160 end if
161 end do
162 end do
163
164 end subroutine epsilon_closure
165
166 subroutine step(nfa, state, c, pos, text, text_len, ignore_case, next_states, num_next)
167 !> Take one step from state on character c
168 type(nfa_t), intent(in) :: nfa
169 integer, intent(in) :: state
170 character(len=1), intent(in) :: c
171 integer, intent(in) :: pos, text_len
172 character(len=*), intent(in) :: text
173 logical, intent(in) :: ignore_case
174 integer, intent(inout) :: next_states(MAX_STATES)
175 integer, intent(inout) :: num_next
176
177 integer :: i
178 type(nfa_transition_t) :: trans
179 character(len=1) :: c_lower, match_lower
180
181 do i = 1, nfa%states(state)%num_trans
182 trans = nfa%states(state)%trans(i)
183
184 select case (trans%trans_type)
185 case (TRANS_CHAR)
186 if (ignore_case) then
187 c_lower = to_lower_char(c)
188 match_lower = to_lower_char(trans%match_char)
189 if (c_lower == match_lower) then
190 call add_if_unique(next_states, num_next, trans%target)
191 end if
192 else
193 if (c == trans%match_char) then
194 call add_if_unique(next_states, num_next, trans%target)
195 end if
196 end if
197
198 case (TRANS_CLASS)
199 if (char_in_class(c, trans%char_class, trans%negated, ignore_case)) then
200 call add_if_unique(next_states, num_next, trans%target)
201 end if
202
203 case (TRANS_ANY)
204 ! Match any character except newline
205 if (c /= char(10)) then
206 call add_if_unique(next_states, num_next, trans%target)
207 end if
208
209 end select
210 end do
211
212 end subroutine step
213
214 function is_accepting(nfa, states, num_states) result(res)
215 !> Check if any state in the set is an accepting state
216 type(nfa_t), intent(in) :: nfa
217 integer, intent(in) :: states(MAX_STATES)
218 integer, intent(in) :: num_states
219 logical :: res
220
221 integer :: i
222
223 res = .false.
224 do i = 1, num_states
225 if (states(i) >= 1 .and. states(i) <= nfa%num_states) then
226 if (nfa%states(states(i))%is_accept) then
227 res = .true.
228 return
229 end if
230 end if
231 end do
232
233 end function is_accepting
234
235 function anchor_matches(anchor_type, text, pos, text_len) result(matches)
236 !> Check if anchor matches at position
237 integer, intent(in) :: anchor_type
238 character(len=*), intent(in) :: text
239 integer, intent(in) :: pos, text_len
240 logical :: matches
241
242 logical :: at_start, at_end, prev_word, curr_word
243
244 matches = .false.
245
246 at_start = (pos == 1) .or. (pos < 1)
247 at_end = (pos > text_len)
248
249 select case (anchor_type)
250 case (1) ! ^ start anchor
251 if (at_start) then
252 matches = .true.
253 else if (pos > 1 .and. pos <= text_len + 1) then
254 matches = (text(pos-1:pos-1) == char(10))
255 end if
256
257 case (2) ! $ end anchor
258 if (at_end) then
259 matches = .true.
260 else if (pos >= 1 .and. pos <= text_len) then
261 matches = (text(pos:pos) == char(10))
262 end if
263
264 case (3) ! \< word start
265 prev_word = .false.
266 curr_word = .false.
267 if (pos > 1 .and. pos <= text_len + 1) prev_word = is_word_char(text(pos-1:pos-1))
268 if (pos >= 1 .and. pos <= text_len) curr_word = is_word_char(text(pos:pos))
269 matches = (.not. prev_word) .and. curr_word
270
271 case (4) ! \> word end
272 prev_word = .false.
273 curr_word = .false.
274 if (pos > 1 .and. pos <= text_len + 1) prev_word = is_word_char(text(pos-1:pos-1))
275 if (pos >= 1 .and. pos <= text_len) curr_word = is_word_char(text(pos:pos))
276 matches = prev_word .and. (.not. curr_word)
277
278 case (5) ! \b word boundary
279 prev_word = .false.
280 curr_word = .false.
281 if (pos > 1 .and. pos <= text_len + 1) prev_word = is_word_char(text(pos-1:pos-1))
282 if (pos >= 1 .and. pos <= text_len) curr_word = is_word_char(text(pos:pos))
283 matches = prev_word .neqv. curr_word
284
285 case (6) ! \B not word boundary
286 prev_word = .false.
287 curr_word = .false.
288 if (pos > 1 .and. pos <= text_len + 1) prev_word = is_word_char(text(pos-1:pos-1))
289 if (pos >= 1 .and. pos <= text_len) curr_word = is_word_char(text(pos:pos))
290 matches = prev_word .eqv. curr_word
291 end select
292
293 end function anchor_matches
294
295 pure function is_word_char(c) result(res)
296 character(len=1), intent(in) :: c
297 logical :: res
298 integer :: ic
299
300 ic = ichar(c)
301 res = (ic >= ichar('a') .and. ic <= ichar('z')) .or. &
302 (ic >= ichar('A') .and. ic <= ichar('Z')) .or. &
303 (ic >= ichar('0') .and. ic <= ichar('9')) .or. &
304 (c == '_')
305 end function is_word_char
306
307 pure function to_lower_char(c) result(lower)
308 character(len=1), intent(in) :: c
309 character(len=1) :: lower
310 integer :: ic
311
312 ic = ichar(c)
313 if (ic >= ichar('A') .and. ic <= ichar('Z')) then
314 ! ASCII uppercase A-Z -> a-z
315 lower = char(ic + 32)
316 else if (ic >= 192 .and. ic <= 214) then
317 ! Latin-1 uppercase À-Ö (192-214) -> à-ö (224-246)
318 lower = char(ic + 32)
319 else if (ic >= 216 .and. ic <= 222) then
320 ! Latin-1 uppercase Ø-Þ (216-222) -> ø-þ (248-254)
321 lower = char(ic + 32)
322 else if (ic >= 128 .and. ic <= 150) then
323 ! UTF-8 continuation byte for uppercase Latin Extended-A (U+00C0-U+00D6)
324 ! When preceded by 0xC3, these represent À-Ö, fold to à-ö
325 lower = char(ic + 32)
326 else if (ic >= 152 .and. ic <= 158) then
327 ! UTF-8 continuation byte for uppercase Latin Extended-A (U+00D8-U+00DE)
328 ! When preceded by 0xC3, these represent Ø-Þ, fold to ø-þ
329 lower = char(ic + 32)
330 else
331 lower = c
332 end if
333 end function to_lower_char
334
335 function char_in_class(c, char_class, negated, ignore_case) result(res)
336 character(len=1), intent(in) :: c
337 logical, intent(in) :: char_class(0:255)
338 logical, intent(in) :: negated, ignore_case
339 logical :: res
340
341 integer :: ic
342 character(len=1) :: c_lower, c_upper
343
344 ic = ichar(c)
345 res = char_class(ic)
346
347 ! For case-insensitive, check both cases
348 if (ignore_case .and. .not. res) then
349 c_lower = to_lower_char(c)
350 c_upper = to_upper_char(c)
351 if (c_lower /= c) res = char_class(ichar(c_lower))
352 if (.not. res .and. c_upper /= c) res = char_class(ichar(c_upper))
353 end if
354
355 if (negated) res = .not. res
356
357 end function char_in_class
358
359 pure function to_upper_char(c) result(upper)
360 character(len=1), intent(in) :: c
361 character(len=1) :: upper
362 integer :: ic
363
364 ic = ichar(c)
365 if (ic >= ichar('a') .and. ic <= ichar('z')) then
366 upper = char(ic - 32)
367 else
368 upper = c
369 end if
370 end function to_upper_char
371
372 subroutine add_if_unique(arr, num, val)
373 integer, intent(inout) :: arr(MAX_STATES)
374 integer, intent(inout) :: num
375 integer, intent(in) :: val
376
377 integer :: i
378
379 ! Check if already present
380 do i = 1, num
381 if (arr(i) == val) return
382 end do
383
384 ! Add if space available
385 if (num < MAX_STATES) then
386 num = num + 1
387 arr(num) = val
388 end if
389
390 end subroutine add_if_unique
391
392 end module regex_engine
393