Fortran · 10797 bytes Raw Blame History
1 module regex_nfa
2 !> Thompson NFA construction from AST
3 !> Implements the classic Thompson construction algorithm
4 use regex_types
5 use regex_charclass
6 use regex_parser, only: ast_pool_t
7 implicit none
8 private
9
10 public :: build_nfa
11
12 ! Fragment type for Thompson construction
13 type :: fragment_t
14 integer :: start_state = 0 ! Start state index
15 integer :: accept_state = 0 ! Accept state index
16 end type fragment_t
17
18 contains
19
20 subroutine build_nfa(pool, root_idx, nfa, ierr)
21 !> Build NFA from AST
22 type(ast_pool_t), intent(in) :: pool
23 integer, intent(in) :: root_idx
24 type(nfa_t), intent(out) :: nfa
25 integer, intent(out) :: ierr
26
27 type(fragment_t) :: frag
28
29 ierr = 0
30 call nfa%init()
31
32 if (root_idx == 0) then
33 ! Empty pattern - create NFA that matches empty string
34 nfa%start_state = nfa%add_state()
35 nfa%accept_state = nfa%add_state()
36 nfa%states(nfa%accept_state)%is_accept = .true.
37 call add_epsilon(nfa, nfa%start_state, nfa%accept_state)
38 return
39 end if
40
41 ! Build NFA from AST
42 frag = build_fragment(pool, root_idx, nfa, ierr)
43 if (ierr /= 0) return
44
45 nfa%start_state = frag%start_state
46 nfa%accept_state = frag%accept_state
47 nfa%states(frag%accept_state)%is_accept = .true.
48
49 end subroutine build_nfa
50
51 recursive function build_fragment(pool, node_idx, nfa, ierr) result(frag)
52 !> Build NFA fragment for an AST node
53 type(ast_pool_t), intent(in) :: pool
54 integer, intent(in) :: node_idx
55 type(nfa_t), intent(inout) :: nfa
56 integer, intent(out) :: ierr
57 type(fragment_t) :: frag
58
59 type(ast_node_t) :: node
60 type(fragment_t) :: left_frag, right_frag, child_frag
61 type(nfa_transition_t) :: trans
62 integer :: s1, s2
63
64 ierr = 0
65 frag%start_state = 0
66 frag%accept_state = 0
67
68 if (node_idx == 0 .or. node_idx > pool%count) then
69 ierr = 1
70 return
71 end if
72
73 node = pool%nodes(node_idx)
74
75 select case (node%ntype)
76
77 case (AST_LITERAL)
78 ! Literal: (s1) --c--> (s2)
79 s1 = nfa%add_state()
80 s2 = nfa%add_state()
81
82 if (node%char_val == char(0)) then
83 ! Empty literal - epsilon transition
84 call add_epsilon(nfa, s1, s2)
85 else
86 trans%trans_type = TRANS_CHAR
87 trans%match_char = node%char_val
88 trans%target = s2
89 call nfa%states(s1)%add_trans(trans)
90 end if
91
92 frag%start_state = s1
93 frag%accept_state = s2
94
95 case (AST_DOT)
96 ! Dot: (s1) --any--> (s2)
97 s1 = nfa%add_state()
98 s2 = nfa%add_state()
99
100 trans%trans_type = TRANS_ANY
101 trans%target = s2
102 call nfa%states(s1)%add_trans(trans)
103
104 frag%start_state = s1
105 frag%accept_state = s2
106
107 case (AST_CHAR_CLASS)
108 ! Character class: (s1) --class--> (s2)
109 s1 = nfa%add_state()
110 s2 = nfa%add_state()
111
112 trans%trans_type = TRANS_CLASS
113 trans%char_class = node%char_class
114 trans%negated = node%negated
115 ! Pre-compute bitwise character class for fast matching
116 call charclass_from_array(trans%char_bits, node%char_class, node%negated)
117 trans%target = s2
118 call nfa%states(s1)%add_trans(trans)
119
120 frag%start_state = s1
121 frag%accept_state = s2
122
123 case (AST_ANCHOR)
124 ! Anchor: (s1) --anchor--> (s2) (zero-width)
125 s1 = nfa%add_state()
126 s2 = nfa%add_state()
127
128 trans%trans_type = TRANS_ANCHOR
129 trans%anchor_type = node%anchor_type
130 trans%target = s2
131 call nfa%states(s1)%add_trans(trans)
132
133 frag%start_state = s1
134 frag%accept_state = s2
135
136 case (AST_CONCAT)
137 ! Concatenation: connect left accept to right start
138 left_frag = build_fragment(pool, node%left, nfa, ierr)
139 if (ierr /= 0) return
140 right_frag = build_fragment(pool, node%right, nfa, ierr)
141 if (ierr /= 0) return
142
143 ! Connect left accept to right start with epsilon
144 call add_epsilon(nfa, left_frag%accept_state, right_frag%start_state)
145
146 frag%start_state = left_frag%start_state
147 frag%accept_state = right_frag%accept_state
148
149 case (AST_ALTERNATE)
150 ! Alternation: new start with epsilon to both, both accept to new accept
151 ! e-->(left.s)-->(left.a)--e
152 ! / \
153 ! (s1)- ->(s2)
154 ! \ /
155 ! e-->(right.s)-->(right.a)--e
156
157 left_frag = build_fragment(pool, node%left, nfa, ierr)
158 if (ierr /= 0) return
159 right_frag = build_fragment(pool, node%right, nfa, ierr)
160 if (ierr /= 0) return
161
162 s1 = nfa%add_state()
163 s2 = nfa%add_state()
164
165 call add_epsilon(nfa, s1, left_frag%start_state)
166 call add_epsilon(nfa, s1, right_frag%start_state)
167 call add_epsilon(nfa, left_frag%accept_state, s2)
168 call add_epsilon(nfa, right_frag%accept_state, s2)
169
170 frag%start_state = s1
171 frag%accept_state = s2
172
173 case (AST_QUANTIFIER)
174 child_frag = build_fragment(pool, node%child, nfa, ierr)
175 if (ierr /= 0) return
176
177 if (node%min_rep == 0 .and. node%max_rep == -1) then
178 ! Star (*): zero or more
179 ! e-------->
180 ! / \
181 ! (s1)--e-->(c.s)-->(c.a)-->(s2)
182 ! \ /
183 ! <-e-
184
185 s1 = nfa%add_state()
186 s2 = nfa%add_state()
187
188 call add_epsilon(nfa, s1, child_frag%start_state)
189 call add_epsilon(nfa, s1, s2)
190 call add_epsilon(nfa, child_frag%accept_state, s2)
191 call add_epsilon(nfa, child_frag%accept_state, child_frag%start_state)
192
193 frag%start_state = s1
194 frag%accept_state = s2
195
196 else if (node%min_rep == 1 .and. node%max_rep == -1) then
197 ! Plus (+): one or more
198 ! (c.s)-->(c.a)-->(s2)
199 ! \ /
200 ! <-e-
201
202 s2 = nfa%add_state()
203
204 call add_epsilon(nfa, child_frag%accept_state, s2)
205 call add_epsilon(nfa, child_frag%accept_state, child_frag%start_state)
206
207 frag%start_state = child_frag%start_state
208 frag%accept_state = s2
209
210 else if (node%min_rep == 0 .and. node%max_rep == 1) then
211 ! Question (?): zero or one
212 ! e-------->
213 ! / \
214 ! (s1)--e-->(c.s)-->(c.a)-->(s2)
215
216 s1 = nfa%add_state()
217 s2 = nfa%add_state()
218
219 call add_epsilon(nfa, s1, child_frag%start_state)
220 call add_epsilon(nfa, s1, s2)
221 call add_epsilon(nfa, child_frag%accept_state, s2)
222
223 frag%start_state = s1
224 frag%accept_state = s2
225
226 else
227 ! Bounded quantifier {n,m}
228 ! For simplicity, unroll: min copies required, then (max-min) optional
229 call build_bounded_quantifier(pool, node, child_frag, nfa, frag, ierr)
230 if (ierr /= 0) return
231 end if
232
233 case (AST_GROUP)
234 ! Group: just build the child, mark states for group capture
235 child_frag = build_fragment(pool, node%child, nfa, ierr)
236 if (ierr /= 0) return
237
238 ! Mark group boundaries
239 nfa%states(child_frag%start_state)%group_start = node%group_num
240 nfa%states(child_frag%accept_state)%group_end = node%group_num
241 nfa%num_groups = max(nfa%num_groups, node%group_num)
242
243 frag = child_frag
244
245 case (AST_BACKREF)
246 ! Backreference - needs special handling in engine
247 ! For now, create placeholder states
248 s1 = nfa%add_state()
249 s2 = nfa%add_state()
250
251 ! Store backref info in transition
252 trans%trans_type = TRANS_EPSILON ! Will be handled specially
253 trans%anchor_type = -node%group_num ! Negative = backref
254 trans%target = s2
255 call nfa%states(s1)%add_trans(trans)
256
257 frag%start_state = s1
258 frag%accept_state = s2
259
260 case default
261 ierr = 1
262
263 end select
264
265 end function build_fragment
266
267 subroutine build_bounded_quantifier(pool, node, child_frag, nfa, frag, ierr)
268 !> Build NFA for {n,m} quantifier
269 type(ast_pool_t), intent(in) :: pool
270 type(ast_node_t), intent(in) :: node
271 type(fragment_t), intent(in) :: child_frag
272 type(nfa_t), intent(inout) :: nfa
273 type(fragment_t), intent(out) :: frag
274 integer, intent(out) :: ierr
275
276 integer :: i, min_rep, max_rep
277 type(fragment_t) :: copy_frag, prev_frag
278 integer :: s1, s2
279
280 ierr = 0
281 min_rep = node%min_rep
282 max_rep = node%max_rep
283
284 if (min_rep == 0 .and. max_rep == 0) then
285 ! {0} or {} - match empty
286 s1 = nfa%add_state()
287 s2 = nfa%add_state()
288 call add_epsilon(nfa, s1, s2)
289 frag%start_state = s1
290 frag%accept_state = s2
291 return
292 end if
293
294 ! Start with first copy (or empty if min=0)
295 if (min_rep == 0) then
296 s1 = nfa%add_state()
297 frag%start_state = s1
298 frag%accept_state = s1
299 else
300 ! First required copy
301 frag = child_frag
302
303 ! Additional required copies
304 do i = 2, min_rep
305 copy_frag = build_fragment(pool, node%child, nfa, ierr)
306 if (ierr /= 0) return
307 call add_epsilon(nfa, frag%accept_state, copy_frag%start_state)
308 frag%accept_state = copy_frag%accept_state
309 end do
310 end if
311
312 ! Optional copies
313 if (max_rep == -1) then
314 ! {n,} - unlimited
315 copy_frag = build_fragment(pool, node%child, nfa, ierr)
316 if (ierr /= 0) return
317
318 s2 = nfa%add_state()
319 call add_epsilon(nfa, frag%accept_state, copy_frag%start_state)
320 call add_epsilon(nfa, frag%accept_state, s2)
321 call add_epsilon(nfa, copy_frag%accept_state, s2)
322 call add_epsilon(nfa, copy_frag%accept_state, copy_frag%start_state)
323
324 frag%accept_state = s2
325
326 else if (max_rep > min_rep) then
327 ! {n,m} - limited optional copies
328 prev_frag = frag
329 s2 = nfa%add_state()
330
331 do i = min_rep + 1, max_rep
332 copy_frag = build_fragment(pool, node%child, nfa, ierr)
333 if (ierr /= 0) return
334
335 call add_epsilon(nfa, prev_frag%accept_state, copy_frag%start_state)
336 call add_epsilon(nfa, prev_frag%accept_state, s2)
337
338 prev_frag = copy_frag
339 end do
340
341 call add_epsilon(nfa, prev_frag%accept_state, s2)
342 frag%accept_state = s2
343 end if
344
345 end subroutine build_bounded_quantifier
346
347 subroutine add_epsilon(nfa, from_state, to_state)
348 !> Add epsilon transition from one state to another
349 type(nfa_t), intent(inout) :: nfa
350 integer, intent(in) :: from_state, to_state
351
352 type(nfa_transition_t) :: trans
353
354 trans%trans_type = TRANS_EPSILON
355 trans%target = to_state
356 call nfa%states(from_state)%add_trans(trans)
357
358 end subroutine add_epsilon
359
360 end module regex_nfa
361