Fortran · 11850 bytes Raw Blame History
1 module regex_parser
2 !> Regex parser - converts token stream to AST
3 !> Uses recursive descent parsing with proper precedence:
4 !> alternation < concatenation < quantifier < atom
5 use regex_types
6 implicit none
7 private
8
9 public :: parse, ast_pool_t
10
11 !> AST node pool - stores all nodes with indices
12 type :: ast_pool_t
13 type(ast_node_t), allocatable :: nodes(:)
14 integer :: count = 0
15 integer :: capacity = 0
16 contains
17 procedure :: init => pool_init
18 procedure :: alloc => pool_alloc
19 procedure :: get => pool_get
20 procedure :: cleanup => pool_cleanup
21 end type ast_pool_t
22
23 ! Module-level parsing state
24 type(token_list_t), pointer :: g_tokens => null()
25 integer :: g_pos = 0
26 integer :: g_group_num = 0
27 type(ast_pool_t), pointer :: g_pool => null()
28
29 contains
30
31 !---------------------------------------------------------------------------
32 ! AST Pool Methods
33 !---------------------------------------------------------------------------
34 subroutine pool_init(this, initial_capacity)
35 class(ast_pool_t), intent(inout) :: this
36 integer, intent(in), optional :: initial_capacity
37 integer :: cap
38
39 cap = 64
40 if (present(initial_capacity)) cap = initial_capacity
41
42 if (allocated(this%nodes)) deallocate(this%nodes)
43 allocate(this%nodes(cap))
44 this%count = 0
45 this%capacity = cap
46 end subroutine pool_init
47
48 function pool_alloc(this) result(idx)
49 class(ast_pool_t), intent(inout) :: this
50 integer :: idx
51 type(ast_node_t), allocatable :: temp(:)
52
53 if (.not. allocated(this%nodes)) call this%init()
54
55 if (this%count >= this%capacity) then
56 allocate(temp(this%capacity * 2))
57 temp(1:this%count) = this%nodes(1:this%count)
58 call move_alloc(temp, this%nodes)
59 this%capacity = this%capacity * 2
60 end if
61
62 this%count = this%count + 1
63 idx = this%count
64 this%nodes(idx) = ast_node_t() ! Initialize to default
65 end function pool_alloc
66
67 function pool_get(this, idx) result(node)
68 class(ast_pool_t), intent(in) :: this
69 integer, intent(in) :: idx
70 type(ast_node_t) :: node
71
72 if (idx >= 1 .and. idx <= this%count) then
73 node = this%nodes(idx)
74 else
75 node = ast_node_t()
76 end if
77 end function pool_get
78
79 subroutine pool_cleanup(this)
80 class(ast_pool_t), intent(inout) :: this
81 if (allocated(this%nodes)) deallocate(this%nodes)
82 this%count = 0
83 this%capacity = 0
84 end subroutine pool_cleanup
85
86 !---------------------------------------------------------------------------
87 ! Main Parse Entry Point
88 !---------------------------------------------------------------------------
89 subroutine parse(tokens, pool, root_idx, num_groups, ierr)
90 !> Parse tokens into an AST stored in pool
91 type(token_list_t), target, intent(in) :: tokens
92 type(ast_pool_t), target, intent(inout) :: pool
93 integer, intent(out) :: root_idx
94 integer, intent(out) :: num_groups
95 integer, intent(out) :: ierr
96
97 ierr = 0
98 root_idx = 0
99
100 ! Set up global state
101 g_tokens => tokens
102 g_pos = 1
103 g_group_num = 0
104 g_pool => pool
105
106 call pool%init()
107
108 ! Parse alternation (lowest precedence)
109 root_idx = parse_alternation(ierr)
110 if (ierr /= 0) return
111
112 num_groups = g_group_num
113
114 ! Verify we consumed all tokens
115 if (current_type() /= TOK_END) then
116 ierr = 1
117 end if
118
119 end subroutine parse
120
121 !---------------------------------------------------------------------------
122 ! Token Access Helpers
123 !---------------------------------------------------------------------------
124 function current_token() result(tok)
125 type(token_t) :: tok
126 tok = g_tokens%get(g_pos)
127 end function current_token
128
129 function current_type() result(ttype)
130 integer :: ttype
131 type(token_t) :: tok
132 tok = g_tokens%get(g_pos)
133 ttype = tok%ttype
134 end function current_type
135
136 subroutine advance()
137 g_pos = g_pos + 1
138 end subroutine advance
139
140 !---------------------------------------------------------------------------
141 ! Recursive Descent Parser
142 !---------------------------------------------------------------------------
143
144 recursive function parse_alternation(ierr) result(idx)
145 !> Parse: alternation = concat ('|' concat)*
146 integer, intent(out) :: ierr
147 integer :: idx
148
149 integer :: left, right, alt_idx
150
151 ierr = 0
152 idx = 0
153 left = parse_concat(ierr)
154 if (ierr /= 0) return
155
156 idx = left
157
158 do while (current_type() == TOK_PIPE)
159 call advance() ! consume |
160
161 right = parse_concat(ierr)
162 if (ierr /= 0) return
163
164 ! Create alternation node
165 alt_idx = g_pool%alloc()
166 g_pool%nodes(alt_idx)%ntype = AST_ALTERNATE
167 g_pool%nodes(alt_idx)%left = idx
168 g_pool%nodes(alt_idx)%right = right
169
170 idx = alt_idx
171 end do
172
173 end function parse_alternation
174
175 recursive function parse_concat(ierr) result(idx)
176 !> Parse: concat = quantified+
177 integer, intent(out) :: ierr
178 integer :: idx
179
180 integer :: left, right, concat_idx, ttype
181
182 ierr = 0
183 idx = 0
184
185 ! Check if we have anything to parse
186 ttype = current_type()
187 if (ttype == TOK_END .or. ttype == TOK_PIPE .or. ttype == TOK_RPAREN) then
188 ! Empty - create empty literal (matches empty string)
189 idx = g_pool%alloc()
190 g_pool%nodes(idx)%ntype = AST_LITERAL
191 g_pool%nodes(idx)%char_val = char(0) ! Special empty marker
192 return
193 end if
194
195 ! Parse first quantified expression
196 left = parse_quantified(ierr)
197 if (ierr /= 0) return
198
199 idx = left
200
201 ! Parse remaining quantified expressions
202 do
203 ttype = current_type()
204 if (ttype == TOK_END .or. ttype == TOK_PIPE .or. ttype == TOK_RPAREN) exit
205
206 right = parse_quantified(ierr)
207 if (ierr /= 0) return
208
209 ! Create concatenation node
210 concat_idx = g_pool%alloc()
211 g_pool%nodes(concat_idx)%ntype = AST_CONCAT
212 g_pool%nodes(concat_idx)%left = idx
213 g_pool%nodes(concat_idx)%right = right
214
215 idx = concat_idx
216 end do
217
218 end function parse_concat
219
220 recursive function parse_quantified(ierr) result(idx)
221 !> Parse: quantified = atom quantifier?
222 integer, intent(out) :: ierr
223 integer :: idx
224
225 integer :: atom_idx, quant_idx, ttype
226 integer :: min_rep, max_rep
227
228 ierr = 0
229 idx = 0
230
231 ! Parse the atom
232 atom_idx = parse_atom(ierr)
233 if (ierr /= 0) return
234
235 idx = atom_idx
236
237 ! Check for quantifier
238 ttype = current_type()
239 select case (ttype)
240 case (TOK_STAR)
241 min_rep = 0
242 max_rep = -1 ! unlimited
243 call advance()
244
245 case (TOK_PLUS)
246 min_rep = 1
247 max_rep = -1
248 call advance()
249
250 case (TOK_QUESTION)
251 min_rep = 0
252 max_rep = 1
253 call advance()
254
255 case (TOK_LBRACE)
256 call parse_brace_quantifier(min_rep, max_rep, ierr)
257 if (ierr /= 0) return
258
259 case default
260 ! No quantifier
261 return
262 end select
263
264 ! Create quantifier node
265 quant_idx = g_pool%alloc()
266 g_pool%nodes(quant_idx)%ntype = AST_QUANTIFIER
267 g_pool%nodes(quant_idx)%child = atom_idx
268 g_pool%nodes(quant_idx)%min_rep = min_rep
269 g_pool%nodes(quant_idx)%max_rep = max_rep
270
271 idx = quant_idx
272
273 end function parse_quantified
274
275 subroutine parse_brace_quantifier(min_rep, max_rep, ierr)
276 !> Parse {n}, {n,}, or {n,m}
277 integer, intent(out) :: min_rep, max_rep, ierr
278
279 type(token_t) :: tok
280 integer :: ttype, num
281
282 ierr = 0
283 min_rep = 0
284 max_rep = 0
285
286 call advance() ! consume {
287
288 ! Expect a number or }
289 ttype = current_type()
290 if (ttype == TOK_RBRACE) then
291 ! {} - treat as {0,} (match zero or more)
292 call advance()
293 max_rep = -1
294 return
295 end if
296
297 if (ttype /= TOK_LITERAL) then
298 ierr = 1
299 return
300 end if
301
302 ! Parse first number
303 tok = current_token()
304 if (.not. is_digit(tok%char_val)) then
305 ierr = 1
306 return
307 end if
308
309 num = 0
310 do while (current_type() == TOK_LITERAL)
311 tok = current_token()
312 if (.not. is_digit(tok%char_val)) exit
313 num = num * 10 + (ichar(tok%char_val) - ichar('0'))
314 call advance()
315 end do
316 min_rep = num
317
318 ttype = current_type()
319 if (ttype == TOK_RBRACE) then
320 ! {n} - exact count
321 call advance()
322 max_rep = min_rep
323 return
324 end if
325
326 ! Expect comma
327 tok = current_token()
328 if (ttype /= TOK_LITERAL .or. tok%char_val /= ',') then
329 ierr = 1
330 return
331 end if
332 call advance()
333
334 ttype = current_type()
335 if (ttype == TOK_RBRACE) then
336 ! {n,} - at least n
337 call advance()
338 max_rep = -1
339 return
340 end if
341
342 ! Parse second number
343 if (ttype /= TOK_LITERAL) then
344 ierr = 1
345 return
346 end if
347
348 num = 0
349 do while (current_type() == TOK_LITERAL)
350 tok = current_token()
351 if (.not. is_digit(tok%char_val)) exit
352 num = num * 10 + (ichar(tok%char_val) - ichar('0'))
353 call advance()
354 end do
355 max_rep = num
356
357 ! Expect }
358 if (current_type() /= TOK_RBRACE) then
359 ierr = 1
360 return
361 end if
362 call advance()
363
364 end subroutine parse_brace_quantifier
365
366 recursive function parse_atom(ierr) result(idx)
367 !> Parse: atom = literal | '.' | '[' class ']' | '(' regex ')' | anchor | backref
368 integer, intent(out) :: ierr
369 integer :: idx
370
371 type(token_t) :: tok
372 integer :: ttype, group_idx
373
374 ierr = 0
375 idx = 0
376
377 tok = current_token()
378 ttype = tok%ttype
379
380 select case (ttype)
381 case (TOK_LITERAL)
382 idx = g_pool%alloc()
383 g_pool%nodes(idx)%ntype = AST_LITERAL
384 g_pool%nodes(idx)%char_val = tok%char_val
385 call advance()
386
387 case (TOK_DOT)
388 idx = g_pool%alloc()
389 g_pool%nodes(idx)%ntype = AST_DOT
390 call advance()
391
392 case (TOK_LBRACKET)
393 ! Character class (already parsed by lexer)
394 idx = g_pool%alloc()
395 g_pool%nodes(idx)%ntype = AST_CHAR_CLASS
396 g_pool%nodes(idx)%char_class = tok%char_class
397 g_pool%nodes(idx)%negated = tok%negated
398 call advance()
399
400 case (TOK_LPAREN)
401 call advance() ! consume (
402
403 g_group_num = g_group_num + 1
404 group_idx = g_group_num
405
406 ! Parse inner expression
407 idx = parse_alternation(ierr)
408 if (ierr /= 0) return
409
410 ! Expect )
411 if (current_type() /= TOK_RPAREN) then
412 ierr = 1
413 return
414 end if
415 call advance()
416
417 ! Wrap in group node
418 group_idx = g_pool%alloc()
419 g_pool%nodes(group_idx)%ntype = AST_GROUP
420 g_pool%nodes(group_idx)%child = idx
421 g_pool%nodes(group_idx)%group_num = g_group_num
422
423 idx = group_idx
424
425 case (TOK_CARET)
426 idx = g_pool%alloc()
427 g_pool%nodes(idx)%ntype = AST_ANCHOR
428 g_pool%nodes(idx)%anchor_type = 1 ! start
429 call advance()
430
431 case (TOK_DOLLAR)
432 idx = g_pool%alloc()
433 g_pool%nodes(idx)%ntype = AST_ANCHOR
434 g_pool%nodes(idx)%anchor_type = 2 ! end
435 call advance()
436
437 case (TOK_WORD_BOUNDARY)
438 idx = g_pool%alloc()
439 g_pool%nodes(idx)%ntype = AST_ANCHOR
440 g_pool%nodes(idx)%anchor_type = tok%int_val + 2 ! 3=word_start, 4=word_end, 5=boundary
441 call advance()
442
443 case (TOK_BACKREF)
444 idx = g_pool%alloc()
445 g_pool%nodes(idx)%ntype = AST_BACKREF
446 g_pool%nodes(idx)%group_num = tok%int_val
447 call advance()
448
449 case default
450 ! Unexpected token - error
451 ierr = 1
452 end select
453
454 end function parse_atom
455
456 !---------------------------------------------------------------------------
457 ! Utility Functions
458 !---------------------------------------------------------------------------
459 pure function is_digit(c) result(res)
460 character(len=1), intent(in) :: c
461 logical :: res
462 res = (c >= '0' .and. c <= '9')
463 end function is_digit
464
465 end module regex_parser
466