Fortran · 13562 bytes Raw Blame History
1 module regex_lexer
2 !> Regex pattern tokenizer for FERP
3 !> Handles both BRE (Basic) and ERE (Extended) regex dialects
4 use regex_types
5 use ferp_kinds, only: pattern_len
6 implicit none
7 private
8
9 public :: tokenize
10
11 contains
12
13 subroutine tokenize(pattern, tokens, is_ere, ierr)
14 !> Tokenize a regex pattern
15 character(len=*), intent(in) :: pattern
16 type(token_list_t), intent(out) :: tokens
17 logical, intent(in) :: is_ere ! True for ERE, false for BRE
18 integer, intent(out) :: ierr
19
20 integer :: i, n
21 character(len=1) :: c, next_c
22 type(token_t) :: tok
23 logical :: in_bracket
24
25 ierr = 0
26 call tokens%init()
27 n = pattern_len(pattern) ! Use pattern_len to preserve whitespace patterns
28 i = 1
29 in_bracket = .false.
30
31 do while (i <= n)
32 c = pattern(i:i)
33 next_c = ' '
34 if (i < n) next_c = pattern(i+1:i+1)
35
36 tok = token_t()
37 tok%pos = i
38
39 ! Handle character class separately
40 if (c == '[') then
41 call parse_char_class(pattern, i, n, tok, ierr)
42 if (ierr /= 0) return
43 call tokens%append(tok)
44 cycle
45 end if
46
47 ! Handle escape sequences
48 if (c == '\') then
49 if (i >= n) then
50 ! Trailing backslash - treat as literal
51 tok%ttype = TOK_LITERAL
52 tok%char_val = '\'
53 i = i + 1
54 else
55 call parse_escape(pattern, i, n, tok, is_ere, ierr)
56 if (ierr /= 0) return
57 end if
58 call tokens%append(tok)
59 cycle
60 end if
61
62 ! Handle metacharacters based on BRE vs ERE
63 select case (c)
64 case ('.')
65 tok%ttype = TOK_DOT
66 i = i + 1
67
68 case ('*')
69 tok%ttype = TOK_STAR
70 i = i + 1
71
72 case ('^')
73 tok%ttype = TOK_CARET
74 i = i + 1
75
76 case ('$')
77 tok%ttype = TOK_DOLLAR
78 i = i + 1
79
80 case ('+')
81 if (is_ere) then
82 tok%ttype = TOK_PLUS
83 else
84 tok%ttype = TOK_LITERAL
85 tok%char_val = '+'
86 end if
87 i = i + 1
88
89 case ('?')
90 if (is_ere) then
91 tok%ttype = TOK_QUESTION
92 else
93 tok%ttype = TOK_LITERAL
94 tok%char_val = '?'
95 end if
96 i = i + 1
97
98 case ('|')
99 if (is_ere) then
100 tok%ttype = TOK_PIPE
101 else
102 tok%ttype = TOK_LITERAL
103 tok%char_val = '|'
104 end if
105 i = i + 1
106
107 case ('(')
108 if (is_ere) then
109 tok%ttype = TOK_LPAREN
110 else
111 tok%ttype = TOK_LITERAL
112 tok%char_val = '('
113 end if
114 i = i + 1
115
116 case (')')
117 if (is_ere) then
118 tok%ttype = TOK_RPAREN
119 else
120 tok%ttype = TOK_LITERAL
121 tok%char_val = ')'
122 end if
123 i = i + 1
124
125 case ('{')
126 if (is_ere) then
127 tok%ttype = TOK_LBRACE
128 else
129 tok%ttype = TOK_LITERAL
130 tok%char_val = '{'
131 end if
132 i = i + 1
133
134 case ('}')
135 if (is_ere) then
136 tok%ttype = TOK_RBRACE
137 else
138 tok%ttype = TOK_LITERAL
139 tok%char_val = '}'
140 end if
141 i = i + 1
142
143 case default
144 ! Literal character
145 tok%ttype = TOK_LITERAL
146 tok%char_val = c
147 i = i + 1
148 end select
149
150 call tokens%append(tok)
151 end do
152
153 ! Add end token
154 tok = token_t()
155 tok%ttype = TOK_END
156 tok%pos = n + 1
157 call tokens%append(tok)
158
159 end subroutine tokenize
160
161 subroutine parse_escape(pattern, pos, n, tok, is_ere, ierr)
162 !> Parse an escape sequence starting at pos (which points to \)
163 character(len=*), intent(in) :: pattern
164 integer, intent(inout) :: pos
165 integer, intent(in) :: n
166 type(token_t), intent(out) :: tok
167 logical, intent(in) :: is_ere
168 integer, intent(out) :: ierr
169
170 character(len=1) :: c
171 integer :: ref_num
172
173 ierr = 0
174 pos = pos + 1 ! Skip the backslash
175 c = pattern(pos:pos)
176
177 select case (c)
178 ! BRE special escapes (become metacharacters)
179 case ('(')
180 if (is_ere) then
181 tok%ttype = TOK_LITERAL
182 tok%char_val = '('
183 else
184 tok%ttype = TOK_LPAREN
185 end if
186 pos = pos + 1
187
188 case (')')
189 if (is_ere) then
190 tok%ttype = TOK_LITERAL
191 tok%char_val = ')'
192 else
193 tok%ttype = TOK_RPAREN
194 end if
195 pos = pos + 1
196
197 case ('{')
198 if (is_ere) then
199 tok%ttype = TOK_LITERAL
200 tok%char_val = '{'
201 else
202 tok%ttype = TOK_LBRACE
203 end if
204 pos = pos + 1
205
206 case ('}')
207 if (is_ere) then
208 tok%ttype = TOK_LITERAL
209 tok%char_val = '}'
210 else
211 tok%ttype = TOK_RBRACE
212 end if
213 pos = pos + 1
214
215 ! GNU extensions for BRE (also work in ERE)
216 case ('+')
217 if (.not. is_ere) then
218 tok%ttype = TOK_PLUS
219 else
220 tok%ttype = TOK_LITERAL
221 tok%char_val = '+'
222 end if
223 pos = pos + 1
224
225 case ('?')
226 if (.not. is_ere) then
227 tok%ttype = TOK_QUESTION
228 else
229 tok%ttype = TOK_LITERAL
230 tok%char_val = '?'
231 end if
232 pos = pos + 1
233
234 case ('|')
235 if (.not. is_ere) then
236 tok%ttype = TOK_PIPE
237 else
238 tok%ttype = TOK_LITERAL
239 tok%char_val = '|'
240 end if
241 pos = pos + 1
242
243 ! Backreferences \1-\9
244 case ('1', '2', '3', '4', '5', '6', '7', '8', '9')
245 tok%ttype = TOK_BACKREF
246 read(c, '(I1)') ref_num
247 tok%int_val = ref_num
248 pos = pos + 1
249
250 ! Word boundaries
251 case ('<')
252 tok%ttype = TOK_WORD_BOUNDARY
253 tok%int_val = 1 ! word start
254 pos = pos + 1
255
256 case ('>')
257 tok%ttype = TOK_WORD_BOUNDARY
258 tok%int_val = 2 ! word end
259 pos = pos + 1
260
261 case ('b')
262 tok%ttype = TOK_WORD_BOUNDARY
263 tok%int_val = 3 ! word boundary (either)
264 pos = pos + 1
265
266 case ('B')
267 tok%ttype = TOK_WORD_BOUNDARY
268 tok%int_val = 4 ! not word boundary
269 pos = pos + 1
270
271 ! Character escapes
272 case ('n')
273 tok%ttype = TOK_LITERAL
274 tok%char_val = char(10) ! newline
275 pos = pos + 1
276
277 case ('t')
278 tok%ttype = TOK_LITERAL
279 tok%char_val = char(9) ! tab
280 pos = pos + 1
281
282 case ('r')
283 tok%ttype = TOK_LITERAL
284 tok%char_val = char(13) ! carriage return
285 pos = pos + 1
286
287 ! Escape metacharacters to make them literal
288 case ('.', '*', '^', '$', '[', ']', '\')
289 tok%ttype = TOK_LITERAL
290 tok%char_val = c
291 pos = pos + 1
292
293 case default
294 ! Unknown escape - treat as literal
295 tok%ttype = TOK_LITERAL
296 tok%char_val = c
297 pos = pos + 1
298 end select
299
300 end subroutine parse_escape
301
302 subroutine parse_char_class(pattern, pos, n, tok, ierr)
303 !> Parse a character class [...] starting at pos (which points to [)
304 character(len=*), intent(in) :: pattern
305 integer, intent(inout) :: pos
306 integer, intent(in) :: n
307 type(token_t), intent(out) :: tok
308 integer, intent(out) :: ierr
309
310 integer :: j, start_char, end_char
311 character(len=1) :: c, prev_c
312 logical :: negated, first
313
314 ierr = 0
315 tok%ttype = TOK_LITERAL ! Will be set properly at end
316 tok%char_class = .false.
317 tok%negated = .false.
318
319 pos = pos + 1 ! Skip [
320
321 if (pos > n) then
322 ierr = 1
323 return
324 end if
325
326 ! Check for negation
327 negated = .false.
328 if (pattern(pos:pos) == '^') then
329 negated = .true.
330 pos = pos + 1
331 end if
332
333 ! Handle ] at start (it's literal)
334 first = .true.
335 if (pos <= n .and. pattern(pos:pos) == ']') then
336 tok%char_class(ichar(']')) = .true.
337 pos = pos + 1
338 first = .false.
339 end if
340
341 ! Handle - at start (it's literal)
342 if (pos <= n .and. pattern(pos:pos) == '-') then
343 tok%char_class(ichar('-')) = .true.
344 pos = pos + 1
345 end if
346
347 prev_c = char(0)
348 do while (pos <= n)
349 c = pattern(pos:pos)
350
351 if (c == ']') then
352 ! End of character class
353 pos = pos + 1
354 tok%negated = negated
355 tok%ttype = TOK_LBRACKET ! Indicate this is a char class token
356 return
357 end if
358
359 if (c == '-' .and. pos + 1 <= n .and. pattern(pos+1:pos+1) /= ']') then
360 ! Range: prev_c - next_c
361 if (prev_c /= char(0)) then
362 pos = pos + 1
363 if (pos > n) then
364 ierr = 1
365 return
366 end if
367 c = pattern(pos:pos)
368
369 ! Handle escape in range end
370 if (c == '\' .and. pos + 1 <= n) then
371 pos = pos + 1
372 c = pattern(pos:pos)
373 end if
374
375 start_char = ichar(prev_c)
376 end_char = ichar(c)
377 if (start_char > end_char) then
378 ! Invalid range, but we'll be lenient
379 tok%char_class(start_char) = .true.
380 tok%char_class(ichar('-')) = .true.
381 tok%char_class(end_char) = .true.
382 else
383 do j = start_char, end_char
384 tok%char_class(j) = .true.
385 end do
386 end if
387 prev_c = char(0) ! Reset after range
388 pos = pos + 1
389 cycle
390 else
391 ! - at start after ] or as first char
392 tok%char_class(ichar('-')) = .true.
393 pos = pos + 1
394 cycle
395 end if
396 end if
397
398 if (c == '\' .and. pos + 1 <= n) then
399 ! Escape sequence in character class
400 pos = pos + 1
401 c = pattern(pos:pos)
402 select case (c)
403 case ('n')
404 c = char(10)
405 case ('t')
406 c = char(9)
407 case ('r')
408 c = char(13)
409 ! Otherwise take the character literally
410 end select
411 end if
412
413 if (c == '[' .and. pos + 1 <= n .and. pattern(pos+1:pos+1) == ':') then
414 ! POSIX character class [:alpha:] etc
415 call parse_posix_class(pattern, pos, n, tok%char_class, ierr)
416 if (ierr /= 0) return
417 prev_c = char(0)
418 cycle
419 end if
420
421 ! Regular character
422 tok%char_class(ichar(c)) = .true.
423 prev_c = c
424 pos = pos + 1
425 end do
426
427 ! Unterminated character class
428 ierr = 1
429
430 end subroutine parse_char_class
431
432 subroutine parse_posix_class(pattern, pos, n, char_class, ierr)
433 !> Parse POSIX character class [:name:]
434 character(len=*), intent(in) :: pattern
435 integer, intent(inout) :: pos
436 integer, intent(in) :: n
437 logical, intent(inout) :: char_class(0:255)
438 integer, intent(out) :: ierr
439
440 integer :: end_pos, j
441 character(len=16) :: class_name
442
443 ierr = 0
444
445 ! Find closing :]
446 end_pos = index(pattern(pos:n), ':]')
447 if (end_pos == 0) then
448 ierr = 1
449 return
450 end if
451 end_pos = pos + end_pos - 1 ! Adjust to absolute position (index is 1-based)
452
453 ! Extract class name (skip [: at start, stop before :]
454 class_name = pattern(pos+2:end_pos-1)
455
456 select case (trim(class_name))
457 case ('alnum')
458 do j = ichar('a'), ichar('z')
459 char_class(j) = .true.
460 end do
461 do j = ichar('A'), ichar('Z')
462 char_class(j) = .true.
463 end do
464 do j = ichar('0'), ichar('9')
465 char_class(j) = .true.
466 end do
467
468 case ('alpha')
469 do j = ichar('a'), ichar('z')
470 char_class(j) = .true.
471 end do
472 do j = ichar('A'), ichar('Z')
473 char_class(j) = .true.
474 end do
475
476 case ('digit')
477 do j = ichar('0'), ichar('9')
478 char_class(j) = .true.
479 end do
480
481 case ('lower')
482 do j = ichar('a'), ichar('z')
483 char_class(j) = .true.
484 end do
485
486 case ('upper')
487 do j = ichar('A'), ichar('Z')
488 char_class(j) = .true.
489 end do
490
491 case ('space')
492 char_class(ichar(' ')) = .true.
493 char_class(9) = .true. ! tab
494 char_class(10) = .true. ! newline
495 char_class(11) = .true. ! vertical tab
496 char_class(12) = .true. ! form feed
497 char_class(13) = .true. ! carriage return
498
499 case ('blank')
500 char_class(ichar(' ')) = .true.
501 char_class(9) = .true. ! tab
502
503 case ('punct')
504 ! Punctuation characters
505 do j = 33, 47
506 char_class(j) = .true.
507 end do
508 do j = 58, 64
509 char_class(j) = .true.
510 end do
511 do j = 91, 96
512 char_class(j) = .true.
513 end do
514 do j = 123, 126
515 char_class(j) = .true.
516 end do
517
518 case ('xdigit')
519 do j = ichar('0'), ichar('9')
520 char_class(j) = .true.
521 end do
522 do j = ichar('a'), ichar('f')
523 char_class(j) = .true.
524 end do
525 do j = ichar('A'), ichar('F')
526 char_class(j) = .true.
527 end do
528
529 case ('word')
530 ! GNU extension: word characters
531 do j = ichar('a'), ichar('z')
532 char_class(j) = .true.
533 end do
534 do j = ichar('A'), ichar('Z')
535 char_class(j) = .true.
536 end do
537 do j = ichar('0'), ichar('9')
538 char_class(j) = .true.
539 end do
540 char_class(ichar('_')) = .true.
541
542 case default
543 ! Unknown class - ignore silently
544 end select
545
546 pos = end_pos + 2 ! Skip past :] (end_pos points to ':', so +2 to skip both)
547
548 end subroutine parse_posix_class
549
550 end module regex_lexer
551