fortrangoingonforty/ferp / 5263050

Browse files

Update regex modules to use pattern_len

Use pattern_len() instead of len_trim() across regex modules to properly
handle whitespace patterns. Update Makefile dependencies to ensure
ferp_kinds is compiled before regex modules that use pattern_len.
Authored by mfwolffe <wolffemf@dukes.jmu.edu>
SHA
5263050a3cf26fd45d0f2da7f877f9cc64ea19a0
Parents
e34a89e
Tree
7ec0955

5 changed files

Status File + -
M Makefile 6 6
M src/regex/aho_corasick.f90 5 4
M src/regex/pcre_api.f90 12 6
M src/regex/regex_api.f90 3 2
M src/regex/regex_lexer.f90 2 1
Makefile modified
@@ -102,17 +102,17 @@ $(BUILD_DIR)/%.o: $(SRC_DIR)/%.f90 | $(BUILD_DIR)
102102
 $(BUILD_DIR)/%.o: $(SRC_DIR)/%.c | $(BUILD_DIR)
103103
 	$(CC) $(CFLAGS) -c $< -o $@
104104
 
105
-# Regex module dependencies
105
+# Regex module dependencies (note: some depend on ferp_kinds for pattern_len function)
106106
 $(BUILD_DIR)/regex_charclass.o:
107107
 $(BUILD_DIR)/regex_types.o: $(BUILD_DIR)/regex_charclass.o
108
-$(BUILD_DIR)/regex_lexer.o: $(BUILD_DIR)/regex_types.o
108
+$(BUILD_DIR)/regex_lexer.o: $(BUILD_DIR)/regex_types.o $(BUILD_DIR)/ferp_kinds.o
109109
 $(BUILD_DIR)/regex_parser.o: $(BUILD_DIR)/regex_types.o
110110
 $(BUILD_DIR)/regex_nfa.o: $(BUILD_DIR)/regex_types.o $(BUILD_DIR)/regex_charclass.o $(BUILD_DIR)/regex_parser.o
111111
 $(BUILD_DIR)/regex_engine.o: $(BUILD_DIR)/regex_types.o
112
-$(BUILD_DIR)/aho_corasick.o:
113
-$(BUILD_DIR)/regex_optimizer.o: $(BUILD_DIR)/regex_types.o $(BUILD_DIR)/regex_charclass.o $(BUILD_DIR)/aho_corasick.o
114
-$(BUILD_DIR)/regex_api.o: $(BUILD_DIR)/regex_types.o $(BUILD_DIR)/regex_lexer.o $(BUILD_DIR)/regex_parser.o $(BUILD_DIR)/regex_nfa.o $(BUILD_DIR)/regex_engine.o $(BUILD_DIR)/regex_optimizer.o
115
-$(BUILD_DIR)/pcre_api.o:
112
+$(BUILD_DIR)/aho_corasick.o: $(BUILD_DIR)/ferp_kinds.o
113
+$(BUILD_DIR)/regex_optimizer.o: $(BUILD_DIR)/regex_types.o $(BUILD_DIR)/regex_charclass.o $(BUILD_DIR)/aho_corasick.o $(BUILD_DIR)/ferp_kinds.o
114
+$(BUILD_DIR)/regex_api.o: $(BUILD_DIR)/regex_types.o $(BUILD_DIR)/regex_lexer.o $(BUILD_DIR)/regex_parser.o $(BUILD_DIR)/regex_nfa.o $(BUILD_DIR)/regex_engine.o $(BUILD_DIR)/regex_optimizer.o $(BUILD_DIR)/ferp_kinds.o
115
+$(BUILD_DIR)/pcre_api.o: $(BUILD_DIR)/ferp_kinds.o
116116
 
117117
 # Main module dependencies
118118
 $(BUILD_DIR)/ferp_options.o: $(BUILD_DIR)/ferp_kinds.o
src/regex/aho_corasick.f90 modified
@@ -2,6 +2,7 @@ module aho_corasick
22
   !> Aho-Corasick automaton for multi-pattern string matching
33
   !> Matches all patterns in a single pass O(n + m + z)
44
   !> where n=text length, m=total pattern length, z=matches
5
+  use ferp_kinds, only: pattern_len
56
   implicit none
67
   private
78
 
@@ -63,16 +64,16 @@ contains
6364
     ac%ignore_case = ignore_case
6465
     ac%num_patterns = num_patterns
6566
 
66
-    ! Allocate pattern lengths
67
+    ! Allocate pattern lengths (use pattern_len to preserve whitespace patterns)
6768
     allocate(ac%pattern_lengths(num_patterns))
6869
     do i = 1, num_patterns
69
-      ac%pattern_lengths(i) = len_trim(patterns(i))
70
+      ac%pattern_lengths(i) = pattern_len(patterns(i))
7071
     end do
7172
 
7273
     ! Initial capacity - estimate based on total pattern length
7374
     ac%capacity = 1
7475
     do i = 1, num_patterns
75
-      ac%capacity = ac%capacity + len_trim(patterns(i))
76
+      ac%capacity = ac%capacity + pattern_len(patterns(i))
7677
     end do
7778
     ac%capacity = max(ac%capacity, 256)
7879
     allocate(ac%nodes(ac%capacity))
@@ -84,7 +85,7 @@ contains
8485
     ! Phase 1: Build trie from patterns
8586
     do i = 1, num_patterns
8687
       state = 1  ! Start at root
87
-      do j = 1, len_trim(patterns(i))
88
+      do j = 1, pattern_len(patterns(i))
8889
         ch = patterns(i)(j:j)
8990
         if (ignore_case) then
9091
           c = to_lower_code(ichar(ch))
src/regex/pcre_api.f90 modified
@@ -2,6 +2,7 @@ module pcre_api
22
   !> PCRE2 library bindings for Perl-compatible regular expressions
33
   !> Uses iso_c_binding for C interoperability with libpcre2-8
44
   use, intrinsic :: iso_c_binding
5
+  use ferp_kinds, only: pattern_len
56
   implicit none
67
   private
78
 
@@ -159,27 +160,32 @@ contains
159160
     logical, intent(in), optional :: ignore_case
160161
     integer, intent(out) :: ierr
161162
 
162
-    character(len=len_trim(pattern)+1, kind=c_char) :: c_pattern
163
+    integer :: plen
164
+    character(len=:), allocatable :: c_pattern
163165
     integer(c_int) :: options, errorcode
164
-    integer(c_size_t) :: erroroffset, pattern_len
166
+    integer(c_size_t) :: erroroffset, pcre_pattern_len
165167
 
166168
     ierr = 0
167169
     re%compiled = .false.
168170
     re%error_code = 0
169171
     re%error_msg = ''
170172
 
173
+    ! Get actual pattern length (preserving whitespace patterns)
174
+    plen = pattern_len(pattern)
175
+
171176
     ! Set options - enable UTF-8 and Unicode properties by default
172177
     options = ior(PCRE2_UTF, PCRE2_UCP)
173178
     if (present(ignore_case)) then
174179
       if (ignore_case) options = ior(options, PCRE2_CASELESS)
175180
     end if
176181
 
177
-    ! Prepare pattern as C string
178
-    c_pattern = trim(pattern) // c_null_char
179
-    pattern_len = int(len_trim(pattern), c_size_t)
182
+    ! Prepare pattern as C string (use exact length, not trim)
183
+    allocate(character(len=plen+1) :: c_pattern)
184
+    c_pattern = pattern(1:plen) // c_null_char
185
+    pcre_pattern_len = int(plen, c_size_t)
180186
 
181187
     ! Compile pattern
182
-    re%code = pcre2_compile_8(c_pattern, pattern_len, options, &
188
+    re%code = pcre2_compile_8(c_pattern, pcre_pattern_len, options, &
183189
                                errorcode, erroroffset, c_null_ptr)
184190
 
185191
     if (.not. c_associated(re%code)) then
src/regex/regex_api.f90 modified
@@ -7,6 +7,7 @@ module regex_api
77
   use regex_nfa
88
   use regex_engine
99
   use regex_optimizer
10
+  use ferp_kinds, only: pattern_len
1011
   implicit none
1112
   private
1213
 
@@ -53,8 +54,8 @@ contains
5354
     if (present(is_ere)) extended = is_ere
5455
     re%is_ere = extended
5556
 
56
-    ! Handle empty pattern
57
-    if (len_trim(pattern) == 0) then
57
+    ! Handle empty pattern (use pattern_len to preserve whitespace patterns)
58
+    if (pattern_len(pattern) == 0) then
5859
       call re%nfa%init()
5960
       re%nfa%start_state = re%nfa%add_state()
6061
       re%nfa%accept_state = re%nfa%add_state()
src/regex/regex_lexer.f90 modified
@@ -2,6 +2,7 @@ module regex_lexer
22
   !> Regex pattern tokenizer for FERP
33
   !> Handles both BRE (Basic) and ERE (Extended) regex dialects
44
   use regex_types
5
+  use ferp_kinds, only: pattern_len
56
   implicit none
67
   private
78
 
@@ -23,7 +24,7 @@ contains
2324
 
2425
     ierr = 0
2526
     call tokens%init()
26
-    n = len_trim(pattern)
27
+    n = pattern_len(pattern)  ! Use pattern_len to preserve whitespace patterns
2728
     i = 1
2829
     in_bracket = .false.
2930