fortrangoingonforty/ferp / 5263050

Browse files

Update regex modules to use pattern_len

Use pattern_len() instead of len_trim() across the regex modules so that
whitespace in a pattern is preserved rather than trimmed away (len_trim()
drops trailing blanks). Update the Makefile dependencies to ensure that
ferp_kinds is compiled before the regex modules that use pattern_len.
Authored by mfwolffe <wolffemf@dukes.jmu.edu>
SHA
5263050a3cf26fd45d0f2da7f877f9cc64ea19a0
Parents
e34a89e
Tree
7ec0955

5 changed files

Status | File | + | -
M Makefile 6 6
M src/regex/aho_corasick.f90 5 4
M src/regex/pcre_api.f90 12 6
M src/regex/regex_api.f90 3 2
M src/regex/regex_lexer.f90 2 1
Makefile (modified)
@@ -102,17 +102,17 @@ $(BUILD_DIR)/%.o: $(SRC_DIR)/%.f90 | $(BUILD_DIR)
102
 $(BUILD_DIR)/%.o: $(SRC_DIR)/%.c | $(BUILD_DIR)
102
 $(BUILD_DIR)/%.o: $(SRC_DIR)/%.c | $(BUILD_DIR)
103
 	$(CC) $(CFLAGS) -c $< -o $@
103
 	$(CC) $(CFLAGS) -c $< -o $@
104
 
104
 
105
-# Regex module dependencies
105
+# Regex module dependencies (note: some depend on ferp_kinds for pattern_len function)
106
 $(BUILD_DIR)/regex_charclass.o:
106
 $(BUILD_DIR)/regex_charclass.o:
107
 $(BUILD_DIR)/regex_types.o: $(BUILD_DIR)/regex_charclass.o
107
 $(BUILD_DIR)/regex_types.o: $(BUILD_DIR)/regex_charclass.o
108
-$(BUILD_DIR)/regex_lexer.o: $(BUILD_DIR)/regex_types.o
108
+$(BUILD_DIR)/regex_lexer.o: $(BUILD_DIR)/regex_types.o $(BUILD_DIR)/ferp_kinds.o
109
 $(BUILD_DIR)/regex_parser.o: $(BUILD_DIR)/regex_types.o
109
 $(BUILD_DIR)/regex_parser.o: $(BUILD_DIR)/regex_types.o
110
 $(BUILD_DIR)/regex_nfa.o: $(BUILD_DIR)/regex_types.o $(BUILD_DIR)/regex_charclass.o $(BUILD_DIR)/regex_parser.o
110
 $(BUILD_DIR)/regex_nfa.o: $(BUILD_DIR)/regex_types.o $(BUILD_DIR)/regex_charclass.o $(BUILD_DIR)/regex_parser.o
111
 $(BUILD_DIR)/regex_engine.o: $(BUILD_DIR)/regex_types.o
111
 $(BUILD_DIR)/regex_engine.o: $(BUILD_DIR)/regex_types.o
112
-$(BUILD_DIR)/aho_corasick.o:
112
+$(BUILD_DIR)/aho_corasick.o: $(BUILD_DIR)/ferp_kinds.o
113
-$(BUILD_DIR)/regex_optimizer.o: $(BUILD_DIR)/regex_types.o $(BUILD_DIR)/regex_charclass.o $(BUILD_DIR)/aho_corasick.o
113
+$(BUILD_DIR)/regex_optimizer.o: $(BUILD_DIR)/regex_types.o $(BUILD_DIR)/regex_charclass.o $(BUILD_DIR)/aho_corasick.o $(BUILD_DIR)/ferp_kinds.o
114
-$(BUILD_DIR)/regex_api.o: $(BUILD_DIR)/regex_types.o $(BUILD_DIR)/regex_lexer.o $(BUILD_DIR)/regex_parser.o $(BUILD_DIR)/regex_nfa.o $(BUILD_DIR)/regex_engine.o $(BUILD_DIR)/regex_optimizer.o
114
+$(BUILD_DIR)/regex_api.o: $(BUILD_DIR)/regex_types.o $(BUILD_DIR)/regex_lexer.o $(BUILD_DIR)/regex_parser.o $(BUILD_DIR)/regex_nfa.o $(BUILD_DIR)/regex_engine.o $(BUILD_DIR)/regex_optimizer.o $(BUILD_DIR)/ferp_kinds.o
115
-$(BUILD_DIR)/pcre_api.o:
115
+$(BUILD_DIR)/pcre_api.o: $(BUILD_DIR)/ferp_kinds.o
116
 
116
 
117
 # Main module dependencies
117
 # Main module dependencies
118
 $(BUILD_DIR)/ferp_options.o: $(BUILD_DIR)/ferp_kinds.o
118
 $(BUILD_DIR)/ferp_options.o: $(BUILD_DIR)/ferp_kinds.o
src/regex/aho_corasick.f90 (modified)
@@ -2,6 +2,7 @@ module aho_corasick
2
   !> Aho-Corasick automaton for multi-pattern string matching
2
   !> Aho-Corasick automaton for multi-pattern string matching
3
   !> Matches all patterns in a single pass O(n + m + z)
3
   !> Matches all patterns in a single pass O(n + m + z)
4
   !> where n=text length, m=total pattern length, z=matches
4
   !> where n=text length, m=total pattern length, z=matches
5
+  use ferp_kinds, only: pattern_len
5
   implicit none
6
   implicit none
6
   private
7
   private
7
 
8
 
@@ -63,16 +64,16 @@ contains
63
     ac%ignore_case = ignore_case
64
     ac%ignore_case = ignore_case
64
     ac%num_patterns = num_patterns
65
     ac%num_patterns = num_patterns
65
 
66
 
66
-    ! Allocate pattern lengths
67
+    ! Allocate pattern lengths (use pattern_len to preserve whitespace patterns)
67
     allocate(ac%pattern_lengths(num_patterns))
68
     allocate(ac%pattern_lengths(num_patterns))
68
     do i = 1, num_patterns
69
     do i = 1, num_patterns
69
-      ac%pattern_lengths(i) = len_trim(patterns(i))
70
+      ac%pattern_lengths(i) = pattern_len(patterns(i))
70
     end do
71
     end do
71
 
72
 
72
     ! Initial capacity - estimate based on total pattern length
73
     ! Initial capacity - estimate based on total pattern length
73
     ac%capacity = 1
74
     ac%capacity = 1
74
     do i = 1, num_patterns
75
     do i = 1, num_patterns
75
-      ac%capacity = ac%capacity + len_trim(patterns(i))
76
+      ac%capacity = ac%capacity + pattern_len(patterns(i))
76
     end do
77
     end do
77
     ac%capacity = max(ac%capacity, 256)
78
     ac%capacity = max(ac%capacity, 256)
78
     allocate(ac%nodes(ac%capacity))
79
     allocate(ac%nodes(ac%capacity))
@@ -84,7 +85,7 @@ contains
84
     ! Phase 1: Build trie from patterns
85
     ! Phase 1: Build trie from patterns
85
     do i = 1, num_patterns
86
     do i = 1, num_patterns
86
       state = 1  ! Start at root
87
       state = 1  ! Start at root
87
-      do j = 1, len_trim(patterns(i))
88
+      do j = 1, pattern_len(patterns(i))
88
         ch = patterns(i)(j:j)
89
         ch = patterns(i)(j:j)
89
         if (ignore_case) then
90
         if (ignore_case) then
90
           c = to_lower_code(ichar(ch))
91
           c = to_lower_code(ichar(ch))
src/regex/pcre_api.f90 (modified)
@@ -2,6 +2,7 @@ module pcre_api
2
   !> PCRE2 library bindings for Perl-compatible regular expressions
2
   !> PCRE2 library bindings for Perl-compatible regular expressions
3
   !> Uses iso_c_binding for C interoperability with libpcre2-8
3
   !> Uses iso_c_binding for C interoperability with libpcre2-8
4
   use, intrinsic :: iso_c_binding
4
   use, intrinsic :: iso_c_binding
5
+  use ferp_kinds, only: pattern_len
5
   implicit none
6
   implicit none
6
   private
7
   private
7
 
8
 
@@ -159,27 +160,32 @@ contains
159
     logical, intent(in), optional :: ignore_case
160
     logical, intent(in), optional :: ignore_case
160
     integer, intent(out) :: ierr
161
     integer, intent(out) :: ierr
161
 
162
 
162
-    character(len=len_trim(pattern)+1, kind=c_char) :: c_pattern
163
+    integer :: plen
164
+    character(len=:), allocatable :: c_pattern
163
     integer(c_int) :: options, errorcode
165
     integer(c_int) :: options, errorcode
164
-    integer(c_size_t) :: erroroffset, pattern_len
166
+    integer(c_size_t) :: erroroffset, pcre_pattern_len
165
 
167
 
166
     ierr = 0
168
     ierr = 0
167
     re%compiled = .false.
169
     re%compiled = .false.
168
     re%error_code = 0
170
     re%error_code = 0
169
     re%error_msg = ''
171
     re%error_msg = ''
170
 
172
 
173
+    ! Get actual pattern length (preserving whitespace patterns)
174
+    plen = pattern_len(pattern)
175
+
171
     ! Set options - enable UTF-8 and Unicode properties by default
176
     ! Set options - enable UTF-8 and Unicode properties by default
172
     options = ior(PCRE2_UTF, PCRE2_UCP)
177
     options = ior(PCRE2_UTF, PCRE2_UCP)
173
     if (present(ignore_case)) then
178
     if (present(ignore_case)) then
174
       if (ignore_case) options = ior(options, PCRE2_CASELESS)
179
       if (ignore_case) options = ior(options, PCRE2_CASELESS)
175
     end if
180
     end if
176
 
181
 
177
-    ! Prepare pattern as C string
182
+    ! Prepare pattern as C string (use exact length, not trim)
178
-    c_pattern = trim(pattern) // c_null_char
183
+    allocate(character(len=plen+1) :: c_pattern)
179
-    pattern_len = int(len_trim(pattern), c_size_t)
184
+    c_pattern = pattern(1:plen) // c_null_char
185
+    pcre_pattern_len = int(plen, c_size_t)
180
 
186
 
181
     ! Compile pattern
187
     ! Compile pattern
182
-    re%code = pcre2_compile_8(c_pattern, pattern_len, options, &
188
+    re%code = pcre2_compile_8(c_pattern, pcre_pattern_len, options, &
183
                                errorcode, erroroffset, c_null_ptr)
189
                                errorcode, erroroffset, c_null_ptr)
184
 
190
 
185
     if (.not. c_associated(re%code)) then
191
     if (.not. c_associated(re%code)) then
src/regex/regex_api.f90 (modified)
@@ -7,6 +7,7 @@ module regex_api
7
   use regex_nfa
7
   use regex_nfa
8
   use regex_engine
8
   use regex_engine
9
   use regex_optimizer
9
   use regex_optimizer
10
+  use ferp_kinds, only: pattern_len
10
   implicit none
11
   implicit none
11
   private
12
   private
12
 
13
 
@@ -53,8 +54,8 @@ contains
53
     if (present(is_ere)) extended = is_ere
54
     if (present(is_ere)) extended = is_ere
54
     re%is_ere = extended
55
     re%is_ere = extended
55
 
56
 
56
-    ! Handle empty pattern
57
+    ! Handle empty pattern (use pattern_len to preserve whitespace patterns)
57
-    if (len_trim(pattern) == 0) then
58
+    if (pattern_len(pattern) == 0) then
58
       call re%nfa%init()
59
       call re%nfa%init()
59
       re%nfa%start_state = re%nfa%add_state()
60
       re%nfa%start_state = re%nfa%add_state()
60
       re%nfa%accept_state = re%nfa%add_state()
61
       re%nfa%accept_state = re%nfa%add_state()
src/regex/regex_lexer.f90 (modified)
@@ -2,6 +2,7 @@ module regex_lexer
2
   !> Regex pattern tokenizer for FERP
2
   !> Regex pattern tokenizer for FERP
3
   !> Handles both BRE (Basic) and ERE (Extended) regex dialects
3
   !> Handles both BRE (Basic) and ERE (Extended) regex dialects
4
   use regex_types
4
   use regex_types
5
+  use ferp_kinds, only: pattern_len
5
   implicit none
6
   implicit none
6
   private
7
   private
7
 
8
 
@@ -23,7 +24,7 @@ contains
23
 
24
 
24
     ierr = 0
25
     ierr = 0
25
     call tokens%init()
26
     call tokens%init()
26
-    n = len_trim(pattern)
27
+    n = pattern_len(pattern)  ! Use pattern_len to preserve whitespace patterns
27
     i = 1
28
     i = 1
28
     in_bracket = .false.
29
     in_bracket = .false.
29
 
30