| 1 | module pcre_api |
| 2 | !> PCRE2 library bindings for Perl-compatible regular expressions |
| 3 | !> Uses iso_c_binding for C interoperability with libpcre2-8 |
| 4 | use, intrinsic :: iso_c_binding |
| 5 | use ferp_kinds, only: pattern_len |
| 6 | implicit none |
| 7 | private |
| 8 | |
| 9 | public :: pcre_t, pcre_match_result_t |
| 10 | public :: pcre_compile, pcre_match, pcre_search |
| 11 | public :: pcre_free, pcre_error_message |
| 12 | public :: pcre_available |
| 13 | |
| 14 | !--------------------------------------------------------------------------- |
| 15 | ! PCRE2 Option Constants |
| 16 | !--------------------------------------------------------------------------- |
| 17 | integer(c_int), parameter, public :: PCRE2_CASELESS = int(z'00000008', c_int) |
| 18 | integer(c_int), parameter, public :: PCRE2_MULTILINE = int(z'00000400', c_int) |
| 19 | integer(c_int), parameter, public :: PCRE2_DOTALL = int(z'00000020', c_int) |
| 20 | integer(c_int), parameter, public :: PCRE2_EXTENDED = int(z'00000080', c_int) |
| 21 | integer(c_int), parameter, public :: PCRE2_UTF = int(z'00080000', c_int) |
| 22 | integer(c_int), parameter, public :: PCRE2_UCP = int(z'00020000', c_int) ! Unicode properties |
| 23 | integer(c_int), parameter, public :: PCRE2_NO_UTF_CHECK = int(z'40000000', c_int) |
| 24 | |
| 25 | !--------------------------------------------------------------------------- |
| 26 | ! C Interface to PCRE2 Library (8-bit) |
| 27 | !--------------------------------------------------------------------------- |
| 28 | interface |
| 29 | ! pcre2_compile_8 - Compile a regular expression pattern |
| 30 | function pcre2_compile_8(pattern, length, options, errorcode, erroroffset, ccontext) & |
| 31 | bind(C, name="pcre2_compile_8") |
| 32 | import :: c_ptr, c_char, c_size_t, c_int |
| 33 | character(kind=c_char), intent(in) :: pattern(*) |
| 34 | integer(c_size_t), value :: length |
| 35 | integer(c_int), value :: options |
| 36 | integer(c_int), intent(out) :: errorcode |
| 37 | integer(c_size_t), intent(out) :: erroroffset |
| 38 | type(c_ptr), value :: ccontext |
| 39 | type(c_ptr) :: pcre2_compile_8 |
| 40 | end function pcre2_compile_8 |
| 41 | |
| 42 | ! pcre2_match_data_create_from_pattern_8 - Create match data block |
| 43 | function pcre2_match_data_create_from_pattern_8(code, gcontext) & |
| 44 | bind(C, name="pcre2_match_data_create_from_pattern_8") |
| 45 | import :: c_ptr |
| 46 | type(c_ptr), value :: code |
| 47 | type(c_ptr), value :: gcontext |
| 48 | type(c_ptr) :: pcre2_match_data_create_from_pattern_8 |
| 49 | end function pcre2_match_data_create_from_pattern_8 |
| 50 | |
| 51 | ! pcre2_match_8 - Match a compiled pattern against a subject string |
| 52 | function pcre2_match_8(code, subject, length, startoffset, options, & |
| 53 | match_data, mcontext) bind(C, name="pcre2_match_8") |
| 54 | import :: c_ptr, c_char, c_size_t, c_int |
| 55 | type(c_ptr), value :: code |
| 56 | character(kind=c_char), intent(in) :: subject(*) |
| 57 | integer(c_size_t), value :: length |
| 58 | integer(c_size_t), value :: startoffset |
| 59 | integer(c_int), value :: options |
| 60 | type(c_ptr), value :: match_data |
| 61 | type(c_ptr), value :: mcontext |
| 62 | integer(c_int) :: pcre2_match_8 |
| 63 | end function pcre2_match_8 |
| 64 | |
| 65 | ! pcre2_get_ovector_pointer_8 - Get pointer to output vector |
| 66 | function pcre2_get_ovector_pointer_8(match_data) & |
| 67 | bind(C, name="pcre2_get_ovector_pointer_8") |
| 68 | import :: c_ptr |
| 69 | type(c_ptr), value :: match_data |
| 70 | type(c_ptr) :: pcre2_get_ovector_pointer_8 |
| 71 | end function pcre2_get_ovector_pointer_8 |
| 72 | |
| 73 | ! pcre2_get_ovector_count_8 - Get number of pairs in output vector |
| 74 | function pcre2_get_ovector_count_8(match_data) & |
| 75 | bind(C, name="pcre2_get_ovector_count_8") |
| 76 | import :: c_ptr, c_int |
| 77 | type(c_ptr), value :: match_data |
| 78 | integer(c_int) :: pcre2_get_ovector_count_8 |
| 79 | end function pcre2_get_ovector_count_8 |
| 80 | |
| 81 | ! pcre2_code_free_8 - Free a compiled pattern |
| 82 | subroutine pcre2_code_free_8(code) bind(C, name="pcre2_code_free_8") |
| 83 | import :: c_ptr |
| 84 | type(c_ptr), value :: code |
| 85 | end subroutine pcre2_code_free_8 |
| 86 | |
| 87 | ! pcre2_match_data_free_8 - Free match data block |
| 88 | subroutine pcre2_match_data_free_8(match_data) & |
| 89 | bind(C, name="pcre2_match_data_free_8") |
| 90 | import :: c_ptr |
| 91 | type(c_ptr), value :: match_data |
| 92 | end subroutine pcre2_match_data_free_8 |
| 93 | |
| 94 | ! pcre2_get_error_message_8 - Get error message for error code |
| 95 | function pcre2_get_error_message_8(errorcode, buffer, bufflen) & |
| 96 | bind(C, name="pcre2_get_error_message_8") |
| 97 | import :: c_ptr, c_char, c_size_t, c_int |
| 98 | integer(c_int), value :: errorcode |
| 99 | character(kind=c_char), intent(out) :: buffer(*) |
| 100 | integer(c_size_t), value :: bufflen |
| 101 | integer(c_int) :: pcre2_get_error_message_8 |
| 102 | end function pcre2_get_error_message_8 |
| 103 | end interface |
| 104 | |
| 105 | !--------------------------------------------------------------------------- |
| 106 | ! Compiled PCRE Pattern Type |
| 107 | !--------------------------------------------------------------------------- |
| 108 | type :: pcre_t |
| 109 | private |
| 110 | type(c_ptr) :: code = c_null_ptr ! Compiled pattern |
| 111 | type(c_ptr) :: match_data = c_null_ptr ! Match data block |
| 112 | logical :: compiled = .false. |
| 113 | integer :: error_code = 0 |
| 114 | character(len=256) :: error_msg = '' |
| 115 | contains |
| 116 | procedure :: is_compiled => pcre_is_compiled |
| 117 | end type pcre_t |
| 118 | |
| 119 | !--------------------------------------------------------------------------- |
| 120 | ! Match Result Type |
| 121 | !--------------------------------------------------------------------------- |
| 122 | type :: pcre_match_result_t |
| 123 | logical :: matched = .false. |
| 124 | integer :: match_start = 0 ! 1-based start position |
| 125 | integer :: match_end = 0 ! 1-based end position |
| 126 | integer :: group_starts(20) = 0 |
| 127 | integer :: group_ends(20) = 0 |
| 128 | integer :: num_groups = 0 |
| 129 | end type pcre_match_result_t |
| 130 | |
| 131 | ! Module state |
| 132 | logical, save :: pcre_checked = .false. |
| 133 | logical, save :: pcre_is_available = .false. |
| 134 | |
| 135 | contains |
| 136 | |
| 137 | !--------------------------------------------------------------------------- |
| 138 | ! Check if PCRE2 library is available |
| 139 | !--------------------------------------------------------------------------- |
| 140 | function pcre_available() result(available) |
| 141 | logical :: available |
| 142 | |
| 143 | if (.not. pcre_checked) then |
| 144 | ! Try to compile a simple pattern to check availability |
| 145 | ! If the library isn't linked, this will cause a runtime error |
| 146 | ! For now, assume available if we got this far (library linked) |
| 147 | pcre_is_available = .true. |
| 148 | pcre_checked = .true. |
| 149 | end if |
| 150 | |
| 151 | available = pcre_is_available |
| 152 | end function pcre_available |
| 153 | |
| 154 | !--------------------------------------------------------------------------- |
| 155 | ! Compile a PCRE pattern |
| 156 | !--------------------------------------------------------------------------- |
| 157 | subroutine pcre_compile(re, pattern, ignore_case, ierr) |
| 158 | type(pcre_t), intent(out) :: re |
| 159 | character(len=*), intent(in) :: pattern |
| 160 | logical, intent(in), optional :: ignore_case |
| 161 | integer, intent(out) :: ierr |
| 162 | |
| 163 | integer :: plen |
| 164 | character(len=:), allocatable :: c_pattern |
| 165 | integer(c_int) :: options, errorcode |
| 166 | integer(c_size_t) :: erroroffset, pcre_pattern_len |
| 167 | |
| 168 | ierr = 0 |
| 169 | re%compiled = .false. |
| 170 | re%error_code = 0 |
| 171 | re%error_msg = '' |
| 172 | |
| 173 | ! Get actual pattern length (preserving whitespace patterns) |
| 174 | plen = pattern_len(pattern) |
| 175 | |
| 176 | ! Set options - enable UTF-8 and Unicode properties by default |
| 177 | options = ior(PCRE2_UTF, PCRE2_UCP) |
| 178 | if (present(ignore_case)) then |
| 179 | if (ignore_case) options = ior(options, PCRE2_CASELESS) |
| 180 | end if |
| 181 | |
| 182 | ! Prepare pattern as C string (use exact length, not trim) |
| 183 | allocate(character(len=plen+1) :: c_pattern) |
| 184 | c_pattern = pattern(1:plen) // c_null_char |
| 185 | pcre_pattern_len = int(plen, c_size_t) |
| 186 | |
| 187 | ! Compile pattern |
| 188 | re%code = pcre2_compile_8(c_pattern, pcre_pattern_len, options, & |
| 189 | errorcode, erroroffset, c_null_ptr) |
| 190 | |
| 191 | if (.not. c_associated(re%code)) then |
| 192 | re%error_code = int(errorcode) |
| 193 | call get_pcre_error(errorcode, re%error_msg) |
| 194 | ierr = 1 |
| 195 | return |
| 196 | end if |
| 197 | |
| 198 | ! Create match data block |
| 199 | re%match_data = pcre2_match_data_create_from_pattern_8(re%code, c_null_ptr) |
| 200 | if (.not. c_associated(re%match_data)) then |
| 201 | call pcre2_code_free_8(re%code) |
| 202 | re%code = c_null_ptr |
| 203 | re%error_msg = 'Failed to create match data' |
| 204 | ierr = 2 |
| 205 | return |
| 206 | end if |
| 207 | |
| 208 | re%compiled = .true. |
| 209 | |
| 210 | end subroutine pcre_compile |
| 211 | |
| 212 | !--------------------------------------------------------------------------- |
| 213 | ! Match pattern against text (returns true if matches anywhere) |
| 214 | !--------------------------------------------------------------------------- |
| 215 | function pcre_match(re, text, ignore_case) result(matched) |
| 216 | type(pcre_t), intent(in) :: re |
| 217 | character(len=*), intent(in) :: text |
| 218 | logical, intent(in), optional :: ignore_case |
| 219 | logical :: matched |
| 220 | |
| 221 | type(pcre_match_result_t) :: res |
| 222 | logical :: icase |
| 223 | |
| 224 | matched = .false. |
| 225 | if (.not. re%compiled) return |
| 226 | |
| 227 | icase = .false. |
| 228 | if (present(ignore_case)) icase = ignore_case |
| 229 | |
| 230 | res = pcre_search(re, text, icase) |
| 231 | matched = res%matched |
| 232 | |
| 233 | end function pcre_match |
| 234 | |
| 235 | !--------------------------------------------------------------------------- |
| 236 | ! Search for pattern in text, return match result with positions |
| 237 | !--------------------------------------------------------------------------- |
| 238 | function pcre_search(re, text, ignore_case, start_offset) result(res) |
| 239 | type(pcre_t), intent(in) :: re |
| 240 | character(len=*), intent(in) :: text |
| 241 | logical, intent(in), optional :: ignore_case |
| 242 | integer, intent(in), optional :: start_offset |
| 243 | type(pcre_match_result_t) :: res |
| 244 | |
| 245 | character(len=len(text)+1, kind=c_char) :: c_text |
| 246 | integer(c_int) :: rc, options |
| 247 | integer(c_size_t) :: text_len, startoffset |
| 248 | integer(c_size_t) :: ovector_count |
| 249 | type(c_ptr) :: ovector_ptr |
| 250 | integer(c_size_t), pointer :: ovector(:) |
| 251 | integer :: i |
| 252 | |
| 253 | res%matched = .false. |
| 254 | res%match_start = 0 |
| 255 | res%match_end = 0 |
| 256 | res%num_groups = 0 |
| 257 | |
| 258 | if (.not. re%compiled) return |
| 259 | |
| 260 | ! Prepare text as C string (without null terminator for length) |
| 261 | c_text = text // c_null_char |
| 262 | text_len = int(len(text), c_size_t) |
| 263 | |
| 264 | ! Set start offset |
| 265 | startoffset = 0_c_size_t |
| 266 | if (present(start_offset)) then |
| 267 | if (start_offset > 0) startoffset = int(start_offset - 1, c_size_t) |
| 268 | end if |
| 269 | |
| 270 | ! Match options (ignore_case was set at compile time) |
| 271 | options = 0_c_int |
| 272 | |
| 273 | ! Execute match |
| 274 | rc = pcre2_match_8(re%code, c_text, text_len, startoffset, options, & |
| 275 | re%match_data, c_null_ptr) |
| 276 | |
| 277 | if (rc < 0) then |
| 278 | ! No match or error |
| 279 | return |
| 280 | end if |
| 281 | |
| 282 | res%matched = .true. |
| 283 | |
| 284 | ! Get output vector with match positions |
| 285 | ovector_ptr = pcre2_get_ovector_pointer_8(re%match_data) |
| 286 | if (.not. c_associated(ovector_ptr)) return |
| 287 | |
| 288 | ovector_count = int(pcre2_get_ovector_count_8(re%match_data), c_size_t) |
| 289 | |
| 290 | ! Map to Fortran array - ovector has pairs of (start, end) positions |
| 291 | ! PCRE2 uses byte offsets (0-based), we need 1-based character positions |
| 292 | call c_f_pointer(ovector_ptr, ovector, [ovector_count * 2]) |
| 293 | |
| 294 | ! Overall match is in ovector(1) and ovector(2) |
| 295 | res%match_start = int(ovector(1)) + 1 ! Convert 0-based to 1-based |
| 296 | res%match_end = int(ovector(2)) ! End is exclusive in PCRE2, so this is correct |
| 297 | |
| 298 | ! Capture groups start at index 3 (pairs 2+) |
| 299 | res%num_groups = min(int(rc) - 1, 20) |
| 300 | do i = 1, res%num_groups |
| 301 | if ((i * 2 + 1) <= int(ovector_count * 2)) then |
| 302 | res%group_starts(i) = int(ovector(i * 2 + 1)) + 1 |
| 303 | res%group_ends(i) = int(ovector(i * 2 + 2)) |
| 304 | end if |
| 305 | end do |
| 306 | |
| 307 | end function pcre_search |
| 308 | |
| 309 | !--------------------------------------------------------------------------- |
| 310 | ! Free PCRE compiled pattern resources |
| 311 | !--------------------------------------------------------------------------- |
| 312 | subroutine pcre_free(re) |
| 313 | type(pcre_t), intent(inout) :: re |
| 314 | |
| 315 | if (c_associated(re%match_data)) then |
| 316 | call pcre2_match_data_free_8(re%match_data) |
| 317 | re%match_data = c_null_ptr |
| 318 | end if |
| 319 | |
| 320 | if (c_associated(re%code)) then |
| 321 | call pcre2_code_free_8(re%code) |
| 322 | re%code = c_null_ptr |
| 323 | end if |
| 324 | |
| 325 | re%compiled = .false. |
| 326 | |
| 327 | end subroutine pcre_free |
| 328 | |
| 329 | !--------------------------------------------------------------------------- |
| 330 | ! Get error message for failed compilation |
| 331 | !--------------------------------------------------------------------------- |
| 332 | function pcre_error_message(re) result(msg) |
| 333 | type(pcre_t), intent(in) :: re |
| 334 | character(len=256) :: msg |
| 335 | msg = re%error_msg |
| 336 | end function pcre_error_message |
| 337 | |
| 338 | !--------------------------------------------------------------------------- |
| 339 | ! Check if pattern is compiled |
| 340 | !--------------------------------------------------------------------------- |
| 341 | function pcre_is_compiled(this) result(res) |
| 342 | class(pcre_t), intent(in) :: this |
| 343 | logical :: res |
| 344 | res = this%compiled |
| 345 | end function pcre_is_compiled |
| 346 | |
| 347 | !--------------------------------------------------------------------------- |
| 348 | ! Get PCRE error message from error code |
| 349 | !--------------------------------------------------------------------------- |
| 350 | subroutine get_pcre_error(errorcode, msg) |
| 351 | integer(c_int), intent(in) :: errorcode |
| 352 | character(len=*), intent(out) :: msg |
| 353 | |
| 354 | character(len=256, kind=c_char) :: c_buffer |
| 355 | integer(c_int) :: ret |
| 356 | |
| 357 | msg = '' |
| 358 | ret = pcre2_get_error_message_8(errorcode, c_buffer, 256_c_size_t) |
| 359 | |
| 360 | if (ret > 0) then |
| 361 | msg = c_to_fortran_string(c_buffer) |
| 362 | else |
| 363 | write(msg, '(A,I0)') 'PCRE error code: ', errorcode |
| 364 | end if |
| 365 | |
| 366 | end subroutine get_pcre_error |
| 367 | |
| 368 | !--------------------------------------------------------------------------- |
| 369 | ! Convert C string to Fortran string |
| 370 | !--------------------------------------------------------------------------- |
| 371 | function c_to_fortran_string(c_str) result(f_str) |
| 372 | character(len=*, kind=c_char), intent(in) :: c_str |
| 373 | character(len=len(c_str)) :: f_str |
| 374 | integer :: i |
| 375 | |
| 376 | f_str = '' |
| 377 | do i = 1, len(c_str) |
| 378 | if (c_str(i:i) == c_null_char) exit |
| 379 | f_str(i:i) = c_str(i:i) |
| 380 | end do |
| 381 | end function c_to_fortran_string |
| 382 | |
| 383 | end module pcre_api |
| 384 |