| 1 | module regex_charclass |
| 2 | !> High-performance bitwise character class operations for FERP |
| 3 | !> Uses 256-bit representation (4 x 64-bit integers) instead of boolean array |
| 4 | !> Provides O(1) membership testing with minimal memory footprint |
| 5 | implicit none |
| 6 | private |
| 7 | |
| 8 | public :: char_class_bits_t |
| 9 | public :: charclass_from_array, charclass_test, charclass_test_case_insensitive |
| 10 | public :: charclass_set, charclass_clear, charclass_set_range |
| 11 | public :: charclass_add_case_variants |
| 12 | |
| 13 | !> Bitwise character class - 256 bits in 4 words |
| 14 | type :: char_class_bits_t |
| 15 | integer(8) :: words(4) = 0_8 ! bits 0-63, 64-127, 128-191, 192-255 |
| 16 | logical :: negated = .false. |
| 17 | end type char_class_bits_t |
| 18 | |
| 19 | contains |
| 20 | |
| 21 | pure subroutine charclass_clear(cc) |
| 22 | !> Clear all bits |
| 23 | type(char_class_bits_t), intent(inout) :: cc |
| 24 | cc%words = 0_8 |
| 25 | cc%negated = .false. |
| 26 | end subroutine charclass_clear |
| 27 | |
| 28 | pure subroutine charclass_set(cc, char_code) |
| 29 | !> Set a single character bit |
| 30 | type(char_class_bits_t), intent(inout) :: cc |
| 31 | integer, intent(in) :: char_code |
| 32 | integer :: word_idx, bit_idx |
| 33 | |
| 34 | if (char_code < 0 .or. char_code > 255) return |
| 35 | word_idx = char_code / 64 + 1 ! 1-based index |
| 36 | bit_idx = mod(char_code, 64) |
| 37 | cc%words(word_idx) = ior(cc%words(word_idx), ishft(1_8, bit_idx)) |
| 38 | end subroutine charclass_set |
| 39 | |
| 40 | pure subroutine charclass_set_range(cc, start_char, end_char) |
| 41 | !> Set a range of character bits efficiently |
| 42 | type(char_class_bits_t), intent(inout) :: cc |
| 43 | integer, intent(in) :: start_char, end_char |
| 44 | integer :: i |
| 45 | |
| 46 | do i = start_char, end_char |
| 47 | if (i >= 0 .and. i <= 255) then |
| 48 | call charclass_set(cc, i) |
| 49 | end if |
| 50 | end do |
| 51 | end subroutine charclass_set_range |
| 52 | |
| 53 | pure function charclass_test(cc, c) result(res) |
| 54 | !> Test if character is in class - O(1) bit test |
| 55 | type(char_class_bits_t), intent(in) :: cc |
| 56 | character(len=1), intent(in) :: c |
| 57 | logical :: res |
| 58 | |
| 59 | integer :: char_code, word_idx, bit_idx |
| 60 | |
| 61 | char_code = ichar(c) |
| 62 | word_idx = char_code / 64 + 1 |
| 63 | bit_idx = mod(char_code, 64) |
| 64 | res = btest(cc%words(word_idx), bit_idx) |
| 65 | |
| 66 | if (cc%negated) res = .not. res |
| 67 | end function charclass_test |
| 68 | |
| 69 | pure function charclass_test_case_insensitive(cc, c) result(res) |
| 70 | !> Test character with case insensitivity - checks both cases in one call |
| 71 | type(char_class_bits_t), intent(in) :: cc |
| 72 | character(len=1), intent(in) :: c |
| 73 | logical :: res |
| 74 | |
| 75 | integer :: char_code, word_idx, bit_idx, other_case |
| 76 | |
| 77 | char_code = ichar(c) |
| 78 | word_idx = char_code / 64 + 1 |
| 79 | bit_idx = mod(char_code, 64) |
| 80 | res = btest(cc%words(word_idx), bit_idx) |
| 81 | |
| 82 | ! Quick check for other case (only for a-z and A-Z) |
| 83 | if (.not. res) then |
| 84 | if (char_code >= 65 .and. char_code <= 90) then |
| 85 | ! Uppercase A-Z -> check lowercase a-z |
| 86 | other_case = char_code + 32 |
| 87 | word_idx = other_case / 64 + 1 |
| 88 | bit_idx = mod(other_case, 64) |
| 89 | res = btest(cc%words(word_idx), bit_idx) |
| 90 | else if (char_code >= 97 .and. char_code <= 122) then |
| 91 | ! Lowercase a-z -> check uppercase A-Z |
| 92 | other_case = char_code - 32 |
| 93 | word_idx = other_case / 64 + 1 |
| 94 | bit_idx = mod(other_case, 64) |
| 95 | res = btest(cc%words(word_idx), bit_idx) |
| 96 | end if |
| 97 | end if |
| 98 | |
| 99 | if (cc%negated) res = .not. res |
| 100 | end function charclass_test_case_insensitive |
| 101 | |
| 102 | pure subroutine charclass_from_array(cc, char_class_array, negated) |
| 103 | !> Convert 256-element boolean array to bitwise format |
| 104 | type(char_class_bits_t), intent(out) :: cc |
| 105 | logical, intent(in) :: char_class_array(0:255) |
| 106 | logical, intent(in) :: negated |
| 107 | |
| 108 | integer :: i, word_idx, bit_idx |
| 109 | |
| 110 | cc%words = 0_8 |
| 111 | cc%negated = negated |
| 112 | |
| 113 | do i = 0, 255 |
| 114 | if (char_class_array(i)) then |
| 115 | word_idx = i / 64 + 1 |
| 116 | bit_idx = mod(i, 64) |
| 117 | cc%words(word_idx) = ior(cc%words(word_idx), ishft(1_8, bit_idx)) |
| 118 | end if |
| 119 | end do |
| 120 | end subroutine charclass_from_array |
| 121 | |
| 122 | pure subroutine charclass_add_case_variants(cc) |
| 123 | !> Pre-compute case variants into the character class |
| 124 | !> After calling this, case-insensitive matching becomes a single test |
| 125 | type(char_class_bits_t), intent(inout) :: cc |
| 126 | |
| 127 | integer :: i, word_idx, bit_idx, other_case |
| 128 | integer(8) :: saved_words(4) |
| 129 | |
| 130 | saved_words = cc%words |
| 131 | |
| 132 | ! For each set bit, also set its case variant |
| 133 | do i = 0, 255 |
| 134 | word_idx = i / 64 + 1 |
| 135 | bit_idx = mod(i, 64) |
| 136 | |
| 137 | if (btest(saved_words(word_idx), bit_idx)) then |
| 138 | ! Character is in class - add its case variant |
| 139 | if (i >= 65 .and. i <= 90) then |
| 140 | ! Uppercase A-Z -> add lowercase a-z |
| 141 | other_case = i + 32 |
| 142 | call charclass_set(cc, other_case) |
| 143 | else if (i >= 97 .and. i <= 122) then |
| 144 | ! Lowercase a-z -> add uppercase A-Z |
| 145 | other_case = i - 32 |
| 146 | call charclass_set(cc, other_case) |
| 147 | end if |
| 148 | end if |
| 149 | end do |
| 150 | end subroutine charclass_add_case_variants |
| 151 | |
| 152 | end module regex_charclass |
| 153 |