| 1 | module ferp_search |
| 2 | !> Fast string search algorithms for FERP |
| 3 | !> Implements Boyer-Moore-Horspool for fixed string matching |
| 4 | use ferp_kinds |
| 5 | implicit none |
| 6 | private |
| 7 | |
| 8 | public :: bm_search, bm_search_all |
| 9 | public :: bm_pattern_t, bm_compile, bm_free |
| 10 | |
| 11 | !> Compiled Boyer-Moore pattern |
| 12 | type :: bm_pattern_t |
| 13 | character(len=:), allocatable :: pattern |
| 14 | integer :: pattern_len = 0 |
| 15 | integer :: skip_table(0:255) ! Bad character skip table |
| 16 | logical :: case_insensitive = .false. |
| 17 | end type bm_pattern_t |
| 18 | |
| 19 | contains |
| 20 | |
| 21 | subroutine bm_compile(pat, pattern, case_insensitive) |
| 22 | !> Compile a pattern for Boyer-Moore search |
| 23 | type(bm_pattern_t), intent(out) :: pat |
| 24 | character(len=*), intent(in) :: pattern |
| 25 | logical, intent(in), optional :: case_insensitive |
| 26 | |
| 27 | integer :: i, c, pat_len |
| 28 | character(len=:), allocatable :: work_pattern |
| 29 | |
| 30 | pat%case_insensitive = .false. |
| 31 | if (present(case_insensitive)) pat%case_insensitive = case_insensitive |
| 32 | |
| 33 | pat_len = len(pattern) |
| 34 | pat%pattern_len = pat_len |
| 35 | |
| 36 | ! Store pattern (lowercase if case-insensitive) |
| 37 | if (pat%case_insensitive) then |
| 38 | allocate(character(len=pat_len) :: work_pattern) |
| 39 | do i = 1, pat_len |
| 40 | work_pattern(i:i) = to_lower(pattern(i:i)) |
| 41 | end do |
| 42 | pat%pattern = work_pattern |
| 43 | else |
| 44 | pat%pattern = pattern |
| 45 | end if |
| 46 | |
| 47 | ! Initialize skip table - default skip is pattern length |
| 48 | pat%skip_table = pat_len |
| 49 | |
| 50 | ! Build bad character table |
| 51 | ! For each character in pattern (except last), set skip distance |
| 52 | do i = 1, pat_len - 1 |
| 53 | if (pat%case_insensitive) then |
| 54 | c = ichar(to_lower(pattern(i:i))) |
| 55 | else |
| 56 | c = ichar(pattern(i:i)) |
| 57 | end if |
| 58 | pat%skip_table(c) = pat_len - i |
| 59 | |
| 60 | ! For case-insensitive, also set the other case |
| 61 | if (pat%case_insensitive) then |
| 62 | if (c >= ichar('a') .and. c <= ichar('z')) then |
| 63 | pat%skip_table(c - 32) = pat_len - i ! uppercase |
| 64 | else if (c >= ichar('A') .and. c <= ichar('Z')) then |
| 65 | pat%skip_table(c + 32) = pat_len - i ! lowercase |
| 66 | end if |
| 67 | end if |
| 68 | end do |
| 69 | |
| 70 | end subroutine bm_compile |
| 71 | |
| 72 | subroutine bm_free(pat) |
| 73 | !> Free compiled pattern |
| 74 | type(bm_pattern_t), intent(inout) :: pat |
| 75 | if (allocated(pat%pattern)) deallocate(pat%pattern) |
| 76 | pat%pattern_len = 0 |
| 77 | end subroutine bm_free |
| 78 | |
| 79 | function bm_search(text, pat) result(pos) |
| 80 | !> Search for pattern in text using Boyer-Moore-Horspool |
| 81 | !> Returns position of first match (1-based), or 0 if not found |
| 82 | character(len=*), intent(in) :: text |
| 83 | type(bm_pattern_t), intent(in) :: pat |
| 84 | integer :: pos |
| 85 | |
| 86 | integer :: text_len, pat_len, i, j, skip |
| 87 | character :: tc, pc |
| 88 | |
| 89 | pos = 0 |
| 90 | text_len = len(text) |
| 91 | pat_len = pat%pattern_len |
| 92 | |
| 93 | if (pat_len == 0) then |
| 94 | pos = 1 ! Empty pattern matches at start |
| 95 | return |
| 96 | end if |
| 97 | |
| 98 | if (text_len < pat_len) return |
| 99 | |
| 100 | i = pat_len ! Start at position where pattern could first match |
| 101 | |
| 102 | do while (i <= text_len) |
| 103 | ! Compare pattern right-to-left |
| 104 | j = pat_len |
| 105 | do while (j >= 1) |
| 106 | if (pat%case_insensitive) then |
| 107 | tc = to_lower(text(i - pat_len + j:i - pat_len + j)) |
| 108 | else |
| 109 | tc = text(i - pat_len + j:i - pat_len + j) |
| 110 | end if |
| 111 | pc = pat%pattern(j:j) |
| 112 | |
| 113 | if (tc /= pc) exit |
| 114 | j = j - 1 |
| 115 | end do |
| 116 | |
| 117 | if (j == 0) then |
| 118 | ! Full match found |
| 119 | pos = i - pat_len + 1 |
| 120 | return |
| 121 | end if |
| 122 | |
| 123 | ! Skip based on bad character at current position |
| 124 | if (pat%case_insensitive) then |
| 125 | skip = pat%skip_table(ichar(to_lower(text(i:i)))) |
| 126 | else |
| 127 | skip = pat%skip_table(ichar(text(i:i))) |
| 128 | end if |
| 129 | i = i + skip |
| 130 | end do |
| 131 | |
| 132 | end function bm_search |
| 133 | |
| 134 | subroutine bm_search_all(text, pat, positions, count) |
| 135 | !> Find all occurrences of pattern in text |
| 136 | character(len=*), intent(in) :: text |
| 137 | type(bm_pattern_t), intent(in) :: pat |
| 138 | integer, intent(out) :: positions(:) ! Array to store positions |
| 139 | integer, intent(out) :: count ! Number of matches found |
| 140 | |
| 141 | integer :: text_len, pat_len, i, j, skip, max_matches |
| 142 | character :: tc, pc |
| 143 | |
| 144 | count = 0 |
| 145 | max_matches = size(positions) |
| 146 | text_len = len(text) |
| 147 | pat_len = pat%pattern_len |
| 148 | |
| 149 | if (pat_len == 0) then |
| 150 | ! Empty pattern matches at every position |
| 151 | do i = 1, min(text_len + 1, max_matches) |
| 152 | count = count + 1 |
| 153 | positions(count) = i |
| 154 | end do |
| 155 | return |
| 156 | end if |
| 157 | |
| 158 | if (text_len < pat_len) return |
| 159 | |
| 160 | i = pat_len |
| 161 | |
| 162 | do while (i <= text_len .and. count < max_matches) |
| 163 | ! Compare pattern right-to-left |
| 164 | j = pat_len |
| 165 | do while (j >= 1) |
| 166 | if (pat%case_insensitive) then |
| 167 | tc = to_lower(text(i - pat_len + j:i - pat_len + j)) |
| 168 | else |
| 169 | tc = text(i - pat_len + j:i - pat_len + j) |
| 170 | end if |
| 171 | pc = pat%pattern(j:j) |
| 172 | |
| 173 | if (tc /= pc) exit |
| 174 | j = j - 1 |
| 175 | end do |
| 176 | |
| 177 | if (j == 0) then |
| 178 | ! Full match found |
| 179 | count = count + 1 |
| 180 | positions(count) = i - pat_len + 1 |
| 181 | ! Move past this match (non-overlapping) |
| 182 | i = i + pat_len |
| 183 | else |
| 184 | ! Skip based on bad character |
| 185 | if (pat%case_insensitive) then |
| 186 | skip = pat%skip_table(ichar(to_lower(text(i:i)))) |
| 187 | else |
| 188 | skip = pat%skip_table(ichar(text(i:i))) |
| 189 | end if |
| 190 | i = i + max(skip, 1) |
| 191 | end if |
| 192 | end do |
| 193 | |
| 194 | end subroutine bm_search_all |
| 195 | |
| 196 | pure function to_lower(ch) result(lower) |
| 197 | !> Convert character to lowercase |
| 198 | character, intent(in) :: ch |
| 199 | character :: lower |
| 200 | integer :: ic |
| 201 | |
| 202 | ic = ichar(ch) |
| 203 | if (ic >= ichar('A') .and. ic <= ichar('Z')) then |
| 204 | lower = char(ic + 32) |
| 205 | else |
| 206 | lower = ch |
| 207 | end if |
| 208 | end function to_lower |
| 209 | |
| 210 | end module ferp_search |
| 211 |