| 1 | module regex_api |
| 2 | !> Public API for the FERP regex engine |
| 3 | !> Provides high-level interface for pattern compilation and matching |
| 4 | use regex_types |
| 5 | use regex_lexer |
| 6 | use regex_parser |
| 7 | use regex_nfa |
| 8 | use regex_engine |
| 9 | use regex_optimizer |
| 10 | use ferp_kinds, only: pattern_len |
| 11 | implicit none |
| 12 | private |
| 13 | |
| 14 | public :: regex_t |
| 15 | public :: regex_compile, regex_match, regex_search |
| 16 | public :: regex_free, regex_error_message |
| 17 | public :: match_result_t |
| 18 | |
!> Handle for one compiled regular expression.
!>
!> All components are private; callers work with it through regex_compile,
!> regex_match, regex_search, regex_free, regex_error_message and the
!> type-bound is_compiled query.
type :: regex_t
  private
  ! Automata produced by regex_compile: the plain NFA, its optimized
  ! counterpart (the one the matching routines actually run), and the
  ! AST pool filled by the parser.
  type(nfa_t) :: nfa
  type(optimized_nfa_t) :: opt_nfa
  type(ast_pool_t) :: ast_pool
  ! compiled: set to .true. only after a successful regex_compile.
  ! is_ere:   the syntax flag regex_compile was called with.
  logical :: compiled = .false., is_ere = .false.
  ! error_code: 0 on success, otherwise the failing compile stage (1-3).
  ! num_groups: group count reported by the parser.
  integer :: error_code = 0, num_groups = 0
  ! Text describing the last compile failure; blank when none occurred.
  character(len=256) :: error_msg = ''
  ! Copy of the source pattern (kept for Aho-Corasick detection).
  ! Fixed length: a pattern longer than 4096 characters is truncated here.
  character(len=4096) :: pattern = ''
contains
  procedure :: is_compiled => regex_is_compiled
end type regex_t
| 34 | |
| 35 | contains |
| 36 | |
subroutine regex_compile(re, pattern, is_ere, ierr)
  !> Compile a regex pattern into a ready-to-match regex_t.
  !>
  !> Pipeline: tokenize -> parse -> build NFA -> optimize NFA -> attempt
  !> Aho-Corasick construction. The first stage that reports a non-zero
  !> status aborts the compile.
  !>
  !> Arguments:
  !>   re      - receives the compiled regex; reset on entry (intent(out)).
  !>   pattern - pattern text. A pattern whose pattern_len is 0 compiles
  !>             to a two-state NFA joined by an epsilon transition.
  !>   is_ere  - optional syntax flag handed to the tokenizer and to the
  !>             Aho-Corasick builder; treated as .false. when absent.
  !>   ierr    - 0 on success, otherwise the status returned by the stage
  !>             that failed.
  !>
  !> On failure re%compiled stays .false. and re%error_code is 1
  !> (tokenization), 2 (parse) or 3 (NFA construction), with a matching
  !> text in re%error_msg.
  type(regex_t), intent(out) :: re
  character(len=*), intent(in) :: pattern
  logical, intent(in), optional :: is_ere
  integer, intent(out) :: ierr

  type(token_list_t) :: tokens
  integer :: root_idx
  logical :: extended

  ierr = 0
  re%compiled = .false.
  re%error_code = 0
  re%error_msg = ''

  if (present(is_ere)) then
    extended = is_ere
  else
    extended = .false.
  end if
  re%is_ere = extended

  ! Empty pattern: pattern_len (rather than a trimmed length) is used so
  ! that whitespace-only patterns are not mistaken for empty ones.
  if (pattern_len(pattern) == 0) then
    call build_empty_match_nfa(re%nfa)
    call optimize_nfa(re%opt_nfa, re%nfa)
    re%compiled = .true.
    re%num_groups = 0
    return
  end if

  ! Stage 1: tokenize
  call tokenize(pattern, tokens, extended, ierr)
  if (ierr /= 0) then
    call record_failure(1, 'Invalid pattern: tokenization failed')
    return
  end if

  ! Stage 2: parse
  call parse(tokens, re%ast_pool, root_idx, re%num_groups, ierr)
  if (ierr /= 0) then
    call record_failure(2, 'Invalid pattern: parse failed')
    return
  end if

  ! Stage 3: build the NFA
  call build_nfa(re%ast_pool, root_idx, re%nfa, ierr)
  if (ierr /= 0) then
    call record_failure(3, 'Invalid pattern: NFA construction failed')
    return
  end if

  ! Stage 4: derive the optimized NFA used by the matching routines
  call optimize_nfa(re%opt_nfa, re%nfa)

  ! Keep a copy of the pattern and try Aho-Corasick for alternation patterns.
  ! NOTE(review): the meaning of the trailing .false. is defined by
  ! try_build_aho_corasick and is not visible here - confirm.
  re%pattern = pattern
  call try_build_aho_corasick(re%opt_nfa, pattern, extended, .false.)

  re%compiled = .true.

contains

  subroutine record_failure(stage_code, message)
    !> Store the failing stage number and its message in the host's re.
    integer, intent(in) :: stage_code
    character(len=*), intent(in) :: message
    re%error_code = stage_code
    re%error_msg = message
  end subroutine record_failure

  subroutine build_empty_match_nfa(nfa)
    !> Build the NFA for the empty pattern: a start state and an accepting
    !> state linked by a single epsilon transition.
    type(nfa_t), intent(inout) :: nfa
    type(nfa_transition_t) :: eps_link

    call nfa%init()
    nfa%start_state = nfa%add_state()
    nfa%accept_state = nfa%add_state()
    nfa%states(nfa%accept_state)%is_accept = .true.

    eps_link%trans_type = TRANS_EPSILON
    eps_link%target = nfa%accept_state
    call nfa%states(nfa%start_state)%add_trans(eps_link)
  end subroutine build_empty_match_nfa

end subroutine regex_compile
| 115 | |
function regex_match(re, text, ignore_case) result(matched)
  !> Report whether the pattern matches anywhere in text.
  !>
  !> Arguments:
  !>   re          - regex to run; declared inout (per the original note,
  !>                 for the DFA cache).
  !>   text        - subject string to search.
  !>   ignore_case - optional; treated as .false. when absent.
  !>
  !> Returns .false. without searching when re has not been compiled.
  type(regex_t), intent(inout) :: re
  character(len=*), intent(in) :: text
  logical, intent(in), optional :: ignore_case
  logical :: matched

  type(match_result_t) :: outcome
  logical :: fold_case

  matched = .false.

  if (re%compiled) then
    if (present(ignore_case)) then
      fold_case = ignore_case
    else
      fold_case = .false.
    end if

    ! Optimized search (bit vectors and prefix skip); only the flag is kept.
    outcome = optimized_search(re%opt_nfa, text, fold_case)
    matched = outcome%matched
  end if

end function regex_match
| 137 | |
function regex_search(re, text, ignore_case) result(res)
  !> Search text for the pattern and return the full match result.
  !>
  !> Arguments:
  !>   re          - regex to run; declared inout (per the original note,
  !>                 for the DFA cache).
  !>   text        - subject string to search.
  !>   ignore_case - optional; treated as .false. when absent.
  !>
  !> When re has not been compiled, only res%matched is set (to .false.)
  !> and no search is performed.
  type(regex_t), intent(inout) :: re
  character(len=*), intent(in) :: text
  logical, intent(in), optional :: ignore_case
  type(match_result_t) :: res

  logical :: fold_case

  res%matched = .false.

  if (re%compiled) then
    if (present(ignore_case)) then
      fold_case = ignore_case
    else
      fold_case = .false.
    end if

    ! Optimized search (bit vectors and prefix skip).
    res = optimized_search(re%opt_nfa, text, fold_case)
  end if

end function regex_search
| 157 | |
subroutine regex_free(re)
  !> Release the resources held by a regex and mark it as not compiled.
  !>
  !> Runs the cleanup procedures of the NFA and of the AST pool. The error
  !> fields, group count and stored pattern are left as they were.
  !> NOTE(review): opt_nfa is not explicitly released here - confirm
  !> whether optimized_nfa_t needs its own cleanup.
  type(regex_t), intent(inout) :: re

  ! Flag first, so the handle reads as unusable from here on.
  re%compiled = .false.

  call re%nfa%cleanup()
  call re%ast_pool%cleanup()

end subroutine regex_free
| 167 | |
function regex_error_message(re) result(msg)
  !> Return the message recorded by the last compilation of re.
  !>
  !> The result is a fixed 256-character string, blank-padded; it is all
  !> blanks when no failure was recorded.
  type(regex_t), intent(in) :: re
  character(len=256) :: msg

  msg = re%error_msg

end function regex_error_message
| 174 | |
function regex_is_compiled(this) result(ready)
  !> Type-bound query (regex_t%is_compiled): .true. when the regex has been
  !> compiled successfully and not freed since.
  class(regex_t), intent(in) :: this
  logical :: ready

  ready = this%compiled

end function regex_is_compiled
| 180 | |
| 181 | end module regex_api |
| 182 |