@@ -83,6 +83,7 @@ module regex_optimizer |
| 83 | 83 | logical :: use_dfa = .false. ! Use DFA instead of NFA |
| 84 | 84 | type(ac_automaton_t) :: ac ! Aho-Corasick automaton (for alternation) |
| 85 | 85 | logical :: use_aho_corasick = .false. ! Use Aho-Corasick for matching |
| 86 | + logical :: has_backrefs = .false. ! Pattern contains backreferences |
| 86 | 87 | logical :: optimized = .false. |
| 87 | 88 | end type optimized_nfa_t |
| 88 | 89 | |
@@ -193,6 +194,10 @@ contains |
| 193 | 194 | opt%anchored_start = .false. |
| 194 | 195 | opt%anchored_end = .false. |
| 195 | 196 | opt%use_dfa = .false. |
| 197 | + opt%has_backrefs = .false. |
| 198 | + |
| 199 | + ! Detect backreferences in NFA |
| 200 | + opt%has_backrefs = has_backref_transitions(nfa) |
| 196 | 201 | |
| 197 | 202 | ! Extract literal prefix and detect anchors |
| 198 | 203 | call extract_prefix_and_anchors(opt) |
@@ -213,7 +218,8 @@ contains |
| 213 | 218 | |
| 214 | 219 | ! Try to compile full DFA for O(n) matching |
| 215 | 220 | ! Only for patterns without any position-dependent transitions (anchors) |
| 216 | | - if (.not. has_anchor_transitions(opt%nfa)) then |
| 221 | + ! and without backreferences (which require backtracking) |
| 222 | + if (.not. has_anchor_transitions(opt%nfa) .and. .not. opt%has_backrefs) then |
| 217 | 223 | call compile_dfa(opt) |
| 218 | 224 | ! DEBUG: Print DFA compilation result (uncomment for debugging) |
| 219 | 225 | ! write(0,*) 'DFA compiled:', opt%use_dfa, 'states:', opt%dfa%num_states, 'too_large:', opt%dfa%too_large |
@@ -388,6 +394,28 @@ contains |
| 388 | 394 | end do |
| 389 | 395 | end function has_anchor_transitions |
| 390 | 396 | |
function has_backref_transitions(nfa) result(has_backrefs)
    !> Report whether the NFA contains at least one backreference transition.
    !> A backreference is recognised as a TRANS_EPSILON transition whose
    !> anchor_type is negative.
    type(nfa_t), intent(in) :: nfa   !< automaton whose transitions are scanned
    logical :: has_backrefs          !< .true. if any backreference transition exists

    integer :: s, t

    has_backrefs = .false.

    ! Walk every transition of every state; stop at the first backreference.
    scan_states: do s = 1, nfa%num_states
        do t = 1, nfa%states(s)%num_trans
            if (nfa%states(s)%trans(t)%trans_type /= TRANS_EPSILON) cycle
            if (nfa%states(s)%trans(t)%anchor_type >= 0) cycle
            has_backrefs = .true.
            exit scan_states
        end do
    end do scan_states
end function has_backref_transitions
| 418 | + |
| 391 | 419 | !--------------------------------------------------------------------------- |
| 392 | 420 | ! Character Equivalence Classes |
| 393 | 421 | !--------------------------------------------------------------------------- |
@@ -1025,6 +1053,12 @@ contains |
| 1025 | 1053 | |
| 1026 | 1054 | if (opt%nfa%num_states == 0) return |
| 1027 | 1055 | |
| 1056 | + ! Backtracking path: use backtracking matcher for patterns with backreferences |
| 1057 | + if (opt%has_backrefs) then |
| 1058 | + res = backtrack_search(opt%nfa, text, text_len, ignore_case) |
| 1059 | + return |
| 1060 | + end if |
| 1061 | + |
| 1028 | 1062 | ! Fast path: use DFA if available (O(n) matching) |
| 1029 | 1063 | ! DFA is case-sensitive; case-insensitive matching falls through to NFA path |
| 1030 | 1064 | if (opt%use_dfa .and. .not. ignore_case) then |
@@ -1510,6 +1544,21 @@ contains |
| 1510 | 1544 | integer :: ic |
| 1511 | 1545 | ic = ichar(c) |
| 1512 | 1546 | if (ic >= ichar('A') .and. ic <= ichar('Z')) then |
| 1547 | + ! ASCII uppercase A-Z -> a-z |
| 1548 | + lower = char(ic + 32) |
| 1549 | + else if (ic >= 192 .and. ic <= 214) then |
| 1550 | + ! Latin-1 uppercase À-Ö (192-214) -> à-ö (224-246) |
| 1551 | + lower = char(ic + 32) |
| 1552 | + else if (ic >= 216 .and. ic <= 222) then |
| 1553 | + ! Latin-1 uppercase Ø-Þ (216-222) -> ø-þ (248-254) |
| 1554 | + lower = char(ic + 32) |
| 1555 | + else if (ic >= 128 .and. ic <= 150) then |
| 1556 | + ! UTF-8 continuation byte of an uppercase Latin-1 Supplement letter (U+00C0-U+00D6) |
| 1557 | + ! After a 0xC3 lead byte these bytes encode À-Ö and +32 gives à-ö; NOTE: the lead byte is not checked here |
| 1558 | + lower = char(ic + 32) |
| 1559 | + else if (ic >= 152 .and. ic <= 158) then |
| 1560 | + ! UTF-8 continuation byte of an uppercase Latin-1 Supplement letter (U+00D8-U+00DE) |
| 1561 | + ! After a 0xC3 lead byte these bytes encode Ø-Þ and +32 gives ø-þ; NOTE: the lead byte is not checked here |
| 1513 | 1562 | lower = char(ic + 32) |
| 1514 | 1563 | else |
| 1515 | 1564 | lower = c |
@@ -1748,4 +1797,283 @@ contains |
| 1748 | 1797 | |
| 1749 | 1798 | end function ac_optimized_search |
| 1750 | 1799 | |
| 1800 | + !--------------------------------------------------------------------------- |
| 1801 | + ! Backtracking Matcher for Backreferences |
| 1802 | + !--------------------------------------------------------------------------- |
| 1803 | + |
function backtrack_search(nfa, text, text_len, ignore_case) result(res)
    !> Unanchored search for patterns that contain backreferences.
    !> Candidate start positions 1 .. text_len+1 are tried in increasing order
    !> (text_len+1 allows an empty match at the very end of the text); the
    !> result of the first position at which backtrack_match succeeds is returned.
    !> If no position matches, only res%matched is assigned (to .false.).
    type(nfa_t), intent(in) :: nfa            !< automaton to run
    character(len=*), intent(in) :: text      !< subject text
    integer, intent(in) :: text_len           !< number of characters of text to consider
    logical, intent(in) :: ignore_case        !< forwarded to the matcher
    type(match_result_t) :: res

    integer :: candidate
    type(match_result_t) :: attempt

    res%matched = .false.

    candidate = 1
    do while (candidate <= text_len + 1)
        attempt = backtrack_match(nfa, text, text_len, candidate, ignore_case)
        if (attempt%matched) then
            ! Leftmost successful start wins; later positions are not examined.
            res = attempt
            exit
        end if
        candidate = candidate + 1
    end do

end function backtrack_search
| 1827 | + |
function backtrack_match(nfa, text, text_len, start_pos, ignore_case) result(res)
    !> Attempt a match that begins exactly at start_pos.
    !> Capture slots are zero-initialised and the furthest match end is seeded
    !> with start_pos - 1 (the end of an empty match) before the recursive
    !> backtracker is started from the NFA's start state.
    !> On success the result carries the match span and the capture arrays as
    !> left by backtrack_from_state; on failure only res%matched is assigned.
    type(nfa_t), intent(in) :: nfa            !< automaton to run
    character(len=*), intent(in) :: text      !< subject text
    integer, intent(in) :: text_len, start_pos
    logical, intent(in) :: ignore_case
    type(match_result_t) :: res

    integer :: cap_from(9), cap_to(9)   ! start / end index per capture group, 0 = unset
    integer :: furthest                 ! last text index covered by the best match so far

    cap_from = 0
    cap_to = 0
    furthest = start_pos - 1

    res%matched = backtrack_from_state(nfa, nfa%start_state, text, text_len, start_pos, &
                                       ignore_case, cap_from, cap_to, furthest)
    if (.not. res%matched) return

    res%match_start = start_pos
    res%match_end = furthest
    res%group_starts = cap_from
    res%group_ends = cap_to

end function backtrack_match
| 1856 | + |
recursive function backtrack_from_state(nfa, state, text, text_len, pos, &
        ignore_case, group_starts, group_ends, best_end) result(matched)
    !> Recursive backtracking from a given NFA state.
    !>
    !> Explores every path that leaves `state` with the text cursor at `pos`
    !> and returns .true. if at least one of them reaches an accepting state.
    !>
    !> nfa          automaton being executed
    !> state        state to start from; out-of-range values simply fail
    !> text         subject text
    !> text_len     number of characters of `text` that may be consumed
    !> pos          index of the next character to consume
    !> ignore_case  fold case when comparing characters, classes and backrefs
    !> group_starts / group_ends
    !>              capture slots (0 = unset).  On failure they are returned
    !>              exactly as passed in.  On success they hold the captures
    !>              of the path that produced the final `best_end` for this
    !>              call (the first such path when several tie).
    !> best_end     raised to the largest index of a last-consumed character
    !>              over all accepting paths found (longest match); it is
    !>              never lowered.
    !>
    !> The work is done by backtrack_explore, which additionally carries a
    !> counter of consecutive zero-width steps used to cut epsilon cycles.
    type(nfa_t), intent(in) :: nfa
    integer, intent(in) :: state
    character(len=*), intent(in) :: text
    integer, intent(in) :: text_len, pos
    logical, intent(in) :: ignore_case
    integer, intent(inout) :: group_starts(9), group_ends(9)
    integer, intent(inout) :: best_end
    logical :: matched

    matched = backtrack_explore(nfa, state, text, text_len, pos, ignore_case, &
                                group_starts, group_ends, best_end, 0)

end function backtrack_from_state

recursive function backtrack_explore(nfa, state, text, text_len, pos, ignore_case, &
        group_starts, group_ends, best_end, idle_steps) result(matched)
    !> Private worker behind backtrack_from_state (same contract, plus
    !> `idle_steps`: the number of transitions taken since the cursor last
    !> advanced).
    !>
    !> Differences from a naive "try every transition" walk:
    !>  * Every outgoing transition is explored from the same capture
    !>    snapshot, so a sibling branch never sees captures written by an
    !>    earlier, unrelated branch.
    !>  * The captures handed back are those of the path that set the final
    !>    best_end, keeping match_end and the groups consistent.
    !>  * A group that captured the empty string (end = start - 1) is a valid
    !>    backreference target and matches the empty string.
    !>  * More than nfa%num_states zero-width steps in a row can only happen
    !>    by revisiting a state at the same position, so such paths are
    !>    abandoned instead of recursing without bound.
    integer, parameter :: max_groups = 9
    type(nfa_t), intent(in) :: nfa
    integer, intent(in) :: state
    character(len=*), intent(in) :: text
    integer, intent(in) :: text_len, pos
    logical, intent(in) :: ignore_case
    integer, intent(inout) :: group_starts(max_groups), group_ends(max_groups)
    integer, intent(inout) :: best_end
    integer, intent(in) :: idle_steps
    logical :: matched

    integer :: i, grp, next_pos, next_idle, best_before
    integer :: ref_num, ref_start, ref_end, ref_len
    integer :: caller_starts(max_groups), caller_ends(max_groups)  ! captures as received
    integer :: entry_starts(max_groups), entry_ends(max_groups)    ! captures after this state's own marks
    integer :: kept_starts(max_groups), kept_ends(max_groups)      ! captures of the best path so far
    type(nfa_transition_t) :: trans
    character(len=1) :: c
    logical :: takeable, have_kept

    matched = .false.

    if (state < 1 .or. state > nfa%num_states) return

    ! Epsilon-cycle guard: an acyclic zero-width chain visits each state at
    ! most once, so it can never be longer than num_states steps.
    if (idle_steps > nfa%num_states) return

    caller_starts = group_starts
    caller_ends = group_ends

    ! Record group start if this state starts a group
    grp = nfa%states(state)%group_start
    if (grp > 0 .and. grp <= max_groups) group_starts(grp) = pos

    ! Record group end if this state ends a group.  Done before the
    ! transitions are examined so a backreference leaving this state can
    ! already see the captured text.
    grp = nfa%states(state)%group_end
    if (grp > 0 .and. grp <= max_groups) group_ends(grp) = pos - 1

    entry_starts = group_starts
    entry_ends = group_ends
    have_kept = .false.

    ! An accepting state is a match ending just before pos; keep going
    ! afterwards because a longer match may still exist.
    if (nfa%states(state)%is_accept) then
        matched = .true.
        if (pos - 1 > best_end) best_end = pos - 1
        kept_starts = entry_starts
        kept_ends = entry_ends
        have_kept = .true.
    end if

    do i = 1, nfa%states(state)%num_trans
        trans = nfa%states(state)%trans(i)

        ! Step 1: decide whether the transition can be taken here and where
        ! the cursor ends up if it is.
        takeable = .false.
        next_pos = pos

        select case (trans%trans_type)
        case (TRANS_EPSILON)
            if (trans%anchor_type < 0) then
                ! Backreference: -anchor_type is the group number.
                ref_num = -trans%anchor_type
                if (ref_num >= 1 .and. ref_num <= max_groups) then
                    ref_start = entry_starts(ref_num)
                    ref_end = entry_ends(ref_num)
                    ! Captured means start is set and end >= start - 1
                    ! (end = start - 1 is an empty capture).
                    ! NOTE(review): with 0 as the only "unset" marker, a group
                    ! opened at position 1 and not yet closed is
                    ! indistinguishable from an empty capture at position 1.
                    if (ref_start >= 1 .and. ref_end >= ref_start - 1) then
                        ref_len = ref_end - ref_start + 1
                        if (pos + ref_len - 1 <= text_len) then
                            if (ref_len == 0) then
                                takeable = .true.
                            else if (ignore_case) then
                                takeable = strings_equal_icase(text(pos:pos+ref_len-1), &
                                                               text(ref_start:ref_end))
                            else
                                takeable = (text(pos:pos+ref_len-1) == text(ref_start:ref_end))
                            end if
                            if (takeable) next_pos = pos + ref_len
                        end if
                    end if
                end if
            else
                ! Regular epsilon transition
                takeable = .true.
            end if

        case (TRANS_ANCHOR)
            takeable = anchor_matches_opt(trans%anchor_type, text, pos, text_len)

        case (TRANS_CHAR)
            if (pos <= text_len) then
                c = text(pos:pos)
                if (ignore_case) then
                    takeable = (to_lower_char(c) == to_lower_char(trans%match_char))
                else
                    takeable = (c == trans%match_char)
                end if
                if (takeable) next_pos = pos + 1
            end if

        case (TRANS_CLASS)
            if (pos <= text_len) then
                c = text(pos:pos)
                if (ignore_case) then
                    takeable = charclass_test_case_insensitive(trans%char_bits, c)
                else
                    takeable = charclass_test(trans%char_bits, c)
                end if
                if (takeable) next_pos = pos + 1
            end if

        case (TRANS_ANY)
            ! Dot matches any character except newline
            if (pos <= text_len) then
                takeable = (text(pos:pos) /= char(10))
                if (takeable) next_pos = pos + 1
            end if

        end select

        if (.not. takeable) cycle

        ! Step 2: explore the target from a clean copy of this state's captures.
        if (next_pos > pos) then
            next_idle = 0
        else
            next_idle = idle_steps + 1
        end if

        group_starts = entry_starts
        group_ends = entry_ends
        best_before = best_end

        if (backtrack_explore(nfa, trans%target, text, text_len, next_pos, ignore_case, &
                              group_starts, group_ends, best_end, next_idle)) then
            matched = .true.
            ! Adopt this branch's captures only if it produced a new longest
            ! match, or if nothing has been kept yet.
            if (best_end > best_before .or. .not. have_kept) then
                kept_starts = group_starts
                kept_ends = group_ends
                have_kept = .true.
            end if
        end if
    end do

    ! Hand back the captures of the best path, or the caller's untouched
    ! captures when nothing matched.
    if (matched) then
        group_starts = kept_starts
        group_ends = kept_ends
    else
        group_starts = caller_starts
        group_ends = caller_ends
    end if

end function backtrack_explore
| 2061 | + |
function strings_equal_icase(s1, s2) result(equal)
    !> Case-insensitive equality of two strings.
    !> Strings of different length are never equal; otherwise characters are
    !> compared position by position after folding each with to_lower_char.
    !> Two zero-length strings compare equal.
    character(len=*), intent(in) :: s1, s2
    logical :: equal
    integer :: k

    equal = (len(s1) == len(s2))
    if (.not. equal) return

    do k = 1, len(s1)
        if (to_lower_char(s1(k:k)) == to_lower_char(s2(k:k))) cycle
        ! First differing position decides the result.
        equal = .false.
        return
    end do
end function strings_equal_icase
| 2078 | + |
| 1751 | 2079 | end module regex_optimizer |