fortrangoingonforty/ferp / a3c0ed6

Browse files

Implement backreference support and Latin-1 case folding

Backreferences:
- Add has_backrefs flag to detect patterns with \1-\9
- Implement backtracking matcher for patterns with backreferences
- Track captured group start/end positions during matching
- Support case-insensitive backreference matching

Case folding:
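- Extend to_lower_char() to handle Latin-1 uppercase letters (192-214 and 216-222; 215, the multiplication sign, is left unchanged)
- Add UTF-8 continuation byte handling for Latin-1 Supplement letters (U+00C0-U+00DE, encoded as 0xC3 followed by 0x80-0x9E)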
- Enables case-insensitive matching for accented characters (é, ñ, etc.)
Authored by mfwolffe <wolffemf@dukes.jmu.edu>
SHA
a3c0ed65f8ce1869344b6196ae4f61d1a03ec033
Parents
4dc60e3
Tree
4b61a2c

2 changed files

StatusFile+-
M src/regex/regex_engine.f90 15 0
M src/regex/regex_optimizer.f90 329 1
src/regex/regex_engine.f90modified
@@ -311,6 +311,21 @@ contains
311311
 
312312
     ic = ichar(c)
313313
     if (ic >= ichar('A') .and. ic <= ichar('Z')) then
314
+      ! ASCII uppercase A-Z -> a-z
315
+      lower = char(ic + 32)
316
+    else if (ic >= 192 .and. ic <= 214) then
317
+      ! Latin-1 uppercase À-Ö (192-214) -> à-ö (224-246)
318
+      lower = char(ic + 32)
319
+    else if (ic >= 216 .and. ic <= 222) then
320
+      ! Latin-1 uppercase Ø-Þ (216-222) -> ø-þ (248-254)
321
+      lower = char(ic + 32)
322
+    else if (ic >= 128 .and. ic <= 150) then
323
+      ! UTF-8 continuation byte for uppercase Latin Extended-A (U+00C0-U+00D6)
324
+      ! When preceded by 0xC3, these represent À-Ö, fold to à-ö
325
+      lower = char(ic + 32)
326
+    else if (ic >= 152 .and. ic <= 158) then
327
+      ! UTF-8 continuation byte for uppercase Latin Extended-A (U+00D8-U+00DE)
328
+      ! When preceded by 0xC3, these represent Ø-Þ, fold to ø-þ
314329
       lower = char(ic + 32)
315330
     else
316331
       lower = c
src/regex/regex_optimizer.f90modified
@@ -83,6 +83,7 @@ module regex_optimizer
8383
     logical :: use_dfa = .false.                 ! Use DFA instead of NFA
8484
     type(ac_automaton_t) :: ac                   ! Aho-Corasick automaton (for alternation)
8585
     logical :: use_aho_corasick = .false.        ! Use Aho-Corasick for matching
86
+    logical :: has_backrefs = .false.            ! Pattern contains backreferences
8687
     logical :: optimized = .false.
8788
   end type optimized_nfa_t
8889
 
@@ -193,6 +194,10 @@ contains
193194
     opt%anchored_start = .false.
194195
     opt%anchored_end = .false.
195196
     opt%use_dfa = .false.
197
+    opt%has_backrefs = .false.
198
+
199
+    ! Detect backreferences in NFA
200
+    opt%has_backrefs = has_backref_transitions(nfa)
196201
 
197202
     ! Extract literal prefix and detect anchors
198203
     call extract_prefix_and_anchors(opt)
@@ -213,7 +218,8 @@ contains
213218
 
214219
     ! Try to compile full DFA for O(n) matching
215220
     ! Only for patterns without any position-dependent transitions (anchors)
216
-    if (.not. has_anchor_transitions(opt%nfa)) then
221
+    ! and without backreferences (which require backtracking)
222
+    if (.not. has_anchor_transitions(opt%nfa) .and. .not. opt%has_backrefs) then
217223
       call compile_dfa(opt)
218224
       ! DEBUG: Print DFA compilation result (uncomment for debugging)
219225
       ! write(0,*) 'DFA compiled:', opt%use_dfa, 'states:', opt%dfa%num_states, 'too_large:', opt%dfa%too_large
@@ -388,6 +394,28 @@ contains
388394
     end do
389395
   end function has_anchor_transitions
390396
 
397
function has_backref_transitions(nfa) result(has_backrefs)
  !> Report whether the NFA contains at least one backreference transition.
  !>
  !> A backreference is recognised here as a TRANS_EPSILON transition whose
  !> anchor_type is negative. The scan stops at the first such transition.
  !>
  !> nfa           : automaton whose states 1..num_states are inspected
  !> has_backrefs  : .true. if any state has a matching transition
  type(nfa_t), intent(in) :: nfa
  logical :: has_backrefs

  integer :: s, t

  has_backrefs = .false.

  scan_states: do s = 1, nfa%num_states
    do t = 1, nfa%states(s)%num_trans
      ! Only epsilon transitions can carry a backreference marker
      if (nfa%states(s)%trans(t)%trans_type /= TRANS_EPSILON) cycle
      if (nfa%states(s)%trans(t)%anchor_type < 0) then
        has_backrefs = .true.
        exit scan_states
      end if
    end do
  end do scan_states
end function has_backref_transitions
418
+
391419
   !---------------------------------------------------------------------------
392420
   ! Character Equivalence Classes
393421
   !---------------------------------------------------------------------------
@@ -1025,6 +1053,12 @@ contains
10251053
 
10261054
     if (opt%nfa%num_states == 0) return
10271055
 
1056
+    ! Backtracking path: use backtracking matcher for patterns with backreferences
1057
+    if (opt%has_backrefs) then
1058
+      res = backtrack_search(opt%nfa, text, text_len, ignore_case)
1059
+      return
1060
+    end if
1061
+
10281062
     ! Fast path: use DFA if available (O(n) matching)
10291063
     ! DFA is case-sensitive; case-insensitive matching falls through to NFA path
10301064
     if (opt%use_dfa .and. .not. ignore_case) then
@@ -1510,6 +1544,21 @@ contains
15101544
     integer :: ic
15111545
     ic = ichar(c)
15121546
     if (ic >= ichar('A') .and. ic <= ichar('Z')) then
1547
+      ! ASCII uppercase A-Z -> a-z
1548
+      lower = char(ic + 32)
1549
+    else if (ic >= 192 .and. ic <= 214) then
1550
+      ! Latin-1 uppercase À-Ö (192-214) -> à-ö (224-246)
1551
+      lower = char(ic + 32)
1552
+    else if (ic >= 216 .and. ic <= 222) then
1553
+      ! Latin-1 uppercase Ø-Þ (216-222) -> ø-þ (248-254)
1554
+      lower = char(ic + 32)
1555
+    else if (ic >= 128 .and. ic <= 150) then
1556
+      ! UTF-8 continuation byte for uppercase Latin Extended-A (U+00C0-U+00D6)
1557
+      ! When preceded by 0xC3, these represent À-Ö, fold to à-ö
1558
+      lower = char(ic + 32)
1559
+    else if (ic >= 152 .and. ic <= 158) then
1560
+      ! UTF-8 continuation byte for uppercase Latin Extended-A (U+00D8-U+00DE)
1561
+      ! When preceded by 0xC3, these represent Ø-Þ, fold to ø-þ
15131562
       lower = char(ic + 32)
15141563
     else
15151564
       lower = c
@@ -1748,4 +1797,283 @@ contains
17481797
 
17491798
   end function ac_optimized_search
17501799
 
1800
+  !---------------------------------------------------------------------------
1801
+  ! Backtracking Matcher for Backreferences
1802
+  !---------------------------------------------------------------------------
1803
+
1804
function backtrack_search(nfa, text, text_len, ignore_case) result(res)
  !> Find the leftmost match of a backreference pattern in text.
  !>
  !> Candidate start positions 1 .. text_len + 1 are tried in order with
  !> backtrack_match; the result of the first successful attempt is returned.
  !> If no attempt succeeds only res%matched is assigned (.false.).
  !>
  !> nfa         : automaton to run
  !> text        : subject string
  !> text_len    : number of characters of text to consider
  !> ignore_case : forwarded unchanged to backtrack_match
  type(nfa_t), intent(in) :: nfa
  character(len=*), intent(in) :: text
  integer, intent(in) :: text_len
  logical, intent(in) :: ignore_case
  type(match_result_t) :: res

  integer :: origin
  type(match_result_t) :: attempt

  res%matched = .false.

  origin = 1
  do while (origin <= text_len + 1)
    attempt = backtrack_match(nfa, text, text_len, origin, ignore_case)
    if (attempt%matched) then
      res = attempt
      exit
    end if
    origin = origin + 1
  end do

end function backtrack_search
1827
+
1828
function backtrack_match(nfa, text, text_len, start_pos, ignore_case) result(res)
  !> Attempt a match anchored at start_pos using the backtracking matcher.
  !>
  !> Capture positions for groups 1..9 start out as 0 and the furthest match
  !> end starts at start_pos - 1; backtrack_from_state updates them while it
  !> walks the NFA from nfa%start_state. On success the match span and both
  !> capture arrays are copied into the result; on failure only res%matched
  !> is assigned.
  !>
  !> nfa         : automaton to run
  !> text        : subject string
  !> text_len    : number of characters of text to consider
  !> start_pos   : 1-based position at which the match must begin
  !> ignore_case : forwarded unchanged to backtrack_from_state
  type(nfa_t), intent(in) :: nfa
  character(len=*), intent(in) :: text
  integer, intent(in) :: text_len, start_pos
  logical, intent(in) :: ignore_case
  type(match_result_t) :: res

  integer :: cap_first(9), cap_last(9)
  integer :: furthest
  logical :: found

  cap_first = 0
  cap_last = 0
  furthest = start_pos - 1

  found = backtrack_from_state(nfa, nfa%start_state, text, text_len, start_pos, &
                               ignore_case, cap_first, cap_last, furthest)

  res%matched = found
  if (.not. found) return

  res%match_start = start_pos
  res%match_end = furthest
  res%group_starts = cap_first
  res%group_ends = cap_last

end function backtrack_match
1856
+
1857
recursive function backtrack_from_state(nfa, state, text, text_len, pos, &
                                        ignore_case, group_starts, group_ends, best_end) result(matched)
  !> Backtracking match from a given NFA state (entry point).
  !>
  !> Explores every transition reachable from state with the text cursor at
  !> pos and reports whether an accepting state can be reached. All branches
  !> are explored, so best_end ends up holding the largest end position of
  !> any accepting path (it is only ever increased).
  !>
  !> nfa          : automaton to walk
  !> state        : state to start from; out-of-range values yield .false.
  !> text         : subject string
  !> text_len     : number of characters of text to consider
  !> pos          : 1-based position of the next character to consume
  !> ignore_case  : fold case for literal, class and backreference comparisons
  !> group_starts : start positions of groups 1..9 (0 = never opened)
  !> group_ends   : end positions of groups 1..9 (inclusive)
  !> best_end     : furthest match end seen so far (inclusive)
  !> matched      : .true. if some path from state reaches an accepting state
  !>
  !> The group arrays are restored whenever a branch fails; after a success
  !> they hold what the last successful branch recorded.
  !>
  !> This wrapper allocates the per-state bookkeeping used to detect
  !> zero-width cycles and delegates the search to backtrack_walk.
  type(nfa_t), intent(in) :: nfa
  integer, intent(in) :: state
  character(len=*), intent(in) :: text
  integer, intent(in) :: text_len, pos
  logical, intent(in) :: ignore_case
  integer, intent(inout) :: group_starts(9), group_ends(9)
  integer, intent(inout) :: best_end
  logical :: matched

  integer, allocatable :: entered_at(:)

  ! entered_at(s) is the text position at which state s was most recently
  ! entered on the active recursion path; -huge marks "not on the path".
  allocate(entered_at(max(nfa%num_states, 0)))
  entered_at = -huge(pos)

  matched = backtrack_walk(nfa, state, text, text_len, pos, ignore_case, &
                           group_starts, group_ends, best_end, entered_at)

end function backtrack_from_state

recursive function backtrack_walk(nfa, state, text, text_len, pos, ignore_case, &
                                  group_starts, group_ends, best_end, entered_at) result(matched)
  !> Recursive worker behind backtrack_from_state.
  !>
  !> Arguments are those of backtrack_from_state plus entered_at, which is
  !> indexed by state and is used to cut paths that come back to the same
  !> state without having consumed any text (such a path would otherwise
  !> recurse without bound).
  type(nfa_t), intent(in) :: nfa
  integer, intent(in) :: state
  character(len=*), intent(in) :: text
  integer, intent(in) :: text_len, pos
  logical, intent(in) :: ignore_case
  integer, intent(inout) :: group_starts(9), group_ends(9)
  integer, intent(inout) :: best_end
  integer, intent(inout) :: entered_at(:)
  logical :: matched

  integer :: i, target, advance
  integer :: open_group, close_group, old_start, old_end, prev_entry
  integer :: backref_num, ref_start, ref_end, ref_len
  integer :: saved_starts(9), saved_ends(9)
  character(len=1) :: c

  matched = .false.

  if (state < 1 .or. state > nfa%num_states) return

  ! Zero-width cycle: this state is already active at the same position,
  ! so following it again cannot make progress.
  if (entered_at(state) == pos) return
  prev_entry = entered_at(state)
  entered_at(state) = pos

  ! Record group start if this state starts a group
  open_group = nfa%states(state)%group_start
  old_start = 0
  if (open_group > 0 .and. open_group <= 9) then
    old_start = group_starts(open_group)
    group_starts(open_group) = pos
  end if

  ! Record group end if this state ends a group. This happens before the
  ! transitions are tried so that a backreference leaving this state
  ! already sees the captured text.
  close_group = nfa%states(state)%group_end
  old_end = 0
  if (close_group > 0 .and. close_group <= 9) then
    old_end = group_ends(close_group)
    group_ends(close_group) = pos - 1
  end if

  ! An accepting state is a match, but keep exploring for a longer one
  if (nfa%states(state)%is_accept) then
    matched = .true.
    if (pos - 1 > best_end) best_end = pos - 1
  end if

  do i = 1, nfa%states(state)%num_trans
    ! advance = number of characters the transition consumes, or -1 when
    ! the transition cannot be taken at this position.
    advance = -1

    associate (tr => nfa%states(state)%trans(i))
      target = tr%target

      select case (tr%trans_type)
        case (TRANS_EPSILON)
          if (tr%anchor_type < 0) then
            ! Backreference: negative anchor_type encodes the group number
            backref_num = -tr%anchor_type
            if (backref_num >= 1 .and. backref_num <= 9) then
              ref_start = group_starts(backref_num)
              ref_end = group_ends(backref_num)

              ! ref_start == 0 means the group was never opened. A group
              ! that captured the empty string has ref_end == ref_start - 1
              ! and must match the empty string rather than fail.
              ! NOTE(review): a group that is still open can look like an
              ! empty capture here; that only affects a backreference placed
              ! inside its own group.
              if (ref_start >= 1 .and. ref_end >= ref_start - 1) then
                ref_len = ref_end - ref_start + 1
                if (ref_len == 0) then
                  advance = 0
                else if (pos + ref_len - 1 <= text_len) then
                  if (ignore_case) then
                    if (strings_equal_icase(text(pos:pos+ref_len-1), &
                                            text(ref_start:ref_end))) advance = ref_len
                  else
                    if (text(pos:pos+ref_len-1) == text(ref_start:ref_end)) advance = ref_len
                  end if
                end if
              end if
            end if
          else
            ! Regular epsilon transition
            advance = 0
          end if

        case (TRANS_ANCHOR)
          if (anchor_matches_opt(tr%anchor_type, text, pos, text_len)) advance = 0

        case (TRANS_CHAR)
          if (pos <= text_len) then
            c = text(pos:pos)
            if (ignore_case) then
              if (to_lower_char(c) == to_lower_char(tr%match_char)) advance = 1
            else
              if (c == tr%match_char) advance = 1
            end if
          end if

        case (TRANS_CLASS)
          if (pos <= text_len) then
            c = text(pos:pos)
            if (ignore_case) then
              if (charclass_test_case_insensitive(tr%char_bits, c)) advance = 1
            else
              if (charclass_test(tr%char_bits, c)) advance = 1
            end if
          end if

        case (TRANS_ANY)
          ! Dot matches any character except newline
          if (pos <= text_len) then
            if (text(pos:pos) /= char(10)) advance = 1
          end if

      end select
    end associate

    if (advance < 0) cycle

    ! Follow the transition; undo any capture changes if the branch fails
    saved_starts = group_starts
    saved_ends = group_ends
    if (backtrack_walk(nfa, target, text, text_len, pos + advance, ignore_case, &
                       group_starts, group_ends, best_end, entered_at)) then
      matched = .true.
    else
      group_starts = saved_starts
      group_ends = saved_ends
    end if
  end do

  ! Restore this state's own group start and end if nothing matched
  if (.not. matched) then
    if (open_group > 0 .and. open_group <= 9) group_starts(open_group) = old_start
    if (close_group > 0 .and. close_group <= 9) group_ends(close_group) = old_end
  end if

  ! This state leaves the active path
  entered_at(state) = prev_entry

end function backtrack_walk
2061
+
2062
function strings_equal_icase(s1, s2) result(equal)
  !> Case-insensitive equality of two strings.
  !>
  !> Strings of different length are never equal. Otherwise the strings are
  !> compared character by character after folding each one with
  !> to_lower_char; two empty strings compare equal.
  character(len=*), intent(in) :: s1, s2
  logical :: equal
  integer :: k

  equal = (len(s1) == len(s2))
  if (.not. equal) return

  do k = 1, len(s1)
    if (to_lower_char(s1(k:k)) /= to_lower_char(s2(k:k))) then
      equal = .false.
      exit
    end if
  end do
end function strings_equal_icase
2078
+
17512079
 end module regex_optimizer