module utf8_module
    use iso_fortran_env, only: int8
    implicit none
    private

    public :: utf8_char_count, utf8_byte_to_char_index, utf8_char_to_byte_index
    public :: utf8_char_at, utf8_display_width, utf8_is_valid_start

contains

    ! Count number of UTF-8 characters (not bytes) in a string
    pure function utf8_char_count(str) result(count)
        character(len=*), intent(in) :: str
        integer :: count
        integer :: i, byte_len, char_len

        count = 0
        byte_len = len(str)
        i = 1

        do while (i <= byte_len)
            char_len = utf8_char_byte_length(str, i)
            if (char_len > 0) then
                count = count + 1
                i = i + char_len
            else
                ! Invalid UTF-8, treat as single byte
                count = count + 1
                i = i + 1
            end if
        end do
    end function utf8_char_count

    ! Convert character index to byte index
    ! char_idx is 1-based character position
    ! Returns byte position (1-based), or 0 if out of bounds
    pure function utf8_char_to_byte_index(str, char_idx) result(byte_idx)
        character(len=*), intent(in) :: str
        integer, intent(in) :: char_idx
        integer :: byte_idx
        integer :: i, char_count, char_len, byte_len

        byte_idx = 0
        if (char_idx < 1) return

        byte_len = len(str)
        char_count = 0
        i = 1

        do while (i <= byte_len)
            char_count = char_count + 1
            if (char_count == char_idx) then
                byte_idx = i
                return
            end if

            char_len = utf8_char_byte_length(str, i)
            if (char_len > 0) then
                i = i + char_len
            else
                i = i + 1
            end if
        end do

        ! If char_idx == char_count + 1, return position after last char
        if (char_idx == char_count + 1) then
            byte_idx = byte_len + 1
        end if
    end function utf8_char_to_byte_index

    ! Convert byte index to character index
    ! byte_idx is 1-based byte position
    ! Returns character position (1-based)
    pure function utf8_byte_to_char_index(str, byte_idx) result(char_idx)
        character(len=*), intent(in) :: str
        integer, intent(in) :: byte_idx
        integer :: char_idx
        integer :: i, char_len, byte_len

        char_idx = 0
        if (byte_idx < 1) return

        byte_len = len(str)
        if (byte_idx > byte_len + 1) return

        char_idx = 1
        i = 1

        do while (i < byte_idx .and. i <= byte_len)
            char_len = utf8_char_byte_length(str, i)
            if (char_len > 0) then
                i = i + char_len
            else
                i = i + 1
            end if
            if (i <= byte_idx) char_idx = char_idx + 1
        end do
    end function utf8_byte_to_char_index

    ! Get the UTF-8 character at a given character index
    ! Returns empty string if out of bounds
    function utf8_char_at(str, char_idx) result(char_str)
        character(len=*), intent(in) :: str
        integer, intent(in) :: char_idx
        character(len=:), allocatable :: char_str
        integer :: byte_idx, char_len

        byte_idx = utf8_char_to_byte_index(str, char_idx)
        if (byte_idx == 0 .or. byte_idx > len(str)) then
            allocate(character(len=0) :: char_str)
            return
        end if

        char_len = utf8_char_byte_length(str, byte_idx)
        if (char_len <= 0) char_len = 1

        ! Make sure we don't go past end of string
        char_len = min(char_len, len(str) - byte_idx + 1)

        allocate(character(len=char_len) :: char_str)
        char_str = str(byte_idx:byte_idx+char_len-1)
    end function utf8_char_at

    ! Calculate display width of a UTF-8 string
    ! (accounts for wide characters like CJK)
    pure function utf8_display_width(str) result(width)
        character(len=*), intent(in) :: str
        integer :: width
        integer :: i, char_len, byte_len, code_point

        width = 0
        byte_len = len(str)
        i = 1

        do while (i <= byte_len)
            char_len = utf8_char_byte_length(str, i)
            if (char_len > 0) then
                code_point = utf8_decode_char(str, i, char_len)
                width = width + utf8_char_width(code_point)
                i = i + char_len
            else
                ! Invalid UTF-8, count as 1 wide
                width = width + 1
                i = i + 1
            end if
        end do
    end function utf8_display_width

    ! Determine byte length of UTF-8 character starting at position i
    ! Returns 0 if invalid UTF-8 start byte
    pure function utf8_char_byte_length(str, i) result(char_len)
        character(len=*), intent(in) :: str
        integer, intent(in) :: i
        integer :: char_len
        integer :: byte_val, str_len

        str_len = len(str)

        if (i < 1 .or. i > str_len) then
            char_len = 0
            return
        end if

        byte_val = iachar(str(i:i))

        ! ASCII: 0xxxxxxx
        if (byte_val < 128) then
            char_len = 1
        ! 2-byte: 110xxxxx
        else if (iand(byte_val, int(b'11100000')) == int(b'11000000')) then
            char_len = 2
        ! 3-byte: 1110xxxx
        else if (iand(byte_val, int(b'11110000')) == int(b'11100000')) then
            char_len = 3
        ! 4-byte: 11110xxx
        else if (iand(byte_val, int(b'11111000')) == int(b'11110000')) then
            char_len = 4
        else
            ! Invalid or continuation byte
            char_len = 0
        end if

        ! Make sure we don't read past end of string
        if (char_len > 0) then
            char_len = min(char_len, str_len - i + 1)
        end if
    end function utf8_char_byte_length

    ! Check if byte is a valid UTF-8 start byte (not continuation)
    pure function utf8_is_valid_start(byte_val) result(is_start)
        integer, intent(in) :: byte_val
        logical :: is_start

        ! Continuation bytes are 10xxxxxx
        is_start = iand(byte_val, int(b'11000000')) /= int(b'10000000')
    end function utf8_is_valid_start

    ! Decode UTF-8 character to Unicode code point
    pure function utf8_decode_char(str, pos, len) result(code_point)
        character(len=*), intent(in) :: str
        integer, intent(in) :: pos, len
        integer :: code_point
        integer :: byte1, byte2, byte3, byte4

        if (len < 1 .or. pos + len - 1 > len_trim(str)) then
            code_point = 0
            return
        end if

        byte1 = iachar(str(pos:pos))

        select case(len)
        case(1)
            code_point = byte1
        case(2)
            byte2 = iachar(str(pos+1:pos+1))
            code_point = ior(ishft(iand(byte1, int(b'00011111')), 6), &
                            iand(byte2, int(b'00111111')))
        case(3)
            byte2 = iachar(str(pos+1:pos+1))
            byte3 = iachar(str(pos+2:pos+2))
            code_point = ior(ior(ishft(iand(byte1, int(b'00001111')), 12), &
                                ishft(iand(byte2, int(b'00111111')), 6)), &
                            iand(byte3, int(b'00111111')))
        case(4)
            byte2 = iachar(str(pos+1:pos+1))
            byte3 = iachar(str(pos+2:pos+2))
            byte4 = iachar(str(pos+3:pos+3))
            code_point = ior(ior(ior(ishft(iand(byte1, int(b'00000111')), 18), &
                                    ishft(iand(byte2, int(b'00111111')), 12)), &
                                ishft(iand(byte3, int(b'00111111')), 6)), &
                            iand(byte4, int(b'00111111')))
        case default
            code_point = 0
        end select
    end function utf8_decode_char

    ! Get display width of a Unicode code point
    ! Returns 0 for combining chars, 1 for normal, 2 for wide (CJK)
    pure function utf8_char_width(code_point) result(width)
        integer, intent(in) :: code_point
        integer :: width

        ! Simplified width calculation
        ! Full implementation would need Unicode data tables

        ! Control characters
        if (code_point < 32 .or. (code_point >= 127 .and. code_point < 160)) then
            width = 0
        ! CJK Unified Ideographs and other wide chars
        else if ((code_point >= int(z'1100') .and. code_point <= int(z'115F')) .or. &  ! Hangul Jamo
                 (code_point >= int(z'2E80') .and. code_point <= int(z'A4CF')) .or. &  ! CJK
                 (code_point >= int(z'AC00') .and. code_point <= int(z'D7A3')) .or. &  ! Hangul Syllables
                 (code_point >= int(z'F900') .and. code_point <= int(z'FAFF')) .or. &  ! CJK Compatibility
                 (code_point >= int(z'FE10') .and. code_point <= int(z'FE19')) .or. &  ! Vertical forms
                 (code_point >= int(z'FE30') .and. code_point <= int(z'FE6F')) .or. &  ! CJK Compatibility Forms
                 (code_point >= int(z'FF00') .and. code_point <= int(z'FF60')) .or. &  ! Fullwidth Forms
                 (code_point >= int(z'FFE0') .and. code_point <= int(z'FFE6')) .or. &  ! Fullwidth Forms
                 (code_point >= int(z'20000') .and. code_point <= int(z'2FFFD')) .or. & ! CJK Extension B-F
                 (code_point >= int(z'30000') .and. code_point <= int(z'3FFFD'))) then
            width = 2
        ! Combining characters (simplified - just a few ranges)
        else if ((code_point >= int(z'0300') .and. code_point <= int(z'036F')) .or. &  ! Combining Diacriticals
                 (code_point >= int(z'1AB0') .and. code_point <= int(z'1AFF')) .or. &  ! Combining Diacriticals Extended
                 (code_point >= int(z'1DC0') .and. code_point <= int(z'1DFF')) .or. &  ! Combining Diacriticals Supplement
                 (code_point >= int(z'20D0') .and. code_point <= int(z'20FF')) .or. &  ! Combining Diacriticals for Symbols
                 (code_point >= int(z'FE20') .and. code_point <= int(z'FE2F'))) then   ! Combining Half Marks
            width = 0
        else
            ! Normal width
            width = 1
        end if
    end function utf8_char_width

end module utf8_module