facsimile Public

Watch 0 Fork 0 Star 0

Fortran · 10150 bytes Raw Blame History

  
        1
        module utf8_module
      
        2
            use iso_fortran_env, only: int8
      
        3
            implicit none
      
        4
            private
      
        5
        
        6
            public :: utf8_char_count, utf8_byte_to_char_index, utf8_char_to_byte_index
      
        7
            public :: utf8_char_at, utf8_display_width, utf8_is_valid_start
      
        8
        
        9
        contains
      
        10
        
        11
            ! Count number of UTF-8 characters (not bytes) in a string
      
        12
            pure function utf8_char_count(str) result(count)
      
        13
                character(len=*), intent(in) :: str
      
        14
                integer :: count
      
        15
                integer :: i, byte_len, char_len
      
        16
        
        17
                count = 0
      
        18
                byte_len = len(str)
      
        19
                i = 1
      
        20
        
        21
                do while (i <= byte_len)
      
        22
                    char_len = utf8_char_byte_length(str, i)
      
        23
                    if (char_len > 0) then
      
        24
                        count = count + 1
      
        25
                        i = i + char_len
      
        26
                    else
      
        27
                        ! Invalid UTF-8, treat as single byte
      
        28
                        count = count + 1
      
        29
                        i = i + 1
      
        30
                    end if
      
        31
                end do
      
        32
            end function utf8_char_count
      
        33
        
        34
            ! Convert character index to byte index
      
        35
            ! char_idx is 1-based character position
      
        36
            ! Returns byte position (1-based), or 0 if out of bounds
      
        37
            pure function utf8_char_to_byte_index(str, char_idx) result(byte_idx)
      
        38
                character(len=*), intent(in) :: str
      
        39
                integer, intent(in) :: char_idx
      
        40
                integer :: byte_idx
      
        41
                integer :: i, char_count, char_len, byte_len
      
        42
        
        43
                byte_idx = 0
      
        44
                if (char_idx < 1) return
      
        45
        
        46
                byte_len = len(str)
      
        47
                char_count = 0
      
        48
                i = 1
      
        49
        
        50
                do while (i <= byte_len)
      
        51
                    char_count = char_count + 1
      
        52
                    if (char_count == char_idx) then
      
        53
                        byte_idx = i
      
        54
                        return
      
        55
                    end if
      
        56
        
        57
                    char_len = utf8_char_byte_length(str, i)
      
        58
                    if (char_len > 0) then
      
        59
                        i = i + char_len
      
        60
                    else
      
        61
                        i = i + 1
      
        62
                    end if
      
        63
                end do
      
        64
        
        65
                ! If char_idx == char_count + 1, return position after last char
      
        66
                if (char_idx == char_count + 1) then
      
        67
                    byte_idx = byte_len + 1
      
        68
                end if
      
        69
            end function utf8_char_to_byte_index
      
        70
        
        71
            ! Convert byte index to character index
      
        72
            ! byte_idx is 1-based byte position
      
        73
            ! Returns character position (1-based)
      
        74
            pure function utf8_byte_to_char_index(str, byte_idx) result(char_idx)
      
        75
                character(len=*), intent(in) :: str
      
        76
                integer, intent(in) :: byte_idx
      
        77
                integer :: char_idx
      
        78
                integer :: i, char_len, byte_len
      
        79
        
        80
                char_idx = 0
      
        81
                if (byte_idx < 1) return
      
        82
        
        83
                byte_len = len(str)
      
        84
                if (byte_idx > byte_len + 1) return
      
        85
        
        86
                char_idx = 1
      
        87
                i = 1
      
        88
        
        89
                do while (i < byte_idx .and. i <= byte_len)
      
        90
                    char_len = utf8_char_byte_length(str, i)
      
        91
                    if (char_len > 0) then
      
        92
                        i = i + char_len
      
        93
                    else
      
        94
                        i = i + 1
      
        95
                    end if
      
        96
                    if (i <= byte_idx) char_idx = char_idx + 1
      
        97
                end do
      
        98
            end function utf8_byte_to_char_index
      
        99
        
        100
            ! Get the UTF-8 character at a given character index
      
        101
            ! Returns empty string if out of bounds
      
        102
            function utf8_char_at(str, char_idx) result(char_str)
      
        103
                character(len=*), intent(in) :: str
      
        104
                integer, intent(in) :: char_idx
      
        105
                character(len=:), allocatable :: char_str
      
        106
                integer :: byte_idx, char_len
      
        107
        
        108
                byte_idx = utf8_char_to_byte_index(str, char_idx)
      
        109
                if (byte_idx == 0 .or. byte_idx > len(str)) then
      
        110
                    allocate(character(len=0) :: char_str)
      
        111
                    return
      
        112
                end if
      
        113
        
        114
                char_len = utf8_char_byte_length(str, byte_idx)
      
        115
                if (char_len <= 0) char_len = 1
      
        116
        
        117
                ! Make sure we don't go past end of string
      
        118
                char_len = min(char_len, len(str) - byte_idx + 1)
      
        119
        
        120
                allocate(character(len=char_len) :: char_str)
      
        121
                char_str = str(byte_idx:byte_idx+char_len-1)
      
        122
            end function utf8_char_at
      
        123
        
        124
            ! Calculate display width of a UTF-8 string
      
        125
            ! (accounts for wide characters like CJK)
      
        126
            pure function utf8_display_width(str) result(width)
      
        127
                character(len=*), intent(in) :: str
      
        128
                integer :: width
      
        129
                integer :: i, char_len, byte_len, code_point
      
        130
        
        131
                width = 0
      
        132
                byte_len = len(str)
      
        133
                i = 1
      
        134
        
        135
                do while (i <= byte_len)
      
        136
                    char_len = utf8_char_byte_length(str, i)
      
        137
                    if (char_len > 0) then
      
        138
                        code_point = utf8_decode_char(str, i, char_len)
      
        139
                        width = width + utf8_char_width(code_point)
      
        140
                        i = i + char_len
      
        141
                    else
      
        142
                        ! Invalid UTF-8, count as 1 wide
      
        143
                        width = width + 1
      
        144
                        i = i + 1
      
        145
                    end if
      
        146
                end do
      
        147
            end function utf8_display_width
      
        148
        
        149
            ! Determine byte length of UTF-8 character starting at position i
      
        150
            ! Returns 0 if invalid UTF-8 start byte
      
        151
            pure function utf8_char_byte_length(str, i) result(char_len)
      
        152
                character(len=*), intent(in) :: str
      
        153
                integer, intent(in) :: i
      
        154
                integer :: char_len
      
        155
                integer :: byte_val, str_len
      
        156
        
        157
                str_len = len(str)
      
        158
        
        159
                if (i < 1 .or. i > str_len) then
      
        160
                    char_len = 0
      
        161
                    return
      
        162
                end if
      
        163
        
        164
                byte_val = iachar(str(i:i))
      
        165
        
        166
                ! ASCII: 0xxxxxxx
      
        167
                if (byte_val < 128) then
      
        168
                    char_len = 1
      
        169
                ! 2-byte: 110xxxxx
      
        170
                else if (iand(byte_val, int(b'11100000')) == int(b'11000000')) then
      
        171
                    char_len = 2
      
        172
                ! 3-byte: 1110xxxx
      
        173
                else if (iand(byte_val, int(b'11110000')) == int(b'11100000')) then
      
        174
                    char_len = 3
      
        175
                ! 4-byte: 11110xxx
      
        176
                else if (iand(byte_val, int(b'11111000')) == int(b'11110000')) then
      
        177
                    char_len = 4
      
        178
                else
      
        179
                    ! Invalid or continuation byte
      
        180
                    char_len = 0
      
        181
                end if
      
        182
        
        183
                ! Make sure we don't read past end of string
      
        184
                if (char_len > 0) then
      
        185
                    char_len = min(char_len, str_len - i + 1)
      
        186
                end if
      
        187
            end function utf8_char_byte_length
      
        188
        
        189
            ! Check if byte is a valid UTF-8 start byte (not continuation)
      
        190
            pure function utf8_is_valid_start(byte_val) result(is_start)
      
        191
                integer, intent(in) :: byte_val
      
        192
                logical :: is_start
      
        193
        
        194
                ! Continuation bytes are 10xxxxxx
      
        195
                is_start = iand(byte_val, int(b'11000000')) /= int(b'10000000')
      
        196
            end function utf8_is_valid_start
      
        197
        
        198
            ! Decode UTF-8 character to Unicode code point
      
        199
            pure function utf8_decode_char(str, pos, len) result(code_point)
      
        200
                character(len=*), intent(in) :: str
      
        201
                integer, intent(in) :: pos, len
      
        202
                integer :: code_point
      
        203
                integer :: byte1, byte2, byte3, byte4
      
        204
        
        205
                if (len < 1 .or. pos + len - 1 > len_trim(str)) then
      
        206
                    code_point = 0
      
        207
                    return
      
        208
                end if
      
        209
        
        210
                byte1 = iachar(str(pos:pos))
      
        211
        
        212
                select case(len)
      
        213
                case(1)
      
        214
                    code_point = byte1
      
        215
                case(2)
      
        216
                    byte2 = iachar(str(pos+1:pos+1))
      
        217
                    code_point = ior(ishft(iand(byte1, int(b'00011111')), 6), &
      
        218
                                    iand(byte2, int(b'00111111')))
      
        219
                case(3)
      
        220
                    byte2 = iachar(str(pos+1:pos+1))
      
        221
                    byte3 = iachar(str(pos+2:pos+2))
      
        222
                    code_point = ior(ior(ishft(iand(byte1, int(b'00001111')), 12), &
      
        223
                                        ishft(iand(byte2, int(b'00111111')), 6)), &
      
        224
                                    iand(byte3, int(b'00111111')))
      
        225
                case(4)
      
        226
                    byte2 = iachar(str(pos+1:pos+1))
      
        227
                    byte3 = iachar(str(pos+2:pos+2))
      
        228
                    byte4 = iachar(str(pos+3:pos+3))
      
        229
                    code_point = ior(ior(ior(ishft(iand(byte1, int(b'00000111')), 18), &
      
        230
                                            ishft(iand(byte2, int(b'00111111')), 12)), &
      
        231
                                        ishft(iand(byte3, int(b'00111111')), 6)), &
      
        232
                                    iand(byte4, int(b'00111111')))
      
        233
                case default
      
        234
                    code_point = 0
      
        235
                end select
      
        236
            end function utf8_decode_char
      
        237
        
        238
            ! Get display width of a Unicode code point
      
        239
            ! Returns 0 for combining chars, 1 for normal, 2 for wide (CJK)
      
        240
            pure function utf8_char_width(code_point) result(width)
      
        241
                integer, intent(in) :: code_point
      
        242
                integer :: width
      
        243
        
        244
                ! Simplified width calculation
      
        245
                ! Full implementation would need Unicode data tables
      
        246
        
        247
                ! Control characters
      
        248
                if (code_point < 32 .or. (code_point >= 127 .and. code_point < 160)) then
      
        249
                    width = 0
      
        250
                ! CJK Unified Ideographs and other wide chars
      
        251
                else if ((code_point >= int(z'1100') .and. code_point <= int(z'115F')) .or. &  ! Hangul Jamo
      
        252
                         (code_point >= int(z'2E80') .and. code_point <= int(z'A4CF')) .or. &  ! CJK
      
        253
                         (code_point >= int(z'AC00') .and. code_point <= int(z'D7A3')) .or. &  ! Hangul Syllables
      
        254
                         (code_point >= int(z'F900') .and. code_point <= int(z'FAFF')) .or. &  ! CJK Compatibility
      
        255
                         (code_point >= int(z'FE10') .and. code_point <= int(z'FE19')) .or. &  ! Vertical forms
      
        256
                         (code_point >= int(z'FE30') .and. code_point <= int(z'FE6F')) .or. &  ! CJK Compatibility Forms
      
        257
                         (code_point >= int(z'FF00') .and. code_point <= int(z'FF60')) .or. &  ! Fullwidth Forms
      
        258
                         (code_point >= int(z'FFE0') .and. code_point <= int(z'FFE6')) .or. &  ! Fullwidth Forms
      
        259
                         (code_point >= int(z'20000') .and. code_point <= int(z'2FFFD')) .or. & ! CJK Extension B-F
      
        260
                         (code_point >= int(z'30000') .and. code_point <= int(z'3FFFD'))) then
      
        261
                    width = 2
      
        262
                ! Combining characters (simplified - just a few ranges)
      
        263
                else if ((code_point >= int(z'0300') .and. code_point <= int(z'036F')) .or. &  ! Combining Diacriticals
      
        264
                         (code_point >= int(z'1AB0') .and. code_point <= int(z'1AFF')) .or. &  ! Combining Diacriticals Extended
      
        265
                         (code_point >= int(z'1DC0') .and. code_point <= int(z'1DFF')) .or. &  ! Combining Diacriticals Supplement
      
        266
                         (code_point >= int(z'20D0') .and. code_point <= int(z'20FF')) .or. &  ! Combining Diacriticals for Symbols
      
        267
                         (code_point >= int(z'FE20') .and. code_point <= int(z'FE2F'))) then   ! Combining Half Marks
      
        268
                    width = 0
      
        269
                else
      
        270
                    ! Normal width
      
        271
                    width = 1
      
        272
                end if
      
        273
            end function utf8_char_width
      
        274
        
        275
        end module utf8_module
      
        276

1	module utf8_module
2	use iso_fortran_env, only: int8
3	implicit none
4	private
5
6	public :: utf8_char_count, utf8_byte_to_char_index, utf8_char_to_byte_index
7	public :: utf8_char_at, utf8_display_width, utf8_is_valid_start
8
9	contains
10
11	! Count number of UTF-8 characters (not bytes) in a string
12	pure function utf8_char_count(str) result(count)
13	character(len=*), intent(in) :: str
14	integer :: count
15	integer :: i, byte_len, char_len
16
17	count = 0
18	byte_len = len(str)
19	i = 1
20
21	do while (i <= byte_len)
22	char_len = utf8_char_byte_length(str, i)
23	if (char_len > 0) then
24	count = count + 1
25	i = i + char_len
26	else
27	! Invalid UTF-8, treat as single byte
28	count = count + 1
29	i = i + 1
30	end if
31	end do
32	end function utf8_char_count
33
34	! Convert character index to byte index
35	! char_idx is 1-based character position
36	! Returns byte position (1-based), or 0 if out of bounds
37	pure function utf8_char_to_byte_index(str, char_idx) result(byte_idx)
38	character(len=*), intent(in) :: str
39	integer, intent(in) :: char_idx
40	integer :: byte_idx
41	integer :: i, char_count, char_len, byte_len
42
43	byte_idx = 0
44	if (char_idx < 1) return
45
46	byte_len = len(str)
47	char_count = 0
48	i = 1
49
50	do while (i <= byte_len)
51	char_count = char_count + 1
52	if (char_count == char_idx) then
53	byte_idx = i
54	return
55	end if
56
57	char_len = utf8_char_byte_length(str, i)
58	if (char_len > 0) then
59	i = i + char_len
60	else
61	i = i + 1
62	end if
63	end do
64
65	! If char_idx == char_count + 1, return position after last char
66	if (char_idx == char_count + 1) then
67	byte_idx = byte_len + 1
68	end if
69	end function utf8_char_to_byte_index
70
71	! Convert byte index to character index
72	! byte_idx is 1-based byte position
73	! Returns character position (1-based)
74	pure function utf8_byte_to_char_index(str, byte_idx) result(char_idx)
75	character(len=*), intent(in) :: str
76	integer, intent(in) :: byte_idx
77	integer :: char_idx
78	integer :: i, char_len, byte_len
79
80	char_idx = 0
81	if (byte_idx < 1) return
82
83	byte_len = len(str)
84	if (byte_idx > byte_len + 1) return
85
86	char_idx = 1
87	i = 1
88
89	do while (i < byte_idx .and. i <= byte_len)
90	char_len = utf8_char_byte_length(str, i)
91	if (char_len > 0) then
92	i = i + char_len
93	else
94	i = i + 1
95	end if
96	if (i <= byte_idx) char_idx = char_idx + 1
97	end do
98	end function utf8_byte_to_char_index
99
100	! Get the UTF-8 character at a given character index
101	! Returns empty string if out of bounds
102	function utf8_char_at(str, char_idx) result(char_str)
103	character(len=*), intent(in) :: str
104	integer, intent(in) :: char_idx
105	character(len=:), allocatable :: char_str
106	integer :: byte_idx, char_len
107
108	byte_idx = utf8_char_to_byte_index(str, char_idx)
109	if (byte_idx == 0 .or. byte_idx > len(str)) then
110	allocate(character(len=0) :: char_str)
111	return
112	end if
113
114	char_len = utf8_char_byte_length(str, byte_idx)
115	if (char_len <= 0) char_len = 1
116
117	! Make sure we don't go past end of string
118	char_len = min(char_len, len(str) - byte_idx + 1)
119
120	allocate(character(len=char_len) :: char_str)
121	char_str = str(byte_idx:byte_idx+char_len-1)
122	end function utf8_char_at
123
124	! Calculate display width of a UTF-8 string
125	! (accounts for wide characters like CJK)
126	pure function utf8_display_width(str) result(width)
127	character(len=*), intent(in) :: str
128	integer :: width
129	integer :: i, char_len, byte_len, code_point
130
131	width = 0
132	byte_len = len(str)
133	i = 1
134
135	do while (i <= byte_len)
136	char_len = utf8_char_byte_length(str, i)
137	if (char_len > 0) then
138	code_point = utf8_decode_char(str, i, char_len)
139	width = width + utf8_char_width(code_point)
140	i = i + char_len
141	else
142	! Invalid UTF-8, count as 1 wide
143	width = width + 1
144	i = i + 1
145	end if
146	end do
147	end function utf8_display_width
148
149	! Determine byte length of UTF-8 character starting at position i
150	! Returns 0 if invalid UTF-8 start byte
151	pure function utf8_char_byte_length(str, i) result(char_len)
152	character(len=*), intent(in) :: str
153	integer, intent(in) :: i
154	integer :: char_len
155	integer :: byte_val, str_len
156
157	str_len = len(str)
158
159	if (i < 1 .or. i > str_len) then
160	char_len = 0
161	return
162	end if
163
164	byte_val = iachar(str(i:i))
165
166	! ASCII: 0xxxxxxx
167	if (byte_val < 128) then
168	char_len = 1
169	! 2-byte: 110xxxxx
170	else if (iand(byte_val, int(b'11100000')) == int(b'11000000')) then
171	char_len = 2
172	! 3-byte: 1110xxxx
173	else if (iand(byte_val, int(b'11110000')) == int(b'11100000')) then
174	char_len = 3
175	! 4-byte: 11110xxx
176	else if (iand(byte_val, int(b'11111000')) == int(b'11110000')) then
177	char_len = 4
178	else
179	! Invalid or continuation byte
180	char_len = 0
181	end if
182
183	! Make sure we don't read past end of string
184	if (char_len > 0) then
185	char_len = min(char_len, str_len - i + 1)
186	end if
187	end function utf8_char_byte_length
188
189	! Check if byte is a valid UTF-8 start byte (not continuation)
190	pure function utf8_is_valid_start(byte_val) result(is_start)
191	integer, intent(in) :: byte_val
192	logical :: is_start
193
194	! Continuation bytes are 10xxxxxx
195	is_start = iand(byte_val, int(b'11000000')) /= int(b'10000000')
196	end function utf8_is_valid_start
197
198	! Decode UTF-8 character to Unicode code point
199	pure function utf8_decode_char(str, pos, len) result(code_point)
200	character(len=*), intent(in) :: str
201	integer, intent(in) :: pos, len
202	integer :: code_point
203	integer :: byte1, byte2, byte3, byte4
204
205	if (len < 1 .or. pos + len - 1 > len_trim(str)) then
206	code_point = 0
207	return
208	end if
209
210	byte1 = iachar(str(pos:pos))
211
212	select case(len)
213	case(1)
214	code_point = byte1
215	case(2)
216	byte2 = iachar(str(pos+1:pos+1))
217	code_point = ior(ishft(iand(byte1, int(b'00011111')), 6), &
218	iand(byte2, int(b'00111111')))
219	case(3)
220	byte2 = iachar(str(pos+1:pos+1))
221	byte3 = iachar(str(pos+2:pos+2))
222	code_point = ior(ior(ishft(iand(byte1, int(b'00001111')), 12), &
223	ishft(iand(byte2, int(b'00111111')), 6)), &
224	iand(byte3, int(b'00111111')))
225	case(4)
226	byte2 = iachar(str(pos+1:pos+1))
227	byte3 = iachar(str(pos+2:pos+2))
228	byte4 = iachar(str(pos+3:pos+3))
229	code_point = ior(ior(ior(ishft(iand(byte1, int(b'00000111')), 18), &
230	ishft(iand(byte2, int(b'00111111')), 12)), &
231	ishft(iand(byte3, int(b'00111111')), 6)), &
232	iand(byte4, int(b'00111111')))
233	case default
234	code_point = 0
235	end select
236	end function utf8_decode_char
237
238	! Get display width of a Unicode code point
239	! Returns 0 for combining chars, 1 for normal, 2 for wide (CJK)
240	pure function utf8_char_width(code_point) result(width)
241	integer, intent(in) :: code_point
242	integer :: width
243
244	! Simplified width calculation
245	! Full implementation would need Unicode data tables
246
247	! Control characters
248	if (code_point < 32 .or. (code_point >= 127 .and. code_point < 160)) then
249	width = 0
250	! CJK Unified Ideographs and other wide chars
251	else if ((code_point >= int(z'1100') .and. code_point <= int(z'115F')) .or. & ! Hangul Jamo
252	(code_point >= int(z'2E80') .and. code_point <= int(z'A4CF')) .or. & ! CJK
253	(code_point >= int(z'AC00') .and. code_point <= int(z'D7A3')) .or. & ! Hangul Syllables
254	(code_point >= int(z'F900') .and. code_point <= int(z'FAFF')) .or. & ! CJK Compatibility
255	(code_point >= int(z'FE10') .and. code_point <= int(z'FE19')) .or. & ! Vertical forms
256	(code_point >= int(z'FE30') .and. code_point <= int(z'FE6F')) .or. & ! CJK Compatibility Forms
257	(code_point >= int(z'FF00') .and. code_point <= int(z'FF60')) .or. & ! Fullwidth Forms
258	(code_point >= int(z'FFE0') .and. code_point <= int(z'FFE6')) .or. & ! Fullwidth Forms
259	(code_point >= int(z'20000') .and. code_point <= int(z'2FFFD')) .or. & ! CJK Extension B-F
260	(code_point >= int(z'30000') .and. code_point <= int(z'3FFFD'))) then
261	width = 2
262	! Combining characters (simplified - just a few ranges)
263	else if ((code_point >= int(z'0300') .and. code_point <= int(z'036F')) .or. & ! Combining Diacriticals
264	(code_point >= int(z'1AB0') .and. code_point <= int(z'1AFF')) .or. & ! Combining Diacriticals Extended
265	(code_point >= int(z'1DC0') .and. code_point <= int(z'1DFF')) .or. & ! Combining Diacriticals Supplement
266	(code_point >= int(z'20D0') .and. code_point <= int(z'20FF')) .or. & ! Combining Diacriticals for Symbols
267	(code_point >= int(z'FE20') .and. code_point <= int(z'FE2F'))) then ! Combining Half Marks
268	width = 0
269	else
270	! Normal width
271	width = 1
272	end if
273	end function utf8_char_width
274
275	end module utf8_module
276