Fortran · 10150 bytes Raw Blame History
1 module utf8_module
2 use iso_fortran_env, only: int8
3 implicit none
4 private
5
6 public :: utf8_char_count, utf8_byte_to_char_index, utf8_char_to_byte_index
7 public :: utf8_char_at, utf8_display_width, utf8_is_valid_start
8
9 contains
10
11 ! Count number of UTF-8 characters (not bytes) in a string
12 pure function utf8_char_count(str) result(count)
13 character(len=*), intent(in) :: str
14 integer :: count
15 integer :: i, byte_len, char_len
16
17 count = 0
18 byte_len = len(str)
19 i = 1
20
21 do while (i <= byte_len)
22 char_len = utf8_char_byte_length(str, i)
23 if (char_len > 0) then
24 count = count + 1
25 i = i + char_len
26 else
27 ! Invalid UTF-8, treat as single byte
28 count = count + 1
29 i = i + 1
30 end if
31 end do
32 end function utf8_char_count
33
34 ! Convert character index to byte index
35 ! char_idx is 1-based character position
36 ! Returns byte position (1-based), or 0 if out of bounds
37 pure function utf8_char_to_byte_index(str, char_idx) result(byte_idx)
38 character(len=*), intent(in) :: str
39 integer, intent(in) :: char_idx
40 integer :: byte_idx
41 integer :: i, char_count, char_len, byte_len
42
43 byte_idx = 0
44 if (char_idx < 1) return
45
46 byte_len = len(str)
47 char_count = 0
48 i = 1
49
50 do while (i <= byte_len)
51 char_count = char_count + 1
52 if (char_count == char_idx) then
53 byte_idx = i
54 return
55 end if
56
57 char_len = utf8_char_byte_length(str, i)
58 if (char_len > 0) then
59 i = i + char_len
60 else
61 i = i + 1
62 end if
63 end do
64
65 ! If char_idx == char_count + 1, return position after last char
66 if (char_idx == char_count + 1) then
67 byte_idx = byte_len + 1
68 end if
69 end function utf8_char_to_byte_index
70
71 ! Convert byte index to character index
72 ! byte_idx is 1-based byte position
73 ! Returns character position (1-based)
74 pure function utf8_byte_to_char_index(str, byte_idx) result(char_idx)
75 character(len=*), intent(in) :: str
76 integer, intent(in) :: byte_idx
77 integer :: char_idx
78 integer :: i, char_len, byte_len
79
80 char_idx = 0
81 if (byte_idx < 1) return
82
83 byte_len = len(str)
84 if (byte_idx > byte_len + 1) return
85
86 char_idx = 1
87 i = 1
88
89 do while (i < byte_idx .and. i <= byte_len)
90 char_len = utf8_char_byte_length(str, i)
91 if (char_len > 0) then
92 i = i + char_len
93 else
94 i = i + 1
95 end if
96 if (i <= byte_idx) char_idx = char_idx + 1
97 end do
98 end function utf8_byte_to_char_index
99
100 ! Get the UTF-8 character at a given character index
101 ! Returns empty string if out of bounds
102 function utf8_char_at(str, char_idx) result(char_str)
103 character(len=*), intent(in) :: str
104 integer, intent(in) :: char_idx
105 character(len=:), allocatable :: char_str
106 integer :: byte_idx, char_len
107
108 byte_idx = utf8_char_to_byte_index(str, char_idx)
109 if (byte_idx == 0 .or. byte_idx > len(str)) then
110 allocate(character(len=0) :: char_str)
111 return
112 end if
113
114 char_len = utf8_char_byte_length(str, byte_idx)
115 if (char_len <= 0) char_len = 1
116
117 ! Make sure we don't go past end of string
118 char_len = min(char_len, len(str) - byte_idx + 1)
119
120 allocate(character(len=char_len) :: char_str)
121 char_str = str(byte_idx:byte_idx+char_len-1)
122 end function utf8_char_at
123
124 ! Calculate display width of a UTF-8 string
125 ! (accounts for wide characters like CJK)
126 pure function utf8_display_width(str) result(width)
127 character(len=*), intent(in) :: str
128 integer :: width
129 integer :: i, char_len, byte_len, code_point
130
131 width = 0
132 byte_len = len(str)
133 i = 1
134
135 do while (i <= byte_len)
136 char_len = utf8_char_byte_length(str, i)
137 if (char_len > 0) then
138 code_point = utf8_decode_char(str, i, char_len)
139 width = width + utf8_char_width(code_point)
140 i = i + char_len
141 else
142 ! Invalid UTF-8, count as 1 wide
143 width = width + 1
144 i = i + 1
145 end if
146 end do
147 end function utf8_display_width
148
149 ! Determine byte length of UTF-8 character starting at position i
150 ! Returns 0 if invalid UTF-8 start byte
151 pure function utf8_char_byte_length(str, i) result(char_len)
152 character(len=*), intent(in) :: str
153 integer, intent(in) :: i
154 integer :: char_len
155 integer :: byte_val, str_len
156
157 str_len = len(str)
158
159 if (i < 1 .or. i > str_len) then
160 char_len = 0
161 return
162 end if
163
164 byte_val = iachar(str(i:i))
165
166 ! ASCII: 0xxxxxxx
167 if (byte_val < 128) then
168 char_len = 1
169 ! 2-byte: 110xxxxx
170 else if (iand(byte_val, int(b'11100000')) == int(b'11000000')) then
171 char_len = 2
172 ! 3-byte: 1110xxxx
173 else if (iand(byte_val, int(b'11110000')) == int(b'11100000')) then
174 char_len = 3
175 ! 4-byte: 11110xxx
176 else if (iand(byte_val, int(b'11111000')) == int(b'11110000')) then
177 char_len = 4
178 else
179 ! Invalid or continuation byte
180 char_len = 0
181 end if
182
183 ! Make sure we don't read past end of string
184 if (char_len > 0) then
185 char_len = min(char_len, str_len - i + 1)
186 end if
187 end function utf8_char_byte_length
188
189 ! Check if byte is a valid UTF-8 start byte (not continuation)
190 pure function utf8_is_valid_start(byte_val) result(is_start)
191 integer, intent(in) :: byte_val
192 logical :: is_start
193
194 ! Continuation bytes are 10xxxxxx
195 is_start = iand(byte_val, int(b'11000000')) /= int(b'10000000')
196 end function utf8_is_valid_start
197
198 ! Decode UTF-8 character to Unicode code point
199 pure function utf8_decode_char(str, pos, len) result(code_point)
200 character(len=*), intent(in) :: str
201 integer, intent(in) :: pos, len
202 integer :: code_point
203 integer :: byte1, byte2, byte3, byte4
204
205 if (len < 1 .or. pos + len - 1 > len_trim(str)) then
206 code_point = 0
207 return
208 end if
209
210 byte1 = iachar(str(pos:pos))
211
212 select case(len)
213 case(1)
214 code_point = byte1
215 case(2)
216 byte2 = iachar(str(pos+1:pos+1))
217 code_point = ior(ishft(iand(byte1, int(b'00011111')), 6), &
218 iand(byte2, int(b'00111111')))
219 case(3)
220 byte2 = iachar(str(pos+1:pos+1))
221 byte3 = iachar(str(pos+2:pos+2))
222 code_point = ior(ior(ishft(iand(byte1, int(b'00001111')), 12), &
223 ishft(iand(byte2, int(b'00111111')), 6)), &
224 iand(byte3, int(b'00111111')))
225 case(4)
226 byte2 = iachar(str(pos+1:pos+1))
227 byte3 = iachar(str(pos+2:pos+2))
228 byte4 = iachar(str(pos+3:pos+3))
229 code_point = ior(ior(ior(ishft(iand(byte1, int(b'00000111')), 18), &
230 ishft(iand(byte2, int(b'00111111')), 12)), &
231 ishft(iand(byte3, int(b'00111111')), 6)), &
232 iand(byte4, int(b'00111111')))
233 case default
234 code_point = 0
235 end select
236 end function utf8_decode_char
237
238 ! Get display width of a Unicode code point
239 ! Returns 0 for combining chars, 1 for normal, 2 for wide (CJK)
240 pure function utf8_char_width(code_point) result(width)
241 integer, intent(in) :: code_point
242 integer :: width
243
244 ! Simplified width calculation
245 ! Full implementation would need Unicode data tables
246
247 ! Control characters
248 if (code_point < 32 .or. (code_point >= 127 .and. code_point < 160)) then
249 width = 0
250 ! CJK Unified Ideographs and other wide chars
251 else if ((code_point >= int(z'1100') .and. code_point <= int(z'115F')) .or. & ! Hangul Jamo
252 (code_point >= int(z'2E80') .and. code_point <= int(z'A4CF')) .or. & ! CJK
253 (code_point >= int(z'AC00') .and. code_point <= int(z'D7A3')) .or. & ! Hangul Syllables
254 (code_point >= int(z'F900') .and. code_point <= int(z'FAFF')) .or. & ! CJK Compatibility
255 (code_point >= int(z'FE10') .and. code_point <= int(z'FE19')) .or. & ! Vertical forms
256 (code_point >= int(z'FE30') .and. code_point <= int(z'FE6F')) .or. & ! CJK Compatibility Forms
257 (code_point >= int(z'FF00') .and. code_point <= int(z'FF60')) .or. & ! Fullwidth Forms
258 (code_point >= int(z'FFE0') .and. code_point <= int(z'FFE6')) .or. & ! Fullwidth Forms
259 (code_point >= int(z'20000') .and. code_point <= int(z'2FFFD')) .or. & ! CJK Extension B-F
260 (code_point >= int(z'30000') .and. code_point <= int(z'3FFFD'))) then
261 width = 2
262 ! Combining characters (simplified - just a few ranges)
263 else if ((code_point >= int(z'0300') .and. code_point <= int(z'036F')) .or. & ! Combining Diacriticals
264 (code_point >= int(z'1AB0') .and. code_point <= int(z'1AFF')) .or. & ! Combining Diacriticals Extended
265 (code_point >= int(z'1DC0') .and. code_point <= int(z'1DFF')) .or. & ! Combining Diacriticals Supplement
266 (code_point >= int(z'20D0') .and. code_point <= int(z'20FF')) .or. & ! Combining Diacriticals for Symbols
267 (code_point >= int(z'FE20') .and. code_point <= int(z'FE2F'))) then ! Combining Half Marks
268 width = 0
269 else
270 ! Normal width
271 width = 1
272 end if
273 end function utf8_char_width
274
275 end module utf8_module
276