simdutf 8.0.0
Unicode at GB/s.
Loading...
Searching...
No Matches
utf8_to_utf16.h
1#ifndef SIMDUTF_UTF8_TO_UTF16_H
2#define SIMDUTF_UTF8_TO_UTF16_H
3
4namespace simdutf {
5namespace scalar {
6namespace {
7namespace utf8_to_utf16 {
8
9template <endianness big_endian, typename InputPtr>
10#if SIMDUTF_CPLUSPLUS20
11 requires simdutf::detail::indexes_into_byte_like<InputPtr>
12#endif
13simdutf_constexpr23 size_t convert(InputPtr data, size_t len,
14 char16_t *utf16_output) {
15 size_t pos = 0;
16 char16_t *start{utf16_output};
17 while (pos < len) {
18#if SIMDUTF_CPLUSPLUS23
19 if !consteval
20#endif
21 // try to convert the next block of 16 ASCII bytes
22 {
23 if (pos + 16 <= len) { // if it is safe to read 16 more bytes, check that
24 // they are ascii
25 uint64_t v1;
26 ::memcpy(&v1, data + pos, sizeof(uint64_t));
27 uint64_t v2;
28 ::memcpy(&v2, data + pos + sizeof(uint64_t), sizeof(uint64_t));
29 uint64_t v{v1 | v2};
30 if ((v & 0x8080808080808080) == 0) {
31 size_t final_pos = pos + 16;
32 while (pos < final_pos) {
33 *utf16_output++ = !match_system(big_endian)
34 ? char16_t(u16_swap_bytes(data[pos]))
35 : char16_t(data[pos]);
36 pos++;
37 }
38 continue;
39 }
40 }
41 }
42
43 uint8_t leading_byte = data[pos]; // leading byte
44 if (leading_byte < 0b10000000) {
45 // converting one ASCII byte !!!
46 *utf16_output++ = !match_system(big_endian)
47 ? char16_t(u16_swap_bytes(leading_byte))
48 : char16_t(leading_byte);
49 pos++;
50 } else if ((leading_byte & 0b11100000) == 0b11000000) {
51 // We have a two-byte UTF-8, it should become
52 // a single UTF-16 word.
53 if (pos + 1 >= len) {
54 return 0;
55 } // minimal bound checking
56 if ((data[pos + 1] & 0b11000000) != 0b10000000) {
57 return 0;
58 }
59 // range check
60 uint32_t code_point =
61 (leading_byte & 0b00011111) << 6 | (data[pos + 1] & 0b00111111);
62 if (code_point < 0x80 || 0x7ff < code_point) {
63 return 0;
64 }
65 if simdutf_constexpr (!match_system(big_endian)) {
66 code_point = uint32_t(u16_swap_bytes(uint16_t(code_point)));
67 }
68 *utf16_output++ = char16_t(code_point);
69 pos += 2;
70 } else if ((leading_byte & 0b11110000) == 0b11100000) {
71 // We have a three-byte UTF-8, it should become
72 // a single UTF-16 word.
73 if (pos + 2 >= len) {
74 return 0;
75 } // minimal bound checking
76
77 if ((data[pos + 1] & 0b11000000) != 0b10000000) {
78 return 0;
79 }
80 if ((data[pos + 2] & 0b11000000) != 0b10000000) {
81 return 0;
82 }
83 // range check
84 uint32_t code_point = (leading_byte & 0b00001111) << 12 |
85 (data[pos + 1] & 0b00111111) << 6 |
86 (data[pos + 2] & 0b00111111);
87 if (code_point < 0x800 || 0xffff < code_point ||
88 (0xd7ff < code_point && code_point < 0xe000)) {
89 return 0;
90 }
91 if simdutf_constexpr (!match_system(big_endian)) {
92 code_point = uint32_t(u16_swap_bytes(uint16_t(code_point)));
93 }
94 *utf16_output++ = char16_t(code_point);
95 pos += 3;
96 } else if ((leading_byte & 0b11111000) == 0b11110000) { // 0b11110000
97 // we have a 4-byte UTF-8 word.
98 if (pos + 3 >= len) {
99 return 0;
100 } // minimal bound checking
101 if ((data[pos + 1] & 0b11000000) != 0b10000000) {
102 return 0;
103 }
104 if ((data[pos + 2] & 0b11000000) != 0b10000000) {
105 return 0;
106 }
107 if ((data[pos + 3] & 0b11000000) != 0b10000000) {
108 return 0;
109 }
110
111 // range check
112 uint32_t code_point = (leading_byte & 0b00000111) << 18 |
113 (data[pos + 1] & 0b00111111) << 12 |
114 (data[pos + 2] & 0b00111111) << 6 |
115 (data[pos + 3] & 0b00111111);
116 if (code_point <= 0xffff || 0x10ffff < code_point) {
117 return 0;
118 }
119 code_point -= 0x10000;
120 uint16_t high_surrogate = uint16_t(0xD800 + (code_point >> 10));
121 uint16_t low_surrogate = uint16_t(0xDC00 + (code_point & 0x3FF));
122 if simdutf_constexpr (!match_system(big_endian)) {
123 high_surrogate = u16_swap_bytes(high_surrogate);
124 low_surrogate = u16_swap_bytes(low_surrogate);
125 }
126 *utf16_output++ = char16_t(high_surrogate);
127 *utf16_output++ = char16_t(low_surrogate);
128 pos += 4;
129 } else {
130 return 0;
131 }
132 }
133 return utf16_output - start;
134}
135
136template <endianness big_endian, typename InputPtr>
137#if SIMDUTF_CPLUSPLUS20
138 requires simdutf::detail::indexes_into_byte_like<InputPtr>
139#endif
140simdutf_constexpr23 result convert_with_errors(InputPtr data, size_t len,
141 char16_t *utf16_output) {
142 size_t pos = 0;
143 char16_t *start{utf16_output};
144 while (pos < len) {
145#if SIMDUTF_CPLUSPLUS23
146 if !consteval
147#endif
148 {
149 // try to convert the next block of 16 ASCII bytes
150 if (pos + 16 <= len) { // if it is safe to read 16 more bytes, check that
151 // they are ascii
152 uint64_t v1;
153 ::memcpy(&v1, data + pos, sizeof(uint64_t));
154 uint64_t v2;
155 ::memcpy(&v2, data + pos + sizeof(uint64_t), sizeof(uint64_t));
156 uint64_t v{v1 | v2};
157 if ((v & 0x8080808080808080) == 0) {
158 size_t final_pos = pos + 16;
159 while (pos < final_pos) {
160 const char16_t byte = uint8_t(data[pos]);
161 *utf16_output++ =
162 !match_system(big_endian) ? u16_swap_bytes(byte) : byte;
163 pos++;
164 }
165 continue;
166 }
167 }
168 }
169
170 auto leading_byte = uint8_t(data[pos]); // leading byte
171 if (leading_byte < 0b10000000) {
172 // converting one ASCII byte !!!
173 *utf16_output++ = !match_system(big_endian)
174 ? char16_t(u16_swap_bytes(leading_byte))
175 : char16_t(leading_byte);
176 pos++;
177 } else if ((leading_byte & 0b11100000) == 0b11000000) {
178 // We have a two-byte UTF-8, it should become
179 // a single UTF-16 word.
180 if (pos + 1 >= len) {
181 return result(error_code::TOO_SHORT, pos);
182 } // minimal bound checking
183 if ((uint8_t(data[pos + 1]) & 0b11000000) != 0b10000000) {
184 return result(error_code::TOO_SHORT, pos);
185 }
186 // range check
187 uint32_t code_point = (leading_byte & 0b00011111) << 6 |
188 (uint8_t(data[pos + 1]) & 0b00111111);
189 if (code_point < 0x80 || 0x7ff < code_point) {
190 return result(error_code::OVERLONG, pos);
191 }
192 if simdutf_constexpr (!match_system(big_endian)) {
193 code_point = uint32_t(u16_swap_bytes(uint16_t(code_point)));
194 }
195 *utf16_output++ = char16_t(code_point);
196 pos += 2;
197 } else if ((leading_byte & 0b11110000) == 0b11100000) {
198 // We have a three-byte UTF-8, it should become
199 // a single UTF-16 word.
200 if (pos + 2 >= len) {
201 return result(error_code::TOO_SHORT, pos);
202 } // minimal bound checking
203
204 if ((uint8_t(data[pos + 1]) & 0b11000000) != 0b10000000) {
205 return result(error_code::TOO_SHORT, pos);
206 }
207 if ((uint8_t(data[pos + 2]) & 0b11000000) != 0b10000000) {
208 return result(error_code::TOO_SHORT, pos);
209 }
210 // range check
211 uint32_t code_point = (leading_byte & 0b00001111) << 12 |
212 (uint8_t(data[pos + 1]) & 0b00111111) << 6 |
213 (uint8_t(data[pos + 2]) & 0b00111111);
214 if ((code_point < 0x800) || (0xffff < code_point)) {
215 return result(error_code::OVERLONG, pos);
216 }
217 if (0xd7ff < code_point && code_point < 0xe000) {
218 return result(error_code::SURROGATE, pos);
219 }
220 if simdutf_constexpr (!match_system(big_endian)) {
221 code_point = uint32_t(u16_swap_bytes(uint16_t(code_point)));
222 }
223 *utf16_output++ = char16_t(code_point);
224 pos += 3;
225 } else if ((leading_byte & 0b11111000) == 0b11110000) { // 0b11110000
226 // we have a 4-byte UTF-8 word.
227 if (pos + 3 >= len) {
228 return result(error_code::TOO_SHORT, pos);
229 } // minimal bound checking
230 if ((uint8_t(data[pos + 1]) & 0b11000000) != 0b10000000) {
231 return result(error_code::TOO_SHORT, pos);
232 }
233 if ((uint8_t(data[pos + 2]) & 0b11000000) != 0b10000000) {
234 return result(error_code::TOO_SHORT, pos);
235 }
236 if ((uint8_t(data[pos + 3]) & 0b11000000) != 0b10000000) {
237 return result(error_code::TOO_SHORT, pos);
238 }
239
240 // range check
241 uint32_t code_point = (leading_byte & 0b00000111) << 18 |
242 (uint8_t(data[pos + 1]) & 0b00111111) << 12 |
243 (uint8_t(data[pos + 2]) & 0b00111111) << 6 |
244 (uint8_t(data[pos + 3]) & 0b00111111);
245 if (code_point <= 0xffff) {
246 return result(error_code::OVERLONG, pos);
247 }
248 if (0x10ffff < code_point) {
249 return result(error_code::TOO_LARGE, pos);
250 }
251 code_point -= 0x10000;
252 uint16_t high_surrogate = uint16_t(0xD800 + (code_point >> 10));
253 uint16_t low_surrogate = uint16_t(0xDC00 + (code_point & 0x3FF));
254 if simdutf_constexpr (!match_system(big_endian)) {
255 high_surrogate = u16_swap_bytes(high_surrogate);
256 low_surrogate = u16_swap_bytes(low_surrogate);
257 }
258 *utf16_output++ = char16_t(high_surrogate);
259 *utf16_output++ = char16_t(low_surrogate);
260 pos += 4;
261 } else {
262 // we either have too many continuation bytes or an invalid leading byte
263 if ((leading_byte & 0b11000000) == 0b10000000) {
264 return result(error_code::TOO_LONG, pos);
265 } else {
266 return result(error_code::HEADER_BITS, pos);
267 }
268 }
269 }
270 return result(error_code::SUCCESS, utf16_output - start);
271}
272
288template <endianness endian>
289inline result rewind_and_convert_with_errors(size_t prior_bytes,
290 const char *buf, size_t len,
291 char16_t *utf16_output) {
292 size_t extra_len{0};
293 // We potentially need to go back in time and find a leading byte.
294 // In theory '3' would be sufficient, but sometimes the error can go back
295 // quite far.
296 size_t how_far_back = prior_bytes;
297 // size_t how_far_back = 3; // 3 bytes in the past + current position
298 // if(how_far_back >= prior_bytes) { how_far_back = prior_bytes; }
299 bool found_leading_bytes{false};
300 // important: it is i <= how_far_back and not 'i < how_far_back'.
301 for (size_t i = 0; i <= how_far_back; i++) {
302 unsigned char byte = buf[-static_cast<std::ptrdiff_t>(i)];
303 found_leading_bytes = ((byte & 0b11000000) != 0b10000000);
304 if (found_leading_bytes) {
305 if (i > 0 && byte < 128) {
306 // If we had to go back and the leading byte is ascii
307 // then we can stop right away.
308 return result(error_code::TOO_LONG, 0 - i + 1);
309 }
310 buf -= i;
311 extra_len = i;
312 break;
313 }
314 }
315 //
316 // It is possible for this function to return a negative count in its result.
317 // C++ Standard Section 18.1 defines size_t is in <cstddef> which is described
318 // in C Standard as <stddef.h>. C Standard Section 4.1.5 defines size_t as an
319 // unsigned integral type of the result of the sizeof operator
320 //
321 // An unsigned type will simply wrap round arithmetically (well defined).
322 //
323 if (!found_leading_bytes) {
324 // If how_far_back == 3, we may have four consecutive continuation bytes!!!
325 // [....] [continuation] [continuation] [continuation] | [buf is
326 // continuation] Or we possibly have a stream that does not start with a
327 // leading byte.
328 return result(error_code::TOO_LONG, 0 - how_far_back);
329 }
330 result res = convert_with_errors<endian>(buf, len + extra_len, utf16_output);
331 if (res.error) {
332 res.count -= extra_len;
333 }
334 return res;
335}
336
337} // namespace utf8_to_utf16
338} // unnamed namespace
339} // namespace scalar
340} // namespace simdutf
341
342#endif