simdutf 9.0.0
Unicode at GB/s.
Loading...
Searching...
No Matches
utf8_to_utf16.h
1#ifndef SIMDUTF_UTF8_TO_UTF16_H
2#define SIMDUTF_UTF8_TO_UTF16_H
3
4namespace simdutf {
5namespace scalar {
6namespace {
7namespace utf8_to_utf16 {
8
9template <endianness big_endian, typename InputPtr>
10#if SIMDUTF_CPLUSPLUS20
11 requires simdutf::detail::indexes_into_byte_like<InputPtr>
12#endif
13simdutf_constexpr23 size_t convert(InputPtr data, size_t len,
14 char16_t *utf16_output) {
15 size_t pos = 0;
16 char16_t *start{utf16_output};
17 while (pos < len) {
18#if SIMDUTF_CPLUSPLUS23
19 if !consteval
20#endif
21 // try to convert the next block of 16 ASCII bytes
22 {
23 if (pos + 16 <= len) { // if it is safe to read 16 more bytes, check that
24 // they are ascii
25 uint64_t v1;
26 ::memcpy(&v1, data + pos, sizeof(uint64_t));
27 uint64_t v2;
28 ::memcpy(&v2, data + pos + sizeof(uint64_t), sizeof(uint64_t));
29 uint64_t v{v1 | v2};
30 if ((v & 0x8080808080808080) == 0) {
31 size_t final_pos = pos + 16;
32 while (pos < final_pos) {
33 *utf16_output++ = !match_system(big_endian)
34 ? char16_t(u16_swap_bytes(data[pos]))
35 : char16_t(data[pos]);
36 pos++;
37 }
38 continue;
39 }
40 }
41 }
42
43 uint8_t leading_byte = data[pos]; // leading byte
44 if (leading_byte < 0b10000000) {
45 // converting one ASCII byte !!!
46 *utf16_output++ = !match_system(big_endian)
47 ? char16_t(u16_swap_bytes(leading_byte))
48 : char16_t(leading_byte);
49 pos++;
50 } else if ((leading_byte & 0b11100000) == 0b11000000) {
51 // We have a two-byte UTF-8, it should become
52 // a single UTF-16 word.
53 if (pos + 1 >= len) {
54 return 0;
55 } // minimal bound checking
56 if ((data[pos + 1] & 0b11000000) != 0b10000000) {
57 return 0;
58 }
59 // range check
60 uint32_t code_point =
61 (leading_byte & 0b00011111) << 6 | (data[pos + 1] & 0b00111111);
62 if (code_point < 0x80) {
63 return 0;
64 }
65 if constexpr (!match_system(big_endian)) {
66 code_point = uint32_t(u16_swap_bytes(uint16_t(code_point)));
67 }
68 *utf16_output++ = char16_t(code_point);
69 pos += 2;
70 } else if ((leading_byte & 0b11110000) == 0b11100000) {
71 // We have a three-byte UTF-8, it should become
72 // a single UTF-16 word.
73 if (pos + 2 >= len) {
74 return 0;
75 } // minimal bound checking
76
77 if ((data[pos + 1] & 0b11000000) != 0b10000000) {
78 return 0;
79 }
80 if ((data[pos + 2] & 0b11000000) != 0b10000000) {
81 return 0;
82 }
83 // range check
84 uint32_t code_point = (leading_byte & 0b00001111) << 12 |
85 (data[pos + 1] & 0b00111111) << 6 |
86 (data[pos + 2] & 0b00111111);
87 if (code_point < 0x800 || (0xd7ff < code_point && code_point < 0xe000)) {
88 return 0;
89 }
90 if constexpr (!match_system(big_endian)) {
91 code_point = uint32_t(u16_swap_bytes(uint16_t(code_point)));
92 }
93 *utf16_output++ = char16_t(code_point);
94 pos += 3;
95 } else if ((leading_byte & 0b11111000) == 0b11110000) { // 0b11110000
96 // we have a 4-byte UTF-8 word.
97 if (pos + 3 >= len) {
98 return 0;
99 } // minimal bound checking
100 if ((data[pos + 1] & 0b11000000) != 0b10000000) {
101 return 0;
102 }
103 if ((data[pos + 2] & 0b11000000) != 0b10000000) {
104 return 0;
105 }
106 if ((data[pos + 3] & 0b11000000) != 0b10000000) {
107 return 0;
108 }
109
110 // range check
111 uint32_t code_point = (leading_byte & 0b00000111) << 18 |
112 (data[pos + 1] & 0b00111111) << 12 |
113 (data[pos + 2] & 0b00111111) << 6 |
114 (data[pos + 3] & 0b00111111);
115 if (code_point <= 0xffff || 0x10ffff < code_point) {
116 return 0;
117 }
118 code_point -= 0x10000;
119 uint16_t high_surrogate = uint16_t(0xD800 + (code_point >> 10));
120 uint16_t low_surrogate = uint16_t(0xDC00 + (code_point & 0x3FF));
121 if constexpr (!match_system(big_endian)) {
122 high_surrogate = u16_swap_bytes(high_surrogate);
123 low_surrogate = u16_swap_bytes(low_surrogate);
124 }
125 *utf16_output++ = char16_t(high_surrogate);
126 *utf16_output++ = char16_t(low_surrogate);
127 pos += 4;
128 } else {
129 return 0;
130 }
131 }
132 return utf16_output - start;
133}
134
135template <endianness big_endian, typename InputPtr>
136#if SIMDUTF_CPLUSPLUS20
137 requires simdutf::detail::indexes_into_byte_like<InputPtr>
138#endif
139simdutf_constexpr23 result convert_with_errors(InputPtr data, size_t len,
140 char16_t *utf16_output) {
141 size_t pos = 0;
142 char16_t *start{utf16_output};
143 while (pos < len) {
144#if SIMDUTF_CPLUSPLUS23
145 if !consteval
146#endif
147 {
148 // try to convert the next block of 16 ASCII bytes
149 if (pos + 16 <= len) { // if it is safe to read 16 more bytes, check that
150 // they are ascii
151 uint64_t v1;
152 ::memcpy(&v1, data + pos, sizeof(uint64_t));
153 uint64_t v2;
154 ::memcpy(&v2, data + pos + sizeof(uint64_t), sizeof(uint64_t));
155 uint64_t v{v1 | v2};
156 if ((v & 0x8080808080808080) == 0) {
157 size_t final_pos = pos + 16;
158 while (pos < final_pos) {
159 const char16_t byte = uint8_t(data[pos]);
160 *utf16_output++ =
161 !match_system(big_endian) ? u16_swap_bytes(byte) : byte;
162 pos++;
163 }
164 continue;
165 }
166 }
167 }
168
169 auto leading_byte = uint8_t(data[pos]); // leading byte
170 if (leading_byte < 0b10000000) {
171 // converting one ASCII byte !!!
172 *utf16_output++ = !match_system(big_endian)
173 ? char16_t(u16_swap_bytes(leading_byte))
174 : char16_t(leading_byte);
175 pos++;
176 } else if ((leading_byte & 0b11100000) == 0b11000000) {
177 // We have a two-byte UTF-8, it should become
178 // a single UTF-16 word.
179 if (pos + 1 >= len) {
180 return result(error_code::TOO_SHORT, pos);
181 } // minimal bound checking
182 if ((uint8_t(data[pos + 1]) & 0b11000000) != 0b10000000) {
183 return result(error_code::TOO_SHORT, pos);
184 }
185 // range check
186 uint32_t code_point = (leading_byte & 0b00011111) << 6 |
187 (uint8_t(data[pos + 1]) & 0b00111111);
188 if (code_point < 0x80) {
189 return result(error_code::OVERLONG, pos);
190 }
191 if constexpr (!match_system(big_endian)) {
192 code_point = uint32_t(u16_swap_bytes(uint16_t(code_point)));
193 }
194 *utf16_output++ = char16_t(code_point);
195 pos += 2;
196 } else if ((leading_byte & 0b11110000) == 0b11100000) {
197 // We have a three-byte UTF-8, it should become
198 // a single UTF-16 word.
199 if (pos + 2 >= len) {
200 return result(error_code::TOO_SHORT, pos);
201 } // minimal bound checking
202
203 if ((uint8_t(data[pos + 1]) & 0b11000000) != 0b10000000) {
204 return result(error_code::TOO_SHORT, pos);
205 }
206 if ((uint8_t(data[pos + 2]) & 0b11000000) != 0b10000000) {
207 return result(error_code::TOO_SHORT, pos);
208 }
209 // range check
210 uint32_t code_point = (leading_byte & 0b00001111) << 12 |
211 (uint8_t(data[pos + 1]) & 0b00111111) << 6 |
212 (uint8_t(data[pos + 2]) & 0b00111111);
213 if (code_point < 0x800) {
214 return result(error_code::OVERLONG, pos);
215 }
216 if (0xd7ff < code_point && code_point < 0xe000) {
217 return result(error_code::SURROGATE, pos);
218 }
219 if constexpr (!match_system(big_endian)) {
220 code_point = uint32_t(u16_swap_bytes(uint16_t(code_point)));
221 }
222 *utf16_output++ = char16_t(code_point);
223 pos += 3;
224 } else if ((leading_byte & 0b11111000) == 0b11110000) { // 0b11110000
225 // we have a 4-byte UTF-8 word.
226 if (pos + 3 >= len) {
227 return result(error_code::TOO_SHORT, pos);
228 } // minimal bound checking
229 if ((uint8_t(data[pos + 1]) & 0b11000000) != 0b10000000) {
230 return result(error_code::TOO_SHORT, pos);
231 }
232 if ((uint8_t(data[pos + 2]) & 0b11000000) != 0b10000000) {
233 return result(error_code::TOO_SHORT, pos);
234 }
235 if ((uint8_t(data[pos + 3]) & 0b11000000) != 0b10000000) {
236 return result(error_code::TOO_SHORT, pos);
237 }
238
239 // range check
240 uint32_t code_point = (leading_byte & 0b00000111) << 18 |
241 (uint8_t(data[pos + 1]) & 0b00111111) << 12 |
242 (uint8_t(data[pos + 2]) & 0b00111111) << 6 |
243 (uint8_t(data[pos + 3]) & 0b00111111);
244 if (code_point <= 0xffff) {
245 return result(error_code::OVERLONG, pos);
246 }
247 if (0x10ffff < code_point) {
248 return result(error_code::TOO_LARGE, pos);
249 }
250 code_point -= 0x10000;
251 uint16_t high_surrogate = uint16_t(0xD800 + (code_point >> 10));
252 uint16_t low_surrogate = uint16_t(0xDC00 + (code_point & 0x3FF));
253 if constexpr (!match_system(big_endian)) {
254 high_surrogate = u16_swap_bytes(high_surrogate);
255 low_surrogate = u16_swap_bytes(low_surrogate);
256 }
257 *utf16_output++ = char16_t(high_surrogate);
258 *utf16_output++ = char16_t(low_surrogate);
259 pos += 4;
260 } else {
261 // we either have too many continuation bytes or an invalid leading byte
262 if ((leading_byte & 0b11000000) == 0b10000000) {
263 return result(error_code::TOO_LONG, pos);
264 } else {
265 return result(error_code::HEADER_BITS, pos);
266 }
267 }
268 }
269 return result(error_code::SUCCESS, utf16_output - start);
270}
271
272/**
273 * When rewind_and_convert_with_errors is called, we are pointing at 'buf' and
274 * we have up to len input bytes left, and we encountered some error. It is
275 * possible that the error is at 'buf' exactly, but it could also be in the
276 * previous bytes (up to 3 bytes back).
277 *
278 * prior_bytes indicates how many bytes, prior to 'buf' may belong to the
279 * current memory section and can be safely accessed. We prior_bytes to access
280 * safely up to three bytes before 'buf'.
281 *
282 * The caller is responsible to ensure that len > 0.
283 *
284 * If the error is believed to have occurred prior to 'buf', the count value
285 * contain in the result will be SIZE_T - 1, SIZE_T - 2, or SIZE_T - 3.
286 */
287template <endianness endian>
288inline result rewind_and_convert_with_errors(size_t prior_bytes,
289 const char *buf, size_t len,
290 char16_t *utf16_output) {
291 size_t extra_len{0};
292 // We potentially need to go back in time and find a leading byte.
293 // In theory '3' would be sufficient, but sometimes the error can go back
294 // quite far.
295 size_t how_far_back = prior_bytes;
296 // size_t how_far_back = 3; // 3 bytes in the past + current position
297 // if(how_far_back >= prior_bytes) { how_far_back = prior_bytes; }
298 bool found_leading_bytes{false};
299 // important: it is i <= how_far_back and not 'i < how_far_back'.
300 for (size_t i = 0; i <= how_far_back; i++) {
301 unsigned char byte = buf[-static_cast<std::ptrdiff_t>(i)];
302 found_leading_bytes = ((byte & 0b11000000) != 0b10000000);
303 if (found_leading_bytes) {
304 if (i > 0 && byte < 128) {
305 // If we had to go back and the leading byte is ascii
306 // then we can stop right away.
307 return result(error_code::TOO_LONG, 0 - i + 1);
308 }
309 buf -= i;
310 extra_len = i;
311 break;
312 }
313 }
314 //
315 // It is possible for this function to return a negative count in its result.
316 // C++ Standard Section 18.1 defines size_t is in <cstddef> which is described
317 // in C Standard as <stddef.h>. C Standard Section 4.1.5 defines size_t as an
318 // unsigned integral type of the result of the sizeof operator
319 //
320 // An unsigned type will simply wrap round arithmetically (well defined).
321 //
322 if (!found_leading_bytes) {
323 // If how_far_back == 3, we may have four consecutive continuation bytes!!!
324 // [....] [continuation] [continuation] [continuation] | [buf is
325 // continuation] Or we possibly have a stream that does not start with a
326 // leading byte.
327 return result(error_code::TOO_LONG, 0 - how_far_back);
328 }
329 result res = convert_with_errors<endian>(buf, len + extra_len, utf16_output);
330 if (res.error) {
331 res.count -= extra_len;
332 }
333 return res;
334}
335
336} // namespace utf8_to_utf16
337} // unnamed namespace
338} // namespace scalar
339} // namespace simdutf
340
341#endif