1#ifndef SIMDUTF_UTF8_TO_UTF16_H
2#define SIMDUTF_UTF8_TO_UTF16_H
7namespace utf8_to_utf16 {
// Scalar UTF-8 -> UTF-16 conversion without detailed error reporting.
//
// Template parameters:
//   big_endian - selects the byte order of the produced UTF-16 code units;
//                code units are byte-swapped whenever
//                match_system(big_endian) is false.
//   InputPtr   - pointer-like type indexed as data[pos]; under C++20 it is
//                constrained by simdutf::detail::indexes_into_byte_like.
// Parameters:
//   data         - UTF-8 input.
//   len          - number of input bytes.
//   utf16_output - destination for the UTF-16 code units.
// Returns: the number of char16_t code units written
//          (utf16_output - start).
//
// NOTE(review): this listing appears to be missing lines (the declaration of
// `pos`, `v1`, `v2` and `v`, the enclosing loop, the bodies of the validation
// branches and the closing braces are not visible). What happens when a
// validation check fails cannot be determined from here - confirm against the
// complete file.
9template <endianness big_endian,
typename InputPtr>
10#if SIMDUTF_CPLUSPLUS20
11 requires simdutf::detail::indexes_into_byte_like<InputPtr>
13simdutf_constexpr23
size_t convert(InputPtr data,
size_t len,
14 char16_t *utf16_output) {
// Remember where the output began so the written length can be returned.
16 char16_t *start{utf16_output};
18#if SIMDUTF_CPLUSPLUS23
// ASCII fast path: only attempted when 16 more input bytes are available.
23 if (pos + 16 <= len) {
// Load the 16 bytes as two 64-bit words; memcpy avoids alignment assumptions.
26 ::memcpy(&v1, data + pos,
sizeof(uint64_t));
28 ::memcpy(&v2, data + pos +
sizeof(uint64_t),
sizeof(uint64_t));
// No byte has its high bit set, i.e. every byte tested is ASCII.
// `v` presumably combines v1 and v2 (its definition is not visible here).
30 if ((v & 0x8080808080808080) == 0) {
31 size_t final_pos = pos + 16;
32 while (pos < final_pos) {
// Each ASCII byte becomes one code unit, byte-swapped if required.
33 *utf16_output++ = !match_system(big_endian)
34 ? char16_t(u16_swap_bytes(data[pos]))
35 : char16_t(data[pos]);
// General path: decode one sequence starting at its leading byte.
43 uint8_t leading_byte = data[pos];
// 0xxxxxxx: single-byte (ASCII) sequence.
44 if (leading_byte < 0b10000000) {
46 *utf16_output++ = !match_system(big_endian)
47 ? char16_t(u16_swap_bytes(leading_byte))
48 : char16_t(leading_byte);
50 }
// 110xxxxx: leading byte of a two-byte sequence.
else if ((leading_byte & 0b11100000) == 0b11000000) {
// The second byte must be a continuation byte (10xxxxxx).
56 if ((data[pos + 1] & 0b11000000) != 0b10000000) {
// Assemble 5 payload bits from the leading byte and 6 from the continuation.
61 (leading_byte & 0b00011111) << 6 | (data[pos + 1] & 0b00111111);
// A value below 0x80 fits in one byte, so this two-byte form is overlong.
62 if (code_point < 0x80) {
65 if constexpr (!match_system(big_endian)) {
66 code_point = uint32_t(u16_swap_bytes(uint16_t(code_point)));
68 *utf16_output++ = char16_t(code_point);
70 }
// 1110xxxx: leading byte of a three-byte sequence.
else if ((leading_byte & 0b11110000) == 0b11100000) {
// Both following bytes must be continuation bytes.
77 if ((data[pos + 1] & 0b11000000) != 0b10000000) {
80 if ((data[pos + 2] & 0b11000000) != 0b10000000) {
// Assemble 4 + 6 + 6 payload bits.
84 uint32_t code_point = (leading_byte & 0b00001111) << 12 |
85 (data[pos + 1] & 0b00111111) << 6 |
86 (data[pos + 2] & 0b00111111);
// Reject overlong forms (< 0x800) and the surrogate range 0xD800..0xDFFF.
87 if (code_point < 0x800 || (0xd7ff < code_point && code_point < 0xe000)) {
90 if constexpr (!match_system(big_endian)) {
91 code_point = uint32_t(u16_swap_bytes(uint16_t(code_point)));
93 *utf16_output++ = char16_t(code_point);
95 }
// 11110xxx: leading byte of a four-byte sequence.
else if ((leading_byte & 0b11111000) == 0b11110000) {
// All three following bytes must be continuation bytes.
100 if ((data[pos + 1] & 0b11000000) != 0b10000000) {
103 if ((data[pos + 2] & 0b11000000) != 0b10000000) {
106 if ((data[pos + 3] & 0b11000000) != 0b10000000) {
// Assemble 3 + 6 + 6 + 6 payload bits.
111 uint32_t code_point = (leading_byte & 0b00000111) << 18 |
112 (data[pos + 1] & 0b00111111) << 12 |
113 (data[pos + 2] & 0b00111111) << 6 |
114 (data[pos + 3] & 0b00111111);
// Reject overlong forms (<= 0xFFFF) and values above 0x10FFFF.
115 if (code_point <= 0xffff || 0x10ffff < code_point) {
// Split the supplementary code point into a UTF-16 surrogate pair:
// the high 10 bits go to 0xD800 + ..., the low 10 bits to 0xDC00 + ...
118 code_point -= 0x10000;
119 uint16_t high_surrogate = uint16_t(0xD800 + (code_point >> 10));
120 uint16_t low_surrogate = uint16_t(0xDC00 + (code_point & 0x3FF));
// Each half of the pair is byte-swapped independently.
121 if constexpr (!match_system(big_endian)) {
122 high_surrogate = u16_swap_bytes(high_surrogate);
123 low_surrogate = u16_swap_bytes(low_surrogate);
125 *utf16_output++ = char16_t(high_surrogate);
126 *utf16_output++ = char16_t(low_surrogate);
// Number of UTF-16 code units produced.
132 return utf16_output - start;
// Scalar UTF-8 -> UTF-16 conversion that reports the kind and position of the
// first error.
//
// Template parameters:
//   big_endian - selects the byte order of the produced UTF-16 code units;
//                code units are byte-swapped whenever
//                match_system(big_endian) is false.
//   InputPtr   - pointer-like type indexed as data[pos]; under C++20 it is
//                constrained by simdutf::detail::indexes_into_byte_like.
// Parameters:
//   data         - UTF-8 input.
//   len          - number of input bytes.
//   utf16_output - destination for the UTF-16 code units.
// Returns:
//   on success: result(error_code::SUCCESS, number of code units written);
//   on failure: result(<error>, pos), where pos is the input index of the
//               leading byte of the offending sequence. Errors produced:
//     TOO_SHORT   - the sequence is cut off by the end of the input, or a
//                   required continuation byte is missing;
//     OVERLONG    - the value could have used a shorter encoding;
//     SURROGATE   - a three-byte sequence decodes into 0xD800..0xDFFF;
//     TOO_LARGE   - a four-byte sequence decodes above 0x10FFFF;
//     TOO_LONG    - a continuation byte appears where a leading byte is
//                   expected;
//     HEADER_BITS - the leading byte matches no valid pattern.
//
// NOTE(review): this listing appears to be missing lines (the declaration of
// `pos`, `v1`, `v2` and `v`, the enclosing loop, position updates and closing
// braces are not visible) - confirm against the complete file.
135template <endianness big_endian,
typename InputPtr>
136#if SIMDUTF_CPLUSPLUS20
137 requires simdutf::detail::indexes_into_byte_like<InputPtr>
139simdutf_constexpr23 result convert_with_errors(InputPtr data,
size_t len,
140 char16_t *utf16_output) {
// Remember where the output began so the written length can be returned.
142 char16_t *start{utf16_output};
144#if SIMDUTF_CPLUSPLUS23
// ASCII fast path: only attempted when 16 more input bytes are available.
149 if (pos + 16 <= len) {
// Load the 16 bytes as two 64-bit words; memcpy avoids alignment assumptions.
152 ::memcpy(&v1, data + pos,
sizeof(uint64_t));
154 ::memcpy(&v2, data + pos +
sizeof(uint64_t),
sizeof(uint64_t));
// No byte has its high bit set, i.e. every byte tested is ASCII.
// `v` presumably combines v1 and v2 (its definition is not visible here).
156 if ((v & 0x8080808080808080) == 0) {
157 size_t final_pos = pos + 16;
158 while (pos < final_pos) {
// Widen the byte to a code unit, then byte-swap it if required.
// NOTE(review): the line that stores the value is not visible here.
159 const char16_t byte = uint8_t(data[pos]);
161 !match_system(big_endian) ? u16_swap_bytes(
byte) : byte;
// General path: decode one sequence starting at its leading byte.
169 auto leading_byte = uint8_t(data[pos]);
// 0xxxxxxx: single-byte (ASCII) sequence.
170 if (leading_byte < 0b10000000) {
172 *utf16_output++ = !match_system(big_endian)
173 ? char16_t(u16_swap_bytes(leading_byte))
174 : char16_t(leading_byte);
176 }
// 110xxxxx: leading byte of a two-byte sequence.
else if ((leading_byte & 0b11100000) == 0b11000000) {
// Bounds check: one more byte is needed.
179 if (pos + 1 >= len) {
180 return result(error_code::TOO_SHORT, pos);
// The second byte must be a continuation byte (10xxxxxx).
182 if ((uint8_t(data[pos + 1]) & 0b11000000) != 0b10000000) {
183 return result(error_code::TOO_SHORT, pos);
// Assemble 5 payload bits from the leading byte and 6 from the continuation.
186 uint32_t code_point = (leading_byte & 0b00011111) << 6 |
187 (uint8_t(data[pos + 1]) & 0b00111111);
// A value below 0x80 fits in one byte, so this two-byte form is overlong.
188 if (code_point < 0x80) {
189 return result(error_code::OVERLONG, pos);
191 if constexpr (!match_system(big_endian)) {
192 code_point = uint32_t(u16_swap_bytes(uint16_t(code_point)));
194 *utf16_output++ = char16_t(code_point);
196 }
// 1110xxxx: leading byte of a three-byte sequence.
else if ((leading_byte & 0b11110000) == 0b11100000) {
// Bounds check: two more bytes are needed.
199 if (pos + 2 >= len) {
200 return result(error_code::TOO_SHORT, pos);
// Both following bytes must be continuation bytes.
203 if ((uint8_t(data[pos + 1]) & 0b11000000) != 0b10000000) {
204 return result(error_code::TOO_SHORT, pos);
206 if ((uint8_t(data[pos + 2]) & 0b11000000) != 0b10000000) {
207 return result(error_code::TOO_SHORT, pos);
// Assemble 4 + 6 + 6 payload bits.
210 uint32_t code_point = (leading_byte & 0b00001111) << 12 |
211 (uint8_t(data[pos + 1]) & 0b00111111) << 6 |
212 (uint8_t(data[pos + 2]) & 0b00111111);
// Values below 0x800 could have been encoded in fewer bytes.
213 if (code_point < 0x800) {
214 return result(error_code::OVERLONG, pos);
// 0xD800..0xDFFF is the surrogate range and is rejected here.
216 if (0xd7ff < code_point && code_point < 0xe000) {
217 return result(error_code::SURROGATE, pos);
219 if constexpr (!match_system(big_endian)) {
220 code_point = uint32_t(u16_swap_bytes(uint16_t(code_point)));
222 *utf16_output++ = char16_t(code_point);
224 }
// 11110xxx: leading byte of a four-byte sequence.
else if ((leading_byte & 0b11111000) == 0b11110000) {
// Bounds check: three more bytes are needed.
226 if (pos + 3 >= len) {
227 return result(error_code::TOO_SHORT, pos);
// All three following bytes must be continuation bytes.
229 if ((uint8_t(data[pos + 1]) & 0b11000000) != 0b10000000) {
230 return result(error_code::TOO_SHORT, pos);
232 if ((uint8_t(data[pos + 2]) & 0b11000000) != 0b10000000) {
233 return result(error_code::TOO_SHORT, pos);
235 if ((uint8_t(data[pos + 3]) & 0b11000000) != 0b10000000) {
236 return result(error_code::TOO_SHORT, pos);
// Assemble 3 + 6 + 6 + 6 payload bits.
240 uint32_t code_point = (leading_byte & 0b00000111) << 18 |
241 (uint8_t(data[pos + 1]) & 0b00111111) << 12 |
242 (uint8_t(data[pos + 2]) & 0b00111111) << 6 |
243 (uint8_t(data[pos + 3]) & 0b00111111);
// Values up to 0xFFFF could have been encoded in fewer bytes.
244 if (code_point <= 0xffff) {
245 return result(error_code::OVERLONG, pos);
// Values above 0x10FFFF are rejected as too large.
247 if (0x10ffff < code_point) {
248 return result(error_code::TOO_LARGE, pos);
// Split the supplementary code point into a UTF-16 surrogate pair:
// the high 10 bits go to 0xD800 + ..., the low 10 bits to 0xDC00 + ...
250 code_point -= 0x10000;
251 uint16_t high_surrogate = uint16_t(0xD800 + (code_point >> 10));
252 uint16_t low_surrogate = uint16_t(0xDC00 + (code_point & 0x3FF));
// Each half of the pair is byte-swapped independently.
253 if constexpr (!match_system(big_endian)) {
254 high_surrogate = u16_swap_bytes(high_surrogate);
255 low_surrogate = u16_swap_bytes(low_surrogate);
257 *utf16_output++ = char16_t(high_surrogate);
258 *utf16_output++ = char16_t(low_surrogate);
// No valid leading-byte pattern matched: a continuation byte (10xxxxxx) in
// leading position is reported as TOO_LONG, anything else as HEADER_BITS.
262 if ((leading_byte & 0b11000000) == 0b10000000) {
263 return result(error_code::TOO_LONG, pos);
265 return result(error_code::HEADER_BITS, pos);
// Success: report how many UTF-16 code units were written.
269 return result(error_code::SUCCESS, utf16_output - start);
// Looks backwards from `buf` for a byte that is not a continuation byte and
// then runs convert_with_errors<endian>, so that conversion can be resumed
// from a position that may lie inside a multi-byte sequence.
//
// Parameters:
//   prior_bytes  - how many bytes before `buf` may be inspected; the search
//                  reads buf[0], buf[-1], ..., buf[-prior_bytes].
//   buf          - current position in the UTF-8 input.
//   len          - number of bytes available starting at `buf`.
//   utf16_output - destination for the UTF-16 code units.
// Returns: a `result`. The counts in the early TOO_LONG returns are computed
// as `0 - ...` in size_t arithmetic, so they wrap around (unsigned
// arithmetic is well defined); presumably callers interpret them as offsets
// relative to `buf` - verify against the callers.
//
// NOTE(review): this listing appears to be missing lines (the declaration and
// assignment of `extra_len`, any adjustment of `buf`, closing braces and the
// end of the function are not visible) - confirm against the complete file.
287template <endianness endian>
288inline result rewind_and_convert_with_errors(
size_t prior_bytes,
289 const char *buf,
size_t len,
290 char16_t *utf16_output) {
295 size_t how_far_back = prior_bytes;
298 bool found_leading_bytes{
false};
// The bound is inclusive (i <= how_far_back): the byte at `buf` itself
// (i == 0) is examined in addition to the prior_bytes bytes before it.
300 for (
size_t i = 0; i <= how_far_back; i++) {
// Read the byte i positions before `buf`.
301 unsigned char byte = buf[-
static_cast<std::ptrdiff_t
>(i)];
// Anything that is not 10xxxxxx is treated as a leading byte.
302 found_leading_bytes = ((
byte & 0b11000000) != 0b10000000);
303 if (found_leading_bytes) {
// An ASCII byte found after stepping back means the bytes between it and
// `buf` are continuation bytes with no leading byte: report TOO_LONG.
304 if (i > 0 &&
byte < 128) {
307 return result(error_code::TOO_LONG, 0 - i + 1);
// Only continuation bytes were seen in the whole inspected window.
322 if (!found_leading_bytes) {
327 return result(error_code::TOO_LONG, 0 - how_far_back);
// Convert with the input length extended by extra_len, then subtract
// extra_len from the reported count.
// NOTE(review): any condition guarding the subtraction is not visible here.
329 result res = convert_with_errors<endian>(buf, len + extra_len, utf16_output);
331 res.count -= extra_len;