1#ifndef SIMDUTF_UTF8_TO_UTF16_H
2#define SIMDUTF_UTF8_TO_UTF16_H
7namespace utf8_to_utf16 {
// Scalar UTF-8 -> UTF-16 conversion without detailed error reporting.
//
// Reads bytes from `data` at positions tracked by `pos` (bounded by `len`),
// writes UTF-16 code units through `utf16_output`, and on the path that
// reaches the final return yields the number of code units written
// (`utf16_output - start`). Each emitted unit is byte-swapped whenever
// `match_system(big_endian)` is false.
//
// NOTE(review): this excerpt is missing lines (the outer loop, the
// declarations of `pos`, `v1`, `v2` and `v`, the `pos` increments, the
// bodies of the failing checks, and closing braces). What is returned for
// invalid input cannot be confirmed from here.
9template <endianness big_endian,
typename InputPtr>
10#if SIMDUTF_CPLUSPLUS20
11 requires simdutf::detail::indexes_into_byte_like<InputPtr>
13simdutf_constexpr23
size_t convert(InputPtr data,
size_t len,
14 char16_t *utf16_output) {
// Remember where output began so the written count can be computed at the end.
16 char16_t *start{utf16_output};
18#if SIMDUTF_CPLUSPLUS23
// Fast path: only taken when at least 16 input bytes remain, so the two
// 8-byte loads below stay within `len`.
23 if (pos + 16 <= len) {
26 ::memcpy(&v1, data + pos,
sizeof(uint64_t));
28 ::memcpy(&v2, data + pos +
sizeof(uint64_t),
sizeof(uint64_t));
// The mask tests the high bit of every byte of `v`; zero means none of the
// covered bytes is >= 0x80. (`v` is presumably `v1 | v2` -- its definition
// is not visible here; confirm.)
30 if ((v & 0x8080808080808080) == 0) {
31 size_t final_pos = pos + 16;
// Widen each byte of the 16-byte run to one UTF-16 code unit.
32 while (pos < final_pos) {
33 *utf16_output++ = !match_system(big_endian)
34 ? char16_t(u16_swap_bytes(data[pos]))
35 : char16_t(data[pos]);
// General path: dispatch on the bit pattern of the leading byte.
43 uint8_t leading_byte = data[pos];
// 0xxxxxxx: a single byte maps to a single code unit.
44 if (leading_byte < 0b10000000) {
46 *utf16_output++ = !match_system(big_endian)
47 ? char16_t(u16_swap_bytes(leading_byte))
48 : char16_t(leading_byte);
50 }
// 110xxxxx: leading byte of a two-byte sequence.
else if ((leading_byte & 0b11100000) == 0b11000000) {
// The second byte must have the continuation form 10xxxxxx.
56 if ((data[pos + 1] & 0b11000000) != 0b10000000) {
// Combine 5 payload bits from the leading byte with 6 from the continuation.
61 (leading_byte & 0b00011111) << 6 | (data[pos + 1] & 0b00111111);
// A two-byte sequence is only accepted for values in [0x80, 0x7ff].
62 if (code_point < 0x80 || 0x7ff < code_point) {
65 if simdutf_constexpr (!match_system(big_endian)) {
66 code_point = uint32_t(u16_swap_bytes(uint16_t(code_point)));
68 *utf16_output++ = char16_t(code_point);
70 }
// 1110xxxx: leading byte of a three-byte sequence.
else if ((leading_byte & 0b11110000) == 0b11100000) {
// Both trailing bytes must have the continuation form 10xxxxxx.
77 if ((data[pos + 1] & 0b11000000) != 0b10000000) {
80 if ((data[pos + 2] & 0b11000000) != 0b10000000) {
// 4 + 6 + 6 payload bits.
84 uint32_t code_point = (leading_byte & 0b00001111) << 12 |
85 (data[pos + 1] & 0b00111111) << 6 |
86 (data[pos + 2] & 0b00111111);
// Accepted range is [0x800, 0xffff] minus the surrogate range
// (0xd800..0xdfff).
87 if (code_point < 0x800 || 0xffff < code_point ||
88 (0xd7ff < code_point && code_point < 0xe000)) {
91 if simdutf_constexpr (!match_system(big_endian)) {
92 code_point = uint32_t(u16_swap_bytes(uint16_t(code_point)));
94 *utf16_output++ = char16_t(code_point);
96 }
// 11110xxx: leading byte of a four-byte sequence.
else if ((leading_byte & 0b11111000) == 0b11110000) {
// All three trailing bytes must have the continuation form 10xxxxxx.
101 if ((data[pos + 1] & 0b11000000) != 0b10000000) {
104 if ((data[pos + 2] & 0b11000000) != 0b10000000) {
107 if ((data[pos + 3] & 0b11000000) != 0b10000000) {
// 3 + 6 + 6 + 6 payload bits.
112 uint32_t code_point = (leading_byte & 0b00000111) << 18 |
113 (data[pos + 1] & 0b00111111) << 12 |
114 (data[pos + 2] & 0b00111111) << 6 |
115 (data[pos + 3] & 0b00111111);
// Accepted range is [0x10000, 0x10ffff].
116 if (code_point <= 0xffff || 0x10ffff < code_point) {
// Encode as a surrogate pair: subtract 0x10000, then the upper bits
// (value >> 10) go to the 0xD800-based unit and the low 10 bits to the
// 0xDC00-based unit.
119 code_point -= 0x10000;
120 uint16_t high_surrogate = uint16_t(0xD800 + (code_point >> 10));
121 uint16_t low_surrogate = uint16_t(0xDC00 + (code_point & 0x3FF));
122 if simdutf_constexpr (!match_system(big_endian)) {
123 high_surrogate = u16_swap_bytes(high_surrogate);
124 low_surrogate = u16_swap_bytes(low_surrogate);
126 *utf16_output++ = char16_t(high_surrogate);
127 *utf16_output++ = char16_t(low_surrogate);
// Number of char16_t units written since `start`.
133 return utf16_output - start;
// Scalar UTF-8 -> UTF-16 conversion that reports the kind and position of
// the first error.
//
// On success returns `result(error_code::SUCCESS, n)` where `n` is the
// number of UTF-16 code units written (`utf16_output - start`). On each
// visible failure returns `result(<error>, pos)`, where `pos` is the input
// index of the leading byte of the offending sequence. Each emitted unit is
// byte-swapped whenever `match_system(big_endian)` is false.
//
// NOTE(review): this excerpt is missing lines (the outer loop, the
// declarations of `pos`, `v1`, `v2` and `v`, the `pos` increments, and
// closing braces); the comments below describe only what is visible.
136template <endianness big_endian,
typename InputPtr>
137#if SIMDUTF_CPLUSPLUS20
138 requires simdutf::detail::indexes_into_byte_like<InputPtr>
140simdutf_constexpr23 result convert_with_errors(InputPtr data,
size_t len,
141 char16_t *utf16_output) {
// Remember where output began so the written count can be computed at the end.
143 char16_t *start{utf16_output};
145#if SIMDUTF_CPLUSPLUS23
// Fast path: only taken when at least 16 input bytes remain, so the two
// 8-byte loads below stay within `len`.
150 if (pos + 16 <= len) {
153 ::memcpy(&v1, data + pos,
sizeof(uint64_t));
155 ::memcpy(&v2, data + pos +
sizeof(uint64_t),
sizeof(uint64_t));
// The mask tests the high bit of every byte of `v`; zero means none of the
// covered bytes is >= 0x80. (`v` is presumably `v1 | v2` -- its definition
// is not visible here; confirm.)
157 if ((v & 0x8080808080808080) == 0) {
158 size_t final_pos = pos + 16;
159 while (pos < final_pos) {
// Widen the byte to a code unit, swapping when the target byte order does
// not match the system's. (The assignment target of the expression below is
// on a line not visible here -- presumably `*utf16_output++ =`; confirm.)
160 const char16_t byte = uint8_t(data[pos]);
162 !match_system(big_endian) ? u16_swap_bytes(
byte) : byte;
// General path: dispatch on the bit pattern of the leading byte.
170 auto leading_byte = uint8_t(data[pos]);
// 0xxxxxxx: a single byte maps to a single code unit.
171 if (leading_byte < 0b10000000) {
173 *utf16_output++ = !match_system(big_endian)
174 ? char16_t(u16_swap_bytes(leading_byte))
175 : char16_t(leading_byte);
177 }
// 110xxxxx: leading byte of a two-byte sequence.
else if ((leading_byte & 0b11100000) == 0b11000000) {
// The input ends before the second byte of the sequence.
180 if (pos + 1 >= len) {
181 return result(error_code::TOO_SHORT, pos);
// A second byte that is not of the form 10xxxxxx is also reported as
// TOO_SHORT.
183 if ((uint8_t(data[pos + 1]) & 0b11000000) != 0b10000000) {
184 return result(error_code::TOO_SHORT, pos);
// Combine 5 payload bits from the leading byte with 6 from the continuation.
187 uint32_t code_point = (leading_byte & 0b00011111) << 6 |
188 (uint8_t(data[pos + 1]) & 0b00111111);
// Values outside [0x80, 0x7ff] are reported as OVERLONG.
189 if (code_point < 0x80 || 0x7ff < code_point) {
190 return result(error_code::OVERLONG, pos);
192 if simdutf_constexpr (!match_system(big_endian)) {
193 code_point = uint32_t(u16_swap_bytes(uint16_t(code_point)));
195 *utf16_output++ = char16_t(code_point);
197 }
// 1110xxxx: leading byte of a three-byte sequence.
else if ((leading_byte & 0b11110000) == 0b11100000) {
// The input ends before the third byte of the sequence.
200 if (pos + 2 >= len) {
201 return result(error_code::TOO_SHORT, pos);
// Either trailing byte not of the form 10xxxxxx is reported as TOO_SHORT.
204 if ((uint8_t(data[pos + 1]) & 0b11000000) != 0b10000000) {
205 return result(error_code::TOO_SHORT, pos);
207 if ((uint8_t(data[pos + 2]) & 0b11000000) != 0b10000000) {
208 return result(error_code::TOO_SHORT, pos);
// 4 + 6 + 6 payload bits.
211 uint32_t code_point = (leading_byte & 0b00001111) << 12 |
212 (uint8_t(data[pos + 1]) & 0b00111111) << 6 |
213 (uint8_t(data[pos + 2]) & 0b00111111);
// Values outside [0x800, 0xffff] are reported as OVERLONG.
214 if ((code_point < 0x800) || (0xffff < code_point)) {
215 return result(error_code::OVERLONG, pos);
// Values in the surrogate range 0xd800..0xdfff are reported as SURROGATE.
217 if (0xd7ff < code_point && code_point < 0xe000) {
218 return result(error_code::SURROGATE, pos);
220 if simdutf_constexpr (!match_system(big_endian)) {
221 code_point = uint32_t(u16_swap_bytes(uint16_t(code_point)));
223 *utf16_output++ = char16_t(code_point);
225 }
// 11110xxx: leading byte of a four-byte sequence.
else if ((leading_byte & 0b11111000) == 0b11110000) {
// The input ends before the fourth byte of the sequence.
227 if (pos + 3 >= len) {
228 return result(error_code::TOO_SHORT, pos);
// Any trailing byte not of the form 10xxxxxx is reported as TOO_SHORT.
230 if ((uint8_t(data[pos + 1]) & 0b11000000) != 0b10000000) {
231 return result(error_code::TOO_SHORT, pos);
233 if ((uint8_t(data[pos + 2]) & 0b11000000) != 0b10000000) {
234 return result(error_code::TOO_SHORT, pos);
236 if ((uint8_t(data[pos + 3]) & 0b11000000) != 0b10000000) {
237 return result(error_code::TOO_SHORT, pos);
// 3 + 6 + 6 + 6 payload bits.
241 uint32_t code_point = (leading_byte & 0b00000111) << 18 |
242 (uint8_t(data[pos + 1]) & 0b00111111) << 12 |
243 (uint8_t(data[pos + 2]) & 0b00111111) << 6 |
244 (uint8_t(data[pos + 3]) & 0b00111111);
// Values that would have fit in 16 bits are reported as OVERLONG.
245 if (code_point <= 0xffff) {
246 return result(error_code::OVERLONG, pos);
// Values above 0x10ffff are reported as TOO_LARGE.
248 if (0x10ffff < code_point) {
249 return result(error_code::TOO_LARGE, pos);
// Encode as a surrogate pair: subtract 0x10000, then the upper bits
// (value >> 10) go to the 0xD800-based unit and the low 10 bits to the
// 0xDC00-based unit.
251 code_point -= 0x10000;
252 uint16_t high_surrogate = uint16_t(0xD800 + (code_point >> 10));
253 uint16_t low_surrogate = uint16_t(0xDC00 + (code_point & 0x3FF));
254 if simdutf_constexpr (!match_system(big_endian)) {
255 high_surrogate = u16_swap_bytes(high_surrogate);
256 low_surrogate = u16_swap_bytes(low_surrogate);
258 *utf16_output++ = char16_t(high_surrogate);
259 *utf16_output++ = char16_t(low_surrogate);
// A byte of the form 10xxxxxx in leading position is reported as TOO_LONG;
// the following return reports HEADER_BITS. (The enclosing `else` is not
// visible; presumably this is the branch for leading bytes matching none of
// the patterns above -- confirm.)
263 if ((leading_byte & 0b11000000) == 0b10000000) {
264 return result(error_code::TOO_LONG, pos);
266 return result(error_code::HEADER_BITS, pos);
// Success: the second field carries the number of char16_t units written.
270 return result(error_code::SUCCESS, utf16_output - start);
// Looks backwards from `buf` (at most `prior_bytes` bytes) for a byte that
// is not a UTF-8 continuation byte, then runs `convert_with_errors<endian>`
// and adjusts the returned count by `extra_len`.
//
// NOTE(review): this excerpt is missing lines, including the declaration and
// computation of `extra_len`, the loop exit, any adjustment of `buf`, and
// the end of the function (final return and closing brace). Only the visible
// statements are described.
288template <endianness endian>
289inline result rewind_and_convert_with_errors(
size_t prior_bytes,
290 const char *buf,
size_t len,
291 char16_t *utf16_output) {
// Upper bound on how many bytes before `buf` may be inspected.
296 size_t how_far_back = prior_bytes;
299 bool found_leading_bytes{
false};
// Inspect buf[0], buf[-1], ..., buf[-how_far_back] (bounds inclusive).
301 for (
size_t i = 0; i <= how_far_back; i++) {
302 unsigned char byte = buf[-
static_cast<std::ptrdiff_t
>(i)];
// True when the byte is anything other than a continuation byte (10xxxxxx).
303 found_leading_bytes = ((
byte & 0b11000000) != 0b10000000);
304 if (found_leading_bytes) {
// A byte below 128 found after stepping back at least once is reported as
// TOO_LONG. `0 - i + 1` is evaluated in size_t arithmetic, so it wraps for
// i > 1. NOTE(review): presumably callers interpret this as an offset
// relative to `buf` -- confirm.
305 if (i > 0 &&
byte < 128) {
308 return result(error_code::TOO_LONG, 0 - i + 1);
// If the last byte inspected was still a continuation byte, report TOO_LONG
// with the size_t value `0 - how_far_back`.
323 if (!found_leading_bytes) {
328 return result(error_code::TOO_LONG, 0 - how_far_back);
// Convert with the length extended by `extra_len`. NOTE(review): presumably
// `buf` was moved back by `extra_len` on a line not visible here -- confirm.
330 result res = convert_with_errors<endian>(buf, len + extra_len, utf16_output);
// Subtract `extra_len` from the count returned by the conversion.
332 res.count -= extra_len;