1#ifndef SIMDUTF_VALID_UTF8_TO_UTF16_H
2#define SIMDUTF_VALID_UTF8_TO_UTF16_H
7namespace utf8_to_utf16 {
9template <endianness big_endian,
typename InputPtr>
10#if SIMDUTF_CPLUSPLUS20
11 requires simdutf::detail::indexes_into_byte_like<InputPtr>
13simdutf_constexpr23
size_t convert_valid(InputPtr data,
size_t len,
14 char16_t *utf16_output) {
16 char16_t *start{utf16_output};
18#if SIMDUTF_CPLUSPLUS23
25 ::memcpy(&v, data + pos,
sizeof(uint64_t));
26 if ((v & 0x8080808080808080) == 0) {
27 size_t final_pos = pos + 8;
28 while (pos < final_pos) {
29 const char16_t byte = uint8_t(data[pos]);
31 !match_system(big_endian) ? u16_swap_bytes(
byte) : byte;
39 auto leading_byte = uint8_t(data[pos]);
40 if (leading_byte < 0b10000000) {
42 *utf16_output++ = !match_system(big_endian)
43 ? char16_t(u16_swap_bytes(leading_byte))
44 : char16_t(leading_byte);
46 }
else if ((leading_byte & 0b11100000) == 0b11000000) {
52 uint16_t code_point = uint16_t(((leading_byte & 0b00011111) << 6) |
53 (uint8_t(data[pos + 1]) & 0b00111111));
54 if simdutf_constexpr (!match_system(big_endian)) {
55 code_point = u16_swap_bytes(uint16_t(code_point));
57 *utf16_output++ = char16_t(code_point);
59 }
else if ((leading_byte & 0b11110000) == 0b11100000) {
66 uint16_t(((leading_byte & 0b00001111) << 12) |
67 ((uint8_t(data[pos + 1]) & 0b00111111) << 6) |
68 (uint8_t(data[pos + 2]) & 0b00111111));
69 if simdutf_constexpr (!match_system(big_endian)) {
70 code_point = u16_swap_bytes(uint16_t(code_point));
72 *utf16_output++ = char16_t(code_point);
74 }
else if ((leading_byte & 0b11111000) == 0b11110000) {
79 uint32_t code_point = ((leading_byte & 0b00000111) << 18) |
80 ((uint8_t(data[pos + 1]) & 0b00111111) << 12) |
81 ((uint8_t(data[pos + 2]) & 0b00111111) << 6) |
82 (uint8_t(data[pos + 3]) & 0b00111111);
83 code_point -= 0x10000;
84 uint16_t high_surrogate = uint16_t(0xD800 + (code_point >> 10));
85 uint16_t low_surrogate = uint16_t(0xDC00 + (code_point & 0x3FF));
86 if simdutf_constexpr (!match_system(big_endian)) {
87 high_surrogate = u16_swap_bytes(high_surrogate);
88 low_surrogate = u16_swap_bytes(low_surrogate);
90 *utf16_output++ = char16_t(high_surrogate);
91 *utf16_output++ = char16_t(low_surrogate);
98 return utf16_output - start;