simdutf 8.0.0
Unicode at GB/s.
Loading...
Searching...
No Matches
valid_utf8_to_utf16.h
1#ifndef SIMDUTF_VALID_UTF8_TO_UTF16_H
2#define SIMDUTF_VALID_UTF8_TO_UTF16_H
3
4namespace simdutf {
5namespace scalar {
6namespace {
7namespace utf8_to_utf16 {
8
9template <endianness big_endian, typename InputPtr>
10#if SIMDUTF_CPLUSPLUS20
11 requires simdutf::detail::indexes_into_byte_like<InputPtr>
12#endif
13simdutf_constexpr23 size_t convert_valid(InputPtr data, size_t len,
14 char16_t *utf16_output) {
15 size_t pos = 0;
16 char16_t *start{utf16_output};
17 while (pos < len) {
18#if SIMDUTF_CPLUSPLUS23
19 if !consteval
20#endif
21 { // try to convert the next block of 8 ASCII bytes
22 if (pos + 8 <= len) { // if it is safe to read 8 more bytes, check that
23 // they are ascii
24 uint64_t v;
25 ::memcpy(&v, data + pos, sizeof(uint64_t));
26 if ((v & 0x8080808080808080) == 0) {
27 size_t final_pos = pos + 8;
28 while (pos < final_pos) {
29 const char16_t byte = uint8_t(data[pos]);
30 *utf16_output++ =
31 !match_system(big_endian) ? u16_swap_bytes(byte) : byte;
32 pos++;
33 }
34 continue;
35 }
36 }
37 }
38
39 auto leading_byte = uint8_t(data[pos]); // leading byte
40 if (leading_byte < 0b10000000) {
41 // converting one ASCII byte !!!
42 *utf16_output++ = !match_system(big_endian)
43 ? char16_t(u16_swap_bytes(leading_byte))
44 : char16_t(leading_byte);
45 pos++;
46 } else if ((leading_byte & 0b11100000) == 0b11000000) {
47 // We have a two-byte UTF-8, it should become
48 // a single UTF-16 word.
49 if (pos + 1 >= len) {
50 break;
51 } // minimal bound checking
52 uint16_t code_point = uint16_t(((leading_byte & 0b00011111) << 6) |
53 (uint8_t(data[pos + 1]) & 0b00111111));
54 if simdutf_constexpr (!match_system(big_endian)) {
55 code_point = u16_swap_bytes(uint16_t(code_point));
56 }
57 *utf16_output++ = char16_t(code_point);
58 pos += 2;
59 } else if ((leading_byte & 0b11110000) == 0b11100000) {
60 // We have a three-byte UTF-8, it should become
61 // a single UTF-16 word.
62 if (pos + 2 >= len) {
63 break;
64 } // minimal bound checking
65 uint16_t code_point =
66 uint16_t(((leading_byte & 0b00001111) << 12) |
67 ((uint8_t(data[pos + 1]) & 0b00111111) << 6) |
68 (uint8_t(data[pos + 2]) & 0b00111111));
69 if simdutf_constexpr (!match_system(big_endian)) {
70 code_point = u16_swap_bytes(uint16_t(code_point));
71 }
72 *utf16_output++ = char16_t(code_point);
73 pos += 3;
74 } else if ((leading_byte & 0b11111000) == 0b11110000) { // 0b11110000
75 // we have a 4-byte UTF-8 word.
76 if (pos + 3 >= len) {
77 break;
78 } // minimal bound checking
79 uint32_t code_point = ((leading_byte & 0b00000111) << 18) |
80 ((uint8_t(data[pos + 1]) & 0b00111111) << 12) |
81 ((uint8_t(data[pos + 2]) & 0b00111111) << 6) |
82 (uint8_t(data[pos + 3]) & 0b00111111);
83 code_point -= 0x10000;
84 uint16_t high_surrogate = uint16_t(0xD800 + (code_point >> 10));
85 uint16_t low_surrogate = uint16_t(0xDC00 + (code_point & 0x3FF));
86 if simdutf_constexpr (!match_system(big_endian)) {
87 high_surrogate = u16_swap_bytes(high_surrogate);
88 low_surrogate = u16_swap_bytes(low_surrogate);
89 }
90 *utf16_output++ = char16_t(high_surrogate);
91 *utf16_output++ = char16_t(low_surrogate);
92 pos += 4;
93 } else {
94 // we may have a continuation but we do not do error checking
95 return 0;
96 }
97 }
98 return utf16_output - start;
99}
100
101} // namespace utf8_to_utf16
102} // unnamed namespace
103} // namespace scalar
104} // namespace simdutf
105
106#endif