simdutf 8.0.0
Unicode at GB/s.
Loading...
Searching...
No Matches
valid_utf16_to_utf8.h
1#ifndef SIMDUTF_VALID_UTF16_TO_UTF8_H
2#define SIMDUTF_VALID_UTF16_TO_UTF8_H
3
4namespace simdutf {
5namespace scalar {
6namespace {
7namespace utf16_to_utf8 {
8
9template <endianness big_endian, typename InputPtr, typename OutputPtr>
10#if SIMDUTF_CPLUSPLUS20
11 requires(simdutf::detail::indexes_into_utf16<InputPtr> &&
12 simdutf::detail::index_assignable_from_char<OutputPtr>)
13#endif
14simdutf_constexpr23 size_t convert_valid(InputPtr data, size_t len,
15 OutputPtr utf8_output) {
16 size_t pos = 0;
17 auto start = utf8_output;
18 while (pos < len) {
19#if SIMDUTF_CPLUSPLUS23
20 if !consteval
21#endif
22 {
23 // try to convert the next block of 4 ASCII characters
24 if (pos + 4 <= len) { // if it is safe to read 8 more bytes, check that
25 // they are ascii
26 uint64_t v;
27 ::memcpy(&v, data + pos, sizeof(uint64_t));
28 if simdutf_constexpr (!match_system(big_endian)) {
29 v = (v >> 8) | (v << (64 - 8));
30 }
31 if ((v & 0xFF80FF80FF80FF80) == 0) {
32 size_t final_pos = pos + 4;
33 while (pos < final_pos) {
34 *utf8_output++ = !match_system(big_endian)
35 ? char(u16_swap_bytes(data[pos]))
36 : char(data[pos]);
37 pos++;
38 }
39 continue;
40 }
41 }
42 }
43
44 uint16_t word =
45 !match_system(big_endian) ? u16_swap_bytes(data[pos]) : data[pos];
46 if ((word & 0xFF80) == 0) {
47 // will generate one UTF-8 bytes
48 *utf8_output++ = char(word);
49 pos++;
50 } else if ((word & 0xF800) == 0) {
51 // will generate two UTF-8 bytes
52 // we have 0b110XXXXX 0b10XXXXXX
53 *utf8_output++ = char((word >> 6) | 0b11000000);
54 *utf8_output++ = char((word & 0b111111) | 0b10000000);
55 pos++;
56 } else if ((word & 0xF800) != 0xD800) {
57 // will generate three UTF-8 bytes
58 // we have 0b1110XXXX 0b10XXXXXX 0b10XXXXXX
59 *utf8_output++ = char((word >> 12) | 0b11100000);
60 *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
61 *utf8_output++ = char((word & 0b111111) | 0b10000000);
62 pos++;
63 } else {
64 // must be a surrogate pair
65 uint16_t diff = uint16_t(word - 0xD800);
66 if (pos + 1 >= len) {
67 return 0;
68 } // minimal bound checking
69 uint16_t next_word = !match_system(big_endian)
70 ? u16_swap_bytes(data[pos + 1])
71 : data[pos + 1];
72 uint16_t diff2 = uint16_t(next_word - 0xDC00);
73 uint32_t value = (diff << 10) + diff2 + 0x10000;
74 // will generate four UTF-8 bytes
75 // we have 0b11110XXX 0b10XXXXXX 0b10XXXXXX 0b10XXXXXX
76 *utf8_output++ = char((value >> 18) | 0b11110000);
77 *utf8_output++ = char(((value >> 12) & 0b111111) | 0b10000000);
78 *utf8_output++ = char(((value >> 6) & 0b111111) | 0b10000000);
79 *utf8_output++ = char((value & 0b111111) | 0b10000000);
80 pos += 2;
81 }
82 }
83 return utf8_output - start;
84}
85
86} // namespace utf16_to_utf8
87} // unnamed namespace
88} // namespace scalar
89} // namespace simdutf
90
91#endif