simdutf 8.0.0
Unicode at GB/s.
Loading...
Searching...
No Matches
utf16_to_utf8.h
1#ifndef SIMDUTF_UTF16_TO_UTF8_H
2#define SIMDUTF_UTF16_TO_UTF8_H
3
4namespace simdutf {
5namespace scalar {
6namespace {
7namespace utf16_to_utf8 {
8
9template <endianness big_endian, typename InputPtr, typename OutputPtr>
10#if SIMDUTF_CPLUSPLUS20
11 requires simdutf::detail::indexes_into_utf16<InputPtr>
12// FIXME constrain output as well
13#endif
14simdutf_constexpr23 size_t convert(InputPtr data, size_t len,
15 OutputPtr utf8_output) {
16 size_t pos = 0;
17 const auto start = utf8_output;
18 while (pos < len) {
19#if SIMDUTF_CPLUSPLUS23
20 if !consteval
21#endif
22 {
23 // try to convert the next block of 8 bytes
24 if (pos + 4 <= len) { // if it is safe to read 8 more bytes, check that
25 // they are ascii
26 uint64_t v;
27 ::memcpy(&v, data + pos, sizeof(uint64_t));
28 if simdutf_constexpr (!match_system(big_endian)) {
29 v = (v >> 8) | (v << (64 - 8));
30 }
31 if ((v & 0xFF80FF80FF80FF80) == 0) {
32 size_t final_pos = pos + 4;
33 while (pos < final_pos) {
34 *utf8_output++ = !match_system(big_endian)
35 ? char(u16_swap_bytes(data[pos]))
36 : char(data[pos]);
37 pos++;
38 }
39 continue;
40 }
41 }
42 }
43 uint16_t word =
44 !match_system(big_endian) ? u16_swap_bytes(data[pos]) : data[pos];
45 if ((word & 0xFF80) == 0) {
46 // will generate one UTF-8 bytes
47 *utf8_output++ = char(word);
48 pos++;
49 } else if ((word & 0xF800) == 0) {
50 // will generate two UTF-8 bytes
51 // we have 0b110XXXXX 0b10XXXXXX
52 *utf8_output++ = char((word >> 6) | 0b11000000);
53 *utf8_output++ = char((word & 0b111111) | 0b10000000);
54 pos++;
55 } else if ((word & 0xF800) != 0xD800) {
56 // will generate three UTF-8 bytes
57 // we have 0b1110XXXX 0b10XXXXXX 0b10XXXXXX
58 *utf8_output++ = char((word >> 12) | 0b11100000);
59 *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
60 *utf8_output++ = char((word & 0b111111) | 0b10000000);
61 pos++;
62 } else {
63 // must be a surrogate pair
64 if (pos + 1 >= len) {
65 return 0;
66 }
67 uint16_t diff = uint16_t(word - 0xD800);
68 if (diff > 0x3FF) {
69 return 0;
70 }
71 uint16_t next_word = !match_system(big_endian)
72 ? u16_swap_bytes(data[pos + 1])
73 : data[pos + 1];
74 uint16_t diff2 = uint16_t(next_word - 0xDC00);
75 if (diff2 > 0x3FF) {
76 return 0;
77 }
78 uint32_t value = (diff << 10) + diff2 + 0x10000;
79 // will generate four UTF-8 bytes
80 // we have 0b11110XXX 0b10XXXXXX 0b10XXXXXX 0b10XXXXXX
81 *utf8_output++ = char((value >> 18) | 0b11110000);
82 *utf8_output++ = char(((value >> 12) & 0b111111) | 0b10000000);
83 *utf8_output++ = char(((value >> 6) & 0b111111) | 0b10000000);
84 *utf8_output++ = char((value & 0b111111) | 0b10000000);
85 pos += 2;
86 }
87 }
88 return utf8_output - start;
89}
90
91template <endianness big_endian, bool check_output = false, typename InputPtr,
92 typename OutputPtr>
93#if SIMDUTF_CPLUSPLUS20
94 requires(simdutf::detail::indexes_into_utf16<InputPtr> &&
95 simdutf::detail::index_assignable_from_char<OutputPtr>)
96#endif
97simdutf_constexpr23 full_result convert_with_errors(InputPtr data, size_t len,
98 OutputPtr utf8_output,
99 size_t utf8_len = 0) {
100 if (check_output && utf8_len == 0) {
101 return full_result(error_code::OUTPUT_BUFFER_TOO_SMALL, 0, 0);
102 }
103
104 size_t pos = 0;
105 auto start = utf8_output;
106 auto end = utf8_output + utf8_len;
107
108 while (pos < len) {
109#if SIMDUTF_CPLUSPLUS23
110 if !consteval
111#endif
112 {
113 // try to convert the next block of 8 bytes
114 if (pos + 4 <= len) { // if it is safe to read 8 more bytes, check that
115 // they are ascii
116 uint64_t v;
117 ::memcpy(&v, data + pos, sizeof(uint64_t));
118 if simdutf_constexpr (!match_system(big_endian))
119 v = (v >> 8) | (v << (64 - 8));
120 if ((v & 0xFF80FF80FF80FF80) == 0) {
121 size_t final_pos = pos + 4;
122 while (pos < final_pos) {
123 if (check_output && size_t(end - utf8_output) < 1) {
124 return full_result(error_code::OUTPUT_BUFFER_TOO_SMALL, pos,
125 utf8_output - start);
126 }
127 *utf8_output++ = !match_system(big_endian)
128 ? char(u16_swap_bytes(data[pos]))
129 : char(data[pos]);
130 pos++;
131 }
132 continue;
133 }
134 }
135 }
136
137 uint16_t word =
138 !match_system(big_endian) ? u16_swap_bytes(data[pos]) : data[pos];
139 if ((word & 0xFF80) == 0) {
140 // will generate one UTF-8 bytes
141 if (check_output && size_t(end - utf8_output) < 1) {
142 return full_result(error_code::OUTPUT_BUFFER_TOO_SMALL, pos,
143 utf8_output - start);
144 }
145 *utf8_output++ = char(word);
146 pos++;
147 } else if ((word & 0xF800) == 0) {
148 // will generate two UTF-8 bytes
149 // we have 0b110XXXXX 0b10XXXXXX
150 if (check_output && size_t(end - utf8_output) < 2) {
151 return full_result(error_code::OUTPUT_BUFFER_TOO_SMALL, pos,
152 utf8_output - start);
153 }
154 *utf8_output++ = char((word >> 6) | 0b11000000);
155 *utf8_output++ = char((word & 0b111111) | 0b10000000);
156 pos++;
157
158 } else if ((word & 0xF800) != 0xD800) {
159 // will generate three UTF-8 bytes
160 // we have 0b1110XXXX 0b10XXXXXX 0b10XXXXXX
161 if (check_output && size_t(end - utf8_output) < 3) {
162 return full_result(error_code::OUTPUT_BUFFER_TOO_SMALL, pos,
163 utf8_output - start);
164 }
165 *utf8_output++ = char((word >> 12) | 0b11100000);
166 *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
167 *utf8_output++ = char((word & 0b111111) | 0b10000000);
168 pos++;
169 } else {
170
171 if (check_output && size_t(end - utf8_output) < 4) {
172 return full_result(error_code::OUTPUT_BUFFER_TOO_SMALL, pos,
173 utf8_output - start);
174 }
175 // must be a surrogate pair
176 if (pos + 1 >= len) {
177 return full_result(error_code::SURROGATE, pos, utf8_output - start);
178 }
179 uint16_t diff = uint16_t(word - 0xD800);
180 if (diff > 0x3FF) {
181 return full_result(error_code::SURROGATE, pos, utf8_output - start);
182 }
183 uint16_t next_word = !match_system(big_endian)
184 ? u16_swap_bytes(data[pos + 1])
185 : data[pos + 1];
186 uint16_t diff2 = uint16_t(next_word - 0xDC00);
187 if (diff2 > 0x3FF) {
188 return full_result(error_code::SURROGATE, pos, utf8_output - start);
189 }
190 uint32_t value = (diff << 10) + diff2 + 0x10000;
191 // will generate four UTF-8 bytes
192 // we have 0b11110XXX 0b10XXXXXX 0b10XXXXXX 0b10XXXXXX
193 *utf8_output++ = char((value >> 18) | 0b11110000);
194 *utf8_output++ = char(((value >> 12) & 0b111111) | 0b10000000);
195 *utf8_output++ = char(((value >> 6) & 0b111111) | 0b10000000);
196 *utf8_output++ = char((value & 0b111111) | 0b10000000);
197 pos += 2;
198 }
199 }
200 return full_result(error_code::SUCCESS, pos, utf8_output - start);
201}
202
203template <endianness big_endian>
204inline result simple_convert_with_errors(const char16_t *buf, size_t len,
205 char *utf8_output) {
206 return convert_with_errors<big_endian, false>(buf, len, utf8_output, 0);
207}
208
209} // namespace utf16_to_utf8
210} // unnamed namespace
211} // namespace scalar
212} // namespace simdutf
213
214#endif