1#ifndef SIMDUTF_UTF8_TO_UTF32_H
2#define SIMDUTF_UTF8_TO_UTF32_H
7namespace utf8_to_utf32 {
// Transcodes `len` bytes of UTF-8 starting at `data` into UTF-32, writing one
// char32_t per decoded code point through `utf32_output`.
//
// Returns the number of char32_t values written (utf32_output - start).
//
// NOTE(review): this excerpt appears to be missing lines. The declarations of
// `pos`, `v1`, `v2` and `v`, the enclosing loop, the position increments and
// the statements executed when a validation check fails are not visible here,
// so the failure behaviour is not documented below. Confirm against the full
// file.
9template <
typename InputPtr>
10#if SIMDUTF_CPLUSPLUS20
11 requires simdutf::detail::indexes_into_byte_like<InputPtr>
13simdutf_constexpr23
size_t convert(InputPtr data,
size_t len,
14 char32_t *utf32_output) {
// Remember where the output began so the written count can be computed.
16 char32_t *start{utf32_output};
18#if SIMDUTF_CPLUSPLUS23
// ASCII fast path: only attempted when 16 more bytes can be read from `pos`.
23 if (pos + 16 <= len) {
// Copy the next 16 input bytes into two 64-bit words.
26 ::memcpy(&v1, data + pos,
sizeof(uint64_t));
28 ::memcpy(&v2, data + pos +
sizeof(uint64_t),
sizeof(uint64_t));
// No high bit set in any byte of `v` means every covered byte is ASCII.
// (`v` is presumably v1 | v2; its definition is not visible in this excerpt.)
30 if ((v & 0x8080808080808080) == 0) {
31 size_t final_pos = pos + 16;
// Widen each of the 16 ASCII bytes to one char32_t.
32 while (pos < final_pos) {
33 *utf32_output++ = uint8_t(data[pos]);
// General path: classify the sequence by its leading byte.
40 auto leading_byte = uint8_t(data[pos]);
// 0xxxxxxx: single-byte (ASCII) code point, copied as is.
41 if (leading_byte < 0b10000000) {
43 *utf32_output++ = char32_t(leading_byte);
45 }
else if ((leading_byte & 0b11100000) == 0b11000000) {
// 110xxxxx: leading byte of a two-byte sequence.
// The second byte must have the continuation pattern 10xxxxxx.
// NOTE(review): unlike the other checks, this one reads data[pos + 1] without
// the uint8_t(...) cast; the mask makes the result the same for signed char,
// but confirm for other byte-like element types.
// NOTE(review): no `pos + 1 >= len` bounds check is visible in this excerpt.
50 if ((data[pos + 1] & 0b11000000) != 0b10000000) {
// Combine 5 payload bits of the leading byte with 6 bits of the second byte.
54 uint32_t code_point = (leading_byte & 0b00011111) << 6 |
55 (uint8_t(data[pos + 1]) & 0b00111111);
// A value below 0x80 would fit in one byte, i.e. an overlong encoding.
56 if (code_point < 0x80) {
59 *utf32_output++ = char32_t(code_point);
61 }
else if ((leading_byte & 0b11110000) == 0b11100000) {
// 1110xxxx: leading byte of a three-byte sequence.
// Both following bytes must be continuation bytes (10xxxxxx).
67 if ((uint8_t(data[pos + 1]) & 0b11000000) != 0b10000000) {
70 if ((uint8_t(data[pos + 2]) & 0b11000000) != 0b10000000) {
// Combine 4 + 6 + 6 payload bits.
74 uint32_t code_point = (leading_byte & 0b00001111) << 12 |
75 (uint8_t(data[pos + 1]) & 0b00111111) << 6 |
76 (uint8_t(data[pos + 2]) & 0b00111111);
// Detects an overlong encoding (< 0x800) or a value in the surrogate range
// 0xD800..0xDFFF.
77 if (code_point < 0x800 || (0xd7ff < code_point && code_point < 0xe000)) {
80 *utf32_output++ = char32_t(code_point);
82 }
else if ((leading_byte & 0b11111000) == 0b11110000) {
// 11110xxx: leading byte of a four-byte sequence.
// All three following bytes must be continuation bytes (10xxxxxx).
87 if ((uint8_t(data[pos + 1]) & 0b11000000) != 0b10000000) {
90 if ((uint8_t(data[pos + 2]) & 0b11000000) != 0b10000000) {
93 if ((uint8_t(data[pos + 3]) & 0b11000000) != 0b10000000) {
// Combine 3 + 6 + 6 + 6 payload bits.
98 uint32_t code_point = (leading_byte & 0b00000111) << 18 |
99 (uint8_t(data[pos + 1]) & 0b00111111) << 12 |
100 (uint8_t(data[pos + 2]) & 0b00111111) << 6 |
101 (uint8_t(data[pos + 3]) & 0b00111111);
// Detects an overlong encoding (<= 0xFFFF) or a value above the largest
// code point 0x10FFFF.
102 if (code_point <= 0xffff || 0x10ffff < code_point) {
105 *utf32_output++ = char32_t(code_point);
// Number of char32_t values written.
111 return utf32_output - start;
// Transcodes `len` bytes of UTF-8 starting at `data` into UTF-32, writing one
// char32_t per decoded code point through `utf32_output`, and reports the
// first error encountered.
//
// Returns:
//   - result(error_code::SUCCESS, n) where n is the number of char32_t values
//     written (utf32_output - start), or
//   - result(<error>, pos) where pos is the input position of the leading
//     byte of the offending sequence. Visible error codes: TOO_SHORT,
//     OVERLONG, SURROGATE, TOO_LARGE, TOO_LONG, HEADER_BITS.
//
// NOTE(review): this excerpt appears to be missing lines. The declarations of
// `pos`, `v1`, `v2` and `v`, the enclosing loop, position increments and
// closing braces are not visible here. Confirm against the full file.
114template <
typename InputPtr>
115#if SIMDUTF_CPLUSPLUS20
116 requires simdutf::detail::indexes_into_byte_like<InputPtr>
118simdutf_constexpr23 result convert_with_errors(InputPtr data,
size_t len,
119 char32_t *utf32_output) {
// Remember where the output began so the written count can be computed.
121 char32_t *start{utf32_output};
123#if SIMDUTF_CPLUSPLUS23
// ASCII fast path: only attempted when 16 more bytes can be read from `pos`.
128 if (pos + 16 <= len) {
// Copy the next 16 input bytes into two 64-bit words.
131 ::memcpy(&v1, data + pos,
sizeof(uint64_t));
133 ::memcpy(&v2, data + pos +
sizeof(uint64_t),
sizeof(uint64_t));
// No high bit set in any byte of `v` means every covered byte is ASCII.
// (`v` is presumably v1 | v2; its definition is not visible in this excerpt.)
135 if ((v & 0x8080808080808080) == 0) {
136 size_t final_pos = pos + 16;
// Widen each of the 16 ASCII bytes to one char32_t.
137 while (pos < final_pos) {
138 *utf32_output++ = uint8_t(data[pos]);
// General path: classify the sequence by its leading byte.
145 auto leading_byte = uint8_t(data[pos]);
// 0xxxxxxx: single-byte (ASCII) code point, copied as is.
146 if (leading_byte < 0b10000000) {
148 *utf32_output++ = char32_t(leading_byte);
150 }
else if ((leading_byte & 0b11100000) == 0b11000000) {
// 110xxxxx: leading byte of a two-byte sequence.
// TOO_SHORT if the second byte would lie at or beyond `len`.
152 if (pos + 1 >= len) {
153 return result(error_code::TOO_SHORT, pos);
// TOO_SHORT if the second byte is not a continuation byte (10xxxxxx).
155 if ((uint8_t(data[pos + 1]) & 0b11000000) != 0b10000000) {
156 return result(error_code::TOO_SHORT, pos);
// Combine 5 payload bits of the leading byte with 6 bits of the second byte.
159 uint32_t code_point = (leading_byte & 0b00011111) << 6 |
160 (uint8_t(data[pos + 1]) & 0b00111111);
// A value below 0x80 would fit in one byte: overlong encoding.
161 if (code_point < 0x80) {
162 return result(error_code::OVERLONG, pos);
164 *utf32_output++ = char32_t(code_point);
166 }
else if ((leading_byte & 0b11110000) == 0b11100000) {
// 1110xxxx: leading byte of a three-byte sequence.
// TOO_SHORT if the third byte would lie at or beyond `len`.
168 if (pos + 2 >= len) {
169 return result(error_code::TOO_SHORT, pos);
// TOO_SHORT if either following byte is not a continuation byte.
172 if ((uint8_t(data[pos + 1]) & 0b11000000) != 0b10000000) {
173 return result(error_code::TOO_SHORT, pos);
175 if ((uint8_t(data[pos + 2]) & 0b11000000) != 0b10000000) {
176 return result(error_code::TOO_SHORT, pos);
// Combine 4 + 6 + 6 payload bits.
179 uint32_t code_point = (leading_byte & 0b00001111) << 12 |
180 (uint8_t(data[pos + 1]) & 0b00111111) << 6 |
181 (uint8_t(data[pos + 2]) & 0b00111111);
// A value below 0x800 would fit in two bytes: overlong encoding.
182 if (code_point < 0x800) {
183 return result(error_code::OVERLONG, pos);
// Values 0xD800..0xDFFF are reported as SURROGATE.
185 if (0xd7ff < code_point && code_point < 0xe000) {
186 return result(error_code::SURROGATE, pos);
188 *utf32_output++ = char32_t(code_point);
190 }
else if ((leading_byte & 0b11111000) == 0b11110000) {
// 11110xxx: leading byte of a four-byte sequence.
// TOO_SHORT if the fourth byte would lie at or beyond `len`.
192 if (pos + 3 >= len) {
193 return result(error_code::TOO_SHORT, pos);
// TOO_SHORT if any of the three following bytes is not a continuation byte.
195 if ((uint8_t(data[pos + 1]) & 0b11000000) != 0b10000000) {
196 return result(error_code::TOO_SHORT, pos);
198 if ((uint8_t(data[pos + 2]) & 0b11000000) != 0b10000000) {
199 return result(error_code::TOO_SHORT, pos);
201 if ((uint8_t(data[pos + 3]) & 0b11000000) != 0b10000000) {
202 return result(error_code::TOO_SHORT, pos);
// Combine 3 + 6 + 6 + 6 payload bits.
206 uint32_t code_point = (leading_byte & 0b00000111) << 18 |
207 (uint8_t(data[pos + 1]) & 0b00111111) << 12 |
208 (uint8_t(data[pos + 2]) & 0b00111111) << 6 |
209 (uint8_t(data[pos + 3]) & 0b00111111);
// A value up to 0xFFFF would fit in three bytes: overlong encoding.
210 if (code_point <= 0xffff) {
211 return result(error_code::OVERLONG, pos);
// Values above 0x10FFFF are reported as TOO_LARGE.
213 if (0x10ffff < code_point) {
214 return result(error_code::TOO_LARGE, pos);
216 *utf32_output++ = char32_t(code_point);
// A continuation byte (10xxxxxx) in leading position is reported as TOO_LONG.
// NOTE(review): presumably this sits in the final `else` of the chain above;
// the opening of that branch is not visible in this excerpt.
220 if ((leading_byte & 0b11000000) == 0b10000000) {
221 return result(error_code::TOO_LONG, pos);
// Any other leading byte pattern is reported as HEADER_BITS.
223 return result(error_code::HEADER_BITS, pos);
// Whole input consumed: report the number of char32_t values written.
227 return result(error_code::SUCCESS, utf32_output - start);
// Looks backwards from `buf` (by at most 3 bytes, and never more than
// `prior_bytes`) for a byte that is not a UTF-8 continuation byte, then runs
// convert_with_errors and adjusts the reported count.
//
// Parameters:
//   prior_bytes  - upper bound on how many bytes before `buf` are inspected.
//   buf          - current input position; bytes at negative offsets are read.
//   len          - number of input bytes from `buf`.
//   utf32_output - destination passed through to convert_with_errors.
//
// NOTE(review): this excerpt appears to be missing lines. The definition of
// `extra_len`, any adjustment of `buf`, the loop exit and the final return
// are not visible here. Confirm against the full file.
245inline result rewind_and_convert_with_errors(
size_t prior_bytes,
246 const char *buf,
size_t len,
247 char32_t *utf32_output) {
// Inspect at most 3 bytes behind `buf`, clamped to `prior_bytes`.
250 size_t how_far_back = 3;
251 if (how_far_back > prior_bytes) {
252 how_far_back = prior_bytes;
254 bool found_leading_bytes{
false};
// Scan buf[0], buf[-1], ..., buf[-how_far_back] (bounds inclusive).
256 for (
size_t i = 0; i <= how_far_back; i++) {
257 unsigned char byte = buf[-
static_cast<std::ptrdiff_t
>(i)];
// A byte whose top two bits are not `10` is not a continuation byte.
258 found_leading_bytes = ((
byte & 0b11000000) != 0b10000000);
259 if (found_leading_bytes) {
// An ASCII byte found behind `buf` (i > 0) is reported as TOO_LONG.
// `0 - i + 1` is evaluated in size_t arithmetic, so it wraps for i > 1;
// presumably the caller adds it to an offset -- TODO confirm.
260 if (i > 0 &&
byte < 128) {
263 return result(error_code::TOO_LONG, 0 - i + 1);
// Only continuation bytes were seen in the inspected window: TOO_LONG.
// `0 - how_far_back` is likewise an unsigned (wrapping) value.
278 if (!found_leading_bytes) {
283 return result(error_code::TOO_LONG, 0 - how_far_back);
// Convert with the length extended by `extra_len`, then subtract `extra_len`
// from the reported count.
// (`extra_len` is presumably the rewind distance; not visible in this excerpt.)
286 result res = convert_with_errors(buf, len + extra_len, utf32_output);
288 res.count -= extra_len;