1#ifndef SIMDUTF_UTF8_TO_UTF32_H
2#define SIMDUTF_UTF8_TO_UTF32_H
7namespace utf8_to_utf32 {
// Scalar UTF-8 to UTF-32 conversion.
//
// Template parameter:
//   InputPtr      type of the input handle; it is indexed as data[pos] and
//                 used in pointer arithmetic (data + pos). When
//                 SIMDUTF_CPLUSPLUS20 is set it is constrained by
//                 simdutf::detail::indexes_into_byte_like.
// Parameters:
//   data          UTF-8 input.
//   len           number of input bytes (pos + 16 is compared against it).
//   utf32_output  destination; one char32_t is stored per decoded code point.
//                 No capacity check is visible here, so the caller presumably
//                 provides a large enough buffer - TODO confirm.
// Returns:
//   utf32_output - start, i.e. the number of char32_t values written.
//
// NOTE(review): in this view the declaration of pos, the enclosing loop, the
// advancement of pos and the actions taken when a validation test fails are
// not visible; the comments below describe only the conditions that are.
9template <
typename InputPtr>
10#if SIMDUTF_CPLUSPLUS20
11 requires simdutf::detail::indexes_into_byte_like<InputPtr>
13simdutf_constexpr23
size_t convert(InputPtr data,
size_t len,
14 char32_t *utf32_output) {
// Remember where the output began so the written count can be derived at the
// end.
16 char32_t *start{utf32_output};
// NOTE(review): the code guarded by this SIMDUTF_CPLUSPLUS23 test is not
// visible in this view.
18#if SIMDUTF_CPLUSPLUS23
// ASCII fast path: taken only when 16 more input bytes are available.
23 if (pos + 16 <= len) {
// The 16 bytes are loaded as two 64-bit words through memcpy.
26 ::memcpy(&v1, data + pos,
sizeof(uint64_t));
28 ::memcpy(&v2, data + pos +
sizeof(uint64_t),
sizeof(uint64_t));
// 0x80 repeated in every byte lane tests the top bit of each byte; a zero
// result means no byte has it set. v is presumably v1 combined with v2 - its
// definition is not visible here.
30 if ((v & 0x8080808080808080) == 0) {
31 size_t final_pos = pos + 16;
// Widen each of the 16 ASCII bytes to one char32_t.
32 while (pos < final_pos) {
33 *utf32_output++ = uint8_t(data[pos]);
// General path: classify the sequence by the high bits of its first byte.
40 auto leading_byte = uint8_t(data[pos]);
// 0xxxxxxx: a single ASCII byte maps directly to a code point.
41 if (leading_byte < 0b10000000) {
43 *utf32_output++ = char32_t(leading_byte);
45 }
// 110xxxxx: two-byte sequence.
else if ((leading_byte & 0b11100000) == 0b11000000) {
// The second byte must have the form 10xxxxxx.
// NOTE(review): unlike every other continuation-byte test in this function,
// this one masks data[pos + 1] without first casting it to uint8_t.
50 if ((data[pos + 1] & 0b11000000) != 0b10000000) {
// 5 payload bits from the leading byte, 6 from the continuation byte.
54 uint32_t code_point = (leading_byte & 0b00011111) << 6 |
55 (uint8_t(data[pos + 1]) & 0b00111111);
// Two-byte sequences must encode a value in 0x80..0x7ff.
56 if (code_point < 0x80 || 0x7ff < code_point) {
59 *utf32_output++ = char32_t(code_point);
61 }
// 1110xxxx: three-byte sequence.
else if ((leading_byte & 0b11110000) == 0b11100000) {
// Both trailing bytes must have the form 10xxxxxx.
67 if ((uint8_t(data[pos + 1]) & 0b11000000) != 0b10000000) {
70 if ((uint8_t(data[pos + 2]) & 0b11000000) != 0b10000000) {
// 4 + 6 + 6 payload bits.
74 uint32_t code_point = (leading_byte & 0b00001111) << 12 |
75 (uint8_t(data[pos + 1]) & 0b00111111) << 6 |
76 (uint8_t(data[pos + 2]) & 0b00111111);
// Must lie in 0x800..0xffff and outside the range 0xd800..0xdfff.
77 if (code_point < 0x800 || 0xffff < code_point ||
78 (0xd7ff < code_point && code_point < 0xe000)) {
81 *utf32_output++ = char32_t(code_point);
83 }
// 11110xxx: four-byte sequence.
else if ((leading_byte & 0b11111000) == 0b11110000) {
// All three trailing bytes must have the form 10xxxxxx.
88 if ((uint8_t(data[pos + 1]) & 0b11000000) != 0b10000000) {
91 if ((uint8_t(data[pos + 2]) & 0b11000000) != 0b10000000) {
94 if ((uint8_t(data[pos + 3]) & 0b11000000) != 0b10000000) {
// 3 + 6 + 6 + 6 payload bits.
99 uint32_t code_point = (leading_byte & 0b00000111) << 18 |
100 (uint8_t(data[pos + 1]) & 0b00111111) << 12 |
101 (uint8_t(data[pos + 2]) & 0b00111111) << 6 |
102 (uint8_t(data[pos + 3]) & 0b00111111);
// Four-byte sequences must encode a value in 0x10000..0x10ffff.
103 if (code_point <= 0xffff || 0x10ffff < code_point) {
106 *utf32_output++ = char32_t(code_point);
// Number of char32_t values written since the start of the output.
112 return utf32_output - start;
// Scalar UTF-8 to UTF-32 conversion that reports what went wrong and where.
//
// Template parameter:
//   InputPtr      type of the input handle; it is indexed as data[pos] and
//                 used in pointer arithmetic (data + pos). When
//                 SIMDUTF_CPLUSPLUS20 is set it is constrained by
//                 simdutf::detail::indexes_into_byte_like.
// Parameters:
//   data          UTF-8 input.
//   len           number of input bytes.
//   utf32_output  destination; one char32_t is stored per decoded code point.
//                 No capacity check is visible here, so the caller presumably
//                 provides a large enough buffer - TODO confirm.
// Returns:
//   result(error_code::SUCCESS, utf32_output - start) when the end is reached,
//   the second argument being the number of char32_t values written.
//   Otherwise result(<error>, pos), where pos is the index at which the
//   offending leading byte was read. Errors produced by the visible code:
//     TOO_SHORT    the sequence would extend past len, or a trailing byte is
//                  not of the form 10xxxxxx
//     OVERLONG     the decoded value is below the minimum for its length
//     SURROGATE    three-byte value in 0xd800..0xdfff
//     TOO_LARGE    four-byte value above 0x10ffff
//     TOO_LONG     the byte at pos is itself a continuation byte
//     HEADER_BITS  the leading byte matches none of the patterns above
//
// NOTE(review): in this view the declaration of pos, the enclosing loop and
// the advancement of pos are not visible.
115template <
typename InputPtr>
116#if SIMDUTF_CPLUSPLUS20
117 requires simdutf::detail::indexes_into_byte_like<InputPtr>
119simdutf_constexpr23 result convert_with_errors(InputPtr data,
size_t len,
120 char32_t *utf32_output) {
// Remember where the output began so the written count can be derived at the
// end.
122 char32_t *start{utf32_output};
// NOTE(review): the code guarded by this SIMDUTF_CPLUSPLUS23 test is not
// visible in this view.
124#if SIMDUTF_CPLUSPLUS23
// ASCII fast path: taken only when 16 more input bytes are available.
129 if (pos + 16 <= len) {
// The 16 bytes are loaded as two 64-bit words through memcpy.
132 ::memcpy(&v1, data + pos,
sizeof(uint64_t));
134 ::memcpy(&v2, data + pos +
sizeof(uint64_t),
sizeof(uint64_t));
// 0x80 repeated in every byte lane tests the top bit of each byte; a zero
// result means no byte has it set. v is presumably v1 combined with v2 - its
// definition is not visible here.
136 if ((v & 0x8080808080808080) == 0) {
137 size_t final_pos = pos + 16;
// Widen each of the 16 ASCII bytes to one char32_t.
138 while (pos < final_pos) {
139 *utf32_output++ = uint8_t(data[pos]);
// General path: classify the sequence by the high bits of its first byte.
146 auto leading_byte = uint8_t(data[pos]);
// 0xxxxxxx: a single ASCII byte maps directly to a code point.
147 if (leading_byte < 0b10000000) {
149 *utf32_output++ = char32_t(leading_byte);
151 }
// 110xxxxx: two-byte sequence.
else if ((leading_byte & 0b11100000) == 0b11000000) {
// The second byte would lie at or beyond len.
153 if (pos + 1 >= len) {
154 return result(error_code::TOO_SHORT, pos);
// The second byte is not of the form 10xxxxxx.
156 if ((uint8_t(data[pos + 1]) & 0b11000000) != 0b10000000) {
157 return result(error_code::TOO_SHORT, pos);
// 5 payload bits from the leading byte, 6 from the continuation byte.
160 uint32_t code_point = (leading_byte & 0b00011111) << 6 |
161 (uint8_t(data[pos + 1]) & 0b00111111);
// Values below 0x80 fit in one byte, so encoding them in two is overlong.
// NOTE(review): the masks above yield at most 11 bits, so the
// 0x7ff < code_point half of this test cannot be true.
162 if (code_point < 0x80 || 0x7ff < code_point) {
163 return result(error_code::OVERLONG, pos);
165 *utf32_output++ = char32_t(code_point);
167 }
// 1110xxxx: three-byte sequence.
else if ((leading_byte & 0b11110000) == 0b11100000) {
// The third byte would lie at or beyond len.
169 if (pos + 2 >= len) {
170 return result(error_code::TOO_SHORT, pos);
// Either trailing byte is not of the form 10xxxxxx.
173 if ((uint8_t(data[pos + 1]) & 0b11000000) != 0b10000000) {
174 return result(error_code::TOO_SHORT, pos);
176 if ((uint8_t(data[pos + 2]) & 0b11000000) != 0b10000000) {
177 return result(error_code::TOO_SHORT, pos);
// 4 + 6 + 6 payload bits.
180 uint32_t code_point = (leading_byte & 0b00001111) << 12 |
181 (uint8_t(data[pos + 1]) & 0b00111111) << 6 |
182 (uint8_t(data[pos + 2]) & 0b00111111);
// Values below 0x800 fit in two bytes, so three bytes is overlong.
// NOTE(review): the masks above yield at most 16 bits, so the
// 0xffff < code_point half of this test cannot be true.
183 if (code_point < 0x800 || 0xffff < code_point) {
184 return result(error_code::OVERLONG, pos);
// 0xd800..0xdfff is rejected with a dedicated error code.
186 if (0xd7ff < code_point && code_point < 0xe000) {
187 return result(error_code::SURROGATE, pos);
189 *utf32_output++ = char32_t(code_point);
191 }
// 11110xxx: four-byte sequence.
else if ((leading_byte & 0b11111000) == 0b11110000) {
// The fourth byte would lie at or beyond len.
193 if (pos + 3 >= len) {
194 return result(error_code::TOO_SHORT, pos);
// Any trailing byte is not of the form 10xxxxxx.
196 if ((uint8_t(data[pos + 1]) & 0b11000000) != 0b10000000) {
197 return result(error_code::TOO_SHORT, pos);
199 if ((uint8_t(data[pos + 2]) & 0b11000000) != 0b10000000) {
200 return result(error_code::TOO_SHORT, pos);
202 if ((uint8_t(data[pos + 3]) & 0b11000000) != 0b10000000) {
203 return result(error_code::TOO_SHORT, pos);
// 3 + 6 + 6 + 6 payload bits.
207 uint32_t code_point = (leading_byte & 0b00000111) << 18 |
208 (uint8_t(data[pos + 1]) & 0b00111111) << 12 |
209 (uint8_t(data[pos + 2]) & 0b00111111) << 6 |
210 (uint8_t(data[pos + 3]) & 0b00111111);
// Values up to 0xffff fit in three bytes, so four bytes is overlong.
211 if (code_point <= 0xffff) {
212 return result(error_code::OVERLONG, pos);
// Values above 0x10ffff are rejected as too large.
214 if (0x10ffff < code_point) {
215 return result(error_code::TOO_LARGE, pos);
217 *utf32_output++ = char32_t(code_point);
// 10xxxxxx where a leading byte was expected: a continuation byte with no
// sequence to belong to.
221 if ((leading_byte & 0b11000000) == 0b10000000) {
222 return result(error_code::TOO_LONG, pos);
// Presumably reached for every other leading-byte pattern - the branch
// structure around this line is not fully visible here.
224 return result(error_code::HEADER_BITS, pos);
// The whole input was consumed: report how many char32_t values were written.
228 return result(error_code::SUCCESS, utf32_output - start);
// Looks backwards from buf for the start of the current UTF-8 sequence and
// then runs convert_with_errors.
//
// Parameters:
//   prior_bytes   upper bound on how many bytes before buf may be inspected;
//                 the backward scan is clamped to it.
//   buf           current position in the input; bytes at negative offsets
//                 (buf[-i]) are read, so buf must point at least
//                 min(3, prior_bytes) bytes into a readable buffer.
//   len           number of bytes available from buf onwards.
//   utf32_output  destination handed to convert_with_errors.
// Returns:
//   A result. The early exits report error_code::TOO_LONG with a count
//   computed as 0 - <offset>; the normal path returns the result of
//   convert_with_errors with extra_len subtracted from its count.
//
// NOTE(review): the definition of extra_len, the end of the scan loop and the
// tail of this function are not visible in this view.
246inline result rewind_and_convert_with_errors(
size_t prior_bytes,
247 const char *buf,
size_t len,
248 char32_t *utf32_output) {
// Look back at most 3 bytes, and never further than prior_bytes allows.
251 size_t how_far_back = 3;
252 if (how_far_back > prior_bytes) {
253 how_far_back = prior_bytes;
255 bool found_leading_bytes{
false};
// Inspect buf[0], buf[-1], ... buf[-how_far_back] (bounds inclusive).
257 for (
size_t i = 0; i <= how_far_back; i++) {
258 unsigned char byte = buf[-
static_cast<std::ptrdiff_t
>(i)];
// Any byte that is not of the form 10xxxxxx counts as a leading byte.
259 found_leading_bytes = ((
byte & 0b11000000) != 0b10000000);
260 if (found_leading_bytes) {
// An ASCII byte found after stepping back over i bytes is reported as
// TOO_LONG.
// NOTE(review): i is a size_t, so 0 - i + 1 is evaluated in unsigned
// arithmetic and wraps around for i > 1; callers presumably interpret the
// count as a negative offset - TODO confirm.
261 if (i > 0 &&
byte < 128) {
264 return result(error_code::TOO_LONG, 0 - i + 1);
// No leading byte anywhere in the inspected window.
// NOTE(review): 0 - how_far_back is likewise computed in size_t and wraps
// around when how_far_back is non-zero.
279 if (!found_leading_bytes) {
284 return result(error_code::TOO_LONG, 0 - how_far_back);
// Convert with len extended by extra_len.
// NOTE(review): extra_len is defined outside the visible lines, as is any
// matching adjustment of buf.
287 result res = convert_with_errors(buf, len + extra_len, utf32_output);
// Remove extra_len from the count reported by convert_with_errors.
289 res.count -= extra_len;