simdutf 9.0.0
Unicode at GB/s.
Loading...
Searching...
No Matches
utf8_to_utf32.h
1#ifndef SIMDUTF_UTF8_TO_UTF32_H
2#define SIMDUTF_UTF8_TO_UTF32_H
3
4namespace simdutf {
5namespace scalar {
6namespace {
7namespace utf8_to_utf32 {
8
9template <typename InputPtr>
10#if SIMDUTF_CPLUSPLUS20
11 requires simdutf::detail::indexes_into_byte_like<InputPtr>
12#endif
13simdutf_constexpr23 size_t convert(InputPtr data, size_t len,
14 char32_t *utf32_output) {
15 size_t pos = 0;
16 char32_t *start{utf32_output};
17 while (pos < len) {
18#if SIMDUTF_CPLUSPLUS23
19 if !consteval
20#endif
21 {
22 // try to convert the next block of 16 ASCII bytes
23 if (pos + 16 <= len) { // if it is safe to read 16 more bytes, check that
24 // they are ascii
25 uint64_t v1;
26 ::memcpy(&v1, data + pos, sizeof(uint64_t));
27 uint64_t v2;
28 ::memcpy(&v2, data + pos + sizeof(uint64_t), sizeof(uint64_t));
29 uint64_t v{v1 | v2};
30 if ((v & 0x8080808080808080) == 0) {
31 size_t final_pos = pos + 16;
32 while (pos < final_pos) {
33 *utf32_output++ = uint8_t(data[pos]);
34 pos++;
35 }
36 continue;
37 }
38 }
39 }
40 auto leading_byte = uint8_t(data[pos]); // leading byte
41 if (leading_byte < 0b10000000) {
42 // converting one ASCII byte !!!
43 *utf32_output++ = char32_t(leading_byte);
44 pos++;
45 } else if ((leading_byte & 0b11100000) == 0b11000000) {
46 // We have a two-byte UTF-8
47 if (pos + 1 >= len) {
48 return 0;
49 } // minimal bound checking
50 if ((data[pos + 1] & 0b11000000) != 0b10000000) {
51 return 0;
52 }
53 // range check
54 uint32_t code_point = (leading_byte & 0b00011111) << 6 |
55 (uint8_t(data[pos + 1]) & 0b00111111);
56 if (code_point < 0x80) {
57 return 0;
58 }
59 *utf32_output++ = char32_t(code_point);
60 pos += 2;
61 } else if ((leading_byte & 0b11110000) == 0b11100000) {
62 // We have a three-byte UTF-8
63 if (pos + 2 >= len) {
64 return 0;
65 } // minimal bound checking
66
67 if ((uint8_t(data[pos + 1]) & 0b11000000) != 0b10000000) {
68 return 0;
69 }
70 if ((uint8_t(data[pos + 2]) & 0b11000000) != 0b10000000) {
71 return 0;
72 }
73 // range check
74 uint32_t code_point = (leading_byte & 0b00001111) << 12 |
75 (uint8_t(data[pos + 1]) & 0b00111111) << 6 |
76 (uint8_t(data[pos + 2]) & 0b00111111);
77 if (code_point < 0x800 || (0xd7ff < code_point && code_point < 0xe000)) {
78 return 0;
79 }
80 *utf32_output++ = char32_t(code_point);
81 pos += 3;
82 } else if ((leading_byte & 0b11111000) == 0b11110000) { // 0b11110000
83 // we have a 4-byte UTF-8 word.
84 if (pos + 3 >= len) {
85 return 0;
86 } // minimal bound checking
87 if ((uint8_t(data[pos + 1]) & 0b11000000) != 0b10000000) {
88 return 0;
89 }
90 if ((uint8_t(data[pos + 2]) & 0b11000000) != 0b10000000) {
91 return 0;
92 }
93 if ((uint8_t(data[pos + 3]) & 0b11000000) != 0b10000000) {
94 return 0;
95 }
96
97 // range check
98 uint32_t code_point = (leading_byte & 0b00000111) << 18 |
99 (uint8_t(data[pos + 1]) & 0b00111111) << 12 |
100 (uint8_t(data[pos + 2]) & 0b00111111) << 6 |
101 (uint8_t(data[pos + 3]) & 0b00111111);
102 if (code_point <= 0xffff || 0x10ffff < code_point) {
103 return 0;
104 }
105 *utf32_output++ = char32_t(code_point);
106 pos += 4;
107 } else {
108 return 0;
109 }
110 }
111 return utf32_output - start;
112}
113
114template <typename InputPtr>
115#if SIMDUTF_CPLUSPLUS20
116 requires simdutf::detail::indexes_into_byte_like<InputPtr>
117#endif
118simdutf_constexpr23 result convert_with_errors(InputPtr data, size_t len,
119 char32_t *utf32_output) {
120 size_t pos = 0;
121 char32_t *start{utf32_output};
122 while (pos < len) {
123#if SIMDUTF_CPLUSPLUS23
124 if !consteval
125#endif
126 {
127 // try to convert the next block of 16 ASCII bytes
128 if (pos + 16 <= len) { // if it is safe to read 16 more bytes, check that
129 // they are ascii
130 uint64_t v1;
131 ::memcpy(&v1, data + pos, sizeof(uint64_t));
132 uint64_t v2;
133 ::memcpy(&v2, data + pos + sizeof(uint64_t), sizeof(uint64_t));
134 uint64_t v{v1 | v2};
135 if ((v & 0x8080808080808080) == 0) {
136 size_t final_pos = pos + 16;
137 while (pos < final_pos) {
138 *utf32_output++ = uint8_t(data[pos]);
139 pos++;
140 }
141 continue;
142 }
143 }
144 }
145 auto leading_byte = uint8_t(data[pos]); // leading byte
146 if (leading_byte < 0b10000000) {
147 // converting one ASCII byte !!!
148 *utf32_output++ = char32_t(leading_byte);
149 pos++;
150 } else if ((leading_byte & 0b11100000) == 0b11000000) {
151 // We have a two-byte UTF-8
152 if (pos + 1 >= len) {
153 return result(error_code::TOO_SHORT, pos);
154 } // minimal bound checking
155 if ((uint8_t(data[pos + 1]) & 0b11000000) != 0b10000000) {
156 return result(error_code::TOO_SHORT, pos);
157 }
158 // range check
159 uint32_t code_point = (leading_byte & 0b00011111) << 6 |
160 (uint8_t(data[pos + 1]) & 0b00111111);
161 if (code_point < 0x80) {
162 return result(error_code::OVERLONG, pos);
163 }
164 *utf32_output++ = char32_t(code_point);
165 pos += 2;
166 } else if ((leading_byte & 0b11110000) == 0b11100000) {
167 // We have a three-byte UTF-8
168 if (pos + 2 >= len) {
169 return result(error_code::TOO_SHORT, pos);
170 } // minimal bound checking
171
172 if ((uint8_t(data[pos + 1]) & 0b11000000) != 0b10000000) {
173 return result(error_code::TOO_SHORT, pos);
174 }
175 if ((uint8_t(data[pos + 2]) & 0b11000000) != 0b10000000) {
176 return result(error_code::TOO_SHORT, pos);
177 }
178 // range check
179 uint32_t code_point = (leading_byte & 0b00001111) << 12 |
180 (uint8_t(data[pos + 1]) & 0b00111111) << 6 |
181 (uint8_t(data[pos + 2]) & 0b00111111);
182 if (code_point < 0x800) {
183 return result(error_code::OVERLONG, pos);
184 }
185 if (0xd7ff < code_point && code_point < 0xe000) {
186 return result(error_code::SURROGATE, pos);
187 }
188 *utf32_output++ = char32_t(code_point);
189 pos += 3;
190 } else if ((leading_byte & 0b11111000) == 0b11110000) { // 0b11110000
191 // we have a 4-byte UTF-8 word.
192 if (pos + 3 >= len) {
193 return result(error_code::TOO_SHORT, pos);
194 } // minimal bound checking
195 if ((uint8_t(data[pos + 1]) & 0b11000000) != 0b10000000) {
196 return result(error_code::TOO_SHORT, pos);
197 }
198 if ((uint8_t(data[pos + 2]) & 0b11000000) != 0b10000000) {
199 return result(error_code::TOO_SHORT, pos);
200 }
201 if ((uint8_t(data[pos + 3]) & 0b11000000) != 0b10000000) {
202 return result(error_code::TOO_SHORT, pos);
203 }
204
205 // range check
206 uint32_t code_point = (leading_byte & 0b00000111) << 18 |
207 (uint8_t(data[pos + 1]) & 0b00111111) << 12 |
208 (uint8_t(data[pos + 2]) & 0b00111111) << 6 |
209 (uint8_t(data[pos + 3]) & 0b00111111);
210 if (code_point <= 0xffff) {
211 return result(error_code::OVERLONG, pos);
212 }
213 if (0x10ffff < code_point) {
214 return result(error_code::TOO_LARGE, pos);
215 }
216 *utf32_output++ = char32_t(code_point);
217 pos += 4;
218 } else {
219 // we either have too many continuation bytes or an invalid leading byte
220 if ((leading_byte & 0b11000000) == 0b10000000) {
221 return result(error_code::TOO_LONG, pos);
222 } else {
223 return result(error_code::HEADER_BITS, pos);
224 }
225 }
226 }
227 return result(error_code::SUCCESS, utf32_output - start);
228}
229
230/**
231 * When rewind_and_convert_with_errors is called, we are pointing at 'buf' and
232 * we have up to len input bytes left, and we encountered some error. It is
233 * possible that the error is at 'buf' exactly, but it could also be in the
234 * previous bytes location (up to 3 bytes back).
235 *
236 * prior_bytes indicates how many bytes, prior to 'buf' may belong to the
237 * current memory section and can be safely accessed. We prior_bytes to access
238 * safely up to three bytes before 'buf'.
239 *
240 * The caller is responsible to ensure that len > 0.
241 *
242 * If the error is believed to have occurred prior to 'buf', the count value
243 * contain in the result will be SIZE_T - 1, SIZE_T - 2, or SIZE_T - 3.
244 */
245inline result rewind_and_convert_with_errors(size_t prior_bytes,
246 const char *buf, size_t len,
247 char32_t *utf32_output) {
248 size_t extra_len{0};
249 // We potentially need to go back in time and find a leading byte.
250 size_t how_far_back = 3; // 3 bytes in the past + current position
251 if (how_far_back > prior_bytes) {
252 how_far_back = prior_bytes;
253 }
254 bool found_leading_bytes{false};
255 // important: it is i <= how_far_back and not 'i < how_far_back'.
256 for (size_t i = 0; i <= how_far_back; i++) {
257 unsigned char byte = buf[-static_cast<std::ptrdiff_t>(i)];
258 found_leading_bytes = ((byte & 0b11000000) != 0b10000000);
259 if (found_leading_bytes) {
260 if (i > 0 && byte < 128) {
261 // If we had to go back and the leading byte is ascii
262 // then we can stop right away.
263 return result(error_code::TOO_LONG, 0 - i + 1);
264 }
265 buf -= i;
266 extra_len = i;
267 break;
268 }
269 }
270 //
271 // It is possible for this function to return a negative count in its result.
272 // C++ Standard Section 18.1 defines size_t is in <cstddef> which is described
273 // in C Standard as <stddef.h>. C Standard Section 4.1.5 defines size_t as an
274 // unsigned integral type of the result of the sizeof operator
275 //
276 // An unsigned type will simply wrap round arithmetically (well defined).
277 //
278 if (!found_leading_bytes) {
279 // If how_far_back == 3, we may have four consecutive continuation bytes!!!
280 // [....] [continuation] [continuation] [continuation] | [buf is
281 // continuation] Or we possibly have a stream that does not start with a
282 // leading byte.
283 return result(error_code::TOO_LONG, 0 - how_far_back);
284 }
285
286 result res = convert_with_errors(buf, len + extra_len, utf32_output);
287 if (res.error) {
288 res.count -= extra_len;
289 }
290 return res;
291}
292
293} // namespace utf8_to_utf32
294} // unnamed namespace
295} // namespace scalar
296} // namespace simdutf
297
298#endif