simdutf 8.0.0
Unicode at GB/s.
Loading...
Searching...
No Matches
utf8_to_utf32.h
1#ifndef SIMDUTF_UTF8_TO_UTF32_H
2#define SIMDUTF_UTF8_TO_UTF32_H
3
4namespace simdutf {
5namespace scalar {
6namespace {
7namespace utf8_to_utf32 {
8
/**
 * Scalar UTF-8 to UTF-32 transcoder with validation.
 *
 * Decodes the `len` bytes reachable through `data` and stores one char32_t
 * per decoded code point into `utf32_output`. Every loop iteration consumes
 * at least one input byte and emits exactly one output value, so at most
 * `len` values are written.
 *
 * @param data          pointer-like object giving access to the UTF-8 bytes;
 *                      it is indexed with `data[i]` and, on the runtime fast
 *                      path, used as `data + i` in a memcpy source.
 * @param len           number of input bytes.
 * @param utf32_output  destination buffer for the decoded code points.
 * @return the number of char32_t values written, or 0 as soon as an invalid
 *         sequence is met (truncated sequence, bad continuation byte,
 *         overlong encoding, surrogate, value above U+10FFFF, or invalid
 *         leading byte). Empty input also yields 0. Output written before
 *         the error was detected is left in place.
 */
template <typename InputPtr>
#if SIMDUTF_CPLUSPLUS20
  requires simdutf::detail::indexes_into_byte_like<InputPtr>
#endif
simdutf_constexpr23 size_t convert(InputPtr data, size_t len,
                                   char32_t *utf32_output) {
  size_t pos = 0;
  char32_t *start{utf32_output};
  while (pos < len) {
#if SIMDUTF_CPLUSPLUS23
    if !consteval
#endif
    {
      // Fast path (skipped during constant evaluation because it relies on
      // memcpy): try to convert the next block of 16 ASCII bytes at once.
      if (pos + 16 <= len) { // safe to read 16 more bytes
        uint64_t v1;
        ::memcpy(&v1, data + pos, sizeof(uint64_t));
        uint64_t v2;
        ::memcpy(&v2, data + pos + sizeof(uint64_t), sizeof(uint64_t));
        uint64_t v{v1 | v2};
        // No byte has its high bit set: all 16 bytes are ASCII.
        if ((v & 0x8080808080808080) == 0) {
          size_t final_pos = pos + 16;
          while (pos < final_pos) {
            *utf32_output++ = uint8_t(data[pos]);
            pos++;
          }
          continue;
        }
      }
    }
    auto leading_byte = uint8_t(data[pos]); // leading byte
    if (leading_byte < 0b10000000) {
      // One ASCII byte.
      *utf32_output++ = char32_t(leading_byte);
      pos++;
    } else if ((leading_byte & 0b11100000) == 0b11000000) {
      // Two-byte sequence: 110xxxxx 10xxxxxx
      if (pos + 1 >= len) {
        return 0;
      } // minimal bound checking
      // Go through uint8_t like every other byte access in this function, so
      // the test does not depend on the element type promoting to int.
      if ((uint8_t(data[pos + 1]) & 0b11000000) != 0b10000000) {
        return 0;
      }
      // range check (values below 0x80 are overlong encodings)
      uint32_t code_point = (leading_byte & 0b00011111) << 6 |
                            (uint8_t(data[pos + 1]) & 0b00111111);
      if (code_point < 0x80 || 0x7ff < code_point) {
        return 0;
      }
      *utf32_output++ = char32_t(code_point);
      pos += 2;
    } else if ((leading_byte & 0b11110000) == 0b11100000) {
      // Three-byte sequence: 1110xxxx 10xxxxxx 10xxxxxx
      if (pos + 2 >= len) {
        return 0;
      } // minimal bound checking

      if ((uint8_t(data[pos + 1]) & 0b11000000) != 0b10000000) {
        return 0;
      }
      if ((uint8_t(data[pos + 2]) & 0b11000000) != 0b10000000) {
        return 0;
      }
      // range check: reject overlong encodings and UTF-16 surrogates
      uint32_t code_point = (leading_byte & 0b00001111) << 12 |
                            (uint8_t(data[pos + 1]) & 0b00111111) << 6 |
                            (uint8_t(data[pos + 2]) & 0b00111111);
      if (code_point < 0x800 || 0xffff < code_point ||
          (0xd7ff < code_point && code_point < 0xe000)) {
        return 0;
      }
      *utf32_output++ = char32_t(code_point);
      pos += 3;
    } else if ((leading_byte & 0b11111000) == 0b11110000) {
      // Four-byte sequence: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
      if (pos + 3 >= len) {
        return 0;
      } // minimal bound checking
      if ((uint8_t(data[pos + 1]) & 0b11000000) != 0b10000000) {
        return 0;
      }
      if ((uint8_t(data[pos + 2]) & 0b11000000) != 0b10000000) {
        return 0;
      }
      if ((uint8_t(data[pos + 3]) & 0b11000000) != 0b10000000) {
        return 0;
      }

      // range check: reject overlong encodings and values above U+10FFFF
      uint32_t code_point = (leading_byte & 0b00000111) << 18 |
                            (uint8_t(data[pos + 1]) & 0b00111111) << 12 |
                            (uint8_t(data[pos + 2]) & 0b00111111) << 6 |
                            (uint8_t(data[pos + 3]) & 0b00111111);
      if (code_point <= 0xffff || 0x10ffff < code_point) {
        return 0;
      }
      *utf32_output++ = char32_t(code_point);
      pos += 4;
    } else {
      // Stray continuation byte (10xxxxxx) or invalid leading byte
      // (11111xxx).
      return 0;
    }
  }
  return utf32_output - start;
}
114
115template <typename InputPtr>
116#if SIMDUTF_CPLUSPLUS20
117 requires simdutf::detail::indexes_into_byte_like<InputPtr>
118#endif
119simdutf_constexpr23 result convert_with_errors(InputPtr data, size_t len,
120 char32_t *utf32_output) {
121 size_t pos = 0;
122 char32_t *start{utf32_output};
123 while (pos < len) {
124#if SIMDUTF_CPLUSPLUS23
125 if !consteval
126#endif
127 {
128 // try to convert the next block of 16 ASCII bytes
129 if (pos + 16 <= len) { // if it is safe to read 16 more bytes, check that
130 // they are ascii
131 uint64_t v1;
132 ::memcpy(&v1, data + pos, sizeof(uint64_t));
133 uint64_t v2;
134 ::memcpy(&v2, data + pos + sizeof(uint64_t), sizeof(uint64_t));
135 uint64_t v{v1 | v2};
136 if ((v & 0x8080808080808080) == 0) {
137 size_t final_pos = pos + 16;
138 while (pos < final_pos) {
139 *utf32_output++ = uint8_t(data[pos]);
140 pos++;
141 }
142 continue;
143 }
144 }
145 }
146 auto leading_byte = uint8_t(data[pos]); // leading byte
147 if (leading_byte < 0b10000000) {
148 // converting one ASCII byte !!!
149 *utf32_output++ = char32_t(leading_byte);
150 pos++;
151 } else if ((leading_byte & 0b11100000) == 0b11000000) {
152 // We have a two-byte UTF-8
153 if (pos + 1 >= len) {
154 return result(error_code::TOO_SHORT, pos);
155 } // minimal bound checking
156 if ((uint8_t(data[pos + 1]) & 0b11000000) != 0b10000000) {
157 return result(error_code::TOO_SHORT, pos);
158 }
159 // range check
160 uint32_t code_point = (leading_byte & 0b00011111) << 6 |
161 (uint8_t(data[pos + 1]) & 0b00111111);
162 if (code_point < 0x80 || 0x7ff < code_point) {
163 return result(error_code::OVERLONG, pos);
164 }
165 *utf32_output++ = char32_t(code_point);
166 pos += 2;
167 } else if ((leading_byte & 0b11110000) == 0b11100000) {
168 // We have a three-byte UTF-8
169 if (pos + 2 >= len) {
170 return result(error_code::TOO_SHORT, pos);
171 } // minimal bound checking
172
173 if ((uint8_t(data[pos + 1]) & 0b11000000) != 0b10000000) {
174 return result(error_code::TOO_SHORT, pos);
175 }
176 if ((uint8_t(data[pos + 2]) & 0b11000000) != 0b10000000) {
177 return result(error_code::TOO_SHORT, pos);
178 }
179 // range check
180 uint32_t code_point = (leading_byte & 0b00001111) << 12 |
181 (uint8_t(data[pos + 1]) & 0b00111111) << 6 |
182 (uint8_t(data[pos + 2]) & 0b00111111);
183 if (code_point < 0x800 || 0xffff < code_point) {
184 return result(error_code::OVERLONG, pos);
185 }
186 if (0xd7ff < code_point && code_point < 0xe000) {
187 return result(error_code::SURROGATE, pos);
188 }
189 *utf32_output++ = char32_t(code_point);
190 pos += 3;
191 } else if ((leading_byte & 0b11111000) == 0b11110000) { // 0b11110000
192 // we have a 4-byte UTF-8 word.
193 if (pos + 3 >= len) {
194 return result(error_code::TOO_SHORT, pos);
195 } // minimal bound checking
196 if ((uint8_t(data[pos + 1]) & 0b11000000) != 0b10000000) {
197 return result(error_code::TOO_SHORT, pos);
198 }
199 if ((uint8_t(data[pos + 2]) & 0b11000000) != 0b10000000) {
200 return result(error_code::TOO_SHORT, pos);
201 }
202 if ((uint8_t(data[pos + 3]) & 0b11000000) != 0b10000000) {
203 return result(error_code::TOO_SHORT, pos);
204 }
205
206 // range check
207 uint32_t code_point = (leading_byte & 0b00000111) << 18 |
208 (uint8_t(data[pos + 1]) & 0b00111111) << 12 |
209 (uint8_t(data[pos + 2]) & 0b00111111) << 6 |
210 (uint8_t(data[pos + 3]) & 0b00111111);
211 if (code_point <= 0xffff) {
212 return result(error_code::OVERLONG, pos);
213 }
214 if (0x10ffff < code_point) {
215 return result(error_code::TOO_LARGE, pos);
216 }
217 *utf32_output++ = char32_t(code_point);
218 pos += 4;
219 } else {
220 // we either have too many continuation bytes or an invalid leading byte
221 if ((leading_byte & 0b11000000) == 0b10000000) {
222 return result(error_code::TOO_LONG, pos);
223 } else {
224 return result(error_code::HEADER_BITS, pos);
225 }
226 }
227 }
228 return result(error_code::SUCCESS, utf32_output - start);
229}
230
246inline result rewind_and_convert_with_errors(size_t prior_bytes,
247 const char *buf, size_t len,
248 char32_t *utf32_output) {
249 size_t extra_len{0};
250 // We potentially need to go back in time and find a leading byte.
251 size_t how_far_back = 3; // 3 bytes in the past + current position
252 if (how_far_back > prior_bytes) {
253 how_far_back = prior_bytes;
254 }
255 bool found_leading_bytes{false};
256 // important: it is i <= how_far_back and not 'i < how_far_back'.
257 for (size_t i = 0; i <= how_far_back; i++) {
258 unsigned char byte = buf[-static_cast<std::ptrdiff_t>(i)];
259 found_leading_bytes = ((byte & 0b11000000) != 0b10000000);
260 if (found_leading_bytes) {
261 if (i > 0 && byte < 128) {
262 // If we had to go back and the leading byte is ascii
263 // then we can stop right away.
264 return result(error_code::TOO_LONG, 0 - i + 1);
265 }
266 buf -= i;
267 extra_len = i;
268 break;
269 }
270 }
271 //
272 // It is possible for this function to return a negative count in its result.
273 // C++ Standard Section 18.1 defines size_t is in <cstddef> which is described
274 // in C Standard as <stddef.h>. C Standard Section 4.1.5 defines size_t as an
275 // unsigned integral type of the result of the sizeof operator
276 //
277 // An unsigned type will simply wrap round arithmetically (well defined).
278 //
279 if (!found_leading_bytes) {
280 // If how_far_back == 3, we may have four consecutive continuation bytes!!!
281 // [....] [continuation] [continuation] [continuation] | [buf is
282 // continuation] Or we possibly have a stream that does not start with a
283 // leading byte.
284 return result(error_code::TOO_LONG, 0 - how_far_back);
285 }
286
287 result res = convert_with_errors(buf, len + extra_len, utf32_output);
288 if (res.error) {
289 res.count -= extra_len;
290 }
291 return res;
292}
293
294} // namespace utf8_to_utf32
295} // unnamed namespace
296} // namespace scalar
297} // namespace simdutf
298
299#endif