simdutf 9.0.0
Unicode at GB/s.
Loading...
Searching...
No Matches
utf8.h
1#ifndef SIMDUTF_UTF8_H
2#define SIMDUTF_UTF8_H
3
4namespace simdutf {
5namespace scalar {
6namespace {
7namespace utf8 {
8
9// credit: based on code from Google Fuchsia (Apache Licensed)
10template <class BytePtr>
11simdutf_constexpr23 simdutf_warn_unused bool validate(BytePtr data,
12 size_t len) noexcept {
13 static_assert(
14 std::is_same<typename std::decay<decltype(*data)>::type, uint8_t>::value,
15 "dereferencing the data pointer must result in a uint8_t");
16 uint64_t pos = 0;
17 uint32_t code_point = 0;
18 while (pos < len) {
19 uint64_t next_pos;
20#if SIMDUTF_CPLUSPLUS23
21 if !consteval
22#endif
23 { // check if the next 16 bytes are ascii.
24 next_pos = pos + 16;
25 if (next_pos <= len) { // if it is safe to read 16 more bytes, check
26 // that they are ascii
27 uint64_t v1{};
28 std::memcpy(&v1, data + pos, sizeof(uint64_t));
29 uint64_t v2{};
30 std::memcpy(&v2, data + pos + sizeof(uint64_t), sizeof(uint64_t));
31 uint64_t v{v1 | v2};
32 if ((v & 0x8080808080808080) == 0) {
33 pos = next_pos;
34 continue;
35 }
36 }
37 }
38
39 unsigned char byte = data[pos];
40
41 while (byte < 0b10000000) {
42 if (++pos == len) {
43 return true;
44 }
45 byte = data[pos];
46 }
47
48 if ((byte & 0b11100000) == 0b11000000) {
49 next_pos = pos + 2;
50 if (next_pos > len) {
51 return false;
52 }
53 if ((data[pos + 1] & 0b11000000) != 0b10000000) {
54 return false;
55 }
56 // range check
57 code_point = (byte & 0b00011111) << 6 | (data[pos + 1] & 0b00111111);
58 if (code_point < 0x80) {
59 return false;
60 }
61 } else if ((byte & 0b11110000) == 0b11100000) {
62 next_pos = pos + 3;
63 if (next_pos > len) {
64 return false;
65 }
66 if ((data[pos + 1] & 0b11000000) != 0b10000000) {
67 return false;
68 }
69 if ((data[pos + 2] & 0b11000000) != 0b10000000) {
70 return false;
71 }
72 // range check
73 code_point = (byte & 0b00001111) << 12 |
74 (data[pos + 1] & 0b00111111) << 6 |
75 (data[pos + 2] & 0b00111111);
76 if ((code_point < 0x800) ||
77 (0xd7ff < code_point && code_point < 0xe000)) {
78 return false;
79 }
80 } else if ((byte & 0b11111000) == 0b11110000) { // 0b11110000
81 next_pos = pos + 4;
82 if (next_pos > len) {
83 return false;
84 }
85 if ((data[pos + 1] & 0b11000000) != 0b10000000) {
86 return false;
87 }
88 if ((data[pos + 2] & 0b11000000) != 0b10000000) {
89 return false;
90 }
91 if ((data[pos + 3] & 0b11000000) != 0b10000000) {
92 return false;
93 }
94 // range check
95 code_point =
96 (byte & 0b00000111) << 18 | (data[pos + 1] & 0b00111111) << 12 |
97 (data[pos + 2] & 0b00111111) << 6 | (data[pos + 3] & 0b00111111);
98 if (code_point <= 0xffff || 0x10ffff < code_point) {
99 return false;
100 }
101 } else {
102 // we may have a continuation
103 return false;
104 }
105 pos = next_pos;
106 }
107 return true;
108}
109
110simdutf_really_inline simdutf_warn_unused bool validate(const char *buf,
111 size_t len) noexcept {
112 return validate(reinterpret_cast<const uint8_t *>(buf), len);
113}
114
115template <class BytePtr>
116simdutf_constexpr23 simdutf_warn_unused result
117validate_with_errors(BytePtr data, size_t len) noexcept {
118 static_assert(
119 std::is_same<typename std::decay<decltype(*data)>::type, uint8_t>::value,
120 "dereferencing the data pointer must result in a uint8_t");
121 size_t pos = 0;
122 uint32_t code_point = 0;
123 while (pos < len) {
124 // check of the next 16 bytes are ascii.
125 size_t next_pos = pos + 16;
126 if (next_pos <=
127 len) { // if it is safe to read 16 more bytes, check that they are ascii
128 uint64_t v1;
129 std::memcpy(&v1, data + pos, sizeof(uint64_t));
130 uint64_t v2;
131 std::memcpy(&v2, data + pos + sizeof(uint64_t), sizeof(uint64_t));
132 uint64_t v{v1 | v2};
133 if ((v & 0x8080808080808080) == 0) {
134 pos = next_pos;
135 continue;
136 }
137 }
138 unsigned char byte = data[pos];
139
140 while (byte < 0b10000000) {
141 if (++pos == len) {
142 return result(error_code::SUCCESS, len);
143 }
144 byte = data[pos];
145 }
146
147 if ((byte & 0b11100000) == 0b11000000) {
148 next_pos = pos + 2;
149 if (next_pos > len) {
150 return result(error_code::TOO_SHORT, pos);
151 }
152 if ((data[pos + 1] & 0b11000000) != 0b10000000) {
153 return result(error_code::TOO_SHORT, pos);
154 }
155 // range check
156 code_point = (byte & 0b00011111) << 6 | (data[pos + 1] & 0b00111111);
157 if (code_point < 0x80) {
158 return result(error_code::OVERLONG, pos);
159 }
160 } else if ((byte & 0b11110000) == 0b11100000) {
161 next_pos = pos + 3;
162 if (next_pos > len) {
163 return result(error_code::TOO_SHORT, pos);
164 }
165 if ((data[pos + 1] & 0b11000000) != 0b10000000) {
166 return result(error_code::TOO_SHORT, pos);
167 }
168 if ((data[pos + 2] & 0b11000000) != 0b10000000) {
169 return result(error_code::TOO_SHORT, pos);
170 }
171 // range check
172 code_point = (byte & 0b00001111) << 12 |
173 (data[pos + 1] & 0b00111111) << 6 |
174 (data[pos + 2] & 0b00111111);
175 if (code_point < 0x800) {
176 return result(error_code::OVERLONG, pos);
177 }
178 if (0xd7ff < code_point && code_point < 0xe000) {
179 return result(error_code::SURROGATE, pos);
180 }
181 } else if ((byte & 0b11111000) == 0b11110000) { // 0b11110000
182 next_pos = pos + 4;
183 if (next_pos > len) {
184 return result(error_code::TOO_SHORT, pos);
185 }
186 if ((data[pos + 1] & 0b11000000) != 0b10000000) {
187 return result(error_code::TOO_SHORT, pos);
188 }
189 if ((data[pos + 2] & 0b11000000) != 0b10000000) {
190 return result(error_code::TOO_SHORT, pos);
191 }
192 if ((data[pos + 3] & 0b11000000) != 0b10000000) {
193 return result(error_code::TOO_SHORT, pos);
194 }
195 // range check
196 code_point =
197 (byte & 0b00000111) << 18 | (data[pos + 1] & 0b00111111) << 12 |
198 (data[pos + 2] & 0b00111111) << 6 | (data[pos + 3] & 0b00111111);
199 if (code_point <= 0xffff) {
200 return result(error_code::OVERLONG, pos);
201 }
202 if (0x10ffff < code_point) {
203 return result(error_code::TOO_LARGE, pos);
204 }
205 } else {
206 // we either have too many continuation bytes or an invalid leading byte
207 if ((byte & 0b11000000) == 0b10000000) {
208 return result(error_code::TOO_LONG, pos);
209 } else {
210 return result(error_code::HEADER_BITS, pos);
211 }
212 }
213 pos = next_pos;
214 }
215 return result(error_code::SUCCESS, len);
216}
217
218simdutf_really_inline simdutf_warn_unused result
219validate_with_errors(const char *buf, size_t len) noexcept {
220 return validate_with_errors(reinterpret_cast<const uint8_t *>(buf), len);
221}
222
223// Finds the previous leading byte starting backward from buf and validates with
224// errors from there Used to pinpoint the location of an error when an invalid
225// chunk is detected We assume that the stream starts with a leading byte, and
226// to check that it is the case, we ask that you pass a pointer to the start of
227// the stream (start). Note that the resulting count is underflowed if an error
228// is encountered in the rewinded segment.
229inline simdutf_warn_unused result rewind_and_validate_with_errors(
230 const char *start, const char *buf, size_t len) noexcept {
231 // First check that we start with a leading byte
232 if ((*start & 0b11000000) == 0b10000000) {
233 return result(error_code::TOO_LONG, 0);
234 }
235 size_t extra_len{0};
236 // A leading byte cannot be further than 4 bytes away
237 for (int i = 0; i < 5; i++) {
238 unsigned char byte = *buf;
239 if ((byte & 0b11000000) != 0b10000000) {
240 break;
241 } else {
242 buf--;
243 extra_len++;
244 }
245 }
246
247 result res = validate_with_errors(buf, len + extra_len);
248 res.count -= extra_len; // Might underflow
249 return res;
250}
251
252template <typename InputPtr>
253#if SIMDUTF_CPLUSPLUS20
254 requires simdutf::detail::indexes_into_byte_like<InputPtr>
255#endif
256simdutf_constexpr23 size_t count_code_points(InputPtr data, size_t len) {
257 size_t counter{0};
258 for (size_t i = 0; i < len; i++) {
259 // -65 is 0b10111111, anything larger in two-complement's should start a new
260 // code point.
261 if (int8_t(data[i]) > -65) {
262 counter++;
263 }
264 }
265 return counter;
266}
267
268template <typename InputPtr>
269#if SIMDUTF_CPLUSPLUS20
270 requires simdutf::detail::indexes_into_byte_like<InputPtr>
271#endif
272simdutf_constexpr23 size_t utf16_length_from_utf8(InputPtr data, size_t len) {
273 size_t counter{0};
274 for (size_t i = 0; i < len; i++) {
275 if (int8_t(data[i]) > -65) {
276 counter++;
277 }
278 if (uint8_t(data[i]) >= 240) {
279 counter++;
280 }
281 }
282 return counter;
283}
284
285template <typename InputPtr>
286#if SIMDUTF_CPLUSPLUS20
287 requires simdutf::detail::indexes_into_byte_like<InputPtr>
288#endif
289simdutf_warn_unused simdutf_constexpr23 size_t
290trim_partial_utf8(InputPtr input, size_t length) {
291 if (length < 3) {
292 switch (length) {
293 case 2:
294 if (uint8_t(input[length - 1]) >= 0xc0) {
295 return length - 1;
296 } // 2-, 3- and 4-byte characters with only 1 byte left
297 if (uint8_t(input[length - 2]) >= 0xe0) {
298 return length - 2;
299 } // 3- and 4-byte characters with only 2 bytes left
300 return length;
301 case 1:
302 if (uint8_t(input[length - 1]) >= 0xc0) {
303 return length - 1;
304 } // 2-, 3- and 4-byte characters with only 1 byte left
305 return length;
306 case 0:
307 return length;
308 }
309 }
310 if (uint8_t(input[length - 1]) >= 0xc0) {
311 return length - 1;
312 } // 2-, 3- and 4-byte characters with only 1 byte left
313 if (uint8_t(input[length - 2]) >= 0xe0) {
314 return length - 2;
315 } // 3- and 4-byte characters with only 1 byte left
316 if (uint8_t(input[length - 3]) >= 0xf0) {
317 return length - 3;
318 } // 4-byte characters with only 3 bytes left
319 return length;
320}
321
322} // namespace utf8
323} // unnamed namespace
324} // namespace scalar
325} // namespace simdutf
326
327#endif