simdutf 8.0.0
Unicode at GB/s.
Loading...
Searching...
No Matches
utf8.h
1#ifndef SIMDUTF_UTF8_H
2#define SIMDUTF_UTF8_H
3
4namespace simdutf {
5namespace scalar {
6namespace {
7namespace utf8 {
8
9// credit: based on code from Google Fuchsia (Apache Licensed)
10template <class BytePtr>
11simdutf_constexpr23 simdutf_warn_unused bool validate(BytePtr data,
12 size_t len) noexcept {
13 static_assert(
14 std::is_same<typename std::decay<decltype(*data)>::type, uint8_t>::value,
15 "dereferencing the data pointer must result in a uint8_t");
16 uint64_t pos = 0;
17 uint32_t code_point = 0;
18 while (pos < len) {
19 uint64_t next_pos;
20#if SIMDUTF_CPLUSPLUS23
21 if !consteval
22#endif
23 { // check if the next 16 bytes are ascii.
24 next_pos = pos + 16;
25 if (next_pos <= len) { // if it is safe to read 16 more bytes, check
26 // that they are ascii
27 uint64_t v1{};
28 std::memcpy(&v1, data + pos, sizeof(uint64_t));
29 uint64_t v2{};
30 std::memcpy(&v2, data + pos + sizeof(uint64_t), sizeof(uint64_t));
31 uint64_t v{v1 | v2};
32 if ((v & 0x8080808080808080) == 0) {
33 pos = next_pos;
34 continue;
35 }
36 }
37 }
38
39 unsigned char byte = data[pos];
40
41 while (byte < 0b10000000) {
42 if (++pos == len) {
43 return true;
44 }
45 byte = data[pos];
46 }
47
48 if ((byte & 0b11100000) == 0b11000000) {
49 next_pos = pos + 2;
50 if (next_pos > len) {
51 return false;
52 }
53 if ((data[pos + 1] & 0b11000000) != 0b10000000) {
54 return false;
55 }
56 // range check
57 code_point = (byte & 0b00011111) << 6 | (data[pos + 1] & 0b00111111);
58 if ((code_point < 0x80) || (0x7ff < code_point)) {
59 return false;
60 }
61 } else if ((byte & 0b11110000) == 0b11100000) {
62 next_pos = pos + 3;
63 if (next_pos > len) {
64 return false;
65 }
66 if ((data[pos + 1] & 0b11000000) != 0b10000000) {
67 return false;
68 }
69 if ((data[pos + 2] & 0b11000000) != 0b10000000) {
70 return false;
71 }
72 // range check
73 code_point = (byte & 0b00001111) << 12 |
74 (data[pos + 1] & 0b00111111) << 6 |
75 (data[pos + 2] & 0b00111111);
76 if ((code_point < 0x800) || (0xffff < code_point) ||
77 (0xd7ff < code_point && code_point < 0xe000)) {
78 return false;
79 }
80 } else if ((byte & 0b11111000) == 0b11110000) { // 0b11110000
81 next_pos = pos + 4;
82 if (next_pos > len) {
83 return false;
84 }
85 if ((data[pos + 1] & 0b11000000) != 0b10000000) {
86 return false;
87 }
88 if ((data[pos + 2] & 0b11000000) != 0b10000000) {
89 return false;
90 }
91 if ((data[pos + 3] & 0b11000000) != 0b10000000) {
92 return false;
93 }
94 // range check
95 code_point =
96 (byte & 0b00000111) << 18 | (data[pos + 1] & 0b00111111) << 12 |
97 (data[pos + 2] & 0b00111111) << 6 | (data[pos + 3] & 0b00111111);
98 if (code_point <= 0xffff || 0x10ffff < code_point) {
99 return false;
100 }
101 } else {
102 // we may have a continuation
103 return false;
104 }
105 pos = next_pos;
106 }
107 return true;
108}
109
110simdutf_really_inline simdutf_warn_unused bool validate(const char *buf,
111 size_t len) noexcept {
112 return validate(reinterpret_cast<const uint8_t *>(buf), len);
113}
114
115template <class BytePtr>
116simdutf_constexpr23 simdutf_warn_unused result
117validate_with_errors(BytePtr data, size_t len) noexcept {
118 static_assert(
119 std::is_same<typename std::decay<decltype(*data)>::type, uint8_t>::value,
120 "dereferencing the data pointer must result in a uint8_t");
121 size_t pos = 0;
122 uint32_t code_point = 0;
123 while (pos < len) {
124 // check of the next 16 bytes are ascii.
125 size_t next_pos = pos + 16;
126 if (next_pos <=
127 len) { // if it is safe to read 16 more bytes, check that they are ascii
128 uint64_t v1;
129 std::memcpy(&v1, data + pos, sizeof(uint64_t));
130 uint64_t v2;
131 std::memcpy(&v2, data + pos + sizeof(uint64_t), sizeof(uint64_t));
132 uint64_t v{v1 | v2};
133 if ((v & 0x8080808080808080) == 0) {
134 pos = next_pos;
135 continue;
136 }
137 }
138 unsigned char byte = data[pos];
139
140 while (byte < 0b10000000) {
141 if (++pos == len) {
142 return result(error_code::SUCCESS, len);
143 }
144 byte = data[pos];
145 }
146
147 if ((byte & 0b11100000) == 0b11000000) {
148 next_pos = pos + 2;
149 if (next_pos > len) {
150 return result(error_code::TOO_SHORT, pos);
151 }
152 if ((data[pos + 1] & 0b11000000) != 0b10000000) {
153 return result(error_code::TOO_SHORT, pos);
154 }
155 // range check
156 code_point = (byte & 0b00011111) << 6 | (data[pos + 1] & 0b00111111);
157 if ((code_point < 0x80) || (0x7ff < code_point)) {
158 return result(error_code::OVERLONG, pos);
159 }
160 } else if ((byte & 0b11110000) == 0b11100000) {
161 next_pos = pos + 3;
162 if (next_pos > len) {
163 return result(error_code::TOO_SHORT, pos);
164 }
165 if ((data[pos + 1] & 0b11000000) != 0b10000000) {
166 return result(error_code::TOO_SHORT, pos);
167 }
168 if ((data[pos + 2] & 0b11000000) != 0b10000000) {
169 return result(error_code::TOO_SHORT, pos);
170 }
171 // range check
172 code_point = (byte & 0b00001111) << 12 |
173 (data[pos + 1] & 0b00111111) << 6 |
174 (data[pos + 2] & 0b00111111);
175 if ((code_point < 0x800) || (0xffff < code_point)) {
176 return result(error_code::OVERLONG, pos);
177 }
178 if (0xd7ff < code_point && code_point < 0xe000) {
179 return result(error_code::SURROGATE, pos);
180 }
181 } else if ((byte & 0b11111000) == 0b11110000) { // 0b11110000
182 next_pos = pos + 4;
183 if (next_pos > len) {
184 return result(error_code::TOO_SHORT, pos);
185 }
186 if ((data[pos + 1] & 0b11000000) != 0b10000000) {
187 return result(error_code::TOO_SHORT, pos);
188 }
189 if ((data[pos + 2] & 0b11000000) != 0b10000000) {
190 return result(error_code::TOO_SHORT, pos);
191 }
192 if ((data[pos + 3] & 0b11000000) != 0b10000000) {
193 return result(error_code::TOO_SHORT, pos);
194 }
195 // range check
196 code_point =
197 (byte & 0b00000111) << 18 | (data[pos + 1] & 0b00111111) << 12 |
198 (data[pos + 2] & 0b00111111) << 6 | (data[pos + 3] & 0b00111111);
199 if (code_point <= 0xffff) {
200 return result(error_code::OVERLONG, pos);
201 }
202 if (0x10ffff < code_point) {
203 return result(error_code::TOO_LARGE, pos);
204 }
205 } else {
206 // we either have too many continuation bytes or an invalid leading byte
207 if ((byte & 0b11000000) == 0b10000000) {
208 return result(error_code::TOO_LONG, pos);
209 } else {
210 return result(error_code::HEADER_BITS, pos);
211 }
212 }
213 pos = next_pos;
214 }
215 return result(error_code::SUCCESS, len);
216}
217
218simdutf_really_inline simdutf_warn_unused result
219validate_with_errors(const char *buf, size_t len) noexcept {
220 return validate_with_errors(reinterpret_cast<const uint8_t *>(buf), len);
221}
222
223// Finds the previous leading byte starting backward from buf and validates with
224// errors from there Used to pinpoint the location of an error when an invalid
225// chunk is detected We assume that the stream starts with a leading byte, and
226// to check that it is the case, we ask that you pass a pointer to the start of
227// the stream (start).
228inline simdutf_warn_unused result rewind_and_validate_with_errors(
229 const char *start, const char *buf, size_t len) noexcept {
230 // First check that we start with a leading byte
231 if ((*start & 0b11000000) == 0b10000000) {
232 return result(error_code::TOO_LONG, 0);
233 }
234 size_t extra_len{0};
235 // A leading byte cannot be further than 4 bytes away
236 for (int i = 0; i < 5; i++) {
237 unsigned char byte = *buf;
238 if ((byte & 0b11000000) != 0b10000000) {
239 break;
240 } else {
241 buf--;
242 extra_len++;
243 }
244 }
245
246 result res = validate_with_errors(buf, len + extra_len);
247 res.count -= extra_len;
248 return res;
249}
250
251template <typename InputPtr>
252#if SIMDUTF_CPLUSPLUS20
253 requires simdutf::detail::indexes_into_byte_like<InputPtr>
254#endif
255simdutf_constexpr23 size_t count_code_points(InputPtr data, size_t len) {
256 size_t counter{0};
257 for (size_t i = 0; i < len; i++) {
258 // -65 is 0b10111111, anything larger in two-complement's should start a new
259 // code point.
260 if (int8_t(data[i]) > -65) {
261 counter++;
262 }
263 }
264 return counter;
265}
266
267template <typename InputPtr>
268#if SIMDUTF_CPLUSPLUS20
269 requires simdutf::detail::indexes_into_byte_like<InputPtr>
270#endif
271simdutf_constexpr23 size_t utf16_length_from_utf8(InputPtr data, size_t len) {
272 size_t counter{0};
273 for (size_t i = 0; i < len; i++) {
274 if (int8_t(data[i]) > -65) {
275 counter++;
276 }
277 if (uint8_t(data[i]) >= 240) {
278 counter++;
279 }
280 }
281 return counter;
282}
283
284template <typename InputPtr>
285#if SIMDUTF_CPLUSPLUS20
286 requires simdutf::detail::indexes_into_byte_like<InputPtr>
287#endif
288simdutf_warn_unused simdutf_constexpr23 size_t
289trim_partial_utf8(InputPtr input, size_t length) {
290 if (length < 3) {
291 switch (length) {
292 case 2:
293 if (uint8_t(input[length - 1]) >= 0xc0) {
294 return length - 1;
295 } // 2-, 3- and 4-byte characters with only 1 byte left
296 if (uint8_t(input[length - 2]) >= 0xe0) {
297 return length - 2;
298 } // 3- and 4-byte characters with only 2 bytes left
299 return length;
300 case 1:
301 if (uint8_t(input[length - 1]) >= 0xc0) {
302 return length - 1;
303 } // 2-, 3- and 4-byte characters with only 1 byte left
304 return length;
305 case 0:
306 return length;
307 }
308 }
309 if (uint8_t(input[length - 1]) >= 0xc0) {
310 return length - 1;
311 } // 2-, 3- and 4-byte characters with only 1 byte left
312 if (uint8_t(input[length - 2]) >= 0xe0) {
313 return length - 2;
314 } // 3- and 4-byte characters with only 1 byte left
315 if (uint8_t(input[length - 3]) >= 0xf0) {
316 return length - 3;
317 } // 4-byte characters with only 3 bytes left
318 return length;
319}
320
321} // namespace utf8
322} // unnamed namespace
323} // namespace scalar
324} // namespace simdutf
325
326#endif