// Checks whether the `len` bytes starting at `data` are well-formed UTF-8.
// `BytePtr` must dereference to uint8_t (enforced by the type check below).
// NOTE(review): several statements of this function (loop header, position
// updates, return statements, closing braces) are not visible in this
// excerpt; the comments below describe only the checks that are shown.
// Presumably returns true for valid input and false otherwise -- confirm.
10template <
class BytePtr>
11simdutf_constexpr23 simdutf_warn_unused
bool validate(BytePtr data,
12 size_t len)
noexcept {
// Compile-time type check: decay_t<decltype(*data)> must be exactly uint8_t.
14 std::is_same<
typename std::decay<
decltype(*data)>::type, uint8_t>::value,
15 "dereferencing the data pointer must result in a uint8_t");
// Holds the decoded scalar value of the multi-byte sequence being checked.
17 uint32_t code_point = 0;
// C++23-specific handling starts here; the guarded lines are not visible.
20#if SIMDUTF_CPLUSPLUS23
// Fast path: only taken when next_pos does not exceed len, so the two
// 8-byte loads below (16 bytes total, starting at pos) stay in bounds.
25 if (next_pos <= len) {
// memcpy is used for the loads, so no alignment of `data` is required.
28 std::memcpy(&v1, data + pos,
sizeof(uint64_t));
30 std::memcpy(&v2, data + pos +
sizeof(uint64_t),
sizeof(uint64_t));
// 0x80 in every byte lane: a zero result means no byte has its high bit
// set, i.e. every byte covered by `v` is ASCII.
// (`v` is declared on a line not shown; presumably v1 | v2 -- confirm.)
32 if ((v & 0x8080808080808080) == 0) {
// Slow path: examine the byte at the current position.
39 unsigned char byte = data[pos];
// Bytes below 0x80 are single-byte (ASCII) code points.
41 while (
byte < 0b10000000) {
// Two-byte sequence: lead byte has the form 110xxxxx.
48 if ((
byte & 0b11100000) == 0b11000000) {
// The second byte must be a continuation byte (10xxxxxx).
53 if ((data[pos + 1] & 0b11000000) != 0b10000000) {
// Assemble the code point from 5 lead bits + 6 continuation bits.
57 code_point = (
byte & 0b00011111) << 6 | (data[pos + 1] & 0b00111111);
// Two-byte sequences must encode U+0080..U+07FF; a smaller value is an
// overlong encoding.
58 if ((code_point < 0x80) || (0x7ff < code_point)) {
61 }
// Three-byte sequence: lead byte has the form 1110xxxx.
else if ((
byte & 0b11110000) == 0b11100000) {
// Both trailing bytes must be continuation bytes (10xxxxxx).
66 if ((data[pos + 1] & 0b11000000) != 0b10000000) {
69 if ((data[pos + 2] & 0b11000000) != 0b10000000) {
// Assemble the code point from 4 lead bits + 6 + 6 continuation bits.
73 code_point = (
byte & 0b00001111) << 12 |
74 (data[pos + 1] & 0b00111111) << 6 |
75 (data[pos + 2] & 0b00111111);
// Three-byte sequences must encode U+0800..U+FFFF and must not fall in
// the surrogate range U+D800..U+DFFF.
76 if ((code_point < 0x800) || (0xffff < code_point) ||
77 (0xd7ff < code_point && code_point < 0xe000)) {
80 }
// Four-byte sequence: lead byte has the form 11110xxx.
else if ((
byte & 0b11111000) == 0b11110000) {
// All three trailing bytes must be continuation bytes (10xxxxxx).
85 if ((data[pos + 1] & 0b11000000) != 0b10000000) {
88 if ((data[pos + 2] & 0b11000000) != 0b10000000) {
91 if ((data[pos + 3] & 0b11000000) != 0b10000000) {
// Assemble the code point from 3 lead bits + 6 + 6 + 6 continuation bits.
96 (
byte & 0b00000111) << 18 | (data[pos + 1] & 0b00111111) << 12 |
97 (data[pos + 2] & 0b00111111) << 6 | (data[pos + 3] & 0b00111111);
// Four-byte sequences must encode U+10000..U+10FFFF: anything at or below
// U+FFFF is overlong, anything above U+10FFFF is out of range.
98 if (code_point <= 0xffff || 0x10ffff < code_point) {
// Convenience overload for `const char *` input: reinterprets the buffer as
// `const uint8_t *` and forwards to validate() with the same length.
110simdutf_really_inline simdutf_warn_unused
bool validate(
const char *buf,
111 size_t len)
noexcept {
112 return validate(
reinterpret_cast<const uint8_t *
>(buf), len);
// Checks the `len` bytes starting at `data` as UTF-8 and reports the outcome
// as a `result(error_code, position)`:
//   - SUCCESS with `len` when no error is found;
//   - otherwise an error code together with `pos`, the index of the byte
//     being examined when the error was detected.
// `BytePtr` must dereference to uint8_t (enforced by the type check below).
// NOTE(review): several statements of this function (loop header, position
// updates, closing braces) are not visible in this excerpt; the comments
// below describe only what the visible lines show.
115template <
class BytePtr>
116simdutf_constexpr23 simdutf_warn_unused result
117validate_with_errors(BytePtr data,
size_t len)
noexcept {
// Compile-time type check: decay_t<decltype(*data)> must be exactly uint8_t.
119 std::is_same<
typename std::decay<
decltype(*data)>::type, uint8_t>::value,
120 "dereferencing the data pointer must result in a uint8_t");
// Holds the decoded scalar value of the multi-byte sequence being checked.
122 uint32_t code_point = 0;
// End of the 16-byte window inspected by the ASCII fast path below.
125 size_t next_pos = pos + 16;
// Load two 8-byte words (16 bytes starting at pos) via memcpy, so no
// alignment of `data` is required.
129 std::memcpy(&v1, data + pos,
sizeof(uint64_t));
131 std::memcpy(&v2, data + pos +
sizeof(uint64_t),
sizeof(uint64_t));
// 0x80 in every byte lane: a zero result means every byte covered by `v`
// is ASCII. (`v` is declared on a line not shown; presumably v1 | v2.)
133 if ((v & 0x8080808080808080) == 0) {
// Slow path: examine the byte at the current position.
138 unsigned char byte = data[pos];
// Bytes below 0x80 are single-byte (ASCII) code points.
140 while (
byte < 0b10000000) {
// Success exit inside the ASCII run; the guarding condition is on a line
// not shown (presumably "reached the end of the input" -- confirm).
142 return result(error_code::SUCCESS, len);
// Two-byte sequence: lead byte has the form 110xxxxx.
147 if ((
byte & 0b11100000) == 0b11000000) {
// TOO_SHORT: next_pos lies beyond the input. (next_pos is presumably
// reassigned to the end of this sequence on a line not shown.)
149 if (next_pos > len) {
150 return result(error_code::TOO_SHORT, pos);
// TOO_SHORT is also reported when the second byte is not a continuation
// byte (10xxxxxx).
152 if ((data[pos + 1] & 0b11000000) != 0b10000000) {
153 return result(error_code::TOO_SHORT, pos);
// Assemble the code point from 5 lead bits + 6 continuation bits.
156 code_point = (
byte & 0b00011111) << 6 | (data[pos + 1] & 0b00111111);
// OVERLONG: a two-byte sequence must encode U+0080..U+07FF.
157 if ((code_point < 0x80) || (0x7ff < code_point)) {
158 return result(error_code::OVERLONG, pos);
160 }
// Three-byte sequence: lead byte has the form 1110xxxx.
else if ((
byte & 0b11110000) == 0b11100000) {
// TOO_SHORT: next_pos lies beyond the input.
162 if (next_pos > len) {
163 return result(error_code::TOO_SHORT, pos);
// TOO_SHORT: either trailing byte is not a continuation byte (10xxxxxx).
165 if ((data[pos + 1] & 0b11000000) != 0b10000000) {
166 return result(error_code::TOO_SHORT, pos);
168 if ((data[pos + 2] & 0b11000000) != 0b10000000) {
169 return result(error_code::TOO_SHORT, pos);
// Assemble the code point from 4 lead bits + 6 + 6 continuation bits.
172 code_point = (
byte & 0b00001111) << 12 |
173 (data[pos + 1] & 0b00111111) << 6 |
174 (data[pos + 2] & 0b00111111);
// OVERLONG: a three-byte sequence must encode U+0800..U+FFFF.
175 if ((code_point < 0x800) || (0xffff < code_point)) {
176 return result(error_code::OVERLONG, pos);
// SURROGATE: U+D800..U+DFFF is rejected.
178 if (0xd7ff < code_point && code_point < 0xe000) {
179 return result(error_code::SURROGATE, pos);
181 }
// Four-byte sequence: lead byte has the form 11110xxx.
else if ((
byte & 0b11111000) == 0b11110000) {
// TOO_SHORT: next_pos lies beyond the input.
183 if (next_pos > len) {
184 return result(error_code::TOO_SHORT, pos);
// TOO_SHORT: any of the three trailing bytes is not a continuation byte.
186 if ((data[pos + 1] & 0b11000000) != 0b10000000) {
187 return result(error_code::TOO_SHORT, pos);
189 if ((data[pos + 2] & 0b11000000) != 0b10000000) {
190 return result(error_code::TOO_SHORT, pos);
192 if ((data[pos + 3] & 0b11000000) != 0b10000000) {
193 return result(error_code::TOO_SHORT, pos);
// Assemble the code point from 3 lead bits + 6 + 6 + 6 continuation bits.
197 (
byte & 0b00000111) << 18 | (data[pos + 1] & 0b00111111) << 12 |
198 (data[pos + 2] & 0b00111111) << 6 | (data[pos + 3] & 0b00111111);
// OVERLONG: the value would have fit in three bytes or fewer.
199 if (code_point <= 0xffff) {
200 return result(error_code::OVERLONG, pos);
// TOO_LARGE: the value exceeds U+10FFFF.
202 if (0x10ffff < code_point) {
203 return result(error_code::TOO_LARGE, pos);
// The byte matched none of the lead-byte patterns above.
// TOO_LONG: it is a continuation byte (10xxxxxx) where a lead byte was
// expected.
207 if ((
byte & 0b11000000) == 0b10000000) {
208 return result(error_code::TOO_LONG, pos);
// HEADER_BITS: any other byte value in lead position.
210 return result(error_code::HEADER_BITS, pos);
// Whole input consumed without error.
215 return result(error_code::SUCCESS, len);
// Convenience overload for `const char *` input: reinterprets the buffer as
// `const uint8_t *` and forwards to validate_with_errors() with the same
// length, returning its result unchanged.
218simdutf_really_inline simdutf_warn_unused result
219validate_with_errors(
const char *buf,
size_t len)
noexcept {
220 return validate_with_errors(
reinterpret_cast<const uint8_t *
>(buf), len);
// Validates UTF-8 at `buf` after first inspecting up to five bytes for a
// non-continuation byte, then adjusts the reported position by `extra_len`.
// `start` is dereferenced once, to test the first byte it points to.
// NOTE(review): the declaration of `extra_len`, the statements inside the
// loop, and the final return are not visible in this excerpt. Presumably
// `buf` is moved backwards to the start of the current code point and
// `extra_len` counts the bytes rewound -- confirm against the full source.
228inline simdutf_warn_unused result rewind_and_validate_with_errors(
229 const char *start,
const char *buf,
size_t len)
noexcept {
// A continuation byte (10xxxxxx) at `start` cannot begin a code point:
// report TOO_LONG at position 0.
231 if ((*start & 0b11000000) == 0b10000000) {
232 return result(error_code::TOO_LONG, 0);
// Look at up to five bytes for one that is not a continuation byte.
236 for (
int i = 0; i < 5; i++) {
237 unsigned char byte = *buf;
238 if ((
byte & 0b11000000) != 0b10000000) {
// Validate with the length extended by extra_len, then subtract extra_len
// from the returned count.
246 result res = validate_with_errors(buf, len + extra_len);
247 res.count -= extra_len;
// Scans `len` bytes of `data` and tests each byte for being something other
// than a UTF-8 continuation byte.
// Under C++20, InputPtr is constrained by
// simdutf::detail::indexes_into_byte_like.
// NOTE(review): the counter and the return statement are not visible in this
// excerpt; presumably each byte passing the test counts as one code point.
251template <
typename InputPtr>
252#if SIMDUTF_CPLUSPLUS20
253 requires simdutf::detail::indexes_into_byte_like<InputPtr>
255simdutf_constexpr23
size_t count_code_points(InputPtr data,
size_t len) {
257 for (
size_t i = 0; i < len; i++) {
// As a signed 8-bit value, continuation bytes 0x80..0xBF are -128..-65, so
// "> -65" is true exactly for bytes that are NOT continuation bytes.
260 if (int8_t(data[i]) > -65) {
// Scans `len` bytes of UTF-8 in `data`, applying two per-byte tests: "not a
// continuation byte" and "byte value >= 240".
// Under C++20, InputPtr is constrained by
// simdutf::detail::indexes_into_byte_like.
// NOTE(review): the counter and the return statement are not visible in this
// excerpt. Presumably each non-continuation byte adds one UTF-16 unit and
// each byte >= 240 adds one more for a surrogate pair -- confirm.
267template <
typename InputPtr>
268#if SIMDUTF_CPLUSPLUS20
269 requires simdutf::detail::indexes_into_byte_like<InputPtr>
271simdutf_constexpr23
size_t utf16_length_from_utf8(InputPtr data,
size_t len) {
273 for (
size_t i = 0; i < len; i++) {
// As a signed 8-bit value, continuation bytes 0x80..0xBF are -128..-65, so
// "> -65" is true exactly for bytes that are NOT continuation bytes.
274 if (int8_t(data[i]) > -65) {
// 240 == 0xF0: bytes at or above this value match the four-byte lead
// pattern 1111xxxx.
277 if (uint8_t(data[i]) >= 240) {
284template <
typename InputPtr>
285#if SIMDUTF_CPLUSPLUS20
286 requires simdutf::detail::indexes_into_byte_like<InputPtr>
288simdutf_warn_unused simdutf_constexpr23
size_t
289trim_partial_utf8(InputPtr input,
size_t length) {
293 if (uint8_t(input[length - 1]) >= 0xc0) {
296 if (uint8_t(input[length - 2]) >= 0xe0) {
301 if (uint8_t(input[length - 1]) >= 0xc0) {
309 if (uint8_t(input[length - 1]) >= 0xc0) {
312 if (uint8_t(input[length - 2]) >= 0xe0) {
315 if (uint8_t(input[length - 3]) >= 0xf0) {