// validate_as_ascii<big_endian>(data, len): scans the `len` UTF-16 code units
// starting at `data`, from index 0 upward, and returns a bool.
// Each unit is first passed through scalar::utf16::swap_if_needed<big_endian>.
// NOTE(review): the per-unit check and the return statements are not visible
// in this excerpt; from the name, presumably any unit outside the ASCII range
// makes the result false — confirm.
8template <endianness big_endian>
9simdutf_warn_unused simdutf_constexpr23
bool
10validate_as_ascii(
const char16_t *data,
size_t len)
noexcept {
11 for (
size_t pos = 0; pos < len; pos++) {
12 char16_t word = scalar::utf16::swap_if_needed<big_endian>(data[pos]);
// validate<big_endian>(data, len): checks the UTF-16 code units at `data` and
// returns a bool. Only part of the body is visible in this excerpt (the loop
// header, the comparisons on diff/diff2 and the return statements are not).
20template <endianness big_endian>
21inline simdutf_warn_unused simdutf_constexpr23
bool
22validate(
const char16_t *data,
size_t len)
noexcept {
// Current unit, read through swap_if_needed<big_endian>.
25 char16_t word = scalar::utf16::swap_if_needed<big_endian>(data[pos]);
// (word & 0xF800) == 0xD800 holds exactly for units in 0xD800..0xDFFF,
// i.e. for any surrogate code unit, high or low.
26 if ((word & 0xF800) == 0xD800) {
// Distance of `word` above 0xD800, narrowed back to char16_t.
// NOTE(review): presumably compared against 0x3FF on a line not shown to
// tell a high surrogate from a low one — confirm.
30 char16_t diff = char16_t(word - 0xD800);
// The following unit, data[pos + 1], byte-swapped with u16_swap_bytes when
// match_system(big_endian) is false (the other ternary arm is not visible).
// NOTE(review): this read needs pos + 1 < len; confirm the guard exists on
// the lines not shown.
34 char16_t next_word = !match_system(big_endian)
35 ? u16_swap_bytes(data[pos + 1])
// Distance of `next_word` above 0xDC00, narrowed back to char16_t.
37 char16_t diff2 = char16_t(next_word - 0xDC00);
// validate_with_errors<big_endian>(data, len): checks the UTF-16 code units at
// `data` and returns a `result`. Every failure exit visible here is
// result(error_code::SURROGATE, pos); the visible success exit is
// result(error_code::SUCCESS, pos). The conditions guarding those returns are
// mostly not part of this excerpt.
49template <endianness big_endian>
50inline simdutf_warn_unused simdutf_constexpr23 result
51validate_with_errors(
const char16_t *data,
size_t len)
noexcept {
// Current unit, read through swap_if_needed<big_endian>.
54 char16_t word = scalar::utf16::swap_if_needed<big_endian>(data[pos]);
// True exactly for units in 0xD800..0xDFFF (any surrogate code unit).
55 if ((word & 0xF800) == 0xD800) {
57 return result(error_code::SURROGATE, pos);
// Distance of `word` above 0xD800, narrowed back to char16_t.
59 char16_t diff = char16_t(word - 0xD800);
61 return result(error_code::SURROGATE, pos);
// The following unit, byte-swapped with u16_swap_bytes when
// match_system(big_endian) is false (the other ternary arm is not visible).
// NOTE(review): this read needs pos + 1 < len; presumably the first
// SURROGATE return above covers that case — confirm.
63 char16_t next_word = !match_system(big_endian)
64 ? u16_swap_bytes(data[pos + 1])
// Distance of `next_word` above 0xDC00. The cast here is uint16_t while the
// variable is char16_t, so the value is converted again on initialization.
66 char16_t diff2 = uint16_t(next_word - 0xDC00);
68 return result(error_code::SURROGATE, pos);
75 return result(error_code::SUCCESS, pos);
// count_code_points<big_endian>(p, len): visits each of the `len` UTF-16 code
// units at `p` and adds 1 to `counter` for every unit that is NOT in
// 0xDC00..0xDFFF (the low-surrogate range), so the second half of a surrogate
// pair adds nothing. No validation is visible here.
// NOTE(review): the declaration of `counter` and the return statement are not
// part of this excerpt; presumably `counter` starts at 0 and is returned.
78template <endianness big_endian>
79simdutf_constexpr23
size_t count_code_points(
const char16_t *p,
size_t len) {
82 for (
size_t i = 0; i < len; i++) {
83 char16_t word = scalar::utf16::swap_if_needed<big_endian>(p[i]);
// (word & 0xFC00) != 0xDC00 is 1 unless the top six bits mark a low surrogate.
84 counter += ((word & 0xFC00) != 0xDC00);
// utf8_length_from_utf16<big_endian>(p, ...): walks the UTF-16 code units at
// `p` (index i from 0 while i < len), reads each through
// swap_if_needed<big_endian>, and accumulates into `counter` using boolean
// tests converted to size_t. One visible test adds 1 when the unit lies in
// 0x0800..0xD7FF or satisfies a further condition that is cut off here.
// NOTE(review): the rest of the parameter list, the declaration of `counter`,
// the remaining addends and the return are not part of this excerpt;
// presumably the total is the UTF-8 byte length of the input — confirm.
89template <endianness big_endian>
90simdutf_constexpr23
size_t utf8_length_from_utf16(
const char16_t *p,
94 for (
size_t i = 0; i < len; i++) {
95 char16_t word = scalar::utf16::swap_if_needed<big_endian>(p[i]);
97 counter +=
static_cast<size_t>(
100 counter +=
static_cast<size_t>((word > 0x7FF && word <= 0xD7FF) ||
// utf32_length_from_utf16<big_endian>(p, ...): visits the UTF-16 code units at
// `p` (index i from 0 while i < len) and adds 1 to `counter` for every unit
// outside 0xDC00..0xDFFF — the same per-unit test that count_code_points
// applies.
// NOTE(review): the rest of the parameter list, the declaration of `counter`
// and the return statement are not part of this excerpt.
106template <endianness big_endian>
107simdutf_constexpr23
size_t utf32_length_from_utf16(
const char16_t *p,
111 for (
size_t i = 0; i < len; i++) {
112 char16_t word = scalar::utf16::swap_if_needed<big_endian>(p[i]);
// 1 unless the top six bits of `word` mark a low surrogate.
113 counter += ((word & 0xFC00) != 0xDC00);
118simdutf_really_inline simdutf_constexpr23
void
119change_endianness_utf16(
const char16_t *input,
size_t size,
char16_t *output) {
120 for (
size_t i = 0; i < size; i++) {
121 *output++ = char16_t(input[i] >> 8 | input[i] << 8);
// trim_partial_utf16<big_endian>(input, length): looks at the final code unit
// of the buffer and shortens `length` by one when that unit is in
// 0xD800..0xDBFF, i.e. a high surrogate left without its partner.
// NOTE(review): input[length - 1] requires length > 0; the lines between the
// signature and the first visible statement are not part of this excerpt, so
// confirm an empty-input guard lives there. The return statement is likewise
// not visible; presumably the adjusted `length` is returned.
125template <endianness big_endian>
126simdutf_warn_unused simdutf_constexpr23
size_t
127trim_partial_utf16(
const char16_t *input,
size_t length) {
// Last unit of the buffer, as an unsigned 16-bit value.
131 uint16_t last_word = uint16_t(input[length - 1]);
132 last_word = scalar::utf16::swap_if_needed<big_endian>(last_word);
// The comparison yields 1 for a trailing high surrogate, 0 otherwise.
133 length -= ((last_word & 0xFC00) == 0xD800);
137template <endianness big_endian>
constexpr bool is_high_surrogate(
char16_t c) {
138 c = scalar::utf16::swap_if_needed<big_endian>(c);
139 return (0xd800 <= c && c <= 0xdbff);
142template <endianness big_endian>
constexpr bool is_low_surrogate(
char16_t c) {
143 c = scalar::utf16::swap_if_needed<big_endian>(c);
144 return (0xdc00 <= c && c <= 0xdfff);
147simdutf_really_inline
constexpr bool high_surrogate(
char16_t c) {
148 return (0xd800 <= c && c <= 0xdbff);
151simdutf_really_inline
constexpr bool low_surrogate(
char16_t c) {
152 return (0xdc00 <= c && c <= 0xdfff);
// utf8_length_from_utf16_with_replacement<big_endian>(p, len): walks the `len`
// UTF-16 code units at `p`, accumulating into `counter`, and returns a
// `result` whose error code is error_code::SURROGATE when any surrogate unit
// (high or low) was encountered and error_code::SUCCESS otherwise.
// NOTE(review): the declaration of `counter`, the amounts added on the
// surrogate paths and the second member of the returned result are not part
// of this excerpt; from the name, presumably the length counts each unpaired
// surrogate as a replacement character — confirm.
155template <endianness big_endian>
156simdutf_constexpr23 result
157utf8_length_from_utf16_with_replacement(
const char16_t *p,
size_t len) {
// Becomes true as soon as any surrogate code unit is seen.
158 bool any_surrogates =
false;
161 for (
size_t i = 0; i < len; i++) {
162 if (is_high_surrogate<big_endian>(p[i])) {
163 any_surrogates =
true;
// A high surrogate forms a pair only if a next unit exists and it is a low
// surrogate; the bounds test comes first so p[i + 1] is read only in range.
165 if (i + 1 < len && is_low_surrogate<big_endian>(p[i + 1])) {
172 }
else if (is_low_surrogate<big_endian>(p[i])) {
173 any_surrogates =
true;
// Unit byte-swapped with u16_swap_bytes when match_system(big_endian) is
// false, otherwise used as stored.
177 char16_t word = !match_system(big_endian) ? u16_swap_bytes(p[i]) : p[i];
180 static_cast<size_t>(word > 0x7F);
// Adds one more for units above 0x7FF.
181 counter +=
static_cast<size_t>(word > 0x7FF);
183 return {any_surrogates ? error_code::SURROGATE : error_code::SUCCESS,
188template <endianness big_endian>
constexpr char16_t replacement() {
189 return !match_system(big_endian) ? scalar::u16_swap_bytes(0xfffd) : 0xfffd;
// to_well_formed_utf16<big_endian>(input, len, ...): copies UTF-16 code units
// from `input` to `output` index by index, writing
// utf16::replacement<big_endian>() over surrogates that are not part of a
// high+low pair.
// NOTE(review): the remaining parameters (presumably `output`), the
// initialisation of `i`, and several braces/else arms are not part of this
// excerpt; the comments below describe only the visible statements.
192template <endianness big_endian>
193simdutf_constexpr23
void to_well_formed_utf16(
const char16_t *input,
size_t len,
195 const char16_t replacement = utf16::replacement<big_endian>();
// high_surrogate_prev remembers whether the previous unit was a high
// surrogate; the other two flags are recomputed for every unit.
196 bool high_surrogate_prev =
false, high_surrogate, low_surrogate;
198 for (; i < len; i++) {
199 char16_t c = input[i];
200 high_surrogate = is_high_surrogate<big_endian>(c);
201 low_surrogate = is_low_surrogate<big_endian>(c);
// Previous unit was a high surrogate but this one is not its low partner:
// overwrite the previous output slot.
202 if (high_surrogate_prev && !low_surrogate) {
203 output[i - 1] = replacement;
// Low surrogate with no preceding high surrogate: replace the current slot.
206 if (!high_surrogate_prev && low_surrogate) {
207 output[i] = replacement;
// NOTE(review): presumably the else arm of the test above — confirm.
209 output[i] = input[i];
211 high_surrogate_prev = high_surrogate;
// After the loop: the input ended on a high surrogate, so the last slot
// written is replaced.
215 if (high_surrogate_prev) {
216 output[i - 1] = replacement;