// validate_as_ascii: walks `len` UTF-16 code units starting at `data`,
// normalising each unit with swap_if_needed<big_endian> before inspecting it.
// NOTE(review): the per-unit test and the return statements are not visible
// in this excerpt; presumably any unit above the ASCII range makes the
// function return false - confirm against the full body.
8template <endianness big_endian>
9simdutf_warn_unused simdutf_constexpr23
bool
10validate_as_ascii(
const char16_t *data,
size_t len)
noexcept {
11 for (
size_t pos = 0; pos < len; pos++) {
// Convert the stored unit via swap_if_needed for the templated endianness.
12 char16_t word = scalar::utf16::swap_if_needed<big_endian>(data[pos]);
// validate: boolean UTF-16 validation over `len` code units at `data`.
// Only fragments of the body are visible here (the loop header, the
// comparisons on diff/diff2 and the return statements are elided), so the
// comments below describe just the visible statements.
20template <endianness big_endian>
21inline simdutf_warn_unused simdutf_constexpr23
bool
22validate(
const char16_t *data,
size_t len)
noexcept {
// Current unit, passed through swap_if_needed for the templated endianness.
25 char16_t word = scalar::utf16::swap_if_needed<big_endian>(data[pos]);
// True exactly when word lies in 0xD800..0xDFFF, i.e. it is a surrogate.
26 if ((word & 0xF800) == 0xD800) {
// Offset of the unit from the start of the surrogate block.
// NOTE(review): the range check on diff is elided - presumably diff > 0x3FF
// (a low surrogate in leading position) is rejected; confirm.
30 char16_t diff = char16_t(word - 0xD800);
// Following unit, byte-swapped when big_endian does not match the system.
// NOTE(review): reads data[pos + 1]; the bounds check guarding this read is
// not visible in this excerpt - confirm it precedes this line.
34 char16_t next_word = !match_system(big_endian)
35 ? u16_swap_bytes(data[pos + 1])
// Offset of the following unit from 0xDC00 (start of the low-surrogate range).
37 char16_t diff2 = char16_t(next_word - 0xDC00);
49template <endianness big_endian>
50inline simdutf_warn_unused simdutf_constexpr23 result
51validate_with_errors(
const char16_t *data,
size_t len)
noexcept {
54 char16_t word = scalar::utf16::swap_if_needed<big_endian>(data[pos]);
55 if ((word & 0xF800) == 0xD800) {
57 return result(error_code::SURROGATE, pos);
59 char16_t diff = char16_t(word - 0xD800);
61 return result(error_code::SURROGATE, pos);
63 char16_t next_word = !match_system(big_endian)
64 ? u16_swap_bytes(data[pos + 1])
66 char16_t diff2 = uint16_t(next_word - 0xDC00);
68 return result(error_code::SURROGATE, pos);
75 return result(error_code::SUCCESS, pos);
// count_code_points: iterates over `len` UTF-16 code units at `p` and
// accumulates into `counter` (its declaration and the return statement are
// not visible in this excerpt).
78template <endianness big_endian>
79simdutf_constexpr23
size_t count_code_points(
const char16_t *p,
size_t len) {
82 for (
size_t i = 0; i < len; i++) {
// Unit passed through swap_if_needed for the templated endianness.
83 char16_t word = scalar::utf16::swap_if_needed<big_endian>(p[i]);
// Adds 1 unless the unit is in 0xDC00..0xDFFF (a low surrogate), so a
// high/low surrogate pair contributes one to the count rather than two.
84 counter += ((word & 0xFC00) != 0xDC00);
// utf8_length_from_utf16: iterates over the UTF-16 code units at `p` and
// accumulates into `counter`.
// NOTE(review): the remaining parameters, the counter declaration, parts of
// both `counter +=` expressions and the return are elided from this excerpt;
// presumably the result is the UTF-8 byte count - confirm.
89template <endianness big_endian>
90simdutf_constexpr23
size_t utf8_length_from_utf16(
const char16_t *p,
94 for (
size_t i = 0; i < len; i++) {
// Unit passed through swap_if_needed for the templated endianness.
95 char16_t word = scalar::utf16::swap_if_needed<big_endian>(p[i]);
// First conditional increment; its operand is not visible here.
97 counter +=
static_cast<size_t>(
// Second conditional increment: true for units in 0x0800..0xD7FF, OR-ed with
// a further condition whose text is cut off in this excerpt.
100 counter +=
static_cast<size_t>((word > 0x7FF && word <= 0xD7FF) ||
// utf32_length_from_utf16: iterates over the UTF-16 code units at `p` and
// accumulates into `counter` (remaining parameters, the counter declaration
// and the return statement are not visible in this excerpt).
106template <endianness big_endian>
107simdutf_constexpr23
size_t utf32_length_from_utf16(
const char16_t *p,
111 for (
size_t i = 0; i < len; i++) {
// Unit passed through swap_if_needed for the templated endianness.
112 char16_t word = scalar::utf16::swap_if_needed<big_endian>(p[i]);
// Adds 1 unless the unit is in 0xDC00..0xDFFF (a low surrogate); the visible
// counting rule is the same expression used by count_code_points.
113 counter += ((word & 0xFC00) != 0xDC00);
// change_endianness_utf16: for each of the `size` code units in `input`,
// writes the unit with its two bytes exchanged to `output`, advancing
// `output` by one element per unit.
// NOTE(review): whether input and output may alias is not stated here.
118simdutf_really_inline simdutf_constexpr23
void
119change_endianness_utf16(
const char16_t *input,
size_t size,
char16_t *output) {
120 for (
size_t i = 0; i < size; i++) {
// Shifts bind tighter than |: (input[i] >> 8) | (input[i] << 8). The
// char16_t conversion drops the bits shifted above bit 15.
121 *output++ = char16_t(input[i] >> 8 | input[i] << 8);
// trim_partial_utf16: inspects the last code unit of `input` and shortens
// `length` by one when that unit is a high surrogate (0xD800..0xDBFF), i.e.
// when the buffer ends in the first half of a surrogate pair.
// NOTE(review): lines between the signature and the read below are elided;
// input[length - 1] requires length > 0, so confirm an empty-input guard
// exists there. The return statement is also not visible.
125template <endianness big_endian>
126simdutf_warn_unused simdutf_constexpr23
size_t
127trim_partial_utf16(
const char16_t *input,
size_t length) {
// Last stored unit, then passed through swap_if_needed.
131 uint16_t last_word = uint16_t(input[length - 1]);
132 last_word = scalar::utf16::swap_if_needed<big_endian>(last_word);
// The comparison yields 0 or 1; 1 exactly for units in 0xD800..0xDBFF.
133 length -= ((last_word & 0xFC00) == 0xD800);
// is_high_surrogate: returns true when `c`, after swap_if_needed for the
// templated endianness, lies in the inclusive range 0xD800..0xDBFF.
137template <endianness big_endian>
138simdutf_constexpr
bool is_high_surrogate(
char16_t c) {
// Normalise the stored unit before the range test.
139 c = scalar::utf16::swap_if_needed<big_endian>(c);
140 return (0xd800 <= c && c <= 0xdbff);
// is_low_surrogate: returns true when `c`, after swap_if_needed for the
// templated endianness, lies in the inclusive range 0xDC00..0xDFFF.
143template <endianness big_endian>
144simdutf_constexpr
bool is_low_surrogate(
char16_t c) {
// Normalise the stored unit before the range test.
145 c = scalar::utf16::swap_if_needed<big_endian>(c);
146 return (0xdc00 <= c && c <= 0xdfff);
// high_surrogate: range test on `c` as given (no byte swap is applied here);
// true for values in the inclusive range 0xD800..0xDBFF.
149simdutf_really_inline
constexpr bool high_surrogate(
char16_t c) {
150 return (0xd800 <= c && c <= 0xdbff);
// low_surrogate: range test on `c` as given (no byte swap is applied here);
// true for values in the inclusive range 0xDC00..0xDFFF.
153simdutf_really_inline
constexpr bool low_surrogate(
char16_t c) {
154 return (0xdc00 <= c && c <= 0xdfff);
// utf8_length_from_utf16_with_replacement: scans `len` UTF-16 code units at
// `p`, tracking in `any_surrogates` whether any surrogate unit was seen, and
// returns a `result` whose error_code is SURROGATE when the flag is set and
// SUCCESS otherwise.
// NOTE(review): the counter declaration, the per-branch counter updates and
// the second member of the returned result are elided from this excerpt;
// presumably the count is the UTF-8 length with unpaired surrogates sized as
// a replacement character - confirm.
157template <endianness big_endian>
158simdutf_constexpr23 result
159utf8_length_from_utf16_with_replacement(
const char16_t *p,
size_t len) {
160 bool any_surrogates =
false;
163 for (
size_t i = 0; i < len; i++) {
// Branch 1: current unit is a high surrogate.
164 if (is_high_surrogate<big_endian>(p[i])) {
165 any_surrogates =
true;
// Bounds-checked look-ahead for the matching low surrogate.
167 if (i + 1 < len && is_low_surrogate<big_endian>(p[i + 1])) {
174 }
// Branch 2: a low surrogate that was not consumed by the look-ahead above.
else if (is_low_surrogate<big_endian>(p[i])) {
175 any_surrogates =
true;
// Unit value, byte-swapped when big_endian does not match the system.
179 char16_t word = !match_system(big_endian) ? u16_swap_bytes(p[i]) : p[i];
// Conditional increments keyed on the 0x7F and 0x7FF thresholds (the start of
// the first statement is not visible here).
182 static_cast<size_t>(word > 0x7F);
183 counter +=
static_cast<size_t>(word > 0x7FF);
// First member of the result: SURROGATE if any surrogate unit was seen.
185 return {any_surrogates ? error_code::SURROGATE : error_code::SUCCESS,
// replacement: returns the value 0xfffd as a char16_t, passed through
// scalar::u16_swap_bytes when big_endian does not match the system.
190template <endianness big_endian>
constexpr char16_t replacement() {
191 return !match_system(big_endian) ? scalar::u16_swap_bytes(0xfffd) : 0xfffd;
// to_well_formed_utf16: walks `len` code units of `input`, writing to
// `output` and substituting utf16::replacement<big_endian>() at positions
// where the visible checks detect an unpaired surrogate.
// NOTE(review): the `output` parameter, the declaration of `i` and the
// else/brace structure around the assignments are elided from this excerpt,
// so the exact nesting of the writes below should be confirmed.
194template <endianness big_endian>
195simdutf_constexpr23
void to_well_formed_utf16(
const char16_t *input,
size_t len,
// Replacement unit for the templated endianness.
197 const char16_t replacement = utf16::replacement<big_endian>();
// high_surrogate_prev carries the classification of the previous unit. The
// locals high_surrogate/low_surrogate share their names with the free
// functions high_surrogate()/low_surrogate() defined in this file.
198 bool high_surrogate_prev =
false, high_surrogate, low_surrogate;
200 for (; i < len; i++) {
201 char16_t c = input[i];
202 high_surrogate = is_high_surrogate<big_endian>(c);
203 low_surrogate = is_low_surrogate<big_endian>(c);
// Previous unit was a high surrogate not followed by a low one: overwrite
// the previously written slot.
204 if (high_surrogate_prev && !low_surrogate) {
205 output[i - 1] = replacement;
// Low surrogate without a preceding high surrogate: replace the current slot.
208 if (!high_surrogate_prev && low_surrogate) {
209 output[i] = replacement;
// Copy of the input unit to the same index of output.
211 output[i] = input[i];
213 high_surrogate_prev = high_surrogate;
// After the loop: the input ended on a high surrogate, so the last written
// slot is replaced.
217 if (high_surrogate_prev) {
218 output[i - 1] = replacement;