simdutf 9.0.0
Unicode at GB/s.
Loading...
Searching...
No Matches
utf16.h
1#ifndef SIMDUTF_UTF16_H
2#define SIMDUTF_UTF16_H
3
4namespace simdutf {
5namespace scalar {
6namespace utf16 {
7
8template <endianness big_endian>
9simdutf_warn_unused simdutf_constexpr23 bool
10validate_as_ascii(const char16_t *data, size_t len) noexcept {
11 for (size_t pos = 0; pos < len; pos++) {
12 char16_t word = scalar::utf16::swap_if_needed<big_endian>(data[pos]);
13 if (word >= 0x80) {
14 return false;
15 }
16 }
17 return true;
18}
19
20template <endianness big_endian>
21inline simdutf_warn_unused simdutf_constexpr23 bool
22validate(const char16_t *data, size_t len) noexcept {
23 uint64_t pos = 0;
24 while (pos < len) {
25 char16_t word = scalar::utf16::swap_if_needed<big_endian>(data[pos]);
26 if ((word & 0xF800) == 0xD800) {
27 if (pos + 1 >= len) {
28 return false;
29 }
30 char16_t diff = char16_t(word - 0xD800);
31 if (diff > 0x3FF) {
32 return false;
33 }
34 char16_t next_word = !match_system(big_endian)
35 ? u16_swap_bytes(data[pos + 1])
36 : data[pos + 1];
37 char16_t diff2 = char16_t(next_word - 0xDC00);
38 if (diff2 > 0x3FF) {
39 return false;
40 }
41 pos += 2;
42 } else {
43 pos++;
44 }
45 }
46 return true;
47}
48
49template <endianness big_endian>
50inline simdutf_warn_unused simdutf_constexpr23 result
51validate_with_errors(const char16_t *data, size_t len) noexcept {
52 size_t pos = 0;
53 while (pos < len) {
54 char16_t word = scalar::utf16::swap_if_needed<big_endian>(data[pos]);
55 if ((word & 0xF800) == 0xD800) {
56 if (pos + 1 >= len) {
57 return result(error_code::SURROGATE, pos);
58 }
59 char16_t diff = char16_t(word - 0xD800);
60 if (diff > 0x3FF) {
61 return result(error_code::SURROGATE, pos);
62 }
63 char16_t next_word = !match_system(big_endian)
64 ? u16_swap_bytes(data[pos + 1])
65 : data[pos + 1];
66 char16_t diff2 = uint16_t(next_word - 0xDC00);
67 if (diff2 > 0x3FF) {
68 return result(error_code::SURROGATE, pos);
69 }
70 pos += 2;
71 } else {
72 pos++;
73 }
74 }
75 return result(error_code::SUCCESS, pos);
76}
77
78template <endianness big_endian>
79simdutf_constexpr23 size_t count_code_points(const char16_t *p, size_t len) {
80 // We are not BOM aware.
81 size_t counter{0};
82 for (size_t i = 0; i < len; i++) {
83 char16_t word = scalar::utf16::swap_if_needed<big_endian>(p[i]);
84 counter += ((word & 0xFC00) != 0xDC00);
85 }
86 return counter;
87}
88
89template <endianness big_endian>
90simdutf_constexpr23 size_t utf8_length_from_utf16(const char16_t *p,
91 size_t len) {
92 // We are not BOM aware.
93 size_t counter{0};
94 for (size_t i = 0; i < len; i++) {
95 char16_t word = scalar::utf16::swap_if_needed<big_endian>(p[i]);
96 counter++; // ASCII
97 counter += static_cast<size_t>(
98 word >
99 0x7F); // non-ASCII is at least 2 bytes, surrogates are 2*2 == 4 bytes
100 counter += static_cast<size_t>((word > 0x7FF && word <= 0xD7FF) ||
101 (word >= 0xE000)); // three-byte
102 }
103 return counter;
104}
105
106template <endianness big_endian>
107simdutf_constexpr23 size_t utf32_length_from_utf16(const char16_t *p,
108 size_t len) {
109 // We are not BOM aware.
110 size_t counter{0};
111 for (size_t i = 0; i < len; i++) {
112 char16_t word = scalar::utf16::swap_if_needed<big_endian>(p[i]);
113 counter += ((word & 0xFC00) != 0xDC00);
114 }
115 return counter;
116}
117
118simdutf_really_inline simdutf_constexpr23 void
119change_endianness_utf16(const char16_t *input, size_t size, char16_t *output) {
120 for (size_t i = 0; i < size; i++) {
121 *output++ = char16_t(input[i] >> 8 | input[i] << 8);
122 }
123}
124
125template <endianness big_endian>
126simdutf_warn_unused simdutf_constexpr23 size_t
127trim_partial_utf16(const char16_t *input, size_t length) {
128 if (length == 0) {
129 return 0;
130 }
131 uint16_t last_word = uint16_t(input[length - 1]);
132 last_word = scalar::utf16::swap_if_needed<big_endian>(last_word);
133 length -= ((last_word & 0xFC00) == 0xD800);
134 return length;
135}
136
137template <endianness big_endian> constexpr bool is_high_surrogate(char16_t c) {
138 c = scalar::utf16::swap_if_needed<big_endian>(c);
139 return (0xd800 <= c && c <= 0xdbff);
140}
141
142template <endianness big_endian> constexpr bool is_low_surrogate(char16_t c) {
143 c = scalar::utf16::swap_if_needed<big_endian>(c);
144 return (0xdc00 <= c && c <= 0xdfff);
145}
146
147simdutf_really_inline constexpr bool high_surrogate(char16_t c) {
148 return (0xd800 <= c && c <= 0xdbff);
149}
150
151simdutf_really_inline constexpr bool low_surrogate(char16_t c) {
152 return (0xdc00 <= c && c <= 0xdfff);
153}
154
155template <endianness big_endian>
156simdutf_constexpr23 result
157utf8_length_from_utf16_with_replacement(const char16_t *p, size_t len) {
158 bool any_surrogates = false;
159 // We are not BOM aware.
160 size_t counter{0};
161 for (size_t i = 0; i < len; i++) {
162 if (is_high_surrogate<big_endian>(p[i])) {
163 any_surrogates = true;
164 // surrogate pair
165 if (i + 1 < len && is_low_surrogate<big_endian>(p[i + 1])) {
166 counter += 4;
167 i++; // skip low surrogate
168 } else {
169 counter += 3; // unpaired high surrogate replaced by U+FFFD
170 }
171 continue;
172 } else if (is_low_surrogate<big_endian>(p[i])) {
173 any_surrogates = true;
174 counter += 3; // unpaired low surrogate replaced by U+FFFD
175 continue;
176 }
177 char16_t word = !match_system(big_endian) ? u16_swap_bytes(p[i]) : p[i];
178 counter++; // at least 1 byte
179 counter +=
180 static_cast<size_t>(word > 0x7F); // non-ASCII is at least 2 bytes
181 counter += static_cast<size_t>(word > 0x7FF); // three-byte
182 }
183 return {any_surrogates ? error_code::SURROGATE : error_code::SUCCESS,
184 counter};
185}
186
187// variable templates are a C++14 extension
188template <endianness big_endian> constexpr char16_t replacement() {
189 return !match_system(big_endian) ? scalar::u16_swap_bytes(0xfffd) : 0xfffd;
190}
191
192template <endianness big_endian>
193simdutf_constexpr23 void to_well_formed_utf16(const char16_t *input, size_t len,
194 char16_t *output) {
195 const char16_t replacement = utf16::replacement<big_endian>();
196 bool high_surrogate_prev = false, high_surrogate, low_surrogate;
197 size_t i = 0;
198 for (; i < len; i++) {
199 char16_t c = input[i];
200 high_surrogate = is_high_surrogate<big_endian>(c);
201 low_surrogate = is_low_surrogate<big_endian>(c);
202 if (high_surrogate_prev && !low_surrogate) {
203 output[i - 1] = replacement;
204 }
205
206 if (!high_surrogate_prev && low_surrogate) {
207 output[i] = replacement;
208 } else {
209 output[i] = input[i];
210 }
211 high_surrogate_prev = high_surrogate;
212 }
213
214 /* string may not end with high surrogate */
215 if (high_surrogate_prev) {
216 output[i - 1] = replacement;
217 }
218}
219
220} // namespace utf16
221} // namespace scalar
222} // namespace simdutf
223
224#endif