simdutf 8.0.0
Unicode at GB/s.
Loading...
Searching...
No Matches
utf16.h
1#ifndef SIMDUTF_UTF16_H
2#define SIMDUTF_UTF16_H
3
4namespace simdutf {
5namespace scalar {
6namespace utf16 {
7
8template <endianness big_endian>
9simdutf_warn_unused simdutf_constexpr23 bool
10validate_as_ascii(const char16_t *data, size_t len) noexcept {
11 for (size_t pos = 0; pos < len; pos++) {
12 char16_t word = scalar::utf16::swap_if_needed<big_endian>(data[pos]);
13 if (word >= 0x80) {
14 return false;
15 }
16 }
17 return true;
18}
19
20template <endianness big_endian>
21inline simdutf_warn_unused simdutf_constexpr23 bool
22validate(const char16_t *data, size_t len) noexcept {
23 uint64_t pos = 0;
24 while (pos < len) {
25 char16_t word = scalar::utf16::swap_if_needed<big_endian>(data[pos]);
26 if ((word & 0xF800) == 0xD800) {
27 if (pos + 1 >= len) {
28 return false;
29 }
30 char16_t diff = char16_t(word - 0xD800);
31 if (diff > 0x3FF) {
32 return false;
33 }
34 char16_t next_word = !match_system(big_endian)
35 ? u16_swap_bytes(data[pos + 1])
36 : data[pos + 1];
37 char16_t diff2 = char16_t(next_word - 0xDC00);
38 if (diff2 > 0x3FF) {
39 return false;
40 }
41 pos += 2;
42 } else {
43 pos++;
44 }
45 }
46 return true;
47}
48
49template <endianness big_endian>
50inline simdutf_warn_unused simdutf_constexpr23 result
51validate_with_errors(const char16_t *data, size_t len) noexcept {
52 size_t pos = 0;
53 while (pos < len) {
54 char16_t word = scalar::utf16::swap_if_needed<big_endian>(data[pos]);
55 if ((word & 0xF800) == 0xD800) {
56 if (pos + 1 >= len) {
57 return result(error_code::SURROGATE, pos);
58 }
59 char16_t diff = char16_t(word - 0xD800);
60 if (diff > 0x3FF) {
61 return result(error_code::SURROGATE, pos);
62 }
63 char16_t next_word = !match_system(big_endian)
64 ? u16_swap_bytes(data[pos + 1])
65 : data[pos + 1];
66 char16_t diff2 = uint16_t(next_word - 0xDC00);
67 if (diff2 > 0x3FF) {
68 return result(error_code::SURROGATE, pos);
69 }
70 pos += 2;
71 } else {
72 pos++;
73 }
74 }
75 return result(error_code::SUCCESS, pos);
76}
77
78template <endianness big_endian>
79simdutf_constexpr23 size_t count_code_points(const char16_t *p, size_t len) {
80 // We are not BOM aware.
81 size_t counter{0};
82 for (size_t i = 0; i < len; i++) {
83 char16_t word = scalar::utf16::swap_if_needed<big_endian>(p[i]);
84 counter += ((word & 0xFC00) != 0xDC00);
85 }
86 return counter;
87}
88
89template <endianness big_endian>
90simdutf_constexpr23 size_t utf8_length_from_utf16(const char16_t *p,
91 size_t len) {
92 // We are not BOM aware.
93 size_t counter{0};
94 for (size_t i = 0; i < len; i++) {
95 char16_t word = scalar::utf16::swap_if_needed<big_endian>(p[i]);
96 counter++; // ASCII
97 counter += static_cast<size_t>(
98 word >
99 0x7F); // non-ASCII is at least 2 bytes, surrogates are 2*2 == 4 bytes
100 counter += static_cast<size_t>((word > 0x7FF && word <= 0xD7FF) ||
101 (word >= 0xE000)); // three-byte
102 }
103 return counter;
104}
105
106template <endianness big_endian>
107simdutf_constexpr23 size_t utf32_length_from_utf16(const char16_t *p,
108 size_t len) {
109 // We are not BOM aware.
110 size_t counter{0};
111 for (size_t i = 0; i < len; i++) {
112 char16_t word = scalar::utf16::swap_if_needed<big_endian>(p[i]);
113 counter += ((word & 0xFC00) != 0xDC00);
114 }
115 return counter;
116}
117
118simdutf_really_inline simdutf_constexpr23 void
119change_endianness_utf16(const char16_t *input, size_t size, char16_t *output) {
120 for (size_t i = 0; i < size; i++) {
121 *output++ = char16_t(input[i] >> 8 | input[i] << 8);
122 }
123}
124
125template <endianness big_endian>
126simdutf_warn_unused simdutf_constexpr23 size_t
127trim_partial_utf16(const char16_t *input, size_t length) {
128 if (length == 0) {
129 return 0;
130 }
131 uint16_t last_word = uint16_t(input[length - 1]);
132 last_word = scalar::utf16::swap_if_needed<big_endian>(last_word);
133 length -= ((last_word & 0xFC00) == 0xD800);
134 return length;
135}
136
137template <endianness big_endian>
138simdutf_constexpr bool is_high_surrogate(char16_t c) {
139 c = scalar::utf16::swap_if_needed<big_endian>(c);
140 return (0xd800 <= c && c <= 0xdbff);
141}
142
143template <endianness big_endian>
144simdutf_constexpr bool is_low_surrogate(char16_t c) {
145 c = scalar::utf16::swap_if_needed<big_endian>(c);
146 return (0xdc00 <= c && c <= 0xdfff);
147}
148
149simdutf_really_inline constexpr bool high_surrogate(char16_t c) {
150 return (0xd800 <= c && c <= 0xdbff);
151}
152
153simdutf_really_inline constexpr bool low_surrogate(char16_t c) {
154 return (0xdc00 <= c && c <= 0xdfff);
155}
156
157template <endianness big_endian>
158simdutf_constexpr23 result
159utf8_length_from_utf16_with_replacement(const char16_t *p, size_t len) {
160 bool any_surrogates = false;
161 // We are not BOM aware.
162 size_t counter{0};
163 for (size_t i = 0; i < len; i++) {
164 if (is_high_surrogate<big_endian>(p[i])) {
165 any_surrogates = true;
166 // surrogate pair
167 if (i + 1 < len && is_low_surrogate<big_endian>(p[i + 1])) {
168 counter += 4;
169 i++; // skip low surrogate
170 } else {
171 counter += 3; // unpaired high surrogate replaced by U+FFFD
172 }
173 continue;
174 } else if (is_low_surrogate<big_endian>(p[i])) {
175 any_surrogates = true;
176 counter += 3; // unpaired low surrogate replaced by U+FFFD
177 continue;
178 }
179 char16_t word = !match_system(big_endian) ? u16_swap_bytes(p[i]) : p[i];
180 counter++; // at least 1 byte
181 counter +=
182 static_cast<size_t>(word > 0x7F); // non-ASCII is at least 2 bytes
183 counter += static_cast<size_t>(word > 0x7FF); // three-byte
184 }
185 return {any_surrogates ? error_code::SURROGATE : error_code::SUCCESS,
186 counter};
187}
188
189// variable templates are a C++14 extension
190template <endianness big_endian> constexpr char16_t replacement() {
191 return !match_system(big_endian) ? scalar::u16_swap_bytes(0xfffd) : 0xfffd;
192}
193
194template <endianness big_endian>
195simdutf_constexpr23 void to_well_formed_utf16(const char16_t *input, size_t len,
196 char16_t *output) {
197 const char16_t replacement = utf16::replacement<big_endian>();
198 bool high_surrogate_prev = false, high_surrogate, low_surrogate;
199 size_t i = 0;
200 for (; i < len; i++) {
201 char16_t c = input[i];
202 high_surrogate = is_high_surrogate<big_endian>(c);
203 low_surrogate = is_low_surrogate<big_endian>(c);
204 if (high_surrogate_prev && !low_surrogate) {
205 output[i - 1] = replacement;
206 }
207
208 if (!high_surrogate_prev && low_surrogate) {
209 output[i] = replacement;
210 } else {
211 output[i] = input[i];
212 }
213 high_surrogate_prev = high_surrogate;
214 }
215
216 /* string may not end with high surrogate */
217 if (high_surrogate_prev) {
218 output[i - 1] = replacement;
219 }
220}
221
222} // namespace utf16
223} // namespace scalar
224} // namespace simdutf
225
226#endif