simdutf 9.0.0
Unicode at GB/s.
Loading...
Searching...
No Matches
utf16.h
1#ifndef SIMDUTF_UTF16_H
2#define SIMDUTF_UTF16_H
3
4namespace simdutf {
5namespace scalar {
6namespace utf16 {
7
8template <endianness big_endian>
9simdutf_warn_unused simdutf_constexpr23 bool
10validate_as_ascii(const char16_t *data, size_t len) noexcept {
11 for (size_t pos = 0; pos < len; pos++) {
12 char16_t word = scalar::utf16::swap_if_needed<big_endian>(data[pos]);
13 if (word >= 0x80) {
14 return false;
15 }
16 }
17 return true;
18}
19
20template <endianness big_endian>
21inline simdutf_warn_unused simdutf_constexpr23 bool
22validate(const char16_t *data, size_t len) noexcept {
23 uint64_t pos = 0;
24 while (pos < len) {
25 char16_t word = scalar::utf16::swap_if_needed<big_endian>(data[pos]);
26 if ((word & 0xF800) == 0xD800) {
27 if (pos + 1 >= len) {
28 return false;
29 }
30 char16_t diff = char16_t(word - 0xD800);
31 if (diff > 0x3FF) {
32 return false;
33 }
34 char16_t next_word = !match_system(big_endian)
35 ? u16_swap_bytes(data[pos + 1])
36 : data[pos + 1];
37 char16_t diff2 = char16_t(next_word - 0xDC00);
38 if (diff2 > 0x3FF) {
39 return false;
40 }
41 pos += 2;
42 } else {
43 pos++;
44 }
45 }
46 return true;
47}
48
49template <endianness big_endian>
50inline simdutf_warn_unused simdutf_constexpr23 result
51validate_with_errors(const char16_t *data, size_t len) noexcept {
52 size_t pos = 0;
53 while (pos < len) {
54 char16_t word = scalar::utf16::swap_if_needed<big_endian>(data[pos]);
55 if ((word & 0xF800) == 0xD800) {
56 if (pos + 1 >= len) {
57 return result(error_code::SURROGATE, pos);
58 }
59 char16_t diff = char16_t(word - 0xD800);
60 if (diff > 0x3FF) {
61 return result(error_code::SURROGATE, pos);
62 }
63 char16_t next_word = !match_system(big_endian)
64 ? u16_swap_bytes(data[pos + 1])
65 : data[pos + 1];
66 char16_t diff2 = uint16_t(next_word - 0xDC00);
67 if (diff2 > 0x3FF) {
68 return result(error_code::SURROGATE, pos);
69 }
70 pos += 2;
71 } else {
72 pos++;
73 }
74 }
75 return result(error_code::SUCCESS, pos);
76}
77
78template <endianness big_endian>
79simdutf_constexpr23 size_t count_code_points(const char16_t *p, size_t len) {
80 // We are not BOM aware.
81 size_t counter{0};
82 for (size_t i = 0; i < len; i++) {
83 char16_t word = scalar::utf16::swap_if_needed<big_endian>(p[i]);
84 counter += ((word & 0xFC00) != 0xDC00);
85 }
86 return counter;
87}
88
89template <endianness big_endian>
90simdutf_constexpr23 size_t utf8_length_from_utf16(const char16_t *p,
91 size_t len) {
92 // We are not BOM aware.
93 size_t counter{0};
94 for (size_t i = 0; i < len; i++) {
95 char16_t word = scalar::utf16::swap_if_needed<big_endian>(p[i]);
96 counter++; // ASCII
97 counter += static_cast<size_t>(
98 word >
99 0x7F); // non-ASCII is at least 2 bytes, surrogates are 2*2 == 4 bytes
100 counter += static_cast<size_t>((word > 0x7FF && word <= 0xD7FF) ||
101 (word >= 0xE000)); // three-byte
102 }
103 return counter;
104}
105
106template <endianness big_endian>
107simdutf_constexpr23 size_t utf32_length_from_utf16(const char16_t *p,
108 size_t len) {
109 // We are not BOM aware.
110 size_t counter{0};
111 for (size_t i = 0; i < len; i++) {
112 char16_t word = scalar::utf16::swap_if_needed<big_endian>(p[i]);
113 counter += ((word & 0xFC00) != 0xDC00);
114 }
115 return counter;
116}
117
118simdutf_really_inline simdutf_constexpr23 void
119change_endianness_utf16(const char16_t *input, size_t size, char16_t *output) {
120 for (size_t i = 0; i < size; i++) {
121 *output++ = char16_t(input[i] >> 8 | input[i] << 8);
122 }
123}
124
125template <endianness big_endian>
126simdutf_warn_unused simdutf_constexpr23 size_t
127trim_partial_utf16(const char16_t *input, size_t length) {
128 if (length == 0) {
129 return 0;
130 }
131 uint16_t last_word = uint16_t(input[length - 1]);
132 last_word = scalar::utf16::swap_if_needed<big_endian>(last_word);
133 length -= ((last_word & 0xFC00) == 0xD800);
134 return length;
135}
136
137template <endianness big_endian> constexpr bool is_high_surrogate(char16_t c) {
138 c = scalar::utf16::swap_if_needed<big_endian>(c);
139 return (0xd800 <= c && c <= 0xdbff);
140}
141
142template <endianness big_endian> constexpr bool is_low_surrogate(char16_t c) {
143 c = scalar::utf16::swap_if_needed<big_endian>(c);
144 return (0xdc00 <= c && c <= 0xdfff);
145}
146
147simdutf_unused simdutf_really_inline constexpr bool high_surrogate(char16_t c) {
148 return (0xd800 <= c && c <= 0xdbff);
149}
150
151template <endianness big_endian>
152simdutf_constexpr23 result
153utf8_length_from_utf16_with_replacement(const char16_t *p, size_t len) {
154 bool any_surrogates = false;
155 // We are not BOM aware.
156 size_t counter{0};
157 for (size_t i = 0; i < len; i++) {
158 if (is_high_surrogate<big_endian>(p[i])) {
159 any_surrogates = true;
160 // surrogate pair
161 if (i + 1 < len && is_low_surrogate<big_endian>(p[i + 1])) {
162 counter += 4;
163 i++; // skip low surrogate
164 } else {
165 counter += 3; // unpaired high surrogate replaced by U+FFFD
166 }
167 continue;
168 } else if (is_low_surrogate<big_endian>(p[i])) {
169 any_surrogates = true;
170 counter += 3; // unpaired low surrogate replaced by U+FFFD
171 continue;
172 }
173 char16_t word = !match_system(big_endian) ? u16_swap_bytes(p[i]) : p[i];
174 counter++; // at least 1 byte
175 counter +=
176 static_cast<size_t>(word > 0x7F); // non-ASCII is at least 2 bytes
177 counter += static_cast<size_t>(word > 0x7FF); // three-byte
178 }
179 return {any_surrogates ? error_code::SURROGATE : error_code::SUCCESS,
180 counter};
181}
182
183// variable templates are a C++14 extension
184template <endianness big_endian> constexpr char16_t replacement() {
185 return !match_system(big_endian) ? scalar::u16_swap_bytes(0xfffd) : 0xfffd;
186}
187
188template <endianness big_endian>
189simdutf_constexpr23 void to_well_formed_utf16(const char16_t *input, size_t len,
190 char16_t *output) {
191 const char16_t replacement = utf16::replacement<big_endian>();
192 bool high_surrogate_prev = false, high_surrogate, low_surrogate;
193 size_t i = 0;
194 for (; i < len; i++) {
195 char16_t c = input[i];
196 high_surrogate = is_high_surrogate<big_endian>(c);
197 low_surrogate = is_low_surrogate<big_endian>(c);
198 if (high_surrogate_prev && !low_surrogate) {
199 output[i - 1] = replacement;
200 }
201
202 if (!high_surrogate_prev && low_surrogate) {
203 output[i] = replacement;
204 } else {
205 output[i] = input[i];
206 }
207 high_surrogate_prev = high_surrogate;
208 }
209
210 /* string may not end with high surrogate */
211 if (high_surrogate_prev) {
212 output[i - 1] = replacement;
213 }
214}
215
216} // namespace utf16
217} // namespace scalar
218} // namespace simdutf
219
220#endif