simdutf 8.0.0
Unicode at GB/s.
Loading...
Searching...
No Matches
utf8_to_latin1.h
1#ifndef SIMDUTF_UTF8_TO_LATIN1_H
2#define SIMDUTF_UTF8_TO_LATIN1_H
3
4namespace simdutf {
5namespace scalar {
6namespace {
7namespace utf8_to_latin1 {
8
9template <typename InputPtr, typename OutputPtr>
10#if SIMDUTF_CPLUSPLUS20
11 requires(simdutf::detail::indexes_into_byte_like<InputPtr> &&
12 simdutf::detail::indexes_into_byte_like<OutputPtr>)
13#endif
14simdutf_constexpr23 size_t convert(InputPtr data, size_t len,
15 OutputPtr latin_output) {
16 size_t pos = 0;
17 auto start = latin_output;
18
19 while (pos < len) {
20#if SIMDUTF_CPLUSPLUS23
21 if !consteval
22#endif
23 {
24 // try to convert the next block of 16 ASCII bytes
25 if (pos + 16 <= len) { // if it is safe to read 16 more bytes, check that
26 // they are ascii
27 uint64_t v1;
28 ::memcpy(&v1, data + pos, sizeof(uint64_t));
29 uint64_t v2;
30 ::memcpy(&v2, data + pos + sizeof(uint64_t), sizeof(uint64_t));
31 uint64_t v{v1 | v2}; // We are only interested in these bits: 1000 1000
32 // 1000 1000 .... etc
33 if ((v & 0x8080808080808080) ==
34 0) { // if NONE of these are set, e.g. all of them are zero, then
35 // everything is ASCII
36 size_t final_pos = pos + 16;
37 while (pos < final_pos) {
38 *latin_output++ = char(data[pos]);
39 pos++;
40 }
41 continue;
42 }
43 }
44 }
45
46 // suppose it is not an all ASCII byte sequence
47 uint8_t leading_byte = data[pos]; // leading byte
48 if (leading_byte < 0b10000000) {
49 // converting one ASCII byte !!!
50 *latin_output++ = char(leading_byte);
51 pos++;
52 } else if ((leading_byte & 0b11100000) ==
53 0b11000000) { // the first three bits indicate:
54 // We have a two-byte UTF-8
55 if (pos + 1 >= len) {
56 return 0;
57 } // minimal bound checking
58 if ((data[pos + 1] & 0b11000000) != 0b10000000) {
59 return 0;
60 } // checks if the next byte is a valid continuation byte in UTF-8. A
61 // valid continuation byte starts with 10.
62 // range check -
63 uint32_t code_point =
64 (leading_byte & 0b00011111) << 6 |
65 (data[pos + 1] &
66 0b00111111); // assembles the Unicode code point from the two bytes.
67 // It does this by discarding the leading 110 and 10
68 // bits from the two bytes, shifting the remaining bits
69 // of the first byte, and then combining the results
70 // with a bitwise OR operation.
71 if (code_point < 0x80 || 0xFF < code_point) {
72 return 0; // We only care about the range 129-255 which is Non-ASCII
73 // latin1 characters. A code_point beneath 0x80 is invalid as
74 // it is already covered by bytes whose leading bit is zero.
75 }
76 *latin_output++ = char(code_point);
77 pos += 2;
78 } else {
79 return 0;
80 }
81 }
82 return latin_output - start;
83}
84
85template <typename InputPtr>
86#if SIMDUTF_CPLUSPLUS20
87 requires simdutf::detail::indexes_into_byte_like<InputPtr>
88#endif
89simdutf_constexpr23 result convert_with_errors(InputPtr data, size_t len,
90 char *latin_output) {
91 size_t pos = 0;
92 char *start{latin_output};
93
94 while (pos < len) {
95#if SIMDUTF_CPLUSPLUS23
96 if !consteval
97#endif
98 {
99 // try to convert the next block of 16 ASCII bytes
100 if (pos + 16 <= len) { // if it is safe to read 16 more bytes, check that
101 // they are ascii
102 uint64_t v1;
103 ::memcpy(&v1, data + pos, sizeof(uint64_t));
104 uint64_t v2;
105 ::memcpy(&v2, data + pos + sizeof(uint64_t), sizeof(uint64_t));
106 uint64_t v{v1 | v2}; // We are only interested in these bits: 1000 1000
107 // 1000 1000...etc
108 if ((v & 0x8080808080808080) ==
109 0) { // if NONE of these are set, e.g. all of them are zero, then
110 // everything is ASCII
111 size_t final_pos = pos + 16;
112 while (pos < final_pos) {
113 *latin_output++ = char(data[pos]);
114 pos++;
115 }
116 continue;
117 }
118 }
119 }
120 // suppose it is not an all ASCII byte sequence
121 uint8_t leading_byte = data[pos]; // leading byte
122 if (leading_byte < 0b10000000) {
123 // converting one ASCII byte !!!
124 *latin_output++ = char(leading_byte);
125 pos++;
126 } else if ((leading_byte & 0b11100000) ==
127 0b11000000) { // the first three bits indicate:
128 // We have a two-byte UTF-8
129 if (pos + 1 >= len) {
130 return result(error_code::TOO_SHORT, pos);
131 } // minimal bound checking
132 if ((data[pos + 1] & 0b11000000) != 0b10000000) {
133 return result(error_code::TOO_SHORT, pos);
134 } // checks if the next byte is a valid continuation byte in UTF-8. A
135 // valid continuation byte starts with 10.
136 // range check -
137 uint32_t code_point =
138 (leading_byte & 0b00011111) << 6 |
139 (data[pos + 1] &
140 0b00111111); // assembles the Unicode code point from the two bytes.
141 // It does this by discarding the leading 110 and 10
142 // bits from the two bytes, shifting the remaining bits
143 // of the first byte, and then combining the results
144 // with a bitwise OR operation.
145 if (code_point < 0x80) {
146 return result(error_code::OVERLONG, pos);
147 }
148 if (0xFF < code_point) {
149 return result(error_code::TOO_LARGE, pos);
150 } // We only care about the range 129-255 which is Non-ASCII latin1
151 // characters
152 *latin_output++ = char(code_point);
153 pos += 2;
154 } else if ((leading_byte & 0b11110000) == 0b11100000) {
155 // We have a three-byte UTF-8
156 return result(error_code::TOO_LARGE, pos);
157 } else if ((leading_byte & 0b11111000) == 0b11110000) { // 0b11110000
158 // we have a 4-byte UTF-8 word.
159 return result(error_code::TOO_LARGE, pos);
160 } else {
161 // we either have too many continuation bytes or an invalid leading byte
162 if ((leading_byte & 0b11000000) == 0b10000000) {
163 return result(error_code::TOO_LONG, pos);
164 }
165
166 return result(error_code::HEADER_BITS, pos);
167 }
168 }
169 return result(error_code::SUCCESS, latin_output - start);
170}
171
172inline result rewind_and_convert_with_errors(size_t prior_bytes,
173 const char *buf, size_t len,
174 char *latin1_output) {
175 size_t extra_len{0};
176 // We potentially need to go back in time and find a leading byte.
177 // In theory '3' would be sufficient, but sometimes the error can go back
178 // quite far.
179 size_t how_far_back = prior_bytes;
180 // size_t how_far_back = 3; // 3 bytes in the past + current position
181 // if(how_far_back >= prior_bytes) { how_far_back = prior_bytes; }
182 bool found_leading_bytes{false};
183 // important: it is i <= how_far_back and not 'i < how_far_back'.
184 for (size_t i = 0; i <= how_far_back; i++) {
185 unsigned char byte = buf[-static_cast<std::ptrdiff_t>(i)];
186 found_leading_bytes = ((byte & 0b11000000) != 0b10000000);
187 if (found_leading_bytes) {
188 if (i > 0 && byte < 128) {
189 // If we had to go back and the leading byte is ascii
190 // then we can stop right away.
191 return result(error_code::TOO_LONG, 0 - i + 1);
192 }
193 buf -= i;
194 extra_len = i;
195 break;
196 }
197 }
198 //
199 // It is possible for this function to return a negative count in its result.
200 // C++ Standard Section 18.1 defines size_t is in <cstddef> which is described
201 // in C Standard as <stddef.h>. C Standard Section 4.1.5 defines size_t as an
202 // unsigned integral type of the result of the sizeof operator
203 //
204 // An unsigned type will simply wrap round arithmetically (well defined).
205 //
206 if (!found_leading_bytes) {
207 // If how_far_back == 3, we may have four consecutive continuation bytes!!!
208 // [....] [continuation] [continuation] [continuation] | [buf is
209 // continuation] Or we possibly have a stream that does not start with a
210 // leading byte.
211 return result(error_code::TOO_LONG, 0 - how_far_back);
212 }
213 result res = convert_with_errors(buf, len + extra_len, latin1_output);
214 if (res.error) {
215 res.count -= extra_len;
216 }
217 return res;
218}
219
220} // namespace utf8_to_latin1
221} // unnamed namespace
222} // namespace scalar
223} // namespace simdutf
224
225#endif