1#ifndef SIMDUTF_UTF8_TO_LATIN1_H
2#define SIMDUTF_UTF8_TO_LATIN1_H
7namespace utf8_to_latin1 {
// Converts UTF-8 input to Latin-1 output and, on the visible return path,
// returns the number of output elements written (latin_output - start).
//
// NOTE(review): this excerpt is missing lines (the loop header, the
// declarations of `pos`, `v`, `v1` and `v2`, several branch bodies and the
// closing braces). The comments below describe only what is visible.
//
//   data          input, indexed byte by byte; constrained by
//                 indexes_into_byte_like when SIMDUTF_CPLUSPLUS20 is set
//   len           input length in bytes; used as the bound for `pos`
//   latin_output  output cursor, advanced once per element written
9template <
typename InputPtr,
typename OutputPtr>
10#if SIMDUTF_CPLUSPLUS20
11 requires(simdutf::detail::indexes_into_byte_like<InputPtr> &&
12 simdutf::detail::indexes_into_byte_like<OutputPtr>)
14simdutf_constexpr23
size_t convert(InputPtr data,
size_t len,
15 OutputPtr latin_output) {
// Remember where the output began so the written count can be computed.
17 auto start = latin_output;
20#if SIMDUTF_CPLUSPLUS23
// Fast path: only taken when at least 16 input bytes remain from `pos`.
25 if (pos + 16 <= len) {
// Load the next 16 bytes as two 64-bit words; memcpy avoids any alignment
// requirement on `data`.
28 ::memcpy(&v1, data + pos,
sizeof(uint64_t));
30 ::memcpy(&v2, data + pos +
sizeof(uint64_t),
sizeof(uint64_t));
// The mask keeps the top bit of each of the eight bytes in `v`; a set top
// bit means a byte >= 0x80 (non-ASCII). Presumably `v` combines v1 and v2 and
// the comparison is against zero -- neither is visible here; confirm.
33 if ((v & 0x8080808080808080) ==
// Copy the 16 bytes to the output unchanged.
36 size_t final_pos = pos + 16;
37 while (pos < final_pos) {
38 *latin_output++ = char(data[pos]);
// Slow path: decode one character starting at data[pos].
47 uint8_t leading_byte = data[pos];
48 if (leading_byte < 0b10000000) {
// ASCII byte (top bit clear): emitted as-is.
50 *latin_output++ = char(leading_byte);
52 }
// Tests the top three bits of the leading byte; the compared value is not
// visible (presumably the 110xxxxx two-byte lead pattern -- confirm).
else if ((leading_byte & 0b11100000) ==
// The second byte must have the 10xxxxxx continuation pattern; the action
// taken when it does not is not visible in this excerpt.
58 if ((data[pos + 1] & 0b11000000) != 0b10000000) {
// The low five bits of the leading byte become the high bits of the code
// point; the second operand of the `|` is not visible.
64 (leading_byte & 0b00011111) << 6 |
// Only code points in 0x80..0xFF pass this check; the action taken for
// values outside that range is not visible.
71 if (code_point < 0x80 || 0xFF < code_point) {
76 *latin_output++ = char(code_point);
// Number of output elements written.
82 return latin_output - start;
// Converts UTF-8 input to Latin-1 output and reports the outcome as a
// `result`. On success the result carries error_code::SUCCESS and the number
// of output elements written (latin_output - start). On failure it carries an
// error code and the input position `pos` at which the problem was detected.
//
// NOTE(review): this excerpt is missing lines (the rest of the parameter
// list after `len`, the loop header, the declarations of `pos`, `v`, `v1`
// and `v2`, and several closing braces). The comments below describe only
// what is visible.
//
//   data  input, indexed byte by byte; constrained by indexes_into_byte_like
//         when SIMDUTF_CPLUSPLUS20 is set
//   len   input length in bytes; used as the bound for `pos`
85template <
typename InputPtr>
86#if SIMDUTF_CPLUSPLUS20
87 requires simdutf::detail::indexes_into_byte_like<InputPtr>
89simdutf_constexpr23 result convert_with_errors(InputPtr data,
size_t len,
// Remember where the output began so the written count can be computed.
92 char *start{latin_output};
95#if SIMDUTF_CPLUSPLUS23
// Fast path: only taken when at least 16 input bytes remain from `pos`.
100 if (pos + 16 <= len) {
// Load the next 16 bytes as two 64-bit words; memcpy avoids any alignment
// requirement on `data`.
103 ::memcpy(&v1, data + pos,
sizeof(uint64_t));
105 ::memcpy(&v2, data + pos +
sizeof(uint64_t),
sizeof(uint64_t));
// The mask keeps the top bit of each of the eight bytes in `v`; a set top
// bit means a byte >= 0x80 (non-ASCII). Presumably `v` combines v1 and v2 and
// the comparison is against zero -- neither is visible here; confirm.
108 if ((v & 0x8080808080808080) ==
// Copy the 16 bytes to the output unchanged.
111 size_t final_pos = pos + 16;
112 while (pos < final_pos) {
113 *latin_output++ = char(data[pos]);
// Slow path: decode one character starting at data[pos].
121 uint8_t leading_byte = data[pos];
122 if (leading_byte < 0b10000000) {
// ASCII byte (top bit clear): emitted as-is.
124 *latin_output++ = char(leading_byte);
126 }
// Tests the top three bits of the leading byte; the compared value is not
// visible (presumably the 110xxxxx two-byte lead pattern -- confirm).
else if ((leading_byte & 0b11100000) ==
// The input ends before the second byte of the sequence.
129 if (pos + 1 >= len) {
130 return result(error_code::TOO_SHORT, pos);
// The second byte lacks the 10xxxxxx continuation pattern; this is reported
// with the same TOO_SHORT code as a truncated input.
132 if ((data[pos + 1] & 0b11000000) != 0b10000000) {
133 return result(error_code::TOO_SHORT, pos);
// The low five bits of the leading byte become the high bits of the code
// point; the second operand of the `|` is not visible.
137 uint32_t code_point =
138 (leading_byte & 0b00011111) << 6 |
// A two-byte sequence encoding a value below 0x80 is rejected as overlong.
145 if (code_point < 0x80) {
146 return result(error_code::OVERLONG, pos);
// Values above 0xFF are rejected as too large for the output.
148 if (0xFF < code_point) {
149 return result(error_code::TOO_LARGE, pos);
152 *latin_output++ = char(code_point);
154 }
// Leading byte of the form 1110xxxx: reported as TOO_LARGE.
else if ((leading_byte & 0b11110000) == 0b11100000) {
156 return result(error_code::TOO_LARGE, pos);
157 }
// Leading byte of the form 11110xxx: reported as TOO_LARGE.
else if ((leading_byte & 0b11111000) == 0b11110000) {
159 return result(error_code::TOO_LARGE, pos);
// A byte of the form 10xxxxxx (continuation pattern) found where a leading
// byte is expected: reported as TOO_LONG.
162 if ((leading_byte & 0b11000000) == 0b10000000) {
163 return result(error_code::TOO_LONG, pos);
// Any remaining leading-byte pattern is reported as HEADER_BITS.
166 return result(error_code::HEADER_BITS, pos);
// Success: the count is the number of output elements written.
169 return result(error_code::SUCCESS, latin_output - start);
// Scans backwards from `buf` (up to `prior_bytes` bytes) for a byte that is
// not a UTF-8 continuation byte, then calls convert_with_errors and adjusts
// the returned count by `extra_len`.
//
// NOTE(review): this excerpt is missing lines (the declaration and
// assignment of `extra_len`, any adjustment of `buf`, the loop exit, the
// final return and the closing braces), and the function continues past the
// end of the visible text. The comments below describe only what is visible.
//
//   prior_bytes    how many bytes before `buf` may be inspected
//   buf            current input position; read at negative offsets, so the
//                  caller must guarantee `prior_bytes` readable bytes before it
//   len            length passed on to convert_with_errors (plus extra_len)
//   latin1_output  output buffer passed on to convert_with_errors
172inline result rewind_and_convert_with_errors(
size_t prior_bytes,
173 const char *buf,
size_t len,
174 char *latin1_output) {
179 size_t how_far_back = prior_bytes;
182 bool found_leading_bytes{
false};
// The bound is inclusive (i <= how_far_back): offsets 0 through
// how_far_back are examined, i.e. buf[0] and the prior_bytes bytes before it.
184 for (
size_t i = 0; i <= how_far_back; i++) {
185 unsigned char byte = buf[-
static_cast<std::ptrdiff_t
>(i)];
// True when the byte does not have the 10xxxxxx continuation pattern.
186 found_leading_bytes = ((
byte & 0b11000000) != 0b10000000);
187 if (found_leading_bytes) {
// The first non-continuation byte found behind `buf` (i > 0) is ASCII.
188 if (i > 0 &&
byte < 128) {
// `0 - i + 1` is evaluated in unsigned size_t arithmetic, so it wraps around
// rather than going negative. Presumably the caller treats the count as an
// offset relative to `buf` -- confirm against the call site.
191 return result(error_code::TOO_LONG, 0 - i + 1);
// Every inspected byte was a continuation byte.
206 if (!found_leading_bytes) {
// As above, `0 - how_far_back` wraps around in size_t arithmetic.
211 return result(error_code::TOO_LONG, 0 - how_far_back);
// Convert with the length extended by `extra_len` (its definition is not
// visible; presumably the number of bytes rewound -- confirm).
213 result res = convert_with_errors(buf, len + extra_len, latin1_output);
// Remove the extra bytes from the reported count. Any condition guarding
// this statement is not visible in this excerpt.
215 res.count -= extra_len;