1#ifndef SIMDUTF_LATIN1_TO_UTF8_H
2#define SIMDUTF_LATIN1_TO_UTF8_H
7namespace latin1_to_utf8 {
// Transcodes `len` Latin-1 bytes read through `data` into UTF-8 written
// through `utf8_output`.
//
// InputPtr / OutputPtr are generic; when SIMDUTF_CPLUSPLUS20 is set they are
// constrained by simdutf::detail::indexes_into_byte_like and
// simdutf::detail::index_assignable_from_char respectively.
//
// NOTE(review): the declarations of `pos`, `utf8_pos`, `v`, `v1`, `v2`, the
// enclosing loop and the return statement are not visible in this view --
// presumably the function returns the number of UTF-8 bytes written; confirm.
9template <
typename InputPtr,
typename OutputPtr>
10#if SIMDUTF_CPLUSPLUS20
11 requires(simdutf::detail::indexes_into_byte_like<InputPtr> &&
12 simdutf::detail::index_assignable_from_char<OutputPtr>)
14simdutf_constexpr23
size_t convert(InputPtr data,
size_t len,
15 OutputPtr utf8_output) {
21#if SIMDUTF_CPLUSPLUS23
// Block fast path: only attempted when 16 more input bytes can be read.
26 if (pos + 16 <= len) {
// Copy the next 16 input bytes into v1 and v2, 8 bytes each.
29 ::memcpy(&v1, data + pos,
sizeof(uint64_t));
31 ::memcpy(&v2, data + pos +
sizeof(uint64_t),
sizeof(uint64_t));
// 0x80 in every byte lane selects the high bit of each of the 8 bytes of v.
// NOTE(review): how v is formed and what the mask result is compared with
// are not visible here -- presumably v combines v1 and v2 and this tests
// for an all-ASCII block; confirm.
35 if ((v & 0x8080808080808080) ==
// Emit the 16-byte block unchanged, one output byte per input byte.
38 size_t final_pos = pos + 16;
39 while (pos < final_pos) {
40 utf8_output[utf8_pos++] = char(data[pos]);
// Per-byte path: classify a single input byte.
48 unsigned char byte = data[pos];
// High bit clear: the byte is written out as-is (one output byte).
49 if ((
byte & 0x80) == 0) {
51 utf8_output[utf8_pos++] = char(
byte);
// Two-byte form: lead byte 0b110xxxxx carries the top two bits of `byte`,
// continuation byte 0b10xxxxxx carries its low six bits.
55 utf8_output[utf8_pos++] = char((
byte >> 6) | 0b11000000);
56 utf8_output[utf8_pos++] = char((
byte & 0b111111) | 0b10000000);
// Overload taking the input as `const char *`: forwards to convert() with
// `buf` reinterpreted as a pointer to unsigned bytes and the same `len`.
// NOTE(review): the remaining parameter(s) and forwarded argument(s) are not
// visible in this view.
63simdutf_really_inline
size_t convert(
const char *buf,
size_t len,
65 return convert(
reinterpret_cast<const unsigned char *
>(buf), len,
// Output-bounded Latin-1 -> UTF-8 conversion: reads from `buf` (viewed as
// unsigned bytes through `data`) and writes into `utf8_output`, with every
// write checked against `utf8_len`.
// NOTE(review): the rest of the parameter list (presumably `utf8_len`), the
// declarations of `pos`, `utf8_pos`, `skip_pos`, `v`, `v1`, `v2` and the
// return statement are not visible in this view; confirm.
69inline size_t convert_safe(
const char *buf,
size_t len,
char *utf8_output,
71 const unsigned char *data =
reinterpret_cast<const unsigned char *
>(buf);
// Continue only while input remains AND at least one output byte is free.
75 while (pos < len && utf8_pos < utf8_len) {
// Block fast path needs 16 readable input bytes and 16 free output bytes,
// and is gated on pos >= skip_pos.
// NOTE(review): how skip_pos is set is not visible here.
77 if (pos >= skip_pos && pos + 16 <= len &&
78 utf8_pos + 16 <= utf8_len) {
// Copy the next 16 input bytes into v1 and v2, 8 bytes each.
81 ::memcpy(&v1, data + pos,
sizeof(uint64_t));
83 ::memcpy(&v2, data + pos +
sizeof(uint64_t),
sizeof(uint64_t));
// 0x80 in every byte lane selects the high bit of each byte of v.
// NOTE(review): the compared-against value is not visible -- presumably an
// all-ASCII test; confirm.
87 if ((v & 0x8080808080808080) ==
// Bulk-copy the 16 input bytes straight to the output.
90 ::memcpy(utf8_output + utf8_pos, buf + pos, 16);
// Per-byte path.
99 const auto byte = data[pos];
// High bit clear: one output byte, identical to the input byte.
100 if ((
byte & 0x80) == 0) {
102 utf8_output[utf8_pos++] = char(
byte);
104 }
else if (utf8_pos + 2 <= utf8_len) {
// Taken only when two output bytes still fit: lead byte 0b110xxxxx holds
// the top two bits, continuation byte 0b10xxxxxx holds the low six bits.
// NOTE(review): the handling when only one output byte remains is not
// visible in this view.
106 utf8_output[utf8_pos++] = char((
byte >> 6) | 0b11000000);
107 utf8_output[utf8_pos++] = char((
byte & 0b111111) | 0b10000000);
// Generic, output-bounded Latin-1 -> UTF-8 conversion. The visible body works
// one input byte at a time and uses no memcpy.
//
// When SIMDUTF_CPLUSPLUS20 is set, InputPtr / OutputPtr are constrained by
// simdutf::detail::indexes_into_byte_like and
// simdutf::detail::index_assignable_from_char respectively.
//
// NOTE(review): the rest of the parameter list (presumably `utf8_len`), the
// declarations of `pos` / `utf8_pos`, the loop tail and the return statement
// are not visible in this view; confirm.
117template <
typename InputPtr,
typename OutputPtr>
118#if SIMDUTF_CPLUSPLUS20
119 requires(simdutf::detail::indexes_into_byte_like<InputPtr> &&
120 simdutf::detail::index_assignable_from_char<OutputPtr>)
122simdutf_constexpr23
size_t convert_safe_constexpr(InputPtr data,
size_t len,
123 OutputPtr utf8_output,
// Continue only while input remains AND at least one output byte is free.
127 while (pos < len && utf8_pos < utf8_len) {
128 const unsigned char byte = data[pos];
// High bit clear: one output byte, identical to the input byte.
129 if ((
byte & 0x80) == 0) {
131 utf8_output[utf8_pos++] = char(
byte);
133 }
else if (utf8_pos + 2 <= utf8_len) {
// Taken only when two output bytes still fit: lead byte 0b110xxxxx holds
// the top two bits, continuation byte 0b10xxxxxx holds the low six bits.
// NOTE(review): the handling when only one output byte remains is not
// visible in this view.
135 utf8_output[utf8_pos++] = char((
byte >> 6) | 0b11000000);
136 utf8_output[utf8_pos++] = char((
byte & 0b111111) | 0b10000000);
145template <
typename InputPtr>
146#if SIMDUTF_CPLUSPLUS20
147 requires simdutf::detail::indexes_into_byte_like<InputPtr>
149simdutf_constexpr23 simdutf_warn_unused
size_t
150utf8_length_from_latin1(InputPtr input,
size_t length)
noexcept {
151 size_t answer = length;
154#if SIMDUTF_CPLUSPLUS23
158 auto pop = [](uint64_t v) {
159 return (
size_t)(((v >> 7) & UINT64_C(0x0101010101010101)) *
160 UINT64_C(0x0101010101010101) >>
163 for (; i + 32 <= length; i += 32) {
165 memcpy(&v, input + i, 8);
167 memcpy(&v, input + i + 8,
sizeof(v));
169 memcpy(&v, input + i + 16,
sizeof(v));
171 memcpy(&v, input + i + 24,
sizeof(v));
174 for (; i + 8 <= length; i += 8) {
176 memcpy(&v, input + i,
sizeof(v));
180 for (; i + 1 <= length; i += 1) {
181 answer +=
static_cast<uint8_t
>(input[i]) >> 7;