simdutf 8.0.0
Unicode at GB/s.
Loading...
Searching...
No Matches
latin1_to_utf8.h
1#ifndef SIMDUTF_LATIN1_TO_UTF8_H
2#define SIMDUTF_LATIN1_TO_UTF8_H
3
4namespace simdutf {
5namespace scalar {
6namespace {
7namespace latin1_to_utf8 {
8
9template <typename InputPtr, typename OutputPtr>
10#if SIMDUTF_CPLUSPLUS20
11 requires(simdutf::detail::indexes_into_byte_like<InputPtr> &&
12 simdutf::detail::index_assignable_from_char<OutputPtr>)
13#endif
14simdutf_constexpr23 size_t convert(InputPtr data, size_t len,
15 OutputPtr utf8_output) {
16 // const unsigned char *data = reinterpret_cast<const unsigned char *>(buf);
17 size_t pos = 0;
18 size_t utf8_pos = 0;
19
20 while (pos < len) {
21#if SIMDUTF_CPLUSPLUS23
22 if !consteval
23#endif
24 {
25 // try to convert the next block of 16 ASCII bytes
26 if (pos + 16 <= len) { // if it is safe to read 16 more bytes, check that
27 // they are ascii
28 uint64_t v1;
29 ::memcpy(&v1, data + pos, sizeof(uint64_t));
30 uint64_t v2;
31 ::memcpy(&v2, data + pos + sizeof(uint64_t), sizeof(uint64_t));
32 uint64_t v{v1 |
33 v2}; // We are only interested in these bits: 1000 1000 1000
34 // 1000, so it makes sense to concatenate everything
35 if ((v & 0x8080808080808080) ==
36 0) { // if NONE of these are set, e.g. all of them are zero, then
37 // everything is ASCII
38 size_t final_pos = pos + 16;
39 while (pos < final_pos) {
40 utf8_output[utf8_pos++] = char(data[pos]);
41 pos++;
42 }
43 continue;
44 }
45 } // if (pos + 16 <= len)
46 } // !consteval scope
47
48 unsigned char byte = data[pos];
49 if ((byte & 0x80) == 0) { // if ASCII
50 // will generate one UTF-8 bytes
51 utf8_output[utf8_pos++] = char(byte);
52 pos++;
53 } else {
54 // will generate two UTF-8 bytes
55 utf8_output[utf8_pos++] = char((byte >> 6) | 0b11000000);
56 utf8_output[utf8_pos++] = char((byte & 0b111111) | 0b10000000);
57 pos++;
58 }
59 } // while
60 return utf8_pos;
61}
62
63simdutf_really_inline size_t convert(const char *buf, size_t len,
64 char *utf8_output) {
65 return convert(reinterpret_cast<const unsigned char *>(buf), len,
66 utf8_output);
67}
68
// Transcodes Latin-1 bytes from `buf` into UTF-8 in `utf8_output`, never
// writing more than `utf8_len` bytes.
//
// Conversion stops when the input is exhausted, when the output is full, or
// when the next character needs two bytes but fewer than two remain. Returns
// the number of UTF-8 bytes written.
inline size_t convert_safe(const char *buf, size_t len, char *utf8_output,
                           size_t utf8_len) {
  const auto *input = reinterpret_cast<const unsigned char *>(buf);
  size_t in_idx = 0;
  size_t out_idx = 0;
  // Input position before which the 16-byte fast path is not retried; it is
  // pushed forward whenever a block turns out to contain a non-ASCII byte.
  size_t block_retry_at = 0;

  while (in_idx < len && out_idx < utf8_len) {
    // The fast path needs 16 readable input bytes and 16 writable output
    // bytes.
    const bool can_try_block = in_idx >= block_retry_at &&
                               in_idx + 16 <= len && out_idx + 16 <= utf8_len;
    if (can_try_block) {
      uint64_t halves[2];
      ::memcpy(halves, input + in_idx, sizeof(halves));
      // OR-ing both words folds the 16 high bits into 8; any set bit means
      // the block holds at least one non-ASCII byte.
      if (((halves[0] | halves[1]) & 0x8080808080808080) != 0) {
        // Handle these 16 bytes one at a time before trying a block again.
        block_retry_at = in_idx + 16;
        continue;
      }
      // Pure ASCII block: copy it verbatim.
      ::memcpy(utf8_output + out_idx, buf + in_idx, 16);
      out_idx += 16;
      in_idx += 16;
      continue;
    }

    const unsigned char current = input[in_idx];
    if (current < 0x80) {
      // ASCII: one UTF-8 byte (the loop condition guarantees room for it).
      utf8_output[out_idx++] = char(current);
    } else if (out_idx + 2 <= utf8_len) {
      // 0x80..0xFF: lead byte 110000xx followed by continuation 10xxxxxx.
      utf8_output[out_idx++] = char(0xC0 | (current >> 6));
      utf8_output[out_idx++] = char(0x80 | (current & 0x3F));
    } else {
      // A two-byte sequence no longer fits; stop without consuming the byte.
      break;
    }
    ++in_idx;
  }
  return out_idx;
}
116
117template <typename InputPtr, typename OutputPtr>
118#if SIMDUTF_CPLUSPLUS20
119 requires(simdutf::detail::indexes_into_byte_like<InputPtr> &&
120 simdutf::detail::index_assignable_from_char<OutputPtr>)
121#endif
122simdutf_constexpr23 size_t convert_safe_constexpr(InputPtr data, size_t len,
123 OutputPtr utf8_output,
124 size_t utf8_len) {
125 size_t pos = 0;
126 size_t utf8_pos = 0;
127 while (pos < len && utf8_pos < utf8_len) {
128 const unsigned char byte = data[pos];
129 if ((byte & 0x80) == 0) { // if ASCII
130 // will generate one UTF-8 bytes
131 utf8_output[utf8_pos++] = char(byte);
132 pos++;
133 } else if (utf8_pos + 2 <= utf8_len) {
134 // will generate two UTF-8 bytes
135 utf8_output[utf8_pos++] = char((byte >> 6) | 0b11000000);
136 utf8_output[utf8_pos++] = char((byte & 0b111111) | 0b10000000);
137 pos++;
138 } else {
139 break;
140 }
141 }
142 return utf8_pos;
143}
144
145template <typename InputPtr>
146#if SIMDUTF_CPLUSPLUS20
147 requires simdutf::detail::indexes_into_byte_like<InputPtr>
148#endif
149simdutf_constexpr23 simdutf_warn_unused size_t
150utf8_length_from_latin1(InputPtr input, size_t length) noexcept {
151 size_t answer = length;
152 size_t i = 0;
153
154#if SIMDUTF_CPLUSPLUS23
155 if !consteval
156#endif
157 {
158 auto pop = [](uint64_t v) {
159 return (size_t)(((v >> 7) & UINT64_C(0x0101010101010101)) *
160 UINT64_C(0x0101010101010101) >>
161 56);
162 };
163 for (; i + 32 <= length; i += 32) {
164 uint64_t v;
165 memcpy(&v, input + i, 8);
166 answer += pop(v);
167 memcpy(&v, input + i + 8, sizeof(v));
168 answer += pop(v);
169 memcpy(&v, input + i + 16, sizeof(v));
170 answer += pop(v);
171 memcpy(&v, input + i + 24, sizeof(v));
172 answer += pop(v);
173 }
174 for (; i + 8 <= length; i += 8) {
175 uint64_t v;
176 memcpy(&v, input + i, sizeof(v));
177 answer += pop(v);
178 }
179 } // !consteval scope
180 for (; i + 1 <= length; i += 1) {
181 answer += static_cast<uint8_t>(input[i]) >> 7;
182 }
183 return answer;
184}
185
186} // namespace latin1_to_utf8
187} // unnamed namespace
188} // namespace scalar
189} // namespace simdutf
190
191#endif