simdutf 8.0.0
Unicode at GB/s.
Loading...
Searching...
No Matches
valid_utf32_to_utf8.h
#ifndef SIMDUTF_VALID_UTF32_TO_UTF8_H
#define SIMDUTF_VALID_UTF32_TO_UTF8_H

namespace simdutf {
namespace scalar {
namespace {
namespace utf32_to_utf8 {

// Transcodes `len` UTF-32 code units starting at `data` into UTF-8 bytes
// written through `utf8_output`, and returns the number of bytes written.
//
// The input is NOT validated: the byte count of each output sequence is chosen
// purely from the magnitude of the code unit (< 0x80 -> 1 byte, < 0x800 -> 2,
// < 0x10000 -> 3, everything else -> 4). Surrogates and out-of-range values
// are therefore encoded as-is rather than rejected.
//
// No bounds checking is done on the output either; each input code unit can
// produce up to 4 bytes, so the destination must have room for that.
template <typename InputPtr, typename OutputPtr>
#if SIMDUTF_CPLUSPLUS20
  requires(simdutf::detail::indexes_into_utf32<InputPtr> &&
           simdutf::detail::index_assignable_from_char<OutputPtr>)
#endif
simdutf_constexpr23 size_t convert_valid(InputPtr data, size_t len,
                                         OutputPtr utf8_output) {
  size_t pos = 0;
  auto start = utf8_output;
  while (pos < len) {
    // ASCII fast path. It relies on memcpy, so it is skipped during constant
    // evaluation when built as C++23.
#if SIMDUTF_CPLUSPLUS23
    if !consteval
#endif
    {
      // `pos < len` holds here, so `len - pos` cannot wrap around.
      if (len - pos >= 2) {
        // Load two adjacent code units (8 bytes) and test both at once. The
        // mask is the same for either 32-bit half, so the result does not
        // depend on which half holds which code unit.
        uint64_t v;
        ::memcpy(&v, data + pos, sizeof(v));
        if ((v & 0xFFFFFF80FFFFFF80) == 0) {
          // Both code units are ASCII: each one maps to a single byte.
          *utf8_output++ = static_cast<char>(data[pos]);
          *utf8_output++ = static_cast<char>(data[pos + 1]);
          pos += 2;
          continue;
        }
      }
    }

    // General path: encode exactly one code unit per iteration.
    uint32_t word = data[pos];
    if ((word & 0xFFFFFF80) == 0) {
      // One UTF-8 byte: 0b0XXXXXXX
      *utf8_output++ = static_cast<char>(word);
    } else if ((word & 0xFFFFF800) == 0) {
      // Two UTF-8 bytes: 0b110XXXXX 0b10XXXXXX
      *utf8_output++ = static_cast<char>((word >> 6) | 0b11000000);
      *utf8_output++ = static_cast<char>((word & 0b111111) | 0b10000000);
    } else if ((word & 0xFFFF0000) == 0) {
      // Three UTF-8 bytes: 0b1110XXXX 0b10XXXXXX 0b10XXXXXX
      *utf8_output++ = static_cast<char>((word >> 12) | 0b11100000);
      *utf8_output++ = static_cast<char>(((word >> 6) & 0b111111) | 0b10000000);
      *utf8_output++ = static_cast<char>((word & 0b111111) | 0b10000000);
    } else {
      // Four UTF-8 bytes: 0b11110XXX 0b10XXXXXX 0b10XXXXXX 0b10XXXXXX
      *utf8_output++ = static_cast<char>((word >> 18) | 0b11110000);
      *utf8_output++ =
          static_cast<char>(((word >> 12) & 0b111111) | 0b10000000);
      *utf8_output++ = static_cast<char>(((word >> 6) & 0b111111) | 0b10000000);
      *utf8_output++ = static_cast<char>((word & 0b111111) | 0b10000000);
    }
    pos++;
  }
  // Distance between the advanced output position and where we started.
  return static_cast<size_t>(utf8_output - start);
}

} // namespace utf32_to_utf8
} // unnamed namespace
} // namespace scalar
} // namespace simdutf

#endif // SIMDUTF_VALID_UTF32_TO_UTF8_H