simdutf 8.0.0
Unicode at GB/s.
Loading...
Searching...
No Matches
utf32_to_utf8.h
1#ifndef SIMDUTF_UTF32_TO_UTF8_H
2#define SIMDUTF_UTF32_TO_UTF8_H
3
4namespace simdutf {
5namespace scalar {
6namespace {
7namespace utf32_to_utf8 {
8
9template <typename InputPtr, typename OutputPtr>
10#if SIMDUTF_CPLUSPLUS20
11 requires(simdutf::detail::indexes_into_utf32<InputPtr> &&
12 simdutf::detail::index_assignable_from_char<OutputPtr>)
13#endif
14simdutf_constexpr23 size_t convert(InputPtr data, size_t len,
15 OutputPtr utf8_output) {
16 size_t pos = 0;
17 auto start = utf8_output;
18 while (pos < len) {
19#if SIMDUTF_CPLUSPLUS23
20 if !consteval
21#endif
22 { // try to convert the next block of 2 ASCII characters
23 if (pos + 2 <= len) { // if it is safe to read 8 more bytes, check that
24 // they are ascii
25 uint64_t v;
26 ::memcpy(&v, data + pos, sizeof(uint64_t));
27 if ((v & 0xFFFFFF80FFFFFF80) == 0) {
28 *utf8_output++ = char(data[pos]);
29 *utf8_output++ = char(data[pos + 1]);
30 pos += 2;
31 continue;
32 }
33 }
34 }
35
36 uint32_t word = data[pos];
37 if ((word & 0xFFFFFF80) == 0) {
38 // will generate one UTF-8 bytes
39 *utf8_output++ = char(word);
40 pos++;
41 } else if ((word & 0xFFFFF800) == 0) {
42 // will generate two UTF-8 bytes
43 // we have 0b110XXXXX 0b10XXXXXX
44 *utf8_output++ = char((word >> 6) | 0b11000000);
45 *utf8_output++ = char((word & 0b111111) | 0b10000000);
46 pos++;
47 } else if ((word & 0xFFFF0000) == 0) {
48 // will generate three UTF-8 bytes
49 // we have 0b1110XXXX 0b10XXXXXX 0b10XXXXXX
50 if (word >= 0xD800 && word <= 0xDFFF) {
51 return 0;
52 }
53 *utf8_output++ = char((word >> 12) | 0b11100000);
54 *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
55 *utf8_output++ = char((word & 0b111111) | 0b10000000);
56 pos++;
57 } else {
58 // will generate four UTF-8 bytes
59 // we have 0b11110XXX 0b10XXXXXX 0b10XXXXXX 0b10XXXXXX
60 if (word > 0x10FFFF) {
61 return 0;
62 }
63 *utf8_output++ = char((word >> 18) | 0b11110000);
64 *utf8_output++ = char(((word >> 12) & 0b111111) | 0b10000000);
65 *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
66 *utf8_output++ = char((word & 0b111111) | 0b10000000);
67 pos++;
68 }
69 }
70 return utf8_output - start;
71}
72
73template <typename InputPtr, typename OutputPtr>
74#if SIMDUTF_CPLUSPLUS20
75 requires(simdutf::detail::indexes_into_utf32<InputPtr> &&
76 simdutf::detail::index_assignable_from_char<OutputPtr>)
77#endif
78simdutf_constexpr23 result convert_with_errors(InputPtr data, size_t len,
79 OutputPtr utf8_output) {
80 size_t pos = 0;
81 auto start = utf8_output;
82 while (pos < len) {
83#if SIMDUTF_CPLUSPLUS23
84 if !consteval
85#endif
86 { // try to convert the next block of 2 ASCII characters
87 if (pos + 2 <= len) { // if it is safe to read 8 more bytes, check that
88 // they are ascii
89 uint64_t v;
90 ::memcpy(&v, data + pos, sizeof(uint64_t));
91 if ((v & 0xFFFFFF80FFFFFF80) == 0) {
92 *utf8_output++ = char(data[pos]);
93 *utf8_output++ = char(data[pos + 1]);
94 pos += 2;
95 continue;
96 }
97 }
98 }
99
100 uint32_t word = data[pos];
101 if ((word & 0xFFFFFF80) == 0) {
102 // will generate one UTF-8 bytes
103 *utf8_output++ = char(word);
104 pos++;
105 } else if ((word & 0xFFFFF800) == 0) {
106 // will generate two UTF-8 bytes
107 // we have 0b110XXXXX 0b10XXXXXX
108 *utf8_output++ = char((word >> 6) | 0b11000000);
109 *utf8_output++ = char((word & 0b111111) | 0b10000000);
110 pos++;
111 } else if ((word & 0xFFFF0000) == 0) {
112 // will generate three UTF-8 bytes
113 // we have 0b1110XXXX 0b10XXXXXX 0b10XXXXXX
114 if (word >= 0xD800 && word <= 0xDFFF) {
115 return result(error_code::SURROGATE, pos);
116 }
117 *utf8_output++ = char((word >> 12) | 0b11100000);
118 *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
119 *utf8_output++ = char((word & 0b111111) | 0b10000000);
120 pos++;
121 } else {
122 // will generate four UTF-8 bytes
123 // we have 0b11110XXX 0b10XXXXXX 0b10XXXXXX 0b10XXXXXX
124 if (word > 0x10FFFF) {
125 return result(error_code::TOO_LARGE, pos);
126 }
127 *utf8_output++ = char((word >> 18) | 0b11110000);
128 *utf8_output++ = char(((word >> 12) & 0b111111) | 0b10000000);
129 *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
130 *utf8_output++ = char((word & 0b111111) | 0b10000000);
131 pos++;
132 }
133 }
134 return result(error_code::SUCCESS, utf8_output - start);
135}
136
137} // namespace utf32_to_utf8
138} // unnamed namespace
139} // namespace scalar
140} // namespace simdutf
141
142#endif