simdutf 8.0.0
Unicode at GB/s.
Loading...
Searching...
No Matches
valid_utf8_to_utf32.h
1#ifndef SIMDUTF_VALID_UTF8_TO_UTF32_H
2#define SIMDUTF_VALID_UTF8_TO_UTF32_H
3
4namespace simdutf {
5namespace scalar {
6namespace {
7namespace utf8_to_utf32 {
8
/**
 * Decode UTF-8 into UTF-32 without validating the input.
 *
 * Lead bytes are classified by their high bits and continuation bytes are
 * masked and merged without being checked, so the result is only meaningful
 * when the caller passes well-formed UTF-8.
 *
 * @param data         byte-like input, indexed as data[0] .. data[len - 1].
 * @param len          number of input bytes.
 * @param utf32_output destination buffer; one char32_t is written per decoded
 *                     code point. No capacity check is done here, so the
 *                     caller must provide enough room (at most len elements).
 * @return the number of char32_t values written. If the input ends in the
 *         middle of a multi-byte sequence, decoding stops there and the count
 *         written so far is returned. If a byte in lead position matches no
 *         lead pattern (a continuation byte or 0xF8..0xFF), 0 is returned
 *         even though earlier code points may already have been written.
 */
template <typename InputPtr>
#if SIMDUTF_CPLUSPLUS20
  requires simdutf::detail::indexes_into_byte_like<InputPtr>
#endif
simdutf_constexpr23 size_t convert_valid(InputPtr data, size_t len,
                                         char32_t *utf32_output) {
  size_t pos = 0;
  char32_t *start{utf32_output};
  while (pos < len) {
    // The 8-byte ASCII fast path relies on memcpy, so it is skipped during
    // constant evaluation when C++23 `if consteval` is available.
#if SIMDUTF_CPLUSPLUS23
    if !consteval
#endif
    {
      // `len - pos` cannot wrap because pos < len holds inside the loop.
      if (len - pos >= 8) {
        // Load 8 bytes at once; any set high bit means a non-ASCII byte.
        uint64_t v;
        ::memcpy(&v, data + pos, sizeof(uint64_t));
        if ((v & 0x8080808080808080) == 0) {
          size_t final_pos = pos + 8;
          while (pos < final_pos) {
            *utf32_output++ = uint8_t(data[pos]);
            pos++;
          }
          continue;
        }
      }
    }
    auto leading_byte = uint8_t(data[pos]);
    if (leading_byte < 0b10000000) {
      // One ASCII byte maps directly to one code point.
      *utf32_output++ = char32_t(leading_byte);
      pos++;
    } else if ((leading_byte & 0b11100000) == 0b11000000) {
      // Two-byte sequence: 110xxxxx 10xxxxxx.
      if (len - pos < 2) {
        break; // truncated tail: stop and report what was decoded so far
      }
      *utf32_output++ = char32_t(((leading_byte & 0b00011111) << 6) |
                                 (uint8_t(data[pos + 1]) & 0b00111111));
      pos += 2;
    } else if ((leading_byte & 0b11110000) == 0b11100000) {
      // Three-byte sequence: 1110xxxx 10xxxxxx 10xxxxxx.
      if (len - pos < 3) {
        break; // truncated tail
      }
      *utf32_output++ = char32_t(((leading_byte & 0b00001111) << 12) |
                                 ((uint8_t(data[pos + 1]) & 0b00111111) << 6) |
                                 (uint8_t(data[pos + 2]) & 0b00111111));
      pos += 3;
    } else if ((leading_byte & 0b11111000) == 0b11110000) {
      // Four-byte sequence: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx.
      if (len - pos < 4) {
        break; // truncated tail
      }
      uint32_t code_word = ((leading_byte & 0b00000111) << 18) |
                           ((uint8_t(data[pos + 1]) & 0b00111111) << 12) |
                           ((uint8_t(data[pos + 2]) & 0b00111111) << 6) |
                           (uint8_t(data[pos + 3]) & 0b00111111);
      *utf32_output++ = char32_t(code_word);
      pos += 4;
    } else {
      // A continuation byte (or 0xF8..0xFF) in lead position: the input is
      // not valid UTF-8. No error details are reported, only a zero count.
      return 0;
    }
  }
  return static_cast<size_t>(utf32_output - start);
}
77
78} // namespace utf8_to_utf32
79} // unnamed namespace
80} // namespace scalar
81} // namespace simdutf
82
83#endif