simdutf 8.0.0
Unicode at GB/s.
Loading...
Searching...
No Matches
valid_utf8_to_latin1.h
1#ifndef SIMDUTF_VALID_UTF8_TO_LATIN1_H
2#define SIMDUTF_VALID_UTF8_TO_LATIN1_H
3
4namespace simdutf {
5namespace scalar {
6namespace {
7namespace utf8_to_latin1 {
8
9template <typename InputPtr>
10#if SIMDUTF_CPLUSPLUS20
11 requires simdutf::detail::indexes_into_byte_like<InputPtr>
12#endif
13simdutf_constexpr23 size_t convert_valid(InputPtr data, size_t len,
14 char *latin_output) {
15
16 size_t pos = 0;
17 char *start{latin_output};
18
19 while (pos < len) {
20#if SIMDUTF_CPLUSPLUS23
21 if !consteval
22#endif
23 {
24 // try to convert the next block of 16 ASCII bytes
25 if (pos + 16 <= len) { // if it is safe to read 16 more bytes, check that
26 // they are ascii
27 uint64_t v1;
28 ::memcpy(&v1, data + pos, sizeof(uint64_t));
29 uint64_t v2;
30 ::memcpy(&v2, data + pos + sizeof(uint64_t), sizeof(uint64_t));
31 uint64_t v{v1 |
32 v2}; // We are only interested in these bits: 1000 1000 1000
33 // 1000, so it makes sense to concatenate everything
34 if ((v & 0x8080808080808080) ==
35 0) { // if NONE of these are set, e.g. all of them are zero, then
36 // everything is ASCII
37 size_t final_pos = pos + 16;
38 while (pos < final_pos) {
39 *latin_output++ = uint8_t(data[pos]);
40 pos++;
41 }
42 continue;
43 }
44 }
45 }
46
47 // suppose it is not an all ASCII byte sequence
48 auto leading_byte = uint8_t(data[pos]); // leading byte
49 if (leading_byte < 0b10000000) {
50 // converting one ASCII byte !!!
51 *latin_output++ = char(leading_byte);
52 pos++;
53 } else if ((leading_byte & 0b11100000) ==
54 0b11000000) { // the first three bits indicate:
55 // We have a two-byte UTF-8
56 if (pos + 1 >= len) {
57 break;
58 } // minimal bound checking
59 if ((uint8_t(data[pos + 1]) & 0b11000000) != 0b10000000) {
60 return 0;
61 } // checks if the next byte is a valid continuation byte in UTF-8. A
62 // valid continuation byte starts with 10.
63 // range check -
64 uint32_t code_point =
65 (leading_byte & 0b00011111) << 6 |
66 (uint8_t(data[pos + 1]) &
67 0b00111111); // assembles the Unicode code point from the two bytes.
68 // It does this by discarding the leading 110 and 10
69 // bits from the two bytes, shifting the remaining bits
70 // of the first byte, and then combining the results
71 // with a bitwise OR operation.
72 *latin_output++ = char(code_point);
73 pos += 2;
74 } else {
75 // we may have a continuation but we do not do error checking
76 return 0;
77 }
78 }
79 return latin_output - start;
80}
81
82} // namespace utf8_to_latin1
83} // unnamed namespace
84} // namespace scalar
85} // namespace simdutf
86
87#endif