simdutf 8.0.0
Unicode at GB/s.
Loading...
Searching...
No Matches
utf16_to_utf32.h
1#ifndef SIMDUTF_UTF16_TO_UTF32_H
2#define SIMDUTF_UTF16_TO_UTF32_H
3
4namespace simdutf {
5namespace scalar {
6namespace {
7namespace utf16_to_utf32 {
8
9template <endianness big_endian>
10simdutf_constexpr23 size_t convert(const char16_t *data, size_t len,
11 char32_t *utf32_output) {
12 size_t pos = 0;
13 char32_t *start{utf32_output};
14 while (pos < len) {
15 uint16_t word =
16 !match_system(big_endian) ? u16_swap_bytes(data[pos]) : data[pos];
17 if ((word & 0xF800) != 0xD800) {
18 // No surrogate pair, extend 16-bit word to 32-bit word
19 *utf32_output++ = char32_t(word);
20 pos++;
21 } else {
22 // must be a surrogate pair
23 uint16_t diff = uint16_t(word - 0xD800);
24 if (diff > 0x3FF) {
25 return 0;
26 }
27 if (pos + 1 >= len) {
28 return 0;
29 } // minimal bound checking
30 uint16_t next_word = !match_system(big_endian)
31 ? u16_swap_bytes(data[pos + 1])
32 : data[pos + 1];
33 uint16_t diff2 = uint16_t(next_word - 0xDC00);
34 if (diff2 > 0x3FF) {
35 return 0;
36 }
37 uint32_t value = (diff << 10) + diff2 + 0x10000;
38 *utf32_output++ = char32_t(value);
39 pos += 2;
40 }
41 }
42 return utf32_output - start;
43}
44
45template <endianness big_endian>
46simdutf_constexpr23 result convert_with_errors(const char16_t *data, size_t len,
47 char32_t *utf32_output) {
48 size_t pos = 0;
49 char32_t *start{utf32_output};
50 while (pos < len) {
51 uint16_t word =
52 !match_system(big_endian) ? u16_swap_bytes(data[pos]) : data[pos];
53 if ((word & 0xF800) != 0xD800) {
54 // No surrogate pair, extend 16-bit word to 32-bit word
55 *utf32_output++ = char32_t(word);
56 pos++;
57 } else {
58 // must be a surrogate pair
59 uint16_t diff = uint16_t(word - 0xD800);
60 if (diff > 0x3FF) {
61 return result(error_code::SURROGATE, pos);
62 }
63 if (pos + 1 >= len) {
64 return result(error_code::SURROGATE, pos);
65 } // minimal bound checking
66 uint16_t next_word = !match_system(big_endian)
67 ? u16_swap_bytes(data[pos + 1])
68 : data[pos + 1];
69 uint16_t diff2 = uint16_t(next_word - 0xDC00);
70 if (diff2 > 0x3FF) {
71 return result(error_code::SURROGATE, pos);
72 }
73 uint32_t value = (diff << 10) + diff2 + 0x10000;
74 *utf32_output++ = char32_t(value);
75 pos += 2;
76 }
77 }
78 return result(error_code::SUCCESS, utf32_output - start);
79}
80
81} // namespace utf16_to_utf32
82} // unnamed namespace
83} // namespace scalar
84} // namespace simdutf
85
86#endif