// simdutf 8.0.0 - Unicode at GB/s.
// File: utf32.h
1#ifndef SIMDUTF_UTF32_H
2#define SIMDUTF_UTF32_H
3
4namespace simdutf {
5namespace scalar {
6namespace utf32 {
7
// Checks that each of the `len` 32-bit words read from `data` is a valid
// UTF-32 value: no larger than 0x10FFFF and not inside the surrogate range
// 0xD800..0xDFFF.
//
// `data` is anything that can be indexed as data[i] with the element
// convertible to uint32_t; when SIMDUTF_CPLUSPLUS20 is set it is additionally
// constrained by simdutf::detail::indexes_into_uint32.
//
// Returns false as soon as an offending word is found, and true otherwise
// (including when len == 0).
template <typename InputPtr>
#if SIMDUTF_CPLUSPLUS20
  requires simdutf::detail::indexes_into_uint32<InputPtr>
#endif
simdutf_warn_unused simdutf_constexpr23 bool validate(InputPtr data,
                                                      size_t len) noexcept {
  // The index is a size_t so that it has the same type as `len`, in line with
  // validate_with_errors.
  for (size_t pos = 0; pos < len; pos++) {
    const uint32_t word = data[pos];
    if (word > 0x10FFFF || (word >= 0xD800 && word <= 0xDFFF)) {
      return false;
    }
  }
  return true;
}
23
24simdutf_warn_unused simdutf_really_inline bool validate(const char32_t *buf,
25 size_t len) noexcept {
26 return validate(reinterpret_cast<const uint32_t *>(buf), len);
27}
28
29template <typename InputPtr>
30#if SIMDUTF_CPLUSPLUS20
31 requires simdutf::detail::indexes_into_uint32<InputPtr>
32#endif
33simdutf_warn_unused simdutf_constexpr23 result
34validate_with_errors(InputPtr data, size_t len) noexcept {
35 size_t pos = 0;
36 for (; pos < len; pos++) {
37 uint32_t word = data[pos];
38 if (word > 0x10FFFF) {
39 return result(error_code::TOO_LARGE, pos);
40 }
41 if (word >= 0xD800 && word <= 0xDFFF) {
42 return result(error_code::SURROGATE, pos);
43 }
44 }
45 return result(error_code::SUCCESS, pos);
46}
47
48simdutf_warn_unused simdutf_really_inline result
49validate_with_errors(const char32_t *buf, size_t len) noexcept {
50 return validate_with_errors(reinterpret_cast<const uint32_t *>(buf), len);
51}
52
// Returns the number of UTF-8 bytes needed for the `len` UTF-32 words
// starting at `p`.
//
// Each word contributes 1 byte, plus one more for every threshold it exceeds
// (0x7F, 0x7FF, 0xFFFF), i.e. 1 to 4 bytes. The input is not validated: words
// are only compared against these thresholds. An empty input yields 0.
//
// Marked simdutf_warn_unused like utf16_length_from_utf32, since the computed
// length is the function's only effect.
inline simdutf_warn_unused simdutf_constexpr23 size_t
utf8_length_from_utf32(const char32_t *p, size_t len) {
  // We are not BOM aware.
  size_t counter{0};
  for (size_t i = 0; i < len; i++) {
    // credit: @ttsugriy for the vectorizable approach
    const char32_t c = p[i];
    counter++;                                  // ASCII
    counter += static_cast<size_t>(c > 0x7F);   // two-byte
    counter += static_cast<size_t>(c > 0x7FF);  // three-byte
    counter += static_cast<size_t>(c > 0xFFFF); // four-bytes
  }
  return counter;
}
66
// Returns the number of UTF-16 code units needed for the `len` UTF-32 words
// starting at `p`: one unit per word, plus one extra unit for every word
// greater than 0xFFFF. The input is not validated.
inline simdutf_warn_unused simdutf_constexpr23 size_t
utf16_length_from_utf32(const char32_t *p, size_t len) {
  // We are not BOM aware.
  size_t units = len; // every word takes at least one code unit
  for (size_t idx = 0; idx < len; ++idx) {
    units += static_cast<size_t>(p[idx] > 0xFFFF); // surrogate pair
  }
  return units;
}
77
78} // namespace utf32
79} // namespace scalar
80} // namespace simdutf
81
82#endif