simdutf 9.0.0
Unicode at GB/s.
Loading...
Searching...
No Matches
encoding_types.h
1#ifndef SIMDUTF_ENCODING_TYPES_H
2#define SIMDUTF_ENCODING_TYPES_H
3#include <string_view>
4#include "simdutf/portability.h"
5#include "simdutf/common_defs.h"
6
7#if !defined(SIMDUTF_NO_STD_TEXT_ENCODING) && \
8 defined(__cpp_lib_text_encoding) && __cpp_lib_text_encoding >= 202306L
9 #define SIMDUTF_HAS_STD_TEXT_ENCODING 1
10 #include <text_encoding>
11#endif
12
13namespace simdutf {
14
/**
 * Text encodings known to the library.
 *
 * `unspecified` is zero; every other enumerator occupies a distinct single
 * bit. NOTE(review): presumably this allows several encodings to be OR-ed
 * into one mask -- confirm against callers.
 *
 * The trailing comments give the byte order mark (BOM) byte sequence of
 * each encoding, where one exists.
 */
enum encoding_type {
  unspecified = 0,

  UTF8 = 1 << 0,     // BOM 0xef 0xbb 0xbf
  UTF16_LE = 1 << 1, // BOM 0xff 0xfe
  UTF16_BE = 1 << 2, // BOM 0xfe 0xff
  UTF32_LE = 1 << 3, // BOM 0xff 0xfe 0x00 0x00
  UTF32_BE = 1 << 4, // BOM 0x00 0x00 0xfe 0xff
  Latin1 = 1 << 5
};
25
26#ifndef SIMDUTF_IS_BIG_ENDIAN
27 #error "SIMDUTF_IS_BIG_ENDIAN needs to be defined."
28#endif
29
/**
 * Byte order selector.
 *
 * NATIVE is an alias for whichever of LITTLE or BIG the
 * SIMDUTF_IS_BIG_ENDIAN macro selects at preprocessing time.
 */
enum endianness {
  LITTLE = 0,
  BIG = 1,
#if SIMDUTF_IS_BIG_ENDIAN
  NATIVE = BIG
#else
  NATIVE = LITTLE
#endif
};
40
41simdutf_warn_unused simdutf_really_inline constexpr bool
42match_system(endianness e) {
43 return e == endianness::NATIVE;
44}
45
46simdutf_warn_unused std::string_view to_string(encoding_type bom);
47
48// Note that BOM for UTF8 is discouraged.
49namespace BOM {
50
51/**
52 * Checks for a BOM. If not, returns unspecified
53 * @param byte the string to process
54 * @param length the length of the string in code units
55 * @return the corresponding encoding
56 */
57
58simdutf_warn_unused encoding_type check_bom(const uint8_t *byte, size_t length);
59simdutf_warn_unused encoding_type check_bom(const char *byte, size_t length);
60/**
61 * Returns the size, in bytes, of the BOM for a given encoding type.
62 * Note that UTF8 BOM are discouraged.
63 * @param bom the encoding type
64 * @return the size in bytes of the corresponding BOM
65 */
66simdutf_warn_unused size_t bom_byte_size(encoding_type bom);
67
68} // namespace BOM
69
70#ifdef SIMDUTF_HAS_STD_TEXT_ENCODING
/**
 * Map a simdutf encoding type onto the matching std::text_encoding.
 *
 * @param enc the simdutf encoding type
 * @return a std::text_encoding built from the matching
 *         std::text_encoding::id, or from std::text_encoding::id::unknown
 *         when `enc` is unspecified or has no counterpart
 */
simdutf_warn_unused constexpr std::text_encoding
to_std_encoding(encoding_type enc) noexcept {
  using mib_id = std::text_encoding::id;

  // Start from "unknown"; only recognised encodings overwrite it.
  mib_id mapped = mib_id::unknown;
  switch (enc) {
  case UTF8:
    mapped = mib_id::UTF8;
    break;
  case UTF16_LE:
    mapped = mib_id::UTF16LE;
    break;
  case UTF16_BE:
    mapped = mib_id::UTF16BE;
    break;
  case UTF32_LE:
    mapped = mib_id::UTF32LE;
    break;
  case UTF32_BE:
    mapped = mib_id::UTF32BE;
    break;
  case Latin1:
    mapped = mib_id::ISOLatin1;
    break;
  case unspecified:
  default:
    break;
  }
  return std::text_encoding(mapped);
}
98
/**
 * Map a std::text_encoding onto the matching simdutf encoding type.
 *
 * Only identifiers with an explicit byte order (UTF16LE, UTF16BE, UTF32LE,
 * UTF32BE), UTF8 and ISOLatin1 are recognised; every other identifier yields
 * encoding_type::unspecified.
 *
 * @param enc the std::text_encoding
 * @return the matching simdutf encoding type, or
 *         encoding_type::unspecified if the encoding is not supported
 */
simdutf_warn_unused constexpr encoding_type
from_std_encoding(const std::text_encoding &enc) noexcept {
  using mib_id = std::text_encoding::id;
  const mib_id mib = enc.mib();

  if (mib == mib_id::UTF8) {
    return UTF8;
  }
  if (mib == mib_id::UTF16LE) {
    return UTF16_LE;
  }
  if (mib == mib_id::UTF16BE) {
    return UTF16_BE;
  }
  if (mib == mib_id::UTF32LE) {
    return UTF32_LE;
  }
  if (mib == mib_id::UTF32BE) {
    return UTF32_BE;
  }
  if (mib == mib_id::ISOLatin1) {
    return Latin1;
  }
  return unspecified;
}
125
126/**
127 * Get the native-endian UTF-16 encoding type for this system.
128 *
129 * @return UTF16_LE on little-endian systems, UTF16_BE on big-endian systems
130 */
131simdutf_warn_unused constexpr encoding_type native_utf16_encoding() noexcept {
132 #if SIMDUTF_IS_BIG_ENDIAN
133 return UTF16_BE;
134 #else
135 return UTF16_LE;
136 #endif
137}
138
139/**
140 * Get the native-endian UTF-32 encoding type for this system.
141 *
142 * @return UTF32_LE on little-endian systems, UTF32_BE on big-endian systems
143 */
144simdutf_warn_unused constexpr encoding_type native_utf32_encoding() noexcept {
145 #if SIMDUTF_IS_BIG_ENDIAN
146 return UTF32_BE;
147 #else
148 return UTF32_LE;
149 #endif
150}
151
/**
 * Map a std::text_encoding onto a simdutf encoding type, resolving the
 * byte-order-less UTF-16/UTF-32 identifiers to the native variant.
 *
 * std::text_encoding::id::UTF16 and ::UTF32 (no LE/BE suffix) map to the
 * results of native_utf16_encoding() and native_utf32_encoding()
 * respectively. Identifiers with an explicit byte order, UTF8 and ISOLatin1
 * map to their direct counterparts.
 *
 * @param enc the std::text_encoding
 * @return the matching simdutf encoding type, or
 *         encoding_type::unspecified if the encoding is not supported
 */
simdutf_warn_unused constexpr encoding_type
from_std_encoding_native(const std::text_encoding &enc) noexcept {
  using mib_id = std::text_encoding::id;

  // Unrecognised identifiers leave the result at "unspecified".
  encoding_type resolved = unspecified;
  switch (enc.mib()) {
  case mib_id::UTF8:
    resolved = UTF8;
    break;
  case mib_id::UTF16:
    resolved = native_utf16_encoding();
    break;
  case mib_id::UTF16LE:
    resolved = UTF16_LE;
    break;
  case mib_id::UTF16BE:
    resolved = UTF16_BE;
    break;
  case mib_id::UTF32:
    resolved = native_utf32_encoding();
    break;
  case mib_id::UTF32LE:
    resolved = UTF32_LE;
    break;
  case mib_id::UTF32BE:
    resolved = UTF32_BE;
    break;
  case mib_id::ISOLatin1:
    resolved = Latin1;
    break;
  default:
    break;
  }
  return resolved;
}
186#endif // SIMDUTF_HAS_STD_TEXT_ENCODING
187
188} // namespace simdutf
189#endif