simdutf 8.0.0
Unicode at GB/s.
Loading...
Searching...
No Matches
encoding_types.h
1#ifndef SIMDUTF_ENCODING_TYPES_H
2#define SIMDUTF_ENCODING_TYPES_H
3#include <string>
4#include "simdutf/portability.h"
5#include "simdutf/common_defs.h"
6
7#if !defined(SIMDUTF_NO_STD_TEXT_ENCODING) && \
8 defined(__cpp_lib_text_encoding) && __cpp_lib_text_encoding >= 202306L
9 #define SIMDUTF_HAS_STD_TEXT_ENCODING 1
10 #include <text_encoding>
11#endif
12
13namespace simdutf {
14
/**
 * Text encodings known to the library.
 *
 * Every named encoding occupies its own bit; `unspecified` is the zero value.
 * The trailing comments give the byte-order mark of each encoding.
 */
enum encoding_type {
  unspecified = 0,

  UTF8 = 1 << 0,     // BOM 0xef 0xbb 0xbf
  UTF16_LE = 1 << 1, // BOM 0xff 0xfe
  UTF16_BE = 1 << 2, // BOM 0xfe 0xff
  UTF32_LE = 1 << 3, // BOM 0xff 0xfe 0x00 0x00
  UTF32_BE = 1 << 4, // BOM 0x00 0x00 0xfe 0xff
  Latin1 = 1 << 5
};
25
// Fail the build early when the byte-order macro is missing: the endianness
// enum that follows evaluates SIMDUTF_IS_BIG_ENDIAN in an #if.
26#ifndef SIMDUTF_IS_BIG_ENDIAN
27 #error "SIMDUTF_IS_BIG_ENDIAN needs to be defined."
28#endif
29
/**
 * Byte order selector.
 *
 * NATIVE is not a third value: it is an alias for BIG or LITTLE, picked at
 * compile time from SIMDUTF_IS_BIG_ENDIAN.
 */
enum endianness {
  LITTLE = 0,
  BIG = 1,
#if SIMDUTF_IS_BIG_ENDIAN
  NATIVE = BIG
#else
  NATIVE = LITTLE
#endif
};
40
41simdutf_warn_unused simdutf_really_inline constexpr bool
42match_system(endianness e) {
43 return e == endianness::NATIVE;
44}
45
// Produces a std::string for the given encoding_type. The definition is not
// part of this header. NOTE(review): presumably a human-readable name of the
// encoding -- confirm against the implementation.
46simdutf_warn_unused std::string to_string(encoding_type bom);
47
48// Note that BOM for UTF8 is discouraged.
49namespace BOM {
50
// Takes a byte buffer and its length and returns an encoding_type.
// NOTE(review): presumably the encoding whose byte-order mark starts the
// buffer, and `unspecified` when no mark is found -- the definition is not
// visible here, confirm against the implementation.
58simdutf_warn_unused encoding_type check_bom(const uint8_t *byte, size_t length);
// Overload of check_bom accepting a `const char *` buffer.
59simdutf_warn_unused encoding_type check_bom(const char *byte, size_t length);
// Takes an encoding_type and returns a size_t.
// NOTE(review): presumably the length in bytes of that encoding's byte-order
// mark -- confirm against the implementation.
66simdutf_warn_unused size_t bom_byte_size(encoding_type bom);
67
68} // namespace BOM
69
70#ifdef SIMDUTF_HAS_STD_TEXT_ENCODING
/**
 * Maps a simdutf encoding_type onto the matching std::text_encoding.
 *
 * @param enc the simdutf encoding to translate
 * @return a std::text_encoding built from the corresponding
 *         std::text_encoding::id; `unspecified` and any value that is not one
 *         of the named encodings yield id::unknown
 */
simdutf_warn_unused constexpr std::text_encoding
to_std_encoding(encoding_type enc) noexcept {
  using mib_id = std::text_encoding::id;

  // Start from the fallback; only the named encodings override it.
  mib_id mapped = mib_id::unknown;
  switch (enc) {
  case UTF8:
    mapped = mib_id::UTF8;
    break;
  case UTF16_LE:
    mapped = mib_id::UTF16LE;
    break;
  case UTF16_BE:
    mapped = mib_id::UTF16BE;
    break;
  case UTF32_LE:
    mapped = mib_id::UTF32LE;
    break;
  case UTF32_BE:
    mapped = mib_id::UTF32BE;
    break;
  case Latin1:
    mapped = mib_id::ISOLatin1;
    break;
  case unspecified:
  default:
    break;
  }
  return std::text_encoding(mapped);
}
98
/**
 * Maps a std::text_encoding onto the matching simdutf encoding_type.
 *
 * Only identifiers with an explicit byte order are recognised for UTF-16 and
 * UTF-32: id::UTF16 and id::UTF32 are not handled here and therefore map to
 * `unspecified`, like every other identifier without a counterpart.
 *
 * @param enc the standard-library encoding to translate
 * @return the corresponding encoding_type, or `unspecified` when there is none
 */
simdutf_warn_unused constexpr encoding_type
from_std_encoding(const std::text_encoding &enc) noexcept {
  using mib_id = std::text_encoding::id;
  const mib_id mib = enc.mib();

  if (mib == mib_id::UTF8) {
    return UTF8;
  }
  if (mib == mib_id::UTF16LE) {
    return UTF16_LE;
  }
  if (mib == mib_id::UTF16BE) {
    return UTF16_BE;
  }
  if (mib == mib_id::UTF32LE) {
    return UTF32_LE;
  }
  if (mib == mib_id::UTF32BE) {
    return UTF32_BE;
  }
  if (mib == mib_id::ISOLatin1) {
    return Latin1;
  }
  return unspecified;
}
125
131simdutf_warn_unused constexpr encoding_type native_utf16_encoding() noexcept {
132 #if SIMDUTF_IS_BIG_ENDIAN
133 return UTF16_BE;
134 #else
135 return UTF16_LE;
136 #endif
137}
138
144simdutf_warn_unused constexpr encoding_type native_utf32_encoding() noexcept {
145 #if SIMDUTF_IS_BIG_ENDIAN
146 return UTF32_BE;
147 #else
148 return UTF32_LE;
149 #endif
150}
151
/**
 * Maps a std::text_encoding onto a simdutf encoding_type, resolving the
 * identifiers that carry no byte order to the build's native one.
 *
 * id::UTF16 becomes native_utf16_encoding() and id::UTF32 becomes
 * native_utf32_encoding(); identifiers with an explicit byte order, UTF-8 and
 * ISO Latin-1 map to their direct counterparts.
 *
 * @param enc the standard-library encoding to translate
 * @return the corresponding encoding_type, or `unspecified` when there is none
 */
simdutf_warn_unused constexpr encoding_type
from_std_encoding_native(const std::text_encoding &enc) noexcept {
  using mib_id = std::text_encoding::id;

  // Fallback for every identifier that has no simdutf counterpart.
  encoding_type result = unspecified;
  switch (enc.mib()) {
  case mib_id::UTF8:
    result = UTF8;
    break;
  case mib_id::UTF16:
    result = native_utf16_encoding();
    break;
  case mib_id::UTF16LE:
    result = UTF16_LE;
    break;
  case mib_id::UTF16BE:
    result = UTF16_BE;
    break;
  case mib_id::UTF32:
    result = native_utf32_encoding();
    break;
  case mib_id::UTF32LE:
    result = UTF32_LE;
    break;
  case mib_id::UTF32BE:
    result = UTF32_BE;
    break;
  case mib_id::ISOLatin1:
    result = Latin1;
    break;
  default:
    break;
  }
  return result;
}
186#endif // SIMDUTF_HAS_STD_TEXT_ENCODING
187
188} // namespace simdutf
189#endif