simdutf 8.0.0
Unicode at GB/s.
Loading...
Searching...
No Matches
simdutf_c.h
/***
 * simdutf_c.h - C API for simdutf
 * This is currently experimental.
 * We are committed to keeping the C API, but there might be mistakes in our
 * implementation. Please report any issues you find.
 */
7
8#ifndef SIMDUTF_C_H
9#define SIMDUTF_C_H
10
11#include <stddef.h>
12#include <stdbool.h>
13#include <stdint.h>
14
/*
 * Make char16_t / char32_t available.
 * If the preprocessor supports __has_include and <uchar.h> exists, the
 * standard header is included; in every other case the names are mapped onto
 * the fixed-width types uint16_t / uint32_t from <stdint.h>.
 */
#ifdef __has_include
  #if __has_include(<uchar.h>)
    #include <uchar.h>
  #else // __has_include(<uchar.h>)
    #define char16_t uint16_t
    #define char32_t uint32_t
  #endif // __has_include(<uchar.h>)
#else // __has_include
  #define char16_t uint16_t
  #define char32_t uint32_t
#endif // __has_include
26
27#ifdef __cplusplus
28extern "C" {
29#endif
30
31/* C-friendly subset of simdutf errors */
/*
 * C-friendly subset of simdutf errors.
 * SIMDUTF_ERROR_SUCCESS is explicitly 0; the remaining enumerators have no
 * initializer and therefore take the consecutive values 1..11 in the order
 * listed. Do not reorder them.
 */
typedef enum simdutf_error_code {
  SIMDUTF_ERROR_SUCCESS = 0,
  SIMDUTF_ERROR_HEADER_BITS,
  SIMDUTF_ERROR_TOO_SHORT,
  SIMDUTF_ERROR_TOO_LONG,
  SIMDUTF_ERROR_OVERLONG,
  SIMDUTF_ERROR_TOO_LARGE,
  SIMDUTF_ERROR_SURROGATE,
  SIMDUTF_ERROR_INVALID_BASE64_CHARACTER,
  SIMDUTF_ERROR_BASE64_INPUT_REMAINDER,
  SIMDUTF_ERROR_BASE64_EXTRA_BITS,
  SIMDUTF_ERROR_OUTPUT_BUFFER_TOO_SMALL,
  SIMDUTF_ERROR_OTHER
} simdutf_error_code;
46
47typedef struct simdutf_result {
48 simdutf_error_code error;
49 size_t count; /* position of error or number of code units validated */
51
/*
 * Text encodings known to the C API.
 * Every specified encoding has a distinct single-bit value (1, 2, 4, 8, 16);
 * UNSPECIFIED is 0.
 * NOTE(review): the single-bit layout presumably lets several encodings be
 * OR-ed into one int (see simdutf_detect_encodings) - confirm against the
 * implementation.
 */
typedef enum simdutf_encoding_type {
  SIMDUTF_ENCODING_UNSPECIFIED = 0,
  SIMDUTF_ENCODING_UTF8 = 1,
  SIMDUTF_ENCODING_UTF16_LE = 2,
  SIMDUTF_ENCODING_UTF16_BE = 4,
  SIMDUTF_ENCODING_UTF32_LE = 8,
  SIMDUTF_ENCODING_UTF32_BE = 16
} simdutf_encoding_type;
60
61/* Validate UTF-8: returns true iff input is valid UTF-8 */
62bool simdutf_validate_utf8(const char *buf, size_t len);
63
64/* Validate UTF-8 with detailed result */
65simdutf_result simdutf_validate_utf8_with_errors(const char *buf, size_t len);
66
67/* Encoding detection */
68simdutf_encoding_type simdutf_autodetect_encoding(const char *input,
69 size_t length);
70int simdutf_detect_encodings(const char *input, size_t length);
71
72/* ASCII validation */
73bool simdutf_validate_ascii(const char *buf, size_t len);
74simdutf_result simdutf_validate_ascii_with_errors(const char *buf, size_t len);
75
/*
 * UTF-16 ASCII checks: native-endian, big-endian and little-endian variants.
 * Each takes a char16_t buffer and a length and yields a bool.
 */
bool simdutf_validate_utf16_as_ascii(const char16_t *buf, size_t len);
bool simdutf_validate_utf16be_as_ascii(const char16_t *buf, size_t len);
bool simdutf_validate_utf16le_as_ascii(const char16_t *buf, size_t len);
80
81/* UTF-16/UTF-8/UTF-32 validation (native/endian-specific) */
82bool simdutf_validate_utf16(const char16_t *buf, size_t len);
83bool simdutf_validate_utf16le(const char16_t *buf, size_t len);
84bool simdutf_validate_utf16be(const char16_t *buf, size_t len);
85simdutf_result simdutf_validate_utf16_with_errors(const char16_t *buf,
86 size_t len);
87simdutf_result simdutf_validate_utf16le_with_errors(const char16_t *buf,
88 size_t len);
89simdutf_result simdutf_validate_utf16be_with_errors(const char16_t *buf,
90 size_t len);
91
92bool simdutf_validate_utf32(const char32_t *buf, size_t len);
93simdutf_result simdutf_validate_utf32_with_errors(const char32_t *buf,
94 size_t len);
95
/*
 * to_well_formed UTF-16 helpers.
 * Each reads `len` units from `input` and writes through `output`; nothing is
 * returned.
 * NOTE(review): required capacity of `output` and whether it may alias
 * `input` are not visible here - confirm against the implementation.
 */
void simdutf_to_well_formed_utf16le(const char16_t *input, size_t len,
                                    char16_t *output);
void simdutf_to_well_formed_utf16be(const char16_t *input, size_t len,
                                    char16_t *output);
void simdutf_to_well_formed_utf16(const char16_t *input, size_t len,
                                  char16_t *output);
103
/*
 * Counting.
 * NOTE(review): presumably each returns the number of code points in the
 * input - confirm against the implementation.
 */
size_t simdutf_count_utf16(const char16_t *input, size_t length);
size_t simdutf_count_utf16le(const char16_t *input, size_t length);
size_t simdutf_count_utf16be(const char16_t *input, size_t length);
size_t simdutf_count_utf8(const char *input, size_t length);
109
/*
 * Length estimators.
 * Every function here returns size_t. The latin1_length_from_utf16/utf32
 * variants take only a length; the others also inspect the input buffer.
 */
size_t simdutf_utf8_length_from_latin1(const char *input, size_t length);
size_t simdutf_latin1_length_from_utf8(const char *input, size_t length);
size_t simdutf_latin1_length_from_utf16(size_t length);
size_t simdutf_latin1_length_from_utf32(size_t length);
size_t simdutf_utf16_length_from_utf8(const char *input, size_t length);
size_t simdutf_utf32_length_from_utf8(const char *input, size_t length);
size_t simdutf_utf8_length_from_utf16(const char16_t *input, size_t length);
size_t
simdutf_utf8_length_from_utf16_with_replacement(const char16_t *input,
                                                size_t length);
size_t simdutf_utf8_length_from_utf16le(const char16_t *input, size_t length);
size_t simdutf_utf8_length_from_utf16be(const char16_t *input, size_t length);
size_t
simdutf_utf8_length_from_utf16le_with_replacement(const char16_t *input,
                                                  size_t length);
size_t
simdutf_utf8_length_from_utf16be_with_replacement(const char16_t *input,
                                                  size_t length);
129
/*
 * Conversions: latin1 <-> utf8, utf8 <-> utf16/utf32, utf16 <-> utf8, etc.
 * All functions in this group return size_t and write through `output`.
 * simdutf_convert_latin1_to_utf8_safe additionally takes `utf8_len`.
 * NOTE(review): `utf8_len` is presumably the capacity of `output`, and the
 * return value the number of units written - confirm against the
 * implementation.
 */
size_t simdutf_convert_latin1_to_utf8(const char *input, size_t length,
                                      char *output);
size_t simdutf_convert_latin1_to_utf8_safe(const char *input, size_t length,
                                           char *output, size_t utf8_len);
size_t simdutf_convert_latin1_to_utf16le(const char *input, size_t length,
                                         char16_t *output);
size_t simdutf_convert_latin1_to_utf16be(const char *input, size_t length,
                                         char16_t *output);
size_t simdutf_convert_latin1_to_utf32(const char *input, size_t length,
                                       char32_t *output);

size_t simdutf_convert_utf8_to_latin1(const char *input, size_t length,
                                      char *output);
size_t simdutf_convert_utf8_to_utf16le(const char *input, size_t length,
                                       char16_t *output);
size_t simdutf_convert_utf8_to_utf16be(const char *input, size_t length,
                                       char16_t *output);
size_t simdutf_convert_utf8_to_utf16(const char *input, size_t length,
                                     char16_t *output);

size_t simdutf_convert_utf8_to_utf32(const char *input, size_t length,
                                     char32_t *output);
153simdutf_result simdutf_convert_utf8_to_latin1_with_errors(const char *input,
154 size_t length,
155 char *output);
156simdutf_result simdutf_convert_utf8_to_utf16_with_errors(const char *input,
157 size_t length,
158 char16_t *output);
159simdutf_result simdutf_convert_utf8_to_utf16le_with_errors(const char *input,
160 size_t length,
161 char16_t *output);
162simdutf_result simdutf_convert_utf8_to_utf16be_with_errors(const char *input,
163 size_t length,
164 char16_t *output);
165simdutf_result simdutf_convert_utf8_to_utf32_with_errors(const char *input,
166 size_t length,
167 char32_t *output);
168
/*
 * Conversions assuming valid input.
 * NOTE(review): behaviour on invalid input is not specified in this header -
 * callers should presumably validate first; confirm against the
 * implementation.
 */
size_t simdutf_convert_valid_utf8_to_latin1(const char *input, size_t length,
                                            char *output);
size_t simdutf_convert_valid_utf8_to_utf16le(const char *input, size_t length,
                                             char16_t *output);
size_t simdutf_convert_valid_utf8_to_utf16be(const char *input, size_t length,
                                             char16_t *output);
size_t simdutf_convert_valid_utf8_to_utf32(const char *input, size_t length,
                                           char32_t *output);
178
/*
 * UTF-16 -> UTF-8 and related conversions (size_t-returning forms).
 * simdutf_convert_utf16_to_utf8_safe additionally takes `utf8_len`.
 * NOTE(review): `utf8_len` is presumably the capacity of `output` - confirm
 * against the implementation.
 */
size_t simdutf_convert_utf16_to_utf8(const char16_t *input, size_t length,
                                     char *output);
size_t simdutf_convert_utf16le_to_utf8(const char16_t *input, size_t length,
                                       char *output);
size_t simdutf_convert_utf16be_to_utf8(const char16_t *input, size_t length,
                                       char *output);
size_t simdutf_convert_utf16_to_utf8_safe(const char16_t *input, size_t length,
                                          char *output, size_t utf8_len);
size_t simdutf_convert_utf16_to_latin1(const char16_t *input, size_t length,
                                       char *output);
size_t simdutf_convert_utf16le_to_latin1(const char16_t *input, size_t length,
                                         char *output);
size_t simdutf_convert_utf16be_to_latin1(const char16_t *input, size_t length,
                                         char *output);
195simdutf_convert_utf16_to_latin1_with_errors(const char16_t *input,
196 size_t length, char *output);
198simdutf_convert_utf16le_to_latin1_with_errors(const char16_t *input,
199 size_t length, char *output);
201simdutf_convert_utf16be_to_latin1_with_errors(const char16_t *input,
202 size_t length, char *output);
203
204simdutf_result simdutf_convert_utf16_to_utf8_with_errors(const char16_t *input,
205 size_t length,
206 char *output);
208simdutf_convert_utf16le_to_utf8_with_errors(const char16_t *input,
209 size_t length, char *output);
211simdutf_convert_utf16be_to_utf8_with_errors(const char16_t *input,
212 size_t length, char *output);
213
/*
 * UTF-16 -> UTF-8 / Latin-1 conversions for input the caller asserts is valid.
 * NOTE(review): behaviour on invalid input is not specified in this header -
 * confirm against the implementation.
 */
size_t simdutf_convert_valid_utf16_to_utf8(const char16_t *input, size_t length,
                                           char *output);
size_t simdutf_convert_valid_utf16_to_latin1(const char16_t *input,
                                             size_t length, char *output);
size_t simdutf_convert_valid_utf16le_to_latin1(const char16_t *input,
                                               size_t length, char *output);
size_t simdutf_convert_valid_utf16be_to_latin1(const char16_t *input,
                                               size_t length, char *output);

size_t simdutf_convert_valid_utf16le_to_utf8(const char16_t *input,
                                             size_t length, char *output);
size_t simdutf_convert_valid_utf16be_to_utf8(const char16_t *input,
                                             size_t length, char *output);
227
/*
 * UTF-16 <-> UTF-32 conversions.
 * Native, little-endian and big-endian UTF-16 sources; each writes char32_t
 * units through `output` and returns size_t.
 */
size_t simdutf_convert_utf16_to_utf32(const char16_t *input, size_t length,
                                      char32_t *output);
size_t simdutf_convert_utf16le_to_utf32(const char16_t *input, size_t length,
                                        char32_t *output);
size_t simdutf_convert_utf16be_to_utf32(const char16_t *input, size_t length,
                                        char32_t *output);
235simdutf_result simdutf_convert_utf16_to_utf32_with_errors(const char16_t *input,
236 size_t length,
237 char32_t *output);
239simdutf_convert_utf16le_to_utf32_with_errors(const char16_t *input,
240 size_t length, char32_t *output);
242simdutf_convert_utf16be_to_utf32_with_errors(const char16_t *input,
243 size_t length, char32_t *output);
244
/*
 * Valid UTF-16 conversions (UTF-16 -> UTF-32 for input the caller asserts is
 * valid).
 * NOTE(review): behaviour on invalid input is not specified in this header -
 * confirm against the implementation.
 */
size_t simdutf_convert_valid_utf16_to_utf32(const char16_t *input,
                                            size_t length, char32_t *output);
size_t simdutf_convert_valid_utf16le_to_utf32(const char16_t *input,
                                              size_t length, char32_t *output);
size_t simdutf_convert_valid_utf16be_to_utf32(const char16_t *input,
                                              size_t length, char32_t *output);
252
253/* UTF-32 -> ... conversions */
254size_t simdutf_convert_utf32_to_utf8(const char32_t *input, size_t length,
255 char *output);
256simdutf_result simdutf_convert_utf32_to_utf8_with_errors(const char32_t *input,
257 size_t length,
258 char *output);
259size_t simdutf_convert_valid_utf32_to_utf8(const char32_t *input, size_t length,
260 char *output);
261
262size_t simdutf_convert_utf32_to_utf16(const char32_t *input, size_t length,
263 char16_t *output);
264size_t simdutf_convert_utf32_to_utf16le(const char32_t *input, size_t length,
265 char16_t *output);
266size_t simdutf_convert_utf32_to_utf16be(const char32_t *input, size_t length,
267 char16_t *output);
269simdutf_convert_utf32_to_latin1_with_errors(const char32_t *input,
270 size_t length, char *output);
271
/*
 * --- Find helpers ---
 * Search the range delimited by `start` and `end` for `character`; the result
 * is a pointer of the same element type as the range.
 * NOTE(review): presumably the range is half-open [start, end) and `end` is
 * returned when nothing matches - confirm against the implementation.
 */
const char *simdutf_find(const char *start, const char *end, char character);
const char16_t *simdutf_find_utf16(const char16_t *start, const char16_t *end,
                                   char16_t character);
276
277/* --- Base64 enums and helpers --- */
/*
 * Base64 alphabet / padding / leniency options.
 * All values are explicit and part of the interface; note the gaps (6, 7 and
 * 9..11 are unused).
 */
typedef enum simdutf_base64_options {
  SIMDUTF_BASE64_DEFAULT = 0,
  SIMDUTF_BASE64_URL = 1,
  SIMDUTF_BASE64_DEFAULT_NO_PADDING = 2,
  SIMDUTF_BASE64_URL_WITH_PADDING = 3,
  SIMDUTF_BASE64_DEFAULT_ACCEPT_GARBAGE = 4,
  SIMDUTF_BASE64_URL_ACCEPT_GARBAGE = 5,
  SIMDUTF_BASE64_DEFAULT_OR_URL = 8,
  SIMDUTF_BASE64_DEFAULT_OR_URL_ACCEPT_GARBAGE = 12
} simdutf_base64_options;
288
/*
 * Policy selector for the final chunk when decoding base64.
 * Values are explicit (0..3) and part of the interface.
 */
typedef enum simdutf_last_chunk_handling_options {
  SIMDUTF_LAST_CHUNK_LOOSE = 0,
  SIMDUTF_LAST_CHUNK_STRICT = 1,
  SIMDUTF_LAST_CHUNK_STOP_BEFORE_PARTIAL = 2,
  SIMDUTF_LAST_CHUNK_ONLY_FULL_CHUNKS = 3
} simdutf_last_chunk_handling_options;
295
/*
 * maximal binary length estimators
 * One variant for char input and one for char16_t input; both return size_t.
 */
size_t simdutf_maximal_binary_length_from_base64(const char *input,
                                                 size_t length);
size_t simdutf_maximal_binary_length_from_base64_utf16(const char16_t *input,
                                                       size_t length);
301
302/* base64 decoding/encoding */
303simdutf_result simdutf_base64_to_binary(
304 const char *input, size_t length, char *output,
305 simdutf_base64_options options,
306 simdutf_last_chunk_handling_options last_chunk_options);
307simdutf_result simdutf_base64_to_binary_utf16(
308 const char16_t *input, size_t length, char *output,
309 simdutf_base64_options options,
310 simdutf_last_chunk_handling_options last_chunk_options);
311
312size_t simdutf_base64_length_from_binary(size_t length,
313 simdutf_base64_options options);
314size_t simdutf_base64_length_from_binary_with_lines(
315 size_t length, simdutf_base64_options options, size_t line_length);
316
317size_t simdutf_binary_to_base64(const char *input, size_t length, char *output,
318 simdutf_base64_options options);
319size_t simdutf_binary_to_base64_with_lines(const char *input, size_t length,
320 char *output, size_t line_length,
321 simdutf_base64_options options);
322
323/* safe decoding that provides an in/out outlen parameter */
324simdutf_result simdutf_base64_to_binary_safe(
325 const char *input, size_t length, char *output, size_t *outlen,
326 simdutf_base64_options options,
327 simdutf_last_chunk_handling_options last_chunk_options,
328 bool decode_up_to_bad_char);
329simdutf_result simdutf_base64_to_binary_safe_utf16(
330 const char16_t *input, size_t length, char *output, size_t *outlen,
331 simdutf_base64_options options,
332 simdutf_last_chunk_handling_options last_chunk_options,
333 bool decode_up_to_bad_char);
334
335#ifdef __cplusplus
336} /* extern "C" */
337#endif
338
339#endif /* SIMDUTF_C_H */