simdutf 9.0.0
Unicode at GB/s.
Loading...
Searching...
No Matches
simdutf_c.h
/***
 * simdutf_c.h - C API for simdutf
 * This is currently experimental.
 * We are committed to keeping the C API, but there might be mistakes in our
 * implementation. Please report any issues you find.
 */
7
8#ifndef SIMDUTF_C_H
9#define SIMDUTF_C_H
10
11#include <stddef.h>
12#include <stdbool.h>
13#include <stdint.h>
14
/*
 * char16_t / char32_t for C callers.
 *
 * When the compiler offers __has_include and <uchar.h> is present, the
 * standard typedefs are used. Otherwise both names become macros for the
 * fixed-width <stdint.h> types.
 *
 * NOTE(review): in C++11 and later char16_t/char32_t are keywords; confirm
 * whether the macro fallback needs a __cplusplus guard for C++ compilers that
 * lack __has_include.
 */
#ifdef __has_include
  #if __has_include(<uchar.h>)
    #include <uchar.h>
  #else // !__has_include(<uchar.h>)
    #define char16_t uint16_t
    #define char32_t uint32_t
  #endif // __has_include(<uchar.h>)
#else // !defined(__has_include)
  #define char16_t uint16_t
  #define char32_t uint32_t
#endif // __has_include
26
27#ifdef __cplusplus
28extern "C" {
29#endif
30
/*
 * C-friendly subset of simdutf errors.
 *
 * SIMDUTF_ERROR_SUCCESS is pinned to 0 and every following enumerator takes
 * the next consecutive value, ending with SIMDUTF_ERROR_OTHER == 11.
 * NOTE(review): the numeric values presumably have to stay aligned with the
 * C++ error enumeration of the library -- confirm before reordering.
 */
typedef enum simdutf_error_code {
  SIMDUTF_ERROR_SUCCESS = 0,
  SIMDUTF_ERROR_HEADER_BITS,
  SIMDUTF_ERROR_TOO_SHORT,
  SIMDUTF_ERROR_TOO_LONG,
  SIMDUTF_ERROR_OVERLONG,
  SIMDUTF_ERROR_TOO_LARGE,
  SIMDUTF_ERROR_SURROGATE,
  SIMDUTF_ERROR_INVALID_BASE64_CHARACTER,
  SIMDUTF_ERROR_BASE64_INPUT_REMAINDER,
  SIMDUTF_ERROR_BASE64_EXTRA_BITS,
  SIMDUTF_ERROR_OUTPUT_BUFFER_TOO_SMALL,
  SIMDUTF_ERROR_OTHER
} simdutf_error_code;
46
47typedef struct simdutf_result {
48 simdutf_error_code error;
49 size_t count; /* position of error or number of code units validated */
51
52typedef struct simdutf_full_result {
53 simdutf_error_code error;
54 size_t input_count; /* number of input units consumed */
55 size_t output_count; /* number of output bytes written */
57
/*
 * Text encodings known to the C API.
 *
 * Apart from UNSPECIFIED (0), each value is a distinct power of two, so the
 * values can be OR-ed together without colliding.
 * NOTE(review): simdutf_detect_encodings() returns an int, presumably such a
 * combination of these values -- confirm against the implementation.
 */
typedef enum simdutf_encoding_type {
  SIMDUTF_ENCODING_UNSPECIFIED = 0,
  SIMDUTF_ENCODING_UTF8 = 1,
  SIMDUTF_ENCODING_UTF16_LE = 2,
  SIMDUTF_ENCODING_UTF16_BE = 4,
  SIMDUTF_ENCODING_UTF32_LE = 8,
  SIMDUTF_ENCODING_UTF32_BE = 16
} simdutf_encoding_type;
66
/*
 * Validate UTF-8: returns true iff input is valid UTF-8.
 *
 * buf: start of the input; len: its length (presumably in bytes -- confirm).
 */
bool simdutf_validate_utf8(const char *buf, size_t len);
69
70/* Validate UTF-8 with detailed result */
71simdutf_result simdutf_validate_utf8_with_errors(const char *buf, size_t len);
72
73/* Encoding detection */
74simdutf_encoding_type simdutf_autodetect_encoding(const char *input,
75 size_t length);
76int simdutf_detect_encodings(const char *input, size_t length);
77
78/* ASCII validation */
79bool simdutf_validate_ascii(const char *buf, size_t len);
80simdutf_result simdutf_validate_ascii_with_errors(const char *buf, size_t len);
81
/*
 * UTF-16 ASCII checks.
 *
 * One variant without an endianness suffix plus explicit big-endian (be) and
 * little-endian (le) variants.
 * NOTE(review): presumably each returns true when every UTF-16 code unit is in
 * the ASCII range, with len counted in 16-bit code units -- confirm.
 */
bool simdutf_validate_utf16_as_ascii(const char16_t *buf, size_t len);
bool simdutf_validate_utf16be_as_ascii(const char16_t *buf, size_t len);
bool simdutf_validate_utf16le_as_ascii(const char16_t *buf, size_t len);
86
87/* UTF-16/UTF-8/UTF-32 validation (native/endian-specific) */
88bool simdutf_validate_utf16(const char16_t *buf, size_t len);
89bool simdutf_validate_utf16le(const char16_t *buf, size_t len);
90bool simdutf_validate_utf16be(const char16_t *buf, size_t len);
91simdutf_result simdutf_validate_utf16_with_errors(const char16_t *buf,
92 size_t len);
93simdutf_result simdutf_validate_utf16le_with_errors(const char16_t *buf,
94 size_t len);
95simdutf_result simdutf_validate_utf16be_with_errors(const char16_t *buf,
96 size_t len);
97
98bool simdutf_validate_utf32(const char32_t *buf, size_t len);
99simdutf_result simdutf_validate_utf32_with_errors(const char32_t *buf,
100 size_t len);
101
/*
 * to_well_formed UTF-16 helpers.
 *
 * Each reads len elements from input and writes to output; nothing is
 * returned and no output capacity is passed.
 * NOTE(review): the output buffer presumably needs room for len code units,
 * and ill-formed sequences are presumably replaced -- confirm.
 */
void simdutf_to_well_formed_utf16le(const char16_t *input, size_t len,
                                    char16_t *output);
void simdutf_to_well_formed_utf16be(const char16_t *input, size_t len,
                                    char16_t *output);
void simdutf_to_well_formed_utf16(const char16_t *input, size_t len,
                                  char16_t *output);
109
/*
 * Counting.
 *
 * NOTE(review): presumably each returns the number of code points in the
 * input, with length counted in code units of the input type -- confirm.
 */
size_t simdutf_count_utf16(const char16_t *input, size_t length);
size_t simdutf_count_utf16le(const char16_t *input, size_t length);
size_t simdutf_count_utf16be(const char16_t *input, size_t length);
size_t simdutf_count_utf8(const char *input, size_t length);
115
/*
 * Length estimators.
 *
 * Named <target>_length_from_<source>. The two Latin-1 estimators that start
 * from UTF-16/UTF-32 take only a length, not the data.
 * NOTE(review): presumably the result is the number of target code units
 * needed to hold the converted input -- confirm.
 */
size_t simdutf_utf8_length_from_latin1(const char *input, size_t length);
size_t simdutf_latin1_length_from_utf8(const char *input, size_t length);
size_t simdutf_latin1_length_from_utf16(size_t length);
size_t simdutf_latin1_length_from_utf32(size_t length);
size_t simdutf_utf16_length_from_utf8(const char *input, size_t length);
size_t simdutf_utf32_length_from_utf8(const char *input, size_t length);
size_t simdutf_utf8_length_from_utf16(const char16_t *input, size_t length);
size_t simdutf_utf8_length_from_utf32(const char32_t *input, size_t length);
126simdutf_utf8_length_from_utf16_with_replacement(const char16_t *input,
127 size_t length);
128size_t simdutf_utf8_length_from_utf16le(const char16_t *input, size_t length);
129size_t simdutf_utf8_length_from_utf16be(const char16_t *input, size_t length);
131simdutf_utf8_length_from_utf16le_with_replacement(const char16_t *input,
132 size_t length);
134simdutf_utf8_length_from_utf16be_with_replacement(const char16_t *input,
135 size_t length);
136
/* Conversions: latin1 <-> utf8, utf8 <-> utf16/utf32, utf16 <-> utf8, etc. */

/*
 * Latin-1 to UTF-8 / UTF-16 / UTF-32.
 *
 * Only the _safe variant takes an output capacity (utf8_len). For the others
 * the caller must size the output buffer beforehand.
 * NOTE(review): the return value is presumably the number of output code
 * units written -- confirm.
 */
size_t simdutf_convert_latin1_to_utf8(const char *input, size_t length,
                                      char *output);
size_t simdutf_convert_latin1_to_utf8_safe(const char *input, size_t length,
                                           char *output, size_t utf8_len);
size_t simdutf_convert_latin1_to_utf16le(const char *input, size_t length,
                                         char16_t *output);
size_t simdutf_convert_latin1_to_utf16be(const char *input, size_t length,
                                         char16_t *output);
size_t simdutf_convert_latin1_to_utf16(const char *input, size_t length,
                                       char16_t *output);
size_t simdutf_convert_latin1_to_utf32(const char *input, size_t length,
                                       char32_t *output);
150
/*
 * UTF-8 to Latin-1 / UTF-16 / UTF-32.
 *
 * No output capacity is passed; the caller must size the output buffer.
 * NOTE(review): the return value is presumably the number of output code
 * units written, with 0 signalling invalid input -- confirm.
 */
size_t simdutf_convert_utf8_to_latin1(const char *input, size_t length,
                                      char *output);
size_t simdutf_convert_utf8_to_utf16le(const char *input, size_t length,
                                       char16_t *output);
size_t simdutf_convert_utf8_to_utf16be(const char *input, size_t length,
                                       char16_t *output);
size_t simdutf_convert_utf8_to_utf16(const char *input, size_t length,
                                     char16_t *output);

size_t simdutf_convert_utf8_to_utf32(const char *input, size_t length,
                                     char32_t *output);
162simdutf_result simdutf_convert_utf8_to_latin1_with_errors(const char *input,
163 size_t length,
164 char *output);
165simdutf_result simdutf_convert_utf8_to_utf16_with_errors(const char *input,
166 size_t length,
167 char16_t *output);
168simdutf_result simdutf_convert_utf8_to_utf16le_with_errors(const char *input,
169 size_t length,
170 char16_t *output);
171simdutf_result simdutf_convert_utf8_to_utf16be_with_errors(const char *input,
172 size_t length,
173 char16_t *output);
174simdutf_result simdutf_convert_utf8_to_utf32_with_errors(const char *input,
175 size_t length,
176 char32_t *output);
177
/*
 * Conversions assuming valid input.
 *
 * NOTE(review): behaviour on invalid UTF-8 is not visible from these
 * declarations; presumably the caller must validate first -- confirm.
 */
size_t simdutf_convert_valid_utf8_to_latin1(const char *input, size_t length,
                                            char *output);
size_t simdutf_convert_valid_utf8_to_utf16le(const char *input, size_t length,
                                             char16_t *output);
size_t simdutf_convert_valid_utf8_to_utf16be(const char *input, size_t length,
                                             char16_t *output);
size_t simdutf_convert_valid_utf8_to_utf32(const char *input, size_t length,
                                           char32_t *output);
187
/*
 * UTF-16 -> UTF-8 and related conversions.
 *
 * Only simdutf_convert_utf16_to_utf8_safe takes an output capacity
 * (utf8_len); the other functions rely on the caller to size the buffer.
 * NOTE(review): the return value is presumably the number of bytes written
 * -- confirm.
 */
size_t simdutf_convert_utf16_to_utf8(const char16_t *input, size_t length,
                                     char *output);
size_t simdutf_convert_utf16le_to_utf8(const char16_t *input, size_t length,
                                       char *output);
size_t simdutf_convert_utf16be_to_utf8(const char16_t *input, size_t length,
                                       char *output);
size_t simdutf_convert_utf16_to_utf8_safe(const char16_t *input, size_t length,
                                          char *output, size_t utf8_len);
size_t simdutf_convert_utf16_to_latin1(const char16_t *input, size_t length,
                                       char *output);
size_t simdutf_convert_utf16le_to_latin1(const char16_t *input, size_t length,
                                         char *output);
size_t simdutf_convert_utf16be_to_latin1(const char16_t *input, size_t length,
                                         char *output);
204simdutf_convert_utf16_to_latin1_with_errors(const char16_t *input,
205 size_t length, char *output);
207simdutf_convert_utf16le_to_latin1_with_errors(const char16_t *input,
208 size_t length, char *output);
210simdutf_convert_utf16be_to_latin1_with_errors(const char16_t *input,
211 size_t length, char *output);
212
213simdutf_result simdutf_convert_utf16_to_utf8_with_errors(const char16_t *input,
214 size_t length,
215 char *output);
217simdutf_convert_utf16le_to_utf8_with_errors(const char16_t *input,
218 size_t length, char *output);
220simdutf_convert_utf16be_to_utf8_with_errors(const char16_t *input,
221 size_t length, char *output);
222
/*
 * UTF-16 -> UTF-8 / Latin-1 for input the caller already knows to be valid.
 * NOTE(review): behaviour on invalid UTF-16 is not visible from these
 * declarations -- confirm before passing unvalidated data.
 */
size_t simdutf_convert_valid_utf16_to_utf8(const char16_t *input, size_t length,
                                           char *output);
size_t simdutf_convert_valid_utf16_to_latin1(const char16_t *input,
                                             size_t length, char *output);
size_t simdutf_convert_valid_utf16le_to_latin1(const char16_t *input,
                                               size_t length, char *output);
size_t simdutf_convert_valid_utf16be_to_latin1(const char16_t *input,
                                               size_t length, char *output);

size_t simdutf_convert_valid_utf16le_to_utf8(const char16_t *input,
                                             size_t length, char *output);
size_t simdutf_convert_valid_utf16be_to_utf8(const char16_t *input,
                                             size_t length, char *output);
236
/*
 * UTF-16 -> UTF-32 conversions (the UTF-32 -> UTF-16 direction is declared
 * with the other UTF-32 conversions).
 * NOTE(review): the return value is presumably the number of char32_t units
 * written -- confirm.
 */
size_t simdutf_convert_utf16_to_utf32(const char16_t *input, size_t length,
                                      char32_t *output);
size_t simdutf_convert_utf16le_to_utf32(const char16_t *input, size_t length,
                                        char32_t *output);
size_t simdutf_convert_utf16be_to_utf32(const char16_t *input, size_t length,
                                        char32_t *output);
244simdutf_result simdutf_convert_utf16_to_utf32_with_errors(const char16_t *input,
245 size_t length,
246 char32_t *output);
248simdutf_convert_utf16le_to_utf32_with_errors(const char16_t *input,
249 size_t length, char32_t *output);
251simdutf_convert_utf16be_to_utf32_with_errors(const char16_t *input,
252 size_t length, char32_t *output);
253
/*
 * Valid UTF-16 conversions (UTF-16 -> UTF-32 for input the caller already
 * knows to be valid).
 */
size_t simdutf_convert_valid_utf16_to_utf32(const char16_t *input,
                                            size_t length, char32_t *output);
size_t simdutf_convert_valid_utf16le_to_utf32(const char16_t *input,
                                              size_t length, char32_t *output);
size_t simdutf_convert_valid_utf16be_to_utf32(const char16_t *input,
                                              size_t length, char32_t *output);
261
262/* UTF-32 -> ... conversions */
263size_t simdutf_convert_utf32_to_utf8(const char32_t *input, size_t length,
264 char *output);
265simdutf_result simdutf_convert_utf32_to_utf8_with_errors(const char32_t *input,
266 size_t length,
267 char *output);
268size_t simdutf_convert_valid_utf32_to_utf8(const char32_t *input, size_t length,
269 char *output);
270
271size_t simdutf_convert_utf32_to_utf16(const char32_t *input, size_t length,
272 char16_t *output);
273size_t simdutf_convert_utf32_to_utf16le(const char32_t *input, size_t length,
274 char16_t *output);
275size_t simdutf_convert_utf32_to_utf16be(const char32_t *input, size_t length,
276 char16_t *output);
278simdutf_convert_utf32_to_latin1_with_errors(const char32_t *input,
279 size_t length, char *output);
280
/*
 * --- Find helpers ---
 *
 * Search the range delimited by start and end for character and return a
 * pointer into that range.
 * NOTE(review): presumably the range is half-open [start, end) and end is
 * returned when character is absent -- confirm.
 */
const char *simdutf_find(const char *start, const char *end, char character);
const char16_t *simdutf_find_utf16(const char16_t *start, const char16_t *end,
                                   char16_t character);
285
/*
 * --- Base64 enums and helpers ---
 *
 * Alphabet / padding / leniency selection for the base64 functions.
 * The numeric values compose like bit flags: 3 == 1|2, 5 == 4|1 and
 * 12 == 8|4.
 * NOTE(review): presumably bit 0 selects the URL alphabet, bit 1 inverts the
 * alphabet's default padding, bit 2 accepts garbage and bit 3 accepts either
 * alphabet -- confirm against the implementation.
 */
typedef enum simdutf_base64_options {
  SIMDUTF_BASE64_DEFAULT = 0,
  SIMDUTF_BASE64_URL = 1,
  SIMDUTF_BASE64_DEFAULT_NO_PADDING = 2,
  SIMDUTF_BASE64_URL_WITH_PADDING = 3,
  SIMDUTF_BASE64_DEFAULT_ACCEPT_GARBAGE = 4,
  SIMDUTF_BASE64_URL_ACCEPT_GARBAGE = 5,
  SIMDUTF_BASE64_DEFAULT_OR_URL = 8,
  SIMDUTF_BASE64_DEFAULT_OR_URL_ACCEPT_GARBAGE = 12
} simdutf_base64_options;
297
/*
 * How the base64 decoders treat the final chunk of input. Values are
 * consecutive from 0 (LOOSE) to 3 (ONLY_FULL_CHUNKS).
 * NOTE(review): the exact decoding rules behind each mode are not visible in
 * this header -- confirm against the implementation.
 */
typedef enum simdutf_last_chunk_handling_options {
  SIMDUTF_LAST_CHUNK_LOOSE = 0,
  SIMDUTF_LAST_CHUNK_STRICT = 1,
  SIMDUTF_LAST_CHUNK_STOP_BEFORE_PARTIAL = 2,
  SIMDUTF_LAST_CHUNK_ONLY_FULL_CHUNKS = 3
} simdutf_last_chunk_handling_options;
304
/*
 * maximal binary length estimators
 *
 * NOTE(review): presumably an upper bound on the number of decoded bytes for
 * the given base64 input, suitable for sizing the decode buffer -- confirm.
 */
size_t simdutf_maximal_binary_length_from_base64(const char *input,
                                                 size_t length);
size_t simdutf_maximal_binary_length_from_base64_utf16(const char16_t *input,
                                                       size_t length);
310
311/* base64 decoding/encoding */
312simdutf_result simdutf_base64_to_binary(
313 const char *input, size_t length, char *output,
314 simdutf_base64_options options,
315 simdutf_last_chunk_handling_options last_chunk_options);
316simdutf_result simdutf_base64_to_binary_utf16(
317 const char16_t *input, size_t length, char *output,
318 simdutf_base64_options options,
319 simdutf_last_chunk_handling_options last_chunk_options);
320
321size_t simdutf_base64_length_from_binary(size_t length,
322 simdutf_base64_options options);
323size_t simdutf_base64_length_from_binary_with_lines(
324 size_t length, simdutf_base64_options options, size_t line_length);
325
326size_t simdutf_binary_to_base64(const char *input, size_t length, char *output,
327 simdutf_base64_options options);
328size_t simdutf_binary_to_base64_with_lines(const char *input, size_t length,
329 char *output, size_t line_length,
330 simdutf_base64_options options);
331
332/* safe decoding that provides an in/out outlen parameter */
333simdutf_result simdutf_base64_to_binary_safe(
334 const char *input, size_t length, char *output, size_t *outlen,
335 simdutf_base64_options options,
336 simdutf_last_chunk_handling_options last_chunk_options,
337 bool decode_up_to_bad_char);
338simdutf_result simdutf_base64_to_binary_safe_utf16(
339 const char16_t *input, size_t length, char *output, size_t *outlen,
340 simdutf_base64_options options,
341 simdutf_last_chunk_handling_options last_chunk_options,
342 bool decode_up_to_bad_char);
343
344/* detailed decoding returning input_count and output_count */
345simdutf_full_result simdutf_base64_to_binary_details(
346 const char *input, size_t length, char *output,
347 simdutf_base64_options options,
348 simdutf_last_chunk_handling_options last_chunk_options);
349simdutf_full_result simdutf_base64_to_binary_details_utf16(
350 const char16_t *input, size_t length, char *output,
351 simdutf_base64_options options,
352 simdutf_last_chunk_handling_options last_chunk_options);
353
354/* single-character base64 validation */
355bool simdutf_base64_valid(char input, simdutf_base64_options options);
356bool simdutf_base64_valid_utf16(char16_t input, simdutf_base64_options options);
357
358#ifdef __cplusplus
359} /* extern "C" */
360#endif
361
362#endif /* SIMDUTF_C_H */