simdutf 9.0.0
Unicode at GB/s.
Loading...
Searching...
No Matches
implementation.h
1#ifndef SIMDUTF_IMPLEMENTATION_H
2#define SIMDUTF_IMPLEMENTATION_H
3#if !defined(SIMDUTF_NO_THREADS)
4 #include <atomic>
5#endif
6#ifdef SIMDUTF_INTERNAL_TESTS
7 #include <vector>
8#endif
9#include "simdutf/common_defs.h"
10#include "simdutf/compiler_check.h"
11#include "simdutf/encoding_types.h"
12#include "simdutf/error.h"
13#include "simdutf/internal/isadetection.h"
14
15#include <string_view>
16#if SIMDUTF_SPAN
17 #include <concepts>
18 #include <type_traits>
19 #include <span>
20 #include <tuple>
21 #include <utility> // for std::unreachable
22#endif
23// The following defines are conditionally enabled/disabled during amalgamation.
24// By default all features are enabled, regular code shouldn't check them. Only
25// when user code really relies on a selected subset, it's good to verify these
26// flags, like:
27//
28// #if !SIMDUTF_FEATURE_UTF16
29// # error("Please amalgamate simdutf with UTF-16 support")
30// #endif
31//
32#ifndef SIMDUTF_FEATURE_DETECT_ENCODING
33 #define SIMDUTF_FEATURE_DETECT_ENCODING 1
34#endif
35#ifndef SIMDUTF_FEATURE_ASCII
36 #define SIMDUTF_FEATURE_ASCII 1
37#endif
38#ifndef SIMDUTF_FEATURE_LATIN1
39 #define SIMDUTF_FEATURE_LATIN1 1
40#endif
41#ifndef SIMDUTF_FEATURE_UTF8
42 #define SIMDUTF_FEATURE_UTF8 1
43#endif
44#ifndef SIMDUTF_FEATURE_UTF16
45 #define SIMDUTF_FEATURE_UTF16 1
46#endif
47#ifndef SIMDUTF_FEATURE_UTF32
48 #define SIMDUTF_FEATURE_UTF32 1
49#endif
50#ifndef SIMDUTF_FEATURE_BASE64
51 #define SIMDUTF_FEATURE_BASE64 1
52#endif
53
54#if SIMDUTF_CPLUSPLUS23
55 #include <simdutf/constexpr_ptr.h>
56#endif
57
58#if SIMDUTF_SPAN
59/// helpers placed in namespace detail are not a part of the public API
60namespace simdutf {
61namespace detail {
62/**
63 * matches a byte, in the many ways C++ allows. note that these
64 * are all distinct types.
65 */
66template <typename T>
67concept byte_like = std::is_same_v<T, std::byte> || //
68 std::is_same_v<T, char> || //
69 std::is_same_v<T, signed char> || //
70 std::is_same_v<T, unsigned char> || //
71 std::is_same_v<T, char8_t>;
72
73template <typename T>
74concept is_byte_like = byte_like<std::remove_cvref_t<T>>;
75
76template <typename T>
77concept is_pointer = std::is_pointer_v<T>;
78
79/**
80 * matches anything that behaves like std::span and points to character-like
81 * data such as: std::byte, char, unsigned char, signed char, std::int8_t,
82 * std::uint8_t
83 */
84template <typename T>
85concept input_span_of_byte_like = requires(const T &t) {
86 { t.size() } noexcept -> std::convertible_to<std::size_t>;
87 { t.data() } noexcept -> is_pointer;
88 { *t.data() } noexcept -> is_byte_like;
89};
90
91template <typename T>
92concept is_mutable = !std::is_const_v<std::remove_reference_t<T>>;
93
94/**
95 * like span_of_byte_like, but for an output span (intended to be written to)
96 */
97template <typename T>
98concept output_span_of_byte_like = requires(T &t) {
99 { t.size() } noexcept -> std::convertible_to<std::size_t>;
100 { t.data() } noexcept -> is_pointer;
101 { *t.data() } noexcept -> is_byte_like;
102 { *t.data() } noexcept -> is_mutable;
103};
104
105/**
106 * a pointer like object, when indexed, results in a byte like result.
107 * valid examples: char*, const char*, std::array<char,10>
108 * invalid examples: int*, std::array<int,10>
109 */
110template <class InputPtr>
111concept indexes_into_byte_like = requires(InputPtr p) {
112 { std::decay_t<decltype(p[0])>{} } -> simdutf::detail::byte_like;
113};
114template <class InputPtr>
115concept indexes_into_utf16 = requires(InputPtr p) {
116 { std::decay_t<decltype(p[0])>{} } -> std::same_as<char16_t>;
117};
118template <class InputPtr>
119concept indexes_into_utf32 = requires(InputPtr p) {
120 { std::decay_t<decltype(p[0])>{} } -> std::same_as<char32_t>;
121};
122
123template <class InputPtr>
124concept index_assignable_from_char = requires(InputPtr p, char s) {
125 { p[0] = s };
126};
127
128/**
129 * a pointer like object that results in a uint32_t when indexed.
130 * valid examples: uint32_t*
131 */
132template <class InputPtr>
133concept indexes_into_uint32 = requires(InputPtr p) {
134 { std::decay_t<decltype(p[0])>{} } -> std::same_as<std::uint32_t>;
135};
136} // namespace detail
137} // namespace simdutf
138#endif // SIMDUTF_SPAN
139
140// these includes are needed for constexpr support. they are
141// not part of the public api.
142#include <simdutf/scalar/swap_bytes.h>
143#include <simdutf/scalar/ascii.h>
144#include <simdutf/scalar/atomic_util.h>
145#include <simdutf/scalar/latin1.h>
146#include <simdutf/scalar/latin1_to_utf16/latin1_to_utf16.h>
147#include <simdutf/scalar/latin1_to_utf32/latin1_to_utf32.h>
148#include <simdutf/scalar/latin1_to_utf8/latin1_to_utf8.h>
149#include <simdutf/scalar/utf16.h>
150#include <simdutf/scalar/utf16_to_latin1/utf16_to_latin1.h>
151#include <simdutf/scalar/utf16_to_latin1/valid_utf16_to_latin1.h>
152#include <simdutf/scalar/utf16_to_utf32/utf16_to_utf32.h>
153#include <simdutf/scalar/utf16_to_utf32/valid_utf16_to_utf32.h>
154#include <simdutf/scalar/utf16_to_utf8/utf16_to_utf8.h>
155#include <simdutf/scalar/utf16_to_utf8/valid_utf16_to_utf8.h>
156#include <simdutf/scalar/utf32.h>
157#include <simdutf/scalar/utf32_to_latin1/utf32_to_latin1.h>
158#include <simdutf/scalar/utf32_to_latin1/valid_utf32_to_latin1.h>
159#include <simdutf/scalar/utf32_to_utf16/utf32_to_utf16.h>
160#include <simdutf/scalar/utf32_to_utf16/valid_utf32_to_utf16.h>
161#include <simdutf/scalar/utf32_to_utf8/utf32_to_utf8.h>
162#include <simdutf/scalar/utf32_to_utf8/valid_utf32_to_utf8.h>
163#include <simdutf/scalar/utf8.h>
164#include <simdutf/scalar/utf8_to_latin1/utf8_to_latin1.h>
165#include <simdutf/scalar/utf8_to_latin1/valid_utf8_to_latin1.h>
166#include <simdutf/scalar/utf8_to_utf16/utf8_to_utf16.h>
167#include <simdutf/scalar/utf8_to_utf16/valid_utf8_to_utf16.h>
168#include <simdutf/scalar/utf8_to_utf32/utf8_to_utf32.h>
169#include <simdutf/scalar/utf8_to_utf32/valid_utf8_to_utf32.h>
170
171namespace simdutf {
172
/// Default line length used for base64 encoding with lines.
constexpr size_t default_line_length{76};
175
176#if SIMDUTF_FEATURE_DETECT_ENCODING
177/**
178 * Autodetect the encoding of the input, a single encoding is recommended.
179 * E.g., the function might return simdutf::encoding_type::UTF8,
180 * simdutf::encoding_type::UTF16_LE, simdutf::encoding_type::UTF16_BE, or
181 * simdutf::encoding_type::UTF32_LE.
182 *
183 * @param input the string to analyze.
184 * @param length the length of the string in bytes.
185 * @return the detected encoding type
186 */
187simdutf_warn_unused simdutf::encoding_type
188autodetect_encoding(const char *input, size_t length) noexcept;
189simdutf_really_inline simdutf_warn_unused simdutf::encoding_type
190autodetect_encoding(const uint8_t *input, size_t length) noexcept {
191 return autodetect_encoding(reinterpret_cast<const char *>(input), length);
192}
193 #if SIMDUTF_SPAN
194/**
195 * Autodetect the encoding of the input, a single encoding is recommended.
196 * E.g., the function might return simdutf::encoding_type::UTF8,
197 * simdutf::encoding_type::UTF16_LE, simdutf::encoding_type::UTF16_BE, or
198 * simdutf::encoding_type::UTF32_LE.
199 *
200 * @param input the string to analyze. can be a anything span-like that has a
201 * data() and size() that points to character data: std::string,
202 * std::string_view, std::vector<char>, std::span<const std::byte> etc.
203 * @return the detected encoding type
204 */
205simdutf_really_inline simdutf_warn_unused simdutf::encoding_type
206autodetect_encoding(
207 const detail::input_span_of_byte_like auto &input) noexcept {
208 return autodetect_encoding(reinterpret_cast<const char *>(input.data()),
209 input.size());
210}
211 #endif // SIMDUTF_SPAN
212
213/**
214 * Autodetect the possible encodings of the input in one pass.
215 * E.g., if the input might be UTF-16LE or UTF-8, this function returns
216 * the value (simdutf::encoding_type::UTF8 | simdutf::encoding_type::UTF16_LE).
217 *
218 * Overridden by each implementation.
219 *
220 * @param input the string to analyze.
221 * @param length the length of the string in bytes.
222 * @return the detected encoding type
223 */
224simdutf_warn_unused int detect_encodings(const char *input,
225 size_t length) noexcept;
226simdutf_really_inline simdutf_warn_unused int
227detect_encodings(const uint8_t *input, size_t length) noexcept {
228 return detect_encodings(reinterpret_cast<const char *>(input), length);
229}
230 #if SIMDUTF_SPAN
231simdutf_really_inline simdutf_warn_unused int
232detect_encodings(const detail::input_span_of_byte_like auto &input) noexcept {
233 return detect_encodings(reinterpret_cast<const char *>(input.data()),
234 input.size());
235}
236 #endif // SIMDUTF_SPAN
237#endif // SIMDUTF_FEATURE_DETECT_ENCODING
238
239#if SIMDUTF_FEATURE_UTF8 || SIMDUTF_FEATURE_DETECT_ENCODING
240/**
241 * Validate the UTF-8 string. This function may be best when you expect
242 * the input to be almost always valid. Otherwise, consider using
243 * validate_utf8_with_errors.
244 *
245 * Overridden by each implementation.
246 *
247 * @param buf the UTF-8 string to validate.
248 * @param len the length of the string in bytes.
249 * @return true if and only if the string is valid UTF-8.
250 */
251simdutf_warn_unused bool validate_utf8(const char *buf, size_t len) noexcept;
252 #if SIMDUTF_SPAN
253simdutf_constexpr23 simdutf_really_inline simdutf_warn_unused bool
254validate_utf8(const detail::input_span_of_byte_like auto &input) noexcept {
255 #if SIMDUTF_CPLUSPLUS23
256 if consteval {
257 return scalar::utf8::validate(
258 detail::constexpr_cast_ptr<uint8_t>(input.data()), input.size());
259 } else
260 #endif
261 {
262 return validate_utf8(reinterpret_cast<const char *>(input.data()),
263 input.size());
264 }
265}
266 #endif // SIMDUTF_SPAN
267#endif // SIMDUTF_FEATURE_UTF8 || SIMDUTF_FEATURE_DETECT_ENCODING
268
269#if SIMDUTF_FEATURE_UTF8
270/**
271 * Validate the UTF-8 string and stop on error.
272 *
273 * Overridden by each implementation.
274 *
275 * @param buf the UTF-8 string to validate.
276 * @param len the length of the string in bytes.
277 * @return a result pair struct (of type simdutf::result containing the two
278 * fields error and count) with an error code and either position of the error
279 * (in the input in code units) if any, or the number of code units validated if
280 * successful.
281 */
282simdutf_warn_unused result validate_utf8_with_errors(const char *buf,
283 size_t len) noexcept;
284 #if SIMDUTF_SPAN
285simdutf_really_inline simdutf_constexpr23 simdutf_warn_unused result
286validate_utf8_with_errors(
287 const detail::input_span_of_byte_like auto &input) noexcept {
288 #if SIMDUTF_CPLUSPLUS23
289 if consteval {
290 return scalar::utf8::validate_with_errors(
291 detail::constexpr_cast_ptr<uint8_t>(input.data()), input.size());
292 } else
293 #endif
294 {
295 return validate_utf8_with_errors(
296 reinterpret_cast<const char *>(input.data()), input.size());
297 }
298}
299 #endif // SIMDUTF_SPAN
300#endif // SIMDUTF_FEATURE_UTF8
301
302#if SIMDUTF_FEATURE_ASCII
303/**
304 * Validate the ASCII string.
305 *
306 * Overridden by each implementation.
307 *
308 * @param buf the ASCII string to validate.
309 * @param len the length of the string in bytes.
310 * @return true if and only if the string is valid ASCII.
311 */
312simdutf_warn_unused bool validate_ascii(const char *buf, size_t len) noexcept;
313 #if SIMDUTF_SPAN
314simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 bool
315validate_ascii(const detail::input_span_of_byte_like auto &input) noexcept {
316 #if SIMDUTF_CPLUSPLUS23
317 if consteval {
318 return scalar::ascii::validate(
319 detail::constexpr_cast_ptr<std::uint8_t>(input.data()), input.size());
320 } else
321 #endif
322 {
323 return validate_ascii(reinterpret_cast<const char *>(input.data()),
324 input.size());
325 }
326}
327 #endif // SIMDUTF_SPAN
328
329/**
330 * Validate the ASCII string and stop on error. It might be faster than
331 * validate_utf8 when an error is expected to occur early.
332 *
333 * Overridden by each implementation.
334 *
335 * @param buf the ASCII string to validate.
336 * @param len the length of the string in bytes.
337 * @return a result pair struct (of type simdutf::result containing the two
338 * fields error and count) with an error code and either position of the error
339 * (in the input in code units) if any, or the number of code units validated if
340 * successful.
341 */
342simdutf_warn_unused result validate_ascii_with_errors(const char *buf,
343 size_t len) noexcept;
344 #if SIMDUTF_SPAN
345simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 result
346validate_ascii_with_errors(
347 const detail::input_span_of_byte_like auto &input) noexcept {
348 #if SIMDUTF_CPLUSPLUS23
349 if consteval {
350 return scalar::ascii::validate_with_errors(
351 detail::constexpr_cast_ptr<std::uint8_t>(input.data()), input.size());
352 } else
353 #endif
354 {
355 return validate_ascii_with_errors(
356 reinterpret_cast<const char *>(input.data()), input.size());
357 }
358}
359 #endif // SIMDUTF_SPAN
360#endif // SIMDUTF_FEATURE_ASCII
361
362#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_ASCII
363/**
364 * Validate the ASCII string as a UTF-16 sequence.
365 * An UTF-16 sequence is considered an ASCII sequence
366 * if it could be converted to an ASCII string losslessly.
367 *
368 * Overridden by each implementation.
369 *
370 * @param buf the UTF-16 string to validate.
371 * @param len the length of the string in bytes.
372 * @return true if and only if the string is valid ASCII.
373 */
374simdutf_warn_unused bool validate_utf16_as_ascii(const char16_t *buf,
375 size_t len) noexcept;
376 #if SIMDUTF_SPAN
377simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 bool
378validate_utf16_as_ascii(std::span<const char16_t> input) noexcept {
379 #if SIMDUTF_CPLUSPLUS23
380 if consteval {
381 return scalar::utf16::validate_as_ascii<endianness::NATIVE>(input.data(),
382 input.size());
383 } else
384 #endif
385 {
386 return validate_utf16_as_ascii(input.data(), input.size());
387 }
388}
389 #endif // SIMDUTF_SPAN
390
391/**
392 * Validate the ASCII string as a UTF-16BE sequence.
393 * An UTF-16 sequence is considered an ASCII sequence
394 * if it could be converted to an ASCII string losslessly.
395 *
396 * Overridden by each implementation.
397 *
398 * @param buf the UTF-16BE string to validate.
399 * @param len the length of the string in bytes.
400 * @return true if and only if the string is valid ASCII.
401 */
402simdutf_warn_unused bool validate_utf16be_as_ascii(const char16_t *buf,
403 size_t len) noexcept;
404 #if SIMDUTF_SPAN
405simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 bool
406validate_utf16be_as_ascii(std::span<const char16_t> input) noexcept {
407 #if SIMDUTF_CPLUSPLUS23
408 if consteval {
409 return scalar::utf16::validate_as_ascii<endianness::BIG>(input.data(),
410 input.size());
411 } else
412 #endif
413 {
414 return validate_utf16be_as_ascii(input.data(), input.size());
415 }
416}
417 #endif // SIMDUTF_SPAN
418
419/**
420 * Validate the ASCII string as a UTF-16LE sequence.
421 * An UTF-16 sequence is considered an ASCII sequence
422 * if it could be converted to an ASCII string losslessly.
423 *
424 * Overridden by each implementation.
425 *
426 * @param buf the UTF-16LE string to validate.
427 * @param len the length of the string in bytes.
428 * @return true if and only if the string is valid ASCII.
429 */
430simdutf_warn_unused bool validate_utf16le_as_ascii(const char16_t *buf,
431 size_t len) noexcept;
432 #if SIMDUTF_SPAN
433simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 bool
434validate_utf16le_as_ascii(std::span<const char16_t> input) noexcept {
435 #if SIMDUTF_CPLUSPLUS23
436 if consteval {
437 return scalar::utf16::validate_as_ascii<endianness::LITTLE>(input.data(),
438 input.size());
439 } else
440 #endif
441 {
442 return validate_utf16le_as_ascii(input.data(), input.size());
443 }
444}
445 #endif // SIMDUTF_SPAN
446#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_ASCII
447
448#if SIMDUTF_FEATURE_UTF16
449/**
450 * Using native endianness; Validate the UTF-16 string.
451 * This function may be best when you expect the input to be almost always
452 * valid. Otherwise, consider using validate_utf16_with_errors.
453 *
454 * Overridden by each implementation.
455 *
456 * This function is not BOM-aware.
457 *
458 * @param buf the UTF-16 string to validate.
459 * @param len the length of the string in number of 2-byte code units
460 * (char16_t).
461 * @return true if and only if the string is valid UTF-16.
462 */
463simdutf_warn_unused bool validate_utf16(const char16_t *buf,
464 size_t len) noexcept;
465 #if SIMDUTF_SPAN
466simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 bool
467validate_utf16(std::span<const char16_t> input) noexcept {
468 #if SIMDUTF_CPLUSPLUS23
469 if consteval {
470 return scalar::utf16::validate<endianness::NATIVE>(input.data(),
471 input.size());
472 } else
473 #endif
474 {
475 return validate_utf16(input.data(), input.size());
476 }
477}
478 #endif // SIMDUTF_SPAN
479#endif // SIMDUTF_FEATURE_UTF16
480
481#if SIMDUTF_FEATURE_UTF16 || SIMDUTF_FEATURE_DETECT_ENCODING
482/**
483 * Validate the UTF-16LE string. This function may be best when you expect
484 * the input to be almost always valid. Otherwise, consider using
485 * validate_utf16le_with_errors.
486 *
487 * Overridden by each implementation.
488 *
489 * This function is not BOM-aware.
490 *
491 * @param buf the UTF-16LE string to validate.
492 * @param len the length of the string in number of 2-byte code units
493 * (char16_t).
494 * @return true if and only if the string is valid UTF-16LE.
495 */
496simdutf_warn_unused bool validate_utf16le(const char16_t *buf,
497 size_t len) noexcept;
498 #if SIMDUTF_SPAN
499simdutf_really_inline simdutf_constexpr23 simdutf_warn_unused bool
500validate_utf16le(std::span<const char16_t> input) noexcept {
501 #if SIMDUTF_CPLUSPLUS23
502 if consteval {
503 return scalar::utf16::validate<endianness::LITTLE>(input.data(),
504 input.size());
505 } else
506 #endif
507 {
508 return validate_utf16le(input.data(), input.size());
509 }
510}
511 #endif // SIMDUTF_SPAN
512#endif // SIMDUTF_FEATURE_UTF16 || SIMDUTF_FEATURE_DETECT_ENCODING
513
514#if SIMDUTF_FEATURE_UTF16
515/**
516 * Validate the UTF-16BE string. This function may be best when you expect
517 * the input to be almost always valid. Otherwise, consider using
518 * validate_utf16be_with_errors.
519 *
520 * Overridden by each implementation.
521 *
522 * This function is not BOM-aware.
523 *
524 * @param buf the UTF-16BE string to validate.
525 * @param len the length of the string in number of 2-byte code units
526 * (char16_t).
527 * @return true if and only if the string is valid UTF-16BE.
528 */
529simdutf_warn_unused bool validate_utf16be(const char16_t *buf,
530 size_t len) noexcept;
531 #if SIMDUTF_SPAN
532simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 bool
533validate_utf16be(std::span<const char16_t> input) noexcept {
534 #if SIMDUTF_CPLUSPLUS23
535 if consteval {
536 return scalar::utf16::validate<endianness::BIG>(input.data(), input.size());
537 } else
538 #endif
539 {
540 return validate_utf16be(input.data(), input.size());
541 }
542}
543 #endif // SIMDUTF_SPAN
544
545/**
546 * Using native endianness; Validate the UTF-16 string and stop on error.
547 * It might be faster than validate_utf16 when an error is expected to occur
548 * early.
549 *
550 * Overridden by each implementation.
551 *
552 * This function is not BOM-aware.
553 *
554 * @param buf the UTF-16 string to validate.
555 * @param len the length of the string in number of 2-byte code units
556 * (char16_t).
557 * @return a result pair struct (of type simdutf::result containing the two
558 * fields error and count) with an error code and either position of the error
559 * (in the input in code units) if any, or the number of code units validated if
560 * successful.
561 */
562simdutf_warn_unused result validate_utf16_with_errors(const char16_t *buf,
563 size_t len) noexcept;
564 #if SIMDUTF_SPAN
565simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 result
566validate_utf16_with_errors(std::span<const char16_t> input) noexcept {
567 #if SIMDUTF_CPLUSPLUS23
568 if consteval {
569 return scalar::utf16::validate_with_errors<endianness::NATIVE>(
570 input.data(), input.size());
571 } else
572 #endif
573 {
574 return validate_utf16_with_errors(input.data(), input.size());
575 }
576}
577 #endif // SIMDUTF_SPAN
578
579/**
580 * Validate the UTF-16LE string and stop on error. It might be faster than
581 * validate_utf16le when an error is expected to occur early.
582 *
583 * Overridden by each implementation.
584 *
585 * This function is not BOM-aware.
586 *
587 * @param buf the UTF-16LE string to validate.
588 * @param len the length of the string in number of 2-byte code units
589 * (char16_t).
590 * @return a result pair struct (of type simdutf::result containing the two
591 * fields error and count) with an error code and either position of the error
592 * (in the input in code units) if any, or the number of code units validated if
593 * successful.
594 */
595simdutf_warn_unused result validate_utf16le_with_errors(const char16_t *buf,
596 size_t len) noexcept;
597 #if SIMDUTF_SPAN
598simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 result
599validate_utf16le_with_errors(std::span<const char16_t> input) noexcept {
600 #if SIMDUTF_CPLUSPLUS23
601 if consteval {
602 return scalar::utf16::validate_with_errors<endianness::LITTLE>(
603 input.data(), input.size());
604 } else
605 #endif
606 {
607 return validate_utf16le_with_errors(input.data(), input.size());
608 }
609}
610 #endif // SIMDUTF_SPAN
611
612/**
613 * Validate the UTF-16BE string and stop on error. It might be faster than
614 * validate_utf16be when an error is expected to occur early.
615 *
616 * Overridden by each implementation.
617 *
618 * This function is not BOM-aware.
619 *
620 * @param buf the UTF-16BE string to validate.
621 * @param len the length of the string in number of 2-byte code units
622 * (char16_t).
623 * @return a result pair struct (of type simdutf::result containing the two
624 * fields error and count) with an error code and either position of the error
625 * (in the input in code units) if any, or the number of code units validated if
626 * successful.
627 */
628simdutf_warn_unused result validate_utf16be_with_errors(const char16_t *buf,
629 size_t len) noexcept;
630 #if SIMDUTF_SPAN
631simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 result
632validate_utf16be_with_errors(std::span<const char16_t> input) noexcept {
633 #if SIMDUTF_CPLUSPLUS23
634 if consteval {
635 return scalar::utf16::validate_with_errors<endianness::BIG>(input.data(),
636 input.size());
637 } else
638 #endif
639 {
640 return validate_utf16be_with_errors(input.data(), input.size());
641 }
642}
643 #endif // SIMDUTF_SPAN
644
/**
 * Repair an ill-formed UTF-16LE string: every mismatched surrogate is
 * replaced with the Unicode replacement character U+FFFD. When input and
 * output point to different memory areas the string is copied, and the output
 * memory is expected to be at least as big as the input. Input and output may
 * also be equal, which turns the replacement into an in-place operation.
 *
 * @param input the UTF-16LE string to correct.
 * @param len the length of the string in number of 2-byte code units
 * (char16_t).
 * @param output the output buffer.
 */
void to_well_formed_utf16le(const char16_t *input, size_t len,
                            char16_t *output) noexcept;
  #if SIMDUTF_SPAN
/**
 * Span overload of to_well_formed_utf16le. Only output.data() is used; the
 * size of the output span is not consulted. During constant evaluation
 * (C++23) the scalar implementation is used; at run time the call is
 * forwarded to the pointer/length overload.
 */
simdutf_really_inline simdutf_constexpr23 void
to_well_formed_utf16le(std::span<const char16_t> input,
                       std::span<char16_t> output) noexcept {
    #if SIMDUTF_CPLUSPLUS23
  if consteval {
    // compile-time evaluation: only the scalar code path is usable here
    scalar::utf16::to_well_formed_utf16<endianness::LITTLE>(
        input.data(), input.size(), output.data());
    return;
  }
    #endif
  const char16_t *const source = input.data();
  char16_t *const destination = output.data();
  to_well_formed_utf16le(source, input.size(), destination);
}
  #endif // SIMDUTF_SPAN
674
/**
 * Repair an ill-formed UTF-16BE string: every mismatched surrogate is
 * replaced with the Unicode replacement character U+FFFD. When input and
 * output point to different memory areas the string is copied, and the output
 * memory is expected to be at least as big as the input. Input and output may
 * also be equal, which turns the replacement into an in-place operation.
 *
 * @param input the UTF-16BE string to correct.
 * @param len the length of the string in number of 2-byte code units
 * (char16_t).
 * @param output the output buffer.
 */
void to_well_formed_utf16be(const char16_t *input, size_t len,
                            char16_t *output) noexcept;
  #if SIMDUTF_SPAN
/**
 * Span overload of to_well_formed_utf16be. Only output.data() is used; the
 * size of the output span is not consulted. During constant evaluation
 * (C++23) the scalar implementation is used; at run time the call is
 * forwarded to the pointer/length overload.
 */
simdutf_really_inline simdutf_constexpr23 void
to_well_formed_utf16be(std::span<const char16_t> input,
                       std::span<char16_t> output) noexcept {
    #if SIMDUTF_CPLUSPLUS23
  if consteval {
    // compile-time evaluation: only the scalar code path is usable here
    scalar::utf16::to_well_formed_utf16<endianness::BIG>(
        input.data(), input.size(), output.data());
    return;
  }
    #endif
  const char16_t *const source = input.data();
  char16_t *const destination = output.data();
  to_well_formed_utf16be(source, input.size(), destination);
}
  #endif // SIMDUTF_SPAN
704
/**
 * Repair an ill-formed UTF-16 string: every mismatched surrogate is replaced
 * with the Unicode replacement character U+FFFD. When input and output point
 * to different memory areas the string is copied, and the output memory is
 * expected to be at least as big as the input. Input and output may also be
 * equal, which turns the replacement into an in-place operation.
 *
 * @param input the UTF-16 string to correct.
 * @param len the length of the string in number of 2-byte code units
 * (char16_t).
 * @param output the output buffer.
 */
void to_well_formed_utf16(const char16_t *input, size_t len,
                          char16_t *output) noexcept;
  #if SIMDUTF_SPAN
/**
 * Span overload of to_well_formed_utf16. Only output.data() is used; the
 * size of the output span is not consulted. During constant evaluation
 * (C++23) the scalar implementation is used with native endianness; at run
 * time the call is forwarded to the pointer/length overload.
 */
simdutf_really_inline simdutf_constexpr23 void
to_well_formed_utf16(std::span<const char16_t> input,
                     std::span<char16_t> output) noexcept {
    #if SIMDUTF_CPLUSPLUS23
  if consteval {
    // compile-time evaluation: only the scalar code path is usable here
    scalar::utf16::to_well_formed_utf16<endianness::NATIVE>(
        input.data(), input.size(), output.data());
    return;
  }
    #endif
  const char16_t *const source = input.data();
  char16_t *const destination = output.data();
  to_well_formed_utf16(source, input.size(), destination);
}
  #endif // SIMDUTF_SPAN
734
735#endif // SIMDUTF_FEATURE_UTF16
736
737#if SIMDUTF_FEATURE_UTF32 || SIMDUTF_FEATURE_DETECT_ENCODING
738/**
739 * Validate the UTF-32 string. This function may be best when you expect
740 * the input to be almost always valid. Otherwise, consider using
741 * validate_utf32_with_errors.
742 *
743 * Overridden by each implementation.
744 *
745 * This function is not BOM-aware.
746 *
747 * @param buf the UTF-32 string to validate.
748 * @param len the length of the string in number of 4-byte code units
749 * (char32_t).
750 * @return true if and only if the string is valid UTF-32.
751 */
752simdutf_warn_unused bool validate_utf32(const char32_t *buf,
753 size_t len) noexcept;
754 #if SIMDUTF_SPAN
755simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 bool
756validate_utf32(std::span<const char32_t> input) noexcept {
757 #if SIMDUTF_CPLUSPLUS23
758 if consteval {
759 return scalar::utf32::validate(
760 detail::constexpr_cast_ptr<std::uint32_t>(input.data()), input.size());
761 } else
762 #endif
763 {
764 return validate_utf32(input.data(), input.size());
765 }
766}
767 #endif // SIMDUTF_SPAN
768#endif // SIMDUTF_FEATURE_UTF32 || SIMDUTF_FEATURE_DETECT_ENCODING
769
770#if SIMDUTF_FEATURE_UTF32
771/**
772 * Validate the UTF-32 string and stop on error. It might be faster than
773 * validate_utf32 when an error is expected to occur early.
774 *
775 * Overridden by each implementation.
776 *
777 * This function is not BOM-aware.
778 *
779 * @param buf the UTF-32 string to validate.
780 * @param len the length of the string in number of 4-byte code units
781 * (char32_t).
782 * @return a result pair struct (of type simdutf::result containing the two
783 * fields error and count) with an error code and either position of the error
784 * (in the input in code units) if any, or the number of code units validated if
785 * successful.
786 */
787simdutf_warn_unused result validate_utf32_with_errors(const char32_t *buf,
788 size_t len) noexcept;
789 #if SIMDUTF_SPAN
790simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 result
791validate_utf32_with_errors(std::span<const char32_t> input) noexcept {
792 #if SIMDUTF_CPLUSPLUS23
793 if consteval {
794 return scalar::utf32::validate_with_errors(
795 detail::constexpr_cast_ptr<std::uint32_t>(input.data()), input.size());
796 } else
797 #endif
798 {
799 return validate_utf32_with_errors(input.data(), input.size());
800 }
801}
802 #endif // SIMDUTF_SPAN
803#endif // SIMDUTF_FEATURE_UTF32
804
805#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
806/**
807 * Convert Latin1 string into UTF-8 string.
808 *
809 * This function is suitable to work with inputs from untrusted sources.
810 *
811 * @param input the Latin1 string to convert
812 * @param length the length of the string in bytes
813 * @param utf8_output the pointer to buffer that can hold conversion result
814 * @return the number of written char; 0 if conversion is not possible
815 */
816simdutf_warn_unused size_t convert_latin1_to_utf8(const char *input,
817 size_t length,
818 char *utf8_output) noexcept;
819 #if SIMDUTF_SPAN
820simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t
821convert_latin1_to_utf8(
822 const detail::input_span_of_byte_like auto &latin1_input,
823 detail::output_span_of_byte_like auto &&utf8_output) noexcept {
824 #if SIMDUTF_CPLUSPLUS23
825 if consteval {
826 return scalar::latin1_to_utf8::convert(
827 detail::constexpr_cast_ptr<char>(latin1_input.data()),
828 latin1_input.size(),
829 detail::constexpr_cast_writeptr<char>(utf8_output.data()));
830 } else
831 #endif
832 {
833 return convert_latin1_to_utf8(
834 reinterpret_cast<const char *>(latin1_input.data()),
835 latin1_input.size(), reinterpret_cast<char *>(utf8_output.data()));
836 }
837}
838 #endif // SIMDUTF_SPAN
839
840/**
841 * Convert Latin1 string into UTF-8 string with output limit.
842 *
843 * This function is suitable to work with inputs from untrusted sources.
844 *
845 * We write as many characters as possible.
846 *
847 * @param input the Latin1 string to convert
848 * @param length the length of the string in bytes
849 * @param utf8_output the pointer to buffer that can hold conversion result
850 * @param utf8_len the maximum output length
851 * @return the number of written char; 0 if conversion is not possible
852 */
853simdutf_warn_unused size_t
854convert_latin1_to_utf8_safe(const char *input, size_t length, char *utf8_output,
855 size_t utf8_len) noexcept;
856 #if SIMDUTF_SPAN
857simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t
858convert_latin1_to_utf8_safe(
859 const detail::input_span_of_byte_like auto &input,
860 detail::output_span_of_byte_like auto &&utf8_output) noexcept {
861 // implementation note: outputspan is a forwarding ref to avoid copying
862 // and allow both lvalues and rvalues. std::span can be copied without
863 // problems, but std::vector should not, and this function should accept
864 // both. it will allow using an owning rvalue ref (example: passing a
865 // temporary std::string) as output, but the user will quickly find out
866 // that he has no way of getting the data out of the object in that case.
867 #if SIMDUTF_CPLUSPLUS23
868 if consteval {
869 return scalar::latin1_to_utf8::convert_safe_constexpr(
870 input.data(), input.size(), utf8_output.data(), utf8_output.size());
871 } else
872 #endif
873 {
874 return convert_latin1_to_utf8_safe(
875 reinterpret_cast<const char *>(input.data()), input.size(),
876 reinterpret_cast<char *>(utf8_output.data()), utf8_output.size());
877 }
878}
879 #endif // SIMDUTF_SPAN
880#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
881
882#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1
883/**
884 * Convert Latin1 string into UTF-16LE string.
885 *
886 * This function is suitable to work with inputs from untrusted sources.
887 *
888 * @param input the Latin1 string to convert
889 * @param length the length of the string in bytes
890 * @param utf16_output the pointer to buffer that can hold conversion result
891 * @return the number of written char16_t; 0 if conversion is not possible
892 */
893simdutf_warn_unused size_t convert_latin1_to_utf16le(
894 const char *input, size_t length, char16_t *utf16_output) noexcept;
895 #if SIMDUTF_SPAN
896simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t
897convert_latin1_to_utf16le(
898 const detail::input_span_of_byte_like auto &latin1_input,
899 std::span<char16_t> utf16_output) noexcept {
900 #if SIMDUTF_CPLUSPLUS23
901 if consteval {
902 return scalar::latin1_to_utf16::convert<endianness::LITTLE>(
903 latin1_input.data(), latin1_input.size(), utf16_output.data());
904 } else
905 #endif
906 {
907 return convert_latin1_to_utf16le(
908 reinterpret_cast<const char *>(latin1_input.data()),
909 latin1_input.size(), utf16_output.data());
910 }
911}
912 #endif // SIMDUTF_SPAN
913
914/**
915 * Convert Latin1 string into UTF-16BE string.
916 *
917 * This function is suitable to work with inputs from untrusted sources.
918 *
919 * @param input the Latin1 string to convert
920 * @param length the length of the string in bytes
921 * @param utf16_output the pointer to buffer that can hold conversion result
922 * @return the number of written char16_t; 0 if conversion is not possible
923 */
924simdutf_warn_unused size_t convert_latin1_to_utf16be(
925 const char *input, size_t length, char16_t *utf16_output) noexcept;
926 #if SIMDUTF_SPAN
927simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t
928convert_latin1_to_utf16be(const detail::input_span_of_byte_like auto &input,
929 std::span<char16_t> output) noexcept {
930 #if SIMDUTF_CPLUSPLUS23
931 if consteval {
932 return scalar::latin1_to_utf16::convert<endianness::BIG>(
933 input.data(), input.size(), output.data());
934 } else
935 #endif
936 {
937 return convert_latin1_to_utf16be(
938 reinterpret_cast<const char *>(input.data()), input.size(),
939 output.data());
940 }
941}
942 #endif // SIMDUTF_SPAN
943/**
944 * Compute the number of bytes that this UTF-16 string would require in Latin1
945 * format.
946 *
947 * @param length the length of the string in 2-byte code units (char16_t)
948 * @return the length of the string in Latin1 code units (char) required to
949 * encode the UTF-16 string as Latin1
950 */
951simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t
952latin1_length_from_utf16(size_t length) noexcept {
953 return length;
954}
955
956/**
957 * Compute the number of code units that this Latin1 string would require in
958 * UTF-16 format.
959 *
960 * @param length the length of the string in Latin1 code units (char)
961 * @return the length of the string in 2-byte code units (char16_t) required to
962 * encode the Latin1 string as UTF-16
963 */
964simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t
965utf16_length_from_latin1(size_t length) noexcept {
966 return length;
967}
968#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1
969
970#if SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1
971/**
972 * Convert Latin1 string into UTF-32 string.
973 *
974 * This function is suitable to work with inputs from untrusted sources.
975 *
976 * @param input the Latin1 string to convert
977 * @param length the length of the string in bytes
978 * @param utf32_buffer the pointer to buffer that can hold conversion result
979 * @return the number of written char32_t; 0 if conversion is not possible
980 */
981simdutf_warn_unused size_t convert_latin1_to_utf32(
982 const char *input, size_t length, char32_t *utf32_buffer) noexcept;
983 #if SIMDUTF_SPAN
984simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t
985convert_latin1_to_utf32(
986 const detail::input_span_of_byte_like auto &latin1_input,
987 std::span<char32_t> utf32_output) noexcept {
988 #if SIMDUTF_CPLUSPLUS23
989 if consteval {
990 return scalar::latin1_to_utf32::convert(
991 latin1_input.data(), latin1_input.size(), utf32_output.data());
992 } else
993 #endif
994 {
995 return convert_latin1_to_utf32(
996 reinterpret_cast<const char *>(latin1_input.data()),
997 latin1_input.size(), utf32_output.data());
998 }
999}
1000 #endif // SIMDUTF_SPAN
1001#endif // SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1
1002
1003#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
1004/**
1005 * Convert possibly broken UTF-8 string into latin1 string.
1006 *
1007 * During the conversion also validation of the input string is done.
1008 * This function is suitable to work with inputs from untrusted sources.
1009 *
1010 * @param input the UTF-8 string to convert
1011 * @param length the length of the string in bytes
1012 * @param latin1_output the pointer to buffer that can hold conversion result
1013 * @return the number of written char; 0 if the input was not valid UTF-8 string
1014 * or if it cannot be represented as Latin1
1015 */
1016simdutf_warn_unused size_t convert_utf8_to_latin1(const char *input,
1017 size_t length,
1018 char *latin1_output) noexcept;
1019 #if SIMDUTF_SPAN
1020simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t
1021convert_utf8_to_latin1(
1022 const detail::input_span_of_byte_like auto &input,
1023 detail::output_span_of_byte_like auto &&output) noexcept {
1024 #if SIMDUTF_CPLUSPLUS23
1025 if consteval {
1026 return scalar::utf8_to_latin1::convert(input.data(), input.size(),
1027 output.data());
1028 } else
1029 #endif
1030 {
1031 return convert_utf8_to_latin1(reinterpret_cast<const char *>(input.data()),
1032 input.size(),
1033 reinterpret_cast<char *>(output.data()));
1034 }
1035}
1036 #endif // SIMDUTF_SPAN
1037#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
1038
1039#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
1040/**
1041 * Using native endianness, convert possibly broken UTF-8 string into a UTF-16
1042 * string.
1043 *
1044 * During the conversion also validation of the input string is done.
1045 * This function is suitable to work with inputs from untrusted sources.
1046 *
1047 * @param input the UTF-8 string to convert
1048 * @param length the length of the string in bytes
1049 * @param utf16_output the pointer to buffer that can hold conversion result
1050 * @return the number of written char16_t; 0 if the input was not valid UTF-8
1051 * string
1052 */
1053simdutf_warn_unused size_t convert_utf8_to_utf16(
1054 const char *input, size_t length, char16_t *utf16_output) noexcept;
1055 #if SIMDUTF_SPAN
1056simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t
1057convert_utf8_to_utf16(const detail::input_span_of_byte_like auto &input,
1058 std::span<char16_t> output) noexcept {
1059 #if SIMDUTF_CPLUSPLUS23
1060 if consteval {
1061 return scalar::utf8_to_utf16::convert<endianness::NATIVE>(
1062 input.data(), input.size(), output.data());
1063 } else
1064 #endif
1065 {
1066 return convert_utf8_to_utf16(reinterpret_cast<const char *>(input.data()),
1067 input.size(), output.data());
1068 }
1069}
1070 #endif // SIMDUTF_SPAN
1071
1072/**
1073 * Compute the number of bytes that this UTF-16LE string would require in UTF-8
1074 * format even when the UTF-16LE content contains mismatched surrogates
1075 * that have to be replaced by the replacement character (0xFFFD).
1076 *
1077 * @param input the UTF-16LE string to convert
1078 * @param length the length of the string in 2-byte code units (char16_t)
1079 * @return a result pair struct (of type simdutf::result containing the two
1080 * fields error and count) where the count is the number of bytes required to
1081 * encode the UTF-16LE string as UTF-8, and the error code is either SUCCESS or
1082 * SURROGATE. The count is correct regardless of the error field.
1083 * When SURROGATE is returned, it does not indicate an error in the case of this
1084 * function: it indicates that at least one surrogate has been encountered: the
1085 * surrogates may be matched or not (thus this function does not validate). If
1086 * the returned error code is SUCCESS, then the input contains no surrogate, is
1087 * in the Basic Multilingual Plane, and is necessarily valid.
1088 */
1089simdutf_warn_unused result utf8_length_from_utf16le_with_replacement(
1090 const char16_t *input, size_t length) noexcept;
1091 #if SIMDUTF_SPAN
1092simdutf_really_inline simdutf_constexpr23 simdutf_warn_unused result
1093utf8_length_from_utf16le_with_replacement(
1094 std::span<const char16_t> valid_utf16_input) noexcept {
1095 #if SIMDUTF_CPLUSPLUS23
1096 if consteval {
1097 return scalar::utf16::utf8_length_from_utf16_with_replacement<
1098 endianness::LITTLE>(valid_utf16_input.data(), valid_utf16_input.size());
1099 } else
1100 #endif
1101 {
1102 return utf8_length_from_utf16le_with_replacement(valid_utf16_input.data(),
1103 valid_utf16_input.size());
1104 }
1105}
1106 #endif // SIMDUTF_SPAN
1107
1108/**
1109 * Compute the number of bytes that this UTF-16BE string would require in UTF-8
1110 * format even when the UTF-16BE content contains mismatched surrogates
1111 * that have to be replaced by the replacement character (0xFFFD).
1112 *
1113 * @param input the UTF-16BE string to convert
1114 * @param length the length of the string in 2-byte code units (char16_t)
1115 * @return a result pair struct (of type simdutf::result containing the two
1116 * fields error and count) where the count is the number of bytes required to
1117 * encode the UTF-16BE string as UTF-8, and the error code is either SUCCESS or
1118 * SURROGATE. The count is correct regardless of the error field.
1119 * When SURROGATE is returned, it does not indicate an error in the case of this
1120 * function: it indicates that at least one surrogate has been encountered: the
1121 * surrogates may be matched or not (thus this function does not validate). If
1122 * the returned error code is SUCCESS, then the input contains no surrogate, is
1123 * in the Basic Multilingual Plane, and is necessarily valid.
1124 */
1125simdutf_warn_unused result utf8_length_from_utf16be_with_replacement(
1126 const char16_t *input, size_t length) noexcept;
1127 #if SIMDUTF_SPAN
1128simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 result
1129utf8_length_from_utf16be_with_replacement(
1130 std::span<const char16_t> valid_utf16_input) noexcept {
1131 #if SIMDUTF_CPLUSPLUS23
1132 if consteval {
1133 return scalar::utf16::utf8_length_from_utf16_with_replacement<
1134 endianness::BIG>(valid_utf16_input.data(), valid_utf16_input.size());
1135 } else
1136 #endif
1137 {
1138 return utf8_length_from_utf16be_with_replacement(valid_utf16_input.data(),
1139 valid_utf16_input.size());
1140 }
1141}
1142 #endif // SIMDUTF_SPAN
1143
1144#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
1145
1146#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1
1147/**
1148 * Using native endianness, convert a Latin1 string into a UTF-16 string.
1149 *
1150 * @param input the Latin1 string to convert
1151 * @param length the length of the string in bytes
1152 * @param utf16_output the pointer to buffer that can hold conversion result
1153 * @return the number of written char16_t.
1154 */
1155simdutf_warn_unused size_t convert_latin1_to_utf16(
1156 const char *input, size_t length, char16_t *utf16_output) noexcept;
1157 #if SIMDUTF_SPAN
1158simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t
1159convert_latin1_to_utf16(const detail::input_span_of_byte_like auto &input,
1160 std::span<char16_t> output) noexcept {
1161 #if SIMDUTF_CPLUSPLUS23
1162 if consteval {
1163 return scalar::latin1_to_utf16::convert<endianness::NATIVE>(
1164 input.data(), input.size(), output.data());
1165 } else
1166 #endif
1167 {
1168 return convert_latin1_to_utf16(reinterpret_cast<const char *>(input.data()),
1169 input.size(), output.data());
1170 }
1171}
1172 #endif // SIMDUTF_SPAN
1173#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1
1174
1175#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
1176/**
1177 * Convert possibly broken UTF-8 string into UTF-16LE string.
1178 *
1179 * During the conversion also validation of the input string is done.
1180 * This function is suitable to work with inputs from untrusted sources.
1181 *
1182 * @param input the UTF-8 string to convert
1183 * @param length the length of the string in bytes
1184 * @param utf16_output the pointer to buffer that can hold conversion result
1185 * @return the number of written char16_t; 0 if the input was not valid UTF-8
1186 * string
1187 */
1188simdutf_warn_unused size_t convert_utf8_to_utf16le(
1189 const char *input, size_t length, char16_t *utf16_output) noexcept;
1190 #if SIMDUTF_SPAN
1191simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t
1192convert_utf8_to_utf16le(const detail::input_span_of_byte_like auto &utf8_input,
1193 std::span<char16_t> utf16_output) noexcept {
1194 #if SIMDUTF_CPLUSPLUS23
1195 if consteval {
1196 return scalar::utf8_to_utf16::convert<endianness::LITTLE>(
1197 utf8_input.data(), utf8_input.size(), utf16_output.data());
1198 } else
1199 #endif
1200 {
1201 return convert_utf8_to_utf16le(
1202 reinterpret_cast<const char *>(utf8_input.data()), utf8_input.size(),
1203 utf16_output.data());
1204 }
1205}
1206 #endif // SIMDUTF_SPAN
1207
1208/**
1209 * Convert possibly broken UTF-8 string into UTF-16BE string.
1210 *
1211 * During the conversion also validation of the input string is done.
1212 * This function is suitable to work with inputs from untrusted sources.
1213 *
1214 * @param input the UTF-8 string to convert
1215 * @param length the length of the string in bytes
1216 * @param utf16_output the pointer to buffer that can hold conversion result
1217 * @return the number of written char16_t; 0 if the input was not valid UTF-8
1218 * string
1219 */
1220simdutf_warn_unused size_t convert_utf8_to_utf16be(
1221 const char *input, size_t length, char16_t *utf16_output) noexcept;
1222 #if SIMDUTF_SPAN
1223simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t
1224convert_utf8_to_utf16be(const detail::input_span_of_byte_like auto &utf8_input,
1225 std::span<char16_t> utf16_output) noexcept {
1226
1227 #if SIMDUTF_CPLUSPLUS23
1228 if consteval {
1229 return scalar::utf8_to_utf16::convert<endianness::BIG>(
1230 utf8_input.data(), utf8_input.size(), utf16_output.data());
1231 } else
1232 #endif
1233 {
1234 return convert_utf8_to_utf16be(
1235 reinterpret_cast<const char *>(utf8_input.data()), utf8_input.size(),
1236 utf16_output.data());
1237 }
1238}
1239 #endif // SIMDUTF_SPAN
1240#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
1241
1242#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
1243/**
1244 * Convert possibly broken UTF-8 string into latin1 string with errors.
1245 * If the string cannot be represented as Latin1, an error
1246 * code is returned.
1247 *
1248 * During the conversion also validation of the input string is done.
1249 * This function is suitable to work with inputs from untrusted sources.
1250 *
1251 * @param input the UTF-8 string to convert
1252 * @param length the length of the string in bytes
1253 * @param latin1_output the pointer to buffer that can hold conversion result
1254 * @return a result pair struct (of type simdutf::result containing the two
1255 * fields error and count) with an error code and either position of the error
1256 * (in the input in code units) if any, or the number of code units validated if
1257 * successful.
1258 */
1259simdutf_warn_unused result convert_utf8_to_latin1_with_errors(
1260 const char *input, size_t length, char *latin1_output) noexcept;
1261 #if SIMDUTF_SPAN
1262simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 result
1263convert_utf8_to_latin1_with_errors(
1264 const detail::input_span_of_byte_like auto &utf8_input,
1265 detail::output_span_of_byte_like auto &&latin1_output) noexcept {
1266 #if SIMDUTF_CPLUSPLUS23
1267 if consteval {
1268 return scalar::utf8_to_latin1::convert_with_errors(
1269 utf8_input.data(), utf8_input.size(), latin1_output.data());
1270 } else
1271 #endif
1272 {
1273 return convert_utf8_to_latin1_with_errors(
1274 reinterpret_cast<const char *>(utf8_input.data()), utf8_input.size(),
1275 reinterpret_cast<char *>(latin1_output.data()));
1276 }
1277}
1278 #endif // SIMDUTF_SPAN
1279#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
1280
1281#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
1282/**
1283 * Using native endianness, convert possibly broken UTF-8 string into UTF-16
1284 * string and stop on error.
1285 *
1286 * During the conversion also validation of the input string is done.
1287 * This function is suitable to work with inputs from untrusted sources.
1288 *
1289 * @param input the UTF-8 string to convert
1290 * @param length the length of the string in bytes
1291 * @param utf16_output the pointer to buffer that can hold conversion result
1292 * @return a result pair struct (of type simdutf::result containing the two
1293 * fields error and count) with an error code and either position of the error
1294 * (in the input in code units) if any, or the number of char16_t written if
1295 * successful.
1296 */
1297simdutf_warn_unused result convert_utf8_to_utf16_with_errors(
1298 const char *input, size_t length, char16_t *utf16_output) noexcept;
1299 #if SIMDUTF_SPAN
1300simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 result
1301convert_utf8_to_utf16_with_errors(
1302 const detail::input_span_of_byte_like auto &utf8_input,
1303 std::span<char16_t> utf16_output) noexcept {
1304 #if SIMDUTF_CPLUSPLUS23
1305 if consteval {
1306 return scalar::utf8_to_utf16::convert_with_errors<endianness::NATIVE>(
1307 utf8_input.data(), utf8_input.size(), utf16_output.data());
1308 } else
1309 #endif
1310 {
1311 return convert_utf8_to_utf16_with_errors(
1312 reinterpret_cast<const char *>(utf8_input.data()), utf8_input.size(),
1313 utf16_output.data());
1314 }
1315}
1316 #endif // SIMDUTF_SPAN
1317
1318/**
1319 * Convert possibly broken UTF-8 string into UTF-16LE string and stop on error.
1320 *
1321 * During the conversion also validation of the input string is done.
1322 * This function is suitable to work with inputs from untrusted sources.
1323 *
1324 * @param input the UTF-8 string to convert
1325 * @param length the length of the string in bytes
1326 * @param utf16_output the pointer to buffer that can hold conversion result
1327 * @return a result pair struct (of type simdutf::result containing the two
1328 * fields error and count) with an error code and either position of the error
1329 * (in the input in code units) if any, or the number of char16_t written if
1330 * successful.
1331 */
1332simdutf_warn_unused result convert_utf8_to_utf16le_with_errors(
1333 const char *input, size_t length, char16_t *utf16_output) noexcept;
1334 #if SIMDUTF_SPAN
1335simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 result
1336convert_utf8_to_utf16le_with_errors(
1337 const detail::input_span_of_byte_like auto &utf8_input,
1338 std::span<char16_t> utf16_output) noexcept {
1339 #if SIMDUTF_CPLUSPLUS23
1340 if consteval {
1341 return scalar::utf8_to_utf16::convert_with_errors<endianness::LITTLE>(
1342 utf8_input.data(), utf8_input.size(), utf16_output.data());
1343 } else
1344 #endif
1345 {
1346 return convert_utf8_to_utf16le_with_errors(
1347 reinterpret_cast<const char *>(utf8_input.data()), utf8_input.size(),
1348 utf16_output.data());
1349 }
1350}
1351 #endif // SIMDUTF_SPAN
1352
1353/**
1354 * Convert possibly broken UTF-8 string into UTF-16BE string and stop on error.
1355 *
1356 * During the conversion also validation of the input string is done.
1357 * This function is suitable to work with inputs from untrusted sources.
1358 *
1359 * @param input the UTF-8 string to convert
1360 * @param length the length of the string in bytes
1361 * @param utf16_output the pointer to buffer that can hold conversion result
1362 * @return a result pair struct (of type simdutf::result containing the two
1363 * fields error and count) with an error code and either position of the error
1364 * (in the input in code units) if any, or the number of char16_t written if
1365 * successful.
1366 */
1367simdutf_warn_unused result convert_utf8_to_utf16be_with_errors(
1368 const char *input, size_t length, char16_t *utf16_output) noexcept;
1369 #if SIMDUTF_SPAN
1370simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 result
1371convert_utf8_to_utf16be_with_errors(
1372 const detail::input_span_of_byte_like auto &utf8_input,
1373 std::span<char16_t> utf16_output) noexcept {
1374 #if SIMDUTF_CPLUSPLUS23
1375 if consteval {
1376 return scalar::utf8_to_utf16::convert_with_errors<endianness::BIG>(
1377 utf8_input.data(), utf8_input.size(), utf16_output.data());
1378 } else
1379 #endif
1380 {
1381 return convert_utf8_to_utf16be_with_errors(
1382 reinterpret_cast<const char *>(utf8_input.data()), utf8_input.size(),
1383 utf16_output.data());
1384 }
1385}
1386 #endif // SIMDUTF_SPAN
1387#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
1388
1389#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
1390/**
1391 * Convert possibly broken UTF-8 string into UTF-32 string.
1392 *
1393 * During the conversion also validation of the input string is done.
1394 * This function is suitable to work with inputs from untrusted sources.
1395 *
1396 * @param input the UTF-8 string to convert
1397 * @param length the length of the string in bytes
1398 * @param utf32_output the pointer to buffer that can hold conversion result
1399 * @return the number of written char32_t; 0 if the input was not valid UTF-8
1400 * string
1401 */
1402simdutf_warn_unused size_t convert_utf8_to_utf32(
1403 const char *input, size_t length, char32_t *utf32_output) noexcept;
1404 #if SIMDUTF_SPAN
1405simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t
1406convert_utf8_to_utf32(const detail::input_span_of_byte_like auto &utf8_input,
1407 std::span<char32_t> utf32_output) noexcept {
1408 #if SIMDUTF_CPLUSPLUS23
1409 if consteval {
1410 return scalar::utf8_to_utf32::convert(utf8_input.data(), utf8_input.size(),
1411 utf32_output.data());
1412 } else
1413 #endif
1414 {
1415 return convert_utf8_to_utf32(
1416 reinterpret_cast<const char *>(utf8_input.data()), utf8_input.size(),
1417 utf32_output.data());
1418 }
1419}
1420 #endif // SIMDUTF_SPAN
1421
1422/**
1423 * Convert possibly broken UTF-8 string into UTF-32 string and stop on error.
1424 *
1425 * During the conversion also validation of the input string is done.
1426 * This function is suitable to work with inputs from untrusted sources.
1427 *
1428 * @param input the UTF-8 string to convert
1429 * @param length the length of the string in bytes
1430 * @param utf32_output the pointer to buffer that can hold conversion result
1431 * @return a result pair struct (of type simdutf::result containing the two
1432 * fields error and count) with an error code and either position of the error
1433 * (in the input in code units) if any, or the number of char32_t written if
1434 * successful.
1435 */
1436simdutf_warn_unused result convert_utf8_to_utf32_with_errors(
1437 const char *input, size_t length, char32_t *utf32_output) noexcept;
1438 #if SIMDUTF_SPAN
1439simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 result
1440convert_utf8_to_utf32_with_errors(
1441 const detail::input_span_of_byte_like auto &utf8_input,
1442 std::span<char32_t> utf32_output) noexcept {
1443 #if SIMDUTF_CPLUSPLUS23
1444 if consteval {
1445 return scalar::utf8_to_utf32::convert_with_errors(
1446 utf8_input.data(), utf8_input.size(), utf32_output.data());
1447 } else
1448 #endif
1449 {
1450 return convert_utf8_to_utf32_with_errors(
1451 reinterpret_cast<const char *>(utf8_input.data()), utf8_input.size(),
1452 utf32_output.data());
1453 }
1454}
1455 #endif // SIMDUTF_SPAN
1456#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
1457
1458#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
1459/**
1460 * Convert valid UTF-8 string into latin1 string.
1461 *
1462 * This function assumes that the input string is valid UTF-8 and that it can be
1463 * represented as Latin1. If you violate this assumption, the result is
1464 * implementation defined and may include system-dependent behavior such as
1465 * crashes.
1466 *
1467 * This function is for expert users only and not part of our public API. Use
1468 * convert_utf8_to_latin1 instead. The function may be removed from the library
1469 * in the future.
1470 *
1471 * This function is not BOM-aware.
1472 *
1473 * @param input the UTF-8 string to convert
1474 * @param length the length of the string in bytes
1475 * @param latin1_output the pointer to buffer that can hold conversion result
1476 * @return the number of written char; 0 if the input was not valid UTF-8 string
1477 */
1478simdutf_warn_unused size_t convert_valid_utf8_to_latin1(
1479 const char *input, size_t length, char *latin1_output) noexcept;
1480 #if SIMDUTF_SPAN
1481simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t
1482convert_valid_utf8_to_latin1(
1483 const detail::input_span_of_byte_like auto &valid_utf8_input,
1484 detail::output_span_of_byte_like auto &&latin1_output) noexcept {
1485 #if SIMDUTF_CPLUSPLUS23
1486 if consteval {
1487 return scalar::utf8_to_latin1::convert_valid(
1488 valid_utf8_input.data(), valid_utf8_input.size(), latin1_output.data());
1489 } else
1490 #endif
1491 {
1492 return convert_valid_utf8_to_latin1(
1493 reinterpret_cast<const char *>(valid_utf8_input.data()),
1494 valid_utf8_input.size(), latin1_output.data());
1495 }
1496}
1497 #endif // SIMDUTF_SPAN
1498#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
1499
1500#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
1501/**
1502 * Using native endianness, convert valid UTF-8 string into a UTF-16 string.
1503 *
1504 * This function assumes that the input string is valid UTF-8.
1505 *
1506 * @param input the UTF-8 string to convert
1507 * @param length the length of the string in bytes
1508 * @param utf16_buffer the pointer to buffer that can hold conversion result
1509 * @return the number of written char16_t
1510 */
1511simdutf_warn_unused size_t convert_valid_utf8_to_utf16(
1512 const char *input, size_t length, char16_t *utf16_buffer) noexcept;
1513 #if SIMDUTF_SPAN
1514simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t
1515convert_valid_utf8_to_utf16(
1516 const detail::input_span_of_byte_like auto &valid_utf8_input,
1517 std::span<char16_t> utf16_output) noexcept {
1518 #if SIMDUTF_CPLUSPLUS23
1519 if consteval {
1520 return scalar::utf8_to_utf16::convert_valid<endianness::NATIVE>(
1521 valid_utf8_input.data(), valid_utf8_input.size(), utf16_output.data());
1522 } else
1523 #endif
1524 {
1525 return convert_valid_utf8_to_utf16(
1526 reinterpret_cast<const char *>(valid_utf8_input.data()),
1527 valid_utf8_input.size(), utf16_output.data());
1528 }
1529}
1530 #endif // SIMDUTF_SPAN
1531
1532/**
1533 * Convert valid UTF-8 string into UTF-16LE string.
1534 *
1535 * This function assumes that the input string is valid UTF-8.
1536 *
1537 * @param input the UTF-8 string to convert
1538 * @param length the length of the string in bytes
1539 * @param utf16_buffer the pointer to buffer that can hold conversion result
1540 * @return the number of written char16_t
1541 */
1542simdutf_warn_unused size_t convert_valid_utf8_to_utf16le(
1543 const char *input, size_t length, char16_t *utf16_buffer) noexcept;
1544 #if SIMDUTF_SPAN
1545simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t
1546convert_valid_utf8_to_utf16le(
1547 const detail::input_span_of_byte_like auto &valid_utf8_input,
1548 std::span<char16_t> utf16_output) noexcept {
1549
1550 #if SIMDUTF_CPLUSPLUS23
1551 if consteval {
1552 return scalar::utf8_to_utf16::convert_valid<endianness::LITTLE>(
1553 valid_utf8_input.data(), valid_utf8_input.size(), utf16_output.data());
1554 } else
1555 #endif
1556 {
1557 return convert_valid_utf8_to_utf16le(
1558 reinterpret_cast<const char *>(valid_utf8_input.data()),
1559 valid_utf8_input.size(), utf16_output.data());
1560 }
1561}
1562 #endif // SIMDUTF_SPAN
1563
1564/**
1565 * Convert valid UTF-8 string into UTF-16BE string.
1566 *
1567 * This function assumes that the input string is valid UTF-8.
1568 *
1569 * @param input the UTF-8 string to convert
1570 * @param length the length of the string in bytes
1571 * @param utf16_buffer the pointer to buffer that can hold conversion result
1572 * @return the number of written char16_t
1573 */
1574simdutf_warn_unused size_t convert_valid_utf8_to_utf16be(
1575 const char *input, size_t length, char16_t *utf16_buffer) noexcept;
1576 #if SIMDUTF_SPAN
1577simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t
1578convert_valid_utf8_to_utf16be(
1579 const detail::input_span_of_byte_like auto &valid_utf8_input,
1580 std::span<char16_t> utf16_output) noexcept {
1581 #if SIMDUTF_CPLUSPLUS23
1582 if consteval {
1583 return scalar::utf8_to_utf16::convert_valid<endianness::BIG>(
1584 valid_utf8_input.data(), valid_utf8_input.size(), utf16_output.data());
1585 } else
1586 #endif
1587 {
1588 return convert_valid_utf8_to_utf16be(
1589 reinterpret_cast<const char *>(valid_utf8_input.data()),
1590 valid_utf8_input.size(), utf16_output.data());
1591 }
1592}
1593 #endif // SIMDUTF_SPAN
1594#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
1595
1596#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
1597/**
1598 * Convert valid UTF-8 string into UTF-32 string.
1599 *
1600 * This function assumes that the input string is valid UTF-8.
1601 *
1602 * @param input the UTF-8 string to convert
1603 * @param length the length of the string in bytes
1604 * @param utf32_buffer the pointer to buffer that can hold conversion result
1605 * @return the number of written char32_t
1606 */
1607simdutf_warn_unused size_t convert_valid_utf8_to_utf32(
1608 const char *input, size_t length, char32_t *utf32_buffer) noexcept;
1609 #if SIMDUTF_SPAN
1610simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t
1611convert_valid_utf8_to_utf32(
1612 const detail::input_span_of_byte_like auto &valid_utf8_input,
1613 std::span<char32_t> utf32_output) noexcept {
1614 #if SIMDUTF_CPLUSPLUS23
1615 if consteval {
1616 return scalar::utf8_to_utf32::convert_valid(
1617 valid_utf8_input.data(), valid_utf8_input.size(), utf32_output.data());
1618 } else
1619 #endif
1620 {
1621 return convert_valid_utf8_to_utf32(
1622 reinterpret_cast<const char *>(valid_utf8_input.data()),
1623 valid_utf8_input.size(), utf32_output.data());
1624 }
1625}
1626 #endif // SIMDUTF_SPAN
1627#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
1628
1629#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
1630/**
1631 * Return the number of bytes that this Latin1 string would require in UTF-8
1632 * format.
1633 *
1634 * @param input the Latin1 string to convert
1635 * @param length the length of the string in bytes
1636 * @return the number of bytes required to encode the Latin1 string as UTF-8
1637 */
1638simdutf_warn_unused size_t utf8_length_from_latin1(const char *input,
1639 size_t length) noexcept;
1640 #if SIMDUTF_SPAN
/**
 * Span overload of utf8_length_from_latin1.
 *
 * When SIMDUTF_CPLUSPLUS23 is set and the call is constant-evaluated, the
 * length is computed by scalar::latin1_to_utf8::utf8_length_from_latin1.
 * Otherwise the span data is reinterpret_cast to const char * and the call
 * forwards to the pointer overload declared above, with size() as length.
 */
1641simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t
1642utf8_length_from_latin1(
1643 const detail::input_span_of_byte_like auto &latin1_input) noexcept {
1644 #if SIMDUTF_CPLUSPLUS23
1645 if consteval {
1646 return scalar::latin1_to_utf8::utf8_length_from_latin1(latin1_input.data(),
1647 latin1_input.size());
1648 } else
1649 #endif
1650 {
1651 return utf8_length_from_latin1(
1652 reinterpret_cast<const char *>(latin1_input.data()),
1653 latin1_input.size());
1654 }
1655}
1656 #endif // SIMDUTF_SPAN
1657
1658/**
1659 * Compute the number of bytes that this UTF-8 string would require in Latin1
1660 * format.
1661 *
1662 * This function does not validate the input. It is acceptable to pass invalid
1663 * UTF-8 strings but in such cases the result is implementation defined.
1664 *
1665 * This function is not BOM-aware.
1666 *
1667 * @param input the UTF-8 string to convert
1668 * @param length the length of the string in bytes
1669 * @return the number of bytes required to encode the UTF-8 string as Latin1
1670 */
1671simdutf_warn_unused size_t latin1_length_from_utf8(const char *input,
1672 size_t length) noexcept;
1673 #if SIMDUTF_SPAN
/**
 * Span overload of latin1_length_from_utf8.
 *
 * When SIMDUTF_CPLUSPLUS23 is set and the call is constant-evaluated, the
 * result is whatever scalar::utf8::count_code_points returns for the span's
 * data and size. Otherwise the span data is reinterpret_cast to
 * const char * and the call forwards to the pointer overload declared above.
 */
1674simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t
1675latin1_length_from_utf8(
1676 const detail::input_span_of_byte_like auto &valid_utf8_input) noexcept {
1677 #if SIMDUTF_CPLUSPLUS23
1678 if consteval {
1679 return scalar::utf8::count_code_points(valid_utf8_input.data(),
1680 valid_utf8_input.size());
1681 } else
1682 #endif
1683 {
1684 return latin1_length_from_utf8(
1685 reinterpret_cast<const char *>(valid_utf8_input.data()),
1686 valid_utf8_input.size());
1687 }
1688}
1689 #endif // SIMDUTF_SPAN
1690#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
1691
1692#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
1693/**
1694 * Compute the number of 2-byte code units that this UTF-8 string would require
1695 * in UTF-16LE format.
1696 *
1697 * This function does not validate the input. It is acceptable to pass invalid
1698 * UTF-8 strings but in such cases the result is implementation defined.
1699 *
1700 * This function is not BOM-aware.
1701 *
1702 * @param input the UTF-8 string to process
1703 * @param length the length of the string in bytes
1704 * @return the number of char16_t code units required to encode the UTF-8 string
1705 * as UTF-16LE
1706 */
1707simdutf_warn_unused size_t utf16_length_from_utf8(const char *input,
1708 size_t length) noexcept;
1709 #if SIMDUTF_SPAN
/**
 * Span overload of utf16_length_from_utf8.
 *
 * When SIMDUTF_CPLUSPLUS23 is set and the call is constant-evaluated, the
 * length is computed by scalar::utf8::utf16_length_from_utf8. Otherwise the
 * span data is reinterpret_cast to const char * and the call forwards to
 * the pointer overload declared above, with size() as length.
 */
1710simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t
1711utf16_length_from_utf8(
1712 const detail::input_span_of_byte_like auto &valid_utf8_input) noexcept {
1713 #if SIMDUTF_CPLUSPLUS23
1714 if consteval {
1715 return scalar::utf8::utf16_length_from_utf8(valid_utf8_input.data(),
1716 valid_utf8_input.size());
1717 } else
1718 #endif
1719 {
1720 return utf16_length_from_utf8(
1721 reinterpret_cast<const char *>(valid_utf8_input.data()),
1722 valid_utf8_input.size());
1723 }
1724}
1725 #endif // SIMDUTF_SPAN
1726#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
1727
1728#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
1729/**
1730 * Compute the number of 4-byte code units that this UTF-8 string would require
1731 * in UTF-32 format.
1732 *
1733 * This function is equivalent to count_utf8.
1734 *
1735 * This function does not validate the input. It is acceptable to pass invalid
1736 * UTF-8 strings but in such cases the result is implementation defined.
1737 *
1738 * This function is not BOM-aware.
1739 *
1740 * @param input the UTF-8 string to process
1741 * @param length the length of the string in bytes
1742 * @return the number of char32_t code units required to encode the UTF-8 string
1743 * as UTF-32
1744 */
1745simdutf_warn_unused size_t utf32_length_from_utf8(const char *input,
1746 size_t length) noexcept;
1747 #if SIMDUTF_SPAN
/**
 * Span overload of utf32_length_from_utf8.
 *
 * When SIMDUTF_CPLUSPLUS23 is set and the call is constant-evaluated, the
 * result is whatever scalar::utf8::count_code_points returns for the span's
 * data and size. Otherwise the span data is reinterpret_cast to
 * const char * and the call forwards to the pointer overload declared above.
 */
1748simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t
1749utf32_length_from_utf8(
1750 const detail::input_span_of_byte_like auto &valid_utf8_input) noexcept {
1751
1752 #if SIMDUTF_CPLUSPLUS23
1753 if consteval {
1754 return scalar::utf8::count_code_points(valid_utf8_input.data(),
1755 valid_utf8_input.size());
1756 } else
1757 #endif
1758 {
1759 return utf32_length_from_utf8(
1760 reinterpret_cast<const char *>(valid_utf8_input.data()),
1761 valid_utf8_input.size());
1762 }
1763}
1764 #endif // SIMDUTF_SPAN
1765#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
1766
1767#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
1768/**
1769 * Using native endianness, convert possibly broken UTF-16 string into UTF-8
1770 * string.
1771 *
1772 * During the conversion also validation of the input string is done.
1773 * This function is suitable to work with inputs from untrusted sources.
1774 *
1775 * This function is not BOM-aware.
1776 *
1777 * @param input the UTF-16 string to convert
1778 * @param length the length of the string in 2-byte code units (char16_t)
1779 * @param utf8_buffer the pointer to buffer that can hold conversion result
1780 * @return number of written code units; 0 if input is not a valid UTF-16
1781 * string
1782 */
1783simdutf_warn_unused size_t convert_utf16_to_utf8(const char16_t *input,
1784 size_t length,
1785 char *utf8_buffer) noexcept;
1786 #if SIMDUTF_SPAN
/**
 * Span overload of convert_utf16_to_utf8.
 *
 * When SIMDUTF_CPLUSPLUS23 is set and the call is constant-evaluated, the
 * conversion is done by scalar::utf16_to_utf8::convert<endianness::NATIVE>.
 * Otherwise the output data is reinterpret_cast to char * and the call
 * forwards to the pointer overload declared above.
 *
 * Only utf8_output.data() is used; the output size is never read here.
 * NOTE(review): presumably the caller must size the output for the whole
 * result - confirm.
 */
1787simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t
1788convert_utf16_to_utf8(
1789 std::span<const char16_t> utf16_input,
1790 detail::output_span_of_byte_like auto &&utf8_output) noexcept {
1791 #if SIMDUTF_CPLUSPLUS23
1792 if consteval {
1793 return scalar::utf16_to_utf8::convert<endianness::NATIVE>(
1794 utf16_input.data(), utf16_input.size(), utf8_output.data());
1795 } else
1796 #endif
1797 {
1798 return convert_utf16_to_utf8(utf16_input.data(), utf16_input.size(),
1799 reinterpret_cast<char *>(utf8_output.data()));
1800 }
1801}
1802 #endif // SIMDUTF_SPAN
1803
1804/**
1805 * Using native endianness, convert possibly broken UTF-16 string into UTF-8
1806 * string with output limit.
1807 *
1808 * We write as many characters as possible into the output buffer.
1809 *
1810 * During the conversion also validation of the input string is done.
1811 * This function is suitable to work with inputs from untrusted sources.
1812 *
1813 * This function is not BOM-aware.
1814 *
1815 *
1816 * @param input the UTF-16 string to convert
1817 * @param length the length of the string in 16-bit code units (char16_t)
1818 * @param utf8_output the pointer to buffer that can hold conversion result
1819 * @param utf8_len the maximum output length
1820 * @return the number of written char; 0 if conversion is not possible
1821 */
1822simdutf_warn_unused size_t convert_utf16_to_utf8_safe(const char16_t *input,
1823 size_t length,
1824 char *utf8_output,
1825 size_t utf8_len) noexcept;
1826 #if SIMDUTF_SPAN
/**
 * Span overload of convert_utf16_to_utf8_safe.
 *
 * Unlike the other span overloads, this one passes utf8_output.size() along
 * as the output limit.
 *
 * When SIMDUTF_CPLUSPLUS23 is set and the call is constant-evaluated, it
 * calls scalar::utf16_to_utf8::convert_with_errors<endianness::NATIVE, true>
 * and returns r.output_count when the error is SUCCESS or
 * OUTPUT_BUFFER_TOO_SMALL, and 0 for any other error. Otherwise the output
 * data is reinterpret_cast to char * and the call forwards to the pointer
 * overload declared above.
 */
1827simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t
1828convert_utf16_to_utf8_safe(
1829 std::span<const char16_t> utf16_input,
1830 detail::output_span_of_byte_like auto &&utf8_output) noexcept {
1831 // implementation note: outputspan is a forwarding ref to avoid copying
1832 // and allow both lvalues and rvalues. std::span can be copied without
1833 // problems, but std::vector should not, and this function should accept
1834 // both. it will allow using an owning rvalue ref (example: passing a
1835 // temporary std::string) as output, but the user will quickly find out
1836 // that they have no way of getting the data out of the object in that case.
1837 #if SIMDUTF_CPLUSPLUS23
1838 if consteval {
1839 const full_result r =
1840 scalar::utf16_to_utf8::convert_with_errors<endianness::NATIVE, true>(
1841 utf16_input.data(), utf16_input.size(), utf8_output.data(),
1842 utf8_output.size());
1843 if (r.error != error_code::SUCCESS &&
1844 r.error != error_code::OUTPUT_BUFFER_TOO_SMALL) {
1845 return 0;
1846 }
1847 return r.output_count;
1848 } else
1849 #endif
1850 {
1851 return convert_utf16_to_utf8_safe(
1852 utf16_input.data(), utf16_input.size(),
1853 reinterpret_cast<char *>(utf8_output.data()), utf8_output.size());
1854 }
1855}
1856 #endif // SIMDUTF_SPAN
1857#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
1858
1859#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1
1860/**
1861 * Using native endianness, convert possibly broken UTF-16 string into Latin1
1862 * string.
1863 *
1864 * During the conversion also validation of the input string is done.
1865 * This function is suitable to work with inputs from untrusted sources.
1866 *
1867 * This function is not BOM-aware.
1868 *
1869 * @param input the UTF-16 string to convert
1870 * @param length the length of the string in 2-byte code units (char16_t)
1871 * @param latin1_buffer the pointer to buffer that can hold conversion result
1872 * @return number of written code units; 0 if input is not a valid UTF-16 string
1873 * or if it cannot be represented as Latin1
1874 */
1875simdutf_warn_unused size_t convert_utf16_to_latin1(
1876 const char16_t *input, size_t length, char *latin1_buffer) noexcept;
1877 #if SIMDUTF_SPAN
/**
 * Span overload of convert_utf16_to_latin1.
 *
 * When SIMDUTF_CPLUSPLUS23 is set and the call is constant-evaluated, the
 * conversion is done by scalar::utf16_to_latin1::convert<endianness::NATIVE>.
 * Otherwise the output data is reinterpret_cast to char * and the call
 * forwards to the pointer overload declared above.
 *
 * Only latin1_output.data() is used; the output size is never read here.
 * NOTE(review): presumably the caller must size the output for the whole
 * result - confirm.
 */
1878simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t
1879convert_utf16_to_latin1(
1880 std::span<const char16_t> utf16_input,
1881 detail::output_span_of_byte_like auto &&latin1_output) noexcept {
1882 #if SIMDUTF_CPLUSPLUS23
1883 if consteval {
1884 return scalar::utf16_to_latin1::convert<endianness::NATIVE>(
1885 utf16_input.data(), utf16_input.size(), latin1_output.data());
1886 } else
1887 #endif
1888 {
1889 return convert_utf16_to_latin1(
1890 utf16_input.data(), utf16_input.size(),
1891 reinterpret_cast<char *>(latin1_output.data()));
1892 }
1893}
1894 #endif // SIMDUTF_SPAN
1895
1896/**
1897 * Convert possibly broken UTF-16LE string into Latin1 string.
1898 * If the string cannot be represented as Latin1, an error
1899 * is returned.
1900 *
1901 * During the conversion also validation of the input string is done.
1902 * This function is suitable to work with inputs from untrusted sources.
1903 *
1904 * This function is not BOM-aware.
1905 *
1906 * @param input the UTF-16LE string to convert
1907 * @param length the length of the string in 2-byte code units (char16_t)
1908 * @param latin1_buffer the pointer to buffer that can hold conversion result
1909 * @return number of written code units; 0 if input is not a valid UTF-16LE
1910 * string or if it cannot be represented as Latin1
1911 */
1912simdutf_warn_unused size_t convert_utf16le_to_latin1(
1913 const char16_t *input, size_t length, char *latin1_buffer) noexcept;
1914 #if SIMDUTF_SPAN
/**
 * Span overload of convert_utf16le_to_latin1.
 *
 * When SIMDUTF_CPLUSPLUS23 is set and the call is constant-evaluated, the
 * conversion is done by scalar::utf16_to_latin1::convert<endianness::LITTLE>.
 * Otherwise the output data is reinterpret_cast to char * and the call
 * forwards to the pointer overload declared above.
 *
 * Only latin1_output.data() is used; the output size is never read here.
 * NOTE(review): presumably the caller must size the output for the whole
 * result - confirm.
 */
1915simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t
1916convert_utf16le_to_latin1(
1917 std::span<const char16_t> utf16_input,
1918 detail::output_span_of_byte_like auto &&latin1_output) noexcept {
1919 #if SIMDUTF_CPLUSPLUS23
1920 if consteval {
1921 return scalar::utf16_to_latin1::convert<endianness::LITTLE>(
1922 utf16_input.data(), utf16_input.size(), latin1_output.data());
1923 } else
1924 #endif
1925 {
1926 return convert_utf16le_to_latin1(
1927 utf16_input.data(), utf16_input.size(),
1928 reinterpret_cast<char *>(latin1_output.data()));
1929 }
1930}
1931 #endif // SIMDUTF_SPAN
1932
1933/**
1934 * Convert possibly broken UTF-16BE string into Latin1 string.
1935 *
1936 * During the conversion also validation of the input string is done.
1937 * This function is suitable to work with inputs from untrusted sources.
1938 *
1939 * This function is not BOM-aware.
1940 *
1941 * @param input the UTF-16BE string to convert
1942 * @param length the length of the string in 2-byte code units (char16_t)
1943 * @param latin1_buffer the pointer to buffer that can hold conversion result
1944 * @return number of written code units; 0 if input is not a valid UTF-16BE
1945 * string or if it cannot be represented as Latin1
1946 */
1947simdutf_warn_unused size_t convert_utf16be_to_latin1(
1948 const char16_t *input, size_t length, char *latin1_buffer) noexcept;
1949 #if SIMDUTF_SPAN
/**
 * Span overload of convert_utf16be_to_latin1.
 *
 * When SIMDUTF_CPLUSPLUS23 is set and the call is constant-evaluated, the
 * conversion is done by scalar::utf16_to_latin1::convert<endianness::BIG>.
 * Otherwise the output data is reinterpret_cast to char * and the call
 * forwards to the pointer overload declared above.
 *
 * Only latin1_output.data() is used; the output size is never read here.
 * NOTE(review): presumably the caller must size the output for the whole
 * result - confirm.
 */
1950simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t
1951convert_utf16be_to_latin1(
1952 std::span<const char16_t> utf16_input,
1953 detail::output_span_of_byte_like auto &&latin1_output) noexcept {
1954 #if SIMDUTF_CPLUSPLUS23
1955 if consteval {
1956 return scalar::utf16_to_latin1::convert<endianness::BIG>(
1957 utf16_input.data(), utf16_input.size(), latin1_output.data());
1958 } else
1959 #endif
1960 {
1961 return convert_utf16be_to_latin1(
1962 utf16_input.data(), utf16_input.size(),
1963 reinterpret_cast<char *>(latin1_output.data()));
1964 }
1965}
1966 #endif // SIMDUTF_SPAN
1967#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1
1968
1969#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
1970/**
1971 * Convert possibly broken UTF-16LE string into UTF-8 string.
1972 *
1973 * During the conversion also validation of the input string is done.
1974 * This function is suitable to work with inputs from untrusted sources.
1975 *
1976 * This function is not BOM-aware.
1977 *
1978 * @param input the UTF-16LE string to convert
1979 * @param length the length of the string in 2-byte code units (char16_t)
1980 * @param utf8_buffer the pointer to buffer that can hold conversion result
1981 * @return number of written code units; 0 if input is not a valid UTF-16LE
1982 * string
1983 */
1984simdutf_warn_unused size_t convert_utf16le_to_utf8(const char16_t *input,
1985 size_t length,
1986 char *utf8_buffer) noexcept;
1987 #if SIMDUTF_SPAN
/**
 * Span overload of convert_utf16le_to_utf8.
 *
 * When SIMDUTF_CPLUSPLUS23 is set and the call is constant-evaluated, the
 * conversion is done by scalar::utf16_to_utf8::convert<endianness::LITTLE>.
 * Otherwise the output data is reinterpret_cast to char * and the call
 * forwards to the pointer overload declared above.
 *
 * Only utf8_output.data() is used; the output size is never read here.
 * NOTE(review): presumably the caller must size the output for the whole
 * result - confirm.
 */
1988simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t
1989convert_utf16le_to_utf8(
1990 std::span<const char16_t> utf16_input,
1991 detail::output_span_of_byte_like auto &&utf8_output) noexcept {
1992 #if SIMDUTF_CPLUSPLUS23
1993 if consteval {
1994 return scalar::utf16_to_utf8::convert<endianness::LITTLE>(
1995 utf16_input.data(), utf16_input.size(), utf8_output.data());
1996 } else
1997 #endif
1998 {
1999 return convert_utf16le_to_utf8(
2000 utf16_input.data(), utf16_input.size(),
2001 reinterpret_cast<char *>(utf8_output.data()));
2002 }
2003}
2004 #endif // SIMDUTF_SPAN
2005
2006/**
2007 * Convert possibly broken UTF-16BE string into UTF-8 string.
2008 *
2009 * During the conversion also validation of the input string is done.
2010 * This function is suitable to work with inputs from untrusted sources.
2011 *
2012 * This function is not BOM-aware.
2013 *
2014 * @param input the UTF-16BE string to convert
2015 * @param length the length of the string in 2-byte code units (char16_t)
2016 * @param utf8_buffer the pointer to buffer that can hold conversion result
2017 * @return number of written code units; 0 if input is not a valid UTF-16BE
2018 * string
2019 */
2020simdutf_warn_unused size_t convert_utf16be_to_utf8(const char16_t *input,
2021 size_t length,
2022 char *utf8_buffer) noexcept;
2023 #if SIMDUTF_SPAN
/**
 * Span overload of convert_utf16be_to_utf8.
 *
 * When SIMDUTF_CPLUSPLUS23 is set and the call is constant-evaluated, the
 * conversion is done by scalar::utf16_to_utf8::convert<endianness::BIG>.
 * Otherwise the output data is reinterpret_cast to char * and the call
 * forwards to the pointer overload declared above.
 *
 * Only utf8_output.data() is used; the output size is never read here.
 * NOTE(review): presumably the caller must size the output for the whole
 * result - confirm.
 */
2024simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t
2025convert_utf16be_to_utf8(
2026 std::span<const char16_t> utf16_input,
2027 detail::output_span_of_byte_like auto &&utf8_output) noexcept {
2028 #if SIMDUTF_CPLUSPLUS23
2029 if consteval {
2030 return scalar::utf16_to_utf8::convert<endianness::BIG>(
2031 utf16_input.data(), utf16_input.size(), utf8_output.data());
2032 } else
2033 #endif
2034 {
2035 return convert_utf16be_to_utf8(
2036 utf16_input.data(), utf16_input.size(),
2037 reinterpret_cast<char *>(utf8_output.data()));
2038 }
2039}
2040 #endif // SIMDUTF_SPAN
2041#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
2042
2043#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1
2044/**
2045 * Using native endianness, convert possibly broken UTF-16 string into Latin1
2046 * string.
2047 *
2048 * During the conversion also validation of the input string is done.
2049 * This function is suitable to work with inputs from untrusted sources.
2050 * This function is not BOM-aware.
2051 *
2052 * @param input the UTF-16 string to convert
2053 * @param length the length of the string in 2-byte code units (char16_t)
2054 * @param latin1_buffer the pointer to buffer that can hold conversion result
2055 * @return a result pair struct (of type simdutf::result containing the two
2056 * fields error and count) with an error code and either position of the error
2057 * (in the input in code units) if any, or the number of char written if
2058 * successful.
2059 */
2060simdutf_warn_unused result convert_utf16_to_latin1_with_errors(
2061 const char16_t *input, size_t length, char *latin1_buffer) noexcept;
2062 #if SIMDUTF_SPAN
/**
 * Span overload of convert_utf16_to_latin1_with_errors.
 *
 * When SIMDUTF_CPLUSPLUS23 is set and the call is constant-evaluated, the
 * result comes from
 * scalar::utf16_to_latin1::convert_with_errors<endianness::NATIVE>.
 * Otherwise the output data is reinterpret_cast to char * and the call
 * forwards to the pointer overload declared above.
 *
 * Only latin1_output.data() is used; the output size is never read here.
 * NOTE(review): presumably the caller must size the output for the whole
 * result - confirm.
 */
2063simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 result
2064convert_utf16_to_latin1_with_errors(
2065 std::span<const char16_t> utf16_input,
2066 detail::output_span_of_byte_like auto &&latin1_output) noexcept {
2067 #if SIMDUTF_CPLUSPLUS23
2068 if consteval {
2069 return scalar::utf16_to_latin1::convert_with_errors<endianness::NATIVE>(
2070 utf16_input.data(), utf16_input.size(), latin1_output.data());
2071 } else
2072 #endif
2073 {
2074 return convert_utf16_to_latin1_with_errors(
2075 utf16_input.data(), utf16_input.size(),
2076 reinterpret_cast<char *>(latin1_output.data()));
2077 }
2078}
2079 #endif // SIMDUTF_SPAN
2080
2081/**
2082 * Convert possibly broken UTF-16LE string into Latin1 string.
2083 *
2084 * During the conversion also validation of the input string is done.
2085 * This function is suitable to work with inputs from untrusted sources.
2086 * This function is not BOM-aware.
2087 *
2088 * @param input the UTF-16LE string to convert
2089 * @param length the length of the string in 2-byte code units (char16_t)
2090 * @param latin1_buffer the pointer to buffer that can hold conversion result
2091 * @return a result pair struct (of type simdutf::result containing the two
2092 * fields error and count) with an error code and either position of the error
2093 * (in the input in code units) if any, or the number of char written if
2094 * successful.
2095 */
2096simdutf_warn_unused result convert_utf16le_to_latin1_with_errors(
2097 const char16_t *input, size_t length, char *latin1_buffer) noexcept;
2098 #if SIMDUTF_SPAN
/**
 * Span overload of convert_utf16le_to_latin1_with_errors.
 *
 * When SIMDUTF_CPLUSPLUS23 is set and the call is constant-evaluated, the
 * result comes from
 * scalar::utf16_to_latin1::convert_with_errors<endianness::LITTLE>.
 * Otherwise the output data is reinterpret_cast to char * and the call
 * forwards to the pointer overload declared above.
 *
 * Only latin1_output.data() is used; the output size is never read here.
 * NOTE(review): presumably the caller must size the output for the whole
 * result - confirm.
 */
2099simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 result
2100convert_utf16le_to_latin1_with_errors(
2101 std::span<const char16_t> utf16_input,
2102 detail::output_span_of_byte_like auto &&latin1_output) noexcept {
2103 #if SIMDUTF_CPLUSPLUS23
2104 if consteval {
2105 return scalar::utf16_to_latin1::convert_with_errors<endianness::LITTLE>(
2106 utf16_input.data(), utf16_input.size(), latin1_output.data());
2107 } else
2108 #endif
2109 {
2110 return convert_utf16le_to_latin1_with_errors(
2111 utf16_input.data(), utf16_input.size(),
2112 reinterpret_cast<char *>(latin1_output.data()));
2113 }
2114}
2115 #endif // SIMDUTF_SPAN
2116
2117/**
2118 * Convert possibly broken UTF-16BE string into Latin1 string.
2119 * If the string cannot be represented as Latin1, an error
2120 * is returned.
2121 *
2122 * During the conversion also validation of the input string is done.
2123 * This function is suitable to work with inputs from untrusted sources.
2124 * This function is not BOM-aware.
2125 *
2126 * @param input the UTF-16BE string to convert
2127 * @param length the length of the string in 2-byte code units (char16_t)
2128 * @param latin1_buffer the pointer to buffer that can hold conversion result
2129 * @return a result pair struct (of type simdutf::result containing the two
2130 * fields error and count) with an error code and either position of the error
2131 * (in the input in code units) if any, or the number of char written if
2132 * successful.
2133 */
2134simdutf_warn_unused result convert_utf16be_to_latin1_with_errors(
2135 const char16_t *input, size_t length, char *latin1_buffer) noexcept;
2136 #if SIMDUTF_SPAN
/**
 * Span overload of convert_utf16be_to_latin1_with_errors.
 *
 * When SIMDUTF_CPLUSPLUS23 is set and the call is constant-evaluated, the
 * result comes from
 * scalar::utf16_to_latin1::convert_with_errors<endianness::BIG>.
 * Otherwise the output data is reinterpret_cast to char * and the call
 * forwards to the pointer overload declared above.
 *
 * Only latin1_output.data() is used; the output size is never read here.
 * NOTE(review): presumably the caller must size the output for the whole
 * result - confirm.
 */
2137simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 result
2138convert_utf16be_to_latin1_with_errors(
2139 std::span<const char16_t> utf16_input,
2140 detail::output_span_of_byte_like auto &&latin1_output) noexcept {
2141 #if SIMDUTF_CPLUSPLUS23
2142 if consteval {
2143 return scalar::utf16_to_latin1::convert_with_errors<endianness::BIG>(
2144 utf16_input.data(), utf16_input.size(), latin1_output.data());
2145 } else
2146 #endif
2147 {
2148 return convert_utf16be_to_latin1_with_errors(
2149 utf16_input.data(), utf16_input.size(),
2150 reinterpret_cast<char *>(latin1_output.data()));
2151 }
2152}
2153 #endif // SIMDUTF_SPAN
2154#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1
2155
2156#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
2157/**
2158 * Using native endianness, convert possibly broken UTF-16 string into UTF-8
2159 * string and stop on error.
2160 *
2161 * During the conversion also validation of the input string is done.
2162 * This function is suitable to work with inputs from untrusted sources.
2163 *
2164 * This function is not BOM-aware.
2165 *
2166 * @param input the UTF-16 string to convert
2167 * @param length the length of the string in 2-byte code units (char16_t)
2168 * @param utf8_buffer the pointer to buffer that can hold conversion result
2169 * @return a result pair struct (of type simdutf::result containing the two
2170 * fields error and count) with an error code and either position of the error
2171 * (in the input in code units) if any, or the number of char written if
2172 * successful.
2173 */
2174simdutf_warn_unused result convert_utf16_to_utf8_with_errors(
2175 const char16_t *input, size_t length, char *utf8_buffer) noexcept;
2176 #if SIMDUTF_SPAN
/**
 * Span overload of convert_utf16_to_utf8_with_errors.
 *
 * When SIMDUTF_CPLUSPLUS23 is set and the call is constant-evaluated, the
 * result comes from
 * scalar::utf16_to_utf8::convert_with_errors<endianness::NATIVE>.
 * Otherwise the output data is reinterpret_cast to char * and the call
 * forwards to the pointer overload declared above.
 *
 * Only utf8_output.data() is used; the output size is never read here.
 * NOTE(review): presumably the caller must size the output for the whole
 * result - confirm.
 */
2177simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 result
2178convert_utf16_to_utf8_with_errors(
2179 std::span<const char16_t> utf16_input,
2180 detail::output_span_of_byte_like auto &&utf8_output) noexcept {
2181 #if SIMDUTF_CPLUSPLUS23
2182 if consteval {
2183 return scalar::utf16_to_utf8::convert_with_errors<endianness::NATIVE>(
2184 utf16_input.data(), utf16_input.size(), utf8_output.data());
2185 } else
2186 #endif
2187 {
2188 return convert_utf16_to_utf8_with_errors(
2189 utf16_input.data(), utf16_input.size(),
2190 reinterpret_cast<char *>(utf8_output.data()));
2191 }
2192}
2193 #endif // SIMDUTF_SPAN
2194
2195/**
2196 * Convert possibly broken UTF-16LE string into UTF-8 string and stop on error.
2197 *
2198 * During the conversion also validation of the input string is done.
2199 * This function is suitable to work with inputs from untrusted sources.
2200 *
2201 * This function is not BOM-aware.
2202 *
2203 * @param input the UTF-16LE string to convert
2204 * @param length the length of the string in 2-byte code units (char16_t)
2205 * @param utf8_buffer the pointer to buffer that can hold conversion result
2206 * @return a result pair struct (of type simdutf::result containing the two
2207 * fields error and count) with an error code and either position of the error
2208 * (in the input in code units) if any, or the number of char written if
2209 * successful.
2210 */
2211simdutf_warn_unused result convert_utf16le_to_utf8_with_errors(
2212 const char16_t *input, size_t length, char *utf8_buffer) noexcept;
2213 #if SIMDUTF_SPAN
/**
 * Span overload of convert_utf16le_to_utf8_with_errors.
 *
 * When SIMDUTF_CPLUSPLUS23 is set and the call is constant-evaluated, the
 * result comes from
 * scalar::utf16_to_utf8::convert_with_errors<endianness::LITTLE>.
 * Otherwise the output data is reinterpret_cast to char * and the call
 * forwards to the pointer overload declared above.
 *
 * Only utf8_output.data() is used; the output size is never read here.
 * NOTE(review): presumably the caller must size the output for the whole
 * result - confirm.
 */
2214simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 result
2215convert_utf16le_to_utf8_with_errors(
2216 std::span<const char16_t> utf16_input,
2217 detail::output_span_of_byte_like auto &&utf8_output) noexcept {
2218 #if SIMDUTF_CPLUSPLUS23
2219 if consteval {
2220 return scalar::utf16_to_utf8::convert_with_errors<endianness::LITTLE>(
2221 utf16_input.data(), utf16_input.size(), utf8_output.data());
2222 } else
2223 #endif
2224 {
2225 return convert_utf16le_to_utf8_with_errors(
2226 utf16_input.data(), utf16_input.size(),
2227 reinterpret_cast<char *>(utf8_output.data()));
2228 }
2229}
2230 #endif // SIMDUTF_SPAN
2231
2232/**
2233 * Convert possibly broken UTF-16BE string into UTF-8 string and stop on error.
2234 *
2235 * During the conversion also validation of the input string is done.
2236 * This function is suitable to work with inputs from untrusted sources.
2237 *
2238 * This function is not BOM-aware.
2239 *
2240 * @param input the UTF-16BE string to convert
2241 * @param length the length of the string in 2-byte code units (char16_t)
2242 * @param utf8_buffer the pointer to buffer that can hold conversion result
2243 * @return a result pair struct (of type simdutf::result containing the two
2244 * fields error and count) with an error code and either position of the error
2245 * (in the input in code units) if any, or the number of char written if
2246 * successful.
2247 */
2248simdutf_warn_unused result convert_utf16be_to_utf8_with_errors(
2249 const char16_t *input, size_t length, char *utf8_buffer) noexcept;
2250 #if SIMDUTF_SPAN
/**
 * Span overload of convert_utf16be_to_utf8_with_errors.
 *
 * When SIMDUTF_CPLUSPLUS23 is set and the call is constant-evaluated, the
 * result comes from
 * scalar::utf16_to_utf8::convert_with_errors<endianness::BIG>.
 * Otherwise the output data is reinterpret_cast to char * and the call
 * forwards to the pointer overload declared above.
 *
 * Only utf8_output.data() is used; the output size is never read here.
 * NOTE(review): presumably the caller must size the output for the whole
 * result - confirm.
 */
2251simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 result
2252convert_utf16be_to_utf8_with_errors(
2253 std::span<const char16_t> utf16_input,
2254 detail::output_span_of_byte_like auto &&utf8_output) noexcept {
2255 #if SIMDUTF_CPLUSPLUS23
2256 if consteval {
2257 return scalar::utf16_to_utf8::convert_with_errors<endianness::BIG>(
2258 utf16_input.data(), utf16_input.size(), utf8_output.data());
2259 } else
2260 #endif
2261 {
2262 return convert_utf16be_to_utf8_with_errors(
2263 utf16_input.data(), utf16_input.size(),
2264 reinterpret_cast<char *>(utf8_output.data()));
2265 }
2266}
2267 #endif // SIMDUTF_SPAN
2268
2269/**
2270 * Convert possibly broken UTF-16LE string into UTF-8 string, replacing
2271 * unpaired surrogates with the Unicode replacement character U+FFFD.
2272 *
2273 * This function always succeeds: unpaired surrogates are replaced with
2274 * U+FFFD (3 bytes in UTF-8: 0xEF 0xBF 0xBD).
2275 *
2276 * This function is not BOM-aware.
2277 *
2278 * @param input the UTF-16LE string to convert
2279 * @param length the length of the string in 2-byte code units (char16_t)
2280 * @param utf8_buffer the pointer to buffer that can hold conversion result
2281 * @return number of written code units
2282 */
2283simdutf_warn_unused size_t convert_utf16le_to_utf8_with_replacement(
2284 const char16_t *input, size_t length, char *utf8_buffer) noexcept;
2285 #if SIMDUTF_SPAN
/**
 * Span overload of convert_utf16le_to_utf8_with_replacement.
 *
 * When SIMDUTF_CPLUSPLUS23 is set and the call is constant-evaluated, the
 * conversion is done by
 * scalar::utf16_to_utf8::convert_with_replacement<endianness::LITTLE>.
 * Otherwise the output data is reinterpret_cast to char * and the call
 * forwards to the pointer overload declared above.
 *
 * Only utf8_output.data() is used; the output size is never read here.
 * NOTE(review): presumably the caller must size the output for the whole
 * result - confirm.
 */
2286simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t
2287convert_utf16le_to_utf8_with_replacement(
2288 std::span<const char16_t> utf16_input,
2289 detail::output_span_of_byte_like auto &&utf8_output) noexcept {
2290 #if SIMDUTF_CPLUSPLUS23
2291 if consteval {
2292 return scalar::utf16_to_utf8::convert_with_replacement<endianness::LITTLE>(
2293 utf16_input.data(), utf16_input.size(), utf8_output.data());
2294 } else
2295 #endif
2296 {
2297 return convert_utf16le_to_utf8_with_replacement(
2298 utf16_input.data(), utf16_input.size(),
2299 reinterpret_cast<char *>(utf8_output.data()));
2300 }
2301}
2302 #endif // SIMDUTF_SPAN
2303
2304/**
2305 * Convert possibly broken UTF-16BE string into UTF-8 string, replacing
2306 * unpaired surrogates with the Unicode replacement character U+FFFD.
2307 *
2308 * This function always succeeds: unpaired surrogates are replaced with
2309 * U+FFFD (3 bytes in UTF-8: 0xEF 0xBF 0xBD).
2310 *
2311 * This function is not BOM-aware.
2312 *
2313 * @param input the UTF-16BE string to convert
2314 * @param length the length of the string in 2-byte code units (char16_t)
2315 * @param utf8_buffer the pointer to buffer that can hold conversion result
2316 * @return number of written code units
2317 */
2318simdutf_warn_unused size_t convert_utf16be_to_utf8_with_replacement(
2319 const char16_t *input, size_t length, char *utf8_buffer) noexcept;
2320 #if SIMDUTF_SPAN
/**
 * Span overload of convert_utf16be_to_utf8_with_replacement.
 *
 * When SIMDUTF_CPLUSPLUS23 is set and the call is constant-evaluated, the
 * conversion is done by
 * scalar::utf16_to_utf8::convert_with_replacement<endianness::BIG>.
 * Otherwise the output data is reinterpret_cast to char * and the call
 * forwards to the pointer overload declared above.
 *
 * Only utf8_output.data() is used; the output size is never read here.
 * NOTE(review): presumably the caller must size the output for the whole
 * result - confirm.
 */
2321simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t
2322convert_utf16be_to_utf8_with_replacement(
2323 std::span<const char16_t> utf16_input,
2324 detail::output_span_of_byte_like auto &&utf8_output) noexcept {
2325 #if SIMDUTF_CPLUSPLUS23
2326 if consteval {
2327 return scalar::utf16_to_utf8::convert_with_replacement<endianness::BIG>(
2328 utf16_input.data(), utf16_input.size(), utf8_output.data());
2329 } else
2330 #endif
2331 {
2332 return convert_utf16be_to_utf8_with_replacement(
2333 utf16_input.data(), utf16_input.size(),
2334 reinterpret_cast<char *>(utf8_output.data()));
2335 }
2336}
2337 #endif // SIMDUTF_SPAN
2338
2339/**
2340 * Convert possibly broken UTF-16 string (native endianness) into UTF-8 string,
2341 * replacing unpaired surrogates with the Unicode replacement character U+FFFD.
2342 *
2343 * This function always succeeds: unpaired surrogates are replaced with
2344 * U+FFFD (3 bytes in UTF-8: 0xEF 0xBF 0xBD).
2345 *
2346 * This function is not BOM-aware.
2347 *
2348 * @param input the UTF-16 string to convert
2349 * @param length the length of the string in 2-byte code units (char16_t)
2350 * @param utf8_buffer the pointer to buffer that can hold conversion result
2351 * @return number of written code units
2352 */
2353simdutf_warn_unused size_t convert_utf16_to_utf8_with_replacement(
2354 const char16_t *input, size_t length, char *utf8_buffer) noexcept;
2355 #if SIMDUTF_SPAN
/**
 * Span overload of convert_utf16_to_utf8_with_replacement.
 *
 * When SIMDUTF_CPLUSPLUS23 is set and the call is constant-evaluated, the
 * conversion is done by
 * scalar::utf16_to_utf8::convert_with_replacement<endianness::NATIVE>.
 * Otherwise the output data is reinterpret_cast to char * and the call
 * forwards to the pointer overload declared above.
 *
 * Only utf8_output.data() is used; the output size is never read here.
 * NOTE(review): presumably the caller must size the output for the whole
 * result - confirm.
 */
2356simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t
2357convert_utf16_to_utf8_with_replacement(
2358 std::span<const char16_t> utf16_input,
2359 detail::output_span_of_byte_like auto &&utf8_output) noexcept {
2360 #if SIMDUTF_CPLUSPLUS23
2361 if consteval {
2362 return scalar::utf16_to_utf8::convert_with_replacement<endianness::NATIVE>(
2363 utf16_input.data(), utf16_input.size(), utf8_output.data());
2364 } else
2365 #endif
2366 {
2367 return convert_utf16_to_utf8_with_replacement(
2368 utf16_input.data(), utf16_input.size(),
2369 reinterpret_cast<char *>(utf8_output.data()));
2370 }
2371}
2372 #endif // SIMDUTF_SPAN
2373#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
2374
2375#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
2376/**
2377 * Using native endianness, convert valid UTF-16 string into UTF-8 string.
2378 *
2379 * This function assumes that the input string is valid UTF-16.
2380 *
2381 * This function is not BOM-aware.
2382 *
2383 * @param input the UTF-16 string to convert
2384 * @param length the length of the string in 2-byte code units (char16_t)
2385 * @param utf8_buffer the pointer to a buffer that can hold the conversion
2386 * result
2387 * @return number of written code units; 0 if conversion is not possible
2388 */
2389simdutf_warn_unused size_t convert_valid_utf16_to_utf8(
2390 const char16_t *input, size_t length, char *utf8_buffer) noexcept;
2391 #if SIMDUTF_SPAN
/**
 * Span overload of convert_valid_utf16_to_utf8.
 *
 * When SIMDUTF_CPLUSPLUS23 is set and the call is constant-evaluated, the
 * conversion is done by
 * scalar::utf16_to_utf8::convert_valid<endianness::NATIVE>. Otherwise the
 * output data is reinterpret_cast to char * and the call forwards to the
 * pointer overload declared above.
 *
 * Only utf8_output.data() is used; the output size is never read here.
 * NOTE(review): presumably the caller must size the output for the whole
 * result - confirm.
 */
2392simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t
2393convert_valid_utf16_to_utf8(
2394 std::span<const char16_t> valid_utf16_input,
2395 detail::output_span_of_byte_like auto &&utf8_output) noexcept {
2396 #if SIMDUTF_CPLUSPLUS23
2397 if consteval {
2398 return scalar::utf16_to_utf8::convert_valid<endianness::NATIVE>(
2399 valid_utf16_input.data(), valid_utf16_input.size(), utf8_output.data());
2400 } else
2401 #endif
2402 {
2403 return convert_valid_utf16_to_utf8(
2404 valid_utf16_input.data(), valid_utf16_input.size(),
2405 reinterpret_cast<char *>(utf8_output.data()));
2406 }
2407}
2408 #endif // SIMDUTF_SPAN
2409#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
2410
2411#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1
2412/**
2413 * Using native endianness, convert UTF-16 string into Latin1 string.
2414 *
2415 * This function assumes that the input string is valid UTF-16 and that it can
2416 * be represented as Latin1. If you violate this assumption, the result is
2417 * implementation defined and may include system-dependent behavior such as
2418 * crashes.
2419 *
2420 * This function is for expert users only and not part of our public API. Use
2421 * convert_utf16_to_latin1 instead. The function may be removed from the library
2422 * in the future.
2423 *
2424 * This function is not BOM-aware.
2425 *
2426 * @param input the UTF-16 string to convert
2427 * @param length the length of the string in 2-byte code units (char16_t)
2428 * @param latin1_buffer the pointer to buffer that can hold conversion result
2429 * @return number of written code units; 0 if conversion is not possible
2430 */
2431simdutf_warn_unused size_t convert_valid_utf16_to_latin1(
2432 const char16_t *input, size_t length, char *latin1_buffer) noexcept;
2433 #if SIMDUTF_SPAN
/**
 * Span overload of convert_valid_utf16_to_latin1.
 *
 * When SIMDUTF_CPLUSPLUS23 is set and the call is constant-evaluated, the
 * conversion is done by
 * scalar::utf16_to_latin1::convert_valid_impl<endianness::NATIVE>, with the
 * input pointer wrapped by detail::constexpr_cast_ptr<uint16_t> and the
 * output pointer by detail::constexpr_cast_writeptr<char>. Otherwise the
 * output data is reinterpret_cast to char * and the call forwards to the
 * pointer overload declared above.
 *
 * Only latin1_output.data() is used; the output size is never read here.
 * NOTE(review): presumably the caller must size the output for the whole
 * result - confirm.
 */
2434simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t
2435convert_valid_utf16_to_latin1(
2436 std::span<const char16_t> valid_utf16_input,
2437 detail::output_span_of_byte_like auto &&latin1_output) noexcept {
2438 #if SIMDUTF_CPLUSPLUS23
2439 if consteval {
2440 return scalar::utf16_to_latin1::convert_valid_impl<endianness::NATIVE>(
2441 detail::constexpr_cast_ptr<uint16_t>(valid_utf16_input.data()),
2442 valid_utf16_input.size(),
2443 detail::constexpr_cast_writeptr<char>(latin1_output.data()));
2444 } else
2445 #endif
2446 {
2447 return convert_valid_utf16_to_latin1(
2448 valid_utf16_input.data(), valid_utf16_input.size(),
2449 reinterpret_cast<char *>(latin1_output.data()));
2450 }
2451}
2452 #endif // SIMDUTF_SPAN
2453
2454/**
2455 * Convert valid UTF-16LE string into Latin1 string.
2456 *
2457 * This function assumes that the input string is valid UTF-16LE and that it can
2458 * be represented as Latin1. If you violate this assumption, the result is
2459 * implementation defined and may include system-dependent behavior such as
2460 * crashes.
2461 *
2462 * This function is for expert users only and not part of our public API. Use
2463 * convert_utf16le_to_latin1 instead. The function may be removed from the
2464 * library in the future.
2465 *
2466 * This function is not BOM-aware.
2467 *
2468 * @param input the UTF-16LE string to convert
2469 * @param length the length of the string in 2-byte code units (char16_t)
2470 * @param latin1_buffer the pointer to buffer that can hold conversion result
2471 * @return number of written code units; 0 if conversion is not possible
2472 */
2473simdutf_warn_unused size_t convert_valid_utf16le_to_latin1(
2474 const char16_t *input, size_t length, char *latin1_buffer) noexcept;
2475 #if SIMDUTF_SPAN
2476simdutf_really_inline simdutf_constexpr23 simdutf_warn_unused size_t
2477convert_valid_utf16le_to_latin1(
2478 std::span<const char16_t> valid_utf16_input,
2479 detail::output_span_of_byte_like auto &&latin1_output) noexcept {
2480 #if SIMDUTF_CPLUSPLUS23
2481 if consteval {
2482 return scalar::utf16_to_latin1::convert_valid_impl<endianness::LITTLE>(
2483 detail::constexpr_cast_ptr<uint16_t>(valid_utf16_input.data()),
2484 valid_utf16_input.size(),
2485 detail::constexpr_cast_writeptr<char>(latin1_output.data()));
2486 } else
2487 #endif
2488 {
2489 return convert_valid_utf16le_to_latin1(
2490 valid_utf16_input.data(), valid_utf16_input.size(),
2491 reinterpret_cast<char *>(latin1_output.data()));
2492 }
2493}
2494 #endif // SIMDUTF_SPAN
2495
2496/**
2497 * Convert valid UTF-16BE string into Latin1 string.
2498 *
2499 * This function assumes that the input string is valid UTF-16BE and that it can
2500 * be represented as Latin1. If you violate this assumption, the result is
2501 * implementation defined and may include system-dependent behavior such as
2502 * crashes.
2503 *
2504 * This function is for expert users only and not part of our public API. Use
2505 * convert_utf16be_to_latin1 instead. The function may be removed from the
2506 * library in the future.
2507 *
2508 * This function is not BOM-aware.
2509 *
2510 * @param input the UTF-16BE string to convert
2511 * @param length the length of the string in 2-byte code units (char16_t)
2512 * @param latin1_buffer the pointer to buffer that can hold conversion result
2513 * @return number of written code units; 0 if conversion is not possible
2514 */
2515simdutf_warn_unused size_t convert_valid_utf16be_to_latin1(
2516 const char16_t *input, size_t length, char *latin1_buffer) noexcept;
2517 #if SIMDUTF_SPAN
2518simdutf_really_inline simdutf_constexpr23 simdutf_warn_unused size_t
2519convert_valid_utf16be_to_latin1(
2520 std::span<const char16_t> valid_utf16_input,
2521 detail::output_span_of_byte_like auto &&latin1_output) noexcept {
2522 #if SIMDUTF_CPLUSPLUS23
2523 if consteval {
2524 return scalar::utf16_to_latin1::convert_valid_impl<endianness::BIG>(
2525 detail::constexpr_cast_ptr<uint16_t>(valid_utf16_input.data()),
2526 valid_utf16_input.size(),
2527 detail::constexpr_cast_writeptr<char>(latin1_output.data()));
2528 } else
2529 #endif
2530 {
2531 return convert_valid_utf16be_to_latin1(
2532 valid_utf16_input.data(), valid_utf16_input.size(),
2533 reinterpret_cast<char *>(latin1_output.data()));
2534 }
2535}
2536 #endif // SIMDUTF_SPAN
2537#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1
2538
2539#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
2540/**
2541 * Convert valid UTF-16LE string into UTF-8 string.
2542 *
2543 * This function assumes that the input string is valid UTF-16LE.
2544 *
2545 * This function is not BOM-aware.
2546 *
2547 * @param input the UTF-16LE string to convert
2548 * @param length the length of the string in 2-byte code units (char16_t)
2549 * @param utf8_buffer the pointer to a buffer that can hold the conversion
2550 * result
2551 * @return number of written code units; 0 if conversion is not possible
2552 */
2553simdutf_warn_unused size_t convert_valid_utf16le_to_utf8(
2554 const char16_t *input, size_t length, char *utf8_buffer) noexcept;
2555 #if SIMDUTF_SPAN
2556simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t
2557convert_valid_utf16le_to_utf8(
2558 std::span<const char16_t> valid_utf16_input,
2559 detail::output_span_of_byte_like auto &&utf8_output) noexcept {
2560 #if SIMDUTF_CPLUSPLUS23
2561 if consteval {
2562 return scalar::utf16_to_utf8::convert_valid<endianness::NATIVE>(
2563 valid_utf16_input.data(), valid_utf16_input.size(), utf8_output.data());
2564 } else
2565 #endif
2566 {
2567 return convert_valid_utf16le_to_utf8(
2568 valid_utf16_input.data(), valid_utf16_input.size(),
2569 reinterpret_cast<char *>(utf8_output.data()));
2570 }
2571}
2572 #endif // SIMDUTF_SPAN
2573
2574/**
2575 * Convert valid UTF-16BE string into UTF-8 string.
2576 *
2577 * This function assumes that the input string is valid UTF-16BE.
2578 *
2579 * This function is not BOM-aware.
2580 *
2581 * @param input the UTF-16BE string to convert
2582 * @param length the length of the string in 2-byte code units (char16_t)
2583 * @param utf8_buffer the pointer to a buffer that can hold the conversion
2584 * result
2585 * @return number of written code units; 0 if conversion is not possible
2586 */
2587simdutf_warn_unused size_t convert_valid_utf16be_to_utf8(
2588 const char16_t *input, size_t length, char *utf8_buffer) noexcept;
2589 #if SIMDUTF_SPAN
2590simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t
2591convert_valid_utf16be_to_utf8(
2592 std::span<const char16_t> valid_utf16_input,
2593 detail::output_span_of_byte_like auto &&utf8_output) noexcept {
2594 #if SIMDUTF_CPLUSPLUS23
2595 if consteval {
2596 return scalar::utf16_to_utf8::convert_valid<endianness::BIG>(
2597 valid_utf16_input.data(), valid_utf16_input.size(), utf8_output.data());
2598 } else
2599 #endif
2600 {
2601 return convert_valid_utf16be_to_utf8(
2602 valid_utf16_input.data(), valid_utf16_input.size(),
2603 reinterpret_cast<char *>(utf8_output.data()));
2604 }
2605}
2606 #endif // SIMDUTF_SPAN
2607#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
2608
2609#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32
2610/**
2611 * Using native endianness, convert possibly broken UTF-16 string into UTF-32
2612 * string.
2613 *
2614 * During the conversion also validation of the input string is done.
2615 * This function is suitable to work with inputs from untrusted sources.
2616 *
2617 * This function is not BOM-aware.
2618 *
2619 * @param input the UTF-16 string to convert
2620 * @param length the length of the string in 2-byte code units (char16_t)
2621 * @param utf32_buffer the pointer to buffer that can hold conversion result
2622 * @return number of written code units; 0 if input is not a valid UTF-16
2623 * string
2624 */
2625simdutf_warn_unused size_t convert_utf16_to_utf32(
2626 const char16_t *input, size_t length, char32_t *utf32_buffer) noexcept;
2627 #if SIMDUTF_SPAN
2628simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t
2629convert_utf16_to_utf32(std::span<const char16_t> utf16_input,
2630 std::span<char32_t> utf32_output) noexcept {
2631
2632 #if SIMDUTF_CPLUSPLUS23
2633 if consteval {
2634 return scalar::utf16_to_utf32::convert<endianness::NATIVE>(
2635 utf16_input.data(), utf16_input.size(), utf32_output.data());
2636 } else
2637 #endif
2638 {
2639 return convert_utf16_to_utf32(utf16_input.data(), utf16_input.size(),
2640 utf32_output.data());
2641 }
2642}
2643 #endif // SIMDUTF_SPAN
2644
2645/**
2646 * Convert possibly broken UTF-16LE string into UTF-32 string.
2647 *
2648 * During the conversion also validation of the input string is done.
2649 * This function is suitable to work with inputs from untrusted sources.
2650 *
2651 * This function is not BOM-aware.
2652 *
2653 * @param input the UTF-16LE string to convert
2654 * @param length the length of the string in 2-byte code units (char16_t)
2655 * @param utf32_buffer the pointer to buffer that can hold conversion result
2656 * @return number of written code units; 0 if input is not a valid UTF-16LE
2657 * string
2658 */
2659simdutf_warn_unused size_t convert_utf16le_to_utf32(
2660 const char16_t *input, size_t length, char32_t *utf32_buffer) noexcept;
2661 #if SIMDUTF_SPAN
2662simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t
2663convert_utf16le_to_utf32(std::span<const char16_t> utf16_input,
2664 std::span<char32_t> utf32_output) noexcept {
2665 #if SIMDUTF_CPLUSPLUS23
2666 if consteval {
2667 return scalar::utf16_to_utf32::convert<endianness::LITTLE>(
2668 utf16_input.data(), utf16_input.size(), utf32_output.data());
2669 } else
2670 #endif
2671 {
2672 return convert_utf16le_to_utf32(utf16_input.data(), utf16_input.size(),
2673 utf32_output.data());
2674 }
2675}
2676 #endif // SIMDUTF_SPAN
2677
2678/**
2679 * Convert possibly broken UTF-16BE string into UTF-32 string.
2680 *
2681 * During the conversion also validation of the input string is done.
2682 * This function is suitable to work with inputs from untrusted sources.
2683 *
2684 * This function is not BOM-aware.
2685 *
2686 * @param input the UTF-16BE string to convert
2687 * @param length the length of the string in 2-byte code units (char16_t)
2688 * @param utf32_buffer the pointer to buffer that can hold conversion result
2689 * @return number of written code units; 0 if input is not a valid UTF-16BE
2690 * string
2691 */
2692simdutf_warn_unused size_t convert_utf16be_to_utf32(
2693 const char16_t *input, size_t length, char32_t *utf32_buffer) noexcept;
2694 #if SIMDUTF_SPAN
2695simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t
2696convert_utf16be_to_utf32(std::span<const char16_t> utf16_input,
2697 std::span<char32_t> utf32_output) noexcept {
2698 #if SIMDUTF_CPLUSPLUS23
2699 if consteval {
2700 return scalar::utf16_to_utf32::convert<endianness::BIG>(
2701 utf16_input.data(), utf16_input.size(), utf32_output.data());
2702 } else
2703 #endif
2704 {
2705 return convert_utf16be_to_utf32(utf16_input.data(), utf16_input.size(),
2706 utf32_output.data());
2707 }
2708}
2709 #endif // SIMDUTF_SPAN
2710
2711/**
2712 * Using native endianness, convert possibly broken UTF-16 string into
2713 * UTF-32 string and stop on error.
2714 *
2715 * During the conversion also validation of the input string is done.
2716 * This function is suitable to work with inputs from untrusted sources.
2717 *
2718 * This function is not BOM-aware.
2719 *
2720 * @param input the UTF-16 string to convert
2721 * @param length the length of the string in 2-byte code units (char16_t)
2722 * @param utf32_buffer the pointer to buffer that can hold conversion result
2723 * @return a result pair struct (of type simdutf::result containing the two
2724 * fields error and count) with an error code and either position of the error
2725 * (in the input in code units) if any, or the number of char32_t written if
2726 * successful.
2727 */
2728simdutf_warn_unused result convert_utf16_to_utf32_with_errors(
2729 const char16_t *input, size_t length, char32_t *utf32_buffer) noexcept;
2730 #if SIMDUTF_SPAN
2731simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 result
2732convert_utf16_to_utf32_with_errors(std::span<const char16_t> utf16_input,
2733 std::span<char32_t> utf32_output) noexcept {
2734 #if SIMDUTF_CPLUSPLUS23
2735 if consteval {
2736 return scalar::utf16_to_utf32::convert_with_errors<endianness::NATIVE>(
2737 utf16_input.data(), utf16_input.size(), utf32_output.data());
2738 } else
2739 #endif
2740 {
2741 return convert_utf16_to_utf32_with_errors(
2742 utf16_input.data(), utf16_input.size(), utf32_output.data());
2743 }
2744}
2745 #endif // SIMDUTF_SPAN
2746
2747/**
2748 * Convert possibly broken UTF-16LE string into UTF-32 string and stop on error.
2749 *
2750 * During the conversion also validation of the input string is done.
2751 * This function is suitable to work with inputs from untrusted sources.
2752 *
2753 * This function is not BOM-aware.
2754 *
2755 * @param input the UTF-16LE string to convert
2756 * @param length the length of the string in 2-byte code units (char16_t)
2757 * @param utf32_buffer the pointer to buffer that can hold conversion result
2758 * @return a result pair struct (of type simdutf::result containing the two
2759 * fields error and count) with an error code and either position of the error
2760 * (in the input in code units) if any, or the number of char32_t written if
2761 * successful.
2762 */
2763simdutf_warn_unused result convert_utf16le_to_utf32_with_errors(
2764 const char16_t *input, size_t length, char32_t *utf32_buffer) noexcept;
2765 #if SIMDUTF_SPAN
2766simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 result
2767convert_utf16le_to_utf32_with_errors(
2768 std::span<const char16_t> utf16_input,
2769 std::span<char32_t> utf32_output) noexcept {
2770 #if SIMDUTF_CPLUSPLUS23
2771 if consteval {
2772 return scalar::utf16_to_utf32::convert_with_errors<endianness::LITTLE>(
2773 utf16_input.data(), utf16_input.size(), utf32_output.data());
2774 } else
2775 #endif
2776 {
2777 return convert_utf16le_to_utf32_with_errors(
2778 utf16_input.data(), utf16_input.size(), utf32_output.data());
2779 }
2780}
2781 #endif // SIMDUTF_SPAN
2782
2783/**
2784 * Convert possibly broken UTF-16BE string into UTF-32 string and stop on error.
2785 *
2786 * During the conversion also validation of the input string is done.
2787 * This function is suitable to work with inputs from untrusted sources.
2788 *
2789 * This function is not BOM-aware.
2790 *
2791 * @param input the UTF-16BE string to convert
2792 * @param length the length of the string in 2-byte code units (char16_t)
2793 * @param utf32_buffer the pointer to buffer that can hold conversion result
2794 * @return a result pair struct (of type simdutf::result containing the two
2795 * fields error and count) with an error code and either position of the error
2796 * (in the input in code units) if any, or the number of char32_t written if
2797 * successful.
2798 */
2799simdutf_warn_unused result convert_utf16be_to_utf32_with_errors(
2800 const char16_t *input, size_t length, char32_t *utf32_buffer) noexcept;
2801 #if SIMDUTF_SPAN
2802simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 result
2803convert_utf16be_to_utf32_with_errors(
2804 std::span<const char16_t> utf16_input,
2805 std::span<char32_t> utf32_output) noexcept {
2806 #if SIMDUTF_CPLUSPLUS23
2807 if consteval {
2808 return scalar::utf16_to_utf32::convert_with_errors<endianness::BIG>(
2809 utf16_input.data(), utf16_input.size(), utf32_output.data());
2810 } else
2811 #endif
2812 {
2813 return convert_utf16be_to_utf32_with_errors(
2814 utf16_input.data(), utf16_input.size(), utf32_output.data());
2815 }
2816}
2817 #endif // SIMDUTF_SPAN
2818
2819/**
2820 * Using native endianness, convert valid UTF-16 string into UTF-32 string.
2821 *
2822 * This function assumes that the input string is valid UTF-16 (native
2823 * endianness).
2824 *
2825 * This function is not BOM-aware.
2826 *
2827 * @param input the UTF-16 string to convert
2828 * @param length the length of the string in 2-byte code units (char16_t)
2829 * @param utf32_buffer the pointer to a buffer that can hold the conversion
2830 * result
2831 * @return number of written code units; 0 if conversion is not possible
2832 */
2833simdutf_warn_unused size_t convert_valid_utf16_to_utf32(
2834 const char16_t *input, size_t length, char32_t *utf32_buffer) noexcept;
2835 #if SIMDUTF_SPAN
2836simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t
2837convert_valid_utf16_to_utf32(std::span<const char16_t> valid_utf16_input,
2838 std::span<char32_t> utf32_output) noexcept {
2839 #if SIMDUTF_CPLUSPLUS23
2840 if consteval {
2841 return scalar::utf16_to_utf32::convert_valid<endianness::NATIVE>(
2842 valid_utf16_input.data(), valid_utf16_input.size(),
2843 utf32_output.data());
2844 } else
2845 #endif
2846 {
2847 return convert_valid_utf16_to_utf32(valid_utf16_input.data(),
2848 valid_utf16_input.size(),
2849 utf32_output.data());
2850 }
2851}
2852 #endif // SIMDUTF_SPAN
2853
2854/**
2855 * Convert valid UTF-16LE string into UTF-32 string.
2856 *
2857 * This function assumes that the input string is valid UTF-16LE.
2858 *
2859 * This function is not BOM-aware.
2860 *
2861 * @param input the UTF-16LE string to convert
2862 * @param length the length of the string in 2-byte code units (char16_t)
2863 * @param utf32_buffer the pointer to a buffer that can hold the conversion
2864 * result
2865 * @return number of written code units; 0 if conversion is not possible
2866 */
2867simdutf_warn_unused size_t convert_valid_utf16le_to_utf32(
2868 const char16_t *input, size_t length, char32_t *utf32_buffer) noexcept;
2869 #if SIMDUTF_SPAN
2870simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t
2871convert_valid_utf16le_to_utf32(std::span<const char16_t> valid_utf16_input,
2872 std::span<char32_t> utf32_output) noexcept {
2873 #if SIMDUTF_CPLUSPLUS23
2874 if consteval {
2875 return scalar::utf16_to_utf32::convert_valid<endianness::LITTLE>(
2876 valid_utf16_input.data(), valid_utf16_input.size(),
2877 utf32_output.data());
2878 } else
2879 #endif
2880 {
2881 return convert_valid_utf16le_to_utf32(valid_utf16_input.data(),
2882 valid_utf16_input.size(),
2883 utf32_output.data());
2884 }
2885}
2886 #endif // SIMDUTF_SPAN
2887
2888/**
2889 * Convert valid UTF-16BE string into UTF-32 string.
2890 *
2891 * This function assumes that the input string is valid UTF-16BE.
2892 *
2893 * This function is not BOM-aware.
2894 *
2895 * @param input the UTF-16BE string to convert
2896 * @param length the length of the string in 2-byte code units (char16_t)
2897 * @param utf32_buffer the pointer to a buffer that can hold the conversion
2898 * result
2899 * @return number of written code units; 0 if conversion is not possible
2900 */
2901simdutf_warn_unused size_t convert_valid_utf16be_to_utf32(
2902 const char16_t *input, size_t length, char32_t *utf32_buffer) noexcept;
2903 #if SIMDUTF_SPAN
2904simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t
2905convert_valid_utf16be_to_utf32(std::span<const char16_t> valid_utf16_input,
2906 std::span<char32_t> utf32_output) noexcept {
2907 #if SIMDUTF_CPLUSPLUS23
2908 if consteval {
2909 return scalar::utf16_to_utf32::convert_valid<endianness::BIG>(
2910 valid_utf16_input.data(), valid_utf16_input.size(),
2911 utf32_output.data());
2912 } else
2913 #endif
2914 {
2915 return convert_valid_utf16be_to_utf32(valid_utf16_input.data(),
2916 valid_utf16_input.size(),
2917 utf32_output.data());
2918 }
2919}
2920 #endif // SIMDUTF_SPAN
2921#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32
2922
2923#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
2924/**
2925 * Using native endianness; compute the number of bytes that this UTF-16
2926 * string would require in UTF-8 format.
2927 *
2928 * This function does not validate the input. It is acceptable to pass invalid
2929 * UTF-16 strings but in such cases the result is implementation defined.
2930 *
2931 * @param input the UTF-16 string to convert
2932 * @param length the length of the string in 2-byte code units (char16_t)
2933 * @return the number of bytes required to encode the UTF-16 string as UTF-8
2934 */
2935simdutf_warn_unused size_t utf8_length_from_utf16(const char16_t *input,
2936 size_t length) noexcept;
2937 #if SIMDUTF_SPAN
2938simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t
2939utf8_length_from_utf16(std::span<const char16_t> valid_utf16_input) noexcept {
2940 #if SIMDUTF_CPLUSPLUS23
2941 if consteval {
2942 return scalar::utf16::utf8_length_from_utf16<endianness::NATIVE>(
2943 valid_utf16_input.data(), valid_utf16_input.size());
2944 } else
2945 #endif
2946 {
2947 return utf8_length_from_utf16(valid_utf16_input.data(),
2948 valid_utf16_input.size());
2949 }
2950}
2951 #endif // SIMDUTF_SPAN
2952
2953/**
2954 * Using native endianness; compute the number of bytes that this UTF-16
2955 * string would require in UTF-8 format even when the UTF-16 content contains
2956 * mismatched surrogates that have to be replaced by the replacement character
2957 * (0xFFFD).
2958 *
2959 * @param input the UTF-16 string to convert
2960 * @param length the length of the string in 2-byte code units (char16_t)
2961 * @return a result pair struct (of type simdutf::result containing the two
2962 * fields error and count) where the count is the number of bytes required to
2963 * encode the UTF-16 string as UTF-8, and the error code is either SUCCESS or
2964 * SURROGATE. The count is correct regardless of the error field.
2965 * When SURROGATE is returned, it does not indicate an error in the case of this
2966 * function: it indicates that at least one surrogate has been encountered: the
2967 * surrogates may be matched or not (thus this function does not validate). If
2968 * the returned error code is SUCCESS, then the input contains no surrogate, is
2969 * in the Basic Multilingual Plane, and is necessarily valid.
2970 */
2971simdutf_warn_unused result utf8_length_from_utf16_with_replacement(
2972 const char16_t *input, size_t length) noexcept;
2973 #if SIMDUTF_SPAN
2974simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 result
2975utf8_length_from_utf16_with_replacement(
2976 std::span<const char16_t> valid_utf16_input) noexcept {
2977 #if SIMDUTF_CPLUSPLUS23
2978 if consteval {
2979 return scalar::utf16::utf8_length_from_utf16_with_replacement<
2980 endianness::NATIVE>(valid_utf16_input.data(), valid_utf16_input.size());
2981 } else
2982 #endif
2983 {
2984 return utf8_length_from_utf16_with_replacement(valid_utf16_input.data(),
2985 valid_utf16_input.size());
2986 }
2987}
2988 #endif // SIMDUTF_SPAN
2989
2990/**
2991 * Compute the number of bytes that this UTF-16LE string would require in UTF-8
2992 * format.
2993 *
2994 * This function does not validate the input. It is acceptable to pass invalid
2995 * UTF-16 strings but in such cases the result is implementation defined.
2996 *
2997 * @param input the UTF-16LE string to convert
2998 * @param length the length of the string in 2-byte code units (char16_t)
2999 * @return the number of bytes required to encode the UTF-16LE string as UTF-8
3000 */
3001simdutf_warn_unused size_t utf8_length_from_utf16le(const char16_t *input,
3002 size_t length) noexcept;
3003 #if SIMDUTF_SPAN
3004simdutf_really_inline simdutf_constexpr23 simdutf_warn_unused size_t
3005utf8_length_from_utf16le(std::span<const char16_t> valid_utf16_input) noexcept {
3006 #if SIMDUTF_CPLUSPLUS23
3007 if consteval {
3008 return scalar::utf16::utf8_length_from_utf16<endianness::LITTLE>(
3009 valid_utf16_input.data(), valid_utf16_input.size());
3010 } else
3011 #endif
3012 {
3013 return utf8_length_from_utf16le(valid_utf16_input.data(),
3014 valid_utf16_input.size());
3015 }
3016}
3017 #endif // SIMDUTF_SPAN
3018
3019/**
3020 * Compute the number of bytes that this UTF-16BE string would require in UTF-8
3021 * format.
3022 *
3023 * This function does not validate the input. It is acceptable to pass invalid
3024 * UTF-16 strings but in such cases the result is implementation defined.
3025 *
3026 * @param input the UTF-16BE string to convert
3027 * @param length the length of the string in 2-byte code units (char16_t)
3028 * @return the number of bytes required to encode the UTF-16BE string as UTF-8
3029 */
3030simdutf_warn_unused size_t utf8_length_from_utf16be(const char16_t *input,
3031 size_t length) noexcept;
3032 #if SIMDUTF_SPAN
3033simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t
3034utf8_length_from_utf16be(std::span<const char16_t> valid_utf16_input) noexcept {
3035 #if SIMDUTF_CPLUSPLUS23
3036 if consteval {
3037 return scalar::utf16::utf8_length_from_utf16<endianness::BIG>(
3038 valid_utf16_input.data(), valid_utf16_input.size());
3039 } else
3040 #endif
3041 {
3042 return utf8_length_from_utf16be(valid_utf16_input.data(),
3043 valid_utf16_input.size());
3044 }
3045}
3046 #endif // SIMDUTF_SPAN
3047#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
3048
3049#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
3050/**
3051 * Convert possibly broken UTF-32 string into UTF-8 string.
3052 *
3053 * During the conversion also validation of the input string is done.
3054 * This function is suitable to work with inputs from untrusted sources.
3055 *
3056 * This function is not BOM-aware.
3057 *
3058 * @param input the UTF-32 string to convert
3059 * @param length the length of the string in 4-byte code units (char32_t)
3060 * @param utf8_buffer the pointer to buffer that can hold conversion result
3061 * @return number of written code units; 0 if input is not a valid UTF-32 string
3062 */
3063simdutf_warn_unused size_t convert_utf32_to_utf8(const char32_t *input,
3064 size_t length,
3065 char *utf8_buffer) noexcept;
3066 #if SIMDUTF_SPAN
3067simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t
3068convert_utf32_to_utf8(
3069 std::span<const char32_t> utf32_input,
3070 detail::output_span_of_byte_like auto &&utf8_output) noexcept {
3071 #if SIMDUTF_CPLUSPLUS23
3072 if consteval {
3073 return scalar::utf32_to_utf8::convert(
3074 utf32_input.data(), utf32_input.size(), utf8_output.data());
3075 } else
3076 #endif
3077 {
3078 return convert_utf32_to_utf8(utf32_input.data(), utf32_input.size(),
3079 reinterpret_cast<char *>(utf8_output.data()));
3080 }
3081}
3082 #endif // SIMDUTF_SPAN
3083
3084/**
3085 * Convert possibly broken UTF-32 string into UTF-8 string and stop on error.
3086 *
3087 * During the conversion also validation of the input string is done.
3088 * This function is suitable to work with inputs from untrusted sources.
3089 *
3090 * This function is not BOM-aware.
3091 *
3092 * @param input the UTF-32 string to convert
3093 * @param length the length of the string in 4-byte code units (char32_t)
3094 * @param utf8_buffer the pointer to buffer that can hold conversion result
3095 * @return a result pair struct (of type simdutf::result containing the two
3096 * fields error and count) with an error code and either position of the error
3097 * (in the input in code units) if any, or the number of char written if
3098 * successful.
3099 */
3100simdutf_warn_unused result convert_utf32_to_utf8_with_errors(
3101 const char32_t *input, size_t length, char *utf8_buffer) noexcept;
3102 #if SIMDUTF_SPAN
3103simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 result
3104convert_utf32_to_utf8_with_errors(
3105 std::span<const char32_t> utf32_input,
3106 detail::output_span_of_byte_like auto &&utf8_output) noexcept {
3107 #if SIMDUTF_CPLUSPLUS23
3108 if consteval {
3109 return scalar::utf32_to_utf8::convert_with_errors(
3110 utf32_input.data(), utf32_input.size(), utf8_output.data());
3111 } else
3112 #endif
3113 {
3114 return convert_utf32_to_utf8_with_errors(
3115 utf32_input.data(), utf32_input.size(),
3116 reinterpret_cast<char *>(utf8_output.data()));
3117 }
3118}
3119 #endif // SIMDUTF_SPAN
3120
3121/**
3122 * Convert valid UTF-32 string into UTF-8 string.
3123 *
3124 * This function assumes that the input string is valid UTF-32.
3125 *
3126 * This function is not BOM-aware.
3127 *
3128 * @param input the UTF-32 string to convert
3129 * @param length the length of the string in 4-byte code units (char32_t)
3130 * @param utf8_buffer the pointer to a buffer that can hold the conversion
3131 * result
3132 * @return number of written code units; 0 if conversion is not possible
3133 */
3134simdutf_warn_unused size_t convert_valid_utf32_to_utf8(
3135 const char32_t *input, size_t length, char *utf8_buffer) noexcept;
3136 #if SIMDUTF_SPAN
3137simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t
3138convert_valid_utf32_to_utf8(
3139 std::span<const char32_t> valid_utf32_input,
3140 detail::output_span_of_byte_like auto &&utf8_output) noexcept {
3141 #if SIMDUTF_CPLUSPLUS23
3142 if consteval {
3143 return scalar::utf32_to_utf8::convert_valid(
3144 valid_utf32_input.data(), valid_utf32_input.size(), utf8_output.data());
3145 } else
3146 #endif
3147 {
3148 return convert_valid_utf32_to_utf8(
3149 valid_utf32_input.data(), valid_utf32_input.size(),
3150 reinterpret_cast<char *>(utf8_output.data()));
3151 }
3152}
3153 #endif // SIMDUTF_SPAN
3154#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
3155
3156#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32
3157/**
3158 * Using native endianness, convert possibly broken UTF-32 string into a UTF-16
3159 * string.
3160 *
3161 * During the conversion also validation of the input string is done.
3162 * This function is suitable to work with inputs from untrusted sources.
3163 *
3164 * This function is not BOM-aware.
3165 *
3166 * @param input the UTF-32 string to convert
3167 * @param length the length of the string in 4-byte code units (char32_t)
3168 * @param utf16_buffer the pointer to buffer that can hold conversion result
3169 * @return number of written code units; 0 if input is not a valid UTF-32 string
3170 */
3171simdutf_warn_unused size_t convert_utf32_to_utf16(
3172 const char32_t *input, size_t length, char16_t *utf16_buffer) noexcept;
3173 #if SIMDUTF_SPAN
3174simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t
3175convert_utf32_to_utf16(std::span<const char32_t> utf32_input,
3176 std::span<char16_t> utf16_output) noexcept {
3177 #if SIMDUTF_CPLUSPLUS23
3178 if consteval {
3179 return scalar::utf32_to_utf16::convert<endianness::NATIVE>(
3180 utf32_input.data(), utf32_input.size(), utf16_output.data());
3181 } else
3182 #endif
3183 {
3184 return convert_utf32_to_utf16(utf32_input.data(), utf32_input.size(),
3185 utf16_output.data());
3186 }
3187}
3188 #endif // SIMDUTF_SPAN
3189
3190/**
3191 * Convert possibly broken UTF-32 string into UTF-16LE string.
3192 *
3193 * During the conversion also validation of the input string is done.
3194 * This function is suitable to work with inputs from untrusted sources.
3195 *
3196 * This function is not BOM-aware.
3197 *
3198 * @param input the UTF-32 string to convert
3199 * @param length the length of the string in 4-byte code units (char32_t)
3200 * @param utf16_buffer the pointer to buffer that can hold conversion result
3201 * @return number of written code units; 0 if input is not a valid UTF-32 string
3202 */
3203simdutf_warn_unused size_t convert_utf32_to_utf16le(
3204 const char32_t *input, size_t length, char16_t *utf16_buffer) noexcept;
3205 #if SIMDUTF_SPAN
3206simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t
3207convert_utf32_to_utf16le(std::span<const char32_t> utf32_input,
3208 std::span<char16_t> utf16_output) noexcept {
3209 #if SIMDUTF_CPLUSPLUS23
3210 if consteval {
3211 return scalar::utf32_to_utf16::convert<endianness::LITTLE>(
3212 utf32_input.data(), utf32_input.size(), utf16_output.data());
3213 } else
3214 #endif
3215 {
3216 return convert_utf32_to_utf16le(utf32_input.data(), utf32_input.size(),
3217 utf16_output.data());
3218 }
3219}
3220 #endif // SIMDUTF_SPAN
3221#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32
3222
3223#if SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1
3224/**
3225 * Convert possibly broken UTF-32 string into Latin1 string.
3226 *
3227 * During the conversion also validation of the input string is done.
3228 * This function is suitable to work with inputs from untrusted sources.
3229 *
3230 * This function is not BOM-aware.
3231 *
3232 * @param input the UTF-32 string to convert
3233 * @param length the length of the string in 4-byte code units (char32_t)
3234 * @param latin1_buffer the pointer to buffer that can hold conversion result
3235 * @return number of written code units; 0 if input is not a valid UTF-32 string
3236 * or if it cannot be represented as Latin1
3237 */
3238simdutf_warn_unused size_t convert_utf32_to_latin1(
3239 const char32_t *input, size_t length, char *latin1_buffer) noexcept;
3240 #if SIMDUTF_SPAN
// Span overload accepting any byte-like output span. At run time the output
// pointer is reinterpret_cast to char * and the call forwards to the
// pointer-based convert_utf32_to_latin1. Under SIMDUTF_CPLUSPLUS23, a
// constant-evaluated call instead runs scalar::utf32_to_latin1::convert on the
// unmodified output pointer. latin1_output.size() is never read here.
3241simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t
3242convert_utf32_to_latin1(
3243 std::span<const char32_t> utf32_input,
3244 detail::output_span_of_byte_like auto &&latin1_output) noexcept {
3245 #if SIMDUTF_CPLUSPLUS23
3246 if consteval {
3247 return scalar::utf32_to_latin1::convert(
3248 utf32_input.data(), utf32_input.size(), latin1_output.data());
3249 } else
3250 #endif
3251 {
3252 return convert_utf32_to_latin1(
3253 utf32_input.data(), utf32_input.size(),
3254 reinterpret_cast<char *>(latin1_output.data()));
3255 }
3256}
3257 #endif // SIMDUTF_SPAN
3258
3259/**
3260 * Convert possibly broken UTF-32 string into Latin1 string and stop on error.
3261 * If the string cannot be represented as Latin1, an error is returned.
3262 *
3263 * During the conversion also validation of the input string is done.
3264 * This function is suitable to work with inputs from untrusted sources.
3265 *
3266 * This function is not BOM-aware.
3267 *
3268 * @param input the UTF-32 string to convert
3269 * @param length the length of the string in 4-byte code units (char32_t)
3270 * @param latin1_buffer the pointer to buffer that can hold conversion result
3271 * @return a result pair struct (of type simdutf::result containing the two
3272 * fields error and count) with an error code and either position of the error
3273 * (in the input in code units) if any, or the number of char written if
3274 * successful.
3275 */
3276simdutf_warn_unused result convert_utf32_to_latin1_with_errors(
3277 const char32_t *input, size_t length, char *latin1_buffer) noexcept;
3278 #if SIMDUTF_SPAN
// Span overload accepting any byte-like output span; returns the same
// simdutf::result as the pointer-based function it forwards to. At run time
// the output pointer is reinterpret_cast to char *. Under SIMDUTF_CPLUSPLUS23,
// a constant-evaluated call instead runs
// scalar::utf32_to_latin1::convert_with_errors. latin1_output.size() is never
// read here.
3279simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 result
3280convert_utf32_to_latin1_with_errors(
3281 std::span<const char32_t> utf32_input,
3282 detail::output_span_of_byte_like auto &&latin1_output) noexcept {
3283 #if SIMDUTF_CPLUSPLUS23
3284 if consteval {
3285 return scalar::utf32_to_latin1::convert_with_errors(
3286 utf32_input.data(), utf32_input.size(), latin1_output.data());
3287 } else
3288 #endif
3289 {
3290 return convert_utf32_to_latin1_with_errors(
3291 utf32_input.data(), utf32_input.size(),
3292 reinterpret_cast<char *>(latin1_output.data()));
3293 }
3294}
3295 #endif // SIMDUTF_SPAN
3296
3297/**
3298 * Convert valid UTF-32 string into Latin1 string.
3299 *
3300 * This function assumes that the input string is valid UTF-32 and that it can
3301 * be represented as Latin1. If you violate this assumption, the result is
3302 * implementation defined and may include system-dependent behavior such as
3303 * crashes.
3304 *
3305 * This function is for expert users only and not part of our public API. Use
3306 * convert_utf32_to_latin1 instead. The function may be removed from the library
3307 * in the future.
3308 *
3309 * This function is not BOM-aware.
3310 *
3311 * @param input the UTF-32 string to convert
3312 * @param length the length of the string in 4-byte code units (char32_t)
3313 * @param latin1_buffer the pointer to a buffer that can hold the conversion
3314 * result
3315 * @return number of written code units; 0 if conversion is not possible
3316 */
3317simdutf_warn_unused size_t convert_valid_utf32_to_latin1(
3318 const char32_t *input, size_t length, char *latin1_buffer) noexcept;
3319 #if SIMDUTF_SPAN
// Span overload accepting any byte-like output span. At run time the output
// pointer is reinterpret_cast to char * and the call forwards to the
// pointer-based convert_valid_utf32_to_latin1. Under SIMDUTF_CPLUSPLUS23, a
// constant-evaluated call instead runs
// scalar::utf32_to_latin1::convert_valid, with both pointers adapted through
// detail::constexpr_cast_ptr / detail::constexpr_cast_writeptr.
// latin1_output.size() is never read here.
//
// The runtime block is attached to the `if consteval` with an explicit `else`
// so the two paths are mutually exclusive by construction.
3320simdutf_really_inline simdutf_constexpr23 simdutf_warn_unused size_t
3321convert_valid_utf32_to_latin1(
3322 std::span<const char32_t> valid_utf32_input,
3323 detail::output_span_of_byte_like auto &&latin1_output) noexcept {
3324 #if SIMDUTF_CPLUSPLUS23
3325 if consteval {
3326 return scalar::utf32_to_latin1::convert_valid(
3327 detail::constexpr_cast_ptr<uint32_t>(valid_utf32_input.data()),
3328 valid_utf32_input.size(),
3329 detail::constexpr_cast_writeptr<char>(latin1_output.data()));
3330 } else
3331 #endif
3332 {
3333 return convert_valid_utf32_to_latin1(
3334 valid_utf32_input.data(), valid_utf32_input.size(),
3335 reinterpret_cast<char *>(latin1_output.data()));
3336 }
3337}
3338 #endif // SIMDUTF_SPAN
3339
3340/**
3341 * Compute the number of bytes that this UTF-32 string would require in Latin1
3342 * format.
3343 *
3344 * This function does not validate the input. It is acceptable to pass invalid
3345 * UTF-32 strings but in such cases the result is implementation defined.
3346 *
3347 * This function is not BOM-aware.
3348 *
3349 * @param length the length of the string in 4-byte code units (char32_t)
3350 * @return the number of bytes required to encode the UTF-32 string as Latin1
3351 */
// Returns `length` unchanged: the Latin1 byte count equals the number of
// UTF-32 code units passed in. No input buffer is inspected.
3352simdutf_warn_unused simdutf_really_inline simdutf_constexpr23 size_t
3353latin1_length_from_utf32(size_t length) noexcept {
3354 return length;
3355}
3356
3357/**
3358 * Compute the number of 4-byte code units (char32_t) that this Latin1 string
3359 * would require in UTF-32 format.
3360 *
3361 * @param length the length of the string in Latin1 code units (char)
3362 * @return the length of the string in 4-byte code units (char32_t) required to
3363 * encode the Latin1 string as UTF-32
3364 */
// Returns `length` unchanged: the result is a count of char32_t code units
// equal to the number of Latin1 chars passed in. No input buffer is inspected.
3365simdutf_warn_unused simdutf_really_inline simdutf_constexpr23 size_t
3366utf32_length_from_latin1(size_t length) noexcept {
3367 return length;
3368}
3369#endif // SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1
3370
3371#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32
3372/**
3373 * Convert possibly broken UTF-32 string into UTF-16BE string.
3374 *
3375 * During the conversion also validation of the input string is done.
3376 * This function is suitable to work with inputs from untrusted sources.
3377 *
3378 * This function is not BOM-aware.
3379 *
3380 * @param input the UTF-32 string to convert
3381 * @param length the length of the string in 4-byte code units (char32_t)
3382 * @param utf16_buffer the pointer to buffer that can hold conversion result
3383 * @return number of written code units; 0 if input is not a valid UTF-32 string
3384 */
3385simdutf_warn_unused size_t convert_utf32_to_utf16be(
3386 const char32_t *input, size_t length, char16_t *utf16_buffer) noexcept;
3387 #if SIMDUTF_SPAN
// Span overload: forwards utf32_input.data()/size() and utf16_output.data() to
// the pointer-based convert_utf32_to_utf16be. Under SIMDUTF_CPLUSPLUS23, a
// constant-evaluated call instead runs
// scalar::utf32_to_utf16::convert<endianness::BIG>. utf16_output.size() is
// never read here, so no capacity check is made at this level.
3388simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t
3389convert_utf32_to_utf16be(std::span<const char32_t> utf32_input,
3390 std::span<char16_t> utf16_output) noexcept {
3391 #if SIMDUTF_CPLUSPLUS23
3392 if consteval {
3393 return scalar::utf32_to_utf16::convert<endianness::BIG>(
3394 utf32_input.data(), utf32_input.size(), utf16_output.data());
3395 } else
3396 #endif
3397 {
3398 return convert_utf32_to_utf16be(utf32_input.data(), utf32_input.size(),
3399 utf16_output.data());
3400 }
3401}
3402 #endif // SIMDUTF_SPAN
3403
3404/**
3405 * Using native endianness, convert possibly broken UTF-32 string into UTF-16
3406 * string and stop on error.
3407 *
3408 * During the conversion also validation of the input string is done.
3409 * This function is suitable to work with inputs from untrusted sources.
3410 *
3411 * This function is not BOM-aware.
3412 *
3413 * @param input the UTF-32 string to convert
3414 * @param length the length of the string in 4-byte code units (char32_t)
3415 * @param utf16_buffer the pointer to buffer that can hold conversion result
3416 * @return a result pair struct (of type simdutf::result containing the two
3417 * fields error and count) with an error code and either position of the error
3418 * (in the input in code units) if any, or the number of char16_t written if
3419 * successful.
3420 */
3421simdutf_warn_unused result convert_utf32_to_utf16_with_errors(
3422 const char32_t *input, size_t length, char16_t *utf16_buffer) noexcept;
3423 #if SIMDUTF_SPAN
// Span overload (native endianness); returns the same simdutf::result as the
// pointer-based function it forwards to. Under SIMDUTF_CPLUSPLUS23, a
// constant-evaluated call instead runs
// scalar::utf32_to_utf16::convert_with_errors<endianness::NATIVE>.
// utf16_output.size() is never read here.
3424simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 result
3425convert_utf32_to_utf16_with_errors(std::span<const char32_t> utf32_input,
3426 std::span<char16_t> utf16_output) noexcept {
3427 #if SIMDUTF_CPLUSPLUS23
3428 if consteval {
3429 return scalar::utf32_to_utf16::convert_with_errors<endianness::NATIVE>(
3430 utf32_input.data(), utf32_input.size(), utf16_output.data());
3431 } else
3432 #endif
3433 {
3434 return convert_utf32_to_utf16_with_errors(
3435 utf32_input.data(), utf32_input.size(), utf16_output.data());
3436 }
3437}
3438 #endif // SIMDUTF_SPAN
3439
3440/**
3441 * Convert possibly broken UTF-32 string into UTF-16LE string and stop on error.
3442 *
3443 * During the conversion also validation of the input string is done.
3444 * This function is suitable to work with inputs from untrusted sources.
3445 *
3446 * This function is not BOM-aware.
3447 *
3448 * @param input the UTF-32 string to convert
3449 * @param length the length of the string in 4-byte code units (char32_t)
3450 * @param utf16_buffer the pointer to buffer that can hold conversion result
3451 * @return a result pair struct (of type simdutf::result containing the two
3452 * fields error and count) with an error code and either position of the error
3453 * (in the input in code units) if any, or the number of char16_t written if
3454 * successful.
3455 */
3456simdutf_warn_unused result convert_utf32_to_utf16le_with_errors(
3457 const char32_t *input, size_t length, char16_t *utf16_buffer) noexcept;
3458 #if SIMDUTF_SPAN
// Span overload (UTF-16LE output); returns the same simdutf::result as the
// pointer-based function it forwards to. Under SIMDUTF_CPLUSPLUS23, a
// constant-evaluated call instead runs
// scalar::utf32_to_utf16::convert_with_errors<endianness::LITTLE>.
// utf16_output.size() is never read here.
3459simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 result
3460convert_utf32_to_utf16le_with_errors(
3461 std::span<const char32_t> utf32_input,
3462 std::span<char16_t> utf16_output) noexcept {
3463 #if SIMDUTF_CPLUSPLUS23
3464 if consteval {
3465 return scalar::utf32_to_utf16::convert_with_errors<endianness::LITTLE>(
3466 utf32_input.data(), utf32_input.size(), utf16_output.data());
3467 } else
3468 #endif
3469 {
3470 return convert_utf32_to_utf16le_with_errors(
3471 utf32_input.data(), utf32_input.size(), utf16_output.data());
3472 }
3473}
3474 #endif // SIMDUTF_SPAN
3475
3476/**
3477 * Convert possibly broken UTF-32 string into UTF-16BE string and stop on error.
3478 *
3479 * During the conversion also validation of the input string is done.
3480 * This function is suitable to work with inputs from untrusted sources.
3481 *
3482 * This function is not BOM-aware.
3483 *
3484 * @param input the UTF-32 string to convert
3485 * @param length the length of the string in 4-byte code units (char32_t)
3486 * @param utf16_buffer the pointer to buffer that can hold conversion result
3487 * @return a result pair struct (of type simdutf::result containing the two
3488 * fields error and count) with an error code and either position of the error
3489 * (in the input in code units) if any, or the number of char16_t written if
3490 * successful.
3491 */
3492simdutf_warn_unused result convert_utf32_to_utf16be_with_errors(
3493 const char32_t *input, size_t length, char16_t *utf16_buffer) noexcept;
3494 #if SIMDUTF_SPAN
// Span overload (UTF-16BE output); returns the same simdutf::result as the
// pointer-based function it forwards to. Under SIMDUTF_CPLUSPLUS23, a
// constant-evaluated call instead runs
// scalar::utf32_to_utf16::convert_with_errors<endianness::BIG>.
// utf16_output.size() is never read here.
3495simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 result
3496convert_utf32_to_utf16be_with_errors(
3497 std::span<const char32_t> utf32_input,
3498 std::span<char16_t> utf16_output) noexcept {
3499 #if SIMDUTF_CPLUSPLUS23
3500 if consteval {
3501 return scalar::utf32_to_utf16::convert_with_errors<endianness::BIG>(
3502 utf32_input.data(), utf32_input.size(), utf16_output.data());
3503 } else
3504 #endif
3505 {
3506 return convert_utf32_to_utf16be_with_errors(
3507 utf32_input.data(), utf32_input.size(), utf16_output.data());
3508 }
3509}
3510 #endif // SIMDUTF_SPAN
3511
3512/**
3513 * Using native endianness, convert valid UTF-32 string into a UTF-16 string.
3514 *
3515 * This function assumes that the input string is valid UTF-32.
3516 *
3517 * This function is not BOM-aware.
3518 *
3519 * @param input the UTF-32 string to convert
3520 * @param length the length of the string in 4-byte code units (char32_t)
3521 * @param utf16_buffer the pointer to a buffer that can hold the conversion
3522 * result
3523 * @return number of written code units; 0 if conversion is not possible
3524 */
3525simdutf_warn_unused size_t convert_valid_utf32_to_utf16(
3526 const char32_t *input, size_t length, char16_t *utf16_buffer) noexcept;
3527 #if SIMDUTF_SPAN
// Span overload (native endianness): forwards the input pointer/length and the
// output pointer to the pointer-based convert_valid_utf32_to_utf16. Under
// SIMDUTF_CPLUSPLUS23, a constant-evaluated call instead runs
// scalar::utf32_to_utf16::convert_valid<endianness::NATIVE>.
// utf16_output.size() is never read here.
3528simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t
3529convert_valid_utf32_to_utf16(std::span<const char32_t> valid_utf32_input,
3530 std::span<char16_t> utf16_output) noexcept {
3531
3532 #if SIMDUTF_CPLUSPLUS23
3533 if consteval {
3534 return scalar::utf32_to_utf16::convert_valid<endianness::NATIVE>(
3535 valid_utf32_input.data(), valid_utf32_input.size(),
3536 utf16_output.data());
3537 } else
3538 #endif
3539 {
3540 return convert_valid_utf32_to_utf16(valid_utf32_input.data(),
3541 valid_utf32_input.size(),
3542 utf16_output.data());
3543 }
3544}
3545 #endif // SIMDUTF_SPAN
3546
3547/**
3548 * Convert valid UTF-32 string into UTF-16LE string.
3549 *
3550 * This function assumes that the input string is valid UTF-32.
3551 *
3552 * This function is not BOM-aware.
3553 *
3554 * @param input the UTF-32 string to convert
3555 * @param length the length of the string in 4-byte code units (char32_t)
3556 * @param utf16_buffer the pointer to a buffer that can hold the conversion
3557 * result
3558 * @return number of written code units; 0 if conversion is not possible
3559 */
3560simdutf_warn_unused size_t convert_valid_utf32_to_utf16le(
3561 const char32_t *input, size_t length, char16_t *utf16_buffer) noexcept;
3562 #if SIMDUTF_SPAN
// Span overload (UTF-16LE output): forwards the input pointer/length and the
// output pointer to the pointer-based convert_valid_utf32_to_utf16le. Under
// SIMDUTF_CPLUSPLUS23, a constant-evaluated call instead runs
// scalar::utf32_to_utf16::convert_valid<endianness::LITTLE>.
// utf16_output.size() is never read here.
3563simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t
3564convert_valid_utf32_to_utf16le(std::span<const char32_t> valid_utf32_input,
3565 std::span<char16_t> utf16_output) noexcept {
3566 #if SIMDUTF_CPLUSPLUS23
3567 if consteval {
3568 return scalar::utf32_to_utf16::convert_valid<endianness::LITTLE>(
3569 valid_utf32_input.data(), valid_utf32_input.size(),
3570 utf16_output.data());
3571 } else
3572 #endif
3573 {
3574 return convert_valid_utf32_to_utf16le(valid_utf32_input.data(),
3575 valid_utf32_input.size(),
3576 utf16_output.data());
3577 }
3578}
3579 #endif // SIMDUTF_SPAN
3580
3581/**
3582 * Convert valid UTF-32 string into UTF-16BE string.
3583 *
3584 * This function assumes that the input string is valid UTF-32.
3585 *
3586 * This function is not BOM-aware.
3587 *
3588 * @param input the UTF-32 string to convert
3589 * @param length the length of the string in 4-byte code units (char32_t)
3590 * @param utf16_buffer the pointer to a buffer that can hold the conversion
3591 * result
3592 * @return number of written code units; 0 if conversion is not possible
3593 */
3594simdutf_warn_unused size_t convert_valid_utf32_to_utf16be(
3595 const char32_t *input, size_t length, char16_t *utf16_buffer) noexcept;
3596 #if SIMDUTF_SPAN
// Span overload (UTF-16BE output): forwards the input pointer/length and the
// output pointer to the pointer-based convert_valid_utf32_to_utf16be. Under
// SIMDUTF_CPLUSPLUS23, a constant-evaluated call instead runs
// scalar::utf32_to_utf16::convert_valid<endianness::BIG>.
// utf16_output.size() is never read here.
3597simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t
3598convert_valid_utf32_to_utf16be(std::span<const char32_t> valid_utf32_input,
3599 std::span<char16_t> utf16_output) noexcept {
3600 #if SIMDUTF_CPLUSPLUS23
3601 if consteval {
3602 return scalar::utf32_to_utf16::convert_valid<endianness::BIG>(
3603 valid_utf32_input.data(), valid_utf32_input.size(),
3604 utf16_output.data());
3605 } else
3606 #endif
3607 {
3608 return convert_valid_utf32_to_utf16be(valid_utf32_input.data(),
3609 valid_utf32_input.size(),
3610 utf16_output.data());
3611 }
3612}
3613 #endif // SIMDUTF_SPAN
3614#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32
3615
3616#if SIMDUTF_FEATURE_UTF16
3617/**
3618 * Change the endianness of the input. Can be used to go from UTF-16LE to
3619 * UTF-16BE or from UTF-16BE to UTF-16LE.
3620 *
3621 * This function does not validate the input.
3622 *
3623 * This function is not BOM-aware.
3624 *
3625 * @param input the UTF-16 string to process
3626 * @param length the length of the string in 2-byte code units (char16_t)
3627 * @param output the pointer to a buffer that can hold the conversion
3628 * result
3629 */
3630void change_endianness_utf16(const char16_t *input, size_t length,
3631 char16_t *output) noexcept;
3632 #if SIMDUTF_SPAN
// Span overload: the number of code units processed is utf16_input.size();
// utf16_output.size() is never read here. Forwards to the pointer-based
// change_endianness_utf16, or, for a constant-evaluated call under
// SIMDUTF_CPLUSPLUS23, to scalar::utf16::change_endianness_utf16. Both callees
// are used in `return` statements of this void function.
3633simdutf_really_inline simdutf_constexpr23 void
3634change_endianness_utf16(std::span<const char16_t> utf16_input,
3635 std::span<char16_t> utf16_output) noexcept {
3636 #if SIMDUTF_CPLUSPLUS23
3637 if consteval {
3638 return scalar::utf16::change_endianness_utf16(
3639 utf16_input.data(), utf16_input.size(), utf16_output.data());
3640 } else
3641 #endif
3642 {
3643 return change_endianness_utf16(utf16_input.data(), utf16_input.size(),
3644 utf16_output.data());
3645 }
3646}
3647 #endif // SIMDUTF_SPAN
3648#endif // SIMDUTF_FEATURE_UTF16
3649
3650#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
3651/**
3652 * Compute the number of bytes that this UTF-32 string would require in UTF-8
3653 * format.
3654 *
3655 * This function does not validate the input. It is acceptable to pass invalid
3656 * UTF-32 strings but in such cases the result is implementation defined.
3657 *
3658 * @param input the UTF-32 string to convert
3659 * @param length the length of the string in 4-byte code units (char32_t)
3660 * @return the number of bytes required to encode the UTF-32 string as UTF-8
3661 */
3662simdutf_warn_unused size_t utf8_length_from_utf32(const char32_t *input,
3663 size_t length) noexcept;
3664 #if SIMDUTF_SPAN
// Span overload: passes data()/size() to the pointer-based
// utf8_length_from_utf32, or, for a constant-evaluated call under
// SIMDUTF_CPLUSPLUS23, to scalar::utf32::utf8_length_from_utf32.
// NOTE(review): the parameter is named valid_utf32_input, but the pointer
// overload's documentation says invalid input is accepted (with an
// implementation-defined result).
3665simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t
3666utf8_length_from_utf32(std::span<const char32_t> valid_utf32_input) noexcept {
3667 #if SIMDUTF_CPLUSPLUS23
3668 if consteval {
3669 return scalar::utf32::utf8_length_from_utf32(valid_utf32_input.data(),
3670 valid_utf32_input.size());
3671 } else
3672 #endif
3673 {
3674 return utf8_length_from_utf32(valid_utf32_input.data(),
3675 valid_utf32_input.size());
3676 }
3677}
3678 #endif // SIMDUTF_SPAN
3679#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
3680
3681#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32
3682/**
3683 * Compute the number of two-byte code units that this UTF-32 string would
3684 * require in UTF-16 format.
3685 *
3686 * This function does not validate the input. It is acceptable to pass invalid
3687 * UTF-32 strings but in such cases the result is implementation defined.
3688 *
3689 * @param input the UTF-32 string to convert
3690 * @param length the length of the string in 4-byte code units (char32_t)
3691 * @return number of 2-byte code units needed to encode the string as UTF-16
3692 */
3693simdutf_warn_unused size_t utf16_length_from_utf32(const char32_t *input,
3694 size_t length) noexcept;
3695 #if SIMDUTF_SPAN
// Span overload: passes data()/size() to the pointer-based
// utf16_length_from_utf32, or, for a constant-evaluated call under
// SIMDUTF_CPLUSPLUS23, to scalar::utf32::utf16_length_from_utf32.
3696simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t
3697utf16_length_from_utf32(std::span<const char32_t> valid_utf32_input) noexcept {
3698 #if SIMDUTF_CPLUSPLUS23
3699 if consteval {
3700 return scalar::utf32::utf16_length_from_utf32(valid_utf32_input.data(),
3701 valid_utf32_input.size());
3702 } else
3703 #endif
3704 {
3705 return utf16_length_from_utf32(valid_utf32_input.data(),
3706 valid_utf32_input.size());
3707 }
3708}
3709 #endif // SIMDUTF_SPAN
3710
3711/**
3712 * Using native endianness, compute the number of 4-byte code units (char32_t)
3713 * that this UTF-16 string would require in UTF-32 format.
3714 *
3715 * This function is equivalent to count_utf16.
3716 *
3717 * This function does not validate the input. It is acceptable to pass invalid
3718 * UTF-16 strings but in such cases the result is implementation defined.
3719 *
3720 * This function is not BOM-aware.
3721 *
3722 * @param input the UTF-16 string to convert
3723 * @param length the length of the string in 2-byte code units (char16_t)
3724 * @return number of 4-byte code units needed to encode the string as UTF-32
3725 */
3726simdutf_warn_unused size_t utf32_length_from_utf16(const char16_t *input,
3727 size_t length) noexcept;
3728 #if SIMDUTF_SPAN
// Span overload (native endianness): passes data()/size() to the
// pointer-based utf32_length_from_utf16, or, for a constant-evaluated call
// under SIMDUTF_CPLUSPLUS23, to
// scalar::utf16::utf32_length_from_utf16<endianness::NATIVE>.
3729simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t
3730utf32_length_from_utf16(std::span<const char16_t> valid_utf16_input) noexcept {
3731 #if SIMDUTF_CPLUSPLUS23
3732 if consteval {
3733 return scalar::utf16::utf32_length_from_utf16<endianness::NATIVE>(
3734 valid_utf16_input.data(), valid_utf16_input.size());
3735 } else
3736 #endif
3737 {
3738 return utf32_length_from_utf16(valid_utf16_input.data(),
3739 valid_utf16_input.size());
3740 }
3741}
3742 #endif // SIMDUTF_SPAN
3743
3744/**
3745 * Compute the number of 4-byte code units (char32_t) that this UTF-16LE
3746 * string would require in UTF-32 format.
3747 *
3748 * This function is equivalent to count_utf16le.
3749 *
3750 * This function does not validate the input. It is acceptable to pass invalid
3751 * UTF-16 strings but in such cases the result is implementation defined.
3752 *
3753 * This function is not BOM-aware.
3754 *
3755 * @param input the UTF-16LE string to convert
3756 * @param length the length of the string in 2-byte code units (char16_t)
3757 * @return number of 4-byte code units needed to encode the string as UTF-32
3758 */
3759simdutf_warn_unused size_t utf32_length_from_utf16le(const char16_t *input,
3760 size_t length) noexcept;
3761 #if SIMDUTF_SPAN
// Span overload (UTF-16LE input): passes data()/size() to the pointer-based
// utf32_length_from_utf16le, or, for a constant-evaluated call under
// SIMDUTF_CPLUSPLUS23, to
// scalar::utf16::utf32_length_from_utf16<endianness::LITTLE>.
3762simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t
3763utf32_length_from_utf16le(
3764 std::span<const char16_t> valid_utf16_input) noexcept {
3765 #if SIMDUTF_CPLUSPLUS23
3766 if consteval {
3767 return scalar::utf16::utf32_length_from_utf16<endianness::LITTLE>(
3768 valid_utf16_input.data(), valid_utf16_input.size());
3769 } else
3770 #endif
3771 {
3772 return utf32_length_from_utf16le(valid_utf16_input.data(),
3773 valid_utf16_input.size());
3774 }
3775}
3776 #endif // SIMDUTF_SPAN
3777
3778/**
3779 * Compute the number of 4-byte code units (char32_t) that this UTF-16BE
3780 * string would require in UTF-32 format.
3781 *
3782 * This function is equivalent to count_utf16be.
3783 *
3784 * This function does not validate the input. It is acceptable to pass invalid
3785 * UTF-16 strings but in such cases the result is implementation defined.
3786 *
3787 * This function is not BOM-aware.
3788 *
3789 * @param input the UTF-16BE string to convert
3790 * @param length the length of the string in 2-byte code units (char16_t)
3791 * @return number of 4-byte code units needed to encode the string as UTF-32
3792 */
3793simdutf_warn_unused size_t utf32_length_from_utf16be(const char16_t *input,
3794 size_t length) noexcept;
3795 #if SIMDUTF_SPAN
// Span overload (UTF-16BE input): passes data()/size() to the pointer-based
// utf32_length_from_utf16be, or, for a constant-evaluated call under
// SIMDUTF_CPLUSPLUS23, to
// scalar::utf16::utf32_length_from_utf16<endianness::BIG>.
3796simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t
3797utf32_length_from_utf16be(
3798 std::span<const char16_t> valid_utf16_input) noexcept {
3799 #if SIMDUTF_CPLUSPLUS23
3800 if consteval {
3801 return scalar::utf16::utf32_length_from_utf16<endianness::BIG>(
3802 valid_utf16_input.data(), valid_utf16_input.size());
3803 } else
3804 #endif
3805 {
3806 return utf32_length_from_utf16be(valid_utf16_input.data(),
3807 valid_utf16_input.size());
3808 }
3809}
3810 #endif // SIMDUTF_SPAN
3811#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32
3812
3813#if SIMDUTF_FEATURE_UTF16
3814/**
3815 * Count the number of code points (characters) in the string assuming that
3816 * it is valid.
3817 *
3818 * This function assumes that the input string is valid UTF-16 (native
3819 * endianness). It is acceptable to pass invalid UTF-16 strings but in such
3820 * cases the result is implementation defined.
3821 *
3822 * This function is not BOM-aware.
3823 *
3824 * @param input the UTF-16 string to process
3825 * @param length the length of the string in 2-byte code units (char16_t)
3826 * @return number of code points
3827 */
3828simdutf_warn_unused size_t count_utf16(const char16_t *input,
3829 size_t length) noexcept;
3830 #if SIMDUTF_SPAN
// Span overload (native endianness): passes data()/size() to the
// pointer-based count_utf16, or, for a constant-evaluated call under
// SIMDUTF_CPLUSPLUS23, to
// scalar::utf16::count_code_points<endianness::NATIVE>.
3831simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t
3832count_utf16(std::span<const char16_t> valid_utf16_input) noexcept {
3833 #if SIMDUTF_CPLUSPLUS23
3834 if consteval {
3835 return scalar::utf16::count_code_points<endianness::NATIVE>(
3836 valid_utf16_input.data(), valid_utf16_input.size());
3837 } else
3838 #endif
3839 {
3840 return count_utf16(valid_utf16_input.data(), valid_utf16_input.size());
3841 }
3842}
3843 #endif // SIMDUTF_SPAN
3844
3845/**
3846 * Count the number of code points (characters) in the string assuming that
3847 * it is valid.
3848 *
3849 * This function assumes that the input string is valid UTF-16LE.
3850 * It is acceptable to pass invalid UTF-16 strings but in such cases
3851 * the result is implementation defined.
3852 *
3853 * This function is not BOM-aware.
3854 *
3855 * @param input the UTF-16LE string to process
3856 * @param length the length of the string in 2-byte code units (char16_t)
3857 * @return number of code points
3858 */
3859simdutf_warn_unused size_t count_utf16le(const char16_t *input,
3860 size_t length) noexcept;
3861 #if SIMDUTF_SPAN
// Span overload (UTF-16LE input): passes data()/size() to the pointer-based
// count_utf16le, or, for a constant-evaluated call under SIMDUTF_CPLUSPLUS23,
// to scalar::utf16::count_code_points<endianness::LITTLE>.
3862simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t
3863count_utf16le(std::span<const char16_t> valid_utf16_input) noexcept {
3864 #if SIMDUTF_CPLUSPLUS23
3865 if consteval {
3866 return scalar::utf16::count_code_points<endianness::LITTLE>(
3867 valid_utf16_input.data(), valid_utf16_input.size());
3868 } else
3869 #endif
3870 {
3871 return count_utf16le(valid_utf16_input.data(), valid_utf16_input.size());
3872 }
3873}
3874 #endif // SIMDUTF_SPAN
3875
3876/**
3877 * Count the number of code points (characters) in the string assuming that
3878 * it is valid.
3879 *
3880 * This function assumes that the input string is valid UTF-16BE.
3881 * It is acceptable to pass invalid UTF-16 strings but in such cases
3882 * the result is implementation defined.
3883 *
3884 * This function is not BOM-aware.
3885 *
3886 * @param input the UTF-16BE string to process
3887 * @param length the length of the string in 2-byte code units (char16_t)
3888 * @return number of code points
3889 */
3890simdutf_warn_unused size_t count_utf16be(const char16_t *input,
3891 size_t length) noexcept;
3892 #if SIMDUTF_SPAN
// Span overload (UTF-16BE input): passes data()/size() to the pointer-based
// count_utf16be, or, for a constant-evaluated call under SIMDUTF_CPLUSPLUS23,
// to scalar::utf16::count_code_points<endianness::BIG>.
3893simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t
3894count_utf16be(std::span<const char16_t> valid_utf16_input) noexcept {
3895 #if SIMDUTF_CPLUSPLUS23
3896 if consteval {
3897 return scalar::utf16::count_code_points<endianness::BIG>(
3898 valid_utf16_input.data(), valid_utf16_input.size());
3899 } else
3900 #endif
3901 {
3902 return count_utf16be(valid_utf16_input.data(), valid_utf16_input.size());
3903 }
3904}
3905 #endif // SIMDUTF_SPAN
3906#endif // SIMDUTF_FEATURE_UTF16
3907
3908#if SIMDUTF_FEATURE_UTF8
3909/**
3910 * Count the number of code points (characters) in the string assuming that
3911 * it is valid.
3912 *
3913 * This function assumes that the input string is valid UTF-8.
3914 * It is acceptable to pass invalid UTF-8 strings but in such cases
3915 * the result is implementation defined.
3916 *
3917 * @param input the UTF-8 string to process
3918 * @param length the length of the string in bytes
3919 * @return number of code points
3920 */
3921simdutf_warn_unused size_t count_utf8(const char *input,
3922 size_t length) noexcept;
3923 #if SIMDUTF_SPAN
// Span overload accepting any byte-like input span. At run time the data
// pointer is reinterpret_cast to const char * and the call forwards to the
// pointer-based count_utf8. Under SIMDUTF_CPLUSPLUS23, a constant-evaluated
// call instead runs scalar::utf8::count_code_points on the unmodified pointer.
3924simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t count_utf8(
3925 const detail::input_span_of_byte_like auto &valid_utf8_input) noexcept {
3926 #if SIMDUTF_CPLUSPLUS23
3927 if consteval {
3928 return scalar::utf8::count_code_points(valid_utf8_input.data(),
3929 valid_utf8_input.size());
3930 } else
3931 #endif
3932 {
3933 return count_utf8(reinterpret_cast<const char *>(valid_utf8_input.data()),
3934 valid_utf8_input.size());
3935 }
3936}
3937 #endif // SIMDUTF_SPAN
3938
3939/**
3940 * Given a valid UTF-8 string having a possibly truncated last character,
3941 * this function checks the end of string. If the last character is truncated
3942 * (or partial), then it returns a shorter length (shorter by 1 to 3 bytes) so
3943 * that the short UTF-8 strings only contain complete characters. If there is no
3944 * truncated character, the original length is returned.
3945 *
3946 * This function assumes that the input string is valid UTF-8, but possibly
3947 * truncated.
3948 *
3949 * @param input the UTF-8 string to process
3950 * @param length the length of the string in bytes
3951 * @return the length of the string in bytes, possibly shorter by 1 to 3 bytes
3952 */
3953simdutf_warn_unused size_t trim_partial_utf8(const char *input, size_t length);
3954 #if SIMDUTF_SPAN
// Span overload accepting any byte-like input span. At run time the data
// pointer is reinterpret_cast to const char * and the call forwards to the
// pointer-based trim_partial_utf8. Under SIMDUTF_CPLUSPLUS23, a
// constant-evaluated call instead runs scalar::utf8::trim_partial_utf8 on the
// unmodified pointer.
// NOTE(review): this overload is declared noexcept while the pointer-based
// prototype it calls is not.
3955simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t
3956trim_partial_utf8(
3957 const detail::input_span_of_byte_like auto &valid_utf8_input) noexcept {
3958 #if SIMDUTF_CPLUSPLUS23
3959 if consteval {
3960 return scalar::utf8::trim_partial_utf8(valid_utf8_input.data(),
3961 valid_utf8_input.size());
3962 } else
3963 #endif
3964 {
3965 return trim_partial_utf8(
3966 reinterpret_cast<const char *>(valid_utf8_input.data()),
3967 valid_utf8_input.size());
3968 }
3969}
3970 #endif // SIMDUTF_SPAN
3971#endif // SIMDUTF_FEATURE_UTF8
3972
3973#if SIMDUTF_FEATURE_UTF16
3974/**
3975 * Given a valid UTF-16BE string having a possibly truncated last character,
3976 * this function checks the end of string. If the last character is truncated
3977 * (or partial), then it returns a shorter length (shorter by 1 unit) so that
3978 * the short UTF-16BE strings only contain complete characters. If there is no
3979 * truncated character, the original length is returned.
3980 *
3981 * This function assumes that the input string is valid UTF-16BE, but possibly
3982 * truncated.
3983 *
3984 * @param input the UTF-16BE string to process
3985 * @param length the length of the string in 2-byte code units (char16_t)
3986 * @return the length in 2-byte code units, possibly shorter by 1 unit
3987 */
3988simdutf_warn_unused size_t trim_partial_utf16be(const char16_t *input,
3989 size_t length);
3990 #if SIMDUTF_SPAN
// Span overload (UTF-16BE input): the length passed on is
// valid_utf16_input.size(), i.e. a count of char16_t elements. Forwards to the
// pointer-based trim_partial_utf16be, or, for a constant-evaluated call under
// SIMDUTF_CPLUSPLUS23, to scalar::utf16::trim_partial_utf16<endianness::BIG>.
// NOTE(review): this overload is declared noexcept while the pointer-based
// prototype it calls is not.
3991simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t
3992trim_partial_utf16be(std::span<const char16_t> valid_utf16_input) noexcept {
3993 #if SIMDUTF_CPLUSPLUS23
3994 if consteval {
3995 return scalar::utf16::trim_partial_utf16<endianness::BIG>(
3996 valid_utf16_input.data(), valid_utf16_input.size());
3997 } else
3998 #endif
3999 {
4000 return trim_partial_utf16be(valid_utf16_input.data(),
4001 valid_utf16_input.size());
4002 }
4003}
4004 #endif // SIMDUTF_SPAN
4005
4006/**
4007 * Given a valid UTF-16LE string having a possibly truncated last character,
4008 * this function checks the end of string. If the last character is truncated
4009 * (or partial), then it returns a shorter length (shorter by 1 unit) so that
4010 * the short UTF-16LE strings only contain complete characters. If there is no
4011 * truncated character, the original length is returned.
4012 *
4013 * This function assumes that the input string is valid UTF-16LE, but possibly
4014 * truncated.
4015 *
4016 * @param input the UTF-16LE string to process
4017 * @param length the length of the string in 2-byte code units (char16_t)
4018 * @return the length in 2-byte code units, possibly shorter by 1 unit
4019 */
4020simdutf_warn_unused size_t trim_partial_utf16le(const char16_t *input,
4021 size_t length);
4022 #if SIMDUTF_SPAN
// Span overload (UTF-16LE input): the length passed on is
// valid_utf16_input.size(), i.e. a count of char16_t elements. Forwards to the
// pointer-based trim_partial_utf16le, or, for a constant-evaluated call under
// SIMDUTF_CPLUSPLUS23, to
// scalar::utf16::trim_partial_utf16<endianness::LITTLE>.
// NOTE(review): this overload is declared noexcept while the pointer-based
// prototype it calls is not.
4023simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t
4024trim_partial_utf16le(std::span<const char16_t> valid_utf16_input) noexcept {
4025 #if SIMDUTF_CPLUSPLUS23
4026 if consteval {
4027 return scalar::utf16::trim_partial_utf16<endianness::LITTLE>(
4028 valid_utf16_input.data(), valid_utf16_input.size());
4029 } else
4030 #endif
4031 {
4032 return trim_partial_utf16le(valid_utf16_input.data(),
4033 valid_utf16_input.size());
4034 }
4035}
4036 #endif // SIMDUTF_SPAN
4037
4038/**
4039 * Given a valid UTF-16 string having a possibly truncated last character,
4040 * this function checks the end of string. If the last character is truncated
4041 * (or partial), then it returns a shorter length (shorter by 1 unit) so that
4042 * the short UTF-16 strings only contain complete characters. If there is no
4043 * truncated character, the original length is returned.
4044 *
4045 * This function assumes that the input string is valid UTF-16, but possibly
4046 * truncated. We use the native endianness.
4047 *
4048 * @param input the UTF-16 string to process
4049 * @param length the length of the string in 16-bit units
4050 * @return the length of the string in units, possibly shorter by 1 unit
4051 */
4052simdutf_warn_unused size_t trim_partial_utf16(const char16_t *input,
4053 size_t length);
4054 #if SIMDUTF_SPAN
4055simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t
4056trim_partial_utf16(std::span<const char16_t> valid_utf16_input) noexcept {
4057 #if SIMDUTF_CPLUSPLUS23
4058 if consteval {
4059 return scalar::utf16::trim_partial_utf16<endianness::NATIVE>(
4060 valid_utf16_input.data(), valid_utf16_input.size());
4061 } else
4062 #endif
4063 {
4064 return trim_partial_utf16(valid_utf16_input.data(),
4065 valid_utf16_input.size());
4066 }
4067}
4068 #endif // SIMDUTF_SPAN
4069#endif // SIMDUTF_FEATURE_UTF16
4070
// Define SIMDUTF_NEED_TRAILING_ZEROES to 1, unless it is already defined, when
// the base64, UTF-16 or encoding-detection feature is enabled.
#if (SIMDUTF_FEATURE_BASE64 || SIMDUTF_FEATURE_UTF16 ||                        \
     SIMDUTF_FEATURE_DETECT_ENCODING) &&                                       \
    !defined(SIMDUTF_NEED_TRAILING_ZEROES)
  #define SIMDUTF_NEED_TRAILING_ZEROES 1
#endif // feature check && !defined(SIMDUTF_NEED_TRAILING_ZEROES)
4078
4079#if SIMDUTF_FEATURE_BASE64
// base64_options select the base64 variant used by the base64 functions.
// ASCII spaces are ' ', '\t', '\n', '\r', '\f'.
// Garbage characters are characters that belong neither to the base64
// alphabet nor to the ASCII spaces.

// Modifier for base64_default and base64_url.
constexpr uint64_t base64_reverse_padding = 2;

enum base64_options : uint64_t {
  // standard base64 format (with padding)
  base64_default = 0,
  // base64url format (no padding)
  base64_url = 1,
  // standard base64 format without padding
  base64_default_no_padding = base64_default | base64_reverse_padding,
  // base64url with padding
  base64_url_with_padding = base64_url | base64_reverse_padding,
  // standard base64 format accepting garbage characters, the input stops
  // with the first '=' if any
  base64_default_accept_garbage = 4,
  // base64url format accepting garbage characters, the input stops with the
  // first '=' if any
  base64_url_accept_garbage = 5,
  // standard/base64url hybrid format (only meaningful for decoding!)
  base64_default_or_url = 8,
  // standard/base64url hybrid format accepting garbage characters (only
  // meaningful for decoding!), the input stops with the first '=' if any
  base64_default_or_url_accept_garbage = 12,
};
4107
// last_chunk_handling_options specify how the last chunk is handled in
// base64 decoding.
// https://tc39.es/proposal-arraybuffer-base64/spec/#sec-frombase64
enum last_chunk_handling_options : uint64_t {
  // standard base64 format, decode partial final chunk
  loose = 0,
  // error when the last chunk is partial, 2 or 3 chars, and unpadded, or
  // non-zero bit padding
  strict = 1,
  // if the last chunk is partial, ignore it (no error)
  stop_before_partial = 2,
  // only decode full blocks (4 base64 characters, no padding)
  only_full_chunks = 3
};
4120
4121inline simdutf_constexpr23 bool
4122is_partial(last_chunk_handling_options options) {
4123 return (options == stop_before_partial) || (options == only_full_chunks);
4124}
4125
4126namespace detail {
4127simdutf_warn_unused const char *find(const char *start, const char *end,
4128 char character) noexcept;
4129simdutf_warn_unused const char16_t *
4130find(const char16_t *start, const char16_t *end, char16_t character) noexcept;
4131} // namespace detail
4132
4133/**
4134 * Find the first occurrence of a character in a string. If the character is
4135 * not found, return a pointer to the end of the string.
4136 * @param start the start of the string
4137 * @param end the end of the string
4138 * @param character the character to find
4139 * @return a pointer to the first occurrence of the character in the string,
4140 * or a pointer to the end of the string if the character is not found.
4141 *
4142 */
4143simdutf_warn_unused simdutf_really_inline simdutf_constexpr23 const char *
4144find(const char *start, const char *end, char character) noexcept {
4145 #if SIMDUTF_CPLUSPLUS23
4146 if consteval {
4147 for (; start != end; ++start)
4148 if (*start == character)
4149 return start;
4150 return end;
4151 } else
4152 #endif
4153 {
4154 return detail::find(start, end, character);
4155 }
4156}
4157simdutf_warn_unused simdutf_really_inline simdutf_constexpr23 const char16_t *
4158find(const char16_t *start, const char16_t *end, char16_t character) noexcept {
4159 // implementation note: this is repeated instead of a template, to ensure
4160 // the api is still a function and compiles without concepts
4161 #if SIMDUTF_CPLUSPLUS23
4162 if consteval {
4163 for (; start != end; ++start)
4164 if (*start == character)
4165 return start;
4166 return end;
4167 } else
4168 #endif
4169 {
4170 return detail::find(start, end, character);
4171 }
4172}
4173}
4174 // We include base64_tables once.
4175 #include <simdutf/base64_tables.h>
4176 #include <simdutf/scalar/base64.h>
4177
4178namespace simdutf {
4179
4180inline std::string_view to_string(base64_options options) {
4181 switch (options) {
4182 case base64_default:
4183 return "base64_default";
4184 case base64_url:
4185 return "base64_url";
4186 case base64_reverse_padding:
4187 return "base64_reverse_padding";
4188 case base64_url_with_padding:
4189 return "base64_url_with_padding";
4190 case base64_default_accept_garbage:
4191 return "base64_default_accept_garbage";
4192 case base64_url_accept_garbage:
4193 return "base64_url_accept_garbage";
4194 case base64_default_or_url:
4195 return "base64_default_or_url";
4196 case base64_default_or_url_accept_garbage:
4197 return "base64_default_or_url_accept_garbage";
4198 }
4199 return "<unknown>";
4200}
4201
4202inline std::string_view to_string(last_chunk_handling_options options) {
4203 switch (options) {
4204 case loose:
4205 return "loose";
4206 case strict:
4207 return "strict";
4208 case stop_before_partial:
4209 return "stop_before_partial";
4210 case only_full_chunks:
4211 return "only_full_chunks";
4212 }
4213 return "<unknown>";
4214}
4215
4216/**
4217 * Provide the maximal binary length in bytes given the base64 input.
4218 * As long as the input does not contain ignorable characters (e.g., ASCII
4219 * spaces or linefeed characters), the result is exact. In particular, the
4220 * function checks for padding characters.
4221 *
4222 * The function is fast (constant time). It checks up to two characters at
4223 * the end of the string. The input is not otherwise validated or read.
4224 *
4225 * @param input the base64 input to process
4226 * @param length the length of the base64 input in bytes
4227 * @return maximum number of binary bytes
4228 */
4229simdutf_warn_unused size_t
4230maximal_binary_length_from_base64(const char *input, size_t length) noexcept;
4231 #if SIMDUTF_SPAN
4232simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t
4233maximal_binary_length_from_base64(
4234 const detail::input_span_of_byte_like auto &input) noexcept {
4235 #if SIMDUTF_CPLUSPLUS23
4236 if consteval {
4237 return scalar::base64::maximal_binary_length_from_base64(
4238 detail::constexpr_cast_ptr<uint8_t>(input.data()), input.size());
4239 } else
4240 #endif
4241 {
4242 return maximal_binary_length_from_base64(
4243 reinterpret_cast<const char *>(input.data()), input.size());
4244 }
4245}
4246 #endif // SIMDUTF_SPAN
4247
4248/**
4249 * Provide the maximal binary length in bytes given the base64 input.
4250 * As long as the input does not contain ignorable characters (e.g., ASCII
4251 * spaces or linefeed characters), the result is exact. In particular, the
4252 * function checks for padding characters.
4253 *
4254 * The function is fast (constant time). It checks up to two characters at
4255 * the end of the string. The input is not otherwise validated or read.
4256 *
4257 * @param input the base64 input to process, in ASCII stored as 16-bit
4258 * units
4259 * @param length the length of the base64 input in 16-bit units
4260 * @return maximal number of binary bytes
4261 */
4262simdutf_warn_unused size_t maximal_binary_length_from_base64(
4263 const char16_t *input, size_t length) noexcept;
4264 #if SIMDUTF_SPAN
4265simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t
4266maximal_binary_length_from_base64(std::span<const char16_t> input) noexcept {
4267 #if SIMDUTF_CPLUSPLUS23
4268 if consteval {
4269 return scalar::base64::maximal_binary_length_from_base64(input.data(),
4270 input.size());
4271 } else
4272 #endif
4273 {
4274 return maximal_binary_length_from_base64(input.data(), input.size());
4275 }
4276}
4277 #endif // SIMDUTF_SPAN
4278
4279/**
4280 * Compute the binary length from a base64 input.
4281 * This function is useful for base64 inputs that may contain ASCII whitespaces
4282 * (such as line breaks). For such inputs, the result is exact, and for any
4283 * inputs the result can be used to size the output buffer passed to
4284 * `base64_to_binary`.
4285 *
4286 * The function ignores whitespace and does not require padding characters
4287 * ('=').
4288 *
4289 * @param input the base64 input to process
4290 * @param length the length of the base64 input in bytes
4291 * @return number of binary bytes
4292 */
4293simdutf_warn_unused size_t binary_length_from_base64(const char *input,
4294 size_t length) noexcept;
4295 #if SIMDUTF_SPAN
4296simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t
4297binary_length_from_base64(
4298 const detail::input_span_of_byte_like auto &input) noexcept {
4299 #if SIMDUTF_CPLUSPLUS23
4300 if consteval {
4301 return scalar::base64::binary_length_from_base64(input.data(),
4302 input.size());
4303 } else
4304 #endif
4305 {
4306 return binary_length_from_base64(
4307 reinterpret_cast<const char *>(input.data()), input.size());
4308 }
4309}
4310 #endif // SIMDUTF_SPAN
4311
4312/**
4313 * Compute the binary length from a base64 input.
4314 * This function is useful for base64 inputs that may contain ASCII whitespaces
4315 * (such as line breaks). For such inputs, the result is exact, and for any
4316 * inputs the result can be used to size the output buffer passed to
4317 * `base64_to_binary`.
4318 *
4319 * The function ignores whitespace and does not require padding characters
4320 * ('=').
4321 *
4322 * @param input the base64 input to process, in ASCII stored as 16-bit
4323 * units
4324 * @param length the length of the base64 input in 16-bit units
4325 * @return number of binary bytes
4326 */
4327simdutf_warn_unused size_t binary_length_from_base64(const char16_t *input,
4328 size_t length) noexcept;
4329 #if SIMDUTF_SPAN
4330simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t
4331binary_length_from_base64(std::span<const char16_t> input) noexcept {
4332 #if SIMDUTF_CPLUSPLUS23
4333 if consteval {
4334 return scalar::base64::binary_length_from_base64(input.data(),
4335 input.size());
4336 } else
4337 #endif
4338 {
4339 return binary_length_from_base64(input.data(), input.size());
4340 }
4341}
4342 #endif // SIMDUTF_SPAN
4343
4344/**
4345 * Convert a base64 input to a binary output.
4346 *
4347 * This function follows the WHATWG forgiving-base64 format, which means that it
4348 * will ignore any ASCII spaces in the input. You may provide a padded input
4349 * (with one or two equal signs at the end) or an unpadded input (without any
4350 * equal signs at the end).
4351 *
4352 * See https://infra.spec.whatwg.org/#forgiving-base64-decode
4353 *
4354 * This function will fail in case of invalid input. When last_chunk_options =
4355 * loose, there are two possible reasons for failure: the input contains a
4356 * number of base64 characters that when divided by 4, leaves a single remainder
4357 * character (BASE64_INPUT_REMAINDER), or the input contains a character that is
4358 * not a valid base64 character (INVALID_BASE64_CHARACTER).
4359 *
4360 * When the error is INVALID_BASE64_CHARACTER, r.count contains the index in the
4361 * input where the invalid character was found. When the error is
4362 * BASE64_INPUT_REMAINDER, then r.count contains the number of bytes decoded.
4363 *
4364 * The default option (simdutf::base64_default) expects the characters `+` and
4365 * `/` as part of its alphabet. The URL option (simdutf::base64_url) expects the
4366 * characters `-` and `_` as part of its alphabet.
4367 *
4368 * The padding (`=`) is validated if present. There may be at most two padding
4369 * characters at the end of the input. If there are any padding characters, the
4370 * total number of characters (excluding spaces but including padding
4371 * characters) must be divisible by four.
4372 *
4373 * You should call this function with a buffer that is at least
4374 * maximal_binary_length_from_base64(input, length) bytes long. If you fail to
4375 * provide that much space, the function may cause a buffer overflow.
4376 *
4377 * Advanced users may want to tailor how the last chunk is handled. By default,
4378 * we use a loose (forgiving) approach but we also support a strict approach
4379 * as well as a stop_before_partial approach, as per the following proposal:
4380 *
4381 * https://tc39.es/proposal-arraybuffer-base64/spec/#sec-frombase64
4382 *
4383 * @param input the base64 string to process
4384 * @param length the length of the string in bytes
4385 * @param output the pointer to a buffer that can hold the conversion
4386 * result (should be at least maximal_binary_length_from_base64(input, length)
4387 * bytes long).
4388 * @param options the base64 options to use, usually base64_default or
4389 * base64_url, and base64_default by default.
4390 * @param last_chunk_options the last chunk handling options,
4391 * last_chunk_handling_options::loose by default
4392 * but can also be last_chunk_handling_options::strict or
4393 * last_chunk_handling_options::stop_before_partial.
4394 * @return a result pair struct (of type simdutf::result containing the two
4395 * fields error and count) with an error code and either position of the error
4396 * (in the input in bytes) if any, or the number of bytes written if successful.
4397 */
4398simdutf_warn_unused result base64_to_binary(
4399 const char *input, size_t length, char *output,
4400 base64_options options = base64_default,
4401 last_chunk_handling_options last_chunk_options = loose) noexcept;
4402 #if SIMDUTF_SPAN
4403simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 result
4404base64_to_binary(
4405 const detail::input_span_of_byte_like auto &input,
4406 detail::output_span_of_byte_like auto &&binary_output,
4407 base64_options options = base64_default,
4408 last_chunk_handling_options last_chunk_options = loose) noexcept {
4409 #if SIMDUTF_CPLUSPLUS23
4410 if consteval {
4411 return scalar::base64::base64_to_binary_details_impl(
4412 input.data(), input.size(), binary_output.data(), options,
4413 last_chunk_options);
4414 } else
4415 #endif
4416 {
4417 return base64_to_binary(reinterpret_cast<const char *>(input.data()),
4418 input.size(),
4419 reinterpret_cast<char *>(binary_output.data()),
4420 options, last_chunk_options);
4421 }
4422}
4423 #endif // SIMDUTF_SPAN
4424
4425/**
4426 * Provide the base64 length in bytes given the length of a binary input.
4427 *
4428 * @param length the length of the input in bytes
4429 * @param options the base64 options to use (default: base64_default)
4430 * @return number of base64 bytes
4431 */
4432inline simdutf_warn_unused simdutf_constexpr23 size_t base64_length_from_binary(
4433 size_t length, base64_options options = base64_default) noexcept {
4434 return scalar::base64::base64_length_from_binary(length, options);
4435}
4436
4437/**
4438 * Provide the base64 length in bytes given the length of a binary input,
4439 * taking into account line breaks.
4440 *
4441 * @param length the length of the input in bytes
4442 * @param options the base64 options to use (default: base64_default)
4443 * @param line_length the length of lines, must be at least 4 (otherwise it is
4444 * interpreted as 4),
4445 * @return number of base64 bytes
4446 */
4447inline simdutf_warn_unused simdutf_constexpr23 size_t
4448base64_length_from_binary_with_lines(
4449 size_t length, base64_options options = base64_default,
4450 size_t line_length = default_line_length) noexcept {
4451 return scalar::base64::base64_length_from_binary_with_lines(length, options,
4452 line_length);
4453}
4454
4455/**
4456 * Convert a binary input to a base64 output.
4457 *
4458 * The default option (simdutf::base64_default) uses the characters `+` and `/`
4459 * as part of its alphabet. Further, it adds padding (`=`) at the end of the
4460 * output to ensure that the output length is a multiple of four.
4461 *
4462 * The URL option (simdutf::base64_url) uses the characters `-` and `_` as part
4463 * of its alphabet. No padding is added at the end of the output.
4464 *
4465 * This function always succeeds.
4466 *
4467 * @param input the binary to process
4468 * @param length the length of the input in bytes
4469 * @param output the pointer to a buffer that can hold the conversion
4470 * result (should be at least base64_length_from_binary(length) bytes long)
4471 * @param options the base64 options to use, can be base64_default or
4472 * base64_url, is base64_default by default.
4473 * @return number of written bytes, will be equal to
4474 * base64_length_from_binary(length, options)
4475 */
4476size_t binary_to_base64(const char *input, size_t length, char *output,
4477 base64_options options = base64_default) noexcept;
4478 #if SIMDUTF_SPAN
4479simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t
4480binary_to_base64(const detail::input_span_of_byte_like auto &input,
4481 detail::output_span_of_byte_like auto &&binary_output,
4482 base64_options options = base64_default) noexcept {
4483 #if SIMDUTF_CPLUSPLUS23
4484 if consteval {
4485 return scalar::base64::tail_encode_base64(
4486 binary_output.data(), input.data(), input.size(), options);
4487 } else
4488 #endif
4489 {
4490 return binary_to_base64(
4491 reinterpret_cast<const char *>(input.data()), input.size(),
4492 reinterpret_cast<char *>(binary_output.data()), options);
4493 }
4494}
4495 #endif // SIMDUTF_SPAN
4496
4497/**
4498 * Convert a binary input to a base64 output with line breaks.
4499 *
4500 * The default option (simdutf::base64_default) uses the characters `+` and `/`
4501 * as part of its alphabet. Further, it adds padding (`=`) at the end of the
4502 * output to ensure that the output length is a multiple of four.
4503 *
4504 * The URL option (simdutf::base64_url) uses the characters `-` and `_` as part
4505 * of its alphabet. No padding is added at the end of the output.
4506 *
4507 * This function always succeeds.
4508 *
4509 * @param input the binary to process
4510 * @param length the length of the input in bytes
4511 * @param output the pointer to a buffer that can hold the conversion
4512 * result (should be at least base64_length_from_binary_with_lines(length,
4513 * options, line_length) bytes long)
4514 * @param line_length the length of lines, must be at least 4 (otherwise it is
4515 * interpreted as 4),
4516 * @param options the base64 options to use, can be base64_default or
4517 * base64_url, is base64_default by default.
4518 * @return number of written bytes, will be equal to
4519 * base64_length_from_binary_with_lines(length, options, line_length)
4520 */
4521size_t
4522binary_to_base64_with_lines(const char *input, size_t length, char *output,
4523 size_t line_length = simdutf::default_line_length,
4524 base64_options options = base64_default) noexcept;
4525 #if SIMDUTF_SPAN
4526simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 size_t
4527binary_to_base64_with_lines(
4528 const detail::input_span_of_byte_like auto &input,
4529 detail::output_span_of_byte_like auto &&binary_output,
4530 size_t line_length = simdutf::default_line_length,
4531 base64_options options = base64_default) noexcept {
4532 #if SIMDUTF_CPLUSPLUS23
4533 if consteval {
4534 return scalar::base64::tail_encode_base64_impl<true>(
4535 binary_output.data(), input.data(), input.size(), options, line_length);
4536 } else
4537 #endif
4538 {
4539 return binary_to_base64_with_lines(
4540 reinterpret_cast<const char *>(input.data()), input.size(),
4541 reinterpret_cast<char *>(binary_output.data()), line_length, options);
4542 }
4543}
4544 #endif // SIMDUTF_SPAN
4545
4546 #if SIMDUTF_ATOMIC_REF
4547/**
4548 * Convert a binary input to a base64 output, using atomic accesses.
4549 * This function comes with a potentially significant performance
4550 * penalty, but it may be useful in some cases where the input
4551 * buffers are shared between threads, to avoid undefined
4552 * behavior in case of data races.
4553 *
4554 * The function is for advanced users. Its main use case is
4555 * to silence sanitizer warnings. We have no documented use case
4556 * where this function is actually necessary in terms of practical correctness.
4557 *
4558 * This function is only available when simdutf is compiled with
4559 * C++20 support and __cpp_lib_atomic_ref >= 201806L. You may check
4560 * the availability of this function by checking the macro
4561 * SIMDUTF_ATOMIC_REF.
4562 *
4563 * The default option (simdutf::base64_default) uses the characters `+` and `/`
4564 * as part of its alphabet. Further, it adds padding (`=`) at the end of the
4565 * output to ensure that the output length is a multiple of four.
4566 *
4567 * The URL option (simdutf::base64_url) uses the characters `-` and `_` as part
4568 * of its alphabet. No padding is added at the end of the output.
4569 *
4570 * This function always succeeds.
4571 *
4572 * This function is considered experimental. It is not tested by default
4573 * (see the CMake option SIMDUTF_ATOMIC_BASE64_TESTS) nor is it fuzz tested.
4574 * It is not documented in the public API documentation (README). It is
4575 * offered on a best effort basis. We rely on the community for further
4576 * testing and feedback.
4577 *
4578 * @brief atomic_binary_to_base64
4579 * @param input the binary to process
4580 * @param length the length of the input in bytes
4581 * @param output the pointer to a buffer that can hold the conversion
4582 * result (should be at least base64_length_from_binary(length) bytes long)
4583 * @param options the base64 options to use, can be base64_default or
4584 * base64_url, is base64_default by default.
4585 * @return number of written bytes, will be equal to
4586 * base64_length_from_binary(length, options)
4587 */
4588size_t
4589atomic_binary_to_base64(const char *input, size_t length, char *output,
4590 base64_options options = base64_default) noexcept;
4591 #if SIMDUTF_SPAN
4592simdutf_really_inline simdutf_warn_unused size_t
4593atomic_binary_to_base64(const detail::input_span_of_byte_like auto &input,
4594 detail::output_span_of_byte_like auto &&binary_output,
4595 base64_options options = base64_default) noexcept {
4596 return atomic_binary_to_base64(
4597 reinterpret_cast<const char *>(input.data()), input.size(),
4598 reinterpret_cast<char *>(binary_output.data()), options);
4599}
4600 #endif // SIMDUTF_SPAN
4601 #endif // SIMDUTF_ATOMIC_REF
4602
4603/**
4604 * Convert a base64 input to a binary output.
4605 *
4606 * This function follows the WHATWG forgiving-base64 format, which means that it
4607 * will ignore any ASCII spaces in the input. You may provide a padded input
4608 * (with one or two equal signs at the end) or an unpadded input (without any
4609 * equal signs at the end).
4610 *
4611 * See https://infra.spec.whatwg.org/#forgiving-base64-decode
4612 *
4613 * This function will fail in case of invalid input. When last_chunk_options =
4614 * loose, there are two possible reasons for failure: the input contains a
4615 * number of base64 characters that when divided by 4, leaves a single remainder
4616 * character (BASE64_INPUT_REMAINDER), or the input contains a character that is
4617 * not a valid base64 character (INVALID_BASE64_CHARACTER).
4618 *
4619 * When the error is INVALID_BASE64_CHARACTER, r.count contains the index in the
4620 * input where the invalid character was found. When the error is
4621 * BASE64_INPUT_REMAINDER, then r.count contains the number of bytes decoded.
4622 *
4623 * The default option (simdutf::base64_default) expects the characters `+` and
4624 * `/` as part of its alphabet. The URL option (simdutf::base64_url) expects the
4625 * characters `-` and `_` as part of its alphabet.
4626 *
4627 * The padding (`=`) is validated if present. There may be at most two padding
4628 * characters at the end of the input. If there are any padding characters, the
4629 * total number of characters (excluding spaces but including padding
4630 * characters) must be divisible by four.
4631 *
4632 * You should call this function with a buffer that is at least
4633 * maximal_binary_length_from_base64(input, length) bytes long. If you fail
4634 * to provide that much space, the function may cause a buffer overflow.
4635 *
4636 * Advanced users may want to tailor how the last chunk is handled. By default,
4637 * we use a loose (forgiving) approach but we also support a strict approach
4638 * as well as a stop_before_partial approach, as per the following proposal:
4639 *
4640 * https://tc39.es/proposal-arraybuffer-base64/spec/#sec-frombase64
4641 *
4642 * @param input the base64 string to process, in ASCII stored as 16-bit
4643 * units
4644 * @param length the length of the string in 16-bit units
4645 * @param output the pointer to a buffer that can hold the conversion
4646 * result (should be at least maximal_binary_length_from_base64(input, length)
4647 * bytes long).
4648 * @param options the base64 options to use, can be base64_default or
4649 * base64_url, is base64_default by default.
4650 * @param last_chunk_options the last chunk handling options,
4651 * last_chunk_handling_options::loose by default
4652 * but can also be last_chunk_handling_options::strict or
4653 * last_chunk_handling_options::stop_before_partial.
4654 * @return a result pair struct (of type simdutf::result containing the two
4655 * fields error and count) with an error code and position of the
4656 * INVALID_BASE64_CHARACTER error (in the input in units) if any, or the number
4657 * of bytes written if successful.
4658 */
4659simdutf_warn_unused result
4660base64_to_binary(const char16_t *input, size_t length, char *output,
4661 base64_options options = base64_default,
4662 last_chunk_handling_options last_chunk_options =
4663 last_chunk_handling_options::loose) noexcept;
4664 #if SIMDUTF_SPAN
4665simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 result
4666base64_to_binary(
4667 std::span<const char16_t> input,
4668 detail::output_span_of_byte_like auto &&binary_output,
4669 base64_options options = base64_default,
4670 last_chunk_handling_options last_chunk_options = loose) noexcept {
4671 #if SIMDUTF_CPLUSPLUS23
4672 if consteval {
4673 return scalar::base64::base64_to_binary_details_impl(
4674 input.data(), input.size(), binary_output.data(), options,
4675 last_chunk_options);
4676 } else
4677 #endif
4678 {
4679 return base64_to_binary(input.data(), input.size(),
4680 reinterpret_cast<char *>(binary_output.data()),
4681 options, last_chunk_options);
4682 }
4683}
4684 #endif // SIMDUTF_SPAN
4685
4686/**
4687 * Convert a base64 input to a binary output while returning more details
4688 * than base64_to_binary.
4689 *
4690 * This function follows the WHATWG forgiving-base64 format, which means that it
4691 * will ignore any ASCII spaces in the input. You may provide a padded input
4692 * (with one or two equal signs at the end) or an unpadded input (without any
4693 * equal signs at the end).
4694 *
4695 * See https://infra.spec.whatwg.org/#forgiving-base64-decode
4696 *
4697 * Unlike base64_to_binary, this function returns a full_result with both
4698 * input_count and output_count, so you always know how much input was consumed
4699 * and how much output was written. There are three cases where the input may
4700 * not be fully consumed:
4701 *
4702 * 1. stop_before_partial: When last_chunk_options is set to
4703 * stop_before_partial, any incomplete 4-character group at the end of the
4704 * input is left unconsumed. This is useful for streaming/chunked decoding
4705 * where you can carry over the unconsumed input to the next chunk.
4706 *
4707 * 2. INVALID_BASE64_CHARACTER: The input contains a character that is not a
4708 * valid base64 character. In this case, input_count indicates where the
4709 * invalid character was found.
4710 *
4711 * 3. BASE64_INPUT_REMAINDER: When last_chunk_options is loose, the input
4712 * contains a number of base64 characters that, when divided by 4, leaves
4713 * a single remainder character (which cannot encode any bytes).
4714 *
4715 * You should call this function with a buffer that is at least
4716 * maximal_binary_length_from_base64(input, length) bytes long. If you fail to
4717 * provide that much space, the function may cause a buffer overflow.
4718 *
4719 * @param input the base64 string to process
4720 * @param length the length of the string in bytes
4721 * @param output the pointer to a buffer that can hold the conversion
4722 * result (should be at least maximal_binary_length_from_base64(input, length)
4723 * bytes long).
4724 * @param options the base64 options to use, can be base64_default or
4725 * base64_url, is base64_default by default.
4726 * @param last_chunk_options the last chunk handling options,
4727 * last_chunk_handling_options::loose by default
4728 * but can also be last_chunk_handling_options::strict or
4729 * last_chunk_handling_options::stop_before_partial.
4730 * @return a full_result struct (of type simdutf::full_result containing the
4731 * three fields error, input_count and output_count).
4732 */
4733simdutf_warn_unused full_result
4734base64_to_binary_details(const char *input, size_t length, char *output,
4735 base64_options options = base64_default,
4736 last_chunk_handling_options last_chunk_options =
4737 last_chunk_handling_options::loose) noexcept;
4738 #if SIMDUTF_SPAN
4739simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 full_result
4740base64_to_binary_details(
4741 const detail::input_span_of_byte_like auto &input,
4742 detail::output_span_of_byte_like auto &&binary_output,
4743 base64_options options = base64_default,
4744 last_chunk_handling_options last_chunk_options = loose) noexcept {
4745 #if SIMDUTF_CPLUSPLUS23
4746 if consteval {
4747 return scalar::base64::base64_to_binary_details_impl(
4748 input.data(), input.size(), binary_output.data(), options,
4749 last_chunk_options);
4750 } else
4751 #endif
4752 {
4753 return base64_to_binary_details(
4754 reinterpret_cast<const char *>(input.data()), input.size(),
4755 reinterpret_cast<char *>(binary_output.data()), options,
4756 last_chunk_options);
4757 }
4758}
4759 #endif // SIMDUTF_SPAN
4760
4761/**
4762 * Convert a base64 input stored as 16-bit units to a binary output while
4763 * returning more details than base64_to_binary.
4764 *
4765 * This function follows the WHATWG forgiving-base64 format, which means that it
4766 * will ignore any ASCII spaces in the input. You may provide a padded input
4767 * (with one or two equal signs at the end) or an unpadded input (without any
4768 * equal signs at the end).
4769 *
4770 * See https://infra.spec.whatwg.org/#forgiving-base64-decode
4771 *
4772 * Unlike base64_to_binary, this function returns a full_result with both
4773 * input_count and output_count, so you always know how much input was consumed
4774 * and how much output was written. There are three cases where the input may
4775 * not be fully consumed:
4776 *
4777 * 1. stop_before_partial: When last_chunk_options is set to
4778 * stop_before_partial, any incomplete 4-character group at the end of the
4779 * input is left unconsumed. This is useful for streaming/chunked decoding
4780 * where you can carry over the unconsumed input to the next chunk.
4781 *
4782 * 2. INVALID_BASE64_CHARACTER: The input contains a character that is not a
4783 * valid base64 character. In this case, input_count indicates where the
4784 * invalid character was found.
4785 *
4786 * 3. BASE64_INPUT_REMAINDER: When last_chunk_options is loose, the input
4787 * contains a number of base64 characters that, when divided by 4, leaves
4788 * a single remainder character (which cannot encode any bytes).
4789 *
4790 * You should call this function with a buffer that is at least
4791 * maximal_binary_length_from_base64(input, length) bytes long. If you fail to
4792 * provide that much space, the function may cause a buffer overflow.
4793 *
4794 * @param input the base64 string to process, in ASCII stored as 16-bit
4795 * units
4796 * @param length the length of the string in 16-bit units
4797 * @param output the pointer to a buffer that can hold the conversion
4798 * result (should be at least maximal_binary_length_from_base64(input, length)
4799 * bytes long).
4800 * @param options the base64 options to use, can be base64_default or
4801 * base64_url, is base64_default by default.
4802 * @param last_chunk_options the last chunk handling options,
4803 * last_chunk_handling_options::loose by default
4804 * but can also be last_chunk_handling_options::strict or
4805 * last_chunk_handling_options::stop_before_partial.
4806 * @return a full_result struct (of type simdutf::full_result containing the
4807 * three fields error, input_count and output_count).
4808 */
4809simdutf_warn_unused full_result
4810base64_to_binary_details(const char16_t *input, size_t length, char *output,
4811 base64_options options = base64_default,
4812 last_chunk_handling_options last_chunk_options =
4813 last_chunk_handling_options::loose) noexcept;
4814 #if SIMDUTF_SPAN
4815simdutf_really_inline simdutf_warn_unused simdutf_constexpr23 full_result
4816base64_to_binary_details( // span overload of the char16_t function above
4817 std::span<const char16_t> input,
4818 detail::output_span_of_byte_like auto &&binary_output, // only data() is read
4819 base64_options options = base64_default,
4820 last_chunk_handling_options last_chunk_options = loose) noexcept {
4821 #if SIMDUTF_CPLUSPLUS23
4822 if consteval { // constant evaluation: call the scalar implementation directly
4823 return scalar::base64::base64_to_binary_details_impl(
4824 input.data(), input.size(), binary_output.data(), options,
4825 last_chunk_options);
4826 } else
4827 #endif
4828 { // run time: forward to the pointer-based char16_t overload
4829 return base64_to_binary_details(
4830 input.data(), input.size(),
4831 reinterpret_cast<char *>(binary_output.data()), options,
4832 last_chunk_options);
4833 }
4834}
4835 #endif // SIMDUTF_SPAN
4836
4837/**
4838 * Check whether a single character is ignorable when decoding base64; the
4839 * decision is delegated to scalar::base64::is_ignorable. Checking a large
4840 * input character by character is not computationally efficient.
4841 *
4842 * @param input the character to check
4843 * @param options the base64 options to use, is base64_default by default.
4844 * @return true if the character is an ignorable base64 character, false
4845 * otherwise.
4846 */
4847simdutf_warn_unused simdutf_really_inline simdutf_constexpr23 bool
4848base64_ignorable(char input, base64_options options = base64_default) noexcept {
4849 return scalar::base64::is_ignorable(input, options);
4850}
4851simdutf_warn_unused simdutf_really_inline simdutf_constexpr23 bool
4852base64_ignorable(char16_t input, // 16-bit overload of the function above
4853 base64_options options = base64_default) noexcept {
4854 return scalar::base64::is_ignorable(input, options);
4855}
4856
4857/**
4858 * Check if a character is a valid base64 character; the decision is delegated
4859 * to scalar::base64::is_base64. Checking a large input character by character
4860 * is not computationally efficient.
4861 * Note that padding characters are not considered valid base64 characters in
4862 * this context, nor are spaces.
4863 *
4864 * @param input the character to check
4865 * @param options the base64 options to use, is base64_default by default.
4866 * @return true if the character is a base64 character, false otherwise.
4867 */
4868simdutf_warn_unused simdutf_really_inline simdutf_constexpr23 bool
4869base64_valid(char input, base64_options options = base64_default) noexcept {
4870 return scalar::base64::is_base64(input, options);
4871}
4872simdutf_warn_unused simdutf_really_inline simdutf_constexpr23 bool
4873base64_valid(char16_t input, base64_options options = base64_default) noexcept {
4874 return scalar::base64::is_base64(input, options); // same check, 16-bit input
4875}
4876
4877/**
4878 * Check if a character is a valid base64 character or the padding character
4879 * ('='). Checking a large input, character by character, is not computationally
4880 * efficient.
4881 *
4882 * @param input the character to check
4883 * @param options the base64 options to use, is base64_default by default.
4884 * @return true if the character is a base64 character or '=', false otherwise.
4885 */
4886simdutf_warn_unused simdutf_really_inline simdutf_constexpr23 bool
4887base64_valid_or_padding(char input,
4888 base64_options options = base64_default) noexcept {
4889 return scalar::base64::is_base64_or_padding(input, options);
4890}
4891simdutf_warn_unused simdutf_really_inline simdutf_constexpr23 bool
4892base64_valid_or_padding(char16_t input, // 16-bit overload of the function above
4893 base64_options options = base64_default) noexcept {
4894 return scalar::base64::is_base64_or_padding(input, options);
4895}
4896
4897/**
4898 * Convert a base64 input to a binary output.
4899 *
4900 * This function follows the WHATWG forgiving-base64 format, which means that it
4901 * will ignore any ASCII spaces in the input. You may provide a padded input
4902 * (with one or two equal signs at the end) or an unpadded input (without any
4903 * equal signs at the end).
4904 *
4905 * See https://infra.spec.whatwg.org/#forgiving-base64-decode
4906 *
4907 * This function will fail in case of invalid input. When last_chunk_options is
4908 * loose, there are three possible reasons for failure: the number of base64
4909 * characters in the input, when divided by 4, leaves a single remainder
4910 * character (BASE64_INPUT_REMAINDER), the input contains a character that is
4911 * not a valid base64 character (INVALID_BASE64_CHARACTER), or the output buffer
4912 * is too small (OUTPUT_BUFFER_TOO_SMALL).
4913 *
4914 * When OUTPUT_BUFFER_TOO_SMALL, we return both the number of bytes written
4915 * and the number of units processed, see description of the parameters and
4916 * returned value.
4917 *
4918 * When the error is INVALID_BASE64_CHARACTER, the returned count is the index in
4919 * the input where the invalid character was found. When the error is
4920 * BASE64_INPUT_REMAINDER, the returned count is the number of bytes decoded.
4921 *
4922 * The default option (simdutf::base64_default) expects the characters `+` and
4923 * `/` as part of its alphabet. The URL option (simdutf::base64_url) expects the
4924 * characters `-` and `_` as part of its alphabet.
4925 *
4926 * The padding (`=`) is validated if present. There may be at most two padding
4927 * characters at the end of the input. If there are any padding characters, the
4928 * total number of characters (excluding spaces but including padding
4929 * characters) must be divisible by four.
4930 *
4931 * The INVALID_BASE64_CHARACTER cases are considered fatal and you are expected
4932 * to discard the output unless the parameter decode_up_to_bad_char is set to
4933 * true. In that case, the function will decode up to the first invalid
4934 * character. Extra padding characters ('=') are considered invalid characters.
4935 *
4936 * Advanced users may want to tailor how the last chunk is handled. By default,
4937 * we use a loose (forgiving) approach but we also support a strict approach
4938 * as well as a stop_before_partial approach, as per the following proposal:
4939 *
4940 * https://tc39.es/proposal-arraybuffer-base64/spec/#sec-frombase64
4941 *
4942 * @param input the base64 string to process, in ASCII stored as 8-bit
4943 * or 16-bit units
4944 * @param length the length of the string in 8-bit or 16-bit units.
4945 * @param output the pointer to a buffer that can hold the conversion
4946 * result.
4947 * @param outlen the number of bytes that can be written in the output
4948 * buffer. Upon return, it is modified to reflect how many bytes were written.
4949 * @param options the base64 options to use, can be base64_default or
4950 * base64_url, is base64_default by default.
4951 * @param last_chunk_options the last chunk handling options,
4952 * last_chunk_handling_options::loose by default
4953 * but can also be last_chunk_handling_options::strict or
4954 * last_chunk_handling_options::stop_before_partial.
4955 * @param decode_up_to_bad_char if true, the function will decode up to the
4956 * first invalid character. By default (false), it is assumed that the output
4957 * buffer is to be discarded. When there are multiple errors in the input,
4958 * using decode_up_to_bad_char might trigger a different error.
4959 * @return a result pair struct (of type simdutf::result containing the two
4960 * fields error and count) with an error code and position of the
4961 * INVALID_BASE64_CHARACTER error (in the input in units) if any, or the number
4962 * of units processed if successful.
4963 */
4964simdutf_warn_unused result
4965base64_to_binary_safe(const char *input, size_t length, char *output,
4966 size_t &outlen, base64_options options = base64_default,
4967 last_chunk_handling_options last_chunk_options =
4968 last_chunk_handling_options::loose,
4969 bool decode_up_to_bad_char = false) noexcept;
4970// the span overload has moved to the bottom of the file
4971
4972simdutf_warn_unused result // 16-bit input variant of the declaration above
4973base64_to_binary_safe(const char16_t *input, size_t length, char *output,
4974 size_t &outlen, base64_options options = base64_default,
4975 last_chunk_handling_options last_chunk_options =
4976 last_chunk_handling_options::loose,
4977 bool decode_up_to_bad_char = false) noexcept;
4978 // span overload moved to bottom of file
4979
4980 #if SIMDUTF_ATOMIC_REF
4981/**
4982 * Convert a base64 input to a binary output with a size limit and using atomic
4983 * operations.
4984 *
4985 * Like `base64_to_binary_safe` but using atomic operations, this function is
4986 * thread-safe for concurrent memory access, allowing the output
4987 * buffers to be shared between threads without undefined behavior in case of
4988 * data races.
4989 *
4990 * This function comes with a potentially significant performance penalty, but
4991 * is useful when thread safety is needed during base64 decoding.
4992 *
4993 * This function is only available when simdutf is compiled with
4994 * C++20 support and __cpp_lib_atomic_ref >= 201806L. You may check
4995 * the availability of this function by checking the macro
4996 * SIMDUTF_ATOMIC_REF.
4997 *
4998 * This function is considered experimental. It is not tested by default
4999 * (see the CMake option SIMDUTF_ATOMIC_BASE64_TESTS) nor is it fuzz tested.
5000 * It is not documented in the public API documentation (README). It is
5001 * offered on a best-effort basis. We rely on the community for further
5002 * testing and feedback.
5003 *
5004 * @param input the base64 input to decode
5005 * @param length the length of the input in bytes
5006 * @param output the pointer to a buffer that can hold the conversion
5007 * result
5008 * @param outlen the number of bytes that can be written in the output
5009 * buffer. Upon return, it is modified to reflect how many bytes were written.
5010 * @param options the base64 options to use (default, url, etc.)
5011 * @param last_chunk_options the last chunk handling options (loose, strict,
5012 * stop_before_partial)
5013 * @param decode_up_to_bad_char if true, the function will decode up to the
5014 * first invalid character. By default (false), it is assumed that the output
5015 * buffer is to be discarded. When there are multiple errors in the input,
5016 * using decode_up_to_bad_char might trigger a different error.
5017 * @return a result struct with an error code and count indicating error
5018 * position or success
5019 */
5020simdutf_warn_unused result atomic_base64_to_binary_safe(
5021 const char *input, size_t length, char *output, size_t &outlen,
5022 base64_options options = base64_default,
5023 last_chunk_handling_options last_chunk_options =
5024 last_chunk_handling_options::loose,
5025 bool decode_up_to_bad_char = false) noexcept;
5026simdutf_warn_unused result atomic_base64_to_binary_safe( // char16_t input variant
5027 const char16_t *input, size_t length, char *output, size_t &outlen,
5028 base64_options options = base64_default,
5029 last_chunk_handling_options last_chunk_options = loose,
5030 bool decode_up_to_bad_char = false) noexcept;
5031 #if SIMDUTF_SPAN
5032/**
5033 * @brief span overload; despite its name, binary_input is the base64 input
5034 * @return a tuple of result and outlen (the number of bytes written)
5035 */
5036simdutf_really_inline simdutf_warn_unused std::tuple<result, std::size_t>
5037atomic_base64_to_binary_safe(
5038 const detail::input_span_of_byte_like auto &binary_input,
5039 detail::output_span_of_byte_like auto &&output,
5040 base64_options options = base64_default,
5041 last_chunk_handling_options last_chunk_options =
5042 last_chunk_handling_options::loose,
5043 bool decode_up_to_bad_char = false) noexcept {
5044 size_t outlen = output.size(); // capacity in; updated by the call below
5045 auto ret = atomic_base64_to_binary_safe(
5046 reinterpret_cast<const char *>(binary_input.data()), binary_input.size(),
5047 reinterpret_cast<char *>(output.data()), outlen, options,
5048 last_chunk_options, decode_up_to_bad_char);
5049 return {ret, outlen};
5050}
5051/**
5052 * @brief span overload taking the base64 input as 16-bit units
5053 * @return a tuple of result and outlen (the number of bytes written)
5054 */
5055simdutf_really_inline simdutf_warn_unused std::tuple<result, std::size_t>
5056atomic_base64_to_binary_safe(
5057 std::span<const char16_t> base64_input,
5058 detail::output_span_of_byte_like auto &&binary_output,
5059 base64_options options = base64_default,
5060 last_chunk_handling_options last_chunk_options = loose,
5061 bool decode_up_to_bad_char = false) noexcept {
5062 size_t outlen = binary_output.size(); // capacity in; updated by the call below
5063 auto ret = atomic_base64_to_binary_safe(
5064 base64_input.data(), base64_input.size(),
5065 reinterpret_cast<char *>(binary_output.data()), outlen, options,
5066 last_chunk_options, decode_up_to_bad_char);
5067 return {ret, outlen};
5068}
5069 #endif // SIMDUTF_SPAN
5070 #endif // SIMDUTF_ATOMIC_REF
5071
5072#endif // SIMDUTF_FEATURE_BASE64
5073
5074/**
5075 * An implementation of simdutf for a particular CPU architecture.
5076 *
5077 * Also used to maintain the currently active implementation. The active
5078 * implementation is automatically initialized on first use to the most advanced
5079 * implementation supported by the host.
5080 */
5082public:
5083 /**
5084 * The name of this implementation, as stored in _name.
5085 * Example:
5086 * const implementation *impl = simdutf::active_implementation;
5087 * cout << "simdutf is optimized for " << impl->name() << "(" <<
5088 * impl->description() << ")" << endl;
5089 *
5090 * @return the name of the implementation, e.g. "haswell", "westmere", "arm64"
5091 */
5092 virtual std::string_view name() const noexcept { return _name; }
5093
5094 /**
5095 * The description of this implementation, as stored in _description.
5096 * Example:
5097 * const implementation *impl = simdutf::active_implementation;
5098 * cout << "simdutf is optimized for " << impl->name() << "(" <<
5099 * impl->description() << ")" << endl;
5100 *
5101 * @return the description of the implementation
5102 */
5103 virtual std::string_view description() const noexcept { return _description; }
5104
5105 /**
5106 * Checks whether the instruction sets this implementation is compiled against
5107 * match the current CPU. This function may poll the current CPU/system
5108 * and should therefore not be called too often if performance is a concern.
5109 *
5110 *
5111 * @return true if the implementation can be safely used on the current system
5112 * (determined at runtime)
5113 */
5115
5116#if SIMDUTF_FEATURE_DETECT_ENCODING
5117 /**
5118 * Try to detect the encoding of the input and report it as one encoding_type.
5119 * @param input the string to identify
5120 * @param length the length of the string in bytes.
5121 * @return the encoding type detected
5122 */
5123 virtual encoding_type autodetect_encoding(const char *input,
5124 size_t length) const noexcept;
5125
5126 /**
5127 * Try to detect the possible encodings in one pass. Pure virtual.
5128 * @param input the string to identify
5129 * @param length the length of the string in bytes.
5130 * @return the detected encodings as an int (presumably a bitwise OR of encoding_type values; TODO confirm)
5131 */
5132 virtual int detect_encodings(const char *input,
5133 size_t length) const noexcept = 0;
5134#endif // SIMDUTF_FEATURE_DETECT_ENCODING
5135
5136 /**
5137 * @private For internal implementation use
5138 *
5139 * The instruction sets this implementation is compiled against (the stored _required_instruction_sets).
5140 *
5141 * @return a mask of all required `internal::instruction_set::` values
5142 */
5143 virtual uint32_t required_instruction_sets() const {
5144 return _required_instruction_sets;
5145 }
5146
5147#if SIMDUTF_FEATURE_UTF8 || SIMDUTF_FEATURE_DETECT_ENCODING
5148 /**
5149 * Validate the UTF-8 string.
5150 *
5151 * Pure virtual: each implementation must override it.
5152 *
5153 * @param buf the UTF-8 string to validate.
5154 * @param len the length of the string in bytes.
5155 * @return true if and only if the string is valid UTF-8.
5156 */
5157 simdutf_warn_unused virtual bool validate_utf8(const char *buf,
5158 size_t len) const noexcept = 0;
5159#endif // SIMDUTF_FEATURE_UTF8 || SIMDUTF_FEATURE_DETECT_ENCODING
5160
5161#if SIMDUTF_FEATURE_UTF8
5162 /**
5163 * Validate the UTF-8 string and stop on errors.
5164 *
5165 * Pure virtual: each implementation must override it.
5166 *
5167 * @param buf the UTF-8 string to validate.
5168 * @param len the length of the string in bytes.
5169 * @return a result pair struct (of type simdutf::result containing the two
5170 * fields error and count) with an error code and either position of the error
5171 * (in the input in code units) if any, or the number of code units validated
5172 * if successful.
5173 */
5174 simdutf_warn_unused virtual result
5175 validate_utf8_with_errors(const char *buf, size_t len) const noexcept = 0;
5176#endif // SIMDUTF_FEATURE_UTF8
5177
5178#if SIMDUTF_FEATURE_ASCII
5179 /**
5180 * Validate the ASCII string.
5181 *
5182 * Pure virtual: each implementation must override it.
5183 *
5184 * @param buf the ASCII string to validate.
5185 * @param len the length of the string in bytes.
5186 * @return true if and only if the string is valid ASCII.
5187 */
5188 simdutf_warn_unused virtual bool
5189 validate_ascii(const char *buf, size_t len) const noexcept = 0;
5190
5191 /**
5192 * Validate the ASCII string and stop on error.
5193 *
5194 * Pure virtual: each implementation must override it.
5195 *
5196 * @param buf the ASCII string to validate.
5197 * @param len the length of the string in bytes.
5198 * @return a result pair struct (of type simdutf::result containing the two
5199 * fields error and count) with an error code and either position of the error
5200 * (in the input in code units) if any, or the number of code units validated
5201 * if successful.
5202 */
5203 simdutf_warn_unused virtual result
5204 validate_ascii_with_errors(const char *buf, size_t len) const noexcept = 0;
5205
5206#endif // SIMDUTF_FEATURE_ASCII
5207
5208#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_ASCII
5209 /**
5210 * Check whether a UTF-16BE sequence is ASCII.
5211 * A UTF-16 sequence is considered an ASCII sequence
5212 * if it could be converted to an ASCII string losslessly.
5213 *
5214 * Pure virtual: each implementation must override it.
5215 *
5216 * @param buf the UTF-16BE string to validate.
5217 * @param len the length of the string in bytes (NOTE(review): buf is char16_t, so presumably 2-byte code units; confirm).
5218 * @return true if and only if the string is valid ASCII.
5219 */
5220 simdutf_warn_unused virtual bool
5221 validate_utf16be_as_ascii(const char16_t *buf, size_t len) const noexcept = 0;
5222
5223 /**
5224 * Check whether a UTF-16LE sequence is ASCII.
5225 * A UTF-16 sequence is considered an ASCII sequence
5226 * if it could be converted to an ASCII string losslessly.
5227 *
5228 * Pure virtual: each implementation must override it.
5229 *
5230 * @param buf the UTF-16LE string to validate.
5231 * @param len the length of the string in bytes (NOTE(review): buf is char16_t, so presumably 2-byte code units; confirm).
5232 * @return true if and only if the string is valid ASCII.
5233 */
5234 simdutf_warn_unused virtual bool
5235 validate_utf16le_as_ascii(const char16_t *buf, size_t len) const noexcept = 0;
5236#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_ASCII
5237
5238#if SIMDUTF_FEATURE_UTF16 || SIMDUTF_FEATURE_DETECT_ENCODING
5239 /**
5240 * Validate the UTF-16LE string. This function may be best when you expect
5241 * the input to be almost always valid. Otherwise, consider using
5242 * validate_utf16le_with_errors.
5243 *
5244 * Pure virtual: each implementation must override it.
5245 *
5246 * This function is not BOM-aware.
5247 *
5248 * @param buf the UTF-16LE string to validate.
5249 * @param len the length of the string in number of 2-byte code units
5250 * (char16_t).
5251 * @return true if and only if the string is valid UTF-16LE.
5252 */
5253 simdutf_warn_unused virtual bool
5254 validate_utf16le(const char16_t *buf, size_t len) const noexcept = 0;
5255#endif // SIMDUTF_FEATURE_UTF16 || SIMDUTF_FEATURE_DETECT_ENCODING
5256
5257#if SIMDUTF_FEATURE_UTF16
5258 /**
5259 * Validate the UTF-16BE string. This function may be best when you expect
5260 * the input to be almost always valid. Otherwise, consider using
5261 * validate_utf16be_with_errors.
5262 *
5263 * Pure virtual: each implementation must override it.
5264 *
5265 * This function is not BOM-aware.
5266 *
5267 * @param buf the UTF-16BE string to validate.
5268 * @param len the length of the string in number of 2-byte code units
5269 * (char16_t).
5270 * @return true if and only if the string is valid UTF-16BE.
5271 */
5272 simdutf_warn_unused virtual bool
5273 validate_utf16be(const char16_t *buf, size_t len) const noexcept = 0;
5274
5275 /**
5276 * Validate the UTF-16LE string and stop on error. It might be faster than
5277 * validate_utf16le when an error is expected to occur early.
5278 *
5279 * Pure virtual: each implementation must override it.
5280 *
5281 * This function is not BOM-aware.
5282 *
5283 * @param buf the UTF-16LE string to validate.
5284 * @param len the length of the string in number of 2-byte code units
5285 * (char16_t).
5286 * @return a result pair struct (of type simdutf::result containing the two
5287 * fields error and count) with an error code and either position of the error
5288 * (in the input in code units) if any, or the number of code units validated
5289 * if successful.
5290 */
5291 simdutf_warn_unused virtual result
5292 validate_utf16le_with_errors(const char16_t *buf,
5293 size_t len) const noexcept = 0;
5294
5295 /**
5296 * Validate the UTF-16BE string and stop on error. It might be faster than
5297 * validate_utf16be when an error is expected to occur early.
5298 *
5299 * Pure virtual: each implementation must override it.
5300 *
5301 * This function is not BOM-aware.
5302 *
5303 * @param buf the UTF-16BE string to validate.
5304 * @param len the length of the string in number of 2-byte code units
5305 * (char16_t).
5306 * @return a result pair struct (of type simdutf::result containing the two
5307 * fields error and count) with an error code and either position of the error
5308 * (in the input in code units) if any, or the number of code units validated
5309 * if successful.
5310 */
5311 simdutf_warn_unused virtual result
5312 validate_utf16be_with_errors(const char16_t *buf,
5313 size_t len) const noexcept = 0;
5314 /**
5315 * Copies the UTF-16LE string while replacing mismatched surrogates with the
5316 * Unicode replacement character U+FFFD. We allow the input and output to be
5317 * the same buffer so that the correction is done in-place.
5318 *
5319 * Pure virtual: each implementation must override it.
5320 *
5321 * @param input the UTF-16LE string to correct.
5322 * @param len the length of the string in number of 2-byte code units
5323 * (char16_t).
5324 * @param output the output buffer.
5325 */
5326 virtual void to_well_formed_utf16le(const char16_t *input, size_t len,
5327 char16_t *output) const noexcept = 0;
5328 /**
5329 * Copies the UTF-16BE string while replacing mismatched surrogates with the
5330 * Unicode replacement character U+FFFD. We allow the input and output to be
5331 * the same buffer so that the correction is done in-place.
5332 *
5333 * Pure virtual: each implementation must override it.
5334 *
5335 * @param input the UTF-16BE string to correct.
5336 * @param len the length of the string in number of 2-byte code units
5337 * (char16_t).
5338 * @param output the output buffer.
5339 */
5340 virtual void to_well_formed_utf16be(const char16_t *input, size_t len,
5341 char16_t *output) const noexcept = 0;
5342#endif // SIMDUTF_FEATURE_UTF16
5343
5344#if SIMDUTF_FEATURE_UTF32 || SIMDUTF_FEATURE_DETECT_ENCODING
5345 /**
5346 * Validate the UTF-32 string.
5347 *
5348 * Pure virtual: each implementation must override it.
5349 *
5350 * This function is not BOM-aware.
5351 *
5352 * @param buf the UTF-32 string to validate.
5353 * @param len the length of the string in number of 4-byte code units
5354 * (char32_t).
5355 * @return true if and only if the string is valid UTF-32.
5356 */
5357 simdutf_warn_unused virtual bool
5358 validate_utf32(const char32_t *buf, size_t len) const noexcept = 0;
5359#endif // SIMDUTF_FEATURE_UTF32 || SIMDUTF_FEATURE_DETECT_ENCODING
5360
5361#if SIMDUTF_FEATURE_UTF32
5362 /**
5363 * Validate the UTF-32 string and stop on error.
5364 *
5365 * Pure virtual: each implementation must override it.
5366 *
5367 * This function is not BOM-aware.
5368 *
5369 * @param buf the UTF-32 string to validate.
5370 * @param len the length of the string in number of 4-byte code units
5371 * (char32_t).
5372 * @return a result pair struct (of type simdutf::result containing the two
5373 * fields error and count) with an error code and either position of the error
5374 * (in the input in code units) if any, or the number of code units validated
5375 * if successful.
5376 */
5377 simdutf_warn_unused virtual result
5378 validate_utf32_with_errors(const char32_t *buf,
5379 size_t len) const noexcept = 0;
5380#endif // SIMDUTF_FEATURE_UTF32
5381
5382#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
5383 /**
5384 * Convert a Latin1 string into a UTF-8 string. Pure virtual.
5385 *
5386 * This function is suitable to work with inputs from untrusted sources.
5387 *
5388 * @param input the Latin1 string to convert
5389 * @param length the length of the string in bytes
5390 * @param utf8_output the pointer to a buffer that can hold the conversion result
5391 * @return the number of char written; 0 if the conversion is not possible
5392 */
5393 simdutf_warn_unused virtual size_t
5394 convert_latin1_to_utf8(const char *input, size_t length,
5395 char *utf8_output) const noexcept = 0;
5396#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
5397
5398#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1
5399 /**
5400 * Convert a Latin1 string into a UTF-16LE string. Pure virtual.
5401 *
5402 * This function is suitable to work with inputs from untrusted sources.
5403 *
5404 * @param input the Latin1 string to convert
5405 * @param length the length of the string in bytes
5406 * @param utf16_output the pointer to a buffer that can hold the conversion result
5407 * @return the number of char16_t written; 0 if the conversion is not possible
5408 */
5409 simdutf_warn_unused virtual size_t
5410 convert_latin1_to_utf16le(const char *input, size_t length,
5411 char16_t *utf16_output) const noexcept = 0;
5412
5413 /**
5414 * Convert a Latin1 string into a UTF-16BE string. Pure virtual.
5415 *
5416 * This function is suitable to work with inputs from untrusted sources.
5417 *
5418 * @param input the Latin1 string to convert
5419 * @param length the length of the string in bytes
5420 * @param utf16_output the pointer to a buffer that can hold the conversion result
5421 * @return the number of char16_t written; 0 if the conversion is not possible
5422 */
5423 simdutf_warn_unused virtual size_t
5424 convert_latin1_to_utf16be(const char *input, size_t length,
5425 char16_t *utf16_output) const noexcept = 0;
5426#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1
5427
5428#if SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1
5429 /**
5430 * Convert a Latin1 string into a UTF-32 string. Pure virtual.
5431 *
5432 * This function is suitable to work with inputs from untrusted sources.
5433 *
5434 * @param input the Latin1 string to convert
5435 * @param length the length of the string in bytes
5436 * @param utf32_buffer the pointer to a buffer that can hold the conversion result
5437 * @return the number of char32_t written; 0 if the conversion is not possible
5438 */
5439 simdutf_warn_unused virtual size_t
5440 convert_latin1_to_utf32(const char *input, size_t length,
5441 char32_t *utf32_buffer) const noexcept = 0;
5442#endif // SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1
5443
5444#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
5445 /**
5446 * Convert a possibly broken UTF-8 string into a Latin1 string. Pure virtual.
5447 *
5448 * The input string is also validated during the conversion.
5449 * This function is suitable to work with inputs from untrusted sources.
5450 *
5451 * @param input the UTF-8 string to convert
5452 * @param length the length of the string in bytes
5453 * @param latin1_output the pointer to a buffer that can hold the conversion result
5454 * @return the number of char written; 0 if the input was not a valid UTF-8
5455 * string or if it cannot be represented as Latin1
5456 */
5457 simdutf_warn_unused virtual size_t
5458 convert_utf8_to_latin1(const char *input, size_t length,
5459 char *latin1_output) const noexcept = 0;
5460
5461 /**
5462 * Convert a possibly broken UTF-8 string into a Latin1 string, reporting errors.
5463 * If the string cannot be represented as Latin1, an error
5464 * code is returned. Pure virtual.
5465 *
5466 * The input string is also validated during the conversion.
5467 * This function is suitable to work with inputs from untrusted sources.
5468 *
5469 * @param input the UTF-8 string to convert
5470 * @param length the length of the string in bytes
5471 * @param latin1_output the pointer to a buffer that can hold the conversion result
5472 * @return a result pair struct (of type simdutf::result containing the two
5473 * fields error and count) with an error code and either position of the error
5474 * (in the input in code units) if any, or the number of code units validated
5475 * if successful.
5476 */
5477 simdutf_warn_unused virtual result
5478 convert_utf8_to_latin1_with_errors(const char *input, size_t length,
5479 char *latin1_output) const noexcept = 0;
5480
5481 /**
5482 * Convert a valid UTF-8 string into a Latin1 string. Pure virtual.
5483 *
5484 * This function assumes that the input string is valid UTF-8 and that it can
5485 * be represented as Latin1. If you violate this assumption, the result is
5486 * implementation defined and may include system-dependent behavior such as
5487 * crashes.
5488 *
5489 * This function is for expert users only and not part of our public API. Use
5490 * convert_utf8_to_latin1 instead.
5491 *
5492 * This function is not BOM-aware.
5493 *
5494 * @param input the UTF-8 string to convert
5495 * @param length the length of the string in bytes
5496 * @param latin1_output the pointer to a buffer that can hold the conversion result
5497 * @return the number of char written; 0 if the input was not a valid UTF-8
5498 * string (NOTE(review): at odds with the validity precondition above; confirm)
5499 */
5500 simdutf_warn_unused virtual size_t
5501 convert_valid_utf8_to_latin1(const char *input, size_t length,
5502 char *latin1_output) const noexcept = 0;
5503#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
5504
5505#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
5506 /**
5507 * Convert a possibly broken UTF-8 string into a UTF-16LE string. Pure virtual.
5508 *
5509 * The input string is also validated during the conversion.
5510 * This function is suitable to work with inputs from untrusted sources.
5511 *
5512 * @param input the UTF-8 string to convert
5513 * @param length the length of the string in bytes
5514 * @param utf16_output the pointer to a buffer that can hold the conversion result
5515 * @return the number of char16_t written; 0 if the input was not a valid UTF-8
5516 * string
5517 */
5518 simdutf_warn_unused virtual size_t
5519 convert_utf8_to_utf16le(const char *input, size_t length,
5520 char16_t *utf16_output) const noexcept = 0;
5521
5522 /**
5523 * Convert possibly broken UTF-8 string into UTF-16BE string.
5524 *
5525 * During the conversion also validation of the input string is done.
5526 * This function is suitable to work with inputs from untrusted sources.
5527 *
5528 * @param input the UTF-8 string to convert
5529 * @param length the length of the string in bytes
5530 * @param utf16_output the pointer to buffer that can hold conversion result
5531 * @return the number of written char16_t; 0 if the input was not valid UTF-8
5532 * string
5533 */
5534 simdutf_warn_unused virtual size_t
5535 convert_utf8_to_utf16be(const char *input, size_t length,
5536 char16_t *utf16_output) const noexcept = 0;
5537
5538 /**
5539 * Convert possibly broken UTF-8 string into UTF-16LE string and stop on
5540 * error.
5541 *
5542 * During the conversion also validation of the input string is done.
5543 * This function is suitable to work with inputs from untrusted sources.
5544 *
5545 * @param input the UTF-8 string to convert
5546 * @param length the length of the string in bytes
5547 * @param utf16_output the pointer to buffer that can hold conversion result
5548 * @return a result pair struct (of type simdutf::result containing the two
5549 * fields error and count) with an error code and either position of the error
5550 * (in the input in code units) if any, or the number of char16_t written if
5551 * successful.
5552 */
5553 simdutf_warn_unused virtual result convert_utf8_to_utf16le_with_errors(
5554 const char *input, size_t length,
5555 char16_t *utf16_output) const noexcept = 0;
5556
5557 /**
5558 * Convert possibly broken UTF-8 string into UTF-16BE string and stop on
5559 * error.
5560 *
5561 * During the conversion also validation of the input string is done.
5562 * This function is suitable to work with inputs from untrusted sources.
5563 *
5564 * @param input the UTF-8 string to convert
5565 * @param length the length of the string in bytes
5566 * @param utf16_output the pointer to buffer that can hold conversion result
5567 * @return a result pair struct (of type simdutf::result containing the two
5568 * fields error and count) with an error code and either position of the error
5569 * (in the input in code units) if any, or the number of char16_t written if
5570 * successful.
5571 */
5572 simdutf_warn_unused virtual result convert_utf8_to_utf16be_with_errors(
5573 const char *input, size_t length,
5574 char16_t *utf16_output) const noexcept = 0;
5575 /**
5576 * Compute the number of bytes that this UTF-16LE string would require in
5577 * UTF-8 format even when the UTF-16LE content contains mismatched
5578 * surrogates that have to be replaced by the replacement character (0xFFFD).
5579 *
5580 * @param input the UTF-16LE string to convert
5581 * @param length the length of the string in 2-byte code units
5582 * (char16_t)
5583 * @return a result pair struct (of type simdutf::result containing the two
5584 * fields error and count) where the count is the number of bytes required to
5585 * encode the UTF-16LE string as UTF-8, and the error code is either SUCCESS
5586 * or SURROGATE. The count is correct regardless of the error field.
5587 * When SURROGATE is returned, it does not indicate an error in the case of
5588 * this function: it indicates that at least one surrogate has been
5589 * encountered: the surrogates may be matched or not (thus this function does
5590 * not validate). If the returned error code is SUCCESS, then the input
5591 * contains no surrogate, is in the Basic Multilingual Plane, and is
5592 * necessarily valid.
5593 */
5594 simdutf_warn_unused virtual result utf8_length_from_utf16le_with_replacement(
5595 const char16_t *input, size_t length) const noexcept = 0;
5596
5597 /**
5598 * Compute the number of bytes that this UTF-16BE string would require in
5599 * UTF-8 format even when the UTF-16BE content contains mismatched
5600 * surrogates that have to be replaced by the replacement character (0xFFFD).
5601 *
5602 * @param input the UTF-16BE string to convert
5603 * @param length the length of the string in 2-byte code units
5604 * (char16_t)
5605 * @return a result pair struct (of type simdutf::result containing the two
5606 * fields error and count) where the count is the number of bytes required to
5607 * encode the UTF-16BE string as UTF-8, and the error code is either SUCCESS
5608 * or SURROGATE. The count is correct regardless of the error field.
5609 * When SURROGATE is returned, it does not indicate an error in the case of
5610 * this function: it indicates that at least one surrogate has been
5611 * encountered: the surrogates may be matched or not (thus this function does
5612 * not validate). If the returned error code is SUCCESS, then the input
5613 * contains no surrogate, is in the Basic Multilingual Plane, and is
5614 * necessarily valid.
5615 */
5616 simdutf_warn_unused virtual result utf8_length_from_utf16be_with_replacement(
5617 const char16_t *input, size_t length) const noexcept = 0;
5618
5619#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
5620
5621#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
5622 /**
5623 * Convert possibly broken UTF-8 string into UTF-32 string.
5624 *
5625 * During the conversion also validation of the input string is done.
5626 * This function is suitable to work with inputs from untrusted sources.
5627 *
5628 * @param input the UTF-8 string to convert
5629 * @param length the length of the string in bytes
5630 * @param utf32_output the pointer to buffer that can hold conversion result
5631 * @return the number of written char32_t; 0 if the input was not valid UTF-8
5632 * string
5633 */
5634 simdutf_warn_unused virtual size_t
5635 convert_utf8_to_utf32(const char *input, size_t length,
5636 char32_t *utf32_output) const noexcept = 0;
5637
5638 /**
5639 * Convert possibly broken UTF-8 string into UTF-32 string and stop on error.
5640 *
5641 * During the conversion also validation of the input string is done.
5642 * This function is suitable to work with inputs from untrusted sources.
5643 *
5644 * @param input the UTF-8 string to convert
5645 * @param length the length of the string in bytes
5646 * @param utf32_output the pointer to buffer that can hold conversion result
5647 * @return a result pair struct (of type simdutf::result containing the two
5648 * fields error and count) with an error code and either position of the error
5649 * (in the input in code units) if any, or the number of char32_t written if
5650 * successful.
5651 */
5652 simdutf_warn_unused virtual result
5653 convert_utf8_to_utf32_with_errors(const char *input, size_t length,
5654 char32_t *utf32_output) const noexcept = 0;
5655#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
5656
5657#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
5658 /**
5659 * Convert valid UTF-8 string into UTF-16LE string.
5660 *
5661 * This function assumes that the input string is valid UTF-8.
5662 *
5663 * @param input the UTF-8 string to convert
5664 * @param length the length of the string in bytes
5665 * @param utf16_buffer the pointer to buffer that can hold conversion result
5666 * @return the number of written char16_t
5667 */
5668 simdutf_warn_unused virtual size_t
5669 convert_valid_utf8_to_utf16le(const char *input, size_t length,
5670 char16_t *utf16_buffer) const noexcept = 0;
5671
5672 /**
5673 * Convert valid UTF-8 string into UTF-16BE string.
5674 *
5675 * This function assumes that the input string is valid UTF-8.
5676 *
5677 * @param input the UTF-8 string to convert
5678 * @param length the length of the string in bytes
5679 * @param utf16_buffer the pointer to buffer that can hold conversion result
5680 * @return the number of written char16_t
5681 */
5682 simdutf_warn_unused virtual size_t
5683 convert_valid_utf8_to_utf16be(const char *input, size_t length,
5684 char16_t *utf16_buffer) const noexcept = 0;
5685#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
5686
5687#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
5688 /**
5689 * Convert valid UTF-8 string into UTF-32 string.
5690 *
5691 * This function assumes that the input string is valid UTF-8.
5692 *
5693 * @param input the UTF-8 string to convert
5694 * @param length the length of the string in bytes
5695 * @param utf32_buffer the pointer to buffer that can hold conversion result
5696 * @return the number of written char32_t
5697 */
5698 simdutf_warn_unused virtual size_t
5699 convert_valid_utf8_to_utf32(const char *input, size_t length,
5700 char32_t *utf32_buffer) const noexcept = 0;
5701#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
5702
5703#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
5704 /**
5705 * Compute the number of 2-byte code units that this UTF-8 string would
5706 * require in UTF-16LE format.
5707 *
5708 * This function does not validate the input. It is acceptable to pass invalid
5709 * UTF-8 strings but in such cases the result is implementation defined.
5710 *
5711 * @param input the UTF-8 string to process
5712 * @param length the length of the string in bytes
5713 * @return the number of char16_t code units required to encode the UTF-8
5714 * string as UTF-16LE
5715 */
5716 simdutf_warn_unused virtual size_t
5717 utf16_length_from_utf8(const char *input, size_t length) const noexcept = 0;
5718#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
5719
5720#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
5721 /**
5722 * Compute the number of 4-byte code units that this UTF-8 string would
5723 * require in UTF-32 format.
5724 *
5725 * This function is equivalent to count_utf8. It is acceptable to pass invalid
5726 * UTF-8 strings but in such cases the result is implementation defined.
5727 *
5728 * This function does not validate the input.
5729 *
5730 * @param input the UTF-8 string to process
5731 * @param length the length of the string in bytes
5732 * @return the number of char32_t code units required to encode the UTF-8
5733 * string as UTF-32
5734 */
5735 simdutf_warn_unused virtual size_t
5736 utf32_length_from_utf8(const char *input, size_t length) const noexcept = 0;
5737#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
5738
5739#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1
5740 /**
5741 * Convert possibly broken UTF-16LE string into Latin1 string.
5742 *
5743 * During the conversion also validation of the input string is done.
5744 * This function is suitable to work with inputs from untrusted sources.
5745 *
5746 * This function is not BOM-aware.
5747 *
5748 * @param input the UTF-16LE string to convert
5749 * @param length the length of the string in 2-byte code units
5750 * (char16_t)
5751 * @param latin1_buffer the pointer to buffer that can hold conversion
5752 * result
5753 * @return number of written code units; 0 if input is not a valid UTF-16LE
5754 * string or if it cannot be represented as Latin1
5755 */
5756 simdutf_warn_unused virtual size_t
5757 convert_utf16le_to_latin1(const char16_t *input, size_t length,
5758 char *latin1_buffer) const noexcept = 0;
5759
5760 /**
5761 * Convert possibly broken UTF-16BE string into Latin1 string.
5762 *
5763 * During the conversion also validation of the input string is done.
5764 * This function is suitable to work with inputs from untrusted sources.
5765 *
5766 * This function is not BOM-aware.
5767 *
5768 * @param input the UTF-16BE string to convert
5769 * @param length the length of the string in 2-byte code units
5770 * (char16_t)
5771 * @param latin1_buffer the pointer to buffer that can hold conversion
5772 * result
5773 * @return number of written code units; 0 if input is not a valid UTF-16BE
5774 * string or if it cannot be represented as Latin1
5775 */
5776 simdutf_warn_unused virtual size_t
5777 convert_utf16be_to_latin1(const char16_t *input, size_t length,
5778 char *latin1_buffer) const noexcept = 0;
5779
5780 /**
5781 * Convert possibly broken UTF-16LE string into Latin1 string.
5782 * If the string cannot be represented as Latin1, an error
5783 * is returned.
5784 *
5785 * During the conversion also validation of the input string is done.
5786 * This function is suitable to work with inputs from untrusted sources.
5787 * This function is not BOM-aware.
5788 *
5789 * @param input the UTF-16LE string to convert
5790 * @param length the length of the string in 2-byte code units
5791 * (char16_t)
5792 * @param latin1_buffer the pointer to buffer that can hold conversion
5793 * result
5794 * @return a result pair struct (of type simdutf::result containing the two
5795 * fields error and count) with an error code and either position of the error
5796 * (in the input in code units) if any, or the number of char written if
5797 * successful.
5798 */
5799 simdutf_warn_unused virtual result
5800 convert_utf16le_to_latin1_with_errors(const char16_t *input, size_t length,
5801 char *latin1_buffer) const noexcept = 0;
5802
5803 /**
5804 * Convert possibly broken UTF-16BE string into Latin1 string.
5805 * If the string cannot be represented as Latin1, an error
5806 * is returned.
5807 *
5808 * During the conversion also validation of the input string is done.
5809 * This function is suitable to work with inputs from untrusted sources.
5810 * This function is not BOM-aware.
5811 *
5812 * @param input the UTF-16BE string to convert
5813 * @param length the length of the string in 2-byte code units
5814 * (char16_t)
5815 * @param latin1_buffer the pointer to buffer that can hold conversion
5816 * result
5817 * @return a result pair struct (of type simdutf::result containing the two
5818 * fields error and count) with an error code and either position of the error
5819 * (in the input in code units) if any, or the number of char written if
5820 * successful.
5821 */
5822 simdutf_warn_unused virtual result
5823 convert_utf16be_to_latin1_with_errors(const char16_t *input, size_t length,
5824 char *latin1_buffer) const noexcept = 0;
5825
5826 /**
5827 * Convert valid UTF-16LE string into Latin1 string.
5828 *
5829 * This function assumes that the input string is valid UTF-16LE and that it
5830 * can be represented as Latin1. If you violate this assumption, the result is
5831 * implementation defined and may include system-dependent behavior such as
5832 * crashes.
5833 *
5834 * This function is for expert users only and not part of our public API. Use
5835 * convert_utf16le_to_latin1 instead.
5836 *
5837 * This function is not BOM-aware.
5838 *
5839 * @param input the UTF-16LE string to convert
5840 * @param length the length of the string in 2-byte code units
5841 * (char16_t)
5842 * @param latin1_buffer the pointer to buffer that can hold conversion
5843 * result
5844 * @return number of written code units; 0 if conversion is not possible
5845 */
5846 simdutf_warn_unused virtual size_t
5847 convert_valid_utf16le_to_latin1(const char16_t *input, size_t length,
5848 char *latin1_buffer) const noexcept = 0;
5849
5850 /**
5851 * Convert valid UTF-16BE string into Latin1 string.
5852 *
5853 * This function assumes that the input string is valid UTF-16BE and that it
5854 * can be represented as Latin1. If you violate this assumption, the result is
5855 * implementation defined and may include system-dependent behavior such as
5856 * crashes.
5857 *
5858 * This function is for expert users only and not part of our public API. Use
5859 * convert_utf16be_to_latin1 instead.
5860 *
5861 * This function is not BOM-aware.
5862 *
5863 * @param input the UTF-16BE string to convert
5864 * @param length the length of the string in 2-byte code units
5865 * (char16_t)
5866 * @param latin1_buffer the pointer to buffer that can hold conversion
5867 * result
5868 * @return number of written code units; 0 if conversion is not possible
5869 */
5870 simdutf_warn_unused virtual size_t
5871 convert_valid_utf16be_to_latin1(const char16_t *input, size_t length,
5872 char *latin1_buffer) const noexcept = 0;
5873#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1
5874
5875#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
5876 /**
5877 * Convert possibly broken UTF-16LE string into UTF-8 string.
5878 *
5879 * During the conversion also validation of the input string is done.
5880 * This function is suitable to work with inputs from untrusted sources.
5881 *
5882 * This function is not BOM-aware.
5883 *
5884 * @param input the UTF-16LE string to convert
5885 * @param length the length of the string in 2-byte code units
5886 * (char16_t)
5887 * @param utf8_buffer the pointer to buffer that can hold conversion result
5888 * @return number of written code units; 0 if input is not a valid UTF-16LE
5889 * string
5890 */
5891 simdutf_warn_unused virtual size_t
5892 convert_utf16le_to_utf8(const char16_t *input, size_t length,
5893 char *utf8_buffer) const noexcept = 0;
5894
5895 /**
5896 * Convert possibly broken UTF-16BE string into UTF-8 string.
5897 *
5898 * During the conversion also validation of the input string is done.
5899 * This function is suitable to work with inputs from untrusted sources.
5900 *
5901 * This function is not BOM-aware.
5902 *
5903 * @param input the UTF-16BE string to convert
5904 * @param length the length of the string in 2-byte code units
5905 * (char16_t)
5906 * @param utf8_buffer the pointer to buffer that can hold conversion result
5907 * @return number of written code units; 0 if input is not a valid UTF-16BE
5908 * string
5909 */
5910 simdutf_warn_unused virtual size_t
5911 convert_utf16be_to_utf8(const char16_t *input, size_t length,
5912 char *utf8_buffer) const noexcept = 0;
5913
5914 /**
5915 * Convert possibly broken UTF-16LE string into UTF-8 string and stop on
5916 * error.
5917 *
5918 * During the conversion also validation of the input string is done.
5919 * This function is suitable to work with inputs from untrusted sources.
5920 *
5921 * This function is not BOM-aware.
5922 *
5923 * @param input the UTF-16LE string to convert
5924 * @param length the length of the string in 2-byte code units
5925 * (char16_t)
5926 * @param utf8_buffer the pointer to buffer that can hold conversion result
5927 * @return a result pair struct (of type simdutf::result containing the two
5928 * fields error and count) with an error code and either position of the error
5929 * (in the input in code units) if any, or the number of char written if
5930 * successful.
5931 */
5932 simdutf_warn_unused virtual result
5933 convert_utf16le_to_utf8_with_errors(const char16_t *input, size_t length,
5934 char *utf8_buffer) const noexcept = 0;
5935
5936 /**
5937 * Convert possibly broken UTF-16BE string into UTF-8 string and stop on
5938 * error.
5939 *
5940 * During the conversion also validation of the input string is done.
5941 * This function is suitable to work with inputs from untrusted sources.
5942 *
5943 * This function is not BOM-aware.
5944 *
5945 * @param input the UTF-16BE string to convert
5946 * @param length the length of the string in 2-byte code units
5947 * (char16_t)
5948 * @param utf8_buffer the pointer to buffer that can hold conversion result
5949 * @return a result pair struct (of type simdutf::result containing the two
5950 * fields error and count) with an error code and either position of the error
5951 * (in the input in code units) if any, or the number of char written if
5952 * successful.
5953 */
5954 simdutf_warn_unused virtual result
5955 convert_utf16be_to_utf8_with_errors(const char16_t *input, size_t length,
5956 char *utf8_buffer) const noexcept = 0;
5957
5958 /**
5959 * Convert possibly broken UTF-16LE string into UTF-8 string, replacing
5960 * unpaired surrogates with the Unicode replacement character U+FFFD.
5961 *
5962 * This function always succeeds: unpaired surrogates are replaced with
5963 * U+FFFD (3 bytes in UTF-8: 0xEF 0xBF 0xBD).
5964 *
5965 * This function is not BOM-aware.
5966 *
5967 * @param input the UTF-16LE string to convert
5968 * @param length the length of the string in 2-byte code units
5969 * (char16_t)
5970 * @param utf8_buffer the pointer to buffer that can hold conversion result
5971 * @return number of written code units
5972 */
5973 simdutf_warn_unused virtual size_t convert_utf16le_to_utf8_with_replacement(
5974 const char16_t *input, size_t length,
5975 char *utf8_buffer) const noexcept = 0;
5976
5977 /**
5978 * Convert possibly broken UTF-16BE string into UTF-8 string, replacing
5979 * unpaired surrogates with the Unicode replacement character U+FFFD.
5980 *
5981 * This function always succeeds: unpaired surrogates are replaced with
5982 * U+FFFD (3 bytes in UTF-8: 0xEF 0xBF 0xBD).
5983 *
5984 * This function is not BOM-aware.
5985 *
5986 * @param input the UTF-16BE string to convert
5987 * @param length the length of the string in 2-byte code units
5988 * (char16_t)
5989 * @param utf8_buffer the pointer to buffer that can hold conversion result
5990 * @return number of written code units
5991 */
5992 simdutf_warn_unused virtual size_t convert_utf16be_to_utf8_with_replacement(
5993 const char16_t *input, size_t length,
5994 char *utf8_buffer) const noexcept = 0;
5995
5996 /**
5997 * Convert valid UTF-16LE string into UTF-8 string.
5998 *
5999 * This function assumes that the input string is valid UTF-16LE.
6000 *
6001 * This function is not BOM-aware.
6002 *
6003 * @param input the UTF-16LE string to convert
6004 * @param length the length of the string in 2-byte code units
6005 * (char16_t)
6006 * @param utf8_buffer the pointer to a buffer that can hold the conversion
6007 * result
6008 * @return number of written code units; 0 if conversion is not possible
6009 */
6010 simdutf_warn_unused virtual size_t
6011 convert_valid_utf16le_to_utf8(const char16_t *input, size_t length,
6012 char *utf8_buffer) const noexcept = 0;
6013
6014 /**
6015 * Convert valid UTF-16BE string into UTF-8 string.
6016 *
6017 * This function assumes that the input string is valid UTF-16BE.
6018 *
6019 * This function is not BOM-aware.
6020 *
6021 * @param input the UTF-16BE string to convert
6022 * @param length the length of the string in 2-byte code units
6023 * (char16_t)
6024 * @param utf8_buffer the pointer to a buffer that can hold the conversion
6025 * result
6026 * @return number of written code units; 0 if conversion is not possible
6027 */
6028 simdutf_warn_unused virtual size_t
6029 convert_valid_utf16be_to_utf8(const char16_t *input, size_t length,
6030 char *utf8_buffer) const noexcept = 0;
6031#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
6032
6033#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32
6034 /**
6035 * Convert possibly broken UTF-16LE string into UTF-32 string.
6036 *
6037 * During the conversion also validation of the input string is done.
6038 * This function is suitable to work with inputs from untrusted sources.
6039 *
6040 * This function is not BOM-aware.
6041 *
6042 * @param input the UTF-16LE string to convert
6043 * @param length the length of the string in 2-byte code units
6044 * (char16_t)
6045 * @param utf32_buffer the pointer to buffer that can hold conversion result
6046 * @return number of written code units; 0 if input is not a valid UTF-16LE
6047 * string
6048 */
6049 simdutf_warn_unused virtual size_t
6050 convert_utf16le_to_utf32(const char16_t *input, size_t length,
6051 char32_t *utf32_buffer) const noexcept = 0;
6052
6053 /**
6054 * Convert possibly broken UTF-16BE string into UTF-32 string.
6055 *
6056 * During the conversion also validation of the input string is done.
6057 * This function is suitable to work with inputs from untrusted sources.
6058 *
6059 * This function is not BOM-aware.
6060 *
6061 * @param input the UTF-16BE string to convert
6062 * @param length the length of the string in 2-byte code units
6063 * (char16_t)
6064 * @param utf32_buffer the pointer to buffer that can hold conversion result
6065 * @return number of written code units; 0 if input is not a valid UTF-16BE
6066 * string
6067 */
6068 simdutf_warn_unused virtual size_t
6069 convert_utf16be_to_utf32(const char16_t *input, size_t length,
6070 char32_t *utf32_buffer) const noexcept = 0;
6071
6072 /**
6073 * Convert possibly broken UTF-16LE string into UTF-32 string and stop on
6074 * error.
6075 *
6076 * During the conversion also validation of the input string is done.
6077 * This function is suitable to work with inputs from untrusted sources.
6078 *
6079 * This function is not BOM-aware.
6080 *
6081 * @param input the UTF-16LE string to convert
6082 * @param length the length of the string in 2-byte code units
6083 * (char16_t)
6084 * @param utf32_buffer the pointer to buffer that can hold conversion result
6085 * @return a result pair struct (of type simdutf::result containing the two
6086 * fields error and count) with an error code and either position of the error
6087 * (in the input in code units) if any, or the number of char32_t written if
6088 * successful.
6089 */
6090 simdutf_warn_unused virtual result convert_utf16le_to_utf32_with_errors(
6091 const char16_t *input, size_t length,
6092 char32_t *utf32_buffer) const noexcept = 0;
6093
6094 /**
6095 * Convert possibly broken UTF-16BE string into UTF-32 string and stop on
6096 * error.
6097 *
6098 * During the conversion also validation of the input string is done.
6099 * This function is suitable to work with inputs from untrusted sources.
6100 *
6101 * This function is not BOM-aware.
6102 *
6103 * @param input the UTF-16BE string to convert
6104 * @param length the length of the string in 2-byte code units
6105 * (char16_t)
6106 * @param utf32_buffer the pointer to buffer that can hold conversion result
6107 * @return a result pair struct (of type simdutf::result containing the two
6108 * fields error and count) with an error code and either position of the error
6109 * (in the input in code units) if any, or the number of char32_t written if
6110 * successful.
6111 */
6112 simdutf_warn_unused virtual result convert_utf16be_to_utf32_with_errors(
6113 const char16_t *input, size_t length,
6114 char32_t *utf32_buffer) const noexcept = 0;
6115
6116 /**
6117 * Convert valid UTF-16LE string into UTF-32 string.
6118 *
6119 * This function assumes that the input string is valid UTF-16LE.
6120 *
6121 * This function is not BOM-aware.
6122 *
6123 * @param input the UTF-16LE string to convert
6124 * @param length the length of the string in 2-byte code units
6125 * (char16_t)
6126 * @param utf32_buffer the pointer to a buffer that can hold the conversion
6127 * result
6128 * @return number of written code units; 0 if conversion is not possible
6129 */
6130 simdutf_warn_unused virtual size_t
6131 convert_valid_utf16le_to_utf32(const char16_t *input, size_t length,
6132 char32_t *utf32_buffer) const noexcept = 0;
6133
6134 /**
6135 * Convert valid UTF-16BE string into UTF-32 string.
6136 *
6137 * This function assumes that the input string is valid UTF-16BE.
6138 *
6139 * This function is not BOM-aware.
6140 *
6141 * @param input the UTF-16BE string to convert
6142 * @param length the length of the string in 2-byte code units
6143 * (char16_t)
6144 * @param utf32_buffer the pointer to a buffer that can hold the conversion
6145 * result
6146 * @return number of written code units; 0 if conversion is not possible
6147 */
6148 simdutf_warn_unused virtual size_t
6149 convert_valid_utf16be_to_utf32(const char16_t *input, size_t length,
6150 char32_t *utf32_buffer) const noexcept = 0;
6151#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32
6152
6153#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
6154 /**
6155 * Compute the number of bytes that this UTF-16LE string would require in
6156 * UTF-8 format.
6157 *
6158 * This function does not validate the input. It is acceptable to pass invalid
6159 * UTF-16 strings but in such cases the result is implementation defined.
6160 *
6161 * This function is not BOM-aware.
6162 *
6163 * @param input the UTF-16LE string to convert
6164 * @param length the length of the string in 2-byte code units
6165 * (char16_t)
6166 * @return the number of bytes required to encode the UTF-16LE string as UTF-8
6167 */
6168 simdutf_warn_unused virtual size_t
6169 utf8_length_from_utf16le(const char16_t *input,
6170 size_t length) const noexcept = 0;
6171
6172 /**
6173 * Compute the number of bytes that this UTF-16BE string would require in
6174 * UTF-8 format.
6175 *
6176 * This function does not validate the input. It is acceptable to pass invalid
6177 * UTF-16 strings but in such cases the result is implementation defined.
6178 *
6179 * This function is not BOM-aware.
6180 *
6181 * @param input the UTF-16BE string to convert
6182 * @param length the length of the string in 2-byte code units
6183 * (char16_t)
6184 * @return the number of bytes required to encode the UTF-16BE string as UTF-8
6185 */
6186 simdutf_warn_unused virtual size_t
6187 utf8_length_from_utf16be(const char16_t *input,
6188 size_t length) const noexcept = 0;
6189#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
6190
6191#if SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1
6192 /**
6193 * Convert possibly broken UTF-32 string into Latin1 string.
6194 *
6195 * During the conversion also validation of the input string is done.
6196 * This function is suitable to work with inputs from untrusted sources.
6197 *
6198 * This function is not BOM-aware.
6199 *
6200 * @param input the UTF-32 string to convert
6201 * @param length the length of the string in 4-byte code units
6202 * (char32_t)
6203 * @param latin1_buffer the pointer to buffer that can hold conversion
6204 * result
6205 * @return number of written code units; 0 if input is not a valid UTF-32
6206 * string or if it cannot be represented as Latin1
6207 */
6208 simdutf_warn_unused virtual size_t
6209 convert_utf32_to_latin1(const char32_t *input, size_t length,
6210 char *latin1_buffer) const noexcept = 0;
6211#endif // SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1
6212
6213#if SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1
6214 /**
6215 * Convert possibly broken UTF-32 string into Latin1 string and stop on error.
6216 * If the string cannot be represented as Latin1, an error is returned.
6217 *
6218 * During the conversion also validation of the input string is done.
6219 * This function is suitable to work with inputs from untrusted sources.
6220 *
6221 * This function is not BOM-aware.
6222 *
6223 * @param input the UTF-32 string to convert
6224 * @param length the length of the string in 4-byte code units
6225 * (char32_t)
6226 * @param latin1_buffer the pointer to buffer that can hold conversion
6227 * result
6228 * @return a result pair struct (of type simdutf::result containing the two
6229 * fields error and count) with an error code and either position of the error
6230 * (in the input in code units) if any, or the number of char written if
6231 * successful.
6232 */
6233 simdutf_warn_unused virtual result
6234 convert_utf32_to_latin1_with_errors(const char32_t *input, size_t length,
6235 char *latin1_buffer) const noexcept = 0;
6236
6237 /**
6238 * Convert valid UTF-32 string into Latin1 string.
6239 *
6240 * This function assumes that the input string is valid UTF-32 and can be
6241 * represented as Latin1. If you violate this assumption, the result is
6242 * implementation defined and may include system-dependent behavior such as
6243 * crashes.
6244 *
6245 * This function is for expert users only and not part of our public API. Use
6246 * convert_utf32_to_latin1 instead.
6247 *
6248 * This function is not BOM-aware.
6249 *
6250 * @param input the UTF-32 string to convert
6251 * @param length the length of the string in 4-byte code units
6252 * (char32_t)
6253 * @param latin1_buffer the pointer to a buffer that can hold the conversion
6254 * result
6255 * @return number of written code units; 0 if conversion is not possible
6256 */
6257 simdutf_warn_unused virtual size_t
6258 convert_valid_utf32_to_latin1(const char32_t *input, size_t length,
6259 char *latin1_buffer) const noexcept = 0;
6260#endif // SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1
6261
6262#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
6263 /**
6264 * Convert possibly broken UTF-32 string into UTF-8 string.
6265 *
6266 * During the conversion also validation of the input string is done.
6267 * This function is suitable to work with inputs from untrusted sources.
6268 *
6269 * This function is not BOM-aware.
6270 *
6271 * @param input the UTF-32 string to convert
6272 * @param length the length of the string in 4-byte code units
6273 * (char32_t)
6274 * @param utf8_buffer the pointer to buffer that can hold conversion result
6275 * @return number of written code units; 0 if input is not a valid UTF-32
6276 * string
6277 */
6278 simdutf_warn_unused virtual size_t
6279 convert_utf32_to_utf8(const char32_t *input, size_t length,
6280 char *utf8_buffer) const noexcept = 0;
6281
6282 /**
6283 * Convert possibly broken UTF-32 string into UTF-8 string and stop on error.
6284 *
6285 * During the conversion also validation of the input string is done.
6286 * This function is suitable to work with inputs from untrusted sources.
6287 *
6288 * This function is not BOM-aware.
6289 *
6290 * @param input the UTF-32 string to convert
6291 * @param length the length of the string in 4-byte code units
6292 * (char32_t)
6293 * @param utf8_buffer the pointer to buffer that can hold conversion result
6294 * @return a result pair struct (of type simdutf::result containing the two
6295 * fields error and count) with an error code and either position of the error
6296 * (in the input in code units) if any, or the number of char written if
6297 * successful.
6298 */
6299 simdutf_warn_unused virtual result
6300 convert_utf32_to_utf8_with_errors(const char32_t *input, size_t length,
6301 char *utf8_buffer) const noexcept = 0;
6302
6303 /**
6304 * Convert valid UTF-32 string into UTF-8 string.
6305 *
6306 * This function assumes that the input string is valid UTF-32.
6307 *
6308 * This function is not BOM-aware.
6309 *
6310 * @param input the UTF-32 string to convert
6311 * @param length the length of the string in 4-byte code units
6312 * (char32_t)
6313 * @param utf8_buffer the pointer to a buffer that can hold the conversion
6314 * result
6315 * @return number of written code units; 0 if conversion is not possible
6316 */
6317 simdutf_warn_unused virtual size_t
6318 convert_valid_utf32_to_utf8(const char32_t *input, size_t length,
6319 char *utf8_buffer) const noexcept = 0;
6320#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
6321
6322#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1
6323 /**
6324 * Return the number of 2-byte code units (char16_t) that a Latin1 string
6325 * of the given length would require in UTF-16 format.
6326 *
6327 * This default implementation returns length unchanged.
6328 * @param length the length of the Latin1 string in bytes
6329 * (one byte per character)
6330 * @return the number of char16_t code units needed to encode it as UTF-16
6331 */
6332 simdutf_warn_unused virtual size_t
6333 utf16_length_from_latin1(size_t length) const noexcept {
6334 return length;
6335 }
6336#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1
6337
6338#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32
6339 /**
6340 * Convert possibly broken UTF-32 string into UTF-16LE string.
6341 *
6342 * During the conversion also validation of the input string is done.
6343 * This function is suitable to work with inputs from untrusted sources.
6344 *
6345 * This function is not BOM-aware.
6346 *
6347 * @param input the UTF-32 string to convert
6348 * @param length the length of the string in 4-byte code units
6349 * (char32_t)
6350 * @param utf16_buffer the pointer to buffer that can hold conversion result
6351 * @return number of written code units; 0 if input is not a valid UTF-32
6352 * string
6353 */
6354 simdutf_warn_unused virtual size_t
6355 convert_utf32_to_utf16le(const char32_t *input, size_t length,
6356 char16_t *utf16_buffer) const noexcept = 0;
6357
6358 /**
6359 * Convert possibly broken UTF-32 string into UTF-16BE string.
6360 *
6361 * During the conversion also validation of the input string is done.
6362 * This function is suitable to work with inputs from untrusted sources.
6363 *
6364 * This function is not BOM-aware.
6365 *
6366 * @param input the UTF-32 string to convert
6367 * @param length the length of the string in 4-byte code units
6368 * (char32_t)
6369 * @param utf16_buffer the pointer to buffer that can hold conversion result
6370 * @return number of written code units; 0 if input is not a valid UTF-32
6371 * string
6372 */
6373 simdutf_warn_unused virtual size_t
6374 convert_utf32_to_utf16be(const char32_t *input, size_t length,
6375 char16_t *utf16_buffer) const noexcept = 0;
6376
6377 /**
6378 * Convert possibly broken UTF-32 string into UTF-16LE string and stop on
6379 * error.
6380 *
6381 * The input string is also validated during the conversion.
6382 * This function is suitable to work with inputs from untrusted sources.
6383 *
6384 * This function is not BOM-aware.
6385 *
6386 * @param input the UTF-32 string to convert
6387 * @param length the length of the string in 4-byte code units
6388 * (char32_t)
6389 * @param utf16_buffer the pointer to buffer that can hold conversion result
6390 * @return a result pair struct (of type simdutf::result containing the two
6391 * fields error and count) with an error code and either position of the error
6392 * (in the input in code units) if any, or the number of char16_t written if
6393 * successful.
6394 */
6395 simdutf_warn_unused virtual result convert_utf32_to_utf16le_with_errors(
6396 const char32_t *input, size_t length,
6397 char16_t *utf16_buffer) const noexcept = 0;
6398
6399 /**
6400 * Convert possibly broken UTF-32 string into UTF-16BE string and stop on
6401 * error.
6402 *
6403 * The input string is also validated during the conversion.
6404 * This function is suitable to work with inputs from untrusted sources.
6405 *
6406 * This function is not BOM-aware.
6407 *
6408 * @param input the UTF-32 string to convert
6409 * @param length the length of the string in 4-byte code units
6410 * (char32_t)
6411 * @param utf16_buffer the pointer to buffer that can hold conversion result
6412 * @return a result pair struct (of type simdutf::result containing the two
6413 * fields error and count) with an error code and either position of the error
6414 * (in the input in code units) if any, or the number of char16_t written if
6415 * successful.
6416 */
6417 simdutf_warn_unused virtual result convert_utf32_to_utf16be_with_errors(
6418 const char32_t *input, size_t length,
6419 char16_t *utf16_buffer) const noexcept = 0;
6420
6421 /**
6422 * Convert valid UTF-32 string into UTF-16LE string.
6423 *
6424 * This function assumes that the input string is valid UTF-32.
6425 *
6426 * This function is not BOM-aware.
6427 *
6428 * @param input the UTF-32 string to convert
6429 * @param length the length of the string in 4-byte code units
6430 * (char32_t)
6431 * @param utf16_buffer the pointer to a buffer that can hold the conversion
6432 * result
6433 * @return number of written code units; 0 if conversion is not possible
6434 */
6435 simdutf_warn_unused virtual size_t
6436 convert_valid_utf32_to_utf16le(const char32_t *input, size_t length,
6437 char16_t *utf16_buffer) const noexcept = 0;
6438
6439 /**
6440 * Convert valid UTF-32 string into UTF-16BE string.
6441 *
6442 * This function assumes that the input string is valid UTF-32.
6443 *
6444 * This function is not BOM-aware.
6445 *
6446 * @param input the UTF-32 string to convert
6447 * @param length the length of the string in 4-byte code units
6448 * (char32_t)
6449 * @param utf16_buffer the pointer to a buffer that can hold the conversion
6450 * result
6451 * @return number of written code units; 0 if conversion is not possible
6452 */
6453 simdutf_warn_unused virtual size_t
6454 convert_valid_utf32_to_utf16be(const char32_t *input, size_t length,
6455 char16_t *utf16_buffer) const noexcept = 0;
6456#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32
6457
6458#if SIMDUTF_FEATURE_UTF16
6459 /**
6460 * Change the endianness of the input. Can be used to go from UTF-16LE to
6461 * UTF-16BE or from UTF-16BE to UTF-16LE.
6462 *
6463 * This function does not validate the input.
6464 *
6465 * This function is not BOM-aware.
6466 *
6467 * @param input the UTF-16 string to process
6468 * @param length the length of the string in 2-byte code units
6469 * (char16_t)
6470 * @param output the pointer to a buffer that can hold the conversion
6471 * result
6472 */
6473 virtual void change_endianness_utf16(const char16_t *input, size_t length,
6474 char16_t *output) const noexcept = 0;
6475#endif // SIMDUTF_FEATURE_UTF16
6476
6477#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
6478 /**
6479 * Return the number of bytes that this Latin1 string would require in UTF-8
6480 * format.
6481 *
6482 * @param input the Latin1 string to convert
6483 * @param length the length of the string in bytes
6484 * @return the number of bytes required to encode the Latin1 string as UTF-8
6485 */
6486 simdutf_warn_unused virtual size_t
6487 utf8_length_from_latin1(const char *input, size_t length) const noexcept = 0;
6488#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
6489
6490#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
6491 /**
6492 * Compute the number of bytes that this UTF-32 string would require in UTF-8
6493 * format.
6494 *
6495 * This function does not validate the input. It is acceptable to pass invalid
6496 * UTF-32 strings but in such cases the result is implementation defined.
6497 *
6498 * @param input the UTF-32 string to convert
6499 * @param length the length of the string in 4-byte code units
6500 * (char32_t)
6501 * @return the number of bytes required to encode the UTF-32 string as UTF-8
6502 */
6503 simdutf_warn_unused virtual size_t
6504 utf8_length_from_utf32(const char32_t *input,
6505 size_t length) const noexcept = 0;
6506#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF32
6507
6508#if SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1
6509 /**
6510 * Compute the number of bytes that this UTF-32 string would require in Latin1
6511 * format.
6512 *
6513 * This function does not validate the input. It is acceptable to pass invalid
6514 * UTF-32 strings but in such cases the result is implementation defined.
6515 *
6516 * @param length the length of the string in 4-byte code units
6517 * (char32_t)
6518 * @return the number of bytes required to encode the UTF-32 string as Latin1
6519 */
6520 simdutf_warn_unused virtual size_t
6521 latin1_length_from_utf32(size_t length) const noexcept {
6522 return length;
6523 }
6524#endif // SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1
6525
6526#if SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
6527 /**
6528 * Compute the number of bytes that this UTF-8 string would require in Latin1
6529 * format.
6530 *
6531 * This function does not validate the input. It is acceptable to pass invalid
6532 * UTF-8 strings but in such cases the result is implementation defined.
6533 *
6534 * @param input the UTF-8 string to convert
6535 * @param length the length of the string in bytes
6536 * @return the number of bytes required to encode the UTF-8 string as Latin1
6537 */
6538 simdutf_warn_unused virtual size_t
6539 latin1_length_from_utf8(const char *input, size_t length) const noexcept = 0;
6540#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_LATIN1
6541
6542#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1
6543 /**
6544 * Compute the number of bytes that this UTF-16LE/BE string would require in
6545 * Latin1 format.
6546 *
6547 * This function does not validate the input. It is acceptable to pass invalid
6548 * UTF-16 strings but in such cases the result is implementation defined.
6549 *
6550 * This function is not BOM-aware.
6551 *
6552 * @param length the length of the string in 2-byte code units
6553 * (char16_t)
6554 * @return the number of bytes required to encode the UTF-16LE/BE string as
6555 * Latin1; this default implementation returns length unchanged
6556 */
6557 simdutf_warn_unused virtual size_t
6558 latin1_length_from_utf16(size_t length) const noexcept {
6559 return length;
6560 }
6561#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_LATIN1
6562
6563#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32
6564 /**
6565 * Compute the number of two-byte code units that this UTF-32 string would
6566 * require in UTF-16 format.
6567 *
6568 * This function does not validate the input. It is acceptable to pass invalid
6569 * UTF-32 strings but in such cases the result is implementation defined.
6570 *
6571 * @param input the UTF-32 string to convert
6572 * @param length the length of the string in 4-byte code units
6573 * (char32_t)
6574 * @return the number of char16_t code units required to encode it as UTF-16
6575 */
6576 simdutf_warn_unused virtual size_t
6577 utf16_length_from_utf32(const char32_t *input,
6578 size_t length) const noexcept = 0;
6579#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32
6580
6581#if SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1
6582 /**
6583 * Return the number of 4-byte code units (char32_t) that a Latin1 string
6584 * of the given length would require in UTF-32 format.
6585 *
6586 * @param length the length of the Latin1 string in bytes
6587 * (one byte per character)
6588 * @return the number of char32_t code units needed (equal to length)
6589 */
6590 simdutf_warn_unused virtual size_t
6591 utf32_length_from_latin1(size_t length) const noexcept {
6592 return length;
6593 }
6594#endif // SIMDUTF_FEATURE_UTF32 && SIMDUTF_FEATURE_LATIN1
6595
6596#if SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32
6597 /**
6598 * Compute the number of 4-byte code units (char32_t) that this UTF-16LE
6599 * string would require in UTF-32 format.
6600 *
6601 * This function is equivalent to count_utf16le.
6602 *
6603 * This function does not validate the input. It is acceptable to pass invalid
6604 * UTF-16 strings but in such cases the result is implementation defined.
6605 *
6606 * This function is not BOM-aware.
6607 *
6608 * @param input the UTF-16LE string to convert
6609 * @param length the length of the string in 2-byte code units
6610 * (char16_t)
6611 * @return the number of char32_t code units required to encode the
6612 * UTF-16LE string as UTF-32
6613 */
6614 simdutf_warn_unused virtual size_t
6615 utf32_length_from_utf16le(const char16_t *input,
6616 size_t length) const noexcept = 0;
6617
6618 /**
6619 * Compute the number of 4-byte code units (char32_t) that this UTF-16BE
6620 * string would require in UTF-32 format.
6621 *
6622 * This function is equivalent to count_utf16be.
6623 *
6624 * This function does not validate the input. It is acceptable to pass invalid
6625 * UTF-16 strings but in such cases the result is implementation defined.
6626 *
6627 * This function is not BOM-aware.
6628 *
6629 * @param input the UTF-16BE string to convert
6630 * @param length the length of the string in 2-byte code units
6631 * (char16_t)
6632 * @return the number of char32_t code units required to encode the
6633 * UTF-16BE string as UTF-32
6634 */
6635 simdutf_warn_unused virtual size_t
6636 utf32_length_from_utf16be(const char16_t *input,
6637 size_t length) const noexcept = 0;
6638#endif // SIMDUTF_FEATURE_UTF16 && SIMDUTF_FEATURE_UTF32
6639
6640#if SIMDUTF_FEATURE_UTF16
6641 /**
6642 * Count the number of code points (characters) in the string assuming that
6643 * it is valid.
6644 *
6645 * This function assumes that the input string is valid UTF-16LE.
6646 * It is acceptable to pass invalid UTF-16 strings but in such cases
6647 * the result is implementation defined.
6648 *
6649 * This function is not BOM-aware.
6650 *
6651 * @param input the UTF-16LE string to process
6652 * @param length the length of the string in 2-byte code units
6653 * (char16_t)
6654 * @return number of code points
6655 */
6656 simdutf_warn_unused virtual size_t
6657 count_utf16le(const char16_t *input, size_t length) const noexcept = 0;
6658
6659 /**
6660 * Count the number of code points (characters) in the string assuming that
6661 * it is valid.
6662 *
6663 * This function assumes that the input string is valid UTF-16BE.
6664 * It is acceptable to pass invalid UTF-16 strings but in such cases
6665 * the result is implementation defined.
6666 *
6667 * This function is not BOM-aware.
6668 *
6669 * @param input the UTF-16BE string to process
6670 * @param length the length of the string in 2-byte code units
6671 * (char16_t)
6672 * @return number of code points
6673 */
6674 simdutf_warn_unused virtual size_t
6675 count_utf16be(const char16_t *input, size_t length) const noexcept = 0;
6676#endif // SIMDUTF_FEATURE_UTF16
6677
6678#if SIMDUTF_FEATURE_UTF8
6679 /**
6680 * Count the number of code points (characters) in the string assuming that
6681 * it is valid.
6682 *
6683 * This function assumes that the input string is valid UTF-8.
6684 * It is acceptable to pass invalid UTF-8 strings but in such cases
6685 * the result is implementation defined.
6686 *
6687 * @param input the UTF-8 string to process
6688 * @param length the length of the string in bytes
6689 * @return number of code points
6690 */
6691 simdutf_warn_unused virtual size_t
6692 count_utf8(const char *input, size_t length) const noexcept = 0;
6693#endif // SIMDUTF_FEATURE_UTF8
6694
6695#if SIMDUTF_FEATURE_BASE64
6696 /**
6697 * Provide the maximal binary length in bytes given the base64 input.
6698 * As long as the input does not contain ignorable characters (e.g., ASCII
6699 * spaces or linefeed characters), the result is exact. In particular, the
6700 * function checks for padding characters.
6701 *
6702 * The function is fast (constant time). It checks up to two characters at
6703 * the end of the string. The input is not otherwise validated or read.
6704 *
6705 * @param input the base64 input to process
6706 * @param length the length of the base64 input in bytes
6707 * @return maximal number of binary bytes
6708 */
6709 simdutf_warn_unused size_t maximal_binary_length_from_base64(
6710 const char *input, size_t length) const noexcept;
6711
6712 /**
6713 * Provide the maximal binary length in bytes given the base64 input.
6714 * As long as the input does not contain ignorable characters (e.g., ASCII
6715 * spaces or linefeed characters), the result is exact. In particular, the
6716 * function checks for padding characters.
6717 *
6718 * The function is fast (constant time). It checks up to two characters at
6719 * the end of the string. The input is not otherwise validated or read.
6720 *
6721 * @param input the base64 input to process, in ASCII stored as 16-bit
6722 * units
6723 * @param length the length of the base64 input in 16-bit units
6724 * @return maximal number of binary bytes
6725 */
6726 simdutf_warn_unused size_t maximal_binary_length_from_base64(
6727 const char16_t *input, size_t length) const noexcept;
6728
6729 /**
6730 * Compute the binary length from a base64 input with ASCII spaces.
6731 * This function is useful for well-formed base64 inputs that may contain
6732 * ASCII spaces (such as line breaks). For such inputs, the result is exact.
6733 *
6734 * The function counts non-whitespace characters (ASCII value > 0x20) and
6735 * subtracts padding characters ('=') found at the end.
6736 *
6737 * @param input the base64 input to process
6738 * @param length the length of the base64 input in bytes
6739 * @return number of binary bytes
6740 */
6741 simdutf_warn_unused virtual size_t
6742 binary_length_from_base64(const char *input, size_t length) const noexcept;
6743
6744 /**
6745 * Compute the binary length from a base64 input with ASCII spaces.
6746 * This function is useful for well-formed base64 inputs that may contain
6747 * ASCII spaces (such as line breaks). For such inputs, the result is exact.
6748 *
6749 * The function counts non-whitespace characters (ASCII value > 0x20) and
6750 * subtracts padding characters ('=') found at the end.
6751 *
6752 * @param input the base64 input to process, in ASCII stored as 16-bit
6753 * units
6754 * @param length the length of the base64 input in 16-bit units
6755 * @return number of binary bytes
6756 */
6757 simdutf_warn_unused virtual size_t
6758 binary_length_from_base64(const char16_t *input,
6759 size_t length) const noexcept;
6760
6761 /**
6762 * Convert a base64 input to a binary output.
6763 *
6764 * This function follows the WHATWG forgiving-base64 format, which means that
6765 * it will ignore any ASCII spaces in the input. You may provide a padded
6766 * input (with one or two equal signs at the end) or an unpadded input
6767 * (without any equal signs at the end).
6768 *
6769 * See https://infra.spec.whatwg.org/#forgiving-base64-decode
6770 *
6771 * This function will fail in case of invalid input. When last_chunk_options =
6772 * loose, there are two possible reasons for failure: the input contains a
6773 * number of base64 characters that when divided by 4, leaves a single
6774 * remainder character (BASE64_INPUT_REMAINDER), or the input contains a
6775 * character that is not a valid base64 character (INVALID_BASE64_CHARACTER).
6776 *
6777 * You should call this function with a buffer that is at least
6778 * maximal_binary_length_from_base64(input, length) bytes long. If you fail to
6779 * provide that much space, the function may cause a buffer overflow.
6780 *
6781 * @param input the base64 string to process
6782 * @param length the length of the string in bytes
6783 * @param output the pointer to a buffer that can hold the conversion
6784 * result (should be at least maximal_binary_length_from_base64(input, length)
6785 * bytes long).
6786 * @param options the base64 options to use, can be base64_default or
6787 * base64_url, is base64_default by default.
6788 * @param last_chunk_options the handling of the last chunk (default: loose)
6789 * @return a result pair struct (of type simdutf::result containing the two
6790 * fields error and count) with an error code and either position of the error
6791 * (in the input in bytes) if any, or the number of bytes written if
6792 * successful.
6793 */
6794 simdutf_warn_unused virtual result
6795 base64_to_binary(const char *input, size_t length, char *output,
6796 base64_options options = base64_default,
6797 last_chunk_handling_options last_chunk_options =
6798 last_chunk_handling_options::loose) const noexcept = 0;
6799
6800 /**
6801 * Convert a base64 input to a binary output while returning more details
6802 * than base64_to_binary.
6803 *
6804 * This function follows the WHATWG forgiving-base64 format, which means that
6805 * it will ignore any ASCII spaces in the input. You may provide a padded
6806 * input (with one or two equal signs at the end) or an unpadded input
6807 * (without any equal signs at the end).
6808 *
6809 * See https://infra.spec.whatwg.org/#forgiving-base64-decode
6810 *
6811 * This function will fail in case of invalid input. When last_chunk_options =
6812 * loose, there are two possible reasons for failure: the input contains a
6813 * number of base64 characters that when divided by 4, leaves a single
6814 * remainder character (BASE64_INPUT_REMAINDER), or the input contains a
6815 * character that is not a valid base64 character (INVALID_BASE64_CHARACTER).
6816 *
6817 * You should call this function with a buffer that is at least
6818 * maximal_binary_length_from_base64(input, length) bytes long. If you fail to
6819 * provide that much space, the function may cause a buffer overflow.
6820 *
6821 * @param input the base64 string to process
6822 * @param length the length of the string in bytes
6823 * @param output the pointer to a buffer that can hold the conversion
6824 * result (should be at least maximal_binary_length_from_base64(input, length)
6825 * bytes long).
6826 * @param options the base64 options to use, can be base64_default or
6827 * base64_url, is base64_default by default.
6828 * @param last_chunk_options the handling of the last chunk (default: loose)
6829 * @return a full_result struct (of type simdutf::full_result containing the
6830 * three fields error, input_count and output_count).
6831 */
6832 simdutf_warn_unused virtual full_result base64_to_binary_details(
6833 const char *input, size_t length, char *output,
6834 base64_options options = base64_default,
6835 last_chunk_handling_options last_chunk_options =
6836 last_chunk_handling_options::loose) const noexcept = 0;
6837
6838 /**
6839 * Convert a base64 input to a binary output.
6840 *
6841 * This function follows the WHATWG forgiving-base64 format, which means that
6842 * it will ignore any ASCII spaces in the input. You may provide a padded
6843 * input (with one or two equal signs at the end) or an unpadded input
6844 * (without any equal signs at the end).
6845 *
6846 * See https://infra.spec.whatwg.org/#forgiving-base64-decode
6847 *
6848 * This function will fail in case of invalid input. When last_chunk_options =
6849 * loose, there are two possible reasons for failure: the input contains a
6850 * number of base64 characters that when divided by 4, leaves a single
6851 * remainder character (BASE64_INPUT_REMAINDER), or the input contains a
6852 * character that is not a valid base64 character (INVALID_BASE64_CHARACTER).
6853 *
6854 * You should call this function with a buffer that is at least
6855 * maximal_binary_length_from_base64(input, length) bytes long. If you
6856 * fail to provide that much space, the function may cause a buffer overflow.
6857 *
6858 * @param input the base64 string to process, in ASCII stored as
6859 * 16-bit units
6860 * @param length the length of the string in 16-bit units
6861 * @param output the pointer to a buffer that can hold the conversion
6862 * result (should be at least maximal_binary_length_from_base64(input, length)
6863 * bytes long).
6864 * @param options the base64 options to use, can be base64_default or
6865 * base64_url, is base64_default by default.
6866 * @param last_chunk_options the handling of the last chunk (default: loose)
6867 * @return a result pair struct (of type simdutf::result containing the two
6868 * fields error and count) with an error code and position of the
6869 * INVALID_BASE64_CHARACTER error (in the input in units) if any, or the
6870 * number of bytes written if successful.
6871 */
6872 simdutf_warn_unused virtual result
6873 base64_to_binary(const char16_t *input, size_t length, char *output,
6874 base64_options options = base64_default,
6875 last_chunk_handling_options last_chunk_options =
6876 last_chunk_handling_options::loose) const noexcept = 0;
6877
6878 /**
6879 * Convert a base64 input to a binary output while returning more details
6880 * than base64_to_binary.
6881 *
6882 * This function follows the WHATWG forgiving-base64 format, which means that
6883 * it will ignore any ASCII spaces in the input. You may provide a padded
6884 * input (with one or two equal signs at the end) or an unpadded input
6885 * (without any equal signs at the end).
6886 *
6887 * See https://infra.spec.whatwg.org/#forgiving-base64-decode
6888 *
6889 * This function will fail in case of invalid input. When last_chunk_options =
6890 * loose, there are two possible reasons for failure: the input contains a
6891 * number of base64 characters that when divided by 4, leaves a single
6892 * remainder character (BASE64_INPUT_REMAINDER), or the input contains a
6893 * character that is not a valid base64 character (INVALID_BASE64_CHARACTER).
6894 *
6895 * You should call this function with a buffer that is at least
6896 * maximal_binary_length_from_base64(input, length) bytes long. If you fail to
6897 * provide that much space, the function may cause a buffer overflow.
6898 *
6899 * @param input the base64 string to process (ASCII in 16-bit units)
6900 * @param length the length of the string in 16-bit units
6901 * @param output the pointer to a buffer that can hold the conversion
6902 * result (should be at least maximal_binary_length_from_base64(input, length)
6903 * bytes long).
6904 * @param options the base64 options to use, can be base64_default or
6905 * base64_url, is base64_default by default.
6906 * @param last_chunk_options the handling of the last chunk (default: loose)
6907 * @return a full_result struct (of type simdutf::full_result containing the
6908 * three fields error, input_count and output_count).
6909 */
6910 simdutf_warn_unused virtual full_result base64_to_binary_details(
6911 const char16_t *input, size_t length, char *output,
6912 base64_options options = base64_default,
6913 last_chunk_handling_options last_chunk_options =
6914 last_chunk_handling_options::loose) const noexcept = 0;
6915
6916 /**
6917 * Provide the base64 length in bytes given the length of a binary input.
6918 *
6919 * @param length the length of the input in bytes
6920 * @param options the base64 options to use, can be base64_default or
6921 * base64_url, is base64_default by default.
6922 * @return number of base64 bytes
6923 */
6924 simdutf_warn_unused size_t base64_length_from_binary(
6925 size_t length, base64_options options = base64_default) const noexcept;
6926
6927 /**
6928 * Convert a binary input to a base64 output.
6929 *
6930 * The default option (simdutf::base64_default) uses the characters `+` and
6931 * `/` as part of its alphabet. Further, it adds padding (`=`) at the end of
6932 * the output to ensure that the output length is a multiple of four.
6933 *
6934 * The URL option (simdutf::base64_url) uses the characters `-` and `_` as
6935 * part of its alphabet. No padding is added at the end of the output.
6936 *
6937 * This function always succeeds.
6938 *
6939 * @param input the binary to process
6940 * @param length the length of the input in bytes
6941 * @param output the pointer to a buffer that can hold the conversion
6942 * result (should be at least base64_length_from_binary(length) bytes long)
6943 * @param options the base64 options to use, can be base64_default or
6944 * base64_url, is base64_default by default.
6945 * @return number of written bytes, will be equal to
6946 * base64_length_from_binary(length, options)
6947 */
6948 virtual size_t
6949 binary_to_base64(const char *input, size_t length, char *output,
6950 base64_options options = base64_default) const noexcept = 0;
6951
6952 /**
6953 * Convert a binary input to a base64 output with lines of given length.
6954 * Lines are separated by a single linefeed character.
6955 *
6956 * The default option (simdutf::base64_default) uses the characters `+` and
6957 * `/` as part of its alphabet. Further, it adds padding (`=`) at the end of
6958 * the output to ensure that the output length is a multiple of four.
6959 *
6960 * The URL option (simdutf::base64_url) uses the characters `-` and `_` as
6961 * part of its alphabet. No padding is added at the end of the output.
6962 *
6963 * This function always succeeds.
6964 *
6965 * @param input the binary to process
6966 * @param length the length of the input in bytes
6967 * @param output the pointer to a buffer that can hold the conversion
6968 * result (should be at least base64_length_from_binary_with_lines(length,
6969 * options, line_length) bytes long)
6970 * @param line_length the length of each line, values smaller than 4 are
6971 * interpreted as 4
6972 * @param options the base64 options to use, can be base64_default or
6973 * base64_url, is base64_default by default.
6974 * @return number of written bytes, will be equal to
6975 * base64_length_from_binary_with_lines(length, options, line_length)
6976 */
6977 virtual size_t binary_to_base64_with_lines(
6978 const char *input, size_t length, char *output,
6979 size_t line_length = simdutf::default_line_length,
6980 base64_options options = base64_default) const noexcept = 0;
6981
6982 /**
6983 * Find the first occurrence of a character in a string. If the character is
6984 * not found, return a pointer to the end of the string.
6985 * @param start the start of the string
6986 * @param end the end of the string
6987 * @param character the character to find
6988 * @return a pointer to the first occurrence of the character in the string,
6989 * or a pointer to the end of the string if the character is not found.
6990 * NOTE(review): the char16_t overload below presumably shares this contract.
6991 */
6992 virtual const char *find(const char *start, const char *end,
6993 char character) const noexcept = 0;
6994 virtual const char16_t *find(const char16_t *start, const char16_t *end,
6995 char16_t character) const noexcept = 0;
6996#endif // SIMDUTF_FEATURE_BASE64
6997
6998#ifdef SIMDUTF_INTERNAL_TESTS
6999 // This method is exported only in developer mode, its purpose
7000 // is to expose some internal test procedures from the given
7001 // implementation and then use them through our standard test
7002 // framework.
7003 //
7004 // Regular users should not use it, the tests of the public
7005 // API are enough.
7006
7007 struct TestProcedure {
7008 // display name
7009 std::string_view name;
7010
7011 // procedure to run against the given implementation (it returns nothing)
7012 void (*procedure)(const implementation &);
7013 };
7014
7015 virtual std::vector<TestProcedure> internal_tests() const;
7016#endif
7017
7018protected:
7019 /** @private Construct an implementation with the given name and description.
7020 * For subclasses.
7021 * @param name the name of this implementation
7022 * @param description a description of this implementation
7023 * @param required_instruction_sets the instruction sets this implementation
7024 * requires
7025 */
7026 simdutf_really_inline implementation(const char *name,
7027 const char *description,
7028 uint32_t required_instruction_sets)
7029 : _name(name), _description(description),
7030 _required_instruction_sets(required_instruction_sets) {}
7031
7032protected:
7033 ~implementation() = default;
7034
7035private:
7036 /**
7037 * The name of this implementation.
7038 */
7039 const char *_name;
7040
7041 /**
7042 * The description of this implementation.
7043 */
7044 const char *_description;
7045
7046 /**
7047 * Instruction sets required for this implementation.
7048 */
7049 const uint32_t _required_instruction_sets;
7050};
7051
7052/** @private */
7053namespace internal {
7054
7055/**
7056 * The list of available implementations compiled into simdutf.
7057 */
7058class available_implementation_list {
7059public:
7060 /** Get the list of available implementations compiled into simdutf */
7061 simdutf_really_inline available_implementation_list() {}
7062 /** Number of implementations */
7063 size_t size() const noexcept;
7064 /** STL const begin() iterator */
7065 const implementation *const *begin() const noexcept;
7066 /** STL const end() iterator */
7067 const implementation *const *end() const noexcept;
7068
7069 /**
7070 * Get the implementation with the given name.
7071 *
7072 * Case sensitive.
7073 *
7074 * const implementation *impl =
7075 * simdutf::available_implementations["westmere"]; if (!impl) { exit(1); } if
7076 * (!imp->supported_by_runtime_system()) { exit(1); }
7077 * simdutf::active_implementation = impl;
7078 *
7079 * @param name the implementation to find, e.g. "westmere", "haswell", "arm64"
7080 * @return the implementation, or nullptr if the parse failed.
7081 */
7082 const implementation *operator[](std::string_view name) const noexcept {
7083 for (const implementation *impl : *this) {
7084 if (impl->name() == name) {
7085 return impl;
7086 }
7087 }
7088 return nullptr;
7089 }
7090
7091 /**
7092 * Detect the most advanced implementation supported by the current host.
7093 *
7094 * This is used to initialize the implementation on startup.
7095 *
7096 * const implementation *impl =
7097 * simdutf::available_implementation::detect_best_supported();
7098 * simdutf::active_implementation = impl;
7099 *
7100 * @return the most advanced supported implementation for the current host, or
7101 * an implementation that returns UNSUPPORTED_ARCHITECTURE if there is no
7102 * supported implementation. Will never return nullptr.
7103 */
7104 const implementation *detect_best_supported() const noexcept;
7105};
7106
/**
 * Pointer wrapper with pointer-like access (conversion, `*`, `->`) and
 * assignment from a raw pointer.
 *
 * The pointer is stored in a std::atomic<T *> and read/written with the
 * default memory ordering, unless SIMDUTF_NO_THREADS is defined, in which
 * case a plain T * is used.
 */
template <typename T> class atomic_ptr {
public:
  /** Wrap the given raw pointer. */
  atomic_ptr(T *_ptr) : ptr{_ptr} {}

  // Read-only access.
  operator const T *() const { return get(); }
  const T &operator*() const { return *get(); }
  const T *operator->() const { return get(); }

  // Mutable access.
  operator T *() { return get(); }
  T &operator*() { return *get(); }
  T *operator->() { return get(); }

  /** Replace the stored pointer. */
  atomic_ptr &operator=(T *_ptr) {
    set(_ptr);
    return *this;
  }

private:
#if defined(SIMDUTF_NO_THREADS)
  // Single-threaded build: plain pointer storage.
  T *get() const { return ptr; }
  void set(T *value) { ptr = value; }

  T *ptr;
#else
  // Threaded build: every read is an atomic load, every write an atomic store.
  T *get() const { return ptr.load(); }
  void set(T *value) { ptr.store(value); }

  std::atomic<T *> ptr;
#endif
};
7146
7147class detect_best_supported_implementation_on_first_use;
7148
7149} // namespace internal
7150
7151/**
7152 * The list of available implementations compiled into simdutf.
7153 */
7154extern SIMDUTF_DLLIMPORTEXPORT const internal::available_implementation_list &
7155get_available_implementations();
7156
7157/**
7158 * The active implementation.
7159 *
7160 * Automatically initialized on first use to the most advanced implementation
7161 * supported by this hardware.
7162 */
7163extern SIMDUTF_DLLIMPORTEXPORT internal::atomic_ptr<const implementation> &
7164get_active_implementation();
7165
7166} // namespace simdutf
7167
7168#if SIMDUTF_FEATURE_BASE64
7169 // this header is not part of the public api
7170 #include <simdutf/base64_implementation.h>
7171
7172namespace simdutf {
7173 #if SIMDUTF_SPAN
7174/**
7175 * @brief span overload
7176 * @return a tuple of result and outlen
7177 */
7178simdutf_really_inline
7179 simdutf_constexpr23 simdutf_warn_unused std::tuple<result, std::size_t>
7180 base64_to_binary_safe(
7181 const detail::input_span_of_byte_like auto &input,
7182 detail::output_span_of_byte_like auto &&binary_output,
7183 base64_options options = base64_default,
7184 last_chunk_handling_options last_chunk_options = loose,
7185 bool decode_up_to_bad_char = false) noexcept {
7186 size_t outlen = binary_output.size();
7187 #if SIMDUTF_CPLUSPLUS23
7188 if consteval {
7189 using CInput = std::decay_t<decltype(*input.data())>;
7190 static_assert(std::is_same_v<CInput, char>,
7191 "sorry, the constexpr implementation is for now limited to "
7192 "input of type char");
7193 using COutput = std::decay_t<decltype(*binary_output.data())>;
7194 static_assert(std::is_same_v<COutput, char>,
7195 "sorry, the constexpr implementation is for now limited to "
7196 "output of type char");
7197 auto r = base64_to_binary_safe_impl(
7198 input.data(), input.size(), binary_output.data(), outlen, options,
7199 last_chunk_options, decode_up_to_bad_char);
7200 return {r, outlen};
7201 } else
7202 #endif
7203 {
7204 auto r = base64_to_binary_safe_impl<char>(
7205 reinterpret_cast<const char *>(input.data()), input.size(),
7206 reinterpret_cast<char *>(binary_output.data()), outlen, options,
7207 last_chunk_options, decode_up_to_bad_char);
7208 return {r, outlen};
7209 }
7210}
7211
7212 #if SIMDUTF_SPAN
7213/**
7214 * @brief span overload
7215 * @return a tuple of result and outlen
7216 */
7217simdutf_really_inline
7218 simdutf_warn_unused simdutf_constexpr23 std::tuple<result, std::size_t>
7219 base64_to_binary_safe(
7220 std::span<const char16_t> input,
7221 detail::output_span_of_byte_like auto &&binary_output,
7222 base64_options options = base64_default,
7223 last_chunk_handling_options last_chunk_options = loose,
7224 bool decode_up_to_bad_char = false) noexcept {
7225 size_t outlen = binary_output.size();
7226 #if SIMDUTF_CPLUSPLUS23
7227 if consteval {
7228 auto r = base64_to_binary_safe_impl(
7229 input.data(), input.size(), binary_output.data(), outlen, options,
7230 last_chunk_options, decode_up_to_bad_char);
7231 return {r, outlen};
7232 } else
7233 #endif
7234 {
7235 auto r = base64_to_binary_safe(
7236 input.data(), input.size(),
7237 reinterpret_cast<char *>(binary_output.data()), outlen, options,
7238 last_chunk_options, decode_up_to_bad_char);
7239 return {r, outlen};
7240 }
7241}
7242 #endif // SIMDUTF_SPAN
7243
7244 #endif // SIMDUTF_SPAN
7245} // namespace simdutf
7246
7247#endif // SIMDUTF_FEATURE_BASE64
7248
7249#if SIMDUTF_CPLUSPLUS23 && SIMDUTF_FEATURE_BASE64
7250
7251namespace simdutf {
7252namespace literals {
7253
7254namespace detail {
7255
7256// the detail namespace is not part of the public api
7257
7258template <std::size_t N> struct base64_literal_helper {
7259 std::array<char, N - 1> storage{};
7260 static constexpr std::size_t size() noexcept { return N - 1; }
7261 consteval base64_literal_helper(const char (&str)[N]) {
7262 for (std::size_t i = 0; i < size(); i++) {
7263 storage[i] = str[i];
7264 }
7265 }
7266};
7267
/**
 * Output of a compile-time base64 decode of InputLen input characters: a
 * buffer of max_out bytes (three bytes for every started group of four input
 * characters) and the number of bytes recorded as produced.
 */
template <std::size_t InputLen> struct base64_decode_result {
  // Groups of four input characters, rounded up, times three output bytes.
  static constexpr std::size_t max_out = ((InputLen + 3) / 4) * 3;
  std::array<char, max_out> buffer = {};
  std::size_t output_count = 0;
};
7273
7274template <std::size_t InputLen>
7275consteval auto base64_decode_literal(const char *str) {
7276 base64_decode_result<InputLen> result{};
7277 auto r = scalar::base64::base64_to_binary_details_impl(
7278 str, InputLen, result.buffer.data(), base64_default, loose);
7279 if (r.error != error_code::SUCCESS) {
7280 #if __cpp_lib_unreachable >= 202202L
7281 std::unreachable(); // invalid base64 input in _base64 literal
7282 #else
7283 // workaround for older stdlib
7284 throw "invalid base64 input in _base64 literal";
7285 #endif
7286 }
7287 result.output_count = r.output_count;
7288 return result;
7289}
7290
7291template <base64_literal_helper a> consteval auto base64_make_array() {
7292 constexpr auto decoded = base64_decode_literal<a.size()>(a.storage.data());
7293 std::array<char, decoded.output_count> ret{};
7294 for (std::size_t i = 0; i < decoded.output_count; i++) {
7295 ret[i] = decoded.buffer[i];
7296 }
7297 return ret;
7298}
7299
7300} // namespace detail
7301
7302/**
7303 * User-defined literal for compile-time base64 decoding.
7304 *
7305 * Usage:
7306 * using namespace simdutf::literals;
7307 * constexpr auto decoded = "SGVsbG8gV29ybGQh"_base64;
7308 * // decoded is a std::array<char, 12> containing "Hello World!"
7309 *
7310 * The input must be valid base64. Whitepace is allowed and ignored.
7311 * A compilation error occurs if the input is invalid.
7312 */
7313template <detail::base64_literal_helper a> consteval auto operator""_base64() {
7314 return detail::base64_make_array<a>();
7315}
7316
7317} // namespace literals
7318} // namespace simdutf
7319
7320#endif // SIMDUTF_CPLUSPLUS23 && SIMDUTF_FEATURE_BASE64
7321
7322#endif // SIMDUTF_IMPLEMENTATION_H
An implementation of simdutf for a particular CPU architecture.
virtual simdutf_warn_unused size_t convert_valid_utf32_to_latin1(const char32_t *input, size_t length, char *latin1_buffer) const noexcept=0
Convert valid UTF-32 string into Latin1 string.
virtual simdutf_warn_unused size_t binary_length_from_base64(const char *input, size_t length) const noexcept
Compute the binary length from a base64 input with ASCII spaces.
virtual const char * find(const char *start, const char *end, char character) const noexcept=0
Find the first occurrence of a character in a string.
virtual simdutf_warn_unused size_t convert_valid_utf8_to_utf16le(const char *input, size_t length, char16_t *utf16_buffer) const noexcept=0
Convert valid UTF-8 string into UTF-16LE string.
virtual simdutf_warn_unused size_t latin1_length_from_utf16(size_t length) const noexcept
Compute the number of bytes that this UTF-16LE/BE string would require in Latin1 format.
virtual simdutf_warn_unused size_t convert_utf16le_to_utf8_with_replacement(const char16_t *input, size_t length, char *utf8_buffer) const noexcept=0
Convert possibly broken UTF-16LE string into UTF-8 string, replacing unpaired surrogates with the Uni...
virtual simdutf_warn_unused size_t binary_length_from_base64(const char16_t *input, size_t length) const noexcept
Compute the binary length from a base64 input with ASCII spaces.
virtual simdutf_warn_unused size_t utf32_length_from_utf16be(const char16_t *input, size_t length) const noexcept=0
Compute the number of 4-byte code units that this UTF-16BE string would require in UTF-32 format.
virtual simdutf_warn_unused size_t convert_utf8_to_utf16be(const char *input, size_t length, char16_t *utf16_output) const noexcept=0
Convert possibly broken UTF-8 string into UTF-16BE string.
virtual simdutf_warn_unused size_t convert_utf16le_to_utf32(const char16_t *input, size_t length, char32_t *utf32_buffer) const noexcept=0
Convert possibly broken UTF-16LE string into UTF-32 string.
virtual simdutf_warn_unused size_t convert_valid_utf16be_to_utf32(const char16_t *input, size_t length, char32_t *utf32_buffer) const noexcept=0
Convert valid UTF-16BE string into UTF-32 string.
virtual std::string_view name() const noexcept
The name of this implementation.
virtual simdutf_warn_unused size_t convert_valid_utf16be_to_utf8(const char16_t *input, size_t length, char *utf8_buffer) const noexcept=0
Convert valid UTF-16BE string into UTF-8 string.
virtual simdutf_warn_unused size_t convert_latin1_to_utf16le(const char *input, size_t length, char16_t *utf16_output) const noexcept=0
Convert Latin1 string into UTF-16LE string.
virtual simdutf_warn_unused bool validate_utf16be(const char16_t *buf, size_t len) const noexcept=0
Validate the UTF-16BE string.
virtual simdutf_warn_unused result convert_utf16be_to_latin1_with_errors(const char16_t *input, size_t length, char *latin1_buffer) const noexcept=0
Convert possibly broken UTF-16BE string into Latin1 string.
virtual simdutf_warn_unused full_result base64_to_binary_details(const char *input, size_t length, char *output, base64_options options=base64_default, last_chunk_handling_options last_chunk_options=last_chunk_handling_options::loose) const noexcept=0
Convert a base64 input to a binary output while returning more details than base64_to_binary.
virtual simdutf_warn_unused size_t convert_valid_utf16le_to_utf32(const char16_t *input, size_t length, char32_t *utf32_buffer) const noexcept=0
Convert valid UTF-16LE string into UTF-32 string.
virtual simdutf_warn_unused size_t latin1_length_from_utf8(const char *input, size_t length) const noexcept=0
Compute the number of bytes that this UTF-8 string would require in Latin1 format.
virtual simdutf_warn_unused result convert_utf32_to_latin1_with_errors(const char32_t *input, size_t length, char *latin1_buffer) const noexcept=0
Convert possibly broken UTF-32 string into Latin1 string and stop on error.
virtual simdutf_warn_unused size_t utf8_length_from_utf32(const char32_t *input, size_t length) const noexcept=0
Compute the number of bytes that this UTF-32 string would require in UTF-8 format.
virtual simdutf_warn_unused size_t convert_latin1_to_utf16be(const char *input, size_t length, char16_t *utf16_output) const noexcept=0
Convert Latin1 string into UTF-16BE string.
virtual simdutf_warn_unused size_t latin1_length_from_utf32(size_t length) const noexcept
Compute the number of bytes that this UTF-32 string would require in Latin1 format.
virtual simdutf_warn_unused full_result base64_to_binary_details(const char16_t *input, size_t length, char *output, base64_options options=base64_default, last_chunk_handling_options last_chunk_options=last_chunk_handling_options::loose) const noexcept=0
Convert a base64 input to a binary output while returning more details than base64_to_binary.
virtual simdutf_warn_unused size_t convert_utf32_to_utf16le(const char32_t *input, size_t length, char16_t *utf16_buffer) const noexcept=0
Convert possibly broken UTF-32 string into UTF-16LE string.
virtual simdutf_warn_unused size_t utf16_length_from_utf32(const char32_t *input, size_t length) const noexcept=0
Compute the number of two-byte code units that this UTF-32 string would require in UTF-16 format.
virtual simdutf_warn_unused size_t count_utf8(const char *input, size_t length) const noexcept=0
Count the number of code points (characters) in the string assuming that it is valid.
virtual simdutf_warn_unused size_t convert_valid_utf32_to_utf16be(const char32_t *input, size_t length, char16_t *utf16_buffer) const noexcept=0
Convert valid UTF-32 string into UTF-16BE string.
virtual simdutf_warn_unused size_t utf32_length_from_utf8(const char *input, size_t length) const noexcept=0
Compute the number of 4-byte code units that this UTF-8 string would require in UTF-32 format.
virtual simdutf_warn_unused size_t convert_valid_utf16le_to_utf8(const char16_t *input, size_t length, char *utf8_buffer) const noexcept=0
Convert valid UTF-16LE string into UTF-8 string.
virtual void to_well_formed_utf16le(const char16_t *input, size_t len, char16_t *output) const noexcept=0
Copies the UTF-16LE string while replacing mismatched surrogates with the Unicode replacement charact...
virtual simdutf_warn_unused result convert_utf32_to_utf16be_with_errors(const char32_t *input, size_t length, char16_t *utf16_buffer) const noexcept=0
Convert possibly broken UTF-32 string into UTF-16BE string and stop on error.
virtual simdutf_warn_unused result validate_utf8_with_errors(const char *buf, size_t len) const noexcept=0
Validate the UTF-8 string and stop on errors.
virtual simdutf_warn_unused size_t utf8_length_from_utf16le(const char16_t *input, size_t length) const noexcept=0
Compute the number of bytes that this UTF-16LE string would require in UTF-8 format.
virtual size_t binary_to_base64_with_lines(const char *input, size_t length, char *output, size_t line_length=simdutf::default_line_length, base64_options options=base64_default) const noexcept=0
Convert a binary input to a base64 output with lines of given length.
virtual simdutf_warn_unused size_t convert_utf16be_to_utf8(const char16_t *input, size_t length, char *utf8_buffer) const noexcept=0
Convert possibly broken UTF-16BE string into UTF-8 string.
virtual simdutf_warn_unused size_t convert_utf8_to_utf16le(const char *input, size_t length, char16_t *utf16_output) const noexcept=0
Convert possibly broken UTF-8 string into UTF-16LE string.
virtual simdutf_warn_unused result convert_utf16le_to_latin1_with_errors(const char16_t *input, size_t length, char *latin1_buffer) const noexcept=0
Convert possibly broken UTF-16LE string into Latin1 string.
virtual simdutf_warn_unused result utf8_length_from_utf16le_with_replacement(const char16_t *input, size_t length) const noexcept=0
Compute the number of bytes that this UTF-16LE string would require in UTF-8 format even when the UTF...
virtual simdutf_warn_unused result validate_utf16le_with_errors(const char16_t *buf, size_t len) const noexcept=0
Validate the UTF-16LE string and stop on error.
virtual simdutf_warn_unused result convert_utf16be_to_utf8_with_errors(const char16_t *input, size_t length, char *utf8_buffer) const noexcept=0
Convert possibly broken UTF-16BE string into UTF-8 string and stop on error.
virtual size_t binary_to_base64(const char *input, size_t length, char *output, base64_options options=base64_default) const noexcept=0
Convert a binary input to a base64 output.
virtual simdutf_warn_unused result utf8_length_from_utf16be_with_replacement(const char16_t *input, size_t length) const noexcept=0
Compute the number of bytes that this UTF-16BE string would require in UTF-8 format even when the UTF...
virtual simdutf_warn_unused size_t convert_latin1_to_utf32(const char *input, size_t length, char32_t *utf32_buffer) const noexcept=0
Convert Latin1 string into UTF-32 string.
virtual simdutf_warn_unused result convert_utf8_to_utf32_with_errors(const char *input, size_t length, char32_t *utf32_output) const noexcept=0
Convert possibly broken UTF-8 string into UTF-32 string and stop on error.
virtual simdutf_warn_unused size_t convert_valid_utf32_to_utf8(const char32_t *input, size_t length, char *utf8_buffer) const noexcept=0
Convert valid UTF-32 string into UTF-8 string.
virtual simdutf_warn_unused size_t utf32_length_from_latin1(size_t length) const noexcept
Return the number of 4-byte code units that this Latin1 string would require in UTF-32 format.
virtual std::string_view description() const noexcept
The description of this implementation.
virtual simdutf_warn_unused bool validate_utf16le(const char16_t *buf, size_t len) const noexcept=0
Validate the UTF-16LE string. This function may be best when you expect the input to be almost always ...
virtual simdutf_warn_unused result convert_utf8_to_utf16le_with_errors(const char *input, size_t length, char16_t *utf16_output) const noexcept=0
Convert possibly broken UTF-8 string into UTF-16LE string and stop on error.
virtual simdutf_warn_unused size_t count_utf16le(const char16_t *input, size_t length) const noexcept=0
Count the number of code points (characters) in the string assuming that it is valid.
virtual simdutf_warn_unused size_t convert_utf16be_to_utf32(const char16_t *input, size_t length, char32_t *utf32_buffer) const noexcept=0
Convert possibly broken UTF-16BE string into UTF-32 string.
virtual simdutf_warn_unused size_t convert_valid_utf16le_to_latin1(const char16_t *input, size_t length, char *latin1_buffer) const noexcept=0
Convert valid UTF-16LE string into Latin1 string.
virtual simdutf_warn_unused size_t convert_utf32_to_latin1(const char32_t *input, size_t length, char *latin1_buffer) const noexcept=0
Convert possibly broken UTF-32 string into Latin1 string.
virtual simdutf_warn_unused size_t convert_valid_utf16be_to_latin1(const char16_t *input, size_t length, char *latin1_buffer) const noexcept=0
Convert valid UTF-16BE string into Latin1 string.
bool supported_by_runtime_system() const
The instruction sets this implementation is compiled against and the current CPU match.
virtual simdutf_warn_unused size_t convert_valid_utf8_to_utf32(const char *input, size_t length, char32_t *utf32_buffer) const noexcept=0
Convert valid UTF-8 string into UTF-32 string.
virtual simdutf_warn_unused size_t convert_utf16le_to_latin1(const char16_t *input, size_t length, char *latin1_buffer) const noexcept=0
Convert possibly broken UTF-16LE string into Latin1 string.
virtual simdutf_warn_unused result validate_utf16be_with_errors(const char16_t *buf, size_t len) const noexcept=0
Validate the UTF-16BE string and stop on error.
virtual simdutf_warn_unused result convert_utf16le_to_utf8_with_errors(const char16_t *input, size_t length, char *utf8_buffer) const noexcept=0
Convert possibly broken UTF-16LE string into UTF-8 string and stop on error.
virtual simdutf_warn_unused result convert_utf16be_to_utf32_with_errors(const char16_t *input, size_t length, char32_t *utf32_buffer) const noexcept=0
Convert possibly broken UTF-16BE string into UTF-32 string and stop on error.
virtual simdutf_warn_unused result base64_to_binary(const char *input, size_t length, char *output, base64_options options=base64_default, last_chunk_handling_options last_chunk_options=last_chunk_handling_options::loose) const noexcept=0
Convert a base64 input to a binary output.
virtual simdutf_warn_unused bool validate_utf16be_as_ascii(const char16_t *buf, size_t len) const noexcept=0
Validate the ASCII string as a UTF-16BE sequence.
virtual simdutf_warn_unused size_t convert_utf32_to_utf16be(const char32_t *input, size_t length, char16_t *utf16_buffer) const noexcept=0
Convert possibly broken UTF-32 string into UTF-16BE string.
virtual simdutf_warn_unused result validate_utf32_with_errors(const char32_t *buf, size_t len) const noexcept=0
Validate the UTF-32 string and stop on error.
virtual simdutf_warn_unused size_t utf32_length_from_utf16le(const char16_t *input, size_t length) const noexcept=0
Compute the number of 4-byte code units that this UTF-16LE string would require in UTF-32 format.
virtual void change_endianness_utf16(const char16_t *input, size_t length, char16_t *output) const noexcept=0
Change the endianness of the input.
virtual simdutf_warn_unused size_t convert_valid_utf8_to_utf16be(const char *input, size_t length, char16_t *utf16_buffer) const noexcept=0
Convert valid UTF-8 string into UTF-16BE string.
virtual simdutf_warn_unused size_t convert_valid_utf8_to_latin1(const char *input, size_t length, char *latin1_output) const noexcept=0
Convert valid UTF-8 string into latin1 string.
simdutf_warn_unused size_t maximal_binary_length_from_base64(const char *input, size_t length) const noexcept
Provide the maximal binary length in bytes given the base64 input.
virtual simdutf_warn_unused size_t convert_valid_utf32_to_utf16le(const char32_t *input, size_t length, char16_t *utf16_buffer) const noexcept=0
Convert valid UTF-32 string into UTF-16LE string.
virtual simdutf_warn_unused result validate_ascii_with_errors(const char *buf, size_t len) const noexcept=0
Validate the ASCII string and stop on error.
virtual simdutf_warn_unused bool validate_utf8(const char *buf, size_t len) const noexcept=0
Validate the UTF-8 string.
virtual simdutf_warn_unused size_t convert_utf8_to_utf32(const char *input, size_t length, char32_t *utf32_output) const noexcept=0
Convert possibly broken UTF-8 string into UTF-32 string.
simdutf_warn_unused size_t maximal_binary_length_from_base64(const char16_t *input, size_t length) const noexcept
Provide the maximal binary length in bytes given the base64 input.
virtual simdutf_warn_unused bool validate_utf32(const char32_t *buf, size_t len) const noexcept=0
Validate the UTF-32 string.
virtual simdutf_warn_unused result convert_utf16le_to_utf32_with_errors(const char16_t *input, size_t length, char32_t *utf32_buffer) const noexcept=0
Convert possibly broken UTF-16LE string into UTF-32 string and stop on error.
virtual simdutf_warn_unused size_t convert_utf16be_to_utf8_with_replacement(const char16_t *input, size_t length, char *utf8_buffer) const noexcept=0
Convert possibly broken UTF-16BE string into UTF-8 string, replacing unpaired surrogates with the Uni...
virtual simdutf_warn_unused size_t convert_utf16le_to_utf8(const char16_t *input, size_t length, char *utf8_buffer) const noexcept=0
Convert possibly broken UTF-16LE string into UTF-8 string.
virtual simdutf_warn_unused size_t utf8_length_from_utf16be(const char16_t *input, size_t length) const noexcept=0
Compute the number of bytes that this UTF-16BE string would require in UTF-8 format.
virtual simdutf_warn_unused size_t convert_utf32_to_utf8(const char32_t *input, size_t length, char *utf8_buffer) const noexcept=0
Convert possibly broken UTF-32 string into UTF-8 string.
virtual simdutf_warn_unused size_t convert_utf16be_to_latin1(const char16_t *input, size_t length, char *latin1_buffer) const noexcept=0
Convert possibly broken UTF-16BE string into Latin1 string.
virtual simdutf_warn_unused size_t convert_utf8_to_latin1(const char *input, size_t length, char *latin1_output) const noexcept=0
Convert possibly broken UTF-8 string into latin1 string.
virtual simdutf_warn_unused result base64_to_binary(const char16_t *input, size_t length, char *output, base64_options options=base64_default, last_chunk_handling_options last_chunk_options=last_chunk_handling_options::loose) const noexcept=0
Convert a base64 input to a binary output.
simdutf_warn_unused size_t base64_length_from_binary(size_t length, base64_options options=base64_default) const noexcept
Provide the base64 length in bytes given the length of a binary input.
virtual int detect_encodings(const char *input, size_t length) const noexcept=0
This function will try to detect the possible encodings in one pass.
virtual simdutf_warn_unused size_t count_utf16be(const char16_t *input, size_t length) const noexcept=0
Count the number of code points (characters) in the string assuming that it is valid.
virtual simdutf_warn_unused result convert_utf32_to_utf8_with_errors(const char32_t *input, size_t length, char *utf8_buffer) const noexcept=0
Convert possibly broken UTF-32 string into UTF-8 string and stop on error.
virtual simdutf_warn_unused bool validate_utf16le_as_ascii(const char16_t *buf, size_t len) const noexcept=0
Validate the ASCII string as a UTF-16LE sequence.
virtual void to_well_formed_utf16be(const char16_t *input, size_t len, char16_t *output) const noexcept=0
Copies the UTF-16BE string while replacing mismatched surrogates with the Unicode replacement charact...
virtual simdutf_warn_unused bool validate_ascii(const char *buf, size_t len) const noexcept=0
Validate the ASCII string.
virtual simdutf_warn_unused size_t utf16_length_from_utf8(const char *input, size_t length) const noexcept=0
Compute the number of 2-byte code units that this UTF-8 string would require in UTF-16LE format.
virtual simdutf_warn_unused result convert_utf8_to_utf16be_with_errors(const char *input, size_t length, char16_t *utf16_output) const noexcept=0
Convert possibly broken UTF-8 string into UTF-16BE string and stop on error.
virtual simdutf_warn_unused result convert_utf8_to_latin1_with_errors(const char *input, size_t length, char *latin1_output) const noexcept=0
Convert possibly broken UTF-8 string into latin1 string with errors.
virtual simdutf_warn_unused size_t utf16_length_from_latin1(size_t length) const noexcept
Return the number of 2-byte code units that this Latin1 string would require in UTF-16 format.
virtual simdutf_warn_unused result convert_utf32_to_utf16le_with_errors(const char32_t *input, size_t length, char16_t *utf16_buffer) const noexcept=0
Convert possibly broken UTF-32 string into UTF-16LE string and stop on error.
virtual simdutf_warn_unused size_t convert_latin1_to_utf8(const char *input, size_t length, char *utf8_output) const noexcept=0
Convert Latin1 string into UTF-8 string.
virtual encoding_type autodetect_encoding(const char *input, size_t length) const noexcept
This function will try to detect the encoding.
virtual simdutf_warn_unused size_t utf8_length_from_latin1(const char *input, size_t length) const noexcept=0
Return the number of bytes that this Latin1 string would require in UTF-8 format.