simdutf 9.0.0
Unicode at GB/s.
Loading...
Searching...
No Matches
base64.h
1#ifndef SIMDUTF_BASE64_H
2#define SIMDUTF_BASE64_H
3
4#include <cstddef>
5#include <cstdint>
6#include <cstring>
7
8namespace simdutf {
9namespace scalar {
10namespace {
11namespace base64 {
12
// Reports whether `c` is one of the five ASCII white-space characters:
// space, horizontal tab, line feed, carriage return or form feed.
// This helper is not tuned for speed: keep it out of long loops. In most
// instances is_ignorable is the function you want.
template <class char_type> bool is_ascii_white_space(char_type c) {
  switch (c) {
  case ' ':
  case '\t':
  case '\n':
  case '\r':
  case '\f':
    return true;
  default:
    return false;
  }
}
18
19template <class char_type> simdutf_constexpr23 bool is_eight_byte(char_type c) {
20 if constexpr (sizeof(char_type) == 1) {
21 return true;
22 }
23 return uint8_t(c) == c;
24}
25
26template <class char_type>
27simdutf_constexpr23 bool is_ignorable(char_type c,
28 simdutf::base64_options options) {
29 const uint8_t *to_base64 =
30 (options & base64_default_or_url)
31 ? tables::base64::to_base64_default_or_url_value
32 : ((options & base64_url) ? tables::base64::to_base64_url_value
33 : tables::base64::to_base64_value);
34 const bool ignore_garbage =
35 (options == base64_options::base64_url_accept_garbage) ||
36 (options == base64_options::base64_default_accept_garbage) ||
37 (options == base64_options::base64_default_or_url_accept_garbage);
38 uint8_t code = to_base64[uint8_t(c)];
39 if (is_eight_byte(c) && code <= 63) {
40 return false;
41 }
42 if (is_eight_byte(c) && code == 64) {
43 return true;
44 }
45 return ignore_garbage;
46}
47template <class char_type>
48simdutf_constexpr23 bool is_base64(char_type c,
49 simdutf::base64_options options) {
50 const uint8_t *to_base64 =
51 (options & base64_default_or_url)
52 ? tables::base64::to_base64_default_or_url_value
53 : ((options & base64_url) ? tables::base64::to_base64_url_value
54 : tables::base64::to_base64_value);
55 uint8_t code = to_base64[uint8_t(c)];
56 if (is_eight_byte(c) && code <= 63) {
57 return true;
58 }
59 return false;
60}
61
62template <class char_type>
63simdutf_constexpr23 bool is_base64_or_padding(char_type c,
64 simdutf::base64_options options) {
65 const uint8_t *to_base64 =
66 (options & base64_default_or_url)
67 ? tables::base64::to_base64_default_or_url_value
68 : ((options & base64_url) ? tables::base64::to_base64_url_value
69 : tables::base64::to_base64_value);
70 if (c == '=') {
71 return true;
72 }
73 uint8_t code = to_base64[uint8_t(c)];
74 if (is_eight_byte(c) && code <= 63) {
75 return true;
76 }
77 return false;
78}
79
80template <class char_type>
81bool is_ignorable_or_padding(char_type c, simdutf::base64_options options) {
82 return is_ignorable(c, options) || c == '=';
83}
84
// Description of where a base64 input really ends, as produced by find_end.
struct reduced_input {
  // Number of padding characters '=' that were found (typically 0, 1 or 2).
  size_t equalsigns;
  // Index of the first padding character, if any; find_end leaves it equal to
  // the trimmed length when there is no padding.
  size_t equallocation;
  // Length of the input buffer before the padding.
  size_t srclen;
  // Length reported as consumed on success. find_end sets it to the original
  // length (trailing spaces included) or, in the accept-garbage modes, to one
  // past the first '='.
  size_t full_input_length;
};
92
93// find the end of the base64 input buffer
94// It returns the number of padding characters, the location of the first
95// padding character if any, the length of the input buffer before padding
96// and the length of the input buffer with padding. The input buffer is not
97// modified. The function assumes that there are at most two padding characters.
98template <class char_type>
99simdutf_constexpr23 reduced_input find_end(const char_type *src, size_t srclen,
100 simdutf::base64_options options) {
101 const uint8_t *to_base64 =
102 (options & base64_default_or_url)
103 ? tables::base64::to_base64_default_or_url_value
104 : ((options & base64_url) ? tables::base64::to_base64_url_value
105 : tables::base64::to_base64_value);
106 const bool ignore_garbage =
107 (options == base64_options::base64_url_accept_garbage) ||
108 (options == base64_options::base64_default_accept_garbage) ||
109 (options == base64_options::base64_default_or_url_accept_garbage);
110
111 size_t equalsigns = 0;
112 // We intentionally include trailing spaces in the full input length.
113 // See https://github.com/simdutf/simdutf/issues/824
114 size_t full_input_length = srclen;
115 // skip trailing spaces
116 while (!ignore_garbage && srclen > 0 &&
117 scalar::base64::is_eight_byte(src[srclen - 1]) &&
118 to_base64[uint8_t(src[srclen - 1])] == 64) {
119 srclen--;
120 }
121 size_t equallocation =
122 srclen; // location of the first padding character if any
123 if (ignore_garbage) {
124 // Technically, we don't need to find the first padding character, we can
125 // just change our algorithms, but it adds substantial complexity.
126 auto it = simdutf::find(src, src + srclen, '=');
127 if (it != src + srclen) {
128 equallocation = it - src;
129 equalsigns = 1;
130 srclen = equallocation;
131 full_input_length = equallocation + 1;
132 }
133 return {equalsigns, equallocation, srclen, full_input_length};
134 }
135 if (!ignore_garbage && srclen > 0 && src[srclen - 1] == '=') {
136 // This is the last '=' sign.
137 equallocation = srclen - 1;
138 srclen--;
139 equalsigns = 1;
140 // skip trailing spaces
141 while (srclen > 0 && scalar::base64::is_eight_byte(src[srclen - 1]) &&
142 to_base64[uint8_t(src[srclen - 1])] == 64) {
143 srclen--;
144 }
145 if (srclen > 0 && src[srclen - 1] == '=') {
146 // This is the second '=' sign.
147 equallocation = srclen - 1;
148 srclen--;
149 equalsigns = 2;
150 }
151 }
152 return {equalsigns, equallocation, srclen, full_input_length};
153}
154
155// Returns true upon success. The destination buffer must be large enough.
156// This functions assumes that the padding (=) has been removed.
157// if check_capacity is true, it will check that the destination buffer is
158// large enough. If it is not, it will return OUTPUT_BUFFER_TOO_SMALL.
159template <bool check_capacity, class char_type>
160simdutf_constexpr23 full_result base64_tail_decode_impl(
161 char *dst, size_t outlen, const char_type *src, size_t length,
162 size_t padding_characters, // number of padding characters
163 // '=', typically 0, 1, 2.
164 base64_options options, last_chunk_handling_options last_chunk_options) {
165 char *dstend = dst + outlen;
166 (void)dstend;
167 // This looks like 10 branches, but we expect the compiler to resolve this to
168 // two branches (easily predicted):
169 const uint8_t *to_base64 =
170 (options & base64_default_or_url)
171 ? tables::base64::to_base64_default_or_url_value
172 : ((options & base64_url) ? tables::base64::to_base64_url_value
173 : tables::base64::to_base64_value);
174 const uint32_t *d0 =
175 (options & base64_default_or_url)
176 ? tables::base64::base64_default_or_url::d0
177 : ((options & base64_url) ? tables::base64::base64_url::d0
178 : tables::base64::base64_default::d0);
179 const uint32_t *d1 =
180 (options & base64_default_or_url)
181 ? tables::base64::base64_default_or_url::d1
182 : ((options & base64_url) ? tables::base64::base64_url::d1
183 : tables::base64::base64_default::d1);
184 const uint32_t *d2 =
185 (options & base64_default_or_url)
186 ? tables::base64::base64_default_or_url::d2
187 : ((options & base64_url) ? tables::base64::base64_url::d2
188 : tables::base64::base64_default::d2);
189 const uint32_t *d3 =
190 (options & base64_default_or_url)
191 ? tables::base64::base64_default_or_url::d3
192 : ((options & base64_url) ? tables::base64::base64_url::d3
193 : tables::base64::base64_default::d3);
194 const bool ignore_garbage =
195 (options == base64_options::base64_url_accept_garbage) ||
196 (options == base64_options::base64_default_accept_garbage) ||
197 (options == base64_options::base64_default_or_url_accept_garbage);
198
199 const char_type *srcend = src + length;
200 const char_type *srcinit = src;
201 const char *dstinit = dst;
202
203 uint32_t x;
204 size_t idx;
205 uint8_t buffer[4];
206 while (true) {
207 while (srcend - src >= 4 && is_eight_byte(src[0]) &&
208 is_eight_byte(src[1]) && is_eight_byte(src[2]) &&
209 is_eight_byte(src[3]) &&
210 (x = d0[uint8_t(src[0])] | d1[uint8_t(src[1])] |
211 d2[uint8_t(src[2])] | d3[uint8_t(src[3])]) < 0x01FFFFFF) {
212 if (check_capacity && dstend - dst < 3) {
213 return {OUTPUT_BUFFER_TOO_SMALL, size_t(src - srcinit),
214 size_t(dst - dstinit)};
215 }
216 *dst++ = static_cast<char>(x & 0xFF);
217 *dst++ = static_cast<char>((x >> 8) & 0xFF);
218 *dst++ = static_cast<char>((x >> 16) & 0xFF);
219 src += 4;
220 }
221 const char_type *srccur = src;
222 idx = 0;
223 // we need at least four characters.
224#ifdef __clang__
225 // If possible, we read four characters at a time. (It is an optimization.)
226 if (ignore_garbage && src + 4 <= srcend) {
227 char_type c0 = src[0];
228 char_type c1 = src[1];
229 char_type c2 = src[2];
230 char_type c3 = src[3];
231
232 uint8_t code0 = to_base64[uint8_t(c0)];
233 uint8_t code1 = to_base64[uint8_t(c1)];
234 uint8_t code2 = to_base64[uint8_t(c2)];
235 uint8_t code3 = to_base64[uint8_t(c3)];
236
237 buffer[idx] = code0;
238 idx += (is_eight_byte(c0) && code0 <= 63);
239 buffer[idx] = code1;
240 idx += (is_eight_byte(c1) && code1 <= 63);
241 buffer[idx] = code2;
242 idx += (is_eight_byte(c2) && code2 <= 63);
243 buffer[idx] = code3;
244 idx += (is_eight_byte(c3) && code3 <= 63);
245 src += 4;
246 }
247#endif
248 while ((idx < 4) && (src < srcend)) {
249 char_type c = *src;
250
251 uint8_t code = to_base64[uint8_t(c)];
252 buffer[idx] = uint8_t(code);
253 if (is_eight_byte(c) && code <= 63) {
254 idx++;
255 } else if (!ignore_garbage &&
256 (code > 64 || !scalar::base64::is_eight_byte(c))) {
257 return {INVALID_BASE64_CHARACTER, size_t(src - srcinit),
258 size_t(dst - dstinit)};
259 } else {
260 // We have a space or a newline or garbage. We ignore it.
261 }
262 src++;
263 }
264 if (idx != 4) {
265 simdutf_log_assert(idx < 4, "idx should be less than 4");
266 // We never should have that the number of base64 characters + the
267 // number of padding characters is more than 4.
268 if (!ignore_garbage && (idx + padding_characters > 4)) {
269 return {INVALID_BASE64_CHARACTER, size_t(src - srcinit),
270 size_t(dst - dstinit), true};
271 }
272
273 // The idea here is that in loose mode,
274 // if there is padding at all, it must be used
275 // to form 4-wise chunk. However, in loose mode,
276 // we do accept no padding at all.
277 if (!ignore_garbage &&
278 last_chunk_options == last_chunk_handling_options::loose &&
279 (idx >= 2) && padding_characters > 0 &&
280 ((idx + padding_characters) & 3) != 0) {
281 return {INVALID_BASE64_CHARACTER, size_t(src - srcinit),
282 size_t(dst - dstinit), true};
283 } else
284
285 // The idea here is that in strict mode, we do not want to accept
286 // incomplete base64 chunks. So if the chunk was otherwise valid, we
287 // return BASE64_INPUT_REMAINDER.
288 if (!ignore_garbage &&
289 last_chunk_options == last_chunk_handling_options::strict &&
290 (idx >= 2) && ((idx + padding_characters) & 3) != 0) {
291 // The partial chunk was at src - idx
292 return {BASE64_INPUT_REMAINDER, size_t(src - srcinit),
293 size_t(dst - dstinit), true};
294 } else
295 // If there is a partial chunk with insufficient padding, with
296 // stop_before_partial, we need to just ignore it. In "only full"
297 // mode, skip the minute there are padding characters.
298 if ((last_chunk_options ==
299 last_chunk_handling_options::stop_before_partial &&
300 (padding_characters + idx < 4) && (idx != 0) &&
301 (idx >= 2 || padding_characters == 0)) ||
302 (last_chunk_options ==
303 last_chunk_handling_options::only_full_chunks &&
304 (idx >= 2 || padding_characters == 0))) {
305 // partial means that we are *not* going to consume the read
306 // characters. We need to rewind the src pointer.
307 src = srccur;
308 return {SUCCESS, size_t(src - srcinit), size_t(dst - dstinit)};
309 } else {
310 if (idx == 2) {
311 uint32_t triple = (uint32_t(buffer[0]) << 3 * 6) +
312 (uint32_t(buffer[1]) << 2 * 6);
313 if (!ignore_garbage &&
314 (last_chunk_options == last_chunk_handling_options::strict) &&
315 (triple & 0xffff)) {
316 return {BASE64_EXTRA_BITS, size_t(src - srcinit),
317 size_t(dst - dstinit)};
318 }
319 if (check_capacity && dstend - dst < 1) {
320 return {OUTPUT_BUFFER_TOO_SMALL, size_t(srccur - srcinit),
321 size_t(dst - dstinit)};
322 }
323 *dst++ = static_cast<char>((triple >> 16) & 0xFF);
324 } else if (idx == 3) {
325 uint32_t triple = (uint32_t(buffer[0]) << 3 * 6) +
326 (uint32_t(buffer[1]) << 2 * 6) +
327 (uint32_t(buffer[2]) << 1 * 6);
328 if (!ignore_garbage &&
329 (last_chunk_options == last_chunk_handling_options::strict) &&
330 (triple & 0xff)) {
331 return {BASE64_EXTRA_BITS, size_t(src - srcinit),
332 size_t(dst - dstinit)};
333 }
334 if (check_capacity && dstend - dst < 2) {
335 return {OUTPUT_BUFFER_TOO_SMALL, size_t(srccur - srcinit),
336 size_t(dst - dstinit)};
337 }
338 *dst++ = static_cast<char>((triple >> 16) & 0xFF);
339 *dst++ = static_cast<char>((triple >> 8) & 0xFF);
340 } else if (!ignore_garbage && idx == 1 &&
341 (!is_partial(last_chunk_options) ||
342 (is_partial(last_chunk_options) &&
343 padding_characters > 0))) {
344 return {BASE64_INPUT_REMAINDER, size_t(src - srcinit),
345 size_t(dst - dstinit)};
346 } else if (!ignore_garbage && idx == 0 && padding_characters > 0) {
347 return {INVALID_BASE64_CHARACTER, size_t(src - srcinit),
348 size_t(dst - dstinit), true};
349 }
350 return {SUCCESS, size_t(src - srcinit), size_t(dst - dstinit)};
351 }
352 }
353 if (check_capacity && dstend - dst < 3) {
354 return {OUTPUT_BUFFER_TOO_SMALL, size_t(srccur - srcinit),
355 size_t(dst - dstinit)};
356 }
357 uint32_t triple =
358 (uint32_t(buffer[0]) << 3 * 6) + (uint32_t(buffer[1]) << 2 * 6) +
359 (uint32_t(buffer[2]) << 1 * 6) + (uint32_t(buffer[3]) << 0 * 6);
360 *dst++ = static_cast<char>((triple >> 16) & 0xFF);
361 *dst++ = static_cast<char>((triple >> 8) & 0xFF);
362 *dst++ = static_cast<char>(triple & 0xFF);
363 }
364}
365
366template <class char_type>
367simdutf_constexpr23 full_result base64_tail_decode(
368 char *dst, const char_type *src, size_t length,
369 size_t padding_characters, // number of padding characters
370 // '=', typically 0, 1, 2.
371 base64_options options, last_chunk_handling_options last_chunk_options) {
372 return base64_tail_decode_impl<false>(dst, 0, src, length, padding_characters,
373 options, last_chunk_options);
374}
375
376// like base64_tail_decode, but it will not write past the end of the output
377// buffer. The outlen parameter is modified to reflect the number of bytes
378// written. This functions assumes that the padding (=) has been removed.
379//
380template <class char_type>
381simdutf_constexpr23 full_result base64_tail_decode_safe(
382 char *dst, size_t outlen, const char_type *src, size_t length,
383 size_t padding_characters, // number of padding characters
384 // '=', typically 0, 1, 2.
385 base64_options options, last_chunk_handling_options last_chunk_options) {
386 return base64_tail_decode_impl<true>(dst, outlen, src, length,
387 padding_characters, options,
388 last_chunk_options);
389}
390
391inline simdutf_constexpr23 full_result
392patch_tail_result(full_result r, size_t previous_input, size_t previous_output,
393 size_t equallocation, size_t full_input_length,
394 last_chunk_handling_options last_chunk_options) {
395 r.input_count += previous_input;
396 r.output_count += previous_output;
397 if (r.padding_error) {
398 r.input_count = equallocation;
399 }
400
401 if (r.error == error_code::SUCCESS) {
402 if (!is_partial(last_chunk_options)) {
403 // A success when we are not in stop_before_partial mode.
404 // means that we have consumed the whole input buffer.
405 r.input_count = full_input_length;
406 } else if (r.output_count % 3 != 0) {
407 r.input_count = full_input_length;
408 }
409 }
410 return r;
411}
412
413// Returns the number of bytes written. The destination buffer must be large
414// enough. It will add padding (=) if needed.
415template <bool use_lines = false>
416simdutf_constexpr23 size_t tail_encode_base64_impl(
417 char *dst, const char *src, size_t srclen, base64_options options,
418 size_t line_length = simdutf::default_line_length, size_t line_offset = 0) {
419 if constexpr (use_lines) {
420 // sanitize line_length and starting_line_offset.
421 // line_length must be greater than 3.
422 if (line_length < 4) {
423 line_length = 4;
424 }
425 simdutf_log_assert(line_offset <= line_length,
426 "line_offset should be less than line_length");
427 }
428 // By default, we use padding if we are not using the URL variant.
429 // This is check with ((options & base64_url) == 0) which returns true if we
430 // are not using the URL variant. However, we also allow 'inversion' of the
431 // convention with the base64_reverse_padding option. If the
432 // base64_reverse_padding option is set, we use padding if we are using the
433 // URL variant, and we omit it if we are not using the URL variant. This is
434 // checked with
435 // ((options & base64_reverse_padding) == base64_reverse_padding).
436 bool use_padding =
437 ((options & base64_url) == 0) ^
438 ((options & base64_reverse_padding) == base64_reverse_padding);
439 // This looks like 3 branches, but we expect the compiler to resolve this to
440 // a single branch:
441 const char *e0 = (options & base64_url) ? tables::base64::base64_url::e0
442 : tables::base64::base64_default::e0;
443 const char *e1 = (options & base64_url) ? tables::base64::base64_url::e1
444 : tables::base64::base64_default::e1;
445 const char *e2 = (options & base64_url) ? tables::base64::base64_url::e2
446 : tables::base64::base64_default::e2;
447 char *out = dst;
448 size_t i = 0;
449 uint8_t t1, t2, t3;
450 for (; i + 2 < srclen; i += 3) {
451 t1 = uint8_t(src[i]);
452 t2 = uint8_t(src[i + 1]);
453 t3 = uint8_t(src[i + 2]);
454 if constexpr (use_lines) {
455 if (line_offset + 3 >= line_length) {
456 if (line_offset == line_length) {
457 *out++ = '\n';
458 *out++ = e0[t1];
459 *out++ = e1[((t1 & 0x03) << 4) | ((t2 >> 4) & 0x0F)];
460 *out++ = e1[((t2 & 0x0F) << 2) | ((t3 >> 6) & 0x03)];
461 *out++ = e2[t3];
462 line_offset = 4;
463 } else if (line_offset + 1 == line_length) {
464 *out++ = e0[t1];
465 *out++ = '\n';
466 *out++ = e1[((t1 & 0x03) << 4) | ((t2 >> 4) & 0x0F)];
467 *out++ = e1[((t2 & 0x0F) << 2) | ((t3 >> 6) & 0x03)];
468 *out++ = e2[t3];
469 line_offset = 3;
470 } else if (line_offset + 2 == line_length) {
471 *out++ = e0[t1];
472 *out++ = e1[((t1 & 0x03) << 4) | ((t2 >> 4) & 0x0F)];
473 *out++ = '\n';
474 *out++ = e1[((t2 & 0x0F) << 2) | ((t3 >> 6) & 0x03)];
475 *out++ = e2[t3];
476 line_offset = 2;
477 } else if (line_offset + 3 == line_length) {
478 *out++ = e0[t1];
479 *out++ = e1[((t1 & 0x03) << 4) | ((t2 >> 4) & 0x0F)];
480 *out++ = e1[((t2 & 0x0F) << 2) | ((t3 >> 6) & 0x03)];
481 *out++ = '\n';
482 *out++ = e2[t3];
483 line_offset = 1;
484 }
485 } else {
486 *out++ = e0[t1];
487 *out++ = e1[((t1 & 0x03) << 4) | ((t2 >> 4) & 0x0F)];
488 *out++ = e1[((t2 & 0x0F) << 2) | ((t3 >> 6) & 0x03)];
489 *out++ = e2[t3];
490 line_offset += 4;
491 }
492 } else {
493 *out++ = e0[t1];
494 *out++ = e1[((t1 & 0x03) << 4) | ((t2 >> 4) & 0x0F)];
495 *out++ = e1[((t2 & 0x0F) << 2) | ((t3 >> 6) & 0x03)];
496 *out++ = e2[t3];
497 }
498 }
499 switch (srclen - i) {
500 case 0:
501 break;
502 case 1:
503 t1 = uint8_t(src[i]);
504 if constexpr (use_lines) {
505 if (use_padding) {
506 if (line_offset + 3 >= line_length) {
507 if (line_offset == line_length) {
508 *out++ = '\n';
509 *out++ = e0[t1];
510 *out++ = e1[(t1 & 0x03) << 4];
511 *out++ = '=';
512 *out++ = '=';
513 } else if (line_offset + 1 == line_length) {
514 *out++ = e0[t1];
515 *out++ = '\n';
516 *out++ = e1[(t1 & 0x03) << 4];
517 *out++ = '=';
518 *out++ = '=';
519 } else if (line_offset + 2 == line_length) {
520 *out++ = e0[t1];
521 *out++ = e1[(t1 & 0x03) << 4];
522 *out++ = '\n';
523 *out++ = '=';
524 *out++ = '=';
525 } else if (line_offset + 3 == line_length) {
526 *out++ = e0[t1];
527 *out++ = e1[(t1 & 0x03) << 4];
528 *out++ = '=';
529 *out++ = '\n';
530 *out++ = '=';
531 }
532 } else {
533 *out++ = e0[t1];
534 *out++ = e1[(t1 & 0x03) << 4];
535 *out++ = '=';
536 *out++ = '=';
537 }
538 } else {
539 if (line_offset + 2 >= line_length) {
540 if (line_offset == line_length) {
541 *out++ = '\n';
542 *out++ = e0[uint8_t(src[i])];
543 *out++ = e1[(uint8_t(src[i]) & 0x03) << 4];
544 } else if (line_offset + 1 == line_length) {
545 *out++ = e0[uint8_t(src[i])];
546 *out++ = '\n';
547 *out++ = e1[(uint8_t(src[i]) & 0x03) << 4];
548 } else {
549 *out++ = e0[uint8_t(src[i])];
550 *out++ = e1[(uint8_t(src[i]) & 0x03) << 4];
551 // *out++ = '\n'; ==> no newline at the end of the output
552 }
553 } else {
554 *out++ = e0[uint8_t(src[i])];
555 *out++ = e1[(uint8_t(src[i]) & 0x03) << 4];
556 }
557 }
558 } else {
559 *out++ = e0[t1];
560 *out++ = e1[(t1 & 0x03) << 4];
561 if (use_padding) {
562 *out++ = '=';
563 *out++ = '=';
564 }
565 }
566 break;
567 default: /* case 2 */
568 t1 = uint8_t(src[i]);
569 t2 = uint8_t(src[i + 1]);
570 if constexpr (use_lines) {
571 if (use_padding) {
572 if (line_offset + 3 >= line_length) {
573 if (line_offset == line_length) {
574 *out++ = '\n';
575 *out++ = e0[t1];
576 *out++ = e1[((t1 & 0x03) << 4) | ((t2 >> 4) & 0x0F)];
577 *out++ = e2[(t2 & 0x0F) << 2];
578 *out++ = '=';
579 } else if (line_offset + 1 == line_length) {
580 *out++ = e0[t1];
581 *out++ = '\n';
582 *out++ = e1[((t1 & 0x03) << 4) | ((t2 >> 4) & 0x0F)];
583 *out++ = e2[(t2 & 0x0F) << 2];
584 *out++ = '=';
585 } else if (line_offset + 2 == line_length) {
586 *out++ = e0[t1];
587 *out++ = e1[((t1 & 0x03) << 4) | ((t2 >> 4) & 0x0F)];
588 *out++ = '\n';
589 *out++ = e2[(t2 & 0x0F) << 2];
590 *out++ = '=';
591 } else if (line_offset + 3 == line_length) {
592 *out++ = e0[t1];
593 *out++ = e1[((t1 & 0x03) << 4) | ((t2 >> 4) & 0x0F)];
594 *out++ = e2[(t2 & 0x0F) << 2];
595 *out++ = '\n';
596 *out++ = '=';
597 }
598 } else {
599 *out++ = e0[t1];
600 *out++ = e1[((t1 & 0x03) << 4) | ((t2 >> 4) & 0x0F)];
601 *out++ = e2[(t2 & 0x0F) << 2];
602 *out++ = '=';
603 }
604 } else {
605 if (line_offset + 3 >= line_length) {
606 if (line_offset == line_length) {
607 *out++ = '\n';
608 *out++ = e0[t1];
609 *out++ = e1[((t1 & 0x03) << 4) | ((t2 >> 4) & 0x0F)];
610 *out++ = e2[(t2 & 0x0F) << 2];
611 } else if (line_offset + 1 == line_length) {
612 *out++ = e0[t1];
613 *out++ = '\n';
614 *out++ = e1[((t1 & 0x03) << 4) | ((t2 >> 4) & 0x0F)];
615 *out++ = e2[(t2 & 0x0F) << 2];
616 } else if (line_offset + 2 == line_length) {
617 *out++ = e0[t1];
618 *out++ = e1[((t1 & 0x03) << 4) | ((t2 >> 4) & 0x0F)];
619 *out++ = '\n';
620 *out++ = e2[(t2 & 0x0F) << 2];
621 } else {
622 *out++ = e0[t1];
623 *out++ = e1[((t1 & 0x03) << 4) | ((t2 >> 4) & 0x0F)];
624 *out++ = e2[(t2 & 0x0F) << 2];
625 // *out++ = '\n'; ==> no newline at the end of the output
626 }
627 } else {
628 *out++ = e0[t1];
629 *out++ = e1[((t1 & 0x03) << 4) | ((t2 >> 4) & 0x0F)];
630 *out++ = e2[(t2 & 0x0F) << 2];
631 }
632 }
633 } else {
634 *out++ = e0[t1];
635 *out++ = e1[((t1 & 0x03) << 4) | ((t2 >> 4) & 0x0F)];
636 *out++ = e2[(t2 & 0x0F) << 2];
637 if (use_padding) {
638 *out++ = '=';
639 }
640 }
641 }
642 return (size_t)(out - dst);
643}
644
645// Returns the number of bytes written. The destination buffer must be large
646// enough. It will add padding (=) if needed.
647simdutf_unused inline simdutf_constexpr23 size_t tail_encode_base64(
648 char *dst, const char *src, size_t srclen, base64_options options) {
649 return tail_encode_base64_impl(dst, src, srclen, options);
650}
651
652template <class InputPtr>
653simdutf_warn_unused simdutf_constexpr23 size_t
654maximal_binary_length_from_base64(InputPtr input, size_t length) noexcept {
655 // We process the padding characters ('=') at the end to make sure
656 // that we return an exact result when the input has no ignorable characters
657 // (e.g., spaces).
658 size_t padding = 0;
659 if (length > 0) {
660 if (input[length - 1] == '=') {
661 padding++;
662 if (length > 1 && input[length - 2] == '=') {
663 padding++;
664 }
665 }
666 }
667 // The input is not otherwise processed for ignorable characters or
668 // validation, so that the function runs in constant time (very fast). In
669 // practice, base64 inputs without ignorable characters are common and the
670 // common case are line separated inputs with relatively long lines (e.g., 76
671 // characters) which leads this function to a slight (1%) overestimation of
672 // the output size.
673 //
674 // Of course, some inputs might contain an arbitrary number of spaces or
675 // newlines, which would make this function return a very pessimistic output
676 // size but systems that produce base64 outputs typically do not do that and
677 // if they do, they do not care much about minimizing memory usage.
678 //
679 // In specialized applications, users may know that their input is line
680 // separated, which can be checked very quickly by by iterating (e.g., over 76
681 // character chunks, looking for the linefeed characters only). We could
682 // provide a specialized function for that, but it is not clear that the added
683 // complexity is worth it for us.
684 //
685 size_t actual_length = length - padding;
686 if (actual_length % 4 <= 1) {
687 return actual_length / 4 * 3;
688 }
689 // if we have a valid input, then the remainder must be 2 or 3 adding one or
690 // two extra bytes.
691 return actual_length / 4 * 3 + (actual_length % 4) - 1;
692}
693
694// This function computes the binary length by iterating through the input
695// and counting non-whitespace characters (excluding padding characters).
696// We use a simple check (c > ' ') which is easy to parallelize and matches
697// SIMD behavior. Only the last few characters are checked for padding '='.
698template <class char_type>
699simdutf_warn_unused simdutf_constexpr23 size_t
700binary_length_from_base64(const char_type *input, size_t length) noexcept {
701 // Count non-whitespace characters (c > ' ') with loop unrolling
702 size_t count = 0;
703 for (size_t i = 0; i < length; i++) {
704 count += (input[i] > ' ');
705 }
706
707 // Check for padding '=' at the end (at most 2 padding characters)
708 // Scan backwards, skipping whitespace, to find padding
709 size_t padding = 0;
710 size_t pos = length;
711 // Skip trailing whitespace
712 while (pos > 0 && padding < 2) {
713 char_type c = input[--pos];
714 if (c == '=') {
715 padding++;
716 } else if (c > ' ') {
717 break;
718 }
719 }
720 return ((count - padding) * 3) / 4;
721}
722
723template <typename char_type>
724simdutf_warn_unused simdutf_constexpr23 full_result
725base64_to_binary_details_impl(
726 const char_type *input, size_t length, char *output, base64_options options,
727 last_chunk_handling_options last_chunk_options) noexcept {
728 const bool ignore_garbage =
729 (options == base64_options::base64_url_accept_garbage) ||
730 (options == base64_options::base64_default_accept_garbage) ||
731 (options == base64_options::base64_default_or_url_accept_garbage);
732 auto ri = simdutf::scalar::base64::find_end(input, length, options);
733 size_t equallocation = ri.equallocation;
734 size_t equalsigns = ri.equalsigns;
735 length = ri.srclen;
736 size_t full_input_length = ri.full_input_length;
737 if (length == 0) {
738 if (!ignore_garbage && equalsigns > 0) {
739 return {INVALID_BASE64_CHARACTER, equallocation, 0, true};
740 }
741 return {SUCCESS, full_input_length, 0};
742 }
743 full_result r = scalar::base64::base64_tail_decode(
744 output, input, length, equalsigns, options, last_chunk_options);
745 r = scalar::base64::patch_tail_result(r, 0, 0, equallocation,
746 full_input_length, last_chunk_options);
747 if (!is_partial(last_chunk_options) && r.error == error_code::SUCCESS &&
748 equalsigns > 0 && !ignore_garbage) {
749 // additional checks
750 if ((r.output_count % 3 == 0) ||
751 ((r.output_count % 3) + 1 + equalsigns != 4)) {
752 return {INVALID_BASE64_CHARACTER, equallocation, r.output_count, true};
753 }
754 }
755 // When is_partial(last_chunk_options) is true, we must either end with
756 // the end of the stream (beyond whitespace) or right after a non-ignorable
757 // character or at the very beginning of the stream.
758 // See https://tc39.es/proposal-arraybuffer-base64/spec/#sec-frombase64
759 if (is_partial(last_chunk_options) && r.error == error_code::SUCCESS &&
760 r.input_count < full_input_length) {
761 // First check if we can extend the input to the end of the stream
762 while (r.input_count < full_input_length &&
763 base64_ignorable(*(input + r.input_count), options)) {
764 r.input_count++;
765 }
766 // If we are still not at the end of the stream, then we must backtrack
767 // to the last non-ignorable character.
768 if (r.input_count < full_input_length) {
769 while (r.input_count > 0 &&
770 base64_ignorable(*(input + r.input_count - 1), options)) {
771 r.input_count--;
772 }
773 }
774 }
775 return r;
776}
777
778template <typename char_type>
779simdutf_constexpr23 simdutf_warn_unused full_result
780base64_to_binary_details_safe_impl(
781 const char_type *input, size_t length, char *output, size_t outlen,
782 base64_options options,
783 last_chunk_handling_options last_chunk_options) noexcept {
784 const bool ignore_garbage =
785 (options == base64_options::base64_url_accept_garbage) ||
786 (options == base64_options::base64_default_accept_garbage) ||
787 (options == base64_options::base64_default_or_url_accept_garbage);
788 auto ri = simdutf::scalar::base64::find_end(input, length, options);
789 size_t equallocation = ri.equallocation;
790 size_t equalsigns = ri.equalsigns;
791 length = ri.srclen;
792 size_t full_input_length = ri.full_input_length;
793 if (length == 0) {
794 if (!ignore_garbage && equalsigns > 0) {
795 return {INVALID_BASE64_CHARACTER, equallocation, 0};
796 }
797 return {SUCCESS, full_input_length, 0};
798 }
799 full_result r = scalar::base64::base64_tail_decode_safe(
800 output, outlen, input, length, equalsigns, options, last_chunk_options);
801 r = scalar::base64::patch_tail_result(r, 0, 0, equallocation,
802 full_input_length, last_chunk_options);
803 if (!is_partial(last_chunk_options) && r.error == error_code::SUCCESS &&
804 equalsigns > 0 && !ignore_garbage) {
805 // additional checks
806 if ((r.output_count % 3 == 0) ||
807 ((r.output_count % 3) + 1 + equalsigns != 4)) {
808 return {INVALID_BASE64_CHARACTER, equallocation, r.output_count};
809 }
810 }
811
812 // When is_partial(last_chunk_options) is true, we must either end with
813 // the end of the stream (beyond whitespace) or right after a non-ignorable
814 // character or at the very beginning of the stream.
815 // See https://tc39.es/proposal-arraybuffer-base64/spec/#sec-frombase64
816 if (is_partial(last_chunk_options) && r.error == error_code::SUCCESS &&
817 r.input_count < full_input_length) {
818 // First check if we can extend the input to the end of the stream
819 while (r.input_count < full_input_length &&
820 base64_ignorable(*(input + r.input_count), options)) {
821 r.input_count++;
822 }
823 // If we are still not at the end of the stream, then we must backtrack
824 // to the last non-ignorable character.
825 if (r.input_count < full_input_length) {
826 while (r.input_count > 0 &&
827 base64_ignorable(*(input + r.input_count - 1), options)) {
828 r.input_count--;
829 }
830 }
831 }
832 return r;
833}
834
835simdutf_warn_unused simdutf_constexpr23 size_t
836base64_length_from_binary(size_t length, base64_options options) noexcept {
837 // By default, we use padding if we are not using the URL variant.
838 // This is check with ((options & base64_url) == 0) which returns true if we
839 // are not using the URL variant. However, we also allow 'inversion' of the
840 // convention with the base64_reverse_padding option. If the
841 // base64_reverse_padding option is set, we use padding if we are using the
842 // URL variant, and we omit it if we are not using the URL variant. This is
843 // checked with
844 // ((options & base64_reverse_padding) == base64_reverse_padding).
845 bool use_padding =
846 ((options & base64_url) == 0) ^
847 ((options & base64_reverse_padding) == base64_reverse_padding);
848 if (!use_padding) {
849 return length / 3 * 4 + ((length % 3) ? (length % 3) + 1 : 0);
850 }
851 return (length + 2) / 3 *
852 4; // We use padding to make the length a multiple of 4.
853}
854
855simdutf_warn_unused simdutf_constexpr23 size_t
856base64_length_from_binary_with_lines(size_t length, base64_options options,
857 size_t line_length) noexcept {
858 if (length == 0) {
859 return 0;
860 }
861 size_t base64_length =
862 scalar::base64::base64_length_from_binary(length, options);
863 if (line_length < 4) {
864 line_length = 4;
865 }
866 size_t lines =
867 (base64_length + line_length - 1) / line_length; // number of lines
868 return base64_length + lines - 1;
869}
870
871// Return the length of the prefix that contains count base64 characters.
872// Thus, if count is 3, the function returns the length of the prefix
873// that contains 3 base64 characters.
874// The function returns (size_t)-1 if there is not enough base64 characters in
875// the input.
876template <typename char_type>
877simdutf_warn_unused size_t prefix_length(size_t count,
878 simdutf::base64_options options,
879 const char_type *input,
880 size_t length) noexcept {
881 size_t i = 0;
882 while (i < length && is_ignorable(input[i], options)) {
883 i++;
884 }
885 if (count == 0) {
886 return i; // duh!
887 }
888 for (; i < length; i++) {
889 if (is_ignorable(input[i], options)) {
890 continue;
891 }
892 // We have a base64 character or a padding character.
893 count--;
894 if (count == 0) {
895 return i + 1;
896 }
897 }
898 simdutf_log_assert(false, "You never get here");
899
900 return -1; // should never happen
901}
902
903} // namespace base64
904} // unnamed namespace
905} // namespace scalar
906} // namespace simdutf
907
908#endif