simdutf 8.0.0
Unicode at GB/s.
Loading...
Searching...
No Matches
base64.h
1#ifndef SIMDUTF_BASE64_H
2#define SIMDUTF_BASE64_H
3
4#include <algorithm>
5#include <cstddef>
6#include <cstdint>
7#include <cstring>
8#include <iostream>
9
10namespace simdutf {
11namespace scalar {
12namespace {
13namespace base64 {
14
// Tells whether `c` is one of the five ASCII white-space characters: space,
// horizontal tab, line feed, carriage return or form feed.
// This helper is not tuned for speed; avoid it in long loops. In most
// instances you should be using is_ignorable.
template <class char_type> bool is_ascii_white_space(char_type c) {
  switch (c) {
  case ' ':
  case '\t':
  case '\n':
  case '\r':
  case '\f':
    return true;
  default:
    return false;
  }
}
20
21template <class char_type> simdutf_constexpr23 bool is_eight_byte(char_type c) {
22 if simdutf_constexpr (sizeof(char_type) == 1) {
23 return true;
24 }
25 return uint8_t(c) == c;
26}
27
28template <class char_type>
29simdutf_constexpr23 bool is_ignorable(char_type c,
30 simdutf::base64_options options) {
31 const uint8_t *to_base64 =
32 (options & base64_default_or_url)
33 ? tables::base64::to_base64_default_or_url_value
34 : ((options & base64_url) ? tables::base64::to_base64_url_value
35 : tables::base64::to_base64_value);
36 const bool ignore_garbage =
37 (options == base64_options::base64_url_accept_garbage) ||
38 (options == base64_options::base64_default_accept_garbage) ||
39 (options == base64_options::base64_default_or_url_accept_garbage);
40 uint8_t code = to_base64[uint8_t(c)];
41 if (is_eight_byte(c) && code <= 63) {
42 return false;
43 }
44 if (is_eight_byte(c) && code == 64) {
45 return true;
46 }
47 return ignore_garbage;
48}
49template <class char_type>
50simdutf_constexpr23 bool is_base64(char_type c,
51 simdutf::base64_options options) {
52 const uint8_t *to_base64 =
53 (options & base64_default_or_url)
54 ? tables::base64::to_base64_default_or_url_value
55 : ((options & base64_url) ? tables::base64::to_base64_url_value
56 : tables::base64::to_base64_value);
57 uint8_t code = to_base64[uint8_t(c)];
58 if (is_eight_byte(c) && code <= 63) {
59 return true;
60 }
61 return false;
62}
63
64template <class char_type>
65simdutf_constexpr23 bool is_base64_or_padding(char_type c,
66 simdutf::base64_options options) {
67 const uint8_t *to_base64 =
68 (options & base64_default_or_url)
69 ? tables::base64::to_base64_default_or_url_value
70 : ((options & base64_url) ? tables::base64::to_base64_url_value
71 : tables::base64::to_base64_value);
72 if (c == '=') {
73 return true;
74 }
75 uint8_t code = to_base64[uint8_t(c)];
76 if (is_eight_byte(c) && code <= 63) {
77 return true;
78 }
79 return false;
80}
81
82template <class char_type>
83bool is_ignorable_or_padding(char_type c, simdutf::base64_options options) {
84 return is_ignorable(c, options) || c == '=';
85}
86
// Summary of where the meaningful part of a base64 input ends, as computed by
// find_end.
struct reduced_input {
  // number of padding characters '=', typically 0, 1, 2.
  size_t equalsigns;
  // location of the first padding character if any
  size_t equallocation;
  // length of the input buffer before padding
  size_t srclen;
  // length of the input buffer with padding but without ignorable characters
  size_t full_input_length;
};
94
95// find the end of the base64 input buffer
96// It returns the number of padding characters, the location of the first
97// padding character if any, the length of the input buffer before padding
98// and the length of the input buffer with padding. The input buffer is not
99// modified. The function assumes that there are at most two padding characters.
100template <class char_type>
101simdutf_constexpr23 reduced_input find_end(const char_type *src, size_t srclen,
102 simdutf::base64_options options) {
103 const uint8_t *to_base64 =
104 (options & base64_default_or_url)
105 ? tables::base64::to_base64_default_or_url_value
106 : ((options & base64_url) ? tables::base64::to_base64_url_value
107 : tables::base64::to_base64_value);
108 const bool ignore_garbage =
109 (options == base64_options::base64_url_accept_garbage) ||
110 (options == base64_options::base64_default_accept_garbage) ||
111 (options == base64_options::base64_default_or_url_accept_garbage);
112
113 size_t equalsigns = 0;
114 // We intentionally include trailing spaces in the full input length.
115 // See https://github.com/simdutf/simdutf/issues/824
116 size_t full_input_length = srclen;
117 // skip trailing spaces
118 while (!ignore_garbage && srclen > 0 &&
119 scalar::base64::is_eight_byte(src[srclen - 1]) &&
120 to_base64[uint8_t(src[srclen - 1])] == 64) {
121 srclen--;
122 }
123 size_t equallocation =
124 srclen; // location of the first padding character if any
125 if (ignore_garbage) {
126 // Technically, we don't need to find the first padding character, we can
127 // just change our algorithms, but it adds substantial complexity.
128 auto it = simdutf::find(src, src + srclen, '=');
129 if (it != src + srclen) {
130 equallocation = it - src;
131 equalsigns = 1;
132 srclen = equallocation;
133 full_input_length = equallocation + 1;
134 }
135 return {equalsigns, equallocation, srclen, full_input_length};
136 }
137 if (!ignore_garbage && srclen > 0 && src[srclen - 1] == '=') {
138 // This is the last '=' sign.
139 equallocation = srclen - 1;
140 srclen--;
141 equalsigns = 1;
142 // skip trailing spaces
143 while (srclen > 0 && scalar::base64::is_eight_byte(src[srclen - 1]) &&
144 to_base64[uint8_t(src[srclen - 1])] == 64) {
145 srclen--;
146 }
147 if (srclen > 0 && src[srclen - 1] == '=') {
148 // This is the second '=' sign.
149 equallocation = srclen - 1;
150 srclen--;
151 equalsigns = 2;
152 }
153 }
154 return {equalsigns, equallocation, srclen, full_input_length};
155}
156
157// Returns true upon success. The destination buffer must be large enough.
158// This functions assumes that the padding (=) has been removed.
159// if check_capacity is true, it will check that the destination buffer is
160// large enough. If it is not, it will return OUTPUT_BUFFER_TOO_SMALL.
161template <bool check_capacity, class char_type>
162simdutf_constexpr23 full_result base64_tail_decode_impl(
163 char *dst, size_t outlen, const char_type *src, size_t length,
164 size_t padding_characters, // number of padding characters
165 // '=', typically 0, 1, 2.
166 base64_options options, last_chunk_handling_options last_chunk_options) {
167 char *dstend = dst + outlen;
168 (void)dstend;
169 // This looks like 10 branches, but we expect the compiler to resolve this to
170 // two branches (easily predicted):
171 const uint8_t *to_base64 =
172 (options & base64_default_or_url)
173 ? tables::base64::to_base64_default_or_url_value
174 : ((options & base64_url) ? tables::base64::to_base64_url_value
175 : tables::base64::to_base64_value);
176 const uint32_t *d0 =
177 (options & base64_default_or_url)
178 ? tables::base64::base64_default_or_url::d0
179 : ((options & base64_url) ? tables::base64::base64_url::d0
180 : tables::base64::base64_default::d0);
181 const uint32_t *d1 =
182 (options & base64_default_or_url)
183 ? tables::base64::base64_default_or_url::d1
184 : ((options & base64_url) ? tables::base64::base64_url::d1
185 : tables::base64::base64_default::d1);
186 const uint32_t *d2 =
187 (options & base64_default_or_url)
188 ? tables::base64::base64_default_or_url::d2
189 : ((options & base64_url) ? tables::base64::base64_url::d2
190 : tables::base64::base64_default::d2);
191 const uint32_t *d3 =
192 (options & base64_default_or_url)
193 ? tables::base64::base64_default_or_url::d3
194 : ((options & base64_url) ? tables::base64::base64_url::d3
195 : tables::base64::base64_default::d3);
196 const bool ignore_garbage =
197 (options == base64_options::base64_url_accept_garbage) ||
198 (options == base64_options::base64_default_accept_garbage) ||
199 (options == base64_options::base64_default_or_url_accept_garbage);
200
201 const char_type *srcend = src + length;
202 const char_type *srcinit = src;
203 const char *dstinit = dst;
204
205 uint32_t x;
206 size_t idx;
207 uint8_t buffer[4];
208 while (true) {
209 while (srcend - src >= 4 && is_eight_byte(src[0]) &&
210 is_eight_byte(src[1]) && is_eight_byte(src[2]) &&
211 is_eight_byte(src[3]) &&
212 (x = d0[uint8_t(src[0])] | d1[uint8_t(src[1])] |
213 d2[uint8_t(src[2])] | d3[uint8_t(src[3])]) < 0x01FFFFFF) {
214 if (check_capacity && dstend - dst < 3) {
215 return {OUTPUT_BUFFER_TOO_SMALL, size_t(src - srcinit),
216 size_t(dst - dstinit)};
217 }
218 *dst++ = static_cast<char>(x & 0xFF);
219 *dst++ = static_cast<char>((x >> 8) & 0xFF);
220 *dst++ = static_cast<char>((x >> 16) & 0xFF);
221 src += 4;
222 }
223 const char_type *srccur = src;
224 idx = 0;
225 // we need at least four characters.
226#ifdef __clang__
227 // If possible, we read four characters at a time. (It is an optimization.)
228 if (ignore_garbage && src + 4 <= srcend) {
229 char_type c0 = src[0];
230 char_type c1 = src[1];
231 char_type c2 = src[2];
232 char_type c3 = src[3];
233
234 uint8_t code0 = to_base64[uint8_t(c0)];
235 uint8_t code1 = to_base64[uint8_t(c1)];
236 uint8_t code2 = to_base64[uint8_t(c2)];
237 uint8_t code3 = to_base64[uint8_t(c3)];
238
239 buffer[idx] = code0;
240 idx += (is_eight_byte(c0) && code0 <= 63);
241 buffer[idx] = code1;
242 idx += (is_eight_byte(c1) && code1 <= 63);
243 buffer[idx] = code2;
244 idx += (is_eight_byte(c2) && code2 <= 63);
245 buffer[idx] = code3;
246 idx += (is_eight_byte(c3) && code3 <= 63);
247 src += 4;
248 }
249#endif
250 while ((idx < 4) && (src < srcend)) {
251 char_type c = *src;
252
253 uint8_t code = to_base64[uint8_t(c)];
254 buffer[idx] = uint8_t(code);
255 if (is_eight_byte(c) && code <= 63) {
256 idx++;
257 } else if (!ignore_garbage &&
258 (code > 64 || !scalar::base64::is_eight_byte(c))) {
259 return {INVALID_BASE64_CHARACTER, size_t(src - srcinit),
260 size_t(dst - dstinit)};
261 } else {
262 // We have a space or a newline or garbage. We ignore it.
263 }
264 src++;
265 }
266 if (idx != 4) {
267 simdutf_log_assert(idx < 4, "idx should be less than 4");
268 // We never should have that the number of base64 characters + the
269 // number of padding characters is more than 4.
270 if (!ignore_garbage && (idx + padding_characters > 4)) {
271 return {INVALID_BASE64_CHARACTER, size_t(src - srcinit),
272 size_t(dst - dstinit), true};
273 }
274
275 // The idea here is that in loose mode,
276 // if there is padding at all, it must be used
277 // to form 4-wise chunk. However, in loose mode,
278 // we do accept no padding at all.
279 if (!ignore_garbage &&
280 last_chunk_options == last_chunk_handling_options::loose &&
281 (idx >= 2) && padding_characters > 0 &&
282 ((idx + padding_characters) & 3) != 0) {
283 return {INVALID_BASE64_CHARACTER, size_t(src - srcinit),
284 size_t(dst - dstinit), true};
285 } else
286
287 // The idea here is that in strict mode, we do not want to accept
288 // incomplete base64 chunks. So if the chunk was otherwise valid, we
289 // return BASE64_INPUT_REMAINDER.
290 if (!ignore_garbage &&
291 last_chunk_options == last_chunk_handling_options::strict &&
292 (idx >= 2) && ((idx + padding_characters) & 3) != 0) {
293 // The partial chunk was at src - idx
294 return {BASE64_INPUT_REMAINDER, size_t(src - srcinit),
295 size_t(dst - dstinit), true};
296 } else
297 // If there is a partial chunk with insufficient padding, with
298 // stop_before_partial, we need to just ignore it. In "only full"
299 // mode, skip the minute there are padding characters.
300 if ((last_chunk_options ==
301 last_chunk_handling_options::stop_before_partial &&
302 (padding_characters + idx < 4) && (idx != 0) &&
303 (idx >= 2 || padding_characters == 0)) ||
304 (last_chunk_options ==
305 last_chunk_handling_options::only_full_chunks &&
306 (idx >= 2 || padding_characters == 0))) {
307 // partial means that we are *not* going to consume the read
308 // characters. We need to rewind the src pointer.
309 src = srccur;
310 return {SUCCESS, size_t(src - srcinit), size_t(dst - dstinit)};
311 } else {
312 if (idx == 2) {
313 uint32_t triple = (uint32_t(buffer[0]) << 3 * 6) +
314 (uint32_t(buffer[1]) << 2 * 6);
315 if (!ignore_garbage &&
316 (last_chunk_options == last_chunk_handling_options::strict) &&
317 (triple & 0xffff)) {
318 return {BASE64_EXTRA_BITS, size_t(src - srcinit),
319 size_t(dst - dstinit)};
320 }
321 if (check_capacity && dstend - dst < 1) {
322 return {OUTPUT_BUFFER_TOO_SMALL, size_t(srccur - srcinit),
323 size_t(dst - dstinit)};
324 }
325 *dst++ = static_cast<char>((triple >> 16) & 0xFF);
326 } else if (idx == 3) {
327 uint32_t triple = (uint32_t(buffer[0]) << 3 * 6) +
328 (uint32_t(buffer[1]) << 2 * 6) +
329 (uint32_t(buffer[2]) << 1 * 6);
330 if (!ignore_garbage &&
331 (last_chunk_options == last_chunk_handling_options::strict) &&
332 (triple & 0xff)) {
333 return {BASE64_EXTRA_BITS, size_t(src - srcinit),
334 size_t(dst - dstinit)};
335 }
336 if (check_capacity && dstend - dst < 2) {
337 return {OUTPUT_BUFFER_TOO_SMALL, size_t(srccur - srcinit),
338 size_t(dst - dstinit)};
339 }
340 *dst++ = static_cast<char>((triple >> 16) & 0xFF);
341 *dst++ = static_cast<char>((triple >> 8) & 0xFF);
342 } else if (!ignore_garbage && idx == 1 &&
343 (!is_partial(last_chunk_options) ||
344 (is_partial(last_chunk_options) &&
345 padding_characters > 0))) {
346 return {BASE64_INPUT_REMAINDER, size_t(src - srcinit),
347 size_t(dst - dstinit)};
348 } else if (!ignore_garbage && idx == 0 && padding_characters > 0) {
349 return {INVALID_BASE64_CHARACTER, size_t(src - srcinit),
350 size_t(dst - dstinit), true};
351 }
352 return {SUCCESS, size_t(src - srcinit), size_t(dst - dstinit)};
353 }
354 }
355 if (check_capacity && dstend - dst < 3) {
356 return {OUTPUT_BUFFER_TOO_SMALL, size_t(srccur - srcinit),
357 size_t(dst - dstinit)};
358 }
359 uint32_t triple =
360 (uint32_t(buffer[0]) << 3 * 6) + (uint32_t(buffer[1]) << 2 * 6) +
361 (uint32_t(buffer[2]) << 1 * 6) + (uint32_t(buffer[3]) << 0 * 6);
362 *dst++ = static_cast<char>((triple >> 16) & 0xFF);
363 *dst++ = static_cast<char>((triple >> 8) & 0xFF);
364 *dst++ = static_cast<char>(triple & 0xFF);
365 }
366}
367
368template <class char_type>
369simdutf_constexpr23 full_result base64_tail_decode(
370 char *dst, const char_type *src, size_t length,
371 size_t padding_characters, // number of padding characters
372 // '=', typically 0, 1, 2.
373 base64_options options, last_chunk_handling_options last_chunk_options) {
374 return base64_tail_decode_impl<false>(dst, 0, src, length, padding_characters,
375 options, last_chunk_options);
376}
377
378// like base64_tail_decode, but it will not write past the end of the output
379// buffer. The outlen parameter is modified to reflect the number of bytes
380// written. This functions assumes that the padding (=) has been removed.
381//
382template <class char_type>
383simdutf_constexpr23 full_result base64_tail_decode_safe(
384 char *dst, size_t outlen, const char_type *src, size_t length,
385 size_t padding_characters, // number of padding characters
386 // '=', typically 0, 1, 2.
387 base64_options options, last_chunk_handling_options last_chunk_options) {
388 return base64_tail_decode_impl<true>(dst, outlen, src, length,
389 padding_characters, options,
390 last_chunk_options);
391}
392
393inline simdutf_constexpr23 full_result
394patch_tail_result(full_result r, size_t previous_input, size_t previous_output,
395 size_t equallocation, size_t full_input_length,
396 last_chunk_handling_options last_chunk_options) {
397 r.input_count += previous_input;
398 r.output_count += previous_output;
399 if (r.padding_error) {
400 r.input_count = equallocation;
401 }
402
403 if (r.error == error_code::SUCCESS) {
404 if (!is_partial(last_chunk_options)) {
405 // A success when we are not in stop_before_partial mode.
406 // means that we have consumed the whole input buffer.
407 r.input_count = full_input_length;
408 } else if (r.output_count % 3 != 0) {
409 r.input_count = full_input_length;
410 }
411 }
412 return r;
413}
414
415// Returns the number of bytes written. The destination buffer must be large
416// enough. It will add padding (=) if needed.
417template <bool use_lines = false>
418simdutf_constexpr23 size_t tail_encode_base64_impl(
419 char *dst, const char *src, size_t srclen, base64_options options,
420 size_t line_length = simdutf::default_line_length, size_t line_offset = 0) {
421 if simdutf_constexpr (use_lines) {
422 // sanitize line_length and starting_line_offset.
423 // line_length must be greater than 3.
424 if (line_length < 4) {
425 line_length = 4;
426 }
427 simdutf_log_assert(line_offset <= line_length,
428 "line_offset should be less than line_length");
429 }
430 // By default, we use padding if we are not using the URL variant.
431 // This is check with ((options & base64_url) == 0) which returns true if we
432 // are not using the URL variant. However, we also allow 'inversion' of the
433 // convention with the base64_reverse_padding option. If the
434 // base64_reverse_padding option is set, we use padding if we are using the
435 // URL variant, and we omit it if we are not using the URL variant. This is
436 // checked with
437 // ((options & base64_reverse_padding) == base64_reverse_padding).
438 bool use_padding =
439 ((options & base64_url) == 0) ^
440 ((options & base64_reverse_padding) == base64_reverse_padding);
441 // This looks like 3 branches, but we expect the compiler to resolve this to
442 // a single branch:
443 const char *e0 = (options & base64_url) ? tables::base64::base64_url::e0
444 : tables::base64::base64_default::e0;
445 const char *e1 = (options & base64_url) ? tables::base64::base64_url::e1
446 : tables::base64::base64_default::e1;
447 const char *e2 = (options & base64_url) ? tables::base64::base64_url::e2
448 : tables::base64::base64_default::e2;
449 char *out = dst;
450 size_t i = 0;
451 uint8_t t1, t2, t3;
452 for (; i + 2 < srclen; i += 3) {
453 t1 = uint8_t(src[i]);
454 t2 = uint8_t(src[i + 1]);
455 t3 = uint8_t(src[i + 2]);
456 if simdutf_constexpr (use_lines) {
457 if (line_offset + 3 >= line_length) {
458 if (line_offset == line_length) {
459 *out++ = '\n';
460 *out++ = e0[t1];
461 *out++ = e1[((t1 & 0x03) << 4) | ((t2 >> 4) & 0x0F)];
462 *out++ = e1[((t2 & 0x0F) << 2) | ((t3 >> 6) & 0x03)];
463 *out++ = e2[t3];
464 line_offset = 4;
465 } else if (line_offset + 1 == line_length) {
466 *out++ = e0[t1];
467 *out++ = '\n';
468 *out++ = e1[((t1 & 0x03) << 4) | ((t2 >> 4) & 0x0F)];
469 *out++ = e1[((t2 & 0x0F) << 2) | ((t3 >> 6) & 0x03)];
470 *out++ = e2[t3];
471 line_offset = 3;
472 } else if (line_offset + 2 == line_length) {
473 *out++ = e0[t1];
474 *out++ = e1[((t1 & 0x03) << 4) | ((t2 >> 4) & 0x0F)];
475 *out++ = '\n';
476 *out++ = e1[((t2 & 0x0F) << 2) | ((t3 >> 6) & 0x03)];
477 *out++ = e2[t3];
478 line_offset = 2;
479 } else if (line_offset + 3 == line_length) {
480 *out++ = e0[t1];
481 *out++ = e1[((t1 & 0x03) << 4) | ((t2 >> 4) & 0x0F)];
482 *out++ = e1[((t2 & 0x0F) << 2) | ((t3 >> 6) & 0x03)];
483 *out++ = '\n';
484 *out++ = e2[t3];
485 line_offset = 1;
486 }
487 } else {
488 *out++ = e0[t1];
489 *out++ = e1[((t1 & 0x03) << 4) | ((t2 >> 4) & 0x0F)];
490 *out++ = e1[((t2 & 0x0F) << 2) | ((t3 >> 6) & 0x03)];
491 *out++ = e2[t3];
492 line_offset += 4;
493 }
494 } else {
495 *out++ = e0[t1];
496 *out++ = e1[((t1 & 0x03) << 4) | ((t2 >> 4) & 0x0F)];
497 *out++ = e1[((t2 & 0x0F) << 2) | ((t3 >> 6) & 0x03)];
498 *out++ = e2[t3];
499 }
500 }
501 switch (srclen - i) {
502 case 0:
503 break;
504 case 1:
505 t1 = uint8_t(src[i]);
506 if simdutf_constexpr (use_lines) {
507 if (use_padding) {
508 if (line_offset + 3 >= line_length) {
509 if (line_offset == line_length) {
510 *out++ = '\n';
511 *out++ = e0[t1];
512 *out++ = e1[(t1 & 0x03) << 4];
513 *out++ = '=';
514 *out++ = '=';
515 } else if (line_offset + 1 == line_length) {
516 *out++ = e0[t1];
517 *out++ = '\n';
518 *out++ = e1[(t1 & 0x03) << 4];
519 *out++ = '=';
520 *out++ = '=';
521 } else if (line_offset + 2 == line_length) {
522 *out++ = e0[t1];
523 *out++ = e1[(t1 & 0x03) << 4];
524 *out++ = '\n';
525 *out++ = '=';
526 *out++ = '=';
527 } else if (line_offset + 3 == line_length) {
528 *out++ = e0[t1];
529 *out++ = e1[(t1 & 0x03) << 4];
530 *out++ = '=';
531 *out++ = '\n';
532 *out++ = '=';
533 }
534 } else {
535 *out++ = e0[t1];
536 *out++ = e1[(t1 & 0x03) << 4];
537 *out++ = '=';
538 *out++ = '=';
539 }
540 } else {
541 if (line_offset + 2 >= line_length) {
542 if (line_offset == line_length) {
543 *out++ = '\n';
544 *out++ = e0[uint8_t(src[i])];
545 *out++ = e1[(uint8_t(src[i]) & 0x03) << 4];
546 } else if (line_offset + 1 == line_length) {
547 *out++ = e0[uint8_t(src[i])];
548 *out++ = '\n';
549 *out++ = e1[(uint8_t(src[i]) & 0x03) << 4];
550 } else {
551 *out++ = e0[uint8_t(src[i])];
552 *out++ = e1[(uint8_t(src[i]) & 0x03) << 4];
553 // *out++ = '\n'; ==> no newline at the end of the output
554 }
555 } else {
556 *out++ = e0[uint8_t(src[i])];
557 *out++ = e1[(uint8_t(src[i]) & 0x03) << 4];
558 }
559 }
560 } else {
561 *out++ = e0[t1];
562 *out++ = e1[(t1 & 0x03) << 4];
563 if (use_padding) {
564 *out++ = '=';
565 *out++ = '=';
566 }
567 }
568 break;
569 default: /* case 2 */
570 t1 = uint8_t(src[i]);
571 t2 = uint8_t(src[i + 1]);
572 if simdutf_constexpr (use_lines) {
573 if (use_padding) {
574 if (line_offset + 3 >= line_length) {
575 if (line_offset == line_length) {
576 *out++ = '\n';
577 *out++ = e0[t1];
578 *out++ = e1[((t1 & 0x03) << 4) | ((t2 >> 4) & 0x0F)];
579 *out++ = e2[(t2 & 0x0F) << 2];
580 *out++ = '=';
581 } else if (line_offset + 1 == line_length) {
582 *out++ = e0[t1];
583 *out++ = '\n';
584 *out++ = e1[((t1 & 0x03) << 4) | ((t2 >> 4) & 0x0F)];
585 *out++ = e2[(t2 & 0x0F) << 2];
586 *out++ = '=';
587 } else if (line_offset + 2 == line_length) {
588 *out++ = e0[t1];
589 *out++ = e1[((t1 & 0x03) << 4) | ((t2 >> 4) & 0x0F)];
590 *out++ = '\n';
591 *out++ = e2[(t2 & 0x0F) << 2];
592 *out++ = '=';
593 } else if (line_offset + 3 == line_length) {
594 *out++ = e0[t1];
595 *out++ = e1[((t1 & 0x03) << 4) | ((t2 >> 4) & 0x0F)];
596 *out++ = e2[(t2 & 0x0F) << 2];
597 *out++ = '\n';
598 *out++ = '=';
599 }
600 } else {
601 *out++ = e0[t1];
602 *out++ = e1[((t1 & 0x03) << 4) | ((t2 >> 4) & 0x0F)];
603 *out++ = e2[(t2 & 0x0F) << 2];
604 *out++ = '=';
605 }
606 } else {
607 if (line_offset + 3 >= line_length) {
608 if (line_offset == line_length) {
609 *out++ = '\n';
610 *out++ = e0[t1];
611 *out++ = e1[((t1 & 0x03) << 4) | ((t2 >> 4) & 0x0F)];
612 *out++ = e2[(t2 & 0x0F) << 2];
613 } else if (line_offset + 1 == line_length) {
614 *out++ = e0[t1];
615 *out++ = '\n';
616 *out++ = e1[((t1 & 0x03) << 4) | ((t2 >> 4) & 0x0F)];
617 *out++ = e2[(t2 & 0x0F) << 2];
618 } else if (line_offset + 2 == line_length) {
619 *out++ = e0[t1];
620 *out++ = e1[((t1 & 0x03) << 4) | ((t2 >> 4) & 0x0F)];
621 *out++ = '\n';
622 *out++ = e2[(t2 & 0x0F) << 2];
623 } else {
624 *out++ = e0[t1];
625 *out++ = e1[((t1 & 0x03) << 4) | ((t2 >> 4) & 0x0F)];
626 *out++ = e2[(t2 & 0x0F) << 2];
627 // *out++ = '\n'; ==> no newline at the end of the output
628 }
629 } else {
630 *out++ = e0[t1];
631 *out++ = e1[((t1 & 0x03) << 4) | ((t2 >> 4) & 0x0F)];
632 *out++ = e2[(t2 & 0x0F) << 2];
633 }
634 }
635 } else {
636 *out++ = e0[t1];
637 *out++ = e1[((t1 & 0x03) << 4) | ((t2 >> 4) & 0x0F)];
638 *out++ = e2[(t2 & 0x0F) << 2];
639 if (use_padding) {
640 *out++ = '=';
641 }
642 }
643 }
644 return (size_t)(out - dst);
645}
646
647// Returns the number of bytes written. The destination buffer must be large
648// enough. It will add padding (=) if needed.
649inline simdutf_constexpr23 size_t tail_encode_base64(char *dst, const char *src,
650 size_t srclen,
651 base64_options options) {
652 return tail_encode_base64_impl(dst, src, srclen, options);
653}
654
655template <class InputPtr>
656simdutf_warn_unused simdutf_constexpr23 size_t
657maximal_binary_length_from_base64(InputPtr input, size_t length) noexcept {
658 // We process the padding characters ('=') at the end to make sure
659 // that we return an exact result when the input has no ignorable characters
660 // (e.g., spaces).
661 size_t padding = 0;
662 if (length > 0) {
663 if (input[length - 1] == '=') {
664 padding++;
665 if (length > 1 && input[length - 2] == '=') {
666 padding++;
667 }
668 }
669 }
670 // The input is not otherwise processed for ignorable characters or
671 // validation, so that the function runs in constant time (very fast). In
672 // practice, base64 inputs without ignorable characters are common and the
673 // common case are line separated inputs with relatively long lines (e.g., 76
674 // characters) which leads this function to a slight (1%) overestimation of
675 // the output size.
676 //
677 // Of course, some inputs might contain an arbitrary number of spaces or
678 // newlines, which would make this function return a very pessimistic output
679 // size but systems that produce base64 outputs typically do not do that and
680 // if they do, they do not care much about minimizing memory usage.
681 //
682 // In specialized applications, users may know that their input is line
683 // separated, which can be checked very quickly by by iterating (e.g., over 76
684 // character chunks, looking for the linefeed characters only). We could
685 // provide a specialized function for that, but it is not clear that the added
686 // complexity is worth it for us.
687 //
688 size_t actual_length = length - padding;
689 if (actual_length % 4 <= 1) {
690 return actual_length / 4 * 3;
691 }
692 // if we have a valid input, then the remainder must be 2 or 3 adding one or
693 // two extra bytes.
694 return actual_length / 4 * 3 + (actual_length % 4) - 1;
695}
696
697template <typename char_type>
698simdutf_warn_unused simdutf_constexpr23 full_result
699base64_to_binary_details_impl(
700 const char_type *input, size_t length, char *output, base64_options options,
701 last_chunk_handling_options last_chunk_options) noexcept {
702 const bool ignore_garbage =
703 (options == base64_options::base64_url_accept_garbage) ||
704 (options == base64_options::base64_default_accept_garbage) ||
705 (options == base64_options::base64_default_or_url_accept_garbage);
706 auto ri = simdutf::scalar::base64::find_end(input, length, options);
707 size_t equallocation = ri.equallocation;
708 size_t equalsigns = ri.equalsigns;
709 length = ri.srclen;
710 size_t full_input_length = ri.full_input_length;
711 if (length == 0) {
712 if (!ignore_garbage && equalsigns > 0) {
713 return {INVALID_BASE64_CHARACTER, equallocation, 0};
714 }
715 return {SUCCESS, full_input_length, 0};
716 }
717 full_result r = scalar::base64::base64_tail_decode(
718 output, input, length, equalsigns, options, last_chunk_options);
719 r = scalar::base64::patch_tail_result(r, 0, 0, equallocation,
720 full_input_length, last_chunk_options);
721 if (!is_partial(last_chunk_options) && r.error == error_code::SUCCESS &&
722 equalsigns > 0 && !ignore_garbage) {
723 // additional checks
724 if ((r.output_count % 3 == 0) ||
725 ((r.output_count % 3) + 1 + equalsigns != 4)) {
726 return {INVALID_BASE64_CHARACTER, equallocation, r.output_count};
727 }
728 }
729 // When is_partial(last_chunk_options) is true, we must either end with
730 // the end of the stream (beyond whitespace) or right after a non-ignorable
731 // character or at the very beginning of the stream.
732 // See https://tc39.es/proposal-arraybuffer-base64/spec/#sec-frombase64
733 if (is_partial(last_chunk_options) && r.error == error_code::SUCCESS &&
734 r.input_count < full_input_length) {
735 // First check if we can extend the input to the end of the stream
736 while (r.input_count < full_input_length &&
737 base64_ignorable(*(input + r.input_count), options)) {
738 r.input_count++;
739 }
740 // If we are still not at the end of the stream, then we must backtrack
741 // to the last non-ignorable character.
742 if (r.input_count < full_input_length) {
743 while (r.input_count > 0 &&
744 base64_ignorable(*(input + r.input_count - 1), options)) {
745 r.input_count--;
746 }
747 }
748 }
749 return r;
750}
751
752template <typename char_type>
753simdutf_constexpr23 simdutf_warn_unused full_result
754base64_to_binary_details_safe_impl(
755 const char_type *input, size_t length, char *output, size_t outlen,
756 base64_options options,
757 last_chunk_handling_options last_chunk_options) noexcept {
758 const bool ignore_garbage =
759 (options == base64_options::base64_url_accept_garbage) ||
760 (options == base64_options::base64_default_accept_garbage) ||
761 (options == base64_options::base64_default_or_url_accept_garbage);
762 auto ri = simdutf::scalar::base64::find_end(input, length, options);
763 size_t equallocation = ri.equallocation;
764 size_t equalsigns = ri.equalsigns;
765 length = ri.srclen;
766 size_t full_input_length = ri.full_input_length;
767 if (length == 0) {
768 if (!ignore_garbage && equalsigns > 0) {
769 return {INVALID_BASE64_CHARACTER, equallocation, 0};
770 }
771 return {SUCCESS, full_input_length, 0};
772 }
773 full_result r = scalar::base64::base64_tail_decode_safe(
774 output, outlen, input, length, equalsigns, options, last_chunk_options);
775 r = scalar::base64::patch_tail_result(r, 0, 0, equallocation,
776 full_input_length, last_chunk_options);
777 if (!is_partial(last_chunk_options) && r.error == error_code::SUCCESS &&
778 equalsigns > 0 && !ignore_garbage) {
779 // additional checks
780 if ((r.output_count % 3 == 0) ||
781 ((r.output_count % 3) + 1 + equalsigns != 4)) {
782 return {INVALID_BASE64_CHARACTER, equallocation, r.output_count};
783 }
784 }
785
786 // When is_partial(last_chunk_options) is true, we must either end with
787 // the end of the stream (beyond whitespace) or right after a non-ignorable
788 // character or at the very beginning of the stream.
789 // See https://tc39.es/proposal-arraybuffer-base64/spec/#sec-frombase64
790 if (is_partial(last_chunk_options) && r.error == error_code::SUCCESS &&
791 r.input_count < full_input_length) {
792 // First check if we can extend the input to the end of the stream
793 while (r.input_count < full_input_length &&
794 base64_ignorable(*(input + r.input_count), options)) {
795 r.input_count++;
796 }
797 // If we are still not at the end of the stream, then we must backtrack
798 // to the last non-ignorable character.
799 if (r.input_count < full_input_length) {
800 while (r.input_count > 0 &&
801 base64_ignorable(*(input + r.input_count - 1), options)) {
802 r.input_count--;
803 }
804 }
805 }
806 return r;
807}
808
809simdutf_warn_unused simdutf_constexpr23 size_t
810base64_length_from_binary(size_t length, base64_options options) noexcept {
811 // By default, we use padding if we are not using the URL variant.
812 // This is check with ((options & base64_url) == 0) which returns true if we
813 // are not using the URL variant. However, we also allow 'inversion' of the
814 // convention with the base64_reverse_padding option. If the
815 // base64_reverse_padding option is set, we use padding if we are using the
816 // URL variant, and we omit it if we are not using the URL variant. This is
817 // checked with
818 // ((options & base64_reverse_padding) == base64_reverse_padding).
819 bool use_padding =
820 ((options & base64_url) == 0) ^
821 ((options & base64_reverse_padding) == base64_reverse_padding);
822 if (!use_padding) {
823 return length / 3 * 4 + ((length % 3) ? (length % 3) + 1 : 0);
824 }
825 return (length + 2) / 3 *
826 4; // We use padding to make the length a multiple of 4.
827}
828
829simdutf_warn_unused simdutf_constexpr23 size_t
830base64_length_from_binary_with_lines(size_t length, base64_options options,
831 size_t line_length) noexcept {
832 if (length == 0) {
833 return 0;
834 }
835 size_t base64_length =
836 scalar::base64::base64_length_from_binary(length, options);
837 if (line_length < 4) {
838 line_length = 4;
839 }
840 size_t lines =
841 (base64_length + line_length - 1) / line_length; // number of lines
842 return base64_length + lines - 1;
843}
844
845// Return the length of the prefix that contains count base64 characters.
846// Thus, if count is 3, the function returns the length of the prefix
847// that contains 3 base64 characters.
848// The function returns (size_t)-1 if there is not enough base64 characters in
849// the input.
850template <typename char_type>
851simdutf_warn_unused size_t prefix_length(size_t count,
852 simdutf::base64_options options,
853 const char_type *input,
854 size_t length) noexcept {
855 size_t i = 0;
856 while (i < length && is_ignorable(input[i], options)) {
857 i++;
858 }
859 if (count == 0) {
860 return i; // duh!
861 }
862 for (; i < length; i++) {
863 if (is_ignorable(input[i], options)) {
864 continue;
865 }
866 // We have a base64 character or a padding character.
867 count--;
868 if (count == 0) {
869 return i + 1;
870 }
871 }
872 simdutf_log_assert(false, "You never get here");
873
874 return -1; // should never happen
875}
876
877} // namespace base64
878} // unnamed namespace
879} // namespace scalar
880} // namespace simdutf
881
882#endif