simdutf 9.0.0
Unicode at GB/s.
Loading...
Searching...
No Matches
base64.h
1#ifndef SIMDUTF_BASE64_H
2#define SIMDUTF_BASE64_H
3
4#include <algorithm>
5#include <cstddef>
6#include <cstdint>
7#include <cstring>
8
9namespace simdutf {
10namespace scalar {
11namespace {
12namespace base64 {
13
// Tells whether c is one of the five ASCII white-space characters: space,
// horizontal tab, line feed, carriage return or form feed.
// This helper is not meant to be fast: avoid it in long loops. In most
// situations is_ignorable is the function you want.
template <class char_type> bool is_ascii_white_space(char_type c) {
  switch (c) {
  case ' ':
  case '\t':
  case '\n':
  case '\r':
  case '\f':
    return true;
  default:
    return false;
  }
}
19
20template <class char_type> simdutf_constexpr23 bool is_eight_byte(char_type c) {
21 if constexpr (sizeof(char_type) == 1) {
22 return true;
23 }
24 return uint8_t(c) == c;
25}
26
27template <class char_type>
28simdutf_constexpr23 bool is_ignorable(char_type c,
29 simdutf::base64_options options) {
30 const uint8_t *to_base64 =
31 (options & base64_default_or_url)
32 ? tables::base64::to_base64_default_or_url_value
33 : ((options & base64_url) ? tables::base64::to_base64_url_value
34 : tables::base64::to_base64_value);
35 const bool ignore_garbage =
36 (options == base64_options::base64_url_accept_garbage) ||
37 (options == base64_options::base64_default_accept_garbage) ||
38 (options == base64_options::base64_default_or_url_accept_garbage);
39 uint8_t code = to_base64[uint8_t(c)];
40 if (is_eight_byte(c) && code <= 63) {
41 return false;
42 }
43 if (is_eight_byte(c) && code == 64) {
44 return true;
45 }
46 return ignore_garbage;
47}
48template <class char_type>
49simdutf_constexpr23 bool is_base64(char_type c,
50 simdutf::base64_options options) {
51 const uint8_t *to_base64 =
52 (options & base64_default_or_url)
53 ? tables::base64::to_base64_default_or_url_value
54 : ((options & base64_url) ? tables::base64::to_base64_url_value
55 : tables::base64::to_base64_value);
56 uint8_t code = to_base64[uint8_t(c)];
57 if (is_eight_byte(c) && code <= 63) {
58 return true;
59 }
60 return false;
61}
62
63template <class char_type>
64simdutf_constexpr23 bool is_base64_or_padding(char_type c,
65 simdutf::base64_options options) {
66 const uint8_t *to_base64 =
67 (options & base64_default_or_url)
68 ? tables::base64::to_base64_default_or_url_value
69 : ((options & base64_url) ? tables::base64::to_base64_url_value
70 : tables::base64::to_base64_value);
71 if (c == '=') {
72 return true;
73 }
74 uint8_t code = to_base64[uint8_t(c)];
75 if (is_eight_byte(c) && code <= 63) {
76 return true;
77 }
78 return false;
79}
80
81template <class char_type>
82bool is_ignorable_or_padding(char_type c, simdutf::base64_options options) {
83 return is_ignorable(c, options) || c == '=';
84}
85
// Description of a base64 input once its end has been analyzed (see find_end).
struct reduced_input {
  // Number of padding characters '=' that were found, typically 0, 1 or 2.
  size_t equalsigns;
  // Location of the first padding character, if any.
  size_t equallocation;
  // Length of the input buffer before the padding.
  size_t srclen;
  // Length of the input buffer with padding but without ignorable characters.
  size_t full_input_length;
};
93
94// find the end of the base64 input buffer
95// It returns the number of padding characters, the location of the first
96// padding character if any, the length of the input buffer before padding
97// and the length of the input buffer with padding. The input buffer is not
98// modified. The function assumes that there are at most two padding characters.
99template <class char_type>
100simdutf_constexpr23 reduced_input find_end(const char_type *src, size_t srclen,
101 simdutf::base64_options options) {
102 const uint8_t *to_base64 =
103 (options & base64_default_or_url)
104 ? tables::base64::to_base64_default_or_url_value
105 : ((options & base64_url) ? tables::base64::to_base64_url_value
106 : tables::base64::to_base64_value);
107 const bool ignore_garbage =
108 (options == base64_options::base64_url_accept_garbage) ||
109 (options == base64_options::base64_default_accept_garbage) ||
110 (options == base64_options::base64_default_or_url_accept_garbage);
111
112 size_t equalsigns = 0;
113 // We intentionally include trailing spaces in the full input length.
114 // See https://github.com/simdutf/simdutf/issues/824
115 size_t full_input_length = srclen;
116 // skip trailing spaces
117 while (!ignore_garbage && srclen > 0 &&
118 scalar::base64::is_eight_byte(src[srclen - 1]) &&
119 to_base64[uint8_t(src[srclen - 1])] == 64) {
120 srclen--;
121 }
122 size_t equallocation =
123 srclen; // location of the first padding character if any
124 if (ignore_garbage) {
125 // Technically, we don't need to find the first padding character, we can
126 // just change our algorithms, but it adds substantial complexity.
127 auto it = simdutf::find(src, src + srclen, '=');
128 if (it != src + srclen) {
129 equallocation = it - src;
130 equalsigns = 1;
131 srclen = equallocation;
132 full_input_length = equallocation + 1;
133 }
134 return {equalsigns, equallocation, srclen, full_input_length};
135 }
136 if (!ignore_garbage && srclen > 0 && src[srclen - 1] == '=') {
137 // This is the last '=' sign.
138 equallocation = srclen - 1;
139 srclen--;
140 equalsigns = 1;
141 // skip trailing spaces
142 while (srclen > 0 && scalar::base64::is_eight_byte(src[srclen - 1]) &&
143 to_base64[uint8_t(src[srclen - 1])] == 64) {
144 srclen--;
145 }
146 if (srclen > 0 && src[srclen - 1] == '=') {
147 // This is the second '=' sign.
148 equallocation = srclen - 1;
149 srclen--;
150 equalsigns = 2;
151 }
152 }
153 return {equalsigns, equallocation, srclen, full_input_length};
154}
155
156// Returns true upon success. The destination buffer must be large enough.
157// This functions assumes that the padding (=) has been removed.
158// if check_capacity is true, it will check that the destination buffer is
159// large enough. If it is not, it will return OUTPUT_BUFFER_TOO_SMALL.
160template <bool check_capacity, class char_type>
161simdutf_constexpr23 full_result base64_tail_decode_impl(
162 char *dst, size_t outlen, const char_type *src, size_t length,
163 size_t padding_characters, // number of padding characters
164 // '=', typically 0, 1, 2.
165 base64_options options, last_chunk_handling_options last_chunk_options) {
166 char *dstend = dst + outlen;
167 (void)dstend;
168 // This looks like 10 branches, but we expect the compiler to resolve this to
169 // two branches (easily predicted):
170 const uint8_t *to_base64 =
171 (options & base64_default_or_url)
172 ? tables::base64::to_base64_default_or_url_value
173 : ((options & base64_url) ? tables::base64::to_base64_url_value
174 : tables::base64::to_base64_value);
175 const uint32_t *d0 =
176 (options & base64_default_or_url)
177 ? tables::base64::base64_default_or_url::d0
178 : ((options & base64_url) ? tables::base64::base64_url::d0
179 : tables::base64::base64_default::d0);
180 const uint32_t *d1 =
181 (options & base64_default_or_url)
182 ? tables::base64::base64_default_or_url::d1
183 : ((options & base64_url) ? tables::base64::base64_url::d1
184 : tables::base64::base64_default::d1);
185 const uint32_t *d2 =
186 (options & base64_default_or_url)
187 ? tables::base64::base64_default_or_url::d2
188 : ((options & base64_url) ? tables::base64::base64_url::d2
189 : tables::base64::base64_default::d2);
190 const uint32_t *d3 =
191 (options & base64_default_or_url)
192 ? tables::base64::base64_default_or_url::d3
193 : ((options & base64_url) ? tables::base64::base64_url::d3
194 : tables::base64::base64_default::d3);
195 const bool ignore_garbage =
196 (options == base64_options::base64_url_accept_garbage) ||
197 (options == base64_options::base64_default_accept_garbage) ||
198 (options == base64_options::base64_default_or_url_accept_garbage);
199
200 const char_type *srcend = src + length;
201 const char_type *srcinit = src;
202 const char *dstinit = dst;
203
204 uint32_t x;
205 size_t idx;
206 uint8_t buffer[4];
207 while (true) {
208 while (srcend - src >= 4 && is_eight_byte(src[0]) &&
209 is_eight_byte(src[1]) && is_eight_byte(src[2]) &&
210 is_eight_byte(src[3]) &&
211 (x = d0[uint8_t(src[0])] | d1[uint8_t(src[1])] |
212 d2[uint8_t(src[2])] | d3[uint8_t(src[3])]) < 0x01FFFFFF) {
213 if (check_capacity && dstend - dst < 3) {
214 return {OUTPUT_BUFFER_TOO_SMALL, size_t(src - srcinit),
215 size_t(dst - dstinit)};
216 }
217 *dst++ = static_cast<char>(x & 0xFF);
218 *dst++ = static_cast<char>((x >> 8) & 0xFF);
219 *dst++ = static_cast<char>((x >> 16) & 0xFF);
220 src += 4;
221 }
222 const char_type *srccur = src;
223 idx = 0;
224 // we need at least four characters.
225#ifdef __clang__
226 // If possible, we read four characters at a time. (It is an optimization.)
227 if (ignore_garbage && src + 4 <= srcend) {
228 char_type c0 = src[0];
229 char_type c1 = src[1];
230 char_type c2 = src[2];
231 char_type c3 = src[3];
232
233 uint8_t code0 = to_base64[uint8_t(c0)];
234 uint8_t code1 = to_base64[uint8_t(c1)];
235 uint8_t code2 = to_base64[uint8_t(c2)];
236 uint8_t code3 = to_base64[uint8_t(c3)];
237
238 buffer[idx] = code0;
239 idx += (is_eight_byte(c0) && code0 <= 63);
240 buffer[idx] = code1;
241 idx += (is_eight_byte(c1) && code1 <= 63);
242 buffer[idx] = code2;
243 idx += (is_eight_byte(c2) && code2 <= 63);
244 buffer[idx] = code3;
245 idx += (is_eight_byte(c3) && code3 <= 63);
246 src += 4;
247 }
248#endif
249 while ((idx < 4) && (src < srcend)) {
250 char_type c = *src;
251
252 uint8_t code = to_base64[uint8_t(c)];
253 buffer[idx] = uint8_t(code);
254 if (is_eight_byte(c) && code <= 63) {
255 idx++;
256 } else if (!ignore_garbage &&
257 (code > 64 || !scalar::base64::is_eight_byte(c))) {
258 return {INVALID_BASE64_CHARACTER, size_t(src - srcinit),
259 size_t(dst - dstinit)};
260 } else {
261 // We have a space or a newline or garbage. We ignore it.
262 }
263 src++;
264 }
265 if (idx != 4) {
266 simdutf_log_assert(idx < 4, "idx should be less than 4");
267 // We never should have that the number of base64 characters + the
268 // number of padding characters is more than 4.
269 if (!ignore_garbage && (idx + padding_characters > 4)) {
270 return {INVALID_BASE64_CHARACTER, size_t(src - srcinit),
271 size_t(dst - dstinit), true};
272 }
273
274 // The idea here is that in loose mode,
275 // if there is padding at all, it must be used
276 // to form 4-wise chunk. However, in loose mode,
277 // we do accept no padding at all.
278 if (!ignore_garbage &&
279 last_chunk_options == last_chunk_handling_options::loose &&
280 (idx >= 2) && padding_characters > 0 &&
281 ((idx + padding_characters) & 3) != 0) {
282 return {INVALID_BASE64_CHARACTER, size_t(src - srcinit),
283 size_t(dst - dstinit), true};
284 } else
285
286 // The idea here is that in strict mode, we do not want to accept
287 // incomplete base64 chunks. So if the chunk was otherwise valid, we
288 // return BASE64_INPUT_REMAINDER.
289 if (!ignore_garbage &&
290 last_chunk_options == last_chunk_handling_options::strict &&
291 (idx >= 2) && ((idx + padding_characters) & 3) != 0) {
292 // The partial chunk was at src - idx
293 return {BASE64_INPUT_REMAINDER, size_t(src - srcinit),
294 size_t(dst - dstinit), true};
295 } else
296 // If there is a partial chunk with insufficient padding, with
297 // stop_before_partial, we need to just ignore it. In "only full"
298 // mode, skip the minute there are padding characters.
299 if ((last_chunk_options ==
300 last_chunk_handling_options::stop_before_partial &&
301 (padding_characters + idx < 4) && (idx != 0) &&
302 (idx >= 2 || padding_characters == 0)) ||
303 (last_chunk_options ==
304 last_chunk_handling_options::only_full_chunks &&
305 (idx >= 2 || padding_characters == 0))) {
306 // partial means that we are *not* going to consume the read
307 // characters. We need to rewind the src pointer.
308 src = srccur;
309 return {SUCCESS, size_t(src - srcinit), size_t(dst - dstinit)};
310 } else {
311 if (idx == 2) {
312 uint32_t triple = (uint32_t(buffer[0]) << 3 * 6) +
313 (uint32_t(buffer[1]) << 2 * 6);
314 if (!ignore_garbage &&
315 (last_chunk_options == last_chunk_handling_options::strict) &&
316 (triple & 0xffff)) {
317 return {BASE64_EXTRA_BITS, size_t(src - srcinit),
318 size_t(dst - dstinit)};
319 }
320 if (check_capacity && dstend - dst < 1) {
321 return {OUTPUT_BUFFER_TOO_SMALL, size_t(srccur - srcinit),
322 size_t(dst - dstinit)};
323 }
324 *dst++ = static_cast<char>((triple >> 16) & 0xFF);
325 } else if (idx == 3) {
326 uint32_t triple = (uint32_t(buffer[0]) << 3 * 6) +
327 (uint32_t(buffer[1]) << 2 * 6) +
328 (uint32_t(buffer[2]) << 1 * 6);
329 if (!ignore_garbage &&
330 (last_chunk_options == last_chunk_handling_options::strict) &&
331 (triple & 0xff)) {
332 return {BASE64_EXTRA_BITS, size_t(src - srcinit),
333 size_t(dst - dstinit)};
334 }
335 if (check_capacity && dstend - dst < 2) {
336 return {OUTPUT_BUFFER_TOO_SMALL, size_t(srccur - srcinit),
337 size_t(dst - dstinit)};
338 }
339 *dst++ = static_cast<char>((triple >> 16) & 0xFF);
340 *dst++ = static_cast<char>((triple >> 8) & 0xFF);
341 } else if (!ignore_garbage && idx == 1 &&
342 (!is_partial(last_chunk_options) ||
343 (is_partial(last_chunk_options) &&
344 padding_characters > 0))) {
345 return {BASE64_INPUT_REMAINDER, size_t(src - srcinit),
346 size_t(dst - dstinit)};
347 } else if (!ignore_garbage && idx == 0 && padding_characters > 0) {
348 return {INVALID_BASE64_CHARACTER, size_t(src - srcinit),
349 size_t(dst - dstinit), true};
350 }
351 return {SUCCESS, size_t(src - srcinit), size_t(dst - dstinit)};
352 }
353 }
354 if (check_capacity && dstend - dst < 3) {
355 return {OUTPUT_BUFFER_TOO_SMALL, size_t(srccur - srcinit),
356 size_t(dst - dstinit)};
357 }
358 uint32_t triple =
359 (uint32_t(buffer[0]) << 3 * 6) + (uint32_t(buffer[1]) << 2 * 6) +
360 (uint32_t(buffer[2]) << 1 * 6) + (uint32_t(buffer[3]) << 0 * 6);
361 *dst++ = static_cast<char>((triple >> 16) & 0xFF);
362 *dst++ = static_cast<char>((triple >> 8) & 0xFF);
363 *dst++ = static_cast<char>(triple & 0xFF);
364 }
365}
366
367template <class char_type>
368simdutf_constexpr23 full_result base64_tail_decode(
369 char *dst, const char_type *src, size_t length,
370 size_t padding_characters, // number of padding characters
371 // '=', typically 0, 1, 2.
372 base64_options options, last_chunk_handling_options last_chunk_options) {
373 return base64_tail_decode_impl<false>(dst, 0, src, length, padding_characters,
374 options, last_chunk_options);
375}
376
377// like base64_tail_decode, but it will not write past the end of the output
378// buffer. The outlen parameter is modified to reflect the number of bytes
379// written. This functions assumes that the padding (=) has been removed.
380//
381template <class char_type>
382simdutf_constexpr23 full_result base64_tail_decode_safe(
383 char *dst, size_t outlen, const char_type *src, size_t length,
384 size_t padding_characters, // number of padding characters
385 // '=', typically 0, 1, 2.
386 base64_options options, last_chunk_handling_options last_chunk_options) {
387 return base64_tail_decode_impl<true>(dst, outlen, src, length,
388 padding_characters, options,
389 last_chunk_options);
390}
391
392inline simdutf_constexpr23 full_result
393patch_tail_result(full_result r, size_t previous_input, size_t previous_output,
394 size_t equallocation, size_t full_input_length,
395 last_chunk_handling_options last_chunk_options) {
396 r.input_count += previous_input;
397 r.output_count += previous_output;
398 if (r.padding_error) {
399 r.input_count = equallocation;
400 }
401
402 if (r.error == error_code::SUCCESS) {
403 if (!is_partial(last_chunk_options)) {
404 // A success when we are not in stop_before_partial mode.
405 // means that we have consumed the whole input buffer.
406 r.input_count = full_input_length;
407 } else if (r.output_count % 3 != 0) {
408 r.input_count = full_input_length;
409 }
410 }
411 return r;
412}
413
414// Returns the number of bytes written. The destination buffer must be large
415// enough. It will add padding (=) if needed.
416template <bool use_lines = false>
417simdutf_constexpr23 size_t tail_encode_base64_impl(
418 char *dst, const char *src, size_t srclen, base64_options options,
419 size_t line_length = simdutf::default_line_length, size_t line_offset = 0) {
420 if constexpr (use_lines) {
421 // sanitize line_length and starting_line_offset.
422 // line_length must be greater than 3.
423 if (line_length < 4) {
424 line_length = 4;
425 }
426 simdutf_log_assert(line_offset <= line_length,
427 "line_offset should be less than line_length");
428 }
429 // By default, we use padding if we are not using the URL variant.
430 // This is check with ((options & base64_url) == 0) which returns true if we
431 // are not using the URL variant. However, we also allow 'inversion' of the
432 // convention with the base64_reverse_padding option. If the
433 // base64_reverse_padding option is set, we use padding if we are using the
434 // URL variant, and we omit it if we are not using the URL variant. This is
435 // checked with
436 // ((options & base64_reverse_padding) == base64_reverse_padding).
437 bool use_padding =
438 ((options & base64_url) == 0) ^
439 ((options & base64_reverse_padding) == base64_reverse_padding);
440 // This looks like 3 branches, but we expect the compiler to resolve this to
441 // a single branch:
442 const char *e0 = (options & base64_url) ? tables::base64::base64_url::e0
443 : tables::base64::base64_default::e0;
444 const char *e1 = (options & base64_url) ? tables::base64::base64_url::e1
445 : tables::base64::base64_default::e1;
446 const char *e2 = (options & base64_url) ? tables::base64::base64_url::e2
447 : tables::base64::base64_default::e2;
448 char *out = dst;
449 size_t i = 0;
450 uint8_t t1, t2, t3;
451 for (; i + 2 < srclen; i += 3) {
452 t1 = uint8_t(src[i]);
453 t2 = uint8_t(src[i + 1]);
454 t3 = uint8_t(src[i + 2]);
455 if constexpr (use_lines) {
456 if (line_offset + 3 >= line_length) {
457 if (line_offset == line_length) {
458 *out++ = '\n';
459 *out++ = e0[t1];
460 *out++ = e1[((t1 & 0x03) << 4) | ((t2 >> 4) & 0x0F)];
461 *out++ = e1[((t2 & 0x0F) << 2) | ((t3 >> 6) & 0x03)];
462 *out++ = e2[t3];
463 line_offset = 4;
464 } else if (line_offset + 1 == line_length) {
465 *out++ = e0[t1];
466 *out++ = '\n';
467 *out++ = e1[((t1 & 0x03) << 4) | ((t2 >> 4) & 0x0F)];
468 *out++ = e1[((t2 & 0x0F) << 2) | ((t3 >> 6) & 0x03)];
469 *out++ = e2[t3];
470 line_offset = 3;
471 } else if (line_offset + 2 == line_length) {
472 *out++ = e0[t1];
473 *out++ = e1[((t1 & 0x03) << 4) | ((t2 >> 4) & 0x0F)];
474 *out++ = '\n';
475 *out++ = e1[((t2 & 0x0F) << 2) | ((t3 >> 6) & 0x03)];
476 *out++ = e2[t3];
477 line_offset = 2;
478 } else if (line_offset + 3 == line_length) {
479 *out++ = e0[t1];
480 *out++ = e1[((t1 & 0x03) << 4) | ((t2 >> 4) & 0x0F)];
481 *out++ = e1[((t2 & 0x0F) << 2) | ((t3 >> 6) & 0x03)];
482 *out++ = '\n';
483 *out++ = e2[t3];
484 line_offset = 1;
485 }
486 } else {
487 *out++ = e0[t1];
488 *out++ = e1[((t1 & 0x03) << 4) | ((t2 >> 4) & 0x0F)];
489 *out++ = e1[((t2 & 0x0F) << 2) | ((t3 >> 6) & 0x03)];
490 *out++ = e2[t3];
491 line_offset += 4;
492 }
493 } else {
494 *out++ = e0[t1];
495 *out++ = e1[((t1 & 0x03) << 4) | ((t2 >> 4) & 0x0F)];
496 *out++ = e1[((t2 & 0x0F) << 2) | ((t3 >> 6) & 0x03)];
497 *out++ = e2[t3];
498 }
499 }
500 switch (srclen - i) {
501 case 0:
502 break;
503 case 1:
504 t1 = uint8_t(src[i]);
505 if constexpr (use_lines) {
506 if (use_padding) {
507 if (line_offset + 3 >= line_length) {
508 if (line_offset == line_length) {
509 *out++ = '\n';
510 *out++ = e0[t1];
511 *out++ = e1[(t1 & 0x03) << 4];
512 *out++ = '=';
513 *out++ = '=';
514 } else if (line_offset + 1 == line_length) {
515 *out++ = e0[t1];
516 *out++ = '\n';
517 *out++ = e1[(t1 & 0x03) << 4];
518 *out++ = '=';
519 *out++ = '=';
520 } else if (line_offset + 2 == line_length) {
521 *out++ = e0[t1];
522 *out++ = e1[(t1 & 0x03) << 4];
523 *out++ = '\n';
524 *out++ = '=';
525 *out++ = '=';
526 } else if (line_offset + 3 == line_length) {
527 *out++ = e0[t1];
528 *out++ = e1[(t1 & 0x03) << 4];
529 *out++ = '=';
530 *out++ = '\n';
531 *out++ = '=';
532 }
533 } else {
534 *out++ = e0[t1];
535 *out++ = e1[(t1 & 0x03) << 4];
536 *out++ = '=';
537 *out++ = '=';
538 }
539 } else {
540 if (line_offset + 2 >= line_length) {
541 if (line_offset == line_length) {
542 *out++ = '\n';
543 *out++ = e0[uint8_t(src[i])];
544 *out++ = e1[(uint8_t(src[i]) & 0x03) << 4];
545 } else if (line_offset + 1 == line_length) {
546 *out++ = e0[uint8_t(src[i])];
547 *out++ = '\n';
548 *out++ = e1[(uint8_t(src[i]) & 0x03) << 4];
549 } else {
550 *out++ = e0[uint8_t(src[i])];
551 *out++ = e1[(uint8_t(src[i]) & 0x03) << 4];
552 // *out++ = '\n'; ==> no newline at the end of the output
553 }
554 } else {
555 *out++ = e0[uint8_t(src[i])];
556 *out++ = e1[(uint8_t(src[i]) & 0x03) << 4];
557 }
558 }
559 } else {
560 *out++ = e0[t1];
561 *out++ = e1[(t1 & 0x03) << 4];
562 if (use_padding) {
563 *out++ = '=';
564 *out++ = '=';
565 }
566 }
567 break;
568 default: /* case 2 */
569 t1 = uint8_t(src[i]);
570 t2 = uint8_t(src[i + 1]);
571 if constexpr (use_lines) {
572 if (use_padding) {
573 if (line_offset + 3 >= line_length) {
574 if (line_offset == line_length) {
575 *out++ = '\n';
576 *out++ = e0[t1];
577 *out++ = e1[((t1 & 0x03) << 4) | ((t2 >> 4) & 0x0F)];
578 *out++ = e2[(t2 & 0x0F) << 2];
579 *out++ = '=';
580 } else if (line_offset + 1 == line_length) {
581 *out++ = e0[t1];
582 *out++ = '\n';
583 *out++ = e1[((t1 & 0x03) << 4) | ((t2 >> 4) & 0x0F)];
584 *out++ = e2[(t2 & 0x0F) << 2];
585 *out++ = '=';
586 } else if (line_offset + 2 == line_length) {
587 *out++ = e0[t1];
588 *out++ = e1[((t1 & 0x03) << 4) | ((t2 >> 4) & 0x0F)];
589 *out++ = '\n';
590 *out++ = e2[(t2 & 0x0F) << 2];
591 *out++ = '=';
592 } else if (line_offset + 3 == line_length) {
593 *out++ = e0[t1];
594 *out++ = e1[((t1 & 0x03) << 4) | ((t2 >> 4) & 0x0F)];
595 *out++ = e2[(t2 & 0x0F) << 2];
596 *out++ = '\n';
597 *out++ = '=';
598 }
599 } else {
600 *out++ = e0[t1];
601 *out++ = e1[((t1 & 0x03) << 4) | ((t2 >> 4) & 0x0F)];
602 *out++ = e2[(t2 & 0x0F) << 2];
603 *out++ = '=';
604 }
605 } else {
606 if (line_offset + 3 >= line_length) {
607 if (line_offset == line_length) {
608 *out++ = '\n';
609 *out++ = e0[t1];
610 *out++ = e1[((t1 & 0x03) << 4) | ((t2 >> 4) & 0x0F)];
611 *out++ = e2[(t2 & 0x0F) << 2];
612 } else if (line_offset + 1 == line_length) {
613 *out++ = e0[t1];
614 *out++ = '\n';
615 *out++ = e1[((t1 & 0x03) << 4) | ((t2 >> 4) & 0x0F)];
616 *out++ = e2[(t2 & 0x0F) << 2];
617 } else if (line_offset + 2 == line_length) {
618 *out++ = e0[t1];
619 *out++ = e1[((t1 & 0x03) << 4) | ((t2 >> 4) & 0x0F)];
620 *out++ = '\n';
621 *out++ = e2[(t2 & 0x0F) << 2];
622 } else {
623 *out++ = e0[t1];
624 *out++ = e1[((t1 & 0x03) << 4) | ((t2 >> 4) & 0x0F)];
625 *out++ = e2[(t2 & 0x0F) << 2];
626 // *out++ = '\n'; ==> no newline at the end of the output
627 }
628 } else {
629 *out++ = e0[t1];
630 *out++ = e1[((t1 & 0x03) << 4) | ((t2 >> 4) & 0x0F)];
631 *out++ = e2[(t2 & 0x0F) << 2];
632 }
633 }
634 } else {
635 *out++ = e0[t1];
636 *out++ = e1[((t1 & 0x03) << 4) | ((t2 >> 4) & 0x0F)];
637 *out++ = e2[(t2 & 0x0F) << 2];
638 if (use_padding) {
639 *out++ = '=';
640 }
641 }
642 }
643 return (size_t)(out - dst);
644}
645
646// Returns the number of bytes written. The destination buffer must be large
647// enough. It will add padding (=) if needed.
648inline simdutf_constexpr23 size_t tail_encode_base64(char *dst, const char *src,
649 size_t srclen,
650 base64_options options) {
651 return tail_encode_base64_impl(dst, src, srclen, options);
652}
653
654template <class InputPtr>
655simdutf_warn_unused simdutf_constexpr23 size_t
656maximal_binary_length_from_base64(InputPtr input, size_t length) noexcept {
657 // We process the padding characters ('=') at the end to make sure
658 // that we return an exact result when the input has no ignorable characters
659 // (e.g., spaces).
660 size_t padding = 0;
661 if (length > 0) {
662 if (input[length - 1] == '=') {
663 padding++;
664 if (length > 1 && input[length - 2] == '=') {
665 padding++;
666 }
667 }
668 }
669 // The input is not otherwise processed for ignorable characters or
670 // validation, so that the function runs in constant time (very fast). In
671 // practice, base64 inputs without ignorable characters are common and the
672 // common case are line separated inputs with relatively long lines (e.g., 76
673 // characters) which leads this function to a slight (1%) overestimation of
674 // the output size.
675 //
676 // Of course, some inputs might contain an arbitrary number of spaces or
677 // newlines, which would make this function return a very pessimistic output
678 // size but systems that produce base64 outputs typically do not do that and
679 // if they do, they do not care much about minimizing memory usage.
680 //
681 // In specialized applications, users may know that their input is line
682 // separated, which can be checked very quickly by by iterating (e.g., over 76
683 // character chunks, looking for the linefeed characters only). We could
684 // provide a specialized function for that, but it is not clear that the added
685 // complexity is worth it for us.
686 //
687 size_t actual_length = length - padding;
688 if (actual_length % 4 <= 1) {
689 return actual_length / 4 * 3;
690 }
691 // if we have a valid input, then the remainder must be 2 or 3 adding one or
692 // two extra bytes.
693 return actual_length / 4 * 3 + (actual_length % 4) - 1;
694}
695
696// This function computes the binary length by iterating through the input
697// and counting non-whitespace characters (excluding padding characters).
698// We use a simple check (c > ' ') which is easy to parallelize and matches
699// SIMD behavior. Only the last few characters are checked for padding '='.
700template <class char_type>
701simdutf_warn_unused simdutf_constexpr23 size_t
702binary_length_from_base64(const char_type *input, size_t length) noexcept {
703 // Count non-whitespace characters (c > ' ') with loop unrolling
704 size_t count = 0;
705 for (size_t i = 0; i < length; i++) {
706 count += (input[i] > ' ');
707 }
708
709 // Check for padding '=' at the end (at most 2 padding characters)
710 // Scan backwards, skipping whitespace, to find padding
711 size_t padding = 0;
712 size_t pos = length;
713 // Skip trailing whitespace
714 while (pos > 0 && padding < 2) {
715 char_type c = input[--pos];
716 if (c == '=') {
717 padding++;
718 } else if (c > ' ') {
719 break;
720 }
721 }
722 return ((count - padding) * 3) / 4;
723}
724
725template <typename char_type>
726simdutf_warn_unused simdutf_constexpr23 full_result
727base64_to_binary_details_impl(
728 const char_type *input, size_t length, char *output, base64_options options,
729 last_chunk_handling_options last_chunk_options) noexcept {
730 const bool ignore_garbage =
731 (options == base64_options::base64_url_accept_garbage) ||
732 (options == base64_options::base64_default_accept_garbage) ||
733 (options == base64_options::base64_default_or_url_accept_garbage);
734 auto ri = simdutf::scalar::base64::find_end(input, length, options);
735 size_t equallocation = ri.equallocation;
736 size_t equalsigns = ri.equalsigns;
737 length = ri.srclen;
738 size_t full_input_length = ri.full_input_length;
739 if (length == 0) {
740 if (!ignore_garbage && equalsigns > 0) {
741 return {INVALID_BASE64_CHARACTER, equallocation, 0, true};
742 }
743 return {SUCCESS, full_input_length, 0};
744 }
745 full_result r = scalar::base64::base64_tail_decode(
746 output, input, length, equalsigns, options, last_chunk_options);
747 r = scalar::base64::patch_tail_result(r, 0, 0, equallocation,
748 full_input_length, last_chunk_options);
749 if (!is_partial(last_chunk_options) && r.error == error_code::SUCCESS &&
750 equalsigns > 0 && !ignore_garbage) {
751 // additional checks
752 if ((r.output_count % 3 == 0) ||
753 ((r.output_count % 3) + 1 + equalsigns != 4)) {
754 return {INVALID_BASE64_CHARACTER, equallocation, r.output_count, true};
755 }
756 }
757 // When is_partial(last_chunk_options) is true, we must either end with
758 // the end of the stream (beyond whitespace) or right after a non-ignorable
759 // character or at the very beginning of the stream.
760 // See https://tc39.es/proposal-arraybuffer-base64/spec/#sec-frombase64
761 if (is_partial(last_chunk_options) && r.error == error_code::SUCCESS &&
762 r.input_count < full_input_length) {
763 // First check if we can extend the input to the end of the stream
764 while (r.input_count < full_input_length &&
765 base64_ignorable(*(input + r.input_count), options)) {
766 r.input_count++;
767 }
768 // If we are still not at the end of the stream, then we must backtrack
769 // to the last non-ignorable character.
770 if (r.input_count < full_input_length) {
771 while (r.input_count > 0 &&
772 base64_ignorable(*(input + r.input_count - 1), options)) {
773 r.input_count--;
774 }
775 }
776 }
777 return r;
778}
779
780template <typename char_type>
781simdutf_constexpr23 simdutf_warn_unused full_result
782base64_to_binary_details_safe_impl(
783 const char_type *input, size_t length, char *output, size_t outlen,
784 base64_options options,
785 last_chunk_handling_options last_chunk_options) noexcept {
786 const bool ignore_garbage =
787 (options == base64_options::base64_url_accept_garbage) ||
788 (options == base64_options::base64_default_accept_garbage) ||
789 (options == base64_options::base64_default_or_url_accept_garbage);
790 auto ri = simdutf::scalar::base64::find_end(input, length, options);
791 size_t equallocation = ri.equallocation;
792 size_t equalsigns = ri.equalsigns;
793 length = ri.srclen;
794 size_t full_input_length = ri.full_input_length;
795 if (length == 0) {
796 if (!ignore_garbage && equalsigns > 0) {
797 return {INVALID_BASE64_CHARACTER, equallocation, 0};
798 }
799 return {SUCCESS, full_input_length, 0};
800 }
801 full_result r = scalar::base64::base64_tail_decode_safe(
802 output, outlen, input, length, equalsigns, options, last_chunk_options);
803 r = scalar::base64::patch_tail_result(r, 0, 0, equallocation,
804 full_input_length, last_chunk_options);
805 if (!is_partial(last_chunk_options) && r.error == error_code::SUCCESS &&
806 equalsigns > 0 && !ignore_garbage) {
807 // additional checks
808 if ((r.output_count % 3 == 0) ||
809 ((r.output_count % 3) + 1 + equalsigns != 4)) {
810 return {INVALID_BASE64_CHARACTER, equallocation, r.output_count};
811 }
812 }
813
814 // When is_partial(last_chunk_options) is true, we must either end with
815 // the end of the stream (beyond whitespace) or right after a non-ignorable
816 // character or at the very beginning of the stream.
817 // See https://tc39.es/proposal-arraybuffer-base64/spec/#sec-frombase64
818 if (is_partial(last_chunk_options) && r.error == error_code::SUCCESS &&
819 r.input_count < full_input_length) {
820 // First check if we can extend the input to the end of the stream
821 while (r.input_count < full_input_length &&
822 base64_ignorable(*(input + r.input_count), options)) {
823 r.input_count++;
824 }
825 // If we are still not at the end of the stream, then we must backtrack
826 // to the last non-ignorable character.
827 if (r.input_count < full_input_length) {
828 while (r.input_count > 0 &&
829 base64_ignorable(*(input + r.input_count - 1), options)) {
830 r.input_count--;
831 }
832 }
833 }
834 return r;
835}
836
837simdutf_warn_unused simdutf_constexpr23 size_t
838base64_length_from_binary(size_t length, base64_options options) noexcept {
839 // By default, we use padding if we are not using the URL variant.
840 // This is check with ((options & base64_url) == 0) which returns true if we
841 // are not using the URL variant. However, we also allow 'inversion' of the
842 // convention with the base64_reverse_padding option. If the
843 // base64_reverse_padding option is set, we use padding if we are using the
844 // URL variant, and we omit it if we are not using the URL variant. This is
845 // checked with
846 // ((options & base64_reverse_padding) == base64_reverse_padding).
847 bool use_padding =
848 ((options & base64_url) == 0) ^
849 ((options & base64_reverse_padding) == base64_reverse_padding);
850 if (!use_padding) {
851 return length / 3 * 4 + ((length % 3) ? (length % 3) + 1 : 0);
852 }
853 return (length + 2) / 3 *
854 4; // We use padding to make the length a multiple of 4.
855}
856
857simdutf_warn_unused simdutf_constexpr23 size_t
858base64_length_from_binary_with_lines(size_t length, base64_options options,
859 size_t line_length) noexcept {
860 if (length == 0) {
861 return 0;
862 }
863 size_t base64_length =
864 scalar::base64::base64_length_from_binary(length, options);
865 if (line_length < 4) {
866 line_length = 4;
867 }
868 size_t lines =
869 (base64_length + line_length - 1) / line_length; // number of lines
870 return base64_length + lines - 1;
871}
872
873// Return the length of the prefix that contains count base64 characters.
874// Thus, if count is 3, the function returns the length of the prefix
875// that contains 3 base64 characters.
876// The function returns (size_t)-1 if there is not enough base64 characters in
877// the input.
878template <typename char_type>
879simdutf_warn_unused size_t prefix_length(size_t count,
880 simdutf::base64_options options,
881 const char_type *input,
882 size_t length) noexcept {
883 size_t i = 0;
884 while (i < length && is_ignorable(input[i], options)) {
885 i++;
886 }
887 if (count == 0) {
888 return i; // duh!
889 }
890 for (; i < length; i++) {
891 if (is_ignorable(input[i], options)) {
892 continue;
893 }
894 // We have a base64 character or a padding character.
895 count--;
896 if (count == 0) {
897 return i + 1;
898 }
899 }
900 simdutf_log_assert(false, "You never get here");
901
902 return -1; // should never happen
903}
904
905} // namespace base64
906} // unnamed namespace
907} // namespace scalar
908} // namespace simdutf
909
910#endif