123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384 |
- /**
- * Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
- * SPDX-License-Identifier: Apache-2.0.
- */
- #include <emmintrin.h>
- #include <immintrin.h>
- #include <stdio.h>
- #include <stdlib.h>
- #include <string.h>
- #include <aws/common/common.h>
- /***** Decode logic *****/
- /*
- * Decodes ranges of bytes in place
- * For each byte of 'in' that is between lo and hi (inclusive), adds offset and _adds_ it to the corresponding offset in
- * out.
- */
static inline __m256i translate_range(__m256i in, uint8_t lo, uint8_t hi, uint8_t offset) {
    __m256i base = _mm256_set1_epi8(lo);
    __m256i span = _mm256_set1_epi8((char)(hi - lo));
    __m256i delta = _mm256_set1_epi8(offset);

    /* Rebase so that an in-range byte becomes a value in [0, hi - lo]. */
    __m256i rebased = _mm256_sub_epi8(in, base);

    /*
     * There is no unsigned byte-compare intrinsic, so test "rebased <= span"
     * with the unsigned min: min(rebased, span) == rebased exactly when the
     * byte was inside [lo, hi].
     */
    __m256i keep = _mm256_cmpeq_epi8(_mm256_min_epu8(rebased, span), rebased);

    /* Apply the offset, then zero every lane that fell outside the range. */
    return _mm256_and_si256(_mm256_add_epi8(rebased, delta), keep);
}
- /*
- * For each 8-bit element in in, if the element equals match, add to the corresponding element in out the value decode.
- */
static inline __m256i translate_exact(__m256i in, uint8_t match, uint8_t decode) {
    /* Lanes equal to `match` compare to 0xFF; AND selects `decode` there, 0 elsewhere. */
    __m256i hit = _mm256_cmpeq_epi8(in, _mm256_set1_epi8(match));
    return _mm256_and_si256(_mm256_set1_epi8(decode), hit);
}
- /*
- * Input: a pointer to a 256-bit vector of base64 characters
- * The pointed-to-vector is replaced by a 256-bit vector of 6-bit decoded parts;
- * on decode failure, returns false, else returns true on success.
- */
- static inline bool decode_vec(__m256i *in) {
- __m256i tmp1, tmp2, tmp3;
- /*
- * Base64 decoding table, see RFC4648
- *
- * Note that we use multiple vector registers to try to allow the CPU to
- * paralellize the merging ORs
- */
- tmp1 = translate_range(*in, 'A', 'Z', 0 + 1);
- tmp2 = translate_range(*in, 'a', 'z', 26 + 1);
- tmp3 = translate_range(*in, '0', '9', 52 + 1);
- tmp1 = _mm256_or_si256(tmp1, translate_exact(*in, '+', 62 + 1));
- tmp2 = _mm256_or_si256(tmp2, translate_exact(*in, '/', 63 + 1));
- tmp3 = _mm256_or_si256(tmp3, _mm256_or_si256(tmp1, tmp2));
- /*
- * We use 0 to mark decode failures, so everything is decoded to one higher
- * than normal. We'll shift this down now.
- */
- *in = _mm256_sub_epi8(tmp3, _mm256_set1_epi8(1));
- /* If any byte is now zero, we had a decode failure */
- __m256i mask = _mm256_cmpeq_epi8(tmp3, _mm256_set1_epi8(0));
- return _mm256_testz_si256(mask, mask);
- }
- AWS_ALIGNED_TYPEDEF(uint8_t, aligned256[32], 32);
- /*
- * Input: a 256-bit vector, interpreted as 32 * 6-bit values
- * Output: a 256-bit vector, the lower 24 bytes of which contain the packed version of the input
- */
static inline __m256i pack_vec(__m256i in) {
    /*
     * Our basic strategy is to split the input vector into three vectors, for each 6-bit component
     * of each 24-bit group, shift the groups into place, then OR the vectors together. Conveniently,
     * we can do this on a (32 bit) dword-by-dword basis.
     *
     * It's important to note that we're interpreting the vector as being little-endian. That is,
     * on entry, we have dwords that look like this:
     *
     * MSB                                      LSB
     * 00DD DDDD 00CC CCCC 00BB BBBB 00AA AAAA
     *
     * And we want to translate to:
     *
     * MSB                                      LSB
     * 0000 0000 AAAA AABB BBBB CCCC CCDD DDDD
     *
     * After which point we can pack these dwords together to produce our final output.
     */
    __m256i maskA = _mm256_set1_epi32(0xFF); // low bits
    __m256i maskB = _mm256_set1_epi32(0xFF00);
    __m256i maskC = _mm256_set1_epi32(0xFF0000);
    __m256i maskD = _mm256_set1_epi32((int)0xFF000000);
    /*
     * Shift each 6-bit field to its target position in the 24-bit result:
     * A: bits 0-5  -> 18-23, B: bits 8-13 -> 12-17,
     * C: bits 16-21 -> 6-11, D: bits 24-29 -> 0-5.
     */
    __m256i bitsA = _mm256_slli_epi32(_mm256_and_si256(in, maskA), 18);
    __m256i bitsB = _mm256_slli_epi32(_mm256_and_si256(in, maskB), 4);
    __m256i bitsC = _mm256_srli_epi32(_mm256_and_si256(in, maskC), 10);
    __m256i bitsD = _mm256_srli_epi32(_mm256_and_si256(in, maskD), 24);
    __m256i dwords = _mm256_or_si256(_mm256_or_si256(bitsA, bitsB), _mm256_or_si256(bitsC, bitsD));
    /*
     * Now we have a series of dwords with empty MSBs.
     * We need to pack them together (and shift down) with a shuffle operation.
     * Unfortunately the shuffle operation operates independently within each 128-bit lane,
     * so we'll need to do this in two steps: First we compact dwords within each lane, then
     * we do a dword shuffle to compact the two lanes together.
     *
     * 15 14 13 12 11 10 09 08 07 06 05 04 03 02 01 00 <- byte index (little endian)
     * -- 09 0a 0b -- 06 07 08 -- 03 04 05 -- 00 01 02 <- data index
     *
     * We also reverse the order of 3-byte fragments within each lane; we've constructed
     * those fragments in little endian but the order of fragments within the overall
     * vector is in memory order (big endian)
     */
    const aligned256 shufvec_buf = {
        /* clang-format off */
        /* MSB */
        0xFF, 0xFF, 0xFF, 0xFF, /* Zero out the top 4 bytes of the lane */
        2, 1, 0,
        6, 5, 4,
        10, 9, 8,
        14, 13, 12,
        0xFF, 0xFF, 0xFF, 0xFF, /* Zero out the top 4 bytes of the lane */
        2, 1, 0,
        6, 5, 4,
        10, 9, 8,
        14, 13, 12
        /* LSB */
        /* clang-format on */
    };
    /* Index 0xFF makes vpshufb write a zero byte, clearing the unused slots. */
    __m256i shufvec = _mm256_load_si256((__m256i const *)&shufvec_buf);
    dwords = _mm256_shuffle_epi8(dwords, shufvec);
    /*
     * Now shuffle the 32-bit words:
     * A B C 0 D E F 0 -> 0 0 A B C D E F
     */
    __m256i shuf32 = _mm256_set_epi32(0, 0, 7, 6, 5, 3, 2, 1);
    dwords = _mm256_permutevar8x32_epi32(dwords, shuf32);
    return dwords;
}
- static inline bool decode(const unsigned char *in, unsigned char *out) {
- __m256i vec = _mm256_loadu_si256((__m256i const *)in);
- if (!decode_vec(&vec)) {
- return false;
- }
- vec = pack_vec(vec);
- /*
- * We'll do overlapping writes to get both the low 128 bits and the high 64-bits written.
- * Input (memory order): 0 1 2 3 4 5 - - (dwords)
- * Input (little endian) - - 5 4 3 2 1 0
- * Output in memory:
- * [0 1 2 3] [4 5]
- */
- __m128i lo = _mm256_extracti128_si256(vec, 0);
- /*
- * Unfortunately some compilers don't support _mm256_extract_epi64,
- * so we'll just copy right out of the vector as a fallback
- */
- #ifdef HAVE_MM256_EXTRACT_EPI64
- uint64_t hi = _mm256_extract_epi64(vec, 2);
- const uint64_t *p_hi = &hi;
- #else
- const uint64_t *p_hi = (uint64_t *)&vec + 2;
- #endif
- _mm_storeu_si128((__m128i *)out, lo);
- memcpy(out + 16, p_hi, sizeof(*p_hi));
- return true;
- }
/*
 * Decodes base64 input (with optional trailing '=' padding, see RFC 4648)
 * from `in` into `out`. `len` must be a multiple of 4. Returns the number of
 * bytes written, or (size_t)-1 on malformed input. Despite the _sse41 name,
 * the implementation uses AVX2 intrinsics.
 */
size_t aws_common_private_base64_decode_sse41(const unsigned char *in, unsigned char *out, size_t len) {
    if (len % 4) {
        /* Valid base64 always comes in groups of 4 characters */
        return (size_t)-1;
    }
    size_t outlen = 0;
    /*
     * Strictly greater-than: the final 32-byte block is deliberately routed
     * through the bounce-buffer path below, since it may contain '=' padding
     * that the fast path does not handle.
     */
    while (len > 32) {
        if (!decode(in, out)) {
            return (size_t)-1;
        }
        len -= 32;
        in += 32;
        out += 24;
        outlen += 24;
    }
    if (len > 0) {
        unsigned char tmp_in[32];
        unsigned char tmp_out[24];
        /* Poison pattern; decode() below overwrites all 24 bytes on success */
        memset(tmp_out, 0xEE, sizeof(tmp_out));
        /* We need to ensure the vector contains valid b64 characters; 'A' decodes to 0 */
        memset(tmp_in, 'A', sizeof(tmp_in));
        memcpy(tmp_in, in, len);
        size_t final_out = (3 * len) / 4;
        /* Check for end-of-string padding (up to 2 characters) */
        for (int i = 0; i < 2; i++) {
            /* len shrinks each round, so this examines the last two characters in turn */
            if (tmp_in[len - 1] == '=') {
                tmp_in[len - 1] = 'A'; /* make sure the inner loop doesn't bail out */
                len--;
                final_out--;
            }
        }
        if (!decode(tmp_in, tmp_out)) {
            return (size_t)-1;
        }
        /*
         * Check that there are no trailing ones bits: bytes past final_out
         * come from the zero-decoding 'A' filler plus any bits spilled from
         * the last real character, so a nonzero byte means a non-canonical
         * (or corrupt) final group.
         */
        for (size_t i = final_out; i < sizeof(tmp_out); i++) {
            if (tmp_out[i]) {
                return (size_t)-1;
            }
        }
        memcpy(out, tmp_out, final_out);
        outlen += final_out;
    }
    return outlen;
}
- /***** Encode logic *****/
- static inline __m256i encode_chars(__m256i in) {
- __m256i tmp1, tmp2, tmp3;
- /*
- * Base64 encoding table, see RFC4648
- *
- * We again use fan-in for the ORs here.
- */
- tmp1 = translate_range(in, 0, 25, 'A');
- tmp2 = translate_range(in, 26, 26 + 25, 'a');
- tmp3 = translate_range(in, 52, 61, '0');
- tmp1 = _mm256_or_si256(tmp1, translate_exact(in, 62, '+'));
- tmp2 = _mm256_or_si256(tmp2, translate_exact(in, 63, '/'));
- return _mm256_or_si256(tmp3, _mm256_or_si256(tmp1, tmp2));
- }
- /*
- * Input: A 256-bit vector, interpreted as 24 bytes (LSB) plus 8 bytes of high-byte padding
- * Output: A 256-bit vector of base64 characters
- */
static inline __m256i encode_stride(__m256i vec) {
    /*
     * First, since byte-shuffle operations operate within 128-bit subvectors, swap around the dwords
     * to balance the amount of actual data between 128-bit subvectors.
     * After this we want the LE representation to look like: -- XX XX XX -- XX XX XX
     */
    __m256i shuf32 = _mm256_set_epi32(7, 5, 4, 3, 6, 2, 1, 0);
    vec = _mm256_permutevar8x32_epi32(vec, shuf32);
    /*
     * Next, within each group of 3 bytes, we need to byteswap into little endian form so our bitshifts
     * will work properly. We also shuffle around so that each dword has one 3-byte group, plus one byte
     * (MSB) of zero-padding.
     * Because this is a byte-shuffle, indexes are within each 128-bit subvector.
     *
     * -- -- -- -- 11 10 09 08 07 06 05 04 03 02 01 00
     */
    const aligned256 shufvec_buf = {
        /* clang-format off */
        /* MSB */
        2, 1, 0, 0xFF,
        5, 4, 3, 0xFF,
        8, 7, 6, 0xFF,
        11, 10, 9, 0xFF,
        2, 1, 0, 0xFF,
        5, 4, 3, 0xFF,
        8, 7, 6, 0xFF,
        11, 10, 9, 0xFF
        /* LSB */
        /* clang-format on */
    };
    /* Index 0xFF makes vpshufb write a zero byte, producing the padding MSBs. */
    vec = _mm256_shuffle_epi8(vec, _mm256_load_si256((__m256i const *)&shufvec_buf));
    /*
     * Now shift and mask to split out 6-bit groups.
     * We'll also do a second byteswap to get back into big-endian
     */
    __m256i mask0 = _mm256_set1_epi32(0x3F);
    __m256i mask1 = _mm256_set1_epi32(0x3F << 6);
    __m256i mask2 = _mm256_set1_epi32(0x3F << 12);
    __m256i mask3 = _mm256_set1_epi32(0x3F << 18);
    __m256i digit0 = _mm256_and_si256(mask0, vec);
    __m256i digit1 = _mm256_and_si256(mask1, vec);
    __m256i digit2 = _mm256_and_si256(mask2, vec);
    __m256i digit3 = _mm256_and_si256(mask3, vec);
    /*
     * Because we want to byteswap, the low-order digit0 goes into the
     * high-order byte. The shift amounts place each 6-bit digit in its
     * own byte: digit0 (bits 0-5) -> 24-29, digit1 (6-11) -> 16-21,
     * digit2 (12-17) -> 8-13, digit3 (18-23) -> 0-5.
     */
    digit0 = _mm256_slli_epi32(digit0, 24);
    digit1 = _mm256_slli_epi32(digit1, 10);
    digit2 = _mm256_srli_epi32(digit2, 4);
    digit3 = _mm256_srli_epi32(digit3, 18);
    vec = _mm256_or_si256(_mm256_or_si256(digit0, digit1), _mm256_or_si256(digit2, digit3));
    /* Finally translate to the base64 character set */
    return encode_chars(vec);
}
/*
 * Encodes `inlen` bytes from `input` as base64 (RFC 4648) into `output`,
 * producing 4 characters per 3 input bytes with '=' padding on a partial
 * final group. The caller must size `output` accordingly. Despite the
 * _sse41 name, the implementation uses AVX2 intrinsics.
 */
void aws_common_private_base64_encode_sse41(const uint8_t *input, uint8_t *output, size_t inlen) {
    __m256i instride, outstride;
    while (inlen >= 32) {
        /*
         * Where possible, we'll load a full vector at a time and ignore the over-read.
         * However, if we have < 32 bytes left, this would result in a potential read
         * of unreadable pages, so we use bounce buffers below.
         */
        instride = _mm256_loadu_si256((__m256i const *)input);
        outstride = encode_stride(instride);
        _mm256_storeu_si256((__m256i *)output, outstride);
        /* Each stride consumes 24 input bytes and emits 32 base64 characters */
        input += 24;
        output += 32;
        inlen -= 24;
    }
    while (inlen) {
        /*
         * We need to go through a bounce buffer for anything remaining, as we
         * don't want to over-read or over-write the ends of the buffers.
         */
        size_t stridelen = inlen > 24 ? 24 : inlen;
        size_t outlen = ((stridelen + 2) / 3) * 4;
        memset(&instride, 0, sizeof(instride));
        memcpy(&instride, input, stridelen);
        outstride = encode_stride(instride);
        memcpy(output, &outstride, outlen);
        if (inlen < 24) {
            /* Partial final group: overwrite the zero-derived tail characters with '=' */
            if (inlen % 3 >= 1) {
                /* AA== or AAA= */
                output[outlen - 1] = '=';
            }
            if (inlen % 3 == 1) {
                /* AA== */
                output[outlen - 2] = '=';
            }
            return;
        }
        input += stridelen;
        output += outlen;
        inlen -= stridelen;
    }
}
|