#include #include #include #include "libbase64.h" #include "codecs.h" #ifdef __AVX2__ #include #define CMPGT(s,n) _mm256_cmpgt_epi8((s), _mm256_set1_epi8(n)) #define CMPEQ(s,n) _mm256_cmpeq_epi8((s), _mm256_set1_epi8(n)) #define REPLACE(s,n) _mm256_and_si256((s), _mm256_set1_epi8(n)) #define RANGE(s,a,b) _mm256_andnot_si256(CMPGT((s), (b)), CMPGT((s), (a) - 1)) static inline __m256i _mm256_bswap_epi32 (const __m256i in) { // _mm256_shuffle_epi8() works on two 128-bit lanes separately: return _mm256_shuffle_epi8(in, _mm256_setr_epi8( 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12, 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12)); } static inline __m256i enc_reshuffle (__m256i in) { // Spread out 32-bit words over both halves of the input register: in = _mm256_permutevar8x32_epi32(in, _mm256_setr_epi32( 0, 1, 2, -1, 3, 4, 5, -1)); // Slice into 32-bit chunks and operate on all chunks in parallel. // All processing is done within the 32-bit chunk. First, shuffle: // before: [eeeeeeff|ccdddddd|bbbbcccc|aaaaaabb] // after: [00000000|aaaaaabb|bbbbcccc|ccdddddd] in = _mm256_shuffle_epi8(in, _mm256_set_epi8( -1, 9, 10, 11, -1, 6, 7, 8, -1, 3, 4, 5, -1, 0, 1, 2, -1, 9, 10, 11, -1, 6, 7, 8, -1, 3, 4, 5, -1, 0, 1, 2)); // cd = [00000000|00000000|0000cccc|ccdddddd] const __m256i cd = _mm256_and_si256(in, _mm256_set1_epi32(0x00000FFF)); // ab = [0000aaaa|aabbbbbb|00000000|00000000] const __m256i ab = _mm256_and_si256(_mm256_slli_epi32(in, 4), _mm256_set1_epi32(0x0FFF0000)); // merged = [0000aaaa|aabbbbbb|0000cccc|ccdddddd] const __m256i merged = _mm256_or_si256(ab, cd); // bd = [00000000|00bbbbbb|00000000|00dddddd] const __m256i bd = _mm256_and_si256(merged, _mm256_set1_epi32(0x003F003F)); // ac = [00aaaaaa|00000000|00cccccc|00000000] const __m256i ac = _mm256_and_si256(_mm256_slli_epi32(merged, 2), _mm256_set1_epi32(0x3F003F00)); // indices = [00aaaaaa|00bbbbbb|00cccccc|00dddddd] const __m256i indices = _mm256_or_si256(ac, bd); // return = [00dddddd|00cccccc|00bbbbbb|00aaaaaa] return _mm256_bswap_epi32(indices); } static inline __m256i enc_translate (const __m256i in) { // Translate values 0..63 to the Base64 alphabet. There are five sets: // # From To Abs Delta Characters // 0 [0..25] [65..90] +65 +65 ABCDEFGHIJKLMNOPQRSTUVWXYZ // 1 [26..51] [97..122] +71 +6 abcdefghijklmnopqrstuvwxyz // 2 [52..61] [48..57] -4 -75 0123456789 // 3 [62] [43] -19 -15 + // 4 [63] [47] -16 +3 / // Create cumulative masks for characters in sets [1,2,3,4], [2,3,4], // [3,4], and [4]: const __m256i mask1 = CMPGT(in, 25); const __m256i mask2 = CMPGT(in, 51); const __m256i mask3 = CMPGT(in, 61); const __m256i mask4 = CMPEQ(in, 63); // All characters are at least in cumulative set 0, so add 'A': __m256i out = _mm256_add_epi8(in, _mm256_set1_epi8(65)); // For inputs which are also in any of the other cumulative sets, // add delta values against the previous set(s) to correct the shift: out = _mm256_add_epi8(out, REPLACE(mask1, 6)); out = _mm256_sub_epi8(out, REPLACE(mask2, 75)); out = _mm256_sub_epi8(out, REPLACE(mask3, 15)); out = _mm256_add_epi8(out, REPLACE(mask4, 3)); return out; } static inline __m256i dec_reshuffle (__m256i in) { // Shuffle bytes to 32-bit bigendian: in = _mm256_bswap_epi32(in); // Mask in a single byte per shift: __m256i mask = _mm256_set1_epi32(0x3F000000); // Pack bytes together: __m256i out = _mm256_slli_epi32(_mm256_and_si256(in, mask), 2); mask = _mm256_srli_epi32(mask, 8); out = _mm256_or_si256(out, _mm256_slli_epi32(_mm256_and_si256(in, mask), 4)); mask = _mm256_srli_epi32(mask, 8); out = _mm256_or_si256(out, _mm256_slli_epi32(_mm256_and_si256(in, mask), 6)); mask = _mm256_srli_epi32(mask, 8); out = _mm256_or_si256(out, _mm256_slli_epi32(_mm256_and_si256(in, mask), 8)); // Pack bytes together within 32-bit words, discarding words 3 and 7: out = _mm256_shuffle_epi8(out, _mm256_setr_epi8( 3, 2, 1, 7, 6, 5, 11, 10, 9, 15, 14, 13, -1, -1, -1, -1, 3, 2, 1, 7, 6, 5, 11, 10, 9, 15, 14, 13, -1, -1, -1, -1)); // Pack 32-bit words together, squashing empty words 3 and 7: return _mm256_permutevar8x32_epi32(out, _mm256_setr_epi32( 0, 1, 2, 4, 5, 6, -1, -1)); } #endif // __AVX2__ void avx2_base64_stream_encode ( struct avx2_base64_state *state , const char *src , size_t srclen , char *out , size_t *outlen ) { #if defined(__AVX2__) #include "enc_head.c" #include "enc_avx2.c" #include "enc_tail.c" #else (void)state; (void)src; (void)srclen; (void)out; (void)outlen; abort(); #endif } int avx2_base64_stream_decode ( struct avx2_base64_state *state , const char *src , size_t srclen , char *out , size_t *outlen ) { #if defined(__AVX2__) #include "dec_head.c" #include "dec_avx2.c" #include "dec_tail.c" #else (void)state; (void)src; (void)srclen; (void)out; (void)outlen; abort(); #endif }