codec_avx2.c 5.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191
  1. #include <stdint.h>
  2. #include <stddef.h>
  3. #include <stdlib.h>
  4. #include "libbase64.h"
  5. #include "codecs.h"
  6. #ifdef __AVX2__
  7. #include <immintrin.h>
  // Byte-wise helper macros for 256-bit vectors. The scalar argument of each
  // macro is broadcast to every byte lane with _mm256_set1_epi8() before use.

  // Signed per-byte "greater than": a lane is all-ones where vec > val and
  // all-zeros elsewhere.
  #define CMPGT(vec, val) _mm256_cmpgt_epi8((vec), _mm256_set1_epi8(val))

  // Per-byte equality: a lane is all-ones where vec == val, else all-zeros.
  #define CMPEQ(vec, val) _mm256_cmpeq_epi8((vec), _mm256_set1_epi8(val))

  // Bitwise AND of `sel` with the broadcast byte `val`. When `sel` is a
  // compare result this yields `val` in the selected lanes and 0 in the rest.
  #define REPLACE(sel, val) _mm256_and_si256((sel), _mm256_set1_epi8(val))

  // Lanes where lo <= vec <= hi (signed), computed as
  // NOT(vec > hi) AND (vec > lo - 1).
  // NOTE(review): `lo - 1` is evaluated as a plain int before being broadcast,
  // so it presumably has to stay within the signed byte range -- confirm at
  // the call sites (none are visible in this file).
  #define RANGE(vec, lo, hi) _mm256_andnot_si256(CMPGT((vec), (hi)), CMPGT((vec), (lo) - 1))
  // Reverse the byte order inside each of the eight 32-bit words of `in`.
  // NOTE(review): the `_mm256_` prefix mirrors the compiler intrinsics' naming;
  // the name is kept as-is because enc_reshuffle() and dec_reshuffle() call it.
  static inline __m256i
  _mm256_bswap_epi32 (const __m256i in)
  {
      // _mm256_shuffle_epi8() indexes each 128-bit lane independently, so the
      // same 16-entry pattern is written out once per lane.
      const __m256i reverse_words = _mm256_setr_epi8(
           3,  2,  1,  0,    7,  6,  5,  4,   11, 10,  9,  8,   15, 14, 13, 12,
           3,  2,  1,  0,    7,  6,  5,  4,   11, 10,  9,  8,   15, 14, 13, 12);

      return _mm256_shuffle_epi8(in, reverse_words);
  }
  26. static inline __m256i
  27. enc_reshuffle (__m256i in)
  28. {
  29. // Spread out 32-bit words over both halves of the input register:
  30. in = _mm256_permutevar8x32_epi32(in, _mm256_setr_epi32(
  31. 0, 1, 2, -1,
  32. 3, 4, 5, -1));
  33. // Slice into 32-bit chunks and operate on all chunks in parallel.
  34. // All processing is done within the 32-bit chunk. First, shuffle:
  35. // before: [eeeeeeff|ccdddddd|bbbbcccc|aaaaaabb]
  36. // after: [00000000|aaaaaabb|bbbbcccc|ccdddddd]
  37. in = _mm256_shuffle_epi8(in, _mm256_set_epi8(
  38. -1, 9, 10, 11,
  39. -1, 6, 7, 8,
  40. -1, 3, 4, 5,
  41. -1, 0, 1, 2,
  42. -1, 9, 10, 11,
  43. -1, 6, 7, 8,
  44. -1, 3, 4, 5,
  45. -1, 0, 1, 2));
  46. // cd = [00000000|00000000|0000cccc|ccdddddd]
  47. const __m256i cd = _mm256_and_si256(in, _mm256_set1_epi32(0x00000FFF));
  48. // ab = [0000aaaa|aabbbbbb|00000000|00000000]
  49. const __m256i ab = _mm256_and_si256(_mm256_slli_epi32(in, 4), _mm256_set1_epi32(0x0FFF0000));
  50. // merged = [0000aaaa|aabbbbbb|0000cccc|ccdddddd]
  51. const __m256i merged = _mm256_or_si256(ab, cd);
  52. // bd = [00000000|00bbbbbb|00000000|00dddddd]
  53. const __m256i bd = _mm256_and_si256(merged, _mm256_set1_epi32(0x003F003F));
  54. // ac = [00aaaaaa|00000000|00cccccc|00000000]
  55. const __m256i ac = _mm256_and_si256(_mm256_slli_epi32(merged, 2), _mm256_set1_epi32(0x3F003F00));
  56. // indices = [00aaaaaa|00bbbbbb|00cccccc|00dddddd]
  57. const __m256i indices = _mm256_or_si256(ac, bd);
  58. // return = [00dddddd|00cccccc|00bbbbbb|00aaaaaa]
  59. return _mm256_bswap_epi32(indices);
  60. }
  61. static inline __m256i
  62. enc_translate (const __m256i in)
  63. {
  64. // Translate values 0..63 to the Base64 alphabet. There are five sets:
  65. // # From To Abs Delta Characters
  66. // 0 [0..25] [65..90] +65 +65 ABCDEFGHIJKLMNOPQRSTUVWXYZ
  67. // 1 [26..51] [97..122] +71 +6 abcdefghijklmnopqrstuvwxyz
  68. // 2 [52..61] [48..57] -4 -75 0123456789
  69. // 3 [62] [43] -19 -15 +
  70. // 4 [63] [47] -16 +3 /
  71. // Create cumulative masks for characters in sets [1,2,3,4], [2,3,4],
  72. // [3,4], and [4]:
  73. const __m256i mask1 = CMPGT(in, 25);
  74. const __m256i mask2 = CMPGT(in, 51);
  75. const __m256i mask3 = CMPGT(in, 61);
  76. const __m256i mask4 = CMPEQ(in, 63);
  77. // All characters are at least in cumulative set 0, so add 'A':
  78. __m256i out = _mm256_add_epi8(in, _mm256_set1_epi8(65));
  79. // For inputs which are also in any of the other cumulative sets,
  80. // add delta values against the previous set(s) to correct the shift:
  81. out = _mm256_add_epi8(out, REPLACE(mask1, 6));
  82. out = _mm256_sub_epi8(out, REPLACE(mask2, 75));
  83. out = _mm256_sub_epi8(out, REPLACE(mask3, 15));
  84. out = _mm256_add_epi8(out, REPLACE(mask4, 3));
  85. return out;
  86. }
  87. static inline __m256i
  88. dec_reshuffle (__m256i in)
  89. {
  90. // Shuffle bytes to 32-bit bigendian:
  91. in = _mm256_bswap_epi32(in);
  92. // Mask in a single byte per shift:
  93. __m256i mask = _mm256_set1_epi32(0x3F000000);
  94. // Pack bytes together:
  95. __m256i out = _mm256_slli_epi32(_mm256_and_si256(in, mask), 2);
  96. mask = _mm256_srli_epi32(mask, 8);
  97. out = _mm256_or_si256(out, _mm256_slli_epi32(_mm256_and_si256(in, mask), 4));
  98. mask = _mm256_srli_epi32(mask, 8);
  99. out = _mm256_or_si256(out, _mm256_slli_epi32(_mm256_and_si256(in, mask), 6));
  100. mask = _mm256_srli_epi32(mask, 8);
  101. out = _mm256_or_si256(out, _mm256_slli_epi32(_mm256_and_si256(in, mask), 8));
  102. // Pack bytes together within 32-bit words, discarding words 3 and 7:
  103. out = _mm256_shuffle_epi8(out, _mm256_setr_epi8(
  104. 3, 2, 1,
  105. 7, 6, 5,
  106. 11, 10, 9,
  107. 15, 14, 13,
  108. -1, -1, -1, -1,
  109. 3, 2, 1,
  110. 7, 6, 5,
  111. 11, 10, 9,
  112. 15, 14, 13,
  113. -1, -1, -1, -1));
  114. // Pack 32-bit words together, squashing empty words 3 and 7:
  115. return _mm256_permutevar8x32_epi32(out, _mm256_setr_epi32(
  116. 0, 1, 2, 4, 5, 6, -1, -1));
  117. }
  118. #endif // __AVX2__
  // AVX2 streaming encode entry point.
  //
  // Without __AVX2__ at compile time there is no implementation: the arguments
  // are marked as unused and the call ends in abort(). With __AVX2__ the body
  // is assembled textually from enc_head.c, enc_avx2.c and enc_tail.c. Those
  // fragments are not visible here; presumably they refer to the parameters by
  // name, so the parameter names must not change.
  void
  avx2_base64_stream_encode (struct avx2_base64_state *state,
                             const char *src,
                             size_t srclen,
                             char *out,
                             size_t *outlen)
  {
  #ifndef __AVX2__
      // Built without AVX2 support: silence unused-parameter warnings and
      // refuse to run.
      (void)state;
      (void)src;
      (void)srclen;
      (void)out;
      (void)outlen;
      abort();
  #else
      #include "enc_head.c"
      #include "enc_avx2.c"
      #include "enc_tail.c"
  #endif
  }
  // AVX2 streaming decode entry point.
  //
  // Without __AVX2__ at compile time there is no implementation: the arguments
  // are marked as unused and the call ends in abort(), so no value is ever
  // returned on that path. With __AVX2__ the body is assembled textually from
  // dec_head.c, dec_avx2.c and dec_tail.c. Those fragments are not visible
  // here; presumably they refer to the parameters by name and produce the int
  // return value -- confirm against the fragments.
  int
  avx2_base64_stream_decode (struct avx2_base64_state *state,
                             const char *src,
                             size_t srclen,
                             char *out,
                             size_t *outlen)
  {
  #ifndef __AVX2__
      // Built without AVX2 support: silence unused-parameter warnings and
      // refuse to run.
      (void)state;
      (void)src;
      (void)srclen;
      (void)out;
      (void)outlen;
      abort();
  #else
      #include "dec_head.c"
      #include "dec_avx2.c"
      #include "dec_tail.c"
  #endif
  }