blake2b_avx2.h 5.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104
  1. #pragma once
  2. #include <immintrin.h>
  3. #include "blake2b.h"
  4. #include <library/cpp/digest/argonish/internal/rotations/rotations_avx2.h>
  5. namespace NArgonish {
  6. template <>
  7. void* TBlake2B<EInstructionSet::AVX2>::GetIV_() const {
  8. static const __m256i Iv[2] = {
  9. _mm256_set_epi64x(0xa54ff53a5f1d36f1ULL, 0x3c6ef372fe94f82bULL, 0xbb67ae8584caa73bULL, 0x6a09e667f3bcc908ULL),
  10. _mm256_set_epi64x(0x5be0cd19137e2179ULL, 0x1f83d9abfb41bd6bULL, 0x9b05688c2b3e6c1fULL, 0x510e527fade682d1ULL)};
  11. return (void*)Iv;
  12. }
  13. template <>
  14. void TBlake2B<EInstructionSet::AVX2>::InitialXor_(ui8* h, const ui8* p) {
  15. __m256i* iv = (__m256i*)GetIV_();
  16. __m256i* m_res = (__m256i*)h;
  17. const __m256i* m_second = (__m256i*)p;
  18. _mm256_storeu_si256(m_res, _mm256_xor_si256(iv[0], _mm256_loadu_si256(m_second)));
  19. _mm256_storeu_si256(m_res + 1, _mm256_xor_si256(iv[1], _mm256_loadu_si256(m_second + 1)));
  20. }
  21. /*
  22. * a = v0, v1, v2, v3
  23. * b = v4, v5, v6, v7
  24. * c = v8, v9, v10, v11
  25. * d = v12, v13, v14, v15
  26. */
  27. static inline void G1AVX2(ui32 r, __m256i& a, __m256i& b, __m256i& c, __m256i& d, const ui64* blk, const __m128i vindex[12][4]) {
  28. a = _mm256_add_epi64(a, _mm256_add_epi64(b, _mm256_i32gather_epi64((const long long int*)blk, vindex[r][0], 8)));
  29. d = Rotr32(_mm256_xor_si256(a, d));
  30. c = _mm256_add_epi64(c, d);
  31. b = Rotr24(_mm256_xor_si256(b, c));
  32. a = _mm256_add_epi64(a, _mm256_add_epi64(b, _mm256_i32gather_epi64((const long long int*)blk, vindex[r][1], 8)));
  33. d = Rotr16(_mm256_xor_si256(a, d));
  34. c = _mm256_add_epi64(c, d);
  35. b = Rotr63(_mm256_xor_si256(b, c));
  36. }
  37. static inline void G2AVX2(ui32 r, __m256i& a, __m256i& b, __m256i& c, __m256i& d, const ui64* blk, const __m128i vindex[12][4]) {
  38. a = _mm256_add_epi64(a, _mm256_add_epi64(b, _mm256_i32gather_epi64((const long long int*)blk, vindex[r][2], 8)));
  39. d = Rotr32(_mm256_xor_si256(a, d));
  40. c = _mm256_add_epi64(c, d);
  41. b = Rotr24(_mm256_xor_si256(b, c));
  42. a = _mm256_add_epi64(a, _mm256_add_epi64(b, _mm256_i32gather_epi64((const long long int*)blk, vindex[r][3], 8)));
  43. d = Rotr16(_mm256_xor_si256(a, d));
  44. c = _mm256_add_epi64(c, d);
  45. b = Rotr63(_mm256_xor_si256(b, c));
  46. }
  47. static inline void Diagonalize(__m256i& b, __m256i& c, __m256i& d) {
  48. b = _mm256_permute4x64_epi64(b, _MM_SHUFFLE(0, 3, 2, 1));
  49. c = _mm256_permute4x64_epi64(c, _MM_SHUFFLE(1, 0, 3, 2));
  50. d = _mm256_permute4x64_epi64(d, _MM_SHUFFLE(2, 1, 0, 3));
  51. }
  52. static inline void Undiagonalize(__m256i& b, __m256i& c, __m256i& d) {
  53. b = _mm256_permute4x64_epi64(b, _MM_SHUFFLE(2, 1, 0, 3));
  54. c = _mm256_permute4x64_epi64(c, _MM_SHUFFLE(1, 0, 3, 2));
  55. d = _mm256_permute4x64_epi64(d, _MM_SHUFFLE(0, 3, 2, 1));
  56. }
  57. template <>
  58. void TBlake2B<EInstructionSet::AVX2>::Compress_(const ui64 block[BLAKE2B_BLOCKQWORDS]) {
  59. static const __m128i VIndex[12][4] = {
  60. {_mm_set_epi32(6, 4, 2, 0), _mm_set_epi32(7, 5, 3, 1), _mm_set_epi32(14, 12, 10, 8), _mm_set_epi32(15, 13, 11, 9)},
  61. {_mm_set_epi32(13, 9, 4, 14), _mm_set_epi32(6, 15, 8, 10), _mm_set_epi32(5, 11, 0, 1), _mm_set_epi32(3, 7, 2, 12)},
  62. {_mm_set_epi32(15, 5, 12, 11), _mm_set_epi32(13, 2, 0, 8), _mm_set_epi32(9, 7, 3, 10), _mm_set_epi32(4, 1, 6, 14)},
  63. {_mm_set_epi32(11, 13, 3, 7), _mm_set_epi32(14, 12, 1, 9), _mm_set_epi32(15, 4, 5, 2), _mm_set_epi32(8, 0, 10, 6)},
  64. {_mm_set_epi32(10, 2, 5, 9), _mm_set_epi32(15, 4, 7, 0), _mm_set_epi32(3, 6, 11, 14), _mm_set_epi32(13, 8, 12, 1)},
  65. {_mm_set_epi32(8, 0, 6, 2), _mm_set_epi32(3, 11, 10, 12), _mm_set_epi32(1, 15, 7, 4), _mm_set_epi32(9, 14, 5, 13)},
  66. {_mm_set_epi32(4, 14, 1, 12), _mm_set_epi32(10, 13, 15, 5), _mm_set_epi32(8, 9, 6, 0), _mm_set_epi32(11, 2, 3, 7)},
  67. {_mm_set_epi32(3, 12, 7, 13), _mm_set_epi32(9, 1, 14, 11), _mm_set_epi32(2, 8, 15, 5), _mm_set_epi32(10, 6, 4, 0)},
  68. {_mm_set_epi32(0, 11, 14, 6), _mm_set_epi32(8, 3, 9, 15), _mm_set_epi32(10, 1, 13, 12), _mm_set_epi32(5, 4, 7, 2)},
  69. {_mm_set_epi32(1, 7, 8, 10), _mm_set_epi32(5, 6, 4, 2), _mm_set_epi32(13, 3, 9, 15), _mm_set_epi32(0, 12, 14, 11)},
  70. {_mm_set_epi32(6, 4, 2, 0), _mm_set_epi32(7, 5, 3, 1), _mm_set_epi32(14, 12, 10, 8), _mm_set_epi32(15, 13, 11, 9)},
  71. {_mm_set_epi32(13, 9, 4, 14), _mm_set_epi32(6, 15, 8, 10), _mm_set_epi32(5, 11, 0, 1), _mm_set_epi32(3, 7, 2, 12)},
  72. };
  73. __m256i* iv = (__m256i*)GetIV_();
  74. __m256i a = _mm256_loadu_si256((__m256i*)&State_.H[0]);
  75. __m256i b = _mm256_loadu_si256((__m256i*)&State_.H[4]);
  76. __m256i c = iv[0];
  77. __m256i d = _mm256_xor_si256(iv[1], _mm256_loadu_si256((__m256i*)&State_.T[0]));
  78. for (ui32 r = 0; r < 12; ++r) {
  79. G1AVX2(r, a, b, c, d, block, VIndex);
  80. Diagonalize(b, c, d);
  81. G2AVX2(r, a, b, c, d, block, VIndex);
  82. Undiagonalize(b, c, d);
  83. }
  84. _mm256_storeu_si256((__m256i*)State_.H, _mm256_xor_si256(
  85. _mm256_loadu_si256((__m256i*)State_.H),
  86. _mm256_xor_si256(a, c)));
  87. _mm256_storeu_si256(((__m256i*)State_.H) + 1, _mm256_xor_si256(
  88. _mm256_loadu_si256(((__m256i*)State_.H) + 1),
  89. _mm256_xor_si256(b, d)));
  90. }
  91. }