blake2b_sse2.h

#pragma once

#include <emmintrin.h>

#include "blake2b.h"

#include <library/cpp/digest/argonish/internal/rotations/rotations_sse2.h>
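
// Rotr32/Rotr24/Rotr16/Rotr63 are per-64-bit-lane right rotations defined in
// rotations_sse2.h. A plausible SSE2 sketch (the actual helpers may differ):
// without SSSE3's pshufb, a rotation is typically two shifts and an OR, e.g.
//
//     static inline __m128i Rotr24(__m128i x) {
//         return _mm_or_si128(_mm_srli_epi64(x, 24), _mm_slli_epi64(x, 40));
//     }
//
// while Rotr32 can simply swap the 32-bit halves of each lane:
//     _mm_shuffle_epi32(x, _MM_SHUFFLE(2, 3, 0, 1))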

namespace NArgonish {
    template <>
    void* TBlake2B<EInstructionSet::SSE2>::GetIV_() const {
        // The eight 64-bit BLAKE2b IV words, packed low/high into four vectors.
        static const __m128i Iv[4] = {
            _mm_set_epi64x(0xbb67ae8584caa73bULL, 0x6a09e667f3bcc908ULL),
            _mm_set_epi64x(0xa54ff53a5f1d36f1ULL, 0x3c6ef372fe94f82bULL),
            _mm_set_epi64x(0x9b05688c2b3e6c1fULL, 0x510e527fade682d1ULL),
            _mm_set_epi64x(0x5be0cd19137e2179ULL, 0x1f83d9abfb41bd6bULL)};
        return (void*)Iv;
    }
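
    // Message schedule from the BLAKE2b specification (RFC 7693): in round r,
    // the G functions consume block words in the order given by Sigma[r].
    // BLAKE2b runs 12 rounds, so rows 10 and 11 repeat rows 0 and 1.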
    static const ui32 Sigma[12][16] = {
        {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15},
        {14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3},
        {11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4},
        {7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8},
        {9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13},
        {2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9},
        {12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11},
        {13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10},
        {6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5},
        {10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0},
        {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15},
        {14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3}};
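
    // G1/G2 together apply the BLAKE2b mixing function G to four columns (or
    // diagonals) at once; each logical row of the 4x4 state is split across a
    // low and a high __m128i. For reference, one scalar G on state words
    // (a, b, c, d) with message words x and y is:
    //
    //     a += b + x;  d = rotr64(d ^ a, 32);  c += d;  b = rotr64(b ^ c, 24);
    //     a += b + y;  d = rotr64(d ^ a, 16);  c += d;  b = rotr64(b ^ c, 63);
    //
    // G1 is the first line of that pair, G2 the second.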
    static inline void G1(
        __m128i& row1l, __m128i& row2l, __m128i& row3l, __m128i& row4l,
        __m128i& row1h, __m128i& row2h, __m128i& row3h, __m128i& row4h,
        __m128i& b0, __m128i& b1) {
        // a += b + m
        row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
        row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);

        // d = rotr64(d ^ a, 32)
        row4l = _mm_xor_si128(row4l, row1l);
        row4h = _mm_xor_si128(row4h, row1h);
        row4l = Rotr32(row4l);
        row4h = Rotr32(row4h);

        // c += d
        row3l = _mm_add_epi64(row3l, row4l);
        row3h = _mm_add_epi64(row3h, row4h);

        // b = rotr64(b ^ c, 24)
        row2l = _mm_xor_si128(row2l, row3l);
        row2h = _mm_xor_si128(row2h, row3h);
        row2l = Rotr24(row2l);
        row2h = Rotr24(row2h);
    }
    static inline void G2(
        __m128i& row1l, __m128i& row2l, __m128i& row3l, __m128i& row4l,
        __m128i& row1h, __m128i& row2h, __m128i& row3h, __m128i& row4h,
        __m128i& b0, __m128i& b1) {
        // a += b + m
        row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
        row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);

        // d = rotr64(d ^ a, 16)
        row4l = _mm_xor_si128(row4l, row1l);
        row4h = _mm_xor_si128(row4h, row1h);
        row4l = Rotr16(row4l);
        row4h = Rotr16(row4h);

        // c += d
        row3l = _mm_add_epi64(row3l, row4l);
        row3h = _mm_add_epi64(row3h, row4h);

        // b = rotr64(b ^ c, 63)
        row2l = _mm_xor_si128(row2l, row3l);
        row2h = _mm_xor_si128(row2h, row3h);
        row2l = Rotr63(row2l);
        row2h = Rotr63(row2h);
    }
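
    // After the column step, G must mix the diagonals. Rather than gathering
    // diagonal elements, the rows are rotated in place so that the diagonals
    // line up as columns; Undiagonalize reverses the shuffle. Because each
    // row spans two registers, the lane rotations are stitched together from
    // unpacklo/unpackhi pairs (plain SSE2 lacks _mm_alignr_epi8, which would
    // require SSSE3).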
    static inline void Diagonalize(
        __m128i& row2l, __m128i& row3l, __m128i& row4l,
        __m128i& row2h, __m128i& row3h, __m128i& row4h) {
        __m128i t0 = row4l;
        __m128i t1 = row2l;

        // Row 3 rotates by a full register: swap its halves.
        row4l = row3l;
        row3l = row3h;
        row3h = row4l;

        // Rows 2 and 4 rotate by one 64-bit lane in opposite directions.
        row4l = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t0, t0));
        row4h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row4h, row4h));
        row2l = _mm_unpackhi_epi64(row2l, _mm_unpacklo_epi64(row2h, row2h));
        row2h = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(t1, t1));
    }
    static inline void Undiagonalize(
        __m128i& row2l, __m128i& row3l, __m128i& row4l,
        __m128i& row2h, __m128i& row3h, __m128i& row4h) {
        __m128i t0 = row3l;
        row3l = row3h;
        row3h = t0;

        t0 = row2l;
        __m128i t1 = row4l;
        row2l = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(row2l, row2l));
        row2h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row2h, row2h));
        row4l = _mm_unpackhi_epi64(row4l, _mm_unpacklo_epi64(row4h, row4h));
        row4h = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t1, t1));
    }
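
    // One full BLAKE2b round: a column step (G1 + G2), a shuffle onto the
    // diagonals, a diagonal step (G1 + G2), and the inverse shuffle. Message
    // words are gathered with _mm_set_epi64x according to Sigma[r].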
    static inline void Round(int r, const ui64* block_ptr,
                             __m128i& row1l, __m128i& row2l, __m128i& row3l, __m128i& row4l,
                             __m128i& row1h, __m128i& row2h, __m128i& row3h, __m128i& row4h) {
        __m128i b0, b1;

        // Column step: message words Sigma[r][0..7]
        b0 = _mm_set_epi64x(block_ptr[Sigma[r][2]], block_ptr[Sigma[r][0]]);
        b1 = _mm_set_epi64x(block_ptr[Sigma[r][6]], block_ptr[Sigma[r][4]]);
        G1(row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h, b0, b1);

        b0 = _mm_set_epi64x(block_ptr[Sigma[r][3]], block_ptr[Sigma[r][1]]);
        b1 = _mm_set_epi64x(block_ptr[Sigma[r][7]], block_ptr[Sigma[r][5]]);
        G2(row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h, b0, b1);

        Diagonalize(row2l, row3l, row4l, row2h, row3h, row4h);

        // Diagonal step: message words Sigma[r][8..15]
        b0 = _mm_set_epi64x(block_ptr[Sigma[r][10]], block_ptr[Sigma[r][8]]);
        b1 = _mm_set_epi64x(block_ptr[Sigma[r][14]], block_ptr[Sigma[r][12]]);
        G1(row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h, b0, b1);

        b0 = _mm_set_epi64x(block_ptr[Sigma[r][11]], block_ptr[Sigma[r][9]]);
        b1 = _mm_set_epi64x(block_ptr[Sigma[r][15]], block_ptr[Sigma[r][13]]);
        G2(row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h, b0, b1);

        Undiagonalize(row2l, row3l, row4l, row2h, row3h, row4h);
    }
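
    // Per the BLAKE2 specification, the initial state h is the IV XORed with
    // the 64-byte parameter block (digest length, key length, fanout, depth,
    // and so on); p points at that parameter block.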
    template <>
    void TBlake2B<EInstructionSet::SSE2>::InitialXor_(ui8* h, const ui8* p) {
        __m128i* m_res = (__m128i*)h;
        const __m128i* m_p = (const __m128i*)p;
        __m128i* iv = (__m128i*)GetIV_();

        _mm_storeu_si128(m_res + 0, _mm_xor_si128(iv[0], _mm_loadu_si128(m_p + 0)));
        _mm_storeu_si128(m_res + 1, _mm_xor_si128(iv[1], _mm_loadu_si128(m_p + 1)));
        _mm_storeu_si128(m_res + 2, _mm_xor_si128(iv[2], _mm_loadu_si128(m_p + 2)));
        _mm_storeu_si128(m_res + 3, _mm_xor_si128(iv[3], _mm_loadu_si128(m_p + 3)));
    }
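
    // The BLAKE2b compression function F (RFC 7693, section 3.2): rows 1-2 of
    // the local work state v are the chaining value H, row 3 is IV[0..3], and
    // row 4 is IV[4..7] XORed with the offset counter T and the finalization
    // flags F. After 12 rounds, H absorbs the feed-forward
    // H[i] ^= v[i] ^ v[i + 8].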
    template <>
    void TBlake2B<EInstructionSet::SSE2>::Compress_(const ui64 block[BLAKE2B_BLOCKQWORDS]) {
        __m128i* iv = (__m128i*)GetIV_();

        // Rows 1-2: the current chaining value H.
        __m128i row1l = _mm_loadu_si128((__m128i*)&State_.H[0]);
        __m128i row1h = _mm_loadu_si128((__m128i*)&State_.H[2]);
        __m128i row2l = _mm_loadu_si128((__m128i*)&State_.H[4]);
        __m128i row2h = _mm_loadu_si128((__m128i*)&State_.H[6]);

        // Rows 3-4: the IV, with T and F folded into row 4.
        __m128i row3l = iv[0];
        __m128i row3h = iv[1];
        __m128i row4l = _mm_xor_si128(iv[2], _mm_loadu_si128((__m128i*)&State_.T[0]));
        __m128i row4h = _mm_xor_si128(iv[3], _mm_loadu_si128((__m128i*)&State_.F[0]));

        for (int r = 0; r < 12; r++)
            Round(r, block, row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h);

        // Feed-forward: H[i] ^= v[i] ^ v[i + 8].
        _mm_storeu_si128((__m128i*)&State_.H[0],
                         _mm_xor_si128(_mm_loadu_si128((__m128i*)&State_.H[0]), _mm_xor_si128(row3l, row1l)));
        _mm_storeu_si128((__m128i*)&State_.H[2],
                         _mm_xor_si128(_mm_loadu_si128((__m128i*)&State_.H[2]), _mm_xor_si128(row3h, row1h)));
        _mm_storeu_si128((__m128i*)&State_.H[4],
                         _mm_xor_si128(_mm_loadu_si128((__m128i*)&State_.H[4]), _mm_xor_si128(row4l, row2l)));
        _mm_storeu_si128((__m128i*)&State_.H[6],
                         _mm_xor_si128(_mm_loadu_si128((__m128i*)&State_.H[6]), _mm_xor_si128(row4h, row2h)));
    }
}