blake2b_sse41.h

#pragma once

#include <smmintrin.h>

#include "blake2b.h"
#include "load_sse41.h"

#include <library/cpp/digest/argonish/internal/rotations/rotations_ssse3.h>

namespace NArgonish {
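    // BLAKE2b initialization vector (the same constants as SHA-512: the first
    // 64 bits of the fractional parts of the square roots of the first eight
    // primes), packed two 64-bit words per 128-bit register.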
    template <>
    void* TBlake2B<EInstructionSet::SSE41>::GetIV_() const {
        static const __m128i Iv[4] = {
            _mm_set_epi64x(0xbb67ae8584caa73bULL, 0x6a09e667f3bcc908ULL),
            _mm_set_epi64x(0xa54ff53a5f1d36f1ULL, 0x3c6ef372fe94f82bULL),
            _mm_set_epi64x(0x9b05688c2b3e6c1fULL, 0x510e527fade682d1ULL),
            _mm_set_epi64x(0x5be0cd19137e2179ULL, 0x1f83d9abfb41bd6bULL)};
        return (void*)Iv;
    }
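
    // First half of the BLAKE2b G function, applied to all four columns at
    // once: each rowN{l,h} pair of registers holds one 4x64-bit row of the
    // state matrix, and Rotr32/Rotr24 rotate every 64-bit lane right by
    // 32 and 24 bits. Scalar equivalent per column:
    //     a += b + x; d = rotr64(d ^ a, 32);
    //     c += d;     b = rotr64(b ^ c, 24);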
    static inline void G1(
        __m128i& row1l, __m128i& row2l, __m128i& row3l, __m128i& row4l,
        __m128i& row1h, __m128i& row2h, __m128i& row3h, __m128i& row4h,
        __m128i& b0, __m128i& b1) {
        row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
        row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);

        row4l = _mm_xor_si128(row4l, row1l);
        row4h = _mm_xor_si128(row4h, row1h);

        row4l = Rotr32(row4l);
        row4h = Rotr32(row4h);

        row3l = _mm_add_epi64(row3l, row4l);
        row3h = _mm_add_epi64(row3h, row4h);

        row2l = _mm_xor_si128(row2l, row3l);
        row2h = _mm_xor_si128(row2h, row3h);

        row2l = Rotr24(row2l);
        row2h = Rotr24(row2h);
    }
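
    // Second half of the G function; same structure as G1, but with the
    // 16- and 63-bit rotations:
    //     a += b + y; d = rotr64(d ^ a, 16);
    //     c += d;     b = rotr64(b ^ c, 63);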
    static inline void G2(
        __m128i& row1l, __m128i& row2l, __m128i& row3l, __m128i& row4l,
        __m128i& row1h, __m128i& row2h, __m128i& row3h, __m128i& row4h,
        __m128i& b0, __m128i& b1) {
        row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
        row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);

        row4l = _mm_xor_si128(row4l, row1l);
        row4h = _mm_xor_si128(row4h, row1h);

        row4l = Rotr16(row4l);
        row4h = Rotr16(row4h);

        row3l = _mm_add_epi64(row3l, row4l);
        row3h = _mm_add_epi64(row3h, row4h);

        row2l = _mm_xor_si128(row2l, row3l);
        row2h = _mm_xor_si128(row2h, row3h);

        row2l = Rotr63(row2l);
        row2h = Rotr63(row2h);
    }
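
    // Rotates rows 2-4 of the state so that the following G1/G2 pass mixes
    // the diagonals of the 4x4 matrix instead of the columns. With each row
    // split across two registers, a one-lane rotation becomes a pair of
    // _mm_alignr_epi8 byte shuffles, and the two-lane rotation of row 3 is
    // a plain swap of its halves.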
    static inline void Diagonalize(
        __m128i& row2l, __m128i& row3l, __m128i& row4l,
        __m128i& row2h, __m128i& row3h, __m128i& row4h) {
        __m128i t0 = _mm_alignr_epi8(row2h, row2l, 8);
        __m128i t1 = _mm_alignr_epi8(row2l, row2h, 8);
        row2l = t0;
        row2h = t1;

        t0 = row3l;
        row3l = row3h;
        row3h = t0;

        t0 = _mm_alignr_epi8(row4h, row4l, 8);
        t1 = _mm_alignr_epi8(row4l, row4h, 8);
        row4l = t1;
        row4h = t0;
    }
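
    // Inverse of Diagonalize: restores the column layout after the diagonal
    // mixing step.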
    static inline void Undiagonalize(
        __m128i& row2l, __m128i& row3l, __m128i& row4l,
        __m128i& row2h, __m128i& row3h, __m128i& row4h) {
        __m128i t0 = _mm_alignr_epi8(row2l, row2h, 8);
        __m128i t1 = _mm_alignr_epi8(row2h, row2l, 8);
        row2l = t0;
        row2h = t1;

        t0 = row3l;
        row3l = row3h;
        row3h = t0;

        t0 = _mm_alignr_epi8(row4l, row4h, 8);
        t1 = _mm_alignr_epi8(row4h, row4l, 8);
        row4l = t1;
        row4h = t0;
    }
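
    // One full BLAKE2b round: column step (G1 + G2), diagonalize, diagonal
    // step (G1 + G2), undiagonalize. The LOAD_MSG_##r##_i macros (presumably
    // defined in load_sse41.h, included above) gather the message words for
    // round r according to the BLAKE2b sigma permutation out of the
    // preloaded m0..m7 registers.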
#define ROUND(r) \
    LOAD_MSG_##r##_1(b0, b1); \
    G1(row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h, b0, b1); \
    LOAD_MSG_##r##_2(b0, b1); \
    G2(row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h, b0, b1); \
    Diagonalize(row2l, row3l, row4l, row2h, row3h, row4h); \
    LOAD_MSG_##r##_3(b0, b1); \
    G1(row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h, b0, b1); \
    LOAD_MSG_##r##_4(b0, b1); \
    G2(row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h, b0, b1); \
    Undiagonalize(row2l, row3l, row4l, row2h, row3h, row4h);
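
    // Computes the initial hash state: h = IV ^ parameter block, where p
    // points to the serialized 64-byte BLAKE2b parameter block.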
    template <>
    void TBlake2B<EInstructionSet::SSE41>::InitialXor_(ui8* h, const ui8* p) {
        __m128i* m_res = (__m128i*)h;
        const __m128i* m_p = (__m128i*)p;
        __m128i* iv = (__m128i*)GetIV_();

        _mm_storeu_si128(m_res + 0, _mm_xor_si128(iv[0], _mm_loadu_si128(m_p + 0)));
        _mm_storeu_si128(m_res + 1, _mm_xor_si128(iv[1], _mm_loadu_si128(m_p + 1)));
        _mm_storeu_si128(m_res + 2, _mm_xor_si128(iv[2], _mm_loadu_si128(m_p + 2)));
        _mm_storeu_si128(m_res + 3, _mm_xor_si128(iv[3], _mm_loadu_si128(m_p + 3)));
    }
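
    // Compresses one 128-byte message block into State_.H. Rows 1-2 of the
    // working state start as the current hash H, rows 3-4 as the IV xored
    // with the byte counter T and the finalization flags F; twelve rounds
    // are followed by the feed-forward xor back into H.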
    template <>
    void TBlake2B<EInstructionSet::SSE41>::Compress_(const ui64 block[BLAKE2B_BLOCKQWORDS]) {
        const __m128i* block_ptr = (__m128i*)block;
        __m128i* iv = (__m128i*)GetIV_();

        // Preload the 16 message qwords; the LOAD_MSG_* macros permute
        // m0..m7 into b0/b1 for each G call.
        const __m128i m0 = _mm_loadu_si128(block_ptr + 0);
        const __m128i m1 = _mm_loadu_si128(block_ptr + 1);
        const __m128i m2 = _mm_loadu_si128(block_ptr + 2);
        const __m128i m3 = _mm_loadu_si128(block_ptr + 3);
        const __m128i m4 = _mm_loadu_si128(block_ptr + 4);
        const __m128i m5 = _mm_loadu_si128(block_ptr + 5);
        const __m128i m6 = _mm_loadu_si128(block_ptr + 6);
        const __m128i m7 = _mm_loadu_si128(block_ptr + 7);

        // Initialize the 4x4 working state V.
        __m128i row1l = _mm_loadu_si128((__m128i*)&State_.H[0]);
        __m128i row1h = _mm_loadu_si128((__m128i*)&State_.H[2]);
        __m128i row2l = _mm_loadu_si128((__m128i*)&State_.H[4]);
        __m128i row2h = _mm_loadu_si128((__m128i*)&State_.H[6]);
        __m128i row3l = iv[0];
        __m128i row3h = iv[1];
        __m128i row4l = _mm_xor_si128(iv[2], _mm_loadu_si128((__m128i*)&State_.T[0]));
        __m128i row4h = _mm_xor_si128(iv[3], _mm_loadu_si128((__m128i*)&State_.F[0]));

        __m128i b0, b1;
        ROUND(0);
        ROUND(1);
        ROUND(2);
        ROUND(3);
        ROUND(4);
        ROUND(5);
        ROUND(6);
        ROUND(7);
        ROUND(8);
        ROUND(9);
        ROUND(10);
        ROUND(11);

        // Feed-forward: H[i] ^= V[i] ^ V[i + 8].
        _mm_storeu_si128((__m128i*)&State_.H[0],
                         _mm_xor_si128(_mm_loadu_si128((__m128i*)&State_.H[0]), _mm_xor_si128(row3l, row1l)));
        _mm_storeu_si128((__m128i*)&State_.H[2],
                         _mm_xor_si128(_mm_loadu_si128((__m128i*)&State_.H[2]), _mm_xor_si128(row3h, row1h)));
        _mm_storeu_si128((__m128i*)&State_.H[4],
                         _mm_xor_si128(_mm_loadu_si128((__m128i*)&State_.H[4]), _mm_xor_si128(row4l, row2l)));
        _mm_storeu_si128((__m128i*)&State_.H[6],
                         _mm_xor_si128(_mm_loadu_si128((__m128i*)&State_.H[6]), _mm_xor_si128(row4h, row2h)));
    }

#undef ROUND
}