argon2_avx2.h 5.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117
  1. #pragma once
  2. #include <immintrin.h>
  3. #include "argon2_base.h"
  4. #include <library/cpp/digest/argonish/internal/blamka/blamka_avx2.h>
  5. namespace NArgonish {
  6. template <ui32 mcost, ui32 threads>
  7. class TArgon2AVX2 final: public TArgon2<EInstructionSet::AVX2, mcost, threads> {
  8. public:
  9. TArgon2AVX2(EArgon2Type atype, ui32 tcost, const ui8* key, ui32 keylen)
  10. : TArgon2<EInstructionSet::AVX2, mcost, threads>(atype, tcost, key, keylen)
  11. {
  12. }
  13. protected:
  14. virtual void XorBlock_(TBlock* dst, const TBlock* src) const override {
  15. __m256i* mdst = (__m256i*)dst;
  16. __m256i* msrc = (__m256i*)src;
  17. for (ui32 i = 0; i < ARGON2_HWORDS_IN_BLOCK; ++i)
  18. XorValues(mdst + i, mdst + i, msrc + i);
  19. }
  20. virtual void CopyBlock_(TBlock* dst, const TBlock* src) const override {
  21. memcpy(dst->V, src->V, sizeof(ui64) * ARGON2_QWORDS_IN_BLOCK);
  22. }
  23. virtual void FillBlock_(const TBlock* prevBlock, const TBlock* refBlock, TBlock* nextBlock, bool with_xor) const override {
  24. __m256i blockxy[ARGON2_HWORDS_IN_BLOCK];
  25. __m256i state[ARGON2_HWORDS_IN_BLOCK];
  26. memcpy(state, prevBlock, ARGON2_BLOCK_SIZE);
  27. if (with_xor) {
  28. for (ui32 i = 0; i < ARGON2_HWORDS_IN_BLOCK; ++i) {
  29. state[i] = _mm256_xor_si256(state[i], _mm256_loadu_si256((const __m256i*)refBlock->V + i));
  30. blockxy[i] = _mm256_xor_si256(state[i], _mm256_loadu_si256((const __m256i*)nextBlock->V + i));
  31. }
  32. } else {
  33. for (ui32 i = 0; i < ARGON2_HWORDS_IN_BLOCK; ++i) {
  34. blockxy[i] = state[i] = _mm256_xor_si256(
  35. state[i], _mm256_loadu_si256((const __m256i*)refBlock->V + i));
  36. }
  37. }
  38. /**
  39. * state[ 8*i + 0 ] = ( v0_0, v1_0, v2_0, v3_0)
  40. * state[ 8*i + 1 ] = ( v4_0, v5_0, v6_0, v7_0)
  41. * state[ 8*i + 2 ] = ( v8_0, v9_0, v10_0, v11_0)
  42. * state[ 8*i + 3 ] = (v12_0, v13_0, v14_0, v15_0)
  43. * state[ 8*i + 4 ] = ( v0_1, v1_1, v2_1, v3_1)
  44. * state[ 8*i + 5 ] = ( v4_1, v5_1, v6_1, v7_1)
  45. * state[ 8*i + 6 ] = ( v8_1, v9_1, v10_1, v11_1)
  46. * state[ 8*i + 7 ] = (v12_1, v13_1, v14_1, v15_1)
  47. */
  48. for (ui32 i = 0; i < 4; ++i) {
  49. BlamkaG1AVX2(
  50. state[8 * i + 0], state[8 * i + 4], state[8 * i + 1], state[8 * i + 5],
  51. state[8 * i + 2], state[8 * i + 6], state[8 * i + 3], state[8 * i + 7]);
  52. BlamkaG2AVX2(
  53. state[8 * i + 0], state[8 * i + 4], state[8 * i + 1], state[8 * i + 5],
  54. state[8 * i + 2], state[8 * i + 6], state[8 * i + 3], state[8 * i + 7]);
  55. DiagonalizeAVX21(
  56. state[8 * i + 1], state[8 * i + 2], state[8 * i + 3],
  57. state[8 * i + 5], state[8 * i + 6], state[8 * i + 7]);
  58. BlamkaG1AVX2(
  59. state[8 * i + 0], state[8 * i + 4], state[8 * i + 1], state[8 * i + 5],
  60. state[8 * i + 2], state[8 * i + 6], state[8 * i + 3], state[8 * i + 7]);
  61. BlamkaG2AVX2(
  62. state[8 * i + 0], state[8 * i + 4], state[8 * i + 1], state[8 * i + 5],
  63. state[8 * i + 2], state[8 * i + 6], state[8 * i + 3], state[8 * i + 7]);
  64. UndiagonalizeAVX21(
  65. state[8 * i + 1], state[8 * i + 2], state[8 * i + 3],
  66. state[8 * i + 5], state[8 * i + 6], state[8 * i + 7]);
  67. }
  68. /**
  69. * state[ 0 + i] = ( v0_0, v1_0, v0_1, v1_1)
  70. * state[ 4 + i] = ( v2_0, v3_0, v2_1, v3_1)
  71. * state[ 8 + i] = ( v4_0, v5_0, v4_1, v5_1)
  72. * state[12 + i] = ( v6_0, v7_0, v6_1, v7_1)
  73. * state[16 + i] = ( v8_0, v9_0, v8_1, v9_1)
  74. * state[20 + i] = (v10_0, v11_0, v10_1, v11_1)
  75. * state[24 + i] = (v12_0, v13_0, v12_1, v13_1)
  76. * state[28 + i] = (v14_0, v15_0, v14_1, v15_1)
  77. */
  78. for (ui32 i = 0; i < 4; ++i) {
  79. BlamkaG1AVX2(
  80. state[0 + i], state[4 + i], state[8 + i], state[12 + i],
  81. state[16 + i], state[20 + i], state[24 + i], state[28 + i]);
  82. BlamkaG2AVX2(
  83. state[0 + i], state[4 + i], state[8 + i], state[12 + i],
  84. state[16 + i], state[20 + i], state[24 + i], state[28 + i]);
  85. DiagonalizeAVX22(
  86. state[8 + i], state[12 + i],
  87. state[16 + i], state[20 + i],
  88. state[24 + i], state[28 + i]);
  89. BlamkaG1AVX2(
  90. state[0 + i], state[4 + i], state[8 + i], state[12 + i],
  91. state[16 + i], state[20 + i], state[24 + i], state[28 + i]);
  92. BlamkaG2AVX2(
  93. state[0 + i], state[4 + i], state[8 + i], state[12 + i],
  94. state[16 + i], state[20 + i], state[24 + i], state[28 + i]);
  95. UndiagonalizeAVX22(
  96. state[8 + i], state[12 + i],
  97. state[16 + i], state[20 + i],
  98. state[24 + i], state[28 + i]);
  99. }
  100. for (ui32 i = 0; i < ARGON2_HWORDS_IN_BLOCK; ++i) {
  101. state[i] = _mm256_xor_si256(state[i], blockxy[i]);
  102. _mm256_storeu_si256((__m256i*)nextBlock->V + i, state[i]);
  103. }
  104. }
  105. };
  106. }