// argon2_sse2.h (4.7 KB)
  1. #pragma once
  2. #include <emmintrin.h>
  3. #include "argon2_base.h"
  4. #include <library/cpp/digest/argonish/internal/blamka/blamka_sse2.h>
  5. namespace NArgonish {
  6. template <ui32 mcost, ui32 threads>
  7. class TArgon2SSE2 final: public TArgon2<EInstructionSet::SSE2, mcost, threads> {
  8. public:
  9. TArgon2SSE2(EArgon2Type atype, ui32 tcost, const ui8* key, ui32 keylen)
  10. : TArgon2<EInstructionSet::SSE2, mcost, threads>(atype, tcost, key, keylen)
  11. {
  12. }
  13. protected:
  14. virtual void XorBlock_(TBlock* dst, const TBlock* src) const override {
  15. __m128i* mdst = (__m128i*)dst->V;
  16. __m128i* msrc = (__m128i*)src->V;
  17. for (ui32 i = 0; i < ARGON2_OWORDS_IN_BLOCK; ++i)
  18. XorValues(mdst + i, msrc + i, mdst + i);
  19. }
  20. virtual void CopyBlock_(TBlock* dst, const TBlock* src) const override {
  21. memcpy(dst->V, src->V, sizeof(ui64) * ARGON2_QWORDS_IN_BLOCK);
  22. }
  23. virtual void FillBlock_(const TBlock* prevBlock, const TBlock* refBlock, TBlock* nextBlock, bool withXor) const override {
  24. __m128i blockxy[ARGON2_OWORDS_IN_BLOCK];
  25. __m128i state[ARGON2_OWORDS_IN_BLOCK];
  26. memcpy(state, prevBlock, ARGON2_BLOCK_SIZE);
  27. if (withXor) {
  28. for (ui32 i = 0; i < ARGON2_OWORDS_IN_BLOCK; ++i) {
  29. state[i] = _mm_xor_si128(
  30. state[i], _mm_loadu_si128((const __m128i*)refBlock->V + i));
  31. blockxy[i] = _mm_xor_si128(
  32. state[i], _mm_loadu_si128((const __m128i*)nextBlock->V + i));
  33. }
  34. } else {
  35. for (ui32 i = 0; i < ARGON2_OWORDS_IN_BLOCK; ++i) {
  36. blockxy[i] = state[i] = _mm_xor_si128(
  37. state[i], _mm_loadu_si128((const __m128i*)refBlock->V + i));
  38. }
  39. }
  40. for (ui32 i = 0; i < 8; ++i) {
  41. BlamkaG1SSE2(
  42. state[8 * i + 0], state[8 * i + 1], state[8 * i + 2], state[8 * i + 3],
  43. state[8 * i + 4], state[8 * i + 5], state[8 * i + 6], state[8 * i + 7]);
  44. BlamkaG2SSE2(
  45. state[8 * i + 0], state[8 * i + 1], state[8 * i + 2], state[8 * i + 3],
  46. state[8 * i + 4], state[8 * i + 5], state[8 * i + 6], state[8 * i + 7]);
  47. DiagonalizeSSE2(
  48. state[8 * i + 2], state[8 * i + 3],
  49. state[8 * i + 4], state[8 * i + 5],
  50. state[8 * i + 6], state[8 * i + 7]);
  51. BlamkaG1SSE2(
  52. state[8 * i + 0], state[8 * i + 1], state[8 * i + 2], state[8 * i + 3],
  53. state[8 * i + 4], state[8 * i + 5], state[8 * i + 6], state[8 * i + 7]);
  54. BlamkaG2SSE2(
  55. state[8 * i + 0], state[8 * i + 1], state[8 * i + 2], state[8 * i + 3],
  56. state[8 * i + 4], state[8 * i + 5], state[8 * i + 6], state[8 * i + 7]);
  57. UndiagonalizeSSE2(
  58. state[8 * i + 2], state[8 * i + 3],
  59. state[8 * i + 4], state[8 * i + 5],
  60. state[8 * i + 6], state[8 * i + 7]);
  61. }
  62. for (ui32 i = 0; i < 8; ++i) {
  63. BlamkaG1SSE2(
  64. state[8 * 0 + i], state[8 * 1 + i], state[8 * 2 + i], state[8 * 3 + i],
  65. state[8 * 4 + i], state[8 * 5 + i], state[8 * 6 + i], state[8 * 7 + i]);
  66. BlamkaG2SSE2(
  67. state[8 * 0 + i], state[8 * 1 + i], state[8 * 2 + i], state[8 * 3 + i],
  68. state[8 * 4 + i], state[8 * 5 + i], state[8 * 6 + i], state[8 * 7 + i]);
  69. DiagonalizeSSE2(
  70. state[8 * 2 + i], state[8 * 3 + i],
  71. state[8 * 4 + i], state[8 * 5 + i],
  72. state[8 * 6 + i], state[8 * 7 + i]);
  73. BlamkaG1SSE2(
  74. state[8 * 0 + i], state[8 * 1 + i], state[8 * 2 + i], state[8 * 3 + i],
  75. state[8 * 4 + i], state[8 * 5 + i], state[8 * 6 + i], state[8 * 7 + i]);
  76. BlamkaG2SSE2(
  77. state[8 * 0 + i], state[8 * 1 + i], state[8 * 2 + i], state[8 * 3 + i],
  78. state[8 * 4 + i], state[8 * 5 + i], state[8 * 6 + i], state[8 * 7 + i]);
  79. UndiagonalizeSSE2(
  80. state[8 * 2 + i], state[8 * 3 + i],
  81. state[8 * 4 + i], state[8 * 5 + i],
  82. state[8 * 6 + i], state[8 * 7 + i]);
  83. }
  84. for (ui32 i = 0; i < ARGON2_OWORDS_IN_BLOCK; ++i) {
  85. state[i] = _mm_xor_si128(state[i], blockxy[i]);
  86. _mm_storeu_si128((__m128i*)nextBlock->V + i, state[i]);
  87. }
  88. }
  89. };
  90. }