argon2_ssse3.h 4.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102
  1. #pragma once
  2. #include <emmintrin.h>
  3. #include <tmmintrin.h>
  4. #include "argon2_base.h"
  5. #include <library/cpp/digest/argonish/internal/blamka/blamka_ssse3.h>
  6. namespace NArgonish {
  7. template <ui32 mcost, ui32 threads>
  8. class TArgon2SSSE3 final: public TArgon2<EInstructionSet::SSSE3, mcost, threads> {
  9. public:
  10. TArgon2SSSE3(EArgon2Type atype, ui32 tcost, const ui8* key, ui32 keylen)
  11. : TArgon2<EInstructionSet::SSSE3, mcost, threads>(atype, tcost, key, keylen)
  12. {
  13. }
  14. protected:
  15. virtual void XorBlock_(TBlock* dst, const TBlock* src) const override {
  16. __m128i* mdst = (__m128i*)dst->V;
  17. __m128i* msrc = (__m128i*)src->V;
  18. for (ui32 i = 0; i < ARGON2_OWORDS_IN_BLOCK; ++i)
  19. XorValues(mdst + i, msrc + i, mdst + i);
  20. }
  21. virtual void CopyBlock_(TBlock* dst, const TBlock* src) const override {
  22. memcpy(dst->V, src->V, sizeof(ui64) * ARGON2_QWORDS_IN_BLOCK);
  23. }
  24. virtual void FillBlock_(const TBlock* prevBlock, const TBlock* refBlock, TBlock* nextBlock, bool withXor) const override {
  25. __m128i blockxy[ARGON2_OWORDS_IN_BLOCK];
  26. __m128i state[ARGON2_OWORDS_IN_BLOCK];
  27. memcpy(state, prevBlock, ARGON2_BLOCK_SIZE);
  28. if (withXor) {
  29. for (ui32 i = 0; i < ARGON2_OWORDS_IN_BLOCK; ++i) {
  30. state[i] = _mm_xor_si128(
  31. state[i], _mm_loadu_si128((const __m128i*)refBlock->V + i));
  32. blockxy[i] = _mm_xor_si128(
  33. state[i], _mm_loadu_si128((const __m128i*)nextBlock->V + i));
  34. }
  35. } else {
  36. for (ui32 i = 0; i < ARGON2_OWORDS_IN_BLOCK; ++i) {
  37. blockxy[i] = state[i] = _mm_xor_si128(
  38. state[i], _mm_loadu_si128((const __m128i*)refBlock->V + i));
  39. }
  40. }
  41. for (ui32 i = 0; i < 8; ++i) {
  42. BlamkaG1SSSE3(
  43. state[8 * i + 0], state[8 * i + 1], state[8 * i + 2], state[8 * i + 3],
  44. state[8 * i + 4], state[8 * i + 5], state[8 * i + 6], state[8 * i + 7]);
  45. BlamkaG2SSSE3(
  46. state[8 * i + 0], state[8 * i + 1], state[8 * i + 2], state[8 * i + 3],
  47. state[8 * i + 4], state[8 * i + 5], state[8 * i + 6], state[8 * i + 7]);
  48. DiagonalizeSSSE3(
  49. state[8 * i + 2], state[8 * i + 3],
  50. state[8 * i + 4], state[8 * i + 5],
  51. state[8 * i + 6], state[8 * i + 7]);
  52. BlamkaG1SSSE3(
  53. state[8 * i + 0], state[8 * i + 1], state[8 * i + 2], state[8 * i + 3],
  54. state[8 * i + 4], state[8 * i + 5], state[8 * i + 6], state[8 * i + 7]);
  55. BlamkaG2SSSE3(
  56. state[8 * i + 0], state[8 * i + 1], state[8 * i + 2], state[8 * i + 3],
  57. state[8 * i + 4], state[8 * i + 5], state[8 * i + 6], state[8 * i + 7]);
  58. UndiagonalizeSSSE3(
  59. state[8 * i + 2], state[8 * i + 3],
  60. state[8 * i + 4], state[8 * i + 5],
  61. state[8 * i + 6], state[8 * i + 7]);
  62. }
  63. for (ui32 i = 0; i < 8; ++i) {
  64. BlamkaG1SSSE3(
  65. state[8 * 0 + i], state[8 * 1 + i], state[8 * 2 + i], state[8 * 3 + i],
  66. state[8 * 4 + i], state[8 * 5 + i], state[8 * 6 + i], state[8 * 7 + i]);
  67. BlamkaG2SSSE3(
  68. state[8 * 0 + i], state[8 * 1 + i], state[8 * 2 + i], state[8 * 3 + i],
  69. state[8 * 4 + i], state[8 * 5 + i], state[8 * 6 + i], state[8 * 7 + i]);
  70. DiagonalizeSSSE3(
  71. state[8 * 2 + i], state[8 * 3 + i],
  72. state[8 * 4 + i], state[8 * 5 + i],
  73. state[8 * 6 + i], state[8 * 7 + i]);
  74. BlamkaG1SSSE3(
  75. state[8 * 0 + i], state[8 * 1 + i], state[8 * 2 + i], state[8 * 3 + i],
  76. state[8 * 4 + i], state[8 * 5 + i], state[8 * 6 + i], state[8 * 7 + i]);
  77. BlamkaG2SSSE3(
  78. state[8 * 0 + i], state[8 * 1 + i], state[8 * 2 + i], state[8 * 3 + i],
  79. state[8 * 4 + i], state[8 * 5 + i], state[8 * 6 + i], state[8 * 7 + i]);
  80. UndiagonalizeSSSE3(
  81. state[8 * 2 + i], state[8 * 3 + i],
  82. state[8 * 4 + i], state[8 * 5 + i],
  83. state[8 * 6 + i], state[8 * 7 + i]);
  84. }
  85. for (ui32 i = 0; i < ARGON2_OWORDS_IN_BLOCK; ++i) {
  86. state[i] = _mm_xor_si128(state[i], blockxy[i]);
  87. _mm_storeu_si128((__m128i*)nextBlock->V + i, state[i]);
  88. }
  89. }
  90. };
  91. }