blamka_avx2.h

#pragma once

#include <immintrin.h>

#include <library/cpp/digest/argonish/internal/rotations/rotations_avx2.h>

namespace NArgonish {
    /*
     * First half of the BlaMka G function (Argon2's modification of BLAKE2b G):
     * instead of a = a + b it computes a = a + b + 2 * lo32(a) * lo32(b).
     * _mm256_mul_epu32 multiplies the low 32 bits of each 64-bit lane, and the
     * product is doubled before being added. The rotation amounts 32 and 24
     * match the first half of the BLAKE2b quarter-round.
     */
    static inline void BlamkaG1AVX2(
        __m256i& a0, __m256i& a1, __m256i& b0, __m256i& b1,
        __m256i& c0, __m256i& c1, __m256i& d0, __m256i& d1) {
        /* a0 = a0 + b0 + 2 * lo32(a0) * lo32(b0) */
        __m256i ml = _mm256_mul_epu32(a0, b0);
        ml = _mm256_add_epi64(ml, ml);
        a0 = _mm256_add_epi64(a0, _mm256_add_epi64(b0, ml));
        d0 = _mm256_xor_si256(d0, a0);
        d0 = Rotr32(d0);

        /* c0 = c0 + d0 + 2 * lo32(c0) * lo32(d0) */
        ml = _mm256_mul_epu32(c0, d0);
        ml = _mm256_add_epi64(ml, ml);
        c0 = _mm256_add_epi64(c0, _mm256_add_epi64(d0, ml));
        b0 = _mm256_xor_si256(b0, c0);
        b0 = Rotr24(b0);

        /* the same mixing for the second set of registers */
        ml = _mm256_mul_epu32(a1, b1);
        ml = _mm256_add_epi64(ml, ml);
        a1 = _mm256_add_epi64(a1, _mm256_add_epi64(b1, ml));
        d1 = _mm256_xor_si256(d1, a1);
        d1 = Rotr32(d1);

        ml = _mm256_mul_epu32(c1, d1);
        ml = _mm256_add_epi64(ml, ml);
        c1 = _mm256_add_epi64(c1, _mm256_add_epi64(d1, ml));
        b1 = _mm256_xor_si256(b1, c1);
        b1 = Rotr24(b1);
    }
    /*
     * Second half of the BlaMka G function: the same multiply-add mixing,
     * with the remaining BLAKE2b rotation amounts 16 and 63.
     */
    static inline void BlamkaG2AVX2(
        __m256i& a0, __m256i& a1, __m256i& b0, __m256i& b1,
        __m256i& c0, __m256i& c1, __m256i& d0, __m256i& d1) {
        __m256i ml = _mm256_mul_epu32(a0, b0);
        ml = _mm256_add_epi64(ml, ml);
        a0 = _mm256_add_epi64(a0, _mm256_add_epi64(b0, ml));
        d0 = _mm256_xor_si256(d0, a0);
        d0 = Rotr16(d0);

        ml = _mm256_mul_epu32(c0, d0);
        ml = _mm256_add_epi64(ml, ml);
        c0 = _mm256_add_epi64(c0, _mm256_add_epi64(d0, ml));
        b0 = _mm256_xor_si256(b0, c0);
        b0 = Rotr63(b0);

        ml = _mm256_mul_epu32(a1, b1);
        ml = _mm256_add_epi64(ml, ml);
        a1 = _mm256_add_epi64(a1, _mm256_add_epi64(b1, ml));
        d1 = _mm256_xor_si256(d1, a1);
        d1 = Rotr16(d1);

        ml = _mm256_mul_epu32(c1, d1);
        ml = _mm256_add_epi64(ml, ml);
        c1 = _mm256_add_epi64(c1, _mm256_add_epi64(d1, ml));
        b1 = _mm256_xor_si256(b1, c1);
        b1 = Rotr63(b1);
    }
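
    /*
     * Scalar reference for the mixing above (an illustrative sketch added for
     * clarity, not used by this code): the BlaMka primitive replaces BLAKE2b's
     * a + b with a + b + 2 * lo32(a) * lo32(b), all modulo 2^64. The helper name
     * FBlaMka is hypothetical.
     *
     *   static inline ui64 FBlaMka(ui64 x, ui64 y) {
     *       const ui64 m = 0xFFFFFFFFULL;
     *       return x + y + 2 * ((x & m) * (y & m));
     *   }
     */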
    /* a = ( v0,  v1,  v2,  v3) */
    /* b = ( v4,  v5,  v6,  v7) */
    /* c = ( v8,  v9, v10, v11) */
    /* d = (v12, v13, v14, v15) */
    static inline void DiagonalizeAVX21(
        __m256i& b0, __m256i& c0, __m256i& d0, __m256i& b1, __m256i& c1, __m256i& d1) {
        /* (v4, v5, v6, v7) -> (v5, v6, v7, v4) */
        b0 = _mm256_permute4x64_epi64(b0, _MM_SHUFFLE(0, 3, 2, 1));
        /* (v8, v9, v10, v11) -> (v10, v11, v8, v9) */
        c0 = _mm256_permute4x64_epi64(c0, _MM_SHUFFLE(1, 0, 3, 2));
        /* (v12, v13, v14, v15) -> (v15, v12, v13, v14) */
        d0 = _mm256_permute4x64_epi64(d0, _MM_SHUFFLE(2, 1, 0, 3));

        b1 = _mm256_permute4x64_epi64(b1, _MM_SHUFFLE(0, 3, 2, 1));
        c1 = _mm256_permute4x64_epi64(c1, _MM_SHUFFLE(1, 0, 3, 2));
        d1 = _mm256_permute4x64_epi64(d1, _MM_SHUFFLE(2, 1, 0, 3));
    }
    static inline void DiagonalizeAVX22(
        __m256i& b0, __m256i& b1, __m256i& c0, __m256i& c1, __m256i& d0, __m256i& d1) {
        /* (v4, v5, v6, v7) -> (v5, v6, v7, v4) */
        __m256i tmp1 = _mm256_blend_epi32(b0, b1, 0b11001100); /* v4v7 */
        __m256i tmp2 = _mm256_blend_epi32(b0, b1, 0b00110011); /* v6v5 */
        b1 = _mm256_permute4x64_epi64(tmp1, _MM_SHUFFLE(2, 3, 0, 1)); /* v7v4 */
        b0 = _mm256_permute4x64_epi64(tmp2, _MM_SHUFFLE(2, 3, 0, 1)); /* v5v6 */

        /* (v8, v9, v10, v11) -> (v10, v11, v8, v9) */
        tmp1 = c0;
        c0 = c1;
        c1 = tmp1;

        /* (v12, v13, v14, v15) -> (v15, v12, v13, v14) */
        tmp1 = _mm256_blend_epi32(d0, d1, 0b11001100); /* v12v15 */
        tmp2 = _mm256_blend_epi32(d0, d1, 0b00110011); /* v14v13 */
        d0 = _mm256_permute4x64_epi64(tmp1, _MM_SHUFFLE(2, 3, 0, 1)); /* v15v12 */
        d1 = _mm256_permute4x64_epi64(tmp2, _MM_SHUFFLE(2, 3, 0, 1)); /* v13v14 */
    }
    static inline void UndiagonalizeAVX21(
        __m256i& b0, __m256i& c0, __m256i& d0, __m256i& b1, __m256i& c1, __m256i& d1) {
        /* (v5, v6, v7, v4) -> (v4, v5, v6, v7) */
        b0 = _mm256_permute4x64_epi64(b0, _MM_SHUFFLE(2, 1, 0, 3));
        /* (v10, v11, v8, v9) -> (v8, v9, v10, v11) */
        c0 = _mm256_permute4x64_epi64(c0, _MM_SHUFFLE(1, 0, 3, 2));
        /* (v15, v12, v13, v14) -> (v12, v13, v14, v15) */
        d0 = _mm256_permute4x64_epi64(d0, _MM_SHUFFLE(0, 3, 2, 1));

        b1 = _mm256_permute4x64_epi64(b1, _MM_SHUFFLE(2, 1, 0, 3));
        c1 = _mm256_permute4x64_epi64(c1, _MM_SHUFFLE(1, 0, 3, 2));
        d1 = _mm256_permute4x64_epi64(d1, _MM_SHUFFLE(0, 3, 2, 1));
    }
    static inline void UndiagonalizeAVX22(
        __m256i& b0, __m256i& b1, __m256i& c0, __m256i& c1, __m256i& d0, __m256i& d1) {
        /* (v5, v6, v7, v4) -> (v4, v5, v6, v7) */
        __m256i tmp1 = _mm256_blend_epi32(b0, b1, 0b11001100); /* v5v4 */
        __m256i tmp2 = _mm256_blend_epi32(b0, b1, 0b00110011); /* v7v6 */
        b0 = _mm256_permute4x64_epi64(tmp1, _MM_SHUFFLE(2, 3, 0, 1)); /* v4v5 */
        b1 = _mm256_permute4x64_epi64(tmp2, _MM_SHUFFLE(2, 3, 0, 1)); /* v6v7 */

        /* (v10, v11, v8, v9) -> (v8, v9, v10, v11) */
        tmp1 = c0;
        c0 = c1;
        c1 = tmp1;

        /* (v15, v12, v13, v14) -> (v12, v13, v14, v15) */
        tmp1 = _mm256_blend_epi32(d0, d1, 0b00110011); /* v13v12 */
        tmp2 = _mm256_blend_epi32(d0, d1, 0b11001100); /* v15v14 */
        d0 = _mm256_permute4x64_epi64(tmp1, _MM_SHUFFLE(2, 3, 0, 1)); /* v12v13 */
        d1 = _mm256_permute4x64_epi64(tmp2, _MM_SHUFFLE(2, 3, 0, 1)); /* v14v15 */
    }
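
    /*
     * Usage sketch (an illustrative addition, not part of the upstream API): the
     * helpers above compose into one BlaMka round the way the Argon2 fill-block
     * code does it -- a column step (G1 + G2), diagonalization, a row step
     * (G1 + G2 again), and undiagonalization. The function name
     * BlamkaRoundSketchAVX2 is hypothetical; callers of this header wire the
     * steps together themselves.
     *
     *   static inline void BlamkaRoundSketchAVX2(
     *       __m256i& a0, __m256i& a1, __m256i& b0, __m256i& b1,
     *       __m256i& c0, __m256i& c1, __m256i& d0, __m256i& d1) {
     *       BlamkaG1AVX2(a0, a1, b0, b1, c0, c1, d0, d1);
     *       BlamkaG2AVX2(a0, a1, b0, b1, c0, c1, d0, d1);
     *       DiagonalizeAVX21(b0, c0, d0, b1, c1, d1);
     *       BlamkaG1AVX2(a0, a1, b0, b1, c0, c1, d0, d1);
     *       BlamkaG2AVX2(a0, a1, b0, b1, c0, c1, d0, d1);
     *       UndiagonalizeAVX21(b0, c0, d0, b1, c1, d1);
     *   }
     */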
}