blake2b-round.h

/*
   BLAKE2 reference source code package - optimized C implementations

   Written in 2012 by Samuel Neves <sneves@dei.uc.pt>

   To the extent possible under law, the author(s) have dedicated all copyright
   and related and neighboring rights to this software to the public domain
   worldwide. This software is distributed without any warranty.

   You should have received a copy of the CC0 Public Domain Dedication along with
   this software. If not, see <http://creativecommons.org/publicdomain/zero/1.0/>.
*/
#pragma once
#ifndef __BLAKE2B_ROUND_H__
#define __BLAKE2B_ROUND_H__

/* Intrinsic headers for the operations used below (the reference package also
   includes these from blake2b.c before pulling in this header). */
#include <emmintrin.h>            /* SSE2 */
#if defined(HAVE_SSSE3)
#include <tmmintrin.h>            /* SSSE3: _mm_shuffle_epi8, _mm_alignr_epi8 */
#endif
#if defined(HAVE_SSE4_1)
#include <smmintrin.h>            /* SSE4.1 */
#endif
#if defined(HAVE_XOP)
#include <x86intrin.h>            /* XOP: native _mm_roti_epi64 */
#endif

#define LOAD(p)    _mm_load_si128( (__m128i *)(p) )
#define STORE(p,r) _mm_store_si128((__m128i *)(p), r)

#define LOADU(p)    _mm_loadu_si128( (__m128i *)(p) )
#define STOREU(p,r) _mm_storeu_si128((__m128i *)(p), r)

#define TOF(reg) _mm_castsi128_ps((reg))
#define TOI(reg) _mm_castps_si128((reg))

#define LIKELY(x) __builtin_expect((x),1)

/* Microarchitecture-specific macros */
#ifndef HAVE_XOP
#ifdef HAVE_SSSE3
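/* Emulate XOP's _mm_roti_epi64: rotate each 64-bit lane right by -(c) bits
   (c is passed as a negative count).  With SSSE3, rotation by 32 becomes a
   dword shuffle, rotations by 16 and 24 become byte shuffles, and 63 becomes
   a shift-and-add.  r16 and r24 are pshufb masks that the including file
   (blake2b.c in the reference package) is expected to define. */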
#define _mm_roti_epi64(x, c) \
  (-(c) == 32) ? _mm_shuffle_epi32((x), _MM_SHUFFLE(2,3,0,1)) \
  : (-(c) == 24) ? _mm_shuffle_epi8((x), r24) \
  : (-(c) == 16) ? _mm_shuffle_epi8((x), r16) \
  : (-(c) == 63) ? _mm_xor_si128(_mm_srli_epi64((x), -(c)), _mm_add_epi64((x), (x))) \
  : _mm_xor_si128(_mm_srli_epi64((x), -(c)), _mm_slli_epi64((x), 64-(-(c))))
#else
#define _mm_roti_epi64(r, c) _mm_xor_si128(_mm_srli_epi64( (r), -(c) ), _mm_slli_epi64( (r), 64-(-(c)) ))
#endif
#else
/* ... */
#endif
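
/* G1 and G2 are the two halves of the BLAKE2b G function, applied to all four
   columns (or diagonals) in parallel.  The 4x4 state of 64-bit words lives in
   eight 128-bit registers row1l..row4h; b0 and b1 hold the message words
   selected by the LOAD_MSG macros. */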
#define G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1) \
  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); \
  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); \
  \
  row4l = _mm_xor_si128(row4l, row1l); \
  row4h = _mm_xor_si128(row4h, row1h); \
  \
  row4l = _mm_roti_epi64(row4l, -32); \
  row4h = _mm_roti_epi64(row4h, -32); \
  \
  row3l = _mm_add_epi64(row3l, row4l); \
  row3h = _mm_add_epi64(row3h, row4h); \
  \
  row2l = _mm_xor_si128(row2l, row3l); \
  row2h = _mm_xor_si128(row2h, row3h); \
  \
  row2l = _mm_roti_epi64(row2l, -24); \
  row2h = _mm_roti_epi64(row2h, -24);

#define G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1) \
  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); \
  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); \
  \
  row4l = _mm_xor_si128(row4l, row1l); \
  row4h = _mm_xor_si128(row4h, row1h); \
  \
  row4l = _mm_roti_epi64(row4l, -16); \
  row4h = _mm_roti_epi64(row4h, -16); \
  \
  row3l = _mm_add_epi64(row3l, row4l); \
  row3h = _mm_add_epi64(row3h, row4h); \
  \
  row2l = _mm_xor_si128(row2l, row3l); \
  row2h = _mm_xor_si128(row2h, row3h); \
  \
  row2l = _mm_roti_epi64(row2l, -63); \
  row2h = _mm_roti_epi64(row2h, -63);
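
/* DIAGONALIZE rotates rows 2, 3 and 4 left by one, two and three word
   positions respectively, so that G1/G2 operate on the diagonals of the state;
   UNDIAGONALIZE rotates them back.  With SSSE3 the cross-lane word rotations
   use _mm_alignr_epi8; the SSE2 fallback builds them from unpack operations.
   t0 and t1 are scratch __m128i temporaries provided by the caller. */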
#if defined(HAVE_SSSE3)
#define DIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h) \
  t0 = _mm_alignr_epi8(row2h, row2l, 8); \
  t1 = _mm_alignr_epi8(row2l, row2h, 8); \
  row2l = t0; \
  row2h = t1; \
  \
  t0 = row3l; \
  row3l = row3h; \
  row3h = t0; \
  \
  t0 = _mm_alignr_epi8(row4h, row4l, 8); \
  t1 = _mm_alignr_epi8(row4l, row4h, 8); \
  row4l = t1; \
  row4h = t0;

#define UNDIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h) \
  t0 = _mm_alignr_epi8(row2l, row2h, 8); \
  t1 = _mm_alignr_epi8(row2h, row2l, 8); \
  row2l = t0; \
  row2h = t1; \
  \
  t0 = row3l; \
  row3l = row3h; \
  row3h = t0; \
  \
  t0 = _mm_alignr_epi8(row4l, row4h, 8); \
  t1 = _mm_alignr_epi8(row4h, row4l, 8); \
  row4l = t1; \
  row4h = t0;
#else

#define DIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h) \
  t0 = row4l; \
  t1 = row2l; \
  row4l = row3l; \
  row3l = row3h; \
  row3h = row4l; \
  row4l = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t0, t0)); \
  row4h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row4h, row4h)); \
  row2l = _mm_unpackhi_epi64(row2l, _mm_unpacklo_epi64(row2h, row2h)); \
  row2h = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(t1, t1))

#define UNDIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h) \
  t0 = row3l; \
  row3l = row3h; \
  row3h = t0; \
  t0 = row2l; \
  t1 = row4l; \
  row2l = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(row2l, row2l)); \
  row2h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row2h, row2h)); \
  row4l = _mm_unpackhi_epi64(row4l, _mm_unpacklo_epi64(row4h, row4h)); \
  row4h = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t1, t1))

#endif
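
/* Per-round message loads: LOAD_MSG_<r>_<i>(b0, b1) gathers the message words
   prescribed by the BLAKE2b message schedule (sigma) for round r, step i; an
   SSE4.1 and a plain SSE2 implementation are provided. */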
#if defined(HAVE_SSE4_1)
#include "blake2b-load-sse41.h"
#else
#include "blake2b-load-sse2.h"
#endif
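
/* One full BLAKE2b round: a column step (G1, G2), a rotation of the state onto
   its diagonals, a diagonal step (G1, G2), and the inverse rotation. */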
#define ROUND(r) \
  LOAD_MSG_ ##r ##_1(b0, b1); \
  G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \
  LOAD_MSG_ ##r ##_2(b0, b1); \
  G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \
  DIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h); \
  LOAD_MSG_ ##r ##_3(b0, b1); \
  G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \
  LOAD_MSG_ ##r ##_4(b0, b1); \
  G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \
  UNDIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h);

#endif