/* blake2s-round.h */
/*
   BLAKE2 reference source code package - optimized C implementations

   Written in 2012 by Samuel Neves <sneves@dei.uc.pt>

   To the extent possible under law, the author(s) have dedicated all copyright
   and related and neighboring rights to this software to the public domain
   worldwide. This software is distributed without any warranty.

   You should have received a copy of the CC0 Public Domain Dedication along with
   this software. If not, see <http://creativecommons.org/publicdomain/zero/1.0/>.
*/
  10. #pragma once
  11. #ifndef __BLAKE2S_ROUND_H__
  12. #define __BLAKE2S_ROUND_H__
  13. #define LOAD(p) _mm_load_si128( (__m128i *)(p) )
  14. #define STORE(p,r) _mm_store_si128((__m128i *)(p), r)
  15. #define LOADU(p) _mm_loadu_si128( (__m128i *)(p) )
  16. #define STOREU(p,r) _mm_storeu_si128((__m128i *)(p), r)
  17. #define TOF(reg) _mm_castsi128_ps((reg))
  18. #define TOI(reg) _mm_castps_si128((reg))
  19. #define LIKELY(x) __builtin_expect((x),1)
  20. /* Microarchitecture-specific macros */
  21. #ifndef HAVE_XOP
  22. #ifdef HAVE_SSSE3
  23. #define _mm_roti_epi32(r, c) ( \
  24. (8==-(c)) ? _mm_shuffle_epi8(r,r8) \
  25. : (16==-(c)) ? _mm_shuffle_epi8(r,r16) \
  26. : _mm_xor_si128(_mm_srli_epi32( (r), -(c) ),_mm_slli_epi32( (r), 32-(-(c)) )) )
  27. #else
  28. #define _mm_roti_epi32(r, c) _mm_xor_si128(_mm_srli_epi32( (r), -(c) ),_mm_slli_epi32( (r), 32-(-(c)) ))
  29. #endif
  30. #else
  31. /* ... */
  32. #endif
  33. #define G1(row1,row2,row3,row4,buf) \
  34. row1 = _mm_add_epi32( _mm_add_epi32( row1, buf), row2 ); \
  35. row4 = _mm_xor_si128( row4, row1 ); \
  36. row4 = _mm_roti_epi32(row4, -16); \
  37. row3 = _mm_add_epi32( row3, row4 ); \
  38. row2 = _mm_xor_si128( row2, row3 ); \
  39. row2 = _mm_roti_epi32(row2, -12);
  40. #define G2(row1,row2,row3,row4,buf) \
  41. row1 = _mm_add_epi32( _mm_add_epi32( row1, buf), row2 ); \
  42. row4 = _mm_xor_si128( row4, row1 ); \
  43. row4 = _mm_roti_epi32(row4, -8); \
  44. row3 = _mm_add_epi32( row3, row4 ); \
  45. row2 = _mm_xor_si128( row2, row3 ); \
  46. row2 = _mm_roti_epi32(row2, -7);
  47. #define DIAGONALIZE(row1,row2,row3,row4) \
  48. row4 = _mm_shuffle_epi32( row4, _MM_SHUFFLE(2,1,0,3) ); \
  49. row3 = _mm_shuffle_epi32( row3, _MM_SHUFFLE(1,0,3,2) ); \
  50. row2 = _mm_shuffle_epi32( row2, _MM_SHUFFLE(0,3,2,1) );
  51. #define UNDIAGONALIZE(row1,row2,row3,row4) \
  52. row4 = _mm_shuffle_epi32( row4, _MM_SHUFFLE(0,3,2,1) ); \
  53. row3 = _mm_shuffle_epi32( row3, _MM_SHUFFLE(1,0,3,2) ); \
  54. row2 = _mm_shuffle_epi32( row2, _MM_SHUFFLE(2,1,0,3) );
  55. #if defined(HAVE_XOP)
  56. #include "blake2s-load-xop.h"
  57. #elif defined(HAVE_SSE4_1)
  58. #include "blake2s-load-sse41.h"
  59. #else
  60. #include "blake2s-load-sse2.h"
  61. #endif
  62. #define ROUND(r) \
  63. LOAD_MSG_ ##r ##_1(buf1); \
  64. G1(row1,row2,row3,row4,buf1); \
  65. LOAD_MSG_ ##r ##_2(buf2); \
  66. G2(row1,row2,row3,row4,buf2); \
  67. DIAGONALIZE(row1,row2,row3,row4); \
  68. LOAD_MSG_ ##r ##_3(buf3); \
  69. G1(row1,row2,row3,row4,buf3); \
  70. LOAD_MSG_ ##r ##_4(buf4); \
  71. G2(row1,row2,row3,row4,buf4); \
  72. UNDIAGONALIZE(row1,row2,row3,row4); \
  73. #endif