AesOpt.c 4.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188
  1. /* AesOpt.c -- Intel's AES
  2. 2017-06-08 : Igor Pavlov : Public domain */
  3. #include "Precomp.h"
  4. #include "CpuArch.h"
  /* USE_INTEL_AES selects the AES-NI implementations below.
     It is defined only when CpuArch.h reports an x86 / x64 target AND the
     compiler identifies itself with an MSVC version number above 1500
     (or a full version of at least 15.00.30729). */
  #if defined(MY_CPU_X86_OR_AMD64)
    #if (_MSC_FULL_VER >= 150030729) || (_MSC_VER > 1500)
      #define USE_INTEL_AES
    #endif
  #endif
  10. #ifdef USE_INTEL_AES
  /* TARGET_AES is placed on every function that uses AES intrinsics.
     Under clang it expands to a per-function target("aes") attribute;
     for every other compiler it expands to nothing. */
  #ifdef __clang__
    #define TARGET_AES __attribute__((__target__("aes")))
  #else
    #define TARGET_AES
  #endif
  16. #include <wmmintrin.h>
  17. void TARGET_AES MY_FAST_CALL AesCbc_Encode_Intel(__m128i *p, __m128i *data, size_t numBlocks)
  18. {
  19. __m128i m = *p;
  20. for (; numBlocks != 0; numBlocks--, data++)
  21. {
  22. UInt32 numRounds2 = *(const UInt32 *)(p + 1) - 1;
  23. const __m128i *w = p + 3;
  24. m = _mm_xor_si128(m, *data);
  25. m = _mm_xor_si128(m, p[2]);
  26. do
  27. {
  28. m = _mm_aesenc_si128(m, w[0]);
  29. m = _mm_aesenc_si128(m, w[1]);
  30. w += 2;
  31. }
  32. while (--numRounds2 != 0);
  33. m = _mm_aesenc_si128(m, w[0]);
  34. m = _mm_aesenclast_si128(m, w[1]);
  35. *data = m;
  36. }
  37. *p = m;
  38. }
  /* Number of blocks handled per iteration of the interleaved loops. */
  #define NUM_WAYS 3

  /* AES_OP_W(op, n): load round key w[n] once and apply op to each of the
     three block states m0, m1, m2.
     The identifiers w, m0, m1 and m2 must be in scope where the macro is used.
     The expansion is a plain brace block, so invocations are written WITHOUT
     a trailing semicolon. */
  #define AES_OP_W(op, n) \
    { \
      const __m128i roundKey_ = w[n]; \
      m0 = op(m0, roundKey_); \
      m1 = op(m1, roundKey_); \
      m2 = op(m2, roundKey_); \
    }

  /* One encryption / decryption round step on all three states with key w[n]. */
  #define AES_ENC(n)       AES_OP_W(_mm_aesenc_si128, n)
  #define AES_ENC_LAST(n)  AES_OP_W(_mm_aesenclast_si128, n)
  #define AES_DEC(n)       AES_OP_W(_mm_aesdec_si128, n)
  #define AES_DEC_LAST(n)  AES_OP_W(_mm_aesdeclast_si128, n)
  50. void TARGET_AES MY_FAST_CALL AesCbc_Decode_Intel(__m128i *p, __m128i *data, size_t numBlocks)
  51. {
  52. __m128i iv = *p;
  53. for (; numBlocks >= NUM_WAYS; numBlocks -= NUM_WAYS, data += NUM_WAYS)
  54. {
  55. UInt32 numRounds2 = *(const UInt32 *)(p + 1);
  56. const __m128i *w = p + numRounds2 * 2;
  57. __m128i m0, m1, m2;
  58. {
  59. const __m128i t = w[2];
  60. m0 = _mm_xor_si128(t, data[0]);
  61. m1 = _mm_xor_si128(t, data[1]);
  62. m2 = _mm_xor_si128(t, data[2]);
  63. }
  64. numRounds2--;
  65. do
  66. {
  67. AES_DEC(1)
  68. AES_DEC(0)
  69. w -= 2;
  70. }
  71. while (--numRounds2 != 0);
  72. AES_DEC(1)
  73. AES_DEC_LAST(0)
  74. {
  75. __m128i t;
  76. t = _mm_xor_si128(m0, iv); iv = data[0]; data[0] = t;
  77. t = _mm_xor_si128(m1, iv); iv = data[1]; data[1] = t;
  78. t = _mm_xor_si128(m2, iv); iv = data[2]; data[2] = t;
  79. }
  80. }
  81. for (; numBlocks != 0; numBlocks--, data++)
  82. {
  83. UInt32 numRounds2 = *(const UInt32 *)(p + 1);
  84. const __m128i *w = p + numRounds2 * 2;
  85. __m128i m = _mm_xor_si128(w[2], *data);
  86. numRounds2--;
  87. do
  88. {
  89. m = _mm_aesdec_si128(m, w[1]);
  90. m = _mm_aesdec_si128(m, w[0]);
  91. w -= 2;
  92. }
  93. while (--numRounds2 != 0);
  94. m = _mm_aesdec_si128(m, w[1]);
  95. m = _mm_aesdeclast_si128(m, w[0]);
  96. m = _mm_xor_si128(m, iv);
  97. iv = *data;
  98. *data = m;
  99. }
  100. *p = iv;
  101. }
  102. void TARGET_AES MY_FAST_CALL AesCtr_Code_Intel(__m128i *p, __m128i *data, size_t numBlocks)
  103. {
  104. __m128i ctr = *p;
  105. __m128i one = _mm_set_epi64x(1, 0);
  106. for (; numBlocks >= NUM_WAYS; numBlocks -= NUM_WAYS, data += NUM_WAYS)
  107. {
  108. UInt32 numRounds2 = *(const UInt32 *)(p + 1) - 1;
  109. const __m128i *w = p;
  110. __m128i m0, m1, m2;
  111. {
  112. const __m128i t = w[2];
  113. ctr = _mm_add_epi64(ctr, one); m0 = _mm_xor_si128(ctr, t);
  114. ctr = _mm_add_epi64(ctr, one); m1 = _mm_xor_si128(ctr, t);
  115. ctr = _mm_add_epi64(ctr, one); m2 = _mm_xor_si128(ctr, t);
  116. }
  117. w += 3;
  118. do
  119. {
  120. AES_ENC(0)
  121. AES_ENC(1)
  122. w += 2;
  123. }
  124. while (--numRounds2 != 0);
  125. AES_ENC(0)
  126. AES_ENC_LAST(1)
  127. data[0] = _mm_xor_si128(data[0], m0);
  128. data[1] = _mm_xor_si128(data[1], m1);
  129. data[2] = _mm_xor_si128(data[2], m2);
  130. }
  131. for (; numBlocks != 0; numBlocks--, data++)
  132. {
  133. UInt32 numRounds2 = *(const UInt32 *)(p + 1) - 1;
  134. const __m128i *w = p;
  135. __m128i m;
  136. ctr = _mm_add_epi64(ctr, one);
  137. m = _mm_xor_si128(ctr, p[2]);
  138. w += 3;
  139. do
  140. {
  141. m = _mm_aesenc_si128(m, w[0]);
  142. m = _mm_aesenc_si128(m, w[1]);
  143. w += 2;
  144. }
  145. while (--numRounds2 != 0);
  146. m = _mm_aesenc_si128(m, w[0]);
  147. m = _mm_aesenclast_si128(m, w[1]);
  148. *data = _mm_xor_si128(*data, m);
  149. }
  150. *p = ctr;
  151. }
  152. #else
  /* Routines defined outside this file; the *_Intel functions below forward to
     them when USE_INTEL_AES is not defined.
     NOTE(review): presumably the portable (non-AES-NI) implementations - confirm
     against their definitions. */
  void MY_FAST_CALL AesCbc_Encode(UInt32 *ivAes, Byte *data, size_t numBlocks);
  void MY_FAST_CALL AesCbc_Decode(UInt32 *ivAes, Byte *data, size_t numBlocks);
  void MY_FAST_CALL AesCtr_Code(UInt32 *ivAes, Byte *data, size_t numBlocks);
  /* Fallback built when USE_INTEL_AES is not defined: keeps the
     AesCbc_Encode_Intel symbol available and forwards all arguments unchanged
     to AesCbc_Encode. */
  void MY_FAST_CALL AesCbc_Encode_Intel(UInt32 *p, Byte *data, size_t numBlocks)
  {
    AesCbc_Encode(p, data, numBlocks);
  }
  /* Fallback built when USE_INTEL_AES is not defined: keeps the
     AesCbc_Decode_Intel symbol available and forwards all arguments unchanged
     to AesCbc_Decode. */
  void MY_FAST_CALL AesCbc_Decode_Intel(UInt32 *p, Byte *data, size_t numBlocks)
  {
    AesCbc_Decode(p, data, numBlocks);
  }
  /* Fallback built when USE_INTEL_AES is not defined: keeps the
     AesCtr_Code_Intel symbol available and forwards all arguments unchanged
     to AesCtr_Code. */
  void MY_FAST_CALL AesCtr_Code_Intel(UInt32 *p, Byte *data, size_t numBlocks)
  {
    AesCtr_Code(p, data, numBlocks);
  }
  168. #endif