#include <stdint.h>
#include "kyber512r3_params.h"
#include "kyber512r3_cbd_avx2.h"

#if defined(S2N_KYBER512R3_AVX2_BMI2)

/*************************************************
* Name:        cbd2
*
* Description: Given an array of uniformly random bytes, compute a
*              polynomial with coefficients distributed according to
*              a centered binomial distribution with parameter eta=2
*              (a scalar sketch of this distribution follows the
*              function below)
*
* Arguments:   - poly *r:            pointer to output polynomial
*              - const __m256i *buf: pointer to aligned input byte array
**************************************************/
static void cbd2(poly * restrict r, const __m256i buf[2*S2N_KYBER_512_R3_N/128])
{
    unsigned int i;
    __m256i f0, f1, f2, f3;
    const __m256i mask55 = _mm256_set1_epi32(0x55555555);
    const __m256i mask33 = _mm256_set1_epi32(0x33333333);
    const __m256i mask03 = _mm256_set1_epi32(0x03030303);
    const __m256i mask0F = _mm256_set1_epi32(0x0F0F0F0F);

    for(i = 0; i < S2N_KYBER_512_R3_N/64; i++) {
        f0 = _mm256_load_si256(&buf[i]);

        /* Sum adjacent bits: every 2-bit field now holds the number of set
         * bits among its two input bits (0, 1 or 2). */
        f1 = _mm256_srli_epi16(f0, 1);
        f0 = _mm256_and_si256(mask55, f0);
        f1 = _mm256_and_si256(mask55, f1);
        f0 = _mm256_add_epi8(f0, f1);

        /* For each nibble, with a = sum of its low bit pair and b = sum of its
         * high bit pair, compute a + 3 - b, so every nibble holds a value in [1,5]. */
        f1 = _mm256_srli_epi16(f0, 2);
        f0 = _mm256_and_si256(mask33, f0);
        f1 = _mm256_and_si256(mask33, f1);
        f0 = _mm256_add_epi8(f0, mask33);
        f0 = _mm256_sub_epi8(f0, f1);

        /* Separate even and odd nibbles into byte vectors and subtract 3,
         * giving the centered coefficients a - b in [-2,2] as signed bytes. */
        f1 = _mm256_srli_epi16(f0, 4);
        f0 = _mm256_and_si256(mask0F, f0);
        f1 = _mm256_and_si256(mask0F, f1);
        f0 = _mm256_sub_epi8(f0, mask03);
        f1 = _mm256_sub_epi8(f1, mask03);

        /* Interleave the bytes back into coefficient order, sign-extend to
         * 16 bits and store; the f0/f2/f1/f3 store order undoes the 128-bit
         * lane structure of the unpack instructions. */
        f2 = _mm256_unpacklo_epi8(f0, f1);
        f3 = _mm256_unpackhi_epi8(f0, f1);
        f0 = _mm256_cvtepi8_epi16(_mm256_castsi256_si128(f2));
        f1 = _mm256_cvtepi8_epi16(_mm256_extracti128_si256(f2, 1));
        f2 = _mm256_cvtepi8_epi16(_mm256_castsi256_si128(f3));
        f3 = _mm256_cvtepi8_epi16(_mm256_extracti128_si256(f3, 1));
        _mm256_store_si256(&r->vec[4*i+0], f0);
        _mm256_store_si256(&r->vec[4*i+1], f2);
        _mm256_store_si256(&r->vec[4*i+2], f1);
        _mm256_store_si256(&r->vec[4*i+3], f3);
    }
}
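
/*
 * Scalar sketch of the eta=2 distribution that cbd2 vectorizes: each output
 * coefficient is (a0 + a1) - (b0 + b1), where a0,a1,b0,b1 are four consecutive
 * bits of the input stream. The function name and the preprocessor guard are
 * illustrative additions (the guard is never defined anywhere); the sketch
 * assumes the AVX2 code above writes coefficients in natural order.
 */
#if defined(S2N_KYBER512R3_CBD_SCALAR_SKETCHES)
static void cbd2_scalar_sketch(int16_t r[S2N_KYBER_512_R3_N],
                               const uint8_t buf[2*S2N_KYBER_512_R3_N/4])
{
    for (unsigned int i = 0; i < S2N_KYBER_512_R3_N; i++) {
        /* coefficient i consumes bits 4*i .. 4*i+3 of the byte stream */
        unsigned int nibble = (buf[i / 2] >> (4 * (i % 2))) & 0xF;
        int a = (nibble & 1) + ((nibble >> 1) & 1);
        int b = ((nibble >> 2) & 1) + ((nibble >> 3) & 1);
        r[i] = (int16_t)(a - b); /* value in [-2, 2] */
    }
}
#endif
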
/*************************************************
* Name:        cbd3
*
* Description: Given an array of uniformly random bytes, compute a
*              polynomial with coefficients distributed according to
*              a centered binomial distribution with parameter eta=3.
*              This function is only needed for Kyber-512.
*              (A scalar sketch of this distribution follows the
*              function below.)
*
* Arguments:   - poly *r:            pointer to output polynomial
*              - const uint8_t *buf: pointer to input byte array
*                                    (read with unaligned loads)
**************************************************/
static void cbd3(poly * restrict r, const uint8_t buf[3*S2N_KYBER_512_R3_N/4+8])
{
    unsigned int i;
    __m256i f0, f1, f2, f3;
    const __m256i mask249 = _mm256_set1_epi32(0x249249);
    const __m256i mask6DB = _mm256_set1_epi32(0x6DB6DB);
    const __m256i mask07 = _mm256_set1_epi32(7);
    const __m256i mask70 = _mm256_set1_epi32(7 << 16);
    const __m256i mask3 = _mm256_set1_epi16(3);
    const __m256i shufbidx = _mm256_set_epi8(-1,15,14,13,-1,12,11,10,-1, 9, 8, 7,-1, 6, 5, 4,
                                             -1,11,10, 9,-1, 8, 7, 6,-1, 5, 4, 3,-1, 2, 1, 0);

    for(i = 0; i < S2N_KYBER_512_R3_N/32; i++) {
        // correcting cast-align and cast-qual errors
        // old version: f0 = _mm256_loadu_si256((__m256i *)&buf[24*i]);
        f0 = _mm256_loadu_si256((const void *)&buf[24*i]);

        /* Spread each group of 3 input bytes into the low 24 bits of one
         * 32-bit lane; the unaligned load may read up to 8 bytes past the
         * 3*N/4 data bytes, which is why the buffer is padded. */
        f0 = _mm256_permute4x64_epi64(f0, 0x94);
        f0 = _mm256_shuffle_epi8(f0, shufbidx);

        /* Sum the bits of every 3-bit field: each field now holds its bit
         * count, a value in [0,3]. */
        f1 = _mm256_srli_epi32(f0, 1);
        f2 = _mm256_srli_epi32(f0, 2);
        f0 = _mm256_and_si256(mask249, f0);
        f1 = _mm256_and_si256(mask249, f1);
        f2 = _mm256_and_si256(mask249, f2);
        f0 = _mm256_add_epi32(f0, f1);
        f0 = _mm256_add_epi32(f0, f2);

        /* For each 6-bit group with low-half count a and high-half count b,
         * compute a + 3 - b in the group's low 3 bits. */
        f1 = _mm256_srli_epi32(f0, 3);
        f0 = _mm256_add_epi32(f0, mask6DB);
        f0 = _mm256_sub_epi32(f0, f1);

        /* Extract the four results of each 32-bit lane into 16-bit lanes and
         * subtract 3 to obtain centered coefficients in [-3,3]. */
        f1 = _mm256_slli_epi32(f0, 10);
        f2 = _mm256_srli_epi32(f0, 12);
        f3 = _mm256_srli_epi32(f0, 2);
        f0 = _mm256_and_si256(f0, mask07);
        f1 = _mm256_and_si256(f1, mask70);
        f2 = _mm256_and_si256(f2, mask07);
        f3 = _mm256_and_si256(f3, mask70);
        f0 = _mm256_add_epi16(f0, f1);
        f1 = _mm256_add_epi16(f2, f3);
        f0 = _mm256_sub_epi16(f0, mask3);
        f1 = _mm256_sub_epi16(f1, mask3);

        /* Restore coefficient order across the 128-bit lanes and store. */
        f2 = _mm256_unpacklo_epi32(f0, f1);
        f3 = _mm256_unpackhi_epi32(f0, f1);
        f0 = _mm256_permute2x128_si256(f2, f3, 0x20);
        f1 = _mm256_permute2x128_si256(f2, f3, 0x31);
        _mm256_store_si256(&r->vec[2*i+0], f0);
        _mm256_store_si256(&r->vec[2*i+1], f1);
    }
}
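
/*
 * Scalar sketch of the eta=3 distribution that cbd3 vectorizes: each output
 * coefficient is (a0 + a1 + a2) - (b0 + b1 + b2), taken from six consecutive
 * bits of the input stream. The function name and the preprocessor guard are
 * illustrative additions (the guard is never defined anywhere); unlike cbd3
 * above, the scalar form needs no padding past the 3*N/4 data bytes.
 */
#if defined(S2N_KYBER512R3_CBD_SCALAR_SKETCHES)
static void cbd3_scalar_sketch(int16_t r[S2N_KYBER_512_R3_N],
                               const uint8_t buf[3*S2N_KYBER_512_R3_N/4])
{
    for (unsigned int i = 0; i < S2N_KYBER_512_R3_N/4; i++) {
        /* 3 bytes = 24 bits = four coefficients of 6 bits each */
        uint32_t t = (uint32_t)buf[3*i]
                   | ((uint32_t)buf[3*i+1] << 8)
                   | ((uint32_t)buf[3*i+2] << 16);
        for (unsigned int j = 0; j < 4; j++) {
            uint32_t bits = (t >> (6*j)) & 0x3F;
            int a = (bits & 1) + ((bits >> 1) & 1) + ((bits >> 2) & 1);
            int b = ((bits >> 3) & 1) + ((bits >> 4) & 1) + ((bits >> 5) & 1);
            r[4*i + j] = (int16_t)(a - b); /* value in [-3, 3] */
        }
    }
}
#endif
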
/* buf 32 bytes longer for cbd3 */
void poly_cbd_eta1_avx2(poly *r, const __m256i buf[S2N_KYBER_512_R3_ETA1*S2N_KYBER_512_R3_N/128+1])
{
    // correcting cast-align and cast-qual errors
    // old version: cbd3(r, (uint8_t *)buf);
    cbd3(r, (const void *)buf);
}
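
/*
 * Sizing note: cbd3's last unaligned load starts at byte
 * 24*(S2N_KYBER_512_R3_N/32 - 1) = 168 and reads 32 bytes, i.e. through byte
 * 199, so it needs 3*N/4 + 8 = 200 input bytes while only 3*N/4 = 192 carry
 * data. The eta1 buffer above rounds that requirement up to whole __m256i
 * words, hence the "+1" (32 extra bytes).
 */
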
void poly_cbd_eta2_avx2(poly *r, const __m256i buf[S2N_KYBER_512_R3_ETA2*S2N_KYBER_512_R3_N/128])
{
    cbd2(r, buf);
}

#endif