kyber512r3_polyvec_avx2.c 8.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227
  1. #include <stdint.h>
  2. #include <string.h>
  3. #include "kyber512r3_polyvec_avx2.h"
  4. #include "kyber512r3_poly_avx2.h"
  5. #include "kyber512r3_consts_avx2.h"
  6. #if defined(S2N_KYBER512R3_AVX2_BMI2)
  7. #include <immintrin.h>
  8. static void poly_compress10(uint8_t r[320], const poly * restrict a)
  9. {
  10. unsigned int i;
  11. __m256i f0, f1, f2;
  12. __m128i t0, t1;
  13. const __m256i v = _mm256_load_si256(&qdata.vec[_16XV/16]);
  14. const __m256i v8 = _mm256_slli_epi16(v,3);
  15. const __m256i off = _mm256_set1_epi16(15);
  16. const __m256i shift1 = _mm256_set1_epi16(1 << 12);
  17. const __m256i mask = _mm256_set1_epi16(1023);
  18. const __m256i shift2 = _mm256_set1_epi64x((1024LL << 48) + (1LL << 32) + (1024 << 16) + 1);
  19. const __m256i sllvdidx = _mm256_set1_epi64x(12);
  20. const __m256i shufbidx = _mm256_set_epi8( 8, 4, 3, 2, 1, 0,-1,-1,-1,-1,-1,-1,12,11,10, 9,
  21. -1,-1,-1,-1,-1,-1,12,11,10, 9, 8, 4, 3, 2, 1, 0);
  22. for(i=0;i<S2N_KYBER_512_R3_N/16;i++) {
  23. f0 = _mm256_load_si256(&a->vec[i]);
  24. f1 = _mm256_mullo_epi16(f0,v8);
  25. f2 = _mm256_add_epi16(f0,off);
  26. f0 = _mm256_slli_epi16(f0,3);
  27. f0 = _mm256_mulhi_epi16(f0,v);
  28. f2 = _mm256_sub_epi16(f1,f2);
  29. f1 = _mm256_andnot_si256(f1,f2);
  30. f1 = _mm256_srli_epi16(f1,15);
  31. f0 = _mm256_sub_epi16(f0,f1);
  32. f0 = _mm256_mulhrs_epi16(f0,shift1);
  33. f0 = _mm256_and_si256(f0,mask);
  34. f0 = _mm256_madd_epi16(f0,shift2);
  35. f0 = _mm256_sllv_epi32(f0,sllvdidx);
  36. f0 = _mm256_srli_epi64(f0,12);
  37. f0 = _mm256_shuffle_epi8(f0,shufbidx);
  38. t0 = _mm256_castsi256_si128(f0);
  39. t1 = _mm256_extracti128_si256(f0,1);
  40. t0 = _mm_blend_epi16(t0,t1,0xE0);
  41. // correcting cast-align error
  42. // old version: _mm_storeu_si128((__m128i *)&r[20*i+ 0],t0);
  43. _mm_storeu_si128((void *)&r[20*i+ 0],t0);
  44. memcpy(&r[20*i+16],&t1,4);
  45. }
  46. }
  47. static void poly_decompress10(poly * restrict r, const uint8_t a[320+12])
  48. {
  49. unsigned int i;
  50. __m256i f;
  51. const __m256i q = _mm256_set1_epi32((S2N_KYBER_512_R3_Q << 16) + 4*S2N_KYBER_512_R3_Q);
  52. const __m256i shufbidx = _mm256_set_epi8(11,10,10, 9, 9, 8, 8, 7,
  53. 6, 5, 5, 4, 4, 3, 3, 2,
  54. 9, 8, 8, 7, 7, 6, 6, 5,
  55. 4, 3, 3, 2, 2, 1, 1, 0);
  56. const __m256i sllvdidx = _mm256_set1_epi64x(4);
  57. const __m256i mask = _mm256_set1_epi32((32736 << 16) + 8184);
  58. for(i=0;i<S2N_KYBER_512_R3_N/16;i++) {
  59. // correcting cast-align and cast-qual errors
  60. // old version: f = _mm256_loadu_si256((__m256i *)&a[20*i]);
  61. f = _mm256_loadu_si256((const void *)&a[20*i]);
  62. f = _mm256_permute4x64_epi64(f,0x94);
  63. f = _mm256_shuffle_epi8(f,shufbidx);
  64. f = _mm256_sllv_epi32(f,sllvdidx);
  65. f = _mm256_srli_epi16(f,1);
  66. f = _mm256_and_si256(f,mask);
  67. f = _mm256_mulhrs_epi16(f,q);
  68. _mm256_store_si256(&r->vec[i],f);
  69. }
  70. }
  71. /*************************************************
  72. * Name: polyvec_compress_avx2
  73. *
  74. * Description: Compress and serialize vector of polynomials
  75. *
  76. * Arguments: - uint8_t *r: pointer to output byte array
  77. * (needs space for S2N_KYBER_512_R3_POLYVECCOMPRESSEDBYTES)
  78. * - polyvec *a: pointer to input vector of polynomials
  79. **************************************************/
  80. void polyvec_compress_avx2(uint8_t r[S2N_KYBER_512_R3_POLYVECCOMPRESSEDBYTES+2], const polyvec *a)
  81. {
  82. unsigned int i;
  83. for(i=0;i<S2N_KYBER_512_R3_K;i++)
  84. poly_compress10(&r[320*i],&a->vec[i]);
  85. }
  86. /*************************************************
  87. * Name: polyvec_decompress_avx2
  88. *
  89. * Description: De-serialize and decompress vector of polynomials;
  90. * approximate inverse of polyvec_compress_avx2
  91. *
  92. * Arguments: - polyvec *r: pointer to output vector of polynomials
  93. * - const uint8_t *a: pointer to input byte array
  94. * (of length S2N_KYBER_512_R3_POLYVECCOMPRESSEDBYTES)
  95. **************************************************/
  96. void polyvec_decompress_avx2(polyvec *r, const uint8_t a[S2N_KYBER_512_R3_POLYVECCOMPRESSEDBYTES+12])
  97. {
  98. unsigned int i;
  99. for(i=0;i<S2N_KYBER_512_R3_K;i++)
  100. poly_decompress10(&r->vec[i],&a[320*i]);
  101. }
  102. /*************************************************
  103. * Name: polyvec_tobytes_avx2
  104. *
  105. * Description: Serialize vector of polynomials
  106. *
  107. * Arguments: - uint8_t *r: pointer to output byte array
  108. * (needs space for S2N_KYBER_512_R3_POLYVECBYTES)
  109. * - polyvec *a: pointer to input vector of polynomials
  110. **************************************************/
  111. void polyvec_tobytes_avx2(uint8_t r[S2N_KYBER_512_R3_POLYVECBYTES], const polyvec *a)
  112. {
  113. unsigned int i;
  114. for(i=0;i<S2N_KYBER_512_R3_K;i++)
  115. poly_tobytes_avx2(r+i*S2N_KYBER_512_R3_POLYBYTES, &a->vec[i]);
  116. }
  117. /*************************************************
  118. * Name: polyvec_frombytes_avx2
  119. *
  120. * Description: De-serialize vector of polynomials;
  121. * inverse of polyvec_tobytes_avx2
  122. *
  123. * Arguments: - uint8_t *r: pointer to output byte array
  124. * - const polyvec *a: pointer to input vector of polynomials
  125. * (of length S2N_KYBER_512_R3_POLYVECBYTES)
  126. **************************************************/
  127. void polyvec_frombytes_avx2(polyvec *r, const uint8_t a[S2N_KYBER_512_R3_POLYVECBYTES])
  128. {
  129. unsigned int i;
  130. for(i=0;i<S2N_KYBER_512_R3_K;i++)
  131. poly_frombytes_avx2(&r->vec[i], a+i*S2N_KYBER_512_R3_POLYBYTES);
  132. }
  133. /*************************************************
  134. * Name: polyvec_ntt_avx2
  135. *
  136. * Description: Apply forward NTT to all elements of a vector of polynomials
  137. *
  138. * Arguments: - polyvec *r: pointer to in/output vector of polynomials
  139. **************************************************/
  140. void polyvec_ntt_avx2(polyvec *r)
  141. {
  142. unsigned int i;
  143. for(i=0;i<S2N_KYBER_512_R3_K;i++)
  144. poly_ntt_avx2(&r->vec[i]);
  145. }
  146. /*************************************************
  147. * Name: polyvec_invntt_tomont_avx2
  148. *
  149. * Description: Apply inverse NTT to all elements of a vector of polynomials
  150. * and multiply by Montgomery factor 2^16
  151. *
  152. * Arguments: - polyvec *r: pointer to in/output vector of polynomials
  153. **************************************************/
  154. void polyvec_invntt_tomont_avx2(polyvec *r)
  155. {
  156. unsigned int i;
  157. for(i=0;i<S2N_KYBER_512_R3_K;i++)
  158. poly_invntt_tomont_avx2(&r->vec[i]);
  159. }
  160. /*************************************************
  161. * Name: polyvec_basemul_acc_montgomery_avx2
  162. *
  163. * Description: Multiply elements in a and b in NTT domain, accumulate into r,
  164. * and multiply by 2^-16.
  165. *
  166. * Arguments: - poly *r: pointer to output polynomial
  167. * - const polyvec *a: pointer to first input vector of polynomials
  168. * - const polyvec *b: pointer to second input vector of polynomials
  169. **************************************************/
  170. void polyvec_basemul_acc_montgomery_avx2(poly *r, const polyvec *a, const polyvec *b)
  171. {
  172. unsigned int i;
  173. poly tmp;
  174. poly_basemul_montgomery_avx2(r,&a->vec[0],&b->vec[0]);
  175. for(i=1;i<S2N_KYBER_512_R3_K;i++) {
  176. poly_basemul_montgomery_avx2(&tmp,&a->vec[i],&b->vec[i]);
  177. poly_add_avx2(r,r,&tmp);
  178. }
  179. }
  180. /*************************************************
  181. * Name: polyvec_reduce_avx2
  182. *
  183. * Description: Applies Barrett reduction to each coefficient
  184. * of each element of a vector of polynomials;
  185. * for details of the Barrett reduction see comments in reduce.c
  186. *
  187. * Arguments: - polyvec *r: pointer to input/output polynomial
  188. **************************************************/
  189. void polyvec_reduce_avx2(polyvec *r)
  190. {
  191. unsigned int i;
  192. for(i=0;i<S2N_KYBER_512_R3_K;i++)
  193. poly_reduce_avx2(&r->vec[i]);
  194. }
  195. /*************************************************
  196. * Name: polyvec_add_avx2
  197. *
  198. * Description: Add vectors of polynomials
  199. *
  200. * Arguments: - polyvec *r: pointer to output vector of polynomials
  201. * - const polyvec *a: pointer to first input vector of polynomials
  202. * - const polyvec *b: pointer to second input vector of polynomials
  203. **************************************************/
  204. void polyvec_add_avx2(polyvec *r, const polyvec *a, const polyvec *b)
  205. {
  206. unsigned int i;
  207. for(i=0;i<S2N_KYBER_512_R3_K;i++)
  208. poly_add_avx2(&r->vec[i], &a->vec[i], &b->vec[i]);
  209. }
  210. #endif