kyber512r3_rejsample_avx2.c 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420
  1. #include <stdint.h>
  2. #include <string.h>
  3. #include "kyber512r3_params.h"
  4. #include "kyber512r3_consts_avx2.h"
  5. #include "kyber512r3_rejsample_avx2.h"
  6. #if defined(S2N_KYBER512R3_AVX2_BMI2)
  7. #include <immintrin.h>
  8. //#define BMI
  9. #ifndef BMI
  10. static const uint8_t idx[256][8] = {
  11. {-1, -1, -1, -1, -1, -1, -1, -1},
  12. { 0, -1, -1, -1, -1, -1, -1, -1},
  13. { 2, -1, -1, -1, -1, -1, -1, -1},
  14. { 0, 2, -1, -1, -1, -1, -1, -1},
  15. { 4, -1, -1, -1, -1, -1, -1, -1},
  16. { 0, 4, -1, -1, -1, -1, -1, -1},
  17. { 2, 4, -1, -1, -1, -1, -1, -1},
  18. { 0, 2, 4, -1, -1, -1, -1, -1},
  19. { 6, -1, -1, -1, -1, -1, -1, -1},
  20. { 0, 6, -1, -1, -1, -1, -1, -1},
  21. { 2, 6, -1, -1, -1, -1, -1, -1},
  22. { 0, 2, 6, -1, -1, -1, -1, -1},
  23. { 4, 6, -1, -1, -1, -1, -1, -1},
  24. { 0, 4, 6, -1, -1, -1, -1, -1},
  25. { 2, 4, 6, -1, -1, -1, -1, -1},
  26. { 0, 2, 4, 6, -1, -1, -1, -1},
  27. { 8, -1, -1, -1, -1, -1, -1, -1},
  28. { 0, 8, -1, -1, -1, -1, -1, -1},
  29. { 2, 8, -1, -1, -1, -1, -1, -1},
  30. { 0, 2, 8, -1, -1, -1, -1, -1},
  31. { 4, 8, -1, -1, -1, -1, -1, -1},
  32. { 0, 4, 8, -1, -1, -1, -1, -1},
  33. { 2, 4, 8, -1, -1, -1, -1, -1},
  34. { 0, 2, 4, 8, -1, -1, -1, -1},
  35. { 6, 8, -1, -1, -1, -1, -1, -1},
  36. { 0, 6, 8, -1, -1, -1, -1, -1},
  37. { 2, 6, 8, -1, -1, -1, -1, -1},
  38. { 0, 2, 6, 8, -1, -1, -1, -1},
  39. { 4, 6, 8, -1, -1, -1, -1, -1},
  40. { 0, 4, 6, 8, -1, -1, -1, -1},
  41. { 2, 4, 6, 8, -1, -1, -1, -1},
  42. { 0, 2, 4, 6, 8, -1, -1, -1},
  43. {10, -1, -1, -1, -1, -1, -1, -1},
  44. { 0, 10, -1, -1, -1, -1, -1, -1},
  45. { 2, 10, -1, -1, -1, -1, -1, -1},
  46. { 0, 2, 10, -1, -1, -1, -1, -1},
  47. { 4, 10, -1, -1, -1, -1, -1, -1},
  48. { 0, 4, 10, -1, -1, -1, -1, -1},
  49. { 2, 4, 10, -1, -1, -1, -1, -1},
  50. { 0, 2, 4, 10, -1, -1, -1, -1},
  51. { 6, 10, -1, -1, -1, -1, -1, -1},
  52. { 0, 6, 10, -1, -1, -1, -1, -1},
  53. { 2, 6, 10, -1, -1, -1, -1, -1},
  54. { 0, 2, 6, 10, -1, -1, -1, -1},
  55. { 4, 6, 10, -1, -1, -1, -1, -1},
  56. { 0, 4, 6, 10, -1, -1, -1, -1},
  57. { 2, 4, 6, 10, -1, -1, -1, -1},
  58. { 0, 2, 4, 6, 10, -1, -1, -1},
  59. { 8, 10, -1, -1, -1, -1, -1, -1},
  60. { 0, 8, 10, -1, -1, -1, -1, -1},
  61. { 2, 8, 10, -1, -1, -1, -1, -1},
  62. { 0, 2, 8, 10, -1, -1, -1, -1},
  63. { 4, 8, 10, -1, -1, -1, -1, -1},
  64. { 0, 4, 8, 10, -1, -1, -1, -1},
  65. { 2, 4, 8, 10, -1, -1, -1, -1},
  66. { 0, 2, 4, 8, 10, -1, -1, -1},
  67. { 6, 8, 10, -1, -1, -1, -1, -1},
  68. { 0, 6, 8, 10, -1, -1, -1, -1},
  69. { 2, 6, 8, 10, -1, -1, -1, -1},
  70. { 0, 2, 6, 8, 10, -1, -1, -1},
  71. { 4, 6, 8, 10, -1, -1, -1, -1},
  72. { 0, 4, 6, 8, 10, -1, -1, -1},
  73. { 2, 4, 6, 8, 10, -1, -1, -1},
  74. { 0, 2, 4, 6, 8, 10, -1, -1},
  75. {12, -1, -1, -1, -1, -1, -1, -1},
  76. { 0, 12, -1, -1, -1, -1, -1, -1},
  77. { 2, 12, -1, -1, -1, -1, -1, -1},
  78. { 0, 2, 12, -1, -1, -1, -1, -1},
  79. { 4, 12, -1, -1, -1, -1, -1, -1},
  80. { 0, 4, 12, -1, -1, -1, -1, -1},
  81. { 2, 4, 12, -1, -1, -1, -1, -1},
  82. { 0, 2, 4, 12, -1, -1, -1, -1},
  83. { 6, 12, -1, -1, -1, -1, -1, -1},
  84. { 0, 6, 12, -1, -1, -1, -1, -1},
  85. { 2, 6, 12, -1, -1, -1, -1, -1},
  86. { 0, 2, 6, 12, -1, -1, -1, -1},
  87. { 4, 6, 12, -1, -1, -1, -1, -1},
  88. { 0, 4, 6, 12, -1, -1, -1, -1},
  89. { 2, 4, 6, 12, -1, -1, -1, -1},
  90. { 0, 2, 4, 6, 12, -1, -1, -1},
  91. { 8, 12, -1, -1, -1, -1, -1, -1},
  92. { 0, 8, 12, -1, -1, -1, -1, -1},
  93. { 2, 8, 12, -1, -1, -1, -1, -1},
  94. { 0, 2, 8, 12, -1, -1, -1, -1},
  95. { 4, 8, 12, -1, -1, -1, -1, -1},
  96. { 0, 4, 8, 12, -1, -1, -1, -1},
  97. { 2, 4, 8, 12, -1, -1, -1, -1},
  98. { 0, 2, 4, 8, 12, -1, -1, -1},
  99. { 6, 8, 12, -1, -1, -1, -1, -1},
  100. { 0, 6, 8, 12, -1, -1, -1, -1},
  101. { 2, 6, 8, 12, -1, -1, -1, -1},
  102. { 0, 2, 6, 8, 12, -1, -1, -1},
  103. { 4, 6, 8, 12, -1, -1, -1, -1},
  104. { 0, 4, 6, 8, 12, -1, -1, -1},
  105. { 2, 4, 6, 8, 12, -1, -1, -1},
  106. { 0, 2, 4, 6, 8, 12, -1, -1},
  107. {10, 12, -1, -1, -1, -1, -1, -1},
  108. { 0, 10, 12, -1, -1, -1, -1, -1},
  109. { 2, 10, 12, -1, -1, -1, -1, -1},
  110. { 0, 2, 10, 12, -1, -1, -1, -1},
  111. { 4, 10, 12, -1, -1, -1, -1, -1},
  112. { 0, 4, 10, 12, -1, -1, -1, -1},
  113. { 2, 4, 10, 12, -1, -1, -1, -1},
  114. { 0, 2, 4, 10, 12, -1, -1, -1},
  115. { 6, 10, 12, -1, -1, -1, -1, -1},
  116. { 0, 6, 10, 12, -1, -1, -1, -1},
  117. { 2, 6, 10, 12, -1, -1, -1, -1},
  118. { 0, 2, 6, 10, 12, -1, -1, -1},
  119. { 4, 6, 10, 12, -1, -1, -1, -1},
  120. { 0, 4, 6, 10, 12, -1, -1, -1},
  121. { 2, 4, 6, 10, 12, -1, -1, -1},
  122. { 0, 2, 4, 6, 10, 12, -1, -1},
  123. { 8, 10, 12, -1, -1, -1, -1, -1},
  124. { 0, 8, 10, 12, -1, -1, -1, -1},
  125. { 2, 8, 10, 12, -1, -1, -1, -1},
  126. { 0, 2, 8, 10, 12, -1, -1, -1},
  127. { 4, 8, 10, 12, -1, -1, -1, -1},
  128. { 0, 4, 8, 10, 12, -1, -1, -1},
  129. { 2, 4, 8, 10, 12, -1, -1, -1},
  130. { 0, 2, 4, 8, 10, 12, -1, -1},
  131. { 6, 8, 10, 12, -1, -1, -1, -1},
  132. { 0, 6, 8, 10, 12, -1, -1, -1},
  133. { 2, 6, 8, 10, 12, -1, -1, -1},
  134. { 0, 2, 6, 8, 10, 12, -1, -1},
  135. { 4, 6, 8, 10, 12, -1, -1, -1},
  136. { 0, 4, 6, 8, 10, 12, -1, -1},
  137. { 2, 4, 6, 8, 10, 12, -1, -1},
  138. { 0, 2, 4, 6, 8, 10, 12, -1},
  139. {14, -1, -1, -1, -1, -1, -1, -1},
  140. { 0, 14, -1, -1, -1, -1, -1, -1},
  141. { 2, 14, -1, -1, -1, -1, -1, -1},
  142. { 0, 2, 14, -1, -1, -1, -1, -1},
  143. { 4, 14, -1, -1, -1, -1, -1, -1},
  144. { 0, 4, 14, -1, -1, -1, -1, -1},
  145. { 2, 4, 14, -1, -1, -1, -1, -1},
  146. { 0, 2, 4, 14, -1, -1, -1, -1},
  147. { 6, 14, -1, -1, -1, -1, -1, -1},
  148. { 0, 6, 14, -1, -1, -1, -1, -1},
  149. { 2, 6, 14, -1, -1, -1, -1, -1},
  150. { 0, 2, 6, 14, -1, -1, -1, -1},
  151. { 4, 6, 14, -1, -1, -1, -1, -1},
  152. { 0, 4, 6, 14, -1, -1, -1, -1},
  153. { 2, 4, 6, 14, -1, -1, -1, -1},
  154. { 0, 2, 4, 6, 14, -1, -1, -1},
  155. { 8, 14, -1, -1, -1, -1, -1, -1},
  156. { 0, 8, 14, -1, -1, -1, -1, -1},
  157. { 2, 8, 14, -1, -1, -1, -1, -1},
  158. { 0, 2, 8, 14, -1, -1, -1, -1},
  159. { 4, 8, 14, -1, -1, -1, -1, -1},
  160. { 0, 4, 8, 14, -1, -1, -1, -1},
  161. { 2, 4, 8, 14, -1, -1, -1, -1},
  162. { 0, 2, 4, 8, 14, -1, -1, -1},
  163. { 6, 8, 14, -1, -1, -1, -1, -1},
  164. { 0, 6, 8, 14, -1, -1, -1, -1},
  165. { 2, 6, 8, 14, -1, -1, -1, -1},
  166. { 0, 2, 6, 8, 14, -1, -1, -1},
  167. { 4, 6, 8, 14, -1, -1, -1, -1},
  168. { 0, 4, 6, 8, 14, -1, -1, -1},
  169. { 2, 4, 6, 8, 14, -1, -1, -1},
  170. { 0, 2, 4, 6, 8, 14, -1, -1},
  171. {10, 14, -1, -1, -1, -1, -1, -1},
  172. { 0, 10, 14, -1, -1, -1, -1, -1},
  173. { 2, 10, 14, -1, -1, -1, -1, -1},
  174. { 0, 2, 10, 14, -1, -1, -1, -1},
  175. { 4, 10, 14, -1, -1, -1, -1, -1},
  176. { 0, 4, 10, 14, -1, -1, -1, -1},
  177. { 2, 4, 10, 14, -1, -1, -1, -1},
  178. { 0, 2, 4, 10, 14, -1, -1, -1},
  179. { 6, 10, 14, -1, -1, -1, -1, -1},
  180. { 0, 6, 10, 14, -1, -1, -1, -1},
  181. { 2, 6, 10, 14, -1, -1, -1, -1},
  182. { 0, 2, 6, 10, 14, -1, -1, -1},
  183. { 4, 6, 10, 14, -1, -1, -1, -1},
  184. { 0, 4, 6, 10, 14, -1, -1, -1},
  185. { 2, 4, 6, 10, 14, -1, -1, -1},
  186. { 0, 2, 4, 6, 10, 14, -1, -1},
  187. { 8, 10, 14, -1, -1, -1, -1, -1},
  188. { 0, 8, 10, 14, -1, -1, -1, -1},
  189. { 2, 8, 10, 14, -1, -1, -1, -1},
  190. { 0, 2, 8, 10, 14, -1, -1, -1},
  191. { 4, 8, 10, 14, -1, -1, -1, -1},
  192. { 0, 4, 8, 10, 14, -1, -1, -1},
  193. { 2, 4, 8, 10, 14, -1, -1, -1},
  194. { 0, 2, 4, 8, 10, 14, -1, -1},
  195. { 6, 8, 10, 14, -1, -1, -1, -1},
  196. { 0, 6, 8, 10, 14, -1, -1, -1},
  197. { 2, 6, 8, 10, 14, -1, -1, -1},
  198. { 0, 2, 6, 8, 10, 14, -1, -1},
  199. { 4, 6, 8, 10, 14, -1, -1, -1},
  200. { 0, 4, 6, 8, 10, 14, -1, -1},
  201. { 2, 4, 6, 8, 10, 14, -1, -1},
  202. { 0, 2, 4, 6, 8, 10, 14, -1},
  203. {12, 14, -1, -1, -1, -1, -1, -1},
  204. { 0, 12, 14, -1, -1, -1, -1, -1},
  205. { 2, 12, 14, -1, -1, -1, -1, -1},
  206. { 0, 2, 12, 14, -1, -1, -1, -1},
  207. { 4, 12, 14, -1, -1, -1, -1, -1},
  208. { 0, 4, 12, 14, -1, -1, -1, -1},
  209. { 2, 4, 12, 14, -1, -1, -1, -1},
  210. { 0, 2, 4, 12, 14, -1, -1, -1},
  211. { 6, 12, 14, -1, -1, -1, -1, -1},
  212. { 0, 6, 12, 14, -1, -1, -1, -1},
  213. { 2, 6, 12, 14, -1, -1, -1, -1},
  214. { 0, 2, 6, 12, 14, -1, -1, -1},
  215. { 4, 6, 12, 14, -1, -1, -1, -1},
  216. { 0, 4, 6, 12, 14, -1, -1, -1},
  217. { 2, 4, 6, 12, 14, -1, -1, -1},
  218. { 0, 2, 4, 6, 12, 14, -1, -1},
  219. { 8, 12, 14, -1, -1, -1, -1, -1},
  220. { 0, 8, 12, 14, -1, -1, -1, -1},
  221. { 2, 8, 12, 14, -1, -1, -1, -1},
  222. { 0, 2, 8, 12, 14, -1, -1, -1},
  223. { 4, 8, 12, 14, -1, -1, -1, -1},
  224. { 0, 4, 8, 12, 14, -1, -1, -1},
  225. { 2, 4, 8, 12, 14, -1, -1, -1},
  226. { 0, 2, 4, 8, 12, 14, -1, -1},
  227. { 6, 8, 12, 14, -1, -1, -1, -1},
  228. { 0, 6, 8, 12, 14, -1, -1, -1},
  229. { 2, 6, 8, 12, 14, -1, -1, -1},
  230. { 0, 2, 6, 8, 12, 14, -1, -1},
  231. { 4, 6, 8, 12, 14, -1, -1, -1},
  232. { 0, 4, 6, 8, 12, 14, -1, -1},
  233. { 2, 4, 6, 8, 12, 14, -1, -1},
  234. { 0, 2, 4, 6, 8, 12, 14, -1},
  235. {10, 12, 14, -1, -1, -1, -1, -1},
  236. { 0, 10, 12, 14, -1, -1, -1, -1},
  237. { 2, 10, 12, 14, -1, -1, -1, -1},
  238. { 0, 2, 10, 12, 14, -1, -1, -1},
  239. { 4, 10, 12, 14, -1, -1, -1, -1},
  240. { 0, 4, 10, 12, 14, -1, -1, -1},
  241. { 2, 4, 10, 12, 14, -1, -1, -1},
  242. { 0, 2, 4, 10, 12, 14, -1, -1},
  243. { 6, 10, 12, 14, -1, -1, -1, -1},
  244. { 0, 6, 10, 12, 14, -1, -1, -1},
  245. { 2, 6, 10, 12, 14, -1, -1, -1},
  246. { 0, 2, 6, 10, 12, 14, -1, -1},
  247. { 4, 6, 10, 12, 14, -1, -1, -1},
  248. { 0, 4, 6, 10, 12, 14, -1, -1},
  249. { 2, 4, 6, 10, 12, 14, -1, -1},
  250. { 0, 2, 4, 6, 10, 12, 14, -1},
  251. { 8, 10, 12, 14, -1, -1, -1, -1},
  252. { 0, 8, 10, 12, 14, -1, -1, -1},
  253. { 2, 8, 10, 12, 14, -1, -1, -1},
  254. { 0, 2, 8, 10, 12, 14, -1, -1},
  255. { 4, 8, 10, 12, 14, -1, -1, -1},
  256. { 0, 4, 8, 10, 12, 14, -1, -1},
  257. { 2, 4, 8, 10, 12, 14, -1, -1},
  258. { 0, 2, 4, 8, 10, 12, 14, -1},
  259. { 6, 8, 10, 12, 14, -1, -1, -1},
  260. { 0, 6, 8, 10, 12, 14, -1, -1},
  261. { 2, 6, 8, 10, 12, 14, -1, -1},
  262. { 0, 2, 6, 8, 10, 12, 14, -1},
  263. { 4, 6, 8, 10, 12, 14, -1, -1},
  264. { 0, 4, 6, 8, 10, 12, 14, -1},
  265. { 2, 4, 6, 8, 10, 12, 14, -1},
  266. { 0, 2, 4, 6, 8, 10, 12, 14}
  267. };
  268. #endif
  269. #define _mm256_cmpge_epu16(a, b) _mm256_cmpeq_epi16(_mm256_max_epu16(a, b), a)
  270. #define _mm_cmpge_epu16(a, b) _mm_cmpeq_epi16(_mm_max_epu16(a, b), a)
  271. unsigned int rej_uniform_avx2(int16_t * restrict r, const uint8_t *buf)
  272. {
  273. unsigned int ctr, pos;
  274. uint16_t val0, val1;
  275. uint32_t good;
  276. #ifdef BMI
  277. uint64_t idx0, idx1, idx2, idx3;
  278. #endif
  279. const __m256i bound = _mm256_load_si256(&qdata.vec[_16XQ/16]);
  280. const __m256i ones = _mm256_set1_epi8(1);
  281. const __m256i mask = _mm256_set1_epi16(0xFFF);
  282. const __m256i idx8 = _mm256_set_epi8(15,14,14,13,12,11,11,10,
  283. 9, 8, 8, 7, 6, 5, 5, 4,
  284. 11,10,10, 9, 8, 7, 7, 6,
  285. 5, 4, 4, 3, 2, 1, 1, 0);
  286. __m256i f0, f1, g0, g1, g2, g3;
  287. __m128i f, t, pilo, pihi;
  288. ctr = pos = 0;
  289. while(ctr <= S2N_KYBER_512_R3_N - 32 && pos <= S2N_KYBER_512_R3_REJ_UNIFORM_AVX_BUFLEN - 48) {
  290. // correcting cast-align and cast-qual errors
  291. // old version: f0 = _mm256_loadu_si256((__m256i *)&buf[pos]);
  292. f0 = _mm256_loadu_si256((const void *)&buf[pos]);
  293. // old version: f1 = _mm256_loadu_si256((__m256i *)&buf[pos+24]);
  294. f1 = _mm256_loadu_si256((const void *)&buf[pos+24]);
  295. f0 = _mm256_permute4x64_epi64(f0, 0x94);
  296. f1 = _mm256_permute4x64_epi64(f1, 0x94);
  297. f0 = _mm256_shuffle_epi8(f0, idx8);
  298. f1 = _mm256_shuffle_epi8(f1, idx8);
  299. g0 = _mm256_srli_epi16(f0, 4);
  300. g1 = _mm256_srli_epi16(f1, 4);
  301. f0 = _mm256_blend_epi16(f0, g0, 0xAA);
  302. f1 = _mm256_blend_epi16(f1, g1, 0xAA);
  303. f0 = _mm256_and_si256(f0, mask);
  304. f1 = _mm256_and_si256(f1, mask);
  305. pos += 48;
  306. g0 = _mm256_cmpgt_epi16(bound, f0);
  307. g1 = _mm256_cmpgt_epi16(bound, f1);
  308. g0 = _mm256_packs_epi16(g0, g1);
  309. good = _mm256_movemask_epi8(g0);
  310. #ifdef BMI
  311. idx0 = _pdep_u64(good >> 0, 0x0101010101010101);
  312. idx1 = _pdep_u64(good >> 8, 0x0101010101010101);
  313. idx2 = _pdep_u64(good >> 16, 0x0101010101010101);
  314. idx3 = _pdep_u64(good >> 24, 0x0101010101010101);
  315. idx0 = (idx0 << 8) - idx0;
  316. idx0 = _pext_u64(0x0E0C0A0806040200, idx0);
  317. idx1 = (idx1 << 8) - idx1;
  318. idx1 = _pext_u64(0x0E0C0A0806040200, idx1);
  319. idx2 = (idx2 << 8) - idx2;
  320. idx2 = _pext_u64(0x0E0C0A0806040200, idx2);
  321. idx3 = (idx3 << 8) - idx3;
  322. idx3 = _pext_u64(0x0E0C0A0806040200, idx3);
  323. g0 = _mm256_castsi128_si256(_mm_cvtsi64_si128(idx0));
  324. g1 = _mm256_castsi128_si256(_mm_cvtsi64_si128(idx1));
  325. g0 = _mm256_inserti128_si256(g0, _mm_cvtsi64_si128(idx2), 1);
  326. g1 = _mm256_inserti128_si256(g1, _mm_cvtsi64_si128(idx3), 1);
  327. #else
  328. // correcting cast-align and cast-qual errors
  329. // old version: g0 = _mm256_castsi128_si256(_mm_loadl_epi64((__m128i *)&idx[(good >> 0) & 0xFF]));
  330. g0 = _mm256_castsi128_si256(_mm_loadl_epi64((const void *)&idx[(good >> 0) & 0xFF]));
  331. // old version: g1 = _mm256_castsi128_si256(_mm_loadl_epi64((__m128i *)&idx[(good >> 8) & 0xFF]));
  332. g1 = _mm256_castsi128_si256(_mm_loadl_epi64((const void *)&idx[(good >> 8) & 0xFF]));
  333. // old version: g0 = _mm256_inserti128_si256(g0, _mm_loadl_epi64((__m128i *)&idx[(good >> 16) & 0xFF]), 1);
  334. g0 = _mm256_inserti128_si256(g0, _mm_loadl_epi64((const void *)&idx[(good >> 16) & 0xFF]), 1);
  335. // old version: g1 = _mm256_inserti128_si256(g1, _mm_loadl_epi64((__m128i *)&idx[(good >> 24) & 0xFF]), 1);
  336. g1 = _mm256_inserti128_si256(g1, _mm_loadl_epi64((const void *)&idx[(good >> 24) & 0xFF]), 1);
  337. #endif
  338. g2 = _mm256_add_epi8(g0, ones);
  339. g3 = _mm256_add_epi8(g1, ones);
  340. g0 = _mm256_unpacklo_epi8(g0, g2);
  341. g1 = _mm256_unpacklo_epi8(g1, g3);
  342. f0 = _mm256_shuffle_epi8(f0, g0);
  343. f1 = _mm256_shuffle_epi8(f1, g1);
  344. // correcting cast-align errors
  345. // old version: _mm_storeu_si128((__m128i *)&r[ctr], _mm256_castsi256_si128(f0));
  346. _mm_storeu_si128((void *)&r[ctr], _mm256_castsi256_si128(f0));
  347. ctr += _mm_popcnt_u32((good >> 0) & 0xFF);
  348. // old version: _mm_storeu_si128((__m128i *)&r[ctr], _mm256_extracti128_si256(f0, 1));
  349. _mm_storeu_si128((void *)&r[ctr], _mm256_extracti128_si256(f0, 1));
  350. ctr += _mm_popcnt_u32((good >> 16) & 0xFF);
  351. // old version: _mm_storeu_si128((__m128i *)&r[ctr], _mm256_castsi256_si128(f1));
  352. _mm_storeu_si128((void *)&r[ctr], _mm256_castsi256_si128(f1));
  353. ctr += _mm_popcnt_u32((good >> 8) & 0xFF);
  354. // old version: _mm_storeu_si128((__m128i *)&r[ctr], _mm256_extracti128_si256(f1, 1));
  355. _mm_storeu_si128((void *)&r[ctr], _mm256_extracti128_si256(f1, 1));
  356. ctr += _mm_popcnt_u32((good >> 24) & 0xFF);
  357. }
  358. while(ctr <= S2N_KYBER_512_R3_N - 8 && pos <= S2N_KYBER_512_R3_REJ_UNIFORM_AVX_BUFLEN - 12) {
  359. // correcting cast-align and cast-qual errors
  360. // old version: f = _mm_loadu_si128((__m128i *)&buf[pos]);
  361. f = _mm_loadu_si128((const void *)&buf[pos]);
  362. f = _mm_shuffle_epi8(f, _mm256_castsi256_si128(idx8));
  363. t = _mm_srli_epi16(f, 4);
  364. f = _mm_blend_epi16(f, t, 0xAA);
  365. f = _mm_and_si128(f, _mm256_castsi256_si128(mask));
  366. pos += 12;
  367. t = _mm_cmpgt_epi16(_mm256_castsi256_si128(bound), f);
  368. good = _mm_movemask_epi8(t);
  369. #ifdef BMI
  370. good &= 0x5555;
  371. idx0 = _pdep_u64(good, 0x1111111111111111);
  372. idx0 = (idx0 << 8) - idx0;
  373. idx0 = _pext_u64(0x0E0C0A0806040200, idx0);
  374. pilo = _mm_cvtsi64_si128(idx0);
  375. #else
  376. good = _pext_u32(good, 0x5555);
  377. // correcting cast-align and cast-qual errors
  378. // old version: pilo = _mm_loadl_epi64((__m128i *)&idx[good]);
  379. pilo = _mm_loadl_epi64((const void *)&idx[good]);
  380. #endif
  381. pihi = _mm_add_epi8(pilo, _mm256_castsi256_si128(ones));
  382. pilo = _mm_unpacklo_epi8(pilo, pihi);
  383. f = _mm_shuffle_epi8(f, pilo);
  384. // correcting cast-align error
  385. // old version: _mm_storeu_si128((__m128i *)&r[ctr], f);
  386. _mm_storeu_si128((void *)&r[ctr], f);
  387. ctr += _mm_popcnt_u32(good);
  388. }
  389. while(ctr < S2N_KYBER_512_R3_N && pos <= S2N_KYBER_512_R3_REJ_UNIFORM_AVX_BUFLEN - 3) {
  390. val0 = ((buf[pos+0] >> 0) | ((uint16_t)buf[pos+1] << 8)) & 0xFFF;
  391. val1 = ((buf[pos+1] >> 4) | ((uint16_t)buf[pos+2] << 4));
  392. pos += 3;
  393. if(val0 < S2N_KYBER_512_R3_Q)
  394. r[ctr++] = val0;
  395. if(val1 < S2N_KYBER_512_R3_Q && ctr < S2N_KYBER_512_R3_N)
  396. r[ctr++] = val1;
  397. }
  398. return ctr;
  399. }
  400. #endif