// kyber512r3_shuffle_avx2.S

#include "kyber512r3_consts_avx2.h"
// The small macros (.inc files) are combined with .S files directly
/*****.include "fq.inc"*****/
/***************************/
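# red16: signed Barrett reduction of the 16 coefficients in ymm\r.
# Expects q in ymm0 and the Barrett multiplier in ymm1; ymm\x is scratch.
# If \rs is given, a rounding multiply by ymm\rs replaces the arithmetic shift by 10.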
.macro red16 r,rs=0,x=12
vpmulhw %ymm1,%ymm\r,%ymm\x
.if \rs
vpmulhrsw %ymm\rs,%ymm\x,%ymm\x
.else
vpsraw $10,%ymm\x,%ymm\x
.endif
vpmullw %ymm0,%ymm\x,%ymm\x
vpsubw %ymm\x,%ymm\r,%ymm\r
.endm
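# csubq: conditionally subtract q from each 16-bit coefficient in ymm\r:
# subtract q, then add it back (branch-free) wherever the result went negative.
# Expects q in ymm0; ymm\x is scratch.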
.macro csubq r,x=12
vpsubw %ymm0,%ymm\r,%ymm\r
vpsraw $15,%ymm\r,%ymm\x
vpand %ymm0,%ymm\x,%ymm\x
vpaddw %ymm\x,%ymm\r,%ymm\r
.endm
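# caddq: conditionally add q to each negative 16-bit coefficient in ymm\r.
# Expects q in ymm0; ymm\x is scratch.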
.macro caddq r,x=12
vpsraw $15,%ymm\r,%ymm\x
vpand %ymm0,%ymm\x,%ymm\x
vpaddw %ymm\x,%ymm\r,%ymm\r
.endm
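# fqmulprecomp: Montgomery multiplication of the coefficients in ymm\b by a constant
# supplied in precomputed form (ymm\al = constant*q^-1 mod 2^16, ymm\ah = constant).
# Expects q in ymm0; result in ymm\b, ymm\x is scratch.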
.macro fqmulprecomp al,ah,b,x=12
vpmullw %ymm\al,%ymm\b,%ymm\x
vpmulhw %ymm\ah,%ymm\b,%ymm\b
vpmulhw %ymm0,%ymm\x,%ymm\x
vpsubw %ymm\x,%ymm\b,%ymm\b
.endm
/***************************/
/*****.include "shuffle.inc"*****/
/********************************/
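# shuffle8: interleave the 128-bit lanes of ymm\r0 and ymm\r1
# (low lanes into ymm\r2, high lanes into ymm\r3).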
.macro shuffle8 r0,r1,r2,r3
vperm2i128 $0x20,%ymm\r1,%ymm\r0,%ymm\r2
vperm2i128 $0x31,%ymm\r1,%ymm\r0,%ymm\r3
.endm
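# shuffle4: interleave the 64-bit quadwords of ymm\r0 and ymm\r1 within each 128-bit lane
# (low quadwords into ymm\r2, high quadwords into ymm\r3).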
.macro shuffle4 r0,r1,r2,r3
vpunpcklqdq %ymm\r1,%ymm\r0,%ymm\r2
vpunpckhqdq %ymm\r1,%ymm\r0,%ymm\r3
.endm
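# shuffle2: interleave the 32-bit doublewords of ymm\r0 and ymm\r1
# (even doublewords into ymm\r2, odd doublewords into ymm\r3).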
.macro shuffle2 r0,r1,r2,r3
#vpsllq $32,%ymm\r1,%ymm\r2
vmovsldup %ymm\r1,%ymm\r2
vpblendd $0xAA,%ymm\r2,%ymm\r0,%ymm\r2
vpsrlq $32,%ymm\r0,%ymm\r0
#vmovshdup %ymm\r0,%ymm\r0
vpblendd $0xAA,%ymm\r1,%ymm\r0,%ymm\r3
.endm
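# shuffle1: interleave the 16-bit words of ymm\r0 and ymm\r1
# (even words into ymm\r2, odd words into ymm\r3).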
.macro shuffle1 r0,r1,r2,r3
vpslld $16,%ymm\r1,%ymm\r2
vpblendw $0xAA,%ymm\r2,%ymm\r0,%ymm\r2
vpsrld $16,%ymm\r0,%ymm\r0
vpblendw $0xAA,%ymm\r1,%ymm\r0,%ymm\r3
.endm
/********************************/
.text
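# nttunpack128_avx: permutes the 128 16-bit coefficients at (%rdi) in place,
# converting between sequential order and the interleaved lane order used by the AVX2 NTT.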
nttunpack128_avx:
#load
vmovdqa (%rdi),%ymm4
vmovdqa 32(%rdi),%ymm5
vmovdqa 64(%rdi),%ymm6
vmovdqa 96(%rdi),%ymm7
vmovdqa 128(%rdi),%ymm8
vmovdqa 160(%rdi),%ymm9
vmovdqa 192(%rdi),%ymm10
vmovdqa 224(%rdi),%ymm11
shuffle8 4,8,3,8
shuffle8 5,9,4,9
shuffle8 6,10,5,10
shuffle8 7,11,6,11
shuffle4 3,5,7,5
shuffle4 8,10,3,10
shuffle4 4,6,8,6
shuffle4 9,11,4,11
shuffle2 7,8,9,8
shuffle2 5,6,7,6
shuffle2 3,4,5,4
shuffle2 10,11,3,11
shuffle1 9,5,10,5
shuffle1 8,4,9,4
shuffle1 7,3,8,3
shuffle1 6,11,7,11
#store
vmovdqa %ymm10,(%rdi)
vmovdqa %ymm5,32(%rdi)
vmovdqa %ymm9,64(%rdi)
vmovdqa %ymm4,96(%rdi)
vmovdqa %ymm8,128(%rdi)
vmovdqa %ymm3,160(%rdi)
vmovdqa %ymm7,192(%rdi)
vmovdqa %ymm11,224(%rdi)
ret
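# nttunpack_avx2_asm: applies nttunpack128_avx to both 128-coefficient halves
# of the 256-coefficient polynomial at (%rdi).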
.global cdecl(nttunpack_avx2_asm)
cdecl(nttunpack_avx2_asm):
call nttunpack128_avx
add $256,%rdi
call nttunpack128_avx
ret
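# ntttobytes128_avx: reduces 128 coefficients at (%rsi) to [0,q) with csubq and
# packs them 12 bits each into 192 bytes at (%rdi). Caller must load q into ymm0.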
ntttobytes128_avx:
#load
vmovdqa (%rsi),%ymm5
vmovdqa 32(%rsi),%ymm6
vmovdqa 64(%rsi),%ymm7
vmovdqa 96(%rsi),%ymm8
vmovdqa 128(%rsi),%ymm9
vmovdqa 160(%rsi),%ymm10
vmovdqa 192(%rsi),%ymm11
vmovdqa 224(%rsi),%ymm12
#csubq
csubq 5,13
csubq 6,13
csubq 7,13
csubq 8,13
csubq 9,13
csubq 10,13
csubq 11,13
csubq 12,13
#bitpack
vpsllw $12,%ymm6,%ymm4
vpor %ymm4,%ymm5,%ymm4
vpsrlw $4,%ymm6,%ymm5
vpsllw $8,%ymm7,%ymm6
vpor %ymm5,%ymm6,%ymm5
vpsrlw $8,%ymm7,%ymm6
vpsllw $4,%ymm8,%ymm7
vpor %ymm6,%ymm7,%ymm6
vpsllw $12,%ymm10,%ymm7
vpor %ymm7,%ymm9,%ymm7
vpsrlw $4,%ymm10,%ymm8
vpsllw $8,%ymm11,%ymm9
vpor %ymm8,%ymm9,%ymm8
vpsrlw $8,%ymm11,%ymm9
vpsllw $4,%ymm12,%ymm10
vpor %ymm9,%ymm10,%ymm9
shuffle1 4,5,3,5
shuffle1 6,7,4,7
shuffle1 8,9,6,9
shuffle2 3,4,8,4
shuffle2 6,5,3,5
shuffle2 7,9,6,9
shuffle4 8,3,7,3
shuffle4 6,4,8,4
shuffle4 5,9,6,9
shuffle8 7,8,5,8
shuffle8 6,3,7,3
shuffle8 4,9,6,9
#store
vmovdqu %ymm5,(%rdi)
vmovdqu %ymm7,32(%rdi)
vmovdqu %ymm6,64(%rdi)
vmovdqu %ymm8,96(%rdi)
vmovdqu %ymm3,128(%rdi)
vmovdqu %ymm9,160(%rdi)
ret
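# ntttobytes_avx2_asm: serializes the 256-coefficient polynomial at (%rsi) into 384 bytes
# at (%rdi); %rdx points to the constants table, from which q is loaded via _16XQ.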
.global cdecl(ntttobytes_avx2_asm)
cdecl(ntttobytes_avx2_asm):
#consts
vmovdqa _16XQ*2(%rdx),%ymm0
call ntttobytes128_avx
add $256,%rsi
add $192,%rdi
call ntttobytes128_avx
ret
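# nttfrombytes128_avx: unpacks 192 bytes at (%rsi) into 128 coefficients of 12 bits each,
# stored as 16-bit words at (%rdi). Caller must load the 12-bit mask into ymm0.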
nttfrombytes128_avx:
#load
vmovdqu (%rsi),%ymm4
vmovdqu 32(%rsi),%ymm5
vmovdqu 64(%rsi),%ymm6
vmovdqu 96(%rsi),%ymm7
vmovdqu 128(%rsi),%ymm8
vmovdqu 160(%rsi),%ymm9
shuffle8 4,7,3,7
shuffle8 5,8,4,8
shuffle8 6,9,5,9
shuffle4 3,8,6,8
shuffle4 7,5,3,5
shuffle4 4,9,7,9
shuffle2 6,5,4,5
shuffle2 8,7,6,7
shuffle2 3,9,8,9
shuffle1 4,7,10,7
shuffle1 5,8,4,8
shuffle1 6,9,5,9
#bitunpack
vpsrlw $12,%ymm10,%ymm11
vpsllw $4,%ymm7,%ymm12
vpor %ymm11,%ymm12,%ymm11
vpand %ymm0,%ymm10,%ymm10
vpand %ymm0,%ymm11,%ymm11
vpsrlw $8,%ymm7,%ymm12
vpsllw $8,%ymm4,%ymm13
vpor %ymm12,%ymm13,%ymm12
vpand %ymm0,%ymm12,%ymm12
vpsrlw $4,%ymm4,%ymm13
vpand %ymm0,%ymm13,%ymm13
vpsrlw $12,%ymm8,%ymm14
vpsllw $4,%ymm5,%ymm15
vpor %ymm14,%ymm15,%ymm14
vpand %ymm0,%ymm8,%ymm8
vpand %ymm0,%ymm14,%ymm14
vpsrlw $8,%ymm5,%ymm15
vpsllw $8,%ymm9,%ymm1
vpor %ymm15,%ymm1,%ymm15
vpand %ymm0,%ymm15,%ymm15
vpsrlw $4,%ymm9,%ymm1
vpand %ymm0,%ymm1,%ymm1
#store
vmovdqa %ymm10,(%rdi)
vmovdqa %ymm11,32(%rdi)
vmovdqa %ymm12,64(%rdi)
vmovdqa %ymm13,96(%rdi)
vmovdqa %ymm8,128(%rdi)
vmovdqa %ymm14,160(%rdi)
vmovdqa %ymm15,192(%rdi)
vmovdqa %ymm1,224(%rdi)
ret
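# nttfrombytes_avx2_asm: deserializes 384 bytes at (%rsi) into the 256-coefficient polynomial
# at (%rdi); %rdx points to the constants table, from which the 12-bit mask is loaded via _16XMASK.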
.global cdecl(nttfrombytes_avx2_asm)
cdecl(nttfrombytes_avx2_asm):
#consts
vmovdqa _16XMASK*2(%rdx),%ymm0
call nttfrombytes128_avx
add $256,%rdi
add $192,%rsi
call nttfrombytes128_avx
ret