// kyber512r3_invntt_avx2.S

#include "kyber512r3_consts_avx2.h"
// The small macros from the .inc files are inlined directly into the .S files
/*****.include "shuffle.inc"*****/
/********************************/
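// The shuffle{8,4,2,1} macros interleave the packed 16-bit coefficients of
// registers r0 and r1 at 128-, 64-, 32- and 16-bit granularity respectively,
// writing the two interleaved results to r2 and r3. They are used between
// butterfly levels to bring coefficients with the required stride next to
// each other inside the ymm registers.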
.macro shuffle8 r0,r1,r2,r3
vperm2i128 $0x20,%ymm\r1,%ymm\r0,%ymm\r2
vperm2i128 $0x31,%ymm\r1,%ymm\r0,%ymm\r3
.endm
.macro shuffle4 r0,r1,r2,r3
vpunpcklqdq %ymm\r1,%ymm\r0,%ymm\r2
vpunpckhqdq %ymm\r1,%ymm\r0,%ymm\r3
.endm
.macro shuffle2 r0,r1,r2,r3
#vpsllq $32,%ymm\r1,%ymm\r2
vmovsldup %ymm\r1,%ymm\r2
vpblendd $0xAA,%ymm\r2,%ymm\r0,%ymm\r2
vpsrlq $32,%ymm\r0,%ymm\r0
#vmovshdup %ymm\r0,%ymm\r0
vpblendd $0xAA,%ymm\r1,%ymm\r0,%ymm\r3
.endm
.macro shuffle1 r0,r1,r2,r3
vpslld $16,%ymm\r1,%ymm\r2
vpblendw $0xAA,%ymm\r2,%ymm\r0,%ymm\r2
vpsrld $16,%ymm\r0,%ymm\r0
vpblendw $0xAA,%ymm\r1,%ymm\r0,%ymm\r3
.endm
/********************************/
/*****.include "fq.inc"*****/
/***************************/
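// red16: Barrett-style reduction of the 16 signed coefficients in register \r.
// Relies on ymm0 holding 16 copies of q and ymm1 holding 16 copies of the
// reduction constant (_16XV); the optional rs argument switches the plain
// arithmetic shift to a rounding multiply via vpmulhrsw.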
.macro red16 r,rs=0,x=12
vpmulhw %ymm1,%ymm\r,%ymm\x
.if \rs
vpmulhrsw %ymm\rs,%ymm\x,%ymm\x
.else
vpsraw $10,%ymm\x,%ymm\x
.endif
vpmullw %ymm0,%ymm\x,%ymm\x
vpsubw %ymm\x,%ymm\r,%ymm\r
.endm
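// csubq: conditional subtraction of q. Subtracts q from every 16-bit lane of
// register \r and adds it back wherever the result went negative, mapping
// values in [0, 2q) to [0, q).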
.macro csubq r,x=12
vpsubw %ymm0,%ymm\r,%ymm\r
vpsraw $15,%ymm\r,%ymm\x
vpand %ymm0,%ymm\x,%ymm\x
vpaddw %ymm\x,%ymm\r,%ymm\r
.endm
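// caddq: conditional addition of q. Adds q to every 16-bit lane of register
// \r that is negative, mapping values in (-q, q) to [0, q).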
.macro caddq r,x=12
vpsraw $15,%ymm\r,%ymm\x
vpand %ymm0,%ymm\x,%ymm\x
vpaddw %ymm\x,%ymm\r,%ymm\r
.endm
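// fqmulprecomp: multiply the coefficients in register \b by a constant given
// in precomputed form and reduce the products Montgomery-style: \al is
// expected to hold the constant premultiplied by q^-1 mod 2^16 and \ah the
// constant itself; the reduced result overwrites \b.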
.macro fqmulprecomp al,ah,b,x=12
vpmullw %ymm\al,%ymm\b,%ymm\x
vpmulhw %ymm\ah,%ymm\b,%ymm\b
vpmulhw %ymm0,%ymm\x,%ymm\x
vpsubw %ymm\x,%ymm\b,%ymm\b
.endm
/***************************/
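// butterfly: Gentleman-Sande (inverse-NTT) butterfly on eight register pairs
// (rl_i, rh_i). Each rl_i is replaced by the sum rl_i + rh_i, and each rh_i
// by the difference rh_i - rl_i multiplied by a zeta with Montgomery
// reduction; zl0/zl1 hold zetas premultiplied by q^-1 mod 2^16, zh0/zh1 the
// zetas themselves. ymm12-ymm15 are clobbered as temporaries.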
.macro butterfly rl0,rl1,rl2,rl3,rh0,rh1,rh2,rh3,zl0=2,zl1=2,zh0=3,zh1=3
vpsubw %ymm\rl0,%ymm\rh0,%ymm12
vpaddw %ymm\rh0,%ymm\rl0,%ymm\rl0
vpsubw %ymm\rl1,%ymm\rh1,%ymm13
vpmullw %ymm\zl0,%ymm12,%ymm\rh0
vpaddw %ymm\rh1,%ymm\rl1,%ymm\rl1
vpsubw %ymm\rl2,%ymm\rh2,%ymm14
vpmullw %ymm\zl0,%ymm13,%ymm\rh1
vpaddw %ymm\rh2,%ymm\rl2,%ymm\rl2
vpsubw %ymm\rl3,%ymm\rh3,%ymm15
vpmullw %ymm\zl1,%ymm14,%ymm\rh2
vpaddw %ymm\rh3,%ymm\rl3,%ymm\rl3
vpmullw %ymm\zl1,%ymm15,%ymm\rh3
vpmulhw %ymm\zh0,%ymm12,%ymm12
vpmulhw %ymm\zh0,%ymm13,%ymm13
vpmulhw %ymm\zh1,%ymm14,%ymm14
vpmulhw %ymm\zh1,%ymm15,%ymm15
vpmulhw %ymm0,%ymm\rh0,%ymm\rh0
vpmulhw %ymm0,%ymm\rh1,%ymm\rh1
vpmulhw %ymm0,%ymm\rh2,%ymm\rh2
vpmulhw %ymm0,%ymm\rh3,%ymm\rh3
#
#
vpsubw %ymm\rh0,%ymm12,%ymm\rh0
vpsubw %ymm\rh1,%ymm13,%ymm\rh1
vpsubw %ymm\rh2,%ymm14,%ymm\rh2
vpsubw %ymm\rh3,%ymm15,%ymm\rh3
.endm
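// intt_levels0t5: inverse-NTT levels 0 to 5 on one 128-coefficient half of
// the polynomial, selected by \off. Level 0 first scales the loaded
// coefficients by the precomputed factor in _16XFLO/_16XFHI (which appears
// to fold the inverse-NTT scaling and the Montgomery factor into a single
// multiplication); the remaining levels alternate butterflies with the
// shuffle macros and apply red16 twice to keep coefficients in range before
// the results are written back.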
.macro intt_levels0t5 off
/* level 0 */
vmovdqa _16XFLO*2(%rsi),%ymm2
vmovdqa _16XFHI*2(%rsi),%ymm3
vmovdqa (128*\off+ 0)*2(%rdi),%ymm4
vmovdqa (128*\off+ 32)*2(%rdi),%ymm6
vmovdqa (128*\off+ 16)*2(%rdi),%ymm5
vmovdqa (128*\off+ 48)*2(%rdi),%ymm7
fqmulprecomp 2,3,4
fqmulprecomp 2,3,6
fqmulprecomp 2,3,5
fqmulprecomp 2,3,7
vmovdqa (128*\off+ 64)*2(%rdi),%ymm8
vmovdqa (128*\off+ 96)*2(%rdi),%ymm10
vmovdqa (128*\off+ 80)*2(%rdi),%ymm9
vmovdqa (128*\off+112)*2(%rdi),%ymm11
fqmulprecomp 2,3,8
fqmulprecomp 2,3,10
fqmulprecomp 2,3,9
fqmulprecomp 2,3,11
vpermq $0x4E,(_ZETAS_EXP+(1-\off)*224+208)*2(%rsi),%ymm15
vpermq $0x4E,(_ZETAS_EXP+(1-\off)*224+176)*2(%rsi),%ymm1
vpermq $0x4E,(_ZETAS_EXP+(1-\off)*224+224)*2(%rsi),%ymm2
vpermq $0x4E,(_ZETAS_EXP+(1-\off)*224+192)*2(%rsi),%ymm3
vmovdqa _REVIDXB*2(%rsi),%ymm12
vpshufb %ymm12,%ymm15,%ymm15
vpshufb %ymm12,%ymm1,%ymm1
vpshufb %ymm12,%ymm2,%ymm2
vpshufb %ymm12,%ymm3,%ymm3
butterfly 4,5,8,9,6,7,10,11,15,1,2,3
/* level 1 */
vpermq $0x4E,(_ZETAS_EXP+(1-\off)*224+144)*2(%rsi),%ymm2
vpermq $0x4E,(_ZETAS_EXP+(1-\off)*224+160)*2(%rsi),%ymm3
vmovdqa _REVIDXB*2(%rsi),%ymm1
vpshufb %ymm1,%ymm2,%ymm2
vpshufb %ymm1,%ymm3,%ymm3
butterfly 4,5,6,7,8,9,10,11,2,2,3,3
shuffle1 4,5,3,5
shuffle1 6,7,4,7
shuffle1 8,9,6,9
shuffle1 10,11,8,11
/* level 2 */
vmovdqa _REVIDXD*2(%rsi),%ymm12
vpermd (_ZETAS_EXP+(1-\off)*224+112)*2(%rsi),%ymm12,%ymm2
vpermd (_ZETAS_EXP+(1-\off)*224+128)*2(%rsi),%ymm12,%ymm10
butterfly 3,4,6,8,5,7,9,11,2,2,10,10
vmovdqa _16XV*2(%rsi),%ymm1
red16 3
shuffle2 3,4,10,4
shuffle2 6,8,3,8
shuffle2 5,7,6,7
shuffle2 9,11,5,11
/* level 3 */
vpermq $0x1B,(_ZETAS_EXP+(1-\off)*224+80)*2(%rsi),%ymm2
vpermq $0x1B,(_ZETAS_EXP+(1-\off)*224+96)*2(%rsi),%ymm9
butterfly 10,3,6,5,4,8,7,11,2,2,9,9
shuffle4 10,3,9,3
shuffle4 6,5,10,5
shuffle4 4,8,6,8
shuffle4 7,11,4,11
/* level 4 */
vpermq $0x4E,(_ZETAS_EXP+(1-\off)*224+48)*2(%rsi),%ymm2
vpermq $0x4E,(_ZETAS_EXP+(1-\off)*224+64)*2(%rsi),%ymm7
butterfly 9,10,6,4,3,5,8,11,2,2,7,7
red16 9
shuffle8 9,10,7,10
shuffle8 6,4,9,4
shuffle8 3,5,6,5
shuffle8 8,11,3,11
/* level 5 */
vmovdqa (_ZETAS_EXP+(1-\off)*224+16)*2(%rsi),%ymm2
vmovdqa (_ZETAS_EXP+(1-\off)*224+32)*2(%rsi),%ymm8
butterfly 7,9,6,3,10,4,5,11,2,2,8,8
vmovdqa %ymm7,(128*\off+ 0)*2(%rdi)
vmovdqa %ymm9,(128*\off+ 16)*2(%rdi)
vmovdqa %ymm6,(128*\off+ 32)*2(%rdi)
vmovdqa %ymm3,(128*\off+ 48)*2(%rdi)
vmovdqa %ymm10,(128*\off+ 64)*2(%rdi)
vmovdqa %ymm4,(128*\off+ 80)*2(%rdi)
vmovdqa %ymm5,(128*\off+ 96)*2(%rdi)
vmovdqa %ymm11,(128*\off+112)*2(%rdi)
.endm
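// intt_level6: the final inverse-NTT level, combining coefficients that lie
// 128 apart across the two halves. A single zeta pair is broadcast from the
// start of _ZETAS_EXP; for \off == 0 one additional red16 is applied to the
// first block of sums.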
.macro intt_level6 off
/* level 6 */
vmovdqa (64*\off+ 0)*2(%rdi),%ymm4
vmovdqa (64*\off+128)*2(%rdi),%ymm8
vmovdqa (64*\off+ 16)*2(%rdi),%ymm5
vmovdqa (64*\off+144)*2(%rdi),%ymm9
vpbroadcastq (_ZETAS_EXP+0)*2(%rsi),%ymm2
vmovdqa (64*\off+ 32)*2(%rdi),%ymm6
vmovdqa (64*\off+160)*2(%rdi),%ymm10
vmovdqa (64*\off+ 48)*2(%rdi),%ymm7
vmovdqa (64*\off+176)*2(%rdi),%ymm11
vpbroadcastq (_ZETAS_EXP+4)*2(%rsi),%ymm3
butterfly 4,5,6,7,8,9,10,11
.if \off == 0
red16 4
.endif
vmovdqa %ymm4,(64*\off+ 0)*2(%rdi)
vmovdqa %ymm5,(64*\off+ 16)*2(%rdi)
vmovdqa %ymm6,(64*\off+ 32)*2(%rdi)
vmovdqa %ymm7,(64*\off+ 48)*2(%rdi)
vmovdqa %ymm8,(64*\off+128)*2(%rdi)
vmovdqa %ymm9,(64*\off+144)*2(%rdi)
vmovdqa %ymm10,(64*\off+160)*2(%rdi)
vmovdqa %ymm11,(64*\off+176)*2(%rdi)
.endm
.text
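// invntt_avx2_asm: inverse NTT over the 256 packed 16-bit coefficients
// pointed to by %rdi, reading q, the reduction constants and the expanded
// zeta table from the constants block pointed to by %rsi (offsets _16XQ,
// _16XV, _16XFLO/_16XFHI, _REVIDXB/_REVIDXD and _ZETAS_EXP from
// kyber512r3_consts_avx2.h). Levels 0-5 run once per 128-coefficient half;
// level 6 then combines the two halves.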
.global cdecl(invntt_avx2_asm)
cdecl(invntt_avx2_asm):
vmovdqa _16XQ*2(%rsi),%ymm0
intt_levels0t5 0
intt_levels0t5 1
intt_level6 0
intt_level6 1
ret