kyber512r3_ntt_avx2.S

#include "kyber512r3_consts_avx2.h"
// The small macros (.inc files) are inlined directly into this .S file
/*****.include "shuffle.inc"*****/
/********************************/
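/* The shuffle macros regroup coefficients between registers so that each NTT
 * level's butterfly partners end up in different ymm registers. shuffle8
 * works at 128-bit granularity: r2 receives the low 128-bit lanes of r0 and
 * r1, r3 the high lanes. */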
.macro shuffle8 r0,r1,r2,r3
vperm2i128 $0x20,%ymm\r1,%ymm\r0,%ymm\r2
vperm2i128 $0x31,%ymm\r1,%ymm\r0,%ymm\r3
.endm
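/* shuffle4: interleave at 64-bit granularity; within each 128-bit lane, r2
 * collects the low quadwords of r0 and r1 and r3 the high quadwords. */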
.macro shuffle4 r0,r1,r2,r3
vpunpcklqdq %ymm\r1,%ymm\r0,%ymm\r2
vpunpckhqdq %ymm\r1,%ymm\r0,%ymm\r3
.endm
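/* shuffle2: interleave at 32-bit granularity; r2 pairs the even dwords of r0
 * with the even dwords of r1, r3 pairs the odd dwords (r0 is clobbered). The
 * #-commented instructions are an equivalent alternative sequence. */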
.macro shuffle2 r0,r1,r2,r3
#vpsllq $32,%ymm\r1,%ymm\r2
vmovsldup %ymm\r1,%ymm\r2
vpblendd $0xAA,%ymm\r2,%ymm\r0,%ymm\r2
vpsrlq $32,%ymm\r0,%ymm\r0
#vmovshdup %ymm\r0,%ymm\r0
vpblendd $0xAA,%ymm\r1,%ymm\r0,%ymm\r3
.endm
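/* shuffle1: interleave at 16-bit granularity, i.e. single coefficients; r2
 * collects the even-indexed words of r0 and r1, r3 the odd-indexed ones (r0
 * is clobbered). */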
.macro shuffle1 r0,r1,r2,r3
vpslld $16,%ymm\r1,%ymm\r2
vpblendw $0xAA,%ymm\r2,%ymm\r0,%ymm\r2
vpsrld $16,%ymm\r0,%ymm\r0
vpblendw $0xAA,%ymm\r1,%ymm\r0,%ymm\r3
.endm
/********************************/
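/*
 * mul/reduce/update together implement four parallel Montgomery butterflies
 * over 16-bit lanes (q = 3329, R = 2^16). By default ymm15 holds the "low"
 * twiddle factors (zeta*qinv mod R) and ymm2 the "high" ones (zeta); level 6
 * passes other registers so two distinct twiddle vectors can be used. Per
 * lane, the computation is roughly (scalar sketch following the Kyber
 * reference code; the names below are illustrative only):
 *
 *   t   = mulhi(hi, zeta) - mulhi(mullo(hi, zeta*qinv), q)
 *         // t = hi*zeta mod q (up to range), since the table stores
 *         // zeta in Montgomery form
 *   lo' = lo + t
 *   hi' = lo - t
 *
 * mul produces the low products in ymm12-15 and the high products in place
 * in rh0-rh3; reduce and update finish the reduction and the add/sub.
 */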
.macro mul rh0,rh1,rh2,rh3,zl0=15,zl1=15,zh0=2,zh1=2
vpmullw %ymm\zl0,%ymm\rh0,%ymm12
vpmullw %ymm\zl0,%ymm\rh1,%ymm13
vpmullw %ymm\zl1,%ymm\rh2,%ymm14
vpmullw %ymm\zl1,%ymm\rh3,%ymm15
vpmulhw %ymm\zh0,%ymm\rh0,%ymm\rh0
vpmulhw %ymm\zh0,%ymm\rh1,%ymm\rh1
vpmulhw %ymm\zh1,%ymm\rh2,%ymm\rh2
vpmulhw %ymm\zh1,%ymm\rh3,%ymm\rh3
.endm
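/* reduce: multiply the four low products by q (ymm0 holds 16 copies of q) and
 * keep the high halves; these are the Montgomery correction terms consumed by
 * update. */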
.macro reduce
vpmulhw %ymm0,%ymm12,%ymm12
vpmulhw %ymm0,%ymm13,%ymm13
vpmulhw %ymm0,%ymm14,%ymm14
vpmulhw %ymm0,%ymm15,%ymm15
.endm
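/* update: finish the butterflies. The sums land in rln,rl0,rl1,rl2, the
 * differences stay in rh0-rh3, and ymm12-15 are subtracted from the sums and
 * added to the differences to apply the Montgomery correction. rl3 is freed,
 * which is what lets the register names rotate from level to level. */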
.macro update rln,rl0,rl1,rl2,rl3,rh0,rh1,rh2,rh3
vpaddw %ymm\rh0,%ymm\rl0,%ymm\rln
vpsubw %ymm\rh0,%ymm\rl0,%ymm\rh0
vpaddw %ymm\rh1,%ymm\rl1,%ymm\rl0
vpsubw %ymm\rh1,%ymm\rl1,%ymm\rh1
vpaddw %ymm\rh2,%ymm\rl2,%ymm\rl1
vpsubw %ymm\rh2,%ymm\rl2,%ymm\rh2
vpaddw %ymm\rh3,%ymm\rl3,%ymm\rl2
vpsubw %ymm\rh3,%ymm\rl3,%ymm\rh3
vpsubw %ymm12,%ymm\rln,%ymm\rln
vpaddw %ymm12,%ymm\rh0,%ymm\rh0
vpsubw %ymm13,%ymm\rl0,%ymm\rl0
vpaddw %ymm13,%ymm\rh1,%ymm\rh1
vpsubw %ymm14,%ymm\rl1,%ymm\rl1
vpaddw %ymm14,%ymm\rh2,%ymm\rh2
vpsubw %ymm15,%ymm\rl2,%ymm\rl2
vpaddw %ymm15,%ymm\rh3,%ymm\rh3
.endm
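/*
 * level0: the first NTT level. Each invocation loads 64 coefficients from the
 * upper half of the polynomial (word offset +128), multiplies them by the
 * single level-0 twiddle broadcast into all 16 lanes of ymm15/ymm2, and
 * butterflies them against the matching 64 coefficients of the lower half.
 * Two calls (off = 0, 1) cover all 256 coefficients.
 */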
.macro level0 off
vpbroadcastq (_ZETAS_EXP+0)*2(%rsi),%ymm15
vmovdqa (64*\off+128)*2(%rdi),%ymm8
vmovdqa (64*\off+144)*2(%rdi),%ymm9
vmovdqa (64*\off+160)*2(%rdi),%ymm10
vmovdqa (64*\off+176)*2(%rdi),%ymm11
vpbroadcastq (_ZETAS_EXP+4)*2(%rsi),%ymm2
mul 8,9,10,11
vmovdqa (64*\off+ 0)*2(%rdi),%ymm4
vmovdqa (64*\off+ 16)*2(%rdi),%ymm5
vmovdqa (64*\off+ 32)*2(%rdi),%ymm6
vmovdqa (64*\off+ 48)*2(%rdi),%ymm7
reduce
update 3,4,5,6,7,8,9,10,11
vmovdqa %ymm3,(64*\off+ 0)*2(%rdi)
vmovdqa %ymm4,(64*\off+ 16)*2(%rdi)
vmovdqa %ymm5,(64*\off+ 32)*2(%rdi)
vmovdqa %ymm6,(64*\off+ 48)*2(%rdi)
vmovdqa %ymm8,(64*\off+128)*2(%rdi)
vmovdqa %ymm9,(64*\off+144)*2(%rdi)
vmovdqa %ymm10,(64*\off+160)*2(%rdi)
vmovdqa %ymm11,(64*\off+176)*2(%rdi)
.endm
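/*
 * levels1t6: NTT levels 1-6 on one 128-coefficient half (off = 0 or 1), kept
 * entirely in registers. After each level the shuffle macros regroup the
 * coefficients so that the next level's butterfly partners, at half the
 * previous distance, again sit in separate registers; level 6 loads full
 * vectors of per-pair twiddles instead of broadcast values. The final stores
 * write the coefficients back in the permuted order used by the other AVX2
 * Kyber routines rather than the standard bit-reversed order.
 */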
.macro levels1t6 off
/* level 1 */
vmovdqa (_ZETAS_EXP+224*\off+16)*2(%rsi),%ymm15
vmovdqa (128*\off+ 64)*2(%rdi),%ymm8
vmovdqa (128*\off+ 80)*2(%rdi),%ymm9
vmovdqa (128*\off+ 96)*2(%rdi),%ymm10
vmovdqa (128*\off+112)*2(%rdi),%ymm11
vmovdqa (_ZETAS_EXP+224*\off+32)*2(%rsi),%ymm2
mul 8,9,10,11
vmovdqa (128*\off+ 0)*2(%rdi),%ymm4
vmovdqa (128*\off+ 16)*2(%rdi),%ymm5
vmovdqa (128*\off+ 32)*2(%rdi),%ymm6
vmovdqa (128*\off+ 48)*2(%rdi),%ymm7
reduce
update 3,4,5,6,7,8,9,10,11
/* level 2 */
shuffle8 5,10,7,10
shuffle8 6,11,5,11
vmovdqa (_ZETAS_EXP+224*\off+48)*2(%rsi),%ymm15
vmovdqa (_ZETAS_EXP+224*\off+64)*2(%rsi),%ymm2
mul 7,10,5,11
shuffle8 3,8,6,8
shuffle8 4,9,3,9
reduce
update 4,6,8,3,9,7,10,5,11
/* level 3 */
shuffle4 8,5,9,5
shuffle4 3,11,8,11
vmovdqa (_ZETAS_EXP+224*\off+80)*2(%rsi),%ymm15
vmovdqa (_ZETAS_EXP+224*\off+96)*2(%rsi),%ymm2
mul 9,5,8,11
shuffle4 4,7,3,7
shuffle4 6,10,4,10
reduce
update 6,3,7,4,10,9,5,8,11
/* level 4 */
shuffle2 7,8,10,8
shuffle2 4,11,7,11
vmovdqa (_ZETAS_EXP+224*\off+112)*2(%rsi),%ymm15
vmovdqa (_ZETAS_EXP+224*\off+128)*2(%rsi),%ymm2
mul 10,8,7,11
shuffle2 6,9,4,9
shuffle2 3,5,6,5
reduce
update 3,4,9,6,5,10,8,7,11
/* level 5 */
shuffle1 9,7,5,7
shuffle1 6,11,9,11
vmovdqa (_ZETAS_EXP+224*\off+144)*2(%rsi),%ymm15
vmovdqa (_ZETAS_EXP+224*\off+160)*2(%rsi),%ymm2
mul 5,7,9,11
shuffle1 3,10,6,10
shuffle1 4,8,3,8
reduce
update 4,6,10,3,8,5,7,9,11
/* level 6 */
vmovdqa (_ZETAS_EXP+224*\off+176)*2(%rsi),%ymm14
vmovdqa (_ZETAS_EXP+224*\off+208)*2(%rsi),%ymm15
vmovdqa (_ZETAS_EXP+224*\off+192)*2(%rsi),%ymm8
vmovdqa (_ZETAS_EXP+224*\off+224)*2(%rsi),%ymm2
mul 10,3,9,11,14,15,8,2
reduce
update 8,4,6,5,7,10,3,9,11
vmovdqa %ymm8,(128*\off+ 0)*2(%rdi)
vmovdqa %ymm4,(128*\off+ 16)*2(%rdi)
vmovdqa %ymm10,(128*\off+ 32)*2(%rdi)
vmovdqa %ymm3,(128*\off+ 48)*2(%rdi)
vmovdqa %ymm6,(128*\off+ 64)*2(%rdi)
vmovdqa %ymm5,(128*\off+ 80)*2(%rdi)
vmovdqa %ymm9,(128*\off+ 96)*2(%rdi)
vmovdqa %ymm11,(128*\off+112)*2(%rdi)
.endm
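/*
 * Entry point. rdi points to the 256 int16_t coefficients, rsi to the
 * constants table from kyber512r3_consts_avx2.h (_16XQ, _ZETAS_EXP, ...).
 * ymm0 is loaded once with 16 copies of q and stays live across all levels.
 * (The argument names above are descriptive; they are not taken from a C
 * prototype.)
 */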
.text
.global cdecl(ntt_avx2_asm)
cdecl(ntt_avx2_asm):
vmovdqa _16XQ*2(%rsi),%ymm0
level0 0
level0 1
levels1t6 0
levels1t6 1
ret