#include "kyber512r3_consts_avx2.h"
// The small helper macros (formerly separate .inc files) are inlined directly into this .S file
/*****.include "shuffle.inc"*****/
/********************************/
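/*
 * shuffle8: interleave the two 128-bit lanes of r0 and r1: r2 receives both
 * low lanes, r3 both high lanes. Used between NTT levels to bring butterfly
 * pairs into matching word positions within a register.
 */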
.macro shuffle8 r0,r1,r2,r3
vperm2i128 $0x20,%ymm\r1,%ymm\r0,%ymm\r2
vperm2i128 $0x31,%ymm\r1,%ymm\r0,%ymm\r3
.endm
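/*
 * shuffle4: interleave r0 and r1 at 64-bit granularity: r2 receives the
 * even-indexed quadwords of both inputs, r3 the odd-indexed ones.
 */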
.macro shuffle4 r0,r1,r2,r3
vpunpcklqdq %ymm\r1,%ymm\r0,%ymm\r2
vpunpckhqdq %ymm\r1,%ymm\r0,%ymm\r3
.endm
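/*
 * shuffle2: interleave r0 and r1 at 32-bit granularity: even dwords to r2,
 * odd dwords to r3, via duplicate-and-blend. The commented-out
 * vpsllq/vmovshdup lines are an equivalent shift-based alternative.
 */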
.macro shuffle2 r0,r1,r2,r3
#vpsllq $32,%ymm\r1,%ymm\r2
vmovsldup %ymm\r1,%ymm\r2
vpblendd $0xAA,%ymm\r2,%ymm\r0,%ymm\r2
vpsrlq $32,%ymm\r0,%ymm\r0
#vmovshdup %ymm\r0,%ymm\r0
vpblendd $0xAA,%ymm\r1,%ymm\r0,%ymm\r3
.endm
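/*
 * shuffle1: interleave r0 and r1 at 16-bit granularity: even words to r2,
 * odd words to r3.
 */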
.macro shuffle1 r0,r1,r2,r3
vpslld $16,%ymm\r1,%ymm\r2
vpblendw $0xAA,%ymm\r2,%ymm\r0,%ymm\r2
vpsrld $16,%ymm\r0,%ymm\r0
vpblendw $0xAA,%ymm\r1,%ymm\r0,%ymm\r3
.endm
/********************************/
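/*
 * mul: multiply four coefficient registers by twiddle factors, producing
 * both halves of each 16-bit product for Montgomery reduction: zl0/zl1 are
 * expected to hold the zetas premultiplied by q^-1 mod 2^16 (low product
 * halves go to ymm12-15), zh0/zh1 the plain zetas (high halves overwrite
 * rh0-rh3). The defaults (15 and 2) match the registers loaded by the
 * level macros below.
 */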
.macro mul rh0,rh1,rh2,rh3,zl0=15,zl1=15,zh0=2,zh1=2
vpmullw %ymm\zl0,%ymm\rh0,%ymm12
vpmullw %ymm\zl0,%ymm\rh1,%ymm13
vpmullw %ymm\zl1,%ymm\rh2,%ymm14
vpmullw %ymm\zl1,%ymm\rh3,%ymm15
vpmulhw %ymm\zh0,%ymm\rh0,%ymm\rh0
vpmulhw %ymm\zh0,%ymm\rh1,%ymm\rh1
vpmulhw %ymm\zh1,%ymm\rh2,%ymm\rh2
vpmulhw %ymm\zh1,%ymm\rh3,%ymm\rh3
.endm
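/*
 * reduce: finish the Montgomery products of the low halves: ymm12-15 =
 * hi(lo_product * q), with q broadcast in ymm0. Folding these into the
 * butterflies (done in update) completes the reduction.
 */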
.macro reduce
vpmulhw %ymm0,%ymm12,%ymm12
vpmulhw %ymm0,%ymm13,%ymm13
vpmulhw %ymm0,%ymm14,%ymm14
vpmulhw %ymm0,%ymm15,%ymm15
.endm
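/*
 * update: Cooley-Tukey butterflies fused with the Montgomery correction.
 * For each pair: sum = rl + hi - corr and diff = rl - hi + corr (corr from
 * ymm12-15), i.e. rl +/- zeta*rh in Montgomery form. The outputs rotate
 * register names: rln takes the first sum (rl0 is still needed as an
 * input), and the differences overwrite rh0-rh3.
 */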
.macro update rln,rl0,rl1,rl2,rl3,rh0,rh1,rh2,rh3
vpaddw %ymm\rh0,%ymm\rl0,%ymm\rln
vpsubw %ymm\rh0,%ymm\rl0,%ymm\rh0
vpaddw %ymm\rh1,%ymm\rl1,%ymm\rl0
vpsubw %ymm\rh1,%ymm\rl1,%ymm\rh1
vpaddw %ymm\rh2,%ymm\rl2,%ymm\rl1
vpsubw %ymm\rh2,%ymm\rl2,%ymm\rh2
vpaddw %ymm\rh3,%ymm\rl3,%ymm\rl2
vpsubw %ymm\rh3,%ymm\rl3,%ymm\rh3
vpsubw %ymm12,%ymm\rln,%ymm\rln
vpaddw %ymm12,%ymm\rh0,%ymm\rh0
vpsubw %ymm13,%ymm\rl0,%ymm\rl0
vpaddw %ymm13,%ymm\rh1,%ymm\rh1
vpsubw %ymm14,%ymm\rl1,%ymm\rl1
vpaddw %ymm14,%ymm\rh2,%ymm\rh2
vpsubw %ymm15,%ymm\rl2,%ymm\rl2
vpaddw %ymm15,%ymm\rh3,%ymm\rh3
.endm
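/*
 * level0: first of the seven NTT levels, pairing coefficient i with i+128.
 * The single level-0 zeta is broadcast from the start of _ZETAS_EXP; each
 * invocation (off = 0 or 1) handles half of the 256 coefficients.
 */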
.macro level0 off
vpbroadcastq (_ZETAS_EXP+0)*2(%rsi),%ymm15
vmovdqa (64*\off+128)*2(%rdi),%ymm8
vmovdqa (64*\off+144)*2(%rdi),%ymm9
vmovdqa (64*\off+160)*2(%rdi),%ymm10
vmovdqa (64*\off+176)*2(%rdi),%ymm11
vpbroadcastq (_ZETAS_EXP+4)*2(%rsi),%ymm2
mul 8,9,10,11
vmovdqa (64*\off+ 0)*2(%rdi),%ymm4
vmovdqa (64*\off+ 16)*2(%rdi),%ymm5
vmovdqa (64*\off+ 32)*2(%rdi),%ymm6
vmovdqa (64*\off+ 48)*2(%rdi),%ymm7
reduce
update 3,4,5,6,7,8,9,10,11
vmovdqa %ymm3,(64*\off+ 0)*2(%rdi)
vmovdqa %ymm4,(64*\off+ 16)*2(%rdi)
vmovdqa %ymm5,(64*\off+ 32)*2(%rdi)
vmovdqa %ymm6,(64*\off+ 48)*2(%rdi)
vmovdqa %ymm8,(64*\off+128)*2(%rdi)
vmovdqa %ymm9,(64*\off+144)*2(%rdi)
vmovdqa %ymm10,(64*\off+160)*2(%rdi)
vmovdqa %ymm11,(64*\off+176)*2(%rdi)
.endm
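/*
 * levels1t6: NTT levels 1 through 6 on one 128-coefficient half (off = 0
 * or 1). The butterfly distance halves at each level; the shuffle8/4/2/1
 * calls progressively re-interleave the coefficients so every level's
 * butterflies still operate on whole registers, with full per-lane zeta
 * vectors loaded from the _ZETAS_EXP table.
 */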
.macro levels1t6 off
/* level 1 */
vmovdqa (_ZETAS_EXP+224*\off+16)*2(%rsi),%ymm15
vmovdqa (128*\off+ 64)*2(%rdi),%ymm8
vmovdqa (128*\off+ 80)*2(%rdi),%ymm9
vmovdqa (128*\off+ 96)*2(%rdi),%ymm10
vmovdqa (128*\off+112)*2(%rdi),%ymm11
vmovdqa (_ZETAS_EXP+224*\off+32)*2(%rsi),%ymm2
mul 8,9,10,11
vmovdqa (128*\off+ 0)*2(%rdi),%ymm4
vmovdqa (128*\off+ 16)*2(%rdi),%ymm5
vmovdqa (128*\off+ 32)*2(%rdi),%ymm6
vmovdqa (128*\off+ 48)*2(%rdi),%ymm7
reduce
update 3,4,5,6,7,8,9,10,11
/* level 2 */
shuffle8 5,10,7,10
shuffle8 6,11,5,11
vmovdqa (_ZETAS_EXP+224*\off+48)*2(%rsi),%ymm15
vmovdqa (_ZETAS_EXP+224*\off+64)*2(%rsi),%ymm2
mul 7,10,5,11
shuffle8 3,8,6,8
shuffle8 4,9,3,9
reduce
update 4,6,8,3,9,7,10,5,11
/* level 3 */
shuffle4 8,5,9,5
shuffle4 3,11,8,11
vmovdqa (_ZETAS_EXP+224*\off+80)*2(%rsi),%ymm15
vmovdqa (_ZETAS_EXP+224*\off+96)*2(%rsi),%ymm2
mul 9,5,8,11
shuffle4 4,7,3,7
shuffle4 6,10,4,10
reduce
update 6,3,7,4,10,9,5,8,11
/* level 4 */
shuffle2 7,8,10,8
shuffle2 4,11,7,11
vmovdqa (_ZETAS_EXP+224*\off+112)*2(%rsi),%ymm15
vmovdqa (_ZETAS_EXP+224*\off+128)*2(%rsi),%ymm2
mul 10,8,7,11
shuffle2 6,9,4,9
shuffle2 3,5,6,5
reduce
update 3,4,9,6,5,10,8,7,11
/* level 5 */
shuffle1 9,7,5,7
shuffle1 6,11,9,11
vmovdqa (_ZETAS_EXP+224*\off+144)*2(%rsi),%ymm15
vmovdqa (_ZETAS_EXP+224*\off+160)*2(%rsi),%ymm2
mul 5,7,9,11
shuffle1 3,10,6,10
shuffle1 4,8,3,8
reduce
update 4,6,10,3,8,5,7,9,11
/* level 6 */
vmovdqa (_ZETAS_EXP+224*\off+176)*2(%rsi),%ymm14
vmovdqa (_ZETAS_EXP+224*\off+208)*2(%rsi),%ymm15
vmovdqa (_ZETAS_EXP+224*\off+192)*2(%rsi),%ymm8
vmovdqa (_ZETAS_EXP+224*\off+224)*2(%rsi),%ymm2
mul 10,3,9,11,14,15,8,2
reduce
update 8,4,6,5,7,10,3,9,11
vmovdqa %ymm8,(128*\off+ 0)*2(%rdi)
vmovdqa %ymm4,(128*\off+ 16)*2(%rdi)
vmovdqa %ymm10,(128*\off+ 32)*2(%rdi)
vmovdqa %ymm3,(128*\off+ 48)*2(%rdi)
vmovdqa %ymm6,(128*\off+ 64)*2(%rdi)
vmovdqa %ymm5,(128*\off+ 80)*2(%rdi)
vmovdqa %ymm9,(128*\off+ 96)*2(%rdi)
vmovdqa %ymm11,(128*\off+112)*2(%rdi)
.endm
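/*
 * ntt_avx2_asm entry point: %rdi points at the 256 16-bit coefficients,
 * %rsi at the constants table (the modulus at _16XQ, twiddle factors at
 * _ZETAS_EXP). ymm0 keeps the modulus broadcast for the whole transform.
 * Two level0 passes cover the full array; levels1t6 then runs once per
 * 128-coefficient half.
 */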
.text
.global cdecl(ntt_avx2_asm)
cdecl(ntt_avx2_asm):
vmovdqa _16XQ*2(%rsi),%ymm0
level0 0
level0 1
levels1t6 0
levels1t6 1
ret