- #include "kyber512r3_consts_avx2.h"
- // The small macros from the .inc files are inlined directly into this .S file
- /*****.include "shuffle.inc"*****/
- /********************************/
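- /* shuffle8: interleave the 128-bit lanes of r0,r1: r2 = (r0 low | r1 low), r3 = (r0 high | r1 high) */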
- .macro shuffle8 r0,r1,r2,r3
- vperm2i128 $0x20,%ymm\r1,%ymm\r0,%ymm\r2
- vperm2i128 $0x31,%ymm\r1,%ymm\r0,%ymm\r3
- .endm
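- /* shuffle4: interleave 64-bit quadwords of r0,r1 within each 128-bit lane (r2 = low qwords, r3 = high qwords) */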
- .macro shuffle4 r0,r1,r2,r3
- vpunpcklqdq %ymm\r1,%ymm\r0,%ymm\r2
- vpunpckhqdq %ymm\r1,%ymm\r0,%ymm\r3
- .endm
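- /* shuffle2: interleave 32-bit doublewords of r0,r1 (r2 = even dwords, r3 = odd dwords) */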
- .macro shuffle2 r0,r1,r2,r3
- #vpsllq $32,%ymm\r1,%ymm\r2
- vmovsldup %ymm\r1,%ymm\r2
- vpblendd $0xAA,%ymm\r2,%ymm\r0,%ymm\r2
- vpsrlq $32,%ymm\r0,%ymm\r0
- #vmovshdup %ymm\r0,%ymm\r0
- vpblendd $0xAA,%ymm\r1,%ymm\r0,%ymm\r3
- .endm
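- /* shuffle1: interleave 16-bit words of r0,r1 (r2 = even words, r3 = odd words) */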
- .macro shuffle1 r0,r1,r2,r3
- vpslld $16,%ymm\r1,%ymm\r2
- vpblendw $0xAA,%ymm\r2,%ymm\r0,%ymm\r2
- vpsrld $16,%ymm\r0,%ymm\r0
- vpblendw $0xAA,%ymm\r1,%ymm\r0,%ymm\r3
- .endm
- /********************************/
- /*****.include "fq.inc"*****/
- /***************************/
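- /* red16: Barrett-style reduction of the 16-bit coefficients in r modulo q (ymm0 = 16x q, ymm1 = 16x Barrett constant); x is scratch, the optional rs argument selects a rounding multiply-high instead of the arithmetic shift */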
- .macro red16 r,rs=0,x=12
- vpmulhw %ymm1,%ymm\r,%ymm\x
- .if \rs
- vpmulhrsw %ymm\rs,%ymm\x,%ymm\x
- .else
- vpsraw $10,%ymm\x,%ymm\x
- .endif
- vpmullw %ymm0,%ymm\x,%ymm\x
- vpsubw %ymm\x,%ymm\r,%ymm\r
- .endm
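- /* csubq: conditional subtract of q: r -= q, then add q back to lanes that became negative (ymm0 = 16x q); x is scratch */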
- .macro csubq r,x=12
- vpsubw %ymm0,%ymm\r,%ymm\r
- vpsraw $15,%ymm\r,%ymm\x
- vpand %ymm0,%ymm\x,%ymm\x
- vpaddw %ymm\x,%ymm\r,%ymm\r
- .endm
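- /* caddq: add q to the lanes of r that are negative (ymm0 = 16x q); x is scratch */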
- .macro caddq r,x=12
- vpsraw $15,%ymm\r,%ymm\x
- vpand %ymm0,%ymm\x,%ymm\x
- vpaddw %ymm\x,%ymm\r,%ymm\r
- .endm
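- /* fqmulprecomp: multiply b by a fixed constant given as a precomputed pair (al = low/Montgomery part, ah = high part), reducing Montgomery-style against ymm0 = 16x q; result left in b, x is scratch */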
- .macro fqmulprecomp al,ah,b,x=12
- vpmullw %ymm\al,%ymm\b,%ymm\x
- vpmulhw %ymm\ah,%ymm\b,%ymm\b
- vpmulhw %ymm0,%ymm\x,%ymm\x
- vpsubw %ymm\x,%ymm\b,%ymm\b
- .endm
- /***************************/
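- /* butterfly: four Gentleman-Sande (inverse NTT) butterflies: rl_i = rl_i + rh_i and rh_i = montmul(zeta, rh_i - rl_i) using the pre-update values; zl0/zh0 hold the zeta low/high halves for the first two pairs, zl1/zh1 for the last two; ymm12-ymm15 are scratch */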
- .macro butterfly rl0,rl1,rl2,rl3,rh0,rh1,rh2,rh3,zl0=2,zl1=2,zh0=3,zh1=3
- vpsubw %ymm\rl0,%ymm\rh0,%ymm12
- vpaddw %ymm\rh0,%ymm\rl0,%ymm\rl0
- vpsubw %ymm\rl1,%ymm\rh1,%ymm13
- vpmullw %ymm\zl0,%ymm12,%ymm\rh0
- vpaddw %ymm\rh1,%ymm\rl1,%ymm\rl1
- vpsubw %ymm\rl2,%ymm\rh2,%ymm14
- vpmullw %ymm\zl0,%ymm13,%ymm\rh1
- vpaddw %ymm\rh2,%ymm\rl2,%ymm\rl2
- vpsubw %ymm\rl3,%ymm\rh3,%ymm15
- vpmullw %ymm\zl1,%ymm14,%ymm\rh2
- vpaddw %ymm\rh3,%ymm\rl3,%ymm\rl3
- vpmullw %ymm\zl1,%ymm15,%ymm\rh3
- vpmulhw %ymm\zh0,%ymm12,%ymm12
- vpmulhw %ymm\zh0,%ymm13,%ymm13
- vpmulhw %ymm\zh1,%ymm14,%ymm14
- vpmulhw %ymm\zh1,%ymm15,%ymm15
- vpmulhw %ymm0,%ymm\rh0,%ymm\rh0
- vpmulhw %ymm0,%ymm\rh1,%ymm\rh1
- vpmulhw %ymm0,%ymm\rh2,%ymm\rh2
- vpmulhw %ymm0,%ymm\rh3,%ymm\rh3
- #
- #
- vpsubw %ymm\rh0,%ymm12,%ymm\rh0
- vpsubw %ymm\rh1,%ymm13,%ymm\rh1
- vpsubw %ymm\rh2,%ymm14,%ymm\rh2
- vpsubw %ymm\rh3,%ymm15,%ymm\rh3
- .endm
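- /* intt_levels0t5: inverse NTT levels 0-5 on one 128-coefficient half (off = 0 or 1). */
- /* The inputs are first scaled by the precomputed factor in _16XFLO/_16XFHI, then run through six butterfly levels, with the shuffles reordering lanes between levels and red16 keeping intermediate values in range. */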
- .macro intt_levels0t5 off
- /* level 0 */
- vmovdqa _16XFLO*2(%rsi),%ymm2
- vmovdqa _16XFHI*2(%rsi),%ymm3
- vmovdqa (128*\off+ 0)*2(%rdi),%ymm4
- vmovdqa (128*\off+ 32)*2(%rdi),%ymm6
- vmovdqa (128*\off+ 16)*2(%rdi),%ymm5
- vmovdqa (128*\off+ 48)*2(%rdi),%ymm7
- fqmulprecomp 2,3,4
- fqmulprecomp 2,3,6
- fqmulprecomp 2,3,5
- fqmulprecomp 2,3,7
- vmovdqa (128*\off+ 64)*2(%rdi),%ymm8
- vmovdqa (128*\off+ 96)*2(%rdi),%ymm10
- vmovdqa (128*\off+ 80)*2(%rdi),%ymm9
- vmovdqa (128*\off+112)*2(%rdi),%ymm11
- fqmulprecomp 2,3,8
- fqmulprecomp 2,3,10
- fqmulprecomp 2,3,9
- fqmulprecomp 2,3,11
- vpermq $0x4E,(_ZETAS_EXP+(1-\off)*224+208)*2(%rsi),%ymm15
- vpermq $0x4E,(_ZETAS_EXP+(1-\off)*224+176)*2(%rsi),%ymm1
- vpermq $0x4E,(_ZETAS_EXP+(1-\off)*224+224)*2(%rsi),%ymm2
- vpermq $0x4E,(_ZETAS_EXP+(1-\off)*224+192)*2(%rsi),%ymm3
- vmovdqa _REVIDXB*2(%rsi),%ymm12
- vpshufb %ymm12,%ymm15,%ymm15
- vpshufb %ymm12,%ymm1,%ymm1
- vpshufb %ymm12,%ymm2,%ymm2
- vpshufb %ymm12,%ymm3,%ymm3
- butterfly 4,5,8,9,6,7,10,11,15,1,2,3
- /* level 1 */
- vpermq $0x4E,(_ZETAS_EXP+(1-\off)*224+144)*2(%rsi),%ymm2
- vpermq $0x4E,(_ZETAS_EXP+(1-\off)*224+160)*2(%rsi),%ymm3
- vmovdqa _REVIDXB*2(%rsi),%ymm1
- vpshufb %ymm1,%ymm2,%ymm2
- vpshufb %ymm1,%ymm3,%ymm3
- butterfly 4,5,6,7,8,9,10,11,2,2,3,3
- shuffle1 4,5,3,5
- shuffle1 6,7,4,7
- shuffle1 8,9,6,9
- shuffle1 10,11,8,11
- /* level 2 */
- vmovdqa _REVIDXD*2(%rsi),%ymm12
- vpermd (_ZETAS_EXP+(1-\off)*224+112)*2(%rsi),%ymm12,%ymm2
- vpermd (_ZETAS_EXP+(1-\off)*224+128)*2(%rsi),%ymm12,%ymm10
- butterfly 3,4,6,8,5,7,9,11,2,2,10,10
- vmovdqa _16XV*2(%rsi),%ymm1
- red16 3
- shuffle2 3,4,10,4
- shuffle2 6,8,3,8
- shuffle2 5,7,6,7
- shuffle2 9,11,5,11
- /* level 3 */
- vpermq $0x1B,(_ZETAS_EXP+(1-\off)*224+80)*2(%rsi),%ymm2
- vpermq $0x1B,(_ZETAS_EXP+(1-\off)*224+96)*2(%rsi),%ymm9
- butterfly 10,3,6,5,4,8,7,11,2,2,9,9
- shuffle4 10,3,9,3
- shuffle4 6,5,10,5
- shuffle4 4,8,6,8
- shuffle4 7,11,4,11
- /* level 4 */
- vpermq $0x4E,(_ZETAS_EXP+(1-\off)*224+48)*2(%rsi),%ymm2
- vpermq $0x4E,(_ZETAS_EXP+(1-\off)*224+64)*2(%rsi),%ymm7
- butterfly 9,10,6,4,3,5,8,11,2,2,7,7
- red16 9
- shuffle8 9,10,7,10
- shuffle8 6,4,9,4
- shuffle8 3,5,6,5
- shuffle8 8,11,3,11
- /* level 5 */
- vmovdqa (_ZETAS_EXP+(1-\off)*224+16)*2(%rsi),%ymm2
- vmovdqa (_ZETAS_EXP+(1-\off)*224+32)*2(%rsi),%ymm8
- butterfly 7,9,6,3,10,4,5,11,2,2,8,8
- vmovdqa %ymm7,(128*\off+ 0)*2(%rdi)
- vmovdqa %ymm9,(128*\off+ 16)*2(%rdi)
- vmovdqa %ymm6,(128*\off+ 32)*2(%rdi)
- vmovdqa %ymm3,(128*\off+ 48)*2(%rdi)
- vmovdqa %ymm10,(128*\off+ 64)*2(%rdi)
- vmovdqa %ymm4,(128*\off+ 80)*2(%rdi)
- vmovdqa %ymm5,(128*\off+ 96)*2(%rdi)
- vmovdqa %ymm11,(128*\off+112)*2(%rdi)
- .endm
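- /* intt_level6: final inverse NTT level, combining coefficients i and i+128 across the two halves (off = 0 or 1 selects the 64-coefficient block) */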
- .macro intt_level6 off
- /* level 6 */
- vmovdqa (64*\off+ 0)*2(%rdi),%ymm4
- vmovdqa (64*\off+128)*2(%rdi),%ymm8
- vmovdqa (64*\off+ 16)*2(%rdi),%ymm5
- vmovdqa (64*\off+144)*2(%rdi),%ymm9
- vpbroadcastq (_ZETAS_EXP+0)*2(%rsi),%ymm2
- vmovdqa (64*\off+ 32)*2(%rdi),%ymm6
- vmovdqa (64*\off+160)*2(%rdi),%ymm10
- vmovdqa (64*\off+ 48)*2(%rdi),%ymm7
- vmovdqa (64*\off+176)*2(%rdi),%ymm11
- vpbroadcastq (_ZETAS_EXP+4)*2(%rsi),%ymm3
- butterfly 4,5,6,7,8,9,10,11
- .if \off == 0
- red16 4
- .endif
- vmovdqa %ymm4,(64*\off+ 0)*2(%rdi)
- vmovdqa %ymm5,(64*\off+ 16)*2(%rdi)
- vmovdqa %ymm6,(64*\off+ 32)*2(%rdi)
- vmovdqa %ymm7,(64*\off+ 48)*2(%rdi)
- vmovdqa %ymm8,(64*\off+128)*2(%rdi)
- vmovdqa %ymm9,(64*\off+144)*2(%rdi)
- vmovdqa %ymm10,(64*\off+160)*2(%rdi)
- vmovdqa %ymm11,(64*\off+176)*2(%rdi)
- .endm
- .text
- .global cdecl(invntt_avx2_asm)
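- /* invntt_avx2_asm: %rdi = pointer to the 256 16-bit coefficients (transformed in place), %rsi = pointer to the precomputed constants (q, Barrett constant, zetas); ymm0 holds 16x q throughout */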
- cdecl(invntt_avx2_asm):
- vmovdqa _16XQ*2(%rsi),%ymm0
- intt_levels0t5 0
- intt_levels0t5 1
- intt_level6 0
- intt_level6 1
- ret