- #include "kyber512r3_consts_avx2.h"
- // The small macros from the reference .inc files are combined directly into the .S files
- /*****.include "fq.inc"*****/
- /***************************/
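- // red16: Barrett reduction of 16 packed signed 16-bit coefficients in ymm\r.
- // Expects %ymm0 = 16x q (_16XQ) and %ymm1 = 16x the Barrett factor v ~ 2^26/q
- // (_16XV), as loaded by the callers below; ymm\x is clobbered as scratch.
- // Per-lane sketch: t = ((int32_t)a * v) >> 26; a -= t * q;
- // The optional \rs argument swaps the arithmetic shift for a rounding multiply.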
- .macro red16 r,rs=0,x=12
- vpmulhw %ymm1,%ymm\r,%ymm\x
- .if \rs
- vpmulhrsw %ymm\rs,%ymm\x,%ymm\x
- .else
- vpsraw $10,%ymm\x,%ymm\x
- .endif
- vpmullw %ymm0,%ymm\x,%ymm\x
- vpsubw %ymm\x,%ymm\r,%ymm\r
- .endm
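- // csubq: conditional subtract q. Per lane: a -= q; a += (a >> 15) & q, i.e.
- // subtract q and add it back if the result went negative. Not referenced by
- // the routines in this file.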
- .macro csubq r,x=12
- vpsubw %ymm0,%ymm\r,%ymm\r
- vpsraw $15,%ymm\r,%ymm\x
- vpand %ymm0,%ymm\x,%ymm\x
- vpaddw %ymm\x,%ymm\r,%ymm\r
- .endm
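- // caddq: conditional add q. Per lane: a += (a >> 15) & q, adding q only to
- // negative coefficients. Also not referenced by the routines in this file.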
- .macro caddq r,x=12
- vpsraw $15,%ymm\r,%ymm\x
- vpand %ymm0,%ymm\x,%ymm\x
- vpaddw %ymm\x,%ymm\r,%ymm\r
- .endm
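- // fqmulprecomp: Montgomery multiplication of ymm\b by a fixed constant c with
- // precomputed products: ymm\al = 16x (c * q^-1 mod 2^16), ymm\ah = 16x c.
- // Per-lane sketch: lo = (int16_t)(a * c_qinv); b = mulhi(a, c) - mulhi(lo, q);
- // which yields a*c*2^-16 mod q.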
- .macro fqmulprecomp al,ah,b,x=12
- vpmullw %ymm\al,%ymm\b,%ymm\x
- vpmulhw %ymm\ah,%ymm\b,%ymm\b
- vpmulhw %ymm0,%ymm\x,%ymm\x
- vpsubw %ymm\x,%ymm\b,%ymm\b
- .endm
- /***************************/
- .text
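- // reduce128_avx: Barrett-reduce 128 coefficients (8 x 32 bytes) at (%rdi) in
- // place. Expects %ymm0 = q and %ymm1 = v, as set up by reduce_avx2_asm below.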
- reduce128_avx:
- #load
- vmovdqa (%rdi),%ymm2
- vmovdqa 32(%rdi),%ymm3
- vmovdqa 64(%rdi),%ymm4
- vmovdqa 96(%rdi),%ymm5
- vmovdqa 128(%rdi),%ymm6
- vmovdqa 160(%rdi),%ymm7
- vmovdqa 192(%rdi),%ymm8
- vmovdqa 224(%rdi),%ymm9
- red16 2
- red16 3
- red16 4
- red16 5
- red16 6
- red16 7
- red16 8
- red16 9
- #store
- vmovdqa %ymm2,(%rdi)
- vmovdqa %ymm3,32(%rdi)
- vmovdqa %ymm4,64(%rdi)
- vmovdqa %ymm5,96(%rdi)
- vmovdqa %ymm6,128(%rdi)
- vmovdqa %ymm7,160(%rdi)
- vmovdqa %ymm8,192(%rdi)
- vmovdqa %ymm9,224(%rdi)
- ret
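- // reduce_avx2_asm(r, qdata): Barrett-reduce all 256 coefficients of a Kyber
- // polynomial in place. System V arguments: %rdi = polynomial, %rsi = qdata
- // constants table; _16XQ/_16XV are 16-bit element indices, hence the *2 byte
- // scaling. Roughly void reduce_avx2_asm(int16_t r[256], const int16_t *qdata)
- // when called from C (the exact prototype lives in the corresponding header).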
- .global cdecl(reduce_avx2_asm)
- cdecl(reduce_avx2_asm):
- #consts
- vmovdqa _16XQ*2(%rsi),%ymm0
- vmovdqa _16XV*2(%rsi),%ymm1
- call reduce128_avx
- add $256,%rdi
- call reduce128_avx
- ret
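- // tomont128_avx: multiply 128 coefficients at (%rdi) by the precomputed
- // Montgomery factor c = 2^32 mod q, supplied in the lo/hi form fqmulprecomp
- // expects (_16XMONTSQLO/_16XMONTSQHI), i.e. convert them to the Montgomery
- // domain (a -> a*2^16 mod q) in place. Expects %ymm0-%ymm2 to be set up by
- // tomont_avx2_asm below.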
- tomont128_avx:
- #load
- vmovdqa (%rdi),%ymm3
- vmovdqa 32(%rdi),%ymm4
- vmovdqa 64(%rdi),%ymm5
- vmovdqa 96(%rdi),%ymm6
- vmovdqa 128(%rdi),%ymm7
- vmovdqa 160(%rdi),%ymm8
- vmovdqa 192(%rdi),%ymm9
- vmovdqa 224(%rdi),%ymm10
- fqmulprecomp 1,2,3,11
- fqmulprecomp 1,2,4,12
- fqmulprecomp 1,2,5,13
- fqmulprecomp 1,2,6,14
- fqmulprecomp 1,2,7,15
- fqmulprecomp 1,2,8,11
- fqmulprecomp 1,2,9,12
- fqmulprecomp 1,2,10,13
- #store
- vmovdqa %ymm3,(%rdi)
- vmovdqa %ymm4,32(%rdi)
- vmovdqa %ymm5,64(%rdi)
- vmovdqa %ymm6,96(%rdi)
- vmovdqa %ymm7,128(%rdi)
- vmovdqa %ymm8,160(%rdi)
- vmovdqa %ymm9,192(%rdi)
- vmovdqa %ymm10,224(%rdi)
- ret
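- // tomont_avx2_asm(r, qdata): convert all 256 coefficients of a polynomial to
- // the Montgomery domain. %rdi = polynomial, %rsi = qdata constants table; the
- // two 128-coefficient halves are handled by tomont128_avx, mirroring
- // reduce_avx2_asm above.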
- .global cdecl(tomont_avx2_asm)
- cdecl(tomont_avx2_asm):
- #consts
- vmovdqa _16XQ*2(%rsi),%ymm0
- vmovdqa _16XMONTSQLO*2(%rsi),%ymm1
- vmovdqa _16XMONTSQHI*2(%rsi),%ymm2
- call tomont128_avx
- add $256,%rdi
- call tomont128_avx
- ret