kyber512r3_basemul_avx2.S

#include "kyber512r3_consts_avx2.h"
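
# The schoolbook macro multiplies one 64-coefficient block of two polynomials
# in the NTT domain, 16 lanes at a time. From the arithmetic below, the stored
# results are
#   +0:  a0c0 + zeta*b0d0    +16: a0d0 + b0c0
#   +32: a1c1 - zeta*b1d1    +48: a1d1 + b1c1
# i.e. degree-1 products modulo X^2 - zeta and X^2 + zeta, with every product
# reduced via Montgomery reduction (q from _16XQ, q^-1 mod 2^16 from _16XQINV).
# %r9 is expected to point at the zeta values for the current block.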
.macro schoolbook off
vmovdqa _16XQINV*2(%rcx),%ymm0
vmovdqa (64*\off+ 0)*2(%rsi),%ymm1 # a0
vmovdqa (64*\off+16)*2(%rsi),%ymm2 # b0
vmovdqa (64*\off+32)*2(%rsi),%ymm3 # a1
vmovdqa (64*\off+48)*2(%rsi),%ymm4 # b1
vpmullw %ymm0,%ymm1,%ymm9 # a0.lo
vpmullw %ymm0,%ymm2,%ymm10 # b0.lo
vpmullw %ymm0,%ymm3,%ymm11 # a1.lo
vpmullw %ymm0,%ymm4,%ymm12 # b1.lo
vmovdqa (64*\off+ 0)*2(%rdx),%ymm5 # c0
vmovdqa (64*\off+16)*2(%rdx),%ymm6 # d0
vpmulhw %ymm5,%ymm1,%ymm13 # a0c0.hi
vpmulhw %ymm6,%ymm1,%ymm1 # a0d0.hi
vpmulhw %ymm5,%ymm2,%ymm14 # b0c0.hi
vpmulhw %ymm6,%ymm2,%ymm2 # b0d0.hi
vmovdqa (64*\off+32)*2(%rdx),%ymm7 # c1
vmovdqa (64*\off+48)*2(%rdx),%ymm8 # d1
vpmulhw %ymm7,%ymm3,%ymm15 # a1c1.hi
vpmulhw %ymm8,%ymm3,%ymm3 # a1d1.hi
vpmulhw %ymm7,%ymm4,%ymm0 # b1c1.hi
vpmulhw %ymm8,%ymm4,%ymm4 # b1d1.hi
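# all 16 ymm registers are live here, so spill a0c0.hi to the 32-byte stack
# scratch slot, then form the low halves of the eight products from the
# qinv-premultiplied inputs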
vmovdqa %ymm13,(%rsp)
vpmullw %ymm5,%ymm9,%ymm13 # a0c0.lo
vpmullw %ymm6,%ymm9,%ymm9 # a0d0.lo
vpmullw %ymm5,%ymm10,%ymm5 # b0c0.lo
vpmullw %ymm6,%ymm10,%ymm10 # b0d0.lo
vpmullw %ymm7,%ymm11,%ymm6 # a1c1.lo
vpmullw %ymm8,%ymm11,%ymm11 # a1d1.lo
vpmullw %ymm7,%ymm12,%ymm7 # b1c1.lo
vpmullw %ymm8,%ymm12,%ymm12 # b1d1.lo
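# Montgomery reduction: multiply the low products (already scaled by qinv)
# by q, keep the high halves, and subtract them from the high products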
vmovdqa _16XQ*2(%rcx),%ymm8
vpmulhw %ymm8,%ymm13,%ymm13
vpmulhw %ymm8,%ymm9,%ymm9
vpmulhw %ymm8,%ymm5,%ymm5
vpmulhw %ymm8,%ymm10,%ymm10
vpmulhw %ymm8,%ymm6,%ymm6
vpmulhw %ymm8,%ymm11,%ymm11
vpmulhw %ymm8,%ymm7,%ymm7
vpmulhw %ymm8,%ymm12,%ymm12
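# the first subtraction has its operands reversed (the spilled a0c0.hi is the
# subtrahend), so %ymm13 holds -a0c0; the final combination compensates below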
vpsubw (%rsp),%ymm13,%ymm13 # -a0c0
vpsubw %ymm9,%ymm1,%ymm9 # a0d0
vpsubw %ymm5,%ymm14,%ymm5 # b0c0
vpsubw %ymm10,%ymm2,%ymm10 # b0d0
vpsubw %ymm6,%ymm15,%ymm6 # a1c1
vpsubw %ymm11,%ymm3,%ymm11 # a1d1
vpsubw %ymm7,%ymm0,%ymm7 # b1c1
vpsubw %ymm12,%ymm4,%ymm12 # b1d1
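# Montgomery-multiply b0d0 and b1d1 by zeta: (%r9) is expected to hold
# zeta*qinv for the low product and 32(%r9) the zeta values for the high one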
vmovdqa (%r9),%ymm0
vmovdqa 32(%r9),%ymm1
vpmullw %ymm0,%ymm10,%ymm2
vpmullw %ymm0,%ymm12,%ymm3
vpmulhw %ymm1,%ymm10,%ymm10
vpmulhw %ymm1,%ymm12,%ymm12
vpmulhw %ymm8,%ymm2,%ymm2
vpmulhw %ymm8,%ymm3,%ymm3
vpsubw %ymm2,%ymm10,%ymm10 # rb0d0
vpsubw %ymm3,%ymm12,%ymm12 # rb1d1
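# combine into the final coefficients:
#   a0d0 + b0c0, a1d1 + b1c1, a0c0 + zeta*b0d0, a1c1 - zeta*b1d1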
vpaddw %ymm5,%ymm9,%ymm9
vpaddw %ymm7,%ymm11,%ymm11
vpsubw %ymm13,%ymm10,%ymm13
vpsubw %ymm12,%ymm6,%ymm6
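# store the four 16-coefficient result vectors of this block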
vmovdqa %ymm13,(64*\off+ 0)*2(%rdi)
vmovdqa %ymm9,(64*\off+16)*2(%rdi)
vmovdqa %ymm6,(64*\off+32)*2(%rdi)
vmovdqa %ymm11,(64*\off+48)*2(%rdi)
.endm
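
# basemul_avx2_asm: per the SysV ABI and the uses above, %rdi is the output
# polynomial, %rsi and %rdx the two inputs, and %rcx the table of constants
# (_16XQINV, _16XQ, _ZETAS_EXP). Four schoolbook blocks of 64 coefficients
# cover the full 256-coefficient polynomial.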
.text
.global cdecl(basemul_avx2_asm)
cdecl(basemul_avx2_asm):
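# save the caller's stack pointer, align %rsp to 32 bytes, and reserve a
# 32-byte scratch slot used for the %ymm13 spill inside the macro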
mov %rsp,%r8
and $-32,%rsp
sub $32,%rsp
lea (_ZETAS_EXP+176)*2(%rcx),%r9
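# process the four 64-coefficient blocks; %r9 is advanced between blocks to
# the zetas for the next block inside _ZETAS_EXP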
schoolbook 0
add $32*2,%r9
schoolbook 1
add $192*2,%r9
schoolbook 2
add $32*2,%r9
schoolbook 3
mov %r8,%rsp
ret