kyber512r3_fq_avx2.S

#include "kyber512r3_consts_avx2.h"
// The small macros from the .inc files are inlined directly into the .S files
/*****.include "fq.inc"*****/
/***************************/
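// red16: Barrett-style reduction of the 16 signed 16-bit lanes of %ymm\r modulo q,
// assuming %ymm0 holds 16x q and %ymm1 holds 16x the Barrett constant v.
// Per lane, roughly: t = (a*v) >> 26; a -= t*q. The \rs variant substitutes a
// rounding high multiply (vpmulhrsw against %ymm\rs) for the plain arithmetic
// shift. %ymm\x (default 12) is clobbered as scratch.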
.macro red16 r,rs=0,x=12
vpmulhw %ymm1,%ymm\r,%ymm\x
.if \rs
vpmulhrsw %ymm\rs,%ymm\x,%ymm\x
.else
vpsraw $10,%ymm\x,%ymm\x
.endif
vpmullw %ymm0,%ymm\x,%ymm\x
vpsubw %ymm\x,%ymm\r,%ymm\r
.endm
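// csubq: conditional subtraction of q. Subtracts q from every 16-bit lane of
// %ymm\r, then adds q back to the lanes that went negative (sign mask via
// vpsraw $15), i.e. r = r - q if r >= q, else r unchanged. %ymm\x is scratch.
// (Not referenced in this file; it comes along with the inlined fq.inc.)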
.macro csubq r,x=12
vpsubw %ymm0,%ymm\r,%ymm\r
vpsraw $15,%ymm\r,%ymm\x
vpand %ymm0,%ymm\x,%ymm\x
vpaddw %ymm\x,%ymm\r,%ymm\r
.endm
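// caddq: conditional addition of q. Adds q to the 16-bit lanes of %ymm\r that
// are negative (sign mask via vpsraw $15), mapping a centered representative
// to a non-negative one. %ymm\x is scratch. (Also not referenced in this file.)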
.macro caddq r,x=12
vpsraw $15,%ymm\r,%ymm\x
vpand %ymm0,%ymm\x,%ymm\x
vpaddw %ymm\x,%ymm\r,%ymm\r
.endm
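// fqmulprecomp: Montgomery multiplication of %ymm\b by a precomputed constant a.
// By the usual convention (a sketch; see the _16XMONTSQLO/_16XMONTSQHI loads
// below), %ymm\al holds a*qinv mod 2^16 and %ymm\ah holds a itself. Per lane,
// roughly: b = mulhi(a, b) - mulhi(q, mullo(a*qinv, b)), i.e. a*b*2^-16 mod q.
// The result is left in %ymm\b; %ymm\x is scratch; %ymm0 must hold 16x q.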
.macro fqmulprecomp al,ah,b,x=12
vpmullw %ymm\al,%ymm\b,%ymm\x
vpmulhw %ymm\ah,%ymm\b,%ymm\b
vpmulhw %ymm0,%ymm\x,%ymm\x
vpsubw %ymm\x,%ymm\b,%ymm\b
.endm
/***************************/
.text
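// reduce128_avx: loads 128 coefficients (8 x 16 signed 16-bit words) from %rdi,
// applies red16 to each vector, and stores them back in place. Expects
// %ymm0 = 16x q and %ymm1 = 16x v, as set up by the entry point below.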
reduce128_avx:
#load
vmovdqa (%rdi),%ymm2
vmovdqa 32(%rdi),%ymm3
vmovdqa 64(%rdi),%ymm4
vmovdqa 96(%rdi),%ymm5
vmovdqa 128(%rdi),%ymm6
vmovdqa 160(%rdi),%ymm7
vmovdqa 192(%rdi),%ymm8
vmovdqa 224(%rdi),%ymm9
red16 2
red16 3
red16 4
red16 5
red16 6
red16 7
red16 8
red16 9
#store
vmovdqa %ymm2,(%rdi)
vmovdqa %ymm3,32(%rdi)
vmovdqa %ymm4,64(%rdi)
vmovdqa %ymm5,96(%rdi)
vmovdqa %ymm6,128(%rdi)
vmovdqa %ymm7,160(%rdi)
vmovdqa %ymm8,192(%rdi)
vmovdqa %ymm9,224(%rdi)
ret
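// reduce_avx2_asm: entry point (coefficient pointer in %rdi, constants table in %rsi).
// Loads q and the Barrett constant v from the table, then reduces all 256
// coefficients by running reduce128_avx on each 128-coefficient half.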
.global cdecl(reduce_avx2_asm)
cdecl(reduce_avx2_asm):
#consts
vmovdqa _16XQ*2(%rsi),%ymm0
vmovdqa _16XV*2(%rsi),%ymm1
call reduce128_avx
add $256,%rdi
call reduce128_avx
ret
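// tomont128_avx: multiplies 128 coefficients at %rdi by the precomputed constant
// held in %ymm1/%ymm2 (the lo/hi parts of the Montgomery squared radix loaded
// below) via fqmulprecomp, converting them to the Montgomery domain, and stores
// the results back in place.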
tomont128_avx:
#load
vmovdqa (%rdi),%ymm3
vmovdqa 32(%rdi),%ymm4
vmovdqa 64(%rdi),%ymm5
vmovdqa 96(%rdi),%ymm6
vmovdqa 128(%rdi),%ymm7
vmovdqa 160(%rdi),%ymm8
vmovdqa 192(%rdi),%ymm9
vmovdqa 224(%rdi),%ymm10
fqmulprecomp 1,2,3,11
fqmulprecomp 1,2,4,12
fqmulprecomp 1,2,5,13
fqmulprecomp 1,2,6,14
fqmulprecomp 1,2,7,15
fqmulprecomp 1,2,8,11
fqmulprecomp 1,2,9,12
fqmulprecomp 1,2,10,13
#store
vmovdqa %ymm3,(%rdi)
vmovdqa %ymm4,32(%rdi)
vmovdqa %ymm5,64(%rdi)
vmovdqa %ymm6,96(%rdi)
vmovdqa %ymm7,128(%rdi)
vmovdqa %ymm8,160(%rdi)
vmovdqa %ymm9,192(%rdi)
vmovdqa %ymm10,224(%rdi)
ret
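// tomont_avx2_asm: entry point (coefficient pointer in %rdi, constants table in %rsi).
// Loads q and the _16XMONTSQLO/_16XMONTSQHI constants, then converts all 256
// coefficients to Montgomery form by running tomont128_avx on each half.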
.global cdecl(tomont_avx2_asm)
cdecl(tomont_avx2_asm):
#consts
vmovdqa _16XQ*2(%rsi),%ymm0
vmovdqa _16XMONTSQLO*2(%rsi),%ymm1
vmovdqa _16XMONTSQHI*2(%rsi),%ymm2
call tomont128_avx
add $256,%rdi
call tomont128_avx
ret