;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;  Copyright(c) 2011-2019 Intel Corporation All rights reserved.
;
;  Redistribution and use in source and binary forms, with or without
;  modification, are permitted provided that the following conditions
;  are met:
;    * Redistributions of source code must retain the above copyright
;      notice, this list of conditions and the following disclaimer.
;    * Redistributions in binary form must reproduce the above copyright
;      notice, this list of conditions and the following disclaimer in
;      the documentation and/or other materials provided with the
;      distribution.
;    * Neither the name of Intel Corporation nor the names of its
;      contributors may be used to endorse or promote products derived
;      from this software without specific prior written permission.
;
;  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
;  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
;  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
;  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
;  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
;  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
;  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
;  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
;  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
;  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
;  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;;;
;;; gf_6vect_dot_prod_avx512(len, vec, *g_tbls, **buffs, **dests);
;;;
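;;;
;;; Argument sketch (hedged; inferred from the code below and the usual
;;; isa-l erasure-code conventions, not normative for this file alone):
;;;   len    - length of each vector in bytes; the code below fails
;;;            (returns 1) for len < 64
;;;   vec    - number of source vectors
;;;   g_tbls - 6*vec concatenated 32-byte GF(2^8) multiply lookup tables,
;;;            one per (output, source) pair, e.g. as built by ec_init_tables()
;;;   buffs  - array of vec pointers to the source vectors
;;;   dests  - array of 6 pointers to the destination vectors
;;;
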
%include "reg_sizes.asm"

%ifdef HAVE_AS_KNOWS_AVX512

%ifidn __OUTPUT_FORMAT__, elf64
 %define arg0   rdi
 %define arg1   rsi
 %define arg2   rdx
 %define arg3   rcx
 %define arg4   r8
 %define arg5   r9

 %define tmp    r11
 %define tmp2   r10
 %define tmp3   r13             ; must be saved and restored
 %define tmp4   r12             ; must be saved and restored
 %define tmp5   r14             ; must be saved and restored
 %define tmp6   r15             ; must be saved and restored
 %define tmp7   rbp             ; must be saved and restored
 %define tmp8   rbx             ; must be saved and restored
 %define return rax
 %define PS     8
 %define LOG_PS 3
 %define stack_size 6*8
 %define func(x) x: endbranch

 %macro FUNC_SAVE 0
        sub     rsp, stack_size
        mov     [rsp + 0*8], r12
        mov     [rsp + 1*8], r13
        mov     [rsp + 2*8], r14
        mov     [rsp + 3*8], r15
        mov     [rsp + 4*8], rbp
        mov     [rsp + 5*8], rbx
 %endmacro

 %macro FUNC_RESTORE 0
        mov     r12, [rsp + 0*8]
        mov     r13, [rsp + 1*8]
        mov     r14, [rsp + 2*8]
        mov     r15, [rsp + 3*8]
        mov     rbp, [rsp + 4*8]
        mov     rbx, [rsp + 5*8]
        add     rsp, stack_size
 %endmacro
%endif

%ifidn __OUTPUT_FORMAT__, win64
 %define arg0   rcx
 %define arg1   rdx
 %define arg2   r8
 %define arg3   r9
 %define arg4   r12             ; must be saved, loaded and restored
 %define arg5   r15             ; must be saved and restored
 %define tmp    r11
 %define tmp2   r10
 %define tmp3   r13             ; must be saved and restored
 %define tmp4   r14             ; must be saved and restored
 %define tmp5   rdi             ; must be saved and restored
 %define tmp6   rsi             ; must be saved and restored
 %define tmp7   rbp             ; must be saved and restored
 %define tmp8   rbx             ; must be saved and restored
 %define return rax
 %define PS     8
 %define LOG_PS 3
 %define stack_size 10*16 + 9*8 ; must be an odd multiple of 8: rsp is 8 mod 16
                                ; at entry, so this re-aligns it to 16 for the
                                ; vmovdqa xmm saves below
 %define arg(x) [rsp + stack_size + PS + PS*x]
 %define func(x) proc_frame x

 %macro FUNC_SAVE 0
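        ;; Win64 ABI: xmm6-xmm15 and rbx, rbp, rdi, rsi, r12-r15 are
        ;; callee-saved, so spill them into the frame allocated below
        ;; before the function body clobbers them.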
        alloc_stack     stack_size
        vmovdqa [rsp + 0*16], xmm6
        vmovdqa [rsp + 1*16], xmm7
        vmovdqa [rsp + 2*16], xmm8
        vmovdqa [rsp + 3*16], xmm9
        vmovdqa [rsp + 4*16], xmm10
        vmovdqa [rsp + 5*16], xmm11
        vmovdqa [rsp + 6*16], xmm12
        vmovdqa [rsp + 7*16], xmm13
        vmovdqa [rsp + 8*16], xmm14
        vmovdqa [rsp + 9*16], xmm15
        save_reg        r12, 10*16 + 0*8
        save_reg        r13, 10*16 + 1*8
        save_reg        r14, 10*16 + 2*8
        save_reg        r15, 10*16 + 3*8
        save_reg        rdi, 10*16 + 4*8
        save_reg        rsi, 10*16 + 5*8
        save_reg        rbp, 10*16 + 6*8
        save_reg        rbx, 10*16 + 7*8
        end_prolog
        mov     arg4, arg(4)
 %endmacro

 %macro FUNC_RESTORE 0
        vmovdqa xmm6, [rsp + 0*16]
        vmovdqa xmm7, [rsp + 1*16]
        vmovdqa xmm8, [rsp + 2*16]
        vmovdqa xmm9, [rsp + 3*16]
        vmovdqa xmm10, [rsp + 4*16]
        vmovdqa xmm11, [rsp + 5*16]
        vmovdqa xmm12, [rsp + 6*16]
        vmovdqa xmm13, [rsp + 7*16]
        vmovdqa xmm14, [rsp + 8*16]
        vmovdqa xmm15, [rsp + 9*16]
        mov     r12, [rsp + 10*16 + 0*8]
        mov     r13, [rsp + 10*16 + 1*8]
        mov     r14, [rsp + 10*16 + 2*8]
        mov     r15, [rsp + 10*16 + 3*8]
        mov     rdi, [rsp + 10*16 + 4*8]
        mov     rsi, [rsp + 10*16 + 5*8]
        mov     rbp, [rsp + 10*16 + 6*8]
        mov     rbx, [rsp + 10*16 + 7*8]
        add     rsp, stack_size
 %endmacro
%endif

%define len       arg0
%define vec       arg1
%define mul_array arg2
%define src       arg3
%define dest1     arg4
%define ptr       arg5
%define vec_i     tmp2
%define dest2     tmp3
%define dest3     tmp4
%define dest4     tmp5
%define vskip3    tmp6
%define dest5     tmp7
%define vskip1    tmp8
%define pos       return

%ifndef EC_ALIGNED_ADDR
;;; Use Un-aligned load/store
 %define XLDR vmovdqu8
 %define XSTR vmovdqu8
%else
;;; Use Non-temporal load/store
 %ifdef NO_NT_LDST
  %define XLDR vmovdqa64
  %define XSTR vmovdqa64
 %else
  %define XLDR vmovntdqa
  %define XSTR vmovntdq
 %endif
%endif

%define xmask0f   zmm20
%define xgft1_lo  zmm19
%define xgft1_loy ymm19
%define xgft1_hi  zmm18
%define xgft2_lo  zmm17
%define xgft2_loy ymm17
%define xgft2_hi  zmm16
%define xgft3_lo  zmm15
%define xgft3_loy ymm15
%define xgft3_hi  zmm14
%define xgft4_lo  zmm13
%define xgft4_loy ymm13
%define xgft4_hi  zmm12
%define xgft5_lo  zmm11
%define xgft5_loy ymm11
%define xgft5_hi  zmm10
%define xgft6_lo  zmm9
%define xgft6_loy ymm9
%define xgft6_hi  zmm8

%define x0    zmm0
%define xtmpa zmm1
%define xp1   zmm2
%define xp2   zmm3
%define xp3   zmm4
%define xp4   zmm5
%define xp5   zmm6
%define xp6   zmm7

default rel
[bits 64]

section .text

align 16
global gf_6vect_dot_prod_avx512, function
func(gf_6vect_dot_prod_avx512)
        FUNC_SAVE
        sub     len, 64
        jl      .return_fail
        xor     pos, pos
        mov     tmp, 0x0f
        vpbroadcastb xmask0f, tmp       ;Construct mask 0x0f0f0f...
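
        ;; GF(2^8) multiply-by-constant is done nibble-wise: each source byte
        ;; is split into its low and high 4-bit halves, each half indexes a
        ;; 16-entry vpshufb lookup table of partial products, and the two
        ;; results are XORed together. xmask0f isolates the nibbles.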
        mov     vskip1, vec
        imul    vskip1, 32
        mov     vskip3, vec
        imul    vskip3, 96
        sal     vec, LOG_PS             ;vec *= PS. Make vec_i count by PS
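
        ;; Table addressing in the loop below: tmp walks g_tbls 32 bytes per
        ;; source and points at output 1's table. Tables for the other outputs
        ;; sit vec*32 bytes apart (output-major layout), reached via vec (now
        ;; vec*PS, so vec*(32/PS) = vec*32), vskip1 = vec*32 and vskip3 = vec*96.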
        mov     dest2, [dest1+PS]
        mov     dest3, [dest1+2*PS]
        mov     dest4, [dest1+3*PS]
        mov     dest5, [dest1+4*PS]

.loop64:
        vpxorq  xp1, xp1, xp1
        vpxorq  xp2, xp2, xp2
        vpxorq  xp3, xp3, xp3
        vpxorq  xp4, xp4, xp4
        vpxorq  xp5, xp5, xp5
        vpxorq  xp6, xp6, xp6
        mov     tmp, mul_array
        xor     vec_i, vec_i

.next_vect:
        mov     ptr, [src+vec_i]
        XLDR    x0, [ptr+pos]           ;Get next source vector
        add     vec_i, PS

        vpandq  xtmpa, x0, xmask0f      ;Mask low src nibble in bits 4-0
        vpsraw  x0, x0, 4               ;Shift to put high nibble into bits 4-0
        vpandq  x0, x0, xmask0f         ;Mask high src nibble in bits 4-0

        vmovdqu8 xgft1_loy, [tmp]               ;Load array Ax{00}..{0f}, Ax{00}..{f0}
        vmovdqu8 xgft2_loy, [tmp+vec*(32/PS)]   ;Load array Bx{00}..{0f}, Bx{00}..{f0}
        vmovdqu8 xgft3_loy, [tmp+vec*(64/PS)]   ;Load array Cx{00}..{0f}, Cx{00}..{f0}
        vmovdqu8 xgft4_loy, [tmp+vskip3]        ;Load array Dx{00}..{0f}, Dx{00}..{f0}
        vmovdqu8 xgft5_loy, [tmp+vskip1*4]      ;Load array Ex{00}..{0f}, Ex{00}..{f0}
        lea     ptr, [vskip1 + vskip1*4]        ;ptr = vskip5
        vmovdqu8 xgft6_loy, [tmp+ptr]           ;Load array Fx{00}..{0f}, Fx{00}..{f0}
        add     tmp, 32
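
        ;; Each 32-byte table was loaded as a ymm: bytes 0-15 hold the
        ;; low-nibble products, bytes 16-31 the high-nibble products.
        ;; vshufi64x2 with imm 0x00/0x55 broadcasts the low/high 16-byte half
        ;; to all four 128-bit lanes of the zmm so vpshufb can perform the
        ;; lookup across the full 64 bytes.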
        vshufi64x2 xgft1_hi, xgft1_lo, xgft1_lo, 0x55
        vshufi64x2 xgft1_lo, xgft1_lo, xgft1_lo, 0x00
        vshufi64x2 xgft2_hi, xgft2_lo, xgft2_lo, 0x55
        vshufi64x2 xgft2_lo, xgft2_lo, xgft2_lo, 0x00

        vpshufb xgft1_hi, xgft1_hi, x0          ;Lookup mul table of high nibble
        vpshufb xgft1_lo, xgft1_lo, xtmpa       ;Lookup mul table of low nibble
        vpxorq  xgft1_hi, xgft1_hi, xgft1_lo    ;GF add high and low partials
        vpxorq  xp1, xp1, xgft1_hi              ;xp1 += partial

        vpshufb xgft2_hi, xgft2_hi, x0          ;Lookup mul table of high nibble
        vpshufb xgft2_lo, xgft2_lo, xtmpa       ;Lookup mul table of low nibble
        vpxorq  xgft2_hi, xgft2_hi, xgft2_lo    ;GF add high and low partials
        vpxorq  xp2, xp2, xgft2_hi              ;xp2 += partial

        vshufi64x2 xgft3_hi, xgft3_lo, xgft3_lo, 0x55
        vshufi64x2 xgft3_lo, xgft3_lo, xgft3_lo, 0x00
        vshufi64x2 xgft4_hi, xgft4_lo, xgft4_lo, 0x55
        vshufi64x2 xgft4_lo, xgft4_lo, xgft4_lo, 0x00

        vpshufb xgft3_hi, xgft3_hi, x0          ;Lookup mul table of high nibble
        vpshufb xgft3_lo, xgft3_lo, xtmpa       ;Lookup mul table of low nibble
        vpxorq  xgft3_hi, xgft3_hi, xgft3_lo    ;GF add high and low partials
        vpxorq  xp3, xp3, xgft3_hi              ;xp3 += partial

        vpshufb xgft4_hi, xgft4_hi, x0          ;Lookup mul table of high nibble
        vpshufb xgft4_lo, xgft4_lo, xtmpa       ;Lookup mul table of low nibble
        vpxorq  xgft4_hi, xgft4_hi, xgft4_lo    ;GF add high and low partials
        vpxorq  xp4, xp4, xgft4_hi              ;xp4 += partial

        vshufi64x2 xgft5_hi, xgft5_lo, xgft5_lo, 0x55
        vshufi64x2 xgft5_lo, xgft5_lo, xgft5_lo, 0x00

        vpshufb xgft5_hi, xgft5_hi, x0          ;Lookup mul table of high nibble
        vpshufb xgft5_lo, xgft5_lo, xtmpa       ;Lookup mul table of low nibble
        vpxorq  xgft5_hi, xgft5_hi, xgft5_lo    ;GF add high and low partials
        vpxorq  xp5, xp5, xgft5_hi              ;xp5 += partial

        vshufi64x2 xgft6_hi, xgft6_lo, xgft6_lo, 0x55
        vshufi64x2 xgft6_lo, xgft6_lo, xgft6_lo, 0x00

        vpshufb xgft6_hi, xgft6_hi, x0          ;Lookup mul table of high nibble
        vpshufb xgft6_lo, xgft6_lo, xtmpa       ;Lookup mul table of low nibble
        vpxorq  xgft6_hi, xgft6_hi, xgft6_lo    ;GF add high and low partials
        vpxorq  xp6, xp6, xgft6_hi              ;xp6 += partial

        cmp     vec_i, vec
        jl      .next_vect

        mov     ptr, [dest1]            ;reuse ptr
        mov     tmp, [dest1+5*PS]       ;reuse tmp

        XSTR    [dest2+pos], xp2
        XSTR    [dest3+pos], xp3
        XSTR    [dest4+pos], xp4
        XSTR    [dest5+pos], xp5
        XSTR    [ptr+pos], xp1
        XSTR    [tmp+pos], xp6

        add     pos, 64                 ;Loop on 64 bytes at a time
        cmp     pos, len
        jle     .loop64
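
        ;; len was reduced by 64 on entry, so the loop above can leave up to 63
        ;; tail bytes unprocessed. If pos == len+64 the length was a multiple
        ;; of 64 and we are done; otherwise redo the last full 64-byte window
        ;; at offset len, overlapping already-written output, to cover the tail.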
        lea     tmp, [len + 64]
        cmp     pos, tmp
        je      .return_pass

        ;; Tail len
        mov     pos, len                ;Overlapped offset length-64
        jmp     .loop64                 ;Do one more overlap pass

.return_pass:
        mov     return, 0
        FUNC_RESTORE
        ret

.return_fail:
        mov     return, 1
        FUNC_RESTORE
        ret

endproc_frame

%else
%ifidn __OUTPUT_FORMAT__, win64
global no_gf_6vect_dot_prod_avx512
no_gf_6vect_dot_prod_avx512:
%endif
%endif  ; ifdef HAVE_AS_KNOWS_AVX512