gf_vect_dot_prod_avx.asm

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; Copyright(c) 2011-2015 Intel Corporation All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions
; are met:
;   * Redistributions of source code must retain the above copyright
;     notice, this list of conditions and the following disclaimer.
;   * Redistributions in binary form must reproduce the above copyright
;     notice, this list of conditions and the following disclaimer in
;     the documentation and/or other materials provided with the
;     distribution.
;   * Neither the name of Intel Corporation nor the names of its
;     contributors may be used to endorse or promote products derived
;     from this software without specific prior written permission.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;
;;; gf_vect_dot_prod_avx(len, vec, *g_tbls, **buffs, *dest);
;;;
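;;; Arguments (per the isa-l erasure code API):
;;;   len    - bytes to process in each buffer; must be at least 16 or
;;;            the function returns 1
;;;   vec    - number of source vectors
;;;   g_tbls - vec concatenated 32-byte GF(2^8) multiply tables (16-byte
;;;            low-nibble table, then 16-byte high-nibble table), as
;;;            produced by gf_vect_mul_init()
;;;   buffs  - array of vec pointers to the source buffers
;;;   dest   - destination buffer, at least len bytes
;;;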
%include "reg_sizes.asm"

%ifidn __OUTPUT_FORMAT__, elf64
 %define arg0   rdi
 %define arg1   rsi
 %define arg2   rdx
 %define arg3   rcx
 %define arg4   r8
 %define tmp    r11
 %define tmp2   r10
 %define tmp3   r9
 %define return rax
 %macro SLDR 2
 %endmacro
 %define SSTR SLDR
 %define PS 8
 %define func(x) x:
 %define FUNC_SAVE
 %define FUNC_RESTORE
%endif

%ifidn __OUTPUT_FORMAT__, win64
 %define arg0   rcx
 %define arg1   rdx
 %define arg2   r8
 %define arg3   r9
 %define arg4   r12 ; must be saved and loaded
 %define tmp    r11
 %define tmp2   r10
 %define tmp3   rdi ; must be saved and loaded
 %define return rax
 %macro SLDR 2
 %endmacro
 %define SSTR SLDR
 %define PS 8
 %define frame_size 2*8
 %define arg(x) [rsp + frame_size + PS + PS*x]
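 ; For arg(4) this resolves to [rsp + 16 + 8 + 32]: two saved registers
 ; (frame_size), the return address (PS), and PS*4 = 32 bytes, which skips
 ; the win64 shadow space and lands on the fifth argument's stack slot.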
 %define func(x) proc_frame x
 %macro FUNC_SAVE 0
        rex_push_reg r12
        push_reg     rdi
        end_prolog
        mov arg4, arg(4)
 %endmacro
 %macro FUNC_RESTORE 0
        pop rdi
        pop r12
 %endmacro
%endif

%ifidn __OUTPUT_FORMAT__, elf32
;;;================== High Address;
;;;     arg4
;;;     arg3
;;;     arg2
;;;     arg1
;;;     arg0
;;;     return
;;;<================= esp of caller
;;;     ebp
;;;<================= ebp = esp
;;;     esi
;;;     edi
;;;     ebx
;;;<================= esp of callee
;;;
;;;================== Low Address;
 %define PS 4
 %define LOG_PS 2
 %define func(x) x:
 %define arg(x) [ebp + PS*2 + PS*x]
 %define trans  ecx ; 'trans' is used for the arguments that live on the stack
 %define arg0   trans
 %define arg0_m arg(0)
 %define arg1   trans
 %define arg1_m arg(1)
 %define arg2   arg2_m
 %define arg2_m arg(2)
 %define arg3   ebx
 %define arg4   trans
 %define arg4_m arg(4)
 %define tmp    edx
 %define tmp2   edi
 %define tmp3   esi
 %define return eax
 %macro SLDR 2 ;; stack load/restore
        mov %1, %2
 %endmacro
 %define SSTR SLDR
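 ; On the 64-bit targets above, SLDR/SSTR expand to nothing because every
 ; argument keeps its own register. Here they are real moves: arg0, arg1
 ; and arg4 all share the scratch register 'trans' (ecx), so each must be
 ; reloaded from (SLDR) or spilled to (SSTR) its stack slot around use.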
 %macro FUNC_SAVE 0
        push ebp
        mov  ebp, esp
        push esi
        push edi
        push ebx
        mov  arg3, arg(3)
 %endmacro
 %macro FUNC_RESTORE 0
        pop ebx
        pop edi
        pop esi
        mov esp, ebp
        pop ebp
 %endmacro
%endif ; output formats

%define len       arg0
%define vec       arg1
%define mul_array arg2
%define src       arg3
%define dest      arg4
%define vec_i     tmp2
%define ptr       tmp3
%define pos       return

%ifidn PS,4 ; 32-bit code
 %define vec_m  arg1_m
 %define len_m  arg0_m
 %define dest_m arg4_m
%endif
%ifndef EC_ALIGNED_ADDR
;;; Use unaligned load/store
 %define XLDR vmovdqu
 %define XSTR vmovdqu
%else
;;; Use aligned load/store: non-temporal unless NO_NT_LDST is defined
 %ifdef NO_NT_LDST
  %define XLDR vmovdqa
  %define XSTR vmovdqa
 %else
  %define XLDR vmovntdqa
  %define XSTR vmovntdq
 %endif
%endif
%ifidn PS,8 ; 64-bit code
 default rel
 [bits 64]
%endif

section .text

%define xmask0f xmm5
%define xgft_lo xmm4
%define xgft_hi xmm3
%define x0      xmm0
%define xtmpa   xmm1
%define xp      xmm2
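
;;; For each 16-byte slice at offset pos, the loop below computes
;;;   dest[pos..pos+15] = XOR_{j=0..vec-1} (C_j x src_j[pos..pos+15])
;;; where "x" is GF(2^8) multiplication by the constant whose 32-byte
;;; lookup table is the j-th entry of g_tbls (mul_array).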
align 16
global gf_vect_dot_prod_avx:ISAL_SYM_TYPE_FUNCTION
func(gf_vect_dot_prod_avx)
%ifidn __OUTPUT_FORMAT__, macho64
global _gf_vect_dot_prod_avx:ISAL_SYM_TYPE_FUNCTION
func(_gf_vect_dot_prod_avx)
%endif
        FUNC_SAVE
        SLDR len, len_m
        sub  len, 16
        SSTR len_m, len
        jl   .return_fail
        xor  pos, pos
        vmovdqa xmask0f, [mask0f] ;Load mask of lower nibble in each byte
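
;; Core GF(2^8) multiply trick: for a source byte b = (hi << 4) | lo,
;; C x b = gft_lo[lo] XOR gft_hi[hi], since b = (hi << 4) XOR lo and
;; multiplication by C distributes over XOR. Each 16-entry table is
;; applied to all 16 bytes at once with vpshufb.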
.loop16:
        vpxor xp, xp
        mov   tmp, mul_array
        xor   vec_i, vec_i

.next_vect:
        mov     ptr, [src+vec_i*PS]
        vmovdqu xgft_lo, [tmp]    ;Load array Cx{00}, Cx{01}, ..., Cx{0f}
        vmovdqu xgft_hi, [tmp+16] ; "         Cx{00}, Cx{10}, ..., Cx{f0}
        XLDR    x0, [ptr+pos]     ;Get next source vector
        add     tmp, 32
        add     vec_i, 1

        vpand   xtmpa, x0, xmask0f ;Mask low src nibble in bits 4-0
        vpsraw  x0, x0, 4          ;Shift to put high nibble into bits 4-0
        vpand   x0, x0, xmask0f    ;Mask high src nibble in bits 4-0

        vpshufb xgft_hi, xgft_hi, x0      ;Lookup mul table of high nibble
        vpshufb xgft_lo, xgft_lo, xtmpa   ;Lookup mul table of low nibble
        vpxor   xgft_hi, xgft_hi, xgft_lo ;GF add high and low partials
        vpxor   xp, xp, xgft_hi           ;xp += partial

        SLDR vec, vec_m
        cmp  vec_i, vec
        jl   .next_vect

        SLDR dest, dest_m
        XSTR [dest+pos], xp

        add  pos, 16 ;Loop on 16 bytes at a time
        SLDR len, len_m
        cmp  pos, len
        jle  .loop16

        lea  tmp, [len + 16]
        cmp  pos, tmp
        je   .return_pass

;; Tail len
        mov pos, len ;Overlapped offset length-16
        jmp .loop16  ;Do one more overlap pass
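
;; The overlapped tail pass redoes the final 16 bytes starting at len
;; (len is already biased down by 16 from the earlier 'sub len, 16').
;; Assuming dest does not alias the sources, this is safe: every output
;; byte is a pure function of the source bytes at the same offset, so
;; any bytes written twice get the same value both times.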
.return_pass:
        mov return, 0
        FUNC_RESTORE
        ret

.return_fail:
        mov return, 1
        FUNC_RESTORE
        ret

endproc_frame

section .data

align 16
mask0f:
        dq 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f

;;;       func                 core, ver, snum
slversion gf_vect_dot_prod_avx, 02,  05,  0061