;*****************************************************************************
;* x86-optimized Float DSP functions
;*
;* Copyright 2006 Loren Merritt
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "x86util.asm"

SECTION .text

;-----------------------------------------------------------------------------
; void vector_fmul(float *dst, const float *src0, const float *src1, int len)
;-----------------------------------------------------------------------------
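; What the unrolled SIMD loop below computes, as a scalar C sketch
; (illustrative only; each iteration handles 64 bytes, so len is assumed to
; be a multiple of 16 and the pointers suitably aligned for mova):
;
;     static void vector_fmul_c(float *dst, const float *src0,
;                               const float *src1, int len)
;     {
;         for (int i = 0; i < len; i++)
;             dst[i] = src0[i] * src1[i];
;     }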
%macro VECTOR_FMUL 0
cglobal vector_fmul, 4,4,2, dst, src0, src1, len
    lea       lenq, [lend*4 - 64]
ALIGN 16
.loop:
%assign a 0
%rep 32/mmsize
    mova      m0, [src0q + lenq + (a+0)*mmsize]
    mova      m1, [src0q + lenq + (a+1)*mmsize]
    mulps     m0, m0, [src1q + lenq + (a+0)*mmsize]
    mulps     m1, m1, [src1q + lenq + (a+1)*mmsize]
    mova      [dstq + lenq + (a+0)*mmsize], m0
    mova      [dstq + lenq + (a+1)*mmsize], m1
%assign a a+2
%endrep

    sub       lenq, 64
    jge       .loop
    REP_RET
%endmacro

INIT_XMM sse
VECTOR_FMUL
%if HAVE_AVX_EXTERNAL
INIT_YMM avx
VECTOR_FMUL
%endif

;------------------------------------------------------------------------------
; void ff_vector_fmac_scalar(float *dst, const float *src, float mul, int len)
;------------------------------------------------------------------------------
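; Scalar C sketch of the multiply-accumulate performed below (illustrative
; only; the loop consumes 64 bytes of dst per iteration, so len is assumed
; to be a multiple of 16):
;
;     static void vector_fmac_scalar_c(float *dst, const float *src,
;                                      float mul, int len)
;     {
;         for (int i = 0; i < len; i++)
;             dst[i] += src[i] * mul;
;     }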
%macro VECTOR_FMAC_SCALAR 0
%if UNIX64
cglobal vector_fmac_scalar, 3,3,5, dst, src, len
%else
cglobal vector_fmac_scalar, 4,4,5, dst, src, mul, len
%endif
%if ARCH_X86_32
    VBROADCASTSS m0, mulm
%else
%if WIN64
    SWAP 0, 2
%endif
    shufps      xm0, xm0, 0
%if cpuflag(avx)
    vinsertf128  m0, m0, xm0, 1
%endif
%endif
    lea    lenq, [lend*4-64]
.loop:
%if cpuflag(fma3)
    mova     m1,     [dstq+lenq]
    mova     m2,     [dstq+lenq+1*mmsize]
    fmaddps  m1, m0, [srcq+lenq], m1
    fmaddps  m2, m0, [srcq+lenq+1*mmsize], m2
%else ; cpuflag
    mulps    m1, m0, [srcq+lenq]
    mulps    m2, m0, [srcq+lenq+1*mmsize]
%if mmsize < 32
    mulps    m3, m0, [srcq+lenq+2*mmsize]
    mulps    m4, m0, [srcq+lenq+3*mmsize]
%endif ; mmsize
    addps    m1, m1, [dstq+lenq]
    addps    m2, m2, [dstq+lenq+1*mmsize]
%if mmsize < 32
    addps    m3, m3, [dstq+lenq+2*mmsize]
    addps    m4, m4, [dstq+lenq+3*mmsize]
%endif ; mmsize
%endif ; cpuflag
    mova  [dstq+lenq], m1
    mova  [dstq+lenq+1*mmsize], m2
%if mmsize < 32
    mova  [dstq+lenq+2*mmsize], m3
    mova  [dstq+lenq+3*mmsize], m4
%endif ; mmsize
    sub    lenq, 64
    jge    .loop
    REP_RET
%endmacro

INIT_XMM sse
VECTOR_FMAC_SCALAR
%if HAVE_AVX_EXTERNAL
INIT_YMM avx
VECTOR_FMAC_SCALAR
%endif
%if HAVE_FMA3_EXTERNAL
INIT_YMM fma3
VECTOR_FMAC_SCALAR
%endif

;------------------------------------------------------------------------------
; void ff_vector_fmul_scalar(float *dst, const float *src, float mul, int len)
;------------------------------------------------------------------------------
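; Scalar C sketch of the operation (illustrative only; the loop steps by one
; XMM register per iteration, so len is assumed to be a multiple of 4):
;
;     static void vector_fmul_scalar_c(float *dst, const float *src,
;                                      float mul, int len)
;     {
;         for (int i = 0; i < len; i++)
;             dst[i] = src[i] * mul;
;     }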
%macro VECTOR_FMUL_SCALAR 0
%if UNIX64
cglobal vector_fmul_scalar, 3,3,2, dst, src, len
%else
cglobal vector_fmul_scalar, 4,4,3, dst, src, mul, len
%endif
%if ARCH_X86_32
    movss    m0, mulm
%elif WIN64
    SWAP 0, 2
%endif
    shufps   m0, m0, 0
    lea    lenq, [lend*4-mmsize]
.loop:
    mova     m1, [srcq+lenq]
    mulps    m1, m0
    mova  [dstq+lenq], m1
    sub    lenq, mmsize
    jge    .loop
    REP_RET
%endmacro

INIT_XMM sse
VECTOR_FMUL_SCALAR

;------------------------------------------------------------------------------
; void ff_vector_dmul_scalar(double *dst, const double *src, double mul,
;                            int len)
;------------------------------------------------------------------------------
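; Double-precision counterpart of the scalar multiply above, as a C sketch
; (illustrative only; the loop handles 2*mmsize bytes per iteration, i.e.
; 4 doubles with SSE2 or 8 with AVX, so len is assumed to be a multiple of
; that width):
;
;     static void vector_dmul_scalar_c(double *dst, const double *src,
;                                      double mul, int len)
;     {
;         for (int i = 0; i < len; i++)
;             dst[i] = src[i] * mul;
;     }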
%macro VECTOR_DMUL_SCALAR 0
%if ARCH_X86_32
cglobal vector_dmul_scalar, 3,4,3, dst, src, mul, len, lenaddr
    mov          lenq, lenaddrm
%elif UNIX64
cglobal vector_dmul_scalar, 3,3,3, dst, src, len
%else
cglobal vector_dmul_scalar, 4,4,3, dst, src, mul, len
%endif
%if ARCH_X86_32
    VBROADCASTSD  m0, mulm
%else
%if WIN64
    SWAP 0, 2
%endif
    movlhps      xm0, xm0
%if cpuflag(avx)
    vinsertf128  ym0, ym0, xm0, 1
%endif
%endif
    lea         lenq, [lend*8-2*mmsize]
.loop:
    mulpd         m1, m0, [srcq+lenq       ]
    mulpd         m2, m0, [srcq+lenq+mmsize]
    mova  [dstq+lenq       ], m1
    mova  [dstq+lenq+mmsize], m2
    sub         lenq, 2*mmsize
    jge         .loop
    REP_RET
%endmacro

INIT_XMM sse2
VECTOR_DMUL_SCALAR
%if HAVE_AVX_EXTERNAL
INIT_YMM avx
VECTOR_DMUL_SCALAR
%endif

;-----------------------------------------------------------------------------
; vector_fmul_window(float *dst, const float *src0,
;                    const float *src1, const float *win, int len);
;-----------------------------------------------------------------------------
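; The loop walks the first halves of win/src0 forwards and the second halves
; of win/src1 backwards to produce a windowed overlap; from the indexing, dst
; and win span 2*len floats while src0 and src1 each supply len floats.
; Scalar C sketch of the intended result (illustrative only; len is assumed
; to be a multiple of the vector width):
;
;     static void vector_fmul_window_c(float *dst, const float *src0,
;                                      const float *src1, const float *win,
;                                      int len)
;     {
;         dst  += len;
;         win  += len;
;         src0 += len;
;         for (int i = -len, j = len - 1; i < 0; i++, j--) {
;             float s0 = src0[i], s1 = src1[j];
;             float wi = win[i],  wj = win[j];
;             dst[i] = s0 * wj - s1 * wi;
;             dst[j] = s0 * wi + s1 * wj;
;         }
;     }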
%macro VECTOR_FMUL_WINDOW 0
cglobal vector_fmul_window, 5, 6, 6, dst, src0, src1, win, len, len1
    shl     lend, 2
    lea    len1q, [lenq - mmsize]
    add    src0q, lenq
    add     dstq, lenq
    add     winq, lenq
    neg     lenq
.loop:
    mova      m0, [winq  + lenq]
    mova      m4, [src0q + lenq]
%if cpuflag(sse)
    mova      m1, [winq  + len1q]
    mova      m5, [src1q + len1q]
    shufps    m1, m1, 0x1b
    shufps    m5, m5, 0x1b
    mova      m2, m0
    mova      m3, m1
    mulps     m2, m4
    mulps     m3, m5
    mulps     m1, m4
    mulps     m0, m5
    addps     m2, m3
    subps     m1, m0
    shufps    m2, m2, 0x1b
%else
    pswapd    m1, [winq  + len1q]
    pswapd    m5, [src1q + len1q]
    mova      m2, m0
    mova      m3, m1
    pfmul     m2, m4
    pfmul     m3, m5
    pfmul     m1, m4
    pfmul     m0, m5
    pfadd     m2, m3
    pfsub     m1, m0
    pswapd    m2, m2
%endif
    mova  [dstq + lenq ], m1
    mova  [dstq + len1q], m2
    sub    len1q, mmsize
    add     lenq, mmsize
    jl     .loop
%if mmsize == 8
    femms
%endif
    REP_RET
%endmacro

INIT_MMX 3dnowext
VECTOR_FMUL_WINDOW
INIT_XMM sse
VECTOR_FMUL_WINDOW

;-----------------------------------------------------------------------------
; vector_fmul_add(float *dst, const float *src0, const float *src1,
;                 const float *src2, int len)
;-----------------------------------------------------------------------------
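; Scalar C sketch of the multiply-add below (illustrative only; the loop
; handles 2*mmsize bytes per iteration, so len is assumed to be a multiple
; of 8 with SSE and 16 with AVX/FMA3):
;
;     static void vector_fmul_add_c(float *dst, const float *src0,
;                                   const float *src1, const float *src2,
;                                   int len)
;     {
;         for (int i = 0; i < len; i++)
;             dst[i] = src0[i] * src1[i] + src2[i];
;     }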
%macro VECTOR_FMUL_ADD 0
cglobal vector_fmul_add, 5,5,4, dst, src0, src1, src2, len
    lea       lenq, [lend*4 - 2*mmsize]
ALIGN 16
.loop:
    mova    m0, [src0q + lenq]
    mova    m1, [src0q + lenq + mmsize]
%if cpuflag(fma3)
    mova    m2,     [src2q + lenq]
    mova    m3,     [src2q + lenq + mmsize]
    fmaddps m0, m0, [src1q + lenq], m2
    fmaddps m1, m1, [src1q + lenq + mmsize], m3
%else
    mulps   m0, m0, [src1q + lenq]
    mulps   m1, m1, [src1q + lenq + mmsize]
    addps   m0, m0, [src2q + lenq]
    addps   m1, m1, [src2q + lenq + mmsize]
%endif
    mova    [dstq + lenq], m0
    mova    [dstq + lenq + mmsize], m1

    sub     lenq, 2*mmsize
    jge     .loop
    REP_RET
%endmacro

INIT_XMM sse
VECTOR_FMUL_ADD
%if HAVE_AVX_EXTERNAL
INIT_YMM avx
VECTOR_FMUL_ADD
%endif
%if HAVE_FMA3_EXTERNAL
INIT_YMM fma3
VECTOR_FMUL_ADD
%endif

;-----------------------------------------------------------------------------
; void vector_fmul_reverse(float *dst, const float *src0, const float *src1,
;                          int len)
;-----------------------------------------------------------------------------
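; src1 is read forwards and reversed register-by-register while src0/dst are
; walked from the top down. Scalar C sketch of the result (illustrative only;
; len is assumed to be a multiple of 8 floats with SSE, 16 with AVX):
;
;     static void vector_fmul_reverse_c(float *dst, const float *src0,
;                                       const float *src1, int len)
;     {
;         for (int i = 0; i < len; i++)
;             dst[i] = src0[i] * src1[len - 1 - i];
;     }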
%macro VECTOR_FMUL_REVERSE 0
cglobal vector_fmul_reverse, 4,4,2, dst, src0, src1, len
    lea       lenq, [lend*4 - 2*mmsize]
ALIGN 16
.loop:
%if cpuflag(avx)
    vmovaps     xmm0, [src1q + 16]
    vinsertf128 m0, m0, [src1q], 1
    vshufps     m0, m0, m0, q0123
    vmovaps     xmm1, [src1q + mmsize + 16]
    vinsertf128 m1, m1, [src1q + mmsize], 1
    vshufps     m1, m1, m1, q0123
%else
    mova    m0, [src1q]
    mova    m1, [src1q + mmsize]
    shufps  m0, m0, q0123
    shufps  m1, m1, q0123
%endif
    mulps   m0, m0, [src0q + lenq + mmsize]
    mulps   m1, m1, [src0q + lenq]
    mova    [dstq + lenq + mmsize], m0
    mova    [dstq + lenq], m1
    add     src1q, 2*mmsize
    sub     lenq,  2*mmsize
    jge     .loop
    REP_RET
%endmacro

INIT_XMM sse
VECTOR_FMUL_REVERSE
%if HAVE_AVX_EXTERNAL
INIT_YMM avx
VECTOR_FMUL_REVERSE
%endif

; float scalarproduct_float_sse(const float *v1, const float *v2, int len)
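; Scalar C sketch of the dot product computed below (illustrative only; the
; SIMD version keeps four partial sums and reduces them at the end, so float
; rounding may differ slightly from this sequential loop; len is assumed to
; be a multiple of 4):
;
;     static float scalarproduct_float_c(const float *v1, const float *v2,
;                                        int len)
;     {
;         float p = 0.0f;
;         for (int i = 0; i < len; i++)
;             p += v1[i] * v2[i];
;         return p;
;     }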
INIT_XMM sse
cglobal scalarproduct_float, 3,3,2, v1, v2, offset
    shl   offsetd, 2
    add   v1q, offsetq
    add   v2q, offsetq
    neg   offsetq
    xorps xmm0, xmm0
.loop:
    movaps  xmm1, [v1q+offsetq]
    mulps   xmm1, [v2q+offsetq]
    addps   xmm0, xmm1
    add     offsetq, 16
    js      .loop
    movhlps xmm1, xmm0
    addps   xmm0, xmm1
    movss   xmm1, xmm0
    shufps  xmm0, xmm0, 1
    addss   xmm0, xmm1
%if ARCH_X86_64 == 0
    movss   r0m,  xmm0
    fld     dword r0m
%endif
    RET

;-----------------------------------------------------------------------------
; void ff_butterflies_float(float *src0, float *src1, int len);
;-----------------------------------------------------------------------------
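; In-place butterfly: each pair is replaced by its sum and difference. Scalar
; C sketch (illustrative only; len is assumed to be a multiple of 4):
;
;     static void butterflies_float_c(float *src0, float *src1, int len)
;     {
;         for (int i = 0; i < len; i++) {
;             float t = src0[i] - src1[i];
;             src0[i] = src0[i] + src1[i];
;             src1[i] = t;
;         }
;     }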
INIT_XMM sse
cglobal butterflies_float, 3,3,3, src0, src1, len
%if ARCH_X86_64
    movsxd    lenq, lend
%endif
    test      lenq, lenq
    jz        .end
    shl       lenq, 2
    add      src0q, lenq
    add      src1q, lenq
    neg       lenq
.loop:
    mova        m0, [src0q + lenq]
    mova        m1, [src1q + lenq]
    subps       m2, m0, m1
    addps       m0, m0, m1
    mova  [src1q + lenq], m2
    mova  [src0q + lenq], m0
    add       lenq, mmsize
    jl        .loop
.end:
    REP_RET