;*****************************************************************************
;* x86-optimized Float DSP functions
;*
;* Copyright 2006 Loren Merritt
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "x86util.asm"

SECTION_RODATA 32
pd_reverse: dd 7, 6, 5, 4, 3, 2, 1, 0

SECTION .text

;-----------------------------------------------------------------------------
; void vector_fmul(float *dst, const float *src0, const float *src1, int len)
;-----------------------------------------------------------------------------
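; A scalar C sketch of the per-element operation, assuming the signature above;
; the SIMD loop handles 64 bytes (16 floats) per pass, so len is taken to be a
; multiple of 16:
;
;     for (int i = 0; i < len; i++)
;         dst[i] = src0[i] * src1[i];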
%macro VECTOR_FMUL 0
cglobal vector_fmul, 4,4,2, dst, src0, src1, len
    lea       lenq, [lend*4 - 64]
ALIGN 16
.loop:
%assign a 0
%rep 32/mmsize
    mova      m0, [src0q + lenq + (a+0)*mmsize]
    mova      m1, [src0q + lenq + (a+1)*mmsize]
    mulps     m0, m0, [src1q + lenq + (a+0)*mmsize]
    mulps     m1, m1, [src1q + lenq + (a+1)*mmsize]
    mova      [dstq + lenq + (a+0)*mmsize], m0
    mova      [dstq + lenq + (a+1)*mmsize], m1
%assign a a+2
%endrep

    sub       lenq, 64
    jge       .loop
    REP_RET
%endmacro

INIT_XMM sse
VECTOR_FMUL
%if HAVE_AVX_EXTERNAL
INIT_YMM avx
VECTOR_FMUL
%endif

;------------------------------------------------------------------------------
; void ff_vector_fmac_scalar(float *dst, const float *src, float mul, int len)
;------------------------------------------------------------------------------
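; A scalar C sketch of the multiply-accumulate this routine performs; both the
; XMM and YMM variants consume 64 bytes (16 floats) per loop pass:
;
;     for (int i = 0; i < len; i++)
;         dst[i] += src[i] * mul;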
%macro VECTOR_FMAC_SCALAR 0
%if UNIX64
cglobal vector_fmac_scalar, 3,3,5, dst, src, len
%else
cglobal vector_fmac_scalar, 4,4,5, dst, src, mul, len
%endif
%if ARCH_X86_32
    VBROADCASTSS m0, mulm
%else
%if WIN64
    SWAP 0, 2
%endif
    shufps      xm0, xm0, 0
%if cpuflag(avx)
    vinsertf128  m0, m0, xm0, 1
%endif
%endif
    lea    lenq, [lend*4-64]
.loop:
%if cpuflag(fma3)
    mova     m1,     [dstq+lenq]
    mova     m2,     [dstq+lenq+1*mmsize]
    fmaddps  m1, m0, [srcq+lenq], m1
    fmaddps  m2, m0, [srcq+lenq+1*mmsize], m2
%else ; cpuflag
    mulps    m1, m0, [srcq+lenq]
    mulps    m2, m0, [srcq+lenq+1*mmsize]
%if mmsize < 32
    mulps    m3, m0, [srcq+lenq+2*mmsize]
    mulps    m4, m0, [srcq+lenq+3*mmsize]
%endif ; mmsize
    addps    m1, m1, [dstq+lenq]
    addps    m2, m2, [dstq+lenq+1*mmsize]
%if mmsize < 32
    addps    m3, m3, [dstq+lenq+2*mmsize]
    addps    m4, m4, [dstq+lenq+3*mmsize]
%endif ; mmsize
%endif ; cpuflag
    mova  [dstq+lenq], m1
    mova  [dstq+lenq+1*mmsize], m2
%if mmsize < 32
    mova  [dstq+lenq+2*mmsize], m3
    mova  [dstq+lenq+3*mmsize], m4
%endif ; mmsize
    sub    lenq, 64
    jge    .loop
    REP_RET
%endmacro

INIT_XMM sse
VECTOR_FMAC_SCALAR
%if HAVE_AVX_EXTERNAL
INIT_YMM avx
VECTOR_FMAC_SCALAR
%endif
%if HAVE_FMA3_EXTERNAL
INIT_YMM fma3
VECTOR_FMAC_SCALAR
%endif

;------------------------------------------------------------------------------
; void ff_vector_fmul_scalar(float *dst, const float *src, float mul, int len)
;------------------------------------------------------------------------------
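; A scalar C sketch of the operation; the loop below handles one mmsize block
; (4 floats for the SSE build) per pass:
;
;     for (int i = 0; i < len; i++)
;         dst[i] = src[i] * mul;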
%macro VECTOR_FMUL_SCALAR 0
%if UNIX64
cglobal vector_fmul_scalar, 3,3,2, dst, src, len
%else
cglobal vector_fmul_scalar, 4,4,3, dst, src, mul, len
%endif
%if ARCH_X86_32
    movss    m0, mulm
%elif WIN64
    SWAP 0, 2
%endif
    shufps   m0, m0, 0
    lea    lenq, [lend*4-mmsize]
.loop:
    mova     m1, [srcq+lenq]
    mulps    m1, m0
    mova  [dstq+lenq], m1
    sub    lenq, mmsize
    jge    .loop
    REP_RET
%endmacro

INIT_XMM sse
VECTOR_FMUL_SCALAR

;------------------------------------------------------------------------------
; void ff_vector_dmac_scalar(double *dst, const double *src, double mul,
;                            int len)
;------------------------------------------------------------------------------
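; A scalar C sketch of the double-precision multiply-accumulate; each loop pass
; below consumes 4*mmsize bytes of src and dst:
;
;     for (int i = 0; i < len; i++)
;         dst[i] += src[i] * mul;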
%macro VECTOR_DMAC_SCALAR 0
%if ARCH_X86_32
cglobal vector_dmac_scalar, 2,4,5, dst, src, mul, len, lenaddr
    mov          lenq, lenaddrm
    VBROADCASTSD m0, mulm
%else
%if UNIX64
cglobal vector_dmac_scalar, 3,3,5, dst, src, len
%else
cglobal vector_dmac_scalar, 4,4,5, dst, src, mul, len
    SWAP 0, 2
%endif
    movlhps     xm0, xm0
%if cpuflag(avx)
    vinsertf128  m0, m0, xm0, 1
%endif
%endif
    lea    lenq, [lend*8-mmsize*4]
.loop:
%if cpuflag(fma3)
    movaps   m1,     [dstq+lenq]
    movaps   m2,     [dstq+lenq+1*mmsize]
    movaps   m3,     [dstq+lenq+2*mmsize]
    movaps   m4,     [dstq+lenq+3*mmsize]
    fmaddpd  m1, m0, [srcq+lenq], m1
    fmaddpd  m2, m0, [srcq+lenq+1*mmsize], m2
    fmaddpd  m3, m0, [srcq+lenq+2*mmsize], m3
    fmaddpd  m4, m0, [srcq+lenq+3*mmsize], m4
%else ; cpuflag
    mulpd    m1, m0, [srcq+lenq]
    mulpd    m2, m0, [srcq+lenq+1*mmsize]
    mulpd    m3, m0, [srcq+lenq+2*mmsize]
    mulpd    m4, m0, [srcq+lenq+3*mmsize]
    addpd    m1, m1, [dstq+lenq]
    addpd    m2, m2, [dstq+lenq+1*mmsize]
    addpd    m3, m3, [dstq+lenq+2*mmsize]
    addpd    m4, m4, [dstq+lenq+3*mmsize]
%endif ; cpuflag
    movaps [dstq+lenq], m1
    movaps [dstq+lenq+1*mmsize], m2
    movaps [dstq+lenq+2*mmsize], m3
    movaps [dstq+lenq+3*mmsize], m4
    sub    lenq, mmsize*4
    jge    .loop
    REP_RET
%endmacro

INIT_XMM sse2
VECTOR_DMAC_SCALAR
%if HAVE_AVX_EXTERNAL
INIT_YMM avx
VECTOR_DMAC_SCALAR
%endif
%if HAVE_FMA3_EXTERNAL
INIT_YMM fma3
VECTOR_DMAC_SCALAR
%endif

;------------------------------------------------------------------------------
; void ff_vector_dmul_scalar(double *dst, const double *src, double mul,
;                            int len)
;------------------------------------------------------------------------------
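; A scalar C sketch of the double-precision scaling; each loop pass below
; consumes 2*mmsize bytes of src and dst:
;
;     for (int i = 0; i < len; i++)
;         dst[i] = src[i] * mul;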
%macro VECTOR_DMUL_SCALAR 0
%if ARCH_X86_32
cglobal vector_dmul_scalar, 3,4,3, dst, src, mul, len, lenaddr
    mov          lenq, lenaddrm
%elif UNIX64
cglobal vector_dmul_scalar, 3,3,3, dst, src, len
%else
cglobal vector_dmul_scalar, 4,4,3, dst, src, mul, len
%endif
%if ARCH_X86_32
    VBROADCASTSD m0, mulm
%else
%if WIN64
    SWAP 0, 2
%endif
    movlhps     xm0, xm0
%if cpuflag(avx)
    vinsertf128 ym0, ym0, xm0, 1
%endif
%endif
    lea    lenq, [lend*8-2*mmsize]
.loop:
    mulpd    m1, m0, [srcq+lenq       ]
    mulpd    m2, m0, [srcq+lenq+mmsize]
    movaps [dstq+lenq       ], m1
    movaps [dstq+lenq+mmsize], m2
    sub    lenq, 2*mmsize
    jge    .loop
    REP_RET
%endmacro

INIT_XMM sse2
VECTOR_DMUL_SCALAR
%if HAVE_AVX_EXTERNAL
INIT_YMM avx
VECTOR_DMUL_SCALAR
%endif

;-----------------------------------------------------------------------------
; vector_fmul_window(float *dst, const float *src0,
;                    const float *src1, const float *win, int len);
;-----------------------------------------------------------------------------
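; A scalar C sketch of the overlap-window operation this routine is understood
; to implement (mirroring FFmpeg's scalar reference; len is the length of each
; half, and the negative/positive index pair matches the two pointers the loop
; below walks toward each other):
;
;     dst  += len;
;     win  += len;
;     src0 += len;
;     for (int i = -len, j = len - 1; i < 0; i++, j--) {
;         float s0 = src0[i], s1 = src1[j];
;         float wi = win[i],  wj = win[j];
;         dst[i] = s0 * wj - s1 * wi;
;         dst[j] = s0 * wi + s1 * wj;
;     }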
%macro VECTOR_FMUL_WINDOW 0
cglobal vector_fmul_window, 5, 6, 6, dst, src0, src1, win, len, len1
    shl     lend, 2
    lea    len1q, [lenq - mmsize]
    add    src0q, lenq
    add     dstq, lenq
    add     winq, lenq
    neg     lenq
.loop:
    mova      m0, [winq  + lenq]
    mova      m4, [src0q + lenq]
%if cpuflag(sse)
    mova      m1, [winq  + len1q]
    mova      m5, [src1q + len1q]
    shufps    m1, m1, 0x1b
    shufps    m5, m5, 0x1b
    mova      m2, m0
    mova      m3, m1
    mulps     m2, m4
    mulps     m3, m5
    mulps     m1, m4
    mulps     m0, m5
    addps     m2, m3
    subps     m1, m0
    shufps    m2, m2, 0x1b
%else
    pswapd    m1, [winq  + len1q]
    pswapd    m5, [src1q + len1q]
    mova      m2, m0
    mova      m3, m1
    pfmul     m2, m4
    pfmul     m3, m5
    pfmul     m1, m4
    pfmul     m0, m5
    pfadd     m2, m3
    pfsub     m1, m0
    pswapd    m2, m2
%endif
    mova      [dstq + lenq],  m1
    mova      [dstq + len1q], m2
    sub    len1q, mmsize
    add     lenq, mmsize
    jl .loop
%if mmsize == 8
    femms
%endif
    REP_RET
%endmacro

INIT_MMX 3dnowext
VECTOR_FMUL_WINDOW
INIT_XMM sse
VECTOR_FMUL_WINDOW

;-----------------------------------------------------------------------------
; vector_fmul_add(float *dst, const float *src0, const float *src1,
;                 const float *src2, int len)
;-----------------------------------------------------------------------------
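; A scalar C sketch of the multiply-add this routine performs; each loop pass
; below handles 2*mmsize bytes per pointer:
;
;     for (int i = 0; i < len; i++)
;         dst[i] = src0[i] * src1[i] + src2[i];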
%macro VECTOR_FMUL_ADD 0
cglobal vector_fmul_add, 5,5,4, dst, src0, src1, src2, len
    lea       lenq, [lend*4 - 2*mmsize]
ALIGN 16
.loop:
    mova      m0,   [src0q + lenq]
    mova      m1,   [src0q + lenq + mmsize]
%if cpuflag(fma3)
    mova      m2,   [src2q + lenq]
    mova      m3,   [src2q + lenq + mmsize]
    fmaddps   m0, m0, [src1q + lenq], m2
    fmaddps   m1, m1, [src1q + lenq + mmsize], m3
%else
    mulps     m0, m0, [src1q + lenq]
    mulps     m1, m1, [src1q + lenq + mmsize]
    addps     m0, m0, [src2q + lenq]
    addps     m1, m1, [src2q + lenq + mmsize]
%endif
    mova      [dstq + lenq], m0
    mova      [dstq + lenq + mmsize], m1

    sub       lenq, 2*mmsize
    jge       .loop
    REP_RET
%endmacro

INIT_XMM sse
VECTOR_FMUL_ADD
%if HAVE_AVX_EXTERNAL
INIT_YMM avx
VECTOR_FMUL_ADD
%endif
%if HAVE_FMA3_EXTERNAL
INIT_YMM fma3
VECTOR_FMUL_ADD
%endif

;-----------------------------------------------------------------------------
; void vector_fmul_reverse(float *dst, const float *src0, const float *src1,
;                          int len)
;-----------------------------------------------------------------------------
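; A scalar C sketch of the operation: src1 is read back-to-front while src0 and
; dst are indexed normally (the loop below advances src1 forward while stepping
; dst/src0 down through lenq):
;
;     for (int i = 0; i < len; i++)
;         dst[i] = src0[i] * src1[len - 1 - i];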
%macro VECTOR_FMUL_REVERSE 0
cglobal vector_fmul_reverse, 4,4,2, dst, src0, src1, len
%if cpuflag(avx2)
    movaps  m2, [pd_reverse]
%endif
    lea       lenq, [lend*4 - 2*mmsize]
ALIGN 16
.loop:
%if cpuflag(avx2)
    vpermps m0, m2, [src1q]
    vpermps m1, m2, [src1q+mmsize]
%elif cpuflag(avx)
    vmovaps     xmm0, [src1q + 16]
    vinsertf128 m0, m0, [src1q], 1
    vshufps     m0, m0, m0, q0123
    vmovaps     xmm1, [src1q + mmsize + 16]
    vinsertf128 m1, m1, [src1q + mmsize], 1
    vshufps     m1, m1, m1, q0123
%else
    mova    m0, [src1q]
    mova    m1, [src1q + mmsize]
    shufps  m0, m0, q0123
    shufps  m1, m1, q0123
%endif
    mulps   m0, m0, [src0q + lenq + mmsize]
    mulps   m1, m1, [src0q + lenq]
    movaps  [dstq + lenq + mmsize], m0
    movaps  [dstq + lenq], m1
    add     src1q, 2*mmsize
    sub     lenq,  2*mmsize
    jge     .loop
    REP_RET
%endmacro

INIT_XMM sse
VECTOR_FMUL_REVERSE
%if HAVE_AVX_EXTERNAL
INIT_YMM avx
VECTOR_FMUL_REVERSE
%endif
%if HAVE_AVX2_EXTERNAL
INIT_YMM avx2
VECTOR_FMUL_REVERSE
%endif

; float scalarproduct_float_sse(const float *v1, const float *v2, int len)
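; A scalar C sketch of the dot product (the third argument is named "offset"
; below but carries the element count, assumed to be a multiple of 4):
;
;     float sum = 0.f;
;     for (int i = 0; i < len; i++)
;         sum += v1[i] * v2[i];
;     return sum;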
INIT_XMM sse
cglobal scalarproduct_float, 3,3,2, v1, v2, offset
    shl      offsetd, 2
    add      v1q, offsetq
    add      v2q, offsetq
    neg      offsetq
    xorps    xmm0, xmm0
.loop:
    movaps   xmm1, [v1q+offsetq]
    mulps    xmm1, [v2q+offsetq]
    addps    xmm0, xmm1
    add      offsetq, 16
    js       .loop
    movhlps  xmm1, xmm0
    addps    xmm0, xmm1
    movss    xmm1, xmm0
    shufps   xmm0, xmm0, 1
    addss    xmm0, xmm1
%if ARCH_X86_64 == 0
    movss    r0m,  xmm0
    fld      dword r0m
%endif
    RET

;-----------------------------------------------------------------------------
; void ff_butterflies_float(float *src0, float *src1, int len);
;-----------------------------------------------------------------------------
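; A scalar C sketch of the in-place butterfly (sum/difference) performed on the
; two buffers:
;
;     for (int i = 0; i < len; i++) {
;         float t  = src0[i] - src1[i];
;         src0[i] += src1[i];
;         src1[i]  = t;
;     }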
INIT_XMM sse
cglobal butterflies_float, 3,3,3, src0, src1, len
    shl       lend, 2
    add      src0q, lenq
    add      src1q, lenq
    neg       lenq
.loop:
    mova        m0, [src0q + lenq]
    mova        m1, [src1q + lenq]
    subps       m2, m0, m1
    addps       m0, m0, m1
    mova        [src1q + lenq], m2
    mova        [src0q + lenq], m0
    add       lenq, mmsize
    jl .loop
    REP_RET