;******************************************************************************
;* x86 optimized channel mixing
;* Copyright (c) 2012 Justin Ruggles <justin.ruggles@gmail.com>
;*
;* This file is part of Libav.
;*
;* Libav is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* Libav is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with Libav; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "x86inc.asm"
%include "x86util.asm"
%include "util.asm"

SECTION_TEXT
;-----------------------------------------------------------------------------
; void ff_mix_2_to_1_fltp_flt(float **src, float **matrix, int len,
;                             int out_ch, int in_ch);
;-----------------------------------------------------------------------------
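; Mixes the two planar float input channels into src[0] in place. The scalar
; equivalent of the vector loop below is roughly:
;     for (i = 0; i < len; i++)
;         src[0][i] = matrix[0][0] * src[0][i] + matrix[0][1] * src[1][i];
; src1 is kept as a byte offset from src[0], so a single pointer increment
; advances through both channels; two vectors are processed per iteration.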
%macro MIX_2_TO_1_FLTP_FLT 0
cglobal mix_2_to_1_fltp_flt, 3,4,6, src, matrix, len, src1
    mov        src1q, [srcq+gprsize]
    mov         srcq, [srcq]
    sub        src1q, srcq
    mov      matrixq, [matrixq]
    VBROADCASTSS  m4, [matrixq]
    VBROADCASTSS  m5, [matrixq+4]
    ALIGN 16
.loop:
    mulps         m0, m4, [srcq]
    mulps         m1, m5, [srcq+src1q]
    mulps         m2, m4, [srcq+mmsize]
    mulps         m3, m5, [srcq+src1q+mmsize]
    addps         m0, m0, m1
    addps         m2, m2, m3
    mova      [srcq], m0
    mova [srcq+mmsize], m2
    add         srcq, mmsize*2
    sub         lend, mmsize*2/4
    jg .loop
    REP_RET
%endmacro

INIT_XMM sse
MIX_2_TO_1_FLTP_FLT

%if HAVE_AVX_EXTERNAL
INIT_YMM avx
MIX_2_TO_1_FLTP_FLT
%endif
;-----------------------------------------------------------------------------
; void ff_mix_2_to_1_s16p_flt(int16_t **src, float **matrix, int len,
;                             int out_ch, int in_ch);
;-----------------------------------------------------------------------------
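; Same 2-to-1 mix as above, but for planar s16 input with float coefficients:
; each vector of samples is sign-extended to 32 bits (S16_TO_S32_SX), converted
; to float, scaled and summed, then converted back and saturated to int16 with
; packssdw. lend counts samples, so it is decremented by mmsize/2 per iteration
; (two bytes per sample).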
%macro MIX_2_TO_1_S16P_FLT 0
cglobal mix_2_to_1_s16p_flt, 3,4,6, src, matrix, len, src1
    mov        src1q, [srcq+gprsize]
    mov         srcq, [srcq]
    sub        src1q, srcq
    mov      matrixq, [matrixq]
    VBROADCASTSS  m4, [matrixq]
    VBROADCASTSS  m5, [matrixq+4]
    ALIGN 16
.loop:
    mova          m0, [srcq]
    mova          m2, [srcq+src1q]
    S16_TO_S32_SX  0, 1
    S16_TO_S32_SX  2, 3
    cvtdq2ps      m0, m0
    cvtdq2ps      m1, m1
    cvtdq2ps      m2, m2
    cvtdq2ps      m3, m3
    mulps         m0, m4
    mulps         m1, m4
    mulps         m2, m5
    mulps         m3, m5
    addps         m0, m2
    addps         m1, m3
    cvtps2dq      m0, m0
    cvtps2dq      m1, m1
    packssdw      m0, m1
    mova      [srcq], m0
    add         srcq, mmsize
    sub         lend, mmsize/2
    jg .loop
    REP_RET
%endmacro

INIT_XMM sse2
MIX_2_TO_1_S16P_FLT

INIT_XMM sse4
MIX_2_TO_1_S16P_FLT
;-----------------------------------------------------------------------------
; void ff_mix_2_to_1_s16p_q8(int16_t **src, int16_t **matrix, int len,
;                            int out_ch, int in_ch);
;-----------------------------------------------------------------------------
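; Fixed-point variant: the matrix coefficients are signed Q8 (8 fractional
; bits). Each coefficient word is splatted and interleaved with zero so that
; pmaddwd against a duplicated [sample, sample] pair yields a single 32-bit
; sample * coefficient product; the sums are shifted right by 8 to drop the
; fractional bits and then saturated back to int16.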
INIT_XMM sse2
cglobal mix_2_to_1_s16p_q8, 3,4,6, src, matrix, len, src1
    mov        src1q, [srcq+gprsize]
    mov         srcq, [srcq]
    sub        src1q, srcq
    mov      matrixq, [matrixq]
    movd          m4, [matrixq]
    movd          m5, [matrixq]
    SPLATW        m4, m4, 0
    SPLATW        m5, m5, 1
    pxor          m0, m0
    punpcklwd     m4, m0
    punpcklwd     m5, m0
    ALIGN 16
.loop:
    mova          m0, [srcq]
    mova          m2, [srcq+src1q]
    punpckhwd     m1, m0, m0
    punpcklwd     m0, m0
    punpckhwd     m3, m2, m2
    punpcklwd     m2, m2
    pmaddwd       m0, m4
    pmaddwd       m1, m4
    pmaddwd       m2, m5
    pmaddwd       m3, m5
    paddd         m0, m2
    paddd         m1, m3
    psrad         m0, 8
    psrad         m1, 8
    packssdw      m0, m1
    mova      [srcq], m0
    add         srcq, mmsize
    sub         lend, mmsize/2
    jg .loop
    REP_RET
;-----------------------------------------------------------------------------
; void ff_mix_1_to_2_fltp_flt(float **src, float **matrix, int len,
;                             int out_ch, int in_ch);
;-----------------------------------------------------------------------------
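; Expands one planar float channel to two: src[0] is scaled by matrix[0][0] in
; place and a second copy scaled by matrix[1][0] is written to src[1]. As
; above, src1 is converted to a byte offset from src[0].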
%macro MIX_1_TO_2_FLTP_FLT 0
cglobal mix_1_to_2_fltp_flt, 3,5,4, src0, matrix0, len, src1, matrix1
    mov        src1q, [src0q+gprsize]
    mov        src0q, [src0q]
    sub        src1q, src0q
    mov     matrix1q, [matrix0q+gprsize]
    mov     matrix0q, [matrix0q]
    VBROADCASTSS  m2, [matrix0q]
    VBROADCASTSS  m3, [matrix1q]
    ALIGN 16
.loop:
    mova          m0, [src0q]
    mulps         m1, m0, m3
    mulps         m0, m0, m2
    mova     [src0q], m0
    mova [src0q+src1q], m1
    add        src0q, mmsize
    sub         lend, mmsize/4
    jg .loop
    REP_RET
%endmacro

INIT_XMM sse
MIX_1_TO_2_FLTP_FLT

%if HAVE_AVX_EXTERNAL
INIT_YMM avx
MIX_1_TO_2_FLTP_FLT
%endif
;-----------------------------------------------------------------------------
; void ff_mix_1_to_2_s16p_flt(int16_t **src, float **matrix, int len,
;                             int out_ch, int in_ch);
;-----------------------------------------------------------------------------
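; s16 version of the 1-to-2 expansion: samples are widened to float, scaled by
; the two output coefficients, then converted back and saturated to int16.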
%macro MIX_1_TO_2_S16P_FLT 0
cglobal mix_1_to_2_s16p_flt, 3,5,6, src0, matrix0, len, src1, matrix1
    mov        src1q, [src0q+gprsize]
    mov        src0q, [src0q]
    sub        src1q, src0q
    mov     matrix1q, [matrix0q+gprsize]
    mov     matrix0q, [matrix0q]
    VBROADCASTSS  m4, [matrix0q]
    VBROADCASTSS  m5, [matrix1q]
    ALIGN 16
.loop:
    mova          m0, [src0q]
    S16_TO_S32_SX  0, 2
    cvtdq2ps      m0, m0
    cvtdq2ps      m2, m2
    mulps         m1, m0, m5
    mulps         m0, m0, m4
    mulps         m3, m2, m5
    mulps         m2, m2, m4
    cvtps2dq      m0, m0
    cvtps2dq      m1, m1
    cvtps2dq      m2, m2
    cvtps2dq      m3, m3
    packssdw      m0, m2
    packssdw      m1, m3
    mova     [src0q], m0
    mova [src0q+src1q], m1
    add        src0q, mmsize
    sub         lend, mmsize/2
    jg .loop
    REP_RET
%endmacro

INIT_XMM sse2
MIX_1_TO_2_S16P_FLT

INIT_XMM sse4
MIX_1_TO_2_S16P_FLT

%if HAVE_AVX_EXTERNAL
INIT_XMM avx
MIX_1_TO_2_S16P_FLT
%endif
;-----------------------------------------------------------------------------
; void ff_mix_3_8_to_1_2_fltp/s16p_flt(float/int16_t **src, float **matrix,
;                                      int len, int out_ch, int in_ch);
;-----------------------------------------------------------------------------
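; Generic template for mixing 3-8 input channels down to 1 or 2 output
; channels, in planar float or planar s16 format. Each matrix coefficient is
; splatted into its own mm register when enough registers are available; any
; coefficients that do not fit are splatted into aligned stack slots instead
; (matrix_elements_stack). The fmaddps used in the inner loop is a helper from
; util.asm; it is assumed here to compute dst = a*b + c, using its last operand
; as a temporary on CPUs without a fused multiply-add instruction.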
%macro MIX_3_8_TO_1_2_FLT 3 ; %1 = in channels, %2 = out channels, %3 = s16p or fltp
; define some names to make the code clearer
%assign  in_channels %1
%assign out_channels %2
%assign stereo out_channels - 1
%ifidn %3, s16p
    %assign is_s16 1
%else
    %assign is_s16 0
%endif

; determine how many matrix elements must go on the stack vs. mmregs
%assign matrix_elements in_channels * out_channels
%if is_s16
    %if stereo
        %assign needed_mmregs 7
    %else
        %assign needed_mmregs 5
    %endif
%else
    %if stereo
        %assign needed_mmregs 4
    %else
        %assign needed_mmregs 3
    %endif
%endif
%assign matrix_elements_mm num_mmregs - needed_mmregs
%if matrix_elements < matrix_elements_mm
    %assign matrix_elements_mm matrix_elements
%endif
%if matrix_elements_mm < matrix_elements
    %assign matrix_elements_stack matrix_elements - matrix_elements_mm
%else
    %assign matrix_elements_stack 0
%endif

cglobal mix_%1_to_%2_%3_flt, 3,in_channels+2,needed_mmregs+matrix_elements_mm, src0, src1, len, src2, src3, src4, src5, src6, src7

; get aligned stack space if needed
%if matrix_elements_stack > 0
    %if mmsize == 32
        %assign bkpreg %1 + 1
        %define bkpq r %+ bkpreg %+ q
        mov         bkpq, rsp
        and          rsp, ~(mmsize-1)
        sub          rsp, matrix_elements_stack * mmsize
    %else
        %assign pad matrix_elements_stack * mmsize + (mmsize - gprsize) - (stack_offset & (mmsize - gprsize))
        SUB          rsp, pad
    %endif
%endif

; load matrix pointers
%define matrix0q r1q
%define matrix1q r3q
%if stereo
    mov     matrix1q, [matrix0q+gprsize]
%endif
    mov     matrix0q, [matrix0q]

; define matrix coeff names
%assign %%i 0
%assign %%j needed_mmregs
%rep in_channels
    %if %%i >= matrix_elements_mm
        CAT_XDEFINE mx_stack_0_, %%i, 1
        CAT_XDEFINE mx_0_, %%i, [rsp+(%%i-matrix_elements_mm)*mmsize]
    %else
        CAT_XDEFINE mx_stack_0_, %%i, 0
        CAT_XDEFINE mx_0_, %%i, m %+ %%j
        %assign %%j %%j+1
    %endif
    %assign %%i %%i+1
%endrep
%if stereo
    %assign %%i 0
    %rep in_channels
        %if in_channels + %%i >= matrix_elements_mm
            CAT_XDEFINE mx_stack_1_, %%i, 1
            CAT_XDEFINE mx_1_, %%i, [rsp+(in_channels+%%i-matrix_elements_mm)*mmsize]
        %else
            CAT_XDEFINE mx_stack_1_, %%i, 0
            CAT_XDEFINE mx_1_, %%i, m %+ %%j
            %assign %%j %%j+1
        %endif
        %assign %%i %%i+1
    %endrep
%endif

; load/splat matrix coeffs
%assign %%i 0
%rep in_channels
    %if mx_stack_0_ %+ %%i
        VBROADCASTSS m0, [matrix0q+4*%%i]
        mova  mx_0_ %+ %%i, m0
    %else
        VBROADCASTSS mx_0_ %+ %%i, [matrix0q+4*%%i]
    %endif
    %if stereo
        %if mx_stack_1_ %+ %%i
            VBROADCASTSS m0, [matrix1q+4*%%i]
            mova  mx_1_ %+ %%i, m0
        %else
            VBROADCASTSS mx_1_ %+ %%i, [matrix1q+4*%%i]
        %endif
    %endif
    %assign %%i %%i+1
%endrep

; load channel pointers to registers as offsets from the first channel pointer
%if ARCH_X86_64
    movsxd      lenq, r2d
%endif
    shl         lenq, 2-is_s16
%assign %%i 1
%rep (in_channels - 1)
    %if ARCH_X86_32 && in_channels >= 7 && %%i >= 5
        mov        src5q, [src0q+%%i*gprsize]
        add        src5q, lenq
        mov        src %+ %%i %+ m, src5q
    %else
        mov        src %+ %%i %+ q, [src0q+%%i*gprsize]
        add        src %+ %%i %+ q, lenq
    %endif
    %assign %%i %%i+1
%endrep
    mov        src0q, [src0q]
    add        src0q, lenq
    neg         lenq
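; Each source pointer now points to the end of its channel buffer and lenq
; holds the negated byte length, so [ptr+lenq] walks forward through the
; samples and a single add/jl pair both advances and terminates the loop.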
.loop:
; for x86-32 with 7-8 channels we do not have enough gp registers for all src
; pointers, so we have to load some of them from the stack each time
%define copy_src_from_stack ARCH_X86_32 && in_channels >= 7 && %%i >= 5
%if is_s16
    ; mix with s16p input
    mova          m0, [src0q+lenq]
    S16_TO_S32_SX  0, 1
    cvtdq2ps      m0, m0
    cvtdq2ps      m1, m1
    %if stereo
        mulps     m2, m0, mx_1_0
        mulps     m3, m1, mx_1_0
    %endif
    mulps         m0, m0, mx_0_0
    mulps         m1, m1, mx_0_0
    %assign %%i 1
    %rep (in_channels - 1)
        %if copy_src_from_stack
            %define src_ptr src5q
        %else
            %define src_ptr src %+ %%i %+ q
        %endif
        %if stereo
            %if copy_src_from_stack
                mov   src_ptr, src %+ %%i %+ m
            %endif
            mova          m4, [src_ptr+lenq]
            S16_TO_S32_SX  4, 5
            cvtdq2ps      m4, m4
            cvtdq2ps      m5, m5
            fmaddps       m2, m4, mx_1_ %+ %%i, m2, m6
            fmaddps       m3, m5, mx_1_ %+ %%i, m3, m6
            fmaddps       m0, m4, mx_0_ %+ %%i, m0, m4
            fmaddps       m1, m5, mx_0_ %+ %%i, m1, m5
        %else
            %if copy_src_from_stack
                mov   src_ptr, src %+ %%i %+ m
            %endif
            mova          m2, [src_ptr+lenq]
            S16_TO_S32_SX  2, 3
            cvtdq2ps      m2, m2
            cvtdq2ps      m3, m3
            fmaddps       m0, m2, mx_0_ %+ %%i, m0, m4
            fmaddps       m1, m3, mx_0_ %+ %%i, m1, m4
        %endif
        %assign %%i %%i+1
    %endrep
    %if stereo
        cvtps2dq  m2, m2
        cvtps2dq  m3, m3
        packssdw  m2, m3
        mova [src1q+lenq], m2
    %endif
    cvtps2dq      m0, m0
    cvtps2dq      m1, m1
    packssdw      m0, m1
    mova [src0q+lenq], m0
%else
    ; mix with fltp input
    %if stereo || mx_stack_0_0
        mova      m0, [src0q+lenq]
    %endif
    %if stereo
        mulps     m1, m0, mx_1_0
    %endif
    %if stereo || mx_stack_0_0
        mulps     m0, m0, mx_0_0
    %else
        mulps     m0, [src0q+lenq], mx_0_0
    %endif
    %assign %%i 1
    %rep (in_channels - 1)
        %if copy_src_from_stack
            %define src_ptr src5q
            mov  src_ptr, src %+ %%i %+ m
        %else
            %define src_ptr src %+ %%i %+ q
        %endif
        ; avoid extra load for mono if matrix is in a mm register
        %if stereo || mx_stack_0_ %+ %%i
            mova      m2, [src_ptr+lenq]
        %endif
        %if stereo
            fmaddps   m1, m2, mx_1_ %+ %%i, m1, m3
        %endif
        %if stereo || mx_stack_0_ %+ %%i
            fmaddps   m0, m2, mx_0_ %+ %%i, m0, m2
        %else
            fmaddps   m0, mx_0_ %+ %%i, [src_ptr+lenq], m0, m1
        %endif
        %assign %%i %%i+1
    %endrep
    mova [src0q+lenq], m0
    %if stereo
        mova [src1q+lenq], m1
    %endif
%endif
    add         lenq, mmsize
    jl .loop
; restore stack pointer
%if matrix_elements_stack > 0
    %if mmsize == 32
        mov      rsp, bkpq
    %else
        ADD      rsp, pad
    %endif
%endif
; zero ymm high halves
%if mmsize == 32
    vzeroupper
%endif
    RET
%endmacro
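; Instantiate the template above for every supported combination: 3-8 input
; channels, 1 or 2 output channels, fltp (sse/avx/fma4) and s16p
; (sse2/sse4/avx/fma4).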
%macro MIX_3_8_TO_1_2_FLT_FUNCS 0
%assign %%i 3
%rep 6
    INIT_XMM sse
    MIX_3_8_TO_1_2_FLT %%i, 1, fltp
    MIX_3_8_TO_1_2_FLT %%i, 2, fltp
    INIT_XMM sse2
    MIX_3_8_TO_1_2_FLT %%i, 1, s16p
    MIX_3_8_TO_1_2_FLT %%i, 2, s16p
    INIT_XMM sse4
    MIX_3_8_TO_1_2_FLT %%i, 1, s16p
    MIX_3_8_TO_1_2_FLT %%i, 2, s16p
    ; do not use ymm AVX or FMA4 in x86-32 for 6 or more channels due to stack alignment issues
    %if HAVE_AVX_EXTERNAL
        %if ARCH_X86_64 || %%i < 6
            INIT_YMM avx
        %else
            INIT_XMM avx
        %endif
        MIX_3_8_TO_1_2_FLT %%i, 1, fltp
        MIX_3_8_TO_1_2_FLT %%i, 2, fltp
        INIT_XMM avx
        MIX_3_8_TO_1_2_FLT %%i, 1, s16p
        MIX_3_8_TO_1_2_FLT %%i, 2, s16p
    %endif
    %if HAVE_FMA4_EXTERNAL
        %if ARCH_X86_64 || %%i < 6
            INIT_YMM fma4
        %else
            INIT_XMM fma4
        %endif
        MIX_3_8_TO_1_2_FLT %%i, 1, fltp
        MIX_3_8_TO_1_2_FLT %%i, 2, fltp
        INIT_XMM fma4
        MIX_3_8_TO_1_2_FLT %%i, 1, s16p
        MIX_3_8_TO_1_2_FLT %%i, 2, s16p
    %endif
    %assign %%i %%i+1
%endrep
%endmacro

MIX_3_8_TO_1_2_FLT_FUNCS