; audio_mix.asm — x86 optimized channel mixing (NASM, x86inc/x86util macro style)
;******************************************************************************
;* x86 optimized channel mixing
;* Copyright (c) 2012 Justin Ruggles <justin.ruggles@gmail.com>
;*
;* This file is part of Libav.
;*
;* Libav is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* Libav is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with Libav; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
  21. %include "libavutil/x86/x86util.asm"
  22. %include "util.asm"
  23. SECTION_TEXT
  24. ;-----------------------------------------------------------------------------
  25. ; void ff_mix_2_to_1_fltp_flt(float **src, float **matrix, int len,
  26. ; int out_ch, int in_ch);
  27. ;-----------------------------------------------------------------------------
  28. %macro MIX_2_TO_1_FLTP_FLT 0
  29. cglobal mix_2_to_1_fltp_flt, 3,4,6, src, matrix, len, src1
  30. mov src1q, [srcq+gprsize]
  31. mov srcq, [srcq ]
  32. sub src1q, srcq
  33. mov matrixq, [matrixq ]
  34. VBROADCASTSS m4, [matrixq ]
  35. VBROADCASTSS m5, [matrixq+4]
  36. ALIGN 16
  37. .loop:
  38. mulps m0, m4, [srcq ]
  39. mulps m1, m5, [srcq+src1q ]
  40. mulps m2, m4, [srcq+ mmsize]
  41. mulps m3, m5, [srcq+src1q+mmsize]
  42. addps m0, m0, m1
  43. addps m2, m2, m3
  44. mova [srcq ], m0
  45. mova [srcq+mmsize], m2
  46. add srcq, mmsize*2
  47. sub lend, mmsize*2/4
  48. jg .loop
  49. REP_RET
  50. %endmacro
  51. INIT_XMM sse
  52. MIX_2_TO_1_FLTP_FLT
  53. %if HAVE_AVX_EXTERNAL
  54. INIT_YMM avx
  55. MIX_2_TO_1_FLTP_FLT
  56. %endif
  57. ;-----------------------------------------------------------------------------
  58. ; void ff_mix_2_to_1_s16p_flt(int16_t **src, float **matrix, int len,
  59. ; int out_ch, int in_ch);
  60. ;-----------------------------------------------------------------------------
  61. %macro MIX_2_TO_1_S16P_FLT 0
  62. cglobal mix_2_to_1_s16p_flt, 3,4,6, src, matrix, len, src1
  63. mov src1q, [srcq+gprsize]
  64. mov srcq, [srcq]
  65. sub src1q, srcq
  66. mov matrixq, [matrixq ]
  67. VBROADCASTSS m4, [matrixq ]
  68. VBROADCASTSS m5, [matrixq+4]
  69. ALIGN 16
  70. .loop:
  71. mova m0, [srcq ]
  72. mova m2, [srcq+src1q]
  73. S16_TO_S32_SX 0, 1
  74. S16_TO_S32_SX 2, 3
  75. cvtdq2ps m0, m0
  76. cvtdq2ps m1, m1
  77. cvtdq2ps m2, m2
  78. cvtdq2ps m3, m3
  79. mulps m0, m4
  80. mulps m1, m4
  81. mulps m2, m5
  82. mulps m3, m5
  83. addps m0, m2
  84. addps m1, m3
  85. cvtps2dq m0, m0
  86. cvtps2dq m1, m1
  87. packssdw m0, m1
  88. mova [srcq], m0
  89. add srcq, mmsize
  90. sub lend, mmsize/2
  91. jg .loop
  92. REP_RET
  93. %endmacro
  94. INIT_XMM sse2
  95. MIX_2_TO_1_S16P_FLT
  96. INIT_XMM sse4
  97. MIX_2_TO_1_S16P_FLT
  98. ;-----------------------------------------------------------------------------
  99. ; void ff_mix_2_to_1_s16p_q8(int16_t **src, int16_t **matrix, int len,
  100. ; int out_ch, int in_ch);
  101. ;-----------------------------------------------------------------------------
  102. INIT_XMM sse2
  103. cglobal mix_2_to_1_s16p_q8, 3,4,6, src, matrix, len, src1
  104. mov src1q, [srcq+gprsize]
  105. mov srcq, [srcq]
  106. sub src1q, srcq
  107. mov matrixq, [matrixq]
  108. movd m4, [matrixq]
  109. movd m5, [matrixq]
  110. SPLATW m4, m4, 0
  111. SPLATW m5, m5, 1
  112. pxor m0, m0
  113. punpcklwd m4, m0
  114. punpcklwd m5, m0
  115. ALIGN 16
  116. .loop:
  117. mova m0, [srcq ]
  118. mova m2, [srcq+src1q]
  119. punpckhwd m1, m0, m0
  120. punpcklwd m0, m0
  121. punpckhwd m3, m2, m2
  122. punpcklwd m2, m2
  123. pmaddwd m0, m4
  124. pmaddwd m1, m4
  125. pmaddwd m2, m5
  126. pmaddwd m3, m5
  127. paddd m0, m2
  128. paddd m1, m3
  129. psrad m0, 8
  130. psrad m1, 8
  131. packssdw m0, m1
  132. mova [srcq], m0
  133. add srcq, mmsize
  134. sub lend, mmsize/2
  135. jg .loop
  136. REP_RET
  137. ;-----------------------------------------------------------------------------
  138. ; void ff_mix_1_to_2_fltp_flt(float **src, float **matrix, int len,
  139. ; int out_ch, int in_ch);
  140. ;-----------------------------------------------------------------------------
  141. %macro MIX_1_TO_2_FLTP_FLT 0
  142. cglobal mix_1_to_2_fltp_flt, 3,5,4, src0, matrix0, len, src1, matrix1
  143. mov src1q, [src0q+gprsize]
  144. mov src0q, [src0q]
  145. sub src1q, src0q
  146. mov matrix1q, [matrix0q+gprsize]
  147. mov matrix0q, [matrix0q]
  148. VBROADCASTSS m2, [matrix0q]
  149. VBROADCASTSS m3, [matrix1q]
  150. ALIGN 16
  151. .loop:
  152. mova m0, [src0q]
  153. mulps m1, m0, m3
  154. mulps m0, m0, m2
  155. mova [src0q ], m0
  156. mova [src0q+src1q], m1
  157. add src0q, mmsize
  158. sub lend, mmsize/4
  159. jg .loop
  160. REP_RET
  161. %endmacro
  162. INIT_XMM sse
  163. MIX_1_TO_2_FLTP_FLT
  164. %if HAVE_AVX_EXTERNAL
  165. INIT_YMM avx
  166. MIX_1_TO_2_FLTP_FLT
  167. %endif
  168. ;-----------------------------------------------------------------------------
  169. ; void ff_mix_1_to_2_s16p_flt(int16_t **src, float **matrix, int len,
  170. ; int out_ch, int in_ch);
  171. ;-----------------------------------------------------------------------------
  172. %macro MIX_1_TO_2_S16P_FLT 0
  173. cglobal mix_1_to_2_s16p_flt, 3,5,6, src0, matrix0, len, src1, matrix1
  174. mov src1q, [src0q+gprsize]
  175. mov src0q, [src0q]
  176. sub src1q, src0q
  177. mov matrix1q, [matrix0q+gprsize]
  178. mov matrix0q, [matrix0q]
  179. VBROADCASTSS m4, [matrix0q]
  180. VBROADCASTSS m5, [matrix1q]
  181. ALIGN 16
  182. .loop:
  183. mova m0, [src0q]
  184. S16_TO_S32_SX 0, 2
  185. cvtdq2ps m0, m0
  186. cvtdq2ps m2, m2
  187. mulps m1, m0, m5
  188. mulps m0, m0, m4
  189. mulps m3, m2, m5
  190. mulps m2, m2, m4
  191. cvtps2dq m0, m0
  192. cvtps2dq m1, m1
  193. cvtps2dq m2, m2
  194. cvtps2dq m3, m3
  195. packssdw m0, m2
  196. packssdw m1, m3
  197. mova [src0q ], m0
  198. mova [src0q+src1q], m1
  199. add src0q, mmsize
  200. sub lend, mmsize/2
  201. jg .loop
  202. REP_RET
  203. %endmacro
  204. INIT_XMM sse2
  205. MIX_1_TO_2_S16P_FLT
  206. INIT_XMM sse4
  207. MIX_1_TO_2_S16P_FLT
  208. %if HAVE_AVX_EXTERNAL
  209. INIT_XMM avx
  210. MIX_1_TO_2_S16P_FLT
  211. %endif
  212. ;-----------------------------------------------------------------------------
  213. ; void ff_mix_3_8_to_1_2_fltp/s16p_flt(float/int16_t **src, float **matrix,
  214. ; int len, int out_ch, int in_ch);
  215. ;-----------------------------------------------------------------------------
  216. %macro MIX_3_8_TO_1_2_FLT 3 ; %1 = in channels, %2 = out channels, %3 = s16p or fltp
  217. ; define some names to make the code clearer
  218. %assign in_channels %1
  219. %assign out_channels %2
  220. %assign stereo out_channels - 1
  221. %ifidn %3, s16p
  222. %assign is_s16 1
  223. %else
  224. %assign is_s16 0
  225. %endif
  226. ; determine how many matrix elements must go on the stack vs. mmregs
  227. %assign matrix_elements in_channels * out_channels
  228. %if is_s16
  229. %if stereo
  230. %assign needed_mmregs 7
  231. %else
  232. %assign needed_mmregs 5
  233. %endif
  234. %else
  235. %if stereo
  236. %assign needed_mmregs 4
  237. %else
  238. %assign needed_mmregs 3
  239. %endif
  240. %endif
  241. %assign matrix_elements_mm num_mmregs - needed_mmregs
  242. %if matrix_elements < matrix_elements_mm
  243. %assign matrix_elements_mm matrix_elements
  244. %endif
  245. %if matrix_elements_mm < matrix_elements
  246. %assign matrix_elements_stack matrix_elements - matrix_elements_mm
  247. %else
  248. %assign matrix_elements_stack 0
  249. %endif
  250. %assign matrix_stack_size matrix_elements_stack * mmsize
  251. %assign needed_stack_size -1 * matrix_stack_size
  252. %if ARCH_X86_32 && in_channels >= 7
  253. %assign needed_stack_size needed_stack_size - 16
  254. %endif
  255. cglobal mix_%1_to_%2_%3_flt, 3,in_channels+2,needed_mmregs+matrix_elements_mm, needed_stack_size, src0, src1, len, src2, src3, src4, src5, src6, src7
  256. ; define src pointers on stack if needed
  257. %if matrix_elements_stack > 0 && ARCH_X86_32 && in_channels >= 7
  258. %define src5m [rsp+matrix_stack_size+0]
  259. %define src6m [rsp+matrix_stack_size+4]
  260. %define src7m [rsp+matrix_stack_size+8]
  261. %endif
  262. ; load matrix pointers
  263. %define matrix0q r1q
  264. %define matrix1q r3q
  265. %if stereo
  266. mov matrix1q, [matrix0q+gprsize]
  267. %endif
  268. mov matrix0q, [matrix0q]
  269. ; define matrix coeff names
  270. %assign %%i 0
  271. %assign %%j needed_mmregs
  272. %rep in_channels
  273. %if %%i >= matrix_elements_mm
  274. CAT_XDEFINE mx_stack_0_, %%i, 1
  275. CAT_XDEFINE mx_0_, %%i, [rsp+(%%i-matrix_elements_mm)*mmsize]
  276. %else
  277. CAT_XDEFINE mx_stack_0_, %%i, 0
  278. CAT_XDEFINE mx_0_, %%i, m %+ %%j
  279. %assign %%j %%j+1
  280. %endif
  281. %assign %%i %%i+1
  282. %endrep
  283. %if stereo
  284. %assign %%i 0
  285. %rep in_channels
  286. %if in_channels + %%i >= matrix_elements_mm
  287. CAT_XDEFINE mx_stack_1_, %%i, 1
  288. CAT_XDEFINE mx_1_, %%i, [rsp+(in_channels+%%i-matrix_elements_mm)*mmsize]
  289. %else
  290. CAT_XDEFINE mx_stack_1_, %%i, 0
  291. CAT_XDEFINE mx_1_, %%i, m %+ %%j
  292. %assign %%j %%j+1
  293. %endif
  294. %assign %%i %%i+1
  295. %endrep
  296. %endif
  297. ; load/splat matrix coeffs
  298. %assign %%i 0
  299. %rep in_channels
  300. %if mx_stack_0_ %+ %%i
  301. VBROADCASTSS m0, [matrix0q+4*%%i]
  302. mova mx_0_ %+ %%i, m0
  303. %else
  304. VBROADCASTSS mx_0_ %+ %%i, [matrix0q+4*%%i]
  305. %endif
  306. %if stereo
  307. %if mx_stack_1_ %+ %%i
  308. VBROADCASTSS m0, [matrix1q+4*%%i]
  309. mova mx_1_ %+ %%i, m0
  310. %else
  311. VBROADCASTSS mx_1_ %+ %%i, [matrix1q+4*%%i]
  312. %endif
  313. %endif
  314. %assign %%i %%i+1
  315. %endrep
  316. ; load channel pointers to registers as offsets from the first channel pointer
  317. %if ARCH_X86_64
  318. movsxd lenq, r2d
  319. %endif
  320. shl lenq, 2-is_s16
  321. %assign %%i 1
  322. %rep (in_channels - 1)
  323. %if ARCH_X86_32 && in_channels >= 7 && %%i >= 5
  324. mov src5q, [src0q+%%i*gprsize]
  325. add src5q, lenq
  326. mov src %+ %%i %+ m, src5q
  327. %else
  328. mov src %+ %%i %+ q, [src0q+%%i*gprsize]
  329. add src %+ %%i %+ q, lenq
  330. %endif
  331. %assign %%i %%i+1
  332. %endrep
  333. mov src0q, [src0q]
  334. add src0q, lenq
  335. neg lenq
  336. .loop:
  337. ; for x86-32 with 7-8 channels we do not have enough gp registers for all src
  338. ; pointers, so we have to load some of them from the stack each time
  339. %define copy_src_from_stack ARCH_X86_32 && in_channels >= 7 && %%i >= 5
  340. %if is_s16
  341. ; mix with s16p input
  342. mova m0, [src0q+lenq]
  343. S16_TO_S32_SX 0, 1
  344. cvtdq2ps m0, m0
  345. cvtdq2ps m1, m1
  346. %if stereo
  347. mulps m2, m0, mx_1_0
  348. mulps m3, m1, mx_1_0
  349. %endif
  350. mulps m0, m0, mx_0_0
  351. mulps m1, m1, mx_0_0
  352. %assign %%i 1
  353. %rep (in_channels - 1)
  354. %if copy_src_from_stack
  355. %define src_ptr src5q
  356. %else
  357. %define src_ptr src %+ %%i %+ q
  358. %endif
  359. %if stereo
  360. %if copy_src_from_stack
  361. mov src_ptr, src %+ %%i %+ m
  362. %endif
  363. mova m4, [src_ptr+lenq]
  364. S16_TO_S32_SX 4, 5
  365. cvtdq2ps m4, m4
  366. cvtdq2ps m5, m5
  367. fmaddps m2, m4, mx_1_ %+ %%i, m2, m6
  368. fmaddps m3, m5, mx_1_ %+ %%i, m3, m6
  369. fmaddps m0, m4, mx_0_ %+ %%i, m0, m4
  370. fmaddps m1, m5, mx_0_ %+ %%i, m1, m5
  371. %else
  372. %if copy_src_from_stack
  373. mov src_ptr, src %+ %%i %+ m
  374. %endif
  375. mova m2, [src_ptr+lenq]
  376. S16_TO_S32_SX 2, 3
  377. cvtdq2ps m2, m2
  378. cvtdq2ps m3, m3
  379. fmaddps m0, m2, mx_0_ %+ %%i, m0, m4
  380. fmaddps m1, m3, mx_0_ %+ %%i, m1, m4
  381. %endif
  382. %assign %%i %%i+1
  383. %endrep
  384. %if stereo
  385. cvtps2dq m2, m2
  386. cvtps2dq m3, m3
  387. packssdw m2, m3
  388. mova [src1q+lenq], m2
  389. %endif
  390. cvtps2dq m0, m0
  391. cvtps2dq m1, m1
  392. packssdw m0, m1
  393. mova [src0q+lenq], m0
  394. %else
  395. ; mix with fltp input
  396. %if stereo || mx_stack_0_0
  397. mova m0, [src0q+lenq]
  398. %endif
  399. %if stereo
  400. mulps m1, m0, mx_1_0
  401. %endif
  402. %if stereo || mx_stack_0_0
  403. mulps m0, m0, mx_0_0
  404. %else
  405. mulps m0, [src0q+lenq], mx_0_0
  406. %endif
  407. %assign %%i 1
  408. %rep (in_channels - 1)
  409. %if copy_src_from_stack
  410. %define src_ptr src5q
  411. mov src_ptr, src %+ %%i %+ m
  412. %else
  413. %define src_ptr src %+ %%i %+ q
  414. %endif
  415. ; avoid extra load for mono if matrix is in a mm register
  416. %if stereo || mx_stack_0_ %+ %%i
  417. mova m2, [src_ptr+lenq]
  418. %endif
  419. %if stereo
  420. fmaddps m1, m2, mx_1_ %+ %%i, m1, m3
  421. %endif
  422. %if stereo || mx_stack_0_ %+ %%i
  423. fmaddps m0, m2, mx_0_ %+ %%i, m0, m2
  424. %else
  425. fmaddps m0, mx_0_ %+ %%i, [src_ptr+lenq], m0, m1
  426. %endif
  427. %assign %%i %%i+1
  428. %endrep
  429. mova [src0q+lenq], m0
  430. %if stereo
  431. mova [src1q+lenq], m1
  432. %endif
  433. %endif
  434. add lenq, mmsize
  435. jl .loop
  436. ; zero ymm high halves
  437. %if mmsize == 32
  438. vzeroupper
  439. %endif
  440. RET
  441. %endmacro
  442. %macro MIX_3_8_TO_1_2_FLT_FUNCS 0
  443. %assign %%i 3
  444. %rep 6
  445. INIT_XMM sse
  446. MIX_3_8_TO_1_2_FLT %%i, 1, fltp
  447. MIX_3_8_TO_1_2_FLT %%i, 2, fltp
  448. INIT_XMM sse2
  449. MIX_3_8_TO_1_2_FLT %%i, 1, s16p
  450. MIX_3_8_TO_1_2_FLT %%i, 2, s16p
  451. INIT_XMM sse4
  452. MIX_3_8_TO_1_2_FLT %%i, 1, s16p
  453. MIX_3_8_TO_1_2_FLT %%i, 2, s16p
  454. ; do not use ymm AVX or FMA4 in x86-32 for 6 or more channels due to stack alignment issues
  455. %if HAVE_AVX_EXTERNAL
  456. %if ARCH_X86_64 || %%i < 6
  457. INIT_YMM avx
  458. %else
  459. INIT_XMM avx
  460. %endif
  461. MIX_3_8_TO_1_2_FLT %%i, 1, fltp
  462. MIX_3_8_TO_1_2_FLT %%i, 2, fltp
  463. INIT_XMM avx
  464. MIX_3_8_TO_1_2_FLT %%i, 1, s16p
  465. MIX_3_8_TO_1_2_FLT %%i, 2, s16p
  466. %endif
  467. %if HAVE_FMA4_EXTERNAL
  468. %if ARCH_X86_64 || %%i < 6
  469. INIT_YMM fma4
  470. %else
  471. INIT_XMM fma4
  472. %endif
  473. MIX_3_8_TO_1_2_FLT %%i, 1, fltp
  474. MIX_3_8_TO_1_2_FLT %%i, 2, fltp
  475. INIT_XMM fma4
  476. MIX_3_8_TO_1_2_FLT %%i, 1, s16p
  477. MIX_3_8_TO_1_2_FLT %%i, 2, s16p
  478. %endif
  479. %assign %%i %%i+1
  480. %endrep
  481. %endmacro
  482. MIX_3_8_TO_1_2_FLT_FUNCS