;******************************************************************************
;* x86 optimized Format Conversion Utils
;* Copyright (c) 2008 Loren Merritt
;* Copyright (c) 2012 Justin Ruggles <justin.ruggles@gmail.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"
%include "util.asm"
SECTION_RODATA 32

; scale factors, stored as raw IEEE-754 single-precision bit patterns
pf_s32_inv_scale: times 8 dd 0x30000000 ; 2^-31:   s32 range -> [-1.0,1.0]
pf_s32_scale:     times 8 dd 0x4f000000 ; 2^31:    [-1.0,1.0] -> s32 range
pf_s32_clip:      times 8 dd 0x4effffff ; 2147483520.0, largest float < 2^31
pf_s16_inv_scale: times 4 dd 0x38000000 ; 2^-15:   s16 range -> [-1.0,1.0]
pf_s16_scale:     times 4 dd 0x47000000 ; 32768.0: [-1.0,1.0] -> s16 range

pb_shuf_unpack_even:   db -1, -1, 0, 1, -1, -1, 2, 3, -1, -1, 8, 9, -1, -1, 10, 11
pb_shuf_unpack_odd:    db -1, -1, 4, 5, -1, -1, 6, 7, -1, -1, 12, 13, -1, -1, 14, 15
pb_interleave_words:   SHUFFLE_MASK_W 0, 4, 1, 5, 2, 6, 3, 7
pb_deinterleave_words: SHUFFLE_MASK_W 0, 2, 4, 6, 1, 3, 5, 7
pw_zero_even:          times 4 dw 0x0000, 0xffff

SECTION .text

;------------------------------------------------------------------------------
; void ff_conv_s16_to_s32(int32_t *dst, const int16_t *src, int len);
;------------------------------------------------------------------------------
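; Scalar C sketch of this conversion (illustrative only; the SIMD loop below
; produces the same result several samples at a time):
;     for (int i = 0; i < len; i++)
;         dst[i] = (int32_t)src[i] << 16;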
INIT_XMM sse2
cglobal conv_s16_to_s32, 3,3,3, dst, src, len
    lea       lenq, [2*lend]
    lea       dstq, [dstq+2*lenq]
    add       srcq, lenq
    neg       lenq
.loop:
    mova        m2, [srcq+lenq]
    pxor        m0, m0
    pxor        m1, m1
    punpcklwd   m0, m2
    punpckhwd   m1, m2
    mova  [dstq+2*lenq       ], m0
    mova  [dstq+2*lenq+mmsize], m1
    add       lenq, mmsize
    jl .loop
    REP_RET

;------------------------------------------------------------------------------
; void ff_conv_s16_to_flt(float *dst, const int16_t *src, int len);
;------------------------------------------------------------------------------
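; Scalar C sketch (illustrative; pf_s16_inv_scale is 2^-15):
;     for (int i = 0; i < len; i++)
;         dst[i] = src[i] * (1.0f / 32768.0f);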
%macro CONV_S16_TO_FLT 0
cglobal conv_s16_to_flt, 3,3,3, dst, src, len
    lea       lenq, [2*lend]
    add       srcq, lenq
    lea       dstq, [dstq+2*lenq]
    neg       lenq
    mova        m2, [pf_s16_inv_scale]
ALIGN 16
.loop:
    mova        m0, [srcq+lenq]
    S16_TO_S32_SX 0, 1
    cvtdq2ps    m0, m0
    cvtdq2ps    m1, m1
    mulps       m0, m2
    mulps       m1, m2
    mova  [dstq+2*lenq       ], m0
    mova  [dstq+2*lenq+mmsize], m1
    add       lenq, mmsize
    jl .loop
    REP_RET
%endmacro

INIT_XMM sse2
CONV_S16_TO_FLT
INIT_XMM sse4
CONV_S16_TO_FLT

;------------------------------------------------------------------------------
; void ff_conv_s32_to_s16(int16_t *dst, const int32_t *src, int len);
;------------------------------------------------------------------------------
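; Scalar C sketch (illustrative; the conversion simply drops the low 16 bits):
;     for (int i = 0; i < len; i++)
;         dst[i] = (int16_t)(src[i] >> 16);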
%macro CONV_S32_TO_S16 0
cglobal conv_s32_to_s16, 3,3,4, dst, src, len
    lea       lenq, [2*lend]
    lea       srcq, [srcq+2*lenq]
    add       dstq, lenq
    neg       lenq
.loop:
    mova        m0, [srcq+2*lenq         ]
    mova        m1, [srcq+2*lenq+  mmsize]
    mova        m2, [srcq+2*lenq+2*mmsize]
    mova        m3, [srcq+2*lenq+3*mmsize]
    psrad       m0, 16
    psrad       m1, 16
    psrad       m2, 16
    psrad       m3, 16
    packssdw    m0, m1
    packssdw    m2, m3
    mova  [dstq+lenq       ], m0
    mova  [dstq+lenq+mmsize], m2
    add       lenq, mmsize*2
    jl .loop
%if mmsize == 8
    emms
    RET
%else
    REP_RET
%endif
%endmacro

INIT_MMX mmx
CONV_S32_TO_S16
INIT_XMM sse2
CONV_S32_TO_S16

;------------------------------------------------------------------------------
; void ff_conv_s32_to_flt(float *dst, const int32_t *src, int len);
;------------------------------------------------------------------------------
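; Scalar C sketch (illustrative; pf_s32_inv_scale is 2^-31):
;     for (int i = 0; i < len; i++)
;         dst[i] = src[i] * (1.0f / 2147483648.0f);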
%macro CONV_S32_TO_FLT 0
cglobal conv_s32_to_flt, 3,3,3, dst, src, len
    lea       lenq, [4*lend]
    add       srcq, lenq
    add       dstq, lenq
    neg       lenq
    mova        m0, [pf_s32_inv_scale]
ALIGN 16
.loop:
    cvtdq2ps    m1, [srcq+lenq       ]
    cvtdq2ps    m2, [srcq+lenq+mmsize]
    mulps       m1, m1, m0
    mulps       m2, m2, m0
    mova  [dstq+lenq       ], m1
    mova  [dstq+lenq+mmsize], m2
    add       lenq, mmsize*2
    jl .loop
    REP_RET
%endmacro

INIT_XMM sse2
CONV_S32_TO_FLT
%if HAVE_AVX_EXTERNAL
INIT_YMM avx
CONV_S32_TO_FLT
%endif

;------------------------------------------------------------------------------
; void ff_conv_flt_to_s16(int16_t *dst, const float *src, int len);
;------------------------------------------------------------------------------
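; Scalar C sketch (illustrative; cvtps2dq rounds to nearest and packssdw
; saturates, so the net effect is roughly):
;     for (int i = 0; i < len; i++)
;         dst[i] = av_clip_int16(lrintf(src[i] * 32768.0f));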
INIT_XMM sse2
cglobal conv_flt_to_s16, 3,3,5, dst, src, len
    lea       lenq, [2*lend]
    lea       srcq, [srcq+2*lenq]
    add       dstq, lenq
    neg       lenq
    mova        m4, [pf_s16_scale]
.loop:
    mova        m0, [srcq+2*lenq         ]
    mova        m1, [srcq+2*lenq+1*mmsize]
    mova        m2, [srcq+2*lenq+2*mmsize]
    mova        m3, [srcq+2*lenq+3*mmsize]
    mulps       m0, m4
    mulps       m1, m4
    mulps       m2, m4
    mulps       m3, m4
    cvtps2dq    m0, m0
    cvtps2dq    m1, m1
    cvtps2dq    m2, m2
    cvtps2dq    m3, m3
    packssdw    m0, m1
    packssdw    m2, m3
    mova  [dstq+lenq       ], m0
    mova  [dstq+lenq+mmsize], m2
    add       lenq, mmsize*2
    jl .loop
    REP_RET

;------------------------------------------------------------------------------
; void ff_conv_flt_to_s32(int32_t *dst, const float *src, int len);
;------------------------------------------------------------------------------
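; Scalar C sketch (illustrative; positive overflow is clipped to the largest
; float below 2^31 before conversion):
;     for (int i = 0; i < len; i++)
;         dst[i] = (int32_t)lrintf(fminf(src[i] * 2147483648.0f, 2147483520.0f));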
%macro CONV_FLT_TO_S32 0
cglobal conv_flt_to_s32, 3,3,6, dst, src, len
    lea       lenq, [lend*4]
    add       srcq, lenq
    add       dstq, lenq
    neg       lenq
    mova        m4, [pf_s32_scale]
    mova        m5, [pf_s32_clip]
.loop:
    mulps       m0, m4, [srcq+lenq         ]
    mulps       m1, m4, [srcq+lenq+1*mmsize]
    mulps       m2, m4, [srcq+lenq+2*mmsize]
    mulps       m3, m4, [srcq+lenq+3*mmsize]
    minps       m0, m0, m5
    minps       m1, m1, m5
    minps       m2, m2, m5
    minps       m3, m3, m5
    cvtps2dq    m0, m0
    cvtps2dq    m1, m1
    cvtps2dq    m2, m2
    cvtps2dq    m3, m3
    mova  [dstq+lenq         ], m0
    mova  [dstq+lenq+1*mmsize], m1
    mova  [dstq+lenq+2*mmsize], m2
    mova  [dstq+lenq+3*mmsize], m3
    add       lenq, mmsize*4
    jl .loop
    REP_RET
%endmacro

INIT_XMM sse2
CONV_FLT_TO_S32
%if HAVE_AVX_EXTERNAL
INIT_YMM avx
CONV_FLT_TO_S32
%endif

;------------------------------------------------------------------------------
; void ff_conv_s16p_to_s16_2ch(int16_t *dst, int16_t *const *src, int len,
;                              int channels);
;------------------------------------------------------------------------------
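; Scalar C sketch (illustrative; planar to interleaved):
;     for (int i = 0; i < len; i++) {
;         dst[2*i  ] = src[0][i];
;         dst[2*i+1] = src[1][i];
;     }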
%macro CONV_S16P_TO_S16_2CH 0
cglobal conv_s16p_to_s16_2ch, 3,4,5, dst, src0, len, src1
    mov      src1q, [src0q+gprsize]
    mov      src0q, [src0q        ]
    lea       lenq, [2*lend]
    add      src0q, lenq
    add      src1q, lenq
    lea       dstq, [dstq+2*lenq]
    neg       lenq
.loop:
    mova        m0, [src0q+lenq       ]
    mova        m1, [src1q+lenq       ]
    mova        m2, [src0q+lenq+mmsize]
    mova        m3, [src1q+lenq+mmsize]
    SBUTTERFLY2 wd, 0, 1, 4
    SBUTTERFLY2 wd, 2, 3, 4
    mova  [dstq+2*lenq+0*mmsize], m0
    mova  [dstq+2*lenq+1*mmsize], m1
    mova  [dstq+2*lenq+2*mmsize], m2
    mova  [dstq+2*lenq+3*mmsize], m3
    add       lenq, 2*mmsize
    jl .loop
    REP_RET
%endmacro

INIT_XMM sse2
CONV_S16P_TO_S16_2CH
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
CONV_S16P_TO_S16_2CH
%endif

;------------------------------------------------------------------------------
; void ff_conv_s16p_to_s16_6ch(int16_t *dst, int16_t *const *src, int len,
;                              int channels);
;------------------------------------------------------------------------------

;------------------------------------------------------------------------------
; NOTE: In the 6-channel functions, len could be used as an index on x86-64
;       instead of just a counter, which would avoid incrementing the
;       pointers, but the extra complexity and amount of code is not worth
;       the small gain. On x86-32 there are not enough registers to use len
;       as an index without keeping two of the pointers on the stack and
;       loading them in each iteration.
;------------------------------------------------------------------------------
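; Scalar C sketch (illustrative; the same access pattern applies to all of the
; 6-channel interleaving functions below):
;     for (int i = 0; i < len; i++)
;         for (int ch = 0; ch < 6; ch++)
;             dst[6*i+ch] = src[ch][i];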
%macro CONV_S16P_TO_S16_6CH 0
%if ARCH_X86_64
cglobal conv_s16p_to_s16_6ch, 3,8,7, dst, src0, len, src1, src2, src3, src4, src5
%else
cglobal conv_s16p_to_s16_6ch, 2,7,7, dst, src0, src1, src2, src3, src4, src5
%define lend dword r2m
%endif
    mov      src1q, [src0q+1*gprsize]
    mov      src2q, [src0q+2*gprsize]
    mov      src3q, [src0q+3*gprsize]
    mov      src4q, [src0q+4*gprsize]
    mov      src5q, [src0q+5*gprsize]
    mov      src0q, [src0q]
    sub      src1q, src0q
    sub      src2q, src0q
    sub      src3q, src0q
    sub      src4q, src0q
    sub      src5q, src0q
.loop:
%if cpuflag(sse2slow)
    movq        m0, [src0q      ]  ; m0 = 0, 6, 12, 18, x, x, x, x
    movq        m1, [src0q+src1q]  ; m1 = 1, 7, 13, 19, x, x, x, x
    movq        m2, [src0q+src2q]  ; m2 = 2, 8, 14, 20, x, x, x, x
    movq        m3, [src0q+src3q]  ; m3 = 3, 9, 15, 21, x, x, x, x
    movq        m4, [src0q+src4q]  ; m4 = 4, 10, 16, 22, x, x, x, x
    movq        m5, [src0q+src5q]  ; m5 = 5, 11, 17, 23, x, x, x, x
    ; unpack words:
    punpcklwd   m0, m1             ; m0 = 0, 1, 6, 7, 12, 13, 18, 19
    punpcklwd   m2, m3             ; m2 = 2, 3, 8, 9, 14, 15, 20, 21
    punpcklwd   m4, m5             ; m4 = 4, 5, 10, 11, 16, 17, 22, 23
    ; blend dwords
    shufps      m1, m0, m2, q2020  ; m1 = 0, 1, 12, 13, 2, 3, 14, 15
    shufps      m0, m4, q2031      ; m0 = 6, 7, 18, 19, 4, 5, 16, 17
    shufps      m2, m4, q3131      ; m2 = 8, 9, 20, 21, 10, 11, 22, 23
    ; shuffle dwords
    pshufd      m0, m0, q1302      ; m0 = 4, 5, 6, 7, 16, 17, 18, 19
    pshufd      m1, m1, q3120      ; m1 = 0, 1, 2, 3, 12, 13, 14, 15
    pshufd      m2, m2, q3120      ; m2 = 8, 9, 10, 11, 20, 21, 22, 23
    movq   [dstq+0*mmsize/2], m1
    movq   [dstq+1*mmsize/2], m0
    movq   [dstq+2*mmsize/2], m2
    movhps [dstq+3*mmsize/2], m1
    movhps [dstq+4*mmsize/2], m0
    movhps [dstq+5*mmsize/2], m2
    add      src0q, mmsize/2
    add       dstq, mmsize*3
    sub       lend, mmsize/4
%else
    mova        m0, [src0q      ]  ; m0 = 0, 6, 12, 18, 24, 30, 36, 42
    mova        m1, [src0q+src1q]  ; m1 = 1, 7, 13, 19, 25, 31, 37, 43
    mova        m2, [src0q+src2q]  ; m2 = 2, 8, 14, 20, 26, 32, 38, 44
    mova        m3, [src0q+src3q]  ; m3 = 3, 9, 15, 21, 27, 33, 39, 45
    mova        m4, [src0q+src4q]  ; m4 = 4, 10, 16, 22, 28, 34, 40, 46
    mova        m5, [src0q+src5q]  ; m5 = 5, 11, 17, 23, 29, 35, 41, 47
    ; unpack words:
    SBUTTERFLY2 wd, 0, 1, 6        ; m0 = 0, 1, 6, 7, 12, 13, 18, 19
                                   ; m1 = 24, 25, 30, 31, 36, 37, 42, 43
    SBUTTERFLY2 wd, 2, 3, 6        ; m2 = 2, 3, 8, 9, 14, 15, 20, 21
                                   ; m3 = 26, 27, 32, 33, 38, 39, 44, 45
    SBUTTERFLY2 wd, 4, 5, 6        ; m4 = 4, 5, 10, 11, 16, 17, 22, 23
                                   ; m5 = 28, 29, 34, 35, 40, 41, 46, 47
    ; blend dwords
    shufps      m6, m0, m2, q2020  ; m6 = 0, 1, 12, 13, 2, 3, 14, 15
    shufps      m0, m4, q2031      ; m0 = 6, 7, 18, 19, 4, 5, 16, 17
    shufps      m2, m4, q3131      ; m2 = 8, 9, 20, 21, 10, 11, 22, 23
    SWAP 4,6                       ; m4 = 0, 1, 12, 13, 2, 3, 14, 15
    shufps      m6, m1, m3, q2020  ; m6 = 24, 25, 36, 37, 26, 27, 38, 39
    shufps      m1, m5, q2031      ; m1 = 30, 31, 42, 43, 28, 29, 40, 41
    shufps      m3, m5, q3131      ; m3 = 32, 33, 44, 45, 34, 35, 46, 47
    SWAP 5,6                       ; m5 = 24, 25, 36, 37, 26, 27, 38, 39
    ; shuffle dwords
    pshufd      m0, m0, q1302      ; m0 = 4, 5, 6, 7, 16, 17, 18, 19
    pshufd      m2, m2, q3120      ; m2 = 8, 9, 10, 11, 20, 21, 22, 23
    pshufd      m4, m4, q3120      ; m4 = 0, 1, 2, 3, 12, 13, 14, 15
    pshufd      m1, m1, q1302      ; m1 = 28, 29, 30, 31, 40, 41, 42, 43
    pshufd      m3, m3, q3120      ; m3 = 32, 33, 34, 35, 44, 45, 46, 47
    pshufd      m5, m5, q3120      ; m5 = 24, 25, 26, 27, 36, 37, 38, 39
    ; shuffle qwords
    punpcklqdq  m6, m4, m0         ; m6 = 0, 1, 2, 3, 4, 5, 6, 7
    punpckhqdq  m0, m2             ; m0 = 16, 17, 18, 19, 20, 21, 22, 23
    shufps      m2, m4, q3210      ; m2 = 8, 9, 10, 11, 12, 13, 14, 15
    SWAP 4,6                       ; m4 = 0, 1, 2, 3, 4, 5, 6, 7
    punpcklqdq  m6, m5, m1         ; m6 = 24, 25, 26, 27, 28, 29, 30, 31
    punpckhqdq  m1, m3             ; m1 = 40, 41, 42, 43, 44, 45, 46, 47
    shufps      m3, m5, q3210      ; m3 = 32, 33, 34, 35, 36, 37, 38, 39
    SWAP 5,6                       ; m5 = 24, 25, 26, 27, 28, 29, 30, 31
    mova  [dstq+0*mmsize], m4
    mova  [dstq+1*mmsize], m2
    mova  [dstq+2*mmsize], m0
    mova  [dstq+3*mmsize], m5
    mova  [dstq+4*mmsize], m3
    mova  [dstq+5*mmsize], m1
    add      src0q, mmsize
    add       dstq, mmsize*6
    sub       lend, mmsize/2
%endif
    jg .loop
    REP_RET
%endmacro

INIT_XMM sse2
CONV_S16P_TO_S16_6CH
INIT_XMM sse2slow
CONV_S16P_TO_S16_6CH
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
CONV_S16P_TO_S16_6CH
%endif

;------------------------------------------------------------------------------
; void ff_conv_s16p_to_flt_2ch(float *dst, int16_t *const *src, int len,
;                              int channels);
;------------------------------------------------------------------------------
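; Scalar C sketch (illustrative; samples are shifted into the high word and
; scaled by 2^-31, which amounts to dividing by 32768):
;     for (int i = 0; i < len; i++) {
;         dst[2*i  ] = src[0][i] / 32768.0f;
;         dst[2*i+1] = src[1][i] / 32768.0f;
;     }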
%macro CONV_S16P_TO_FLT_2CH 0
cglobal conv_s16p_to_flt_2ch, 3,4,6, dst, src0, len, src1
    lea       lenq, [2*lend]
    mov      src1q, [src0q+gprsize]
    mov      src0q, [src0q        ]
    lea       dstq, [dstq+4*lenq]
    add      src0q, lenq
    add      src1q, lenq
    neg       lenq
    mova        m5, [pf_s32_inv_scale]
.loop:
    mova        m2, [src0q+lenq]   ; m2 = 0, 2, 4, 6, 8, 10, 12, 14
    mova        m4, [src1q+lenq]   ; m4 = 1, 3, 5, 7, 9, 11, 13, 15
    SBUTTERFLY2 wd, 2, 4, 3        ; m2 = 0, 1, 2, 3, 4, 5, 6, 7
                                   ; m4 = 8, 9, 10, 11, 12, 13, 14, 15
    pxor        m3, m3
    punpcklwd   m0, m3, m2         ; m0 = 0, 1, 2, 3
    punpckhwd   m1, m3, m2         ; m1 = 4, 5, 6, 7
    punpcklwd   m2, m3, m4         ; m2 = 8, 9, 10, 11
    punpckhwd   m3, m4             ; m3 = 12, 13, 14, 15
    cvtdq2ps    m0, m0
    cvtdq2ps    m1, m1
    cvtdq2ps    m2, m2
    cvtdq2ps    m3, m3
    mulps       m0, m5
    mulps       m1, m5
    mulps       m2, m5
    mulps       m3, m5
    mova  [dstq+4*lenq         ], m0
    mova  [dstq+4*lenq+  mmsize], m1
    mova  [dstq+4*lenq+2*mmsize], m2
    mova  [dstq+4*lenq+3*mmsize], m3
    add       lenq, mmsize
    jl .loop
    REP_RET
%endmacro

INIT_XMM sse2
CONV_S16P_TO_FLT_2CH
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
CONV_S16P_TO_FLT_2CH
%endif

;------------------------------------------------------------------------------
; void ff_conv_s16p_to_flt_6ch(float *dst, int16_t *const *src, int len,
;                              int channels);
;------------------------------------------------------------------------------
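; Scalar C sketch (illustrative; same scaling trick as the 2-channel version):
;     for (int i = 0; i < len; i++)
;         for (int ch = 0; ch < 6; ch++)
;             dst[6*i+ch] = src[ch][i] / 32768.0f;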
%macro CONV_S16P_TO_FLT_6CH 0
%if ARCH_X86_64
cglobal conv_s16p_to_flt_6ch, 3,8,8, dst, src, len, src1, src2, src3, src4, src5
%else
cglobal conv_s16p_to_flt_6ch, 2,7,8, dst, src, src1, src2, src3, src4, src5
%define lend dword r2m
%endif
    mov      src1q, [srcq+1*gprsize]
    mov      src2q, [srcq+2*gprsize]
    mov      src3q, [srcq+3*gprsize]
    mov      src4q, [srcq+4*gprsize]
    mov      src5q, [srcq+5*gprsize]
    mov       srcq, [srcq]
    sub      src1q, srcq
    sub      src2q, srcq
    sub      src3q, srcq
    sub      src4q, srcq
    sub      src5q, srcq
    mova        m7, [pf_s32_inv_scale]
%if cpuflag(ssse3)
%define unpack_even m6
    mova        m6, [pb_shuf_unpack_even]
%if ARCH_X86_64
%define unpack_odd m8
    mova        m8, [pb_shuf_unpack_odd]
%else
%define unpack_odd [pb_shuf_unpack_odd]
%endif
%endif
.loop:
    movq        m0, [srcq      ]   ; m0 = 0, 6, 12, 18, x, x, x, x
    movq        m1, [srcq+src1q]   ; m1 = 1, 7, 13, 19, x, x, x, x
    movq        m2, [srcq+src2q]   ; m2 = 2, 8, 14, 20, x, x, x, x
    movq        m3, [srcq+src3q]   ; m3 = 3, 9, 15, 21, x, x, x, x
    movq        m4, [srcq+src4q]   ; m4 = 4, 10, 16, 22, x, x, x, x
    movq        m5, [srcq+src5q]   ; m5 = 5, 11, 17, 23, x, x, x, x
    ; unpack words:
    punpcklwd   m0, m1             ; m0 = 0, 1, 6, 7, 12, 13, 18, 19
    punpcklwd   m2, m3             ; m2 = 2, 3, 8, 9, 14, 15, 20, 21
    punpcklwd   m4, m5             ; m4 = 4, 5, 10, 11, 16, 17, 22, 23
    ; blend dwords
    shufps      m1, m4, m0, q3120  ; m1 = 4, 5, 16, 17, 6, 7, 18, 19
    shufps      m0, m2, q2020      ; m0 = 0, 1, 12, 13, 2, 3, 14, 15
    shufps      m2, m4, q3131      ; m2 = 8, 9, 20, 21, 10, 11, 22, 23
%if cpuflag(ssse3)
    pshufb      m3, m0, unpack_odd   ; m3 = 12, 13, 14, 15
    pshufb      m0, unpack_even      ; m0 = 0, 1, 2, 3
    pshufb      m4, m1, unpack_odd   ; m4 = 16, 17, 18, 19
    pshufb      m1, unpack_even      ; m1 = 4, 5, 6, 7
    pshufb      m5, m2, unpack_odd   ; m5 = 20, 21, 22, 23
    pshufb      m2, unpack_even      ; m2 = 8, 9, 10, 11
%else
    ; shuffle dwords
    pshufd      m0, m0, q3120      ; m0 = 0, 1, 2, 3, 12, 13, 14, 15
    pshufd      m1, m1, q3120      ; m1 = 4, 5, 6, 7, 16, 17, 18, 19
    pshufd      m2, m2, q3120      ; m2 = 8, 9, 10, 11, 20, 21, 22, 23
    pxor        m6, m6             ; convert s16 in m0-m2 to s32 in m0-m5
    punpcklwd   m3, m6, m0         ; m3 = 0, 1, 2, 3
    punpckhwd   m4, m6, m0         ; m4 = 12, 13, 14, 15
    punpcklwd   m0, m6, m1         ; m0 = 4, 5, 6, 7
    punpckhwd   m5, m6, m1         ; m5 = 16, 17, 18, 19
    punpcklwd   m1, m6, m2         ; m1 = 8, 9, 10, 11
    punpckhwd   m6, m2             ; m6 = 20, 21, 22, 23
    SWAP 6,2,1,0,3,4,5             ; swap registers 3,0,1,4,5,6 to 0,1,2,3,4,5
%endif
    cvtdq2ps    m0, m0             ; convert s32 to float
    cvtdq2ps    m1, m1
    cvtdq2ps    m2, m2
    cvtdq2ps    m3, m3
    cvtdq2ps    m4, m4
    cvtdq2ps    m5, m5
    mulps       m0, m7             ; scale float from s32 range to [-1.0,1.0]
    mulps       m1, m7
    mulps       m2, m7
    mulps       m3, m7
    mulps       m4, m7
    mulps       m5, m7
    mova  [dstq         ], m0
    mova  [dstq+  mmsize], m1
    mova  [dstq+2*mmsize], m2
    mova  [dstq+3*mmsize], m3
    mova  [dstq+4*mmsize], m4
    mova  [dstq+5*mmsize], m5
    add       srcq, mmsize/2
    add       dstq, mmsize*6
    sub       lend, mmsize/4
    jg .loop
    REP_RET
%endmacro

INIT_XMM sse2
CONV_S16P_TO_FLT_6CH
INIT_XMM ssse3
CONV_S16P_TO_FLT_6CH
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
CONV_S16P_TO_FLT_6CH
%endif

;------------------------------------------------------------------------------
; void ff_conv_fltp_to_s16_2ch(int16_t *dst, float *const *src, int len,
;                              int channels);
;------------------------------------------------------------------------------
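; Scalar C sketch (illustrative, with round-to-nearest and saturation):
;     for (int i = 0; i < len; i++) {
;         dst[2*i  ] = av_clip_int16(lrintf(src[0][i] * 32768.0f));
;         dst[2*i+1] = av_clip_int16(lrintf(src[1][i] * 32768.0f));
;     }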
%macro CONV_FLTP_TO_S16_2CH 0
cglobal conv_fltp_to_s16_2ch, 3,4,3, dst, src0, len, src1
    lea       lenq, [4*lend]
    mov      src1q, [src0q+gprsize]
    mov      src0q, [src0q        ]
    add       dstq, lenq
    add      src0q, lenq
    add      src1q, lenq
    neg       lenq
    mova        m2, [pf_s16_scale]
%if cpuflag(ssse3)
    mova        m3, [pb_interleave_words]
%endif
.loop:
    mulps       m0, m2, [src0q+lenq]  ; m0 = 0, 2, 4, 6
    mulps       m1, m2, [src1q+lenq]  ; m1 = 1, 3, 5, 7
    cvtps2dq    m0, m0
    cvtps2dq    m1, m1
%if cpuflag(ssse3)
    packssdw    m0, m1             ; m0 = 0, 2, 4, 6, 1, 3, 5, 7
    pshufb      m0, m3             ; m0 = 0, 1, 2, 3, 4, 5, 6, 7
%else
    packssdw    m0, m0             ; m0 = 0, 2, 4, 6, x, x, x, x
    packssdw    m1, m1             ; m1 = 1, 3, 5, 7, x, x, x, x
    punpcklwd   m0, m1             ; m0 = 0, 1, 2, 3, 4, 5, 6, 7
%endif
    mova  [dstq+lenq], m0
    add       lenq, mmsize
    jl .loop
    REP_RET
%endmacro

INIT_XMM sse2
CONV_FLTP_TO_S16_2CH
INIT_XMM ssse3
CONV_FLTP_TO_S16_2CH

;------------------------------------------------------------------------------
; void ff_conv_fltp_to_s16_6ch(int16_t *dst, float *const *src, int len,
;                              int channels);
;------------------------------------------------------------------------------
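; Scalar C sketch (illustrative):
;     for (int i = 0; i < len; i++)
;         for (int ch = 0; ch < 6; ch++)
;             dst[6*i+ch] = av_clip_int16(lrintf(src[ch][i] * 32768.0f));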
%macro CONV_FLTP_TO_S16_6CH 0
%if ARCH_X86_64
cglobal conv_fltp_to_s16_6ch, 3,8,7, dst, src, len, src1, src2, src3, src4, src5
%else
cglobal conv_fltp_to_s16_6ch, 2,7,7, dst, src, src1, src2, src3, src4, src5
%define lend dword r2m
%endif
    mov      src1q, [srcq+1*gprsize]
    mov      src2q, [srcq+2*gprsize]
    mov      src3q, [srcq+3*gprsize]
    mov      src4q, [srcq+4*gprsize]
    mov      src5q, [srcq+5*gprsize]
    mov       srcq, [srcq]
    sub      src1q, srcq
    sub      src2q, srcq
    sub      src3q, srcq
    sub      src4q, srcq
    sub      src5q, srcq
    movaps    xmm6, [pf_s16_scale]
.loop:
%if cpuflag(sse2)
    mulps       m0, m6, [srcq      ]
    mulps       m1, m6, [srcq+src1q]
    mulps       m2, m6, [srcq+src2q]
    mulps       m3, m6, [srcq+src3q]
    mulps       m4, m6, [srcq+src4q]
    mulps       m5, m6, [srcq+src5q]
    cvtps2dq    m0, m0
    cvtps2dq    m1, m1
    cvtps2dq    m2, m2
    cvtps2dq    m3, m3
    cvtps2dq    m4, m4
    cvtps2dq    m5, m5
    packssdw    m0, m3             ; m0 = 0, 6, 12, 18, 3, 9, 15, 21
    packssdw    m1, m4             ; m1 = 1, 7, 13, 19, 4, 10, 16, 22
    packssdw    m2, m5             ; m2 = 2, 8, 14, 20, 5, 11, 17, 23
    ; unpack words:
    movhlps     m3, m0             ; m3 = 3, 9, 15, 21, x, x, x, x
    punpcklwd   m0, m1             ; m0 = 0, 1, 6, 7, 12, 13, 18, 19
    punpckhwd   m1, m2             ; m1 = 4, 5, 10, 11, 16, 17, 22, 23
    punpcklwd   m2, m3             ; m2 = 2, 3, 8, 9, 14, 15, 20, 21
    ; blend dwords:
    shufps      m3, m0, m2, q2020  ; m3 = 0, 1, 12, 13, 2, 3, 14, 15
    shufps      m0, m1, q2031      ; m0 = 6, 7, 18, 19, 4, 5, 16, 17
    shufps      m2, m1, q3131      ; m2 = 8, 9, 20, 21, 10, 11, 22, 23
    ; shuffle dwords:
    shufps      m1, m2, m3, q3120  ; m1 = 8, 9, 10, 11, 12, 13, 14, 15
    shufps      m3, m0, q0220      ; m3 = 0, 1, 2, 3, 4, 5, 6, 7
    shufps      m0, m2, q3113      ; m0 = 16, 17, 18, 19, 20, 21, 22, 23
    mova  [dstq+0*mmsize], m3
    mova  [dstq+1*mmsize], m1
    mova  [dstq+2*mmsize], m0
%else ; sse
    movlps    xmm0, [srcq      ]
    movlps    xmm1, [srcq+src1q]
    movlps    xmm2, [srcq+src2q]
    movlps    xmm3, [srcq+src3q]
    movlps    xmm4, [srcq+src4q]
    movlps    xmm5, [srcq+src5q]
    mulps     xmm0, xmm6
    mulps     xmm1, xmm6
    mulps     xmm2, xmm6
    mulps     xmm3, xmm6
    mulps     xmm4, xmm6
    mulps     xmm5, xmm6
    cvtps2pi   mm0, xmm0
    cvtps2pi   mm1, xmm1
    cvtps2pi   mm2, xmm2
    cvtps2pi   mm3, xmm3
    cvtps2pi   mm4, xmm4
    cvtps2pi   mm5, xmm5
    packssdw   mm0, mm3            ; m0 = 0, 6, 3, 9
    packssdw   mm1, mm4            ; m1 = 1, 7, 4, 10
    packssdw   mm2, mm5            ; m2 = 2, 8, 5, 11
    ; unpack words
    pshufw     mm3, mm0, q1032     ; m3 = 3, 9, 0, 6
    punpcklwd  mm0, mm1            ; m0 = 0, 1, 6, 7
    punpckhwd  mm1, mm2            ; m1 = 4, 5, 10, 11
    punpcklwd  mm2, mm3            ; m2 = 2, 3, 8, 9
    ; unpack dwords
    pshufw     mm3, mm0, q1032     ; m3 = 6, 7, 0, 1
    punpckldq  mm0, mm2            ; m0 = 0, 1, 2, 3 (final)
    punpckhdq  mm2, mm1            ; m2 = 8, 9, 10, 11 (final)
    punpckldq  mm1, mm3            ; m1 = 4, 5, 6, 7 (final)
    mova  [dstq+0*mmsize], mm0
    mova  [dstq+1*mmsize], mm1
    mova  [dstq+2*mmsize], mm2
%endif
    add       srcq, mmsize
    add       dstq, mmsize*3
    sub       lend, mmsize/4
    jg .loop
%if mmsize == 8
    emms
    RET
%else
    REP_RET
%endif
%endmacro

INIT_MMX sse
CONV_FLTP_TO_S16_6CH
INIT_XMM sse2
CONV_FLTP_TO_S16_6CH
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
CONV_FLTP_TO_S16_6CH
%endif

;------------------------------------------------------------------------------
; void ff_conv_fltp_to_flt_2ch(float *dst, float *const *src, int len,
;                              int channels);
;------------------------------------------------------------------------------
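; Scalar C sketch (illustrative; a plain 2-channel interleave):
;     for (int i = 0; i < len; i++) {
;         dst[2*i  ] = src[0][i];
;         dst[2*i+1] = src[1][i];
;     }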
%macro CONV_FLTP_TO_FLT_2CH 0
cglobal conv_fltp_to_flt_2ch, 3,4,5, dst, src0, len, src1
    mov      src1q, [src0q+gprsize]
    mov      src0q, [src0q]
    lea       lenq, [4*lend]
    add      src0q, lenq
    add      src1q, lenq
    lea       dstq, [dstq+2*lenq]
    neg       lenq
.loop:
    mova        m0, [src0q+lenq       ]
    mova        m1, [src1q+lenq       ]
    mova        m2, [src0q+lenq+mmsize]
    mova        m3, [src1q+lenq+mmsize]
    SBUTTERFLYPS 0, 1, 4
    SBUTTERFLYPS 2, 3, 4
    mova  [dstq+2*lenq+0*mmsize], m0
    mova  [dstq+2*lenq+1*mmsize], m1
    mova  [dstq+2*lenq+2*mmsize], m2
    mova  [dstq+2*lenq+3*mmsize], m3
    add       lenq, 2*mmsize
    jl .loop
    REP_RET
%endmacro

INIT_XMM sse
CONV_FLTP_TO_FLT_2CH
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
CONV_FLTP_TO_FLT_2CH
%endif

;------------------------------------------------------------------------------
; void ff_conv_fltp_to_flt_6ch(float *dst, float *const *src, int len,
;                              int channels);
;------------------------------------------------------------------------------
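; Scalar C sketch (illustrative):
;     for (int i = 0; i < len; i++)
;         for (int ch = 0; ch < 6; ch++)
;             dst[6*i+ch] = src[ch][i];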
%macro CONV_FLTP_TO_FLT_6CH 0
cglobal conv_fltp_to_flt_6ch, 2,8,7, dst, src, src1, src2, src3, src4, src5, len
%if ARCH_X86_64
    mov       lend, r2d
%else
%define lend dword r2m
%endif
    mov      src1q, [srcq+1*gprsize]
    mov      src2q, [srcq+2*gprsize]
    mov      src3q, [srcq+3*gprsize]
    mov      src4q, [srcq+4*gprsize]
    mov      src5q, [srcq+5*gprsize]
    mov       srcq, [srcq]
    sub      src1q, srcq
    sub      src2q, srcq
    sub      src3q, srcq
    sub      src4q, srcq
    sub      src5q, srcq
.loop:
    mova        m0, [srcq      ]
    mova        m1, [srcq+src1q]
    mova        m2, [srcq+src2q]
    mova        m3, [srcq+src3q]
    mova        m4, [srcq+src4q]
    mova        m5, [srcq+src5q]
%if cpuflag(sse4)
    SBUTTERFLYPS 0, 1, 6
    SBUTTERFLYPS 2, 3, 6
    SBUTTERFLYPS 4, 5, 6
    blendps     m6, m4, m0, 1100b
    movlhps     m0, m2
    movhlps     m4, m2
    blendps     m2, m5, m1, 1100b
    movlhps     m1, m3
    movhlps     m5, m3
    movaps  [dstq   ], m0
    movaps  [dstq+16], m6
    movaps  [dstq+32], m4
    movaps  [dstq+48], m1
    movaps  [dstq+64], m2
    movaps  [dstq+80], m5
%else ; mmx
    SBUTTERFLY dq, 0, 1, 6
    SBUTTERFLY dq, 2, 3, 6
    SBUTTERFLY dq, 4, 5, 6
    movq  [dstq   ], m0
    movq  [dstq+ 8], m2
    movq  [dstq+16], m4
    movq  [dstq+24], m1
    movq  [dstq+32], m3
    movq  [dstq+40], m5
%endif
    add       srcq, mmsize
    add       dstq, mmsize*6
    sub       lend, mmsize/4
    jg .loop
%if mmsize == 8
    emms
    RET
%else
    REP_RET
%endif
%endmacro

INIT_MMX mmx
CONV_FLTP_TO_FLT_6CH
INIT_XMM sse4
CONV_FLTP_TO_FLT_6CH
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
CONV_FLTP_TO_FLT_6CH
%endif

;------------------------------------------------------------------------------
; void ff_conv_s16_to_s16p_2ch(int16_t *const *dst, int16_t *src, int len,
;                              int channels);
;------------------------------------------------------------------------------
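; Scalar C sketch (illustrative; interleaved to planar):
;     for (int i = 0; i < len; i++) {
;         dst[0][i] = src[2*i  ];
;         dst[1][i] = src[2*i+1];
;     }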
%macro CONV_S16_TO_S16P_2CH 0
cglobal conv_s16_to_s16p_2ch, 3,4,4, dst0, src, len, dst1
    lea       lenq, [2*lend]
    mov      dst1q, [dst0q+gprsize]
    mov      dst0q, [dst0q        ]
    lea       srcq, [srcq+2*lenq]
    add      dst0q, lenq
    add      dst1q, lenq
    neg       lenq
%if cpuflag(ssse3)
    mova        m3, [pb_deinterleave_words]
%endif
.loop:
    mova        m0, [srcq+2*lenq       ]  ; m0 = 0, 1, 2, 3, 4, 5, 6, 7
    mova        m1, [srcq+2*lenq+mmsize]  ; m1 = 8, 9, 10, 11, 12, 13, 14, 15
%if cpuflag(ssse3)
    pshufb      m0, m3             ; m0 = 0, 2, 4, 6, 1, 3, 5, 7
    pshufb      m1, m3             ; m1 = 8, 10, 12, 14, 9, 11, 13, 15
    SBUTTERFLY2 qdq, 0, 1, 2       ; m0 = 0, 2, 4, 6, 8, 10, 12, 14
                                   ; m1 = 1, 3, 5, 7, 9, 11, 13, 15
%else ; sse2
    pshuflw     m0, m0, q3120      ; m0 = 0, 2, 1, 3, 4, 5, 6, 7
    pshufhw     m0, m0, q3120      ; m0 = 0, 2, 1, 3, 4, 6, 5, 7
    pshuflw     m1, m1, q3120      ; m1 = 8, 10, 9, 11, 12, 13, 14, 15
    pshufhw     m1, m1, q3120      ; m1 = 8, 10, 9, 11, 12, 14, 13, 15
    DEINT2_PS 0, 1, 2              ; m0 = 0, 2, 4, 6, 8, 10, 12, 14
                                   ; m1 = 1, 3, 5, 7, 9, 11, 13, 15
%endif
    mova  [dst0q+lenq], m0
    mova  [dst1q+lenq], m1
    add       lenq, mmsize
    jl .loop
    REP_RET
%endmacro

INIT_XMM sse2
CONV_S16_TO_S16P_2CH
INIT_XMM ssse3
CONV_S16_TO_S16P_2CH
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
CONV_S16_TO_S16P_2CH
%endif

;------------------------------------------------------------------------------
; void ff_conv_s16_to_s16p_6ch(int16_t *const *dst, int16_t *src, int len,
;                              int channels);
;------------------------------------------------------------------------------
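; Scalar C sketch (illustrative):
;     for (int i = 0; i < len; i++)
;         for (int ch = 0; ch < 6; ch++)
;             dst[ch][i] = src[6*i+ch];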
%macro CONV_S16_TO_S16P_6CH 0
%if ARCH_X86_64
cglobal conv_s16_to_s16p_6ch, 3,8,5, dst, src, len, dst1, dst2, dst3, dst4, dst5
%else
cglobal conv_s16_to_s16p_6ch, 2,7,5, dst, src, dst1, dst2, dst3, dst4, dst5
%define lend dword r2m
%endif
    mov      dst1q, [dstq+  gprsize]
    mov      dst2q, [dstq+2*gprsize]
    mov      dst3q, [dstq+3*gprsize]
    mov      dst4q, [dstq+4*gprsize]
    mov      dst5q, [dstq+5*gprsize]
    mov       dstq, [dstq          ]
    sub      dst1q, dstq
    sub      dst2q, dstq
    sub      dst3q, dstq
    sub      dst4q, dstq
    sub      dst5q, dstq
.loop:
    mova        m0, [srcq+0*mmsize]      ; m0 = 0, 1, 2, 3, 4, 5, 6, 7
    mova        m3, [srcq+1*mmsize]      ; m3 = 8, 9, 10, 11, 12, 13, 14, 15
    mova        m2, [srcq+2*mmsize]      ; m2 = 16, 17, 18, 19, 20, 21, 22, 23
    PALIGNR     m1, m3, m0, 12, m4       ; m1 = 6, 7, 8, 9, 10, 11, x, x
    shufps      m3, m2, q1032            ; m3 = 12, 13, 14, 15, 16, 17, 18, 19
    psrldq      m2, 4                    ; m2 = 18, 19, 20, 21, 22, 23, x, x
    SBUTTERFLY2 wd, 0, 1, 4              ; m0 = 0, 6, 1, 7, 2, 8, 3, 9
                                         ; m1 = 4, 10, 5, 11, x, x, x, x
    SBUTTERFLY2 wd, 3, 2, 4              ; m3 = 12, 18, 13, 19, 14, 20, 15, 21
                                         ; m2 = 16, 22, 17, 23, x, x, x, x
    SBUTTERFLY2 dq, 0, 3, 4              ; m0 = 0, 6, 12, 18, 1, 7, 13, 19
                                         ; m3 = 2, 8, 14, 20, 3, 9, 15, 21
    punpckldq   m1, m2                   ; m1 = 4, 10, 16, 22, 5, 11, 17, 23
    movq    [dstq      ], m0
    movhps  [dstq+dst1q], m0
    movq    [dstq+dst2q], m3
    movhps  [dstq+dst3q], m3
    movq    [dstq+dst4q], m1
    movhps  [dstq+dst5q], m1
    add       srcq, mmsize*3
    add       dstq, mmsize/2
    sub       lend, mmsize/4
    jg .loop
    REP_RET
%endmacro

INIT_XMM sse2
CONV_S16_TO_S16P_6CH
INIT_XMM ssse3
CONV_S16_TO_S16P_6CH
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
CONV_S16_TO_S16P_6CH
%endif

;------------------------------------------------------------------------------
; void ff_conv_s16_to_fltp_2ch(float *const *dst, int16_t *src, int len,
;                              int channels);
;------------------------------------------------------------------------------
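; Scalar C sketch (illustrative; each word is isolated in the high half of its
; dword and scaled by 2^-31, i.e. divided by 32768):
;     for (int i = 0; i < len; i++) {
;         dst[0][i] = src[2*i  ] / 32768.0f;
;         dst[1][i] = src[2*i+1] / 32768.0f;
;     }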
%macro CONV_S16_TO_FLTP_2CH 0
cglobal conv_s16_to_fltp_2ch, 3,4,5, dst0, src, len, dst1
    lea       lenq, [4*lend]
    mov      dst1q, [dst0q+gprsize]
    mov      dst0q, [dst0q        ]
    add       srcq, lenq
    add      dst0q, lenq
    add      dst1q, lenq
    neg       lenq
    mova        m3, [pf_s32_inv_scale]
    mova        m4, [pw_zero_even]
.loop:
    mova        m1, [srcq+lenq]
    pslld       m0, m1, 16
    pand        m1, m4
    cvtdq2ps    m0, m0
    cvtdq2ps    m1, m1
    mulps       m0, m0, m3
    mulps       m1, m1, m3
    mova  [dst0q+lenq], m0
    mova  [dst1q+lenq], m1
    add       lenq, mmsize
    jl .loop
    REP_RET
%endmacro

INIT_XMM sse2
CONV_S16_TO_FLTP_2CH
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
CONV_S16_TO_FLTP_2CH
%endif

;------------------------------------------------------------------------------
; void ff_conv_s16_to_fltp_6ch(float *const *dst, int16_t *src, int len,
;                              int channels);
;------------------------------------------------------------------------------
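; Scalar C sketch (illustrative; pf_s16_inv_scale is 2^-15):
;     for (int i = 0; i < len; i++)
;         for (int ch = 0; ch < 6; ch++)
;             dst[ch][i] = src[6*i+ch] / 32768.0f;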
%macro CONV_S16_TO_FLTP_6CH 0
%if ARCH_X86_64
cglobal conv_s16_to_fltp_6ch, 3,8,7, dst, src, len, dst1, dst2, dst3, dst4, dst5
%else
cglobal conv_s16_to_fltp_6ch, 2,7,7, dst, src, dst1, dst2, dst3, dst4, dst5
%define lend dword r2m
%endif
    mov      dst1q, [dstq+  gprsize]
    mov      dst2q, [dstq+2*gprsize]
    mov      dst3q, [dstq+3*gprsize]
    mov      dst4q, [dstq+4*gprsize]
    mov      dst5q, [dstq+5*gprsize]
    mov       dstq, [dstq          ]
    sub      dst1q, dstq
    sub      dst2q, dstq
    sub      dst3q, dstq
    sub      dst4q, dstq
    sub      dst5q, dstq
    mova        m6, [pf_s16_inv_scale]
.loop:
    mova        m0, [srcq+0*mmsize]      ; m0 = 0, 1, 2, 3, 4, 5, 6, 7
    mova        m3, [srcq+1*mmsize]      ; m3 = 8, 9, 10, 11, 12, 13, 14, 15
    mova        m2, [srcq+2*mmsize]      ; m2 = 16, 17, 18, 19, 20, 21, 22, 23
    PALIGNR     m1, m3, m0, 12, m4       ; m1 = 6, 7, 8, 9, 10, 11, x, x
    shufps      m3, m2, q1032            ; m3 = 12, 13, 14, 15, 16, 17, 18, 19
    psrldq      m2, 4                    ; m2 = 18, 19, 20, 21, 22, 23, x, x
    SBUTTERFLY2 wd, 0, 1, 4              ; m0 = 0, 6, 1, 7, 2, 8, 3, 9
                                         ; m1 = 4, 10, 5, 11, x, x, x, x
    SBUTTERFLY2 wd, 3, 2, 4              ; m3 = 12, 18, 13, 19, 14, 20, 15, 21
                                         ; m2 = 16, 22, 17, 23, x, x, x, x
    SBUTTERFLY2 dq, 0, 3, 4              ; m0 = 0, 6, 12, 18, 1, 7, 13, 19
                                         ; m3 = 2, 8, 14, 20, 3, 9, 15, 21
    punpckldq   m1, m2                   ; m1 = 4, 10, 16, 22, 5, 11, 17, 23
    S16_TO_S32_SX 0, 2                   ; m0 = 0, 6, 12, 18
                                         ; m2 = 1, 7, 13, 19
    S16_TO_S32_SX 3, 4                   ; m3 = 2, 8, 14, 20
                                         ; m4 = 3, 9, 15, 21
    S16_TO_S32_SX 1, 5                   ; m1 = 4, 10, 16, 22
                                         ; m5 = 5, 11, 17, 23
    SWAP 1,2,3,4
    cvtdq2ps    m0, m0
    cvtdq2ps    m1, m1
    cvtdq2ps    m2, m2
    cvtdq2ps    m3, m3
    cvtdq2ps    m4, m4
    cvtdq2ps    m5, m5
    mulps       m0, m6
    mulps       m1, m6
    mulps       m2, m6
    mulps       m3, m6
    mulps       m4, m6
    mulps       m5, m6
    mova  [dstq      ], m0
    mova  [dstq+dst1q], m1
    mova  [dstq+dst2q], m2
    mova  [dstq+dst3q], m3
    mova  [dstq+dst4q], m4
    mova  [dstq+dst5q], m5
    add       srcq, mmsize*3
    add       dstq, mmsize
    sub       lend, mmsize/4
    jg .loop
    REP_RET
%endmacro

INIT_XMM sse2
CONV_S16_TO_FLTP_6CH
INIT_XMM ssse3
CONV_S16_TO_FLTP_6CH
INIT_XMM sse4
CONV_S16_TO_FLTP_6CH
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
CONV_S16_TO_FLTP_6CH
%endif

;------------------------------------------------------------------------------
; void ff_conv_flt_to_s16p_2ch(int16_t *const *dst, float *src, int len,
;                              int channels);
;------------------------------------------------------------------------------
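; Scalar C sketch (illustrative, with round-to-nearest and saturation):
;     for (int i = 0; i < len; i++) {
;         dst[0][i] = av_clip_int16(lrintf(src[2*i  ] * 32768.0f));
;         dst[1][i] = av_clip_int16(lrintf(src[2*i+1] * 32768.0f));
;     }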
%macro CONV_FLT_TO_S16P_2CH 0
cglobal conv_flt_to_s16p_2ch, 3,4,6, dst0, src, len, dst1
    lea       lenq, [2*lend]
    mov      dst1q, [dst0q+gprsize]
    mov      dst0q, [dst0q        ]
    lea       srcq, [srcq+4*lenq]
    add      dst0q, lenq
    add      dst1q, lenq
    neg       lenq
    mova        m5, [pf_s16_scale]
.loop:
    mova        m0, [srcq+4*lenq         ]
    mova        m1, [srcq+4*lenq+  mmsize]
    mova        m2, [srcq+4*lenq+2*mmsize]
    mova        m3, [srcq+4*lenq+3*mmsize]
    DEINT2_PS 0, 1, 4
    DEINT2_PS 2, 3, 4
    mulps       m0, m0, m5
    mulps       m1, m1, m5
    mulps       m2, m2, m5
    mulps       m3, m3, m5
    cvtps2dq    m0, m0
    cvtps2dq    m1, m1
    cvtps2dq    m2, m2
    cvtps2dq    m3, m3
    packssdw    m0, m2
    packssdw    m1, m3
    mova  [dst0q+lenq], m0
    mova  [dst1q+lenq], m1
    add       lenq, mmsize
    jl .loop
    REP_RET
%endmacro

INIT_XMM sse2
CONV_FLT_TO_S16P_2CH
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
CONV_FLT_TO_S16P_2CH
%endif

;------------------------------------------------------------------------------
; void ff_conv_flt_to_s16p_6ch(int16_t *const *dst, float *src, int len,
;                              int channels);
;------------------------------------------------------------------------------
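; Scalar C sketch (illustrative):
;     for (int i = 0; i < len; i++)
;         for (int ch = 0; ch < 6; ch++)
;             dst[ch][i] = av_clip_int16(lrintf(src[6*i+ch] * 32768.0f));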
%macro CONV_FLT_TO_S16P_6CH 0
%if ARCH_X86_64
cglobal conv_flt_to_s16p_6ch, 3,8,7, dst, src, len, dst1, dst2, dst3, dst4, dst5
%else
cglobal conv_flt_to_s16p_6ch, 2,7,7, dst, src, dst1, dst2, dst3, dst4, dst5
%define lend dword r2m
%endif
    mov      dst1q, [dstq+  gprsize]
    mov      dst2q, [dstq+2*gprsize]
    mov      dst3q, [dstq+3*gprsize]
    mov      dst4q, [dstq+4*gprsize]
    mov      dst5q, [dstq+5*gprsize]
    mov       dstq, [dstq          ]
    sub      dst1q, dstq
    sub      dst2q, dstq
    sub      dst3q, dstq
    sub      dst4q, dstq
    sub      dst5q, dstq
    mova        m6, [pf_s16_scale]
.loop:
    mulps       m0, m6, [srcq+0*mmsize]
    mulps       m3, m6, [srcq+1*mmsize]
    mulps       m1, m6, [srcq+2*mmsize]
    mulps       m4, m6, [srcq+3*mmsize]
    mulps       m2, m6, [srcq+4*mmsize]
    mulps       m5, m6, [srcq+5*mmsize]
    cvtps2dq    m0, m0
    cvtps2dq    m1, m1
    cvtps2dq    m2, m2
    cvtps2dq    m3, m3
    cvtps2dq    m4, m4
    cvtps2dq    m5, m5
    packssdw    m0, m3                   ; m0 = 0, 1, 2, 3, 4, 5, 6, 7
    packssdw    m1, m4                   ; m1 = 8, 9, 10, 11, 12, 13, 14, 15
    packssdw    m2, m5                   ; m2 = 16, 17, 18, 19, 20, 21, 22, 23
    PALIGNR     m3, m1, m0, 12, m4       ; m3 = 6, 7, 8, 9, 10, 11, x, x
    shufps      m1, m2, q1032            ; m1 = 12, 13, 14, 15, 16, 17, 18, 19
    psrldq      m2, 4                    ; m2 = 18, 19, 20, 21, 22, 23, x, x
    SBUTTERFLY2 wd, 0, 3, 4              ; m0 = 0, 6, 1, 7, 2, 8, 3, 9
                                         ; m3 = 4, 10, 5, 11, x, x, x, x
    SBUTTERFLY2 wd, 1, 2, 4              ; m1 = 12, 18, 13, 19, 14, 20, 15, 21
                                         ; m2 = 16, 22, 17, 23, x, x, x, x
    SBUTTERFLY2 dq, 0, 1, 4              ; m0 = 0, 6, 12, 18, 1, 7, 13, 19
                                         ; m1 = 2, 8, 14, 20, 3, 9, 15, 21
    punpckldq   m3, m2                   ; m3 = 4, 10, 16, 22, 5, 11, 17, 23
    movq    [dstq      ], m0
    movhps  [dstq+dst1q], m0
    movq    [dstq+dst2q], m1
    movhps  [dstq+dst3q], m1
    movq    [dstq+dst4q], m3
    movhps  [dstq+dst5q], m3
    add       srcq, mmsize*6
    add       dstq, mmsize/2
    sub       lend, mmsize/4
    jg .loop
    REP_RET
%endmacro

INIT_XMM sse2
CONV_FLT_TO_S16P_6CH
INIT_XMM ssse3
CONV_FLT_TO_S16P_6CH
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
CONV_FLT_TO_S16P_6CH
%endif

;------------------------------------------------------------------------------
; void ff_conv_flt_to_fltp_2ch(float *const *dst, float *src, int len,
;                              int channels);
;------------------------------------------------------------------------------
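; Scalar C sketch (illustrative; a plain 2-channel deinterleave):
;     for (int i = 0; i < len; i++) {
;         dst[0][i] = src[2*i  ];
;         dst[1][i] = src[2*i+1];
;     }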
%macro CONV_FLT_TO_FLTP_2CH 0
cglobal conv_flt_to_fltp_2ch, 3,4,3, dst0, src, len, dst1
    lea       lenq, [4*lend]
    mov      dst1q, [dst0q+gprsize]
    mov      dst0q, [dst0q        ]
    lea       srcq, [srcq+2*lenq]
    add      dst0q, lenq
    add      dst1q, lenq
    neg       lenq
.loop:
    mova        m0, [srcq+2*lenq       ]
    mova        m1, [srcq+2*lenq+mmsize]
    DEINT2_PS 0, 1, 2
    mova  [dst0q+lenq], m0
    mova  [dst1q+lenq], m1
    add       lenq, mmsize
    jl .loop
    REP_RET
%endmacro

INIT_XMM sse
CONV_FLT_TO_FLTP_2CH
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
CONV_FLT_TO_FLTP_2CH
%endif

;------------------------------------------------------------------------------
; void ff_conv_flt_to_fltp_6ch(float *const *dst, float *src, int len,
;                              int channels);
;------------------------------------------------------------------------------
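; Scalar C sketch (illustrative):
;     for (int i = 0; i < len; i++)
;         for (int ch = 0; ch < 6; ch++)
;             dst[ch][i] = src[6*i+ch];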
%macro CONV_FLT_TO_FLTP_6CH 0
%if ARCH_X86_64
cglobal conv_flt_to_fltp_6ch, 3,8,7, dst, src, len, dst1, dst2, dst3, dst4, dst5
%else
cglobal conv_flt_to_fltp_6ch, 2,7,7, dst, src, dst1, dst2, dst3, dst4, dst5
%define lend dword r2m
%endif
    mov      dst1q, [dstq+  gprsize]
    mov      dst2q, [dstq+2*gprsize]
    mov      dst3q, [dstq+3*gprsize]
    mov      dst4q, [dstq+4*gprsize]
    mov      dst5q, [dstq+5*gprsize]
    mov       dstq, [dstq          ]
    sub      dst1q, dstq
    sub      dst2q, dstq
    sub      dst3q, dstq
    sub      dst4q, dstq
    sub      dst5q, dstq
.loop:
    mova        m0, [srcq+0*mmsize]  ; m0 = 0, 1, 2, 3
    mova        m1, [srcq+1*mmsize]  ; m1 = 4, 5, 6, 7
    mova        m2, [srcq+2*mmsize]  ; m2 = 8, 9, 10, 11
    mova        m3, [srcq+3*mmsize]  ; m3 = 12, 13, 14, 15
    mova        m4, [srcq+4*mmsize]  ; m4 = 16, 17, 18, 19
    mova        m5, [srcq+5*mmsize]  ; m5 = 20, 21, 22, 23
    SBUTTERFLY2 dq, 0, 3, 6          ; m0 = 0, 12, 1, 13
                                     ; m3 = 2, 14, 3, 15
    SBUTTERFLY2 dq, 1, 4, 6          ; m1 = 4, 16, 5, 17
                                     ; m4 = 6, 18, 7, 19
    SBUTTERFLY2 dq, 2, 5, 6          ; m2 = 8, 20, 9, 21
                                     ; m5 = 10, 22, 11, 23
    SBUTTERFLY2 dq, 0, 4, 6          ; m0 = 0, 6, 12, 18
                                     ; m4 = 1, 7, 13, 19
    SBUTTERFLY2 dq, 3, 2, 6          ; m3 = 2, 8, 14, 20
                                     ; m2 = 3, 9, 15, 21
    SBUTTERFLY2 dq, 1, 5, 6          ; m1 = 4, 10, 16, 22
                                     ; m5 = 5, 11, 17, 23
    mova  [dstq      ], m0
    mova  [dstq+dst1q], m4
    mova  [dstq+dst2q], m3
    mova  [dstq+dst3q], m2
    mova  [dstq+dst4q], m1
    mova  [dstq+dst5q], m5
    add       srcq, mmsize*6
    add       dstq, mmsize
    sub       lend, mmsize/4
    jg .loop
    REP_RET
%endmacro

INIT_XMM sse2
CONV_FLT_TO_FLTP_6CH
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
CONV_FLT_TO_FLTP_6CH
%endif