audio_convert.asm 20 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730
  1. ;******************************************************************************
  2. ;* Copyright (c) 2012 Michael Niedermayer
  3. ;*
  4. ;* This file is part of FFmpeg.
  5. ;*
  6. ;* FFmpeg is free software; you can redistribute it and/or
  7. ;* modify it under the terms of the GNU Lesser General Public
  8. ;* License as published by the Free Software Foundation; either
  9. ;* version 2.1 of the License, or (at your option) any later version.
  10. ;*
  11. ;* FFmpeg is distributed in the hope that it will be useful,
  12. ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
  13. ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  14. ;* Lesser General Public License for more details.
  15. ;*
  16. ;* You should have received a copy of the GNU Lesser General Public
  17. ;* License along with FFmpeg; if not, write to the Free Software
  18. ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  19. ;******************************************************************************
  20. %include "libavutil/x86/x86util.asm"
  ; Read-only constants. Each float table is 8 dwords (32 bytes), so the same
  ; label can feed a full-width load whether mmsize is 16 or 32.
  SECTION_RODATA 32

  ; 2^-31: scale applied after cvtdq2ps when producing floats.
  flt2pm31:         times 8 dd 4.6566129e-10
  ; 2^31: scale applied before cvtps2dq when producing 32-bit integers.
  flt2p31:          times 8 dd 2147483648.0
  ; 2^15: scale applied before cvtps2dq when producing 16-bit integers.
  flt2p15:          times 8 dd 32768.0
  ; pshufb control: moves the even-numbered 16-bit words (bytes 0,1,4,5,...) to
  ; the low 8 bytes and the odd-numbered words to the high 8 bytes.
  word_unpack_shuf: db  0,  1,  4,  5,  8,  9, 12, 13,  2,  3,  6,  7, 10, 11, 14, 15
  26. SECTION .text
  ;------------------------------------------------------------------------------
  ; PACK_2CH to, from, a/u, log2_outsize, log2_insize, conv, init
  ;
  ; Emits pack_2ch_<from>_to_<to>_<a|u>: reads two separate source planes and
  ; writes them interleaved into one destination buffer, running the conversion
  ; macro on the data on the way.
  ;
  ;   %1  destination format name (only used to build symbol names)
  ;   %2  source format name      (only used to build symbol names)
  ;   %3  a or u: appended to "mov" for every load/store (mova / movu)
  ;   %4  log2 of the output sample size in bytes
  ;   %5  log2 of the input sample size in bytes
  ;   %6  conversion macro, invoked once per loop iteration
  ;   %7  setup macro, invoked once before the loop
  ;
  ; dst and src are pointers to arrays of plane pointers: dst[0], src[0] and
  ; src[1] are dereferenced on entry. len is scaled by the per-sample sizes.
  ;
  ; The "a" flavour only checks pointer alignment and then falls into the same
  ; loop; on any misaligned pointer it jumps to the *_u_int label, which exists
  ; only inside the "u" flavour. The "u" flavour must therefore be instantiated
  ; for the same instruction set.
  ;
  ; The loop body always executes at least once and advances by a fixed number
  ; of samples; len is not checked or rounded here.
  ;------------------------------------------------------------------------------
  %macro PACK_2CH 5-7
  cglobal pack_2ch_%2_to_%1_%3, 3, 4, 6, dst, src, len, src2
      ; Replace the pointer arrays by the actual plane pointers.
      mov         dstq, [dstq]
      mov        src2q, [srcq+gprsize]
      mov         srcq, [srcq]
  %ifidn %3, a
      ; Any misaligned pointer diverts to the unaligned implementation.
      test        dstq, mmsize-1
      jne         pack_2ch_%2_to_%1_u_int %+ SUFFIX
      test        srcq, mmsize-1
      jne         pack_2ch_%2_to_%1_u_int %+ SUFFIX
      test       src2q, mmsize-1
      jne         pack_2ch_%2_to_%1_u_int %+ SUFFIX
  %else
  pack_2ch_%2_to_%1_u_int %+ SUFFIX:
  %endif
      ; Point all three pointers at the end of their buffers and walk a
      ; negative index up to zero.
      lea         dstq, [dstq  + (2<<%4)*lenq]
      lea         srcq, [srcq  + (1<<%5)*lenq]
      lea        src2q, [src2q + (1<<%5)*lenq]
      neg         lenq
      %7 m0,m1,m2,m3,m4,m5
  .next:
  %if %4 >= %5
      ; Output samples are at least as wide as input: interleave, then convert.
      mov%3         m0, [srcq  + (1<<%5)*lenq]
      mova          m1, m0
      mov%3         m2, [src2q + (1<<%5)*lenq]
  %if %5 == 1
      punpcklwd     m0, m2
      punpckhwd     m1, m2
  %else
      punpckldq     m0, m2
      punpckhdq     m1, m2
  %endif
      %6 m0,m1,m2,m3,m4,m5
  %else
      ; Output samples are narrower: convert two registers per plane first,
      ; then interleave the resulting 16-bit words.
      mov%3         m0, [srcq  + (1<<%5)*lenq]
      mov%3         m1, [srcq  + (1<<%5)*lenq + mmsize]
      mov%3         m2, [src2q + (1<<%5)*lenq]
      mov%3         m3, [src2q + (1<<%5)*lenq + mmsize]
      %6 m0,m1,m2,m3,m4,m5
      mova          m2, m0
      punpcklwd     m0, m1
      punpckhwd     m2, m1
      SWAP 1,2
  %endif
      mov%3 [dstq + (2<<%4)*lenq           ], m0
      mov%3 [dstq + (2<<%4)*lenq +   mmsize], m1
  %if %4 > %5
      ; Widening conversions yield four result registers per iteration.
      mov%3 [dstq + (2<<%4)*lenq + 2*mmsize], m2
      mov%3 [dstq + (2<<%4)*lenq + 3*mmsize], m3
      add         lenq, 4*mmsize/(2<<%4)
  %else
      add         lenq, 2*mmsize/(2<<%4)
  %endif
      jl          .next
      RET
  %endmacro
  ;------------------------------------------------------------------------------
  ; UNPACK_2CH to, from, a/u, log2_outsize, log2_insize, conv, init
  ;
  ; Emits unpack_2ch_<from>_to_<to>_<a|u>: reads one interleaved two-channel
  ; source buffer and writes the two channels to separate destination planes,
  ; running the conversion macro on the de-interleaved data.
  ;
  ;   %1  destination format name (only used to build symbol names)
  ;   %2  source format name      (only used to build symbol names)
  ;   %3  a or u: appended to "mov" for every load/store (mova / movu)
  ;   %4  log2 of the output sample size in bytes
  ;   %5  log2 of the input sample size in bytes
  ;   %6  conversion macro, invoked once per loop iteration
  ;   %7  setup macro, invoked once before the loop
  ;
  ; dst and src are pointers to arrays of plane pointers: dst[0], dst[1] and
  ; src[0] are dereferenced on entry.
  ;
  ; The "a" flavour jumps to the *_u_int label on any misaligned pointer; that
  ; label is defined only by the "u" flavour, which must be instantiated for
  ; the same instruction set.
  ;
  ; m6 receives word_unpack_shuf in every variant, although only the SSSE3
  ; 16-bit input path reads it.
  ;------------------------------------------------------------------------------
  %macro UNPACK_2CH 5-7
  cglobal unpack_2ch_%2_to_%1_%3, 3, 4, 7, dst, src, len, dst2
      ; Replace the pointer arrays by the actual plane pointers.
      mov         srcq, [srcq]
      mov        dst2q, [dstq+gprsize]
      mov         dstq, [dstq]
  %ifidn %3, a
      test        dstq, mmsize-1
      jne         unpack_2ch_%2_to_%1_u_int %+ SUFFIX
      test        srcq, mmsize-1
      jne         unpack_2ch_%2_to_%1_u_int %+ SUFFIX
      test       dst2q, mmsize-1
      jne         unpack_2ch_%2_to_%1_u_int %+ SUFFIX
  %else
  unpack_2ch_%2_to_%1_u_int %+ SUFFIX:
  %endif
      ; End-relative addressing with a negative index that counts up to zero.
      lea         srcq, [srcq  + (2<<%5)*lenq]
      lea         dstq, [dstq  + (1<<%4)*lenq]
      lea        dst2q, [dst2q + (1<<%4)*lenq]
      neg         lenq
      %7 m0,m1,m2,m3,m4,m5
      mova          m6, [word_unpack_shuf]
  .next:
      mov%3         m0, [srcq + (2<<%5)*lenq         ]
      mov%3         m2, [srcq + (2<<%5)*lenq + mmsize]
  %if %5 == 1
  %ifidn SUFFIX, _ssse3
      ; SSSE3: one byte shuffle per register sorts the words, then the
      ; quadword halves are recombined.
      pshufb        m0, m6
      mova          m1, m0
      pshufb        m2, m6
      punpcklqdq    m0, m2
      punpckhqdq    m1, m2
  %else
      ; Without pshufb: three rounds of word interleaving.
      mova          m1, m0
      punpcklwd     m0, m2
      punpckhwd     m1, m2

      mova          m2, m0
      punpcklwd     m0, m1
      punpckhwd     m2, m1

      mova          m1, m0
      punpcklwd     m0, m2
      punpckhwd     m1, m2
  %endif
  %else
      ; 32-bit samples: pick dwords 0,2 of each register / dwords 1,3.
      mova          m1, m0
      shufps        m0, m2, 10001000b
      shufps        m1, m2, 11011101b
  %endif
  %if %4 < %5
      ; Narrowing conversions need twice as much input per iteration.
      mov%3         m2, [srcq + (2<<%5)*lenq + 2*mmsize]
      mova          m3, m2
      mov%3         m4, [srcq + (2<<%5)*lenq + 3*mmsize]
      shufps        m2, m4, 10001000b
      shufps        m3, m4, 11011101b
      SWAP 1,2
  %endif
      %6 m0,m1,m2,m3,m4,m5
      mov%3 [dstq  + (1<<%4)*lenq         ], m0
  %if %4 > %5
      ; Widening conversions yield two result registers per plane.
      mov%3 [dst2q + (1<<%4)*lenq         ], m2
      mov%3 [dstq  + (1<<%4)*lenq + mmsize], m1
      mov%3 [dst2q + (1<<%4)*lenq + mmsize], m3
      add         lenq, 2*mmsize/(1<<%4)
  %else
      mov%3 [dst2q + (1<<%4)*lenq         ], m1
      add         lenq, mmsize/(1<<%4)
  %endif
      jl          .next
      RET
  %endmacro
  ;------------------------------------------------------------------------------
  ; CONV to, from, a/u, log2_outsize, log2_insize, conv, init
  ;
  ; Emits <from>_to_<to>_<a|u>: converts one plane of samples from one format
  ; to another.
  ;
  ;   %1  destination format name (only used to build symbol names)
  ;   %2  source format name      (only used to build symbol names)
  ;   %3  a or u: appended to "mov" for every load/store (mova / movu)
  ;   %4  log2 of the output sample size in bytes
  ;   %5  log2 of the input sample size in bytes
  ;   %6  conversion macro, invoked once per loop iteration
  ;   %7  setup macro, invoked once before the loop
  ;
  ; dst and src are pointers to arrays of plane pointers; only element 0 of
  ; each is used.
  ;
  ; The "a" flavour jumps to the *_u_int label on a misaligned pointer; that
  ; label is defined only by the "u" flavour, which must be instantiated for
  ; the same instruction set.
  ;
  ; This macro is only instantiated with XMM/YMM registers in this file, so the
  ; former MMX-only "emms" epilogue (mmsize == 8) has been dropped as dead code.
  ;------------------------------------------------------------------------------
  %macro CONV 5-7
  cglobal %2_to_%1_%3, 3, 3, 6, dst, src, len
      mov         srcq, [srcq]
      mov         dstq, [dstq]
  %ifidn %3, a
      test        dstq, mmsize-1
      jne         %2_to_%1_u_int %+ SUFFIX
      test        srcq, mmsize-1
      jne         %2_to_%1_u_int %+ SUFFIX
  %else
  %2_to_%1_u_int %+ SUFFIX:
  %endif
      ; End-relative addressing with a negative index that counts up to zero.
      lea         srcq, [srcq + (1<<%5)*lenq]
      lea         dstq, [dstq + (1<<%4)*lenq]
      neg         lenq
      %7 m0,m1,m2,m3,m4,m5
  .next:
      mov%3         m0, [srcq + (1<<%5)*lenq           ]
      mov%3         m1, [srcq + (1<<%5)*lenq +   mmsize]
  %if %4 < %5
      ; Narrowing: four input registers collapse into two output registers.
      mov%3         m2, [srcq + (1<<%5)*lenq + 2*mmsize]
      mov%3         m3, [srcq + (1<<%5)*lenq + 3*mmsize]
  %endif
      %6 m0,m1,m2,m3,m4,m5
      mov%3 [dstq + (1<<%4)*lenq           ], m0
      mov%3 [dstq + (1<<%4)*lenq +   mmsize], m1
  %if %4 > %5
      ; Widening: two input registers expand into four output registers.
      mov%3 [dstq + (1<<%4)*lenq + 2*mmsize], m2
      mov%3 [dstq + (1<<%4)*lenq + 3*mmsize], m3
      add         lenq, 4*mmsize/(1<<%4)
  %else
      add         lenq, 2*mmsize/(1<<%4)
  %endif
      jl          .next
      RET
  %endmacro
  ;------------------------------------------------------------------------------
  ; PACK_6CH to, from, a/u, log2_outsize, log2_insize, nregs, conv, init
  ;
  ; Emits pack_6ch_<from>_to_<to>_<a|u>: reads mmsize bytes from each of six
  ; source planes per iteration and writes six registers of interleaved data
  ; to the single destination buffer.
  ;
  ;   %1  destination format name (only used to build symbol names)
  ;   %2  source format name      (only used to build symbol names)
  ;   %3  a or u: selects mova/movu loads and movaps/movups stores
  ;   %4  log2 output sample size (not referenced in the body)
  ;   %5  log2 input sample size  (not referenced in the body)
  ;   %6  number of vector registers declared to cglobal
  ;   %7  conversion macro, called as (reg, reg, x, x, const, scratch)
  ;   %8  setup macro; receives m7 as the constant register
  ;
  ; dst and src are pointers to arrays of plane pointers. The third argument
  ; (len) arrives in r2: on x86-64 it is copied into the register named len
  ; before r2 is reused as src1; on x86-32 it is used from its stack slot.
  ;
  ; The "a" flavour jumps to the *_u_int label on a misaligned pointer; that
  ; label is defined only by the "u" flavour, which must be instantiated for
  ; the same instruction set.
  ;
  ; Only SSE-or-later XMM instantiations exist in this file (the destination
  ; offsets below are hard-coded for 16-byte registers), so the former MMX
  ; fallback path and its "emms" epilogue have been dropped as dead code.
  ;------------------------------------------------------------------------------
  %macro PACK_6CH 8
  cglobal pack_6ch_%2_to_%1_%3, 2, 8, %6, dst, src, src1, src2, src3, src4, src5, len
  %if ARCH_X86_64
      mov         lend, r2d
  %else
      %define lend dword r2m
  %endif
      ; Fetch planes 1..5 before srcq itself is overwritten with plane 0.
      mov        src1q, [srcq+1*gprsize]
      mov        src2q, [srcq+2*gprsize]
      mov        src3q, [srcq+3*gprsize]
      mov        src4q, [srcq+4*gprsize]
      mov        src5q, [srcq+5*gprsize]
      mov         srcq, [srcq]
      mov         dstq, [dstq]
  %ifidn %3, a
      test        dstq, mmsize-1
      jne         pack_6ch_%2_to_%1_u_int %+ SUFFIX
      test        srcq, mmsize-1
      jne         pack_6ch_%2_to_%1_u_int %+ SUFFIX
      test       src1q, mmsize-1
      jne         pack_6ch_%2_to_%1_u_int %+ SUFFIX
      test       src2q, mmsize-1
      jne         pack_6ch_%2_to_%1_u_int %+ SUFFIX
      test       src3q, mmsize-1
      jne         pack_6ch_%2_to_%1_u_int %+ SUFFIX
      test       src4q, mmsize-1
      jne         pack_6ch_%2_to_%1_u_int %+ SUFFIX
      test       src5q, mmsize-1
      jne         pack_6ch_%2_to_%1_u_int %+ SUFFIX
  %else
  pack_6ch_%2_to_%1_u_int %+ SUFFIX:
  %endif
      ; Turn planes 1..5 into offsets from plane 0 so that advancing srcq
      ; alone moves all six read positions.
      sub        src1q, srcq
      sub        src2q, srcq
      sub        src3q, srcq
      sub        src4q, srcq
      sub        src5q, srcq
      %8 x,x,x,x,m7,x
  .loop:
      mov%3         m0, [srcq      ]
      mov%3         m1, [srcq+src1q]
      mov%3         m2, [srcq+src2q]
      mov%3         m3, [srcq+src3q]
      mov%3         m4, [srcq+src4q]
      mov%3         m5, [srcq+src5q]

      ; Interleave the six rows; m6 is the temporary.
      SBUTTERFLYPS 0, 1, 6
      SBUTTERFLYPS 2, 3, 6
      SBUTTERFLYPS 4, 5, 6

  %if cpuflag(avx)
      blendps       m6, m4, m0, 1100b
  %else
      movaps        m6, m4
      shufps        m4, m0, q3210
      SWAP 4,6
  %endif
      movlhps       m0, m2
      movhlps       m4, m2
      ; m2 is free from here on and is reused as a destination.
  %if cpuflag(avx)
      blendps       m2, m5, m1, 1100b
  %else
      movaps        m2, m5
      shufps        m5, m1, q3210
      SWAP 2,5
  %endif
      movlhps       m1, m3
      movhlps       m5, m3

      ; Convert in output order; m3 is free and serves as scratch.
      %7 m0,m6,x,x,m7,m3
      %7 m4,m1,x,x,m7,m3
      %7 m2,m5,x,x,m7,m3

      mov %+ %3 %+ ps [dstq   ], m0
      mov %+ %3 %+ ps [dstq+16], m6
      mov %+ %3 %+ ps [dstq+32], m4
      mov %+ %3 %+ ps [dstq+48], m1
      mov %+ %3 %+ ps [dstq+64], m2
      mov %+ %3 %+ ps [dstq+80], m5

      add         srcq, mmsize
      add         dstq, mmsize*6
      sub         lend, mmsize/4
      jg          .loop
      RET
  %endmacro
  ;------------------------------------------------------------------------------
  ; UNPACK_6CH to, from, a/u, log2_outsize, log2_insize, nregs, conv, init
  ;
  ; Emits unpack_6ch_<from>_to_<to>_<a|u>: reads six consecutive 16-byte
  ; registers from the single source buffer per iteration and writes one
  ; register to each of six destination planes.
  ;
  ;   %1  destination format name (only used to build symbol names)
  ;   %2  source format name      (only used to build symbol names)
  ;   %3  a or u: selects mova/movu loads and movaps/movups stores
  ;   %4  log2 output sample size (not referenced in the body)
  ;   %5  log2 input sample size  (not referenced in the body)
  ;   %6  number of vector registers declared to cglobal
  ;   %7  conversion macro, called as (reg, reg, x, x, const, scratch)
  ;   %8  setup macro; receives m7 as the constant register
  ;
  ; dst and src are pointers to arrays of plane pointers. The third argument
  ; (len) arrives in r2: on x86-64 it is copied into the register named len
  ; before r2 is reused as dst1; on x86-32 it is used from its stack slot.
  ;
  ; The "a" flavour jumps to the *_u_int label on a misaligned pointer; that
  ; label is defined only by the "u" flavour, which must be instantiated for
  ; the same instruction set.
  ;
  ; Source offsets are hard-coded in 16-byte steps, i.e. XMM registers.
  ;------------------------------------------------------------------------------
  %macro UNPACK_6CH 8
  cglobal unpack_6ch_%2_to_%1_%3, 2, 8, %6, dst, src, dst1, dst2, dst3, dst4, dst5, len
  %if ARCH_X86_64
      mov         lend, r2d
  %else
      %define lend dword r2m
  %endif
      ; Fetch planes 1..5 before dstq itself is overwritten with plane 0.
      mov        dst1q, [dstq+1*gprsize]
      mov        dst2q, [dstq+2*gprsize]
      mov        dst3q, [dstq+3*gprsize]
      mov        dst4q, [dstq+4*gprsize]
      mov        dst5q, [dstq+5*gprsize]
      mov         dstq, [dstq]
      mov         srcq, [srcq]
  %ifidn %3, a
      test        dstq, mmsize-1
      jne         unpack_6ch_%2_to_%1_u_int %+ SUFFIX
      test        srcq, mmsize-1
      jne         unpack_6ch_%2_to_%1_u_int %+ SUFFIX
      test       dst1q, mmsize-1
      jne         unpack_6ch_%2_to_%1_u_int %+ SUFFIX
      test       dst2q, mmsize-1
      jne         unpack_6ch_%2_to_%1_u_int %+ SUFFIX
      test       dst3q, mmsize-1
      jne         unpack_6ch_%2_to_%1_u_int %+ SUFFIX
      test       dst4q, mmsize-1
      jne         unpack_6ch_%2_to_%1_u_int %+ SUFFIX
      test       dst5q, mmsize-1
      jne         unpack_6ch_%2_to_%1_u_int %+ SUFFIX
  %else
  unpack_6ch_%2_to_%1_u_int %+ SUFFIX:
  %endif
      ; Turn planes 1..5 into offsets from plane 0 so that advancing dstq
      ; alone moves all six write positions.
      sub        dst1q, dstq
      sub        dst2q, dstq
      sub        dst3q, dstq
      sub        dst4q, dstq
      sub        dst5q, dstq
      %8 x,x,x,x,m7,x
  .loop:
      mov%3         m0, [srcq   ]
      mov%3         m1, [srcq+16]
      mov%3         m2, [srcq+32]
      mov%3         m3, [srcq+48]
      mov%3         m4, [srcq+64]
      mov%3         m5, [srcq+80]

      ; Two rounds of butterflies followed by register renaming sort the
      ; data by channel; m6 is the temporary.
      SBUTTERFLYPS 0, 3, 6
      SBUTTERFLYPS 1, 4, 6
      SBUTTERFLYPS 2, 5, 6
      SBUTTERFLYPS 0, 4, 6
      SBUTTERFLYPS 3, 2, 6
      SBUTTERFLYPS 1, 5, 6
      SWAP 1, 4
      SWAP 2, 3

      %7 m0,m1,x,x,m7,m6
      %7 m2,m3,x,x,m7,m6
      %7 m4,m5,x,x,m7,m6

      mov %+ %3 %+ ps [dstq      ], m0
      mov %+ %3 %+ ps [dstq+dst1q], m1
      mov %+ %3 %+ ps [dstq+dst2q], m2
      mov %+ %3 %+ ps [dstq+dst3q], m3
      mov %+ %3 %+ ps [dstq+dst4q], m4
      mov %+ %3 %+ ps [dstq+dst5q], m5

      add         srcq, mmsize*6
      add         dstq, mmsize
      sub         lend, mmsize/4
      jg          .loop
      RET
  %endmacro
  ; General-purpose registers requested by PACK_8CH: ten on x86-64; on x86-32
  ; six, or seven when HAVE_ALIGNED_STACK is set.
  %define PACK_8CH_GPRS (10 * ARCH_X86_64) + ((6 + HAVE_ALIGNED_STACK) * ARCH_X86_32)

  ;------------------------------------------------------------------------------
  ; PACK_8CH to, from, a/u, log2_outsize, log2_insize, nregs, conv, init
  ;
  ; Emits pack_8ch_<from>_to_<to>_<a|u>: reads one 16-byte register from each
  ; of eight source planes per iteration, transposes them and writes eight
  ; registers to the single destination buffer.
  ;
  ;   %1  destination format name (symbol names; also picks the x86-32 constant)
  ;   %2  source format name      (only used to build symbol names)
  ;   %3  a or u: appended to "mov" for every load/store (mova / movu)
  ;   %4  log2 output sample size (not referenced in the body)
  ;   %5  log2 input sample size  (not referenced in the body)
  ;   %6  number of vector registers declared to cglobal
  ;   %7  conversion macro, called as (reg, reg, x, x, const, scratch)
  ;   %8  setup macro (x86-64 only); receives m9 as the constant register
  ;
  ; x86-64: every pointer has its own register, m8 is the transpose temporary
  ; and m9 holds the conversion constant.
  ;
  ; x86-32: there are not enough registers. 48 bytes of stack are reserved.
  ; r0 is shared between dst, src1 and src7 (and src4 when HAVE_ALIGNED_STACK
  ; is 0); the values not currently live in r0 are kept in dstm and in the
  ; stack slots [rsp+32], [rsp+36] and [rsp+40]. [rsp] and [rsp+16] are the
  ; transpose temporaries, and m9 is defined as a memory operand instead of a
  ; register. Statement order in this macro is therefore significant.
  ;
  ; The "a" flavour jumps to the *_u_int label on a misaligned pointer; that
  ; label is defined only by the "u" flavour, which must be instantiated for
  ; the same instruction set.
  ;------------------------------------------------------------------------------
  %macro PACK_8CH 8
  cglobal pack_8ch_%2_to_%1_%3, 2, PACK_8CH_GPRS, %6, ARCH_X86_32*48, dst, src, len, src1, src2, src3, src4, src5, src6, src7
      mov         dstq, [dstq]
  %if ARCH_X86_32
      DEFINE_ARGS dst, src, src2, src3, src4, src5, src6
      %define lend dword r2m
      %define src1q r0q
      %define src1m dword [rsp+32]
  %if HAVE_ALIGNED_STACK == 0
      ; One register fewer: src4 also has to share r0.
      DEFINE_ARGS dst, src, src2, src3, src5, src6
      %define src4q r0q
      %define src4m dword [rsp+36]
  %endif
      %define src7q r0q
      %define src7m dword [rsp+40]
      ; Park the destination pointer in memory; r0 is about to be reused.
      mov         dstm, dstq
  %endif
      mov        src7q, [srcq+7*gprsize]
      mov        src6q, [srcq+6*gprsize]
  %if ARCH_X86_32
      mov        src7m, src7q
  %endif
      mov        src5q, [srcq+5*gprsize]
      mov        src4q, [srcq+4*gprsize]
      mov        src3q, [srcq+3*gprsize]
  %if ARCH_X86_32 && HAVE_ALIGNED_STACK == 0
      mov        src4m, src4q
  %endif
      mov        src2q, [srcq+2*gprsize]
      mov        src1q, [srcq+1*gprsize]
      mov         srcq, [srcq]
  %ifidn %3, a
  %if ARCH_X86_32
      test       dstmp, mmsize-1
  %else
      test        dstq, mmsize-1
  %endif
      jne         pack_8ch_%2_to_%1_u_int %+ SUFFIX
      test        srcq, mmsize-1
      jne         pack_8ch_%2_to_%1_u_int %+ SUFFIX
      test       src1q, mmsize-1
      jne         pack_8ch_%2_to_%1_u_int %+ SUFFIX
      test       src2q, mmsize-1
      jne         pack_8ch_%2_to_%1_u_int %+ SUFFIX
      test       src3q, mmsize-1
      jne         pack_8ch_%2_to_%1_u_int %+ SUFFIX
  %if ARCH_X86_32 && HAVE_ALIGNED_STACK == 0
      test       src4m, mmsize-1
  %else
      test       src4q, mmsize-1
  %endif
      jne         pack_8ch_%2_to_%1_u_int %+ SUFFIX
      test       src5q, mmsize-1
      jne         pack_8ch_%2_to_%1_u_int %+ SUFFIX
      test       src6q, mmsize-1
      jne         pack_8ch_%2_to_%1_u_int %+ SUFFIX
  %if ARCH_X86_32
      test       src7m, mmsize-1
  %else
      test       src7q, mmsize-1
  %endif
      jne         pack_8ch_%2_to_%1_u_int %+ SUFFIX
  %else
  pack_8ch_%2_to_%1_u_int %+ SUFFIX:
  %endif
      ; Turn planes 1..7 into offsets from plane 0 (in place, whether the
      ; value currently lives in a register or in its stack slot).
      sub        src1q, srcq
      sub        src2q, srcq
      sub        src3q, srcq
  %if ARCH_X86_64 || HAVE_ALIGNED_STACK
      sub        src4q, srcq
  %else
      sub        src4m, srcq
  %endif
      sub        src5q, srcq
      sub        src6q, srcq
  %if ARCH_X86_64
      sub        src7q, srcq
  %else
      ; r0 still holds the src1 offset here; save it before the loop reuses r0.
      mov        src1m, src1q
      sub        src7m, srcq
  %endif
  %if ARCH_X86_64
      %8 x,x,x,x,m9,x
  %elifidn %1, int32
      %define m9 [flt2p31]
  %else
      %define m9 [flt2pm31]
  %endif
  .loop:
      mov%3         m0, [srcq      ]
      mov%3         m1, [srcq+src1q]
      mov%3         m2, [srcq+src2q]
  %if ARCH_X86_32 && HAVE_ALIGNED_STACK == 0
      mov        src4q, src4m
  %endif
      mov%3         m3, [srcq+src3q]
      mov%3         m4, [srcq+src4q]
      mov%3         m5, [srcq+src5q]
  %if ARCH_X86_32
      mov        src7q, src7m
  %endif
      mov%3         m6, [srcq+src6q]
      mov%3         m7, [srcq+src7q]

  %if ARCH_X86_64
      TRANSPOSE8x4D 0, 1, 2, 3, 4, 5, 6, 7, 8

      %7 m0,m1,x,x,m9,m8
      %7 m2,m3,x,x,m9,m8
      %7 m4,m5,x,x,m9,m8
      %7 m6,m7,x,x,m9,m8

      mov%3 [dstq], m0
  %else
      ; All source loads are done; bring the destination pointer back into r0.
      mov         dstq, dstm

      TRANSPOSE8x4D 0, 1, 2, 3, 4, 5, 6, 7, [rsp], [rsp+16], 1

      ; m2 serves as scratch for the first pair and is then reloaded from
      ; [rsp]; m0 becomes the scratch register once it has been stored.
      %7 m0,m1,x,x,m9,m2
      mova          m2, [rsp]
      mov%3     [dstq], m0
      %7 m2,m3,x,x,m9,m0
      %7 m4,m5,x,x,m9,m0
      %7 m6,m7,x,x,m9,m0
  %endif

      mov%3 [dstq+ 16], m1
      mov%3 [dstq+ 32], m2
      mov%3 [dstq+ 48], m3
      mov%3 [dstq+ 64], m4
      mov%3 [dstq+ 80], m5
      mov%3 [dstq+ 96], m6
      mov%3 [dstq+112], m7

      add         srcq, mmsize
      add         dstq, mmsize*8
  %if ARCH_X86_32
      ; Save the advanced destination and restore the src1 offset into r0
      ; for the next iteration.
      mov         dstm, dstq
      mov        src1q, src1m
  %endif
      sub         lend, mmsize/4
      jg          .loop
      RET
  %endmacro
  ; INT16_TO_INT32_N: widens the 16-bit words in m0 and m1 to 32-bit dwords in
  ; m0..m3. Each word is interleaved above a zero word, so it ends up in the
  ; upper half of its dword (value << 16). The six macro arguments are accepted
  ; for signature compatibility but not used; m4 is clobbered.
  ;   in : m0, m1
  ;   out: m0, m1 (from the original m0), m2, m3 (from the original m1)
  %macro INT16_TO_INT32_N 6
      ; Expand m1 first, while it still holds input.
      pxor          m2, m2
      punpcklwd     m2, m1
      pxor          m3, m3
      punpckhwd     m3, m1
      ; Rename so the original m0 data is now called m4, then expand it.
      SWAP 4,0
      pxor          m0, m0
      punpcklwd     m0, m4
      pxor          m1, m1
      punpckhwd     m1, m4
  %endmacro
  ; INT32_TO_INT16_N: narrows the 32-bit dwords in m0..m3 to 16-bit words by
  ; keeping the upper half of each dword (arithmetic shift right by 16) and
  ; packing pairs of registers. The six macro arguments are not used.
  ;   in : m0, m1, m2, m3
  ;   out: m0 (from m0:m1), m1 (from m2:m3, via register renaming)
  %macro INT32_TO_INT16_N 6
      psrad         m0, 16
      psrad         m1, 16
      packssdw      m0, m1
      psrad         m2, 16
      psrad         m3, 16
      packssdw      m2, m3
      SWAP 1,2
  %endmacro
  ; INT32_TO_FLOAT_INIT: loads the 2^-31 scale factor into the register passed
  ; as the fifth argument; the other arguments are ignored.
  %macro INT32_TO_FLOAT_INIT 6
      mova          %5, [flt2pm31]
  %endmacro
  ; INT32_TO_FLOAT_N: converts the packed 32-bit integers in %1 and %2 to
  ; floats in place and multiplies them by the constant in %5 (set up by
  ; INT32_TO_FLOAT_INIT). %3, %4 and %6 are not used.
  %macro INT32_TO_FLOAT_N 6
      cvtdq2ps      %1, %1
      mulps         %1, %1, %5
      cvtdq2ps      %2, %2
      mulps         %2, %2, %5
  %endmacro
  ; FLOAT_TO_INT32_INIT: loads the 2^31 scale factor into the register passed
  ; as the fifth argument; the other arguments are ignored.
  %macro FLOAT_TO_INT32_INIT 6
      mova          %5, [flt2p31]
  %endmacro
  ; FLOAT_TO_INT32_N: scales the floats in %1 and %2 by the constant in %5 and
  ; converts them to packed 32-bit integers in place. %6 is a scratch register;
  ; %3 and %4 are not used.
  ;
  ; After scaling, cmpps with predicate 5 (not-less-than) yields an all-ones
  ; dword (-1) in every lane whose value is not below the constant in %5.
  ; Adding that mask to the cvtps2dq result lowers those lanes by one, which
  ; turns the 0x80000000 that cvtps2dq returns for out-of-range input into
  ; 0x7fffffff.
  %macro FLOAT_TO_INT32_N 6
      mulps         %1, %5
      cvtps2dq      %6, %1
      cmpps         %1, %1, %5, 5
      paddd         %1, %6

      mulps         %2, %5
      cvtps2dq      %6, %2
      cmpps         %2, %2, %5, 5
      paddd         %2, %6
  %endmacro
  ; INT16_TO_FLOAT_INIT: loads the 2^-31 scale factor into m5. The register is
  ; fixed; all six macro arguments are ignored.
  %macro INT16_TO_FLOAT_INIT 6
      mova          m5, [flt2pm31]
  %endmacro
  ; INT16_TO_FLOAT_N: converts the 16-bit words in m0 and m1 to floats in
  ; m0..m3. The words are first widened with INT16_TO_INT32_N (which leaves
  ; each value shifted left by 16), then converted and multiplied by the
  ; constant in m5 (set up by INT16_TO_FLOAT_INIT). Registers are fixed; the
  ; macro arguments are only forwarded to INT16_TO_INT32_N.
  %macro INT16_TO_FLOAT_N 6
      INT16_TO_INT32_N %1,%2,%3,%4,%5,%6
      cvtdq2ps      m0, m0
      mulps         m0, m0, m5
      cvtdq2ps      m1, m1
      mulps         m1, m1, m5
      cvtdq2ps      m2, m2
      mulps         m2, m2, m5
      cvtdq2ps      m3, m3
      mulps         m3, m3, m5
  %endmacro
  ; FLOAT_TO_INT16_INIT: loads the 2^15 scale factor into m5. The register is
  ; fixed; all six macro arguments are ignored.
  %macro FLOAT_TO_INT16_INIT 6
      mova          m5, [flt2p15]
  %endmacro
  ; FLOAT_TO_INT16_N: scales the floats in m0..m3 by the constant in m5 (set up
  ; by FLOAT_TO_INT16_INIT), converts them to 32-bit integers and packs them to
  ; 16-bit words with signed saturation. Registers are fixed; the six macro
  ; arguments are ignored.
  ;   in : m0, m1, m2, m3
  ;   out: m0 (from m0:m1), m1 (from m2:m3); m3 is clobbered
  %macro FLOAT_TO_INT16_N 6
      mulps         m0, m5
      cvtps2dq      m0, m0
      mulps         m1, m5
      cvtps2dq      m1, m1
      packssdw      m0, m1
      ; m1 is free again and receives the second half of the result.
      mulps         m2, m5
      cvtps2dq      m1, m2
      mulps         m3, m5
      cvtps2dq      m3, m3
      packssdw      m1, m3
  %endmacro
  ; NOP_N: expands to nothing. Passed as the conversion and/or setup macro when
  ; a variant needs no sample conversion; accepts zero to six arguments.
  %macro NOP_N 0-6
  %endmacro
  ;------------------------------------------------------------------------------
  ; SSE: six-channel float pack/unpack without sample conversion.
  ; Each "u" line precedes its "a" twin because the "a" function jumps to a
  ; label that only the "u" function defines.
  ;------------------------------------------------------------------------------
  INIT_XMM sse
  PACK_6CH   float, float, u, 2, 2, 7, NOP_N, NOP_N
  PACK_6CH   float, float, a, 2, 2, 7, NOP_N, NOP_N
  UNPACK_6CH float, float, u, 2, 2, 7, NOP_N, NOP_N
  UNPACK_6CH float, float, a, 2, 2, 7, NOP_N, NOP_N
  ;------------------------------------------------------------------------------
  ; SSE2 instantiations.
  ; Each "u" line precedes its "a" twin because the "a" function jumps to a
  ; label that only the "u" function defines.
  ;------------------------------------------------------------------------------
  INIT_XMM sse2

  ; Integer <-> integer, single plane.
  CONV       int32, int16, u, 2, 1, INT16_TO_INT32_N, NOP_N
  CONV       int32, int16, a, 2, 1, INT16_TO_INT32_N, NOP_N
  CONV       int16, int32, u, 1, 2, INT32_TO_INT16_N, NOP_N
  CONV       int16, int32, a, 1, 2, INT32_TO_INT16_N, NOP_N

  ; Integer <-> integer, two channels planar -> packed.
  PACK_2CH   int16, int16, u, 1, 1, NOP_N, NOP_N
  PACK_2CH   int16, int16, a, 1, 1, NOP_N, NOP_N
  PACK_2CH   int32, int32, u, 2, 2, NOP_N, NOP_N
  PACK_2CH   int32, int32, a, 2, 2, NOP_N, NOP_N
  PACK_2CH   int32, int16, u, 2, 1, INT16_TO_INT32_N, NOP_N
  PACK_2CH   int32, int16, a, 2, 1, INT16_TO_INT32_N, NOP_N
  PACK_2CH   int16, int32, u, 1, 2, INT32_TO_INT16_N, NOP_N
  PACK_2CH   int16, int32, a, 1, 2, INT32_TO_INT16_N, NOP_N

  ; Integer <-> integer, two channels packed -> planar.
  UNPACK_2CH int16, int16, u, 1, 1, NOP_N, NOP_N
  UNPACK_2CH int16, int16, a, 1, 1, NOP_N, NOP_N
  UNPACK_2CH int32, int32, u, 2, 2, NOP_N, NOP_N
  UNPACK_2CH int32, int32, a, 2, 2, NOP_N, NOP_N
  UNPACK_2CH int32, int16, u, 2, 1, INT16_TO_INT32_N, NOP_N
  UNPACK_2CH int32, int16, a, 2, 1, INT16_TO_INT32_N, NOP_N
  UNPACK_2CH int16, int32, u, 1, 2, INT32_TO_INT16_N, NOP_N
  UNPACK_2CH int16, int32, a, 1, 2, INT32_TO_INT16_N, NOP_N

  ; Float <-> integer, single plane.
  CONV       float, int32, u, 2, 2, INT32_TO_FLOAT_N, INT32_TO_FLOAT_INIT
  CONV       float, int32, a, 2, 2, INT32_TO_FLOAT_N, INT32_TO_FLOAT_INIT
  CONV       int32, float, u, 2, 2, FLOAT_TO_INT32_N, FLOAT_TO_INT32_INIT
  CONV       int32, float, a, 2, 2, FLOAT_TO_INT32_N, FLOAT_TO_INT32_INIT
  CONV       float, int16, u, 2, 1, INT16_TO_FLOAT_N, INT16_TO_FLOAT_INIT
  CONV       float, int16, a, 2, 1, INT16_TO_FLOAT_N, INT16_TO_FLOAT_INIT
  CONV       int16, float, u, 1, 2, FLOAT_TO_INT16_N, FLOAT_TO_INT16_INIT
  CONV       int16, float, a, 1, 2, FLOAT_TO_INT16_N, FLOAT_TO_INT16_INIT

  ; Float <-> integer, two channels planar -> packed.
  PACK_2CH   float, int32, u, 2, 2, INT32_TO_FLOAT_N, INT32_TO_FLOAT_INIT
  PACK_2CH   float, int32, a, 2, 2, INT32_TO_FLOAT_N, INT32_TO_FLOAT_INIT
  PACK_2CH   int32, float, u, 2, 2, FLOAT_TO_INT32_N, FLOAT_TO_INT32_INIT
  PACK_2CH   int32, float, a, 2, 2, FLOAT_TO_INT32_N, FLOAT_TO_INT32_INIT
  PACK_2CH   float, int16, u, 2, 1, INT16_TO_FLOAT_N, INT16_TO_FLOAT_INIT
  PACK_2CH   float, int16, a, 2, 1, INT16_TO_FLOAT_N, INT16_TO_FLOAT_INIT
  PACK_2CH   int16, float, u, 1, 2, FLOAT_TO_INT16_N, FLOAT_TO_INT16_INIT
  PACK_2CH   int16, float, a, 1, 2, FLOAT_TO_INT16_N, FLOAT_TO_INT16_INIT

  ; Float <-> integer, two channels packed -> planar.
  UNPACK_2CH float, int32, u, 2, 2, INT32_TO_FLOAT_N, INT32_TO_FLOAT_INIT
  UNPACK_2CH float, int32, a, 2, 2, INT32_TO_FLOAT_N, INT32_TO_FLOAT_INIT
  UNPACK_2CH int32, float, u, 2, 2, FLOAT_TO_INT32_N, FLOAT_TO_INT32_INIT
  UNPACK_2CH int32, float, a, 2, 2, FLOAT_TO_INT32_N, FLOAT_TO_INT32_INIT
  UNPACK_2CH float, int16, u, 2, 1, INT16_TO_FLOAT_N, INT16_TO_FLOAT_INIT
  UNPACK_2CH float, int16, a, 2, 1, INT16_TO_FLOAT_N, INT16_TO_FLOAT_INIT
  UNPACK_2CH int16, float, u, 1, 2, FLOAT_TO_INT16_N, FLOAT_TO_INT16_INIT
  UNPACK_2CH int16, float, a, 1, 2, FLOAT_TO_INT16_N, FLOAT_TO_INT16_INIT

  ; Six channels, float <-> int32.
  PACK_6CH   float, int32, u, 2, 2, 8, INT32_TO_FLOAT_N, INT32_TO_FLOAT_INIT
  PACK_6CH   float, int32, a, 2, 2, 8, INT32_TO_FLOAT_N, INT32_TO_FLOAT_INIT
  PACK_6CH   int32, float, u, 2, 2, 8, FLOAT_TO_INT32_N, FLOAT_TO_INT32_INIT
  PACK_6CH   int32, float, a, 2, 2, 8, FLOAT_TO_INT32_N, FLOAT_TO_INT32_INIT
  UNPACK_6CH float, int32, u, 2, 2, 8, INT32_TO_FLOAT_N, INT32_TO_FLOAT_INIT
  UNPACK_6CH float, int32, a, 2, 2, 8, INT32_TO_FLOAT_N, INT32_TO_FLOAT_INIT
  UNPACK_6CH int32, float, u, 2, 2, 8, FLOAT_TO_INT32_N, FLOAT_TO_INT32_INIT
  UNPACK_6CH int32, float, a, 2, 2, 8, FLOAT_TO_INT32_N, FLOAT_TO_INT32_INIT

  ; Eight channels planar -> packed.
  PACK_8CH   float, float, u, 2, 2,  9, NOP_N, NOP_N
  PACK_8CH   float, float, a, 2, 2,  9, NOP_N, NOP_N
  PACK_8CH   float, int32, u, 2, 2, 10, INT32_TO_FLOAT_N, INT32_TO_FLOAT_INIT
  PACK_8CH   float, int32, a, 2, 2, 10, INT32_TO_FLOAT_N, INT32_TO_FLOAT_INIT
  PACK_8CH   int32, float, u, 2, 2, 10, FLOAT_TO_INT32_N, FLOAT_TO_INT32_INIT
  PACK_8CH   int32, float, a, 2, 2, 10, FLOAT_TO_INT32_N, FLOAT_TO_INT32_INIT
  ;------------------------------------------------------------------------------
  ; SSSE3: two-channel unpack variants with 16-bit input, which take the
  ; pshufb path inside UNPACK_2CH. "u" precedes "a" because the "a" function
  ; jumps to a label that only the "u" function defines.
  ;------------------------------------------------------------------------------
  INIT_XMM ssse3
  UNPACK_2CH int16, int16, u, 1, 1, NOP_N, NOP_N
  UNPACK_2CH int16, int16, a, 1, 1, NOP_N, NOP_N
  UNPACK_2CH int32, int16, u, 2, 1, INT16_TO_INT32_N, NOP_N
  UNPACK_2CH int32, int16, a, 2, 1, INT16_TO_INT32_N, NOP_N
  UNPACK_2CH float, int16, u, 2, 1, INT16_TO_FLOAT_N, INT16_TO_FLOAT_INIT
  UNPACK_2CH float, int16, a, 2, 1, INT16_TO_FLOAT_N, INT16_TO_FLOAT_INIT
  ;------------------------------------------------------------------------------
  ; AVX instantiations, built only when HAVE_AVX_EXTERNAL is set.
  ; The 6- and 8-channel functions use XMM registers; only the single-plane
  ; int32 -> float conversion is instantiated with YMM registers.
  ; "u" precedes "a" because the "a" function jumps to a label that only the
  ; "u" function defines.
  ;------------------------------------------------------------------------------
  %if HAVE_AVX_EXTERNAL
  INIT_XMM avx
  PACK_6CH   float, float, u, 2, 2, 8, NOP_N, NOP_N
  PACK_6CH   float, float, a, 2, 2, 8, NOP_N, NOP_N
  UNPACK_6CH float, float, u, 2, 2, 8, NOP_N, NOP_N
  UNPACK_6CH float, float, a, 2, 2, 8, NOP_N, NOP_N

  PACK_6CH   float, int32, u, 2, 2, 8, INT32_TO_FLOAT_N, INT32_TO_FLOAT_INIT
  PACK_6CH   float, int32, a, 2, 2, 8, INT32_TO_FLOAT_N, INT32_TO_FLOAT_INIT
  PACK_6CH   int32, float, u, 2, 2, 8, FLOAT_TO_INT32_N, FLOAT_TO_INT32_INIT
  PACK_6CH   int32, float, a, 2, 2, 8, FLOAT_TO_INT32_N, FLOAT_TO_INT32_INIT
  UNPACK_6CH float, int32, u, 2, 2, 8, INT32_TO_FLOAT_N, INT32_TO_FLOAT_INIT
  UNPACK_6CH float, int32, a, 2, 2, 8, INT32_TO_FLOAT_N, INT32_TO_FLOAT_INIT
  UNPACK_6CH int32, float, u, 2, 2, 8, FLOAT_TO_INT32_N, FLOAT_TO_INT32_INIT
  UNPACK_6CH int32, float, a, 2, 2, 8, FLOAT_TO_INT32_N, FLOAT_TO_INT32_INIT

  PACK_8CH   float, float, u, 2, 2,  9, NOP_N, NOP_N
  PACK_8CH   float, float, a, 2, 2,  9, NOP_N, NOP_N
  PACK_8CH   float, int32, u, 2, 2, 10, INT32_TO_FLOAT_N, INT32_TO_FLOAT_INIT
  PACK_8CH   float, int32, a, 2, 2, 10, INT32_TO_FLOAT_N, INT32_TO_FLOAT_INIT
  PACK_8CH   int32, float, u, 2, 2, 10, FLOAT_TO_INT32_N, FLOAT_TO_INT32_INIT
  PACK_8CH   int32, float, a, 2, 2, 10, FLOAT_TO_INT32_N, FLOAT_TO_INT32_INIT

  INIT_YMM avx
  CONV       float, int32, u, 2, 2, INT32_TO_FLOAT_N, INT32_TO_FLOAT_INIT
  CONV       float, int32, a, 2, 2, INT32_TO_FLOAT_N, INT32_TO_FLOAT_INIT
  %endif
  ;------------------------------------------------------------------------------
  ; AVX2 instantiations, built only when HAVE_AVX2_EXTERNAL is set: the
  ; single-plane float -> int32 conversion with YMM registers. "u" precedes
  ; "a" because the "a" function jumps to a label that only the "u" function
  ; defines.
  ;------------------------------------------------------------------------------
  %if HAVE_AVX2_EXTERNAL
  INIT_YMM avx2
  CONV       int32, float, u, 2, 2, FLOAT_TO_INT32_N, FLOAT_TO_INT32_INIT
  CONV       int32, float, a, 2, 2, FLOAT_TO_INT32_N, FLOAT_TO_INT32_INIT
  %endif