;******************************************************************************
;* 32 point SSE-optimized DCT transform
;* Copyright (c) 2010 Vitor Sessak
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "x86inc.asm"
%include "x86util.asm"

SECTION_RODATA 32

align 32
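; Butterfly scale factors for the successive passes.  The first four rows are
; the pass-1 factors 0.5/cos((2*i+1)*pi/64) (second half negated, in reverse
; order); the remaining rows hold the corresponding factors for the later
; passes, down to 0.5/cos(pi/4) = sqrt(2)/2 for the final butterfly.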
ps_cos_vec: dd   0.500603,  0.505471,  0.515447,  0.531043
            dd   0.553104,  0.582935,  0.622504,  0.674808
            dd -10.190008, -3.407609, -2.057781, -1.484165
            dd  -1.169440, -0.972568, -0.839350, -0.744536
            dd   0.502419,  0.522499,  0.566944,  0.646822
            dd   0.788155,  1.060678,  1.722447,  5.101149
            dd   0.509796,  0.601345,  0.899976,  2.562916
            dd   0.509796,  0.601345,  0.899976,  2.562916
            dd   1.000000,  1.000000,  1.306563,  0.541196
            dd   1.000000,  1.000000,  1.306563,  0.541196
            dd   1.000000,  0.707107,  1.000000, -0.707107
            dd   1.000000,  0.707107,  1.000000, -0.707107
            dd   0.707107,  0.707107,  0.707107,  0.707107
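
; Sign mask: +,+,-,- within each 128-bit lane.  xorps with it negates the
; upper two floats of each group of four; pass 5 reshuffles it to +,-,+,-.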
align 32
ps_p1p1m1m1: dd 0, 0, 0x80000000, 0x80000000, 0, 0, 0x80000000, 0x80000000
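
; BUTTERFLY %1, %2, %3, %4:  %2 := %1 + %2,  %1 := (%1 - %2) * %3
; (%4 is a scratch register)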
%macro BUTTERFLY_SSE 4
    movaps %4, %1
    subps  %1, %2
    addps  %2, %4
    mulps  %1, %3
%endmacro

%macro BUTTERFLY_AVX 4
    vsubps %4, %1, %2
    vaddps %2, %2, %1
    vmulps %1, %4, %3
%endmacro
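
; BUTTERFLY0 %1, %2, %3, %4, %5: butterfly between element pairs of a single
; register:  %1 := (shuffle(%1, %5) + (%1 ^ %2)) * %3
; %2 is a sign mask, %3 the coefficients, %4 a scratch register.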
%macro BUTTERFLY0_SSE 5
    movaps %4, %1
    shufps %1, %1, %5
    xorps  %4, %2
    addps  %1, %4
    mulps  %1, %3
%endmacro

%macro BUTTERFLY0_SSE2 5
    pshufd %4, %1, %5
    xorps  %1, %2
    addps  %1, %4
    mulps  %1, %3
%endmacro

%macro BUTTERFLY0_AVX 5
    vshufps %4, %1, %1, %5
    vxorps  %1, %1, %2
    vaddps  %4, %4, %1
    vmulps  %1, %4, %3
%endmacro
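
; BUTTERFLY2 pairs elements (0,3) and (1,2) of each lane (0x1b reverses the
; four floats); BUTTERFLY3 pairs (0,1) and (2,3) (0xb1 swaps neighbours).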
%macro BUTTERFLY2 4
    BUTTERFLY0 %1, %2, %3, %4, 0x1b
%endmacro

%macro BUTTERFLY3 4
    BUTTERFLY0 %1, %2, %3, %4, 0xb1
%endmacro
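
; BUTTERFLY3V: vertical butterflies across two register pairs, with the
; differences scaled by sqrt(2)/2 (ps_cos_vec+192):
;   m%1 := m%1 + m%2,  m%2 := (m%1 - m%2) * c
;   m%3 := m%3 + m%4,  m%4 := (m%4 - m%3) * c   (note the opposite sign)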
%macro BUTTERFLY3V 5
    movaps m%5, m%1
    addps  m%1, m%2
    subps  m%5, m%2
    SWAP %2, %5
    mulps  m%2, [ps_cos_vec+192]

    movaps m%5, m%3
    addps  m%3, m%4
    subps  m%4, m%5
    mulps  m%4, [ps_cos_vec+192]
%endmacro
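
; Final scalar pass: accumulate the remaining partial sums with scalar adds
; and store the 32 outputs in their final, permuted order.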
%macro PASS6_AND_PERMUTE 0
    mov    tmpd, [outq+4]
    movss  m7, [outq+72]
    addss  m7, [outq+76]
    movss  m3, [outq+56]
    addss  m3, [outq+60]
    addss  m4, m3
    movss  m2, [outq+52]
    addss  m2, m3
    movss  m3, [outq+104]
    addss  m3, [outq+108]
    addss  m1, m3
    addss  m5, m4
    movss  [outq+ 16], m1
    movss  m1, [outq+100]
    addss  m1, m3
    movss  m3, [outq+40]
    movss  [outq+ 48], m1
    addss  m3, [outq+44]
    movss  m1, [outq+100]
    addss  m4, m3
    addss  m3, m2
    addss  m1, [outq+108]
    movss  [outq+ 40], m3
    addss  m2, [outq+36]
    movss  m3, [outq+8]
    movss  [outq+ 56], m2
    addss  m3, [outq+12]
    movss  [outq+ 32], m3
    movss  m3, [outq+80]
    movss  [outq+ 8], m5
    movss  [outq+ 80], m1
    movss  m2, [outq+52]
    movss  m5, [outq+120]
    addss  m5, [outq+124]
    movss  m1, [outq+64]
    addss  m2, [outq+60]
    addss  m0, m5
    addss  m5, [outq+116]
    mov    [outq+64], tmpd
    addss  m6, m0
    addss  m1, m6
    mov    tmpd, [outq+12]
    mov    [outq+ 96], tmpd
    movss  [outq+ 4], m1
    movss  m1, [outq+24]
    movss  [outq+ 24], m4
    movss  m4, [outq+88]
    addss  m4, [outq+92]
    addss  m3, m4
    addss  m4, [outq+84]
    mov    tmpd, [outq+108]
    addss  m1, [outq+28]
    addss  m0, m1
    addss  m1, m5
    addss  m6, m3
    addss  m3, m0
    addss  m0, m7
    addss  m5, [outq+20]
    addss  m7, m1
    movss  [outq+ 12], m6
    mov    [outq+112], tmpd
    movss  m6, [outq+28]
    movss  [outq+ 28], m0
    movss  m0, [outq+36]
    movss  [outq+ 36], m7
    addss  m1, m4
    movss  m7, [outq+116]
    addss  m0, m2
    addss  m7, [outq+124]
    movss  [outq+ 72], m0
    movss  m0, [outq+44]
    addss  m2, m0
    movss  [outq+ 44], m1
    movss  [outq+ 88], m2
    addss  m0, [outq+60]
    mov    tmpd, [outq+60]
    mov    [outq+120], tmpd
    movss  [outq+104], m0
    addss  m4, m5
    addss  m5, [outq+68]
    movss  [outq+52], m4
    movss  [outq+60], m5
    movss  m4, [outq+68]
    movss  m5, [outq+20]
    movss  [outq+ 20], m3
    addss  m5, m7
    addss  m7, m6
    addss  m4, m5
    movss  m2, [outq+84]
    addss  m2, [outq+92]
    addss  m5, m2
    movss  [outq+ 68], m4
    addss  m2, m7
    movss  m4, [outq+76]
    movss  [outq+ 84], m2
    movss  [outq+ 76], m5
    addss  m7, m4
    addss  m6, [outq+124]
    addss  m4, m6
    addss  m6, [outq+92]
    movss  [outq+100], m4
    movss  [outq+108], m6
    movss  m6, [outq+92]
    movss  [outq+92], m7
    addss  m6, [outq+124]
    movss  [outq+116], m6
%endmacro

%define BUTTERFLY  BUTTERFLY_AVX
%define BUTTERFLY0 BUTTERFLY0_AVX

INIT_YMM
SECTION_TEXT
%ifdef HAVE_AVX
; void ff_dct32_float_avx(FFTSample *out, const FFTSample *in)
cglobal dct32_float_avx, 2,3,8, out, in, tmp
    ; pass 1
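    ; m4 = in[0..7]; m5 is assembled lane by lane and reversed within each
    ; lane, giving in[31..24], so the butterfly pairs in[i] with in[31-i]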
    vmovaps      m4, [inq+0]
    vinsertf128  m5, m5, [inq+96], 1
    vinsertf128  m5, m5, [inq+112], 0
    vshufps      m5, m5, m5, 0x1b
    BUTTERFLY    m4, m5, [ps_cos_vec], m6

    vmovaps      m2, [inq+64]
    vinsertf128  m6, m6, [inq+32], 1
    vinsertf128  m6, m6, [inq+48], 0
    vshufps      m6, m6, m6, 0x1b
    BUTTERFLY    m2, m6, [ps_cos_vec+32], m0

    ; pass 2
    BUTTERFLY    m5, m6, [ps_cos_vec+64], m0
    BUTTERFLY    m4, m2, [ps_cos_vec+64], m7

    ; pass 3
    vperm2f128   m3, m6, m4, 0x31
    vperm2f128   m1, m6, m4, 0x20
    vshufps      m3, m3, m3, 0x1b
    BUTTERFLY    m1, m3, [ps_cos_vec+96], m6

    vperm2f128   m4, m5, m2, 0x20
    vperm2f128   m5, m5, m2, 0x31
    vshufps      m5, m5, m5, 0x1b
    BUTTERFLY    m4, m5, [ps_cos_vec+96], m6

    ; pass 4
    vmovaps      m6, [ps_p1p1m1m1+0]
    vmovaps      m2, [ps_cos_vec+128]

    BUTTERFLY2   m5, m6, m2, m7
    BUTTERFLY2   m4, m6, m2, m7
    BUTTERFLY2   m1, m6, m2, m7
    BUTTERFLY2   m3, m6, m2, m7

    ; pass 5
    vshufps      m6, m6, m6, 0xcc
    vmovaps      m2, [ps_cos_vec+160]

    BUTTERFLY3   m5, m6, m2, m7
    BUTTERFLY3   m4, m6, m2, m7
    BUTTERFLY3   m1, m6, m2, m7
    BUTTERFLY3   m3, m6, m2, m7

    vperm2f128   m6, m3, m3, 0x31
    vmovaps      [outq], m3

    vextractf128 [outq+64], m5, 1
    vextractf128 [outq+32], m5, 0

    vextractf128 [outq+80], m4, 1
    vextractf128 [outq+48], m4, 0

    vperm2f128   m0, m1, m1, 0x31
    vmovaps      [outq+96], m1

    vzeroupper

    ; pass 6, no SIMD...
INIT_XMM
    PASS6_AND_PERMUTE
    RET
%endif

%define BUTTERFLY  BUTTERFLY_SSE
%define BUTTERFLY0 BUTTERFLY0_SSE

%ifdef ARCH_X86_64
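; On x86_64 all 16 XMM registers are available, so "spilling" is just a
; register rename.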
%define SPILL   SWAP
%define UNSPILL SWAP
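
; Pass 5, x86_64 version: transpose each group of four registers so that the
; remaining butterflies run vertically on whole vectors, then accumulate the
; partial sums needed by pass 6.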
%macro PASS5 0
    nop ; FIXME code alignment
    SWAP 5, 8
    SWAP 4, 12
    SWAP 6, 14
    SWAP 7, 13
    SWAP 0, 15
    PERMUTE 9,10, 10,12, 11,14, 12,9, 13,11, 14,13
    TRANSPOSE4x4PS 8, 9, 10, 11, 0
    BUTTERFLY3V    8, 9, 10, 11, 0
    addps   m10, m11
    TRANSPOSE4x4PS 12, 13, 14, 15, 0
    BUTTERFLY3V    12, 13, 14, 15, 0
    addps   m14, m15
    addps   m12, m14
    addps   m14, m13
    addps   m13, m15
%endmacro
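
; Pass 6, x86_64 version: element 0 of each register is stored directly
; (every fourth output), element 1 feeds the pairwise sums stored next, and
; the upper halves are chained into running sums for the remaining outputs
; by the %rep loops.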
%macro PASS6 0
    SWAP 9, 12
    SWAP 11, 14
    movss   [outq+0x00], m8
    pshuflw m0, m8, 0xe
    movss   [outq+0x10], m9
    pshuflw m1, m9, 0xe
    movss   [outq+0x20], m10
    pshuflw m2, m10, 0xe
    movss   [outq+0x30], m11
    pshuflw m3, m11, 0xe
    movss   [outq+0x40], m12
    pshuflw m4, m12, 0xe
    movss   [outq+0x50], m13
    pshuflw m5, m13, 0xe
    movss   [outq+0x60], m14
    pshuflw m6, m14, 0xe
    movaps  [outq+0x70], m15
    pshuflw m7, m15, 0xe
    addss   m0, m1
    addss   m1, m2
    movss   [outq+0x08], m0
    addss   m2, m3
    movss   [outq+0x18], m1
    addss   m3, m4
    movss   [outq+0x28], m2
    addss   m4, m5
    movss   [outq+0x38], m3
    addss   m5, m6
    movss   [outq+0x48], m4
    addss   m6, m7
    movss   [outq+0x58], m5
    movss   [outq+0x68], m6
    movss   [outq+0x78], m7
    PERMUTE 1,8, 3,9, 5,10, 7,11, 9,12, 11,13, 13,14, 8,1, 10,3, 12,5, 14,7
    movhlps m0, m1
    pshufd  m1, m1, 3
    SWAP 0, 2, 4, 6, 8, 10, 12, 14
    SWAP 1, 3, 5, 7, 9, 11, 13, 15
%rep 7
    movhlps m0, m1
    pshufd  m1, m1, 3
    addss   m15, m1
    SWAP 0, 2, 4, 6, 8, 10, 12, 14
    SWAP 1, 3, 5, 7, 9, 11, 13, 15
%endrep
%assign i 4
%rep 15
    addss m0, m1
    movss [outq+i], m0
    SWAP 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
%assign i i+8
%endrep
%endmacro

%else ; ARCH_X86_32
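
; On x86_32 only 8 XMM registers exist, so "registers" 8-15 live in the
; output buffer instead.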
%macro SPILL 2 ; xmm#, mempos
    movaps [outq+(%2-8)*16], m%1
%endmacro
%macro UNSPILL 2
    movaps m%1, [outq+(%2-8)*16]
%endmacro

%define PASS6 PASS6_AND_PERMUTE
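
; Pass 5, x86_32 version: apply BUTTERFLY3 to every row, reloading and
; re-spilling rows through the output buffer as needed.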
%macro PASS5 0
    movaps m2, [ps_cos_vec+160]
    shufps m3, m3, 0xcc

    BUTTERFLY3 m5, m3, m2, m1
    SPILL 5, 8

    UNSPILL 1, 9
    BUTTERFLY3 m1, m3, m2, m5
    SPILL 1, 14

    BUTTERFLY3 m4, m3, m2, m5
    SPILL 4, 12

    BUTTERFLY3 m7, m3, m2, m5
    SPILL 7, 13

    UNSPILL 5, 10
    BUTTERFLY3 m5, m3, m2, m7
    SPILL 5, 10

    UNSPILL 4, 11
    BUTTERFLY3 m4, m3, m2, m7
    SPILL 4, 11

    BUTTERFLY3 m6, m3, m2, m7
    SPILL 6, 9

    BUTTERFLY3 m0, m3, m2, m7
    SPILL 0, 15
%endmacro
%endif

INIT_XMM
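
; DCT32_FUNC emits one SSE/SSE2 body.  Passes 1 and 2 are interleaved,
; presumably to keep the working set within eight XMM registers, followed by
; passes 3-4 and the PASS5/PASS6 variants selected above.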
%macro DCT32_FUNC 1
; void ff_dct32_float_sse(FFTSample *out, const FFTSample *in)
cglobal dct32_float_%1, 2,3,16, out, in, tmp
    ; pass 1
    movaps     m0, [inq+0]
    LOAD_INV   m1, [inq+112]
    BUTTERFLY  m0, m1, [ps_cos_vec], m3

    movaps     m7, [inq+64]
    LOAD_INV   m4, [inq+48]
    BUTTERFLY  m7, m4, [ps_cos_vec+32], m3

    ; pass 2
    movaps     m2, [ps_cos_vec+64]
    BUTTERFLY  m1, m4, m2, m3
    SPILL 1, 11
    SPILL 4, 8

    ; pass 1
    movaps     m1, [inq+16]
    LOAD_INV   m6, [inq+96]
    BUTTERFLY  m1, m6, [ps_cos_vec+16], m3

    movaps     m4, [inq+80]
    LOAD_INV   m5, [inq+32]
    BUTTERFLY  m4, m5, [ps_cos_vec+48], m3

    ; pass 2
    BUTTERFLY  m0, m7, m2, m3

    movaps     m2, [ps_cos_vec+80]
    BUTTERFLY  m6, m5, m2, m3
    BUTTERFLY  m1, m4, m2, m3

    ; pass 3
    movaps     m2, [ps_cos_vec+96]
    shufps     m1, m1, 0x1b
    BUTTERFLY  m0, m1, m2, m3
    SPILL 0, 15
    SPILL 1, 14

    UNSPILL 0, 8
    shufps     m5, m5, 0x1b
    BUTTERFLY  m0, m5, m2, m3

    UNSPILL 1, 11
    shufps     m6, m6, 0x1b
    BUTTERFLY  m1, m6, m2, m3
    SPILL 1, 11

    shufps     m4, m4, 0x1b
    BUTTERFLY  m7, m4, m2, m3

    ; pass 4
    movaps     m3, [ps_p1p1m1m1+0]
    movaps     m2, [ps_cos_vec+128]

    BUTTERFLY2 m5, m3, m2, m1
    BUTTERFLY2 m0, m3, m2, m1
    SPILL 0, 9

    BUTTERFLY2 m6, m3, m2, m1
    SPILL 6, 10

    UNSPILL 0, 11
    BUTTERFLY2 m0, m3, m2, m1
    SPILL 0, 11

    BUTTERFLY2 m4, m3, m2, m1
    BUTTERFLY2 m7, m3, m2, m1

    UNSPILL 6, 14
    BUTTERFLY2 m6, m3, m2, m1

    UNSPILL 0, 15
    BUTTERFLY2 m0, m3, m2, m1

    PASS5
    PASS6
    RET
%endmacro
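
; LOAD_INV loads four floats in reversed element order (SSE: movaps followed
; by shufps 0x1b; SSE2: a single pshufd).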
%macro LOAD_INV_SSE 2
    movaps %1, %2
    shufps %1, %1, 0x1b
%endmacro

%define LOAD_INV LOAD_INV_SSE
DCT32_FUNC sse

%macro LOAD_INV_SSE2 2
    pshufd %1, %2, 0x1b
%endmacro

%define LOAD_INV LOAD_INV_SSE2
%define BUTTERFLY0 BUTTERFLY0_SSE2
DCT32_FUNC sse2