dct32.asm 11 KB


  1. ;******************************************************************************
  2. ;* 32 point SSE-optimized DCT transform
  3. ;* Copyright (c) 2010 Vitor Sessak
  4. ;*
  5. ;* This file is part of FFmpeg.
  6. ;*
  7. ;* FFmpeg is free software; you can redistribute it and/or
  8. ;* modify it under the terms of the GNU Lesser General Public
  9. ;* License as published by the Free Software Foundation; either
  10. ;* version 2.1 of the License, or (at your option) any later version.
  11. ;*
  12. ;* FFmpeg is distributed in the hope that it will be useful,
  13. ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
  14. ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  15. ;* Lesser General Public License for more details.
  16. ;*
  17. ;* You should have received a copy of the GNU Lesser General Public
  18. ;* License along with FFmpeg; if not, write to the Free Software
  19. ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  20. ;******************************************************************************
  21. %include "libavutil/x86/x86util.asm"
  ; Read-only constant tables used by the dct32_float kernels in this file.
  22. SECTION_RODATA 32
  23. align 32
  ; ps_cos_vec: 13 rows of four single-precision values (16 bytes per row).
  ; Byte offsets as they are referenced by the code in this file:
  ;   +0, +16, +32, +48 : pass 1 multipliers
  ;   +64, +80          : pass 2 multipliers
  ;   +96               : pass 3 multipliers (row +112 is an identical copy)
  ;   +128              : pass 4 multipliers (row +144 is an identical copy)
  ;   +160              : pass 5 multipliers (row +176 is an identical copy)
  ;   +192              : multiplier used by BUTTERFLY3V
  ; The AVX kernel reads 32 bytes at +96/+128/+160, so the duplicated rows give
  ; both 128-bit halves of a ymm register the same four factors.
  ; rows +0 .. +48
  24. ps_cos_vec: dd 0.500603, 0.505471, 0.515447, 0.531043
  25. dd 0.553104, 0.582935, 0.622504, 0.674808
  26. dd -10.190008, -3.407609, -2.057781, -1.484165
  27. dd -1.169440, -0.972568, -0.839350, -0.744536
  ; rows +64, +80
  28. dd 0.502419, 0.522499, 0.566944, 0.646822
  29. dd 0.788155, 1.060678, 1.722447, 5.101149
  ; rows +96, +112
  30. dd 0.509796, 0.601345, 0.899976, 2.562916
  31. dd 0.509796, 0.601345, 0.899976, 2.562916
  ; rows +128, +144
  32. dd 1.000000, 1.000000, 1.306563, 0.541196
  33. dd 1.000000, 1.000000, 1.306563, 0.541196
  ; rows +160, +176
  34. dd 1.000000, 0.707107, 1.000000, -0.707107
  35. dd 1.000000, 0.707107, 1.000000, -0.707107
  ; row +192
  36. dd 0.707107, 0.707107, 0.707107, 0.707107
  37. align 32
  ; ps_p1p1m1m1: XOR mask with 0 in elements 0-1 and 0x80000000 (the
  ; single-precision sign bit) in elements 2-3 of each 128-bit half.
  ; BUTTERFLY0 applies it with xorps, which flips the sign of elements 2 and 3.
  ; Pass 5 reshuffles it with immediate 0xcc into the pattern (0, sign, 0, sign).
  38. ps_p1p1m1m1: dd 0, 0, 0x80000000, 0x80000000, 0, 0, 0x80000000, 0x80000000
  ; BUTTERFLY a, b, cos, tmp
  ;   %1 (a)   : in/out vector register; on exit holds (a - b) * cos
  ;   %2 (b)   : in/out vector register; on exit holds a + b
  ;   %3 (cos) : multiplier, register or memory operand (both forms are used)
  ;   %4 (tmp) : scratch register, clobbered (left holding a - b)
  ; NOTE(review): written in three-operand form even for the SSE builds; this
  ; presumably relies on the x86inc instruction wrappers pulled in through
  ; x86util.asm - confirm.
  39. %macro BUTTERFLY 4
  40. subps %4, %1, %2
  41. addps %2, %2, %1
  42. mulps %1, %4, %3
  43. %endmacro
  ; BUTTERFLY0 x, sign, cos, tmp, shuf
  ;   Computes x = (shuffle(x, shuf) + (x XOR sign)) * cos.
  ;   %1 (x)    : in/out vector register
  ;   %2 (sign) : XOR mask applied to x (sign-bit mask in every caller here)
  ;   %3 (cos)  : multiplier
  ;   %4 (tmp)  : scratch register, clobbered
  ;   %5 (shuf) : shuffle immediate applied to x
  ; Both branches leave the same result in %1 but different values in %4:
  ; the SSE2 branch leaves the shuffled copy of x there, the other branch
  ; leaves the un-multiplied sum.
  44. %macro BUTTERFLY0 5
  ; SSE2 without AVX: pshufd copies and shuffles in one instruction, so the
  ; remaining operations can stay in two-operand form and accumulate in %1.
  45. %if cpuflag(sse2) && notcpuflag(avx)
  46. pshufd %4, %1, %5
  47. xorps %1, %2
  48. addps %1, %4
  49. mulps %1, %3
  ; Any other instruction set (plain SSE, AVX): shufps with %1 as both sources
  ; performs the same permutation; the sum is built in %4.
  50. %else
  51. shufps %4, %1, %1, %5
  52. xorps %1, %1, %2
  53. addps %4, %4, %1
  54. mulps %1, %4, %3
  55. %endif
  56. %endmacro
  ; BUTTERFLY2 x, sign, cos, tmp
  ;   BUTTERFLY0 with shuffle immediate 0x1b, which reverses the order of the
  ;   four elements (3,2,1,0) within each 128-bit group. Used for pass 4.
  57. %macro BUTTERFLY2 4
  58. BUTTERFLY0 %1, %2, %3, %4, 0x1b
  59. %endmacro
  ; BUTTERFLY3 x, sign, cos, tmp
  ;   BUTTERFLY0 with shuffle immediate 0xb1, which swaps the elements of each
  ;   adjacent pair (1,0,3,2) within each 128-bit group. Used for pass 5.
  60. %macro BUTTERFLY3 4
  61. BUTTERFLY0 %1, %2, %3, %4, 0xb1
  62. %endmacro
  ; BUTTERFLY3V a, b, c, d, tmp
  ;   All five arguments are register NUMBERS (the macro prepends "m").
  ;   Processes two independent register pairs using the row at ps_cos_vec+192:
  ;     a = a + b          b = (a - b) * [ps_cos_vec+192]
  ;     c = c + d          d = (d - c) * [ps_cos_vec+192]
  ;   Note that the subtraction order differs between the two pairs.
  ;   tmp is clobbered.
  63. %macro BUTTERFLY3V 5
  64. movaps m%5, m%1
  65. addps m%1, m%2
  66. subps m%5, m%2
  ; The difference was built in tmp. SWAP is presumably the x86inc register
  ; rename, so after this line the name m%2 refers to the difference and
  ; m%5 to the old (now dead) b.
  67. SWAP %2, %5
  68. mulps m%2, [ps_cos_vec+192]
  ; Second pair: tmp keeps a copy of the original c so that d - c can be
  ; formed after c has been overwritten with the sum.
  69. movaps m%5, m%3
  70. addps m%3, m%4
  71. subps m%4, m%5
  72. mulps m%4, [ps_cos_vec+192]
  73. %endmacro
  ; PASS6_AND_PERMUTE
  ;   Final, purely scalar stage: combines values with movss/addss and moves
  ;   them to their final positions, working in place on the buffer at outq.
  ;   Inputs : the 32 floats already stored at [outq+0 .. outq+124], plus the
  ;            low elements of m0, m1, m4, m5 and m6, which are read before
  ;            this macro ever writes them.
  ;   Scratch: m2, m3, m7 (loaded from memory before first use) and tmpd,
  ;            which carries raw 32-bit copies between buffer slots.
  ;   Output : every 4-byte slot from [outq+4] to [outq+120] is stored exactly
  ;            once; [outq+0] and [outq+124] are never stored here and keep
  ;            whatever the caller put there.
  ;   Because the buffer is updated in place, each load of a slot must stay
  ;   ahead of the store that overwrites it - do not reorder.
  74. %macro PASS6_AND_PERMUTE 0
  ; Save the old value at offset 4; it is written to offset 64 further down,
  ; after the old value at offset 64 has been loaded into m1.
  75. mov tmpd, [outq+4]
  76. movss m7, [outq+72]
  77. addss m7, [outq+76]
  78. movss m3, [outq+56]
  79. addss m3, [outq+60]
  80. addss m4, m3
  81. movss m2, [outq+52]
  82. addss m2, m3
  83. movss m3, [outq+104]
  84. addss m3, [outq+108]
  85. addss m1, m3
  86. addss m5, m4
  87. movss [outq+ 16], m1
  88. movss m1, [outq+100]
  89. addss m1, m3
  90. movss m3, [outq+40]
  91. movss [outq+ 48], m1
  92. addss m3, [outq+44]
  93. movss m1, [outq+100]
  94. addss m4, m3
  95. addss m3, m2
  96. addss m1, [outq+108]
  97. movss [outq+ 40], m3
  98. addss m2, [outq+36]
  99. movss m3, [outq+8]
  100. movss [outq+ 56], m2
  101. addss m3, [outq+12]
  102. movss [outq+ 32], m3
  103. movss m3, [outq+80]
  104. movss [outq+ 8], m5
  105. movss [outq+ 80], m1
  106. movss m2, [outq+52]
  107. movss m5, [outq+120]
  108. addss m5, [outq+124]
  109. movss m1, [outq+64]
  110. addss m2, [outq+60]
  111. addss m0, m5
  112. addss m5, [outq+116]
  ; Old offset-4 value lands at offset 64 (its old content is already in m1).
  113. mov [outq+64], tmpd
  114. addss m6, m0
  115. addss m1, m6
  ; Plain copy of the old offset-12 value to offset 96.
  116. mov tmpd, [outq+12]
  117. mov [outq+ 96], tmpd
  118. movss [outq+ 4], m1
  119. movss m1, [outq+24]
  120. movss [outq+ 24], m4
  121. movss m4, [outq+88]
  122. addss m4, [outq+92]
  123. addss m3, m4
  124. addss m4, [outq+84]
  ; Old offset-108 value is held in tmpd until it is stored to offset 112.
  125. mov tmpd, [outq+108]
  126. addss m1, [outq+28]
  127. addss m0, m1
  128. addss m1, m5
  129. addss m6, m3
  130. addss m3, m0
  131. addss m0, m7
  132. addss m5, [outq+20]
  133. addss m7, m1
  134. movss [outq+ 12], m6
  135. mov [outq+112], tmpd
  136. movss m6, [outq+28]
  137. movss [outq+ 28], m0
  138. movss m0, [outq+36]
  139. movss [outq+ 36], m7
  140. addss m1, m4
  141. movss m7, [outq+116]
  142. addss m0, m2
  143. addss m7, [outq+124]
  144. movss [outq+ 72], m0
  145. movss m0, [outq+44]
  146. addss m2, m0
  147. movss [outq+ 44], m1
  148. movss [outq+ 88], m2
  149. addss m0, [outq+60]
  ; Plain copy of the old offset-60 value to offset 120, taken before
  ; offset 60 is overwritten below.
  150. mov tmpd, [outq+60]
  151. mov [outq+120], tmpd
  152. movss [outq+104], m0
  153. addss m4, m5
  154. addss m5, [outq+68]
  155. movss [outq+52], m4
  156. movss [outq+60], m5
  157. movss m4, [outq+68]
  158. movss m5, [outq+20]
  159. movss [outq+ 20], m3
  160. addss m5, m7
  161. addss m7, m6
  162. addss m4, m5
  163. movss m2, [outq+84]
  164. addss m2, [outq+92]
  165. addss m5, m2
  166. movss [outq+ 68], m4
  167. addss m2, m7
  168. movss m4, [outq+76]
  169. movss [outq+ 84], m2
  170. movss [outq+ 76], m5
  171. addss m7, m4
  172. addss m6, [outq+124]
  173. addss m4, m6
  174. addss m6, [outq+92]
  175. movss [outq+100], m4
  176. movss [outq+108], m6
  ; Offset 92 is reloaded into m6 immediately before being overwritten.
  177. movss m6, [outq+92]
  178. movss [outq+92], m7
  179. addss m6, [outq+124]
  180. movss [outq+116], m6
  181. %endmacro
  ; AVX build of dct32_float. Passes 1-5 run on 256-bit registers (two 128-bit
  ; halves per register); pass 6 is the scalar PASS6_AND_PERMUTE macro.
  ; cglobal parameters: out, in (named arguments) and tmp (extra integer
  ; register). NOTE(review): "2,3,8" is presumably the x86inc
  ; args / gprs / vector-register count triple - confirm.
  ; The aligned vmovaps accesses below need [inq] and [outq] to be 32-byte
  ; aligned.
  182. INIT_YMM avx
  183. SECTION .text
  184. %if HAVE_AVX_EXTERNAL
  185. ; void ff_dct32_float_avx(FFTSample *out, const FFTSample *in)
  186. cglobal dct32_float, 2,3,8, out, in, tmp
  187. ; pass 1
  ; m4 = in[0..7]. m5 is assembled from two 16-byte loads (both halves are
  ; overwritten, so its previous content does not matter) and each half is then
  ; reversed with 0x1b, giving element k = in[31-k].
  188. vmovaps m4, [inq+0]
  189. vinsertf128 m5, m5, [inq+96], 1
  190. vinsertf128 m5, m5, [inq+112], 0
  191. vshufps m5, m5, m5, 0x1b
  192. BUTTERFLY m4, m5, [ps_cos_vec], m6
  ; Same construction for the middle of the input: m2 = in[16..23],
  ; m6 element k = in[15-k].
  193. vmovaps m2, [inq+64]
  194. vinsertf128 m6, m6, [inq+32], 1
  195. vinsertf128 m6, m6, [inq+48], 0
  196. vshufps m6, m6, m6, 0x1b
  197. BUTTERFLY m2, m6, [ps_cos_vec+32], m0
  198. ; pass 2
  199. BUTTERFLY m5, m6, [ps_cos_vec+64], m0
  200. BUTTERFLY m4, m2, [ps_cos_vec+64], m7
  201. ; pass 3
  ; vperm2f128 0x31 gathers the two high halves, 0x20 the two low halves of
  ; its sources; the high-half vector is then reversed per 128-bit half.
  202. vperm2f128 m3, m6, m4, 0x31
  203. vperm2f128 m1, m6, m4, 0x20
  204. vshufps m3, m3, m3, 0x1b
  205. BUTTERFLY m1, m3, [ps_cos_vec+96], m6
  206. vperm2f128 m4, m5, m2, 0x20
  207. vperm2f128 m5, m5, m2, 0x31
  208. vshufps m5, m5, m5, 0x1b
  209. BUTTERFLY m4, m5, [ps_cos_vec+96], m6
  210. ; pass 4
  ; m6 = sign mask, m2 = pass 4 multipliers (both reused by all four calls).
  211. vmovaps m6, [ps_p1p1m1m1+0]
  212. vmovaps m2, [ps_cos_vec+128]
  213. BUTTERFLY2 m5, m6, m2, m7
  214. BUTTERFLY2 m4, m6, m2, m7
  215. BUTTERFLY2 m1, m6, m2, m7
  216. BUTTERFLY2 m3, m6, m2, m7
  217. ; pass 5
  ; Shuffling the mask with 0xcc turns (0,0,s,s) into (0,s,0,s) per half.
  218. vshufps m6, m6, m6, 0xcc
  219. vmovaps m2, [ps_cos_vec+160]
  220. BUTTERFLY3 m5, m6, m2, m7
  221. BUTTERFLY3 m4, m6, m2, m7
  222. BUTTERFLY3 m1, m6, m2, m7
  223. BUTTERFLY3 m3, m6, m2, m7
  ; Write the pass 5 results to the output buffer, which pass 6 then rewrites
  ; in place. m6 and m0 receive the high halves of m3 and m1 in their low
  ; 128 bits; PASS6_AND_PERMUTE reads m0, m1, m4, m5 and m6 as scalar inputs.
  224. vperm2f128 m6, m3, m3, 0x31
  225. vmovaps [outq], m3
  226. vextractf128 [outq+64], m5, 1
  227. vextractf128 [outq+32], m5, 0
  228. vextractf128 [outq+80], m4, 1
  229. vextractf128 [outq+48], m4, 0
  230. vperm2f128 m0, m1, m1, 0x31
  231. vmovaps [outq+96], m1
  ; Clear the upper ymm halves before the scalar code that follows.
  232. vzeroupper
  233. ; pass 6, no SIMD...
  ; NOTE(review): INIT_XMM without a cpu name presumably makes the m#
  ; names in the macro below expand to xmm registers - confirm.
  234. INIT_XMM
  235. PASS6_AND_PERMUTE
  236. RET
  237. %endif
  ; x86-64 build: DCT32_FUNC declares 16 vector registers, so "spilling" to
  ; slots 8-15 needs no memory traffic. SPILL and UNSPILL both alias SWAP
  ; (presumably the x86inc register rename), i.e. SPILL n, slot just exchanges
  ; the names of register n and register slot.
  238. %if ARCH_X86_64
  239. %define SPILL SWAP
  240. %define UNSPILL SWAP
  ; PASS5 (x86-64 variant)
  ;   Works on the eight vectors left by pass 4 of DCT32_FUNC. The values not
  ;   already in slots 8-15 are renamed into them, the slots are reordered
  ;   with PERMUTE, and each group of four registers (8-11 and 12-15) is
  ;   transposed and run through BUTTERFLY3V, followed by a few vector adds.
  ;   Register 0 is the scratch register for both the transposes and the
  ;   butterflies.
  ;   PERMUTE and TRANSPOSE4x4PS are not defined in this file; they presumably
  ;   come from the included x86util.asm / x86inc.
  241. %macro PASS5 0
  242. nop ; FIXME code alignment
  ; Move the results still living in m5, m4, m6, m7, m0 into the upper bank.
  243. SWAP 5, 8
  244. SWAP 4, 12
  245. SWAP 6, 14
  246. SWAP 7, 13
  247. SWAP 0, 15
  248. PERMUTE 9,10, 10,12, 11,14, 12,9, 13,11, 14,13
  ; First group of four.
  249. TRANSPOSE4x4PS 8, 9, 10, 11, 0
  250. BUTTERFLY3V 8, 9, 10, 11, 0
  251. addps m10, m11
  ; Second group of four, plus its extra accumulation steps.
  252. TRANSPOSE4x4PS 12, 13, 14, 15, 0
  253. BUTTERFLY3V 12, 13, 14, 15, 0
  254. addps m14, m15
  255. addps m12, m14
  256. addps m14, m13
  257. addps m13, m15
  258. %endmacro
  ; PASS6 (x86-64 variant)
  ;   Final stage for the 16-register build: takes the eight vectors in
  ;   m8-m15 and writes all 32 output floats.
  ;   Store pattern produced by this macro:
  ;     offsets 0x00,0x10,...,0x60 : element 0 of m8..m14 (movss)
  ;     offset  0x70..0x7f         : all of m15 (movaps); 0x74 and 0x78 are
  ;                                  overwritten later, 0x7c is not
  ;     offsets 0x08,0x18,...,0x78 : element 1 of m8..m15, each added to
  ;                                  its successor's element 1 (last one as is)
  ;     offsets 4,12,...,116       : sums built from elements 2 and 3
  ;   m0-m7 are used as scratch. The macro uses the assembler variable i.
  259. %macro PASS6 0
  260. SWAP 9, 12
  261. SWAP 11, 14
  ; pshuflw with immediate 0xe moves dword 1 of the source into the low dword
  ; of the destination, so m0..m7 receive element 1 of m8..m15 for the scalar
  ; additions below.
  262. movss [outq+0x00], m8
  263. pshuflw m0, m8, 0xe
  264. movss [outq+0x10], m9
  265. pshuflw m1, m9, 0xe
  266. movss [outq+0x20], m10
  267. pshuflw m2, m10, 0xe
  268. movss [outq+0x30], m11
  269. pshuflw m3, m11, 0xe
  270. movss [outq+0x40], m12
  271. pshuflw m4, m12, 0xe
  272. movss [outq+0x50], m13
  273. pshuflw m5, m13, 0xe
  274. movss [outq+0x60], m14
  275. pshuflw m6, m14, 0xe
  ; Full 16-byte store: this is the only write to offset 0x7c in the macro.
  276. movaps [outq+0x70], m15
  277. pshuflw m7, m15, 0xe
  ; Chain of neighbour sums: m(k) += m(k+1), stored at 0x08 + k*0x10.
  278. addss m0, m1
  279. addss m1, m2
  280. movss [outq+0x08], m0
  281. addss m2, m3
  282. movss [outq+0x18], m1
  283. addss m3, m4
  284. movss [outq+0x28], m2
  285. addss m4, m5
  286. movss [outq+0x38], m3
  287. addss m5, m6
  288. movss [outq+0x48], m4
  289. addss m6, m7
  290. movss [outq+0x58], m5
  291. movss [outq+0x68], m6
  292. movss [outq+0x78], m7
  ; Rearrange the register names, then split each source vector into two
  ; scalars: movhlps brings elements 2-3 down (element 2 becomes the low
  ; element of m0) and pshufd with immediate 3 puts element 3 into the low
  ; element of m1. The SWAP chains rotate the even / odd register names so
  ; that the same two instructions act on the next vector each time.
  293. PERMUTE 1,8, 3,9, 5,10, 7,11, 9,12, 11,13, 13,14, 8,1, 10,3, 12,5, 14,7
  294. movhlps m0, m1
  295. pshufd m1, m1, 3
  296. SWAP 0, 2, 4, 6, 8, 10, 12, 14
  297. SWAP 1, 3, 5, 7, 9, 11, 13, 15
  ; Remaining seven vectors; here the extracted element 3 is additionally
  ; accumulated into the register currently named m15.
  298. %rep 7
  299. movhlps m0, m1
  300. pshufd m1, m1, 3
  301. addss m15, m1
  302. SWAP 0, 2, 4, 6, 8, 10, 12, 14
  303. SWAP 1, 3, 5, 7, 9, 11, 13, 15
  304. %endrep
  ; Fifteen neighbour sums, stored at byte offsets 4, 12, ..., 116. The
  ; 16-way SWAP rotates all register names by one after every store.
  305. %assign i 4
  306. %rep 15
  307. addss m0, m1
  308. movss [outq+i], m0
  309. SWAP 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
  310. %assign i i+8
  311. %endrep
  312. %endmacro
  ; x86-32 build: there is no upper register bank, so the output buffer itself
  ; serves as spill space. Slot numbers 8..15 map to [outq+0], [outq+16], ...,
  ; [outq+112] (16 bytes per slot, slot 8 at offset 0).
  313. %else ; ARCH_X86_32
  ; SPILL reg, slot : store vector register m<reg> to its slot (aligned store).
  314. %macro SPILL 2 ; xmm#, mempos
  315. movaps [outq+(%2-8)*16], m%1
  316. %endmacro
  ; UNSPILL reg, slot : reload vector register m<reg> from its slot.
  317. %macro UNSPILL 2
  318. movaps m%1, [outq+(%2-8)*16]
  319. %endmacro
  ; The scalar in-place macro serves as pass 6 in this build.
  320. %define PASS6 PASS6_AND_PERMUTE
  ; PASS5 (x86-32 variant)
  ;   Applies BUTTERFLY3 to each of the eight vectors left by pass 4 and
  ;   stores every result to its slot in the output buffer (see SPILL).
  ;   On entry m3 still holds the ps_p1p1m1m1 mask loaded in pass 4 of
  ;   DCT32_FUNC; m2 is reloaded with the pass 5 multipliers.
  ;   Slots written: 8, 14, 12, 13, 10, 11, 9, 15 - i.e. the whole buffer.
  ;   Vectors spilled during pass 4 (slots 9, 10, 11) are reloaded first.
  321. %macro PASS5 0
  322. movaps m2, [ps_cos_vec+160]
  ; Turn the (0,0,s,s) mask into (0,s,0,s).
  323. shufps m3, m3, 0xcc
  324. BUTTERFLY3 m5, m3, m2, m1
  325. SPILL 5, 8
  326. UNSPILL 1, 9
  327. BUTTERFLY3 m1, m3, m2, m5
  328. SPILL 1, 14
  329. BUTTERFLY3 m4, m3, m2, m5
  330. SPILL 4, 12
  331. BUTTERFLY3 m7, m3, m2, m5
  332. SPILL 7, 13
  333. UNSPILL 5, 10
  334. BUTTERFLY3 m5, m3, m2, m7
  335. SPILL 5, 10
  336. UNSPILL 4, 11
  337. BUTTERFLY3 m4, m3, m2, m7
  338. SPILL 4, 11
  339. BUTTERFLY3 m6, m3, m2, m7
  340. SPILL 6, 9
  341. BUTTERFLY3 m0, m3, m2, m7
  342. SPILL 0, 15
  343. %endmacro
  ; End of the ARCH_X86_64 / ARCH_X86_32 specific definitions.
  344. %endif
  ; DCT32_FUNC
  ;   Emits one dct32_float function for the instruction set selected by the
  ;   preceding INIT_XMM (instantiated for sse and sse2 at the end of the file).
  ;   Passes 1-4 are written here on 128-bit vectors (four floats each);
  ;   passes 5 and 6 come from the architecture-specific PASS5 / PASS6 macros.
  ;   SPILL / UNSPILL park a vector in slots 8-15: a register rename on
  ;   x86-64, a store to / load from the output buffer on x86-32.
  ;   Passes 1 and 2 are interleaved so that at most eight vectors are live in
  ;   m0-m7 at a time.
  ;   The movaps loads need [inq] to be 16-byte aligned; on x86-32 the spill
  ;   stores need the same of [outq].
  345. ; void ff_dct32_float_sse(FFTSample *out, const FFTSample *in)
  346. %macro DCT32_FUNC 0
  347. cglobal dct32_float, 2, 3, 16, out, in, tmp
  348. ; pass 1
  ; Each butterfly pairs four inputs with the mirrored four from the other
  ; end of the array (LOAD_INV loads them in reversed order).
  349. movaps m0, [inq+0]
  350. LOAD_INV m1, [inq+112]
  351. BUTTERFLY m0, m1, [ps_cos_vec], m3
  352. movaps m7, [inq+64]
  353. LOAD_INV m4, [inq+48]
  354. BUTTERFLY m7, m4, [ps_cos_vec+32], m3
  355. ; pass 2
  ; m2 keeps the first pass 2 row; it is reused after the second half of
  ; pass 1 below.
  356. movaps m2, [ps_cos_vec+64]
  357. BUTTERFLY m1, m4, m2, m3
  358. SPILL 1, 11
  359. SPILL 4, 8
  360. ; pass 1
  ; Second half of pass 1, now that m1 and m4 are free again.
  361. movaps m1, [inq+16]
  362. LOAD_INV m6, [inq+96]
  363. BUTTERFLY m1, m6, [ps_cos_vec+16], m3
  364. movaps m4, [inq+80]
  365. LOAD_INV m5, [inq+32]
  366. BUTTERFLY m4, m5, [ps_cos_vec+48], m3
  367. ; pass 2
  368. BUTTERFLY m0, m7, m2, m3
  369. movaps m2, [ps_cos_vec+80]
  370. BUTTERFLY m6, m5, m2, m3
  371. BUTTERFLY m1, m4, m2, m3
  372. ; pass 3
  ; The second operand of every pass 3 butterfly is reversed (0x1b) first.
  373. movaps m2, [ps_cos_vec+96]
  374. shufps m1, m1, 0x1b
  375. BUTTERFLY m0, m1, m2, m3
  376. SPILL 0, 15
  377. SPILL 1, 14
  378. UNSPILL 0, 8
  379. shufps m5, m5, 0x1b
  380. BUTTERFLY m0, m5, m2, m3
  381. UNSPILL 1, 11
  382. shufps m6, m6, 0x1b
  383. BUTTERFLY m1, m6, m2, m3
  384. SPILL 1, 11
  385. shufps m4, m4, 0x1b
  386. BUTTERFLY m7, m4, m2, m3
  387. ; pass 4
  ; m3 = sign mask, m2 = pass 4 multipliers, m1 = scratch for all eight calls.
  ; m3 is left loaded; the x86-32 PASS5 reshuffles it into its own mask.
  388. movaps m3, [ps_p1p1m1m1+0]
  389. movaps m2, [ps_cos_vec+128]
  390. BUTTERFLY2 m5, m3, m2, m1
  391. BUTTERFLY2 m0, m3, m2, m1
  392. SPILL 0, 9
  393. BUTTERFLY2 m6, m3, m2, m1
  394. SPILL 6, 10
  395. UNSPILL 0, 11
  396. BUTTERFLY2 m0, m3, m2, m1
  397. SPILL 0, 11
  398. BUTTERFLY2 m4, m3, m2, m1
  399. BUTTERFLY2 m7, m3, m2, m1
  400. UNSPILL 6, 14
  401. BUTTERFLY2 m6, m3, m2, m1
  402. UNSPILL 0, 15
  403. BUTTERFLY2 m0, m3, m2, m1
  ; Architecture-specific tail: see the PASS5 / PASS6 definitions.
  404. PASS5
  405. PASS6
  406. RET
  407. %endmacro
  ; LOAD_INV dst, src
  ;   Loads the four floats at src into dst in reversed element order
  ;   (shuffle immediate 0x1b selects elements 3,2,1,0).
  ;   %1 (dst) : destination vector register
  ;   %2 (src) : source operand (a memory operand in every use in this file)
  ;   Emits nothing if neither sse2 nor sse is the active cpu flag.
  408. %macro LOAD_INV 2
  ; SSE2: pshufd loads and permutes in a single instruction.
  409. %if cpuflag(sse2)
  410. pshufd %1, %2, 0x1b
  ; SSE only: no pshufd available, so load first and then shuffle in place.
  411. %elif cpuflag(sse)
  412. movaps %1, %2
  413. shufps %1, %1, 0x1b
  414. %endif
  415. %endmacro
  ; Instantiate the XMM kernels. The plain SSE version is only built for
  ; x86-32; the SSE2 version is built for every target.
  416. %if ARCH_X86_32
  417. INIT_XMM sse
  418. DCT32_FUNC
  419. %endif
  420. INIT_XMM sse2
  421. DCT32_FUNC