fft.asm 25 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093
  1. ;******************************************************************************
  2. ;* FFT transform with SSE/3DNow optimizations
  3. ;* Copyright (c) 2008 Loren Merritt
  4. ;* Copyright (c) 2011 Vitor Sessak
  5. ;*
  6. ;* This algorithm (though not any of the implementation details) is
  7. ;* based on libdjbfft by D. J. Bernstein.
  8. ;*
  9. ;* This file is part of FFmpeg.
  10. ;*
  11. ;* FFmpeg is free software; you can redistribute it and/or
  12. ;* modify it under the terms of the GNU Lesser General Public
  13. ;* License as published by the Free Software Foundation; either
  14. ;* version 2.1 of the License, or (at your option) any later version.
  15. ;*
  16. ;* FFmpeg is distributed in the hope that it will be useful,
  17. ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
  18. ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  19. ;* Lesser General Public License for more details.
  20. ;*
  21. ;* You should have received a copy of the GNU Lesser General Public
  22. ;* License along with FFmpeg; if not, write to the Free Software
  23. ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  24. ;******************************************************************************
  25. ; These functions are not individually interchangeable with the C versions.
  26. ; While C takes arrays of FFTComplex, SSE/3DNow leave intermediate results
  27. ; in blocks as convenient to the vector size.
  28. ; i.e. {4x real, 4x imaginary, 4x real, ...} (or 2x respectively)
  29. %include "libavutil/x86/x86util.asm"
  ; Directive used for the pointer-sized struc fields below:
  ; resq on x86-64, resd otherwise.
  30. %if ARCH_X86_64
  31. %define pointer resq
  32. %else
  33. %define pointer resd
  34. %endif
  ; Field offsets of the FFT context, used as [reg + FFTContext.field] below.
  ; NOTE(review): presumably has to mirror the C-side struct layout - confirm.
  35. struc FFTContext
  36. .nbits: resd 1 ; read by fft_calc, fft_permute and imdct_half
  37. .reverse: resd 1
  38. .revtab: pointer 1 ; table of 16-bit indices (loaded with movzx word)
  39. .tmpbuf: pointer 1 ; scratch buffer that fft_permute scatters into
  40. .mdctsize: resd 1 ; read by imdct_calc and imdct_half
  41. .mdctbits: resd 1
  42. .tcos: pointer 1 ; cos table read by imdct_half
  43. .tsin: pointer 1 ; sin table read by imdct_half
  44. .fftperm: pointer 1
  45. .fftcalc: pointer 1
  46. .imdctcalc:pointer 1
  47. .imdcthalf:pointer 1 ; function pointer called by imdct_calc
  48. endstruc
  ; Read-only vector constants. Entries written as 1<<31 are IEEE-754 single
  ; precision sign bits: XORing a float lane with them negates that lane.
  ; NOTE(review): the "32" argument is presumably the section alignment - confirm in x86inc.
  49. SECTION_RODATA 32
  50. %define M_SQRT1_2 0.70710678118654752440
  51. %define M_COS_PI_1_8 0.923879532511287
  52. %define M_COS_PI_3_8 0.38268343236509
  ; Twiddle factors multiplied into m2/m3 by fft16_avx.
  53. ps_cos16_1: dd 1.0, M_COS_PI_1_8, M_SQRT1_2, M_COS_PI_3_8, 1.0, M_COS_PI_1_8, M_SQRT1_2, M_COS_PI_3_8
  54. ps_cos16_2: dd 0, M_COS_PI_3_8, M_SQRT1_2, M_COS_PI_1_8, 0, -M_COS_PI_3_8, -M_SQRT1_2, -M_COS_PI_1_8
  ; sqrt(1/2) multipliers used by T8_SSE and the 3DNow fft8.
  55. ps_root2: times 8 dd M_SQRT1_2
  56. ps_root2mppm: dd -M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, -M_SQRT1_2, -M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, -M_SQRT1_2
  ; Sign mask used by T4_SSE: negates lane 2 of each group of four.
  57. ps_p1p1m1p1: dd 0, 0, 1<<31, 0, 0, 0, 1<<31, 0
  ; vpermilps control vectors used by T8_AVX.
  58. perm1: dd 0x00, 0x02, 0x03, 0x01, 0x03, 0x00, 0x02, 0x01
  59. perm2: dd 0x00, 0x01, 0x02, 0x03, 0x01, 0x00, 0x02, 0x03
  ; Multiplier (low half: signs, high half: sqrt(1/2)) and sign mask used by T8_AVX.
  60. ps_p1p1m1p1root2: dd 1.0, 1.0, -1.0, 1.0, M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, M_SQRT1_2
  61. ps_m1m1p1m1p1m1m1m1: dd 1<<31, 1<<31, 0, 1<<31, 0, 1<<31, 1<<31, 1<<31
  ; 64-bit sign mask (negates the low dword) used by the 3DNow code.
  62. ps_m1p1: dd 1<<31, 0
  ; External symbols defined outside this file.
  ; ps_neg is used as an XOR mask by imdct_calc and imdct_half.
  63. cextern ps_neg
  ; Declare cos_16, cos_32, ..., cos_65536 (13 symbols, i doubling each time).
  64. %assign i 16
  65. %rep 13
  66. cextern cos_ %+ i
  67. %assign i i<<1
  68. %endrep
  ; From here on `pointer` emits initialised pointer-sized data (dq/dd) instead
  ; of reserving struc space; DECL_FFT uses it to emit the dispatch tables.
  69. %if ARCH_X86_64
  70. %define pointer dq
  71. %else
  72. %define pointer dd
  73. %endif
  ; IF0/IF1: conditional-emit helpers, invoked as "IF%1 <instruction>" where
  ; the macro argument %1 is 0 or 1. IF0 discards its argument, IF1 emits it.
  74. %macro IF0 1+
  75. %endmacro
  76. %macro IF1 1+
  77. %1
  78. %endmacro
  79. SECTION .text
  ; Radix-2 butterfly: z0 = mem0 + mem1, z1 = mem0 - mem1.
  80. %macro T2_3DNOW 4 ; z0, z1, mem0, mem1
  81. mova %1, %3
  82. mova %2, %1 ; keep a copy of mem0 for the difference
  83. pfadd %1, %4
  84. pfsub %2, %4
  85. %endmacro
  ; Combining stage on 3DNow registers that each hold one {re,im} pair.
  ; Side effect: the movd/punpckhdq pair below swaps the two dwords of %3
  ; through memory and overwrites the dword at [r0+12].
  ; Clobbers %5 and %6. The trailing SWAP (from the included x86inc/x86util
  ; macros) exchanges %3 and %6, so the {r2,i2} result is reached through %3.
  86. %macro T4_3DNOW 6 ; z0, z1, z2, z3, tmp0, tmp1
  87. mova %5, %3
  88. pfsub %3, %4
  89. pfadd %5, %4 ; {t6,t5}
  90. pxor %3, [ps_m1p1] ; {t8,t7}
  91. mova %6, %1
  92. movd [r0+12], %3 ; spill the low dword so it can be re-read as a high dword
  93. punpckhdq %3, [r0+8] ; %3 = {old high dword, old low dword}
  94. pfadd %1, %5 ; {r0,i0}
  95. pfsub %6, %5 ; {r2,i2}
  96. mova %4, %2
  97. pfadd %2, %3 ; {r1,i1}
  98. pfsub %4, %3 ; {r3,i3}
  99. SWAP %3, %6
  100. %endmacro
  ; 8-point FFT on two ymm registers.
  101. ; in: %1 = {r0,i0,r2,i2,r4,i4,r6,i6}
  102. ; %2 = {r1,i1,r3,i3,r5,i5,r7,i7}
  103. ; %3, %4, %5 tmp
  104. ; out: %1 = {r0,r1,r2,r3,i0,i1,i2,i3}
  105. ; %2 = {r4,r5,r6,r7,i4,i5,i6,i7}
  106. %macro T8_AVX 5
  107. vsubps %5, %1, %2 ; v = %1 - %2
  108. vaddps %3, %1, %2 ; w = %1 + %2
  109. vmulps %2, %5, [ps_p1p1m1p1root2] ; v *= vals1
  110. vpermilps %2, %2, [perm1]
  111. vblendps %1, %2, %3, 0x33 ; q = {w1,w2,v4,v2,w5,w6,v7,v6}
  112. vshufps %5, %3, %2, 0x4e ; r = {w3,w4,v1,v3,w7,w8,v8,v5}
  113. vsubps %4, %5, %1 ; s = r - q
  114. vaddps %1, %5, %1 ; u = r + q
  115. vpermilps %1, %1, [perm2] ; k = {u1,u2,u3,u4,u6,u5,u7,u8}
  116. vshufps %5, %4, %1, 0xbb
  117. vshufps %3, %4, %1, 0xee
  118. vperm2f128 %3, %3, %5, 0x13
  119. vxorps %4, %4, [ps_m1m1p1m1p1m1m1m1] ; s *= {-1,-1,1,-1,1,-1,-1,-1} (sign-bit mask)
  120. vshufps %2, %1, %4, 0xdd
  121. vshufps %1, %1, %4, 0x88
  122. vperm2f128 %4, %2, %1, 0x02 ; v = {k1,k3,s1,s3,k2,k4,s2,s4}
  123. vperm2f128 %1, %1, %2, 0x13 ; w = {k6,k8,s6,s8,k5,k7,s5,s7}
  124. vsubps %5, %1, %3
  125. vblendps %1, %5, %1, 0x55 ; w -= {0,s7,0,k7,0,s8,0,k8}
  126. vsubps %2, %4, %1 ; %2 = v - w
  127. vaddps %1, %4, %1 ; %1 = v + w
  128. %endmacro
  129. ; In SSE mode do one fft4 transform
  130. ; in: %1={r0,i0,r2,i2} %2={r1,i1,r3,i3}
  131. ; out: %1={r0,r1,r2,r3} %2={i0,i1,i2,i3}
  132. ;
  133. ; In AVX mode do two fft4 transforms
  134. ; in: %1={r0,i0,r2,i2,r4,i4,r6,i6} %2={r1,i1,r3,i3,r5,i5,r7,i7}
  135. ; out: %1={r0,r1,r2,r3,r4,r5,r6,r7} %2={i0,i1,i2,i3,i4,i5,i6,i7}
  ; %3 is a temporary and is clobbered.
  ; NOTE(review): the three-operand form of the SSE mnemonics presumably relies
  ; on the instruction wrappers from the included x86inc - confirm.
  136. %macro T4_SSE 3
  137. subps %3, %1, %2 ; {t3,t4,-t8,t7}
  138. addps %1, %1, %2 ; {t1,t2,t6,t5}
  139. xorps %3, %3, [ps_p1p1m1p1] ; flip the sign of lane 2: -t8 -> t8
  140. shufps %2, %1, %3, 0xbe ; {t6,t5,t7,t8}
  141. shufps %1, %1, %3, 0x44 ; {t1,t2,t3,t4}
  142. subps %3, %1, %2 ; {r2,i2,r3,i3}
  143. addps %1, %1, %2 ; {r0,i0,r1,i1}
  144. shufps %2, %1, %3, 0xdd ; {i0,i1,i2,i3}
  145. shufps %1, %1, %3, 0x88 ; {r0,r1,r2,r3}
  146. %endmacro
  147. ; In SSE mode do one FFT8
  148. ; in: %1={r0,r1,r2,r3} %2={i0,i1,i2,i3} %3={r4,i4,r6,i6} %4={r5,i5,r7,i7}
  149. ; out: %1={r0,r1,r2,r3} %2={i0,i1,i2,i3} %3={r4,r5,r6,r7} %4={i4,i5,i6,i7}
  150. ;
  151. ; In AVX mode do two FFT8
  152. ; in: %1={r0,i0,r2,i2,r8, i8, r10,i10} %2={r1,i1,r3,i3,r9, i9, r11,i11}
  153. ; %3={r4,i4,r6,i6,r12,i12,r14,i14} %4={r5,i5,r7,i7,r13,i13,r15,i15}
  154. ; out: %1={r0,r1,r2,r3,r8, r9, r10,r11} %2={i0,i1,i2,i3,i8, i9, i10,i11}
  155. ; %3={r4,r5,r6,r7,r12,r13,r14,r15} %4={i4,i5,i6,i7,i12,i13,i14,i15}
  ; %5 and %6 are temporaries and are clobbered.
  156. %macro T8_SSE 6
  157. addps %6, %3, %4 ; {t1,t2,t3,t4}
  158. subps %3, %3, %4 ; {r5,i5,r7,i7}
  159. shufps %4, %3, %3, 0xb1 ; {i5,r5,i7,r7}
  160. mulps %3, %3, [ps_root2mppm] ; {-r5,i5,r7,-i7}
  161. mulps %4, %4, [ps_root2]
  162. addps %3, %3, %4 ; {t8,t7,ta,t9}
  163. shufps %4, %6, %3, 0x9c ; {t1,t4,t7,ta}
  164. shufps %6, %6, %3, 0x36 ; {t3,t2,t9,t8}
  165. subps %3, %6, %4 ; {t6,t5,tc,tb}
  166. addps %6, %6, %4 ; {t1,t2,t9,ta}
  167. shufps %5, %6, %3, 0x8d ; {t2,ta,t6,tc}
  168. shufps %6, %6, %3, 0xd8 ; {t1,t9,t5,tb}
  169. subps %3, %1, %6 ; {r4,r5,r6,r7}
  170. addps %1, %1, %6 ; {r0,r1,r2,r3}
  171. subps %4, %2, %5 ; {i4,i5,i6,i7}
  172. addps %2, %2, %5 ; {i0,i1,i2,i3}
  173. %endmacro
  ; Combining pass over eight vectors addressed through Z()/Z2().
  ; %1: 1 = load m4-m7 from Z(4), Z(5), Z2(6), Z2(7);
  ;     0 = the caller has already placed those values in m4-m7.
  ; %2, %3: operands providing the wre / wim factor vectors.
  ; Rotates the (m4,m5) and (m6,m7) pairs by wre/wim, combines them with
  ; Z(0)..Z(3) and writes all eight results back to Z(0)..Z(5), Z2(6), Z2(7).
  ; Clobbers m0-m7.
  174. ; scheduled for cpu-bound sizes
  175. %macro PASS_SMALL 3 ; (to load m4-m7), wre, wim
  176. IF%1 mova m4, Z(4)
  177. IF%1 mova m5, Z(5)
  178. mova m0, %2 ; wre
  179. mova m1, %3 ; wim
  180. mulps m2, m4, m0 ; r2*wre
  181. IF%1 mova m6, Z2(6)
  182. mulps m3, m5, m1 ; i2*wim
  183. IF%1 mova m7, Z2(7)
  184. mulps m4, m4, m1 ; r2*wim
  185. mulps m5, m5, m0 ; i2*wre
  186. addps m2, m2, m3 ; r2*wre + i2*wim
  187. mulps m3, m1, m7 ; i3*wim
  188. subps m5, m5, m4 ; i2*wre - r2*wim
  189. mulps m1, m1, m6 ; r3*wim
  190. mulps m4, m0, m6 ; r3*wre
  191. mulps m0, m0, m7 ; i3*wre
  192. subps m4, m4, m3 ; r3*wre - i3*wim
  193. mova m3, Z(0)
  194. addps m0, m0, m1 ; i3*wre + r3*wim
  195. subps m1, m4, m2 ; t3
  196. addps m4, m4, m2 ; t5
  197. subps m3, m3, m4 ; r2
  198. addps m4, m4, Z(0) ; r0
  199. mova m6, Z(2)
  200. mova Z(4), m3
  201. mova Z(0), m4
  202. subps m3, m5, m0 ; t4
  203. subps m4, m6, m3 ; r3
  204. addps m3, m3, m6 ; r1
  205. mova Z2(6), m4
  206. mova Z(2), m3
  207. mova m2, Z(3)
  208. addps m3, m5, m0 ; t6
  209. subps m2, m2, m1 ; i3
  210. mova m7, Z(1)
  211. addps m1, m1, Z(3) ; i1
  212. mova Z2(7), m2
  213. mova Z(3), m1
  214. subps m4, m7, m3 ; i2
  215. addps m3, m3, m7 ; i0
  216. mova Z(5), m4
  217. mova Z(1), m3
  218. %endmacro
  ; Same arithmetic as PASS_SMALL, with the factor vectors read from [wq]
  ; (wre) and [wq+o1q] (wim) and all eight inputs loaded from memory.
  ; %1 = 1: store the eight results unchanged to Z(0)..Z(5), Z2(6), Z2(7).
  ; %1 = 0: Z(0) and Z(4) are still stored by the common code, then every
  ;         real/imaginary result pair is run through INTERL, which stores the
  ;         interleaved vectors to Z()/Z2() itself.
  ; Clobbers m0-m7.
  219. ; scheduled to avoid store->load aliasing
  220. %macro PASS_BIG 1 ; (!interleave)
  221. mova m4, Z(4) ; r2
  222. mova m5, Z(5) ; i2
  223. mova m0, [wq] ; wre
  224. mova m1, [wq+o1q] ; wim
  225. mulps m2, m4, m0 ; r2*wre
  226. mova m6, Z2(6) ; r3
  227. mulps m3, m5, m1 ; i2*wim
  228. mova m7, Z2(7) ; i3
  229. mulps m4, m4, m1 ; r2*wim
  230. mulps m5, m5, m0 ; i2*wre
  231. addps m2, m2, m3 ; r2*wre + i2*wim
  232. mulps m3, m1, m7 ; i3*wim
  233. mulps m1, m1, m6 ; r3*wim
  234. subps m5, m5, m4 ; i2*wre - r2*wim
  235. mulps m4, m0, m6 ; r3*wre
  236. mulps m0, m0, m7 ; i3*wre
  237. subps m4, m4, m3 ; r3*wre - i3*wim
  238. mova m3, Z(0)
  239. addps m0, m0, m1 ; i3*wre + r3*wim
  240. subps m1, m4, m2 ; t3
  241. addps m4, m4, m2 ; t5
  242. subps m3, m3, m4 ; r2
  243. addps m4, m4, Z(0) ; r0
  244. mova m6, Z(2)
  245. mova Z(4), m3
  246. mova Z(0), m4
  247. subps m3, m5, m0 ; t4
  248. subps m4, m6, m3 ; r3
  249. addps m3, m3, m6 ; r1
  250. IF%1 mova Z2(6), m4
  251. IF%1 mova Z(2), m3
  252. mova m2, Z(3)
  253. addps m5, m5, m0 ; t6
  254. subps m2, m2, m1 ; i3
  255. mova m7, Z(1)
  256. addps m1, m1, Z(3) ; i1
  257. IF%1 mova Z2(7), m2
  258. IF%1 mova Z(3), m1
  259. subps m6, m7, m5 ; i2
  260. addps m5, m5, m7 ; i0
  261. IF%1 mova Z(5), m6
  262. IF%1 mova Z(1), m5
  263. %if %1==0
  264. INTERL m1, m3, m7, Z, 2 ; (i1, r1) -> Z(2), Z(3)
  265. INTERL m2, m4, m0, Z2, 6 ; (i3, r3) -> Z2(6), Z2(7)
  266. mova m1, Z(0) ; reload r0 stored above
  267. mova m2, Z(4) ; reload r2 stored above
  268. INTERL m5, m1, m3, Z, 0 ; (i0, r0) -> Z(0), Z(1)
  269. INTERL m6, m2, m7, Z, 4 ; (i2, r2) -> Z(4), Z(5)
  270. %endif
  271. %endmacro
  ; Interleave the dwords of %1 and %2:
  ; %1 = low-dword interleave, %3 = high-dword interleave. %2 is unchanged.
  272. %macro PUNPCK 3
  273. mova %3, %1
  274. punpckldq %1, %2
  275. punpckhdq %3, %2
  276. %endmacro
  ; Addressing for the small fixed-size transforms below: vector x lives at
  ; r0 + mmsize*x (contiguous). Z2 is identical to Z here; ZH addresses the
  ; upper half of vector x. These are redefined further down for the passes.
  277. %define Z(x) [r0+mmsize*x]
  278. %define Z2(x) [r0+mmsize*x]
  279. %define ZH(x) [r0+mmsize*x+mmsize/2]
  280. INIT_YMM avx
  281. %if HAVE_AVX_EXTERNAL
  ; fft8_avx: in-place 8-point FFT of the two vectors at r0.
  ; In:  r0 = data pointer (accessed with mova, so it must be vector-aligned).
  ; Clobbers m0-m4. Input/output layouts are documented on T8_AVX.
  282. align 16
  283. fft8_avx:
  284. mova m0, Z(0)
  285. mova m1, Z(1)
  286. T8_AVX m0, m1, m2, m3, m4
  287. mova Z(0), m0
  288. mova Z(1), m1
  289. ret
  ; fft16_avx: in-place 16-point FFT of the four vectors at r0.
  ; Z(2)/Z(3) go through T4_SSE (two 4-point transforms in AVX mode), Z(0)/Z(1)
  ; through T8_AVX; the T4 results are then multiplied by the ps_cos16_*
  ; factors and combined with the T8 results.
  ; Clobbers m0-m7.
  290. align 16
  291. fft16_avx:
  292. mova m2, Z(2)
  293. mova m3, Z(3)
  294. T4_SSE m2, m3, m7
  295. mova m0, Z(0)
  296. mova m1, Z(1)
  297. T8_AVX m0, m1, m4, m5, m7
  298. mova m4, [ps_cos16_1]
  299. mova m5, [ps_cos16_2]
  300. vmulps m6, m2, m4
  301. vmulps m7, m3, m5
  302. vaddps m7, m7, m6 ; m7 = m2*cos16_1 + m3*cos16_2
  303. vmulps m2, m2, m5
  304. vmulps m3, m3, m4
  305. vsubps m3, m3, m2 ; m3 = m3*cos16_1 - m2*cos16_2
  306. vblendps m2, m7, m3, 0xf0
  307. vperm2f128 m3, m7, m3, 0x21
  308. vaddps m4, m2, m3
  309. vsubps m2, m3, m2
  310. vperm2f128 m2, m2, m2, 0x01 ; swap the two 128-bit halves
  311. vsubps m3, m1, m2
  312. vaddps m1, m1, m2
  313. vsubps m5, m0, m4
  314. vaddps m0, m0, m4
  ; Store results as 128-bit halves, alternating the two source registers.
  315. vextractf128 Z(0), m0, 0
  316. vextractf128 ZH(0), m1, 0
  317. vextractf128 Z(1), m0, 1
  318. vextractf128 ZH(1), m1, 1
  319. vextractf128 Z(2), m5, 0
  320. vextractf128 ZH(2), m3, 0
  321. vextractf128 Z(3), m5, 1
  322. vextractf128 ZH(3), m3, 1
  323. ret
  ; fft32_avx: in-place 32-point FFT of the eight vectors at r0.
  ; Runs fft16_avx on Z(0)..Z(3), transforms Z(4)..Z(7) with T4_SSE + T8_SSE
  ; (two FFT8 in AVX mode), regroups the 128-bit halves and finishes with
  ; PASS_SMALL using the cos_32 table. The IF0 form of PASS_SMALL is used
  ; because m4-m7 are already in registers.
  324. align 16
  325. fft32_avx:
  326. call fft16_avx
  327. mova m0, Z(4)
  328. mova m1, Z(5)
  329. T4_SSE m0, m1, m4
  330. mova m2, Z(6)
  331. mova m3, Z(7)
  332. T8_SSE m0, m1, m2, m3, m4, m6
  333. ; m0={r0,r1,r2,r3,r8, r9, r10,r11} m1={i0,i1,i2,i3,i8, i9, i10,i11}
  334. ; m2={r4,r5,r6,r7,r12,r13,r14,r15} m3={i4,i5,i6,i7,i12,i13,i14,i15}
  335. vperm2f128 m4, m0, m2, 0x20 ; low halves of m0 and m2
  336. vperm2f128 m5, m1, m3, 0x20 ; low halves of m1 and m3
  337. vperm2f128 m6, m0, m2, 0x31 ; high halves of m0 and m2
  338. vperm2f128 m7, m1, m3, 0x31 ; high halves of m1 and m3
  339. PASS_SMALL 0, [cos_32], [cos_32+32]
  340. ret
  ; fft32_interleave_avx: fft32_avx followed by an in-place interleave of each
  ; pair of vectors (unpcklps/unpckhps), stored back as 128-bit halves.
  ; r2d counts down from 32 in steps of mmsize/4; r0 advances by two vectors
  ; per iteration and is NOT restored on return.
  341. fft32_interleave_avx:
  342. call fft32_avx
  343. mov r2d, 32
  344. .deint_loop:
  345. mova m2, Z(0)
  346. mova m3, Z(1)
  347. vunpcklps m0, m2, m3
  348. vunpckhps m1, m2, m3
  349. vextractf128 Z(0), m0, 0
  350. vextractf128 ZH(0), m1, 0
  351. vextractf128 Z(1), m0, 1
  352. vextractf128 ZH(1), m1, 1
  353. add r0, mmsize*2
  354. sub r2d, mmsize/4
  355. jg .deint_loop
  356. ret
  357. %endif
  358. INIT_XMM sse
  ; fft4_sse / fft4_avx: in-place 4-point FFT of the two vectors at r0.
  ; fft4_avx is an alias label for the same code, so the AVX dispatch table
  ; built by DECL_FFT can reference fft4 %+ SUFFIX.
  ; Clobbers m0-m2. Layouts are documented on T4_SSE.
  359. align 16
  360. fft4_avx:
  361. fft4_sse:
  362. mova m0, Z(0)
  363. mova m1, Z(1)
  364. T4_SSE m0, m1, m2
  365. mova Z(0), m0
  366. mova Z(1), m1
  367. ret
  ; fft8_sse: in-place 8-point FFT of the four vectors at r0:
  ; T4_SSE on Z(0)/Z(1), then T8_SSE folds in Z(2)/Z(3).
  ; Clobbers m0-m5.
  368. align 16
  369. fft8_sse:
  370. mova m0, Z(0)
  371. mova m1, Z(1)
  372. T4_SSE m0, m1, m2
  373. mova m2, Z(2)
  374. mova m3, Z(3)
  375. T8_SSE m0, m1, m2, m3, m4, m5
  376. mova Z(0), m0
  377. mova Z(1), m1
  378. mova Z(2), m2
  379. mova Z(3), m3
  380. ret
  ; fft16_sse: in-place 16-point FFT of the eight vectors at r0.
  ; An 8-point transform of Z(0)..Z(3) is stored back, two 4-point transforms
  ; of Z(4)/Z(5) and Z2(6)/Z2(7) stay in m4-m7, and PASS_SMALL (IF0 form: no
  ; reload of m4-m7) combines everything using the cos_16 table.
  ; Clobbers m0-m7.
  381. align 16
  382. fft16_sse:
  383. mova m0, Z(0)
  384. mova m1, Z(1)
  385. T4_SSE m0, m1, m2
  386. mova m2, Z(2)
  387. mova m3, Z(3)
  388. T8_SSE m0, m1, m2, m3, m4, m5
  389. mova m4, Z(4)
  390. mova m5, Z(5)
  391. mova Z(0), m0
  392. mova Z(1), m1
  393. mova Z(2), m2
  394. mova Z(3), m3
  395. T4_SSE m4, m5, m6
  396. mova m6, Z2(6)
  397. mova m7, Z2(7)
  398. T4_SSE m6, m7, m0
  399. PASS_SMALL 0, [cos_16], [cos_16+16]
  400. ret
  ; Emits fft4<SUFFIX> and fft8<SUFFIX> for the 3DNow instruction sets.
  ; Both work in place on the data at r0 using 64-bit mm registers and finish
  ; with PUNPCK to regroup the dwords before storing.
  ; Note that T4_3DNOW temporarily overwrites the dword at [r0+12].
  401. %macro FFT48_3DNOW 0
  402. align 16
  403. fft4 %+ SUFFIX:
  404. T2_3DNOW m0, m1, Z(0), Z(1)
  405. mova m2, Z(2)
  406. mova m3, Z(3)
  407. T4_3DNOW m0, m1, m2, m3, m4, m5
  408. PUNPCK m0, m1, m4
  409. PUNPCK m2, m3, m5
  410. mova Z(0), m0
  411. mova Z(1), m4
  412. mova Z(2), m2
  413. mova Z(3), m5
  414. ret
  415. align 16
  416. fft8 %+ SUFFIX:
  417. T2_3DNOW m0, m1, Z(0), Z(1)
  418. mova m2, Z(2)
  419. mova m3, Z(3)
  420. T4_3DNOW m0, m1, m2, m3, m4, m5
  421. mova Z(0), m0 ; park m0/m2 in memory; both registers are reused below
  422. mova Z(2), m2
  423. T2_3DNOW m4, m5, Z(4), Z(5)
  424. T2_3DNOW m6, m7, Z2(6), Z2(7)
  425. PSWAPD m0, m5
  426. PSWAPD m2, m7
  427. pxor m0, [ps_m1p1]
  428. pxor m2, [ps_m1p1]
  429. pfsub m5, m0
  430. pfadd m7, m2
  431. pfmul m5, [ps_root2]
  432. pfmul m7, [ps_root2]
  433. T4_3DNOW m1, m3, m5, m7, m0, m2
  434. mova Z(5), m5 ; spill; re-read as a memory operand by the final PUNPCKs
  435. mova Z2(7), m7
  436. mova m0, Z(0)
  437. mova m2, Z(2)
  438. T4_3DNOW m0, m2, m4, m6, m5, m7
  439. PUNPCK m0, m1, m5
  440. PUNPCK m2, m3, m7
  441. mova Z(0), m0
  442. mova Z(1), m5
  443. mova Z(2), m2
  444. mova Z(3), m7
  445. PUNPCK m4, Z(5), m5
  446. PUNPCK m6, Z2(7), m7
  447. mova Z(4), m4
  448. mova Z(5), m5
  449. mova Z2(6), m6
  450. mova Z2(7), m7
  451. ret
  452. %endmacro
  ; The 3DNow fft4/fft8 kernels are only built for 32-bit x86, once per
  ; instruction-set variant.
  453. %if ARCH_X86_32
  454. INIT_MMX 3dnowext
  455. FFT48_3DNOW
  456. INIT_MMX 3dnow
  457. FFT48_3DNOW
  458. %endif
  ; Addressing for the pass macros (see DECL_PASS): vectors are no longer
  ; contiguous. Even/odd x select the first/second vector at zcq, and (x&6)
  ; selects a block at stride o1q. Z2 is used for x = 6,7 with the precomputed
  ; offset o3q, since o1q*6 is not an encodable index scale (DECL_PASS sets
  ; o1q = n*8 and o3q = n*48, i.e. o3q == 6*o1q). The *H variants address the
  ; upper half of the vector.
  459. %define Z(x) [zcq + o1q*(x&6) + mmsize*(x&1)]
  460. %define Z2(x) [zcq + o3q + mmsize*(x&1)]
  461. %define ZH(x) [zcq + o1q*(x&6) + mmsize*(x&1) + mmsize/2]
  462. %define Z2H(x) [zcq + o3q + mmsize*(x&1) + mmsize/2]
  ; DECL_PASS name, payload
  ; Emits a label <name> whose body runs <payload> (a PASS_* macro invocation)
  ; in a loop. It is entered by jmp from the fftN functions of DECL_FFT.
  ; DEFINE_ARGS names the first five registers zc (data), w (factor table),
  ; n (count), o1 and o3 (offsets used by the Z()/Z2() macros).
  ; Per iteration: zc advances by two vectors, w by one vector, and n
  ; decreases by mmsize/8; the loop ends when n <= 0.
  463. %macro DECL_PASS 2+ ; name, payload
  464. align 16
  465. %1:
  466. DEFINE_ARGS zc, w, n, o1, o3
  467. lea o3q, [nq*3]
  468. lea o1q, [nq*8] ; o1 = n*8
  469. shl o3q, 4 ; o3 = n*48 = 6*o1
  470. .loop:
  471. %2
  472. add zcq, mmsize*2
  473. add wq, mmsize
  474. sub nd, mmsize/8
  475. jg .loop
  476. rep ret
  477. %endmacro
  ; FFT_DISPATCH tabsuffix, reg
  ; Calls entry (reg - 2) of dispatch_tab<tabsuffix>, where reg holds nbits.
  ; The table entries are gprsize bytes wide. Under PIC the table stores
  ; section-relative offsets (see SECTION_REL), so the section base $$ is
  ; added back before the call.
  ; Uses r2 (and r3 under PIC) as scratch.
  478. %macro FFT_DISPATCH 2; clobbers 5 GPRs, 8 XMMs
  479. lea r2, [dispatch_tab%1]
  480. mov r2, [r2 + (%2q-2)*gprsize]
  481. %ifdef PIC
  482. lea r3, [$$]
  483. add r2, r3
  484. %endif
  485. call r2
  486. %endmacro ; FFT_DISPATCH
  487. INIT_YMM avx
  488. %if HAVE_AVX_EXTERNAL
  ; INTERL_AVX a, b, tmp, Zmacro, idx
  ; Interleaves the floats of b and a (b supplies the even elements) and stores
  ; the result as four 128-bit halves to Zmacro(idx) and Zmacro(idx+1).
  ; Overwrites b and tmp.
  489. %macro INTERL_AVX 5
  490. vunpckhps %3, %2, %1
  491. vunpcklps %2, %2, %1
  492. vextractf128 %4(%5), %2, 0
  493. vextractf128 %4 %+ H(%5), %3, 0
  494. vextractf128 %4(%5 + 1), %2, 1
  495. vextractf128 %4 %+ H(%5 + 1), %3, 1
  496. %endmacro
  497. %define INTERL INTERL_AVX
  ; AVX pass bodies: plain stores (PASS_BIG 1) and interleaving (PASS_BIG 0).
  498. DECL_PASS pass_avx, PASS_BIG 1
  499. DECL_PASS pass_interleave_avx, PASS_BIG 0
  ; fft_calc (AVX): r0 = FFTContext*, r1 = data pointer.
  ; Loads nbits from the context and dispatches through the
  ; _interleave<SUFFIX> table with r0 = data, r1 = nbits.
  ; NOTE(review): unlike FFT_CALC_FUNC there is no interleave fix-up for the
  ; small sizes, whose table entries are the plain fft4/fft8/fft16 kernels;
  ; presumably callers only select this version for large enough nbits - confirm.
  500. cglobal fft_calc, 2,5,8
  501. mov r3d, [r0 + FFTContext.nbits]
  502. mov r0, r1
  503. mov r1, r3
  504. FFT_DISPATCH _interleave %+ SUFFIX, r1
  505. REP_RET
  506. %endif
  507. INIT_XMM sse
  ; INTERL_SSE a, b, tmp, Zmacro, idx
  ; Interleaves the floats of b and a (b supplies the even elements) and stores
  ; the low part to Zmacro(idx) and the high part to Zmacro(idx+1).
  ; Overwrites b and tmp.
  508. %macro INTERL_SSE 5
  509. mova %3, %2
  510. unpcklps %2, %1
  511. unpckhps %3, %1
  512. mova %4(%5), %2
  513. mova %4(%5+1), %3
  514. %endmacro
  515. %define INTERL INTERL_SSE
  ; SSE pass bodies: plain stores (PASS_BIG 1) and interleaving (PASS_BIG 0).
  516. DECL_PASS pass_sse, PASS_BIG 1
  517. DECL_PASS pass_interleave_sse, PASS_BIG 0
  ; Emits fft_calc for the current instruction set.
  ; In: r0 = FFTContext*, r1 = data pointer.
  ; The transform is dispatched through the _interleave<SUFFIX> table. The
  ; smallest table entries are the plain fft4/fft8/fft16 kernels (see DECL_FFT),
  ; so for nbits <= 3+(mmsize/16) the buffer is interleaved here afterwards.
  518. %macro FFT_CALC_FUNC 0
  519. cglobal fft_calc, 2,5,8
  520. mov r3d, [r0 + FFTContext.nbits]
  521. PUSH r1 ; keep data pointer and nbits across the dispatched call
  522. PUSH r3
  523. mov r0, r1
  524. mov r1, r3
  525. FFT_DISPATCH _interleave %+ SUFFIX, r1
  526. POP rcx ; rcx = nbits
  527. POP r4 ; r4 = data pointer
  528. cmp rcx, 3+(mmsize/16)
  529. jg .end ; larger sizes were already interleaved by their pass
  530. mov r2, -1
  531. add rcx, 3
  532. shl r2, cl ; r2 = -(8 << nbits): negative byte size of the buffer
  533. sub r4, r2 ; r4 = end of buffer; r2 runs from -size up to 0
  534. .loop:
  535. %if mmsize == 8
  536. PSWAPD m0, [r4 + r2 + 4]
  537. mova [r4 + r2 + 4], m0
  538. %else
  539. movaps xmm0, [r4 + r2]
  540. movaps xmm1, xmm0
  541. unpcklps xmm0, [r4 + r2 + 16]
  542. unpckhps xmm1, [r4 + r2 + 16]
  543. movaps [r4 + r2], xmm0
  544. movaps [r4 + r2 + 16], xmm1
  545. %endif
  546. add r2, mmsize*2
  547. jl .loop
  548. .end:
  549. %if cpuflag(3dnow)
  550. femms
  551. RET
  552. %else
  553. REP_RET
  554. %endif
  555. %endmacro
  ; Instantiate fft_calc: 3DNow variants on 32-bit x86 only, SSE always.
  556. %if ARCH_X86_32
  557. INIT_MMX 3dnow
  558. FFT_CALC_FUNC
  559. INIT_MMX 3dnowext
  560. FFT_CALC_FUNC
  561. %endif
  562. INIT_XMM sse
  563. FFT_CALC_FUNC
  ; fft_permute (SSE): r0 = FFTContext*, r1 = data pointer.
  ; Scatters the 8-byte elements of the data into tmpbuf at the positions given
  ; by the 16-bit entries of revtab (two elements per iteration), then copies
  ; tmpbuf back over the data in 32-byte steps.
  ; NOTE(review): cglobal declares 1 XMM register but .loopcopy also uses
  ; xmm1 - confirm this is intentional.
  564. cglobal fft_permute, 2,7,1
  565. mov r4, [r0 + FFTContext.revtab]
  566. mov r5, [r0 + FFTContext.tmpbuf]
  567. mov ecx, [r0 + FFTContext.nbits]
  568. mov r2, 1
  569. shl r2, cl ; r2 = 1 << nbits = number of elements
  570. xor r0, r0 ; r0 = element index
  571. %if ARCH_X86_32
  572. mov r1, r1m ; reload the data pointer from its stack argument slot
  573. %endif
  574. .loop:
  575. movaps xmm0, [r1 + 8*r0] ; two consecutive 8-byte elements
  576. movzx r6, word [r4 + 2*r0]
  577. movzx r3, word [r4 + 2*r0 + 2]
  578. movlps [r5 + 8*r6], xmm0
  579. movhps [r5 + 8*r3], xmm0
  580. add r0, 2
  581. cmp r0, r2
  582. jl .loop
  583. shl r2, 3 ; element count -> byte count
  584. add r1, r2
  585. add r5, r2
  586. neg r2 ; index from the buffer ends with a negative offset counting up to 0
  587. ; nbits >= 2 (FFT4) and sizeof(FFTComplex)=8 => at least 32B
  588. .loopcopy:
  589. movaps xmm0, [r5 + r2]
  590. movaps xmm1, [r5 + r2 + 16]
  591. movaps [r1 + r2], xmm0
  592. movaps [r1 + r2 + 16], xmm1
  593. add r2, 32
  594. jl .loopcopy
  595. REP_RET
  ; Emits imdct_calc for the current instruction set.
  ; In: r0 = FFTContext*, r1 = output pointer, r2 = input pointer.
  ; Calls the context's imdcthalf function pointer with
  ; (r0, r1 + mdctsize, r2), then runs a loop that reads vectors from that
  ; region with their element order reversed and writes them to the regions
  ; before and after it; the copy written before it is also XORed with ps_neg.
  596. %macro IMDCT_CALC_FUNC 0
  597. cglobal imdct_calc, 3,5,3
  598. mov r3d, [r0 + FFTContext.mdctsize]
  599. mov r4, [r0 + FFTContext.imdcthalf]
  600. add r1, r3
  601. PUSH r3 ; preserve mdctsize and the adjusted output pointer across the call
  602. PUSH r1
  603. %if ARCH_X86_32
  604. push r2 ; 32-bit: pass the three arguments on the stack
  605. push r1
  606. push r0
  607. %else
  608. sub rsp, 8+32*WIN64 ; allocate win64 shadow space
  609. %endif
  610. call r4
  611. %if ARCH_X86_32
  612. add esp, 12
  613. %else
  614. add rsp, 8+32*WIN64
  615. %endif
  616. POP r1
  617. POP r3
  618. lea r0, [r1 + 2*r3]
  619. mov r2, r3
  620. sub r3, mmsize ; r3 walks downwards from the last vector
  621. neg r2 ; r2 walks upwards from -mdctsize to 0
  622. mova m2, [ps_neg]
  623. .loop:
  624. %if mmsize == 8
  625. PSWAPD m0, [r1 + r3]
  626. PSWAPD m1, [r0 + r2]
  627. pxor m0, m2
  628. %else
  629. mova m0, [r1 + r3]
  630. mova m1, [r0 + r2]
  631. shufps m0, m0, 0x1b ; reverse the four floats
  632. shufps m1, m1, 0x1b
  633. xorps m0, m2
  634. %endif
  635. mova [r0 + r3], m1
  636. mova [r1 + r2], m0
  637. sub r3, mmsize
  638. add r2, mmsize
  639. jl .loop
  640. %if cpuflag(3dnow)
  641. femms
  642. RET
  643. %else
  644. REP_RET
  645. %endif
  646. %endmacro
  ; Instantiate imdct_calc: 3DNow variants on 32-bit x86 only, SSE always.
  647. %if ARCH_X86_32
  648. INIT_MMX 3dnow
  649. IMDCT_CALC_FUNC
  650. INIT_MMX 3dnowext
  651. IMDCT_CALC_FUNC
  652. %endif
  653. INIT_XMM sse
  654. IMDCT_CALC_FUNC
  ; 3DNow passes (32-bit x86 only). The PASS_* macros are written with SSE
  ; mnemonics; these defines remap them to their 3DNow equivalents so the same
  ; macro bodies can be reused. The remapping stays active until the matching
  ; %undef lines after the DECL_FFT instantiations.
  655. %if ARCH_X86_32
  656. INIT_MMX 3dnow
  657. %define mulps pfmul
  658. %define addps pfadd
  659. %define subps pfsub
  660. %define unpcklps punpckldq
  661. %define unpckhps punpckhdq
  662. DECL_PASS pass_3dnow, PASS_SMALL 1, [wq], [wq+o1q]
  663. DECL_PASS pass_interleave_3dnow, PASS_BIG 0
  ; The 3dnowext build reuses the 3dnow pass code.
  664. %define pass_3dnowext pass_3dnow
  665. %define pass_interleave_3dnowext pass_interleave_3dnow
  666. %endif
  ; Under PIC the dispatch tables store offsets relative to the section start
  ; ($$); FFT_DISPATCH adds the section base back at run time.
  667. %ifdef PIC
  668. %define SECTION_REL - $$
  669. %else
  670. %define SECTION_REL
  671. %endif
  ; DECL_FFT nbits [, suffix]
  ; Emits the recursive entry points fftN<suffix><SUFFIX> for N = 1<<nbits,
  ; doubling N for 17-nbits iterations, followed by
  ; dispatch_tab<suffix><SUFFIX>, a table of pointers indexed by log2(N)-2
  ; (see FFT_DISPATCH).
  ; Each fftN calls fft(N/2) and then fft(N/4) twice, adjusting r0 between the
  ; calls, restores r0, and tail-jumps to pass<suffix><SUFFIX> with
  ; r1 = cos_N and r2d = N/8.
  ; Only the fftN labels emitted here (and fft32 when nbits >= 6) carry
  ; <suffix>; the fft4/fft8/fft16 table entries and all recursive calls use the
  ; plain <SUFFIX> kernels.
  672. %macro DECL_FFT 1-2 ; nbits, suffix
  673. %ifidn %0, 1
  674. %xdefine fullsuffix SUFFIX
  675. %else
  676. %xdefine fullsuffix %2 %+ SUFFIX
  677. %endif
  ; Table entries for the hand-written small kernels.
  678. %xdefine list_of_fft fft4 %+ SUFFIX SECTION_REL, fft8 %+ SUFFIX SECTION_REL
  679. %if %1>=5
  680. %xdefine list_of_fft list_of_fft, fft16 %+ SUFFIX SECTION_REL
  681. %endif
  682. %if %1>=6
  683. %xdefine list_of_fft list_of_fft, fft32 %+ fullsuffix SECTION_REL
  684. %endif
  685. %assign n 1<<%1
  686. %rep 17-%1
  687. %assign n2 n/2
  688. %assign n4 n/4
  689. %xdefine list_of_fft list_of_fft, fft %+ n %+ fullsuffix SECTION_REL
  690. align 16
  691. fft %+ n %+ fullsuffix:
  692. call fft %+ n2 %+ SUFFIX
  693. add r0, n*4 - (n&(-2<<%1))
  694. call fft %+ n4 %+ SUFFIX
  695. add r0, n*2 - (n2&(-2<<%1))
  696. call fft %+ n4 %+ SUFFIX
  697. sub r0, n*6 + (n2&(-2<<%1))
  698. lea r1, [cos_ %+ n]
  699. mov r2d, n4/2
  700. jmp pass %+ fullsuffix
  701. %assign n n*2
  702. %endrep
  703. %undef n
  704. align 8
  705. dispatch_tab %+ fullsuffix: pointer list_of_fft
  706. %endmacro ; DECL_FFT
  ; Instantiate the recursive FFTs and dispatch tables per instruction set.
  ; The nbits argument is the smallest size generated by the macro (AVX: 64,
  ; SSE: 32, 3DNow: 16); smaller sizes use the hand-written kernels.
  707. %if HAVE_AVX_EXTERNAL
  708. INIT_YMM avx
  709. DECL_FFT 6
  710. DECL_FFT 6, _interleave
  711. %endif
  712. INIT_XMM sse
  713. DECL_FFT 5
  714. DECL_FFT 5, _interleave
  715. %if ARCH_X86_32
  716. INIT_MMX 3dnow
  717. DECL_FFT 4
  718. DECL_FFT 4, _interleave
  719. INIT_MMX 3dnowext
  720. DECL_FFT 4
  721. DECL_FFT 4, _interleave
  722. %endif
  723. INIT_XMM sse
  ; Remove the mnemonic aliases defined for the 3DNow passes so that the
  ; code below assembles real SSE instructions again.
  724. %undef mulps
  725. %undef addps
  726. %undef subps
  727. %undef unpcklps
  728. %undef unpckhps
  ; Pre-rotation step used by imdct_half: loads input samples through the two
  ; indices %1 and %2 relative to %3, pairs them up and multiplies them with
  ; the matching entries of the tables at %4 and %5 as a complex product.
  ; Results: m0 and m2 when mmsize == 8, xmm0 and xmm1 otherwise.
  ; The non-3dnowext MMX path expects m7 to hold a sign mask (pxor ..., m7).
  729. %macro PREROTATER 5 ;-2*k, 2*k, input+n4, tcos+n8, tsin+n8
  730. %if mmsize == 8 ; j*2+2-n4, n4-2-j*2, input+n4, tcos+n8, tsin+n8
  731. PSWAPD m0, [%3+%2*4]
  732. movq m2, [%3+%1*4-8]
  733. movq m3, m0
  734. punpckldq m0, m2
  735. punpckhdq m2, m3
  736. movd m1, [%4+%1*2-4] ; tcos[j]
  737. movd m3, [%4+%2*2] ; tcos[n4-j-1]
  738. punpckldq m1, [%5+%1*2-4] ; tsin[j]
  739. punpckldq m3, [%5+%2*2] ; tsin[n4-j-1]
  740. mova m4, m0
  741. PSWAPD m5, m1
  742. pfmul m0, m1
  743. pfmul m4, m5
  744. mova m6, m2
  745. PSWAPD m5, m3
  746. pfmul m2, m3
  747. pfmul m6, m5
  748. %if cpuflag(3dnowext)
  749. pfpnacc m0, m4
  750. pfpnacc m2, m6
  751. %else
  ; Plain 3DNow has no pfpnacc: transpose, flip signs with m7, then add.
  752. SBUTTERFLY dq, 0, 4, 1
  753. SBUTTERFLY dq, 2, 6, 3
  754. pxor m4, m7
  755. pxor m6, m7
  756. pfadd m0, m4
  757. pfadd m2, m6
  758. %endif
  759. %else
  760. movaps xmm0, [%3+%2*4]
  761. movaps xmm1, [%3+%1*4-0x10]
  762. movaps xmm2, xmm0
  763. shufps xmm0, xmm1, 0x88
  764. shufps xmm1, xmm2, 0x77
  765. movlps xmm4, [%4+%2*2]
  766. movlps xmm5, [%5+%2*2+0x0]
  767. movhps xmm4, [%4+%1*2-0x8]
  768. movhps xmm5, [%5+%1*2-0x8]
  769. movaps xmm2, xmm0
  770. movaps xmm3, xmm1
  771. mulps xmm0, xmm5
  772. mulps xmm1, xmm4
  773. mulps xmm2, xmm4
  774. mulps xmm3, xmm5
  775. subps xmm1, xmm0
  776. addps xmm2, xmm3
  777. movaps xmm0, xmm1
  778. unpcklps xmm1, xmm2
  779. unpckhps xmm0, xmm2
  780. %endif
  781. %endmacro
  ; Complex multiply helper. With A = [%5+%1] and B = [%6+%1]:
  ;   %2 = %2*B - %3*A
  ;   %3 = %3*B + %2*A   (using the original value of %2)
  ; Clobbers m6 and m7. Argument %4 is not used.
  782. %macro CMUL 6 ;j, xmm0, xmm1, 3, 4, 5
  783. mulps m6, %3, [%5+%1]
  784. mulps m7, %2, [%5+%1]
  785. mulps %2, %2, [%6+%1]
  786. mulps %3, %3, [%6+%1]
  787. subps %2, %2, m6
  788. addps %3, %3, m7
  789. %endmacro
  ; Post-rotation loop of imdct_half (AVX). Each iteration handles 64 bytes at
  ; %3+%1*2 and 64 bytes at %3+%2*2: both are run through CMUL with the tables
  ; at %4/%5, one operand of each pair has its eight floats reversed, and the
  ; interleaved results are stored back as 128-bit halves.
  ; %1 counts up by 0x20 and %2 counts down by 0x20; the loop runs while the
  ; updated %1 is negative.
  790. %macro POSROTATESHUF_AVX 5 ;j, k, z+n8, tcos+n8, tsin+n8
  791. .post:
  792. vmovaps ymm1, [%3+%1*2]
  793. vmovaps ymm0, [%3+%1*2+0x20]
  794. vmovaps ymm3, [%3+%2*2]
  795. vmovaps ymm2, [%3+%2*2+0x20]
  796. CMUL %1, ymm0, ymm1, %3, %4, %5
  797. CMUL %2, ymm2, ymm3, %3, %4, %5
  798. vshufps ymm1, ymm1, ymm1, 0x1b ; reverse within each 128-bit half ...
  799. vshufps ymm3, ymm3, ymm3, 0x1b
  800. vperm2f128 ymm1, ymm1, ymm1, 0x01 ; ... then swap the halves: full reversal
  801. vperm2f128 ymm3, ymm3, ymm3, 0x01
  802. vunpcklps ymm6, ymm2, ymm1
  803. vunpckhps ymm4, ymm2, ymm1
  804. vunpcklps ymm7, ymm0, ymm3
  805. vunpckhps ymm5, ymm0, ymm3
  806. vextractf128 [%3+%1*2], ymm7, 0
  807. vextractf128 [%3+%1*2+0x10], ymm5, 0
  808. vextractf128 [%3+%1*2+0x20], ymm7, 1
  809. vextractf128 [%3+%1*2+0x30], ymm5, 1
  810. vextractf128 [%3+%2*2], ymm6, 0
  811. vextractf128 [%3+%2*2+0x10], ymm4, 0
  812. vextractf128 [%3+%2*2+0x20], ymm6, 1
  813. vextractf128 [%3+%2*2+0x30], ymm4, 1
  814. sub %2, 0x20
  815. add %1, 0x20
  816. jl .post
  817. %endmacro
  ; Post-rotation loop of imdct_half (SSE). Each iteration handles 32 bytes at
  ; %3+%1*2 and 32 bytes at %3+%2*2: both are run through CMUL with the tables
  ; at %4/%5, one operand of each pair has its four floats reversed, and the
  ; interleaved results are stored back.
  ; %1 counts up by 0x10 and %2 counts down by 0x10; the loop runs while the
  ; updated %1 is negative.
  818. %macro POSROTATESHUF 5 ;j, k, z+n8, tcos+n8, tsin+n8
  819. .post:
  820. movaps xmm1, [%3+%1*2]
  821. movaps xmm0, [%3+%1*2+0x10]
  822. CMUL %1, xmm0, xmm1, %3, %4, %5
  823. movaps xmm5, [%3+%2*2]
  824. movaps xmm4, [%3+%2*2+0x10]
  825. CMUL %2, xmm4, xmm5, %3, %4, %5
  826. shufps xmm1, xmm1, 0x1b ; reverse the four floats
  827. shufps xmm5, xmm5, 0x1b
  828. movaps xmm6, xmm4
  829. unpckhps xmm4, xmm1
  830. unpcklps xmm6, xmm1
  831. movaps xmm2, xmm0
  832. unpcklps xmm0, xmm5
  833. unpckhps xmm2, xmm5
  834. movaps [%3+%2*2], xmm6
  835. movaps [%3+%2*2+0x10], xmm4
  836. movaps [%3+%1*2], xmm0
  837. movaps [%3+%1*2+0x10], xmm2
  838. sub %2, 0x10
  839. add %1, 0x10
  840. jl .post
  841. %endmacro
  ; CMUL_3DNOW base, idx, out0, out1, tabA, tabB
  ; Loads a = [base+idx*2] and b = [base+idx*2+8], then with A = [tabA+idx] and
  ; B = [tabB+idx] computes
  ;   out0 = b*B - a*A
  ;   out1 = a*B + b*A
  ; Clobbers m6 and m7.
  842. %macro CMUL_3DNOW 6
  843. mova m6, [%1+%2*2]
  844. mova %3, [%1+%2*2+8]
  845. mova %4, m6
  846. mova m7, %3
  847. pfmul m6, [%5+%2]
  848. pfmul %3, [%6+%2]
  849. pfmul %4, [%6+%2]
  850. pfmul m7, [%5+%2]
  851. pfsub %3, m6
  852. pfadd %4, m7
  853. %endmacro
  ; Post-rotation loop of imdct_half (3DNow). Each iteration runs CMUL_3DNOW
  ; on the 16 bytes at %3+%1*2 and at %3+%2*2 and scatters the four results
  ; dword by dword across both locations (low dwords first, then the high
  ; dwords after shifting them down with psrlq).
  ; %1 counts up by 8 and %2 counts down by 8; the loop runs while the updated
  ; %1 is negative.
  854. %macro POSROTATESHUF_3DNOW 5 ;j, k, z+n8, tcos+n8, tsin+n8
  855. .post:
  856. CMUL_3DNOW %3, %1, m0, m1, %4, %5
  857. CMUL_3DNOW %3, %2, m2, m3, %4, %5
  858. movd [%3+%1*2+ 0], m0
  859. movd [%3+%2*2+12], m1
  860. movd [%3+%2*2+ 0], m2
  861. movd [%3+%1*2+12], m3
  862. psrlq m0, 32
  863. psrlq m1, 32
  864. psrlq m2, 32
  865. psrlq m3, 32
  866. movd [%3+%1*2+ 8], m0
  867. movd [%3+%2*2+ 4], m1
  868. movd [%3+%2*2+ 8], m2
  869. movd [%3+%1*2+ 4], m3
  870. sub %2, 8
  871. add %1, 8
  872. jl .post
  873. %endmacro
  ; DECL_IMDCT postrotate_macro
  ; Emits imdct_half for the current instruction set. Three phases:
  ;   1. .pre loop: PREROTATER on the input, results scattered into the output
  ;      buffer at the positions given by the 16-bit revtab entries;
  ;   2. in-place FFT of the output buffer via FFT_DISPATCH (size from nbits);
  ;   3. post-rotation via the macro passed as %1 (one of the POSROTATESHUF*).
  ; On 32-bit x86 rrevtab and rtsin share r6, so the adjusted tcos/tsin/revtab
  ; pointers are pushed on the stack ([esp+8], [esp+4], [esp]) and reloaded
  ; whenever they are needed.
  874. %macro DECL_IMDCT 1
  875. cglobal imdct_half, 3,12,8; FFTContext *s, FFTSample *output, const FFTSample *input
  876. %if ARCH_X86_64
  877. %define rrevtab r7
  878. %define rtcos r8
  879. %define rtsin r9
  880. %else
  881. %define rrevtab r6
  882. %define rtsin r6
  883. %define rtcos r5
  884. %endif
  885. mov r3d, [r0+FFTContext.mdctsize]
  886. add r2, r3 ; input += mdctsize bytes
  887. shr r3, 1
  888. mov rtcos, [r0+FFTContext.tcos]
  889. mov rtsin, [r0+FFTContext.tsin]
  890. add rtcos, r3 ; tcos/tsin += mdctsize/2 bytes
  891. add rtsin, r3
  892. %if ARCH_X86_64 == 0
  893. push rtcos
  894. push rtsin
  895. %endif
  896. shr r3, 1
  897. mov rrevtab, [r0+FFTContext.revtab]
  898. add rrevtab, r3 ; revtab += mdctsize/4 bytes
  899. %if ARCH_X86_64 == 0
  900. push rrevtab
  901. %endif
  ; r3 = descending index, starting one loop step below mdctsize/4.
  902. %if mmsize == 8
  903. sub r3, 2
  904. %else
  905. sub r3, 4
  906. %endif
  ; r4 = -r3 = ascending (negative) index.
  907. %if ARCH_X86_64 || mmsize == 8
  908. xor r4, r4
  909. sub r4, r3
  910. %endif
  911. %if notcpuflag(3dnowext) && mmsize == 8
  912. movd m7, [ps_neg] ; sign mask needed by the plain-3DNow path of PREROTATER
  913. %endif
  914. .pre:
  915. %if ARCH_X86_64 == 0
  916. ;unspill
  917. %if mmsize != 8
  ; 32-bit SSE path: r4 is overwritten below, so recompute it from r3 each iteration.
  918. xor r4, r4
  919. sub r4, r3
  920. %endif
  921. mov rtcos, [esp+8]
  922. mov rtsin, [esp+4]
  923. %endif
  924. PREROTATER r4, r3, r2, rtcos, rtsin
  925. %if mmsize == 8
  926. mov r6, [esp] ; rrevtab = ptr+n8
  927. movzx r5, word [rrevtab+r4-2] ; rrevtab[j]
  928. movzx r6, word [rrevtab+r3] ; rrevtab[n4-j-1]
  929. mova [r1+r5*8], m0
  930. mova [r1+r6*8], m2
  931. add r4, 2
  932. sub r3, 2
  933. %else
  934. %if ARCH_X86_64
  935. movzx r5, word [rrevtab+r4-4]
  936. movzx r6, word [rrevtab+r4-2]
  937. movzx r10, word [rrevtab+r3]
  938. movzx r11, word [rrevtab+r3+2]
  939. movlps [r1+r5 *8], xmm0
  940. movhps [r1+r6 *8], xmm0
  941. movlps [r1+r10*8], xmm1
  942. movhps [r1+r11*8], xmm1
  943. add r4, 4
  944. %else
  945. mov r6, [esp] ; reload the spilled revtab pointer
  946. movzx r5, word [r6+r4-4]
  947. movzx r4, word [r6+r4-2]
  948. movlps [r1+r5*8], xmm0
  949. movhps [r1+r4*8], xmm0
  950. movzx r5, word [r6+r3]
  951. movzx r4, word [r6+r3+2]
  952. movlps [r1+r5*8], xmm1
  953. movhps [r1+r4*8], xmm1
  954. %endif
  955. sub r3, 4
  956. %endif
  957. jns .pre ; loop while the descending index is still non-negative
  ; Phase 2: FFT of the output buffer. Keep the context in r5 and the output
  ; pointer in r6 while r0/r1 are used as the dispatch arguments.
  958. mov r5, r0
  959. mov r6, r1
  960. mov r0, r1
  961. mov r1d, [r5+FFTContext.nbits]
  962. FFT_DISPATCH SUFFIX, r1
  ; Phase 3: post-rotation. r6 = output + mdctsize bytes,
  ; r0 = -mdctsize/2 (ascending index), r1 = mdctsize/2 - mmsize (descending index).
  963. mov r0d, [r5+FFTContext.mdctsize]
  964. add r6, r0
  965. shr r0, 1
  966. %if ARCH_X86_64 == 0
  967. %define rtcos r2
  968. %define rtsin r3
  969. mov rtcos, [esp+8]
  970. mov rtsin, [esp+4]
  971. %endif
  972. neg r0
  973. mov r1, -mmsize
  974. sub r1, r0
  975. %1 r0, r1, r6, rtcos, rtsin
  976. %if ARCH_X86_64 == 0
  977. add esp, 12 ; drop the three spilled pointers
  978. %endif
  979. %if mmsize == 8
  980. femms
  981. %endif
  982. RET
  983. %endmacro
  ; Instantiate imdct_half. The first invocation uses the instruction set that
  ; is active at this point in the file (INIT_XMM sse was selected last);
  ; 3DNow variants are built on 32-bit x86 only and the AVX variant only when
  ; HAVE_AVX_EXTERNAL is set.
  984. DECL_IMDCT POSROTATESHUF
  985. %if ARCH_X86_32
  986. INIT_MMX 3dnow
  987. DECL_IMDCT POSROTATESHUF_3DNOW
  988. INIT_MMX 3dnowext
  989. DECL_IMDCT POSROTATESHUF_3DNOW
  990. %endif
  991. INIT_YMM avx
  992. %if HAVE_AVX_EXTERNAL
  993. DECL_IMDCT POSROTATESHUF_AVX
  994. %endif