/*
 * FFT transform with Altivec optimizations
 * Copyright (c) 2009 Loren Merritt
 *
 * This algorithm (though not any of the implementation details) is
 * based on libdjbfft by D. J. Bernstein.
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

/*
 * These functions are not individually interchangeable with the C versions.
 * While C takes arrays of FFTComplex, Altivec leaves intermediate results
 * in blocks as convenient to the vector size,
 * i.e. {4x real, 4x imaginary, 4x real, ...}.
 *
 * These functions ignore the standard calling convention.
 * Instead, the following registers are treated as global constants:
 *   v14: zero
 *   v15..v18: cosines
 *   v19..v29: permutations
 *   r9: 16
 *   r12: ff_cos_tabs
 * and the rest are free for local use.
 */
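
/*
 * As an illustrative C sketch of the block layout (the struct below is
 * hypothetical, not a type in this codebase): one 32-byte block packs
 * four complex values as
 *     struct vblock { float re[4]; float im[4]; };
 * so the C side's {re,im} pairs become {r0,r1,r2,r3},{i0,i1,i2,i3} runs,
 * one vector register per half-block.
 */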

#include "config.h"
#include "asm.S"

.text

.macro addi2 ra, imm // add 32-bit immediate
.if \imm & 0xffff
    addi    \ra, \ra, \imm@l
.endif
.if (\imm+0x8000)>>16
    addis   \ra, \ra, \imm@ha
.endif
.endm
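
// addi sign-extends its 16-bit immediate, so @ha pre-compensates for a
// negative low half. E.g. addi2 r3, 0x18000 emits addi r3,r3,-0x8000
// followed by addis r3,r3,2 (0x20000 - 0x8000 = 0x18000).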

.macro FFT4 a0, a1, a2, a3 // in:0-1 out:2-3
    vperm   \a2,\a0,\a1,v20 // vcprm(0,1,s2,s1) // {r0,i0,r3,i2}
    vperm   \a3,\a0,\a1,v21 // vcprm(2,3,s0,s3) // {r1,i1,r2,i3}
    vaddfp  \a0,\a2,\a3     // {t1,t2,t6,t5}
    vsubfp  \a1,\a2,\a3     // {t3,t4,t8,t7}
    vmrghw  \a2,\a0,\a1     // vcprm(0,s0,1,s1) // {t1,t3,t2,t4}
    vperm   \a3,\a0,\a1,v22 // vcprm(2,s3,3,s2) // {t6,t7,t5,t8}
    vaddfp  \a0,\a2,\a3     // {r0,r1,i0,i1}
    vsubfp  \a1,\a2,\a3     // {r2,r3,i2,i3}
    vperm   \a2,\a0,\a1,v23 // vcprm(0,1,s0,s1) // {r0,r1,r2,r3}
    vperm   \a3,\a0,\a1,v24 // vcprm(2,3,s2,s3) // {i0,i1,i2,i3}
.endm
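
/*
 * For reference, a scalar sketch of the 4-point transform above (forward
 * DFT; z0..z3/Z0..Z3 are illustrative names, and the stored ordering
 * follows the block convention rather than this listing):
 *     a = z0 + z2;    b = z0 - z2;
 *     c = z1 + z3;    d = z1 - z3;
 *     Z0 = a + c;     Z2 = a - c;
 *     Z1 = b - i*d;   Z3 = b + i*d;
 */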

.macro FFT4x2 a0, a1, b0, b1, a2, a3, b2, b3
    vperm   \a2,\a0,\a1,v20 // vcprm(0,1,s2,s1) // {r0,i0,r3,i2}
    vperm   \a3,\a0,\a1,v21 // vcprm(2,3,s0,s3) // {r1,i1,r2,i3}
    vperm   \b2,\b0,\b1,v20
    vperm   \b3,\b0,\b1,v21
    vaddfp  \a0,\a2,\a3     // {t1,t2,t6,t5}
    vsubfp  \a1,\a2,\a3     // {t3,t4,t8,t7}
    vaddfp  \b0,\b2,\b3
    vsubfp  \b1,\b2,\b3
    vmrghw  \a2,\a0,\a1     // vcprm(0,s0,1,s1) // {t1,t3,t2,t4}
    vperm   \a3,\a0,\a1,v22 // vcprm(2,s3,3,s2) // {t6,t7,t5,t8}
    vmrghw  \b2,\b0,\b1
    vperm   \b3,\b0,\b1,v22
    vaddfp  \a0,\a2,\a3     // {r0,r1,i0,i1}
    vsubfp  \a1,\a2,\a3     // {r2,r3,i2,i3}
    vaddfp  \b0,\b2,\b3
    vsubfp  \b1,\b2,\b3
    vperm   \a2,\a0,\a1,v23 // vcprm(0,1,s0,s1) // {r0,r1,r2,r3}
    vperm   \a3,\a0,\a1,v24 // vcprm(2,3,s2,s3) // {i0,i1,i2,i3}
    vperm   \b2,\b0,\b1,v23
    vperm   \b3,\b0,\b1,v24
.endm

.macro FFT8 a0, a1, b0, b1, a2, a3, b2, b3, b4 // in,out:a0-b1
    vmrghw  \b2,\b0,\b1     // vcprm(0,s0,1,s1) // {r4,r6,i4,i6}
    vmrglw  \b3,\b0,\b1     // vcprm(2,s2,3,s3) // {r5,r7,i5,i7}
    vperm   \a2,\a0,\a1,v20 // FFT4 ...
    vperm   \a3,\a0,\a1,v21
    vaddfp  \b0,\b2,\b3     // {t1,t3,t2,t4}
    vsubfp  \b1,\b2,\b3     // {r5,r7,i5,i7}
    vperm   \b4,\b1,\b1,v25 // vcprm(2,3,0,1) // {i5,i7,r5,r7}
    vaddfp  \a0,\a2,\a3
    vsubfp  \a1,\a2,\a3
    vmaddfp \b1,\b1,v17,v14 // * {-1,1,1,-1}/sqrt(2)
    vmaddfp \b1,\b4,v18,\b1 // * { 1,1,1,1 }/sqrt(2) // {t8,ta,t7,t9}
    vmrghw  \a2,\a0,\a1
    vperm   \a3,\a0,\a1,v22
    vperm   \b2,\b0,\b1,v26 // vcprm(1,2,s3,s0) // {t3,t2,t9,t8}
    vperm   \b3,\b0,\b1,v27 // vcprm(0,3,s2,s1) // {t1,t4,t7,ta}
    vaddfp  \a0,\a2,\a3
    vsubfp  \a1,\a2,\a3
    vaddfp  \b0,\b2,\b3     // {t1,t2,t9,ta}
    vsubfp  \b1,\b2,\b3     // {t6,t5,tc,tb}
    vperm   \a2,\a0,\a1,v23
    vperm   \a3,\a0,\a1,v24
    vperm   \b2,\b0,\b1,v28 // vcprm(0,2,s1,s3) // {t1,t9,t5,tb}
    vperm   \b3,\b0,\b1,v29 // vcprm(1,3,s0,s2) // {t2,ta,t6,tc}
    vsubfp  \b0,\a2,\b2     // {r4,r5,r6,r7}
    vsubfp  \b1,\a3,\b3     // {i4,i5,i6,i7}
    vaddfp  \a0,\a2,\b2     // {r0,r1,r2,r3}
    vaddfp  \a1,\a3,\b3     // {i0,i1,i2,i3}
.endm
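
/*
 * Sketch of the 8-point step above: the first path is the interleaved
 * FFT4 body on z0..z3; the second path butterflies z4..z7, whose twiddles
 * of magnitude 1/sqrt(2) are applied by the two vmaddfp multiplies with
 * v17/v18 ({-1,1,1,-1}/sqrt(2) and {1,1,1,1}/sqrt(2)), before the final
 * add/subtract recombination.
 */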

.macro BF d0,d1,s0,s1
    vsubfp  \d1,\s0,\s1
    vaddfp  \d0,\s0,\s1
.endm

.macro zip d0,d1,s0,s1
    vmrghw  \d0,\s0,\s1
    vmrglw  \d1,\s0,\s1
.endm
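
// zip restores FFTComplex interleaving for one block: with
// s0 = {r0,r1,r2,r3} and s1 = {i0,i1,i2,i3} it produces
// d0 = {r0,i0,r1,i1} and d1 = {r2,i2,r3,i3}.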

.macro def_fft4 interleave
fft4\interleave\()_altivec:
    lvx     v0, 0,r3
    lvx     v1,r9,r3
    FFT4    v0,v1,v2,v3
.ifnb \interleave
    zip     v0,v1,v2,v3
    stvx    v0, 0,r3
    stvx    v1,r9,r3
.else
    stvx    v2, 0,r3
    stvx    v3,r9,r3
.endif
    blr
.endm

.macro def_fft8 interleave
fft8\interleave\()_altivec:
    addi    r4,r3,32
    lvx     v0, 0,r3
    lvx     v1,r9,r3
    lvx     v2, 0,r4
    lvx     v3,r9,r4
    FFT8    v0,v1,v2,v3,v4,v5,v6,v7,v8
.ifnb \interleave
    zip     v4,v5,v0,v1
    zip     v6,v7,v2,v3
    stvx    v4, 0,r3
    stvx    v5,r9,r3
    stvx    v6, 0,r4
    stvx    v7,r9,r4
.else
    stvx    v0, 0,r3
    stvx    v1,r9,r3
    stvx    v2, 0,r4
    stvx    v3,r9,r4
.endif
    blr
.endm

.macro def_fft16 interleave
fft16\interleave\()_altivec:
    addi    r5,r3,64
    addi    r6,r3,96
    addi    r4,r3,32
    lvx     v0, 0,r5
    lvx     v1,r9,r5
    lvx     v2, 0,r6
    lvx     v3,r9,r6
    FFT4x2  v0,v1,v2,v3,v4,v5,v6,v7
    lvx     v0, 0,r3
    lvx     v1,r9,r3
    lvx     v2, 0,r4
    lvx     v3,r9,r4
    FFT8    v0,v1,v2,v3,v8,v9,v10,v11,v12
    vmaddfp   v8,v4,v15,v14 // r2*wre
    vmaddfp   v9,v5,v15,v14 // i2*wre
    vmaddfp  v10,v6,v15,v14 // r3*wre
    vmaddfp  v11,v7,v15,v14 // i3*wre
    vmaddfp   v8,v5,v16,v8  // i2*wim
    vnmsubfp  v9,v4,v16,v9  // r2*wim
    vnmsubfp v10,v7,v16,v10 // i3*wim
    vmaddfp  v11,v6,v16,v11 // r3*wim
    BF      v10,v12,v10,v8
    BF      v11,v13,v9,v11
    BF      v0,v4,v0,v10
    BF      v3,v7,v3,v12
    BF      v1,v5,v1,v11
    BF      v2,v6,v2,v13
.ifnb \interleave
    zip      v8, v9,v0,v1
    zip     v10,v11,v2,v3
    zip     v12,v13,v4,v5
    zip     v14,v15,v6,v7
    stvx     v8, 0,r3
    stvx     v9,r9,r3
    stvx    v10, 0,r4
    stvx    v11,r9,r4
    stvx    v12, 0,r5
    stvx    v13,r9,r5
    stvx    v14, 0,r6
    stvx    v15,r9,r6
.else
    stvx    v0, 0,r3
    stvx    v4, 0,r5
    stvx    v3,r9,r4
    stvx    v7,r9,r6
    stvx    v1,r9,r3
    stvx    v5,r9,r5
    stvx    v2, 0,r4
    stvx    v6, 0,r6
.endif
    blr
.endm

// void pass(float *z, float *wre, int n)
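//
// One split-radix combine pass, n vector butterflies per call. As a
// per-element sketch, reading the lane comments in the loop body below
// (w = wre[k] + i*wim[k]; z0..z3 stand for z[k], z[o1+k], z[o2+k],
// z[o3+k]):
//     u  = conj(w)*z2 + w*z3
//     v  =      w*z3 - conj(w)*z2
//     z2 = z0 - u;     z0 = z0 + u;
//     z3 = z1 - i*v;   z1 = z1 + i*v;
// Note wim is walked backwards; the v19 permute reverses each load.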
.macro PASS interleave, suffix
fft_pass\suffix\()_altivec:
    mtctr   r5
    slwi    r0,r5,4
    slwi    r7,r5,6   // o2
    slwi    r5,r5,5   // o1
    add    r10,r5,r7  // o3
    add     r0,r4,r0  // wim
    addi    r6,r5,16  // o1+16
    addi    r8,r7,16  // o2+16
    addi   r11,r10,16 // o3+16
1:
    lvx     v8, 0,r4  // wre
    lvx    v10, 0,r0  // wim
    sub     r0,r0,r9
    lvx     v9, 0,r0
    vperm   v9,v9,v10,v19  // vcprm(s0,3,2,1) => wim[0 .. -3]
    lvx     v4,r3,r7       // r2 = z[o2]
    lvx     v5,r3,r8       // i2 = z[o2+16]
    lvx     v6,r3,r10      // r3 = z[o3]
    lvx     v7,r3,r11      // i3 = z[o3+16]
    vmaddfp v10,v4,v8,v14  // r2*wre
    vmaddfp v11,v5,v8,v14  // i2*wre
    vmaddfp v12,v6,v8,v14  // r3*wre
    vmaddfp v13,v7,v8,v14  // i3*wre
    lvx     v0, 0,r3       // r0 = z[0]
    lvx     v3,r3,r6       // i1 = z[o1+16]
    vmaddfp  v10,v5,v9,v10 // i2*wim
    vnmsubfp v11,v4,v9,v11 // r2*wim
    vnmsubfp v12,v7,v9,v12 // i3*wim
    vmaddfp  v13,v6,v9,v13 // r3*wim
    lvx     v1,r3,r9 // i0 = z[16]
    lvx     v2,r3,r5 // r1 = z[o1]
    BF      v12,v8,v12,v10
    BF      v13,v9,v11,v13
    BF      v0,v4,v0,v12
    BF      v3,v7,v3,v8
.if !\interleave
    stvx    v0, 0,r3
    stvx    v4,r3,r7
    stvx    v3,r3,r6
    stvx    v7,r3,r11
.endif
    BF      v1,v5,v1,v13
    BF      v2,v6,v2,v9
.if !\interleave
    stvx    v1,r3,r9
    stvx    v2,r3,r5
    stvx    v5,r3,r8
    stvx    v6,r3,r10
.else
    vmrghw  v8,v0,v1
    vmrglw  v9,v0,v1
    stvx    v8, 0,r3
    stvx    v9,r3,r9
    vmrghw  v8,v2,v3
    vmrglw  v9,v2,v3
    stvx    v8,r3,r5
    stvx    v9,r3,r6
    vmrghw  v8,v4,v5
    vmrglw  v9,v4,v5
    stvx    v8,r3,r7
    stvx    v9,r3,r8
    vmrghw  v8,v6,v7
    vmrglw  v9,v6,v7
    stvx    v8,r3,r10
    stvx    v9,r3,r11
.endif
    addi    r3,r3,32
    addi    r4,r4,16
    bdnz 1b
    sub     r3,r3,r5
    blr
.endm

#define M_SQRT1_2 0.70710678118654752440 /* 1/sqrt(2) */

#define WORD_0  0x00,0x01,0x02,0x03
#define WORD_1  0x04,0x05,0x06,0x07
#define WORD_2  0x08,0x09,0x0a,0x0b
#define WORD_3  0x0c,0x0d,0x0e,0x0f
#define WORD_s0 0x10,0x11,0x12,0x13
#define WORD_s1 0x14,0x15,0x16,0x17
#define WORD_s2 0x18,0x19,0x1a,0x1b
#define WORD_s3 0x1c,0x1d,0x1e,0x1f

#define vcprm(a, b, c, d) .byte WORD_##a, WORD_##b, WORD_##c, WORD_##d
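
// vcprm(a,b,c,d) emits a 16-byte vperm control vector that selects 32-bit
// words: 0..3 index the first source operand, s0..s3 the second. E.g.
// vcprm(0,1,s2,s1) yields {src1[0], src1[1], src2[2], src2[1]}.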

.rodata
.align 4
fft_data:
    .float  0, 0, 0, 0
    .float  1, 0.92387953, M_SQRT1_2, 0.38268343
    .float  0, 0.38268343, M_SQRT1_2, 0.92387953
    .float -M_SQRT1_2, M_SQRT1_2, M_SQRT1_2,-M_SQRT1_2
    .float  M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, M_SQRT1_2
    vcprm(s0,3,2,1)
    vcprm(0,1,s2,s1)
    vcprm(2,3,s0,s3)
    vcprm(2,s3,3,s2)
    vcprm(0,1,s0,s1)
    vcprm(2,3,s2,s3)
    vcprm(2,3,0,1)
    vcprm(1,2,s3,s0)
    vcprm(0,3,s2,s1)
    vcprm(0,2,s1,s3)
    vcprm(1,3,s0,s2)
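
// fft_calc below loads these 16 rows into v14..v29 in order: v14 = zero,
// v15/v16 = the fft16 twiddles (wre/wim in def_fft16), v17/v18 = the
// +-1/sqrt(2) factors used by FFT8, and v19..v29 = the permutations,
// matching the register-convention comment at the top of the file.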

.macro lvm  b, r, regs:vararg
    lvx     \r, 0, \b
    addi    \b, \b, 16
.ifnb \regs
    lvm     \b, \regs
.endif
.endm

.macro stvm b, r, regs:vararg
    stvx    \r, 0, \b
    addi    \b, \b, 16
.ifnb \regs
    stvm    \b, \regs
.endif
.endm
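
// ff_fft_calc*_altivec: saves the nonvolatile vectors v20..v29 plus
// VRSAVE on the stack, loads the constants above into v14..v29, then
// indexes the dispatch table by nbits-2 (nbits = log2(n), read from the
// start of the FFTContext in r3, assuming the standard layout) and calls
// the matching fft*_altivec with r3 pointing at the data (the original r4).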
.macro fft_calc interleave
extfunc ff_fft_calc\interleave\()_altivec
    mflr    r0
    stp     r0, 2*PS(r1)
    stpu    r1, -(160+16*PS)(r1)
    get_got r11
    addi    r6, r1, 16*PS
    stvm    r6, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29
    mfvrsave r0
    stw     r0, 15*PS(r1)
    li      r6, 0xfffffffc
    mtvrsave r6

    movrel  r6, fft_data, r11
    lvm     r6, v14, v15, v16, v17, v18, v19, v20, v21
    lvm     r6, v22, v23, v24, v25, v26, v27, v28, v29

    li      r9, 16
    movrel  r12, X(ff_cos_tabs), r11

    movrel  r6, fft_dispatch_tab\interleave\()_altivec, r11
    lwz     r3, 0(r3)
    subi    r3, r3, 2
    slwi    r3, r3, 2+ARCH_PPC64
    lpx     r3, r3, r6
    mtctr   r3
    mr      r3, r4
    bctrl

    addi    r6, r1, 16*PS
    lvm     r6, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29
    lwz     r6, 15*PS(r1)
    mtvrsave r6
    lp      r1, 0(r1)
    lp      r0, 2*PS(r1)
    mtlr    r0
    blr
.endm
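
// Split-radix recursion: fft(n) = fft(n/2) on the first half plus
// fft(n/4) on each of the two remaining quarters, then a tail call into
// fft_pass with n/16 vector butterflies and ff_cos_tabs[bits] as the
// twiddle table. The addi2 offsets are in bytes: +n*4 reaches the third
// quarter (n/2 complex in), +n*2 the fourth, and -n*6 rewinds to z.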
.macro DECL_FFT suffix, bits, n, n2, n4
fft\n\suffix\()_altivec:
    mflr    r0
    stp     r0,PS*(\bits-3)(r1)
    bl      fft\n2\()_altivec
    addi2   r3,\n*4
    bl      fft\n4\()_altivec
    addi2   r3,\n*2
    bl      fft\n4\()_altivec
    addi2   r3,\n*-6
    lp      r0,PS*(\bits-3)(r1)
    lp      r4,\bits*PS(r12)
    mtlr    r0
    li      r5,\n/16
    b       fft_pass\suffix\()_altivec
.endm

.macro DECL_FFTS interleave, suffix
    .text
    def_fft4  \suffix
    def_fft8  \suffix
    def_fft16 \suffix
    PASS \interleave, \suffix
    DECL_FFT \suffix, 5,   32,   16,    8
    DECL_FFT \suffix, 6,   64,   32,   16
    DECL_FFT \suffix, 7,  128,   64,   32
    DECL_FFT \suffix, 8,  256,  128,   64
    DECL_FFT \suffix, 9,  512,  256,  128
    DECL_FFT \suffix,10, 1024,  512,  256
    DECL_FFT \suffix,11, 2048, 1024,  512
    DECL_FFT \suffix,12, 4096, 2048, 1024
    DECL_FFT \suffix,13, 8192, 4096, 2048
    DECL_FFT \suffix,14,16384, 8192, 4096
    DECL_FFT \suffix,15,32768,16384, 8192
    DECL_FFT \suffix,16,65536,32768,16384

    fft_calc \suffix

    .rodata
    .align 3
fft_dispatch_tab\suffix\()_altivec:
    PTR fft4\suffix\()_altivec
    PTR fft8\suffix\()_altivec
    PTR fft16\suffix\()_altivec
    PTR fft32\suffix\()_altivec
    PTR fft64\suffix\()_altivec
    PTR fft128\suffix\()_altivec
    PTR fft256\suffix\()_altivec
    PTR fft512\suffix\()_altivec
    PTR fft1024\suffix\()_altivec
    PTR fft2048\suffix\()_altivec
    PTR fft4096\suffix\()_altivec
    PTR fft8192\suffix\()_altivec
    PTR fft16384\suffix\()_altivec
    PTR fft32768\suffix\()_altivec
    PTR fft65536\suffix\()_altivec
.endm

DECL_FFTS 0
DECL_FFTS 1, _interleave