fft_fixed_neon.S 9.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261
  1. /*
  2. * Copyright (c) 2011 Mans Rullgard <mans@mansr.com>
  3. *
  4. * This file is part of Libav.
  5. *
  6. * Libav is free software; you can redistribute it and/or
  7. * modify it under the terms of the GNU Lesser General Public
  8. * License as published by the Free Software Foundation; either
  9. * version 2.1 of the License, or (at your option) any later version.
  10. *
  11. * Libav is distributed in the hope that it will be useful,
  12. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  14. * Lesser General Public License for more details.
  15. *
  16. * You should have received a copy of the GNU Lesser General Public
  17. * License along with Libav; if not, write to the Free Software
  18. * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  19. */
  20. #include "asm.S"
  @ bflies d0, d1, r0, r1
  @   Butterfly step shared by transform01 and transform2.
  @     d0, d1  data registers, both rewritten in place
  @     r0, r1  scratch registers, clobbered
  @   A combined term is assembled in r0 from shuffled halving sums and
  @   differences of d1. The outputs are then, per signed 16-bit lane,
  @     d1 = (d0 - r0) >> 1
  @     d0 = (d0 + r0) >> 1
  @   The t-names in the trailing comments are the per-lane bookkeeping
  @   carried over from the original source.
  .macro  bflies          d0,  d1,  r0,  r1
          vrev64.32       \r0, \d1                @ t5, t6, t1, t2
          vhsub.s16       \r1, \d1, \r0           @ t1-t5, t2-t6, t5-t1, t6-t2
          vhadd.s16       \r0, \d1, \r0           @ t1+t5, t2+t6, t5+t1, t6+t2
          vext.16         \r1, \r1, \r1, #1       @ t2-t6, t5-t1, t6-t2, t1-t5
          vtrn.32         \r0, \r1                @ t1+t5, t2+t6, t2-t6, t5-t1
                                                  @ t5,    t6,    t4,    t3
          vhsub.s16       \d1, \d0, \r0           @ difference first, it needs the old d0
          vhadd.s16       \d0, \d0, \r0
  .endm
  @ transform01 q0, q1, d3, c0, c1, r0, w0, w1
  @   Scales one D register by a coefficient pair, then runs bflies.
  @     q0, q1  data registers handed to bflies (updated in place)
  @     d3      D register to scale, every call in this file passes the
  @             upper half of q1 here
  @     c0, c1  D registers with signed 16-bit coefficients, read only
  @     r0      D scratch
  @     w0, w1  Q scratch, w0 first collects the 32-bit products and is
  @             then reused by bflies
  @   Result of the scaling step, per 16-bit lane:
  @     d3 = (d3 * c0 + swap16(d3) * c1) >> 15
  @   where swap16 exchanges the two 16-bit halves of every 32-bit word.
  .macro  transform01     q0,  q1,  d3,  c0,  c1,  r0,  w0,  w1
          vrev32.16       \r0, \d3                @ swapped copy of the input
          vmull.s16       \w0, \d3, \c0           @ widening multiply to 32 bit
          vmlal.s16       \w0, \r0, \c1           @ accumulate the swapped product
          vshrn.s32       \d3, \w0, #15           @ narrow back to 16 bit
          bflies          \q0, \q1, \w0, \w1
  .endm
  @ transform2 d0, d1, d2, d3, q0, q1, c0, c1, c2, c3, r0, r1, w0, w1
  @   Two-register variant of transform01: scales d1 by (c0, c1) and d3 by
  @   (c2, c3), then runs bflies on q0/q1.
  @     d0, d2      accepted but never referenced in the body
  @     d1, d3      D registers to scale, rewritten in place
  @     q0, q1      data registers handed to bflies (updated in place)
  @     c0 .. c3    D registers with signed 16-bit coefficients, read only
  @     r0, r1      D scratch
  @     w0, w1      Q scratch, hold the 32-bit products and are then reused
  @                 by bflies
  @   Per 16-bit lane:
  @     d1 = (d1 * c0 + swap16(d1) * c1) >> 15
  @     d3 = (d3 * c2 + swap16(d3) * c3) >> 15
  @   where swap16 exchanges the two 16-bit halves of every 32-bit word.
  .macro  transform2      d0,  d1,  d2,  d3,  q0,  q1,  c0,  c1,  c2,  c3, \
                          r0,  r1,  w0,  w1
          vrev32.16       \r0, \d1                @ swapped copies of both inputs
          vrev32.16       \r1, \d3
          vmull.s16       \w0, \d1, \c0           @ first product pair
          vmlal.s16       \w0, \r0, \c1
          vmull.s16       \w1, \d3, \c2           @ second product pair
          vmlal.s16       \w1, \r1, \c3
          vshrn.s32       \d1, \w0, #15           @ narrow both back to 16 bit
          vshrn.s32       \d3, \w1, #15
          bflies          \q0, \q1, \w0, \w1
  .endm
  @ fft4 d0, d1, r0, r1
  @   4-point kernel (going by its name), computed in place on d0/d1.
  @     d0, d1  input and output registers
  @     r0, r1  scratch registers, clobbered
  @   All additions and subtractions are the halving forms on signed
  @   16-bit lanes. d1 is briefly reused as a bit mask that selects 16-bit
  @   lane 2 of each 64-bit element, so that this one lane of r0 takes the
  @   difference with the opposite sign from r1.
  .macro  fft4            d0,  d1,  r0,  r1
          vhsub.s16       \r0, \d0, \d1           @ t3, t4, t8, t7
          vhsub.s16       \r1, \d1, \d0
          vhadd.s16       \d0, \d0, \d1           @ t1, t2, t6, t5
          vmov.i64        \d1, #0xffff00000000
          vbit            \r0, \r1, \d1
          vrev64.16       \r1, \r0                @ t7, t8, t4, t3
          vtrn.32         \r0, \r1                @ t3, t4, t7, t8
          vtrn.32         \d0, \r0                @ t1, t2, t3, t4, t6, t5, t8, t7
          vhsub.s16       \d1, \d0, \r0           @ r2, i2, r3, i1
          vhadd.s16       \d0, \d0, \r0           @ r0, i0, r1, i3
  .endm
  @ fft8 d0, d1, d2, d3, q0, q1, c0, c1, r0, r1, w0, w1
  @   8-point kernel (going by its name) built from fft4 and transform01.
  @     d0 .. d3  data registers, updated in place
  @     q0, q1    Q registers handed to transform01, the calls in this
  @               file pass the Q views of d0/d1 and d2/d3
  @     c0, c1    coefficient registers forwarded to transform01
  @     r0, r1    D scratch
  @     w0, w1    Q scratch
  @   Steps: fft4 on d0/d1, then a halving sum and difference of d2/d3,
  @   then transform01 scales d3 and finishes with bflies.
  .macro  fft8            d0,  d1,  d2,  d3,  q0,  q1,  c0,  c1,  r0,  r1, w0, w1
          fft4            \d0, \d1, \r0, \r1
          vtrn.32         \d0, \d1                @ z0, z2, z1, z3
          vhadd.s16       \r0, \d2, \d3           @ t1, t2, t3, t4
          vhsub.s16       \d3, \d2, \d3           @ z5, z7
          vmov            \d2, \r0                @ sum goes back into d2
          transform01     \q0, \q1, \d3, \c0, \c1, \r0, \w0, \w1
  .endm
  @ fft4_neon
  @   In:     r0 = data pointer, 16 bytes are read and written back in
  @                place, the :128 hint requires 16-byte alignment
  @   Out:    data at r0 replaced by the fft4 result
  @   Clobb:  d0-d3
  function fft4_neon
          vld1.16         {d0-d1},  [r0,:128]
          fft4            d0,  d1,  d2,  d3
          vst1.16         {d0-d1},  [r0,:128]
          bx              lr
  endfunc
  @ fft8_neon
  @   In:     r0 = data pointer, 32 bytes are read and written back in
  @                place, the :128 hint requires 16-byte alignment
  @   Out:    data at r0 replaced by the fft8 result
  @   Clobb:  r1, d0-d3, d16-d21, d30, d31
  function fft8_neon
          vld1.16         {d0-d3},  [r0,:128]
          movrel          r1,  coefs
          vld1.16         {d30},    [r1,:64]      @ first row of coefs
          vdup.16         d31, d30[0]             @ its first entry in every lane
          fft8            d0,  d1,  d2,  d3,  q0,  q1,  d31, d30, d20, d21, q8, q9
          vtrn.32         d0,  d1                 @ reorder the lanes for the store
          vtrn.32         d2,  d3
          vst1.16         {d0-d3},  [r0,:128]
          bx              lr
  endfunc
  @ fft16_neon
  @   In:     r0 = data pointer, 64 bytes are read and written back in
  @                place, the :128 hint requires 16-byte alignment
  @   Out:    data replaced by the result, r0 is left 32 bytes past its
  @           entry value by the final stores
  @   Clobb:  r1, d0-d7, d16-d21, d26-d31
  @   Layout: q0/q1 take the first 32 bytes and go through fft8, q2/q3 take
  @           the second 32 bytes and go through fft4 on Q registers. The
  @           two halves are then combined by transform01 and transform2.
  function fft16_neon
          vld1.16         {d0-d3},  [r0,:128]!    @ writeback moves r0 on by 32
          vld1.16         {d4-d7},  [r0,:128]
          movrel          r1,  coefs
          sub             r0,  r0,  #32           @ undo the writeback
          vld1.16         {d28-d31},[r1,:128]     @ all four rows of coefs
          vdup.16         d31, d28[0]             @ replaces the fourth row in d31
          fft8            d0,  d1,  d2,  d3,  q0,  q1,  d31, d28, d20, d21, q8, q9
          vswp            d5,  d6
          fft4            q2,  q3,  q8,  q9
          vswp            d5,  d6
          vtrn.32         q0,  q1             @ z0, z4, z2, z6, z1, z5, z3, z7
          vtrn.32         q2,  q3             @ z8, z12,z10,z14,z9, z13,z11,z15
          vswp            d1,  d2
          @ NOTE(review): nothing between the earlier vdup and this one writes
          @ d31, so this splat looks redundant. Kept to match the original.
          vdup.16         d31, d28[0]
          transform01     q0,  q2,  d5,  d31, d28, d20, q8, q9
          vdup.16         d26, d29[0]             @ first entry of the second row
          vdup.16         d27, d30[0]             @ first entry of the third row
          transform2      d2,  d6,  d3,  d7,  q1,  q3,  d26, d30, d27, d29, \
                          d20, d21, q8,  q9
          vtrn.32         q0,  q1                 @ reorder the lanes for the store
          vtrn.32         q2,  q3
          vst1.16         {d0-d3},  [r0,:128]!
          vst1.16         {d4-d7},  [r0,:128]
          bx              lr
  endfunc
  @ fft_pass_neon
  @   Combining pass reached by tail branch from the def_fft functions.
  @   In:     r0 = data pointer (16-byte aligned, see the :128 hints)
  @           r1 = coefficient table pointer (8-byte aligned, see :64)
  @           r2 = loop count, each pass consumes 2 and the loop runs
  @                while the remainder is above zero
  @   Clobb:  r0-r3, r12, d0-d7, d16-d21, d24-d31, flags
  @           r4 and lr are saved and restored through the stack
  @   Register roles inside the function:
  @     r0   current column, advanced 16 bytes per pass
  @     r1   forward table cursor, 8 bytes per pass
  @     r3   backward table cursor, starts at r1 + 4*r2 - 8 and steps -8
  @     r4   working pointer for the four strided loads and stores
  @     r12  byte stride between the four vectors, 8 * r2
  @     lr   scratch for the coefs address, then the constant -8
  @     q15  the 1, -1, -1, 1 row of coefs in both halves, multiplied into
  @          q13 (or d27 alone) to flip the sign of the middle lanes
  function fft_pass_neon
          push            {r4,lr}
          movrel          lr,  coefs+24           @ fourth row: 1, -1, -1, 1
          vld1.16         {d30},    [lr,:64]
          lsl             r12, r2,  #3
          vmov            d31, d30
          add             r3,  r1,  r2,  lsl #2
          mov             lr,  #-8
          sub             r3,  r3,  #2
          mov             r4,  r0
          vld1.16         {d27[]},  [r3,:16]      @ halfword at r1 + 4*r2 - 2, all lanes
          sub             r3,  r3,  #6
          vld1.16         {q0},     [r4,:128], r12
          vld1.16         {q1},     [r4,:128], r12
          vld1.16         {q2},     [r4,:128], r12
          vld1.16         {q3},     [r4,:128], r12
          vld1.16         {d28},    [r1,:64]!     @ forward coefficients
          vld1.16         {d29},    [r3,:64], lr  @ backward coefficients
          vswp            d1,  d2
          vswp            d5,  d6
          vtrn.32         d0,  d1
          vtrn.32         d4,  d5
          vdup.16         d25, d28[1]
          vmul.s16        d27, d27, d31
          @ The first pass uses transform01 for the leading half and skips
          @ the transform2 at the top of the loop body.
          transform01     q0,  q2,  d5,  d25, d27, d20, q8,  q9
          b               2f
  1:
          mov             r4,  r0
          vdup.16         d26, d29[0]             @ taken from the previous d29, before the reload
          vld1.16         {q0},     [r4,:128], r12
          vld1.16         {q1},     [r4,:128], r12
          vld1.16         {q2},     [r4,:128], r12
          vld1.16         {q3},     [r4,:128], r12
          vld1.16         {d28},    [r1,:64]!
          vld1.16         {d29},    [r3,:64], lr
          vswp            d1,  d2
          vswp            d5,  d6
          vtrn.32         d0,  d1
          vtrn.32         d4,  d5
          vdup.16         d24, d28[0]
          vdup.16         d25, d28[1]
          vdup.16         d27, d29[3]
          vmul.s16        q13, q13, q15           @ apply the sign row to d26 and d27
          transform2      d0,  d4,  d1,  d5,  q0,  q2,  d24, d26, d25, d27, \
                          d16, d17, q9,  q10
  2:
          vtrn.32         d2,  d3
          vtrn.32         d6,  d7
          vdup.16         d24, d28[2]
          vdup.16         d26, d29[2]
          vdup.16         d25, d28[3]
          vdup.16         d27, d29[1]
          vmul.s16        q13, q13, q15           @ apply the sign row to d26 and d27
          transform2      d2,  d6,  d3,  d7,  q1,  q3,  d24, d26, d25, d27, \
                          d16, d17, q9,  q10
          vtrn.32         d0,  d1                 @ undo the load-time shuffles
          vtrn.32         d2,  d3
          vtrn.32         d4,  d5
          vtrn.32         d6,  d7
          vswp            d1,  d2
          vswp            d5,  d6
          mov             r4,  r0
          vst1.16         {q0},     [r4,:128], r12
          vst1.16         {q1},     [r4,:128], r12
          vst1.16         {q2},     [r4,:128], r12
          vst1.16         {q3},     [r4,:128], r12
          add             r0,  r0,  #16           @ next column
          subs            r2,  r2,  #2
          bgt             1b
          pop             {r4,pc}
  endfunc
  @ Coefficients scaled by 32768, matching the shift by 15 in the
  @ transform macros:
  @   F_SQRT1_2   is round(32768 * sqrt(1/2))
  @   F_COS_16_1  is round(32768 * cos(pi/8))
  @   F_COS_16_3  is round(32768 * cos(3*pi/8))
  #define F_SQRT1_2   23170
  #define F_COS_16_1  30274
  #define F_COS_16_3  12540

  @ Four rows of four halfwords, 32 bytes in total. The first three rows
  @ carry one constant each with the sign pattern +, -, -, +. The fourth
  @ row (byte offset 24) is that bare sign pattern and is what
  @ fft_pass_neon loads through coefs+24.
  @ NOTE(review): align=4 is interpreted by the const macro from asm.S,
  @ presumably as a power of two (16 bytes). The :128 load of this table in
  @ fft16_neon depends on that, confirm against asm.S.
  const   coefs, align=4
          .short          F_SQRT1_2, -F_SQRT1_2, -F_SQRT1_2,  F_SQRT1_2
          .short          F_COS_16_1,-F_COS_16_1,-F_COS_16_1, F_COS_16_1
          .short          F_COS_16_3,-F_COS_16_3,-F_COS_16_3, F_COS_16_3
          .short          1,         -1,         -1,          1
  endconst
  @ def_fft n, n2, n4
  @   Emits the function fft<n>_neon out of three smaller transforms and
  @   one fft_pass_neon call.
  @     n   size used in the emitted name and in the ff_cos_<n>_fixed symbol
  @     n2  size of the transform run on the start of the buffer
  @     n4  size of the two transforms run at element offsets 2*n4 and
  @         3*n4, with 4 bytes per element
  @   Emitted function:
  @     In:   r0 = data pointer
  @     Flow: fft<n2>_neon(r0), fft<n4>_neon(r0 + 8*n4),
  @           fft<n4>_neon(r0 + 12*n4), then a tail branch to fft_pass_neon
  @           with r0 = data, r1 = ff_cos_<n>_fixed, r2 = n4/2
  @     r4 keeps the base pointer across the calls. r4 and lr are restored
  @     before the tail branch, so fft_pass_neon returns to the caller of
  @     the emitted function.
  .macro  def_fft n, n2, n4
  function fft\n\()_neon
          push            {r4, lr}
          mov             r4,  r0
          bl              fft\n2\()_neon
          add             r0,  r4,  #\n4*2*4
          bl              fft\n4\()_neon
          add             r0,  r4,  #\n4*3*4
          bl              fft\n4\()_neon
          mov             r0,  r4
          pop             {r4, lr}
          movrel          r1,  X(ff_cos_\n\()_fixed)
          mov             r2,  #\n4/2
          b               fft_pass_neon
  endfunc
  .endm
  @ Instantiate fft32_neon up to fft65536_neon. Each line passes the
  @ size, half the size and a quarter of the size.
          def_fft    32,    16,     8
          def_fft    64,    32,    16
          def_fft   128,    64,    32
          def_fft   256,   128,    64
          def_fft   512,   256,   128
          def_fft  1024,   512,   256
          def_fft  2048,  1024,   512
          def_fft  4096,  2048,  1024
          def_fft  8192,  4096,  2048
          def_fft 16384,  8192,  4096
          def_fft 32768, 16384,  8192
          def_fft 65536, 32768, 16384
  @ ff_fft_fixed_calc_neon (exported)
  @   In:     r0 = pointer whose first 32-bit word selects the transform,
  @                presumably the size exponent of an FFT context, confirm
  @                against the C caller
  @           r1 = data pointer
  @   Flow:   loads that word, subtracts 2 and uses the result as an index
  @           into fft_fixed_tab_neon, whose first entry is fft4_neon. The
  @           chosen routine is entered by tail call with r0 = data
  @           pointer. The index is not range checked here.
  @   Clobb:  r0, r2, r3, plus whatever the selected routine clobbers
  function ff_fft_fixed_calc_neon, export=1
          ldr             r2,  [r0]
          sub             r2,  r2,  #2            @ the table starts at fft4
          movrel          r3,  fft_fixed_tab_neon
          ldr             r3,  [r3, r2, lsl #2]   @ 4-byte table entries
          mov             r0,  r1
          bx              r3
  endfunc
  @ Dispatch table for ff_fft_fixed_calc_neon: one word per routine, in
  @ ascending size order from fft4_neon to fft65536_neon (15 entries).
  const   fft_fixed_tab_neon
  .irp    n, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768, 65536
          .word           fft\n\()_neon
  .endr
  endconst