/* float_dsp_neon.S */
/*
 * ARM NEON optimised Float DSP functions
 * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
  21. #include "config.h"
  22. #include "asm.S"
  23. function ff_vector_fmul_neon, export=1
  24. subs r3, r3, #8
  25. vld1.32 {d0-d3}, [r1,:128]!
  26. vld1.32 {d4-d7}, [r2,:128]!
  27. vmul.f32 q8, q0, q2
  28. vmul.f32 q9, q1, q3
  29. beq 3f
  30. bics ip, r3, #15
  31. beq 2f
  32. 1: subs ip, ip, #16
  33. vld1.32 {d0-d1}, [r1,:128]!
  34. vld1.32 {d4-d5}, [r2,:128]!
  35. vmul.f32 q10, q0, q2
  36. vld1.32 {d2-d3}, [r1,:128]!
  37. vld1.32 {d6-d7}, [r2,:128]!
  38. vmul.f32 q11, q1, q3
  39. vst1.32 {d16-d19},[r0,:128]!
  40. vld1.32 {d0-d1}, [r1,:128]!
  41. vld1.32 {d4-d5}, [r2,:128]!
  42. vmul.f32 q8, q0, q2
  43. vld1.32 {d2-d3}, [r1,:128]!
  44. vld1.32 {d6-d7}, [r2,:128]!
  45. vmul.f32 q9, q1, q3
  46. vst1.32 {d20-d23},[r0,:128]!
  47. bne 1b
  48. ands r3, r3, #15
  49. beq 3f
  50. 2: vld1.32 {d0-d1}, [r1,:128]!
  51. vld1.32 {d4-d5}, [r2,:128]!
  52. vst1.32 {d16-d17},[r0,:128]!
  53. vmul.f32 q8, q0, q2
  54. vld1.32 {d2-d3}, [r1,:128]!
  55. vld1.32 {d6-d7}, [r2,:128]!
  56. vst1.32 {d18-d19},[r0,:128]!
  57. vmul.f32 q9, q1, q3
  58. 3: vst1.32 {d16-d19},[r0,:128]!
  59. bx lr
  60. endfunc
  61. function ff_vector_fmac_scalar_neon, export=1
  62. VFP len .req r2
  63. VFP acc .req r3
  64. NOVFP len .req r3
  65. NOVFP acc .req r2
  66. VFP vdup.32 q15, d0[0]
  67. NOVFP vdup.32 q15, r2
  68. bics r12, len, #15
  69. mov acc, r0
  70. beq 3f
  71. vld1.32 {q0}, [r1,:128]!
  72. vld1.32 {q8}, [acc,:128]!
  73. vld1.32 {q1}, [r1,:128]!
  74. vld1.32 {q9}, [acc,:128]!
  75. 1: vmla.f32 q8, q0, q15
  76. vld1.32 {q2}, [r1,:128]!
  77. vld1.32 {q10}, [acc,:128]!
  78. vmla.f32 q9, q1, q15
  79. vld1.32 {q3}, [r1,:128]!
  80. vld1.32 {q11}, [acc,:128]!
  81. vmla.f32 q10, q2, q15
  82. vst1.32 {q8}, [r0,:128]!
  83. vmla.f32 q11, q3, q15
  84. vst1.32 {q9}, [r0,:128]!
  85. subs r12, r12, #16
  86. beq 2f
  87. vld1.32 {q0}, [r1,:128]!
  88. vld1.32 {q8}, [acc,:128]!
  89. vst1.32 {q10}, [r0,:128]!
  90. vld1.32 {q1}, [r1,:128]!
  91. vld1.32 {q9}, [acc,:128]!
  92. vst1.32 {q11}, [r0,:128]!
  93. b 1b
  94. 2: vst1.32 {q10}, [r0,:128]!
  95. vst1.32 {q11}, [r0,:128]!
  96. ands len, len, #15
  97. it eq
  98. bxeq lr
  99. 3: vld1.32 {q0}, [r1,:128]!
  100. vld1.32 {q8}, [acc,:128]!
  101. vmla.f32 q8, q0, q15
  102. vst1.32 {q8}, [r0,:128]!
  103. subs len, len, #4
  104. bgt 3b
  105. bx lr
  106. .unreq len
  107. endfunc
  108. function ff_vector_fmul_scalar_neon, export=1
  109. VFP len .req r2
  110. NOVFP len .req r3
  111. VFP vdup.32 q8, d0[0]
  112. NOVFP vdup.32 q8, r2
  113. bics r12, len, #15
  114. beq 3f
  115. vld1.32 {q0},[r1,:128]!
  116. vld1.32 {q1},[r1,:128]!
  117. 1: vmul.f32 q0, q0, q8
  118. vld1.32 {q2},[r1,:128]!
  119. vmul.f32 q1, q1, q8
  120. vld1.32 {q3},[r1,:128]!
  121. vmul.f32 q2, q2, q8
  122. vst1.32 {q0},[r0,:128]!
  123. vmul.f32 q3, q3, q8
  124. vst1.32 {q1},[r0,:128]!
  125. subs r12, r12, #16
  126. beq 2f
  127. vld1.32 {q0},[r1,:128]!
  128. vst1.32 {q2},[r0,:128]!
  129. vld1.32 {q1},[r1,:128]!
  130. vst1.32 {q3},[r0,:128]!
  131. b 1b
  132. 2: vst1.32 {q2},[r0,:128]!
  133. vst1.32 {q3},[r0,:128]!
  134. ands len, len, #15
  135. it eq
  136. bxeq lr
  137. 3: vld1.32 {q0},[r1,:128]!
  138. vmul.f32 q0, q0, q8
  139. vst1.32 {q0},[r0,:128]!
  140. subs len, len, #4
  141. bgt 3b
  142. bx lr
  143. .unreq len
  144. endfunc
  145. function ff_vector_fmul_window_neon, export=1
  146. push {r4,r5,lr}
  147. ldr lr, [sp, #12]
  148. sub r2, r2, #8
  149. sub r5, lr, #2
  150. add r2, r2, r5, lsl #2
  151. add r4, r3, r5, lsl #3
  152. add ip, r0, r5, lsl #3
  153. mov r5, #-16
  154. vld1.32 {d0,d1}, [r1,:128]!
  155. vld1.32 {d2,d3}, [r2,:128], r5
  156. vld1.32 {d4,d5}, [r3,:128]!
  157. vld1.32 {d6,d7}, [r4,:128], r5
  158. 1: subs lr, lr, #4
  159. vmul.f32 d22, d0, d4
  160. vrev64.32 q3, q3
  161. vmul.f32 d23, d1, d5
  162. vrev64.32 q1, q1
  163. vmul.f32 d20, d0, d7
  164. vmul.f32 d21, d1, d6
  165. beq 2f
  166. vmla.f32 d22, d3, d7
  167. vld1.32 {d0,d1}, [r1,:128]!
  168. vmla.f32 d23, d2, d6
  169. vld1.32 {d18,d19},[r2,:128], r5
  170. vmls.f32 d20, d3, d4
  171. vld1.32 {d24,d25},[r3,:128]!
  172. vmls.f32 d21, d2, d5
  173. vld1.32 {d6,d7}, [r4,:128], r5
  174. vmov q1, q9
  175. vrev64.32 q11, q11
  176. vmov q2, q12
  177. vswp d22, d23
  178. vst1.32 {d20,d21},[r0,:128]!
  179. vst1.32 {d22,d23},[ip,:128], r5
  180. b 1b
  181. 2: vmla.f32 d22, d3, d7
  182. vmla.f32 d23, d2, d6
  183. vmls.f32 d20, d3, d4
  184. vmls.f32 d21, d2, d5
  185. vrev64.32 q11, q11
  186. vswp d22, d23
  187. vst1.32 {d20,d21},[r0,:128]!
  188. vst1.32 {d22,d23},[ip,:128], r5
  189. pop {r4,r5,pc}
  190. endfunc
  191. function ff_vector_fmul_add_neon, export=1
  192. ldr r12, [sp]
  193. vld1.32 {q0-q1}, [r1,:128]!
  194. vld1.32 {q8-q9}, [r2,:128]!
  195. vld1.32 {q2-q3}, [r3,:128]!
  196. vmul.f32 q10, q0, q8
  197. vmul.f32 q11, q1, q9
  198. 1: vadd.f32 q12, q2, q10
  199. vadd.f32 q13, q3, q11
  200. pld [r1, #16]
  201. pld [r2, #16]
  202. pld [r3, #16]
  203. subs r12, r12, #8
  204. beq 2f
  205. vld1.32 {q0}, [r1,:128]!
  206. vld1.32 {q8}, [r2,:128]!
  207. vmul.f32 q10, q0, q8
  208. vld1.32 {q1}, [r1,:128]!
  209. vld1.32 {q9}, [r2,:128]!
  210. vmul.f32 q11, q1, q9
  211. vld1.32 {q2-q3}, [r3,:128]!
  212. vst1.32 {q12-q13},[r0,:128]!
  213. b 1b
  214. 2: vst1.32 {q12-q13},[r0,:128]!
  215. bx lr
  216. endfunc
  217. function ff_vector_fmul_reverse_neon, export=1
  218. add r2, r2, r3, lsl #2
  219. sub r2, r2, #32
  220. mov r12, #-32
  221. vld1.32 {q0-q1}, [r1,:128]!
  222. vld1.32 {q2-q3}, [r2,:128], r12
  223. 1: pld [r1, #32]
  224. vrev64.32 q3, q3
  225. vmul.f32 d16, d0, d7
  226. vmul.f32 d17, d1, d6
  227. pld [r2, #-32]
  228. vrev64.32 q2, q2
  229. vmul.f32 d18, d2, d5
  230. vmul.f32 d19, d3, d4
  231. subs r3, r3, #8
  232. beq 2f
  233. vld1.32 {q0-q1}, [r1,:128]!
  234. vld1.32 {q2-q3}, [r2,:128], r12
  235. vst1.32 {q8-q9}, [r0,:128]!
  236. b 1b
  237. 2: vst1.32 {q8-q9}, [r0,:128]!
  238. bx lr
  239. endfunc
  240. function ff_butterflies_float_neon, export=1
  241. 1: vld1.32 {q0},[r0,:128]
  242. vld1.32 {q1},[r1,:128]
  243. vsub.f32 q2, q0, q1
  244. vadd.f32 q1, q0, q1
  245. vst1.32 {q2},[r1,:128]!
  246. vst1.32 {q1},[r0,:128]!
  247. subs r2, r2, #4
  248. bgt 1b
  249. bx lr
  250. endfunc
  251. function ff_scalarproduct_float_neon, export=1
  252. vmov.f32 q2, #0.0
  253. 1: vld1.32 {q0},[r0,:128]!
  254. vld1.32 {q1},[r1,:128]!
  255. vmla.f32 q2, q0, q1
  256. subs r2, r2, #4
  257. bgt 1b
  258. vadd.f32 d0, d4, d5
  259. vpadd.f32 d0, d0, d0
  260. NOVFP vmov.32 r0, d0[0]
  261. bx lr
  262. endfunc