/* float_dsp_neon.S */
/*
 * ARM NEON optimised Float DSP functions
 * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
 *
 * This file is part of Libav.
 *
 * Libav is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * Libav is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with Libav; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
#include "config.h"
#include "asm.S"
  /*
   * ff_vector_fmul_neon: element-wise product of two float32 arrays.
   *
   * Register roles, as used by the code below:
   *   r0  destination pointer (stored through, post-incremented)
   *   r1  first source pointer (loaded from, post-incremented)
   *   r2  second source pointer (loaded from, post-incremented)
   *   r3  element count
   * NOTE(review): the matching C prototype is not visible in this file;
   * confirm the argument order against the C declaration.
   *
   * Every load and store carries a :128 alignment qualifier, so all three
   * pointers must be 16-byte aligned.
   *
   * The first 8 elements are loaded and multiplied unconditionally, the main
   * loop consumes 16 elements per pass, and any non-zero remainder is handled
   * as one more group of 8.  The count is therefore expected to be a multiple
   * of 8 and at least 8; nothing here checks that.
   *
   * The code is software pipelined: products wait in q8/q9 (and q10/q11 inside
   * the loop) and are stored while the next inputs are being loaded.
   *
   * Clobbers r0-r3, ip, q0-q3, q8-q11 and the condition flags.
   */
function ff_vector_fmul_neon, export=1
        subs            r3,  r3,  #8            @ r3 = count left after the 8 primed below
        vld1.32         {d0-d3},  [r1,:128]!    @ q0-q1 = 8 floats from r1
        vld1.32         {d4-d7},  [r2,:128]!    @ q2-q3 = 8 floats from r2
        vmul.f32        q8,  q0,  q2            @ q8-q9 = 8 products, store is deferred
        vmul.f32        q9,  q1,  q3
        beq             3f                      @ count was exactly 8: store and return
        bics            ip,  r3,  #15           @ ip = remaining count rounded down to 16
        beq             2f                      @ fewer than 16 remain: tail only

        /* Main loop: 16 elements per pass, q8-q9 hold pending products on entry. */
1:      subs            ip,  ip,  #16           @ flags are consumed by bne at the loop end
        vld1.32         {d0-d1},  [r1,:128]!
        vld1.32         {d4-d5},  [r2,:128]!
        vmul.f32        q10, q0,  q2            @ q10-q11 = next 8 products
        vld1.32         {d2-d3},  [r1,:128]!
        vld1.32         {d6-d7},  [r2,:128]!
        vmul.f32        q11, q1,  q3
        vst1.32         {d16-d19},[r0,:128]!    @ store the pending q8-q9
        vld1.32         {d0-d1},  [r1,:128]!
        vld1.32         {d4-d5},  [r2,:128]!
        vmul.f32        q8,  q0,  q2            @ refill q8-q9 for the next pass or the exit
        vld1.32         {d2-d3},  [r1,:128]!
        vld1.32         {d6-d7},  [r2,:128]!
        vmul.f32        q9,  q1,  q3
        vst1.32         {d20-d23},[r0,:128]!    @ store q10-q11
        bne             1b                      @ NEON ops leave the flags from subs intact

        ands            r3,  r3,  #15           @ anything left beyond the blocks of 16?
        beq             3f

        /* Tail: one more group of 8, interleaved with storing the pending q8-q9. */
2:      vld1.32         {d0-d1},  [r1,:128]!
        vld1.32         {d4-d5},  [r2,:128]!
        vst1.32         {d16-d17},[r0,:128]!    @ q8 must be stored before it is overwritten
        vmul.f32        q8,  q0,  q2
        vld1.32         {d2-d3},  [r1,:128]!
        vld1.32         {d6-d7},  [r2,:128]!
        vst1.32         {d18-d19},[r0,:128]!    @ likewise q9
        vmul.f32        q9,  q1,  q3

3:      vst1.32         {d16-d19},[r0,:128]!    @ flush the last 8 products
        bx              lr
endfunc
  /*
   * ff_vector_fmac_scalar_neon: multiply-accumulate a float32 array by a
   * scalar into the destination, i.e. for each element dst = dst + src * mul.
   *
   * Register roles, as used by the code below:
   *   r0   destination pointer, used for the stores
   *   acc  copy of r0, used to read the current destination values ahead of
   *        the stores
   *   r1   source pointer
   *   len  element count
   *   q15  the scalar multiplier broadcast to all four lanes
   *
   * The VFP and NOVFP line prefixes are not defined in this file (presumably
   * they come from asm.S; confirm there).  From their use here exactly one of
   * each pair is expected to be active:
   *   VFP    multiplier read from d0[0], len = r2, acc = r3
   *   NOVFP  multiplier read from r2,    len = r3, acc = r2
   *
   * Every load and store carries a :128 alignment qualifier, so both pointers
   * must be 16-byte aligned.
   *
   * Blocks of 16 elements go through the pipelined main loop; the remaining
   * (len & 15) elements are handled 4 at a time.  The tail loop runs at least
   * once when it is entered, so a count of zero still processes four elements:
   * callers are expected to pass a positive multiple of 4.
   *
   * Clobbers r0-r3, r12, q0-q3, q8-q11, q15 and the condition flags.
   */
function ff_vector_fmac_scalar_neon, export=1
VFP     len .req r2
VFP     acc .req r3
NOVFP   len .req r3
NOVFP   acc .req r2
VFP     vdup.32         q15, d0[0]              @ broadcast the multiplier
NOVFP   vdup.32         q15, r2                 @ must precede the write to acc (r2 here)
        bics            r12, len, #15           @ r12 = count rounded down to 16
        mov             acc, r0                 @ read pointer into dst; flags untouched
        beq             3f                      @ fewer than 16 elements: tail only

        /* Prime the pipeline with the first 8 source/destination elements. */
        vld1.32         {q0},     [r1,:128]!
        vld1.32         {q8},     [acc,:128]!
        vld1.32         {q1},     [r1,:128]!
        vld1.32         {q9},     [acc,:128]!

        /* Main loop: 16 elements per pass. */
1:      vmla.f32        q8,  q0,  q15           @ q8 += q0 * mul
        vld1.32         {q2},     [r1,:128]!
        vld1.32         {q10},    [acc,:128]!
        vmla.f32        q9,  q1,  q15
        vld1.32         {q3},     [r1,:128]!
        vld1.32         {q11},    [acc,:128]!
        vmla.f32        q10, q2,  q15
        vst1.32         {q8},     [r0,:128]!
        vmla.f32        q11, q3,  q15
        vst1.32         {q9},     [r0,:128]!
        subs            r12, r12, #16
        beq             2f                      @ last block: flush without loading more
        vld1.32         {q0},     [r1,:128]!    @ start the next block while storing this one
        vld1.32         {q8},     [acc,:128]!
        vst1.32         {q10},    [r0,:128]!
        vld1.32         {q1},     [r1,:128]!
        vld1.32         {q9},     [acc,:128]!
        vst1.32         {q11},    [r0,:128]!
        b               1b

2:      vst1.32         {q10},    [r0,:128]!    @ flush the second half of the last block
        vst1.32         {q11},    [r0,:128]!
        ands            len, len, #15           @ len = leftover element count
        it              eq                      @ IT block keeps bxeq valid in Thumb-2
        bxeq            lr

        /* Tail: 4 elements per pass, executed at least once. */
3:      vld1.32         {q0},     [r1,:128]!
        vld1.32         {q8},     [acc,:128]!
        vmla.f32        q8,  q0,  q15
        vst1.32         {q8},     [r0,:128]!
        subs            len, len, #4
        bgt             3b
        bx              lr
        .unreq          len                     @ release both aliases so they do not
        .unreq          acc                     @ leak into code that follows
endfunc
  /*
   * ff_vector_fmul_scalar_neon: multiply a float32 array by a scalar,
   * i.e. for each element dst = src * mul.
   *
   * Register roles, as used by the code below:
   *   r0   destination pointer (stored through, post-incremented)
   *   r1   source pointer (loaded from, post-incremented)
   *   len  element count
   *   q8   the scalar multiplier broadcast to all four lanes
   *
   * The VFP and NOVFP line prefixes are not defined in this file (presumably
   * they come from asm.S; confirm there).  From their use here exactly one of
   * each pair is expected to be active:
   *   VFP    multiplier read from d0[0], len = r2
   *   NOVFP  multiplier read from r2,    len = r3
   *
   * Every load and store carries a :128 alignment qualifier, so both pointers
   * must be 16-byte aligned.
   *
   * Blocks of 16 elements go through the pipelined main loop; the remaining
   * (len & 15) elements are handled 4 at a time.  The tail loop runs at least
   * once when it is entered, so a count of zero still processes four elements:
   * callers are expected to pass a positive multiple of 4.
   *
   * Clobbers r0, r1, the len register, r12, q0-q3, q8 and the condition flags.
   */
function ff_vector_fmul_scalar_neon, export=1
VFP     len .req r2
NOVFP   len .req r3
VFP     vdup.32         q8,  d0[0]              @ broadcast the multiplier
NOVFP   vdup.32         q8,  r2
        bics            r12, len, #15           @ r12 = count rounded down to 16
        beq             3f                      @ fewer than 16 elements: tail only

        /* Prime the pipeline with the first 8 source elements. */
        vld1.32         {q0},[r1,:128]!
        vld1.32         {q1},[r1,:128]!

        /* Main loop: 16 elements per pass. */
1:      vmul.f32        q0,  q0,  q8
        vld1.32         {q2},[r1,:128]!
        vmul.f32        q1,  q1,  q8
        vld1.32         {q3},[r1,:128]!
        vmul.f32        q2,  q2,  q8
        vst1.32         {q0},[r0,:128]!
        vmul.f32        q3,  q3,  q8
        vst1.32         {q1},[r0,:128]!
        subs            r12, r12, #16
        beq             2f                      @ last block: flush without loading more
        vld1.32         {q0},[r1,:128]!         @ start the next block while storing this one
        vst1.32         {q2},[r0,:128]!
        vld1.32         {q1},[r1,:128]!
        vst1.32         {q3},[r0,:128]!
        b               1b

2:      vst1.32         {q2},[r0,:128]!         @ flush the second half of the last block
        vst1.32         {q3},[r0,:128]!
        ands            len, len, #15           @ len = leftover element count
        it              eq                      @ IT block keeps bxeq valid in Thumb-2
        bxeq            lr

        /* Tail: 4 elements per pass, executed at least once. */
3:      vld1.32         {q0},[r1,:128]!
        vmul.f32        q0,  q0,  q8
        vst1.32         {q0},[r0,:128]!
        subs            len, len, #4
        bgt             3b
        bx              lr
        .unreq          len                     @ release the alias for code that follows
endfunc