resample_neon.S 7.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233
  1. /*
  2. * Copyright (c) 2014 Janne Grunau <janne-libav@jannau.net>
  3. *
  4. * This file is part of FFmpeg.
  5. *
  6. * FFmpeg is free software; you can redistribute it and/or
  7. * modify it under the terms of the GNU Lesser General Public
  8. * License as published by the Free Software Foundation; either
  9. * version 2.1 of the License, or (at your option) any later version.
  10. *
  11. * FFmpeg is distributed in the hope that it will be useful,
  12. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  14. * Lesser General Public License for more details.
  15. *
  16. * You should have received a copy of the GNU Lesser General Public
  17. * License along with FFmpeg; if not, write to the Free Software
  18. * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  19. */
  20. #include "libavutil/aarch64/asm.S"
  21. #include "asm-offsets.h"
  /*
   * resample_one fmt, es
   *
   * Emits the exported function ff_resample_one_<fmt>_neon, which accumulates
   * the products of filter_length source elements and filter coefficients
   * and stores the reduced result at dst[dst_index].
   *
   *   fmt  sample format suffix used in the function name
   *   es   log2 of the element size in bytes (default 2)
   *
   * Before this macro is invoked, the format specific helper macros LOAD1,
   * LOAD2, LOAD4, LOAD8, M_MLA, M_MUL and STORE_ONE must be defined. For dbl
   * M_MLA2 and M_MUL2 must be defined as well; for every other format empty
   * versions are defined here. All nine helpers are purged at the end so the
   * next format can redefine them.
   *
   * Register use of the generated function, as read from the code below:
   *   x0       context pointer; FILTER_BANK, FILTER_LENGTH, PHASE_SHIFT and
   *            the 32-bit word following PHASE_SHIFT are loaded from it
   *   x1       destination base pointer
   *   w2       destination index, sign-extended to 64 bit on entry
   *   x3       source base pointer, advanced by the LOAD helpers
   *   w4       index: (index >> phase_shift) selects the first source
   *            element, (index & phase_mask) selects the filter phase
   *   w6       remaining filter length
   *   x9       pointer into the filter bank, advanced by the LOAD helpers
   *   v0, v1   accumulators
   * The instructions in this macro and in the helpers only write registers
   * that AAPCS64 treats as caller-saved and do not touch the stack.
   *
   * NOTE(review): index is treated as a 32-bit argument (w4), matching the
   * 32-bit phase mask it is combined with. AAPCS64 leaves the upper half of
   * x4 unspecified in that case, so it must not be read. Confirm against the
   * C prototype.
   */
  .macro resample_one     fmt, es=2
  .ifnc \fmt, dbl
  // The second-step hooks are only supplied for dbl; for every other format
  // they expand to nothing.
  .macro M_MUL2 x:vararg
  .endm
  .macro M_MLA2 x:vararg
  .endm
  .endif
  function ff_resample_one_\fmt\()_neon, export=1
          sxtw            x2,  w2                         // widen the 32-bit dst index for addressing
          ldr             x9,  [x0, #FILTER_BANK]
          ldr             w6,  [x0, #FILTER_LENGTH]
          ldp             w7,  w8,  [x0, #PHASE_SHIFT]    // and phase_mask
          lsr             w10, w4,  w7                    // sample_index; 32-bit op, top half of x4 is not read
          and             w4,  w4,  w8                    // filter phase
          lsl             x11, x6,  #\es                  // filter_length * elem_size
          add             x3,  x3,  x10, lsl #\es         // src[sample_index]
          madd            x9,  x11, x4,  x9               // filter
          cmp             w6,  #16
          b.lt            5f                              // short filter: zero accumulators, use tail code
  8:      // remaining filter_length at least 16
          subs            w6,  w6,  #16
          LOAD8           v4,  v5,  v6,  v7,  x3
          LOAD8           v16, v17, v18, v19, x9
          M_MUL           v0,  v4,  v16, v1               // first products initialise the accumulators
          M_MUL2          v1,  v6,  v18
  7:      // second half of a 16 element round; the first half is already loaded
          LOAD8           v20, v21, v22, v23, x3
          M_MLA           v0,  v5,  v17, v1
          M_MLA2          v1,  v7,  v19
          LOAD8           v24, v25, v26, v27, x9
          M_MLA           v0,  v20, v24, v1
          M_MLA2          v1,  v22, v26
          b.eq            6f                              // flags from the last subs: nothing left after this round
          cmp             w6,  #16
          M_MLA           v0,  v21, v25, v1               // helpers do not alter flags, cmp result stays valid
          M_MLA2          v1,  v23, v27
          b.lt            4f                              // 1-15 elements left
          subs            w6,  w6,  #16
          LOAD8           v4,  v5,  v6,  v7,  x3
          LOAD8           v16, v17, v18, v19, x9
          M_MLA           v0,  v4,  v16, v1
          M_MLA2          v1,  v6,  v18
          b               7b
  6:      // finish the last round and store
          M_MLA           v0,  v21, v25, v1
          M_MLA2          v1,  v23, v27
          STORE_ONE       0, x1, x2, v1
          ret
  5:      // fewer than 16 elements in total: start from zeroed accumulators
          movi            v0.16b, #0
          movi            v1.16b, #0
  4:      // remaining filter_length 1-15
          cmp             w6,  #4
          b.lt            2f
          subs            w6,  w6,  #4
          LOAD4           v4,  v5,  x3
          LOAD4           v6,  v7,  x9
          M_MLA           v0,  v4,  v6, v1
          M_MLA2          v1,  v5,  v7
          b.eq            0f
          b               4b
  2:      // remaining filter_length 1-3
          cmp             w6,  #2
          b.lt            1f
          LOAD2           2,   x3
          LOAD2           3,   x9
          subs            w6,  w6,  #2
          M_MLA           v0,  v2,  v3
          b.eq            0f
  1:      // remaining filter_length 1
          LOAD1           6,   x3
          LOAD1           7,   x9
          M_MLA           v0,  v6,  v7
  0:      // reduce the accumulators and store the output sample
          STORE_ONE       0, x1, x2, v1
          ret
  endfunc
  // Drop the per-format helpers so the next instantiation can define its own.
  .purgem LOAD1
  .purgem LOAD2
  .purgem LOAD4
  .purgem LOAD8
  .purgem M_MLA
  .purgem M_MLA2
  .purgem M_MUL
  .purgem M_MUL2
  .purgem STORE_ONE
  .endm
  /*
   * Helper macros for the dbl instantiation: 64-bit floating point elements,
   * two per vector register.
   *
   *   LOADn      load n consecutive elements from [ptr] and advance ptr by
   *              n * 8 bytes. LOAD1 and LOAD2 take a register number,
   *              LOAD4 and LOAD8 take full vector register names.
   *   M_MUL      acc  = lhs * rhs, per lane; a fourth argument is ignored
   *   M_MLA      acc += lhs * rhs, per lane; a fourth argument is ignored
   *   M_MUL2,
   *   M_MLA2     forward their arguments to M_MUL / M_MLA
   *   STORE_ONE  add the second accumulator to v<num>, sum the two lanes
   *              and store the scalar at base + index * 8
   */
  .macro M_MUL            acc, lhs, rhs, unused:vararg
          fmul            \acc\().2d, \lhs\().2d, \rhs\().2d
  .endm
  .macro M_MUL2           args:vararg
          M_MUL           \args
  .endm
  .macro M_MLA            acc, lhs, rhs, unused:vararg
          fmla            \acc\().2d, \lhs\().2d, \rhs\().2d
  .endm
  .macro M_MLA2           args:vararg
          M_MLA           \args
  .endm

  .macro LOAD1            num, ptr
          ldr             d\num, [\ptr], #8
  .endm
  .macro LOAD2            num, ptr
          ld1             {v\num\().2d}, [\ptr], #16
  .endm
  .macro LOAD4            r0, r1, ptr
          ld1             {\r0\().2d,\r1\().2d}, [\ptr], #32
  .endm
  .macro LOAD8            r0, r1, r2, r3, ptr
          ld1             {\r0\().2d,\r1\().2d,\r2\().2d,\r3\().2d}, [\ptr], #64
  .endm

  .macro STORE_ONE        num, base, index, acc2
          fadd            v\num\().2d, v\num\().2d, \acc2\().2d   // merge both accumulators
          faddp           d\num\(), v\num\().2d                   // lane 0 + lane 1
          str             d\num\(), [\base, \index, lsl #3]
  .endm

  // 8-byte elements: es = 3
  resample_one dbl, 3
  /*
   * Helper macros for the flt instantiation: 32-bit floating point elements,
   * four per vector register.
   *
   *   LOADn      load n consecutive elements from [ptr] and advance ptr by
   *              n * 4 bytes. LOAD1 and LOAD2 take a register number,
   *              LOAD4 and LOAD8 take full vector register names. LOAD4
   *              fills only its first register and LOAD8 only its first
   *              two; the remaining register arguments are accepted and
   *              ignored.
   *   M_MUL      acc  = lhs * rhs, per lane; a fourth argument is ignored
   *   M_MLA      acc += lhs * rhs, per lane; a fourth argument is ignored
   *   STORE_ONE  sum the four lanes of v<num> and store the scalar at
   *              base + index * 4; the second accumulator is not used
   *
   * M_MUL2 and M_MLA2 are not defined here; resample_one supplies empty
   * versions for every format other than dbl.
   */
  .macro M_MUL            acc, lhs, rhs, unused:vararg
          fmul            \acc\().4s, \lhs\().4s, \rhs\().4s
  .endm
  .macro M_MLA            acc, lhs, rhs, unused:vararg
          fmla            \acc\().4s, \lhs\().4s, \rhs\().4s
  .endm

  .macro LOAD1            num, ptr
          ldr             s\num, [\ptr], #4
  .endm
  .macro LOAD2            num, ptr
          ld1             {v\num\().2s}, [\ptr], #8
  .endm
  .macro LOAD4            r0, unused, ptr
          ld1             {\r0\().4s}, [\ptr], #16
  .endm
  .macro LOAD8            r0, r1, unused0, unused1, ptr
          ld1             {\r0\().4s,\r1\().4s}, [\ptr], #32
  .endm

  .macro STORE_ONE        num, base, index, acc2
          faddp           v\num\().4s, v\num\().4s, v\num\().4s   // pairwise: 4 lanes -> 2 partial sums
          faddp           s\num\(), v\num\().2s                   // 2 partial sums -> scalar
          str             s\num\(), [\base, \index, lsl #2]
  .endm

  // 4-byte elements: default es = 2
  resample_one flt
  /*
   * Helper macros for the s16 instantiation: signed 16-bit elements, with
   * products accumulated in four 32-bit lanes.
   *
   *   LOADn      load n consecutive elements from [ptr] and advance ptr by
   *              n * 2 bytes. LOAD1 and LOAD2 take a register number and
   *              use scalar loads (h and s registers); LOAD4 and LOAD8 take
   *              full vector register names. LOAD4 fills only its first
   *              register and LOAD8 only its first two; the remaining
   *              register arguments are accepted and ignored.
   *   M_MUL      acc  = widening signed product of the low four 16-bit
   *              lanes; a fourth argument is ignored
   *   M_MLA      acc += the same widening product; a fourth argument is
   *              ignored
   *   STORE_ONE  sum the four 32-bit lanes of v<num>, narrow with rounding
   *              and saturation after a right shift by 15, and store the
   *              16-bit result at base + index * 2; the second accumulator
   *              is not used
   *
   * M_MUL2 and M_MLA2 are not defined here; resample_one supplies empty
   * versions for every format other than dbl.
   */
  .macro M_MUL            acc, lhs, rhs, unused:vararg
          smull           \acc\().4s, \lhs\().4h, \rhs\().4h
  .endm
  .macro M_MLA            acc, lhs, rhs, unused:vararg
          smlal           \acc\().4s, \lhs\().4h, \rhs\().4h
  .endm

  .macro LOAD1            num, ptr
          ldr             h\num, [\ptr], #2
  .endm
  .macro LOAD2            num, ptr
          ldr             s\num, [\ptr], #4
  .endm
  .macro LOAD4            r0, unused, ptr
          ld1             {\r0\().4h}, [\ptr], #8
  .endm
  .macro LOAD8            r0, r1, unused0, unused1, ptr
          ld1             {\r0\().4h,\r1\().4h}, [\ptr], #16
  .endm

  .macro STORE_ONE        num, base, index, acc2
          addp            v\num\().4s, v\num\().4s, v\num\().4s   // 4 lanes -> 2 partial sums (duplicated)
          addp            v\num\().4s, v\num\().4s, v\num\().4s   // every lane now holds the full sum
          sqrshrn         v\num\().4h, v\num\().4s, #15           // round, shift and saturate to 16 bit
          str             h\num\(), [\base, \index, lsl #1]
  .endm

  // 2-byte elements: es = 1
  resample_one s16, 1
  /*
   * Helper macros for the s32 instantiation: signed 32-bit elements, with
   * products accumulated in 64-bit lanes.
   *
   *   LOADn      load n consecutive elements from [ptr] and advance ptr by
   *              n * 4 bytes. LOAD1 and LOAD2 take a register number,
   *              LOAD4 and LOAD8 take full vector register names. LOAD4
   *              fills only its first register and LOAD8 only its first
   *              two; the remaining register arguments are accepted and
   *              ignored.
   *   M_MUL      lo  = widening signed product of the low two 32-bit lanes;
   *              when a fourth argument is given, it receives the product
   *              of the high two lanes
   *   M_MLA      same as M_MUL but accumulating into lo (and hi if given)
   *   STORE_ONE  add the second accumulator to v<num>, sum the two 64-bit
   *              lanes, narrow with rounding and saturation after a right
   *              shift by 30, and store the 32-bit result at
   *              base + index * 4
   *
   * M_MUL2 and M_MLA2 are not defined here; resample_one supplies empty
   * versions for every format other than dbl.
   */
  .macro M_MUL            lo, lhs, rhs, hi:vararg
          smull           \lo\().2d, \lhs\().2s, \rhs\().2s
  .ifnb \hi
          smull2          \hi\().2d, \lhs\().4s, \rhs\().4s       // upper two lanes
  .endif
  .endm
  .macro M_MLA            lo, lhs, rhs, hi:vararg
          smlal           \lo\().2d, \lhs\().2s, \rhs\().2s
  .ifnb \hi
          smlal2          \hi\().2d, \lhs\().4s, \rhs\().4s       // upper two lanes
  .endif
  .endm

  .macro LOAD1            num, ptr
          ldr             s\num, [\ptr], #4
  .endm
  .macro LOAD2            num, ptr
          ld1             {v\num\().2s}, [\ptr], #8
  .endm
  .macro LOAD4            r0, unused, ptr
          ld1             {\r0\().4s}, [\ptr], #16
  .endm
  .macro LOAD8            r0, r1, unused0, unused1, ptr
          ld1             {\r0\().4s,\r1\().4s}, [\ptr], #32
  .endm

  .macro STORE_ONE        num, base, index, acc2
          add             v\num\().2d, v\num\().2d, \acc2\().2d   // merge both accumulators
          addp            d\num\(), v\num\().2d                   // lane 0 + lane 1
          sqrshrn         v\num\().2s, v\num\().2d, #30           // round, shift and saturate to 32 bit
          str             s\num\(), [\base, \index, lsl #2]
  .endm

  // 4-byte elements: default es = 2
  resample_one s32