/*
 * Copyright (c) 2014 Peter Meerwald <pmeerw@pmeerw.net>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/arm/asm.S"
#include "asm-offsets.h"
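
/*
 * Register usage below follows the AAPCS. Judging from the loads, the C
 * prototype is roughly the following sketch (argument names and the exact
 * libswresample declaration are assumptions, not taken from this file):
 *
 *   void ff_resample_one_<fmt>_neon(ResampleContext *c,  // r0
 *                                   void *dst,           // r1
 *                                   int dst_index,       // r2
 *                                   const void *src,     // r3
 *                                   unsigned int index,  // [sp]    -> [sp, #8]  after push
 *                                   int frac);           // [sp, #4] -> [sp, #12], linear only
 *
 * PHASE_SHIFT, FILTER_LENGTH, FILTER_BANK and SRC_INCR are byte offsets
 * into ResampleContext generated into asm-offsets.h.
 */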

.macro resample_one fmt, es=2
function ff_resample_one_\fmt\()_neon, export=1
        push            {r4, r5}
        add             r1, r1, r2, lsl #\es
        ldr             r2, [r0, #PHASE_SHIFT+4]        /* phase_mask */
        ldr             ip, [sp, #8]                    /* index */
        ldr             r5, [r0, #FILTER_LENGTH]
        and             r2, ip, r2                      /* (index & phase_mask) */
        ldr             r4, [r0, #PHASE_SHIFT]
        lsr             r4, ip, r4                      /* compute sample_index */
        mul             r2, r2, r5
        ldr             ip, [r0, #FILTER_BANK]
        add             r3, r3, r4, lsl #\es            /* &src[sample_index] */
        cmp             r5, #8
        add             r0, ip, r2, lsl #\es            /* filter = &filter_bank[...] */
        blt             5f
8:
        subs            r5, r5, #8
        LOAD4
        MUL4
7:
        LOAD4
        beq             6f
        cmp             r5, #8
        MLA4
        blt             4f
        subs            r5, r5, #8
        LOAD4
        MLA4
        b               7b
6:
        MLA4
        STORE
        pop             {r4, r5}
        bx              lr
5:
        INIT4
4:      /* remaining filter_length 1 to 7 */
        cmp             r5, #4
        blt             2f
        subs            r5, r5, #4
        LOAD4
        MLA4
        beq             0f
2:      /* remaining filter_length 1 to 3 */
        cmp             r5, #2
        blt             1f
        subs            r5, r5, #2
        LOAD2
        MLA2
        beq             0f
1:      /* remaining filter_length 1 */
        LOAD1
        MLA1
0:
        STORE
        pop             {r4, r5}
        bx              lr
endfunc

.purgem LOAD1
.purgem LOAD2
.purgem LOAD4
.purgem MLA1
.purgem MLA2
.purgem MLA4
.purgem MUL4
.purgem INIT4
.purgem STORE
.endm
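
/*
 * Each sample format defines its own LOAD1/LOAD2/LOAD4, MLA1/MLA2/MLA4,
 * MUL4, INIT4 and STORE macros, instantiates resample_one (and, for flt,
 * resample_linear below), and purges the macros again so that the next
 * format can redefine them.
 */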

/* float32 */
.macro LOAD1
        veor.32         d0, d0
        vld1.32         {d0[0]}, [r0]!          /* load filter */
        vld1.32         {d4[0]}, [r3]!          /* load src */
.endm

.macro LOAD2
        vld1.32         {d0}, [r0]!             /* load filter */
        vld1.32         {d4}, [r3]!             /* load src */
.endm

.macro LOAD4
        vld1.32         {d0,d1}, [r0]!          /* load filter */
        vld1.32         {d4,d5}, [r3]!          /* load src */
.endm

.macro MLA1
        vmla.f32        d16, d0, d4[0]
.endm

.macro MLA2
        vmla.f32        d16, d0, d4
.endm

.macro MLA4
        vmla.f32        d16, d0, d4
        vmla.f32        d17, d1, d5
.endm

.macro MUL4
        vmul.f32        d16, d0, d4
        vmul.f32        d17, d1, d5
.endm

.macro INIT4
        veor.f32        q8, q8
.endm

.macro STORE
        vpadd.f32       d16, d16, d17
        vpadd.f32       d16, d16, d16
        vst1.32         {d16[0]}, [r1]
.endm

resample_one flt, 2
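
/*
 * For reference, a minimal C sketch of what the flt instantiation above
 * computes per output sample (illustrative names, not the exact FFmpeg
 * code):
 *
 *   sample_index = index >> c->phase_shift;
 *   filter = &c->filter_bank[(index & c->phase_mask) * c->filter_length];
 *   float val = 0;
 *   for (i = 0; i < c->filter_length; i++)
 *       val += src[sample_index + i] * filter[i];
 *   dst[dst_index] = val;
 */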

/* s32 */
.macro LOAD1
        veor.32         d0, d0
        vld1.32         {d0[0]}, [r0]!          /* load filter */
        vld1.32         {d4[0]}, [r3]!          /* load src */
.endm

.macro LOAD2
        vld1.32         {d0}, [r0]!             /* load filter */
        vld1.32         {d4}, [r3]!             /* load src */
.endm

.macro LOAD4
        vld1.32         {d0,d1}, [r0]!          /* load filter */
        vld1.32         {d4,d5}, [r3]!          /* load src */
.endm

.macro MLA1
        vmlal.s32       q8, d0, d4[0]
.endm

.macro MLA2
        vmlal.s32       q8, d0, d4
.endm

.macro MLA4
        vmlal.s32       q8, d0, d4
        vmlal.s32       q9, d1, d5
.endm

.macro MUL4
        vmull.s32       q8, d0, d4
        vmull.s32       q9, d1, d5
.endm

.macro INIT4
        veor.s64        q8, q8
        veor.s64        q9, q9
.endm

.macro STORE
        vadd.s64        q8, q8, q9
        vadd.s64        d16, d16, d17
        vqrshrn.s64     d16, q8, #30
        vst1.32         {d16[0]}, [r1]
.endm

resample_one s32, 2
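
/*
 * The s32 path widens the 32x32 products to 64 bits in q8/q9 and narrows
 * the final sum with a rounding, saturating right shift by 30, i.e. the
 * integer filter coefficients are expected in Q30 format.
 */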

/* s16 */
.macro LOAD1
        veor.16         d0, d0
        vld1.16         {d0[0]}, [r0]!          /* load filter */
        vld1.16         {d4[0]}, [r3]!          /* load src */
.endm

.macro LOAD2
        veor.16         d0, d0
        vld1.32         {d0[0]}, [r0]!          /* load filter */
        veor.16         d4, d4
        vld1.32         {d4[0]}, [r3]!          /* load src */
.endm

.macro LOAD4
        vld1.16         {d0}, [r0]!             /* load filter */
        vld1.16         {d4}, [r3]!             /* load src */
.endm

.macro MLA1
        vmlal.s16       q8, d0, d4[0]
.endm

.macro MLA2
        vmlal.s16       q8, d0, d4
.endm

.macro MLA4
        vmlal.s16       q8, d0, d4
.endm

.macro MUL4
        vmull.s16       q8, d0, d4
.endm

.macro INIT4
        veor.s32        q8, q8
.endm

.macro STORE
        vpadd.s32       d16, d16, d17
        vpadd.s32       d16, d16, d16
        vqrshrn.s32     d16, q8, #15
        vst1.16         {d16[0]}, [r1]
.endm

resample_one s16, 1
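
/*
 * Likewise, the s16 path widens 16x16 products to 32 bits in q8 and
 * narrows with a rounding, saturating right shift by 15, i.e. Q15
 * coefficients.
 */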

.macro resample_linear fmt, es=2
function ff_resample_linear_\fmt\()_neon, export=1
        push            {r4, r5}
        add             r1, r1, r2, lsl #\es
        ldr             r2, [r0, #PHASE_SHIFT+4]        /* phase_mask */
        ldr             ip, [sp, #8]                    /* index */
        ldr             r5, [r0, #FILTER_LENGTH]
        and             r2, ip, r2                      /* (index & phase_mask) */
        ldr             r4, [r0, #PHASE_SHIFT]
        lsr             r4, ip, r4                      /* compute sample_index */
        mul             r2, r2, r5
        ldr             ip, [r0, #FILTER_BANK]
        add             r3, r3, r4, lsl #\es            /* &src[sample_index] */
        cmp             r5, #8
        ldr             r4, [r0, #SRC_INCR]
        add             r0, ip, r2, lsl #\es            /* filter = &filter_bank[...] */
        add             r2, r0, r5, lsl #\es            /* filter[... + c->filter_length] */
        blt             5f
8:
        subs            r5, r5, #8
        LOAD4
        MUL4
7:
        LOAD4
        beq             6f
        cmp             r5, #8
        MLA4
        blt             4f
        subs            r5, r5, #8
        LOAD4
        MLA4
        b               7b
6:
        MLA4
        STORE
        pop             {r4, r5}
        bx              lr
5:
        INIT4
4:      /* remaining filter_length 1 to 7 */
        cmp             r5, #4
        blt             2f
        subs            r5, r5, #4
        LOAD4
        MLA4
        beq             0f
2:      /* remaining filter_length 1 to 3 */
        cmp             r5, #2
        blt             1f
        subs            r5, r5, #2
        LOAD2
        MLA2
        beq             0f
1:      /* remaining filter_length 1 */
        LOAD1
        MLA1
0:
        STORE
        pop             {r4, r5}
        bx              lr
endfunc

.purgem LOAD1
.purgem LOAD2
.purgem LOAD4
.purgem MLA1
.purgem MLA2
.purgem MLA4
.purgem MUL4
.purgem INIT4
.purgem STORE
.endm
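
/*
 * resample_linear evaluates the filter at two adjacent phases: r0 walks
 * filter[] (accumulated into q9, "val") and r2 walks
 * filter[c->filter_length ...] (accumulated into q8, "v2"); STORE blends
 * the two sums with the fractional phase, as sketched after the flt
 * instantiation below.
 */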

/* float32 linear */
.macro LOAD1
        veor.32         d0, d0
        veor.32         d2, d2
        vld1.32         {d0[0]}, [r0]!          /* load filter */
        vld1.32         {d2[0]}, [r2]!          /* load filter */
        vld1.32         {d4[0]}, [r3]!          /* load src */
.endm

.macro LOAD2
        vld1.32         {d0}, [r0]!             /* load filter */
        vld1.32         {d2}, [r2]!             /* load filter */
        vld1.32         {d4}, [r3]!             /* load src */
.endm

.macro LOAD4
        vld1.32         {d0,d1}, [r0]!          /* load filter */
        vld1.32         {d2,d3}, [r2]!          /* load filter */
        vld1.32         {d4,d5}, [r3]!          /* load src */
.endm

.macro MLA1
        vmla.f32        d18, d0, d4[0]
        vmla.f32        d16, d2, d4[0]
.endm

.macro MLA2
        vmla.f32        d18, d0, d4
        vmla.f32        d16, d2, d4
.endm

.macro MLA4
        vmla.f32        q9, q0, q2
        vmla.f32        q8, q1, q2
.endm

.macro MUL4
        vmul.f32        q9, q0, q2
        vmul.f32        q8, q1, q2
.endm

.macro INIT4
        veor.f32        q9, q9
        veor.f32        q8, q8
.endm

.macro STORE
        vldr            s0, [sp, #12]           /* frac */
        vmov            s1, r4
        vcvt.f32.s32    d0, d0
        vsub.f32        q8, q8, q9              /* v2 - val */
        vpadd.f32       d18, d18, d19
        vpadd.f32       d16, d16, d17
        vpadd.f32       d2, d18, d18
        vpadd.f32       d1, d16, d16
        vmul.f32        s2, s2, s0              /* (v2 - val) * frac */
        vdiv.f32        s2, s2, s1              /* / c->src_incr */
        vadd.f32        s4, s4, s2
        vstr            s4, [r1]
.endm

resample_linear flt, 2
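
/*
 * A minimal C sketch of the linear flt variant above (illustrative names,
 * not the exact FFmpeg code):
 *
 *   float val = 0, v2 = 0;
 *   for (i = 0; i < c->filter_length; i++) {
 *       val += src[sample_index + i] * filter[i];
 *       v2  += src[sample_index + i] * filter[i + c->filter_length];
 *   }
 *   dst[dst_index] = val + (v2 - val) * (float)frac / c->src_incr;
 */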