audio_convert_neon.S 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365
  1. /*
  2. * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
  3. *
  4. * This file is part of libswresample.
  5. *
  6. * libswresample is free software; you can redistribute it and/or
  7. * modify it under the terms of the GNU Lesser General Public
  8. * License as published by the Free Software Foundation; either
  9. * version 2.1 of the License, or (at your option) any later version.
  10. *
  11. * libswresample is distributed in the hope that it will be useful,
  12. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  14. * Lesser General Public License for more details.
  15. *
  16. * You should have received a copy of the GNU Lesser General Public
  17. * License along with libswresample; if not, write to the Free Software
  18. * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  19. */
  20. #include "config.h"
  21. #include "libavutil/arm/asm.S"
  @ swri_oldapi_conv_flt_to_s16_neon: float to signed 16-bit conversion (NEON).
  @
  @ Register roles, as used by the code below:
  @   r0  - output pointer; written 16 bytes at a time with a :128 alignment
  @         qualifier and post-incremented
  @   r1  - input pointer; read 16 bytes (4 floats) at a time with a :128
  @         alignment qualifier and post-incremented
  @   r2  - number of samples
  @   r12 - scratch: sample count handled by the 16-per-iteration main loop
  @ NEON registers touched: q0-q3, q8, q9.
  @
  @ Each float is converted to 32-bit fixed point with 31 fractional bits
  @ (vcvt ..., #31) and then narrowed to 16 bits with a saturating, rounding
  @ right shift by 16 (vqrshrn).
  @
  @ The first 8 samples are loaded unconditionally and every path stores
  @ whole groups of 8, so this presumably expects a count that is a positive
  @ multiple of 8 - TODO confirm against the C caller.
  @
  @ The code is software pipelined: on entry to labels 1, 2 and 3, q8/q9 hold
  @ 8 converted samples that have not been narrowed and stored yet.
  22. function swri_oldapi_conv_flt_to_s16_neon, export=1
  23. .L_swri_oldapi_conv_flt_to_s16_neon:    @ local entry point; the nch routine in this file also branches here
  24. subs r2, r2, #8                         @ r2 = count - 8; Z is consumed by the beq 3f below
  25. vld1.32 {q0}, [r1,:128]!                @ prime the pipeline with samples 0-3 ...
  26. vcvt.s32.f32 q8, q0, #31
  27. vld1.32 {q1}, [r1,:128]!                @ ... and samples 4-7
  28. vcvt.s32.f32 q9, q1, #31
  29. beq 3f                                  @ exactly 8 samples: only the pending group is left
  30. bics r12, r2, #15                       @ r12 = remaining count rounded down to a multiple of 16
  31. beq 2f                                  @ fewer than 16 remaining: skip the main loop
  @ Main loop: stores 16 samples per iteration (8 pending + 8 new) and
  @ preloads the following 8 into q8/q9.
  32. 1: subs r12, r12, #16                   @ Z is consumed by the bne 1b at the bottom
  33. vqrshrn.s32 d4, q8, #16                 @ narrow the pending group into q2 (d4/d5)
  34. vld1.32 {q0}, [r1,:128]!
  35. vcvt.s32.f32 q0, q0, #31
  36. vqrshrn.s32 d5, q9, #16
  37. vld1.32 {q1}, [r1,:128]!
  38. vcvt.s32.f32 q1, q1, #31
  39. vqrshrn.s32 d6, q0, #16                 @ narrow the new group into q3 (d6/d7)
  40. vst1.16 {q2}, [r0,:128]!                @ store the 8 previously pending samples
  41. vqrshrn.s32 d7, q1, #16
  42. vld1.32 {q8}, [r1,:128]!                @ preload the next pending group
  43. vcvt.s32.f32 q8, q8, #31
  44. vld1.32 {q9}, [r1,:128]!
  45. vcvt.s32.f32 q9, q9, #31
  46. vst1.16 {q3}, [r0,:128]!                @ store the 8 samples loaded in this iteration
  47. bne 1b
  48. ands r2, r2, #15                        @ samples left beyond the pending group
  49. beq 3f                                  @ none: flush the pending group only
  @ Tail for 16 samples: the pending group plus 8 freshly loaded samples.
  50. 2: vld1.32 {q0}, [r1,:128]!
  51. vqrshrn.s32 d4, q8, #16
  52. vcvt.s32.f32 q0, q0, #31
  53. vld1.32 {q1}, [r1,:128]!
  54. vqrshrn.s32 d5, q9, #16
  55. vcvt.s32.f32 q1, q1, #31
  56. vqrshrn.s32 d6, q0, #16
  57. vst1.16 {q2}, [r0,:128]!
  58. vqrshrn.s32 d7, q1, #16
  59. vst1.16 {q3}, [r0,:128]!
  60. bx lr
  @ Tail for 8 samples: narrow and store the pending group.
  61. 3: vqrshrn.s32 d4, q8, #16
  62. vqrshrn.s32 d5, q9, #16
  63. vst1.16 {q2}, [r0,:128]!
  64. bx lr
  65. endfunc
  @ swri_oldapi_conv_fltp_to_s16_2ch_neon: two planar float channels to
  @ interleaved signed 16-bit (NEON).
  @
  @ Register roles, as used by the code below:
  @   r0  - output pointer; written with a :128 alignment qualifier and
  @         post-incremented
  @   r1  - on entry, address of two consecutive channel pointers; the ldm
  @         replaces it with the first channel pointer
  @   r2  - number of samples per channel
  @   r3  - second channel pointer (loaded by the ldm)
  @   r12 - scratch: sample count handled by the 16-per-iteration main loop
  @ NEON registers touched: q0, q1, q8-q13.
  @
  @ Each float is converted to 32-bit fixed point with 31 fractional bits.
  @ vsri.32 b, a, #16 then leaves, in every 32-bit lane, the upper 16 bits of
  @ a in the low halfword and the upper 16 bits of b in the high halfword.
  @ With a from the first channel and b from the second, the vst1.16 stores
  @ write first-channel sample, second-channel sample, and so on. The upper
  @ 16 bits are taken as they are; there is no rounding step here.
  @
  @ The first 8 samples of both channels are loaded unconditionally and every
  @ path stores whole groups of 8 frames, so this presumably expects a count
  @ that is a positive multiple of 8 - TODO confirm against the C caller.
  @
  @ The code is software pipelined: on entry to labels 1, 2 and 3, q8/q9
  @ (first channel) and q10/q11 (second channel) hold 8 converted frames that
  @ have not been packed and stored yet.
  66. function swri_oldapi_conv_fltp_to_s16_2ch_neon, export=1
  67. .L_swri_oldapi_conv_fltp_to_s16_2ch_neon:    @ local entry point; the nch routine in this file also branches here
  68. ldm r1, {r1, r3}                        @ r1 = first channel, r3 = second channel
  69. subs r2, r2, #8                         @ r2 = count - 8; Z is consumed by the beq 3f below
  70. vld1.32 {q0}, [r1,:128]!                @ prime: first channel, samples 0-7 -> q8/q9
  71. vcvt.s32.f32 q8, q0, #31
  72. vld1.32 {q1}, [r1,:128]!
  73. vcvt.s32.f32 q9, q1, #31
  74. vld1.32 {q10}, [r3,:128]!               @ prime: second channel, samples 0-7 -> q10/q11
  75. vcvt.s32.f32 q10, q10, #31
  76. vld1.32 {q11}, [r3,:128]!
  77. vcvt.s32.f32 q11, q11, #31
  78. beq 3f                                  @ exactly 8 frames: only the pending group is left
  79. bics r12, r2, #15                       @ r12 = remaining count rounded down to a multiple of 16
  80. beq 2f                                  @ fewer than 16 remaining: skip the main loop
  @ Main loop: stores 16 frames per iteration (8 pending + 8 new) and
  @ preloads the following 8 frames into q8-q11.
  81. 1: subs r12, r12, #16                   @ Z is consumed by the bne 1b at the bottom
  82. vld1.32 {q0}, [r1,:128]!                @ first channel, new samples -> q0/q1
  83. vcvt.s32.f32 q0, q0, #31
  84. vsri.32 q10, q8, #16                    @ pack pending frames 0-3
  85. vld1.32 {q1}, [r1,:128]!
  86. vcvt.s32.f32 q1, q1, #31
  87. vld1.32 {q12}, [r3,:128]!               @ second channel, new samples -> q12/q13
  88. vcvt.s32.f32 q12, q12, #31
  89. vld1.32 {q13}, [r3,:128]!
  90. vsri.32 q11, q9, #16                    @ pack pending frames 4-7
  91. vst1.16 {q10}, [r0,:128]!
  92. vcvt.s32.f32 q13, q13, #31
  93. vst1.16 {q11}, [r0,:128]!
  94. vsri.32 q12, q0, #16                    @ pack the new frames
  95. vld1.32 {q8}, [r1,:128]!                @ preload the next pending group
  96. vsri.32 q13, q1, #16
  97. vst1.16 {q12}, [r0,:128]!
  98. vcvt.s32.f32 q8, q8, #31
  99. vld1.32 {q9}, [r1,:128]!
  100. vcvt.s32.f32 q9, q9, #31
  101. vld1.32 {q10}, [r3,:128]!
  102. vcvt.s32.f32 q10, q10, #31
  103. vld1.32 {q11}, [r3,:128]!
  104. vcvt.s32.f32 q11, q11, #31
  105. vst1.16 {q13}, [r0,:128]!
  106. bne 1b
  107. ands r2, r2, #15                       @ frames left beyond the pending group
  108. beq 3f                                 @ none: flush the pending group only
  @ Tail for 16 frames: the pending group plus 8 freshly loaded frames.
  109. 2: vsri.32 q10, q8, #16
  110. vld1.32 {q0}, [r1,:128]!
  111. vcvt.s32.f32 q0, q0, #31
  112. vld1.32 {q1}, [r1,:128]!
  113. vcvt.s32.f32 q1, q1, #31
  114. vld1.32 {q12}, [r3,:128]!
  115. vcvt.s32.f32 q12, q12, #31
  116. vsri.32 q11, q9, #16
  117. vld1.32 {q13}, [r3,:128]!
  118. vcvt.s32.f32 q13, q13, #31
  119. vst1.16 {q10}, [r0,:128]!
  120. vsri.32 q12, q0, #16
  121. vst1.16 {q11}, [r0,:128]!
  122. vsri.32 q13, q1, #16
  123. vst1.16 {q12-q13},[r0,:128]!
  124. bx lr
  @ Tail for 8 frames: pack and store the pending group.
  125. 3: vsri.32 q10, q8, #16
  126. vsri.32 q11, q9, #16
  127. vst1.16 {q10-q11},[r0,:128]!
  128. bx lr
  129. endfunc
  @ swri_oldapi_conv_fltp_to_s16_nch_neon: planar float channels to
  @ interleaved signed 16-bit for a run-time channel count (NEON).
  @
  @ Register roles, as used by the code below:
  @   r0  - output pointer; advanced by 2 bytes per finished channel so that
  @         it addresses the current channel group inside the first frame
  @   r1  - address of an array of channel pointers; advanced by the ldm
  @         instructions as channels are consumed
  @   r2  - number of samples per channel
  @   r3  - number of channels still to be converted
  @   r12 - r3 * 2, computed once at entry; used as the post-increment of
  @         every store, i.e. the byte distance between consecutive frames
  @   r4-r7 - channel pointers of the current pass
  @   r8 (r5 in the 1-channel pass) - output cursor of the current pass
  @   lr  - samples left in the current pass
  @ NEON registers touched: q0-q3, q8-q11.
  @
  @ Channel counts below 2 and equal to 2 are handed to
  @ .L_swri_oldapi_conv_flt_to_s16_neon and
  @ .L_swri_oldapi_conv_fltp_to_s16_2ch_neon by a tail branch. Otherwise the
  @ channels are converted in passes over the whole sample range: 4 channels
  @ per pass while at least 4 remain, then one pass of 2, then one pass of 1.
  @
  @ Each float is converted to 32-bit fixed point with 31 fractional bits and
  @ the upper 16 bits of the result are kept (vsri packing, or a store of the
  @ upper halfword lane). NOTE(review): there is no rounding step on these
  @ paths, whereas the vqrshrn used by the flt_to_s16 routine rounds.
  @
  @ Every pass loads its first 8 samples unconditionally and works in groups
  @ of 8, so the count is presumably a positive multiple of 8 - TODO confirm
  @ against the C caller.
  @
  @ Numeric labels are reused below; Nf / Nb bind to the nearest definition
  @ after / before the branch.
  130. function swri_oldapi_conv_fltp_to_s16_nch_neon, export=1
  131. cmp r3, #2                             @ dispatch on the channel count
  132. itt lt
  133. ldrlt r1, [r1]                         @ fewer than 2 channels: r1 = the single channel pointer
  134. blt .L_swri_oldapi_conv_flt_to_s16_neon
  135. beq .L_swri_oldapi_conv_fltp_to_s16_2ch_neon    @ exactly 2: r1 still addresses the pointer array
  136. push {r4-r8, lr}
  137. cmp r3, #4
  138. lsl r12, r3, #1                        @ r12 = frame stride in bytes; does not alter the flags
  139. blt 4f                                 @ fewer than 4 channels: go to the 2-channel pass
  140. @ 4 channels per pass, repeated while at least 4 channels remain
  141. 5: ldm r1!, {r4-r7}                    @ fetch the next four channel pointers
  142. mov lr, r2                             @ lr = samples left in this pass
  143. mov r8, r0                             @ r8 = output cursor
  144. vld1.32 {q8}, [r4,:128]!               @ prime q8-q11 with samples 0-3 of the four channels
  145. vcvt.s32.f32 q8, q8, #31
  146. vld1.32 {q9}, [r5,:128]!
  147. vcvt.s32.f32 q9, q9, #31
  148. vld1.32 {q10}, [r6,:128]!
  149. vcvt.s32.f32 q10, q10, #31
  150. vld1.32 {q11}, [r7,:128]!
  151. vcvt.s32.f32 q11, q11, #31
  @ Loop: 8 frames per iteration. q8-q11 hold 4 pending frames, q0-q3
  @ receive the next 4.
  152. 6: subs lr, lr, #8                     @ Z is consumed by the beq 7f below
  153. vld1.32 {q0}, [r4,:128]!
  154. vcvt.s32.f32 q0, q0, #31
  155. vsri.32 q9, q8, #16                    @ q9 lanes = (r4 channel, r5 channel) pairs
  156. vld1.32 {q1}, [r5,:128]!
  157. vcvt.s32.f32 q1, q1, #31
  158. vsri.32 q11, q10, #16                  @ q11 lanes = (r6 channel, r7 channel) pairs
  159. vld1.32 {q2}, [r6,:128]!
  160. vcvt.s32.f32 q2, q2, #31
  161. vzip.32 d18, d22                       @ d18 = frame 0 (all four channels), d22 = frame 1
  162. vld1.32 {q3}, [r7,:128]!
  163. vcvt.s32.f32 q3, q3, #31
  164. vzip.32 d19, d23                       @ d19 = frame 2, d23 = frame 3
  165. vst1.16 {d18}, [r8], r12               @ one 8-byte group per frame, then step to the next frame
  166. vsri.32 q1, q0, #16
  167. vst1.16 {d22}, [r8], r12
  168. vsri.32 q3, q2, #16
  169. vst1.16 {d19}, [r8], r12
  170. vzip.32 d2, d6                         @ d2 = frame 4, d6 = frame 5
  171. vst1.16 {d23}, [r8], r12
  172. vzip.32 d3, d7                         @ d3 = frame 6, d7 = frame 7
  173. beq 7f                                 @ pass finished: flush frames 4-7 without preloading
  174. vld1.32 {q8}, [r4,:128]!               @ preload the next 4 pending frames while storing frames 4-7
  175. vcvt.s32.f32 q8, q8, #31
  176. vst1.16 {d2}, [r8], r12
  177. vld1.32 {q9}, [r5,:128]!
  178. vcvt.s32.f32 q9, q9, #31
  179. vst1.16 {d6}, [r8], r12
  180. vld1.32 {q10}, [r6,:128]!
  181. vcvt.s32.f32 q10, q10, #31
  182. vst1.16 {d3}, [r8], r12
  183. vld1.32 {q11}, [r7,:128]!
  184. vcvt.s32.f32 q11, q11, #31
  185. vst1.16 {d7}, [r8], r12
  186. b 6b
  187. 7: vst1.16 {d2}, [r8], r12
  188. vst1.16 {d6}, [r8], r12
  189. vst1.16 {d3}, [r8], r12
  190. vst1.16 {d7}, [r8], r12
  191. subs r3, r3, #4                        @ four channels done
  192. it eq
  193. popeq {r4-r8, pc}                      @ no channels left: return
  194. cmp r3, #4
  195. add r0, r0, #8                         @ skip the 4 * 2 bytes just filled in each frame; flags unchanged
  196. bge 5b                                 @ at least 4 channels left: another 4-channel pass
  197. @ 2 channels (skipped when only one channel is left)
  198. 4: cmp r3, #2
  199. blt 4f                                 @ fewer than 2 left: go to the 1-channel pass
  200. ldm r1!, {r4-r5}                       @ fetch the next two channel pointers
  201. mov lr, r2
  202. mov r8, r0
  203. tst lr, #8                             @ Z set when bit 3 of the count is clear; consumed by the beq 6f below
  204. vld1.32 {q8}, [r4,:128]!               @ prime: r4 channel samples 0-3 -> q8, 4-7 -> q10
  205. vcvt.s32.f32 q8, q8, #31
  206. vld1.32 {q9}, [r5,:128]!               @ prime: r5 channel samples 0-3 -> q9, 4-7 -> q11
  207. vcvt.s32.f32 q9, q9, #31
  208. vld1.32 {q10}, [r4,:128]!
  209. vcvt.s32.f32 q10, q10, #31
  210. vld1.32 {q11}, [r5,:128]!
  211. vcvt.s32.f32 q11, q11, #31
  212. beq 6f                                 @ bit 3 clear: straight to the 16-frame loop
  213. subs lr, lr, #8                        @ bit 3 set: handle one group of 8 frames first
  214. beq 7f                                 @ exactly 8 frames: flush them and finish the pass
  215. vsri.32 d18, d16, #16                  @ pack frames 0-1; each 32-bit lane is one two-channel frame
  216. vsri.32 d19, d17, #16                  @ pack frames 2-3
  217. vld1.32 {q8}, [r4,:128]!               @ refill q8-q11 for the loop while storing
  218. vcvt.s32.f32 q8, q8, #31
  219. vst1.32 {d18[0]}, [r8], r12
  220. vsri.32 d22, d20, #16                  @ pack frames 4-5
  221. vst1.32 {d18[1]}, [r8], r12
  222. vsri.32 d23, d21, #16                  @ pack frames 6-7
  223. vst1.32 {d19[0]}, [r8], r12
  224. vst1.32 {d19[1]}, [r8], r12
  225. vld1.32 {q9}, [r5,:128]!
  226. vcvt.s32.f32 q9, q9, #31
  227. vst1.32 {d22[0]}, [r8], r12
  228. vst1.32 {d22[1]}, [r8], r12
  229. vld1.32 {q10}, [r4,:128]!
  230. vcvt.s32.f32 q10, q10, #31
  231. vst1.32 {d23[0]}, [r8], r12
  232. vst1.32 {d23[1]}, [r8], r12
  233. vld1.32 {q11}, [r5,:128]!
  234. vcvt.s32.f32 q11, q11, #31
  @ Loop: 16 frames per iteration. q8-q11 hold 8 pending frames, q0-q3
  @ receive the next 8.
  235. 6: subs lr, lr, #16                    @ flags are consumed by the beq 6f and bgt 6b below
  236. vld1.32 {q0}, [r4,:128]!
  237. vcvt.s32.f32 q0, q0, #31
  238. vsri.32 d18, d16, #16
  239. vld1.32 {q1}, [r5,:128]!
  240. vcvt.s32.f32 q1, q1, #31
  241. vsri.32 d19, d17, #16
  242. vld1.32 {q2}, [r4,:128]!
  243. vcvt.s32.f32 q2, q2, #31
  244. vld1.32 {q3}, [r5,:128]!
  245. vcvt.s32.f32 q3, q3, #31
  246. vst1.32 {d18[0]}, [r8], r12            @ store the 8 pending frames, one 4-byte lane per frame
  247. vsri.32 d22, d20, #16
  248. vst1.32 {d18[1]}, [r8], r12
  249. vsri.32 d23, d21, #16
  250. vst1.32 {d19[0]}, [r8], r12
  251. vsri.32 d2, d0, #16                    @ pack the 8 new frames into d2, d3, d6, d7
  252. vst1.32 {d19[1]}, [r8], r12
  253. vsri.32 d3, d1, #16
  254. vst1.32 {d22[0]}, [r8], r12
  255. vsri.32 d6, d4, #16
  256. vst1.32 {d22[1]}, [r8], r12
  257. vsri.32 d7, d5, #16
  258. vst1.32 {d23[0]}, [r8], r12
  259. vst1.32 {d23[1]}, [r8], r12
  260. beq 6f                                 @ pass finished: flush the new frames without preloading
  261. vld1.32 {q8}, [r4,:128]!               @ preload the next 8 pending frames while storing the new ones
  262. vcvt.s32.f32 q8, q8, #31
  263. vst1.32 {d2[0]}, [r8], r12
  264. vst1.32 {d2[1]}, [r8], r12
  265. vld1.32 {q9}, [r5,:128]!
  266. vcvt.s32.f32 q9, q9, #31
  267. vst1.32 {d3[0]}, [r8], r12
  268. vst1.32 {d3[1]}, [r8], r12
  269. vld1.32 {q10}, [r4,:128]!
  270. vcvt.s32.f32 q10, q10, #31
  271. vst1.32 {d6[0]}, [r8], r12
  272. vst1.32 {d6[1]}, [r8], r12
  273. vld1.32 {q11}, [r5,:128]!
  274. vcvt.s32.f32 q11, q11, #31
  275. vst1.32 {d7[0]}, [r8], r12
  276. vst1.32 {d7[1]}, [r8], r12
  277. bgt 6b
  278. 6: vst1.32 {d2[0]}, [r8], r12          @ final flush of the 2-channel pass
  279. vst1.32 {d2[1]}, [r8], r12
  280. vst1.32 {d3[0]}, [r8], r12
  281. vst1.32 {d3[1]}, [r8], r12
  282. vst1.32 {d6[0]}, [r8], r12
  283. vst1.32 {d6[1]}, [r8], r12
  284. vst1.32 {d7[0]}, [r8], r12
  285. vst1.32 {d7[1]}, [r8], r12
  286. b 8f
  287. 7: vsri.32 d18, d16, #16               @ count of exactly 8: pack and store the primed frames
  288. vsri.32 d19, d17, #16
  289. vst1.32 {d18[0]}, [r8], r12
  290. vsri.32 d22, d20, #16
  291. vst1.32 {d18[1]}, [r8], r12
  292. vsri.32 d23, d21, #16
  293. vst1.32 {d19[0]}, [r8], r12
  294. vst1.32 {d19[1]}, [r8], r12
  295. vst1.32 {d22[0]}, [r8], r12
  296. vst1.32 {d22[1]}, [r8], r12
  297. vst1.32 {d23[0]}, [r8], r12
  298. vst1.32 {d23[1]}, [r8], r12
  299. 8: subs r3, r3, #2                     @ two channels done
  300. add r0, r0, #4                         @ skip the 2 * 2 bytes just filled in each frame; flags unchanged
  301. it eq
  302. popeq {r4-r8, pc}                      @ no channels left: return
  303. @ 1 channel (the last remaining one)
  304. 4: ldr r4, [r1]                        @ r4 = the last channel pointer
  305. tst r2, #8                             @ Z clear when bit 3 of the count is set; consumed by the bne 8f below
  306. mov lr, r2
  307. mov r5, r0                             @ r5 = output cursor
  308. vld1.32 {q0}, [r4,:128]!               @ prime q0/q1 with samples 0-7
  309. vcvt.s32.f32 q0, q0, #31
  310. vld1.32 {q1}, [r4,:128]!
  311. vcvt.s32.f32 q1, q1, #31
  312. bne 8f                                 @ bit 3 set: handle one group of 8 samples first
  @ Loop: 16 samples per iteration. q0/q1 hold 8 pending samples, q2/q3
  @ receive the next 8. Lanes [1] and [3] of each d register are the upper
  @ halfwords of the two 32-bit results it holds.
  313. 6: subs lr, lr, #16                    @ flags are consumed by the beq 7f and bgt 6b below
  314. vld1.32 {q2}, [r4,:128]!
  315. vcvt.s32.f32 q2, q2, #31
  316. vld1.32 {q3}, [r4,:128]!
  317. vcvt.s32.f32 q3, q3, #31
  318. vst1.16 {d0[1]}, [r5,:16], r12         @ one 16-bit sample per frame, then step to the next frame
  319. vst1.16 {d0[3]}, [r5,:16], r12
  320. vst1.16 {d1[1]}, [r5,:16], r12
  321. vst1.16 {d1[3]}, [r5,:16], r12
  322. vst1.16 {d2[1]}, [r5,:16], r12
  323. vst1.16 {d2[3]}, [r5,:16], r12
  324. vst1.16 {d3[1]}, [r5,:16], r12
  325. vst1.16 {d3[3]}, [r5,:16], r12
  326. beq 7f                                 @ pass finished: skip the preload
  327. vld1.32 {q0}, [r4,:128]!               @ preload the next 8 pending samples
  328. vcvt.s32.f32 q0, q0, #31
  329. vld1.32 {q1}, [r4,:128]!
  330. vcvt.s32.f32 q1, q1, #31
  331. 7: vst1.16 {d4[1]}, [r5,:16], r12      @ store the 8 samples loaded at the top of this iteration
  332. vst1.16 {d4[3]}, [r5,:16], r12
  333. vst1.16 {d5[1]}, [r5,:16], r12
  334. vst1.16 {d5[3]}, [r5,:16], r12
  335. vst1.16 {d6[1]}, [r5,:16], r12
  336. vst1.16 {d6[3]}, [r5,:16], r12
  337. vst1.16 {d7[1]}, [r5,:16], r12
  338. vst1.16 {d7[3]}, [r5,:16], r12
  339. bgt 6b
  340. pop {r4-r8, pc}
  341. 8: subs lr, lr, #8                     @ leading group of 8; Z is consumed by the it eq below
  342. vst1.16 {d0[1]}, [r5,:16], r12
  343. vst1.16 {d0[3]}, [r5,:16], r12
  344. vst1.16 {d1[1]}, [r5,:16], r12
  345. vst1.16 {d1[3]}, [r5,:16], r12
  346. vst1.16 {d2[1]}, [r5,:16], r12
  347. vst1.16 {d2[3]}, [r5,:16], r12
  348. vst1.16 {d3[1]}, [r5,:16], r12
  349. vst1.16 {d3[3]}, [r5,:16], r12
  350. it eq
  351. popeq {r4-r8, pc}                      @ count of exactly 8: return
  352. vld1.32 {q0}, [r4,:128]!               @ re-prime q0/q1 and continue with the 16-sample loop
  353. vcvt.s32.f32 q0, q0, #31
  354. vld1.32 {q1}, [r4,:128]!
  355. vcvt.s32.f32 q1, q1, #31
  356. b 6b
  357. endfunc