fmtconvert_neon.S 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392
  1. /*
  2. * ARM NEON optimised Format Conversion Utils
  3. * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
  4. *
  5. * This file is part of FFmpeg.
  6. *
  7. * FFmpeg is free software; you can redistribute it and/or
  8. * modify it under the terms of the GNU Lesser General Public
  9. * License as published by the Free Software Foundation; either
  10. * version 2.1 of the License, or (at your option) any later version.
  11. *
  12. * FFmpeg is distributed in the hope that it will be useful,
  13. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  14. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  15. * Lesser General Public License for more details.
  16. *
  17. * You should have received a copy of the GNU Lesser General Public
  18. * License along with FFmpeg; if not, write to the Free Software
  19. * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  20. */
  21. #include "config.h"
  22. #include "libavutil/arm/asm.S"
  @ ff_float_to_int16_neon
  @ Converts an array of 32-bit floats to 16-bit integers.
  @
  @ Register roles as used by the code below (presumably dst, src, len in
  @ AAPCS argument order -- confirm against the C prototype):
  @   r0 = output pointer, written 8 x 16-bit at a time, post-incremented
  @   r1 = input pointer, read 4 floats at a time, post-incremented
  @   r2 = element count
  @   ip = scratch loop counter
  @
  @ Each value goes through vcvt.s32.f32 with 16 fraction bits (float to
  @ signed 32-bit fixed point) followed by vshrn.s32 by 16, which keeps
  @ bits 31..16 as the 16-bit result.
  @
  @ Every load and store carries a :128 alignment qualifier, so both
  @ pointers must be 16-byte aligned.
  @
  @ The first 8 floats are loaded before any count check and the remaining
  @ work is done in groups of 16 or 8, so the count is expected to be a
  @ multiple of 8 and at least 8.
  @
  @ Pipelining: q8/q9 always hold 8 converted values that have not been
  @ narrowed and stored yet when control reaches labels 1, 2 or 3.
  23. function ff_float_to_int16_neon, export=1
  24. subs r2, r2, #8    @ r2 = count - 8; Z is set when there are exactly 8 elements
  25. vld1.64 {d0-d1}, [r1,:128]!
  26. vcvt.s32.f32 q8, q0, #16
  27. vld1.64 {d2-d3}, [r1,:128]!
  28. vcvt.s32.f32 q9, q1, #16
  29. beq 3f    @ exactly 8 elements: flush q8/q9 and return (flags from the subs above)
  30. bics ip, r2, #15    @ ip = remaining count rounded down to a multiple of 16
  31. beq 2f    @ fewer than 16 remain: go to the 16-output tail
  @ Main loop: stores 16 results and loads/converts 16 new inputs per pass.
  32. 1: subs ip, ip, #16
  33. vshrn.s32 d4, q8, #16
  34. vld1.64 {d0-d1}, [r1,:128]!
  35. vcvt.s32.f32 q0, q0, #16
  36. vshrn.s32 d5, q9, #16
  37. vld1.64 {d2-d3}, [r1,:128]!
  38. vcvt.s32.f32 q1, q1, #16
  39. vshrn.s32 d6, q0, #16
  40. vst1.64 {d4-d5}, [r0,:128]!
  41. vshrn.s32 d7, q1, #16
  42. vld1.64 {d16-d17},[r1,:128]!
  43. vcvt.s32.f32 q8, q8, #16
  44. vld1.64 {d18-d19},[r1,:128]!
  45. vcvt.s32.f32 q9, q9, #16
  46. vst1.64 {d6-d7}, [r0,:128]!
  47. bne 1b    @ flags still come from the subs at the top of the loop
  48. ands r2, r2, #15    @ anything left beyond the multiples of 16?
  49. beq 3f    @ no: only the 8 pending values in q8/q9 remain
  @ Tail for 16 outputs: the 8 pending values plus 8 newly loaded ones.
  50. 2: vld1.64 {d0-d1}, [r1,:128]!
  51. vshrn.s32 d4, q8, #16
  52. vcvt.s32.f32 q0, q0, #16
  53. vld1.64 {d2-d3}, [r1,:128]!
  54. vshrn.s32 d5, q9, #16
  55. vcvt.s32.f32 q1, q1, #16
  56. vshrn.s32 d6, q0, #16
  57. vst1.64 {d4-d5}, [r0,:128]!
  58. vshrn.s32 d7, q1, #16
  59. vst1.64 {d6-d7}, [r0,:128]!
  60. bx lr
  @ Tail for 8 outputs: narrow and store the pending values in q8/q9.
  61. 3: vshrn.s32 d4, q8, #16
  62. vshrn.s32 d5, q9, #16
  63. vst1.64 {d4-d5}, [r0,:128]!
  64. bx lr
  65. endfunc
  @ ff_float_to_int16_interleave_neon
  @ Converts several per-channel float buffers to 16-bit integers and
  @ writes them interleaved into a single output buffer.
  @
  @ Register roles as used by the code below (presumably dst, src, len,
  @ channels in AAPCS argument order -- confirm against the C prototype):
  @   r0 = output buffer
  @   r1 = table of per-channel input pointers
  @   r2 = number of samples per channel
  @   r3 = number of channels
  @
  @ Paths:
  @   fewer than 2 channels : tail-branch to ff_float_to_int16_neon
  @   exactly 2 channels    : dedicated path with contiguous 128-bit stores
  @   more than 2 channels  : generic path that handles the channels in
  @                           groups of 4, then 2, then 1, using strided
  @                           stores (stride ip = channels * 2 bytes)
  @
  @ Conversion is vcvt.s32.f32 with 16 fraction bits; the 16-bit result is
  @ the upper half of each 32-bit lane. vsri.32 by 16 packs the upper half
  @ of one channel into the lower half of the lanes of the next channel.
  @
  @ Input loads carry a :128 alignment qualifier, so every channel buffer
  @ must be 16-byte aligned. Samples are consumed in groups of 8, so the
  @ sample count is expected to be a multiple of 8 and at least 8.
  @
  @ Numeric labels 4, 6, 7 and 8 are reused several times; each Nf / Nb
  @ reference binds to the nearest definition in that direction.
  66. function ff_float_to_int16_interleave_neon, export=1
  67. cmp r3, #2
  68. itt lt
  69. ldrlt r1, [r1]    @ fewer than 2 channels: r1 = first channel pointer
  70. blt X(ff_float_to_int16_neon)    @ plain branch, so that routine returns to our caller
  71. bne 4f    @ more than 2 channels: generic path
  @ ---- exactly 2 channels ----
  72. ldr r3, [r1]    @ r3 = channel 0 input
  73. ldr r1, [r1, #4]    @ r1 = channel 1 input
  74. subs r2, r2, #8    @ Z is set when there are exactly 8 samples
  75. vld1.64 {d0-d1}, [r3,:128]!
  76. vcvt.s32.f32 q8, q0, #16
  77. vld1.64 {d2-d3}, [r3,:128]!
  78. vcvt.s32.f32 q9, q1, #16
  79. vld1.64 {d20-d21},[r1,:128]!
  80. vcvt.s32.f32 q10, q10, #16
  81. vld1.64 {d22-d23},[r1,:128]!
  82. vcvt.s32.f32 q11, q11, #16
  83. beq 3f    @ exactly 8 samples: pack and store them
  84. bics ip, r2, #15    @ ip = remaining samples rounded down to a multiple of 16
  85. beq 2f
  @ Main 2-channel loop: 16 samples per channel per pass. On entry q8/q9
  @ hold 8 converted channel 0 samples and q10/q11 the matching channel 1
  @ samples.
  86. 1: subs ip, ip, #16
  87. vld1.64 {d0-d1}, [r3,:128]!
  88. vcvt.s32.f32 q0, q0, #16
  89. vsri.32 q10, q8, #16    @ lane = channel 1 in the upper half, channel 0 in the lower half
  90. vld1.64 {d2-d3}, [r3,:128]!
  91. vcvt.s32.f32 q1, q1, #16
  92. vld1.64 {d24-d25},[r1,:128]!
  93. vcvt.s32.f32 q12, q12, #16
  94. vld1.64 {d26-d27},[r1,:128]!
  95. vsri.32 q11, q9, #16
  96. vst1.64 {d20-d21},[r0,:128]!
  97. vcvt.s32.f32 q13, q13, #16
  98. vst1.64 {d22-d23},[r0,:128]!
  99. vsri.32 q12, q0, #16
  100. vld1.64 {d16-d17},[r3,:128]!
  101. vsri.32 q13, q1, #16
  102. vst1.64 {d24-d25},[r0,:128]!
  103. vcvt.s32.f32 q8, q8, #16
  104. vld1.64 {d18-d19},[r3,:128]!
  105. vcvt.s32.f32 q9, q9, #16
  106. vld1.64 {d20-d21},[r1,:128]!
  107. vcvt.s32.f32 q10, q10, #16
  108. vld1.64 {d22-d23},[r1,:128]!
  109. vcvt.s32.f32 q11, q11, #16
  110. vst1.64 {d26-d27},[r0,:128]!
  111. bne 1b    @ flags still come from the subs at the top of the loop
  112. ands r2, r2, #15
  113. beq 3f    @ nothing beyond the 8 pending samples
  @ 2-channel tail for 16 samples: 8 pending plus 8 newly loaded.
  114. 2: vsri.32 q10, q8, #16
  115. vld1.64 {d0-d1}, [r3,:128]!
  116. vcvt.s32.f32 q0, q0, #16
  117. vld1.64 {d2-d3}, [r3,:128]!
  118. vcvt.s32.f32 q1, q1, #16
  119. vld1.64 {d24-d25},[r1,:128]!
  120. vcvt.s32.f32 q12, q12, #16
  121. vsri.32 q11, q9, #16
  122. vld1.64 {d26-d27},[r1,:128]!
  123. vcvt.s32.f32 q13, q13, #16
  124. vst1.64 {d20-d21},[r0,:128]!
  125. vsri.32 q12, q0, #16
  126. vst1.64 {d22-d23},[r0,:128]!
  127. vsri.32 q13, q1, #16
  128. vst1.64 {d24-d27},[r0,:128]!
  129. bx lr
  @ 2-channel tail for 8 samples: pack and store the pending values.
  130. 3: vsri.32 q10, q8, #16
  131. vsri.32 q11, q9, #16
  132. vst1.64 {d20-d23},[r0,:128]!
  133. bx lr
  @ ---- more than 2 channels ----
  @ r4-r8 and lr are used as scratch here, so they are saved first; every
  @ exit from this part pops them again and returns through pc.
  134. 4: push {r4-r8,lr}
  135. cmp r3, #4
  136. lsl ip, r3, #1    @ ip = channels * 2 = output stride in bytes; flags are not changed
  137. blt 4f    @ fewer than 4 channels: continue at the 2-channel section
  138. @ 4 channels
  139. 5: ldmia r1!, {r4-r7}    @ next four channel pointers, r1 advances past them
  140. mov lr, r2    @ lr = samples left for this channel group
  141. mov r8, r0    @ r8 = output cursor for this channel group
  142. vld1.64 {d16-d17},[r4,:128]!
  143. vcvt.s32.f32 q8, q8, #16
  144. vld1.64 {d18-d19},[r5,:128]!
  145. vcvt.s32.f32 q9, q9, #16
  146. vld1.64 {d20-d21},[r6,:128]!
  147. vcvt.s32.f32 q10, q10, #16
  148. vld1.64 {d22-d23},[r7,:128]!
  149. vcvt.s32.f32 q11, q11, #16
  @ 4-channel loop: 8 samples of each channel per pass, as two halves of 4.
  150. 6: subs lr, lr, #8
  151. vld1.64 {d0-d1}, [r4,:128]!
  152. vcvt.s32.f32 q0, q0, #16
  153. vsri.32 q9, q8, #16    @ q9 lanes = first and second channel of the group
  154. vld1.64 {d2-d3}, [r5,:128]!
  155. vcvt.s32.f32 q1, q1, #16
  156. vsri.32 q11, q10, #16    @ q11 lanes = third and fourth channel of the group
  157. vld1.64 {d4-d5}, [r6,:128]!
  158. vcvt.s32.f32 q2, q2, #16
  159. vzip.32 d18, d22    @ after both zips d18, d22, d19, d23 each hold all four
  160. vld1.64 {d6-d7}, [r7,:128]!
  161. vcvt.s32.f32 q3, q3, #16
  162. vzip.32 d19, d23    @ channels of one sample index, in that order
  163. vst1.64 {d18}, [r8], ip
  164. vsri.32 q1, q0, #16
  165. vst1.64 {d22}, [r8], ip
  166. vsri.32 q3, q2, #16
  167. vst1.64 {d19}, [r8], ip
  168. vzip.32 d2, d6
  169. vst1.64 {d23}, [r8], ip
  170. vzip.32 d3, d7
  171. beq 7f    @ counter reached zero: store the second half without reloading
  172. vld1.64 {d16-d17},[r4,:128]!
  173. vcvt.s32.f32 q8, q8, #16
  174. vst1.64 {d2}, [r8], ip
  175. vld1.64 {d18-d19},[r5,:128]!
  176. vcvt.s32.f32 q9, q9, #16
  177. vst1.64 {d6}, [r8], ip
  178. vld1.64 {d20-d21},[r6,:128]!
  179. vcvt.s32.f32 q10, q10, #16
  180. vst1.64 {d3}, [r8], ip
  181. vld1.64 {d22-d23},[r7,:128]!
  182. vcvt.s32.f32 q11, q11, #16
  183. vst1.64 {d7}, [r8], ip
  184. b 6b
  185. 7: vst1.64 {d2}, [r8], ip
  186. vst1.64 {d6}, [r8], ip
  187. vst1.64 {d3}, [r8], ip
  188. vst1.64 {d7}, [r8], ip
  189. subs r3, r3, #4    @ four channels finished
  190. it eq
  191. popeq {r4-r8,pc}    @ no channels left: return
  192. cmp r3, #4
  193. add r0, r0, #8    @ skip the 4 x 2 bytes just filled; flags of the cmp are kept
  194. bge 5b    @ at least 4 more channels: run another 4-channel group
  195. @ 2 channels
  196. 4: cmp r3, #2
  197. blt 4f    @ fewer than 2 channels left: go to the 1-channel section
  198. ldmia r1!, {r4-r5}    @ next two channel pointers
  199. mov lr, r2
  200. mov r8, r0
  201. tst lr, #8    @ is bit 3 of the count set? Tested by the beq below
  202. vld1.64 {d16-d17},[r4,:128]!
  203. vcvt.s32.f32 q8, q8, #16
  204. vld1.64 {d18-d19},[r5,:128]!
  205. vcvt.s32.f32 q9, q9, #16
  206. vld1.64 {d20-d21},[r4,:128]!
  207. vcvt.s32.f32 q10, q10, #16
  208. vld1.64 {d22-d23},[r5,:128]!
  209. vcvt.s32.f32 q11, q11, #16
  210. beq 6f    @ bit 3 clear: go straight to the 16-sample loop
  211. subs lr, lr, #8
  212. beq 7f    @ exactly 8 samples: store them and finish this pair
  @ Leading group of 8: store it and preload the next 8 samples for the loop.
  213. vsri.32 d18, d16, #16
  214. vsri.32 d19, d17, #16
  215. vld1.64 {d16-d17},[r4,:128]!
  216. vcvt.s32.f32 q8, q8, #16
  217. vst1.32 {d18[0]}, [r8], ip    @ one 32-bit lane = one sample of both channels
  218. vsri.32 d22, d20, #16
  219. vst1.32 {d18[1]}, [r8], ip
  220. vsri.32 d23, d21, #16
  221. vst1.32 {d19[0]}, [r8], ip
  222. vst1.32 {d19[1]}, [r8], ip
  223. vld1.64 {d18-d19},[r5,:128]!
  224. vcvt.s32.f32 q9, q9, #16
  225. vst1.32 {d22[0]}, [r8], ip
  226. vst1.32 {d22[1]}, [r8], ip
  227. vld1.64 {d20-d21},[r4,:128]!
  228. vcvt.s32.f32 q10, q10, #16
  229. vst1.32 {d23[0]}, [r8], ip
  230. vst1.32 {d23[1]}, [r8], ip
  231. vld1.64 {d22-d23},[r5,:128]!
  232. vcvt.s32.f32 q11, q11, #16
  @ 2-channel strided loop: 16 samples per pass. The first 8 come from
  @ q8-q11 (already converted), the second 8 are loaded into q0-q3 here.
  233. 6: subs lr, lr, #16
  234. vld1.64 {d0-d1}, [r4,:128]!
  235. vcvt.s32.f32 q0, q0, #16
  236. vsri.32 d18, d16, #16
  237. vld1.64 {d2-d3}, [r5,:128]!
  238. vcvt.s32.f32 q1, q1, #16
  239. vsri.32 d19, d17, #16
  240. vld1.64 {d4-d5}, [r4,:128]!
  241. vcvt.s32.f32 q2, q2, #16
  242. vld1.64 {d6-d7}, [r5,:128]!
  243. vcvt.s32.f32 q3, q3, #16
  244. vst1.32 {d18[0]}, [r8], ip
  245. vsri.32 d22, d20, #16
  246. vst1.32 {d18[1]}, [r8], ip
  247. vsri.32 d23, d21, #16
  248. vst1.32 {d19[0]}, [r8], ip
  249. vsri.32 d2, d0, #16
  250. vst1.32 {d19[1]}, [r8], ip
  251. vsri.32 d3, d1, #16
  252. vst1.32 {d22[0]}, [r8], ip
  253. vsri.32 d6, d4, #16
  254. vst1.32 {d22[1]}, [r8], ip
  255. vsri.32 d7, d5, #16
  256. vst1.32 {d23[0]}, [r8], ip
  257. vst1.32 {d23[1]}, [r8], ip
  258. beq 6f    @ counter reached zero: store the last 8 at the next label 6
  259. vld1.64 {d16-d17},[r4,:128]!
  260. vcvt.s32.f32 q8, q8, #16
  261. vst1.32 {d2[0]}, [r8], ip
  262. vst1.32 {d2[1]}, [r8], ip
  263. vld1.64 {d18-d19},[r5,:128]!
  264. vcvt.s32.f32 q9, q9, #16
  265. vst1.32 {d3[0]}, [r8], ip
  266. vst1.32 {d3[1]}, [r8], ip
  267. vld1.64 {d20-d21},[r4,:128]!
  268. vcvt.s32.f32 q10, q10, #16
  269. vst1.32 {d6[0]}, [r8], ip
  270. vst1.32 {d6[1]}, [r8], ip
  271. vld1.64 {d22-d23},[r5,:128]!
  272. vcvt.s32.f32 q11, q11, #16
  273. vst1.32 {d7[0]}, [r8], ip
  274. vst1.32 {d7[1]}, [r8], ip
  275. bgt 6b    @ back to the loop head; flags still come from its subs
  @ Final 8 samples of the pair, already packed in d2, d3, d6, d7.
  276. 6: vst1.32 {d2[0]}, [r8], ip
  277. vst1.32 {d2[1]}, [r8], ip
  278. vst1.32 {d3[0]}, [r8], ip
  279. vst1.32 {d3[1]}, [r8], ip
  280. vst1.32 {d6[0]}, [r8], ip
  281. vst1.32 {d6[1]}, [r8], ip
  282. vst1.32 {d7[0]}, [r8], ip
  283. vst1.32 {d7[1]}, [r8], ip
  284. b 8f
  @ Pair with exactly 8 samples: pack q8-q11 and store.
  285. 7: vsri.32 d18, d16, #16
  286. vsri.32 d19, d17, #16
  287. vst1.32 {d18[0]}, [r8], ip
  288. vsri.32 d22, d20, #16
  289. vst1.32 {d18[1]}, [r8], ip
  290. vsri.32 d23, d21, #16
  291. vst1.32 {d19[0]}, [r8], ip
  292. vst1.32 {d19[1]}, [r8], ip
  293. vst1.32 {d22[0]}, [r8], ip
  294. vst1.32 {d22[1]}, [r8], ip
  295. vst1.32 {d23[0]}, [r8], ip
  296. vst1.32 {d23[1]}, [r8], ip
  297. 8: subs r3, r3, #2    @ two channels finished
  298. add r0, r0, #4    @ skip the 2 x 2 bytes just filled; flags of the subs are kept
  299. it eq
  300. popeq {r4-r8,pc}    @ no channels left: return, otherwise fall through
  301. @ 1 channel
  302. 4: ldr r4, [r1],#4    @ next channel pointer
  303. tst r2, #8    @ is bit 3 of the count set? Tested by the bne below
  304. mov lr, r2
  305. mov r5, r0    @ r5 = output cursor for this channel
  306. vld1.64 {d0-d1}, [r4,:128]!
  307. vcvt.s32.f32 q0, q0, #16
  308. vld1.64 {d2-d3}, [r4,:128]!
  309. vcvt.s32.f32 q1, q1, #16
  310. bne 8f    @ bit 3 set: handle a leading group of 8 first
  @ 1-channel loop: 16 samples per pass, 8 from q0/q1 and 8 from q2/q3.
  311. 6: subs lr, lr, #16
  312. vld1.64 {d4-d5}, [r4,:128]!
  313. vcvt.s32.f32 q2, q2, #16
  314. vld1.64 {d6-d7}, [r4,:128]!
  315. vcvt.s32.f32 q3, q3, #16
  316. vst1.16 {d0[1]}, [r5,:16], ip    @ odd 16-bit lanes are the upper halves of the 32-bit values
  317. vst1.16 {d0[3]}, [r5,:16], ip
  318. vst1.16 {d1[1]}, [r5,:16], ip
  319. vst1.16 {d1[3]}, [r5,:16], ip
  320. vst1.16 {d2[1]}, [r5,:16], ip
  321. vst1.16 {d2[3]}, [r5,:16], ip
  322. vst1.16 {d3[1]}, [r5,:16], ip
  323. vst1.16 {d3[3]}, [r5,:16], ip
  324. beq 7f    @ counter reached zero: skip the preload for another pass
  325. vld1.64 {d0-d1}, [r4,:128]!
  326. vcvt.s32.f32 q0, q0, #16
  327. vld1.64 {d2-d3}, [r4,:128]!
  328. vcvt.s32.f32 q1, q1, #16
  329. 7: vst1.16 {d4[1]}, [r5,:16], ip
  330. vst1.16 {d4[3]}, [r5,:16], ip
  331. vst1.16 {d5[1]}, [r5,:16], ip
  332. vst1.16 {d5[3]}, [r5,:16], ip
  333. vst1.16 {d6[1]}, [r5,:16], ip
  334. vst1.16 {d6[3]}, [r5,:16], ip
  335. vst1.16 {d7[1]}, [r5,:16], ip
  336. vst1.16 {d7[3]}, [r5,:16], ip
  337. bgt 6b    @ flags still come from the subs at the loop head
  338. pop {r4-r8,pc}
  @ Leading group of 8 for the 1-channel case.
  339. 8: subs lr, lr, #8
  340. vst1.16 {d0[1]}, [r5,:16], ip
  341. vst1.16 {d0[3]}, [r5,:16], ip
  342. vst1.16 {d1[1]}, [r5,:16], ip
  343. vst1.16 {d1[3]}, [r5,:16], ip
  344. vst1.16 {d2[1]}, [r5,:16], ip
  345. vst1.16 {d2[3]}, [r5,:16], ip
  346. vst1.16 {d3[1]}, [r5,:16], ip
  347. vst1.16 {d3[3]}, [r5,:16], ip
  348. it eq
  349. popeq {r4-r8,pc}    @ exactly 8 samples: done
  350. vld1.64 {d0-d1}, [r4,:128]!
  351. vcvt.s32.f32 q0, q0, #16
  352. vld1.64 {d2-d3}, [r4,:128]!
  353. vcvt.s32.f32 q1, q1, #16
  354. b 6b    @ continue with the 16-sample loop
  355. endfunc
  @ ff_int32_to_float_fmul_scalar_neon
  @ Converts 32-bit signed integers to floats and multiplies each result
  @ by one scalar factor.
  @
  @ Register roles as used by the code below:
  @   r0 = output pointer, written 4 floats at a time, post-incremented
  @   r1 = input pointer, read 4 integers at a time, post-incremented
  @   scalar factor and element count: location depends on VFP / NOVFP
  @
  @ VFP and NOVFP are macros defined outside this file (presumably in the
  @ included asm.S, selecting the hard-float or soft-float argument
  @ passing variant -- confirm there). On VFP lines the factor is
  @ broadcast from d0[0] and the count is r2; on NOVFP lines the factor is
  @ broadcast from r2 and the count is r3.
  @
  @ Loads and stores carry a :128 alignment qualifier, so both pointers
  @ must be 16-byte aligned.
  @
  @ The first 8 integers are loaded before any count check and the loop
  @ exits only when the count reaches exactly zero after subtracting 8, so
  @ the count is expected to be a multiple of 8 and at least 8.
  356. function ff_int32_to_float_fmul_scalar_neon, export=1
  357. VFP vdup.32 q0, d0[0]
  358. VFP len .req r2
  359. NOVFP vdup.32 q0, r2
  360. NOVFP len .req r3
  361. vld1.32 {q1},[r1,:128]!
  362. vcvt.f32.s32 q3, q1
  363. vld1.32 {q2},[r1,:128]!
  364. vcvt.f32.s32 q8, q2
  @ Loop: q3/q8 hold 8 converted values on entry; they are scaled into
  @ q9/q10, and the stores are issued after the next 8 inputs are loaded.
  365. 1: subs len, len, #8
  366. pld [r1, #16]    @ prefetch hint for upcoming input
  367. vmul.f32 q9, q3, q0
  368. vmul.f32 q10, q8, q0
  369. beq 2f    @ count exhausted (flags from the subs above): store and return
  370. vld1.32 {q1},[r1,:128]!
  371. vcvt.f32.s32 q3, q1
  372. vld1.32 {q2},[r1,:128]!
  373. vcvt.f32.s32 q8, q2
  374. vst1.32 {q9}, [r0,:128]!
  375. vst1.32 {q10},[r0,:128]!
  376. b 1b
  @ Final 8 results.
  377. 2: vst1.32 {q9}, [r0,:128]!
  378. vst1.32 {q10},[r0,:128]!
  379. bx lr
  380. .unreq len
  381. endfunc