@ float_dsp_vfp.S — ARM VFP optimized float DSP routines (FFmpeg)
/*
 * Copyright (c) 2008 Siarhei Siamashka <ssvb@users.sourceforge.net>
 *
 * This file is part of FFmpeg
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "config.h"
#include "asm.S"
  22. /**
  23. * Assume that len is a positive number and is multiple of 8
  24. */
  25. @ void ff_vector_fmul_vfp(float *dst, const float *src0, const float *src1, int len)
  26. function ff_vector_fmul_vfp, export=1
  27. vpush {d8-d15}
  28. fmrx r12, fpscr
  29. orr r12, r12, #(3 << 16) /* set vector size to 4 */
  30. fmxr fpscr, r12
  31. vldmia r1!, {s0-s3}
  32. vldmia r2!, {s8-s11}
  33. vldmia r1!, {s4-s7}
  34. vldmia r2!, {s12-s15}
  35. vmul.f32 s8, s0, s8
  36. 1:
  37. subs r3, r3, #16
  38. vmul.f32 s12, s4, s12
  39. itttt ge
  40. vldmiage r1!, {s16-s19}
  41. vldmiage r2!, {s24-s27}
  42. vldmiage r1!, {s20-s23}
  43. vldmiage r2!, {s28-s31}
  44. it ge
  45. vmulge.f32 s24, s16, s24
  46. vstmia r0!, {s8-s11}
  47. vstmia r0!, {s12-s15}
  48. it ge
  49. vmulge.f32 s28, s20, s28
  50. itttt gt
  51. vldmiagt r1!, {s0-s3}
  52. vldmiagt r2!, {s8-s11}
  53. vldmiagt r1!, {s4-s7}
  54. vldmiagt r2!, {s12-s15}
  55. ittt ge
  56. vmulge.f32 s8, s0, s8
  57. vstmiage r0!, {s24-s27}
  58. vstmiage r0!, {s28-s31}
  59. bgt 1b
  60. bic r12, r12, #(7 << 16) /* set vector size back to 1 */
  61. fmxr fpscr, r12
  62. vpop {d8-d15}
  63. bx lr
  64. endfunc
  65. /**
  66. * ARM VFP implementation of 'vector_fmul_window_c' function
  67. * Assume that len is a positive non-zero number
  68. */
  69. @ void ff_vector_fmul_window_vfp(float *dst, const float *src0,
  70. @ const float *src1, const float *win, int len)
  71. function ff_vector_fmul_window_vfp, export=1
  72. DST0 .req a1
  73. SRC0 .req a2
  74. SRC1 .req a3
  75. WIN0 .req a4
  76. LEN .req v1
  77. DST1 .req v2
  78. WIN1 .req v3
  79. OLDFPSCR .req ip
  80. push {v1-v3,lr}
  81. ldr LEN, [sp, #4*4+0]
  82. vpush {s16-s31}
  83. fmrx OLDFPSCR, FPSCR
  84. add DST1, DST0, LEN, lsl #3
  85. add SRC1, SRC1, LEN, lsl #2
  86. add WIN1, WIN0, LEN, lsl #3
  87. tst LEN, #7
  88. beq 4f @ common case: len is a multiple of 8
  89. ldr lr, =0x03000000 @ RunFast mode, scalar mode
  90. fmxr FPSCR, lr
  91. tst LEN, #1
  92. beq 1f
  93. vldmdb WIN1!, {s0}
  94. vldmia SRC0!, {s8}
  95. vldmia WIN0!, {s16}
  96. vmul.f s24, s0, s8
  97. vldmdb SRC1!, {s20}
  98. vmul.f s8, s16, s8
  99. vmls.f s24, s16, s20
  100. vmla.f s8, s0, s20
  101. vstmia DST0!, {s24}
  102. vstmdb DST1!, {s8}
  103. 1:
  104. tst LEN, #2
  105. beq 2f
  106. vldmdb WIN1!, {s0}
  107. vldmdb WIN1!, {s1}
  108. vldmia SRC0!, {s8-s9}
  109. vldmia WIN0!, {s16-s17}
  110. vmul.f s24, s0, s8
  111. vmul.f s25, s1, s9
  112. vldmdb SRC1!, {s20}
  113. vldmdb SRC1!, {s21}
  114. vmul.f s8, s16, s8
  115. vmul.f s9, s17, s9
  116. vmls.f s24, s16, s20
  117. vmls.f s25, s17, s21
  118. vmla.f s8, s0, s20
  119. vmla.f s9, s1, s21
  120. vstmia DST0!, {s24-s25}
  121. vstmdb DST1!, {s8}
  122. vstmdb DST1!, {s9}
  123. 2:
  124. tst LEN, #4
  125. beq 3f
  126. vldmdb WIN1!, {s0}
  127. vldmdb WIN1!, {s1}
  128. vldmdb WIN1!, {s2}
  129. vldmdb WIN1!, {s3}
  130. vldmia SRC0!, {s8-s11}
  131. vldmia WIN0!, {s16-s19}
  132. vmul.f s24, s0, s8
  133. vmul.f s25, s1, s9
  134. vmul.f s26, s2, s10
  135. vmul.f s27, s3, s11
  136. vldmdb SRC1!, {s20}
  137. vldmdb SRC1!, {s21}
  138. vldmdb SRC1!, {s22}
  139. vldmdb SRC1!, {s23}
  140. vmul.f s8, s16, s8
  141. vmul.f s9, s17, s9
  142. vmul.f s10, s18, s10
  143. vmul.f s11, s19, s11
  144. vmls.f s24, s16, s20
  145. vmls.f s25, s17, s21
  146. vmls.f s26, s18, s22
  147. vmls.f s27, s19, s23
  148. vmla.f s8, s0, s20
  149. vmla.f s9, s1, s21
  150. vmla.f s10, s2, s22
  151. vmla.f s11, s3, s23
  152. vstmia DST0!, {s24-s27}
  153. vstmdb DST1!, {s8}
  154. vstmdb DST1!, {s9}
  155. vstmdb DST1!, {s10}
  156. vstmdb DST1!, {s11}
  157. 3:
  158. bics LEN, LEN, #7
  159. beq 7f
  160. 4:
  161. ldr lr, =0x03030000 @ RunFast mode, short vectors of length 4, stride 1
  162. fmxr FPSCR, lr
  163. vldmdb WIN1!, {s0}
  164. vldmdb WIN1!, {s1}
  165. vldmdb WIN1!, {s2}
  166. vldmdb WIN1!, {s3}
  167. vldmia SRC0!, {s8-s11}
  168. vldmia WIN0!, {s16-s19}
  169. vmul.f s24, s0, s8 @ vector * vector
  170. vldmdb SRC1!, {s20}
  171. vldmdb SRC1!, {s21}
  172. vldmdb SRC1!, {s22}
  173. vldmdb SRC1!, {s23}
  174. vmul.f s8, s16, s8 @ vector * vector
  175. vmls.f s24, s16, s20 @ vector * vector
  176. vldmdb WIN1!, {s4}
  177. vldmdb WIN1!, {s5}
  178. vldmdb WIN1!, {s6}
  179. vldmdb WIN1!, {s7}
  180. vldmia SRC0!, {s12-s13}
  181. vmla.f s8, s0, s20 @ vector * vector
  182. vldmia SRC0!, {s14-s15}
  183. subs LEN, LEN, #8
  184. beq 6f
  185. 5: vldmia WIN0!, {s20-s23}
  186. vmul.f s28, s4, s12 @ vector * vector
  187. vstmia DST0!, {s24-s25}
  188. vldmdb SRC1!, {s16}
  189. vldmdb SRC1!, {s17}
  190. vldmdb SRC1!, {s18}
  191. vldmdb SRC1!, {s19}
  192. vmul.f s12, s20, s12 @ vector * vector
  193. vstmia DST0!, {s26-s27}
  194. vstmdb DST1!, {s8}
  195. vstmdb DST1!, {s9}
  196. vstmdb DST1!, {s10}
  197. vstmdb DST1!, {s11}
  198. vmls.f s28, s20, s16 @ vector * vector
  199. vldmdb WIN1!, {s0}
  200. vldmdb WIN1!, {s1}
  201. vldmdb WIN1!, {s2}
  202. vldmdb WIN1!, {s3}
  203. vldmia SRC0!, {s8-s9}
  204. vmla.f s12, s4, s16 @ vector * vector
  205. vldmia SRC0!, {s10-s11}
  206. subs LEN, LEN, #8
  207. vldmia WIN0!, {s16-s19}
  208. vmul.f s24, s0, s8 @ vector * vector
  209. vstmia DST0!, {s28-s29}
  210. vldmdb SRC1!, {s20}
  211. vldmdb SRC1!, {s21}
  212. vldmdb SRC1!, {s22}
  213. vldmdb SRC1!, {s23}
  214. vmul.f s8, s16, s8 @ vector * vector
  215. vstmia DST0!, {s30-s31}
  216. vstmdb DST1!, {s12}
  217. vstmdb DST1!, {s13}
  218. vstmdb DST1!, {s14}
  219. vstmdb DST1!, {s15}
  220. vmls.f s24, s16, s20 @ vector * vector
  221. vldmdb WIN1!, {s4}
  222. vldmdb WIN1!, {s5}
  223. vldmdb WIN1!, {s6}
  224. vldmdb WIN1!, {s7}
  225. vldmia SRC0!, {s12-s13}
  226. vmla.f s8, s0, s20 @ vector * vector
  227. vldmia SRC0!, {s14-s15}
  228. bne 5b
  229. 6: vldmia WIN0!, {s20-s23}
  230. vmul.f s28, s4, s12 @ vector * vector
  231. vstmia DST0!, {s24-s25}
  232. vldmdb SRC1!, {s16}
  233. vldmdb SRC1!, {s17}
  234. vldmdb SRC1!, {s18}
  235. vldmdb SRC1!, {s19}
  236. vmul.f s12, s20, s12 @ vector * vector
  237. vstmia DST0!, {s26-s27}
  238. vstmdb DST1!, {s8}
  239. vstmdb DST1!, {s9}
  240. vstmdb DST1!, {s10}
  241. vstmdb DST1!, {s11}
  242. vmls.f s28, s20, s16 @ vector * vector
  243. vmla.f s12, s4, s16 @ vector * vector
  244. vstmia DST0!, {s28-s31}
  245. vstmdb DST1!, {s12}
  246. vstmdb DST1!, {s13}
  247. vstmdb DST1!, {s14}
  248. vstmdb DST1!, {s15}
  249. 7:
  250. fmxr FPSCR, OLDFPSCR
  251. vpop {s16-s31}
  252. pop {v1-v3,pc}
  253. .unreq DST0
  254. .unreq SRC0
  255. .unreq SRC1
  256. .unreq WIN0
  257. .unreq LEN
  258. .unreq OLDFPSCR
  259. .unreq DST1
  260. .unreq WIN1
  261. endfunc
  262. /**
  263. * ARM VFP optimized implementation of 'vector_fmul_reverse_c' function.
  264. * Assume that len is a positive number and is multiple of 8
  265. */
  266. @ void ff_vector_fmul_reverse_vfp(float *dst, const float *src0,
  267. @ const float *src1, int len)
  268. function ff_vector_fmul_reverse_vfp, export=1
  269. vpush {d8-d15}
  270. add r2, r2, r3, lsl #2
  271. vldmdb r2!, {s0-s3}
  272. vldmia r1!, {s8-s11}
  273. vldmdb r2!, {s4-s7}
  274. vldmia r1!, {s12-s15}
  275. vmul.f32 s8, s3, s8
  276. vmul.f32 s9, s2, s9
  277. vmul.f32 s10, s1, s10
  278. vmul.f32 s11, s0, s11
  279. 1:
  280. subs r3, r3, #16
  281. it ge
  282. vldmdbge r2!, {s16-s19}
  283. vmul.f32 s12, s7, s12
  284. it ge
  285. vldmiage r1!, {s24-s27}
  286. vmul.f32 s13, s6, s13
  287. it ge
  288. vldmdbge r2!, {s20-s23}
  289. vmul.f32 s14, s5, s14
  290. it ge
  291. vldmiage r1!, {s28-s31}
  292. vmul.f32 s15, s4, s15
  293. it ge
  294. vmulge.f32 s24, s19, s24
  295. it gt
  296. vldmdbgt r2!, {s0-s3}
  297. it ge
  298. vmulge.f32 s25, s18, s25
  299. vstmia r0!, {s8-s13}
  300. it ge
  301. vmulge.f32 s26, s17, s26
  302. it gt
  303. vldmiagt r1!, {s8-s11}
  304. itt ge
  305. vmulge.f32 s27, s16, s27
  306. vmulge.f32 s28, s23, s28
  307. it gt
  308. vldmdbgt r2!, {s4-s7}
  309. it ge
  310. vmulge.f32 s29, s22, s29
  311. vstmia r0!, {s14-s15}
  312. ittt ge
  313. vmulge.f32 s30, s21, s30
  314. vmulge.f32 s31, s20, s31
  315. vmulge.f32 s8, s3, s8
  316. it gt
  317. vldmiagt r1!, {s12-s15}
  318. itttt ge
  319. vmulge.f32 s9, s2, s9
  320. vmulge.f32 s10, s1, s10
  321. vstmiage r0!, {s24-s27}
  322. vmulge.f32 s11, s0, s11
  323. it ge
  324. vstmiage r0!, {s28-s31}
  325. bgt 1b
  326. vpop {d8-d15}
  327. bx lr
  328. endfunc
  329. /**
  330. * ARM VFP implementation of 'butterflies_float_c' function
  331. * Assume that len is a positive non-zero number
  332. */
  333. @ void ff_butterflies_float_vfp(float *restrict v1, float *restrict v2, int len)
  334. function ff_butterflies_float_vfp, export=1
  335. BASE1 .req a1
  336. BASE2 .req a2
  337. LEN .req a3
  338. OLDFPSCR .req a4
  339. vpush {s16-s31}
  340. fmrx OLDFPSCR, FPSCR
  341. tst LEN, #7
  342. beq 4f @ common case: len is a multiple of 8
  343. ldr ip, =0x03000000 @ RunFast mode, scalar mode
  344. fmxr FPSCR, ip
  345. tst LEN, #1
  346. beq 1f
  347. vldmia BASE1!, {s0}
  348. vldmia BASE2!, {s8}
  349. vadd.f s16, s0, s8
  350. vsub.f s24, s0, s8
  351. vstr s16, [BASE1, #0-4*1]
  352. vstr s24, [BASE2, #0-4*1]
  353. 1:
  354. tst LEN, #2
  355. beq 2f
  356. vldmia BASE1!, {s0-s1}
  357. vldmia BASE2!, {s8-s9}
  358. vadd.f s16, s0, s8
  359. vadd.f s17, s1, s9
  360. vsub.f s24, s0, s8
  361. vsub.f s25, s1, s9
  362. vstr d8, [BASE1, #0-8*1] @ s16,s17
  363. vstr d12, [BASE2, #0-8*1] @ s24,s25
  364. 2:
  365. tst LEN, #4
  366. beq 3f
  367. vldmia BASE1!, {s0-s1}
  368. vldmia BASE2!, {s8-s9}
  369. vldmia BASE1!, {s2-s3}
  370. vldmia BASE2!, {s10-s11}
  371. vadd.f s16, s0, s8
  372. vadd.f s17, s1, s9
  373. vsub.f s24, s0, s8
  374. vsub.f s25, s1, s9
  375. vadd.f s18, s2, s10
  376. vadd.f s19, s3, s11
  377. vsub.f s26, s2, s10
  378. vsub.f s27, s3, s11
  379. vstr d8, [BASE1, #0-16*1] @ s16,s17
  380. vstr d12, [BASE2, #0-16*1] @ s24,s25
  381. vstr d9, [BASE1, #8-16*1] @ s18,s19
  382. vstr d13, [BASE2, #8-16*1] @ s26,s27
  383. 3:
  384. bics LEN, LEN, #7
  385. beq 7f
  386. 4:
  387. ldr ip, =0x03030000 @ RunFast mode, short vectors of length 4, stride 1
  388. fmxr FPSCR, ip
  389. vldmia BASE1!, {s0-s1}
  390. vldmia BASE2!, {s8-s9}
  391. vldmia BASE1!, {s2-s3}
  392. vldmia BASE2!, {s10-s11}
  393. vadd.f s16, s0, s8
  394. vldmia BASE1!, {s4-s5}
  395. vldmia BASE2!, {s12-s13}
  396. vldmia BASE1!, {s6-s7}
  397. vldmia BASE2!, {s14-s15}
  398. vsub.f s24, s0, s8
  399. vadd.f s20, s4, s12
  400. subs LEN, LEN, #8
  401. beq 6f
  402. 5: vldmia BASE1!, {s0-s3}
  403. vldmia BASE2!, {s8-s11}
  404. vsub.f s28, s4, s12
  405. vstr d8, [BASE1, #0-16*3] @ s16,s17
  406. vstr d9, [BASE1, #8-16*3] @ s18,s19
  407. vstr d12, [BASE2, #0-16*3] @ s24,s25
  408. vstr d13, [BASE2, #8-16*3] @ s26,s27
  409. vadd.f s16, s0, s8
  410. vldmia BASE1!, {s4-s7}
  411. vldmia BASE2!, {s12-s15}
  412. vsub.f s24, s0, s8
  413. vstr d10, [BASE1, #0-16*3] @ s20,s21
  414. vstr d11, [BASE1, #8-16*3] @ s22,s23
  415. vstr d14, [BASE2, #0-16*3] @ s28,s29
  416. vstr d15, [BASE2, #8-16*3] @ s30,s31
  417. vadd.f s20, s4, s12
  418. subs LEN, LEN, #8
  419. bne 5b
  420. 6: vsub.f s28, s4, s12
  421. vstr d8, [BASE1, #0-16*2] @ s16,s17
  422. vstr d9, [BASE1, #8-16*2] @ s18,s19
  423. vstr d12, [BASE2, #0-16*2] @ s24,s25
  424. vstr d13, [BASE2, #8-16*2] @ s26,s27
  425. vstr d10, [BASE1, #0-16*1] @ s20,s21
  426. vstr d11, [BASE1, #8-16*1] @ s22,s23
  427. vstr d14, [BASE2, #0-16*1] @ s28,s29
  428. vstr d15, [BASE2, #8-16*1] @ s30,s31
  429. 7:
  430. fmxr FPSCR, OLDFPSCR
  431. vpop {s16-s31}
  432. bx lr
  433. .unreq BASE1
  434. .unreq BASE2
  435. .unreq LEN
  436. .unreq OLDFPSCR
  437. endfunc