/*
 * Copyright (c) 2008 Siarhei Siamashka <ssvb@users.sourceforge.net>
 *
 * This file is part of FFmpeg
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "config.h"
#include "asm.S"
/**
 * ARM VFP optimized element-wise float vector multiply:
 *     dst[i] = src0[i] * src1[i]
 * Assume that len is a positive number and is multiple of 8.
 *
 * In:      r0 = dst, r1 = src0, r2 = src1, r3 = len
 * Clobbers r12 and s0-s31 (d8-d15 are saved/restored via vpush/vpop).
 *
 * Uses the VFP short-vector mode (FPSCR LEN field set to 4) so that each
 * vmul.f32 below operates on four consecutive single registers at once.
 * The loop is software-pipelined: loads for the next iteration are issued
 * (predicated on ge/gt) while the current products are stored, consuming
 * 16 floats per full iteration with a predicated 8-float tail.
 */
@ void ff_vector_fmul_vfp(float *dst, const float *src0, const float *src1, int len)
function ff_vector_fmul_vfp, export=1
        vpush           {d8-d15}
        fmrx            r12, fpscr
        orr             r12, r12, #(3 << 16)    /* set vector size to 4 */
        fmxr            fpscr, r12

        @ prime the pipeline: first two groups of 4 floats from each source
        vldmia          r1!, {s0-s3}
        vldmia          r2!, {s8-s11}
        vldmia          r1!, {s4-s7}
        vldmia          r2!, {s12-s15}
        vmul.f32        s8, s0, s8              @ short-vector: s8-s11 = s0-s3 * s8-s11
1:
        subs            r3, r3, #16
        vmul.f32        s12, s4, s12            @ s12-s15 = s4-s7 * s12-s15
        itttt           ge
        vldmiage        r1!, {s16-s19}
        vldmiage        r2!, {s24-s27}
        vldmiage        r1!, {s20-s23}
        vldmiage        r2!, {s28-s31}
        it              ge
        vmulge.f32      s24, s16, s24           @ s24-s27 = s16-s19 * s24-s27
        vstmia          r0!, {s8-s11}
        vstmia          r0!, {s12-s15}
        it              ge
        vmulge.f32      s28, s20, s28           @ s28-s31 = s20-s23 * s28-s31
        itttt           gt
        vldmiagt        r1!, {s0-s3}
        vldmiagt        r2!, {s8-s11}
        vldmiagt        r1!, {s4-s7}
        vldmiagt        r2!, {s12-s15}
        ittt            ge
        vmulge.f32      s8, s0, s8
        vstmiage        r0!, {s24-s27}
        vstmiage        r0!, {s28-s31}
        bgt             1b

        bic             r12, r12, #(7 << 16)    /* set vector size back to 1 */
        fmxr            fpscr, r12
        vpop            {d8-d15}
        bx              lr
endfunc
/**
 * ARM VFP implementation of 'vector_fmul_window_c' function.
 * Assume that len is a positive non-zero number.
 *
 * In:      a1 = dst, a2 = src0, a3 = src1, a4 = win; len on the stack.
 * Clobbers ip, lr, and s0-s31 (s16-s31 saved/restored; v1-v3 saved on the
 * stack as they hold LEN and the downward-running pointers).
 *
 * DST1, SRC1 and WIN1 are set up to run downward (vldmdb/vstmdb) from
 * dst+2*len, src1+len and win+2*len respectively, mirroring the
 * forward-running DST0/SRC0/WIN0, as required by the overlap-add windowing
 * of vector_fmul_window_c.
 *
 * The odd 1/2/4-element remainders of len are peeled off first in scalar
 * mode; the multiple-of-8 main part then runs in VFP short-vector mode
 * (vectors of length 4, stride 1), software-pipelined 8 elements per loop.
 */
@ void ff_vector_fmul_window_vfp(float *dst, const float *src0,
@                                const float *src1, const float *win, int len)
function ff_vector_fmul_window_vfp, export=1
DST0    .req    a1
SRC0    .req    a2
SRC1    .req    a3
WIN0    .req    a4
LEN     .req    v1
DST1    .req    v2
WIN1    .req    v3
OLDFPSCR .req   ip
        push            {v1-v3,lr}
        ldr             LEN, [sp, #4*4+0]       @ 5th arg, above the 4 pushed regs
        vpush           {s16-s31}
        fmrx            OLDFPSCR, FPSCR
        add             DST1, DST0, LEN, lsl #3 @ dst  + 2*len floats (runs down)
        add             SRC1, SRC1, LEN, lsl #2 @ src1 + len floats   (runs down)
        add             WIN1, WIN0, LEN, lsl #3 @ win  + 2*len floats (runs down)

        tst             LEN, #7
        beq             4f                      @ common case: len is a multiple of 8
        ldr             lr, =0x03000000         @ RunFast mode, scalar mode
        fmxr            FPSCR, lr

        @ peel off one element if len is odd
        tst             LEN, #1
        beq             1f
        vldmdb          WIN1!, {s0}
        vldmia          SRC0!, {s8}
        vldmia          WIN0!, {s16}
        vmul.f          s24, s0, s8
        vldmdb          SRC1!, {s20}
        vmul.f          s8, s16, s8
        vmls.f          s24, s16, s20
        vmla.f          s8, s0, s20
        vstmia          DST0!, {s24}
        vstmdb          DST1!, {s8}
1:
        @ peel off two elements if bit 1 of len is set
        tst             LEN, #2
        beq             2f
        vldmdb          WIN1!, {s0}
        vldmdb          WIN1!, {s1}
        vldmia          SRC0!, {s8-s9}
        vldmia          WIN0!, {s16-s17}
        vmul.f          s24, s0, s8
        vmul.f          s25, s1, s9
        vldmdb          SRC1!, {s20}
        vldmdb          SRC1!, {s21}
        vmul.f          s8, s16, s8
        vmul.f          s9, s17, s9
        vmls.f          s24, s16, s20
        vmls.f          s25, s17, s21
        vmla.f          s8, s0, s20
        vmla.f          s9, s1, s21
        vstmia          DST0!, {s24-s25}
        vstmdb          DST1!, {s8}
        vstmdb          DST1!, {s9}
2:
        @ peel off four elements if bit 2 of len is set
        tst             LEN, #4
        beq             3f
        vldmdb          WIN1!, {s0}
        vldmdb          WIN1!, {s1}
        vldmdb          WIN1!, {s2}
        vldmdb          WIN1!, {s3}
        vldmia          SRC0!, {s8-s11}
        vldmia          WIN0!, {s16-s19}
        vmul.f          s24, s0, s8
        vmul.f          s25, s1, s9
        vmul.f          s26, s2, s10
        vmul.f          s27, s3, s11
        vldmdb          SRC1!, {s20}
        vldmdb          SRC1!, {s21}
        vldmdb          SRC1!, {s22}
        vldmdb          SRC1!, {s23}
        vmul.f          s8, s16, s8
        vmul.f          s9, s17, s9
        vmul.f          s10, s18, s10
        vmul.f          s11, s19, s11
        vmls.f          s24, s16, s20
        vmls.f          s25, s17, s21
        vmls.f          s26, s18, s22
        vmls.f          s27, s19, s23
        vmla.f          s8, s0, s20
        vmla.f          s9, s1, s21
        vmla.f          s10, s2, s22
        vmla.f          s11, s3, s23
        vstmia          DST0!, {s24-s27}
        vstmdb          DST1!, {s8}
        vstmdb          DST1!, {s9}
        vstmdb          DST1!, {s10}
        vstmdb          DST1!, {s11}
3:
        bics            LEN, LEN, #7            @ remaining len, now a multiple of 8
        beq             7f
4:
        ldr             lr, =0x03030000         @ RunFast mode, short vectors of length 4, stride 1
        fmxr            FPSCR, lr

        @ prologue of the software-pipelined main loop
        vldmdb          WIN1!, {s0}
        vldmdb          WIN1!, {s1}
        vldmdb          WIN1!, {s2}
        vldmdb          WIN1!, {s3}
        vldmia          SRC0!, {s8-s11}
        vldmia          WIN0!, {s16-s19}
        vmul.f          s24, s0, s8             @ vector * vector
        vldmdb          SRC1!, {s20}
        vldmdb          SRC1!, {s21}
        vldmdb          SRC1!, {s22}
        vldmdb          SRC1!, {s23}
        vmul.f          s8, s16, s8             @ vector * vector
        vmls.f          s24, s16, s20           @ vector * vector
        vldmdb          WIN1!, {s4}
        vldmdb          WIN1!, {s5}
        vldmdb          WIN1!, {s6}
        vldmdb          WIN1!, {s7}
        vldmia          SRC0!, {s12-s13}
        vmla.f          s8, s0, s20             @ vector * vector
        vldmia          SRC0!, {s14-s15}
        subs            LEN, LEN, #8
        beq             6f
5:      vldmia          WIN0!, {s20-s23}
        vmul.f          s28, s4, s12            @ vector * vector
        vstmia          DST0!, {s24-s25}
        vldmdb          SRC1!, {s16}
        vldmdb          SRC1!, {s17}
        vldmdb          SRC1!, {s18}
        vldmdb          SRC1!, {s19}
        vmul.f          s12, s20, s12           @ vector * vector
        vstmia          DST0!, {s26-s27}
        vstmdb          DST1!, {s8}
        vstmdb          DST1!, {s9}
        vstmdb          DST1!, {s10}
        vstmdb          DST1!, {s11}
        vmls.f          s28, s20, s16           @ vector * vector
        vldmdb          WIN1!, {s0}
        vldmdb          WIN1!, {s1}
        vldmdb          WIN1!, {s2}
        vldmdb          WIN1!, {s3}
        vldmia          SRC0!, {s8-s9}
        vmla.f          s12, s4, s16            @ vector * vector
        vldmia          SRC0!, {s10-s11}
        subs            LEN, LEN, #8
        vldmia          WIN0!, {s16-s19}
        vmul.f          s24, s0, s8             @ vector * vector
        vstmia          DST0!, {s28-s29}
        vldmdb          SRC1!, {s20}
        vldmdb          SRC1!, {s21}
        vldmdb          SRC1!, {s22}
        vldmdb          SRC1!, {s23}
        vmul.f          s8, s16, s8             @ vector * vector
        vstmia          DST0!, {s30-s31}
        vstmdb          DST1!, {s12}
        vstmdb          DST1!, {s13}
        vstmdb          DST1!, {s14}
        vstmdb          DST1!, {s15}
        vmls.f          s24, s16, s20           @ vector * vector
        vldmdb          WIN1!, {s4}
        vldmdb          WIN1!, {s5}
        vldmdb          WIN1!, {s6}
        vldmdb          WIN1!, {s7}
        vldmia          SRC0!, {s12-s13}
        vmla.f          s8, s0, s20             @ vector * vector
        vldmia          SRC0!, {s14-s15}
        bne             5b
6:      @ epilogue: drain the last pipelined group
        vldmia          WIN0!, {s20-s23}
        vmul.f          s28, s4, s12            @ vector * vector
        vstmia          DST0!, {s24-s25}
        vldmdb          SRC1!, {s16}
        vldmdb          SRC1!, {s17}
        vldmdb          SRC1!, {s18}
        vldmdb          SRC1!, {s19}
        vmul.f          s12, s20, s12           @ vector * vector
        vstmia          DST0!, {s26-s27}
        vstmdb          DST1!, {s8}
        vstmdb          DST1!, {s9}
        vstmdb          DST1!, {s10}
        vstmdb          DST1!, {s11}
        vmls.f          s28, s20, s16           @ vector * vector
        vmla.f          s12, s4, s16            @ vector * vector
        vstmia          DST0!, {s28-s31}
        vstmdb          DST1!, {s12}
        vstmdb          DST1!, {s13}
        vstmdb          DST1!, {s14}
        vstmdb          DST1!, {s15}
7:
        fmxr            FPSCR, OLDFPSCR
        vpop            {s16-s31}
        pop             {v1-v3,pc}
        .unreq          DST0
        .unreq          SRC0
        .unreq          SRC1
        .unreq          WIN0
        .unreq          LEN
        .unreq          OLDFPSCR
        .unreq          DST1
        .unreq          WIN1
endfunc
/**
 * ARM VFP optimized implementation of 'vector_fmul_reverse_c' function:
 *     dst[i] = src0[i] * src1[len-1-i]
 * Assume that len is a positive number and is multiple of 8.
 *
 * In:      r0 = dst, r1 = src0, r2 = src1, r3 = len
 * Clobbers s0-s31 (d8-d15 are saved/restored via vpush/vpop).
 *
 * r2 is advanced to src1+len and then walked downward with vldmdb; since
 * the register-list loads come out in ascending register order, each group
 * of four reversed src1 values is applied to src0 with swapped register
 * indices (e.g. s8*=s3, s9*=s2, ...) using scalar vmul.f32 only — this
 * function does NOT enable VFP short-vector mode.  The loop is
 * software-pipelined, 16 floats per full iteration with a predicated
 * 8-float tail (ge/gt conditions after 'subs r3, r3, #16').
 */
@ void ff_vector_fmul_reverse_vfp(float *dst, const float *src0,
@                                 const float *src1, int len)
function ff_vector_fmul_reverse_vfp, export=1
        vpush           {d8-d15}
        add             r2, r2, r3, lsl #2      @ r2 = src1 + len (runs down)
        vldmdb          r2!, {s0-s3}
        vldmia          r1!, {s8-s11}
        vldmdb          r2!, {s4-s7}
        vldmia          r1!, {s12-s15}
        vmul.f32        s8, s3, s8              @ note the reversed src1 indices
        vmul.f32        s9, s2, s9
        vmul.f32        s10, s1, s10
        vmul.f32        s11, s0, s11
1:
        subs            r3, r3, #16
        it              ge
        vldmdbge        r2!, {s16-s19}
        vmul.f32        s12, s7, s12
        it              ge
        vldmiage        r1!, {s24-s27}
        vmul.f32        s13, s6, s13
        it              ge
        vldmdbge        r2!, {s20-s23}
        vmul.f32        s14, s5, s14
        it              ge
        vldmiage        r1!, {s28-s31}
        vmul.f32        s15, s4, s15
        it              ge
        vmulge.f32      s24, s19, s24
        it              gt
        vldmdbgt        r2!, {s0-s3}
        it              ge
        vmulge.f32      s25, s18, s25
        vstmia          r0!, {s8-s13}
        it              ge
        vmulge.f32      s26, s17, s26
        it              gt
        vldmiagt        r1!, {s8-s11}
        itt             ge
        vmulge.f32      s27, s16, s27
        vmulge.f32      s28, s23, s28
        it              gt
        vldmdbgt        r2!, {s4-s7}
        it              ge
        vmulge.f32      s29, s22, s29
        vstmia          r0!, {s14-s15}
        ittt            ge
        vmulge.f32      s30, s21, s30
        vmulge.f32      s31, s20, s31
        vmulge.f32      s8, s3, s8
        it              gt
        vldmiagt        r1!, {s12-s15}
        itttt           ge
        vmulge.f32      s9, s2, s9
        vmulge.f32      s10, s1, s10
        vstmiage        r0!, {s24-s27}
        vmulge.f32      s11, s0, s11
        it              ge
        vstmiage        r0!, {s28-s31}
        bgt             1b

        vpop            {d8-d15}
        bx              lr
endfunc
/**
 * ARM VFP implementation of 'butterflies_float_c' function:
 *     t = v1[i] - v2[i]; v1[i] += v2[i]; v2[i] = t;   (in-place)
 * Assume that len is a positive non-zero number.
 *
 * In:      a1 = v1, a2 = v2, a3 = len
 * Clobbers a4 (holds the saved FPSCR), ip, and s0-s31
 *          (s16-s31 saved/restored via vpush/vpop).
 *
 * The odd 1/2/4-element remainders of len are peeled off first in scalar
 * RunFast mode; the multiple-of-8 main part runs in VFP short-vector mode
 * (vectors of length 4, stride 1).  Results are written back with vstr at
 * negative offsets from the already-advanced pointers; the s-register
 * pairs named in the trailing comments are stored through their d-register
 * aliases (e.g. d8 = s16,s17).
 */
@ void ff_butterflies_float_vfp(float *restrict v1, float *restrict v2, int len)
function ff_butterflies_float_vfp, export=1
BASE1   .req    a1
BASE2   .req    a2
LEN     .req    a3
OLDFPSCR .req   a4
        vpush           {s16-s31}
        fmrx            OLDFPSCR, FPSCR

        tst             LEN, #7
        beq             4f                      @ common case: len is a multiple of 8
        ldr             ip, =0x03000000         @ RunFast mode, scalar mode
        fmxr            FPSCR, ip

        @ peel off one element if len is odd
        tst             LEN, #1
        beq             1f
        vldmia          BASE1!, {s0}
        vldmia          BASE2!, {s8}
        vadd.f          s16, s0, s8
        vsub.f          s24, s0, s8
        vstr            s16, [BASE1, #0-4*1]
        vstr            s24, [BASE2, #0-4*1]
1:
        @ peel off two elements if bit 1 of len is set
        tst             LEN, #2
        beq             2f
        vldmia          BASE1!, {s0-s1}
        vldmia          BASE2!, {s8-s9}
        vadd.f          s16, s0, s8
        vadd.f          s17, s1, s9
        vsub.f          s24, s0, s8
        vsub.f          s25, s1, s9
        vstr            d8, [BASE1, #0-8*1]     @ s16,s17
        vstr            d12, [BASE2, #0-8*1]    @ s24,s25
2:
        @ peel off four elements if bit 2 of len is set
        tst             LEN, #4
        beq             3f
        vldmia          BASE1!, {s0-s1}
        vldmia          BASE2!, {s8-s9}
        vldmia          BASE1!, {s2-s3}
        vldmia          BASE2!, {s10-s11}
        vadd.f          s16, s0, s8
        vadd.f          s17, s1, s9
        vsub.f          s24, s0, s8
        vsub.f          s25, s1, s9
        vadd.f          s18, s2, s10
        vadd.f          s19, s3, s11
        vsub.f          s26, s2, s10
        vsub.f          s27, s3, s11
        vstr            d8, [BASE1, #0-16*1]    @ s16,s17
        vstr            d12, [BASE2, #0-16*1]   @ s24,s25
        vstr            d9, [BASE1, #8-16*1]    @ s18,s19
        vstr            d13, [BASE2, #8-16*1]   @ s26,s27
3:
        bics            LEN, LEN, #7            @ remaining len, now a multiple of 8
        beq             7f
4:
        ldr             ip, =0x03030000         @ RunFast mode, short vectors of length 4, stride 1
        fmxr            FPSCR, ip

        @ prologue of the software-pipelined main loop
        vldmia          BASE1!, {s0-s1}
        vldmia          BASE2!, {s8-s9}
        vldmia          BASE1!, {s2-s3}
        vldmia          BASE2!, {s10-s11}
        vadd.f          s16, s0, s8             @ vector add, 4 at a time
        vldmia          BASE1!, {s4-s5}
        vldmia          BASE2!, {s12-s13}
        vldmia          BASE1!, {s6-s7}
        vldmia          BASE2!, {s14-s15}
        vsub.f          s24, s0, s8
        vadd.f          s20, s4, s12
        subs            LEN, LEN, #8
        beq             6f
5:      vldmia          BASE1!, {s0-s3}
        vldmia          BASE2!, {s8-s11}
        vsub.f          s28, s4, s12
        vstr            d8, [BASE1, #0-16*3]    @ s16,s17
        vstr            d9, [BASE1, #8-16*3]    @ s18,s19
        vstr            d12, [BASE2, #0-16*3]   @ s24,s25
        vstr            d13, [BASE2, #8-16*3]   @ s26,s27
        vadd.f          s16, s0, s8
        vldmia          BASE1!, {s4-s7}
        vldmia          BASE2!, {s12-s15}
        vsub.f          s24, s0, s8
        vstr            d10, [BASE1, #0-16*3]   @ s20,s21
        vstr            d11, [BASE1, #8-16*3]   @ s22,s23
        vstr            d14, [BASE2, #0-16*3]   @ s28,s29
        vstr            d15, [BASE2, #8-16*3]   @ s30,s31
        vadd.f          s20, s4, s12
        subs            LEN, LEN, #8
        bne             5b
6:      @ epilogue: drain the last pipelined group
        vsub.f          s28, s4, s12
        vstr            d8, [BASE1, #0-16*2]    @ s16,s17
        vstr            d9, [BASE1, #8-16*2]    @ s18,s19
        vstr            d12, [BASE2, #0-16*2]   @ s24,s25
        vstr            d13, [BASE2, #8-16*2]   @ s26,s27
        vstr            d10, [BASE1, #0-16*1]   @ s20,s21
        vstr            d11, [BASE1, #8-16*1]   @ s22,s23
        vstr            d14, [BASE2, #0-16*1]   @ s28,s29
        vstr            d15, [BASE2, #8-16*1]   @ s30,s31
7:
        fmxr            FPSCR, OLDFPSCR
        vpop            {s16-s31}
        bx              lr
        .unreq          BASE1
        .unreq          BASE2
        .unreq          LEN
        .unreq          OLDFPSCR
endfunc