;******************************************************************************
;* MMX/SSE2-optimized functions for the RV40 decoder
;* Copyright (c) 2010 Ronald S. Bultje <rsbultje@gmail.com>
;* Copyright (c) 2010 Fiona Glaser <fiona@x264.com>
;* Copyright (C) 2012 Christophe Gisquet <christophe.gisquet@gmail.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
- %include "libavutil/x86/x86util.asm"
- SECTION_RODATA
- align 16
- pw_1024: times 8 dw 1 << (16 - 6) ; pw_1024
- sixtap_filter_hb_m: times 8 db 1, -5
- times 8 db 52, 20
- ; multiplied by 2 to have the same shift
- times 8 db 2, -10
- times 8 db 40, 40
- ; back to normal
- times 8 db 1, -5
- times 8 db 20, 52
- sixtap_filter_v_m: times 8 dw 1
- times 8 dw -5
- times 8 dw 52
- times 8 dw 20
- ; multiplied by 2 to have the same shift
- times 8 dw 2
- times 8 dw -10
- times 8 dw 40
- times 8 dw 40
- ; back to normal
- times 8 dw 1
- times 8 dw -5
- times 8 dw 20
- times 8 dw 52
- %ifdef PIC
- %define sixtap_filter_hw picregq
- %define sixtap_filter_hb picregq
- %define sixtap_filter_v picregq
- %define npicregs 1
- %else
- %define sixtap_filter_hw sixtap_filter_hw_m
- %define sixtap_filter_hb sixtap_filter_hb_m
- %define sixtap_filter_v sixtap_filter_v_m
- %define npicregs 0
- %endif
- filter_h6_shuf1: db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8
- filter_h6_shuf2: db 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10
- filter_h6_shuf3: db 5, 4, 6, 5, 7, 6, 8, 7, 9, 8, 10, 9, 11, 10, 12, 11
- cextern pw_32
- cextern pw_16
- cextern pw_512
- SECTION .text
;-----------------------------------------------------------------------------
; subpel MC functions:
;
; void ff_[put|avg]_rv40_qpel_[h|v]_<opt>(uint8_t *dst, int deststride,
;                                         uint8_t *src, int srcstride,
;                                         int len, int m);
;-----------------------------------------------------------------------------
- %macro LOAD 2
- %if WIN64
- movsxd %1q, %1d
- %endif
- %ifdef PIC
- add %1q, picregq
- %else
- add %1q, %2
- %endif
- %endmacro
- %macro STORE 3
- %ifidn %3, avg
- movh %2, [dstq]
- %endif
- packuswb %1, %1
- %ifidn %3, avg
- PAVGB %1, %2
- %endif
- movh [dstq], %1
- %endmacro
- %macro FILTER_V 1
- cglobal %1_rv40_qpel_v, 6,6+npicregs,12, dst, dststride, src, srcstride, height, my, picreg
- %ifdef PIC
- lea picregq, [sixtap_filter_v_m]
- %endif
- pxor m7, m7
- LOAD my, sixtap_filter_v
- ; read 5 lines
- sub srcq, srcstrideq
- sub srcq, srcstrideq
- movh m0, [srcq]
- movh m1, [srcq+srcstrideq]
- movh m2, [srcq+srcstrideq*2]
- lea srcq, [srcq+srcstrideq*2]
- add srcq, srcstrideq
- movh m3, [srcq]
- movh m4, [srcq+srcstrideq]
- punpcklbw m0, m7
- punpcklbw m1, m7
- punpcklbw m2, m7
- punpcklbw m3, m7
- punpcklbw m4, m7
- %ifdef m8
- mova m8, [myq+ 0]
- mova m9, [myq+16]
- mova m10, [myq+32]
- mova m11, [myq+48]
- %define COEFF05 m8
- %define COEFF14 m9
- %define COEFF2 m10
- %define COEFF3 m11
- %else
- %define COEFF05 [myq+ 0]
- %define COEFF14 [myq+16]
- %define COEFF2 [myq+32]
- %define COEFF3 [myq+48]
- %endif
- .nextrow:
- mova m6, m1
- movh m5, [srcq+2*srcstrideq] ; read new row
- paddw m6, m4
- punpcklbw m5, m7
- pmullw m6, COEFF14
- paddw m0, m5
- pmullw m0, COEFF05
- paddw m6, m0
- mova m0, m1
- paddw m6, [pw_32]
- mova m1, m2
- pmullw m2, COEFF2
- paddw m6, m2
- mova m2, m3
- pmullw m3, COEFF3
- paddw m6, m3
- ; round/clip/store
- mova m3, m4
- psraw m6, 6
- mova m4, m5
- STORE m6, m5, %1
- ; go to next line
- add dstq, dststrideq
- add srcq, srcstrideq
- dec heightd ; next row
- jg .nextrow
- REP_RET
- %endmacro
- %macro FILTER_H 1
- cglobal %1_rv40_qpel_h, 6, 6+npicregs, 12, dst, dststride, src, srcstride, height, mx, picreg
- %ifdef PIC
- lea picregq, [sixtap_filter_v_m]
- %endif
- pxor m7, m7
- LOAD mx, sixtap_filter_v
- mova m6, [pw_32]
- %ifdef m8
- mova m8, [mxq+ 0]
- mova m9, [mxq+16]
- mova m10, [mxq+32]
- mova m11, [mxq+48]
- %define COEFF05 m8
- %define COEFF14 m9
- %define COEFF2 m10
- %define COEFF3 m11
- %else
- %define COEFF05 [mxq+ 0]
- %define COEFF14 [mxq+16]
- %define COEFF2 [mxq+32]
- %define COEFF3 [mxq+48]
- %endif
- .nextrow:
- movq m0, [srcq-2]
- movq m5, [srcq+3]
- movq m1, [srcq-1]
- movq m4, [srcq+2]
- punpcklbw m0, m7
- punpcklbw m5, m7
- punpcklbw m1, m7
- punpcklbw m4, m7
- movq m2, [srcq-0]
- movq m3, [srcq+1]
- paddw m0, m5
- paddw m1, m4
- punpcklbw m2, m7
- punpcklbw m3, m7
- pmullw m0, COEFF05
- pmullw m1, COEFF14
- pmullw m2, COEFF2
- pmullw m3, COEFF3
- paddw m0, m6
- paddw m1, m2
- paddw m0, m3
- paddw m0, m1
- psraw m0, 6
- STORE m0, m1, %1
- ; go to next line
- add dstq, dststrideq
- add srcq, srcstrideq
- dec heightd ; next row
- jg .nextrow
- REP_RET
- %endmacro
- %if ARCH_X86_32
- INIT_MMX mmx
- FILTER_V put
- FILTER_H put
- INIT_MMX mmxext
- FILTER_V avg
- FILTER_H avg
- INIT_MMX 3dnow
- FILTER_V avg
- FILTER_H avg
- %endif
- INIT_XMM sse2
- FILTER_H put
- FILTER_H avg
- FILTER_V put
- FILTER_V avg
- %macro FILTER_SSSE3 1
- cglobal %1_rv40_qpel_v, 6,6+npicregs,8, dst, dststride, src, srcstride, height, my, picreg
- %ifdef PIC
- lea picregq, [sixtap_filter_hb_m]
- %endif
- ; read 5 lines
- sub srcq, srcstrideq
- LOAD my, sixtap_filter_hb
- sub srcq, srcstrideq
- movh m0, [srcq]
- movh m1, [srcq+srcstrideq]
- movh m2, [srcq+srcstrideq*2]
- lea srcq, [srcq+srcstrideq*2]
- add srcq, srcstrideq
- mova m5, [myq]
- movh m3, [srcq]
- movh m4, [srcq+srcstrideq]
- lea srcq, [srcq+2*srcstrideq]
- .nextrow:
- mova m6, m2
- punpcklbw m0, m1
- punpcklbw m6, m3
- pmaddubsw m0, m5
- pmaddubsw m6, [myq+16]
- movh m7, [srcq] ; read new row
- paddw m6, m0
- mova m0, m1
- mova m1, m2
- mova m2, m3
- mova m3, m4
- mova m4, m7
- punpcklbw m7, m3
- pmaddubsw m7, m5
- paddw m6, m7
- pmulhrsw m6, [pw_512]
- STORE m6, m7, %1
- ; go to next line
- add dstq, dststrideq
- add srcq, srcstrideq
- dec heightd ; next row
- jg .nextrow
- REP_RET
- cglobal %1_rv40_qpel_h, 6,6+npicregs,8, dst, dststride, src, srcstride, height, mx, picreg
- %ifdef PIC
- lea picregq, [sixtap_filter_hb_m]
- %endif
- mova m3, [filter_h6_shuf2]
- mova m4, [filter_h6_shuf3]
- LOAD mx, sixtap_filter_hb
- mova m5, [mxq] ; set up 6tap filter in bytes
- mova m6, [mxq+16]
- mova m7, [filter_h6_shuf1]
- .nextrow:
- movu m0, [srcq-2]
- mova m1, m0
- mova m2, m0
- pshufb m0, m7
- pshufb m1, m3
- pshufb m2, m4
- pmaddubsw m0, m5
- pmaddubsw m1, m6
- pmaddubsw m2, m5
- paddw m0, m1
- paddw m0, m2
- pmulhrsw m0, [pw_512]
- STORE m0, m1, %1
- ; go to next line
- add dstq, dststrideq
- add srcq, srcstrideq
- dec heightd ; next row
- jg .nextrow
- REP_RET
- %endmacro
- INIT_XMM ssse3
- FILTER_SSSE3 put
- FILTER_SSSE3 avg
- ; %1=5bits weights?, %2=dst %3=src1 %4=src3 %5=stride if sse2
- %macro RV40_WCORE 4-5
- movh m4, [%3 + r6 + 0]
- movh m5, [%4 + r6 + 0]
- %if %0 == 4
- %define OFFSET r6 + mmsize / 2
- %else
- ; 8x8 block and sse2, stride was provided
- %define OFFSET r6
- add r6, r5
- %endif
- movh m6, [%3 + OFFSET]
- movh m7, [%4 + OFFSET]
- %if %1 == 0
- ; 14bits weights
- punpcklbw m4, m0
- punpcklbw m5, m0
- punpcklbw m6, m0
- punpcklbw m7, m0
- psllw m4, 7
- psllw m5, 7
- psllw m6, 7
- psllw m7, 7
- pmulhw m4, m3
- pmulhw m5, m2
- pmulhw m6, m3
- pmulhw m7, m2
- paddw m4, m5
- paddw m6, m7
- %else
- ; 5bits weights
- %if cpuflag(ssse3)
- punpcklbw m4, m5
- punpcklbw m6, m7
- pmaddubsw m4, m3
- pmaddubsw m6, m3
- %else
- punpcklbw m4, m0
- punpcklbw m5, m0
- punpcklbw m6, m0
- punpcklbw m7, m0
- pmullw m4, m3
- pmullw m5, m2
- pmullw m6, m3
- pmullw m7, m2
- paddw m4, m5
- paddw m6, m7
- %endif
- %endif
- ; bias and shift down
- %if cpuflag(ssse3)
- pmulhrsw m4, m1
- pmulhrsw m6, m1
- %else
- paddw m4, m1
- paddw m6, m1
- psrlw m4, 5
- psrlw m6, 5
- %endif
- packuswb m4, m6
- %if %0 == 5
- ; Only called for 8x8 blocks and sse2
- sub r6, r5
- movh [%2 + r6], m4
- add r6, r5
- movhps [%2 + r6], m4
- %else
- mova [%2 + r6], m4
- %endif
- %endmacro
- %macro MAIN_LOOP 2
- %if mmsize == 8
- RV40_WCORE %2, r0, r1, r2
- %if %1 == 16
- RV40_WCORE %2, r0 + 8, r1 + 8, r2 + 8
- %endif
- ; Prepare for next loop
- add r6, r5
- %else
- %ifidn %1, 8
- RV40_WCORE %2, r0, r1, r2, r5
- ; Prepare 2 next lines
- add r6, r5
- %else
- RV40_WCORE %2, r0, r1, r2
- ; Prepare single next line
- add r6, r5
- %endif
- %endif
- %endmacro
- ; void ff_rv40_weight_func_%1(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w1, int w2, int stride)
- ; %1=size %2=num of xmm regs
- ; The weights are FP0.14 notation of fractions depending on pts.
- ; For timebases without rounding error (i.e. PAL), the fractions
- ; can be simplified, and several operations can be avoided.
- ; Therefore, we check here whether they are multiples of 2^9 for
- ; those simplifications to occur.
- %macro RV40_WEIGHT 3
- cglobal rv40_weight_func_%1_%2, 6, 7, 8
- %if cpuflag(ssse3)
- mova m1, [pw_1024]
- %else
- mova m1, [pw_16]
- %endif
- pxor m0, m0
- ; Set loop counter and increments
- mov r6, r5
- shl r6, %3
- add r0, r6
- add r1, r6
- add r2, r6
- neg r6
- movd m2, r3d
- movd m3, r4d
- %ifidn %1,rnd
- %define RND 0
- SPLATW m2, m2
- %else
- %define RND 1
- %if cpuflag(ssse3)
- punpcklbw m3, m2
- %else
- SPLATW m2, m2
- %endif
- %endif
- SPLATW m3, m3
- .loop:
- MAIN_LOOP %2, RND
- jnz .loop
- REP_RET
- %endmacro
- INIT_MMX mmxext
- RV40_WEIGHT rnd, 8, 3
- RV40_WEIGHT rnd, 16, 4
- RV40_WEIGHT nornd, 8, 3
- RV40_WEIGHT nornd, 16, 4
- INIT_XMM sse2
- RV40_WEIGHT rnd, 8, 3
- RV40_WEIGHT rnd, 16, 4
- RV40_WEIGHT nornd, 8, 3
- RV40_WEIGHT nornd, 16, 4
- INIT_XMM ssse3
- RV40_WEIGHT rnd, 8, 3
- RV40_WEIGHT rnd, 16, 4
- RV40_WEIGHT nornd, 8, 3
- RV40_WEIGHT nornd, 16, 4