;*****************************************************************************
;* x86-optimized functions for colorspace filter
;*
;* Copyright (C) 2016 Ronald S. Bultje <rsbultje@gmail.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
%include "libavutil/x86/x86util.asm"

SECTION_RODATA

; 16-bit word constants, replicated across all 8 lanes of an XMM register.
; These double as a lookup table for the pw_ %+ <value> token-pasted
; references below (rounding terms and offsets at various bit depths).
pw_1:      times 8 dw 1
pw_2:      times 8 dw 2
pw_4:      times 8 dw 4
pw_8:      times 8 dw 8
pw_16:     times 8 dw 16
pw_64:     times 8 dw 64
pw_128:    times 8 dw 128
pw_256:    times 8 dw 256
pw_512:    times 8 dw 512
pw_1023:   times 8 dw 1023
pw_1024:   times 8 dw 1024
pw_2048:   times 8 dw 2048
pw_4095:   times 8 dw 4095
pw_8192:   times 8 dw 8192
pw_16384:  times 8 dw 16384

; 32-bit dword constants, replicated across all 4 lanes of an XMM register;
; referenced via pd_ %+ <value> token pasting.
pd_1:      times 4 dd 1
pd_2:      times 4 dd 2
pd_128:    times 4 dd 128
pd_512:    times 4 dd 512
pd_2048:   times 4 dd 2048
pd_8192:   times 4 dd 8192
pd_32768:  times 4 dd 32768
pd_131072: times 4 dd 131072

SECTION .text
; void ff_yuv2yuv_420p8to8_sse2(uint8_t *yuv_out[3], ptrdiff_t yuv_out_stride[3],
;                               uint8_t *yuv_in[3], ptrdiff_t yuv_in_stride[3],
;                               int w, int h, const int16_t yuv2yuv_coeffs[3][3][8],
;                               const int16_t yuv_offset[2][8])
%if ARCH_X86_64
; YUV -> YUV range/matrix conversion for one subsampling layout, generated for
; each input/output bit-depth pair. Coefficients are Q14 fixed point and are
; applied with pmaddwd on word-interleaved pixel pairs.
%macro YUV2YUV_FN 4 ; in_bitdepth, out_bitdepth, log2_chroma_w (horiz), log2_chroma_h (vert)

; total fixed-point shift: 14-bit coefficients, adjusted for the depth change
%assign %%sh (14 + %1 - %2)
%assign %%rnd (1 << (%%sh - 1))
; chroma bias (128 at 8-bit) scaled to the input/output bit depth
%assign %%uvinoff (128 << (%1 - 8))
%assign %%uvoutoff (128 << (%2 - 8))
%if %3 == 0
%assign %%ss 444
%elif %4 == 0
%assign %%ss 422
%else ; %4 == 1
%assign %%ss 420
%endif ; %3/%4
%if %2 != 8
%assign %%maxval (1 << %2) - 1
%endif ; %2 != 8

; the luma output offset is folded into the pmaddwd via a pre-scaled
; multiplier; split %%sh so the scaled offset still fits in a signed word
%assign %%ypsh %%sh - 1
%if %%ypsh > 14
%assign %%yoffsh %%ypsh - 13
%assign %%ypsh 14
%else
%assign %%yoffsh 1
%endif
%assign %%yprnd (1 << (%%yoffsh - 1))
%assign %%ypmul (1 << %%ypsh)

cglobal yuv2yuv_ %+ %%ss %+ p%1to%2, 8, 14, 16, 0 - (4 * mmsize), \
                                     yo, yos, yi, yis, w, h, c, yoff, ui, vi, uo, vo
; round subsampled dimensions up to whole chroma blocks
%if %3 == 1
    inc            wd
    sar            wd, 1
%if %4 == 1
    inc            hd
    sar            hd, 1
%endif ; %4 == 1
%endif ; %3 == 1
    ; spill the loop bounds: all 14 GPRs are needed for plane pointers/strides
    mov  [rsp+3*mmsize+0], wd
    mov  [rsp+3*mmsize+4], hd
    mova          m10, [cq]
    pxor          m11, m11
    mova          m12, [pd_ %+ %%uvoutoff]
    pslld         m12, %%sh
    paddd         m12, [pd_ %+ %%rnd]          ; chroma out-offset + rounding
    mova          m13, [pw_ %+ %%uvinoff]
    mova          m14, [yoffq+ 0]              ; y_off_in
    mova          m15, [yoffq+16]              ; y_off_out
%if %%yoffsh != 0
    psllw         m15, %%yoffsh
%endif
    paddw         m15, [pw_ %+ %%yprnd]
    punpcklwd     m10, m15                     ; interleave cyy with y offset+rnd
    mova          m15, [pw_ %+ %%ypmul]
    movh           m0, [cq+1*16]               ; cyu
    movh           m1, [cq+2*16]               ; cyv
    movh           m2, [cq+4*16]               ; cuu
    movh           m3, [cq+5*16]               ; cuv
    movh           m4, [cq+7*16]               ; cvu
    movh           m5, [cq+8*16]               ; cvv
    punpcklwd      m0, m1
    punpcklwd      m2, m3
    punpcklwd      m4, m5
    mova [rsp+0*mmsize], m0                    ; cyu/cyv interleaved
    mova [rsp+1*mmsize], m2                    ; cuu/cuv interleaved
    mova [rsp+2*mmsize], m4                    ; cvu/cvv interleaved

    ; dereference the plane-pointer and stride arrays
    DEFINE_ARGS yo, yos, yi, yis, ui, vi, uo, vo, uis, vis, uos, vos, x, tmp
    mov           uiq, [yiq+gprsize*1]
    mov           viq, [yiq+gprsize*2]
    mov           yiq, [yiq+gprsize*0]
    mov           uoq, [yoq+gprsize*1]
    mov           voq, [yoq+gprsize*2]
    mov           yoq, [yoq+gprsize*0]
    mov          uisq, [yisq+gprsize*1]
    mov          visq, [yisq+gprsize*2]
    mov          yisq, [yisq+gprsize*0]
    mov          uosq, [yosq+gprsize*1]
    mov          vosq, [yosq+gprsize*2]
    mov          yosq, [yosq+gprsize*0]
.loop_v:
    xor            xq, xq

.loop_h:
%if %4 == 1
    lea          tmpq, [yiq+yisq]              ; second (bottom) luma row
%endif ; %4 == 1
%if %1 == 8
    movu           m0, [yiq+xq*(1<<%3)]        ; y00/01
%if %4 == 1
    movu           m2, [tmpq+xq*2]             ; y10/11
%endif ; %4 == 1
%if %3 == 1
    movh           m4, [uiq+xq]                ; u
    movh           m5, [viq+xq]                ; v
%else ; %3 != 1
    movu           m4, [uiq+xq]                ; u
    movu           m5, [viq+xq]                ; v
%endif ; %3 ==/!= 1
    ; widen 8-bit samples to words (m11 = 0)
    punpckhbw      m1, m0, m11
    punpcklbw      m0, m11
%if %4 == 1
    punpckhbw      m3, m2, m11
    punpcklbw      m2, m11
%endif ; %4 == 1
%if %3 == 0
    punpckhbw      m2, m4, m11
    punpckhbw      m3, m5, m11
%endif ; %3 == 0
    punpcklbw      m4, m11
    punpcklbw      m5, m11
%else ; %1 != 8
    movu           m0, [yiq+xq*(2<<%3)]        ; y00/01
    movu           m1, [yiq+xq*(2<<%3)+mmsize] ; y00/01
%if %4 == 1
    movu           m2, [tmpq+xq*4]             ; y10/11
    movu           m3, [tmpq+xq*4+mmsize]      ; y10/11
%endif ; %4 == 1
    movu           m4, [uiq+xq*2]              ; u
    movu           m5, [viq+xq*2]              ; v
%if %3 == 0
    movu           m2, [uiq+xq*2+mmsize]
    movu           m3, [viq+xq*2+mmsize]
%endif ; %3 == 0
%endif ; %1 ==/!= 8
    ; remove the input offsets
    psubw          m0, m14
    psubw          m1, m14
%if %4 == 1
    psubw          m2, m14
    psubw          m3, m14
%endif ; %4 == 1
    psubw          m4, m13
    psubw          m5, m13
%if %3 == 0
    psubw          m2, m13
    psubw          m3, m13
%endif ; %3 == 0

    ; chroma conversion: interleave u/v words, multiply-accumulate with the
    ; interleaved coefficient pairs
    SBUTTERFLY     wd, 4, 5, 6
    pmaddwd        m6, m4, [rsp+1*mmsize]
    pmaddwd        m7, m5, [rsp+1*mmsize]
%if %3 == 0
    SBUTTERFLY     wd, 2, 3, 8
    pmaddwd        m8, m2, [rsp+1*mmsize]
    pmaddwd        m9, m3, [rsp+1*mmsize]
%else ; %3 != 0
    pmaddwd        m8, m4, [rsp+2*mmsize]
    pmaddwd        m9, m5, [rsp+2*mmsize]
%endif
    paddd          m6, m12
    paddd          m7, m12
    paddd          m8, m12
    paddd          m9, m12
    psrad          m6, %%sh
    psrad          m7, %%sh
    psrad          m8, %%sh
    psrad          m9, %%sh
    packssdw       m6, m7
    packssdw       m8, m9
%if %2 == 8
    packuswb       m6, m8
%if %3 == 0
    movu    [uoq+xq], m6
%else ; %3 != 0
    movh    [uoq+xq], m6
    movhps  [voq+xq], m6
%endif ; %3 ==/!= 0
%else ; %2 != 8
    CLIPW          m6, m11, [pw_ %+ %%maxval]
    CLIPW          m8, m11, [pw_ %+ %%maxval]
    movu  [uoq+xq*2], m6
%if %3 == 0
    movu [uoq+xq*2+mmsize], m8
%else ; %3 != 0
    movu  [voq+xq*2], m8
%endif ; %3 ==/!= 0
%endif ; %2 ==/!= 8

%if %3 == 0
    ; 4:4:4 only: the v plane needs a second pass with the cvu/cvv pair
    pmaddwd        m6, m4, [rsp+2*mmsize]
    pmaddwd        m7, m5, [rsp+2*mmsize]
    pmaddwd        m8, m2, [rsp+2*mmsize]
    pmaddwd        m9, m3, [rsp+2*mmsize]
    paddd          m6, m12
    paddd          m7, m12
    paddd          m8, m12
    paddd          m9, m12
    psrad          m6, %%sh
    psrad          m7, %%sh
    psrad          m8, %%sh
    psrad          m9, %%sh
    packssdw       m6, m7
    packssdw       m8, m9
%if %2 == 8
    packuswb       m6, m8
    movu    [voq+xq], m6
%else ; %2 != 8
    CLIPW          m6, m11, [pw_ %+ %%maxval]
    CLIPW          m8, m11, [pw_ %+ %%maxval]
    movu  [voq+xq*2], m6
    movu [voq+xq*2+mmsize], m8
%endif ; %2 ==/!= 8
%endif ; %3 == 0

    ; chroma contribution to luma
    pmaddwd        m4, [rsp+0*mmsize]
    pmaddwd        m5, [rsp+0*mmsize]          ; uv_val
%if %3 == 0
    pmaddwd        m2, [rsp+0*mmsize]
    pmaddwd        m3, [rsp+0*mmsize]
%endif ; %3 == 0

    ; unpack y pixels with m15 (shifted round + offset), then multiply
    ; by m10, add uv pixels, and we're done!
%if %3 == 1
    ; duplicate each chroma dword to cover two horizontal luma pixels
    punpckhdq      m8, m4, m4
    punpckldq      m4, m4
    punpckhdq      m9, m5, m5
    punpckldq      m5, m5
%else ; %3 != 1
    SWAP            8, 5, 2
    SWAP            3, 9
%endif ; %3 ==/!= 1
%if %4 == 1
    ; bottom luma row
    punpckhwd      m6, m2, m15
    punpcklwd      m2, m15
    punpckhwd      m7, m3, m15
    punpcklwd      m3, m15
    pmaddwd        m2, m10
    pmaddwd        m6, m10
    pmaddwd        m3, m10
    pmaddwd        m7, m10
    paddd          m2, m4
    paddd          m6, m8
    paddd          m3, m5
    paddd          m7, m9
    psrad          m2, %%sh
    psrad          m6, %%sh
    psrad          m3, %%sh
    psrad          m7, %%sh
    packssdw       m2, m6
    packssdw       m3, m7
    lea          tmpq, [yoq+yosq]
%if %2 == 8
    packuswb       m2, m3
    movu  [tmpq+xq*2], m2
%else ; %2 != 8
    CLIPW          m2, m11, [pw_ %+ %%maxval]
    CLIPW          m3, m11, [pw_ %+ %%maxval]
    movu  [tmpq+xq*4], m2
    movu [tmpq+xq*4+mmsize], m3
%endif ; %2 ==/!= 8
%endif ; %4 == 1
    ; top luma row
    punpckhwd      m6, m0, m15
    punpcklwd      m0, m15
    punpckhwd      m7, m1, m15
    punpcklwd      m1, m15
    pmaddwd        m0, m10
    pmaddwd        m6, m10
    pmaddwd        m1, m10
    pmaddwd        m7, m10
    paddd          m0, m4
    paddd          m6, m8
    paddd          m1, m5
    paddd          m7, m9
    psrad          m0, %%sh
    psrad          m6, %%sh
    psrad          m1, %%sh
    psrad          m7, %%sh
    packssdw       m0, m6
    packssdw       m1, m7
%if %2 == 8
    packuswb       m0, m1
    movu [yoq+xq*(1<<%3)], m0
%else ; %2 != 8
    CLIPW          m0, m11, [pw_ %+ %%maxval]
    CLIPW          m1, m11, [pw_ %+ %%maxval]
    movu [yoq+xq*(2<<%3)], m0
    movu [yoq+xq*(2<<%3)+mmsize], m1
%endif ; %2 ==/!= 8
    add            xq, mmsize >> %3
    cmp            xd, dword [rsp+3*mmsize+0]
    jl .loop_h

    ; advance to the next row (pair of rows for 4:2:0 luma)
%if %4 == 1
    lea           yiq, [yiq+yisq*2]
    lea           yoq, [yoq+yosq*2]
%else ; %4 != 1
    add           yiq, yisq
    add           yoq, yosq
%endif ; %4 ==/!= 1
    add           uiq, uisq
    add           viq, visq
    add           uoq, uosq
    add           voq, vosq
    dec dword [rsp+3*mmsize+4]
    jg .loop_v
    RET
%endmacro
; Instantiate the yuv2yuv conversion for every supported input/output
; bit-depth combination at one subsampling layout.
%macro YUV2YUV_FNS 2 ; ss_w, ss_h
YUV2YUV_FN  8,  8, %1, %2
YUV2YUV_FN 10,  8, %1, %2
YUV2YUV_FN 12,  8, %1, %2
YUV2YUV_FN  8, 10, %1, %2
YUV2YUV_FN 10, 10, %1, %2
YUV2YUV_FN 12, 10, %1, %2
YUV2YUV_FN  8, 12, %1, %2
YUV2YUV_FN 10, 12, %1, %2
YUV2YUV_FN 12, 12, %1, %2
%endmacro

INIT_XMM sse2
YUV2YUV_FNS 0, 0 ; 4:4:4
YUV2YUV_FNS 1, 0 ; 4:2:2
YUV2YUV_FNS 1, 1 ; 4:2:0
; void ff_yuv2rgb_420p8_sse2(int16_t *rgb[3], ptrdiff_t rgb_stride,
;                            uint8_t *yuv[3], ptrdiff_t yuv_stride[3],
;                            int w, int h, const int16_t yuv2rgb_coeffs[3][3][8],
;                            const int16_t yuv_offset[8])
; YUV -> intermediate RGB (int16_t planes), generated per depth/subsampling.
; Chroma r/g/b offsets are computed once at chroma resolution and then
; duplicated (horizontally) / reused (vertically) for the luma pixels.
%macro YUV2RGB_FN 3 ; depth, log2_chroma_w (horiz), log2_chroma_h (vert)
%assign %%sh (%1 - 1)
%assign %%rnd (1 << (%%sh - 1))
%assign %%uvoff (1 << (%1 - 1))
%if %2 == 0
%assign %%ss 444
%elif %3 == 0
%assign %%ss 422
%else ; %3 == 1
%assign %%ss 420
%endif ; %2/%3

cglobal yuv2rgb_ %+ %%ss %+ p%1, 8, 14, 16, 0 - 8 * mmsize, \
                                 rgb, rgbs, yuv, yuvs, ww, h, c, yoff
; round subsampled dimensions up to whole chroma blocks
%if %2 == 1
    inc           wwd
    sar           wwd, 1
%endif ; %2 == 1
%if %3 == 1
    inc            hd
    sar            hd, 1
%endif ; %3 == 1
    ; interleave coefficient pairs for pmaddwd and cache them on the stack
    pxor          m11, m11
    mova          m15, [yoffq]              ; yoff
    movh          m14, [cq+  0]             ; cy
    movh          m10, [cq+ 32]             ; crv
    movh          m13, [cq+112]             ; cbu
    movh          m12, [cq+ 64]             ; cgu
    movh           m9, [cq+ 80]             ; cgv
    punpcklwd     m14, [pw_ %+ %%rnd]       ; cy, rnd
    punpcklwd     m13, m11                  ; cbu, 0
    punpcklwd     m11, m10                  ; 0, crv
    punpcklwd     m12, m9                   ; cgu, cgv
    mova [rsp+0*mmsize], m11
    mova [rsp+1*mmsize], m12
    mova [rsp+2*mmsize], m13
    mova [rsp+3*mmsize], m14
    pxor          m14, m14                  ; m14 = 0 from here on

    ; dereference the plane-pointer and stride arrays
    DEFINE_ARGS r, rgbs, y, ys, ww, h, g, b, u, v, us, vs, x, tmp
    mov            gq, [rq+1*gprsize]
    mov            bq, [rq+2*gprsize]
    mov            rq, [rq+0*gprsize]
    mov            uq, [yq+1*gprsize]
    mov            vq, [yq+2*gprsize]
    mov            yq, [yq+0*gprsize]
    mov           usq, [ysq+1*gprsize]
    mov           vsq, [ysq+2*gprsize]
    mov           ysq, [ysq+0*gprsize]
.loop_v:
    xor            xq, xq

.loop_h:
%if %3 == 1
    lea          tmpq, [yq+ysq]             ; second (bottom) luma row
%endif ; %3 == 1
%if %1 == 8
    movu           m0, [yq+xq*(1<<%2)]
%if %3 == 1
    movu           m2, [tmpq+xq*2]
%endif ; %3 == 1
%if %2 == 1
    movh           m4, [uq+xq]
    movh           m5, [vq+xq]
%else ; %2 != 1
    movu           m4, [uq+xq]
    movu           m5, [vq+xq]
%endif ; %2 ==/!= 1
    ; widen 8-bit samples to words (m14 = 0)
    punpckhbw      m1, m0, m14
    punpcklbw      m0, m14
%if %3 == 1
    punpckhbw      m3, m2, m14
    punpcklbw      m2, m14
%endif ; %3 == 1
%if %2 == 0
    punpckhbw      m2, m4, m14
    punpckhbw      m3, m5, m14
%endif ; %2 == 0
    punpcklbw      m4, m14
    punpcklbw      m5, m14
%else ; %1 != 8
    movu           m0, [yq+xq*(2<<%2)]
    movu           m1, [yq+xq*(2<<%2)+mmsize]
%if %3 == 1
    movu           m2, [tmpq+xq*4]
    movu           m3, [tmpq+xq*4+mmsize]
%endif ; %3 == 1
    movu           m4, [uq+xq*2]
    movu           m5, [vq+xq*2]
%if %2 == 0
    movu           m2, [uq+xq*2+mmsize]
    movu           m3, [vq+xq*2+mmsize]
%endif ; %2 == 0
%endif ; %1 ==/!= 8
    ; remove luma/chroma input offsets
    psubw          m0, m15
    psubw          m1, m15
%if %3 == 1
    psubw          m2, m15
    psubw          m3, m15
%endif ; %3 == 1
    psubw          m4, [pw_ %+ %%uvoff]
    psubw          m5, [pw_ %+ %%uvoff]
    SBUTTERFLY     wd, 4, 5, 6
%if %2 == 0
    psubw          m2, [pw_ %+ %%uvoff]
    psubw          m3, [pw_ %+ %%uvoff]
    SBUTTERFLY     wd, 2, 3, 6
%endif ; %2 == 0

    ; calculate y+rnd full-resolution [0-3,6-9]
    punpckhwd      m6, m0, [pw_1]           ; y, 1
    punpcklwd      m0, [pw_1]               ; y, 1
    punpckhwd      m7, m1, [pw_1]           ; y, 1
    punpcklwd      m1, [pw_1]               ; y, 1
    pmaddwd        m0, [rsp+3*mmsize]
    pmaddwd        m6, [rsp+3*mmsize]
    pmaddwd        m1, [rsp+3*mmsize]
    pmaddwd        m7, [rsp+3*mmsize]
%if %3 == 1
    ; bottom-row y goes to the stack; it is re-read for each of r/g/b
    punpckhwd      m8, m2, [pw_1]           ; y, 1
    punpcklwd      m2, [pw_1]               ; y, 1
    punpckhwd      m9, m3, [pw_1]           ; y, 1
    punpcklwd      m3, [pw_1]               ; y, 1
    pmaddwd        m2, [rsp+3*mmsize]
    pmaddwd        m8, [rsp+3*mmsize]
    pmaddwd        m3, [rsp+3*mmsize]
    pmaddwd        m9, [rsp+3*mmsize]
    mova [rsp+4*mmsize], m2
    mova [rsp+5*mmsize], m8
    mova [rsp+6*mmsize], m3
    mova [rsp+7*mmsize], m9
%endif ; %3 == 1

    ; calculate r offsets (un-subsampled, then duplicate)
    pmaddwd       m10, m4, [rsp+0*mmsize]
%if %2 == 1
    pmaddwd       m12, m5, [rsp+0*mmsize]
    punpckhdq     m11, m10, m10
    punpckldq     m10, m10
    punpckhdq     m13, m12, m12
    punpckldq     m12, m12
%else ; %2 != 1
    pmaddwd       m11, m5, [rsp+0*mmsize]
    pmaddwd       m12, m2, [rsp+0*mmsize]
    pmaddwd       m13, m3, [rsp+0*mmsize]
%endif ; %2 ==/!= 1
%if %3 == 1
    paddd          m2, m10, [rsp+4*mmsize]
    paddd          m3, m11, [rsp+5*mmsize]
    paddd          m8, m12, [rsp+6*mmsize]
    paddd          m9, m13, [rsp+7*mmsize]
%endif
    paddd         m10, m0
    paddd         m11, m6
    paddd         m12, m1
    paddd         m13, m7
%if %3 == 1
    psrad          m2, %%sh
    psrad          m3, %%sh
    psrad          m8, %%sh
    psrad          m9, %%sh
%endif ; %3 == 1
    psrad         m10, %%sh
    psrad         m11, %%sh
    psrad         m12, %%sh
    psrad         m13, %%sh
%if %3 == 1
    lea          tmpq, [rq+rgbsq*2]
    packssdw       m2, m3
    packssdw       m8, m9
    mova  [tmpq+xq*4], m2
    mova [tmpq+xq*4+mmsize], m8
%endif ; %3 == 1
    packssdw      m10, m11
    packssdw      m12, m13
    mova [rq+xq*(2 << %2)], m10
    mova [rq+xq*(2 << %2)+mmsize], m12

    ; calculate g offsets (un-subsampled, then duplicate)
    pmaddwd       m10, m4, [rsp+1*mmsize]
%if %2 == 1
    pmaddwd       m12, m5, [rsp+1*mmsize]
    punpckhdq     m11, m10, m10
    punpckldq     m10, m10
    punpckhdq     m13, m12, m12
    punpckldq     m12, m12
%else ; %2 != 1
    pmaddwd       m11, m5, [rsp+1*mmsize]
    pmaddwd       m12, m2, [rsp+1*mmsize]
    pmaddwd       m13, m3, [rsp+1*mmsize]
%endif ; %2 ==/!= 1
%if %3 == 1
    paddd          m2, m10, [rsp+4*mmsize]
    paddd          m3, m11, [rsp+5*mmsize]
    paddd          m8, m12, [rsp+6*mmsize]
    paddd          m9, m13, [rsp+7*mmsize]
%endif ; %3 == 1
    paddd         m10, m0
    paddd         m11, m6
    paddd         m12, m1
    paddd         m13, m7
%if %3 == 1
    psrad          m2, %%sh
    psrad          m3, %%sh
    psrad          m8, %%sh
    psrad          m9, %%sh
%endif ; %3 == 1
    psrad         m10, %%sh
    psrad         m11, %%sh
    psrad         m12, %%sh
    psrad         m13, %%sh
%if %3 == 1
    lea          tmpq, [gq+rgbsq*2]
    packssdw       m2, m3
    packssdw       m8, m9
    mova  [tmpq+xq*4], m2
    mova [tmpq+xq*4+mmsize], m8
%endif ; %3 == 1
    packssdw      m10, m11
    packssdw      m12, m13
    mova [gq+xq*(2 << %2)], m10
    mova [gq+xq*(2 << %2)+mmsize], m12

    ; calculate b offsets (un-subsampled, then duplicate)
    pmaddwd        m4, [rsp+2*mmsize]
    pmaddwd        m5, [rsp+2*mmsize]
%if %2 == 1
    punpckhdq      m2, m4, m4
    punpckldq      m4, m4
    punpckhdq      m3, m5, m5
    punpckldq      m5, m5
%else ; %2 != 1
    pmaddwd        m2, [rsp+2*mmsize]
    pmaddwd        m3, [rsp+2*mmsize]
    SWAP            2, 5
%endif ; %2 ==/!= 1
    paddd          m0, m4
    paddd          m6, m2
    paddd          m1, m5
    paddd          m7, m3
%if %3 == 1
    paddd          m4, [rsp+4*mmsize]
    paddd          m2, [rsp+5*mmsize]
    paddd          m5, [rsp+6*mmsize]
    paddd          m3, [rsp+7*mmsize]
%endif ; %3 == 1
    psrad          m0, %%sh
    psrad          m6, %%sh
    psrad          m1, %%sh
    psrad          m7, %%sh
%if %3 == 1
    psrad          m4, %%sh
    psrad          m2, %%sh
    psrad          m5, %%sh
    psrad          m3, %%sh
%endif ; %3 == 1
    packssdw       m0, m6
    packssdw       m1, m7
    movu [bq+xq*(2 << %2)], m0
    movu [bq+xq*(2 << %2)+mmsize], m1
%if %3 == 1
    lea          tmpq, [bq+rgbsq*2]
    packssdw       m4, m2
    packssdw       m5, m3
    movu  [tmpq+xq*4], m4
    movu [tmpq+xq*4+mmsize], m5
%endif ; %3 == 1
    add            xd, mmsize >> %2
    cmp            xd, wwd
    jl .loop_h

    ; advance to the next row (pair of rows for 4:2:0)
    lea            rq, [rq+rgbsq*(2 << %3)]
    lea            gq, [gq+rgbsq*(2 << %3)]
    lea            bq, [bq+rgbsq*(2 << %3)]
%if %3 == 1
    lea            yq, [yq+ysq*2]
%else ; %3 != 0
    add            yq, ysq
%endif ; %3 ==/!= 1
    add            uq, usq
    add            vq, vsq
    dec            hd
    jg .loop_v
    RET
%endmacro
; Instantiate the yuv2rgb conversion for each supported bit depth at one
; subsampling layout.
%macro YUV2RGB_FNS 2 ; log2_chroma_w, log2_chroma_h
YUV2RGB_FN  8, %1, %2
YUV2RGB_FN 10, %1, %2
YUV2RGB_FN 12, %1, %2
%endmacro

INIT_XMM sse2
YUV2RGB_FNS 0, 0 ; 4:4:4
YUV2RGB_FNS 1, 0 ; 4:2:2
YUV2RGB_FNS 1, 1 ; 4:2:0
; Intermediate RGB (int16_t planes) -> YUV, generated per depth/subsampling.
; Args (see cglobal below): yuv[3] plane pointers, yuv_stride[3], rgb[3]
; plane pointers, rgb stride (scalar), w, h, coefficients, offset.
; For subsampled output, chroma is produced by box-averaging 2 (4:2:2) or
; 4 (4:2:0) RGB pixels before the matrix multiply.
%macro RGB2YUV_FN 3 ; depth, log2_chroma_w (horiz), log2_chroma_h (vert)
%assign %%sh 29 - %1
%assign %%rnd (1 << (%%sh - 15))
%assign %%uvrnd ((128 << (%1 - 8)) << (%%sh - 14))
%if %1 != 8
%assign %%maxval ((1 << %1) - 1)
%endif ; %1 != 8
%if %2 == 0
%assign %%ss 444
%elif %3 == 0
%assign %%ss 422
%else ; %3 == 1
%assign %%ss 420
%endif ; %2/%3

cglobal rgb2yuv_ %+ %%ss %+ p%1, 8, 14, 16, 0 - 6 * mmsize, \
                                 yuv, yuvs, rgb, rgbs, ww, h, c, off
; round subsampled dimensions up to whole chroma blocks
%if %2 == 1
    inc           wwd
    sar           wwd, 1
%endif ; %2 == 1
%if %3 == 1
    inc            hd
    sar            hd, 1
%endif ; %3 == 1

    ; prepare coeffs
    movh           m8, [offq]
    movh           m9, [pw_ %+ %%uvrnd]
    psllw          m8, %%sh - 14
    paddw          m9, [pw_ %+ %%rnd]
    paddw          m8, [pw_ %+ %%rnd]
    movh           m0, [cq+  0]
    movh           m1, [cq+ 16]
    movh           m2, [cq+ 32]
    movh           m3, [cq+ 48]
    movh           m4, [cq+ 64]
    movh           m5, [cq+ 80]
    movh           m6, [cq+112]
    movh           m7, [cq+128]
    punpcklwd      m0, m1
    punpcklwd      m2, m8
    punpcklwd      m3, m4
    punpcklwd      m4, m5, m9
    punpcklwd      m5, m6
    punpcklwd      m7, m9
    mova [rsp+0*mmsize], m0                 ; cry, cgy
    mova [rsp+1*mmsize], m2                 ; cby, off + rnd
    mova [rsp+2*mmsize], m3                 ; cru, cgu
    mova [rsp+3*mmsize], m4                 ; cburv, uvoff + rnd
    mova [rsp+4*mmsize], m5                 ; cburv, cgv
    mova [rsp+5*mmsize], m7                 ; cbv, uvoff + rnd

    ; dereference the plane-pointer and stride arrays
    DEFINE_ARGS y, ys, r, rgbs, ww, h, u, v, us, vs, g, b, tmp, x
    mov            gq, [rq+gprsize*1]
    mov            bq, [rq+gprsize*2]
    mov            rq, [rq+gprsize*0]
    mov            uq, [yq+gprsize*1]
    mov            vq, [yq+gprsize*2]
    mov            yq, [yq+gprsize*0]
    mov           usq, [ysq+gprsize*1]
    mov           vsq, [ysq+gprsize*2]
    mov           ysq, [ysq+gprsize*0]
    pxor          m15, m15
.loop_v:
    xor            xd, xd

.loop_h:
    ; top line y
    mova           m0, [rq+xq*(2<<%2)]
    mova           m3, [rq+xq*(2<<%2)+mmsize]
    mova           m1, [gq+xq*(2<<%2)]
    mova           m4, [gq+xq*(2<<%2)+mmsize]
    mova           m2, [bq+xq*(2<<%2)]
    mova           m5, [bq+xq*(2<<%2)+mmsize]
    punpcklwd      m6, m0, m1
    punpckhwd      m7, m0, m1
    punpcklwd      m8, m3, m4
    punpckhwd      m9, m3, m4
    punpcklwd     m10, m2, [pw_16384]
    punpckhwd     m11, m2, [pw_16384]
    punpcklwd     m12, m5, [pw_16384]
    punpckhwd     m13, m5, [pw_16384]
    pmaddwd        m6, [rsp+0*mmsize]
    pmaddwd        m7, [rsp+0*mmsize]
    pmaddwd        m8, [rsp+0*mmsize]
    pmaddwd        m9, [rsp+0*mmsize]
    pmaddwd       m10, [rsp+1*mmsize]
    pmaddwd       m11, [rsp+1*mmsize]
    pmaddwd       m12, [rsp+1*mmsize]
    pmaddwd       m13, [rsp+1*mmsize]
    paddd          m6, m10
    paddd          m7, m11
    paddd          m8, m12
    paddd          m9, m13
    psrad          m6, %%sh
    psrad          m7, %%sh
    psrad          m8, %%sh
    psrad          m9, %%sh
    packssdw       m6, m7
    packssdw       m8, m9
%if %1 == 8
    packuswb       m6, m8
    movu [yq+xq*(1<<%2)], m6
%else
    CLIPW          m6, m15, [pw_ %+ %%maxval]
    CLIPW          m8, m15, [pw_ %+ %%maxval]
    movu [yq+xq*(2<<%2)], m6
    movu [yq+xq*(2<<%2)+mmsize], m8
%endif
%if %2 == 1
    ; subsampling cached data: horizontal pairwise sums of r/g/b
    pmaddwd        m0, [pw_1]
    pmaddwd        m1, [pw_1]
    pmaddwd        m2, [pw_1]
    pmaddwd        m3, [pw_1]
    pmaddwd        m4, [pw_1]
    pmaddwd        m5, [pw_1]
%if %3 == 1
    ; bottom line y, r/g portion only
    lea          tmpq, [rgbsq+xq*2]
    mova           m6, [rq+tmpq*2]
    mova           m9, [rq+tmpq*2+mmsize]
    mova           m7, [gq+tmpq*2]
    mova          m10, [gq+tmpq*2+mmsize]
    mova           m8, [bq+tmpq*2]
    mova          m11, [bq+tmpq*2+mmsize]
    punpcklwd     m12, m6, m7
    punpckhwd     m13, m6, m7
    punpcklwd     m14, m9, m10
    punpckhwd     m15, m9, m10
    ; release two more registers
    pmaddwd        m6, [pw_1]
    pmaddwd        m7, [pw_1]
    pmaddwd        m9, [pw_1]
    pmaddwd       m10, [pw_1]
    paddd          m0, m6
    paddd          m3, m9
    paddd          m1, m7
    paddd          m4, m10
    ; bottom line y, b/rnd portion only
    punpcklwd      m6, m8, [pw_16384]
    punpckhwd      m7, m8, [pw_16384]
    punpcklwd      m9, m11, [pw_16384]
    punpckhwd     m10, m11, [pw_16384]
    pmaddwd       m12, [rsp+0*mmsize]
    pmaddwd       m13, [rsp+0*mmsize]
    pmaddwd       m14, [rsp+0*mmsize]
    pmaddwd       m15, [rsp+0*mmsize]
    pmaddwd        m6, [rsp+1*mmsize]
    pmaddwd        m7, [rsp+1*mmsize]
    pmaddwd        m9, [rsp+1*mmsize]
    pmaddwd       m10, [rsp+1*mmsize]
    paddd         m12, m6
    paddd         m13, m7
    paddd         m14, m9
    paddd         m15, m10
    psrad         m12, %%sh
    psrad         m13, %%sh
    psrad         m14, %%sh
    psrad         m15, %%sh
    packssdw      m12, m13
    packssdw      m14, m15
    lea          tmpq, [yq+ysq]
%if %1 == 8
    packuswb      m12, m14
    movu  [tmpq+xq*2], m12
%else
    pxor          m15, m15                  ; m15 was clobbered above; re-zero
    CLIPW         m12, m15, [pw_ %+ %%maxval]
    CLIPW         m14, m15, [pw_ %+ %%maxval]
    movu  [tmpq+xq*4], m12
    movu [tmpq+xq*4+mmsize], m14
%endif
    ; complete subsampling of r/g/b pixels for u/v
    pmaddwd        m8, [pw_1]
    pmaddwd       m11, [pw_1]
    paddd          m2, m8
    paddd          m5, m11
    ; 4 pixels summed -> round and divide by 4
    paddd          m0, [pd_2]
    paddd          m1, [pd_2]
    paddd          m2, [pd_2]
    paddd          m3, [pd_2]
    paddd          m4, [pd_2]
    paddd          m5, [pd_2]
    psrad          m0, 2
    psrad          m1, 2
    psrad          m2, 2
    psrad          m3, 2
    psrad          m4, 2
    psrad          m5, 2
%else ; %3 != 1
    ; 2 pixels summed -> round and divide by 2
    paddd          m0, [pd_1]
    paddd          m1, [pd_1]
    paddd          m2, [pd_1]
    paddd          m3, [pd_1]
    paddd          m4, [pd_1]
    paddd          m5, [pd_1]
    psrad          m0, 1
    psrad          m1, 1
    psrad          m2, 1
    psrad          m3, 1
    psrad          m4, 1
    psrad          m5, 1
%endif ; %3 ==/!= 1
    packssdw       m0, m3
    packssdw       m1, m4
    packssdw       m2, m5
%endif ; %2 == 1
    ; convert u/v pixels
    SBUTTERFLY     wd, 0, 1, 6
    punpckhwd      m6, m2, [pw_16384]
    punpcklwd      m2, [pw_16384]
    pmaddwd        m7, m0, [rsp+2*mmsize]
    pmaddwd        m8, m1, [rsp+2*mmsize]
    pmaddwd        m9, m2, [rsp+3*mmsize]
    pmaddwd       m10, m6, [rsp+3*mmsize]
    pmaddwd        m0, [rsp+4*mmsize]
    pmaddwd        m1, [rsp+4*mmsize]
    pmaddwd        m2, [rsp+5*mmsize]
    pmaddwd        m6, [rsp+5*mmsize]
    paddd          m7, m9                   ; u
    paddd          m8, m10
    paddd          m0, m2                   ; v
    paddd          m1, m6
    psrad          m7, %%sh
    psrad          m8, %%sh
    psrad          m0, %%sh
    psrad          m1, %%sh
    packssdw       m7, m8
    packssdw       m0, m1
%if %2 == 1
%if %1 == 8
    packuswb       m7, m0
    movh     [uq+xq], m7
    movhps   [vq+xq], m7
%else
    CLIPW          m7, m15, [pw_ %+ %%maxval]
    CLIPW          m0, m15, [pw_ %+ %%maxval]
    movu   [uq+xq*2], m7
    movu   [vq+xq*2], m0
%endif
%else ; %2 != 1
    ; second set of u/v pixels
    SBUTTERFLY     wd, 3, 4, 6
    punpckhwd      m6, m5, [pw_16384]
    punpcklwd      m5, [pw_16384]
    pmaddwd        m8, m3, [rsp+2*mmsize]
    pmaddwd        m9, m4, [rsp+2*mmsize]
    pmaddwd       m10, m5, [rsp+3*mmsize]
    pmaddwd       m11, m6, [rsp+3*mmsize]
    pmaddwd        m3, [rsp+4*mmsize]
    pmaddwd        m4, [rsp+4*mmsize]
    pmaddwd        m5, [rsp+5*mmsize]
    pmaddwd        m6, [rsp+5*mmsize]
    paddd          m8, m10                  ; u
    paddd          m9, m11
    paddd          m3, m5                   ; v
    paddd          m4, m6
    psrad          m8, %%sh
    psrad          m9, %%sh
    psrad          m3, %%sh
    psrad          m4, %%sh
    packssdw       m8, m9
    packssdw       m3, m4
%if %1 == 8
    packuswb       m7, m8
    packuswb       m0, m3
    movu     [uq+xq], m7
    movu     [vq+xq], m0
%else
    CLIPW          m7, m15, [pw_ %+ %%maxval]
    CLIPW          m0, m15, [pw_ %+ %%maxval]
    CLIPW          m8, m15, [pw_ %+ %%maxval]
    CLIPW          m3, m15, [pw_ %+ %%maxval]
    movu   [uq+xq*2], m7
    movu [uq+xq*2+mmsize], m8
    movu   [vq+xq*2], m0
    movu [vq+xq*2+mmsize], m3
%endif
%endif ; %2 ==/!= 1
    add            xq, mmsize >> %2
    cmp            xd, wwd
    jl .loop_h

    ; advance to the next row (pair of rows for 4:2:0)
%if %3 == 0
    add            yq, ysq
%else ; %3 != 0
    lea            yq, [yq+ysq*2]
%endif ; %3 ==/!= 0
    add            uq, usq
    add            vq, vsq
    lea            rq, [rq+rgbsq*(2<<%3)]
    lea            gq, [gq+rgbsq*(2<<%3)]
    lea            bq, [bq+rgbsq*(2<<%3)]
    dec            hd
    jg .loop_v
    RET
%endmacro
; Instantiate the rgb2yuv conversion for each supported bit depth at one
; subsampling layout.
%macro RGB2YUV_FNS 2 ; log2_chroma_w, log2_chroma_h
RGB2YUV_FN  8, %1, %2
RGB2YUV_FN 10, %1, %2
RGB2YUV_FN 12, %1, %2
%endmacro

INIT_XMM sse2
RGB2YUV_FNS 0, 0 ; 4:4:4
RGB2YUV_FNS 1, 0 ; 4:2:2
RGB2YUV_FNS 1, 1 ; 4:2:0
; void ff_multiply3x3_sse2(int16_t *data[3], ptrdiff_t stride,
;                          int w, int h, const int16_t coeff[3][3][8])
; In-place 3x3 matrix multiply over three int16_t planes with Q14
; coefficients: out[p] = sum_k(in[k] * coeff[p][k]) >> 14, per pixel.
INIT_XMM sse2
cglobal multiply3x3, 5, 7, 16, data, stride, ww, h, c
    ; interleave the matrix coefficients pairwise for pmaddwd:
    ; m0=(c00,c01) m1=(c02,8192) m2=(c10,c11) m3=(c12,8192) m4=(c20,c21)
    ; m5=(c22,8192); 8192 = 1<<13 is the rounding term for the >> 14 below
    ; (it is multiplied by the constant-1 words interleaved into m8/m9)
    movh           m0, [cq+  0]
    movh           m1, [cq+ 32]
    movh           m2, [cq+ 48]
    movh           m3, [cq+ 80]
    movh           m4, [cq+ 96]
    movh           m5, [cq+128]
    punpcklwd      m0, [cq+ 16]
    punpcklwd      m1, [pw_8192]
    punpcklwd      m2, [cq+ 64]
    punpcklwd      m3, [pw_8192]
    punpcklwd      m4, [cq+112]
    punpcklwd      m5, [pw_8192]
    DEFINE_ARGS data0, stride, ww, h, data1, data2, x
    shl       strideq, 1                   ; int16_t elements -> bytes
    mov        data1q, [data0q+gprsize*1]
    mov        data2q, [data0q+gprsize*2]
    mov        data0q, [data0q+gprsize*0]
.loop_v:
    xor            xd, xd

.loop_h:
    mova           m6, [data0q+xq*2]
    mova           m7, [data1q+xq*2]
    mova           m8, [data2q+xq*2]
    SBUTTERFLY     wd, 6, 7, 9             ; interleave plane0/plane1 words
    punpckhwd      m9, m8, [pw_1]          ; interleave plane2 with 1s (rnd)
    punpcklwd      m8, [pw_1]
    ; matrix row 0 -> plane 0
    pmaddwd       m10, m6, m0
    pmaddwd       m11, m7, m0
    pmaddwd       m12, m8, m1
    pmaddwd       m13, m9, m1
    paddd         m10, m12
    paddd         m11, m13
    psrad         m10, 14
    psrad         m11, 14
    ; matrix row 1 -> plane 1
    pmaddwd       m12, m6, m2
    pmaddwd       m13, m7, m2
    pmaddwd       m14, m8, m3
    pmaddwd       m15, m9, m3
    paddd         m12, m14
    paddd         m13, m15
    psrad         m12, 14
    psrad         m13, 14
    ; matrix row 2 -> plane 2
    pmaddwd        m6, m4
    pmaddwd        m7, m4
    pmaddwd        m8, m5
    pmaddwd        m9, m5
    paddd          m6, m8
    paddd          m7, m9
    psrad          m6, 14
    psrad          m7, 14
    packssdw      m10, m11
    packssdw      m12, m13
    packssdw       m6, m7
    mova [data0q+xq*2], m10
    mova [data1q+xq*2], m12
    mova [data2q+xq*2], m6
    add            xd, mmsize / 2
    cmp            xd, wwd
    jl .loop_h
    add        data0q, strideq
    add        data1q, strideq
    add        data2q, strideq
    dec            hd
    jg .loop_v
    RET
%endif ; ARCH_X86_64