- ;******************************************************************************
- ;* H.264 intra prediction asm optimizations
- ;* Copyright (c) 2010 Fiona Glaser
- ;* Copyright (c) 2010 Holger Lubitz
- ;* Copyright (c) 2010 Loren Merritt
- ;* Copyright (c) 2010 Ronald S. Bultje
- ;*
- ;* This file is part of FFmpeg.
- ;*
- ;* FFmpeg is free software; you can redistribute it and/or
- ;* modify it under the terms of the GNU Lesser General Public
- ;* License as published by the Free Software Foundation; either
- ;* version 2.1 of the License, or (at your option) any later version.
- ;*
- ;* FFmpeg is distributed in the hope that it will be useful,
- ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
- ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- ;* Lesser General Public License for more details.
- ;*
- ;* You should have received a copy of the GNU Lesser General Public
- ;* License along with FFmpeg; if not, write to the Free Software
- ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- ;******************************************************************************
- %include "libavutil/x86/x86util.asm"
- SECTION_RODATA
- tm_shuf: times 8 db 0x03, 0x80
- pw_ff00: times 8 dw 0xff00
- plane_shuf: db -8, -7, -6, -5, -4, -3, -2, -1
- db 1, 2, 3, 4, 5, 6, 7, 8
- plane8_shuf: db -4, -3, -2, -1, 0, 0, 0, 0
- db 1, 2, 3, 4, 0, 0, 0, 0
- pw_0to7: dw 0, 1, 2, 3, 4, 5, 6, 7
- pw_1to8: dw 1, 2, 3, 4, 5, 6, 7, 8
- pw_m8tom1: dw -8, -7, -6, -5, -4, -3, -2, -1
- pw_m4to4: dw -4, -3, -2, -1, 1, 2, 3, 4
- SECTION .text
- cextern pb_1
- cextern pb_3
- cextern pw_4
- cextern pw_5
- cextern pw_8
- cextern pw_16
- cextern pw_17
- cextern pw_32
- ;-----------------------------------------------------------------------------
- ; void ff_pred16x16_vertical_8(uint8_t *src, int stride)
- ;-----------------------------------------------------------------------------
- INIT_MMX mmx
- cglobal pred16x16_vertical_8, 2,3
- sub r0, r1
- mov r2, 8
- movq mm0, [r0+0]
- movq mm1, [r0+8]
- .loop:
- movq [r0+r1*1+0], mm0
- movq [r0+r1*1+8], mm1
- movq [r0+r1*2+0], mm0
- movq [r0+r1*2+8], mm1
- lea r0, [r0+r1*2]
- dec r2
- jg .loop
- REP_RET
- INIT_XMM sse
- cglobal pred16x16_vertical_8, 2,3
- sub r0, r1
- mov r2, 4
- movaps xmm0, [r0]
- .loop:
- movaps [r0+r1*1], xmm0
- movaps [r0+r1*2], xmm0
- lea r0, [r0+r1*2]
- movaps [r0+r1*1], xmm0
- movaps [r0+r1*2], xmm0
- lea r0, [r0+r1*2]
- dec r2
- jg .loop
- REP_RET
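- ; Roughly, in C terms, the two routines above copy the row of 16 pixels
- ; directly above the block into every row (a sketch for orientation, not
- ; FFmpeg's C reference):
- ;   for (int y = 0; y < 16; y++)
- ;       memcpy(src + y * stride, src - stride, 16);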
- ;-----------------------------------------------------------------------------
- ; void ff_pred16x16_horizontal_8(uint8_t *src, int stride)
- ;-----------------------------------------------------------------------------
- %macro PRED16x16_H 0
- cglobal pred16x16_horizontal_8, 2,3
- mov r2, 8
- %if cpuflag(ssse3)
- mova m2, [pb_3]
- %endif
- .loop:
- movd m0, [r0+r1*0-4]
- movd m1, [r0+r1*1-4]
- %if cpuflag(ssse3)
- pshufb m0, m2
- pshufb m1, m2
- %else
- punpcklbw m0, m0
- punpcklbw m1, m1
- SPLATW m0, m0, 3
- SPLATW m1, m1, 3
- mova [r0+r1*0+8], m0
- mova [r0+r1*1+8], m1
- %endif
- mova [r0+r1*0], m0
- mova [r0+r1*1], m1
- lea r0, [r0+r1*2]
- dec r2
- jg .loop
- REP_RET
- %endmacro
- INIT_MMX mmx
- PRED16x16_H
- INIT_MMX mmxext
- PRED16x16_H
- INIT_XMM ssse3
- PRED16x16_H
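- ; In C terms, each row is filled with its left neighbour (a sketch, assuming
- ; uint8_t *src and int stride as in the prototype):
- ;   for (int y = 0; y < 16; y++)
- ;       memset(src + y * stride, src[y * stride - 1], 16);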
- ;-----------------------------------------------------------------------------
- ; void ff_pred16x16_dc_8(uint8_t *src, int stride)
- ;-----------------------------------------------------------------------------
- %macro PRED16x16_DC 0
- cglobal pred16x16_dc_8, 2,7
- mov r4, r0
- sub r0, r1
- pxor mm0, mm0
- pxor mm1, mm1
- psadbw mm0, [r0+0]
- psadbw mm1, [r0+8]
- dec r0
- movzx r5d, byte [r0+r1*1]
- paddw mm0, mm1
- movd r6d, mm0
- lea r0, [r0+r1*2]
- %rep 7
- movzx r2d, byte [r0+r1*0]
- movzx r3d, byte [r0+r1*1]
- add r5d, r2d
- add r6d, r3d
- lea r0, [r0+r1*2]
- %endrep
- movzx r2d, byte [r0+r1*0]
- add r5d, r6d
- lea r2d, [r2+r5+16]
- shr r2d, 5
- %if cpuflag(ssse3)
- pxor m1, m1
- %endif
- SPLATB_REG m0, r2, m1
- %if mmsize==8
- mov r3d, 8
- .loop:
- mova [r4+r1*0+0], m0
- mova [r4+r1*0+8], m0
- mova [r4+r1*1+0], m0
- mova [r4+r1*1+8], m0
- %else
- mov r3d, 4
- .loop:
- mova [r4+r1*0], m0
- mova [r4+r1*1], m0
- lea r4, [r4+r1*2]
- mova [r4+r1*0], m0
- mova [r4+r1*1], m0
- %endif
- lea r4, [r4+r1*2]
- dec r3d
- jg .loop
- REP_RET
- %endmacro
- INIT_MMX mmxext
- PRED16x16_DC
- INIT_XMM sse2
- PRED16x16_DC
- INIT_XMM ssse3
- PRED16x16_DC
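- ; What the DC routine above computes, sketched in C: the 16 top neighbours
- ; (summed with psadbw) and the 16 left neighbours (scalar loop) are added,
- ; rounded, and the result splatted over the block:
- ;   int dc = 16;
- ;   for (int i = 0; i < 16; i++)
- ;       dc += src[i - stride] + src[i * stride - 1];
- ;   dc >>= 5;
- ;   for (int y = 0; y < 16; y++)
- ;       memset(src + y * stride, dc, 16);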
- ;-----------------------------------------------------------------------------
- ; void ff_pred16x16_tm_vp8_8(uint8_t *src, int stride)
- ;-----------------------------------------------------------------------------
- %macro PRED16x16_TM 0
- cglobal pred16x16_tm_vp8_8, 2,5
- sub r0, r1
- pxor mm7, mm7
- movq mm0, [r0+0]
- movq mm2, [r0+8]
- movq mm1, mm0
- movq mm3, mm2
- punpcklbw mm0, mm7
- punpckhbw mm1, mm7
- punpcklbw mm2, mm7
- punpckhbw mm3, mm7
- movzx r3d, byte [r0-1]
- mov r4d, 16
- .loop:
- movzx r2d, byte [r0+r1-1]
- sub r2d, r3d
- movd mm4, r2d
- SPLATW mm4, mm4, 0
- movq mm5, mm4
- movq mm6, mm4
- movq mm7, mm4
- paddw mm4, mm0
- paddw mm5, mm1
- paddw mm6, mm2
- paddw mm7, mm3
- packuswb mm4, mm5
- packuswb mm6, mm7
- movq [r0+r1+0], mm4
- movq [r0+r1+8], mm6
- add r0, r1
- dec r4d
- jg .loop
- REP_RET
- %endmacro
- INIT_MMX mmx
- PRED16x16_TM
- INIT_MMX mmxext
- PRED16x16_TM
- INIT_XMM sse2
- cglobal pred16x16_tm_vp8_8, 2,6,6
- sub r0, r1
- pxor xmm2, xmm2
- movdqa xmm0, [r0]
- movdqa xmm1, xmm0
- punpcklbw xmm0, xmm2
- punpckhbw xmm1, xmm2
- movzx r4d, byte [r0-1]
- mov r5d, 8
- .loop:
- movzx r2d, byte [r0+r1*1-1]
- movzx r3d, byte [r0+r1*2-1]
- sub r2d, r4d
- sub r3d, r4d
- movd xmm2, r2d
- movd xmm4, r3d
- pshuflw xmm2, xmm2, 0
- pshuflw xmm4, xmm4, 0
- punpcklqdq xmm2, xmm2
- punpcklqdq xmm4, xmm4
- movdqa xmm3, xmm2
- movdqa xmm5, xmm4
- paddw xmm2, xmm0
- paddw xmm3, xmm1
- paddw xmm4, xmm0
- paddw xmm5, xmm1
- packuswb xmm2, xmm3
- packuswb xmm4, xmm5
- movdqa [r0+r1*1], xmm2
- movdqa [r0+r1*2], xmm4
- lea r0, [r0+r1*2]
- dec r5d
- jg .loop
- REP_RET
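- ; The VP8 TrueMotion predictor adds, per row, (left[y] - topleft) to the top
- ; row and saturates via packuswb; a rough scalar sketch (av_clip_uint8 is
- ; libavutil's clamp to 0..255):
- ;   int tl = src[-stride - 1];
- ;   for (int y = 0; y < 16; y++)
- ;       for (int x = 0; x < 16; x++)
- ;           src[y * stride + x] =
- ;               av_clip_uint8(src[x - stride] + src[y * stride - 1] - tl);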
- ;-----------------------------------------------------------------------------
- ; void ff_pred16x16_plane_*_8(uint8_t *src, int stride)
- ;-----------------------------------------------------------------------------
- %macro H264_PRED16x16_PLANE 1
- cglobal pred16x16_plane_%1_8, 2,9,7
- mov r2, r1 ; +stride
- neg r1 ; -stride
- movh m0, [r0+r1 -1]
- %if mmsize == 8
- pxor m4, m4
- movh m1, [r0+r1 +3 ]
- movh m2, [r0+r1 +8 ]
- movh m3, [r0+r1 +12]
- punpcklbw m0, m4
- punpcklbw m1, m4
- punpcklbw m2, m4
- punpcklbw m3, m4
- pmullw m0, [pw_m8tom1 ]
- pmullw m1, [pw_m8tom1+8]
- pmullw m2, [pw_1to8 ]
- pmullw m3, [pw_1to8 +8]
- paddw m0, m2
- paddw m1, m3
- %else ; mmsize == 16
- %if cpuflag(ssse3)
- movhps m0, [r0+r1 +8]
- pmaddubsw m0, [plane_shuf] ; H coefficients
- %else ; sse2
- pxor m2, m2
- movh m1, [r0+r1 +8]
- punpcklbw m0, m2
- punpcklbw m1, m2
- pmullw m0, [pw_m8tom1]
- pmullw m1, [pw_1to8]
- paddw m0, m1
- %endif
- movhlps m1, m0
- %endif
- paddw m0, m1
- %if cpuflag(mmxext)
- PSHUFLW m1, m0, 0xE
- %elif cpuflag(mmx)
- mova m1, m0
- psrlq m1, 32
- %endif
- paddw m0, m1
- %if cpuflag(mmxext)
- PSHUFLW m1, m0, 0x1
- %elif cpuflag(mmx)
- mova m1, m0
- psrlq m1, 16
- %endif
- paddw m0, m1 ; sum of H coefficients
- lea r4, [r0+r2*8-1]
- lea r3, [r0+r2*4-1]
- add r4, r2
- %if ARCH_X86_64
- %define e_reg r8
- %else
- %define e_reg r0
- %endif
- movzx e_reg, byte [r3+r2*2 ]
- movzx r5, byte [r4+r1 ]
- sub r5, e_reg
- movzx e_reg, byte [r3+r2 ]
- movzx r6, byte [r4 ]
- sub r6, e_reg
- lea r5, [r5+r6*2]
- movzx e_reg, byte [r3+r1 ]
- movzx r6, byte [r4+r2*2 ]
- sub r6, e_reg
- lea r5, [r5+r6*4]
- movzx e_reg, byte [r3 ]
- %if ARCH_X86_64
- movzx r7, byte [r4+r2 ]
- sub r7, e_reg
- %else
- movzx r6, byte [r4+r2 ]
- sub r6, e_reg
- lea r5, [r5+r6*4]
- sub r5, r6
- %endif
- lea e_reg, [r3+r1*4]
- lea r3, [r4+r2*4]
- movzx r4, byte [e_reg+r2 ]
- movzx r6, byte [r3 ]
- sub r6, r4
- %if ARCH_X86_64
- lea r6, [r7+r6*2]
- lea r5, [r5+r6*2]
- add r5, r6
- %else
- lea r5, [r5+r6*4]
- lea r5, [r5+r6*2]
- %endif
- movzx r4, byte [e_reg ]
- %if ARCH_X86_64
- movzx r7, byte [r3 +r2 ]
- sub r7, r4
- sub r5, r7
- %else
- movzx r6, byte [r3 +r2 ]
- sub r6, r4
- lea r5, [r5+r6*8]
- sub r5, r6
- %endif
- movzx r4, byte [e_reg+r1 ]
- movzx r6, byte [r3 +r2*2]
- sub r6, r4
- %if ARCH_X86_64
- add r6, r7
- %endif
- lea r5, [r5+r6*8]
- movzx r4, byte [e_reg+r2*2]
- movzx r6, byte [r3 +r1 ]
- sub r6, r4
- lea r5, [r5+r6*4]
- add r5, r6 ; sum of V coefficients
- %if ARCH_X86_64 == 0
- mov r0, r0m
- %endif
- %ifidn %1, h264
- lea r5, [r5*5+32]
- sar r5, 6
- %elifidn %1, rv40
- lea r5, [r5*5]
- sar r5, 6
- %elifidn %1, svq3
- test r5, r5
- lea r6, [r5+3]
- cmovs r5, r6
- sar r5, 2 ; V/4
- lea r5, [r5*5] ; 5*(V/4)
- test r5, r5
- lea r6, [r5+15]
- cmovs r5, r6
- sar r5, 4 ; (5*(V/4))/16
- %endif
- movzx r4, byte [r0+r1 +15]
- movzx r3, byte [r3+r2*2 ]
- lea r3, [r3+r4+1]
- shl r3, 4
- movd r1d, m0
- movsx r1d, r1w
- %ifnidn %1, svq3
- %ifidn %1, h264
- lea r1d, [r1d*5+32]
- %else ; rv40
- lea r1d, [r1d*5]
- %endif
- sar r1d, 6
- %else ; svq3
- test r1d, r1d
- lea r4d, [r1d+3]
- cmovs r1d, r4d
- sar r1d, 2 ; H/4
- lea r1d, [r1d*5] ; 5*(H/4)
- test r1d, r1d
- lea r4d, [r1d+15]
- cmovs r1d, r4d
- sar r1d, 4 ; (5*(H/4))/16
- %endif
- movd m0, r1d
- add r1d, r5d
- add r3d, r1d
- shl r1d, 3
- sub r3d, r1d ; a
- movd m1, r5d
- movd m3, r3d
- SPLATW m0, m0, 0 ; H
- SPLATW m1, m1, 0 ; V
- SPLATW m3, m3, 0 ; a
- %ifidn %1, svq3
- SWAP 0, 1
- %endif
- mova m2, m0
- %if mmsize == 8
- mova m5, m0
- %endif
- pmullw m0, [pw_0to7] ; 0*H, 1*H, ..., 7*H (words)
- %if mmsize == 16
- psllw m2, 3
- %else
- psllw m5, 3
- psllw m2, 2
- mova m6, m5
- paddw m6, m2
- %endif
- paddw m0, m3 ; a + {0,1,2,3,4,5,6,7}*H
- paddw m2, m0 ; a + {8,9,10,11,12,13,14,15}*H
- %if mmsize == 8
- paddw m5, m0 ; a + {8,9,10,11}*H
- paddw m6, m0 ; a + {12,13,14,15}*H
- %endif
- mov r4, 8
- .loop:
- mova m3, m0 ; b[0..7]
- mova m4, m2 ; b[8..15]
- psraw m3, 5
- psraw m4, 5
- packuswb m3, m4
- mova [r0], m3
- %if mmsize == 8
- mova m3, m5 ; b[8..11]
- mova m4, m6 ; b[12..15]
- psraw m3, 5
- psraw m4, 5
- packuswb m3, m4
- mova [r0+8], m3
- %endif
- paddw m0, m1
- paddw m2, m1
- %if mmsize == 8
- paddw m5, m1
- paddw m6, m1
- %endif
- mova m3, m0 ; b[0..7]
- mova m4, m2 ; b[8..15]
- psraw m3, 5
- psraw m4, 5
- packuswb m3, m4
- mova [r0+r2], m3
- %if mmsize == 8
- mova m3, m5 ; b[8..11]
- mova m4, m6 ; b[12..15]
- psraw m3, 5
- psraw m4, 5
- packuswb m3, m4
- mova [r0+r2+8], m3
- %endif
- paddw m0, m1
- paddw m2, m1
- %if mmsize == 8
- paddw m5, m1
- paddw m6, m1
- %endif
- lea r0, [r0+r2*2]
- dec r4
- jg .loop
- REP_RET
- %endmacro
- INIT_MMX mmx
- H264_PRED16x16_PLANE h264
- H264_PRED16x16_PLANE rv40
- H264_PRED16x16_PLANE svq3
- INIT_MMX mmxext
- H264_PRED16x16_PLANE h264
- H264_PRED16x16_PLANE rv40
- H264_PRED16x16_PLANE svq3
- INIT_XMM sse2
- H264_PRED16x16_PLANE h264
- H264_PRED16x16_PLANE rv40
- H264_PRED16x16_PLANE svq3
- INIT_XMM ssse3
- H264_PRED16x16_PLANE h264
- H264_PRED16x16_PLANE rv40
- H264_PRED16x16_PLANE svq3
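- ; For reference, the h264 flavour of the plane predictor above follows the
- ; standard formulation (rv40 and svq3 only change the rounding of b and c);
- ; a rough sketch, with top[i] = src[i - stride], left[i] = src[i*stride - 1]
- ; and top[-1]/left[-1] being the top-left corner:
- ;   H = sum over i=1..8 of i * (top[7 + i] - top[7 - i]);
- ;   V = sum over i=1..8 of i * (left[7 + i] - left[7 - i]);
- ;   b = (5 * H + 32) >> 6;
- ;   c = (5 * V + 32) >> 6;
- ;   a = 16 * (top[15] + left[15] + 1) - 7 * (b + c);
- ;   pred[y][x] = av_clip_uint8((a + b * x + c * y) >> 5);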
- ;-----------------------------------------------------------------------------
- ; void ff_pred8x8_plane_8(uint8_t *src, int stride)
- ;-----------------------------------------------------------------------------
- %macro H264_PRED8x8_PLANE 0
- cglobal pred8x8_plane_8, 2,9,7
- mov r2, r1 ; +stride
- neg r1 ; -stride
- movd m0, [r0+r1 -1]
- %if mmsize == 8
- pxor m2, m2
- movh m1, [r0+r1 +4 ]
- punpcklbw m0, m2
- punpcklbw m1, m2
- pmullw m0, [pw_m4to4]
- pmullw m1, [pw_m4to4+8]
- %else ; mmsize == 16
- %if cpuflag(ssse3)
- movhps m0, [r0+r1 +4] ; this reads 4 bytes more than necessary
- pmaddubsw m0, [plane8_shuf] ; H coefficients
- %else ; sse2
- pxor m2, m2
- movd m1, [r0+r1 +4]
- punpckldq m0, m1
- punpcklbw m0, m2
- pmullw m0, [pw_m4to4]
- %endif
- movhlps m1, m0
- %endif
- paddw m0, m1
- %if notcpuflag(ssse3)
- %if cpuflag(mmxext)
- PSHUFLW m1, m0, 0xE
- %elif cpuflag(mmx)
- mova m1, m0
- psrlq m1, 32
- %endif
- paddw m0, m1
- %endif ; !ssse3
- %if cpuflag(mmxext)
- PSHUFLW m1, m0, 0x1
- %elif cpuflag(mmx)
- mova m1, m0
- psrlq m1, 16
- %endif
- paddw m0, m1 ; sum of H coefficients
- lea r4, [r0+r2*4-1]
- lea r3, [r0 -1]
- add r4, r2
- %if ARCH_X86_64
- %define e_reg r8
- %else
- %define e_reg r0
- %endif
- movzx e_reg, byte [r3+r2*2 ]
- movzx r5, byte [r4+r1 ]
- sub r5, e_reg
- movzx e_reg, byte [r3 ]
- %if ARCH_X86_64
- movzx r7, byte [r4+r2 ]
- sub r7, e_reg
- sub r5, r7
- %else
- movzx r6, byte [r4+r2 ]
- sub r6, e_reg
- lea r5, [r5+r6*4]
- sub r5, r6
- %endif
- movzx e_reg, byte [r3+r1 ]
- movzx r6, byte [r4+r2*2 ]
- sub r6, e_reg
- %if ARCH_X86_64
- add r6, r7
- %endif
- lea r5, [r5+r6*4]
- movzx e_reg, byte [r3+r2 ]
- movzx r6, byte [r4 ]
- sub r6, e_reg
- lea r6, [r5+r6*2]
- lea r5, [r6*9+16]
- lea r5, [r5+r6*8]
- sar r5, 5
- %if ARCH_X86_64 == 0
- mov r0, r0m
- %endif
- movzx r3, byte [r4+r2*2 ]
- movzx r4, byte [r0+r1 +7]
- lea r3, [r3+r4+1]
- shl r3, 4
- movd r1d, m0
- movsx r1d, r1w
- imul r1d, 17
- add r1d, 16
- sar r1d, 5
- movd m0, r1d
- add r1d, r5d
- sub r3d, r1d
- add r1d, r1d
- sub r3d, r1d ; a
- movd m1, r5d
- movd m3, r3d
- SPLATW m0, m0, 0 ; H
- SPLATW m1, m1, 0 ; V
- SPLATW m3, m3, 0 ; a
- %if mmsize == 8
- mova m2, m0
- %endif
- pmullw m0, [pw_0to7] ; 0*H, 1*H, ..., 7*H (words)
- paddw m0, m3 ; a + {0,1,2,3,4,5,6,7}*H
- %if mmsize == 8
- psllw m2, 2
- paddw m2, m0 ; a + {4,5,6,7}*H
- %endif
- mov r4, 4
- ALIGN 16
- .loop:
- %if mmsize == 16
- mova m3, m0 ; b[0..7]
- paddw m0, m1
- psraw m3, 5
- mova m4, m0 ; V+b[0..7]
- paddw m0, m1
- psraw m4, 5
- packuswb m3, m4
- movh [r0], m3
- movhps [r0+r2], m3
- %else ; mmsize == 8
- mova m3, m0 ; b[0..3]
- mova m4, m2 ; b[4..7]
- paddw m0, m1
- paddw m2, m1
- psraw m3, 5
- psraw m4, 5
- mova m5, m0 ; V+b[0..3]
- mova m6, m2 ; V+b[4..7]
- paddw m0, m1
- paddw m2, m1
- psraw m5, 5
- psraw m6, 5
- packuswb m3, m4
- packuswb m5, m6
- mova [r0], m3
- mova [r0+r2], m5
- %endif
- lea r0, [r0+r2*2]
- dec r4
- jg .loop
- REP_RET
- %endmacro
- INIT_MMX mmx
- H264_PRED8x8_PLANE
- INIT_MMX mmxext
- H264_PRED8x8_PLANE
- INIT_XMM sse2
- H264_PRED8x8_PLANE
- INIT_XMM ssse3
- H264_PRED8x8_PLANE
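- ; The 8x8 plane predictor above is the same construction at half size, with
- ; 17/16 rounding instead of 5/32 (rough sketch, top/left as before):
- ;   H = sum over i=1..4 of i * (top[3 + i] - top[3 - i]);
- ;   V = sum over i=1..4 of i * (left[3 + i] - left[3 - i]);
- ;   b = (17 * H + 16) >> 5;
- ;   c = (17 * V + 16) >> 5;
- ;   a = 16 * (top[7] + left[7] + 1) - 3 * (b + c);
- ;   pred[y][x] = av_clip_uint8((a + b * x + c * y) >> 5);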
- ;-----------------------------------------------------------------------------
- ; void ff_pred8x8_vertical_8(uint8_t *src, int stride)
- ;-----------------------------------------------------------------------------
- INIT_MMX mmx
- cglobal pred8x8_vertical_8, 2,2
- sub r0, r1
- movq mm0, [r0]
- %rep 3
- movq [r0+r1*1], mm0
- movq [r0+r1*2], mm0
- lea r0, [r0+r1*2]
- %endrep
- movq [r0+r1*1], mm0
- movq [r0+r1*2], mm0
- RET
- ;-----------------------------------------------------------------------------
- ; void ff_pred8x8_horizontal_8(uint8_t *src, int stride)
- ;-----------------------------------------------------------------------------
- %macro PRED8x8_H 0
- cglobal pred8x8_horizontal_8, 2,3
- mov r2, 4
- %if cpuflag(ssse3)
- mova m2, [pb_3]
- %endif
- .loop:
- SPLATB_LOAD m0, r0+r1*0-1, m2
- SPLATB_LOAD m1, r0+r1*1-1, m2
- mova [r0+r1*0], m0
- mova [r0+r1*1], m1
- lea r0, [r0+r1*2]
- dec r2
- jg .loop
- REP_RET
- %endmacro
- INIT_MMX mmx
- PRED8x8_H
- INIT_MMX mmxext
- PRED8x8_H
- INIT_MMX ssse3
- PRED8x8_H
- ;-----------------------------------------------------------------------------
- ; void ff_pred8x8_top_dc_8_mmxext(uint8_t *src, int stride)
- ;-----------------------------------------------------------------------------
- INIT_MMX mmxext
- cglobal pred8x8_top_dc_8, 2,5
- sub r0, r1
- movq mm0, [r0]
- pxor mm1, mm1
- pxor mm2, mm2
- lea r2, [r0+r1*2]
- punpckhbw mm1, mm0
- punpcklbw mm0, mm2
- psadbw mm1, mm2 ; s1
- lea r3, [r2+r1*2]
- psadbw mm0, mm2 ; s0
- psrlw mm1, 1
- psrlw mm0, 1
- pavgw mm1, mm2
- lea r4, [r3+r1*2]
- pavgw mm0, mm2
- pshufw mm1, mm1, 0
- pshufw mm0, mm0, 0 ; dc0 (w)
- packuswb mm0, mm1 ; dc0,dc1 (b)
- movq [r0+r1*1], mm0
- movq [r0+r1*2], mm0
- lea r0, [r3+r1*2]
- movq [r2+r1*1], mm0
- movq [r2+r1*2], mm0
- movq [r3+r1*1], mm0
- movq [r3+r1*2], mm0
- movq [r0+r1*1], mm0
- movq [r0+r1*2], mm0
- RET
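- ; top_dc above computes one DC per 4-pixel half of the top row, roughly
- ;   dc_left  = (t0 + t1 + t2 + t3 + 2) >> 2;
- ;   dc_right = (t4 + t5 + t6 + t7 + 2) >> 2;
- ; and fills the left/right halves of all 8 rows with them.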
- ;-----------------------------------------------------------------------------
- ; void ff_pred8x8_dc_8_mmxext(uint8_t *src, int stride)
- ;-----------------------------------------------------------------------------
- INIT_MMX mmxext
- cglobal pred8x8_dc_8, 2,5
- sub r0, r1
- pxor m7, m7
- movd m0, [r0+0]
- movd m1, [r0+4]
- psadbw m0, m7 ; s0
- mov r4, r0
- psadbw m1, m7 ; s1
- movzx r2d, byte [r0+r1*1-1]
- movzx r3d, byte [r0+r1*2-1]
- lea r0, [r0+r1*2]
- add r2d, r3d
- movzx r3d, byte [r0+r1*1-1]
- add r2d, r3d
- movzx r3d, byte [r0+r1*2-1]
- add r2d, r3d
- lea r0, [r0+r1*2]
- movd m2, r2d ; s2
- movzx r2d, byte [r0+r1*1-1]
- movzx r3d, byte [r0+r1*2-1]
- lea r0, [r0+r1*2]
- add r2d, r3d
- movzx r3d, byte [r0+r1*1-1]
- add r2d, r3d
- movzx r3d, byte [r0+r1*2-1]
- add r2d, r3d
- movd m3, r2d ; s3
- punpcklwd m0, m1
- mov r0, r4
- punpcklwd m2, m3
- punpckldq m0, m2 ; s0, s1, s2, s3
- pshufw m3, m0, 11110110b ; s2, s1, s3, s3
- lea r2, [r0+r1*2]
- pshufw m0, m0, 01110100b ; s0, s1, s3, s1
- paddw m0, m3
- lea r3, [r2+r1*2]
- psrlw m0, 2
- pavgw m0, m7 ; s0+s2, s1, s3, s1+s3
- lea r4, [r3+r1*2]
- packuswb m0, m0
- punpcklbw m0, m0
- movq m1, m0
- punpcklbw m0, m0
- punpckhbw m1, m1
- movq [r0+r1*1], m0
- movq [r0+r1*2], m0
- movq [r2+r1*1], m0
- movq [r2+r1*2], m0
- movq [r3+r1*1], m1
- movq [r3+r1*2], m1
- movq [r4+r1*1], m1
- movq [r4+r1*2], m1
- RET
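- ; The four 4x4 quadrants above receive the usual H.264 chroma DC values,
- ; sketched with s0/s1 = sums of the left/right top halves and s2/s3 = sums of
- ; the upper/lower left halves:
- ;   dc[0] = (s0 + s2 + 4) >> 3;   ; top-left
- ;   dc[1] = (s1 + 2) >> 2;        ; top-right
- ;   dc[2] = (s3 + 2) >> 2;        ; bottom-left
- ;   dc[3] = (s1 + s3 + 4) >> 3;   ; bottom-right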
- ;-----------------------------------------------------------------------------
- ; void ff_pred8x8_dc_rv40_8(uint8_t *src, int stride)
- ;-----------------------------------------------------------------------------
- INIT_MMX mmxext
- cglobal pred8x8_dc_rv40_8, 2,7
- mov r4, r0
- sub r0, r1
- pxor mm0, mm0
- psadbw mm0, [r0]
- dec r0
- movzx r5d, byte [r0+r1*1]
- movd r6d, mm0
- lea r0, [r0+r1*2]
- %rep 3
- movzx r2d, byte [r0+r1*0]
- movzx r3d, byte [r0+r1*1]
- add r5d, r2d
- add r6d, r3d
- lea r0, [r0+r1*2]
- %endrep
- movzx r2d, byte [r0+r1*0]
- add r5d, r6d
- lea r2d, [r2+r5+8]
- shr r2d, 4
- movd mm0, r2d
- punpcklbw mm0, mm0
- pshufw mm0, mm0, 0
- mov r3d, 4
- .loop:
- movq [r4+r1*0], mm0
- movq [r4+r1*1], mm0
- lea r4, [r4+r1*2]
- dec r3d
- jg .loop
- REP_RET
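- ; The RV40 variant uses a single DC for the whole 8x8 block; in C terms:
- ;   int dc = 8;
- ;   for (int i = 0; i < 8; i++)
- ;       dc += src[i - stride] + src[i * stride - 1];
- ;   dc >>= 4;
- ;   for (int y = 0; y < 8; y++)
- ;       memset(src + y * stride, dc, 8);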
- ;-----------------------------------------------------------------------------
- ; void ff_pred8x8_tm_vp8_8(uint8_t *src, int stride)
- ;-----------------------------------------------------------------------------
- %macro PRED8x8_TM 0
- cglobal pred8x8_tm_vp8_8, 2,6
- sub r0, r1
- pxor mm7, mm7
- movq mm0, [r0]
- movq mm1, mm0
- punpcklbw mm0, mm7
- punpckhbw mm1, mm7
- movzx r4d, byte [r0-1]
- mov r5d, 4
- .loop:
- movzx r2d, byte [r0+r1*1-1]
- movzx r3d, byte [r0+r1*2-1]
- sub r2d, r4d
- sub r3d, r4d
- movd mm2, r2d
- movd mm4, r3d
- SPLATW mm2, mm2, 0
- SPLATW mm4, mm4, 0
- movq mm3, mm2
- movq mm5, mm4
- paddw mm2, mm0
- paddw mm3, mm1
- paddw mm4, mm0
- paddw mm5, mm1
- packuswb mm2, mm3
- packuswb mm4, mm5
- movq [r0+r1*1], mm2
- movq [r0+r1*2], mm4
- lea r0, [r0+r1*2]
- dec r5d
- jg .loop
- REP_RET
- %endmacro
- INIT_MMX mmx
- PRED8x8_TM
- INIT_MMX mmxext
- PRED8x8_TM
- INIT_XMM sse2
- cglobal pred8x8_tm_vp8_8, 2,6,4
- sub r0, r1
- pxor xmm1, xmm1
- movq xmm0, [r0]
- punpcklbw xmm0, xmm1
- movzx r4d, byte [r0-1]
- mov r5d, 4
- .loop:
- movzx r2d, byte [r0+r1*1-1]
- movzx r3d, byte [r0+r1*2-1]
- sub r2d, r4d
- sub r3d, r4d
- movd xmm2, r2d
- movd xmm3, r3d
- pshuflw xmm2, xmm2, 0
- pshuflw xmm3, xmm3, 0
- punpcklqdq xmm2, xmm2
- punpcklqdq xmm3, xmm3
- paddw xmm2, xmm0
- paddw xmm3, xmm0
- packuswb xmm2, xmm3
- movq [r0+r1*1], xmm2
- movhps [r0+r1*2], xmm2
- lea r0, [r0+r1*2]
- dec r5d
- jg .loop
- REP_RET
- INIT_XMM ssse3
- cglobal pred8x8_tm_vp8_8, 2,3,6
- sub r0, r1
- movdqa xmm4, [tm_shuf]
- pxor xmm1, xmm1
- movq xmm0, [r0]
- punpcklbw xmm0, xmm1
- movd xmm5, [r0-4]
- pshufb xmm5, xmm4
- mov r2d, 4
- .loop:
- movd xmm2, [r0+r1*1-4]
- movd xmm3, [r0+r1*2-4]
- pshufb xmm2, xmm4
- pshufb xmm3, xmm4
- psubw xmm2, xmm5
- psubw xmm3, xmm5
- paddw xmm2, xmm0
- paddw xmm3, xmm0
- packuswb xmm2, xmm3
- movq [r0+r1*1], xmm2
- movhps [r0+r1*2], xmm2
- lea r0, [r0+r1*2]
- dec r2d
- jg .loop
- REP_RET
- ; dest, left, right, src, tmp
- ; output: %1 = (t[n-1] + t[n]*2 + t[n+1] + 2) >> 2
- %macro PRED4x4_LOWPASS 5
- mova %5, %2
- pavgb %2, %3
- pxor %3, %5
- mova %1, %4
- pand %3, [pb_1]
- psubusb %2, %3
- pavgb %1, %2
- %endmacro
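- ; The macro above computes (l + 2*c + r + 2) >> 2 per byte without widening
- ; to words: pavgb(l, r) rounds up, subtracting (l ^ r) & 1 turns it into
- ; (l + r) >> 1, and a second pavgb against c restores the rounding, giving
- ; exactly the 3-tap value for all byte inputs. Scalar equivalent:
- ;   uint8_t lowpass(uint8_t l, uint8_t c, uint8_t r) {
- ;       return (l + 2 * c + r + 2) >> 2;
- ;   }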
- ;-----------------------------------------------------------------------------
- ; void ff_pred8x8l_top_dc_8(uint8_t *src, int has_topleft, int has_topright,
- ; int stride)
- ;-----------------------------------------------------------------------------
- %macro PRED8x8L_TOP_DC 0
- cglobal pred8x8l_top_dc_8, 4,4
- sub r0, r3
- pxor mm7, mm7
- movq mm0, [r0-8]
- movq mm3, [r0]
- movq mm1, [r0+8]
- movq mm2, mm3
- movq mm4, mm3
- PALIGNR mm2, mm0, 7, mm0
- PALIGNR mm1, mm4, 1, mm4
- test r1, r1 ; top_left
- jz .fix_lt_2
- test r2, r2 ; top_right
- jz .fix_tr_1
- jmp .body
- .fix_lt_2:
- movq mm5, mm3
- pxor mm5, mm2
- psllq mm5, 56
- psrlq mm5, 56
- pxor mm2, mm5
- test r2, r2 ; top_right
- jnz .body
- .fix_tr_1:
- movq mm5, mm3
- pxor mm5, mm1
- psrlq mm5, 56
- psllq mm5, 56
- pxor mm1, mm5
- .body:
- PRED4x4_LOWPASS mm0, mm2, mm1, mm3, mm5
- psadbw mm7, mm0
- paddw mm7, [pw_4]
- psrlw mm7, 3
- pshufw mm7, mm7, 0
- packuswb mm7, mm7
- %rep 3
- movq [r0+r3*1], mm7
- movq [r0+r3*2], mm7
- lea r0, [r0+r3*2]
- %endrep
- movq [r0+r3*1], mm7
- movq [r0+r3*2], mm7
- RET
- %endmacro
- INIT_MMX mmxext
- PRED8x8L_TOP_DC
- INIT_MMX ssse3
- PRED8x8L_TOP_DC
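- ; The pred8x8l DC variants first run the 3-tap lowpass over the edge samples,
- ; patching the end pixels when top-left/top-right are unavailable (the
- ; .fix_lt_*/.fix_tr_* paths); top_dc then roughly computes
- ;   dc = (sum of the 8 filtered top pixels + 4) >> 3;
- ; and fills the block with it.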
- ;-----------------------------------------------------------------------------
- ; void ff_pred8x8l_dc_8(uint8_t *src, int has_topleft, int has_topright,
- ; int stride)
- ;-----------------------------------------------------------------------------
- %macro PRED8x8L_DC 0
- cglobal pred8x8l_dc_8, 4,5
- sub r0, r3
- lea r4, [r0+r3*2]
- movq mm0, [r0+r3*1-8]
- punpckhbw mm0, [r0+r3*0-8]
- movq mm1, [r4+r3*1-8]
- punpckhbw mm1, [r0+r3*2-8]
- mov r4, r0
- punpckhwd mm1, mm0
- lea r0, [r0+r3*4]
- movq mm2, [r0+r3*1-8]
- punpckhbw mm2, [r0+r3*0-8]
- lea r0, [r0+r3*2]
- movq mm3, [r0+r3*1-8]
- punpckhbw mm3, [r0+r3*0-8]
- punpckhwd mm3, mm2
- punpckhdq mm3, mm1
- lea r0, [r0+r3*2]
- movq mm0, [r0+r3*0-8]
- movq mm1, [r4]
- mov r0, r4
- movq mm4, mm3
- movq mm2, mm3
- PALIGNR mm4, mm0, 7, mm0
- PALIGNR mm1, mm2, 1, mm2
- test r1, r1
- jnz .do_left
- .fix_lt_1:
- movq mm5, mm3
- pxor mm5, mm4
- psrlq mm5, 56
- psllq mm5, 48
- pxor mm1, mm5
- jmp .do_left
- .fix_lt_2:
- movq mm5, mm3
- pxor mm5, mm2
- psllq mm5, 56
- psrlq mm5, 56
- pxor mm2, mm5
- test r2, r2
- jnz .body
- .fix_tr_1:
- movq mm5, mm3
- pxor mm5, mm1
- psrlq mm5, 56
- psllq mm5, 56
- pxor mm1, mm5
- jmp .body
- .do_left:
- movq mm0, mm4
- PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
- movq mm4, mm0
- movq mm7, mm2
- PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5
- psllq mm1, 56
- PALIGNR mm7, mm1, 7, mm3
- movq mm0, [r0-8]
- movq mm3, [r0]
- movq mm1, [r0+8]
- movq mm2, mm3
- movq mm4, mm3
- PALIGNR mm2, mm0, 7, mm0
- PALIGNR mm1, mm4, 1, mm4
- test r1, r1
- jz .fix_lt_2
- test r2, r2
- jz .fix_tr_1
- .body:
- lea r1, [r0+r3*2]
- PRED4x4_LOWPASS mm6, mm2, mm1, mm3, mm5
- pxor mm0, mm0
- pxor mm1, mm1
- lea r2, [r1+r3*2]
- psadbw mm0, mm7
- psadbw mm1, mm6
- paddw mm0, [pw_8]
- paddw mm0, mm1
- lea r4, [r2+r3*2]
- psrlw mm0, 4
- pshufw mm0, mm0, 0
- packuswb mm0, mm0
- movq [r0+r3*1], mm0
- movq [r0+r3*2], mm0
- movq [r1+r3*1], mm0
- movq [r1+r3*2], mm0
- movq [r2+r3*1], mm0
- movq [r2+r3*2], mm0
- movq [r4+r3*1], mm0
- movq [r4+r3*2], mm0
- RET
- %endmacro
- INIT_MMX mmxext
- PRED8x8L_DC
- INIT_MMX ssse3
- PRED8x8L_DC
- ;-----------------------------------------------------------------------------
- ; void ff_pred8x8l_horizontal_8(uint8_t *src, int has_topleft,
- ; int has_topright, int stride)
- ;-----------------------------------------------------------------------------
- %macro PRED8x8L_HORIZONTAL 0
- cglobal pred8x8l_horizontal_8, 4,4
- sub r0, r3
- lea r2, [r0+r3*2]
- movq mm0, [r0+r3*1-8]
- test r1, r1
- lea r1, [r0+r3]
- cmovnz r1, r0
- punpckhbw mm0, [r1+r3*0-8]
- movq mm1, [r2+r3*1-8]
- punpckhbw mm1, [r0+r3*2-8]
- mov r2, r0
- punpckhwd mm1, mm0
- lea r0, [r0+r3*4]
- movq mm2, [r0+r3*1-8]
- punpckhbw mm2, [r0+r3*0-8]
- lea r0, [r0+r3*2]
- movq mm3, [r0+r3*1-8]
- punpckhbw mm3, [r0+r3*0-8]
- punpckhwd mm3, mm2
- punpckhdq mm3, mm1
- lea r0, [r0+r3*2]
- movq mm0, [r0+r3*0-8]
- movq mm1, [r1+r3*0-8]
- mov r0, r2
- movq mm4, mm3
- movq mm2, mm3
- PALIGNR mm4, mm0, 7, mm0
- PALIGNR mm1, mm2, 1, mm2
- movq mm0, mm4
- PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
- movq mm4, mm0
- movq mm7, mm2
- PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5
- psllq mm1, 56
- PALIGNR mm7, mm1, 7, mm3
- movq mm3, mm7
- lea r1, [r0+r3*2]
- movq mm7, mm3
- punpckhbw mm3, mm3
- punpcklbw mm7, mm7
- pshufw mm0, mm3, 0xff
- pshufw mm1, mm3, 0xaa
- lea r2, [r1+r3*2]
- pshufw mm2, mm3, 0x55
- pshufw mm3, mm3, 0x00
- pshufw mm4, mm7, 0xff
- pshufw mm5, mm7, 0xaa
- pshufw mm6, mm7, 0x55
- pshufw mm7, mm7, 0x00
- movq [r0+r3*1], mm0
- movq [r0+r3*2], mm1
- movq [r1+r3*1], mm2
- movq [r1+r3*2], mm3
- movq [r2+r3*1], mm4
- movq [r2+r3*2], mm5
- lea r0, [r2+r3*2]
- movq [r0+r3*1], mm6
- movq [r0+r3*2], mm7
- RET
- %endmacro
- INIT_MMX mmxext
- PRED8x8L_HORIZONTAL
- INIT_MMX ssse3
- PRED8x8L_HORIZONTAL
- ;-----------------------------------------------------------------------------
- ; void ff_pred8x8l_vertical_8(uint8_t *src, int has_topleft, int has_topright,
- ; int stride)
- ;-----------------------------------------------------------------------------
- %macro PRED8x8L_VERTICAL 0
- cglobal pred8x8l_vertical_8, 4,4
- sub r0, r3
- movq mm0, [r0-8]
- movq mm3, [r0]
- movq mm1, [r0+8]
- movq mm2, mm3
- movq mm4, mm3
- PALIGNR mm2, mm0, 7, mm0
- PALIGNR mm1, mm4, 1, mm4
- test r1, r1 ; top_left
- jz .fix_lt_2
- test r2, r2 ; top_right
- jz .fix_tr_1
- jmp .body
- .fix_lt_2:
- movq mm5, mm3
- pxor mm5, mm2
- psllq mm5, 56
- psrlq mm5, 56
- pxor mm2, mm5
- test r2, r2 ; top_right
- jnz .body
- .fix_tr_1:
- movq mm5, mm3
- pxor mm5, mm1
- psrlq mm5, 56
- psllq mm5, 56
- pxor mm1, mm5
- .body:
- PRED4x4_LOWPASS mm0, mm2, mm1, mm3, mm5
- %rep 3
- movq [r0+r3*1], mm0
- movq [r0+r3*2], mm0
- lea r0, [r0+r3*2]
- %endrep
- movq [r0+r3*1], mm0
- movq [r0+r3*2], mm0
- RET
- %endmacro
- INIT_MMX mmxext
- PRED8x8L_VERTICAL
- INIT_MMX ssse3
- PRED8x8L_VERTICAL
- ;-----------------------------------------------------------------------------
- ; void ff_pred8x8l_down_left_8(uint8_t *src, int has_topleft,
- ; int has_topright, int stride)
- ;-----------------------------------------------------------------------------
- INIT_MMX mmxext
- cglobal pred8x8l_down_left_8, 4,5
- sub r0, r3
- movq mm0, [r0-8]
- movq mm3, [r0]
- movq mm1, [r0+8]
- movq mm2, mm3
- movq mm4, mm3
- PALIGNR mm2, mm0, 7, mm0
- PALIGNR mm1, mm4, 1, mm4
- test r1, r1
- jz .fix_lt_2
- test r2, r2
- jz .fix_tr_1
- jmp .do_top
- .fix_lt_2:
- movq mm5, mm3
- pxor mm5, mm2
- psllq mm5, 56
- psrlq mm5, 56
- pxor mm2, mm5
- test r2, r2
- jnz .do_top
- .fix_tr_1:
- movq mm5, mm3
- pxor mm5, mm1
- psrlq mm5, 56
- psllq mm5, 56
- pxor mm1, mm5
- jmp .do_top
- .fix_tr_2:
- punpckhbw mm3, mm3
- pshufw mm1, mm3, 0xFF
- jmp .do_topright
- .do_top:
- PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5
- movq mm7, mm4
- test r2, r2
- jz .fix_tr_2
- movq mm0, [r0+8]
- movq mm5, mm0
- movq mm2, mm0
- movq mm4, mm0
- psrlq mm5, 56
- PALIGNR mm2, mm3, 7, mm3
- PALIGNR mm5, mm4, 1, mm4
- PRED4x4_LOWPASS mm1, mm2, mm5, mm0, mm4
- .do_topright:
- lea r1, [r0+r3*2]
- movq mm6, mm1
- psrlq mm1, 56
- movq mm4, mm1
- lea r2, [r1+r3*2]
- movq mm2, mm6
- PALIGNR mm2, mm7, 1, mm0
- movq mm3, mm6
- PALIGNR mm3, mm7, 7, mm0
- PALIGNR mm4, mm6, 1, mm0
- movq mm5, mm7
- movq mm1, mm7
- movq mm7, mm6
- lea r4, [r2+r3*2]
- psllq mm1, 8
- PRED4x4_LOWPASS mm0, mm1, mm2, mm5, mm6
- PRED4x4_LOWPASS mm1, mm3, mm4, mm7, mm6
- movq [r4+r3*2], mm1
- movq mm2, mm0
- psllq mm1, 8
- psrlq mm2, 56
- psllq mm0, 8
- por mm1, mm2
- movq [r4+r3*1], mm1
- movq mm2, mm0
- psllq mm1, 8
- psrlq mm2, 56
- psllq mm0, 8
- por mm1, mm2
- movq [r2+r3*2], mm1
- movq mm2, mm0
- psllq mm1, 8
- psrlq mm2, 56
- psllq mm0, 8
- por mm1, mm2
- movq [r2+r3*1], mm1
- movq mm2, mm0
- psllq mm1, 8
- psrlq mm2, 56
- psllq mm0, 8
- por mm1, mm2
- movq [r1+r3*2], mm1
- movq mm2, mm0
- psllq mm1, 8
- psrlq mm2, 56
- psllq mm0, 8
- por mm1, mm2
- movq [r1+r3*1], mm1
- movq mm2, mm0
- psllq mm1, 8
- psrlq mm2, 56
- psllq mm0, 8
- por mm1, mm2
- movq [r0+r3*2], mm1
- psllq mm1, 8
- psrlq mm0, 56
- por mm1, mm0
- movq [r0+r3*1], mm1
- RET
- %macro PRED8x8L_DOWN_LEFT 0
- cglobal pred8x8l_down_left_8, 4,4
- sub r0, r3
- movq mm0, [r0-8]
- movq mm3, [r0]
- movq mm1, [r0+8]
- movq mm2, mm3
- movq mm4, mm3
- PALIGNR mm2, mm0, 7, mm0
- PALIGNR mm1, mm4, 1, mm4
- test r1, r1 ; top_left
- jz .fix_lt_2
- test r2, r2 ; top_right
- jz .fix_tr_1
- jmp .do_top
- .fix_lt_2:
- movq mm5, mm3
- pxor mm5, mm2
- psllq mm5, 56
- psrlq mm5, 56
- pxor mm2, mm5
- test r2, r2 ; top_right
- jnz .do_top
- .fix_tr_1:
- movq mm5, mm3
- pxor mm5, mm1
- psrlq mm5, 56
- psllq mm5, 56
- pxor mm1, mm5
- jmp .do_top
- .fix_tr_2:
- punpckhbw mm3, mm3
- pshufw mm1, mm3, 0xFF
- jmp .do_topright
- .do_top:
- PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5
- movq2dq xmm3, mm4
- test r2, r2 ; top_right
- jz .fix_tr_2
- movq mm0, [r0+8]
- movq mm5, mm0
- movq mm2, mm0
- movq mm4, mm0
- psrlq mm5, 56
- PALIGNR mm2, mm3, 7, mm3
- PALIGNR mm5, mm4, 1, mm4
- PRED4x4_LOWPASS mm1, mm2, mm5, mm0, mm4
- .do_topright:
- movq2dq xmm4, mm1
- psrlq mm1, 56
- movq2dq xmm5, mm1
- lea r1, [r0+r3*2]
- pslldq xmm4, 8
- por xmm3, xmm4
- movdqa xmm2, xmm3
- psrldq xmm2, 1
- pslldq xmm5, 15
- por xmm2, xmm5
- lea r2, [r1+r3*2]
- movdqa xmm1, xmm3
- pslldq xmm1, 1
- INIT_XMM cpuname
- PRED4x4_LOWPASS xmm0, xmm1, xmm2, xmm3, xmm4
- psrldq xmm0, 1
- movq [r0+r3*1], xmm0
- psrldq xmm0, 1
- movq [r0+r3*2], xmm0
- psrldq xmm0, 1
- lea r0, [r2+r3*2]
- movq [r1+r3*1], xmm0
- psrldq xmm0, 1
- movq [r1+r3*2], xmm0
- psrldq xmm0, 1
- movq [r2+r3*1], xmm0
- psrldq xmm0, 1
- movq [r2+r3*2], xmm0
- psrldq xmm0, 1
- movq [r0+r3*1], xmm0
- psrldq xmm0, 1
- movq [r0+r3*2], xmm0
- RET
- %endmacro
- INIT_MMX sse2
- PRED8x8L_DOWN_LEFT
- INIT_MMX ssse3
- PRED8x8L_DOWN_LEFT
- ;-----------------------------------------------------------------------------
- ; void ff_pred8x8l_down_right_8_mmxext(uint8_t *src, int has_topleft,
- ; int has_topright, int stride)
- ;-----------------------------------------------------------------------------
- INIT_MMX mmxext
- cglobal pred8x8l_down_right_8, 4,5
- sub r0, r3
- lea r4, [r0+r3*2]
- movq mm0, [r0+r3*1-8]
- punpckhbw mm0, [r0+r3*0-8]
- movq mm1, [r4+r3*1-8]
- punpckhbw mm1, [r0+r3*2-8]
- mov r4, r0
- punpckhwd mm1, mm0
- lea r0, [r0+r3*4]
- movq mm2, [r0+r3*1-8]
- punpckhbw mm2, [r0+r3*0-8]
- lea r0, [r0+r3*2]
- movq mm3, [r0+r3*1-8]
- punpckhbw mm3, [r0+r3*0-8]
- punpckhwd mm3, mm2
- punpckhdq mm3, mm1
- lea r0, [r0+r3*2]
- movq mm0, [r0+r3*0-8]
- movq mm1, [r4]
- mov r0, r4
- movq mm4, mm3
- movq mm2, mm3
- PALIGNR mm4, mm0, 7, mm0
- PALIGNR mm1, mm2, 1, mm2
- test r1, r1 ; top_left
- jz .fix_lt_1
- .do_left:
- movq mm0, mm4
- PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
- movq mm4, mm0
- movq mm7, mm2
- movq mm6, mm2
- PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5
- psllq mm1, 56
- PALIGNR mm7, mm1, 7, mm3
- movq mm0, [r0-8]
- movq mm3, [r0]
- movq mm1, [r0+8]
- movq mm2, mm3
- movq mm4, mm3
- PALIGNR mm2, mm0, 7, mm0
- PALIGNR mm1, mm4, 1, mm4
- test r1, r1 ; top_left
- jz .fix_lt_2
- test r2, r2 ; top_right
- jz .fix_tr_1
- .do_top:
- PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5
- movq mm5, mm4
- jmp .body
- .fix_lt_1:
- movq mm5, mm3
- pxor mm5, mm4
- psrlq mm5, 56
- psllq mm5, 48
- pxor mm1, mm5
- jmp .do_left
- .fix_lt_2:
- movq mm5, mm3
- pxor mm5, mm2
- psllq mm5, 56
- psrlq mm5, 56
- pxor mm2, mm5
- test r2, r2 ; top_right
- jnz .do_top
- .fix_tr_1:
- movq mm5, mm3
- pxor mm5, mm1
- psrlq mm5, 56
- psllq mm5, 56
- pxor mm1, mm5
- jmp .do_top
- .body:
- lea r1, [r0+r3*2]
- movq mm1, mm7
- movq mm7, mm5
- movq mm5, mm6
- movq mm2, mm7
- lea r2, [r1+r3*2]
- PALIGNR mm2, mm6, 1, mm0
- movq mm3, mm7
- PALIGNR mm3, mm6, 7, mm0
- movq mm4, mm7
- lea r4, [r2+r3*2]
- psrlq mm4, 8
- PRED4x4_LOWPASS mm0, mm1, mm2, mm5, mm6
- PRED4x4_LOWPASS mm1, mm3, mm4, mm7, mm6
- movq [r4+r3*2], mm0
- movq mm2, mm1
- psrlq mm0, 8
- psllq mm2, 56
- psrlq mm1, 8
- por mm0, mm2
- movq [r4+r3*1], mm0
- movq mm2, mm1
- psrlq mm0, 8
- psllq mm2, 56
- psrlq mm1, 8
- por mm0, mm2
- movq [r2+r3*2], mm0
- movq mm2, mm1
- psrlq mm0, 8
- psllq mm2, 56
- psrlq mm1, 8
- por mm0, mm2
- movq [r2+r3*1], mm0
- movq mm2, mm1
- psrlq mm0, 8
- psllq mm2, 56
- psrlq mm1, 8
- por mm0, mm2
- movq [r1+r3*2], mm0
- movq mm2, mm1
- psrlq mm0, 8
- psllq mm2, 56
- psrlq mm1, 8
- por mm0, mm2
- movq [r1+r3*1], mm0
- movq mm2, mm1
- psrlq mm0, 8
- psllq mm2, 56
- psrlq mm1, 8
- por mm0, mm2
- movq [r0+r3*2], mm0
- psrlq mm0, 8
- psllq mm1, 56
- por mm0, mm1
- movq [r0+r3*1], mm0
- RET
- %macro PRED8x8L_DOWN_RIGHT 0
- cglobal pred8x8l_down_right_8, 4,5
- sub r0, r3
- lea r4, [r0+r3*2]
- movq mm0, [r0+r3*1-8]
- punpckhbw mm0, [r0+r3*0-8]
- movq mm1, [r4+r3*1-8]
- punpckhbw mm1, [r0+r3*2-8]
- mov r4, r0
- punpckhwd mm1, mm0
- lea r0, [r0+r3*4]
- movq mm2, [r0+r3*1-8]
- punpckhbw mm2, [r0+r3*0-8]
- lea r0, [r0+r3*2]
- movq mm3, [r0+r3*1-8]
- punpckhbw mm3, [r0+r3*0-8]
- punpckhwd mm3, mm2
- punpckhdq mm3, mm1
- lea r0, [r0+r3*2]
- movq mm0, [r0+r3*0-8]
- movq mm1, [r4]
- mov r0, r4
- movq mm4, mm3
- movq mm2, mm3
- PALIGNR mm4, mm0, 7, mm0
- PALIGNR mm1, mm2, 1, mm2
- test r1, r1
- jz .fix_lt_1
- jmp .do_left
- .fix_lt_1:
- movq mm5, mm3
- pxor mm5, mm4
- psrlq mm5, 56
- psllq mm5, 48
- pxor mm1, mm5
- jmp .do_left
- .fix_lt_2:
- movq mm5, mm3
- pxor mm5, mm2
- psllq mm5, 56
- psrlq mm5, 56
- pxor mm2, mm5
- test r2, r2
- jnz .do_top
- .fix_tr_1:
- movq mm5, mm3
- pxor mm5, mm1
- psrlq mm5, 56
- psllq mm5, 56
- pxor mm1, mm5
- jmp .do_top
- .do_left:
- movq mm0, mm4
- PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
- movq mm4, mm0
- movq mm7, mm2
- movq2dq xmm3, mm2
- PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5
- psllq mm1, 56
- PALIGNR mm7, mm1, 7, mm3
- movq2dq xmm1, mm7
- movq mm0, [r0-8]
- movq mm3, [r0]
- movq mm1, [r0+8]
- movq mm2, mm3
- movq mm4, mm3
- PALIGNR mm2, mm0, 7, mm0
- PALIGNR mm1, mm4, 1, mm4
- test r1, r1
- jz .fix_lt_2
- test r2, r2
- jz .fix_tr_1
- .do_top:
- PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5
- movq2dq xmm4, mm4
- lea r1, [r0+r3*2]
- movdqa xmm0, xmm3
- pslldq xmm4, 8
- por xmm3, xmm4
- lea r2, [r1+r3*2]
- pslldq xmm4, 1
- por xmm1, xmm4
- psrldq xmm0, 7
- pslldq xmm0, 15
- psrldq xmm0, 7
- por xmm1, xmm0
- lea r0, [r2+r3*2]
- movdqa xmm2, xmm3
- psrldq xmm2, 1
- INIT_XMM cpuname
- PRED4x4_LOWPASS xmm0, xmm1, xmm2, xmm3, xmm4
- movdqa xmm1, xmm0
- psrldq xmm1, 1
- movq [r0+r3*2], xmm0
- movq [r0+r3*1], xmm1
- psrldq xmm0, 2
- psrldq xmm1, 2
- movq [r2+r3*2], xmm0
- movq [r2+r3*1], xmm1
- psrldq xmm0, 2
- psrldq xmm1, 2
- movq [r1+r3*2], xmm0
- movq [r1+r3*1], xmm1
- psrldq xmm0, 2
- psrldq xmm1, 2
- movq [r4+r3*2], xmm0
- movq [r4+r3*1], xmm1
- RET
- %endmacro
- INIT_MMX sse2
- PRED8x8L_DOWN_RIGHT
- INIT_MMX ssse3
- PRED8x8L_DOWN_RIGHT
- ;-----------------------------------------------------------------------------
- ; void ff_pred8x8l_vertical_right_8(uint8_t *src, int has_topleft,
- ; int has_topright, int stride)
- ;-----------------------------------------------------------------------------
- INIT_MMX mmxext
- cglobal pred8x8l_vertical_right_8, 4,5
- sub r0, r3
- lea r4, [r0+r3*2]
- movq mm0, [r0+r3*1-8]
- punpckhbw mm0, [r0+r3*0-8]
- movq mm1, [r4+r3*1-8]
- punpckhbw mm1, [r0+r3*2-8]
- mov r4, r0
- punpckhwd mm1, mm0
- lea r0, [r0+r3*4]
- movq mm2, [r0+r3*1-8]
- punpckhbw mm2, [r0+r3*0-8]
- lea r0, [r0+r3*2]
- movq mm3, [r0+r3*1-8]
- punpckhbw mm3, [r0+r3*0-8]
- punpckhwd mm3, mm2
- punpckhdq mm3, mm1
- lea r0, [r0+r3*2]
- movq mm0, [r0+r3*0-8]
- movq mm1, [r4]
- mov r0, r4
- movq mm4, mm3
- movq mm2, mm3
- PALIGNR mm4, mm0, 7, mm0
- PALIGNR mm1, mm2, 1, mm2
- test r1, r1
- jz .fix_lt_1
- jmp .do_left
- .fix_lt_1:
- movq mm5, mm3
- pxor mm5, mm4
- psrlq mm5, 56
- psllq mm5, 48
- pxor mm1, mm5
- jmp .do_left
- .fix_lt_2:
- movq mm5, mm3
- pxor mm5, mm2
- psllq mm5, 56
- psrlq mm5, 56
- pxor mm2, mm5
- test r2, r2
- jnz .do_top
- .fix_tr_1:
- movq mm5, mm3
- pxor mm5, mm1
- psrlq mm5, 56
- psllq mm5, 56
- pxor mm1, mm5
- jmp .do_top
- .do_left:
- movq mm0, mm4
- PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
- movq mm7, mm2
- movq mm0, [r0-8]
- movq mm3, [r0]
- movq mm1, [r0+8]
- movq mm2, mm3
- movq mm4, mm3
- PALIGNR mm2, mm0, 7, mm0
- PALIGNR mm1, mm4, 1, mm4
- test r1, r1
- jz .fix_lt_2
- test r2, r2
- jz .fix_tr_1
- .do_top:
- PRED4x4_LOWPASS mm6, mm2, mm1, mm3, mm5
- lea r1, [r0+r3*2]
- movq mm2, mm6
- movq mm3, mm6
- PALIGNR mm3, mm7, 7, mm0
- PALIGNR mm6, mm7, 6, mm1
- movq mm4, mm3
- pavgb mm3, mm2
- lea r2, [r1+r3*2]
- PRED4x4_LOWPASS mm0, mm6, mm2, mm4, mm5
- movq [r0+r3*1], mm3
- movq [r0+r3*2], mm0
- movq mm5, mm0
- movq mm6, mm3
- movq mm1, mm7
- movq mm2, mm1
- psllq mm2, 8
- movq mm3, mm1
- psllq mm3, 16
- lea r4, [r2+r3*2]
- PRED4x4_LOWPASS mm0, mm1, mm3, mm2, mm4
- PALIGNR mm6, mm0, 7, mm2
- movq [r1+r3*1], mm6
- psllq mm0, 8
- PALIGNR mm5, mm0, 7, mm1
- movq [r1+r3*2], mm5
- psllq mm0, 8
- PALIGNR mm6, mm0, 7, mm2
- movq [r2+r3*1], mm6
- psllq mm0, 8
- PALIGNR mm5, mm0, 7, mm1
- movq [r2+r3*2], mm5
- psllq mm0, 8
- PALIGNR mm6, mm0, 7, mm2
- movq [r4+r3*1], mm6
- psllq mm0, 8
- PALIGNR mm5, mm0, 7, mm1
- movq [r4+r3*2], mm5
- RET
- %macro PRED8x8L_VERTICAL_RIGHT 0
- cglobal pred8x8l_vertical_right_8, 4,5,7
- ; manually spill XMM registers for Win64 because
- ; the code here is initialized with INIT_MMX
- WIN64_SPILL_XMM 7
- sub r0, r3
- lea r4, [r0+r3*2]
- movq mm0, [r0+r3*1-8]
- punpckhbw mm0, [r0+r3*0-8]
- movq mm1, [r4+r3*1-8]
- punpckhbw mm1, [r0+r3*2-8]
- mov r4, r0
- punpckhwd mm1, mm0
- lea r0, [r0+r3*4]
- movq mm2, [r0+r3*1-8]
- punpckhbw mm2, [r0+r3*0-8]
- lea r0, [r0+r3*2]
- movq mm3, [r0+r3*1-8]
- punpckhbw mm3, [r0+r3*0-8]
- punpckhwd mm3, mm2
- punpckhdq mm3, mm1
- lea r0, [r0+r3*2]
- movq mm0, [r0+r3*0-8]
- movq mm1, [r4]
- mov r0, r4
- movq mm4, mm3
- movq mm2, mm3
- PALIGNR mm4, mm0, 7, mm0
- PALIGNR mm1, mm2, 1, mm2
- test r1, r1
- jnz .do_left
- .fix_lt_1:
- movq mm5, mm3
- pxor mm5, mm4
- psrlq mm5, 56
- psllq mm5, 48
- pxor mm1, mm5
- jmp .do_left
- .fix_lt_2:
- movq mm5, mm3
- pxor mm5, mm2
- psllq mm5, 56
- psrlq mm5, 56
- pxor mm2, mm5
- test r2, r2
- jnz .do_top
- .fix_tr_1:
- movq mm5, mm3
- pxor mm5, mm1
- psrlq mm5, 56
- psllq mm5, 56
- pxor mm1, mm5
- jmp .do_top
- .do_left:
- movq mm0, mm4
- PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
- movq2dq xmm0, mm2
- movq mm0, [r0-8]
- movq mm3, [r0]
- movq mm1, [r0+8]
- movq mm2, mm3
- movq mm4, mm3
- PALIGNR mm2, mm0, 7, mm0
- PALIGNR mm1, mm4, 1, mm4
- test r1, r1
- jz .fix_lt_2
- test r2, r2
- jz .fix_tr_1
- .do_top:
- PRED4x4_LOWPASS mm6, mm2, mm1, mm3, mm5
- lea r1, [r0+r3*2]
- movq2dq xmm4, mm6
- pslldq xmm4, 8
- por xmm0, xmm4
- movdqa xmm6, [pw_ff00]
- movdqa xmm1, xmm0
- lea r2, [r1+r3*2]
- movdqa xmm2, xmm0
- movdqa xmm3, xmm0
- pslldq xmm0, 1
- pslldq xmm1, 2
- pavgb xmm2, xmm0
- INIT_XMM cpuname
- PRED4x4_LOWPASS xmm4, xmm3, xmm1, xmm0, xmm5
- pandn xmm6, xmm4
- movdqa xmm5, xmm4
- psrlw xmm4, 8
- packuswb xmm6, xmm4
- movhlps xmm4, xmm6
- movhps [r0+r3*2], xmm5
- movhps [r0+r3*1], xmm2
- psrldq xmm5, 4
- movss xmm5, xmm6
- psrldq xmm2, 4
- movss xmm2, xmm4
- lea r0, [r2+r3*2]
- psrldq xmm5, 1
- psrldq xmm2, 1
- movq [r0+r3*2], xmm5
- movq [r0+r3*1], xmm2
- psrldq xmm5, 1
- psrldq xmm2, 1
- movq [r2+r3*2], xmm5
- movq [r2+r3*1], xmm2
- psrldq xmm5, 1
- psrldq xmm2, 1
- movq [r1+r3*2], xmm5
- movq [r1+r3*1], xmm2
- RET
- %endmacro
- INIT_MMX sse2
- PRED8x8L_VERTICAL_RIGHT
- INIT_MMX ssse3
- PRED8x8L_VERTICAL_RIGHT
- ;-----------------------------------------------------------------------------
- ; void ff_pred8x8l_vertical_left_8(uint8_t *src, int has_topleft,
- ; int has_topright, int stride)
- ;-----------------------------------------------------------------------------
- %macro PRED8x8L_VERTICAL_LEFT 0
- cglobal pred8x8l_vertical_left_8, 4,4
- sub r0, r3
- movq mm0, [r0-8]
- movq mm3, [r0]
- movq mm1, [r0+8]
- movq mm2, mm3
- movq mm4, mm3
- PALIGNR mm2, mm0, 7, mm0
- PALIGNR mm1, mm4, 1, mm4
- test r1, r1
- jz .fix_lt_2
- test r2, r2
- jz .fix_tr_1
- jmp .do_top
- .fix_lt_2:
- movq mm5, mm3
- pxor mm5, mm2
- psllq mm5, 56
- psrlq mm5, 56
- pxor mm2, mm5
- test r2, r2
- jnz .do_top
- .fix_tr_1:
- movq mm5, mm3
- pxor mm5, mm1
- psrlq mm5, 56
- psllq mm5, 56
- pxor mm1, mm5
- jmp .do_top
- .fix_tr_2:
- punpckhbw mm3, mm3
- pshufw mm1, mm3, 0xFF
- jmp .do_topright
- .do_top:
- PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5
- movq2dq xmm4, mm4
- test r2, r2
- jz .fix_tr_2
- movq mm0, [r0+8]
- movq mm5, mm0
- movq mm2, mm0
- movq mm4, mm0
- psrlq mm5, 56
- PALIGNR mm2, mm3, 7, mm3
- PALIGNR mm5, mm4, 1, mm4
- PRED4x4_LOWPASS mm1, mm2, mm5, mm0, mm4
- .do_topright:
- movq2dq xmm3, mm1
- lea r1, [r0+r3*2]
- pslldq xmm3, 8
- por xmm4, xmm3
- movdqa xmm2, xmm4
- movdqa xmm1, xmm4
- movdqa xmm3, xmm4
- psrldq xmm2, 1
- pslldq xmm1, 1
- pavgb xmm3, xmm2
- lea r2, [r1+r3*2]
- INIT_XMM cpuname
- PRED4x4_LOWPASS xmm0, xmm1, xmm2, xmm4, xmm5
- psrldq xmm0, 1
- movq [r0+r3*1], xmm3
- movq [r0+r3*2], xmm0
- lea r0, [r2+r3*2]
- psrldq xmm3, 1
- psrldq xmm0, 1
- movq [r1+r3*1], xmm3
- movq [r1+r3*2], xmm0
- psrldq xmm3, 1
- psrldq xmm0, 1
- movq [r2+r3*1], xmm3
- movq [r2+r3*2], xmm0
- psrldq xmm3, 1
- psrldq xmm0, 1
- movq [r0+r3*1], xmm3
- movq [r0+r3*2], xmm0
- RET
- %endmacro
- INIT_MMX sse2
- PRED8x8L_VERTICAL_LEFT
- INIT_MMX ssse3
- PRED8x8L_VERTICAL_LEFT
- ;-----------------------------------------------------------------------------
- ; void ff_pred8x8l_horizontal_up_8(uint8_t *src, int has_topleft,
- ; int has_topright, int stride)
- ;-----------------------------------------------------------------------------
- %macro PRED8x8L_HORIZONTAL_UP 0
- cglobal pred8x8l_horizontal_up_8, 4,4
- sub r0, r3
- lea r2, [r0+r3*2]
- movq mm0, [r0+r3*1-8]
- test r1, r1
- lea r1, [r0+r3]
- cmovnz r1, r0
- punpckhbw mm0, [r1+r3*0-8]
- movq mm1, [r2+r3*1-8]
- punpckhbw mm1, [r0+r3*2-8]
- mov r2, r0
- punpckhwd mm1, mm0
- lea r0, [r0+r3*4]
- movq mm2, [r0+r3*1-8]
- punpckhbw mm2, [r0+r3*0-8]
- lea r0, [r0+r3*2]
- movq mm3, [r0+r3*1-8]
- punpckhbw mm3, [r0+r3*0-8]
- punpckhwd mm3, mm2
- punpckhdq mm3, mm1
- lea r0, [r0+r3*2]
- movq mm0, [r0+r3*0-8]
- movq mm1, [r1+r3*0-8]
- mov r0, r2
- movq mm4, mm3
- movq mm2, mm3
- PALIGNR mm4, mm0, 7, mm0
- PALIGNR mm1, mm2, 1, mm2
- movq mm0, mm4
- PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
- movq mm4, mm0
- movq mm7, mm2
- PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5
- psllq mm1, 56
- PALIGNR mm7, mm1, 7, mm3
- lea r1, [r0+r3*2]
- pshufw mm0, mm7, 00011011b ; l6 l7 l4 l5 l2 l3 l0 l1
- psllq mm7, 56 ; l7 .. .. .. .. .. .. ..
- movq mm2, mm0
- psllw mm0, 8
- psrlw mm2, 8
- por mm2, mm0 ; l7 l6 l5 l4 l3 l2 l1 l0
- movq mm3, mm2
- movq mm4, mm2
- movq mm5, mm2
- psrlq mm2, 8
- psrlq mm3, 16
- lea r2, [r1+r3*2]
- por mm2, mm7 ; l7 l7 l6 l5 l4 l3 l2 l1
- punpckhbw mm7, mm7
- por mm3, mm7 ; l7 l7 l7 l6 l5 l4 l3 l2
- pavgb mm4, mm2
- PRED4x4_LOWPASS mm1, mm3, mm5, mm2, mm6
- movq mm5, mm4
- punpcklbw mm4, mm1 ; p4 p3 p2 p1
- punpckhbw mm5, mm1 ; p8 p7 p6 p5
- movq mm6, mm5
- movq mm7, mm5
- movq mm0, mm5
- PALIGNR mm5, mm4, 2, mm1
- pshufw mm1, mm6, 11111001b
- PALIGNR mm6, mm4, 4, mm2
- pshufw mm2, mm7, 11111110b
- PALIGNR mm7, mm4, 6, mm3
- pshufw mm3, mm0, 11111111b
- movq [r0+r3*1], mm4
- movq [r0+r3*2], mm5
- lea r0, [r2+r3*2]
- movq [r1+r3*1], mm6
- movq [r1+r3*2], mm7
- movq [r2+r3*1], mm0
- movq [r2+r3*2], mm1
- movq [r0+r3*1], mm2
- movq [r0+r3*2], mm3
- RET
- %endmacro
- INIT_MMX mmxext
- PRED8x8L_HORIZONTAL_UP
- INIT_MMX ssse3
- PRED8x8L_HORIZONTAL_UP
- ;-----------------------------------------------------------------------------
- ; void ff_pred8x8l_horizontal_down_8(uint8_t *src, int has_topleft,
- ; int has_topright, int stride)
- ;-----------------------------------------------------------------------------
- INIT_MMX mmxext
- cglobal pred8x8l_horizontal_down_8, 4,5
- sub r0, r3
- lea r4, [r0+r3*2]
- movq mm0, [r0+r3*1-8]
- punpckhbw mm0, [r0+r3*0-8]
- movq mm1, [r4+r3*1-8]
- punpckhbw mm1, [r0+r3*2-8]
- mov r4, r0
- punpckhwd mm1, mm0
- lea r0, [r0+r3*4]
- movq mm2, [r0+r3*1-8]
- punpckhbw mm2, [r0+r3*0-8]
- lea r0, [r0+r3*2]
- movq mm3, [r0+r3*1-8]
- punpckhbw mm3, [r0+r3*0-8]
- punpckhwd mm3, mm2
- punpckhdq mm3, mm1
- lea r0, [r0+r3*2]
- movq mm0, [r0+r3*0-8]
- movq mm1, [r4]
- mov r0, r4
- movq mm4, mm3
- movq mm2, mm3
- PALIGNR mm4, mm0, 7, mm0
- PALIGNR mm1, mm2, 1, mm2
- test r1, r1
- jnz .do_left
- .fix_lt_1:
- movq mm5, mm3
- pxor mm5, mm4
- psrlq mm5, 56
- psllq mm5, 48
- pxor mm1, mm5
- jmp .do_left
- .fix_lt_2:
- movq mm5, mm3
- pxor mm5, mm2
- psllq mm5, 56
- psrlq mm5, 56
- pxor mm2, mm5
- test r2, r2
- jnz .do_top
- .fix_tr_1:
- movq mm5, mm3
- pxor mm5, mm1
- psrlq mm5, 56
- psllq mm5, 56
- pxor mm1, mm5
- jmp .do_top
- .do_left:
- movq mm0, mm4
- PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
- movq mm4, mm0
- movq mm7, mm2
- movq mm6, mm2
- PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5
- psllq mm1, 56
- PALIGNR mm7, mm1, 7, mm3
- movq mm0, [r0-8]
- movq mm3, [r0]
- movq mm1, [r0+8]
- movq mm2, mm3
- movq mm4, mm3
- PALIGNR mm2, mm0, 7, mm0
- PALIGNR mm1, mm4, 1, mm4
- test r1, r1
- jz .fix_lt_2
- test r2, r2
- jz .fix_tr_1
- .do_top:
- PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5
- movq mm5, mm4
- lea r1, [r0+r3*2]
- psllq mm7, 56
- movq mm2, mm5
- movq mm3, mm6
- movq mm4, mm2
- PALIGNR mm2, mm6, 7, mm5
- PALIGNR mm6, mm7, 7, mm0
- lea r2, [r1+r3*2]
- PALIGNR mm4, mm3, 1, mm7
- movq mm5, mm3
- pavgb mm3, mm6
- PRED4x4_LOWPASS mm0, mm4, mm6, mm5, mm7
- movq mm4, mm2
- movq mm1, mm2
- lea r4, [r2+r3*2]
- psrlq mm4, 16
- psrlq mm1, 8
- PRED4x4_LOWPASS mm6, mm4, mm2, mm1, mm5
- movq mm7, mm3
- punpcklbw mm3, mm0
- punpckhbw mm7, mm0
- movq mm1, mm7
- movq mm0, mm7
- movq mm4, mm7
- movq [r4+r3*2], mm3
- PALIGNR mm7, mm3, 2, mm5
- movq [r4+r3*1], mm7
- PALIGNR mm1, mm3, 4, mm5
- movq [r2+r3*2], mm1
- PALIGNR mm0, mm3, 6, mm3
- movq [r2+r3*1], mm0
- movq mm2, mm6
- movq mm3, mm6
- movq [r1+r3*2], mm4
- PALIGNR mm6, mm4, 2, mm5
- movq [r1+r3*1], mm6
- PALIGNR mm2, mm4, 4, mm5
- movq [r0+r3*2], mm2
- PALIGNR mm3, mm4, 6, mm4
- movq [r0+r3*1], mm3
- RET
- %macro PRED8x8L_HORIZONTAL_DOWN 0
- cglobal pred8x8l_horizontal_down_8, 4,5
- sub r0, r3
- lea r4, [r0+r3*2]
- movq mm0, [r0+r3*1-8]
- punpckhbw mm0, [r0+r3*0-8]
- movq mm1, [r4+r3*1-8]
- punpckhbw mm1, [r0+r3*2-8]
- mov r4, r0
- punpckhwd mm1, mm0
- lea r0, [r0+r3*4]
- movq mm2, [r0+r3*1-8]
- punpckhbw mm2, [r0+r3*0-8]
- lea r0, [r0+r3*2]
- movq mm3, [r0+r3*1-8]
- punpckhbw mm3, [r0+r3*0-8]
- punpckhwd mm3, mm2
- punpckhdq mm3, mm1
- lea r0, [r0+r3*2]
- movq mm0, [r0+r3*0-8]
- movq mm1, [r4]
- mov r0, r4
- movq mm4, mm3
- movq mm2, mm3
- PALIGNR mm4, mm0, 7, mm0
- PALIGNR mm1, mm2, 1, mm2
- test r1, r1
- jnz .do_left
- .fix_lt_1:
- movq mm5, mm3
- pxor mm5, mm4
- psrlq mm5, 56
- psllq mm5, 48
- pxor mm1, mm5
- jmp .do_left
- .fix_lt_2:
- movq mm5, mm3
- pxor mm5, mm2
- psllq mm5, 56
- psrlq mm5, 56
- pxor mm2, mm5
- test r2, r2
- jnz .do_top
- .fix_tr_1:
- movq mm5, mm3
- pxor mm5, mm1
- psrlq mm5, 56
- psllq mm5, 56
- pxor mm1, mm5
- jmp .do_top
- .fix_tr_2:
- punpckhbw mm3, mm3
- pshufw mm1, mm3, 0xFF
- jmp .do_topright
- .do_left:
- movq mm0, mm4
- PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
- movq2dq xmm0, mm2
- pslldq xmm0, 8
- movq mm4, mm0
- PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5
- movq2dq xmm2, mm1
- pslldq xmm2, 15
- psrldq xmm2, 8
- por xmm0, xmm2
- movq mm0, [r0-8]
- movq mm3, [r0]
- movq mm1, [r0+8]
- movq mm2, mm3
- movq mm4, mm3
- PALIGNR mm2, mm0, 7, mm0
- PALIGNR mm1, mm4, 1, mm4
- test r1, r1
- jz .fix_lt_2
- test r2, r2
- jz .fix_tr_1
- .do_top:
- PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5
- movq2dq xmm1, mm4
- test r2, r2
- jz .fix_tr_2
- movq mm0, [r0+8]
- movq mm5, mm0
- movq mm2, mm0
- movq mm4, mm0
- psrlq mm5, 56
- PALIGNR mm2, mm3, 7, mm3
- PALIGNR mm5, mm4, 1, mm4
- PRED4x4_LOWPASS mm1, mm2, mm5, mm0, mm4
- .do_topright:
- movq2dq xmm5, mm1
- pslldq xmm5, 8
- por xmm1, xmm5
- INIT_XMM cpuname
- lea r2, [r4+r3*2]
- movdqa xmm2, xmm1
- movdqa xmm3, xmm1
- PALIGNR xmm1, xmm0, 7, xmm4
- PALIGNR xmm2, xmm0, 9, xmm5
- lea r1, [r2+r3*2]
- PALIGNR xmm3, xmm0, 8, xmm0
- movdqa xmm4, xmm1
- pavgb xmm4, xmm3
- lea r0, [r1+r3*2]
- PRED4x4_LOWPASS xmm0, xmm1, xmm2, xmm3, xmm5
- punpcklbw xmm4, xmm0
- movhlps xmm0, xmm4
- movq [r0+r3*2], xmm4
- movq [r2+r3*2], xmm0
- psrldq xmm4, 2
- psrldq xmm0, 2
- movq [r0+r3*1], xmm4
- movq [r2+r3*1], xmm0
- psrldq xmm4, 2
- psrldq xmm0, 2
- movq [r1+r3*2], xmm4
- movq [r4+r3*2], xmm0
- psrldq xmm4, 2
- psrldq xmm0, 2
- movq [r1+r3*1], xmm4
- movq [r4+r3*1], xmm0
- RET
- %endmacro
- INIT_MMX sse2
- PRED8x8L_HORIZONTAL_DOWN
- INIT_MMX ssse3
- PRED8x8L_HORIZONTAL_DOWN
- ;-------------------------------------------------------------------------------
- ; void ff_pred4x4_dc_8_mmxext(uint8_t *src, const uint8_t *topright, int stride)
- ;-------------------------------------------------------------------------------
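- ; DC prediction: every pixel of the 4x4 block is the rounded mean of the
- ; four top and four left neighbours, i.e. (t0+t1+t2+t3 + l0+l1+l2+l3 + 4)>>3.
- ; psadbw against zero sums the top bytes in one step, the left column is
- ; accumulated with scalar movzx/add, and the imul by 0x01010101 broadcasts
- ; the result to a dword for the four row stores.  r1 (topright) is reused
- ; as a scratch register here.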
- INIT_MMX mmxext
- cglobal pred4x4_dc_8, 3,5
- pxor mm7, mm7
- mov r4, r0
- sub r0, r2
- movd mm0, [r0]
- psadbw mm0, mm7
- movzx r1d, byte [r0+r2*1-1]
- movd r3d, mm0
- add r3d, r1d
- movzx r1d, byte [r0+r2*2-1]
- lea r0, [r0+r2*2]
- add r3d, r1d
- movzx r1d, byte [r0+r2*1-1]
- add r3d, r1d
- movzx r1d, byte [r0+r2*2-1]
- add r3d, r1d
- add r3d, 4
- shr r3d, 3
- imul r3d, 0x01010101
- mov [r4+r2*0], r3d
- mov [r0+r2*0], r3d
- mov [r0+r2*1], r3d
- mov [r0+r2*2], r3d
- RET
- ;-----------------------------------------------------------------------------
- ; void ff_pred4x4_tm_vp8_8_mmxext(uint8_t *src, const uint8_t *topright,
- ; int stride)
- ;-----------------------------------------------------------------------------
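- ; VP8 "TrueMotion" prediction: pred[y][x] = clip(top[x] + left[y] - topleft).
- ; The top row is kept as words in mm0; each loop iteration broadcasts
- ; left[y]-topleft, adds it, and packuswb provides the clamp to [0,255].
- ; The SSSE3 version further below subtracts a pshufb-broadcast topleft from
- ; the top row once and then adds the broadcast left pixels per row.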
- %macro PRED4x4_TM 0
- cglobal pred4x4_tm_vp8_8, 3,6
- sub r0, r2
- pxor mm7, mm7
- movd mm0, [r0]
- punpcklbw mm0, mm7
- movzx r4d, byte [r0-1]
- mov r5d, 2
- .loop:
- movzx r1d, byte [r0+r2*1-1]
- movzx r3d, byte [r0+r2*2-1]
- sub r1d, r4d
- sub r3d, r4d
- movd mm2, r1d
- movd mm4, r3d
- %if cpuflag(mmxext)
- pshufw mm2, mm2, 0
- pshufw mm4, mm4, 0
- %else
- punpcklwd mm2, mm2
- punpcklwd mm4, mm4
- punpckldq mm2, mm2
- punpckldq mm4, mm4
- %endif
- paddw mm2, mm0
- paddw mm4, mm0
- packuswb mm2, mm2
- packuswb mm4, mm4
- movd [r0+r2*1], mm2
- movd [r0+r2*2], mm4
- lea r0, [r0+r2*2]
- dec r5d
- jg .loop
- REP_RET
- %endmacro
- INIT_MMX mmx
- PRED4x4_TM
- INIT_MMX mmxext
- PRED4x4_TM
- INIT_XMM ssse3
- cglobal pred4x4_tm_vp8_8, 3,3
- sub r0, r2
- movq mm6, [tm_shuf]
- pxor mm1, mm1
- movd mm0, [r0]
- punpcklbw mm0, mm1
- movd mm7, [r0-4]
- pshufb mm7, mm6
- lea r1, [r0+r2*2]
- movd mm2, [r0+r2*1-4]
- movd mm3, [r0+r2*2-4]
- movd mm4, [r1+r2*1-4]
- movd mm5, [r1+r2*2-4]
- pshufb mm2, mm6
- pshufb mm3, mm6
- pshufb mm4, mm6
- pshufb mm5, mm6
- psubw mm0, mm7
- paddw mm2, mm0
- paddw mm3, mm0
- paddw mm4, mm0
- paddw mm5, mm0
- packuswb mm2, mm2
- packuswb mm3, mm3
- packuswb mm4, mm4
- packuswb mm5, mm5
- movd [r0+r2*1], mm2
- movd [r0+r2*2], mm3
- movd [r1+r2*1], mm4
- movd [r1+r2*2], mm5
- RET
- ;-----------------------------------------------------------------------------
- ; void ff_pred4x4_vertical_vp8_8_mmxext(uint8_t *src, const uint8_t *topright,
- ; int stride)
- ;-----------------------------------------------------------------------------
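- ; VP8 vertical prediction with edge smoothing: the top row is filtered as
- ; (t[x-1] + 2*t[x] + t[x+1] + 2) >> 2, with t[-1] the top-left sample and
- ; t[4] the first top-right sample, and the same filtered row is written to
- ; all four rows of the block.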
- INIT_MMX mmxext
- cglobal pred4x4_vertical_vp8_8, 3,3
- sub r0, r2
- movd m1, [r0-1]
- movd m0, [r0]
- mova m2, m0 ;t0 t1 t2 t3
- punpckldq m0, [r1] ;t0 t1 t2 t3 t4 t5 t6 t7
- lea r1, [r0+r2*2]
- psrlq m0, 8 ;t1 t2 t3 t4
- PRED4x4_LOWPASS m3, m1, m0, m2, m4
- movd [r0+r2*1], m3
- movd [r0+r2*2], m3
- movd [r1+r2*1], m3
- movd [r1+r2*2], m3
- RET
- ;-----------------------------------------------------------------------------
- ; void ff_pred4x4_down_left_8_mmxext(uint8_t *src, const uint8_t *topright,
- ; int stride)
- ;-----------------------------------------------------------------------------
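- ; Diagonal down-left (H.264 Intra_4x4 mode 3): pixels along each 45-degree
- ; down-left diagonal share the value (t[i] + 2*t[i+1] + t[i+2] + 2) >> 2
- ; taken from the top and top-right rows; successive rows are produced by
- ; shifting the filtered vector right by one byte.  The psllq/pxor/psrlq
- ; sequence duplicates the last top-right byte so the filter has a valid
- ; right neighbour at the end, matching the spec's special case there.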
- INIT_MMX mmxext
- cglobal pred4x4_down_left_8, 3,3
- sub r0, r2
- movq m1, [r0]
- punpckldq m1, [r1]
- movq m2, m1
- movq m3, m1
- psllq m1, 8
- pxor m2, m1
- psrlq m2, 8
- pxor m2, m3
- PRED4x4_LOWPASS m0, m1, m2, m3, m4
- lea r1, [r0+r2*2]
- psrlq m0, 8
- movd [r0+r2*1], m0
- psrlq m0, 8
- movd [r0+r2*2], m0
- psrlq m0, 8
- movd [r1+r2*1], m0
- psrlq m0, 8
- movd [r1+r2*2], m0
- RET
- ;------------------------------------------------------------------------------
- ; void ff_pred4x4_vertical_left_8_mmxext(uint8_t *src, const uint8_t *topright,
- ; int stride)
- ;------------------------------------------------------------------------------
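- ; Vertical-left (H.264 Intra_4x4 mode 7): even rows are pavgb averages of
- ; neighbouring top/top-right samples, odd rows their 1-2-1 lowpass; rows 2
- ; and 3 reuse the same two vectors shifted right by one sample.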
- INIT_MMX mmxext
- cglobal pred4x4_vertical_left_8, 3,3
- sub r0, r2
- movq m1, [r0]
- punpckldq m1, [r1]
- movq m3, m1
- movq m2, m1
- psrlq m3, 8
- psrlq m2, 16
- movq m4, m3
- pavgb m4, m1
- PRED4x4_LOWPASS m0, m1, m2, m3, m5
- lea r1, [r0+r2*2]
- movh [r0+r2*1], m4
- movh [r0+r2*2], m0
- psrlq m4, 8
- psrlq m0, 8
- movh [r1+r2*1], m4
- movh [r1+r2*2], m0
- RET
- ;------------------------------------------------------------------------------
- ; void ff_pred4x4_horizontal_up_8_mmxext(uint8_t *src, const uint8_t *topright,
- ; int stride)
- ;------------------------------------------------------------------------------
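- ; Horizontal-up (H.264 Intra_4x4 mode 8): the prediction interpolates down
- ; the left column, alternating pavgb averages and 1-2-1 filtered values of
- ; consecutive left samples; positions past the last left pixel l3 are
- ; clamped to l3 (the punpckhbw/pshufw pair replicates it).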
- INIT_MMX mmxext
- cglobal pred4x4_horizontal_up_8, 3,3
- sub r0, r2
- lea r1, [r0+r2*2]
- movd m0, [r0+r2*1-4]
- punpcklbw m0, [r0+r2*2-4]
- movd m1, [r1+r2*1-4]
- punpcklbw m1, [r1+r2*2-4]
- punpckhwd m0, m1
- movq m1, m0
- punpckhbw m1, m1
- pshufw m1, m1, 0xFF
- punpckhdq m0, m1
- movq m2, m0
- movq m3, m0
- movq m7, m0
- psrlq m2, 16
- psrlq m3, 8
- pavgb m7, m3
- PRED4x4_LOWPASS m4, m0, m2, m3, m5
- punpcklbw m7, m4
- movd [r0+r2*1], m7
- psrlq m7, 16
- movd [r0+r2*2], m7
- psrlq m7, 16
- movd [r1+r2*1], m7
- movd [r1+r2*2], m1
- RET
- ;------------------------------------------------------------------------------
- ; void ff_pred4x4_horizontal_down_8_mmxext(uint8_t *src,
- ; const uint8_t *topright, int stride)
- ;------------------------------------------------------------------------------
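- ; Horizontal-down (H.264 Intra_4x4 mode 6): the packed vector built in m1
- ; holds t2 t1 t0 lt l0 l1 l2 l3 (see the inline comments below); the
- ; predictions are pairwise pavgb averages and 1-2-1 filtered values of that
- ; vector, interleaved with punpcklbw and shifted out row by row.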
- INIT_MMX mmxext
- cglobal pred4x4_horizontal_down_8, 3,3
- sub r0, r2
- lea r1, [r0+r2*2]
- movh m0, [r0-4] ; lt ..
- punpckldq m0, [r0] ; t3 t2 t1 t0 lt .. .. ..
- psllq m0, 8 ; t2 t1 t0 lt .. .. .. ..
- movd m1, [r1+r2*2-4] ; l3
- punpcklbw m1, [r1+r2*1-4] ; l2 l3
- movd m2, [r0+r2*2-4] ; l1
- punpcklbw m2, [r0+r2*1-4] ; l0 l1
- punpckhwd m1, m2 ; l0 l1 l2 l3
- punpckhdq m1, m0 ; t2 t1 t0 lt l0 l1 l2 l3
- movq m0, m1
- movq m2, m1
- movq m5, m1
- psrlq m0, 16 ; .. .. t2 t1 t0 lt l0 l1
- psrlq m2, 8 ; .. t2 t1 t0 lt l0 l1 l2
- pavgb m5, m2
- PRED4x4_LOWPASS m3, m1, m0, m2, m4
- punpcklbw m5, m3
- psrlq m3, 32
- PALIGNR m3, m5, 6, m4
- movh [r1+r2*2], m5
- psrlq m5, 16
- movh [r1+r2*1], m5
- psrlq m5, 16
- movh [r0+r2*2], m5
- movh [r0+r2*1], m3
- RET
- ;-----------------------------------------------------------------------------
- ; void ff_pred4x4_vertical_right_8_mmxext(uint8_t *src,
- ; const uint8_t *topright, int stride)
- ;-----------------------------------------------------------------------------
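- ; Vertical-right (H.264 Intra_4x4 mode 5): roughly, rows 0 and 2 are pavgb
- ; averages of the top row with its copy shifted to include the top-left
- ; sample, rows 1 and 3 the 1-2-1 filtered edge; column 0 of the lower rows
- ; is filled from the filtered left samples via the PALIGNR shifts at the end.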
- INIT_MMX mmxext
- cglobal pred4x4_vertical_right_8, 3,3
- sub r0, r2
- lea r1, [r0+r2*2]
- movh m0, [r0] ; ........t3t2t1t0
- movq m5, m0
- PALIGNR m0, [r0-8], 7, m1 ; ......t3t2t1t0lt
- pavgb m5, m0
- PALIGNR m0, [r0+r2*1-8], 7, m1 ; ....t3t2t1t0ltl0
- movq m1, m0
- PALIGNR m0, [r0+r2*2-8], 7, m2 ; ..t3t2t1t0ltl0l1
- movq m2, m0
- PALIGNR m0, [r1+r2*1-8], 7, m3 ; t3t2t1t0ltl0l1l2
- PRED4x4_LOWPASS m3, m1, m0, m2, m4
- movq m1, m3
- psrlq m3, 16
- psllq m1, 48
- movh [r0+r2*1], m5
- movh [r0+r2*2], m3
- PALIGNR m5, m1, 7, m2
- psllq m1, 8
- movh [r1+r2*1], m5
- PALIGNR m3, m1, 7, m1
- movh [r1+r2*2], m3
- RET
- ;-----------------------------------------------------------------------------
- ; void ff_pred4x4_down_right_8_mmxext(uint8_t *src, const uint8_t *topright,
- ; int stride)
- ;-----------------------------------------------------------------------------
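- ; Diagonal down-right (H.264 Intra_4x4 mode 4): the left column, top-left
- ; and top row are packed into one 8-byte vector and 1-2-1 lowpass filtered;
- ; each row of the block is that filtered vector shifted right by one more
- ; byte, so pixels along each down-right diagonal share one value.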
- INIT_MMX mmxext
- cglobal pred4x4_down_right_8, 3,3
- sub r0, r2
- lea r1, [r0+r2*2]
- movq m1, [r1-8]
- movq m2, [r0+r2*1-8]
- punpckhbw m2, [r0-8]
- movh m3, [r0]
- punpckhwd m1, m2
- PALIGNR m3, m1, 5, m1
- movq m1, m3
- PALIGNR m3, [r1+r2*1-8], 7, m4
- movq m2, m3
- PALIGNR m3, [r1+r2*2-8], 7, m4
- PRED4x4_LOWPASS m0, m3, m1, m2, m4
- movh [r1+r2*2], m0
- psrlq m0, 8
- movh [r1+r2*1], m0
- psrlq m0, 8
- movh [r0+r2*2], m0
- psrlq m0, 8
- movh [r0+r2*1], m0
- RET