/*
 * Copyright (c) 2015 - 2017 Manojkumar Bhosale (Manojkumar.Bhosale@imgtec.com)
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/mips/generic_macros_msa.h"
#include "libavcodec/mips/hevcdsp_mips.h"
#include "libavcodec/mips/hevc_macros_msa.h"

static const uint8_t ff_hevc_mask_arr[16 * 2] __attribute__((aligned(0x40))) = {
    /* 8 width cases */
    0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
    0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20
};
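
/* Bi-weighted round-and-clip helpers: interleave the two 16-bit inputs,
 * combine them with the packed (weight0, weight1) pair and the preloaded
 * offset via a widening dot product, shift right with rounding by (rnd),
 * pack back to 16 bits and clamp the result to the 8-bit range [0, 255]. */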
#define HEVC_BIW_RND_CLIP2(in0, in1, vec0, vec1, wgt, rnd, offset,      \
                           out0, out1)                                  \
{                                                                       \
    v4i32 out0_r, out1_r, out0_l, out1_l;                               \
                                                                        \
    ILVR_H2_SW(in0, vec0, in1, vec1, out0_r, out1_r);                   \
    ILVL_H2_SW(in0, vec0, in1, vec1, out0_l, out1_l);                   \
                                                                        \
    out0_r = __msa_dpadd_s_w(offset, (v8i16) out0_r, (v8i16) wgt);      \
    out1_r = __msa_dpadd_s_w(offset, (v8i16) out1_r, (v8i16) wgt);      \
    out0_l = __msa_dpadd_s_w(offset, (v8i16) out0_l, (v8i16) wgt);      \
    out1_l = __msa_dpadd_s_w(offset, (v8i16) out1_l, (v8i16) wgt);      \
                                                                        \
    SRAR_W4_SW(out0_r, out1_r, out0_l, out1_l, rnd);                    \
    PCKEV_H2_SH(out0_l, out0_r, out1_l, out1_r, out0, out1);            \
    CLIP_SH2_0_255(out0, out1);                                         \
}

#define HEVC_BIW_RND_CLIP4(in0, in1, in2, in3, vec0, vec1, vec2, vec3,       \
                           wgt, rnd, offset, out0, out1, out2, out3)         \
{                                                                            \
    HEVC_BIW_RND_CLIP2(in0, in1, vec0, vec1, wgt, rnd, offset, out0, out1);  \
    HEVC_BIW_RND_CLIP2(in2, in3, vec2, vec3, wgt, rnd, offset, out2, out3);  \
}

#define HEVC_BIW_RND_CLIP2_MAX_SATU(in0, in1, vec0, vec1, wgt, rnd,     \
                                    offset, out0, out1)                 \
{                                                                       \
    v4i32 out0_r, out1_r, out0_l, out1_l;                               \
                                                                        \
    ILVR_H2_SW(in0, vec0, in1, vec1, out0_r, out1_r);                   \
    ILVL_H2_SW(in0, vec0, in1, vec1, out0_l, out1_l);                   \
    out0_r = __msa_dpadd_s_w(offset, (v8i16) out0_r, (v8i16) wgt);      \
    out1_r = __msa_dpadd_s_w(offset, (v8i16) out1_r, (v8i16) wgt);      \
    out0_l = __msa_dpadd_s_w(offset, (v8i16) out0_l, (v8i16) wgt);      \
    out1_l = __msa_dpadd_s_w(offset, (v8i16) out1_l, (v8i16) wgt);      \
    SRAR_W4_SW(out0_r, out1_r, out0_l, out1_l, rnd);                    \
    PCKEV_H2_SH(out0_l, out0_r, out1_l, out1_r, out0, out1);            \
    CLIP_SH2_0_255_MAX_SATU(out0, out1);                                \
}

#define HEVC_BIW_RND_CLIP4_MAX_SATU(in0, in1, in2, in3, vec0, vec1, vec2,  \
                                    vec3, wgt, rnd, offset, out0, out1,    \
                                    out2, out3)                            \
{                                                                          \
    HEVC_BIW_RND_CLIP2_MAX_SATU(in0, in1, vec0, vec1, wgt, rnd, offset,    \
                                out0, out1);                               \
    HEVC_BIW_RND_CLIP2_MAX_SATU(in2, in3, vec2, vec3, wgt, rnd, offset,    \
                                out2, out3);                               \
}
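
/* hevc_biwgt_copy_*_msa: bi-weighted prediction without interpolation.
 * Pixels from src0_ptr are widened to 16 bits and shifted left by 6 to
 * match the 16-bit intermediates in src1_ptr, then both are weighted,
 * offset, rounded and clipped by the macros above.  One variant per
 * block width. */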
static void hevc_biwgt_copy_4w_msa(uint8_t *src0_ptr,
                                   int32_t src_stride,
                                   int16_t *src1_ptr,
                                   int32_t src2_stride,
                                   uint8_t *dst,
                                   int32_t dst_stride,
                                   int32_t height,
                                   int32_t weight0,
                                   int32_t weight1,
                                   int32_t offset0,
                                   int32_t offset1,
                                   int32_t rnd_val)
{
    uint32_t loop_cnt, tp0, tp1, tp2, tp3;
    uint64_t tpd0, tpd1, tpd2, tpd3;
    int32_t offset, weight;
    v16u8 out0, out1;
    v16i8 zero = { 0 };
    v16i8 src0 = { 0 }, src1 = { 0 };
    v8i16 in0 = { 0 }, in1 = { 0 }, in2 = { 0 }, in3 = { 0 };
    v8i16 dst0, dst1, dst2, dst3, weight_vec;
    v4i32 dst0_r, dst0_l, offset_vec, rnd_vec;

    offset = (offset0 + offset1) << rnd_val;
    weight0 = weight0 & 0x0000FFFF;
    weight = weight0 | (weight1 << 16);
    offset_vec = __msa_fill_w(offset);
    weight_vec = (v8i16) __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val + 1);

    if (2 == height) {
        LW2(src0_ptr, src_stride, tp0, tp1);
        INSERT_W2_SB(tp0, tp1, src0);
        LD2(src1_ptr, src2_stride, tpd0, tpd1);
        INSERT_D2_SH(tpd0, tpd1, in0);
        dst0 = (v8i16) __msa_ilvr_b(zero, src0);
        dst0 <<= 6;
        ILVRL_H2_SW(dst0, in0, dst0_r, dst0_l);
        dst0_r = __msa_dpadd_s_w(offset_vec, (v8i16) dst0_r, weight_vec);
        dst0_l = __msa_dpadd_s_w(offset_vec, (v8i16) dst0_l, weight_vec);
        SRAR_W2_SW(dst0_r, dst0_l, rnd_vec);
        dst0 = (v8i16) __msa_pckev_h((v8i16) dst0_l, (v8i16) dst0_r);
        dst0 = CLIP_SH_0_255_MAX_SATU(dst0);
        out0 = (v16u8) __msa_pckev_b((v16i8) dst0, (v16i8) dst0);
        ST_W2(out0, 0, 1, dst, dst_stride);
    } else if (4 == height) {
        LW4(src0_ptr, src_stride, tp0, tp1, tp2, tp3);
        INSERT_W4_SB(tp0, tp1, tp2, tp3, src0);
        LD4(src1_ptr, src2_stride, tpd0, tpd1, tpd2, tpd3);
        INSERT_D2_SH(tpd0, tpd1, in0);
        INSERT_D2_SH(tpd2, tpd3, in1);
        ILVRL_B2_SH(zero, src0, dst0, dst1);
        SLLI_2V(dst0, dst1, 6);
        HEVC_BIW_RND_CLIP2_MAX_SATU(dst0, dst1, in0, in1, weight_vec, rnd_vec,
                                    offset_vec, dst0, dst1);
        out0 = (v16u8) __msa_pckev_b((v16i8) dst1, (v16i8) dst0);
        ST_W4(out0, 0, 1, 2, 3, dst, dst_stride);
    } else if (0 == height % 8) {
        for (loop_cnt = (height >> 3); loop_cnt--;) {
            LW4(src0_ptr, src_stride, tp0, tp1, tp2, tp3);
            src0_ptr += 4 * src_stride;
            INSERT_W4_SB(tp0, tp1, tp2, tp3, src0);
            LW4(src0_ptr, src_stride, tp0, tp1, tp2, tp3);
            src0_ptr += 4 * src_stride;
            INSERT_W4_SB(tp0, tp1, tp2, tp3, src1);
            LD4(src1_ptr, src2_stride, tpd0, tpd1, tpd2, tpd3);
            src1_ptr += (4 * src2_stride);
            INSERT_D2_SH(tpd0, tpd1, in0);
            INSERT_D2_SH(tpd2, tpd3, in1);
            LD4(src1_ptr, src2_stride, tpd0, tpd1, tpd2, tpd3);
            src1_ptr += (4 * src2_stride);
            INSERT_D2_SH(tpd0, tpd1, in2);
            INSERT_D2_SH(tpd2, tpd3, in3);
            ILVRL_B2_SH(zero, src0, dst0, dst1);
            ILVRL_B2_SH(zero, src1, dst2, dst3);
            SLLI_4V(dst0, dst1, dst2, dst3, 6);
            HEVC_BIW_RND_CLIP4_MAX_SATU(dst0, dst1, dst2, dst3, in0, in1, in2,
                                        in3, weight_vec, rnd_vec, offset_vec,
                                        dst0, dst1, dst2, dst3);
            PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
            ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
            dst += (8 * dst_stride);
        }
    }
}
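
/* 6-pixel-wide variant: each output row is stored as one 32-bit word plus
 * one 16-bit halfword. */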
static void hevc_biwgt_copy_6w_msa(uint8_t *src0_ptr,
                                   int32_t src_stride,
                                   int16_t *src1_ptr,
                                   int32_t src2_stride,
                                   uint8_t *dst,
                                   int32_t dst_stride,
                                   int32_t height,
                                   int32_t weight0,
                                   int32_t weight1,
                                   int32_t offset0,
                                   int32_t offset1,
                                   int32_t rnd_val)
{
    uint32_t loop_cnt;
    int32_t offset, weight;
    uint64_t tp0, tp1, tp2, tp3;
    v16u8 out0, out1;
    v16i8 zero = { 0 };
    v16i8 src0 = { 0 }, src1 = { 0 };
    v8i16 in0, in1, in2, in3;
    v8i16 dst0, dst1, dst2, dst3;
    v4i32 offset_vec, weight_vec, rnd_vec;

    offset = (offset0 + offset1) << rnd_val;
    weight0 = weight0 & 0x0000FFFF;
    weight = weight0 | (weight1 << 16);
    weight_vec = __msa_fill_w(weight);
    offset_vec = __msa_fill_w(offset);
    rnd_vec = __msa_fill_w(rnd_val + 1);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD4(src0_ptr, src_stride, tp0, tp1, tp2, tp3);
        src0_ptr += (4 * src_stride);
        INSERT_D2_SB(tp0, tp1, src0);
        INSERT_D2_SB(tp2, tp3, src1);
        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
        src1_ptr += (4 * src2_stride);
        ILVRL_B2_SH(zero, src0, dst0, dst1);
        ILVRL_B2_SH(zero, src1, dst2, dst3);
        SLLI_4V(dst0, dst1, dst2, dst3, 6);
        HEVC_BIW_RND_CLIP4_MAX_SATU(dst0, dst1, dst2, dst3,
                                    in0, in1, in2, in3,
                                    weight_vec, rnd_vec, offset_vec,
                                    dst0, dst1, dst2, dst3);
        PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
        ST_W2(out0, 0, 2, dst, dst_stride);
        ST_H2(out0, 2, 6, dst + 4, dst_stride);
        ST_W2(out1, 0, 2, dst + 2 * dst_stride, dst_stride);
        ST_H2(out1, 2, 6, dst + 2 * dst_stride + 4, dst_stride);
        dst += (4 * dst_stride);
    }
}
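
/* 8-pixel-wide variant: special cases for heights of 2 and 6, otherwise a
 * loop over four rows at a time. */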
static void hevc_biwgt_copy_8w_msa(uint8_t *src0_ptr,
                                   int32_t src_stride,
                                   int16_t *src1_ptr,
                                   int32_t src2_stride,
                                   uint8_t *dst,
                                   int32_t dst_stride,
                                   int32_t height,
                                   int32_t weight0,
                                   int32_t weight1,
                                   int32_t offset0,
                                   int32_t offset1,
                                   int32_t rnd_val)
{
    uint64_t tp0, tp1, tp2, tp3;
    int32_t offset, weight;
    v16u8 out0, out1, out2;
    v16i8 zero = { 0 };
    v16i8 src0 = { 0 }, src1 = { 0 }, src2 = { 0 };
    v8i16 in0, in1, in2, in3, in4, in5;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
    v4i32 offset_vec, weight_vec, rnd_vec;

    offset = (offset0 + offset1) << rnd_val;
    weight0 = weight0 & 0x0000FFFF;
    weight = weight0 | (weight1 << 16);
    offset_vec = __msa_fill_w(offset);
    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val + 1);

    if (2 == height) {
        LD2(src0_ptr, src_stride, tp0, tp1);
        INSERT_D2_SB(tp0, tp1, src0);
        LD_SH2(src1_ptr, src2_stride, in0, in1);
        ILVRL_B2_SH(zero, src0, dst0, dst1);
        SLLI_2V(dst0, dst1, 6);
        HEVC_BIW_RND_CLIP2(dst0, dst1, in0, in1,
                           weight_vec, rnd_vec, offset_vec,
                           dst0, dst1);
        out0 = (v16u8) __msa_pckev_b((v16i8) dst1, (v16i8) dst0);
        ST_D2(out0, 0, 1, dst, dst_stride);
    } else if (6 == height) {
        LD4(src0_ptr, src_stride, tp0, tp1, tp2, tp3);
        src0_ptr += 4 * src_stride;
        INSERT_D2_SB(tp0, tp1, src0);
        INSERT_D2_SB(tp2, tp3, src1);
        LD2(src0_ptr, src_stride, tp0, tp1);
        INSERT_D2_SB(tp0, tp1, src2);
        ILVRL_B2_SH(zero, src0, dst0, dst1);
        ILVRL_B2_SH(zero, src1, dst2, dst3);
        ILVRL_B2_SH(zero, src2, dst4, dst5);
        LD_SH6(src1_ptr, src2_stride, in0, in1, in2, in3, in4, in5);
        SLLI_4V(dst0, dst1, dst2, dst3, 6);
        SLLI_2V(dst4, dst5, 6);
        HEVC_BIW_RND_CLIP4_MAX_SATU(dst0, dst1, dst2, dst3, in0, in1, in2, in3,
                                    weight_vec, rnd_vec, offset_vec, dst0, dst1,
                                    dst2, dst3);
        HEVC_BIW_RND_CLIP2_MAX_SATU(dst4, dst5, in4, in5, weight_vec, rnd_vec,
                                    offset_vec, dst4, dst5);
        PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
        ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
        ST_D2(out2, 0, 1, dst + 4 * dst_stride, dst_stride);
    } else if (0 == height % 4) {
        uint32_t loop_cnt;

        for (loop_cnt = (height >> 2); loop_cnt--;) {
            LD4(src0_ptr, src_stride, tp0, tp1, tp2, tp3);
            src0_ptr += (4 * src_stride);
            INSERT_D2_SB(tp0, tp1, src0);
            INSERT_D2_SB(tp2, tp3, src1);
            ILVRL_B2_SH(zero, src0, dst0, dst1);
            ILVRL_B2_SH(zero, src1, dst2, dst3);
            LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
            src1_ptr += (4 * src2_stride);
            SLLI_4V(dst0, dst1, dst2, dst3, 6);
            HEVC_BIW_RND_CLIP4_MAX_SATU(dst0, dst1, dst2, dst3, in0, in1, in2,
                                        in3, weight_vec, rnd_vec, offset_vec,
                                        dst0, dst1, dst2, dst3);
            PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
            ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
            dst += (4 * dst_stride);
        }
    }
}
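
/* 12-pixel-wide variant: handled as an 8-wide column plus a 4-wide column
 * per row. */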
static void hevc_biwgt_copy_12w_msa(uint8_t *src0_ptr,
                                    int32_t src_stride,
                                    int16_t *src1_ptr,
                                    int32_t src2_stride,
                                    uint8_t *dst,
                                    int32_t dst_stride,
                                    int32_t height,
                                    int32_t weight0,
                                    int32_t weight1,
                                    int32_t offset0,
                                    int32_t offset1,
                                    int32_t rnd_val)
{
    uint32_t loop_cnt;
    int32_t offset, weight;
    v16i8 zero = { 0 };
    v16u8 out0, out1, out2;
    v16i8 src0, src1, src2, src3;
    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
    v4i32 offset_vec, weight_vec, rnd_vec;

    offset = (offset0 + offset1) << rnd_val;
    weight0 = weight0 & 0x0000FFFF;
    weight = weight0 | (weight1 << 16);
    offset_vec = __msa_fill_w(offset);
    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val + 1);

    for (loop_cnt = (16 >> 2); loop_cnt--;) {
        LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3);
        src0_ptr += (4 * src_stride);
        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
        LD_SH4(src1_ptr + 8, src2_stride, in4, in5, in6, in7);
        src1_ptr += (4 * src2_stride);
        ILVR_D2_SH(in5, in4, in7, in6, in4, in5);
        ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3,
                   dst0, dst1, dst2, dst3);
        SLLI_4V(dst0, dst1, dst2, dst3, 6);
        ILVL_W2_SB(src1, src0, src3, src2, src0, src1);
        ILVR_B2_SH(zero, src0, zero, src1, dst4, dst5);
        dst4 <<= 6;
        dst5 <<= 6;
        HEVC_BIW_RND_CLIP4_MAX_SATU(dst0, dst1, dst2, dst3, in0, in1, in2, in3,
                                    weight_vec, rnd_vec, offset_vec, dst0, dst1,
                                    dst2, dst3);
        HEVC_BIW_RND_CLIP2_MAX_SATU(dst4, dst5, in4, in5, weight_vec, rnd_vec,
                                    offset_vec, dst4, dst5);
        PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
        ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
        ST_W4(out2, 0, 1, 2, 3, dst + 8, dst_stride);
        dst += (4 * dst_stride);
    }
}
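
/* 16-pixel-wide variant: four full rows per iteration. */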
static void hevc_biwgt_copy_16w_msa(uint8_t *src0_ptr,
                                    int32_t src_stride,
                                    int16_t *src1_ptr,
                                    int32_t src2_stride,
                                    uint8_t *dst,
                                    int32_t dst_stride,
                                    int32_t height,
                                    int32_t weight0,
                                    int32_t weight1,
                                    int32_t offset0,
                                    int32_t offset1,
                                    int32_t rnd_val)
{
    uint32_t loop_cnt;
    int32_t offset, weight;
    v16u8 out0, out1, out2, out3;
    v16i8 zero = { 0 };
    v16i8 src0, src1, src2, src3;
    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
    v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
    v4i32 offset_vec, weight_vec, rnd_vec;

    offset = (offset0 + offset1) << rnd_val;
    weight0 = weight0 & 0x0000FFFF;
    weight = weight0 | (weight1 << 16);
    offset_vec = __msa_fill_w(offset);
    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val + 1);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3);
        src0_ptr += (4 * src_stride);
        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
        LD_SH4(src1_ptr + 8, src2_stride, in4, in5, in6, in7);
        src1_ptr += (4 * src2_stride);
        ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3, tmp0, tmp1,
                   tmp2, tmp3);
        ILVL_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3, tmp4, tmp5,
                   tmp6, tmp7);
        SLLI_4V(tmp0, tmp1, tmp2, tmp3, 6);
        SLLI_4V(tmp4, tmp5, tmp6, tmp7, 6);
        HEVC_BIW_RND_CLIP4_MAX_SATU(tmp0, tmp1, tmp4, tmp5, in0, in1, in4, in5,
                                    weight_vec, rnd_vec, offset_vec, tmp0, tmp1,
                                    tmp4, tmp5);
        HEVC_BIW_RND_CLIP4_MAX_SATU(tmp2, tmp3, tmp6, tmp7, in2, in3, in6, in7,
                                    weight_vec, rnd_vec, offset_vec, tmp2, tmp3,
                                    tmp6, tmp7);
        PCKEV_B2_UB(tmp4, tmp0, tmp5, tmp1, out0, out1);
        PCKEV_B2_UB(tmp6, tmp2, tmp7, tmp3, out2, out3);
        ST_UB4(out0, out1, out2, out3, dst, dst_stride);
        dst += (4 * dst_stride);
    }
}
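
/* 24-pixel-wide variant: a 16-byte store plus an 8-byte store per row. */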
static void hevc_biwgt_copy_24w_msa(uint8_t *src0_ptr,
                                    int32_t src_stride,
                                    int16_t *src1_ptr,
                                    int32_t src2_stride,
                                    uint8_t *dst,
                                    int32_t dst_stride,
                                    int32_t height,
                                    int32_t weight0,
                                    int32_t weight1,
                                    int32_t offset0,
                                    int32_t offset1,
                                    int32_t rnd_val)
{
    uint32_t loop_cnt;
    int32_t offset, weight;
    v16u8 out0, out1, out2, out3, out4, out5;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, zero = { 0 };
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8, dst9, dst10;
    v8i16 in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11, dst11;
    v4i32 offset_vec, weight_vec, rnd_vec;

    offset = (offset0 + offset1) << rnd_val;
    weight0 = weight0 & 0x0000FFFF;
    weight = weight0 | (weight1 << 16);
    offset_vec = __msa_fill_w(offset);
    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val + 1);

    for (loop_cnt = 8; loop_cnt--;) {
        LD_SB4(src0_ptr, src_stride, src0, src1, src4, src5);
        LD_SB4(src0_ptr + 16, src_stride, src2, src3, src6, src7);
        src0_ptr += (4 * src_stride);
        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
        LD_SH4(src1_ptr + 8, src2_stride, in4, in5, in6, in7);
        LD_SH4(src1_ptr + 16, src2_stride, in8, in9, in10, in11);
        src1_ptr += (4 * src2_stride);
        ILVRL_B2_SH(zero, src0, dst0, dst1);
        ILVRL_B2_SH(zero, src1, dst2, dst3);
        ILVR_B2_SH(zero, src2, zero, src3, dst4, dst5);
        ILVRL_B2_SH(zero, src4, dst6, dst7);
        ILVRL_B2_SH(zero, src5, dst8, dst9);
        ILVR_B2_SH(zero, src6, zero, src7, dst10, dst11);
        SLLI_4V(dst0, dst1, dst2, dst3, 6);
        SLLI_4V(dst4, dst5, dst6, dst7, 6);
        SLLI_4V(dst8, dst9, dst10, dst11, 6);
        HEVC_BIW_RND_CLIP4_MAX_SATU(dst0, dst1, dst2, dst3, in0, in4, in1, in5,
                                    weight_vec, rnd_vec, offset_vec, dst0, dst1,
                                    dst2, dst3);
        HEVC_BIW_RND_CLIP4_MAX_SATU(dst4, dst5, dst6, dst7, in8, in9, in2, in6,
                                    weight_vec, rnd_vec, offset_vec, dst4, dst5,
                                    dst6, dst7);
        HEVC_BIW_RND_CLIP4_MAX_SATU(dst8, dst9, dst10, dst11, in3, in7, in10,
                                    in11, weight_vec, rnd_vec, offset_vec,
                                    dst8, dst9, dst10, dst11);
        PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
        PCKEV_B3_UB(dst7, dst6, dst9, dst8, dst11, dst10, out3, out4, out5);
        ST_UB4(out0, out1, out3, out4, dst, dst_stride);
        ST_D4(out2, out5, 0, 1, 0, 1, dst + 16, dst_stride);
        dst += (4 * dst_stride);
    }
}
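
/* 32-pixel-wide variant: two 16-byte stores per row, two rows per
 * iteration. */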
static void hevc_biwgt_copy_32w_msa(uint8_t *src0_ptr,
                                    int32_t src_stride,
                                    int16_t *src1_ptr,
                                    int32_t src2_stride,
                                    uint8_t *dst,
                                    int32_t dst_stride,
                                    int32_t height,
                                    int32_t weight0,
                                    int32_t weight1,
                                    int32_t offset0,
                                    int32_t offset1,
                                    int32_t rnd_val)
{
    uint32_t loop_cnt;
    int32_t offset, weight;
    v16u8 out0, out1, out2, out3;
    v16i8 zero = { 0 };
    v16i8 src0, src1, src2, src3;
    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
    v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
    v4i32 offset_vec, weight_vec, rnd_vec;

    offset = (offset0 + offset1) << rnd_val;
    weight0 = weight0 & 0x0000FFFF;
    weight = weight0 | (weight1 << 16);
    offset_vec = __msa_fill_w(offset);
    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val + 1);

    for (loop_cnt = (height >> 1); loop_cnt--;) {
        LD_SB2(src0_ptr, 16, src0, src1);
        src0_ptr += src_stride;
        LD_SB2(src0_ptr, 16, src2, src3);
        src0_ptr += src_stride;
        LD_SH4(src1_ptr, 8, in0, in1, in2, in3);
        src1_ptr += src2_stride;
        LD_SH4(src1_ptr, 8, in4, in5, in6, in7);
        src1_ptr += src2_stride;
        ILVRL_B2_SH(zero, src0, tmp0, tmp4);
        ILVRL_B2_SH(zero, src1, tmp1, tmp5);
        ILVRL_B2_SH(zero, src2, tmp2, tmp6);
        ILVRL_B2_SH(zero, src3, tmp3, tmp7);
        SLLI_4V(tmp0, tmp1, tmp2, tmp3, 6);
        SLLI_4V(tmp4, tmp5, tmp6, tmp7, 6);
        HEVC_BIW_RND_CLIP4_MAX_SATU(tmp0, tmp4, tmp1, tmp5, in0, in1, in2, in3,
                                    weight_vec, rnd_vec, offset_vec, tmp0, tmp4,
                                    tmp1, tmp5);
        HEVC_BIW_RND_CLIP4_MAX_SATU(tmp2, tmp6, tmp3, tmp7, in4, in5, in6, in7,
                                    weight_vec, rnd_vec, offset_vec, tmp2, tmp6,
                                    tmp3, tmp7);
        PCKEV_B2_UB(tmp4, tmp0, tmp5, tmp1, out0, out1);
        PCKEV_B2_UB(tmp6, tmp2, tmp7, tmp3, out2, out3);
        ST_UB2(out0, out1, dst, 16);
        dst += dst_stride;
        ST_UB2(out2, out3, dst, 16);
        dst += dst_stride;
    }
}
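
/* 48-pixel-wide variant: three 16-byte stores per row. */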
static void hevc_biwgt_copy_48w_msa(uint8_t *src0_ptr,
                                    int32_t src_stride,
                                    int16_t *src1_ptr,
                                    int32_t src2_stride,
                                    uint8_t *dst,
                                    int32_t dst_stride,
                                    int32_t height,
                                    int32_t weight0,
                                    int32_t weight1,
                                    int32_t offset0,
                                    int32_t offset1,
                                    int32_t rnd_val)
{
    uint32_t loop_cnt;
    int32_t offset, weight;
    v16u8 out0, out1, out2;
    v16i8 src0, src1, src2;
    v16i8 zero = { 0 };
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, in0, in1, in2, in3, in4, in5;
    v4i32 offset_vec, weight_vec, rnd_vec;

    offset = (offset0 + offset1) << rnd_val;
    weight0 = weight0 & 0x0000FFFF;
    weight = weight0 | (weight1 << 16);
    offset_vec = __msa_fill_w(offset);
    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val + 1);

    for (loop_cnt = 64; loop_cnt--;) {
        LD_SB3(src0_ptr, 16, src0, src1, src2);
        src0_ptr += src_stride;
        LD_SH6(src1_ptr, 8, in0, in1, in2, in3, in4, in5);
        src1_ptr += src2_stride;
        ILVRL_B2_SH(zero, src0, dst0, dst1);
        ILVRL_B2_SH(zero, src1, dst2, dst3);
        ILVRL_B2_SH(zero, src2, dst4, dst5);
        SLLI_4V(dst0, dst1, dst2, dst3, 6);
        SLLI_2V(dst4, dst5, 6);
        HEVC_BIW_RND_CLIP4_MAX_SATU(dst0, dst1, dst2, dst3, in0, in1, in2, in3,
                                    weight_vec, rnd_vec, offset_vec, dst0, dst1,
                                    dst2, dst3);
        HEVC_BIW_RND_CLIP2_MAX_SATU(dst4, dst5, in4, in5, weight_vec, rnd_vec,
                                    offset_vec, dst4, dst5);
        PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
        ST_UB2(out0, out1, dst, 16);
        ST_UB(out2, dst + 32);
        dst += dst_stride;
    }
}
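
/* 64-pixel-wide variant: four 16-byte stores per row. */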
static void hevc_biwgt_copy_64w_msa(uint8_t *src0_ptr,
                                    int32_t src_stride,
                                    int16_t *src1_ptr,
                                    int32_t src2_stride,
                                    uint8_t *dst,
                                    int32_t dst_stride,
                                    int32_t height,
                                    int32_t weight0,
                                    int32_t weight1,
                                    int32_t offset0,
                                    int32_t offset1,
                                    int32_t rnd_val)
{
    uint32_t loop_cnt;
    int32_t offset, weight;
    v16u8 out0, out1, out2, out3;
    v16i8 zero = { 0 };
    v16i8 src0, src1, src2, src3;
    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
    v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
    v4i32 offset_vec, weight_vec, rnd_vec;

    offset = (offset0 + offset1) << rnd_val;
    weight0 = weight0 & 0x0000FFFF;
    weight = weight0 | (weight1 << 16);
    offset_vec = __msa_fill_w(offset);
    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val + 1);

    for (loop_cnt = height; loop_cnt--;) {
        LD_SB4(src0_ptr, 16, src0, src1, src2, src3);
        src0_ptr += src_stride;
        LD_SH8(src1_ptr, 8, in0, in1, in2, in3, in4, in5, in6, in7);
        src1_ptr += src2_stride;
        ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3, tmp0, tmp1,
                   tmp2, tmp3);
        ILVL_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3, tmp4, tmp5,
                   tmp6, tmp7);
        SLLI_4V(tmp0, tmp1, tmp2, tmp3, 6);
        SLLI_4V(tmp4, tmp5, tmp6, tmp7, 6);
        HEVC_BIW_RND_CLIP4_MAX_SATU(tmp0, tmp4, tmp1, tmp5, in0, in1, in2, in3,
                                    weight_vec, rnd_vec, offset_vec, tmp0, tmp4,
                                    tmp1, tmp5);
        HEVC_BIW_RND_CLIP4_MAX_SATU(tmp2, tmp6, tmp3, tmp7, in4, in5, in6, in7,
                                    weight_vec, rnd_vec, offset_vec, tmp2, tmp6,
                                    tmp3, tmp7);
        PCKEV_B2_UB(tmp4, tmp0, tmp5, tmp1, out0, out1);
        PCKEV_B2_UB(tmp6, tmp2, tmp7, tmp3, out2, out3);
        ST_UB4(out0, out1, out2, out3, dst, 16);
        dst += dst_stride;
    }
}
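
/* hevc_hz_biwgt_8t_*_msa: horizontal 8-tap filtering combined with
 * bi-weighted prediction.  Source bytes are biased by 128 (XORI_B*_128)
 * before filtering; the (128 * weight1) << 6 term folded into the offset
 * compensates for that bias after weighting. */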
static void hevc_hz_biwgt_8t_4w_msa(uint8_t *src0_ptr,
                                    int32_t src_stride,
                                    int16_t *src1_ptr,
                                    int32_t src2_stride,
                                    uint8_t *dst,
                                    int32_t dst_stride,
                                    const int8_t *filter,
                                    int32_t height,
                                    int32_t weight0,
                                    int32_t weight1,
                                    int32_t offset0,
                                    int32_t offset1,
                                    int32_t rnd_val)
{
    uint32_t loop_cnt;
    int32_t offset, weight, constant;
    v8i16 filt0, filt1, filt2, filt3;
    v16i8 src0, src1, src2, src3;
    v16i8 mask1, mask2, mask3;
    v16i8 vec0, vec1, vec2, vec3;
    v8i16 dst0, dst1;
    v8i16 in0, in1, in2, in3;
    v8i16 filter_vec, out0, out1;
    v4i32 weight_vec, offset_vec, rnd_vec;
    v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[16]);

    src0_ptr -= 3;
    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;

    offset = (offset0 + offset1) << rnd_val;
    weight0 = weight0 & 0x0000FFFF;
    weight = weight0 | (weight1 << 16);
    constant = 128 * weight1;
    constant <<= 6;
    offset += constant;

    offset_vec = __msa_fill_w(offset);
    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val + 1);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3);
        src0_ptr += (4 * src_stride);
        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
        src1_ptr += (4 * src2_stride);
        ILVR_D2_SH(in1, in0, in3, in2, in0, in1);
        XORI_B4_128_SB(src0, src1, src2, src3);
        VSHF_B4_SB(src0, src1, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                                 filt3);
        VSHF_B4_SB(src2, src3, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        dst1 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                                 filt3);
        HEVC_BIW_RND_CLIP2(dst0, dst1, in0, in1,
                           weight_vec, rnd_vec, offset_vec,
                           out0, out1);
        out0 = (v8i16) __msa_pckev_b((v16i8) out1, (v16i8) out0);
        ST_W4(out0, 0, 1, 2, 3, dst, dst_stride);
        dst += (4 * dst_stride);
    }
}
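
/* Horizontal 8-tap bi-weighted filter, 8-pixel-wide blocks. */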
static void hevc_hz_biwgt_8t_8w_msa(uint8_t *src0_ptr,
                                    int32_t src_stride,
                                    int16_t *src1_ptr,
                                    int32_t src2_stride,
                                    uint8_t *dst,
                                    int32_t dst_stride,
                                    const int8_t *filter,
                                    int32_t height,
                                    int32_t weight0,
                                    int32_t weight1,
                                    int32_t offset0,
                                    int32_t offset1,
                                    int32_t rnd_val)
{
    uint32_t loop_cnt;
    int32_t offset, weight, constant;
    v8i16 filt0, filt1, filt2, filt3;
    v16i8 src0, src1, src2, src3;
    v16i8 mask1, mask2, mask3;
    v16i8 vec0, vec1, vec2, vec3;
    v8i16 dst0, dst1, dst2, dst3;
    v8i16 in0, in1, in2, in3;
    v8i16 filter_vec, out0, out1, out2, out3;
    v4i32 weight_vec, offset_vec, rnd_vec;
    v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]);

    src0_ptr -= 3;
    offset = (offset0 + offset1) << rnd_val;
    weight0 = weight0 & 0x0000FFFF;
    weight = weight0 | (weight1 << 16);
    constant = 128 * weight1;
    constant <<= 6;
    offset += constant;

    offset_vec = __msa_fill_w(offset);
    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val + 1);

    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3);
        src0_ptr += (4 * src_stride);
        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
        src1_ptr += (4 * src2_stride);
        XORI_B4_128_SB(src0, src1, src2, src3);
        VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                                 filt3);
        VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        dst1 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                                 filt3);
        VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        dst2 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                                 filt3);
        VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        dst3 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                                 filt3);
        HEVC_BIW_RND_CLIP4(dst0, dst1, dst2, dst3,
                           in0, in1, in2, in3,
                           weight_vec, rnd_vec, offset_vec,
                           out0, out1, out2, out3);
        PCKEV_B2_SH(out1, out0, out3, out2, out0, out1);
        ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
        dst += (4 * dst_stride);
    }
}
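
/* Horizontal 8-tap bi-weighted filter, 12-pixel-wide blocks: an 8-wide
 * pass followed by a 4-wide pass over the remaining column. */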
static void hevc_hz_biwgt_8t_12w_msa(uint8_t *src0_ptr,
                                     int32_t src_stride,
                                     int16_t *src1_ptr,
                                     int32_t src2_stride,
                                     uint8_t *dst,
                                     int32_t dst_stride,
                                     const int8_t *filter,
                                     int32_t height,
                                     int32_t weight0,
                                     int32_t weight1,
                                     int32_t offset0,
                                     int32_t offset1,
                                     int32_t rnd_val)
{
    uint32_t loop_cnt;
    int32_t offset, weight, constant;
    v16i8 src0, src1, src2, src3, vec0, vec1, vec2, vec3;
    v16i8 mask0, mask1, mask2, mask3, mask4, mask5, mask6, mask7;
    v8i16 filt0, filt1, filt2, filt3, out0, out1, out2, out3;
    v8i16 dst0, dst1, dst2, dst3, in0, in1, in2, in3, filter_vec;
    v4i32 weight_vec, offset_vec, rnd_vec;

    src0_ptr -= 3;
    weight0 = weight0 & 0x0000FFFF;
    weight = weight0 | (weight1 << 16);
    constant = 128 * weight1;
    constant <<= 6;
    offset = (offset0 + offset1) << rnd_val;
    offset += constant;

    offset_vec = __msa_fill_w(offset);
    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val + 1);

    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    mask0 = LD_SB(&ff_hevc_mask_arr[0]);
    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;
    mask4 = LD_SB(&ff_hevc_mask_arr[16]);
    mask5 = mask4 + 2;
    mask6 = mask4 + 4;
    mask7 = mask4 + 6;

    for (loop_cnt = 4; loop_cnt--;) {
        LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3);
        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
        XORI_B4_128_SB(src0, src1, src2, src3);
        VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3, vec0, vec1, vec2,
                   vec3);
        dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                                 filt3);
        VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3, vec0, vec1, vec2,
                   vec3);
        dst1 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                                 filt3);
        VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3, vec0, vec1, vec2,
                   vec3);
        dst2 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                                 filt3);
        VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3, vec0, vec1, vec2,
                   vec3);
        dst3 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                                 filt3);
        HEVC_BIW_RND_CLIP4(dst0, dst1, dst2, dst3, in0, in1, in2, in3,
                           weight_vec, rnd_vec, offset_vec, out0, out1, out2,
                           out3);
        PCKEV_B2_SH(out1, out0, out3, out2, out0, out1);
        ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);

        LD_SB4(src0_ptr + 8, src_stride, src0, src1, src2, src3);
        src0_ptr += (4 * src_stride);
        LD_SH4(src1_ptr + 8, src2_stride, in0, in1, in2, in3);
        src1_ptr += (4 * src2_stride);
        ILVR_D2_SH(in1, in0, in3, in2, in0, in1);
        XORI_B4_128_SB(src0, src1, src2, src3);
        VSHF_B4_SB(src0, src1, mask4, mask5, mask6, mask7, vec0, vec1, vec2,
                   vec3);
        dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                                 filt3);
        VSHF_B4_SB(src2, src3, mask4, mask5, mask6, mask7, vec0, vec1, vec2,
                   vec3);
        dst1 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                                 filt3);
        HEVC_BIW_RND_CLIP2(dst0, dst1, in0, in1, weight_vec, rnd_vec,
                           offset_vec, out0, out1);
        out0 = (v8i16) __msa_pckev_b((v16i8) out1, (v16i8) out0);
        ST_W4(out0, 0, 1, 2, 3, dst + 8, dst_stride);
        dst += (4 * dst_stride);
    }
}
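
/* Horizontal 8-tap bi-weighted filter, 16-pixel-wide blocks. */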
static void hevc_hz_biwgt_8t_16w_msa(uint8_t *src0_ptr,
                                     int32_t src_stride,
                                     int16_t *src1_ptr,
                                     int32_t src2_stride,
                                     uint8_t *dst,
                                     int32_t dst_stride,
                                     const int8_t *filter,
                                     int32_t height,
                                     int32_t weight0,
                                     int32_t weight1,
                                     int32_t offset0,
                                     int32_t offset1,
                                     int32_t rnd_val)
{
    uint32_t loop_cnt;
    int32_t offset, weight, constant;
    v16i8 src0, src1, src2, src3;
    v8i16 in0, in1, in2, in3;
    v8i16 filt0, filt1, filt2, filt3;
    v16i8 mask1, mask2, mask3;
    v8i16 filter_vec, out0, out1, out2, out3;
    v16i8 vec0, vec1, vec2, vec3;
    v8i16 dst0, dst1, dst2, dst3;
    v4i32 weight_vec, offset_vec, rnd_vec;
    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };

    src0_ptr -= 3;
    offset = (offset0 + offset1) << rnd_val;
    weight0 = weight0 & 0x0000FFFF;
    weight = weight0 | (weight1 << 16);
    constant = 128 * weight1;
    constant <<= 6;
    offset += constant;

    offset_vec = __msa_fill_w(offset);
    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val + 1);

    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;

    for (loop_cnt = (height >> 1); loop_cnt--;) {
        LD_SB2(src0_ptr, 8, src0, src1);
        src0_ptr += src_stride;
        LD_SB2(src0_ptr, 8, src2, src3);
        src0_ptr += src_stride;
        LD_SH2(src1_ptr, 8, in0, in1);
        src1_ptr += src2_stride;
        LD_SH2(src1_ptr, 8, in2, in3);
        src1_ptr += src2_stride;
        XORI_B4_128_SB(src0, src1, src2, src3);
        VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                                 filt3);
        VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        dst1 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                                 filt3);
        VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        dst2 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                                 filt3);
        VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        dst3 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                                 filt3);
        HEVC_BIW_RND_CLIP4(dst0, dst1, dst2, dst3,
                           in0, in1, in2, in3,
                           weight_vec, rnd_vec, offset_vec,
                           out0, out1, out2, out3);
        PCKEV_B2_SH(out1, out0, out3, out2, out0, out1);
        ST_SH2(out0, out1, dst, dst_stride);
        dst += (2 * dst_stride);
    }
}
static void hevc_hz_biwgt_8t_24w_msa(uint8_t *src0_ptr,
                                     int32_t src_stride,
                                     int16_t *src1_ptr,
                                     int32_t src2_stride,
                                     uint8_t *dst,
                                     int32_t dst_stride,
                                     const int8_t *filter,
                                     int32_t height,
                                     int32_t weight0,
                                     int32_t weight1,
                                     int32_t offset0,
                                     int32_t offset1,
                                     int32_t rnd_val)
{
    uint32_t loop_cnt;
    uint64_t dst_val0;
    int32_t offset, weight, constant;
    v16i8 src0, src1;
    v8i16 in0, in1, in2;
    v8i16 filt0, filt1, filt2, filt3;
    v16i8 mask1, mask2, mask3, mask4, mask5, mask6, mask7;
    v16i8 vec0, vec1, vec2, vec3;
    v8i16 dst0, dst1, dst2;
    v4i32 dst2_r, dst2_l;
    v8i16 filter_vec, out0, out1, out2;
    v4i32 weight_vec, offset_vec, rnd_vec;
    v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]);

    src0_ptr = src0_ptr - 3;
    offset = (offset0 + offset1) << rnd_val;
    weight0 = weight0 & 0x0000FFFF;
    weight = weight0 | (weight1 << 16);
    constant = 128 * weight1;
    constant <<= 6;
    offset += constant;

    offset_vec = __msa_fill_w(offset);
    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val + 1);

    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;
    mask4 = mask0 + 8;
    mask5 = mask0 + 10;
    mask6 = mask0 + 12;
    mask7 = mask0 + 14;

    LD_SB2(src0_ptr, 16, src0, src1);
    src0_ptr += src_stride;
    LD_SH2(src1_ptr, 8, in0, in1);
    in2 = LD_SH(src1_ptr + 16);
    src1_ptr += src2_stride;
    XORI_B2_128_SB(src0, src1);

    for (loop_cnt = 31; loop_cnt--;) {
        VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                                 filt3);
        VSHF_B4_SB(src0, src1, mask4, mask5, mask6, mask7,
                   vec0, vec1, vec2, vec3);
        dst1 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                                 filt3);
        VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        dst2 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                                 filt3);

        HEVC_BIW_RND_CLIP2(dst0, dst1, in0, in1,
                           weight_vec, rnd_vec, offset_vec,
                           out0, out1);

        ILVRL_H2_SW(dst2, in2, dst2_r, dst2_l);
        dst2_r = __msa_dpadd_s_w(offset_vec, (v8i16) dst2_r,
                                 (v8i16) weight_vec);
        dst2_l = __msa_dpadd_s_w(offset_vec, (v8i16) dst2_l,
                                 (v8i16) weight_vec);
        SRAR_W2_SW(dst2_r, dst2_l, rnd_vec);
        dst2_r = (v4i32) __msa_pckev_h((v8i16) dst2_l, (v8i16) dst2_r);
        out2 = CLIP_SH_0_255(dst2_r);

        LD_SB2(src0_ptr, 16, src0, src1);
        src0_ptr += src_stride;
        LD_SH2(src1_ptr, 8, in0, in1);
        in2 = LD_SH(src1_ptr + 16);
        src1_ptr += src2_stride;
        XORI_B2_128_SB(src0, src1);

        PCKEV_B2_SH(out1, out0, out2, out2, out0, out2);
        dst_val0 = __msa_copy_u_d((v2i64) out2, 0);
        ST_SH(out0, dst);
        SD(dst_val0, dst + 16);
        dst += dst_stride;
    }

    VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3, vec0, vec1, vec2, vec3);
    dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                             filt3);
    VSHF_B4_SB(src0, src1, mask4, mask5, mask6, mask7, vec0, vec1, vec2, vec3);
    dst1 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                             filt3);
    VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3, vec0, vec1, vec2, vec3);
    dst2 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                             filt3);
    HEVC_BIW_RND_CLIP2(dst0, dst1, in0, in1, weight_vec, rnd_vec, offset_vec,
                       out0, out1);
    ILVRL_H2_SW(dst2, in2, dst2_r, dst2_l);
    dst2_r = __msa_dpadd_s_w(offset_vec, (v8i16) dst2_r, (v8i16) weight_vec);
    dst2_l = __msa_dpadd_s_w(offset_vec, (v8i16) dst2_l, (v8i16) weight_vec);
    SRAR_W2_SW(dst2_r, dst2_l, rnd_vec);
    dst2_r = (v4i32) __msa_pckev_h((v8i16) dst2_l, (v8i16) dst2_r);
    out2 = CLIP_SH_0_255(dst2_r);
    PCKEV_B2_SH(out1, out0, out2, out2, out0, out2);
    dst_val0 = __msa_copy_u_d((v2i64) out2, 0);
    ST_SH(out0, dst);
    SD(dst_val0, dst + 16);
    dst += dst_stride;
}
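
/* The 24-wide horizontal path above is software-pipelined: the loop runs 31
 * times with the next row's loads issued before the current row's stores,
 * and the 32nd row is handled by the peeled code after the loop.  The height
 * argument is not consulted here, so this path effectively assumes a 32-row
 * block.
 */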
static void hevc_hz_biwgt_8t_32w_msa(uint8_t *src0_ptr,
                                     int32_t src_stride,
                                     int16_t *src1_ptr,
                                     int32_t src2_stride,
                                     uint8_t *dst,
                                     int32_t dst_stride,
                                     const int8_t *filter,
                                     int32_t height,
                                     int32_t weight0,
                                     int32_t weight1,
                                     int32_t offset0,
                                     int32_t offset1,
                                     int32_t rnd_val)
{
    uint32_t loop_cnt;
    int32_t offset, weight, constant;
    v16i8 src0, src1, src2;
    v8i16 in0, in1, in2, in3;
    v8i16 filt0, filt1, filt2, filt3;
    v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]);
    v16i8 mask1, mask2, mask3, mask4, mask5, mask6, mask7;
    v16i8 vec0, vec1, vec2, vec3;
    v8i16 dst0, dst1, dst2, dst3;
    v8i16 filter_vec, out0, out1, out2, out3;
    v4i32 weight_vec, offset_vec, rnd_vec;

    src0_ptr -= 3;
    offset = (offset0 + offset1) << rnd_val;
    weight0 = weight0 & 0x0000FFFF;
    weight = weight0 | (weight1 << 16);
    constant = 128 * weight1;
    constant <<= 6;
    offset += constant;

    offset_vec = __msa_fill_w(offset);
    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val + 1);

    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;
    mask4 = mask0 + 8;
    mask5 = mask0 + 10;
    mask6 = mask0 + 12;
    mask7 = mask0 + 14;

    for (loop_cnt = height; loop_cnt--;) {
        LD_SB2(src0_ptr, 16, src0, src1);
        src2 = LD_SB(src0_ptr + 24);
        src0_ptr += src_stride;
        LD_SH4(src1_ptr, 8, in0, in1, in2, in3);
        src1_ptr += src2_stride;
        XORI_B3_128_SB(src0, src1, src2);

        VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                                 filt3);
        VSHF_B4_SB(src0, src1, mask4, mask5, mask6, mask7,
                   vec0, vec1, vec2, vec3);
        dst1 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                                 filt3);
        VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        dst2 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                                 filt3);
        VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        dst3 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                                 filt3);

        HEVC_BIW_RND_CLIP4(dst0, dst1, dst2, dst3,
                           in0, in1, in2, in3,
                           weight_vec, rnd_vec, offset_vec,
                           out0, out1, out2, out3);

        PCKEV_B2_SH(out1, out0, out3, out2, out0, out1);
        ST_SH2(out0, out1, dst, 16);
        dst += dst_stride;
    }
}
static void hevc_hz_biwgt_8t_48w_msa(uint8_t *src0_ptr,
                                     int32_t src_stride,
                                     int16_t *src1_ptr,
                                     int32_t src2_stride,
                                     uint8_t *dst,
                                     int32_t dst_stride,
                                     const int8_t *filter,
                                     int32_t height,
                                     int32_t weight0,
                                     int32_t weight1,
                                     int32_t offset0,
                                     int32_t offset1,
                                     int32_t rnd_val)
{
    uint32_t loop_cnt;
    int32_t offset, weight, constant;
    v16i8 src0, src1, src2, src3, src4;
    v8i16 in0, in1, in2, in3;
    v8i16 filt0, filt1, filt2, filt3;
    v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]);
    v16i8 mask1, mask2, mask3, mask4, mask5, mask6, mask7;
    v16i8 vec0, vec1, vec2, vec3;
    v8i16 dst0, dst1, dst2, dst3;
    v8i16 filter_vec, out0, out1, out2, out3;
    v4i32 weight_vec, offset_vec, rnd_vec;

    src0_ptr -= 3;
    offset = (offset0 + offset1) << rnd_val;
    weight0 = weight0 & 0x0000FFFF;
    weight = weight0 | (weight1 << 16);
    constant = 128 * weight1;
    constant <<= 6;
    offset += constant;

    offset_vec = __msa_fill_w(offset);
    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val + 1);

    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;
    mask4 = mask0 + 8;
    mask5 = mask0 + 10;
    mask6 = mask0 + 12;
    mask7 = mask0 + 14;

    for (loop_cnt = 64; loop_cnt--;) {
        LD_SB2(src0_ptr, 16, src0, src1);
        src2 = LD_SB(src0_ptr + 24);
        LD_SH4(src1_ptr, 8, in0, in1, in2, in3);
        XORI_B3_128_SB(src0, src1, src2);
        LD_SB2(src0_ptr + 32, 8, src3, src4);
        src0_ptr += src_stride;
        XORI_B2_128_SB(src3, src4);

        VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                                 filt3);
        VSHF_B4_SB(src0, src1, mask4, mask5, mask6, mask7,
                   vec0, vec1, vec2, vec3);
        dst1 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                                 filt3);
        VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        dst2 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                                 filt3);
        VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        dst3 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                                 filt3);

        HEVC_BIW_RND_CLIP4(dst0, dst1, dst2, dst3, in0, in1, in2, in3,
                           weight_vec, rnd_vec, offset_vec,
                           out0, out1, out2, out3);

        PCKEV_B2_SH(out1, out0, out3, out2, out0, out1);
        ST_SH2(out0, out1, dst, 16);

        LD_SH2(src1_ptr + 32, 8, in2, in3);
        src1_ptr += src2_stride;

        VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                                 filt3);
        VSHF_B4_SB(src4, src4, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        dst1 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                                 filt3);

        HEVC_BIW_RND_CLIP2(dst0, dst1, in2, in3,
                           weight_vec, rnd_vec, offset_vec,
                           out0, out1);

        out0 = (v8i16) __msa_pckev_b((v16i8) out1, (v16i8) out0);
        ST_SH(out0, dst + 32);
        dst += dst_stride;
    }
}
static void hevc_hz_biwgt_8t_64w_msa(uint8_t *src0_ptr,
                                     int32_t src_stride,
                                     int16_t *src1_ptr,
                                     int32_t src2_stride,
                                     uint8_t *dst,
                                     int32_t dst_stride,
                                     const int8_t *filter,
                                     int32_t height,
                                     int32_t weight0,
                                     int32_t weight1,
                                     int32_t offset0,
                                     int32_t offset1,
                                     int32_t rnd_val)
{
    uint8_t *src0_ptr_tmp;
    uint8_t *dst_tmp;
    int16_t *src1_ptr_tmp;
    uint32_t loop_cnt, cnt;
    int32_t offset, weight, constant;
    v16i8 src0, src1, src2;
    v8i16 in0, in1, in2, in3;
    v8i16 filt0, filt1, filt2, filt3;
    v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]);
    v16i8 mask1, mask2, mask3, mask4, mask5, mask6, mask7;
    v16i8 vec0, vec1, vec2, vec3;
    v8i16 dst0, dst1, dst2, dst3;
    v8i16 filter_vec, out0, out1, out2, out3;
    v4i32 weight_vec, offset_vec, rnd_vec;

    src0_ptr -= 3;
    offset = (offset0 + offset1) << rnd_val;
    weight0 = weight0 & 0x0000FFFF;
    weight = weight0 | (weight1 << 16);
    constant = 128 * weight1;
    constant <<= 6;
    offset += constant;

    offset_vec = __msa_fill_w(offset);
    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val + 1);

    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;
    mask4 = mask0 + 8;
    mask5 = mask0 + 10;
    mask6 = mask0 + 12;
    mask7 = mask0 + 14;

    for (loop_cnt = height; loop_cnt--;) {
        src0_ptr_tmp = src0_ptr;
        dst_tmp = dst;
        src1_ptr_tmp = src1_ptr;

        for (cnt = 2; cnt--;) {
            LD_SB2(src0_ptr_tmp, 16, src0, src1);
            src2 = LD_SB(src0_ptr_tmp + 24);
            src0_ptr_tmp += 32;
            LD_SH4(src1_ptr_tmp, 8, in0, in1, in2, in3);
            src1_ptr_tmp += 32;
            XORI_B3_128_SB(src0, src1, src2);

            VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
                       vec0, vec1, vec2, vec3);
            dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1,
                                     filt2, filt3);
            VSHF_B4_SB(src0, src1, mask4, mask5, mask6, mask7,
                       vec0, vec1, vec2, vec3);
            dst1 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1,
                                     filt2, filt3);
            VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
                       vec0, vec1, vec2, vec3);
            dst2 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1,
                                     filt2, filt3);
            VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
                       vec0, vec1, vec2, vec3);
            dst3 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1,
                                     filt2, filt3);

            HEVC_BIW_RND_CLIP4(dst0, dst1, dst2, dst3,
                               in0, in1, in2, in3,
                               weight_vec, rnd_vec, offset_vec,
                               out0, out1, out2, out3);

            PCKEV_B2_SH(out1, out0, out3, out2, out0, out1);
            ST_SH2(out0, out1, dst_tmp, 16);
            dst_tmp += 32;
        }

        src0_ptr += src_stride;
        src1_ptr += src2_stride;
        dst += dst_stride;
    }
}
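
/* The vertical bi-weight routines below replace the horizontal VSHF byte
 * shuffles with interleaves of vertically adjacent rows (ILVR_B/ILVL_B),
 * feed the resulting byte pairs to DOTP/DPADD dot-products against the
 * packed filter taps, and slide a 7-row history window between iterations.
 * A scalar sketch of the vertical 8-tap on the biased samples (illustrative
 * only; fy[] stands for the 8 taps loaded from "filter"):
 *
 *     filt = 0;
 *     for (k = 0; k < 8; k++)
 *         filt += (src[(y + k - 3) * src_stride + x] - 128) * fy[k];
 *
 * after which "filt" is weighted, rounded and clipped exactly as in the
 * scalar sketch given earlier.
 */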
static void hevc_vt_biwgt_8t_4w_msa(uint8_t *src0_ptr,
                                    int32_t src_stride,
                                    int16_t *src1_ptr,
                                    int32_t src2_stride,
                                    uint8_t *dst,
                                    int32_t dst_stride,
                                    const int8_t *filter,
                                    int32_t height,
                                    int32_t weight0,
                                    int32_t weight1,
                                    int32_t offset0,
                                    int32_t offset1,
                                    int32_t rnd_val)
{
    uint32_t loop_cnt;
    int32_t offset, weight;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v16i8 src11, src12, src13, src14;
    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r;
    v16i8 src21_r, src43_r, src65_r, src87_r, src109_r;
    v16i8 src1110_r, src1211_r, src1312_r, src1413_r;
    v16i8 src2110, src4332, src6554, src8776, src10998;
    v16i8 src12111110, src14131312;
    v8i16 dst10, dst32, dst54, dst76;
    v8i16 filt0, filt1, filt2, filt3;
    v8i16 filter_vec, out0, out1, out2, out3;
    v4i32 weight_vec, weight1_vec, offset_vec, rnd_vec, const_vec;

    src0_ptr -= (3 * src_stride);
    offset = (offset0 + offset1) << rnd_val;
    weight0 = weight0 & 0x0000FFFF;
    weight = weight0 | (weight1 << 16);

    const_vec = __msa_ldi_w(128);
    const_vec <<= 6;
    offset_vec = __msa_fill_w(offset);
    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val + 1);
    weight1_vec = __msa_fill_w(weight1);
    offset_vec += const_vec * weight1_vec;

    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    LD_SB7(src0_ptr, src_stride, src0, src1, src2, src3, src4, src5, src6);
    src0_ptr += (7 * src_stride);

    ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
               src10_r, src32_r, src54_r, src21_r);
    ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
    ILVR_D3_SB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r,
               src2110, src4332, src6554);
    XORI_B3_128_SB(src2110, src4332, src6554);

    for (loop_cnt = (height >> 3); loop_cnt--;) {
        LD_SB8(src0_ptr, src_stride,
               src7, src8, src9, src10, src11, src12, src13, src14);
        src0_ptr += (8 * src_stride);
        LD_SH8(src1_ptr, src2_stride, in0, in1, in2, in3, in4, in5, in6, in7);
        src1_ptr += (8 * src2_stride);

        ILVR_D2_SH(in1, in0, in3, in2, in0, in1);
        ILVR_D2_SH(in5, in4, in7, in6, in2, in3);
        ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9,
                   src76_r, src87_r, src98_r, src109_r);
        ILVR_B4_SB(src11, src10, src12, src11, src13, src12, src14, src13,
                   src1110_r, src1211_r, src1312_r, src1413_r);
        ILVR_D4_SB(src87_r, src76_r, src109_r, src98_r, src1211_r, src1110_r,
                   src1413_r, src1312_r,
                   src8776, src10998, src12111110, src14131312);
        XORI_B4_128_SB(src8776, src10998, src12111110, src14131312);

        DOTP_SB4_SH(src2110, src4332, src6554, src8776, filt0, filt0, filt0,
                    filt0, dst10, dst32, dst54, dst76);
        DPADD_SB4_SH(src4332, src6554, src8776, src10998, filt1, filt1, filt1,
                     filt1, dst10, dst32, dst54, dst76);
        DPADD_SB4_SH(src6554, src8776, src10998, src12111110, filt2, filt2,
                     filt2, filt2, dst10, dst32, dst54, dst76);
        DPADD_SB4_SH(src8776, src10998, src12111110, src14131312, filt3, filt3,
                     filt3, filt3, dst10, dst32, dst54, dst76);

        HEVC_BIW_RND_CLIP4(dst10, dst32, dst54, dst76,
                           in0, in1, in2, in3,
                           weight_vec, rnd_vec, offset_vec,
                           out0, out1, out2, out3);

        PCKEV_B2_SH(out1, out0, out3, out2, out0, out1);
        ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
        dst += (8 * dst_stride);

        src2110 = src10998;
        src4332 = src12111110;
        src6554 = src14131312;
        src6 = src14;
    }
}
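
/* The 4-wide vertical path above packs two 4-pixel row interleaves into each
 * vector (ILVR_D*) and produces 8 output rows per iteration, recycling
 * src2110/src4332/src6554 as the three oldest row-pair histories.
 */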
static void hevc_vt_biwgt_8t_8w_msa(uint8_t *src0_ptr,
                                    int32_t src_stride,
                                    int16_t *src1_ptr,
                                    int32_t src2_stride,
                                    uint8_t *dst,
                                    int32_t dst_stride,
                                    const int8_t *filter,
                                    int32_t height,
                                    int32_t weight0,
                                    int32_t weight1,
                                    int32_t offset0,
                                    int32_t offset1,
                                    int32_t rnd_val)
{
    uint32_t loop_cnt;
    int32_t offset, weight;
    v16i8 src0, src1, src2, src3, src4, src5;
    v16i8 src6, src7, src8, src9, src10;
    v8i16 in0, in1, in2, in3;
    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r;
    v16i8 src21_r, src43_r, src65_r, src87_r, src109_r;
    v8i16 tmp0, tmp1, tmp2, tmp3;
    v8i16 filt0, filt1, filt2, filt3;
    v8i16 filter_vec, out0, out1, out2, out3;
    v4i32 weight_vec, weight1_vec, offset_vec, rnd_vec, const_vec;

    src0_ptr -= (3 * src_stride);
    offset = (offset0 + offset1) << rnd_val;
    weight0 = weight0 & 0x0000FFFF;
    weight = weight0 | (weight1 << 16);

    const_vec = __msa_ldi_w(128);
    const_vec <<= 6;
    offset_vec = __msa_fill_w(offset);
    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val + 1);
    weight1_vec = __msa_fill_w(weight1);
    offset_vec += const_vec * weight1_vec;

    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    LD_SB7(src0_ptr, src_stride, src0, src1, src2, src3, src4, src5, src6);
    src0_ptr += (7 * src_stride);
    XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
    ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
               src10_r, src32_r, src54_r, src21_r);
    ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src0_ptr, src_stride, src7, src8, src9, src10);
        src0_ptr += (4 * src_stride);
        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
        src1_ptr += (4 * src2_stride);
        XORI_B4_128_SB(src7, src8, src9, src10);
        ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9,
                   src76_r, src87_r, src98_r, src109_r);

        DOTP_SB4_SH(src10_r, src21_r, src32_r, src43_r, filt0, filt0, filt0,
                    filt0, tmp0, tmp1, tmp2, tmp3);
        DPADD_SB4_SH(src32_r, src43_r, src54_r, src65_r, filt1, filt1, filt1,
                     filt1, tmp0, tmp1, tmp2, tmp3);
        DPADD_SB4_SH(src54_r, src65_r, src76_r, src87_r, filt2, filt2, filt2,
                     filt2, tmp0, tmp1, tmp2, tmp3);
        DPADD_SB4_SH(src76_r, src87_r, src98_r, src109_r, filt3, filt3, filt3,
                     filt3, tmp0, tmp1, tmp2, tmp3);

        HEVC_BIW_RND_CLIP4(tmp0, tmp1, tmp2, tmp3,
                           in0, in1, in2, in3,
                           weight_vec, rnd_vec, offset_vec,
                           out0, out1, out2, out3);

        PCKEV_B2_SH(out1, out0, out3, out2, out0, out1);
        ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
        dst += (4 * dst_stride);

        src10_r = src54_r;
        src32_r = src76_r;
        src54_r = src98_r;
        src21_r = src65_r;
        src43_r = src87_r;
        src65_r = src109_r;
        src6 = src10;
    }
}
static void hevc_vt_biwgt_8t_12w_msa(uint8_t *src0_ptr,
                                     int32_t src_stride,
                                     int16_t *src1_ptr,
                                     int32_t src2_stride,
                                     uint8_t *dst,
                                     int32_t dst_stride,
                                     const int8_t *filter,
                                     int32_t height,
                                     int32_t weight0,
                                     int32_t weight1,
                                     int32_t offset0,
                                     int32_t offset1,
                                     int32_t rnd_val)
{
    uint32_t loop_cnt;
    int32_t offset, weight;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
    v8i16 in0, in1, in2, in3;
    v16i8 src10_r, src32_r, src54_r, src76_r;
    v16i8 src21_r, src43_r, src65_r, src87_r;
    v8i16 tmp0, tmp1, tmp2;
    v16i8 src10_l, src32_l, src54_l, src76_l;
    v16i8 src21_l, src43_l, src65_l, src87_l;
    v16i8 src2110, src4332, src6554, src8776;
    v8i16 filt0, filt1, filt2, filt3;
    v8i16 out0, out1, out2, filter_vec;
    v4i32 dst2_r, dst2_l;
    v4i32 weight_vec, weight1_vec, offset_vec, rnd_vec, const_vec;

    src0_ptr -= (3 * src_stride);
    offset = (offset0 + offset1) << rnd_val;
    weight0 = weight0 & 0x0000FFFF;
    weight = weight0 | (weight1 << 16);

    const_vec = __msa_ldi_w(128);
    const_vec <<= 6;
    offset_vec = __msa_fill_w(offset);
    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val + 1);
    weight1_vec = __msa_fill_w(weight1);
    offset_vec += const_vec * weight1_vec;

    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    LD_SB7(src0_ptr, src_stride, src0, src1, src2, src3, src4, src5, src6);
    src0_ptr += (7 * src_stride);
    XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);

    ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
               src10_r, src32_r, src54_r, src21_r);
    ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
    ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
               src10_l, src32_l, src54_l, src21_l);
    ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);
    ILVR_D3_SB(src21_l, src10_l, src43_l, src32_l, src65_l, src54_l,
               src2110, src4332, src6554);

    for (loop_cnt = 8; loop_cnt--;) {
        LD_SB2(src0_ptr, src_stride, src7, src8);
        src0_ptr += (2 * src_stride);
        LD_SH2(src1_ptr, src2_stride, in0, in1);
        LD_SH2((src1_ptr + 8), src2_stride, in2, in3);
        src1_ptr += (2 * src2_stride);
        in2 = (v8i16) __msa_ilvr_d((v2i64) in3, (v2i64) in2);
        XORI_B2_128_SB(src7, src8);

        ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
        ILVL_B2_SB(src7, src6, src8, src7, src76_l, src87_l);
        src8776 = (v16i8) __msa_ilvr_d((v2i64) src87_l, (v2i64) src76_l);

        DOTP_SB3_SH(src10_r, src21_r, src2110, filt0, filt0, filt0,
                    tmp0, tmp1, tmp2);
        DPADD_SB2_SH(src32_r, src43_r, filt1, filt1, tmp0, tmp1);
        tmp2 = __msa_dpadd_s_h(tmp2, src4332, (v16i8) filt1);
        DPADD_SB2_SH(src54_r, src65_r, filt2, filt2, tmp0, tmp1);
        tmp2 = __msa_dpadd_s_h(tmp2, src6554, (v16i8) filt2);
        DPADD_SB2_SH(src76_r, src87_r, filt3, filt3, tmp0, tmp1);
        tmp2 = __msa_dpadd_s_h(tmp2, src8776, (v16i8) filt3);

        HEVC_BIW_RND_CLIP2(tmp0, tmp1, in0, in1,
                           weight_vec, rnd_vec, offset_vec,
                           out0, out1);

        ILVRL_H2_SW(tmp2, in2, dst2_r, dst2_l);
        dst2_r = __msa_dpadd_s_w(offset_vec, (v8i16) dst2_r,
                                 (v8i16) weight_vec);
        dst2_l = __msa_dpadd_s_w(offset_vec, (v8i16) dst2_l,
                                 (v8i16) weight_vec);
        SRAR_W2_SW(dst2_r, dst2_l, rnd_vec);
        dst2_r = (v4i32) __msa_pckev_h((v8i16) dst2_l, (v8i16) dst2_r);
        out2 = CLIP_SH_0_255(dst2_r);

        PCKEV_B2_SH(out1, out0, out2, out2, out0, out2);
        ST_D2(out0, 0, 1, dst, dst_stride);
        ST_W2(out2, 0, 1, dst + 8, dst_stride);
        dst += (2 * dst_stride);

        src10_r = src32_r;
        src32_r = src54_r;
        src54_r = src76_r;
        src21_r = src43_r;
        src43_r = src65_r;
        src65_r = src87_r;
        src2110 = src4332;
        src4332 = src6554;
        src6554 = src8776;
        src6 = src8;
    }
}
static void hevc_vt_biwgt_8t_16multx2mult_msa(uint8_t *src0_ptr,
                                              int32_t src_stride,
                                              int16_t *src1_ptr,
                                              int32_t src2_stride,
                                              uint8_t *dst,
                                              int32_t dst_stride,
                                              const int8_t *filter,
                                              int32_t height,
                                              int32_t weight0,
                                              int32_t weight1,
                                              int32_t offset0,
                                              int32_t offset1,
                                              int32_t rnd_val,
                                              int32_t width)
{
    uint8_t *src0_ptr_tmp;
    int16_t *src1_ptr_tmp;
    uint8_t *dst_tmp;
    uint32_t loop_cnt, cnt;
    int32_t offset, weight;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
    v8i16 in0, in1, in2, in3;
    v16i8 src10_r, src32_r, src54_r, src76_r;
    v16i8 src21_r, src43_r, src65_r, src87_r;
    v16i8 src10_l, src32_l, src54_l, src76_l;
    v16i8 src21_l, src43_l, src65_l, src87_l;
    v8i16 tmp0, tmp1, tmp2, tmp3;
    v8i16 filt0, filt1, filt2, filt3;
    v8i16 filter_vec;
    v8i16 out0, out1, out2, out3;
    v4i32 weight_vec, weight1_vec, offset_vec, rnd_vec, const_vec;

    src0_ptr -= (3 * src_stride);
    offset = (offset0 + offset1) << rnd_val;
    weight0 = weight0 & 0x0000FFFF;
    weight = weight0 | (weight1 << 16);

    const_vec = __msa_ldi_w(128);
    const_vec <<= 6;
    offset_vec = __msa_fill_w(offset);
    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val + 1);
    weight1_vec = __msa_fill_w(weight1);
    offset_vec += const_vec * weight1_vec;

    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    for (cnt = (width >> 4); cnt--;) {
        src0_ptr_tmp = src0_ptr;
        src1_ptr_tmp = src1_ptr;
        dst_tmp = dst;

        LD_SB7(src0_ptr_tmp, src_stride,
               src0, src1, src2, src3, src4, src5, src6);
        src0_ptr_tmp += (7 * src_stride);
        XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);

        ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
                   src10_r, src32_r, src54_r, src21_r);
        ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
        ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
                   src10_l, src32_l, src54_l, src21_l);
        ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);

        for (loop_cnt = (height >> 1); loop_cnt--;) {
            LD_SB2(src0_ptr_tmp, src_stride, src7, src8);
            src0_ptr_tmp += (2 * src_stride);
            LD_SH2(src1_ptr_tmp, src2_stride, in0, in1);
            LD_SH2((src1_ptr_tmp + 8), src2_stride, in2, in3);
            src1_ptr_tmp += (2 * src2_stride);

            XORI_B2_128_SB(src7, src8);
            ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
            ILVL_B2_SB(src7, src6, src8, src7, src76_l, src87_l);

            DOTP_SB4_SH(src10_r, src21_r, src10_l, src21_l, filt0, filt0,
                        filt0, filt0, tmp0, tmp1, tmp2, tmp3);
            DPADD_SB4_SH(src32_r, src43_r, src32_l, src43_l, filt1, filt1,
                         filt1, filt1, tmp0, tmp1, tmp2, tmp3);
            DPADD_SB4_SH(src54_r, src65_r, src54_l, src65_l, filt2, filt2,
                         filt2, filt2, tmp0, tmp1, tmp2, tmp3);
            DPADD_SB4_SH(src76_r, src87_r, src76_l, src87_l, filt3, filt3,
                         filt3, filt3, tmp0, tmp1, tmp2, tmp3);

            HEVC_BIW_RND_CLIP4(tmp0, tmp1, tmp2, tmp3,
                               in0, in1, in2, in3,
                               weight_vec, rnd_vec, offset_vec,
                               out0, out1, out2, out3);

            PCKEV_B2_SH(out2, out0, out3, out1, out0, out1);
            ST_SH2(out0, out1, dst_tmp, dst_stride);
            dst_tmp += (2 * dst_stride);

            src10_r = src32_r;
            src32_r = src54_r;
            src54_r = src76_r;
            src21_r = src43_r;
            src43_r = src65_r;
            src65_r = src87_r;
            src10_l = src32_l;
            src32_l = src54_l;
            src54_l = src76_l;
            src21_l = src43_l;
            src43_l = src65_l;
            src65_l = src87_l;
            src6 = src8;
        }

        src0_ptr += 16;
        src1_ptr += 16;
        dst += 16;
    }
}
static void hevc_vt_biwgt_8t_16w_msa(uint8_t *src0_ptr,
                                     int32_t src_stride,
                                     int16_t *src1_ptr,
                                     int32_t src2_stride,
                                     uint8_t *dst,
                                     int32_t dst_stride,
                                     const int8_t *filter,
                                     int32_t height,
                                     int32_t weight0,
                                     int32_t weight1,
                                     int32_t offset0,
                                     int32_t offset1,
                                     int32_t rnd_val)
{
    hevc_vt_biwgt_8t_16multx2mult_msa(src0_ptr, src_stride,
                                      src1_ptr, src2_stride,
                                      dst, dst_stride, filter, height,
                                      weight0, weight1, offset0, offset1,
                                      rnd_val, 16);
}

static void hevc_vt_biwgt_8t_24w_msa(uint8_t *src0_ptr,
                                     int32_t src_stride,
                                     int16_t *src1_ptr,
                                     int32_t src2_stride,
                                     uint8_t *dst,
                                     int32_t dst_stride,
                                     const int8_t *filter,
                                     int32_t height,
                                     int32_t weight0,
                                     int32_t weight1,
                                     int32_t offset0,
                                     int32_t offset1,
                                     int32_t rnd_val)
{
    hevc_vt_biwgt_8t_16multx2mult_msa(src0_ptr, src_stride,
                                      src1_ptr, src2_stride,
                                      dst, dst_stride, filter, height,
                                      weight0, weight1, offset0, offset1,
                                      rnd_val, 16);
    hevc_vt_biwgt_8t_8w_msa(src0_ptr + 16, src_stride,
                            src1_ptr + 16, src2_stride,
                            dst + 16, dst_stride, filter, height,
                            weight0, weight1, offset0, offset1, rnd_val);
}

static void hevc_vt_biwgt_8t_32w_msa(uint8_t *src0_ptr,
                                     int32_t src_stride,
                                     int16_t *src1_ptr,
                                     int32_t src2_stride,
                                     uint8_t *dst,
                                     int32_t dst_stride,
                                     const int8_t *filter,
                                     int32_t height,
                                     int32_t weight0,
                                     int32_t weight1,
                                     int32_t offset0,
                                     int32_t offset1,
                                     int32_t rnd_val)
{
    hevc_vt_biwgt_8t_16multx2mult_msa(src0_ptr, src_stride,
                                      src1_ptr, src2_stride,
                                      dst, dst_stride, filter, height,
                                      weight0, weight1, offset0, offset1,
                                      rnd_val, 32);
}

static void hevc_vt_biwgt_8t_48w_msa(uint8_t *src0_ptr,
                                     int32_t src_stride,
                                     int16_t *src1_ptr,
                                     int32_t src2_stride,
                                     uint8_t *dst,
                                     int32_t dst_stride,
                                     const int8_t *filter,
                                     int32_t height,
                                     int32_t weight0,
                                     int32_t weight1,
                                     int32_t offset0,
                                     int32_t offset1,
                                     int32_t rnd_val)
{
    hevc_vt_biwgt_8t_16multx2mult_msa(src0_ptr, src_stride,
                                      src1_ptr, src2_stride,
                                      dst, dst_stride, filter, height,
                                      weight0, weight1, offset0, offset1,
                                      rnd_val, 48);
}

static void hevc_vt_biwgt_8t_64w_msa(uint8_t *src0_ptr,
                                     int32_t src_stride,
                                     int16_t *src1_ptr,
                                     int32_t src2_stride,
                                     uint8_t *dst,
                                     int32_t dst_stride,
                                     const int8_t *filter,
                                     int32_t height,
                                     int32_t weight0,
                                     int32_t weight1,
                                     int32_t offset0,
                                     int32_t offset1,
                                     int32_t rnd_val)
{
    hevc_vt_biwgt_8t_16multx2mult_msa(src0_ptr, src_stride,
                                      src1_ptr, src2_stride,
                                      dst, dst_stride, filter, height,
                                      weight0, weight1, offset0, offset1,
                                      rnd_val, 64);
}
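
/* The hv (2-D separable) bi-weight routines below first run the horizontal
 * 8-tap (VSHF + HEVC_FILT_8TAP_SH) over seven setup rows to build 16-bit
 * intermediates, then apply the vertical 8-tap across those intermediates
 * with 32-bit accumulation, shift by 6, and finally weight/round/clip as in
 * the earlier scalar sketch.  Roughly (illustrative only; fx[]/fy[] are the
 * horizontal and vertical taps, inter[][] a hypothetical intermediate):
 *
 *     inter[y][x] = sum over k of (src[y][x + k - 3] - 128) * fx[k];
 *     filt        = (sum over k of inter[y + k - 3][x] * fy[k]) >> 6;
 *     out[y][x]   = weighted, rounded and clipped combination of filt and
 *                   the matching src1_ptr sample, as sketched above.
 */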
static void hevc_hv_biwgt_8t_4w_msa(uint8_t *src0_ptr,
                                    int32_t src_stride,
                                    int16_t *src1_ptr,
                                    int32_t src2_stride,
                                    uint8_t *dst,
                                    int32_t dst_stride,
                                    const int8_t *filter_x,
                                    const int8_t *filter_y,
                                    int32_t height,
                                    int32_t weight0,
                                    int32_t weight1,
                                    int32_t offset0,
                                    int32_t offset1,
                                    int32_t rnd_val)
{
    uint32_t loop_cnt;
    uint64_t tp0, tp1;
    int32_t offset, weight;
    v16u8 out;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v8i16 in0 = { 0 }, in1 = { 0 };
    v8i16 filt0, filt1, filt2, filt3;
    v8i16 filt_h0, filt_h1, filt_h2, filt_h3;
    v16i8 mask1, mask2, mask3;
    v8i16 filter_vec, weight_vec;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
    v8i16 dst30, dst41, dst52, dst63, dst66, dst87;
    v8i16 tmp0, tmp1, tmp2, tmp3;
    v8i16 dst10, dst32, dst54, dst76;
    v8i16 dst21, dst43, dst65, dst97, dst108, dst109, dst98;
    v4i32 offset_vec, rnd_vec, const_vec, dst0, dst1, dst2, dst3;
    v16i8 mask0 = LD_SB(ff_hevc_mask_arr + 16);

    src0_ptr -= ((3 * src_stride) + 3);

    filter_vec = LD_SH(filter_x);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    filter_vec = LD_SH(filter_y);
    UNPCK_R_SB_SH(filter_vec, filter_vec);
    SPLATI_W4_SH(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3);

    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;

    offset = (offset0 + offset1) << rnd_val;
    weight0 = weight0 & 0x0000FFFF;
    weight = weight0 | (weight1 << 16);

    const_vec = __msa_fill_w((128 * weight1));
    const_vec <<= 6;
    offset_vec = __msa_fill_w(offset);
    rnd_vec = __msa_fill_w(rnd_val + 1);
    offset_vec += const_vec;
    weight_vec = (v8i16) __msa_fill_w(weight);

    LD_SB7(src0_ptr, src_stride, src0, src1, src2, src3, src4, src5, src6);
    src0_ptr += (7 * src_stride);
    XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);

    VSHF_B4_SB(src0, src3, mask0, mask1, mask2, mask3, vec0, vec1, vec2, vec3);
    VSHF_B4_SB(src1, src4, mask0, mask1, mask2, mask3, vec4, vec5, vec6, vec7);
    VSHF_B4_SB(src2, src5, mask0, mask1, mask2, mask3,
               vec8, vec9, vec10, vec11);
    VSHF_B4_SB(src3, src6, mask0, mask1, mask2, mask3,
               vec12, vec13, vec14, vec15);
    dst30 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                              filt3);
    dst41 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
                              filt3);
    dst52 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
                              filt3);
    dst63 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1, filt2,
                              filt3);

    ILVRL_H2_SH(dst41, dst30, dst10, dst43);
    ILVRL_H2_SH(dst52, dst41, dst21, dst54);
    ILVRL_H2_SH(dst63, dst52, dst32, dst65);
    dst66 = (v8i16) __msa_splati_d((v2i64) dst63, 1);

    for (loop_cnt = height >> 2; loop_cnt--;) {
        LD_SB4(src0_ptr, src_stride, src7, src8, src9, src10);
        src0_ptr += (4 * src_stride);
        XORI_B4_128_SB(src7, src8, src9, src10);

        LD2(src1_ptr, src2_stride, tp0, tp1);
        INSERT_D2_SH(tp0, tp1, in0);
        src1_ptr += (2 * src2_stride);
        LD2(src1_ptr, src2_stride, tp0, tp1);
        INSERT_D2_SH(tp0, tp1, in1);
        src1_ptr += (2 * src2_stride);

        VSHF_B4_SB(src7, src9, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        VSHF_B4_SB(src8, src10, mask0, mask1, mask2, mask3,
                   vec4, vec5, vec6, vec7);
        dst97 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                                  filt3);
        dst108 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
                                   filt3);

        dst76 = __msa_ilvr_h(dst97, dst66);
        ILVRL_H2_SH(dst108, dst97, dst87, dst109);
        dst66 = (v8i16) __msa_splati_d((v2i64) dst97, 1);
        dst98 = __msa_ilvr_h(dst66, dst108);

        dst0 = HEVC_FILT_8TAP(dst10, dst32, dst54, dst76, filt_h0, filt_h1,
                              filt_h2, filt_h3);
        dst1 = HEVC_FILT_8TAP(dst21, dst43, dst65, dst87, filt_h0, filt_h1,
                              filt_h2, filt_h3);
        dst2 = HEVC_FILT_8TAP(dst32, dst54, dst76, dst98, filt_h0, filt_h1,
                              filt_h2, filt_h3);
        dst3 = HEVC_FILT_8TAP(dst43, dst65, dst87, dst109, filt_h0, filt_h1,
                              filt_h2, filt_h3);
        SRA_4V(dst0, dst1, dst2, dst3, 6);
        PCKEV_H2_SH(dst1, dst0, dst3, dst2, tmp1, tmp3);
        ILVRL_H2_SH(tmp1, in0, tmp0, tmp1);
        ILVRL_H2_SH(tmp3, in1, tmp2, tmp3);
        dst0 = __msa_dpadd_s_w(offset_vec, tmp0, weight_vec);
        dst1 = __msa_dpadd_s_w(offset_vec, tmp1, weight_vec);
        dst2 = __msa_dpadd_s_w(offset_vec, tmp2, weight_vec);
        dst3 = __msa_dpadd_s_w(offset_vec, tmp3, weight_vec);
        SRAR_W4_SW(dst0, dst1, dst2, dst3, rnd_vec);
        CLIP_SW4_0_255_MAX_SATU(dst0, dst1, dst2, dst3);
        PCKEV_H2_SH(dst1, dst0, dst3, dst2, tmp0, tmp1);
        out = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
        ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
        dst += (4 * dst_stride);

        dst10 = dst54;
        dst32 = dst76;
        dst54 = dst98;
        dst21 = dst65;
        dst43 = dst87;
        dst65 = dst109;
        dst66 = (v8i16) __msa_splati_d((v2i64) dst108, 1);
    }
}
static void hevc_hv_biwgt_8t_8multx2mult_msa(uint8_t *src0_ptr,
                                             int32_t src_stride,
                                             int16_t *src1_ptr,
                                             int32_t src2_stride,
                                             uint8_t *dst,
                                             int32_t dst_stride,
                                             const int8_t *filter_x,
                                             const int8_t *filter_y,
                                             int32_t height,
                                             int32_t weight0,
                                             int32_t weight1,
                                             int32_t offset0,
                                             int32_t offset1,
                                             int32_t rnd_val,
                                             int32_t width8mult)
{
    uint32_t loop_cnt, cnt;
    int32_t offset, weight;
    uint8_t *src0_ptr_tmp;
    int16_t *src1_ptr_tmp;
    uint8_t *dst_tmp;
    v16u8 out;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
    v8i16 in0, in1;
    v8i16 filt0, filt1, filt2, filt3;
    v8i16 filt_h0, filt_h1, filt_h2, filt_h3;
    v16i8 mask0 = LD_SB(ff_hevc_mask_arr);
    v16i8 mask1, mask2, mask3;
    v8i16 filter_vec, weight_vec;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8;
    v4i32 dst0_r, dst0_l, dst1_r, dst1_l;
    v8i16 tmp0, tmp1, tmp2, tmp3;
    v8i16 dst10_r, dst32_r, dst54_r, dst76_r;
    v8i16 dst10_l, dst32_l, dst54_l, dst76_l;
    v8i16 dst21_r, dst43_r, dst65_r, dst87_r;
    v8i16 dst21_l, dst43_l, dst65_l, dst87_l;
    v4i32 offset_vec, rnd_vec, const_vec;

    src0_ptr -= ((3 * src_stride) + 3);

    offset = (offset0 + offset1) << rnd_val;
    weight0 = weight0 & 0x0000FFFF;
    weight = weight0 | (weight1 << 16);

    const_vec = __msa_fill_w((128 * weight1));
    const_vec <<= 6;
    offset_vec = __msa_fill_w(offset);
    rnd_vec = __msa_fill_w(rnd_val + 1);
    offset_vec += const_vec;
    weight_vec = (v8i16) __msa_fill_w(weight);

    filter_vec = LD_SH(filter_x);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    filter_vec = LD_SH(filter_y);
    UNPCK_R_SB_SH(filter_vec, filter_vec);
    SPLATI_W4_SH(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3);

    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;

    for (cnt = width8mult; cnt--;) {
        src0_ptr_tmp = src0_ptr;
        src1_ptr_tmp = src1_ptr;
        dst_tmp = dst;

        LD_SB7(src0_ptr_tmp, src_stride,
               src0, src1, src2, src3, src4, src5, src6);
        src0_ptr_tmp += (7 * src_stride);
        XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);

        /* row 0 row 1 row 2 row 3 */
        VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
                   vec4, vec5, vec6, vec7);
        VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
                   vec8, vec9, vec10, vec11);
        VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
                   vec12, vec13, vec14, vec15);
        dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                                 filt3);
        dst1 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
                                 filt3);
        dst2 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
                                 filt3);
        dst3 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1,
                                 filt2, filt3);

        /* row 4 row 5 row 6 */
        VSHF_B4_SB(src4, src4, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        VSHF_B4_SB(src5, src5, mask0, mask1, mask2, mask3,
                   vec4, vec5, vec6, vec7);
        VSHF_B4_SB(src6, src6, mask0, mask1, mask2, mask3,
                   vec8, vec9, vec10, vec11);
        dst4 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                                 filt3);
        dst5 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
                                 filt3);
        dst6 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
                                 filt3);

        for (loop_cnt = height >> 1; loop_cnt--;) {
            LD_SB2(src0_ptr_tmp, src_stride, src7, src8);
            XORI_B2_128_SB(src7, src8);
            src0_ptr_tmp += 2 * src_stride;

            LD_SH2(src1_ptr_tmp, src2_stride, in0, in1);
            src1_ptr_tmp += (2 * src2_stride);

            ILVR_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst2, dst1, dst10_r,
                       dst32_r, dst54_r, dst21_r);
            ILVL_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst2, dst1, dst10_l,
                       dst32_l, dst54_l, dst21_l);
            ILVR_H2_SH(dst4, dst3, dst6, dst5, dst43_r, dst65_r);
            ILVL_H2_SH(dst4, dst3, dst6, dst5, dst43_l, dst65_l);

            VSHF_B4_SB(src7, src7, mask0, mask1, mask2, mask3,
                       vec0, vec1, vec2, vec3);
            dst7 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1,
                                     filt2, filt3);

            ILVRL_H2_SH(dst7, dst6, dst76_r, dst76_l);
            dst0_r = HEVC_FILT_8TAP(dst10_r, dst32_r, dst54_r, dst76_r,
                                    filt_h0, filt_h1, filt_h2, filt_h3);
            dst0_l = HEVC_FILT_8TAP(dst10_l, dst32_l, dst54_l, dst76_l,
                                    filt_h0, filt_h1, filt_h2, filt_h3);
            dst0_r >>= 6;
            dst0_l >>= 6;

            /* row 8 */
            VSHF_B4_SB(src8, src8, mask0, mask1, mask2, mask3,
                       vec0, vec1, vec2, vec3);
            dst8 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1,
                                     filt2, filt3);

            ILVRL_H2_SH(dst8, dst7, dst87_r, dst87_l);
            dst1_r = HEVC_FILT_8TAP(dst21_r, dst43_r, dst65_r, dst87_r,
                                    filt_h0, filt_h1, filt_h2, filt_h3);
            dst1_l = HEVC_FILT_8TAP(dst21_l, dst43_l, dst65_l, dst87_l,
                                    filt_h0, filt_h1, filt_h2, filt_h3);
            dst1_r >>= 6;
            dst1_l >>= 6;

            PCKEV_H2_SH(dst0_l, dst0_r, dst1_l, dst1_r, tmp1, tmp3);
            ILVRL_H2_SH(tmp1, in0, tmp0, tmp1);
            ILVRL_H2_SH(tmp3, in1, tmp2, tmp3);
            dst0_r = __msa_dpadd_s_w(offset_vec, tmp0, weight_vec);
            dst0_l = __msa_dpadd_s_w(offset_vec, tmp1, weight_vec);
            dst1_r = __msa_dpadd_s_w(offset_vec, tmp2, weight_vec);
            dst1_l = __msa_dpadd_s_w(offset_vec, tmp3, weight_vec);
            SRAR_W4_SW(dst0_l, dst0_r, dst1_l, dst1_r, rnd_vec);
            CLIP_SW4_0_255_MAX_SATU(dst0_l, dst0_r, dst1_l, dst1_r);
            PCKEV_H2_SH(dst0_l, dst0_r, dst1_l, dst1_r, tmp0, tmp1);
            out = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
            ST_D2(out, 0, 1, dst_tmp, dst_stride);
            dst_tmp += (2 * dst_stride);

            dst0 = dst2;
            dst1 = dst3;
            dst2 = dst4;
            dst3 = dst5;
            dst4 = dst6;
            dst5 = dst7;
            dst6 = dst8;
        }

        src0_ptr += 8;
        src1_ptr += 8;
        dst += 8;
    }
}
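
/* The hevc_hv_biwgt_8t_{8,16,24,32,48}w wrappers below reuse this
 * column-of-8 kernel, passing width8mult = block width / 8 (1, 2, 3, 4 and 6
 * respectively); the 12-wide case keeps its own routine since it mixes an
 * 8-wide column with a 4-wide one.
 */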
static void hevc_hv_biwgt_8t_8w_msa(uint8_t *src0_ptr,
                                    int32_t src_stride,
                                    int16_t *src1_ptr,
                                    int32_t src2_stride,
                                    uint8_t *dst,
                                    int32_t dst_stride,
                                    const int8_t *filter_x,
                                    const int8_t *filter_y,
                                    int32_t height,
                                    int32_t weight0,
                                    int32_t weight1,
                                    int32_t offset0,
                                    int32_t offset1,
                                    int32_t rnd_val)
{
    hevc_hv_biwgt_8t_8multx2mult_msa(src0_ptr, src_stride,
                                     src1_ptr, src2_stride,
                                     dst, dst_stride, filter_x, filter_y,
                                     height, weight0, weight1, offset0,
                                     offset1, rnd_val, 1);
}
  2031. static void hevc_hv_biwgt_8t_12w_msa(uint8_t *src0_ptr,
  2032. int32_t src_stride,
  2033. int16_t *src1_ptr,
  2034. int32_t src2_stride,
  2035. uint8_t *dst,
  2036. int32_t dst_stride,
  2037. const int8_t *filter_x,
  2038. const int8_t *filter_y,
  2039. int32_t height,
  2040. int32_t weight0,
  2041. int32_t weight1,
  2042. int32_t offset0,
  2043. int32_t offset1,
  2044. int32_t rnd_val)
  2045. {
  2046. uint32_t loop_cnt;
  2047. uint8_t *src0_ptr_tmp, *dst_tmp;
  2048. int16_t *src1_ptr_tmp;
  2049. int32_t offset, weight;
  2050. uint64_t tp0, tp1;
  2051. v16u8 out;
  2052. v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
  2053. v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
  2054. v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
  2055. v16i8 mask0, mask1, mask2, mask3, mask4, mask5, mask6, mask7;
  2056. v8i16 in0 = { 0 }, in1 = { 0 };
  2057. v8i16 filter_vec, weight_vec, tmp0, tmp1, tmp2, tmp3;
  2058. v8i16 filt0, filt1, filt2, filt3, filt_h0, filt_h1, filt_h2, filt_h3;
  2059. v8i16 dsth0, dsth1, dsth2, dsth3, dsth4, dsth5, dsth6, dsth7, dsth8;
  2060. v8i16 dst10_r, dst32_r, dst54_r, dst76_r, dst21_r, dst43_r, dst65_r;
  2061. v8i16 dst10_l, dst32_l, dst54_l, dst76_l, dst21_l, dst43_l, dst65_l;
  2062. v8i16 dst30, dst41, dst52, dst63, dst66, dst87, dst10, dst32, dst54, dst76;
  2063. v8i16 dst21, dst43, dst65, dst97, dst108, dst109, dst98, dst87_r, dst87_l;
  2064. v4i32 offset_vec, rnd_vec, const_vec, dst0, dst1, dst2, dst3;
  2065. src0_ptr -= ((3 * src_stride) + 3);
  2066. offset = (offset0 + offset1) << rnd_val;
  2067. weight0 = weight0 & 0x0000FFFF;
  2068. weight = weight0 | (weight1 << 16);
  2069. const_vec = __msa_fill_w((128 * weight1));
  2070. const_vec <<= 6;
  2071. offset_vec = __msa_fill_w(offset);
  2072. rnd_vec = __msa_fill_w(rnd_val + 1);
  2073. offset_vec += const_vec;
  2074. weight_vec = (v8i16) __msa_fill_w(weight);
  2075. filter_vec = LD_SH(filter_x);
  2076. SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
  2077. filter_vec = LD_SH(filter_y);
  2078. UNPCK_R_SB_SH(filter_vec, filter_vec);
  2079. SPLATI_W4_SH(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3);
  2080. mask0 = LD_SB(ff_hevc_mask_arr);
  2081. mask1 = mask0 + 2;
  2082. mask2 = mask0 + 4;
  2083. mask3 = mask0 + 6;
  2084. src0_ptr_tmp = src0_ptr;
  2085. src1_ptr_tmp = src1_ptr;
  2086. dst_tmp = dst;
  2087. LD_SB7(src0_ptr_tmp, src_stride, src0, src1, src2, src3, src4, src5, src6);
  2088. src0_ptr_tmp += (7 * src_stride);
  2089. XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
  2090. VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3, vec0, vec1, vec2, vec3);
  2091. VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3, vec4, vec5, vec6, vec7);
  2092. VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3, vec8, vec9, vec10,
  2093. vec11);
  2094. VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3, vec12, vec13, vec14,
  2095. vec15);
  2096. dsth0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
  2097. filt3);
  2098. dsth1 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
  2099. filt3);
  2100. dsth2 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
  2101. filt3);
  2102. dsth3 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1,
  2103. filt2, filt3);
  2104. VSHF_B4_SB(src4, src4, mask0, mask1, mask2, mask3, vec0, vec1, vec2, vec3);
  2105. VSHF_B4_SB(src5, src5, mask0, mask1, mask2, mask3, vec4, vec5, vec6, vec7);
  2106. VSHF_B4_SB(src6, src6, mask0, mask1, mask2, mask3, vec8, vec9, vec10,
  2107. vec11);
  2108. dsth4 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
  2109. filt3);
  2110. dsth5 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
  2111. filt3);
  2112. dsth6 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
  2113. filt3);
  2114. for (loop_cnt = 8; loop_cnt--;) {
  2115. LD_SB2(src0_ptr_tmp, src_stride, src7, src8);
  2116. src0_ptr_tmp += (2 * src_stride);
  2117. XORI_B2_128_SB(src7, src8);
  2118. LD_SH2(src1_ptr_tmp, src2_stride, in0, in1);
  2119. src1_ptr_tmp += (2 * src2_stride);
  2120. ILVR_H4_SH(dsth1, dsth0, dsth3, dsth2, dsth5, dsth4, dsth2, dsth1,
  2121. dst10_r, dst32_r, dst54_r, dst21_r);
  2122. ILVL_H4_SH(dsth1, dsth0, dsth3, dsth2, dsth5, dsth4, dsth2, dsth1,
  2123. dst10_l, dst32_l, dst54_l, dst21_l);
  2124. ILVR_H2_SH(dsth4, dsth3, dsth6, dsth5, dst43_r, dst65_r);
  2125. ILVL_H2_SH(dsth4, dsth3, dsth6, dsth5, dst43_l, dst65_l);
  2126. VSHF_B4_SB(src7, src7, mask0, mask1, mask2, mask3, vec0, vec1, vec2,
  2127. vec3);
  2128. dsth7 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
  2129. filt3);
  2130. ILVRL_H2_SH(dsth7, dsth6, dst76_r, dst76_l);
  2131. dst0 = HEVC_FILT_8TAP(dst10_r, dst32_r, dst54_r, dst76_r, filt_h0,
  2132. filt_h1, filt_h2, filt_h3);
  2133. dst1 = HEVC_FILT_8TAP(dst10_l, dst32_l, dst54_l, dst76_l, filt_h0,
  2134. filt_h1, filt_h2, filt_h3);
  2135. dst0 >>= 6;
  2136. dst1 >>= 6;
  2137. VSHF_B4_SB(src8, src8, mask0, mask1, mask2, mask3, vec0, vec1, vec2,
  2138. vec3);
  2139. dsth8 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
  2140. filt3);
  2141. ILVRL_H2_SH(dsth8, dsth7, dst87_r, dst87_l);
  2142. dst2 = HEVC_FILT_8TAP(dst21_r, dst43_r, dst65_r, dst87_r, filt_h0,
  2143. filt_h1, filt_h2, filt_h3);
  2144. dst3 = HEVC_FILT_8TAP(dst21_l, dst43_l, dst65_l, dst87_l, filt_h0,
  2145. filt_h1, filt_h2, filt_h3);
  2146. dst2 >>= 6;
  2147. dst3 >>= 6;
  2148. PCKEV_H2_SH(dst1, dst0, dst3, dst2, tmp1, tmp3);
  2149. ILVRL_H2_SH(tmp1, in0, tmp0, tmp1);
  2150. ILVRL_H2_SH(tmp3, in1, tmp2, tmp3);
  2151. dst0 = __msa_dpadd_s_w(offset_vec, tmp0, weight_vec);
  2152. dst1 = __msa_dpadd_s_w(offset_vec, tmp1, weight_vec);
  2153. dst2 = __msa_dpadd_s_w(offset_vec, tmp2, weight_vec);
  2154. dst3 = __msa_dpadd_s_w(offset_vec, tmp3, weight_vec);
  2155. SRAR_W4_SW(dst1, dst0, dst3, dst2, rnd_vec);
  2156. CLIP_SW4_0_255_MAX_SATU(dst1, dst0, dst3, dst2);
        PCKEV_H2_SH(dst1, dst0, dst3, dst2, tmp0, tmp1);
        out = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
        ST_D2(out, 0, 1, dst_tmp, dst_stride);
        dst_tmp += (2 * dst_stride);

        dsth0 = dsth2;
        dsth1 = dsth3;
        dsth2 = dsth4;
        dsth3 = dsth5;
        dsth4 = dsth6;
        dsth5 = dsth7;
        dsth6 = dsth8;
    }

    src0_ptr += 8;
    src1_ptr += 8;
    dst += 8;

    mask4 = LD_SB(ff_hevc_mask_arr + 16);
    mask5 = mask4 + 2;
    mask6 = mask4 + 4;
    mask7 = mask4 + 6;

    LD_SB7(src0_ptr, src_stride, src0, src1, src2, src3, src4, src5, src6);
    src0_ptr += (7 * src_stride);
    XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);

    VSHF_B4_SB(src0, src3, mask4, mask5, mask6, mask7, vec0, vec1, vec2, vec3);
    VSHF_B4_SB(src1, src4, mask4, mask5, mask6, mask7, vec4, vec5, vec6, vec7);
    VSHF_B4_SB(src2, src5, mask4, mask5, mask6, mask7, vec8, vec9, vec10,
               vec11);
    VSHF_B4_SB(src3, src6, mask4, mask5, mask6, mask7, vec12, vec13, vec14,
               vec15);
    dst30 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                              filt3);
    dst41 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
                              filt3);
    dst52 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
                              filt3);
    dst63 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1, filt2,
                              filt3);

    ILVRL_H2_SH(dst41, dst30, dst10, dst43);
    ILVRL_H2_SH(dst52, dst41, dst21, dst54);
    ILVRL_H2_SH(dst63, dst52, dst32, dst65);

    dst66 = (v8i16) __msa_splati_d((v2i64) dst63, 1);

    for (loop_cnt = 4; loop_cnt--;) {
        LD_SB4(src0_ptr, src_stride, src7, src8, src9, src10);
        src0_ptr += (4 * src_stride);
        XORI_B4_128_SB(src7, src8, src9, src10);

        LD2(src1_ptr, src2_stride, tp0, tp1);
        INSERT_D2_SH(tp0, tp1, in0);
        src1_ptr += (2 * src2_stride);
        LD2(src1_ptr, src2_stride, tp0, tp1);
        INSERT_D2_SH(tp0, tp1, in1);
        src1_ptr += (2 * src2_stride);

        VSHF_B4_SB(src7, src9, mask4, mask5, mask6, mask7, vec0, vec1, vec2,
                   vec3);
        VSHF_B4_SB(src8, src10, mask4, mask5, mask6, mask7, vec4, vec5, vec6,
                   vec7);
        dst97 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                                  filt3);
        dst108 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
                                   filt3);

        dst76 = __msa_ilvr_h(dst97, dst66);
        ILVRL_H2_SH(dst108, dst97, dst87, dst109);
        dst66 = (v8i16) __msa_splati_d((v2i64) dst97, 1);
        dst98 = __msa_ilvr_h(dst66, dst108);

        dst0 = HEVC_FILT_8TAP(dst10, dst32, dst54, dst76, filt_h0, filt_h1,
                              filt_h2, filt_h3);
        dst1 = HEVC_FILT_8TAP(dst21, dst43, dst65, dst87, filt_h0, filt_h1,
                              filt_h2, filt_h3);
        dst2 = HEVC_FILT_8TAP(dst32, dst54, dst76, dst98, filt_h0, filt_h1,
                              filt_h2, filt_h3);
        dst3 = HEVC_FILT_8TAP(dst43, dst65, dst87, dst109, filt_h0, filt_h1,
                              filt_h2, filt_h3);

        SRA_4V(dst0, dst1, dst2, dst3, 6);
        PCKEV_H2_SH(dst1, dst0, dst3, dst2, tmp1, tmp3);
        ILVRL_H2_SH(tmp1, in0, tmp0, tmp1);
        ILVRL_H2_SH(tmp3, in1, tmp2, tmp3);
        dst0 = __msa_dpadd_s_w(offset_vec, tmp0, weight_vec);
        dst1 = __msa_dpadd_s_w(offset_vec, tmp1, weight_vec);
        dst2 = __msa_dpadd_s_w(offset_vec, tmp2, weight_vec);
        dst3 = __msa_dpadd_s_w(offset_vec, tmp3, weight_vec);
        SRAR_W4_SW(dst0, dst1, dst2, dst3, rnd_vec);
        CLIP_SW4_0_255_MAX_SATU(dst0, dst1, dst2, dst3);
        PCKEV_H2_SH(dst1, dst0, dst3, dst2, tmp0, tmp1);
        out = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
        ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
        dst += (4 * dst_stride);

        dst10 = dst54;
        dst32 = dst76;
        dst54 = dst98;
        dst21 = dst65;
        dst43 = dst87;
        dst65 = dst109;
        dst66 = (v8i16) __msa_splati_d((v2i64) dst108, 1);
    }
}
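
/* The 16/24/32/48/64-wide HV bi-weighted 8-tap variants below are thin
 * wrappers around hevc_hv_biwgt_8t_8multx2mult_msa(); they differ only in the
 * final width-multiplier argument (width / 8): 2, 3, 4, 6 and 8. */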
static void hevc_hv_biwgt_8t_16w_msa(uint8_t *src0_ptr,
                                     int32_t src_stride,
                                     int16_t *src1_ptr,
                                     int32_t src2_stride,
                                     uint8_t *dst,
                                     int32_t dst_stride,
                                     const int8_t *filter_x,
                                     const int8_t *filter_y,
                                     int32_t height,
                                     int32_t weight0,
                                     int32_t weight1,
                                     int32_t offset0,
                                     int32_t offset1,
                                     int32_t rnd_val)
{
    hevc_hv_biwgt_8t_8multx2mult_msa(src0_ptr, src_stride,
                                     src1_ptr, src2_stride,
                                     dst, dst_stride, filter_x, filter_y,
                                     height, weight0, weight1, offset0,
                                     offset1, rnd_val, 2);
}

static void hevc_hv_biwgt_8t_24w_msa(uint8_t *src0_ptr,
                                     int32_t src_stride,
                                     int16_t *src1_ptr,
                                     int32_t src2_stride,
                                     uint8_t *dst,
                                     int32_t dst_stride,
                                     const int8_t *filter_x,
                                     const int8_t *filter_y,
                                     int32_t height,
                                     int32_t weight0,
                                     int32_t weight1,
                                     int32_t offset0,
                                     int32_t offset1,
                                     int32_t rnd_val)
{
    hevc_hv_biwgt_8t_8multx2mult_msa(src0_ptr, src_stride,
                                     src1_ptr, src2_stride,
                                     dst, dst_stride, filter_x, filter_y,
                                     height, weight0, weight1, offset0,
                                     offset1, rnd_val, 3);
}

static void hevc_hv_biwgt_8t_32w_msa(uint8_t *src0_ptr,
                                     int32_t src_stride,
                                     int16_t *src1_ptr,
                                     int32_t src2_stride,
                                     uint8_t *dst,
                                     int32_t dst_stride,
                                     const int8_t *filter_x,
                                     const int8_t *filter_y,
                                     int32_t height,
                                     int32_t weight0,
                                     int32_t weight1,
                                     int32_t offset0,
                                     int32_t offset1,
                                     int32_t rnd_val)
{
    hevc_hv_biwgt_8t_8multx2mult_msa(src0_ptr, src_stride,
                                     src1_ptr, src2_stride,
                                     dst, dst_stride, filter_x, filter_y,
                                     height, weight0, weight1, offset0,
                                     offset1, rnd_val, 4);
}

static void hevc_hv_biwgt_8t_48w_msa(uint8_t *src0_ptr,
                                     int32_t src_stride,
                                     int16_t *src1_ptr,
                                     int32_t src2_stride,
                                     uint8_t *dst,
                                     int32_t dst_stride,
                                     const int8_t *filter_x,
                                     const int8_t *filter_y,
                                     int32_t height,
                                     int32_t weight0,
                                     int32_t weight1,
                                     int32_t offset0,
                                     int32_t offset1,
                                     int32_t rnd_val)
{
    hevc_hv_biwgt_8t_8multx2mult_msa(src0_ptr, src_stride,
                                     src1_ptr, src2_stride,
                                     dst, dst_stride, filter_x, filter_y,
                                     height, weight0, weight1, offset0,
                                     offset1, rnd_val, 6);
}

static void hevc_hv_biwgt_8t_64w_msa(uint8_t *src0_ptr,
                                     int32_t src_stride,
                                     int16_t *src1_ptr,
                                     int32_t src2_stride,
                                     uint8_t *dst,
                                     int32_t dst_stride,
                                     const int8_t *filter_x,
                                     const int8_t *filter_y,
                                     int32_t height,
                                     int32_t weight0,
                                     int32_t weight1,
                                     int32_t offset0,
                                     int32_t offset1,
                                     int32_t rnd_val)
{
    hevc_hv_biwgt_8t_8multx2mult_msa(src0_ptr, src_stride,
                                     src1_ptr, src2_stride,
                                     dst, dst_stride, filter_x, filter_y,
                                     height, weight0, weight1, offset0,
                                     offset1, rnd_val, 8);
}
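
/* Horizontal 4-tap bi-weighted paths (the chroma filters) start here.  The
 * weight word packs weight0 into the low and weight1 into the high halfword,
 * so one __msa_dpadd_s_w over the interleaved {in, dst} halfword pairs
 * accumulates in * weight0 + dst * weight1 in a single step.  The source
 * bytes are biased by -128 (XORI_*_128) before filtering and the filter taps
 * sum to 64, so the "constant" 128 * weight1 * 64 folded into the offset
 * undoes that bias again.  Per pixel this amounts to (a sketch, with 'in' the
 * 16-bit sample from src1_ptr and 'flt' the unbiased filter result from
 * src0_ptr):
 *
 *     out = clip_uint8((in * weight0 + flt * weight1 +
 *                       ((offset0 + offset1 + 1) << rnd_val)) >> (rnd_val + 1));
 */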
static void hevc_hz_biwgt_4t_4x2_msa(uint8_t *src0_ptr,
                                     int32_t src_stride,
                                     int16_t *src1_ptr,
                                     int32_t src2_stride,
                                     uint8_t *dst,
                                     int32_t dst_stride,
                                     const int8_t *filter,
                                     int32_t weight0,
                                     int32_t weight1,
                                     int32_t offset0,
                                     int32_t offset1,
                                     int32_t rnd_val)
{
    int32_t offset, weight, constant;
    v8i16 filt0, filt1;
    v16i8 src0, src1;
    v8i16 in0, in1;
    v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[16]);
    v16i8 mask1, vec0, vec1;
    v8i16 dst0;
    v4i32 dst0_r, dst0_l;
    v8i16 out0, filter_vec;
    v4i32 weight_vec, offset_vec, rnd_vec;

    src0_ptr -= 1;

    filter_vec = LD_SH(filter);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    mask1 = mask0 + 2;

    offset = (offset0 + offset1) << rnd_val;
    weight0 = weight0 & 0x0000FFFF;
    weight = weight0 | (weight1 << 16);
    constant = 128 * weight1;
    constant <<= 6;
    offset += constant;

    offset_vec = __msa_fill_w(offset);
    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val + 1);

    LD_SB2(src0_ptr, src_stride, src0, src1);
    LD_SH2(src1_ptr, src2_stride, in0, in1);
    in0 = (v8i16) __msa_ilvr_d((v2i64) in1, (v2i64) in0);
    XORI_B2_128_SB(src0, src1);
    VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0, vec1);
    dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);

    ILVRL_H2_SW(dst0, in0, dst0_r, dst0_l);
    dst0_r = __msa_dpadd_s_w(offset_vec, (v8i16) dst0_r, (v8i16) weight_vec);
    dst0_l = __msa_dpadd_s_w(offset_vec, (v8i16) dst0_l, (v8i16) weight_vec);
    SRAR_W2_SW(dst0_r, dst0_l, rnd_vec);
    dst0_r = (v4i32) __msa_pckev_h((v8i16) dst0_l, (v8i16) dst0_r);
    out0 = CLIP_SH_0_255(dst0_r);
    out0 = (v8i16) __msa_pckev_b((v16i8) out0, (v16i8) out0);
    ST_W2(out0, 0, 1, dst, dst_stride);
}
static void hevc_hz_biwgt_4t_4x4_msa(uint8_t *src0_ptr,
                                     int32_t src_stride,
                                     int16_t *src1_ptr,
                                     int32_t src2_stride,
                                     uint8_t *dst,
                                     int32_t dst_stride,
                                     const int8_t *filter,
                                     int32_t weight0,
                                     int32_t weight1,
                                     int32_t offset0,
                                     int32_t offset1,
                                     int32_t rnd_val)
{
    int32_t offset, weight, constant;
    v8i16 filt0, filt1;
    v16i8 src0, src1, src2, src3;
    v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[16]);
    v16i8 mask1;
    v8i16 dst0, dst1;
    v16i8 vec0, vec1;
    v8i16 in0, in1, in2, in3;
    v8i16 filter_vec;
    v4i32 weight_vec, offset_vec, rnd_vec;

    src0_ptr -= 1;

    /* rearranging filter */
    filter_vec = LD_SH(filter);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    mask1 = mask0 + 2;

    offset = (offset0 + offset1) << rnd_val;
    weight0 = weight0 & 0x0000FFFF;
    weight = weight0 | (weight1 << 16);
    constant = 128 * weight1;
    constant <<= 6;
    offset += constant;

    offset_vec = __msa_fill_w(offset);
    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val + 1);

    LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3);
    XORI_B4_128_SB(src0, src1, src2, src3);
    LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
    ILVR_D2_SH(in1, in0, in3, in2, in0, in1);

    VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0, vec1);
    dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
    VSHF_B2_SB(src2, src3, src2, src3, mask0, mask1, vec0, vec1);
    dst1 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);

    HEVC_BIW_RND_CLIP2(dst0, dst1, in0, in1,
                       weight_vec, rnd_vec, offset_vec,
                       dst0, dst1);

    dst0 = (v8i16) __msa_pckev_b((v16i8) dst1, (v16i8) dst0);
    ST_W4(dst0, 0, 1, 2, 3, dst, dst_stride);
}
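
/* 4-wide, height a multiple of 8: each iteration filters 8 rows (two rows per
 * shuffle) and packs four 4-pixel rows into each of the two vectors that feed
 * the word-by-word ST_W8 store. */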
static void hevc_hz_biwgt_4t_4x8multiple_msa(uint8_t *src0_ptr,
                                             int32_t src_stride,
                                             int16_t *src1_ptr,
                                             int32_t src2_stride,
                                             uint8_t *dst,
                                             int32_t dst_stride,
                                             const int8_t *filter,
                                             int32_t height,
                                             int32_t weight0,
                                             int32_t weight1,
                                             int32_t offset0,
                                             int32_t offset1,
                                             int32_t rnd_val)
{
    uint32_t loop_cnt;
    int32_t weight, offset, constant;
    v8i16 filt0, filt1;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
    v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[16]);
    v16i8 mask1;
    v16i8 vec0, vec1;
    v8i16 dst0, dst1, dst2, dst3;
    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
    v8i16 filter_vec;
    v4i32 weight_vec, offset_vec, rnd_vec;

    src0_ptr -= 1;

    filter_vec = LD_SH(filter);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    offset = (offset0 + offset1) << rnd_val;
    weight0 = weight0 & 0x0000FFFF;
    weight = weight0 | (weight1 << 16);
    constant = 128 * weight1;
    constant <<= 6;
    offset += constant;

    offset_vec = __msa_fill_w(offset);
    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val + 1);

    mask1 = mask0 + 2;

    for (loop_cnt = (height >> 3); loop_cnt--;) {
        LD_SB8(src0_ptr, src_stride,
               src0, src1, src2, src3, src4, src5, src6, src7);
        src0_ptr += (8 * src_stride);
        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
        src1_ptr += (4 * src2_stride);
        LD_SH4(src1_ptr, src2_stride, in4, in5, in6, in7);
        src1_ptr += (4 * src2_stride);
        ILVR_D2_SH(in1, in0, in3, in2, in0, in1);
        ILVR_D2_SH(in5, in4, in7, in6, in2, in3);
        XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);

        VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0, vec1);
        dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
        VSHF_B2_SB(src2, src3, src2, src3, mask0, mask1, vec0, vec1);
        dst1 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
        VSHF_B2_SB(src4, src5, src4, src5, mask0, mask1, vec0, vec1);
        dst2 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
        VSHF_B2_SB(src6, src7, src6, src7, mask0, mask1, vec0, vec1);
        dst3 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);

        HEVC_BIW_RND_CLIP4(dst0, dst1, dst2, dst3,
                           in0, in1, in2, in3,
                           weight_vec, rnd_vec, offset_vec,
                           dst0, dst1, dst2, dst3);

        PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
        ST_W8(dst0, dst1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
        dst += (8 * dst_stride);
    }
}
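
/* 4-wide horizontal dispatcher: heights 2 and 4 use the dedicated helpers
 * above, any multiple of 8 goes through the 4x8 loop; other heights are not
 * expected here and would leave the destination untouched. */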
static void hevc_hz_biwgt_4t_4w_msa(uint8_t *src0_ptr,
                                    int32_t src_stride,
                                    int16_t *src1_ptr,
                                    int32_t src2_stride,
                                    uint8_t *dst,
                                    int32_t dst_stride,
                                    const int8_t *filter,
                                    int32_t height,
                                    int32_t weight0,
                                    int32_t weight1,
                                    int32_t offset0,
                                    int32_t offset1,
                                    int32_t rnd_val)
{
    if (2 == height) {
        hevc_hz_biwgt_4t_4x2_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
                                 dst, dst_stride, filter,
                                 weight0, weight1, offset0, offset1, rnd_val);
    } else if (4 == height) {
        hevc_hz_biwgt_4t_4x4_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
                                 dst, dst_stride, filter,
                                 weight0, weight1, offset0, offset1, rnd_val);
    } else if (0 == (height % 8)) {
        hevc_hz_biwgt_4t_4x8multiple_msa(src0_ptr, src_stride,
                                         src1_ptr, src2_stride,
                                         dst, dst_stride, filter, height,
                                         weight0, weight1, offset0, offset1,
                                         rnd_val);
    }
}
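
/* 6-wide horizontal: full 8-pixel rows are filtered, but each row is stored
 * as one 4-byte word (ST_W2) plus one 2-byte halfword (ST_H2) so only 6 bytes
 * per row are written. */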
  2553. static void hevc_hz_biwgt_4t_6w_msa(uint8_t *src0_ptr,
  2554. int32_t src_stride,
  2555. int16_t *src1_ptr,
  2556. int32_t src2_stride,
  2557. uint8_t *dst,
  2558. int32_t dst_stride,
  2559. const int8_t *filter,
  2560. int32_t height,
  2561. int32_t weight0,
  2562. int32_t weight1,
  2563. int32_t offset0,
  2564. int32_t offset1,
  2565. int32_t rnd_val)
  2566. {
  2567. uint32_t loop_cnt;
  2568. int32_t offset, weight, constant;
  2569. v8i16 filt0, filt1;
  2570. v16i8 src0, src1, src2, src3;
  2571. v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]);
  2572. v16i8 mask1;
  2573. v16i8 vec0, vec1;
  2574. v8i16 in0, in1, in2, in3;
  2575. v8i16 dst0, dst1, dst2, dst3;
  2576. v8i16 filter_vec;
  2577. v4i32 weight_vec, offset_vec, rnd_vec;
  2578. src0_ptr -= 1;
  2579. filter_vec = LD_SH(filter);
  2580. SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
  2581. offset = (offset0 + offset1) << rnd_val;
  2582. weight0 = weight0 & 0x0000FFFF;
  2583. weight = weight0 | (weight1 << 16);
  2584. constant = 128 * weight1;
  2585. constant <<= 6;
  2586. offset += constant;
  2587. offset_vec = __msa_fill_w(offset);
  2588. weight_vec = __msa_fill_w(weight);
  2589. rnd_vec = __msa_fill_w(rnd_val + 1);
  2590. mask1 = mask0 + 2;
  2591. for (loop_cnt = 2; loop_cnt--;) {
  2592. LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3);
  2593. src0_ptr += (4 * src_stride);
  2594. LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
  2595. src1_ptr += (4 * src2_stride);
  2596. XORI_B4_128_SB(src0, src1, src2, src3);
  2597. VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
  2598. dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
  2599. VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
  2600. dst1 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
  2601. VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1);
  2602. dst2 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
  2603. VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
  2604. dst3 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
  2605. HEVC_BIW_RND_CLIP4(dst0, dst1, dst2, dst3,
  2606. in0, in1, in2, in3,
  2607. weight_vec, rnd_vec, offset_vec,
  2608. dst0, dst1, dst2, dst3);
  2609. PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
  2610. ST_W2(dst0, 0, 2, dst, dst_stride);
  2611. ST_H2(dst0, 2, 6, dst + 4, dst_stride);
  2612. ST_W2(dst1, 0, 2, dst + 2 * dst_stride, dst_stride);
  2613. ST_H2(dst1, 2, 6, dst + 2 * dst_stride + 4, dst_stride);
  2614. dst += (4 * dst_stride);
  2615. }
  2616. }
  2617. static void hevc_hz_biwgt_4t_8x2_msa(uint8_t *src0_ptr,
  2618. int32_t src_stride,
  2619. int16_t *src1_ptr,
  2620. int32_t src2_stride,
  2621. uint8_t *dst,
  2622. int32_t dst_stride,
  2623. const int8_t *filter,
  2624. int32_t weight0,
  2625. int32_t weight1,
  2626. int32_t offset0,
  2627. int32_t offset1,
  2628. int32_t rnd_val)
  2629. {
  2630. int32_t offset, weight, constant;
  2631. v8i16 filt0, filt1;
  2632. v16i8 src0, src1;
  2633. v8i16 in0, in1;
  2634. v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]);
  2635. v16i8 mask1, vec0, vec1;
  2636. v8i16 dst0, dst1;
  2637. v8i16 filter_vec;
  2638. v4i32 weight_vec, offset_vec, rnd_vec;
  2639. src0_ptr -= 1;
  2640. filter_vec = LD_SH(filter);
  2641. SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
  2642. offset = (offset0 + offset1) << rnd_val;
  2643. weight0 = weight0 & 0x0000FFFF;
  2644. weight = weight0 | (weight1 << 16);
  2645. constant = 128 * weight1;
  2646. constant <<= 6;
  2647. offset += constant;
  2648. offset_vec = __msa_fill_w(offset);
  2649. weight_vec = __msa_fill_w(weight);
  2650. rnd_vec = __msa_fill_w(rnd_val + 1);
  2651. mask1 = mask0 + 2;
  2652. LD_SB2(src0_ptr, src_stride, src0, src1);
  2653. LD_SH2(src1_ptr, src2_stride, in0, in1);
  2654. XORI_B2_128_SB(src0, src1);
  2655. VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
  2656. dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
  2657. VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
  2658. dst1 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
  2659. HEVC_BIW_RND_CLIP2(dst0, dst1, in0, in1,
  2660. weight_vec, rnd_vec, offset_vec,
  2661. dst0, dst1);
  2662. dst0 = (v8i16) __msa_pckev_b((v16i8) dst1, (v16i8) dst0);
  2663. ST_D2(dst0, 0, 1, dst, dst_stride);
  2664. }
  2665. static void hevc_hz_biwgt_4t_8x6_msa(uint8_t *src0_ptr,
  2666. int32_t src_stride,
  2667. int16_t *src1_ptr,
  2668. int32_t src2_stride,
  2669. uint8_t *dst,
  2670. int32_t dst_stride,
  2671. const int8_t *filter,
  2672. int32_t weight0,
  2673. int32_t weight1,
  2674. int32_t offset0,
  2675. int32_t offset1,
  2676. int32_t rnd_val)
  2677. {
  2678. int32_t weight, offset, constant;
  2679. v8i16 filt0, filt1;
  2680. v16i8 src0, src1, src2, src3, src4, src5;
  2681. v8i16 in0, in1, in2, in3, in4, in5;
  2682. v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]);
  2683. v16i8 mask1;
  2684. v16i8 vec0, vec1;
  2685. v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
  2686. v8i16 filter_vec;
  2687. v4i32 weight_vec, offset_vec, rnd_vec;
  2688. src0_ptr -= 1;
  2689. filter_vec = LD_SH(filter);
  2690. SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
  2691. offset = (offset0 + offset1) << rnd_val;
  2692. weight0 = weight0 & 0x0000FFFF;
  2693. weight = weight0 | (weight1 << 16);
  2694. constant = 128 * weight1;
  2695. constant <<= 6;
  2696. offset += constant;
  2697. offset_vec = __msa_fill_w(offset);
  2698. weight_vec = __msa_fill_w(weight);
  2699. rnd_vec = __msa_fill_w(rnd_val + 1);
  2700. mask1 = mask0 + 2;
  2701. LD_SB6(src0_ptr, src_stride, src0, src1, src2, src3, src4, src5);
  2702. LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
  2703. src1_ptr += (4 * src2_stride);
  2704. LD_SH2(src1_ptr, src2_stride, in4, in5);
  2705. XORI_B6_128_SB(src0, src1, src2, src3, src4, src5);
  2706. VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
  2707. dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
  2708. VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
  2709. dst1 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
  2710. VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1);
  2711. dst2 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
  2712. VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
  2713. dst3 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
  2714. VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
  2715. dst4 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
  2716. VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1);
  2717. dst5 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
  2718. HEVC_BIW_RND_CLIP4(dst0, dst1, dst2, dst3,
  2719. in0, in1, in2, in3,
  2720. weight_vec, rnd_vec, offset_vec,
  2721. dst0, dst1, dst2, dst3);
  2722. HEVC_BIW_RND_CLIP2(dst4, dst5, in4, in5,
  2723. weight_vec, rnd_vec, offset_vec,
  2724. dst4, dst5);
  2725. PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
  2726. dst3 = (v8i16) __msa_pckev_b((v16i8) dst5, (v16i8) dst4);
  2727. ST_D4(dst0, dst1, 0, 1, 0, 1, dst, dst_stride);
  2728. ST_D2(dst3, 0, 1, dst + 4 * dst_stride, dst_stride);
  2729. }
  2730. static void hevc_hz_biwgt_4t_8x4multiple_msa(uint8_t *src0_ptr,
  2731. int32_t src_stride,
  2732. int16_t *src1_ptr,
  2733. int32_t src2_stride,
  2734. uint8_t *dst,
  2735. int32_t dst_stride,
  2736. const int8_t *filter,
  2737. int32_t height,
  2738. int32_t weight0,
  2739. int32_t weight1,
  2740. int32_t offset0,
  2741. int32_t offset1,
  2742. int32_t rnd_val)
  2743. {
  2744. uint32_t loop_cnt;
  2745. int32_t offset, weight, constant;
  2746. v8i16 filt0, filt1;
  2747. v16i8 src0, src1, src2, src3;
  2748. v16i8 mask0 = LD_SB(ff_hevc_mask_arr);
  2749. v16i8 mask1;
  2750. v16i8 vec0, vec1;
  2751. v8i16 in0, in1, in2, in3;
  2752. v8i16 dst0, dst1, dst2, dst3;
  2753. v8i16 filter_vec;
  2754. v4i32 weight_vec, offset_vec, rnd_vec;
  2755. src0_ptr -= 1;
  2756. filter_vec = LD_SH(filter);
  2757. SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
  2758. offset = (offset0 + offset1) << rnd_val;
  2759. weight0 = weight0 & 0x0000FFFF;
  2760. weight = weight0 | (weight1 << 16);
  2761. constant = 128 * weight1;
  2762. constant <<= 6;
  2763. offset += constant;
  2764. offset_vec = __msa_fill_w(offset);
  2765. weight_vec = __msa_fill_w(weight);
  2766. rnd_vec = __msa_fill_w(rnd_val + 1);
  2767. mask1 = mask0 + 2;
  2768. for (loop_cnt = (height >> 2); loop_cnt--;) {
  2769. LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3);
  2770. src0_ptr += (4 * src_stride);
  2771. LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
  2772. src1_ptr += (4 * src2_stride);
  2773. XORI_B4_128_SB(src0, src1, src2, src3);
  2774. VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
  2775. dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
  2776. VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
  2777. dst1 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
  2778. VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1);
  2779. dst2 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
  2780. VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
  2781. dst3 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
  2782. HEVC_BIW_RND_CLIP4(dst0, dst1, dst2, dst3,
  2783. in0, in1, in2, in3,
  2784. weight_vec, rnd_vec, offset_vec,
  2785. dst0, dst1, dst2, dst3);
  2786. PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
  2787. ST_D4(dst0, dst1, 0, 1, 0, 1, dst, dst_stride);
  2788. dst += (4 * dst_stride);
  2789. }
  2790. }
  2791. static void hevc_hz_biwgt_4t_8w_msa(uint8_t *src0_ptr,
  2792. int32_t src_stride,
  2793. int16_t *src1_ptr,
  2794. int32_t src2_stride,
  2795. uint8_t *dst,
  2796. int32_t dst_stride,
  2797. const int8_t *filter,
  2798. int32_t height,
  2799. int32_t weight0,
  2800. int32_t weight1,
  2801. int32_t offset0,
  2802. int32_t offset1,
  2803. int32_t rnd_val)
  2804. {
  2805. if (2 == height) {
  2806. hevc_hz_biwgt_4t_8x2_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
  2807. dst, dst_stride, filter,
  2808. weight0, weight1, offset0, offset1, rnd_val);
  2809. } else if (6 == height) {
  2810. hevc_hz_biwgt_4t_8x6_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
  2811. dst, dst_stride, filter,
  2812. weight0, weight1, offset0, offset1, rnd_val);
  2813. } else if (0 == (height % 4)) {
  2814. hevc_hz_biwgt_4t_8x4multiple_msa(src0_ptr, src_stride,
  2815. src1_ptr, src2_stride,
  2816. dst, dst_stride, filter, height,
  2817. weight0, weight1, offset0, offset1,
  2818. rnd_val);
  2819. }
  2820. }
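
/* 12-wide horizontal: the left 8 columns use the usual single-source shuffles
 * (mask0/mask1), while mask2/mask3 pull the remaining 4-column windows of two
 * neighbouring rows out of a pair of source vectors, so one extra 4-tap pass
 * covers the right edge of two rows at once; the packed results are stored
 * with ST_W4 at dst + 8. */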
  2821. static void hevc_hz_biwgt_4t_12w_msa(uint8_t *src0_ptr,
  2822. int32_t src_stride,
  2823. int16_t *src1_ptr,
  2824. int32_t src2_stride,
  2825. uint8_t *dst,
  2826. int32_t dst_stride,
  2827. const int8_t *filter,
  2828. int32_t height,
  2829. int32_t weight0,
  2830. int32_t weight1,
  2831. int32_t offset0,
  2832. int32_t offset1,
  2833. int32_t rnd_val)
  2834. {
  2835. uint32_t loop_cnt;
  2836. int32_t offset, weight, constant;
  2837. v8i16 filt0, filt1;
  2838. v16i8 src0, src1, src2, src3;
  2839. v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
  2840. v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]);
  2841. v16i8 mask2 = {
  2842. 8, 9, 9, 10, 10, 11, 11, 12, 24, 25, 25, 26, 26, 27, 27, 28
  2843. };
  2844. v16i8 mask1, mask3;
  2845. v16i8 vec0, vec1;
  2846. v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
  2847. v8i16 filter_vec;
  2848. v4i32 weight_vec, offset_vec, rnd_vec;
  2849. src0_ptr -= 1;
  2850. filter_vec = LD_SH(filter);
  2851. SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
  2852. offset = (offset0 + offset1) << rnd_val;
  2853. weight0 = weight0 & 0x0000FFFF;
  2854. weight = weight0 | (weight1 << 16);
  2855. constant = 128 * weight1;
  2856. constant <<= 6;
  2857. offset += constant;
  2858. offset_vec = __msa_fill_w(offset);
  2859. weight_vec = __msa_fill_w(weight);
  2860. rnd_vec = __msa_fill_w(rnd_val + 1);
  2861. mask1 = mask0 + 2;
  2862. mask3 = mask2 + 2;
  2863. for (loop_cnt = 4; loop_cnt--;) {
  2864. LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3);
  2865. src0_ptr += (4 * src_stride);
  2866. LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
  2867. LD_SH4(src1_ptr + 8, src2_stride, in4, in5, in6, in7);
  2868. src1_ptr += (4 * src2_stride);
  2869. ILVR_D2_SH(in5, in4, in7, in6, in4, in5);
  2870. XORI_B4_128_SB(src0, src1, src2, src3);
  2871. VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
  2872. dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
  2873. VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
  2874. dst1 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
  2875. VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1);
  2876. dst2 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
  2877. VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
  2878. dst3 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
  2879. VSHF_B2_SB(src0, src1, src0, src1, mask2, mask3, vec0, vec1);
  2880. dst4 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
  2881. VSHF_B2_SB(src2, src3, src2, src3, mask2, mask3, vec0, vec1);
  2882. dst5 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
  2883. HEVC_BIW_RND_CLIP4(dst0, dst1, dst2, dst3,
  2884. in0, in1, in2, in3,
  2885. weight_vec, rnd_vec, offset_vec,
  2886. dst0, dst1, dst2, dst3);
  2887. HEVC_BIW_RND_CLIP2(dst4, dst5, in4, in5,
  2888. weight_vec, rnd_vec, offset_vec,
  2889. dst4, dst5);
  2890. PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
  2891. dst3 = (v8i16) __msa_pckev_b((v16i8) dst5, (v16i8) dst4);
  2892. ST_D4(dst0, dst1, 0, 1, 0, 1, dst, dst_stride);
  2893. ST_W4(dst3, 0, 1, 2, 3, dst + 8, dst_stride);
  2894. dst += (4 * dst_stride);
  2895. }
  2896. }
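
/* 16-wide horizontal: each row is loaded as two 8-byte halves (src0_ptr and
 * src0_ptr + 8, src1_ptr and src1_ptr + 8) and four rows are handled per loop
 * iteration; the weighted halves are byte-packed back into full 16-byte rows
 * and stored two rows per ST_SH2. */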
  2897. static void hevc_hz_biwgt_4t_16w_msa(uint8_t *src0_ptr,
  2898. int32_t src_stride,
  2899. int16_t *src1_ptr,
  2900. int32_t src2_stride,
  2901. uint8_t *dst,
  2902. int32_t dst_stride,
  2903. const int8_t *filter,
  2904. int32_t height,
  2905. int32_t weight0,
  2906. int32_t weight1,
  2907. int32_t offset0,
  2908. int32_t offset1,
  2909. int32_t rnd_val)
  2910. {
  2911. uint32_t loop_cnt;
  2912. int32_t offset, weight, constant;
  2913. v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
  2914. v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
  2915. v8i16 filt0, filt1;
  2916. v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]);
  2917. v16i8 mask1;
  2918. v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
  2919. v16i8 vec0, vec1;
  2920. v8i16 filter_vec;
  2921. v4i32 weight_vec, offset_vec, rnd_vec;
  2922. src0_ptr -= 1;
  2923. filter_vec = LD_SH(filter);
  2924. SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
  2925. offset = (offset0 + offset1) << rnd_val;
  2926. weight0 = weight0 & 0x0000FFFF;
  2927. weight = weight0 | (weight1 << 16);
  2928. constant = 128 * weight1;
  2929. constant <<= 6;
  2930. offset += constant;
  2931. offset_vec = __msa_fill_w(offset);
  2932. weight_vec = __msa_fill_w(weight);
  2933. rnd_vec = __msa_fill_w(rnd_val + 1);
  2934. mask1 = mask0 + 2;
  2935. for (loop_cnt = (height >> 2); loop_cnt--;) {
  2936. LD_SB4(src0_ptr, src_stride, src0, src2, src4, src6);
  2937. LD_SB4(src0_ptr + 8, src_stride, src1, src3, src5, src7);
  2938. src0_ptr += (4 * src_stride);
  2939. LD_SH4(src1_ptr, src2_stride, in0, in2, in4, in6);
  2940. LD_SH4(src1_ptr + 8, src2_stride, in1, in3, in5, in7);
  2941. src1_ptr += (4 * src2_stride);
  2942. XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
  2943. VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
  2944. dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
  2945. VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
  2946. dst1 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
  2947. VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1);
  2948. dst2 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
  2949. VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
  2950. dst3 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
  2951. VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
  2952. dst4 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
  2953. VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1);
  2954. dst5 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
  2955. VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec0, vec1);
  2956. dst6 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
  2957. VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec0, vec1);
  2958. dst7 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
  2959. HEVC_BIW_RND_CLIP4(dst0, dst1, dst2, dst3,
  2960. in0, in1, in2, in3,
  2961. weight_vec, rnd_vec, offset_vec,
  2962. dst0, dst1, dst2, dst3);
  2963. PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
  2964. ST_SH2(dst0, dst1, dst, dst_stride);
  2965. dst += (2 * dst_stride);
  2966. HEVC_BIW_RND_CLIP4(dst4, dst5, dst6, dst7,
  2967. in4, in5, in6, in7,
  2968. weight_vec, rnd_vec, offset_vec,
  2969. dst0, dst1, dst2, dst3);
  2970. PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
  2971. ST_SH2(dst0, dst1, dst, dst_stride);
  2972. dst += (2 * dst_stride);
  2973. }
  2974. }
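
/* 24-wide horizontal: per pair of rows, the first 16 columns follow the
 * 16-wide scheme, with mask2/mask3 (mask0 + 8 / + 10) selecting the window
 * that straddles the two 16-byte source vectors; the trailing 8 columns are
 * handled in a separate pass and stored at dst + 16. */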
  2975. static void hevc_hz_biwgt_4t_24w_msa(uint8_t *src0_ptr,
  2976. int32_t src_stride,
  2977. int16_t *src1_ptr,
  2978. int32_t src2_stride,
  2979. uint8_t *dst,
  2980. int32_t dst_stride,
  2981. const int8_t *filter,
  2982. int32_t height,
  2983. int32_t weight0,
  2984. int32_t weight1,
  2985. int32_t offset0,
  2986. int32_t offset1,
  2987. int32_t rnd_val)
  2988. {
  2989. uint32_t loop_cnt;
  2990. int32_t offset, weight, constant;
  2991. v16i8 src0, src1, src2, src3;
  2992. v8i16 filt0, filt1;
  2993. v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]);
  2994. v16i8 mask1, mask2, mask3;
  2995. v16i8 vec0, vec1;
  2996. v8i16 dst0, dst1, dst2, dst3;
  2997. v8i16 in0, in1, in2, in3, in4, in5;
  2998. v8i16 filter_vec;
  2999. v4i32 weight_vec, offset_vec, rnd_vec;
  3000. src0_ptr -= 1;
  3001. filter_vec = LD_SH(filter);
  3002. SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
  3003. offset = (offset0 + offset1) << rnd_val;
  3004. weight0 = weight0 & 0x0000FFFF;
  3005. weight = weight0 | (weight1 << 16);
  3006. constant = 128 * weight1;
  3007. constant <<= 6;
  3008. offset += constant;
  3009. offset_vec = __msa_fill_w(offset);
  3010. weight_vec = __msa_fill_w(weight);
  3011. rnd_vec = __msa_fill_w(rnd_val + 1);
  3012. mask1 = mask0 + 2;
  3013. mask2 = mask0 + 8;
  3014. mask3 = mask0 + 10;
  3015. for (loop_cnt = 16; loop_cnt--;) {
  3016. LD_SB2(src0_ptr, src_stride, src0, src2);
  3017. LD_SB2(src0_ptr + 16, src_stride, src1, src3);
  3018. src0_ptr += (2 * src_stride);
  3019. LD_SH2(src1_ptr, src2_stride, in0, in2);
  3020. LD_SH2(src1_ptr + 8, src2_stride, in1, in3);
  3021. LD_SH2(src1_ptr + 16, src2_stride, in4, in5);
  3022. src1_ptr += (2 * src2_stride);
  3023. XORI_B4_128_SB(src0, src1, src2, src3);
  3024. VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
  3025. dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
  3026. VSHF_B2_SB(src0, src1, src0, src1, mask2, mask3, vec0, vec1);
  3027. dst1 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
  3028. VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1);
  3029. dst2 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
  3030. VSHF_B2_SB(src2, src3, src2, src3, mask2, mask3, vec0, vec1);
  3031. dst3 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
  3032. HEVC_BIW_RND_CLIP4(dst0, dst1, dst2, dst3,
  3033. in0, in1, in2, in3,
  3034. weight_vec, rnd_vec, offset_vec,
  3035. dst0, dst1, dst2, dst3);
  3036. PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
  3037. ST_SH2(dst0, dst1, dst, dst_stride);
  3038. /* 8 width */
  3039. VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
  3040. dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
  3041. VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
  3042. dst1 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
  3043. HEVC_BIW_RND_CLIP2(dst0, dst1, in4, in5,
  3044. weight_vec, rnd_vec, offset_vec,
  3045. dst0, dst1);
  3046. dst0 = (v8i16) __msa_pckev_b((v16i8) dst1, (v16i8) dst0);
  3047. ST_D2(dst0, 0, 1, (dst + 16), dst_stride);
  3048. dst += (2 * dst_stride);
  3049. }
  3050. }
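
/* 32-wide horizontal: one row per iteration.  Besides the two 16-byte loads,
 * an overlapping load at src0_ptr + 24 supplies the bytes past the second
 * vector that the last 8 output columns need, so their shuffle can use
 * mask0/mask1 on a single source vector. */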
  3051. static void hevc_hz_biwgt_4t_32w_msa(uint8_t *src0_ptr,
  3052. int32_t src_stride,
  3053. int16_t *src1_ptr,
  3054. int32_t src2_stride,
  3055. uint8_t *dst,
  3056. int32_t dst_stride,
  3057. const int8_t *filter,
  3058. int32_t height,
  3059. int32_t weight0,
  3060. int32_t weight1,
  3061. int32_t offset0,
  3062. int32_t offset1,
  3063. int32_t rnd_val)
  3064. {
  3065. uint32_t loop_cnt;
  3066. int32_t offset, weight, constant;
  3067. v16i8 src0, src1, src2;
  3068. v8i16 filt0, filt1;
  3069. v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]);
  3070. v16i8 mask1, mask2, mask3;
  3071. v8i16 dst0, dst1, dst2, dst3;
  3072. v16i8 vec0, vec1;
  3073. v8i16 in0, in1, in2, in3;
  3074. v8i16 filter_vec;
  3075. v4i32 weight_vec, offset_vec, rnd_vec;
  3076. src0_ptr -= 1;
  3077. filter_vec = LD_SH(filter);
  3078. SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
  3079. offset = (offset0 + offset1) << rnd_val;
  3080. weight0 = weight0 & 0x0000FFFF;
  3081. weight = weight0 | (weight1 << 16);
  3082. constant = 128 * weight1;
  3083. constant <<= 6;
  3084. offset += constant;
  3085. offset_vec = __msa_fill_w(offset);
  3086. weight_vec = __msa_fill_w(weight);
  3087. rnd_vec = __msa_fill_w(rnd_val + 1);
  3088. mask1 = mask0 + 2;
  3089. mask2 = mask0 + 8;
  3090. mask3 = mask0 + 10;
  3091. for (loop_cnt = height; loop_cnt--;) {
  3092. LD_SB2(src0_ptr, 16, src0, src1);
  3093. src2 = LD_SB(src0_ptr + 24);
  3094. src0_ptr += src_stride;
  3095. LD_SH4(src1_ptr, 8, in0, in1, in2, in3);
  3096. src1_ptr += src2_stride;
  3097. XORI_B3_128_SB(src0, src1, src2);
  3098. VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
  3099. dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
  3100. VSHF_B2_SB(src0, src1, src0, src1, mask2, mask3, vec0, vec1);
  3101. dst1 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
  3102. VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
  3103. dst2 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
  3104. VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1);
  3105. dst3 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
  3106. HEVC_BIW_RND_CLIP4(dst0, dst1, dst2, dst3,
  3107. in0, in1, in2, in3,
  3108. weight_vec, rnd_vec, offset_vec,
  3109. dst0, dst1, dst2, dst3);
  3110. PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
  3111. ST_SH2(dst0, dst1, dst, 16);
  3112. dst += dst_stride;
  3113. }
  3114. }
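
/* Vertical 4-tap bi-weighted paths start here.  Consecutive rows are byte
 * interleaved (ILVR_B2_SB / ILVL_B2_SB) so the vertical taps become the same
 * kind of dot product used by the horizontal code; for 4-wide blocks two
 * interleaved row pairs are additionally packed into one vector with
 * __msa_ilvr_d.  Weighting, rounding and clipping are identical to the
 * horizontal case above. */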
static void hevc_vt_biwgt_4t_4x2_msa(uint8_t *src0_ptr,
                                     int32_t src_stride,
                                     int16_t *src1_ptr,
                                     int32_t src2_stride,
                                     uint8_t *dst,
                                     int32_t dst_stride,
                                     const int8_t *filter,
                                     int32_t weight0,
                                     int32_t weight1,
                                     int32_t offset0,
                                     int32_t offset1,
                                     int32_t rnd_val)
{
    int32_t weight, offset, constant;
    v16i8 src0, src1, src2, src3, src4;
    v8i16 in0, in1, dst10;
    v16i8 src10_r, src32_r, src21_r, src43_r, src2110, src4332;
    v4i32 dst10_r, dst10_l;
    v8i16 filt0, filt1;
    v8i16 filter_vec, out;
    v4i32 weight_vec, offset_vec, rnd_vec;

    src0_ptr -= src_stride;

    offset = (offset0 + offset1) << rnd_val;
    weight0 = weight0 & 0x0000FFFF;
    weight = weight0 | (weight1 << 16);
    constant = 128 * weight1;
    constant <<= 6;
    offset += constant;

    offset_vec = __msa_fill_w(offset);
    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val + 1);

    filter_vec = LD_SH(filter);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    LD_SB3(src0_ptr, src_stride, src0, src1, src2);
    src0_ptr += (3 * src_stride);
    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
    src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_r, (v2i64) src10_r);
    src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);
    LD_SB2(src0_ptr, src_stride, src3, src4);
    src0_ptr += (2 * src_stride);
    LD_SH2(src1_ptr, src2_stride, in0, in1);
    src1_ptr += (2 * src2_stride);
    in0 = (v8i16) __msa_ilvr_d((v2i64) in1, (v2i64) in0);
    ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
    src4332 = (v16i8) __msa_ilvr_d((v2i64) src43_r, (v2i64) src32_r);
    src4332 = (v16i8) __msa_xori_b((v16u8) src4332, 128);

    dst10 = HEVC_FILT_4TAP_SH(src2110, src4332, filt0, filt1);

    ILVRL_H2_SW(dst10, in0, dst10_r, dst10_l);
    dst10_r = __msa_dpadd_s_w(offset_vec, (v8i16) dst10_r, (v8i16) weight_vec);
    dst10_l = __msa_dpadd_s_w(offset_vec, (v8i16) dst10_l, (v8i16) weight_vec);
    SRAR_W2_SW(dst10_r, dst10_l, rnd_vec);
    dst10_r = (v4i32) __msa_pckev_h((v8i16) dst10_l, (v8i16) dst10_r);
    out = CLIP_SH_0_255(dst10_r);
    out = (v8i16) __msa_pckev_b((v16i8) out, (v16i8) out);
    ST_W2(out, 0, 1, dst, dst_stride);
}
  3171. static void hevc_vt_biwgt_4t_4x4_msa(uint8_t *src0_ptr,
  3172. int32_t src_stride,
  3173. int16_t *src1_ptr,
  3174. int32_t src2_stride,
  3175. uint8_t *dst,
  3176. int32_t dst_stride,
  3177. const int8_t *filter,
  3178. int32_t weight0,
  3179. int32_t weight1,
  3180. int32_t offset0,
  3181. int32_t offset1,
  3182. int32_t rnd_val)
  3183. {
  3184. int32_t weight, offset, constant;
  3185. v16i8 src0, src1, src2, src3, src4, src5, src6;
  3186. v8i16 in0, in1, in2, in3;
  3187. v16i8 src10_r, src32_r, src54_r, src21_r, src43_r, src65_r;
  3188. v16i8 src2110, src4332, src6554;
  3189. v8i16 dst10, dst32;
  3190. v8i16 filt0, filt1;
  3191. v8i16 filter_vec;
  3192. v4i32 weight_vec, offset_vec, rnd_vec;
  3193. src0_ptr -= src_stride;
  3194. offset = (offset0 + offset1) << rnd_val;
  3195. weight0 = weight0 & 0x0000FFFF;
  3196. weight = weight0 | (weight1 << 16);
  3197. constant = 128 * weight1;
  3198. constant <<= 6;
  3199. offset += constant;
  3200. offset_vec = __msa_fill_w(offset);
  3201. weight_vec = __msa_fill_w(weight);
  3202. rnd_vec = __msa_fill_w(rnd_val + 1);
  3203. filter_vec = LD_SH(filter);
  3204. SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
  3205. LD_SB3(src0_ptr, src_stride, src0, src1, src2);
  3206. src0_ptr += (3 * src_stride);
  3207. ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
  3208. src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_r, (v2i64) src10_r);
  3209. src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);
  3210. LD_SB4(src0_ptr, src_stride, src3, src4, src5, src6);
  3211. src0_ptr += (4 * src_stride);
  3212. LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
  3213. src1_ptr += (4 * src2_stride);
  3214. ILVR_D2_SH(in1, in0, in3, in2, in0, in1);
  3215. ILVR_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5,
  3216. src32_r, src43_r, src54_r, src65_r);
  3217. ILVR_D2_SB(src43_r, src32_r, src65_r, src54_r, src4332, src6554);
  3218. XORI_B2_128_SB(src4332, src6554);
  3219. dst10 = HEVC_FILT_4TAP_SH(src2110, src4332, filt0, filt1);
  3220. dst32 = HEVC_FILT_4TAP_SH(src4332, src6554, filt0, filt1);
  3221. HEVC_BIW_RND_CLIP2(dst10, dst32, in0, in1,
  3222. weight_vec, rnd_vec, offset_vec,
  3223. dst10, dst32);
  3224. dst10 = (v8i16) __msa_pckev_b((v16i8) dst32, (v16i8) dst10);
  3225. ST_W4(dst10, 0, 1, 2, 3, dst, dst_stride);
  3226. dst += (4 * dst_stride);
  3227. }
  3228. static void hevc_vt_biwgt_4t_4x8multiple_msa(uint8_t *src0_ptr,
  3229. int32_t src_stride,
  3230. int16_t *src1_ptr,
  3231. int32_t src2_stride,
  3232. uint8_t *dst,
  3233. int32_t dst_stride,
  3234. const int8_t *filter,
  3235. int32_t height,
  3236. int32_t weight0,
  3237. int32_t weight1,
  3238. int32_t offset0,
  3239. int32_t offset1,
  3240. int32_t rnd_val)
  3241. {
  3242. uint32_t loop_cnt;
  3243. int32_t weight, offset, constant;
  3244. v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9;
  3245. v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
  3246. v16i8 src10_r, src32_r, src54_r, src76_r, src98_r;
  3247. v16i8 src21_r, src43_r, src65_r, src87_r, src109_r;
  3248. v16i8 src2110, src4332, src6554, src8776;
  3249. v8i16 dst10, dst32, dst54, dst76;
  3250. v8i16 filt0, filt1;
  3251. v8i16 filter_vec;
  3252. v4i32 weight_vec, offset_vec, rnd_vec;
  3253. src0_ptr -= src_stride;
  3254. offset = (offset0 + offset1) << rnd_val;
  3255. weight0 = weight0 & 0x0000FFFF;
  3256. weight = weight0 | (weight1 << 16);
  3257. constant = 128 * weight1;
  3258. constant <<= 6;
  3259. offset += constant;
  3260. offset_vec = __msa_fill_w(offset);
  3261. weight_vec = __msa_fill_w(weight);
  3262. rnd_vec = __msa_fill_w(rnd_val + 1);
  3263. filter_vec = LD_SH(filter);
  3264. SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
  3265. LD_SB3(src0_ptr, src_stride, src0, src1, src2);
  3266. src0_ptr += (3 * src_stride);
  3267. ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
  3268. src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_r, (v2i64) src10_r);
  3269. src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);
  3270. for (loop_cnt = (height >> 3); loop_cnt--;) {
  3271. LD_SB6(src0_ptr, src_stride, src3, src4, src5, src6, src7, src8);
  3272. src0_ptr += (6 * src_stride);
  3273. LD_SH8(src1_ptr, src2_stride, in0, in1, in2, in3, in4, in5, in6, in7);
  3274. src1_ptr += (8 * src2_stride);
  3275. ILVR_D2_SH(in1, in0, in3, in2, in0, in1);
  3276. ILVR_D2_SH(in5, in4, in7, in6, in2, in3);
  3277. ILVR_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5,
  3278. src32_r, src43_r, src54_r, src65_r);
  3279. ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
  3280. ILVR_D3_SB(src43_r, src32_r, src65_r, src54_r, src87_r, src76_r,
  3281. src4332, src6554, src8776);
  3282. XORI_B3_128_SB(src4332, src6554, src8776);
  3283. dst10 = HEVC_FILT_4TAP_SH(src2110, src4332, filt0, filt1);
  3284. dst32 = HEVC_FILT_4TAP_SH(src4332, src6554, filt0, filt1);
  3285. dst54 = HEVC_FILT_4TAP_SH(src6554, src8776, filt0, filt1);
  3286. LD_SB2(src0_ptr, src_stride, src9, src2);
  3287. src0_ptr += (2 * src_stride);
  3288. ILVR_B2_SB(src9, src8, src2, src9, src98_r, src109_r);
  3289. src2110 = (v16i8) __msa_ilvr_d((v2i64) src109_r, (v2i64) src98_r);
  3290. src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);
  3291. dst76 = HEVC_FILT_4TAP_SH(src8776, src2110, filt0, filt1);
  3292. HEVC_BIW_RND_CLIP4(dst10, dst32, dst54, dst76,
  3293. in0, in1, in2, in3,
  3294. weight_vec, rnd_vec, offset_vec,
  3295. dst10, dst32, dst54, dst76);
  3296. PCKEV_B2_SH(dst32, dst10, dst76, dst54, dst10, dst32);
  3297. ST_W8(dst10, dst32, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
  3298. dst += (8 * dst_stride);
  3299. }
  3300. }
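
/* 4-wide vertical dispatcher, mirroring the horizontal one: height 2,
 * height 4, or any multiple of 8. */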
  3301. static void hevc_vt_biwgt_4t_4w_msa(uint8_t *src0_ptr,
  3302. int32_t src_stride,
  3303. int16_t *src1_ptr,
  3304. int32_t src2_stride,
  3305. uint8_t *dst,
  3306. int32_t dst_stride,
  3307. const int8_t *filter,
  3308. int32_t height,
  3309. int32_t weight0,
  3310. int32_t weight1,
  3311. int32_t offset0,
  3312. int32_t offset1,
  3313. int32_t rnd_val)
  3314. {
  3315. if (2 == height) {
  3316. hevc_vt_biwgt_4t_4x2_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
  3317. dst, dst_stride, filter,
  3318. weight0, weight1, offset0, offset1, rnd_val);
  3319. } else if (4 == height) {
  3320. hevc_vt_biwgt_4t_4x4_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
  3321. dst, dst_stride, filter,
  3322. weight0, weight1, offset0, offset1, rnd_val);
  3323. } else if (0 == (height % 8)) {
  3324. hevc_vt_biwgt_4t_4x8multiple_msa(src0_ptr, src_stride,
  3325. src1_ptr, src2_stride,
  3326. dst, dst_stride, filter, height,
  3327. weight0, weight1, offset0, offset1,
  3328. rnd_val);
  3329. }
  3330. }
  3331. static void hevc_vt_biwgt_4t_6w_msa(uint8_t *src0_ptr,
  3332. int32_t src_stride,
  3333. int16_t *src1_ptr,
  3334. int32_t src2_stride,
  3335. uint8_t *dst,
  3336. int32_t dst_stride,
  3337. const int8_t *filter,
  3338. int32_t height,
  3339. int32_t weight0,
  3340. int32_t weight1,
  3341. int32_t offset0,
  3342. int32_t offset1,
  3343. int32_t rnd_val)
  3344. {
  3345. uint32_t loop_cnt;
  3346. int32_t offset, weight, constant;
  3347. v16i8 src0, src1, src2, src3, src4;
  3348. v8i16 in0, in1, in2, in3;
  3349. v16i8 src10_r, src32_r, src21_r, src43_r;
  3350. v8i16 tmp0, tmp1, tmp2, tmp3;
  3351. v8i16 filt0, filt1;
  3352. v8i16 filter_vec;
  3353. v4i32 weight_vec, offset_vec, rnd_vec;
  3354. src0_ptr -= src_stride;
  3355. offset = (offset0 + offset1) << rnd_val;
  3356. weight0 = weight0 & 0x0000FFFF;
  3357. weight = weight0 | (weight1 << 16);
  3358. constant = 128 * weight1;
  3359. constant <<= 6;
  3360. offset += constant;
  3361. offset_vec = __msa_fill_w(offset);
  3362. weight_vec = __msa_fill_w(weight);
  3363. rnd_vec = __msa_fill_w(rnd_val + 1);
  3364. filter_vec = LD_SH(filter);
  3365. SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
  3366. LD_SB3(src0_ptr, src_stride, src0, src1, src2);
  3367. src0_ptr += (3 * src_stride);
  3368. XORI_B3_128_SB(src0, src1, src2);
  3369. ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
  3370. for (loop_cnt = (height >> 2); loop_cnt--;) {
  3371. LD_SB2(src0_ptr, src_stride, src3, src4);
  3372. src0_ptr += (2 * src_stride);
  3373. LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
  3374. src1_ptr += (4 * src2_stride);
  3375. XORI_B2_128_SB(src3, src4);
  3376. ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
  3377. tmp0 = HEVC_FILT_4TAP_SH(src10_r, src32_r, filt0, filt1);
  3378. tmp1 = HEVC_FILT_4TAP_SH(src21_r, src43_r, filt0, filt1);
  3379. LD_SB2(src0_ptr, src_stride, src1, src2);
  3380. src0_ptr += (2 * src_stride);
  3381. XORI_B2_128_SB(src1, src2);
  3382. ILVR_B2_SB(src1, src4, src2, src1, src10_r, src21_r);
  3383. tmp2 = HEVC_FILT_4TAP_SH(src32_r, src10_r, filt0, filt1);
  3384. tmp3 = HEVC_FILT_4TAP_SH(src43_r, src21_r, filt0, filt1);
  3385. HEVC_BIW_RND_CLIP4(tmp0, tmp1, tmp2, tmp3,
  3386. in0, in1, in2, in3,
  3387. weight_vec, rnd_vec, offset_vec,
  3388. tmp0, tmp1, tmp2, tmp3);
  3389. PCKEV_B2_SH(tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
  3390. ST_W2(tmp0, 0, 2, dst, dst_stride);
  3391. ST_H2(tmp0, 2, 6, dst + 4, dst_stride);
  3392. ST_W2(tmp1, 0, 2, dst + 2 * dst_stride, dst_stride);
  3393. ST_H2(tmp1, 2, 6, dst + 2 * dst_stride + 4, dst_stride);
  3394. dst += (4 * dst_stride);
  3395. }
  3396. }
  3397. static void hevc_vt_biwgt_4t_8x2_msa(uint8_t *src0_ptr,
  3398. int32_t src_stride,
  3399. int16_t *src1_ptr,
  3400. int32_t src2_stride,
  3401. uint8_t *dst,
  3402. int32_t dst_stride,
  3403. const int8_t *filter,
  3404. int32_t weight0,
  3405. int32_t weight1,
  3406. int32_t offset0,
  3407. int32_t offset1,
  3408. int32_t rnd_val)
  3409. {
  3410. int32_t offset, weight, constant;
  3411. v16i8 src0, src1, src2, src3, src4;
  3412. v8i16 in0, in1, tmp0, tmp1;
  3413. v16i8 src10_r, src32_r, src21_r, src43_r;
  3414. v8i16 filt0, filt1;
  3415. v8i16 filter_vec;
  3416. v4i32 weight_vec, offset_vec, rnd_vec;
  3417. src0_ptr -= src_stride;
  3418. offset = (offset0 + offset1) << rnd_val;
  3419. weight0 = weight0 & 0x0000FFFF;
  3420. weight = weight0 | (weight1 << 16);
  3421. constant = 128 * weight1;
  3422. constant <<= 6;
  3423. offset += constant;
  3424. offset_vec = __msa_fill_w(offset);
  3425. weight_vec = __msa_fill_w(weight);
  3426. rnd_vec = __msa_fill_w(rnd_val + 1);
  3427. filter_vec = LD_SH(filter);
  3428. SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
  3429. LD_SB3(src0_ptr, src_stride, src0, src1, src2);
  3430. src0_ptr += (3 * src_stride);
  3431. XORI_B3_128_SB(src0, src1, src2);
  3432. ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
  3433. LD_SB2(src0_ptr, src_stride, src3, src4);
  3434. LD_SH2(src1_ptr, src2_stride, in0, in1);
  3435. XORI_B2_128_SB(src3, src4);
  3436. ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
  3437. tmp0 = HEVC_FILT_4TAP_SH(src10_r, src32_r, filt0, filt1);
  3438. tmp1 = HEVC_FILT_4TAP_SH(src21_r, src43_r, filt0, filt1);
  3439. HEVC_BIW_RND_CLIP2(tmp0, tmp1, in0, in1,
  3440. weight_vec, rnd_vec, offset_vec,
  3441. tmp0, tmp1);
  3442. tmp0 = (v8i16) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
  3443. ST_D2(tmp0, 0, 1, dst, dst_stride);
  3444. }
  3445. static void hevc_vt_biwgt_4t_8x6_msa(uint8_t *src0_ptr,
  3446. int32_t src_stride,
  3447. int16_t *src1_ptr,
  3448. int32_t src2_stride,
  3449. uint8_t *dst,
  3450. int32_t dst_stride,
  3451. const int8_t *filter,
  3452. int32_t weight0,
  3453. int32_t weight1,
  3454. int32_t offset0,
  3455. int32_t offset1,
  3456. int32_t rnd_val)
  3457. {
  3458. int32_t offset, weight, constant;
  3459. v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
  3460. v8i16 in0, in1, in2, in3, in4, in5;
  3461. v16i8 src10_r, src32_r, src54_r, src76_r;
  3462. v16i8 src21_r, src43_r, src65_r, src87_r;
  3463. v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
  3464. v8i16 filt0, filt1;
  3465. v8i16 filter_vec;
  3466. v4i32 weight_vec, offset_vec, rnd_vec;
  3467. src0_ptr -= src_stride;
  3468. offset = (offset0 + offset1) << rnd_val;
  3469. weight0 = weight0 & 0x0000FFFF;
  3470. weight = weight0 | (weight1 << 16);
  3471. constant = 128 * weight1;
  3472. constant <<= 6;
  3473. offset += constant;
  3474. offset_vec = __msa_fill_w(offset);
  3475. weight_vec = __msa_fill_w(weight);
  3476. rnd_vec = __msa_fill_w(rnd_val + 1);
  3477. filter_vec = LD_SH(filter);
  3478. SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
  3479. LD_SB3(src0_ptr, src_stride, src0, src1, src2);
  3480. src0_ptr += (3 * src_stride);
  3481. XORI_B3_128_SB(src0, src1, src2);
  3482. ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
  3483. LD_SB6(src0_ptr, src_stride, src3, src4, src5, src6, src7, src8);
  3484. LD_SH6(src1_ptr, src2_stride, in0, in1, in2, in3, in4, in5);
  3485. XORI_B6_128_SB(src3, src4, src5, src6, src7, src8);
  3486. ILVR_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5,
  3487. src32_r, src43_r, src54_r, src65_r);
  3488. ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
  3489. tmp0 = HEVC_FILT_4TAP_SH(src10_r, src32_r, filt0, filt1);
  3490. tmp1 = HEVC_FILT_4TAP_SH(src21_r, src43_r, filt0, filt1);
  3491. tmp2 = HEVC_FILT_4TAP_SH(src32_r, src54_r, filt0, filt1);
  3492. tmp3 = HEVC_FILT_4TAP_SH(src43_r, src65_r, filt0, filt1);
  3493. tmp4 = HEVC_FILT_4TAP_SH(src54_r, src76_r, filt0, filt1);
  3494. tmp5 = HEVC_FILT_4TAP_SH(src65_r, src87_r, filt0, filt1);
  3495. HEVC_BIW_RND_CLIP4(tmp0, tmp1, tmp2, tmp3,
  3496. in0, in1, in2, in3,
  3497. weight_vec, rnd_vec, offset_vec,
  3498. tmp0, tmp1, tmp2, tmp3);
  3499. HEVC_BIW_RND_CLIP2(tmp4, tmp5, in4, in5,
  3500. weight_vec, rnd_vec, offset_vec,
  3501. tmp4, tmp5);
  3502. PCKEV_B2_SH(tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
  3503. tmp3 = (v8i16) __msa_pckev_b((v16i8) tmp5, (v16i8) tmp4);
  3504. ST_D4(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride);
  3505. ST_D2(tmp3, 0, 1, dst + 4 * dst_stride, dst_stride);
  3506. }
  3507. static void hevc_vt_biwgt_4t_8x4multiple_msa(uint8_t *src0_ptr,
  3508. int32_t src_stride,
  3509. int16_t *src1_ptr,
  3510. int32_t src2_stride,
  3511. uint8_t *dst,
  3512. int32_t dst_stride,
  3513. const int8_t *filter,
  3514. int32_t height,
  3515. int32_t weight0,
  3516. int32_t weight1,
  3517. int32_t offset0,
  3518. int32_t offset1,
  3519. int32_t rnd_val)
  3520. {
  3521. uint32_t loop_cnt;
  3522. int32_t offset, weight, constant;
  3523. v16i8 src0, src1, src2, src3, src4;
  3524. v8i16 in0, in1, in2, in3;
  3525. v16i8 src10_r, src32_r, src21_r, src43_r;
  3526. v8i16 tmp0, tmp1, tmp2, tmp3;
  3527. v8i16 filt0, filt1;
  3528. v8i16 filter_vec;
  3529. v4i32 weight_vec, offset_vec, rnd_vec;
  3530. src0_ptr -= src_stride;
  3531. offset = (offset0 + offset1) << rnd_val;
  3532. weight0 = weight0 & 0x0000FFFF;
  3533. weight = weight0 | (weight1 << 16);
  3534. constant = 128 * weight1;
  3535. constant <<= 6;
  3536. offset += constant;
  3537. offset_vec = __msa_fill_w(offset);
  3538. weight_vec = __msa_fill_w(weight);
  3539. rnd_vec = __msa_fill_w(rnd_val + 1);
  3540. filter_vec = LD_SH(filter);
  3541. SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
  3542. LD_SB3(src0_ptr, src_stride, src0, src1, src2);
  3543. src0_ptr += (3 * src_stride);
  3544. XORI_B3_128_SB(src0, src1, src2);
  3545. ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
  3546. for (loop_cnt = (height >> 2); loop_cnt--;) {
  3547. LD_SB2(src0_ptr, src_stride, src3, src4);
  3548. src0_ptr += (2 * src_stride);
  3549. LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
  3550. src1_ptr += (4 * src2_stride);
  3551. XORI_B2_128_SB(src3, src4);
  3552. ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
  3553. tmp0 = HEVC_FILT_4TAP_SH(src10_r, src32_r, filt0, filt1);
  3554. tmp1 = HEVC_FILT_4TAP_SH(src21_r, src43_r, filt0, filt1);
  3555. LD_SB2(src0_ptr, src_stride, src1, src2);
  3556. src0_ptr += (2 * src_stride);
  3557. XORI_B2_128_SB(src1, src2);
  3558. ILVR_B2_SB(src1, src4, src2, src1, src10_r, src21_r);
  3559. tmp2 = HEVC_FILT_4TAP_SH(src32_r, src10_r, filt0, filt1);
  3560. tmp3 = HEVC_FILT_4TAP_SH(src43_r, src21_r, filt0, filt1);
  3561. HEVC_BIW_RND_CLIP4(tmp0, tmp1, tmp2, tmp3,
  3562. in0, in1, in2, in3,
  3563. weight_vec, rnd_vec, offset_vec,
  3564. tmp0, tmp1, tmp2, tmp3);
  3565. PCKEV_B2_SH(tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
  3566. ST_D4(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride);
  3567. dst += (4 * dst_stride);
  3568. }
  3569. }
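
/* 8-wide vertical dispatcher.  Unlike the horizontal 8-wide dispatcher, the
 * last branch is a plain else, so any height other than 2 or 6 is assumed to
 * be a multiple of 4 for the 8x4 loop. */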
static void hevc_vt_biwgt_4t_8w_msa(uint8_t *src0_ptr,
                                    int32_t src_stride,
                                    int16_t *src1_ptr,
                                    int32_t src2_stride,
                                    uint8_t *dst,
                                    int32_t dst_stride,
                                    const int8_t *filter,
                                    int32_t height,
                                    int32_t weight0,
                                    int32_t weight1,
                                    int32_t offset0,
                                    int32_t offset1,
                                    int32_t rnd_val)
{
    if (2 == height) {
        hevc_vt_biwgt_4t_8x2_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
                                 dst, dst_stride, filter,
                                 weight0, weight1, offset0, offset1, rnd_val);
    } else if (6 == height) {
        hevc_vt_biwgt_4t_8x6_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
                                 dst, dst_stride, filter,
                                 weight0, weight1, offset0, offset1, rnd_val);
    } else {
        hevc_vt_biwgt_4t_8x4multiple_msa(src0_ptr, src_stride,
                                         src1_ptr, src2_stride,
                                         dst, dst_stride, filter, height,
                                         weight0, weight1, offset0, offset1,
                                         rnd_val);
    }
}
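
/* 12-wide vertical: the left 8 columns use the right byte interleaves of each
 * row pair, while the left interleaves of two successive row pairs are packed
 * into one vector (src2110 / src4332) so a single extra 4-tap pass yields the
 * 4 rightmost pixels of two rows at once (stored with ST_W4 at dst + 8). */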
static void hevc_vt_biwgt_4t_12w_msa(uint8_t *src0_ptr,
                                     int32_t src_stride,
                                     int16_t *src1_ptr,
                                     int32_t src2_stride,
                                     uint8_t *dst,
                                     int32_t dst_stride,
                                     const int8_t *filter,
                                     int32_t height,
                                     int32_t weight0,
                                     int32_t weight1,
                                     int32_t offset0,
                                     int32_t offset1,
                                     int32_t rnd_val)
{
    uint32_t loop_cnt;
    int32_t offset, weight, constant;
    v16i8 src0, src1, src2, src3, src4, src5;
    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
    v16i8 src10_r, src32_r, src21_r, src43_r;
    v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
    v16i8 src10_l, src32_l, src54_l, src21_l, src43_l, src65_l;
    v16i8 src2110, src4332;
    v8i16 filt0, filt1;
    v8i16 filter_vec;
    v4i32 weight_vec, offset_vec, rnd_vec;

    src0_ptr -= (1 * src_stride);
    offset = (offset0 + offset1) << rnd_val;
    weight0 = weight0 & 0x0000FFFF;
    weight = weight0 | (weight1 << 16);
    constant = 128 * weight1;
    constant <<= 6;
    offset += constant;

    offset_vec = __msa_fill_w(offset);
    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val + 1);

    filter_vec = LD_SH(filter);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    LD_SB3(src0_ptr, src_stride, src0, src1, src2);
    src0_ptr += (3 * src_stride);
    XORI_B3_128_SB(src0, src1, src2);
    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
    ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);
    src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_l, (v2i64) src10_l);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB2(src0_ptr, src_stride, src3, src4);
        src0_ptr += (2 * src_stride);
        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
        LD_SH4(src1_ptr + 8, src2_stride, in4, in5, in6, in7);
        src1_ptr += (4 * src2_stride);
        ILVR_D2_SH(in5, in4, in7, in6, in4, in5);
        XORI_B2_128_SB(src3, src4);
        ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
        ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);
        src4332 = (v16i8) __msa_ilvr_d((v2i64) src43_l, (v2i64) src32_l);

        tmp0 = HEVC_FILT_4TAP_SH(src10_r, src32_r, filt0, filt1);
        tmp1 = HEVC_FILT_4TAP_SH(src21_r, src43_r, filt0, filt1);
        tmp4 = HEVC_FILT_4TAP_SH(src2110, src4332, filt0, filt1);

        LD_SB2(src0_ptr, src_stride, src5, src2);
        src0_ptr += (2 * src_stride);
        XORI_B2_128_SB(src5, src2);
        ILVR_B2_SB(src5, src4, src2, src5, src10_r, src21_r);
        ILVL_B2_SB(src5, src4, src2, src5, src54_l, src65_l);
        src2110 = (v16i8) __msa_ilvr_d((v2i64) src65_l, (v2i64) src54_l);

        tmp2 = HEVC_FILT_4TAP_SH(src32_r, src10_r, filt0, filt1);
        tmp3 = HEVC_FILT_4TAP_SH(src43_r, src21_r, filt0, filt1);
        tmp5 = HEVC_FILT_4TAP_SH(src4332, src2110, filt0, filt1);

        HEVC_BIW_RND_CLIP4(tmp0, tmp1, tmp2, tmp3,
                           in0, in1, in2, in3,
                           weight_vec, rnd_vec, offset_vec,
                           tmp0, tmp1, tmp2, tmp3);
        HEVC_BIW_RND_CLIP2(tmp4, tmp5, in4, in5,
                           weight_vec, rnd_vec, offset_vec,
                           tmp4, tmp5);

        PCKEV_B2_SH(tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
        tmp2 = (v8i16) __msa_pckev_b((v16i8) tmp5, (v16i8) tmp4);
        ST_D4(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride);
        ST_W4(tmp2, 0, 1, 2, 3, dst + 8, dst_stride);
        dst += (4 * dst_stride);
    }
}
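/* Vertical 4-tap bi-weighted MC, width 16: 4 rows per loop iteration. */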
static void hevc_vt_biwgt_4t_16w_msa(uint8_t *src0_ptr,
                                     int32_t src_stride,
                                     int16_t *src1_ptr,
                                     int32_t src2_stride,
                                     uint8_t *dst,
                                     int32_t dst_stride,
                                     const int8_t *filter,
                                     int32_t height,
                                     int32_t weight0,
                                     int32_t weight1,
                                     int32_t offset0,
                                     int32_t offset1,
                                     int32_t rnd_val)
{
    uint32_t loop_cnt;
    int32_t offset, weight, constant;
    v16i8 src0, src1, src2, src3, src4, src5;
    v8i16 in0, in1, in2, in3;
    v16i8 src10_r, src32_r, src21_r, src43_r;
    v16i8 src10_l, src32_l, src21_l, src43_l;
    v8i16 tmp0, tmp1, tmp2, tmp3;
    v8i16 filt0, filt1;
    v8i16 filter_vec;
    v4i32 weight_vec, offset_vec, rnd_vec;

    src0_ptr -= src_stride;
    offset = (offset0 + offset1) << rnd_val;
    weight0 = weight0 & 0x0000FFFF;
    weight = weight0 | (weight1 << 16);
    constant = 128 * weight1;
    constant <<= 6;
    offset += constant;

    offset_vec = __msa_fill_w(offset);
    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val + 1);

    filter_vec = LD_SH(filter);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    LD_SB3(src0_ptr, src_stride, src0, src1, src2);
    src0_ptr += (3 * src_stride);
    XORI_B3_128_SB(src0, src1, src2);
    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
    ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB2(src0_ptr, src_stride, src3, src4);
        src0_ptr += (2 * src_stride);
        LD_SH2(src1_ptr, src2_stride, in0, in1);
        LD_SH2(src1_ptr + 8, src2_stride, in2, in3);
        src1_ptr += (2 * src2_stride);
        XORI_B2_128_SB(src3, src4);
        ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
        ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);

        tmp0 = HEVC_FILT_4TAP_SH(src10_r, src32_r, filt0, filt1);
        tmp1 = HEVC_FILT_4TAP_SH(src21_r, src43_r, filt0, filt1);
        tmp2 = HEVC_FILT_4TAP_SH(src10_l, src32_l, filt0, filt1);
        tmp3 = HEVC_FILT_4TAP_SH(src21_l, src43_l, filt0, filt1);

        HEVC_BIW_RND_CLIP4(tmp0, tmp1, tmp2, tmp3,
                           in0, in1, in2, in3,
                           weight_vec, rnd_vec, offset_vec,
                           tmp0, tmp1, tmp2, tmp3);

        PCKEV_B2_SH(tmp2, tmp0, tmp3, tmp1, tmp0, tmp1);
        ST_SH2(tmp0, tmp1, dst, dst_stride);
        dst += (2 * dst_stride);

        LD_SB2(src0_ptr, src_stride, src5, src2);
        src0_ptr += (2 * src_stride);
        LD_SH2(src1_ptr, src2_stride, in0, in1);
        LD_SH2(src1_ptr + 8, src2_stride, in2, in3);
        src1_ptr += (2 * src2_stride);
        XORI_B2_128_SB(src5, src2);
        ILVR_B2_SB(src5, src4, src2, src5, src10_r, src21_r);
        ILVL_B2_SB(src5, src4, src2, src5, src10_l, src21_l);

        tmp0 = HEVC_FILT_4TAP_SH(src32_r, src10_r, filt0, filt1);
        tmp1 = HEVC_FILT_4TAP_SH(src43_r, src21_r, filt0, filt1);
        tmp2 = HEVC_FILT_4TAP_SH(src32_l, src10_l, filt0, filt1);
        tmp3 = HEVC_FILT_4TAP_SH(src43_l, src21_l, filt0, filt1);

        HEVC_BIW_RND_CLIP4(tmp0, tmp1, tmp2, tmp3,
                           in0, in1, in2, in3,
                           weight_vec, rnd_vec, offset_vec,
                           tmp0, tmp1, tmp2, tmp3);

        PCKEV_B2_SH(tmp2, tmp0, tmp3, tmp1, tmp0, tmp1);
        ST_SH2(tmp0, tmp1, dst, dst_stride);
        dst += (2 * dst_stride);
    }
}
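/* Vertical 4-tap bi-weighted MC, width 24, handled as a 16-wide plus an
   8-wide column; 4 rows per loop iteration. */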
static void hevc_vt_biwgt_4t_24w_msa(uint8_t *src0_ptr,
                                     int32_t src_stride,
                                     int16_t *src1_ptr,
                                     int32_t src2_stride,
                                     uint8_t *dst,
                                     int32_t dst_stride,
                                     const int8_t *filter,
                                     int32_t height,
                                     int32_t weight0,
                                     int32_t weight1,
                                     int32_t offset0,
                                     int32_t offset1,
                                     int32_t rnd_val)
{
    uint32_t loop_cnt;
    int32_t offset, weight, constant;
    v16i8 src0, src1, src2, src3, src4, src5;
    v16i8 src6, src7, src8, src9, src10, src11;
    v8i16 in0, in1, in2, in3, in4, in5;
    v16i8 src10_r, src32_r, src76_r, src98_r;
    v16i8 src10_l, src32_l, src21_l, src43_l;
    v16i8 src21_r, src43_r, src87_r, src109_r;
    v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
    v8i16 filt0, filt1;
    v8i16 filter_vec;
    v4i32 weight_vec, offset_vec, rnd_vec;

    src0_ptr -= src_stride;
    offset = (offset0 + offset1) << rnd_val;
    weight0 = weight0 & 0x0000FFFF;
    weight = weight0 | (weight1 << 16);
    constant = 128 * weight1;
    constant <<= 6;
    offset += constant;

    offset_vec = __msa_fill_w(offset);
    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val + 1);

    filter_vec = LD_SH(filter);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    /* 16width */
    LD_SB3(src0_ptr, src_stride, src0, src1, src2);
    XORI_B3_128_SB(src0, src1, src2);
    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
    ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);
    /* 8width */
    LD_SB3(src0_ptr + 16, src_stride, src6, src7, src8);
    src0_ptr += (3 * src_stride);
    XORI_B3_128_SB(src6, src7, src8);
    ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        /* 16width */
        LD_SB2(src0_ptr, src_stride, src3, src4);
        LD_SH2(src1_ptr, src2_stride, in0, in1);
        LD_SH2(src1_ptr + 8, src2_stride, in2, in3);
        XORI_B2_128_SB(src3, src4);
        ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
        ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);
        /* 8width */
        LD_SB2(src0_ptr + 16, src_stride, src9, src10);
        src0_ptr += (2 * src_stride);
        LD_SH2(src1_ptr + 16, src2_stride, in4, in5);
        src1_ptr += (2 * src2_stride);
        XORI_B2_128_SB(src9, src10);
        ILVR_B2_SB(src9, src8, src10, src9, src98_r, src109_r);
        /* 16width */
        tmp0 = HEVC_FILT_4TAP_SH(src10_r, src32_r, filt0, filt1);
        tmp4 = HEVC_FILT_4TAP_SH(src10_l, src32_l, filt0, filt1);
        tmp1 = HEVC_FILT_4TAP_SH(src21_r, src43_r, filt0, filt1);
        tmp5 = HEVC_FILT_4TAP_SH(src21_l, src43_l, filt0, filt1);
        /* 8width */
        tmp2 = HEVC_FILT_4TAP_SH(src76_r, src98_r, filt0, filt1);
        tmp3 = HEVC_FILT_4TAP_SH(src87_r, src109_r, filt0, filt1);
        /* 16width */
        HEVC_BIW_RND_CLIP4(tmp0, tmp1, tmp4, tmp5,
                           in0, in1, in2, in3,
                           weight_vec, rnd_vec, offset_vec,
                           tmp0, tmp1, tmp4, tmp5);
        /* 8width */
        HEVC_BIW_RND_CLIP2(tmp2, tmp3, in4, in5,
                           weight_vec, rnd_vec, offset_vec,
                           tmp2, tmp3);
        /* 16width */
        PCKEV_B2_SH(tmp4, tmp0, tmp5, tmp1, tmp0, tmp1);
        /* 8width */
        tmp2 = (v8i16) __msa_pckev_b((v16i8) tmp3, (v16i8) tmp2);
        ST_SH2(tmp0, tmp1, dst, dst_stride);
        ST_D2(tmp2, 0, 1, dst + 16, dst_stride);
        dst += (2 * dst_stride);

        /* 16width */
        LD_SB2(src0_ptr, src_stride, src5, src2);
        LD_SH2(src1_ptr, src2_stride, in0, in1);
        LD_SH2(src1_ptr + 8, src2_stride, in2, in3);
        XORI_B2_128_SB(src5, src2);
        ILVR_B2_SB(src5, src4, src2, src5, src10_r, src21_r);
        ILVL_B2_SB(src5, src4, src2, src5, src10_l, src21_l);
        /* 8width */
        LD_SB2(src0_ptr + 16, src_stride, src11, src8);
        src0_ptr += (2 * src_stride);
        LD_SH2(src1_ptr + 16, src2_stride, in4, in5);
        src1_ptr += (2 * src2_stride);
        XORI_B2_128_SB(src11, src8);
        ILVR_B2_SB(src11, src10, src8, src11, src76_r, src87_r);
        /* 16width */
        tmp0 = HEVC_FILT_4TAP_SH(src32_r, src10_r, filt0, filt1);
        tmp4 = HEVC_FILT_4TAP_SH(src32_l, src10_l, filt0, filt1);
        tmp1 = HEVC_FILT_4TAP_SH(src43_r, src21_r, filt0, filt1);
        tmp5 = HEVC_FILT_4TAP_SH(src43_l, src21_l, filt0, filt1);
        /* 8width */
        tmp2 = HEVC_FILT_4TAP_SH(src98_r, src76_r, filt0, filt1);
        tmp3 = HEVC_FILT_4TAP_SH(src109_r, src87_r, filt0, filt1);
        /* 16width */
        HEVC_BIW_RND_CLIP4(tmp0, tmp1, tmp4, tmp5,
                           in0, in1, in2, in3,
                           weight_vec, rnd_vec, offset_vec,
                           tmp0, tmp1, tmp4, tmp5);
        /* 8width */
        HEVC_BIW_RND_CLIP2(tmp2, tmp3, in4, in5,
                           weight_vec, rnd_vec, offset_vec,
                           tmp2, tmp3);
        /* 16width */
        PCKEV_B2_SH(tmp4, tmp0, tmp5, tmp1, tmp0, tmp1);
        /* 8width */
        tmp2 = (v8i16) __msa_pckev_b((v16i8) tmp3, (v16i8) tmp2);
        ST_SH2(tmp0, tmp1, dst, dst_stride);
        ST_D2(tmp2, 0, 1, dst + 16, dst_stride);
        dst += (2 * dst_stride);
    }
}
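/* Vertical 4-tap bi-weighted MC, width 32, processed as two independent
   16-wide columns; 2 rows per loop iteration. */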
static void hevc_vt_biwgt_4t_32w_msa(uint8_t *src0_ptr,
                                     int32_t src_stride,
                                     int16_t *src1_ptr,
                                     int32_t src2_stride,
                                     uint8_t *dst,
                                     int32_t dst_stride,
                                     const int8_t *filter,
                                     int32_t height,
                                     int32_t weight0,
                                     int32_t weight1,
                                     int32_t offset0,
                                     int32_t offset1,
                                     int32_t rnd_val)
{
    uint32_t loop_cnt;
    uint8_t *dst_tmp = dst + 16;
    int32_t offset, weight, constant;
    v16i8 src0, src1, src2, src3, src4, src6, src7, src8, src9, src10;
    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
    v16i8 src10_r, src32_r, src76_r, src98_r;
    v16i8 src21_r, src43_r, src87_r, src109_r;
    v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
    v16i8 src10_l, src32_l, src76_l, src98_l;
    v16i8 src21_l, src43_l, src87_l, src109_l;
    v8i16 filt0, filt1;
    v8i16 filter_vec;
    v4i32 weight_vec, offset_vec, rnd_vec;

    src0_ptr -= src_stride;
    offset = (offset0 + offset1) << rnd_val;
    weight0 = weight0 & 0x0000FFFF;
    weight = weight0 | (weight1 << 16);
    constant = 128 * weight1;
    constant <<= 6;
    offset += constant;

    offset_vec = __msa_fill_w(offset);
    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val + 1);

    filter_vec = LD_SH(filter);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    /* 16width */
    LD_SB3(src0_ptr, src_stride, src0, src1, src2);
    XORI_B3_128_SB(src0, src1, src2);
    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
    ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);
    /* next 16width */
    LD_SB3(src0_ptr + 16, src_stride, src6, src7, src8);
    src0_ptr += (3 * src_stride);
    XORI_B3_128_SB(src6, src7, src8);
    ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
    ILVL_B2_SB(src7, src6, src8, src7, src76_l, src87_l);

    for (loop_cnt = (height >> 1); loop_cnt--;) {
        /* 16width */
        LD_SB2(src0_ptr, src_stride, src3, src4);
        LD_SH2(src1_ptr, src2_stride, in0, in1);
        LD_SH2(src1_ptr + 8, src2_stride, in2, in3);
        XORI_B2_128_SB(src3, src4);
        ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
        ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);
        /* 16width */
        tmp0 = HEVC_FILT_4TAP_SH(src10_r, src32_r, filt0, filt1);
        tmp4 = HEVC_FILT_4TAP_SH(src10_l, src32_l, filt0, filt1);
        tmp1 = HEVC_FILT_4TAP_SH(src21_r, src43_r, filt0, filt1);
        tmp5 = HEVC_FILT_4TAP_SH(src21_l, src43_l, filt0, filt1);
        /* 16width */
        HEVC_BIW_RND_CLIP4(tmp0, tmp1, tmp4, tmp5,
                           in0, in1, in2, in3,
                           weight_vec, rnd_vec, offset_vec,
                           tmp0, tmp1, tmp4, tmp5);
        /* 16width */
        PCKEV_B2_SH(tmp4, tmp0, tmp5, tmp1, tmp0, tmp1);
        ST_SH2(tmp0, tmp1, dst, dst_stride);
        dst += (2 * dst_stride);

        src10_r = src32_r;
        src21_r = src43_r;
        src10_l = src32_l;
        src21_l = src43_l;
        src2 = src4;

        /* next 16width */
        LD_SB2(src0_ptr + 16, src_stride, src9, src10);
        src0_ptr += (2 * src_stride);
        LD_SH2(src1_ptr + 16, src2_stride, in4, in5);
        LD_SH2(src1_ptr + 24, src2_stride, in6, in7);
        src1_ptr += (2 * src2_stride);
        XORI_B2_128_SB(src9, src10);
        ILVR_B2_SB(src9, src8, src10, src9, src98_r, src109_r);
        ILVL_B2_SB(src9, src8, src10, src9, src98_l, src109_l);
        /* next 16width */
        tmp2 = HEVC_FILT_4TAP_SH(src76_r, src98_r, filt0, filt1);
        tmp6 = HEVC_FILT_4TAP_SH(src76_l, src98_l, filt0, filt1);
        tmp3 = HEVC_FILT_4TAP_SH(src87_r, src109_r, filt0, filt1);
        tmp7 = HEVC_FILT_4TAP_SH(src87_l, src109_l, filt0, filt1);
        /* next 16width */
        HEVC_BIW_RND_CLIP4(tmp2, tmp3, tmp6, tmp7,
                           in4, in5, in6, in7,
                           weight_vec, rnd_vec, offset_vec,
                           tmp2, tmp3, tmp6, tmp7);
        /* next 16width */
        PCKEV_B2_SH(tmp6, tmp2, tmp7, tmp3, tmp2, tmp3);
        ST_SH2(tmp2, tmp3, dst_tmp, dst_stride);
        dst_tmp += (2 * dst_stride);

        src76_r = src98_r;
        src87_r = src109_r;
        src76_l = src98_l;
        src87_l = src109_l;
        src8 = src10;
    }
}
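/* Horizontal + vertical 4-tap bi-weighted MC for a 4x2 block. */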
static void hevc_hv_biwgt_4t_4x2_msa(uint8_t *src0_ptr,
                                     int32_t src_stride,
                                     int16_t *src1_ptr,
                                     int32_t src2_stride,
                                     uint8_t *dst,
                                     int32_t dst_stride,
                                     const int8_t *filter_x,
                                     const int8_t *filter_y,
                                     int32_t weight0,
                                     int32_t weight1,
                                     int32_t offset0,
                                     int32_t offset1,
                                     int32_t rnd_val)
{
    uint64_t tp0, tp1;
    int32_t offset, weight;
    v8i16 in0 = { 0 };
    v16u8 out;
    v16i8 src0, src1, src2, src3, src4;
    v8i16 filt0, filt1;
    v8i16 filt_h0, filt_h1;
    v16i8 mask0 = LD_SB(ff_hevc_mask_arr + 16);
    v16i8 mask1;
    v8i16 filter_vec, tmp, weight_vec;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
    v8i16 dst20, dst31, dst42, dst10, dst32, dst21, dst43, tmp0, tmp1;
    v4i32 dst0, dst1, offset_vec, rnd_vec, const_vec;

    src0_ptr -= (src_stride + 1);

    filter_vec = LD_SH(filter_x);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    filter_vec = LD_SH(filter_y);
    UNPCK_R_SB_SH(filter_vec, filter_vec);
    SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);

    mask1 = mask0 + 2;

    offset = (offset0 + offset1) << rnd_val;
    weight0 = weight0 & 0x0000FFFF;
    weight = weight0 | (weight1 << 16);

    const_vec = __msa_fill_w((128 * weight1));
    const_vec <<= 6;
    offset_vec = __msa_fill_w(offset);
    weight_vec = (v8i16) __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val + 1);
    offset_vec += const_vec;

    LD_SB5(src0_ptr, src_stride, src0, src1, src2, src3, src4);
    XORI_B5_128_SB(src0, src1, src2, src3, src4);

    VSHF_B2_SB(src0, src2, src0, src2, mask0, mask1, vec0, vec1);
    VSHF_B2_SB(src1, src3, src1, src3, mask0, mask1, vec2, vec3);
    VSHF_B2_SB(src2, src4, src2, src4, mask0, mask1, vec4, vec5);

    dst20 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
    dst31 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
    dst42 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
    ILVRL_H2_SH(dst31, dst20, dst10, dst32);
    ILVRL_H2_SH(dst42, dst31, dst21, dst43);
    dst0 = HEVC_FILT_4TAP(dst10, dst32, filt_h0, filt_h1);
    dst1 = HEVC_FILT_4TAP(dst21, dst43, filt_h0, filt_h1);
    dst0 >>= 6;
    dst1 >>= 6;
    dst0 = (v4i32) __msa_pckev_h((v8i16) dst1, (v8i16) dst0);

    LD2(src1_ptr, src2_stride, tp0, tp1);
    INSERT_D2_SH(tp0, tp1, in0);
    ILVRL_H2_SH(dst0, in0, tmp0, tmp1);
    dst0 = __msa_dpadd_s_w(offset_vec, tmp0, weight_vec);
    dst1 = __msa_dpadd_s_w(offset_vec, tmp1, weight_vec);
    SRAR_W2_SW(dst0, dst1, rnd_vec);
    tmp = __msa_pckev_h((v8i16) dst1, (v8i16) dst0);
    tmp = CLIP_SH_0_255_MAX_SATU(tmp);
    out = (v16u8) __msa_pckev_b((v16i8) tmp, (v16i8) tmp);
    ST_W2(out, 0, 1, dst, dst_stride);
}
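/* Horizontal + vertical 4-tap bi-weighted MC for a 4x4 block. */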
static void hevc_hv_biwgt_4t_4x4_msa(uint8_t *src0_ptr,
                                     int32_t src_stride,
                                     int16_t *src1_ptr,
                                     int32_t src2_stride,
                                     uint8_t *dst,
                                     int32_t dst_stride,
                                     const int8_t *filter_x,
                                     const int8_t *filter_y,
                                     int32_t weight0,
                                     int32_t weight1,
                                     int32_t offset0,
                                     int32_t offset1,
                                     int32_t rnd_val)
{
    uint64_t tp0, tp1;
    int32_t offset, weight;
    v16u8 out;
    v8i16 in0 = { 0 }, in1 = { 0 };
    v16i8 src0, src1, src2, src3, src4, src5, src6;
    v8i16 filt0, filt1;
    v8i16 filt_h0, filt_h1;
    v16i8 mask0 = LD_SB(ff_hevc_mask_arr + 16);
    v16i8 mask1;
    v8i16 filter_vec, weight_vec;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v8i16 tmp0, tmp1, tmp2, tmp3;
    v8i16 dst30, dst41, dst52, dst63;
    v8i16 dst10, dst32, dst54, dst21, dst43, dst65;
    v4i32 offset_vec, rnd_vec, const_vec;
    v4i32 dst0, dst1, dst2, dst3;

    src0_ptr -= (src_stride + 1);

    filter_vec = LD_SH(filter_x);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    filter_vec = LD_SH(filter_y);
    UNPCK_R_SB_SH(filter_vec, filter_vec);
    SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);

    mask1 = mask0 + 2;

    offset = (offset0 + offset1) << rnd_val;
    weight0 = weight0 & 0x0000FFFF;
    weight = weight0 | (weight1 << 16);

    const_vec = __msa_fill_w((128 * weight1));
    const_vec <<= 6;
    offset_vec = __msa_fill_w(offset);
    weight_vec = (v8i16) __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val + 1);
    offset_vec += const_vec;

    LD_SB7(src0_ptr, src_stride, src0, src1, src2, src3, src4, src5, src6);
    XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);

    VSHF_B2_SB(src0, src3, src0, src3, mask0, mask1, vec0, vec1);
    VSHF_B2_SB(src1, src4, src1, src4, mask0, mask1, vec2, vec3);
    VSHF_B2_SB(src2, src5, src2, src5, mask0, mask1, vec4, vec5);
    VSHF_B2_SB(src3, src6, src3, src6, mask0, mask1, vec6, vec7);

    dst30 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
    dst41 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
    dst52 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
    dst63 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
    ILVRL_H2_SH(dst41, dst30, dst10, dst43);
    ILVRL_H2_SH(dst52, dst41, dst21, dst54);
    ILVRL_H2_SH(dst63, dst52, dst32, dst65);
    dst0 = HEVC_FILT_4TAP(dst10, dst32, filt_h0, filt_h1);
    dst1 = HEVC_FILT_4TAP(dst21, dst43, filt_h0, filt_h1);
    dst2 = HEVC_FILT_4TAP(dst32, dst54, filt_h0, filt_h1);
    dst3 = HEVC_FILT_4TAP(dst43, dst65, filt_h0, filt_h1);
    SRA_4V(dst0, dst1, dst2, dst3, 6);
    PCKEV_H2_SH(dst1, dst0, dst3, dst2, tmp1, tmp3);

    LD2(src1_ptr, src2_stride, tp0, tp1);
    INSERT_D2_SH(tp0, tp1, in0);
    src1_ptr += (2 * src2_stride);
    LD2(src1_ptr, src2_stride, tp0, tp1);
    INSERT_D2_SH(tp0, tp1, in1);

    ILVRL_H2_SH(tmp1, in0, tmp0, tmp1);
    ILVRL_H2_SH(tmp3, in1, tmp2, tmp3);
    dst0 = __msa_dpadd_s_w(offset_vec, tmp0, weight_vec);
    dst1 = __msa_dpadd_s_w(offset_vec, tmp1, weight_vec);
    dst2 = __msa_dpadd_s_w(offset_vec, tmp2, weight_vec);
    dst3 = __msa_dpadd_s_w(offset_vec, tmp3, weight_vec);
    SRAR_W4_SW(dst0, dst1, dst2, dst3, rnd_vec);
    PCKEV_H2_SH(dst1, dst0, dst3, dst2, tmp0, tmp1);
    CLIP_SH2_0_255_MAX_SATU(tmp0, tmp1);
    out = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
    ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
}
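/* Horizontal + vertical 4-tap bi-weighted MC, width 4, any height that is a
   multiple of 8; two 4-wide rows are packed per 8-wide intermediate. */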
static void hevc_hv_biwgt_4t_4multx8mult_msa(uint8_t *src0_ptr,
                                             int32_t src_stride,
                                             int16_t *src1_ptr,
                                             int32_t src2_stride,
                                             uint8_t *dst,
                                             int32_t dst_stride,
                                             const int8_t *filter_x,
                                             const int8_t *filter_y,
                                             int32_t height,
                                             int32_t weight0,
                                             int32_t weight1,
                                             int32_t offset0,
                                             int32_t offset1,
                                             int32_t rnd_val)
{
    uint32_t loop_cnt;
    uint64_t tp0, tp1;
    int32_t offset, weight;
    v16u8 out0, out1;
    v8i16 in0 = { 0 }, in1 = { 0 }, in2 = { 0 }, in3 = { 0 };
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v8i16 filt0, filt1;
    v8i16 filt_h0, filt_h1;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v16i8 mask0 = LD_SB(ff_hevc_mask_arr + 16);
    v16i8 mask1;
    v8i16 filter_vec, weight_vec;
    v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
    v8i16 dst10, dst21, dst22, dst73, dst84, dst95, dst106;
    v8i16 dst10_r, dst32_r, dst54_r, dst76_r;
    v8i16 dst21_r, dst43_r, dst65_r, dst87_r;
    v8i16 dst98_r, dst109_r;
    v4i32 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
    v4i32 offset_vec, rnd_vec, const_vec;

    src0_ptr -= (src_stride + 1);

    filter_vec = LD_SH(filter_x);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    filter_vec = LD_SH(filter_y);
    UNPCK_R_SB_SH(filter_vec, filter_vec);
    SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);

    mask1 = mask0 + 2;

    offset = (offset0 + offset1) << rnd_val;
    weight0 = weight0 & 0x0000FFFF;
    weight = weight0 | (weight1 << 16);

    const_vec = __msa_fill_w((128 * weight1));
    const_vec <<= 6;
    offset_vec = __msa_fill_w(offset);
    weight_vec = (v8i16) __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val + 1);
    offset_vec += const_vec;

    LD_SB3(src0_ptr, src_stride, src0, src1, src2);
    src0_ptr += (3 * src_stride);
    XORI_B3_128_SB(src0, src1, src2);

    VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0, vec1);
    VSHF_B2_SB(src1, src2, src1, src2, mask0, mask1, vec2, vec3);
    dst10 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
    dst21 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
    ILVRL_H2_SH(dst21, dst10, dst10_r, dst21_r);
    dst22 = (v8i16) __msa_splati_d((v2i64) dst21, 1);

    for (loop_cnt = height >> 3; loop_cnt--;) {
        LD_SB8(src0_ptr, src_stride,
               src3, src4, src5, src6, src7, src8, src9, src10);
        src0_ptr += (8 * src_stride);
        XORI_B8_128_SB(src3, src4, src5, src6, src7, src8, src9, src10);

        VSHF_B2_SB(src3, src7, src3, src7, mask0, mask1, vec0, vec1);
        VSHF_B2_SB(src4, src8, src4, src8, mask0, mask1, vec2, vec3);
        VSHF_B2_SB(src5, src9, src5, src9, mask0, mask1, vec4, vec5);
        VSHF_B2_SB(src6, src10, src6, src10, mask0, mask1, vec6, vec7);

        dst73 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
        dst84 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
        dst95 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
        dst106 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);

        dst32_r = __msa_ilvr_h(dst73, dst22);
        ILVRL_H2_SH(dst84, dst73, dst43_r, dst87_r);
        ILVRL_H2_SH(dst95, dst84, dst54_r, dst98_r);
        ILVRL_H2_SH(dst106, dst95, dst65_r, dst109_r);
        dst22 = (v8i16) __msa_splati_d((v2i64) dst73, 1);
        dst76_r = __msa_ilvr_h(dst22, dst106);

        LD2(src1_ptr, src2_stride, tp0, tp1);
        src1_ptr += 2 * src2_stride;
        INSERT_D2_SH(tp0, tp1, in0);
        LD2(src1_ptr, src2_stride, tp0, tp1);
        src1_ptr += 2 * src2_stride;
        INSERT_D2_SH(tp0, tp1, in1);
        LD2(src1_ptr, src2_stride, tp0, tp1);
        src1_ptr += 2 * src2_stride;
        INSERT_D2_SH(tp0, tp1, in2);
        LD2(src1_ptr, src2_stride, tp0, tp1);
        src1_ptr += 2 * src2_stride;
        INSERT_D2_SH(tp0, tp1, in3);

        dst0 = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
        dst1 = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
        dst2 = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
        dst3 = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
        dst4 = HEVC_FILT_4TAP(dst54_r, dst76_r, filt_h0, filt_h1);
        dst5 = HEVC_FILT_4TAP(dst65_r, dst87_r, filt_h0, filt_h1);
        dst6 = HEVC_FILT_4TAP(dst76_r, dst98_r, filt_h0, filt_h1);
        dst7 = HEVC_FILT_4TAP(dst87_r, dst109_r, filt_h0, filt_h1);
        SRA_4V(dst0, dst1, dst2, dst3, 6);
        SRA_4V(dst4, dst5, dst6, dst7, 6);
        PCKEV_H4_SW(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6, dst0, dst1,
                    dst2, dst3);
        ILVRL_H2_SH(dst0, in0, tmp0, tmp1);
        ILVRL_H2_SH(dst1, in1, tmp2, tmp3);
        ILVRL_H2_SH(dst2, in2, tmp4, tmp5);
        ILVRL_H2_SH(dst3, in3, tmp6, tmp7);
        dst0 = __msa_dpadd_s_w(offset_vec, tmp0, weight_vec);
        dst1 = __msa_dpadd_s_w(offset_vec, tmp1, weight_vec);
        dst2 = __msa_dpadd_s_w(offset_vec, tmp2, weight_vec);
        dst3 = __msa_dpadd_s_w(offset_vec, tmp3, weight_vec);
        dst4 = __msa_dpadd_s_w(offset_vec, tmp4, weight_vec);
        dst5 = __msa_dpadd_s_w(offset_vec, tmp5, weight_vec);
        dst6 = __msa_dpadd_s_w(offset_vec, tmp6, weight_vec);
        dst7 = __msa_dpadd_s_w(offset_vec, tmp7, weight_vec);
        SRAR_W4_SW(dst0, dst1, dst2, dst3, rnd_vec);
        SRAR_W4_SW(dst4, dst5, dst6, dst7, rnd_vec);
        PCKEV_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6, tmp0, tmp1,
                    tmp2, tmp3);
        CLIP_SH4_0_255_MAX_SATU(tmp0, tmp1, tmp2, tmp3);
        PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
        ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
        dst += (8 * dst_stride);

        dst10_r = dst98_r;
        dst21_r = dst109_r;
        dst22 = (v8i16) __msa_splati_d((v2i64) dst106, 1);
    }
}
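/* Width-4 hv bi-weighted dispatcher: picks the 4x2, 4x4 or 4xN
   (N % 8 == 0) kernel. */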
static void hevc_hv_biwgt_4t_4w_msa(uint8_t *src0_ptr,
                                    int32_t src_stride,
                                    int16_t *src1_ptr,
                                    int32_t src2_stride,
                                    uint8_t *dst,
                                    int32_t dst_stride,
                                    const int8_t *filter_x,
                                    const int8_t *filter_y,
                                    int32_t height,
                                    int32_t weight0,
                                    int32_t weight1,
                                    int32_t offset0,
                                    int32_t offset1,
                                    int32_t rnd_val)
{
    if (2 == height) {
        hevc_hv_biwgt_4t_4x2_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
                                 dst, dst_stride, filter_x, filter_y,
                                 weight0, weight1, offset0, offset1, rnd_val);
    } else if (4 == height) {
        hevc_hv_biwgt_4t_4x4_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
                                 dst, dst_stride, filter_x, filter_y,
                                 weight0, weight1, offset0, offset1, rnd_val);
    } else if (0 == (height % 8)) {
        hevc_hv_biwgt_4t_4multx8mult_msa(src0_ptr, src_stride,
                                         src1_ptr, src2_stride,
                                         dst, dst_stride, filter_x, filter_y,
                                         height, weight0, weight1,
                                         offset0, offset1, rnd_val);
    }
}
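/* Horizontal + vertical 4-tap bi-weighted MC, width 6, handled as 4 + 2
   columns over 8 rows. */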
static void hevc_hv_biwgt_4t_6w_msa(uint8_t *src0_ptr,
                                    int32_t src_stride,
                                    int16_t *src1_ptr,
                                    int32_t src2_stride,
                                    uint8_t *dst,
                                    int32_t dst_stride,
                                    const int8_t *filter_x,
                                    const int8_t *filter_y,
                                    int32_t height,
                                    int32_t weight0,
                                    int32_t weight1,
                                    int32_t offset0,
                                    int32_t offset1,
                                    int32_t rnd_val)
{
    uint32_t tpw0, tpw1, tpw2, tpw3;
    uint64_t tp0, tp1;
    int32_t offset, weight;
    v16u8 out0, out1, out2;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v8i16 in0 = { 0 }, in1 = { 0 }, in2 = { 0 }, in3 = { 0 };
    v8i16 in4 = { 0 }, in5 = { 0 };
    v8i16 filt0, filt1;
    v8i16 filt_h0, filt_h1, filter_vec;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v16i8 mask0 = LD_SB(ff_hevc_mask_arr);
    v16i8 mask1;
    v8i16 dsth0, dsth1, dsth2, dsth3, dsth4, dsth5, dsth6, dsth7, dsth8, dsth9;
    v8i16 dsth10, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, weight_vec;
    v8i16 dst10_r, dst32_r, dst54_r, dst76_r, dst98_r, dst21_r, dst43_r;
    v8i16 dst65_r, dst87_r, dst109_r, dst10_l, dst32_l, dst54_l, dst76_l;
    v8i16 dst98_l, dst21_l, dst43_l, dst65_l, dst87_l, dst109_l;
    v8i16 dst1021_l, dst3243_l, dst5465_l, dst7687_l, dst98109_l;
    v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
    v4i32 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
    v4i32 dst4_r, dst5_r, dst6_r, dst7_r;
    v4i32 offset_vec, rnd_vec, const_vec;

    src0_ptr -= (src_stride + 1);

    filter_vec = LD_SH(filter_x);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    filter_vec = LD_SH(filter_y);
    UNPCK_R_SB_SH(filter_vec, filter_vec);
    SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);

    mask1 = mask0 + 2;

    offset = (offset0 + offset1) << rnd_val;
    weight0 = weight0 & 0x0000FFFF;
    weight = weight0 | (weight1 << 16);

    const_vec = __msa_fill_w((128 * weight1));
    const_vec <<= 6;
    offset_vec = __msa_fill_w(offset);
    weight_vec = (v8i16) __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val + 1);
    offset_vec += const_vec;

    LD_SB3(src0_ptr, src_stride, src0, src1, src2);
    src0_ptr += (3 * src_stride);
    XORI_B3_128_SB(src0, src1, src2);

    VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
    VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
    VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);

    dsth0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
    dsth1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
    dsth2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);

    ILVRL_H2_SH(dsth1, dsth0, dst10_r, dst10_l);
    ILVRL_H2_SH(dsth2, dsth1, dst21_r, dst21_l);

    LD_SB8(src0_ptr, src_stride, src3, src4, src5, src6, src7, src8, src9,
           src10);
    XORI_B8_128_SB(src3, src4, src5, src6, src7, src8, src9, src10);

    VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
    VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec2, vec3);
    VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec4, vec5);
    VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec6, vec7);
    dsth3 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
    dsth4 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
    dsth5 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
    dsth6 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);

    VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec0, vec1);
    VSHF_B2_SB(src8, src8, src8, src8, mask0, mask1, vec2, vec3);
    VSHF_B2_SB(src9, src9, src9, src9, mask0, mask1, vec4, vec5);
    VSHF_B2_SB(src10, src10, src10, src10, mask0, mask1, vec6, vec7);
    dsth7 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
    dsth8 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
    dsth9 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
    dsth10 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);

    ILVRL_H2_SH(dsth3, dsth2, dst32_r, dst32_l);
    ILVRL_H2_SH(dsth4, dsth3, dst43_r, dst43_l);
    ILVRL_H2_SH(dsth5, dsth4, dst54_r, dst54_l);
    ILVRL_H2_SH(dsth6, dsth5, dst65_r, dst65_l);
    ILVRL_H2_SH(dsth7, dsth6, dst76_r, dst76_l);
    ILVRL_H2_SH(dsth8, dsth7, dst87_r, dst87_l);
    ILVRL_H2_SH(dsth9, dsth8, dst98_r, dst98_l);
    ILVRL_H2_SH(dsth10, dsth9, dst109_r, dst109_l);
    PCKEV_D2_SH(dst21_l, dst10_l, dst43_l, dst32_l, dst1021_l, dst3243_l);
    PCKEV_D2_SH(dst65_l, dst54_l, dst87_l, dst76_l, dst5465_l, dst7687_l);
    dst98109_l = (v8i16) __msa_pckev_d((v2i64) dst109_l, (v2i64) dst98_l);

    dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
    dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
    dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
    dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
    dst4_r = HEVC_FILT_4TAP(dst54_r, dst76_r, filt_h0, filt_h1);
    dst5_r = HEVC_FILT_4TAP(dst65_r, dst87_r, filt_h0, filt_h1);
    dst6_r = HEVC_FILT_4TAP(dst76_r, dst98_r, filt_h0, filt_h1);
    dst7_r = HEVC_FILT_4TAP(dst87_r, dst109_r, filt_h0, filt_h1);
    dst0_l = HEVC_FILT_4TAP(dst1021_l, dst3243_l, filt_h0, filt_h1);
    dst1_l = HEVC_FILT_4TAP(dst3243_l, dst5465_l, filt_h0, filt_h1);
    dst2_l = HEVC_FILT_4TAP(dst5465_l, dst7687_l, filt_h0, filt_h1);
    dst3_l = HEVC_FILT_4TAP(dst7687_l, dst98109_l, filt_h0, filt_h1);
    SRA_4V(dst0_r, dst1_r, dst2_r, dst3_r, 6);
    SRA_4V(dst4_r, dst5_r, dst6_r, dst7_r, 6);
    SRA_4V(dst0_l, dst1_l, dst2_l, dst3_l, 6);
    PCKEV_H2_SW(dst1_r, dst0_r, dst3_r, dst2_r, dst0, dst1);
    PCKEV_H2_SW(dst5_r, dst4_r, dst7_r, dst6_r, dst2, dst3);

    LD2(src1_ptr, src2_stride, tp0, tp1);
    INSERT_D2_SH(tp0, tp1, in0);
    LD2(src1_ptr + 2 * src2_stride, src2_stride, tp0, tp1);
    INSERT_D2_SH(tp0, tp1, in1);
    LD2(src1_ptr + 4 * src2_stride, src2_stride, tp0, tp1);
    INSERT_D2_SH(tp0, tp1, in2);
    LD2(src1_ptr + 6 * src2_stride, src2_stride, tp0, tp1);
    INSERT_D2_SH(tp0, tp1, in3);

    ILVRL_H2_SH(dst0, in0, tmp0, tmp1);
    ILVRL_H2_SH(dst1, in1, tmp2, tmp3);
    ILVRL_H2_SH(dst2, in2, tmp4, tmp5);
    ILVRL_H2_SH(dst3, in3, tmp6, tmp7);
    dst0 = __msa_dpadd_s_w(offset_vec, tmp0, weight_vec);
    dst1 = __msa_dpadd_s_w(offset_vec, tmp1, weight_vec);
    dst2 = __msa_dpadd_s_w(offset_vec, tmp2, weight_vec);
    dst3 = __msa_dpadd_s_w(offset_vec, tmp3, weight_vec);
    dst4 = __msa_dpadd_s_w(offset_vec, tmp4, weight_vec);
    dst5 = __msa_dpadd_s_w(offset_vec, tmp5, weight_vec);
    dst6 = __msa_dpadd_s_w(offset_vec, tmp6, weight_vec);
    dst7 = __msa_dpadd_s_w(offset_vec, tmp7, weight_vec);
    SRAR_W4_SW(dst0, dst1, dst2, dst3, rnd_vec);
    SRAR_W4_SW(dst4, dst5, dst6, dst7, rnd_vec);
    PCKEV_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6, tmp0, tmp1,
                tmp2, tmp3);
    CLIP_SH4_0_255_MAX_SATU(tmp0, tmp1, tmp2, tmp3);
    PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
    ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);

    PCKEV_H2_SW(dst1_l, dst0_l, dst3_l, dst2_l, dst4, dst5);

    LW4(src1_ptr + 4, src2_stride, tpw0, tpw1, tpw2, tpw3);
    src1_ptr += (4 * src2_stride);
    INSERT_W4_SH(tpw0, tpw1, tpw2, tpw3, in4);
    LW4(src1_ptr + 4, src2_stride, tpw0, tpw1, tpw2, tpw3);
    INSERT_W4_SH(tpw0, tpw1, tpw2, tpw3, in5);

    ILVRL_H2_SH(dst4, in4, tmp0, tmp1);
    ILVRL_H2_SH(dst5, in5, tmp2, tmp3);
    dst0 = __msa_dpadd_s_w(offset_vec, tmp0, weight_vec);
    dst1 = __msa_dpadd_s_w(offset_vec, tmp1, weight_vec);
    dst2 = __msa_dpadd_s_w(offset_vec, tmp2, weight_vec);
    dst3 = __msa_dpadd_s_w(offset_vec, tmp3, weight_vec);
    SRAR_W4_SW(dst0, dst1, dst2, dst3, rnd_vec);
    PCKEV_H2_SH(dst1, dst0, dst3, dst2, tmp4, tmp5);
    CLIP_SH2_0_255_MAX_SATU(tmp4, tmp5);
    out2 = (v16u8) __msa_pckev_b((v16i8) tmp5, (v16i8) tmp4);
    ST_H8(out2, 0, 1, 2, 3, 4, 5, 6, 7, dst + 4, dst_stride);
}
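/* Horizontal + vertical 4-tap bi-weighted MC for an 8x2 block. */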
static void hevc_hv_biwgt_4t_8x2_msa(uint8_t *src0_ptr,
                                     int32_t src_stride,
                                     int16_t *src1_ptr,
                                     int32_t src2_stride,
                                     uint8_t *dst,
                                     int32_t dst_stride,
                                     const int8_t *filter_x,
                                     const int8_t *filter_y,
                                     int32_t weight0,
                                     int32_t weight1,
                                     int32_t offset0,
                                     int32_t offset1,
                                     int32_t rnd_val)
{
    int32_t weight, offset;
    v16u8 out;
    v16i8 src0, src1, src2, src3, src4;
    v8i16 filt0, filt1;
    v8i16 filt_h0, filt_h1;
    v16i8 mask0 = LD_SB(ff_hevc_mask_arr);
    v16i8 mask1;
    v8i16 filter_vec, weight_vec;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9;
    v8i16 dst0, dst1, dst2, dst3, dst4;
    v8i16 in0, in1;
    v4i32 dst0_r, dst0_l, dst1_r, dst1_l;
    v8i16 dst10_r, dst32_r, dst21_r, dst43_r;
    v8i16 dst10_l, dst32_l, dst21_l, dst43_l;
    v8i16 tmp0, tmp1, tmp2, tmp3;
    v4i32 offset_vec, rnd_vec, const_vec;

    src0_ptr -= (src_stride + 1);

    filter_vec = LD_SH(filter_x);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    filter_vec = LD_SH(filter_y);
    UNPCK_R_SB_SH(filter_vec, filter_vec);
    SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);

    mask1 = mask0 + 2;

    offset = (offset0 + offset1) << rnd_val;
    weight0 = weight0 & 0x0000FFFF;
    weight = weight0 | (weight1 << 16);

    const_vec = __msa_fill_w((128 * weight1));
    const_vec <<= 6;
    offset_vec = __msa_fill_w(offset);
    weight_vec = (v8i16) __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val + 1);
    offset_vec += const_vec;

    LD_SB5(src0_ptr, src_stride, src0, src1, src2, src3, src4);
    XORI_B5_128_SB(src0, src1, src2, src3, src4);

    LD_SH2(src1_ptr, src2_stride, in0, in1);

    VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
    VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
    VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
    VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec6, vec7);
    VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec8, vec9);

    dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
    dst1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
    dst2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
    dst3 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
    dst4 = HEVC_FILT_4TAP_SH(vec8, vec9, filt0, filt1);

    ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l);
    ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l);
    ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l);
    ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l);
    dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
    dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
    dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
    dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
    SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
    PCKEV_H2_SH(dst0_l, dst0_r, dst1_l, dst1_r, tmp1, tmp3);
    ILVRL_H2_SH(tmp1, in0, tmp0, tmp1);
    ILVRL_H2_SH(tmp3, in1, tmp2, tmp3);
    dst0_r = __msa_dpadd_s_w(offset_vec, tmp0, weight_vec);
    dst0_l = __msa_dpadd_s_w(offset_vec, tmp1, weight_vec);
    dst1_r = __msa_dpadd_s_w(offset_vec, tmp2, weight_vec);
    dst1_l = __msa_dpadd_s_w(offset_vec, tmp3, weight_vec);
    SRAR_W4_SW(dst0_r, dst0_l, dst1_r, dst1_l, rnd_vec);
    PCKEV_H2_SH(dst0_l, dst0_r, dst1_l, dst1_r, tmp0, tmp1);
    CLIP_SH2_0_255_MAX_SATU(tmp0, tmp1);
    out = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
    ST_D2(out, 0, 1, dst, dst_stride);
}
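/* Horizontal + vertical 4-tap bi-weighted MC, height 4, width8mult columns
   of 8 pixels each. */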
static void hevc_hv_biwgt_4t_8multx4_msa(uint8_t *src0_ptr,
                                         int32_t src_stride,
                                         int16_t *src1_ptr,
                                         int32_t src2_stride,
                                         uint8_t *dst,
                                         int32_t dst_stride,
                                         const int8_t *filter_x,
                                         const int8_t *filter_y,
                                         int32_t weight0,
                                         int32_t weight1,
                                         int32_t offset0,
                                         int32_t offset1,
                                         int32_t rnd_val,
                                         int32_t width8mult)
{
    int32_t weight, offset;
    uint32_t cnt;
    v16u8 out0, out1;
    v16i8 src0, src1, src2, src3, src4, src5, src6, mask0, mask1;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v8i16 filt0, filt1, filt_h0, filt_h1, filter_vec, weight_vec;
    v8i16 dsth0, dsth1, dsth2, dsth3, dsth4, dsth5, dsth6;
    v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, in0, in1, in2, in3;
    v8i16 dst10_r, dst32_r, dst54_r, dst21_r, dst43_r, dst65_r;
    v8i16 dst10_l, dst32_l, dst54_l, dst21_l, dst43_l, dst65_l;
    v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
    v4i32 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
    v4i32 offset_vec, rnd_vec, const_vec;

    src0_ptr -= (src_stride + 1);

    filter_vec = LD_SH(filter_x);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    filter_vec = LD_SH(filter_y);
    UNPCK_R_SB_SH(filter_vec, filter_vec);
    SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);

    mask0 = LD_SB(ff_hevc_mask_arr);
    mask1 = mask0 + 2;

    offset = (offset0 + offset1) << rnd_val;
    weight0 = weight0 & 0x0000FFFF;
    weight = weight0 | (weight1 << 16);

    const_vec = __msa_fill_w((128 * weight1));
    const_vec <<= 6;
    offset_vec = __msa_fill_w(offset);
    rnd_vec = __msa_fill_w(rnd_val + 1);
    offset_vec += const_vec;
    weight_vec = (v8i16) __msa_fill_w(weight);

    for (cnt = width8mult; cnt--;) {
        LD_SB7(src0_ptr, src_stride, src0, src1, src2, src3, src4, src5, src6);
        src0_ptr += 8;
        XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);

        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
        src1_ptr += 8;

        VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
        VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
        VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);

        dsth0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
        dsth1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
        dsth2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);

        ILVRL_H2_SH(dsth1, dsth0, dst10_r, dst10_l);
        ILVRL_H2_SH(dsth2, dsth1, dst21_r, dst21_l);

        VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
        VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec2, vec3);
        VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec4, vec5);
        VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec6, vec7);

        dsth3 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
        dsth4 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
        dsth5 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
        dsth6 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);

        ILVRL_H2_SH(dsth3, dsth2, dst32_r, dst32_l);
        ILVRL_H2_SH(dsth4, dsth3, dst43_r, dst43_l);
        ILVRL_H2_SH(dsth5, dsth4, dst54_r, dst54_l);
        ILVRL_H2_SH(dsth6, dsth5, dst65_r, dst65_l);

        dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
        dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
        dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
        dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
        dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
        dst2_l = HEVC_FILT_4TAP(dst32_l, dst54_l, filt_h0, filt_h1);
        dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
        dst3_l = HEVC_FILT_4TAP(dst43_l, dst65_l, filt_h0, filt_h1);

        SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
        SRA_4V(dst2_r, dst2_l, dst3_r, dst3_l, 6);
        PCKEV_H4_SW(dst0_l, dst0_r, dst1_l, dst1_r, dst2_l, dst2_r, dst3_l,
                    dst3_r, dst0, dst1, dst2, dst3);
        ILVRL_H2_SH(dst0, in0, tmp0, tmp1);
        ILVRL_H2_SH(dst1, in1, tmp2, tmp3);
        ILVRL_H2_SH(dst2, in2, tmp4, tmp5);
        ILVRL_H2_SH(dst3, in3, tmp6, tmp7);
        dst0 = __msa_dpadd_s_w(offset_vec, tmp0, weight_vec);
        dst1 = __msa_dpadd_s_w(offset_vec, tmp1, weight_vec);
        dst2 = __msa_dpadd_s_w(offset_vec, tmp2, weight_vec);
        dst3 = __msa_dpadd_s_w(offset_vec, tmp3, weight_vec);
        dst4 = __msa_dpadd_s_w(offset_vec, tmp4, weight_vec);
        dst5 = __msa_dpadd_s_w(offset_vec, tmp5, weight_vec);
        dst6 = __msa_dpadd_s_w(offset_vec, tmp6, weight_vec);
        dst7 = __msa_dpadd_s_w(offset_vec, tmp7, weight_vec);
        SRAR_W4_SW(dst0, dst1, dst2, dst3, rnd_vec);
        SRAR_W4_SW(dst4, dst5, dst6, dst7, rnd_vec);
        PCKEV_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6,
                    tmp0, tmp1, tmp2, tmp3);
        CLIP_SH4_0_255_MAX_SATU(tmp0, tmp1, tmp2, tmp3);
        PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
        ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
        dst += 8;
    }
}
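/* Horizontal + vertical 4-tap bi-weighted MC for an 8x6 block. */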
static void hevc_hv_biwgt_4t_8x6_msa(uint8_t *src0_ptr,
                                     int32_t src_stride,
                                     int16_t *src1_ptr,
                                     int32_t src2_stride,
                                     uint8_t *dst,
                                     int32_t dst_stride,
                                     const int8_t *filter_x,
                                     const int8_t *filter_y,
                                     int32_t weight0,
                                     int32_t weight1,
                                     int32_t offset0,
                                     int32_t offset1,
                                     int32_t rnd_val)
{
    uint32_t offset, weight;
    v16u8 out0, out1, out2;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
    v8i16 filt0, filt1;
    v8i16 filt_h0, filt_h1;
    v16i8 mask0 = LD_SB(ff_hevc_mask_arr);
    v16i8 mask1;
    v8i16 filter_vec, weight_vec;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9;
    v16i8 vec10, vec11, vec12, vec13, vec14, vec15, vec16, vec17;
    v8i16 dsth0, dsth1, dsth2, dsth3, dsth4, dsth5, dsth6, dsth7, dsth8;
    v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
    v4i32 dst4_r, dst4_l, dst5_r, dst5_l;
    v8i16 dst10_r, dst32_r, dst10_l, dst32_l;
    v8i16 dst21_r, dst43_r, dst21_l, dst43_l;
    v8i16 dst54_r, dst54_l, dst65_r, dst65_l;
    v8i16 dst76_r, dst76_l, dst87_r, dst87_l;
    v8i16 in0, in1, in2, in3, in4, in5;
    v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
    v4i32 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
    v4i32 offset_vec, rnd_vec, const_vec;

    src0_ptr -= (src_stride + 1);

    filter_vec = LD_SH(filter_x);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    filter_vec = LD_SH(filter_y);
    UNPCK_R_SB_SH(filter_vec, filter_vec);
    SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);

    mask1 = mask0 + 2;

    offset = (offset0 + offset1) << rnd_val;
    weight0 = weight0 & 0x0000FFFF;
    weight = weight0 | (weight1 << 16);

    const_vec = __msa_fill_w((128 * weight1));
    const_vec <<= 6;
    offset_vec = __msa_fill_w(offset);
    weight_vec = (v8i16) __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val + 1);
    offset_vec += const_vec;

    LD_SB5(src0_ptr, src_stride, src0, src1, src2, src3, src4);
    src0_ptr += (5 * src_stride);
    LD_SB4(src0_ptr, src_stride, src5, src6, src7, src8);
    XORI_B5_128_SB(src0, src1, src2, src3, src4);
    XORI_B4_128_SB(src5, src6, src7, src8);

    LD_SH6(src1_ptr, src2_stride, in0, in1, in2, in3, in4, in5);

    VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
    VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
    VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
    VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec6, vec7);
    VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec8, vec9);
    VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec10, vec11);
    VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec12, vec13);
    VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec14, vec15);
    VSHF_B2_SB(src8, src8, src8, src8, mask0, mask1, vec16, vec17);

    dsth0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
    dsth1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
    dsth2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
    dsth3 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
    dsth4 = HEVC_FILT_4TAP_SH(vec8, vec9, filt0, filt1);
    dsth5 = HEVC_FILT_4TAP_SH(vec10, vec11, filt0, filt1);
    dsth6 = HEVC_FILT_4TAP_SH(vec12, vec13, filt0, filt1);
    dsth7 = HEVC_FILT_4TAP_SH(vec14, vec15, filt0, filt1);
    dsth8 = HEVC_FILT_4TAP_SH(vec16, vec17, filt0, filt1);

    ILVRL_H2_SH(dsth1, dsth0, dst10_r, dst10_l);
    ILVRL_H2_SH(dsth2, dsth1, dst21_r, dst21_l);
    ILVRL_H2_SH(dsth3, dsth2, dst32_r, dst32_l);
    ILVRL_H2_SH(dsth4, dsth3, dst43_r, dst43_l);
    ILVRL_H2_SH(dsth5, dsth4, dst54_r, dst54_l);
    ILVRL_H2_SH(dsth6, dsth5, dst65_r, dst65_l);
    ILVRL_H2_SH(dsth7, dsth6, dst76_r, dst76_l);
    ILVRL_H2_SH(dsth8, dsth7, dst87_r, dst87_l);

    dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
    dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
    dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
    dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
    dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
    dst2_l = HEVC_FILT_4TAP(dst32_l, dst54_l, filt_h0, filt_h1);
    dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
    dst3_l = HEVC_FILT_4TAP(dst43_l, dst65_l, filt_h0, filt_h1);
    dst4_r = HEVC_FILT_4TAP(dst54_r, dst76_r, filt_h0, filt_h1);
    dst4_l = HEVC_FILT_4TAP(dst54_l, dst76_l, filt_h0, filt_h1);
    dst5_r = HEVC_FILT_4TAP(dst65_r, dst87_r, filt_h0, filt_h1);
    dst5_l = HEVC_FILT_4TAP(dst65_l, dst87_l, filt_h0, filt_h1);

    SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
    SRA_4V(dst2_r, dst2_l, dst3_r, dst3_l, 6);
    SRA_4V(dst4_r, dst4_l, dst5_r, dst5_l, 6);
    PCKEV_H4_SW(dst0_l, dst0_r, dst1_l, dst1_r, dst2_l, dst2_r, dst3_l, dst3_r,
                dst0, dst1, dst2, dst3);
    ILVRL_H2_SH(dst0, in0, tmp0, tmp1);
    ILVRL_H2_SH(dst1, in1, tmp2, tmp3);
    ILVRL_H2_SH(dst2, in2, tmp4, tmp5);
    ILVRL_H2_SH(dst3, in3, tmp6, tmp7);
    dst0 = __msa_dpadd_s_w(offset_vec, tmp0, weight_vec);
    dst1 = __msa_dpadd_s_w(offset_vec, tmp1, weight_vec);
    dst2 = __msa_dpadd_s_w(offset_vec, tmp2, weight_vec);
    dst3 = __msa_dpadd_s_w(offset_vec, tmp3, weight_vec);
    dst4 = __msa_dpadd_s_w(offset_vec, tmp4, weight_vec);
    dst5 = __msa_dpadd_s_w(offset_vec, tmp5, weight_vec);
    dst6 = __msa_dpadd_s_w(offset_vec, tmp6, weight_vec);
    dst7 = __msa_dpadd_s_w(offset_vec, tmp7, weight_vec);
    SRAR_W4_SW(dst0, dst1, dst2, dst3, rnd_vec);
    SRAR_W4_SW(dst4, dst5, dst6, dst7, rnd_vec);
    PCKEV_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6,
                tmp0, tmp1, tmp2, tmp3);
    CLIP_SH4_0_255_MAX_SATU(tmp0, tmp1, tmp2, tmp3);
    PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);

    PCKEV_H2_SW(dst4_l, dst4_r, dst5_l, dst5_r, dst0, dst1);
    ILVRL_H2_SH(dst0, in4, tmp0, tmp1);
    ILVRL_H2_SH(dst1, in5, tmp2, tmp3);
    dst0 = __msa_dpadd_s_w(offset_vec, tmp0, weight_vec);
    dst1 = __msa_dpadd_s_w(offset_vec, tmp1, weight_vec);
    dst2 = __msa_dpadd_s_w(offset_vec, tmp2, weight_vec);
    dst3 = __msa_dpadd_s_w(offset_vec, tmp3, weight_vec);
    SRAR_W4_SW(dst0, dst1, dst2, dst3, rnd_vec);
    PCKEV_H2_SH(dst1, dst0, dst3, dst2, tmp4, tmp5);
    CLIP_SH2_0_255_MAX_SATU(tmp4, tmp5);
    out2 = (v16u8) __msa_pckev_b((v16i8) tmp5, (v16i8) tmp4);
    ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
    ST_D2(out2, 0, 1, dst + 4 * dst_stride, dst_stride);
}
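/* Horizontal + vertical 4-tap bi-weighted MC, width a multiple of 8,
   height a multiple of 4; each 8-wide column is filtered 4 rows at a time. */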
  4779. static void hevc_hv_biwgt_4t_8multx4mult_msa(uint8_t *src0_ptr,
  4780. int32_t src_stride,
  4781. int16_t *src1_ptr,
  4782. int32_t src2_stride,
  4783. uint8_t *dst,
  4784. int32_t dst_stride,
  4785. const int8_t *filter_x,
  4786. const int8_t *filter_y,
  4787. int32_t height,
  4788. int32_t weight0,
  4789. int32_t weight1,
  4790. int32_t offset0,
  4791. int32_t offset1,
  4792. int32_t rnd_val,
  4793. int32_t width)
  4794. {
  4795. uint32_t loop_cnt;
  4796. uint32_t cnt;
  4797. int32_t offset, weight;
  4798. uint8_t *src0_ptr_tmp;
  4799. int16_t *src1_ptr_tmp;
  4800. uint8_t *dst_tmp;
  4801. v16u8 out0, out1;
  4802. v16i8 src0, src1, src2, src3, src4, src5, src6;
  4803. v8i16 in0, in1, in2, in3;
  4804. v8i16 filt0, filt1;
  4805. v8i16 filt_h0, filt_h1;
  4806. v16i8 mask0 = LD_SB(ff_hevc_mask_arr);
  4807. v16i8 mask1;
  4808. v8i16 filter_vec;
  4809. v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
  4810. v8i16 dsth0, dsth1, dsth2, dsth3, dsth4, dsth5, dsth6;
  4811. v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
  4812. v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
  4813. v4i32 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
  4814. v8i16 dst10_r, dst32_r, dst54_r, dst21_r, dst43_r, dst65_r;
  4815. v8i16 dst10_l, dst32_l, dst54_l, dst21_l, dst43_l, dst65_l, weight_vec;
  4816. v4i32 offset_vec, rnd_vec, const_vec;
  4817. src0_ptr -= (src_stride + 1);
  4818. filter_vec = LD_SH(filter_x);
  4819. SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
    filter_vec = LD_SH(filter_y);
    UNPCK_R_SB_SH(filter_vec, filter_vec);
    SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);

    mask1 = mask0 + 2;

    offset = (offset0 + offset1) << rnd_val;
    weight0 = weight0 & 0x0000FFFF;
    weight = weight0 | (weight1 << 16);

    const_vec = __msa_fill_w((128 * weight1));
    const_vec <<= 6;
    offset_vec = __msa_fill_w(offset);
    weight_vec = (v8i16) __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val + 1);
    offset_vec += const_vec;

    for (cnt = width >> 3; cnt--;) {
        src0_ptr_tmp = src0_ptr;
        src1_ptr_tmp = src1_ptr;
        dst_tmp = dst;

        LD_SB3(src0_ptr_tmp, src_stride, src0, src1, src2);
        src0_ptr_tmp += (3 * src_stride);
        XORI_B3_128_SB(src0, src1, src2);

        VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
        VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
        VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);

        dsth0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
        dsth1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
        dsth2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);

        ILVRL_H2_SH(dsth1, dsth0, dst10_r, dst10_l);
        ILVRL_H2_SH(dsth2, dsth1, dst21_r, dst21_l);

        for (loop_cnt = height >> 2; loop_cnt--;) {
            LD_SB4(src0_ptr_tmp, src_stride, src3, src4, src5, src6);
            src0_ptr_tmp += (4 * src_stride);
            LD_SH4(src1_ptr_tmp, src2_stride, in0, in1, in2, in3);
            src1_ptr_tmp += (4 * src2_stride);
            XORI_B4_128_SB(src3, src4, src5, src6);

            VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
            VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec2, vec3);
            VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec4, vec5);
            VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec6, vec7);

            dsth3 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
            dsth4 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
            dsth5 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
            dsth6 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);

            ILVRL_H2_SH(dsth3, dsth2, dst32_r, dst32_l);
            ILVRL_H2_SH(dsth4, dsth3, dst43_r, dst43_l);
            ILVRL_H2_SH(dsth5, dsth4, dst54_r, dst54_l);
            ILVRL_H2_SH(dsth6, dsth5, dst65_r, dst65_l);

            dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
            dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
            dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
            dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
            dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
            dst2_l = HEVC_FILT_4TAP(dst32_l, dst54_l, filt_h0, filt_h1);
            dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
            dst3_l = HEVC_FILT_4TAP(dst43_l, dst65_l, filt_h0, filt_h1);

            SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
            SRA_4V(dst2_r, dst2_l, dst3_r, dst3_l, 6);
            PCKEV_H4_SW(dst0_l, dst0_r, dst1_l, dst1_r, dst2_l, dst2_r, dst3_l,
                        dst3_r, dst0, dst1, dst2, dst3);
            ILVRL_H2_SH(dst0, in0, tmp0, tmp1);
            ILVRL_H2_SH(dst1, in1, tmp2, tmp3);
            ILVRL_H2_SH(dst2, in2, tmp4, tmp5);
            ILVRL_H2_SH(dst3, in3, tmp6, tmp7);
            dst0 = __msa_dpadd_s_w(offset_vec, tmp0, weight_vec);
            dst1 = __msa_dpadd_s_w(offset_vec, tmp1, weight_vec);
            dst2 = __msa_dpadd_s_w(offset_vec, tmp2, weight_vec);
            dst3 = __msa_dpadd_s_w(offset_vec, tmp3, weight_vec);
            dst4 = __msa_dpadd_s_w(offset_vec, tmp4, weight_vec);
            dst5 = __msa_dpadd_s_w(offset_vec, tmp5, weight_vec);
            dst6 = __msa_dpadd_s_w(offset_vec, tmp6, weight_vec);
            dst7 = __msa_dpadd_s_w(offset_vec, tmp7, weight_vec);
            SRAR_W4_SW(dst0, dst1, dst2, dst3, rnd_vec);
            SRAR_W4_SW(dst4, dst5, dst6, dst7, rnd_vec);
            PCKEV_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6,
                        tmp0, tmp1, tmp2, tmp3);
            CLIP_SH4_0_255_MAX_SATU(tmp0, tmp1, tmp2, tmp3);
            PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
            ST_D4(out0, out1, 0, 1, 0, 1, dst_tmp, dst_stride);
            dst_tmp += (4 * dst_stride);

            dst10_r = dst54_r;
            dst10_l = dst54_l;
            dst21_r = dst65_r;
            dst21_l = dst65_l;
            dsth2 = dsth6;
        }

        src0_ptr += 8;
        dst += 8;
        src1_ptr += 8;
    }
}
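
/* 8-wide hv bi-weighted 4-tap: dispatch to the kernel specialized for the
 * requested height (2, 4, 6 or any multiple of 4 rows). */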
static void hevc_hv_biwgt_4t_8w_msa(uint8_t *src0_ptr,
                                    int32_t src_stride,
                                    int16_t *src1_ptr,
                                    int32_t src2_stride,
                                    uint8_t *dst,
                                    int32_t dst_stride,
                                    const int8_t *filter_x,
                                    const int8_t *filter_y,
                                    int32_t height,
                                    int32_t weight0,
                                    int32_t weight1,
                                    int32_t offset0,
                                    int32_t offset1,
                                    int32_t rnd_val)
{
    if (2 == height) {
        hevc_hv_biwgt_4t_8x2_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
                                 dst, dst_stride, filter_x, filter_y,
                                 weight0, weight1, offset0, offset1, rnd_val);
    } else if (4 == height) {
        hevc_hv_biwgt_4t_8multx4_msa(src0_ptr, src_stride, src1_ptr,
                                     src2_stride, dst, dst_stride, filter_x,
                                     filter_y, weight0, weight1, offset0,
                                     offset1, rnd_val, 1);
    } else if (6 == height) {
        hevc_hv_biwgt_4t_8x6_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
                                 dst, dst_stride, filter_x, filter_y,
                                 weight0, weight1, offset0, offset1, rnd_val);
    } else if (0 == (height % 4)) {
        hevc_hv_biwgt_4t_8multx4mult_msa(src0_ptr, src_stride,
                                         src1_ptr, src2_stride,
                                         dst, dst_stride, filter_x, filter_y,
                                         height, weight0,
                                         weight1, offset0, offset1, rnd_val, 8);
    }
}
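
/* 12-wide hv bi-weighted 4-tap: the left 8 columns reuse the 8-wide code
 * path, then the remaining 4 columns are handled with the 4-wide shuffle
 * masks. */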
static void hevc_hv_biwgt_4t_12w_msa(uint8_t *src0_ptr,
                                     int32_t src_stride,
                                     int16_t *src1_ptr,
                                     int32_t src2_stride,
                                     uint8_t *dst,
                                     int32_t dst_stride,
                                     const int8_t *filter_x,
                                     const int8_t *filter_y,
                                     int32_t height,
                                     int32_t weight0,
                                     int32_t weight1,
                                     int32_t offset0,
                                     int32_t offset1,
                                     int32_t rnd_val)
{
    uint32_t loop_cnt;
    uint64_t tp0, tp1;
    int32_t offset, weight;
    uint8_t *src0_ptr_tmp, *dst_tmp;
    int16_t *src1_ptr_tmp;
    v16u8 out0, out1;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v16i8 mask0, mask1, mask2, mask3;
    v8i16 filt0, filt1, filt_h0, filt_h1, filter_vec;
    v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
    v8i16 dsth0, dsth1, dsth2, dsth3, dsth4, dsth5, dsth6, weight_vec;
    v8i16 dst10, dst21, dst22, dst73, dst84, dst95, dst106;
    v8i16 dst76_r, dst98_r, dst87_r, dst109_r;
    v8i16 in0 = { 0 }, in1 = { 0 }, in2 = { 0 }, in3 = { 0 };
    v8i16 dst10_r, dst32_r, dst54_r, dst21_r, dst43_r, dst65_r;
    v8i16 dst10_l, dst32_l, dst54_l, dst21_l, dst43_l, dst65_l;
    v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
    v4i32 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
    v4i32 offset_vec, rnd_vec, const_vec;

    src0_ptr -= (src_stride + 1);

    filter_vec = LD_SH(filter_x);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    filter_vec = LD_SH(filter_y);
    UNPCK_R_SB_SH(filter_vec, filter_vec);
    SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);

    mask0 = LD_SB(ff_hevc_mask_arr);
    mask1 = mask0 + 2;

    offset = (offset0 + offset1) << rnd_val;
    weight0 = weight0 & 0x0000FFFF;
    weight = weight0 | (weight1 << 16);

    const_vec = __msa_fill_w((128 * weight1));
    const_vec <<= 6;
    offset_vec = __msa_fill_w(offset);
    rnd_vec = __msa_fill_w(rnd_val + 1);
    offset_vec += const_vec;
    weight_vec = (v8i16) __msa_fill_w(weight);

    src0_ptr_tmp = src0_ptr;
    dst_tmp = dst;
    src1_ptr_tmp = src1_ptr;

    LD_SB3(src0_ptr_tmp, src_stride, src0, src1, src2);
    src0_ptr_tmp += (3 * src_stride);
    XORI_B3_128_SB(src0, src1, src2);

    VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
    VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
    VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);

    dsth0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
    dsth1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
    dsth2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);

    ILVRL_H2_SH(dsth1, dsth0, dst10_r, dst10_l);
    ILVRL_H2_SH(dsth2, dsth1, dst21_r, dst21_l);

    for (loop_cnt = 4; loop_cnt--;) {
        LD_SB4(src0_ptr_tmp, src_stride, src3, src4, src5, src6);
        src0_ptr_tmp += (4 * src_stride);
        XORI_B4_128_SB(src3, src4, src5, src6);
        LD_SH4(src1_ptr_tmp, src2_stride, in0, in1, in2, in3);
        src1_ptr_tmp += (4 * src2_stride);

        VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
        VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec2, vec3);
        VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec4, vec5);
        VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec6, vec7);

        dsth3 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
        dsth4 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
        dsth5 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
        dsth6 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);

        ILVRL_H2_SH(dsth3, dsth2, dst32_r, dst32_l);
        ILVRL_H2_SH(dsth4, dsth3, dst43_r, dst43_l);
        ILVRL_H2_SH(dsth5, dsth4, dst54_r, dst54_l);
        ILVRL_H2_SH(dsth6, dsth5, dst65_r, dst65_l);

        dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
        dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
        dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
        dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
        dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
        dst2_l = HEVC_FILT_4TAP(dst32_l, dst54_l, filt_h0, filt_h1);
        dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
        dst3_l = HEVC_FILT_4TAP(dst43_l, dst65_l, filt_h0, filt_h1);

        SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
        SRA_4V(dst2_r, dst2_l, dst3_r, dst3_l, 6);
        PCKEV_H4_SW(dst0_l, dst0_r, dst1_l, dst1_r, dst2_l, dst2_r, dst3_l,
                    dst3_r, dst0, dst1, dst2, dst3);
        ILVRL_H2_SH(dst0, in0, tmp0, tmp1);
        ILVRL_H2_SH(dst1, in1, tmp2, tmp3);
        ILVRL_H2_SH(dst2, in2, tmp4, tmp5);
        ILVRL_H2_SH(dst3, in3, tmp6, tmp7);
        dst0 = __msa_dpadd_s_w(offset_vec, tmp0, weight_vec);
        dst1 = __msa_dpadd_s_w(offset_vec, tmp1, weight_vec);
        dst2 = __msa_dpadd_s_w(offset_vec, tmp2, weight_vec);
        dst3 = __msa_dpadd_s_w(offset_vec, tmp3, weight_vec);
        dst4 = __msa_dpadd_s_w(offset_vec, tmp4, weight_vec);
        dst5 = __msa_dpadd_s_w(offset_vec, tmp5, weight_vec);
        dst6 = __msa_dpadd_s_w(offset_vec, tmp6, weight_vec);
        dst7 = __msa_dpadd_s_w(offset_vec, tmp7, weight_vec);
        SRAR_W4_SW(dst0, dst1, dst2, dst3, rnd_vec);
        SRAR_W4_SW(dst4, dst5, dst6, dst7, rnd_vec);
        PCKEV_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6,
                    tmp0, tmp1, tmp2, tmp3);
        CLIP_SH4_0_255_MAX_SATU(tmp0, tmp1, tmp2, tmp3);
        PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
        ST_D4(out0, out1, 0, 1, 0, 1, dst_tmp, dst_stride);
        dst_tmp += (4 * dst_stride);

        dst10_r = dst54_r;
        dst10_l = dst54_l;
        dst21_r = dst65_r;
        dst21_l = dst65_l;
        dsth2 = dsth6;
    }

    src0_ptr += 8;
    dst += 8;
    src1_ptr += 8;
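
    /* Rightmost 4 columns: switch to the 4-wide shuffle masks and filter
     * two rows per source vector, eight rows per loop iteration below. */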
    mask2 = LD_SB(ff_hevc_mask_arr + 16);
    mask3 = mask2 + 2;

    LD_SB3(src0_ptr, src_stride, src0, src1, src2);
    src0_ptr += (3 * src_stride);
    XORI_B3_128_SB(src0, src1, src2);
    VSHF_B2_SB(src0, src1, src0, src1, mask2, mask3, vec0, vec1);
    VSHF_B2_SB(src1, src2, src1, src2, mask2, mask3, vec2, vec3);

    dst10 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
    dst21 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);

    ILVRL_H2_SH(dst21, dst10, dst10_r, dst21_r);
    dst22 = (v8i16) __msa_splati_d((v2i64) dst21, 1);

    for (loop_cnt = 2; loop_cnt--;) {
        LD_SB8(src0_ptr, src_stride, src3, src4, src5, src6, src7, src8, src9,
               src10);
        src0_ptr += (8 * src_stride);
        XORI_B8_128_SB(src3, src4, src5, src6, src7, src8, src9, src10);
        VSHF_B2_SB(src3, src7, src3, src7, mask2, mask3, vec0, vec1);
        VSHF_B2_SB(src4, src8, src4, src8, mask2, mask3, vec2, vec3);
        VSHF_B2_SB(src5, src9, src5, src9, mask2, mask3, vec4, vec5);
        VSHF_B2_SB(src6, src10, src6, src10, mask2, mask3, vec6, vec7);

        dst73 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
        dst84 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
        dst95 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
        dst106 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);

        dst32_r = __msa_ilvr_h(dst73, dst22);
        ILVRL_H2_SH(dst84, dst73, dst43_r, dst87_r);
        ILVRL_H2_SH(dst95, dst84, dst54_r, dst98_r);
        ILVRL_H2_SH(dst106, dst95, dst65_r, dst109_r);
        dst22 = (v8i16) __msa_splati_d((v2i64) dst73, 1);
        dst76_r = __msa_ilvr_h(dst22, dst106);

        LD2(src1_ptr, src2_stride, tp0, tp1);
        src1_ptr += 2 * src2_stride;
        INSERT_D2_SH(tp0, tp1, in0);
        LD2(src1_ptr, src2_stride, tp0, tp1);
        src1_ptr += 2 * src2_stride;
        INSERT_D2_SH(tp0, tp1, in1);
        LD2(src1_ptr, src2_stride, tp0, tp1);
        src1_ptr += 2 * src2_stride;
        INSERT_D2_SH(tp0, tp1, in2);
        LD2(src1_ptr, src2_stride, tp0, tp1);
        src1_ptr += 2 * src2_stride;
        INSERT_D2_SH(tp0, tp1, in3);

        dst0 = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
        dst1 = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
        dst2 = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
        dst3 = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
        dst4 = HEVC_FILT_4TAP(dst54_r, dst76_r, filt_h0, filt_h1);
        dst5 = HEVC_FILT_4TAP(dst65_r, dst87_r, filt_h0, filt_h1);
        dst6 = HEVC_FILT_4TAP(dst76_r, dst98_r, filt_h0, filt_h1);
        dst7 = HEVC_FILT_4TAP(dst87_r, dst109_r, filt_h0, filt_h1);

        SRA_4V(dst0, dst1, dst2, dst3, 6);
        SRA_4V(dst4, dst5, dst6, dst7, 6);
        PCKEV_H4_SW(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6,
                    dst0, dst1, dst2, dst3);
        ILVRL_H2_SH(dst0, in0, tmp0, tmp1);
        ILVRL_H2_SH(dst1, in1, tmp2, tmp3);
        ILVRL_H2_SH(dst2, in2, tmp4, tmp5);
        ILVRL_H2_SH(dst3, in3, tmp6, tmp7);
        dst0 = __msa_dpadd_s_w(offset_vec, tmp0, weight_vec);
        dst1 = __msa_dpadd_s_w(offset_vec, tmp1, weight_vec);
        dst2 = __msa_dpadd_s_w(offset_vec, tmp2, weight_vec);
        dst3 = __msa_dpadd_s_w(offset_vec, tmp3, weight_vec);
        dst4 = __msa_dpadd_s_w(offset_vec, tmp4, weight_vec);
        dst5 = __msa_dpadd_s_w(offset_vec, tmp5, weight_vec);
        dst6 = __msa_dpadd_s_w(offset_vec, tmp6, weight_vec);
        dst7 = __msa_dpadd_s_w(offset_vec, tmp7, weight_vec);
        SRAR_W4_SW(dst0, dst1, dst2, dst3, rnd_vec);
        SRAR_W4_SW(dst4, dst5, dst6, dst7, rnd_vec);
        PCKEV_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6,
                    tmp0, tmp1, tmp2, tmp3);
        CLIP_SH4_0_255_MAX_SATU(tmp0, tmp1, tmp2, tmp3);
        PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
        ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
        dst += (8 * dst_stride);

        dst10_r = dst98_r;
        dst21_r = dst109_r;
        dst22 = (v8i16) __msa_splati_d((v2i64) dst106, 1);
    }
}
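
/* The 16-, 24- and 32-wide hv cases reuse the multiple-of-8-columns
 * kernels. */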
static void hevc_hv_biwgt_4t_16w_msa(uint8_t *src0_ptr,
                                     int32_t src_stride,
                                     int16_t *src1_ptr,
                                     int32_t src2_stride,
                                     uint8_t *dst,
                                     int32_t dst_stride,
                                     const int8_t *filter_x,
                                     const int8_t *filter_y,
                                     int32_t height,
                                     int32_t weight0,
                                     int32_t weight1,
                                     int32_t offset0,
                                     int32_t offset1,
                                     int32_t rnd_val)
{
    if (4 == height) {
        hevc_hv_biwgt_4t_8multx4_msa(src0_ptr, src_stride, src1_ptr,
                                     src2_stride, dst, dst_stride, filter_x,
                                     filter_y, weight0, weight1, offset0,
                                     offset1, rnd_val, 2);
    } else {
        hevc_hv_biwgt_4t_8multx4mult_msa(src0_ptr, src_stride, src1_ptr,
                                         src2_stride, dst, dst_stride,
                                         filter_x, filter_y, height, weight0,
                                         weight1, offset0, offset1, rnd_val, 16);
    }
}

static void hevc_hv_biwgt_4t_24w_msa(uint8_t *src0_ptr,
                                     int32_t src_stride,
                                     int16_t *src1_ptr,
                                     int32_t src2_stride,
                                     uint8_t *dst,
                                     int32_t dst_stride,
                                     const int8_t *filter_x,
                                     const int8_t *filter_y,
                                     int32_t height,
                                     int32_t weight0,
                                     int32_t weight1,
                                     int32_t offset0,
                                     int32_t offset1,
                                     int32_t rnd_val)
{
    hevc_hv_biwgt_4t_8multx4mult_msa(src0_ptr, src_stride,
                                     src1_ptr, src2_stride,
                                     dst, dst_stride,
                                     filter_x, filter_y, height, weight0,
                                     weight1, offset0, offset1, rnd_val, 24);
}

static void hevc_hv_biwgt_4t_32w_msa(uint8_t *src0_ptr,
                                     int32_t src_stride,
                                     int16_t *src1_ptr,
                                     int32_t src2_stride,
                                     uint8_t *dst,
                                     int32_t dst_stride,
                                     const int8_t *filter_x,
                                     const int8_t *filter_y,
                                     int32_t height,
                                     int32_t weight0,
                                     int32_t weight1,
                                     int32_t offset0,
                                     int32_t offset1,
                                     int32_t rnd_val)
{
    hevc_hv_biwgt_4t_8multx4mult_msa(src0_ptr, src_stride,
                                     src1_ptr, src2_stride,
                                     dst, dst_stride,
                                     filter_x, filter_y, height, weight0,
                                     weight1, offset0, offset1, rnd_val, 32);
}
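
/* Generate the ff_hevc_put_hevc_bi_w_pel_pixels<W>_8_msa() wrappers that
 * forward to the bi-weighted copy kernels.  For 8-bit samples,
 * shift = 14 + 1 - 8 = 7, so log2Wd = denom + 6. */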
#define BI_W_MC_COPY(WIDTH)                                                 \
void ff_hevc_put_hevc_bi_w_pel_pixels##WIDTH##_8_msa(uint8_t *dst,          \
                                                     ptrdiff_t dst_stride,  \
                                                     uint8_t *src,          \
                                                     ptrdiff_t src_stride,  \
                                                     int16_t *src_16bit,    \
                                                     int height,            \
                                                     int denom,             \
                                                     int weight0,           \
                                                     int weight1,           \
                                                     int offset0,           \
                                                     int offset1,           \
                                                     intptr_t mx,           \
                                                     intptr_t my,           \
                                                     int width)             \
{                                                                           \
    int shift = 14 + 1 - 8;                                                 \
    int log2Wd = denom + shift - 1;                                         \
                                                                            \
    hevc_biwgt_copy_##WIDTH##w_msa(src, src_stride, src_16bit, MAX_PB_SIZE, \
                                   dst, dst_stride, height,                 \
                                   weight0, weight1, offset0,               \
                                   offset1, log2Wd);                        \
}

BI_W_MC_COPY(4);
BI_W_MC_COPY(6);
BI_W_MC_COPY(8);
BI_W_MC_COPY(12);
BI_W_MC_COPY(16);
BI_W_MC_COPY(24);
BI_W_MC_COPY(32);
BI_W_MC_COPY(48);
BI_W_MC_COPY(64);
#undef BI_W_MC_COPY
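
/* Generate the horizontal (hz/mx) and vertical (vt/my) uni-directional
 * bi-weighted wrappers; FILT_DIR selects which motion-vector component
 * indexes the qpel/epel filter table. */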
#define BI_W_MC(PEL, DIR, WIDTH, TAP, DIR1, FILT_DIR)                         \
void ff_hevc_put_hevc_bi_w_##PEL##_##DIR##WIDTH##_8_msa(uint8_t *dst,         \
                                                        ptrdiff_t             \
                                                        dst_stride,           \
                                                        uint8_t *src,         \
                                                        ptrdiff_t             \
                                                        src_stride,           \
                                                        int16_t *src_16bit,   \
                                                        int height,           \
                                                        int denom,            \
                                                        int weight0,          \
                                                        int weight1,          \
                                                        int offset0,          \
                                                        int offset1,          \
                                                        intptr_t mx,          \
                                                        intptr_t my,          \
                                                        int width)            \
{                                                                             \
    const int8_t *filter = ff_hevc_##PEL##_filters[FILT_DIR - 1];             \
    int log2Wd = denom + 14 - 8;                                              \
                                                                              \
    hevc_##DIR1##_biwgt_##TAP##t_##WIDTH##w_msa(src, src_stride, src_16bit,   \
                                                MAX_PB_SIZE, dst, dst_stride, \
                                                filter, height, weight0,      \
                                                weight1, offset0, offset1,    \
                                                log2Wd);                      \
}

BI_W_MC(qpel, h, 4, 8, hz, mx);
BI_W_MC(qpel, h, 8, 8, hz, mx);
BI_W_MC(qpel, h, 12, 8, hz, mx);
BI_W_MC(qpel, h, 16, 8, hz, mx);
BI_W_MC(qpel, h, 24, 8, hz, mx);
BI_W_MC(qpel, h, 32, 8, hz, mx);
BI_W_MC(qpel, h, 48, 8, hz, mx);
BI_W_MC(qpel, h, 64, 8, hz, mx);

BI_W_MC(qpel, v, 4, 8, vt, my);
BI_W_MC(qpel, v, 8, 8, vt, my);
BI_W_MC(qpel, v, 12, 8, vt, my);
BI_W_MC(qpel, v, 16, 8, vt, my);
BI_W_MC(qpel, v, 24, 8, vt, my);
BI_W_MC(qpel, v, 32, 8, vt, my);
BI_W_MC(qpel, v, 48, 8, vt, my);
BI_W_MC(qpel, v, 64, 8, vt, my);

BI_W_MC(epel, h, 4, 4, hz, mx);
BI_W_MC(epel, h, 8, 4, hz, mx);
BI_W_MC(epel, h, 6, 4, hz, mx);
BI_W_MC(epel, h, 12, 4, hz, mx);
BI_W_MC(epel, h, 16, 4, hz, mx);
BI_W_MC(epel, h, 24, 4, hz, mx);
BI_W_MC(epel, h, 32, 4, hz, mx);

BI_W_MC(epel, v, 4, 4, vt, my);
BI_W_MC(epel, v, 8, 4, vt, my);
BI_W_MC(epel, v, 6, 4, vt, my);
BI_W_MC(epel, v, 12, 4, vt, my);
BI_W_MC(epel, v, 16, 4, vt, my);
BI_W_MC(epel, v, 24, 4, vt, my);
BI_W_MC(epel, v, 32, 4, vt, my);
#undef BI_W_MC
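
/* Generate the 2-D (hv) bi-weighted wrappers, which need both the mx- and
 * my-indexed filter taps. */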
#define BI_W_MC_HV(PEL, WIDTH, TAP)                                        \
void ff_hevc_put_hevc_bi_w_##PEL##_hv##WIDTH##_8_msa(uint8_t *dst,         \
                                                     ptrdiff_t dst_stride, \
                                                     uint8_t *src,         \
                                                     ptrdiff_t src_stride, \
                                                     int16_t *src_16bit,   \
                                                     int height,           \
                                                     int denom,            \
                                                     int weight0,          \
                                                     int weight1,          \
                                                     int offset0,          \
                                                     int offset1,          \
                                                     intptr_t mx,          \
                                                     intptr_t my,          \
                                                     int width)            \
{                                                                          \
    const int8_t *filter_x = ff_hevc_##PEL##_filters[mx - 1];              \
    const int8_t *filter_y = ff_hevc_##PEL##_filters[my - 1];              \
    int log2Wd = denom + 14 - 8;                                           \
                                                                           \
    hevc_hv_biwgt_##TAP##t_##WIDTH##w_msa(src, src_stride, src_16bit,      \
                                          MAX_PB_SIZE, dst, dst_stride,    \
                                          filter_x, filter_y, height,      \
                                          weight0, weight1, offset0,       \
                                          offset1, log2Wd);                \
}

BI_W_MC_HV(qpel, 4, 8);
BI_W_MC_HV(qpel, 8, 8);
BI_W_MC_HV(qpel, 12, 8);
BI_W_MC_HV(qpel, 16, 8);
BI_W_MC_HV(qpel, 24, 8);
BI_W_MC_HV(qpel, 32, 8);
BI_W_MC_HV(qpel, 48, 8);
BI_W_MC_HV(qpel, 64, 8);

BI_W_MC_HV(epel, 4, 4);
BI_W_MC_HV(epel, 8, 4);
BI_W_MC_HV(epel, 6, 4);
BI_W_MC_HV(epel, 12, 4);
BI_W_MC_HV(epel, 16, 4);
BI_W_MC_HV(epel, 24, 4);
BI_W_MC_HV(epel, 32, 4);
#undef BI_W_MC_HV