//===-- X86TargetTransformInfo.cpp - X86 specific TTI pass ----------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
/// This file implements a TargetTransformInfo analysis pass specific to the
/// X86 target machine. It uses the target's detailed information to provide
/// more precise answers to certain TTI queries, while letting the target
/// independent and default TTI implementations handle the rest.
///
//===----------------------------------------------------------------------===//
/// A note about the cost-model numbers used below: the numbers correspond to
/// some "generic" X86 CPU rather than to a concrete CPU model. Usually the
/// numbers correspond to the CPU where the feature first appeared. For
/// example, if we do Subtarget.hasSSE42() in the lookups below, the cost is
/// based on Nehalem as that was the first CPU to support that feature level
/// and thus most likely has the worst-case cost.
/// Some examples of other technologies/CPUs:
///   SSE 3   - Pentium4 / Athlon64
///   SSE 4.1 - Penryn
///   SSE 4.2 - Nehalem
///   AVX     - Sandy Bridge
///   AVX2    - Haswell
///   AVX-512 - Xeon Phi / Skylake
/// And some examples of instruction target dependent costs (latency):
///                    divss     sqrtss    rsqrtss
///   AMD K7           11-16     19        3
///   Piledriver       9-24      13-15     5
///   Jaguar           14        16        2
///   Pentium II,III   18        30        2
///   Nehalem          7-14      7-18      3
///   Haswell          10-13     11        5
/// TODO: Develop and implement the target-dependent cost model and specialize
/// the cost numbers for different cost-model targets such as throughput,
/// code size, latency and uop count.
//===----------------------------------------------------------------------===//
#include "X86TargetTransformInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/CodeGen/BasicTTIImpl.h"
#include "llvm/CodeGen/CostTable.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/IR/InstIterator.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/Support/Debug.h"

using namespace llvm;

#define DEBUG_TYPE "x86tti"

//===----------------------------------------------------------------------===//
//
// X86 cost model.
//
//===----------------------------------------------------------------------===//

TargetTransformInfo::PopcntSupportKind
X86TTIImpl::getPopcntSupport(unsigned TyWidth) {
  assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
  // TODO: Currently the __builtin_popcount() implementation using SSE3
  // instructions is inefficient. Once the problem is fixed, we should
  // call ST->hasSSE3() instead of ST->hasPOPCNT().
  return ST->hasPOPCNT() ? TTI::PSK_FastHardware : TTI::PSK_Software;
}
llvm::Optional<unsigned> X86TTIImpl::getCacheSize(
    TargetTransformInfo::CacheLevel Level) const {
  switch (Level) {
  case TargetTransformInfo::CacheLevel::L1D:
    //   - Penryn
    //   - Nehalem
    //   - Westmere
    //   - Sandy Bridge
    //   - Ivy Bridge
    //   - Haswell
    //   - Broadwell
    //   - Skylake
    //   - Kabylake
    return 32 * 1024; // 32 KByte
  case TargetTransformInfo::CacheLevel::L2D:
    //   - Penryn
    //   - Nehalem
    //   - Westmere
    //   - Sandy Bridge
    //   - Ivy Bridge
    //   - Haswell
    //   - Broadwell
    //   - Skylake
    //   - Kabylake
    return 256 * 1024; // 256 KByte
  }

  llvm_unreachable("Unknown TargetTransformInfo::CacheLevel");
}

llvm::Optional<unsigned> X86TTIImpl::getCacheAssociativity(
    TargetTransformInfo::CacheLevel Level) const {
  //   - Penryn
  //   - Nehalem
  //   - Westmere
  //   - Sandy Bridge
  //   - Ivy Bridge
  //   - Haswell
  //   - Broadwell
  //   - Skylake
  //   - Kabylake
  switch (Level) {
  case TargetTransformInfo::CacheLevel::L1D:
    LLVM_FALLTHROUGH;
  case TargetTransformInfo::CacheLevel::L2D:
    return 8;
  }

  llvm_unreachable("Unknown TargetTransformInfo::CacheLevel");
}
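
// For example: on a 64-bit target with AVX-512 this reports 32 vector
// registers (ZMM0-ZMM31), on a 64-bit SSE/AVX target it reports 16, and in
// 32-bit mode it reports 8 for either register class.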
unsigned X86TTIImpl::getNumberOfRegisters(unsigned ClassID) const {
  bool Vector = (ClassID == 1);
  if (Vector && !ST->hasSSE1())
    return 0;

  if (ST->is64Bit()) {
    if (Vector && ST->hasAVX512())
      return 32;
    return 16;
  }
  return 8;
}
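
// For example: with -mprefer-vector-width=128 on an AVX2 target,
// PreferVectorWidth is 128, so fixed-width vector registers are reported as
// 128 bits even though 256-bit YMM registers exist on the machine.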
TypeSize
X86TTIImpl::getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const {
  unsigned PreferVectorWidth = ST->getPreferVectorWidth();
  switch (K) {
  case TargetTransformInfo::RGK_Scalar:
    return TypeSize::getFixed(ST->is64Bit() ? 64 : 32);
  case TargetTransformInfo::RGK_FixedWidthVector:
    if (ST->hasAVX512() && PreferVectorWidth >= 512)
      return TypeSize::getFixed(512);
    if (ST->hasAVX() && PreferVectorWidth >= 256)
      return TypeSize::getFixed(256);
    if (ST->hasSSE1() && PreferVectorWidth >= 128)
      return TypeSize::getFixed(128);
    return TypeSize::getFixed(0);
  case TargetTransformInfo::RGK_ScalableVector:
    return TypeSize::getScalable(0);
  }

  llvm_unreachable("Unsupported register kind");
}

unsigned X86TTIImpl::getLoadStoreVecRegBitWidth(unsigned) const {
  return getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector)
      .getFixedSize();
}
unsigned X86TTIImpl::getMaxInterleaveFactor(unsigned VF) {
  // If the loop will not be vectorized, don't interleave the loop either.
  // Let the regular unroller handle it instead, which saves the runtime
  // overflow-check and memory-check cost.
  if (VF == 1)
    return 1;

  if (ST->isAtom())
    return 1;

  // Sandybridge and Haswell have multiple execution ports and pipelined
  // vector units.
  if (ST->hasAVX())
    return 4;

  return 2;
}
InstructionCost X86TTIImpl::getArithmeticInstrCost(
    unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
    TTI::OperandValueKind Op1Info, TTI::OperandValueKind Op2Info,
    TTI::OperandValueProperties Opd1PropInfo,
    TTI::OperandValueProperties Opd2PropInfo, ArrayRef<const Value *> Args,
    const Instruction *CxtI) {
  // TODO: Handle more cost kinds.
  if (CostKind != TTI::TCK_RecipThroughput)
    return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
                                         Op2Info, Opd1PropInfo,
                                         Opd2PropInfo, Args, CxtI);

  // vXi8 multiplications are always promoted to vXi16.
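  // For example, a mul <16 x i8> is costed here as: zext to <16 x i16> +
  // mul <16 x i16> + trunc back to <16 x i8>. (Illustrative breakdown of the
  // recursive costing below, not necessarily the exact instruction sequence.)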
  if (Opcode == Instruction::Mul && Ty->isVectorTy() &&
      Ty->getScalarSizeInBits() == 8) {
    Type *WideVecTy =
        VectorType::getExtendedElementVectorType(cast<VectorType>(Ty));
    return getCastInstrCost(Instruction::ZExt, WideVecTy, Ty,
                            TargetTransformInfo::CastContextHint::None,
                            CostKind) +
           getCastInstrCost(Instruction::Trunc, Ty, WideVecTy,
                            TargetTransformInfo::CastContextHint::None,
                            CostKind) +
           getArithmeticInstrCost(Opcode, WideVecTy, CostKind, Op1Info, Op2Info,
                                  Opd1PropInfo, Opd2PropInfo);
  }

  // Legalize the type.
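  // Note: LT.first is the number of legalized operations the type splits into
  // and LT.second is the legalized MVT; the per-entry table costs below are
  // therefore scaled by LT.first.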
  std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);

  int ISD = TLI->InstructionOpcodeToISD(Opcode);
  assert(ISD && "Invalid opcode");

  if (ISD == ISD::MUL && Args.size() == 2 && LT.second.isVector() &&
      LT.second.getScalarType() == MVT::i32) {
    // Check if the operands can be represented as a smaller datatype.
    bool Op1Signed = false, Op2Signed = false;
    unsigned Op1MinSize = BaseT::minRequiredElementSize(Args[0], Op1Signed);
    unsigned Op2MinSize = BaseT::minRequiredElementSize(Args[1], Op2Signed);
    unsigned OpMinSize = std::max(Op1MinSize, Op2MinSize);

    // If both are representable as i15 and at least one is constant,
    // zero-extended, or sign-extended from vXi16 (or less pre-SSE41) then we
    // can treat this as PMADDWD which has the same costs as a vXi16 multiply.
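    // For example (illustrative): a v4i32 multiply whose operands are both
    // sign-extended from v4i16 fits in 15 bits, so it is re-costed below as a
    // v8i16 multiply, matching what the backend can emit as PMADDWD.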
    if (OpMinSize <= 15 && !ST->isPMADDWDSlow()) {
      bool Op1Constant =
          isa<ConstantDataVector>(Args[0]) || isa<ConstantVector>(Args[0]);
      bool Op2Constant =
          isa<ConstantDataVector>(Args[1]) || isa<ConstantVector>(Args[1]);
      bool Op1Sext = isa<SExtInst>(Args[0]) &&
                     (Op1MinSize == 15 || (Op1MinSize < 15 && !ST->hasSSE41()));
      bool Op2Sext = isa<SExtInst>(Args[1]) &&
                     (Op2MinSize == 15 || (Op2MinSize < 15 && !ST->hasSSE41()));

      bool IsZeroExtended = !Op1Signed || !Op2Signed;
      bool IsConstant = Op1Constant || Op2Constant;
      bool IsSext = Op1Sext || Op2Sext;
      if (IsConstant || IsZeroExtended || IsSext)
        LT.second =
            MVT::getVectorVT(MVT::i16, 2 * LT.second.getVectorNumElements());
    }
  }

  // Vector multiply by pow2 will be simplified to shifts.
  if (ISD == ISD::MUL &&
      (Op2Info == TargetTransformInfo::OK_UniformConstantValue ||
       Op2Info == TargetTransformInfo::OK_NonUniformConstantValue) &&
      Opd2PropInfo == TargetTransformInfo::OP_PowerOf2)
    return getArithmeticInstrCost(Instruction::Shl, Ty, CostKind, Op1Info,
                                  Op2Info, TargetTransformInfo::OP_None,
                                  TargetTransformInfo::OP_None);

  // On X86, vector signed division by constant powers of two is normally
  // expanded to the sequence SRA + SRL + ADD + SRA.
  // The OperandValue properties may not be the same as that of the previous
  // operation; conservatively assume OP_None.
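  // Illustrative scalar analogue of that expansion, for X / 4:
  //   t = X >>s 31;  t = t >>u 30;  X = X + t;  result = X >>s 2
  // i.e. two arithmetic shifts, one logical shift and one add, which is why
  // the cost below is 2*AShr + LShr + Add.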
  if ((ISD == ISD::SDIV || ISD == ISD::SREM) &&
      (Op2Info == TargetTransformInfo::OK_UniformConstantValue ||
       Op2Info == TargetTransformInfo::OK_NonUniformConstantValue) &&
      Opd2PropInfo == TargetTransformInfo::OP_PowerOf2) {
    InstructionCost Cost =
        2 * getArithmeticInstrCost(Instruction::AShr, Ty, CostKind, Op1Info,
                                   Op2Info, TargetTransformInfo::OP_None,
                                   TargetTransformInfo::OP_None);
    Cost += getArithmeticInstrCost(Instruction::LShr, Ty, CostKind, Op1Info,
                                   Op2Info, TargetTransformInfo::OP_None,
                                   TargetTransformInfo::OP_None);
    Cost += getArithmeticInstrCost(Instruction::Add, Ty, CostKind, Op1Info,
                                   Op2Info, TargetTransformInfo::OP_None,
                                   TargetTransformInfo::OP_None);

    if (ISD == ISD::SREM) {
      // For SREM: (X % C) is the equivalent of (X - (X/C)*C)
      Cost += getArithmeticInstrCost(Instruction::Mul, Ty, CostKind, Op1Info,
                                     Op2Info);
      Cost += getArithmeticInstrCost(Instruction::Sub, Ty, CostKind, Op1Info,
                                     Op2Info);
    }

    return Cost;
  }

  // Vector unsigned division/remainder will be simplified to shifts/masks.
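  // For example: a udiv by 8 becomes a logical shift right by 3, and a urem
  // by 8 becomes an AND with 7, which is why only an LShr or an And cost is
  // charged below.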
  if ((ISD == ISD::UDIV || ISD == ISD::UREM) &&
      (Op2Info == TargetTransformInfo::OK_UniformConstantValue ||
       Op2Info == TargetTransformInfo::OK_NonUniformConstantValue) &&
      Opd2PropInfo == TargetTransformInfo::OP_PowerOf2) {
    if (ISD == ISD::UDIV)
      return getArithmeticInstrCost(Instruction::LShr, Ty, CostKind, Op1Info,
                                    Op2Info, TargetTransformInfo::OP_None,
                                    TargetTransformInfo::OP_None);
    // UREM
    return getArithmeticInstrCost(Instruction::And, Ty, CostKind, Op1Info,
                                  Op2Info, TargetTransformInfo::OP_None,
                                  TargetTransformInfo::OP_None);
  }

  static const CostTblEntry GLMCostTable[] = {
    { ISD::FDIV,  MVT::f32,   18 }, // divss
    { ISD::FDIV,  MVT::v4f32, 35 }, // divps
    { ISD::FDIV,  MVT::f64,   33 }, // divsd
    { ISD::FDIV,  MVT::v2f64, 65 }, // divpd
  };

  if (ST->useGLMDivSqrtCosts())
    if (const auto *Entry = CostTableLookup(GLMCostTable, ISD,
                                            LT.second))
      return LT.first * Entry->Cost;

  static const CostTblEntry SLMCostTable[] = {
    { ISD::MUL,   MVT::v4i32, 11 }, // pmulld
    { ISD::MUL,   MVT::v8i16,  2 }, // pmullw
    { ISD::FMUL,  MVT::f64,    2 }, // mulsd
    { ISD::FMUL,  MVT::v2f64,  4 }, // mulpd
    { ISD::FMUL,  MVT::v4f32,  2 }, // mulps
    { ISD::FDIV,  MVT::f32,   17 }, // divss
    { ISD::FDIV,  MVT::v4f32, 39 }, // divps
    { ISD::FDIV,  MVT::f64,   32 }, // divsd
    { ISD::FDIV,  MVT::v2f64, 69 }, // divpd
    { ISD::FADD,  MVT::v2f64,  2 }, // addpd
    { ISD::FSUB,  MVT::v2f64,  2 }, // subpd
    // v2i64/v4i64 mul is custom lowered as a series of long:
    // multiplies(3), shifts(3) and adds(2)
    // slm muldq throughput is 2 and addq throughput is 4
    // thus: 3*2 (muldq throughput) + 3*1 (shift throughput) +
    // 2*4 (addq throughput) = 17
    { ISD::MUL,   MVT::v2i64, 17 },
    // slm addq/subq throughput is 4
    { ISD::ADD,   MVT::v2i64,  4 },
    { ISD::SUB,   MVT::v2i64,  4 },
  };

  if (ST->useSLMArithCosts()) {
    if (Args.size() == 2 && ISD == ISD::MUL && LT.second == MVT::v4i32) {
      // Check if the operands can be shrunk into a smaller datatype.
      // TODO: Merge this into the generic vXi32 MUL patterns above.
      bool Op1Signed = false;
      unsigned Op1MinSize = BaseT::minRequiredElementSize(Args[0], Op1Signed);
      bool Op2Signed = false;
      unsigned Op2MinSize = BaseT::minRequiredElementSize(Args[1], Op2Signed);

      bool SignedMode = Op1Signed || Op2Signed;
      unsigned OpMinSize = std::max(Op1MinSize, Op2MinSize);

      if (OpMinSize <= 7)
        return LT.first * 3; // pmullw/sext
      if (!SignedMode && OpMinSize <= 8)
        return LT.first * 3; // pmullw/zext
      if (OpMinSize <= 15)
        return LT.first * 5; // pmullw/pmulhw/pshuf
      if (!SignedMode && OpMinSize <= 16)
        return LT.first * 5; // pmullw/pmulhw/pshuf
    }

    if (const auto *Entry = CostTableLookup(SLMCostTable, ISD,
                                            LT.second)) {
      return LT.first * Entry->Cost;
    }
  }
  static const CostTblEntry AVX512BWUniformConstCostTable[] = {
    { ISD::SHL,  MVT::v64i8,   2 }, // psllw + pand.
    { ISD::SRL,  MVT::v64i8,   2 }, // psrlw + pand.
    { ISD::SRA,  MVT::v64i8,   4 }, // psrlw, pand, pxor, psubb.
  };

  if (Op2Info == TargetTransformInfo::OK_UniformConstantValue &&
      ST->hasBWI()) {
    if (const auto *Entry = CostTableLookup(AVX512BWUniformConstCostTable, ISD,
                                            LT.second))
      return LT.first * Entry->Cost;
  }

  static const CostTblEntry AVX512UniformConstCostTable[] = {
    { ISD::SRA,  MVT::v2i64,   1 },
    { ISD::SRA,  MVT::v4i64,   1 },
    { ISD::SRA,  MVT::v8i64,   1 },
    { ISD::SHL,  MVT::v64i8,   4 }, // psllw + pand.
    { ISD::SRL,  MVT::v64i8,   4 }, // psrlw + pand.
    { ISD::SRA,  MVT::v64i8,   8 }, // psrlw, pand, pxor, psubb.
    { ISD::SDIV, MVT::v16i32,  6 }, // pmuludq sequence
    { ISD::SREM, MVT::v16i32,  8 }, // pmuludq+mul+sub sequence
    { ISD::UDIV, MVT::v16i32,  5 }, // pmuludq sequence
    { ISD::UREM, MVT::v16i32,  7 }, // pmuludq+mul+sub sequence
  };

  if (Op2Info == TargetTransformInfo::OK_UniformConstantValue &&
      ST->hasAVX512()) {
    if (const auto *Entry = CostTableLookup(AVX512UniformConstCostTable, ISD,
                                            LT.second))
      return LT.first * Entry->Cost;
  }

  static const CostTblEntry AVX2UniformConstCostTable[] = {
    { ISD::SHL,  MVT::v32i8,   2 }, // psllw + pand.
    { ISD::SRL,  MVT::v32i8,   2 }, // psrlw + pand.
    { ISD::SRA,  MVT::v32i8,   4 }, // psrlw, pand, pxor, psubb.
    { ISD::SRA,  MVT::v4i64,   4 }, // 2 x psrad + shuffle.
    { ISD::SDIV, MVT::v8i32,   6 }, // pmuludq sequence
    { ISD::SREM, MVT::v8i32,   8 }, // pmuludq+mul+sub sequence
    { ISD::UDIV, MVT::v8i32,   5 }, // pmuludq sequence
    { ISD::UREM, MVT::v8i32,   7 }, // pmuludq+mul+sub sequence
  };

  if (Op2Info == TargetTransformInfo::OK_UniformConstantValue &&
      ST->hasAVX2()) {
    if (const auto *Entry = CostTableLookup(AVX2UniformConstCostTable, ISD,
                                            LT.second))
      return LT.first * Entry->Cost;
  }

  static const CostTblEntry SSE2UniformConstCostTable[] = {
    { ISD::SHL,  MVT::v16i8,     2 }, // psllw + pand.
    { ISD::SRL,  MVT::v16i8,     2 }, // psrlw + pand.
    { ISD::SRA,  MVT::v16i8,     4 }, // psrlw, pand, pxor, psubb.
    { ISD::SHL,  MVT::v32i8,   4+2 }, // 2*(psllw + pand) + split.
    { ISD::SRL,  MVT::v32i8,   4+2 }, // 2*(psrlw + pand) + split.
    { ISD::SRA,  MVT::v32i8,   8+2 }, // 2*(psrlw, pand, pxor, psubb) + split.
    { ISD::SDIV, MVT::v8i32,  12+2 }, // 2*pmuludq sequence + split.
    { ISD::SREM, MVT::v8i32,  16+2 }, // 2*pmuludq+mul+sub sequence + split.
    { ISD::SDIV, MVT::v4i32,     6 }, // pmuludq sequence
    { ISD::SREM, MVT::v4i32,     8 }, // pmuludq+mul+sub sequence
    { ISD::UDIV, MVT::v8i32,  10+2 }, // 2*pmuludq sequence + split.
    { ISD::UREM, MVT::v8i32,  14+2 }, // 2*pmuludq+mul+sub sequence + split.
    { ISD::UDIV, MVT::v4i32,     5 }, // pmuludq sequence
    { ISD::UREM, MVT::v4i32,     7 }, // pmuludq+mul+sub sequence
  };

  // XOP has faster vXi8 shifts.
  if (Op2Info == TargetTransformInfo::OK_UniformConstantValue &&
      ST->hasSSE2() && !ST->hasXOP()) {
    if (const auto *Entry =
            CostTableLookup(SSE2UniformConstCostTable, ISD, LT.second))
      return LT.first * Entry->Cost;
  }

  static const CostTblEntry AVX512BWConstCostTable[] = {
    { ISD::SDIV, MVT::v64i8,  14 }, // 2*ext+2*pmulhw sequence
    { ISD::SREM, MVT::v64i8,  16 }, // 2*ext+2*pmulhw+mul+sub sequence
    { ISD::UDIV, MVT::v64i8,  14 }, // 2*ext+2*pmulhw sequence
    { ISD::UREM, MVT::v64i8,  16 }, // 2*ext+2*pmulhw+mul+sub sequence
    { ISD::SDIV, MVT::v32i16,  6 }, // vpmulhw sequence
    { ISD::SREM, MVT::v32i16,  8 }, // vpmulhw+mul+sub sequence
    { ISD::UDIV, MVT::v32i16,  6 }, // vpmulhuw sequence
    { ISD::UREM, MVT::v32i16,  8 }, // vpmulhuw+mul+sub sequence
  };

  if ((Op2Info == TargetTransformInfo::OK_UniformConstantValue ||
       Op2Info == TargetTransformInfo::OK_NonUniformConstantValue) &&
      ST->hasBWI()) {
    if (const auto *Entry =
            CostTableLookup(AVX512BWConstCostTable, ISD, LT.second))
      return LT.first * Entry->Cost;
  }

  static const CostTblEntry AVX512ConstCostTable[] = {
    { ISD::SDIV, MVT::v16i32, 15 }, // vpmuldq sequence
    { ISD::SREM, MVT::v16i32, 17 }, // vpmuldq+mul+sub sequence
    { ISD::UDIV, MVT::v16i32, 15 }, // vpmuludq sequence
    { ISD::UREM, MVT::v16i32, 17 }, // vpmuludq+mul+sub sequence
    { ISD::SDIV, MVT::v64i8,  28 }, // 4*ext+4*pmulhw sequence
    { ISD::SREM, MVT::v64i8,  32 }, // 4*ext+4*pmulhw+mul+sub sequence
    { ISD::UDIV, MVT::v64i8,  28 }, // 4*ext+4*pmulhw sequence
    { ISD::UREM, MVT::v64i8,  32 }, // 4*ext+4*pmulhw+mul+sub sequence
    { ISD::SDIV, MVT::v32i16, 12 }, // 2*vpmulhw sequence
    { ISD::SREM, MVT::v32i16, 16 }, // 2*vpmulhw+mul+sub sequence
    { ISD::UDIV, MVT::v32i16, 12 }, // 2*vpmulhuw sequence
    { ISD::UREM, MVT::v32i16, 16 }, // 2*vpmulhuw+mul+sub sequence
  };

  if ((Op2Info == TargetTransformInfo::OK_UniformConstantValue ||
       Op2Info == TargetTransformInfo::OK_NonUniformConstantValue) &&
      ST->hasAVX512()) {
    if (const auto *Entry =
            CostTableLookup(AVX512ConstCostTable, ISD, LT.second))
      return LT.first * Entry->Cost;
  }

  static const CostTblEntry AVX2ConstCostTable[] = {
    { ISD::SDIV, MVT::v32i8,  14 }, // 2*ext+2*pmulhw sequence
    { ISD::SREM, MVT::v32i8,  16 }, // 2*ext+2*pmulhw+mul+sub sequence
    { ISD::UDIV, MVT::v32i8,  14 }, // 2*ext+2*pmulhw sequence
    { ISD::UREM, MVT::v32i8,  16 }, // 2*ext+2*pmulhw+mul+sub sequence
    { ISD::SDIV, MVT::v16i16,  6 }, // vpmulhw sequence
    { ISD::SREM, MVT::v16i16,  8 }, // vpmulhw+mul+sub sequence
    { ISD::UDIV, MVT::v16i16,  6 }, // vpmulhuw sequence
    { ISD::UREM, MVT::v16i16,  8 }, // vpmulhuw+mul+sub sequence
    { ISD::SDIV, MVT::v8i32,  15 }, // vpmuldq sequence
    { ISD::SREM, MVT::v8i32,  19 }, // vpmuldq+mul+sub sequence
    { ISD::UDIV, MVT::v8i32,  15 }, // vpmuludq sequence
    { ISD::UREM, MVT::v8i32,  19 }, // vpmuludq+mul+sub sequence
  };

  if ((Op2Info == TargetTransformInfo::OK_UniformConstantValue ||
       Op2Info == TargetTransformInfo::OK_NonUniformConstantValue) &&
      ST->hasAVX2()) {
    if (const auto *Entry = CostTableLookup(AVX2ConstCostTable, ISD, LT.second))
      return LT.first * Entry->Cost;
  }

  static const CostTblEntry SSE2ConstCostTable[] = {
    { ISD::SDIV, MVT::v32i8,  28+2 }, // 4*ext+4*pmulhw sequence + split.
    { ISD::SREM, MVT::v32i8,  32+2 }, // 4*ext+4*pmulhw+mul+sub sequence + split.
    { ISD::SDIV, MVT::v16i8,    14 }, // 2*ext+2*pmulhw sequence
    { ISD::SREM, MVT::v16i8,    16 }, // 2*ext+2*pmulhw+mul+sub sequence
    { ISD::UDIV, MVT::v32i8,  28+2 }, // 4*ext+4*pmulhw sequence + split.
    { ISD::UREM, MVT::v32i8,  32+2 }, // 4*ext+4*pmulhw+mul+sub sequence + split.
    { ISD::UDIV, MVT::v16i8,    14 }, // 2*ext+2*pmulhw sequence
    { ISD::UREM, MVT::v16i8,    16 }, // 2*ext+2*pmulhw+mul+sub sequence
    { ISD::SDIV, MVT::v16i16, 12+2 }, // 2*pmulhw sequence + split.
    { ISD::SREM, MVT::v16i16, 16+2 }, // 2*pmulhw+mul+sub sequence + split.
    { ISD::SDIV, MVT::v8i16,     6 }, // pmulhw sequence
    { ISD::SREM, MVT::v8i16,     8 }, // pmulhw+mul+sub sequence
    { ISD::UDIV, MVT::v16i16, 12+2 }, // 2*pmulhuw sequence + split.
    { ISD::UREM, MVT::v16i16, 16+2 }, // 2*pmulhuw+mul+sub sequence + split.
    { ISD::UDIV, MVT::v8i16,     6 }, // pmulhuw sequence
    { ISD::UREM, MVT::v8i16,     8 }, // pmulhuw+mul+sub sequence
    { ISD::SDIV, MVT::v8i32,  38+2 }, // 2*pmuludq sequence + split.
    { ISD::SREM, MVT::v8i32,  48+2 }, // 2*pmuludq+mul+sub sequence + split.
    { ISD::SDIV, MVT::v4i32,    19 }, // pmuludq sequence
    { ISD::SREM, MVT::v4i32,    24 }, // pmuludq+mul+sub sequence
    { ISD::UDIV, MVT::v8i32,  30+2 }, // 2*pmuludq sequence + split.
    { ISD::UREM, MVT::v8i32,  40+2 }, // 2*pmuludq+mul+sub sequence + split.
    { ISD::UDIV, MVT::v4i32,    15 }, // pmuludq sequence
    { ISD::UREM, MVT::v4i32,    20 }, // pmuludq+mul+sub sequence
  };

  if ((Op2Info == TargetTransformInfo::OK_UniformConstantValue ||
       Op2Info == TargetTransformInfo::OK_NonUniformConstantValue) &&
      ST->hasSSE2()) {
    // pmuldq sequence.
    if (ISD == ISD::SDIV && LT.second == MVT::v8i32 && ST->hasAVX())
      return LT.first * 32;
    if (ISD == ISD::SREM && LT.second == MVT::v8i32 && ST->hasAVX())
      return LT.first * 38;
    if (ISD == ISD::SDIV && LT.second == MVT::v4i32 && ST->hasSSE41())
      return LT.first * 15;
    if (ISD == ISD::SREM && LT.second == MVT::v4i32 && ST->hasSSE41())
      return LT.first * 20;

    if (const auto *Entry = CostTableLookup(SSE2ConstCostTable, ISD, LT.second))
      return LT.first * Entry->Cost;
  }
  static const CostTblEntry AVX512BWShiftCostTable[] = {
    { ISD::SHL,  MVT::v16i8,   4 }, // extend/vpsllvw/pack sequence.
    { ISD::SRL,  MVT::v16i8,   4 }, // extend/vpsrlvw/pack sequence.
    { ISD::SRA,  MVT::v16i8,   4 }, // extend/vpsravw/pack sequence.
    { ISD::SHL,  MVT::v32i8,   4 }, // extend/vpsllvw/pack sequence.
    { ISD::SRL,  MVT::v32i8,   4 }, // extend/vpsrlvw/pack sequence.
    { ISD::SRA,  MVT::v32i8,   6 }, // extend/vpsravw/pack sequence.
    { ISD::SHL,  MVT::v64i8,   6 }, // extend/vpsllvw/pack sequence.
    { ISD::SRL,  MVT::v64i8,   7 }, // extend/vpsrlvw/pack sequence.
    { ISD::SRA,  MVT::v64i8,  15 }, // extend/vpsravw/pack sequence.
    { ISD::SHL,  MVT::v8i16,   1 }, // vpsllvw
    { ISD::SRL,  MVT::v8i16,   1 }, // vpsrlvw
    { ISD::SRA,  MVT::v8i16,   1 }, // vpsravw
    { ISD::SHL,  MVT::v16i16,  1 }, // vpsllvw
    { ISD::SRL,  MVT::v16i16,  1 }, // vpsrlvw
    { ISD::SRA,  MVT::v16i16,  1 }, // vpsravw
    { ISD::SHL,  MVT::v32i16,  1 }, // vpsllvw
    { ISD::SRL,  MVT::v32i16,  1 }, // vpsrlvw
    { ISD::SRA,  MVT::v32i16,  1 }, // vpsravw
  };

  if (ST->hasBWI())
    if (const auto *Entry =
            CostTableLookup(AVX512BWShiftCostTable, ISD, LT.second))
      return LT.first * Entry->Cost;

  static const CostTblEntry AVX2UniformCostTable[] = {
    // Uniform splats are cheaper for the following instructions.
    { ISD::SHL,  MVT::v16i16, 1 }, // psllw.
    { ISD::SRL,  MVT::v16i16, 1 }, // psrlw.
    { ISD::SRA,  MVT::v16i16, 1 }, // psraw.
    { ISD::SHL,  MVT::v32i16, 2 }, // 2*psllw.
    { ISD::SRL,  MVT::v32i16, 2 }, // 2*psrlw.
    { ISD::SRA,  MVT::v32i16, 2 }, // 2*psraw.
    { ISD::SHL,  MVT::v8i32,  1 }, // pslld
    { ISD::SRL,  MVT::v8i32,  1 }, // psrld
    { ISD::SRA,  MVT::v8i32,  1 }, // psrad
    { ISD::SHL,  MVT::v4i64,  1 }, // psllq
    { ISD::SRL,  MVT::v4i64,  1 }, // psrlq
  };

  if (ST->hasAVX2() &&
      ((Op2Info == TargetTransformInfo::OK_UniformConstantValue) ||
       (Op2Info == TargetTransformInfo::OK_UniformValue))) {
    if (const auto *Entry =
            CostTableLookup(AVX2UniformCostTable, ISD, LT.second))
      return LT.first * Entry->Cost;
  }

  static const CostTblEntry SSE2UniformCostTable[] = {
    // Uniform splats are cheaper for the following instructions.
    { ISD::SHL,  MVT::v8i16, 1 }, // psllw.
    { ISD::SHL,  MVT::v4i32, 1 }, // pslld
    { ISD::SHL,  MVT::v2i64, 1 }, // psllq.
    { ISD::SRL,  MVT::v8i16, 1 }, // psrlw.
    { ISD::SRL,  MVT::v4i32, 1 }, // psrld.
    { ISD::SRL,  MVT::v2i64, 1 }, // psrlq.
    { ISD::SRA,  MVT::v8i16, 1 }, // psraw.
    { ISD::SRA,  MVT::v4i32, 1 }, // psrad.
  };

  if (ST->hasSSE2() &&
      ((Op2Info == TargetTransformInfo::OK_UniformConstantValue) ||
       (Op2Info == TargetTransformInfo::OK_UniformValue))) {
    if (const auto *Entry =
            CostTableLookup(SSE2UniformCostTable, ISD, LT.second))
      return LT.first * Entry->Cost;
  }
  static const CostTblEntry AVX512DQCostTable[] = {
    { ISD::MUL,  MVT::v2i64, 2 }, // pmullq
    { ISD::MUL,  MVT::v4i64, 2 }, // pmullq
    { ISD::MUL,  MVT::v8i64, 2 }  // pmullq
  };

  // Look for AVX512DQ lowering tricks for custom cases.
  if (ST->hasDQI())
    if (const auto *Entry = CostTableLookup(AVX512DQCostTable, ISD, LT.second))
      return LT.first * Entry->Cost;

  static const CostTblEntry AVX512BWCostTable[] = {
    { ISD::SHL,  MVT::v64i8,  11 }, // vpblendvb sequence.
    { ISD::SRL,  MVT::v64i8,  11 }, // vpblendvb sequence.
    { ISD::SRA,  MVT::v64i8,  24 }, // vpblendvb sequence.
  };

  // Look for AVX512BW lowering tricks for custom cases.
  if (ST->hasBWI())
    if (const auto *Entry = CostTableLookup(AVX512BWCostTable, ISD, LT.second))
      return LT.first * Entry->Cost;

  static const CostTblEntry AVX512CostTable[] = {
    { ISD::SHL,   MVT::v4i32,   1 },
    { ISD::SRL,   MVT::v4i32,   1 },
    { ISD::SRA,   MVT::v4i32,   1 },
    { ISD::SHL,   MVT::v8i32,   1 },
    { ISD::SRL,   MVT::v8i32,   1 },
    { ISD::SRA,   MVT::v8i32,   1 },
    { ISD::SHL,   MVT::v16i32,  1 },
    { ISD::SRL,   MVT::v16i32,  1 },
    { ISD::SRA,   MVT::v16i32,  1 },
    { ISD::SHL,   MVT::v2i64,   1 },
    { ISD::SRL,   MVT::v2i64,   1 },
    { ISD::SHL,   MVT::v4i64,   1 },
    { ISD::SRL,   MVT::v4i64,   1 },
    { ISD::SHL,   MVT::v8i64,   1 },
    { ISD::SRL,   MVT::v8i64,   1 },
    { ISD::SRA,   MVT::v2i64,   1 },
    { ISD::SRA,   MVT::v4i64,   1 },
    { ISD::SRA,   MVT::v8i64,   1 },
    { ISD::MUL,   MVT::v16i32,  1 }, // pmulld (Skylake from agner.org)
    { ISD::MUL,   MVT::v8i32,   1 }, // pmulld (Skylake from agner.org)
    { ISD::MUL,   MVT::v4i32,   1 }, // pmulld (Skylake from agner.org)
    { ISD::MUL,   MVT::v8i64,   6 }, // 3*pmuludq/3*shift/2*add
    { ISD::MUL,   MVT::i64,     1 }, // Skylake from http://www.agner.org/
    { ISD::FNEG,  MVT::v8f64,   1 }, // Skylake from http://www.agner.org/
    { ISD::FADD,  MVT::v8f64,   1 }, // Skylake from http://www.agner.org/
    { ISD::FSUB,  MVT::v8f64,   1 }, // Skylake from http://www.agner.org/
    { ISD::FMUL,  MVT::v8f64,   1 }, // Skylake from http://www.agner.org/
    { ISD::FDIV,  MVT::f64,     4 }, // Skylake from http://www.agner.org/
    { ISD::FDIV,  MVT::v2f64,   4 }, // Skylake from http://www.agner.org/
    { ISD::FDIV,  MVT::v4f64,   8 }, // Skylake from http://www.agner.org/
    { ISD::FDIV,  MVT::v8f64,  16 }, // Skylake from http://www.agner.org/
    { ISD::FNEG,  MVT::v16f32,  1 }, // Skylake from http://www.agner.org/
    { ISD::FADD,  MVT::v16f32,  1 }, // Skylake from http://www.agner.org/
    { ISD::FSUB,  MVT::v16f32,  1 }, // Skylake from http://www.agner.org/
    { ISD::FMUL,  MVT::v16f32,  1 }, // Skylake from http://www.agner.org/
    { ISD::FDIV,  MVT::f32,     3 }, // Skylake from http://www.agner.org/
    { ISD::FDIV,  MVT::v4f32,   3 }, // Skylake from http://www.agner.org/
    { ISD::FDIV,  MVT::v8f32,   5 }, // Skylake from http://www.agner.org/
    { ISD::FDIV,  MVT::v16f32, 10 }, // Skylake from http://www.agner.org/
  };

  if (ST->hasAVX512())
    if (const auto *Entry = CostTableLookup(AVX512CostTable, ISD, LT.second))
      return LT.first * Entry->Cost;
  static const CostTblEntry AVX2ShiftCostTable[] = {
    // Shifts on vXi64/vXi32 are legal on AVX2 even though we mark them as
    // custom so that we can detect the cases where the shift amount is a
    // scalar (uniform) value.
    { ISD::SHL,  MVT::v4i32,  2 }, // vpsllvd (Haswell from agner.org)
    { ISD::SRL,  MVT::v4i32,  2 }, // vpsrlvd (Haswell from agner.org)
    { ISD::SRA,  MVT::v4i32,  2 }, // vpsravd (Haswell from agner.org)
    { ISD::SHL,  MVT::v8i32,  2 }, // vpsllvd (Haswell from agner.org)
    { ISD::SRL,  MVT::v8i32,  2 }, // vpsrlvd (Haswell from agner.org)
    { ISD::SRA,  MVT::v8i32,  2 }, // vpsravd (Haswell from agner.org)
    { ISD::SHL,  MVT::v2i64,  1 }, // vpsllvq (Haswell from agner.org)
    { ISD::SRL,  MVT::v2i64,  1 }, // vpsrlvq (Haswell from agner.org)
    { ISD::SHL,  MVT::v4i64,  1 }, // vpsllvq (Haswell from agner.org)
    { ISD::SRL,  MVT::v4i64,  1 }, // vpsrlvq (Haswell from agner.org)
  };

  if (ST->hasAVX512()) {
    if (ISD == ISD::SHL && LT.second == MVT::v32i16 &&
        (Op2Info == TargetTransformInfo::OK_UniformConstantValue ||
         Op2Info == TargetTransformInfo::OK_NonUniformConstantValue))
      // On AVX512, a packed v32i16 shift left by a constant build_vector
      // is lowered into a vector multiply (vpmullw).
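      // For example (illustrative): shl <32 x i16> %x, <i16 1, i16 2, ...>
      // is equivalent to mul %x, <i16 2, i16 4, ...>, so charging the MUL
      // cost here models the vpmullw the backend emits.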
      return getArithmeticInstrCost(Instruction::Mul, Ty, CostKind,
                                    Op1Info, Op2Info,
                                    TargetTransformInfo::OP_None,
                                    TargetTransformInfo::OP_None);
  }

  // Look for AVX2 lowering tricks (XOP is always better at v4i32 shifts).
  if (ST->hasAVX2() && !(ST->hasXOP() && LT.second == MVT::v4i32)) {
    if (ISD == ISD::SHL && LT.second == MVT::v16i16 &&
        (Op2Info == TargetTransformInfo::OK_UniformConstantValue ||
         Op2Info == TargetTransformInfo::OK_NonUniformConstantValue))
      // On AVX2, a packed v16i16 shift left by a constant build_vector
      // is lowered into a vector multiply (vpmullw).
      return getArithmeticInstrCost(Instruction::Mul, Ty, CostKind,
                                    Op1Info, Op2Info,
                                    TargetTransformInfo::OP_None,
                                    TargetTransformInfo::OP_None);

    if (const auto *Entry = CostTableLookup(AVX2ShiftCostTable, ISD, LT.second))
      return LT.first * Entry->Cost;
  }

  static const CostTblEntry XOPShiftCostTable[] = {
    // 128bit shifts take 1cy, but right shifts require negation beforehand.
    { ISD::SHL,  MVT::v16i8,    1 },
    { ISD::SRL,  MVT::v16i8,    2 },
    { ISD::SRA,  MVT::v16i8,    2 },
    { ISD::SHL,  MVT::v8i16,    1 },
    { ISD::SRL,  MVT::v8i16,    2 },
    { ISD::SRA,  MVT::v8i16,    2 },
    { ISD::SHL,  MVT::v4i32,    1 },
    { ISD::SRL,  MVT::v4i32,    2 },
    { ISD::SRA,  MVT::v4i32,    2 },
    { ISD::SHL,  MVT::v2i64,    1 },
    { ISD::SRL,  MVT::v2i64,    2 },
    { ISD::SRA,  MVT::v2i64,    2 },
    // 256bit shifts require splitting if AVX2 didn't catch them above.
    { ISD::SHL,  MVT::v32i8,  2+2 },
    { ISD::SRL,  MVT::v32i8,  4+2 },
    { ISD::SRA,  MVT::v32i8,  4+2 },
    { ISD::SHL,  MVT::v16i16, 2+2 },
    { ISD::SRL,  MVT::v16i16, 4+2 },
    { ISD::SRA,  MVT::v16i16, 4+2 },
    { ISD::SHL,  MVT::v8i32,  2+2 },
    { ISD::SRL,  MVT::v8i32,  4+2 },
    { ISD::SRA,  MVT::v8i32,  4+2 },
    { ISD::SHL,  MVT::v4i64,  2+2 },
    { ISD::SRL,  MVT::v4i64,  4+2 },
    { ISD::SRA,  MVT::v4i64,  4+2 },
  };

  // Look for XOP lowering tricks.
  if (ST->hasXOP()) {
    // If the right shift is constant then we'll fold the negation so
    // it's as cheap as a left shift.
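    // (XOP's per-element shifts, e.g. vpshab/vpshlb, shift left for positive
    // amounts and right for negative amounts, so a constant right-shift count
    // can be negated at compile time rather than needing an extra negation
    // instruction.)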
  681. int ShiftISD = ISD;
  682. if ((ShiftISD == ISD::SRL || ShiftISD == ISD::SRA) &&
  683. (Op2Info == TargetTransformInfo::OK_UniformConstantValue ||
  684. Op2Info == TargetTransformInfo::OK_NonUniformConstantValue))
  685. ShiftISD = ISD::SHL;
  686. if (const auto *Entry =
  687. CostTableLookup(XOPShiftCostTable, ShiftISD, LT.second))
  688. return LT.first * Entry->Cost;
  689. }
  690. static const CostTblEntry SSE2UniformShiftCostTable[] = {
  691. // Uniform splats are cheaper for the following instructions.
  692. { ISD::SHL, MVT::v16i16, 2+2 }, // 2*psllw + split.
  693. { ISD::SHL, MVT::v8i32, 2+2 }, // 2*pslld + split.
  694. { ISD::SHL, MVT::v4i64, 2+2 }, // 2*psllq + split.
  695. { ISD::SRL, MVT::v16i16, 2+2 }, // 2*psrlw + split.
  696. { ISD::SRL, MVT::v8i32, 2+2 }, // 2*psrld + split.
  697. { ISD::SRL, MVT::v4i64, 2+2 }, // 2*psrlq + split.
  698. { ISD::SRA, MVT::v16i16, 2+2 }, // 2*psraw + split.
  699. { ISD::SRA, MVT::v8i32, 2+2 }, // 2*psrad + split.
  700. { ISD::SRA, MVT::v2i64, 4 }, // 2*psrad + shuffle.
  701. { ISD::SRA, MVT::v4i64, 8+2 }, // 2*(2*psrad + shuffle) + split.
  702. };
  703. if (ST->hasSSE2() &&
  704. ((Op2Info == TargetTransformInfo::OK_UniformConstantValue) ||
  705. (Op2Info == TargetTransformInfo::OK_UniformValue))) {
  706. // Handle AVX2 uniform v4i64 ISD::SRA, it's not worth a table.
  707. if (ISD == ISD::SRA && LT.second == MVT::v4i64 && ST->hasAVX2())
  708. return LT.first * 4; // 2*psrad + shuffle.
  709. if (const auto *Entry =
  710. CostTableLookup(SSE2UniformShiftCostTable, ISD, LT.second))
  711. return LT.first * Entry->Cost;
  712. }
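// Illustrative trace of the uniform-splat path: on AVX1 (no AVX2, no XOP), a
// uniform "shl <8 x i32> %x, splat(5)" keeps LT.second == v8i32, so the table
// above returns 2*pslld + split == 4.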
  713. if (ISD == ISD::SHL &&
  714. Op2Info == TargetTransformInfo::OK_NonUniformConstantValue) {
  715. MVT VT = LT.second;
// A vector shift left by a non-uniform constant can be lowered
// into a vector multiply.
  718. if (((VT == MVT::v8i16 || VT == MVT::v4i32) && ST->hasSSE2()) ||
  719. ((VT == MVT::v16i16 || VT == MVT::v8i32) && ST->hasAVX()))
  720. ISD = ISD::MUL;
  721. }
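// E.g. "shl <4 x i32> %x, <i32 1, i32 2, i32 3, i32 4>" is costed below as a
// v4i32 multiply: 2 with SSE4.1 (pmulld) or 6 with plain SSE2 (pmuludq
// sequence). Illustrative only; the real numbers come from the tables below.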
  722. static const CostTblEntry AVX2CostTable[] = {
  723. { ISD::SHL, MVT::v16i8, 6 }, // vpblendvb sequence.
  724. { ISD::SHL, MVT::v32i8, 6 }, // vpblendvb sequence.
  725. { ISD::SHL, MVT::v64i8, 12 }, // 2*vpblendvb sequence.
  726. { ISD::SHL, MVT::v8i16, 5 }, // extend/vpsrlvd/pack sequence.
  727. { ISD::SHL, MVT::v16i16, 7 }, // extend/vpsrlvd/pack sequence.
  728. { ISD::SHL, MVT::v32i16, 14 }, // 2*extend/vpsrlvd/pack sequence.
  729. { ISD::SRL, MVT::v16i8, 6 }, // vpblendvb sequence.
  730. { ISD::SRL, MVT::v32i8, 6 }, // vpblendvb sequence.
  731. { ISD::SRL, MVT::v64i8, 12 }, // 2*vpblendvb sequence.
  732. { ISD::SRL, MVT::v8i16, 5 }, // extend/vpsrlvd/pack sequence.
  733. { ISD::SRL, MVT::v16i16, 7 }, // extend/vpsrlvd/pack sequence.
  734. { ISD::SRL, MVT::v32i16, 14 }, // 2*extend/vpsrlvd/pack sequence.
  735. { ISD::SRA, MVT::v16i8, 17 }, // vpblendvb sequence.
  736. { ISD::SRA, MVT::v32i8, 17 }, // vpblendvb sequence.
  737. { ISD::SRA, MVT::v64i8, 34 }, // 2*vpblendvb sequence.
  738. { ISD::SRA, MVT::v8i16, 5 }, // extend/vpsravd/pack sequence.
  739. { ISD::SRA, MVT::v16i16, 7 }, // extend/vpsravd/pack sequence.
  740. { ISD::SRA, MVT::v32i16, 14 }, // 2*extend/vpsravd/pack sequence.
  741. { ISD::SRA, MVT::v2i64, 2 }, // srl/xor/sub sequence.
  742. { ISD::SRA, MVT::v4i64, 2 }, // srl/xor/sub sequence.
  743. { ISD::SUB, MVT::v32i8, 1 }, // psubb
  744. { ISD::ADD, MVT::v32i8, 1 }, // paddb
  745. { ISD::SUB, MVT::v16i16, 1 }, // psubw
  746. { ISD::ADD, MVT::v16i16, 1 }, // paddw
  747. { ISD::SUB, MVT::v8i32, 1 }, // psubd
  748. { ISD::ADD, MVT::v8i32, 1 }, // paddd
  749. { ISD::SUB, MVT::v4i64, 1 }, // psubq
  750. { ISD::ADD, MVT::v4i64, 1 }, // paddq
  751. { ISD::MUL, MVT::v16i16, 1 }, // pmullw
  752. { ISD::MUL, MVT::v8i32, 2 }, // pmulld (Haswell from agner.org)
  753. { ISD::MUL, MVT::v4i64, 6 }, // 3*pmuludq/3*shift/2*add
  754. { ISD::FNEG, MVT::v4f64, 1 }, // Haswell from http://www.agner.org/
  755. { ISD::FNEG, MVT::v8f32, 1 }, // Haswell from http://www.agner.org/
  756. { ISD::FADD, MVT::v4f64, 1 }, // Haswell from http://www.agner.org/
  757. { ISD::FADD, MVT::v8f32, 1 }, // Haswell from http://www.agner.org/
  758. { ISD::FSUB, MVT::v4f64, 1 }, // Haswell from http://www.agner.org/
  759. { ISD::FSUB, MVT::v8f32, 1 }, // Haswell from http://www.agner.org/
  760. { ISD::FMUL, MVT::f64, 1 }, // Haswell from http://www.agner.org/
  761. { ISD::FMUL, MVT::v2f64, 1 }, // Haswell from http://www.agner.org/
  762. { ISD::FMUL, MVT::v4f64, 1 }, // Haswell from http://www.agner.org/
  763. { ISD::FMUL, MVT::v8f32, 1 }, // Haswell from http://www.agner.org/
  764. { ISD::FDIV, MVT::f32, 7 }, // Haswell from http://www.agner.org/
  765. { ISD::FDIV, MVT::v4f32, 7 }, // Haswell from http://www.agner.org/
  766. { ISD::FDIV, MVT::v8f32, 14 }, // Haswell from http://www.agner.org/
  767. { ISD::FDIV, MVT::f64, 14 }, // Haswell from http://www.agner.org/
  768. { ISD::FDIV, MVT::v2f64, 14 }, // Haswell from http://www.agner.org/
  769. { ISD::FDIV, MVT::v4f64, 28 }, // Haswell from http://www.agner.org/
  770. };
  771. // Look for AVX2 lowering tricks for custom cases.
  772. if (ST->hasAVX2())
  773. if (const auto *Entry = CostTableLookup(AVX2CostTable, ISD, LT.second))
  774. return LT.first * Entry->Cost;
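// Worked example for the AVX2 table above: an fdiv on <16 x float> legalizes
// to two v8f32 halves (LT.first == 2), so the reported cost is 2 * 14 == 28.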
  775. static const CostTblEntry AVX1CostTable[] = {
  776. // We don't have to scalarize unsupported ops. We can issue two half-sized
  777. // operations and we only need to extract the upper YMM half.
  778. // Two ops + 1 extract + 1 insert = 4.
  779. { ISD::MUL, MVT::v16i16, 4 },
  780. { ISD::MUL, MVT::v8i32, 5 }, // BTVER2 from http://www.agner.org/
  781. { ISD::MUL, MVT::v4i64, 12 },
  782. { ISD::SUB, MVT::v32i8, 4 },
  783. { ISD::ADD, MVT::v32i8, 4 },
  784. { ISD::SUB, MVT::v16i16, 4 },
  785. { ISD::ADD, MVT::v16i16, 4 },
  786. { ISD::SUB, MVT::v8i32, 4 },
  787. { ISD::ADD, MVT::v8i32, 4 },
  788. { ISD::SUB, MVT::v4i64, 4 },
  789. { ISD::ADD, MVT::v4i64, 4 },
  790. { ISD::SHL, MVT::v32i8, 22 }, // pblendvb sequence + split.
  791. { ISD::SHL, MVT::v8i16, 6 }, // pblendvb sequence.
  792. { ISD::SHL, MVT::v16i16, 13 }, // pblendvb sequence + split.
  793. { ISD::SHL, MVT::v4i32, 3 }, // pslld/paddd/cvttps2dq/pmulld
  794. { ISD::SHL, MVT::v8i32, 9 }, // pslld/paddd/cvttps2dq/pmulld + split
  795. { ISD::SHL, MVT::v2i64, 2 }, // Shift each lane + blend.
  796. { ISD::SHL, MVT::v4i64, 6 }, // Shift each lane + blend + split.
  797. { ISD::SRL, MVT::v32i8, 23 }, // pblendvb sequence + split.
  798. { ISD::SRL, MVT::v16i16, 28 }, // pblendvb sequence + split.
  799. { ISD::SRL, MVT::v4i32, 6 }, // Shift each lane + blend.
  800. { ISD::SRL, MVT::v8i32, 14 }, // Shift each lane + blend + split.
  801. { ISD::SRL, MVT::v2i64, 2 }, // Shift each lane + blend.
  802. { ISD::SRL, MVT::v4i64, 6 }, // Shift each lane + blend + split.
  803. { ISD::SRA, MVT::v32i8, 44 }, // pblendvb sequence + split.
  804. { ISD::SRA, MVT::v16i16, 28 }, // pblendvb sequence + split.
  805. { ISD::SRA, MVT::v4i32, 6 }, // Shift each lane + blend.
  806. { ISD::SRA, MVT::v8i32, 14 }, // Shift each lane + blend + split.
  807. { ISD::SRA, MVT::v2i64, 5 }, // Shift each lane + blend.
  808. { ISD::SRA, MVT::v4i64, 12 }, // Shift each lane + blend + split.
  809. { ISD::FNEG, MVT::v4f64, 2 }, // BTVER2 from http://www.agner.org/
  810. { ISD::FNEG, MVT::v8f32, 2 }, // BTVER2 from http://www.agner.org/
  811. { ISD::FMUL, MVT::f64, 2 }, // BTVER2 from http://www.agner.org/
  812. { ISD::FMUL, MVT::v2f64, 2 }, // BTVER2 from http://www.agner.org/
  813. { ISD::FMUL, MVT::v4f64, 4 }, // BTVER2 from http://www.agner.org/
  814. { ISD::FDIV, MVT::f32, 14 }, // SNB from http://www.agner.org/
  815. { ISD::FDIV, MVT::v4f32, 14 }, // SNB from http://www.agner.org/
  816. { ISD::FDIV, MVT::v8f32, 28 }, // SNB from http://www.agner.org/
  817. { ISD::FDIV, MVT::f64, 22 }, // SNB from http://www.agner.org/
  818. { ISD::FDIV, MVT::v2f64, 22 }, // SNB from http://www.agner.org/
  819. { ISD::FDIV, MVT::v4f64, 44 }, // SNB from http://www.agner.org/
  820. };
  821. if (ST->hasAVX())
  822. if (const auto *Entry = CostTableLookup(AVX1CostTable, ISD, LT.second))
  823. return LT.first * Entry->Cost;
  824. static const CostTblEntry SSE42CostTable[] = {
  825. { ISD::FADD, MVT::f64, 1 }, // Nehalem from http://www.agner.org/
  826. { ISD::FADD, MVT::f32, 1 }, // Nehalem from http://www.agner.org/
  827. { ISD::FADD, MVT::v2f64, 1 }, // Nehalem from http://www.agner.org/
  828. { ISD::FADD, MVT::v4f32, 1 }, // Nehalem from http://www.agner.org/
  829. { ISD::FSUB, MVT::f64, 1 }, // Nehalem from http://www.agner.org/
  830. { ISD::FSUB, MVT::f32 , 1 }, // Nehalem from http://www.agner.org/
  831. { ISD::FSUB, MVT::v2f64, 1 }, // Nehalem from http://www.agner.org/
  832. { ISD::FSUB, MVT::v4f32, 1 }, // Nehalem from http://www.agner.org/
  833. { ISD::FMUL, MVT::f64, 1 }, // Nehalem from http://www.agner.org/
  834. { ISD::FMUL, MVT::f32, 1 }, // Nehalem from http://www.agner.org/
  835. { ISD::FMUL, MVT::v2f64, 1 }, // Nehalem from http://www.agner.org/
  836. { ISD::FMUL, MVT::v4f32, 1 }, // Nehalem from http://www.agner.org/
  837. { ISD::FDIV, MVT::f32, 14 }, // Nehalem from http://www.agner.org/
  838. { ISD::FDIV, MVT::v4f32, 14 }, // Nehalem from http://www.agner.org/
  839. { ISD::FDIV, MVT::f64, 22 }, // Nehalem from http://www.agner.org/
  840. { ISD::FDIV, MVT::v2f64, 22 }, // Nehalem from http://www.agner.org/
  841. { ISD::MUL, MVT::v2i64, 6 } // 3*pmuludq/3*shift/2*add
  842. };
  843. if (ST->hasSSE42())
  844. if (const auto *Entry = CostTableLookup(SSE42CostTable, ISD, LT.second))
  845. return LT.first * Entry->Cost;
  846. static const CostTblEntry SSE41CostTable[] = {
  847. { ISD::SHL, MVT::v16i8, 10 }, // pblendvb sequence.
  848. { ISD::SHL, MVT::v8i16, 11 }, // pblendvb sequence.
  849. { ISD::SHL, MVT::v4i32, 4 }, // pslld/paddd/cvttps2dq/pmulld
  850. { ISD::SRL, MVT::v16i8, 11 }, // pblendvb sequence.
  851. { ISD::SRL, MVT::v8i16, 13 }, // pblendvb sequence.
  852. { ISD::SRL, MVT::v4i32, 16 }, // Shift each lane + blend.
  853. { ISD::SRA, MVT::v16i8, 21 }, // pblendvb sequence.
  854. { ISD::SRA, MVT::v8i16, 13 }, // pblendvb sequence.
  855. { ISD::MUL, MVT::v4i32, 2 } // pmulld (Nehalem from agner.org)
  856. };
  857. if (ST->hasSSE41())
  858. if (const auto *Entry = CostTableLookup(SSE41CostTable, ISD, LT.second))
  859. return LT.first * Entry->Cost;
  860. static const CostTblEntry SSE2CostTable[] = {
  861. // We don't correctly identify costs of casts because they are marked as
  862. // custom.
  863. { ISD::SHL, MVT::v16i8, 13 }, // cmpgtb sequence.
  864. { ISD::SHL, MVT::v8i16, 25 }, // cmpgtw sequence.
  865. { ISD::SHL, MVT::v4i32, 16 }, // pslld/paddd/cvttps2dq/pmuludq.
  866. { ISD::SHL, MVT::v2i64, 4 }, // splat+shuffle sequence.
  867. { ISD::SRL, MVT::v16i8, 14 }, // cmpgtb sequence.
  868. { ISD::SRL, MVT::v8i16, 16 }, // cmpgtw sequence.
  869. { ISD::SRL, MVT::v4i32, 12 }, // Shift each lane + blend.
  870. { ISD::SRL, MVT::v2i64, 4 }, // splat+shuffle sequence.
  871. { ISD::SRA, MVT::v16i8, 27 }, // unpacked cmpgtb sequence.
  872. { ISD::SRA, MVT::v8i16, 16 }, // cmpgtw sequence.
  873. { ISD::SRA, MVT::v4i32, 12 }, // Shift each lane + blend.
  874. { ISD::SRA, MVT::v2i64, 8 }, // srl/xor/sub splat+shuffle sequence.
  875. { ISD::MUL, MVT::v8i16, 1 }, // pmullw
  876. { ISD::MUL, MVT::v4i32, 6 }, // 3*pmuludq/4*shuffle
  877. { ISD::MUL, MVT::v2i64, 8 }, // 3*pmuludq/3*shift/2*add
  878. { ISD::FDIV, MVT::f32, 23 }, // Pentium IV from http://www.agner.org/
  879. { ISD::FDIV, MVT::v4f32, 39 }, // Pentium IV from http://www.agner.org/
  880. { ISD::FDIV, MVT::f64, 38 }, // Pentium IV from http://www.agner.org/
  881. { ISD::FDIV, MVT::v2f64, 69 }, // Pentium IV from http://www.agner.org/
  882. { ISD::FNEG, MVT::f32, 1 }, // Pentium IV from http://www.agner.org/
  883. { ISD::FNEG, MVT::f64, 1 }, // Pentium IV from http://www.agner.org/
  884. { ISD::FNEG, MVT::v4f32, 1 }, // Pentium IV from http://www.agner.org/
  885. { ISD::FNEG, MVT::v2f64, 1 }, // Pentium IV from http://www.agner.org/
  886. { ISD::FADD, MVT::f32, 2 }, // Pentium IV from http://www.agner.org/
  887. { ISD::FADD, MVT::f64, 2 }, // Pentium IV from http://www.agner.org/
  888. { ISD::FSUB, MVT::f32, 2 }, // Pentium IV from http://www.agner.org/
  889. { ISD::FSUB, MVT::f64, 2 }, // Pentium IV from http://www.agner.org/
  890. };
  891. if (ST->hasSSE2())
  892. if (const auto *Entry = CostTableLookup(SSE2CostTable, ISD, LT.second))
  893. return LT.first * Entry->Cost;
  894. static const CostTblEntry SSE1CostTable[] = {
  895. { ISD::FDIV, MVT::f32, 17 }, // Pentium III from http://www.agner.org/
  896. { ISD::FDIV, MVT::v4f32, 34 }, // Pentium III from http://www.agner.org/
  897. { ISD::FNEG, MVT::f32, 2 }, // Pentium III from http://www.agner.org/
  898. { ISD::FNEG, MVT::v4f32, 2 }, // Pentium III from http://www.agner.org/
  899. { ISD::FADD, MVT::f32, 1 }, // Pentium III from http://www.agner.org/
  900. { ISD::FADD, MVT::v4f32, 2 }, // Pentium III from http://www.agner.org/
  901. { ISD::FSUB, MVT::f32, 1 }, // Pentium III from http://www.agner.org/
  902. { ISD::FSUB, MVT::v4f32, 2 }, // Pentium III from http://www.agner.org/
  903. };
  904. if (ST->hasSSE1())
  905. if (const auto *Entry = CostTableLookup(SSE1CostTable, ISD, LT.second))
  906. return LT.first * Entry->Cost;
  907. static const CostTblEntry X64CostTbl[] = { // 64-bit targets
  908. { ISD::ADD, MVT::i64, 1 }, // Core (Merom) from http://www.agner.org/
  909. { ISD::SUB, MVT::i64, 1 }, // Core (Merom) from http://www.agner.org/
  910. { ISD::MUL, MVT::i64, 2 }, // Nehalem from http://www.agner.org/
  911. };
  912. if (ST->is64Bit())
  913. if (const auto *Entry = CostTableLookup(X64CostTbl, ISD, LT.second))
  914. return LT.first * Entry->Cost;
  915. static const CostTblEntry X86CostTbl[] = { // 32 or 64-bit targets
  916. { ISD::ADD, MVT::i8, 1 }, // Pentium III from http://www.agner.org/
  917. { ISD::ADD, MVT::i16, 1 }, // Pentium III from http://www.agner.org/
  918. { ISD::ADD, MVT::i32, 1 }, // Pentium III from http://www.agner.org/
  919. { ISD::SUB, MVT::i8, 1 }, // Pentium III from http://www.agner.org/
  920. { ISD::SUB, MVT::i16, 1 }, // Pentium III from http://www.agner.org/
  921. { ISD::SUB, MVT::i32, 1 }, // Pentium III from http://www.agner.org/
  922. };
  923. if (const auto *Entry = CostTableLookup(X86CostTbl, ISD, LT.second))
  924. return LT.first * Entry->Cost;
// It is not a good idea to vectorize division. We have to scalarize it and
// in the process we will often end up having to spill regular registers.
// The overhead of division is going to dominate most kernels anyway, so try
// hard to prevent vectorization of division - it is generally a bad idea.
// Assume, somewhat arbitrarily, that we have to be able to hide "20 cycles"
// for each lane.
  931. if (LT.second.isVector() && (ISD == ISD::SDIV || ISD == ISD::SREM ||
  932. ISD == ISD::UDIV || ISD == ISD::UREM)) {
  933. InstructionCost ScalarCost = getArithmeticInstrCost(
  934. Opcode, Ty->getScalarType(), CostKind, Op1Info, Op2Info,
  935. TargetTransformInfo::OP_None, TargetTransformInfo::OP_None);
  936. return 20 * LT.first * LT.second.getVectorNumElements() * ScalarCost;
  937. }
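// Illustrative trace of the heuristic above: for "sdiv <4 x i32>" the legal
// type is v4i32 (LT.first == 1, 4 lanes), so the returned cost is
// 20 * 1 * 4 * ScalarCost, which strongly discourages vectorizing the division.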
  938. // Fallback to the default implementation.
  939. return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info);
  940. }
  941. InstructionCost X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
  942. VectorType *BaseTp,
  943. ArrayRef<int> Mask, int Index,
  944. VectorType *SubTp) {
  945. // 64-bit packed float vectors (v2f32) are widened to type v4f32.
  946. // 64-bit packed integer vectors (v2i32) are widened to type v4i32.
  947. std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, BaseTp);
  948. Kind = improveShuffleKindFromMask(Kind, Mask);
  949. // Treat Transpose as 2-op shuffles - there's no difference in lowering.
  950. if (Kind == TTI::SK_Transpose)
  951. Kind = TTI::SK_PermuteTwoSrc;
// For Broadcasts we are splatting the first element from the first input
// register, so we only need to reference that input, and all the output
// registers are the same.
  955. if (Kind == TTI::SK_Broadcast)
  956. LT.first = 1;
  957. // Subvector extractions are free if they start at the beginning of a
  958. // vector and cheap if the subvectors are aligned.
  959. if (Kind == TTI::SK_ExtractSubvector && LT.second.isVector()) {
  960. int NumElts = LT.second.getVectorNumElements();
  961. if ((Index % NumElts) == 0)
  962. return 0;
  963. std::pair<InstructionCost, MVT> SubLT =
  964. TLI->getTypeLegalizationCost(DL, SubTp);
  965. if (SubLT.second.isVector()) {
  966. int NumSubElts = SubLT.second.getVectorNumElements();
  967. if ((Index % NumSubElts) == 0 && (NumElts % NumSubElts) == 0)
  968. return SubLT.first;
  969. // Handle some cases for widening legalization. For now we only handle
  970. // cases where the original subvector was naturally aligned and evenly
  971. // fit in its legalized subvector type.
  972. // FIXME: Remove some of the alignment restrictions.
  973. // FIXME: We can use permq for 64-bit or larger extracts from 256-bit
  974. // vectors.
  975. int OrigSubElts = cast<FixedVectorType>(SubTp)->getNumElements();
  976. if (NumSubElts > OrigSubElts && (Index % OrigSubElts) == 0 &&
  977. (NumSubElts % OrigSubElts) == 0 &&
  978. LT.second.getVectorElementType() ==
  979. SubLT.second.getVectorElementType() &&
  980. LT.second.getVectorElementType().getSizeInBits() ==
  981. BaseTp->getElementType()->getPrimitiveSizeInBits()) {
  982. assert(NumElts >= NumSubElts && NumElts > OrigSubElts &&
  983. "Unexpected number of elements!");
  984. auto *VecTy = FixedVectorType::get(BaseTp->getElementType(),
  985. LT.second.getVectorNumElements());
  986. auto *SubTy = FixedVectorType::get(BaseTp->getElementType(),
  987. SubLT.second.getVectorNumElements());
  988. int ExtractIndex = alignDown((Index % NumElts), NumSubElts);
  989. InstructionCost ExtractCost = getShuffleCost(
  990. TTI::SK_ExtractSubvector, VecTy, None, ExtractIndex, SubTy);
  991. // If the original size is 32-bits or more, we can use pshufd. Otherwise
  992. // if we have SSSE3 we can use pshufb.
  993. if (SubTp->getPrimitiveSizeInBits() >= 32 || ST->hasSSSE3())
  994. return ExtractCost + 1; // pshufd or pshufb
  995. assert(SubTp->getPrimitiveSizeInBits() == 16 &&
  996. "Unexpected vector size");
  997. return ExtractCost + 2; // worst case pshufhw + pshufd
  998. }
  999. }
  1000. }
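// Examples of the extract rules above (illustrative): extracting <4 x float>
// at index 0 from <8 x float> is free, while extracting it at index 4 costs
// SubLT.first, because both the index and the wide type are multiples of the
// legal subvector width.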
  1001. // Subvector insertions are cheap if the subvectors are aligned.
  1002. // Note that in general, the insertion starting at the beginning of a vector
  1003. // isn't free, because we need to preserve the rest of the wide vector.
  1004. if (Kind == TTI::SK_InsertSubvector && LT.second.isVector()) {
  1005. int NumElts = LT.second.getVectorNumElements();
  1006. std::pair<InstructionCost, MVT> SubLT =
  1007. TLI->getTypeLegalizationCost(DL, SubTp);
  1008. if (SubLT.second.isVector()) {
  1009. int NumSubElts = SubLT.second.getVectorNumElements();
  1010. if ((Index % NumSubElts) == 0 && (NumElts % NumSubElts) == 0)
  1011. return SubLT.first;
  1012. }
  1013. // If the insertion isn't aligned, treat it like a 2-op shuffle.
  1014. Kind = TTI::SK_PermuteTwoSrc;
  1015. }
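// E.g. inserting <4 x i32> into <8 x i32> at index 4 costs SubLT.first, while
// an unaligned insertion (say at index 2) falls through and is priced as a
// two-source permute below. (Illustrative.)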
  1016. // Handle some common (illegal) sub-vector types as they are often very cheap
  1017. // to shuffle even on targets without PSHUFB.
  1018. EVT VT = TLI->getValueType(DL, BaseTp);
  1019. if (VT.isSimple() && VT.isVector() && VT.getSizeInBits() < 128 &&
  1020. !ST->hasSSSE3()) {
  1021. static const CostTblEntry SSE2SubVectorShuffleTbl[] = {
  1022. {TTI::SK_Broadcast, MVT::v4i16, 1}, // pshuflw
  1023. {TTI::SK_Broadcast, MVT::v2i16, 1}, // pshuflw
  1024. {TTI::SK_Broadcast, MVT::v8i8, 2}, // punpck/pshuflw
  1025. {TTI::SK_Broadcast, MVT::v4i8, 2}, // punpck/pshuflw
  1026. {TTI::SK_Broadcast, MVT::v2i8, 1}, // punpck
  1027. {TTI::SK_Reverse, MVT::v4i16, 1}, // pshuflw
  1028. {TTI::SK_Reverse, MVT::v2i16, 1}, // pshuflw
  1029. {TTI::SK_Reverse, MVT::v4i8, 3}, // punpck/pshuflw/packus
  1030. {TTI::SK_Reverse, MVT::v2i8, 1}, // punpck
  1031. {TTI::SK_PermuteTwoSrc, MVT::v4i16, 2}, // punpck/pshuflw
  1032. {TTI::SK_PermuteTwoSrc, MVT::v2i16, 2}, // punpck/pshuflw
  1033. {TTI::SK_PermuteTwoSrc, MVT::v8i8, 7}, // punpck/pshuflw
  1034. {TTI::SK_PermuteTwoSrc, MVT::v4i8, 4}, // punpck/pshuflw
  1035. {TTI::SK_PermuteTwoSrc, MVT::v2i8, 2}, // punpck
  1036. {TTI::SK_PermuteSingleSrc, MVT::v4i16, 1}, // pshuflw
  1037. {TTI::SK_PermuteSingleSrc, MVT::v2i16, 1}, // pshuflw
  1038. {TTI::SK_PermuteSingleSrc, MVT::v8i8, 5}, // punpck/pshuflw
  1039. {TTI::SK_PermuteSingleSrc, MVT::v4i8, 3}, // punpck/pshuflw
  1040. {TTI::SK_PermuteSingleSrc, MVT::v2i8, 1}, // punpck
  1041. };
  1042. if (ST->hasSSE2())
  1043. if (const auto *Entry =
  1044. CostTableLookup(SSE2SubVectorShuffleTbl, Kind, VT.getSimpleVT()))
  1045. return Entry->Cost;
  1046. }
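// E.g. a broadcast of <4 x i16> on plain SSE2 is costed at 1 (pshuflw) by the
// table above, even though the type itself is illegal and gets widened.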
// We are going to permute multiple sources, and the result will end up in
// multiple destinations. We provide an accurate cost only for splits where
// the element type remains the same.
  1050. if (Kind == TTI::SK_PermuteSingleSrc && LT.first != 1) {
  1051. MVT LegalVT = LT.second;
  1052. if (LegalVT.isVector() &&
  1053. LegalVT.getVectorElementType().getSizeInBits() ==
  1054. BaseTp->getElementType()->getPrimitiveSizeInBits() &&
  1055. LegalVT.getVectorNumElements() <
  1056. cast<FixedVectorType>(BaseTp)->getNumElements()) {
  1057. unsigned VecTySize = DL.getTypeStoreSize(BaseTp);
  1058. unsigned LegalVTSize = LegalVT.getStoreSize();
  1059. // Number of source vectors after legalization:
  1060. unsigned NumOfSrcs = (VecTySize + LegalVTSize - 1) / LegalVTSize;
  1061. // Number of destination vectors after legalization:
  1062. InstructionCost NumOfDests = LT.first;
  1063. auto *SingleOpTy = FixedVectorType::get(BaseTp->getElementType(),
  1064. LegalVT.getVectorNumElements());
  1065. InstructionCost NumOfShuffles = (NumOfSrcs - 1) * NumOfDests;
  1066. return NumOfShuffles * getShuffleCost(TTI::SK_PermuteTwoSrc, SingleOpTy,
  1067. None, 0, nullptr);
  1068. }
  1069. return BaseT::getShuffleCost(Kind, BaseTp, Mask, Index, SubTp);
  1070. }
  1071. // For 2-input shuffles, we must account for splitting the 2 inputs into many.
  1072. if (Kind == TTI::SK_PermuteTwoSrc && LT.first != 1) {
  1073. // We assume that source and destination have the same vector type.
  1074. InstructionCost NumOfDests = LT.first;
  1075. InstructionCost NumOfShufflesPerDest = LT.first * 2 - 1;
  1076. LT.first = NumOfDests * NumOfShufflesPerDest;
  1077. }
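// Illustrative trace: a two-source permute of <16 x i32> on AVX2 legalizes to
// two v8i32 halves, so NumOfDests == 2 and NumOfShufflesPerDest == 3, giving
// LT.first == 6 before the per-type table cost below is applied.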
  1078. static const CostTblEntry AVX512FP16ShuffleTbl[] = {
  1079. {TTI::SK_Broadcast, MVT::v32f16, 1}, // vpbroadcastw
  1080. {TTI::SK_Broadcast, MVT::v16f16, 1}, // vpbroadcastw
  1081. {TTI::SK_Broadcast, MVT::v8f16, 1}, // vpbroadcastw
  1082. {TTI::SK_Reverse, MVT::v32f16, 2}, // vpermw
  1083. {TTI::SK_Reverse, MVT::v16f16, 2}, // vpermw
  1084. {TTI::SK_Reverse, MVT::v8f16, 1}, // vpshufb
  1085. {TTI::SK_PermuteSingleSrc, MVT::v32f16, 2}, // vpermw
  1086. {TTI::SK_PermuteSingleSrc, MVT::v16f16, 2}, // vpermw
  1087. {TTI::SK_PermuteSingleSrc, MVT::v8f16, 1}, // vpshufb
  1088. {TTI::SK_PermuteTwoSrc, MVT::v32f16, 2}, // vpermt2w
  1089. {TTI::SK_PermuteTwoSrc, MVT::v16f16, 2}, // vpermt2w
  1090. {TTI::SK_PermuteTwoSrc, MVT::v8f16, 2} // vpermt2w
  1091. };
  1092. if (!ST->useSoftFloat() && ST->hasFP16())
  1093. if (const auto *Entry =
  1094. CostTableLookup(AVX512FP16ShuffleTbl, Kind, LT.second))
  1095. return LT.first * Entry->Cost;
  1096. static const CostTblEntry AVX512VBMIShuffleTbl[] = {
  1097. {TTI::SK_Reverse, MVT::v64i8, 1}, // vpermb
  1098. {TTI::SK_Reverse, MVT::v32i8, 1}, // vpermb
  1099. {TTI::SK_PermuteSingleSrc, MVT::v64i8, 1}, // vpermb
  1100. {TTI::SK_PermuteSingleSrc, MVT::v32i8, 1}, // vpermb
  1101. {TTI::SK_PermuteTwoSrc, MVT::v64i8, 2}, // vpermt2b
  1102. {TTI::SK_PermuteTwoSrc, MVT::v32i8, 2}, // vpermt2b
  1103. {TTI::SK_PermuteTwoSrc, MVT::v16i8, 2} // vpermt2b
  1104. };
  1105. if (ST->hasVBMI())
  1106. if (const auto *Entry =
  1107. CostTableLookup(AVX512VBMIShuffleTbl, Kind, LT.second))
  1108. return LT.first * Entry->Cost;
  1109. static const CostTblEntry AVX512BWShuffleTbl[] = {
  1110. {TTI::SK_Broadcast, MVT::v32i16, 1}, // vpbroadcastw
  1111. {TTI::SK_Broadcast, MVT::v64i8, 1}, // vpbroadcastb
  1112. {TTI::SK_Reverse, MVT::v32i16, 2}, // vpermw
  1113. {TTI::SK_Reverse, MVT::v16i16, 2}, // vpermw
  1114. {TTI::SK_Reverse, MVT::v64i8, 2}, // pshufb + vshufi64x2
  1115. {TTI::SK_PermuteSingleSrc, MVT::v32i16, 2}, // vpermw
  1116. {TTI::SK_PermuteSingleSrc, MVT::v16i16, 2}, // vpermw
  1117. {TTI::SK_PermuteSingleSrc, MVT::v64i8, 8}, // extend to v32i16
  1118. {TTI::SK_PermuteTwoSrc, MVT::v32i16, 2}, // vpermt2w
  1119. {TTI::SK_PermuteTwoSrc, MVT::v16i16, 2}, // vpermt2w
  1120. {TTI::SK_PermuteTwoSrc, MVT::v8i16, 2}, // vpermt2w
  1121. {TTI::SK_PermuteTwoSrc, MVT::v64i8, 19}, // 6 * v32i8 + 1
  1122. {TTI::SK_Select, MVT::v32i16, 1}, // vblendmw
  1123. {TTI::SK_Select, MVT::v64i8, 1}, // vblendmb
  1124. };
  1125. if (ST->hasBWI())
  1126. if (const auto *Entry =
  1127. CostTableLookup(AVX512BWShuffleTbl, Kind, LT.second))
  1128. return LT.first * Entry->Cost;
  1129. static const CostTblEntry AVX512ShuffleTbl[] = {
  1130. {TTI::SK_Broadcast, MVT::v8f64, 1}, // vbroadcastpd
  1131. {TTI::SK_Broadcast, MVT::v16f32, 1}, // vbroadcastps
  1132. {TTI::SK_Broadcast, MVT::v8i64, 1}, // vpbroadcastq
  1133. {TTI::SK_Broadcast, MVT::v16i32, 1}, // vpbroadcastd
  1134. {TTI::SK_Broadcast, MVT::v32i16, 1}, // vpbroadcastw
  1135. {TTI::SK_Broadcast, MVT::v64i8, 1}, // vpbroadcastb
  1136. {TTI::SK_Reverse, MVT::v8f64, 1}, // vpermpd
  1137. {TTI::SK_Reverse, MVT::v16f32, 1}, // vpermps
  1138. {TTI::SK_Reverse, MVT::v8i64, 1}, // vpermq
  1139. {TTI::SK_Reverse, MVT::v16i32, 1}, // vpermd
  1140. {TTI::SK_Reverse, MVT::v32i16, 7}, // per mca
  1141. {TTI::SK_Reverse, MVT::v64i8, 7}, // per mca
  1142. {TTI::SK_PermuteSingleSrc, MVT::v8f64, 1}, // vpermpd
  1143. {TTI::SK_PermuteSingleSrc, MVT::v4f64, 1}, // vpermpd
  1144. {TTI::SK_PermuteSingleSrc, MVT::v2f64, 1}, // vpermpd
  1145. {TTI::SK_PermuteSingleSrc, MVT::v16f32, 1}, // vpermps
  1146. {TTI::SK_PermuteSingleSrc, MVT::v8f32, 1}, // vpermps
  1147. {TTI::SK_PermuteSingleSrc, MVT::v4f32, 1}, // vpermps
  1148. {TTI::SK_PermuteSingleSrc, MVT::v8i64, 1}, // vpermq
  1149. {TTI::SK_PermuteSingleSrc, MVT::v4i64, 1}, // vpermq
  1150. {TTI::SK_PermuteSingleSrc, MVT::v2i64, 1}, // vpermq
  1151. {TTI::SK_PermuteSingleSrc, MVT::v16i32, 1}, // vpermd
  1152. {TTI::SK_PermuteSingleSrc, MVT::v8i32, 1}, // vpermd
  1153. {TTI::SK_PermuteSingleSrc, MVT::v4i32, 1}, // vpermd
  1154. {TTI::SK_PermuteSingleSrc, MVT::v16i8, 1}, // pshufb
  1155. {TTI::SK_PermuteTwoSrc, MVT::v8f64, 1}, // vpermt2pd
  1156. {TTI::SK_PermuteTwoSrc, MVT::v16f32, 1}, // vpermt2ps
  1157. {TTI::SK_PermuteTwoSrc, MVT::v8i64, 1}, // vpermt2q
  1158. {TTI::SK_PermuteTwoSrc, MVT::v16i32, 1}, // vpermt2d
  1159. {TTI::SK_PermuteTwoSrc, MVT::v4f64, 1}, // vpermt2pd
  1160. {TTI::SK_PermuteTwoSrc, MVT::v8f32, 1}, // vpermt2ps
  1161. {TTI::SK_PermuteTwoSrc, MVT::v4i64, 1}, // vpermt2q
  1162. {TTI::SK_PermuteTwoSrc, MVT::v8i32, 1}, // vpermt2d
  1163. {TTI::SK_PermuteTwoSrc, MVT::v2f64, 1}, // vpermt2pd
  1164. {TTI::SK_PermuteTwoSrc, MVT::v4f32, 1}, // vpermt2ps
  1165. {TTI::SK_PermuteTwoSrc, MVT::v2i64, 1}, // vpermt2q
  1166. {TTI::SK_PermuteTwoSrc, MVT::v4i32, 1}, // vpermt2d
  1167. // FIXME: This just applies the type legalization cost rules above
  1168. // assuming these completely split.
  1169. {TTI::SK_PermuteSingleSrc, MVT::v32i16, 14},
  1170. {TTI::SK_PermuteSingleSrc, MVT::v64i8, 14},
  1171. {TTI::SK_PermuteTwoSrc, MVT::v32i16, 42},
  1172. {TTI::SK_PermuteTwoSrc, MVT::v64i8, 42},
  1173. {TTI::SK_Select, MVT::v32i16, 1}, // vpternlogq
  1174. {TTI::SK_Select, MVT::v64i8, 1}, // vpternlogq
  1175. {TTI::SK_Select, MVT::v8f64, 1}, // vblendmpd
  1176. {TTI::SK_Select, MVT::v16f32, 1}, // vblendmps
  1177. {TTI::SK_Select, MVT::v8i64, 1}, // vblendmq
  1178. {TTI::SK_Select, MVT::v16i32, 1}, // vblendmd
  1179. };
  1180. if (ST->hasAVX512())
  1181. if (const auto *Entry = CostTableLookup(AVX512ShuffleTbl, Kind, LT.second))
  1182. return LT.first * Entry->Cost;
  1183. static const CostTblEntry AVX2ShuffleTbl[] = {
  1184. {TTI::SK_Broadcast, MVT::v4f64, 1}, // vbroadcastpd
  1185. {TTI::SK_Broadcast, MVT::v8f32, 1}, // vbroadcastps
  1186. {TTI::SK_Broadcast, MVT::v4i64, 1}, // vpbroadcastq
  1187. {TTI::SK_Broadcast, MVT::v8i32, 1}, // vpbroadcastd
  1188. {TTI::SK_Broadcast, MVT::v16i16, 1}, // vpbroadcastw
  1189. {TTI::SK_Broadcast, MVT::v32i8, 1}, // vpbroadcastb
  1190. {TTI::SK_Reverse, MVT::v4f64, 1}, // vpermpd
  1191. {TTI::SK_Reverse, MVT::v8f32, 1}, // vpermps
  1192. {TTI::SK_Reverse, MVT::v4i64, 1}, // vpermq
  1193. {TTI::SK_Reverse, MVT::v8i32, 1}, // vpermd
  1194. {TTI::SK_Reverse, MVT::v16i16, 2}, // vperm2i128 + pshufb
  1195. {TTI::SK_Reverse, MVT::v32i8, 2}, // vperm2i128 + pshufb
  1196. {TTI::SK_Select, MVT::v16i16, 1}, // vpblendvb
  1197. {TTI::SK_Select, MVT::v32i8, 1}, // vpblendvb
  1198. {TTI::SK_PermuteSingleSrc, MVT::v4f64, 1}, // vpermpd
  1199. {TTI::SK_PermuteSingleSrc, MVT::v8f32, 1}, // vpermps
  1200. {TTI::SK_PermuteSingleSrc, MVT::v4i64, 1}, // vpermq
  1201. {TTI::SK_PermuteSingleSrc, MVT::v8i32, 1}, // vpermd
  1202. {TTI::SK_PermuteSingleSrc, MVT::v16i16, 4}, // vperm2i128 + 2*vpshufb
  1203. // + vpblendvb
  1204. {TTI::SK_PermuteSingleSrc, MVT::v32i8, 4}, // vperm2i128 + 2*vpshufb
  1205. // + vpblendvb
  1206. {TTI::SK_PermuteTwoSrc, MVT::v4f64, 3}, // 2*vpermpd + vblendpd
  1207. {TTI::SK_PermuteTwoSrc, MVT::v8f32, 3}, // 2*vpermps + vblendps
  1208. {TTI::SK_PermuteTwoSrc, MVT::v4i64, 3}, // 2*vpermq + vpblendd
  1209. {TTI::SK_PermuteTwoSrc, MVT::v8i32, 3}, // 2*vpermd + vpblendd
  1210. {TTI::SK_PermuteTwoSrc, MVT::v16i16, 7}, // 2*vperm2i128 + 4*vpshufb
  1211. // + vpblendvb
  1212. {TTI::SK_PermuteTwoSrc, MVT::v32i8, 7}, // 2*vperm2i128 + 4*vpshufb
  1213. // + vpblendvb
  1214. };
  1215. if (ST->hasAVX2())
  1216. if (const auto *Entry = CostTableLookup(AVX2ShuffleTbl, Kind, LT.second))
  1217. return LT.first * Entry->Cost;
  1218. static const CostTblEntry XOPShuffleTbl[] = {
  1219. {TTI::SK_PermuteSingleSrc, MVT::v4f64, 2}, // vperm2f128 + vpermil2pd
  1220. {TTI::SK_PermuteSingleSrc, MVT::v8f32, 2}, // vperm2f128 + vpermil2ps
  1221. {TTI::SK_PermuteSingleSrc, MVT::v4i64, 2}, // vperm2f128 + vpermil2pd
  1222. {TTI::SK_PermuteSingleSrc, MVT::v8i32, 2}, // vperm2f128 + vpermil2ps
  1223. {TTI::SK_PermuteSingleSrc, MVT::v16i16, 4}, // vextractf128 + 2*vpperm
  1224. // + vinsertf128
  1225. {TTI::SK_PermuteSingleSrc, MVT::v32i8, 4}, // vextractf128 + 2*vpperm
  1226. // + vinsertf128
  1227. {TTI::SK_PermuteTwoSrc, MVT::v16i16, 9}, // 2*vextractf128 + 6*vpperm
  1228. // + vinsertf128
  1229. {TTI::SK_PermuteTwoSrc, MVT::v8i16, 1}, // vpperm
  1230. {TTI::SK_PermuteTwoSrc, MVT::v32i8, 9}, // 2*vextractf128 + 6*vpperm
  1231. // + vinsertf128
  1232. {TTI::SK_PermuteTwoSrc, MVT::v16i8, 1}, // vpperm
  1233. };
  1234. if (ST->hasXOP())
  1235. if (const auto *Entry = CostTableLookup(XOPShuffleTbl, Kind, LT.second))
  1236. return LT.first * Entry->Cost;
  1237. static const CostTblEntry AVX1ShuffleTbl[] = {
  1238. {TTI::SK_Broadcast, MVT::v4f64, 2}, // vperm2f128 + vpermilpd
  1239. {TTI::SK_Broadcast, MVT::v8f32, 2}, // vperm2f128 + vpermilps
  1240. {TTI::SK_Broadcast, MVT::v4i64, 2}, // vperm2f128 + vpermilpd
  1241. {TTI::SK_Broadcast, MVT::v8i32, 2}, // vperm2f128 + vpermilps
  1242. {TTI::SK_Broadcast, MVT::v16i16, 3}, // vpshuflw + vpshufd + vinsertf128
  1243. {TTI::SK_Broadcast, MVT::v32i8, 2}, // vpshufb + vinsertf128
  1244. {TTI::SK_Reverse, MVT::v4f64, 2}, // vperm2f128 + vpermilpd
  1245. {TTI::SK_Reverse, MVT::v8f32, 2}, // vperm2f128 + vpermilps
  1246. {TTI::SK_Reverse, MVT::v4i64, 2}, // vperm2f128 + vpermilpd
  1247. {TTI::SK_Reverse, MVT::v8i32, 2}, // vperm2f128 + vpermilps
  1248. {TTI::SK_Reverse, MVT::v16i16, 4}, // vextractf128 + 2*pshufb
  1249. // + vinsertf128
  1250. {TTI::SK_Reverse, MVT::v32i8, 4}, // vextractf128 + 2*pshufb
  1251. // + vinsertf128
  1252. {TTI::SK_Select, MVT::v4i64, 1}, // vblendpd
  1253. {TTI::SK_Select, MVT::v4f64, 1}, // vblendpd
  1254. {TTI::SK_Select, MVT::v8i32, 1}, // vblendps
  1255. {TTI::SK_Select, MVT::v8f32, 1}, // vblendps
  1256. {TTI::SK_Select, MVT::v16i16, 3}, // vpand + vpandn + vpor
  1257. {TTI::SK_Select, MVT::v32i8, 3}, // vpand + vpandn + vpor
  1258. {TTI::SK_PermuteSingleSrc, MVT::v4f64, 2}, // vperm2f128 + vshufpd
  1259. {TTI::SK_PermuteSingleSrc, MVT::v4i64, 2}, // vperm2f128 + vshufpd
  1260. {TTI::SK_PermuteSingleSrc, MVT::v8f32, 4}, // 2*vperm2f128 + 2*vshufps
  1261. {TTI::SK_PermuteSingleSrc, MVT::v8i32, 4}, // 2*vperm2f128 + 2*vshufps
  1262. {TTI::SK_PermuteSingleSrc, MVT::v16i16, 8}, // vextractf128 + 4*pshufb
  1263. // + 2*por + vinsertf128
  1264. {TTI::SK_PermuteSingleSrc, MVT::v32i8, 8}, // vextractf128 + 4*pshufb
  1265. // + 2*por + vinsertf128
  1266. {TTI::SK_PermuteTwoSrc, MVT::v4f64, 3}, // 2*vperm2f128 + vshufpd
  1267. {TTI::SK_PermuteTwoSrc, MVT::v4i64, 3}, // 2*vperm2f128 + vshufpd
  1268. {TTI::SK_PermuteTwoSrc, MVT::v8f32, 4}, // 2*vperm2f128 + 2*vshufps
  1269. {TTI::SK_PermuteTwoSrc, MVT::v8i32, 4}, // 2*vperm2f128 + 2*vshufps
  1270. {TTI::SK_PermuteTwoSrc, MVT::v16i16, 15}, // 2*vextractf128 + 8*pshufb
  1271. // + 4*por + vinsertf128
  1272. {TTI::SK_PermuteTwoSrc, MVT::v32i8, 15}, // 2*vextractf128 + 8*pshufb
  1273. // + 4*por + vinsertf128
  1274. };
  1275. if (ST->hasAVX())
  1276. if (const auto *Entry = CostTableLookup(AVX1ShuffleTbl, Kind, LT.second))
  1277. return LT.first * Entry->Cost;
  1278. static const CostTblEntry SSE41ShuffleTbl[] = {
  1279. {TTI::SK_Select, MVT::v2i64, 1}, // pblendw
  1280. {TTI::SK_Select, MVT::v2f64, 1}, // movsd
  1281. {TTI::SK_Select, MVT::v4i32, 1}, // pblendw
  1282. {TTI::SK_Select, MVT::v4f32, 1}, // blendps
  1283. {TTI::SK_Select, MVT::v8i16, 1}, // pblendw
  1284. {TTI::SK_Select, MVT::v16i8, 1} // pblendvb
  1285. };
  1286. if (ST->hasSSE41())
  1287. if (const auto *Entry = CostTableLookup(SSE41ShuffleTbl, Kind, LT.second))
  1288. return LT.first * Entry->Cost;
  1289. static const CostTblEntry SSSE3ShuffleTbl[] = {
  1290. {TTI::SK_Broadcast, MVT::v8i16, 1}, // pshufb
  1291. {TTI::SK_Broadcast, MVT::v16i8, 1}, // pshufb
  1292. {TTI::SK_Reverse, MVT::v8i16, 1}, // pshufb
  1293. {TTI::SK_Reverse, MVT::v16i8, 1}, // pshufb
  1294. {TTI::SK_Select, MVT::v8i16, 3}, // 2*pshufb + por
  1295. {TTI::SK_Select, MVT::v16i8, 3}, // 2*pshufb + por
  1296. {TTI::SK_PermuteSingleSrc, MVT::v8i16, 1}, // pshufb
  1297. {TTI::SK_PermuteSingleSrc, MVT::v16i8, 1}, // pshufb
  1298. {TTI::SK_PermuteTwoSrc, MVT::v8i16, 3}, // 2*pshufb + por
  1299. {TTI::SK_PermuteTwoSrc, MVT::v16i8, 3}, // 2*pshufb + por
  1300. };
  1301. if (ST->hasSSSE3())
  1302. if (const auto *Entry = CostTableLookup(SSSE3ShuffleTbl, Kind, LT.second))
  1303. return LT.first * Entry->Cost;
  1304. static const CostTblEntry SSE2ShuffleTbl[] = {
  1305. {TTI::SK_Broadcast, MVT::v2f64, 1}, // shufpd
  1306. {TTI::SK_Broadcast, MVT::v2i64, 1}, // pshufd
  1307. {TTI::SK_Broadcast, MVT::v4i32, 1}, // pshufd
  1308. {TTI::SK_Broadcast, MVT::v8i16, 2}, // pshuflw + pshufd
  1309. {TTI::SK_Broadcast, MVT::v16i8, 3}, // unpck + pshuflw + pshufd
  1310. {TTI::SK_Reverse, MVT::v2f64, 1}, // shufpd
  1311. {TTI::SK_Reverse, MVT::v2i64, 1}, // pshufd
  1312. {TTI::SK_Reverse, MVT::v4i32, 1}, // pshufd
  1313. {TTI::SK_Reverse, MVT::v8i16, 3}, // pshuflw + pshufhw + pshufd
  1314. {TTI::SK_Reverse, MVT::v16i8, 9}, // 2*pshuflw + 2*pshufhw
  1315. // + 2*pshufd + 2*unpck + packus
  1316. {TTI::SK_Select, MVT::v2i64, 1}, // movsd
  1317. {TTI::SK_Select, MVT::v2f64, 1}, // movsd
  1318. {TTI::SK_Select, MVT::v4i32, 2}, // 2*shufps
  1319. {TTI::SK_Select, MVT::v8i16, 3}, // pand + pandn + por
  1320. {TTI::SK_Select, MVT::v16i8, 3}, // pand + pandn + por
  1321. {TTI::SK_PermuteSingleSrc, MVT::v2f64, 1}, // shufpd
  1322. {TTI::SK_PermuteSingleSrc, MVT::v2i64, 1}, // pshufd
  1323. {TTI::SK_PermuteSingleSrc, MVT::v4i32, 1}, // pshufd
  1324. {TTI::SK_PermuteSingleSrc, MVT::v8i16, 5}, // 2*pshuflw + 2*pshufhw
  1325. // + pshufd/unpck
  1326. { TTI::SK_PermuteSingleSrc, MVT::v16i8, 10 }, // 2*pshuflw + 2*pshufhw
  1327. // + 2*pshufd + 2*unpck + 2*packus
  1328. { TTI::SK_PermuteTwoSrc, MVT::v2f64, 1 }, // shufpd
  1329. { TTI::SK_PermuteTwoSrc, MVT::v2i64, 1 }, // shufpd
  1330. { TTI::SK_PermuteTwoSrc, MVT::v4i32, 2 }, // 2*{unpck,movsd,pshufd}
  1331. { TTI::SK_PermuteTwoSrc, MVT::v8i16, 8 }, // blend+permute
  1332. { TTI::SK_PermuteTwoSrc, MVT::v16i8, 13 }, // blend+permute
  1333. };
  1334. if (ST->hasSSE2())
  1335. if (const auto *Entry = CostTableLookup(SSE2ShuffleTbl, Kind, LT.second))
  1336. return LT.first * Entry->Cost;
  1337. static const CostTblEntry SSE1ShuffleTbl[] = {
  1338. { TTI::SK_Broadcast, MVT::v4f32, 1 }, // shufps
  1339. { TTI::SK_Reverse, MVT::v4f32, 1 }, // shufps
  1340. { TTI::SK_Select, MVT::v4f32, 2 }, // 2*shufps
  1341. { TTI::SK_PermuteSingleSrc, MVT::v4f32, 1 }, // shufps
  1342. { TTI::SK_PermuteTwoSrc, MVT::v4f32, 2 }, // 2*shufps
  1343. };
  1344. if (ST->hasSSE1())
  1345. if (const auto *Entry = CostTableLookup(SSE1ShuffleTbl, Kind, LT.second))
  1346. return LT.first * Entry->Cost;
  1347. return BaseT::getShuffleCost(Kind, BaseTp, Mask, Index, SubTp);
  1348. }
  1349. InstructionCost X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
  1350. Type *Src,
  1351. TTI::CastContextHint CCH,
  1352. TTI::TargetCostKind CostKind,
  1353. const Instruction *I) {
  1354. int ISD = TLI->InstructionOpcodeToISD(Opcode);
  1355. assert(ISD && "Invalid opcode");
  1356. // TODO: Allow non-throughput costs that aren't binary.
  1357. auto AdjustCost = [&CostKind](InstructionCost Cost) -> InstructionCost {
  1358. if (CostKind != TTI::TCK_RecipThroughput)
  1359. return Cost == 0 ? 0 : 1;
  1360. return Cost;
  1361. };
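// E.g. with CostKind == TTI::TCK_CodeSize, AdjustCost collapses any non-zero
// throughput-style cost to 1, so only the free-vs-not-free distinction
// survives for non-throughput queries.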
  1362. // The cost tables include both specific, custom (non-legal) src/dst type
  1363. // conversions and generic, legalized types. We test for customs first, before
  1364. // falling back to legalization.
  1365. // FIXME: Need a better design of the cost table to handle non-simple types of
  1366. // potential massive combinations (elem_num x src_type x dst_type).
  1367. static const TypeConversionCostTblEntry AVX512BWConversionTbl[] {
  1368. { ISD::SIGN_EXTEND, MVT::v32i16, MVT::v32i8, 1 },
  1369. { ISD::ZERO_EXTEND, MVT::v32i16, MVT::v32i8, 1 },
  1370. // Mask sign extend has an instruction.
  1371. { ISD::SIGN_EXTEND, MVT::v2i8, MVT::v2i1, 1 },
  1372. { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v2i1, 1 },
  1373. { ISD::SIGN_EXTEND, MVT::v2i16, MVT::v2i1, 1 },
  1374. { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v2i1, 1 },
  1375. { ISD::SIGN_EXTEND, MVT::v4i8, MVT::v4i1, 1 },
  1376. { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v4i1, 1 },
  1377. { ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i1, 1 },
  1378. { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v4i1, 1 },
  1379. { ISD::SIGN_EXTEND, MVT::v8i8, MVT::v8i1, 1 },
  1380. { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v8i1, 1 },
  1381. { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i1, 1 },
  1382. { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v16i1, 1 },
  1383. { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, 1 },
  1384. { ISD::SIGN_EXTEND, MVT::v32i8, MVT::v32i1, 1 },
  1385. { ISD::SIGN_EXTEND, MVT::v32i16, MVT::v32i1, 1 },
  1386. { ISD::SIGN_EXTEND, MVT::v64i8, MVT::v64i1, 1 },
  1387. { ISD::SIGN_EXTEND, MVT::v32i16, MVT::v64i1, 1 },
  1388. // Mask zero extend is a sext + shift.
  1389. { ISD::ZERO_EXTEND, MVT::v2i8, MVT::v2i1, 2 },
  1390. { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v2i1, 2 },
  1391. { ISD::ZERO_EXTEND, MVT::v2i16, MVT::v2i1, 2 },
  1392. { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v2i1, 2 },
  1393. { ISD::ZERO_EXTEND, MVT::v4i8, MVT::v4i1, 2 },
  1394. { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v4i1, 2 },
  1395. { ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i1, 2 },
  1396. { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v4i1, 2 },
  1397. { ISD::ZERO_EXTEND, MVT::v8i8, MVT::v8i1, 2 },
  1398. { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v8i1, 2 },
  1399. { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i1, 2 },
  1400. { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v16i1, 2 },
  1401. { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, 2 },
  1402. { ISD::ZERO_EXTEND, MVT::v32i8, MVT::v32i1, 2 },
  1403. { ISD::ZERO_EXTEND, MVT::v32i16, MVT::v32i1, 2 },
  1404. { ISD::ZERO_EXTEND, MVT::v64i8, MVT::v64i1, 2 },
  1405. { ISD::ZERO_EXTEND, MVT::v32i16, MVT::v64i1, 2 },
  1406. { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, 2 },
  1407. { ISD::TRUNCATE, MVT::v2i1, MVT::v16i8, 2 },
  1408. { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, 2 },
  1409. { ISD::TRUNCATE, MVT::v2i1, MVT::v8i16, 2 },
  1410. { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, 2 },
  1411. { ISD::TRUNCATE, MVT::v4i1, MVT::v16i8, 2 },
  1412. { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, 2 },
  1413. { ISD::TRUNCATE, MVT::v4i1, MVT::v8i16, 2 },
  1414. { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, 2 },
  1415. { ISD::TRUNCATE, MVT::v8i1, MVT::v16i8, 2 },
  1416. { ISD::TRUNCATE, MVT::v8i1, MVT::v8i16, 2 },
  1417. { ISD::TRUNCATE, MVT::v16i1, MVT::v16i8, 2 },
  1418. { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, 2 },
  1419. { ISD::TRUNCATE, MVT::v32i1, MVT::v32i8, 2 },
  1420. { ISD::TRUNCATE, MVT::v32i1, MVT::v32i16, 2 },
  1421. { ISD::TRUNCATE, MVT::v64i1, MVT::v64i8, 2 },
  1422. { ISD::TRUNCATE, MVT::v64i1, MVT::v32i16, 2 },
  1423. { ISD::TRUNCATE, MVT::v32i8, MVT::v32i16, 2 },
  1424. { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 2 }, // widen to zmm
  1425. { ISD::TRUNCATE, MVT::v2i8, MVT::v2i16, 2 }, // vpmovwb
  1426. { ISD::TRUNCATE, MVT::v4i8, MVT::v4i16, 2 }, // vpmovwb
  1427. { ISD::TRUNCATE, MVT::v8i8, MVT::v8i16, 2 }, // vpmovwb
  1428. };
  1429. static const TypeConversionCostTblEntry AVX512DQConversionTbl[] = {
  1430. // Mask sign extend has an instruction.
  1431. { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i1, 1 },
  1432. { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v2i1, 1 },
  1433. { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i1, 1 },
  1434. { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, 1 },
  1435. { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, 1 },
  1436. { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v16i1, 1 },
  1437. { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i1, 1 },
  1438. { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i1, 1 },
  1439. // Mask zero extend is a sext + shift.
  1440. { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i1, 2 },
  1441. { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v2i1, 2 },
  1442. { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i1, 2 },
  1443. { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, 2 },
  1444. { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, 2 },
  1445. { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v16i1, 2 },
  1446. { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i1, 2 },
  1447. { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i1, 2 },
  1448. { ISD::TRUNCATE, MVT::v2i1, MVT::v2i64, 2 },
  1449. { ISD::TRUNCATE, MVT::v2i1, MVT::v4i32, 2 },
  1450. { ISD::TRUNCATE, MVT::v4i1, MVT::v4i32, 2 },
  1451. { ISD::TRUNCATE, MVT::v4i1, MVT::v4i64, 2 },
  1452. { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, 2 },
  1453. { ISD::TRUNCATE, MVT::v8i1, MVT::v8i64, 2 },
  1454. { ISD::TRUNCATE, MVT::v16i1, MVT::v16i32, 2 },
  1455. { ISD::TRUNCATE, MVT::v16i1, MVT::v8i64, 2 },
  1456. { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i64, 1 },
  1457. { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i64, 1 },
  1458. { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i64, 1 },
  1459. { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i64, 1 },
  1460. { ISD::FP_TO_SINT, MVT::v8i64, MVT::v8f32, 1 },
  1461. { ISD::FP_TO_SINT, MVT::v8i64, MVT::v8f64, 1 },
  1462. { ISD::FP_TO_UINT, MVT::v8i64, MVT::v8f32, 1 },
  1463. { ISD::FP_TO_UINT, MVT::v8i64, MVT::v8f64, 1 },
  1464. };
  1465. // TODO: For AVX512DQ + AVX512VL, we also have cheap casts for 128-bit and
  1466. // 256-bit wide vectors.
  1467. static const TypeConversionCostTblEntry AVX512FConversionTbl[] = {
  1468. { ISD::FP_EXTEND, MVT::v8f64, MVT::v8f32, 1 },
  1469. { ISD::FP_EXTEND, MVT::v8f64, MVT::v16f32, 3 },
  1470. { ISD::FP_ROUND, MVT::v8f32, MVT::v8f64, 1 },
  1471. { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, 3 }, // sext+vpslld+vptestmd
  1472. { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, 3 }, // sext+vpslld+vptestmd
  1473. { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, 3 }, // sext+vpslld+vptestmd
  1474. { ISD::TRUNCATE, MVT::v16i1, MVT::v16i8, 3 }, // sext+vpslld+vptestmd
  1475. { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, 3 }, // sext+vpsllq+vptestmq
  1476. { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, 3 }, // sext+vpsllq+vptestmq
  1477. { ISD::TRUNCATE, MVT::v8i1, MVT::v8i16, 3 }, // sext+vpsllq+vptestmq
  1478. { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, 3 }, // sext+vpslld+vptestmd
  1479. { ISD::TRUNCATE, MVT::v2i1, MVT::v2i32, 2 }, // zmm vpslld+vptestmd
  1480. { ISD::TRUNCATE, MVT::v4i1, MVT::v4i32, 2 }, // zmm vpslld+vptestmd
  1481. { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, 2 }, // zmm vpslld+vptestmd
  1482. { ISD::TRUNCATE, MVT::v16i1, MVT::v16i32, 2 }, // vpslld+vptestmd
  1483. { ISD::TRUNCATE, MVT::v2i1, MVT::v2i64, 2 }, // zmm vpsllq+vptestmq
  1484. { ISD::TRUNCATE, MVT::v4i1, MVT::v4i64, 2 }, // zmm vpsllq+vptestmq
  1485. { ISD::TRUNCATE, MVT::v8i1, MVT::v8i64, 2 }, // vpsllq+vptestmq
  1486. { ISD::TRUNCATE, MVT::v2i8, MVT::v2i32, 2 }, // vpmovdb
  1487. { ISD::TRUNCATE, MVT::v4i8, MVT::v4i32, 2 }, // vpmovdb
  1488. { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 2 }, // vpmovdb
  1489. { ISD::TRUNCATE, MVT::v32i8, MVT::v16i32, 2 }, // vpmovdb
  1490. { ISD::TRUNCATE, MVT::v64i8, MVT::v16i32, 2 }, // vpmovdb
  1491. { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, 2 }, // vpmovdw
  1492. { ISD::TRUNCATE, MVT::v32i16, MVT::v16i32, 2 }, // vpmovdw
  1493. { ISD::TRUNCATE, MVT::v2i8, MVT::v2i64, 2 }, // vpmovqb
  1494. { ISD::TRUNCATE, MVT::v2i16, MVT::v2i64, 1 }, // vpshufb
  1495. { ISD::TRUNCATE, MVT::v8i8, MVT::v8i64, 2 }, // vpmovqb
  1496. { ISD::TRUNCATE, MVT::v16i8, MVT::v8i64, 2 }, // vpmovqb
  1497. { ISD::TRUNCATE, MVT::v32i8, MVT::v8i64, 2 }, // vpmovqb
  1498. { ISD::TRUNCATE, MVT::v64i8, MVT::v8i64, 2 }, // vpmovqb
  1499. { ISD::TRUNCATE, MVT::v8i16, MVT::v8i64, 2 }, // vpmovqw
  1500. { ISD::TRUNCATE, MVT::v16i16, MVT::v8i64, 2 }, // vpmovqw
  1501. { ISD::TRUNCATE, MVT::v32i16, MVT::v8i64, 2 }, // vpmovqw
  1502. { ISD::TRUNCATE, MVT::v8i32, MVT::v8i64, 1 }, // vpmovqd
  1503. { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 1 }, // zmm vpmovqd
  1504. { ISD::TRUNCATE, MVT::v16i8, MVT::v16i64, 5 },// 2*vpmovqd+concat+vpmovdb
  1505. { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 3 }, // extend to v16i32
  1506. { ISD::TRUNCATE, MVT::v32i8, MVT::v32i16, 8 },
  1507. { ISD::TRUNCATE, MVT::v64i8, MVT::v32i16, 8 },
  1508. // Sign extend is zmm vpternlogd+vptruncdb.
  1509. // Zero extend is zmm broadcast load+vptruncdw.
  1510. { ISD::SIGN_EXTEND, MVT::v2i8, MVT::v2i1, 3 },
  1511. { ISD::ZERO_EXTEND, MVT::v2i8, MVT::v2i1, 4 },
  1512. { ISD::SIGN_EXTEND, MVT::v4i8, MVT::v4i1, 3 },
  1513. { ISD::ZERO_EXTEND, MVT::v4i8, MVT::v4i1, 4 },
  1514. { ISD::SIGN_EXTEND, MVT::v8i8, MVT::v8i1, 3 },
  1515. { ISD::ZERO_EXTEND, MVT::v8i8, MVT::v8i1, 4 },
  1516. { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v16i1, 3 },
  1517. { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v16i1, 4 },
  1518. // Sign extend is zmm vpternlogd+vptruncdw.
  1519. // Zero extend is zmm vpternlogd+vptruncdw+vpsrlw.
  1520. { ISD::SIGN_EXTEND, MVT::v2i16, MVT::v2i1, 3 },
  1521. { ISD::ZERO_EXTEND, MVT::v2i16, MVT::v2i1, 4 },
  1522. { ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i1, 3 },
  1523. { ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i1, 4 },
  1524. { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i1, 3 },
  1525. { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i1, 4 },
  1526. { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, 3 },
  1527. { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, 4 },
  1528. { ISD::SIGN_EXTEND, MVT::v2i32, MVT::v2i1, 1 }, // zmm vpternlogd
  1529. { ISD::ZERO_EXTEND, MVT::v2i32, MVT::v2i1, 2 }, // zmm vpternlogd+psrld
  1530. { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i1, 1 }, // zmm vpternlogd
  1531. { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i1, 2 }, // zmm vpternlogd+psrld
  1532. { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, 1 }, // zmm vpternlogd
  1533. { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, 2 }, // zmm vpternlogd+psrld
  1534. { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i1, 1 }, // zmm vpternlogq
  1535. { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i1, 2 }, // zmm vpternlogq+psrlq
  1536. { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, 1 }, // zmm vpternlogq
  1537. { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, 2 }, // zmm vpternlogq+psrlq
  1538. { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i1, 1 }, // vpternlogd
  1539. { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i1, 2 }, // vpternlogd+psrld
  1540. { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i1, 1 }, // vpternlogq
  1541. { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i1, 2 }, // vpternlogq+psrlq
  1542. { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 1 },
  1543. { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 1 },
  1544. { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i16, 1 },
  1545. { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i16, 1 },
  1546. { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i8, 1 },
  1547. { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i8, 1 },
  1548. { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i16, 1 },
  1549. { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i16, 1 },
  1550. { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i32, 1 },
  1551. { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i32, 1 },
  1552. { ISD::SIGN_EXTEND, MVT::v32i16, MVT::v32i8, 3 }, // FIXME: May not be right
  1553. { ISD::ZERO_EXTEND, MVT::v32i16, MVT::v32i8, 3 }, // FIXME: May not be right
  1554. { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i1, 4 },
  1555. { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i1, 3 },
  1556. { ISD::SINT_TO_FP, MVT::v8f64, MVT::v16i8, 2 },
  1557. { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i8, 1 },
  1558. { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i16, 2 },
  1559. { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i16, 1 },
  1560. { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i32, 1 },
  1561. { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i32, 1 },
  1562. { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i1, 4 },
  1563. { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i1, 3 },
  1564. { ISD::UINT_TO_FP, MVT::v8f64, MVT::v16i8, 2 },
  1565. { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i8, 1 },
  1566. { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i16, 2 },
  1567. { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i16, 1 },
  1568. { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i32, 1 },
  1569. { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i32, 1 },
  1570. { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i64, 26 },
  1571. { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i64, 5 },
  1572. { ISD::FP_TO_SINT, MVT::v16i8, MVT::v16f32, 2 },
  1573. { ISD::FP_TO_SINT, MVT::v16i8, MVT::v16f64, 7 },
  1574. { ISD::FP_TO_SINT, MVT::v32i8, MVT::v32f64,15 },
  1575. { ISD::FP_TO_SINT, MVT::v64i8, MVT::v64f32,11 },
  1576. { ISD::FP_TO_SINT, MVT::v64i8, MVT::v64f64,31 },
  1577. { ISD::FP_TO_SINT, MVT::v8i16, MVT::v8f64, 3 },
  1578. { ISD::FP_TO_SINT, MVT::v16i16, MVT::v16f64, 7 },
  1579. { ISD::FP_TO_SINT, MVT::v32i16, MVT::v32f32, 5 },
  1580. { ISD::FP_TO_SINT, MVT::v32i16, MVT::v32f64,15 },
  1581. { ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f64, 1 },
  1582. { ISD::FP_TO_SINT, MVT::v16i32, MVT::v16f64, 3 },
  1583. { ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f64, 1 },
  1584. { ISD::FP_TO_UINT, MVT::v8i16, MVT::v8f64, 3 },
  1585. { ISD::FP_TO_UINT, MVT::v8i8, MVT::v8f64, 3 },
  1586. { ISD::FP_TO_UINT, MVT::v16i32, MVT::v16f32, 1 },
  1587. { ISD::FP_TO_UINT, MVT::v16i16, MVT::v16f32, 3 },
  1588. { ISD::FP_TO_UINT, MVT::v16i8, MVT::v16f32, 3 },
  1589. };
  1590. static const TypeConversionCostTblEntry AVX512BWVLConversionTbl[] {
  1591. // Mask sign extend has an instruction.
  1592. { ISD::SIGN_EXTEND, MVT::v2i8, MVT::v2i1, 1 },
  1593. { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v2i1, 1 },
  1594. { ISD::SIGN_EXTEND, MVT::v2i16, MVT::v2i1, 1 },
  1595. { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v2i1, 1 },
  1596. { ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i1, 1 },
  1597. { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v4i1, 1 },
  1598. { ISD::SIGN_EXTEND, MVT::v4i8, MVT::v4i1, 1 },
  1599. { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v4i1, 1 },
  1600. { ISD::SIGN_EXTEND, MVT::v8i8, MVT::v8i1, 1 },
  1601. { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v8i1, 1 },
  1602. { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i1, 1 },
  1603. { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v16i1, 1 },
  1604. { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, 1 },
  1605. { ISD::SIGN_EXTEND, MVT::v32i8, MVT::v32i1, 1 },
  1606. { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v32i1, 1 },
  1607. { ISD::SIGN_EXTEND, MVT::v32i8, MVT::v64i1, 1 },
  1608. { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v64i1, 1 },
  1609. // Mask zero extend is a sext + shift.
  1610. { ISD::ZERO_EXTEND, MVT::v2i8, MVT::v2i1, 2 },
  1611. { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v2i1, 2 },
  1612. { ISD::ZERO_EXTEND, MVT::v2i16, MVT::v2i1, 2 },
  1613. { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v2i1, 2 },
  1614. { ISD::ZERO_EXTEND, MVT::v4i8, MVT::v4i1, 2 },
  1615. { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v4i1, 2 },
  1616. { ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i1, 2 },
  1617. { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v4i1, 2 },
  1618. { ISD::ZERO_EXTEND, MVT::v8i8, MVT::v8i1, 2 },
  1619. { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v8i1, 2 },
  1620. { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i1, 2 },
  1621. { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v16i1, 2 },
  1622. { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, 2 },
  1623. { ISD::ZERO_EXTEND, MVT::v32i8, MVT::v32i1, 2 },
  1624. { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v32i1, 2 },
  1625. { ISD::ZERO_EXTEND, MVT::v32i8, MVT::v64i1, 2 },
  1626. { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v64i1, 2 },
  1627. { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, 2 },
  1628. { ISD::TRUNCATE, MVT::v2i1, MVT::v16i8, 2 },
  1629. { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, 2 },
  1630. { ISD::TRUNCATE, MVT::v2i1, MVT::v8i16, 2 },
  1631. { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, 2 },
  1632. { ISD::TRUNCATE, MVT::v4i1, MVT::v16i8, 2 },
  1633. { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, 2 },
  1634. { ISD::TRUNCATE, MVT::v4i1, MVT::v8i16, 2 },
  1635. { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, 2 },
  1636. { ISD::TRUNCATE, MVT::v8i1, MVT::v16i8, 2 },
  1637. { ISD::TRUNCATE, MVT::v8i1, MVT::v8i16, 2 },
  1638. { ISD::TRUNCATE, MVT::v16i1, MVT::v16i8, 2 },
  1639. { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, 2 },
  1640. { ISD::TRUNCATE, MVT::v32i1, MVT::v32i8, 2 },
  1641. { ISD::TRUNCATE, MVT::v32i1, MVT::v16i16, 2 },
  1642. { ISD::TRUNCATE, MVT::v64i1, MVT::v32i8, 2 },
  1643. { ISD::TRUNCATE, MVT::v64i1, MVT::v16i16, 2 },
  1644. { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 2 },
  1645. };
  1646. static const TypeConversionCostTblEntry AVX512DQVLConversionTbl[] = {
  1647. // Mask sign extend has an instruction.
  1648. { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i1, 1 },
  1649. { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v2i1, 1 },
  1650. { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i1, 1 },
  1651. { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v16i1, 1 },
  1652. { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, 1 },
  1653. { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v8i1, 1 },
  1654. { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v16i1, 1 },
  1655. { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, 1 },
  1656. // Mask zero extend is a sext + shift.
  1657. { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i1, 2 },
  1658. { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v2i1, 2 },
  1659. { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i1, 2 },
  1660. { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v16i1, 2 },
  1661. { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, 2 },
  1662. { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v8i1, 2 },
  1663. { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v16i1, 2 },
  1664. { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, 2 },
  1665. { ISD::TRUNCATE, MVT::v16i1, MVT::v4i64, 2 },
  1666. { ISD::TRUNCATE, MVT::v16i1, MVT::v8i32, 2 },
  1667. { ISD::TRUNCATE, MVT::v2i1, MVT::v2i64, 2 },
  1668. { ISD::TRUNCATE, MVT::v2i1, MVT::v4i32, 2 },
  1669. { ISD::TRUNCATE, MVT::v4i1, MVT::v4i32, 2 },
  1670. { ISD::TRUNCATE, MVT::v4i1, MVT::v4i64, 2 },
  1671. { ISD::TRUNCATE, MVT::v8i1, MVT::v4i64, 2 },
  1672. { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, 2 },
  1673. { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i64, 1 },
  1674. { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i64, 1 },
  1675. { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i64, 1 },
  1676. { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i64, 1 },
  1677. { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i64, 1 },
  1678. { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 1 },
  1679. { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i64, 1 },
  1680. { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i64, 1 },
  1681. { ISD::FP_TO_SINT, MVT::v2i64, MVT::v4f32, 1 },
  1682. { ISD::FP_TO_SINT, MVT::v4i64, MVT::v4f32, 1 },
  1683. { ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f64, 1 },
  1684. { ISD::FP_TO_SINT, MVT::v4i64, MVT::v4f64, 1 },
  1685. { ISD::FP_TO_UINT, MVT::v2i64, MVT::v4f32, 1 },
  1686. { ISD::FP_TO_UINT, MVT::v4i64, MVT::v4f32, 1 },
  1687. { ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f64, 1 },
  1688. { ISD::FP_TO_UINT, MVT::v4i64, MVT::v4f64, 1 },
  1689. };
  1690. static const TypeConversionCostTblEntry AVX512VLConversionTbl[] = {
  1691. { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, 3 }, // sext+vpslld+vptestmd
  1692. { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, 3 }, // sext+vpslld+vptestmd
  1693. { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, 3 }, // sext+vpslld+vptestmd
  1694. { ISD::TRUNCATE, MVT::v16i1, MVT::v16i8, 8 }, // split+2*v8i8
  1695. { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, 3 }, // sext+vpsllq+vptestmq
  1696. { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, 3 }, // sext+vpsllq+vptestmq
  1697. { ISD::TRUNCATE, MVT::v8i1, MVT::v8i16, 3 }, // sext+vpsllq+vptestmq
  1698. { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, 8 }, // split+2*v8i16
  1699. { ISD::TRUNCATE, MVT::v2i1, MVT::v2i32, 2 }, // vpslld+vptestmd
  1700. { ISD::TRUNCATE, MVT::v4i1, MVT::v4i32, 2 }, // vpslld+vptestmd
  1701. { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, 2 }, // vpslld+vptestmd
  1702. { ISD::TRUNCATE, MVT::v2i1, MVT::v2i64, 2 }, // vpsllq+vptestmq
  1703. { ISD::TRUNCATE, MVT::v4i1, MVT::v4i64, 2 }, // vpsllq+vptestmq
  1704. { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 1 }, // vpmovqd
  1705. { ISD::TRUNCATE, MVT::v4i8, MVT::v4i64, 2 }, // vpmovqb
  1706. { ISD::TRUNCATE, MVT::v4i16, MVT::v4i64, 2 }, // vpmovqw
  1707. { ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, 2 }, // vpmovwb
  1708. // sign extend is vpcmpeq+maskedmove+vpmovdw+vpacksswb
  1709. // zero extend is vpcmpeq+maskedmove+vpmovdw+vpsrlw+vpackuswb
  1710. { ISD::SIGN_EXTEND, MVT::v2i8, MVT::v2i1, 5 },
  1711. { ISD::ZERO_EXTEND, MVT::v2i8, MVT::v2i1, 6 },
  1712. { ISD::SIGN_EXTEND, MVT::v4i8, MVT::v4i1, 5 },
  1713. { ISD::ZERO_EXTEND, MVT::v4i8, MVT::v4i1, 6 },
  1714. { ISD::SIGN_EXTEND, MVT::v8i8, MVT::v8i1, 5 },
  1715. { ISD::ZERO_EXTEND, MVT::v8i8, MVT::v8i1, 6 },
  1716. { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v16i1, 10 },
  1717. { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v16i1, 12 },
  1718. // sign extend is vpcmpeq+maskedmove+vpmovdw
  1719. // zero extend is vpcmpeq+maskedmove+vpmovdw+vpsrlw
  1720. { ISD::SIGN_EXTEND, MVT::v2i16, MVT::v2i1, 4 },
  1721. { ISD::ZERO_EXTEND, MVT::v2i16, MVT::v2i1, 5 },
  1722. { ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i1, 4 },
  1723. { ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i1, 5 },
  1724. { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i1, 4 },
  1725. { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i1, 5 },
  1726. { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, 10 },
  1727. { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, 12 },
  1728. { ISD::SIGN_EXTEND, MVT::v2i32, MVT::v2i1, 1 }, // vpternlogd
  1729. { ISD::ZERO_EXTEND, MVT::v2i32, MVT::v2i1, 2 }, // vpternlogd+psrld
  1730. { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i1, 1 }, // vpternlogd
  1731. { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i1, 2 }, // vpternlogd+psrld
  1732. { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, 1 }, // vpternlogd
  1733. { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, 2 }, // vpternlogd+psrld
  1734. { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i1, 1 }, // vpternlogq
  1735. { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i1, 2 }, // vpternlogq+psrlq
  1736. { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, 1 }, // vpternlogq
  1737. { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, 2 }, // vpternlogq+psrlq
  1738. { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v16i8, 1 },
  1739. { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v16i8, 1 },
  1740. { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v16i8, 1 },
  1741. { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v16i8, 1 },
  1742. { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 1 },
  1743. { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 1 },
  1744. { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v8i16, 1 },
  1745. { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v8i16, 1 },
  1746. { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 1 },
  1747. { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 1 },
  1748. { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 1 },
  1749. { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, 1 },
  1750. { ISD::SINT_TO_FP, MVT::v2f64, MVT::v16i8, 1 },
  1751. { ISD::SINT_TO_FP, MVT::v8f32, MVT::v16i8, 1 },
  1752. { ISD::SINT_TO_FP, MVT::v2f64, MVT::v8i16, 1 },
  1753. { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, 1 },
  1754. { ISD::UINT_TO_FP, MVT::f32, MVT::i64, 1 },
  1755. { ISD::UINT_TO_FP, MVT::f64, MVT::i64, 1 },
  1756. { ISD::UINT_TO_FP, MVT::v2f64, MVT::v16i8, 1 },
  1757. { ISD::UINT_TO_FP, MVT::v8f32, MVT::v16i8, 1 },
  1758. { ISD::UINT_TO_FP, MVT::v2f64, MVT::v8i16, 1 },
  1759. { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, 1 },
  1760. { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 1 },
  1761. { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 },
  1762. { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i32, 1 },
  1763. { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i32, 1 },
  1764. { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i64, 5 },
  1765. { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 5 },
  1766. { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i64, 5 },
  1767. { ISD::FP_TO_SINT, MVT::v16i8, MVT::v8f32, 2 },
  1768. { ISD::FP_TO_SINT, MVT::v16i8, MVT::v16f32, 2 },
  1769. { ISD::FP_TO_SINT, MVT::v32i8, MVT::v32f32, 5 },
  1770. { ISD::FP_TO_UINT, MVT::i64, MVT::f32, 1 },
  1771. { ISD::FP_TO_UINT, MVT::i64, MVT::f64, 1 },
  1772. { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 1 },
  1773. { ISD::FP_TO_UINT, MVT::v4i32, MVT::v2f64, 1 },
  1774. { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f64, 1 },
  1775. { ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f32, 1 },
  1776. { ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f64, 1 },
  1777. };
  1778. static const TypeConversionCostTblEntry AVX2ConversionTbl[] = {
  1779. { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, 3 },
  1780. { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, 3 },
  1781. { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, 3 },
  1782. { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, 3 },
  1783. { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, 1 },
  1784. { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, 1 },
  1785. { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v16i8, 2 },
  1786. { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v16i8, 2 },
  1787. { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v16i8, 2 },
  1788. { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v16i8, 2 },
  1789. { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 2 },
  1790. { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 2 },
  1791. { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v8i16, 2 },
  1792. { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v8i16, 2 },
  1793. { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 2 },
  1794. { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 2 },
  1795. { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i16, 3 },
  1796. { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i16, 3 },
  1797. { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 2 },
  1798. { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, 2 },
  1799. { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, 2 },
  1800. { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, 4 },
  1801. { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 4 },
  1802. { ISD::TRUNCATE, MVT::v16i8, MVT::v8i16, 1 },
  1803. { ISD::TRUNCATE, MVT::v16i8, MVT::v4i32, 1 },
  1804. { ISD::TRUNCATE, MVT::v16i8, MVT::v2i64, 1 },
  1805. { ISD::TRUNCATE, MVT::v16i8, MVT::v8i32, 4 },
  1806. { ISD::TRUNCATE, MVT::v16i8, MVT::v4i64, 4 },
  1807. { ISD::TRUNCATE, MVT::v8i16, MVT::v4i32, 1 },
  1808. { ISD::TRUNCATE, MVT::v8i16, MVT::v2i64, 1 },
  1809. { ISD::TRUNCATE, MVT::v8i16, MVT::v4i64, 5 },
  1810. { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 1 },
  1811. { ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, 2 },
  1812. { ISD::FP_EXTEND, MVT::v8f64, MVT::v8f32, 3 },
  1813. { ISD::FP_ROUND, MVT::v8f32, MVT::v8f64, 3 },
  1814. { ISD::FP_TO_SINT, MVT::v16i16, MVT::v8f32, 1 },
  1815. { ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f64, 1 },
  1816. { ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f32, 1 },
  1817. { ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f64, 3 },
  1818. { ISD::FP_TO_UINT, MVT::i64, MVT::f32, 3 },
  1819. { ISD::FP_TO_UINT, MVT::i64, MVT::f64, 3 },
  1820. { ISD::FP_TO_UINT, MVT::v16i16, MVT::v8f32, 1 },
  1821. { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 3 },
  1822. { ISD::FP_TO_UINT, MVT::v4i32, MVT::v2f64, 4 },
  1823. { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f64, 4 },
  1824. { ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f32, 3 },
  1825. { ISD::FP_TO_UINT, MVT::v8i32, MVT::v4f64, 4 },
  1826. { ISD::SINT_TO_FP, MVT::v2f64, MVT::v16i8, 2 },
  1827. { ISD::SINT_TO_FP, MVT::v8f32, MVT::v16i8, 2 },
  1828. { ISD::SINT_TO_FP, MVT::v2f64, MVT::v8i16, 2 },
  1829. { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, 2 },
  1830. { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i32, 1 },
  1831. { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i32, 1 },
  1832. { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i32, 3 },
  1833. { ISD::UINT_TO_FP, MVT::v2f64, MVT::v16i8, 2 },
  1834. { ISD::UINT_TO_FP, MVT::v8f32, MVT::v16i8, 2 },
  1835. { ISD::UINT_TO_FP, MVT::v2f64, MVT::v8i16, 2 },
  1836. { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, 2 },
  1837. { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 2 },
  1838. { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, 1 },
  1839. { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 2 },
  1840. { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i32, 2 },
  1841. { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i32, 2 },
  1842. { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i32, 4 },
  1843. };
  1844. static const TypeConversionCostTblEntry AVXConversionTbl[] = {
  1845. { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, 6 },
  1846. { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, 4 },
  1847. { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, 7 },
  1848. { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, 4 },
  1849. { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, 4 },
  1850. { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, 4 },
  1851. { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v16i8, 3 },
  1852. { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v16i8, 3 },
  1853. { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v16i8, 3 },
  1854. { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v16i8, 3 },
  1855. { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 3 },
  1856. { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 3 },
  1857. { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v8i16, 3 },
  1858. { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v8i16, 3 },
  1859. { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 3 },
  1860. { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 3 },
  1861. { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 3 },
  1862. { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, 3 },
  1863. { ISD::TRUNCATE, MVT::v4i1, MVT::v4i64, 4 },
  1864. { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, 5 },
  1865. { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, 4 },
  1866. { ISD::TRUNCATE, MVT::v8i1, MVT::v8i64, 9 },
  1867. { ISD::TRUNCATE, MVT::v16i1, MVT::v16i64, 11 },
  1868. { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, 6 },
  1869. { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 6 },
  1870. { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 2 }, // and+extract+packuswb
  1871. { ISD::TRUNCATE, MVT::v16i8, MVT::v8i32, 5 },
  1872. { ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, 5 },
  1873. { ISD::TRUNCATE, MVT::v16i8, MVT::v4i64, 5 },
  1874. { ISD::TRUNCATE, MVT::v8i16, MVT::v4i64, 3 }, // and+extract+2*packusdw
  1875. { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 2 },
  1876. { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i1, 3 },
  1877. { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i1, 3 },
  1878. { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i1, 8 },
  1879. { ISD::SINT_TO_FP, MVT::v8f32, MVT::v16i8, 4 },
  1880. { ISD::SINT_TO_FP, MVT::v4f64, MVT::v16i8, 2 },
  1881. { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, 4 },
  1882. { ISD::SINT_TO_FP, MVT::v4f64, MVT::v8i16, 2 },
  1883. { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i32, 2 },
  1884. { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i32, 2 },
  1885. { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i32, 4 },
  1886. { ISD::SINT_TO_FP, MVT::v4f32, MVT::v2i64, 5 },
  1887. { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i64, 8 },
  1888. { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i1, 7 },
  1889. { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i1, 7 },
  1890. { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i1, 6 },
  1891. { ISD::UINT_TO_FP, MVT::v8f32, MVT::v16i8, 4 },
  1892. { ISD::UINT_TO_FP, MVT::v4f64, MVT::v16i8, 2 },
  1893. { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, 4 },
  1894. { ISD::UINT_TO_FP, MVT::v4f64, MVT::v8i16, 2 },
  1895. { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 4 },
  1896. { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, 4 },
  1897. { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 5 },
  1898. { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i32, 6 },
  1899. { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i32, 8 },
  1900. { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i32, 10 },
  1901. { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i64, 10 },
  1902. { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i64, 18 },
  1903. { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 5 },
  1904. { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i64, 10 },
  1905. { ISD::FP_TO_SINT, MVT::v16i8, MVT::v8f32, 2 },
  1906. { ISD::FP_TO_SINT, MVT::v16i8, MVT::v4f64, 2 },
  1907. { ISD::FP_TO_SINT, MVT::v32i8, MVT::v8f32, 2 },
  1908. { ISD::FP_TO_SINT, MVT::v32i8, MVT::v4f64, 2 },
  1909. { ISD::FP_TO_SINT, MVT::v8i16, MVT::v8f32, 2 },
  1910. { ISD::FP_TO_SINT, MVT::v8i16, MVT::v4f64, 2 },
  1911. { ISD::FP_TO_SINT, MVT::v16i16, MVT::v8f32, 2 },
  1912. { ISD::FP_TO_SINT, MVT::v16i16, MVT::v4f64, 2 },
  1913. { ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f64, 2 },
  1914. { ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f32, 2 },
  1915. { ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f64, 5 },
  1916. { ISD::FP_TO_UINT, MVT::v16i8, MVT::v8f32, 2 },
  1917. { ISD::FP_TO_UINT, MVT::v16i8, MVT::v4f64, 2 },
  1918. { ISD::FP_TO_UINT, MVT::v32i8, MVT::v8f32, 2 },
  1919. { ISD::FP_TO_UINT, MVT::v32i8, MVT::v4f64, 2 },
  1920. { ISD::FP_TO_UINT, MVT::v8i16, MVT::v8f32, 2 },
  1921. { ISD::FP_TO_UINT, MVT::v8i16, MVT::v4f64, 2 },
  1922. { ISD::FP_TO_UINT, MVT::v16i16, MVT::v8f32, 2 },
  1923. { ISD::FP_TO_UINT, MVT::v16i16, MVT::v4f64, 2 },
  1924. { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 3 },
  1925. { ISD::FP_TO_UINT, MVT::v4i32, MVT::v2f64, 4 },
  1926. { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f64, 6 },
  1927. { ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f32, 7 },
  1928. { ISD::FP_TO_UINT, MVT::v8i32, MVT::v4f64, 7 },
  1929. { ISD::FP_EXTEND, MVT::v4f64, MVT::v4f32, 1 },
  1930. { ISD::FP_ROUND, MVT::v4f32, MVT::v4f64, 1 },
  1931. };
  1932. static const TypeConversionCostTblEntry SSE41ConversionTbl[] = {
  1933. { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v16i8, 1 },
  1934. { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v16i8, 1 },
  1935. { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v16i8, 1 },
  1936. { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v16i8, 1 },
  1937. { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v16i8, 1 },
  1938. { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v16i8, 1 },
  1939. { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v8i16, 1 },
  1940. { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v8i16, 1 },
  1941. { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v8i16, 1 },
  1942. { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v8i16, 1 },
  1943. { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v4i32, 1 },
  1944. { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v4i32, 1 },
  1945. // These truncates end up widening elements.
1946. { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, 1 }, // PMOVZXBQ
1947. { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, 1 }, // PMOVZXWQ
1948. { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, 1 }, // PMOVZXBD
  1949. { ISD::TRUNCATE, MVT::v16i8, MVT::v4i32, 2 },
  1950. { ISD::TRUNCATE, MVT::v8i16, MVT::v4i32, 2 },
  1951. { ISD::TRUNCATE, MVT::v16i8, MVT::v2i64, 2 },
  1952. { ISD::SINT_TO_FP, MVT::f32, MVT::i32, 1 },
  1953. { ISD::SINT_TO_FP, MVT::f64, MVT::i32, 1 },
  1954. { ISD::SINT_TO_FP, MVT::f32, MVT::i64, 1 },
  1955. { ISD::SINT_TO_FP, MVT::f64, MVT::i64, 1 },
  1956. { ISD::SINT_TO_FP, MVT::v4f32, MVT::v16i8, 1 },
  1957. { ISD::SINT_TO_FP, MVT::v2f64, MVT::v16i8, 1 },
  1958. { ISD::SINT_TO_FP, MVT::v4f32, MVT::v8i16, 1 },
  1959. { ISD::SINT_TO_FP, MVT::v2f64, MVT::v8i16, 1 },
  1960. { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 },
  1961. { ISD::SINT_TO_FP, MVT::v2f64, MVT::v4i32, 1 },
  1962. { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i32, 2 },
  1963. { ISD::UINT_TO_FP, MVT::f32, MVT::i32, 1 },
  1964. { ISD::UINT_TO_FP, MVT::f64, MVT::i32, 1 },
  1965. { ISD::UINT_TO_FP, MVT::f32, MVT::i64, 4 },
  1966. { ISD::UINT_TO_FP, MVT::f64, MVT::i64, 4 },
  1967. { ISD::UINT_TO_FP, MVT::v4f32, MVT::v16i8, 1 },
  1968. { ISD::UINT_TO_FP, MVT::v2f64, MVT::v16i8, 1 },
  1969. { ISD::UINT_TO_FP, MVT::v4f32, MVT::v8i16, 1 },
  1970. { ISD::UINT_TO_FP, MVT::v2f64, MVT::v8i16, 1 },
  1971. { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 3 },
  1972. { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 3 },
  1973. { ISD::UINT_TO_FP, MVT::v2f64, MVT::v4i32, 2 },
  1974. { ISD::UINT_TO_FP, MVT::v4f32, MVT::v2i64, 12 },
  1975. { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i64, 22 },
  1976. { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 4 },
  1977. { ISD::FP_TO_SINT, MVT::i32, MVT::f32, 1 },
  1978. { ISD::FP_TO_SINT, MVT::i64, MVT::f32, 1 },
  1979. { ISD::FP_TO_SINT, MVT::i32, MVT::f64, 1 },
  1980. { ISD::FP_TO_SINT, MVT::i64, MVT::f64, 1 },
  1981. { ISD::FP_TO_SINT, MVT::v16i8, MVT::v4f32, 2 },
  1982. { ISD::FP_TO_SINT, MVT::v16i8, MVT::v2f64, 2 },
  1983. { ISD::FP_TO_SINT, MVT::v8i16, MVT::v4f32, 1 },
  1984. { ISD::FP_TO_SINT, MVT::v8i16, MVT::v2f64, 1 },
  1985. { ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f32, 1 },
  1986. { ISD::FP_TO_SINT, MVT::v4i32, MVT::v2f64, 1 },
  1987. { ISD::FP_TO_UINT, MVT::i32, MVT::f32, 1 },
  1988. { ISD::FP_TO_UINT, MVT::i64, MVT::f32, 4 },
  1989. { ISD::FP_TO_UINT, MVT::i32, MVT::f64, 1 },
  1990. { ISD::FP_TO_UINT, MVT::i64, MVT::f64, 4 },
  1991. { ISD::FP_TO_UINT, MVT::v16i8, MVT::v4f32, 2 },
  1992. { ISD::FP_TO_UINT, MVT::v16i8, MVT::v2f64, 2 },
  1993. { ISD::FP_TO_UINT, MVT::v8i16, MVT::v4f32, 1 },
  1994. { ISD::FP_TO_UINT, MVT::v8i16, MVT::v2f64, 1 },
  1995. { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 4 },
  1996. { ISD::FP_TO_UINT, MVT::v4i32, MVT::v2f64, 4 },
  1997. };
  1998. static const TypeConversionCostTblEntry SSE2ConversionTbl[] = {
  1999. // These are somewhat magic numbers justified by comparing the
  2000. // output of llvm-mca for our various supported scheduler models
  2001. // and basing it off the worst case scenario.
  2002. { ISD::SINT_TO_FP, MVT::f32, MVT::i32, 3 },
  2003. { ISD::SINT_TO_FP, MVT::f64, MVT::i32, 3 },
  2004. { ISD::SINT_TO_FP, MVT::f32, MVT::i64, 3 },
  2005. { ISD::SINT_TO_FP, MVT::f64, MVT::i64, 3 },
  2006. { ISD::SINT_TO_FP, MVT::v4f32, MVT::v16i8, 3 },
  2007. { ISD::SINT_TO_FP, MVT::v2f64, MVT::v16i8, 4 },
  2008. { ISD::SINT_TO_FP, MVT::v4f32, MVT::v8i16, 3 },
  2009. { ISD::SINT_TO_FP, MVT::v2f64, MVT::v8i16, 4 },
  2010. { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 3 },
  2011. { ISD::SINT_TO_FP, MVT::v2f64, MVT::v4i32, 4 },
  2012. { ISD::SINT_TO_FP, MVT::v4f32, MVT::v2i64, 8 },
  2013. { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i64, 8 },
  2014. { ISD::UINT_TO_FP, MVT::f32, MVT::i32, 3 },
  2015. { ISD::UINT_TO_FP, MVT::f64, MVT::i32, 3 },
  2016. { ISD::UINT_TO_FP, MVT::f32, MVT::i64, 8 },
  2017. { ISD::UINT_TO_FP, MVT::f64, MVT::i64, 9 },
  2018. { ISD::UINT_TO_FP, MVT::v2f64, MVT::v16i8, 4 },
  2019. { ISD::UINT_TO_FP, MVT::v4f32, MVT::v16i8, 4 },
  2020. { ISD::UINT_TO_FP, MVT::v4f32, MVT::v8i16, 4 },
  2021. { ISD::UINT_TO_FP, MVT::v2f64, MVT::v8i16, 4 },
  2022. { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 7 },
  2023. { ISD::UINT_TO_FP, MVT::v2f64, MVT::v4i32, 7 },
  2024. { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 5 },
  2025. { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 15 },
  2026. { ISD::UINT_TO_FP, MVT::v4f32, MVT::v2i64, 18 },
  2027. { ISD::FP_TO_SINT, MVT::i32, MVT::f32, 4 },
  2028. { ISD::FP_TO_SINT, MVT::i64, MVT::f32, 4 },
  2029. { ISD::FP_TO_SINT, MVT::i32, MVT::f64, 4 },
  2030. { ISD::FP_TO_SINT, MVT::i64, MVT::f64, 4 },
  2031. { ISD::FP_TO_SINT, MVT::v16i8, MVT::v4f32, 6 },
  2032. { ISD::FP_TO_SINT, MVT::v16i8, MVT::v2f64, 6 },
  2033. { ISD::FP_TO_SINT, MVT::v8i16, MVT::v4f32, 5 },
  2034. { ISD::FP_TO_SINT, MVT::v8i16, MVT::v2f64, 5 },
  2035. { ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f32, 4 },
  2036. { ISD::FP_TO_SINT, MVT::v4i32, MVT::v2f64, 4 },
  2037. { ISD::FP_TO_UINT, MVT::i32, MVT::f32, 4 },
  2038. { ISD::FP_TO_UINT, MVT::i64, MVT::f32, 4 },
  2039. { ISD::FP_TO_UINT, MVT::i32, MVT::f64, 4 },
  2040. { ISD::FP_TO_UINT, MVT::i64, MVT::f64, 15 },
  2041. { ISD::FP_TO_UINT, MVT::v16i8, MVT::v4f32, 6 },
  2042. { ISD::FP_TO_UINT, MVT::v16i8, MVT::v2f64, 6 },
  2043. { ISD::FP_TO_UINT, MVT::v8i16, MVT::v4f32, 5 },
  2044. { ISD::FP_TO_UINT, MVT::v8i16, MVT::v2f64, 5 },
  2045. { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 8 },
  2046. { ISD::FP_TO_UINT, MVT::v4i32, MVT::v2f64, 8 },
  2047. { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v16i8, 4 },
  2048. { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v16i8, 4 },
  2049. { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v16i8, 2 },
  2050. { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v16i8, 3 },
  2051. { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v16i8, 1 },
  2052. { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v16i8, 2 },
  2053. { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v8i16, 2 },
  2054. { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v8i16, 3 },
  2055. { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v8i16, 1 },
  2056. { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v8i16, 2 },
  2057. { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v4i32, 1 },
  2058. { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v4i32, 2 },
  2059. // These truncates are really widening elements.
  2060. { ISD::TRUNCATE, MVT::v2i1, MVT::v2i32, 1 }, // PSHUFD
  2061. { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, 2 }, // PUNPCKLWD+DQ
  2062. { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, 3 }, // PUNPCKLBW+WD+PSHUFD
  2063. { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, 1 }, // PUNPCKLWD
  2064. { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, 2 }, // PUNPCKLBW+WD
  2065. { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, 1 }, // PUNPCKLBW
  2066. { ISD::TRUNCATE, MVT::v16i8, MVT::v8i16, 2 }, // PAND+PACKUSWB
  2067. { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 3 },
  2068. { ISD::TRUNCATE, MVT::v16i8, MVT::v4i32, 3 }, // PAND+2*PACKUSWB
  2069. { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 7 },
  2070. { ISD::TRUNCATE, MVT::v2i16, MVT::v2i32, 1 },
  2071. { ISD::TRUNCATE, MVT::v8i16, MVT::v4i32, 3 },
  2072. { ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, 5 },
2073. { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, 10 },
  2074. { ISD::TRUNCATE, MVT::v16i8, MVT::v2i64, 4 }, // PAND+3*PACKUSWB
  2075. { ISD::TRUNCATE, MVT::v8i16, MVT::v2i64, 2 }, // PSHUFD+PSHUFLW
  2076. { ISD::TRUNCATE, MVT::v4i32, MVT::v2i64, 1 }, // PSHUFD
  2077. };
  2078. // Attempt to map directly to (simple) MVT types to let us match custom entries.
  2079. EVT SrcTy = TLI->getValueType(DL, Src);
  2080. EVT DstTy = TLI->getValueType(DL, Dst);
  2081. // The function getSimpleVT only handles simple value types.
  2082. if (SrcTy.isSimple() && DstTy.isSimple()) {
  2083. MVT SimpleSrcTy = SrcTy.getSimpleVT();
  2084. MVT SimpleDstTy = DstTy.getSimpleVT();
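// Probe the conversion tables in decreasing order of feature specificity so
// the most specialized entry is matched first.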
  2085. if (ST->useAVX512Regs()) {
  2086. if (ST->hasBWI())
  2087. if (const auto *Entry = ConvertCostTableLookup(
  2088. AVX512BWConversionTbl, ISD, SimpleDstTy, SimpleSrcTy))
  2089. return AdjustCost(Entry->Cost);
  2090. if (ST->hasDQI())
  2091. if (const auto *Entry = ConvertCostTableLookup(
  2092. AVX512DQConversionTbl, ISD, SimpleDstTy, SimpleSrcTy))
  2093. return AdjustCost(Entry->Cost);
  2094. if (ST->hasAVX512())
  2095. if (const auto *Entry = ConvertCostTableLookup(
  2096. AVX512FConversionTbl, ISD, SimpleDstTy, SimpleSrcTy))
  2097. return AdjustCost(Entry->Cost);
  2098. }
  2099. if (ST->hasBWI())
  2100. if (const auto *Entry = ConvertCostTableLookup(
  2101. AVX512BWVLConversionTbl, ISD, SimpleDstTy, SimpleSrcTy))
  2102. return AdjustCost(Entry->Cost);
  2103. if (ST->hasDQI())
  2104. if (const auto *Entry = ConvertCostTableLookup(
  2105. AVX512DQVLConversionTbl, ISD, SimpleDstTy, SimpleSrcTy))
  2106. return AdjustCost(Entry->Cost);
  2107. if (ST->hasAVX512())
  2108. if (const auto *Entry = ConvertCostTableLookup(AVX512VLConversionTbl, ISD,
  2109. SimpleDstTy, SimpleSrcTy))
  2110. return AdjustCost(Entry->Cost);
  2111. if (ST->hasAVX2()) {
  2112. if (const auto *Entry = ConvertCostTableLookup(AVX2ConversionTbl, ISD,
  2113. SimpleDstTy, SimpleSrcTy))
  2114. return AdjustCost(Entry->Cost);
  2115. }
  2116. if (ST->hasAVX()) {
  2117. if (const auto *Entry = ConvertCostTableLookup(AVXConversionTbl, ISD,
  2118. SimpleDstTy, SimpleSrcTy))
  2119. return AdjustCost(Entry->Cost);
  2120. }
  2121. if (ST->hasSSE41()) {
  2122. if (const auto *Entry = ConvertCostTableLookup(SSE41ConversionTbl, ISD,
  2123. SimpleDstTy, SimpleSrcTy))
  2124. return AdjustCost(Entry->Cost);
  2125. }
  2126. if (ST->hasSSE2()) {
  2127. if (const auto *Entry = ConvertCostTableLookup(SSE2ConversionTbl, ISD,
  2128. SimpleDstTy, SimpleSrcTy))
  2129. return AdjustCost(Entry->Cost);
  2130. }
  2131. }
  2132. // Fall back to legalized types.
  2133. std::pair<InstructionCost, MVT> LTSrc = TLI->getTypeLegalizationCost(DL, Src);
  2134. std::pair<InstructionCost, MVT> LTDest =
  2135. TLI->getTypeLegalizationCost(DL, Dst);
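// Scale any matched table cost by the larger of the source and destination
// legalization split factors.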
  2136. if (ST->useAVX512Regs()) {
  2137. if (ST->hasBWI())
  2138. if (const auto *Entry = ConvertCostTableLookup(
  2139. AVX512BWConversionTbl, ISD, LTDest.second, LTSrc.second))
  2140. return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost);
  2141. if (ST->hasDQI())
  2142. if (const auto *Entry = ConvertCostTableLookup(
  2143. AVX512DQConversionTbl, ISD, LTDest.second, LTSrc.second))
  2144. return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost);
  2145. if (ST->hasAVX512())
  2146. if (const auto *Entry = ConvertCostTableLookup(
  2147. AVX512FConversionTbl, ISD, LTDest.second, LTSrc.second))
  2148. return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost);
  2149. }
  2150. if (ST->hasBWI())
  2151. if (const auto *Entry = ConvertCostTableLookup(AVX512BWVLConversionTbl, ISD,
  2152. LTDest.second, LTSrc.second))
  2153. return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost);
  2154. if (ST->hasDQI())
  2155. if (const auto *Entry = ConvertCostTableLookup(AVX512DQVLConversionTbl, ISD,
  2156. LTDest.second, LTSrc.second))
  2157. return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost);
  2158. if (ST->hasAVX512())
  2159. if (const auto *Entry = ConvertCostTableLookup(AVX512VLConversionTbl, ISD,
  2160. LTDest.second, LTSrc.second))
  2161. return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost);
  2162. if (ST->hasAVX2())
  2163. if (const auto *Entry = ConvertCostTableLookup(AVX2ConversionTbl, ISD,
  2164. LTDest.second, LTSrc.second))
  2165. return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost);
  2166. if (ST->hasAVX())
  2167. if (const auto *Entry = ConvertCostTableLookup(AVXConversionTbl, ISD,
  2168. LTDest.second, LTSrc.second))
  2169. return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost);
  2170. if (ST->hasSSE41())
  2171. if (const auto *Entry = ConvertCostTableLookup(SSE41ConversionTbl, ISD,
  2172. LTDest.second, LTSrc.second))
  2173. return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost);
  2174. if (ST->hasSSE2())
  2175. if (const auto *Entry = ConvertCostTableLookup(SSE2ConversionTbl, ISD,
  2176. LTDest.second, LTSrc.second))
  2177. return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost);
2178. // Fallback: for i8/i16 sitofp/uitofp cases we need to extend the source to
2179. // i32 first and then convert from i32.
  2180. if ((ISD == ISD::SINT_TO_FP || ISD == ISD::UINT_TO_FP) &&
  2181. 1 < Src->getScalarSizeInBits() && Src->getScalarSizeInBits() < 32) {
  2182. Type *ExtSrc = Src->getWithNewBitWidth(32);
  2183. unsigned ExtOpc =
  2184. (ISD == ISD::SINT_TO_FP) ? Instruction::SExt : Instruction::ZExt;
  2185. // For scalar loads the extend would be free.
  2186. InstructionCost ExtCost = 0;
  2187. if (!(Src->isIntegerTy() && I && isa<LoadInst>(I->getOperand(0))))
  2188. ExtCost = getCastInstrCost(ExtOpc, ExtSrc, Src, CCH, CostKind);
  2189. return ExtCost + getCastInstrCost(Instruction::SIToFP, Dst, ExtSrc,
  2190. TTI::CastContextHint::None, CostKind);
  2191. }
2192. // Fallback: for i8/i16 fptosi/fptoui cases we convert to i32 first and then
2193. // truncate the result to the destination width.
  2194. if ((ISD == ISD::FP_TO_SINT || ISD == ISD::FP_TO_UINT) &&
  2195. 1 < Dst->getScalarSizeInBits() && Dst->getScalarSizeInBits() < 32) {
  2196. Type *TruncDst = Dst->getWithNewBitWidth(32);
  2197. return getCastInstrCost(Instruction::FPToSI, TruncDst, Src, CCH, CostKind) +
  2198. getCastInstrCost(Instruction::Trunc, Dst, TruncDst,
  2199. TTI::CastContextHint::None, CostKind);
  2200. }
  2201. return AdjustCost(
  2202. BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I));
  2203. }
  2204. InstructionCost X86TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
  2205. Type *CondTy,
  2206. CmpInst::Predicate VecPred,
  2207. TTI::TargetCostKind CostKind,
  2208. const Instruction *I) {
  2209. // TODO: Handle other cost kinds.
  2210. if (CostKind != TTI::TCK_RecipThroughput)
  2211. return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
  2212. I);
  2213. // Legalize the type.
  2214. std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy);
  2215. MVT MTy = LT.second;
  2216. int ISD = TLI->InstructionOpcodeToISD(Opcode);
  2217. assert(ISD && "Invalid opcode");
  2218. unsigned ExtraCost = 0;
  2219. if (Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) {
  2220. // Some vector comparison predicates cost extra instructions.
  2221. // TODO: Should we invert this and assume worst case cmp costs
  2222. // and reduce for particular predicates?
  2223. if (MTy.isVector() &&
  2224. !((ST->hasXOP() && (!ST->hasAVX2() || MTy.is128BitVector())) ||
  2225. (ST->hasAVX512() && 32 <= MTy.getScalarSizeInBits()) ||
  2226. ST->hasBWI())) {
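// Targets with XOP (for 128-bit ops), AVX512 (32-bit or wider elements) or
// BWI lower these predicates directly; the remaining cases pay an emulation
// cost chosen below.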
2227. // Fall back to the instruction's predicate if a specific one wasn't provided.
  2228. CmpInst::Predicate Pred = VecPred;
  2229. if (I && (Pred == CmpInst::BAD_ICMP_PREDICATE ||
  2230. Pred == CmpInst::BAD_FCMP_PREDICATE))
  2231. Pred = cast<CmpInst>(I)->getPredicate();
  2232. switch (Pred) {
  2233. case CmpInst::Predicate::ICMP_NE:
  2234. // xor(cmpeq(x,y),-1)
  2235. ExtraCost = 1;
  2236. break;
  2237. case CmpInst::Predicate::ICMP_SGE:
  2238. case CmpInst::Predicate::ICMP_SLE:
  2239. // xor(cmpgt(x,y),-1)
  2240. ExtraCost = 1;
  2241. break;
  2242. case CmpInst::Predicate::ICMP_ULT:
  2243. case CmpInst::Predicate::ICMP_UGT:
  2244. // cmpgt(xor(x,signbit),xor(y,signbit))
  2245. // xor(cmpeq(pmaxu(x,y),x),-1)
  2246. ExtraCost = 2;
  2247. break;
  2248. case CmpInst::Predicate::ICMP_ULE:
  2249. case CmpInst::Predicate::ICMP_UGE:
  2250. if ((ST->hasSSE41() && MTy.getScalarSizeInBits() == 32) ||
  2251. (ST->hasSSE2() && MTy.getScalarSizeInBits() < 32)) {
  2252. // cmpeq(psubus(x,y),0)
  2253. // cmpeq(pminu(x,y),x)
  2254. ExtraCost = 1;
  2255. } else {
  2256. // xor(cmpgt(xor(x,signbit),xor(y,signbit)),-1)
  2257. ExtraCost = 3;
  2258. }
  2259. break;
  2260. case CmpInst::Predicate::BAD_ICMP_PREDICATE:
  2261. case CmpInst::Predicate::BAD_FCMP_PREDICATE:
  2262. // Assume worst case scenario and add the maximum extra cost.
  2263. ExtraCost = 3;
  2264. break;
  2265. default:
  2266. break;
  2267. }
  2268. }
  2269. }
  2270. static const CostTblEntry SLMCostTbl[] = {
  2271. // slm pcmpeq/pcmpgt throughput is 2
  2272. { ISD::SETCC, MVT::v2i64, 2 },
  2273. };
  2274. static const CostTblEntry AVX512BWCostTbl[] = {
  2275. { ISD::SETCC, MVT::v32i16, 1 },
  2276. { ISD::SETCC, MVT::v64i8, 1 },
  2277. { ISD::SELECT, MVT::v32i16, 1 },
  2278. { ISD::SELECT, MVT::v64i8, 1 },
  2279. };
  2280. static const CostTblEntry AVX512CostTbl[] = {
  2281. { ISD::SETCC, MVT::v8i64, 1 },
  2282. { ISD::SETCC, MVT::v16i32, 1 },
  2283. { ISD::SETCC, MVT::v8f64, 1 },
  2284. { ISD::SETCC, MVT::v16f32, 1 },
  2285. { ISD::SELECT, MVT::v8i64, 1 },
  2286. { ISD::SELECT, MVT::v16i32, 1 },
  2287. { ISD::SELECT, MVT::v8f64, 1 },
  2288. { ISD::SELECT, MVT::v16f32, 1 },
  2289. { ISD::SETCC, MVT::v32i16, 2 }, // FIXME: should probably be 4
  2290. { ISD::SETCC, MVT::v64i8, 2 }, // FIXME: should probably be 4
  2291. { ISD::SELECT, MVT::v32i16, 2 }, // FIXME: should be 3
  2292. { ISD::SELECT, MVT::v64i8, 2 }, // FIXME: should be 3
  2293. };
  2294. static const CostTblEntry AVX2CostTbl[] = {
  2295. { ISD::SETCC, MVT::v4i64, 1 },
  2296. { ISD::SETCC, MVT::v8i32, 1 },
  2297. { ISD::SETCC, MVT::v16i16, 1 },
  2298. { ISD::SETCC, MVT::v32i8, 1 },
  2299. { ISD::SELECT, MVT::v4i64, 1 }, // pblendvb
  2300. { ISD::SELECT, MVT::v8i32, 1 }, // pblendvb
  2301. { ISD::SELECT, MVT::v16i16, 1 }, // pblendvb
  2302. { ISD::SELECT, MVT::v32i8, 1 }, // pblendvb
  2303. };
  2304. static const CostTblEntry AVX1CostTbl[] = {
  2305. { ISD::SETCC, MVT::v4f64, 1 },
  2306. { ISD::SETCC, MVT::v8f32, 1 },
  2307. // AVX1 does not support 8-wide integer compare.
  2308. { ISD::SETCC, MVT::v4i64, 4 },
  2309. { ISD::SETCC, MVT::v8i32, 4 },
  2310. { ISD::SETCC, MVT::v16i16, 4 },
  2311. { ISD::SETCC, MVT::v32i8, 4 },
  2312. { ISD::SELECT, MVT::v4f64, 1 }, // vblendvpd
  2313. { ISD::SELECT, MVT::v8f32, 1 }, // vblendvps
  2314. { ISD::SELECT, MVT::v4i64, 1 }, // vblendvpd
  2315. { ISD::SELECT, MVT::v8i32, 1 }, // vblendvps
  2316. { ISD::SELECT, MVT::v16i16, 3 }, // vandps + vandnps + vorps
  2317. { ISD::SELECT, MVT::v32i8, 3 }, // vandps + vandnps + vorps
  2318. };
  2319. static const CostTblEntry SSE42CostTbl[] = {
  2320. { ISD::SETCC, MVT::v2f64, 1 },
  2321. { ISD::SETCC, MVT::v4f32, 1 },
  2322. { ISD::SETCC, MVT::v2i64, 1 },
  2323. };
  2324. static const CostTblEntry SSE41CostTbl[] = {
  2325. { ISD::SELECT, MVT::v2f64, 1 }, // blendvpd
  2326. { ISD::SELECT, MVT::v4f32, 1 }, // blendvps
  2327. { ISD::SELECT, MVT::v2i64, 1 }, // pblendvb
  2328. { ISD::SELECT, MVT::v4i32, 1 }, // pblendvb
  2329. { ISD::SELECT, MVT::v8i16, 1 }, // pblendvb
  2330. { ISD::SELECT, MVT::v16i8, 1 }, // pblendvb
  2331. };
  2332. static const CostTblEntry SSE2CostTbl[] = {
  2333. { ISD::SETCC, MVT::v2f64, 2 },
  2334. { ISD::SETCC, MVT::f64, 1 },
  2335. { ISD::SETCC, MVT::v2i64, 8 },
  2336. { ISD::SETCC, MVT::v4i32, 1 },
  2337. { ISD::SETCC, MVT::v8i16, 1 },
  2338. { ISD::SETCC, MVT::v16i8, 1 },
  2339. { ISD::SELECT, MVT::v2f64, 3 }, // andpd + andnpd + orpd
  2340. { ISD::SELECT, MVT::v2i64, 3 }, // pand + pandn + por
  2341. { ISD::SELECT, MVT::v4i32, 3 }, // pand + pandn + por
  2342. { ISD::SELECT, MVT::v8i16, 3 }, // pand + pandn + por
  2343. { ISD::SELECT, MVT::v16i8, 3 }, // pand + pandn + por
  2344. };
  2345. static const CostTblEntry SSE1CostTbl[] = {
  2346. { ISD::SETCC, MVT::v4f32, 2 },
  2347. { ISD::SETCC, MVT::f32, 1 },
  2348. { ISD::SELECT, MVT::v4f32, 3 }, // andps + andnps + orps
  2349. };
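// Probe the cost tables from the most specific subtarget feature downwards;
// the per-predicate ExtraCost is paid once per legalized vector operation.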
  2350. if (ST->useSLMArithCosts())
  2351. if (const auto *Entry = CostTableLookup(SLMCostTbl, ISD, MTy))
  2352. return LT.first * (ExtraCost + Entry->Cost);
  2353. if (ST->hasBWI())
  2354. if (const auto *Entry = CostTableLookup(AVX512BWCostTbl, ISD, MTy))
  2355. return LT.first * (ExtraCost + Entry->Cost);
  2356. if (ST->hasAVX512())
  2357. if (const auto *Entry = CostTableLookup(AVX512CostTbl, ISD, MTy))
  2358. return LT.first * (ExtraCost + Entry->Cost);
  2359. if (ST->hasAVX2())
  2360. if (const auto *Entry = CostTableLookup(AVX2CostTbl, ISD, MTy))
  2361. return LT.first * (ExtraCost + Entry->Cost);
  2362. if (ST->hasAVX())
  2363. if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy))
  2364. return LT.first * (ExtraCost + Entry->Cost);
  2365. if (ST->hasSSE42())
  2366. if (const auto *Entry = CostTableLookup(SSE42CostTbl, ISD, MTy))
  2367. return LT.first * (ExtraCost + Entry->Cost);
  2368. if (ST->hasSSE41())
  2369. if (const auto *Entry = CostTableLookup(SSE41CostTbl, ISD, MTy))
  2370. return LT.first * (ExtraCost + Entry->Cost);
  2371. if (ST->hasSSE2())
  2372. if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
  2373. return LT.first * (ExtraCost + Entry->Cost);
  2374. if (ST->hasSSE1())
  2375. if (const auto *Entry = CostTableLookup(SSE1CostTbl, ISD, MTy))
  2376. return LT.first * (ExtraCost + Entry->Cost);
  2377. return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind, I);
  2378. }
  2379. unsigned X86TTIImpl::getAtomicMemIntrinsicMaxElementSize() const { return 16; }
  2380. InstructionCost
  2381. X86TTIImpl::getTypeBasedIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
  2382. TTI::TargetCostKind CostKind) {
  2383. // Costs should match the codegen from:
  2384. // BITREVERSE: llvm\test\CodeGen\X86\vector-bitreverse.ll
  2385. // BSWAP: llvm\test\CodeGen\X86\bswap-vector.ll
  2386. // CTLZ: llvm\test\CodeGen\X86\vector-lzcnt-*.ll
  2387. // CTPOP: llvm\test\CodeGen\X86\vector-popcnt-*.ll
  2388. // CTTZ: llvm\test\CodeGen\X86\vector-tzcnt-*.ll
  2389. // TODO: Overflow intrinsics (*ADDO, *SUBO, *MULO) with vector types are not
  2390. // specialized in these tables yet.
  2391. static const CostTblEntry AVX512BITALGCostTbl[] = {
  2392. { ISD::CTPOP, MVT::v32i16, 1 },
  2393. { ISD::CTPOP, MVT::v64i8, 1 },
  2394. { ISD::CTPOP, MVT::v16i16, 1 },
  2395. { ISD::CTPOP, MVT::v32i8, 1 },
  2396. { ISD::CTPOP, MVT::v8i16, 1 },
  2397. { ISD::CTPOP, MVT::v16i8, 1 },
  2398. };
  2399. static const CostTblEntry AVX512VPOPCNTDQCostTbl[] = {
  2400. { ISD::CTPOP, MVT::v8i64, 1 },
  2401. { ISD::CTPOP, MVT::v16i32, 1 },
  2402. { ISD::CTPOP, MVT::v4i64, 1 },
  2403. { ISD::CTPOP, MVT::v8i32, 1 },
  2404. { ISD::CTPOP, MVT::v2i64, 1 },
  2405. { ISD::CTPOP, MVT::v4i32, 1 },
  2406. };
  2407. static const CostTblEntry AVX512CDCostTbl[] = {
  2408. { ISD::CTLZ, MVT::v8i64, 1 },
  2409. { ISD::CTLZ, MVT::v16i32, 1 },
  2410. { ISD::CTLZ, MVT::v32i16, 8 },
  2411. { ISD::CTLZ, MVT::v64i8, 20 },
  2412. { ISD::CTLZ, MVT::v4i64, 1 },
  2413. { ISD::CTLZ, MVT::v8i32, 1 },
  2414. { ISD::CTLZ, MVT::v16i16, 4 },
  2415. { ISD::CTLZ, MVT::v32i8, 10 },
  2416. { ISD::CTLZ, MVT::v2i64, 1 },
  2417. { ISD::CTLZ, MVT::v4i32, 1 },
  2418. { ISD::CTLZ, MVT::v8i16, 4 },
  2419. { ISD::CTLZ, MVT::v16i8, 4 },
  2420. };
  2421. static const CostTblEntry AVX512BWCostTbl[] = {
  2422. { ISD::ABS, MVT::v32i16, 1 },
  2423. { ISD::ABS, MVT::v64i8, 1 },
  2424. { ISD::BITREVERSE, MVT::v8i64, 3 },
  2425. { ISD::BITREVERSE, MVT::v16i32, 3 },
  2426. { ISD::BITREVERSE, MVT::v32i16, 3 },
  2427. { ISD::BITREVERSE, MVT::v64i8, 2 },
  2428. { ISD::BSWAP, MVT::v8i64, 1 },
  2429. { ISD::BSWAP, MVT::v16i32, 1 },
  2430. { ISD::BSWAP, MVT::v32i16, 1 },
  2431. { ISD::CTLZ, MVT::v8i64, 23 },
  2432. { ISD::CTLZ, MVT::v16i32, 22 },
  2433. { ISD::CTLZ, MVT::v32i16, 18 },
  2434. { ISD::CTLZ, MVT::v64i8, 17 },
  2435. { ISD::CTPOP, MVT::v8i64, 7 },
  2436. { ISD::CTPOP, MVT::v16i32, 11 },
  2437. { ISD::CTPOP, MVT::v32i16, 9 },
  2438. { ISD::CTPOP, MVT::v64i8, 6 },
  2439. { ISD::CTTZ, MVT::v8i64, 10 },
  2440. { ISD::CTTZ, MVT::v16i32, 14 },
  2441. { ISD::CTTZ, MVT::v32i16, 12 },
  2442. { ISD::CTTZ, MVT::v64i8, 9 },
  2443. { ISD::SADDSAT, MVT::v32i16, 1 },
  2444. { ISD::SADDSAT, MVT::v64i8, 1 },
  2445. { ISD::SMAX, MVT::v32i16, 1 },
  2446. { ISD::SMAX, MVT::v64i8, 1 },
  2447. { ISD::SMIN, MVT::v32i16, 1 },
  2448. { ISD::SMIN, MVT::v64i8, 1 },
  2449. { ISD::SSUBSAT, MVT::v32i16, 1 },
  2450. { ISD::SSUBSAT, MVT::v64i8, 1 },
  2451. { ISD::UADDSAT, MVT::v32i16, 1 },
  2452. { ISD::UADDSAT, MVT::v64i8, 1 },
  2453. { ISD::UMAX, MVT::v32i16, 1 },
  2454. { ISD::UMAX, MVT::v64i8, 1 },
  2455. { ISD::UMIN, MVT::v32i16, 1 },
  2456. { ISD::UMIN, MVT::v64i8, 1 },
  2457. { ISD::USUBSAT, MVT::v32i16, 1 },
  2458. { ISD::USUBSAT, MVT::v64i8, 1 },
  2459. };
  2460. static const CostTblEntry AVX512CostTbl[] = {
  2461. { ISD::ABS, MVT::v8i64, 1 },
  2462. { ISD::ABS, MVT::v16i32, 1 },
  2463. { ISD::ABS, MVT::v32i16, 2 },
  2464. { ISD::ABS, MVT::v64i8, 2 },
  2465. { ISD::ABS, MVT::v4i64, 1 },
  2466. { ISD::ABS, MVT::v2i64, 1 },
  2467. { ISD::BITREVERSE, MVT::v8i64, 36 },
  2468. { ISD::BITREVERSE, MVT::v16i32, 24 },
  2469. { ISD::BITREVERSE, MVT::v32i16, 10 },
  2470. { ISD::BITREVERSE, MVT::v64i8, 10 },
  2471. { ISD::BSWAP, MVT::v8i64, 4 },
  2472. { ISD::BSWAP, MVT::v16i32, 4 },
  2473. { ISD::BSWAP, MVT::v32i16, 4 },
  2474. { ISD::CTLZ, MVT::v8i64, 29 },
  2475. { ISD::CTLZ, MVT::v16i32, 35 },
  2476. { ISD::CTLZ, MVT::v32i16, 28 },
  2477. { ISD::CTLZ, MVT::v64i8, 18 },
  2478. { ISD::CTPOP, MVT::v8i64, 16 },
  2479. { ISD::CTPOP, MVT::v16i32, 24 },
  2480. { ISD::CTPOP, MVT::v32i16, 18 },
  2481. { ISD::CTPOP, MVT::v64i8, 12 },
  2482. { ISD::CTTZ, MVT::v8i64, 20 },
  2483. { ISD::CTTZ, MVT::v16i32, 28 },
  2484. { ISD::CTTZ, MVT::v32i16, 24 },
  2485. { ISD::CTTZ, MVT::v64i8, 18 },
  2486. { ISD::SMAX, MVT::v8i64, 1 },
  2487. { ISD::SMAX, MVT::v16i32, 1 },
  2488. { ISD::SMAX, MVT::v32i16, 2 },
  2489. { ISD::SMAX, MVT::v64i8, 2 },
  2490. { ISD::SMAX, MVT::v4i64, 1 },
  2491. { ISD::SMAX, MVT::v2i64, 1 },
  2492. { ISD::SMIN, MVT::v8i64, 1 },
  2493. { ISD::SMIN, MVT::v16i32, 1 },
  2494. { ISD::SMIN, MVT::v32i16, 2 },
  2495. { ISD::SMIN, MVT::v64i8, 2 },
  2496. { ISD::SMIN, MVT::v4i64, 1 },
  2497. { ISD::SMIN, MVT::v2i64, 1 },
  2498. { ISD::UMAX, MVT::v8i64, 1 },
  2499. { ISD::UMAX, MVT::v16i32, 1 },
  2500. { ISD::UMAX, MVT::v32i16, 2 },
  2501. { ISD::UMAX, MVT::v64i8, 2 },
  2502. { ISD::UMAX, MVT::v4i64, 1 },
  2503. { ISD::UMAX, MVT::v2i64, 1 },
  2504. { ISD::UMIN, MVT::v8i64, 1 },
  2505. { ISD::UMIN, MVT::v16i32, 1 },
  2506. { ISD::UMIN, MVT::v32i16, 2 },
  2507. { ISD::UMIN, MVT::v64i8, 2 },
  2508. { ISD::UMIN, MVT::v4i64, 1 },
  2509. { ISD::UMIN, MVT::v2i64, 1 },
  2510. { ISD::USUBSAT, MVT::v16i32, 2 }, // pmaxud + psubd
  2511. { ISD::USUBSAT, MVT::v2i64, 2 }, // pmaxuq + psubq
  2512. { ISD::USUBSAT, MVT::v4i64, 2 }, // pmaxuq + psubq
  2513. { ISD::USUBSAT, MVT::v8i64, 2 }, // pmaxuq + psubq
  2514. { ISD::UADDSAT, MVT::v16i32, 3 }, // not + pminud + paddd
  2515. { ISD::UADDSAT, MVT::v2i64, 3 }, // not + pminuq + paddq
  2516. { ISD::UADDSAT, MVT::v4i64, 3 }, // not + pminuq + paddq
  2517. { ISD::UADDSAT, MVT::v8i64, 3 }, // not + pminuq + paddq
  2518. { ISD::SADDSAT, MVT::v32i16, 2 },
  2519. { ISD::SADDSAT, MVT::v64i8, 2 },
  2520. { ISD::SSUBSAT, MVT::v32i16, 2 },
  2521. { ISD::SSUBSAT, MVT::v64i8, 2 },
  2522. { ISD::UADDSAT, MVT::v32i16, 2 },
  2523. { ISD::UADDSAT, MVT::v64i8, 2 },
  2524. { ISD::USUBSAT, MVT::v32i16, 2 },
  2525. { ISD::USUBSAT, MVT::v64i8, 2 },
  2526. { ISD::FMAXNUM, MVT::f32, 2 },
  2527. { ISD::FMAXNUM, MVT::v4f32, 2 },
  2528. { ISD::FMAXNUM, MVT::v8f32, 2 },
  2529. { ISD::FMAXNUM, MVT::v16f32, 2 },
  2530. { ISD::FMAXNUM, MVT::f64, 2 },
  2531. { ISD::FMAXNUM, MVT::v2f64, 2 },
  2532. { ISD::FMAXNUM, MVT::v4f64, 2 },
  2533. { ISD::FMAXNUM, MVT::v8f64, 2 },
  2534. };
  2535. static const CostTblEntry XOPCostTbl[] = {
  2536. { ISD::BITREVERSE, MVT::v4i64, 4 },
  2537. { ISD::BITREVERSE, MVT::v8i32, 4 },
  2538. { ISD::BITREVERSE, MVT::v16i16, 4 },
  2539. { ISD::BITREVERSE, MVT::v32i8, 4 },
  2540. { ISD::BITREVERSE, MVT::v2i64, 1 },
  2541. { ISD::BITREVERSE, MVT::v4i32, 1 },
  2542. { ISD::BITREVERSE, MVT::v8i16, 1 },
  2543. { ISD::BITREVERSE, MVT::v16i8, 1 },
  2544. { ISD::BITREVERSE, MVT::i64, 3 },
  2545. { ISD::BITREVERSE, MVT::i32, 3 },
  2546. { ISD::BITREVERSE, MVT::i16, 3 },
  2547. { ISD::BITREVERSE, MVT::i8, 3 }
  2548. };
  2549. static const CostTblEntry AVX2CostTbl[] = {
  2550. { ISD::ABS, MVT::v4i64, 2 }, // VBLENDVPD(X,VPSUBQ(0,X),X)
  2551. { ISD::ABS, MVT::v8i32, 1 },
  2552. { ISD::ABS, MVT::v16i16, 1 },
  2553. { ISD::ABS, MVT::v32i8, 1 },
  2554. { ISD::BITREVERSE, MVT::v2i64, 3 },
  2555. { ISD::BITREVERSE, MVT::v4i64, 3 },
  2556. { ISD::BITREVERSE, MVT::v4i32, 3 },
  2557. { ISD::BITREVERSE, MVT::v8i32, 3 },
  2558. { ISD::BITREVERSE, MVT::v8i16, 3 },
  2559. { ISD::BITREVERSE, MVT::v16i16, 3 },
  2560. { ISD::BITREVERSE, MVT::v16i8, 3 },
  2561. { ISD::BITREVERSE, MVT::v32i8, 3 },
  2562. { ISD::BSWAP, MVT::v4i64, 1 },
  2563. { ISD::BSWAP, MVT::v8i32, 1 },
  2564. { ISD::BSWAP, MVT::v16i16, 1 },
  2565. { ISD::CTLZ, MVT::v2i64, 7 },
  2566. { ISD::CTLZ, MVT::v4i64, 7 },
  2567. { ISD::CTLZ, MVT::v4i32, 5 },
  2568. { ISD::CTLZ, MVT::v8i32, 5 },
  2569. { ISD::CTLZ, MVT::v8i16, 4 },
  2570. { ISD::CTLZ, MVT::v16i16, 4 },
  2571. { ISD::CTLZ, MVT::v16i8, 3 },
  2572. { ISD::CTLZ, MVT::v32i8, 3 },
  2573. { ISD::CTPOP, MVT::v2i64, 3 },
  2574. { ISD::CTPOP, MVT::v4i64, 3 },
  2575. { ISD::CTPOP, MVT::v4i32, 7 },
  2576. { ISD::CTPOP, MVT::v8i32, 7 },
  2577. { ISD::CTPOP, MVT::v8i16, 3 },
  2578. { ISD::CTPOP, MVT::v16i16, 3 },
  2579. { ISD::CTPOP, MVT::v16i8, 2 },
  2580. { ISD::CTPOP, MVT::v32i8, 2 },
  2581. { ISD::CTTZ, MVT::v2i64, 4 },
  2582. { ISD::CTTZ, MVT::v4i64, 4 },
  2583. { ISD::CTTZ, MVT::v4i32, 7 },
  2584. { ISD::CTTZ, MVT::v8i32, 7 },
  2585. { ISD::CTTZ, MVT::v8i16, 4 },
  2586. { ISD::CTTZ, MVT::v16i16, 4 },
  2587. { ISD::CTTZ, MVT::v16i8, 3 },
  2588. { ISD::CTTZ, MVT::v32i8, 3 },
  2589. { ISD::SADDSAT, MVT::v16i16, 1 },
  2590. { ISD::SADDSAT, MVT::v32i8, 1 },
  2591. { ISD::SMAX, MVT::v8i32, 1 },
  2592. { ISD::SMAX, MVT::v16i16, 1 },
  2593. { ISD::SMAX, MVT::v32i8, 1 },
  2594. { ISD::SMIN, MVT::v8i32, 1 },
  2595. { ISD::SMIN, MVT::v16i16, 1 },
  2596. { ISD::SMIN, MVT::v32i8, 1 },
  2597. { ISD::SSUBSAT, MVT::v16i16, 1 },
  2598. { ISD::SSUBSAT, MVT::v32i8, 1 },
  2599. { ISD::UADDSAT, MVT::v16i16, 1 },
  2600. { ISD::UADDSAT, MVT::v32i8, 1 },
  2601. { ISD::UADDSAT, MVT::v8i32, 3 }, // not + pminud + paddd
  2602. { ISD::UMAX, MVT::v8i32, 1 },
  2603. { ISD::UMAX, MVT::v16i16, 1 },
  2604. { ISD::UMAX, MVT::v32i8, 1 },
  2605. { ISD::UMIN, MVT::v8i32, 1 },
  2606. { ISD::UMIN, MVT::v16i16, 1 },
  2607. { ISD::UMIN, MVT::v32i8, 1 },
  2608. { ISD::USUBSAT, MVT::v16i16, 1 },
  2609. { ISD::USUBSAT, MVT::v32i8, 1 },
  2610. { ISD::USUBSAT, MVT::v8i32, 2 }, // pmaxud + psubd
  2611. { ISD::FMAXNUM, MVT::v8f32, 3 }, // MAXPS + CMPUNORDPS + BLENDVPS
  2612. { ISD::FMAXNUM, MVT::v4f64, 3 }, // MAXPD + CMPUNORDPD + BLENDVPD
  2613. { ISD::FSQRT, MVT::f32, 7 }, // Haswell from http://www.agner.org/
  2614. { ISD::FSQRT, MVT::v4f32, 7 }, // Haswell from http://www.agner.org/
  2615. { ISD::FSQRT, MVT::v8f32, 14 }, // Haswell from http://www.agner.org/
  2616. { ISD::FSQRT, MVT::f64, 14 }, // Haswell from http://www.agner.org/
  2617. { ISD::FSQRT, MVT::v2f64, 14 }, // Haswell from http://www.agner.org/
  2618. { ISD::FSQRT, MVT::v4f64, 28 }, // Haswell from http://www.agner.org/
  2619. };
  2620. static const CostTblEntry AVX1CostTbl[] = {
  2621. { ISD::ABS, MVT::v4i64, 5 }, // VBLENDVPD(X,VPSUBQ(0,X),X)
  2622. { ISD::ABS, MVT::v8i32, 3 },
  2623. { ISD::ABS, MVT::v16i16, 3 },
  2624. { ISD::ABS, MVT::v32i8, 3 },
  2625. { ISD::BITREVERSE, MVT::v4i64, 12 }, // 2 x 128-bit Op + extract/insert
  2626. { ISD::BITREVERSE, MVT::v8i32, 12 }, // 2 x 128-bit Op + extract/insert
  2627. { ISD::BITREVERSE, MVT::v16i16, 12 }, // 2 x 128-bit Op + extract/insert
  2628. { ISD::BITREVERSE, MVT::v32i8, 12 }, // 2 x 128-bit Op + extract/insert
  2629. { ISD::BSWAP, MVT::v4i64, 4 },
  2630. { ISD::BSWAP, MVT::v8i32, 4 },
  2631. { ISD::BSWAP, MVT::v16i16, 4 },
  2632. { ISD::CTLZ, MVT::v4i64, 48 }, // 2 x 128-bit Op + extract/insert
  2633. { ISD::CTLZ, MVT::v8i32, 38 }, // 2 x 128-bit Op + extract/insert
  2634. { ISD::CTLZ, MVT::v16i16, 30 }, // 2 x 128-bit Op + extract/insert
  2635. { ISD::CTLZ, MVT::v32i8, 20 }, // 2 x 128-bit Op + extract/insert
  2636. { ISD::CTPOP, MVT::v4i64, 16 }, // 2 x 128-bit Op + extract/insert
  2637. { ISD::CTPOP, MVT::v8i32, 24 }, // 2 x 128-bit Op + extract/insert
  2638. { ISD::CTPOP, MVT::v16i16, 20 }, // 2 x 128-bit Op + extract/insert
  2639. { ISD::CTPOP, MVT::v32i8, 14 }, // 2 x 128-bit Op + extract/insert
  2640. { ISD::CTTZ, MVT::v4i64, 22 }, // 2 x 128-bit Op + extract/insert
  2641. { ISD::CTTZ, MVT::v8i32, 30 }, // 2 x 128-bit Op + extract/insert
  2642. { ISD::CTTZ, MVT::v16i16, 26 }, // 2 x 128-bit Op + extract/insert
  2643. { ISD::CTTZ, MVT::v32i8, 20 }, // 2 x 128-bit Op + extract/insert
  2644. { ISD::SADDSAT, MVT::v16i16, 4 }, // 2 x 128-bit Op + extract/insert
  2645. { ISD::SADDSAT, MVT::v32i8, 4 }, // 2 x 128-bit Op + extract/insert
  2646. { ISD::SMAX, MVT::v8i32, 4 }, // 2 x 128-bit Op + extract/insert
  2647. { ISD::SMAX, MVT::v16i16, 4 }, // 2 x 128-bit Op + extract/insert
  2648. { ISD::SMAX, MVT::v32i8, 4 }, // 2 x 128-bit Op + extract/insert
  2649. { ISD::SMIN, MVT::v8i32, 4 }, // 2 x 128-bit Op + extract/insert
  2650. { ISD::SMIN, MVT::v16i16, 4 }, // 2 x 128-bit Op + extract/insert
  2651. { ISD::SMIN, MVT::v32i8, 4 }, // 2 x 128-bit Op + extract/insert
  2652. { ISD::SSUBSAT, MVT::v16i16, 4 }, // 2 x 128-bit Op + extract/insert
  2653. { ISD::SSUBSAT, MVT::v32i8, 4 }, // 2 x 128-bit Op + extract/insert
  2654. { ISD::UADDSAT, MVT::v16i16, 4 }, // 2 x 128-bit Op + extract/insert
  2655. { ISD::UADDSAT, MVT::v32i8, 4 }, // 2 x 128-bit Op + extract/insert
  2656. { ISD::UADDSAT, MVT::v8i32, 8 }, // 2 x 128-bit Op + extract/insert
  2657. { ISD::UMAX, MVT::v8i32, 4 }, // 2 x 128-bit Op + extract/insert
  2658. { ISD::UMAX, MVT::v16i16, 4 }, // 2 x 128-bit Op + extract/insert
  2659. { ISD::UMAX, MVT::v32i8, 4 }, // 2 x 128-bit Op + extract/insert
  2660. { ISD::UMIN, MVT::v8i32, 4 }, // 2 x 128-bit Op + extract/insert
  2661. { ISD::UMIN, MVT::v16i16, 4 }, // 2 x 128-bit Op + extract/insert
  2662. { ISD::UMIN, MVT::v32i8, 4 }, // 2 x 128-bit Op + extract/insert
  2663. { ISD::USUBSAT, MVT::v16i16, 4 }, // 2 x 128-bit Op + extract/insert
  2664. { ISD::USUBSAT, MVT::v32i8, 4 }, // 2 x 128-bit Op + extract/insert
  2665. { ISD::USUBSAT, MVT::v8i32, 6 }, // 2 x 128-bit Op + extract/insert
  2666. { ISD::FMAXNUM, MVT::f32, 3 }, // MAXSS + CMPUNORDSS + BLENDVPS
  2667. { ISD::FMAXNUM, MVT::v4f32, 3 }, // MAXPS + CMPUNORDPS + BLENDVPS
  2668. { ISD::FMAXNUM, MVT::v8f32, 5 }, // MAXPS + CMPUNORDPS + BLENDVPS + ?
  2669. { ISD::FMAXNUM, MVT::f64, 3 }, // MAXSD + CMPUNORDSD + BLENDVPD
  2670. { ISD::FMAXNUM, MVT::v2f64, 3 }, // MAXPD + CMPUNORDPD + BLENDVPD
  2671. { ISD::FMAXNUM, MVT::v4f64, 5 }, // MAXPD + CMPUNORDPD + BLENDVPD + ?
  2672. { ISD::FSQRT, MVT::f32, 14 }, // SNB from http://www.agner.org/
  2673. { ISD::FSQRT, MVT::v4f32, 14 }, // SNB from http://www.agner.org/
  2674. { ISD::FSQRT, MVT::v8f32, 28 }, // SNB from http://www.agner.org/
  2675. { ISD::FSQRT, MVT::f64, 21 }, // SNB from http://www.agner.org/
  2676. { ISD::FSQRT, MVT::v2f64, 21 }, // SNB from http://www.agner.org/
  2677. { ISD::FSQRT, MVT::v4f64, 43 }, // SNB from http://www.agner.org/
  2678. };
  2679. static const CostTblEntry GLMCostTbl[] = {
  2680. { ISD::FSQRT, MVT::f32, 19 }, // sqrtss
  2681. { ISD::FSQRT, MVT::v4f32, 37 }, // sqrtps
  2682. { ISD::FSQRT, MVT::f64, 34 }, // sqrtsd
  2683. { ISD::FSQRT, MVT::v2f64, 67 }, // sqrtpd
  2684. };
  2685. static const CostTblEntry SLMCostTbl[] = {
  2686. { ISD::FSQRT, MVT::f32, 20 }, // sqrtss
  2687. { ISD::FSQRT, MVT::v4f32, 40 }, // sqrtps
  2688. { ISD::FSQRT, MVT::f64, 35 }, // sqrtsd
  2689. { ISD::FSQRT, MVT::v2f64, 70 }, // sqrtpd
  2690. };
  2691. static const CostTblEntry SSE42CostTbl[] = {
  2692. { ISD::USUBSAT, MVT::v4i32, 2 }, // pmaxud + psubd
  2693. { ISD::UADDSAT, MVT::v4i32, 3 }, // not + pminud + paddd
  2694. { ISD::FSQRT, MVT::f32, 18 }, // Nehalem from http://www.agner.org/
  2695. { ISD::FSQRT, MVT::v4f32, 18 }, // Nehalem from http://www.agner.org/
  2696. };
  2697. static const CostTblEntry SSE41CostTbl[] = {
  2698. { ISD::ABS, MVT::v2i64, 2 }, // BLENDVPD(X,PSUBQ(0,X),X)
  2699. { ISD::SMAX, MVT::v4i32, 1 },
  2700. { ISD::SMAX, MVT::v16i8, 1 },
  2701. { ISD::SMIN, MVT::v4i32, 1 },
  2702. { ISD::SMIN, MVT::v16i8, 1 },
  2703. { ISD::UMAX, MVT::v4i32, 1 },
  2704. { ISD::UMAX, MVT::v8i16, 1 },
  2705. { ISD::UMIN, MVT::v4i32, 1 },
  2706. { ISD::UMIN, MVT::v8i16, 1 },
  2707. };
  2708. static const CostTblEntry SSSE3CostTbl[] = {
  2709. { ISD::ABS, MVT::v4i32, 1 },
  2710. { ISD::ABS, MVT::v8i16, 1 },
  2711. { ISD::ABS, MVT::v16i8, 1 },
  2712. { ISD::BITREVERSE, MVT::v2i64, 5 },
  2713. { ISD::BITREVERSE, MVT::v4i32, 5 },
  2714. { ISD::BITREVERSE, MVT::v8i16, 5 },
  2715. { ISD::BITREVERSE, MVT::v16i8, 5 },
  2716. { ISD::BSWAP, MVT::v2i64, 1 },
  2717. { ISD::BSWAP, MVT::v4i32, 1 },
  2718. { ISD::BSWAP, MVT::v8i16, 1 },
  2719. { ISD::CTLZ, MVT::v2i64, 23 },
  2720. { ISD::CTLZ, MVT::v4i32, 18 },
  2721. { ISD::CTLZ, MVT::v8i16, 14 },
  2722. { ISD::CTLZ, MVT::v16i8, 9 },
  2723. { ISD::CTPOP, MVT::v2i64, 7 },
  2724. { ISD::CTPOP, MVT::v4i32, 11 },
  2725. { ISD::CTPOP, MVT::v8i16, 9 },
  2726. { ISD::CTPOP, MVT::v16i8, 6 },
  2727. { ISD::CTTZ, MVT::v2i64, 10 },
  2728. { ISD::CTTZ, MVT::v4i32, 14 },
  2729. { ISD::CTTZ, MVT::v8i16, 12 },
  2730. { ISD::CTTZ, MVT::v16i8, 9 }
  2731. };
  2732. static const CostTblEntry SSE2CostTbl[] = {
  2733. { ISD::ABS, MVT::v2i64, 4 },
  2734. { ISD::ABS, MVT::v4i32, 3 },
  2735. { ISD::ABS, MVT::v8i16, 2 },
  2736. { ISD::ABS, MVT::v16i8, 2 },
  2737. { ISD::BITREVERSE, MVT::v2i64, 29 },
  2738. { ISD::BITREVERSE, MVT::v4i32, 27 },
  2739. { ISD::BITREVERSE, MVT::v8i16, 27 },
  2740. { ISD::BITREVERSE, MVT::v16i8, 20 },
  2741. { ISD::BSWAP, MVT::v2i64, 7 },
  2742. { ISD::BSWAP, MVT::v4i32, 7 },
  2743. { ISD::BSWAP, MVT::v8i16, 7 },
  2744. { ISD::CTLZ, MVT::v2i64, 25 },
  2745. { ISD::CTLZ, MVT::v4i32, 26 },
  2746. { ISD::CTLZ, MVT::v8i16, 20 },
  2747. { ISD::CTLZ, MVT::v16i8, 17 },
  2748. { ISD::CTPOP, MVT::v2i64, 12 },
  2749. { ISD::CTPOP, MVT::v4i32, 15 },
  2750. { ISD::CTPOP, MVT::v8i16, 13 },
  2751. { ISD::CTPOP, MVT::v16i8, 10 },
  2752. { ISD::CTTZ, MVT::v2i64, 14 },
  2753. { ISD::CTTZ, MVT::v4i32, 18 },
  2754. { ISD::CTTZ, MVT::v8i16, 16 },
  2755. { ISD::CTTZ, MVT::v16i8, 13 },
  2756. { ISD::SADDSAT, MVT::v8i16, 1 },
  2757. { ISD::SADDSAT, MVT::v16i8, 1 },
  2758. { ISD::SMAX, MVT::v8i16, 1 },
  2759. { ISD::SMIN, MVT::v8i16, 1 },
  2760. { ISD::SSUBSAT, MVT::v8i16, 1 },
  2761. { ISD::SSUBSAT, MVT::v16i8, 1 },
  2762. { ISD::UADDSAT, MVT::v8i16, 1 },
  2763. { ISD::UADDSAT, MVT::v16i8, 1 },
  2764. { ISD::UMAX, MVT::v8i16, 2 },
  2765. { ISD::UMAX, MVT::v16i8, 1 },
  2766. { ISD::UMIN, MVT::v8i16, 2 },
  2767. { ISD::UMIN, MVT::v16i8, 1 },
  2768. { ISD::USUBSAT, MVT::v8i16, 1 },
  2769. { ISD::USUBSAT, MVT::v16i8, 1 },
  2770. { ISD::FMAXNUM, MVT::f64, 4 },
  2771. { ISD::FMAXNUM, MVT::v2f64, 4 },
  2772. { ISD::FSQRT, MVT::f64, 32 }, // Nehalem from http://www.agner.org/
  2773. { ISD::FSQRT, MVT::v2f64, 32 }, // Nehalem from http://www.agner.org/
  2774. };
  2775. static const CostTblEntry SSE1CostTbl[] = {
  2776. { ISD::FMAXNUM, MVT::f32, 4 },
  2777. { ISD::FMAXNUM, MVT::v4f32, 4 },
  2778. { ISD::FSQRT, MVT::f32, 28 }, // Pentium III from http://www.agner.org/
  2779. { ISD::FSQRT, MVT::v4f32, 56 }, // Pentium III from http://www.agner.org/
  2780. };
  2781. static const CostTblEntry BMI64CostTbl[] = { // 64-bit targets
  2782. { ISD::CTTZ, MVT::i64, 1 },
  2783. };
  2784. static const CostTblEntry BMI32CostTbl[] = { // 32 or 64-bit targets
  2785. { ISD::CTTZ, MVT::i32, 1 },
  2786. { ISD::CTTZ, MVT::i16, 1 },
  2787. { ISD::CTTZ, MVT::i8, 1 },
  2788. };
  2789. static const CostTblEntry LZCNT64CostTbl[] = { // 64-bit targets
  2790. { ISD::CTLZ, MVT::i64, 1 },
  2791. };
  2792. static const CostTblEntry LZCNT32CostTbl[] = { // 32 or 64-bit targets
  2793. { ISD::CTLZ, MVT::i32, 1 },
  2794. { ISD::CTLZ, MVT::i16, 1 },
  2795. { ISD::CTLZ, MVT::i8, 1 },
  2796. };
  2797. static const CostTblEntry POPCNT64CostTbl[] = { // 64-bit targets
  2798. { ISD::CTPOP, MVT::i64, 1 },
  2799. };
  2800. static const CostTblEntry POPCNT32CostTbl[] = { // 32 or 64-bit targets
  2801. { ISD::CTPOP, MVT::i32, 1 },
  2802. { ISD::CTPOP, MVT::i16, 1 },
  2803. { ISD::CTPOP, MVT::i8, 1 },
  2804. };
  2805. static const CostTblEntry X64CostTbl[] = { // 64-bit targets
  2806. { ISD::ABS, MVT::i64, 2 }, // SUB+CMOV
  2807. { ISD::BITREVERSE, MVT::i64, 14 },
  2808. { ISD::BSWAP, MVT::i64, 1 },
  2809. { ISD::CTLZ, MVT::i64, 4 }, // BSR+XOR or BSR+XOR+CMOV
  2810. { ISD::CTTZ, MVT::i64, 3 }, // TEST+BSF+CMOV/BRANCH
  2811. { ISD::CTPOP, MVT::i64, 10 },
  2812. { ISD::SADDO, MVT::i64, 1 },
  2813. { ISD::UADDO, MVT::i64, 1 },
  2814. { ISD::UMULO, MVT::i64, 2 }, // mulq + seto
  2815. };
  2816. static const CostTblEntry X86CostTbl[] = { // 32 or 64-bit targets
  2817. { ISD::ABS, MVT::i32, 2 }, // SUB+CMOV
  2818. { ISD::ABS, MVT::i16, 2 }, // SUB+CMOV
  2819. { ISD::BITREVERSE, MVT::i32, 14 },
  2820. { ISD::BITREVERSE, MVT::i16, 14 },
  2821. { ISD::BITREVERSE, MVT::i8, 11 },
  2822. { ISD::BSWAP, MVT::i32, 1 },
  2823. { ISD::BSWAP, MVT::i16, 1 }, // ROL
  2824. { ISD::CTLZ, MVT::i32, 4 }, // BSR+XOR or BSR+XOR+CMOV
  2825. { ISD::CTLZ, MVT::i16, 4 }, // BSR+XOR or BSR+XOR+CMOV
  2826. { ISD::CTLZ, MVT::i8, 4 }, // BSR+XOR or BSR+XOR+CMOV
  2827. { ISD::CTTZ, MVT::i32, 3 }, // TEST+BSF+CMOV/BRANCH
  2828. { ISD::CTTZ, MVT::i16, 3 }, // TEST+BSF+CMOV/BRANCH
  2829. { ISD::CTTZ, MVT::i8, 3 }, // TEST+BSF+CMOV/BRANCH
  2830. { ISD::CTPOP, MVT::i32, 8 },
  2831. { ISD::CTPOP, MVT::i16, 9 },
  2832. { ISD::CTPOP, MVT::i8, 7 },
  2833. { ISD::SADDO, MVT::i32, 1 },
  2834. { ISD::SADDO, MVT::i16, 1 },
  2835. { ISD::SADDO, MVT::i8, 1 },
  2836. { ISD::UADDO, MVT::i32, 1 },
  2837. { ISD::UADDO, MVT::i16, 1 },
  2838. { ISD::UADDO, MVT::i8, 1 },
  2839. { ISD::UMULO, MVT::i32, 2 }, // mul + seto
  2840. { ISD::UMULO, MVT::i16, 2 },
  2841. { ISD::UMULO, MVT::i8, 2 },
  2842. };
  2843. Type *RetTy = ICA.getReturnType();
  2844. Type *OpTy = RetTy;
  2845. Intrinsic::ID IID = ICA.getID();
  2846. unsigned ISD = ISD::DELETED_NODE;
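// Map each intrinsic onto the ISD opcode used as the cost-table key;
// intrinsics that share a cost model (e.g. maxnum/minnum) map to one opcode.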
  2847. switch (IID) {
  2848. default:
  2849. break;
  2850. case Intrinsic::abs:
  2851. ISD = ISD::ABS;
  2852. break;
  2853. case Intrinsic::bitreverse:
  2854. ISD = ISD::BITREVERSE;
  2855. break;
  2856. case Intrinsic::bswap:
  2857. ISD = ISD::BSWAP;
  2858. break;
  2859. case Intrinsic::ctlz:
  2860. ISD = ISD::CTLZ;
  2861. break;
  2862. case Intrinsic::ctpop:
  2863. ISD = ISD::CTPOP;
  2864. break;
  2865. case Intrinsic::cttz:
  2866. ISD = ISD::CTTZ;
  2867. break;
  2868. case Intrinsic::maxnum:
  2869. case Intrinsic::minnum:
  2870. // FMINNUM has same costs so don't duplicate.
  2871. ISD = ISD::FMAXNUM;
  2872. break;
  2873. case Intrinsic::sadd_sat:
  2874. ISD = ISD::SADDSAT;
  2875. break;
  2876. case Intrinsic::smax:
  2877. ISD = ISD::SMAX;
  2878. break;
  2879. case Intrinsic::smin:
  2880. ISD = ISD::SMIN;
  2881. break;
  2882. case Intrinsic::ssub_sat:
  2883. ISD = ISD::SSUBSAT;
  2884. break;
  2885. case Intrinsic::uadd_sat:
  2886. ISD = ISD::UADDSAT;
  2887. break;
  2888. case Intrinsic::umax:
  2889. ISD = ISD::UMAX;
  2890. break;
  2891. case Intrinsic::umin:
  2892. ISD = ISD::UMIN;
  2893. break;
  2894. case Intrinsic::usub_sat:
  2895. ISD = ISD::USUBSAT;
  2896. break;
  2897. case Intrinsic::sqrt:
  2898. ISD = ISD::FSQRT;
  2899. break;
  2900. case Intrinsic::sadd_with_overflow:
  2901. case Intrinsic::ssub_with_overflow:
  2902. // SSUBO has same costs so don't duplicate.
  2903. ISD = ISD::SADDO;
  2904. OpTy = RetTy->getContainedType(0);
  2905. break;
  2906. case Intrinsic::uadd_with_overflow:
  2907. case Intrinsic::usub_with_overflow:
  2908. // USUBO has same costs so don't duplicate.
  2909. ISD = ISD::UADDO;
  2910. OpTy = RetTy->getContainedType(0);
  2911. break;
  2912. case Intrinsic::umul_with_overflow:
  2913. case Intrinsic::smul_with_overflow:
  2914. // SMULO has same costs so don't duplicate.
  2915. ISD = ISD::UMULO;
  2916. OpTy = RetTy->getContainedType(0);
  2917. break;
  2918. }
  2919. if (ISD != ISD::DELETED_NODE) {
  2920. // Legalize the type.
  2921. std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, OpTy);
  2922. MVT MTy = LT.second;
  2923. // Attempt to lookup cost.
  2924. if (ISD == ISD::BITREVERSE && ST->hasGFNI() && ST->hasSSSE3() &&
  2925. MTy.isVector()) {
  2926. // With PSHUFB the code is very similar for all types. If we have integer
  2927. // byte operations, we just need a GF2P8AFFINEQB for vXi8. For other types
  2928. // we also need a PSHUFB.
  2929. unsigned Cost = MTy.getVectorElementType() == MVT::i8 ? 1 : 2;
  2930. // Without byte operations, we need twice as many GF2P8AFFINEQB and PSHUFB
  2931. // instructions. We also need an extract and an insert.
  2932. if (!(MTy.is128BitVector() || (ST->hasAVX2() && MTy.is256BitVector()) ||
  2933. (ST->hasBWI() && MTy.is512BitVector())))
  2934. Cost = Cost * 2 + 2;
  2935. return LT.first * Cost;
  2936. }
  2937. auto adjustTableCost = [](const CostTblEntry &Entry,
  2938. InstructionCost LegalizationCost,
  2939. FastMathFlags FMF) {
  2940. // If there are no NANs to deal with, then these are reduced to a
  2941. // single MIN** or MAX** instruction instead of the MIN/CMP/SELECT that we
  2942. // assume is used in the non-fast case.
  2943. if (Entry.ISD == ISD::FMAXNUM || Entry.ISD == ISD::FMINNUM) {
  2944. if (FMF.noNaNs())
  2945. return LegalizationCost * 1;
  2946. }
  2947. return LegalizationCost * (int)Entry.Cost;
  2948. };
  2949. if (ST->useGLMDivSqrtCosts())
  2950. if (const auto *Entry = CostTableLookup(GLMCostTbl, ISD, MTy))
  2951. return adjustTableCost(*Entry, LT.first, ICA.getFlags());
  2952. if (ST->useSLMArithCosts())
  2953. if (const auto *Entry = CostTableLookup(SLMCostTbl, ISD, MTy))
  2954. return adjustTableCost(*Entry, LT.first, ICA.getFlags());
  2955. if (ST->hasBITALG())
  2956. if (const auto *Entry = CostTableLookup(AVX512BITALGCostTbl, ISD, MTy))
  2957. return adjustTableCost(*Entry, LT.first, ICA.getFlags());
  2958. if (ST->hasVPOPCNTDQ())
  2959. if (const auto *Entry = CostTableLookup(AVX512VPOPCNTDQCostTbl, ISD, MTy))
  2960. return adjustTableCost(*Entry, LT.first, ICA.getFlags());
  2961. if (ST->hasCDI())
  2962. if (const auto *Entry = CostTableLookup(AVX512CDCostTbl, ISD, MTy))
  2963. return adjustTableCost(*Entry, LT.first, ICA.getFlags());
  2964. if (ST->hasBWI())
  2965. if (const auto *Entry = CostTableLookup(AVX512BWCostTbl, ISD, MTy))
  2966. return adjustTableCost(*Entry, LT.first, ICA.getFlags());
  2967. if (ST->hasAVX512())
  2968. if (const auto *Entry = CostTableLookup(AVX512CostTbl, ISD, MTy))
  2969. return adjustTableCost(*Entry, LT.first, ICA.getFlags());
  2970. if (ST->hasXOP())
  2971. if (const auto *Entry = CostTableLookup(XOPCostTbl, ISD, MTy))
  2972. return adjustTableCost(*Entry, LT.first, ICA.getFlags());
  2973. if (ST->hasAVX2())
  2974. if (const auto *Entry = CostTableLookup(AVX2CostTbl, ISD, MTy))
  2975. return adjustTableCost(*Entry, LT.first, ICA.getFlags());
  2976. if (ST->hasAVX())
  2977. if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy))
  2978. return adjustTableCost(*Entry, LT.first, ICA.getFlags());
  2979. if (ST->hasSSE42())
  2980. if (const auto *Entry = CostTableLookup(SSE42CostTbl, ISD, MTy))
  2981. return adjustTableCost(*Entry, LT.first, ICA.getFlags());
  2982. if (ST->hasSSE41())
  2983. if (const auto *Entry = CostTableLookup(SSE41CostTbl, ISD, MTy))
  2984. return adjustTableCost(*Entry, LT.first, ICA.getFlags());
  2985. if (ST->hasSSSE3())
  2986. if (const auto *Entry = CostTableLookup(SSSE3CostTbl, ISD, MTy))
  2987. return adjustTableCost(*Entry, LT.first, ICA.getFlags());
  2988. if (ST->hasSSE2())
  2989. if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
  2990. return adjustTableCost(*Entry, LT.first, ICA.getFlags());
  2991. if (ST->hasSSE1())
  2992. if (const auto *Entry = CostTableLookup(SSE1CostTbl, ISD, MTy))
  2993. return adjustTableCost(*Entry, LT.first, ICA.getFlags());
  2994. if (ST->hasBMI()) {
  2995. if (ST->is64Bit())
  2996. if (const auto *Entry = CostTableLookup(BMI64CostTbl, ISD, MTy))
  2997. return adjustTableCost(*Entry, LT.first, ICA.getFlags());
  2998. if (const auto *Entry = CostTableLookup(BMI32CostTbl, ISD, MTy))
  2999. return adjustTableCost(*Entry, LT.first, ICA.getFlags());
  3000. }
  3001. if (ST->hasLZCNT()) {
  3002. if (ST->is64Bit())
  3003. if (const auto *Entry = CostTableLookup(LZCNT64CostTbl, ISD, MTy))
  3004. return adjustTableCost(*Entry, LT.first, ICA.getFlags());
  3005. if (const auto *Entry = CostTableLookup(LZCNT32CostTbl, ISD, MTy))
  3006. return adjustTableCost(*Entry, LT.first, ICA.getFlags());
  3007. }
  3008. if (ST->hasPOPCNT()) {
  3009. if (ST->is64Bit())
  3010. if (const auto *Entry = CostTableLookup(POPCNT64CostTbl, ISD, MTy))
  3011. return adjustTableCost(*Entry, LT.first, ICA.getFlags());
  3012. if (const auto *Entry = CostTableLookup(POPCNT32CostTbl, ISD, MTy))
  3013. return adjustTableCost(*Entry, LT.first, ICA.getFlags());
  3014. }
  3015. if (ISD == ISD::BSWAP && ST->hasMOVBE() && ST->hasFastMOVBE()) {
  3016. if (const Instruction *II = ICA.getInst()) {
  3017. if (II->hasOneUse() && isa<StoreInst>(II->user_back()))
  3018. return TTI::TCC_Free;
  3019. if (auto *LI = dyn_cast<LoadInst>(II->getOperand(0))) {
  3020. if (LI->hasOneUse())
  3021. return TTI::TCC_Free;
  3022. }
  3023. }
  3024. }
  3025. if (ST->is64Bit())
  3026. if (const auto *Entry = CostTableLookup(X64CostTbl, ISD, MTy))
  3027. return adjustTableCost(*Entry, LT.first, ICA.getFlags());
  3028. if (const auto *Entry = CostTableLookup(X86CostTbl, ISD, MTy))
  3029. return adjustTableCost(*Entry, LT.first, ICA.getFlags());
  3030. }
  3031. return BaseT::getIntrinsicInstrCost(ICA, CostKind);
  3032. }
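// Argument-aware intrinsic costing. Unlike the type-based path above, this
// can inspect the actual call arguments: fshl/fshr whose two value operands
// match are really rotates, which several subtargets handle natively.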
InstructionCost
X86TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
                                  TTI::TargetCostKind CostKind) {
  if (ICA.isTypeBasedOnly())
    return getTypeBasedIntrinsicInstrCost(ICA, CostKind);

  static const CostTblEntry AVX512BWCostTbl[] = {
    { ISD::ROTL, MVT::v32i16, 2 },
    { ISD::ROTL, MVT::v16i16, 2 },
    { ISD::ROTL, MVT::v8i16, 2 },
    { ISD::ROTL, MVT::v64i8, 5 },
    { ISD::ROTL, MVT::v32i8, 5 },
    { ISD::ROTL, MVT::v16i8, 5 },
    { ISD::ROTR, MVT::v32i16, 2 },
    { ISD::ROTR, MVT::v16i16, 2 },
    { ISD::ROTR, MVT::v8i16, 2 },
    { ISD::ROTR, MVT::v64i8, 5 },
    { ISD::ROTR, MVT::v32i8, 5 },
    { ISD::ROTR, MVT::v16i8, 5 }
  };
  static const CostTblEntry AVX512CostTbl[] = {
    { ISD::ROTL, MVT::v8i64, 1 },
    { ISD::ROTL, MVT::v4i64, 1 },
    { ISD::ROTL, MVT::v2i64, 1 },
    { ISD::ROTL, MVT::v16i32, 1 },
    { ISD::ROTL, MVT::v8i32, 1 },
    { ISD::ROTL, MVT::v4i32, 1 },
    { ISD::ROTR, MVT::v8i64, 1 },
    { ISD::ROTR, MVT::v4i64, 1 },
    { ISD::ROTR, MVT::v2i64, 1 },
    { ISD::ROTR, MVT::v16i32, 1 },
    { ISD::ROTR, MVT::v8i32, 1 },
    { ISD::ROTR, MVT::v4i32, 1 }
  };
  // XOP: ROTL = VPROT(X,Y), ROTR = VPROT(X,SUB(0,Y))
  static const CostTblEntry XOPCostTbl[] = {
    { ISD::ROTL, MVT::v4i64, 4 },
    { ISD::ROTL, MVT::v8i32, 4 },
    { ISD::ROTL, MVT::v16i16, 4 },
    { ISD::ROTL, MVT::v32i8, 4 },
    { ISD::ROTL, MVT::v2i64, 1 },
    { ISD::ROTL, MVT::v4i32, 1 },
    { ISD::ROTL, MVT::v8i16, 1 },
    { ISD::ROTL, MVT::v16i8, 1 },
    { ISD::ROTR, MVT::v4i64, 6 },
    { ISD::ROTR, MVT::v8i32, 6 },
    { ISD::ROTR, MVT::v16i16, 6 },
    { ISD::ROTR, MVT::v32i8, 6 },
    { ISD::ROTR, MVT::v2i64, 2 },
    { ISD::ROTR, MVT::v4i32, 2 },
    { ISD::ROTR, MVT::v8i16, 2 },
    { ISD::ROTR, MVT::v16i8, 2 }
  };
  static const CostTblEntry X64CostTbl[] = { // 64-bit targets
    { ISD::ROTL, MVT::i64, 1 },
    { ISD::ROTR, MVT::i64, 1 },
    { ISD::FSHL, MVT::i64, 4 }
  };
  static const CostTblEntry X86CostTbl[] = { // 32 or 64-bit targets
    { ISD::ROTL, MVT::i32, 1 },
    { ISD::ROTL, MVT::i16, 1 },
    { ISD::ROTL, MVT::i8, 1 },
    { ISD::ROTR, MVT::i32, 1 },
    { ISD::ROTR, MVT::i16, 1 },
    { ISD::ROTR, MVT::i8, 1 },
    { ISD::FSHL, MVT::i32, 4 },
    { ISD::FSHL, MVT::i16, 4 },
    { ISD::FSHL, MVT::i8, 4 }
  };

  Intrinsic::ID IID = ICA.getID();
  Type *RetTy = ICA.getReturnType();
  const SmallVectorImpl<const Value *> &Args = ICA.getArgs();
  unsigned ISD = ISD::DELETED_NODE;
  switch (IID) {
  default:
    break;
  case Intrinsic::fshl:
    ISD = ISD::FSHL;
    if (Args[0] == Args[1])
      ISD = ISD::ROTL;
    break;
  case Intrinsic::fshr:
    // FSHR has same costs so don't duplicate.
    ISD = ISD::FSHL;
    if (Args[0] == Args[1])
      ISD = ISD::ROTR;
    break;
  }

  if (ISD != ISD::DELETED_NODE) {
    // Legalize the type.
    std::pair<InstructionCost, MVT> LT =
        TLI->getTypeLegalizationCost(DL, RetTy);
    MVT MTy = LT.second;

    // Attempt to lookup cost.
    if (ST->hasBWI())
      if (const auto *Entry = CostTableLookup(AVX512BWCostTbl, ISD, MTy))
        return LT.first * Entry->Cost;

    if (ST->hasAVX512())
      if (const auto *Entry = CostTableLookup(AVX512CostTbl, ISD, MTy))
        return LT.first * Entry->Cost;

    if (ST->hasXOP())
      if (const auto *Entry = CostTableLookup(XOPCostTbl, ISD, MTy))
        return LT.first * Entry->Cost;

    if (ST->is64Bit())
      if (const auto *Entry = CostTableLookup(X64CostTbl, ISD, MTy))
        return LT.first * Entry->Cost;

    if (const auto *Entry = CostTableLookup(X86CostTbl, ISD, MTy))
      return LT.first * Entry->Cost;
  }

  return BaseT::getIntrinsicInstrCost(ICA, CostKind);
}
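// Cost of a single extractelement/insertelement, including any register file
// moves and the subvector extract/insert needed to reach lanes above the
// first 128 bits of a legalized vector.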
InstructionCost X86TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
                                               unsigned Index) {
  static const CostTblEntry SLMCostTbl[] = {
    { ISD::EXTRACT_VECTOR_ELT, MVT::i8, 4 },
    { ISD::EXTRACT_VECTOR_ELT, MVT::i16, 4 },
    { ISD::EXTRACT_VECTOR_ELT, MVT::i32, 4 },
    { ISD::EXTRACT_VECTOR_ELT, MVT::i64, 7 }
  };

  assert(Val->isVectorTy() && "This must be a vector type");
  Type *ScalarType = Val->getScalarType();
  int RegisterFileMoveCost = 0;

  // Non-immediate extraction/insertion can be handled as a sequence of
  // aliased loads+stores via the stack.
  if (Index == -1U && (Opcode == Instruction::ExtractElement ||
                       Opcode == Instruction::InsertElement)) {
    // TODO: On some SSE41+ targets, we expand to cmp+splat+select patterns:
    // inselt N0, N1, N2 --> select (SplatN2 == {0,1,2...}) ? SplatN1 : N0.

    // TODO: Move this to BasicTTIImpl.h? We'd need better gep + index handling.
    assert(isa<FixedVectorType>(Val) && "Fixed vector type expected");
    Align VecAlign = DL.getPrefTypeAlign(Val);
    Align SclAlign = DL.getPrefTypeAlign(ScalarType);

    // Extract - store vector to stack, load scalar.
    if (Opcode == Instruction::ExtractElement) {
      return getMemoryOpCost(Instruction::Store, Val, VecAlign, 0,
                             TTI::TargetCostKind::TCK_RecipThroughput) +
             getMemoryOpCost(Instruction::Load, ScalarType, SclAlign, 0,
                             TTI::TargetCostKind::TCK_RecipThroughput);
    }

    // Insert - store vector to stack, store scalar, load vector.
    if (Opcode == Instruction::InsertElement) {
      return getMemoryOpCost(Instruction::Store, Val, VecAlign, 0,
                             TTI::TargetCostKind::TCK_RecipThroughput) +
             getMemoryOpCost(Instruction::Store, ScalarType, SclAlign, 0,
                             TTI::TargetCostKind::TCK_RecipThroughput) +
             getMemoryOpCost(Instruction::Load, Val, VecAlign, 0,
                             TTI::TargetCostKind::TCK_RecipThroughput);
    }
  }

  if (Index != -1U && (Opcode == Instruction::ExtractElement ||
                       Opcode == Instruction::InsertElement)) {
    // Legalize the type.
    std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Val);

    // This type is legalized to a scalar type.
    if (!LT.second.isVector())
      return 0;

    // The type may be split. Normalize the index to the new type.
    unsigned NumElts = LT.second.getVectorNumElements();
    unsigned SubNumElts = NumElts;
    Index = Index % NumElts;

    // For >128-bit vectors, we need to extract higher 128-bit subvectors.
    // For inserts, we also need to insert the subvector back.
    if (LT.second.getSizeInBits() > 128) {
      assert((LT.second.getSizeInBits() % 128) == 0 && "Illegal vector");
      unsigned NumSubVecs = LT.second.getSizeInBits() / 128;
      SubNumElts = NumElts / NumSubVecs;
      if (SubNumElts <= Index) {
        RegisterFileMoveCost += (Opcode == Instruction::InsertElement ? 2 : 1);
        Index %= SubNumElts;
      }
    }

    if (Index == 0) {
      // Floating point scalars are already located in index #0.
      // Many insertions to #0 can fold away for scalar fp-ops, so assume the
      // same holds for all of them.
      if (ScalarType->isFloatingPointTy())
        return RegisterFileMoveCost;

      // Assume movd/movq XMM -> GPR is relatively cheap on all targets.
      if (ScalarType->isIntegerTy() && Opcode == Instruction::ExtractElement)
        return 1 + RegisterFileMoveCost;
    }

    int ISD = TLI->InstructionOpcodeToISD(Opcode);
    assert(ISD && "Unexpected vector opcode");
    MVT MScalarTy = LT.second.getScalarType();
    if (ST->useSLMArithCosts())
      if (auto *Entry = CostTableLookup(SLMCostTbl, ISD, MScalarTy))
        return Entry->Cost + RegisterFileMoveCost;

    // Assume pinsr/pextr XMM <-> GPR is relatively cheap on all targets.
    if ((MScalarTy == MVT::i16 && ST->hasSSE2()) ||
        (MScalarTy.isInteger() && ST->hasSSE41()))
      return 1 + RegisterFileMoveCost;

    // Assume insertps is relatively cheap on all targets.
    if (MScalarTy == MVT::f32 && ST->hasSSE41() &&
        Opcode == Instruction::InsertElement)
      return 1 + RegisterFileMoveCost;

    // For extractions we just need to shuffle the element to index 0, which
    // should be very cheap (assume cost = 1). For insertions we need to
    // shuffle the element to its destination. In both cases we must handle
    // the subvector move(s).
    // If the vector type is already less than 128-bits then don't reduce it.
    // TODO: Under what circumstances should we shuffle using the full width?
    InstructionCost ShuffleCost = 1;
    if (Opcode == Instruction::InsertElement) {
      auto *SubTy = cast<VectorType>(Val);
      EVT VT = TLI->getValueType(DL, Val);
      if (VT.getScalarType() != MScalarTy || VT.getSizeInBits() >= 128)
        SubTy = FixedVectorType::get(ScalarType, SubNumElts);
      ShuffleCost =
          getShuffleCost(TTI::SK_PermuteTwoSrc, SubTy, None, 0, SubTy);
    }
    int IntOrFpCost = ScalarType->isFloatingPointTy() ? 0 : 1;
    return ShuffleCost + IntOrFpCost + RegisterFileMoveCost;
  }

  // Add to the base cost if we know that the extracted element of a vector is
  // destined to be moved to and used in the integer register file.
  if (Opcode == Instruction::ExtractElement && ScalarType->isPointerTy())
    RegisterFileMoveCost += 1;

  return BaseT::getVectorInstrCost(Opcode, Val, Index) + RegisterFileMoveCost;
}
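// Scalarization overhead for the demanded lanes of Ty: the cost of inserting
// scalars into the vector (when Insert is set) and/or extracting scalars out
// of it (when Extract is set).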
InstructionCost X86TTIImpl::getScalarizationOverhead(VectorType *Ty,
                                                     const APInt &DemandedElts,
                                                     bool Insert,
                                                     bool Extract) {
  InstructionCost Cost = 0;

  // For insertions, an ISD::BUILD_VECTOR style vector initialization can be
  // much cheaper than an accumulation of ISD::INSERT_VECTOR_ELT.
  if (Insert) {
    std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
    MVT MScalarTy = LT.second.getScalarType();

    if ((MScalarTy == MVT::i16 && ST->hasSSE2()) ||
        (MScalarTy.isInteger() && ST->hasSSE41()) ||
        (MScalarTy == MVT::f32 && ST->hasSSE41())) {
      // For types we can insert directly, insertion into 128-bit subvectors is
      // cheap, followed by a cheap chain of concatenations.
      if (LT.second.getSizeInBits() <= 128) {
        Cost +=
            BaseT::getScalarizationOverhead(Ty, DemandedElts, Insert, false);
      } else {
        // In each 128-bit lane, if at least one index is demanded but not all
        // indices are demanded, and this 128-bit lane is not the first lane of
        // the legalized vector, then this lane needs an extracti128. If a
        // 128-bit lane has at least one demanded index, it needs an
        // inserti128.
        // The following cases will help you build a better understanding:
        // Assume we insert several elements into a v8i32 vector in avx2,
        // Case#1: inserting into index 1 needs vpinsrd + inserti128.
        // Case#2: inserting into index 5 needs extracti128 + vpinsrd +
        //         inserti128.
        // Case#3: inserting into indices 4,5,6,7 needs 4*vpinsrd + inserti128.
        const int CostValue = *LT.first.getValue();
        assert(CostValue >= 0 && "Negative cost!");
        unsigned Num128Lanes = LT.second.getSizeInBits() / 128 * CostValue;
        unsigned NumElts = LT.second.getVectorNumElements() * CostValue;
        APInt WidenedDemandedElts = DemandedElts.zextOrSelf(NumElts);
        unsigned Scale = NumElts / Num128Lanes;
        // We iterate over each 128-bit lane and check whether it needs an
        // extracti128/inserti128.
        for (unsigned I = 0; I < NumElts; I += Scale) {
          APInt Mask = WidenedDemandedElts.getBitsSet(NumElts, I, I + Scale);
          APInt MaskedDE = Mask & WidenedDemandedElts;
          unsigned Population = MaskedDE.countPopulation();
          Cost += (Population > 0 && Population != Scale &&
                   I % LT.second.getVectorNumElements() != 0);
          Cost += Population > 0;
        }
        Cost += DemandedElts.countPopulation();

        // For vXf32 cases, insertion into the 0'th index in each v4f32
        // 128-bit vector is free.
        // NOTE: This assumes legalization widens vXf32 vectors.
        if (MScalarTy == MVT::f32)
          for (unsigned i = 0, e = cast<FixedVectorType>(Ty)->getNumElements();
               i < e; i += 4)
            if (DemandedElts[i])
              Cost--;
      }
    } else if (LT.second.isVector()) {
      // Without fast insertion, we need to use MOVD/MOVQ to pass each demanded
      // integer element as a SCALAR_TO_VECTOR, then we build the vector as a
      // series of UNPCK followed by CONCAT_VECTORS - all of these can be
      // considered cheap.
      if (Ty->isIntOrIntVectorTy())
        Cost += DemandedElts.countPopulation();

      // Get the smaller of the legalized or original pow2-extended number of
      // vector elements, which represents the number of unpacks we'll end up
      // performing.
      unsigned NumElts = LT.second.getVectorNumElements();
      unsigned Pow2Elts =
          PowerOf2Ceil(cast<FixedVectorType>(Ty)->getNumElements());
      Cost += (std::min<unsigned>(NumElts, Pow2Elts) - 1) * LT.first;
    }
  }

  // TODO: Use default extraction for now, but we should investigate extending
  // this to handle repeated subvector extraction.
  if (Extract)
    Cost += BaseT::getScalarizationOverhead(Ty, DemandedElts, false, Extract);

  return Cost;
}
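// Cost of a replication shuffle: each of the VF source elements is repeated
// ReplicationFactor times in the result. Only AVX-512 targets are modelled
// here; element types without a native shuffle are first promoted to a
// shuffleable width.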
InstructionCost
X86TTIImpl::getReplicationShuffleCost(Type *EltTy, int ReplicationFactor,
                                      int VF, const APInt &DemandedDstElts,
                                      TTI::TargetCostKind CostKind) {
  const unsigned EltTyBits = DL.getTypeSizeInBits(EltTy);
  // We don't differentiate element types here, only element bit width.
  EltTy = IntegerType::getIntNTy(EltTy->getContext(), EltTyBits);

  auto bailout = [&]() {
    return BaseT::getReplicationShuffleCost(EltTy, ReplicationFactor, VF,
                                            DemandedDstElts, CostKind);
  };

  // For now, only deal with AVX512 cases.
  if (!ST->hasAVX512())
    return bailout();

  // Do we have a native shuffle for this element type, or should we promote?
  unsigned PromEltTyBits = EltTyBits;
  switch (EltTyBits) {
  case 32:
  case 64:
    break; // AVX512F.
  case 16:
    if (!ST->hasBWI())
      PromEltTyBits = 32; // promote to i32, AVX512F.
    break; // AVX512BW
  case 8:
    if (!ST->hasVBMI())
      PromEltTyBits = 32; // promote to i32, AVX512F.
    break; // AVX512VBMI
  case 1:
    // There is no support for shuffling i1 elements. We *must* promote.
    if (ST->hasBWI()) {
      if (ST->hasVBMI())
        PromEltTyBits = 8; // promote to i8, AVX512VBMI.
      else
        PromEltTyBits = 16; // promote to i16, AVX512BW.
      break;
    }
    if (ST->hasDQI()) {
      PromEltTyBits = 32; // promote to i32, AVX512F.
      break;
    }
    return bailout();
  default:
    return bailout();
  }
  auto *PromEltTy = IntegerType::getIntNTy(EltTy->getContext(), PromEltTyBits);

  auto *SrcVecTy = FixedVectorType::get(EltTy, VF);
  auto *PromSrcVecTy = FixedVectorType::get(PromEltTy, VF);

  int NumDstElements = VF * ReplicationFactor;

  auto *PromDstVecTy = FixedVectorType::get(PromEltTy, NumDstElements);
  auto *DstVecTy = FixedVectorType::get(EltTy, NumDstElements);

  // Legalize the types.
  MVT LegalSrcVecTy = TLI->getTypeLegalizationCost(DL, SrcVecTy).second;
  MVT LegalPromSrcVecTy = TLI->getTypeLegalizationCost(DL, PromSrcVecTy).second;
  MVT LegalPromDstVecTy = TLI->getTypeLegalizationCost(DL, PromDstVecTy).second;
  MVT LegalDstVecTy = TLI->getTypeLegalizationCost(DL, DstVecTy).second;
  // They should have legalized into vector types.
  if (!LegalSrcVecTy.isVector() || !LegalPromSrcVecTy.isVector() ||
      !LegalPromDstVecTy.isVector() || !LegalDstVecTy.isVector())
    return bailout();

  if (PromEltTyBits != EltTyBits) {
    // If we have to perform the shuffle with wider elt type than our data type,
    // then we will first need to anyext (we don't care about the new bits)
    // the source elements, and then truncate Dst elements.
    InstructionCost PromotionCost;
    PromotionCost += getCastInstrCost(
        Instruction::SExt, /*Dst=*/PromSrcVecTy, /*Src=*/SrcVecTy,
        TargetTransformInfo::CastContextHint::None, CostKind);
    PromotionCost +=
        getCastInstrCost(Instruction::Trunc, /*Dst=*/DstVecTy,
                         /*Src=*/PromDstVecTy,
                         TargetTransformInfo::CastContextHint::None, CostKind);
    return PromotionCost + getReplicationShuffleCost(PromEltTy,
                                                     ReplicationFactor, VF,
                                                     DemandedDstElts, CostKind);
  }

  assert(LegalSrcVecTy.getScalarSizeInBits() == EltTyBits &&
         LegalSrcVecTy.getScalarType() == LegalDstVecTy.getScalarType() &&
         "We expect that the legalization doesn't affect the element width, "
         "doesn't coalesce/split elements.");

  unsigned NumEltsPerDstVec = LegalDstVecTy.getVectorNumElements();
  unsigned NumDstVectors =
      divideCeil(DstVecTy->getNumElements(), NumEltsPerDstVec);

  auto *SingleDstVecTy = FixedVectorType::get(EltTy, NumEltsPerDstVec);

  // Not all the produced Dst elements may be demanded. In our case,
  // given that a single Dst vector is formed by a single shuffle,
  // if all elements that will form a single Dst vector aren't demanded,
  // then we won't need to do that shuffle, so adjust the cost accordingly.
  APInt DemandedDstVectors = APIntOps::ScaleBitMask(
      DemandedDstElts.zextOrSelf(NumDstVectors * NumEltsPerDstVec),
      NumDstVectors);
  unsigned NumDstVectorsDemanded = DemandedDstVectors.countPopulation();

  InstructionCost SingleShuffleCost =
      getShuffleCost(TTI::SK_PermuteSingleSrc, SingleDstVecTy,
                     /*Mask=*/None, /*Index=*/0, /*SubTp=*/nullptr);
  return NumDstVectorsDemanded * SingleShuffleCost;
}
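// Vector load/store cost: after legalization, cover the vector with the
// widest operation that still fits, halving the operation width as the
// remaining element count shrinks, and charge for any subvector
// insert/extract needed along the way.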
InstructionCost X86TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
                                            MaybeAlign Alignment,
                                            unsigned AddressSpace,
                                            TTI::TargetCostKind CostKind,
                                            const Instruction *I) {
  // TODO: Handle other cost kinds.
  if (CostKind != TTI::TCK_RecipThroughput) {
    if (auto *SI = dyn_cast_or_null<StoreInst>(I)) {
      // Store instruction with index and scale costs 2 Uops.
      // Check the preceding GEP to identify non-const indices.
      if (auto *GEP = dyn_cast<GetElementPtrInst>(SI->getPointerOperand())) {
        if (!all_of(GEP->indices(), [](Value *V) { return isa<Constant>(V); }))
          return TTI::TCC_Basic * 2;
      }
    }
    return TTI::TCC_Basic;
  }

  assert((Opcode == Instruction::Load || Opcode == Instruction::Store) &&
         "Invalid Opcode");
  // Type legalization can't handle structs
  if (TLI->getValueType(DL, Src, true) == MVT::Other)
    return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
                                  CostKind);

  // Legalize the type.
  std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Src);

  auto *VTy = dyn_cast<FixedVectorType>(Src);

  // Handle the simple case of non-vectors.
  // NOTE: this assumes that legalization never creates vector from scalars!
  if (!VTy || !LT.second.isVector())
    // Each load/store unit costs 1.
    return LT.first * 1;

  bool IsLoad = Opcode == Instruction::Load;

  Type *EltTy = VTy->getElementType();

  const int EltTyBits = DL.getTypeSizeInBits(EltTy);

  InstructionCost Cost = 0;

  // Source of truth: how many elements were there in the original IR vector?
  const unsigned SrcNumElt = VTy->getNumElements();

  // How far have we gotten?
  int NumEltRemaining = SrcNumElt;
  // Note that we intentionally capture by-reference, NumEltRemaining changes.
  auto NumEltDone = [&]() { return SrcNumElt - NumEltRemaining; };

  const int MaxLegalOpSizeBytes = divideCeil(LT.second.getSizeInBits(), 8);

  // Note that even if we can store 64 bits of an XMM, we still operate on XMM.
  const unsigned XMMBits = 128;
  if (XMMBits % EltTyBits != 0)
    // Vector size must be a multiple of the element size. I.e. no padding.
    return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
                                  CostKind);
  const int NumEltPerXMM = XMMBits / EltTyBits;

  auto *XMMVecTy = FixedVectorType::get(EltTy, NumEltPerXMM);

  for (int CurrOpSizeBytes = MaxLegalOpSizeBytes, SubVecEltsLeft = 0;
       NumEltRemaining > 0; CurrOpSizeBytes /= 2) {
    // How many elements would a single op deal with at once?
    if ((8 * CurrOpSizeBytes) % EltTyBits != 0)
      // Vector size must be a multiple of the element size. I.e. no padding.
      return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
                                    CostKind);
    int CurrNumEltPerOp = (8 * CurrOpSizeBytes) / EltTyBits;

    assert(CurrOpSizeBytes > 0 && CurrNumEltPerOp > 0 && "How'd we get here?");
    assert((((NumEltRemaining * EltTyBits) < (2 * 8 * CurrOpSizeBytes)) ||
            (CurrOpSizeBytes == MaxLegalOpSizeBytes)) &&
           "Unless we haven't halved the op size yet, "
           "we have less than two op's sized units of work left.");

    auto *CurrVecTy = CurrNumEltPerOp > NumEltPerXMM
                          ? FixedVectorType::get(EltTy, CurrNumEltPerOp)
                          : XMMVecTy;

    assert(CurrVecTy->getNumElements() % CurrNumEltPerOp == 0 &&
           "After halving sizes, the vector elt count is no longer a multiple "
           "of number of elements per operation?");
    auto *CoalescedVecTy =
        CurrNumEltPerOp == 1
            ? CurrVecTy
            : FixedVectorType::get(
                  IntegerType::get(Src->getContext(),
                                   EltTyBits * CurrNumEltPerOp),
                  CurrVecTy->getNumElements() / CurrNumEltPerOp);
    assert(DL.getTypeSizeInBits(CoalescedVecTy) ==
               DL.getTypeSizeInBits(CurrVecTy) &&
           "coalescing elements doesn't change vector width.");

    while (NumEltRemaining > 0) {
      assert(SubVecEltsLeft >= 0 && "Subreg element count overconsumption?");

      // Can we use this vector size, as per the remaining element count?
      // Iff the vector is naturally aligned, we can do a wide load regardless.
      if (NumEltRemaining < CurrNumEltPerOp &&
          (!IsLoad || Alignment.valueOrOne() < CurrOpSizeBytes) &&
          CurrOpSizeBytes != 1)
        break; // Try smaller vector size.

      bool Is0thSubVec = (NumEltDone() % LT.second.getVectorNumElements()) == 0;

      // If we have fully processed the previous reg, we need to replenish it.
      if (SubVecEltsLeft == 0) {
        SubVecEltsLeft += CurrVecTy->getNumElements();
        // And that's free only for the 0'th subvector of a legalized vector.
        if (!Is0thSubVec)
          Cost += getShuffleCost(IsLoad ? TTI::ShuffleKind::SK_InsertSubvector
                                        : TTI::ShuffleKind::SK_ExtractSubvector,
                                 VTy, None, NumEltDone(), CurrVecTy);
      }

      // While we can directly load/store ZMM, YMM, and 64-bit halves of XMM,
      // for smaller widths (32/16/8) we have to insert/extract them separately.
      // Again, it's free for the 0'th subreg (if op is 32/64 bit wide,
      // but let's pretend that it is also true for 16/8 bit wide ops...)
      if (CurrOpSizeBytes <= 32 / 8 && !Is0thSubVec) {
        int NumEltDoneInCurrXMM = NumEltDone() % NumEltPerXMM;
        assert(NumEltDoneInCurrXMM % CurrNumEltPerOp == 0 && "");
        int CoalescedVecEltIdx = NumEltDoneInCurrXMM / CurrNumEltPerOp;
        APInt DemandedElts =
            APInt::getBitsSet(CoalescedVecTy->getNumElements(),
                              CoalescedVecEltIdx, CoalescedVecEltIdx + 1);
        assert(DemandedElts.countPopulation() == 1 && "Inserting single value");
        Cost += getScalarizationOverhead(CoalescedVecTy, DemandedElts, IsLoad,
                                         !IsLoad);
      }

      // This isn't exactly right. We're using slow unaligned 32-byte accesses
      // as a proxy for a double-pumped AVX memory interface such as on
      // Sandybridge.
      if (CurrOpSizeBytes == 32 && ST->isUnalignedMem32Slow())
        Cost += 2;
      else
        Cost += 1;

      SubVecEltsLeft -= CurrNumEltPerOp;
      NumEltRemaining -= CurrNumEltPerOp;
      Alignment = commonAlignment(Alignment.valueOrOne(), CurrOpSizeBytes);
    }
  }

  assert(NumEltRemaining <= 0 && "Should have processed all the elements.");

  return Cost;
}
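// Masked load/store cost: scalarize with a per-element compare + branch when
// the target has no legal masked operation for this type, otherwise cost the
// native masked op, which is cheaper on AVX-512 than on earlier subtargets.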
InstructionCost
X86TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *SrcTy, Align Alignment,
                                  unsigned AddressSpace,
                                  TTI::TargetCostKind CostKind) {
  bool IsLoad = (Instruction::Load == Opcode);
  bool IsStore = (Instruction::Store == Opcode);

  auto *SrcVTy = dyn_cast<FixedVectorType>(SrcTy);
  if (!SrcVTy)
    // For a scalar, take the regular cost without the mask.
    return getMemoryOpCost(Opcode, SrcTy, Alignment, AddressSpace, CostKind);

  unsigned NumElem = SrcVTy->getNumElements();
  auto *MaskTy =
      FixedVectorType::get(Type::getInt8Ty(SrcVTy->getContext()), NumElem);
  if ((IsLoad && !isLegalMaskedLoad(SrcVTy, Alignment)) ||
      (IsStore && !isLegalMaskedStore(SrcVTy, Alignment))) {
    // Scalarization
    APInt DemandedElts = APInt::getAllOnes(NumElem);
    InstructionCost MaskSplitCost =
        getScalarizationOverhead(MaskTy, DemandedElts, false, true);
    InstructionCost ScalarCompareCost = getCmpSelInstrCost(
        Instruction::ICmp, Type::getInt8Ty(SrcVTy->getContext()), nullptr,
        CmpInst::BAD_ICMP_PREDICATE, CostKind);
    InstructionCost BranchCost = getCFInstrCost(Instruction::Br, CostKind);
    InstructionCost MaskCmpCost = NumElem * (BranchCost + ScalarCompareCost);
    InstructionCost ValueSplitCost =
        getScalarizationOverhead(SrcVTy, DemandedElts, IsLoad, IsStore);
    InstructionCost MemopCost =
        NumElem * BaseT::getMemoryOpCost(Opcode, SrcVTy->getScalarType(),
                                         Alignment, AddressSpace, CostKind);
    return MemopCost + ValueSplitCost + MaskSplitCost + MaskCmpCost;
  }

  // Legalize the type.
  std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, SrcVTy);
  auto VT = TLI->getValueType(DL, SrcVTy);
  InstructionCost Cost = 0;
  if (VT.isSimple() && LT.second != VT.getSimpleVT() &&
      LT.second.getVectorNumElements() == NumElem)
    // Promotion requires an extend/truncate for the data and a shuffle for the
    // mask.
    Cost += getShuffleCost(TTI::SK_PermuteTwoSrc, SrcVTy, None, 0, nullptr) +
            getShuffleCost(TTI::SK_PermuteTwoSrc, MaskTy, None, 0, nullptr);
  else if (LT.first * LT.second.getVectorNumElements() > NumElem) {
    auto *NewMaskTy = FixedVectorType::get(MaskTy->getElementType(),
                                           LT.second.getVectorNumElements());
    // Expanding requires filling the mask with zeroes.
    Cost += getShuffleCost(TTI::SK_InsertSubvector, NewMaskTy, None, 0, MaskTy);
  }

  // Pre-AVX512 - each maskmov load costs 2 + store costs ~8.
  if (!ST->hasAVX512())
    return Cost + LT.first * (IsLoad ? 2 : 8);

  // AVX-512 masked load/store is cheaper.
  return Cost + LT.first;
}
InstructionCost X86TTIImpl::getAddressComputationCost(Type *Ty,
                                                      ScalarEvolution *SE,
                                                      const SCEV *Ptr) {
  // Address computations in vectorized code with non-consecutive addresses
  // will likely result in more instructions compared to scalar code where
  // the computation can more often be merged into the index mode. The
  // resulting extra micro-ops can significantly decrease throughput.
  const unsigned NumVectorInstToHideOverhead = 10;

  // Cost modeling of strided access computation is hidden by the indexing
  // modes of X86 regardless of the stride value. We don't believe that there
  // is a difference between constant strided access in general and a constant
  // stride value which is less than or equal to 64.
  // Even in the case of a (loop invariant) stride whose value is not known at
  // compile time, the address computation will not incur more than one extra
  // ADD instruction.
  if (Ty->isVectorTy() && SE && !ST->hasAVX2()) {
    // TODO: AVX2 is the current cut-off because we don't have correct
    //       interleaving costs for prior ISA's.
    if (!BaseT::isStridedAccess(Ptr))
      return NumVectorInstToHideOverhead;
    if (!BaseT::getConstantStrideStep(SE, Ptr))
      return 1;
  }

  return BaseT::getAddressComputationCost(Ty, SE, Ptr);
}
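// Unordered arithmetic reductions: split illegal types first, then model a
// log2(N)-step shuffle+op tree over the legal type, with dedicated tables for
// bool allof/anyof reductions that lower via movmsk.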
InstructionCost
X86TTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy,
                                       Optional<FastMathFlags> FMF,
                                       TTI::TargetCostKind CostKind) {
  if (TTI::requiresOrderedReduction(FMF))
    return BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);

  // We use the Intel Architecture Code Analyzer (IACA) to measure the
  // throughput and use it as the cost.
  static const CostTblEntry SLMCostTblNoPairWise[] = {
    { ISD::FADD, MVT::v2f64, 3 },
    { ISD::ADD, MVT::v2i64, 5 },
  };

  static const CostTblEntry SSE2CostTblNoPairWise[] = {
    { ISD::FADD, MVT::v2f64, 2 },
    { ISD::FADD, MVT::v2f32, 2 },
    { ISD::FADD, MVT::v4f32, 4 },
    { ISD::ADD, MVT::v2i64, 2 }, // The data reported by the IACA tool is "1.6".
    { ISD::ADD, MVT::v2i32, 2 }, // FIXME: chosen to be less than v4i32
    { ISD::ADD, MVT::v4i32, 3 }, // The data reported by the IACA tool is "3.3".
    { ISD::ADD, MVT::v2i16, 2 }, // The data reported by the IACA tool is "4.3".
    { ISD::ADD, MVT::v4i16, 3 }, // The data reported by the IACA tool is "4.3".
    { ISD::ADD, MVT::v8i16, 4 }, // The data reported by the IACA tool is "4.3".
    { ISD::ADD, MVT::v2i8, 2 },
    { ISD::ADD, MVT::v4i8, 2 },
    { ISD::ADD, MVT::v8i8, 2 },
    { ISD::ADD, MVT::v16i8, 3 },
  };

  static const CostTblEntry AVX1CostTblNoPairWise[] = {
    { ISD::FADD, MVT::v4f64, 3 },
    { ISD::FADD, MVT::v4f32, 3 },
    { ISD::FADD, MVT::v8f32, 4 },
    { ISD::ADD, MVT::v2i64, 1 }, // The data reported by the IACA tool is "1.5".
    { ISD::ADD, MVT::v4i64, 3 },
    { ISD::ADD, MVT::v8i32, 5 },
    { ISD::ADD, MVT::v16i16, 5 },
    { ISD::ADD, MVT::v32i8, 4 },
  };

  int ISD = TLI->InstructionOpcodeToISD(Opcode);
  assert(ISD && "Invalid opcode");

  // Before legalizing the type, give a chance to look up illegal narrow types
  // in the table.
  // FIXME: Is there a better way to do this?
  EVT VT = TLI->getValueType(DL, ValTy);
  if (VT.isSimple()) {
    MVT MTy = VT.getSimpleVT();
    if (ST->useSLMArithCosts())
      if (const auto *Entry = CostTableLookup(SLMCostTblNoPairWise, ISD, MTy))
        return Entry->Cost;

    if (ST->hasAVX())
      if (const auto *Entry = CostTableLookup(AVX1CostTblNoPairWise, ISD, MTy))
        return Entry->Cost;

    if (ST->hasSSE2())
      if (const auto *Entry = CostTableLookup(SSE2CostTblNoPairWise, ISD, MTy))
        return Entry->Cost;
  }

  std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy);

  MVT MTy = LT.second;

  auto *ValVTy = cast<FixedVectorType>(ValTy);

  // Special case: vXi8 mul reductions are performed as vXi16.
  if (ISD == ISD::MUL && MTy.getScalarType() == MVT::i8) {
    auto *WideSclTy = IntegerType::get(ValVTy->getContext(), 16);
    auto *WideVecTy = FixedVectorType::get(WideSclTy, ValVTy->getNumElements());
    return getCastInstrCost(Instruction::ZExt, WideVecTy, ValTy,
                            TargetTransformInfo::CastContextHint::None,
                            CostKind) +
           getArithmeticReductionCost(Opcode, WideVecTy, FMF, CostKind);
  }

  InstructionCost ArithmeticCost = 0;
  if (LT.first != 1 && MTy.isVector() &&
      MTy.getVectorNumElements() < ValVTy->getNumElements()) {
    // Type needs to be split. We need LT.first - 1 arithmetic ops.
    auto *SingleOpTy = FixedVectorType::get(ValVTy->getElementType(),
                                            MTy.getVectorNumElements());
    ArithmeticCost = getArithmeticInstrCost(Opcode, SingleOpTy, CostKind);
    ArithmeticCost *= LT.first - 1;
  }

  if (ST->useSLMArithCosts())
    if (const auto *Entry = CostTableLookup(SLMCostTblNoPairWise, ISD, MTy))
      return ArithmeticCost + Entry->Cost;

  if (ST->hasAVX())
    if (const auto *Entry = CostTableLookup(AVX1CostTblNoPairWise, ISD, MTy))
      return ArithmeticCost + Entry->Cost;

  if (ST->hasSSE2())
    if (const auto *Entry = CostTableLookup(SSE2CostTblNoPairWise, ISD, MTy))
      return ArithmeticCost + Entry->Cost;

  // FIXME: These assume a naive kshift+binop lowering, which is probably
  // conservative in most cases.
  static const CostTblEntry AVX512BoolReduction[] = {
    { ISD::AND, MVT::v2i1, 3 },
    { ISD::AND, MVT::v4i1, 5 },
    { ISD::AND, MVT::v8i1, 7 },
    { ISD::AND, MVT::v16i1, 9 },
    { ISD::AND, MVT::v32i1, 11 },
    { ISD::AND, MVT::v64i1, 13 },
    { ISD::OR, MVT::v2i1, 3 },
    { ISD::OR, MVT::v4i1, 5 },
    { ISD::OR, MVT::v8i1, 7 },
    { ISD::OR, MVT::v16i1, 9 },
    { ISD::OR, MVT::v32i1, 11 },
    { ISD::OR, MVT::v64i1, 13 },
  };
  static const CostTblEntry AVX2BoolReduction[] = {
    { ISD::AND, MVT::v16i16, 2 }, // vpmovmskb + cmp
    { ISD::AND, MVT::v32i8, 2 }, // vpmovmskb + cmp
    { ISD::OR, MVT::v16i16, 2 }, // vpmovmskb + cmp
    { ISD::OR, MVT::v32i8, 2 }, // vpmovmskb + cmp
  };
  static const CostTblEntry AVX1BoolReduction[] = {
    { ISD::AND, MVT::v4i64, 2 }, // vmovmskpd + cmp
    { ISD::AND, MVT::v8i32, 2 }, // vmovmskps + cmp
    { ISD::AND, MVT::v16i16, 4 }, // vextractf128 + vpand + vpmovmskb + cmp
    { ISD::AND, MVT::v32i8, 4 }, // vextractf128 + vpand + vpmovmskb + cmp
    { ISD::OR, MVT::v4i64, 2 }, // vmovmskpd + cmp
    { ISD::OR, MVT::v8i32, 2 }, // vmovmskps + cmp
    { ISD::OR, MVT::v16i16, 4 }, // vextractf128 + vpor + vpmovmskb + cmp
    { ISD::OR, MVT::v32i8, 4 }, // vextractf128 + vpor + vpmovmskb + cmp
  };
  static const CostTblEntry SSE2BoolReduction[] = {
    { ISD::AND, MVT::v2i64, 2 }, // movmskpd + cmp
    { ISD::AND, MVT::v4i32, 2 }, // movmskps + cmp
    { ISD::AND, MVT::v8i16, 2 }, // pmovmskb + cmp
    { ISD::AND, MVT::v16i8, 2 }, // pmovmskb + cmp
    { ISD::OR, MVT::v2i64, 2 }, // movmskpd + cmp
    { ISD::OR, MVT::v4i32, 2 }, // movmskps + cmp
    { ISD::OR, MVT::v8i16, 2 }, // pmovmskb + cmp
    { ISD::OR, MVT::v16i8, 2 }, // pmovmskb + cmp
  };

  // Handle bool allof/anyof patterns.
  if (ValVTy->getElementType()->isIntegerTy(1)) {
    InstructionCost ArithmeticCost = 0;
    if (LT.first != 1 && MTy.isVector() &&
        MTy.getVectorNumElements() < ValVTy->getNumElements()) {
      // Type needs to be split. We need LT.first - 1 arithmetic ops.
      auto *SingleOpTy = FixedVectorType::get(ValVTy->getElementType(),
                                              MTy.getVectorNumElements());
      ArithmeticCost = getArithmeticInstrCost(Opcode, SingleOpTy, CostKind);
      ArithmeticCost *= LT.first - 1;
    }

    if (ST->hasAVX512())
      if (const auto *Entry = CostTableLookup(AVX512BoolReduction, ISD, MTy))
        return ArithmeticCost + Entry->Cost;
    if (ST->hasAVX2())
      if (const auto *Entry = CostTableLookup(AVX2BoolReduction, ISD, MTy))
        return ArithmeticCost + Entry->Cost;
    if (ST->hasAVX())
      if (const auto *Entry = CostTableLookup(AVX1BoolReduction, ISD, MTy))
        return ArithmeticCost + Entry->Cost;
    if (ST->hasSSE2())
      if (const auto *Entry = CostTableLookup(SSE2BoolReduction, ISD, MTy))
        return ArithmeticCost + Entry->Cost;

    return BaseT::getArithmeticReductionCost(Opcode, ValVTy, FMF, CostKind);
  }

  unsigned NumVecElts = ValVTy->getNumElements();
  unsigned ScalarSize = ValVTy->getScalarSizeInBits();

  // Special case power of 2 reductions where the scalar type isn't changed
  // by type legalization.
  if (!isPowerOf2_32(NumVecElts) || ScalarSize != MTy.getScalarSizeInBits())
    return BaseT::getArithmeticReductionCost(Opcode, ValVTy, FMF, CostKind);

  InstructionCost ReductionCost = 0;

  auto *Ty = ValVTy;
  if (LT.first != 1 && MTy.isVector() &&
      MTy.getVectorNumElements() < ValVTy->getNumElements()) {
    // Type needs to be split. We need LT.first - 1 arithmetic ops.
    Ty = FixedVectorType::get(ValVTy->getElementType(),
                              MTy.getVectorNumElements());
    ReductionCost = getArithmeticInstrCost(Opcode, Ty, CostKind);
    ReductionCost *= LT.first - 1;
    NumVecElts = MTy.getVectorNumElements();
  }

  // Now handle reduction with the legal type, taking into account size changes
  // at each level.
  while (NumVecElts > 1) {
    // Determine the size of the remaining vector we need to reduce.
    unsigned Size = NumVecElts * ScalarSize;
    NumVecElts /= 2;
    // If we're reducing from 256/512 bits, use an extract_subvector.
    if (Size > 128) {
      auto *SubTy = FixedVectorType::get(ValVTy->getElementType(), NumVecElts);
      ReductionCost +=
          getShuffleCost(TTI::SK_ExtractSubvector, Ty, None, NumVecElts, SubTy);
      Ty = SubTy;
    } else if (Size == 128) {
      // Reducing from 128 bits is a permute of v2f64/v2i64.
      FixedVectorType *ShufTy;
      if (ValVTy->isFloatingPointTy())
        ShufTy =
            FixedVectorType::get(Type::getDoubleTy(ValVTy->getContext()), 2);
      else
        ShufTy =
            FixedVectorType::get(Type::getInt64Ty(ValVTy->getContext()), 2);
      ReductionCost +=
          getShuffleCost(TTI::SK_PermuteSingleSrc, ShufTy, None, 0, nullptr);
    } else if (Size == 64) {
      // Reducing from 64 bits is a shuffle of v4f32/v4i32.
      FixedVectorType *ShufTy;
      if (ValVTy->isFloatingPointTy())
        ShufTy =
            FixedVectorType::get(Type::getFloatTy(ValVTy->getContext()), 4);
      else
        ShufTy =
            FixedVectorType::get(Type::getInt32Ty(ValVTy->getContext()), 4);
      ReductionCost +=
          getShuffleCost(TTI::SK_PermuteSingleSrc, ShufTy, None, 0, nullptr);
    } else {
      // Reducing from smaller size is a shift by immediate.
      auto *ShiftTy = FixedVectorType::get(
          Type::getIntNTy(ValVTy->getContext(), Size), 128 / Size);
      ReductionCost += getArithmeticInstrCost(
          Instruction::LShr, ShiftTy, CostKind,
          TargetTransformInfo::OK_AnyValue,
          TargetTransformInfo::OK_UniformConstantValue,
          TargetTransformInfo::OP_None, TargetTransformInfo::OP_None);
    }

    // Add the arithmetic op for this level.
    ReductionCost += getArithmeticInstrCost(Opcode, Ty, CostKind);
  }

  // Add the final extract element to the cost.
  return ReductionCost + getVectorInstrCost(Instruction::ExtractElement, Ty, 0);
}
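// Cost of a single vector min/max (smin/umin or fminnum); falls back to
// cmp+select when the target has no native instruction for the type.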
InstructionCost X86TTIImpl::getMinMaxCost(Type *Ty, Type *CondTy,
                                          bool IsUnsigned) {
  std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);

  MVT MTy = LT.second;

  int ISD;
  if (Ty->isIntOrIntVectorTy()) {
    ISD = IsUnsigned ? ISD::UMIN : ISD::SMIN;
  } else {
    assert(Ty->isFPOrFPVectorTy() &&
           "Expected floating point or integer vector type.");
    ISD = ISD::FMINNUM;
  }

  static const CostTblEntry SSE1CostTbl[] = {
    {ISD::FMINNUM, MVT::v4f32, 1},
  };

  static const CostTblEntry SSE2CostTbl[] = {
    {ISD::FMINNUM, MVT::v2f64, 1},
    {ISD::SMIN, MVT::v8i16, 1},
    {ISD::UMIN, MVT::v16i8, 1},
  };

  static const CostTblEntry SSE41CostTbl[] = {
    {ISD::SMIN, MVT::v4i32, 1},
    {ISD::UMIN, MVT::v4i32, 1},
    {ISD::UMIN, MVT::v8i16, 1},
    {ISD::SMIN, MVT::v16i8, 1},
  };

  static const CostTblEntry SSE42CostTbl[] = {
    {ISD::UMIN, MVT::v2i64, 3}, // xor+pcmpgtq+blendvpd
  };

  static const CostTblEntry AVX1CostTbl[] = {
    {ISD::FMINNUM, MVT::v8f32, 1},
    {ISD::FMINNUM, MVT::v4f64, 1},
    {ISD::SMIN, MVT::v8i32, 3},
    {ISD::UMIN, MVT::v8i32, 3},
    {ISD::SMIN, MVT::v16i16, 3},
    {ISD::UMIN, MVT::v16i16, 3},
    {ISD::SMIN, MVT::v32i8, 3},
    {ISD::UMIN, MVT::v32i8, 3},
  };

  static const CostTblEntry AVX2CostTbl[] = {
    {ISD::SMIN, MVT::v8i32, 1},
    {ISD::UMIN, MVT::v8i32, 1},
    {ISD::SMIN, MVT::v16i16, 1},
    {ISD::UMIN, MVT::v16i16, 1},
    {ISD::SMIN, MVT::v32i8, 1},
    {ISD::UMIN, MVT::v32i8, 1},
  };

  static const CostTblEntry AVX512CostTbl[] = {
    {ISD::FMINNUM, MVT::v16f32, 1},
    {ISD::FMINNUM, MVT::v8f64, 1},
    {ISD::SMIN, MVT::v2i64, 1},
    {ISD::UMIN, MVT::v2i64, 1},
    {ISD::SMIN, MVT::v4i64, 1},
    {ISD::UMIN, MVT::v4i64, 1},
    {ISD::SMIN, MVT::v8i64, 1},
    {ISD::UMIN, MVT::v8i64, 1},
    {ISD::SMIN, MVT::v16i32, 1},
    {ISD::UMIN, MVT::v16i32, 1},
  };

  static const CostTblEntry AVX512BWCostTbl[] = {
    {ISD::SMIN, MVT::v32i16, 1},
    {ISD::UMIN, MVT::v32i16, 1},
    {ISD::SMIN, MVT::v64i8, 1},
    {ISD::UMIN, MVT::v64i8, 1},
  };

  // If we have a native MIN/MAX instruction for this type, use it.
  if (ST->hasBWI())
    if (const auto *Entry = CostTableLookup(AVX512BWCostTbl, ISD, MTy))
      return LT.first * Entry->Cost;

  if (ST->hasAVX512())
    if (const auto *Entry = CostTableLookup(AVX512CostTbl, ISD, MTy))
      return LT.first * Entry->Cost;

  if (ST->hasAVX2())
    if (const auto *Entry = CostTableLookup(AVX2CostTbl, ISD, MTy))
      return LT.first * Entry->Cost;

  if (ST->hasAVX())
    if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy))
      return LT.first * Entry->Cost;

  if (ST->hasSSE42())
    if (const auto *Entry = CostTableLookup(SSE42CostTbl, ISD, MTy))
      return LT.first * Entry->Cost;

  if (ST->hasSSE41())
    if (const auto *Entry = CostTableLookup(SSE41CostTbl, ISD, MTy))
      return LT.first * Entry->Cost;

  if (ST->hasSSE2())
    if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
      return LT.first * Entry->Cost;

  if (ST->hasSSE1())
    if (const auto *Entry = CostTableLookup(SSE1CostTbl, ISD, MTy))
      return LT.first * Entry->Cost;

  unsigned CmpOpcode;
  if (Ty->isFPOrFPVectorTy()) {
    CmpOpcode = Instruction::FCmp;
  } else {
    assert(Ty->isIntOrIntVectorTy() &&
           "expecting floating point or integer type for min/max reduction");
    CmpOpcode = Instruction::ICmp;
  }

  TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
  // Otherwise fall back to cmp+select.
  InstructionCost Result =
      getCmpSelInstrCost(CmpOpcode, Ty, CondTy, CmpInst::BAD_ICMP_PREDICATE,
                         CostKind) +
      getCmpSelInstrCost(Instruction::Select, Ty, CondTy,
                         CmpInst::BAD_ICMP_PREDICATE, CostKind);
  return Result;
}
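// Min/max reductions follow the same split-then-halve shuffle tree as the
// arithmetic reductions above, paying getMinMaxCost at each level.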
InstructionCost
X86TTIImpl::getMinMaxReductionCost(VectorType *ValTy, VectorType *CondTy,
                                   bool IsUnsigned,
                                   TTI::TargetCostKind CostKind) {
  std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy);

  MVT MTy = LT.second;

  int ISD;
  if (ValTy->isIntOrIntVectorTy()) {
    ISD = IsUnsigned ? ISD::UMIN : ISD::SMIN;
  } else {
    assert(ValTy->isFPOrFPVectorTy() &&
           "Expected floating point or integer vector type.");
    ISD = ISD::FMINNUM;
  }

  // We use the Intel Architecture Code Analyzer (IACA) to measure the
  // throughput and use it as the cost.
  static const CostTblEntry SSE2CostTblNoPairWise[] = {
    {ISD::UMIN, MVT::v2i16, 5}, // need pxors to use pminsw/pmaxsw
    {ISD::UMIN, MVT::v4i16, 7}, // need pxors to use pminsw/pmaxsw
    {ISD::UMIN, MVT::v8i16, 9}, // need pxors to use pminsw/pmaxsw
  };

  static const CostTblEntry SSE41CostTblNoPairWise[] = {
    {ISD::SMIN, MVT::v2i16, 3}, // same as sse2
    {ISD::SMIN, MVT::v4i16, 5}, // same as sse2
    {ISD::UMIN, MVT::v2i16, 5}, // same as sse2
    {ISD::UMIN, MVT::v4i16, 7}, // same as sse2
    {ISD::SMIN, MVT::v8i16, 4}, // phminposuw+xor
    {ISD::UMIN, MVT::v8i16, 4}, // FIXME: umin is cheaper than umax
    {ISD::SMIN, MVT::v2i8, 3}, // pminsb
    {ISD::SMIN, MVT::v4i8, 5}, // pminsb
    {ISD::SMIN, MVT::v8i8, 7}, // pminsb
    {ISD::SMIN, MVT::v16i8, 6},
    {ISD::UMIN, MVT::v2i8, 3}, // same as sse2
    {ISD::UMIN, MVT::v4i8, 5}, // same as sse2
    {ISD::UMIN, MVT::v8i8, 7}, // same as sse2
    {ISD::UMIN, MVT::v16i8, 6}, // FIXME: umin is cheaper than umax
  };

  static const CostTblEntry AVX1CostTblNoPairWise[] = {
    {ISD::SMIN, MVT::v16i16, 6},
    {ISD::UMIN, MVT::v16i16, 6}, // FIXME: umin is cheaper than umax
    {ISD::SMIN, MVT::v32i8, 8},
    {ISD::UMIN, MVT::v32i8, 8},
  };

  static const CostTblEntry AVX512BWCostTblNoPairWise[] = {
    {ISD::SMIN, MVT::v32i16, 8},
    {ISD::UMIN, MVT::v32i16, 8}, // FIXME: umin is cheaper than umax
    {ISD::SMIN, MVT::v64i8, 10},
    {ISD::UMIN, MVT::v64i8, 10},
  };

  // Before legalizing the type, give a chance to look up illegal narrow types
  // in the table.
  // FIXME: Is there a better way to do this?
  EVT VT = TLI->getValueType(DL, ValTy);
  if (VT.isSimple()) {
    MVT MTy = VT.getSimpleVT();
    if (ST->hasBWI())
      if (const auto *Entry = CostTableLookup(AVX512BWCostTblNoPairWise, ISD, MTy))
        return Entry->Cost;

    if (ST->hasAVX())
      if (const auto *Entry = CostTableLookup(AVX1CostTblNoPairWise, ISD, MTy))
        return Entry->Cost;

    if (ST->hasSSE41())
      if (const auto *Entry = CostTableLookup(SSE41CostTblNoPairWise, ISD, MTy))
        return Entry->Cost;

    if (ST->hasSSE2())
      if (const auto *Entry = CostTableLookup(SSE2CostTblNoPairWise, ISD, MTy))
        return Entry->Cost;
  }

  auto *ValVTy = cast<FixedVectorType>(ValTy);
  unsigned NumVecElts = ValVTy->getNumElements();

  auto *Ty = ValVTy;
  InstructionCost MinMaxCost = 0;
  if (LT.first != 1 && MTy.isVector() &&
      MTy.getVectorNumElements() < ValVTy->getNumElements()) {
    // Type needs to be split. We need LT.first - 1 operations.
    Ty = FixedVectorType::get(ValVTy->getElementType(),
                              MTy.getVectorNumElements());
    auto *SubCondTy = FixedVectorType::get(CondTy->getElementType(),
                                           MTy.getVectorNumElements());
    MinMaxCost = getMinMaxCost(Ty, SubCondTy, IsUnsigned);
    MinMaxCost *= LT.first - 1;
    NumVecElts = MTy.getVectorNumElements();
  }

  if (ST->hasBWI())
    if (const auto *Entry = CostTableLookup(AVX512BWCostTblNoPairWise, ISD, MTy))
      return MinMaxCost + Entry->Cost;

  if (ST->hasAVX())
    if (const auto *Entry = CostTableLookup(AVX1CostTblNoPairWise, ISD, MTy))
      return MinMaxCost + Entry->Cost;

  if (ST->hasSSE41())
    if (const auto *Entry = CostTableLookup(SSE41CostTblNoPairWise, ISD, MTy))
      return MinMaxCost + Entry->Cost;

  if (ST->hasSSE2())
    if (const auto *Entry = CostTableLookup(SSE2CostTblNoPairWise, ISD, MTy))
      return MinMaxCost + Entry->Cost;

  unsigned ScalarSize = ValTy->getScalarSizeInBits();

  // Special case power of 2 reductions where the scalar type isn't changed
  // by type legalization.
  if (!isPowerOf2_32(ValVTy->getNumElements()) ||
      ScalarSize != MTy.getScalarSizeInBits())
    return BaseT::getMinMaxReductionCost(ValTy, CondTy, IsUnsigned, CostKind);

  // Now handle reduction with the legal type, taking into account size changes
  // at each level.
  while (NumVecElts > 1) {
    // Determine the size of the remaining vector we need to reduce.
    unsigned Size = NumVecElts * ScalarSize;
    NumVecElts /= 2;
    // If we're reducing from 256/512 bits, use an extract_subvector.
    if (Size > 128) {
      auto *SubTy = FixedVectorType::get(ValVTy->getElementType(), NumVecElts);
      MinMaxCost +=
          getShuffleCost(TTI::SK_ExtractSubvector, Ty, None, NumVecElts, SubTy);
      Ty = SubTy;
    } else if (Size == 128) {
      // Reducing from 128 bits is a permute of v2f64/v2i64.
      VectorType *ShufTy;
      if (ValTy->isFloatingPointTy())
        ShufTy =
            FixedVectorType::get(Type::getDoubleTy(ValTy->getContext()), 2);
      else
        ShufTy = FixedVectorType::get(Type::getInt64Ty(ValTy->getContext()), 2);
      MinMaxCost +=
          getShuffleCost(TTI::SK_PermuteSingleSrc, ShufTy, None, 0, nullptr);
    } else if (Size == 64) {
      // Reducing from 64 bits is a shuffle of v4f32/v4i32.
      FixedVectorType *ShufTy;
      if (ValTy->isFloatingPointTy())
        ShufTy = FixedVectorType::get(Type::getFloatTy(ValTy->getContext()), 4);
      else
        ShufTy = FixedVectorType::get(Type::getInt32Ty(ValTy->getContext()), 4);
      MinMaxCost +=
          getShuffleCost(TTI::SK_PermuteSingleSrc, ShufTy, None, 0, nullptr);
    } else {
      // Reducing from smaller size is a shift by immediate.
      auto *ShiftTy = FixedVectorType::get(
          Type::getIntNTy(ValTy->getContext(), Size), 128 / Size);
      MinMaxCost += getArithmeticInstrCost(
          Instruction::LShr, ShiftTy, TTI::TCK_RecipThroughput,
          TargetTransformInfo::OK_AnyValue,
          TargetTransformInfo::OK_UniformConstantValue,
          TargetTransformInfo::OP_None, TargetTransformInfo::OP_None);
    }

    // Add the min/max op for this level.
    auto *SubCondTy =
        FixedVectorType::get(CondTy->getElementType(), Ty->getNumElements());
    MinMaxCost += getMinMaxCost(Ty, SubCondTy, IsUnsigned);
  }

  // Add the final extract element to the cost.
  return MinMaxCost + getVectorInstrCost(Instruction::ExtractElement, Ty, 0);
}
  4106. /// Calculate the cost of materializing a 64-bit value. This helper
  4107. /// method might only calculate a fraction of a larger immediate. Therefore it
  4108. /// is valid to return a cost of ZERO.
  4109. InstructionCost X86TTIImpl::getIntImmCost(int64_t Val) {
  4110. if (Val == 0)
  4111. return TTI::TCC_Free;
  4112. if (isInt<32>(Val))
  4113. return TTI::TCC_Basic;
  4114. return 2 * TTI::TCC_Basic;
  4115. }
  4116. InstructionCost X86TTIImpl::getIntImmCost(const APInt &Imm, Type *Ty,
  4117. TTI::TargetCostKind CostKind) {
  4118. assert(Ty->isIntegerTy());
  4119. unsigned BitSize = Ty->getPrimitiveSizeInBits();
  4120. if (BitSize == 0)
  4121. return ~0U;
4122. // Never hoist constants larger than 128 bits, because this might lead to
4123. // incorrect code generation or assertions in codegen.
4124. // FIXME: Create a cost model for types larger than i128 once the codegen
  4125. // issues have been fixed.
  4126. if (BitSize > 128)
  4127. return TTI::TCC_Free;
  4128. if (Imm == 0)
  4129. return TTI::TCC_Free;
  4130. // Sign-extend all constants to a multiple of 64-bit.
  4131. APInt ImmVal = Imm;
  4132. if (BitSize % 64 != 0)
  4133. ImmVal = Imm.sext(alignTo(BitSize, 64));
  4134. // Split the constant into 64-bit chunks and calculate the cost for each
  4135. // chunk.
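// For example, an i128 immediate with the value (1 << 64) | 7 is split into
// two 64-bit chunks, 0x7 and 0x1. Each chunk fits in a signed 32-bit
// immediate, so each contributes TCC_Basic, giving a total cost of 2.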
  4136. InstructionCost Cost = 0;
  4137. for (unsigned ShiftVal = 0; ShiftVal < BitSize; ShiftVal += 64) {
  4138. APInt Tmp = ImmVal.ashr(ShiftVal).sextOrTrunc(64);
  4139. int64_t Val = Tmp.getSExtValue();
  4140. Cost += getIntImmCost(Val);
  4141. }
  4142. // We need at least one instruction to materialize the constant.
  4143. return std::max<InstructionCost>(1, Cost);
  4144. }
  4145. InstructionCost X86TTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx,
  4146. const APInt &Imm, Type *Ty,
  4147. TTI::TargetCostKind CostKind,
  4148. Instruction *Inst) {
  4149. assert(Ty->isIntegerTy());
  4150. unsigned BitSize = Ty->getPrimitiveSizeInBits();
  4151. // There is no cost model for constants with a bit size of 0. Return TCC_Free
  4152. // here, so that constant hoisting will ignore this constant.
  4153. if (BitSize == 0)
  4154. return TTI::TCC_Free;
  4155. unsigned ImmIdx = ~0U;
  4156. switch (Opcode) {
  4157. default:
  4158. return TTI::TCC_Free;
  4159. case Instruction::GetElementPtr:
  4160. // Always hoist the base address of a GetElementPtr. This prevents the
  4161. // creation of new constants for every base constant that gets constant
  4162. // folded with the offset.
  4163. if (Idx == 0)
  4164. return 2 * TTI::TCC_Basic;
  4165. return TTI::TCC_Free;
  4166. case Instruction::Store:
  4167. ImmIdx = 0;
  4168. break;
  4169. case Instruction::ICmp:
  4170. // This is an imperfect hack to prevent constant hoisting of
  4171. // compares that might be trying to check if a 64-bit value fits in
4172. // 32 bits. The backend can optimize these cases using a right shift by 32.
4173. // Ideally we would check the compare predicate here. There are also other
4174. // similar immediates the backend can use shifts for.
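// For example, a compare such as
//   %c = icmp ult i64 %x, 4294967296   ; 0x100000000
// can be selected as a 32-bit right shift plus a test, so the wide immediate
// never has to be materialized in a register; reporting it as free keeps
// constant hoisting from pulling it out of the compare.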
  4175. if (Idx == 1 && Imm.getBitWidth() == 64) {
  4176. uint64_t ImmVal = Imm.getZExtValue();
  4177. if (ImmVal == 0x100000000ULL || ImmVal == 0xffffffff)
  4178. return TTI::TCC_Free;
  4179. }
  4180. ImmIdx = 1;
  4181. break;
  4182. case Instruction::And:
  4183. // We support 64-bit ANDs with immediates with 32-bits of leading zeroes
  4184. // by using a 32-bit operation with implicit zero extension. Detect such
  4185. // immediates here as the normal path expects bit 31 to be sign extended.
  4186. if (Idx == 1 && Imm.getBitWidth() == 64 && isUInt<32>(Imm.getZExtValue()))
  4187. return TTI::TCC_Free;
  4188. ImmIdx = 1;
  4189. break;
  4190. case Instruction::Add:
  4191. case Instruction::Sub:
  4192. // For add/sub, we can use the opposite instruction for INT32_MIN.
  4193. if (Idx == 1 && Imm.getBitWidth() == 64 && Imm.getZExtValue() == 0x80000000)
  4194. return TTI::TCC_Free;
  4195. ImmIdx = 1;
  4196. break;
  4197. case Instruction::UDiv:
  4198. case Instruction::SDiv:
  4199. case Instruction::URem:
  4200. case Instruction::SRem:
  4201. // Division by constant is typically expanded later into a different
  4202. // instruction sequence. This completely changes the constants.
  4203. // Report them as "free" to stop ConstantHoist from marking them as opaque.
  4204. return TTI::TCC_Free;
  4205. case Instruction::Mul:
  4206. case Instruction::Or:
  4207. case Instruction::Xor:
  4208. ImmIdx = 1;
  4209. break;
  4210. // Always return TCC_Free for the shift value of a shift instruction.
  4211. case Instruction::Shl:
  4212. case Instruction::LShr:
  4213. case Instruction::AShr:
  4214. if (Idx == 1)
  4215. return TTI::TCC_Free;
  4216. break;
  4217. case Instruction::Trunc:
  4218. case Instruction::ZExt:
  4219. case Instruction::SExt:
  4220. case Instruction::IntToPtr:
  4221. case Instruction::PtrToInt:
  4222. case Instruction::BitCast:
  4223. case Instruction::PHI:
  4224. case Instruction::Call:
  4225. case Instruction::Select:
  4226. case Instruction::Ret:
  4227. case Instruction::Load:
  4228. break;
  4229. }
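// When the immediate is in the operand slot this opcode can encode (ImmIdx),
// it is reported as free as long as materializing it costs no more than
// NumConstants * TCC_Basic. For example, a 64-bit 'or' with an immediate that
// does not fit in a signed 32-bit field has a materialization cost of 2 (the
// immediate needs its own movabsq), which exceeds NumConstants * TCC_Basic ==
// 1, so the real cost is returned and constant hoisting may choose to hoist it.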
  4230. if (Idx == ImmIdx) {
  4231. int NumConstants = divideCeil(BitSize, 64);
  4232. InstructionCost Cost = X86TTIImpl::getIntImmCost(Imm, Ty, CostKind);
  4233. return (Cost <= NumConstants * TTI::TCC_Basic)
  4234. ? static_cast<int>(TTI::TCC_Free)
  4235. : Cost;
  4236. }
  4237. return X86TTIImpl::getIntImmCost(Imm, Ty, CostKind);
  4238. }
  4239. InstructionCost X86TTIImpl::getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx,
  4240. const APInt &Imm, Type *Ty,
  4241. TTI::TargetCostKind CostKind) {
  4242. assert(Ty->isIntegerTy());
  4243. unsigned BitSize = Ty->getPrimitiveSizeInBits();
  4244. // There is no cost model for constants with a bit size of 0. Return TCC_Free
  4245. // here, so that constant hoisting will ignore this constant.
  4246. if (BitSize == 0)
  4247. return TTI::TCC_Free;
  4248. switch (IID) {
  4249. default:
  4250. return TTI::TCC_Free;
  4251. case Intrinsic::sadd_with_overflow:
  4252. case Intrinsic::uadd_with_overflow:
  4253. case Intrinsic::ssub_with_overflow:
  4254. case Intrinsic::usub_with_overflow:
  4255. case Intrinsic::smul_with_overflow:
  4256. case Intrinsic::umul_with_overflow:
  4257. if ((Idx == 1) && Imm.getBitWidth() <= 64 && isInt<32>(Imm.getSExtValue()))
  4258. return TTI::TCC_Free;
  4259. break;
  4260. case Intrinsic::experimental_stackmap:
  4261. if ((Idx < 2) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
  4262. return TTI::TCC_Free;
  4263. break;
  4264. case Intrinsic::experimental_patchpoint_void:
  4265. case Intrinsic::experimental_patchpoint_i64:
  4266. if ((Idx < 4) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
  4267. return TTI::TCC_Free;
  4268. break;
  4269. }
  4270. return X86TTIImpl::getIntImmCost(Imm, Ty, CostKind);
  4271. }
  4272. InstructionCost X86TTIImpl::getCFInstrCost(unsigned Opcode,
  4273. TTI::TargetCostKind CostKind,
  4274. const Instruction *I) {
  4275. if (CostKind != TTI::TCK_RecipThroughput)
  4276. return Opcode == Instruction::PHI ? 0 : 1;
  4277. // Branches are assumed to be predicted.
  4278. return 0;
  4279. }
  4280. int X86TTIImpl::getGatherOverhead() const {
  4281. // Some CPUs have more overhead for gather. The specified overhead is relative
  4282. // to the Load operation. "2" is the number provided by Intel architects. This
  4283. // parameter is used for cost estimation of Gather Op and comparison with
  4284. // other alternatives.
4285. // TODO: Remove the explicit hasAVX512()? That would mean we would only
  4286. // enable gather with a -march.
  4287. if (ST->hasAVX512() || (ST->hasAVX2() && ST->hasFastGather()))
  4288. return 2;
  4289. return 1024;
  4290. }
  4291. int X86TTIImpl::getScatterOverhead() const {
  4292. if (ST->hasAVX512())
  4293. return 2;
  4294. return 1024;
  4295. }
  4296. // Return an average cost of Gather / Scatter instruction, maybe improved later.
  4297. // FIXME: Add TargetCostKind support.
  4298. InstructionCost X86TTIImpl::getGSVectorCost(unsigned Opcode, Type *SrcVTy,
  4299. const Value *Ptr, Align Alignment,
  4300. unsigned AddressSpace) {
  4301. assert(isa<VectorType>(SrcVTy) && "Unexpected type in getGSVectorCost");
  4302. unsigned VF = cast<FixedVectorType>(SrcVTy)->getNumElements();
  4303. // Try to reduce index size from 64 bit (default for GEP)
  4304. // to 32. It is essential for VF 16. If the index can't be reduced to 32, the
4305. // operation will use 16 x 64 indices, which do not fit in a zmm and need
4306. // to be split. Also check that the base pointer is the same for all lanes,
  4307. // and that there's at most one variable index.
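// For example, a gather of 16 floats whose GEP index operand is
//   %wide = sext <16 x i32> %idx to <16 x i64>
// has a single variable index known to fit in 32 bits, so the lambda below
// returns 32 and all 16 indices fit in one 512-bit register. With a genuinely
// 64-bit index vector it returns 64 and the operation has to be split.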
  4308. auto getIndexSizeInBits = [](const Value *Ptr, const DataLayout &DL) {
  4309. unsigned IndexSize = DL.getPointerSizeInBits();
  4310. const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr);
  4311. if (IndexSize < 64 || !GEP)
  4312. return IndexSize;
  4313. unsigned NumOfVarIndices = 0;
  4314. const Value *Ptrs = GEP->getPointerOperand();
  4315. if (Ptrs->getType()->isVectorTy() && !getSplatValue(Ptrs))
  4316. return IndexSize;
  4317. for (unsigned i = 1; i < GEP->getNumOperands(); ++i) {
  4318. if (isa<Constant>(GEP->getOperand(i)))
  4319. continue;
  4320. Type *IndxTy = GEP->getOperand(i)->getType();
  4321. if (auto *IndexVTy = dyn_cast<VectorType>(IndxTy))
  4322. IndxTy = IndexVTy->getElementType();
  4323. if ((IndxTy->getPrimitiveSizeInBits() == 64 &&
  4324. !isa<SExtInst>(GEP->getOperand(i))) ||
  4325. ++NumOfVarIndices > 1)
  4326. return IndexSize; // 64
  4327. }
  4328. return (unsigned)32;
  4329. };
4330. // Try to reduce IndexSize to 32 bits for 16-element vectors.
4331. // By default the IndexSize is equal to the pointer size.
  4332. unsigned IndexSize = (ST->hasAVX512() && VF >= 16)
  4333. ? getIndexSizeInBits(Ptr, DL)
  4334. : DL.getPointerSizeInBits();
  4335. auto *IndexVTy = FixedVectorType::get(
  4336. IntegerType::get(SrcVTy->getContext(), IndexSize), VF);
  4337. std::pair<InstructionCost, MVT> IdxsLT =
  4338. TLI->getTypeLegalizationCost(DL, IndexVTy);
  4339. std::pair<InstructionCost, MVT> SrcLT =
  4340. TLI->getTypeLegalizationCost(DL, SrcVTy);
  4341. InstructionCost::CostType SplitFactor =
  4342. *std::max(IdxsLT.first, SrcLT.first).getValue();
  4343. if (SplitFactor > 1) {
  4344. // Handle splitting of vector of pointers
  4345. auto *SplitSrcTy =
  4346. FixedVectorType::get(SrcVTy->getScalarType(), VF / SplitFactor);
  4347. return SplitFactor * getGSVectorCost(Opcode, SplitSrcTy, Ptr, Alignment,
  4348. AddressSpace);
  4349. }
  4350. // The gather / scatter cost is given by Intel architects. It is a rough
4351. // number since we are looking at one instruction at a time.
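// As modeled here, a gather of <8 x float> on an AVX-512 target costs the
// fixed overhead (2, from getGatherOverhead) plus 8 times the cost of a
// scalar float load; slow-gather targets get the large 1024 overhead instead,
// which effectively steers vectorization away from the instruction.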
  4352. const int GSOverhead = (Opcode == Instruction::Load)
  4353. ? getGatherOverhead()
  4354. : getScatterOverhead();
  4355. return GSOverhead + VF * getMemoryOpCost(Opcode, SrcVTy->getScalarType(),
  4356. MaybeAlign(Alignment), AddressSpace,
  4357. TTI::TCK_RecipThroughput);
  4358. }
  4359. /// Return the cost of full scalarization of gather / scatter operation.
  4360. ///
  4361. /// Opcode - Load or Store instruction.
  4362. /// SrcVTy - The type of the data vector that should be gathered or scattered.
  4363. /// VariableMask - The mask is non-constant at compile time.
  4364. /// Alignment - Alignment for one element.
  4365. /// AddressSpace - pointer[s] address space.
  4366. ///
  4367. /// FIXME: Add TargetCostKind support.
  4368. InstructionCost X86TTIImpl::getGSScalarCost(unsigned Opcode, Type *SrcVTy,
  4369. bool VariableMask, Align Alignment,
  4370. unsigned AddressSpace) {
  4371. Type *ScalarTy = SrcVTy->getScalarType();
  4372. unsigned VF = cast<FixedVectorType>(SrcVTy)->getNumElements();
  4373. APInt DemandedElts = APInt::getAllOnes(VF);
  4374. TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
  4375. InstructionCost MaskUnpackCost = 0;
  4376. if (VariableMask) {
  4377. auto *MaskTy =
  4378. FixedVectorType::get(Type::getInt1Ty(SrcVTy->getContext()), VF);
  4379. MaskUnpackCost = getScalarizationOverhead(
  4380. MaskTy, DemandedElts, /*Insert=*/false, /*Extract=*/true);
  4381. InstructionCost ScalarCompareCost = getCmpSelInstrCost(
  4382. Instruction::ICmp, Type::getInt1Ty(SrcVTy->getContext()), nullptr,
  4383. CmpInst::BAD_ICMP_PREDICATE, CostKind);
  4384. InstructionCost BranchCost = getCFInstrCost(Instruction::Br, CostKind);
  4385. MaskUnpackCost += VF * (BranchCost + ScalarCompareCost);
  4386. }
  4387. InstructionCost AddressUnpackCost = getScalarizationOverhead(
  4388. FixedVectorType::get(ScalarTy->getPointerTo(), VF), DemandedElts,
  4389. /*Insert=*/false, /*Extract=*/true);
  4390. // The cost of the scalar loads/stores.
  4391. InstructionCost MemoryOpCost =
  4392. VF * getMemoryOpCost(Opcode, ScalarTy, MaybeAlign(Alignment),
  4393. AddressSpace, CostKind);
  4394. // The cost of forming the vector from loaded scalars/
  4395. // scalarizing the vector to perform scalar stores.
  4396. InstructionCost InsertExtractCost =
  4397. getScalarizationOverhead(cast<FixedVectorType>(SrcVTy), DemandedElts,
  4398. /*Insert=*/Opcode == Instruction::Load,
  4399. /*Extract=*/Opcode == Instruction::Store);
  4400. return AddressUnpackCost + MemoryOpCost + MaskUnpackCost + InsertExtractCost;
  4401. }
  4402. /// Calculate the cost of Gather / Scatter operation
  4403. InstructionCost X86TTIImpl::getGatherScatterOpCost(
  4404. unsigned Opcode, Type *SrcVTy, const Value *Ptr, bool VariableMask,
  4405. Align Alignment, TTI::TargetCostKind CostKind,
  4406. const Instruction *I = nullptr) {
  4407. if (CostKind != TTI::TCK_RecipThroughput) {
  4408. if ((Opcode == Instruction::Load &&
  4409. isLegalMaskedGather(SrcVTy, Align(Alignment)) &&
  4410. !forceScalarizeMaskedGather(cast<VectorType>(SrcVTy),
  4411. Align(Alignment))) ||
  4412. (Opcode == Instruction::Store &&
  4413. isLegalMaskedScatter(SrcVTy, Align(Alignment)) &&
  4414. !forceScalarizeMaskedScatter(cast<VectorType>(SrcVTy),
  4415. Align(Alignment))))
  4416. return 1;
  4417. return BaseT::getGatherScatterOpCost(Opcode, SrcVTy, Ptr, VariableMask,
  4418. Alignment, CostKind, I);
  4419. }
  4420. assert(SrcVTy->isVectorTy() && "Unexpected data type for Gather/Scatter");
  4421. PointerType *PtrTy = dyn_cast<PointerType>(Ptr->getType());
  4422. if (!PtrTy && Ptr->getType()->isVectorTy())
  4423. PtrTy = dyn_cast<PointerType>(
  4424. cast<VectorType>(Ptr->getType())->getElementType());
  4425. assert(PtrTy && "Unexpected type for Ptr argument");
  4426. unsigned AddressSpace = PtrTy->getAddressSpace();
  4427. if ((Opcode == Instruction::Load &&
  4428. (!isLegalMaskedGather(SrcVTy, Align(Alignment)) ||
  4429. forceScalarizeMaskedGather(cast<VectorType>(SrcVTy),
  4430. Align(Alignment)))) ||
  4431. (Opcode == Instruction::Store &&
  4432. (!isLegalMaskedScatter(SrcVTy, Align(Alignment)) ||
  4433. forceScalarizeMaskedScatter(cast<VectorType>(SrcVTy),
  4434. Align(Alignment)))))
  4435. return getGSScalarCost(Opcode, SrcVTy, VariableMask, Alignment,
  4436. AddressSpace);
  4437. return getGSVectorCost(Opcode, SrcVTy, Ptr, Alignment, AddressSpace);
  4438. }
  4439. bool X86TTIImpl::isLSRCostLess(TargetTransformInfo::LSRCost &C1,
  4440. TargetTransformInfo::LSRCost &C2) {
4441. // X86-specific here: the instruction count gets first priority.
  4442. return std::tie(C1.Insns, C1.NumRegs, C1.AddRecCost,
  4443. C1.NumIVMuls, C1.NumBaseAdds,
  4444. C1.ScaleCost, C1.ImmCost, C1.SetupCost) <
  4445. std::tie(C2.Insns, C2.NumRegs, C2.AddRecCost,
  4446. C2.NumIVMuls, C2.NumBaseAdds,
  4447. C2.ScaleCost, C2.ImmCost, C2.SetupCost);
  4448. }
  4449. bool X86TTIImpl::canMacroFuseCmp() {
  4450. return ST->hasMacroFusion() || ST->hasBranchFusion();
  4451. }
  4452. bool X86TTIImpl::isLegalMaskedLoad(Type *DataTy, Align Alignment) {
  4453. if (!ST->hasAVX())
  4454. return false;
  4455. // The backend can't handle a single element vector.
  4456. if (isa<VectorType>(DataTy) &&
  4457. cast<FixedVectorType>(DataTy)->getNumElements() == 1)
  4458. return false;
  4459. Type *ScalarTy = DataTy->getScalarType();
  4460. if (ScalarTy->isPointerTy())
  4461. return true;
  4462. if (ScalarTy->isFloatTy() || ScalarTy->isDoubleTy())
  4463. return true;
  4464. if (ScalarTy->isHalfTy() && ST->hasBWI() && ST->hasFP16())
  4465. return true;
  4466. if (!ScalarTy->isIntegerTy())
  4467. return false;
  4468. unsigned IntWidth = ScalarTy->getIntegerBitWidth();
  4469. return IntWidth == 32 || IntWidth == 64 ||
  4470. ((IntWidth == 8 || IntWidth == 16) && ST->hasBWI());
  4471. }
  4472. bool X86TTIImpl::isLegalMaskedStore(Type *DataType, Align Alignment) {
  4473. return isLegalMaskedLoad(DataType, Alignment);
  4474. }
  4475. bool X86TTIImpl::isLegalNTLoad(Type *DataType, Align Alignment) {
  4476. unsigned DataSize = DL.getTypeStoreSize(DataType);
  4477. // The only supported nontemporal loads are for aligned vectors of 16 or 32
  4478. // bytes. Note that 32-byte nontemporal vector loads are supported by AVX2
  4479. // (the equivalent stores only require AVX).
  4480. if (Alignment >= DataSize && (DataSize == 16 || DataSize == 32))
  4481. return DataSize == 16 ? ST->hasSSE1() : ST->hasAVX2();
  4482. return false;
  4483. }
  4484. bool X86TTIImpl::isLegalNTStore(Type *DataType, Align Alignment) {
  4485. unsigned DataSize = DL.getTypeStoreSize(DataType);
  4486. // SSE4A supports nontemporal stores of float and double at arbitrary
  4487. // alignment.
  4488. if (ST->hasSSE4A() && (DataType->isFloatTy() || DataType->isDoubleTy()))
  4489. return true;
  4490. // Besides the SSE4A subtarget exception above, only aligned stores are
4491. // available nontemporally on any other subtarget. And only stores with a size
4492. // of 4..32 bytes (powers of 2 only) are permitted.
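// For example, a <4 x i32> store with only 8-byte alignment is rejected below
// because Alignment < DataSize, while an aligned 32-byte <8 x float> store is
// accepted provided the target has AVX (vmovntps-style stores).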
  4493. if (Alignment < DataSize || DataSize < 4 || DataSize > 32 ||
  4494. !isPowerOf2_32(DataSize))
  4495. return false;
  4496. // 32-byte vector nontemporal stores are supported by AVX (the equivalent
  4497. // loads require AVX2).
  4498. if (DataSize == 32)
  4499. return ST->hasAVX();
  4500. if (DataSize == 16)
  4501. return ST->hasSSE1();
  4502. return true;
  4503. }
  4504. bool X86TTIImpl::isLegalMaskedExpandLoad(Type *DataTy) {
  4505. if (!isa<VectorType>(DataTy))
  4506. return false;
  4507. if (!ST->hasAVX512())
  4508. return false;
  4509. // The backend can't handle a single element vector.
  4510. if (cast<FixedVectorType>(DataTy)->getNumElements() == 1)
  4511. return false;
  4512. Type *ScalarTy = cast<VectorType>(DataTy)->getElementType();
  4513. if (ScalarTy->isFloatTy() || ScalarTy->isDoubleTy())
  4514. return true;
  4515. if (!ScalarTy->isIntegerTy())
  4516. return false;
  4517. unsigned IntWidth = ScalarTy->getIntegerBitWidth();
  4518. return IntWidth == 32 || IntWidth == 64 ||
  4519. ((IntWidth == 8 || IntWidth == 16) && ST->hasVBMI2());
  4520. }
  4521. bool X86TTIImpl::isLegalMaskedCompressStore(Type *DataTy) {
  4522. return isLegalMaskedExpandLoad(DataTy);
  4523. }
  4524. bool X86TTIImpl::supportsGather() const {
  4525. // Some CPUs have better gather performance than others.
4526. // TODO: Remove the explicit ST->hasAVX512()? That would mean we would only
  4527. // enable gather with a -march.
  4528. return ST->hasAVX512() || (ST->hasFastGather() && ST->hasAVX2());
  4529. }
  4530. bool X86TTIImpl::forceScalarizeMaskedGather(VectorType *VTy, Align Alignment) {
  4531. // Gather / Scatter for vector 2 is not profitable on KNL / SKX
  4532. // Vector-4 of gather/scatter instruction does not exist on KNL. We can extend
  4533. // it to 8 elements, but zeroing upper bits of the mask vector will add more
  4534. // instructions. Right now we give the scalar cost of vector-4 for KNL. TODO:
  4535. // Check, maybe the gather/scatter instruction is better in the VariableMask
  4536. // case.
  4537. unsigned NumElts = cast<FixedVectorType>(VTy)->getNumElements();
  4538. return NumElts == 1 ||
  4539. (ST->hasAVX512() && (NumElts == 2 || (NumElts == 4 && !ST->hasVLX())));
  4540. }
  4541. bool X86TTIImpl::isLegalMaskedGather(Type *DataTy, Align Alignment) {
  4542. if (!supportsGather())
  4543. return false;
  4544. Type *ScalarTy = DataTy->getScalarType();
  4545. if (ScalarTy->isPointerTy())
  4546. return true;
  4547. if (ScalarTy->isFloatTy() || ScalarTy->isDoubleTy())
  4548. return true;
  4549. if (!ScalarTy->isIntegerTy())
  4550. return false;
  4551. unsigned IntWidth = ScalarTy->getIntegerBitWidth();
  4552. return IntWidth == 32 || IntWidth == 64;
  4553. }
  4554. bool X86TTIImpl::isLegalMaskedScatter(Type *DataType, Align Alignment) {
  4555. // AVX2 doesn't support scatter
  4556. if (!ST->hasAVX512())
  4557. return false;
  4558. return isLegalMaskedGather(DataType, Alignment);
  4559. }
  4560. bool X86TTIImpl::hasDivRemOp(Type *DataType, bool IsSigned) {
  4561. EVT VT = TLI->getValueType(DL, DataType);
  4562. return TLI->isOperationLegal(IsSigned ? ISD::SDIVREM : ISD::UDIVREM, VT);
  4563. }
  4564. bool X86TTIImpl::isFCmpOrdCheaperThanFCmpZero(Type *Ty) {
  4565. return false;
  4566. }
  4567. bool X86TTIImpl::areInlineCompatible(const Function *Caller,
  4568. const Function *Callee) const {
  4569. const TargetMachine &TM = getTLI()->getTargetMachine();
  4570. // Work this as a subsetting of subtarget features.
  4571. const FeatureBitset &CallerBits =
  4572. TM.getSubtargetImpl(*Caller)->getFeatureBits();
  4573. const FeatureBitset &CalleeBits =
  4574. TM.getSubtargetImpl(*Callee)->getFeatureBits();
  4575. // Check whether features are the same (apart from the ignore list).
  4576. FeatureBitset RealCallerBits = CallerBits & ~InlineFeatureIgnoreList;
  4577. FeatureBitset RealCalleeBits = CalleeBits & ~InlineFeatureIgnoreList;
  4578. if (RealCallerBits == RealCalleeBits)
  4579. return true;
  4580. // If the features are a subset, we need to additionally check for calls
  4581. // that may become ABI-incompatible as a result of inlining.
  4582. if ((RealCallerBits & RealCalleeBits) != RealCalleeBits)
  4583. return false;
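// For example, if an AVX-512 caller inlines an AVX2-only callee whose features
// are a strict subset, any call inside the callee that passes or returns a
// vector (say <16 x float>) could change how that value is passed once the
// code is compiled with the caller's features, so we only allow it if the
// precise per-type check below succeeds.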
  4584. for (const Instruction &I : instructions(Callee)) {
  4585. if (const auto *CB = dyn_cast<CallBase>(&I)) {
  4586. SmallVector<Type *, 8> Types;
  4587. for (Value *Arg : CB->args())
  4588. Types.push_back(Arg->getType());
  4589. if (!CB->getType()->isVoidTy())
  4590. Types.push_back(CB->getType());
  4591. // Simple types are always ABI compatible.
  4592. auto IsSimpleTy = [](Type *Ty) {
  4593. return !Ty->isVectorTy() && !Ty->isAggregateType();
  4594. };
  4595. if (all_of(Types, IsSimpleTy))
  4596. continue;
  4597. if (Function *NestedCallee = CB->getCalledFunction()) {
  4598. // Assume that intrinsics are always ABI compatible.
  4599. if (NestedCallee->isIntrinsic())
  4600. continue;
  4601. // Do a precise compatibility check.
  4602. if (!areTypesABICompatible(Caller, NestedCallee, Types))
  4603. return false;
  4604. } else {
  4605. // We don't know the target features of the callee,
  4606. // assume it is incompatible.
  4607. return false;
  4608. }
  4609. }
  4610. }
  4611. return true;
  4612. }
  4613. bool X86TTIImpl::areTypesABICompatible(const Function *Caller,
  4614. const Function *Callee,
  4615. const ArrayRef<Type *> &Types) const {
  4616. if (!BaseT::areTypesABICompatible(Caller, Callee, Types))
  4617. return false;
  4618. // If we get here, we know the target features match. If one function
  4619. // considers 512-bit vectors legal and the other does not, consider them
  4620. // incompatible.
  4621. const TargetMachine &TM = getTLI()->getTargetMachine();
  4622. if (TM.getSubtarget<X86Subtarget>(*Caller).useAVX512Regs() ==
  4623. TM.getSubtarget<X86Subtarget>(*Callee).useAVX512Regs())
  4624. return true;
  4625. // Consider the arguments compatible if they aren't vectors or aggregates.
  4626. // FIXME: Look at the size of vectors.
  4627. // FIXME: Look at the element types of aggregates to see if there are vectors.
  4628. return llvm::none_of(Types,
  4629. [](Type *T) { return T->isVectorTy() || T->isAggregateType(); });
  4630. }
  4631. X86TTIImpl::TTI::MemCmpExpansionOptions
  4632. X86TTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const {
  4633. TTI::MemCmpExpansionOptions Options;
  4634. Options.MaxNumLoads = TLI->getMaxExpandSizeMemcmp(OptSize);
  4635. Options.NumLoadsPerBlock = 2;
  4636. // All GPR and vector loads can be unaligned.
  4637. Options.AllowOverlappingLoads = true;
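// With overlapping loads permitted, the memcmp expansion can cover sizes that
// are not a sum of the load sizes; e.g. a 31-byte equality compare on an SSE2
// target can use two 16-byte loads, the second overlapping the first by one
// byte.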
  4638. if (IsZeroCmp) {
  4639. // Only enable vector loads for equality comparison. Right now the vector
4640. // version is not as fast for three-way compare (see #33329).
  4641. const unsigned PreferredWidth = ST->getPreferVectorWidth();
  4642. if (PreferredWidth >= 512 && ST->hasAVX512()) Options.LoadSizes.push_back(64);
  4643. if (PreferredWidth >= 256 && ST->hasAVX()) Options.LoadSizes.push_back(32);
  4644. if (PreferredWidth >= 128 && ST->hasSSE2()) Options.LoadSizes.push_back(16);
  4645. }
  4646. if (ST->is64Bit()) {
  4647. Options.LoadSizes.push_back(8);
  4648. }
  4649. Options.LoadSizes.push_back(4);
  4650. Options.LoadSizes.push_back(2);
  4651. Options.LoadSizes.push_back(1);
  4652. return Options;
  4653. }
  4654. bool X86TTIImpl::prefersVectorizedAddressing() const {
  4655. return supportsGather();
  4656. }
  4657. bool X86TTIImpl::supportsEfficientVectorElementLoadStore() const {
  4658. return false;
  4659. }
  4660. bool X86TTIImpl::enableInterleavedAccessVectorization() {
  4661. // TODO: We expect this to be beneficial regardless of arch,
  4662. // but there are currently some unexplained performance artifacts on Atom.
  4663. // As a temporary solution, disable on Atom.
  4664. return !(ST->isAtom());
  4665. }
4666. // Get an estimate for interleaved load/store operations and strided loads.
4667. // \p Indices contains indices for strided load.
4668. // \p Factor - the factor of interleaving.
4669. // AVX-512 provides 3-src shuffles that significantly reduce the cost.
  4670. InstructionCost X86TTIImpl::getInterleavedMemoryOpCostAVX512(
  4671. unsigned Opcode, FixedVectorType *VecTy, unsigned Factor,
  4672. ArrayRef<unsigned> Indices, Align Alignment, unsigned AddressSpace,
  4673. TTI::TargetCostKind CostKind, bool UseMaskForCond, bool UseMaskForGaps) {
  4674. // VecTy for interleave memop is <VF*Factor x Elt>.
  4675. // So, for VF=4, Interleave Factor = 3, Element type = i32 we have
  4676. // VecTy = <12 x i32>.
4677. // Calculate the number of memory operations (NumOfMemOps) required
4678. // to load/store the VecTy.
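// Continuing the example above on an AVX-512 target: <12 x i32> widens to a
// legal v16i32, so VecTySize = 48 bytes, LegalVTSize = 64 bytes, and
// NumOfMemOps = 1 (a single, possibly masked, 512-bit memory operation).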
  4679. MVT LegalVT = getTLI()->getTypeLegalizationCost(DL, VecTy).second;
  4680. unsigned VecTySize = DL.getTypeStoreSize(VecTy);
  4681. unsigned LegalVTSize = LegalVT.getStoreSize();
  4682. unsigned NumOfMemOps = (VecTySize + LegalVTSize - 1) / LegalVTSize;
  4683. // Get the cost of one memory operation.
  4684. auto *SingleMemOpTy = FixedVectorType::get(VecTy->getElementType(),
  4685. LegalVT.getVectorNumElements());
  4686. InstructionCost MemOpCost;
  4687. bool UseMaskedMemOp = UseMaskForCond || UseMaskForGaps;
  4688. if (UseMaskedMemOp)
  4689. MemOpCost = getMaskedMemoryOpCost(Opcode, SingleMemOpTy, Alignment,
  4690. AddressSpace, CostKind);
  4691. else
  4692. MemOpCost = getMemoryOpCost(Opcode, SingleMemOpTy, MaybeAlign(Alignment),
  4693. AddressSpace, CostKind);
  4694. unsigned VF = VecTy->getNumElements() / Factor;
  4695. MVT VT = MVT::getVectorVT(MVT::getVT(VecTy->getScalarType()), VF);
  4696. InstructionCost MaskCost;
  4697. if (UseMaskedMemOp) {
  4698. APInt DemandedLoadStoreElts = APInt::getZero(VecTy->getNumElements());
  4699. for (unsigned Index : Indices) {
  4700. assert(Index < Factor && "Invalid index for interleaved memory op");
  4701. for (unsigned Elm = 0; Elm < VF; Elm++)
  4702. DemandedLoadStoreElts.setBit(Index + Elm * Factor);
  4703. }
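// For example, with Factor = 3, VF = 4 and Indices = {0, 2}, the demanded
// bits are {0, 3, 6, 9} for member 0 and {2, 5, 8, 11} for member 2; the
// lanes belonging to member 1 stay unset.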
  4704. Type *I1Type = Type::getInt1Ty(VecTy->getContext());
  4705. MaskCost = getReplicationShuffleCost(
  4706. I1Type, Factor, VF,
  4707. UseMaskForGaps ? DemandedLoadStoreElts
  4708. : APInt::getAllOnes(VecTy->getNumElements()),
  4709. CostKind);
  4710. // The Gaps mask is invariant and created outside the loop, therefore the
4711. // cost of creating it is not accounted for here. However, if we have both
  4712. // a MaskForGaps and some other mask that guards the execution of the
  4713. // memory access, we need to account for the cost of And-ing the two masks
  4714. // inside the loop.
  4715. if (UseMaskForGaps) {
  4716. auto *MaskVT = FixedVectorType::get(I1Type, VecTy->getNumElements());
  4717. MaskCost += getArithmeticInstrCost(BinaryOperator::And, MaskVT, CostKind);
  4718. }
  4719. }
  4720. if (Opcode == Instruction::Load) {
  4721. // The tables (AVX512InterleavedLoadTbl and AVX512InterleavedStoreTbl)
  4722. // contain the cost of the optimized shuffle sequence that the
  4723. // X86InterleavedAccess pass will generate.
  4724. // The cost of loads and stores are computed separately from the table.
4725. // X86InterleavedAccess supports only the following interleaved-access groups.
  4726. static const CostTblEntry AVX512InterleavedLoadTbl[] = {
  4727. {3, MVT::v16i8, 12}, //(load 48i8 and) deinterleave into 3 x 16i8
  4728. {3, MVT::v32i8, 14}, //(load 96i8 and) deinterleave into 3 x 32i8
  4729. {3, MVT::v64i8, 22}, //(load 96i8 and) deinterleave into 3 x 32i8
  4730. };
  4731. if (const auto *Entry =
  4732. CostTableLookup(AVX512InterleavedLoadTbl, Factor, VT))
  4733. return MaskCost + NumOfMemOps * MemOpCost + Entry->Cost;
4734. // If an entry does not exist, fall back to the default implementation.
4735. // The kind of shuffle depends on the number of loaded values.
  4736. // If we load the entire data in one register, we can use a 1-src shuffle.
  4737. // Otherwise, we'll merge 2 sources in each operation.
  4738. TTI::ShuffleKind ShuffleKind =
  4739. (NumOfMemOps > 1) ? TTI::SK_PermuteTwoSrc : TTI::SK_PermuteSingleSrc;
  4740. InstructionCost ShuffleCost =
  4741. getShuffleCost(ShuffleKind, SingleMemOpTy, None, 0, nullptr);
  4742. unsigned NumOfLoadsInInterleaveGrp =
  4743. Indices.size() ? Indices.size() : Factor;
  4744. auto *ResultTy = FixedVectorType::get(VecTy->getElementType(),
  4745. VecTy->getNumElements() / Factor);
  4746. InstructionCost NumOfResults =
  4747. getTLI()->getTypeLegalizationCost(DL, ResultTy).first *
  4748. NumOfLoadsInInterleaveGrp;
4749. // About half of the loads may be folded into shuffles when we have only
  4750. // one result. If we have more than one result, or the loads are masked,
  4751. // we do not fold loads at all.
  4752. unsigned NumOfUnfoldedLoads =
  4753. UseMaskedMemOp || NumOfResults > 1 ? NumOfMemOps : NumOfMemOps / 2;
  4754. // Get a number of shuffle operations per result.
  4755. unsigned NumOfShufflesPerResult =
  4756. std::max((unsigned)1, (unsigned)(NumOfMemOps - 1));
4757. // The SK_PermuteTwoSrc shuffle clobbers one of its source operands.
  4758. // When we have more than one destination, we need additional instructions
  4759. // to keep sources.
  4760. InstructionCost NumOfMoves = 0;
  4761. if (NumOfResults > 1 && ShuffleKind == TTI::SK_PermuteTwoSrc)
  4762. NumOfMoves = NumOfResults * NumOfShufflesPerResult / 2;
  4763. InstructionCost Cost = NumOfResults * NumOfShufflesPerResult * ShuffleCost +
  4764. MaskCost + NumOfUnfoldedLoads * MemOpCost +
  4765. NumOfMoves;
  4766. return Cost;
  4767. }
  4768. // Store.
  4769. assert(Opcode == Instruction::Store &&
  4770. "Expected Store Instruction at this point");
4771. // X86InterleavedAccess supports only the following interleaved-access groups.
  4772. static const CostTblEntry AVX512InterleavedStoreTbl[] = {
  4773. {3, MVT::v16i8, 12}, // interleave 3 x 16i8 into 48i8 (and store)
  4774. {3, MVT::v32i8, 14}, // interleave 3 x 32i8 into 96i8 (and store)
  4775. {3, MVT::v64i8, 26}, // interleave 3 x 64i8 into 96i8 (and store)
  4776. {4, MVT::v8i8, 10}, // interleave 4 x 8i8 into 32i8 (and store)
  4777. {4, MVT::v16i8, 11}, // interleave 4 x 16i8 into 64i8 (and store)
  4778. {4, MVT::v32i8, 14}, // interleave 4 x 32i8 into 128i8 (and store)
  4779. {4, MVT::v64i8, 24} // interleave 4 x 32i8 into 256i8 (and store)
  4780. };
  4781. if (const auto *Entry =
  4782. CostTableLookup(AVX512InterleavedStoreTbl, Factor, VT))
  4783. return MaskCost + NumOfMemOps * MemOpCost + Entry->Cost;
4784. // If an entry does not exist, fall back to the default implementation.
4785. // There are no strided stores at the moment, and a store can't be folded into a
4786. // shuffle.
  4787. unsigned NumOfSources = Factor; // The number of values to be merged.
  4788. InstructionCost ShuffleCost =
  4789. getShuffleCost(TTI::SK_PermuteTwoSrc, SingleMemOpTy, None, 0, nullptr);
  4790. unsigned NumOfShufflesPerStore = NumOfSources - 1;
4791. // The SK_PermuteTwoSrc shuffle clobbers one of its source operands.
  4792. // We need additional instructions to keep sources.
  4793. unsigned NumOfMoves = NumOfMemOps * NumOfShufflesPerStore / 2;
  4794. InstructionCost Cost =
  4795. MaskCost +
  4796. NumOfMemOps * (MemOpCost + NumOfShufflesPerStore * ShuffleCost) +
  4797. NumOfMoves;
  4798. return Cost;
  4799. }
  4800. InstructionCost X86TTIImpl::getInterleavedMemoryOpCost(
  4801. unsigned Opcode, Type *BaseTy, unsigned Factor, ArrayRef<unsigned> Indices,
  4802. Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
  4803. bool UseMaskForCond, bool UseMaskForGaps) {
  4804. auto *VecTy = cast<FixedVectorType>(BaseTy);
  4805. auto isSupportedOnAVX512 = [&](Type *VecTy, bool HasBW) {
  4806. Type *EltTy = cast<VectorType>(VecTy)->getElementType();
  4807. if (EltTy->isFloatTy() || EltTy->isDoubleTy() || EltTy->isIntegerTy(64) ||
  4808. EltTy->isIntegerTy(32) || EltTy->isPointerTy())
  4809. return true;
  4810. if (EltTy->isIntegerTy(16) || EltTy->isIntegerTy(8) ||
  4811. (!ST->useSoftFloat() && ST->hasFP16() && EltTy->isHalfTy()))
  4812. return HasBW;
  4813. return false;
  4814. };
  4815. if (ST->hasAVX512() && isSupportedOnAVX512(VecTy, ST->hasBWI()))
  4816. return getInterleavedMemoryOpCostAVX512(
  4817. Opcode, VecTy, Factor, Indices, Alignment,
  4818. AddressSpace, CostKind, UseMaskForCond, UseMaskForGaps);
  4819. if (UseMaskForCond || UseMaskForGaps)
  4820. return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
  4821. Alignment, AddressSpace, CostKind,
  4822. UseMaskForCond, UseMaskForGaps);
  4823. // Get estimation for interleaved load/store operations for SSE-AVX2.
  4824. // As opposed to AVX-512, SSE-AVX2 do not have generic shuffles that allow
  4825. // computing the cost using a generic formula as a function of generic
  4826. // shuffles. We therefore use a lookup table instead, filled according to
  4827. // the instruction sequences that codegen currently generates.
  4828. // VecTy for interleave memop is <VF*Factor x Elt>.
  4829. // So, for VF=4, Interleave Factor = 3, Element type = i32 we have
  4830. // VecTy = <12 x i32>.
  4831. MVT LegalVT = getTLI()->getTypeLegalizationCost(DL, VecTy).second;
  4832. // This function can be called with VecTy=<6xi128>, Factor=3, in which case
  4833. // the VF=2, while v2i128 is an unsupported MVT vector type
  4834. // (see MachineValueType.h::getVectorVT()).
  4835. if (!LegalVT.isVector())
  4836. return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
  4837. Alignment, AddressSpace, CostKind);
  4838. unsigned VF = VecTy->getNumElements() / Factor;
  4839. Type *ScalarTy = VecTy->getElementType();
  4840. // Deduplicate entries, model floats/pointers as appropriately-sized integers.
  4841. if (!ScalarTy->isIntegerTy())
  4842. ScalarTy =
  4843. Type::getIntNTy(ScalarTy->getContext(), DL.getTypeSizeInBits(ScalarTy));
  4844. // Get the cost of all the memory operations.
  4845. // FIXME: discount dead loads.
  4846. InstructionCost MemOpCosts = getMemoryOpCost(
  4847. Opcode, VecTy, MaybeAlign(Alignment), AddressSpace, CostKind);
  4848. auto *VT = FixedVectorType::get(ScalarTy, VF);
  4849. EVT ETy = TLI->getValueType(DL, VT);
  4850. if (!ETy.isSimple())
  4851. return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
  4852. Alignment, AddressSpace, CostKind);
  4853. // TODO: Complete for other data-types and strides.
  4854. // Each combination of Stride, element bit width and VF results in a different
  4855. // sequence; The cost tables are therefore accessed with:
  4856. // Factor (stride) and VectorType=VFxiN.
  4857. // The Cost accounts only for the shuffle sequence;
  4858. // The cost of the loads/stores is accounted for separately.
  4859. //
  4860. static const CostTblEntry AVX2InterleavedLoadTbl[] = {
  4861. {2, MVT::v2i8, 2}, // (load 4i8 and) deinterleave into 2 x 2i8
  4862. {2, MVT::v4i8, 2}, // (load 8i8 and) deinterleave into 2 x 4i8
  4863. {2, MVT::v8i8, 2}, // (load 16i8 and) deinterleave into 2 x 8i8
  4864. {2, MVT::v16i8, 4}, // (load 32i8 and) deinterleave into 2 x 16i8
  4865. {2, MVT::v32i8, 6}, // (load 64i8 and) deinterleave into 2 x 32i8
  4866. {2, MVT::v8i16, 6}, // (load 16i16 and) deinterleave into 2 x 8i16
  4867. {2, MVT::v16i16, 9}, // (load 32i16 and) deinterleave into 2 x 16i16
  4868. {2, MVT::v32i16, 18}, // (load 64i16 and) deinterleave into 2 x 32i16
  4869. {2, MVT::v8i32, 4}, // (load 16i32 and) deinterleave into 2 x 8i32
  4870. {2, MVT::v16i32, 8}, // (load 32i32 and) deinterleave into 2 x 16i32
  4871. {2, MVT::v32i32, 16}, // (load 64i32 and) deinterleave into 2 x 32i32
  4872. {2, MVT::v4i64, 4}, // (load 8i64 and) deinterleave into 2 x 4i64
  4873. {2, MVT::v8i64, 8}, // (load 16i64 and) deinterleave into 2 x 8i64
  4874. {2, MVT::v16i64, 16}, // (load 32i64 and) deinterleave into 2 x 16i64
  4875. {2, MVT::v32i64, 32}, // (load 64i64 and) deinterleave into 2 x 32i64
  4876. {3, MVT::v2i8, 3}, // (load 6i8 and) deinterleave into 3 x 2i8
  4877. {3, MVT::v4i8, 3}, // (load 12i8 and) deinterleave into 3 x 4i8
  4878. {3, MVT::v8i8, 6}, // (load 24i8 and) deinterleave into 3 x 8i8
  4879. {3, MVT::v16i8, 11}, // (load 48i8 and) deinterleave into 3 x 16i8
  4880. {3, MVT::v32i8, 14}, // (load 96i8 and) deinterleave into 3 x 32i8
  4881. {3, MVT::v2i16, 5}, // (load 6i16 and) deinterleave into 3 x 2i16
  4882. {3, MVT::v4i16, 7}, // (load 12i16 and) deinterleave into 3 x 4i16
  4883. {3, MVT::v8i16, 9}, // (load 24i16 and) deinterleave into 3 x 8i16
  4884. {3, MVT::v16i16, 28}, // (load 48i16 and) deinterleave into 3 x 16i16
  4885. {3, MVT::v32i16, 56}, // (load 96i16 and) deinterleave into 3 x 32i16
  4886. {3, MVT::v2i32, 3}, // (load 6i32 and) deinterleave into 3 x 2i32
  4887. {3, MVT::v4i32, 3}, // (load 12i32 and) deinterleave into 3 x 4i32
  4888. {3, MVT::v8i32, 7}, // (load 24i32 and) deinterleave into 3 x 8i32
  4889. {3, MVT::v16i32, 14}, // (load 48i32 and) deinterleave into 3 x 16i32
  4890. {3, MVT::v32i32, 32}, // (load 96i32 and) deinterleave into 3 x 32i32
  4891. {3, MVT::v2i64, 1}, // (load 6i64 and) deinterleave into 3 x 2i64
  4892. {3, MVT::v4i64, 5}, // (load 12i64 and) deinterleave into 3 x 4i64
  4893. {3, MVT::v8i64, 10}, // (load 24i64 and) deinterleave into 3 x 8i64
  4894. {3, MVT::v16i64, 20}, // (load 48i64 and) deinterleave into 3 x 16i64
  4895. {4, MVT::v2i8, 4}, // (load 8i8 and) deinterleave into 4 x 2i8
  4896. {4, MVT::v4i8, 4}, // (load 16i8 and) deinterleave into 4 x 4i8
  4897. {4, MVT::v8i8, 12}, // (load 32i8 and) deinterleave into 4 x 8i8
  4898. {4, MVT::v16i8, 24}, // (load 64i8 and) deinterleave into 4 x 16i8
  4899. {4, MVT::v32i8, 56}, // (load 128i8 and) deinterleave into 4 x 32i8
  4900. {4, MVT::v2i16, 6}, // (load 8i16 and) deinterleave into 4 x 2i16
  4901. {4, MVT::v4i16, 17}, // (load 16i16 and) deinterleave into 4 x 4i16
  4902. {4, MVT::v8i16, 33}, // (load 32i16 and) deinterleave into 4 x 8i16
  4903. {4, MVT::v16i16, 75}, // (load 64i16 and) deinterleave into 4 x 16i16
  4904. {4, MVT::v32i16, 150}, // (load 128i16 and) deinterleave into 4 x 32i16
  4905. {4, MVT::v2i32, 4}, // (load 8i32 and) deinterleave into 4 x 2i32
  4906. {4, MVT::v4i32, 8}, // (load 16i32 and) deinterleave into 4 x 4i32
  4907. {4, MVT::v8i32, 16}, // (load 32i32 and) deinterleave into 4 x 8i32
  4908. {4, MVT::v16i32, 32}, // (load 64i32 and) deinterleave into 4 x 16i32
  4909. {4, MVT::v32i32, 68}, // (load 128i32 and) deinterleave into 4 x 32i32
  4910. {4, MVT::v2i64, 6}, // (load 8i64 and) deinterleave into 4 x 2i64
  4911. {4, MVT::v4i64, 8}, // (load 16i64 and) deinterleave into 4 x 4i64
  4912. {4, MVT::v8i64, 20}, // (load 32i64 and) deinterleave into 4 x 8i64
  4913. {4, MVT::v16i64, 40}, // (load 64i64 and) deinterleave into 4 x 16i64
  4914. {6, MVT::v2i8, 6}, // (load 12i8 and) deinterleave into 6 x 2i8
  4915. {6, MVT::v4i8, 14}, // (load 24i8 and) deinterleave into 6 x 4i8
  4916. {6, MVT::v8i8, 18}, // (load 48i8 and) deinterleave into 6 x 8i8
  4917. {6, MVT::v16i8, 43}, // (load 96i8 and) deinterleave into 6 x 16i8
  4918. {6, MVT::v32i8, 82}, // (load 192i8 and) deinterleave into 6 x 32i8
  4919. {6, MVT::v2i16, 13}, // (load 12i16 and) deinterleave into 6 x 2i16
  4920. {6, MVT::v4i16, 9}, // (load 24i16 and) deinterleave into 6 x 4i16
  4921. {6, MVT::v8i16, 39}, // (load 48i16 and) deinterleave into 6 x 8i16
  4922. {6, MVT::v16i16, 106}, // (load 96i16 and) deinterleave into 6 x 16i16
  4923. {6, MVT::v32i16, 212}, // (load 192i16 and) deinterleave into 6 x 32i16
  4924. {6, MVT::v2i32, 6}, // (load 12i32 and) deinterleave into 6 x 2i32
  4925. {6, MVT::v4i32, 15}, // (load 24i32 and) deinterleave into 6 x 4i32
  4926. {6, MVT::v8i32, 31}, // (load 48i32 and) deinterleave into 6 x 8i32
  4927. {6, MVT::v16i32, 64}, // (load 96i32 and) deinterleave into 6 x 16i32
  4928. {6, MVT::v2i64, 6}, // (load 12i64 and) deinterleave into 6 x 2i64
  4929. {6, MVT::v4i64, 18}, // (load 24i64 and) deinterleave into 6 x 4i64
  4930. {6, MVT::v8i64, 36}, // (load 48i64 and) deinterleave into 6 x 8i64
  4931. {8, MVT::v8i32, 40} // (load 64i32 and) deinterleave into 8 x 8i32
  4932. };
  4933. static const CostTblEntry SSSE3InterleavedLoadTbl[] = {
  4934. {2, MVT::v4i16, 2}, // (load 8i16 and) deinterleave into 2 x 4i16
  4935. };
  4936. static const CostTblEntry SSE2InterleavedLoadTbl[] = {
  4937. {2, MVT::v2i16, 2}, // (load 4i16 and) deinterleave into 2 x 2i16
  4938. {2, MVT::v4i16, 7}, // (load 8i16 and) deinterleave into 2 x 4i16
  4939. {2, MVT::v2i32, 2}, // (load 4i32 and) deinterleave into 2 x 2i32
  4940. {2, MVT::v4i32, 2}, // (load 8i32 and) deinterleave into 2 x 4i32
  4941. {2, MVT::v2i64, 2}, // (load 4i64 and) deinterleave into 2 x 2i64
  4942. };
  4943. static const CostTblEntry AVX2InterleavedStoreTbl[] = {
  4944. {2, MVT::v16i8, 3}, // interleave 2 x 16i8 into 32i8 (and store)
  4945. {2, MVT::v32i8, 4}, // interleave 2 x 32i8 into 64i8 (and store)
  4946. {2, MVT::v8i16, 3}, // interleave 2 x 8i16 into 16i16 (and store)
  4947. {2, MVT::v16i16, 4}, // interleave 2 x 16i16 into 32i16 (and store)
  4948. {2, MVT::v32i16, 8}, // interleave 2 x 32i16 into 64i16 (and store)
  4949. {2, MVT::v4i32, 2}, // interleave 2 x 4i32 into 8i32 (and store)
  4950. {2, MVT::v8i32, 4}, // interleave 2 x 8i32 into 16i32 (and store)
  4951. {2, MVT::v16i32, 8}, // interleave 2 x 16i32 into 32i32 (and store)
  4952. {2, MVT::v32i32, 16}, // interleave 2 x 32i32 into 64i32 (and store)
  4953. {2, MVT::v2i64, 2}, // interleave 2 x 2i64 into 4i64 (and store)
  4954. {2, MVT::v4i64, 4}, // interleave 2 x 4i64 into 8i64 (and store)
  4955. {2, MVT::v8i64, 8}, // interleave 2 x 8i64 into 16i64 (and store)
  4956. {2, MVT::v16i64, 16}, // interleave 2 x 16i64 into 32i64 (and store)
  4957. {2, MVT::v32i64, 32}, // interleave 2 x 32i64 into 64i64 (and store)
  4958. {3, MVT::v2i8, 4}, // interleave 3 x 2i8 into 6i8 (and store)
  4959. {3, MVT::v4i8, 4}, // interleave 3 x 4i8 into 12i8 (and store)
  4960. {3, MVT::v8i8, 6}, // interleave 3 x 8i8 into 24i8 (and store)
  4961. {3, MVT::v16i8, 11}, // interleave 3 x 16i8 into 48i8 (and store)
  4962. {3, MVT::v32i8, 13}, // interleave 3 x 32i8 into 96i8 (and store)
  4963. {3, MVT::v2i16, 4}, // interleave 3 x 2i16 into 6i16 (and store)
  4964. {3, MVT::v4i16, 6}, // interleave 3 x 4i16 into 12i16 (and store)
  4965. {3, MVT::v8i16, 12}, // interleave 3 x 8i16 into 24i16 (and store)
  4966. {3, MVT::v16i16, 27}, // interleave 3 x 16i16 into 48i16 (and store)
  4967. {3, MVT::v32i16, 54}, // interleave 3 x 32i16 into 96i16 (and store)
  4968. {3, MVT::v2i32, 4}, // interleave 3 x 2i32 into 6i32 (and store)
  4969. {3, MVT::v4i32, 5}, // interleave 3 x 4i32 into 12i32 (and store)
  4970. {3, MVT::v8i32, 11}, // interleave 3 x 8i32 into 24i32 (and store)
  4971. {3, MVT::v16i32, 22}, // interleave 3 x 16i32 into 48i32 (and store)
  4972. {3, MVT::v32i32, 48}, // interleave 3 x 32i32 into 96i32 (and store)
  4973. {3, MVT::v2i64, 4}, // interleave 3 x 2i64 into 6i64 (and store)
  4974. {3, MVT::v4i64, 6}, // interleave 3 x 4i64 into 12i64 (and store)
  4975. {3, MVT::v8i64, 12}, // interleave 3 x 8i64 into 24i64 (and store)
  4976. {3, MVT::v16i64, 24}, // interleave 3 x 16i64 into 48i64 (and store)
  4977. {4, MVT::v2i8, 4}, // interleave 4 x 2i8 into 8i8 (and store)
  4978. {4, MVT::v4i8, 4}, // interleave 4 x 4i8 into 16i8 (and store)
  4979. {4, MVT::v8i8, 4}, // interleave 4 x 8i8 into 32i8 (and store)
  4980. {4, MVT::v16i8, 8}, // interleave 4 x 16i8 into 64i8 (and store)
  4981. {4, MVT::v32i8, 12}, // interleave 4 x 32i8 into 128i8 (and store)
  4982. {4, MVT::v2i16, 2}, // interleave 4 x 2i16 into 8i16 (and store)
  4983. {4, MVT::v4i16, 6}, // interleave 4 x 4i16 into 16i16 (and store)
  4984. {4, MVT::v8i16, 10}, // interleave 4 x 8i16 into 32i16 (and store)
  4985. {4, MVT::v16i16, 32}, // interleave 4 x 16i16 into 64i16 (and store)
  4986. {4, MVT::v32i16, 64}, // interleave 4 x 32i16 into 128i16 (and store)
  4987. {4, MVT::v2i32, 5}, // interleave 4 x 2i32 into 8i32 (and store)
  4988. {4, MVT::v4i32, 6}, // interleave 4 x 4i32 into 16i32 (and store)
  4989. {4, MVT::v8i32, 16}, // interleave 4 x 8i32 into 32i32 (and store)
  4990. {4, MVT::v16i32, 32}, // interleave 4 x 16i32 into 64i32 (and store)
  4991. {4, MVT::v32i32, 64}, // interleave 4 x 32i32 into 128i32 (and store)
  4992. {4, MVT::v2i64, 6}, // interleave 4 x 2i64 into 8i64 (and store)
  4993. {4, MVT::v4i64, 8}, // interleave 4 x 4i64 into 16i64 (and store)
  4994. {4, MVT::v8i64, 20}, // interleave 4 x 8i64 into 32i64 (and store)
  4995. {4, MVT::v16i64, 40}, // interleave 4 x 16i64 into 64i64 (and store)
  4996. {6, MVT::v2i8, 7}, // interleave 6 x 2i8 into 12i8 (and store)
  4997. {6, MVT::v4i8, 9}, // interleave 6 x 4i8 into 24i8 (and store)
  4998. {6, MVT::v8i8, 16}, // interleave 6 x 8i8 into 48i8 (and store)
  4999. {6, MVT::v16i8, 27}, // interleave 6 x 16i8 into 96i8 (and store)
  5000. {6, MVT::v32i8, 90}, // interleave 6 x 32i8 into 192i8 (and store)
  5001. {6, MVT::v2i16, 10}, // interleave 6 x 2i16 into 12i16 (and store)
  5002. {6, MVT::v4i16, 15}, // interleave 6 x 4i16 into 24i16 (and store)
  5003. {6, MVT::v8i16, 21}, // interleave 6 x 8i16 into 48i16 (and store)
  5004. {6, MVT::v16i16, 58}, // interleave 6 x 16i16 into 96i16 (and store)
  5005. {6, MVT::v32i16, 90}, // interleave 6 x 32i16 into 192i16 (and store)
  5006. {6, MVT::v2i32, 9}, // interleave 6 x 2i32 into 12i32 (and store)
  5007. {6, MVT::v4i32, 12}, // interleave 6 x 4i32 into 24i32 (and store)
  5008. {6, MVT::v8i32, 33}, // interleave 6 x 8i32 into 48i32 (and store)
  5009. {6, MVT::v16i32, 66}, // interleave 6 x 16i32 into 96i32 (and store)
  5010. {6, MVT::v2i64, 8}, // interleave 6 x 2i64 into 12i64 (and store)
  5011. {6, MVT::v4i64, 15}, // interleave 6 x 4i64 into 24i64 (and store)
  5012. {6, MVT::v8i64, 30}, // interleave 6 x 8i64 into 48i64 (and store)
  5013. };
  5014. static const CostTblEntry SSE2InterleavedStoreTbl[] = {
  5015. {2, MVT::v2i8, 1}, // interleave 2 x 2i8 into 4i8 (and store)
  5016. {2, MVT::v4i8, 1}, // interleave 2 x 4i8 into 8i8 (and store)
  5017. {2, MVT::v8i8, 1}, // interleave 2 x 8i8 into 16i8 (and store)
  5018. {2, MVT::v2i16, 1}, // interleave 2 x 2i16 into 4i16 (and store)
  5019. {2, MVT::v4i16, 1}, // interleave 2 x 4i16 into 8i16 (and store)
  5020. {2, MVT::v2i32, 1}, // interleave 2 x 2i32 into 4i32 (and store)
  5021. };
  5022. if (Opcode == Instruction::Load) {
  5023. auto GetDiscountedCost = [Factor, NumMembers = Indices.size(),
  5024. MemOpCosts](const CostTblEntry *Entry) {
  5025. // NOTE: this is just an approximation!
5026. // It can over/under-estimate the cost!
  5027. return MemOpCosts + divideCeil(NumMembers * Entry->Cost, Factor);
  5028. };
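// For example, an AVX2 stride-3 load of v8i32 has a table cost of 7; if only
// 2 of the 3 members are actually used, the shuffle cost charged is
// divideCeil(2 * 7, 3) == 5 on top of the memory cost.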
  5029. if (ST->hasAVX2())
  5030. if (const auto *Entry = CostTableLookup(AVX2InterleavedLoadTbl, Factor,
  5031. ETy.getSimpleVT()))
  5032. return GetDiscountedCost(Entry);
  5033. if (ST->hasSSSE3())
  5034. if (const auto *Entry = CostTableLookup(SSSE3InterleavedLoadTbl, Factor,
  5035. ETy.getSimpleVT()))
  5036. return GetDiscountedCost(Entry);
  5037. if (ST->hasSSE2())
  5038. if (const auto *Entry = CostTableLookup(SSE2InterleavedLoadTbl, Factor,
  5039. ETy.getSimpleVT()))
  5040. return GetDiscountedCost(Entry);
  5041. } else {
  5042. assert(Opcode == Instruction::Store &&
  5043. "Expected Store Instruction at this point");
  5044. assert((!Indices.size() || Indices.size() == Factor) &&
  5045. "Interleaved store only supports fully-interleaved groups.");
  5046. if (ST->hasAVX2())
  5047. if (const auto *Entry = CostTableLookup(AVX2InterleavedStoreTbl, Factor,
  5048. ETy.getSimpleVT()))
  5049. return MemOpCosts + Entry->Cost;
  5050. if (ST->hasSSE2())
  5051. if (const auto *Entry = CostTableLookup(SSE2InterleavedStoreTbl, Factor,
  5052. ETy.getSimpleVT()))
  5053. return MemOpCosts + Entry->Cost;
  5054. }
  5055. return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
  5056. Alignment, AddressSpace, CostKind,
  5057. UseMaskForCond, UseMaskForGaps);
  5058. }