//===- X86ISelDAGToDAG.cpp - A DAG pattern matching inst selector for X86 -===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file defines a DAG pattern matching instruction selector for X86,
// converting from a legalized DAG to an X86 DAG.
//
//===----------------------------------------------------------------------===//
#include "X86.h"
#include "X86MachineFunctionInfo.h"
#include "X86RegisterInfo.h"
#include "X86Subtarget.h"
#include "X86TargetMachine.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/SelectionDAGISel.h"
#include "llvm/Config/llvm-config.h"
#include "llvm/IR/ConstantRange.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/IntrinsicsX86.h"
#include "llvm/IR/Type.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Support/MathExtras.h"
#include <cstdint>
using namespace llvm;
#define DEBUG_TYPE "x86-isel"
#define PASS_NAME "X86 DAG->DAG Instruction Selection"
STATISTIC(NumLoadMoved, "Number of loads moved below TokenFactor");
static cl::opt<bool> AndImmShrink("x86-and-imm-shrink", cl::init(true),
    cl::desc("Enable setting constant bits to reduce size of mask immediates"),
    cl::Hidden);
static cl::opt<bool> EnablePromoteAnyextLoad(
    "x86-promote-anyext-load", cl::init(true),
    cl::desc("Enable promoting aligned anyext load to wider load"), cl::Hidden);
extern cl::opt<bool> IndirectBranchTracking;
//===----------------------------------------------------------------------===//
// Pattern Matcher Implementation
//===----------------------------------------------------------------------===//
namespace {
/// This corresponds to X86AddressMode, but uses SDValues instead of register
/// numbers for the leaves of the matched tree.
struct X86ISelAddressMode {
  enum {
    RegBase,
    FrameIndexBase
  } BaseType = RegBase;
  // This is really a union, discriminated by BaseType!
  SDValue Base_Reg;
  int Base_FrameIndex = 0;
  unsigned Scale = 1;
  SDValue IndexReg;
  int32_t Disp = 0;
  SDValue Segment;
  const GlobalValue *GV = nullptr;
  const Constant *CP = nullptr;
  const BlockAddress *BlockAddr = nullptr;
  const char *ES = nullptr;
  MCSymbol *MCSym = nullptr;
  int JT = -1;
  Align Alignment; // CP alignment.
  unsigned char SymbolFlags = X86II::MO_NO_FLAG; // X86II::MO_*
  bool NegateIndex = false;
  X86ISelAddressMode() = default;
  bool hasSymbolicDisplacement() const {
    return GV != nullptr || CP != nullptr || ES != nullptr ||
           MCSym != nullptr || JT != -1 || BlockAddr != nullptr;
  }
  bool hasBaseOrIndexReg() const {
    return BaseType == FrameIndexBase ||
           IndexReg.getNode() != nullptr || Base_Reg.getNode() != nullptr;
  }
  /// Return true if this addressing mode is already RIP-relative.
  bool isRIPRelative() const {
    if (BaseType != RegBase) return false;
    if (RegisterSDNode *RegNode =
            dyn_cast_or_null<RegisterSDNode>(Base_Reg.getNode()))
      return RegNode->getReg() == X86::RIP;
    return false;
  }
  void setBaseReg(SDValue Reg) {
    BaseType = RegBase;
    Base_Reg = Reg;
  }
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
  void dump(SelectionDAG *DAG = nullptr) {
    dbgs() << "X86ISelAddressMode " << this << '\n';
    dbgs() << "Base_Reg ";
    if (Base_Reg.getNode())
      Base_Reg.getNode()->dump(DAG);
    else
      dbgs() << "nul\n";
    if (BaseType == FrameIndexBase)
      dbgs() << " Base.FrameIndex " << Base_FrameIndex << '\n';
    dbgs() << " Scale " << Scale << '\n'
           << "IndexReg ";
    if (NegateIndex)
      dbgs() << "negate ";
    if (IndexReg.getNode())
      IndexReg.getNode()->dump(DAG);
    else
      dbgs() << "nul\n";
    dbgs() << " Disp " << Disp << '\n'
           << "GV ";
    if (GV)
      GV->dump();
    else
      dbgs() << "nul";
    dbgs() << " CP ";
    if (CP)
      CP->dump();
    else
      dbgs() << "nul";
    dbgs() << '\n'
           << "ES ";
    if (ES)
      dbgs() << ES;
    else
      dbgs() << "nul";
    dbgs() << " MCSym ";
    if (MCSym)
      dbgs() << MCSym;
    else
      dbgs() << "nul";
    dbgs() << " JT" << JT << " Align" << Alignment.value() << '\n';
  }
#endif
};
}
namespace {
  //===--------------------------------------------------------------------===//
  /// ISel - X86-specific code to select X86 machine instructions for
  /// SelectionDAG operations.
  ///
  class X86DAGToDAGISel final : public SelectionDAGISel {
    /// Keep a pointer to the X86Subtarget around so that we can
    /// make the right decision when generating code for different targets.
    const X86Subtarget *Subtarget;
    /// If true, selector should try to optimize for minimum code size.
    bool OptForMinSize;
    /// Disable direct TLS access through segment registers.
    bool IndirectTlsSegRefs;
  public:
    static char ID;
    X86DAGToDAGISel() = delete;
    explicit X86DAGToDAGISel(X86TargetMachine &tm, CodeGenOpt::Level OptLevel)
        : SelectionDAGISel(ID, tm, OptLevel), Subtarget(nullptr),
          OptForMinSize(false), IndirectTlsSegRefs(false) {}
    bool runOnMachineFunction(MachineFunction &MF) override {
      // Reset the subtarget each time through.
      Subtarget = &MF.getSubtarget<X86Subtarget>();
      IndirectTlsSegRefs = MF.getFunction().hasFnAttribute(
          "indirect-tls-seg-refs");
      // OptFor[Min]Size are used in pattern predicates that isel is matching.
      OptForMinSize = MF.getFunction().hasMinSize();
      assert((!OptForMinSize || MF.getFunction().hasOptSize()) &&
             "OptForMinSize implies OptForSize");
      SelectionDAGISel::runOnMachineFunction(MF);
      return true;
    }
    void emitFunctionEntryCode() override;
    bool IsProfitableToFold(SDValue N, SDNode *U, SDNode *Root) const override;
    void PreprocessISelDAG() override;
    void PostprocessISelDAG() override;
    // Include the pieces autogenerated from the target description.
#include "X86GenDAGISel.inc"
  private:
    void Select(SDNode *N) override;
    bool foldOffsetIntoAddress(uint64_t Offset, X86ISelAddressMode &AM);
    bool matchLoadInAddress(LoadSDNode *N, X86ISelAddressMode &AM,
                            bool AllowSegmentRegForX32 = false);
    bool matchWrapper(SDValue N, X86ISelAddressMode &AM);
    bool matchAddress(SDValue N, X86ISelAddressMode &AM);
    bool matchVectorAddress(SDValue N, X86ISelAddressMode &AM);
    bool matchAdd(SDValue &N, X86ISelAddressMode &AM, unsigned Depth);
    bool matchAddressRecursively(SDValue N, X86ISelAddressMode &AM,
                                 unsigned Depth);
    bool matchVectorAddressRecursively(SDValue N, X86ISelAddressMode &AM,
                                       unsigned Depth);
    bool matchAddressBase(SDValue N, X86ISelAddressMode &AM);
    bool selectAddr(SDNode *Parent, SDValue N, SDValue &Base,
                    SDValue &Scale, SDValue &Index, SDValue &Disp,
                    SDValue &Segment);
    bool selectVectorAddr(MemSDNode *Parent, SDValue BasePtr, SDValue IndexOp,
                          SDValue ScaleOp, SDValue &Base, SDValue &Scale,
                          SDValue &Index, SDValue &Disp, SDValue &Segment);
    bool selectMOV64Imm32(SDValue N, SDValue &Imm);
    bool selectLEAAddr(SDValue N, SDValue &Base,
                       SDValue &Scale, SDValue &Index, SDValue &Disp,
                       SDValue &Segment);
    bool selectLEA64_32Addr(SDValue N, SDValue &Base,
                            SDValue &Scale, SDValue &Index, SDValue &Disp,
                            SDValue &Segment);
    bool selectTLSADDRAddr(SDValue N, SDValue &Base,
                           SDValue &Scale, SDValue &Index, SDValue &Disp,
                           SDValue &Segment);
    bool selectRelocImm(SDValue N, SDValue &Op);
    bool tryFoldLoad(SDNode *Root, SDNode *P, SDValue N,
                     SDValue &Base, SDValue &Scale,
                     SDValue &Index, SDValue &Disp,
                     SDValue &Segment);
    // Convenience method where P is also root.
    bool tryFoldLoad(SDNode *P, SDValue N,
                     SDValue &Base, SDValue &Scale,
                     SDValue &Index, SDValue &Disp,
                     SDValue &Segment) {
      return tryFoldLoad(P, P, N, Base, Scale, Index, Disp, Segment);
    }
    bool tryFoldBroadcast(SDNode *Root, SDNode *P, SDValue N,
                          SDValue &Base, SDValue &Scale,
                          SDValue &Index, SDValue &Disp,
                          SDValue &Segment);
    bool isProfitableToFormMaskedOp(SDNode *N) const;
    /// Implement addressing mode selection for inline asm expressions.
    bool SelectInlineAsmMemoryOperand(const SDValue &Op,
                                      unsigned ConstraintID,
                                      std::vector<SDValue> &OutOps) override;
    void emitSpecialCodeForMain();
    inline void getAddressOperands(X86ISelAddressMode &AM, const SDLoc &DL,
                                   MVT VT, SDValue &Base, SDValue &Scale,
                                   SDValue &Index, SDValue &Disp,
                                   SDValue &Segment) {
      if (AM.BaseType == X86ISelAddressMode::FrameIndexBase)
        Base = CurDAG->getTargetFrameIndex(
            AM.Base_FrameIndex, TLI->getPointerTy(CurDAG->getDataLayout()));
      else if (AM.Base_Reg.getNode())
        Base = AM.Base_Reg;
      else
        Base = CurDAG->getRegister(0, VT);
      Scale = getI8Imm(AM.Scale, DL);
      // Negate the index if needed.
      if (AM.NegateIndex) {
        unsigned NegOpc = VT == MVT::i64 ? X86::NEG64r : X86::NEG32r;
        SDValue Neg = SDValue(CurDAG->getMachineNode(NegOpc, DL, VT, MVT::i32,
                                                     AM.IndexReg), 0);
        AM.IndexReg = Neg;
      }
      if (AM.IndexReg.getNode())
        Index = AM.IndexReg;
      else
        Index = CurDAG->getRegister(0, VT);
      // These are 32-bit even in 64-bit mode since RIP-relative offset
      // is 32-bit.
      if (AM.GV)
        Disp = CurDAG->getTargetGlobalAddress(AM.GV, SDLoc(),
                                              MVT::i32, AM.Disp,
                                              AM.SymbolFlags);
      else if (AM.CP)
        Disp = CurDAG->getTargetConstantPool(AM.CP, MVT::i32, AM.Alignment,
                                             AM.Disp, AM.SymbolFlags);
      else if (AM.ES) {
        assert(!AM.Disp && "Non-zero displacement is ignored with ES.");
        Disp = CurDAG->getTargetExternalSymbol(AM.ES, MVT::i32, AM.SymbolFlags);
      } else if (AM.MCSym) {
        assert(!AM.Disp && "Non-zero displacement is ignored with MCSym.");
        assert(AM.SymbolFlags == 0 && "oo");
        Disp = CurDAG->getMCSymbol(AM.MCSym, MVT::i32);
      } else if (AM.JT != -1) {
        assert(!AM.Disp && "Non-zero displacement is ignored with JT.");
        Disp = CurDAG->getTargetJumpTable(AM.JT, MVT::i32, AM.SymbolFlags);
      } else if (AM.BlockAddr)
        Disp = CurDAG->getTargetBlockAddress(AM.BlockAddr, MVT::i32, AM.Disp,
                                             AM.SymbolFlags);
      else
        Disp = CurDAG->getTargetConstant(AM.Disp, DL, MVT::i32);
      if (AM.Segment.getNode())
        Segment = AM.Segment;
      else
        Segment = CurDAG->getRegister(0, MVT::i16);
    }
    // Utility function to determine whether we should avoid selecting
    // immediate forms of instructions for better code size or not.
    // At a high level, we'd like to avoid such instructions when
    // we have similar constants used within the same basic block
    // that can be kept in a register.
    //
    bool shouldAvoidImmediateInstFormsForSize(SDNode *N) const {
      uint32_t UseCount = 0;
      // Do not want to hoist if we're not optimizing for size.
      // TODO: We'd like to remove this restriction.
      // See the comment in X86InstrInfo.td for more info.
      if (!CurDAG->shouldOptForSize())
        return false;
      // Walk all the users of the immediate.
      for (const SDNode *User : N->uses()) {
        if (UseCount >= 2)
          break;
        // This user is already selected. Count it as a legitimate use and
        // move on.
        if (User->isMachineOpcode()) {
          UseCount++;
          continue;
        }
        // We want to count stores of immediates as real uses.
        if (User->getOpcode() == ISD::STORE &&
            User->getOperand(1).getNode() == N) {
          UseCount++;
          continue;
        }
        // We don't currently match users that have > 2 operands (except
        // for stores, which are handled above). Those instructions won't
        // match in ISel for now and would be counted incorrectly.
        // This may change in the future as we add additional instruction
        // types.
        if (User->getNumOperands() != 2)
          continue;
        // If this is a sign-extended 8-bit integer immediate used in an ALU
        // instruction, there is probably an opcode encoding to save space.
        auto *C = dyn_cast<ConstantSDNode>(N);
        if (C && isInt<8>(C->getSExtValue()))
          continue;
        // Immediates that are used for offsets as part of stack
        // manipulation should be left alone. These are typically
        // used to indicate SP offsets for argument passing and
        // will get pulled into stores/pushes (implicitly).
        if (User->getOpcode() == X86ISD::ADD ||
            User->getOpcode() == ISD::ADD ||
            User->getOpcode() == X86ISD::SUB ||
            User->getOpcode() == ISD::SUB) {
          // Find the other operand of the add/sub.
          SDValue OtherOp = User->getOperand(0);
          if (OtherOp.getNode() == N)
            OtherOp = User->getOperand(1);
          // Don't count if the other operand is SP.
          RegisterSDNode *RegNode;
          if (OtherOp->getOpcode() == ISD::CopyFromReg &&
              (RegNode = dyn_cast_or_null<RegisterSDNode>(
                   OtherOp->getOperand(1).getNode())))
            if ((RegNode->getReg() == X86::ESP) ||
                (RegNode->getReg() == X86::RSP))
              continue;
        }
        // ... otherwise, count this and move on.
        UseCount++;
      }
      // If we have more than 1 use, then recommend for hoisting.
      return (UseCount > 1);
    }
    /// Return a target constant with the specified value of type i8.
    inline SDValue getI8Imm(unsigned Imm, const SDLoc &DL) {
      return CurDAG->getTargetConstant(Imm, DL, MVT::i8);
    }
    /// Return a target constant with the specified value, of type i32.
    inline SDValue getI32Imm(unsigned Imm, const SDLoc &DL) {
      return CurDAG->getTargetConstant(Imm, DL, MVT::i32);
    }
    /// Return a target constant with the specified value, of type i64.
    inline SDValue getI64Imm(uint64_t Imm, const SDLoc &DL) {
      return CurDAG->getTargetConstant(Imm, DL, MVT::i64);
    }
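    // The next helpers translate a constant element index (scaled by the
    // element size) into the 128/256-bit subvector index expected by the
    // VEXTRACT*/VINSERT* immediate fields.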
    SDValue getExtractVEXTRACTImmediate(SDNode *N, unsigned VecWidth,
                                        const SDLoc &DL) {
      assert((VecWidth == 128 || VecWidth == 256) && "Unexpected vector width");
      uint64_t Index = N->getConstantOperandVal(1);
      MVT VecVT = N->getOperand(0).getSimpleValueType();
      return getI8Imm((Index * VecVT.getScalarSizeInBits()) / VecWidth, DL);
    }
    SDValue getInsertVINSERTImmediate(SDNode *N, unsigned VecWidth,
                                      const SDLoc &DL) {
      assert((VecWidth == 128 || VecWidth == 256) && "Unexpected vector width");
      uint64_t Index = N->getConstantOperandVal(2);
      MVT VecVT = N->getSimpleValueType(0);
      return getI8Imm((Index * VecVT.getScalarSizeInBits()) / VecWidth, DL);
    }
    SDValue getPermuteVINSERTCommutedImmediate(SDNode *N, unsigned VecWidth,
                                               const SDLoc &DL) {
      assert(VecWidth == 128 && "Unexpected vector width");
      uint64_t Index = N->getConstantOperandVal(2);
      MVT VecVT = N->getSimpleValueType(0);
      uint64_t InsertIdx = (Index * VecVT.getScalarSizeInBits()) / VecWidth;
      assert((InsertIdx == 0 || InsertIdx == 1) && "Bad insertf128 index");
      // vinsert(0,sub,vec) -> [sub0][vec1] -> vperm2x128(0x30,vec,sub)
      // vinsert(1,sub,vec) -> [vec0][sub0] -> vperm2x128(0x02,vec,sub)
      return getI8Imm(InsertIdx ? 0x02 : 0x30, DL);
    }
    SDValue getSBBZero(SDNode *N) {
      SDLoc dl(N);
      MVT VT = N->getSimpleValueType(0);
      // Create zero.
      SDVTList VTs = CurDAG->getVTList(MVT::i32, MVT::i32);
      SDValue Zero = SDValue(
          CurDAG->getMachineNode(X86::MOV32r0, dl, VTs, std::nullopt), 0);
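      // Writes to a 32-bit register implicitly zero the upper 32 bits, so an
      // i64 zero only needs the 32-bit zero widened with SUBREG_TO_REG.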
      if (VT == MVT::i64) {
        Zero = SDValue(
            CurDAG->getMachineNode(
                TargetOpcode::SUBREG_TO_REG, dl, MVT::i64,
                CurDAG->getTargetConstant(0, dl, MVT::i64), Zero,
                CurDAG->getTargetConstant(X86::sub_32bit, dl, MVT::i32)),
            0);
      }
      // Copy flags to the EFLAGS register and glue it to next node.
      unsigned Opcode = N->getOpcode();
      assert((Opcode == X86ISD::SBB || Opcode == X86ISD::SETCC_CARRY) &&
             "Unexpected opcode for SBB materialization");
      unsigned FlagOpIndex = Opcode == X86ISD::SBB ? 2 : 1;
      SDValue EFLAGS =
          CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, X86::EFLAGS,
                               N->getOperand(FlagOpIndex), SDValue());
      // Create a 64-bit instruction if the result is 64 bits; otherwise use
      // the 32-bit version.
      unsigned Opc = VT == MVT::i64 ? X86::SBB64rr : X86::SBB32rr;
      MVT SBBVT = VT == MVT::i64 ? MVT::i64 : MVT::i32;
      VTs = CurDAG->getVTList(SBBVT, MVT::i32);
      return SDValue(
          CurDAG->getMachineNode(Opc, dl, VTs,
                                 {Zero, Zero, EFLAGS, EFLAGS.getValue(1)}),
          0);
    }
    // Helper to detect unneeded AND instructions on shift amounts. Called
    // from PatFrags in tablegen.
    bool isUnneededShiftMask(SDNode *N, unsigned Width) const {
      assert(N->getOpcode() == ISD::AND && "Unexpected opcode");
      const APInt &Val = cast<ConstantSDNode>(N->getOperand(1))->getAPIntValue();
      if (Val.countTrailingOnes() >= Width)
        return true;
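      // The AND is also unneeded if every low bit it could clear is already
      // known to be zero in the operand.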
      APInt Mask = Val | CurDAG->computeKnownBits(N->getOperand(0)).Zero;
      return Mask.countTrailingOnes() >= Width;
    }
    /// Return an SDNode that returns the value of the global base register.
    /// Output instructions required to initialize the global base register,
    /// if necessary.
    SDNode *getGlobalBaseReg();
    /// Return a reference to the TargetMachine, casted to the target-specific
    /// type.
    const X86TargetMachine &getTargetMachine() const {
      return static_cast<const X86TargetMachine &>(TM);
    }
    /// Return a reference to the TargetInstrInfo, casted to the target-specific
    /// type.
    const X86InstrInfo *getInstrInfo() const {
      return Subtarget->getInstrInfo();
    }
    /// Return a condition code of the given SDNode
    X86::CondCode getCondFromNode(SDNode *N) const;
    /// Address-mode matching performs shift-of-and to and-of-shift
    /// reassociation in order to expose more scaled addressing
    /// opportunities.
    bool ComplexPatternFuncMutatesDAG() const override {
      return true;
    }
    bool isSExtAbsoluteSymbolRef(unsigned Width, SDNode *N) const;
    // Indicates we should prefer to use a non-temporal load for this load.
    bool useNonTemporalLoad(LoadSDNode *N) const {
      if (!N->isNonTemporal())
        return false;
      unsigned StoreSize = N->getMemoryVT().getStoreSize();
      if (N->getAlign().value() < StoreSize)
        return false;
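      // Non-temporal loads only exist as vector instructions (MOVNTDQA and its
      // wider AVX/AVX-512 forms), so 4/8-byte loads just use a normal load.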
      switch (StoreSize) {
      default: llvm_unreachable("Unsupported store size");
      case 4:
      case 8:
        return false;
      case 16:
        return Subtarget->hasSSE41();
      case 32:
        return Subtarget->hasAVX2();
      case 64:
        return Subtarget->hasAVX512();
      }
    }
    bool foldLoadStoreIntoMemOperand(SDNode *Node);
    MachineSDNode *matchBEXTRFromAndImm(SDNode *Node);
    bool matchBitExtract(SDNode *Node);
    bool shrinkAndImmediate(SDNode *N);
    bool isMaskZeroExtended(SDNode *N) const;
    bool tryShiftAmountMod(SDNode *N);
    bool tryShrinkShlLogicImm(SDNode *N);
    bool tryVPTERNLOG(SDNode *N);
    bool matchVPTERNLOG(SDNode *Root, SDNode *ParentA, SDNode *ParentB,
                        SDNode *ParentC, SDValue A, SDValue B, SDValue C,
                        uint8_t Imm);
    bool tryVPTESTM(SDNode *Root, SDValue Setcc, SDValue Mask);
    bool tryMatchBitSelect(SDNode *N);
    MachineSDNode *emitPCMPISTR(unsigned ROpc, unsigned MOpc, bool MayFoldLoad,
                                const SDLoc &dl, MVT VT, SDNode *Node);
    MachineSDNode *emitPCMPESTR(unsigned ROpc, unsigned MOpc, bool MayFoldLoad,
                                const SDLoc &dl, MVT VT, SDNode *Node,
                                SDValue &InFlag);
    bool tryOptimizeRem8Extend(SDNode *N);
    bool onlyUsesZeroFlag(SDValue Flags) const;
    bool hasNoSignFlagUses(SDValue Flags) const;
    bool hasNoCarryFlagUses(SDValue Flags) const;
  };
}
char X86DAGToDAGISel::ID = 0;
INITIALIZE_PASS(X86DAGToDAGISel, DEBUG_TYPE, PASS_NAME, false, false)
// Returns true if this masked compare can be implemented legally with this
// type.
static bool isLegalMaskCompare(SDNode *N, const X86Subtarget *Subtarget) {
  unsigned Opcode = N->getOpcode();
  if (Opcode == X86ISD::CMPM || Opcode == X86ISD::CMPMM ||
      Opcode == X86ISD::STRICT_CMPM || Opcode == ISD::SETCC ||
      Opcode == X86ISD::CMPMM_SAE || Opcode == X86ISD::VFPCLASS) {
    // We can get 256-bit 8 element types here without VLX being enabled. When
    // this happens we will use 512-bit operations and the mask will not be
    // zero extended.
    EVT OpVT = N->getOperand(0).getValueType();
    // The first operand of X86ISD::STRICT_CMPM is chain, so we need to get the
    // second operand.
    if (Opcode == X86ISD::STRICT_CMPM)
      OpVT = N->getOperand(1).getValueType();
    if (OpVT.is256BitVector() || OpVT.is128BitVector())
      return Subtarget->hasVLX();
    return true;
  }
  // Scalar opcodes use 128 bit registers, but aren't subject to the VLX check.
  if (Opcode == X86ISD::VFPCLASSS || Opcode == X86ISD::FSETCCM ||
      Opcode == X86ISD::FSETCCM_SAE)
    return true;
  return false;
}
// Returns true if we can assume the writer of the mask has zero extended it
// for us.
bool X86DAGToDAGISel::isMaskZeroExtended(SDNode *N) const {
  // If this is an AND, check if we have a compare on either side. As long as
  // one side guarantees the mask is zero extended, the AND will preserve those
  // zeros.
  if (N->getOpcode() == ISD::AND)
    return isLegalMaskCompare(N->getOperand(0).getNode(), Subtarget) ||
           isLegalMaskCompare(N->getOperand(1).getNode(), Subtarget);
  return isLegalMaskCompare(N, Subtarget);
}
bool
X86DAGToDAGISel::IsProfitableToFold(SDValue N, SDNode *U, SDNode *Root) const {
  if (OptLevel == CodeGenOpt::None) return false;
  if (!N.hasOneUse())
    return false;
  if (N.getOpcode() != ISD::LOAD)
    return true;
  // Don't fold non-temporal loads if we have an instruction for them.
  if (useNonTemporalLoad(cast<LoadSDNode>(N)))
    return false;
  // If N is a load, do additional profitability checks.
  if (U == Root) {
    switch (U->getOpcode()) {
    default: break;
    case X86ISD::ADD:
    case X86ISD::ADC:
    case X86ISD::SUB:
    case X86ISD::SBB:
    case X86ISD::AND:
    case X86ISD::XOR:
    case X86ISD::OR:
    case ISD::ADD:
    case ISD::ADDCARRY:
    case ISD::AND:
    case ISD::OR:
    case ISD::XOR: {
      SDValue Op1 = U->getOperand(1);
      // If the other operand is an 8-bit immediate we should fold the
      // immediate instead. This reduces code size.
      // e.g.
      // movl 4(%esp), %eax
      // addl $4, %eax
      // vs.
      // movl $4, %eax
      // addl 4(%esp), %eax
      // The former is 2 bytes shorter. If the increment is 1, the saving can
      // be 4 bytes (by using incl %eax).
      if (auto *Imm = dyn_cast<ConstantSDNode>(Op1)) {
        if (Imm->getAPIntValue().isSignedIntN(8))
          return false;
        // If this is a 64-bit AND with an immediate that fits in 32-bits,
        // prefer using the smaller and over folding the load. This is needed to
        // make sure immediates created by shrinkAndImmediate are always folded.
        // Ideally we would narrow the load during DAG combine and get the
        // best of both worlds.
        if (U->getOpcode() == ISD::AND &&
            Imm->getAPIntValue().getBitWidth() == 64 &&
            Imm->getAPIntValue().isIntN(32))
          return false;
        // If this is really a zext_inreg that can be represented with a movzx
        // instruction, prefer that.
        // TODO: We could shrink the load and fold if it is non-volatile.
        if (U->getOpcode() == ISD::AND &&
            (Imm->getAPIntValue() == UINT8_MAX ||
             Imm->getAPIntValue() == UINT16_MAX ||
             Imm->getAPIntValue() == UINT32_MAX))
          return false;
        // ADD/SUB can negate the immediate and use the opposite operation
        // to fit 128 into a sign-extended 8-bit immediate.
        if ((U->getOpcode() == ISD::ADD || U->getOpcode() == ISD::SUB) &&
            (-Imm->getAPIntValue()).isSignedIntN(8))
          return false;
        if ((U->getOpcode() == X86ISD::ADD || U->getOpcode() == X86ISD::SUB) &&
            (-Imm->getAPIntValue()).isSignedIntN(8) &&
            hasNoCarryFlagUses(SDValue(U, 1)))
          return false;
      }
      // If the other operand is a TLS address, we should fold it instead.
      // This produces
      // movl %gs:0, %eax
      // leal i@NTPOFF(%eax), %eax
      // instead of
      // movl $i@NTPOFF, %eax
      // addl %gs:0, %eax
      // if the block also has an access to a second TLS address this will save
      // a load.
      // FIXME: This is probably also true for non-TLS addresses.
      if (Op1.getOpcode() == X86ISD::Wrapper) {
        SDValue Val = Op1.getOperand(0);
        if (Val.getOpcode() == ISD::TargetGlobalTLSAddress)
          return false;
      }
      // Don't fold load if this matches the BTS/BTR/BTC patterns.
      // BTS: (or X, (shl 1, n))
      // BTR: (and X, (rotl -2, n))
      // BTC: (xor X, (shl 1, n))
      if (U->getOpcode() == ISD::OR || U->getOpcode() == ISD::XOR) {
        if (U->getOperand(0).getOpcode() == ISD::SHL &&
            isOneConstant(U->getOperand(0).getOperand(0)))
          return false;
        if (U->getOperand(1).getOpcode() == ISD::SHL &&
            isOneConstant(U->getOperand(1).getOperand(0)))
          return false;
      }
      if (U->getOpcode() == ISD::AND) {
        SDValue U0 = U->getOperand(0);
        SDValue U1 = U->getOperand(1);
        if (U0.getOpcode() == ISD::ROTL) {
          auto *C = dyn_cast<ConstantSDNode>(U0.getOperand(0));
          if (C && C->getSExtValue() == -2)
            return false;
        }
        if (U1.getOpcode() == ISD::ROTL) {
          auto *C = dyn_cast<ConstantSDNode>(U1.getOperand(0));
          if (C && C->getSExtValue() == -2)
            return false;
        }
      }
      break;
    }
    case ISD::SHL:
    case ISD::SRA:
    case ISD::SRL:
      // Don't fold a load into a shift by immediate. The BMI2 instructions
      // support folding a load, but not an immediate. The legacy instructions
      // support folding an immediate, but can't fold a load. Folding an
      // immediate is preferable to folding a load.
      if (isa<ConstantSDNode>(U->getOperand(1)))
        return false;
      break;
    }
  }
  // Prevent folding a load if this can be implemented with an insert_subreg or
  // a move that implicitly zeroes.
  if (Root->getOpcode() == ISD::INSERT_SUBVECTOR &&
      isNullConstant(Root->getOperand(2)) &&
      (Root->getOperand(0).isUndef() ||
       ISD::isBuildVectorAllZeros(Root->getOperand(0).getNode())))
    return false;
  return true;
}
// Indicates it is profitable to form an AVX512 masked operation. Returning
// false will favor a register-register masked move (or vblendm) and the
// operation will be selected separately.
bool X86DAGToDAGISel::isProfitableToFormMaskedOp(SDNode *N) const {
  assert(
      (N->getOpcode() == ISD::VSELECT || N->getOpcode() == X86ISD::SELECTS) &&
      "Unexpected opcode!");
  // If the operation has additional users, the operation will be duplicated.
  // Check the use count to prevent that.
  // FIXME: Are there cheap opcodes we might want to duplicate?
  return N->getOperand(1).hasOneUse();
}
/// Replace the original chain operand of the call with the load's chain
/// operand and move the load below the call's chain operand.
static void moveBelowOrigChain(SelectionDAG *CurDAG, SDValue Load,
                               SDValue Call, SDValue OrigChain) {
  SmallVector<SDValue, 8> Ops;
  SDValue Chain = OrigChain.getOperand(0);
  if (Chain.getNode() == Load.getNode())
    Ops.push_back(Load.getOperand(0));
  else {
    assert(Chain.getOpcode() == ISD::TokenFactor &&
           "Unexpected chain operand");
    for (unsigned i = 0, e = Chain.getNumOperands(); i != e; ++i)
      if (Chain.getOperand(i).getNode() == Load.getNode())
        Ops.push_back(Load.getOperand(0));
      else
        Ops.push_back(Chain.getOperand(i));
    SDValue NewChain =
        CurDAG->getNode(ISD::TokenFactor, SDLoc(Load), MVT::Other, Ops);
    Ops.clear();
    Ops.push_back(NewChain);
  }
  Ops.append(OrigChain->op_begin() + 1, OrigChain->op_end());
  CurDAG->UpdateNodeOperands(OrigChain.getNode(), Ops);
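  // Re-chain the load onto the call's incoming chain, then make the call's
  // chain operand be the load's output chain.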
  CurDAG->UpdateNodeOperands(Load.getNode(), Call.getOperand(0),
                             Load.getOperand(1), Load.getOperand(2));
  Ops.clear();
  Ops.push_back(SDValue(Load.getNode(), 1));
  Ops.append(Call->op_begin() + 1, Call->op_end());
  CurDAG->UpdateNodeOperands(Call.getNode(), Ops);
}
/// Return true if the call address is a load and it can be
/// moved below CALLSEQ_START and the chains leading up to the call.
/// Return the CALLSEQ_START by reference as a second output.
/// In the case of a tail call, there isn't a callseq node between the call
/// chain and the load.
static bool isCalleeLoad(SDValue Callee, SDValue &Chain, bool HasCallSeq) {
  // The transformation is somewhat dangerous if the call's chain was glued to
  // the call. After MoveBelowOrigChain the load is moved between the call and
  // the chain, this can create a cycle if the load is not folded. So it is
  // *really* important that we are sure the load will be folded.
  if (Callee.getNode() == Chain.getNode() || !Callee.hasOneUse())
    return false;
  auto *LD = dyn_cast<LoadSDNode>(Callee.getNode());
  if (!LD ||
      !LD->isSimple() ||
      LD->getAddressingMode() != ISD::UNINDEXED ||
      LD->getExtensionType() != ISD::NON_EXTLOAD)
    return false;
  // Now let's find the callseq_start.
  while (HasCallSeq && Chain.getOpcode() != ISD::CALLSEQ_START) {
    if (!Chain.hasOneUse())
      return false;
    Chain = Chain.getOperand(0);
  }
  if (!Chain.getNumOperands())
    return false;
  // Since we are not checking for AA here, conservatively abort if the chain
  // writes to memory. It's not safe to move the callee (a load) across a store.
  if (isa<MemSDNode>(Chain.getNode()) &&
      cast<MemSDNode>(Chain.getNode())->writeMem())
    return false;
  if (Chain.getOperand(0).getNode() == Callee.getNode())
    return true;
  if (Chain.getOperand(0).getOpcode() == ISD::TokenFactor &&
      Callee.getValue(1).isOperandOf(Chain.getOperand(0).getNode()) &&
      Callee.getValue(1).hasOneUse())
    return true;
  return false;
}
static bool isEndbrImm64(uint64_t Imm) {
  // There may be some other prefix bytes between 0xF3 and 0x0F1EFA.
  // e.g.: 0xF3660F1EFA, 0xF3670F1EFA
  if ((Imm & 0x00FFFFFF) != 0x0F1EFA)
    return false;
  uint8_t OptionalPrefixBytes [] = {0x26, 0x2e, 0x36, 0x3e, 0x64,
                                    0x65, 0x66, 0x67, 0xf0, 0xf2};
  int i = 24; // The low 24 bits (0x0F1EFA) have already matched.
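  // Scan the remaining bytes: an 0xF3 byte completes an ENDBR64 encoding,
  // while anything other than an optional legacy prefix means no match.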
  while (i < 64) {
    uint8_t Byte = (Imm >> i) & 0xFF;
    if (Byte == 0xF3)
      return true;
    if (!llvm::is_contained(OptionalPrefixBytes, Byte))
      return false;
    i += 8;
  }
  return false;
}
void X86DAGToDAGISel::PreprocessISelDAG() {
  bool MadeChange = false;
  for (SelectionDAG::allnodes_iterator I = CurDAG->allnodes_begin(),
       E = CurDAG->allnodes_end(); I != E; ) {
    SDNode *N = &*I++; // Preincrement iterator to avoid invalidation issues.
    // This is for CET enhancement.
    //
    // ENDBR32 and ENDBR64 have specific opcodes:
    // ENDBR32: F3 0F 1E FB
    // ENDBR64: F3 0F 1E FA
772. // And we want to avoid attackers finding unintended ENDBR32/64
773. // opcode matches in the binary.
774. // Here's an example:
  775. // If the compiler had to generate asm for the following code:
  776. // a = 0xF30F1EFA
  777. // it could, for example, generate:
  778. // mov 0xF30F1EFA, dword ptr[a]
  779. // In such a case, the binary would include a gadget that starts
780. // with a fake ENDBR64 opcode. Therefore, we split such generation
781. // into multiple operations so that it does not show up in the binary.
  782. if (N->getOpcode() == ISD::Constant) {
  783. MVT VT = N->getSimpleValueType(0);
  784. int64_t Imm = cast<ConstantSDNode>(N)->getSExtValue();
  785. int32_t EndbrImm = Subtarget->is64Bit() ? 0xF30F1EFA : 0xF30F1EFB;
  786. if (Imm == EndbrImm || isEndbrImm64(Imm)) {
  787. // Check that the cf-protection-branch is enabled.
  788. Metadata *CFProtectionBranch =
  789. MF->getMMI().getModule()->getModuleFlag("cf-protection-branch");
  790. if (CFProtectionBranch || IndirectBranchTracking) {
  791. SDLoc dl(N);
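// Materialize the bitwise complement as an opaque constant (so it will not be
// re-folded) and recover the original value with a NOT at run time; the raw
// ENDBR immediate therefore never appears literally in the emitted code.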
  792. SDValue Complement = CurDAG->getConstant(~Imm, dl, VT, false, true);
  793. Complement = CurDAG->getNOT(dl, Complement, VT);
  794. --I;
  795. CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Complement);
  796. ++I;
  797. MadeChange = true;
  798. continue;
  799. }
  800. }
  801. }
  802. // If this is a target specific AND node with no flag usages, turn it back
  803. // into ISD::AND to enable test instruction matching.
  804. if (N->getOpcode() == X86ISD::AND && !N->hasAnyUseOfValue(1)) {
  805. SDValue Res = CurDAG->getNode(ISD::AND, SDLoc(N), N->getValueType(0),
  806. N->getOperand(0), N->getOperand(1));
  807. --I;
  808. CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Res);
  809. ++I;
  810. MadeChange = true;
  811. continue;
  812. }
  813. // Convert vector increment or decrement to sub/add with an all-ones
  814. // constant:
  815. // add X, <1, 1...> --> sub X, <-1, -1...>
  816. // sub X, <1, 1...> --> add X, <-1, -1...>
  817. // The all-ones vector constant can be materialized using a pcmpeq
  818. // instruction that is commonly recognized as an idiom (has no register
  819. // dependency), so that's better/smaller than loading a splat 1 constant.
  820. //
  821. // But don't do this if it would inhibit a potentially profitable load
  822. // folding opportunity for the other operand. That only occurs with the
  823. // intersection of:
  824. // (1) The other operand (op0) is load foldable.
  825. // (2) The op is an add (otherwise, we are *creating* an add and can still
  826. // load fold the other op).
  827. // (3) The target has AVX (otherwise, we have a destructive add and can't
  828. // load fold the other op without killing the constant op).
  829. // (4) The constant 1 vector has multiple uses (so it is profitable to load
  830. // into a register anyway).
  831. auto mayPreventLoadFold = [&]() {
  832. return X86::mayFoldLoad(N->getOperand(0), *Subtarget) &&
  833. N->getOpcode() == ISD::ADD && Subtarget->hasAVX() &&
  834. !N->getOperand(1).hasOneUse();
  835. };
  836. if ((N->getOpcode() == ISD::ADD || N->getOpcode() == ISD::SUB) &&
  837. N->getSimpleValueType(0).isVector() && !mayPreventLoadFold()) {
  838. APInt SplatVal;
  839. if (X86::isConstantSplat(N->getOperand(1), SplatVal) &&
  840. SplatVal.isOne()) {
  841. SDLoc DL(N);
  842. MVT VT = N->getSimpleValueType(0);
  843. unsigned NumElts = VT.getSizeInBits() / 32;
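// Build the all-ones constant as a vXi32 splat so it matches the PCMPEQD
// idiom, then bitcast it back to the original vector type.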
  844. SDValue AllOnes =
  845. CurDAG->getAllOnesConstant(DL, MVT::getVectorVT(MVT::i32, NumElts));
  846. AllOnes = CurDAG->getBitcast(VT, AllOnes);
  847. unsigned NewOpcode = N->getOpcode() == ISD::ADD ? ISD::SUB : ISD::ADD;
  848. SDValue Res =
  849. CurDAG->getNode(NewOpcode, DL, VT, N->getOperand(0), AllOnes);
  850. --I;
  851. CurDAG->ReplaceAllUsesWith(N, Res.getNode());
  852. ++I;
  853. MadeChange = true;
  854. continue;
  855. }
  856. }
  857. switch (N->getOpcode()) {
  858. case X86ISD::VBROADCAST: {
  859. MVT VT = N->getSimpleValueType(0);
  860. // Emulate v32i16/v64i8 broadcast without BWI.
  861. if (!Subtarget->hasBWI() && (VT == MVT::v32i16 || VT == MVT::v64i8)) {
  862. MVT NarrowVT = VT == MVT::v32i16 ? MVT::v16i16 : MVT::v32i8;
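// Broadcast at half width, then insert that result into both the low and
// high halves of the wide vector.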
  863. SDLoc dl(N);
  864. SDValue NarrowBCast =
  865. CurDAG->getNode(X86ISD::VBROADCAST, dl, NarrowVT, N->getOperand(0));
  866. SDValue Res =
  867. CurDAG->getNode(ISD::INSERT_SUBVECTOR, dl, VT, CurDAG->getUNDEF(VT),
  868. NarrowBCast, CurDAG->getIntPtrConstant(0, dl));
  869. unsigned Index = VT == MVT::v32i16 ? 16 : 32;
  870. Res = CurDAG->getNode(ISD::INSERT_SUBVECTOR, dl, VT, Res, NarrowBCast,
  871. CurDAG->getIntPtrConstant(Index, dl));
  872. --I;
  873. CurDAG->ReplaceAllUsesWith(N, Res.getNode());
  874. ++I;
  875. MadeChange = true;
  876. continue;
  877. }
  878. break;
  879. }
  880. case X86ISD::VBROADCAST_LOAD: {
  881. MVT VT = N->getSimpleValueType(0);
  882. // Emulate v32i16/v64i8 broadcast without BWI.
  883. if (!Subtarget->hasBWI() && (VT == MVT::v32i16 || VT == MVT::v64i8)) {
  884. MVT NarrowVT = VT == MVT::v32i16 ? MVT::v16i16 : MVT::v32i8;
  885. auto *MemNode = cast<MemSDNode>(N);
  886. SDLoc dl(N);
  887. SDVTList VTs = CurDAG->getVTList(NarrowVT, MVT::Other);
  888. SDValue Ops[] = {MemNode->getChain(), MemNode->getBasePtr()};
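// Recreate the broadcast-load at half width, reusing the original chain,
// base pointer, and memory operand; the wide result is then assembled as in
// the non-load case above.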
  889. SDValue NarrowBCast = CurDAG->getMemIntrinsicNode(
  890. X86ISD::VBROADCAST_LOAD, dl, VTs, Ops, MemNode->getMemoryVT(),
  891. MemNode->getMemOperand());
  892. SDValue Res =
  893. CurDAG->getNode(ISD::INSERT_SUBVECTOR, dl, VT, CurDAG->getUNDEF(VT),
  894. NarrowBCast, CurDAG->getIntPtrConstant(0, dl));
  895. unsigned Index = VT == MVT::v32i16 ? 16 : 32;
  896. Res = CurDAG->getNode(ISD::INSERT_SUBVECTOR, dl, VT, Res, NarrowBCast,
  897. CurDAG->getIntPtrConstant(Index, dl));
  898. --I;
  899. SDValue To[] = {Res, NarrowBCast.getValue(1)};
  900. CurDAG->ReplaceAllUsesWith(N, To);
  901. ++I;
  902. MadeChange = true;
  903. continue;
  904. }
  905. break;
  906. }
  907. case ISD::VSELECT: {
908. // Replace VSELECT with non-mask conditions with BLENDV.
  909. if (N->getOperand(0).getValueType().getVectorElementType() == MVT::i1)
  910. break;
  911. assert(Subtarget->hasSSE41() && "Expected SSE4.1 support!");
  912. SDValue Blendv =
  913. CurDAG->getNode(X86ISD::BLENDV, SDLoc(N), N->getValueType(0),
  914. N->getOperand(0), N->getOperand(1), N->getOperand(2));
  915. --I;
  916. CurDAG->ReplaceAllUsesWith(N, Blendv.getNode());
  917. ++I;
  918. MadeChange = true;
  919. continue;
  920. }
  921. case ISD::FP_ROUND:
  922. case ISD::STRICT_FP_ROUND:
  923. case ISD::FP_TO_SINT:
  924. case ISD::FP_TO_UINT:
  925. case ISD::STRICT_FP_TO_SINT:
  926. case ISD::STRICT_FP_TO_UINT: {
  927. // Replace vector fp_to_s/uint with their X86 specific equivalent so we
  928. // don't need 2 sets of patterns.
  929. if (!N->getSimpleValueType(0).isVector())
  930. break;
  931. unsigned NewOpc;
  932. switch (N->getOpcode()) {
  933. default: llvm_unreachable("Unexpected opcode!");
  934. case ISD::FP_ROUND: NewOpc = X86ISD::VFPROUND; break;
  935. case ISD::STRICT_FP_ROUND: NewOpc = X86ISD::STRICT_VFPROUND; break;
  936. case ISD::STRICT_FP_TO_SINT: NewOpc = X86ISD::STRICT_CVTTP2SI; break;
  937. case ISD::FP_TO_SINT: NewOpc = X86ISD::CVTTP2SI; break;
  938. case ISD::STRICT_FP_TO_UINT: NewOpc = X86ISD::STRICT_CVTTP2UI; break;
  939. case ISD::FP_TO_UINT: NewOpc = X86ISD::CVTTP2UI; break;
  940. }
  941. SDValue Res;
  942. if (N->isStrictFPOpcode())
  943. Res =
  944. CurDAG->getNode(NewOpc, SDLoc(N), {N->getValueType(0), MVT::Other},
  945. {N->getOperand(0), N->getOperand(1)});
  946. else
  947. Res =
  948. CurDAG->getNode(NewOpc, SDLoc(N), N->getValueType(0),
  949. N->getOperand(0));
  950. --I;
  951. CurDAG->ReplaceAllUsesWith(N, Res.getNode());
  952. ++I;
  953. MadeChange = true;
  954. continue;
  955. }
  956. case ISD::SHL:
  957. case ISD::SRA:
  958. case ISD::SRL: {
  959. // Replace vector shifts with their X86 specific equivalent so we don't
  960. // need 2 sets of patterns.
  961. if (!N->getValueType(0).isVector())
  962. break;
  963. unsigned NewOpc;
  964. switch (N->getOpcode()) {
  965. default: llvm_unreachable("Unexpected opcode!");
  966. case ISD::SHL: NewOpc = X86ISD::VSHLV; break;
  967. case ISD::SRA: NewOpc = X86ISD::VSRAV; break;
  968. case ISD::SRL: NewOpc = X86ISD::VSRLV; break;
  969. }
  970. SDValue Res = CurDAG->getNode(NewOpc, SDLoc(N), N->getValueType(0),
  971. N->getOperand(0), N->getOperand(1));
  972. --I;
  973. CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Res);
  974. ++I;
  975. MadeChange = true;
  976. continue;
  977. }
  978. case ISD::ANY_EXTEND:
  979. case ISD::ANY_EXTEND_VECTOR_INREG: {
  980. // Replace vector any extend with the zero extend equivalents so we don't
  981. // need 2 sets of patterns. Ignore vXi1 extensions.
  982. if (!N->getValueType(0).isVector())
  983. break;
  984. unsigned NewOpc;
  985. if (N->getOperand(0).getScalarValueSizeInBits() == 1) {
  986. assert(N->getOpcode() == ISD::ANY_EXTEND &&
  987. "Unexpected opcode for mask vector!");
  988. NewOpc = ISD::SIGN_EXTEND;
  989. } else {
  990. NewOpc = N->getOpcode() == ISD::ANY_EXTEND
  991. ? ISD::ZERO_EXTEND
  992. : ISD::ZERO_EXTEND_VECTOR_INREG;
  993. }
  994. SDValue Res = CurDAG->getNode(NewOpc, SDLoc(N), N->getValueType(0),
  995. N->getOperand(0));
  996. --I;
  997. CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Res);
  998. ++I;
  999. MadeChange = true;
  1000. continue;
  1001. }
  1002. case ISD::FCEIL:
  1003. case ISD::STRICT_FCEIL:
  1004. case ISD::FFLOOR:
  1005. case ISD::STRICT_FFLOOR:
  1006. case ISD::FTRUNC:
  1007. case ISD::STRICT_FTRUNC:
  1008. case ISD::FROUNDEVEN:
  1009. case ISD::STRICT_FROUNDEVEN:
  1010. case ISD::FNEARBYINT:
  1011. case ISD::STRICT_FNEARBYINT:
  1012. case ISD::FRINT:
  1013. case ISD::STRICT_FRINT: {
1014. // Replace FP rounding ops with their X86-specific equivalents so we don't
1015. // need 2 sets of patterns.
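// The immediate is the (V)ROUND*/VRNDSCALE control byte: bits 1:0 give the
// rounding mode, bit 2 (0x4) says to use the MXCSR rounding mode instead, and
// bit 3 (0x8) suppresses the precision (inexact) exception.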
  1016. unsigned Imm;
  1017. switch (N->getOpcode()) {
  1018. default: llvm_unreachable("Unexpected opcode!");
  1019. case ISD::STRICT_FCEIL:
  1020. case ISD::FCEIL: Imm = 0xA; break;
  1021. case ISD::STRICT_FFLOOR:
  1022. case ISD::FFLOOR: Imm = 0x9; break;
  1023. case ISD::STRICT_FTRUNC:
  1024. case ISD::FTRUNC: Imm = 0xB; break;
  1025. case ISD::STRICT_FROUNDEVEN:
  1026. case ISD::FROUNDEVEN: Imm = 0x8; break;
  1027. case ISD::STRICT_FNEARBYINT:
  1028. case ISD::FNEARBYINT: Imm = 0xC; break;
  1029. case ISD::STRICT_FRINT:
  1030. case ISD::FRINT: Imm = 0x4; break;
  1031. }
  1032. SDLoc dl(N);
  1033. bool IsStrict = N->isStrictFPOpcode();
  1034. SDValue Res;
  1035. if (IsStrict)
  1036. Res = CurDAG->getNode(X86ISD::STRICT_VRNDSCALE, dl,
  1037. {N->getValueType(0), MVT::Other},
  1038. {N->getOperand(0), N->getOperand(1),
  1039. CurDAG->getTargetConstant(Imm, dl, MVT::i32)});
  1040. else
  1041. Res = CurDAG->getNode(X86ISD::VRNDSCALE, dl, N->getValueType(0),
  1042. N->getOperand(0),
  1043. CurDAG->getTargetConstant(Imm, dl, MVT::i32));
  1044. --I;
  1045. CurDAG->ReplaceAllUsesWith(N, Res.getNode());
  1046. ++I;
  1047. MadeChange = true;
  1048. continue;
  1049. }
  1050. case X86ISD::FANDN:
  1051. case X86ISD::FAND:
  1052. case X86ISD::FOR:
  1053. case X86ISD::FXOR: {
  1054. // Widen scalar fp logic ops to vector to reduce isel patterns.
1055. // FIXME: Can we do this during lowering/combine?
  1056. MVT VT = N->getSimpleValueType(0);
  1057. if (VT.isVector() || VT == MVT::f128)
  1058. break;
  1059. MVT VecVT = VT == MVT::f64 ? MVT::v2f64
  1060. : VT == MVT::f32 ? MVT::v4f32
  1061. : MVT::v8f16;
  1062. SDLoc dl(N);
  1063. SDValue Op0 = CurDAG->getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT,
  1064. N->getOperand(0));
  1065. SDValue Op1 = CurDAG->getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT,
  1066. N->getOperand(1));
  1067. SDValue Res;
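// With SSE2, prefer the integer logic ops (PAND/PANDN/POR/PXOR) on a bitcast
// of the vector; without it, keep the FP form of the operation.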
  1068. if (Subtarget->hasSSE2()) {
  1069. EVT IntVT = EVT(VecVT).changeVectorElementTypeToInteger();
  1070. Op0 = CurDAG->getNode(ISD::BITCAST, dl, IntVT, Op0);
  1071. Op1 = CurDAG->getNode(ISD::BITCAST, dl, IntVT, Op1);
  1072. unsigned Opc;
  1073. switch (N->getOpcode()) {
  1074. default: llvm_unreachable("Unexpected opcode!");
  1075. case X86ISD::FANDN: Opc = X86ISD::ANDNP; break;
  1076. case X86ISD::FAND: Opc = ISD::AND; break;
  1077. case X86ISD::FOR: Opc = ISD::OR; break;
  1078. case X86ISD::FXOR: Opc = ISD::XOR; break;
  1079. }
  1080. Res = CurDAG->getNode(Opc, dl, IntVT, Op0, Op1);
  1081. Res = CurDAG->getNode(ISD::BITCAST, dl, VecVT, Res);
  1082. } else {
  1083. Res = CurDAG->getNode(N->getOpcode(), dl, VecVT, Op0, Op1);
  1084. }
  1085. Res = CurDAG->getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Res,
  1086. CurDAG->getIntPtrConstant(0, dl));
  1087. --I;
  1088. CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Res);
  1089. ++I;
  1090. MadeChange = true;
  1091. continue;
  1092. }
  1093. }
  1094. if (OptLevel != CodeGenOpt::None &&
  1095. // Only do this when the target can fold the load into the call or
  1096. // jmp.
  1097. !Subtarget->useIndirectThunkCalls() &&
  1098. ((N->getOpcode() == X86ISD::CALL && !Subtarget->slowTwoMemOps()) ||
  1099. (N->getOpcode() == X86ISD::TC_RETURN &&
  1100. (Subtarget->is64Bit() ||
  1101. !getTargetMachine().isPositionIndependent())))) {
  1102. /// Also try moving call address load from outside callseq_start to just
  1103. /// before the call to allow it to be folded.
  1104. ///
  1105. /// [Load chain]
  1106. /// ^
  1107. /// |
  1108. /// [Load]
  1109. /// ^ ^
  1110. /// | |
  1111. /// / \--
  1112. /// / |
  1113. ///[CALLSEQ_START] |
  1114. /// ^ |
  1115. /// | |
  1116. /// [LOAD/C2Reg] |
  1117. /// | |
  1118. /// \ /
  1119. /// \ /
  1120. /// [CALL]
  1121. bool HasCallSeq = N->getOpcode() == X86ISD::CALL;
  1122. SDValue Chain = N->getOperand(0);
  1123. SDValue Load = N->getOperand(1);
  1124. if (!isCalleeLoad(Load, Chain, HasCallSeq))
  1125. continue;
  1126. moveBelowOrigChain(CurDAG, Load, SDValue(N, 0), Chain);
  1127. ++NumLoadMoved;
  1128. MadeChange = true;
  1129. continue;
  1130. }
  1131. // Lower fpround and fpextend nodes that target the FP stack to be store and
  1132. // load to the stack. This is a gross hack. We would like to simply mark
  1133. // these as being illegal, but when we do that, legalize produces these when
  1134. // it expands calls, then expands these in the same legalize pass. We would
  1135. // like dag combine to be able to hack on these between the call expansion
  1136. // and the node legalization. As such this pass basically does "really
  1137. // late" legalization of these inline with the X86 isel pass.
  1138. // FIXME: This should only happen when not compiled with -O0.
  1139. switch (N->getOpcode()) {
  1140. default: continue;
  1141. case ISD::FP_ROUND:
  1142. case ISD::FP_EXTEND:
  1143. {
  1144. MVT SrcVT = N->getOperand(0).getSimpleValueType();
  1145. MVT DstVT = N->getSimpleValueType(0);
  1146. // If any of the sources are vectors, no fp stack involved.
  1147. if (SrcVT.isVector() || DstVT.isVector())
  1148. continue;
  1149. // If the source and destination are SSE registers, then this is a legal
  1150. // conversion that should not be lowered.
  1151. const X86TargetLowering *X86Lowering =
  1152. static_cast<const X86TargetLowering *>(TLI);
  1153. bool SrcIsSSE = X86Lowering->isScalarFPTypeInSSEReg(SrcVT);
  1154. bool DstIsSSE = X86Lowering->isScalarFPTypeInSSEReg(DstVT);
  1155. if (SrcIsSSE && DstIsSSE)
  1156. continue;
  1157. if (!SrcIsSSE && !DstIsSSE) {
  1158. // If this is an FPStack extension, it is a noop.
  1159. if (N->getOpcode() == ISD::FP_EXTEND)
  1160. continue;
  1161. // If this is a value-preserving FPStack truncation, it is a noop.
  1162. if (N->getConstantOperandVal(1))
  1163. continue;
  1164. }
  1165. // Here we could have an FP stack truncation or an FPStack <-> SSE convert.
  1166. // FPStack has extload and truncstore. SSE can fold direct loads into other
  1167. // operations. Based on this, decide what we want to do.
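// Spill through the stack using the narrower of the two types: the
// destination type for FP_ROUND, the source type for FP_EXTEND.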
  1168. MVT MemVT = (N->getOpcode() == ISD::FP_ROUND) ? DstVT : SrcVT;
  1169. SDValue MemTmp = CurDAG->CreateStackTemporary(MemVT);
  1170. int SPFI = cast<FrameIndexSDNode>(MemTmp)->getIndex();
  1171. MachinePointerInfo MPI =
  1172. MachinePointerInfo::getFixedStack(CurDAG->getMachineFunction(), SPFI);
  1173. SDLoc dl(N);
  1174. // FIXME: optimize the case where the src/dest is a load or store?
  1175. SDValue Store = CurDAG->getTruncStore(
  1176. CurDAG->getEntryNode(), dl, N->getOperand(0), MemTmp, MPI, MemVT);
  1177. SDValue Result = CurDAG->getExtLoad(ISD::EXTLOAD, dl, DstVT, Store,
  1178. MemTmp, MPI, MemVT);
  1179. // We're about to replace all uses of the FP_ROUND/FP_EXTEND with the
1180. // extload we created. This will cause general havoc on the DAG because
  1181. // anything below the conversion could be folded into other existing nodes.
  1182. // To avoid invalidating 'I', back it up to the convert node.
  1183. --I;
  1184. CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Result);
  1185. break;
  1186. }
1187. // The sequence of events for lowering STRICT_FP versions of these nodes requires
1188. // dealing with the chain differently, as there is already a preexisting chain.
  1189. case ISD::STRICT_FP_ROUND:
  1190. case ISD::STRICT_FP_EXTEND:
  1191. {
  1192. MVT SrcVT = N->getOperand(1).getSimpleValueType();
  1193. MVT DstVT = N->getSimpleValueType(0);
  1194. // If any of the sources are vectors, no fp stack involved.
  1195. if (SrcVT.isVector() || DstVT.isVector())
  1196. continue;
  1197. // If the source and destination are SSE registers, then this is a legal
  1198. // conversion that should not be lowered.
  1199. const X86TargetLowering *X86Lowering =
  1200. static_cast<const X86TargetLowering *>(TLI);
  1201. bool SrcIsSSE = X86Lowering->isScalarFPTypeInSSEReg(SrcVT);
  1202. bool DstIsSSE = X86Lowering->isScalarFPTypeInSSEReg(DstVT);
  1203. if (SrcIsSSE && DstIsSSE)
  1204. continue;
  1205. if (!SrcIsSSE && !DstIsSSE) {
  1206. // If this is an FPStack extension, it is a noop.
  1207. if (N->getOpcode() == ISD::STRICT_FP_EXTEND)
  1208. continue;
  1209. // If this is a value-preserving FPStack truncation, it is a noop.
  1210. if (N->getConstantOperandVal(2))
  1211. continue;
  1212. }
  1213. // Here we could have an FP stack truncation or an FPStack <-> SSE convert.
  1214. // FPStack has extload and truncstore. SSE can fold direct loads into other
  1215. // operations. Based on this, decide what we want to do.
  1216. MVT MemVT = (N->getOpcode() == ISD::STRICT_FP_ROUND) ? DstVT : SrcVT;
  1217. SDValue MemTmp = CurDAG->CreateStackTemporary(MemVT);
  1218. int SPFI = cast<FrameIndexSDNode>(MemTmp)->getIndex();
  1219. MachinePointerInfo MPI =
  1220. MachinePointerInfo::getFixedStack(CurDAG->getMachineFunction(), SPFI);
  1221. SDLoc dl(N);
  1222. // FIXME: optimize the case where the src/dest is a load or store?
1223. // Since the operation is StrictFP, use the preexisting chain.
  1224. SDValue Store, Result;
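// On the x87 side, use the X86ISD::FST / X86ISD::FLD memory intrinsics so the
// strict chain is threaded through the store and reload, and propagate the
// NoFPExcept flag when the original node carries it.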
  1225. if (!SrcIsSSE) {
  1226. SDVTList VTs = CurDAG->getVTList(MVT::Other);
  1227. SDValue Ops[] = {N->getOperand(0), N->getOperand(1), MemTmp};
  1228. Store = CurDAG->getMemIntrinsicNode(X86ISD::FST, dl, VTs, Ops, MemVT,
  1229. MPI, /*Align*/ std::nullopt,
  1230. MachineMemOperand::MOStore);
  1231. if (N->getFlags().hasNoFPExcept()) {
  1232. SDNodeFlags Flags = Store->getFlags();
  1233. Flags.setNoFPExcept(true);
  1234. Store->setFlags(Flags);
  1235. }
  1236. } else {
  1237. assert(SrcVT == MemVT && "Unexpected VT!");
  1238. Store = CurDAG->getStore(N->getOperand(0), dl, N->getOperand(1), MemTmp,
  1239. MPI);
  1240. }
  1241. if (!DstIsSSE) {
  1242. SDVTList VTs = CurDAG->getVTList(DstVT, MVT::Other);
  1243. SDValue Ops[] = {Store, MemTmp};
  1244. Result = CurDAG->getMemIntrinsicNode(
  1245. X86ISD::FLD, dl, VTs, Ops, MemVT, MPI,
  1246. /*Align*/ std::nullopt, MachineMemOperand::MOLoad);
  1247. if (N->getFlags().hasNoFPExcept()) {
  1248. SDNodeFlags Flags = Result->getFlags();
  1249. Flags.setNoFPExcept(true);
  1250. Result->setFlags(Flags);
  1251. }
  1252. } else {
  1253. assert(DstVT == MemVT && "Unexpected VT!");
  1254. Result = CurDAG->getLoad(DstVT, dl, Store, MemTmp, MPI);
  1255. }
1256. // We're about to replace all uses of the STRICT_FP_ROUND/STRICT_FP_EXTEND with
1257. // the load we created. This will cause general havoc on the DAG because
  1258. // anything below the conversion could be folded into other existing nodes.
  1259. // To avoid invalidating 'I', back it up to the convert node.
  1260. --I;
  1261. CurDAG->ReplaceAllUsesWith(N, Result.getNode());
  1262. break;
  1263. }
  1264. }
  1265. // Now that we did that, the node is dead. Increment the iterator to the
  1266. // next node to process, then delete N.
  1267. ++I;
  1268. MadeChange = true;
  1269. }
  1270. // Remove any dead nodes that may have been left behind.
  1271. if (MadeChange)
  1272. CurDAG->RemoveDeadNodes();
  1273. }
  1274. // Look for a redundant movzx/movsx that can occur after an 8-bit divrem.
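// The 8-bit result has already been extended to 32 bits with a *_NOREX
// movzx/movsx, so re-extending the extracted low byte of that value is
// redundant.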
  1275. bool X86DAGToDAGISel::tryOptimizeRem8Extend(SDNode *N) {
  1276. unsigned Opc = N->getMachineOpcode();
  1277. if (Opc != X86::MOVZX32rr8 && Opc != X86::MOVSX32rr8 &&
  1278. Opc != X86::MOVSX64rr8)
  1279. return false;
  1280. SDValue N0 = N->getOperand(0);
1281. // We need to be extracting the low byte of an extend.
  1282. if (!N0.isMachineOpcode() ||
  1283. N0.getMachineOpcode() != TargetOpcode::EXTRACT_SUBREG ||
  1284. N0.getConstantOperandVal(1) != X86::sub_8bit)
  1285. return false;
  1286. // We're looking for either a movsx or movzx to match the original opcode.
  1287. unsigned ExpectedOpc = Opc == X86::MOVZX32rr8 ? X86::MOVZX32rr8_NOREX
  1288. : X86::MOVSX32rr8_NOREX;
  1289. SDValue N00 = N0.getOperand(0);
  1290. if (!N00.isMachineOpcode() || N00.getMachineOpcode() != ExpectedOpc)
  1291. return false;
  1292. if (Opc == X86::MOVSX64rr8) {
  1293. // If we had a sign extend from 8 to 64 bits. We still need to go from 32
  1294. // to 64.
  1295. MachineSDNode *Extend = CurDAG->getMachineNode(X86::MOVSX64rr32, SDLoc(N),
  1296. MVT::i64, N00);
  1297. ReplaceUses(N, Extend);
  1298. } else {
  1299. // Ok we can drop this extend and just use the original extend.
  1300. ReplaceUses(N, N00.getNode());
  1301. }
  1302. return true;
  1303. }
  1304. void X86DAGToDAGISel::PostprocessISelDAG() {
  1305. // Skip peepholes at -O0.
  1306. if (TM.getOptLevel() == CodeGenOpt::None)
  1307. return;
  1308. SelectionDAG::allnodes_iterator Position = CurDAG->allnodes_end();
  1309. bool MadeChange = false;
  1310. while (Position != CurDAG->allnodes_begin()) {
  1311. SDNode *N = &*--Position;
  1312. // Skip dead nodes and any non-machine opcodes.
  1313. if (N->use_empty() || !N->isMachineOpcode())
  1314. continue;
  1315. if (tryOptimizeRem8Extend(N)) {
  1316. MadeChange = true;
  1317. continue;
  1318. }
  1319. // Look for a TESTrr+ANDrr pattern where both operands of the test are
  1320. // the same. Rewrite to remove the AND.
  1321. unsigned Opc = N->getMachineOpcode();
  1322. if ((Opc == X86::TEST8rr || Opc == X86::TEST16rr ||
  1323. Opc == X86::TEST32rr || Opc == X86::TEST64rr) &&
  1324. N->getOperand(0) == N->getOperand(1) &&
  1325. N->getOperand(0)->hasNUsesOfValue(2, N->getOperand(0).getResNo()) &&
  1326. N->getOperand(0).isMachineOpcode()) {
  1327. SDValue And = N->getOperand(0);
  1328. unsigned N0Opc = And.getMachineOpcode();
  1329. if ((N0Opc == X86::AND8rr || N0Opc == X86::AND16rr ||
  1330. N0Opc == X86::AND32rr || N0Opc == X86::AND64rr) &&
  1331. !And->hasAnyUseOfValue(1)) {
  1332. MachineSDNode *Test = CurDAG->getMachineNode(Opc, SDLoc(N),
  1333. MVT::i32,
  1334. And.getOperand(0),
  1335. And.getOperand(1));
  1336. ReplaceUses(N, Test);
  1337. MadeChange = true;
  1338. continue;
  1339. }
  1340. if ((N0Opc == X86::AND8rm || N0Opc == X86::AND16rm ||
  1341. N0Opc == X86::AND32rm || N0Opc == X86::AND64rm) &&
  1342. !And->hasAnyUseOfValue(1)) {
  1343. unsigned NewOpc;
  1344. switch (N0Opc) {
  1345. case X86::AND8rm: NewOpc = X86::TEST8mr; break;
  1346. case X86::AND16rm: NewOpc = X86::TEST16mr; break;
  1347. case X86::AND32rm: NewOpc = X86::TEST32mr; break;
  1348. case X86::AND64rm: NewOpc = X86::TEST64mr; break;
  1349. }
  1350. // Need to swap the memory and register operand.
  1351. SDValue Ops[] = { And.getOperand(1),
  1352. And.getOperand(2),
  1353. And.getOperand(3),
  1354. And.getOperand(4),
  1355. And.getOperand(5),
  1356. And.getOperand(0),
  1357. And.getOperand(6) /* Chain */ };
  1358. MachineSDNode *Test = CurDAG->getMachineNode(NewOpc, SDLoc(N),
  1359. MVT::i32, MVT::Other, Ops);
  1360. CurDAG->setNodeMemRefs(
  1361. Test, cast<MachineSDNode>(And.getNode())->memoperands());
  1362. ReplaceUses(And.getValue(2), SDValue(Test, 1));
  1363. ReplaceUses(SDValue(N, 0), SDValue(Test, 0));
  1364. MadeChange = true;
  1365. continue;
  1366. }
  1367. }
  1368. // Look for a KAND+KORTEST and turn it into KTEST if only the zero flag is
  1369. // used. We're doing this late so we can prefer to fold the AND into masked
  1370. // comparisons. Doing that can be better for the live range of the mask
  1371. // register.
  1372. if ((Opc == X86::KORTESTBrr || Opc == X86::KORTESTWrr ||
  1373. Opc == X86::KORTESTDrr || Opc == X86::KORTESTQrr) &&
  1374. N->getOperand(0) == N->getOperand(1) &&
  1375. N->isOnlyUserOf(N->getOperand(0).getNode()) &&
  1376. N->getOperand(0).isMachineOpcode() &&
  1377. onlyUsesZeroFlag(SDValue(N, 0))) {
  1378. SDValue And = N->getOperand(0);
  1379. unsigned N0Opc = And.getMachineOpcode();
  1380. // KANDW is legal with AVX512F, but KTESTW requires AVX512DQ. The other
  1381. // KAND instructions and KTEST use the same ISA feature.
  1382. if (N0Opc == X86::KANDBrr ||
  1383. (N0Opc == X86::KANDWrr && Subtarget->hasDQI()) ||
  1384. N0Opc == X86::KANDDrr || N0Opc == X86::KANDQrr) {
  1385. unsigned NewOpc;
  1386. switch (Opc) {
  1387. default: llvm_unreachable("Unexpected opcode!");
  1388. case X86::KORTESTBrr: NewOpc = X86::KTESTBrr; break;
  1389. case X86::KORTESTWrr: NewOpc = X86::KTESTWrr; break;
  1390. case X86::KORTESTDrr: NewOpc = X86::KTESTDrr; break;
  1391. case X86::KORTESTQrr: NewOpc = X86::KTESTQrr; break;
  1392. }
  1393. MachineSDNode *KTest = CurDAG->getMachineNode(NewOpc, SDLoc(N),
  1394. MVT::i32,
  1395. And.getOperand(0),
  1396. And.getOperand(1));
  1397. ReplaceUses(N, KTest);
  1398. MadeChange = true;
  1399. continue;
  1400. }
  1401. }
1402. // Attempt to remove vector moves that were inserted to zero the upper bits.
  1403. if (Opc != TargetOpcode::SUBREG_TO_REG)
  1404. continue;
  1405. unsigned SubRegIdx = N->getConstantOperandVal(2);
  1406. if (SubRegIdx != X86::sub_xmm && SubRegIdx != X86::sub_ymm)
  1407. continue;
  1408. SDValue Move = N->getOperand(1);
  1409. if (!Move.isMachineOpcode())
  1410. continue;
1411. // Make sure it's one of the move opcodes we recognize.
  1412. switch (Move.getMachineOpcode()) {
  1413. default:
  1414. continue;
  1415. case X86::VMOVAPDrr: case X86::VMOVUPDrr:
  1416. case X86::VMOVAPSrr: case X86::VMOVUPSrr:
  1417. case X86::VMOVDQArr: case X86::VMOVDQUrr:
  1418. case X86::VMOVAPDYrr: case X86::VMOVUPDYrr:
  1419. case X86::VMOVAPSYrr: case X86::VMOVUPSYrr:
  1420. case X86::VMOVDQAYrr: case X86::VMOVDQUYrr:
  1421. case X86::VMOVAPDZ128rr: case X86::VMOVUPDZ128rr:
  1422. case X86::VMOVAPSZ128rr: case X86::VMOVUPSZ128rr:
  1423. case X86::VMOVDQA32Z128rr: case X86::VMOVDQU32Z128rr:
  1424. case X86::VMOVDQA64Z128rr: case X86::VMOVDQU64Z128rr:
  1425. case X86::VMOVAPDZ256rr: case X86::VMOVUPDZ256rr:
  1426. case X86::VMOVAPSZ256rr: case X86::VMOVUPSZ256rr:
  1427. case X86::VMOVDQA32Z256rr: case X86::VMOVDQU32Z256rr:
  1428. case X86::VMOVDQA64Z256rr: case X86::VMOVDQU64Z256rr:
  1429. break;
  1430. }
  1431. SDValue In = Move.getOperand(0);
  1432. if (!In.isMachineOpcode() ||
  1433. In.getMachineOpcode() <= TargetOpcode::GENERIC_OP_END)
  1434. continue;
1435. // Make sure the instruction has a VEX, XOP, or EVEX prefix. This excludes
1436. // the SHA instructions, which use a legacy encoding.
  1437. uint64_t TSFlags = getInstrInfo()->get(In.getMachineOpcode()).TSFlags;
  1438. if ((TSFlags & X86II::EncodingMask) != X86II::VEX &&
  1439. (TSFlags & X86II::EncodingMask) != X86II::EVEX &&
  1440. (TSFlags & X86II::EncodingMask) != X86II::XOP)
  1441. continue;
1442. // The producing instruction is a VEX/EVEX/XOP-encoded vector instruction, so
1443. // it already zeroes the upper bits of the destination register; drop the move.
  1444. CurDAG->UpdateNodeOperands(N, N->getOperand(0), In, N->getOperand(2));
  1445. MadeChange = true;
  1446. }
  1447. if (MadeChange)
  1448. CurDAG->RemoveDeadNodes();
  1449. }
  1450. /// Emit any code that needs to be executed only in the main function.
  1451. void X86DAGToDAGISel::emitSpecialCodeForMain() {
  1452. if (Subtarget->isTargetCygMing()) {
  1453. TargetLowering::ArgListTy Args;
  1454. auto &DL = CurDAG->getDataLayout();
  1455. TargetLowering::CallLoweringInfo CLI(*CurDAG);
  1456. CLI.setChain(CurDAG->getRoot())
  1457. .setCallee(CallingConv::C, Type::getVoidTy(*CurDAG->getContext()),
  1458. CurDAG->getExternalSymbol("__main", TLI->getPointerTy(DL)),
  1459. std::move(Args));
  1460. const TargetLowering &TLI = CurDAG->getTargetLoweringInfo();
  1461. std::pair<SDValue, SDValue> Result = TLI.LowerCallTo(CLI);
  1462. CurDAG->setRoot(Result.second);
  1463. }
  1464. }
  1465. void X86DAGToDAGISel::emitFunctionEntryCode() {
  1466. // If this is main, emit special code for main.
  1467. const Function &F = MF->getFunction();
  1468. if (F.hasExternalLinkage() && F.getName() == "main")
  1469. emitSpecialCodeForMain();
  1470. }
  1471. static bool isDispSafeForFrameIndex(int64_t Val) {
  1472. // On 64-bit platforms, we can run into an issue where a frame index
  1473. // includes a displacement that, when added to the explicit displacement,
  1474. // will overflow the displacement field. Assuming that the frame index
  1475. // displacement fits into a 31-bit integer (which is only slightly more
  1476. // aggressive than the current fundamental assumption that it fits into
  1477. // a 32-bit integer), a 31-bit disp should always be safe.
  1478. return isInt<31>(Val);
  1479. }
  1480. bool X86DAGToDAGISel::foldOffsetIntoAddress(uint64_t Offset,
  1481. X86ISelAddressMode &AM) {
  1482. // We may have already matched a displacement and the caller just added the
  1483. // symbolic displacement. So we still need to do the checks even if Offset
  1484. // is zero.
  1485. int64_t Val = AM.Disp + Offset;
  1486. // Cannot combine ExternalSymbol displacements with integer offsets.
  1487. if (Val != 0 && (AM.ES || AM.MCSym))
  1488. return true;
  1489. CodeModel::Model M = TM.getCodeModel();
  1490. if (Subtarget->is64Bit()) {
  1491. if (Val != 0 &&
  1492. !X86::isOffsetSuitableForCodeModel(Val, M,
  1493. AM.hasSymbolicDisplacement()))
  1494. return true;
  1495. // In addition to the checks required for a register base, check that
  1496. // we do not try to use an unsafe Disp with a frame index.
  1497. if (AM.BaseType == X86ISelAddressMode::FrameIndexBase &&
  1498. !isDispSafeForFrameIndex(Val))
  1499. return true;
  1500. }
  1501. AM.Disp = Val;
  1502. return false;
  1503. }
  1504. bool X86DAGToDAGISel::matchLoadInAddress(LoadSDNode *N, X86ISelAddressMode &AM,
  1505. bool AllowSegmentRegForX32) {
  1506. SDValue Address = N->getOperand(1);
  1507. // load gs:0 -> GS segment register.
  1508. // load fs:0 -> FS segment register.
  1509. //
  1510. // This optimization is generally valid because the GNU TLS model defines that
  1511. // gs:0 (or fs:0 on X86-64) contains its own address. However, for X86-64 mode
  1512. // with 32-bit registers, as we get in ILP32 mode, those registers are first
1513. // zero-extended to 64 bits and then added to the base address, which gives
  1514. // unwanted results when the register holds a negative value.
  1515. // For more information see http://people.redhat.com/drepper/tls.pdf
  1516. if (auto *C = dyn_cast<ConstantSDNode>(Address)) {
  1517. if (C->getSExtValue() == 0 && AM.Segment.getNode() == nullptr &&
  1518. !IndirectTlsSegRefs &&
  1519. (Subtarget->isTargetGlibc() || Subtarget->isTargetAndroid() ||
  1520. Subtarget->isTargetFuchsia())) {
  1521. if (Subtarget->isTarget64BitILP32() && !AllowSegmentRegForX32)
  1522. return true;
  1523. switch (N->getPointerInfo().getAddrSpace()) {
  1524. case X86AS::GS:
  1525. AM.Segment = CurDAG->getRegister(X86::GS, MVT::i16);
  1526. return false;
  1527. case X86AS::FS:
  1528. AM.Segment = CurDAG->getRegister(X86::FS, MVT::i16);
  1529. return false;
  1530. // Address space X86AS::SS is not handled here, because it is not used to
  1531. // address TLS areas.
  1532. }
  1533. }
  1534. }
  1535. return true;
  1536. }
  1537. /// Try to match X86ISD::Wrapper and X86ISD::WrapperRIP nodes into an addressing
  1538. /// mode. These wrap things that will resolve down into a symbol reference.
  1539. /// If no match is possible, this returns true, otherwise it returns false.
  1540. bool X86DAGToDAGISel::matchWrapper(SDValue N, X86ISelAddressMode &AM) {
  1541. // If the addressing mode already has a symbol as the displacement, we can
  1542. // never match another symbol.
  1543. if (AM.hasSymbolicDisplacement())
  1544. return true;
  1545. bool IsRIPRelTLS = false;
  1546. bool IsRIPRel = N.getOpcode() == X86ISD::WrapperRIP;
  1547. if (IsRIPRel) {
  1548. SDValue Val = N.getOperand(0);
  1549. if (Val.getOpcode() == ISD::TargetGlobalTLSAddress)
  1550. IsRIPRelTLS = true;
  1551. }
  1552. // We can't use an addressing mode in the 64-bit large code model.
  1553. // Global TLS addressing is an exception. In the medium code model,
1554. // we can use a mode when RIP wrappers are present.
  1555. // That signifies access to globals that are known to be "near",
  1556. // such as the GOT itself.
  1557. CodeModel::Model M = TM.getCodeModel();
  1558. if (Subtarget->is64Bit() &&
  1559. ((M == CodeModel::Large && !IsRIPRelTLS) ||
  1560. (M == CodeModel::Medium && !IsRIPRel)))
  1561. return true;
  1562. // Base and index reg must be 0 in order to use %rip as base.
  1563. if (IsRIPRel && AM.hasBaseOrIndexReg())
  1564. return true;
  1565. // Make a local copy in case we can't do this fold.
  1566. X86ISelAddressMode Backup = AM;
  1567. int64_t Offset = 0;
  1568. SDValue N0 = N.getOperand(0);
  1569. if (auto *G = dyn_cast<GlobalAddressSDNode>(N0)) {
  1570. AM.GV = G->getGlobal();
  1571. AM.SymbolFlags = G->getTargetFlags();
  1572. Offset = G->getOffset();
  1573. } else if (auto *CP = dyn_cast<ConstantPoolSDNode>(N0)) {
  1574. AM.CP = CP->getConstVal();
  1575. AM.Alignment = CP->getAlign();
  1576. AM.SymbolFlags = CP->getTargetFlags();
  1577. Offset = CP->getOffset();
  1578. } else if (auto *S = dyn_cast<ExternalSymbolSDNode>(N0)) {
  1579. AM.ES = S->getSymbol();
  1580. AM.SymbolFlags = S->getTargetFlags();
  1581. } else if (auto *S = dyn_cast<MCSymbolSDNode>(N0)) {
  1582. AM.MCSym = S->getMCSymbol();
  1583. } else if (auto *J = dyn_cast<JumpTableSDNode>(N0)) {
  1584. AM.JT = J->getIndex();
  1585. AM.SymbolFlags = J->getTargetFlags();
  1586. } else if (auto *BA = dyn_cast<BlockAddressSDNode>(N0)) {
  1587. AM.BlockAddr = BA->getBlockAddress();
  1588. AM.SymbolFlags = BA->getTargetFlags();
  1589. Offset = BA->getOffset();
  1590. } else
  1591. llvm_unreachable("Unhandled symbol reference node.");
  1592. if (foldOffsetIntoAddress(Offset, AM)) {
  1593. AM = Backup;
  1594. return true;
  1595. }
  1596. if (IsRIPRel)
  1597. AM.setBaseReg(CurDAG->getRegister(X86::RIP, MVT::i64));
  1598. // Commit the changes now that we know this fold is safe.
  1599. return false;
  1600. }
  1601. /// Add the specified node to the specified addressing mode, returning true if
  1602. /// it cannot be done. This just pattern matches for the addressing mode.
  1603. bool X86DAGToDAGISel::matchAddress(SDValue N, X86ISelAddressMode &AM) {
  1604. if (matchAddressRecursively(N, AM, 0))
  1605. return true;
  1606. // Post-processing: Make a second attempt to fold a load, if we now know
  1607. // that there will not be any other register. This is only performed for
  1608. // 64-bit ILP32 mode since 32-bit mode and 64-bit LP64 mode will have folded
  1609. // any foldable load the first time.
  1610. if (Subtarget->isTarget64BitILP32() &&
  1611. AM.BaseType == X86ISelAddressMode::RegBase &&
  1612. AM.Base_Reg.getNode() != nullptr && AM.IndexReg.getNode() == nullptr) {
  1613. SDValue Save_Base_Reg = AM.Base_Reg;
  1614. if (auto *LoadN = dyn_cast<LoadSDNode>(Save_Base_Reg)) {
  1615. AM.Base_Reg = SDValue();
  1616. if (matchLoadInAddress(LoadN, AM, /*AllowSegmentRegForX32=*/true))
  1617. AM.Base_Reg = Save_Base_Reg;
  1618. }
  1619. }
  1620. // Post-processing: Convert lea(,%reg,2) to lea(%reg,%reg), which has
  1621. // a smaller encoding and avoids a scaled-index.
  1622. if (AM.Scale == 2 &&
  1623. AM.BaseType == X86ISelAddressMode::RegBase &&
  1624. AM.Base_Reg.getNode() == nullptr) {
  1625. AM.Base_Reg = AM.IndexReg;
  1626. AM.Scale = 1;
  1627. }
  1628. // Post-processing: Convert foo to foo(%rip), even in non-PIC mode,
  1629. // because it has a smaller encoding.
  1630. // TODO: Which other code models can use this?
  1631. switch (TM.getCodeModel()) {
  1632. default: break;
  1633. case CodeModel::Small:
  1634. case CodeModel::Kernel:
  1635. if (Subtarget->is64Bit() &&
  1636. AM.Scale == 1 &&
  1637. AM.BaseType == X86ISelAddressMode::RegBase &&
  1638. AM.Base_Reg.getNode() == nullptr &&
  1639. AM.IndexReg.getNode() == nullptr &&
  1640. AM.SymbolFlags == X86II::MO_NO_FLAG &&
  1641. AM.hasSymbolicDisplacement())
  1642. AM.Base_Reg = CurDAG->getRegister(X86::RIP, MVT::i64);
  1643. break;
  1644. }
  1645. return false;
  1646. }
  1647. bool X86DAGToDAGISel::matchAdd(SDValue &N, X86ISelAddressMode &AM,
  1648. unsigned Depth) {
  1649. // Add an artificial use to this node so that we can keep track of
  1650. // it if it gets CSE'd with a different node.
  1651. HandleSDNode Handle(N);
  1652. X86ISelAddressMode Backup = AM;
  1653. if (!matchAddressRecursively(N.getOperand(0), AM, Depth+1) &&
  1654. !matchAddressRecursively(Handle.getValue().getOperand(1), AM, Depth+1))
  1655. return false;
  1656. AM = Backup;
  1657. // Try again after commutating the operands.
  1658. if (!matchAddressRecursively(Handle.getValue().getOperand(1), AM,
  1659. Depth + 1) &&
  1660. !matchAddressRecursively(Handle.getValue().getOperand(0), AM, Depth + 1))
  1661. return false;
  1662. AM = Backup;
  1663. // If we couldn't fold both operands into the address at the same time,
  1664. // see if we can just put each operand into a register and fold at least
  1665. // the add.
  1666. if (AM.BaseType == X86ISelAddressMode::RegBase &&
  1667. !AM.Base_Reg.getNode() &&
  1668. !AM.IndexReg.getNode()) {
  1669. N = Handle.getValue();
  1670. AM.Base_Reg = N.getOperand(0);
  1671. AM.IndexReg = N.getOperand(1);
  1672. AM.Scale = 1;
  1673. return false;
  1674. }
  1675. N = Handle.getValue();
  1676. return true;
  1677. }
  1678. // Insert a node into the DAG at least before the Pos node's position. This
  1679. // will reposition the node as needed, and will assign it a node ID that is <=
  1680. // the Pos node's ID. Note that this does *not* preserve the uniqueness of node
  1681. // IDs! The selection DAG must no longer depend on their uniqueness when this
  1682. // is used.
  1683. static void insertDAGNode(SelectionDAG &DAG, SDValue Pos, SDValue N) {
  1684. if (N->getNodeId() == -1 ||
  1685. (SelectionDAGISel::getUninvalidatedNodeId(N.getNode()) >
  1686. SelectionDAGISel::getUninvalidatedNodeId(Pos.getNode()))) {
  1687. DAG.RepositionNode(Pos->getIterator(), N.getNode());
1688. // Mark the node as invalid for pruning, since after this it may be a successor
1689. // of a selected node while otherwise being in the same position as Pos.
1690. // Conservatively mark it with the same -abs(Id) to ensure the node id
1691. // invariant is preserved.
  1692. N->setNodeId(Pos->getNodeId());
  1693. SelectionDAGISel::InvalidateNodeId(N.getNode());
  1694. }
  1695. }
  1696. // Transform "(X >> (8-C1)) & (0xff << C1)" to "((X >> 8) & 0xff) << C1" if
1697. // safe. This allows us to convert the shift and AND into an h-register
  1698. // extract and a scaled index. Returns false if the simplification is
  1699. // performed.
  1700. static bool foldMaskAndShiftToExtract(SelectionDAG &DAG, SDValue N,
  1701. uint64_t Mask,
  1702. SDValue Shift, SDValue X,
  1703. X86ISelAddressMode &AM) {
  1704. if (Shift.getOpcode() != ISD::SRL ||
  1705. !isa<ConstantSDNode>(Shift.getOperand(1)) ||
  1706. !Shift.hasOneUse())
  1707. return true;
  1708. int ScaleLog = 8 - Shift.getConstantOperandVal(1);
  1709. if (ScaleLog <= 0 || ScaleLog >= 4 ||
  1710. Mask != (0xffu << ScaleLog))
  1711. return true;
  1712. MVT VT = N.getSimpleValueType();
  1713. SDLoc DL(N);
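// Rebuild the value as ((X >> 8) & 0xff) << C1: the shift-by-8 plus 0xff mask
// can become an h-register extract, and the final shl is absorbed into the
// addressing-mode scale below.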
  1714. SDValue Eight = DAG.getConstant(8, DL, MVT::i8);
  1715. SDValue NewMask = DAG.getConstant(0xff, DL, VT);
  1716. SDValue Srl = DAG.getNode(ISD::SRL, DL, VT, X, Eight);
  1717. SDValue And = DAG.getNode(ISD::AND, DL, VT, Srl, NewMask);
  1718. SDValue ShlCount = DAG.getConstant(ScaleLog, DL, MVT::i8);
  1719. SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, And, ShlCount);
  1720. // Insert the new nodes into the topological ordering. We must do this in
  1721. // a valid topological ordering as nothing is going to go back and re-sort
  1722. // these nodes. We continually insert before 'N' in sequence as this is
  1723. // essentially a pre-flattened and pre-sorted sequence of nodes. There is no
  1724. // hierarchy left to express.
  1725. insertDAGNode(DAG, N, Eight);
  1726. insertDAGNode(DAG, N, Srl);
  1727. insertDAGNode(DAG, N, NewMask);
  1728. insertDAGNode(DAG, N, And);
  1729. insertDAGNode(DAG, N, ShlCount);
  1730. insertDAGNode(DAG, N, Shl);
  1731. DAG.ReplaceAllUsesWith(N, Shl);
  1732. DAG.RemoveDeadNode(N.getNode());
  1733. AM.IndexReg = And;
  1734. AM.Scale = (1 << ScaleLog);
  1735. return false;
  1736. }
  1737. // Transforms "(X << C1) & C2" to "(X & (C2>>C1)) << C1" if safe and if this
  1738. // allows us to fold the shift into this addressing mode. Returns false if the
  1739. // transform succeeded.
  1740. static bool foldMaskedShiftToScaledMask(SelectionDAG &DAG, SDValue N,
  1741. X86ISelAddressMode &AM) {
  1742. SDValue Shift = N.getOperand(0);
  1743. // Use a signed mask so that shifting right will insert sign bits. These
  1744. // bits will be removed when we shift the result left so it doesn't matter
  1745. // what we use. This might allow a smaller immediate encoding.
  1746. int64_t Mask = cast<ConstantSDNode>(N->getOperand(1))->getSExtValue();
  1747. // If we have an any_extend feeding the AND, look through it to see if there
  1748. // is a shift behind it. But only if the AND doesn't use the extended bits.
  1749. // FIXME: Generalize this to other ANY_EXTEND than i32 to i64?
  1750. bool FoundAnyExtend = false;
  1751. if (Shift.getOpcode() == ISD::ANY_EXTEND && Shift.hasOneUse() &&
  1752. Shift.getOperand(0).getSimpleValueType() == MVT::i32 &&
  1753. isUInt<32>(Mask)) {
  1754. FoundAnyExtend = true;
  1755. Shift = Shift.getOperand(0);
  1756. }
  1757. if (Shift.getOpcode() != ISD::SHL ||
  1758. !isa<ConstantSDNode>(Shift.getOperand(1)))
  1759. return true;
  1760. SDValue X = Shift.getOperand(0);
  1761. // Not likely to be profitable if either the AND or SHIFT node has more
  1762. // than one use (unless all uses are for address computation). Besides,
1763. // the isel mechanism requires their node ids to be reused.
  1764. if (!N.hasOneUse() || !Shift.hasOneUse())
  1765. return true;
  1766. // Verify that the shift amount is something we can fold.
  1767. unsigned ShiftAmt = Shift.getConstantOperandVal(1);
  1768. if (ShiftAmt != 1 && ShiftAmt != 2 && ShiftAmt != 3)
  1769. return true;
  1770. MVT VT = N.getSimpleValueType();
  1771. SDLoc DL(N);
  1772. if (FoundAnyExtend) {
  1773. SDValue NewX = DAG.getNode(ISD::ANY_EXTEND, DL, VT, X);
  1774. insertDAGNode(DAG, N, NewX);
  1775. X = NewX;
  1776. }
  1777. SDValue NewMask = DAG.getConstant(Mask >> ShiftAmt, DL, VT);
  1778. SDValue NewAnd = DAG.getNode(ISD::AND, DL, VT, X, NewMask);
  1779. SDValue NewShift = DAG.getNode(ISD::SHL, DL, VT, NewAnd, Shift.getOperand(1));
  1780. // Insert the new nodes into the topological ordering. We must do this in
  1781. // a valid topological ordering as nothing is going to go back and re-sort
  1782. // these nodes. We continually insert before 'N' in sequence as this is
  1783. // essentially a pre-flattened and pre-sorted sequence of nodes. There is no
  1784. // hierarchy left to express.
  1785. insertDAGNode(DAG, N, NewMask);
  1786. insertDAGNode(DAG, N, NewAnd);
  1787. insertDAGNode(DAG, N, NewShift);
  1788. DAG.ReplaceAllUsesWith(N, NewShift);
  1789. DAG.RemoveDeadNode(N.getNode());
  1790. AM.Scale = 1 << ShiftAmt;
  1791. AM.IndexReg = NewAnd;
  1792. return false;
  1793. }
  1794. // Implement some heroics to detect shifts of masked values where the mask can
  1795. // be replaced by extending the shift and undoing that in the addressing mode
  1796. // scale. Patterns such as (shl (srl x, c1), c2) are canonicalized into (and
  1797. // (srl x, SHIFT), MASK) by DAGCombines that don't know the shl can be done in
  1798. // the addressing mode. This results in code such as:
  1799. //
  1800. // int f(short *y, int *lookup_table) {
  1801. // ...
  1802. // return *y + lookup_table[*y >> 11];
  1803. // }
  1804. //
  1805. // Turning into:
  1806. // movzwl (%rdi), %eax
  1807. // movl %eax, %ecx
  1808. // shrl $11, %ecx
  1809. // addl (%rsi,%rcx,4), %eax
  1810. //
  1811. // Instead of:
  1812. // movzwl (%rdi), %eax
  1813. // movl %eax, %ecx
  1814. // shrl $9, %ecx
1815. // andl $124, %ecx
  1816. // addl (%rsi,%rcx), %eax
  1817. //
  1818. // Note that this function assumes the mask is provided as a mask *after* the
  1819. // value is shifted. The input chain may or may not match that, but computing
  1820. // such a mask is trivial.
  1821. static bool foldMaskAndShiftToScale(SelectionDAG &DAG, SDValue N,
  1822. uint64_t Mask,
  1823. SDValue Shift, SDValue X,
  1824. X86ISelAddressMode &AM) {
  1825. if (Shift.getOpcode() != ISD::SRL || !Shift.hasOneUse() ||
  1826. !isa<ConstantSDNode>(Shift.getOperand(1)))
  1827. return true;
  1828. unsigned ShiftAmt = Shift.getConstantOperandVal(1);
  1829. unsigned MaskLZ = countLeadingZeros(Mask);
  1830. unsigned MaskTZ = countTrailingZeros(Mask);
  1831. // The amount of shift we're trying to fit into the addressing mode is taken
  1832. // from the trailing zeros of the mask.
  1833. unsigned AMShiftAmt = MaskTZ;
  1834. // There is nothing we can do here unless the mask is removing some bits.
  1835. // Also, the addressing mode can only represent shifts of 1, 2, or 3 bits.
  1836. if (AMShiftAmt == 0 || AMShiftAmt > 3) return true;
1837. // We also need to ensure that the mask is a contiguous run of bits.
  1838. if (countTrailingOnes(Mask >> MaskTZ) + MaskTZ + MaskLZ != 64) return true;
  1839. // Scale the leading zero count down based on the actual size of the value.
  1840. // Also scale it down based on the size of the shift.
  1841. unsigned ScaleDown = (64 - X.getSimpleValueType().getSizeInBits()) + ShiftAmt;
  1842. if (MaskLZ < ScaleDown)
  1843. return true;
  1844. MaskLZ -= ScaleDown;
  1845. // The final check is to ensure that any masked out high bits of X are
  1846. // already known to be zero. Otherwise, the mask has a semantic impact
  1847. // other than masking out a couple of low bits. Unfortunately, because of
  1848. // the mask, zero extensions will be removed from operands in some cases.
  1849. // This code works extra hard to look through extensions because we can
  1850. // replace them with zero extensions cheaply if necessary.
  1851. bool ReplacingAnyExtend = false;
  1852. if (X.getOpcode() == ISD::ANY_EXTEND) {
  1853. unsigned ExtendBits = X.getSimpleValueType().getSizeInBits() -
  1854. X.getOperand(0).getSimpleValueType().getSizeInBits();
  1855. // Assume that we'll replace the any-extend with a zero-extend, and
  1856. // narrow the search to the extended value.
  1857. X = X.getOperand(0);
  1858. MaskLZ = ExtendBits > MaskLZ ? 0 : MaskLZ - ExtendBits;
  1859. ReplacingAnyExtend = true;
  1860. }
  1861. APInt MaskedHighBits =
  1862. APInt::getHighBitsSet(X.getSimpleValueType().getSizeInBits(), MaskLZ);
  1863. KnownBits Known = DAG.computeKnownBits(X);
  1864. if (MaskedHighBits != Known.Zero) return true;
  1865. // We've identified a pattern that can be transformed into a single shift
  1866. // and an addressing mode. Make it so.
  1867. MVT VT = N.getSimpleValueType();
  1868. if (ReplacingAnyExtend) {
  1869. assert(X.getValueType() != VT);
  1870. // We looked through an ANY_EXTEND node, insert a ZERO_EXTEND.
  1871. SDValue NewX = DAG.getNode(ISD::ZERO_EXTEND, SDLoc(X), VT, X);
  1872. insertDAGNode(DAG, N, NewX);
  1873. X = NewX;
  1874. }
  1875. SDLoc DL(N);
  1876. SDValue NewSRLAmt = DAG.getConstant(ShiftAmt + AMShiftAmt, DL, MVT::i8);
  1877. SDValue NewSRL = DAG.getNode(ISD::SRL, DL, VT, X, NewSRLAmt);
  1878. SDValue NewSHLAmt = DAG.getConstant(AMShiftAmt, DL, MVT::i8);
  1879. SDValue NewSHL = DAG.getNode(ISD::SHL, DL, VT, NewSRL, NewSHLAmt);
  1880. // Insert the new nodes into the topological ordering. We must do this in
  1881. // a valid topological ordering as nothing is going to go back and re-sort
  1882. // these nodes. We continually insert before 'N' in sequence as this is
  1883. // essentially a pre-flattened and pre-sorted sequence of nodes. There is no
  1884. // hierarchy left to express.
  1885. insertDAGNode(DAG, N, NewSRLAmt);
  1886. insertDAGNode(DAG, N, NewSRL);
  1887. insertDAGNode(DAG, N, NewSHLAmt);
  1888. insertDAGNode(DAG, N, NewSHL);
  1889. DAG.ReplaceAllUsesWith(N, NewSHL);
  1890. DAG.RemoveDeadNode(N.getNode());
  1891. AM.Scale = 1 << AMShiftAmt;
  1892. AM.IndexReg = NewSRL;
  1893. return false;
  1894. }
  1895. // Transform "(X >> SHIFT) & (MASK << C1)" to
  1896. // "((X >> (SHIFT + C1)) & (MASK)) << C1". Everything before the SHL will be
  1897. // matched to a BEXTR later. Returns false if the simplification is performed.
  1898. static bool foldMaskedShiftToBEXTR(SelectionDAG &DAG, SDValue N,
  1899. uint64_t Mask,
  1900. SDValue Shift, SDValue X,
  1901. X86ISelAddressMode &AM,
  1902. const X86Subtarget &Subtarget) {
  1903. if (Shift.getOpcode() != ISD::SRL ||
  1904. !isa<ConstantSDNode>(Shift.getOperand(1)) ||
  1905. !Shift.hasOneUse() || !N.hasOneUse())
  1906. return true;
  1907. // Only do this if BEXTR will be matched by matchBEXTRFromAndImm.
  1908. if (!Subtarget.hasTBM() &&
  1909. !(Subtarget.hasBMI() && Subtarget.hasFastBEXTR()))
  1910. return true;
1911. // We need to ensure that the mask is a contiguous run of bits.
  1912. if (!isShiftedMask_64(Mask)) return true;
  1913. unsigned ShiftAmt = Shift.getConstantOperandVal(1);
  1914. // The amount of shift we're trying to fit into the addressing mode is taken
  1915. // from the trailing zeros of the mask.
  1916. unsigned AMShiftAmt = countTrailingZeros(Mask);
  1917. // There is nothing we can do here unless the mask is removing some bits.
  1918. // Also, the addressing mode can only represent shifts of 1, 2, or 3 bits.
  1919. if (AMShiftAmt == 0 || AMShiftAmt > 3) return true;
  1920. MVT VT = N.getSimpleValueType();
  1921. SDLoc DL(N);
  1922. SDValue NewSRLAmt = DAG.getConstant(ShiftAmt + AMShiftAmt, DL, MVT::i8);
  1923. SDValue NewSRL = DAG.getNode(ISD::SRL, DL, VT, X, NewSRLAmt);
  1924. SDValue NewMask = DAG.getConstant(Mask >> AMShiftAmt, DL, VT);
  1925. SDValue NewAnd = DAG.getNode(ISD::AND, DL, VT, NewSRL, NewMask);
  1926. SDValue NewSHLAmt = DAG.getConstant(AMShiftAmt, DL, MVT::i8);
  1927. SDValue NewSHL = DAG.getNode(ISD::SHL, DL, VT, NewAnd, NewSHLAmt);
  1928. // Insert the new nodes into the topological ordering. We must do this in
  1929. // a valid topological ordering as nothing is going to go back and re-sort
  1930. // these nodes. We continually insert before 'N' in sequence as this is
  1931. // essentially a pre-flattened and pre-sorted sequence of nodes. There is no
  1932. // hierarchy left to express.
  1933. insertDAGNode(DAG, N, NewSRLAmt);
  1934. insertDAGNode(DAG, N, NewSRL);
  1935. insertDAGNode(DAG, N, NewMask);
  1936. insertDAGNode(DAG, N, NewAnd);
  1937. insertDAGNode(DAG, N, NewSHLAmt);
  1938. insertDAGNode(DAG, N, NewSHL);
  1939. DAG.ReplaceAllUsesWith(N, NewSHL);
  1940. DAG.RemoveDeadNode(N.getNode());
  1941. AM.Scale = 1 << AMShiftAmt;
  1942. AM.IndexReg = NewAnd;
  1943. return false;
  1944. }
  1945. bool X86DAGToDAGISel::matchAddressRecursively(SDValue N, X86ISelAddressMode &AM,
  1946. unsigned Depth) {
  1947. SDLoc dl(N);
  1948. LLVM_DEBUG({
  1949. dbgs() << "MatchAddress: ";
  1950. AM.dump(CurDAG);
  1951. });
  1952. // Limit recursion.
  1953. if (Depth > 5)
  1954. return matchAddressBase(N, AM);
  1955. // If this is already a %rip relative address, we can only merge immediates
  1956. // into it. Instead of handling this in every case, we handle it here.
  1957. // RIP relative addressing: %rip + 32-bit displacement!
  1958. if (AM.isRIPRelative()) {
  1959. // FIXME: JumpTable and ExternalSymbol address currently don't like
  1960. // displacements. It isn't very important, but this should be fixed for
  1961. // consistency.
  1962. if (!(AM.ES || AM.MCSym) && AM.JT != -1)
  1963. return true;
  1964. if (auto *Cst = dyn_cast<ConstantSDNode>(N))
  1965. if (!foldOffsetIntoAddress(Cst->getSExtValue(), AM))
  1966. return false;
  1967. return true;
  1968. }
  1969. switch (N.getOpcode()) {
  1970. default: break;
  1971. case ISD::LOCAL_RECOVER: {
  1972. if (!AM.hasSymbolicDisplacement() && AM.Disp == 0)
  1973. if (const auto *ESNode = dyn_cast<MCSymbolSDNode>(N.getOperand(0))) {
  1974. // Use the symbol and don't prefix it.
  1975. AM.MCSym = ESNode->getMCSymbol();
  1976. return false;
  1977. }
  1978. break;
  1979. }
  1980. case ISD::Constant: {
  1981. uint64_t Val = cast<ConstantSDNode>(N)->getSExtValue();
  1982. if (!foldOffsetIntoAddress(Val, AM))
  1983. return false;
  1984. break;
  1985. }
  1986. case X86ISD::Wrapper:
  1987. case X86ISD::WrapperRIP:
  1988. if (!matchWrapper(N, AM))
  1989. return false;
  1990. break;
  1991. case ISD::LOAD:
  1992. if (!matchLoadInAddress(cast<LoadSDNode>(N), AM))
  1993. return false;
  1994. break;
  1995. case ISD::FrameIndex:
  1996. if (AM.BaseType == X86ISelAddressMode::RegBase &&
  1997. AM.Base_Reg.getNode() == nullptr &&
  1998. (!Subtarget->is64Bit() || isDispSafeForFrameIndex(AM.Disp))) {
  1999. AM.BaseType = X86ISelAddressMode::FrameIndexBase;
  2000. AM.Base_FrameIndex = cast<FrameIndexSDNode>(N)->getIndex();
  2001. return false;
  2002. }
  2003. break;
  2004. case ISD::SHL:
  2005. if (AM.IndexReg.getNode() != nullptr || AM.Scale != 1)
  2006. break;
  2007. if (auto *CN = dyn_cast<ConstantSDNode>(N.getOperand(1))) {
  2008. unsigned Val = CN->getZExtValue();
  2009. // Note that we handle x<<1 as (,x,2) rather than (x,x) here so
  2010. // that the base operand remains free for further matching. If
  2011. // the base doesn't end up getting used, a post-processing step
  2012. // in MatchAddress turns (,x,2) into (x,x), which is cheaper.
  2013. if (Val == 1 || Val == 2 || Val == 3) {
  2014. AM.Scale = 1 << Val;
  2015. SDValue ShVal = N.getOperand(0);
  2016. // Okay, we know that we have a scale by now. However, if the scaled
  2017. // value is an add of something and a constant, we can fold the
  2018. // constant into the disp field here.
  2019. if (CurDAG->isBaseWithConstantOffset(ShVal)) {
  2020. AM.IndexReg = ShVal.getOperand(0);
  2021. auto *AddVal = cast<ConstantSDNode>(ShVal.getOperand(1));
  2022. uint64_t Disp = (uint64_t)AddVal->getSExtValue() << Val;
  2023. if (!foldOffsetIntoAddress(Disp, AM))
  2024. return false;
  2025. }
  2026. AM.IndexReg = ShVal;
  2027. return false;
  2028. }
  2029. }
  2030. break;
  2031. case ISD::SRL: {
  2032. // Scale must not be used already.
  2033. if (AM.IndexReg.getNode() != nullptr || AM.Scale != 1) break;
  2034. // We only handle up to 64-bit values here as those are what matter for
  2035. // addressing mode optimizations.
  2036. assert(N.getSimpleValueType().getSizeInBits() <= 64 &&
  2037. "Unexpected value size!");
  2038. SDValue And = N.getOperand(0);
  2039. if (And.getOpcode() != ISD::AND) break;
  2040. SDValue X = And.getOperand(0);
  2041. // The mask used for the transform is expected to be post-shift, but we
  2042. // found the shift first so just apply the shift to the mask before passing
  2043. // it down.
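// For example, for (srl (and %x, 0x3FC), 2) the pre-shift mask 0x3FC becomes
// the post-shift mask 0xFF before being handed to foldMaskAndShiftToScale.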
  2044. if (!isa<ConstantSDNode>(N.getOperand(1)) ||
  2045. !isa<ConstantSDNode>(And.getOperand(1)))
  2046. break;
  2047. uint64_t Mask = And.getConstantOperandVal(1) >> N.getConstantOperandVal(1);
  2048. // Try to fold the mask and shift into the scale, and return false if we
  2049. // succeed.
  2050. if (!foldMaskAndShiftToScale(*CurDAG, N, Mask, N, X, AM))
  2051. return false;
  2052. break;
  2053. }
  2054. case ISD::SMUL_LOHI:
  2055. case ISD::UMUL_LOHI:
  2056. // A mul_lohi where we need the low part can be folded as a plain multiply.
  2057. if (N.getResNo() != 0) break;
  2058. [[fallthrough]];
  2059. case ISD::MUL:
  2060. case X86ISD::MUL_IMM:
  2061. // X*[3,5,9] -> X+X*[2,4,8]
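// Setting Base = Index = X with Scale = multiplier-1 encodes X + X*(multiplier-1);
// e.g. X*5 becomes the LEA form (X, X, 4).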
  2062. if (AM.BaseType == X86ISelAddressMode::RegBase &&
  2063. AM.Base_Reg.getNode() == nullptr &&
  2064. AM.IndexReg.getNode() == nullptr) {
  2065. if (auto *CN = dyn_cast<ConstantSDNode>(N.getOperand(1)))
  2066. if (CN->getZExtValue() == 3 || CN->getZExtValue() == 5 ||
  2067. CN->getZExtValue() == 9) {
  2068. AM.Scale = unsigned(CN->getZExtValue())-1;
  2069. SDValue MulVal = N.getOperand(0);
  2070. SDValue Reg;
  2071. // Okay, we know that we have a scale by now. However, if the scaled
  2072. // value is an add of something and a constant, we can fold the
  2073. // constant into the disp field here.
  2074. if (MulVal.getNode()->getOpcode() == ISD::ADD && MulVal.hasOneUse() &&
  2075. isa<ConstantSDNode>(MulVal.getOperand(1))) {
  2076. Reg = MulVal.getOperand(0);
  2077. auto *AddVal = cast<ConstantSDNode>(MulVal.getOperand(1));
  2078. uint64_t Disp = AddVal->getSExtValue() * CN->getZExtValue();
  2079. if (foldOffsetIntoAddress(Disp, AM))
  2080. Reg = N.getOperand(0);
  2081. } else {
  2082. Reg = N.getOperand(0);
  2083. }
  2084. AM.IndexReg = AM.Base_Reg = Reg;
  2085. return false;
  2086. }
  2087. }
  2088. break;
  2089. case ISD::SUB: {
2090. // Given A-B, if A can be completely folded into the address while leaving
2091. // the index field unused, use -B as the index.
2092. // This is a win if A has multiple parts that can be folded into
2093. // the address. It also saves a mov if the base register has
2094. // other uses, since it avoids a two-address sub instruction; however,
2095. // it costs an additional mov if the index register has other uses.
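// When profitable, the RHS is placed in the index field with NegateIndex set,
// so the final address computes A + (-B); the NEG itself is emitted later.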
  2096. // Add an artificial use to this node so that we can keep track of
  2097. // it if it gets CSE'd with a different node.
  2098. HandleSDNode Handle(N);
  2099. // Test if the LHS of the sub can be folded.
  2100. X86ISelAddressMode Backup = AM;
  2101. if (matchAddressRecursively(N.getOperand(0), AM, Depth+1)) {
  2102. N = Handle.getValue();
  2103. AM = Backup;
  2104. break;
  2105. }
  2106. N = Handle.getValue();
  2107. // Test if the index field is free for use.
  2108. if (AM.IndexReg.getNode() || AM.isRIPRelative()) {
  2109. AM = Backup;
  2110. break;
  2111. }
  2112. int Cost = 0;
  2113. SDValue RHS = N.getOperand(1);
  2114. // If the RHS involves a register with multiple uses, this
  2115. // transformation incurs an extra mov, due to the neg instruction
  2116. // clobbering its operand.
  2117. if (!RHS.getNode()->hasOneUse() ||
  2118. RHS.getNode()->getOpcode() == ISD::CopyFromReg ||
  2119. RHS.getNode()->getOpcode() == ISD::TRUNCATE ||
  2120. RHS.getNode()->getOpcode() == ISD::ANY_EXTEND ||
  2121. (RHS.getNode()->getOpcode() == ISD::ZERO_EXTEND &&
  2122. RHS.getOperand(0).getValueType() == MVT::i32))
  2123. ++Cost;
  2124. // If the base is a register with multiple uses, this
  2125. // transformation may save a mov.
  2126. if ((AM.BaseType == X86ISelAddressMode::RegBase && AM.Base_Reg.getNode() &&
  2127. !AM.Base_Reg.getNode()->hasOneUse()) ||
  2128. AM.BaseType == X86ISelAddressMode::FrameIndexBase)
  2129. --Cost;
  2130. // If the folded LHS was interesting, this transformation saves
  2131. // address arithmetic.
  2132. if ((AM.hasSymbolicDisplacement() && !Backup.hasSymbolicDisplacement()) +
  2133. ((AM.Disp != 0) && (Backup.Disp == 0)) +
  2134. (AM.Segment.getNode() && !Backup.Segment.getNode()) >= 2)
  2135. --Cost;
  2136. // If it doesn't look like it may be an overall win, don't do it.
  2137. if (Cost >= 0) {
  2138. AM = Backup;
  2139. break;
  2140. }
  2141. // Ok, the transformation is legal and appears profitable. Go for it.
  2142. // Negation will be emitted later to avoid creating dangling nodes if this
  2143. // was an unprofitable LEA.
  2144. AM.IndexReg = RHS;
  2145. AM.NegateIndex = true;
  2146. AM.Scale = 1;
  2147. return false;
  2148. }
  2149. case ISD::ADD:
  2150. if (!matchAdd(N, AM, Depth))
  2151. return false;
  2152. break;
  2153. case ISD::OR:
  2154. // We want to look through a transform in InstCombine and DAGCombiner that
  2155. // turns 'add' into 'or', so we can treat this 'or' exactly like an 'add'.
  2156. // Example: (or (and x, 1), (shl y, 3)) --> (add (and x, 1), (shl y, 3))
  2157. // An 'lea' can then be used to match the shift (multiply) and add:
  2158. // and $1, %esi
  2159. // lea (%rsi, %rdi, 8), %rax
  2160. if (CurDAG->haveNoCommonBitsSet(N.getOperand(0), N.getOperand(1)) &&
  2161. !matchAdd(N, AM, Depth))
  2162. return false;
  2163. break;
  2164. case ISD::XOR:
  2165. // We want to look through a transform in InstCombine that
  2166. // turns 'add' with min_signed_val into 'xor', so we can treat this 'xor'
  2167. // exactly like an 'add'.
  2168. if (isMinSignedConstant(N.getOperand(1)) && !matchAdd(N, AM, Depth))
  2169. return false;
  2170. break;
  2171. case ISD::AND: {
  2172. // Perform some heroic transforms on an and of a constant-count shift
  2173. // with a constant to enable use of the scaled offset field.
  2174. // Scale must not be used already.
  2175. if (AM.IndexReg.getNode() != nullptr || AM.Scale != 1) break;
  2176. // We only handle up to 64-bit values here as those are what matter for
  2177. // addressing mode optimizations.
  2178. assert(N.getSimpleValueType().getSizeInBits() <= 64 &&
  2179. "Unexpected value size!");
  2180. if (!isa<ConstantSDNode>(N.getOperand(1)))
  2181. break;
  2182. if (N.getOperand(0).getOpcode() == ISD::SRL) {
  2183. SDValue Shift = N.getOperand(0);
  2184. SDValue X = Shift.getOperand(0);
  2185. uint64_t Mask = N.getConstantOperandVal(1);
  2186. // Try to fold the mask and shift into an extract and scale.
  2187. if (!foldMaskAndShiftToExtract(*CurDAG, N, Mask, Shift, X, AM))
  2188. return false;
  2189. // Try to fold the mask and shift directly into the scale.
  2190. if (!foldMaskAndShiftToScale(*CurDAG, N, Mask, Shift, X, AM))
  2191. return false;
  2192. // Try to fold the mask and shift into BEXTR and scale.
  2193. if (!foldMaskedShiftToBEXTR(*CurDAG, N, Mask, Shift, X, AM, *Subtarget))
  2194. return false;
  2195. }
  2196. // Try to swap the mask and shift to place shifts which can be done as
  2197. // a scale on the outside of the mask.
  2198. if (!foldMaskedShiftToScaledMask(*CurDAG, N, AM))
  2199. return false;
  2200. break;
  2201. }
  2202. case ISD::ZERO_EXTEND: {
  2203. // Try to widen a zexted shift left to the same size as its use, so we can
  2204. // match the shift as a scale factor.
  2205. if (AM.IndexReg.getNode() != nullptr || AM.Scale != 1)
  2206. break;
  2207. if (N.getOperand(0).getOpcode() != ISD::SHL || !N.getOperand(0).hasOneUse())
  2208. break;
  2209. // Give up if the shift is not a valid scale factor [1,2,3].
  2210. SDValue Shl = N.getOperand(0);
  2211. auto *ShAmtC = dyn_cast<ConstantSDNode>(Shl.getOperand(1));
  2212. if (!ShAmtC || ShAmtC->getZExtValue() > 3)
  2213. break;
  2214. // The narrow shift must only shift out zero bits (it must be 'nuw').
  2215. // That makes it safe to widen to the destination type.
  2216. APInt HighZeros = APInt::getHighBitsSet(Shl.getValueSizeInBits(),
  2217. ShAmtC->getZExtValue());
  2218. if (!CurDAG->MaskedValueIsZero(Shl.getOperand(0), HighZeros))
  2219. break;
  2220. // zext (shl nuw i8 %x, C) to i32 --> shl (zext i8 %x to i32), (zext C)
  2221. MVT VT = N.getSimpleValueType();
  2222. SDLoc DL(N);
  2223. SDValue Zext = CurDAG->getNode(ISD::ZERO_EXTEND, DL, VT, Shl.getOperand(0));
  2224. SDValue NewShl = CurDAG->getNode(ISD::SHL, DL, VT, Zext, Shl.getOperand(1));
  2225. // Convert the shift to scale factor.
  2226. AM.Scale = 1 << ShAmtC->getZExtValue();
  2227. AM.IndexReg = Zext;
  2228. insertDAGNode(*CurDAG, N, Zext);
  2229. insertDAGNode(*CurDAG, N, NewShl);
  2230. CurDAG->ReplaceAllUsesWith(N, NewShl);
  2231. CurDAG->RemoveDeadNode(N.getNode());
  2232. return false;
  2233. }
  2234. }
  2235. return matchAddressBase(N, AM);
  2236. }
  2237. /// Helper for MatchAddress. Add the specified node to the
  2238. /// specified addressing mode without any further recursion.
  2239. bool X86DAGToDAGISel::matchAddressBase(SDValue N, X86ISelAddressMode &AM) {
  2240. // Is the base register already occupied?
  2241. if (AM.BaseType != X86ISelAddressMode::RegBase || AM.Base_Reg.getNode()) {
  2242. // If so, check to see if the scale index register is set.
  2243. if (!AM.IndexReg.getNode()) {
  2244. AM.IndexReg = N;
  2245. AM.Scale = 1;
  2246. return false;
  2247. }
  2248. // Otherwise, we cannot select it.
  2249. return true;
  2250. }
  2251. // Default, generate it as a register.
  2252. AM.BaseType = X86ISelAddressMode::RegBase;
  2253. AM.Base_Reg = N;
  2254. return false;
  2255. }
  2256. bool X86DAGToDAGISel::matchVectorAddressRecursively(SDValue N,
  2257. X86ISelAddressMode &AM,
  2258. unsigned Depth) {
  2259. SDLoc dl(N);
  2260. LLVM_DEBUG({
  2261. dbgs() << "MatchVectorAddress: ";
  2262. AM.dump(CurDAG);
  2263. });
  2264. // Limit recursion.
  2265. if (Depth > 5)
  2266. return matchAddressBase(N, AM);
  2267. // TODO: Support other operations.
  2268. switch (N.getOpcode()) {
  2269. case ISD::Constant: {
  2270. uint64_t Val = cast<ConstantSDNode>(N)->getSExtValue();
  2271. if (!foldOffsetIntoAddress(Val, AM))
  2272. return false;
  2273. break;
  2274. }
  2275. case X86ISD::Wrapper:
  2276. if (!matchWrapper(N, AM))
  2277. return false;
  2278. break;
  2279. case ISD::ADD: {
  2280. // Add an artificial use to this node so that we can keep track of
  2281. // it if it gets CSE'd with a different node.
  2282. HandleSDNode Handle(N);
  2283. X86ISelAddressMode Backup = AM;
  2284. if (!matchVectorAddressRecursively(N.getOperand(0), AM, Depth + 1) &&
  2285. !matchVectorAddressRecursively(Handle.getValue().getOperand(1), AM,
  2286. Depth + 1))
  2287. return false;
  2288. AM = Backup;
  2289. // Try again after commuting the operands.
  2290. if (!matchVectorAddressRecursively(Handle.getValue().getOperand(1), AM,
  2291. Depth + 1) &&
  2292. !matchVectorAddressRecursively(Handle.getValue().getOperand(0), AM,
  2293. Depth + 1))
  2294. return false;
  2295. AM = Backup;
  2296. N = Handle.getValue();
  2297. break;
  2298. }
  2299. }
  2300. return matchAddressBase(N, AM);
  2301. }
  2302. /// Helper for selectVectorAddr. Handles things that can be folded into a
  2303. /// gather/scatter address. The index register and scale should have already
  2304. /// been handled.
  2305. bool X86DAGToDAGISel::matchVectorAddress(SDValue N, X86ISelAddressMode &AM) {
  2306. return matchVectorAddressRecursively(N, AM, 0);
  2307. }
  2308. bool X86DAGToDAGISel::selectVectorAddr(MemSDNode *Parent, SDValue BasePtr,
  2309. SDValue IndexOp, SDValue ScaleOp,
  2310. SDValue &Base, SDValue &Scale,
  2311. SDValue &Index, SDValue &Disp,
  2312. SDValue &Segment) {
  2313. X86ISelAddressMode AM;
  2314. AM.IndexReg = IndexOp;
  2315. AM.Scale = cast<ConstantSDNode>(ScaleOp)->getZExtValue();
  2316. unsigned AddrSpace = Parent->getPointerInfo().getAddrSpace();
  2317. if (AddrSpace == X86AS::GS)
  2318. AM.Segment = CurDAG->getRegister(X86::GS, MVT::i16);
  2319. if (AddrSpace == X86AS::FS)
  2320. AM.Segment = CurDAG->getRegister(X86::FS, MVT::i16);
  2321. if (AddrSpace == X86AS::SS)
  2322. AM.Segment = CurDAG->getRegister(X86::SS, MVT::i16);
  2323. SDLoc DL(BasePtr);
  2324. MVT VT = BasePtr.getSimpleValueType();
  2325. // Try to match into the base and displacement fields.
  2326. if (matchVectorAddress(BasePtr, AM))
  2327. return false;
  2328. getAddressOperands(AM, DL, VT, Base, Scale, Index, Disp, Segment);
  2329. return true;
  2330. }
  2331. /// Returns true if it is able to pattern match an addressing mode.
2332. /// It returns, by reference, the operands which make up the maximal
2333. /// addressing mode it can match.
  2334. ///
  2335. /// Parent is the parent node of the addr operand that is being matched. It
  2336. /// is always a load, store, atomic node, or null. It is only null when
  2337. /// checking memory operands for inline asm nodes.
  2338. bool X86DAGToDAGISel::selectAddr(SDNode *Parent, SDValue N, SDValue &Base,
  2339. SDValue &Scale, SDValue &Index,
  2340. SDValue &Disp, SDValue &Segment) {
  2341. X86ISelAddressMode AM;
  2342. if (Parent &&
2343. // This list of opcodes covers all the nodes that have an "addr:$ptr" operand
2344. // but are not MemSDNodes, and thus don't have proper addrspace info.
  2345. Parent->getOpcode() != ISD::INTRINSIC_W_CHAIN && // unaligned loads, fixme
  2346. Parent->getOpcode() != ISD::INTRINSIC_VOID && // nontemporal stores
  2347. Parent->getOpcode() != X86ISD::TLSCALL && // Fixme
  2348. Parent->getOpcode() != X86ISD::ENQCMD && // Fixme
  2349. Parent->getOpcode() != X86ISD::ENQCMDS && // Fixme
  2350. Parent->getOpcode() != X86ISD::EH_SJLJ_SETJMP && // setjmp
  2351. Parent->getOpcode() != X86ISD::EH_SJLJ_LONGJMP) { // longjmp
  2352. unsigned AddrSpace =
  2353. cast<MemSDNode>(Parent)->getPointerInfo().getAddrSpace();
  2354. if (AddrSpace == X86AS::GS)
  2355. AM.Segment = CurDAG->getRegister(X86::GS, MVT::i16);
  2356. if (AddrSpace == X86AS::FS)
  2357. AM.Segment = CurDAG->getRegister(X86::FS, MVT::i16);
  2358. if (AddrSpace == X86AS::SS)
  2359. AM.Segment = CurDAG->getRegister(X86::SS, MVT::i16);
  2360. }
2361. // Save the DL and VT before calling matchAddress; it can invalidate N.
  2362. SDLoc DL(N);
  2363. MVT VT = N.getSimpleValueType();
  2364. if (matchAddress(N, AM))
  2365. return false;
  2366. getAddressOperands(AM, DL, VT, Base, Scale, Index, Disp, Segment);
  2367. return true;
  2368. }
  2369. bool X86DAGToDAGISel::selectMOV64Imm32(SDValue N, SDValue &Imm) {
  2370. // In static codegen with small code model, we can get the address of a label
  2371. // into a register with 'movl'
  2372. if (N->getOpcode() != X86ISD::Wrapper)
  2373. return false;
  2374. N = N.getOperand(0);
  2375. // At least GNU as does not accept 'movl' for TPOFF relocations.
  2376. // FIXME: We could use 'movl' when we know we are targeting MC.
  2377. if (N->getOpcode() == ISD::TargetGlobalTLSAddress)
  2378. return false;
  2379. Imm = N;
  2380. if (N->getOpcode() != ISD::TargetGlobalAddress)
  2381. return TM.getCodeModel() == CodeModel::Small;
  2382. std::optional<ConstantRange> CR =
  2383. cast<GlobalAddressSDNode>(N)->getGlobal()->getAbsoluteSymbolRange();
  2384. if (!CR)
  2385. return TM.getCodeModel() == CodeModel::Small;
  2386. return CR->getUnsignedMax().ult(1ull << 32);
  2387. }
  2388. bool X86DAGToDAGISel::selectLEA64_32Addr(SDValue N, SDValue &Base,
  2389. SDValue &Scale, SDValue &Index,
  2390. SDValue &Disp, SDValue &Segment) {
  2391. // Save the debug loc before calling selectLEAAddr, in case it invalidates N.
  2392. SDLoc DL(N);
  2393. if (!selectLEAAddr(N, Base, Scale, Index, Disp, Segment))
  2394. return false;
  2395. auto *RN = dyn_cast<RegisterSDNode>(Base);
  2396. if (RN && RN->getReg() == 0)
  2397. Base = CurDAG->getRegister(0, MVT::i64);
  2398. else if (Base.getValueType() == MVT::i32 && !isa<FrameIndexSDNode>(Base)) {
  2399. // Base could already be %rip, particularly in the x32 ABI.
  2400. SDValue ImplDef = SDValue(CurDAG->getMachineNode(X86::IMPLICIT_DEF, DL,
  2401. MVT::i64), 0);
  2402. Base = CurDAG->getTargetInsertSubreg(X86::sub_32bit, DL, MVT::i64, ImplDef,
  2403. Base);
  2404. }
  2405. RN = dyn_cast<RegisterSDNode>(Index);
  2406. if (RN && RN->getReg() == 0)
  2407. Index = CurDAG->getRegister(0, MVT::i64);
  2408. else {
  2409. assert(Index.getValueType() == MVT::i32 &&
  2410. "Expect to be extending 32-bit registers for use in LEA");
  2411. SDValue ImplDef = SDValue(CurDAG->getMachineNode(X86::IMPLICIT_DEF, DL,
  2412. MVT::i64), 0);
  2413. Index = CurDAG->getTargetInsertSubreg(X86::sub_32bit, DL, MVT::i64, ImplDef,
  2414. Index);
  2415. }
  2416. return true;
  2417. }
  2418. /// Calls SelectAddr and determines if the maximal addressing
  2419. /// mode it matches can be cost effectively emitted as an LEA instruction.
  2420. bool X86DAGToDAGISel::selectLEAAddr(SDValue N,
  2421. SDValue &Base, SDValue &Scale,
  2422. SDValue &Index, SDValue &Disp,
  2423. SDValue &Segment) {
  2424. X86ISelAddressMode AM;
2425. // Save the DL and VT before calling matchAddress; it can invalidate N.
  2426. SDLoc DL(N);
  2427. MVT VT = N.getSimpleValueType();
  2428. // Set AM.Segment to prevent MatchAddress from using one. LEA doesn't support
  2429. // segments.
  2430. SDValue Copy = AM.Segment;
  2431. SDValue T = CurDAG->getRegister(0, MVT::i32);
  2432. AM.Segment = T;
  2433. if (matchAddress(N, AM))
  2434. return false;
  2435. assert (T == AM.Segment);
  2436. AM.Segment = Copy;
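// Rough heuristic: Complexity counts how many address components (base, index,
// scale, displacement) the LEA would combine; only Complexity > 2 is considered
// worth an LEA. For example, base + index + displacement scores 3 and is
// accepted, while a lone base register scores 1 and is rejected.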
  2437. unsigned Complexity = 0;
  2438. if (AM.BaseType == X86ISelAddressMode::RegBase && AM.Base_Reg.getNode())
  2439. Complexity = 1;
  2440. else if (AM.BaseType == X86ISelAddressMode::FrameIndexBase)
  2441. Complexity = 4;
  2442. if (AM.IndexReg.getNode())
  2443. Complexity++;
  2444. // Don't match just leal(,%reg,2). It's cheaper to do addl %reg, %reg, or with
  2445. // a simple shift.
  2446. if (AM.Scale > 1)
  2447. Complexity++;
  2448. // FIXME: We are artificially lowering the criteria to turn ADD %reg, $GA
  2449. // to a LEA. This is determined with some experimentation but is by no means
  2450. // optimal (especially for code size consideration). LEA is nice because of
  2451. // its three-address nature. Tweak the cost function again when we can run
  2452. // convertToThreeAddress() at register allocation time.
  2453. if (AM.hasSymbolicDisplacement()) {
  2454. // For X86-64, always use LEA to materialize RIP-relative addresses.
  2455. if (Subtarget->is64Bit())
  2456. Complexity = 4;
  2457. else
  2458. Complexity += 2;
  2459. }
  2460. // Heuristic: try harder to form an LEA from ADD if the operands set flags.
  2461. // Unlike ADD, LEA does not affect flags, so we will be less likely to require
  2462. // duplicating flag-producing instructions later in the pipeline.
  2463. if (N.getOpcode() == ISD::ADD) {
  2464. auto isMathWithFlags = [](SDValue V) {
  2465. switch (V.getOpcode()) {
  2466. case X86ISD::ADD:
  2467. case X86ISD::SUB:
  2468. case X86ISD::ADC:
  2469. case X86ISD::SBB:
  2470. case X86ISD::SMUL:
  2471. case X86ISD::UMUL:
  2472. /* TODO: These opcodes can be added safely, but we may want to justify
  2473. their inclusion for different reasons (better for reg-alloc).
  2474. case X86ISD::OR:
  2475. case X86ISD::XOR:
  2476. case X86ISD::AND:
  2477. */
  2478. // Value 1 is the flag output of the node - verify it's not dead.
  2479. return !SDValue(V.getNode(), 1).use_empty();
  2480. default:
  2481. return false;
  2482. }
  2483. };
  2484. // TODO: We might want to factor in whether there's a load folding
  2485. // opportunity for the math op that disappears with LEA.
  2486. if (isMathWithFlags(N.getOperand(0)) || isMathWithFlags(N.getOperand(1)))
  2487. Complexity++;
  2488. }
  2489. if (AM.Disp)
  2490. Complexity++;
  2491. // If it isn't worth using an LEA, reject it.
  2492. if (Complexity <= 2)
  2493. return false;
  2494. getAddressOperands(AM, DL, VT, Base, Scale, Index, Disp, Segment);
  2495. return true;
  2496. }
  2497. /// This is only run on TargetGlobalTLSAddress nodes.
  2498. bool X86DAGToDAGISel::selectTLSADDRAddr(SDValue N, SDValue &Base,
  2499. SDValue &Scale, SDValue &Index,
  2500. SDValue &Disp, SDValue &Segment) {
  2501. assert(N.getOpcode() == ISD::TargetGlobalTLSAddress);
  2502. auto *GA = cast<GlobalAddressSDNode>(N);
  2503. X86ISelAddressMode AM;
  2504. AM.GV = GA->getGlobal();
  2505. AM.Disp += GA->getOffset();
  2506. AM.SymbolFlags = GA->getTargetFlags();
  2507. if (Subtarget->is32Bit()) {
  2508. AM.Scale = 1;
  2509. AM.IndexReg = CurDAG->getRegister(X86::EBX, MVT::i32);
  2510. }
  2511. MVT VT = N.getSimpleValueType();
  2512. getAddressOperands(AM, SDLoc(N), VT, Base, Scale, Index, Disp, Segment);
  2513. return true;
  2514. }
  2515. bool X86DAGToDAGISel::selectRelocImm(SDValue N, SDValue &Op) {
  2516. // Keep track of the original value type and whether this value was
  2517. // truncated. If we see a truncation from pointer type to VT that truncates
  2518. // bits that are known to be zero, we can use a narrow reference.
  2519. EVT VT = N.getValueType();
  2520. bool WasTruncated = false;
  2521. if (N.getOpcode() == ISD::TRUNCATE) {
  2522. WasTruncated = true;
  2523. N = N.getOperand(0);
  2524. }
  2525. if (N.getOpcode() != X86ISD::Wrapper)
  2526. return false;
  2527. // We can only use non-GlobalValues as immediates if they were not truncated,
  2528. // as we do not have any range information. If we have a GlobalValue and the
  2529. // address was not truncated, we can select it as an operand directly.
  2530. unsigned Opc = N.getOperand(0)->getOpcode();
  2531. if (Opc != ISD::TargetGlobalAddress || !WasTruncated) {
  2532. Op = N.getOperand(0);
  2533. // We can only select the operand directly if we didn't have to look past a
  2534. // truncate.
  2535. return !WasTruncated;
  2536. }
  2537. // Check that the global's range fits into VT.
  2538. auto *GA = cast<GlobalAddressSDNode>(N.getOperand(0));
  2539. std::optional<ConstantRange> CR = GA->getGlobal()->getAbsoluteSymbolRange();
  2540. if (!CR || CR->getUnsignedMax().uge(1ull << VT.getSizeInBits()))
  2541. return false;
  2542. // Okay, we can use a narrow reference.
  2543. Op = CurDAG->getTargetGlobalAddress(GA->getGlobal(), SDLoc(N), VT,
  2544. GA->getOffset(), GA->getTargetFlags());
  2545. return true;
  2546. }
  2547. bool X86DAGToDAGISel::tryFoldLoad(SDNode *Root, SDNode *P, SDValue N,
  2548. SDValue &Base, SDValue &Scale,
  2549. SDValue &Index, SDValue &Disp,
  2550. SDValue &Segment) {
  2551. assert(Root && P && "Unknown root/parent nodes");
  2552. if (!ISD::isNON_EXTLoad(N.getNode()) ||
  2553. !IsProfitableToFold(N, P, Root) ||
  2554. !IsLegalToFold(N, P, Root, OptLevel))
  2555. return false;
  2556. return selectAddr(N.getNode(),
  2557. N.getOperand(1), Base, Scale, Index, Disp, Segment);
  2558. }
  2559. bool X86DAGToDAGISel::tryFoldBroadcast(SDNode *Root, SDNode *P, SDValue N,
  2560. SDValue &Base, SDValue &Scale,
  2561. SDValue &Index, SDValue &Disp,
  2562. SDValue &Segment) {
  2563. assert(Root && P && "Unknown root/parent nodes");
  2564. if (N->getOpcode() != X86ISD::VBROADCAST_LOAD ||
  2565. !IsProfitableToFold(N, P, Root) ||
  2566. !IsLegalToFold(N, P, Root, OptLevel))
  2567. return false;
  2568. return selectAddr(N.getNode(),
  2569. N.getOperand(1), Base, Scale, Index, Disp, Segment);
  2570. }
  2571. /// Return an SDNode that returns the value of the global base register.
  2572. /// Output instructions required to initialize the global base register,
  2573. /// if necessary.
  2574. SDNode *X86DAGToDAGISel::getGlobalBaseReg() {
  2575. unsigned GlobalBaseReg = getInstrInfo()->getGlobalBaseReg(MF);
  2576. auto &DL = MF->getDataLayout();
  2577. return CurDAG->getRegister(GlobalBaseReg, TLI->getPointerTy(DL)).getNode();
  2578. }
  2579. bool X86DAGToDAGISel::isSExtAbsoluteSymbolRef(unsigned Width, SDNode *N) const {
  2580. if (N->getOpcode() == ISD::TRUNCATE)
  2581. N = N->getOperand(0).getNode();
  2582. if (N->getOpcode() != X86ISD::Wrapper)
  2583. return false;
  2584. auto *GA = dyn_cast<GlobalAddressSDNode>(N->getOperand(0));
  2585. if (!GA)
  2586. return false;
  2587. std::optional<ConstantRange> CR = GA->getGlobal()->getAbsoluteSymbolRange();
  2588. if (!CR)
  2589. return Width == 32 && TM.getCodeModel() == CodeModel::Small;
  2590. return CR->getSignedMin().sge(-1ull << Width) &&
  2591. CR->getSignedMax().slt(1ull << Width);
  2592. }
  2593. X86::CondCode X86DAGToDAGISel::getCondFromNode(SDNode *N) const {
  2594. assert(N->isMachineOpcode() && "Unexpected node");
  2595. unsigned Opc = N->getMachineOpcode();
  2596. const MCInstrDesc &MCID = getInstrInfo()->get(Opc);
  2597. int CondNo = X86::getCondSrcNoFromDesc(MCID);
  2598. if (CondNo < 0)
  2599. return X86::COND_INVALID;
  2600. return static_cast<X86::CondCode>(N->getConstantOperandVal(CondNo));
  2601. }
2602. /// Return true if every use of the given X86ISD::CMP node's flag result goes
2603. /// through a condition that only tests ZF (COND_E/COND_NE).
  2604. bool X86DAGToDAGISel::onlyUsesZeroFlag(SDValue Flags) const {
  2605. // Examine each user of the node.
  2606. for (SDNode::use_iterator UI = Flags->use_begin(), UE = Flags->use_end();
  2607. UI != UE; ++UI) {
  2608. // Only check things that use the flags.
  2609. if (UI.getUse().getResNo() != Flags.getResNo())
  2610. continue;
  2611. // Only examine CopyToReg uses that copy to EFLAGS.
  2612. if (UI->getOpcode() != ISD::CopyToReg ||
  2613. cast<RegisterSDNode>(UI->getOperand(1))->getReg() != X86::EFLAGS)
  2614. return false;
  2615. // Examine each user of the CopyToReg use.
  2616. for (SDNode::use_iterator FlagUI = UI->use_begin(),
  2617. FlagUE = UI->use_end(); FlagUI != FlagUE; ++FlagUI) {
  2618. // Only examine the Flag result.
  2619. if (FlagUI.getUse().getResNo() != 1) continue;
  2620. // Anything unusual: assume conservatively.
  2621. if (!FlagUI->isMachineOpcode()) return false;
  2622. // Examine the condition code of the user.
  2623. X86::CondCode CC = getCondFromNode(*FlagUI);
  2624. switch (CC) {
  2625. // Comparisons which only use the zero flag.
  2626. case X86::COND_E: case X86::COND_NE:
  2627. continue;
  2628. // Anything else: assume conservatively.
  2629. default:
  2630. return false;
  2631. }
  2632. }
  2633. }
  2634. return true;
  2635. }
  2636. /// Test whether the given X86ISD::CMP node has any uses which require the SF
  2637. /// flag to be accurate.
  2638. bool X86DAGToDAGISel::hasNoSignFlagUses(SDValue Flags) const {
  2639. // Examine each user of the node.
  2640. for (SDNode::use_iterator UI = Flags->use_begin(), UE = Flags->use_end();
  2641. UI != UE; ++UI) {
  2642. // Only check things that use the flags.
  2643. if (UI.getUse().getResNo() != Flags.getResNo())
  2644. continue;
  2645. // Only examine CopyToReg uses that copy to EFLAGS.
  2646. if (UI->getOpcode() != ISD::CopyToReg ||
  2647. cast<RegisterSDNode>(UI->getOperand(1))->getReg() != X86::EFLAGS)
  2648. return false;
  2649. // Examine each user of the CopyToReg use.
  2650. for (SDNode::use_iterator FlagUI = UI->use_begin(),
  2651. FlagUE = UI->use_end(); FlagUI != FlagUE; ++FlagUI) {
  2652. // Only examine the Flag result.
  2653. if (FlagUI.getUse().getResNo() != 1) continue;
  2654. // Anything unusual: assume conservatively.
  2655. if (!FlagUI->isMachineOpcode()) return false;
  2656. // Examine the condition code of the user.
  2657. X86::CondCode CC = getCondFromNode(*FlagUI);
  2658. switch (CC) {
  2659. // Comparisons which don't examine the SF flag.
  2660. case X86::COND_A: case X86::COND_AE:
  2661. case X86::COND_B: case X86::COND_BE:
  2662. case X86::COND_E: case X86::COND_NE:
  2663. case X86::COND_O: case X86::COND_NO:
  2664. case X86::COND_P: case X86::COND_NP:
  2665. continue;
  2666. // Anything else: assume conservatively.
  2667. default:
  2668. return false;
  2669. }
  2670. }
  2671. }
  2672. return true;
  2673. }
  2674. static bool mayUseCarryFlag(X86::CondCode CC) {
  2675. switch (CC) {
  2676. // Comparisons which don't examine the CF flag.
  2677. case X86::COND_O: case X86::COND_NO:
  2678. case X86::COND_E: case X86::COND_NE:
  2679. case X86::COND_S: case X86::COND_NS:
  2680. case X86::COND_P: case X86::COND_NP:
  2681. case X86::COND_L: case X86::COND_GE:
  2682. case X86::COND_G: case X86::COND_LE:
  2683. return false;
  2684. // Anything else: assume conservatively.
  2685. default:
  2686. return true;
  2687. }
  2688. }
  2689. /// Test whether the given node which sets flags has any uses which require the
  2690. /// CF flag to be accurate.
  2691. bool X86DAGToDAGISel::hasNoCarryFlagUses(SDValue Flags) const {
  2692. // Examine each user of the node.
  2693. for (SDNode::use_iterator UI = Flags->use_begin(), UE = Flags->use_end();
  2694. UI != UE; ++UI) {
  2695. // Only check things that use the flags.
  2696. if (UI.getUse().getResNo() != Flags.getResNo())
  2697. continue;
  2698. unsigned UIOpc = UI->getOpcode();
  2699. if (UIOpc == ISD::CopyToReg) {
  2700. // Only examine CopyToReg uses that copy to EFLAGS.
  2701. if (cast<RegisterSDNode>(UI->getOperand(1))->getReg() != X86::EFLAGS)
  2702. return false;
  2703. // Examine each user of the CopyToReg use.
  2704. for (SDNode::use_iterator FlagUI = UI->use_begin(), FlagUE = UI->use_end();
  2705. FlagUI != FlagUE; ++FlagUI) {
  2706. // Only examine the Flag result.
  2707. if (FlagUI.getUse().getResNo() != 1)
  2708. continue;
  2709. // Anything unusual: assume conservatively.
  2710. if (!FlagUI->isMachineOpcode())
  2711. return false;
  2712. // Examine the condition code of the user.
  2713. X86::CondCode CC = getCondFromNode(*FlagUI);
  2714. if (mayUseCarryFlag(CC))
  2715. return false;
  2716. }
  2717. // This CopyToReg is ok. Move on to the next user.
  2718. continue;
  2719. }
  2720. // This might be an unselected node. So look for the pre-isel opcodes that
  2721. // use flags.
  2722. unsigned CCOpNo;
  2723. switch (UIOpc) {
  2724. default:
  2725. // Something unusual. Be conservative.
  2726. return false;
  2727. case X86ISD::SETCC: CCOpNo = 0; break;
  2728. case X86ISD::SETCC_CARRY: CCOpNo = 0; break;
  2729. case X86ISD::CMOV: CCOpNo = 2; break;
  2730. case X86ISD::BRCOND: CCOpNo = 2; break;
  2731. }
  2732. X86::CondCode CC = (X86::CondCode)UI->getConstantOperandVal(CCOpNo);
  2733. if (mayUseCarryFlag(CC))
  2734. return false;
  2735. }
  2736. return true;
  2737. }
2738. /// Check whether or not the chain ending in StoreNode is suitable for the
2739. /// {load; op; store}-to-modify transformation.
  2740. static bool isFusableLoadOpStorePattern(StoreSDNode *StoreNode,
  2741. SDValue StoredVal, SelectionDAG *CurDAG,
  2742. unsigned LoadOpNo,
  2743. LoadSDNode *&LoadNode,
  2744. SDValue &InputChain) {
  2745. // Is the stored value result 0 of the operation?
  2746. if (StoredVal.getResNo() != 0) return false;
  2747. // Are there other uses of the operation other than the store?
  2748. if (!StoredVal.getNode()->hasNUsesOfValue(1, 0)) return false;
  2749. // Is the store non-extending and non-indexed?
  2750. if (!ISD::isNormalStore(StoreNode) || StoreNode->isNonTemporal())
  2751. return false;
  2752. SDValue Load = StoredVal->getOperand(LoadOpNo);
  2753. // Is the stored value a non-extending and non-indexed load?
  2754. if (!ISD::isNormalLoad(Load.getNode())) return false;
  2755. // Return LoadNode by reference.
  2756. LoadNode = cast<LoadSDNode>(Load);
  2757. // Is store the only read of the loaded value?
  2758. if (!Load.hasOneUse())
  2759. return false;
  2760. // Is the address of the store the same as the load?
  2761. if (LoadNode->getBasePtr() != StoreNode->getBasePtr() ||
  2762. LoadNode->getOffset() != StoreNode->getOffset())
  2763. return false;
  2764. bool FoundLoad = false;
  2765. SmallVector<SDValue, 4> ChainOps;
  2766. SmallVector<const SDNode *, 4> LoopWorklist;
  2767. SmallPtrSet<const SDNode *, 16> Visited;
  2768. const unsigned int Max = 1024;
  2769. // Visualization of Load-Op-Store fusion:
  2770. // -------------------------
  2771. // Legend:
  2772. // *-lines = Chain operand dependencies.
  2773. // |-lines = Normal operand dependencies.
  2774. // Dependencies flow down and right. n-suffix references multiple nodes.
  2775. //
  2776. // C Xn C
  2777. // * * *
  2778. // * * *
  2779. // Xn A-LD Yn TF Yn
  2780. // * * \ | * |
  2781. // * * \ | * |
  2782. // * * \ | => A--LD_OP_ST
  2783. // * * \| \
  2784. // TF OP \
  2785. // * | \ Zn
  2786. // * | \
  2787. // A-ST Zn
  2788. //
  2789. // This merge induced dependences from: #1: Xn -> LD, OP, Zn
  2790. // #2: Yn -> LD
  2791. // #3: ST -> Zn
  2792. // Ensure the transform is safe by checking for the dual
  2793. // dependencies to make sure we do not induce a loop.
  2794. // As LD is a predecessor to both OP and ST we can do this by checking:
  2795. // a). if LD is a predecessor to a member of Xn or Yn.
  2796. // b). if a Zn is a predecessor to ST.
  2797. // However, (b) can only occur through being a chain predecessor to
  2798. // ST, which is the same as Zn being a member or predecessor of Xn,
  2799. // which is a subset of LD being a predecessor of Xn. So it's
  2800. // subsumed by check (a).
  2801. SDValue Chain = StoreNode->getChain();
  2802. // Gather X elements in ChainOps.
  2803. if (Chain == Load.getValue(1)) {
  2804. FoundLoad = true;
  2805. ChainOps.push_back(Load.getOperand(0));
  2806. } else if (Chain.getOpcode() == ISD::TokenFactor) {
  2807. for (unsigned i = 0, e = Chain.getNumOperands(); i != e; ++i) {
  2808. SDValue Op = Chain.getOperand(i);
  2809. if (Op == Load.getValue(1)) {
  2810. FoundLoad = true;
  2811. // Drop Load, but keep its chain. No cycle check necessary.
  2812. ChainOps.push_back(Load.getOperand(0));
  2813. continue;
  2814. }
  2815. LoopWorklist.push_back(Op.getNode());
  2816. ChainOps.push_back(Op);
  2817. }
  2818. }
  2819. if (!FoundLoad)
  2820. return false;
  2821. // Worklist is currently Xn. Add Yn to worklist.
  2822. for (SDValue Op : StoredVal->ops())
  2823. if (Op.getNode() != LoadNode)
  2824. LoopWorklist.push_back(Op.getNode());
  2825. // Check (a) if Load is a predecessor to Xn + Yn
  2826. if (SDNode::hasPredecessorHelper(Load.getNode(), Visited, LoopWorklist, Max,
  2827. true))
  2828. return false;
  2829. InputChain =
  2830. CurDAG->getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, ChainOps);
  2831. return true;
  2832. }
  2833. // Change a chain of {load; op; store} of the same value into a simple op
  2834. // through memory of that value, if the uses of the modified value and its
  2835. // address are suitable.
  2836. //
2837. // The tablegen memory-operand pattern is currently not able to match
2838. // the case where the EFLAGS produced by the original operation are used.
  2839. //
  2840. // To move this to tablegen, we'll need to improve tablegen to allow flags to
  2841. // be transferred from a node in the pattern to the result node, probably with
  2842. // a new keyword. For example, we have this
  2843. // def DEC64m : RI<0xFF, MRM1m, (outs), (ins i64mem:$dst), "dec{q}\t$dst",
  2844. // [(store (add (loadi64 addr:$dst), -1), addr:$dst),
  2845. // (implicit EFLAGS)]>;
  2846. // but maybe need something like this
  2847. // def DEC64m : RI<0xFF, MRM1m, (outs), (ins i64mem:$dst), "dec{q}\t$dst",
  2848. // [(store (add (loadi64 addr:$dst), -1), addr:$dst),
2849. // (transferable EFLAGS)]>;
  2850. //
  2851. // Until then, we manually fold these and instruction select the operation
  2852. // here.
  2853. bool X86DAGToDAGISel::foldLoadStoreIntoMemOperand(SDNode *Node) {
  2854. auto *StoreNode = cast<StoreSDNode>(Node);
  2855. SDValue StoredVal = StoreNode->getOperand(1);
  2856. unsigned Opc = StoredVal->getOpcode();
2857. // Before we try to select anything, make sure this is a memory operand size
2858. // and an opcode we can handle. Note that this must match the code below that
  2859. // actually lowers the opcodes.
  2860. EVT MemVT = StoreNode->getMemoryVT();
  2861. if (MemVT != MVT::i64 && MemVT != MVT::i32 && MemVT != MVT::i16 &&
  2862. MemVT != MVT::i8)
  2863. return false;
  2864. bool IsCommutable = false;
  2865. bool IsNegate = false;
  2866. switch (Opc) {
  2867. default:
  2868. return false;
  2869. case X86ISD::SUB:
  2870. IsNegate = isNullConstant(StoredVal.getOperand(0));
  2871. break;
  2872. case X86ISD::SBB:
  2873. break;
  2874. case X86ISD::ADD:
  2875. case X86ISD::ADC:
  2876. case X86ISD::AND:
  2877. case X86ISD::OR:
  2878. case X86ISD::XOR:
  2879. IsCommutable = true;
  2880. break;
  2881. }
  2882. unsigned LoadOpNo = IsNegate ? 1 : 0;
  2883. LoadSDNode *LoadNode = nullptr;
  2884. SDValue InputChain;
  2885. if (!isFusableLoadOpStorePattern(StoreNode, StoredVal, CurDAG, LoadOpNo,
  2886. LoadNode, InputChain)) {
  2887. if (!IsCommutable)
  2888. return false;
  2889. // This operation is commutable, try the other operand.
  2890. LoadOpNo = 1;
  2891. if (!isFusableLoadOpStorePattern(StoreNode, StoredVal, CurDAG, LoadOpNo,
  2892. LoadNode, InputChain))
  2893. return false;
  2894. }
  2895. SDValue Base, Scale, Index, Disp, Segment;
  2896. if (!selectAddr(LoadNode, LoadNode->getBasePtr(), Base, Scale, Index, Disp,
  2897. Segment))
  2898. return false;
  2899. auto SelectOpcode = [&](unsigned Opc64, unsigned Opc32, unsigned Opc16,
  2900. unsigned Opc8) {
  2901. switch (MemVT.getSimpleVT().SimpleTy) {
  2902. case MVT::i64:
  2903. return Opc64;
  2904. case MVT::i32:
  2905. return Opc32;
  2906. case MVT::i16:
  2907. return Opc16;
  2908. case MVT::i8:
  2909. return Opc8;
  2910. default:
  2911. llvm_unreachable("Invalid size!");
  2912. }
  2913. };
  2914. MachineSDNode *Result;
  2915. switch (Opc) {
  2916. case X86ISD::SUB:
  2917. // Handle negate.
  2918. if (IsNegate) {
  2919. unsigned NewOpc = SelectOpcode(X86::NEG64m, X86::NEG32m, X86::NEG16m,
  2920. X86::NEG8m);
  2921. const SDValue Ops[] = {Base, Scale, Index, Disp, Segment, InputChain};
  2922. Result = CurDAG->getMachineNode(NewOpc, SDLoc(Node), MVT::i32,
  2923. MVT::Other, Ops);
  2924. break;
  2925. }
  2926. [[fallthrough]];
  2927. case X86ISD::ADD:
  2928. // Try to match inc/dec.
  2929. if (!Subtarget->slowIncDec() || CurDAG->shouldOptForSize()) {
  2930. bool IsOne = isOneConstant(StoredVal.getOperand(1));
  2931. bool IsNegOne = isAllOnesConstant(StoredVal.getOperand(1));
2932. // ADD/SUB with 1/-1 can use INC/DEC when the carry flag result is unused.
  2933. if ((IsOne || IsNegOne) && hasNoCarryFlagUses(StoredVal.getValue(1))) {
  2934. unsigned NewOpc =
  2935. ((Opc == X86ISD::ADD) == IsOne)
  2936. ? SelectOpcode(X86::INC64m, X86::INC32m, X86::INC16m, X86::INC8m)
  2937. : SelectOpcode(X86::DEC64m, X86::DEC32m, X86::DEC16m, X86::DEC8m);
  2938. const SDValue Ops[] = {Base, Scale, Index, Disp, Segment, InputChain};
  2939. Result = CurDAG->getMachineNode(NewOpc, SDLoc(Node), MVT::i32,
  2940. MVT::Other, Ops);
  2941. break;
  2942. }
  2943. }
  2944. [[fallthrough]];
  2945. case X86ISD::ADC:
  2946. case X86ISD::SBB:
  2947. case X86ISD::AND:
  2948. case X86ISD::OR:
  2949. case X86ISD::XOR: {
  2950. auto SelectRegOpcode = [SelectOpcode](unsigned Opc) {
  2951. switch (Opc) {
  2952. case X86ISD::ADD:
  2953. return SelectOpcode(X86::ADD64mr, X86::ADD32mr, X86::ADD16mr,
  2954. X86::ADD8mr);
  2955. case X86ISD::ADC:
  2956. return SelectOpcode(X86::ADC64mr, X86::ADC32mr, X86::ADC16mr,
  2957. X86::ADC8mr);
  2958. case X86ISD::SUB:
  2959. return SelectOpcode(X86::SUB64mr, X86::SUB32mr, X86::SUB16mr,
  2960. X86::SUB8mr);
  2961. case X86ISD::SBB:
  2962. return SelectOpcode(X86::SBB64mr, X86::SBB32mr, X86::SBB16mr,
  2963. X86::SBB8mr);
  2964. case X86ISD::AND:
  2965. return SelectOpcode(X86::AND64mr, X86::AND32mr, X86::AND16mr,
  2966. X86::AND8mr);
  2967. case X86ISD::OR:
  2968. return SelectOpcode(X86::OR64mr, X86::OR32mr, X86::OR16mr, X86::OR8mr);
  2969. case X86ISD::XOR:
  2970. return SelectOpcode(X86::XOR64mr, X86::XOR32mr, X86::XOR16mr,
  2971. X86::XOR8mr);
  2972. default:
  2973. llvm_unreachable("Invalid opcode!");
  2974. }
  2975. };
  2976. auto SelectImm8Opcode = [SelectOpcode](unsigned Opc) {
  2977. switch (Opc) {
  2978. case X86ISD::ADD:
  2979. return SelectOpcode(X86::ADD64mi8, X86::ADD32mi8, X86::ADD16mi8, 0);
  2980. case X86ISD::ADC:
  2981. return SelectOpcode(X86::ADC64mi8, X86::ADC32mi8, X86::ADC16mi8, 0);
  2982. case X86ISD::SUB:
  2983. return SelectOpcode(X86::SUB64mi8, X86::SUB32mi8, X86::SUB16mi8, 0);
  2984. case X86ISD::SBB:
  2985. return SelectOpcode(X86::SBB64mi8, X86::SBB32mi8, X86::SBB16mi8, 0);
  2986. case X86ISD::AND:
  2987. return SelectOpcode(X86::AND64mi8, X86::AND32mi8, X86::AND16mi8, 0);
  2988. case X86ISD::OR:
  2989. return SelectOpcode(X86::OR64mi8, X86::OR32mi8, X86::OR16mi8, 0);
  2990. case X86ISD::XOR:
  2991. return SelectOpcode(X86::XOR64mi8, X86::XOR32mi8, X86::XOR16mi8, 0);
  2992. default:
  2993. llvm_unreachable("Invalid opcode!");
  2994. }
  2995. };
  2996. auto SelectImmOpcode = [SelectOpcode](unsigned Opc) {
  2997. switch (Opc) {
  2998. case X86ISD::ADD:
  2999. return SelectOpcode(X86::ADD64mi32, X86::ADD32mi, X86::ADD16mi,
  3000. X86::ADD8mi);
  3001. case X86ISD::ADC:
  3002. return SelectOpcode(X86::ADC64mi32, X86::ADC32mi, X86::ADC16mi,
  3003. X86::ADC8mi);
  3004. case X86ISD::SUB:
  3005. return SelectOpcode(X86::SUB64mi32, X86::SUB32mi, X86::SUB16mi,
  3006. X86::SUB8mi);
  3007. case X86ISD::SBB:
  3008. return SelectOpcode(X86::SBB64mi32, X86::SBB32mi, X86::SBB16mi,
  3009. X86::SBB8mi);
  3010. case X86ISD::AND:
  3011. return SelectOpcode(X86::AND64mi32, X86::AND32mi, X86::AND16mi,
  3012. X86::AND8mi);
  3013. case X86ISD::OR:
  3014. return SelectOpcode(X86::OR64mi32, X86::OR32mi, X86::OR16mi,
  3015. X86::OR8mi);
  3016. case X86ISD::XOR:
  3017. return SelectOpcode(X86::XOR64mi32, X86::XOR32mi, X86::XOR16mi,
  3018. X86::XOR8mi);
  3019. default:
  3020. llvm_unreachable("Invalid opcode!");
  3021. }
  3022. };
  3023. unsigned NewOpc = SelectRegOpcode(Opc);
  3024. SDValue Operand = StoredVal->getOperand(1-LoadOpNo);
  3025. // See if the operand is a constant that we can fold into an immediate
  3026. // operand.
  3027. if (auto *OperandC = dyn_cast<ConstantSDNode>(Operand)) {
  3028. int64_t OperandV = OperandC->getSExtValue();
  3029. // Check if we can shrink the operand enough to fit in an immediate (or
  3030. // fit into a smaller immediate) by negating it and switching the
  3031. // operation.
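// For example, 'add mem, 128' does not fit the i8-immediate form, but the
// negated 'sub mem, -128' does, as long as the carry flag is unused.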
  3032. if ((Opc == X86ISD::ADD || Opc == X86ISD::SUB) &&
  3033. ((MemVT != MVT::i8 && !isInt<8>(OperandV) && isInt<8>(-OperandV)) ||
  3034. (MemVT == MVT::i64 && !isInt<32>(OperandV) &&
  3035. isInt<32>(-OperandV))) &&
  3036. hasNoCarryFlagUses(StoredVal.getValue(1))) {
  3037. OperandV = -OperandV;
  3038. Opc = Opc == X86ISD::ADD ? X86ISD::SUB : X86ISD::ADD;
  3039. }
  3040. // First try to fit this into an Imm8 operand. If it doesn't fit, then try
  3041. // the larger immediate operand.
  3042. if (MemVT != MVT::i8 && isInt<8>(OperandV)) {
  3043. Operand = CurDAG->getTargetConstant(OperandV, SDLoc(Node), MemVT);
  3044. NewOpc = SelectImm8Opcode(Opc);
  3045. } else if (MemVT != MVT::i64 || isInt<32>(OperandV)) {
  3046. Operand = CurDAG->getTargetConstant(OperandV, SDLoc(Node), MemVT);
  3047. NewOpc = SelectImmOpcode(Opc);
  3048. }
  3049. }
  3050. if (Opc == X86ISD::ADC || Opc == X86ISD::SBB) {
  3051. SDValue CopyTo =
  3052. CurDAG->getCopyToReg(InputChain, SDLoc(Node), X86::EFLAGS,
  3053. StoredVal.getOperand(2), SDValue());
  3054. const SDValue Ops[] = {Base, Scale, Index, Disp,
  3055. Segment, Operand, CopyTo, CopyTo.getValue(1)};
  3056. Result = CurDAG->getMachineNode(NewOpc, SDLoc(Node), MVT::i32, MVT::Other,
  3057. Ops);
  3058. } else {
  3059. const SDValue Ops[] = {Base, Scale, Index, Disp,
  3060. Segment, Operand, InputChain};
  3061. Result = CurDAG->getMachineNode(NewOpc, SDLoc(Node), MVT::i32, MVT::Other,
  3062. Ops);
  3063. }
  3064. break;
  3065. }
  3066. default:
  3067. llvm_unreachable("Invalid opcode!");
  3068. }
  3069. MachineMemOperand *MemOps[] = {StoreNode->getMemOperand(),
  3070. LoadNode->getMemOperand()};
  3071. CurDAG->setNodeMemRefs(Result, MemOps);
  3072. // Update Load Chain uses as well.
  3073. ReplaceUses(SDValue(LoadNode, 1), SDValue(Result, 1));
  3074. ReplaceUses(SDValue(StoreNode, 0), SDValue(Result, 1));
  3075. ReplaceUses(SDValue(StoredVal.getNode(), 1), SDValue(Result, 0));
  3076. CurDAG->RemoveDeadNode(Node);
  3077. return true;
  3078. }
  3079. // See if this is an X & Mask that we can match to BEXTR/BZHI.
  3080. // Where Mask is one of the following patterns:
  3081. // a) x & (1 << nbits) - 1
  3082. // b) x & ~(-1 << nbits)
  3083. // c) x & (-1 >> (32 - y))
  3084. // d) x << (32 - y) >> (32 - y)
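// (Here 32 stands for the bit width of the value being masked; the 64-bit case
// is handled the same way.)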
  3085. bool X86DAGToDAGISel::matchBitExtract(SDNode *Node) {
  3086. assert(
  3087. (Node->getOpcode() == ISD::AND || Node->getOpcode() == ISD::SRL) &&
  3088. "Should be either an and-mask, or right-shift after clearing high bits.");
3089. // BEXTR is a BMI instruction, BZHI is a BMI2 instruction. We need at least one.
  3090. if (!Subtarget->hasBMI() && !Subtarget->hasBMI2())
  3091. return false;
  3092. MVT NVT = Node->getSimpleValueType(0);
  3093. // Only supported for 32 and 64 bits.
  3094. if (NVT != MVT::i32 && NVT != MVT::i64)
  3095. return false;
  3096. SDValue NBits;
  3097. bool NegateNBits;
3098. // If we have BMI2's BZHI, we are ok with multi-use patterns.
  3099. // Else, if we only have BMI1's BEXTR, we require one-use.
  3100. const bool AllowExtraUsesByDefault = Subtarget->hasBMI2();
  3101. auto checkUses = [AllowExtraUsesByDefault](
  3102. SDValue Op, unsigned NUses,
  3103. std::optional<bool> AllowExtraUses) {
  3104. return AllowExtraUses.value_or(AllowExtraUsesByDefault) ||
  3105. Op.getNode()->hasNUsesOfValue(NUses, Op.getResNo());
  3106. };
  3107. auto checkOneUse = [checkUses](SDValue Op,
  3108. std::optional<bool> AllowExtraUses =
  3109. std::nullopt) {
  3110. return checkUses(Op, 1, AllowExtraUses);
  3111. };
  3112. auto checkTwoUse = [checkUses](SDValue Op,
  3113. std::optional<bool> AllowExtraUses =
  3114. std::nullopt) {
  3115. return checkUses(Op, 2, AllowExtraUses);
  3116. };
  3117. auto peekThroughOneUseTruncation = [checkOneUse](SDValue V) {
  3118. if (V->getOpcode() == ISD::TRUNCATE && checkOneUse(V)) {
  3119. assert(V.getSimpleValueType() == MVT::i32 &&
  3120. V.getOperand(0).getSimpleValueType() == MVT::i64 &&
  3121. "Expected i64 -> i32 truncation");
  3122. V = V.getOperand(0);
  3123. }
  3124. return V;
  3125. };
  3126. // a) x & ((1 << nbits) + (-1))
  3127. auto matchPatternA = [checkOneUse, peekThroughOneUseTruncation, &NBits,
  3128. &NegateNBits](SDValue Mask) -> bool {
  3129. // Match `add`. Must only have one use!
  3130. if (Mask->getOpcode() != ISD::ADD || !checkOneUse(Mask))
  3131. return false;
3132. // We should be adding an all-ones constant (i.e. subtracting one).
  3133. if (!isAllOnesConstant(Mask->getOperand(1)))
  3134. return false;
  3135. // Match `1 << nbits`. Might be truncated. Must only have one use!
  3136. SDValue M0 = peekThroughOneUseTruncation(Mask->getOperand(0));
  3137. if (M0->getOpcode() != ISD::SHL || !checkOneUse(M0))
  3138. return false;
  3139. if (!isOneConstant(M0->getOperand(0)))
  3140. return false;
  3141. NBits = M0->getOperand(1);
  3142. NegateNBits = false;
  3143. return true;
  3144. };
  3145. auto isAllOnes = [this, peekThroughOneUseTruncation, NVT](SDValue V) {
  3146. V = peekThroughOneUseTruncation(V);
  3147. return CurDAG->MaskedValueIsAllOnes(
  3148. V, APInt::getLowBitsSet(V.getSimpleValueType().getSizeInBits(),
  3149. NVT.getSizeInBits()));
  3150. };
  3151. // b) x & ~(-1 << nbits)
  3152. auto matchPatternB = [checkOneUse, isAllOnes, peekThroughOneUseTruncation,
  3153. &NBits, &NegateNBits](SDValue Mask) -> bool {
  3154. // Match `~()`. Must only have one use!
  3155. if (Mask.getOpcode() != ISD::XOR || !checkOneUse(Mask))
  3156. return false;
  3157. // The -1 only has to be all-ones for the final Node's NVT.
  3158. if (!isAllOnes(Mask->getOperand(1)))
  3159. return false;
  3160. // Match `-1 << nbits`. Might be truncated. Must only have one use!
  3161. SDValue M0 = peekThroughOneUseTruncation(Mask->getOperand(0));
  3162. if (M0->getOpcode() != ISD::SHL || !checkOneUse(M0))
  3163. return false;
  3164. // The -1 only has to be all-ones for the final Node's NVT.
  3165. if (!isAllOnes(M0->getOperand(0)))
  3166. return false;
  3167. NBits = M0->getOperand(1);
  3168. NegateNBits = false;
  3169. return true;
  3170. };
  3171. // Try to match potentially-truncated shift amount as `(bitwidth - y)`,
  3172. // or leave the shift amount as-is, but then we'll have to negate it.
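// For example, with a 32-bit mask, a shift amount of (32 - y) is canonicalized
// to NBits = y directly; any other amount z is kept as-is and NegateNBits is
// set so that 32 - z is computed later.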
  3173. auto canonicalizeShiftAmt = [&NBits, &NegateNBits](SDValue ShiftAmt,
  3174. unsigned Bitwidth) {
  3175. NBits = ShiftAmt;
  3176. NegateNBits = true;
  3177. // Skip over a truncate of the shift amount, if any.
  3178. if (NBits.getOpcode() == ISD::TRUNCATE)
  3179. NBits = NBits.getOperand(0);
  3180. // Try to match the shift amount as (bitwidth - y). It should go away, too.
  3181. // If it doesn't match, that's fine, we'll just negate it ourselves.
  3182. if (NBits.getOpcode() != ISD::SUB)
  3183. return;
  3184. auto *V0 = dyn_cast<ConstantSDNode>(NBits.getOperand(0));
  3185. if (!V0 || V0->getZExtValue() != Bitwidth)
  3186. return;
  3187. NBits = NBits.getOperand(1);
  3188. NegateNBits = false;
  3189. };
  3190. // c) x & (-1 >> z) but then we'll have to subtract z from bitwidth
  3191. // or
  3192. // c) x & (-1 >> (32 - y))
  3193. auto matchPatternC = [checkOneUse, peekThroughOneUseTruncation, &NegateNBits,
  3194. canonicalizeShiftAmt](SDValue Mask) -> bool {
  3195. // The mask itself may be truncated.
  3196. Mask = peekThroughOneUseTruncation(Mask);
  3197. unsigned Bitwidth = Mask.getSimpleValueType().getSizeInBits();
  3198. // Match `l>>`. Must only have one use!
  3199. if (Mask.getOpcode() != ISD::SRL || !checkOneUse(Mask))
  3200. return false;
3201. // We should be shifting a truly all-ones constant.
  3202. if (!isAllOnesConstant(Mask.getOperand(0)))
  3203. return false;
  3204. SDValue M1 = Mask.getOperand(1);
  3205. // The shift amount should not be used externally.
  3206. if (!checkOneUse(M1))
  3207. return false;
  3208. canonicalizeShiftAmt(M1, Bitwidth);
  3209. // Pattern c. is non-canonical, and is expanded into pattern d. iff there
  3210. // is no extra use of the mask. Clearly, there was one since we are here.
  3211. // But at the same time, if we need to negate the shift amount,
  3212. // then we don't want the mask to stick around, else it's unprofitable.
  3213. return !NegateNBits;
  3214. };
  3215. SDValue X;
  3216. // d) x << z >> z but then we'll have to subtract z from bitwidth
  3217. // or
  3218. // d) x << (32 - y) >> (32 - y)
  3219. auto matchPatternD = [checkOneUse, checkTwoUse, canonicalizeShiftAmt,
  3220. AllowExtraUsesByDefault, &NegateNBits,
  3221. &X](SDNode *Node) -> bool {
  3222. if (Node->getOpcode() != ISD::SRL)
  3223. return false;
  3224. SDValue N0 = Node->getOperand(0);
  3225. if (N0->getOpcode() != ISD::SHL)
  3226. return false;
  3227. unsigned Bitwidth = N0.getSimpleValueType().getSizeInBits();
  3228. SDValue N1 = Node->getOperand(1);
  3229. SDValue N01 = N0->getOperand(1);
  3230. // Both of the shifts must be by the exact same value.
  3231. if (N1 != N01)
  3232. return false;
  3233. canonicalizeShiftAmt(N1, Bitwidth);
  3234. // There should not be any external uses of the inner shift / shift amount.
  3235. // Note that while we are generally okay with external uses given BMI2,
  3236. // iff we need to negate the shift amount, we are not okay with extra uses.
  3237. const bool AllowExtraUses = AllowExtraUsesByDefault && !NegateNBits;
  3238. if (!checkOneUse(N0, AllowExtraUses) || !checkTwoUse(N1, AllowExtraUses))
  3239. return false;
  3240. X = N0->getOperand(0);
  3241. return true;
  3242. };
  3243. auto matchLowBitMask = [matchPatternA, matchPatternB,
  3244. matchPatternC](SDValue Mask) -> bool {
  3245. return matchPatternA(Mask) || matchPatternB(Mask) || matchPatternC(Mask);
  3246. };
  3247. if (Node->getOpcode() == ISD::AND) {
  3248. X = Node->getOperand(0);
  3249. SDValue Mask = Node->getOperand(1);
  3250. if (matchLowBitMask(Mask)) {
  3251. // Great.
  3252. } else {
  3253. std::swap(X, Mask);
  3254. if (!matchLowBitMask(Mask))
  3255. return false;
  3256. }
  3257. } else if (!matchPatternD(Node))
  3258. return false;
  3259. // If we need to negate the shift amount, require BMI2 BZHI support.
  3260. // It's just too unprofitable for BMI1 BEXTR.
  3261. if (NegateNBits && !Subtarget->hasBMI2())
  3262. return false;
  3263. SDLoc DL(Node);
  3264. // Truncate the shift amount.
  3265. NBits = CurDAG->getNode(ISD::TRUNCATE, DL, MVT::i8, NBits);
  3266. insertDAGNode(*CurDAG, SDValue(Node, 0), NBits);
  3267. // Insert 8-bit NBits into lowest 8 bits of 32-bit register.
3268. // All the other bits are undefined; we do not care about them.
  3269. SDValue ImplDef = SDValue(
  3270. CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, DL, MVT::i32), 0);
  3271. insertDAGNode(*CurDAG, SDValue(Node, 0), ImplDef);
  3272. SDValue SRIdxVal = CurDAG->getTargetConstant(X86::sub_8bit, DL, MVT::i32);
  3273. insertDAGNode(*CurDAG, SDValue(Node, 0), SRIdxVal);
  3274. NBits = SDValue(CurDAG->getMachineNode(TargetOpcode::INSERT_SUBREG, DL,
  3275. MVT::i32, ImplDef, NBits, SRIdxVal),
  3276. 0);
  3277. insertDAGNode(*CurDAG, SDValue(Node, 0), NBits);
  3278. // We might have matched the amount of high bits to be cleared,
  3279. // but we want the amount of low bits to be kept, so negate it then.
  3280. if (NegateNBits) {
  3281. SDValue BitWidthC = CurDAG->getConstant(NVT.getSizeInBits(), DL, MVT::i32);
  3282. insertDAGNode(*CurDAG, SDValue(Node, 0), BitWidthC);
  3283. NBits = CurDAG->getNode(ISD::SUB, DL, MVT::i32, BitWidthC, NBits);
  3284. insertDAGNode(*CurDAG, SDValue(Node, 0), NBits);
  3285. }
  3286. if (Subtarget->hasBMI2()) {
3287. // Great, just emit the BZHI.
  3288. if (NVT != MVT::i32) {
  3289. // But have to place the bit count into the wide-enough register first.
  3290. NBits = CurDAG->getNode(ISD::ANY_EXTEND, DL, NVT, NBits);
  3291. insertDAGNode(*CurDAG, SDValue(Node, 0), NBits);
  3292. }
  3293. SDValue Extract = CurDAG->getNode(X86ISD::BZHI, DL, NVT, X, NBits);
  3294. ReplaceNode(Node, Extract.getNode());
  3295. SelectCode(Extract.getNode());
  3296. return true;
  3297. }
3298. // Else, if we do *NOT* have BMI2, let's find out if 'X' is
3299. // *logically* shifted (potentially with a one-use trunc in between),
3300. // and the truncation was the only use of the shift,
3301. // and if so look past the one-use truncation.
  3302. {
  3303. SDValue RealX = peekThroughOneUseTruncation(X);
  3304. // FIXME: only if the shift is one-use?
  3305. if (RealX != X && RealX.getOpcode() == ISD::SRL)
  3306. X = RealX;
  3307. }
  3308. MVT XVT = X.getSimpleValueType();
  3309. // Else, emitting BEXTR requires one more step.
  3310. // The 'control' of BEXTR has the pattern of:
  3311. // [15...8 bit][ 7...0 bit] location
  3312. // [ bit count][ shift] name
3313. // I.e. 0b00000010'00000001 means (x >> 0b1) & 0b11
  3314. // Shift NBits left by 8 bits, thus producing 'control'.
  3315. // This makes the low 8 bits to be zero.
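// For example, to keep the low 5 bits of (x >> 3), the control ends up as
// (5 << 8) | 3 = 0x0503 once the shift amount is OR'd in below.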
  3316. SDValue C8 = CurDAG->getConstant(8, DL, MVT::i8);
  3317. insertDAGNode(*CurDAG, SDValue(Node, 0), C8);
  3318. SDValue Control = CurDAG->getNode(ISD::SHL, DL, MVT::i32, NBits, C8);
  3319. insertDAGNode(*CurDAG, SDValue(Node, 0), Control);
  3320. // If the 'X' is *logically* shifted, we can fold that shift into 'control'.
  3321. // FIXME: only if the shift is one-use?
  3322. if (X.getOpcode() == ISD::SRL) {
  3323. SDValue ShiftAmt = X.getOperand(1);
  3324. X = X.getOperand(0);
  3325. assert(ShiftAmt.getValueType() == MVT::i8 &&
  3326. "Expected shift amount to be i8");
  3327. // Now, *zero*-extend the shift amount. The bits 8...15 *must* be zero!
  3328. // We could zext to i16 in some form, but we intentionally don't do that.
  3329. SDValue OrigShiftAmt = ShiftAmt;
  3330. ShiftAmt = CurDAG->getNode(ISD::ZERO_EXTEND, DL, MVT::i32, ShiftAmt);
  3331. insertDAGNode(*CurDAG, OrigShiftAmt, ShiftAmt);
  3332. // And now 'or' these low 8 bits of shift amount into the 'control'.
  3333. Control = CurDAG->getNode(ISD::OR, DL, MVT::i32, Control, ShiftAmt);
  3334. insertDAGNode(*CurDAG, SDValue(Node, 0), Control);
  3335. }
3336. // But we have to place the 'control' into a wide-enough register first.
  3337. if (XVT != MVT::i32) {
  3338. Control = CurDAG->getNode(ISD::ANY_EXTEND, DL, XVT, Control);
  3339. insertDAGNode(*CurDAG, SDValue(Node, 0), Control);
  3340. }
  3341. // And finally, form the BEXTR itself.
  3342. SDValue Extract = CurDAG->getNode(X86ISD::BEXTR, DL, XVT, X, Control);
  3343. // The 'X' was originally truncated. Do that now.
  3344. if (XVT != NVT) {
  3345. insertDAGNode(*CurDAG, SDValue(Node, 0), Extract);
  3346. Extract = CurDAG->getNode(ISD::TRUNCATE, DL, NVT, Extract);
  3347. }
  3348. ReplaceNode(Node, Extract.getNode());
  3349. SelectCode(Extract.getNode());
  3350. return true;
  3351. }
  3352. // See if this is an (X >> C1) & C2 that we can match to BEXTR/BEXTRI.
  3353. MachineSDNode *X86DAGToDAGISel::matchBEXTRFromAndImm(SDNode *Node) {
  3354. MVT NVT = Node->getSimpleValueType(0);
  3355. SDLoc dl(Node);
  3356. SDValue N0 = Node->getOperand(0);
  3357. SDValue N1 = Node->getOperand(1);
  3358. // If we have TBM we can use an immediate for the control. If we have BMI
  3359. // we should only do this if the BEXTR instruction is implemented well.
  3360. // Otherwise moving the control into a register makes this more costly.
  3361. // TODO: Maybe load folding, greater than 32-bit masks, or a guarantee of LICM
  3362. // hoisting the move immediate would make it worthwhile with a less optimal
  3363. // BEXTR?
  3364. bool PreferBEXTR =
  3365. Subtarget->hasTBM() || (Subtarget->hasBMI() && Subtarget->hasFastBEXTR());
  3366. if (!PreferBEXTR && !Subtarget->hasBMI2())
  3367. return nullptr;
  3368. // Must have a shift right.
  3369. if (N0->getOpcode() != ISD::SRL && N0->getOpcode() != ISD::SRA)
  3370. return nullptr;
  3371. // Shift can't have additional users.
  3372. if (!N0->hasOneUse())
  3373. return nullptr;
  3374. // Only supported for 32 and 64 bits.
  3375. if (NVT != MVT::i32 && NVT != MVT::i64)
  3376. return nullptr;
  3377. // Shift amount and RHS of and must be constant.
  3378. auto *MaskCst = dyn_cast<ConstantSDNode>(N1);
  3379. auto *ShiftCst = dyn_cast<ConstantSDNode>(N0->getOperand(1));
  3380. if (!MaskCst || !ShiftCst)
  3381. return nullptr;
  3382. // And RHS must be a mask.
  3383. uint64_t Mask = MaskCst->getZExtValue();
  3384. if (!isMask_64(Mask))
  3385. return nullptr;
  3386. uint64_t Shift = ShiftCst->getZExtValue();
  3387. uint64_t MaskSize = llvm::popcount(Mask);
  3388. // Don't interfere with something that can be handled by extracting AH.
  3389. // TODO: If we are able to fold a load, BEXTR might still be better than AH.
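// (Shift == 8 with an 8-bit mask is exactly (x >> 8) & 0xff, i.e. a read of AH.)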
  3390. if (Shift == 8 && MaskSize == 8)
  3391. return nullptr;
  3392. // Make sure we are only using bits that were in the original value, not
  3393. // shifted in.
  3394. if (Shift + MaskSize > NVT.getSizeInBits())
  3395. return nullptr;
  3396. // BZHI, if available, is always fast, unlike BEXTR. But even if we decide
  3397. // that we can't use BEXTR, it is only worthwhile using BZHI if the mask
  3398. // does not fit into 32 bits. Load folding is not a sufficient reason.
  3399. if (!PreferBEXTR && MaskSize <= 32)
  3400. return nullptr;
  3401. SDValue Control;
  3402. unsigned ROpc, MOpc;
  3403. if (!PreferBEXTR) {
  3404. assert(Subtarget->hasBMI2() && "We must have BMI2's BZHI then.");
  3405. // If we can't make use of BEXTR then we can't fuse shift+mask stages.
  3406. // Let's perform the mask first, and apply shift later. Note that we need to
  3407. // widen the mask to account for the fact that we'll apply shift afterwards!
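// (E.g. Shift == 4, MaskSize == 8: BZHI keeps the low 12 bits, and the SHR by
//  4 emitted at the end then leaves exactly the 8 bits we wanted.)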
  3408. Control = CurDAG->getTargetConstant(Shift + MaskSize, dl, NVT);
  3409. ROpc = NVT == MVT::i64 ? X86::BZHI64rr : X86::BZHI32rr;
  3410. MOpc = NVT == MVT::i64 ? X86::BZHI64rm : X86::BZHI32rm;
  3411. unsigned NewOpc = NVT == MVT::i64 ? X86::MOV32ri64 : X86::MOV32ri;
  3412. Control = SDValue(CurDAG->getMachineNode(NewOpc, dl, NVT, Control), 0);
  3413. } else {
  3414. // The 'control' of BEXTR has the pattern of:
  3415. // [15...8 bit][ 7...0 bit] location
  3416. // [ bit count][ shift] name
3417. // I.e. 0b00000011'00000001 means (x >> 0b1) & 0b11
  3418. Control = CurDAG->getTargetConstant(Shift | (MaskSize << 8), dl, NVT);
  3419. if (Subtarget->hasTBM()) {
  3420. ROpc = NVT == MVT::i64 ? X86::BEXTRI64ri : X86::BEXTRI32ri;
  3421. MOpc = NVT == MVT::i64 ? X86::BEXTRI64mi : X86::BEXTRI32mi;
  3422. } else {
  3423. assert(Subtarget->hasBMI() && "We must have BMI1's BEXTR then.");
3424. // BMI requires the immediate to be placed in a register.
  3425. ROpc = NVT == MVT::i64 ? X86::BEXTR64rr : X86::BEXTR32rr;
  3426. MOpc = NVT == MVT::i64 ? X86::BEXTR64rm : X86::BEXTR32rm;
  3427. unsigned NewOpc = NVT == MVT::i64 ? X86::MOV32ri64 : X86::MOV32ri;
  3428. Control = SDValue(CurDAG->getMachineNode(NewOpc, dl, NVT, Control), 0);
  3429. }
  3430. }
  3431. MachineSDNode *NewNode;
  3432. SDValue Input = N0->getOperand(0);
  3433. SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
  3434. if (tryFoldLoad(Node, N0.getNode(), Input, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
  3435. SDValue Ops[] = {
  3436. Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Control, Input.getOperand(0)};
  3437. SDVTList VTs = CurDAG->getVTList(NVT, MVT::i32, MVT::Other);
  3438. NewNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
  3439. // Update the chain.
  3440. ReplaceUses(Input.getValue(1), SDValue(NewNode, 2));
  3441. // Record the mem-refs
  3442. CurDAG->setNodeMemRefs(NewNode, {cast<LoadSDNode>(Input)->getMemOperand()});
  3443. } else {
  3444. NewNode = CurDAG->getMachineNode(ROpc, dl, NVT, MVT::i32, Input, Control);
  3445. }
  3446. if (!PreferBEXTR) {
  3447. // We still need to apply the shift.
  3448. SDValue ShAmt = CurDAG->getTargetConstant(Shift, dl, NVT);
  3449. unsigned NewOpc = NVT == MVT::i64 ? X86::SHR64ri : X86::SHR32ri;
  3450. NewNode =
  3451. CurDAG->getMachineNode(NewOpc, dl, NVT, SDValue(NewNode, 0), ShAmt);
  3452. }
  3453. return NewNode;
  3454. }
3455. // Emit a PCMPISTR(I/M) instruction.
  3456. MachineSDNode *X86DAGToDAGISel::emitPCMPISTR(unsigned ROpc, unsigned MOpc,
  3457. bool MayFoldLoad, const SDLoc &dl,
  3458. MVT VT, SDNode *Node) {
  3459. SDValue N0 = Node->getOperand(0);
  3460. SDValue N1 = Node->getOperand(1);
  3461. SDValue Imm = Node->getOperand(2);
  3462. auto *Val = cast<ConstantSDNode>(Imm)->getConstantIntValue();
  3463. Imm = CurDAG->getTargetConstant(*Val, SDLoc(Node), Imm.getValueType());
  3464. // Try to fold a load. No need to check alignment.
  3465. SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
  3466. if (MayFoldLoad && tryFoldLoad(Node, N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
  3467. SDValue Ops[] = { N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Imm,
  3468. N1.getOperand(0) };
  3469. SDVTList VTs = CurDAG->getVTList(VT, MVT::i32, MVT::Other);
  3470. MachineSDNode *CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
  3471. // Update the chain.
  3472. ReplaceUses(N1.getValue(1), SDValue(CNode, 2));
  3473. // Record the mem-refs
  3474. CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(N1)->getMemOperand()});
  3475. return CNode;
  3476. }
  3477. SDValue Ops[] = { N0, N1, Imm };
  3478. SDVTList VTs = CurDAG->getVTList(VT, MVT::i32);
  3479. MachineSDNode *CNode = CurDAG->getMachineNode(ROpc, dl, VTs, Ops);
  3480. return CNode;
  3481. }
3482. // Emit a PCMPESTR(I/M) instruction. Also return the Glue result in case we need
3483. // to emit a second instruction after this one. This is needed since we have two
3484. // CopyToReg nodes glued before this and we need to continue that glue through.
  3485. MachineSDNode *X86DAGToDAGISel::emitPCMPESTR(unsigned ROpc, unsigned MOpc,
  3486. bool MayFoldLoad, const SDLoc &dl,
  3487. MVT VT, SDNode *Node,
  3488. SDValue &InFlag) {
  3489. SDValue N0 = Node->getOperand(0);
  3490. SDValue N2 = Node->getOperand(2);
  3491. SDValue Imm = Node->getOperand(4);
  3492. auto *Val = cast<ConstantSDNode>(Imm)->getConstantIntValue();
  3493. Imm = CurDAG->getTargetConstant(*Val, SDLoc(Node), Imm.getValueType());
  3494. // Try to fold a load. No need to check alignment.
  3495. SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
  3496. if (MayFoldLoad && tryFoldLoad(Node, N2, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
  3497. SDValue Ops[] = { N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Imm,
  3498. N2.getOperand(0), InFlag };
  3499. SDVTList VTs = CurDAG->getVTList(VT, MVT::i32, MVT::Other, MVT::Glue);
  3500. MachineSDNode *CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
  3501. InFlag = SDValue(CNode, 3);
  3502. // Update the chain.
  3503. ReplaceUses(N2.getValue(1), SDValue(CNode, 2));
  3504. // Record the mem-refs
  3505. CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(N2)->getMemOperand()});
  3506. return CNode;
  3507. }
  3508. SDValue Ops[] = { N0, N2, Imm, InFlag };
  3509. SDVTList VTs = CurDAG->getVTList(VT, MVT::i32, MVT::Glue);
  3510. MachineSDNode *CNode = CurDAG->getMachineNode(ROpc, dl, VTs, Ops);
  3511. InFlag = SDValue(CNode, 2);
  3512. return CNode;
  3513. }
  3514. bool X86DAGToDAGISel::tryShiftAmountMod(SDNode *N) {
  3515. EVT VT = N->getValueType(0);
  3516. // Only handle scalar shifts.
  3517. if (VT.isVector())
  3518. return false;
  3519. // Narrower shifts only mask to 5 bits in hardware.
  3520. unsigned Size = VT == MVT::i64 ? 64 : 32;
  3521. SDValue OrigShiftAmt = N->getOperand(1);
  3522. SDValue ShiftAmt = OrigShiftAmt;
  3523. SDLoc DL(N);
  3524. // Skip over a truncate of the shift amount.
  3525. if (ShiftAmt->getOpcode() == ISD::TRUNCATE)
  3526. ShiftAmt = ShiftAmt->getOperand(0);
  3527. // This function is called after X86DAGToDAGISel::matchBitExtract(),
  3528. // so we are not afraid that we might mess up BZHI/BEXTR pattern.
  3529. SDValue NewShiftAmt;
  3530. if (ShiftAmt->getOpcode() == ISD::ADD || ShiftAmt->getOpcode() == ISD::SUB ||
  3531. ShiftAmt->getOpcode() == ISD::XOR) {
  3532. SDValue Add0 = ShiftAmt->getOperand(0);
  3533. SDValue Add1 = ShiftAmt->getOperand(1);
  3534. auto *Add0C = dyn_cast<ConstantSDNode>(Add0);
  3535. auto *Add1C = dyn_cast<ConstantSDNode>(Add1);
  3536. // If we are shifting by X+/-/^N where N == 0 mod Size, then just shift by X
  3537. // to avoid the ADD/SUB/XOR.
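// (Hardware masks the shift amount with Size-1, so e.g. a 32-bit shift by
//  (x + 32) shifts by exactly the same amount as a shift by x.)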
  3538. if (Add1C && Add1C->getAPIntValue().urem(Size) == 0) {
  3539. NewShiftAmt = Add0;
  3540. } else if (ShiftAmt->getOpcode() != ISD::ADD &&
  3541. ((Add0C && Add0C->getAPIntValue().urem(Size) == Size - 1) ||
  3542. (Add1C && Add1C->getAPIntValue().urem(Size) == Size - 1))) {
3543. // If we are computing (Size*N-1) - X or (Size*N-1) ^ X, then on the low
3544. // bits the shift consumes this is just NOT X, so we can replace it with a
3545. // NOT. In the XOR case this may save some code size; in the SUB case it may also save a move.
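// (E.g. for Size == 32, shifting by (31 - x) or by (x ^ 31) is the same as
//  shifting by ~x once the hardware masks the amount to the low 5 bits.)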
  3546. assert(Add0C == nullptr || Add1C == nullptr);
  3547. // We can only do N-X, not X-N
  3548. if (ShiftAmt->getOpcode() == ISD::SUB && Add0C == nullptr)
  3549. return false;
  3550. EVT OpVT = ShiftAmt.getValueType();
  3551. SDValue AllOnes = CurDAG->getAllOnesConstant(DL, OpVT);
  3552. NewShiftAmt = CurDAG->getNode(ISD::XOR, DL, OpVT,
  3553. Add0C == nullptr ? Add0 : Add1, AllOnes);
  3554. insertDAGNode(*CurDAG, OrigShiftAmt, AllOnes);
  3555. insertDAGNode(*CurDAG, OrigShiftAmt, NewShiftAmt);
  3556. // If we are shifting by N-X where N == 0 mod Size, then just shift by
  3557. // -X to generate a NEG instead of a SUB of a constant.
  3558. } else if (ShiftAmt->getOpcode() == ISD::SUB && Add0C &&
  3559. Add0C->getZExtValue() != 0) {
  3560. EVT SubVT = ShiftAmt.getValueType();
  3561. SDValue X;
  3562. if (Add0C->getZExtValue() % Size == 0)
  3563. X = Add1;
  3564. else if (ShiftAmt.hasOneUse() && Size == 64 &&
  3565. Add0C->getZExtValue() % 32 == 0) {
  3566. // We have a 64-bit shift by (n*32-x), turn it into -(x+n*32).
  3567. // This is mainly beneficial if we already compute (x+n*32).
  3568. if (Add1.getOpcode() == ISD::TRUNCATE) {
  3569. Add1 = Add1.getOperand(0);
  3570. SubVT = Add1.getValueType();
  3571. }
  3572. if (Add0.getValueType() != SubVT) {
  3573. Add0 = CurDAG->getZExtOrTrunc(Add0, DL, SubVT);
  3574. insertDAGNode(*CurDAG, OrigShiftAmt, Add0);
  3575. }
  3576. X = CurDAG->getNode(ISD::ADD, DL, SubVT, Add1, Add0);
  3577. insertDAGNode(*CurDAG, OrigShiftAmt, X);
  3578. } else
  3579. return false;
  3580. // Insert a negate op.
  3581. // TODO: This isn't guaranteed to replace the sub if there is a logic cone
  3582. // that uses it that's not a shift.
  3583. SDValue Zero = CurDAG->getConstant(0, DL, SubVT);
  3584. SDValue Neg = CurDAG->getNode(ISD::SUB, DL, SubVT, Zero, X);
  3585. NewShiftAmt = Neg;
  3586. // Insert these operands into a valid topological order so they can
  3587. // get selected independently.
  3588. insertDAGNode(*CurDAG, OrigShiftAmt, Zero);
  3589. insertDAGNode(*CurDAG, OrigShiftAmt, Neg);
  3590. } else
  3591. return false;
  3592. } else
  3593. return false;
  3594. if (NewShiftAmt.getValueType() != MVT::i8) {
  3595. // Need to truncate the shift amount.
  3596. NewShiftAmt = CurDAG->getNode(ISD::TRUNCATE, DL, MVT::i8, NewShiftAmt);
  3597. // Add to a correct topological ordering.
  3598. insertDAGNode(*CurDAG, OrigShiftAmt, NewShiftAmt);
  3599. }
  3600. // Insert a new mask to keep the shift amount legal. This should be removed
  3601. // by isel patterns.
  3602. NewShiftAmt = CurDAG->getNode(ISD::AND, DL, MVT::i8, NewShiftAmt,
  3603. CurDAG->getConstant(Size - 1, DL, MVT::i8));
  3604. // Place in a correct topological ordering.
  3605. insertDAGNode(*CurDAG, OrigShiftAmt, NewShiftAmt);
  3606. SDNode *UpdatedNode = CurDAG->UpdateNodeOperands(N, N->getOperand(0),
  3607. NewShiftAmt);
  3608. if (UpdatedNode != N) {
  3609. // If we found an existing node, we should replace ourselves with that node
  3610. // and wait for it to be selected after its other users.
  3611. ReplaceNode(N, UpdatedNode);
  3612. return true;
  3613. }
  3614. // If the original shift amount is now dead, delete it so that we don't run
  3615. // it through isel.
  3616. if (OrigShiftAmt.getNode()->use_empty())
  3617. CurDAG->RemoveDeadNode(OrigShiftAmt.getNode());
  3618. // Now that we've optimized the shift amount, defer to normal isel to get
  3619. // load folding and legacy vs BMI2 selection without repeating it here.
  3620. SelectCode(N);
  3621. return true;
  3622. }
  3623. bool X86DAGToDAGISel::tryShrinkShlLogicImm(SDNode *N) {
  3624. MVT NVT = N->getSimpleValueType(0);
  3625. unsigned Opcode = N->getOpcode();
  3626. SDLoc dl(N);
  3627. // For operations of the form (x << C1) op C2, check if we can use a smaller
  3628. // encoding for C2 by transforming it into (x op (C2>>C1)) << C1.
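// (E.g. for i64, "(x << 32) & 0xFF00000000" needs the mask in a register via
//  MOV64ri, while "(x & 0xFF) << 32" gets by with a 32-bit immediate, and the
//  AND may even become a MOVZX.)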
  3629. SDValue Shift = N->getOperand(0);
  3630. SDValue N1 = N->getOperand(1);
  3631. auto *Cst = dyn_cast<ConstantSDNode>(N1);
  3632. if (!Cst)
  3633. return false;
  3634. int64_t Val = Cst->getSExtValue();
  3635. // If we have an any_extend feeding the AND, look through it to see if there
  3636. // is a shift behind it. But only if the AND doesn't use the extended bits.
  3637. // FIXME: Generalize this to other ANY_EXTEND than i32 to i64?
  3638. bool FoundAnyExtend = false;
  3639. if (Shift.getOpcode() == ISD::ANY_EXTEND && Shift.hasOneUse() &&
  3640. Shift.getOperand(0).getSimpleValueType() == MVT::i32 &&
  3641. isUInt<32>(Val)) {
  3642. FoundAnyExtend = true;
  3643. Shift = Shift.getOperand(0);
  3644. }
  3645. if (Shift.getOpcode() != ISD::SHL || !Shift.hasOneUse())
  3646. return false;
  3647. // i8 is unshrinkable, i16 should be promoted to i32.
  3648. if (NVT != MVT::i32 && NVT != MVT::i64)
  3649. return false;
  3650. auto *ShlCst = dyn_cast<ConstantSDNode>(Shift.getOperand(1));
  3651. if (!ShlCst)
  3652. return false;
  3653. uint64_t ShAmt = ShlCst->getZExtValue();
  3654. // Make sure that we don't change the operation by removing bits.
  3655. // This only matters for OR and XOR, AND is unaffected.
  3656. uint64_t RemovedBitsMask = (1ULL << ShAmt) - 1;
  3657. if (Opcode != ISD::AND && (Val & RemovedBitsMask) != 0)
  3658. return false;
  3659. // Check the minimum bitwidth for the new constant.
  3660. // TODO: Using 16 and 8 bit operations is also possible for or32 & xor32.
  3661. auto CanShrinkImmediate = [&](int64_t &ShiftedVal) {
  3662. if (Opcode == ISD::AND) {
  3663. // AND32ri is the same as AND64ri32 with zext imm.
  3664. // Try this before sign extended immediates below.
  3665. ShiftedVal = (uint64_t)Val >> ShAmt;
  3666. if (NVT == MVT::i64 && !isUInt<32>(Val) && isUInt<32>(ShiftedVal))
  3667. return true;
  3668. // Also swap order when the AND can become MOVZX.
  3669. if (ShiftedVal == UINT8_MAX || ShiftedVal == UINT16_MAX)
  3670. return true;
  3671. }
  3672. ShiftedVal = Val >> ShAmt;
  3673. if ((!isInt<8>(Val) && isInt<8>(ShiftedVal)) ||
  3674. (!isInt<32>(Val) && isInt<32>(ShiftedVal)))
  3675. return true;
  3676. if (Opcode != ISD::AND) {
  3677. // MOV32ri+OR64r/XOR64r is cheaper than MOV64ri64+OR64rr/XOR64rr
  3678. ShiftedVal = (uint64_t)Val >> ShAmt;
  3679. if (NVT == MVT::i64 && !isUInt<32>(Val) && isUInt<32>(ShiftedVal))
  3680. return true;
  3681. }
  3682. return false;
  3683. };
  3684. int64_t ShiftedVal;
  3685. if (!CanShrinkImmediate(ShiftedVal))
  3686. return false;
3687. // Ok, we can reorder to get a smaller immediate.
3688. // But it's possible the original immediate allowed an AND to become MOVZX.
3689. // Do this check late so that the MaskedValueIsZero call happens as late as
3690. // possible.
  3691. if (Opcode == ISD::AND) {
  3692. // Find the smallest zext this could possibly be.
  3693. unsigned ZExtWidth = Cst->getAPIntValue().getActiveBits();
  3694. ZExtWidth = PowerOf2Ceil(std::max(ZExtWidth, 8U));
  3695. // Figure out which bits need to be zero to achieve that mask.
  3696. APInt NeededMask = APInt::getLowBitsSet(NVT.getSizeInBits(),
  3697. ZExtWidth);
  3698. NeededMask &= ~Cst->getAPIntValue();
  3699. if (CurDAG->MaskedValueIsZero(N->getOperand(0), NeededMask))
  3700. return false;
  3701. }
  3702. SDValue X = Shift.getOperand(0);
  3703. if (FoundAnyExtend) {
  3704. SDValue NewX = CurDAG->getNode(ISD::ANY_EXTEND, dl, NVT, X);
  3705. insertDAGNode(*CurDAG, SDValue(N, 0), NewX);
  3706. X = NewX;
  3707. }
  3708. SDValue NewCst = CurDAG->getConstant(ShiftedVal, dl, NVT);
  3709. insertDAGNode(*CurDAG, SDValue(N, 0), NewCst);
  3710. SDValue NewBinOp = CurDAG->getNode(Opcode, dl, NVT, X, NewCst);
  3711. insertDAGNode(*CurDAG, SDValue(N, 0), NewBinOp);
  3712. SDValue NewSHL = CurDAG->getNode(ISD::SHL, dl, NVT, NewBinOp,
  3713. Shift.getOperand(1));
  3714. ReplaceNode(N, NewSHL.getNode());
  3715. SelectCode(NewSHL.getNode());
  3716. return true;
  3717. }
  3718. bool X86DAGToDAGISel::matchVPTERNLOG(SDNode *Root, SDNode *ParentA,
  3719. SDNode *ParentB, SDNode *ParentC,
  3720. SDValue A, SDValue B, SDValue C,
  3721. uint8_t Imm) {
  3722. assert(A.isOperandOf(ParentA) && B.isOperandOf(ParentB) &&
  3723. C.isOperandOf(ParentC) && "Incorrect parent node");
  3724. auto tryFoldLoadOrBCast =
  3725. [this](SDNode *Root, SDNode *P, SDValue &L, SDValue &Base, SDValue &Scale,
  3726. SDValue &Index, SDValue &Disp, SDValue &Segment) {
  3727. if (tryFoldLoad(Root, P, L, Base, Scale, Index, Disp, Segment))
  3728. return true;
  3729. // Not a load, check for broadcast which may be behind a bitcast.
  3730. if (L.getOpcode() == ISD::BITCAST && L.hasOneUse()) {
  3731. P = L.getNode();
  3732. L = L.getOperand(0);
  3733. }
  3734. if (L.getOpcode() != X86ISD::VBROADCAST_LOAD)
  3735. return false;
  3736. // Only 32 and 64 bit broadcasts are supported.
  3737. auto *MemIntr = cast<MemIntrinsicSDNode>(L);
  3738. unsigned Size = MemIntr->getMemoryVT().getSizeInBits();
  3739. if (Size != 32 && Size != 64)
  3740. return false;
  3741. return tryFoldBroadcast(Root, P, L, Base, Scale, Index, Disp, Segment);
  3742. };
  3743. bool FoldedLoad = false;
  3744. SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
  3745. if (tryFoldLoadOrBCast(Root, ParentC, C, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
  3746. FoldedLoad = true;
  3747. } else if (tryFoldLoadOrBCast(Root, ParentA, A, Tmp0, Tmp1, Tmp2, Tmp3,
  3748. Tmp4)) {
  3749. FoldedLoad = true;
  3750. std::swap(A, C);
  3751. // Swap bits 1/4 and 3/6.
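// (Bit i of the immediate is the result for inputs a = i[2], b = i[1],
//  c = i[0], so exchanging operands A and C swaps immediate bits 1<->4 and
//  3<->6 while bits 0, 2, 5 and 7 (mask 0xa5) stay in place.)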
  3752. uint8_t OldImm = Imm;
  3753. Imm = OldImm & 0xa5;
  3754. if (OldImm & 0x02) Imm |= 0x10;
  3755. if (OldImm & 0x10) Imm |= 0x02;
  3756. if (OldImm & 0x08) Imm |= 0x40;
  3757. if (OldImm & 0x40) Imm |= 0x08;
  3758. } else if (tryFoldLoadOrBCast(Root, ParentB, B, Tmp0, Tmp1, Tmp2, Tmp3,
  3759. Tmp4)) {
  3760. FoldedLoad = true;
  3761. std::swap(B, C);
  3762. // Swap bits 1/2 and 5/6.
  3763. uint8_t OldImm = Imm;
  3764. Imm = OldImm & 0x99;
  3765. if (OldImm & 0x02) Imm |= 0x04;
  3766. if (OldImm & 0x04) Imm |= 0x02;
  3767. if (OldImm & 0x20) Imm |= 0x40;
  3768. if (OldImm & 0x40) Imm |= 0x20;
  3769. }
  3770. SDLoc DL(Root);
  3771. SDValue TImm = CurDAG->getTargetConstant(Imm, DL, MVT::i8);
  3772. MVT NVT = Root->getSimpleValueType(0);
  3773. MachineSDNode *MNode;
  3774. if (FoldedLoad) {
  3775. SDVTList VTs = CurDAG->getVTList(NVT, MVT::Other);
  3776. unsigned Opc;
  3777. if (C.getOpcode() == X86ISD::VBROADCAST_LOAD) {
  3778. auto *MemIntr = cast<MemIntrinsicSDNode>(C);
  3779. unsigned EltSize = MemIntr->getMemoryVT().getSizeInBits();
  3780. assert((EltSize == 32 || EltSize == 64) && "Unexpected broadcast size!");
  3781. bool UseD = EltSize == 32;
  3782. if (NVT.is128BitVector())
  3783. Opc = UseD ? X86::VPTERNLOGDZ128rmbi : X86::VPTERNLOGQZ128rmbi;
  3784. else if (NVT.is256BitVector())
  3785. Opc = UseD ? X86::VPTERNLOGDZ256rmbi : X86::VPTERNLOGQZ256rmbi;
  3786. else if (NVT.is512BitVector())
  3787. Opc = UseD ? X86::VPTERNLOGDZrmbi : X86::VPTERNLOGQZrmbi;
  3788. else
  3789. llvm_unreachable("Unexpected vector size!");
  3790. } else {
  3791. bool UseD = NVT.getVectorElementType() == MVT::i32;
  3792. if (NVT.is128BitVector())
  3793. Opc = UseD ? X86::VPTERNLOGDZ128rmi : X86::VPTERNLOGQZ128rmi;
  3794. else if (NVT.is256BitVector())
  3795. Opc = UseD ? X86::VPTERNLOGDZ256rmi : X86::VPTERNLOGQZ256rmi;
  3796. else if (NVT.is512BitVector())
  3797. Opc = UseD ? X86::VPTERNLOGDZrmi : X86::VPTERNLOGQZrmi;
  3798. else
  3799. llvm_unreachable("Unexpected vector size!");
  3800. }
  3801. SDValue Ops[] = {A, B, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, TImm, C.getOperand(0)};
  3802. MNode = CurDAG->getMachineNode(Opc, DL, VTs, Ops);
  3803. // Update the chain.
  3804. ReplaceUses(C.getValue(1), SDValue(MNode, 1));
  3805. // Record the mem-refs
  3806. CurDAG->setNodeMemRefs(MNode, {cast<MemSDNode>(C)->getMemOperand()});
  3807. } else {
  3808. bool UseD = NVT.getVectorElementType() == MVT::i32;
  3809. unsigned Opc;
  3810. if (NVT.is128BitVector())
  3811. Opc = UseD ? X86::VPTERNLOGDZ128rri : X86::VPTERNLOGQZ128rri;
  3812. else if (NVT.is256BitVector())
  3813. Opc = UseD ? X86::VPTERNLOGDZ256rri : X86::VPTERNLOGQZ256rri;
  3814. else if (NVT.is512BitVector())
  3815. Opc = UseD ? X86::VPTERNLOGDZrri : X86::VPTERNLOGQZrri;
  3816. else
  3817. llvm_unreachable("Unexpected vector size!");
  3818. MNode = CurDAG->getMachineNode(Opc, DL, NVT, {A, B, C, TImm});
  3819. }
  3820. ReplaceUses(SDValue(Root, 0), SDValue(MNode, 0));
  3821. CurDAG->RemoveDeadNode(Root);
  3822. return true;
  3823. }
  3824. // Try to match two logic ops to a VPTERNLOG.
  3825. // FIXME: Handle more complex patterns that use an operand more than once?
  3826. bool X86DAGToDAGISel::tryVPTERNLOG(SDNode *N) {
  3827. MVT NVT = N->getSimpleValueType(0);
  3828. // Make sure we support VPTERNLOG.
  3829. if (!NVT.isVector() || !Subtarget->hasAVX512() ||
  3830. NVT.getVectorElementType() == MVT::i1)
  3831. return false;
  3832. // We need VLX for 128/256-bit.
  3833. if (!(Subtarget->hasVLX() || NVT.is512BitVector()))
  3834. return false;
  3835. SDValue N0 = N->getOperand(0);
  3836. SDValue N1 = N->getOperand(1);
  3837. auto getFoldableLogicOp = [](SDValue Op) {
  3838. // Peek through single use bitcast.
  3839. if (Op.getOpcode() == ISD::BITCAST && Op.hasOneUse())
  3840. Op = Op.getOperand(0);
  3841. if (!Op.hasOneUse())
  3842. return SDValue();
  3843. unsigned Opc = Op.getOpcode();
  3844. if (Opc == ISD::AND || Opc == ISD::OR || Opc == ISD::XOR ||
  3845. Opc == X86ISD::ANDNP)
  3846. return Op;
  3847. return SDValue();
  3848. };
  3849. SDValue A, FoldableOp;
  3850. if ((FoldableOp = getFoldableLogicOp(N1))) {
  3851. A = N0;
  3852. } else if ((FoldableOp = getFoldableLogicOp(N0))) {
  3853. A = N1;
  3854. } else
  3855. return false;
  3856. SDValue B = FoldableOp.getOperand(0);
  3857. SDValue C = FoldableOp.getOperand(1);
  3858. SDNode *ParentA = N;
  3859. SDNode *ParentB = FoldableOp.getNode();
  3860. SDNode *ParentC = FoldableOp.getNode();
  3861. // We can build the appropriate control immediate by performing the logic
  3862. // operation we're matching using these constants for A, B, and C.
  3863. uint8_t TernlogMagicA = 0xf0;
  3864. uint8_t TernlogMagicB = 0xcc;
  3865. uint8_t TernlogMagicC = 0xaa;
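// (These are the truth tables of A, B and C themselves: bit i of an immediate
//  is the result for inputs a = i[2], b = i[1], c = i[0]. Combining them with
//  the matched operations yields the final immediate, e.g. A & B -> 0xf0 & 0xcc
//  == 0xc0.)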
  3866. // Some of the inputs may be inverted, peek through them and invert the
  3867. // magic values accordingly.
  3868. // TODO: There may be a bitcast before the xor that we should peek through.
  3869. auto PeekThroughNot = [](SDValue &Op, SDNode *&Parent, uint8_t &Magic) {
  3870. if (Op.getOpcode() == ISD::XOR && Op.hasOneUse() &&
  3871. ISD::isBuildVectorAllOnes(Op.getOperand(1).getNode())) {
  3872. Magic = ~Magic;
  3873. Parent = Op.getNode();
  3874. Op = Op.getOperand(0);
  3875. }
  3876. };
  3877. PeekThroughNot(A, ParentA, TernlogMagicA);
  3878. PeekThroughNot(B, ParentB, TernlogMagicB);
  3879. PeekThroughNot(C, ParentC, TernlogMagicC);
  3880. uint8_t Imm;
  3881. switch (FoldableOp.getOpcode()) {
  3882. default: llvm_unreachable("Unexpected opcode!");
  3883. case ISD::AND: Imm = TernlogMagicB & TernlogMagicC; break;
  3884. case ISD::OR: Imm = TernlogMagicB | TernlogMagicC; break;
  3885. case ISD::XOR: Imm = TernlogMagicB ^ TernlogMagicC; break;
  3886. case X86ISD::ANDNP: Imm = ~(TernlogMagicB) & TernlogMagicC; break;
  3887. }
  3888. switch (N->getOpcode()) {
  3889. default: llvm_unreachable("Unexpected opcode!");
  3890. case X86ISD::ANDNP:
  3891. if (A == N0)
  3892. Imm &= ~TernlogMagicA;
  3893. else
  3894. Imm = ~(Imm) & TernlogMagicA;
  3895. break;
  3896. case ISD::AND: Imm &= TernlogMagicA; break;
  3897. case ISD::OR: Imm |= TernlogMagicA; break;
  3898. case ISD::XOR: Imm ^= TernlogMagicA; break;
  3899. }
  3900. return matchVPTERNLOG(N, ParentA, ParentB, ParentC, A, B, C, Imm);
  3901. }
  3902. /// If the high bits of an 'and' operand are known zero, try setting the
  3903. /// high bits of an 'and' constant operand to produce a smaller encoding by
  3904. /// creating a small, sign-extended negative immediate rather than a large
  3905. /// positive one. This reverses a transform in SimplifyDemandedBits that
  3906. /// shrinks mask constants by clearing bits. There is also a possibility that
  3907. /// the 'and' mask can be made -1, so the 'and' itself is unnecessary. In that
  3908. /// case, just replace the 'and'. Return 'true' if the node is replaced.
  3909. bool X86DAGToDAGISel::shrinkAndImmediate(SDNode *And) {
  3910. // i8 is unshrinkable, i16 should be promoted to i32, and vector ops don't
  3911. // have immediate operands.
  3912. MVT VT = And->getSimpleValueType(0);
  3913. if (VT != MVT::i32 && VT != MVT::i64)
  3914. return false;
  3915. auto *And1C = dyn_cast<ConstantSDNode>(And->getOperand(1));
  3916. if (!And1C)
  3917. return false;
3918. // Bail out if the mask constant is already negative. It can't shrink any more.
  3919. // If the upper 32 bits of a 64 bit mask are all zeros, we have special isel
  3920. // patterns to use a 32-bit and instead of a 64-bit and by relying on the
  3921. // implicit zeroing of 32 bit ops. So we should check if the lower 32 bits
  3922. // are negative too.
  3923. APInt MaskVal = And1C->getAPIntValue();
  3924. unsigned MaskLZ = MaskVal.countLeadingZeros();
  3925. if (!MaskLZ || (VT == MVT::i64 && MaskLZ == 32))
  3926. return false;
  3927. // Don't extend into the upper 32 bits of a 64 bit mask.
  3928. if (VT == MVT::i64 && MaskLZ >= 32) {
  3929. MaskLZ -= 32;
  3930. MaskVal = MaskVal.trunc(32);
  3931. }
  3932. SDValue And0 = And->getOperand(0);
  3933. APInt HighZeros = APInt::getHighBitsSet(MaskVal.getBitWidth(), MaskLZ);
  3934. APInt NegMaskVal = MaskVal | HighZeros;
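// (E.g. for i32, mask 0x7FFFFFF0 needs a 4-byte immediate, but if the top bit
//  of the other operand is known zero we can instead use 0xFFFFFFF0 == -16,
//  which fits in a sign-extended 8-bit immediate.)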
  3935. // If a negative constant would not allow a smaller encoding, there's no need
  3936. // to continue. Only change the constant when we know it's a win.
  3937. unsigned MinWidth = NegMaskVal.getMinSignedBits();
  3938. if (MinWidth > 32 || (MinWidth > 8 && MaskVal.getMinSignedBits() <= 32))
  3939. return false;
  3940. // Extend masks if we truncated above.
  3941. if (VT == MVT::i64 && MaskVal.getBitWidth() < 64) {
  3942. NegMaskVal = NegMaskVal.zext(64);
  3943. HighZeros = HighZeros.zext(64);
  3944. }
  3945. // The variable operand must be all zeros in the top bits to allow using the
  3946. // new, negative constant as the mask.
  3947. if (!CurDAG->MaskedValueIsZero(And0, HighZeros))
  3948. return false;
  3949. // Check if the mask is -1. In that case, this is an unnecessary instruction
  3950. // that escaped earlier analysis.
  3951. if (NegMaskVal.isAllOnes()) {
  3952. ReplaceNode(And, And0.getNode());
  3953. return true;
  3954. }
  3955. // A negative mask allows a smaller encoding. Create a new 'and' node.
  3956. SDValue NewMask = CurDAG->getConstant(NegMaskVal, SDLoc(And), VT);
  3957. insertDAGNode(*CurDAG, SDValue(And, 0), NewMask);
  3958. SDValue NewAnd = CurDAG->getNode(ISD::AND, SDLoc(And), VT, And0, NewMask);
  3959. ReplaceNode(And, NewAnd.getNode());
  3960. SelectCode(NewAnd.getNode());
  3961. return true;
  3962. }
  3963. static unsigned getVPTESTMOpc(MVT TestVT, bool IsTestN, bool FoldedLoad,
  3964. bool FoldedBCast, bool Masked) {
  3965. #define VPTESTM_CASE(VT, SUFFIX) \
  3966. case MVT::VT: \
  3967. if (Masked) \
  3968. return IsTestN ? X86::VPTESTNM##SUFFIX##k: X86::VPTESTM##SUFFIX##k; \
  3969. return IsTestN ? X86::VPTESTNM##SUFFIX : X86::VPTESTM##SUFFIX;
  3970. #define VPTESTM_BROADCAST_CASES(SUFFIX) \
  3971. default: llvm_unreachable("Unexpected VT!"); \
  3972. VPTESTM_CASE(v4i32, DZ128##SUFFIX) \
  3973. VPTESTM_CASE(v2i64, QZ128##SUFFIX) \
  3974. VPTESTM_CASE(v8i32, DZ256##SUFFIX) \
  3975. VPTESTM_CASE(v4i64, QZ256##SUFFIX) \
  3976. VPTESTM_CASE(v16i32, DZ##SUFFIX) \
  3977. VPTESTM_CASE(v8i64, QZ##SUFFIX)
  3978. #define VPTESTM_FULL_CASES(SUFFIX) \
  3979. VPTESTM_BROADCAST_CASES(SUFFIX) \
  3980. VPTESTM_CASE(v16i8, BZ128##SUFFIX) \
  3981. VPTESTM_CASE(v8i16, WZ128##SUFFIX) \
  3982. VPTESTM_CASE(v32i8, BZ256##SUFFIX) \
  3983. VPTESTM_CASE(v16i16, WZ256##SUFFIX) \
  3984. VPTESTM_CASE(v64i8, BZ##SUFFIX) \
  3985. VPTESTM_CASE(v32i16, WZ##SUFFIX)
  3986. if (FoldedBCast) {
  3987. switch (TestVT.SimpleTy) {
  3988. VPTESTM_BROADCAST_CASES(rmb)
  3989. }
  3990. }
  3991. if (FoldedLoad) {
  3992. switch (TestVT.SimpleTy) {
  3993. VPTESTM_FULL_CASES(rm)
  3994. }
  3995. }
  3996. switch (TestVT.SimpleTy) {
  3997. VPTESTM_FULL_CASES(rr)
  3998. }
  3999. #undef VPTESTM_FULL_CASES
  4000. #undef VPTESTM_BROADCAST_CASES
  4001. #undef VPTESTM_CASE
  4002. }
  4003. // Try to create VPTESTM instruction. If InMask is not null, it will be used
  4004. // to form a masked operation.
  4005. bool X86DAGToDAGISel::tryVPTESTM(SDNode *Root, SDValue Setcc,
  4006. SDValue InMask) {
  4007. assert(Subtarget->hasAVX512() && "Expected AVX512!");
  4008. assert(Setcc.getSimpleValueType().getVectorElementType() == MVT::i1 &&
  4009. "Unexpected VT!");
  4010. // Look for equal and not equal compares.
  4011. ISD::CondCode CC = cast<CondCodeSDNode>(Setcc.getOperand(2))->get();
  4012. if (CC != ISD::SETEQ && CC != ISD::SETNE)
  4013. return false;
  4014. SDValue SetccOp0 = Setcc.getOperand(0);
  4015. SDValue SetccOp1 = Setcc.getOperand(1);
  4016. // Canonicalize the all zero vector to the RHS.
  4017. if (ISD::isBuildVectorAllZeros(SetccOp0.getNode()))
  4018. std::swap(SetccOp0, SetccOp1);
  4019. // See if we're comparing against zero.
  4020. if (!ISD::isBuildVectorAllZeros(SetccOp1.getNode()))
  4021. return false;
  4022. SDValue N0 = SetccOp0;
  4023. MVT CmpVT = N0.getSimpleValueType();
  4024. MVT CmpSVT = CmpVT.getVectorElementType();
  4025. // Start with both operands the same. We'll try to refine this.
  4026. SDValue Src0 = N0;
  4027. SDValue Src1 = N0;
  4028. {
  4029. // Look through single use bitcasts.
  4030. SDValue N0Temp = N0;
  4031. if (N0Temp.getOpcode() == ISD::BITCAST && N0Temp.hasOneUse())
  4032. N0Temp = N0.getOperand(0);
  4033. // Look for single use AND.
  4034. if (N0Temp.getOpcode() == ISD::AND && N0Temp.hasOneUse()) {
  4035. Src0 = N0Temp.getOperand(0);
  4036. Src1 = N0Temp.getOperand(1);
  4037. }
  4038. }
  4039. // Without VLX we need to widen the operation.
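// (Without AVX512VL only the 512-bit VPTESTM/VPTESTNM forms exist, so the
//  128/256-bit inputs are inserted into wider undef vectors below and the
//  resulting mask is narrowed back at the end.)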
  4040. bool Widen = !Subtarget->hasVLX() && !CmpVT.is512BitVector();
  4041. auto tryFoldLoadOrBCast = [&](SDNode *Root, SDNode *P, SDValue &L,
  4042. SDValue &Base, SDValue &Scale, SDValue &Index,
  4043. SDValue &Disp, SDValue &Segment) {
  4044. // If we need to widen, we can't fold the load.
  4045. if (!Widen)
  4046. if (tryFoldLoad(Root, P, L, Base, Scale, Index, Disp, Segment))
  4047. return true;
  4048. // If we didn't fold a load, try to match broadcast. No widening limitation
  4049. // for this. But only 32 and 64 bit types are supported.
  4050. if (CmpSVT != MVT::i32 && CmpSVT != MVT::i64)
  4051. return false;
  4052. // Look through single use bitcasts.
  4053. if (L.getOpcode() == ISD::BITCAST && L.hasOneUse()) {
  4054. P = L.getNode();
  4055. L = L.getOperand(0);
  4056. }
  4057. if (L.getOpcode() != X86ISD::VBROADCAST_LOAD)
  4058. return false;
  4059. auto *MemIntr = cast<MemIntrinsicSDNode>(L);
  4060. if (MemIntr->getMemoryVT().getSizeInBits() != CmpSVT.getSizeInBits())
  4061. return false;
  4062. return tryFoldBroadcast(Root, P, L, Base, Scale, Index, Disp, Segment);
  4063. };
  4064. // We can only fold loads if the sources are unique.
  4065. bool CanFoldLoads = Src0 != Src1;
  4066. bool FoldedLoad = false;
  4067. SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
  4068. if (CanFoldLoads) {
  4069. FoldedLoad = tryFoldLoadOrBCast(Root, N0.getNode(), Src1, Tmp0, Tmp1, Tmp2,
  4070. Tmp3, Tmp4);
  4071. if (!FoldedLoad) {
  4072. // And is commutative.
  4073. FoldedLoad = tryFoldLoadOrBCast(Root, N0.getNode(), Src0, Tmp0, Tmp1,
  4074. Tmp2, Tmp3, Tmp4);
  4075. if (FoldedLoad)
  4076. std::swap(Src0, Src1);
  4077. }
  4078. }
  4079. bool FoldedBCast = FoldedLoad && Src1.getOpcode() == X86ISD::VBROADCAST_LOAD;
  4080. bool IsMasked = InMask.getNode() != nullptr;
  4081. SDLoc dl(Root);
  4082. MVT ResVT = Setcc.getSimpleValueType();
  4083. MVT MaskVT = ResVT;
  4084. if (Widen) {
  4085. // Widen the inputs using insert_subreg or copy_to_regclass.
  4086. unsigned Scale = CmpVT.is128BitVector() ? 4 : 2;
  4087. unsigned SubReg = CmpVT.is128BitVector() ? X86::sub_xmm : X86::sub_ymm;
  4088. unsigned NumElts = CmpVT.getVectorNumElements() * Scale;
  4089. CmpVT = MVT::getVectorVT(CmpSVT, NumElts);
  4090. MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
  4091. SDValue ImplDef = SDValue(CurDAG->getMachineNode(X86::IMPLICIT_DEF, dl,
  4092. CmpVT), 0);
  4093. Src0 = CurDAG->getTargetInsertSubreg(SubReg, dl, CmpVT, ImplDef, Src0);
  4094. if (!FoldedBCast)
  4095. Src1 = CurDAG->getTargetInsertSubreg(SubReg, dl, CmpVT, ImplDef, Src1);
  4096. if (IsMasked) {
  4097. // Widen the mask.
  4098. unsigned RegClass = TLI->getRegClassFor(MaskVT)->getID();
  4099. SDValue RC = CurDAG->getTargetConstant(RegClass, dl, MVT::i32);
  4100. InMask = SDValue(CurDAG->getMachineNode(TargetOpcode::COPY_TO_REGCLASS,
  4101. dl, MaskVT, InMask, RC), 0);
  4102. }
  4103. }
  4104. bool IsTestN = CC == ISD::SETEQ;
  4105. unsigned Opc = getVPTESTMOpc(CmpVT, IsTestN, FoldedLoad, FoldedBCast,
  4106. IsMasked);
  4107. MachineSDNode *CNode;
  4108. if (FoldedLoad) {
  4109. SDVTList VTs = CurDAG->getVTList(MaskVT, MVT::Other);
  4110. if (IsMasked) {
  4111. SDValue Ops[] = { InMask, Src0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4,
  4112. Src1.getOperand(0) };
  4113. CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops);
  4114. } else {
  4115. SDValue Ops[] = { Src0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4,
  4116. Src1.getOperand(0) };
  4117. CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops);
  4118. }
  4119. // Update the chain.
  4120. ReplaceUses(Src1.getValue(1), SDValue(CNode, 1));
  4121. // Record the mem-refs
  4122. CurDAG->setNodeMemRefs(CNode, {cast<MemSDNode>(Src1)->getMemOperand()});
  4123. } else {
  4124. if (IsMasked)
  4125. CNode = CurDAG->getMachineNode(Opc, dl, MaskVT, InMask, Src0, Src1);
  4126. else
  4127. CNode = CurDAG->getMachineNode(Opc, dl, MaskVT, Src0, Src1);
  4128. }
  4129. // If we widened, we need to shrink the mask VT.
  4130. if (Widen) {
  4131. unsigned RegClass = TLI->getRegClassFor(ResVT)->getID();
  4132. SDValue RC = CurDAG->getTargetConstant(RegClass, dl, MVT::i32);
  4133. CNode = CurDAG->getMachineNode(TargetOpcode::COPY_TO_REGCLASS,
  4134. dl, ResVT, SDValue(CNode, 0), RC);
  4135. }
  4136. ReplaceUses(SDValue(Root, 0), SDValue(CNode, 0));
  4137. CurDAG->RemoveDeadNode(Root);
  4138. return true;
  4139. }
  4140. // Try to match the bitselect pattern (or (and A, B), (andn A, C)). Turn it
  4141. // into vpternlog.
  4142. bool X86DAGToDAGISel::tryMatchBitSelect(SDNode *N) {
  4143. assert(N->getOpcode() == ISD::OR && "Unexpected opcode!");
  4144. MVT NVT = N->getSimpleValueType(0);
  4145. // Make sure we support VPTERNLOG.
  4146. if (!NVT.isVector() || !Subtarget->hasAVX512())
  4147. return false;
  4148. // We need VLX for 128/256-bit.
  4149. if (!(Subtarget->hasVLX() || NVT.is512BitVector()))
  4150. return false;
  4151. SDValue N0 = N->getOperand(0);
  4152. SDValue N1 = N->getOperand(1);
  4153. // Canonicalize AND to LHS.
  4154. if (N1.getOpcode() == ISD::AND)
  4155. std::swap(N0, N1);
  4156. if (N0.getOpcode() != ISD::AND ||
  4157. N1.getOpcode() != X86ISD::ANDNP ||
  4158. !N0.hasOneUse() || !N1.hasOneUse())
  4159. return false;
4160. // ANDN is not commutable, use it to pick out A and C.
  4161. SDValue A = N1.getOperand(0);
  4162. SDValue C = N1.getOperand(1);
  4163. // AND is commutable, if one operand matches A, the other operand is B.
  4164. // Otherwise this isn't a match.
  4165. SDValue B;
  4166. if (N0.getOperand(0) == A)
  4167. B = N0.getOperand(1);
  4168. else if (N0.getOperand(1) == A)
  4169. B = N0.getOperand(0);
  4170. else
  4171. return false;
  4172. SDLoc dl(N);
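// 0xCA is the truth table of "A ? B : C": using the per-operand truth tables
// A = 0xf0, B = 0xcc, C = 0xaa, (A & B) | (~A & C) == 0xc0 | 0x0a == 0xca.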
  4173. SDValue Imm = CurDAG->getTargetConstant(0xCA, dl, MVT::i8);
  4174. SDValue Ternlog = CurDAG->getNode(X86ISD::VPTERNLOG, dl, NVT, A, B, C, Imm);
  4175. ReplaceNode(N, Ternlog.getNode());
  4176. return matchVPTERNLOG(Ternlog.getNode(), Ternlog.getNode(), Ternlog.getNode(),
  4177. Ternlog.getNode(), A, B, C, 0xCA);
  4178. }
  4179. void X86DAGToDAGISel::Select(SDNode *Node) {
  4180. MVT NVT = Node->getSimpleValueType(0);
  4181. unsigned Opcode = Node->getOpcode();
  4182. SDLoc dl(Node);
  4183. if (Node->isMachineOpcode()) {
  4184. LLVM_DEBUG(dbgs() << "== "; Node->dump(CurDAG); dbgs() << '\n');
  4185. Node->setNodeId(-1);
  4186. return; // Already selected.
  4187. }
  4188. switch (Opcode) {
  4189. default: break;
  4190. case ISD::INTRINSIC_W_CHAIN: {
  4191. unsigned IntNo = Node->getConstantOperandVal(1);
  4192. switch (IntNo) {
  4193. default: break;
  4194. case Intrinsic::x86_encodekey128:
  4195. case Intrinsic::x86_encodekey256: {
  4196. if (!Subtarget->hasKL())
  4197. break;
  4198. unsigned Opcode;
  4199. switch (IntNo) {
  4200. default: llvm_unreachable("Impossible intrinsic");
  4201. case Intrinsic::x86_encodekey128: Opcode = X86::ENCODEKEY128; break;
  4202. case Intrinsic::x86_encodekey256: Opcode = X86::ENCODEKEY256; break;
  4203. }
  4204. SDValue Chain = Node->getOperand(0);
  4205. Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM0, Node->getOperand(3),
  4206. SDValue());
  4207. if (Opcode == X86::ENCODEKEY256)
  4208. Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM1, Node->getOperand(4),
  4209. Chain.getValue(1));
  4210. MachineSDNode *Res = CurDAG->getMachineNode(
  4211. Opcode, dl, Node->getVTList(),
  4212. {Node->getOperand(2), Chain, Chain.getValue(1)});
  4213. ReplaceNode(Node, Res);
  4214. return;
  4215. }
  4216. case Intrinsic::x86_tileloadd64_internal:
  4217. case Intrinsic::x86_tileloaddt164_internal: {
  4218. if (!Subtarget->hasAMXTILE())
  4219. break;
  4220. unsigned Opc = IntNo == Intrinsic::x86_tileloadd64_internal
  4221. ? X86::PTILELOADDV
  4222. : X86::PTILELOADDT1V;
  4223. // _tile_loadd_internal(row, col, buf, STRIDE)
  4224. SDValue Base = Node->getOperand(4);
  4225. SDValue Scale = getI8Imm(1, dl);
  4226. SDValue Index = Node->getOperand(5);
  4227. SDValue Disp = CurDAG->getTargetConstant(0, dl, MVT::i32);
  4228. SDValue Segment = CurDAG->getRegister(0, MVT::i16);
  4229. SDValue Chain = Node->getOperand(0);
  4230. MachineSDNode *CNode;
  4231. SDValue Ops[] = {Node->getOperand(2),
  4232. Node->getOperand(3),
  4233. Base,
  4234. Scale,
  4235. Index,
  4236. Disp,
  4237. Segment,
  4238. Chain};
  4239. CNode = CurDAG->getMachineNode(Opc, dl, {MVT::x86amx, MVT::Other}, Ops);
  4240. ReplaceNode(Node, CNode);
  4241. return;
  4242. }
  4243. }
  4244. break;
  4245. }
  4246. case ISD::INTRINSIC_VOID: {
  4247. unsigned IntNo = Node->getConstantOperandVal(1);
  4248. switch (IntNo) {
  4249. default: break;
  4250. case Intrinsic::x86_sse3_monitor:
  4251. case Intrinsic::x86_monitorx:
  4252. case Intrinsic::x86_clzero: {
  4253. bool Use64BitPtr = Node->getOperand(2).getValueType() == MVT::i64;
  4254. unsigned Opc = 0;
  4255. switch (IntNo) {
  4256. default: llvm_unreachable("Unexpected intrinsic!");
  4257. case Intrinsic::x86_sse3_monitor:
  4258. if (!Subtarget->hasSSE3())
  4259. break;
  4260. Opc = Use64BitPtr ? X86::MONITOR64rrr : X86::MONITOR32rrr;
  4261. break;
  4262. case Intrinsic::x86_monitorx:
  4263. if (!Subtarget->hasMWAITX())
  4264. break;
  4265. Opc = Use64BitPtr ? X86::MONITORX64rrr : X86::MONITORX32rrr;
  4266. break;
  4267. case Intrinsic::x86_clzero:
  4268. if (!Subtarget->hasCLZERO())
  4269. break;
  4270. Opc = Use64BitPtr ? X86::CLZERO64r : X86::CLZERO32r;
  4271. break;
  4272. }
  4273. if (Opc) {
  4274. unsigned PtrReg = Use64BitPtr ? X86::RAX : X86::EAX;
  4275. SDValue Chain = CurDAG->getCopyToReg(Node->getOperand(0), dl, PtrReg,
  4276. Node->getOperand(2), SDValue());
  4277. SDValue InFlag = Chain.getValue(1);
  4278. if (IntNo == Intrinsic::x86_sse3_monitor ||
  4279. IntNo == Intrinsic::x86_monitorx) {
  4280. // Copy the other two operands to ECX and EDX.
  4281. Chain = CurDAG->getCopyToReg(Chain, dl, X86::ECX, Node->getOperand(3),
  4282. InFlag);
  4283. InFlag = Chain.getValue(1);
  4284. Chain = CurDAG->getCopyToReg(Chain, dl, X86::EDX, Node->getOperand(4),
  4285. InFlag);
  4286. InFlag = Chain.getValue(1);
  4287. }
  4288. MachineSDNode *CNode = CurDAG->getMachineNode(Opc, dl, MVT::Other,
  4289. { Chain, InFlag});
  4290. ReplaceNode(Node, CNode);
  4291. return;
  4292. }
  4293. break;
  4294. }
  4295. case Intrinsic::x86_tilestored64_internal: {
  4296. unsigned Opc = X86::PTILESTOREDV;
  4297. // _tile_stored_internal(row, col, buf, STRIDE, c)
  4298. SDValue Base = Node->getOperand(4);
  4299. SDValue Scale = getI8Imm(1, dl);
  4300. SDValue Index = Node->getOperand(5);
  4301. SDValue Disp = CurDAG->getTargetConstant(0, dl, MVT::i32);
  4302. SDValue Segment = CurDAG->getRegister(0, MVT::i16);
  4303. SDValue Chain = Node->getOperand(0);
  4304. MachineSDNode *CNode;
  4305. SDValue Ops[] = {Node->getOperand(2),
  4306. Node->getOperand(3),
  4307. Base,
  4308. Scale,
  4309. Index,
  4310. Disp,
  4311. Segment,
  4312. Node->getOperand(6),
  4313. Chain};
  4314. CNode = CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops);
  4315. ReplaceNode(Node, CNode);
  4316. return;
  4317. }
  4318. case Intrinsic::x86_tileloadd64:
  4319. case Intrinsic::x86_tileloaddt164:
  4320. case Intrinsic::x86_tilestored64: {
  4321. if (!Subtarget->hasAMXTILE())
  4322. break;
  4323. unsigned Opc;
  4324. switch (IntNo) {
  4325. default: llvm_unreachable("Unexpected intrinsic!");
  4326. case Intrinsic::x86_tileloadd64: Opc = X86::PTILELOADD; break;
  4327. case Intrinsic::x86_tileloaddt164: Opc = X86::PTILELOADDT1; break;
  4328. case Intrinsic::x86_tilestored64: Opc = X86::PTILESTORED; break;
  4329. }
  4330. // FIXME: Match displacement and scale.
  4331. unsigned TIndex = Node->getConstantOperandVal(2);
  4332. SDValue TReg = getI8Imm(TIndex, dl);
  4333. SDValue Base = Node->getOperand(3);
  4334. SDValue Scale = getI8Imm(1, dl);
  4335. SDValue Index = Node->getOperand(4);
  4336. SDValue Disp = CurDAG->getTargetConstant(0, dl, MVT::i32);
  4337. SDValue Segment = CurDAG->getRegister(0, MVT::i16);
  4338. SDValue Chain = Node->getOperand(0);
  4339. MachineSDNode *CNode;
  4340. if (Opc == X86::PTILESTORED) {
  4341. SDValue Ops[] = { Base, Scale, Index, Disp, Segment, TReg, Chain };
  4342. CNode = CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops);
  4343. } else {
  4344. SDValue Ops[] = { TReg, Base, Scale, Index, Disp, Segment, Chain };
  4345. CNode = CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops);
  4346. }
  4347. ReplaceNode(Node, CNode);
  4348. return;
  4349. }
  4350. }
  4351. break;
  4352. }
  4353. case ISD::BRIND:
  4354. case X86ISD::NT_BRIND: {
  4355. if (Subtarget->isTargetNaCl())
  4356. // NaCl has its own pass where jmp %r32 are converted to jmp %r64. We
  4357. // leave the instruction alone.
  4358. break;
  4359. if (Subtarget->isTarget64BitILP32()) {
  4360. // Converts a 32-bit register to a 64-bit, zero-extended version of
  4361. // it. This is needed because x86-64 can do many things, but jmp %r32
  4362. // ain't one of them.
  4363. SDValue Target = Node->getOperand(1);
  4364. assert(Target.getValueType() == MVT::i32 && "Unexpected VT!");
  4365. SDValue ZextTarget = CurDAG->getZExtOrTrunc(Target, dl, MVT::i64);
  4366. SDValue Brind = CurDAG->getNode(Opcode, dl, MVT::Other,
  4367. Node->getOperand(0), ZextTarget);
  4368. ReplaceNode(Node, Brind.getNode());
  4369. SelectCode(ZextTarget.getNode());
  4370. SelectCode(Brind.getNode());
  4371. return;
  4372. }
  4373. break;
  4374. }
  4375. case X86ISD::GlobalBaseReg:
  4376. ReplaceNode(Node, getGlobalBaseReg());
  4377. return;
  4378. case ISD::BITCAST:
  4379. // Just drop all 128/256/512-bit bitcasts.
  4380. if (NVT.is512BitVector() || NVT.is256BitVector() || NVT.is128BitVector() ||
  4381. NVT == MVT::f128) {
  4382. ReplaceUses(SDValue(Node, 0), Node->getOperand(0));
  4383. CurDAG->RemoveDeadNode(Node);
  4384. return;
  4385. }
  4386. break;
  4387. case ISD::SRL:
  4388. if (matchBitExtract(Node))
  4389. return;
  4390. [[fallthrough]];
  4391. case ISD::SRA:
  4392. case ISD::SHL:
  4393. if (tryShiftAmountMod(Node))
  4394. return;
  4395. break;
  4396. case X86ISD::VPTERNLOG: {
  4397. uint8_t Imm = cast<ConstantSDNode>(Node->getOperand(3))->getZExtValue();
  4398. if (matchVPTERNLOG(Node, Node, Node, Node, Node->getOperand(0),
  4399. Node->getOperand(1), Node->getOperand(2), Imm))
  4400. return;
  4401. break;
  4402. }
  4403. case X86ISD::ANDNP:
  4404. if (tryVPTERNLOG(Node))
  4405. return;
  4406. break;
  4407. case ISD::AND:
  4408. if (NVT.isVector() && NVT.getVectorElementType() == MVT::i1) {
  4409. // Try to form a masked VPTESTM. Operands can be in either order.
  4410. SDValue N0 = Node->getOperand(0);
  4411. SDValue N1 = Node->getOperand(1);
  4412. if (N0.getOpcode() == ISD::SETCC && N0.hasOneUse() &&
  4413. tryVPTESTM(Node, N0, N1))
  4414. return;
  4415. if (N1.getOpcode() == ISD::SETCC && N1.hasOneUse() &&
  4416. tryVPTESTM(Node, N1, N0))
  4417. return;
  4418. }
  4419. if (MachineSDNode *NewNode = matchBEXTRFromAndImm(Node)) {
  4420. ReplaceUses(SDValue(Node, 0), SDValue(NewNode, 0));
  4421. CurDAG->RemoveDeadNode(Node);
  4422. return;
  4423. }
  4424. if (matchBitExtract(Node))
  4425. return;
  4426. if (AndImmShrink && shrinkAndImmediate(Node))
  4427. return;
  4428. [[fallthrough]];
  4429. case ISD::OR:
  4430. case ISD::XOR:
  4431. if (tryShrinkShlLogicImm(Node))
  4432. return;
  4433. if (Opcode == ISD::OR && tryMatchBitSelect(Node))
  4434. return;
  4435. if (tryVPTERNLOG(Node))
  4436. return;
  4437. [[fallthrough]];
  4438. case ISD::ADD:
  4439. case ISD::SUB: {
  4440. // Try to avoid folding immediates with multiple uses for optsize.
  4441. // This code tries to select to register form directly to avoid going
  4442. // through the isel table which might fold the immediate. We can't change
4443. // the add/sub/and/or/xor-with-immediate patterns in the
  4444. // tablegen files to check immediate use count without making the patterns
  4445. // unavailable to the fast-isel table.
  4446. if (!CurDAG->shouldOptForSize())
  4447. break;
  4448. // Only handle i8/i16/i32/i64.
  4449. if (NVT != MVT::i8 && NVT != MVT::i16 && NVT != MVT::i32 && NVT != MVT::i64)
  4450. break;
  4451. SDValue N0 = Node->getOperand(0);
  4452. SDValue N1 = Node->getOperand(1);
  4453. auto *Cst = dyn_cast<ConstantSDNode>(N1);
  4454. if (!Cst)
  4455. break;
  4456. int64_t Val = Cst->getSExtValue();
4457. // Make sure it's an immediate that is considered foldable.
  4458. // FIXME: Handle unsigned 32 bit immediates for 64-bit AND.
  4459. if (!isInt<8>(Val) && !isInt<32>(Val))
  4460. break;
  4461. // If this can match to INC/DEC, let it go.
  4462. if (Opcode == ISD::ADD && (Val == 1 || Val == -1))
  4463. break;
  4464. // Check if we should avoid folding this immediate.
  4465. if (!shouldAvoidImmediateInstFormsForSize(N1.getNode()))
  4466. break;
  4467. // We should not fold the immediate. So we need a register form instead.
  4468. unsigned ROpc, MOpc;
  4469. switch (NVT.SimpleTy) {
  4470. default: llvm_unreachable("Unexpected VT!");
  4471. case MVT::i8:
  4472. switch (Opcode) {
  4473. default: llvm_unreachable("Unexpected opcode!");
  4474. case ISD::ADD: ROpc = X86::ADD8rr; MOpc = X86::ADD8rm; break;
  4475. case ISD::SUB: ROpc = X86::SUB8rr; MOpc = X86::SUB8rm; break;
  4476. case ISD::AND: ROpc = X86::AND8rr; MOpc = X86::AND8rm; break;
  4477. case ISD::OR: ROpc = X86::OR8rr; MOpc = X86::OR8rm; break;
  4478. case ISD::XOR: ROpc = X86::XOR8rr; MOpc = X86::XOR8rm; break;
  4479. }
  4480. break;
  4481. case MVT::i16:
  4482. switch (Opcode) {
  4483. default: llvm_unreachable("Unexpected opcode!");
  4484. case ISD::ADD: ROpc = X86::ADD16rr; MOpc = X86::ADD16rm; break;
  4485. case ISD::SUB: ROpc = X86::SUB16rr; MOpc = X86::SUB16rm; break;
  4486. case ISD::AND: ROpc = X86::AND16rr; MOpc = X86::AND16rm; break;
  4487. case ISD::OR: ROpc = X86::OR16rr; MOpc = X86::OR16rm; break;
  4488. case ISD::XOR: ROpc = X86::XOR16rr; MOpc = X86::XOR16rm; break;
  4489. }
  4490. break;
  4491. case MVT::i32:
  4492. switch (Opcode) {
  4493. default: llvm_unreachable("Unexpected opcode!");
  4494. case ISD::ADD: ROpc = X86::ADD32rr; MOpc = X86::ADD32rm; break;
  4495. case ISD::SUB: ROpc = X86::SUB32rr; MOpc = X86::SUB32rm; break;
  4496. case ISD::AND: ROpc = X86::AND32rr; MOpc = X86::AND32rm; break;
  4497. case ISD::OR: ROpc = X86::OR32rr; MOpc = X86::OR32rm; break;
  4498. case ISD::XOR: ROpc = X86::XOR32rr; MOpc = X86::XOR32rm; break;
  4499. }
  4500. break;
  4501. case MVT::i64:
  4502. switch (Opcode) {
  4503. default: llvm_unreachable("Unexpected opcode!");
  4504. case ISD::ADD: ROpc = X86::ADD64rr; MOpc = X86::ADD64rm; break;
  4505. case ISD::SUB: ROpc = X86::SUB64rr; MOpc = X86::SUB64rm; break;
  4506. case ISD::AND: ROpc = X86::AND64rr; MOpc = X86::AND64rm; break;
  4507. case ISD::OR: ROpc = X86::OR64rr; MOpc = X86::OR64rm; break;
  4508. case ISD::XOR: ROpc = X86::XOR64rr; MOpc = X86::XOR64rm; break;
  4509. }
  4510. break;
  4511. }
4512. // Ok, this is an AND/OR/XOR/ADD/SUB with a constant.
4513. // If this is not a subtract, we can still try to fold a load.
  4514. if (Opcode != ISD::SUB) {
  4515. SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
  4516. if (tryFoldLoad(Node, N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
  4517. SDValue Ops[] = { N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N0.getOperand(0) };
  4518. SDVTList VTs = CurDAG->getVTList(NVT, MVT::i32, MVT::Other);
  4519. MachineSDNode *CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
  4520. // Update the chain.
  4521. ReplaceUses(N0.getValue(1), SDValue(CNode, 2));
  4522. // Record the mem-refs
  4523. CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(N0)->getMemOperand()});
  4524. ReplaceUses(SDValue(Node, 0), SDValue(CNode, 0));
  4525. CurDAG->RemoveDeadNode(Node);
  4526. return;
  4527. }
  4528. }
  4529. CurDAG->SelectNodeTo(Node, ROpc, NVT, MVT::i32, N0, N1);
  4530. return;
  4531. }
  4532. case X86ISD::SMUL:
  4533. // i16/i32/i64 are handled with isel patterns.
  4534. if (NVT != MVT::i8)
  4535. break;
  4536. [[fallthrough]];
  4537. case X86ISD::UMUL: {
  4538. SDValue N0 = Node->getOperand(0);
  4539. SDValue N1 = Node->getOperand(1);
  4540. unsigned LoReg, ROpc, MOpc;
  4541. switch (NVT.SimpleTy) {
  4542. default: llvm_unreachable("Unsupported VT!");
  4543. case MVT::i8:
  4544. LoReg = X86::AL;
  4545. ROpc = Opcode == X86ISD::SMUL ? X86::IMUL8r : X86::MUL8r;
  4546. MOpc = Opcode == X86ISD::SMUL ? X86::IMUL8m : X86::MUL8m;
  4547. break;
  4548. case MVT::i16:
  4549. LoReg = X86::AX;
  4550. ROpc = X86::MUL16r;
  4551. MOpc = X86::MUL16m;
  4552. break;
  4553. case MVT::i32:
  4554. LoReg = X86::EAX;
  4555. ROpc = X86::MUL32r;
  4556. MOpc = X86::MUL32m;
  4557. break;
  4558. case MVT::i64:
  4559. LoReg = X86::RAX;
  4560. ROpc = X86::MUL64r;
  4561. MOpc = X86::MUL64m;
  4562. break;
  4563. }
  4564. SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
  4565. bool FoldedLoad = tryFoldLoad(Node, N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4);
  4566. // Multiply is commutative.
  4567. if (!FoldedLoad) {
  4568. FoldedLoad = tryFoldLoad(Node, N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4);
  4569. if (FoldedLoad)
  4570. std::swap(N0, N1);
  4571. }
  4572. SDValue InFlag = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, LoReg,
  4573. N0, SDValue()).getValue(1);
  4574. MachineSDNode *CNode;
  4575. if (FoldedLoad) {
  4576. // i16/i32/i64 use an instruction that produces a low and high result even
  4577. // though only the low result is used.
  4578. SDVTList VTs;
  4579. if (NVT == MVT::i8)
  4580. VTs = CurDAG->getVTList(NVT, MVT::i32, MVT::Other);
  4581. else
  4582. VTs = CurDAG->getVTList(NVT, NVT, MVT::i32, MVT::Other);
  4583. SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N1.getOperand(0),
  4584. InFlag };
  4585. CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
  4586. // Update the chain.
  4587. ReplaceUses(N1.getValue(1), SDValue(CNode, NVT == MVT::i8 ? 2 : 3));
  4588. // Record the mem-refs
  4589. CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(N1)->getMemOperand()});
  4590. } else {
  4591. // i16/i32/i64 use an instruction that produces a low and high result even
  4592. // though only the low result is used.
  4593. SDVTList VTs;
  4594. if (NVT == MVT::i8)
  4595. VTs = CurDAG->getVTList(NVT, MVT::i32);
  4596. else
  4597. VTs = CurDAG->getVTList(NVT, NVT, MVT::i32);
  4598. CNode = CurDAG->getMachineNode(ROpc, dl, VTs, {N1, InFlag});
  4599. }
  4600. ReplaceUses(SDValue(Node, 0), SDValue(CNode, 0));
  4601. ReplaceUses(SDValue(Node, 1), SDValue(CNode, NVT == MVT::i8 ? 1 : 2));
  4602. CurDAG->RemoveDeadNode(Node);
  4603. return;
  4604. }
  4605. case ISD::SMUL_LOHI:
  4606. case ISD::UMUL_LOHI: {
  4607. SDValue N0 = Node->getOperand(0);
  4608. SDValue N1 = Node->getOperand(1);
  4609. unsigned Opc, MOpc;
  4610. unsigned LoReg, HiReg;
  4611. bool IsSigned = Opcode == ISD::SMUL_LOHI;
  4612. bool UseMULX = !IsSigned && Subtarget->hasBMI2();
  4613. bool UseMULXHi = UseMULX && SDValue(Node, 0).use_empty();
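// (BMI2 MULX takes its second source implicitly from EDX/RDX, writes the low
//  and high halves to two explicit destinations and leaves EFLAGS untouched;
//  hence the different LoReg below and the Hi-only form when the low half of
//  the result is unused.)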
    switch (NVT.SimpleTy) {
    default: llvm_unreachable("Unsupported VT!");
    case MVT::i32:
      Opc = UseMULXHi ? X86::MULX32Hrr :
            UseMULX ? X86::MULX32rr :
            IsSigned ? X86::IMUL32r : X86::MUL32r;
      MOpc = UseMULXHi ? X86::MULX32Hrm :
             UseMULX ? X86::MULX32rm :
             IsSigned ? X86::IMUL32m : X86::MUL32m;
      LoReg = UseMULX ? X86::EDX : X86::EAX;
      HiReg = X86::EDX;
      break;
    case MVT::i64:
      Opc = UseMULXHi ? X86::MULX64Hrr :
            UseMULX ? X86::MULX64rr :
            IsSigned ? X86::IMUL64r : X86::MUL64r;
      MOpc = UseMULXHi ? X86::MULX64Hrm :
             UseMULX ? X86::MULX64rm :
             IsSigned ? X86::IMUL64m : X86::MUL64m;
      LoReg = UseMULX ? X86::RDX : X86::RAX;
      HiReg = X86::RDX;
      break;
    }

    SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
    bool foldedLoad = tryFoldLoad(Node, N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4);
    // Multiply is commutative.
    if (!foldedLoad) {
      foldedLoad = tryFoldLoad(Node, N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4);
      if (foldedLoad)
        std::swap(N0, N1);
    }

    SDValue InFlag = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, LoReg,
                                          N0, SDValue()).getValue(1);

    SDValue ResHi, ResLo;
    if (foldedLoad) {
      SDValue Chain;
      MachineSDNode *CNode = nullptr;
      SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N1.getOperand(0),
                        InFlag };
      if (UseMULXHi) {
        SDVTList VTs = CurDAG->getVTList(NVT, MVT::Other);
        CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
        ResHi = SDValue(CNode, 0);
        Chain = SDValue(CNode, 1);
      } else if (UseMULX) {
        SDVTList VTs = CurDAG->getVTList(NVT, NVT, MVT::Other);
        CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
        ResHi = SDValue(CNode, 0);
        ResLo = SDValue(CNode, 1);
        Chain = SDValue(CNode, 2);
      } else {
        SDVTList VTs = CurDAG->getVTList(MVT::Other, MVT::Glue);
        CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
        Chain = SDValue(CNode, 0);
        InFlag = SDValue(CNode, 1);
      }

      // Update the chain.
      ReplaceUses(N1.getValue(1), Chain);
      // Record the mem-refs
      CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(N1)->getMemOperand()});
    } else {
      SDValue Ops[] = { N1, InFlag };
      if (UseMULXHi) {
        SDVTList VTs = CurDAG->getVTList(NVT);
        SDNode *CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops);
        ResHi = SDValue(CNode, 0);
      } else if (UseMULX) {
        SDVTList VTs = CurDAG->getVTList(NVT, NVT);
        SDNode *CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops);
        ResHi = SDValue(CNode, 0);
        ResLo = SDValue(CNode, 1);
      } else {
        SDVTList VTs = CurDAG->getVTList(MVT::Glue);
        SDNode *CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops);
        InFlag = SDValue(CNode, 0);
      }
    }

    // Copy the low half of the result, if it is needed.
    if (!SDValue(Node, 0).use_empty()) {
      if (!ResLo) {
        assert(LoReg && "Register for low half is not defined!");
        ResLo = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl, LoReg,
                                       NVT, InFlag);
        InFlag = ResLo.getValue(2);
      }
      ReplaceUses(SDValue(Node, 0), ResLo);
      LLVM_DEBUG(dbgs() << "=> "; ResLo.getNode()->dump(CurDAG);
                 dbgs() << '\n');
    }
    // Copy the high half of the result, if it is needed.
    if (!SDValue(Node, 1).use_empty()) {
      if (!ResHi) {
        assert(HiReg && "Register for high half is not defined!");
        ResHi = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl, HiReg,
                                       NVT, InFlag);
        InFlag = ResHi.getValue(2);
      }
      ReplaceUses(SDValue(Node, 1), ResHi);
      LLVM_DEBUG(dbgs() << "=> "; ResHi.getNode()->dump(CurDAG);
                 dbgs() << '\n');
    }

    CurDAG->RemoveDeadNode(Node);
    return;
  }
  case ISD::SDIVREM:
  case ISD::UDIVREM: {
    SDValue N0 = Node->getOperand(0);
    SDValue N1 = Node->getOperand(1);

    unsigned ROpc, MOpc;
    bool isSigned = Opcode == ISD::SDIVREM;
    if (!isSigned) {
      switch (NVT.SimpleTy) {
      default: llvm_unreachable("Unsupported VT!");
      case MVT::i8:  ROpc = X86::DIV8r;  MOpc = X86::DIV8m;  break;
      case MVT::i16: ROpc = X86::DIV16r; MOpc = X86::DIV16m; break;
      case MVT::i32: ROpc = X86::DIV32r; MOpc = X86::DIV32m; break;
      case MVT::i64: ROpc = X86::DIV64r; MOpc = X86::DIV64m; break;
      }
    } else {
      switch (NVT.SimpleTy) {
      default: llvm_unreachable("Unsupported VT!");
      case MVT::i8:  ROpc = X86::IDIV8r;  MOpc = X86::IDIV8m;  break;
      case MVT::i16: ROpc = X86::IDIV16r; MOpc = X86::IDIV16m; break;
      case MVT::i32: ROpc = X86::IDIV32r; MOpc = X86::IDIV32m; break;
      case MVT::i64: ROpc = X86::IDIV64r; MOpc = X86::IDIV64m; break;
      }
    }

    unsigned LoReg, HiReg, ClrReg;
    unsigned SExtOpcode;
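    // DIV/IDIV take the dividend implicitly in AH:AL, DX:AX, EDX:EAX or
    // RDX:RAX and leave the quotient in AL/AX/EAX/RAX and the remainder in
    // AH/DX/EDX/RDX. For signed division the high half is produced by sign
    // extending the low half (CWD/CDQ/CQO); for unsigned division it is
    // simply zeroed.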
    switch (NVT.SimpleTy) {
    default: llvm_unreachable("Unsupported VT!");
    case MVT::i8:
      LoReg = X86::AL;  ClrReg = HiReg = X86::AH;
      SExtOpcode = 0; // Not used.
      break;
    case MVT::i16:
      LoReg = X86::AX;  HiReg = X86::DX;
      ClrReg = X86::DX;
      SExtOpcode = X86::CWD;
      break;
    case MVT::i32:
      LoReg = X86::EAX; ClrReg = HiReg = X86::EDX;
      SExtOpcode = X86::CDQ;
      break;
    case MVT::i64:
      LoReg = X86::RAX; ClrReg = HiReg = X86::RDX;
      SExtOpcode = X86::CQO;
      break;
    }

    SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
    bool foldedLoad = tryFoldLoad(Node, N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4);
    bool signBitIsZero = CurDAG->SignBitIsZero(N0);

    SDValue InFlag;
    if (NVT == MVT::i8) {
      // Special case for div8, just use a move with zero extension to AX to
      // clear the upper 8 bits (AH).
      SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Chain;
      MachineSDNode *Move;
      if (tryFoldLoad(Node, N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
        SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N0.getOperand(0) };
        unsigned Opc = (isSigned && !signBitIsZero) ? X86::MOVSX16rm8
                                                    : X86::MOVZX16rm8;
        Move = CurDAG->getMachineNode(Opc, dl, MVT::i16, MVT::Other, Ops);
        Chain = SDValue(Move, 1);
        ReplaceUses(N0.getValue(1), Chain);
        // Record the mem-refs
        CurDAG->setNodeMemRefs(Move, {cast<LoadSDNode>(N0)->getMemOperand()});
      } else {
        unsigned Opc = (isSigned && !signBitIsZero) ? X86::MOVSX16rr8
                                                    : X86::MOVZX16rr8;
        Move = CurDAG->getMachineNode(Opc, dl, MVT::i16, N0);
        Chain = CurDAG->getEntryNode();
      }
      Chain = CurDAG->getCopyToReg(Chain, dl, X86::AX, SDValue(Move, 0),
                                   SDValue());
      InFlag = Chain.getValue(1);
    } else {
      InFlag = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, LoReg, N0,
                                    SDValue()).getValue(1);
      if (isSigned && !signBitIsZero) {
        // Sign extend the low part into the high part.
        InFlag = SDValue(
            CurDAG->getMachineNode(SExtOpcode, dl, MVT::Glue, InFlag), 0);
      } else {
        // Zero out the high part, effectively zero extending the input.
        SDVTList VTs = CurDAG->getVTList(MVT::i32, MVT::i32);
        SDValue ClrNode = SDValue(
            CurDAG->getMachineNode(X86::MOV32r0, dl, VTs, std::nullopt), 0);
        switch (NVT.SimpleTy) {
        case MVT::i16:
          ClrNode =
              SDValue(CurDAG->getMachineNode(
                          TargetOpcode::EXTRACT_SUBREG, dl, MVT::i16, ClrNode,
                          CurDAG->getTargetConstant(X86::sub_16bit, dl,
                                                    MVT::i32)),
                      0);
          break;
        case MVT::i32:
          break;
        case MVT::i64:
          ClrNode =
              SDValue(CurDAG->getMachineNode(
                          TargetOpcode::SUBREG_TO_REG, dl, MVT::i64,
                          CurDAG->getTargetConstant(0, dl, MVT::i64), ClrNode,
                          CurDAG->getTargetConstant(X86::sub_32bit, dl,
                                                    MVT::i32)),
                      0);
          break;
        default:
          llvm_unreachable("Unexpected division source");
        }

        InFlag = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, ClrReg,
                                      ClrNode, InFlag).getValue(1);
      }
    }

    if (foldedLoad) {
      SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N1.getOperand(0),
                        InFlag };
      MachineSDNode *CNode =
          CurDAG->getMachineNode(MOpc, dl, MVT::Other, MVT::Glue, Ops);
      InFlag = SDValue(CNode, 1);
      // Update the chain.
      ReplaceUses(N1.getValue(1), SDValue(CNode, 0));
      // Record the mem-refs
      CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(N1)->getMemOperand()});
    } else {
      InFlag =
          SDValue(CurDAG->getMachineNode(ROpc, dl, MVT::Glue, N1, InFlag), 0);
    }

    // Prevent use of AH in a REX instruction by explicitly copying it to
    // an ABCD_L register.
    //
    // The current assumption of the register allocator is that isel
    // won't generate explicit references to the GR8_ABCD_H registers. If
    // the allocator and/or the backend get enhanced to be more robust in
    // that regard, this can be, and should be, removed.
    if (HiReg == X86::AH && !SDValue(Node, 1).use_empty()) {
      SDValue AHCopy = CurDAG->getRegister(X86::AH, MVT::i8);
      unsigned AHExtOpcode =
          isSigned ? X86::MOVSX32rr8_NOREX : X86::MOVZX32rr8_NOREX;

      SDNode *RNode = CurDAG->getMachineNode(AHExtOpcode, dl, MVT::i32,
                                             MVT::Glue, AHCopy, InFlag);
      SDValue Result(RNode, 0);
      InFlag = SDValue(RNode, 1);

      Result =
          CurDAG->getTargetExtractSubreg(X86::sub_8bit, dl, MVT::i8, Result);

      ReplaceUses(SDValue(Node, 1), Result);
      LLVM_DEBUG(dbgs() << "=> "; Result.getNode()->dump(CurDAG);
                 dbgs() << '\n');
    }
    // Copy the division (low) result, if it is needed.
    if (!SDValue(Node, 0).use_empty()) {
      SDValue Result = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl,
                                              LoReg, NVT, InFlag);
      InFlag = Result.getValue(2);
      ReplaceUses(SDValue(Node, 0), Result);
      LLVM_DEBUG(dbgs() << "=> "; Result.getNode()->dump(CurDAG);
                 dbgs() << '\n');
    }
    // Copy the remainder (high) result, if it is needed.
    if (!SDValue(Node, 1).use_empty()) {
      SDValue Result = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl,
                                              HiReg, NVT, InFlag);
      InFlag = Result.getValue(2);
      ReplaceUses(SDValue(Node, 1), Result);
      LLVM_DEBUG(dbgs() << "=> "; Result.getNode()->dump(CurDAG);
                 dbgs() << '\n');
    }
    CurDAG->RemoveDeadNode(Node);
    return;
  }
  case X86ISD::FCMP:
  case X86ISD::STRICT_FCMP:
  case X86ISD::STRICT_FCMPS: {
    bool IsStrictCmp = Node->getOpcode() == X86ISD::STRICT_FCMP ||
                       Node->getOpcode() == X86ISD::STRICT_FCMPS;
    SDValue N0 = Node->getOperand(IsStrictCmp ? 1 : 0);
    SDValue N1 = Node->getOperand(IsStrictCmp ? 2 : 1);

    // Save the original VT of the compare.
    MVT CmpVT = N0.getSimpleValueType();

    // Floating point needs special handling if we don't have FCOMI.
    if (Subtarget->canUseCMOV())
      break;
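    // Without CMOV we also lack FCOMI, so the compare goes through the x87
    // status word instead: compare with FUCOM/FCOM, store FPSW to AX with
    // FNSTSW, extract AH and transfer the condition bits into EFLAGS with
    // SAHF.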
    bool IsSignaling = Node->getOpcode() == X86ISD::STRICT_FCMPS;

    unsigned Opc;
    switch (CmpVT.SimpleTy) {
    default: llvm_unreachable("Unexpected type!");
    case MVT::f32:
      Opc = IsSignaling ? X86::COM_Fpr32 : X86::UCOM_Fpr32;
      break;
    case MVT::f64:
      Opc = IsSignaling ? X86::COM_Fpr64 : X86::UCOM_Fpr64;
      break;
    case MVT::f80:
      Opc = IsSignaling ? X86::COM_Fpr80 : X86::UCOM_Fpr80;
      break;
    }

    SDValue Chain =
        IsStrictCmp ? Node->getOperand(0) : CurDAG->getEntryNode();
    SDValue Glue;
    if (IsStrictCmp) {
      SDVTList VTs = CurDAG->getVTList(MVT::Other, MVT::Glue);
      Chain = SDValue(CurDAG->getMachineNode(Opc, dl, VTs, {N0, N1, Chain}), 0);
      Glue = Chain.getValue(1);
    } else {
      Glue = SDValue(CurDAG->getMachineNode(Opc, dl, MVT::Glue, N0, N1), 0);
    }

    // Move FPSW to AX.
    SDValue FNSTSW =
        SDValue(CurDAG->getMachineNode(X86::FNSTSW16r, dl, MVT::i16, Glue), 0);

    // Extract upper 8-bits of AX.
    SDValue Extract =
        CurDAG->getTargetExtractSubreg(X86::sub_8bit_hi, dl, MVT::i8, FNSTSW);

    // Move AH into flags.
    // Some 64-bit targets lack SAHF support, but they do support FCOMI.
    assert(Subtarget->canUseLAHFSAHF() &&
           "Target doesn't support SAHF or FCOMI?");
    SDValue AH = CurDAG->getCopyToReg(Chain, dl, X86::AH, Extract, SDValue());
    Chain = AH;
    SDValue SAHF = SDValue(
        CurDAG->getMachineNode(X86::SAHF, dl, MVT::i32, AH.getValue(1)), 0);

    if (IsStrictCmp)
      ReplaceUses(SDValue(Node, 1), Chain);

    ReplaceUses(SDValue(Node, 0), SAHF);
    CurDAG->RemoveDeadNode(Node);
    return;
  }
  case X86ISD::CMP: {
    SDValue N0 = Node->getOperand(0);
    SDValue N1 = Node->getOperand(1);

    // Optimizations for TEST compares.
    if (!isNullConstant(N1))
      break;

    // Save the original VT of the compare.
    MVT CmpVT = N0.getSimpleValueType();

    // If we are comparing (and (shr X, C), Mask) with 0, emit a BEXTR followed
    // by a test instruction. The test should be removed later by
    // analyzeCompare if we are using only the zero flag.
    // TODO: Should we check the users and use the BEXTR flags directly?
    if (N0.getOpcode() == ISD::AND && N0.hasOneUse()) {
      if (MachineSDNode *NewNode = matchBEXTRFromAndImm(N0.getNode())) {
        unsigned TestOpc = CmpVT == MVT::i64 ? X86::TEST64rr
                                             : X86::TEST32rr;
        SDValue BEXTR = SDValue(NewNode, 0);
        NewNode = CurDAG->getMachineNode(TestOpc, dl, MVT::i32, BEXTR, BEXTR);
        ReplaceUses(SDValue(Node, 0), SDValue(NewNode, 0));
        CurDAG->RemoveDeadNode(Node);
        return;
      }
    }

    // We can peek through truncates, but we need to be careful below.
    if (N0.getOpcode() == ISD::TRUNCATE && N0.hasOneUse())
      N0 = N0.getOperand(0);

    // Look for (X86cmp (and $op, $imm), 0) and see if we can convert it to
    // use a smaller encoding.
    // Look past the truncate if CMP is the only use of it.
    if (N0.getOpcode() == ISD::AND && N0.getNode()->hasOneUse() &&
        N0.getValueType() != MVT::i8) {
      auto *MaskC = dyn_cast<ConstantSDNode>(N0.getOperand(1));
      if (!MaskC)
        break;

      // We may have looked through a truncate so mask off any bits that
      // shouldn't be part of the compare.
      uint64_t Mask = MaskC->getZExtValue();
      Mask &= maskTrailingOnes<uint64_t>(CmpVT.getScalarSizeInBits());

      // Check if we can replace AND+IMM{32,64} with a shift. This is possible
      // for masks like 0xFF000000 or 0x00FFFFFF and if we care only about the
      // zero flag.
      if (CmpVT == MVT::i64 && !isInt<8>(Mask) && isShiftedMask_64(Mask) &&
          onlyUsesZeroFlag(SDValue(Node, 0))) {
        unsigned ShiftOpcode = ISD::DELETED_NODE;
        unsigned ShiftAmt;
        unsigned SubRegIdx;
        MVT SubRegVT;
        unsigned TestOpcode;
        unsigned LeadingZeros = countLeadingZeros(Mask);
        unsigned TrailingZeros = countTrailingZeros(Mask);
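        // For example, (and X, 0xFFFF000000000000) compared against zero has
        // LeadingZeros == 0 and TrailingZeros == 48, so when only ZF is
        // consumed it becomes (shr X, 48) followed by a TEST of the result.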
        // With leading/trailing zeros, the transform is profitable if we can
        // eliminate a movabsq or shrink a 32-bit immediate to 8-bit without
        // incurring any extra register moves.
        bool SavesBytes = !isInt<32>(Mask) || N0.getOperand(0).hasOneUse();
        if (LeadingZeros == 0 && SavesBytes) {
          // If the mask covers the most significant bit, then we can replace
          // TEST+AND with a SHR and check eflags.
          // This emits a redundant TEST which is subsequently eliminated.
          ShiftOpcode = X86::SHR64ri;
          ShiftAmt = TrailingZeros;
          SubRegIdx = 0;
          TestOpcode = X86::TEST64rr;
        } else if (TrailingZeros == 0 && SavesBytes) {
          // If the mask covers the least significant bit, then we can replace
          // TEST+AND with a SHL and check eflags.
          // This emits a redundant TEST which is subsequently eliminated.
          ShiftOpcode = X86::SHL64ri;
          ShiftAmt = LeadingZeros;
          SubRegIdx = 0;
          TestOpcode = X86::TEST64rr;
        } else if (MaskC->hasOneUse() && !isInt<32>(Mask)) {
          // If the shifted mask extends into the high half and is 8/16/32 bits
          // wide, then replace it with a SHR and a TEST8rr/TEST16rr/TEST32rr.
          unsigned PopCount = 64 - LeadingZeros - TrailingZeros;
          if (PopCount == 8) {
            ShiftOpcode = X86::SHR64ri;
            ShiftAmt = TrailingZeros;
            SubRegIdx = X86::sub_8bit;
            SubRegVT = MVT::i8;
            TestOpcode = X86::TEST8rr;
          } else if (PopCount == 16) {
            ShiftOpcode = X86::SHR64ri;
            ShiftAmt = TrailingZeros;
            SubRegIdx = X86::sub_16bit;
            SubRegVT = MVT::i16;
            TestOpcode = X86::TEST16rr;
          } else if (PopCount == 32) {
            ShiftOpcode = X86::SHR64ri;
            ShiftAmt = TrailingZeros;
            SubRegIdx = X86::sub_32bit;
            SubRegVT = MVT::i32;
            TestOpcode = X86::TEST32rr;
          }
        }
        if (ShiftOpcode != ISD::DELETED_NODE) {
          SDValue ShiftC = CurDAG->getTargetConstant(ShiftAmt, dl, MVT::i64);
          SDValue Shift = SDValue(
              CurDAG->getMachineNode(ShiftOpcode, dl, MVT::i64, MVT::i32,
                                     N0.getOperand(0), ShiftC),
              0);
          if (SubRegIdx != 0) {
            Shift =
                CurDAG->getTargetExtractSubreg(SubRegIdx, dl, SubRegVT, Shift);
          }
          MachineSDNode *Test =
              CurDAG->getMachineNode(TestOpcode, dl, MVT::i32, Shift, Shift);
          ReplaceNode(Node, Test);
          return;
        }
      }

      MVT VT;
      int SubRegOp;
      unsigned ROpc, MOpc;

      // For each of these checks we need to be careful if the sign flag is
      // being used. It is only safe to use the sign flag in two conditions,
      // either the sign bit in the shrunken mask is zero or the final test
      // size is equal to the original compare size.
      if (isUInt<8>(Mask) &&
          (!(Mask & 0x80) || CmpVT == MVT::i8 ||
           hasNoSignFlagUses(SDValue(Node, 0)))) {
        // For example, convert "testl %eax, $8" to "testb %al, $8"
        VT = MVT::i8;
        SubRegOp = X86::sub_8bit;
        ROpc = X86::TEST8ri;
        MOpc = X86::TEST8mi;
      } else if (OptForMinSize && isUInt<16>(Mask) &&
                 (!(Mask & 0x8000) || CmpVT == MVT::i16 ||
                  hasNoSignFlagUses(SDValue(Node, 0)))) {
        // For example, "testl %eax, $32776" to "testw %ax, $32776".
        // NOTE: We only want to form TESTW instructions if optimizing for
        // min size. Otherwise we only save one byte and possibly get a length
        // changing prefix penalty in the decoders.
        VT = MVT::i16;
        SubRegOp = X86::sub_16bit;
        ROpc = X86::TEST16ri;
        MOpc = X86::TEST16mi;
      } else if (isUInt<32>(Mask) && N0.getValueType() != MVT::i16 &&
                 ((!(Mask & 0x80000000) &&
                   // Without minsize 16-bit Cmps can get here so we need to
                   // be sure we calculate the correct sign flag if needed.
                   (CmpVT != MVT::i16 || !(Mask & 0x8000))) ||
                  CmpVT == MVT::i32 ||
                  hasNoSignFlagUses(SDValue(Node, 0)))) {
        // For example, "testq %rax, $268468232" to "testl %eax, $268468232".
        // NOTE: We only want to run that transform if N0 is 32 or 64 bits.
        // Otherwise, we find ourselves in a position where we have to do
        // promotion. If previous passes did not promote the and, we assume
        // they had a good reason not to and do not promote here.
        VT = MVT::i32;
        SubRegOp = X86::sub_32bit;
        ROpc = X86::TEST32ri;
        MOpc = X86::TEST32mi;
      } else {
        // No eligible transformation was found.
        break;
      }

      SDValue Imm = CurDAG->getTargetConstant(Mask, dl, VT);
      SDValue Reg = N0.getOperand(0);

      // Emit a testl or testw.
      MachineSDNode *NewNode;
      SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
      if (tryFoldLoad(Node, N0.getNode(), Reg, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
        if (auto *LoadN = dyn_cast<LoadSDNode>(N0.getOperand(0).getNode())) {
          if (!LoadN->isSimple()) {
            unsigned NumVolBits = LoadN->getValueType(0).getSizeInBits();
            if ((MOpc == X86::TEST8mi && NumVolBits != 8) ||
                (MOpc == X86::TEST16mi && NumVolBits != 16) ||
                (MOpc == X86::TEST32mi && NumVolBits != 32))
              break;
          }
        }
        SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Imm,
                          Reg.getOperand(0) };
        NewNode = CurDAG->getMachineNode(MOpc, dl, MVT::i32, MVT::Other, Ops);
        // Update the chain.
        ReplaceUses(Reg.getValue(1), SDValue(NewNode, 1));
        // Record the mem-refs
        CurDAG->setNodeMemRefs(NewNode,
                               {cast<LoadSDNode>(Reg)->getMemOperand()});
      } else {
        // Extract the subregister if necessary.
        if (N0.getValueType() != VT)
          Reg = CurDAG->getTargetExtractSubreg(SubRegOp, dl, VT, Reg);

        NewNode = CurDAG->getMachineNode(ROpc, dl, MVT::i32, Reg, Imm);
      }
      // Replace CMP with TEST.
      ReplaceNode(Node, NewNode);
      return;
    }
    break;
  }
  case X86ISD::PCMPISTR: {
    if (!Subtarget->hasSSE42())
      break;
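    // PCMPISTRI returns its index result in ECX and PCMPISTRM returns its
    // mask result in XMM0, so if both results are live we have to emit both
    // forms of the instruction over the same inputs.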
    bool NeedIndex = !SDValue(Node, 0).use_empty();
    bool NeedMask = !SDValue(Node, 1).use_empty();
    // We can't fold a load if we are going to make two instructions.
    bool MayFoldLoad = !NeedIndex || !NeedMask;

    MachineSDNode *CNode;
    if (NeedMask) {
      unsigned ROpc = Subtarget->hasAVX() ? X86::VPCMPISTRMrr : X86::PCMPISTRMrr;
      unsigned MOpc = Subtarget->hasAVX() ? X86::VPCMPISTRMrm : X86::PCMPISTRMrm;
      CNode = emitPCMPISTR(ROpc, MOpc, MayFoldLoad, dl, MVT::v16i8, Node);
      ReplaceUses(SDValue(Node, 1), SDValue(CNode, 0));
    }
    if (NeedIndex || !NeedMask) {
      unsigned ROpc = Subtarget->hasAVX() ? X86::VPCMPISTRIrr : X86::PCMPISTRIrr;
      unsigned MOpc = Subtarget->hasAVX() ? X86::VPCMPISTRIrm : X86::PCMPISTRIrm;
      CNode = emitPCMPISTR(ROpc, MOpc, MayFoldLoad, dl, MVT::i32, Node);
      ReplaceUses(SDValue(Node, 0), SDValue(CNode, 0));
    }

    // Connect the flag usage to the last instruction created.
    ReplaceUses(SDValue(Node, 2), SDValue(CNode, 1));
    CurDAG->RemoveDeadNode(Node);
    return;
  }
  case X86ISD::PCMPESTR: {
    if (!Subtarget->hasSSE42())
      break;
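    // The explicit-length forms take the string lengths implicitly in EAX and
    // EDX; as with PCMPISTRI/PCMPISTRM, the index result is produced in ECX
    // and the mask result in XMM0.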
    // Copy the two implicit register inputs.
    SDValue InFlag = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, X86::EAX,
                                          Node->getOperand(1),
                                          SDValue()).getValue(1);
    InFlag = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, X86::EDX,
                                  Node->getOperand(3), InFlag).getValue(1);

    bool NeedIndex = !SDValue(Node, 0).use_empty();
    bool NeedMask = !SDValue(Node, 1).use_empty();
    // We can't fold a load if we are going to make two instructions.
    bool MayFoldLoad = !NeedIndex || !NeedMask;

    MachineSDNode *CNode;
    if (NeedMask) {
      unsigned ROpc = Subtarget->hasAVX() ? X86::VPCMPESTRMrr : X86::PCMPESTRMrr;
      unsigned MOpc = Subtarget->hasAVX() ? X86::VPCMPESTRMrm : X86::PCMPESTRMrm;
      CNode = emitPCMPESTR(ROpc, MOpc, MayFoldLoad, dl, MVT::v16i8, Node,
                           InFlag);
      ReplaceUses(SDValue(Node, 1), SDValue(CNode, 0));
    }
    if (NeedIndex || !NeedMask) {
      unsigned ROpc = Subtarget->hasAVX() ? X86::VPCMPESTRIrr : X86::PCMPESTRIrr;
      unsigned MOpc = Subtarget->hasAVX() ? X86::VPCMPESTRIrm : X86::PCMPESTRIrm;
      CNode = emitPCMPESTR(ROpc, MOpc, MayFoldLoad, dl, MVT::i32, Node, InFlag);
      ReplaceUses(SDValue(Node, 0), SDValue(CNode, 0));
    }
    // Connect the flag usage to the last instruction created.
    ReplaceUses(SDValue(Node, 2), SDValue(CNode, 1));
    CurDAG->RemoveDeadNode(Node);
    return;
  }
  case ISD::SETCC: {
    if (NVT.isVector() && tryVPTESTM(Node, SDValue(Node, 0), SDValue()))
      return;
    break;
  }
  case ISD::STORE:
    if (foldLoadStoreIntoMemOperand(Node))
      return;
    break;
  case X86ISD::SETCC_CARRY: {
    MVT VT = Node->getSimpleValueType(0);
    SDValue Result;
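    // SETB_C32r/SETB_C64r are pseudos that expand to an SBB of a register
    // with itself, materializing 0 or all-ones from the carry flag.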
    if (Subtarget->hasSBBDepBreaking()) {
      // We have to do this manually because tblgen will put the eflags copy in
      // the wrong place if we use an extract_subreg in the pattern.
      // Copy flags to the EFLAGS register and glue it to next node.
      SDValue EFLAGS =
          CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, X86::EFLAGS,
                               Node->getOperand(1), SDValue());

      // Create a 64-bit instruction if the result is 64-bits otherwise use the
      // 32-bit version.
      unsigned Opc = VT == MVT::i64 ? X86::SETB_C64r : X86::SETB_C32r;
      MVT SetVT = VT == MVT::i64 ? MVT::i64 : MVT::i32;
      Result = SDValue(
          CurDAG->getMachineNode(Opc, dl, SetVT, EFLAGS, EFLAGS.getValue(1)),
          0);
    } else {
      // The target does not recognize sbb with the same reg operand as a
      // no-source idiom, so we explicitly zero the input values.
      Result = getSBBZero(Node);
    }

    // For less than 32-bits we need to extract from the 32-bit node.
    if (VT == MVT::i8 || VT == MVT::i16) {
      int SubIndex = VT == MVT::i16 ? X86::sub_16bit : X86::sub_8bit;
      Result = CurDAG->getTargetExtractSubreg(SubIndex, dl, VT, Result);
    }

    ReplaceUses(SDValue(Node, 0), Result);
    CurDAG->RemoveDeadNode(Node);
    return;
  }
  case X86ISD::SBB: {
    if (isNullConstant(Node->getOperand(0)) &&
        isNullConstant(Node->getOperand(1))) {
      SDValue Result = getSBBZero(Node);

      // Replace the flag use.
      ReplaceUses(SDValue(Node, 1), Result.getValue(1));

      // Replace the result use.
      if (!SDValue(Node, 0).use_empty()) {
        // For less than 32-bits we need to extract from the 32-bit node.
        MVT VT = Node->getSimpleValueType(0);
        if (VT == MVT::i8 || VT == MVT::i16) {
          int SubIndex = VT == MVT::i16 ? X86::sub_16bit : X86::sub_8bit;
          Result = CurDAG->getTargetExtractSubreg(SubIndex, dl, VT, Result);
        }
        ReplaceUses(SDValue(Node, 0), Result);
      }

      CurDAG->RemoveDeadNode(Node);
      return;
    }
    break;
  }
  case X86ISD::MGATHER: {
    auto *Mgt = cast<X86MaskedGatherSDNode>(Node);
    SDValue IndexOp = Mgt->getIndex();
    SDValue Mask = Mgt->getMask();
    MVT IndexVT = IndexOp.getSimpleValueType();
    MVT ValueVT = Node->getSimpleValueType(0);
    MVT MaskVT = Mask.getSimpleValueType();

    // This is just to prevent crashes if the nodes are malformed somehow. We're
    // otherwise only doing loose type checking in here based on what a type
    // constraint would say, just like table based isel.
    if (!ValueVT.isVector() || !MaskVT.isVector())
      break;

    unsigned NumElts = ValueVT.getVectorNumElements();
    MVT ValueSVT = ValueVT.getVectorElementType();

    bool IsFP = ValueSVT.isFloatingPoint();
    unsigned EltSize = ValueSVT.getSizeInBits();

    unsigned Opc = 0;
    bool AVX512Gather = MaskVT.getVectorElementType() == MVT::i1;
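    // AVX-512 gathers take their mask as a vXi1 mask register, while the AVX2
    // forms take a vector register mask as wide as the result; the two
    // families also order the mask and address operands differently, which is
    // handled when the operand list is built below.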
    if (AVX512Gather) {
      if (IndexVT == MVT::v4i32 && NumElts == 4 && EltSize == 32)
        Opc = IsFP ? X86::VGATHERDPSZ128rm : X86::VPGATHERDDZ128rm;
      else if (IndexVT == MVT::v8i32 && NumElts == 8 && EltSize == 32)
        Opc = IsFP ? X86::VGATHERDPSZ256rm : X86::VPGATHERDDZ256rm;
      else if (IndexVT == MVT::v16i32 && NumElts == 16 && EltSize == 32)
        Opc = IsFP ? X86::VGATHERDPSZrm : X86::VPGATHERDDZrm;
      else if (IndexVT == MVT::v4i32 && NumElts == 2 && EltSize == 64)
        Opc = IsFP ? X86::VGATHERDPDZ128rm : X86::VPGATHERDQZ128rm;
      else if (IndexVT == MVT::v4i32 && NumElts == 4 && EltSize == 64)
        Opc = IsFP ? X86::VGATHERDPDZ256rm : X86::VPGATHERDQZ256rm;
      else if (IndexVT == MVT::v8i32 && NumElts == 8 && EltSize == 64)
        Opc = IsFP ? X86::VGATHERDPDZrm : X86::VPGATHERDQZrm;
      else if (IndexVT == MVT::v2i64 && NumElts == 4 && EltSize == 32)
        Opc = IsFP ? X86::VGATHERQPSZ128rm : X86::VPGATHERQDZ128rm;
      else if (IndexVT == MVT::v4i64 && NumElts == 4 && EltSize == 32)
        Opc = IsFP ? X86::VGATHERQPSZ256rm : X86::VPGATHERQDZ256rm;
      else if (IndexVT == MVT::v8i64 && NumElts == 8 && EltSize == 32)
        Opc = IsFP ? X86::VGATHERQPSZrm : X86::VPGATHERQDZrm;
      else if (IndexVT == MVT::v2i64 && NumElts == 2 && EltSize == 64)
        Opc = IsFP ? X86::VGATHERQPDZ128rm : X86::VPGATHERQQZ128rm;
      else if (IndexVT == MVT::v4i64 && NumElts == 4 && EltSize == 64)
        Opc = IsFP ? X86::VGATHERQPDZ256rm : X86::VPGATHERQQZ256rm;
      else if (IndexVT == MVT::v8i64 && NumElts == 8 && EltSize == 64)
        Opc = IsFP ? X86::VGATHERQPDZrm : X86::VPGATHERQQZrm;
    } else {
      assert(EVT(MaskVT) == EVT(ValueVT).changeVectorElementTypeToInteger() &&
             "Unexpected mask VT!");
      if (IndexVT == MVT::v4i32 && NumElts == 4 && EltSize == 32)
        Opc = IsFP ? X86::VGATHERDPSrm : X86::VPGATHERDDrm;
      else if (IndexVT == MVT::v8i32 && NumElts == 8 && EltSize == 32)
        Opc = IsFP ? X86::VGATHERDPSYrm : X86::VPGATHERDDYrm;
      else if (IndexVT == MVT::v4i32 && NumElts == 2 && EltSize == 64)
        Opc = IsFP ? X86::VGATHERDPDrm : X86::VPGATHERDQrm;
      else if (IndexVT == MVT::v4i32 && NumElts == 4 && EltSize == 64)
        Opc = IsFP ? X86::VGATHERDPDYrm : X86::VPGATHERDQYrm;
      else if (IndexVT == MVT::v2i64 && NumElts == 4 && EltSize == 32)
        Opc = IsFP ? X86::VGATHERQPSrm : X86::VPGATHERQDrm;
      else if (IndexVT == MVT::v4i64 && NumElts == 4 && EltSize == 32)
        Opc = IsFP ? X86::VGATHERQPSYrm : X86::VPGATHERQDYrm;
      else if (IndexVT == MVT::v2i64 && NumElts == 2 && EltSize == 64)
        Opc = IsFP ? X86::VGATHERQPDrm : X86::VPGATHERQQrm;
      else if (IndexVT == MVT::v4i64 && NumElts == 4 && EltSize == 64)
        Opc = IsFP ? X86::VGATHERQPDYrm : X86::VPGATHERQQYrm;
    }

    if (!Opc)
      break;

    SDValue Base, Scale, Index, Disp, Segment;
    if (!selectVectorAddr(Mgt, Mgt->getBasePtr(), IndexOp, Mgt->getScale(),
                          Base, Scale, Index, Disp, Segment))
      break;

    SDValue PassThru = Mgt->getPassThru();
    SDValue Chain = Mgt->getChain();
    // Gather instructions have a mask output that is not present in the ISD
    // node.
    SDVTList VTs = CurDAG->getVTList(ValueVT, MaskVT, MVT::Other);

    MachineSDNode *NewNode;
    if (AVX512Gather) {
      SDValue Ops[] = {PassThru, Mask, Base, Scale,
                       Index, Disp, Segment, Chain};
      NewNode = CurDAG->getMachineNode(Opc, SDLoc(dl), VTs, Ops);
    } else {
      SDValue Ops[] = {PassThru, Base, Scale, Index,
                       Disp, Segment, Mask, Chain};
      NewNode = CurDAG->getMachineNode(Opc, SDLoc(dl), VTs, Ops);
    }
    CurDAG->setNodeMemRefs(NewNode, {Mgt->getMemOperand()});
    ReplaceUses(SDValue(Node, 0), SDValue(NewNode, 0));
    ReplaceUses(SDValue(Node, 1), SDValue(NewNode, 2));
    CurDAG->RemoveDeadNode(Node);
    return;
  }
  case X86ISD::MSCATTER: {
    auto *Sc = cast<X86MaskedScatterSDNode>(Node);
    SDValue Value = Sc->getValue();
    SDValue IndexOp = Sc->getIndex();
    MVT IndexVT = IndexOp.getSimpleValueType();
    MVT ValueVT = Value.getSimpleValueType();

    // This is just to prevent crashes if the nodes are malformed somehow. We're
    // otherwise only doing loose type checking in here based on what a type
    // constraint would say, just like table based isel.
    if (!ValueVT.isVector())
      break;

    unsigned NumElts = ValueVT.getVectorNumElements();
    MVT ValueSVT = ValueVT.getVectorElementType();

    bool IsFP = ValueSVT.isFloatingPoint();
    unsigned EltSize = ValueSVT.getSizeInBits();

    unsigned Opc;
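    // Scatter is only available with AVX-512, so only EVEX-encoded opcodes
    // are considered; the opcode is chosen from the index type and the
    // element type and width of the stored value.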
    if (IndexVT == MVT::v4i32 && NumElts == 4 && EltSize == 32)
      Opc = IsFP ? X86::VSCATTERDPSZ128mr : X86::VPSCATTERDDZ128mr;
    else if (IndexVT == MVT::v8i32 && NumElts == 8 && EltSize == 32)
      Opc = IsFP ? X86::VSCATTERDPSZ256mr : X86::VPSCATTERDDZ256mr;
    else if (IndexVT == MVT::v16i32 && NumElts == 16 && EltSize == 32)
      Opc = IsFP ? X86::VSCATTERDPSZmr : X86::VPSCATTERDDZmr;
    else if (IndexVT == MVT::v4i32 && NumElts == 2 && EltSize == 64)
      Opc = IsFP ? X86::VSCATTERDPDZ128mr : X86::VPSCATTERDQZ128mr;
    else if (IndexVT == MVT::v4i32 && NumElts == 4 && EltSize == 64)
      Opc = IsFP ? X86::VSCATTERDPDZ256mr : X86::VPSCATTERDQZ256mr;
    else if (IndexVT == MVT::v8i32 && NumElts == 8 && EltSize == 64)
      Opc = IsFP ? X86::VSCATTERDPDZmr : X86::VPSCATTERDQZmr;
    else if (IndexVT == MVT::v2i64 && NumElts == 4 && EltSize == 32)
      Opc = IsFP ? X86::VSCATTERQPSZ128mr : X86::VPSCATTERQDZ128mr;
    else if (IndexVT == MVT::v4i64 && NumElts == 4 && EltSize == 32)
      Opc = IsFP ? X86::VSCATTERQPSZ256mr : X86::VPSCATTERQDZ256mr;
    else if (IndexVT == MVT::v8i64 && NumElts == 8 && EltSize == 32)
      Opc = IsFP ? X86::VSCATTERQPSZmr : X86::VPSCATTERQDZmr;
    else if (IndexVT == MVT::v2i64 && NumElts == 2 && EltSize == 64)
      Opc = IsFP ? X86::VSCATTERQPDZ128mr : X86::VPSCATTERQQZ128mr;
    else if (IndexVT == MVT::v4i64 && NumElts == 4 && EltSize == 64)
      Opc = IsFP ? X86::VSCATTERQPDZ256mr : X86::VPSCATTERQQZ256mr;
    else if (IndexVT == MVT::v8i64 && NumElts == 8 && EltSize == 64)
      Opc = IsFP ? X86::VSCATTERQPDZmr : X86::VPSCATTERQQZmr;
    else
      break;

    SDValue Base, Scale, Index, Disp, Segment;
    if (!selectVectorAddr(Sc, Sc->getBasePtr(), IndexOp, Sc->getScale(),
                          Base, Scale, Index, Disp, Segment))
      break;

    SDValue Mask = Sc->getMask();
    SDValue Chain = Sc->getChain();
    // Scatter instructions have a mask output that is not present in the ISD
    // node.
    SDVTList VTs = CurDAG->getVTList(Mask.getValueType(), MVT::Other);
    SDValue Ops[] = {Base, Scale, Index, Disp, Segment, Mask, Value, Chain};

    MachineSDNode *NewNode = CurDAG->getMachineNode(Opc, SDLoc(dl), VTs, Ops);
    CurDAG->setNodeMemRefs(NewNode, {Sc->getMemOperand()});
    ReplaceUses(SDValue(Node, 0), SDValue(NewNode, 1));
    CurDAG->RemoveDeadNode(Node);
    return;
  }
  case ISD::PREALLOCATED_SETUP: {
    auto *MFI = CurDAG->getMachineFunction().getInfo<X86MachineFunctionInfo>();
    auto CallId = MFI->getPreallocatedIdForCallSite(
        cast<SrcValueSDNode>(Node->getOperand(1))->getValue());
    SDValue Chain = Node->getOperand(0);
    SDValue CallIdValue = CurDAG->getTargetConstant(CallId, dl, MVT::i32);
    MachineSDNode *New = CurDAG->getMachineNode(
        TargetOpcode::PREALLOCATED_SETUP, dl, MVT::Other, CallIdValue, Chain);
    ReplaceUses(SDValue(Node, 0), SDValue(New, 0)); // Chain
    CurDAG->RemoveDeadNode(Node);
    return;
  }
  case ISD::PREALLOCATED_ARG: {
    auto *MFI = CurDAG->getMachineFunction().getInfo<X86MachineFunctionInfo>();
    auto CallId = MFI->getPreallocatedIdForCallSite(
        cast<SrcValueSDNode>(Node->getOperand(1))->getValue());
    SDValue Chain = Node->getOperand(0);
    SDValue CallIdValue = CurDAG->getTargetConstant(CallId, dl, MVT::i32);
    SDValue ArgIndex = Node->getOperand(2);
    SDValue Ops[3];
    Ops[0] = CallIdValue;
    Ops[1] = ArgIndex;
    Ops[2] = Chain;
    MachineSDNode *New = CurDAG->getMachineNode(
        TargetOpcode::PREALLOCATED_ARG, dl,
        CurDAG->getVTList(TLI->getPointerTy(CurDAG->getDataLayout()),
                          MVT::Other),
        Ops);
    ReplaceUses(SDValue(Node, 0), SDValue(New, 0)); // Arg pointer
    ReplaceUses(SDValue(Node, 1), SDValue(New, 1)); // Chain
    CurDAG->RemoveDeadNode(Node);
    return;
  }
  case X86ISD::AESENCWIDE128KL:
  case X86ISD::AESDECWIDE128KL:
  case X86ISD::AESENCWIDE256KL:
  case X86ISD::AESDECWIDE256KL: {
    if (!Subtarget->hasWIDEKL())
      break;

    unsigned Opcode;
    switch (Node->getOpcode()) {
    default:
      llvm_unreachable("Unexpected opcode!");
    case X86ISD::AESENCWIDE128KL:
      Opcode = X86::AESENCWIDE128KL;
      break;
    case X86ISD::AESDECWIDE128KL:
      Opcode = X86::AESDECWIDE128KL;
      break;
    case X86ISD::AESENCWIDE256KL:
      Opcode = X86::AESENCWIDE256KL;
      break;
    case X86ISD::AESDECWIDE256KL:
      Opcode = X86::AESDECWIDE256KL;
      break;
    }

    SDValue Chain = Node->getOperand(0);
    SDValue Addr = Node->getOperand(1);

    SDValue Base, Scale, Index, Disp, Segment;
    if (!selectAddr(Node, Addr, Base, Scale, Index, Disp, Segment))
      break;
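    // The WIDE Key Locker instructions process eight 128-bit blocks at once,
    // passed implicitly in XMM0-XMM7, so each block operand is copied into
    // its fixed register before the instruction is emitted.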
    Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM0, Node->getOperand(2),
                                 SDValue());
    Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM1, Node->getOperand(3),
                                 Chain.getValue(1));
    Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM2, Node->getOperand(4),
                                 Chain.getValue(1));
    Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM3, Node->getOperand(5),
                                 Chain.getValue(1));
    Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM4, Node->getOperand(6),
                                 Chain.getValue(1));
    Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM5, Node->getOperand(7),
                                 Chain.getValue(1));
    Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM6, Node->getOperand(8),
                                 Chain.getValue(1));
    Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM7, Node->getOperand(9),
                                 Chain.getValue(1));

    MachineSDNode *Res = CurDAG->getMachineNode(
        Opcode, dl, Node->getVTList(),
        {Base, Scale, Index, Disp, Segment, Chain, Chain.getValue(1)});
    CurDAG->setNodeMemRefs(Res, cast<MemSDNode>(Node)->getMemOperand());
    ReplaceNode(Node, Res);
    return;
  }
  }

  SelectCode(Node);
}

bool X86DAGToDAGISel::
SelectInlineAsmMemoryOperand(const SDValue &Op, unsigned ConstraintID,
                             std::vector<SDValue> &OutOps) {
  SDValue Op0, Op1, Op2, Op3, Op4;
  switch (ConstraintID) {
  default:
    llvm_unreachable("Unexpected asm memory constraint");
  case InlineAsm::Constraint_o: // offsetable        ??
  case InlineAsm::Constraint_v: // not offsetable    ??
  case InlineAsm::Constraint_m: // memory
  case InlineAsm::Constraint_X:
  case InlineAsm::Constraint_p: // address
    if (!selectAddr(nullptr, Op, Op0, Op1, Op2, Op3, Op4))
      return true;
    break;
  }
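  // selectAddr decomposed the operand into the standard five-part X86 memory
  // reference: base, scale, index, displacement and segment.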
  OutOps.push_back(Op0);
  OutOps.push_back(Op1);
  OutOps.push_back(Op2);
  OutOps.push_back(Op3);
  OutOps.push_back(Op4);
  return false;
}

/// This pass converts a legalized DAG into an X86-specific DAG,
/// ready for instruction scheduling.
FunctionPass *llvm::createX86ISelDag(X86TargetMachine &TM,
                                     CodeGenOpt::Level OptLevel) {
  return new X86DAGToDAGISel(TM, OptLevel);
}