ARMISelLowering.cpp 826 KB

430914310143111431214313143141431514316143171431814319143201432114322143231432414325143261432714328143291433014331143321433314334143351433614337143381433914340143411434214343143441434514346143471434814349143501435114352143531435414355143561435714358143591436014361143621436314364143651436614367143681436914370143711437214373143741437514376143771437814379143801438114382143831438414385143861438714388143891439014391143921439314394143951439614397143981439914400144011440214403144041440514406144071440814409144101441114412144131441414415144161441714418144191442014421144221442314424144251442614427144281442914430144311443214433144341443514436144371443814439144401444114442144431444414445144461444714448144491445014451144521445314454144551445614457144581445914460144611446214463144641446514466144671446814469144701447114472144731447414475144761447714478144791448014481144821448314484144851448614487144881448914490144911449214493144941449514496144971449814499145001450114502145031450414505145061450714508145091451014511145121451314514145151451614517145181451914520145211452214523145241452514526145271452814529145301453114532145331453414535145361453714538145391454014541145421454314544145451454614547145481454914550145511455214553145541455514556145571455814559145601456114562145631456414565145661456714568145691457014571145721457314574145751457614577145781457914580145811458214583145841458514586145871458814589145901459114592145931459414595145961459714598145991460014601146021460314604146051460614607146081460914610146111461214613146141461514616146171461814619146201462114622146231462414625146261462714628146291463014631146321463314634146351463614637146381463914640146411464214643146441464514646146471464814649146501465114652146531465414655146561465714658146591466014661146621466314664146651466614667146681466914670146711467214673146741467514676146771467814679146801468114682146831468414685146861468714688146891469014691146921469314694146951469614697146981469914700147011470214703147041470514706147071470814709147101471114712147131471414715147161471714718147191472014721147221472314724147251472614727147281472914730147311473214733147341473514736147371473814739147401474114742147431474414745147461474714748147491475014751147521475314754147551475614757147581475914760147611476214763147641476514766147671476814769147701477114772147731477414775147761477714778147791478014781147821478314784147851478614787147881478914790147911479214793147941479514796147971479814799148001480114802148031480414805148061480714808148091481014811148121481314814148151481614817148181481914820148211482214823148241482514826148271482814829148301483114832148331483414835148361483714838148391484014841148421484314844148451484614847148481484914850148511485214853148541485514856148571485814859148601486114862148631486414865148661486714868148691487014871148721487314874148751487614877148781487914880148811488214883148841488514886148871488814889148901489114892148931489414895148961489714898148991490014901149021490314904149051490614907149081490914910149111491214913149141491514916149171491814919149201492114922149231492414925149261492714928149291493014931149321493314934149351493614937149381493914940149411494214943149441494514946149471494814949149501495114952149531495414955149561495714958149591496014961149621496314964149651496614967149681496914970149711497214973149741497514976149771497814979149801498114982149831498414985149861498714988149891499014991149921499314994149951499614997149981499915000150011500215003150041500515006150071500815009150101501115012150131501415015150161501715018150191
502015021150221502315024150251502615027150281502915030150311503215033150341503515036150371503815039150401504115042150431504415045150461504715048150491505015051150521505315054150551505615057150581505915060150611506215063150641506515066150671506815069150701507115072150731507415075150761507715078150791508015081150821508315084150851508615087150881508915090150911509215093150941509515096150971509815099151001510115102151031510415105151061510715108151091511015111151121511315114151151511615117151181511915120151211512215123151241512515126151271512815129151301513115132151331513415135151361513715138151391514015141151421514315144151451514615147151481514915150151511515215153151541515515156151571515815159151601516115162151631516415165151661516715168151691517015171151721517315174151751517615177151781517915180151811518215183151841518515186151871518815189151901519115192151931519415195151961519715198151991520015201152021520315204152051520615207152081520915210152111521215213152141521515216152171521815219152201522115222152231522415225152261522715228152291523015231152321523315234152351523615237152381523915240152411524215243152441524515246152471524815249152501525115252152531525415255152561525715258152591526015261152621526315264152651526615267152681526915270152711527215273152741527515276152771527815279152801528115282152831528415285152861528715288152891529015291152921529315294152951529615297152981529915300153011530215303153041530515306153071530815309153101531115312153131531415315153161531715318153191532015321153221532315324153251532615327153281532915330153311533215333153341533515336153371533815339153401534115342153431534415345153461534715348153491535015351153521535315354153551535615357153581535915360153611536215363153641536515366153671536815369153701537115372153731537415375153761537715378153791538015381153821538315384153851538615387153881538915390153911539215393153941539515396153971539815399154001540115402154031540415405154061540715408154091541015411154121541315414154151541615417154181541915420154211542215423154241542515426154271542815429154301543115432154331543415435154361543715438154391544015441154421544315444154451544615447154481544915450154511545215453154541545515456154571545815459154601546115462154631546415465154661546715468154691547015471154721547315474154751547615477154781547915480154811548215483154841548515486154871548815489154901549115492154931549415495154961549715498154991550015501155021550315504155051550615507155081550915510155111551215513155141551515516155171551815519155201552115522155231552415525155261552715528155291553015531155321553315534155351553615537155381553915540155411554215543155441554515546155471554815549155501555115552155531555415555155561555715558155591556015561155621556315564155651556615567155681556915570155711557215573155741557515576155771557815579155801558115582155831558415585155861558715588155891559015591155921559315594155951559615597155981559915600156011560215603156041560515606156071560815609156101561115612156131561415615156161561715618156191562015621156221562315624156251562615627156281562915630156311563215633156341563515636156371563815639156401564115642156431564415645156461564715648156491565015651156521565315654156551565615657156581565915660156611566215663156641566515666156671566815669156701567115672156731567415675156761567715678156791568015681156821568315684156851568615687156881568915690156911569215693156941569515696156971569815699157001570115702157031570415705157061570715708157091571015711157121571315714157151571615717157181571915720157211572215723157241572515726157271572815729157301
573115732157331573415735157361573715738157391574015741157421574315744157451574615747157481574915750157511575215753157541575515756157571575815759157601576115762157631576415765157661576715768157691577015771157721577315774157751577615777157781577915780157811578215783157841578515786157871578815789157901579115792157931579415795157961579715798157991580015801158021580315804158051580615807158081580915810158111581215813158141581515816158171581815819158201582115822158231582415825158261582715828158291583015831158321583315834158351583615837158381583915840158411584215843158441584515846158471584815849158501585115852158531585415855158561585715858158591586015861158621586315864158651586615867158681586915870158711587215873158741587515876158771587815879158801588115882158831588415885158861588715888158891589015891158921589315894158951589615897158981589915900159011590215903159041590515906159071590815909159101591115912159131591415915159161591715918159191592015921159221592315924159251592615927159281592915930159311593215933159341593515936159371593815939159401594115942159431594415945159461594715948159491595015951159521595315954159551595615957159581595915960159611596215963159641596515966159671596815969159701597115972159731597415975159761597715978159791598015981159821598315984159851598615987159881598915990159911599215993159941599515996159971599815999160001600116002160031600416005160061600716008160091601016011160121601316014160151601616017160181601916020160211602216023160241602516026160271602816029160301603116032160331603416035160361603716038160391604016041160421604316044160451604616047160481604916050160511605216053160541605516056160571605816059160601606116062160631606416065160661606716068160691607016071160721607316074160751607616077160781607916080160811608216083160841608516086160871608816089160901609116092160931609416095160961609716098160991610016101161021610316104161051610616107161081610916110161111611216113161141611516116161171611816119161201612116122161231612416125161261612716128161291613016131161321613316134161351613616137161381613916140161411614216143161441614516146161471614816149161501615116152161531615416155161561615716158161591616016161161621616316164161651616616167161681616916170161711617216173161741617516176161771617816179161801618116182161831618416185161861618716188161891619016191161921619316194161951619616197161981619916200162011620216203162041620516206162071620816209162101621116212162131621416215162161621716218162191622016221162221622316224162251622616227162281622916230162311623216233162341623516236162371623816239162401624116242162431624416245162461624716248162491625016251162521625316254162551625616257162581625916260162611626216263162641626516266162671626816269162701627116272162731627416275162761627716278162791628016281162821628316284162851628616287162881628916290162911629216293162941629516296162971629816299163001630116302163031630416305163061630716308163091631016311163121631316314163151631616317163181631916320163211632216323163241632516326163271632816329163301633116332163331633416335163361633716338163391634016341163421634316344163451634616347163481634916350163511635216353163541635516356163571635816359163601636116362163631636416365163661636716368163691637016371163721637316374163751637616377163781637916380163811638216383163841638516386163871638816389163901639116392163931639416395163961639716398163991640016401164021640316404164051640616407164081640916410164111641216413164141641516416164171641816419164201642116422164231642416425164261642716428164291643016431164321643316434164351643616437164381643916440164411
644216443164441644516446164471644816449164501645116452164531645416455164561645716458164591646016461164621646316464164651646616467164681646916470164711647216473164741647516476164771647816479164801648116482164831648416485164861648716488164891649016491164921649316494164951649616497164981649916500165011650216503165041650516506165071650816509165101651116512165131651416515165161651716518165191652016521165221652316524165251652616527165281652916530165311653216533165341653516536165371653816539165401654116542165431654416545165461654716548165491655016551165521655316554165551655616557165581655916560165611656216563165641656516566165671656816569165701657116572165731657416575165761657716578165791658016581165821658316584165851658616587165881658916590165911659216593165941659516596165971659816599166001660116602166031660416605166061660716608166091661016611166121661316614166151661616617166181661916620166211662216623166241662516626166271662816629166301663116632166331663416635166361663716638166391664016641166421664316644166451664616647166481664916650166511665216653166541665516656166571665816659166601666116662166631666416665166661666716668166691667016671166721667316674166751667616677166781667916680166811668216683166841668516686166871668816689166901669116692166931669416695166961669716698166991670016701167021670316704167051670616707167081670916710167111671216713167141671516716167171671816719167201672116722167231672416725167261672716728167291673016731167321673316734167351673616737167381673916740167411674216743167441674516746167471674816749167501675116752167531675416755167561675716758167591676016761167621676316764167651676616767167681676916770167711677216773167741677516776167771677816779167801678116782167831678416785167861678716788167891679016791167921679316794167951679616797167981679916800168011680216803168041680516806168071680816809168101681116812168131681416815168161681716818168191682016821168221682316824168251682616827168281682916830168311683216833168341683516836168371683816839168401684116842168431684416845168461684716848168491685016851168521685316854168551685616857168581685916860168611686216863168641686516866168671686816869168701687116872168731687416875168761687716878168791688016881168821688316884168851688616887168881688916890168911689216893168941689516896168971689816899169001690116902169031690416905169061690716908169091691016911169121691316914169151691616917169181691916920169211692216923169241692516926169271692816929169301693116932169331693416935169361693716938169391694016941169421694316944169451694616947169481694916950169511695216953169541695516956169571695816959169601696116962169631696416965169661696716968169691697016971169721697316974169751697616977169781697916980169811698216983169841698516986169871698816989169901699116992169931699416995169961699716998169991700017001170021700317004170051700617007170081700917010170111701217013170141701517016170171701817019170201702117022170231702417025170261702717028170291703017031170321703317034170351703617037170381703917040170411704217043170441704517046170471704817049170501705117052170531705417055170561705717058170591706017061170621706317064170651706617067170681706917070170711707217073170741707517076170771707817079170801708117082170831708417085170861708717088170891709017091170921709317094170951709617097170981709917100171011710217103171041710517106171071710817109171101711117112171131711417115171161711717118171191712017121171221712317124171251712617127171281712917130171311713217133171341713517136171371713817139171401714117142171431714417145171461714717148171491715017151171521
715317154171551715617157171581715917160171611716217163171641716517166171671716817169171701717117172171731717417175171761717717178171791718017181171821718317184171851718617187171881718917190171911719217193171941719517196171971719817199172001720117202172031720417205172061720717208172091721017211172121721317214172151721617217172181721917220172211722217223172241722517226172271722817229172301723117232172331723417235172361723717238172391724017241172421724317244172451724617247172481724917250172511725217253172541725517256172571725817259172601726117262172631726417265172661726717268172691727017271172721727317274172751727617277172781727917280172811728217283172841728517286172871728817289172901729117292172931729417295172961729717298172991730017301173021730317304173051730617307173081730917310173111731217313173141731517316173171731817319173201732117322173231732417325173261732717328173291733017331173321733317334173351733617337173381733917340173411734217343173441734517346173471734817349173501735117352173531735417355173561735717358173591736017361173621736317364173651736617367173681736917370173711737217373173741737517376173771737817379173801738117382173831738417385173861738717388173891739017391173921739317394173951739617397173981739917400174011740217403174041740517406174071740817409174101741117412174131741417415174161741717418174191742017421174221742317424174251742617427174281742917430174311743217433174341743517436174371743817439174401744117442174431744417445174461744717448174491745017451174521745317454174551745617457174581745917460174611746217463174641746517466174671746817469174701747117472174731747417475174761747717478174791748017481174821748317484174851748617487174881748917490174911749217493174941749517496174971749817499175001750117502175031750417505175061750717508175091751017511175121751317514175151751617517175181751917520175211752217523175241752517526175271752817529175301753117532175331753417535175361753717538175391754017541175421754317544175451754617547175481754917550175511755217553175541755517556175571755817559175601756117562175631756417565175661756717568175691757017571175721757317574175751757617577175781757917580175811758217583175841758517586175871758817589175901759117592175931759417595175961759717598175991760017601176021760317604176051760617607176081760917610176111761217613176141761517616176171761817619176201762117622176231762417625176261762717628176291763017631176321763317634176351763617637176381763917640176411764217643176441764517646176471764817649176501765117652176531765417655176561765717658176591766017661176621766317664176651766617667176681766917670176711767217673176741767517676176771767817679176801768117682176831768417685176861768717688176891769017691176921769317694176951769617697176981769917700177011770217703177041770517706177071770817709177101771117712177131771417715177161771717718177191772017721177221772317724177251772617727177281772917730177311773217733177341773517736177371773817739177401774117742177431774417745177461774717748177491775017751177521775317754177551775617757177581775917760177611776217763177641776517766177671776817769177701777117772177731777417775177761777717778177791778017781177821778317784177851778617787177881778917790177911779217793177941779517796177971779817799178001780117802178031780417805178061780717808178091781017811178121781317814178151781617817178181781917820178211782217823178241782517826178271782817829178301783117832178331783417835178361783717838178391784017841178421784317844178451784617847178481784917850178511785217853178541785517856178571785817859178601786117862178631
786417865178661786717868178691787017871178721787317874178751787617877178781787917880178811788217883178841788517886178871788817889178901789117892178931789417895178961789717898178991790017901179021790317904179051790617907179081790917910179111791217913179141791517916179171791817919179201792117922179231792417925179261792717928179291793017931179321793317934179351793617937179381793917940179411794217943179441794517946179471794817949179501795117952179531795417955179561795717958179591796017961179621796317964179651796617967179681796917970179711797217973179741797517976179771797817979179801798117982179831798417985179861798717988179891799017991179921799317994179951799617997179981799918000180011800218003180041800518006180071800818009180101801118012180131801418015180161801718018180191802018021180221802318024180251802618027180281802918030180311803218033180341803518036180371803818039180401804118042180431804418045180461804718048180491805018051180521805318054180551805618057180581805918060180611806218063180641806518066180671806818069180701807118072180731807418075180761807718078180791808018081180821808318084180851808618087180881808918090180911809218093180941809518096180971809818099181001810118102181031810418105181061810718108181091811018111181121811318114181151811618117181181811918120181211812218123181241812518126181271812818129181301813118132181331813418135181361813718138181391814018141181421814318144181451814618147181481814918150181511815218153181541815518156181571815818159181601816118162181631816418165181661816718168181691817018171181721817318174181751817618177181781817918180181811818218183181841818518186181871818818189181901819118192181931819418195181961819718198181991820018201182021820318204182051820618207182081820918210182111821218213182141821518216182171821818219182201822118222182231822418225182261822718228182291823018231182321823318234182351823618237182381823918240182411824218243182441824518246182471824818249182501825118252182531825418255182561825718258182591826018261182621826318264182651826618267182681826918270182711827218273182741827518276182771827818279182801828118282182831828418285182861828718288182891829018291182921829318294182951829618297182981829918300183011830218303183041830518306183071830818309183101831118312183131831418315183161831718318183191832018321183221832318324183251832618327183281832918330183311833218333183341833518336183371833818339183401834118342183431834418345183461834718348183491835018351183521835318354183551835618357183581835918360183611836218363183641836518366183671836818369183701837118372183731837418375183761837718378183791838018381183821838318384183851838618387183881838918390183911839218393183941839518396183971839818399184001840118402184031840418405184061840718408184091841018411184121841318414184151841618417184181841918420184211842218423184241842518426184271842818429184301843118432184331843418435184361843718438184391844018441184421844318444184451844618447184481844918450184511845218453184541845518456184571845818459184601846118462184631846418465184661846718468184691847018471184721847318474184751847618477184781847918480184811848218483184841848518486184871848818489184901849118492184931849418495184961849718498184991850018501185021850318504185051850618507185081850918510185111851218513185141851518516185171851818519185201852118522185231852418525185261852718528185291853018531185321853318534185351853618537185381853918540185411854218543185441854518546185471854818549185501855118552185531855418555185561855718558185591856018561185621856318564185651856618567185681856918570185711857218573185741
857518576185771857818579185801858118582185831858418585185861858718588185891859018591185921859318594185951859618597185981859918600186011860218603186041860518606186071860818609186101861118612186131861418615186161861718618186191862018621186221862318624186251862618627186281862918630186311863218633186341863518636186371863818639186401864118642186431864418645186461864718648186491865018651186521865318654186551865618657186581865918660186611866218663186641866518666186671866818669186701867118672186731867418675186761867718678186791868018681186821868318684186851868618687186881868918690186911869218693186941869518696186971869818699187001870118702187031870418705187061870718708187091871018711187121871318714187151871618717187181871918720187211872218723187241872518726187271872818729187301873118732187331873418735187361873718738187391874018741187421874318744187451874618747187481874918750187511875218753187541875518756187571875818759187601876118762187631876418765187661876718768187691877018771187721877318774187751877618777187781877918780187811878218783187841878518786187871878818789187901879118792187931879418795187961879718798187991880018801188021880318804188051880618807188081880918810188111881218813188141881518816188171881818819188201882118822188231882418825188261882718828188291883018831188321883318834188351883618837188381883918840188411884218843188441884518846188471884818849188501885118852188531885418855188561885718858188591886018861188621886318864188651886618867188681886918870188711887218873188741887518876188771887818879188801888118882188831888418885188861888718888188891889018891188921889318894188951889618897188981889918900189011890218903189041890518906189071890818909189101891118912189131891418915189161891718918189191892018921189221892318924189251892618927189281892918930189311893218933189341893518936189371893818939189401894118942189431894418945189461894718948189491895018951189521895318954189551895618957189581895918960189611896218963189641896518966189671896818969189701897118972189731897418975189761897718978189791898018981189821898318984189851898618987189881898918990189911899218993189941899518996189971899818999190001900119002190031900419005190061900719008190091901019011190121901319014190151901619017190181901919020190211902219023190241902519026190271902819029190301903119032190331903419035190361903719038190391904019041190421904319044190451904619047190481904919050190511905219053190541905519056190571905819059190601906119062190631906419065190661906719068190691907019071190721907319074190751907619077190781907919080190811908219083190841908519086190871908819089190901909119092190931909419095190961909719098190991910019101191021910319104191051910619107191081910919110191111911219113191141911519116191171911819119191201912119122191231912419125191261912719128191291913019131191321913319134191351913619137191381913919140191411914219143191441914519146191471914819149191501915119152191531915419155191561915719158191591916019161191621916319164191651916619167191681916919170191711917219173191741917519176191771917819179191801918119182191831918419185191861918719188191891919019191191921919319194191951919619197191981919919200192011920219203192041920519206192071920819209192101921119212192131921419215192161921719218192191922019221192221922319224192251922619227192281922919230192311923219233192341923519236192371923819239192401924119242192431924419245192461924719248192491925019251192521925319254192551925619257192581925919260192611926219263192641926519266192671926819269192701927119272192731927419275192761927719278192791928019281192821928319284192851
928619287192881928919290192911929219293192941929519296192971929819299193001930119302193031930419305193061930719308193091931019311193121931319314193151931619317193181931919320193211932219323193241932519326193271932819329193301933119332193331933419335193361933719338193391934019341193421934319344193451934619347193481934919350193511935219353193541935519356193571935819359193601936119362193631936419365193661936719368193691937019371193721937319374193751937619377193781937919380193811938219383193841938519386193871938819389193901939119392193931939419395193961939719398193991940019401194021940319404194051940619407194081940919410194111941219413194141941519416194171941819419194201942119422194231942419425194261942719428194291943019431194321943319434194351943619437194381943919440194411944219443194441944519446194471944819449194501945119452194531945419455194561945719458194591946019461194621946319464194651946619467194681946919470194711947219473194741947519476194771947819479194801948119482194831948419485194861948719488194891949019491194921949319494194951949619497194981949919500195011950219503195041950519506195071950819509195101951119512195131951419515195161951719518195191952019521195221952319524195251952619527195281952919530195311953219533195341953519536195371953819539195401954119542195431954419545195461954719548195491955019551195521955319554195551955619557195581955919560195611956219563195641956519566195671956819569195701957119572195731957419575195761957719578195791958019581195821958319584195851958619587195881958919590195911959219593195941959519596195971959819599196001960119602196031960419605196061960719608196091961019611196121961319614196151961619617196181961919620196211962219623196241962519626196271962819629196301963119632196331963419635196361963719638196391964019641196421964319644196451964619647196481964919650196511965219653196541965519656196571965819659196601966119662196631966419665196661966719668196691967019671196721967319674196751967619677196781967919680196811968219683196841968519686196871968819689196901969119692196931969419695196961969719698196991970019701197021970319704197051970619707197081970919710197111971219713197141971519716197171971819719197201972119722197231972419725197261972719728197291973019731197321973319734197351973619737197381973919740197411974219743197441974519746197471974819749197501975119752197531975419755197561975719758197591976019761197621976319764197651976619767197681976919770197711977219773197741977519776197771977819779197801978119782197831978419785197861978719788197891979019791197921979319794197951979619797197981979919800198011980219803198041980519806198071980819809198101981119812198131981419815198161981719818198191982019821198221982319824198251982619827198281982919830198311983219833198341983519836198371983819839198401984119842198431984419845198461984719848198491985019851198521985319854198551985619857198581985919860198611986219863198641986519866198671986819869198701987119872198731987419875198761987719878198791988019881198821988319884198851988619887198881988919890198911989219893198941989519896198971989819899199001990119902199031990419905199061990719908199091991019911199121991319914199151991619917199181991919920199211992219923199241992519926199271992819929199301993119932199331993419935199361993719938199391994019941199421994319944199451994619947199481994919950199511995219953199541995519956199571995819959199601996119962199631996419965199661996719968199691997019971199721997319974199751997619977199781997919980199811998219983199841998519986199871998819989199901999119992199931999419995199961
999719998199992000020001200022000320004200052000620007200082000920010200112001220013200142001520016200172001820019200202002120022200232002420025200262002720028200292003020031200322003320034200352003620037200382003920040200412004220043200442004520046200472004820049200502005120052200532005420055200562005720058200592006020061200622006320064200652006620067200682006920070200712007220073200742007520076200772007820079200802008120082200832008420085200862008720088200892009020091200922009320094200952009620097200982009920100201012010220103201042010520106201072010820109201102011120112201132011420115201162011720118201192012020121201222012320124201252012620127201282012920130201312013220133201342013520136201372013820139201402014120142201432014420145201462014720148201492015020151201522015320154201552015620157201582015920160201612016220163201642016520166201672016820169201702017120172201732017420175201762017720178201792018020181201822018320184201852018620187201882018920190201912019220193201942019520196201972019820199202002020120202202032020420205202062020720208202092021020211202122021320214202152021620217202182021920220202212022220223202242022520226202272022820229202302023120232202332023420235202362023720238202392024020241202422024320244202452024620247202482024920250202512025220253202542025520256202572025820259202602026120262202632026420265202662026720268202692027020271202722027320274202752027620277202782027920280202812028220283202842028520286202872028820289202902029120292202932029420295202962029720298202992030020301203022030320304203052030620307203082030920310203112031220313203142031520316203172031820319203202032120322203232032420325203262032720328203292033020331203322033320334203352033620337203382033920340203412034220343203442034520346203472034820349203502035120352203532035420355203562035720358203592036020361203622036320364203652036620367203682036920370203712037220373203742037520376203772037820379203802038120382203832038420385203862038720388203892039020391203922039320394203952039620397203982039920400204012040220403204042040520406204072040820409204102041120412204132041420415204162041720418204192042020421204222042320424204252042620427204282042920430204312043220433204342043520436204372043820439204402044120442204432044420445204462044720448204492045020451204522045320454204552045620457204582045920460204612046220463204642046520466204672046820469204702047120472204732047420475204762047720478204792048020481204822048320484204852048620487204882048920490204912049220493204942049520496204972049820499205002050120502205032050420505205062050720508205092051020511205122051320514205152051620517205182051920520205212052220523205242052520526205272052820529205302053120532205332053420535205362053720538205392054020541205422054320544205452054620547205482054920550205512055220553205542055520556205572055820559205602056120562205632056420565205662056720568205692057020571205722057320574205752057620577205782057920580205812058220583205842058520586205872058820589205902059120592205932059420595205962059720598205992060020601206022060320604206052060620607206082060920610206112061220613206142061520616206172061820619206202062120622206232062420625206262062720628206292063020631206322063320634206352063620637206382063920640206412064220643206442064520646206472064820649206502065120652206532065420655206562065720658206592066020661206622066320664206652066620667206682066920670206712067220673206742067520676206772067820679206802068120682206832068420685206862068720688206892069020691206922069320694206952069620697206982069920700207012070220703207042070520706207072
070820709207102071120712207132071420715207162071720718207192072020721207222072320724207252072620727207282072920730207312073220733207342073520736207372073820739207402074120742207432074420745207462074720748207492075020751207522075320754207552075620757207582075920760207612076220763207642076520766207672076820769207702077120772207732077420775207762077720778207792078020781207822078320784207852078620787207882078920790207912079220793207942079520796207972079820799208002080120802208032080420805208062080720808208092081020811208122081320814208152081620817208182081920820208212082220823208242082520826208272082820829208302083120832208332083420835208362083720838208392084020841208422084320844208452084620847208482084920850208512085220853208542085520856208572085820859208602086120862208632086420865208662086720868208692087020871208722087320874208752087620877208782087920880208812088220883208842088520886208872088820889208902089120892208932089420895208962089720898208992090020901209022090320904209052090620907209082090920910209112091220913209142091520916209172091820919209202092120922209232092420925209262092720928209292093020931209322093320934209352093620937209382093920940209412094220943209442094520946209472094820949209502095120952209532095420955209562095720958209592096020961209622096320964209652096620967209682096920970209712097220973209742097520976209772097820979209802098120982209832098420985209862098720988209892099020991209922099320994209952099620997209982099921000210012100221003210042100521006210072100821009210102101121012210132101421015210162101721018210192102021021210222102321024210252102621027210282102921030210312103221033210342103521036210372103821039210402104121042210432104421045210462104721048210492105021051210522105321054210552105621057210582105921060210612106221063210642106521066210672106821069210702107121072210732107421075210762107721078210792108021081210822108321084210852108621087210882108921090210912109221093210942109521096210972109821099211002110121102211032110421105211062110721108211092111021111211122111321114211152111621117211182111921120211212112221123211242112521126211272112821129211302113121132211332113421135211362113721138211392114021141211422114321144211452114621147211482114921150211512115221153211542115521156211572115821159211602116121162211632116421165211662116721168211692117021171211722117321174211752117621177211782117921180211812118221183211842118521186211872118821189211902119121192211932119421195211962119721198211992120021201212022120321204212052120621207212082120921210212112121221213212142121521216212172121821219212202122121222212232122421225212262122721228212292123021231212322123321234212352123621237212382123921240212412124221243212442124521246212472124821249212502125121252212532125421255212562125721258212592126021261212622126321264212652126621267212682126921270212712127221273212742127521276212772127821279212802128121282212832128421285212862128721288212892129021291212922129321294212952129621297212982129921300213012130221303213042130521306213072130821309213102131121312213132131421315213162131721318213192132021321213222132321324213252132621327213282132921330213312133221333213342133521336213372133821339213402134121342213432134421345213462134721348213492135021351213522135321354213552135621357213582135921360213612136221363213642136521366213672136821369213702137121372213732137421375213762137721378213792138021381213822138321384213852138621387213882138921390213912139221393213942139521396213972139821399214002140121402214032140421405214062140721408214092141021411214122141321414214152141621417214182
14192142021421214222142321424214252142621427214282142921430214312143221433214342143521436214372143821439214402144121442214432144421445214462144721448214492145021451214522145321454214552145621457214582145921460214612146221463214642146521466214672146821469214702147121472214732147421475214762147721478214792148021481214822148321484214852148621487214882148921490214912149221493214942149521496214972149821499215002150121502215032150421505215062150721508215092151021511215122151321514215152151621517215182151921520215212152221523215242152521526215272152821529215302153121532215332153421535215362153721538215392154021541215422154321544215452154621547215482154921550215512155221553215542155521556215572155821559215602156121562215632156421565215662156721568215692157021571215722157321574215752157621577215782157921580215812158221583215842158521586215872158821589215902159121592215932159421595215962159721598215992160021601216022160321604216052160621607216082160921610216112161221613216142161521616216172161821619216202162121622216232162421625216262162721628216292163021631216322163321634216352163621637216382163921640216412164221643216442164521646216472164821649216502165121652216532165421655216562165721658216592166021661216622166321664
//===- ARMISelLowering.cpp - ARM DAG Lowering Implementation --------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file defines the interfaces that ARM uses to lower LLVM code into a
// selection DAG.
//
//===----------------------------------------------------------------------===//

#include "ARMISelLowering.h"
#include "ARMBaseInstrInfo.h"
#include "ARMBaseRegisterInfo.h"
#include "ARMCallingConv.h"
#include "ARMConstantPoolValue.h"
#include "ARMMachineFunctionInfo.h"
#include "ARMPerfectShuffle.h"
#include "ARMRegisterInfo.h"
#include "ARMSelectionDAGInfo.h"
#include "ARMSubtarget.h"
#include "ARMTargetTransformInfo.h"
#include "MCTargetDesc/ARMAddressingModes.h"
#include "MCTargetDesc/ARMBaseInfo.h"
#include "Utils/ARMBaseInfo.h"
#include "llvm/ADT/APFloat.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/BitVector.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/StringSwitch.h"
#include "llvm/ADT/Triple.h"
#include "llvm/ADT/Twine.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/CodeGen/CallingConvLower.h"
#include "llvm/CodeGen/ISDOpcodes.h"
#include "llvm/CodeGen/IntrinsicLowering.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineConstantPool.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineJumpTableInfo.h"
#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/RuntimeLibcalls.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/SelectionDAGAddressAnalysis.h"
#include "llvm/CodeGen/SelectionDAGNodes.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/CodeGen/TargetOpcodes.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GlobalAlias.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/GlobalVariable.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InlineAsm.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/IntrinsicsARM.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/User.h"
#include "llvm/IR/Value.h"
#include "llvm/MC/MCInstrDesc.h"
#include "llvm/MC/MCInstrItineraries.h"
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/MC/MCSchedule.h"
#include "llvm/Support/AtomicOrdering.h"
#include "llvm/Support/BranchProbability.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CodeGen.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Support/MachineValueType.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetOptions.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <cstdlib>
#include <iterator>
#include <limits>
#include <string>
#include <tuple>
#include <utility>
#include <vector>

using namespace llvm;
using namespace llvm::PatternMatch;

#define DEBUG_TYPE "arm-isel"

STATISTIC(NumTailCalls, "Number of tail calls");
STATISTIC(NumMovwMovt, "Number of GAs materialized with movw + movt");
STATISTIC(NumLoopByVals, "Number of loops generated for byval arguments");
STATISTIC(NumConstpoolPromoted,
          "Number of constants with their storage promoted into constant pools");

static cl::opt<bool>
ARMInterworking("arm-interworking", cl::Hidden,
                cl::desc("Enable / disable ARM interworking (for debugging only)"),
                cl::init(true));

static cl::opt<bool> EnableConstpoolPromotion(
    "arm-promote-constant", cl::Hidden,
    cl::desc("Enable / disable promotion of unnamed_addr constants into "
             "constant pools"),
    cl::init(false)); // FIXME: set to true by default once PR32780 is fixed

static cl::opt<unsigned> ConstpoolPromotionMaxSize(
    "arm-promote-constant-max-size", cl::Hidden,
    cl::desc("Maximum size of constant to promote into a constant pool"),
    cl::init(64));

static cl::opt<unsigned> ConstpoolPromotionMaxTotal(
    "arm-promote-constant-max-total", cl::Hidden,
    cl::desc("Maximum size of ALL constants to promote into a constant pool"),
    cl::init(128));

cl::opt<unsigned>
MVEMaxSupportedInterleaveFactor("mve-max-interleave-factor", cl::Hidden,
                                cl::desc("Maximum interleave factor for MVE VLDn to generate."),
                                cl::init(2));

// The APCS parameter registers.
static const MCPhysReg GPRArgRegs[] = {
  ARM::R0, ARM::R1, ARM::R2, ARM::R3
};
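
// A quick sketch of the legalization actions the helpers below hand to
// TargetLoweringBase::setOperationAction (illustrative only; "VT" stands for
// whichever vector type is being registered):
//
//   setOperationAction(ISD::ADD,   VT, Legal);   // selected as-is
//   setOperationAction(ISD::SDIV,  VT, Expand);  // broken apart or libcall'd
//   setOperationAction(ISD::SETCC, VT, Custom);  // routed to LowerOperation()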

void ARMTargetLowering::addTypeForNEON(MVT VT, MVT PromotedLdStVT) {
  if (VT != PromotedLdStVT) {
    setOperationAction(ISD::LOAD, VT, Promote);
    AddPromotedToType (ISD::LOAD, VT, PromotedLdStVT);
    setOperationAction(ISD::STORE, VT, Promote);
    AddPromotedToType (ISD::STORE, VT, PromotedLdStVT);
  }

  MVT ElemTy = VT.getVectorElementType();
  if (ElemTy != MVT::f64)
    setOperationAction(ISD::SETCC, VT, Custom);
  setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
  setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
  if (ElemTy == MVT::i32) {
    setOperationAction(ISD::SINT_TO_FP, VT, Custom);
    setOperationAction(ISD::UINT_TO_FP, VT, Custom);
    setOperationAction(ISD::FP_TO_SINT, VT, Custom);
    setOperationAction(ISD::FP_TO_UINT, VT, Custom);
  } else {
    setOperationAction(ISD::SINT_TO_FP, VT, Expand);
    setOperationAction(ISD::UINT_TO_FP, VT, Expand);
    setOperationAction(ISD::FP_TO_SINT, VT, Expand);
    setOperationAction(ISD::FP_TO_UINT, VT, Expand);
  }
  setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
  setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
  setOperationAction(ISD::CONCAT_VECTORS, VT, Legal);
  setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal);
  setOperationAction(ISD::SELECT, VT, Expand);
  setOperationAction(ISD::SELECT_CC, VT, Expand);
  setOperationAction(ISD::VSELECT, VT, Expand);
  setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Expand);
  if (VT.isInteger()) {
    setOperationAction(ISD::SHL, VT, Custom);
    setOperationAction(ISD::SRA, VT, Custom);
    setOperationAction(ISD::SRL, VT, Custom);
  }

  // Neon does not support vector divide/remainder operations.
  setOperationAction(ISD::SDIV, VT, Expand);
  setOperationAction(ISD::UDIV, VT, Expand);
  setOperationAction(ISD::FDIV, VT, Expand);
  setOperationAction(ISD::SREM, VT, Expand);
  setOperationAction(ISD::UREM, VT, Expand);
  setOperationAction(ISD::FREM, VT, Expand);
  setOperationAction(ISD::SDIVREM, VT, Expand);
  setOperationAction(ISD::UDIVREM, VT, Expand);

  if (!VT.isFloatingPoint() &&
      VT != MVT::v2i64 && VT != MVT::v1i64)
    for (auto Opcode : {ISD::ABS, ISD::SMIN, ISD::SMAX, ISD::UMIN, ISD::UMAX})
      setOperationAction(Opcode, VT, Legal);
  if (!VT.isFloatingPoint())
    for (auto Opcode : {ISD::SADDSAT, ISD::UADDSAT, ISD::SSUBSAT, ISD::USUBSAT})
      setOperationAction(Opcode, VT, Legal);
}
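
// The two NEON entry points below differ only in the register class and the
// promoted load/store type: 64-bit vectors live in D registers
// (ARM::DPRRegClass) and promote their loads/stores to f64, while 128-bit
// vectors live in D-register pairs (ARM::DPairRegClass) and promote to v2f64.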

void ARMTargetLowering::addDRTypeForNEON(MVT VT) {
  addRegisterClass(VT, &ARM::DPRRegClass);
  addTypeForNEON(VT, MVT::f64);
}

void ARMTargetLowering::addQRTypeForNEON(MVT VT) {
  addRegisterClass(VT, &ARM::DPairRegClass);
  addTypeForNEON(VT, MVT::v2f64);
}

void ARMTargetLowering::setAllExpand(MVT VT) {
  for (unsigned Opc = 0; Opc < ISD::BUILTIN_OP_END; ++Opc)
    setOperationAction(Opc, VT, Expand);

  // We support these really simple operations even on types where all
  // the actual arithmetic has to be broken down into simpler
  // operations or turned into library calls.
  setOperationAction(ISD::BITCAST, VT, Legal);
  setOperationAction(ISD::LOAD, VT, Legal);
  setOperationAction(ISD::STORE, VT, Legal);
  setOperationAction(ISD::UNDEF, VT, Legal);
}
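
// setAllExpand is used for types that are only "supported" at the level of
// moving bits around: every opcode is first marked Expand, then the few
// actions that must stay Legal (bitcast, plain load/store, undef) are
// restored. Callers re-legalize anything extra themselves, as the
// v2i64/v2f64 setup later in this file does:
//
//   setAllExpand(VT);                                   // start from "nothing works"
//   setOperationAction(ISD::BUILD_VECTOR, VT, Custom);  // then opt back in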

void ARMTargetLowering::addAllExtLoads(const MVT From, const MVT To,
                                       LegalizeAction Action) {
  setLoadExtAction(ISD::EXTLOAD, From, To, Action);
  setLoadExtAction(ISD::ZEXTLOAD, From, To, Action);
  setLoadExtAction(ISD::SEXTLOAD, From, To, Action);
}
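
// addAllExtLoads sets the same action for the any-, zero- and sign-extending
// load flavours in one call; the MVE extending-load setup further down boils
// down to calls of the form:
//
//   addAllExtLoads(MVT::v8i16, MVT::v8i8, Legal);  // v8i8 -> v8i16 ext loads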

void ARMTargetLowering::addMVEVectorTypes(bool HasMVEFP) {
  const MVT IntTypes[] = { MVT::v16i8, MVT::v8i16, MVT::v4i32 };

  for (auto VT : IntTypes) {
    addRegisterClass(VT, &ARM::MQPRRegClass);
    setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
    setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
    setOperationAction(ISD::SHL, VT, Custom);
    setOperationAction(ISD::SRA, VT, Custom);
    setOperationAction(ISD::SRL, VT, Custom);
    setOperationAction(ISD::SMIN, VT, Legal);
    setOperationAction(ISD::SMAX, VT, Legal);
    setOperationAction(ISD::UMIN, VT, Legal);
    setOperationAction(ISD::UMAX, VT, Legal);
    setOperationAction(ISD::ABS, VT, Legal);
    setOperationAction(ISD::SETCC, VT, Custom);
    setOperationAction(ISD::MLOAD, VT, Custom);
    setOperationAction(ISD::MSTORE, VT, Legal);
    setOperationAction(ISD::CTLZ, VT, Legal);
    setOperationAction(ISD::CTTZ, VT, Custom);
    setOperationAction(ISD::BITREVERSE, VT, Legal);
    setOperationAction(ISD::BSWAP, VT, Legal);
    setOperationAction(ISD::SADDSAT, VT, Legal);
    setOperationAction(ISD::UADDSAT, VT, Legal);
    setOperationAction(ISD::SSUBSAT, VT, Legal);
    setOperationAction(ISD::USUBSAT, VT, Legal);
    setOperationAction(ISD::ABDS, VT, Legal);
    setOperationAction(ISD::ABDU, VT, Legal);

    // No native support for these.
    setOperationAction(ISD::UDIV, VT, Expand);
    setOperationAction(ISD::SDIV, VT, Expand);
    setOperationAction(ISD::UREM, VT, Expand);
    setOperationAction(ISD::SREM, VT, Expand);
    setOperationAction(ISD::UDIVREM, VT, Expand);
    setOperationAction(ISD::SDIVREM, VT, Expand);
    setOperationAction(ISD::CTPOP, VT, Expand);
    setOperationAction(ISD::SELECT, VT, Expand);
    setOperationAction(ISD::SELECT_CC, VT, Expand);

    // Vector reductions
    setOperationAction(ISD::VECREDUCE_ADD, VT, Legal);
    setOperationAction(ISD::VECREDUCE_SMAX, VT, Legal);
    setOperationAction(ISD::VECREDUCE_UMAX, VT, Legal);
    setOperationAction(ISD::VECREDUCE_SMIN, VT, Legal);
    setOperationAction(ISD::VECREDUCE_UMIN, VT, Legal);
    setOperationAction(ISD::VECREDUCE_MUL, VT, Custom);
    setOperationAction(ISD::VECREDUCE_AND, VT, Custom);
    setOperationAction(ISD::VECREDUCE_OR, VT, Custom);
    setOperationAction(ISD::VECREDUCE_XOR, VT, Custom);

    if (!HasMVEFP) {
      setOperationAction(ISD::SINT_TO_FP, VT, Expand);
      setOperationAction(ISD::UINT_TO_FP, VT, Expand);
      setOperationAction(ISD::FP_TO_SINT, VT, Expand);
      setOperationAction(ISD::FP_TO_UINT, VT, Expand);
    } else {
      setOperationAction(ISD::FP_TO_SINT_SAT, VT, Custom);
      setOperationAction(ISD::FP_TO_UINT_SAT, VT, Custom);
    }

    // Pre and Post inc are supported on loads and stores
    for (unsigned im = (unsigned)ISD::PRE_INC;
         im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) {
      setIndexedLoadAction(im, VT, Legal);
      setIndexedStoreAction(im, VT, Legal);
      setIndexedMaskedLoadAction(im, VT, Legal);
      setIndexedMaskedStoreAction(im, VT, Legal);
    }
  }
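
  // The float vector types below follow the same shape as the integer types
  // above, except that without MVE.fp the whole type is first switched off
  // via setAllExpand and only the data-movement style operations (shuffles,
  // lane inserts/extracts, loads/stores) are re-enabled.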
  294. const MVT FloatTypes[] = { MVT::v8f16, MVT::v4f32 };
  295. for (auto VT : FloatTypes) {
  296. addRegisterClass(VT, &ARM::MQPRRegClass);
  297. if (!HasMVEFP)
  298. setAllExpand(VT);
  299. // These are legal or custom whether we have MVE.fp or not
  300. setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
  301. setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
  302. setOperationAction(ISD::INSERT_VECTOR_ELT, VT.getVectorElementType(), Custom);
  303. setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
  304. setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
  305. setOperationAction(ISD::BUILD_VECTOR, VT.getVectorElementType(), Custom);
  306. setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Legal);
  307. setOperationAction(ISD::SETCC, VT, Custom);
  308. setOperationAction(ISD::MLOAD, VT, Custom);
  309. setOperationAction(ISD::MSTORE, VT, Legal);
  310. setOperationAction(ISD::SELECT, VT, Expand);
  311. setOperationAction(ISD::SELECT_CC, VT, Expand);
  312. // Pre and Post inc are supported on loads and stores
  313. for (unsigned im = (unsigned)ISD::PRE_INC;
  314. im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) {
  315. setIndexedLoadAction(im, VT, Legal);
  316. setIndexedStoreAction(im, VT, Legal);
  317. setIndexedMaskedLoadAction(im, VT, Legal);
  318. setIndexedMaskedStoreAction(im, VT, Legal);
  319. }
  320. if (HasMVEFP) {
  321. setOperationAction(ISD::FMINNUM, VT, Legal);
  322. setOperationAction(ISD::FMAXNUM, VT, Legal);
  323. setOperationAction(ISD::FROUND, VT, Legal);
  324. setOperationAction(ISD::VECREDUCE_FADD, VT, Custom);
  325. setOperationAction(ISD::VECREDUCE_FMUL, VT, Custom);
  326. setOperationAction(ISD::VECREDUCE_FMIN, VT, Custom);
  327. setOperationAction(ISD::VECREDUCE_FMAX, VT, Custom);
  328. // No native support for these.
  329. setOperationAction(ISD::FDIV, VT, Expand);
  330. setOperationAction(ISD::FREM, VT, Expand);
  331. setOperationAction(ISD::FSQRT, VT, Expand);
  332. setOperationAction(ISD::FSIN, VT, Expand);
  333. setOperationAction(ISD::FCOS, VT, Expand);
  334. setOperationAction(ISD::FPOW, VT, Expand);
  335. setOperationAction(ISD::FLOG, VT, Expand);
  336. setOperationAction(ISD::FLOG2, VT, Expand);
  337. setOperationAction(ISD::FLOG10, VT, Expand);
  338. setOperationAction(ISD::FEXP, VT, Expand);
  339. setOperationAction(ISD::FEXP2, VT, Expand);
  340. setOperationAction(ISD::FNEARBYINT, VT, Expand);
  341. }
  342. }
  343. // Custom Expand smaller than legal vector reductions to prevent false zero
  344. // items being added.
  345. setOperationAction(ISD::VECREDUCE_FADD, MVT::v4f16, Custom);
  346. setOperationAction(ISD::VECREDUCE_FMUL, MVT::v4f16, Custom);
  347. setOperationAction(ISD::VECREDUCE_FMIN, MVT::v4f16, Custom);
  348. setOperationAction(ISD::VECREDUCE_FMAX, MVT::v4f16, Custom);
  349. setOperationAction(ISD::VECREDUCE_FADD, MVT::v2f16, Custom);
  350. setOperationAction(ISD::VECREDUCE_FMUL, MVT::v2f16, Custom);
  351. setOperationAction(ISD::VECREDUCE_FMIN, MVT::v2f16, Custom);
  352. setOperationAction(ISD::VECREDUCE_FMAX, MVT::v2f16, Custom);
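// Illustrative sketch (not from the upstream sources): without the Custom
// entries above, a sub-legal reduction such as
//   %r = call half @llvm.vector.reduce.fmul.v4f16(half %start, <4 x half> %v)
// would be widened by the type legalizer, and padding lanes treated as zero
// would corrupt the result. The Custom lowering is assumed to reduce the short
// vector directly (or pad with the operation's identity) instead.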
353. // We 'support' these types up to bitcast/load/store level, regardless of
354. // MVE integer-only / float support. Only FP data processing on the FP
355. // vector types is inhibited when the target is integer-only.
  356. const MVT LongTypes[] = { MVT::v2i64, MVT::v2f64 };
  357. for (auto VT : LongTypes) {
  358. addRegisterClass(VT, &ARM::MQPRRegClass);
  359. setAllExpand(VT);
  360. setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
  361. setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
  362. setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
  363. setOperationAction(ISD::VSELECT, VT, Legal);
  364. }
  365. setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v2f64, Legal);
  366. // We can do bitwise operations on v2i64 vectors
  367. setOperationAction(ISD::AND, MVT::v2i64, Legal);
  368. setOperationAction(ISD::OR, MVT::v2i64, Legal);
  369. setOperationAction(ISD::XOR, MVT::v2i64, Legal);
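// For example (illustrative only): a lane-size-agnostic bitwise operation like
//   %r = and <2 x i64> %a, %b
// can be selected as one full-width vector AND, whereas v2i64 arithmetic falls
// back to the setAllExpand() defaults applied above.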
370. // It is legal to extload from v8i8 to v8i16, and from v4i8/v4i16 to v4i32.
  371. addAllExtLoads(MVT::v8i16, MVT::v8i8, Legal);
  372. addAllExtLoads(MVT::v4i32, MVT::v4i16, Legal);
  373. addAllExtLoads(MVT::v4i32, MVT::v4i8, Legal);
  374. // It is legal to sign extend from v4i8/v4i16 to v4i32 or v8i8 to v8i16.
  375. setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i8, Legal);
  376. setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i16, Legal);
  377. setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i32, Legal);
  378. setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v8i8, Legal);
  379. setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v8i16, Legal);
  380. // Some truncating stores are legal too.
  381. setTruncStoreAction(MVT::v4i32, MVT::v4i16, Legal);
  382. setTruncStoreAction(MVT::v4i32, MVT::v4i8, Legal);
  383. setTruncStoreAction(MVT::v8i16, MVT::v8i8, Legal);
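// Illustrative sketch: a narrowing vector store such as
//   %t = trunc <4 x i32> %v to <4 x i16>
//   store <4 x i16> %t, <4 x i16>* %p
// is the kind of pattern these entries allow to be selected as a single
// truncating store (e.g. an MVE VSTRH.32) rather than a truncate plus store.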
  384. // Pre and Post inc on these are legal, given the correct extends
  385. for (unsigned im = (unsigned)ISD::PRE_INC;
  386. im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) {
  387. for (auto VT : {MVT::v8i8, MVT::v4i8, MVT::v4i16}) {
  388. setIndexedLoadAction(im, VT, Legal);
  389. setIndexedStoreAction(im, VT, Legal);
  390. setIndexedMaskedLoadAction(im, VT, Legal);
  391. setIndexedMaskedStoreAction(im, VT, Legal);
  392. }
  393. }
  394. // Predicate types
  395. const MVT pTypes[] = {MVT::v16i1, MVT::v8i1, MVT::v4i1, MVT::v2i1};
  396. for (auto VT : pTypes) {
  397. addRegisterClass(VT, &ARM::VCCRRegClass);
  398. setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
  399. setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
  400. setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
  401. setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
  402. setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
  403. setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
  404. setOperationAction(ISD::SETCC, VT, Custom);
  405. setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Expand);
  406. setOperationAction(ISD::LOAD, VT, Custom);
  407. setOperationAction(ISD::STORE, VT, Custom);
  408. setOperationAction(ISD::TRUNCATE, VT, Custom);
  409. setOperationAction(ISD::VSELECT, VT, Expand);
  410. setOperationAction(ISD::SELECT, VT, Expand);
  411. }
  412. setOperationAction(ISD::SETCC, MVT::v2i1, Expand);
  413. setOperationAction(ISD::TRUNCATE, MVT::v2i1, Expand);
  414. setOperationAction(ISD::AND, MVT::v2i1, Expand);
  415. setOperationAction(ISD::OR, MVT::v2i1, Expand);
  416. setOperationAction(ISD::XOR, MVT::v2i1, Expand);
  417. setOperationAction(ISD::SINT_TO_FP, MVT::v2i1, Expand);
  418. setOperationAction(ISD::UINT_TO_FP, MVT::v2i1, Expand);
  419. setOperationAction(ISD::FP_TO_SINT, MVT::v2i1, Expand);
  420. setOperationAction(ISD::FP_TO_UINT, MVT::v2i1, Expand);
  421. setOperationAction(ISD::SIGN_EXTEND, MVT::v8i32, Custom);
  422. setOperationAction(ISD::SIGN_EXTEND, MVT::v16i16, Custom);
  423. setOperationAction(ISD::SIGN_EXTEND, MVT::v16i32, Custom);
  424. setOperationAction(ISD::ZERO_EXTEND, MVT::v8i32, Custom);
  425. setOperationAction(ISD::ZERO_EXTEND, MVT::v16i16, Custom);
  426. setOperationAction(ISD::ZERO_EXTEND, MVT::v16i32, Custom);
  427. setOperationAction(ISD::TRUNCATE, MVT::v8i32, Custom);
  428. setOperationAction(ISD::TRUNCATE, MVT::v16i16, Custom);
  429. }
  430. ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
  431. const ARMSubtarget &STI)
  432. : TargetLowering(TM), Subtarget(&STI) {
  433. RegInfo = Subtarget->getRegisterInfo();
  434. Itins = Subtarget->getInstrItineraryData();
  435. setBooleanContents(ZeroOrOneBooleanContent);
  436. setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
  437. if (!Subtarget->isTargetDarwin() && !Subtarget->isTargetIOS() &&
  438. !Subtarget->isTargetWatchOS()) {
  439. bool IsHFTarget = TM.Options.FloatABIType == FloatABI::Hard;
  440. for (int LCID = 0; LCID < RTLIB::UNKNOWN_LIBCALL; ++LCID)
  441. setLibcallCallingConv(static_cast<RTLIB::Libcall>(LCID),
  442. IsHFTarget ? CallingConv::ARM_AAPCS_VFP
  443. : CallingConv::ARM_AAPCS);
  444. }
  445. if (Subtarget->isTargetMachO()) {
  446. // Uses VFP for Thumb libfuncs if available.
  447. if (Subtarget->isThumb() && Subtarget->hasVFP2Base() &&
  448. Subtarget->hasARMOps() && !Subtarget->useSoftFloat()) {
  449. static const struct {
  450. const RTLIB::Libcall Op;
  451. const char * const Name;
  452. const ISD::CondCode Cond;
  453. } LibraryCalls[] = {
  454. // Single-precision floating-point arithmetic.
  455. { RTLIB::ADD_F32, "__addsf3vfp", ISD::SETCC_INVALID },
  456. { RTLIB::SUB_F32, "__subsf3vfp", ISD::SETCC_INVALID },
  457. { RTLIB::MUL_F32, "__mulsf3vfp", ISD::SETCC_INVALID },
  458. { RTLIB::DIV_F32, "__divsf3vfp", ISD::SETCC_INVALID },
  459. // Double-precision floating-point arithmetic.
  460. { RTLIB::ADD_F64, "__adddf3vfp", ISD::SETCC_INVALID },
  461. { RTLIB::SUB_F64, "__subdf3vfp", ISD::SETCC_INVALID },
  462. { RTLIB::MUL_F64, "__muldf3vfp", ISD::SETCC_INVALID },
  463. { RTLIB::DIV_F64, "__divdf3vfp", ISD::SETCC_INVALID },
  464. // Single-precision comparisons.
  465. { RTLIB::OEQ_F32, "__eqsf2vfp", ISD::SETNE },
  466. { RTLIB::UNE_F32, "__nesf2vfp", ISD::SETNE },
  467. { RTLIB::OLT_F32, "__ltsf2vfp", ISD::SETNE },
  468. { RTLIB::OLE_F32, "__lesf2vfp", ISD::SETNE },
  469. { RTLIB::OGE_F32, "__gesf2vfp", ISD::SETNE },
  470. { RTLIB::OGT_F32, "__gtsf2vfp", ISD::SETNE },
  471. { RTLIB::UO_F32, "__unordsf2vfp", ISD::SETNE },
  472. // Double-precision comparisons.
  473. { RTLIB::OEQ_F64, "__eqdf2vfp", ISD::SETNE },
  474. { RTLIB::UNE_F64, "__nedf2vfp", ISD::SETNE },
  475. { RTLIB::OLT_F64, "__ltdf2vfp", ISD::SETNE },
  476. { RTLIB::OLE_F64, "__ledf2vfp", ISD::SETNE },
  477. { RTLIB::OGE_F64, "__gedf2vfp", ISD::SETNE },
  478. { RTLIB::OGT_F64, "__gtdf2vfp", ISD::SETNE },
  479. { RTLIB::UO_F64, "__unorddf2vfp", ISD::SETNE },
  480. // Floating-point to integer conversions.
  481. // i64 conversions are done via library routines even when generating VFP
  482. // instructions, so use the same ones.
  483. { RTLIB::FPTOSINT_F64_I32, "__fixdfsivfp", ISD::SETCC_INVALID },
  484. { RTLIB::FPTOUINT_F64_I32, "__fixunsdfsivfp", ISD::SETCC_INVALID },
  485. { RTLIB::FPTOSINT_F32_I32, "__fixsfsivfp", ISD::SETCC_INVALID },
  486. { RTLIB::FPTOUINT_F32_I32, "__fixunssfsivfp", ISD::SETCC_INVALID },
  487. // Conversions between floating types.
  488. { RTLIB::FPROUND_F64_F32, "__truncdfsf2vfp", ISD::SETCC_INVALID },
  489. { RTLIB::FPEXT_F32_F64, "__extendsfdf2vfp", ISD::SETCC_INVALID },
  490. // Integer to floating-point conversions.
  491. // i64 conversions are done via library routines even when generating VFP
  492. // instructions, so use the same ones.
  493. // FIXME: There appears to be some naming inconsistency in ARM libgcc:
  494. // e.g., __floatunsidf vs. __floatunssidfvfp.
  495. { RTLIB::SINTTOFP_I32_F64, "__floatsidfvfp", ISD::SETCC_INVALID },
  496. { RTLIB::UINTTOFP_I32_F64, "__floatunssidfvfp", ISD::SETCC_INVALID },
  497. { RTLIB::SINTTOFP_I32_F32, "__floatsisfvfp", ISD::SETCC_INVALID },
  498. { RTLIB::UINTTOFP_I32_F32, "__floatunssisfvfp", ISD::SETCC_INVALID },
  499. };
  500. for (const auto &LC : LibraryCalls) {
  501. setLibcallName(LC.Op, LC.Name);
  502. if (LC.Cond != ISD::SETCC_INVALID)
  503. setCmpLibcallCC(LC.Op, LC.Cond);
  504. }
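// Illustrative example (an assumption, not taken from this file): with the
// table above, a single-precision comparison like
//   bool eq(float a, float b) { return a == b; }
// is lowered to a call to __eqsf2vfp, and the original predicate is recovered
// by testing the returned value against zero with the ISD::SETNE condition
// registered through setCmpLibcallCC().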
  505. }
  506. }
507. // These libcalls are not available in 32-bit mode.
  508. setLibcallName(RTLIB::SHL_I128, nullptr);
  509. setLibcallName(RTLIB::SRL_I128, nullptr);
  510. setLibcallName(RTLIB::SRA_I128, nullptr);
  511. setLibcallName(RTLIB::MUL_I128, nullptr);
  512. setLibcallName(RTLIB::MULO_I64, nullptr);
  513. setLibcallName(RTLIB::MULO_I128, nullptr);
  514. // RTLIB
  515. if (Subtarget->isAAPCS_ABI() &&
  516. (Subtarget->isTargetAEABI() || Subtarget->isTargetGNUAEABI() ||
  517. Subtarget->isTargetMuslAEABI() || Subtarget->isTargetAndroid())) {
  518. static const struct {
  519. const RTLIB::Libcall Op;
  520. const char * const Name;
  521. const CallingConv::ID CC;
  522. const ISD::CondCode Cond;
  523. } LibraryCalls[] = {
  524. // Double-precision floating-point arithmetic helper functions
  525. // RTABI chapter 4.1.2, Table 2
  526. { RTLIB::ADD_F64, "__aeabi_dadd", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
  527. { RTLIB::DIV_F64, "__aeabi_ddiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
  528. { RTLIB::MUL_F64, "__aeabi_dmul", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
  529. { RTLIB::SUB_F64, "__aeabi_dsub", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
  530. // Double-precision floating-point comparison helper functions
  531. // RTABI chapter 4.1.2, Table 3
  532. { RTLIB::OEQ_F64, "__aeabi_dcmpeq", CallingConv::ARM_AAPCS, ISD::SETNE },
  533. { RTLIB::UNE_F64, "__aeabi_dcmpeq", CallingConv::ARM_AAPCS, ISD::SETEQ },
  534. { RTLIB::OLT_F64, "__aeabi_dcmplt", CallingConv::ARM_AAPCS, ISD::SETNE },
  535. { RTLIB::OLE_F64, "__aeabi_dcmple", CallingConv::ARM_AAPCS, ISD::SETNE },
  536. { RTLIB::OGE_F64, "__aeabi_dcmpge", CallingConv::ARM_AAPCS, ISD::SETNE },
  537. { RTLIB::OGT_F64, "__aeabi_dcmpgt", CallingConv::ARM_AAPCS, ISD::SETNE },
  538. { RTLIB::UO_F64, "__aeabi_dcmpun", CallingConv::ARM_AAPCS, ISD::SETNE },
  539. // Single-precision floating-point arithmetic helper functions
  540. // RTABI chapter 4.1.2, Table 4
  541. { RTLIB::ADD_F32, "__aeabi_fadd", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
  542. { RTLIB::DIV_F32, "__aeabi_fdiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
  543. { RTLIB::MUL_F32, "__aeabi_fmul", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
  544. { RTLIB::SUB_F32, "__aeabi_fsub", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
  545. // Single-precision floating-point comparison helper functions
  546. // RTABI chapter 4.1.2, Table 5
  547. { RTLIB::OEQ_F32, "__aeabi_fcmpeq", CallingConv::ARM_AAPCS, ISD::SETNE },
  548. { RTLIB::UNE_F32, "__aeabi_fcmpeq", CallingConv::ARM_AAPCS, ISD::SETEQ },
  549. { RTLIB::OLT_F32, "__aeabi_fcmplt", CallingConv::ARM_AAPCS, ISD::SETNE },
  550. { RTLIB::OLE_F32, "__aeabi_fcmple", CallingConv::ARM_AAPCS, ISD::SETNE },
  551. { RTLIB::OGE_F32, "__aeabi_fcmpge", CallingConv::ARM_AAPCS, ISD::SETNE },
  552. { RTLIB::OGT_F32, "__aeabi_fcmpgt", CallingConv::ARM_AAPCS, ISD::SETNE },
  553. { RTLIB::UO_F32, "__aeabi_fcmpun", CallingConv::ARM_AAPCS, ISD::SETNE },
  554. // Floating-point to integer conversions.
  555. // RTABI chapter 4.1.2, Table 6
  556. { RTLIB::FPTOSINT_F64_I32, "__aeabi_d2iz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
  557. { RTLIB::FPTOUINT_F64_I32, "__aeabi_d2uiz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
  558. { RTLIB::FPTOSINT_F64_I64, "__aeabi_d2lz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
  559. { RTLIB::FPTOUINT_F64_I64, "__aeabi_d2ulz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
  560. { RTLIB::FPTOSINT_F32_I32, "__aeabi_f2iz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
  561. { RTLIB::FPTOUINT_F32_I32, "__aeabi_f2uiz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
  562. { RTLIB::FPTOSINT_F32_I64, "__aeabi_f2lz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
  563. { RTLIB::FPTOUINT_F32_I64, "__aeabi_f2ulz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
  564. // Conversions between floating types.
  565. // RTABI chapter 4.1.2, Table 7
  566. { RTLIB::FPROUND_F64_F32, "__aeabi_d2f", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
  567. { RTLIB::FPROUND_F64_F16, "__aeabi_d2h", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
  568. { RTLIB::FPEXT_F32_F64, "__aeabi_f2d", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
  569. // Integer to floating-point conversions.
  570. // RTABI chapter 4.1.2, Table 8
  571. { RTLIB::SINTTOFP_I32_F64, "__aeabi_i2d", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
  572. { RTLIB::UINTTOFP_I32_F64, "__aeabi_ui2d", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
  573. { RTLIB::SINTTOFP_I64_F64, "__aeabi_l2d", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
  574. { RTLIB::UINTTOFP_I64_F64, "__aeabi_ul2d", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
  575. { RTLIB::SINTTOFP_I32_F32, "__aeabi_i2f", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
  576. { RTLIB::UINTTOFP_I32_F32, "__aeabi_ui2f", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
  577. { RTLIB::SINTTOFP_I64_F32, "__aeabi_l2f", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
  578. { RTLIB::UINTTOFP_I64_F32, "__aeabi_ul2f", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
  579. // Long long helper functions
  580. // RTABI chapter 4.2, Table 9
  581. { RTLIB::MUL_I64, "__aeabi_lmul", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
  582. { RTLIB::SHL_I64, "__aeabi_llsl", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
  583. { RTLIB::SRL_I64, "__aeabi_llsr", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
  584. { RTLIB::SRA_I64, "__aeabi_lasr", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
  585. // Integer division functions
  586. // RTABI chapter 4.3.1
  587. { RTLIB::SDIV_I8, "__aeabi_idiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
  588. { RTLIB::SDIV_I16, "__aeabi_idiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
  589. { RTLIB::SDIV_I32, "__aeabi_idiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
  590. { RTLIB::SDIV_I64, "__aeabi_ldivmod", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
  591. { RTLIB::UDIV_I8, "__aeabi_uidiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
  592. { RTLIB::UDIV_I16, "__aeabi_uidiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
  593. { RTLIB::UDIV_I32, "__aeabi_uidiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
  594. { RTLIB::UDIV_I64, "__aeabi_uldivmod", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
  595. };
  596. for (const auto &LC : LibraryCalls) {
  597. setLibcallName(LC.Op, LC.Name);
  598. setLibcallCallingConv(LC.Op, LC.CC);
  599. if (LC.Cond != ISD::SETCC_INVALID)
  600. setCmpLibcallCC(LC.Op, LC.Cond);
  601. }
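// Illustrative example (assumption): per the table above, a 64-bit division
// such as
//   long long q(long long a, long long b) { return a / b; }
// becomes a call to __aeabi_ldivmod using the ARM_AAPCS calling convention on
// these AEABI targets.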
  602. // EABI dependent RTLIB
  603. if (TM.Options.EABIVersion == EABI::EABI4 ||
  604. TM.Options.EABIVersion == EABI::EABI5) {
  605. static const struct {
  606. const RTLIB::Libcall Op;
  607. const char *const Name;
  608. const CallingConv::ID CC;
  609. const ISD::CondCode Cond;
  610. } MemOpsLibraryCalls[] = {
  611. // Memory operations
  612. // RTABI chapter 4.3.4
  613. { RTLIB::MEMCPY, "__aeabi_memcpy", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
  614. { RTLIB::MEMMOVE, "__aeabi_memmove", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
  615. { RTLIB::MEMSET, "__aeabi_memset", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
  616. };
  617. for (const auto &LC : MemOpsLibraryCalls) {
  618. setLibcallName(LC.Op, LC.Name);
  619. setLibcallCallingConv(LC.Op, LC.CC);
  620. if (LC.Cond != ISD::SETCC_INVALID)
  621. setCmpLibcallCC(LC.Op, LC.Cond);
  622. }
  623. }
  624. }
  625. if (Subtarget->isTargetWindows()) {
  626. static const struct {
  627. const RTLIB::Libcall Op;
  628. const char * const Name;
  629. const CallingConv::ID CC;
  630. } LibraryCalls[] = {
  631. { RTLIB::FPTOSINT_F32_I64, "__stoi64", CallingConv::ARM_AAPCS_VFP },
  632. { RTLIB::FPTOSINT_F64_I64, "__dtoi64", CallingConv::ARM_AAPCS_VFP },
  633. { RTLIB::FPTOUINT_F32_I64, "__stou64", CallingConv::ARM_AAPCS_VFP },
  634. { RTLIB::FPTOUINT_F64_I64, "__dtou64", CallingConv::ARM_AAPCS_VFP },
  635. { RTLIB::SINTTOFP_I64_F32, "__i64tos", CallingConv::ARM_AAPCS_VFP },
  636. { RTLIB::SINTTOFP_I64_F64, "__i64tod", CallingConv::ARM_AAPCS_VFP },
  637. { RTLIB::UINTTOFP_I64_F32, "__u64tos", CallingConv::ARM_AAPCS_VFP },
  638. { RTLIB::UINTTOFP_I64_F64, "__u64tod", CallingConv::ARM_AAPCS_VFP },
  639. };
  640. for (const auto &LC : LibraryCalls) {
  641. setLibcallName(LC.Op, LC.Name);
  642. setLibcallCallingConv(LC.Op, LC.CC);
  643. }
  644. }
  645. // Use divmod compiler-rt calls for iOS 5.0 and later.
  646. if (Subtarget->isTargetMachO() &&
  647. !(Subtarget->isTargetIOS() &&
  648. Subtarget->getTargetTriple().isOSVersionLT(5, 0))) {
  649. setLibcallName(RTLIB::SDIVREM_I32, "__divmodsi4");
  650. setLibcallName(RTLIB::UDIVREM_I32, "__udivmodsi4");
  651. }
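// Illustrative example (assumption): on these Mach-O targets a function that
// needs both results, e.g.
//   int both(int a, int b) { return a / b + a % b; }
// can be lowered to a single __divmodsi4 call producing the quotient and the
// remainder together, instead of separate division and modulo libcalls.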
652. // The half <-> float conversion functions are always soft-float on
653. // non-WatchOS platforms, but are needed for some targets that use a
654. // hard-float calling convention by default.
  655. if (!Subtarget->isTargetWatchABI()) {
  656. if (Subtarget->isAAPCS_ABI()) {
  657. setLibcallCallingConv(RTLIB::FPROUND_F32_F16, CallingConv::ARM_AAPCS);
  658. setLibcallCallingConv(RTLIB::FPROUND_F64_F16, CallingConv::ARM_AAPCS);
  659. setLibcallCallingConv(RTLIB::FPEXT_F16_F32, CallingConv::ARM_AAPCS);
  660. } else {
  661. setLibcallCallingConv(RTLIB::FPROUND_F32_F16, CallingConv::ARM_APCS);
  662. setLibcallCallingConv(RTLIB::FPROUND_F64_F16, CallingConv::ARM_APCS);
  663. setLibcallCallingConv(RTLIB::FPEXT_F16_F32, CallingConv::ARM_APCS);
  664. }
  665. }
  666. // In EABI, these functions have an __aeabi_ prefix, but in GNUEABI they have
  667. // a __gnu_ prefix (which is the default).
  668. if (Subtarget->isTargetAEABI()) {
  669. static const struct {
  670. const RTLIB::Libcall Op;
  671. const char * const Name;
  672. const CallingConv::ID CC;
  673. } LibraryCalls[] = {
  674. { RTLIB::FPROUND_F32_F16, "__aeabi_f2h", CallingConv::ARM_AAPCS },
  675. { RTLIB::FPROUND_F64_F16, "__aeabi_d2h", CallingConv::ARM_AAPCS },
  676. { RTLIB::FPEXT_F16_F32, "__aeabi_h2f", CallingConv::ARM_AAPCS },
  677. };
  678. for (const auto &LC : LibraryCalls) {
  679. setLibcallName(LC.Op, LC.Name);
  680. setLibcallCallingConv(LC.Op, LC.CC);
  681. }
  682. }
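// Illustrative example (assumption): on an AEABI target, a half-precision
// store such as
//   void store_half(__fp16 *p, float f) { *p = f; }
// uses __aeabi_f2h for the float-to-half conversion instead of the default
// __gnu_-prefixed helper.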
  683. if (Subtarget->isThumb1Only())
  684. addRegisterClass(MVT::i32, &ARM::tGPRRegClass);
  685. else
  686. addRegisterClass(MVT::i32, &ARM::GPRRegClass);
  687. if (!Subtarget->useSoftFloat() && !Subtarget->isThumb1Only() &&
  688. Subtarget->hasFPRegs()) {
  689. addRegisterClass(MVT::f32, &ARM::SPRRegClass);
  690. addRegisterClass(MVT::f64, &ARM::DPRRegClass);
  691. setOperationAction(ISD::FP_TO_SINT_SAT, MVT::i32, Custom);
  692. setOperationAction(ISD::FP_TO_UINT_SAT, MVT::i32, Custom);
  693. setOperationAction(ISD::FP_TO_SINT_SAT, MVT::i64, Custom);
  694. setOperationAction(ISD::FP_TO_UINT_SAT, MVT::i64, Custom);
  695. if (!Subtarget->hasVFP2Base())
  696. setAllExpand(MVT::f32);
  697. if (!Subtarget->hasFP64())
  698. setAllExpand(MVT::f64);
  699. }
  700. if (Subtarget->hasFullFP16()) {
  701. addRegisterClass(MVT::f16, &ARM::HPRRegClass);
  702. setOperationAction(ISD::BITCAST, MVT::i16, Custom);
  703. setOperationAction(ISD::BITCAST, MVT::f16, Custom);
  704. setOperationAction(ISD::FMINNUM, MVT::f16, Legal);
  705. setOperationAction(ISD::FMAXNUM, MVT::f16, Legal);
  706. }
  707. if (Subtarget->hasBF16()) {
  708. addRegisterClass(MVT::bf16, &ARM::HPRRegClass);
  709. setAllExpand(MVT::bf16);
  710. if (!Subtarget->hasFullFP16())
  711. setOperationAction(ISD::BITCAST, MVT::bf16, Custom);
  712. }
  713. for (MVT VT : MVT::fixedlen_vector_valuetypes()) {
  714. for (MVT InnerVT : MVT::fixedlen_vector_valuetypes()) {
  715. setTruncStoreAction(VT, InnerVT, Expand);
  716. addAllExtLoads(VT, InnerVT, Expand);
  717. }
  718. setOperationAction(ISD::SMUL_LOHI, VT, Expand);
  719. setOperationAction(ISD::UMUL_LOHI, VT, Expand);
  720. setOperationAction(ISD::BSWAP, VT, Expand);
  721. }
  722. setOperationAction(ISD::ConstantFP, MVT::f32, Custom);
  723. setOperationAction(ISD::ConstantFP, MVT::f64, Custom);
  724. setOperationAction(ISD::READ_REGISTER, MVT::i64, Custom);
  725. setOperationAction(ISD::WRITE_REGISTER, MVT::i64, Custom);
  726. if (Subtarget->hasMVEIntegerOps())
  727. addMVEVectorTypes(Subtarget->hasMVEFloatOps());
  728. // Combine low-overhead loop intrinsics so that we can lower i1 types.
  729. if (Subtarget->hasLOB()) {
  730. setTargetDAGCombine(ISD::BRCOND);
  731. setTargetDAGCombine(ISD::BR_CC);
  732. }
  733. if (Subtarget->hasNEON()) {
  734. addDRTypeForNEON(MVT::v2f32);
  735. addDRTypeForNEON(MVT::v8i8);
  736. addDRTypeForNEON(MVT::v4i16);
  737. addDRTypeForNEON(MVT::v2i32);
  738. addDRTypeForNEON(MVT::v1i64);
  739. addQRTypeForNEON(MVT::v4f32);
  740. addQRTypeForNEON(MVT::v2f64);
  741. addQRTypeForNEON(MVT::v16i8);
  742. addQRTypeForNEON(MVT::v8i16);
  743. addQRTypeForNEON(MVT::v4i32);
  744. addQRTypeForNEON(MVT::v2i64);
  745. if (Subtarget->hasFullFP16()) {
  746. addQRTypeForNEON(MVT::v8f16);
  747. addDRTypeForNEON(MVT::v4f16);
  748. }
  749. if (Subtarget->hasBF16()) {
  750. addQRTypeForNEON(MVT::v8bf16);
  751. addDRTypeForNEON(MVT::v4bf16);
  752. }
  753. }
  754. if (Subtarget->hasMVEIntegerOps() || Subtarget->hasNEON()) {
  755. // v2f64 is legal so that QR subregs can be extracted as f64 elements, but
  756. // none of Neon, MVE or VFP supports any arithmetic operations on it.
  757. setOperationAction(ISD::FADD, MVT::v2f64, Expand);
  758. setOperationAction(ISD::FSUB, MVT::v2f64, Expand);
  759. setOperationAction(ISD::FMUL, MVT::v2f64, Expand);
760. // FIXME: Code duplication: FDIV and FREM are always expanded; see the
761. // ARMTargetLowering::addTypeForNEON method for details.
  762. setOperationAction(ISD::FDIV, MVT::v2f64, Expand);
  763. setOperationAction(ISD::FREM, MVT::v2f64, Expand);
764. // FIXME: Create unittest.
765. // In other words, find a case where "copysign" appears in the DAG with vector
766. // operands.
  767. setOperationAction(ISD::FCOPYSIGN, MVT::v2f64, Expand);
  768. // FIXME: Code duplication: SETCC has custom operation action, see
  769. // ARMTargetLowering::addTypeForNEON method for details.
  770. setOperationAction(ISD::SETCC, MVT::v2f64, Expand);
  771. // FIXME: Create unittest for FNEG and for FABS.
  772. setOperationAction(ISD::FNEG, MVT::v2f64, Expand);
  773. setOperationAction(ISD::FABS, MVT::v2f64, Expand);
  774. setOperationAction(ISD::FSQRT, MVT::v2f64, Expand);
  775. setOperationAction(ISD::FSIN, MVT::v2f64, Expand);
  776. setOperationAction(ISD::FCOS, MVT::v2f64, Expand);
  777. setOperationAction(ISD::FPOW, MVT::v2f64, Expand);
  778. setOperationAction(ISD::FLOG, MVT::v2f64, Expand);
  779. setOperationAction(ISD::FLOG2, MVT::v2f64, Expand);
  780. setOperationAction(ISD::FLOG10, MVT::v2f64, Expand);
  781. setOperationAction(ISD::FEXP, MVT::v2f64, Expand);
  782. setOperationAction(ISD::FEXP2, MVT::v2f64, Expand);
  783. // FIXME: Create unittest for FCEIL, FTRUNC, FRINT, FNEARBYINT, FFLOOR.
  784. setOperationAction(ISD::FCEIL, MVT::v2f64, Expand);
  785. setOperationAction(ISD::FTRUNC, MVT::v2f64, Expand);
  786. setOperationAction(ISD::FRINT, MVT::v2f64, Expand);
  787. setOperationAction(ISD::FNEARBYINT, MVT::v2f64, Expand);
  788. setOperationAction(ISD::FFLOOR, MVT::v2f64, Expand);
  789. setOperationAction(ISD::FMA, MVT::v2f64, Expand);
  790. }
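// Illustrative consequence (sketch): because only moves, loads and stores of
// v2f64 are really supported, an operation such as
//   %s = fadd <2 x double> %a, %b
// is expected to be broken up by the Expand actions above into scalar f64
// arithmetic (or libcalls when f64 arithmetic itself is unavailable).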
  791. if (Subtarget->hasNEON()) {
792. // The same goes for v4f32, but keep in mind that vadd, vsub and vmul are
793. // natively supported for v4f32.
  794. setOperationAction(ISD::FSQRT, MVT::v4f32, Expand);
  795. setOperationAction(ISD::FSIN, MVT::v4f32, Expand);
  796. setOperationAction(ISD::FCOS, MVT::v4f32, Expand);
  797. setOperationAction(ISD::FPOW, MVT::v4f32, Expand);
  798. setOperationAction(ISD::FLOG, MVT::v4f32, Expand);
  799. setOperationAction(ISD::FLOG2, MVT::v4f32, Expand);
  800. setOperationAction(ISD::FLOG10, MVT::v4f32, Expand);
  801. setOperationAction(ISD::FEXP, MVT::v4f32, Expand);
  802. setOperationAction(ISD::FEXP2, MVT::v4f32, Expand);
  803. setOperationAction(ISD::FCEIL, MVT::v4f32, Expand);
  804. setOperationAction(ISD::FTRUNC, MVT::v4f32, Expand);
  805. setOperationAction(ISD::FRINT, MVT::v4f32, Expand);
  806. setOperationAction(ISD::FNEARBYINT, MVT::v4f32, Expand);
  807. setOperationAction(ISD::FFLOOR, MVT::v4f32, Expand);
  808. // Mark v2f32 intrinsics.
  809. setOperationAction(ISD::FSQRT, MVT::v2f32, Expand);
  810. setOperationAction(ISD::FSIN, MVT::v2f32, Expand);
  811. setOperationAction(ISD::FCOS, MVT::v2f32, Expand);
  812. setOperationAction(ISD::FPOW, MVT::v2f32, Expand);
  813. setOperationAction(ISD::FLOG, MVT::v2f32, Expand);
  814. setOperationAction(ISD::FLOG2, MVT::v2f32, Expand);
  815. setOperationAction(ISD::FLOG10, MVT::v2f32, Expand);
  816. setOperationAction(ISD::FEXP, MVT::v2f32, Expand);
  817. setOperationAction(ISD::FEXP2, MVT::v2f32, Expand);
  818. setOperationAction(ISD::FCEIL, MVT::v2f32, Expand);
  819. setOperationAction(ISD::FTRUNC, MVT::v2f32, Expand);
  820. setOperationAction(ISD::FRINT, MVT::v2f32, Expand);
  821. setOperationAction(ISD::FNEARBYINT, MVT::v2f32, Expand);
  822. setOperationAction(ISD::FFLOOR, MVT::v2f32, Expand);
  823. // Neon does not support some operations on v1i64 and v2i64 types.
  824. setOperationAction(ISD::MUL, MVT::v1i64, Expand);
  825. // Custom handling for some quad-vector types to detect VMULL.
  826. setOperationAction(ISD::MUL, MVT::v8i16, Custom);
  827. setOperationAction(ISD::MUL, MVT::v4i32, Custom);
  828. setOperationAction(ISD::MUL, MVT::v2i64, Custom);
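// Illustrative pattern (assumption): the Custom MUL handling above exists to
// spot widening multiplies such as
//   %wa = sext <8 x i8> %a to <8 x i16>
//   %wb = sext <8 x i8> %b to <8 x i16>
//   %m  = mul <8 x i16> %wa, %wb
// and select a single VMULL.S8 instead of extending both operands and doing a
// full-width multiply.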
  829. // Custom handling for some vector types to avoid expensive expansions
  830. setOperationAction(ISD::SDIV, MVT::v4i16, Custom);
  831. setOperationAction(ISD::SDIV, MVT::v8i8, Custom);
  832. setOperationAction(ISD::UDIV, MVT::v4i16, Custom);
  833. setOperationAction(ISD::UDIV, MVT::v8i8, Custom);
834. // Neon does not have single-instruction SINT_TO_FP and UINT_TO_FP with
835. // a destination type that is wider than the source, nor does
836. // it have a FP_TO_[SU]INT instruction with a destination narrower than the
837. // source.
  838. setOperationAction(ISD::SINT_TO_FP, MVT::v4i16, Custom);
  839. setOperationAction(ISD::SINT_TO_FP, MVT::v8i16, Custom);
  840. setOperationAction(ISD::UINT_TO_FP, MVT::v4i16, Custom);
  841. setOperationAction(ISD::UINT_TO_FP, MVT::v8i16, Custom);
  842. setOperationAction(ISD::FP_TO_UINT, MVT::v4i16, Custom);
  843. setOperationAction(ISD::FP_TO_UINT, MVT::v8i16, Custom);
  844. setOperationAction(ISD::FP_TO_SINT, MVT::v4i16, Custom);
  845. setOperationAction(ISD::FP_TO_SINT, MVT::v8i16, Custom);
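// Illustrative sketch (assumption): a conversion whose result is wider than
// its source, e.g.
//   %f = sitofp <4 x i16> %v to <4 x float>
// is handled by first sign-extending to <4 x i32> and then using the native
// i32 -> f32 vector convert, which is what the Custom entries above arrange.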
  846. setOperationAction(ISD::FP_ROUND, MVT::v2f32, Expand);
  847. setOperationAction(ISD::FP_EXTEND, MVT::v2f64, Expand);
848. // NEON does not have a single-instruction CTPOP for vectors with element
849. // types wider than 8 bits. However, custom lowering can leverage the
850. // v8i8/v16i8 vcnt instruction.
  851. setOperationAction(ISD::CTPOP, MVT::v2i32, Custom);
  852. setOperationAction(ISD::CTPOP, MVT::v4i32, Custom);
  853. setOperationAction(ISD::CTPOP, MVT::v4i16, Custom);
  854. setOperationAction(ISD::CTPOP, MVT::v8i16, Custom);
  855. setOperationAction(ISD::CTPOP, MVT::v1i64, Custom);
  856. setOperationAction(ISD::CTPOP, MVT::v2i64, Custom);
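// Illustrative sketch (assumption): a wide-element population count, e.g.
//   %c = call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> %v)
// can be lowered by counting bits per byte with VCNT.8 and then summing the
// byte counts pairwise (VPADDL) up to the element width, which is the
// "leverage" referred to above.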
  857. setOperationAction(ISD::CTLZ, MVT::v1i64, Expand);
  858. setOperationAction(ISD::CTLZ, MVT::v2i64, Expand);
859. // NEON does not have a single-instruction CTTZ for vectors.
  860. setOperationAction(ISD::CTTZ, MVT::v8i8, Custom);
  861. setOperationAction(ISD::CTTZ, MVT::v4i16, Custom);
  862. setOperationAction(ISD::CTTZ, MVT::v2i32, Custom);
  863. setOperationAction(ISD::CTTZ, MVT::v1i64, Custom);
  864. setOperationAction(ISD::CTTZ, MVT::v16i8, Custom);
  865. setOperationAction(ISD::CTTZ, MVT::v8i16, Custom);
  866. setOperationAction(ISD::CTTZ, MVT::v4i32, Custom);
  867. setOperationAction(ISD::CTTZ, MVT::v2i64, Custom);
  868. setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v8i8, Custom);
  869. setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v4i16, Custom);
  870. setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v2i32, Custom);
  871. setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v1i64, Custom);
  872. setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v16i8, Custom);
  873. setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v8i16, Custom);
  874. setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v4i32, Custom);
  875. setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v2i64, Custom);
  876. for (MVT VT : MVT::fixedlen_vector_valuetypes()) {
  877. setOperationAction(ISD::MULHS, VT, Expand);
  878. setOperationAction(ISD::MULHU, VT, Expand);
  879. }
  880. // NEON only has FMA instructions as of VFP4.
  881. if (!Subtarget->hasVFP4Base()) {
  882. setOperationAction(ISD::FMA, MVT::v2f32, Expand);
  883. setOperationAction(ISD::FMA, MVT::v4f32, Expand);
  884. }
  885. setTargetDAGCombine(ISD::SHL);
  886. setTargetDAGCombine(ISD::SRL);
  887. setTargetDAGCombine(ISD::SRA);
  888. setTargetDAGCombine(ISD::FP_TO_SINT);
  889. setTargetDAGCombine(ISD::FP_TO_UINT);
  890. setTargetDAGCombine(ISD::FDIV);
  891. setTargetDAGCombine(ISD::LOAD);
892. // It is legal to extload from these narrow vector types to any wider integer vector type.
  893. for (MVT Ty : {MVT::v8i8, MVT::v4i8, MVT::v2i8, MVT::v4i16, MVT::v2i16,
  894. MVT::v2i32}) {
  895. for (MVT VT : MVT::integer_fixedlen_vector_valuetypes()) {
  896. setLoadExtAction(ISD::EXTLOAD, VT, Ty, Legal);
  897. setLoadExtAction(ISD::ZEXTLOAD, VT, Ty, Legal);
  898. setLoadExtAction(ISD::SEXTLOAD, VT, Ty, Legal);
  899. }
  900. }
  901. }
  902. if (Subtarget->hasNEON() || Subtarget->hasMVEIntegerOps()) {
  903. setTargetDAGCombine(ISD::BUILD_VECTOR);
  904. setTargetDAGCombine(ISD::VECTOR_SHUFFLE);
  905. setTargetDAGCombine(ISD::INSERT_SUBVECTOR);
  906. setTargetDAGCombine(ISD::INSERT_VECTOR_ELT);
  907. setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
  908. setTargetDAGCombine(ISD::SIGN_EXTEND_INREG);
  909. setTargetDAGCombine(ISD::STORE);
  910. setTargetDAGCombine(ISD::SIGN_EXTEND);
  911. setTargetDAGCombine(ISD::ZERO_EXTEND);
  912. setTargetDAGCombine(ISD::ANY_EXTEND);
  913. setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN);
  914. setTargetDAGCombine(ISD::INTRINSIC_W_CHAIN);
  915. setTargetDAGCombine(ISD::INTRINSIC_VOID);
  916. setTargetDAGCombine(ISD::VECREDUCE_ADD);
  917. setTargetDAGCombine(ISD::ADD);
  918. setTargetDAGCombine(ISD::BITCAST);
  919. }
  920. if (Subtarget->hasMVEIntegerOps()) {
  921. setTargetDAGCombine(ISD::SMIN);
  922. setTargetDAGCombine(ISD::UMIN);
  923. setTargetDAGCombine(ISD::SMAX);
  924. setTargetDAGCombine(ISD::UMAX);
  925. setTargetDAGCombine(ISD::FP_EXTEND);
  926. setTargetDAGCombine(ISD::SELECT);
  927. setTargetDAGCombine(ISD::SELECT_CC);
  928. setTargetDAGCombine(ISD::SETCC);
  929. }
  930. if (Subtarget->hasMVEFloatOps()) {
  931. setTargetDAGCombine(ISD::FADD);
  932. }
  933. if (!Subtarget->hasFP64()) {
  934. // When targeting a floating-point unit with only single-precision
  935. // operations, f64 is legal for the few double-precision instructions which
936. // are present. However, no double-precision operations other than moves,
  937. // loads and stores are provided by the hardware.
  938. setOperationAction(ISD::FADD, MVT::f64, Expand);
  939. setOperationAction(ISD::FSUB, MVT::f64, Expand);
  940. setOperationAction(ISD::FMUL, MVT::f64, Expand);
  941. setOperationAction(ISD::FMA, MVT::f64, Expand);
  942. setOperationAction(ISD::FDIV, MVT::f64, Expand);
  943. setOperationAction(ISD::FREM, MVT::f64, Expand);
  944. setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
  945. setOperationAction(ISD::FGETSIGN, MVT::f64, Expand);
  946. setOperationAction(ISD::FNEG, MVT::f64, Expand);
  947. setOperationAction(ISD::FABS, MVT::f64, Expand);
  948. setOperationAction(ISD::FSQRT, MVT::f64, Expand);
  949. setOperationAction(ISD::FSIN, MVT::f64, Expand);
  950. setOperationAction(ISD::FCOS, MVT::f64, Expand);
  951. setOperationAction(ISD::FPOW, MVT::f64, Expand);
  952. setOperationAction(ISD::FLOG, MVT::f64, Expand);
  953. setOperationAction(ISD::FLOG2, MVT::f64, Expand);
  954. setOperationAction(ISD::FLOG10, MVT::f64, Expand);
  955. setOperationAction(ISD::FEXP, MVT::f64, Expand);
  956. setOperationAction(ISD::FEXP2, MVT::f64, Expand);
  957. setOperationAction(ISD::FCEIL, MVT::f64, Expand);
  958. setOperationAction(ISD::FTRUNC, MVT::f64, Expand);
  959. setOperationAction(ISD::FRINT, MVT::f64, Expand);
  960. setOperationAction(ISD::FNEARBYINT, MVT::f64, Expand);
  961. setOperationAction(ISD::FFLOOR, MVT::f64, Expand);
  962. setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
  963. setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom);
  964. setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
  965. setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom);
  966. setOperationAction(ISD::FP_TO_SINT, MVT::f64, Custom);
  967. setOperationAction(ISD::FP_TO_UINT, MVT::f64, Custom);
  968. setOperationAction(ISD::FP_ROUND, MVT::f32, Custom);
  969. setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i32, Custom);
  970. setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i32, Custom);
  971. setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::f64, Custom);
  972. setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::f64, Custom);
  973. setOperationAction(ISD::STRICT_FP_ROUND, MVT::f32, Custom);
  974. }
  975. if (!Subtarget->hasFP64() || !Subtarget->hasFPARMv8Base()) {
  976. setOperationAction(ISD::FP_EXTEND, MVT::f64, Custom);
  977. setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f64, Custom);
  978. if (Subtarget->hasFullFP16()) {
  979. setOperationAction(ISD::FP_ROUND, MVT::f16, Custom);
  980. setOperationAction(ISD::STRICT_FP_ROUND, MVT::f16, Custom);
  981. }
  982. }
  983. if (!Subtarget->hasFP16()) {
  984. setOperationAction(ISD::FP_EXTEND, MVT::f32, Custom);
  985. setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f32, Custom);
  986. }
  987. computeRegisterProperties(Subtarget->getRegisterInfo());
  988. // ARM does not have floating-point extending loads.
  989. for (MVT VT : MVT::fp_valuetypes()) {
  990. setLoadExtAction(ISD::EXTLOAD, VT, MVT::f32, Expand);
  991. setLoadExtAction(ISD::EXTLOAD, VT, MVT::f16, Expand);
  992. }
  993. // ... or truncating stores
  994. setTruncStoreAction(MVT::f64, MVT::f32, Expand);
  995. setTruncStoreAction(MVT::f32, MVT::f16, Expand);
  996. setTruncStoreAction(MVT::f64, MVT::f16, Expand);
  997. // ARM does not have i1 sign extending load.
  998. for (MVT VT : MVT::integer_valuetypes())
  999. setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
  1000. // ARM supports all 4 flavors of integer indexed load / store.
  1001. if (!Subtarget->isThumb1Only()) {
  1002. for (unsigned im = (unsigned)ISD::PRE_INC;
  1003. im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) {
  1004. setIndexedLoadAction(im, MVT::i1, Legal);
  1005. setIndexedLoadAction(im, MVT::i8, Legal);
  1006. setIndexedLoadAction(im, MVT::i16, Legal);
  1007. setIndexedLoadAction(im, MVT::i32, Legal);
  1008. setIndexedStoreAction(im, MVT::i1, Legal);
  1009. setIndexedStoreAction(im, MVT::i8, Legal);
  1010. setIndexedStoreAction(im, MVT::i16, Legal);
  1011. setIndexedStoreAction(im, MVT::i32, Legal);
  1012. }
  1013. } else {
  1014. // Thumb-1 has limited post-inc load/store support - LDM r0!, {r1}.
  1015. setIndexedLoadAction(ISD::POST_INC, MVT::i32, Legal);
  1016. setIndexedStoreAction(ISD::POST_INC, MVT::i32, Legal);
  1017. }
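// Illustrative example: the classic pointer-bumping load
//   int next(int **pp) { return *(*pp)++; }
// can use the post-incrementing LDM form mentioned above on Thumb-1, while the
// other indexed addressing modes remain unsupported there.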
  1018. setOperationAction(ISD::SADDO, MVT::i32, Custom);
  1019. setOperationAction(ISD::UADDO, MVT::i32, Custom);
  1020. setOperationAction(ISD::SSUBO, MVT::i32, Custom);
  1021. setOperationAction(ISD::USUBO, MVT::i32, Custom);
  1022. setOperationAction(ISD::ADDCARRY, MVT::i32, Custom);
  1023. setOperationAction(ISD::SUBCARRY, MVT::i32, Custom);
  1024. if (Subtarget->hasDSP()) {
  1025. setOperationAction(ISD::SADDSAT, MVT::i8, Custom);
  1026. setOperationAction(ISD::SSUBSAT, MVT::i8, Custom);
  1027. setOperationAction(ISD::SADDSAT, MVT::i16, Custom);
  1028. setOperationAction(ISD::SSUBSAT, MVT::i16, Custom);
  1029. setOperationAction(ISD::UADDSAT, MVT::i8, Custom);
  1030. setOperationAction(ISD::USUBSAT, MVT::i8, Custom);
  1031. setOperationAction(ISD::UADDSAT, MVT::i16, Custom);
  1032. setOperationAction(ISD::USUBSAT, MVT::i16, Custom);
  1033. }
  1034. if (Subtarget->hasBaseDSP()) {
  1035. setOperationAction(ISD::SADDSAT, MVT::i32, Legal);
  1036. setOperationAction(ISD::SSUBSAT, MVT::i32, Legal);
  1037. }
  1038. // i64 operation support.
  1039. setOperationAction(ISD::MUL, MVT::i64, Expand);
  1040. setOperationAction(ISD::MULHU, MVT::i32, Expand);
  1041. if (Subtarget->isThumb1Only()) {
  1042. setOperationAction(ISD::UMUL_LOHI, MVT::i32, Expand);
  1043. setOperationAction(ISD::SMUL_LOHI, MVT::i32, Expand);
  1044. }
  1045. if (Subtarget->isThumb1Only() || !Subtarget->hasV6Ops()
  1046. || (Subtarget->isThumb2() && !Subtarget->hasDSP()))
  1047. setOperationAction(ISD::MULHS, MVT::i32, Expand);
  1048. setOperationAction(ISD::SHL_PARTS, MVT::i32, Custom);
  1049. setOperationAction(ISD::SRA_PARTS, MVT::i32, Custom);
  1050. setOperationAction(ISD::SRL_PARTS, MVT::i32, Custom);
  1051. setOperationAction(ISD::SRL, MVT::i64, Custom);
  1052. setOperationAction(ISD::SRA, MVT::i64, Custom);
  1053. setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
  1054. setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i64, Custom);
  1055. setOperationAction(ISD::LOAD, MVT::i64, Custom);
  1056. setOperationAction(ISD::STORE, MVT::i64, Custom);
1057. // MVE lowers 64-bit shifts to lsll and lsrl,
1058. // assuming that ISD::SRL and ISD::SRA of i64 are already marked Custom above.
  1059. if (Subtarget->hasMVEIntegerOps())
  1060. setOperationAction(ISD::SHL, MVT::i64, Custom);
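// Illustrative example (assumption): with MVE, a 64-bit shift such as
//   unsigned long long shl(unsigned long long x, int n) { return x << n; }
// can be selected to the lsll long-shift instruction operating on a GPR pair,
// instead of the generic SHL_PARTS expansion.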
  1061. // Expand to __aeabi_l{lsl,lsr,asr} calls for Thumb1.
  1062. if (Subtarget->isThumb1Only()) {
  1063. setOperationAction(ISD::SHL_PARTS, MVT::i32, Expand);
  1064. setOperationAction(ISD::SRA_PARTS, MVT::i32, Expand);
  1065. setOperationAction(ISD::SRL_PARTS, MVT::i32, Expand);
  1066. }
  1067. if (!Subtarget->isThumb1Only() && Subtarget->hasV6T2Ops())
  1068. setOperationAction(ISD::BITREVERSE, MVT::i32, Legal);
  1069. // ARM does not have ROTL.
  1070. setOperationAction(ISD::ROTL, MVT::i32, Expand);
  1071. for (MVT VT : MVT::fixedlen_vector_valuetypes()) {
  1072. setOperationAction(ISD::ROTL, VT, Expand);
  1073. setOperationAction(ISD::ROTR, VT, Expand);
  1074. }
  1075. setOperationAction(ISD::CTTZ, MVT::i32, Custom);
  1076. setOperationAction(ISD::CTPOP, MVT::i32, Expand);
  1077. if (!Subtarget->hasV5TOps() || Subtarget->isThumb1Only()) {
  1078. setOperationAction(ISD::CTLZ, MVT::i32, Expand);
  1079. setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32, LibCall);
  1080. }
  1081. // @llvm.readcyclecounter requires the Performance Monitors extension.
  1082. // Default to the 0 expansion on unsupported platforms.
  1083. // FIXME: Technically there are older ARM CPUs that have
  1084. // implementation-specific ways of obtaining this information.
  1085. if (Subtarget->hasPerfMon())
  1086. setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Custom);
1087. // Only ARMv6 and later have BSWAP (the REV instruction).
  1088. if (!Subtarget->hasV6Ops())
  1089. setOperationAction(ISD::BSWAP, MVT::i32, Expand);
  1090. bool hasDivide = Subtarget->isThumb() ? Subtarget->hasDivideInThumbMode()
  1091. : Subtarget->hasDivideInARMMode();
  1092. if (!hasDivide) {
1093. // These are expanded into libcalls if the CPU doesn't have a hardware divider.
  1094. setOperationAction(ISD::SDIV, MVT::i32, LibCall);
  1095. setOperationAction(ISD::UDIV, MVT::i32, LibCall);
  1096. }
  1097. if (Subtarget->isTargetWindows() && !Subtarget->hasDivideInThumbMode()) {
  1098. setOperationAction(ISD::SDIV, MVT::i32, Custom);
  1099. setOperationAction(ISD::UDIV, MVT::i32, Custom);
  1100. setOperationAction(ISD::SDIV, MVT::i64, Custom);
  1101. setOperationAction(ISD::UDIV, MVT::i64, Custom);
  1102. }
  1103. setOperationAction(ISD::SREM, MVT::i32, Expand);
  1104. setOperationAction(ISD::UREM, MVT::i32, Expand);
  1105. // Register based DivRem for AEABI (RTABI 4.2)
  1106. if (Subtarget->isTargetAEABI() || Subtarget->isTargetAndroid() ||
  1107. Subtarget->isTargetGNUAEABI() || Subtarget->isTargetMuslAEABI() ||
  1108. Subtarget->isTargetWindows()) {
  1109. setOperationAction(ISD::SREM, MVT::i64, Custom);
  1110. setOperationAction(ISD::UREM, MVT::i64, Custom);
  1111. HasStandaloneRem = false;
  1112. if (Subtarget->isTargetWindows()) {
  1113. const struct {
  1114. const RTLIB::Libcall Op;
  1115. const char * const Name;
  1116. const CallingConv::ID CC;
  1117. } LibraryCalls[] = {
  1118. { RTLIB::SDIVREM_I8, "__rt_sdiv", CallingConv::ARM_AAPCS },
  1119. { RTLIB::SDIVREM_I16, "__rt_sdiv", CallingConv::ARM_AAPCS },
  1120. { RTLIB::SDIVREM_I32, "__rt_sdiv", CallingConv::ARM_AAPCS },
  1121. { RTLIB::SDIVREM_I64, "__rt_sdiv64", CallingConv::ARM_AAPCS },
  1122. { RTLIB::UDIVREM_I8, "__rt_udiv", CallingConv::ARM_AAPCS },
  1123. { RTLIB::UDIVREM_I16, "__rt_udiv", CallingConv::ARM_AAPCS },
  1124. { RTLIB::UDIVREM_I32, "__rt_udiv", CallingConv::ARM_AAPCS },
  1125. { RTLIB::UDIVREM_I64, "__rt_udiv64", CallingConv::ARM_AAPCS },
  1126. };
  1127. for (const auto &LC : LibraryCalls) {
  1128. setLibcallName(LC.Op, LC.Name);
  1129. setLibcallCallingConv(LC.Op, LC.CC);
  1130. }
  1131. } else {
  1132. const struct {
  1133. const RTLIB::Libcall Op;
  1134. const char * const Name;
  1135. const CallingConv::ID CC;
  1136. } LibraryCalls[] = {
  1137. { RTLIB::SDIVREM_I8, "__aeabi_idivmod", CallingConv::ARM_AAPCS },
  1138. { RTLIB::SDIVREM_I16, "__aeabi_idivmod", CallingConv::ARM_AAPCS },
  1139. { RTLIB::SDIVREM_I32, "__aeabi_idivmod", CallingConv::ARM_AAPCS },
  1140. { RTLIB::SDIVREM_I64, "__aeabi_ldivmod", CallingConv::ARM_AAPCS },
  1141. { RTLIB::UDIVREM_I8, "__aeabi_uidivmod", CallingConv::ARM_AAPCS },
  1142. { RTLIB::UDIVREM_I16, "__aeabi_uidivmod", CallingConv::ARM_AAPCS },
  1143. { RTLIB::UDIVREM_I32, "__aeabi_uidivmod", CallingConv::ARM_AAPCS },
  1144. { RTLIB::UDIVREM_I64, "__aeabi_uldivmod", CallingConv::ARM_AAPCS },
  1145. };
  1146. for (const auto &LC : LibraryCalls) {
  1147. setLibcallName(LC.Op, LC.Name);
  1148. setLibcallCallingConv(LC.Op, LC.CC);
  1149. }
  1150. }
  1151. setOperationAction(ISD::SDIVREM, MVT::i32, Custom);
  1152. setOperationAction(ISD::UDIVREM, MVT::i32, Custom);
  1153. setOperationAction(ISD::SDIVREM, MVT::i64, Custom);
  1154. setOperationAction(ISD::UDIVREM, MVT::i64, Custom);
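// Illustrative example (assumption): on these targets a function that needs
// both quotient and remainder, e.g.
//   int both(int a, int b) { return a / b + a % b; }
// is lowered to one __aeabi_idivmod call, which returns both values in
// registers, rather than two separate libcalls.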
  1155. } else {
  1156. setOperationAction(ISD::SDIVREM, MVT::i32, Expand);
  1157. setOperationAction(ISD::UDIVREM, MVT::i32, Expand);
  1158. }
  1159. if (Subtarget->getTargetTriple().isOSMSVCRT()) {
  1160. // MSVCRT doesn't have powi; fall back to pow
  1161. setLibcallName(RTLIB::POWI_F32, nullptr);
  1162. setLibcallName(RTLIB::POWI_F64, nullptr);
  1163. }
  1164. setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
  1165. setOperationAction(ISD::ConstantPool, MVT::i32, Custom);
  1166. setOperationAction(ISD::GlobalTLSAddress, MVT::i32, Custom);
  1167. setOperationAction(ISD::BlockAddress, MVT::i32, Custom);
  1168. setOperationAction(ISD::TRAP, MVT::Other, Legal);
  1169. setOperationAction(ISD::DEBUGTRAP, MVT::Other, Legal);
  1170. // Use the default implementation.
  1171. setOperationAction(ISD::VASTART, MVT::Other, Custom);
  1172. setOperationAction(ISD::VAARG, MVT::Other, Expand);
  1173. setOperationAction(ISD::VACOPY, MVT::Other, Expand);
  1174. setOperationAction(ISD::VAEND, MVT::Other, Expand);
  1175. setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
  1176. setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);
  1177. if (Subtarget->isTargetWindows())
  1178. setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Custom);
  1179. else
  1180. setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Expand);
1181. // ARMv6 Thumb1 and earlier (except for CPUs that support dmb / dsb) use
1182. // the default expansion.
  1183. InsertFencesForAtomic = false;
  1184. if (Subtarget->hasAnyDataBarrier() &&
  1185. (!Subtarget->isThumb() || Subtarget->hasV8MBaselineOps())) {
  1186. // ATOMIC_FENCE needs custom lowering; the others should have been expanded
  1187. // to ldrex/strex loops already.
  1188. setOperationAction(ISD::ATOMIC_FENCE, MVT::Other, Custom);
  1189. if (!Subtarget->isThumb() || !Subtarget->isMClass())
  1190. setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i64, Custom);
  1191. // On v8, we have particularly efficient implementations of atomic fences
  1192. // if they can be combined with nearby atomic loads and stores.
  1193. if (!Subtarget->hasAcquireRelease() ||
  1194. getTargetMachine().getOptLevel() == 0) {
  1195. // Automatically insert fences (dmb ish) around ATOMIC_SWAP etc.
  1196. InsertFencesForAtomic = true;
  1197. }
  1198. } else {
  1199. // If there's anything we can use as a barrier, go through custom lowering
  1200. // for ATOMIC_FENCE.
1201. // If the target has DMB in Thumb mode, fences can be inserted.
  1202. if (Subtarget->hasDataBarrier())
  1203. InsertFencesForAtomic = true;
  1204. setOperationAction(ISD::ATOMIC_FENCE, MVT::Other,
  1205. Subtarget->hasAnyDataBarrier() ? Custom : Expand);
  1206. // Set them all for expansion, which will force libcalls.
  1207. setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i32, Expand);
  1208. setOperationAction(ISD::ATOMIC_SWAP, MVT::i32, Expand);
  1209. setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i32, Expand);
  1210. setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i32, Expand);
  1211. setOperationAction(ISD::ATOMIC_LOAD_AND, MVT::i32, Expand);
  1212. setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i32, Expand);
  1213. setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i32, Expand);
  1214. setOperationAction(ISD::ATOMIC_LOAD_NAND, MVT::i32, Expand);
  1215. setOperationAction(ISD::ATOMIC_LOAD_MIN, MVT::i32, Expand);
  1216. setOperationAction(ISD::ATOMIC_LOAD_MAX, MVT::i32, Expand);
  1217. setOperationAction(ISD::ATOMIC_LOAD_UMIN, MVT::i32, Expand);
  1218. setOperationAction(ISD::ATOMIC_LOAD_UMAX, MVT::i32, Expand);
  1219. // Mark ATOMIC_LOAD and ATOMIC_STORE custom so we can handle the
  1220. // Unordered/Monotonic case.
  1221. if (!InsertFencesForAtomic) {
  1222. setOperationAction(ISD::ATOMIC_LOAD, MVT::i32, Custom);
  1223. setOperationAction(ISD::ATOMIC_STORE, MVT::i32, Custom);
  1224. }
  1225. }
  1226. setOperationAction(ISD::PREFETCH, MVT::Other, Custom);
  1227. // Requires SXTB/SXTH, available on v6 and up in both ARM and Thumb modes.
  1228. if (!Subtarget->hasV6Ops()) {
  1229. setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Expand);
  1230. setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8, Expand);
  1231. }
  1232. setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);
  1233. if (!Subtarget->useSoftFloat() && Subtarget->hasFPRegs() &&
  1234. !Subtarget->isThumb1Only()) {
1235. // Turn f64 -> i64 into VMOVRRD and i64 -> f64 into VMOVDRR,
1236. // iff the target supports VFP2.
  1237. setOperationAction(ISD::BITCAST, MVT::i64, Custom);
  1238. setOperationAction(ISD::FLT_ROUNDS_, MVT::i32, Custom);
  1239. setOperationAction(ISD::SET_ROUNDING, MVT::Other, Custom);
  1240. }
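// Illustrative sketch: a scalar bitcast between f64 and i64, e.g.
//   %i = bitcast double %d to i64
// is what the Custom BITCAST entry above turns into a VMOVRRD (and the reverse
// direction into a VMOVDRR), moving the value between a D register and a GPR
// pair.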
  1241. // We want to custom lower some of our intrinsics.
  1242. setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
  1243. setOperationAction(ISD::EH_SJLJ_SETJMP, MVT::i32, Custom);
  1244. setOperationAction(ISD::EH_SJLJ_LONGJMP, MVT::Other, Custom);
  1245. setOperationAction(ISD::EH_SJLJ_SETUP_DISPATCH, MVT::Other, Custom);
  1246. if (Subtarget->useSjLjEH())
  1247. setLibcallName(RTLIB::UNWIND_RESUME, "_Unwind_SjLj_Resume");
  1248. setOperationAction(ISD::SETCC, MVT::i32, Expand);
  1249. setOperationAction(ISD::SETCC, MVT::f32, Expand);
  1250. setOperationAction(ISD::SETCC, MVT::f64, Expand);
  1251. setOperationAction(ISD::SELECT, MVT::i32, Custom);
  1252. setOperationAction(ISD::SELECT, MVT::f32, Custom);
  1253. setOperationAction(ISD::SELECT, MVT::f64, Custom);
  1254. setOperationAction(ISD::SELECT_CC, MVT::i32, Custom);
  1255. setOperationAction(ISD::SELECT_CC, MVT::f32, Custom);
  1256. setOperationAction(ISD::SELECT_CC, MVT::f64, Custom);
  1257. if (Subtarget->hasFullFP16()) {
  1258. setOperationAction(ISD::SETCC, MVT::f16, Expand);
  1259. setOperationAction(ISD::SELECT, MVT::f16, Custom);
  1260. setOperationAction(ISD::SELECT_CC, MVT::f16, Custom);
  1261. }
  1262. setOperationAction(ISD::SETCCCARRY, MVT::i32, Custom);
  1263. setOperationAction(ISD::BRCOND, MVT::Other, Custom);
  1264. setOperationAction(ISD::BR_CC, MVT::i32, Custom);
  1265. if (Subtarget->hasFullFP16())
  1266. setOperationAction(ISD::BR_CC, MVT::f16, Custom);
  1267. setOperationAction(ISD::BR_CC, MVT::f32, Custom);
  1268. setOperationAction(ISD::BR_CC, MVT::f64, Custom);
  1269. setOperationAction(ISD::BR_JT, MVT::Other, Custom);
  1270. // We don't support sin/cos/fmod/copysign/pow
  1271. setOperationAction(ISD::FSIN, MVT::f64, Expand);
  1272. setOperationAction(ISD::FSIN, MVT::f32, Expand);
  1273. setOperationAction(ISD::FCOS, MVT::f32, Expand);
  1274. setOperationAction(ISD::FCOS, MVT::f64, Expand);
  1275. setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
  1276. setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
  1277. setOperationAction(ISD::FREM, MVT::f64, Expand);
  1278. setOperationAction(ISD::FREM, MVT::f32, Expand);
  1279. if (!Subtarget->useSoftFloat() && Subtarget->hasVFP2Base() &&
  1280. !Subtarget->isThumb1Only()) {
  1281. setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom);
  1282. setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);
  1283. }
  1284. setOperationAction(ISD::FPOW, MVT::f64, Expand);
  1285. setOperationAction(ISD::FPOW, MVT::f32, Expand);
  1286. if (!Subtarget->hasVFP4Base()) {
  1287. setOperationAction(ISD::FMA, MVT::f64, Expand);
  1288. setOperationAction(ISD::FMA, MVT::f32, Expand);
  1289. }
  1290. // Various VFP goodness
  1291. if (!Subtarget->useSoftFloat() && !Subtarget->isThumb1Only()) {
  1292. // FP-ARMv8 adds f64 <-> f16 conversion. Before that it should be expanded.
  1293. if (!Subtarget->hasFPARMv8Base() || !Subtarget->hasFP64()) {
  1294. setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand);
  1295. setOperationAction(ISD::FP_TO_FP16, MVT::f64, Expand);
  1296. }
  1297. // fp16 is a special v7 extension that adds f16 <-> f32 conversions.
  1298. if (!Subtarget->hasFP16()) {
  1299. setOperationAction(ISD::FP16_TO_FP, MVT::f32, Expand);
  1300. setOperationAction(ISD::FP_TO_FP16, MVT::f32, Expand);
  1301. }
  1302. // Strict floating-point comparisons need custom lowering.
  1303. setOperationAction(ISD::STRICT_FSETCC, MVT::f16, Custom);
  1304. setOperationAction(ISD::STRICT_FSETCCS, MVT::f16, Custom);
  1305. setOperationAction(ISD::STRICT_FSETCC, MVT::f32, Custom);
  1306. setOperationAction(ISD::STRICT_FSETCCS, MVT::f32, Custom);
  1307. setOperationAction(ISD::STRICT_FSETCC, MVT::f64, Custom);
  1308. setOperationAction(ISD::STRICT_FSETCCS, MVT::f64, Custom);
  1309. }
  1310. // Use __sincos_stret if available.
  1311. if (getLibcallName(RTLIB::SINCOS_STRET_F32) != nullptr &&
  1312. getLibcallName(RTLIB::SINCOS_STRET_F64) != nullptr) {
  1313. setOperationAction(ISD::FSINCOS, MVT::f64, Custom);
  1314. setOperationAction(ISD::FSINCOS, MVT::f32, Custom);
  1315. }
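// Illustrative example (assumption): when __sincos_stret is available, code
// that needs both results, e.g.
//   void sc(float x, float *s, float *c) { *s = sinf(x); *c = cosf(x); }
// can be combined into a single __sincos_stret call instead of separate sinf
// and cosf libcalls.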
  1316. // FP-ARMv8 implements a lot of rounding-like FP operations.
  1317. if (Subtarget->hasFPARMv8Base()) {
  1318. setOperationAction(ISD::FFLOOR, MVT::f32, Legal);
  1319. setOperationAction(ISD::FCEIL, MVT::f32, Legal);
  1320. setOperationAction(ISD::FROUND, MVT::f32, Legal);
  1321. setOperationAction(ISD::FTRUNC, MVT::f32, Legal);
  1322. setOperationAction(ISD::FNEARBYINT, MVT::f32, Legal);
  1323. setOperationAction(ISD::FRINT, MVT::f32, Legal);
  1324. setOperationAction(ISD::FMINNUM, MVT::f32, Legal);
  1325. setOperationAction(ISD::FMAXNUM, MVT::f32, Legal);
  1326. if (Subtarget->hasNEON()) {
  1327. setOperationAction(ISD::FMINNUM, MVT::v2f32, Legal);
  1328. setOperationAction(ISD::FMAXNUM, MVT::v2f32, Legal);
  1329. setOperationAction(ISD::FMINNUM, MVT::v4f32, Legal);
  1330. setOperationAction(ISD::FMAXNUM, MVT::v4f32, Legal);
  1331. }
  1332. if (Subtarget->hasFP64()) {
  1333. setOperationAction(ISD::FFLOOR, MVT::f64, Legal);
  1334. setOperationAction(ISD::FCEIL, MVT::f64, Legal);
  1335. setOperationAction(ISD::FROUND, MVT::f64, Legal);
  1336. setOperationAction(ISD::FTRUNC, MVT::f64, Legal);
  1337. setOperationAction(ISD::FNEARBYINT, MVT::f64, Legal);
  1338. setOperationAction(ISD::FRINT, MVT::f64, Legal);
  1339. setOperationAction(ISD::FMINNUM, MVT::f64, Legal);
  1340. setOperationAction(ISD::FMAXNUM, MVT::f64, Legal);
  1341. }
  1342. }
1343. // f16 often needs to be promoted to f32 to call library functions.
  1344. if (Subtarget->hasFullFP16()) {
  1345. setOperationAction(ISD::FREM, MVT::f16, Promote);
  1346. setOperationAction(ISD::FCOPYSIGN, MVT::f16, Expand);
  1347. setOperationAction(ISD::FSIN, MVT::f16, Promote);
  1348. setOperationAction(ISD::FCOS, MVT::f16, Promote);
  1349. setOperationAction(ISD::FSINCOS, MVT::f16, Promote);
  1350. setOperationAction(ISD::FPOWI, MVT::f16, Promote);
  1351. setOperationAction(ISD::FPOW, MVT::f16, Promote);
  1352. setOperationAction(ISD::FEXP, MVT::f16, Promote);
  1353. setOperationAction(ISD::FEXP2, MVT::f16, Promote);
  1354. setOperationAction(ISD::FLOG, MVT::f16, Promote);
  1355. setOperationAction(ISD::FLOG10, MVT::f16, Promote);
  1356. setOperationAction(ISD::FLOG2, MVT::f16, Promote);
  1357. setOperationAction(ISD::FROUND, MVT::f16, Legal);
  1358. }
  1359. if (Subtarget->hasNEON()) {
  1360. // vmin and vmax aren't available in a scalar form, so we can use
  1361. // a NEON instruction with an undef lane instead. This has a performance
  1362. // penalty on some cores, so we don't do this unless we have been
  1363. // asked to by the core tuning model.
  1364. if (Subtarget->useNEONForSinglePrecisionFP()) {
  1365. setOperationAction(ISD::FMINIMUM, MVT::f32, Legal);
  1366. setOperationAction(ISD::FMAXIMUM, MVT::f32, Legal);
  1367. setOperationAction(ISD::FMINIMUM, MVT::f16, Legal);
  1368. setOperationAction(ISD::FMAXIMUM, MVT::f16, Legal);
  1369. }
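// Illustrative sketch (assumption): with this tuning enabled, a scalar
// NaN-propagating minimum such as
//   %m = call float @llvm.minimum.f32(float %a, float %b)
// can be selected as a d-register VMIN.F32 whose unused lanes are undef,
// rather than being expanded into a compare and select.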
  1370. setOperationAction(ISD::FMINIMUM, MVT::v2f32, Legal);
  1371. setOperationAction(ISD::FMAXIMUM, MVT::v2f32, Legal);
  1372. setOperationAction(ISD::FMINIMUM, MVT::v4f32, Legal);
  1373. setOperationAction(ISD::FMAXIMUM, MVT::v4f32, Legal);
  1374. if (Subtarget->hasFullFP16()) {
  1375. setOperationAction(ISD::FMINNUM, MVT::v4f16, Legal);
  1376. setOperationAction(ISD::FMAXNUM, MVT::v4f16, Legal);
  1377. setOperationAction(ISD::FMINNUM, MVT::v8f16, Legal);
  1378. setOperationAction(ISD::FMAXNUM, MVT::v8f16, Legal);
  1379. setOperationAction(ISD::FMINIMUM, MVT::v4f16, Legal);
  1380. setOperationAction(ISD::FMAXIMUM, MVT::v4f16, Legal);
  1381. setOperationAction(ISD::FMINIMUM, MVT::v8f16, Legal);
  1382. setOperationAction(ISD::FMAXIMUM, MVT::v8f16, Legal);
  1383. }
  1384. }
1385. // We have target-specific DAG combine patterns for the following nodes:
  1386. // ARMISD::VMOVRRD - No need to call setTargetDAGCombine
  1387. setTargetDAGCombine(ISD::ADD);
  1388. setTargetDAGCombine(ISD::SUB);
  1389. setTargetDAGCombine(ISD::MUL);
  1390. setTargetDAGCombine(ISD::AND);
  1391. setTargetDAGCombine(ISD::OR);
  1392. setTargetDAGCombine(ISD::XOR);
  1393. if (Subtarget->hasMVEIntegerOps())
  1394. setTargetDAGCombine(ISD::VSELECT);
  1395. if (Subtarget->hasV6Ops())
  1396. setTargetDAGCombine(ISD::SRL);
  1397. if (Subtarget->isThumb1Only())
  1398. setTargetDAGCombine(ISD::SHL);
  1399. setStackPointerRegisterToSaveRestore(ARM::SP);
  1400. if (Subtarget->useSoftFloat() || Subtarget->isThumb1Only() ||
  1401. !Subtarget->hasVFP2Base() || Subtarget->hasMinSize())
  1402. setSchedulingPreference(Sched::RegPressure);
  1403. else
  1404. setSchedulingPreference(Sched::Hybrid);
  1405. //// temporary - rewrite interface to use type
  1406. MaxStoresPerMemset = 8;
  1407. MaxStoresPerMemsetOptSize = 4;
  1408. MaxStoresPerMemcpy = 4; // For @llvm.memcpy -> sequence of stores
  1409. MaxStoresPerMemcpyOptSize = 2;
  1410. MaxStoresPerMemmove = 4; // For @llvm.memmove -> sequence of stores
  1411. MaxStoresPerMemmoveOptSize = 2;
1412. // On ARM, arguments smaller than 4 bytes are extended, so all arguments
1413. // are at least 4-byte aligned.
  1414. setMinStackArgumentAlignment(Align(4));
  1415. // Prefer likely predicted branches to selects on out-of-order cores.
  1416. PredictableSelectIsExpensive = Subtarget->getSchedModel().isOutOfOrder();
  1417. setPrefLoopAlignment(Align(1ULL << Subtarget->getPrefLoopLogAlignment()));
  1418. setMinFunctionAlignment(Subtarget->isThumb() ? Align(2) : Align(4));
  1419. if (Subtarget->isThumb() || Subtarget->isThumb2())
  1420. setTargetDAGCombine(ISD::ABS);
  1421. }
  1422. bool ARMTargetLowering::useSoftFloat() const {
  1423. return Subtarget->useSoftFloat();
  1424. }
  1425. // FIXME: It might make sense to define the representative register class as the
  1426. // nearest super-register that has a non-null superset. For example, DPR_VFP2 is
1427. // a super-register of SPR, and DPR is a superset of DPR_VFP2. Consequently,
  1428. // SPR's representative would be DPR_VFP2. This should work well if register
  1429. // pressure tracking were modified such that a register use would increment the
1430. // pressure of the register class's representative and all of its super
  1431. // classes' representatives transitively. We have not implemented this because
  1432. // of the difficulty prior to coalescing of modeling operand register classes
  1433. // due to the common occurrence of cross class copies and subregister insertions
  1434. // and extractions.
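// Illustrative sketch of the scheme described above (not what is implemented
// today): with DPR_VFP2 as SPR's representative, a use of, say, S1 would bump
// the pressure of DPR_VFP2 and, transitively, of DPR and its super classes
// rather than of SPR alone. The switch below instead hand-picks a
// representative class and a cost per value type.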
  1435. std::pair<const TargetRegisterClass *, uint8_t>
  1436. ARMTargetLowering::findRepresentativeClass(const TargetRegisterInfo *TRI,
  1437. MVT VT) const {
  1438. const TargetRegisterClass *RRC = nullptr;
  1439. uint8_t Cost = 1;
  1440. switch (VT.SimpleTy) {
  1441. default:
  1442. return TargetLowering::findRepresentativeClass(TRI, VT);
  1443. // Use DPR as representative register class for all floating point
1444. // and vector types. Since there are 32 SPR registers and 32 DPR registers,
  1445. // the cost is 1 for both f32 and f64.
  1446. case MVT::f32: case MVT::f64: case MVT::v8i8: case MVT::v4i16:
  1447. case MVT::v2i32: case MVT::v1i64: case MVT::v2f32:
  1448. RRC = &ARM::DPRRegClass;
  1449. // When NEON is used for SP, only half of the register file is available
  1450. // because operations that define both SP and DP results will be constrained
  1451. // to the VFP2 class (D0-D15). We currently model this constraint prior to
  1452. // coalescing by double-counting the SP regs. See the FIXME above.
  1453. if (Subtarget->useNEONForSinglePrecisionFP())
  1454. Cost = 2;
  1455. break;
  1456. case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64:
  1457. case MVT::v4f32: case MVT::v2f64:
  1458. RRC = &ARM::DPRRegClass;
  1459. Cost = 2;
  1460. break;
  1461. case MVT::v4i64:
  1462. RRC = &ARM::DPRRegClass;
  1463. Cost = 4;
  1464. break;
  1465. case MVT::v8i64:
  1466. RRC = &ARM::DPRRegClass;
  1467. Cost = 8;
  1468. break;
  1469. }
  1470. return std::make_pair(RRC, Cost);
  1471. }
  1472. const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const {
  1473. #define MAKE_CASE(V) \
  1474. case V: \
  1475. return #V;
  1476. switch ((ARMISD::NodeType)Opcode) {
  1477. case ARMISD::FIRST_NUMBER:
  1478. break;
  1479. MAKE_CASE(ARMISD::Wrapper)
  1480. MAKE_CASE(ARMISD::WrapperPIC)
  1481. MAKE_CASE(ARMISD::WrapperJT)
  1482. MAKE_CASE(ARMISD::COPY_STRUCT_BYVAL)
  1483. MAKE_CASE(ARMISD::CALL)
  1484. MAKE_CASE(ARMISD::CALL_PRED)
  1485. MAKE_CASE(ARMISD::CALL_NOLINK)
  1486. MAKE_CASE(ARMISD::tSECALL)
  1487. MAKE_CASE(ARMISD::t2CALL_BTI)
  1488. MAKE_CASE(ARMISD::BRCOND)
  1489. MAKE_CASE(ARMISD::BR_JT)
  1490. MAKE_CASE(ARMISD::BR2_JT)
  1491. MAKE_CASE(ARMISD::RET_FLAG)
  1492. MAKE_CASE(ARMISD::SERET_FLAG)
  1493. MAKE_CASE(ARMISD::INTRET_FLAG)
  1494. MAKE_CASE(ARMISD::PIC_ADD)
  1495. MAKE_CASE(ARMISD::CMP)
  1496. MAKE_CASE(ARMISD::CMN)
  1497. MAKE_CASE(ARMISD::CMPZ)
  1498. MAKE_CASE(ARMISD::CMPFP)
  1499. MAKE_CASE(ARMISD::CMPFPE)
  1500. MAKE_CASE(ARMISD::CMPFPw0)
  1501. MAKE_CASE(ARMISD::CMPFPEw0)
  1502. MAKE_CASE(ARMISD::BCC_i64)
  1503. MAKE_CASE(ARMISD::FMSTAT)
  1504. MAKE_CASE(ARMISD::CMOV)
  1505. MAKE_CASE(ARMISD::SUBS)
  1506. MAKE_CASE(ARMISD::SSAT)
  1507. MAKE_CASE(ARMISD::USAT)
  1508. MAKE_CASE(ARMISD::ASRL)
  1509. MAKE_CASE(ARMISD::LSRL)
  1510. MAKE_CASE(ARMISD::LSLL)
  1511. MAKE_CASE(ARMISD::SRL_FLAG)
  1512. MAKE_CASE(ARMISD::SRA_FLAG)
  1513. MAKE_CASE(ARMISD::RRX)
  1514. MAKE_CASE(ARMISD::ADDC)
  1515. MAKE_CASE(ARMISD::ADDE)
  1516. MAKE_CASE(ARMISD::SUBC)
  1517. MAKE_CASE(ARMISD::SUBE)
  1518. MAKE_CASE(ARMISD::LSLS)
  1519. MAKE_CASE(ARMISD::VMOVRRD)
  1520. MAKE_CASE(ARMISD::VMOVDRR)
  1521. MAKE_CASE(ARMISD::VMOVhr)
  1522. MAKE_CASE(ARMISD::VMOVrh)
  1523. MAKE_CASE(ARMISD::VMOVSR)
  1524. MAKE_CASE(ARMISD::EH_SJLJ_SETJMP)
  1525. MAKE_CASE(ARMISD::EH_SJLJ_LONGJMP)
  1526. MAKE_CASE(ARMISD::EH_SJLJ_SETUP_DISPATCH)
  1527. MAKE_CASE(ARMISD::TC_RETURN)
  1528. MAKE_CASE(ARMISD::THREAD_POINTER)
  1529. MAKE_CASE(ARMISD::DYN_ALLOC)
  1530. MAKE_CASE(ARMISD::MEMBARRIER_MCR)
  1531. MAKE_CASE(ARMISD::PRELOAD)
  1532. MAKE_CASE(ARMISD::LDRD)
  1533. MAKE_CASE(ARMISD::STRD)
  1534. MAKE_CASE(ARMISD::WIN__CHKSTK)
  1535. MAKE_CASE(ARMISD::WIN__DBZCHK)
  1536. MAKE_CASE(ARMISD::PREDICATE_CAST)
  1537. MAKE_CASE(ARMISD::VECTOR_REG_CAST)
  1538. MAKE_CASE(ARMISD::MVESEXT)
  1539. MAKE_CASE(ARMISD::MVEZEXT)
  1540. MAKE_CASE(ARMISD::MVETRUNC)
  1541. MAKE_CASE(ARMISD::VCMP)
  1542. MAKE_CASE(ARMISD::VCMPZ)
  1543. MAKE_CASE(ARMISD::VTST)
  1544. MAKE_CASE(ARMISD::VSHLs)
  1545. MAKE_CASE(ARMISD::VSHLu)
  1546. MAKE_CASE(ARMISD::VSHLIMM)
  1547. MAKE_CASE(ARMISD::VSHRsIMM)
  1548. MAKE_CASE(ARMISD::VSHRuIMM)
  1549. MAKE_CASE(ARMISD::VRSHRsIMM)
  1550. MAKE_CASE(ARMISD::VRSHRuIMM)
  1551. MAKE_CASE(ARMISD::VRSHRNIMM)
  1552. MAKE_CASE(ARMISD::VQSHLsIMM)
  1553. MAKE_CASE(ARMISD::VQSHLuIMM)
  1554. MAKE_CASE(ARMISD::VQSHLsuIMM)
  1555. MAKE_CASE(ARMISD::VQSHRNsIMM)
  1556. MAKE_CASE(ARMISD::VQSHRNuIMM)
  1557. MAKE_CASE(ARMISD::VQSHRNsuIMM)
  1558. MAKE_CASE(ARMISD::VQRSHRNsIMM)
  1559. MAKE_CASE(ARMISD::VQRSHRNuIMM)
  1560. MAKE_CASE(ARMISD::VQRSHRNsuIMM)
  1561. MAKE_CASE(ARMISD::VSLIIMM)
  1562. MAKE_CASE(ARMISD::VSRIIMM)
  1563. MAKE_CASE(ARMISD::VGETLANEu)
  1564. MAKE_CASE(ARMISD::VGETLANEs)
  1565. MAKE_CASE(ARMISD::VMOVIMM)
  1566. MAKE_CASE(ARMISD::VMVNIMM)
  1567. MAKE_CASE(ARMISD::VMOVFPIMM)
  1568. MAKE_CASE(ARMISD::VDUP)
  1569. MAKE_CASE(ARMISD::VDUPLANE)
  1570. MAKE_CASE(ARMISD::VEXT)
  1571. MAKE_CASE(ARMISD::VREV64)
  1572. MAKE_CASE(ARMISD::VREV32)
  1573. MAKE_CASE(ARMISD::VREV16)
  1574. MAKE_CASE(ARMISD::VZIP)
  1575. MAKE_CASE(ARMISD::VUZP)
  1576. MAKE_CASE(ARMISD::VTRN)
  1577. MAKE_CASE(ARMISD::VTBL1)
  1578. MAKE_CASE(ARMISD::VTBL2)
  1579. MAKE_CASE(ARMISD::VMOVN)
  1580. MAKE_CASE(ARMISD::VQMOVNs)
  1581. MAKE_CASE(ARMISD::VQMOVNu)
  1582. MAKE_CASE(ARMISD::VCVTN)
  1583. MAKE_CASE(ARMISD::VCVTL)
  1584. MAKE_CASE(ARMISD::VIDUP)
  1585. MAKE_CASE(ARMISD::VMULLs)
  1586. MAKE_CASE(ARMISD::VMULLu)
  1587. MAKE_CASE(ARMISD::VQDMULH)
  1588. MAKE_CASE(ARMISD::VADDVs)
  1589. MAKE_CASE(ARMISD::VADDVu)
  1590. MAKE_CASE(ARMISD::VADDVps)
  1591. MAKE_CASE(ARMISD::VADDVpu)
  1592. MAKE_CASE(ARMISD::VADDLVs)
  1593. MAKE_CASE(ARMISD::VADDLVu)
  1594. MAKE_CASE(ARMISD::VADDLVAs)
  1595. MAKE_CASE(ARMISD::VADDLVAu)
  1596. MAKE_CASE(ARMISD::VADDLVps)
  1597. MAKE_CASE(ARMISD::VADDLVpu)
  1598. MAKE_CASE(ARMISD::VADDLVAps)
  1599. MAKE_CASE(ARMISD::VADDLVApu)
  1600. MAKE_CASE(ARMISD::VMLAVs)
  1601. MAKE_CASE(ARMISD::VMLAVu)
  1602. MAKE_CASE(ARMISD::VMLAVps)
  1603. MAKE_CASE(ARMISD::VMLAVpu)
  1604. MAKE_CASE(ARMISD::VMLALVs)
  1605. MAKE_CASE(ARMISD::VMLALVu)
  1606. MAKE_CASE(ARMISD::VMLALVps)
  1607. MAKE_CASE(ARMISD::VMLALVpu)
  1608. MAKE_CASE(ARMISD::VMLALVAs)
  1609. MAKE_CASE(ARMISD::VMLALVAu)
  1610. MAKE_CASE(ARMISD::VMLALVAps)
  1611. MAKE_CASE(ARMISD::VMLALVApu)
  1612. MAKE_CASE(ARMISD::VMINVu)
  1613. MAKE_CASE(ARMISD::VMINVs)
  1614. MAKE_CASE(ARMISD::VMAXVu)
  1615. MAKE_CASE(ARMISD::VMAXVs)
  1616. MAKE_CASE(ARMISD::UMAAL)
  1617. MAKE_CASE(ARMISD::UMLAL)
  1618. MAKE_CASE(ARMISD::SMLAL)
  1619. MAKE_CASE(ARMISD::SMLALBB)
  1620. MAKE_CASE(ARMISD::SMLALBT)
  1621. MAKE_CASE(ARMISD::SMLALTB)
  1622. MAKE_CASE(ARMISD::SMLALTT)
  1623. MAKE_CASE(ARMISD::SMULWB)
  1624. MAKE_CASE(ARMISD::SMULWT)
  1625. MAKE_CASE(ARMISD::SMLALD)
  1626. MAKE_CASE(ARMISD::SMLALDX)
  1627. MAKE_CASE(ARMISD::SMLSLD)
  1628. MAKE_CASE(ARMISD::SMLSLDX)
  1629. MAKE_CASE(ARMISD::SMMLAR)
  1630. MAKE_CASE(ARMISD::SMMLSR)
  1631. MAKE_CASE(ARMISD::QADD16b)
  1632. MAKE_CASE(ARMISD::QSUB16b)
  1633. MAKE_CASE(ARMISD::QADD8b)
  1634. MAKE_CASE(ARMISD::QSUB8b)
  1635. MAKE_CASE(ARMISD::UQADD16b)
  1636. MAKE_CASE(ARMISD::UQSUB16b)
  1637. MAKE_CASE(ARMISD::UQADD8b)
  1638. MAKE_CASE(ARMISD::UQSUB8b)
  1639. MAKE_CASE(ARMISD::BUILD_VECTOR)
  1640. MAKE_CASE(ARMISD::BFI)
  1641. MAKE_CASE(ARMISD::VORRIMM)
  1642. MAKE_CASE(ARMISD::VBICIMM)
  1643. MAKE_CASE(ARMISD::VBSP)
  1644. MAKE_CASE(ARMISD::MEMCPY)
  1645. MAKE_CASE(ARMISD::VLD1DUP)
  1646. MAKE_CASE(ARMISD::VLD2DUP)
  1647. MAKE_CASE(ARMISD::VLD3DUP)
  1648. MAKE_CASE(ARMISD::VLD4DUP)
  1649. MAKE_CASE(ARMISD::VLD1_UPD)
  1650. MAKE_CASE(ARMISD::VLD2_UPD)
  1651. MAKE_CASE(ARMISD::VLD3_UPD)
  1652. MAKE_CASE(ARMISD::VLD4_UPD)
  1653. MAKE_CASE(ARMISD::VLD1x2_UPD)
  1654. MAKE_CASE(ARMISD::VLD1x3_UPD)
  1655. MAKE_CASE(ARMISD::VLD1x4_UPD)
  1656. MAKE_CASE(ARMISD::VLD2LN_UPD)
  1657. MAKE_CASE(ARMISD::VLD3LN_UPD)
  1658. MAKE_CASE(ARMISD::VLD4LN_UPD)
  1659. MAKE_CASE(ARMISD::VLD1DUP_UPD)
  1660. MAKE_CASE(ARMISD::VLD2DUP_UPD)
  1661. MAKE_CASE(ARMISD::VLD3DUP_UPD)
  1662. MAKE_CASE(ARMISD::VLD4DUP_UPD)
  1663. MAKE_CASE(ARMISD::VST1_UPD)
  1664. MAKE_CASE(ARMISD::VST2_UPD)
  1665. MAKE_CASE(ARMISD::VST3_UPD)
  1666. MAKE_CASE(ARMISD::VST4_UPD)
  1667. MAKE_CASE(ARMISD::VST1x2_UPD)
  1668. MAKE_CASE(ARMISD::VST1x3_UPD)
  1669. MAKE_CASE(ARMISD::VST1x4_UPD)
  1670. MAKE_CASE(ARMISD::VST2LN_UPD)
  1671. MAKE_CASE(ARMISD::VST3LN_UPD)
  1672. MAKE_CASE(ARMISD::VST4LN_UPD)
  1673. MAKE_CASE(ARMISD::WLS)
  1674. MAKE_CASE(ARMISD::WLSSETUP)
  1675. MAKE_CASE(ARMISD::LE)
  1676. MAKE_CASE(ARMISD::LOOP_DEC)
  1677. MAKE_CASE(ARMISD::CSINV)
  1678. MAKE_CASE(ARMISD::CSNEG)
  1679. MAKE_CASE(ARMISD::CSINC)
  1680. MAKE_CASE(ARMISD::MEMCPYLOOP)
  1681. MAKE_CASE(ARMISD::MEMSETLOOP)
  1682. #undef MAKE_CASE
  1683. }
  1684. return nullptr;
  1685. }
  1686. EVT ARMTargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &,
  1687. EVT VT) const {
  1688. if (!VT.isVector())
  1689. return getPointerTy(DL);
  1690. // MVE has a predicate register.
  1691. if ((Subtarget->hasMVEIntegerOps() &&
  1692. (VT == MVT::v2i64 || VT == MVT::v4i32 || VT == MVT::v8i16 ||
  1693. VT == MVT::v16i8)) ||
  1694. (Subtarget->hasMVEFloatOps() &&
  1695. (VT == MVT::v2f64 || VT == MVT::v4f32 || VT == MVT::v8f16)))
  1696. return MVT::getVectorVT(MVT::i1, VT.getVectorElementCount());
  1697. return VT.changeVectorElementTypeToInteger();
  1698. }
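// For illustration: without MVE, a setcc on v4i32 yields another v4i32
// (an element-wise all-ones/all-zeros integer mask); with MVE integer ops it
// yields a v4i1 carried in the predicate register, and a scalar compare simply
// uses the pointer width, i.e. i32.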
  1699. /// getRegClassFor - Return the register class that should be used for the
  1700. /// specified value type.
  1701. const TargetRegisterClass *
  1702. ARMTargetLowering::getRegClassFor(MVT VT, bool isDivergent) const {
  1703. (void)isDivergent;
  1704. // Map v4i64 to QQ registers but do not make the type legal. Similarly map
  1705. // v8i64 to QQQQ registers. v4i64 and v8i64 are only used for REG_SEQUENCE to
  1706. // load / store 4 to 8 consecutive NEON D registers, or 2 to 4 consecutive
  1707. // MVE Q registers.
  1708. if (Subtarget->hasNEON()) {
  1709. if (VT == MVT::v4i64)
  1710. return &ARM::QQPRRegClass;
  1711. if (VT == MVT::v8i64)
  1712. return &ARM::QQQQPRRegClass;
  1713. }
  1714. if (Subtarget->hasMVEIntegerOps()) {
  1715. if (VT == MVT::v4i64)
  1716. return &ARM::MQQPRRegClass;
  1717. if (VT == MVT::v8i64)
  1718. return &ARM::MQQQQPRRegClass;
  1719. }
  1720. return TargetLowering::getRegClassFor(VT);
  1721. }
1722. // memcpy, and other memory intrinsics, typically try to use LDM/STM if the
  1723. // source/dest is aligned and the copy size is large enough. We therefore want
  1724. // to align such objects passed to memory intrinsics.
  1725. bool ARMTargetLowering::shouldAlignPointerArgs(CallInst *CI, unsigned &MinSize,
  1726. unsigned &PrefAlign) const {
  1727. if (!isa<MemIntrinsic>(CI))
  1728. return false;
  1729. MinSize = 8;
  1730. // On ARM11 onwards (excluding M class) 8-byte aligned LDM is typically 1
  1731. // cycle faster than 4-byte aligned LDM.
  1732. PrefAlign = (Subtarget->hasV6Ops() && !Subtarget->isMClass() ? 8 : 4);
  1733. return true;
  1734. }
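// Rough example of the effect above (assuming the usual CodeGenPrepare
// behaviour for shouldAlignPointerArgs): a memcpy between local allocas of at
// least MinSize (8) bytes may have those allocas realigned to PrefAlign --
// 8 bytes on v6+ non-M-class cores, 4 elsewhere -- so the expansion can use
// the faster aligned LDM/STM forms.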
  1735. // Create a fast isel object.
  1736. FastISel *
  1737. ARMTargetLowering::createFastISel(FunctionLoweringInfo &funcInfo,
  1738. const TargetLibraryInfo *libInfo) const {
  1739. return ARM::createFastISel(funcInfo, libInfo);
  1740. }
  1741. Sched::Preference ARMTargetLowering::getSchedulingPreference(SDNode *N) const {
  1742. unsigned NumVals = N->getNumValues();
  1743. if (!NumVals)
  1744. return Sched::RegPressure;
  1745. for (unsigned i = 0; i != NumVals; ++i) {
  1746. EVT VT = N->getValueType(i);
  1747. if (VT == MVT::Glue || VT == MVT::Other)
  1748. continue;
  1749. if (VT.isFloatingPoint() || VT.isVector())
  1750. return Sched::ILP;
  1751. }
  1752. if (!N->isMachineOpcode())
  1753. return Sched::RegPressure;
1754. // Loads are scheduled for latency even if the instruction itinerary
  1755. // is not available.
  1756. const TargetInstrInfo *TII = Subtarget->getInstrInfo();
  1757. const MCInstrDesc &MCID = TII->get(N->getMachineOpcode());
  1758. if (MCID.getNumDefs() == 0)
  1759. return Sched::RegPressure;
  1760. if (!Itins->isEmpty() &&
  1761. Itins->getOperandCycle(MCID.getSchedClass(), 0) > 2)
  1762. return Sched::ILP;
  1763. return Sched::RegPressure;
  1764. }
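// Sketch of the policy above: any node producing a floating-point or vector
// value is scheduled for ILP; an ordinary integer machine node falls back to
// register pressure unless the itineraries say its first result takes more
// than two cycles, in which case latency (ILP) wins again.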
  1765. //===----------------------------------------------------------------------===//
  1766. // Lowering Code
  1767. //===----------------------------------------------------------------------===//
  1768. static bool isSRL16(const SDValue &Op) {
  1769. if (Op.getOpcode() != ISD::SRL)
  1770. return false;
  1771. if (auto Const = dyn_cast<ConstantSDNode>(Op.getOperand(1)))
  1772. return Const->getZExtValue() == 16;
  1773. return false;
  1774. }
  1775. static bool isSRA16(const SDValue &Op) {
  1776. if (Op.getOpcode() != ISD::SRA)
  1777. return false;
  1778. if (auto Const = dyn_cast<ConstantSDNode>(Op.getOperand(1)))
  1779. return Const->getZExtValue() == 16;
  1780. return false;
  1781. }
  1782. static bool isSHL16(const SDValue &Op) {
  1783. if (Op.getOpcode() != ISD::SHL)
  1784. return false;
  1785. if (auto Const = dyn_cast<ConstantSDNode>(Op.getOperand(1)))
  1786. return Const->getZExtValue() == 16;
  1787. return false;
  1788. }
  1789. // Check for a signed 16-bit value. We special case SRA because it makes it
1790. // simpler when also looking for SRAs that aren't sign extending a
  1791. // smaller value. Without the check, we'd need to take extra care with
  1792. // checking order for some operations.
  1793. static bool isS16(const SDValue &Op, SelectionDAG &DAG) {
  1794. if (isSRA16(Op))
  1795. return isSHL16(Op.getOperand(0));
  1796. return DAG.ComputeNumSignBits(Op) == 17;
  1797. }
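// Worked example for isS16: (sra (shl X, 16), 16) is accepted directly via the
// isSRA16/isSHL16 pair. Otherwise ComputeNumSignBits(Op) == 17 requires the
// top 17 bits to all be copies of the sign bit, i.e. the value is a
// sign-extension of a genuine 16-bit quantity: a constant such as 0x7FFF
// qualifies, while 0x8000 (only 16 sign bits) does not.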
  1798. /// IntCCToARMCC - Convert a DAG integer condition code to an ARM CC
  1799. static ARMCC::CondCodes IntCCToARMCC(ISD::CondCode CC) {
  1800. switch (CC) {
  1801. default: llvm_unreachable("Unknown condition code!");
  1802. case ISD::SETNE: return ARMCC::NE;
  1803. case ISD::SETEQ: return ARMCC::EQ;
  1804. case ISD::SETGT: return ARMCC::GT;
  1805. case ISD::SETGE: return ARMCC::GE;
  1806. case ISD::SETLT: return ARMCC::LT;
  1807. case ISD::SETLE: return ARMCC::LE;
  1808. case ISD::SETUGT: return ARMCC::HI;
  1809. case ISD::SETUGE: return ARMCC::HS;
  1810. case ISD::SETULT: return ARMCC::LO;
  1811. case ISD::SETULE: return ARMCC::LS;
  1812. }
  1813. }
  1814. /// FPCCToARMCC - Convert a DAG fp condition code to an ARM CC.
  1815. static void FPCCToARMCC(ISD::CondCode CC, ARMCC::CondCodes &CondCode,
  1816. ARMCC::CondCodes &CondCode2) {
  1817. CondCode2 = ARMCC::AL;
  1818. switch (CC) {
  1819. default: llvm_unreachable("Unknown FP condition!");
  1820. case ISD::SETEQ:
  1821. case ISD::SETOEQ: CondCode = ARMCC::EQ; break;
  1822. case ISD::SETGT:
  1823. case ISD::SETOGT: CondCode = ARMCC::GT; break;
  1824. case ISD::SETGE:
  1825. case ISD::SETOGE: CondCode = ARMCC::GE; break;
  1826. case ISD::SETOLT: CondCode = ARMCC::MI; break;
  1827. case ISD::SETOLE: CondCode = ARMCC::LS; break;
  1828. case ISD::SETONE: CondCode = ARMCC::MI; CondCode2 = ARMCC::GT; break;
  1829. case ISD::SETO: CondCode = ARMCC::VC; break;
  1830. case ISD::SETUO: CondCode = ARMCC::VS; break;
  1831. case ISD::SETUEQ: CondCode = ARMCC::EQ; CondCode2 = ARMCC::VS; break;
  1832. case ISD::SETUGT: CondCode = ARMCC::HI; break;
  1833. case ISD::SETUGE: CondCode = ARMCC::PL; break;
  1834. case ISD::SETLT:
  1835. case ISD::SETULT: CondCode = ARMCC::LT; break;
  1836. case ISD::SETLE:
  1837. case ISD::SETULE: CondCode = ARMCC::LE; break;
  1838. case ISD::SETNE:
  1839. case ISD::SETUNE: CondCode = ARMCC::NE; break;
  1840. }
  1841. }
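// Note on the two-condition cases above: SETONE, for instance, becomes MI with
// a secondary GT, and SETUEQ becomes EQ with a secondary VS. When CondCode2 is
// not AL, the users of this helper are expected to emit a second predicated
// operation so that either condition passing satisfies the original compare.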
  1842. //===----------------------------------------------------------------------===//
  1843. // Calling Convention Implementation
  1844. //===----------------------------------------------------------------------===//
  1845. /// getEffectiveCallingConv - Get the effective calling convention, taking into
  1846. /// account presence of floating point hardware and calling convention
  1847. /// limitations, such as support for variadic functions.
  1848. CallingConv::ID
  1849. ARMTargetLowering::getEffectiveCallingConv(CallingConv::ID CC,
  1850. bool isVarArg) const {
  1851. switch (CC) {
  1852. default:
  1853. report_fatal_error("Unsupported calling convention");
  1854. case CallingConv::ARM_AAPCS:
  1855. case CallingConv::ARM_APCS:
  1856. case CallingConv::GHC:
  1857. case CallingConv::CFGuard_Check:
  1858. return CC;
  1859. case CallingConv::PreserveMost:
  1860. return CallingConv::PreserveMost;
  1861. case CallingConv::ARM_AAPCS_VFP:
  1862. case CallingConv::Swift:
  1863. case CallingConv::SwiftTail:
  1864. return isVarArg ? CallingConv::ARM_AAPCS : CallingConv::ARM_AAPCS_VFP;
  1865. case CallingConv::C:
  1866. case CallingConv::Tail:
  1867. if (!Subtarget->isAAPCS_ABI())
  1868. return CallingConv::ARM_APCS;
  1869. else if (Subtarget->hasVFP2Base() && !Subtarget->isThumb1Only() &&
  1870. getTargetMachine().Options.FloatABIType == FloatABI::Hard &&
  1871. !isVarArg)
  1872. return CallingConv::ARM_AAPCS_VFP;
  1873. else
  1874. return CallingConv::ARM_AAPCS;
  1875. case CallingConv::Fast:
  1876. case CallingConv::CXX_FAST_TLS:
  1877. if (!Subtarget->isAAPCS_ABI()) {
  1878. if (Subtarget->hasVFP2Base() && !Subtarget->isThumb1Only() && !isVarArg)
  1879. return CallingConv::Fast;
  1880. return CallingConv::ARM_APCS;
  1881. } else if (Subtarget->hasVFP2Base() &&
  1882. !Subtarget->isThumb1Only() && !isVarArg)
  1883. return CallingConv::ARM_AAPCS_VFP;
  1884. else
  1885. return CallingConv::ARM_AAPCS;
  1886. }
  1887. }
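// Example of the mapping above: plain C code on an AAPCS target with a VFP2
// base, hard-float ABI and a non-variadic callee resolves to ARM_AAPCS_VFP
// (FP arguments in s/d registers); the same call made variadic drops back to
// ARM_AAPCS, and a pre-AAPCS (APCS) target always gets ARM_APCS.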
  1888. CCAssignFn *ARMTargetLowering::CCAssignFnForCall(CallingConv::ID CC,
  1889. bool isVarArg) const {
  1890. return CCAssignFnForNode(CC, false, isVarArg);
  1891. }
  1892. CCAssignFn *ARMTargetLowering::CCAssignFnForReturn(CallingConv::ID CC,
  1893. bool isVarArg) const {
  1894. return CCAssignFnForNode(CC, true, isVarArg);
  1895. }
  1896. /// CCAssignFnForNode - Selects the correct CCAssignFn for the given
  1897. /// CallingConvention.
  1898. CCAssignFn *ARMTargetLowering::CCAssignFnForNode(CallingConv::ID CC,
  1899. bool Return,
  1900. bool isVarArg) const {
  1901. switch (getEffectiveCallingConv(CC, isVarArg)) {
  1902. default:
  1903. report_fatal_error("Unsupported calling convention");
  1904. case CallingConv::ARM_APCS:
  1905. return (Return ? RetCC_ARM_APCS : CC_ARM_APCS);
  1906. case CallingConv::ARM_AAPCS:
  1907. return (Return ? RetCC_ARM_AAPCS : CC_ARM_AAPCS);
  1908. case CallingConv::ARM_AAPCS_VFP:
  1909. return (Return ? RetCC_ARM_AAPCS_VFP : CC_ARM_AAPCS_VFP);
  1910. case CallingConv::Fast:
  1911. return (Return ? RetFastCC_ARM_APCS : FastCC_ARM_APCS);
  1912. case CallingConv::GHC:
  1913. return (Return ? RetCC_ARM_APCS : CC_ARM_APCS_GHC);
  1914. case CallingConv::PreserveMost:
  1915. return (Return ? RetCC_ARM_AAPCS : CC_ARM_AAPCS);
  1916. case CallingConv::CFGuard_Check:
  1917. return (Return ? RetCC_ARM_AAPCS : CC_ARM_Win32_CFGuard_Check);
  1918. }
  1919. }
  1920. SDValue ARMTargetLowering::MoveToHPR(const SDLoc &dl, SelectionDAG &DAG,
  1921. MVT LocVT, MVT ValVT, SDValue Val) const {
  1922. Val = DAG.getNode(ISD::BITCAST, dl, MVT::getIntegerVT(LocVT.getSizeInBits()),
  1923. Val);
  1924. if (Subtarget->hasFullFP16()) {
  1925. Val = DAG.getNode(ARMISD::VMOVhr, dl, ValVT, Val);
  1926. } else {
  1927. Val = DAG.getNode(ISD::TRUNCATE, dl,
  1928. MVT::getIntegerVT(ValVT.getSizeInBits()), Val);
  1929. Val = DAG.getNode(ISD::BITCAST, dl, ValVT, Val);
  1930. }
  1931. return Val;
  1932. }
  1933. SDValue ARMTargetLowering::MoveFromHPR(const SDLoc &dl, SelectionDAG &DAG,
  1934. MVT LocVT, MVT ValVT,
  1935. SDValue Val) const {
  1936. if (Subtarget->hasFullFP16()) {
  1937. Val = DAG.getNode(ARMISD::VMOVrh, dl,
  1938. MVT::getIntegerVT(LocVT.getSizeInBits()), Val);
  1939. } else {
  1940. Val = DAG.getNode(ISD::BITCAST, dl,
  1941. MVT::getIntegerVT(ValVT.getSizeInBits()), Val);
  1942. Val = DAG.getNode(ISD::ZERO_EXTEND, dl,
  1943. MVT::getIntegerVT(LocVT.getSizeInBits()), Val);
  1944. }
  1945. return DAG.getNode(ISD::BITCAST, dl, LocVT, Val);
  1946. }
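// Sketch of the half-precision moves above: with full fp16 support,
// MoveFromHPR uses VMOVrh to place the raw f16 bits in the low half of an
// i32-sized value (then bitcasts to the 32-bit location type), and MoveToHPR
// reverses that with VMOVhr. Without full fp16 the same effect is modelled as
// a bitcast to i16 plus zero-extend (resp. truncate plus bitcast), matching
// the "f16 in the LSBs of a 32-bit register" rule referenced below.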
  1947. /// LowerCallResult - Lower the result values of a call into the
  1948. /// appropriate copies out of appropriate physical registers.
  1949. SDValue ARMTargetLowering::LowerCallResult(
  1950. SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool isVarArg,
  1951. const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
  1952. SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, bool isThisReturn,
  1953. SDValue ThisVal) const {
  1954. // Assign locations to each value returned by this call.
  1955. SmallVector<CCValAssign, 16> RVLocs;
  1956. CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
  1957. *DAG.getContext());
  1958. CCInfo.AnalyzeCallResult(Ins, CCAssignFnForReturn(CallConv, isVarArg));
  1959. // Copy all of the result registers out of their specified physreg.
  1960. for (unsigned i = 0; i != RVLocs.size(); ++i) {
  1961. CCValAssign VA = RVLocs[i];
  1962. // Pass 'this' value directly from the argument to return value, to avoid
  1963. // reg unit interference
  1964. if (i == 0 && isThisReturn) {
  1965. assert(!VA.needsCustom() && VA.getLocVT() == MVT::i32 &&
  1966. "unexpected return calling convention register assignment");
  1967. InVals.push_back(ThisVal);
  1968. continue;
  1969. }
  1970. SDValue Val;
  1971. if (VA.needsCustom() &&
  1972. (VA.getLocVT() == MVT::f64 || VA.getLocVT() == MVT::v2f64)) {
  1973. // Handle f64 or half of a v2f64.
  1974. SDValue Lo = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32,
  1975. InFlag);
  1976. Chain = Lo.getValue(1);
  1977. InFlag = Lo.getValue(2);
  1978. VA = RVLocs[++i]; // skip ahead to next loc
  1979. SDValue Hi = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32,
  1980. InFlag);
  1981. Chain = Hi.getValue(1);
  1982. InFlag = Hi.getValue(2);
  1983. if (!Subtarget->isLittle())
  1984. std::swap (Lo, Hi);
  1985. Val = DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi);
  1986. if (VA.getLocVT() == MVT::v2f64) {
  1987. SDValue Vec = DAG.getNode(ISD::UNDEF, dl, MVT::v2f64);
  1988. Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Vec, Val,
  1989. DAG.getConstant(0, dl, MVT::i32));
  1990. VA = RVLocs[++i]; // skip ahead to next loc
  1991. Lo = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32, InFlag);
  1992. Chain = Lo.getValue(1);
  1993. InFlag = Lo.getValue(2);
  1994. VA = RVLocs[++i]; // skip ahead to next loc
  1995. Hi = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32, InFlag);
  1996. Chain = Hi.getValue(1);
  1997. InFlag = Hi.getValue(2);
  1998. if (!Subtarget->isLittle())
  1999. std::swap (Lo, Hi);
  2000. Val = DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi);
  2001. Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Vec, Val,
  2002. DAG.getConstant(1, dl, MVT::i32));
  2003. }
  2004. } else {
  2005. Val = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), VA.getLocVT(),
  2006. InFlag);
  2007. Chain = Val.getValue(1);
  2008. InFlag = Val.getValue(2);
  2009. }
  2010. switch (VA.getLocInfo()) {
  2011. default: llvm_unreachable("Unknown loc info!");
  2012. case CCValAssign::Full: break;
  2013. case CCValAssign::BCvt:
  2014. Val = DAG.getNode(ISD::BITCAST, dl, VA.getValVT(), Val);
  2015. break;
  2016. }
  2017. // f16 arguments have their size extended to 4 bytes and passed as if they
  2018. // had been copied to the LSBs of a 32-bit register.
2019. // For that, they are passed extended to i32 (soft ABI) or to f32 (hard ABI)
  2020. if (VA.needsCustom() &&
  2021. (VA.getValVT() == MVT::f16 || VA.getValVT() == MVT::bf16))
  2022. Val = MoveToHPR(dl, DAG, VA.getLocVT(), VA.getValVT(), Val);
  2023. InVals.push_back(Val);
  2024. }
  2025. return Chain;
  2026. }
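// Example of the custom f64 handling above under the integer (soft-float)
// AAPCS: an f64 return value arrives as two i32 halves in consecutive GPRs
// (typically r0/r1, swapped on big-endian) and is reassembled with
// ARMISD::VMOVDRR; a v2f64 result is rebuilt from two such f64 pieces inserted
// into a v2f64.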
  2027. std::pair<SDValue, MachinePointerInfo> ARMTargetLowering::computeAddrForCallArg(
  2028. const SDLoc &dl, SelectionDAG &DAG, const CCValAssign &VA, SDValue StackPtr,
  2029. bool IsTailCall, int SPDiff) const {
  2030. SDValue DstAddr;
  2031. MachinePointerInfo DstInfo;
  2032. int32_t Offset = VA.getLocMemOffset();
  2033. MachineFunction &MF = DAG.getMachineFunction();
  2034. if (IsTailCall) {
  2035. Offset += SPDiff;
  2036. auto PtrVT = getPointerTy(DAG.getDataLayout());
  2037. int Size = VA.getLocVT().getFixedSizeInBits() / 8;
  2038. int FI = MF.getFrameInfo().CreateFixedObject(Size, Offset, true);
  2039. DstAddr = DAG.getFrameIndex(FI, PtrVT);
  2040. DstInfo =
  2041. MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI);
  2042. } else {
  2043. SDValue PtrOff = DAG.getIntPtrConstant(Offset, dl);
  2044. DstAddr = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
  2045. StackPtr, PtrOff);
  2046. DstInfo =
  2047. MachinePointerInfo::getStack(DAG.getMachineFunction(), Offset);
  2048. }
  2049. return std::make_pair(DstAddr, DstInfo);
  2050. }
  2051. void ARMTargetLowering::PassF64ArgInRegs(const SDLoc &dl, SelectionDAG &DAG,
  2052. SDValue Chain, SDValue &Arg,
  2053. RegsToPassVector &RegsToPass,
  2054. CCValAssign &VA, CCValAssign &NextVA,
  2055. SDValue &StackPtr,
  2056. SmallVectorImpl<SDValue> &MemOpChains,
  2057. bool IsTailCall,
  2058. int SPDiff) const {
  2059. SDValue fmrrd = DAG.getNode(ARMISD::VMOVRRD, dl,
  2060. DAG.getVTList(MVT::i32, MVT::i32), Arg);
  2061. unsigned id = Subtarget->isLittle() ? 0 : 1;
  2062. RegsToPass.push_back(std::make_pair(VA.getLocReg(), fmrrd.getValue(id)));
  2063. if (NextVA.isRegLoc())
  2064. RegsToPass.push_back(std::make_pair(NextVA.getLocReg(), fmrrd.getValue(1-id)));
  2065. else {
  2066. assert(NextVA.isMemLoc());
  2067. if (!StackPtr.getNode())
  2068. StackPtr = DAG.getCopyFromReg(Chain, dl, ARM::SP,
  2069. getPointerTy(DAG.getDataLayout()));
  2070. SDValue DstAddr;
  2071. MachinePointerInfo DstInfo;
  2072. std::tie(DstAddr, DstInfo) =
  2073. computeAddrForCallArg(dl, DAG, NextVA, StackPtr, IsTailCall, SPDiff);
  2074. MemOpChains.push_back(
  2075. DAG.getStore(Chain, dl, fmrrd.getValue(1 - id), DstAddr, DstInfo));
  2076. }
  2077. }
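// Illustration of PassF64ArgInRegs: VMOVRRD splits the f64 into two i32
// halves, the endian-dependent "id" picks which half goes into VA's register
// first, and if only one GPR was left (NextVA is a mem loc) the remaining half
// is stored to the outgoing-argument slot computed by computeAddrForCallArg.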
  2078. static bool canGuaranteeTCO(CallingConv::ID CC, bool GuaranteeTailCalls) {
  2079. return (CC == CallingConv::Fast && GuaranteeTailCalls) ||
  2080. CC == CallingConv::Tail || CC == CallingConv::SwiftTail;
  2081. }
  2082. /// LowerCall - Lowering a call into a callseq_start <-
2083. /// ARMISD::CALL <- callseq_end chain. Also add input and output parameter
  2084. /// nodes.
  2085. SDValue
  2086. ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
  2087. SmallVectorImpl<SDValue> &InVals) const {
  2088. SelectionDAG &DAG = CLI.DAG;
  2089. SDLoc &dl = CLI.DL;
  2090. SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
  2091. SmallVectorImpl<SDValue> &OutVals = CLI.OutVals;
  2092. SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins;
  2093. SDValue Chain = CLI.Chain;
  2094. SDValue Callee = CLI.Callee;
  2095. bool &isTailCall = CLI.IsTailCall;
  2096. CallingConv::ID CallConv = CLI.CallConv;
  2097. bool doesNotRet = CLI.DoesNotReturn;
  2098. bool isVarArg = CLI.IsVarArg;
  2099. MachineFunction &MF = DAG.getMachineFunction();
  2100. ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
  2101. MachineFunction::CallSiteInfo CSInfo;
  2102. bool isStructRet = (Outs.empty()) ? false : Outs[0].Flags.isSRet();
  2103. bool isThisReturn = false;
  2104. bool isCmseNSCall = false;
  2105. bool isSibCall = false;
  2106. bool PreferIndirect = false;
  2107. bool GuardWithBTI = false;
  2108. // Lower 'returns_twice' calls to a pseudo-instruction.
  2109. if (CLI.CB && CLI.CB->getAttributes().hasFnAttr(Attribute::ReturnsTwice) &&
  2110. !Subtarget->getNoBTIAtReturnTwice())
  2111. GuardWithBTI = AFI->branchTargetEnforcement();
  2112. // Determine whether this is a non-secure function call.
  2113. if (CLI.CB && CLI.CB->getAttributes().hasFnAttr("cmse_nonsecure_call"))
  2114. isCmseNSCall = true;
  2115. // Disable tail calls if they're not supported.
  2116. if (!Subtarget->supportsTailCall())
  2117. isTailCall = false;
  2118. // For both the non-secure calls and the returns from a CMSE entry function,
2119. // the function needs to do some extra work after the call, or before the
2120. // return, respectively, thus it cannot end with a tail call.
  2121. if (isCmseNSCall || AFI->isCmseNSEntryFunction())
  2122. isTailCall = false;
  2123. if (isa<GlobalAddressSDNode>(Callee)) {
  2124. // If we're optimizing for minimum size and the function is called three or
  2125. // more times in this block, we can improve codesize by calling indirectly
  2126. // as BLXr has a 16-bit encoding.
  2127. auto *GV = cast<GlobalAddressSDNode>(Callee)->getGlobal();
  2128. if (CLI.CB) {
  2129. auto *BB = CLI.CB->getParent();
  2130. PreferIndirect = Subtarget->isThumb() && Subtarget->hasMinSize() &&
  2131. count_if(GV->users(), [&BB](const User *U) {
  2132. return isa<Instruction>(U) &&
  2133. cast<Instruction>(U)->getParent() == BB;
  2134. }) > 2;
  2135. }
  2136. }
  2137. if (isTailCall) {
  2138. // Check if it's really possible to do a tail call.
  2139. isTailCall = IsEligibleForTailCallOptimization(
  2140. Callee, CallConv, isVarArg, isStructRet,
  2141. MF.getFunction().hasStructRetAttr(), Outs, OutVals, Ins, DAG,
  2142. PreferIndirect);
  2143. if (isTailCall && !getTargetMachine().Options.GuaranteedTailCallOpt &&
  2144. CallConv != CallingConv::Tail && CallConv != CallingConv::SwiftTail)
  2145. isSibCall = true;
  2146. // We don't support GuaranteedTailCallOpt for ARM, only automatically
  2147. // detected sibcalls.
  2148. if (isTailCall)
  2149. ++NumTailCalls;
  2150. }
  2151. if (!isTailCall && CLI.CB && CLI.CB->isMustTailCall())
  2152. report_fatal_error("failed to perform tail call elimination on a call "
  2153. "site marked musttail");
  2154. // Analyze operands of the call, assigning locations to each operand.
  2155. SmallVector<CCValAssign, 16> ArgLocs;
  2156. CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
  2157. *DAG.getContext());
  2158. CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CallConv, isVarArg));
  2159. // Get a count of how many bytes are to be pushed on the stack.
  2160. unsigned NumBytes = CCInfo.getNextStackOffset();
  2161. // SPDiff is the byte offset of the call's argument area from the callee's.
  2162. // Stores to callee stack arguments will be placed in FixedStackSlots offset
  2163. // by this amount for a tail call. In a sibling call it must be 0 because the
  2164. // caller will deallocate the entire stack and the callee still expects its
  2165. // arguments to begin at SP+0. Completely unused for non-tail calls.
  2166. int SPDiff = 0;
  2167. if (isTailCall && !isSibCall) {
  2168. auto FuncInfo = MF.getInfo<ARMFunctionInfo>();
  2169. unsigned NumReusableBytes = FuncInfo->getArgumentStackSize();
  2170. // Since callee will pop argument stack as a tail call, we must keep the
  2171. // popped size 16-byte aligned.
  2172. Align StackAlign = DAG.getDataLayout().getStackAlignment();
  2173. NumBytes = alignTo(NumBytes, StackAlign);
  2174. // SPDiff will be negative if this tail call requires more space than we
  2175. // would automatically have in our incoming argument space. Positive if we
  2176. // can actually shrink the stack.
  2177. SPDiff = NumReusableBytes - NumBytes;
  2178. // If this call requires more stack than we have available from
  2179. // LowerFormalArguments, tell FrameLowering to reserve space for it.
  2180. if (SPDiff < 0 && AFI->getArgRegsSaveSize() < (unsigned)-SPDiff)
  2181. AFI->setArgRegsSaveSize(-SPDiff);
  2182. }
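// Worked example of the SPDiff computation above: if the caller was entered
// with 8 bytes of incoming stack arguments (NumReusableBytes == 8) and this
// tail call needs 24 bytes, NumBytes is first aligned up to 32, giving
// SPDiff == -24, and frame lowering is told to reserve those extra 24 bytes
// via setArgRegsSaveSize.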
  2183. if (isSibCall) {
  2184. // For sibling tail calls, memory operands are available in our caller's stack.
  2185. NumBytes = 0;
  2186. } else {
  2187. // Adjust the stack pointer for the new arguments...
  2188. // These operations are automatically eliminated by the prolog/epilog pass
  2189. Chain = DAG.getCALLSEQ_START(Chain, isTailCall ? 0 : NumBytes, 0, dl);
  2190. }
  2191. SDValue StackPtr =
  2192. DAG.getCopyFromReg(Chain, dl, ARM::SP, getPointerTy(DAG.getDataLayout()));
  2193. RegsToPassVector RegsToPass;
  2194. SmallVector<SDValue, 8> MemOpChains;
  2195. // During a tail call, stores to the argument area must happen after all of
  2196. // the function's incoming arguments have been loaded because they may alias.
  2197. // This is done by folding in a TokenFactor from LowerFormalArguments, but
  2198. // there's no point in doing so repeatedly so this tracks whether that's
  2199. // happened yet.
  2200. bool AfterFormalArgLoads = false;
  2201. // Walk the register/memloc assignments, inserting copies/loads. In the case
  2202. // of tail call optimization, arguments are handled later.
  2203. for (unsigned i = 0, realArgIdx = 0, e = ArgLocs.size();
  2204. i != e;
  2205. ++i, ++realArgIdx) {
  2206. CCValAssign &VA = ArgLocs[i];
  2207. SDValue Arg = OutVals[realArgIdx];
  2208. ISD::ArgFlagsTy Flags = Outs[realArgIdx].Flags;
  2209. bool isByVal = Flags.isByVal();
  2210. // Promote the value if needed.
  2211. switch (VA.getLocInfo()) {
  2212. default: llvm_unreachable("Unknown loc info!");
  2213. case CCValAssign::Full: break;
  2214. case CCValAssign::SExt:
  2215. Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), Arg);
  2216. break;
  2217. case CCValAssign::ZExt:
  2218. Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), Arg);
  2219. break;
  2220. case CCValAssign::AExt:
  2221. Arg = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), Arg);
  2222. break;
  2223. case CCValAssign::BCvt:
  2224. Arg = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), Arg);
  2225. break;
  2226. }
  2227. if (isTailCall && VA.isMemLoc() && !AfterFormalArgLoads) {
  2228. Chain = DAG.getStackArgumentTokenFactor(Chain);
  2229. AfterFormalArgLoads = true;
  2230. }
  2231. // f16 arguments have their size extended to 4 bytes and passed as if they
  2232. // had been copied to the LSBs of a 32-bit register.
2233. // For that, they are passed extended to i32 (soft ABI) or to f32 (hard ABI)
  2234. if (VA.needsCustom() &&
  2235. (VA.getValVT() == MVT::f16 || VA.getValVT() == MVT::bf16)) {
  2236. Arg = MoveFromHPR(dl, DAG, VA.getLocVT(), VA.getValVT(), Arg);
  2237. } else {
  2238. // f16 arguments could have been extended prior to argument lowering.
2239. // Mask such arguments if this is a CMSE nonsecure call.
  2240. auto ArgVT = Outs[realArgIdx].ArgVT;
  2241. if (isCmseNSCall && (ArgVT == MVT::f16)) {
  2242. auto LocBits = VA.getLocVT().getSizeInBits();
  2243. auto MaskValue = APInt::getLowBitsSet(LocBits, ArgVT.getSizeInBits());
  2244. SDValue Mask =
  2245. DAG.getConstant(MaskValue, dl, MVT::getIntegerVT(LocBits));
  2246. Arg = DAG.getNode(ISD::BITCAST, dl, MVT::getIntegerVT(LocBits), Arg);
  2247. Arg = DAG.getNode(ISD::AND, dl, MVT::getIntegerVT(LocBits), Arg, Mask);
  2248. Arg = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), Arg);
  2249. }
  2250. }
  2251. // f64 and v2f64 might be passed in i32 pairs and must be split into pieces
  2252. if (VA.needsCustom() && VA.getLocVT() == MVT::v2f64) {
  2253. SDValue Op0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg,
  2254. DAG.getConstant(0, dl, MVT::i32));
  2255. SDValue Op1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg,
  2256. DAG.getConstant(1, dl, MVT::i32));
  2257. PassF64ArgInRegs(dl, DAG, Chain, Op0, RegsToPass, VA, ArgLocs[++i],
  2258. StackPtr, MemOpChains, isTailCall, SPDiff);
  2259. VA = ArgLocs[++i]; // skip ahead to next loc
  2260. if (VA.isRegLoc()) {
  2261. PassF64ArgInRegs(dl, DAG, Chain, Op1, RegsToPass, VA, ArgLocs[++i],
  2262. StackPtr, MemOpChains, isTailCall, SPDiff);
  2263. } else {
  2264. assert(VA.isMemLoc());
  2265. SDValue DstAddr;
  2266. MachinePointerInfo DstInfo;
  2267. std::tie(DstAddr, DstInfo) =
  2268. computeAddrForCallArg(dl, DAG, VA, StackPtr, isTailCall, SPDiff);
  2269. MemOpChains.push_back(DAG.getStore(Chain, dl, Op1, DstAddr, DstInfo));
  2270. }
  2271. } else if (VA.needsCustom() && VA.getLocVT() == MVT::f64) {
  2272. PassF64ArgInRegs(dl, DAG, Chain, Arg, RegsToPass, VA, ArgLocs[++i],
  2273. StackPtr, MemOpChains, isTailCall, SPDiff);
  2274. } else if (VA.isRegLoc()) {
  2275. if (realArgIdx == 0 && Flags.isReturned() && !Flags.isSwiftSelf() &&
  2276. Outs[0].VT == MVT::i32) {
  2277. assert(VA.getLocVT() == MVT::i32 &&
  2278. "unexpected calling convention register assignment");
  2279. assert(!Ins.empty() && Ins[0].VT == MVT::i32 &&
  2280. "unexpected use of 'returned'");
  2281. isThisReturn = true;
  2282. }
  2283. const TargetOptions &Options = DAG.getTarget().Options;
  2284. if (Options.EmitCallSiteInfo)
  2285. CSInfo.emplace_back(VA.getLocReg(), i);
  2286. RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
  2287. } else if (isByVal) {
  2288. assert(VA.isMemLoc());
  2289. unsigned offset = 0;
  2290. // True if this byval aggregate will be split between registers
  2291. // and memory.
  2292. unsigned ByValArgsCount = CCInfo.getInRegsParamsCount();
  2293. unsigned CurByValIdx = CCInfo.getInRegsParamsProcessed();
  2294. if (CurByValIdx < ByValArgsCount) {
  2295. unsigned RegBegin, RegEnd;
  2296. CCInfo.getInRegsParamInfo(CurByValIdx, RegBegin, RegEnd);
  2297. EVT PtrVT =
  2298. DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
  2299. unsigned int i, j;
  2300. for (i = 0, j = RegBegin; j < RegEnd; i++, j++) {
  2301. SDValue Const = DAG.getConstant(4*i, dl, MVT::i32);
  2302. SDValue AddArg = DAG.getNode(ISD::ADD, dl, PtrVT, Arg, Const);
  2303. SDValue Load =
  2304. DAG.getLoad(PtrVT, dl, Chain, AddArg, MachinePointerInfo(),
  2305. DAG.InferPtrAlign(AddArg));
  2306. MemOpChains.push_back(Load.getValue(1));
  2307. RegsToPass.push_back(std::make_pair(j, Load));
  2308. }
2309. // If the parameter size exceeds the register area, the "offset" value
2310. // helps us calculate the stack slot for the remaining part properly.
  2311. offset = RegEnd - RegBegin;
  2312. CCInfo.nextInRegsParam();
  2313. }
  2314. if (Flags.getByValSize() > 4*offset) {
  2315. auto PtrVT = getPointerTy(DAG.getDataLayout());
  2316. SDValue Dst;
  2317. MachinePointerInfo DstInfo;
  2318. std::tie(Dst, DstInfo) =
  2319. computeAddrForCallArg(dl, DAG, VA, StackPtr, isTailCall, SPDiff);
  2320. SDValue SrcOffset = DAG.getIntPtrConstant(4*offset, dl);
  2321. SDValue Src = DAG.getNode(ISD::ADD, dl, PtrVT, Arg, SrcOffset);
  2322. SDValue SizeNode = DAG.getConstant(Flags.getByValSize() - 4*offset, dl,
  2323. MVT::i32);
  2324. SDValue AlignNode =
  2325. DAG.getConstant(Flags.getNonZeroByValAlign().value(), dl, MVT::i32);
  2326. SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
  2327. SDValue Ops[] = { Chain, Dst, Src, SizeNode, AlignNode};
  2328. MemOpChains.push_back(DAG.getNode(ARMISD::COPY_STRUCT_BYVAL, dl, VTs,
  2329. Ops));
  2330. }
  2331. } else {
  2332. assert(VA.isMemLoc());
  2333. SDValue DstAddr;
  2334. MachinePointerInfo DstInfo;
  2335. std::tie(DstAddr, DstInfo) =
  2336. computeAddrForCallArg(dl, DAG, VA, StackPtr, isTailCall, SPDiff);
  2337. SDValue Store = DAG.getStore(Chain, dl, Arg, DstAddr, DstInfo);
  2338. MemOpChains.push_back(Store);
  2339. }
  2340. }
  2341. if (!MemOpChains.empty())
  2342. Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains);
  2343. // Build a sequence of copy-to-reg nodes chained together with token chain
  2344. // and flag operands which copy the outgoing args into the appropriate regs.
  2345. SDValue InFlag;
  2346. for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
  2347. Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
  2348. RegsToPass[i].second, InFlag);
  2349. InFlag = Chain.getValue(1);
  2350. }
  2351. // If the callee is a GlobalAddress/ExternalSymbol node (quite common, every
  2352. // direct call is) turn it into a TargetGlobalAddress/TargetExternalSymbol
  2353. // node so that legalize doesn't hack it.
  2354. bool isDirect = false;
  2355. const TargetMachine &TM = getTargetMachine();
  2356. const Module *Mod = MF.getFunction().getParent();
  2357. const GlobalValue *GV = nullptr;
  2358. if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee))
  2359. GV = G->getGlobal();
  2360. bool isStub =
  2361. !TM.shouldAssumeDSOLocal(*Mod, GV) && Subtarget->isTargetMachO();
  2362. bool isARMFunc = !Subtarget->isThumb() || (isStub && !Subtarget->isMClass());
  2363. bool isLocalARMFunc = false;
  2364. auto PtrVt = getPointerTy(DAG.getDataLayout());
  2365. if (Subtarget->genLongCalls()) {
  2366. assert((!isPositionIndependent() || Subtarget->isTargetWindows()) &&
  2367. "long-calls codegen is not position independent!");
  2368. // Handle a global address or an external symbol. If it's not one of
  2369. // those, the target's already in a register, so we don't need to do
  2370. // anything extra.
  2371. if (isa<GlobalAddressSDNode>(Callee)) {
  2372. // Create a constant pool entry for the callee address
  2373. unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
  2374. ARMConstantPoolValue *CPV =
  2375. ARMConstantPoolConstant::Create(GV, ARMPCLabelIndex, ARMCP::CPValue, 0);
  2376. // Get the address of the callee into a register
  2377. SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVt, Align(4));
  2378. CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
  2379. Callee = DAG.getLoad(
  2380. PtrVt, dl, DAG.getEntryNode(), CPAddr,
  2381. MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
  2382. } else if (ExternalSymbolSDNode *S=dyn_cast<ExternalSymbolSDNode>(Callee)) {
  2383. const char *Sym = S->getSymbol();
  2384. // Create a constant pool entry for the callee address
  2385. unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
  2386. ARMConstantPoolValue *CPV =
  2387. ARMConstantPoolSymbol::Create(*DAG.getContext(), Sym,
  2388. ARMPCLabelIndex, 0);
  2389. // Get the address of the callee into a register
  2390. SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVt, Align(4));
  2391. CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
  2392. Callee = DAG.getLoad(
  2393. PtrVt, dl, DAG.getEntryNode(), CPAddr,
  2394. MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
  2395. }
  2396. } else if (isa<GlobalAddressSDNode>(Callee)) {
  2397. if (!PreferIndirect) {
  2398. isDirect = true;
  2399. bool isDef = GV->isStrongDefinitionForLinker();
  2400. // ARM call to a local ARM function is predicable.
  2401. isLocalARMFunc = !Subtarget->isThumb() && (isDef || !ARMInterworking);
  2402. // tBX takes a register source operand.
  2403. if (isStub && Subtarget->isThumb1Only() && !Subtarget->hasV5TOps()) {
  2404. assert(Subtarget->isTargetMachO() && "WrapperPIC use on non-MachO?");
  2405. Callee = DAG.getNode(
  2406. ARMISD::WrapperPIC, dl, PtrVt,
  2407. DAG.getTargetGlobalAddress(GV, dl, PtrVt, 0, ARMII::MO_NONLAZY));
  2408. Callee = DAG.getLoad(
  2409. PtrVt, dl, DAG.getEntryNode(), Callee,
  2410. MachinePointerInfo::getGOT(DAG.getMachineFunction()), MaybeAlign(),
  2411. MachineMemOperand::MODereferenceable |
  2412. MachineMemOperand::MOInvariant);
  2413. } else if (Subtarget->isTargetCOFF()) {
  2414. assert(Subtarget->isTargetWindows() &&
  2415. "Windows is the only supported COFF target");
  2416. unsigned TargetFlags = ARMII::MO_NO_FLAG;
  2417. if (GV->hasDLLImportStorageClass())
  2418. TargetFlags = ARMII::MO_DLLIMPORT;
  2419. else if (!TM.shouldAssumeDSOLocal(*GV->getParent(), GV))
  2420. TargetFlags = ARMII::MO_COFFSTUB;
  2421. Callee = DAG.getTargetGlobalAddress(GV, dl, PtrVt, /*offset=*/0,
  2422. TargetFlags);
  2423. if (TargetFlags & (ARMII::MO_DLLIMPORT | ARMII::MO_COFFSTUB))
  2424. Callee =
  2425. DAG.getLoad(PtrVt, dl, DAG.getEntryNode(),
  2426. DAG.getNode(ARMISD::Wrapper, dl, PtrVt, Callee),
  2427. MachinePointerInfo::getGOT(DAG.getMachineFunction()));
  2428. } else {
  2429. Callee = DAG.getTargetGlobalAddress(GV, dl, PtrVt, 0, 0);
  2430. }
  2431. }
  2432. } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
  2433. isDirect = true;
  2434. // tBX takes a register source operand.
  2435. const char *Sym = S->getSymbol();
  2436. if (isARMFunc && Subtarget->isThumb1Only() && !Subtarget->hasV5TOps()) {
  2437. unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
  2438. ARMConstantPoolValue *CPV =
  2439. ARMConstantPoolSymbol::Create(*DAG.getContext(), Sym,
  2440. ARMPCLabelIndex, 4);
  2441. SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVt, Align(4));
  2442. CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
  2443. Callee = DAG.getLoad(
  2444. PtrVt, dl, DAG.getEntryNode(), CPAddr,
  2445. MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
  2446. SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32);
  2447. Callee = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVt, Callee, PICLabel);
  2448. } else {
  2449. Callee = DAG.getTargetExternalSymbol(Sym, PtrVt, 0);
  2450. }
  2451. }
  2452. if (isCmseNSCall) {
  2453. assert(!isARMFunc && !isDirect &&
  2454. "Cannot handle call to ARM function or direct call");
  2455. if (NumBytes > 0) {
  2456. DiagnosticInfoUnsupported Diag(DAG.getMachineFunction().getFunction(),
  2457. "call to non-secure function would "
  2458. "require passing arguments on stack",
  2459. dl.getDebugLoc());
  2460. DAG.getContext()->diagnose(Diag);
  2461. }
  2462. if (isStructRet) {
  2463. DiagnosticInfoUnsupported Diag(
  2464. DAG.getMachineFunction().getFunction(),
  2465. "call to non-secure function would return value through pointer",
  2466. dl.getDebugLoc());
  2467. DAG.getContext()->diagnose(Diag);
  2468. }
  2469. }
  2470. // FIXME: handle tail calls differently.
  2471. unsigned CallOpc;
  2472. if (Subtarget->isThumb()) {
  2473. if (GuardWithBTI)
  2474. CallOpc = ARMISD::t2CALL_BTI;
  2475. else if (isCmseNSCall)
  2476. CallOpc = ARMISD::tSECALL;
  2477. else if ((!isDirect || isARMFunc) && !Subtarget->hasV5TOps())
  2478. CallOpc = ARMISD::CALL_NOLINK;
  2479. else
  2480. CallOpc = ARMISD::CALL;
  2481. } else {
  2482. if (!isDirect && !Subtarget->hasV5TOps())
  2483. CallOpc = ARMISD::CALL_NOLINK;
  2484. else if (doesNotRet && isDirect && Subtarget->hasRetAddrStack() &&
  2485. // Emit regular call when code size is the priority
  2486. !Subtarget->hasMinSize())
  2487. // "mov lr, pc; b _foo" to avoid confusing the RSP
  2488. CallOpc = ARMISD::CALL_NOLINK;
  2489. else
  2490. CallOpc = isLocalARMFunc ? ARMISD::CALL_PRED : ARMISD::CALL;
  2491. }
  2492. // We don't usually want to end the call-sequence here because we would tidy
2493. // the frame up *after* the call; however, in the ABI-changing tail-call case
  2494. // we've carefully laid out the parameters so that when sp is reset they'll be
  2495. // in the correct location.
  2496. if (isTailCall && !isSibCall) {
  2497. Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, dl, true),
  2498. DAG.getIntPtrConstant(0, dl, true), InFlag, dl);
  2499. InFlag = Chain.getValue(1);
  2500. }
  2501. std::vector<SDValue> Ops;
  2502. Ops.push_back(Chain);
  2503. Ops.push_back(Callee);
  2504. if (isTailCall) {
  2505. Ops.push_back(DAG.getTargetConstant(SPDiff, dl, MVT::i32));
  2506. }
  2507. // Add argument registers to the end of the list so that they are known live
  2508. // into the call.
  2509. for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
  2510. Ops.push_back(DAG.getRegister(RegsToPass[i].first,
  2511. RegsToPass[i].second.getValueType()));
  2512. // Add a register mask operand representing the call-preserved registers.
  2513. if (!isTailCall) {
  2514. const uint32_t *Mask;
  2515. const ARMBaseRegisterInfo *ARI = Subtarget->getRegisterInfo();
  2516. if (isThisReturn) {
  2517. // For 'this' returns, use the R0-preserving mask if applicable
  2518. Mask = ARI->getThisReturnPreservedMask(MF, CallConv);
  2519. if (!Mask) {
  2520. // Set isThisReturn to false if the calling convention is not one that
  2521. // allows 'returned' to be modeled in this way, so LowerCallResult does
  2522. // not try to pass 'this' straight through
  2523. isThisReturn = false;
  2524. Mask = ARI->getCallPreservedMask(MF, CallConv);
  2525. }
  2526. } else
  2527. Mask = ARI->getCallPreservedMask(MF, CallConv);
  2528. assert(Mask && "Missing call preserved mask for calling convention");
  2529. Ops.push_back(DAG.getRegisterMask(Mask));
  2530. }
  2531. if (InFlag.getNode())
  2532. Ops.push_back(InFlag);
  2533. SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
  2534. if (isTailCall) {
  2535. MF.getFrameInfo().setHasTailCall();
  2536. SDValue Ret = DAG.getNode(ARMISD::TC_RETURN, dl, NodeTys, Ops);
  2537. DAG.addCallSiteInfo(Ret.getNode(), std::move(CSInfo));
  2538. return Ret;
  2539. }
  2540. // Returns a chain and a flag for retval copy to use.
  2541. Chain = DAG.getNode(CallOpc, dl, NodeTys, Ops);
  2542. DAG.addNoMergeSiteInfo(Chain.getNode(), CLI.NoMerge);
  2543. InFlag = Chain.getValue(1);
  2544. DAG.addCallSiteInfo(Chain.getNode(), std::move(CSInfo));
  2545. // If we're guaranteeing tail-calls will be honoured, the callee must
  2546. // pop its own argument stack on return. But this call is *not* a tail call so
  2547. // we need to undo that after it returns to restore the status-quo.
  2548. bool TailCallOpt = getTargetMachine().Options.GuaranteedTailCallOpt;
  2549. uint64_t CalleePopBytes =
  2550. canGuaranteeTCO(CallConv, TailCallOpt) ? alignTo(NumBytes, 16) : -1ULL;
  2551. Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, dl, true),
  2552. DAG.getIntPtrConstant(CalleePopBytes, dl, true),
  2553. InFlag, dl);
  2554. if (!Ins.empty())
  2555. InFlag = Chain.getValue(1);
  2556. // Handle result values, copying them out of physregs into vregs that we
  2557. // return.
  2558. return LowerCallResult(Chain, InFlag, CallConv, isVarArg, Ins, dl, DAG,
  2559. InVals, isThisReturn,
  2560. isThisReturn ? OutVals[0] : SDValue());
  2561. }
  2562. /// HandleByVal - Every parameter *after* a byval parameter is passed
  2563. /// on the stack. Remember the next parameter register to allocate,
2564. /// and then confiscate the rest of the parameter registers to ensure
  2565. /// this.
  2566. void ARMTargetLowering::HandleByVal(CCState *State, unsigned &Size,
  2567. Align Alignment) const {
  2568. // Byval (as with any stack) slots are always at least 4 byte aligned.
  2569. Alignment = std::max(Alignment, Align(4));
  2570. unsigned Reg = State->AllocateReg(GPRArgRegs);
  2571. if (!Reg)
  2572. return;
  2573. unsigned AlignInRegs = Alignment.value() / 4;
  2574. unsigned Waste = (ARM::R4 - Reg) % AlignInRegs;
  2575. for (unsigned i = 0; i < Waste; ++i)
  2576. Reg = State->AllocateReg(GPRArgRegs);
  2577. if (!Reg)
  2578. return;
  2579. unsigned Excess = 4 * (ARM::R4 - Reg);
2580. // Special case when NSAA != SP and the parameter size is greater than the
2581. // size of all remaining GPR regs. In that case we can't split the parameter;
2582. // we must send it to the stack. We also must set NCRN to R4, i.e. waste all
2583. // remaining registers.
  2584. const unsigned NSAAOffset = State->getNextStackOffset();
  2585. if (NSAAOffset != 0 && Size > Excess) {
  2586. while (State->AllocateReg(GPRArgRegs))
  2587. ;
  2588. return;
  2589. }
2590. // The first register for the byval parameter is the first register that
2591. // wasn't allocated before this method call, so it would be "reg".
2592. // If the parameter is small enough to be saved in the range [reg, r4), then
2593. // the end (one past the last) register would be reg + param-size-in-regs;
2594. // otherwise the parameter is split between registers and stack, and the
2595. // end register is r4 in that case.
  2596. unsigned ByValRegBegin = Reg;
  2597. unsigned ByValRegEnd = std::min<unsigned>(Reg + Size / 4, ARM::R4);
  2598. State->addInRegsParamInfo(ByValRegBegin, ByValRegEnd);
2599. // Note that the first register was already allocated at the beginning of
2600. // the function; allocate the remaining registers we need.
  2601. for (unsigned i = Reg + 1; i != ByValRegEnd; ++i)
  2602. State->AllocateReg(GPRArgRegs);
  2603. // A byval parameter that is split between registers and memory needs its
  2604. // size truncated here.
  2605. // In the case where the entire structure fits in registers, we set the
  2606. // size in memory to zero.
  2607. Size = std::max<int>(Size - Excess, 0);
  2608. }
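// A hypothetical walk-through of HandleByVal: a 16-byte byval with 8-byte
// alignment arriving when r1 is the next free GPR first wastes r1 to reach an
// even register, leaving r2-r3 (Excess == 8 bytes). With nothing yet on the
// stack (NSAAOffset == 0) the aggregate is split: [r2, r4) is recorded as the
// in-register part and Size is reduced to the 8 bytes that still go to memory.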
  2609. /// MatchingStackOffset - Return true if the given stack call argument is
  2610. /// already available in the same position (relatively) of the caller's
  2611. /// incoming argument stack.
  2612. static
  2613. bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags,
  2614. MachineFrameInfo &MFI, const MachineRegisterInfo *MRI,
  2615. const TargetInstrInfo *TII) {
  2616. unsigned Bytes = Arg.getValueSizeInBits() / 8;
  2617. int FI = std::numeric_limits<int>::max();
  2618. if (Arg.getOpcode() == ISD::CopyFromReg) {
  2619. Register VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg();
  2620. if (!Register::isVirtualRegister(VR))
  2621. return false;
  2622. MachineInstr *Def = MRI->getVRegDef(VR);
  2623. if (!Def)
  2624. return false;
  2625. if (!Flags.isByVal()) {
  2626. if (!TII->isLoadFromStackSlot(*Def, FI))
  2627. return false;
  2628. } else {
  2629. return false;
  2630. }
  2631. } else if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Arg)) {
  2632. if (Flags.isByVal())
  2633. // ByVal argument is passed in as a pointer but it's now being
  2634. // dereferenced. e.g.
  2635. // define @foo(%struct.X* %A) {
  2636. // tail call @bar(%struct.X* byval %A)
  2637. // }
  2638. return false;
  2639. SDValue Ptr = Ld->getBasePtr();
  2640. FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr);
  2641. if (!FINode)
  2642. return false;
  2643. FI = FINode->getIndex();
  2644. } else
  2645. return false;
  2646. assert(FI != std::numeric_limits<int>::max());
  2647. if (!MFI.isFixedObjectIndex(FI))
  2648. return false;
  2649. return Offset == MFI.getObjectOffset(FI) && Bytes == MFI.getObjectSize(FI);
  2650. }
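// Sketch of what MatchingStackOffset accepts: an outgoing value that is either
// a CopyFromReg of a vreg defined by a load from a fixed stack slot, or a
// direct load whose base pointer is a FrameIndex, counts as "already in place"
// when that fixed object's offset and size match the slot the tail callee
// expects. Byval arguments, and anything not traceable to a fixed object, are
// rejected.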
  2651. /// IsEligibleForTailCallOptimization - Check whether the call is eligible
  2652. /// for tail call optimization. Targets which want to do tail call
  2653. /// optimization should implement this function.
  2654. bool ARMTargetLowering::IsEligibleForTailCallOptimization(
  2655. SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg,
  2656. bool isCalleeStructRet, bool isCallerStructRet,
  2657. const SmallVectorImpl<ISD::OutputArg> &Outs,
  2658. const SmallVectorImpl<SDValue> &OutVals,
  2659. const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG,
  2660. const bool isIndirect) const {
  2661. MachineFunction &MF = DAG.getMachineFunction();
  2662. const Function &CallerF = MF.getFunction();
  2663. CallingConv::ID CallerCC = CallerF.getCallingConv();
  2664. assert(Subtarget->supportsTailCall());
  2665. // Indirect tail calls cannot be optimized for Thumb1 if the args
  2666. // to the call take up r0-r3. The reason is that there are no legal registers
  2667. // left to hold the pointer to the function to be called.
  2668. // Similarly, if the function uses return address sign and authentication,
  2669. // r12 is needed to hold the PAC and is not available to hold the callee
  2670. // address.
  2671. if (Outs.size() >= 4 &&
  2672. (!isa<GlobalAddressSDNode>(Callee.getNode()) || isIndirect)) {
  2673. if (Subtarget->isThumb1Only())
  2674. return false;
  2675. // Conservatively assume the function spills LR.
  2676. if (MF.getInfo<ARMFunctionInfo>()->shouldSignReturnAddress(true))
  2677. return false;
  2678. }
  2679. // Look for obvious safe cases to perform tail call optimization that do not
  2680. // require ABI changes. This is what gcc calls sibcall.
  2681. // Exception-handling functions need a special set of instructions to indicate
  2682. // a return to the hardware. Tail-calling another function would probably
  2683. // break this.
  2684. if (CallerF.hasFnAttribute("interrupt"))
  2685. return false;
  2686. if (canGuaranteeTCO(CalleeCC, getTargetMachine().Options.GuaranteedTailCallOpt))
  2687. return CalleeCC == CallerCC;
  2688. // Also avoid sibcall optimization if either caller or callee uses struct
  2689. // return semantics.
  2690. if (isCalleeStructRet || isCallerStructRet)
  2691. return false;
  2692. // Externally-defined functions with weak linkage should not be
  2693. // tail-called on ARM when the OS does not support dynamic
  2694. // pre-emption of symbols, as the AAELF spec requires normal calls
  2695. // to undefined weak functions to be replaced with a NOP or jump to the
  2696. // next instruction. The behaviour of branch instructions in this
  2697. // situation (as used for tail calls) is implementation-defined, so we
  2698. // cannot rely on the linker replacing the tail call with a return.
  2699. if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
  2700. const GlobalValue *GV = G->getGlobal();
  2701. const Triple &TT = getTargetMachine().getTargetTriple();
  2702. if (GV->hasExternalWeakLinkage() &&
  2703. (!TT.isOSWindows() || TT.isOSBinFormatELF() || TT.isOSBinFormatMachO()))
  2704. return false;
  2705. }
  2706. // Check that the call results are passed in the same way.
  2707. LLVMContext &C = *DAG.getContext();
  2708. if (!CCState::resultsCompatible(
  2709. getEffectiveCallingConv(CalleeCC, isVarArg),
  2710. getEffectiveCallingConv(CallerCC, CallerF.isVarArg()), MF, C, Ins,
  2711. CCAssignFnForReturn(CalleeCC, isVarArg),
  2712. CCAssignFnForReturn(CallerCC, CallerF.isVarArg())))
  2713. return false;
  2714. // The callee has to preserve all registers the caller needs to preserve.
  2715. const ARMBaseRegisterInfo *TRI = Subtarget->getRegisterInfo();
  2716. const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
  2717. if (CalleeCC != CallerCC) {
  2718. const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
  2719. if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
  2720. return false;
  2721. }
// If the caller's vararg or byval argument has been split between registers
// and stack, do not perform a tail call, since part of the argument is in
// the caller's local frame.
  2725. const ARMFunctionInfo *AFI_Caller = MF.getInfo<ARMFunctionInfo>();
  2726. if (AFI_Caller->getArgRegsSaveSize())
  2727. return false;
  2728. // If the callee takes no arguments then go on to check the results of the
  2729. // call.
  2730. if (!Outs.empty()) {
  2731. // Check if stack adjustment is needed. For now, do not do this if any
  2732. // argument is passed on the stack.
  2733. SmallVector<CCValAssign, 16> ArgLocs;
  2734. CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C);
  2735. CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, isVarArg));
  2736. if (CCInfo.getNextStackOffset()) {
  2737. // Check if the arguments are already laid out in the right way as
  2738. // the caller's fixed stack objects.
  2739. MachineFrameInfo &MFI = MF.getFrameInfo();
  2740. const MachineRegisterInfo *MRI = &MF.getRegInfo();
  2741. const TargetInstrInfo *TII = Subtarget->getInstrInfo();
  2742. for (unsigned i = 0, realArgIdx = 0, e = ArgLocs.size();
  2743. i != e;
  2744. ++i, ++realArgIdx) {
  2745. CCValAssign &VA = ArgLocs[i];
  2746. EVT RegVT = VA.getLocVT();
  2747. SDValue Arg = OutVals[realArgIdx];
  2748. ISD::ArgFlagsTy Flags = Outs[realArgIdx].Flags;
  2749. if (VA.getLocInfo() == CCValAssign::Indirect)
  2750. return false;
  2751. if (VA.needsCustom() && (RegVT == MVT::f64 || RegVT == MVT::v2f64)) {
  2752. // f64 and vector types are split into multiple registers or
  2753. // register/stack-slot combinations. The types will not match
  2754. // the registers; give up on memory f64 refs until we figure
  2755. // out what to do about this.
  2756. if (!VA.isRegLoc())
  2757. return false;
  2758. if (!ArgLocs[++i].isRegLoc())
  2759. return false;
  2760. if (RegVT == MVT::v2f64) {
  2761. if (!ArgLocs[++i].isRegLoc())
  2762. return false;
  2763. if (!ArgLocs[++i].isRegLoc())
  2764. return false;
  2765. }
  2766. } else if (!VA.isRegLoc()) {
  2767. if (!MatchingStackOffset(Arg, VA.getLocMemOffset(), Flags,
  2768. MFI, MRI, TII))
  2769. return false;
  2770. }
  2771. }
  2772. }
  2773. const MachineRegisterInfo &MRI = MF.getRegInfo();
  2774. if (!parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals))
  2775. return false;
  2776. }
  2777. return true;
  2778. }
  2779. bool
  2780. ARMTargetLowering::CanLowerReturn(CallingConv::ID CallConv,
  2781. MachineFunction &MF, bool isVarArg,
  2782. const SmallVectorImpl<ISD::OutputArg> &Outs,
  2783. LLVMContext &Context) const {
  2784. SmallVector<CCValAssign, 16> RVLocs;
  2785. CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context);
  2786. return CCInfo.CheckReturn(Outs, CCAssignFnForReturn(CallConv, isVarArg));
  2787. }
  2788. static SDValue LowerInterruptReturn(SmallVectorImpl<SDValue> &RetOps,
  2789. const SDLoc &DL, SelectionDAG &DAG) {
  2790. const MachineFunction &MF = DAG.getMachineFunction();
  2791. const Function &F = MF.getFunction();
  2792. StringRef IntKind = F.getFnAttribute("interrupt").getValueAsString();
  2793. // See ARM ARM v7 B1.8.3. On exception entry LR is set to a possibly offset
  2794. // version of the "preferred return address". These offsets affect the return
  2795. // instruction if this is a return from PL1 without hypervisor extensions.
  2796. // IRQ/FIQ: +4 "subs pc, lr, #4"
  2797. // SWI: 0 "subs pc, lr, #0"
  2798. // ABORT: +4 "subs pc, lr, #4"
  2799. // UNDEF: +4/+2 "subs pc, lr, #0"
// UNDEF varies depending on whether the exception came from ARM or Thumb
// mode. Alongside GCC, we throw our hands up in disgust and pretend it's 0.
  2802. int64_t LROffset;
  2803. if (IntKind == "" || IntKind == "IRQ" || IntKind == "FIQ" ||
  2804. IntKind == "ABORT")
  2805. LROffset = 4;
  2806. else if (IntKind == "SWI" || IntKind == "UNDEF")
  2807. LROffset = 0;
  2808. else
  2809. report_fatal_error("Unsupported interrupt attribute. If present, value "
  2810. "must be one of: IRQ, FIQ, SWI, ABORT or UNDEF");
  2811. RetOps.insert(RetOps.begin() + 1,
  2812. DAG.getConstant(LROffset, DL, MVT::i32, false));
  2813. return DAG.getNode(ARMISD::INTRET_FLAG, DL, MVT::Other, RetOps);
  2814. }
  2815. SDValue
  2816. ARMTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
  2817. bool isVarArg,
  2818. const SmallVectorImpl<ISD::OutputArg> &Outs,
  2819. const SmallVectorImpl<SDValue> &OutVals,
  2820. const SDLoc &dl, SelectionDAG &DAG) const {
  2821. // CCValAssign - represent the assignment of the return value to a location.
  2822. SmallVector<CCValAssign, 16> RVLocs;
  2823. // CCState - Info about the registers and stack slots.
  2824. CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
  2825. *DAG.getContext());
  2826. // Analyze outgoing return values.
  2827. CCInfo.AnalyzeReturn(Outs, CCAssignFnForReturn(CallConv, isVarArg));
  2828. SDValue Flag;
  2829. SmallVector<SDValue, 4> RetOps;
  2830. RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
  2831. bool isLittleEndian = Subtarget->isLittle();
  2832. MachineFunction &MF = DAG.getMachineFunction();
  2833. ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
  2834. AFI->setReturnRegsCount(RVLocs.size());
  2835. // Report error if cmse entry function returns structure through first ptr arg.
  2836. if (AFI->isCmseNSEntryFunction() && MF.getFunction().hasStructRetAttr()) {
  2837. // Note: using an empty SDLoc(), as the first line of the function is a
  2838. // better place to report than the last line.
  2839. DiagnosticInfoUnsupported Diag(
  2840. DAG.getMachineFunction().getFunction(),
  2841. "secure entry function would return value through pointer",
  2842. SDLoc().getDebugLoc());
  2843. DAG.getContext()->diagnose(Diag);
  2844. }
  2845. // Copy the result values into the output registers.
  2846. for (unsigned i = 0, realRVLocIdx = 0;
  2847. i != RVLocs.size();
  2848. ++i, ++realRVLocIdx) {
  2849. CCValAssign &VA = RVLocs[i];
  2850. assert(VA.isRegLoc() && "Can only return in registers!");
  2851. SDValue Arg = OutVals[realRVLocIdx];
  2852. bool ReturnF16 = false;
  2853. if (Subtarget->hasFullFP16() && Subtarget->isTargetHardFloat()) {
  2854. // Half-precision return values can be returned like this:
  2855. //
// t11: f16 = fadd ...
  2857. // t12: i16 = bitcast t11
  2858. // t13: i32 = zero_extend t12
  2859. // t14: f32 = bitcast t13 <~~~~~~~ Arg
  2860. //
  2861. // to avoid code generation for bitcasts, we simply set Arg to the node
  2862. // that produces the f16 value, t11 in this case.
  2863. //
  2864. if (Arg.getValueType() == MVT::f32 && Arg.getOpcode() == ISD::BITCAST) {
  2865. SDValue ZE = Arg.getOperand(0);
  2866. if (ZE.getOpcode() == ISD::ZERO_EXTEND && ZE.getValueType() == MVT::i32) {
  2867. SDValue BC = ZE.getOperand(0);
  2868. if (BC.getOpcode() == ISD::BITCAST && BC.getValueType() == MVT::i16) {
  2869. Arg = BC.getOperand(0);
  2870. ReturnF16 = true;
  2871. }
  2872. }
  2873. }
  2874. }
  2875. switch (VA.getLocInfo()) {
  2876. default: llvm_unreachable("Unknown loc info!");
  2877. case CCValAssign::Full: break;
  2878. case CCValAssign::BCvt:
  2879. if (!ReturnF16)
  2880. Arg = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), Arg);
  2881. break;
  2882. }
// Mask f16 return values if this is a CMSE nonsecure entry function.
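// Clearing the unused top bits of the return register keeps data from the
// secure state from leaking to the nonsecure caller.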
  2884. auto RetVT = Outs[realRVLocIdx].ArgVT;
  2885. if (AFI->isCmseNSEntryFunction() && (RetVT == MVT::f16)) {
  2886. if (VA.needsCustom() && VA.getValVT() == MVT::f16) {
  2887. Arg = MoveFromHPR(dl, DAG, VA.getLocVT(), VA.getValVT(), Arg);
  2888. } else {
  2889. auto LocBits = VA.getLocVT().getSizeInBits();
  2890. auto MaskValue = APInt::getLowBitsSet(LocBits, RetVT.getSizeInBits());
  2891. SDValue Mask =
  2892. DAG.getConstant(MaskValue, dl, MVT::getIntegerVT(LocBits));
  2893. Arg = DAG.getNode(ISD::BITCAST, dl, MVT::getIntegerVT(LocBits), Arg);
  2894. Arg = DAG.getNode(ISD::AND, dl, MVT::getIntegerVT(LocBits), Arg, Mask);
  2895. Arg = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), Arg);
  2896. }
  2897. }
  2898. if (VA.needsCustom() &&
  2899. (VA.getLocVT() == MVT::v2f64 || VA.getLocVT() == MVT::f64)) {
  2900. if (VA.getLocVT() == MVT::v2f64) {
  2901. // Extract the first half and return it in two registers.
  2902. SDValue Half = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg,
  2903. DAG.getConstant(0, dl, MVT::i32));
  2904. SDValue HalfGPRs = DAG.getNode(ARMISD::VMOVRRD, dl,
  2905. DAG.getVTList(MVT::i32, MVT::i32), Half);
  2906. Chain =
  2907. DAG.getCopyToReg(Chain, dl, VA.getLocReg(),
  2908. HalfGPRs.getValue(isLittleEndian ? 0 : 1), Flag);
  2909. Flag = Chain.getValue(1);
  2910. RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
  2911. VA = RVLocs[++i]; // skip ahead to next loc
  2912. Chain =
  2913. DAG.getCopyToReg(Chain, dl, VA.getLocReg(),
  2914. HalfGPRs.getValue(isLittleEndian ? 1 : 0), Flag);
  2915. Flag = Chain.getValue(1);
  2916. RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
  2917. VA = RVLocs[++i]; // skip ahead to next loc
  2918. // Extract the 2nd half and fall through to handle it as an f64 value.
  2919. Arg = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg,
  2920. DAG.getConstant(1, dl, MVT::i32));
  2921. }
  2922. // Legalize ret f64 -> ret 2 x i32. We always have fmrrd if f64 is
  2923. // available.
  2924. SDValue fmrrd = DAG.getNode(ARMISD::VMOVRRD, dl,
  2925. DAG.getVTList(MVT::i32, MVT::i32), Arg);
  2926. Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(),
  2927. fmrrd.getValue(isLittleEndian ? 0 : 1), Flag);
  2928. Flag = Chain.getValue(1);
  2929. RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
  2930. VA = RVLocs[++i]; // skip ahead to next loc
  2931. Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(),
  2932. fmrrd.getValue(isLittleEndian ? 1 : 0), Flag);
  2933. } else
  2934. Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), Arg, Flag);
// Glue all emitted copies together so the scheduler cannot separate them
// or interleave other register writes between them.
  2937. Flag = Chain.getValue(1);
  2938. RetOps.push_back(DAG.getRegister(
  2939. VA.getLocReg(), ReturnF16 ? Arg.getValueType() : VA.getLocVT()));
  2940. }
  2941. const ARMBaseRegisterInfo *TRI = Subtarget->getRegisterInfo();
  2942. const MCPhysReg *I =
  2943. TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction());
  2944. if (I) {
  2945. for (; *I; ++I) {
  2946. if (ARM::GPRRegClass.contains(*I))
  2947. RetOps.push_back(DAG.getRegister(*I, MVT::i32));
  2948. else if (ARM::DPRRegClass.contains(*I))
  2949. RetOps.push_back(DAG.getRegister(*I, MVT::getFloatingPointVT(64)));
  2950. else
  2951. llvm_unreachable("Unexpected register class in CSRsViaCopy!");
  2952. }
  2953. }
  2954. // Update chain and glue.
  2955. RetOps[0] = Chain;
  2956. if (Flag.getNode())
  2957. RetOps.push_back(Flag);
  2958. // CPUs which aren't M-class use a special sequence to return from
  2959. // exceptions (roughly, any instruction setting pc and cpsr simultaneously,
  2960. // though we use "subs pc, lr, #N").
  2961. //
  2962. // M-class CPUs actually use a normal return sequence with a special
  2963. // (hardware-provided) value in LR, so the normal code path works.
  2964. if (DAG.getMachineFunction().getFunction().hasFnAttribute("interrupt") &&
  2965. !Subtarget->isMClass()) {
  2966. if (Subtarget->isThumb1Only())
  2967. report_fatal_error("interrupt attribute is not supported in Thumb1");
  2968. return LowerInterruptReturn(RetOps, dl, DAG);
  2969. }
  2970. ARMISD::NodeType RetNode = AFI->isCmseNSEntryFunction() ? ARMISD::SERET_FLAG :
  2971. ARMISD::RET_FLAG;
  2972. return DAG.getNode(RetNode, dl, MVT::Other, RetOps);
  2973. }
  2974. bool ARMTargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const {
  2975. if (N->getNumValues() != 1)
  2976. return false;
  2977. if (!N->hasNUsesOfValue(1, 0))
  2978. return false;
  2979. SDValue TCChain = Chain;
  2980. SDNode *Copy = *N->use_begin();
  2981. if (Copy->getOpcode() == ISD::CopyToReg) {
  2982. // If the copy has a glue operand, we conservatively assume it isn't safe to
  2983. // perform a tail call.
  2984. if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue)
  2985. return false;
  2986. TCChain = Copy->getOperand(0);
  2987. } else if (Copy->getOpcode() == ARMISD::VMOVRRD) {
  2988. SDNode *VMov = Copy;
  2989. // f64 returned in a pair of GPRs.
  2990. SmallPtrSet<SDNode*, 2> Copies;
  2991. for (SDNode *U : VMov->uses()) {
  2992. if (U->getOpcode() != ISD::CopyToReg)
  2993. return false;
  2994. Copies.insert(U);
  2995. }
  2996. if (Copies.size() > 2)
  2997. return false;
  2998. for (SDNode *U : VMov->uses()) {
  2999. SDValue UseChain = U->getOperand(0);
  3000. if (Copies.count(UseChain.getNode()))
  3001. // Second CopyToReg
  3002. Copy = U;
  3003. else {
  3004. // We are at the top of this chain.
  3005. // If the copy has a glue operand, we conservatively assume it
  3006. // isn't safe to perform a tail call.
  3007. if (U->getOperand(U->getNumOperands() - 1).getValueType() == MVT::Glue)
  3008. return false;
  3009. // First CopyToReg
  3010. TCChain = UseChain;
  3011. }
  3012. }
  3013. } else if (Copy->getOpcode() == ISD::BITCAST) {
  3014. // f32 returned in a single GPR.
  3015. if (!Copy->hasOneUse())
  3016. return false;
  3017. Copy = *Copy->use_begin();
  3018. if (Copy->getOpcode() != ISD::CopyToReg || !Copy->hasNUsesOfValue(1, 0))
  3019. return false;
  3020. // If the copy has a glue operand, we conservatively assume it isn't safe to
  3021. // perform a tail call.
  3022. if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue)
  3023. return false;
  3024. TCChain = Copy->getOperand(0);
  3025. } else {
  3026. return false;
  3027. }
  3028. bool HasRet = false;
  3029. for (const SDNode *U : Copy->uses()) {
  3030. if (U->getOpcode() != ARMISD::RET_FLAG &&
  3031. U->getOpcode() != ARMISD::INTRET_FLAG)
  3032. return false;
  3033. HasRet = true;
  3034. }
  3035. if (!HasRet)
  3036. return false;
  3037. Chain = TCChain;
  3038. return true;
  3039. }
  3040. bool ARMTargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
  3041. if (!Subtarget->supportsTailCall())
  3042. return false;
  3043. if (!CI->isTailCall())
  3044. return false;
  3045. return true;
  3046. }
// Writing a 64-bit value, so we need to split it into two 32-bit values
// first, and pass the low and high parts through.
  3049. static SDValue LowerWRITE_REGISTER(SDValue Op, SelectionDAG &DAG) {
  3050. SDLoc DL(Op);
  3051. SDValue WriteValue = Op->getOperand(2);
// This function is only supposed to be called for an i64 argument.
  3053. assert(WriteValue.getValueType() == MVT::i64
  3054. && "LowerWRITE_REGISTER called for non-i64 type argument.");
  3055. SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, WriteValue,
  3056. DAG.getConstant(0, DL, MVT::i32));
  3057. SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, WriteValue,
  3058. DAG.getConstant(1, DL, MVT::i32));
  3059. SDValue Ops[] = { Op->getOperand(0), Op->getOperand(1), Lo, Hi };
  3060. return DAG.getNode(ISD::WRITE_REGISTER, DL, MVT::Other, Ops);
  3061. }
// ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as
// their target counterpart wrapped in the ARMISD::Wrapper node. Suppose N is
// one of the above-mentioned nodes. It has to be wrapped because otherwise
// Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only
// be used to form an addressing mode. These wrapped nodes will be selected
// into MOVi.
  3068. SDValue ARMTargetLowering::LowerConstantPool(SDValue Op,
  3069. SelectionDAG &DAG) const {
  3070. EVT PtrVT = Op.getValueType();
  3071. // FIXME there is no actual debug info here
  3072. SDLoc dl(Op);
  3073. ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
  3074. SDValue Res;
// When generating execute-only code, constant pools must be promoted to the
// global data section. It's a bit ugly that we can't share them across basic
// blocks, but this way we guarantee that execute-only behaves correctly with
// position-independent addressing modes.
  3079. if (Subtarget->genExecuteOnly()) {
  3080. auto AFI = DAG.getMachineFunction().getInfo<ARMFunctionInfo>();
  3081. auto T = const_cast<Type*>(CP->getType());
  3082. auto C = const_cast<Constant*>(CP->getConstVal());
  3083. auto M = const_cast<Module*>(DAG.getMachineFunction().
  3084. getFunction().getParent());
  3085. auto GV = new GlobalVariable(
  3086. *M, T, /*isConstant=*/true, GlobalVariable::InternalLinkage, C,
  3087. Twine(DAG.getDataLayout().getPrivateGlobalPrefix()) + "CP" +
  3088. Twine(DAG.getMachineFunction().getFunctionNumber()) + "_" +
  3089. Twine(AFI->createPICLabelUId())
  3090. );
  3091. SDValue GA = DAG.getTargetGlobalAddress(dyn_cast<GlobalValue>(GV),
  3092. dl, PtrVT);
  3093. return LowerGlobalAddress(GA, DAG);
  3094. }
  3095. if (CP->isMachineConstantPoolEntry())
  3096. Res =
  3097. DAG.getTargetConstantPool(CP->getMachineCPVal(), PtrVT, CP->getAlign());
  3098. else
  3099. Res = DAG.getTargetConstantPool(CP->getConstVal(), PtrVT, CP->getAlign());
  3100. return DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Res);
  3101. }
  3102. unsigned ARMTargetLowering::getJumpTableEncoding() const {
  3103. return MachineJumpTableInfo::EK_Inline;
  3104. }
  3105. SDValue ARMTargetLowering::LowerBlockAddress(SDValue Op,
  3106. SelectionDAG &DAG) const {
  3107. MachineFunction &MF = DAG.getMachineFunction();
  3108. ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
  3109. unsigned ARMPCLabelIndex = 0;
  3110. SDLoc DL(Op);
  3111. EVT PtrVT = getPointerTy(DAG.getDataLayout());
  3112. const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
  3113. SDValue CPAddr;
  3114. bool IsPositionIndependent = isPositionIndependent() || Subtarget->isROPI();
  3115. if (!IsPositionIndependent) {
  3116. CPAddr = DAG.getTargetConstantPool(BA, PtrVT, Align(4));
  3117. } else {
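// On ARM the PC reads as the instruction address plus 8; in Thumb it reads
// as the address plus 4, hence the PC adjustment below.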
  3118. unsigned PCAdj = Subtarget->isThumb() ? 4 : 8;
  3119. ARMPCLabelIndex = AFI->createPICLabelUId();
  3120. ARMConstantPoolValue *CPV =
  3121. ARMConstantPoolConstant::Create(BA, ARMPCLabelIndex,
  3122. ARMCP::CPBlockAddress, PCAdj);
  3123. CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, Align(4));
  3124. }
  3125. CPAddr = DAG.getNode(ARMISD::Wrapper, DL, PtrVT, CPAddr);
  3126. SDValue Result = DAG.getLoad(
  3127. PtrVT, DL, DAG.getEntryNode(), CPAddr,
  3128. MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
  3129. if (!IsPositionIndependent)
  3130. return Result;
  3131. SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, DL, MVT::i32);
  3132. return DAG.getNode(ARMISD::PIC_ADD, DL, PtrVT, Result, PICLabel);
  3133. }
  3134. /// Convert a TLS address reference into the correct sequence of loads
  3135. /// and calls to compute the variable's address for Darwin, and return an
  3136. /// SDValue containing the final node.
  3137. /// Darwin only has one TLS scheme which must be capable of dealing with the
  3138. /// fully general situation, in the worst case. This means:
  3139. /// + "extern __thread" declaration.
  3140. /// + Defined in a possibly unknown dynamic library.
  3141. ///
  3142. /// The general system is that each __thread variable has a [3 x i32] descriptor
  3143. /// which contains information used by the runtime to calculate the address. The
  3144. /// only part of this the compiler needs to know about is the first word, which
  3145. /// contains a function pointer that must be called with the address of the
  3146. /// entire descriptor in "r0".
  3147. ///
  3148. /// Since this descriptor may be in a different unit, in general access must
  3149. /// proceed along the usual ARM rules. A common sequence to produce is:
  3150. ///
  3151. /// movw rT1, :lower16:_var$non_lazy_ptr
  3152. /// movt rT1, :upper16:_var$non_lazy_ptr
  3153. /// ldr r0, [rT1]
  3154. /// ldr rT2, [r0]
  3155. /// blx rT2
  3156. /// [...address now in r0...]
  3157. SDValue
  3158. ARMTargetLowering::LowerGlobalTLSAddressDarwin(SDValue Op,
  3159. SelectionDAG &DAG) const {
  3160. assert(Subtarget->isTargetDarwin() &&
  3161. "This function expects a Darwin target");
  3162. SDLoc DL(Op);
// The first step is to get the address of the actual global symbol. This is
// where the TLS descriptor lives.
  3165. SDValue DescAddr = LowerGlobalAddressDarwin(Op, DAG);
  3166. // The first entry in the descriptor is a function pointer that we must call
  3167. // to obtain the address of the variable.
  3168. SDValue Chain = DAG.getEntryNode();
  3169. SDValue FuncTLVGet = DAG.getLoad(
  3170. MVT::i32, DL, Chain, DescAddr,
  3171. MachinePointerInfo::getGOT(DAG.getMachineFunction()), Align(4),
  3172. MachineMemOperand::MONonTemporal | MachineMemOperand::MODereferenceable |
  3173. MachineMemOperand::MOInvariant);
  3174. Chain = FuncTLVGet.getValue(1);
  3175. MachineFunction &F = DAG.getMachineFunction();
  3176. MachineFrameInfo &MFI = F.getFrameInfo();
  3177. MFI.setAdjustsStack(true);
  3178. // TLS calls preserve all registers except those that absolutely must be
  3179. // trashed: R0 (it takes an argument), LR (it's a call) and CPSR (let's not be
  3180. // silly).
  3181. auto TRI =
  3182. getTargetMachine().getSubtargetImpl(F.getFunction())->getRegisterInfo();
  3183. auto ARI = static_cast<const ARMRegisterInfo *>(TRI);
  3184. const uint32_t *Mask = ARI->getTLSCallPreservedMask(DAG.getMachineFunction());
// Finally, we can make the call. This is just a degenerate version of a
// normal ARM call node: r0 takes the address of the descriptor, and
// returns the address of the variable in this thread.
  3188. Chain = DAG.getCopyToReg(Chain, DL, ARM::R0, DescAddr, SDValue());
  3189. Chain =
  3190. DAG.getNode(ARMISD::CALL, DL, DAG.getVTList(MVT::Other, MVT::Glue),
  3191. Chain, FuncTLVGet, DAG.getRegister(ARM::R0, MVT::i32),
  3192. DAG.getRegisterMask(Mask), Chain.getValue(1));
  3193. return DAG.getCopyFromReg(Chain, DL, ARM::R0, MVT::i32, Chain.getValue(1));
  3194. }
  3195. SDValue
  3196. ARMTargetLowering::LowerGlobalTLSAddressWindows(SDValue Op,
  3197. SelectionDAG &DAG) const {
  3198. assert(Subtarget->isTargetWindows() && "Windows specific TLS lowering");
  3199. SDValue Chain = DAG.getEntryNode();
  3200. EVT PtrVT = getPointerTy(DAG.getDataLayout());
  3201. SDLoc DL(Op);
  3202. // Load the current TEB (thread environment block)
  3203. SDValue Ops[] = {Chain,
  3204. DAG.getTargetConstant(Intrinsic::arm_mrc, DL, MVT::i32),
  3205. DAG.getTargetConstant(15, DL, MVT::i32),
  3206. DAG.getTargetConstant(0, DL, MVT::i32),
  3207. DAG.getTargetConstant(13, DL, MVT::i32),
  3208. DAG.getTargetConstant(0, DL, MVT::i32),
  3209. DAG.getTargetConstant(2, DL, MVT::i32)};
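// This encodes "mrc p15, #0, rX, c13, c0, #2", a read of TPIDRURW, which
// Windows uses to hold the current thread's TEB pointer.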
  3210. SDValue CurrentTEB = DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL,
  3211. DAG.getVTList(MVT::i32, MVT::Other), Ops);
  3212. SDValue TEB = CurrentTEB.getValue(0);
  3213. Chain = CurrentTEB.getValue(1);
  3214. // Load the ThreadLocalStoragePointer from the TEB
  3215. // A pointer to the TLS array is located at offset 0x2c from the TEB.
  3216. SDValue TLSArray =
  3217. DAG.getNode(ISD::ADD, DL, PtrVT, TEB, DAG.getIntPtrConstant(0x2c, DL));
  3218. TLSArray = DAG.getLoad(PtrVT, DL, Chain, TLSArray, MachinePointerInfo());
// The pointer to the thread's TLS data area is stored at offset
// TLSIndex * 4 into the TLS array.
// Load the TLS index from the C runtime.
  3222. SDValue TLSIndex =
  3223. DAG.getTargetExternalSymbol("_tls_index", PtrVT, ARMII::MO_NO_FLAG);
  3224. TLSIndex = DAG.getNode(ARMISD::Wrapper, DL, PtrVT, TLSIndex);
  3225. TLSIndex = DAG.getLoad(PtrVT, DL, Chain, TLSIndex, MachinePointerInfo());
  3226. SDValue Slot = DAG.getNode(ISD::SHL, DL, PtrVT, TLSIndex,
  3227. DAG.getConstant(2, DL, MVT::i32));
  3228. SDValue TLS = DAG.getLoad(PtrVT, DL, Chain,
  3229. DAG.getNode(ISD::ADD, DL, PtrVT, TLSArray, Slot),
  3230. MachinePointerInfo());
  3231. // Get the offset of the start of the .tls section (section base)
  3232. const auto *GA = cast<GlobalAddressSDNode>(Op);
  3233. auto *CPV = ARMConstantPoolConstant::Create(GA->getGlobal(), ARMCP::SECREL);
  3234. SDValue Offset = DAG.getLoad(
  3235. PtrVT, DL, Chain,
  3236. DAG.getNode(ARMISD::Wrapper, DL, MVT::i32,
  3237. DAG.getTargetConstantPool(CPV, PtrVT, Align(4))),
  3238. MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
  3239. return DAG.getNode(ISD::ADD, DL, PtrVT, TLS, Offset);
  3240. }
  3241. // Lower ISD::GlobalTLSAddress using the "general dynamic" model
  3242. SDValue
  3243. ARMTargetLowering::LowerToTLSGeneralDynamicModel(GlobalAddressSDNode *GA,
  3244. SelectionDAG &DAG) const {
  3245. SDLoc dl(GA);
  3246. EVT PtrVT = getPointerTy(DAG.getDataLayout());
  3247. unsigned char PCAdj = Subtarget->isThumb() ? 4 : 8;
  3248. MachineFunction &MF = DAG.getMachineFunction();
  3249. ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
  3250. unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
  3251. ARMConstantPoolValue *CPV =
  3252. ARMConstantPoolConstant::Create(GA->getGlobal(), ARMPCLabelIndex,
  3253. ARMCP::CPValue, PCAdj, ARMCP::TLSGD, true);
  3254. SDValue Argument = DAG.getTargetConstantPool(CPV, PtrVT, Align(4));
  3255. Argument = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Argument);
  3256. Argument = DAG.getLoad(
  3257. PtrVT, dl, DAG.getEntryNode(), Argument,
  3258. MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
  3259. SDValue Chain = Argument.getValue(1);
  3260. SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32);
  3261. Argument = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVT, Argument, PICLabel);
  3262. // call __tls_get_addr.
  3263. ArgListTy Args;
  3264. ArgListEntry Entry;
  3265. Entry.Node = Argument;
  3266. Entry.Ty = (Type *) Type::getInt32Ty(*DAG.getContext());
  3267. Args.push_back(Entry);
  3268. // FIXME: is there useful debug info available here?
  3269. TargetLowering::CallLoweringInfo CLI(DAG);
  3270. CLI.setDebugLoc(dl).setChain(Chain).setLibCallee(
  3271. CallingConv::C, Type::getInt32Ty(*DAG.getContext()),
  3272. DAG.getExternalSymbol("__tls_get_addr", PtrVT), std::move(Args));
  3273. std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
  3274. return CallResult.first;
  3275. }
  3276. // Lower ISD::GlobalTLSAddress using the "initial exec" or
  3277. // "local exec" model.
  3278. SDValue
  3279. ARMTargetLowering::LowerToTLSExecModels(GlobalAddressSDNode *GA,
  3280. SelectionDAG &DAG,
  3281. TLSModel::Model model) const {
  3282. const GlobalValue *GV = GA->getGlobal();
  3283. SDLoc dl(GA);
  3284. SDValue Offset;
  3285. SDValue Chain = DAG.getEntryNode();
  3286. EVT PtrVT = getPointerTy(DAG.getDataLayout());
  3287. // Get the Thread Pointer
  3288. SDValue ThreadPointer = DAG.getNode(ARMISD::THREAD_POINTER, dl, PtrVT);
  3289. if (model == TLSModel::InitialExec) {
  3290. MachineFunction &MF = DAG.getMachineFunction();
  3291. ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
  3292. unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
  3293. // Initial exec model.
  3294. unsigned char PCAdj = Subtarget->isThumb() ? 4 : 8;
  3295. ARMConstantPoolValue *CPV =
  3296. ARMConstantPoolConstant::Create(GA->getGlobal(), ARMPCLabelIndex,
  3297. ARMCP::CPValue, PCAdj, ARMCP::GOTTPOFF,
  3298. true);
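// The literal resolves, PC-relatively, to the GOT slot for this symbol; the
// first load plus the PIC add forms the slot's address, and the second load
// reads the variable's offset from the thread pointer out of that slot.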
  3299. Offset = DAG.getTargetConstantPool(CPV, PtrVT, Align(4));
  3300. Offset = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Offset);
  3301. Offset = DAG.getLoad(
  3302. PtrVT, dl, Chain, Offset,
  3303. MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
  3304. Chain = Offset.getValue(1);
  3305. SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32);
  3306. Offset = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVT, Offset, PICLabel);
  3307. Offset = DAG.getLoad(
  3308. PtrVT, dl, Chain, Offset,
  3309. MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
  3310. } else {
  3311. // local exec model
  3312. assert(model == TLSModel::LocalExec);
  3313. ARMConstantPoolValue *CPV =
  3314. ARMConstantPoolConstant::Create(GV, ARMCP::TPOFF);
  3315. Offset = DAG.getTargetConstantPool(CPV, PtrVT, Align(4));
  3316. Offset = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Offset);
  3317. Offset = DAG.getLoad(
  3318. PtrVT, dl, Chain, Offset,
  3319. MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
  3320. }
// The address of the thread-local variable is the sum of the thread
// pointer and the variable's offset.
  3323. return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset);
  3324. }
  3325. SDValue
  3326. ARMTargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
  3327. GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
  3328. if (DAG.getTarget().useEmulatedTLS())
  3329. return LowerToTLSEmulatedModel(GA, DAG);
  3330. if (Subtarget->isTargetDarwin())
  3331. return LowerGlobalTLSAddressDarwin(Op, DAG);
  3332. if (Subtarget->isTargetWindows())
  3333. return LowerGlobalTLSAddressWindows(Op, DAG);
  3334. // TODO: implement the "local dynamic" model
  3335. assert(Subtarget->isTargetELF() && "Only ELF implemented here");
  3336. TLSModel::Model model = getTargetMachine().getTLSModel(GA->getGlobal());
  3337. switch (model) {
  3338. case TLSModel::GeneralDynamic:
  3339. case TLSModel::LocalDynamic:
  3340. return LowerToTLSGeneralDynamicModel(GA, DAG);
  3341. case TLSModel::InitialExec:
  3342. case TLSModel::LocalExec:
  3343. return LowerToTLSExecModels(GA, DAG, model);
  3344. }
  3345. llvm_unreachable("bogus TLS model");
  3346. }
  3347. /// Return true if all users of V are within function F, looking through
  3348. /// ConstantExprs.
  3349. static bool allUsersAreInFunction(const Value *V, const Function *F) {
  3350. SmallVector<const User*,4> Worklist(V->users());
  3351. while (!Worklist.empty()) {
  3352. auto *U = Worklist.pop_back_val();
  3353. if (isa<ConstantExpr>(U)) {
  3354. append_range(Worklist, U->users());
  3355. continue;
  3356. }
  3357. auto *I = dyn_cast<Instruction>(U);
  3358. if (!I || I->getParent()->getParent() != F)
  3359. return false;
  3360. }
  3361. return true;
  3362. }
  3363. static SDValue promoteToConstantPool(const ARMTargetLowering *TLI,
  3364. const GlobalValue *GV, SelectionDAG &DAG,
  3365. EVT PtrVT, const SDLoc &dl) {
  3366. // If we're creating a pool entry for a constant global with unnamed address,
  3367. // and the global is small enough, we can emit it inline into the constant pool
  3368. // to save ourselves an indirection.
  3369. //
  3370. // This is a win if the constant is only used in one function (so it doesn't
  3371. // need to be duplicated) or duplicating the constant wouldn't increase code
  3372. // size (implying the constant is no larger than 4 bytes).
  3373. const Function &F = DAG.getMachineFunction().getFunction();
// We rely on this decision to inline being idempotent and unrelated to the
// use-site. We know that if we inline a variable at one use site, we'll
// inline it elsewhere too (and reuse the constant pool entry). Fast-isel
// doesn't know about this optimization, so bail out if it's enabled;
// otherwise we could decide to inline here (and thus never emit the GV) but
// require the GV from fast-isel generated code.
  3380. if (!EnableConstpoolPromotion ||
  3381. DAG.getMachineFunction().getTarget().Options.EnableFastISel)
  3382. return SDValue();
  3383. auto *GVar = dyn_cast<GlobalVariable>(GV);
  3384. if (!GVar || !GVar->hasInitializer() ||
  3385. !GVar->isConstant() || !GVar->hasGlobalUnnamedAddr() ||
  3386. !GVar->hasLocalLinkage())
  3387. return SDValue();
  3388. // If we inline a value that contains relocations, we move the relocations
  3389. // from .data to .text. This is not allowed in position-independent code.
  3390. auto *Init = GVar->getInitializer();
  3391. if ((TLI->isPositionIndependent() || TLI->getSubtarget()->isROPI()) &&
  3392. Init->needsDynamicRelocation())
  3393. return SDValue();
// The constant islands pass can only really deal with alignment requests
// <= 4 bytes and cannot pad constants itself. Therefore we cannot promote
// any type wanting greater alignment requirements than 4 bytes. We also
// can only promote constants that are multiples of 4 bytes in size or
// are paddable to a multiple of 4. Currently we only try to pad constants
// that are strings for simplicity.
  3400. auto *CDAInit = dyn_cast<ConstantDataArray>(Init);
  3401. unsigned Size = DAG.getDataLayout().getTypeAllocSize(Init->getType());
  3402. Align PrefAlign = DAG.getDataLayout().getPreferredAlign(GVar);
  3403. unsigned RequiredPadding = 4 - (Size % 4);
  3404. bool PaddingPossible =
  3405. RequiredPadding == 4 || (CDAInit && CDAInit->isString());
  3406. if (!PaddingPossible || PrefAlign > 4 || Size > ConstpoolPromotionMaxSize ||
  3407. Size == 0)
  3408. return SDValue();
  3409. unsigned PaddedSize = Size + ((RequiredPadding == 4) ? 0 : RequiredPadding);
  3410. MachineFunction &MF = DAG.getMachineFunction();
  3411. ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
  3412. // We can't bloat the constant pool too much, else the ConstantIslands pass
  3413. // may fail to converge. If we haven't promoted this global yet (it may have
  3414. // multiple uses), and promoting it would increase the constant pool size (Sz
  3415. // > 4), ensure we have space to do so up to MaxTotal.
  3416. if (!AFI->getGlobalsPromotedToConstantPool().count(GVar) && Size > 4)
  3417. if (AFI->getPromotedConstpoolIncrease() + PaddedSize - 4 >=
  3418. ConstpoolPromotionMaxTotal)
  3419. return SDValue();
  3420. // This is only valid if all users are in a single function; we can't clone
  3421. // the constant in general. The LLVM IR unnamed_addr allows merging
  3422. // constants, but not cloning them.
  3423. //
  3424. // We could potentially allow cloning if we could prove all uses of the
  3425. // constant in the current function don't care about the address, like
  3426. // printf format strings. But that isn't implemented for now.
  3427. if (!allUsersAreInFunction(GVar, &F))
  3428. return SDValue();
  3429. // We're going to inline this global. Pad it out if needed.
  3430. if (RequiredPadding != 4) {
  3431. StringRef S = CDAInit->getAsString();
  3432. SmallVector<uint8_t,16> V(S.size());
  3433. std::copy(S.bytes_begin(), S.bytes_end(), V.begin());
  3434. while (RequiredPadding--)
  3435. V.push_back(0);
  3436. Init = ConstantDataArray::get(*DAG.getContext(), V);
  3437. }
  3438. auto CPVal = ARMConstantPoolConstant::Create(GVar, Init);
  3439. SDValue CPAddr = DAG.getTargetConstantPool(CPVal, PtrVT, Align(4));
  3440. if (!AFI->getGlobalsPromotedToConstantPool().count(GVar)) {
  3441. AFI->markGlobalAsPromotedToConstantPool(GVar);
  3442. AFI->setPromotedConstpoolIncrease(AFI->getPromotedConstpoolIncrease() +
  3443. PaddedSize - 4);
  3444. }
  3445. ++NumConstpoolPromoted;
  3446. return DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
  3447. }
  3448. bool ARMTargetLowering::isReadOnly(const GlobalValue *GV) const {
  3449. if (const GlobalAlias *GA = dyn_cast<GlobalAlias>(GV))
  3450. if (!(GV = GA->getAliaseeObject()))
  3451. return false;
  3452. if (const auto *V = dyn_cast<GlobalVariable>(GV))
  3453. return V->isConstant();
  3454. return isa<Function>(GV);
  3455. }
  3456. SDValue ARMTargetLowering::LowerGlobalAddress(SDValue Op,
  3457. SelectionDAG &DAG) const {
  3458. switch (Subtarget->getTargetTriple().getObjectFormat()) {
  3459. default: llvm_unreachable("unknown object format");
  3460. case Triple::COFF:
  3461. return LowerGlobalAddressWindows(Op, DAG);
  3462. case Triple::ELF:
  3463. return LowerGlobalAddressELF(Op, DAG);
  3464. case Triple::MachO:
  3465. return LowerGlobalAddressDarwin(Op, DAG);
  3466. }
  3467. }
  3468. SDValue ARMTargetLowering::LowerGlobalAddressELF(SDValue Op,
  3469. SelectionDAG &DAG) const {
  3470. EVT PtrVT = getPointerTy(DAG.getDataLayout());
  3471. SDLoc dl(Op);
  3472. const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
  3473. const TargetMachine &TM = getTargetMachine();
  3474. bool IsRO = isReadOnly(GV);
// Promote to a constant pool entry only if we are not generating an
// execute-only (XO) text section.
  3476. if (TM.shouldAssumeDSOLocal(*GV->getParent(), GV) && !Subtarget->genExecuteOnly())
  3477. if (SDValue V = promoteToConstantPool(this, GV, DAG, PtrVT, dl))
  3478. return V;
  3479. if (isPositionIndependent()) {
  3480. bool UseGOT_PREL = !TM.shouldAssumeDSOLocal(*GV->getParent(), GV);
  3481. SDValue G = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0,
  3482. UseGOT_PREL ? ARMII::MO_GOT : 0);
  3483. SDValue Result = DAG.getNode(ARMISD::WrapperPIC, dl, PtrVT, G);
  3484. if (UseGOT_PREL)
  3485. Result =
  3486. DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Result,
  3487. MachinePointerInfo::getGOT(DAG.getMachineFunction()));
  3488. return Result;
  3489. } else if (Subtarget->isROPI() && IsRO) {
  3490. // PC-relative.
  3491. SDValue G = DAG.getTargetGlobalAddress(GV, dl, PtrVT);
  3492. SDValue Result = DAG.getNode(ARMISD::WrapperPIC, dl, PtrVT, G);
  3493. return Result;
  3494. } else if (Subtarget->isRWPI() && !IsRO) {
  3495. // SB-relative.
  3496. SDValue RelAddr;
  3497. if (Subtarget->useMovt()) {
  3498. ++NumMovwMovt;
  3499. SDValue G = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, ARMII::MO_SBREL);
  3500. RelAddr = DAG.getNode(ARMISD::Wrapper, dl, PtrVT, G);
  3501. } else { // use literal pool for address constant
  3502. ARMConstantPoolValue *CPV =
  3503. ARMConstantPoolConstant::Create(GV, ARMCP::SBREL);
  3504. SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, Align(4));
  3505. CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
  3506. RelAddr = DAG.getLoad(
  3507. PtrVT, dl, DAG.getEntryNode(), CPAddr,
  3508. MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
  3509. }
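// Under RWPI, r9 is the static base (SB) register; writable data is
// addressed as an offset from it.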
  3510. SDValue SB = DAG.getCopyFromReg(DAG.getEntryNode(), dl, ARM::R9, PtrVT);
  3511. SDValue Result = DAG.getNode(ISD::ADD, dl, PtrVT, SB, RelAddr);
  3512. return Result;
  3513. }
// If we have T2 ops, we can materialize the address directly via a
// movw/movt pair. This is always cheaper than a constant-pool load.
  3516. if (Subtarget->useMovt()) {
  3517. ++NumMovwMovt;
  3518. // FIXME: Once remat is capable of dealing with instructions with register
  3519. // operands, expand this into two nodes.
  3520. return DAG.getNode(ARMISD::Wrapper, dl, PtrVT,
  3521. DAG.getTargetGlobalAddress(GV, dl, PtrVT));
  3522. } else {
  3523. SDValue CPAddr = DAG.getTargetConstantPool(GV, PtrVT, Align(4));
  3524. CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
  3525. return DAG.getLoad(
  3526. PtrVT, dl, DAG.getEntryNode(), CPAddr,
  3527. MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
  3528. }
  3529. }
  3530. SDValue ARMTargetLowering::LowerGlobalAddressDarwin(SDValue Op,
  3531. SelectionDAG &DAG) const {
  3532. assert(!Subtarget->isROPI() && !Subtarget->isRWPI() &&
  3533. "ROPI/RWPI not currently supported for Darwin");
  3534. EVT PtrVT = getPointerTy(DAG.getDataLayout());
  3535. SDLoc dl(Op);
  3536. const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
  3537. if (Subtarget->useMovt())
  3538. ++NumMovwMovt;
  3539. // FIXME: Once remat is capable of dealing with instructions with register
  3540. // operands, expand this into multiple nodes
  3541. unsigned Wrapper =
  3542. isPositionIndependent() ? ARMISD::WrapperPIC : ARMISD::Wrapper;
  3543. SDValue G = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, ARMII::MO_NONLAZY);
  3544. SDValue Result = DAG.getNode(Wrapper, dl, PtrVT, G);
  3545. if (Subtarget->isGVIndirectSymbol(GV))
  3546. Result = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Result,
  3547. MachinePointerInfo::getGOT(DAG.getMachineFunction()));
  3548. return Result;
  3549. }
  3550. SDValue ARMTargetLowering::LowerGlobalAddressWindows(SDValue Op,
  3551. SelectionDAG &DAG) const {
  3552. assert(Subtarget->isTargetWindows() && "non-Windows COFF is not supported");
  3553. assert(Subtarget->useMovt() &&
  3554. "Windows on ARM expects to use movw/movt");
  3555. assert(!Subtarget->isROPI() && !Subtarget->isRWPI() &&
  3556. "ROPI/RWPI not currently supported for Windows");
  3557. const TargetMachine &TM = getTargetMachine();
  3558. const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
  3559. ARMII::TOF TargetFlags = ARMII::MO_NO_FLAG;
  3560. if (GV->hasDLLImportStorageClass())
  3561. TargetFlags = ARMII::MO_DLLIMPORT;
  3562. else if (!TM.shouldAssumeDSOLocal(*GV->getParent(), GV))
  3563. TargetFlags = ARMII::MO_COFFSTUB;
  3564. EVT PtrVT = getPointerTy(DAG.getDataLayout());
  3565. SDValue Result;
  3566. SDLoc DL(Op);
  3567. ++NumMovwMovt;
  3568. // FIXME: Once remat is capable of dealing with instructions with register
  3569. // operands, expand this into two nodes.
  3570. Result = DAG.getNode(ARMISD::Wrapper, DL, PtrVT,
  3571. DAG.getTargetGlobalAddress(GV, DL, PtrVT, /*offset=*/0,
  3572. TargetFlags));
  3573. if (TargetFlags & (ARMII::MO_DLLIMPORT | ARMII::MO_COFFSTUB))
  3574. Result = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Result,
  3575. MachinePointerInfo::getGOT(DAG.getMachineFunction()));
  3576. return Result;
  3577. }
  3578. SDValue
  3579. ARMTargetLowering::LowerEH_SJLJ_SETJMP(SDValue Op, SelectionDAG &DAG) const {
  3580. SDLoc dl(Op);
  3581. SDValue Val = DAG.getConstant(0, dl, MVT::i32);
  3582. return DAG.getNode(ARMISD::EH_SJLJ_SETJMP, dl,
  3583. DAG.getVTList(MVT::i32, MVT::Other), Op.getOperand(0),
  3584. Op.getOperand(1), Val);
  3585. }
  3586. SDValue
  3587. ARMTargetLowering::LowerEH_SJLJ_LONGJMP(SDValue Op, SelectionDAG &DAG) const {
  3588. SDLoc dl(Op);
  3589. return DAG.getNode(ARMISD::EH_SJLJ_LONGJMP, dl, MVT::Other, Op.getOperand(0),
  3590. Op.getOperand(1), DAG.getConstant(0, dl, MVT::i32));
  3591. }
  3592. SDValue ARMTargetLowering::LowerEH_SJLJ_SETUP_DISPATCH(SDValue Op,
  3593. SelectionDAG &DAG) const {
  3594. SDLoc dl(Op);
  3595. return DAG.getNode(ARMISD::EH_SJLJ_SETUP_DISPATCH, dl, MVT::Other,
  3596. Op.getOperand(0));
  3597. }
  3598. SDValue ARMTargetLowering::LowerINTRINSIC_VOID(
  3599. SDValue Op, SelectionDAG &DAG, const ARMSubtarget *Subtarget) const {
  3600. unsigned IntNo =
  3601. cast<ConstantSDNode>(
  3602. Op.getOperand(Op.getOperand(0).getValueType() == MVT::Other))
  3603. ->getZExtValue();
  3604. switch (IntNo) {
  3605. default:
  3606. return SDValue(); // Don't custom lower most intrinsics.
  3607. case Intrinsic::arm_gnu_eabi_mcount: {
  3608. MachineFunction &MF = DAG.getMachineFunction();
  3609. EVT PtrVT = getPointerTy(DAG.getDataLayout());
  3610. SDLoc dl(Op);
  3611. SDValue Chain = Op.getOperand(0);
  3612. // call "\01__gnu_mcount_nc"
  3613. const ARMBaseRegisterInfo *ARI = Subtarget->getRegisterInfo();
  3614. const uint32_t *Mask =
  3615. ARI->getCallPreservedMask(DAG.getMachineFunction(), CallingConv::C);
  3616. assert(Mask && "Missing call preserved mask for calling convention");
// Mark LR as an implicit live-in.
  3618. Register Reg = MF.addLiveIn(ARM::LR, getRegClassFor(MVT::i32));
  3619. SDValue ReturnAddress =
  3620. DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg, PtrVT);
  3621. constexpr EVT ResultTys[] = {MVT::Other, MVT::Glue};
  3622. SDValue Callee =
  3623. DAG.getTargetExternalSymbol("\01__gnu_mcount_nc", PtrVT, 0);
  3624. SDValue RegisterMask = DAG.getRegisterMask(Mask);
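// __gnu_mcount_nc expects the caller's return address on top of the stack,
// so the BL_PUSHLR / tBL_PUSHLR pseudos push LR before making the call.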
  3625. if (Subtarget->isThumb())
  3626. return SDValue(
  3627. DAG.getMachineNode(
  3628. ARM::tBL_PUSHLR, dl, ResultTys,
  3629. {ReturnAddress, DAG.getTargetConstant(ARMCC::AL, dl, PtrVT),
  3630. DAG.getRegister(0, PtrVT), Callee, RegisterMask, Chain}),
  3631. 0);
  3632. return SDValue(
  3633. DAG.getMachineNode(ARM::BL_PUSHLR, dl, ResultTys,
  3634. {ReturnAddress, Callee, RegisterMask, Chain}),
  3635. 0);
  3636. }
  3637. }
  3638. }
  3639. SDValue
  3640. ARMTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG,
  3641. const ARMSubtarget *Subtarget) const {
  3642. unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
  3643. SDLoc dl(Op);
  3644. switch (IntNo) {
  3645. default: return SDValue(); // Don't custom lower most intrinsics.
  3646. case Intrinsic::thread_pointer: {
  3647. EVT PtrVT = getPointerTy(DAG.getDataLayout());
  3648. return DAG.getNode(ARMISD::THREAD_POINTER, dl, PtrVT);
  3649. }
  3650. case Intrinsic::arm_cls: {
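// cls(x) is computed as ctlz(((x ^ (x >> 31)) << 1) | 1): XORing x with its
// sign-extension turns the leading sign bits into leading zeroes, and the
// shift-and-OR drops the sign bit itself from the count while keeping the
// CTLZ input nonzero.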
  3651. const SDValue &Operand = Op.getOperand(1);
  3652. const EVT VTy = Op.getValueType();
  3653. SDValue SRA =
  3654. DAG.getNode(ISD::SRA, dl, VTy, Operand, DAG.getConstant(31, dl, VTy));
  3655. SDValue XOR = DAG.getNode(ISD::XOR, dl, VTy, SRA, Operand);
  3656. SDValue SHL =
  3657. DAG.getNode(ISD::SHL, dl, VTy, XOR, DAG.getConstant(1, dl, VTy));
  3658. SDValue OR =
  3659. DAG.getNode(ISD::OR, dl, VTy, SHL, DAG.getConstant(1, dl, VTy));
  3660. SDValue Result = DAG.getNode(ISD::CTLZ, dl, VTy, OR);
  3661. return Result;
  3662. }
  3663. case Intrinsic::arm_cls64: {
  3664. // cls(x) = if cls(hi(x)) != 31 then cls(hi(x))
  3665. // else 31 + clz(if hi(x) == 0 then lo(x) else not(lo(x)))
  3666. const SDValue &Operand = Op.getOperand(1);
  3667. const EVT VTy = Op.getValueType();
  3668. SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, VTy, Operand,
  3669. DAG.getConstant(1, dl, VTy));
  3670. SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, VTy, Operand,
  3671. DAG.getConstant(0, dl, VTy));
  3672. SDValue Constant0 = DAG.getConstant(0, dl, VTy);
  3673. SDValue Constant1 = DAG.getConstant(1, dl, VTy);
  3674. SDValue Constant31 = DAG.getConstant(31, dl, VTy);
  3675. SDValue SRAHi = DAG.getNode(ISD::SRA, dl, VTy, Hi, Constant31);
  3676. SDValue XORHi = DAG.getNode(ISD::XOR, dl, VTy, SRAHi, Hi);
  3677. SDValue SHLHi = DAG.getNode(ISD::SHL, dl, VTy, XORHi, Constant1);
  3678. SDValue ORHi = DAG.getNode(ISD::OR, dl, VTy, SHLHi, Constant1);
  3679. SDValue CLSHi = DAG.getNode(ISD::CTLZ, dl, VTy, ORHi);
  3680. SDValue CheckLo =
  3681. DAG.getSetCC(dl, MVT::i1, CLSHi, Constant31, ISD::CondCode::SETEQ);
  3682. SDValue HiIsZero =
  3683. DAG.getSetCC(dl, MVT::i1, Hi, Constant0, ISD::CondCode::SETEQ);
  3684. SDValue AdjustedLo =
  3685. DAG.getSelect(dl, VTy, HiIsZero, Lo, DAG.getNOT(dl, Lo, VTy));
  3686. SDValue CLZAdjustedLo = DAG.getNode(ISD::CTLZ, dl, VTy, AdjustedLo);
  3687. SDValue Result =
  3688. DAG.getSelect(dl, VTy, CheckLo,
  3689. DAG.getNode(ISD::ADD, dl, VTy, CLZAdjustedLo, Constant31), CLSHi);
  3690. return Result;
  3691. }
  3692. case Intrinsic::eh_sjlj_lsda: {
  3693. MachineFunction &MF = DAG.getMachineFunction();
  3694. ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
  3695. unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
  3696. EVT PtrVT = getPointerTy(DAG.getDataLayout());
  3697. SDValue CPAddr;
  3698. bool IsPositionIndependent = isPositionIndependent();
  3699. unsigned PCAdj = IsPositionIndependent ? (Subtarget->isThumb() ? 4 : 8) : 0;
  3700. ARMConstantPoolValue *CPV =
  3701. ARMConstantPoolConstant::Create(&MF.getFunction(), ARMPCLabelIndex,
  3702. ARMCP::CPLSDA, PCAdj);
  3703. CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, Align(4));
  3704. CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
  3705. SDValue Result = DAG.getLoad(
  3706. PtrVT, dl, DAG.getEntryNode(), CPAddr,
  3707. MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
  3708. if (IsPositionIndependent) {
  3709. SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32);
  3710. Result = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVT, Result, PICLabel);
  3711. }
  3712. return Result;
  3713. }
  3714. case Intrinsic::arm_neon_vabs:
  3715. return DAG.getNode(ISD::ABS, SDLoc(Op), Op.getValueType(),
  3716. Op.getOperand(1));
  3717. case Intrinsic::arm_neon_vmulls:
  3718. case Intrinsic::arm_neon_vmullu: {
  3719. unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vmulls)
  3720. ? ARMISD::VMULLs : ARMISD::VMULLu;
  3721. return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(),
  3722. Op.getOperand(1), Op.getOperand(2));
  3723. }
  3724. case Intrinsic::arm_neon_vminnm:
  3725. case Intrinsic::arm_neon_vmaxnm: {
  3726. unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vminnm)
  3727. ? ISD::FMINNUM : ISD::FMAXNUM;
  3728. return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(),
  3729. Op.getOperand(1), Op.getOperand(2));
  3730. }
  3731. case Intrinsic::arm_neon_vminu:
  3732. case Intrinsic::arm_neon_vmaxu: {
  3733. if (Op.getValueType().isFloatingPoint())
  3734. return SDValue();
  3735. unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vminu)
  3736. ? ISD::UMIN : ISD::UMAX;
  3737. return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(),
  3738. Op.getOperand(1), Op.getOperand(2));
  3739. }
  3740. case Intrinsic::arm_neon_vmins:
  3741. case Intrinsic::arm_neon_vmaxs: {
  3742. // v{min,max}s is overloaded between signed integers and floats.
  3743. if (!Op.getValueType().isFloatingPoint()) {
  3744. unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vmins)
  3745. ? ISD::SMIN : ISD::SMAX;
  3746. return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(),
  3747. Op.getOperand(1), Op.getOperand(2));
  3748. }
  3749. unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vmins)
  3750. ? ISD::FMINIMUM : ISD::FMAXIMUM;
  3751. return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(),
  3752. Op.getOperand(1), Op.getOperand(2));
  3753. }
  3754. case Intrinsic::arm_neon_vtbl1:
  3755. return DAG.getNode(ARMISD::VTBL1, SDLoc(Op), Op.getValueType(),
  3756. Op.getOperand(1), Op.getOperand(2));
  3757. case Intrinsic::arm_neon_vtbl2:
  3758. return DAG.getNode(ARMISD::VTBL2, SDLoc(Op), Op.getValueType(),
  3759. Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
  3760. case Intrinsic::arm_mve_pred_i2v:
  3761. case Intrinsic::arm_mve_pred_v2i:
  3762. return DAG.getNode(ARMISD::PREDICATE_CAST, SDLoc(Op), Op.getValueType(),
  3763. Op.getOperand(1));
  3764. case Intrinsic::arm_mve_vreinterpretq:
  3765. return DAG.getNode(ARMISD::VECTOR_REG_CAST, SDLoc(Op), Op.getValueType(),
  3766. Op.getOperand(1));
  3767. case Intrinsic::arm_mve_lsll:
  3768. return DAG.getNode(ARMISD::LSLL, SDLoc(Op), Op->getVTList(),
  3769. Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
  3770. case Intrinsic::arm_mve_asrl:
  3771. return DAG.getNode(ARMISD::ASRL, SDLoc(Op), Op->getVTList(),
  3772. Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
  3773. }
  3774. }
  3775. static SDValue LowerATOMIC_FENCE(SDValue Op, SelectionDAG &DAG,
  3776. const ARMSubtarget *Subtarget) {
  3777. SDLoc dl(Op);
  3778. ConstantSDNode *SSIDNode = cast<ConstantSDNode>(Op.getOperand(2));
  3779. auto SSID = static_cast<SyncScope::ID>(SSIDNode->getZExtValue());
  3780. if (SSID == SyncScope::SingleThread)
  3781. return Op;
  3782. if (!Subtarget->hasDataBarrier()) {
// Some ARMv6 CPUs can support data barriers with an mcr instruction.
// Thumb1 and pre-v6 ARM mode use a libcall instead and should never get
// here.
  3786. assert(Subtarget->hasV6Ops() && !Subtarget->isThumb() &&
  3787. "Unexpected ISD::ATOMIC_FENCE encountered. Should be libcall!");
  3788. return DAG.getNode(ARMISD::MEMBARRIER_MCR, dl, MVT::Other, Op.getOperand(0),
  3789. DAG.getConstant(0, dl, MVT::i32));
  3790. }
  3791. ConstantSDNode *OrdN = cast<ConstantSDNode>(Op.getOperand(1));
  3792. AtomicOrdering Ord = static_cast<AtomicOrdering>(OrdN->getZExtValue());
  3793. ARM_MB::MemBOpt Domain = ARM_MB::ISH;
  3794. if (Subtarget->isMClass()) {
  3795. // Only a full system barrier exists in the M-class architectures.
  3796. Domain = ARM_MB::SY;
  3797. } else if (Subtarget->preferISHSTBarriers() &&
  3798. Ord == AtomicOrdering::Release) {
  3799. // Swift happens to implement ISHST barriers in a way that's compatible with
  3800. // Release semantics but weaker than ISH so we'd be fools not to use
  3801. // it. Beware: other processors probably don't!
  3802. Domain = ARM_MB::ISHST;
  3803. }
  3804. return DAG.getNode(ISD::INTRINSIC_VOID, dl, MVT::Other, Op.getOperand(0),
  3805. DAG.getConstant(Intrinsic::arm_dmb, dl, MVT::i32),
  3806. DAG.getConstant(Domain, dl, MVT::i32));
  3807. }
  3808. static SDValue LowerPREFETCH(SDValue Op, SelectionDAG &DAG,
  3809. const ARMSubtarget *Subtarget) {
// ARM before v5TE and Thumb1 do not have preload instructions.
  3811. if (!(Subtarget->isThumb2() ||
  3812. (!Subtarget->isThumb1Only() && Subtarget->hasV5TEOps())))
  3813. // Just preserve the chain.
  3814. return Op.getOperand(0);
  3815. SDLoc dl(Op);
  3816. unsigned isRead = ~cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue() & 1;
  3817. if (!isRead &&
  3818. (!Subtarget->hasV7Ops() || !Subtarget->hasMPExtension()))
// Only ARMv7 with the MP extension has PLDW, so drop write prefetches
// otherwise.
  3820. return Op.getOperand(0);
  3821. unsigned isData = cast<ConstantSDNode>(Op.getOperand(4))->getZExtValue();
  3822. if (Subtarget->isThumb()) {
  3823. // Invert the bits.
  3824. isRead = ~isRead & 1;
  3825. isData = ~isData & 1;
  3826. }
  3827. return DAG.getNode(ARMISD::PRELOAD, dl, MVT::Other, Op.getOperand(0),
  3828. Op.getOperand(1), DAG.getConstant(isRead, dl, MVT::i32),
  3829. DAG.getConstant(isData, dl, MVT::i32));
  3830. }
static SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG) {
  MachineFunction &MF = DAG.getMachineFunction();
  ARMFunctionInfo *FuncInfo = MF.getInfo<ARMFunctionInfo>();

  // vastart just stores the address of the VarArgsFrameIndex slot into the
  // memory location argument.
  SDLoc dl(Op);
  EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
  SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
  const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
  return DAG.getStore(Op.getOperand(0), dl, FR, Op.getOperand(1),
                      MachinePointerInfo(SV));
}
  3843. SDValue ARMTargetLowering::GetF64FormalArgument(CCValAssign &VA,
  3844. CCValAssign &NextVA,
  3845. SDValue &Root,
  3846. SelectionDAG &DAG,
  3847. const SDLoc &dl) const {
  3848. MachineFunction &MF = DAG.getMachineFunction();
  3849. ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
  3850. const TargetRegisterClass *RC;
  3851. if (AFI->isThumb1OnlyFunction())
  3852. RC = &ARM::tGPRRegClass;
  3853. else
  3854. RC = &ARM::GPRRegClass;
  3855. // Transform the arguments stored in physical registers into virtual ones.
  3856. Register Reg = MF.addLiveIn(VA.getLocReg(), RC);
  3857. SDValue ArgValue = DAG.getCopyFromReg(Root, dl, Reg, MVT::i32);
  3858. SDValue ArgValue2;
  3859. if (NextVA.isMemLoc()) {
  3860. MachineFrameInfo &MFI = MF.getFrameInfo();
  3861. int FI = MFI.CreateFixedObject(4, NextVA.getLocMemOffset(), true);
  3862. // Create load node to retrieve arguments from the stack.
  3863. SDValue FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
  3864. ArgValue2 = DAG.getLoad(
  3865. MVT::i32, dl, Root, FIN,
  3866. MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
  3867. } else {
  3868. Reg = MF.addLiveIn(NextVA.getLocReg(), RC);
  3869. ArgValue2 = DAG.getCopyFromReg(Root, dl, Reg, MVT::i32);
  3870. }
  3871. if (!Subtarget->isLittle())
  3872. std::swap (ArgValue, ArgValue2);
  3873. return DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, ArgValue, ArgValue2);
  3874. }
// The remaining GPRs hold either the beginning of variable-argument
// data, or the beginning of an aggregate passed by value (usually
// byval). Either way, we allocate stack slots adjacent to the data
// provided by our caller, and store the unallocated registers there.
// If this is a variadic function, the va_list pointer will begin with
// these values; otherwise, this reassembles a (byval) structure that
// was split between registers and memory.
// Return: The frame index the registers were stored into.
int ARMTargetLowering::StoreByValRegs(CCState &CCInfo, SelectionDAG &DAG,
                                      const SDLoc &dl, SDValue &Chain,
                                      const Value *OrigArg,
                                      unsigned InRegsParamRecordIdx,
                                      int ArgOffset, unsigned ArgSize) const {
  // Currently, two use cases are possible:
  // Case #1. Non-var-args function, and we meet the first byval parameter.
  //          Set up the first unallocated register as the first byval
  //          register and eat all remaining registers (these two actions are
  //          performed by the HandleByVal method).
  //          Then, here, we initialize the stack frame with
  //          "store-reg" instructions.
  // Case #2. Var-args function that doesn't contain byval parameters.
  //          The same: eat all remaining unallocated registers and
  //          initialize the stack frame.

  MachineFunction &MF = DAG.getMachineFunction();
  MachineFrameInfo &MFI = MF.getFrameInfo();
  ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
  unsigned RBegin, REnd;
  if (InRegsParamRecordIdx < CCInfo.getInRegsParamsCount()) {
    CCInfo.getInRegsParamInfo(InRegsParamRecordIdx, RBegin, REnd);
  } else {
    unsigned RBeginIdx = CCInfo.getFirstUnallocated(GPRArgRegs);
    RBegin = RBeginIdx == 4 ? (unsigned)ARM::R4 : GPRArgRegs[RBeginIdx];
    REnd = ARM::R4;
  }
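
  // If any registers are being spilled, place the spill area immediately
  // below the incoming stack arguments (a negative offset from the incoming
  // SP), so the spilled GPRs sit contiguously with the caller-provided data;
  // each GPR between RBegin and R4 takes 4 bytes.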
  if (REnd != RBegin)
    ArgOffset = -4 * (ARM::R4 - RBegin);

  auto PtrVT = getPointerTy(DAG.getDataLayout());
  int FrameIndex = MFI.CreateFixedObject(ArgSize, ArgOffset, false);
  SDValue FIN = DAG.getFrameIndex(FrameIndex, PtrVT);

  SmallVector<SDValue, 4> MemOps;
  const TargetRegisterClass *RC =
      AFI->isThumb1OnlyFunction() ? &ARM::tGPRRegClass : &ARM::GPRRegClass;

  for (unsigned Reg = RBegin, i = 0; Reg < REnd; ++Reg, ++i) {
    Register VReg = MF.addLiveIn(Reg, RC);
    SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i32);
    SDValue Store = DAG.getStore(Val.getValue(1), dl, Val, FIN,
                                 MachinePointerInfo(OrigArg, 4 * i));
    MemOps.push_back(Store);
    FIN = DAG.getNode(ISD::ADD, dl, PtrVT, FIN, DAG.getConstant(4, dl, PtrVT));
  }

  if (!MemOps.empty())
    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps);
  return FrameIndex;
}
// Set up the stack frame that the va_list pointer will start from.
void ARMTargetLowering::VarArgStyleRegisters(CCState &CCInfo, SelectionDAG &DAG,
                                             const SDLoc &dl, SDValue &Chain,
                                             unsigned ArgOffset,
                                             unsigned TotalArgRegsSaveSize,
                                             bool ForceMutable) const {
  MachineFunction &MF = DAG.getMachineFunction();
  ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();

  // Try to store any remaining integer argument regs to their spots on the
  // stack so that they may be loaded by dereferencing the result of va_next.
  // If there are no regs to be stored, just point the address after the last
  // argument passed via the stack.
  int FrameIndex = StoreByValRegs(CCInfo, DAG, dl, Chain, nullptr,
                                  CCInfo.getInRegsParamsCount(),
                                  CCInfo.getNextStackOffset(),
                                  std::max(4U, TotalArgRegsSaveSize));
  AFI->setVarArgsFrameIndex(FrameIndex);
}
  3948. bool ARMTargetLowering::splitValueIntoRegisterParts(
  3949. SelectionDAG &DAG, const SDLoc &DL, SDValue Val, SDValue *Parts,
  3950. unsigned NumParts, MVT PartVT, Optional<CallingConv::ID> CC) const {
  3951. bool IsABIRegCopy = CC.hasValue();
  3952. EVT ValueVT = Val.getValueType();
  3953. if (IsABIRegCopy && (ValueVT == MVT::f16 || ValueVT == MVT::bf16) &&
  3954. PartVT == MVT::f32) {
  3955. unsigned ValueBits = ValueVT.getSizeInBits();
  3956. unsigned PartBits = PartVT.getSizeInBits();
  3957. Val = DAG.getNode(ISD::BITCAST, DL, MVT::getIntegerVT(ValueBits), Val);
  3958. Val = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::getIntegerVT(PartBits), Val);
  3959. Val = DAG.getNode(ISD::BITCAST, DL, PartVT, Val);
  3960. Parts[0] = Val;
  3961. return true;
  3962. }
  3963. return false;
  3964. }
  3965. SDValue ARMTargetLowering::joinRegisterPartsIntoValue(
  3966. SelectionDAG &DAG, const SDLoc &DL, const SDValue *Parts, unsigned NumParts,
  3967. MVT PartVT, EVT ValueVT, Optional<CallingConv::ID> CC) const {
  3968. bool IsABIRegCopy = CC.hasValue();
  3969. if (IsABIRegCopy && (ValueVT == MVT::f16 || ValueVT == MVT::bf16) &&
  3970. PartVT == MVT::f32) {
  3971. unsigned ValueBits = ValueVT.getSizeInBits();
  3972. unsigned PartBits = PartVT.getSizeInBits();
  3973. SDValue Val = Parts[0];
  3974. Val = DAG.getNode(ISD::BITCAST, DL, MVT::getIntegerVT(PartBits), Val);
  3975. Val = DAG.getNode(ISD::TRUNCATE, DL, MVT::getIntegerVT(ValueBits), Val);
  3976. Val = DAG.getNode(ISD::BITCAST, DL, ValueVT, Val);
  3977. return Val;
  3978. }
  3979. return SDValue();
  3980. }
  3981. SDValue ARMTargetLowering::LowerFormalArguments(
  3982. SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
  3983. const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
  3984. SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
  3985. MachineFunction &MF = DAG.getMachineFunction();
  3986. MachineFrameInfo &MFI = MF.getFrameInfo();
  3987. ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
  3988. // Assign locations to all of the incoming arguments.
  3989. SmallVector<CCValAssign, 16> ArgLocs;
  3990. CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
  3991. *DAG.getContext());
  3992. CCInfo.AnalyzeFormalArguments(Ins, CCAssignFnForCall(CallConv, isVarArg));
  3993. SmallVector<SDValue, 16> ArgValues;
  3994. SDValue ArgValue;
  3995. Function::const_arg_iterator CurOrigArg = MF.getFunction().arg_begin();
  3996. unsigned CurArgIdx = 0;
  3997. // Initially ArgRegsSaveSize is zero.
  3998. // Then we increase this value each time we meet byval parameter.
  3999. // We also increase this value in case of varargs function.
  4000. AFI->setArgRegsSaveSize(0);
  4001. // Calculate the amount of stack space that we need to allocate to store
  4002. // byval and variadic arguments that are passed in registers.
  4003. // We need to know this before we allocate the first byval or variadic
  4004. // argument, as they will be allocated a stack slot below the CFA (Canonical
  4005. // Frame Address, the stack pointer at entry to the function).
  4006. unsigned ArgRegBegin = ARM::R4;
  4007. for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
  4008. if (CCInfo.getInRegsParamsProcessed() >= CCInfo.getInRegsParamsCount())
  4009. break;
  4010. CCValAssign &VA = ArgLocs[i];
  4011. unsigned Index = VA.getValNo();
  4012. ISD::ArgFlagsTy Flags = Ins[Index].Flags;
  4013. if (!Flags.isByVal())
  4014. continue;
  4015. assert(VA.isMemLoc() && "unexpected byval pointer in reg");
  4016. unsigned RBegin, REnd;
  4017. CCInfo.getInRegsParamInfo(CCInfo.getInRegsParamsProcessed(), RBegin, REnd);
  4018. ArgRegBegin = std::min(ArgRegBegin, RBegin);
  4019. CCInfo.nextInRegsParam();
  4020. }
  4021. CCInfo.rewindByValRegsInfo();
  4022. int lastInsIndex = -1;
  4023. if (isVarArg && MFI.hasVAStart()) {
  4024. unsigned RegIdx = CCInfo.getFirstUnallocated(GPRArgRegs);
  4025. if (RegIdx != array_lengthof(GPRArgRegs))
  4026. ArgRegBegin = std::min(ArgRegBegin, (unsigned)GPRArgRegs[RegIdx]);
  4027. }
  4028. unsigned TotalArgRegsSaveSize = 4 * (ARM::R4 - ArgRegBegin);
  4029. AFI->setArgRegsSaveSize(TotalArgRegsSaveSize);
  4030. auto PtrVT = getPointerTy(DAG.getDataLayout());
  4031. for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
  4032. CCValAssign &VA = ArgLocs[i];
  4033. if (Ins[VA.getValNo()].isOrigArg()) {
  4034. std::advance(CurOrigArg,
  4035. Ins[VA.getValNo()].getOrigArgIndex() - CurArgIdx);
  4036. CurArgIdx = Ins[VA.getValNo()].getOrigArgIndex();
  4037. }
  4038. // Arguments stored in registers.
  4039. if (VA.isRegLoc()) {
  4040. EVT RegVT = VA.getLocVT();
  4041. if (VA.needsCustom() && VA.getLocVT() == MVT::v2f64) {
  4042. // f64 and vector types are split up into multiple registers or
  4043. // combinations of registers and stack slots.
  4044. SDValue ArgValue1 =
  4045. GetF64FormalArgument(VA, ArgLocs[++i], Chain, DAG, dl);
  4046. VA = ArgLocs[++i]; // skip ahead to next loc
  4047. SDValue ArgValue2;
  4048. if (VA.isMemLoc()) {
  4049. int FI = MFI.CreateFixedObject(8, VA.getLocMemOffset(), true);
  4050. SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
  4051. ArgValue2 = DAG.getLoad(
  4052. MVT::f64, dl, Chain, FIN,
  4053. MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
  4054. } else {
  4055. ArgValue2 = GetF64FormalArgument(VA, ArgLocs[++i], Chain, DAG, dl);
  4056. }
  4057. ArgValue = DAG.getNode(ISD::UNDEF, dl, MVT::v2f64);
  4058. ArgValue = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, ArgValue,
  4059. ArgValue1, DAG.getIntPtrConstant(0, dl));
  4060. ArgValue = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, ArgValue,
  4061. ArgValue2, DAG.getIntPtrConstant(1, dl));
  4062. } else if (VA.needsCustom() && VA.getLocVT() == MVT::f64) {
  4063. ArgValue = GetF64FormalArgument(VA, ArgLocs[++i], Chain, DAG, dl);
  4064. } else {
  4065. const TargetRegisterClass *RC;
  4066. if (RegVT == MVT::f16 || RegVT == MVT::bf16)
  4067. RC = &ARM::HPRRegClass;
  4068. else if (RegVT == MVT::f32)
  4069. RC = &ARM::SPRRegClass;
  4070. else if (RegVT == MVT::f64 || RegVT == MVT::v4f16 ||
  4071. RegVT == MVT::v4bf16)
  4072. RC = &ARM::DPRRegClass;
  4073. else if (RegVT == MVT::v2f64 || RegVT == MVT::v8f16 ||
  4074. RegVT == MVT::v8bf16)
  4075. RC = &ARM::QPRRegClass;
  4076. else if (RegVT == MVT::i32)
  4077. RC = AFI->isThumb1OnlyFunction() ? &ARM::tGPRRegClass
  4078. : &ARM::GPRRegClass;
  4079. else
  4080. llvm_unreachable("RegVT not supported by FORMAL_ARGUMENTS Lowering");
  4081. // Transform the arguments in physical registers into virtual ones.
  4082. Register Reg = MF.addLiveIn(VA.getLocReg(), RC);
  4083. ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT);
  4084. // If this value is passed in r0 and has the returned attribute (e.g.
  4085. // C++ 'structors), record this fact for later use.
  4086. if (VA.getLocReg() == ARM::R0 && Ins[VA.getValNo()].Flags.isReturned()) {
  4087. AFI->setPreservesR0();
  4088. }
  4089. }
  4090. // If this is an 8 or 16-bit value, it is really passed promoted
  4091. // to 32 bits. Insert an assert[sz]ext to capture this, then
  4092. // truncate to the right size.
  4093. switch (VA.getLocInfo()) {
  4094. default: llvm_unreachable("Unknown loc info!");
  4095. case CCValAssign::Full: break;
  4096. case CCValAssign::BCvt:
  4097. ArgValue = DAG.getNode(ISD::BITCAST, dl, VA.getValVT(), ArgValue);
  4098. break;
  4099. case CCValAssign::SExt:
  4100. ArgValue = DAG.getNode(ISD::AssertSext, dl, RegVT, ArgValue,
  4101. DAG.getValueType(VA.getValVT()));
  4102. ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue);
  4103. break;
  4104. case CCValAssign::ZExt:
  4105. ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue,
  4106. DAG.getValueType(VA.getValVT()));
  4107. ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue);
  4108. break;
  4109. }
  4110. // f16 arguments have their size extended to 4 bytes and passed as if they
  4111. // had been copied to the LSBs of a 32-bit register.
  4112. // For that, it's passed extended to i32 (soft ABI) or to f32 (hard ABI)
  4113. if (VA.needsCustom() &&
  4114. (VA.getValVT() == MVT::f16 || VA.getValVT() == MVT::bf16))
  4115. ArgValue = MoveToHPR(dl, DAG, VA.getLocVT(), VA.getValVT(), ArgValue);
  4116. InVals.push_back(ArgValue);
  4117. } else { // VA.isRegLoc()
  4118. // Only arguments passed on the stack should make it here.
  4119. assert(VA.isMemLoc());
  4120. assert(VA.getValVT() != MVT::i64 && "i64 should already be lowered");
  4121. int index = VA.getValNo();
  4122. // Some Ins[] entries become multiple ArgLoc[] entries.
  4123. // Process them only once.
  4124. if (index != lastInsIndex)
  4125. {
  4126. ISD::ArgFlagsTy Flags = Ins[index].Flags;
  4127. // FIXME: For now, all byval parameter objects are marked mutable.
  4128. // This can be changed with more analysis.
  4129. // In case of tail call optimization mark all arguments mutable.
  4130. // Since they could be overwritten by lowering of arguments in case of
  4131. // a tail call.
  4132. if (Flags.isByVal()) {
  4133. assert(Ins[index].isOrigArg() &&
  4134. "Byval arguments cannot be implicit");
  4135. unsigned CurByValIndex = CCInfo.getInRegsParamsProcessed();
  4136. int FrameIndex = StoreByValRegs(
  4137. CCInfo, DAG, dl, Chain, &*CurOrigArg, CurByValIndex,
  4138. VA.getLocMemOffset(), Flags.getByValSize());
  4139. InVals.push_back(DAG.getFrameIndex(FrameIndex, PtrVT));
  4140. CCInfo.nextInRegsParam();
  4141. } else {
  4142. unsigned FIOffset = VA.getLocMemOffset();
  4143. int FI = MFI.CreateFixedObject(VA.getLocVT().getSizeInBits()/8,
  4144. FIOffset, true);
  4145. // Create load nodes to retrieve arguments from the stack.
  4146. SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
  4147. InVals.push_back(DAG.getLoad(VA.getValVT(), dl, Chain, FIN,
  4148. MachinePointerInfo::getFixedStack(
  4149. DAG.getMachineFunction(), FI)));
  4150. }
  4151. lastInsIndex = index;
  4152. }
  4153. }
  4154. }
  4155. // varargs
  4156. if (isVarArg && MFI.hasVAStart()) {
  4157. VarArgStyleRegisters(CCInfo, DAG, dl, Chain, CCInfo.getNextStackOffset(),
  4158. TotalArgRegsSaveSize);
  4159. if (AFI->isCmseNSEntryFunction()) {
  4160. DiagnosticInfoUnsupported Diag(
  4161. DAG.getMachineFunction().getFunction(),
  4162. "secure entry function must not be variadic", dl.getDebugLoc());
  4163. DAG.getContext()->diagnose(Diag);
  4164. }
  4165. }
  4166. unsigned StackArgSize = CCInfo.getNextStackOffset();
  4167. bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
  4168. if (canGuaranteeTCO(CallConv, TailCallOpt)) {
  4169. // The only way to guarantee a tail call is if the callee restores its
  4170. // argument area, but it must also keep the stack aligned when doing so.
  4171. const DataLayout &DL = DAG.getDataLayout();
  4172. StackArgSize = alignTo(StackArgSize, DL.getStackAlignment());
  4173. AFI->setArgumentStackToRestore(StackArgSize);
  4174. }
  4175. AFI->setArgumentStackSize(StackArgSize);
  4176. if (CCInfo.getNextStackOffset() > 0 && AFI->isCmseNSEntryFunction()) {
  4177. DiagnosticInfoUnsupported Diag(
  4178. DAG.getMachineFunction().getFunction(),
  4179. "secure entry function requires arguments on stack", dl.getDebugLoc());
  4180. DAG.getContext()->diagnose(Diag);
  4181. }
  4182. return Chain;
  4183. }
  4184. /// isFloatingPointZero - Return true if this is +0.0.
  4185. static bool isFloatingPointZero(SDValue Op) {
  4186. if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(Op))
  4187. return CFP->getValueAPF().isPosZero();
  4188. else if (ISD::isEXTLoad(Op.getNode()) || ISD::isNON_EXTLoad(Op.getNode())) {
  4189. // Maybe this has already been legalized into the constant pool?
  4190. if (Op.getOperand(1).getOpcode() == ARMISD::Wrapper) {
  4191. SDValue WrapperOp = Op.getOperand(1).getOperand(0);
  4192. if (ConstantPoolSDNode *CP = dyn_cast<ConstantPoolSDNode>(WrapperOp))
  4193. if (const ConstantFP *CFP = dyn_cast<ConstantFP>(CP->getConstVal()))
  4194. return CFP->getValueAPF().isPosZero();
  4195. }
  4196. } else if (Op->getOpcode() == ISD::BITCAST &&
  4197. Op->getValueType(0) == MVT::f64) {
  4198. // Handle (ISD::BITCAST (ARMISD::VMOVIMM (ISD::TargetConstant 0)) MVT::f64)
  4199. // created by LowerConstantFP().
  4200. SDValue BitcastOp = Op->getOperand(0);
  4201. if (BitcastOp->getOpcode() == ARMISD::VMOVIMM &&
  4202. isNullConstant(BitcastOp->getOperand(0)))
  4203. return true;
  4204. }
  4205. return false;
  4206. }
/// Returns the appropriate ARM CMP (cmp) and corresponding condition code for
/// the given operands.
SDValue ARMTargetLowering::getARMCmp(SDValue LHS, SDValue RHS, ISD::CondCode CC,
                                     SDValue &ARMcc, SelectionDAG &DAG,
                                     const SDLoc &dl) const {
  if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS.getNode())) {
    unsigned C = RHSC->getZExtValue();
    if (!isLegalICmpImmediate((int32_t)C)) {
      // Constant does not fit, try adjusting it by one.
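      // For example, in ARM mode "x s< 257" becomes "x s<= 256": 257 is not
      // an encodable modified immediate (and neither is -257), but 256 is.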
      switch (CC) {
      default: break;
      case ISD::SETLT:
      case ISD::SETGE:
        if (C != 0x80000000 && isLegalICmpImmediate(C-1)) {
          CC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGT;
          RHS = DAG.getConstant(C - 1, dl, MVT::i32);
        }
        break;
      case ISD::SETULT:
      case ISD::SETUGE:
        if (C != 0 && isLegalICmpImmediate(C-1)) {
          CC = (CC == ISD::SETULT) ? ISD::SETULE : ISD::SETUGT;
          RHS = DAG.getConstant(C - 1, dl, MVT::i32);
        }
        break;
      case ISD::SETLE:
      case ISD::SETGT:
        if (C != 0x7fffffff && isLegalICmpImmediate(C+1)) {
          CC = (CC == ISD::SETLE) ? ISD::SETLT : ISD::SETGE;
          RHS = DAG.getConstant(C + 1, dl, MVT::i32);
        }
        break;
      case ISD::SETULE:
      case ISD::SETUGT:
        if (C != 0xffffffff && isLegalICmpImmediate(C+1)) {
          CC = (CC == ISD::SETULE) ? ISD::SETULT : ISD::SETUGE;
          RHS = DAG.getConstant(C + 1, dl, MVT::i32);
        }
        break;
      }
    }
  } else if ((ARM_AM::getShiftOpcForNode(LHS.getOpcode()) != ARM_AM::no_shift) &&
             (ARM_AM::getShiftOpcForNode(RHS.getOpcode()) == ARM_AM::no_shift)) {
    // In ARM and Thumb-2, the compare instructions can shift their second
    // operand.
    CC = ISD::getSetCCSwappedOperands(CC);
    std::swap(LHS, RHS);
  }

  // Thumb1 has very limited immediate modes, so turning an "and" into a
  // shift can save multiple instructions.
  //
  // If we have (x & C1), and C1 is an appropriate mask, we can transform it
  // into "((x << n) >> n)". But that isn't necessarily profitable on its
  // own. If it's the operand to an unsigned comparison with an immediate,
  // we can eliminate one of the shifts: we transform
  // "((x << n) >> n) == C2" to "(x << n) == (C2 << n)".
  //
  // We avoid transforming cases which aren't profitable due to encoding
  // details:
  //
  // 1. C2 fits into the immediate field of a cmp, and the transformed version
  //    would not; in that case, we're essentially trading one immediate load
  //    for another.
  // 2. C1 is 255 or 65535, so we can use uxtb or uxth.
  // 3. C2 is zero; we have other code for this special case.
  //
  // FIXME: Figure out profitability for Thumb2; we usually can't save an
  // instruction, since the AND is always one instruction anyway, but we could
  // use narrow instructions in some cases.
  if (Subtarget->isThumb1Only() && LHS->getOpcode() == ISD::AND &&
      LHS->hasOneUse() && isa<ConstantSDNode>(LHS.getOperand(1)) &&
      LHS.getValueType() == MVT::i32 && isa<ConstantSDNode>(RHS) &&
      !isSignedIntSetCC(CC)) {
    unsigned Mask = cast<ConstantSDNode>(LHS.getOperand(1))->getZExtValue();
    auto *RHSC = cast<ConstantSDNode>(RHS.getNode());
    uint64_t RHSV = RHSC->getZExtValue();
    if (isMask_32(Mask) && (RHSV & ~Mask) == 0 && Mask != 255 && Mask != 65535) {
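      // Mask is a run of low bits, so shifting left by countLeadingZeros(Mask)
      // discards exactly the bits that the AND would have cleared.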
      unsigned ShiftBits = countLeadingZeros(Mask);
      if (RHSV && (RHSV > 255 || (RHSV << ShiftBits) <= 255)) {
        SDValue ShiftAmt = DAG.getConstant(ShiftBits, dl, MVT::i32);
        LHS = DAG.getNode(ISD::SHL, dl, MVT::i32, LHS.getOperand(0), ShiftAmt);
        RHS = DAG.getConstant(RHSV << ShiftBits, dl, MVT::i32);
      }
    }
  }

  // The specific comparison "(x<<c) > 0x80000000U" can be optimized to a
  // single "lsls x, c+1". The shift sets the "C" and "Z" flags the same
  // way a cmp would.
  // FIXME: Add support for ARM/Thumb2; this would need isel patterns, and
  // some tweaks to the heuristics for the previous and->shift transform.
  // FIXME: Optimize cases where the LHS isn't a shift.
  if (Subtarget->isThumb1Only() && LHS->getOpcode() == ISD::SHL &&
      isa<ConstantSDNode>(RHS) &&
      cast<ConstantSDNode>(RHS)->getZExtValue() == 0x80000000U &&
      CC == ISD::SETUGT && isa<ConstantSDNode>(LHS.getOperand(1)) &&
      cast<ConstantSDNode>(LHS.getOperand(1))->getZExtValue() < 31) {
    unsigned ShiftAmt =
        cast<ConstantSDNode>(LHS.getOperand(1))->getZExtValue() + 1;
    SDValue Shift = DAG.getNode(ARMISD::LSLS, dl,
                                DAG.getVTList(MVT::i32, MVT::i32),
                                LHS.getOperand(0),
                                DAG.getConstant(ShiftAmt, dl, MVT::i32));
    SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, ARM::CPSR,
                                     Shift.getValue(1), SDValue());
    ARMcc = DAG.getConstant(ARMCC::HI, dl, MVT::i32);
    return Chain.getValue(1);
  }

  ARMCC::CondCodes CondCode = IntCCToARMCC(CC);

  // If the RHS is a constant zero then the V (overflow) flag will never be
  // set. This can allow us to simplify GE to PL or LT to MI, which can be
  // simpler for other passes (like the peephole optimiser) to deal with.
  if (isNullConstant(RHS)) {
    switch (CondCode) {
    default: break;
    case ARMCC::GE:
      CondCode = ARMCC::PL;
      break;
    case ARMCC::LT:
      CondCode = ARMCC::MI;
      break;
    }
  }

  ARMISD::NodeType CompareType;
  switch (CondCode) {
  default:
    CompareType = ARMISD::CMP;
    break;
  case ARMCC::EQ:
  case ARMCC::NE:
    // Uses only the Z flag.
    CompareType = ARMISD::CMPZ;
    break;
  }
  ARMcc = DAG.getConstant(CondCode, dl, MVT::i32);
  return DAG.getNode(CompareType, dl, MVT::Glue, LHS, RHS);
}
/// Returns an appropriate VFP CMP (fcmp{s|d}+fmstat) for the given operands.
SDValue ARMTargetLowering::getVFPCmp(SDValue LHS, SDValue RHS,
                                     SelectionDAG &DAG, const SDLoc &dl,
                                     bool Signaling) const {
  assert(Subtarget->hasFP64() || RHS.getValueType() != MVT::f64);
  SDValue Cmp;
  if (!isFloatingPointZero(RHS))
    Cmp = DAG.getNode(Signaling ? ARMISD::CMPFPE : ARMISD::CMPFP,
                      dl, MVT::Glue, LHS, RHS);
  else
    Cmp = DAG.getNode(Signaling ? ARMISD::CMPFPEw0 : ARMISD::CMPFPw0,
                      dl, MVT::Glue, LHS);
  return DAG.getNode(ARMISD::FMSTAT, dl, MVT::Glue, Cmp);
}
  4357. /// duplicateCmp - Glue values can have only one use, so this function
  4358. /// duplicates a comparison node.
  4359. SDValue
  4360. ARMTargetLowering::duplicateCmp(SDValue Cmp, SelectionDAG &DAG) const {
  4361. unsigned Opc = Cmp.getOpcode();
  4362. SDLoc DL(Cmp);
  4363. if (Opc == ARMISD::CMP || Opc == ARMISD::CMPZ)
  4364. return DAG.getNode(Opc, DL, MVT::Glue, Cmp.getOperand(0),Cmp.getOperand(1));
  4365. assert(Opc == ARMISD::FMSTAT && "unexpected comparison operation");
  4366. Cmp = Cmp.getOperand(0);
  4367. Opc = Cmp.getOpcode();
  4368. if (Opc == ARMISD::CMPFP)
  4369. Cmp = DAG.getNode(Opc, DL, MVT::Glue, Cmp.getOperand(0),Cmp.getOperand(1));
  4370. else {
  4371. assert(Opc == ARMISD::CMPFPw0 && "unexpected operand of FMSTAT");
  4372. Cmp = DAG.getNode(Opc, DL, MVT::Glue, Cmp.getOperand(0));
  4373. }
  4374. return DAG.getNode(ARMISD::FMSTAT, DL, MVT::Glue, Cmp);
  4375. }
  4376. // This function returns three things: the arithmetic computation itself
  4377. // (Value), a comparison (OverflowCmp), and a condition code (ARMcc). The
  4378. // comparison and the condition code define the case in which the arithmetic
  4379. // computation *does not* overflow.
  4380. std::pair<SDValue, SDValue>
  4381. ARMTargetLowering::getARMXALUOOp(SDValue Op, SelectionDAG &DAG,
  4382. SDValue &ARMcc) const {
  4383. assert(Op.getValueType() == MVT::i32 && "Unsupported value type");
  4384. SDValue Value, OverflowCmp;
  4385. SDValue LHS = Op.getOperand(0);
  4386. SDValue RHS = Op.getOperand(1);
  4387. SDLoc dl(Op);
  4388. // FIXME: We are currently always generating CMPs because we don't support
  4389. // generating CMN through the backend. This is not as good as the natural
  4390. // CMP case because it causes a register dependency and cannot be folded
  4391. // later.
  4392. switch (Op.getOpcode()) {
  4393. default:
  4394. llvm_unreachable("Unknown overflow instruction!");
  4395. case ISD::SADDO:
  4396. ARMcc = DAG.getConstant(ARMCC::VC, dl, MVT::i32);
  4397. Value = DAG.getNode(ISD::ADD, dl, Op.getValueType(), LHS, RHS);
  4398. OverflowCmp = DAG.getNode(ARMISD::CMP, dl, MVT::Glue, Value, LHS);
  4399. break;
  4400. case ISD::UADDO:
  4401. ARMcc = DAG.getConstant(ARMCC::HS, dl, MVT::i32);
  4402. // We use ADDC here to correspond to its use in LowerUnsignedALUO.
  4403. // We do not use it in the USUBO case as Value may not be used.
  4404. Value = DAG.getNode(ARMISD::ADDC, dl,
  4405. DAG.getVTList(Op.getValueType(), MVT::i32), LHS, RHS)
  4406. .getValue(0);
  4407. OverflowCmp = DAG.getNode(ARMISD::CMP, dl, MVT::Glue, Value, LHS);
  4408. break;
  4409. case ISD::SSUBO:
  4410. ARMcc = DAG.getConstant(ARMCC::VC, dl, MVT::i32);
  4411. Value = DAG.getNode(ISD::SUB, dl, Op.getValueType(), LHS, RHS);
  4412. OverflowCmp = DAG.getNode(ARMISD::CMP, dl, MVT::Glue, LHS, RHS);
  4413. break;
  4414. case ISD::USUBO:
  4415. ARMcc = DAG.getConstant(ARMCC::HS, dl, MVT::i32);
  4416. Value = DAG.getNode(ISD::SUB, dl, Op.getValueType(), LHS, RHS);
  4417. OverflowCmp = DAG.getNode(ARMISD::CMP, dl, MVT::Glue, LHS, RHS);
  4418. break;
  4419. case ISD::UMULO:
  4420. // We generate a UMUL_LOHI and then check if the high word is 0.
  4421. ARMcc = DAG.getConstant(ARMCC::EQ, dl, MVT::i32);
  4422. Value = DAG.getNode(ISD::UMUL_LOHI, dl,
  4423. DAG.getVTList(Op.getValueType(), Op.getValueType()),
  4424. LHS, RHS);
  4425. OverflowCmp = DAG.getNode(ARMISD::CMP, dl, MVT::Glue, Value.getValue(1),
  4426. DAG.getConstant(0, dl, MVT::i32));
  4427. Value = Value.getValue(0); // We only want the low 32 bits for the result.
  4428. break;
  4429. case ISD::SMULO:
  4430. // We generate a SMUL_LOHI and then check if all the bits of the high word
  4431. // are the same as the sign bit of the low word.
  4432. ARMcc = DAG.getConstant(ARMCC::EQ, dl, MVT::i32);
  4433. Value = DAG.getNode(ISD::SMUL_LOHI, dl,
  4434. DAG.getVTList(Op.getValueType(), Op.getValueType()),
  4435. LHS, RHS);
  4436. OverflowCmp = DAG.getNode(ARMISD::CMP, dl, MVT::Glue, Value.getValue(1),
  4437. DAG.getNode(ISD::SRA, dl, Op.getValueType(),
  4438. Value.getValue(0),
  4439. DAG.getConstant(31, dl, MVT::i32)));
  4440. Value = Value.getValue(0); // We only want the low 32 bits for the result.
  4441. break;
  4442. } // switch (...)
  4443. return std::make_pair(Value, OverflowCmp);
  4444. }
  4445. SDValue
  4446. ARMTargetLowering::LowerSignedALUO(SDValue Op, SelectionDAG &DAG) const {
  4447. // Let legalize expand this if it isn't a legal type yet.
  4448. if (!DAG.getTargetLoweringInfo().isTypeLegal(Op.getValueType()))
  4449. return SDValue();
  4450. SDValue Value, OverflowCmp;
  4451. SDValue ARMcc;
  4452. std::tie(Value, OverflowCmp) = getARMXALUOOp(Op, DAG, ARMcc);
  4453. SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
  4454. SDLoc dl(Op);
  4455. // We use 0 and 1 as false and true values.
  4456. SDValue TVal = DAG.getConstant(1, dl, MVT::i32);
  4457. SDValue FVal = DAG.getConstant(0, dl, MVT::i32);
  4458. EVT VT = Op.getValueType();
  4459. SDValue Overflow = DAG.getNode(ARMISD::CMOV, dl, VT, TVal, FVal,
  4460. ARMcc, CCR, OverflowCmp);
  4461. SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
  4462. return DAG.getNode(ISD::MERGE_VALUES, dl, VTs, Value, Overflow);
  4463. }
  4464. static SDValue ConvertBooleanCarryToCarryFlag(SDValue BoolCarry,
  4465. SelectionDAG &DAG) {
  4466. SDLoc DL(BoolCarry);
  4467. EVT CarryVT = BoolCarry.getValueType();
  4468. // This converts the boolean value carry into the carry flag by doing
  4469. // ARMISD::SUBC Carry, 1
  4470. SDValue Carry = DAG.getNode(ARMISD::SUBC, DL,
  4471. DAG.getVTList(CarryVT, MVT::i32),
  4472. BoolCarry, DAG.getConstant(1, DL, CarryVT));
  4473. return Carry.getValue(1);
  4474. }
  4475. static SDValue ConvertCarryFlagToBooleanCarry(SDValue Flags, EVT VT,
  4476. SelectionDAG &DAG) {
  4477. SDLoc DL(Flags);
  4478. // Now convert the carry flag into a boolean carry. We do this
  4479. // using ARMISD:ADDE 0, 0, Carry
  4480. return DAG.getNode(ARMISD::ADDE, DL, DAG.getVTList(VT, MVT::i32),
  4481. DAG.getConstant(0, DL, MVT::i32),
  4482. DAG.getConstant(0, DL, MVT::i32), Flags);
  4483. }
  4484. SDValue ARMTargetLowering::LowerUnsignedALUO(SDValue Op,
  4485. SelectionDAG &DAG) const {
  4486. // Let legalize expand this if it isn't a legal type yet.
  4487. if (!DAG.getTargetLoweringInfo().isTypeLegal(Op.getValueType()))
  4488. return SDValue();
  4489. SDValue LHS = Op.getOperand(0);
  4490. SDValue RHS = Op.getOperand(1);
  4491. SDLoc dl(Op);
  4492. EVT VT = Op.getValueType();
  4493. SDVTList VTs = DAG.getVTList(VT, MVT::i32);
  4494. SDValue Value;
  4495. SDValue Overflow;
  4496. switch (Op.getOpcode()) {
  4497. default:
  4498. llvm_unreachable("Unknown overflow instruction!");
  4499. case ISD::UADDO:
  4500. Value = DAG.getNode(ARMISD::ADDC, dl, VTs, LHS, RHS);
  4501. // Convert the carry flag into a boolean value.
  4502. Overflow = ConvertCarryFlagToBooleanCarry(Value.getValue(1), VT, DAG);
  4503. break;
  4504. case ISD::USUBO: {
  4505. Value = DAG.getNode(ARMISD::SUBC, dl, VTs, LHS, RHS);
  4506. // Convert the carry flag into a boolean value.
  4507. Overflow = ConvertCarryFlagToBooleanCarry(Value.getValue(1), VT, DAG);
  4508. // ARMISD::SUBC returns 0 when we have to borrow, so make it an overflow
  4509. // value. So compute 1 - C.
  4510. Overflow = DAG.getNode(ISD::SUB, dl, MVT::i32,
  4511. DAG.getConstant(1, dl, MVT::i32), Overflow);
  4512. break;
  4513. }
  4514. }
  4515. return DAG.getNode(ISD::MERGE_VALUES, dl, VTs, Value, Overflow);
  4516. }
  4517. static SDValue LowerADDSUBSAT(SDValue Op, SelectionDAG &DAG,
  4518. const ARMSubtarget *Subtarget) {
  4519. EVT VT = Op.getValueType();
  4520. if (!Subtarget->hasV6Ops() || !Subtarget->hasDSP())
  4521. return SDValue();
  4522. if (!VT.isSimple())
  4523. return SDValue();
  4524. unsigned NewOpcode;
  4525. switch (VT.getSimpleVT().SimpleTy) {
  4526. default:
  4527. return SDValue();
  4528. case MVT::i8:
  4529. switch (Op->getOpcode()) {
  4530. case ISD::UADDSAT:
  4531. NewOpcode = ARMISD::UQADD8b;
  4532. break;
  4533. case ISD::SADDSAT:
  4534. NewOpcode = ARMISD::QADD8b;
  4535. break;
  4536. case ISD::USUBSAT:
  4537. NewOpcode = ARMISD::UQSUB8b;
  4538. break;
  4539. case ISD::SSUBSAT:
  4540. NewOpcode = ARMISD::QSUB8b;
  4541. break;
  4542. }
  4543. break;
  4544. case MVT::i16:
  4545. switch (Op->getOpcode()) {
  4546. case ISD::UADDSAT:
  4547. NewOpcode = ARMISD::UQADD16b;
  4548. break;
  4549. case ISD::SADDSAT:
  4550. NewOpcode = ARMISD::QADD16b;
  4551. break;
  4552. case ISD::USUBSAT:
  4553. NewOpcode = ARMISD::UQSUB16b;
  4554. break;
  4555. case ISD::SSUBSAT:
  4556. NewOpcode = ARMISD::QSUB16b;
  4557. break;
  4558. }
  4559. break;
  4560. }
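  // The saturating i8/i16 operation is performed in the low part of a 32-bit
  // GPR: sign-extend both operands to i32, emit the node, and truncate the
  // result back to the original type.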
  SDLoc dl(Op);
  SDValue Add =
      DAG.getNode(NewOpcode, dl, MVT::i32,
                  DAG.getSExtOrTrunc(Op->getOperand(0), dl, MVT::i32),
                  DAG.getSExtOrTrunc(Op->getOperand(1), dl, MVT::i32));
  return DAG.getNode(ISD::TRUNCATE, dl, VT, Add);
}
  4568. SDValue ARMTargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
  4569. SDValue Cond = Op.getOperand(0);
  4570. SDValue SelectTrue = Op.getOperand(1);
  4571. SDValue SelectFalse = Op.getOperand(2);
  4572. SDLoc dl(Op);
  4573. unsigned Opc = Cond.getOpcode();
  4574. if (Cond.getResNo() == 1 &&
  4575. (Opc == ISD::SADDO || Opc == ISD::UADDO || Opc == ISD::SSUBO ||
  4576. Opc == ISD::USUBO)) {
  4577. if (!DAG.getTargetLoweringInfo().isTypeLegal(Cond->getValueType(0)))
  4578. return SDValue();
  4579. SDValue Value, OverflowCmp;
  4580. SDValue ARMcc;
  4581. std::tie(Value, OverflowCmp) = getARMXALUOOp(Cond, DAG, ARMcc);
  4582. SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
  4583. EVT VT = Op.getValueType();
  4584. return getCMOV(dl, VT, SelectTrue, SelectFalse, ARMcc, CCR,
  4585. OverflowCmp, DAG);
  4586. }
  4587. // Convert:
  4588. //
  4589. // (select (cmov 1, 0, cond), t, f) -> (cmov t, f, cond)
  4590. // (select (cmov 0, 1, cond), t, f) -> (cmov f, t, cond)
  4591. //
  4592. if (Cond.getOpcode() == ARMISD::CMOV && Cond.hasOneUse()) {
  4593. const ConstantSDNode *CMOVTrue =
  4594. dyn_cast<ConstantSDNode>(Cond.getOperand(0));
  4595. const ConstantSDNode *CMOVFalse =
  4596. dyn_cast<ConstantSDNode>(Cond.getOperand(1));
  4597. if (CMOVTrue && CMOVFalse) {
  4598. unsigned CMOVTrueVal = CMOVTrue->getZExtValue();
  4599. unsigned CMOVFalseVal = CMOVFalse->getZExtValue();
  4600. SDValue True;
  4601. SDValue False;
  4602. if (CMOVTrueVal == 1 && CMOVFalseVal == 0) {
  4603. True = SelectTrue;
  4604. False = SelectFalse;
  4605. } else if (CMOVTrueVal == 0 && CMOVFalseVal == 1) {
  4606. True = SelectFalse;
  4607. False = SelectTrue;
  4608. }
  4609. if (True.getNode() && False.getNode()) {
  4610. EVT VT = Op.getValueType();
  4611. SDValue ARMcc = Cond.getOperand(2);
  4612. SDValue CCR = Cond.getOperand(3);
  4613. SDValue Cmp = duplicateCmp(Cond.getOperand(4), DAG);
  4614. assert(True.getValueType() == VT);
  4615. return getCMOV(dl, VT, True, False, ARMcc, CCR, Cmp, DAG);
  4616. }
  4617. }
  4618. }
  4619. // ARM's BooleanContents value is UndefinedBooleanContent. Mask out the
  4620. // undefined bits before doing a full-word comparison with zero.
  4621. Cond = DAG.getNode(ISD::AND, dl, Cond.getValueType(), Cond,
  4622. DAG.getConstant(1, dl, Cond.getValueType()));
  4623. return DAG.getSelectCC(dl, Cond,
  4624. DAG.getConstant(0, dl, Cond.getValueType()),
  4625. SelectTrue, SelectFalse, ISD::SETNE);
  4626. }
  4627. static void checkVSELConstraints(ISD::CondCode CC, ARMCC::CondCodes &CondCode,
  4628. bool &swpCmpOps, bool &swpVselOps) {
  4629. // Start by selecting the GE condition code for opcodes that return true for
  4630. // 'equality'
  4631. if (CC == ISD::SETUGE || CC == ISD::SETOGE || CC == ISD::SETOLE ||
  4632. CC == ISD::SETULE || CC == ISD::SETGE || CC == ISD::SETLE)
  4633. CondCode = ARMCC::GE;
  4634. // and GT for opcodes that return false for 'equality'.
  4635. else if (CC == ISD::SETUGT || CC == ISD::SETOGT || CC == ISD::SETOLT ||
  4636. CC == ISD::SETULT || CC == ISD::SETGT || CC == ISD::SETLT)
  4637. CondCode = ARMCC::GT;
  4638. // Since we are constrained to GE/GT, if the opcode contains 'less', we need
  4639. // to swap the compare operands.
  4640. if (CC == ISD::SETOLE || CC == ISD::SETULE || CC == ISD::SETOLT ||
  4641. CC == ISD::SETULT || CC == ISD::SETLE || CC == ISD::SETLT)
  4642. swpCmpOps = true;
  4643. // Both GT and GE are ordered comparisons, and return false for 'unordered'.
  4644. // If we have an unordered opcode, we need to swap the operands to the VSEL
  4645. // instruction (effectively negating the condition).
  4646. //
  4647. // This also has the effect of swapping which one of 'less' or 'greater'
  4648. // returns true, so we also swap the compare operands. It also switches
  4649. // whether we return true for 'equality', so we compensate by picking the
  4650. // opposite condition code to our original choice.
  4651. if (CC == ISD::SETULE || CC == ISD::SETULT || CC == ISD::SETUGE ||
  4652. CC == ISD::SETUGT) {
  4653. swpCmpOps = !swpCmpOps;
  4654. swpVselOps = !swpVselOps;
  4655. CondCode = CondCode == ARMCC::GT ? ARMCC::GE : ARMCC::GT;
  4656. }
  4657. // 'ordered' is 'anything but unordered', so use the VS condition code and
  4658. // swap the VSEL operands.
  4659. if (CC == ISD::SETO) {
  4660. CondCode = ARMCC::VS;
  4661. swpVselOps = true;
  4662. }
  4663. // 'unordered or not equal' is 'anything but equal', so use the EQ condition
  4664. // code and swap the VSEL operands. Also do this if we don't care about the
  4665. // unordered case.
  4666. if (CC == ISD::SETUNE || CC == ISD::SETNE) {
  4667. CondCode = ARMCC::EQ;
  4668. swpVselOps = true;
  4669. }
  4670. }
SDValue ARMTargetLowering::getCMOV(const SDLoc &dl, EVT VT, SDValue FalseVal,
                                   SDValue TrueVal, SDValue ARMcc, SDValue CCR,
                                   SDValue Cmp, SelectionDAG &DAG) const {
  if (!Subtarget->hasFP64() && VT == MVT::f64) {
    FalseVal = DAG.getNode(ARMISD::VMOVRRD, dl,
                           DAG.getVTList(MVT::i32, MVT::i32), FalseVal);
    TrueVal = DAG.getNode(ARMISD::VMOVRRD, dl,
                          DAG.getVTList(MVT::i32, MVT::i32), TrueVal);
    SDValue TrueLow = TrueVal.getValue(0);
    SDValue TrueHigh = TrueVal.getValue(1);
    SDValue FalseLow = FalseVal.getValue(0);
    SDValue FalseHigh = FalseVal.getValue(1);
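
    // Select each half with its own CMOV. A glue result may only have a
    // single use, so the second CMOV needs a duplicated compare.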
    SDValue Low = DAG.getNode(ARMISD::CMOV, dl, MVT::i32, FalseLow, TrueLow,
                              ARMcc, CCR, Cmp);
    SDValue High = DAG.getNode(ARMISD::CMOV, dl, MVT::i32, FalseHigh, TrueHigh,
                               ARMcc, CCR, duplicateCmp(Cmp, DAG));
    return DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Low, High);
  } else {
    return DAG.getNode(ARMISD::CMOV, dl, VT, FalseVal, TrueVal, ARMcc, CCR,
                       Cmp);
  }
}
  4693. static bool isGTorGE(ISD::CondCode CC) {
  4694. return CC == ISD::SETGT || CC == ISD::SETGE;
  4695. }
  4696. static bool isLTorLE(ISD::CondCode CC) {
  4697. return CC == ISD::SETLT || CC == ISD::SETLE;
  4698. }
  4699. // See if a conditional (LHS CC RHS ? TrueVal : FalseVal) is lower-saturating.
  4700. // All of these conditions (and their <= and >= counterparts) will do:
  4701. // x < k ? k : x
  4702. // x > k ? x : k
  4703. // k < x ? x : k
  4704. // k > x ? k : x
  4705. static bool isLowerSaturate(const SDValue LHS, const SDValue RHS,
  4706. const SDValue TrueVal, const SDValue FalseVal,
  4707. const ISD::CondCode CC, const SDValue K) {
  4708. return (isGTorGE(CC) &&
  4709. ((K == LHS && K == TrueVal) || (K == RHS && K == FalseVal))) ||
  4710. (isLTorLE(CC) &&
  4711. ((K == RHS && K == TrueVal) || (K == LHS && K == FalseVal)));
  4712. }
// Check if two chained conditionals could be converted into SSAT or USAT.
//
// SSAT can replace a set of two conditional selectors that bound a number to
// an interval of type [k, ~k] when k + 1 is a power of 2. Here are some
// examples:
//
//     x < -k ? -k : (x > k ? k : x)
//     x < -k ? -k : (x < k ? x : k)
//     x > -k ? (x > k ? k : x) : -k
//     x < k ? (x < -k ? -k : x) : k
//     etc.
//
// LLVM canonicalizes these to either a min(max()) or a max(min())
// pattern. This function tries to match one of these and will return an
// SSAT node if successful.
//
// USAT works similarly to SSAT, but bounds on the interval [0, k] where k + 1
// is a power of 2.
static SDValue LowerSaturatingConditional(SDValue Op, SelectionDAG &DAG) {
  4731. EVT VT = Op.getValueType();
  4732. SDValue V1 = Op.getOperand(0);
  4733. SDValue K1 = Op.getOperand(1);
  4734. SDValue TrueVal1 = Op.getOperand(2);
  4735. SDValue FalseVal1 = Op.getOperand(3);
  4736. ISD::CondCode CC1 = cast<CondCodeSDNode>(Op.getOperand(4))->get();
  4737. const SDValue Op2 = isa<ConstantSDNode>(TrueVal1) ? FalseVal1 : TrueVal1;
  4738. if (Op2.getOpcode() != ISD::SELECT_CC)
  4739. return SDValue();
  4740. SDValue V2 = Op2.getOperand(0);
  4741. SDValue K2 = Op2.getOperand(1);
  4742. SDValue TrueVal2 = Op2.getOperand(2);
  4743. SDValue FalseVal2 = Op2.getOperand(3);
  4744. ISD::CondCode CC2 = cast<CondCodeSDNode>(Op2.getOperand(4))->get();
  4745. SDValue V1Tmp = V1;
  4746. SDValue V2Tmp = V2;
  4747. // Check that the registers and the constants match a max(min()) or min(max())
  4748. // pattern
  4749. if (V1Tmp != TrueVal1 || V2Tmp != TrueVal2 || K1 != FalseVal1 ||
  4750. K2 != FalseVal2 ||
  4751. !((isGTorGE(CC1) && isLTorLE(CC2)) || (isLTorLE(CC1) && isGTorGE(CC2))))
  4752. return SDValue();
  4753. // Check that the constant in the lower-bound check is
  4754. // the opposite of the constant in the upper-bound check
  4755. // in 1's complement.
  4756. if (!isa<ConstantSDNode>(K1) || !isa<ConstantSDNode>(K2))
  4757. return SDValue();
  4758. int64_t Val1 = cast<ConstantSDNode>(K1)->getSExtValue();
  4759. int64_t Val2 = cast<ConstantSDNode>(K2)->getSExtValue();
  4760. int64_t PosVal = std::max(Val1, Val2);
  4761. int64_t NegVal = std::min(Val1, Val2);
  4762. if (!((Val1 > Val2 && isLTorLE(CC1)) || (Val1 < Val2 && isLTorLE(CC2))) ||
  4763. !isPowerOf2_64(PosVal + 1))
  4764. return SDValue();
  // Handle the difference between USAT (unsigned) and SSAT (signed)
  // saturation.
  // At this point, PosVal is guaranteed to be positive.
  uint64_t K = PosVal;
  SDLoc dl(Op);
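
  // K is 2^m - 1 here; countTrailingOnes(K) recovers m, which is the
  // saturating bit-position operand given to the ARMISD::SSAT/USAT nodes.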
  if (Val1 == ~Val2)
    return DAG.getNode(ARMISD::SSAT, dl, VT, V2Tmp,
                       DAG.getConstant(countTrailingOnes(K), dl, VT));
  if (NegVal == 0)
    return DAG.getNode(ARMISD::USAT, dl, VT, V2Tmp,
                       DAG.getConstant(countTrailingOnes(K), dl, VT));

  return SDValue();
}
// Check if a condition of the type x < k ? k : x can be converted into a
// bit operation instead of conditional moves.
// Currently this is allowed given:
// - The conditions and values match up
// - k is 0 or -1 (all ones)
// This function will not check the last condition; that's up to the caller.
// It returns true if the transformation can be made, and in that case
// returns x in V and k in SatK.
static bool isLowerSaturatingConditional(const SDValue &Op, SDValue &V,
                                         SDValue &SatK) {
  4789. SDValue LHS = Op.getOperand(0);
  4790. SDValue RHS = Op.getOperand(1);
  4791. ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get();
  4792. SDValue TrueVal = Op.getOperand(2);
  4793. SDValue FalseVal = Op.getOperand(3);
  4794. SDValue *K = isa<ConstantSDNode>(LHS) ? &LHS : isa<ConstantSDNode>(RHS)
  4795. ? &RHS
  4796. : nullptr;
  4797. // No constant operation in comparison, early out
  4798. if (!K)
  4799. return false;
  4800. SDValue KTmp = isa<ConstantSDNode>(TrueVal) ? TrueVal : FalseVal;
  4801. V = (KTmp == TrueVal) ? FalseVal : TrueVal;
  4802. SDValue VTmp = (K && *K == LHS) ? RHS : LHS;
  // If the constant in the comparison does not match the constant in the
  // select, or the variable likewise, early out.
  if (*K != KTmp || V != VTmp)
  4806. return false;
  4807. if (isLowerSaturate(LHS, RHS, TrueVal, FalseVal, CC, *K)) {
  4808. SatK = *K;
  4809. return true;
  4810. }
  4811. return false;
  4812. }
  4813. bool ARMTargetLowering::isUnsupportedFloatingType(EVT VT) const {
  4814. if (VT == MVT::f32)
  4815. return !Subtarget->hasVFP2Base();
  4816. if (VT == MVT::f64)
  4817. return !Subtarget->hasFP64();
  4818. if (VT == MVT::f16)
  4819. return !Subtarget->hasFullFP16();
  4820. return false;
  4821. }
  4822. SDValue ARMTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
  4823. EVT VT = Op.getValueType();
  4824. SDLoc dl(Op);
  4825. // Try to convert two saturating conditional selects into a single SSAT
  4826. if ((!Subtarget->isThumb() && Subtarget->hasV6Ops()) || Subtarget->isThumb2())
  4827. if (SDValue SatValue = LowerSaturatingConditional(Op, DAG))
  4828. return SatValue;
  4829. // Try to convert expressions of the form x < k ? k : x (and similar forms)
  4830. // into more efficient bit operations, which is possible when k is 0 or -1
  4831. // On ARM and Thumb-2 which have flexible operand 2 this will result in
  4832. // single instructions. On Thumb the shift and the bit operation will be two
  4833. // instructions.
  4834. // Only allow this transformation on full-width (32-bit) operations
  4835. SDValue LowerSatConstant;
  4836. SDValue SatValue;
  4837. if (VT == MVT::i32 &&
  4838. isLowerSaturatingConditional(Op, SatValue, LowerSatConstant)) {
  4839. SDValue ShiftV = DAG.getNode(ISD::SRA, dl, VT, SatValue,
  4840. DAG.getConstant(31, dl, VT));
  4841. if (isNullConstant(LowerSatConstant)) {
  4842. SDValue NotShiftV = DAG.getNode(ISD::XOR, dl, VT, ShiftV,
  4843. DAG.getAllOnesConstant(dl, VT));
  4844. return DAG.getNode(ISD::AND, dl, VT, SatValue, NotShiftV);
  4845. } else if (isAllOnesConstant(LowerSatConstant))
  4846. return DAG.getNode(ISD::OR, dl, VT, SatValue, ShiftV);
  4847. }
  4848. SDValue LHS = Op.getOperand(0);
  4849. SDValue RHS = Op.getOperand(1);
  4850. ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get();
  4851. SDValue TrueVal = Op.getOperand(2);
  4852. SDValue FalseVal = Op.getOperand(3);
  4853. ConstantSDNode *CFVal = dyn_cast<ConstantSDNode>(FalseVal);
  4854. ConstantSDNode *CTVal = dyn_cast<ConstantSDNode>(TrueVal);
  if (Subtarget->hasV8_1MMainlineOps() && CFVal && CTVal &&
      LHS.getValueType() == MVT::i32 && RHS.getValueType() == MVT::i32) {
    unsigned TVal = CTVal->getZExtValue();
    unsigned FVal = CFVal->getZExtValue();
    unsigned Opcode = 0;
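    // Pick a v8.1-M conditional-select form based on how the two constants
    // relate: CSINV when TVal == ~FVal, CSNEG when TVal == -FVal, and CSINC
    // when they differ by one (swapping the operands and inverting the
    // condition if TVal is the larger of the two).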
    if (TVal == ~FVal) {
      Opcode = ARMISD::CSINV;
    } else if (TVal == ~FVal + 1) {
      Opcode = ARMISD::CSNEG;
    } else if (TVal + 1 == FVal) {
      Opcode = ARMISD::CSINC;
    } else if (TVal == FVal + 1) {
      Opcode = ARMISD::CSINC;
      std::swap(TrueVal, FalseVal);
      std::swap(TVal, FVal);
      CC = ISD::getSetCCInverse(CC, LHS.getValueType());
    }

    if (Opcode) {
      // If one of the constants is cheaper than the other, materialise the
      // cheaper one and let the csel generate the other.
      if (Opcode != ARMISD::CSINC &&
          HasLowerConstantMaterializationCost(FVal, TVal, Subtarget)) {
        std::swap(TrueVal, FalseVal);
        std::swap(TVal, FVal);
        CC = ISD::getSetCCInverse(CC, LHS.getValueType());
      }

      // Attempt to use ZR by making TVal zero, possibly inverting the
      // condition to get there. CSINC is not invertible like the other two
      // (~(~a) == a and -(-a) == a, but (a+1)+1 != a).
      if (FVal == 0 && Opcode != ARMISD::CSINC) {
        std::swap(TrueVal, FalseVal);
        std::swap(TVal, FVal);
        CC = ISD::getSetCCInverse(CC, LHS.getValueType());
      }

      // Drop FVal's value because we can get it by inverting/negating TVal.
      FalseVal = TrueVal;

      SDValue ARMcc;
      SDValue Cmp = getARMCmp(LHS, RHS, CC, ARMcc, DAG, dl);
      EVT VT = TrueVal.getValueType();
      return DAG.getNode(Opcode, dl, VT, TrueVal, FalseVal, ARMcc, Cmp);
    }
  }
  4897. if (isUnsupportedFloatingType(LHS.getValueType())) {
  4898. DAG.getTargetLoweringInfo().softenSetCCOperands(
  4899. DAG, LHS.getValueType(), LHS, RHS, CC, dl, LHS, RHS);
  4900. // If softenSetCCOperands only returned one value, we should compare it to
  4901. // zero.
  4902. if (!RHS.getNode()) {
  4903. RHS = DAG.getConstant(0, dl, LHS.getValueType());
  4904. CC = ISD::SETNE;
  4905. }
  4906. }
  4907. if (LHS.getValueType() == MVT::i32) {
  4908. // Try to generate VSEL on ARMv8.
  4909. // The VSEL instruction can't use all the usual ARM condition
  4910. // codes: it only has two bits to select the condition code, so it's
  4911. // constrained to use only GE, GT, VS and EQ.
  4912. //
  4913. // To implement all the various ISD::SETXXX opcodes, we sometimes need to
  4914. // swap the operands of the previous compare instruction (effectively
  4915. // inverting the compare condition, swapping 'less' and 'greater') and
  4916. // sometimes need to swap the operands to the VSEL (which inverts the
  4917. // condition in the sense of firing whenever the previous condition didn't)
  4918. if (Subtarget->hasFPARMv8Base() && (TrueVal.getValueType() == MVT::f16 ||
  4919. TrueVal.getValueType() == MVT::f32 ||
  4920. TrueVal.getValueType() == MVT::f64)) {
  4921. ARMCC::CondCodes CondCode = IntCCToARMCC(CC);
  4922. if (CondCode == ARMCC::LT || CondCode == ARMCC::LE ||
  4923. CondCode == ARMCC::VC || CondCode == ARMCC::NE) {
  4924. CC = ISD::getSetCCInverse(CC, LHS.getValueType());
  4925. std::swap(TrueVal, FalseVal);
  4926. }
  4927. }
  4928. SDValue ARMcc;
  4929. SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
  4930. SDValue Cmp = getARMCmp(LHS, RHS, CC, ARMcc, DAG, dl);
    // Choose GE over PL, since vsel does not support PL.
  4932. if (cast<ConstantSDNode>(ARMcc)->getZExtValue() == ARMCC::PL)
  4933. ARMcc = DAG.getConstant(ARMCC::GE, dl, MVT::i32);
  4934. return getCMOV(dl, VT, FalseVal, TrueVal, ARMcc, CCR, Cmp, DAG);
  4935. }
  4936. ARMCC::CondCodes CondCode, CondCode2;
  4937. FPCCToARMCC(CC, CondCode, CondCode2);
  4938. // Normalize the fp compare. If RHS is zero we prefer to keep it there so we
  4939. // match CMPFPw0 instead of CMPFP, though we don't do this for f16 because we
  4940. // must use VSEL (limited condition codes), due to not having conditional f16
  4941. // moves.
  4942. if (Subtarget->hasFPARMv8Base() &&
  4943. !(isFloatingPointZero(RHS) && TrueVal.getValueType() != MVT::f16) &&
  4944. (TrueVal.getValueType() == MVT::f16 ||
  4945. TrueVal.getValueType() == MVT::f32 ||
  4946. TrueVal.getValueType() == MVT::f64)) {
  4947. bool swpCmpOps = false;
  4948. bool swpVselOps = false;
  4949. checkVSELConstraints(CC, CondCode, swpCmpOps, swpVselOps);
  4950. if (CondCode == ARMCC::GT || CondCode == ARMCC::GE ||
  4951. CondCode == ARMCC::VS || CondCode == ARMCC::EQ) {
  4952. if (swpCmpOps)
  4953. std::swap(LHS, RHS);
  4954. if (swpVselOps)
  4955. std::swap(TrueVal, FalseVal);
  4956. }
  4957. }
  4958. SDValue ARMcc = DAG.getConstant(CondCode, dl, MVT::i32);
  4959. SDValue Cmp = getVFPCmp(LHS, RHS, DAG, dl);
  4960. SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
  4961. SDValue Result = getCMOV(dl, VT, FalseVal, TrueVal, ARMcc, CCR, Cmp, DAG);
  4962. if (CondCode2 != ARMCC::AL) {
  4963. SDValue ARMcc2 = DAG.getConstant(CondCode2, dl, MVT::i32);
  4964. // FIXME: Needs another CMP because flag can have but one use.
  4965. SDValue Cmp2 = getVFPCmp(LHS, RHS, DAG, dl);
  4966. Result = getCMOV(dl, VT, Result, TrueVal, ARMcc2, CCR, Cmp2, DAG);
  4967. }
  4968. return Result;
  4969. }
  4970. /// canChangeToInt - Given the fp compare operand, return true if it is suitable
  4971. /// to morph to an integer compare sequence.
  4972. static bool canChangeToInt(SDValue Op, bool &SeenZero,
  4973. const ARMSubtarget *Subtarget) {
  4974. SDNode *N = Op.getNode();
  4975. if (!N->hasOneUse())
  4976. // Otherwise it requires moving the value from fp to integer registers.
  4977. return false;
  4978. if (!N->getNumValues())
  4979. return false;
  4980. EVT VT = Op.getValueType();
  4981. if (VT != MVT::f32 && !Subtarget->isFPBrccSlow())
  4982. // f32 case is generally profitable. f64 case only makes sense when vcmpe +
  4983. // vmrs are very slow, e.g. cortex-a8.
  4984. return false;
  4985. if (isFloatingPointZero(Op)) {
  4986. SeenZero = true;
  4987. return true;
  4988. }
  4989. return ISD::isNormalLoad(N);
  4990. }
  4991. static SDValue bitcastf32Toi32(SDValue Op, SelectionDAG &DAG) {
  4992. if (isFloatingPointZero(Op))
  4993. return DAG.getConstant(0, SDLoc(Op), MVT::i32);
  4994. if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Op))
  4995. return DAG.getLoad(MVT::i32, SDLoc(Op), Ld->getChain(), Ld->getBasePtr(),
  4996. Ld->getPointerInfo(), Ld->getAlignment(),
  4997. Ld->getMemOperand()->getFlags());
  4998. llvm_unreachable("Unknown VFP cmp argument!");
  4999. }
static void expandf64Toi32(SDValue Op, SelectionDAG &DAG,
                           SDValue &RetVal1, SDValue &RetVal2) {
  SDLoc dl(Op);
  if (isFloatingPointZero(Op)) {
    RetVal1 = DAG.getConstant(0, dl, MVT::i32);
    RetVal2 = DAG.getConstant(0, dl, MVT::i32);
    return;
  }

  if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Op)) {
    SDValue Ptr = Ld->getBasePtr();
    RetVal1 =
        DAG.getLoad(MVT::i32, dl, Ld->getChain(), Ptr, Ld->getPointerInfo(),
                    Ld->getAlignment(), Ld->getMemOperand()->getFlags());

    EVT PtrType = Ptr.getValueType();
    unsigned NewAlign = MinAlign(Ld->getAlignment(), 4);
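    // The second word of the f64 lives 4 bytes above the first; load it from
    // the original pointer plus 4, with the alignment reduced accordingly.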
    SDValue NewPtr = DAG.getNode(ISD::ADD, dl,
                                 PtrType, Ptr, DAG.getConstant(4, dl, PtrType));
    RetVal2 = DAG.getLoad(MVT::i32, dl, Ld->getChain(), NewPtr,
                          Ld->getPointerInfo().getWithOffset(4), NewAlign,
                          Ld->getMemOperand()->getFlags());
    return;
  }

  llvm_unreachable("Unknown VFP cmp argument!");
}
  5024. /// OptimizeVFPBrcond - With -enable-unsafe-fp-math, it's legal to optimize some
  5025. /// f32 and even f64 comparisons to integer ones.
  5026. SDValue
  5027. ARMTargetLowering::OptimizeVFPBrcond(SDValue Op, SelectionDAG &DAG) const {
  5028. SDValue Chain = Op.getOperand(0);
  5029. ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(1))->get();
  5030. SDValue LHS = Op.getOperand(2);
  5031. SDValue RHS = Op.getOperand(3);
  5032. SDValue Dest = Op.getOperand(4);
  5033. SDLoc dl(Op);
  5034. bool LHSSeenZero = false;
  5035. bool LHSOk = canChangeToInt(LHS, LHSSeenZero, Subtarget);
  5036. bool RHSSeenZero = false;
  5037. bool RHSOk = canChangeToInt(RHS, RHSSeenZero, Subtarget);
  if (LHSOk && RHSOk && (LHSSeenZero || RHSSeenZero)) {
    // If unsafe fp math optimization is enabled and there are no other uses
    // of the CMP operands, and the condition code is EQ or NE, we can
    // optimize it to an integer comparison.
    if (CC == ISD::SETOEQ)
      CC = ISD::SETEQ;
    else if (CC == ISD::SETUNE)
      CC = ISD::SETNE;
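
    // One side is known to be (floating-point) zero, so masking off the sign
    // bit lets -0.0 and +0.0 compare equal once treated as integers.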
  5046. SDValue Mask = DAG.getConstant(0x7fffffff, dl, MVT::i32);
  5047. SDValue ARMcc;
  5048. if (LHS.getValueType() == MVT::f32) {
  5049. LHS = DAG.getNode(ISD::AND, dl, MVT::i32,
  5050. bitcastf32Toi32(LHS, DAG), Mask);
  5051. RHS = DAG.getNode(ISD::AND, dl, MVT::i32,
  5052. bitcastf32Toi32(RHS, DAG), Mask);
  5053. SDValue Cmp = getARMCmp(LHS, RHS, CC, ARMcc, DAG, dl);
  5054. SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
  5055. return DAG.getNode(ARMISD::BRCOND, dl, MVT::Other,
  5056. Chain, Dest, ARMcc, CCR, Cmp);
  5057. }
  5058. SDValue LHS1, LHS2;
  5059. SDValue RHS1, RHS2;
  5060. expandf64Toi32(LHS, DAG, LHS1, LHS2);
  5061. expandf64Toi32(RHS, DAG, RHS1, RHS2);
  5062. LHS2 = DAG.getNode(ISD::AND, dl, MVT::i32, LHS2, Mask);
  5063. RHS2 = DAG.getNode(ISD::AND, dl, MVT::i32, RHS2, Mask);
  5064. ARMCC::CondCodes CondCode = IntCCToARMCC(CC);
  5065. ARMcc = DAG.getConstant(CondCode, dl, MVT::i32);
  5066. SDVTList VTList = DAG.getVTList(MVT::Other, MVT::Glue);
  5067. SDValue Ops[] = { Chain, ARMcc, LHS1, LHS2, RHS1, RHS2, Dest };
  5068. return DAG.getNode(ARMISD::BCC_i64, dl, VTList, Ops);
  5069. }
  5070. return SDValue();
  5071. }
  5072. SDValue ARMTargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
  5073. SDValue Chain = Op.getOperand(0);
  5074. SDValue Cond = Op.getOperand(1);
  5075. SDValue Dest = Op.getOperand(2);
  5076. SDLoc dl(Op);
  5077. // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a branch
  5078. // instruction.
  5079. unsigned Opc = Cond.getOpcode();
  5080. bool OptimizeMul = (Opc == ISD::SMULO || Opc == ISD::UMULO) &&
  5081. !Subtarget->isThumb1Only();
  5082. if (Cond.getResNo() == 1 &&
  5083. (Opc == ISD::SADDO || Opc == ISD::UADDO || Opc == ISD::SSUBO ||
  5084. Opc == ISD::USUBO || OptimizeMul)) {
  5085. // Only lower legal XALUO ops.
  5086. if (!DAG.getTargetLoweringInfo().isTypeLegal(Cond->getValueType(0)))
  5087. return SDValue();
  5088. // The actual operation with overflow check.
  5089. SDValue Value, OverflowCmp;
  5090. SDValue ARMcc;
  5091. std::tie(Value, OverflowCmp) = getARMXALUOOp(Cond, DAG, ARMcc);
  5092. // Reverse the condition code.
  5093. ARMCC::CondCodes CondCode =
  5094. (ARMCC::CondCodes)cast<const ConstantSDNode>(ARMcc)->getZExtValue();
  5095. CondCode = ARMCC::getOppositeCondition(CondCode);
  5096. ARMcc = DAG.getConstant(CondCode, SDLoc(ARMcc), MVT::i32);
  5097. SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
  5098. return DAG.getNode(ARMISD::BRCOND, dl, MVT::Other, Chain, Dest, ARMcc, CCR,
  5099. OverflowCmp);
  5100. }
  5101. return SDValue();
  5102. }
  5103. SDValue ARMTargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const {
  5104. SDValue Chain = Op.getOperand(0);
  5105. ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(1))->get();
  5106. SDValue LHS = Op.getOperand(2);
  5107. SDValue RHS = Op.getOperand(3);
  5108. SDValue Dest = Op.getOperand(4);
  5109. SDLoc dl(Op);
  5110. if (isUnsupportedFloatingType(LHS.getValueType())) {
  5111. DAG.getTargetLoweringInfo().softenSetCCOperands(
  5112. DAG, LHS.getValueType(), LHS, RHS, CC, dl, LHS, RHS);
  5113. // If softenSetCCOperands only returned one value, we should compare it to
  5114. // zero.
  5115. if (!RHS.getNode()) {
  5116. RHS = DAG.getConstant(0, dl, LHS.getValueType());
  5117. CC = ISD::SETNE;
  5118. }
  5119. }
  5120. // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a branch
  5121. // instruction.
  5122. unsigned Opc = LHS.getOpcode();
  5123. bool OptimizeMul = (Opc == ISD::SMULO || Opc == ISD::UMULO) &&
  5124. !Subtarget->isThumb1Only();
  5125. if (LHS.getResNo() == 1 && (isOneConstant(RHS) || isNullConstant(RHS)) &&
  5126. (Opc == ISD::SADDO || Opc == ISD::UADDO || Opc == ISD::SSUBO ||
  5127. Opc == ISD::USUBO || OptimizeMul) &&
  5128. (CC == ISD::SETEQ || CC == ISD::SETNE)) {
  5129. // Only lower legal XALUO ops.
  5130. if (!DAG.getTargetLoweringInfo().isTypeLegal(LHS->getValueType(0)))
  5131. return SDValue();
  5132. // The actual operation with overflow check.
  5133. SDValue Value, OverflowCmp;
  5134. SDValue ARMcc;
  5135. std::tie(Value, OverflowCmp) = getARMXALUOOp(LHS.getValue(0), DAG, ARMcc);
  5136. if ((CC == ISD::SETNE) != isOneConstant(RHS)) {
  5137. // Reverse the condition code.
  5138. ARMCC::CondCodes CondCode =
  5139. (ARMCC::CondCodes)cast<const ConstantSDNode>(ARMcc)->getZExtValue();
  5140. CondCode = ARMCC::getOppositeCondition(CondCode);
  5141. ARMcc = DAG.getConstant(CondCode, SDLoc(ARMcc), MVT::i32);
  5142. }
  5143. SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
  5144. return DAG.getNode(ARMISD::BRCOND, dl, MVT::Other, Chain, Dest, ARMcc, CCR,
  5145. OverflowCmp);
  5146. }
  5147. if (LHS.getValueType() == MVT::i32) {
  5148. SDValue ARMcc;
  5149. SDValue Cmp = getARMCmp(LHS, RHS, CC, ARMcc, DAG, dl);
  5150. SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
  5151. return DAG.getNode(ARMISD::BRCOND, dl, MVT::Other,
  5152. Chain, Dest, ARMcc, CCR, Cmp);
  5153. }
  5154. if (getTargetMachine().Options.UnsafeFPMath &&
  5155. (CC == ISD::SETEQ || CC == ISD::SETOEQ ||
  5156. CC == ISD::SETNE || CC == ISD::SETUNE)) {
  5157. if (SDValue Result = OptimizeVFPBrcond(Op, DAG))
  5158. return Result;
  5159. }
  5160. ARMCC::CondCodes CondCode, CondCode2;
  5161. FPCCToARMCC(CC, CondCode, CondCode2);
  5162. SDValue ARMcc = DAG.getConstant(CondCode, dl, MVT::i32);
  5163. SDValue Cmp = getVFPCmp(LHS, RHS, DAG, dl);
  5164. SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
  5165. SDVTList VTList = DAG.getVTList(MVT::Other, MVT::Glue);
  5166. SDValue Ops[] = { Chain, Dest, ARMcc, CCR, Cmp };
  5167. SDValue Res = DAG.getNode(ARMISD::BRCOND, dl, VTList, Ops);
  5168. if (CondCode2 != ARMCC::AL) {
  5169. ARMcc = DAG.getConstant(CondCode2, dl, MVT::i32);
  5170. SDValue Ops[] = { Res, Dest, ARMcc, CCR, Res.getValue(1) };
  5171. Res = DAG.getNode(ARMISD::BRCOND, dl, VTList, Ops);
  5172. }
  5173. return Res;
  5174. }
  5175. SDValue ARMTargetLowering::LowerBR_JT(SDValue Op, SelectionDAG &DAG) const {
  5176. SDValue Chain = Op.getOperand(0);
  5177. SDValue Table = Op.getOperand(1);
  5178. SDValue Index = Op.getOperand(2);
  5179. SDLoc dl(Op);
  5180. EVT PTy = getPointerTy(DAG.getDataLayout());
  5181. JumpTableSDNode *JT = cast<JumpTableSDNode>(Table);
  5182. SDValue JTI = DAG.getTargetJumpTable(JT->getIndex(), PTy);
  5183. Table = DAG.getNode(ARMISD::WrapperJT, dl, MVT::i32, JTI);
  5184. Index = DAG.getNode(ISD::MUL, dl, PTy, Index, DAG.getConstant(4, dl, PTy));
  5185. SDValue Addr = DAG.getNode(ISD::ADD, dl, PTy, Table, Index);
  5186. if (Subtarget->isThumb2() || (Subtarget->hasV8MBaselineOps() && Subtarget->isThumb())) {
  5187. // Thumb2 and ARMv8-M use a two-level jump. That is, it jumps into the jump table
  5188. // which does another jump to the destination. This also makes it easier
  5189. // to translate it to TBB / TBH later (Thumb2 only).
  5190. // FIXME: This might not work if the function is extremely large.
  5191. return DAG.getNode(ARMISD::BR2_JT, dl, MVT::Other, Chain,
  5192. Addr, Op.getOperand(2), JTI);
  5193. }
  5194. if (isPositionIndependent() || Subtarget->isROPI()) {
  5195. Addr =
  5196. DAG.getLoad((EVT)MVT::i32, dl, Chain, Addr,
  5197. MachinePointerInfo::getJumpTable(DAG.getMachineFunction()));
  5198. Chain = Addr.getValue(1);
  5199. Addr = DAG.getNode(ISD::ADD, dl, PTy, Table, Addr);
  5200. return DAG.getNode(ARMISD::BR_JT, dl, MVT::Other, Chain, Addr, JTI);
  5201. } else {
  5202. Addr =
  5203. DAG.getLoad(PTy, dl, Chain, Addr,
  5204. MachinePointerInfo::getJumpTable(DAG.getMachineFunction()));
  5205. Chain = Addr.getValue(1);
  5206. return DAG.getNode(ARMISD::BR_JT, dl, MVT::Other, Chain, Addr, JTI);
  5207. }
  5208. }
  5209. static SDValue LowerVectorFP_TO_INT(SDValue Op, SelectionDAG &DAG) {
  5210. EVT VT = Op.getValueType();
  5211. SDLoc dl(Op);
  5212. if (Op.getValueType().getVectorElementType() == MVT::i32) {
  5213. if (Op.getOperand(0).getValueType().getVectorElementType() == MVT::f32)
  5214. return Op;
  5215. return DAG.UnrollVectorOp(Op.getNode());
  5216. }
  5217. const bool HasFullFP16 =
  5218. static_cast<const ARMSubtarget&>(DAG.getSubtarget()).hasFullFP16();
  5219. EVT NewTy;
  5220. const EVT OpTy = Op.getOperand(0).getValueType();
  5221. if (OpTy == MVT::v4f32)
  5222. NewTy = MVT::v4i32;
  5223. else if (OpTy == MVT::v4f16 && HasFullFP16)
  5224. NewTy = MVT::v4i16;
  5225. else if (OpTy == MVT::v8f16 && HasFullFP16)
  5226. NewTy = MVT::v8i16;
  5227. else
  5228. llvm_unreachable("Invalid type for custom lowering!");
  5229. if (VT != MVT::v4i16 && VT != MVT::v8i16)
  5230. return DAG.UnrollVectorOp(Op.getNode());
  5231. Op = DAG.getNode(Op.getOpcode(), dl, NewTy, Op.getOperand(0));
  5232. return DAG.getNode(ISD::TRUNCATE, dl, VT, Op);
  5233. }
  5234. SDValue ARMTargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const {
  5235. EVT VT = Op.getValueType();
  5236. if (VT.isVector())
  5237. return LowerVectorFP_TO_INT(Op, DAG);
  5238. bool IsStrict = Op->isStrictFPOpcode();
  5239. SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0);
  5240. if (isUnsupportedFloatingType(SrcVal.getValueType())) {
  5241. RTLIB::Libcall LC;
  5242. if (Op.getOpcode() == ISD::FP_TO_SINT ||
  5243. Op.getOpcode() == ISD::STRICT_FP_TO_SINT)
  5244. LC = RTLIB::getFPTOSINT(SrcVal.getValueType(),
  5245. Op.getValueType());
  5246. else
  5247. LC = RTLIB::getFPTOUINT(SrcVal.getValueType(),
  5248. Op.getValueType());
  5249. SDLoc Loc(Op);
  5250. MakeLibCallOptions CallOptions;
  5251. SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
  5252. SDValue Result;
  5253. std::tie(Result, Chain) = makeLibCall(DAG, LC, Op.getValueType(), SrcVal,
  5254. CallOptions, Loc, Chain);
  5255. return IsStrict ? DAG.getMergeValues({Result, Chain}, Loc) : Result;
  5256. }
  5257. // FIXME: Remove this when we have strict fp instruction selection patterns
  5258. if (IsStrict) {
  5259. SDLoc Loc(Op);
  5260. SDValue Result =
  5261. DAG.getNode(Op.getOpcode() == ISD::STRICT_FP_TO_SINT ? ISD::FP_TO_SINT
  5262. : ISD::FP_TO_UINT,
  5263. Loc, Op.getValueType(), SrcVal);
  5264. return DAG.getMergeValues({Result, Op.getOperand(0)}, Loc);
  5265. }
  5266. return Op;
  5267. }
  5268. static SDValue LowerFP_TO_INT_SAT(SDValue Op, SelectionDAG &DAG,
  5269. const ARMSubtarget *Subtarget) {
  5270. EVT VT = Op.getValueType();
  5271. EVT ToVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
  5272. EVT FromVT = Op.getOperand(0).getValueType();
  5273. if (VT == MVT::i32 && ToVT == MVT::i32 && FromVT == MVT::f32)
  5274. return Op;
  5275. if (VT == MVT::i32 && ToVT == MVT::i32 && FromVT == MVT::f64 &&
  5276. Subtarget->hasFP64())
  5277. return Op;
  5278. if (VT == MVT::i32 && ToVT == MVT::i32 && FromVT == MVT::f16 &&
  5279. Subtarget->hasFullFP16())
  5280. return Op;
  5281. if (VT == MVT::v4i32 && ToVT == MVT::i32 && FromVT == MVT::v4f32 &&
  5282. Subtarget->hasMVEFloatOps())
  5283. return Op;
  5284. if (VT == MVT::v8i16 && ToVT == MVT::i16 && FromVT == MVT::v8f16 &&
  5285. Subtarget->hasMVEFloatOps())
  5286. return Op;
  5287. if (FromVT != MVT::v4f32 && FromVT != MVT::v8f16)
  5288. return SDValue();
  5289. SDLoc DL(Op);
  5290. bool IsSigned = Op.getOpcode() == ISD::FP_TO_SINT_SAT;
  5291. unsigned BW = ToVT.getScalarSizeInBits() - IsSigned;
  5292. SDValue CVT = DAG.getNode(Op.getOpcode(), DL, VT, Op.getOperand(0),
  5293. DAG.getValueType(VT.getScalarType()));
  5294. SDValue Max = DAG.getNode(IsSigned ? ISD::SMIN : ISD::UMIN, DL, VT, CVT,
  5295. DAG.getConstant((1 << BW) - 1, DL, VT));
  5296. if (IsSigned)
  5297. Max = DAG.getNode(ISD::SMAX, DL, VT, Max,
  5298. DAG.getConstant(-(1 << BW), DL, VT));
  5299. return Max;
  5300. }
  5301. static SDValue LowerVectorINT_TO_FP(SDValue Op, SelectionDAG &DAG) {
  5302. EVT VT = Op.getValueType();
  5303. SDLoc dl(Op);
  5304. if (Op.getOperand(0).getValueType().getVectorElementType() == MVT::i32) {
  5305. if (VT.getVectorElementType() == MVT::f32)
  5306. return Op;
  5307. return DAG.UnrollVectorOp(Op.getNode());
  5308. }
  5309. assert((Op.getOperand(0).getValueType() == MVT::v4i16 ||
  5310. Op.getOperand(0).getValueType() == MVT::v8i16) &&
  5311. "Invalid type for custom lowering!");
  5312. const bool HasFullFP16 =
  5313. static_cast<const ARMSubtarget&>(DAG.getSubtarget()).hasFullFP16();
  5314. EVT DestVecType;
  5315. if (VT == MVT::v4f32)
  5316. DestVecType = MVT::v4i32;
  5317. else if (VT == MVT::v4f16 && HasFullFP16)
  5318. DestVecType = MVT::v4i16;
  5319. else if (VT == MVT::v8f16 && HasFullFP16)
  5320. DestVecType = MVT::v8i16;
  5321. else
  5322. return DAG.UnrollVectorOp(Op.getNode());
  5323. unsigned CastOpc;
  5324. unsigned Opc;
  5325. switch (Op.getOpcode()) {
  5326. default: llvm_unreachable("Invalid opcode!");
  5327. case ISD::SINT_TO_FP:
  5328. CastOpc = ISD::SIGN_EXTEND;
  5329. Opc = ISD::SINT_TO_FP;
  5330. break;
  5331. case ISD::UINT_TO_FP:
  5332. CastOpc = ISD::ZERO_EXTEND;
  5333. Opc = ISD::UINT_TO_FP;
  5334. break;
  5335. }
  5336. Op = DAG.getNode(CastOpc, dl, DestVecType, Op.getOperand(0));
  5337. return DAG.getNode(Opc, dl, VT, Op);
  5338. }
  5339. SDValue ARMTargetLowering::LowerINT_TO_FP(SDValue Op, SelectionDAG &DAG) const {
  5340. EVT VT = Op.getValueType();
  5341. if (VT.isVector())
  5342. return LowerVectorINT_TO_FP(Op, DAG);
  5343. if (isUnsupportedFloatingType(VT)) {
  5344. RTLIB::Libcall LC;
  5345. if (Op.getOpcode() == ISD::SINT_TO_FP)
  5346. LC = RTLIB::getSINTTOFP(Op.getOperand(0).getValueType(),
  5347. Op.getValueType());
  5348. else
  5349. LC = RTLIB::getUINTTOFP(Op.getOperand(0).getValueType(),
  5350. Op.getValueType());
  5351. MakeLibCallOptions CallOptions;
  5352. return makeLibCall(DAG, LC, Op.getValueType(), Op.getOperand(0),
  5353. CallOptions, SDLoc(Op)).first;
  5354. }
  5355. return Op;
  5356. }
  5357. SDValue ARMTargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const {
  5358. // Implement fcopysign with a fabs and a conditional fneg.
  5359. SDValue Tmp0 = Op.getOperand(0);
  5360. SDValue Tmp1 = Op.getOperand(1);
  5361. SDLoc dl(Op);
  5362. EVT VT = Op.getValueType();
  5363. EVT SrcVT = Tmp1.getValueType();
  5364. bool InGPR = Tmp0.getOpcode() == ISD::BITCAST ||
  5365. Tmp0.getOpcode() == ARMISD::VMOVDRR;
  5366. bool UseNEON = !InGPR && Subtarget->hasNEON();
  5367. if (UseNEON) {
  5368. // Use VBSL to copy the sign bit.
  5369. unsigned EncodedVal = ARM_AM::createVMOVModImm(0x6, 0x80);
  5370. SDValue Mask = DAG.getNode(ARMISD::VMOVIMM, dl, MVT::v2i32,
  5371. DAG.getTargetConstant(EncodedVal, dl, MVT::i32));
  5372. EVT OpVT = (VT == MVT::f32) ? MVT::v2i32 : MVT::v1i64;
  5373. if (VT == MVT::f64)
  5374. Mask = DAG.getNode(ARMISD::VSHLIMM, dl, OpVT,
  5375. DAG.getNode(ISD::BITCAST, dl, OpVT, Mask),
  5376. DAG.getConstant(32, dl, MVT::i32));
  5377. else /*if (VT == MVT::f32)*/
  5378. Tmp0 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f32, Tmp0);
  5379. if (SrcVT == MVT::f32) {
  5380. Tmp1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f32, Tmp1);
  5381. if (VT == MVT::f64)
  5382. Tmp1 = DAG.getNode(ARMISD::VSHLIMM, dl, OpVT,
  5383. DAG.getNode(ISD::BITCAST, dl, OpVT, Tmp1),
  5384. DAG.getConstant(32, dl, MVT::i32));
  5385. } else if (VT == MVT::f32)
  5386. Tmp1 = DAG.getNode(ARMISD::VSHRuIMM, dl, MVT::v1i64,
  5387. DAG.getNode(ISD::BITCAST, dl, MVT::v1i64, Tmp1),
  5388. DAG.getConstant(32, dl, MVT::i32));
  5389. Tmp0 = DAG.getNode(ISD::BITCAST, dl, OpVT, Tmp0);
  5390. Tmp1 = DAG.getNode(ISD::BITCAST, dl, OpVT, Tmp1);
  5391. SDValue AllOnes = DAG.getTargetConstant(ARM_AM::createVMOVModImm(0xe, 0xff),
  5392. dl, MVT::i32);
  5393. AllOnes = DAG.getNode(ARMISD::VMOVIMM, dl, MVT::v8i8, AllOnes);
  5394. SDValue MaskNot = DAG.getNode(ISD::XOR, dl, OpVT, Mask,
  5395. DAG.getNode(ISD::BITCAST, dl, OpVT, AllOnes));
  5396. SDValue Res = DAG.getNode(ISD::OR, dl, OpVT,
  5397. DAG.getNode(ISD::AND, dl, OpVT, Tmp1, Mask),
  5398. DAG.getNode(ISD::AND, dl, OpVT, Tmp0, MaskNot));
  5399. if (VT == MVT::f32) {
  5400. Res = DAG.getNode(ISD::BITCAST, dl, MVT::v2f32, Res);
  5401. Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, Res,
  5402. DAG.getConstant(0, dl, MVT::i32));
  5403. } else {
  5404. Res = DAG.getNode(ISD::BITCAST, dl, MVT::f64, Res);
  5405. }
  5406. return Res;
  5407. }
  5408. // Bitcast operand 1 to i32.
  5409. if (SrcVT == MVT::f64)
  5410. Tmp1 = DAG.getNode(ARMISD::VMOVRRD, dl, DAG.getVTList(MVT::i32, MVT::i32),
  5411. Tmp1).getValue(1);
  5412. Tmp1 = DAG.getNode(ISD::BITCAST, dl, MVT::i32, Tmp1);
  5413. // Or in the signbit with integer operations.
  5414. SDValue Mask1 = DAG.getConstant(0x80000000, dl, MVT::i32);
  5415. SDValue Mask2 = DAG.getConstant(0x7fffffff, dl, MVT::i32);
  5416. Tmp1 = DAG.getNode(ISD::AND, dl, MVT::i32, Tmp1, Mask1);
  5417. if (VT == MVT::f32) {
  5418. Tmp0 = DAG.getNode(ISD::AND, dl, MVT::i32,
  5419. DAG.getNode(ISD::BITCAST, dl, MVT::i32, Tmp0), Mask2);
  5420. return DAG.getNode(ISD::BITCAST, dl, MVT::f32,
  5421. DAG.getNode(ISD::OR, dl, MVT::i32, Tmp0, Tmp1));
  5422. }
  5423. // f64: Or the high part with signbit and then combine two parts.
  5424. Tmp0 = DAG.getNode(ARMISD::VMOVRRD, dl, DAG.getVTList(MVT::i32, MVT::i32),
  5425. Tmp0);
  5426. SDValue Lo = Tmp0.getValue(0);
  5427. SDValue Hi = DAG.getNode(ISD::AND, dl, MVT::i32, Tmp0.getValue(1), Mask2);
  5428. Hi = DAG.getNode(ISD::OR, dl, MVT::i32, Hi, Tmp1);
  5429. return DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi);
  5430. }
  5431. SDValue ARMTargetLowering::LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const{
  5432. MachineFunction &MF = DAG.getMachineFunction();
  5433. MachineFrameInfo &MFI = MF.getFrameInfo();
  5434. MFI.setReturnAddressIsTaken(true);
  5435. if (verifyReturnAddressArgumentIsConstant(Op, DAG))
  5436. return SDValue();
  5437. EVT VT = Op.getValueType();
  5438. SDLoc dl(Op);
  5439. unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
  5440. if (Depth) {
  5441. SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
  5442. SDValue Offset = DAG.getConstant(4, dl, MVT::i32);
  5443. return DAG.getLoad(VT, dl, DAG.getEntryNode(),
  5444. DAG.getNode(ISD::ADD, dl, VT, FrameAddr, Offset),
  5445. MachinePointerInfo());
  5446. }
  5447. // Return LR, which contains the return address. Mark it an implicit live-in.
  5448. Register Reg = MF.addLiveIn(ARM::LR, getRegClassFor(MVT::i32));
  5449. return DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg, VT);
  5450. }
  5451. SDValue ARMTargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
  5452. const ARMBaseRegisterInfo &ARI =
  5453. *static_cast<const ARMBaseRegisterInfo*>(RegInfo);
  5454. MachineFunction &MF = DAG.getMachineFunction();
  5455. MachineFrameInfo &MFI = MF.getFrameInfo();
  5456. MFI.setFrameAddressIsTaken(true);
  5457. EVT VT = Op.getValueType();
  5458. SDLoc dl(Op); // FIXME probably not meaningful
  5459. unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
  5460. Register FrameReg = ARI.getFrameRegister(MF);
  5461. SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT);
  5462. while (Depth--)
  5463. FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr,
  5464. MachinePointerInfo());
  5465. return FrameAddr;
  5466. }
  5467. // FIXME? Maybe this could be a TableGen attribute on some registers and
  5468. // this table could be generated automatically from RegInfo.
  5469. Register ARMTargetLowering::getRegisterByName(const char* RegName, LLT VT,
  5470. const MachineFunction &MF) const {
  5471. Register Reg = StringSwitch<unsigned>(RegName)
  5472. .Case("sp", ARM::SP)
  5473. .Default(0);
  5474. if (Reg)
  5475. return Reg;
  5476. report_fatal_error(Twine("Invalid register name \""
  5477. + StringRef(RegName) + "\"."));
  5478. }
  5479. // Result is 64 bit value so split into two 32 bit values and return as a
  5480. // pair of values.
  5481. static void ExpandREAD_REGISTER(SDNode *N, SmallVectorImpl<SDValue> &Results,
  5482. SelectionDAG &DAG) {
  5483. SDLoc DL(N);
  5484. // This function is only supposed to be called for i64 type destination.
  5485. assert(N->getValueType(0) == MVT::i64
  5486. && "ExpandREAD_REGISTER called for non-i64 type result.");
  5487. SDValue Read = DAG.getNode(ISD::READ_REGISTER, DL,
  5488. DAG.getVTList(MVT::i32, MVT::i32, MVT::Other),
  5489. N->getOperand(0),
  5490. N->getOperand(1));
  5491. Results.push_back(DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Read.getValue(0),
  5492. Read.getValue(1)));
  5493. Results.push_back(Read.getOperand(0));
  5494. }
  5495. /// \p BC is a bitcast that is about to be turned into a VMOVDRR.
  5496. /// When \p DstVT, the destination type of \p BC, is on the vector
  5497. /// register bank and the source of bitcast, \p Op, operates on the same bank,
  5498. /// it might be possible to combine them, such that everything stays on the
  5499. /// vector register bank.
  5500. /// \p return The node that would replace \p BT, if the combine
  5501. /// is possible.
  5502. static SDValue CombineVMOVDRRCandidateWithVecOp(const SDNode *BC,
  5503. SelectionDAG &DAG) {
  5504. SDValue Op = BC->getOperand(0);
  5505. EVT DstVT = BC->getValueType(0);
  5506. // The only vector instruction that can produce a scalar (remember,
  5507. // since the bitcast was about to be turned into VMOVDRR, the source
  5508. // type is i64) from a vector is EXTRACT_VECTOR_ELT.
  5509. // Moreover, we can do this combine only if there is one use.
  5510. // Finally, if the destination type is not a vector, there is not
  5511. // much point on forcing everything on the vector bank.
  5512. if (!DstVT.isVector() || Op.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
  5513. !Op.hasOneUse())
  5514. return SDValue();
  5515. // If the index is not constant, we will introduce an additional
  5516. // multiply that will stick.
  5517. // Give up in that case.
  5518. ConstantSDNode *Index = dyn_cast<ConstantSDNode>(Op.getOperand(1));
  5519. if (!Index)
  5520. return SDValue();
  5521. unsigned DstNumElt = DstVT.getVectorNumElements();
  5522. // Compute the new index.
  5523. const APInt &APIntIndex = Index->getAPIntValue();
  5524. APInt NewIndex(APIntIndex.getBitWidth(), DstNumElt);
  5525. NewIndex *= APIntIndex;
  5526. // Check if the new constant index fits into i32.
  5527. if (NewIndex.getBitWidth() > 32)
  5528. return SDValue();
  5529. // vMTy bitcast(i64 extractelt vNi64 src, i32 index) ->
  5530. // vMTy extractsubvector vNxMTy (bitcast vNi64 src), i32 index*M)
  5531. SDLoc dl(Op);
  5532. SDValue ExtractSrc = Op.getOperand(0);
  5533. EVT VecVT = EVT::getVectorVT(
  5534. *DAG.getContext(), DstVT.getScalarType(),
  5535. ExtractSrc.getValueType().getVectorNumElements() * DstNumElt);
  5536. SDValue BitCast = DAG.getNode(ISD::BITCAST, dl, VecVT, ExtractSrc);
  5537. return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DstVT, BitCast,
  5538. DAG.getConstant(NewIndex.getZExtValue(), dl, MVT::i32));
  5539. }
  5540. /// ExpandBITCAST - If the target supports VFP, this function is called to
  5541. /// expand a bit convert where either the source or destination type is i64 to
  5542. /// use a VMOVDRR or VMOVRRD node. This should not be done when the non-i64
  5543. /// operand type is illegal (e.g., v2f32 for a target that doesn't support
  5544. /// vectors), since the legalizer won't know what to do with that.
  5545. SDValue ARMTargetLowering::ExpandBITCAST(SDNode *N, SelectionDAG &DAG,
  5546. const ARMSubtarget *Subtarget) const {
  5547. const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  5548. SDLoc dl(N);
  5549. SDValue Op = N->getOperand(0);
  5550. // This function is only supposed to be called for i16 and i64 types, either
  5551. // as the source or destination of the bit convert.
  5552. EVT SrcVT = Op.getValueType();
  5553. EVT DstVT = N->getValueType(0);
  5554. if ((SrcVT == MVT::i16 || SrcVT == MVT::i32) &&
  5555. (DstVT == MVT::f16 || DstVT == MVT::bf16))
  5556. return MoveToHPR(SDLoc(N), DAG, MVT::i32, DstVT.getSimpleVT(),
  5557. DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), MVT::i32, Op));
  5558. if ((DstVT == MVT::i16 || DstVT == MVT::i32) &&
  5559. (SrcVT == MVT::f16 || SrcVT == MVT::bf16))
  5560. return DAG.getNode(
  5561. ISD::TRUNCATE, SDLoc(N), DstVT,
  5562. MoveFromHPR(SDLoc(N), DAG, MVT::i32, SrcVT.getSimpleVT(), Op));
  5563. if (!(SrcVT == MVT::i64 || DstVT == MVT::i64))
  5564. return SDValue();
  5565. // Turn i64->f64 into VMOVDRR.
  5566. if (SrcVT == MVT::i64 && TLI.isTypeLegal(DstVT)) {
  5567. // Do not force values to GPRs (this is what VMOVDRR does for the inputs)
  5568. // if we can combine the bitcast with its source.
  5569. if (SDValue Val = CombineVMOVDRRCandidateWithVecOp(N, DAG))
  5570. return Val;
  5571. SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Op,
  5572. DAG.getConstant(0, dl, MVT::i32));
  5573. SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Op,
  5574. DAG.getConstant(1, dl, MVT::i32));
  5575. return DAG.getNode(ISD::BITCAST, dl, DstVT,
  5576. DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi));
  5577. }
  5578. // Turn f64->i64 into VMOVRRD.
  5579. if (DstVT == MVT::i64 && TLI.isTypeLegal(SrcVT)) {
  5580. SDValue Cvt;
  5581. if (DAG.getDataLayout().isBigEndian() && SrcVT.isVector() &&
  5582. SrcVT.getVectorNumElements() > 1)
  5583. Cvt = DAG.getNode(ARMISD::VMOVRRD, dl,
  5584. DAG.getVTList(MVT::i32, MVT::i32),
  5585. DAG.getNode(ARMISD::VREV64, dl, SrcVT, Op));
  5586. else
  5587. Cvt = DAG.getNode(ARMISD::VMOVRRD, dl,
  5588. DAG.getVTList(MVT::i32, MVT::i32), Op);
  5589. // Merge the pieces into a single i64 value.
  5590. return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Cvt, Cvt.getValue(1));
  5591. }
  5592. return SDValue();
  5593. }
  5594. /// getZeroVector - Returns a vector of specified type with all zero elements.
  5595. /// Zero vectors are used to represent vector negation and in those cases
  5596. /// will be implemented with the NEON VNEG instruction. However, VNEG does
  5597. /// not support i64 elements, so sometimes the zero vectors will need to be
  5598. /// explicitly constructed. Regardless, use a canonical VMOV to create the
  5599. /// zero vector.
  5600. static SDValue getZeroVector(EVT VT, SelectionDAG &DAG, const SDLoc &dl) {
  5601. assert(VT.isVector() && "Expected a vector type");
  5602. // The canonical modified immediate encoding of a zero vector is....0!
  5603. SDValue EncodedVal = DAG.getTargetConstant(0, dl, MVT::i32);
  5604. EVT VmovVT = VT.is128BitVector() ? MVT::v4i32 : MVT::v2i32;
  5605. SDValue Vmov = DAG.getNode(ARMISD::VMOVIMM, dl, VmovVT, EncodedVal);
  5606. return DAG.getNode(ISD::BITCAST, dl, VT, Vmov);
  5607. }
  5608. /// LowerShiftRightParts - Lower SRA_PARTS, which returns two
  5609. /// i32 values and take a 2 x i32 value to shift plus a shift amount.
  5610. SDValue ARMTargetLowering::LowerShiftRightParts(SDValue Op,
  5611. SelectionDAG &DAG) const {
  5612. assert(Op.getNumOperands() == 3 && "Not a double-shift!");
  5613. EVT VT = Op.getValueType();
  5614. unsigned VTBits = VT.getSizeInBits();
  5615. SDLoc dl(Op);
  5616. SDValue ShOpLo = Op.getOperand(0);
  5617. SDValue ShOpHi = Op.getOperand(1);
  5618. SDValue ShAmt = Op.getOperand(2);
  5619. SDValue ARMcc;
  5620. SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
  5621. unsigned Opc = (Op.getOpcode() == ISD::SRA_PARTS) ? ISD::SRA : ISD::SRL;
  5622. assert(Op.getOpcode() == ISD::SRA_PARTS || Op.getOpcode() == ISD::SRL_PARTS);
  5623. SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32,
  5624. DAG.getConstant(VTBits, dl, MVT::i32), ShAmt);
  5625. SDValue Tmp1 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, ShAmt);
  5626. SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, ShAmt,
  5627. DAG.getConstant(VTBits, dl, MVT::i32));
  5628. SDValue Tmp2 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, RevShAmt);
  5629. SDValue LoSmallShift = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2);
  5630. SDValue LoBigShift = DAG.getNode(Opc, dl, VT, ShOpHi, ExtraShAmt);
  5631. SDValue CmpLo = getARMCmp(ExtraShAmt, DAG.getConstant(0, dl, MVT::i32),
  5632. ISD::SETGE, ARMcc, DAG, dl);
  5633. SDValue Lo = DAG.getNode(ARMISD::CMOV, dl, VT, LoSmallShift, LoBigShift,
  5634. ARMcc, CCR, CmpLo);
  5635. SDValue HiSmallShift = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt);
  5636. SDValue HiBigShift = Opc == ISD::SRA
  5637. ? DAG.getNode(Opc, dl, VT, ShOpHi,
  5638. DAG.getConstant(VTBits - 1, dl, VT))
  5639. : DAG.getConstant(0, dl, VT);
  5640. SDValue CmpHi = getARMCmp(ExtraShAmt, DAG.getConstant(0, dl, MVT::i32),
  5641. ISD::SETGE, ARMcc, DAG, dl);
  5642. SDValue Hi = DAG.getNode(ARMISD::CMOV, dl, VT, HiSmallShift, HiBigShift,
  5643. ARMcc, CCR, CmpHi);
  5644. SDValue Ops[2] = { Lo, Hi };
  5645. return DAG.getMergeValues(Ops, dl);
  5646. }
  5647. /// LowerShiftLeftParts - Lower SHL_PARTS, which returns two
  5648. /// i32 values and take a 2 x i32 value to shift plus a shift amount.
  5649. SDValue ARMTargetLowering::LowerShiftLeftParts(SDValue Op,
  5650. SelectionDAG &DAG) const {
  5651. assert(Op.getNumOperands() == 3 && "Not a double-shift!");
  5652. EVT VT = Op.getValueType();
  5653. unsigned VTBits = VT.getSizeInBits();
  5654. SDLoc dl(Op);
  5655. SDValue ShOpLo = Op.getOperand(0);
  5656. SDValue ShOpHi = Op.getOperand(1);
  5657. SDValue ShAmt = Op.getOperand(2);
  5658. SDValue ARMcc;
  5659. SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
  5660. assert(Op.getOpcode() == ISD::SHL_PARTS);
  5661. SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32,
  5662. DAG.getConstant(VTBits, dl, MVT::i32), ShAmt);
  5663. SDValue Tmp1 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, RevShAmt);
  5664. SDValue Tmp2 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, ShAmt);
  5665. SDValue HiSmallShift = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2);
  5666. SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, ShAmt,
  5667. DAG.getConstant(VTBits, dl, MVT::i32));
  5668. SDValue HiBigShift = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ExtraShAmt);
  5669. SDValue CmpHi = getARMCmp(ExtraShAmt, DAG.getConstant(0, dl, MVT::i32),
  5670. ISD::SETGE, ARMcc, DAG, dl);
  5671. SDValue Hi = DAG.getNode(ARMISD::CMOV, dl, VT, HiSmallShift, HiBigShift,
  5672. ARMcc, CCR, CmpHi);
  5673. SDValue CmpLo = getARMCmp(ExtraShAmt, DAG.getConstant(0, dl, MVT::i32),
  5674. ISD::SETGE, ARMcc, DAG, dl);
  5675. SDValue LoSmallShift = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt);
  5676. SDValue Lo = DAG.getNode(ARMISD::CMOV, dl, VT, LoSmallShift,
  5677. DAG.getConstant(0, dl, VT), ARMcc, CCR, CmpLo);
  5678. SDValue Ops[2] = { Lo, Hi };
  5679. return DAG.getMergeValues(Ops, dl);
  5680. }
  5681. SDValue ARMTargetLowering::LowerFLT_ROUNDS_(SDValue Op,
  5682. SelectionDAG &DAG) const {
  5683. // The rounding mode is in bits 23:22 of the FPSCR.
  5684. // The ARM rounding mode value to FLT_ROUNDS mapping is 0->1, 1->2, 2->3, 3->0
  5685. // The formula we use to implement this is (((FPSCR + 1 << 22) >> 22) & 3)
  5686. // so that the shift + and get folded into a bitfield extract.
  5687. SDLoc dl(Op);
  5688. SDValue Chain = Op.getOperand(0);
  5689. SDValue Ops[] = {Chain,
  5690. DAG.getConstant(Intrinsic::arm_get_fpscr, dl, MVT::i32)};
  5691. SDValue FPSCR =
  5692. DAG.getNode(ISD::INTRINSIC_W_CHAIN, dl, {MVT::i32, MVT::Other}, Ops);
  5693. Chain = FPSCR.getValue(1);
  5694. SDValue FltRounds = DAG.getNode(ISD::ADD, dl, MVT::i32, FPSCR,
  5695. DAG.getConstant(1U << 22, dl, MVT::i32));
  5696. SDValue RMODE = DAG.getNode(ISD::SRL, dl, MVT::i32, FltRounds,
  5697. DAG.getConstant(22, dl, MVT::i32));
  5698. SDValue And = DAG.getNode(ISD::AND, dl, MVT::i32, RMODE,
  5699. DAG.getConstant(3, dl, MVT::i32));
  5700. return DAG.getMergeValues({And, Chain}, dl);
  5701. }
  5702. SDValue ARMTargetLowering::LowerSET_ROUNDING(SDValue Op,
  5703. SelectionDAG &DAG) const {
  5704. SDLoc DL(Op);
  5705. SDValue Chain = Op->getOperand(0);
  5706. SDValue RMValue = Op->getOperand(1);
  5707. // The rounding mode is in bits 23:22 of the FPSCR.
  5708. // The llvm.set.rounding argument value to ARM rounding mode value mapping
  5709. // is 0->3, 1->0, 2->1, 3->2. The formula we use to implement this is
  5710. // ((arg - 1) & 3) << 22).
  5711. //
  5712. // It is expected that the argument of llvm.set.rounding is within the
  5713. // segment [0, 3], so NearestTiesToAway (4) is not handled here. It is
  5714. // responsibility of the code generated llvm.set.rounding to ensure this
  5715. // condition.
  5716. // Calculate new value of FPSCR[23:22].
  5717. RMValue = DAG.getNode(ISD::SUB, DL, MVT::i32, RMValue,
  5718. DAG.getConstant(1, DL, MVT::i32));
  5719. RMValue = DAG.getNode(ISD::AND, DL, MVT::i32, RMValue,
  5720. DAG.getConstant(0x3, DL, MVT::i32));
  5721. RMValue = DAG.getNode(ISD::SHL, DL, MVT::i32, RMValue,
  5722. DAG.getConstant(ARM::RoundingBitsPos, DL, MVT::i32));
  5723. // Get current value of FPSCR.
  5724. SDValue Ops[] = {Chain,
  5725. DAG.getConstant(Intrinsic::arm_get_fpscr, DL, MVT::i32)};
  5726. SDValue FPSCR =
  5727. DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL, {MVT::i32, MVT::Other}, Ops);
  5728. Chain = FPSCR.getValue(1);
  5729. FPSCR = FPSCR.getValue(0);
  5730. // Put new rounding mode into FPSCR[23:22].
  5731. const unsigned RMMask = ~(ARM::Rounding::rmMask << ARM::RoundingBitsPos);
  5732. FPSCR = DAG.getNode(ISD::AND, DL, MVT::i32, FPSCR,
  5733. DAG.getConstant(RMMask, DL, MVT::i32));
  5734. FPSCR = DAG.getNode(ISD::OR, DL, MVT::i32, FPSCR, RMValue);
  5735. SDValue Ops2[] = {
  5736. Chain, DAG.getConstant(Intrinsic::arm_set_fpscr, DL, MVT::i32), FPSCR};
  5737. return DAG.getNode(ISD::INTRINSIC_VOID, DL, MVT::Other, Ops2);
  5738. }
  5739. static SDValue LowerCTTZ(SDNode *N, SelectionDAG &DAG,
  5740. const ARMSubtarget *ST) {
  5741. SDLoc dl(N);
  5742. EVT VT = N->getValueType(0);
  5743. if (VT.isVector() && ST->hasNEON()) {
  5744. // Compute the least significant set bit: LSB = X & -X
  5745. SDValue X = N->getOperand(0);
  5746. SDValue NX = DAG.getNode(ISD::SUB, dl, VT, getZeroVector(VT, DAG, dl), X);
  5747. SDValue LSB = DAG.getNode(ISD::AND, dl, VT, X, NX);
  5748. EVT ElemTy = VT.getVectorElementType();
  5749. if (ElemTy == MVT::i8) {
  5750. // Compute with: cttz(x) = ctpop(lsb - 1)
  5751. SDValue One = DAG.getNode(ARMISD::VMOVIMM, dl, VT,
  5752. DAG.getTargetConstant(1, dl, ElemTy));
  5753. SDValue Bits = DAG.getNode(ISD::SUB, dl, VT, LSB, One);
  5754. return DAG.getNode(ISD::CTPOP, dl, VT, Bits);
  5755. }
  5756. if ((ElemTy == MVT::i16 || ElemTy == MVT::i32) &&
  5757. (N->getOpcode() == ISD::CTTZ_ZERO_UNDEF)) {
  5758. // Compute with: cttz(x) = (width - 1) - ctlz(lsb), if x != 0
  5759. unsigned NumBits = ElemTy.getSizeInBits();
  5760. SDValue WidthMinus1 =
  5761. DAG.getNode(ARMISD::VMOVIMM, dl, VT,
  5762. DAG.getTargetConstant(NumBits - 1, dl, ElemTy));
  5763. SDValue CTLZ = DAG.getNode(ISD::CTLZ, dl, VT, LSB);
  5764. return DAG.getNode(ISD::SUB, dl, VT, WidthMinus1, CTLZ);
  5765. }
  5766. // Compute with: cttz(x) = ctpop(lsb - 1)
  5767. // Compute LSB - 1.
  5768. SDValue Bits;
  5769. if (ElemTy == MVT::i64) {
  5770. // Load constant 0xffff'ffff'ffff'ffff to register.
  5771. SDValue FF = DAG.getNode(ARMISD::VMOVIMM, dl, VT,
  5772. DAG.getTargetConstant(0x1eff, dl, MVT::i32));
  5773. Bits = DAG.getNode(ISD::ADD, dl, VT, LSB, FF);
  5774. } else {
  5775. SDValue One = DAG.getNode(ARMISD::VMOVIMM, dl, VT,
  5776. DAG.getTargetConstant(1, dl, ElemTy));
  5777. Bits = DAG.getNode(ISD::SUB, dl, VT, LSB, One);
  5778. }
  5779. return DAG.getNode(ISD::CTPOP, dl, VT, Bits);
  5780. }
  5781. if (!ST->hasV6T2Ops())
  5782. return SDValue();
  5783. SDValue rbit = DAG.getNode(ISD::BITREVERSE, dl, VT, N->getOperand(0));
  5784. return DAG.getNode(ISD::CTLZ, dl, VT, rbit);
  5785. }
  5786. static SDValue LowerCTPOP(SDNode *N, SelectionDAG &DAG,
  5787. const ARMSubtarget *ST) {
  5788. EVT VT = N->getValueType(0);
  5789. SDLoc DL(N);
  5790. assert(ST->hasNEON() && "Custom ctpop lowering requires NEON.");
  5791. assert((VT == MVT::v1i64 || VT == MVT::v2i64 || VT == MVT::v2i32 ||
  5792. VT == MVT::v4i32 || VT == MVT::v4i16 || VT == MVT::v8i16) &&
  5793. "Unexpected type for custom ctpop lowering");
  5794. const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  5795. EVT VT8Bit = VT.is64BitVector() ? MVT::v8i8 : MVT::v16i8;
  5796. SDValue Res = DAG.getBitcast(VT8Bit, N->getOperand(0));
  5797. Res = DAG.getNode(ISD::CTPOP, DL, VT8Bit, Res);
  5798. // Widen v8i8/v16i8 CTPOP result to VT by repeatedly widening pairwise adds.
  5799. unsigned EltSize = 8;
  5800. unsigned NumElts = VT.is64BitVector() ? 8 : 16;
  5801. while (EltSize != VT.getScalarSizeInBits()) {
  5802. SmallVector<SDValue, 8> Ops;
  5803. Ops.push_back(DAG.getConstant(Intrinsic::arm_neon_vpaddlu, DL,
  5804. TLI.getPointerTy(DAG.getDataLayout())));
  5805. Ops.push_back(Res);
  5806. EltSize *= 2;
  5807. NumElts /= 2;
  5808. MVT WidenVT = MVT::getVectorVT(MVT::getIntegerVT(EltSize), NumElts);
  5809. Res = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, WidenVT, Ops);
  5810. }
  5811. return Res;
  5812. }
  5813. /// Getvshiftimm - Check if this is a valid build_vector for the immediate
  5814. /// operand of a vector shift operation, where all the elements of the
  5815. /// build_vector must have the same constant integer value.
  5816. static bool getVShiftImm(SDValue Op, unsigned ElementBits, int64_t &Cnt) {
  5817. // Ignore bit_converts.
  5818. while (Op.getOpcode() == ISD::BITCAST)
  5819. Op = Op.getOperand(0);
  5820. BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(Op.getNode());
  5821. APInt SplatBits, SplatUndef;
  5822. unsigned SplatBitSize;
  5823. bool HasAnyUndefs;
  5824. if (!BVN ||
  5825. !BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs,
  5826. ElementBits) ||
  5827. SplatBitSize > ElementBits)
  5828. return false;
  5829. Cnt = SplatBits.getSExtValue();
  5830. return true;
  5831. }
  5832. /// isVShiftLImm - Check if this is a valid build_vector for the immediate
  5833. /// operand of a vector shift left operation. That value must be in the range:
  5834. /// 0 <= Value < ElementBits for a left shift; or
  5835. /// 0 <= Value <= ElementBits for a long left shift.
  5836. static bool isVShiftLImm(SDValue Op, EVT VT, bool isLong, int64_t &Cnt) {
  5837. assert(VT.isVector() && "vector shift count is not a vector type");
  5838. int64_t ElementBits = VT.getScalarSizeInBits();
  5839. if (!getVShiftImm(Op, ElementBits, Cnt))
  5840. return false;
  5841. return (Cnt >= 0 && (isLong ? Cnt - 1 : Cnt) < ElementBits);
  5842. }
  5843. /// isVShiftRImm - Check if this is a valid build_vector for the immediate
  5844. /// operand of a vector shift right operation. For a shift opcode, the value
  5845. /// is positive, but for an intrinsic the value count must be negative. The
  5846. /// absolute value must be in the range:
  5847. /// 1 <= |Value| <= ElementBits for a right shift; or
  5848. /// 1 <= |Value| <= ElementBits/2 for a narrow right shift.
  5849. static bool isVShiftRImm(SDValue Op, EVT VT, bool isNarrow, bool isIntrinsic,
  5850. int64_t &Cnt) {
  5851. assert(VT.isVector() && "vector shift count is not a vector type");
  5852. int64_t ElementBits = VT.getScalarSizeInBits();
  5853. if (!getVShiftImm(Op, ElementBits, Cnt))
  5854. return false;
  5855. if (!isIntrinsic)
  5856. return (Cnt >= 1 && Cnt <= (isNarrow ? ElementBits / 2 : ElementBits));
  5857. if (Cnt >= -(isNarrow ? ElementBits / 2 : ElementBits) && Cnt <= -1) {
  5858. Cnt = -Cnt;
  5859. return true;
  5860. }
  5861. return false;
  5862. }
  5863. static SDValue LowerShift(SDNode *N, SelectionDAG &DAG,
  5864. const ARMSubtarget *ST) {
  5865. EVT VT = N->getValueType(0);
  5866. SDLoc dl(N);
  5867. int64_t Cnt;
  5868. if (!VT.isVector())
  5869. return SDValue();
  5870. // We essentially have two forms here. Shift by an immediate and shift by a
  5871. // vector register (there are also shift by a gpr, but that is just handled
  5872. // with a tablegen pattern). We cannot easily match shift by an immediate in
  5873. // tablegen so we do that here and generate a VSHLIMM/VSHRsIMM/VSHRuIMM.
  5874. // For shifting by a vector, we don't have VSHR, only VSHL (which can be
  5875. // signed or unsigned, and a negative shift indicates a shift right).
  5876. if (N->getOpcode() == ISD::SHL) {
  5877. if (isVShiftLImm(N->getOperand(1), VT, false, Cnt))
  5878. return DAG.getNode(ARMISD::VSHLIMM, dl, VT, N->getOperand(0),
  5879. DAG.getConstant(Cnt, dl, MVT::i32));
  5880. return DAG.getNode(ARMISD::VSHLu, dl, VT, N->getOperand(0),
  5881. N->getOperand(1));
  5882. }
  5883. assert((N->getOpcode() == ISD::SRA || N->getOpcode() == ISD::SRL) &&
  5884. "unexpected vector shift opcode");
  5885. if (isVShiftRImm(N->getOperand(1), VT, false, false, Cnt)) {
  5886. unsigned VShiftOpc =
  5887. (N->getOpcode() == ISD::SRA ? ARMISD::VSHRsIMM : ARMISD::VSHRuIMM);
  5888. return DAG.getNode(VShiftOpc, dl, VT, N->getOperand(0),
  5889. DAG.getConstant(Cnt, dl, MVT::i32));
  5890. }
  5891. // Other right shifts we don't have operations for (we use a shift left by a
  5892. // negative number).
  5893. EVT ShiftVT = N->getOperand(1).getValueType();
  5894. SDValue NegatedCount = DAG.getNode(
  5895. ISD::SUB, dl, ShiftVT, getZeroVector(ShiftVT, DAG, dl), N->getOperand(1));
  5896. unsigned VShiftOpc =
  5897. (N->getOpcode() == ISD::SRA ? ARMISD::VSHLs : ARMISD::VSHLu);
  5898. return DAG.getNode(VShiftOpc, dl, VT, N->getOperand(0), NegatedCount);
  5899. }
  5900. static SDValue Expand64BitShift(SDNode *N, SelectionDAG &DAG,
  5901. const ARMSubtarget *ST) {
  5902. EVT VT = N->getValueType(0);
  5903. SDLoc dl(N);
  5904. // We can get here for a node like i32 = ISD::SHL i32, i64
  5905. if (VT != MVT::i64)
  5906. return SDValue();
  5907. assert((N->getOpcode() == ISD::SRL || N->getOpcode() == ISD::SRA ||
  5908. N->getOpcode() == ISD::SHL) &&
  5909. "Unknown shift to lower!");
  5910. unsigned ShOpc = N->getOpcode();
  5911. if (ST->hasMVEIntegerOps()) {
  5912. SDValue ShAmt = N->getOperand(1);
  5913. unsigned ShPartsOpc = ARMISD::LSLL;
  5914. ConstantSDNode *Con = dyn_cast<ConstantSDNode>(ShAmt);
  5915. // If the shift amount is greater than 32 or has a greater bitwidth than 64
  5916. // then do the default optimisation
  5917. if (ShAmt->getValueType(0).getSizeInBits() > 64 ||
  5918. (Con && (Con->getZExtValue() == 0 || Con->getZExtValue() >= 32)))
  5919. return SDValue();
  5920. // Extract the lower 32 bits of the shift amount if it's not an i32
  5921. if (ShAmt->getValueType(0) != MVT::i32)
  5922. ShAmt = DAG.getZExtOrTrunc(ShAmt, dl, MVT::i32);
  5923. if (ShOpc == ISD::SRL) {
  5924. if (!Con)
  5925. // There is no t2LSRLr instruction so negate and perform an lsll if the
  5926. // shift amount is in a register, emulating a right shift.
  5927. ShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32,
  5928. DAG.getConstant(0, dl, MVT::i32), ShAmt);
  5929. else
  5930. // Else generate an lsrl on the immediate shift amount
  5931. ShPartsOpc = ARMISD::LSRL;
  5932. } else if (ShOpc == ISD::SRA)
  5933. ShPartsOpc = ARMISD::ASRL;
  5934. // Lower 32 bits of the destination/source
  5935. SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(0),
  5936. DAG.getConstant(0, dl, MVT::i32));
  5937. // Upper 32 bits of the destination/source
  5938. SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(0),
  5939. DAG.getConstant(1, dl, MVT::i32));
  5940. // Generate the shift operation as computed above
  5941. Lo = DAG.getNode(ShPartsOpc, dl, DAG.getVTList(MVT::i32, MVT::i32), Lo, Hi,
  5942. ShAmt);
  5943. // The upper 32 bits come from the second return value of lsll
  5944. Hi = SDValue(Lo.getNode(), 1);
  5945. return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lo, Hi);
  5946. }
  5947. // We only lower SRA, SRL of 1 here, all others use generic lowering.
  5948. if (!isOneConstant(N->getOperand(1)) || N->getOpcode() == ISD::SHL)
  5949. return SDValue();
  5950. // If we are in thumb mode, we don't have RRX.
  5951. if (ST->isThumb1Only())
  5952. return SDValue();
  5953. // Okay, we have a 64-bit SRA or SRL of 1. Lower this to an RRX expr.
  5954. SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(0),
  5955. DAG.getConstant(0, dl, MVT::i32));
  5956. SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(0),
  5957. DAG.getConstant(1, dl, MVT::i32));
  5958. // First, build a SRA_FLAG/SRL_FLAG op, which shifts the top part by one and
  5959. // captures the result into a carry flag.
  5960. unsigned Opc = N->getOpcode() == ISD::SRL ? ARMISD::SRL_FLAG:ARMISD::SRA_FLAG;
  5961. Hi = DAG.getNode(Opc, dl, DAG.getVTList(MVT::i32, MVT::Glue), Hi);
  5962. // The low part is an ARMISD::RRX operand, which shifts the carry in.
  5963. Lo = DAG.getNode(ARMISD::RRX, dl, MVT::i32, Lo, Hi.getValue(1));
  5964. // Merge the pieces into a single i64 value.
  5965. return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lo, Hi);
  5966. }
  5967. static SDValue LowerVSETCC(SDValue Op, SelectionDAG &DAG,
  5968. const ARMSubtarget *ST) {
  5969. bool Invert = false;
  5970. bool Swap = false;
  5971. unsigned Opc = ARMCC::AL;
  5972. SDValue Op0 = Op.getOperand(0);
  5973. SDValue Op1 = Op.getOperand(1);
  5974. SDValue CC = Op.getOperand(2);
  5975. EVT VT = Op.getValueType();
  5976. ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
  5977. SDLoc dl(Op);
  5978. EVT CmpVT;
  5979. if (ST->hasNEON())
  5980. CmpVT = Op0.getValueType().changeVectorElementTypeToInteger();
  5981. else {
  5982. assert(ST->hasMVEIntegerOps() &&
  5983. "No hardware support for integer vector comparison!");
  5984. if (Op.getValueType().getVectorElementType() != MVT::i1)
  5985. return SDValue();
  5986. // Make sure we expand floating point setcc to scalar if we do not have
  5987. // mve.fp, so that we can handle them from there.
  5988. if (Op0.getValueType().isFloatingPoint() && !ST->hasMVEFloatOps())
  5989. return SDValue();
  5990. CmpVT = VT;
  5991. }
  5992. if (Op0.getValueType().getVectorElementType() == MVT::i64 &&
  5993. (SetCCOpcode == ISD::SETEQ || SetCCOpcode == ISD::SETNE)) {
  5994. // Special-case integer 64-bit equality comparisons. They aren't legal,
  5995. // but they can be lowered with a few vector instructions.
  5996. unsigned CmpElements = CmpVT.getVectorNumElements() * 2;
  5997. EVT SplitVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, CmpElements);
  5998. SDValue CastOp0 = DAG.getNode(ISD::BITCAST, dl, SplitVT, Op0);
  5999. SDValue CastOp1 = DAG.getNode(ISD::BITCAST, dl, SplitVT, Op1);
  6000. SDValue Cmp = DAG.getNode(ISD::SETCC, dl, SplitVT, CastOp0, CastOp1,
  6001. DAG.getCondCode(ISD::SETEQ));
  6002. SDValue Reversed = DAG.getNode(ARMISD::VREV64, dl, SplitVT, Cmp);
  6003. SDValue Merged = DAG.getNode(ISD::AND, dl, SplitVT, Cmp, Reversed);
  6004. Merged = DAG.getNode(ISD::BITCAST, dl, CmpVT, Merged);
  6005. if (SetCCOpcode == ISD::SETNE)
  6006. Merged = DAG.getNOT(dl, Merged, CmpVT);
  6007. Merged = DAG.getSExtOrTrunc(Merged, dl, VT);
  6008. return Merged;
  6009. }
  6010. if (CmpVT.getVectorElementType() == MVT::i64)
  6011. // 64-bit comparisons are not legal in general.
  6012. return SDValue();
  6013. if (Op1.getValueType().isFloatingPoint()) {
  6014. switch (SetCCOpcode) {
  6015. default: llvm_unreachable("Illegal FP comparison");
  6016. case ISD::SETUNE:
  6017. case ISD::SETNE:
  6018. if (ST->hasMVEFloatOps()) {
  6019. Opc = ARMCC::NE; break;
  6020. } else {
  6021. Invert = true; LLVM_FALLTHROUGH;
  6022. }
  6023. case ISD::SETOEQ:
  6024. case ISD::SETEQ: Opc = ARMCC::EQ; break;
  6025. case ISD::SETOLT:
  6026. case ISD::SETLT: Swap = true; LLVM_FALLTHROUGH;
  6027. case ISD::SETOGT:
  6028. case ISD::SETGT: Opc = ARMCC::GT; break;
  6029. case ISD::SETOLE:
  6030. case ISD::SETLE: Swap = true; LLVM_FALLTHROUGH;
  6031. case ISD::SETOGE:
  6032. case ISD::SETGE: Opc = ARMCC::GE; break;
  6033. case ISD::SETUGE: Swap = true; LLVM_FALLTHROUGH;
  6034. case ISD::SETULE: Invert = true; Opc = ARMCC::GT; break;
  6035. case ISD::SETUGT: Swap = true; LLVM_FALLTHROUGH;
  6036. case ISD::SETULT: Invert = true; Opc = ARMCC::GE; break;
  6037. case ISD::SETUEQ: Invert = true; LLVM_FALLTHROUGH;
  6038. case ISD::SETONE: {
  6039. // Expand this to (OLT | OGT).
  6040. SDValue TmpOp0 = DAG.getNode(ARMISD::VCMP, dl, CmpVT, Op1, Op0,
  6041. DAG.getConstant(ARMCC::GT, dl, MVT::i32));
  6042. SDValue TmpOp1 = DAG.getNode(ARMISD::VCMP, dl, CmpVT, Op0, Op1,
  6043. DAG.getConstant(ARMCC::GT, dl, MVT::i32));
  6044. SDValue Result = DAG.getNode(ISD::OR, dl, CmpVT, TmpOp0, TmpOp1);
  6045. if (Invert)
  6046. Result = DAG.getNOT(dl, Result, VT);
  6047. return Result;
  6048. }
  6049. case ISD::SETUO: Invert = true; LLVM_FALLTHROUGH;
  6050. case ISD::SETO: {
  6051. // Expand this to (OLT | OGE).
  6052. SDValue TmpOp0 = DAG.getNode(ARMISD::VCMP, dl, CmpVT, Op1, Op0,
  6053. DAG.getConstant(ARMCC::GT, dl, MVT::i32));
  6054. SDValue TmpOp1 = DAG.getNode(ARMISD::VCMP, dl, CmpVT, Op0, Op1,
  6055. DAG.getConstant(ARMCC::GE, dl, MVT::i32));
  6056. SDValue Result = DAG.getNode(ISD::OR, dl, CmpVT, TmpOp0, TmpOp1);
  6057. if (Invert)
  6058. Result = DAG.getNOT(dl, Result, VT);
  6059. return Result;
  6060. }
  6061. }
  6062. } else {
  6063. // Integer comparisons.
  6064. switch (SetCCOpcode) {
  6065. default: llvm_unreachable("Illegal integer comparison");
  6066. case ISD::SETNE:
  6067. if (ST->hasMVEIntegerOps()) {
  6068. Opc = ARMCC::NE; break;
  6069. } else {
  6070. Invert = true; LLVM_FALLTHROUGH;
  6071. }
  6072. case ISD::SETEQ: Opc = ARMCC::EQ; break;
  6073. case ISD::SETLT: Swap = true; LLVM_FALLTHROUGH;
  6074. case ISD::SETGT: Opc = ARMCC::GT; break;
  6075. case ISD::SETLE: Swap = true; LLVM_FALLTHROUGH;
  6076. case ISD::SETGE: Opc = ARMCC::GE; break;
  6077. case ISD::SETULT: Swap = true; LLVM_FALLTHROUGH;
  6078. case ISD::SETUGT: Opc = ARMCC::HI; break;
  6079. case ISD::SETULE: Swap = true; LLVM_FALLTHROUGH;
  6080. case ISD::SETUGE: Opc = ARMCC::HS; break;
  6081. }
  6082. // Detect VTST (Vector Test Bits) = icmp ne (and (op0, op1), zero).
  6083. if (ST->hasNEON() && Opc == ARMCC::EQ) {
  6084. SDValue AndOp;
  6085. if (ISD::isBuildVectorAllZeros(Op1.getNode()))
  6086. AndOp = Op0;
  6087. else if (ISD::isBuildVectorAllZeros(Op0.getNode()))
  6088. AndOp = Op1;
  6089. // Ignore bitconvert.
  6090. if (AndOp.getNode() && AndOp.getOpcode() == ISD::BITCAST)
  6091. AndOp = AndOp.getOperand(0);
  6092. if (AndOp.getNode() && AndOp.getOpcode() == ISD::AND) {
  6093. Op0 = DAG.getNode(ISD::BITCAST, dl, CmpVT, AndOp.getOperand(0));
  6094. Op1 = DAG.getNode(ISD::BITCAST, dl, CmpVT, AndOp.getOperand(1));
  6095. SDValue Result = DAG.getNode(ARMISD::VTST, dl, CmpVT, Op0, Op1);
  6096. if (!Invert)
  6097. Result = DAG.getNOT(dl, Result, VT);
  6098. return Result;
  6099. }
  6100. }
  6101. }
  6102. if (Swap)
  6103. std::swap(Op0, Op1);
  6104. // If one of the operands is a constant vector zero, attempt to fold the
  6105. // comparison to a specialized compare-against-zero form.
  6106. SDValue SingleOp;
  6107. if (ISD::isBuildVectorAllZeros(Op1.getNode()))
  6108. SingleOp = Op0;
  6109. else if (ISD::isBuildVectorAllZeros(Op0.getNode())) {
  6110. if (Opc == ARMCC::GE)
  6111. Opc = ARMCC::LE;
  6112. else if (Opc == ARMCC::GT)
  6113. Opc = ARMCC::LT;
  6114. SingleOp = Op1;
  6115. }
  6116. SDValue Result;
  6117. if (SingleOp.getNode()) {
  6118. Result = DAG.getNode(ARMISD::VCMPZ, dl, CmpVT, SingleOp,
  6119. DAG.getConstant(Opc, dl, MVT::i32));
  6120. } else {
  6121. Result = DAG.getNode(ARMISD::VCMP, dl, CmpVT, Op0, Op1,
  6122. DAG.getConstant(Opc, dl, MVT::i32));
  6123. }
  6124. Result = DAG.getSExtOrTrunc(Result, dl, VT);
  6125. if (Invert)
  6126. Result = DAG.getNOT(dl, Result, VT);
  6127. return Result;
  6128. }
  6129. static SDValue LowerSETCCCARRY(SDValue Op, SelectionDAG &DAG) {
  6130. SDValue LHS = Op.getOperand(0);
  6131. SDValue RHS = Op.getOperand(1);
  6132. SDValue Carry = Op.getOperand(2);
  6133. SDValue Cond = Op.getOperand(3);
  6134. SDLoc DL(Op);
  6135. assert(LHS.getSimpleValueType().isInteger() && "SETCCCARRY is integer only.");
  6136. // ARMISD::SUBE expects a carry not a borrow like ISD::SUBCARRY so we
  6137. // have to invert the carry first.
  6138. Carry = DAG.getNode(ISD::SUB, DL, MVT::i32,
  6139. DAG.getConstant(1, DL, MVT::i32), Carry);
  6140. // This converts the boolean value carry into the carry flag.
  6141. Carry = ConvertBooleanCarryToCarryFlag(Carry, DAG);
  6142. SDVTList VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
  6143. SDValue Cmp = DAG.getNode(ARMISD::SUBE, DL, VTs, LHS, RHS, Carry);
  6144. SDValue FVal = DAG.getConstant(0, DL, MVT::i32);
  6145. SDValue TVal = DAG.getConstant(1, DL, MVT::i32);
  6146. SDValue ARMcc = DAG.getConstant(
  6147. IntCCToARMCC(cast<CondCodeSDNode>(Cond)->get()), DL, MVT::i32);
  6148. SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
  6149. SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), DL, ARM::CPSR,
  6150. Cmp.getValue(1), SDValue());
  6151. return DAG.getNode(ARMISD::CMOV, DL, Op.getValueType(), FVal, TVal, ARMcc,
  6152. CCR, Chain.getValue(1));
  6153. }
  6154. /// isVMOVModifiedImm - Check if the specified splat value corresponds to a
  6155. /// valid vector constant for a NEON or MVE instruction with a "modified
  6156. /// immediate" operand (e.g., VMOV). If so, return the encoded value.
  6157. static SDValue isVMOVModifiedImm(uint64_t SplatBits, uint64_t SplatUndef,
  6158. unsigned SplatBitSize, SelectionDAG &DAG,
  6159. const SDLoc &dl, EVT &VT, EVT VectorVT,
  6160. VMOVModImmType type) {
  6161. unsigned OpCmode, Imm;
  6162. bool is128Bits = VectorVT.is128BitVector();
  6163. // SplatBitSize is set to the smallest size that splats the vector, so a
  6164. // zero vector will always have SplatBitSize == 8. However, NEON modified
  6165. // immediate instructions others than VMOV do not support the 8-bit encoding
  6166. // of a zero vector, and the default encoding of zero is supposed to be the
  6167. // 32-bit version.
  6168. if (SplatBits == 0)
  6169. SplatBitSize = 32;
  6170. switch (SplatBitSize) {
  6171. case 8:
  6172. if (type != VMOVModImm)
  6173. return SDValue();
  6174. // Any 1-byte value is OK. Op=0, Cmode=1110.
  6175. assert((SplatBits & ~0xff) == 0 && "one byte splat value is too big");
  6176. OpCmode = 0xe;
  6177. Imm = SplatBits;
  6178. VT = is128Bits ? MVT::v16i8 : MVT::v8i8;
  6179. break;
  6180. case 16:
  6181. // NEON's 16-bit VMOV supports splat values where only one byte is nonzero.
  6182. VT = is128Bits ? MVT::v8i16 : MVT::v4i16;
  6183. if ((SplatBits & ~0xff) == 0) {
  6184. // Value = 0x00nn: Op=x, Cmode=100x.
  6185. OpCmode = 0x8;
  6186. Imm = SplatBits;
  6187. break;
  6188. }
  6189. if ((SplatBits & ~0xff00) == 0) {
  6190. // Value = 0xnn00: Op=x, Cmode=101x.
  6191. OpCmode = 0xa;
  6192. Imm = SplatBits >> 8;
  6193. break;
  6194. }
  6195. return SDValue();
  6196. case 32:
  6197. // NEON's 32-bit VMOV supports splat values where:
  6198. // * only one byte is nonzero, or
  6199. // * the least significant byte is 0xff and the second byte is nonzero, or
  6200. // * the least significant 2 bytes are 0xff and the third is nonzero.
  6201. VT = is128Bits ? MVT::v4i32 : MVT::v2i32;
  6202. if ((SplatBits & ~0xff) == 0) {
  6203. // Value = 0x000000nn: Op=x, Cmode=000x.
  6204. OpCmode = 0;
  6205. Imm = SplatBits;
  6206. break;
  6207. }
  6208. if ((SplatBits & ~0xff00) == 0) {
  6209. // Value = 0x0000nn00: Op=x, Cmode=001x.
  6210. OpCmode = 0x2;
  6211. Imm = SplatBits >> 8;
  6212. break;
  6213. }
  6214. if ((SplatBits & ~0xff0000) == 0) {
  6215. // Value = 0x00nn0000: Op=x, Cmode=010x.
  6216. OpCmode = 0x4;
  6217. Imm = SplatBits >> 16;
  6218. break;
  6219. }
  6220. if ((SplatBits & ~0xff000000) == 0) {
  6221. // Value = 0xnn000000: Op=x, Cmode=011x.
  6222. OpCmode = 0x6;
  6223. Imm = SplatBits >> 24;
  6224. break;
  6225. }
  6226. // cmode == 0b1100 and cmode == 0b1101 are not supported for VORR or VBIC
  6227. if (type == OtherModImm) return SDValue();
  6228. if ((SplatBits & ~0xffff) == 0 &&
  6229. ((SplatBits | SplatUndef) & 0xff) == 0xff) {
  6230. // Value = 0x0000nnff: Op=x, Cmode=1100.
  6231. OpCmode = 0xc;
  6232. Imm = SplatBits >> 8;
  6233. break;
  6234. }
  6235. // cmode == 0b1101 is not supported for MVE VMVN
  6236. if (type == MVEVMVNModImm)
  6237. return SDValue();
  6238. if ((SplatBits & ~0xffffff) == 0 &&
  6239. ((SplatBits | SplatUndef) & 0xffff) == 0xffff) {
  6240. // Value = 0x00nnffff: Op=x, Cmode=1101.
  6241. OpCmode = 0xd;
  6242. Imm = SplatBits >> 16;
  6243. break;
  6244. }
  6245. // Note: there are a few 32-bit splat values (specifically: 00ffff00,
  6246. // ff000000, ff0000ff, and ffff00ff) that are valid for VMOV.I64 but not
  6247. // VMOV.I32. A (very) minor optimization would be to replicate the value
  6248. // and fall through here to test for a valid 64-bit splat. But, then the
  6249. // caller would also need to check and handle the change in size.
  6250. return SDValue();
  6251. case 64: {
  6252. if (type != VMOVModImm)
  6253. return SDValue();
  6254. // NEON has a 64-bit VMOV splat where each byte is either 0 or 0xff.
  6255. uint64_t BitMask = 0xff;
  6256. unsigned ImmMask = 1;
  6257. Imm = 0;
  6258. for (int ByteNum = 0; ByteNum < 8; ++ByteNum) {
  6259. if (((SplatBits | SplatUndef) & BitMask) == BitMask) {
  6260. Imm |= ImmMask;
  6261. } else if ((SplatBits & BitMask) != 0) {
  6262. return SDValue();
  6263. }
  6264. BitMask <<= 8;
  6265. ImmMask <<= 1;
  6266. }
  6267. if (DAG.getDataLayout().isBigEndian()) {
  6268. // Reverse the order of elements within the vector.
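// For example, for a v4i16 vector (BytesPerElem == 2, NumElems == 4) an Imm of
// 0b00000011 (element 0 all-ones) becomes 0b11000000: whole elements, not
// individual bytes, are swapped.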
  6269. unsigned BytesPerElem = VectorVT.getScalarSizeInBits() / 8;
  6270. unsigned Mask = (1 << BytesPerElem) - 1;
  6271. unsigned NumElems = 8 / BytesPerElem;
  6272. unsigned NewImm = 0;
  6273. for (unsigned ElemNum = 0; ElemNum < NumElems; ++ElemNum) {
  6274. unsigned Elem = ((Imm >> ElemNum * BytesPerElem) & Mask);
  6275. NewImm |= Elem << (NumElems - ElemNum - 1) * BytesPerElem;
  6276. }
  6277. Imm = NewImm;
  6278. }
  6279. // Op=1, Cmode=1110.
  6280. OpCmode = 0x1e;
  6281. VT = is128Bits ? MVT::v2i64 : MVT::v1i64;
  6282. break;
  6283. }
  6284. default:
  6285. llvm_unreachable("unexpected size for isVMOVModifiedImm");
  6286. }
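// For example, a v4i32 splat of 0x00005600 hits the "Value = 0x0000nn00" case
// above, giving OpCmode = 0x2 and Imm = 0x56; createVMOVModImm packs these two
// fields into the single i32 operand consumed by the VMOV/VMVN-immediate nodes.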
  6287. unsigned EncodedVal = ARM_AM::createVMOVModImm(OpCmode, Imm);
  6288. return DAG.getTargetConstant(EncodedVal, dl, MVT::i32);
  6289. }
  6290. SDValue ARMTargetLowering::LowerConstantFP(SDValue Op, SelectionDAG &DAG,
  6291. const ARMSubtarget *ST) const {
  6292. EVT VT = Op.getValueType();
  6293. bool IsDouble = (VT == MVT::f64);
  6294. ConstantFPSDNode *CFP = cast<ConstantFPSDNode>(Op);
  6295. const APFloat &FPVal = CFP->getValueAPF();
  6296. // Prevent floating-point constants from using literal loads
  6297. // when execute-only is enabled.
  6298. if (ST->genExecuteOnly()) {
  6299. // If we can represent the constant as an immediate, don't lower it
  6300. if (isFPImmLegal(FPVal, VT))
  6301. return Op;
  6302. // Otherwise, construct as integer, and move to float register
  6303. APInt INTVal = FPVal.bitcastToAPInt();
  6304. SDLoc DL(CFP);
  6305. switch (VT.getSimpleVT().SimpleTy) {
  6306. default:
  6307. llvm_unreachable("Unknown floating point type!");
  6308. break;
  6309. case MVT::f64: {
  6310. SDValue Lo = DAG.getConstant(INTVal.trunc(32), DL, MVT::i32);
  6311. SDValue Hi = DAG.getConstant(INTVal.lshr(32).trunc(32), DL, MVT::i32);
  6312. return DAG.getNode(ARMISD::VMOVDRR, DL, MVT::f64, Lo, Hi);
  6313. }
  6314. case MVT::f32:
  6315. return DAG.getNode(ARMISD::VMOVSR, DL, VT,
  6316. DAG.getConstant(INTVal, DL, MVT::i32));
  6317. }
  6318. }
  6319. if (!ST->hasVFP3Base())
  6320. return SDValue();
  6321. // Use the default (constant pool) lowering for double constants when we have
  6322. // an SP-only FPU
  6323. if (IsDouble && !Subtarget->hasFP64())
  6324. return SDValue();
  6325. // Try splatting with a VMOV.f32...
  6326. int ImmVal = IsDouble ? ARM_AM::getFP64Imm(FPVal) : ARM_AM::getFP32Imm(FPVal);
  6327. if (ImmVal != -1) {
  6328. if (IsDouble || !ST->useNEONForSinglePrecisionFP()) {
  6329. // We have code in place to select a valid ConstantFP already, no need to
  6330. // do any mangling.
  6331. return Op;
  6332. }
  6333. // It's a float and we are trying to use NEON operations where
  6334. // possible. Lower it to a splat followed by an extract.
  6335. SDLoc DL(Op);
  6336. SDValue NewVal = DAG.getTargetConstant(ImmVal, DL, MVT::i32);
  6337. SDValue VecConstant = DAG.getNode(ARMISD::VMOVFPIMM, DL, MVT::v2f32,
  6338. NewVal);
  6339. return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, VecConstant,
  6340. DAG.getConstant(0, DL, MVT::i32));
  6341. }
6342. // The rest of our options are NEON-only; make sure that's allowed before
6343. // proceeding.
  6344. if (!ST->hasNEON() || (!IsDouble && !ST->useNEONForSinglePrecisionFP()))
  6345. return SDValue();
  6346. EVT VMovVT;
  6347. uint64_t iVal = FPVal.bitcastToAPInt().getZExtValue();
  6348. // It wouldn't really be worth bothering for doubles except for one very
  6349. // important value, which does happen to match: 0.0. So make sure we don't do
  6350. // anything stupid.
  6351. if (IsDouble && (iVal & 0xffffffff) != (iVal >> 32))
  6352. return SDValue();
  6353. // Try a VMOV.i32 (FIXME: i8, i16, or i64 could work too).
  6354. SDValue NewVal = isVMOVModifiedImm(iVal & 0xffffffffU, 0, 32, DAG, SDLoc(Op),
  6355. VMovVT, VT, VMOVModImm);
  6356. if (NewVal != SDValue()) {
  6357. SDLoc DL(Op);
  6358. SDValue VecConstant = DAG.getNode(ARMISD::VMOVIMM, DL, VMovVT,
  6359. NewVal);
  6360. if (IsDouble)
  6361. return DAG.getNode(ISD::BITCAST, DL, MVT::f64, VecConstant);
  6362. // It's a float: cast and extract a vector element.
  6363. SDValue VecFConstant = DAG.getNode(ISD::BITCAST, DL, MVT::v2f32,
  6364. VecConstant);
  6365. return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, VecFConstant,
  6366. DAG.getConstant(0, DL, MVT::i32));
  6367. }
  6368. // Finally, try a VMVN.i32
  6369. NewVal = isVMOVModifiedImm(~iVal & 0xffffffffU, 0, 32, DAG, SDLoc(Op), VMovVT,
  6370. VT, VMVNModImm);
  6371. if (NewVal != SDValue()) {
  6372. SDLoc DL(Op);
  6373. SDValue VecConstant = DAG.getNode(ARMISD::VMVNIMM, DL, VMovVT, NewVal);
  6374. if (IsDouble)
  6375. return DAG.getNode(ISD::BITCAST, DL, MVT::f64, VecConstant);
  6376. // It's a float: cast and extract a vector element.
  6377. SDValue VecFConstant = DAG.getNode(ISD::BITCAST, DL, MVT::v2f32,
  6378. VecConstant);
  6379. return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, VecFConstant,
  6380. DAG.getConstant(0, DL, MVT::i32));
  6381. }
  6382. return SDValue();
  6383. }
6384. // Check if a VEXT instruction can handle the shuffle mask when both
6385. // vector sources of the shuffle are the same.
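// For example, with a <4 x i32> source the mask <1, 2, 3, 0> is a singleton
// VEXT with Imm == 1: each index is the previous one plus one, wrapping back
// to lane 0 within the same vector.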
  6386. static bool isSingletonVEXTMask(ArrayRef<int> M, EVT VT, unsigned &Imm) {
  6387. unsigned NumElts = VT.getVectorNumElements();
  6388. // Assume that the first shuffle index is not UNDEF. Fail if it is.
  6389. if (M[0] < 0)
  6390. return false;
  6391. Imm = M[0];
  6392. // If this is a VEXT shuffle, the immediate value is the index of the first
  6393. // element. The other shuffle indices must be the successive elements after
  6394. // the first one.
  6395. unsigned ExpectedElt = Imm;
  6396. for (unsigned i = 1; i < NumElts; ++i) {
  6397. // Increment the expected index. If it wraps around, just follow it
  6398. // back to index zero and keep going.
  6399. ++ExpectedElt;
  6400. if (ExpectedElt == NumElts)
  6401. ExpectedElt = 0;
  6402. if (M[i] < 0) continue; // ignore UNDEF indices
  6403. if (ExpectedElt != static_cast<unsigned>(M[i]))
  6404. return false;
  6405. }
  6406. return true;
  6407. }
  6408. static bool isVEXTMask(ArrayRef<int> M, EVT VT,
  6409. bool &ReverseVEXT, unsigned &Imm) {
  6410. unsigned NumElts = VT.getVectorNumElements();
  6411. ReverseVEXT = false;
  6412. // Assume that the first shuffle index is not UNDEF. Fail if it is.
  6413. if (M[0] < 0)
  6414. return false;
  6415. Imm = M[0];
  6416. // If this is a VEXT shuffle, the immediate value is the index of the first
  6417. // element. The other shuffle indices must be the successive elements after
  6418. // the first one.
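// For example, with two <4 x i32> sources the mask <3, 4, 5, 6> is a VEXT with
// Imm == 3, while <7, 0, 1, 2> is a VEXT with the sources swapped
// (ReverseVEXT == true) and Imm adjusted to 7 - 4 == 3.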
  6419. unsigned ExpectedElt = Imm;
  6420. for (unsigned i = 1; i < NumElts; ++i) {
  6421. // Increment the expected index. If it wraps around, it may still be
  6422. // a VEXT but the source vectors must be swapped.
  6423. ExpectedElt += 1;
  6424. if (ExpectedElt == NumElts * 2) {
  6425. ExpectedElt = 0;
  6426. ReverseVEXT = true;
  6427. }
  6428. if (M[i] < 0) continue; // ignore UNDEF indices
  6429. if (ExpectedElt != static_cast<unsigned>(M[i]))
  6430. return false;
  6431. }
  6432. // Adjust the index value if the source operands will be swapped.
  6433. if (ReverseVEXT)
  6434. Imm -= NumElts;
  6435. return true;
  6436. }
  6437. static bool isVTBLMask(ArrayRef<int> M, EVT VT) {
  6438. // We can handle <8 x i8> vector shuffles. If the index in the mask is out of
  6439. // range, then 0 is placed into the resulting vector. So pretty much any mask
  6440. // of 8 elements can work here.
  6441. return VT == MVT::v8i8 && M.size() == 8;
  6442. }
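// Helper for the two-result shuffle checks below: when the mask has twice as
// many elements as each source (the single-result form), the half is simply
// Index / Elements; otherwise the mask entry at Index (the first element of
// the block being checked) decides which of the two results is matched.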
  6443. static unsigned SelectPairHalf(unsigned Elements, ArrayRef<int> Mask,
  6444. unsigned Index) {
  6445. if (Mask.size() == Elements * 2)
  6446. return Index / Elements;
  6447. return Mask[Index] == 0 ? 0 : 1;
  6448. }
  6449. // Checks whether the shuffle mask represents a vector transpose (VTRN) by
  6450. // checking that pairs of elements in the shuffle mask represent the same index
  6451. // in each vector, incrementing the expected index by 2 at each step.
  6452. // e.g. For v1,v2 of type v4i32 a valid shuffle mask is: [0, 4, 2, 6]
  6453. // v1={a,b,c,d} => x=shufflevector v1, v2 shufflemask => x={a,e,c,g}
  6454. // v2={e,f,g,h}
  6455. // WhichResult gives the offset for each element in the mask based on which
  6456. // of the two results it belongs to.
  6457. //
  6458. // The transpose can be represented either as:
  6459. // result1 = shufflevector v1, v2, result1_shuffle_mask
  6460. // result2 = shufflevector v1, v2, result2_shuffle_mask
  6461. // where v1/v2 and the shuffle masks have the same number of elements
  6462. // (here WhichResult (see below) indicates which result is being checked)
  6463. //
  6464. // or as:
  6465. // results = shufflevector v1, v2, shuffle_mask
6466. // where both results are returned in one vector and the shuffle mask has twice
6467. // as many elements as v1/v2 (here WhichResult will always be 0 if true). In this
6468. // case we check the low half and the high half of the shuffle mask as if each
6469. // were a mask of the first form.
  6470. static bool isVTRNMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
  6471. unsigned EltSz = VT.getScalarSizeInBits();
  6472. if (EltSz == 64)
  6473. return false;
  6474. unsigned NumElts = VT.getVectorNumElements();
  6475. if (M.size() != NumElts && M.size() != NumElts*2)
  6476. return false;
  6477. // If the mask is twice as long as the input vector then we need to check the
  6478. // upper and lower parts of the mask with a matching value for WhichResult
  6479. // FIXME: A mask with only even values will be rejected in case the first
  6480. // element is undefined, e.g. [-1, 4, 2, 6] will be rejected, because only
  6481. // M[0] is used to determine WhichResult
  6482. for (unsigned i = 0; i < M.size(); i += NumElts) {
  6483. WhichResult = SelectPairHalf(NumElts, M, i);
  6484. for (unsigned j = 0; j < NumElts; j += 2) {
  6485. if ((M[i+j] >= 0 && (unsigned) M[i+j] != j + WhichResult) ||
  6486. (M[i+j+1] >= 0 && (unsigned) M[i+j+1] != j + NumElts + WhichResult))
  6487. return false;
  6488. }
  6489. }
  6490. if (M.size() == NumElts*2)
  6491. WhichResult = 0;
  6492. return true;
  6493. }
  6494. /// isVTRN_v_undef_Mask - Special case of isVTRNMask for canonical form of
  6495. /// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
  6496. /// Mask is e.g., <0, 0, 2, 2> instead of <0, 4, 2, 6>.
  6497. static bool isVTRN_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult){
  6498. unsigned EltSz = VT.getScalarSizeInBits();
  6499. if (EltSz == 64)
  6500. return false;
  6501. unsigned NumElts = VT.getVectorNumElements();
  6502. if (M.size() != NumElts && M.size() != NumElts*2)
  6503. return false;
  6504. for (unsigned i = 0; i < M.size(); i += NumElts) {
  6505. WhichResult = SelectPairHalf(NumElts, M, i);
  6506. for (unsigned j = 0; j < NumElts; j += 2) {
  6507. if ((M[i+j] >= 0 && (unsigned) M[i+j] != j + WhichResult) ||
  6508. (M[i+j+1] >= 0 && (unsigned) M[i+j+1] != j + WhichResult))
  6509. return false;
  6510. }
  6511. }
  6512. if (M.size() == NumElts*2)
  6513. WhichResult = 0;
  6514. return true;
  6515. }
  6516. // Checks whether the shuffle mask represents a vector unzip (VUZP) by checking
  6517. // that the mask elements are either all even and in steps of size 2 or all odd
  6518. // and in steps of size 2.
  6519. // e.g. For v1,v2 of type v4i32 a valid shuffle mask is: [0, 2, 4, 6]
  6520. // v1={a,b,c,d} => x=shufflevector v1, v2 shufflemask => x={a,c,e,g}
  6521. // v2={e,f,g,h}
6522. // Requires similar checks to those of isVTRNMask with
6523. // respect to how the results are returned.
  6524. static bool isVUZPMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
  6525. unsigned EltSz = VT.getScalarSizeInBits();
  6526. if (EltSz == 64)
  6527. return false;
  6528. unsigned NumElts = VT.getVectorNumElements();
  6529. if (M.size() != NumElts && M.size() != NumElts*2)
  6530. return false;
  6531. for (unsigned i = 0; i < M.size(); i += NumElts) {
  6532. WhichResult = SelectPairHalf(NumElts, M, i);
  6533. for (unsigned j = 0; j < NumElts; ++j) {
  6534. if (M[i+j] >= 0 && (unsigned) M[i+j] != 2 * j + WhichResult)
  6535. return false;
  6536. }
  6537. }
  6538. if (M.size() == NumElts*2)
  6539. WhichResult = 0;
  6540. // VUZP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32.
  6541. if (VT.is64BitVector() && EltSz == 32)
  6542. return false;
  6543. return true;
  6544. }
  6545. /// isVUZP_v_undef_Mask - Special case of isVUZPMask for canonical form of
  6546. /// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
6547. /// Mask is e.g., <0, 2, 0, 2> instead of <0, 2, 4, 6>.
  6548. static bool isVUZP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult){
  6549. unsigned EltSz = VT.getScalarSizeInBits();
  6550. if (EltSz == 64)
  6551. return false;
  6552. unsigned NumElts = VT.getVectorNumElements();
  6553. if (M.size() != NumElts && M.size() != NumElts*2)
  6554. return false;
  6555. unsigned Half = NumElts / 2;
  6556. for (unsigned i = 0; i < M.size(); i += NumElts) {
  6557. WhichResult = SelectPairHalf(NumElts, M, i);
  6558. for (unsigned j = 0; j < NumElts; j += Half) {
  6559. unsigned Idx = WhichResult;
  6560. for (unsigned k = 0; k < Half; ++k) {
  6561. int MIdx = M[i + j + k];
  6562. if (MIdx >= 0 && (unsigned) MIdx != Idx)
  6563. return false;
  6564. Idx += 2;
  6565. }
  6566. }
  6567. }
  6568. if (M.size() == NumElts*2)
  6569. WhichResult = 0;
  6570. // VUZP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32.
  6571. if (VT.is64BitVector() && EltSz == 32)
  6572. return false;
  6573. return true;
  6574. }
  6575. // Checks whether the shuffle mask represents a vector zip (VZIP) by checking
6576. // that pairs of elements of the shuffle mask represent the same index in each
6577. // vector, incrementing sequentially through the vectors.
  6578. // e.g. For v1,v2 of type v4i32 a valid shuffle mask is: [0, 4, 1, 5]
  6579. // v1={a,b,c,d} => x=shufflevector v1, v2 shufflemask => x={a,e,b,f}
  6580. // v2={e,f,g,h}
6581. // Requires similar checks to those of isVTRNMask with respect to how the
6582. // results are returned.
  6583. static bool isVZIPMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
  6584. unsigned EltSz = VT.getScalarSizeInBits();
  6585. if (EltSz == 64)
  6586. return false;
  6587. unsigned NumElts = VT.getVectorNumElements();
  6588. if (M.size() != NumElts && M.size() != NumElts*2)
  6589. return false;
  6590. for (unsigned i = 0; i < M.size(); i += NumElts) {
  6591. WhichResult = SelectPairHalf(NumElts, M, i);
  6592. unsigned Idx = WhichResult * NumElts / 2;
  6593. for (unsigned j = 0; j < NumElts; j += 2) {
  6594. if ((M[i+j] >= 0 && (unsigned) M[i+j] != Idx) ||
  6595. (M[i+j+1] >= 0 && (unsigned) M[i+j+1] != Idx + NumElts))
  6596. return false;
  6597. Idx += 1;
  6598. }
  6599. }
  6600. if (M.size() == NumElts*2)
  6601. WhichResult = 0;
  6602. // VZIP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32.
  6603. if (VT.is64BitVector() && EltSz == 32)
  6604. return false;
  6605. return true;
  6606. }
  6607. /// isVZIP_v_undef_Mask - Special case of isVZIPMask for canonical form of
  6608. /// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
  6609. /// Mask is e.g., <0, 0, 1, 1> instead of <0, 4, 1, 5>.
  6610. static bool isVZIP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult){
  6611. unsigned EltSz = VT.getScalarSizeInBits();
  6612. if (EltSz == 64)
  6613. return false;
  6614. unsigned NumElts = VT.getVectorNumElements();
  6615. if (M.size() != NumElts && M.size() != NumElts*2)
  6616. return false;
  6617. for (unsigned i = 0; i < M.size(); i += NumElts) {
  6618. WhichResult = SelectPairHalf(NumElts, M, i);
  6619. unsigned Idx = WhichResult * NumElts / 2;
  6620. for (unsigned j = 0; j < NumElts; j += 2) {
  6621. if ((M[i+j] >= 0 && (unsigned) M[i+j] != Idx) ||
  6622. (M[i+j+1] >= 0 && (unsigned) M[i+j+1] != Idx))
  6623. return false;
  6624. Idx += 1;
  6625. }
  6626. }
  6627. if (M.size() == NumElts*2)
  6628. WhichResult = 0;
  6629. // VZIP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32.
  6630. if (VT.is64BitVector() && EltSz == 32)
  6631. return false;
  6632. return true;
  6633. }
  6634. /// Check if \p ShuffleMask is a NEON two-result shuffle (VZIP, VUZP, VTRN),
  6635. /// and return the corresponding ARMISD opcode if it is, or 0 if it isn't.
  6636. static unsigned isNEONTwoResultShuffleMask(ArrayRef<int> ShuffleMask, EVT VT,
  6637. unsigned &WhichResult,
  6638. bool &isV_UNDEF) {
  6639. isV_UNDEF = false;
  6640. if (isVTRNMask(ShuffleMask, VT, WhichResult))
  6641. return ARMISD::VTRN;
  6642. if (isVUZPMask(ShuffleMask, VT, WhichResult))
  6643. return ARMISD::VUZP;
  6644. if (isVZIPMask(ShuffleMask, VT, WhichResult))
  6645. return ARMISD::VZIP;
  6646. isV_UNDEF = true;
  6647. if (isVTRN_v_undef_Mask(ShuffleMask, VT, WhichResult))
  6648. return ARMISD::VTRN;
  6649. if (isVUZP_v_undef_Mask(ShuffleMask, VT, WhichResult))
  6650. return ARMISD::VUZP;
  6651. if (isVZIP_v_undef_Mask(ShuffleMask, VT, WhichResult))
  6652. return ARMISD::VZIP;
  6653. return 0;
  6654. }
6655. /// \return true if this is a reverse operation on a vector.
  6656. static bool isReverseMask(ArrayRef<int> M, EVT VT) {
  6657. unsigned NumElts = VT.getVectorNumElements();
  6658. // Make sure the mask has the right size.
  6659. if (NumElts != M.size())
  6660. return false;
  6661. // Look for <15, ..., 3, -1, 1, 0>.
  6662. for (unsigned i = 0; i != NumElts; ++i)
  6663. if (M[i] >= 0 && M[i] != (int) (NumElts - 1 - i))
  6664. return false;
  6665. return true;
  6666. }
  6667. static bool isVMOVNMask(ArrayRef<int> M, EVT VT, bool Top, bool SingleSource) {
  6668. unsigned NumElts = VT.getVectorNumElements();
  6669. // Make sure the mask has the right size.
  6670. if (NumElts != M.size() || (VT != MVT::v8i16 && VT != MVT::v16i8))
  6671. return false;
  6672. // If Top
  6673. // Look for <0, N, 2, N+2, 4, N+4, ..>.
  6674. // This inserts Input2 into Input1
  6675. // else if not Top
  6676. // Look for <0, N+1, 2, N+3, 4, N+5, ..>
  6677. // This inserts Input1 into Input2
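// For example, for v8i16 with two sources (N == 8):
// Top: <0, 8, 2, 10, 4, 12, 6, 14>
// !Top: <0, 9, 2, 11, 4, 13, 6, 15>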
  6678. unsigned Offset = Top ? 0 : 1;
  6679. unsigned N = SingleSource ? 0 : NumElts;
  6680. for (unsigned i = 0; i < NumElts; i += 2) {
  6681. if (M[i] >= 0 && M[i] != (int)i)
  6682. return false;
  6683. if (M[i + 1] >= 0 && M[i + 1] != (int)(N + i + Offset))
  6684. return false;
  6685. }
  6686. return true;
  6687. }
  6688. static bool isVMOVNTruncMask(ArrayRef<int> M, EVT ToVT, bool rev) {
  6689. unsigned NumElts = ToVT.getVectorNumElements();
  6690. if (NumElts != M.size())
  6691. return false;
6692. // Test whether the Trunc can be converted to a VMOVN with this shuffle. We are
  6693. // looking for patterns of:
  6694. // !rev: 0 N/2 1 N/2+1 2 N/2+2 ...
  6695. // rev: N/2 0 N/2+1 1 N/2+2 2 ...
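// For example, for an 8-element result the !rev pattern is
// <0, 4, 1, 5, 2, 6, 3, 7> and the rev pattern is <4, 0, 5, 1, 6, 2, 7, 3>.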
  6696. unsigned Off0 = rev ? NumElts / 2 : 0;
  6697. unsigned Off1 = rev ? 0 : NumElts / 2;
  6698. for (unsigned i = 0; i < NumElts; i += 2) {
  6699. if (M[i] >= 0 && M[i] != (int)(Off0 + i / 2))
  6700. return false;
  6701. if (M[i + 1] >= 0 && M[i + 1] != (int)(Off1 + i / 2))
  6702. return false;
  6703. }
  6704. return true;
  6705. }
  6706. // Reconstruct an MVE VCVT from a BuildVector of scalar fptrunc, all extracted
  6707. // from a pair of inputs. For example:
  6708. // BUILDVECTOR(FP_ROUND(EXTRACT_ELT(X, 0),
  6709. // FP_ROUND(EXTRACT_ELT(Y, 0),
  6710. // FP_ROUND(EXTRACT_ELT(X, 1),
  6711. // FP_ROUND(EXTRACT_ELT(Y, 1), ...)
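// When the pattern matches, the result is built from two ARMISD::VCVTN nodes:
// the first narrows X into the even (bottom) lanes of a v8f16 and the second
// narrows Y into the odd (top) lanes of the same vector.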
  6712. static SDValue LowerBuildVectorOfFPTrunc(SDValue BV, SelectionDAG &DAG,
  6713. const ARMSubtarget *ST) {
  6714. assert(BV.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!");
  6715. if (!ST->hasMVEFloatOps())
  6716. return SDValue();
  6717. SDLoc dl(BV);
  6718. EVT VT = BV.getValueType();
  6719. if (VT != MVT::v8f16)
  6720. return SDValue();
6721. // We are looking for a buildvector of fptrunc elements, where all the
6722. // elements are extracted, interleaved, from two sources. Check that the first
6723. // two items look valid and extract some info from them (they are checked
6724. // properly in the loop below).
  6725. if (BV.getOperand(0).getOpcode() != ISD::FP_ROUND ||
  6726. BV.getOperand(0).getOperand(0).getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
  6727. BV.getOperand(0).getOperand(0).getConstantOperandVal(1) != 0)
  6728. return SDValue();
  6729. if (BV.getOperand(1).getOpcode() != ISD::FP_ROUND ||
  6730. BV.getOperand(1).getOperand(0).getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
  6731. BV.getOperand(1).getOperand(0).getConstantOperandVal(1) != 0)
  6732. return SDValue();
  6733. SDValue Op0 = BV.getOperand(0).getOperand(0).getOperand(0);
  6734. SDValue Op1 = BV.getOperand(1).getOperand(0).getOperand(0);
  6735. if (Op0.getValueType() != MVT::v4f32 || Op1.getValueType() != MVT::v4f32)
  6736. return SDValue();
  6737. // Check all the values in the BuildVector line up with our expectations.
  6738. for (unsigned i = 1; i < 4; i++) {
  6739. auto Check = [](SDValue Trunc, SDValue Op, unsigned Idx) {
  6740. return Trunc.getOpcode() == ISD::FP_ROUND &&
  6741. Trunc.getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
  6742. Trunc.getOperand(0).getOperand(0) == Op &&
  6743. Trunc.getOperand(0).getConstantOperandVal(1) == Idx;
  6744. };
  6745. if (!Check(BV.getOperand(i * 2 + 0), Op0, i))
  6746. return SDValue();
  6747. if (!Check(BV.getOperand(i * 2 + 1), Op1, i))
  6748. return SDValue();
  6749. }
  6750. SDValue N1 = DAG.getNode(ARMISD::VCVTN, dl, VT, DAG.getUNDEF(VT), Op0,
  6751. DAG.getConstant(0, dl, MVT::i32));
  6752. return DAG.getNode(ARMISD::VCVTN, dl, VT, N1, Op1,
  6753. DAG.getConstant(1, dl, MVT::i32));
  6754. }
  6755. // Reconstruct an MVE VCVT from a BuildVector of scalar fpext, all extracted
  6756. // from a single input on alternating lanes. For example:
6757. // BUILDVECTOR(FP_EXTEND(EXTRACT_ELT(X, 0),
6758. // FP_EXTEND(EXTRACT_ELT(X, 2),
6759. // FP_EXTEND(EXTRACT_ELT(X, 4), ...)
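// When the pattern matches, the whole build becomes a single ARMISD::VCVTL
// that widens the even lanes (Offset == 0) or the odd lanes (Offset == 1) of
// the v8f16 source into a v4f32 result.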
  6760. static SDValue LowerBuildVectorOfFPExt(SDValue BV, SelectionDAG &DAG,
  6761. const ARMSubtarget *ST) {
  6762. assert(BV.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!");
  6763. if (!ST->hasMVEFloatOps())
  6764. return SDValue();
  6765. SDLoc dl(BV);
  6766. EVT VT = BV.getValueType();
  6767. if (VT != MVT::v4f32)
  6768. return SDValue();
6769. // We are looking for a buildvector of fpext elements, where all the
6770. // elements are alternating lanes from a single source. For example <0,2,4,6>
6771. // or <1,3,5,7>. Check that the first two items look valid and extract some
6772. // info from them (they are checked properly in the loop below).
  6773. if (BV.getOperand(0).getOpcode() != ISD::FP_EXTEND ||
  6774. BV.getOperand(0).getOperand(0).getOpcode() != ISD::EXTRACT_VECTOR_ELT)
  6775. return SDValue();
  6776. SDValue Op0 = BV.getOperand(0).getOperand(0).getOperand(0);
  6777. int Offset = BV.getOperand(0).getOperand(0).getConstantOperandVal(1);
  6778. if (Op0.getValueType() != MVT::v8f16 || (Offset != 0 && Offset != 1))
  6779. return SDValue();
  6780. // Check all the values in the BuildVector line up with our expectations.
  6781. for (unsigned i = 1; i < 4; i++) {
  6782. auto Check = [](SDValue Trunc, SDValue Op, unsigned Idx) {
  6783. return Trunc.getOpcode() == ISD::FP_EXTEND &&
  6784. Trunc.getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
  6785. Trunc.getOperand(0).getOperand(0) == Op &&
  6786. Trunc.getOperand(0).getConstantOperandVal(1) == Idx;
  6787. };
  6788. if (!Check(BV.getOperand(i), Op0, 2 * i + Offset))
  6789. return SDValue();
  6790. }
  6791. return DAG.getNode(ARMISD::VCVTL, dl, VT, Op0,
  6792. DAG.getConstant(Offset, dl, MVT::i32));
  6793. }
  6794. // If N is an integer constant that can be moved into a register in one
  6795. // instruction, return an SDValue of such a constant (will become a MOV
  6796. // instruction). Otherwise return null.
  6797. static SDValue IsSingleInstrConstant(SDValue N, SelectionDAG &DAG,
  6798. const ARMSubtarget *ST, const SDLoc &dl) {
  6799. uint64_t Val;
  6800. if (!isa<ConstantSDNode>(N))
  6801. return SDValue();
  6802. Val = cast<ConstantSDNode>(N)->getZExtValue();
  6803. if (ST->isThumb1Only()) {
  6804. if (Val <= 255 || ~Val <= 255)
  6805. return DAG.getConstant(Val, dl, MVT::i32);
  6806. } else {
  6807. if (ARM_AM::getSOImmVal(Val) != -1 || ARM_AM::getSOImmVal(~Val) != -1)
  6808. return DAG.getConstant(Val, dl, MVT::i32);
  6809. }
  6810. return SDValue();
  6811. }
  6812. static SDValue LowerBUILD_VECTOR_i1(SDValue Op, SelectionDAG &DAG,
  6813. const ARMSubtarget *ST) {
  6814. SDLoc dl(Op);
  6815. EVT VT = Op.getValueType();
  6816. assert(ST->hasMVEIntegerOps() && "LowerBUILD_VECTOR_i1 called without MVE!");
  6817. unsigned NumElts = VT.getVectorNumElements();
  6818. unsigned BoolMask;
  6819. unsigned BitsPerBool;
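// An MVE predicate lives in the 16-bit VPR.P0 register, so every legal
// vNi1 type occupies all 16 bits: each i1 lane is replicated across a field
// of 16 / NumElts bits, which is what BitsPerBool and BoolMask describe.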
  6820. if (NumElts == 2) {
  6821. BitsPerBool = 8;
  6822. BoolMask = 0xff;
  6823. } else if (NumElts == 4) {
  6824. BitsPerBool = 4;
  6825. BoolMask = 0xf;
  6826. } else if (NumElts == 8) {
  6827. BitsPerBool = 2;
  6828. BoolMask = 0x3;
  6829. } else if (NumElts == 16) {
  6830. BitsPerBool = 1;
  6831. BoolMask = 0x1;
  6832. } else
  6833. return SDValue();
  6834. // If this is a single value copied into all lanes (a splat), we can just sign
  6835. // extend that single value
  6836. SDValue FirstOp = Op.getOperand(0);
  6837. if (!isa<ConstantSDNode>(FirstOp) &&
  6838. std::all_of(std::next(Op->op_begin()), Op->op_end(),
  6839. [&FirstOp](SDUse &U) {
  6840. return U.get().isUndef() || U.get() == FirstOp;
  6841. })) {
  6842. SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::i32, FirstOp,
  6843. DAG.getValueType(MVT::i1));
  6844. return DAG.getNode(ARMISD::PREDICATE_CAST, dl, Op.getValueType(), Ext);
  6845. }
  6846. // First create base with bits set where known
  6847. unsigned Bits32 = 0;
  6848. for (unsigned i = 0; i < NumElts; ++i) {
  6849. SDValue V = Op.getOperand(i);
  6850. if (!isa<ConstantSDNode>(V) && !V.isUndef())
  6851. continue;
  6852. bool BitSet = V.isUndef() ? false : cast<ConstantSDNode>(V)->getZExtValue();
  6853. if (BitSet)
  6854. Bits32 |= BoolMask << (i * BitsPerBool);
  6855. }
  6856. // Add in unknown nodes
  6857. SDValue Base = DAG.getNode(ARMISD::PREDICATE_CAST, dl, VT,
  6858. DAG.getConstant(Bits32, dl, MVT::i32));
  6859. for (unsigned i = 0; i < NumElts; ++i) {
  6860. SDValue V = Op.getOperand(i);
  6861. if (isa<ConstantSDNode>(V) || V.isUndef())
  6862. continue;
  6863. Base = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Base, V,
  6864. DAG.getConstant(i, dl, MVT::i32));
  6865. }
  6866. return Base;
  6867. }
  6868. static SDValue LowerBUILD_VECTORToVIDUP(SDValue Op, SelectionDAG &DAG,
  6869. const ARMSubtarget *ST) {
  6870. if (!ST->hasMVEIntegerOps())
  6871. return SDValue();
  6872. // We are looking for a buildvector where each element is Op[0] + i*N
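// e.g. BUILD_VECTOR(x, x+2, x+4, x+6, ...) with N == 2 becomes VIDUP(x, 2).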
  6873. EVT VT = Op.getValueType();
  6874. SDValue Op0 = Op.getOperand(0);
  6875. unsigned NumElts = VT.getVectorNumElements();
  6876. // Get the increment value from operand 1
  6877. SDValue Op1 = Op.getOperand(1);
  6878. if (Op1.getOpcode() != ISD::ADD || Op1.getOperand(0) != Op0 ||
  6879. !isa<ConstantSDNode>(Op1.getOperand(1)))
  6880. return SDValue();
  6881. unsigned N = Op1.getConstantOperandVal(1);
  6882. if (N != 1 && N != 2 && N != 4 && N != 8)
  6883. return SDValue();
  6884. // Check that each other operand matches
  6885. for (unsigned I = 2; I < NumElts; I++) {
  6886. SDValue OpI = Op.getOperand(I);
  6887. if (OpI.getOpcode() != ISD::ADD || OpI.getOperand(0) != Op0 ||
  6888. !isa<ConstantSDNode>(OpI.getOperand(1)) ||
  6889. OpI.getConstantOperandVal(1) != I * N)
  6890. return SDValue();
  6891. }
  6892. SDLoc DL(Op);
  6893. return DAG.getNode(ARMISD::VIDUP, DL, DAG.getVTList(VT, MVT::i32), Op0,
  6894. DAG.getConstant(N, DL, MVT::i32));
  6895. }
6896. // Returns true if the operation N can be treated as a qr instruction variant
6897. // at operand Op.
  6898. static bool IsQRMVEInstruction(const SDNode *N, const SDNode *Op) {
  6899. switch (N->getOpcode()) {
  6900. case ISD::ADD:
  6901. case ISD::MUL:
  6902. case ISD::SADDSAT:
  6903. case ISD::UADDSAT:
  6904. return true;
  6905. case ISD::SUB:
  6906. case ISD::SSUBSAT:
  6907. case ISD::USUBSAT:
  6908. return N->getOperand(1).getNode() == Op;
  6909. case ISD::INTRINSIC_WO_CHAIN:
  6910. switch (N->getConstantOperandVal(0)) {
  6911. case Intrinsic::arm_mve_add_predicated:
  6912. case Intrinsic::arm_mve_mul_predicated:
  6913. case Intrinsic::arm_mve_qadd_predicated:
  6914. case Intrinsic::arm_mve_vhadd:
  6915. case Intrinsic::arm_mve_hadd_predicated:
  6916. case Intrinsic::arm_mve_vqdmulh:
  6917. case Intrinsic::arm_mve_qdmulh_predicated:
  6918. case Intrinsic::arm_mve_vqrdmulh:
  6919. case Intrinsic::arm_mve_qrdmulh_predicated:
  6920. case Intrinsic::arm_mve_vqdmull:
  6921. case Intrinsic::arm_mve_vqdmull_predicated:
  6922. return true;
  6923. case Intrinsic::arm_mve_sub_predicated:
  6924. case Intrinsic::arm_mve_qsub_predicated:
  6925. case Intrinsic::arm_mve_vhsub:
  6926. case Intrinsic::arm_mve_hsub_predicated:
  6927. return N->getOperand(2).getNode() == Op;
  6928. default:
  6929. return false;
  6930. }
  6931. default:
  6932. return false;
  6933. }
  6934. }
  6935. // If this is a case we can't handle, return null and let the default
  6936. // expansion code take care of it.
  6937. SDValue ARMTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG,
  6938. const ARMSubtarget *ST) const {
  6939. BuildVectorSDNode *BVN = cast<BuildVectorSDNode>(Op.getNode());
  6940. SDLoc dl(Op);
  6941. EVT VT = Op.getValueType();
  6942. if (ST->hasMVEIntegerOps() && VT.getScalarSizeInBits() == 1)
  6943. return LowerBUILD_VECTOR_i1(Op, DAG, ST);
  6944. if (SDValue R = LowerBUILD_VECTORToVIDUP(Op, DAG, ST))
  6945. return R;
  6946. APInt SplatBits, SplatUndef;
  6947. unsigned SplatBitSize;
  6948. bool HasAnyUndefs;
  6949. if (BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) {
  6950. if (SplatUndef.isAllOnes())
  6951. return DAG.getUNDEF(VT);
  6952. // If all the users of this constant splat are qr instruction variants,
  6953. // generate a vdup of the constant.
  6954. if (ST->hasMVEIntegerOps() && VT.getScalarSizeInBits() == SplatBitSize &&
  6955. (SplatBitSize == 8 || SplatBitSize == 16 || SplatBitSize == 32) &&
  6956. all_of(BVN->uses(),
  6957. [BVN](const SDNode *U) { return IsQRMVEInstruction(U, BVN); })) {
  6958. EVT DupVT = SplatBitSize == 32 ? MVT::v4i32
  6959. : SplatBitSize == 16 ? MVT::v8i16
  6960. : MVT::v16i8;
  6961. SDValue Const = DAG.getConstant(SplatBits.getZExtValue(), dl, MVT::i32);
  6962. SDValue VDup = DAG.getNode(ARMISD::VDUP, dl, DupVT, Const);
  6963. return DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, VT, VDup);
  6964. }
  6965. if ((ST->hasNEON() && SplatBitSize <= 64) ||
  6966. (ST->hasMVEIntegerOps() && SplatBitSize <= 64)) {
  6967. // Check if an immediate VMOV works.
  6968. EVT VmovVT;
  6969. SDValue Val =
  6970. isVMOVModifiedImm(SplatBits.getZExtValue(), SplatUndef.getZExtValue(),
  6971. SplatBitSize, DAG, dl, VmovVT, VT, VMOVModImm);
  6972. if (Val.getNode()) {
  6973. SDValue Vmov = DAG.getNode(ARMISD::VMOVIMM, dl, VmovVT, Val);
  6974. return DAG.getNode(ISD::BITCAST, dl, VT, Vmov);
  6975. }
  6976. // Try an immediate VMVN.
  6977. uint64_t NegatedImm = (~SplatBits).getZExtValue();
  6978. Val = isVMOVModifiedImm(
  6979. NegatedImm, SplatUndef.getZExtValue(), SplatBitSize, DAG, dl, VmovVT,
  6980. VT, ST->hasMVEIntegerOps() ? MVEVMVNModImm : VMVNModImm);
  6981. if (Val.getNode()) {
  6982. SDValue Vmov = DAG.getNode(ARMISD::VMVNIMM, dl, VmovVT, Val);
  6983. return DAG.getNode(ISD::BITCAST, dl, VT, Vmov);
  6984. }
  6985. // Use vmov.f32 to materialize other v2f32 and v4f32 splats.
  6986. if ((VT == MVT::v2f32 || VT == MVT::v4f32) && SplatBitSize == 32) {
  6987. int ImmVal = ARM_AM::getFP32Imm(SplatBits);
  6988. if (ImmVal != -1) {
  6989. SDValue Val = DAG.getTargetConstant(ImmVal, dl, MVT::i32);
  6990. return DAG.getNode(ARMISD::VMOVFPIMM, dl, VT, Val);
  6991. }
  6992. }
  6993. // If we are under MVE, generate a VDUP(constant), bitcast to the original
  6994. // type.
  6995. if (ST->hasMVEIntegerOps() &&
  6996. (SplatBitSize == 8 || SplatBitSize == 16 || SplatBitSize == 32)) {
  6997. EVT DupVT = SplatBitSize == 32 ? MVT::v4i32
  6998. : SplatBitSize == 16 ? MVT::v8i16
  6999. : MVT::v16i8;
  7000. SDValue Const = DAG.getConstant(SplatBits.getZExtValue(), dl, MVT::i32);
  7001. SDValue VDup = DAG.getNode(ARMISD::VDUP, dl, DupVT, Const);
  7002. return DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, VT, VDup);
  7003. }
  7004. }
  7005. }
  7006. // Scan through the operands to see if only one value is used.
  7007. //
  7008. // As an optimisation, even if more than one value is used it may be more
  7009. // profitable to splat with one value then change some lanes.
  7010. //
  7011. // Heuristically we decide to do this if the vector has a "dominant" value,
  7012. // defined as splatted to more than half of the lanes.
  7013. unsigned NumElts = VT.getVectorNumElements();
  7014. bool isOnlyLowElement = true;
  7015. bool usesOnlyOneValue = true;
  7016. bool hasDominantValue = false;
  7017. bool isConstant = true;
  7018. // Map of the number of times a particular SDValue appears in the
  7019. // element list.
  7020. DenseMap<SDValue, unsigned> ValueCounts;
  7021. SDValue Value;
  7022. for (unsigned i = 0; i < NumElts; ++i) {
  7023. SDValue V = Op.getOperand(i);
  7024. if (V.isUndef())
  7025. continue;
  7026. if (i > 0)
  7027. isOnlyLowElement = false;
  7028. if (!isa<ConstantFPSDNode>(V) && !isa<ConstantSDNode>(V))
  7029. isConstant = false;
  7030. ValueCounts.insert(std::make_pair(V, 0));
  7031. unsigned &Count = ValueCounts[V];
  7032. // Is this value dominant? (takes up more than half of the lanes)
  7033. if (++Count > (NumElts / 2)) {
  7034. hasDominantValue = true;
  7035. Value = V;
  7036. }
  7037. }
  7038. if (ValueCounts.size() != 1)
  7039. usesOnlyOneValue = false;
  7040. if (!Value.getNode() && !ValueCounts.empty())
  7041. Value = ValueCounts.begin()->first;
  7042. if (ValueCounts.empty())
  7043. return DAG.getUNDEF(VT);
  7044. // Loads are better lowered with insert_vector_elt/ARMISD::BUILD_VECTOR.
  7045. // Keep going if we are hitting this case.
  7046. if (isOnlyLowElement && !ISD::isNormalLoad(Value.getNode()))
  7047. return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Value);
  7048. unsigned EltSize = VT.getScalarSizeInBits();
  7049. // Use VDUP for non-constant splats. For f32 constant splats, reduce to
  7050. // i32 and try again.
  7051. if (hasDominantValue && EltSize <= 32) {
  7052. if (!isConstant) {
  7053. SDValue N;
  7054. // If we are VDUPing a value that comes directly from a vector, that will
  7055. // cause an unnecessary move to and from a GPR, where instead we could
  7056. // just use VDUPLANE. We can only do this if the lane being extracted
  7057. // is at a constant index, as the VDUP from lane instructions only have
  7058. // constant-index forms.
  7059. ConstantSDNode *constIndex;
  7060. if (Value->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
  7061. (constIndex = dyn_cast<ConstantSDNode>(Value->getOperand(1)))) {
  7062. // We need to create a new undef vector to use for the VDUPLANE if the
  7063. // size of the vector from which we get the value is different than the
  7064. // size of the vector that we need to create. We will insert the element
  7065. // such that the register coalescer will remove unnecessary copies.
  7066. if (VT != Value->getOperand(0).getValueType()) {
  7067. unsigned index = constIndex->getAPIntValue().getLimitedValue() %
  7068. VT.getVectorNumElements();
  7069. N = DAG.getNode(ARMISD::VDUPLANE, dl, VT,
  7070. DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, DAG.getUNDEF(VT),
  7071. Value, DAG.getConstant(index, dl, MVT::i32)),
  7072. DAG.getConstant(index, dl, MVT::i32));
  7073. } else
  7074. N = DAG.getNode(ARMISD::VDUPLANE, dl, VT,
  7075. Value->getOperand(0), Value->getOperand(1));
  7076. } else
  7077. N = DAG.getNode(ARMISD::VDUP, dl, VT, Value);
  7078. if (!usesOnlyOneValue) {
  7079. // The dominant value was splatted as 'N', but we now have to insert
  7080. // all differing elements.
  7081. for (unsigned I = 0; I < NumElts; ++I) {
  7082. if (Op.getOperand(I) == Value)
  7083. continue;
  7084. SmallVector<SDValue, 3> Ops;
  7085. Ops.push_back(N);
  7086. Ops.push_back(Op.getOperand(I));
  7087. Ops.push_back(DAG.getConstant(I, dl, MVT::i32));
  7088. N = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Ops);
  7089. }
  7090. }
  7091. return N;
  7092. }
  7093. if (VT.getVectorElementType().isFloatingPoint()) {
  7094. SmallVector<SDValue, 8> Ops;
  7095. MVT FVT = VT.getVectorElementType().getSimpleVT();
  7096. assert(FVT == MVT::f32 || FVT == MVT::f16);
  7097. MVT IVT = (FVT == MVT::f32) ? MVT::i32 : MVT::i16;
  7098. for (unsigned i = 0; i < NumElts; ++i)
  7099. Ops.push_back(DAG.getNode(ISD::BITCAST, dl, IVT,
  7100. Op.getOperand(i)));
  7101. EVT VecVT = EVT::getVectorVT(*DAG.getContext(), IVT, NumElts);
  7102. SDValue Val = DAG.getBuildVector(VecVT, dl, Ops);
  7103. Val = LowerBUILD_VECTOR(Val, DAG, ST);
  7104. if (Val.getNode())
  7105. return DAG.getNode(ISD::BITCAST, dl, VT, Val);
  7106. }
  7107. if (usesOnlyOneValue) {
  7108. SDValue Val = IsSingleInstrConstant(Value, DAG, ST, dl);
  7109. if (isConstant && Val.getNode())
  7110. return DAG.getNode(ARMISD::VDUP, dl, VT, Val);
  7111. }
  7112. }
  7113. // If all elements are constants and the case above didn't get hit, fall back
  7114. // to the default expansion, which will generate a load from the constant
  7115. // pool.
  7116. if (isConstant)
  7117. return SDValue();
7118. // Reconstruct the BUILDVECTOR as one of the legal shuffles (such as vext and
  7119. // vmovn). Empirical tests suggest this is rarely worth it for vectors of
  7120. // length <= 2.
  7121. if (NumElts >= 4)
  7122. if (SDValue shuffle = ReconstructShuffle(Op, DAG))
  7123. return shuffle;
  7124. // Attempt to turn a buildvector of scalar fptrunc's or fpext's back into
  7125. // VCVT's
  7126. if (SDValue VCVT = LowerBuildVectorOfFPTrunc(Op, DAG, Subtarget))
  7127. return VCVT;
  7128. if (SDValue VCVT = LowerBuildVectorOfFPExt(Op, DAG, Subtarget))
  7129. return VCVT;
  7130. if (ST->hasNEON() && VT.is128BitVector() && VT != MVT::v2f64 && VT != MVT::v4f32) {
  7131. // If we haven't found an efficient lowering, try splitting a 128-bit vector
  7132. // into two 64-bit vectors; we might discover a better way to lower it.
  7133. SmallVector<SDValue, 64> Ops(Op->op_begin(), Op->op_begin() + NumElts);
  7134. EVT ExtVT = VT.getVectorElementType();
  7135. EVT HVT = EVT::getVectorVT(*DAG.getContext(), ExtVT, NumElts / 2);
  7136. SDValue Lower =
  7137. DAG.getBuildVector(HVT, dl, makeArrayRef(&Ops[0], NumElts / 2));
  7138. if (Lower.getOpcode() == ISD::BUILD_VECTOR)
  7139. Lower = LowerBUILD_VECTOR(Lower, DAG, ST);
  7140. SDValue Upper = DAG.getBuildVector(
  7141. HVT, dl, makeArrayRef(&Ops[NumElts / 2], NumElts / 2));
  7142. if (Upper.getOpcode() == ISD::BUILD_VECTOR)
  7143. Upper = LowerBUILD_VECTOR(Upper, DAG, ST);
  7144. if (Lower && Upper)
  7145. return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lower, Upper);
  7146. }
  7147. // Vectors with 32- or 64-bit elements can be built by directly assigning
  7148. // the subregisters. Lower it to an ARMISD::BUILD_VECTOR so the operands
  7149. // will be legalized.
  7150. if (EltSize >= 32) {
  7151. // Do the expansion with floating-point types, since that is what the VFP
  7152. // registers are defined to use, and since i64 is not legal.
  7153. EVT EltVT = EVT::getFloatingPointVT(EltSize);
  7154. EVT VecVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumElts);
  7155. SmallVector<SDValue, 8> Ops;
  7156. for (unsigned i = 0; i < NumElts; ++i)
  7157. Ops.push_back(DAG.getNode(ISD::BITCAST, dl, EltVT, Op.getOperand(i)));
  7158. SDValue Val = DAG.getNode(ARMISD::BUILD_VECTOR, dl, VecVT, Ops);
  7159. return DAG.getNode(ISD::BITCAST, dl, VT, Val);
  7160. }
  7161. // If all else fails, just use a sequence of INSERT_VECTOR_ELT when we
  7162. // know the default expansion would otherwise fall back on something even
  7163. // worse. For a vector with one or two non-undef values, that's
  7164. // scalar_to_vector for the elements followed by a shuffle (provided the
  7165. // shuffle is valid for the target) and materialization element by element
  7166. // on the stack followed by a load for everything else.
  7167. if (!isConstant && !usesOnlyOneValue) {
  7168. SDValue Vec = DAG.getUNDEF(VT);
  7169. for (unsigned i = 0 ; i < NumElts; ++i) {
  7170. SDValue V = Op.getOperand(i);
  7171. if (V.isUndef())
  7172. continue;
  7173. SDValue LaneIdx = DAG.getConstant(i, dl, MVT::i32);
  7174. Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Vec, V, LaneIdx);
  7175. }
  7176. return Vec;
  7177. }
  7178. return SDValue();
  7179. }
  7180. // Gather data to see if the operation can be modelled as a
  7181. // shuffle in combination with VEXTs.
  7182. SDValue ARMTargetLowering::ReconstructShuffle(SDValue Op,
  7183. SelectionDAG &DAG) const {
  7184. assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!");
  7185. SDLoc dl(Op);
  7186. EVT VT = Op.getValueType();
  7187. unsigned NumElts = VT.getVectorNumElements();
  7188. struct ShuffleSourceInfo {
  7189. SDValue Vec;
  7190. unsigned MinElt = std::numeric_limits<unsigned>::max();
  7191. unsigned MaxElt = 0;
  7192. // We may insert some combination of BITCASTs and VEXT nodes to force Vec to
  7193. // be compatible with the shuffle we intend to construct. As a result
  7194. // ShuffleVec will be some sliding window into the original Vec.
  7195. SDValue ShuffleVec;
7196. // Code should guarantee that element i in Vec starts at element
7197. // "WindowBase + i * WindowScale" in ShuffleVec.
  7198. int WindowBase = 0;
  7199. int WindowScale = 1;
  7200. ShuffleSourceInfo(SDValue Vec) : Vec(Vec), ShuffleVec(Vec) {}
  7201. bool operator ==(SDValue OtherVec) { return Vec == OtherVec; }
  7202. };
  7203. // First gather all vectors used as an immediate source for this BUILD_VECTOR
  7204. // node.
  7205. SmallVector<ShuffleSourceInfo, 2> Sources;
  7206. for (unsigned i = 0; i < NumElts; ++i) {
  7207. SDValue V = Op.getOperand(i);
  7208. if (V.isUndef())
  7209. continue;
  7210. else if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT) {
  7211. // A shuffle can only come from building a vector from various
  7212. // elements of other vectors.
  7213. return SDValue();
  7214. } else if (!isa<ConstantSDNode>(V.getOperand(1))) {
  7215. // Furthermore, shuffles require a constant mask, whereas extractelts
  7216. // accept variable indices.
  7217. return SDValue();
  7218. }
  7219. // Add this element source to the list if it's not already there.
  7220. SDValue SourceVec = V.getOperand(0);
  7221. auto Source = llvm::find(Sources, SourceVec);
  7222. if (Source == Sources.end())
  7223. Source = Sources.insert(Sources.end(), ShuffleSourceInfo(SourceVec));
  7224. // Update the minimum and maximum lane number seen.
  7225. unsigned EltNo = cast<ConstantSDNode>(V.getOperand(1))->getZExtValue();
  7226. Source->MinElt = std::min(Source->MinElt, EltNo);
  7227. Source->MaxElt = std::max(Source->MaxElt, EltNo);
  7228. }
  7229. // Currently only do something sane when at most two source vectors
  7230. // are involved.
  7231. if (Sources.size() > 2)
  7232. return SDValue();
  7233. // Find out the smallest element size among result and two sources, and use
  7234. // it as element size to build the shuffle_vector.
  7235. EVT SmallestEltTy = VT.getVectorElementType();
  7236. for (auto &Source : Sources) {
  7237. EVT SrcEltTy = Source.Vec.getValueType().getVectorElementType();
  7238. if (SrcEltTy.bitsLT(SmallestEltTy))
  7239. SmallestEltTy = SrcEltTy;
  7240. }
  7241. unsigned ResMultiplier =
  7242. VT.getScalarSizeInBits() / SmallestEltTy.getSizeInBits();
  7243. NumElts = VT.getSizeInBits() / SmallestEltTy.getSizeInBits();
  7244. EVT ShuffleVT = EVT::getVectorVT(*DAG.getContext(), SmallestEltTy, NumElts);
  7245. // If the source vector is too wide or too narrow, we may nevertheless be able
  7246. // to construct a compatible shuffle either by concatenating it with UNDEF or
  7247. // extracting a suitable range of elements.
  7248. for (auto &Src : Sources) {
  7249. EVT SrcVT = Src.ShuffleVec.getValueType();
  7250. uint64_t SrcVTSize = SrcVT.getFixedSizeInBits();
  7251. uint64_t VTSize = VT.getFixedSizeInBits();
  7252. if (SrcVTSize == VTSize)
  7253. continue;
  7254. // This stage of the search produces a source with the same element type as
  7255. // the original, but with a total width matching the BUILD_VECTOR output.
  7256. EVT EltVT = SrcVT.getVectorElementType();
  7257. unsigned NumSrcElts = VTSize / EltVT.getFixedSizeInBits();
  7258. EVT DestVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumSrcElts);
  7259. if (SrcVTSize < VTSize) {
  7260. if (2 * SrcVTSize != VTSize)
  7261. return SDValue();
  7262. // We can pad out the smaller vector for free, so if it's part of a
  7263. // shuffle...
  7264. Src.ShuffleVec =
  7265. DAG.getNode(ISD::CONCAT_VECTORS, dl, DestVT, Src.ShuffleVec,
  7266. DAG.getUNDEF(Src.ShuffleVec.getValueType()));
  7267. continue;
  7268. }
  7269. if (SrcVTSize != 2 * VTSize)
  7270. return SDValue();
  7271. if (Src.MaxElt - Src.MinElt >= NumSrcElts) {
  7272. // Span too large for a VEXT to cope
  7273. return SDValue();
  7274. }
  7275. if (Src.MinElt >= NumSrcElts) {
  7276. // The extraction can just take the second half
  7277. Src.ShuffleVec =
  7278. DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
  7279. DAG.getConstant(NumSrcElts, dl, MVT::i32));
  7280. Src.WindowBase = -NumSrcElts;
  7281. } else if (Src.MaxElt < NumSrcElts) {
  7282. // The extraction can just take the first half
  7283. Src.ShuffleVec =
  7284. DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
  7285. DAG.getConstant(0, dl, MVT::i32));
  7286. } else {
  7287. // An actual VEXT is needed
  7288. SDValue VEXTSrc1 =
  7289. DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
  7290. DAG.getConstant(0, dl, MVT::i32));
  7291. SDValue VEXTSrc2 =
  7292. DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
  7293. DAG.getConstant(NumSrcElts, dl, MVT::i32));
  7294. Src.ShuffleVec = DAG.getNode(ARMISD::VEXT, dl, DestVT, VEXTSrc1,
  7295. VEXTSrc2,
  7296. DAG.getConstant(Src.MinElt, dl, MVT::i32));
  7297. Src.WindowBase = -Src.MinElt;
  7298. }
  7299. }
  7300. // Another possible incompatibility occurs from the vector element types. We
  7301. // can fix this by bitcasting the source vectors to the same type we intend
  7302. // for the shuffle.
  7303. for (auto &Src : Sources) {
  7304. EVT SrcEltTy = Src.ShuffleVec.getValueType().getVectorElementType();
  7305. if (SrcEltTy == SmallestEltTy)
  7306. continue;
  7307. assert(ShuffleVT.getVectorElementType() == SmallestEltTy);
  7308. Src.ShuffleVec = DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, ShuffleVT, Src.ShuffleVec);
  7309. Src.WindowScale = SrcEltTy.getSizeInBits() / SmallestEltTy.getSizeInBits();
  7310. Src.WindowBase *= Src.WindowScale;
  7311. }
  7312. // Final check before we try to actually produce a shuffle.
  7313. LLVM_DEBUG(for (auto Src
  7314. : Sources)
  7315. assert(Src.ShuffleVec.getValueType() == ShuffleVT););
7316. // The stars all align; our next step is to produce the mask for the shuffle.
  7317. SmallVector<int, 8> Mask(ShuffleVT.getVectorNumElements(), -1);
  7318. int BitsPerShuffleLane = ShuffleVT.getScalarSizeInBits();
  7319. for (unsigned i = 0; i < VT.getVectorNumElements(); ++i) {
  7320. SDValue Entry = Op.getOperand(i);
  7321. if (Entry.isUndef())
  7322. continue;
  7323. auto Src = llvm::find(Sources, Entry.getOperand(0));
  7324. int EltNo = cast<ConstantSDNode>(Entry.getOperand(1))->getSExtValue();
  7325. // EXTRACT_VECTOR_ELT performs an implicit any_ext; BUILD_VECTOR an implicit
  7326. // trunc. So only std::min(SrcBits, DestBits) actually get defined in this
  7327. // segment.
  7328. EVT OrigEltTy = Entry.getOperand(0).getValueType().getVectorElementType();
  7329. int BitsDefined = std::min(OrigEltTy.getScalarSizeInBits(),
  7330. VT.getScalarSizeInBits());
  7331. int LanesDefined = BitsDefined / BitsPerShuffleLane;
  7332. // This source is expected to fill ResMultiplier lanes of the final shuffle,
  7333. // starting at the appropriate offset.
  7334. int *LaneMask = &Mask[i * ResMultiplier];
  7335. int ExtractBase = EltNo * Src->WindowScale + Src->WindowBase;
  7336. ExtractBase += NumElts * (Src - Sources.begin());
  7337. for (int j = 0; j < LanesDefined; ++j)
  7338. LaneMask[j] = ExtractBase + j;
  7339. }
  7340. // We can't handle more than two sources. This should have already
  7341. // been checked before this point.
  7342. assert(Sources.size() <= 2 && "Too many sources!");
  7343. SDValue ShuffleOps[] = { DAG.getUNDEF(ShuffleVT), DAG.getUNDEF(ShuffleVT) };
  7344. for (unsigned i = 0; i < Sources.size(); ++i)
  7345. ShuffleOps[i] = Sources[i].ShuffleVec;
  7346. SDValue Shuffle = buildLegalVectorShuffle(ShuffleVT, dl, ShuffleOps[0],
  7347. ShuffleOps[1], Mask, DAG);
  7348. if (!Shuffle)
  7349. return SDValue();
  7350. return DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, VT, Shuffle);
  7351. }
  7352. enum ShuffleOpCodes {
  7353. OP_COPY = 0, // Copy, used for things like <u,u,u,3> to say it is <0,1,2,3>
  7354. OP_VREV,
  7355. OP_VDUP0,
  7356. OP_VDUP1,
  7357. OP_VDUP2,
  7358. OP_VDUP3,
  7359. OP_VEXT1,
  7360. OP_VEXT2,
  7361. OP_VEXT3,
  7362. OP_VUZPL, // VUZP, left result
  7363. OP_VUZPR, // VUZP, right result
  7364. OP_VZIPL, // VZIP, left result
  7365. OP_VZIPR, // VZIP, right result
  7366. OP_VTRNL, // VTRN, left result
  7367. OP_VTRNR // VTRN, right result
  7368. };
  7369. static bool isLegalMVEShuffleOp(unsigned PFEntry) {
  7370. unsigned OpNum = (PFEntry >> 26) & 0x0F;
  7371. switch (OpNum) {
  7372. case OP_COPY:
  7373. case OP_VREV:
  7374. case OP_VDUP0:
  7375. case OP_VDUP1:
  7376. case OP_VDUP2:
  7377. case OP_VDUP3:
  7378. return true;
  7379. }
  7380. return false;
  7381. }
  7382. /// isShuffleMaskLegal - Targets can use this to indicate that they only
  7383. /// support *some* VECTOR_SHUFFLE operations, those with specific masks.
  7384. /// By default, if a target supports the VECTOR_SHUFFLE node, all mask values
  7385. /// are assumed to be legal.
  7386. bool ARMTargetLowering::isShuffleMaskLegal(ArrayRef<int> M, EVT VT) const {
  7387. if (VT.getVectorNumElements() == 4 &&
  7388. (VT.is128BitVector() || VT.is64BitVector())) {
  7389. unsigned PFIndexes[4];
  7390. for (unsigned i = 0; i != 4; ++i) {
  7391. if (M[i] < 0)
  7392. PFIndexes[i] = 8;
  7393. else
  7394. PFIndexes[i] = M[i];
  7395. }
  7396. // Compute the index in the perfect shuffle table.
  7397. unsigned PFTableIndex =
  7398. PFIndexes[0]*9*9*9+PFIndexes[1]*9*9+PFIndexes[2]*9+PFIndexes[3];
  7399. unsigned PFEntry = PerfectShuffleTable[PFTableIndex];
  7400. unsigned Cost = (PFEntry >> 30);
  7401. if (Cost <= 4 && (Subtarget->hasNEON() || isLegalMVEShuffleOp(PFEntry)))
  7402. return true;
  7403. }
  7404. bool ReverseVEXT, isV_UNDEF;
  7405. unsigned Imm, WhichResult;
  7406. unsigned EltSize = VT.getScalarSizeInBits();
  7407. if (EltSize >= 32 ||
  7408. ShuffleVectorSDNode::isSplatMask(&M[0], VT) ||
  7409. ShuffleVectorInst::isIdentityMask(M) ||
  7410. isVREVMask(M, VT, 64) ||
  7411. isVREVMask(M, VT, 32) ||
  7412. isVREVMask(M, VT, 16))
  7413. return true;
  7414. else if (Subtarget->hasNEON() &&
  7415. (isVEXTMask(M, VT, ReverseVEXT, Imm) ||
  7416. isVTBLMask(M, VT) ||
  7417. isNEONTwoResultShuffleMask(M, VT, WhichResult, isV_UNDEF)))
  7418. return true;
  7419. else if ((VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v16i8) &&
  7420. isReverseMask(M, VT))
  7421. return true;
  7422. else if (Subtarget->hasMVEIntegerOps() &&
  7423. (isVMOVNMask(M, VT, true, false) ||
  7424. isVMOVNMask(M, VT, false, false) || isVMOVNMask(M, VT, true, true)))
  7425. return true;
  7426. else
  7427. return false;
  7428. }
  7429. /// GeneratePerfectShuffle - Given an entry in the perfect-shuffle table, emit
  7430. /// the specified operations to build the shuffle.
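/// Each PFEntry packs the cost into bits [31:30], the opcode (a ShuffleOpCodes
/// value) into bits [29:26], and the left/right operand table indices into
/// bits [25:13] and [12:0] respectively.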
  7431. static SDValue GeneratePerfectShuffle(unsigned PFEntry, SDValue LHS,
  7432. SDValue RHS, SelectionDAG &DAG,
  7433. const SDLoc &dl) {
  7434. unsigned OpNum = (PFEntry >> 26) & 0x0F;
  7435. unsigned LHSID = (PFEntry >> 13) & ((1 << 13)-1);
  7436. unsigned RHSID = (PFEntry >> 0) & ((1 << 13)-1);
  7437. if (OpNum == OP_COPY) {
  7438. if (LHSID == (1*9+2)*9+3) return LHS;
  7439. assert(LHSID == ((4*9+5)*9+6)*9+7 && "Illegal OP_COPY!");
  7440. return RHS;
  7441. }
  7442. SDValue OpLHS, OpRHS;
  7443. OpLHS = GeneratePerfectShuffle(PerfectShuffleTable[LHSID], LHS, RHS, DAG, dl);
  7444. OpRHS = GeneratePerfectShuffle(PerfectShuffleTable[RHSID], LHS, RHS, DAG, dl);
  7445. EVT VT = OpLHS.getValueType();
  7446. switch (OpNum) {
  7447. default: llvm_unreachable("Unknown shuffle opcode!");
  7448. case OP_VREV:
  7449. // VREV divides the vector in half and swaps within the half.
  7450. if (VT.getVectorElementType() == MVT::i32 ||
  7451. VT.getVectorElementType() == MVT::f32)
  7452. return DAG.getNode(ARMISD::VREV64, dl, VT, OpLHS);
  7453. // vrev <4 x i16> -> VREV32
  7454. if (VT.getVectorElementType() == MVT::i16 ||
  7455. VT.getVectorElementType() == MVT::f16)
  7456. return DAG.getNode(ARMISD::VREV32, dl, VT, OpLHS);
  7457. // vrev <4 x i8> -> VREV16
  7458. assert(VT.getVectorElementType() == MVT::i8);
  7459. return DAG.getNode(ARMISD::VREV16, dl, VT, OpLHS);
  7460. case OP_VDUP0:
  7461. case OP_VDUP1:
  7462. case OP_VDUP2:
  7463. case OP_VDUP3:
  7464. return DAG.getNode(ARMISD::VDUPLANE, dl, VT,
  7465. OpLHS, DAG.getConstant(OpNum-OP_VDUP0, dl, MVT::i32));
  7466. case OP_VEXT1:
  7467. case OP_VEXT2:
  7468. case OP_VEXT3:
  7469. return DAG.getNode(ARMISD::VEXT, dl, VT,
  7470. OpLHS, OpRHS,
  7471. DAG.getConstant(OpNum - OP_VEXT1 + 1, dl, MVT::i32));
  7472. case OP_VUZPL:
  7473. case OP_VUZPR:
  7474. return DAG.getNode(ARMISD::VUZP, dl, DAG.getVTList(VT, VT),
  7475. OpLHS, OpRHS).getValue(OpNum-OP_VUZPL);
  7476. case OP_VZIPL:
  7477. case OP_VZIPR:
  7478. return DAG.getNode(ARMISD::VZIP, dl, DAG.getVTList(VT, VT),
  7479. OpLHS, OpRHS).getValue(OpNum-OP_VZIPL);
  7480. case OP_VTRNL:
  7481. case OP_VTRNR:
  7482. return DAG.getNode(ARMISD::VTRN, dl, DAG.getVTList(VT, VT),
  7483. OpLHS, OpRHS).getValue(OpNum-OP_VTRNL);
  7484. }
  7485. }
  7486. static SDValue LowerVECTOR_SHUFFLEv8i8(SDValue Op,
  7487. ArrayRef<int> ShuffleMask,
  7488. SelectionDAG &DAG) {
  7489. // Check to see if we can use the VTBL instruction.
  7490. SDValue V1 = Op.getOperand(0);
  7491. SDValue V2 = Op.getOperand(1);
  7492. SDLoc DL(Op);
  7493. SmallVector<SDValue, 8> VTBLMask;
  7494. for (int I : ShuffleMask)
  7495. VTBLMask.push_back(DAG.getConstant(I, DL, MVT::i32));
  7496. if (V2.getNode()->isUndef())
  7497. return DAG.getNode(ARMISD::VTBL1, DL, MVT::v8i8, V1,
  7498. DAG.getBuildVector(MVT::v8i8, DL, VTBLMask));
  7499. return DAG.getNode(ARMISD::VTBL2, DL, MVT::v8i8, V1, V2,
  7500. DAG.getBuildVector(MVT::v8i8, DL, VTBLMask));
  7501. }
  7502. static SDValue LowerReverse_VECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) {
  7503. SDLoc DL(Op);
  7504. EVT VT = Op.getValueType();
  assert((VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v16i8) &&
         "Expect a v8i16, v8f16 or v16i8 type");
  7507. SDValue OpLHS = DAG.getNode(ARMISD::VREV64, DL, VT, Op.getOperand(0));
  7508. // For a v16i8 type: After the VREV, we have got <7, ..., 0, 15, ..., 8>. Now,
  7509. // extract the first 8 bytes into the top double word and the last 8 bytes
  7510. // into the bottom double word, through a new vector shuffle that will be
  7511. // turned into a VEXT on Neon, or a couple of VMOVDs on MVE.
  7512. std::vector<int> NewMask;
  7513. for (unsigned i = 0; i < VT.getVectorNumElements() / 2; i++)
  7514. NewMask.push_back(VT.getVectorNumElements() / 2 + i);
  7515. for (unsigned i = 0; i < VT.getVectorNumElements() / 2; i++)
  7516. NewMask.push_back(i);
  7517. return DAG.getVectorShuffle(VT, DL, OpLHS, OpLHS, NewMask);
  7518. }
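// Illustrative walk-through of LowerReverse_VECTOR_SHUFFLE for v8i16 (a sketch
// of the transformation above, not taken from a specific test): reversing
// <0,1,2,3,4,5,6,7> first applies VREV64, which reverses within each 64-bit
// half and yields <3,2,1,0,7,6,5,4>; the follow-up shuffle mask
// <4,5,6,7,0,1,2,3> then swaps the two doublewords, giving the fully reversed
// <7,6,5,4,3,2,1,0>.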
  7519. static EVT getVectorTyFromPredicateVector(EVT VT) {
  7520. switch (VT.getSimpleVT().SimpleTy) {
  7521. case MVT::v2i1:
  7522. return MVT::v2f64;
  7523. case MVT::v4i1:
  7524. return MVT::v4i32;
  7525. case MVT::v8i1:
  7526. return MVT::v8i16;
  7527. case MVT::v16i1:
  7528. return MVT::v16i8;
  7529. default:
  7530. llvm_unreachable("Unexpected vector predicate type");
  7531. }
  7532. }
  7533. static SDValue PromoteMVEPredVector(SDLoc dl, SDValue Pred, EVT VT,
  7534. SelectionDAG &DAG) {
  7535. // Converting from boolean predicates to integers involves creating a vector
  7536. // of all ones or all zeroes and selecting the lanes based upon the real
  7537. // predicate.
  7538. SDValue AllOnes =
  7539. DAG.getTargetConstant(ARM_AM::createVMOVModImm(0xe, 0xff), dl, MVT::i32);
  7540. AllOnes = DAG.getNode(ARMISD::VMOVIMM, dl, MVT::v16i8, AllOnes);
  7541. SDValue AllZeroes =
  7542. DAG.getTargetConstant(ARM_AM::createVMOVModImm(0xe, 0x0), dl, MVT::i32);
  7543. AllZeroes = DAG.getNode(ARMISD::VMOVIMM, dl, MVT::v16i8, AllZeroes);
  7544. // Get full vector type from predicate type
  7545. EVT NewVT = getVectorTyFromPredicateVector(VT);
  7546. SDValue RecastV1;
  // If the real predicate is a v8i1 or v4i1 (not v16i1) then we need to recast
  // this to a v16i1. This cannot be done with an ordinary bitcast because the
  // sizes are not the same. We have to use an MVE-specific PREDICATE_CAST node,
  // since we know in hardware the sizes are really the same.
  7551. if (VT != MVT::v16i1)
  7552. RecastV1 = DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::v16i1, Pred);
  7553. else
  7554. RecastV1 = Pred;
  7555. // Select either all ones or zeroes depending upon the real predicate bits.
  7556. SDValue PredAsVector =
  7557. DAG.getNode(ISD::VSELECT, dl, MVT::v16i8, RecastV1, AllOnes, AllZeroes);
  7558. // Recast our new predicate-as-integer v16i8 vector into something
  7559. // appropriate for the shuffle, i.e. v4i32 for a real v4i1 predicate.
  7560. return DAG.getNode(ISD::BITCAST, dl, NewVT, PredAsVector);
  7561. }
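// Illustrative example for PromoteMVEPredVector (a sketch, assuming the usual
// MVE predicate layout in which one v4i1 lane occupies four consecutive bits of
// the 16-bit predicate): promoting the v4i1 predicate {1,0,1,1} selects 0xff or
// 0x00 into each byte of the v16i8, i.e. ff ff ff ff 00 00 00 00 ff ff ff ff
// ff ff ff ff, which the final bitcast reinterprets (little-endian) as the
// v4i32 vector <-1, 0, -1, -1>.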
  7562. static SDValue LowerVECTOR_SHUFFLE_i1(SDValue Op, SelectionDAG &DAG,
  7563. const ARMSubtarget *ST) {
  7564. EVT VT = Op.getValueType();
  7565. ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op.getNode());
  7566. ArrayRef<int> ShuffleMask = SVN->getMask();
  7567. assert(ST->hasMVEIntegerOps() &&
  7568. "No support for vector shuffle of boolean predicates");
  7569. SDValue V1 = Op.getOperand(0);
  7570. SDLoc dl(Op);
  7571. if (isReverseMask(ShuffleMask, VT)) {
  7572. SDValue cast = DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::i32, V1);
  7573. SDValue rbit = DAG.getNode(ISD::BITREVERSE, dl, MVT::i32, cast);
  7574. SDValue srl = DAG.getNode(ISD::SRL, dl, MVT::i32, rbit,
  7575. DAG.getConstant(16, dl, MVT::i32));
  7576. return DAG.getNode(ARMISD::PREDICATE_CAST, dl, VT, srl);
  7577. }
  // Until we can come up with optimised cases for every single vector
  // shuffle in existence we have chosen the least painful strategy. This is
  // to essentially promote the boolean predicate to an 8-bit integer, where
  // each predicate lane becomes a byte. Then we fall back on a normal integer
  // vector shuffle and convert the result back into a predicate vector. In
  // many cases the generated code might be even better than scalar code
  // operating on bits. Just imagine trying to shuffle 8 arbitrary 2-bit
  // fields in a register into 8 other arbitrary 2-bit fields!
  7586. SDValue PredAsVector = PromoteMVEPredVector(dl, V1, VT, DAG);
  7587. EVT NewVT = PredAsVector.getValueType();
  7588. // Do the shuffle!
  7589. SDValue Shuffled = DAG.getVectorShuffle(NewVT, dl, PredAsVector,
  7590. DAG.getUNDEF(NewVT), ShuffleMask);
  7591. // Now return the result of comparing the shuffled vector with zero,
  7592. // which will generate a real predicate, i.e. v4i1, v8i1 or v16i1. For a v2i1
  7593. // we convert to a v4i1 compare to fill in the two halves of the i64 as i32s.
  7594. if (VT == MVT::v2i1) {
  7595. SDValue BC = DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, MVT::v4i32, Shuffled);
  7596. SDValue Cmp = DAG.getNode(ARMISD::VCMPZ, dl, MVT::v4i1, BC,
  7597. DAG.getConstant(ARMCC::NE, dl, MVT::i32));
  7598. return DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::v2i1, Cmp);
  7599. }
  7600. return DAG.getNode(ARMISD::VCMPZ, dl, VT, Shuffled,
  7601. DAG.getConstant(ARMCC::NE, dl, MVT::i32));
  7602. }
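// Illustrative note on the isReverseMask path above (a sketch, assuming the
// predicate sits in the low 16 bits of the i32 it is cast to): BITREVERSE
// reverses all 32 bits, which leaves the reversed predicate in the upper half
// of the register, and the SRL by 16 moves it back into the lower half with
// the lane order inverted.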
  7603. static SDValue LowerVECTOR_SHUFFLEUsingMovs(SDValue Op,
  7604. ArrayRef<int> ShuffleMask,
  7605. SelectionDAG &DAG) {
  // Attempt to lower the vector shuffle using as many whole-register moves as
  // possible. This is useful for types smaller than 32 bits, which would
  // otherwise often become a series of GPR moves.
  7609. SDLoc dl(Op);
  7610. EVT VT = Op.getValueType();
  7611. if (VT.getScalarSizeInBits() >= 32)
  7612. return SDValue();
  7613. assert((VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v16i8) &&
  7614. "Unexpected vector type");
  7615. int NumElts = VT.getVectorNumElements();
  7616. int QuarterSize = NumElts / 4;
  7617. // The four final parts of the vector, as i32's
  7618. SDValue Parts[4];
  // Look for full-lane vmovs like <0,1,2,3> or <u,5,6,7> etc. (but not
  // <u,u,u,u>), returning the vmov lane index.
  7621. auto getMovIdx = [](ArrayRef<int> ShuffleMask, int Start, int Length) {
  7622. // Detect which mov lane this would be from the first non-undef element.
  7623. int MovIdx = -1;
  7624. for (int i = 0; i < Length; i++) {
  7625. if (ShuffleMask[Start + i] >= 0) {
  7626. if (ShuffleMask[Start + i] % Length != i)
  7627. return -1;
  7628. MovIdx = ShuffleMask[Start + i] / Length;
  7629. break;
  7630. }
  7631. }
  7632. // If all items are undef, leave this for other combines
  7633. if (MovIdx == -1)
  7634. return -1;
  7635. // Check the remaining values are the correct part of the same mov
  7636. for (int i = 1; i < Length; i++) {
  7637. if (ShuffleMask[Start + i] >= 0 &&
  7638. (ShuffleMask[Start + i] / Length != MovIdx ||
  7639. ShuffleMask[Start + i] % Length != i))
  7640. return -1;
  7641. }
  7642. return MovIdx;
  7643. };
  7644. for (int Part = 0; Part < 4; ++Part) {
  7645. // Does this part look like a mov
  7646. int Elt = getMovIdx(ShuffleMask, Part * QuarterSize, QuarterSize);
  7647. if (Elt != -1) {
  7648. SDValue Input = Op->getOperand(0);
  7649. if (Elt >= 4) {
  7650. Input = Op->getOperand(1);
  7651. Elt -= 4;
  7652. }
  7653. SDValue BitCast = DAG.getBitcast(MVT::v4f32, Input);
  7654. Parts[Part] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, BitCast,
  7655. DAG.getConstant(Elt, dl, MVT::i32));
  7656. }
  7657. }
  7658. // Nothing interesting found, just return
  7659. if (!Parts[0] && !Parts[1] && !Parts[2] && !Parts[3])
  7660. return SDValue();
  7661. // The other parts need to be built with the old shuffle vector, cast to a
  7662. // v4i32 and extract_vector_elts
  7663. if (!Parts[0] || !Parts[1] || !Parts[2] || !Parts[3]) {
  7664. SmallVector<int, 16> NewShuffleMask;
  7665. for (int Part = 0; Part < 4; ++Part)
  7666. for (int i = 0; i < QuarterSize; i++)
  7667. NewShuffleMask.push_back(
  7668. Parts[Part] ? -1 : ShuffleMask[Part * QuarterSize + i]);
  7669. SDValue NewShuffle = DAG.getVectorShuffle(
  7670. VT, dl, Op->getOperand(0), Op->getOperand(1), NewShuffleMask);
  7671. SDValue BitCast = DAG.getBitcast(MVT::v4f32, NewShuffle);
  7672. for (int Part = 0; Part < 4; ++Part)
  7673. if (!Parts[Part])
  7674. Parts[Part] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32,
  7675. BitCast, DAG.getConstant(Part, dl, MVT::i32));
  7676. }
  7677. // Build a vector out of the various parts and bitcast it back to the original
  7678. // type.
  7679. SDValue NewVec = DAG.getNode(ARMISD::BUILD_VECTOR, dl, MVT::v4f32, Parts);
  7680. return DAG.getBitcast(VT, NewVec);
  7681. }
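// Illustrative example for LowerVECTOR_SHUFFLEUsingMovs with a hypothetical
// v8i16 mask (QuarterSize == 2): for <2,3, 10,11, u,5, 6,7> every quarter
// copies one whole 32-bit lane -- lane 1 of the first input, lane 1 of the
// second input (indices 10,11 map to combined lane 5, i.e. Elt 5 - 4), then
// lanes 2 and 3 of the first input -- so the shuffle becomes four f32 extracts
// from v4f32 bitcasts feeding an ARMISD::BUILD_VECTOR that is bitcast back to
// v8i16.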
  7682. static SDValue LowerVECTOR_SHUFFLEUsingOneOff(SDValue Op,
  7683. ArrayRef<int> ShuffleMask,
  7684. SelectionDAG &DAG) {
  7685. SDValue V1 = Op.getOperand(0);
  7686. SDValue V2 = Op.getOperand(1);
  7687. EVT VT = Op.getValueType();
  7688. unsigned NumElts = VT.getVectorNumElements();
  // A One-Off Identity mask is one that is mostly an identity mask from a
  // single source but contains a single element out-of-place, either from a
  // different vector or from another position in the same vector. Instead of
  // lowering this via an ARMISD::BUILD_VECTOR we can generate an extract/insert
  // pair directly.
  7694. auto isOneOffIdentityMask = [](ArrayRef<int> Mask, EVT VT, int BaseOffset,
  7695. int &OffElement) {
  7696. OffElement = -1;
  7697. int NonUndef = 0;
  7698. for (int i = 0, NumMaskElts = Mask.size(); i < NumMaskElts; ++i) {
  7699. if (Mask[i] == -1)
  7700. continue;
  7701. NonUndef++;
  7702. if (Mask[i] != i + BaseOffset) {
  7703. if (OffElement == -1)
  7704. OffElement = i;
  7705. else
  7706. return false;
  7707. }
  7708. }
  7709. return NonUndef > 2 && OffElement != -1;
  7710. };
  7711. int OffElement;
  7712. SDValue VInput;
  7713. if (isOneOffIdentityMask(ShuffleMask, VT, 0, OffElement))
  7714. VInput = V1;
  7715. else if (isOneOffIdentityMask(ShuffleMask, VT, NumElts, OffElement))
  7716. VInput = V2;
  7717. else
  7718. return SDValue();
  7719. SDLoc dl(Op);
  7720. EVT SVT = VT.getScalarType() == MVT::i8 || VT.getScalarType() == MVT::i16
  7721. ? MVT::i32
  7722. : VT.getScalarType();
  7723. SDValue Elt = DAG.getNode(
  7724. ISD::EXTRACT_VECTOR_ELT, dl, SVT,
  7725. ShuffleMask[OffElement] < (int)NumElts ? V1 : V2,
  7726. DAG.getVectorIdxConstant(ShuffleMask[OffElement] % NumElts, dl));
  7727. return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, VInput, Elt,
  7728. DAG.getVectorIdxConstant(OffElement % NumElts, dl));
  7729. }
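// Illustrative example for the one-off identity lowering above (hypothetical
// mask): a v4i32 shuffle of V1,V2 with mask <0,5,2,3> is the identity from V1
// except for element 1, which is lane 1 of V2 (index 5 - 4). It is therefore
// lowered as extract_vector_elt(V2, 1) followed by
// insert_vector_elt(V1, <that element>, 1) rather than a full BUILD_VECTOR.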
  7730. static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG,
  7731. const ARMSubtarget *ST) {
  7732. SDValue V1 = Op.getOperand(0);
  7733. SDValue V2 = Op.getOperand(1);
  7734. SDLoc dl(Op);
  7735. EVT VT = Op.getValueType();
  7736. ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op.getNode());
  7737. unsigned EltSize = VT.getScalarSizeInBits();
  7738. if (ST->hasMVEIntegerOps() && EltSize == 1)
  7739. return LowerVECTOR_SHUFFLE_i1(Op, DAG, ST);
  7740. // Convert shuffles that are directly supported on NEON to target-specific
  7741. // DAG nodes, instead of keeping them as shuffles and matching them again
  7742. // during code selection. This is more efficient and avoids the possibility
  7743. // of inconsistencies between legalization and selection.
  // FIXME: floating-point vectors should be canonicalized to integer vectors
  // of the same type so that they get CSEd properly.
  7746. ArrayRef<int> ShuffleMask = SVN->getMask();
  7747. if (EltSize <= 32) {
  7748. if (SVN->isSplat()) {
  7749. int Lane = SVN->getSplatIndex();
  7750. // If this is undef splat, generate it via "just" vdup, if possible.
  7751. if (Lane == -1) Lane = 0;
  7752. // Test if V1 is a SCALAR_TO_VECTOR.
  7753. if (Lane == 0 && V1.getOpcode() == ISD::SCALAR_TO_VECTOR) {
  7754. return DAG.getNode(ARMISD::VDUP, dl, VT, V1.getOperand(0));
  7755. }
  7756. // Test if V1 is a BUILD_VECTOR which is equivalent to a SCALAR_TO_VECTOR
  7757. // (and probably will turn into a SCALAR_TO_VECTOR once legalization
  7758. // reaches it).
  7759. if (Lane == 0 && V1.getOpcode() == ISD::BUILD_VECTOR &&
  7760. !isa<ConstantSDNode>(V1.getOperand(0))) {
  7761. bool IsScalarToVector = true;
  7762. for (unsigned i = 1, e = V1.getNumOperands(); i != e; ++i)
  7763. if (!V1.getOperand(i).isUndef()) {
  7764. IsScalarToVector = false;
  7765. break;
  7766. }
  7767. if (IsScalarToVector)
  7768. return DAG.getNode(ARMISD::VDUP, dl, VT, V1.getOperand(0));
  7769. }
  7770. return DAG.getNode(ARMISD::VDUPLANE, dl, VT, V1,
  7771. DAG.getConstant(Lane, dl, MVT::i32));
  7772. }
  7773. bool ReverseVEXT = false;
  7774. unsigned Imm = 0;
  7775. if (ST->hasNEON() && isVEXTMask(ShuffleMask, VT, ReverseVEXT, Imm)) {
  7776. if (ReverseVEXT)
  7777. std::swap(V1, V2);
  7778. return DAG.getNode(ARMISD::VEXT, dl, VT, V1, V2,
  7779. DAG.getConstant(Imm, dl, MVT::i32));
  7780. }
  7781. if (isVREVMask(ShuffleMask, VT, 64))
  7782. return DAG.getNode(ARMISD::VREV64, dl, VT, V1);
  7783. if (isVREVMask(ShuffleMask, VT, 32))
  7784. return DAG.getNode(ARMISD::VREV32, dl, VT, V1);
  7785. if (isVREVMask(ShuffleMask, VT, 16))
  7786. return DAG.getNode(ARMISD::VREV16, dl, VT, V1);
  7787. if (ST->hasNEON() && V2->isUndef() && isSingletonVEXTMask(ShuffleMask, VT, Imm)) {
  7788. return DAG.getNode(ARMISD::VEXT, dl, VT, V1, V1,
  7789. DAG.getConstant(Imm, dl, MVT::i32));
  7790. }
  7791. // Check for Neon shuffles that modify both input vectors in place.
  7792. // If both results are used, i.e., if there are two shuffles with the same
  7793. // source operands and with masks corresponding to both results of one of
  7794. // these operations, DAG memoization will ensure that a single node is
  7795. // used for both shuffles.
  7796. unsigned WhichResult = 0;
  7797. bool isV_UNDEF = false;
  7798. if (ST->hasNEON()) {
  7799. if (unsigned ShuffleOpc = isNEONTwoResultShuffleMask(
  7800. ShuffleMask, VT, WhichResult, isV_UNDEF)) {
  7801. if (isV_UNDEF)
  7802. V2 = V1;
  7803. return DAG.getNode(ShuffleOpc, dl, DAG.getVTList(VT, VT), V1, V2)
  7804. .getValue(WhichResult);
  7805. }
  7806. }
  7807. if (ST->hasMVEIntegerOps()) {
  7808. if (isVMOVNMask(ShuffleMask, VT, false, false))
  7809. return DAG.getNode(ARMISD::VMOVN, dl, VT, V2, V1,
  7810. DAG.getConstant(0, dl, MVT::i32));
  7811. if (isVMOVNMask(ShuffleMask, VT, true, false))
  7812. return DAG.getNode(ARMISD::VMOVN, dl, VT, V1, V2,
  7813. DAG.getConstant(1, dl, MVT::i32));
  7814. if (isVMOVNMask(ShuffleMask, VT, true, true))
  7815. return DAG.getNode(ARMISD::VMOVN, dl, VT, V1, V1,
  7816. DAG.getConstant(1, dl, MVT::i32));
  7817. }
  7818. // Also check for these shuffles through CONCAT_VECTORS: we canonicalize
  7819. // shuffles that produce a result larger than their operands with:
  7820. // shuffle(concat(v1, undef), concat(v2, undef))
  7821. // ->
  7822. // shuffle(concat(v1, v2), undef)
  7823. // because we can access quad vectors (see PerformVECTOR_SHUFFLECombine).
  7824. //
  7825. // This is useful in the general case, but there are special cases where
  7826. // native shuffles produce larger results: the two-result ops.
  7827. //
  7828. // Look through the concat when lowering them:
  7829. // shuffle(concat(v1, v2), undef)
  7830. // ->
  7831. // concat(VZIP(v1, v2):0, :1)
  7832. //
  7833. if (ST->hasNEON() && V1->getOpcode() == ISD::CONCAT_VECTORS && V2->isUndef()) {
  7834. SDValue SubV1 = V1->getOperand(0);
  7835. SDValue SubV2 = V1->getOperand(1);
  7836. EVT SubVT = SubV1.getValueType();
  7837. // We expect these to have been canonicalized to -1.
  7838. assert(llvm::all_of(ShuffleMask, [&](int i) {
  7839. return i < (int)VT.getVectorNumElements();
  7840. }) && "Unexpected shuffle index into UNDEF operand!");
  7841. if (unsigned ShuffleOpc = isNEONTwoResultShuffleMask(
  7842. ShuffleMask, SubVT, WhichResult, isV_UNDEF)) {
  7843. if (isV_UNDEF)
  7844. SubV2 = SubV1;
  7845. assert((WhichResult == 0) &&
  7846. "In-place shuffle of concat can only have one result!");
  7847. SDValue Res = DAG.getNode(ShuffleOpc, dl, DAG.getVTList(SubVT, SubVT),
  7848. SubV1, SubV2);
  7849. return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Res.getValue(0),
  7850. Res.getValue(1));
  7851. }
  7852. }
  7853. }
  7854. if (ST->hasMVEIntegerOps() && EltSize <= 32)
  7855. if (SDValue V = LowerVECTOR_SHUFFLEUsingOneOff(Op, ShuffleMask, DAG))
  7856. return V;
  7857. // If the shuffle is not directly supported and it has 4 elements, use
  7858. // the PerfectShuffle-generated table to synthesize it from other shuffles.
  7859. unsigned NumElts = VT.getVectorNumElements();
  7860. if (NumElts == 4) {
  7861. unsigned PFIndexes[4];
  7862. for (unsigned i = 0; i != 4; ++i) {
  7863. if (ShuffleMask[i] < 0)
  7864. PFIndexes[i] = 8;
  7865. else
  7866. PFIndexes[i] = ShuffleMask[i];
  7867. }
  7868. // Compute the index in the perfect shuffle table.
  7869. unsigned PFTableIndex =
  7870. PFIndexes[0]*9*9*9+PFIndexes[1]*9*9+PFIndexes[2]*9+PFIndexes[3];
  7871. unsigned PFEntry = PerfectShuffleTable[PFTableIndex];
  7872. unsigned Cost = (PFEntry >> 30);
  7873. if (Cost <= 4) {
  7874. if (ST->hasNEON())
  7875. return GeneratePerfectShuffle(PFEntry, V1, V2, DAG, dl);
  7876. else if (isLegalMVEShuffleOp(PFEntry)) {
  7877. unsigned LHSID = (PFEntry >> 13) & ((1 << 13)-1);
  7878. unsigned RHSID = (PFEntry >> 0) & ((1 << 13)-1);
  7879. unsigned PFEntryLHS = PerfectShuffleTable[LHSID];
  7880. unsigned PFEntryRHS = PerfectShuffleTable[RHSID];
  7881. if (isLegalMVEShuffleOp(PFEntryLHS) && isLegalMVEShuffleOp(PFEntryRHS))
  7882. return GeneratePerfectShuffle(PFEntry, V1, V2, DAG, dl);
  7883. }
  7884. }
  7885. }
  7886. // Implement shuffles with 32- or 64-bit elements as ARMISD::BUILD_VECTORs.
  7887. if (EltSize >= 32) {
  7888. // Do the expansion with floating-point types, since that is what the VFP
  7889. // registers are defined to use, and since i64 is not legal.
  7890. EVT EltVT = EVT::getFloatingPointVT(EltSize);
  7891. EVT VecVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumElts);
  7892. V1 = DAG.getNode(ISD::BITCAST, dl, VecVT, V1);
  7893. V2 = DAG.getNode(ISD::BITCAST, dl, VecVT, V2);
  7894. SmallVector<SDValue, 8> Ops;
  7895. for (unsigned i = 0; i < NumElts; ++i) {
  7896. if (ShuffleMask[i] < 0)
  7897. Ops.push_back(DAG.getUNDEF(EltVT));
  7898. else
  7899. Ops.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT,
  7900. ShuffleMask[i] < (int)NumElts ? V1 : V2,
  7901. DAG.getConstant(ShuffleMask[i] & (NumElts-1),
  7902. dl, MVT::i32)));
  7903. }
  7904. SDValue Val = DAG.getNode(ARMISD::BUILD_VECTOR, dl, VecVT, Ops);
  7905. return DAG.getNode(ISD::BITCAST, dl, VT, Val);
  7906. }
  7907. if ((VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v16i8) &&
  7908. isReverseMask(ShuffleMask, VT))
  7909. return LowerReverse_VECTOR_SHUFFLE(Op, DAG);
  7910. if (ST->hasNEON() && VT == MVT::v8i8)
  7911. if (SDValue NewOp = LowerVECTOR_SHUFFLEv8i8(Op, ShuffleMask, DAG))
  7912. return NewOp;
  7913. if (ST->hasMVEIntegerOps())
  7914. if (SDValue NewOp = LowerVECTOR_SHUFFLEUsingMovs(Op, ShuffleMask, DAG))
  7915. return NewOp;
  7916. return SDValue();
  7917. }
  7918. static SDValue LowerINSERT_VECTOR_ELT_i1(SDValue Op, SelectionDAG &DAG,
  7919. const ARMSubtarget *ST) {
  7920. EVT VecVT = Op.getOperand(0).getValueType();
  7921. SDLoc dl(Op);
  7922. assert(ST->hasMVEIntegerOps() &&
  7923. "LowerINSERT_VECTOR_ELT_i1 called without MVE!");
  7924. SDValue Conv =
  7925. DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::i32, Op->getOperand(0));
  7926. unsigned Lane = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
  7927. unsigned LaneWidth =
  7928. getVectorTyFromPredicateVector(VecVT).getScalarSizeInBits() / 8;
  7929. unsigned Mask = ((1 << LaneWidth) - 1) << Lane * LaneWidth;
  7930. SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::i32,
  7931. Op.getOperand(1), DAG.getValueType(MVT::i1));
  7932. SDValue BFI = DAG.getNode(ARMISD::BFI, dl, MVT::i32, Conv, Ext,
  7933. DAG.getConstant(~Mask, dl, MVT::i32));
  7934. return DAG.getNode(ARMISD::PREDICATE_CAST, dl, Op.getValueType(), BFI);
  7935. }
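// Illustrative example for LowerINSERT_VECTOR_ELT_i1 (a sketch, assuming the
// 16-bit MVE predicate layout implied by LaneWidth above): inserting into lane
// 2 of a v4i1 gives LaneWidth == 4, so Mask covers bits [11:8] of the predicate
// word. The i1 element is sign-extended to 0 or ~0 and the BFI merges it into
// exactly those four bits (the ~Mask constant selects the bits of the original
// word that are preserved) before the word is cast back to v4i1.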
  7936. SDValue ARMTargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
  7937. SelectionDAG &DAG) const {
  7938. // INSERT_VECTOR_ELT is legal only for immediate indexes.
  7939. SDValue Lane = Op.getOperand(2);
  7940. if (!isa<ConstantSDNode>(Lane))
  7941. return SDValue();
  7942. SDValue Elt = Op.getOperand(1);
  7943. EVT EltVT = Elt.getValueType();
  7944. if (Subtarget->hasMVEIntegerOps() &&
  7945. Op.getValueType().getScalarSizeInBits() == 1)
  7946. return LowerINSERT_VECTOR_ELT_i1(Op, DAG, Subtarget);
  7947. if (getTypeAction(*DAG.getContext(), EltVT) ==
  7948. TargetLowering::TypePromoteFloat) {
  7949. // INSERT_VECTOR_ELT doesn't want f16 operands promoting to f32,
  7950. // but the type system will try to do that if we don't intervene.
  7951. // Reinterpret any such vector-element insertion as one with the
  7952. // corresponding integer types.
  7953. SDLoc dl(Op);
  7954. EVT IEltVT = MVT::getIntegerVT(EltVT.getScalarSizeInBits());
  7955. assert(getTypeAction(*DAG.getContext(), IEltVT) !=
  7956. TargetLowering::TypePromoteFloat);
  7957. SDValue VecIn = Op.getOperand(0);
  7958. EVT VecVT = VecIn.getValueType();
  7959. EVT IVecVT = EVT::getVectorVT(*DAG.getContext(), IEltVT,
  7960. VecVT.getVectorNumElements());
  7961. SDValue IElt = DAG.getNode(ISD::BITCAST, dl, IEltVT, Elt);
  7962. SDValue IVecIn = DAG.getNode(ISD::BITCAST, dl, IVecVT, VecIn);
  7963. SDValue IVecOut = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, IVecVT,
  7964. IVecIn, IElt, Lane);
  7965. return DAG.getNode(ISD::BITCAST, dl, VecVT, IVecOut);
  7966. }
  7967. return Op;
  7968. }
  7969. static SDValue LowerEXTRACT_VECTOR_ELT_i1(SDValue Op, SelectionDAG &DAG,
  7970. const ARMSubtarget *ST) {
  7971. EVT VecVT = Op.getOperand(0).getValueType();
  7972. SDLoc dl(Op);
  assert(ST->hasMVEIntegerOps() &&
         "LowerEXTRACT_VECTOR_ELT_i1 called without MVE!");
  7975. SDValue Conv =
  7976. DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::i32, Op->getOperand(0));
  7977. unsigned Lane = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
  7978. unsigned LaneWidth =
  7979. getVectorTyFromPredicateVector(VecVT).getScalarSizeInBits() / 8;
  7980. SDValue Shift = DAG.getNode(ISD::SRL, dl, MVT::i32, Conv,
  7981. DAG.getConstant(Lane * LaneWidth, dl, MVT::i32));
  7982. return Shift;
  7983. }
  7984. static SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG,
  7985. const ARMSubtarget *ST) {
  7986. // EXTRACT_VECTOR_ELT is legal only for immediate indexes.
  7987. SDValue Lane = Op.getOperand(1);
  7988. if (!isa<ConstantSDNode>(Lane))
  7989. return SDValue();
  7990. SDValue Vec = Op.getOperand(0);
  7991. EVT VT = Vec.getValueType();
  7992. if (ST->hasMVEIntegerOps() && VT.getScalarSizeInBits() == 1)
  7993. return LowerEXTRACT_VECTOR_ELT_i1(Op, DAG, ST);
  7994. if (Op.getValueType() == MVT::i32 && Vec.getScalarValueSizeInBits() < 32) {
  7995. SDLoc dl(Op);
  7996. return DAG.getNode(ARMISD::VGETLANEu, dl, MVT::i32, Vec, Lane);
  7997. }
  7998. return Op;
  7999. }
  8000. static SDValue LowerCONCAT_VECTORS_i1(SDValue Op, SelectionDAG &DAG,
  8001. const ARMSubtarget *ST) {
  8002. SDLoc dl(Op);
  8003. assert(Op.getValueType().getScalarSizeInBits() == 1 &&
  8004. "Unexpected custom CONCAT_VECTORS lowering");
  8005. assert(isPowerOf2_32(Op.getNumOperands()) &&
  8006. "Unexpected custom CONCAT_VECTORS lowering");
  8007. assert(ST->hasMVEIntegerOps() &&
  8008. "CONCAT_VECTORS lowering only supported for MVE");
  8009. auto ConcatPair = [&](SDValue V1, SDValue V2) {
  8010. EVT Op1VT = V1.getValueType();
  8011. EVT Op2VT = V2.getValueType();
  8012. assert(Op1VT == Op2VT && "Operand types don't match!");
  8013. EVT VT = Op1VT.getDoubleNumVectorElementsVT(*DAG.getContext());
  8014. SDValue NewV1 = PromoteMVEPredVector(dl, V1, Op1VT, DAG);
  8015. SDValue NewV2 = PromoteMVEPredVector(dl, V2, Op2VT, DAG);
  8016. // We now have Op1 + Op2 promoted to vectors of integers, where v8i1 gets
  8017. // promoted to v8i16, etc.
  8018. MVT ElType =
  8019. getVectorTyFromPredicateVector(VT).getScalarType().getSimpleVT();
  8020. unsigned NumElts = 2 * Op1VT.getVectorNumElements();
    // Extract the vector elements from Op1 and Op2 one by one and truncate them
    // to be the right size for the destination. For example, if Op1 is v4i1
    // then the promoted vector is v4i32. The result of concatenation gives a
    // v8i1, which when promoted is v8i16. That means each i32 element from Op1
    // needs truncating to i16 and inserting into the result.
  8026. EVT ConcatVT = MVT::getVectorVT(ElType, NumElts);
  8027. SDValue ConVec = DAG.getNode(ISD::UNDEF, dl, ConcatVT);
  8028. auto ExtractInto = [&DAG, &dl](SDValue NewV, SDValue ConVec, unsigned &j) {
  8029. EVT NewVT = NewV.getValueType();
  8030. EVT ConcatVT = ConVec.getValueType();
  8031. for (unsigned i = 0, e = NewVT.getVectorNumElements(); i < e; i++, j++) {
  8032. SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, NewV,
  8033. DAG.getIntPtrConstant(i, dl));
  8034. ConVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ConcatVT, ConVec, Elt,
  8035. DAG.getConstant(j, dl, MVT::i32));
  8036. }
  8037. return ConVec;
  8038. };
  8039. unsigned j = 0;
  8040. ConVec = ExtractInto(NewV1, ConVec, j);
  8041. ConVec = ExtractInto(NewV2, ConVec, j);
  8042. // Now return the result of comparing the subvector with zero, which will
  8043. // generate a real predicate, i.e. v4i1, v8i1 or v16i1. For a v2i1 we
  8044. // convert to a v4i1 compare to fill in the two halves of the i64 as i32s.
  8045. if (VT == MVT::v2i1) {
  8046. SDValue BC = DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, MVT::v4i32, ConVec);
  8047. SDValue Cmp = DAG.getNode(ARMISD::VCMPZ, dl, MVT::v4i1, BC,
  8048. DAG.getConstant(ARMCC::NE, dl, MVT::i32));
  8049. return DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::v2i1, Cmp);
  8050. }
  8051. return DAG.getNode(ARMISD::VCMPZ, dl, VT, ConVec,
  8052. DAG.getConstant(ARMCC::NE, dl, MVT::i32));
  8053. };
  8054. // Concat each pair of subvectors and pack into the lower half of the array.
  8055. SmallVector<SDValue> ConcatOps(Op->op_begin(), Op->op_end());
  8056. while (ConcatOps.size() > 1) {
  8057. for (unsigned I = 0, E = ConcatOps.size(); I != E; I += 2) {
  8058. SDValue V1 = ConcatOps[I];
  8059. SDValue V2 = ConcatOps[I + 1];
  8060. ConcatOps[I / 2] = ConcatPair(V1, V2);
  8061. }
  8062. ConcatOps.resize(ConcatOps.size() / 2);
  8063. }
  8064. return ConcatOps[0];
  8065. }
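// Illustrative example for ConcatPair above (a sketch of that code path):
// concatenating two v4i1 predicates produces a v8i1, whose promoted form is
// v8i16. Each operand is first promoted to a v4i32 of 0/-1 values; the eight
// i32 elements are then extracted and inserted into a v8i16 (implicitly
// truncated to i16), and the final VCMPZ against zero with ARMCC::NE turns that
// vector back into a real v8i1 predicate.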
  8066. static SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG,
  8067. const ARMSubtarget *ST) {
  8068. EVT VT = Op->getValueType(0);
  8069. if (ST->hasMVEIntegerOps() && VT.getScalarSizeInBits() == 1)
  8070. return LowerCONCAT_VECTORS_i1(Op, DAG, ST);
  8071. // The only time a CONCAT_VECTORS operation can have legal types is when
  8072. // two 64-bit vectors are concatenated to a 128-bit vector.
  8073. assert(Op.getValueType().is128BitVector() && Op.getNumOperands() == 2 &&
  8074. "unexpected CONCAT_VECTORS");
  8075. SDLoc dl(Op);
  8076. SDValue Val = DAG.getUNDEF(MVT::v2f64);
  8077. SDValue Op0 = Op.getOperand(0);
  8078. SDValue Op1 = Op.getOperand(1);
  8079. if (!Op0.isUndef())
  8080. Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Val,
  8081. DAG.getNode(ISD::BITCAST, dl, MVT::f64, Op0),
  8082. DAG.getIntPtrConstant(0, dl));
  8083. if (!Op1.isUndef())
  8084. Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Val,
  8085. DAG.getNode(ISD::BITCAST, dl, MVT::f64, Op1),
  8086. DAG.getIntPtrConstant(1, dl));
  8087. return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Val);
  8088. }
  8089. static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, SelectionDAG &DAG,
  8090. const ARMSubtarget *ST) {
  8091. SDValue V1 = Op.getOperand(0);
  8092. SDValue V2 = Op.getOperand(1);
  8093. SDLoc dl(Op);
  8094. EVT VT = Op.getValueType();
  8095. EVT Op1VT = V1.getValueType();
  8096. unsigned NumElts = VT.getVectorNumElements();
  8097. unsigned Index = cast<ConstantSDNode>(V2)->getZExtValue();
  8098. assert(VT.getScalarSizeInBits() == 1 &&
  8099. "Unexpected custom EXTRACT_SUBVECTOR lowering");
  8100. assert(ST->hasMVEIntegerOps() &&
  8101. "EXTRACT_SUBVECTOR lowering only supported for MVE");
  8102. SDValue NewV1 = PromoteMVEPredVector(dl, V1, Op1VT, DAG);
  8103. // We now have Op1 promoted to a vector of integers, where v8i1 gets
  8104. // promoted to v8i16, etc.
  8105. MVT ElType = getVectorTyFromPredicateVector(VT).getScalarType().getSimpleVT();
  8106. if (NumElts == 2) {
  8107. EVT SubVT = MVT::v4i32;
  8108. SDValue SubVec = DAG.getNode(ISD::UNDEF, dl, SubVT);
  8109. for (unsigned i = Index, j = 0; i < (Index + NumElts); i++, j += 2) {
  8110. SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, NewV1,
  8111. DAG.getIntPtrConstant(i, dl));
  8112. SubVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, SubVT, SubVec, Elt,
  8113. DAG.getConstant(j, dl, MVT::i32));
  8114. SubVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, SubVT, SubVec, Elt,
  8115. DAG.getConstant(j + 1, dl, MVT::i32));
  8116. }
  8117. SDValue Cmp = DAG.getNode(ARMISD::VCMPZ, dl, MVT::v4i1, SubVec,
  8118. DAG.getConstant(ARMCC::NE, dl, MVT::i32));
  8119. return DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::v2i1, Cmp);
  8120. }
  8121. EVT SubVT = MVT::getVectorVT(ElType, NumElts);
  8122. SDValue SubVec = DAG.getNode(ISD::UNDEF, dl, SubVT);
  8123. for (unsigned i = Index, j = 0; i < (Index + NumElts); i++, j++) {
  8124. SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, NewV1,
  8125. DAG.getIntPtrConstant(i, dl));
  8126. SubVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, SubVT, SubVec, Elt,
  8127. DAG.getConstant(j, dl, MVT::i32));
  8128. }
  8129. // Now return the result of comparing the subvector with zero,
  8130. // which will generate a real predicate, i.e. v4i1, v8i1 or v16i1.
  8131. return DAG.getNode(ARMISD::VCMPZ, dl, VT, SubVec,
  8132. DAG.getConstant(ARMCC::NE, dl, MVT::i32));
  8133. }
// Turn a truncate to a predicate (an i1 vector) into icmp ne (and(x, 1), 0).
  8135. static SDValue LowerTruncatei1(SDNode *N, SelectionDAG &DAG,
  8136. const ARMSubtarget *ST) {
  8137. assert(ST->hasMVEIntegerOps() && "Expected MVE!");
  8138. EVT VT = N->getValueType(0);
  8139. assert((VT == MVT::v16i1 || VT == MVT::v8i1 || VT == MVT::v4i1) &&
  8140. "Expected a vector i1 type!");
  8141. SDValue Op = N->getOperand(0);
  8142. EVT FromVT = Op.getValueType();
  8143. SDLoc DL(N);
  8144. SDValue And =
  8145. DAG.getNode(ISD::AND, DL, FromVT, Op, DAG.getConstant(1, DL, FromVT));
  8146. return DAG.getNode(ISD::SETCC, DL, VT, And, DAG.getConstant(0, DL, FromVT),
  8147. DAG.getCondCode(ISD::SETNE));
  8148. }
  8149. static SDValue LowerTruncate(SDNode *N, SelectionDAG &DAG,
  8150. const ARMSubtarget *Subtarget) {
  8151. if (!Subtarget->hasMVEIntegerOps())
  8152. return SDValue();
  8153. EVT ToVT = N->getValueType(0);
  8154. if (ToVT.getScalarType() == MVT::i1)
  8155. return LowerTruncatei1(N, DAG, Subtarget);
  8156. // MVE does not have a single instruction to perform the truncation of a v4i32
  8157. // into the lower half of a v8i16, in the same way that a NEON vmovn would.
  8158. // Most of the instructions in MVE follow the 'Beats' system, where moving
  8159. // values from different lanes is usually something that the instructions
  8160. // avoid.
  8161. //
  8162. // Instead it has top/bottom instructions such as VMOVLT/B and VMOVNT/B,
  // which take the top/bottom half of a larger lane and extend it (or do the
  // opposite, truncating into the top/bottom lane from a larger lane). Note
  // that because of the way we widen lanes, a v4i16 is really a v4i32 using the
  // bottom 16 bits of each vector lane. This works really well with T/B
  // instructions, but that doesn't extend to v8i32->v8i16, where the lanes need
  // to change order.
  8169. //
  8170. // But truncates and sext/zext are always going to be fairly common from llvm.
  8171. // We have several options for how to deal with them:
  8172. // - Wherever possible combine them into an instruction that makes them
  8173. // "free". This includes loads/stores, which can perform the trunc as part
  8174. // of the memory operation. Or certain shuffles that can be turned into
  8175. // VMOVN/VMOVL.
  8176. // - Lane Interleaving to transform blocks surrounded by ext/trunc. So
  8177. // trunc(mul(sext(a), sext(b))) may become
  8178. // VMOVNT(VMUL(VMOVLB(a), VMOVLB(b)), VMUL(VMOVLT(a), VMOVLT(b))). (Which in
  8179. // this case can use VMULL). This is performed in the
  8180. // MVELaneInterleavingPass.
  8181. // - Otherwise we have an option. By default we would expand the
  8182. // zext/sext/trunc into a series of lane extract/inserts going via GPR
  8183. // registers. One for each vector lane in the vector. This can obviously be
  8184. // very expensive.
  8185. // - The other option is to use the fact that loads/store can extend/truncate
  8186. // to turn a trunc into two truncating stack stores and a stack reload. This
  8187. // becomes 3 back-to-back memory operations, but at least that is less than
  8188. // all the insert/extracts.
  8189. //
  // In order to do the last, we convert certain truncs into MVETRUNC, which
  // are either optimized where they can be, or eventually lowered into stack
  // stores/loads. This prevents us from splitting a v8i16 trunc into two stores
  // too early, where other instructions would be better, and stops us from
  // having to reconstruct multiple buildvector shuffles into loads/stores.
  8195. if (ToVT != MVT::v8i16 && ToVT != MVT::v16i8)
  8196. return SDValue();
  8197. EVT FromVT = N->getOperand(0).getValueType();
  8198. if (FromVT != MVT::v8i32 && FromVT != MVT::v16i16)
  8199. return SDValue();
  8200. SDValue Lo, Hi;
  8201. std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 0);
  8202. SDLoc DL(N);
  8203. return DAG.getNode(ARMISD::MVETRUNC, DL, ToVT, Lo, Hi);
  8204. }
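// Illustrative example for LowerTruncate above (a sketch of the MVETRUNC path):
// truncating a v8i32 to v8i16 splits the source into two v4i32 halves and emits
// ARMISD::MVETRUNC(Lo, Hi). Later combines either fold that node into
// surrounding VMOVN/load/store patterns or, failing that, expand it into two
// truncating stack stores plus a reload, which is still cheaper than eight GPR
// extract/insert round trips.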
  8205. static SDValue LowerVectorExtend(SDNode *N, SelectionDAG &DAG,
  8206. const ARMSubtarget *Subtarget) {
  8207. if (!Subtarget->hasMVEIntegerOps())
  8208. return SDValue();
  8209. // See LowerTruncate above for an explanation of MVEEXT/MVETRUNC.
  8210. EVT ToVT = N->getValueType(0);
  8211. if (ToVT != MVT::v16i32 && ToVT != MVT::v8i32 && ToVT != MVT::v16i16)
  8212. return SDValue();
  8213. SDValue Op = N->getOperand(0);
  8214. EVT FromVT = Op.getValueType();
  8215. if (FromVT != MVT::v8i16 && FromVT != MVT::v16i8)
  8216. return SDValue();
  8217. SDLoc DL(N);
  8218. EVT ExtVT = ToVT.getHalfNumVectorElementsVT(*DAG.getContext());
  8219. if (ToVT.getScalarType() == MVT::i32 && FromVT.getScalarType() == MVT::i8)
  8220. ExtVT = MVT::v8i16;
  8221. unsigned Opcode =
  8222. N->getOpcode() == ISD::SIGN_EXTEND ? ARMISD::MVESEXT : ARMISD::MVEZEXT;
  8223. SDValue Ext = DAG.getNode(Opcode, DL, DAG.getVTList(ExtVT, ExtVT), Op);
  8224. SDValue Ext1 = Ext.getValue(1);
  8225. if (ToVT.getScalarType() == MVT::i32 && FromVT.getScalarType() == MVT::i8) {
  8226. Ext = DAG.getNode(N->getOpcode(), DL, MVT::v8i32, Ext);
  8227. Ext1 = DAG.getNode(N->getOpcode(), DL, MVT::v8i32, Ext1);
  8228. }
  8229. return DAG.getNode(ISD::CONCAT_VECTORS, DL, ToVT, Ext, Ext1);
  8230. }
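// Illustrative example for LowerVectorExtend above (a sketch): extending v8i16
// to v8i32 emits one ARMISD::MVESEXT/MVEZEXT node with two v4i32 results that
// are concatenated back together. For v16i8 -> v16i32 the node first produces
// two v8i16 halves, each of which is extended again to v8i32 with the original
// sext/zext before the final CONCAT_VECTORS.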
  8231. /// isExtendedBUILD_VECTOR - Check if N is a constant BUILD_VECTOR where each
  8232. /// element has been zero/sign-extended, depending on the isSigned parameter,
  8233. /// from an integer type half its size.
  8234. static bool isExtendedBUILD_VECTOR(SDNode *N, SelectionDAG &DAG,
  8235. bool isSigned) {
  8236. // A v2i64 BUILD_VECTOR will have been legalized to a BITCAST from v4i32.
  8237. EVT VT = N->getValueType(0);
  8238. if (VT == MVT::v2i64 && N->getOpcode() == ISD::BITCAST) {
  8239. SDNode *BVN = N->getOperand(0).getNode();
  8240. if (BVN->getValueType(0) != MVT::v4i32 ||
  8241. BVN->getOpcode() != ISD::BUILD_VECTOR)
  8242. return false;
  8243. unsigned LoElt = DAG.getDataLayout().isBigEndian() ? 1 : 0;
  8244. unsigned HiElt = 1 - LoElt;
  8245. ConstantSDNode *Lo0 = dyn_cast<ConstantSDNode>(BVN->getOperand(LoElt));
  8246. ConstantSDNode *Hi0 = dyn_cast<ConstantSDNode>(BVN->getOperand(HiElt));
  8247. ConstantSDNode *Lo1 = dyn_cast<ConstantSDNode>(BVN->getOperand(LoElt+2));
  8248. ConstantSDNode *Hi1 = dyn_cast<ConstantSDNode>(BVN->getOperand(HiElt+2));
  8249. if (!Lo0 || !Hi0 || !Lo1 || !Hi1)
  8250. return false;
  8251. if (isSigned) {
  8252. if (Hi0->getSExtValue() == Lo0->getSExtValue() >> 32 &&
  8253. Hi1->getSExtValue() == Lo1->getSExtValue() >> 32)
  8254. return true;
  8255. } else {
  8256. if (Hi0->isZero() && Hi1->isZero())
  8257. return true;
  8258. }
  8259. return false;
  8260. }
  8261. if (N->getOpcode() != ISD::BUILD_VECTOR)
  8262. return false;
  8263. for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) {
  8264. SDNode *Elt = N->getOperand(i).getNode();
  8265. if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Elt)) {
  8266. unsigned EltSize = VT.getScalarSizeInBits();
  8267. unsigned HalfSize = EltSize / 2;
  8268. if (isSigned) {
  8269. if (!isIntN(HalfSize, C->getSExtValue()))
  8270. return false;
  8271. } else {
  8272. if (!isUIntN(HalfSize, C->getZExtValue()))
  8273. return false;
  8274. }
  8275. continue;
  8276. }
  8277. return false;
  8278. }
  8279. return true;
  8280. }
  8281. /// isSignExtended - Check if a node is a vector value that is sign-extended
  8282. /// or a constant BUILD_VECTOR with sign-extended elements.
  8283. static bool isSignExtended(SDNode *N, SelectionDAG &DAG) {
  8284. if (N->getOpcode() == ISD::SIGN_EXTEND || ISD::isSEXTLoad(N))
  8285. return true;
  8286. if (isExtendedBUILD_VECTOR(N, DAG, true))
  8287. return true;
  8288. return false;
  8289. }
  8290. /// isZeroExtended - Check if a node is a vector value that is zero-extended (or
  8291. /// any-extended) or a constant BUILD_VECTOR with zero-extended elements.
  8292. static bool isZeroExtended(SDNode *N, SelectionDAG &DAG) {
  8293. if (N->getOpcode() == ISD::ZERO_EXTEND || N->getOpcode() == ISD::ANY_EXTEND ||
  8294. ISD::isZEXTLoad(N))
  8295. return true;
  8296. if (isExtendedBUILD_VECTOR(N, DAG, false))
  8297. return true;
  8298. return false;
  8299. }
  8300. static EVT getExtensionTo64Bits(const EVT &OrigVT) {
  8301. if (OrigVT.getSizeInBits() >= 64)
  8302. return OrigVT;
  8303. assert(OrigVT.isSimple() && "Expecting a simple value type");
  8304. MVT::SimpleValueType OrigSimpleTy = OrigVT.getSimpleVT().SimpleTy;
  8305. switch (OrigSimpleTy) {
  8306. default: llvm_unreachable("Unexpected Vector Type");
  8307. case MVT::v2i8:
  8308. case MVT::v2i16:
  8309. return MVT::v2i32;
  8310. case MVT::v4i8:
  8311. return MVT::v4i16;
  8312. }
  8313. }
  8314. /// AddRequiredExtensionForVMULL - Add a sign/zero extension to extend the total
  8315. /// value size to 64 bits. We need a 64-bit D register as an operand to VMULL.
  8316. /// We insert the required extension here to get the vector to fill a D register.
  8317. static SDValue AddRequiredExtensionForVMULL(SDValue N, SelectionDAG &DAG,
  8318. const EVT &OrigTy,
  8319. const EVT &ExtTy,
  8320. unsigned ExtOpcode) {
  8321. // The vector originally had a size of OrigTy. It was then extended to ExtTy.
  8322. // We expect the ExtTy to be 128-bits total. If the OrigTy is less than
  8323. // 64-bits we need to insert a new extension so that it will be 64-bits.
  8324. assert(ExtTy.is128BitVector() && "Unexpected extension size");
  8325. if (OrigTy.getSizeInBits() >= 64)
  8326. return N;
  8327. // Must extend size to at least 64 bits to be used as an operand for VMULL.
  8328. EVT NewVT = getExtensionTo64Bits(OrigTy);
  8329. return DAG.getNode(ExtOpcode, SDLoc(N), NewVT, N);
  8330. }
  8331. /// SkipLoadExtensionForVMULL - return a load of the original vector size that
  8332. /// does not do any sign/zero extension. If the original vector is less
  8333. /// than 64 bits, an appropriate extension will be added after the load to
  8334. /// reach a total size of 64 bits. We have to add the extension separately
  8335. /// because ARM does not have a sign/zero extending load for vectors.
  8336. static SDValue SkipLoadExtensionForVMULL(LoadSDNode *LD, SelectionDAG& DAG) {
  8337. EVT ExtendedTy = getExtensionTo64Bits(LD->getMemoryVT());
  8338. // The load already has the right type.
  8339. if (ExtendedTy == LD->getMemoryVT())
  8340. return DAG.getLoad(LD->getMemoryVT(), SDLoc(LD), LD->getChain(),
  8341. LD->getBasePtr(), LD->getPointerInfo(),
  8342. LD->getAlignment(), LD->getMemOperand()->getFlags());
  // We need to create a zextload/sextload. We cannot just create a load
  // followed by a sext/zext node because LowerMUL is also run during normal
  // operation legalization where we can't create illegal types.
  8346. return DAG.getExtLoad(LD->getExtensionType(), SDLoc(LD), ExtendedTy,
  8347. LD->getChain(), LD->getBasePtr(), LD->getPointerInfo(),
  8348. LD->getMemoryVT(), LD->getAlignment(),
  8349. LD->getMemOperand()->getFlags());
  8350. }
  8351. /// SkipExtensionForVMULL - For a node that is a SIGN_EXTEND, ZERO_EXTEND,
  8352. /// ANY_EXTEND, extending load, or BUILD_VECTOR with extended elements, return
  8353. /// the unextended value. The unextended vector should be 64 bits so that it can
  8354. /// be used as an operand to a VMULL instruction. If the original vector size
/// before extension is less than 64 bits we add an extension to resize
  8356. /// the vector to 64 bits.
  8357. static SDValue SkipExtensionForVMULL(SDNode *N, SelectionDAG &DAG) {
  8358. if (N->getOpcode() == ISD::SIGN_EXTEND ||
  8359. N->getOpcode() == ISD::ZERO_EXTEND || N->getOpcode() == ISD::ANY_EXTEND)
  8360. return AddRequiredExtensionForVMULL(N->getOperand(0), DAG,
  8361. N->getOperand(0)->getValueType(0),
  8362. N->getValueType(0),
  8363. N->getOpcode());
  8364. if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
  8365. assert((ISD::isSEXTLoad(LD) || ISD::isZEXTLoad(LD)) &&
  8366. "Expected extending load");
  8367. SDValue newLoad = SkipLoadExtensionForVMULL(LD, DAG);
  8368. DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), newLoad.getValue(1));
  8369. unsigned Opcode = ISD::isSEXTLoad(LD) ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
  8370. SDValue extLoad =
  8371. DAG.getNode(Opcode, SDLoc(newLoad), LD->getValueType(0), newLoad);
  8372. DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 0), extLoad);
  8373. return newLoad;
  8374. }
  8375. // Otherwise, the value must be a BUILD_VECTOR. For v2i64, it will
  8376. // have been legalized as a BITCAST from v4i32.
  8377. if (N->getOpcode() == ISD::BITCAST) {
  8378. SDNode *BVN = N->getOperand(0).getNode();
  8379. assert(BVN->getOpcode() == ISD::BUILD_VECTOR &&
  8380. BVN->getValueType(0) == MVT::v4i32 && "expected v4i32 BUILD_VECTOR");
  8381. unsigned LowElt = DAG.getDataLayout().isBigEndian() ? 1 : 0;
  8382. return DAG.getBuildVector(
  8383. MVT::v2i32, SDLoc(N),
  8384. {BVN->getOperand(LowElt), BVN->getOperand(LowElt + 2)});
  8385. }
  8386. // Construct a new BUILD_VECTOR with elements truncated to half the size.
  8387. assert(N->getOpcode() == ISD::BUILD_VECTOR && "expected BUILD_VECTOR");
  8388. EVT VT = N->getValueType(0);
  8389. unsigned EltSize = VT.getScalarSizeInBits() / 2;
  8390. unsigned NumElts = VT.getVectorNumElements();
  8391. MVT TruncVT = MVT::getIntegerVT(EltSize);
  8392. SmallVector<SDValue, 8> Ops;
  8393. SDLoc dl(N);
  8394. for (unsigned i = 0; i != NumElts; ++i) {
  8395. ConstantSDNode *C = cast<ConstantSDNode>(N->getOperand(i));
  8396. const APInt &CInt = C->getAPIntValue();
  8397. // Element types smaller than 32 bits are not legal, so use i32 elements.
  8398. // The values are implicitly truncated so sext vs. zext doesn't matter.
  8399. Ops.push_back(DAG.getConstant(CInt.zextOrTrunc(32), dl, MVT::i32));
  8400. }
  8401. return DAG.getBuildVector(MVT::getVectorVT(TruncVT, NumElts), dl, Ops);
  8402. }
  8403. static bool isAddSubSExt(SDNode *N, SelectionDAG &DAG) {
  8404. unsigned Opcode = N->getOpcode();
  8405. if (Opcode == ISD::ADD || Opcode == ISD::SUB) {
  8406. SDNode *N0 = N->getOperand(0).getNode();
  8407. SDNode *N1 = N->getOperand(1).getNode();
  8408. return N0->hasOneUse() && N1->hasOneUse() &&
  8409. isSignExtended(N0, DAG) && isSignExtended(N1, DAG);
  8410. }
  8411. return false;
  8412. }
  8413. static bool isAddSubZExt(SDNode *N, SelectionDAG &DAG) {
  8414. unsigned Opcode = N->getOpcode();
  8415. if (Opcode == ISD::ADD || Opcode == ISD::SUB) {
  8416. SDNode *N0 = N->getOperand(0).getNode();
  8417. SDNode *N1 = N->getOperand(1).getNode();
  8418. return N0->hasOneUse() && N1->hasOneUse() &&
  8419. isZeroExtended(N0, DAG) && isZeroExtended(N1, DAG);
  8420. }
  8421. return false;
  8422. }
  8423. static SDValue LowerMUL(SDValue Op, SelectionDAG &DAG) {
  8424. // Multiplications are only custom-lowered for 128-bit vectors so that
  8425. // VMULL can be detected. Otherwise v2i64 multiplications are not legal.
  8426. EVT VT = Op.getValueType();
  8427. assert(VT.is128BitVector() && VT.isInteger() &&
  8428. "unexpected type for custom-lowering ISD::MUL");
  8429. SDNode *N0 = Op.getOperand(0).getNode();
  8430. SDNode *N1 = Op.getOperand(1).getNode();
  8431. unsigned NewOpc = 0;
  8432. bool isMLA = false;
  8433. bool isN0SExt = isSignExtended(N0, DAG);
  8434. bool isN1SExt = isSignExtended(N1, DAG);
  8435. if (isN0SExt && isN1SExt)
  8436. NewOpc = ARMISD::VMULLs;
  8437. else {
  8438. bool isN0ZExt = isZeroExtended(N0, DAG);
  8439. bool isN1ZExt = isZeroExtended(N1, DAG);
  8440. if (isN0ZExt && isN1ZExt)
  8441. NewOpc = ARMISD::VMULLu;
  8442. else if (isN1SExt || isN1ZExt) {
  8443. // Look for (s/zext A + s/zext B) * (s/zext C). We want to turn these
  8444. // into (s/zext A * s/zext C) + (s/zext B * s/zext C)
  8445. if (isN1SExt && isAddSubSExt(N0, DAG)) {
  8446. NewOpc = ARMISD::VMULLs;
  8447. isMLA = true;
  8448. } else if (isN1ZExt && isAddSubZExt(N0, DAG)) {
  8449. NewOpc = ARMISD::VMULLu;
  8450. isMLA = true;
  8451. } else if (isN0ZExt && isAddSubZExt(N1, DAG)) {
  8452. std::swap(N0, N1);
  8453. NewOpc = ARMISD::VMULLu;
  8454. isMLA = true;
  8455. }
  8456. }
  8457. if (!NewOpc) {
  8458. if (VT == MVT::v2i64)
  8459. // Fall through to expand this. It is not legal.
  8460. return SDValue();
  8461. else
  8462. // Other vector multiplications are legal.
  8463. return Op;
  8464. }
  8465. }
  8466. // Legalize to a VMULL instruction.
  8467. SDLoc DL(Op);
  8468. SDValue Op0;
  8469. SDValue Op1 = SkipExtensionForVMULL(N1, DAG);
  8470. if (!isMLA) {
  8471. Op0 = SkipExtensionForVMULL(N0, DAG);
  8472. assert(Op0.getValueType().is64BitVector() &&
  8473. Op1.getValueType().is64BitVector() &&
  8474. "unexpected types for extended operands to VMULL");
  8475. return DAG.getNode(NewOpc, DL, VT, Op0, Op1);
  8476. }
  8477. // Optimizing (zext A + zext B) * C, to (VMULL A, C) + (VMULL B, C) during
  8478. // isel lowering to take advantage of no-stall back to back vmul + vmla.
  8479. // vmull q0, d4, d6
  8480. // vmlal q0, d5, d6
  8481. // is faster than
  8482. // vaddl q0, d4, d5
  8483. // vmovl q1, d6
  8484. // vmul q0, q0, q1
  8485. SDValue N00 = SkipExtensionForVMULL(N0->getOperand(0).getNode(), DAG);
  8486. SDValue N01 = SkipExtensionForVMULL(N0->getOperand(1).getNode(), DAG);
  8487. EVT Op1VT = Op1.getValueType();
  8488. return DAG.getNode(N0->getOpcode(), DL, VT,
  8489. DAG.getNode(NewOpc, DL, VT,
  8490. DAG.getNode(ISD::BITCAST, DL, Op1VT, N00), Op1),
  8491. DAG.getNode(NewOpc, DL, VT,
  8492. DAG.getNode(ISD::BITCAST, DL, Op1VT, N01), Op1));
  8493. }
  8494. static SDValue LowerSDIV_v4i8(SDValue X, SDValue Y, const SDLoc &dl,
  8495. SelectionDAG &DAG) {
  8496. // TODO: Should this propagate fast-math-flags?
  8497. // Convert to float
  8498. // float4 xf = vcvt_f32_s32(vmovl_s16(a.lo));
  8499. // float4 yf = vcvt_f32_s32(vmovl_s16(b.lo));
  8500. X = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, X);
  8501. Y = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, Y);
  8502. X = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, X);
  8503. Y = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, Y);
  8504. // Get reciprocal estimate.
  8505. // float4 recip = vrecpeq_f32(yf);
  8506. Y = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
  8507. DAG.getConstant(Intrinsic::arm_neon_vrecpe, dl, MVT::i32),
  8508. Y);
  8509. // Because char has a smaller range than uchar, we can actually get away
  8510. // without any newton steps. This requires that we use a weird bias
  8511. // of 0xb000, however (again, this has been exhaustively tested).
  8512. // float4 result = as_float4(as_int4(xf*recip) + 0xb000);
  8513. X = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, X, Y);
  8514. X = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, X);
  8515. Y = DAG.getConstant(0xb000, dl, MVT::v4i32);
  8516. X = DAG.getNode(ISD::ADD, dl, MVT::v4i32, X, Y);
  8517. X = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, X);
  8518. // Convert back to short.
  8519. X = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::v4i32, X);
  8520. X = DAG.getNode(ISD::TRUNCATE, dl, MVT::v4i16, X);
  8521. return X;
  8522. }
  8523. static SDValue LowerSDIV_v4i16(SDValue N0, SDValue N1, const SDLoc &dl,
  8524. SelectionDAG &DAG) {
  8525. // TODO: Should this propagate fast-math-flags?
  8526. SDValue N2;
  8527. // Convert to float.
  8528. // float4 yf = vcvt_f32_s32(vmovl_s16(y));
  8529. // float4 xf = vcvt_f32_s32(vmovl_s16(x));
  8530. N0 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, N0);
  8531. N1 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, N1);
  8532. N0 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, N0);
  8533. N1 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, N1);
  8534. // Use reciprocal estimate and one refinement step.
  8535. // float4 recip = vrecpeq_f32(yf);
  8536. // recip *= vrecpsq_f32(yf, recip);
  8537. N2 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
  8538. DAG.getConstant(Intrinsic::arm_neon_vrecpe, dl, MVT::i32),
  8539. N1);
  8540. N1 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
  8541. DAG.getConstant(Intrinsic::arm_neon_vrecps, dl, MVT::i32),
  8542. N1, N2);
  8543. N2 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N1, N2);
  8544. // Because short has a smaller range than ushort, we can actually get away
  8545. // with only a single newton step. This requires that we use a weird bias
  8546. // of 89, however (again, this has been exhaustively tested).
  8547. // float4 result = as_float4(as_int4(xf*recip) + 0x89);
  8548. N0 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N0, N2);
  8549. N0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, N0);
  8550. N1 = DAG.getConstant(0x89, dl, MVT::v4i32);
  8551. N0 = DAG.getNode(ISD::ADD, dl, MVT::v4i32, N0, N1);
  8552. N0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, N0);
  8553. // Convert back to integer and return.
  8554. // return vmovn_s32(vcvt_s32_f32(result));
  8555. N0 = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::v4i32, N0);
  8556. N0 = DAG.getNode(ISD::TRUNCATE, dl, MVT::v4i16, N0);
  8557. return N0;
  8558. }
  8559. static SDValue LowerSDIV(SDValue Op, SelectionDAG &DAG,
  8560. const ARMSubtarget *ST) {
  8561. EVT VT = Op.getValueType();
  8562. assert((VT == MVT::v4i16 || VT == MVT::v8i8) &&
  8563. "unexpected type for custom-lowering ISD::SDIV");
  8564. SDLoc dl(Op);
  8565. SDValue N0 = Op.getOperand(0);
  8566. SDValue N1 = Op.getOperand(1);
  8567. SDValue N2, N3;
  8568. if (VT == MVT::v8i8) {
  8569. N0 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i16, N0);
  8570. N1 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i16, N1);
  8571. N2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N0,
  8572. DAG.getIntPtrConstant(4, dl));
  8573. N3 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N1,
  8574. DAG.getIntPtrConstant(4, dl));
  8575. N0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N0,
  8576. DAG.getIntPtrConstant(0, dl));
  8577. N1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N1,
  8578. DAG.getIntPtrConstant(0, dl));
  8579. N0 = LowerSDIV_v4i8(N0, N1, dl, DAG); // v4i16
  8580. N2 = LowerSDIV_v4i8(N2, N3, dl, DAG); // v4i16
  8581. N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i16, N0, N2);
  8582. N0 = LowerCONCAT_VECTORS(N0, DAG, ST);
  8583. N0 = DAG.getNode(ISD::TRUNCATE, dl, MVT::v8i8, N0);
  8584. return N0;
  8585. }
  8586. return LowerSDIV_v4i16(N0, N1, dl, DAG);
  8587. }
  8588. static SDValue LowerUDIV(SDValue Op, SelectionDAG &DAG,
  8589. const ARMSubtarget *ST) {
  8590. // TODO: Should this propagate fast-math-flags?
  8591. EVT VT = Op.getValueType();
  8592. assert((VT == MVT::v4i16 || VT == MVT::v8i8) &&
  8593. "unexpected type for custom-lowering ISD::UDIV");
  8594. SDLoc dl(Op);
  8595. SDValue N0 = Op.getOperand(0);
  8596. SDValue N1 = Op.getOperand(1);
  8597. SDValue N2, N3;
  8598. if (VT == MVT::v8i8) {
  8599. N0 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v8i16, N0);
  8600. N1 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v8i16, N1);
  8601. N2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N0,
  8602. DAG.getIntPtrConstant(4, dl));
  8603. N3 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N1,
  8604. DAG.getIntPtrConstant(4, dl));
  8605. N0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N0,
  8606. DAG.getIntPtrConstant(0, dl));
  8607. N1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N1,
  8608. DAG.getIntPtrConstant(0, dl));
  8609. N0 = LowerSDIV_v4i16(N0, N1, dl, DAG); // v4i16
  8610. N2 = LowerSDIV_v4i16(N2, N3, dl, DAG); // v4i16
  8611. N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i16, N0, N2);
  8612. N0 = LowerCONCAT_VECTORS(N0, DAG, ST);
  8613. N0 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v8i8,
  8614. DAG.getConstant(Intrinsic::arm_neon_vqmovnsu, dl,
  8615. MVT::i32),
  8616. N0);
  8617. return N0;
  8618. }
  // v4i16 udiv ... Convert to float.
  8620. // float4 yf = vcvt_f32_s32(vmovl_u16(y));
  8621. // float4 xf = vcvt_f32_s32(vmovl_u16(x));
  8622. N0 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v4i32, N0);
  8623. N1 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v4i32, N1);
  8624. N0 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, N0);
  8625. SDValue BN1 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, N1);
  8626. // Use reciprocal estimate and two refinement steps.
  8627. // float4 recip = vrecpeq_f32(yf);
  8628. // recip *= vrecpsq_f32(yf, recip);
  8629. // recip *= vrecpsq_f32(yf, recip);
  8630. N2 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
  8631. DAG.getConstant(Intrinsic::arm_neon_vrecpe, dl, MVT::i32),
  8632. BN1);
  8633. N1 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
  8634. DAG.getConstant(Intrinsic::arm_neon_vrecps, dl, MVT::i32),
  8635. BN1, N2);
  8636. N2 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N1, N2);
  8637. N1 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
  8638. DAG.getConstant(Intrinsic::arm_neon_vrecps, dl, MVT::i32),
  8639. BN1, N2);
  8640. N2 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N1, N2);
  8641. // Simply multiplying by the reciprocal estimate can leave us a few ulps
  8642. // too low, so we add 2 ulps (exhaustive testing shows that this is enough,
  8643. // and that it will never cause us to return an answer too large).
  8644. // float4 result = as_float4(as_int4(xf*recip) + 2);
  8645. N0 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N0, N2);
  8646. N0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, N0);
  8647. N1 = DAG.getConstant(2, dl, MVT::v4i32);
  8648. N0 = DAG.getNode(ISD::ADD, dl, MVT::v4i32, N0, N1);
  8649. N0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, N0);
  8650. // Convert back to integer and return.
  8651. // return vmovn_u32(vcvt_s32_f32(result));
  8652. N0 = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::v4i32, N0);
  8653. N0 = DAG.getNode(ISD::TRUNCATE, dl, MVT::v4i16, N0);
  8654. return N0;
  8655. }
  8656. static SDValue LowerADDSUBCARRY(SDValue Op, SelectionDAG &DAG) {
  8657. SDNode *N = Op.getNode();
  8658. EVT VT = N->getValueType(0);
  8659. SDVTList VTs = DAG.getVTList(VT, MVT::i32);
  8660. SDValue Carry = Op.getOperand(2);
  8661. SDLoc DL(Op);
  8662. SDValue Result;
  8663. if (Op.getOpcode() == ISD::ADDCARRY) {
  8664. // This converts the boolean value carry into the carry flag.
  8665. Carry = ConvertBooleanCarryToCarryFlag(Carry, DAG);
  8666. // Do the addition proper using the carry flag we wanted.
  8667. Result = DAG.getNode(ARMISD::ADDE, DL, VTs, Op.getOperand(0),
  8668. Op.getOperand(1), Carry);
  8669. // Now convert the carry flag into a boolean value.
  8670. Carry = ConvertCarryFlagToBooleanCarry(Result.getValue(1), VT, DAG);
  8671. } else {
  8672. // ARMISD::SUBE expects a carry, not a borrow as ISD::SUBCARRY provides,
  8673. // so we have to invert the incoming value first.
  8674. Carry = DAG.getNode(ISD::SUB, DL, MVT::i32,
  8675. DAG.getConstant(1, DL, MVT::i32), Carry);
  8676. // This converts the boolean value carry into the carry flag.
  8677. Carry = ConvertBooleanCarryToCarryFlag(Carry, DAG);
  8678. // Do the subtraction proper using the carry flag we wanted.
  8679. Result = DAG.getNode(ARMISD::SUBE, DL, VTs, Op.getOperand(0),
  8680. Op.getOperand(1), Carry);
  8681. // Now convert the carry flag into a boolean value.
  8682. Carry = ConvertCarryFlagToBooleanCarry(Result.getValue(1), VT, DAG);
  8683. // But the carry returned by ARMISD::SUBE is not a borrow as expected
  8684. // by ISD::SUBCARRY, so compute 1 - C.
  8685. Carry = DAG.getNode(ISD::SUB, DL, MVT::i32,
  8686. DAG.getConstant(1, DL, MVT::i32), Carry);
  8687. }
  8688. // Return both values.
  8689. return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Result, Carry);
  8690. }
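// Minimal sketch of the boolean <-> flag mapping used above (assumes the
// usual ARM convention that subtraction sets C when there is NO borrow):
//   ISD::ADDCARRY:  carry-in  (0/1)  == ARM C flag
//   ISD::SUBCARRY:  borrow-in (0/1)  == 1 - ARM C flag
// which is why SUBCARRY's value is inverted both on the way in and on the
// way out.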
  8691. SDValue ARMTargetLowering::LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const {
  8692. assert(Subtarget->isTargetDarwin());
  8693. // For iOS, we want to call an alternative entry point: __sincos_stret,
  8694. // whose return values are passed via sret.
  8695. SDLoc dl(Op);
  8696. SDValue Arg = Op.getOperand(0);
  8697. EVT ArgVT = Arg.getValueType();
  8698. Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
  8699. auto PtrVT = getPointerTy(DAG.getDataLayout());
  8700. MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
  8701. const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  8702. // Pair of floats / doubles used to pass the result.
  8703. Type *RetTy = StructType::get(ArgTy, ArgTy);
  8704. auto &DL = DAG.getDataLayout();
  8705. ArgListTy Args;
  8706. bool ShouldUseSRet = Subtarget->isAPCS_ABI();
  8707. SDValue SRet;
  8708. if (ShouldUseSRet) {
  8709. // Create stack object for sret.
  8710. const uint64_t ByteSize = DL.getTypeAllocSize(RetTy);
  8711. const Align StackAlign = DL.getPrefTypeAlign(RetTy);
  8712. int FrameIdx = MFI.CreateStackObject(ByteSize, StackAlign, false);
  8713. SRet = DAG.getFrameIndex(FrameIdx, TLI.getPointerTy(DL));
  8714. ArgListEntry Entry;
  8715. Entry.Node = SRet;
  8716. Entry.Ty = RetTy->getPointerTo();
  8717. Entry.IsSExt = false;
  8718. Entry.IsZExt = false;
  8719. Entry.IsSRet = true;
  8720. Args.push_back(Entry);
  8721. RetTy = Type::getVoidTy(*DAG.getContext());
  8722. }
  8723. ArgListEntry Entry;
  8724. Entry.Node = Arg;
  8725. Entry.Ty = ArgTy;
  8726. Entry.IsSExt = false;
  8727. Entry.IsZExt = false;
  8728. Args.push_back(Entry);
  8729. RTLIB::Libcall LC =
  8730. (ArgVT == MVT::f64) ? RTLIB::SINCOS_STRET_F64 : RTLIB::SINCOS_STRET_F32;
  8731. const char *LibcallName = getLibcallName(LC);
  8732. CallingConv::ID CC = getLibcallCallingConv(LC);
  8733. SDValue Callee = DAG.getExternalSymbol(LibcallName, getPointerTy(DL));
  8734. TargetLowering::CallLoweringInfo CLI(DAG);
  8735. CLI.setDebugLoc(dl)
  8736. .setChain(DAG.getEntryNode())
  8737. .setCallee(CC, RetTy, Callee, std::move(Args))
  8738. .setDiscardResult(ShouldUseSRet);
  8739. std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
  8740. if (!ShouldUseSRet)
  8741. return CallResult.first;
  8742. SDValue LoadSin =
  8743. DAG.getLoad(ArgVT, dl, CallResult.second, SRet, MachinePointerInfo());
  8744. // Address of cos field.
  8745. SDValue Add = DAG.getNode(ISD::ADD, dl, PtrVT, SRet,
  8746. DAG.getIntPtrConstant(ArgVT.getStoreSize(), dl));
  8747. SDValue LoadCos =
  8748. DAG.getLoad(ArgVT, dl, LoadSin.getValue(1), Add, MachinePointerInfo());
  8749. SDVTList Tys = DAG.getVTList(ArgVT, ArgVT);
  8750. return DAG.getNode(ISD::MERGE_VALUES, dl, Tys,
  8751. LoadSin.getValue(0), LoadCos.getValue(0));
  8752. }
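// Rough shape of the two call forms built above (an illustrative assumption;
// the exact prototypes are defined by the Darwin runtime, not by this file):
//   sret form (APCS):   void __sincos_stret(struct { T sin, cos; } *out, T x);
//                       followed by the two loads of out->sin / out->cos.
//   register form:      { T sin, cos } is returned directly, so
//                       CallResult.first already carries both values.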
  8753. SDValue ARMTargetLowering::LowerWindowsDIVLibCall(SDValue Op, SelectionDAG &DAG,
  8754. bool Signed,
  8755. SDValue &Chain) const {
  8756. EVT VT = Op.getValueType();
  8757. assert((VT == MVT::i32 || VT == MVT::i64) &&
  8758. "unexpected type for custom lowering DIV");
  8759. SDLoc dl(Op);
  8760. const auto &DL = DAG.getDataLayout();
  8761. const auto &TLI = DAG.getTargetLoweringInfo();
  8762. const char *Name = nullptr;
  8763. if (Signed)
  8764. Name = (VT == MVT::i32) ? "__rt_sdiv" : "__rt_sdiv64";
  8765. else
  8766. Name = (VT == MVT::i32) ? "__rt_udiv" : "__rt_udiv64";
  8767. SDValue ES = DAG.getExternalSymbol(Name, TLI.getPointerTy(DL));
  8768. ARMTargetLowering::ArgListTy Args;
  8769. for (auto AI : {1, 0}) {
  8770. ArgListEntry Arg;
  8771. Arg.Node = Op.getOperand(AI);
  8772. Arg.Ty = Arg.Node.getValueType().getTypeForEVT(*DAG.getContext());
  8773. Args.push_back(Arg);
  8774. }
  8775. CallLoweringInfo CLI(DAG);
  8776. CLI.setDebugLoc(dl)
  8777. .setChain(Chain)
  8778. .setCallee(CallingConv::ARM_AAPCS_VFP, VT.getTypeForEVT(*DAG.getContext()),
  8779. ES, std::move(Args));
  8780. return LowerCallTo(CLI).first;
  8781. }
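// Note on the helper calls built above: the operands are pushed in the order
// {1, 0}, i.e. divisor first, matching the (assumed) divisor-before-dividend
// argument order of the Windows __rt_*div helpers, roughly:
//   int     __rt_sdiv  (int     divisor, int     dividend);   // assumed
//   int64_t __rt_sdiv64(int64_t divisor, int64_t dividend);   // assumed
// The return value is simply consumed here as a VT-sized integer.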
  8782. // This is a code-size optimisation: return the original SDIV node to
  8783. // DAGCombiner when we don't want to expand SDIV into a sequence of
  8784. // instructions, and an empty SDValue otherwise, which will cause the
  8785. // SDIV to be expanded in DAGCombine.
  8786. SDValue
  8787. ARMTargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor,
  8788. SelectionDAG &DAG,
  8789. SmallVectorImpl<SDNode *> &Created) const {
  8790. // TODO: Support SREM
  8791. if (N->getOpcode() != ISD::SDIV)
  8792. return SDValue();
  8793. const auto &ST = static_cast<const ARMSubtarget&>(DAG.getSubtarget());
  8794. const bool MinSize = ST.hasMinSize();
  8795. const bool HasDivide = ST.isThumb() ? ST.hasDivideInThumbMode()
  8796. : ST.hasDivideInARMMode();
  8797. // Don't touch vector types; rewriting this may lead to scalarizing
  8798. // the int divs.
  8799. if (N->getOperand(0).getValueType().isVector())
  8800. return SDValue();
  8801. // Bail unless MinSize is set; for both ARM and Thumb mode we also need
  8802. // hardware divide support for this to be really profitable.
  8803. if (!(MinSize && HasDivide))
  8804. return SDValue();
  8805. // ARM mode is a bit simpler than Thumb: we can handle large power
  8806. // of 2 immediates with 1 mov instruction; no further checks required,
  8807. // just return the sdiv node.
  8808. if (!ST.isThumb())
  8809. return SDValue(N, 0);
  8810. // In Thumb mode, immediates larger than 128 need a wide 4-byte MOV,
  8811. // and thus lose the code size benefits of a MOVS that requires only 2.
  8812. // TargetTransformInfo and 'getIntImmCodeSizeCost' could be helpful here,
  8813. // but since this check does exactly that, pulling in TTI is not worth it.
  8814. if (Divisor.sgt(128))
  8815. return SDValue();
  8816. return SDValue(N, 0);
  8817. }
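// Illustrative size trade-off (assembly sketch only, with assumed register
// choices): keeping 'x / 8' as a real division costs roughly
//   movs r1, #8            ; short Thumb immediate move
//   sdiv r0, r0, r1
// whereas letting DAGCombine expand the power-of-two division produces a
// multi-instruction shift/add sequence. Once the divisor needs a wide
// immediate move (the Divisor.sgt(128) check above), keeping the SDIV no
// longer wins.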
  8818. SDValue ARMTargetLowering::LowerDIV_Windows(SDValue Op, SelectionDAG &DAG,
  8819. bool Signed) const {
  8820. assert(Op.getValueType() == MVT::i32 &&
  8821. "unexpected type for custom lowering DIV");
  8822. SDLoc dl(Op);
  8823. SDValue DBZCHK = DAG.getNode(ARMISD::WIN__DBZCHK, dl, MVT::Other,
  8824. DAG.getEntryNode(), Op.getOperand(1));
  8825. return LowerWindowsDIVLibCall(Op, DAG, Signed, DBZCHK);
  8826. }
  8827. static SDValue WinDBZCheckDenominator(SelectionDAG &DAG, SDNode *N, SDValue InChain) {
  8828. SDLoc DL(N);
  8829. SDValue Op = N->getOperand(1);
  8830. if (N->getValueType(0) == MVT::i32)
  8831. return DAG.getNode(ARMISD::WIN__DBZCHK, DL, MVT::Other, InChain, Op);
  8832. SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, Op,
  8833. DAG.getConstant(0, DL, MVT::i32));
  8834. SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, Op,
  8835. DAG.getConstant(1, DL, MVT::i32));
  8836. return DAG.getNode(ARMISD::WIN__DBZCHK, DL, MVT::Other, InChain,
  8837. DAG.getNode(ISD::OR, DL, MVT::i32, Lo, Hi));
  8838. }
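// For the i64 case above, (Lo | Hi) == 0 if and only if the full 64-bit
// denominator is zero, so a single 32-bit WIN__DBZCHK suffices.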
  8839. void ARMTargetLowering::ExpandDIV_Windows(
  8840. SDValue Op, SelectionDAG &DAG, bool Signed,
  8841. SmallVectorImpl<SDValue> &Results) const {
  8842. const auto &DL = DAG.getDataLayout();
  8843. const auto &TLI = DAG.getTargetLoweringInfo();
  8844. assert(Op.getValueType() == MVT::i64 &&
  8845. "unexpected type for custom lowering DIV");
  8846. SDLoc dl(Op);
  8847. SDValue DBZCHK = WinDBZCheckDenominator(DAG, Op.getNode(), DAG.getEntryNode());
  8848. SDValue Result = LowerWindowsDIVLibCall(Op, DAG, Signed, DBZCHK);
  8849. SDValue Lower = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Result);
  8850. SDValue Upper = DAG.getNode(ISD::SRL, dl, MVT::i64, Result,
  8851. DAG.getConstant(32, dl, TLI.getPointerTy(DL)));
  8852. Upper = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Upper);
  8853. Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lower, Upper));
  8854. }
  8855. static SDValue LowerPredicateLoad(SDValue Op, SelectionDAG &DAG) {
  8856. LoadSDNode *LD = cast<LoadSDNode>(Op.getNode());
  8857. EVT MemVT = LD->getMemoryVT();
  8858. assert((MemVT == MVT::v2i1 || MemVT == MVT::v4i1 || MemVT == MVT::v8i1 ||
  8859. MemVT == MVT::v16i1) &&
  8860. "Expected a predicate type!");
  8861. assert(MemVT == Op.getValueType());
  8862. assert(LD->getExtensionType() == ISD::NON_EXTLOAD &&
  8863. "Expected a non-extending load");
  8864. assert(LD->isUnindexed() && "Expected an unindexed load");
  8865. // The basic MVE VLDR on a v2i1/v4i1/v8i1 actually loads the entire 16bit
  8866. // predicate, with the "v4i1" bits spread out over the 16 bits loaded. We
  8867. // need to make sure that 8/4/2 bits are actually loaded into the correct
  8868. // place, which means loading the value and then shuffling the values into
  8869. // the bottom bits of the predicate.
  8870. // Equally, a VLDR for a v16i1 will actually load 32 bits (so it would be
  8871. // incorrect for BE).
  8872. // Speaking of BE, the rest of LLVM apparently assumes the reverse order to
  8873. // a natural VMSR(load), so the loaded value needs to be reversed.
  8874. SDLoc dl(Op);
  8875. SDValue Load = DAG.getExtLoad(
  8876. ISD::EXTLOAD, dl, MVT::i32, LD->getChain(), LD->getBasePtr(),
  8877. EVT::getIntegerVT(*DAG.getContext(), MemVT.getSizeInBits()),
  8878. LD->getMemOperand());
  8879. SDValue Val = Load;
  8880. if (DAG.getDataLayout().isBigEndian())
  8881. Val = DAG.getNode(ISD::SRL, dl, MVT::i32,
  8882. DAG.getNode(ISD::BITREVERSE, dl, MVT::i32, Load),
  8883. DAG.getConstant(32 - MemVT.getSizeInBits(), dl, MVT::i32));
  8884. SDValue Pred = DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::v16i1, Val);
  8885. if (MemVT != MVT::v16i1)
  8886. Pred = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MemVT, Pred,
  8887. DAG.getConstant(0, dl, MVT::i32));
  8888. return DAG.getMergeValues({Pred, Load.getValue(1)}, dl);
  8889. }
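// Minimal sketch of the node sequence built above for a v4i1 load
// (little-endian case; purely illustrative):
//   Load   : i32   = extload<i4> chain, ptr          ; the 4 stored bits
//   Pred   : v16i1 = ARMISD::PREDICATE_CAST Load     ; move into the predicate
//   Result : v4i1  = extract_subvector Pred, 0       ; keep the low lanes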
  8890. void ARMTargetLowering::LowerLOAD(SDNode *N, SmallVectorImpl<SDValue> &Results,
  8891. SelectionDAG &DAG) const {
  8892. LoadSDNode *LD = cast<LoadSDNode>(N);
  8893. EVT MemVT = LD->getMemoryVT();
  8894. assert(LD->isUnindexed() && "Loads should be unindexed at this point.");
  8895. if (MemVT == MVT::i64 && Subtarget->hasV5TEOps() &&
  8896. !Subtarget->isThumb1Only() && LD->isVolatile()) {
  8897. SDLoc dl(N);
  8898. SDValue Result = DAG.getMemIntrinsicNode(
  8899. ARMISD::LDRD, dl, DAG.getVTList({MVT::i32, MVT::i32, MVT::Other}),
  8900. {LD->getChain(), LD->getBasePtr()}, MemVT, LD->getMemOperand());
  8901. SDValue Lo = Result.getValue(DAG.getDataLayout().isLittleEndian() ? 0 : 1);
  8902. SDValue Hi = Result.getValue(DAG.getDataLayout().isLittleEndian() ? 1 : 0);
  8903. SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lo, Hi);
  8904. Results.append({Pair, Result.getValue(2)});
  8905. }
  8906. }
  8907. static SDValue LowerPredicateStore(SDValue Op, SelectionDAG &DAG) {
  8908. StoreSDNode *ST = cast<StoreSDNode>(Op.getNode());
  8909. EVT MemVT = ST->getMemoryVT();
  8910. assert((MemVT == MVT::v2i1 || MemVT == MVT::v4i1 || MemVT == MVT::v8i1 ||
  8911. MemVT == MVT::v16i1) &&
  8912. "Expected a predicate type!");
  8913. assert(MemVT == ST->getValue().getValueType());
  8914. assert(!ST->isTruncatingStore() && "Expected a non-extending store");
  8915. assert(ST->isUnindexed() && "Expected an unindexed store");
  8916. // Only store the v2i1/v4i1/v8i1 worth of bits, via a build_vector with the
  8917. // top bits left undef and a scalar truncating store.
  8918. SDLoc dl(Op);
  8919. SDValue Build = ST->getValue();
  8920. if (MemVT != MVT::v16i1) {
  8921. SmallVector<SDValue, 16> Ops;
  8922. for (unsigned I = 0; I < MemVT.getVectorNumElements(); I++) {
  8923. unsigned Elt = DAG.getDataLayout().isBigEndian()
  8924. ? MemVT.getVectorNumElements() - I - 1
  8925. : I;
  8926. Ops.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, Build,
  8927. DAG.getConstant(Elt, dl, MVT::i32)));
  8928. }
  8929. for (unsigned I = MemVT.getVectorNumElements(); I < 16; I++)
  8930. Ops.push_back(DAG.getUNDEF(MVT::i32));
  8931. Build = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v16i1, Ops);
  8932. }
  8933. SDValue GRP = DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::i32, Build);
  8934. if (MemVT == MVT::v16i1 && DAG.getDataLayout().isBigEndian())
  8935. GRP = DAG.getNode(ISD::SRL, dl, MVT::i32,
  8936. DAG.getNode(ISD::BITREVERSE, dl, MVT::i32, GRP),
  8937. DAG.getConstant(16, dl, MVT::i32));
  8938. return DAG.getTruncStore(
  8939. ST->getChain(), dl, GRP, ST->getBasePtr(),
  8940. EVT::getIntegerVT(*DAG.getContext(), MemVT.getSizeInBits()),
  8941. ST->getMemOperand());
  8942. }
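// Corresponding sketch for a v4i1 store (little-endian case; illustrative):
//   Build : v16i1 = BUILD_VECTOR e0..e3, undef x 12  ; low lanes only
//   Bits  : i32   = ARMISD::PREDICATE_CAST Build
//   Store : truncstore<i4> chain, Bits, ptr          ; only 4 bits hit memory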
  8943. static SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG,
  8944. const ARMSubtarget *Subtarget) {
  8945. StoreSDNode *ST = cast<StoreSDNode>(Op.getNode());
  8946. EVT MemVT = ST->getMemoryVT();
  8947. assert(ST->isUnindexed() && "Stores should be unindexed at this point.");
  8948. if (MemVT == MVT::i64 && Subtarget->hasV5TEOps() &&
  8949. !Subtarget->isThumb1Only() && ST->isVolatile()) {
  8950. SDNode *N = Op.getNode();
  8951. SDLoc dl(N);
  8952. SDValue Lo = DAG.getNode(
  8953. ISD::EXTRACT_ELEMENT, dl, MVT::i32, ST->getValue(),
  8954. DAG.getTargetConstant(DAG.getDataLayout().isLittleEndian() ? 0 : 1, dl,
  8955. MVT::i32));
  8956. SDValue Hi = DAG.getNode(
  8957. ISD::EXTRACT_ELEMENT, dl, MVT::i32, ST->getValue(),
  8958. DAG.getTargetConstant(DAG.getDataLayout().isLittleEndian() ? 1 : 0, dl,
  8959. MVT::i32));
  8960. return DAG.getMemIntrinsicNode(ARMISD::STRD, dl, DAG.getVTList(MVT::Other),
  8961. {ST->getChain(), Lo, Hi, ST->getBasePtr()},
  8962. MemVT, ST->getMemOperand());
  8963. } else if (Subtarget->hasMVEIntegerOps() &&
  8964. ((MemVT == MVT::v2i1 || MemVT == MVT::v4i1 || MemVT == MVT::v8i1 ||
  8965. MemVT == MVT::v16i1))) {
  8966. return LowerPredicateStore(Op, DAG);
  8967. }
  8968. return SDValue();
  8969. }
  8970. static bool isZeroVector(SDValue N) {
  8971. return (ISD::isBuildVectorAllZeros(N.getNode()) ||
  8972. (N->getOpcode() == ARMISD::VMOVIMM &&
  8973. isNullConstant(N->getOperand(0))));
  8974. }
  8975. static SDValue LowerMLOAD(SDValue Op, SelectionDAG &DAG) {
  8976. MaskedLoadSDNode *N = cast<MaskedLoadSDNode>(Op.getNode());
  8977. MVT VT = Op.getSimpleValueType();
  8978. SDValue Mask = N->getMask();
  8979. SDValue PassThru = N->getPassThru();
  8980. SDLoc dl(Op);
  8981. if (isZeroVector(PassThru))
  8982. return Op;
  8983. // MVE Masked loads use zero as the passthru value. Here we convert undef to
  8984. // zero too, and other values are lowered to a select.
  8985. SDValue ZeroVec = DAG.getNode(ARMISD::VMOVIMM, dl, VT,
  8986. DAG.getTargetConstant(0, dl, MVT::i32));
  8987. SDValue NewLoad = DAG.getMaskedLoad(
  8988. VT, dl, N->getChain(), N->getBasePtr(), N->getOffset(), Mask, ZeroVec,
  8989. N->getMemoryVT(), N->getMemOperand(), N->getAddressingMode(),
  8990. N->getExtensionType(), N->isExpandingLoad());
  8991. SDValue Combo = NewLoad;
  8992. bool PassThruIsCastZero = (PassThru.getOpcode() == ISD::BITCAST ||
  8993. PassThru.getOpcode() == ARMISD::VECTOR_REG_CAST) &&
  8994. isZeroVector(PassThru->getOperand(0));
  8995. if (!PassThru.isUndef() && !PassThruIsCastZero)
  8996. Combo = DAG.getNode(ISD::VSELECT, dl, VT, Mask, NewLoad, PassThru);
  8997. return DAG.getMergeValues({Combo, NewLoad.getValue(1)}, dl);
  8998. }
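// Sketch of the rewrite performed above when the passthru is neither undef
// nor (a cast of) zero, in IR-level terms (illustrative only):
//   %l = masked.load(ptr, mask, zeroinitializer)  ; MVE's native behaviour
//   %r = select mask, %l, %passthru               ; recover the requested
//                                                 ; passthru lanes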
  8999. static SDValue LowerVecReduce(SDValue Op, SelectionDAG &DAG,
  9000. const ARMSubtarget *ST) {
  9001. if (!ST->hasMVEIntegerOps())
  9002. return SDValue();
  9003. SDLoc dl(Op);
  9004. unsigned BaseOpcode = 0;
  9005. switch (Op->getOpcode()) {
  9006. default: llvm_unreachable("Expected VECREDUCE opcode");
  9007. case ISD::VECREDUCE_FADD: BaseOpcode = ISD::FADD; break;
  9008. case ISD::VECREDUCE_FMUL: BaseOpcode = ISD::FMUL; break;
  9009. case ISD::VECREDUCE_MUL: BaseOpcode = ISD::MUL; break;
  9010. case ISD::VECREDUCE_AND: BaseOpcode = ISD::AND; break;
  9011. case ISD::VECREDUCE_OR: BaseOpcode = ISD::OR; break;
  9012. case ISD::VECREDUCE_XOR: BaseOpcode = ISD::XOR; break;
  9013. case ISD::VECREDUCE_FMAX: BaseOpcode = ISD::FMAXNUM; break;
  9014. case ISD::VECREDUCE_FMIN: BaseOpcode = ISD::FMINNUM; break;
  9015. }
  9016. SDValue Op0 = Op->getOperand(0);
  9017. EVT VT = Op0.getValueType();
  9018. EVT EltVT = VT.getVectorElementType();
  9019. unsigned NumElts = VT.getVectorNumElements();
  9020. unsigned NumActiveLanes = NumElts;
  9021. assert((NumActiveLanes == 16 || NumActiveLanes == 8 || NumActiveLanes == 4 ||
  9022. NumActiveLanes == 2) &&
  9023. "Only expected a power 2 vector size");
  9024. // Use Mul(X, Rev(X)) until 4 items remain. Going down to 4 vector elements
  9025. // allows us to easily extract vector elements from the lanes.
  9026. while (NumActiveLanes > 4) {
  9027. unsigned RevOpcode = NumActiveLanes == 16 ? ARMISD::VREV16 : ARMISD::VREV32;
  9028. SDValue Rev = DAG.getNode(RevOpcode, dl, VT, Op0);
  9029. Op0 = DAG.getNode(BaseOpcode, dl, VT, Op0, Rev);
  9030. NumActiveLanes /= 2;
  9031. }
  9032. SDValue Res;
  9033. if (NumActiveLanes == 4) {
  9034. // The remaining 4 elements are combined sequentially with the base opcode.
  9035. SDValue Ext0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Op0,
  9036. DAG.getConstant(0 * NumElts / 4, dl, MVT::i32));
  9037. SDValue Ext1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Op0,
  9038. DAG.getConstant(1 * NumElts / 4, dl, MVT::i32));
  9039. SDValue Ext2 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Op0,
  9040. DAG.getConstant(2 * NumElts / 4, dl, MVT::i32));
  9041. SDValue Ext3 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Op0,
  9042. DAG.getConstant(3 * NumElts / 4, dl, MVT::i32));
  9043. SDValue Res0 = DAG.getNode(BaseOpcode, dl, EltVT, Ext0, Ext1, Op->getFlags());
  9044. SDValue Res1 = DAG.getNode(BaseOpcode, dl, EltVT, Ext2, Ext3, Op->getFlags());
  9045. Res = DAG.getNode(BaseOpcode, dl, EltVT, Res0, Res1, Op->getFlags());
  9046. } else {
  9047. SDValue Ext0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Op0,
  9048. DAG.getConstant(0, dl, MVT::i32));
  9049. SDValue Ext1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Op0,
  9050. DAG.getConstant(1, dl, MVT::i32));
  9051. Res = DAG.getNode(BaseOpcode, dl, EltVT, Ext0, Ext1, Op->getFlags());
  9052. }
  9053. // Result type may be wider than element type.
  9054. if (EltVT != Op->getValueType(0))
  9055. Res = DAG.getNode(ISD::ANY_EXTEND, dl, Op->getValueType(0), Res);
  9056. return Res;
  9057. }
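// Illustrative shape of the reduction above for a v8i16 input (OP is the
// base opcode, lN are lane indices):
//   step 1: X = OP(X, VREV32(X))            ; adjacent lanes {0,1},{2,3},...
//   step 2: four active lanes remain; extract lanes 0, 2, 4, 6 and fold
//           them pairwise: OP(OP(l0, l2), OP(l4, l6))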
  9058. static SDValue LowerVecReduceF(SDValue Op, SelectionDAG &DAG,
  9059. const ARMSubtarget *ST) {
  9060. if (!ST->hasMVEFloatOps())
  9061. return SDValue();
  9062. return LowerVecReduce(Op, DAG, ST);
  9063. }
  9064. static SDValue LowerAtomicLoadStore(SDValue Op, SelectionDAG &DAG) {
  9065. if (isStrongerThanMonotonic(cast<AtomicSDNode>(Op)->getSuccessOrdering()))
  9066. // Acquire/Release load/store is not legal for targets without a dmb or
  9067. // equivalent available.
  9068. return SDValue();
  9069. // Monotonic load/store is legal for all targets.
  9070. return Op;
  9071. }
  9072. static void ReplaceREADCYCLECOUNTER(SDNode *N,
  9073. SmallVectorImpl<SDValue> &Results,
  9074. SelectionDAG &DAG,
  9075. const ARMSubtarget *Subtarget) {
  9076. SDLoc DL(N);
  9077. // Under Power Management extensions, the cycle-count is:
  9078. // mrc p15, #0, <Rt>, c9, c13, #0
  9079. SDValue Ops[] = { N->getOperand(0), // Chain
  9080. DAG.getTargetConstant(Intrinsic::arm_mrc, DL, MVT::i32),
  9081. DAG.getTargetConstant(15, DL, MVT::i32),
  9082. DAG.getTargetConstant(0, DL, MVT::i32),
  9083. DAG.getTargetConstant(9, DL, MVT::i32),
  9084. DAG.getTargetConstant(13, DL, MVT::i32),
  9085. DAG.getTargetConstant(0, DL, MVT::i32)
  9086. };
  9087. SDValue Cycles32 = DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL,
  9088. DAG.getVTList(MVT::i32, MVT::Other), Ops);
  9089. Results.push_back(DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Cycles32,
  9090. DAG.getConstant(0, DL, MVT::i32)));
  9091. Results.push_back(Cycles32.getValue(1));
  9092. }
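// Equivalent source-level way to read the same counter (illustrative only;
// it requires user-mode access to PMCCNTR to have been enabled):
//   uint32_t cycles;
//   __asm__ volatile("mrc p15, 0, %0, c9, c13, 0" : "=r"(cycles));
// The 64-bit READCYCLECOUNTER result is then formed by pairing this 32-bit
// value with a zero high half, as done above.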
  9093. static SDValue createGPRPairNode(SelectionDAG &DAG, SDValue V) {
  9094. SDLoc dl(V.getNode());
  9095. SDValue VLo = DAG.getAnyExtOrTrunc(V, dl, MVT::i32);
  9096. SDValue VHi = DAG.getAnyExtOrTrunc(
  9097. DAG.getNode(ISD::SRL, dl, MVT::i64, V, DAG.getConstant(32, dl, MVT::i32)),
  9098. dl, MVT::i32);
  9099. bool isBigEndian = DAG.getDataLayout().isBigEndian();
  9100. if (isBigEndian)
  9101. std::swap (VLo, VHi);
  9102. SDValue RegClass =
  9103. DAG.getTargetConstant(ARM::GPRPairRegClassID, dl, MVT::i32);
  9104. SDValue SubReg0 = DAG.getTargetConstant(ARM::gsub_0, dl, MVT::i32);
  9105. SDValue SubReg1 = DAG.getTargetConstant(ARM::gsub_1, dl, MVT::i32);
  9106. const SDValue Ops[] = { RegClass, VLo, SubReg0, VHi, SubReg1 };
  9107. return SDValue(
  9108. DAG.getMachineNode(TargetOpcode::REG_SEQUENCE, dl, MVT::Untyped, Ops), 0);
  9109. }
  9110. static void ReplaceCMP_SWAP_64Results(SDNode *N,
  9111. SmallVectorImpl<SDValue> & Results,
  9112. SelectionDAG &DAG) {
  9113. assert(N->getValueType(0) == MVT::i64 &&
  9114. "AtomicCmpSwap on types less than 64 should be legal");
  9115. SDValue Ops[] = {N->getOperand(1),
  9116. createGPRPairNode(DAG, N->getOperand(2)),
  9117. createGPRPairNode(DAG, N->getOperand(3)),
  9118. N->getOperand(0)};
  9119. SDNode *CmpSwap = DAG.getMachineNode(
  9120. ARM::CMP_SWAP_64, SDLoc(N),
  9121. DAG.getVTList(MVT::Untyped, MVT::i32, MVT::Other), Ops);
  9122. MachineMemOperand *MemOp = cast<MemSDNode>(N)->getMemOperand();
  9123. DAG.setNodeMemRefs(cast<MachineSDNode>(CmpSwap), {MemOp});
  9124. bool isBigEndian = DAG.getDataLayout().isBigEndian();
  9125. SDValue Lo =
  9126. DAG.getTargetExtractSubreg(isBigEndian ? ARM::gsub_1 : ARM::gsub_0,
  9127. SDLoc(N), MVT::i32, SDValue(CmpSwap, 0));
  9128. SDValue Hi =
  9129. DAG.getTargetExtractSubreg(isBigEndian ? ARM::gsub_0 : ARM::gsub_1,
  9130. SDLoc(N), MVT::i32, SDValue(CmpSwap, 0));
  9131. Results.push_back(DAG.getNode(ISD::BUILD_PAIR, SDLoc(N), MVT::i64, Lo, Hi));
  9132. Results.push_back(SDValue(CmpSwap, 2));
  9133. }
  9134. SDValue ARMTargetLowering::LowerFSETCC(SDValue Op, SelectionDAG &DAG) const {
  9135. SDLoc dl(Op);
  9136. EVT VT = Op.getValueType();
  9137. SDValue Chain = Op.getOperand(0);
  9138. SDValue LHS = Op.getOperand(1);
  9139. SDValue RHS = Op.getOperand(2);
  9140. ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(3))->get();
  9141. bool IsSignaling = Op.getOpcode() == ISD::STRICT_FSETCCS;
  9142. // If we don't have instructions of this float type then soften to a libcall
  9143. // and use SETCC instead.
  9144. if (isUnsupportedFloatingType(LHS.getValueType())) {
  9145. DAG.getTargetLoweringInfo().softenSetCCOperands(
  9146. DAG, LHS.getValueType(), LHS, RHS, CC, dl, LHS, RHS, Chain, IsSignaling);
  9147. if (!RHS.getNode()) {
  9148. RHS = DAG.getConstant(0, dl, LHS.getValueType());
  9149. CC = ISD::SETNE;
  9150. }
  9151. SDValue Result = DAG.getNode(ISD::SETCC, dl, VT, LHS, RHS,
  9152. DAG.getCondCode(CC));
  9153. return DAG.getMergeValues({Result, Chain}, dl);
  9154. }
  9155. ARMCC::CondCodes CondCode, CondCode2;
  9156. FPCCToARMCC(CC, CondCode, CondCode2);
  9157. // FIXME: Chain is not handled correctly here. Currently the FPSCR is implicit
  9158. // in CMPFP and CMPFPE, but instead it should be made explicit by these
  9159. // instructions using a chain instead of glue. This would also fix the problem
  9160. // here (and also in LowerSELECT_CC) where we generate two comparisons when
  9161. // CondCode2 != AL.
  9162. SDValue True = DAG.getConstant(1, dl, VT);
  9163. SDValue False = DAG.getConstant(0, dl, VT);
  9164. SDValue ARMcc = DAG.getConstant(CondCode, dl, MVT::i32);
  9165. SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
  9166. SDValue Cmp = getVFPCmp(LHS, RHS, DAG, dl, IsSignaling);
  9167. SDValue Result = getCMOV(dl, VT, False, True, ARMcc, CCR, Cmp, DAG);
  9168. if (CondCode2 != ARMCC::AL) {
  9169. ARMcc = DAG.getConstant(CondCode2, dl, MVT::i32);
  9170. Cmp = getVFPCmp(LHS, RHS, DAG, dl, IsSignaling);
  9171. Result = getCMOV(dl, VT, Result, True, ARMcc, CCR, Cmp, DAG);
  9172. }
  9173. return DAG.getMergeValues({Result, Chain}, dl);
  9174. }
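// Example of the CondCode2 path above (hedged; based on the usual ARM FP
// condition mapping): an ordered "not equal" (SETONE) has no single ARM
// condition, so it is emitted as two predicated selects,
//   Result = CMOV(0, 1, MI, CMPFP)          ; less-than half
//   Result = CMOV(Result, 1, GT, CMPFP)     ; greater-than half
// which is exactly why two comparisons are generated when CondCode2 != AL.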
  9175. SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
  9176. LLVM_DEBUG(dbgs() << "Lowering node: "; Op.dump());
  9177. switch (Op.getOpcode()) {
  9178. default: llvm_unreachable("Don't know how to custom lower this!");
  9179. case ISD::WRITE_REGISTER: return LowerWRITE_REGISTER(Op, DAG);
  9180. case ISD::ConstantPool: return LowerConstantPool(Op, DAG);
  9181. case ISD::BlockAddress: return LowerBlockAddress(Op, DAG);
  9182. case ISD::GlobalAddress: return LowerGlobalAddress(Op, DAG);
  9183. case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG);
  9184. case ISD::SELECT: return LowerSELECT(Op, DAG);
  9185. case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG);
  9186. case ISD::BRCOND: return LowerBRCOND(Op, DAG);
  9187. case ISD::BR_CC: return LowerBR_CC(Op, DAG);
  9188. case ISD::BR_JT: return LowerBR_JT(Op, DAG);
  9189. case ISD::VASTART: return LowerVASTART(Op, DAG);
  9190. case ISD::ATOMIC_FENCE: return LowerATOMIC_FENCE(Op, DAG, Subtarget);
  9191. case ISD::PREFETCH: return LowerPREFETCH(Op, DAG, Subtarget);
  9192. case ISD::SINT_TO_FP:
  9193. case ISD::UINT_TO_FP: return LowerINT_TO_FP(Op, DAG);
  9194. case ISD::STRICT_FP_TO_SINT:
  9195. case ISD::STRICT_FP_TO_UINT:
  9196. case ISD::FP_TO_SINT:
  9197. case ISD::FP_TO_UINT: return LowerFP_TO_INT(Op, DAG);
  9198. case ISD::FP_TO_SINT_SAT:
  9199. case ISD::FP_TO_UINT_SAT: return LowerFP_TO_INT_SAT(Op, DAG, Subtarget);
  9200. case ISD::FCOPYSIGN: return LowerFCOPYSIGN(Op, DAG);
  9201. case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG);
  9202. case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG);
  9203. case ISD::EH_SJLJ_SETJMP: return LowerEH_SJLJ_SETJMP(Op, DAG);
  9204. case ISD::EH_SJLJ_LONGJMP: return LowerEH_SJLJ_LONGJMP(Op, DAG);
  9205. case ISD::EH_SJLJ_SETUP_DISPATCH: return LowerEH_SJLJ_SETUP_DISPATCH(Op, DAG);
  9206. case ISD::INTRINSIC_VOID: return LowerINTRINSIC_VOID(Op, DAG, Subtarget);
  9207. case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG,
  9208. Subtarget);
  9209. case ISD::BITCAST: return ExpandBITCAST(Op.getNode(), DAG, Subtarget);
  9210. case ISD::SHL:
  9211. case ISD::SRL:
  9212. case ISD::SRA: return LowerShift(Op.getNode(), DAG, Subtarget);
  9213. case ISD::SREM: return LowerREM(Op.getNode(), DAG);
  9214. case ISD::UREM: return LowerREM(Op.getNode(), DAG);
  9215. case ISD::SHL_PARTS: return LowerShiftLeftParts(Op, DAG);
  9216. case ISD::SRL_PARTS:
  9217. case ISD::SRA_PARTS: return LowerShiftRightParts(Op, DAG);
  9218. case ISD::CTTZ:
  9219. case ISD::CTTZ_ZERO_UNDEF: return LowerCTTZ(Op.getNode(), DAG, Subtarget);
  9220. case ISD::CTPOP: return LowerCTPOP(Op.getNode(), DAG, Subtarget);
  9221. case ISD::SETCC: return LowerVSETCC(Op, DAG, Subtarget);
  9222. case ISD::SETCCCARRY: return LowerSETCCCARRY(Op, DAG);
  9223. case ISD::ConstantFP: return LowerConstantFP(Op, DAG, Subtarget);
  9224. case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG, Subtarget);
  9225. case ISD::VECTOR_SHUFFLE: return LowerVECTOR_SHUFFLE(Op, DAG, Subtarget);
  9226. case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op, DAG, Subtarget);
  9227. case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG);
  9228. case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG, Subtarget);
  9229. case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG, Subtarget);
  9230. case ISD::TRUNCATE: return LowerTruncate(Op.getNode(), DAG, Subtarget);
  9231. case ISD::SIGN_EXTEND:
  9232. case ISD::ZERO_EXTEND: return LowerVectorExtend(Op.getNode(), DAG, Subtarget);
  9233. case ISD::FLT_ROUNDS_: return LowerFLT_ROUNDS_(Op, DAG);
  9234. case ISD::SET_ROUNDING: return LowerSET_ROUNDING(Op, DAG);
  9235. case ISD::MUL: return LowerMUL(Op, DAG);
  9236. case ISD::SDIV:
  9237. if (Subtarget->isTargetWindows() && !Op.getValueType().isVector())
  9238. return LowerDIV_Windows(Op, DAG, /* Signed */ true);
  9239. return LowerSDIV(Op, DAG, Subtarget);
  9240. case ISD::UDIV:
  9241. if (Subtarget->isTargetWindows() && !Op.getValueType().isVector())
  9242. return LowerDIV_Windows(Op, DAG, /* Signed */ false);
  9243. return LowerUDIV(Op, DAG, Subtarget);
  9244. case ISD::ADDCARRY:
  9245. case ISD::SUBCARRY: return LowerADDSUBCARRY(Op, DAG);
  9246. case ISD::SADDO:
  9247. case ISD::SSUBO:
  9248. return LowerSignedALUO(Op, DAG);
  9249. case ISD::UADDO:
  9250. case ISD::USUBO:
  9251. return LowerUnsignedALUO(Op, DAG);
  9252. case ISD::SADDSAT:
  9253. case ISD::SSUBSAT:
  9254. case ISD::UADDSAT:
  9255. case ISD::USUBSAT:
  9256. return LowerADDSUBSAT(Op, DAG, Subtarget);
  9257. case ISD::LOAD:
  9258. return LowerPredicateLoad(Op, DAG);
  9259. case ISD::STORE:
  9260. return LowerSTORE(Op, DAG, Subtarget);
  9261. case ISD::MLOAD:
  9262. return LowerMLOAD(Op, DAG);
  9263. case ISD::VECREDUCE_MUL:
  9264. case ISD::VECREDUCE_AND:
  9265. case ISD::VECREDUCE_OR:
  9266. case ISD::VECREDUCE_XOR:
  9267. return LowerVecReduce(Op, DAG, Subtarget);
  9268. case ISD::VECREDUCE_FADD:
  9269. case ISD::VECREDUCE_FMUL:
  9270. case ISD::VECREDUCE_FMIN:
  9271. case ISD::VECREDUCE_FMAX:
  9272. return LowerVecReduceF(Op, DAG, Subtarget);
  9273. case ISD::ATOMIC_LOAD:
  9274. case ISD::ATOMIC_STORE: return LowerAtomicLoadStore(Op, DAG);
  9275. case ISD::FSINCOS: return LowerFSINCOS(Op, DAG);
  9276. case ISD::SDIVREM:
  9277. case ISD::UDIVREM: return LowerDivRem(Op, DAG);
  9278. case ISD::DYNAMIC_STACKALLOC:
  9279. if (Subtarget->isTargetWindows())
  9280. return LowerDYNAMIC_STACKALLOC(Op, DAG);
  9281. llvm_unreachable("Don't know how to custom lower this!");
  9282. case ISD::STRICT_FP_ROUND:
  9283. case ISD::FP_ROUND: return LowerFP_ROUND(Op, DAG);
  9284. case ISD::STRICT_FP_EXTEND:
  9285. case ISD::FP_EXTEND: return LowerFP_EXTEND(Op, DAG);
  9286. case ISD::STRICT_FSETCC:
  9287. case ISD::STRICT_FSETCCS: return LowerFSETCC(Op, DAG);
  9288. case ARMISD::WIN__DBZCHK: return SDValue();
  9289. }
  9290. }
  9291. static void ReplaceLongIntrinsic(SDNode *N, SmallVectorImpl<SDValue> &Results,
  9292. SelectionDAG &DAG) {
  9293. unsigned IntNo = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
  9294. unsigned Opc = 0;
  9295. if (IntNo == Intrinsic::arm_smlald)
  9296. Opc = ARMISD::SMLALD;
  9297. else if (IntNo == Intrinsic::arm_smlaldx)
  9298. Opc = ARMISD::SMLALDX;
  9299. else if (IntNo == Intrinsic::arm_smlsld)
  9300. Opc = ARMISD::SMLSLD;
  9301. else if (IntNo == Intrinsic::arm_smlsldx)
  9302. Opc = ARMISD::SMLSLDX;
  9303. else
  9304. return;
  9305. SDLoc dl(N);
  9306. SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32,
  9307. N->getOperand(3),
  9308. DAG.getConstant(0, dl, MVT::i32));
  9309. SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32,
  9310. N->getOperand(3),
  9311. DAG.getConstant(1, dl, MVT::i32));
  9312. SDValue LongMul = DAG.getNode(Opc, dl,
  9313. DAG.getVTList(MVT::i32, MVT::i32),
  9314. N->getOperand(1), N->getOperand(2),
  9315. Lo, Hi);
  9316. Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64,
  9317. LongMul.getValue(0), LongMul.getValue(1)));
  9318. }
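// Shape of the replacement above (illustrative): the i64 accumulator of,
// e.g., @llvm.arm.smlald is split into two i32 halves, fed to the matching
// ARMISD node, and the two i32 results are glued back into an i64:
//   (i64 acc) -> {Lo, Hi} -> SMLALD(a, b, Lo, Hi) -> BUILD_PAIR(lo', hi')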
  9319. /// ReplaceNodeResults - Replace the results of a node with an illegal result
  9320. /// type with new values built out of custom code.
  9321. void ARMTargetLowering::ReplaceNodeResults(SDNode *N,
  9322. SmallVectorImpl<SDValue> &Results,
  9323. SelectionDAG &DAG) const {
  9324. SDValue Res;
  9325. switch (N->getOpcode()) {
  9326. default:
  9327. llvm_unreachable("Don't know how to custom expand this!");
  9328. case ISD::READ_REGISTER:
  9329. ExpandREAD_REGISTER(N, Results, DAG);
  9330. break;
  9331. case ISD::BITCAST:
  9332. Res = ExpandBITCAST(N, DAG, Subtarget);
  9333. break;
  9334. case ISD::SRL:
  9335. case ISD::SRA:
  9336. case ISD::SHL:
  9337. Res = Expand64BitShift(N, DAG, Subtarget);
  9338. break;
  9339. case ISD::SREM:
  9340. case ISD::UREM:
  9341. Res = LowerREM(N, DAG);
  9342. break;
  9343. case ISD::SDIVREM:
  9344. case ISD::UDIVREM:
  9345. Res = LowerDivRem(SDValue(N, 0), DAG);
  9346. assert(Res.getNumOperands() == 2 && "DivRem needs two values");
  9347. Results.push_back(Res.getValue(0));
  9348. Results.push_back(Res.getValue(1));
  9349. return;
  9350. case ISD::SADDSAT:
  9351. case ISD::SSUBSAT:
  9352. case ISD::UADDSAT:
  9353. case ISD::USUBSAT:
  9354. Res = LowerADDSUBSAT(SDValue(N, 0), DAG, Subtarget);
  9355. break;
  9356. case ISD::READCYCLECOUNTER:
  9357. ReplaceREADCYCLECOUNTER(N, Results, DAG, Subtarget);
  9358. return;
  9359. case ISD::UDIV:
  9360. case ISD::SDIV:
  9361. assert(Subtarget->isTargetWindows() && "can only expand DIV on Windows");
  9362. return ExpandDIV_Windows(SDValue(N, 0), DAG, N->getOpcode() == ISD::SDIV,
  9363. Results);
  9364. case ISD::ATOMIC_CMP_SWAP:
  9365. ReplaceCMP_SWAP_64Results(N, Results, DAG);
  9366. return;
  9367. case ISD::INTRINSIC_WO_CHAIN:
  9368. return ReplaceLongIntrinsic(N, Results, DAG);
  9369. case ISD::ABS:
  9370. lowerABS(N, Results, DAG);
  9371. return;
  9372. case ISD::LOAD:
  9373. LowerLOAD(N, Results, DAG);
  9374. break;
  9375. case ISD::TRUNCATE:
  9376. Res = LowerTruncate(N, DAG, Subtarget);
  9377. break;
  9378. case ISD::SIGN_EXTEND:
  9379. case ISD::ZERO_EXTEND:
  9380. Res = LowerVectorExtend(N, DAG, Subtarget);
  9381. break;
  9382. case ISD::FP_TO_SINT_SAT:
  9383. case ISD::FP_TO_UINT_SAT:
  9384. Res = LowerFP_TO_INT_SAT(SDValue(N, 0), DAG, Subtarget);
  9385. break;
  9386. }
  9387. if (Res.getNode())
  9388. Results.push_back(Res);
  9389. }
  9390. //===----------------------------------------------------------------------===//
  9391. // ARM Scheduler Hooks
  9392. //===----------------------------------------------------------------------===//
  9393. /// SetupEntryBlockForSjLj - Insert code into the entry block that creates and
  9394. /// registers the function context.
  9395. void ARMTargetLowering::SetupEntryBlockForSjLj(MachineInstr &MI,
  9396. MachineBasicBlock *MBB,
  9397. MachineBasicBlock *DispatchBB,
  9398. int FI) const {
  9399. assert(!Subtarget->isROPI() && !Subtarget->isRWPI() &&
  9400. "ROPI/RWPI not currently supported with SjLj");
  9401. const TargetInstrInfo *TII = Subtarget->getInstrInfo();
  9402. DebugLoc dl = MI.getDebugLoc();
  9403. MachineFunction *MF = MBB->getParent();
  9404. MachineRegisterInfo *MRI = &MF->getRegInfo();
  9405. MachineConstantPool *MCP = MF->getConstantPool();
  9406. ARMFunctionInfo *AFI = MF->getInfo<ARMFunctionInfo>();
  9407. const Function &F = MF->getFunction();
  9408. bool isThumb = Subtarget->isThumb();
  9409. bool isThumb2 = Subtarget->isThumb2();
  9410. unsigned PCLabelId = AFI->createPICLabelUId();
  9411. unsigned PCAdj = (isThumb || isThumb2) ? 4 : 8;
  9412. ARMConstantPoolValue *CPV =
  9413. ARMConstantPoolMBB::Create(F.getContext(), DispatchBB, PCLabelId, PCAdj);
  9414. unsigned CPI = MCP->getConstantPoolIndex(CPV, Align(4));
  9415. const TargetRegisterClass *TRC = isThumb ? &ARM::tGPRRegClass
  9416. : &ARM::GPRRegClass;
  9417. // Grab constant pool and fixed stack memory operands.
  9418. MachineMemOperand *CPMMO =
  9419. MF->getMachineMemOperand(MachinePointerInfo::getConstantPool(*MF),
  9420. MachineMemOperand::MOLoad, 4, Align(4));
  9421. MachineMemOperand *FIMMOSt =
  9422. MF->getMachineMemOperand(MachinePointerInfo::getFixedStack(*MF, FI),
  9423. MachineMemOperand::MOStore, 4, Align(4));
  9424. // Load the address of the dispatch MBB into the jump buffer.
  9425. if (isThumb2) {
  9426. // Incoming value: jbuf
  9427. // ldr.n r5, LCPI1_1
  9428. // orr r5, r5, #1
  9429. // add r5, pc
  9430. // str r5, [$jbuf, #+4] ; &jbuf[1]
  9431. Register NewVReg1 = MRI->createVirtualRegister(TRC);
  9432. BuildMI(*MBB, MI, dl, TII->get(ARM::t2LDRpci), NewVReg1)
  9433. .addConstantPoolIndex(CPI)
  9434. .addMemOperand(CPMMO)
  9435. .add(predOps(ARMCC::AL));
  9436. // Set the low bit because of thumb mode.
  9437. Register NewVReg2 = MRI->createVirtualRegister(TRC);
  9438. BuildMI(*MBB, MI, dl, TII->get(ARM::t2ORRri), NewVReg2)
  9439. .addReg(NewVReg1, RegState::Kill)
  9440. .addImm(0x01)
  9441. .add(predOps(ARMCC::AL))
  9442. .add(condCodeOp());
  9443. Register NewVReg3 = MRI->createVirtualRegister(TRC);
  9444. BuildMI(*MBB, MI, dl, TII->get(ARM::tPICADD), NewVReg3)
  9445. .addReg(NewVReg2, RegState::Kill)
  9446. .addImm(PCLabelId);
  9447. BuildMI(*MBB, MI, dl, TII->get(ARM::t2STRi12))
  9448. .addReg(NewVReg3, RegState::Kill)
  9449. .addFrameIndex(FI)
  9450. .addImm(36) // &jbuf[1] :: pc
  9451. .addMemOperand(FIMMOSt)
  9452. .add(predOps(ARMCC::AL));
  9453. } else if (isThumb) {
  9454. // Incoming value: jbuf
  9455. // ldr.n r1, LCPI1_4
  9456. // add r1, pc
  9457. // mov r2, #1
  9458. // orrs r1, r2
  9459. // add r2, $jbuf, #+4 ; &jbuf[1]
  9460. // str r1, [r2]
  9461. Register NewVReg1 = MRI->createVirtualRegister(TRC);
  9462. BuildMI(*MBB, MI, dl, TII->get(ARM::tLDRpci), NewVReg1)
  9463. .addConstantPoolIndex(CPI)
  9464. .addMemOperand(CPMMO)
  9465. .add(predOps(ARMCC::AL));
  9466. Register NewVReg2 = MRI->createVirtualRegister(TRC);
  9467. BuildMI(*MBB, MI, dl, TII->get(ARM::tPICADD), NewVReg2)
  9468. .addReg(NewVReg1, RegState::Kill)
  9469. .addImm(PCLabelId);
  9470. // Set the low bit because of thumb mode.
  9471. Register NewVReg3 = MRI->createVirtualRegister(TRC);
  9472. BuildMI(*MBB, MI, dl, TII->get(ARM::tMOVi8), NewVReg3)
  9473. .addReg(ARM::CPSR, RegState::Define)
  9474. .addImm(1)
  9475. .add(predOps(ARMCC::AL));
  9476. Register NewVReg4 = MRI->createVirtualRegister(TRC);
  9477. BuildMI(*MBB, MI, dl, TII->get(ARM::tORR), NewVReg4)
  9478. .addReg(ARM::CPSR, RegState::Define)
  9479. .addReg(NewVReg2, RegState::Kill)
  9480. .addReg(NewVReg3, RegState::Kill)
  9481. .add(predOps(ARMCC::AL));
  9482. Register NewVReg5 = MRI->createVirtualRegister(TRC);
  9483. BuildMI(*MBB, MI, dl, TII->get(ARM::tADDframe), NewVReg5)
  9484. .addFrameIndex(FI)
  9485. .addImm(36); // &jbuf[1] :: pc
  9486. BuildMI(*MBB, MI, dl, TII->get(ARM::tSTRi))
  9487. .addReg(NewVReg4, RegState::Kill)
  9488. .addReg(NewVReg5, RegState::Kill)
  9489. .addImm(0)
  9490. .addMemOperand(FIMMOSt)
  9491. .add(predOps(ARMCC::AL));
  9492. } else {
  9493. // Incoming value: jbuf
  9494. // ldr r1, LCPI1_1
  9495. // add r1, pc, r1
  9496. // str r1, [$jbuf, #+4] ; &jbuf[1]
  9497. Register NewVReg1 = MRI->createVirtualRegister(TRC);
  9498. BuildMI(*MBB, MI, dl, TII->get(ARM::LDRi12), NewVReg1)
  9499. .addConstantPoolIndex(CPI)
  9500. .addImm(0)
  9501. .addMemOperand(CPMMO)
  9502. .add(predOps(ARMCC::AL));
  9503. Register NewVReg2 = MRI->createVirtualRegister(TRC);
  9504. BuildMI(*MBB, MI, dl, TII->get(ARM::PICADD), NewVReg2)
  9505. .addReg(NewVReg1, RegState::Kill)
  9506. .addImm(PCLabelId)
  9507. .add(predOps(ARMCC::AL));
  9508. BuildMI(*MBB, MI, dl, TII->get(ARM::STRi12))
  9509. .addReg(NewVReg2, RegState::Kill)
  9510. .addFrameIndex(FI)
  9511. .addImm(36) // &jbuf[1] :: pc
  9512. .addMemOperand(FIMMOSt)
  9513. .add(predOps(ARMCC::AL));
  9514. }
  9515. }
  9516. void ARMTargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI,
  9517. MachineBasicBlock *MBB) const {
  9518. const TargetInstrInfo *TII = Subtarget->getInstrInfo();
  9519. DebugLoc dl = MI.getDebugLoc();
  9520. MachineFunction *MF = MBB->getParent();
  9521. MachineRegisterInfo *MRI = &MF->getRegInfo();
  9522. MachineFrameInfo &MFI = MF->getFrameInfo();
  9523. int FI = MFI.getFunctionContextIndex();
  9524. const TargetRegisterClass *TRC = Subtarget->isThumb() ? &ARM::tGPRRegClass
  9525. : &ARM::GPRnopcRegClass;
  9526. // Get a mapping of the call site numbers to all of the landing pads they're
  9527. // associated with.
  9528. DenseMap<unsigned, SmallVector<MachineBasicBlock*, 2>> CallSiteNumToLPad;
  9529. unsigned MaxCSNum = 0;
  9530. for (MachineBasicBlock &BB : *MF) {
  9531. if (!BB.isEHPad())
  9532. continue;
  9533. // FIXME: We should assert that the EH_LABEL is the first MI in the landing
  9534. // pad.
  9535. for (MachineInstr &II : BB) {
  9536. if (!II.isEHLabel())
  9537. continue;
  9538. MCSymbol *Sym = II.getOperand(0).getMCSymbol();
  9539. if (!MF->hasCallSiteLandingPad(Sym)) continue;
  9540. SmallVectorImpl<unsigned> &CallSiteIdxs = MF->getCallSiteLandingPad(Sym);
  9541. for (unsigned Idx : CallSiteIdxs) {
  9542. CallSiteNumToLPad[Idx].push_back(&BB);
  9543. MaxCSNum = std::max(MaxCSNum, Idx);
  9544. }
  9545. break;
  9546. }
  9547. }
  9548. // Get an ordered list of the machine basic blocks for the jump table.
  9549. std::vector<MachineBasicBlock*> LPadList;
  9550. SmallPtrSet<MachineBasicBlock*, 32> InvokeBBs;
  9551. LPadList.reserve(CallSiteNumToLPad.size());
  9552. for (unsigned I = 1; I <= MaxCSNum; ++I) {
  9553. SmallVectorImpl<MachineBasicBlock*> &MBBList = CallSiteNumToLPad[I];
  9554. for (MachineBasicBlock *MBB : MBBList) {
  9555. LPadList.push_back(MBB);
  9556. InvokeBBs.insert(MBB->pred_begin(), MBB->pred_end());
  9557. }
  9558. }
  9559. assert(!LPadList.empty() &&
  9560. "No landing pad destinations for the dispatch jump table!");
  9561. // Create the jump table and associated information.
  9562. MachineJumpTableInfo *JTI =
  9563. MF->getOrCreateJumpTableInfo(MachineJumpTableInfo::EK_Inline);
  9564. unsigned MJTI = JTI->createJumpTableIndex(LPadList);
  9565. // Create the MBBs for the dispatch code.
  9566. // Shove the dispatch's address into the return slot in the function context.
  9567. MachineBasicBlock *DispatchBB = MF->CreateMachineBasicBlock();
  9568. DispatchBB->setIsEHPad();
  9569. MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock();
  9570. unsigned trap_opcode;
  9571. if (Subtarget->isThumb())
  9572. trap_opcode = ARM::tTRAP;
  9573. else
  9574. trap_opcode = Subtarget->useNaClTrap() ? ARM::TRAPNaCl : ARM::TRAP;
  9575. BuildMI(TrapBB, dl, TII->get(trap_opcode));
  9576. DispatchBB->addSuccessor(TrapBB);
  9577. MachineBasicBlock *DispContBB = MF->CreateMachineBasicBlock();
  9578. DispatchBB->addSuccessor(DispContBB);
  9579. // Insert the new MBBs.
  9580. MF->insert(MF->end(), DispatchBB);
  9581. MF->insert(MF->end(), DispContBB);
  9582. MF->insert(MF->end(), TrapBB);
  9583. // Insert code into the entry block that creates and registers the function
  9584. // context.
  9585. SetupEntryBlockForSjLj(MI, MBB, DispatchBB, FI);
  9586. MachineMemOperand *FIMMOLd = MF->getMachineMemOperand(
  9587. MachinePointerInfo::getFixedStack(*MF, FI),
  9588. MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile, 4, Align(4));
  9589. MachineInstrBuilder MIB;
  9590. MIB = BuildMI(DispatchBB, dl, TII->get(ARM::Int_eh_sjlj_dispatchsetup));
  9591. const ARMBaseInstrInfo *AII = static_cast<const ARMBaseInstrInfo*>(TII);
  9592. const ARMBaseRegisterInfo &RI = AII->getRegisterInfo();
  9593. // Add a register mask with no preserved registers. This results in all
  9594. // registers being marked as clobbered. This can't work if the dispatch block
  9595. // is in a Thumb1 function and is linked with ARM code which uses the FP
  9596. // registers, as there is no way to preserve the FP registers in Thumb1 mode.
  9597. MIB.addRegMask(RI.getSjLjDispatchPreservedMask(*MF));
  9598. bool IsPositionIndependent = isPositionIndependent();
  9599. unsigned NumLPads = LPadList.size();
  9600. if (Subtarget->isThumb2()) {
  9601. Register NewVReg1 = MRI->createVirtualRegister(TRC);
  9602. BuildMI(DispatchBB, dl, TII->get(ARM::t2LDRi12), NewVReg1)
  9603. .addFrameIndex(FI)
  9604. .addImm(4)
  9605. .addMemOperand(FIMMOLd)
  9606. .add(predOps(ARMCC::AL));
  9607. if (NumLPads < 256) {
  9608. BuildMI(DispatchBB, dl, TII->get(ARM::t2CMPri))
  9609. .addReg(NewVReg1)
  9610. .addImm(LPadList.size())
  9611. .add(predOps(ARMCC::AL));
  9612. } else {
  9613. Register VReg1 = MRI->createVirtualRegister(TRC);
  9614. BuildMI(DispatchBB, dl, TII->get(ARM::t2MOVi16), VReg1)
  9615. .addImm(NumLPads & 0xFFFF)
  9616. .add(predOps(ARMCC::AL));
  9617. unsigned VReg2 = VReg1;
  9618. if ((NumLPads & 0xFFFF0000) != 0) {
  9619. VReg2 = MRI->createVirtualRegister(TRC);
  9620. BuildMI(DispatchBB, dl, TII->get(ARM::t2MOVTi16), VReg2)
  9621. .addReg(VReg1)
  9622. .addImm(NumLPads >> 16)
  9623. .add(predOps(ARMCC::AL));
  9624. }
  9625. BuildMI(DispatchBB, dl, TII->get(ARM::t2CMPrr))
  9626. .addReg(NewVReg1)
  9627. .addReg(VReg2)
  9628. .add(predOps(ARMCC::AL));
  9629. }
  9630. BuildMI(DispatchBB, dl, TII->get(ARM::t2Bcc))
  9631. .addMBB(TrapBB)
  9632. .addImm(ARMCC::HI)
  9633. .addReg(ARM::CPSR);
  9634. Register NewVReg3 = MRI->createVirtualRegister(TRC);
  9635. BuildMI(DispContBB, dl, TII->get(ARM::t2LEApcrelJT), NewVReg3)
  9636. .addJumpTableIndex(MJTI)
  9637. .add(predOps(ARMCC::AL));
  9638. Register NewVReg4 = MRI->createVirtualRegister(TRC);
  9639. BuildMI(DispContBB, dl, TII->get(ARM::t2ADDrs), NewVReg4)
  9640. .addReg(NewVReg3, RegState::Kill)
  9641. .addReg(NewVReg1)
  9642. .addImm(ARM_AM::getSORegOpc(ARM_AM::lsl, 2))
  9643. .add(predOps(ARMCC::AL))
  9644. .add(condCodeOp());
  9645. BuildMI(DispContBB, dl, TII->get(ARM::t2BR_JT))
  9646. .addReg(NewVReg4, RegState::Kill)
  9647. .addReg(NewVReg1)
  9648. .addJumpTableIndex(MJTI);
  9649. } else if (Subtarget->isThumb()) {
  9650. Register NewVReg1 = MRI->createVirtualRegister(TRC);
  9651. BuildMI(DispatchBB, dl, TII->get(ARM::tLDRspi), NewVReg1)
  9652. .addFrameIndex(FI)
  9653. .addImm(1)
  9654. .addMemOperand(FIMMOLd)
  9655. .add(predOps(ARMCC::AL));
  9656. if (NumLPads < 256) {
  9657. BuildMI(DispatchBB, dl, TII->get(ARM::tCMPi8))
  9658. .addReg(NewVReg1)
  9659. .addImm(NumLPads)
  9660. .add(predOps(ARMCC::AL));
  9661. } else {
  9662. MachineConstantPool *ConstantPool = MF->getConstantPool();
  9663. Type *Int32Ty = Type::getInt32Ty(MF->getFunction().getContext());
  9664. const Constant *C = ConstantInt::get(Int32Ty, NumLPads);
  9665. // MachineConstantPool wants an explicit alignment.
  9666. Align Alignment = MF->getDataLayout().getPrefTypeAlign(Int32Ty);
  9667. unsigned Idx = ConstantPool->getConstantPoolIndex(C, Alignment);
  9668. Register VReg1 = MRI->createVirtualRegister(TRC);
  9669. BuildMI(DispatchBB, dl, TII->get(ARM::tLDRpci))
  9670. .addReg(VReg1, RegState::Define)
  9671. .addConstantPoolIndex(Idx)
  9672. .add(predOps(ARMCC::AL));
  9673. BuildMI(DispatchBB, dl, TII->get(ARM::tCMPr))
  9674. .addReg(NewVReg1)
  9675. .addReg(VReg1)
  9676. .add(predOps(ARMCC::AL));
  9677. }
  9678. BuildMI(DispatchBB, dl, TII->get(ARM::tBcc))
  9679. .addMBB(TrapBB)
  9680. .addImm(ARMCC::HI)
  9681. .addReg(ARM::CPSR);
  9682. Register NewVReg2 = MRI->createVirtualRegister(TRC);
  9683. BuildMI(DispContBB, dl, TII->get(ARM::tLSLri), NewVReg2)
  9684. .addReg(ARM::CPSR, RegState::Define)
  9685. .addReg(NewVReg1)
  9686. .addImm(2)
  9687. .add(predOps(ARMCC::AL));
  9688. Register NewVReg3 = MRI->createVirtualRegister(TRC);
  9689. BuildMI(DispContBB, dl, TII->get(ARM::tLEApcrelJT), NewVReg3)
  9690. .addJumpTableIndex(MJTI)
  9691. .add(predOps(ARMCC::AL));
  9692. Register NewVReg4 = MRI->createVirtualRegister(TRC);
  9693. BuildMI(DispContBB, dl, TII->get(ARM::tADDrr), NewVReg4)
  9694. .addReg(ARM::CPSR, RegState::Define)
  9695. .addReg(NewVReg2, RegState::Kill)
  9696. .addReg(NewVReg3)
  9697. .add(predOps(ARMCC::AL));
  9698. MachineMemOperand *JTMMOLd =
  9699. MF->getMachineMemOperand(MachinePointerInfo::getJumpTable(*MF),
  9700. MachineMemOperand::MOLoad, 4, Align(4));
  9701. Register NewVReg5 = MRI->createVirtualRegister(TRC);
  9702. BuildMI(DispContBB, dl, TII->get(ARM::tLDRi), NewVReg5)
  9703. .addReg(NewVReg4, RegState::Kill)
  9704. .addImm(0)
  9705. .addMemOperand(JTMMOLd)
  9706. .add(predOps(ARMCC::AL));
  9707. unsigned NewVReg6 = NewVReg5;
  9708. if (IsPositionIndependent) {
  9709. NewVReg6 = MRI->createVirtualRegister(TRC);
  9710. BuildMI(DispContBB, dl, TII->get(ARM::tADDrr), NewVReg6)
  9711. .addReg(ARM::CPSR, RegState::Define)
  9712. .addReg(NewVReg5, RegState::Kill)
  9713. .addReg(NewVReg3)
  9714. .add(predOps(ARMCC::AL));
  9715. }
  9716. BuildMI(DispContBB, dl, TII->get(ARM::tBR_JTr))
  9717. .addReg(NewVReg6, RegState::Kill)
  9718. .addJumpTableIndex(MJTI);
  9719. } else {
  9720. Register NewVReg1 = MRI->createVirtualRegister(TRC);
  9721. BuildMI(DispatchBB, dl, TII->get(ARM::LDRi12), NewVReg1)
  9722. .addFrameIndex(FI)
  9723. .addImm(4)
  9724. .addMemOperand(FIMMOLd)
  9725. .add(predOps(ARMCC::AL));
  9726. if (NumLPads < 256) {
  9727. BuildMI(DispatchBB, dl, TII->get(ARM::CMPri))
  9728. .addReg(NewVReg1)
  9729. .addImm(NumLPads)
  9730. .add(predOps(ARMCC::AL));
  9731. } else if (Subtarget->hasV6T2Ops() && isUInt<16>(NumLPads)) {
  9732. Register VReg1 = MRI->createVirtualRegister(TRC);
  9733. BuildMI(DispatchBB, dl, TII->get(ARM::MOVi16), VReg1)
  9734. .addImm(NumLPads & 0xFFFF)
  9735. .add(predOps(ARMCC::AL));
  9736. unsigned VReg2 = VReg1;
  9737. if ((NumLPads & 0xFFFF0000) != 0) {
  9738. VReg2 = MRI->createVirtualRegister(TRC);
  9739. BuildMI(DispatchBB, dl, TII->get(ARM::MOVTi16), VReg2)
  9740. .addReg(VReg1)
  9741. .addImm(NumLPads >> 16)
  9742. .add(predOps(ARMCC::AL));
  9743. }
  9744. BuildMI(DispatchBB, dl, TII->get(ARM::CMPrr))
  9745. .addReg(NewVReg1)
  9746. .addReg(VReg2)
  9747. .add(predOps(ARMCC::AL));
  9748. } else {
  9749. MachineConstantPool *ConstantPool = MF->getConstantPool();
  9750. Type *Int32Ty = Type::getInt32Ty(MF->getFunction().getContext());
  9751. const Constant *C = ConstantInt::get(Int32Ty, NumLPads);
  9752. // MachineConstantPool wants an explicit alignment.
  9753. Align Alignment = MF->getDataLayout().getPrefTypeAlign(Int32Ty);
  9754. unsigned Idx = ConstantPool->getConstantPoolIndex(C, Alignment);
  9755. Register VReg1 = MRI->createVirtualRegister(TRC);
  9756. BuildMI(DispatchBB, dl, TII->get(ARM::LDRcp))
  9757. .addReg(VReg1, RegState::Define)
  9758. .addConstantPoolIndex(Idx)
  9759. .addImm(0)
  9760. .add(predOps(ARMCC::AL));
  9761. BuildMI(DispatchBB, dl, TII->get(ARM::CMPrr))
  9762. .addReg(NewVReg1)
  9763. .addReg(VReg1, RegState::Kill)
  9764. .add(predOps(ARMCC::AL));
  9765. }
  9766. BuildMI(DispatchBB, dl, TII->get(ARM::Bcc))
  9767. .addMBB(TrapBB)
  9768. .addImm(ARMCC::HI)
  9769. .addReg(ARM::CPSR);
  9770. Register NewVReg3 = MRI->createVirtualRegister(TRC);
  9771. BuildMI(DispContBB, dl, TII->get(ARM::MOVsi), NewVReg3)
  9772. .addReg(NewVReg1)
  9773. .addImm(ARM_AM::getSORegOpc(ARM_AM::lsl, 2))
  9774. .add(predOps(ARMCC::AL))
  9775. .add(condCodeOp());
  9776. Register NewVReg4 = MRI->createVirtualRegister(TRC);
  9777. BuildMI(DispContBB, dl, TII->get(ARM::LEApcrelJT), NewVReg4)
  9778. .addJumpTableIndex(MJTI)
  9779. .add(predOps(ARMCC::AL));
  9780. MachineMemOperand *JTMMOLd =
  9781. MF->getMachineMemOperand(MachinePointerInfo::getJumpTable(*MF),
  9782. MachineMemOperand::MOLoad, 4, Align(4));
  9783. Register NewVReg5 = MRI->createVirtualRegister(TRC);
  9784. BuildMI(DispContBB, dl, TII->get(ARM::LDRrs), NewVReg5)
  9785. .addReg(NewVReg3, RegState::Kill)
  9786. .addReg(NewVReg4)
  9787. .addImm(0)
  9788. .addMemOperand(JTMMOLd)
  9789. .add(predOps(ARMCC::AL));
  9790. if (IsPositionIndependent) {
  9791. BuildMI(DispContBB, dl, TII->get(ARM::BR_JTadd))
  9792. .addReg(NewVReg5, RegState::Kill)
  9793. .addReg(NewVReg4)
  9794. .addJumpTableIndex(MJTI);
  9795. } else {
  9796. BuildMI(DispContBB, dl, TII->get(ARM::BR_JTr))
  9797. .addReg(NewVReg5, RegState::Kill)
  9798. .addJumpTableIndex(MJTI);
  9799. }
  9800. }
  9801. // Add the jump table entries as successors to the MBB.
  9802. SmallPtrSet<MachineBasicBlock*, 8> SeenMBBs;
  9803. for (MachineBasicBlock *CurMBB : LPadList) {
  9804. if (SeenMBBs.insert(CurMBB).second)
  9805. DispContBB->addSuccessor(CurMBB);
  9806. }
  9807. // N.B. the order the invoke BBs are processed in doesn't matter here.
  9808. const MCPhysReg *SavedRegs = RI.getCalleeSavedRegs(MF);
  9809. SmallVector<MachineBasicBlock*, 64> MBBLPads;
  9810. for (MachineBasicBlock *BB : InvokeBBs) {
  9811. // Remove the landing pad successor from the invoke block and replace it
  9812. // with the new dispatch block.
  9813. SmallVector<MachineBasicBlock*, 4> Successors(BB->successors());
  9814. while (!Successors.empty()) {
  9815. MachineBasicBlock *SMBB = Successors.pop_back_val();
  9816. if (SMBB->isEHPad()) {
  9817. BB->removeSuccessor(SMBB);
  9818. MBBLPads.push_back(SMBB);
  9819. }
  9820. }
  9821. BB->addSuccessor(DispatchBB, BranchProbability::getZero());
  9822. BB->normalizeSuccProbs();
  9823. // Find the invoke call and mark all of the callee-saved registers as
  9824. // 'implicit defined' so that they're spilled. This prevents later passes
  9825. // from moving instructions to before the EH block, where they would never
  9826. // be executed.
  9827. for (MachineBasicBlock::reverse_iterator
  9828. II = BB->rbegin(), IE = BB->rend(); II != IE; ++II) {
  9829. if (!II->isCall()) continue;
  9830. DenseMap<unsigned, bool> DefRegs;
  9831. for (MachineInstr::mop_iterator
  9832. OI = II->operands_begin(), OE = II->operands_end();
  9833. OI != OE; ++OI) {
  9834. if (!OI->isReg()) continue;
  9835. DefRegs[OI->getReg()] = true;
  9836. }
  9837. MachineInstrBuilder MIB(*MF, &*II);
  9838. for (unsigned i = 0; SavedRegs[i] != 0; ++i) {
  9839. unsigned Reg = SavedRegs[i];
  9840. if (Subtarget->isThumb2() &&
  9841. !ARM::tGPRRegClass.contains(Reg) &&
  9842. !ARM::hGPRRegClass.contains(Reg))
  9843. continue;
  9844. if (Subtarget->isThumb1Only() && !ARM::tGPRRegClass.contains(Reg))
  9845. continue;
  9846. if (!Subtarget->isThumb() && !ARM::GPRRegClass.contains(Reg))
  9847. continue;
  9848. if (!DefRegs[Reg])
  9849. MIB.addReg(Reg, RegState::ImplicitDefine | RegState::Dead);
  9850. }
  9851. break;
  9852. }
  9853. }
  9854. // Mark all former landing pads as non-landing pads. The dispatch is the only
  9855. // landing pad now.
  9856. for (MachineBasicBlock *MBBLPad : MBBLPads)
  9857. MBBLPad->setIsEHPad(false);
  9858. // The instruction is gone now.
  9859. MI.eraseFromParent();
  9860. }
  9861. static
  9862. MachineBasicBlock *OtherSucc(MachineBasicBlock *MBB, MachineBasicBlock *Succ) {
  9863. for (MachineBasicBlock *S : MBB->successors())
  9864. if (S != Succ)
  9865. return S;
  9866. llvm_unreachable("Expecting a BB with two successors!");
  9867. }
  9868. /// Return the load opcode for a given load size. If the load size is >= 8,
  9869. /// a NEON opcode will be returned.
  9870. static unsigned getLdOpcode(unsigned LdSize, bool IsThumb1, bool IsThumb2) {
  9871. if (LdSize >= 8)
  9872. return LdSize == 16 ? ARM::VLD1q32wb_fixed
  9873. : LdSize == 8 ? ARM::VLD1d32wb_fixed : 0;
  9874. if (IsThumb1)
  9875. return LdSize == 4 ? ARM::tLDRi
  9876. : LdSize == 2 ? ARM::tLDRHi
  9877. : LdSize == 1 ? ARM::tLDRBi : 0;
  9878. if (IsThumb2)
  9879. return LdSize == 4 ? ARM::t2LDR_POST
  9880. : LdSize == 2 ? ARM::t2LDRH_POST
  9881. : LdSize == 1 ? ARM::t2LDRB_POST : 0;
  9882. return LdSize == 4 ? ARM::LDR_POST_IMM
  9883. : LdSize == 2 ? ARM::LDRH_POST
  9884. : LdSize == 1 ? ARM::LDRB_POST_IMM : 0;
  9885. }
/// Return the store opcode for a given store size. If the store size is >= 8,
/// a NEON opcode is returned.
  9888. static unsigned getStOpcode(unsigned StSize, bool IsThumb1, bool IsThumb2) {
  9889. if (StSize >= 8)
  9890. return StSize == 16 ? ARM::VST1q32wb_fixed
  9891. : StSize == 8 ? ARM::VST1d32wb_fixed : 0;
  9892. if (IsThumb1)
  9893. return StSize == 4 ? ARM::tSTRi
  9894. : StSize == 2 ? ARM::tSTRHi
  9895. : StSize == 1 ? ARM::tSTRBi : 0;
  9896. if (IsThumb2)
  9897. return StSize == 4 ? ARM::t2STR_POST
  9898. : StSize == 2 ? ARM::t2STRH_POST
  9899. : StSize == 1 ? ARM::t2STRB_POST : 0;
  9900. return StSize == 4 ? ARM::STR_POST_IMM
  9901. : StSize == 2 ? ARM::STRH_POST
  9902. : StSize == 1 ? ARM::STRB_POST_IMM : 0;
  9903. }
  9904. /// Emit a post-increment load operation with given size. The instructions
  9905. /// will be added to BB at Pos.
  9906. static void emitPostLd(MachineBasicBlock *BB, MachineBasicBlock::iterator Pos,
  9907. const TargetInstrInfo *TII, const DebugLoc &dl,
  9908. unsigned LdSize, unsigned Data, unsigned AddrIn,
  9909. unsigned AddrOut, bool IsThumb1, bool IsThumb2) {
  9910. unsigned LdOpc = getLdOpcode(LdSize, IsThumb1, IsThumb2);
  9911. assert(LdOpc != 0 && "Should have a load opcode");
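// For 8- and 16-byte units a NEON VLD1 with register writeback is used: the
// loaded data and the post-incremented address (AddrOut) are both defined by
// the same instruction.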
  9912. if (LdSize >= 8) {
  9913. BuildMI(*BB, Pos, dl, TII->get(LdOpc), Data)
  9914. .addReg(AddrOut, RegState::Define)
  9915. .addReg(AddrIn)
  9916. .addImm(0)
  9917. .add(predOps(ARMCC::AL));
  9918. } else if (IsThumb1) {
  9919. // load + update AddrIn
  9920. BuildMI(*BB, Pos, dl, TII->get(LdOpc), Data)
  9921. .addReg(AddrIn)
  9922. .addImm(0)
  9923. .add(predOps(ARMCC::AL));
  9924. BuildMI(*BB, Pos, dl, TII->get(ARM::tADDi8), AddrOut)
  9925. .add(t1CondCodeOp())
  9926. .addReg(AddrIn)
  9927. .addImm(LdSize)
  9928. .add(predOps(ARMCC::AL));
  9929. } else if (IsThumb2) {
  9930. BuildMI(*BB, Pos, dl, TII->get(LdOpc), Data)
  9931. .addReg(AddrOut, RegState::Define)
  9932. .addReg(AddrIn)
  9933. .addImm(LdSize)
  9934. .add(predOps(ARMCC::AL));
  9935. } else { // arm
  9936. BuildMI(*BB, Pos, dl, TII->get(LdOpc), Data)
  9937. .addReg(AddrOut, RegState::Define)
  9938. .addReg(AddrIn)
  9939. .addReg(0)
  9940. .addImm(LdSize)
  9941. .add(predOps(ARMCC::AL));
  9942. }
  9943. }
  9944. /// Emit a post-increment store operation with given size. The instructions
  9945. /// will be added to BB at Pos.
  9946. static void emitPostSt(MachineBasicBlock *BB, MachineBasicBlock::iterator Pos,
  9947. const TargetInstrInfo *TII, const DebugLoc &dl,
  9948. unsigned StSize, unsigned Data, unsigned AddrIn,
  9949. unsigned AddrOut, bool IsThumb1, bool IsThumb2) {
  9950. unsigned StOpc = getStOpcode(StSize, IsThumb1, IsThumb2);
  9951. assert(StOpc != 0 && "Should have a store opcode");
  9952. if (StSize >= 8) {
  9953. BuildMI(*BB, Pos, dl, TII->get(StOpc), AddrOut)
  9954. .addReg(AddrIn)
  9955. .addImm(0)
  9956. .addReg(Data)
  9957. .add(predOps(ARMCC::AL));
  9958. } else if (IsThumb1) {
  9959. // store + update AddrIn
  9960. BuildMI(*BB, Pos, dl, TII->get(StOpc))
  9961. .addReg(Data)
  9962. .addReg(AddrIn)
  9963. .addImm(0)
  9964. .add(predOps(ARMCC::AL));
  9965. BuildMI(*BB, Pos, dl, TII->get(ARM::tADDi8), AddrOut)
  9966. .add(t1CondCodeOp())
  9967. .addReg(AddrIn)
  9968. .addImm(StSize)
  9969. .add(predOps(ARMCC::AL));
  9970. } else if (IsThumb2) {
  9971. BuildMI(*BB, Pos, dl, TII->get(StOpc), AddrOut)
  9972. .addReg(Data)
  9973. .addReg(AddrIn)
  9974. .addImm(StSize)
  9975. .add(predOps(ARMCC::AL));
  9976. } else { // arm
  9977. BuildMI(*BB, Pos, dl, TII->get(StOpc), AddrOut)
  9978. .addReg(Data)
  9979. .addReg(AddrIn)
  9980. .addReg(0)
  9981. .addImm(StSize)
  9982. .add(predOps(ARMCC::AL));
  9983. }
  9984. }
  9985. MachineBasicBlock *
  9986. ARMTargetLowering::EmitStructByval(MachineInstr &MI,
  9987. MachineBasicBlock *BB) const {
// This pseudo instruction has 4 operands: dst, src, size, alignment
  9989. // We expand it to a loop if size > Subtarget->getMaxInlineSizeThreshold().
  9990. // Otherwise, we will generate unrolled scalar copies.
  9991. const TargetInstrInfo *TII = Subtarget->getInstrInfo();
  9992. const BasicBlock *LLVM_BB = BB->getBasicBlock();
  9993. MachineFunction::iterator It = ++BB->getIterator();
  9994. Register dest = MI.getOperand(0).getReg();
  9995. Register src = MI.getOperand(1).getReg();
  9996. unsigned SizeVal = MI.getOperand(2).getImm();
  9997. unsigned Alignment = MI.getOperand(3).getImm();
  9998. DebugLoc dl = MI.getDebugLoc();
  9999. MachineFunction *MF = BB->getParent();
  10000. MachineRegisterInfo &MRI = MF->getRegInfo();
  10001. unsigned UnitSize = 0;
  10002. const TargetRegisterClass *TRC = nullptr;
  10003. const TargetRegisterClass *VecTRC = nullptr;
  10004. bool IsThumb1 = Subtarget->isThumb1Only();
  10005. bool IsThumb2 = Subtarget->isThumb2();
  10006. bool IsThumb = Subtarget->isThumb();
  10007. if (Alignment & 1) {
  10008. UnitSize = 1;
  10009. } else if (Alignment & 2) {
  10010. UnitSize = 2;
  10011. } else {
  10012. // Check whether we can use NEON instructions.
  10013. if (!MF->getFunction().hasFnAttribute(Attribute::NoImplicitFloat) &&
  10014. Subtarget->hasNEON()) {
  10015. if ((Alignment % 16 == 0) && SizeVal >= 16)
  10016. UnitSize = 16;
  10017. else if ((Alignment % 8 == 0) && SizeVal >= 8)
  10018. UnitSize = 8;
  10019. }
  10020. // Can't use NEON instructions.
  10021. if (UnitSize == 0)
  10022. UnitSize = 4;
  10023. }
  10024. // Select the correct opcode and register class for unit size load/store
  10025. bool IsNeon = UnitSize >= 8;
  10026. TRC = IsThumb ? &ARM::tGPRRegClass : &ARM::GPRRegClass;
  10027. if (IsNeon)
  10028. VecTRC = UnitSize == 16 ? &ARM::DPairRegClass
  10029. : UnitSize == 8 ? &ARM::DPRRegClass
  10030. : nullptr;
  10031. unsigned BytesLeft = SizeVal % UnitSize;
  10032. unsigned LoopSize = SizeVal - BytesLeft;
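// Copies at or below the inline threshold are fully unrolled here; larger
// copies fall through to the loop expansion below.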
  10033. if (SizeVal <= Subtarget->getMaxInlineSizeThreshold()) {
  10034. // Use LDR and STR to copy.
  10035. // [scratch, srcOut] = LDR_POST(srcIn, UnitSize)
  10036. // [destOut] = STR_POST(scratch, destIn, UnitSize)
  10037. unsigned srcIn = src;
  10038. unsigned destIn = dest;
  10039. for (unsigned i = 0; i < LoopSize; i+=UnitSize) {
  10040. Register srcOut = MRI.createVirtualRegister(TRC);
  10041. Register destOut = MRI.createVirtualRegister(TRC);
  10042. Register scratch = MRI.createVirtualRegister(IsNeon ? VecTRC : TRC);
  10043. emitPostLd(BB, MI, TII, dl, UnitSize, scratch, srcIn, srcOut,
  10044. IsThumb1, IsThumb2);
  10045. emitPostSt(BB, MI, TII, dl, UnitSize, scratch, destIn, destOut,
  10046. IsThumb1, IsThumb2);
  10047. srcIn = srcOut;
  10048. destIn = destOut;
  10049. }
  10050. // Handle the leftover bytes with LDRB and STRB.
  10051. // [scratch, srcOut] = LDRB_POST(srcIn, 1)
  10052. // [destOut] = STRB_POST(scratch, destIn, 1)
  10053. for (unsigned i = 0; i < BytesLeft; i++) {
  10054. Register srcOut = MRI.createVirtualRegister(TRC);
  10055. Register destOut = MRI.createVirtualRegister(TRC);
  10056. Register scratch = MRI.createVirtualRegister(TRC);
  10057. emitPostLd(BB, MI, TII, dl, 1, scratch, srcIn, srcOut,
  10058. IsThumb1, IsThumb2);
  10059. emitPostSt(BB, MI, TII, dl, 1, scratch, destIn, destOut,
  10060. IsThumb1, IsThumb2);
  10061. srcIn = srcOut;
  10062. destIn = destOut;
  10063. }
  10064. MI.eraseFromParent(); // The instruction is gone now.
  10065. return BB;
  10066. }
  10067. // Expand the pseudo op to a loop.
  10068. // thisMBB:
  10069. // ...
// movw varEnd, # --> when movt is available
// movt varEnd, #
// ldrcp varEnd, idx --> otherwise (load from the constant pool)
  10073. // fallthrough --> loopMBB
  10074. // loopMBB:
  10075. // PHI varPhi, varEnd, varLoop
  10076. // PHI srcPhi, src, srcLoop
  10077. // PHI destPhi, dst, destLoop
  10078. // [scratch, srcLoop] = LDR_POST(srcPhi, UnitSize)
  10079. // [destLoop] = STR_POST(scratch, destPhi, UnitSize)
  10080. // subs varLoop, varPhi, #UnitSize
  10081. // bne loopMBB
  10082. // fallthrough --> exitMBB
  10083. // exitMBB:
  10084. // epilogue to handle left-over bytes
  10085. // [scratch, srcOut] = LDRB_POST(srcLoop, 1)
  10086. // [destOut] = STRB_POST(scratch, destLoop, 1)
  10087. MachineBasicBlock *loopMBB = MF->CreateMachineBasicBlock(LLVM_BB);
  10088. MachineBasicBlock *exitMBB = MF->CreateMachineBasicBlock(LLVM_BB);
  10089. MF->insert(It, loopMBB);
  10090. MF->insert(It, exitMBB);
  10091. // Transfer the remainder of BB and its successor edges to exitMBB.
  10092. exitMBB->splice(exitMBB->begin(), BB,
  10093. std::next(MachineBasicBlock::iterator(MI)), BB->end());
  10094. exitMBB->transferSuccessorsAndUpdatePHIs(BB);
  10095. // Load an immediate to varEnd.
  10096. Register varEnd = MRI.createVirtualRegister(TRC);
  10097. if (Subtarget->useMovt()) {
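// If the high half of LoopSize is non-zero, MOVW writes a temporary and the
// following MOVT defines varEnd; otherwise MOVW defines varEnd directly.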
  10098. unsigned Vtmp = varEnd;
  10099. if ((LoopSize & 0xFFFF0000) != 0)
  10100. Vtmp = MRI.createVirtualRegister(TRC);
  10101. BuildMI(BB, dl, TII->get(IsThumb ? ARM::t2MOVi16 : ARM::MOVi16), Vtmp)
  10102. .addImm(LoopSize & 0xFFFF)
  10103. .add(predOps(ARMCC::AL));
  10104. if ((LoopSize & 0xFFFF0000) != 0)
  10105. BuildMI(BB, dl, TII->get(IsThumb ? ARM::t2MOVTi16 : ARM::MOVTi16), varEnd)
  10106. .addReg(Vtmp)
  10107. .addImm(LoopSize >> 16)
  10108. .add(predOps(ARMCC::AL));
  10109. } else {
  10110. MachineConstantPool *ConstantPool = MF->getConstantPool();
  10111. Type *Int32Ty = Type::getInt32Ty(MF->getFunction().getContext());
  10112. const Constant *C = ConstantInt::get(Int32Ty, LoopSize);
  10113. // MachineConstantPool wants an explicit alignment.
  10114. Align Alignment = MF->getDataLayout().getPrefTypeAlign(Int32Ty);
  10115. unsigned Idx = ConstantPool->getConstantPoolIndex(C, Alignment);
  10116. MachineMemOperand *CPMMO =
  10117. MF->getMachineMemOperand(MachinePointerInfo::getConstantPool(*MF),
  10118. MachineMemOperand::MOLoad, 4, Align(4));
  10119. if (IsThumb)
  10120. BuildMI(*BB, MI, dl, TII->get(ARM::tLDRpci))
  10121. .addReg(varEnd, RegState::Define)
  10122. .addConstantPoolIndex(Idx)
  10123. .add(predOps(ARMCC::AL))
  10124. .addMemOperand(CPMMO);
  10125. else
  10126. BuildMI(*BB, MI, dl, TII->get(ARM::LDRcp))
  10127. .addReg(varEnd, RegState::Define)
  10128. .addConstantPoolIndex(Idx)
  10129. .addImm(0)
  10130. .add(predOps(ARMCC::AL))
  10131. .addMemOperand(CPMMO);
  10132. }
  10133. BB->addSuccessor(loopMBB);
  10134. // Generate the loop body:
  10135. // varPhi = PHI(varLoop, varEnd)
  10136. // srcPhi = PHI(srcLoop, src)
  10137. // destPhi = PHI(destLoop, dst)
  10138. MachineBasicBlock *entryBB = BB;
  10139. BB = loopMBB;
  10140. Register varLoop = MRI.createVirtualRegister(TRC);
  10141. Register varPhi = MRI.createVirtualRegister(TRC);
  10142. Register srcLoop = MRI.createVirtualRegister(TRC);
  10143. Register srcPhi = MRI.createVirtualRegister(TRC);
  10144. Register destLoop = MRI.createVirtualRegister(TRC);
  10145. Register destPhi = MRI.createVirtualRegister(TRC);
  10146. BuildMI(*BB, BB->begin(), dl, TII->get(ARM::PHI), varPhi)
  10147. .addReg(varLoop).addMBB(loopMBB)
  10148. .addReg(varEnd).addMBB(entryBB);
  10149. BuildMI(BB, dl, TII->get(ARM::PHI), srcPhi)
  10150. .addReg(srcLoop).addMBB(loopMBB)
  10151. .addReg(src).addMBB(entryBB);
  10152. BuildMI(BB, dl, TII->get(ARM::PHI), destPhi)
  10153. .addReg(destLoop).addMBB(loopMBB)
  10154. .addReg(dest).addMBB(entryBB);
  10155. // [scratch, srcLoop] = LDR_POST(srcPhi, UnitSize)
// [destLoop] = STR_POST(scratch, destPhi, UnitSize)
  10157. Register scratch = MRI.createVirtualRegister(IsNeon ? VecTRC : TRC);
  10158. emitPostLd(BB, BB->end(), TII, dl, UnitSize, scratch, srcPhi, srcLoop,
  10159. IsThumb1, IsThumb2);
  10160. emitPostSt(BB, BB->end(), TII, dl, UnitSize, scratch, destPhi, destLoop,
  10161. IsThumb1, IsThumb2);
  10162. // Decrement loop variable by UnitSize.
  10163. if (IsThumb1) {
  10164. BuildMI(*BB, BB->end(), dl, TII->get(ARM::tSUBi8), varLoop)
  10165. .add(t1CondCodeOp())
  10166. .addReg(varPhi)
  10167. .addImm(UnitSize)
  10168. .add(predOps(ARMCC::AL));
  10169. } else {
  10170. MachineInstrBuilder MIB =
  10171. BuildMI(*BB, BB->end(), dl,
  10172. TII->get(IsThumb2 ? ARM::t2SUBri : ARM::SUBri), varLoop);
  10173. MIB.addReg(varPhi)
  10174. .addImm(UnitSize)
  10175. .add(predOps(ARMCC::AL))
  10176. .add(condCodeOp());
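// Turn the optional cc_out operand (operand 5) into a CPSR def so this SUB
// sets the flags tested by the conditional branch below.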
  10177. MIB->getOperand(5).setReg(ARM::CPSR);
  10178. MIB->getOperand(5).setIsDef(true);
  10179. }
  10180. BuildMI(*BB, BB->end(), dl,
  10181. TII->get(IsThumb1 ? ARM::tBcc : IsThumb2 ? ARM::t2Bcc : ARM::Bcc))
  10182. .addMBB(loopMBB).addImm(ARMCC::NE).addReg(ARM::CPSR);
  10183. // loopMBB can loop back to loopMBB or fall through to exitMBB.
  10184. BB->addSuccessor(loopMBB);
  10185. BB->addSuccessor(exitMBB);
  10186. // Add epilogue to handle BytesLeft.
  10187. BB = exitMBB;
  10188. auto StartOfExit = exitMBB->begin();
  10189. // [scratch, srcOut] = LDRB_POST(srcLoop, 1)
  10190. // [destOut] = STRB_POST(scratch, destLoop, 1)
  10191. unsigned srcIn = srcLoop;
  10192. unsigned destIn = destLoop;
  10193. for (unsigned i = 0; i < BytesLeft; i++) {
  10194. Register srcOut = MRI.createVirtualRegister(TRC);
  10195. Register destOut = MRI.createVirtualRegister(TRC);
  10196. Register scratch = MRI.createVirtualRegister(TRC);
  10197. emitPostLd(BB, StartOfExit, TII, dl, 1, scratch, srcIn, srcOut,
  10198. IsThumb1, IsThumb2);
  10199. emitPostSt(BB, StartOfExit, TII, dl, 1, scratch, destIn, destOut,
  10200. IsThumb1, IsThumb2);
  10201. srcIn = srcOut;
  10202. destIn = destOut;
  10203. }
  10204. MI.eraseFromParent(); // The instruction is gone now.
  10205. return BB;
  10206. }
  10207. MachineBasicBlock *
  10208. ARMTargetLowering::EmitLowered__chkstk(MachineInstr &MI,
  10209. MachineBasicBlock *MBB) const {
  10210. const TargetMachine &TM = getTargetMachine();
  10211. const TargetInstrInfo &TII = *Subtarget->getInstrInfo();
  10212. DebugLoc DL = MI.getDebugLoc();
  10213. assert(Subtarget->isTargetWindows() &&
  10214. "__chkstk is only supported on Windows");
  10215. assert(Subtarget->isThumb2() && "Windows on ARM requires Thumb-2 mode");
// __chkstk takes the number of words to allocate on the stack in R4, and
// returns the stack adjustment in bytes in R4. This will not
// clobber any other registers (other than the obvious lr).
  10219. //
  10220. // Although, technically, IP should be considered a register which may be
  10221. // clobbered, the call itself will not touch it. Windows on ARM is a pure
  10222. // thumb-2 environment, so there is no interworking required. As a result, we
  10223. // do not expect a veneer to be emitted by the linker, clobbering IP.
  10224. //
  10225. // Each module receives its own copy of __chkstk, so no import thunk is
  10226. // required, again, ensuring that IP is not clobbered.
  10227. //
  10228. // Finally, although some linkers may theoretically provide a trampoline for
  10229. // out of range calls (which is quite common due to a 32M range limitation of
  10230. // branches for Thumb), we can generate the long-call version via
  10231. // -mcmodel=large, alleviating the need for the trampoline which may clobber
  10232. // IP.
  10233. switch (TM.getCodeModel()) {
  10234. case CodeModel::Tiny:
  10235. llvm_unreachable("Tiny code model not available on ARM.");
  10236. case CodeModel::Small:
  10237. case CodeModel::Medium:
  10238. case CodeModel::Kernel:
  10239. BuildMI(*MBB, MI, DL, TII.get(ARM::tBL))
  10240. .add(predOps(ARMCC::AL))
  10241. .addExternalSymbol("__chkstk")
  10242. .addReg(ARM::R4, RegState::Implicit | RegState::Kill)
  10243. .addReg(ARM::R4, RegState::Implicit | RegState::Define)
  10244. .addReg(ARM::R12,
  10245. RegState::Implicit | RegState::Define | RegState::Dead)
  10246. .addReg(ARM::CPSR,
  10247. RegState::Implicit | RegState::Define | RegState::Dead);
  10248. break;
  10249. case CodeModel::Large: {
  10250. MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
  10251. Register Reg = MRI.createVirtualRegister(&ARM::rGPRRegClass);
  10252. BuildMI(*MBB, MI, DL, TII.get(ARM::t2MOVi32imm), Reg)
  10253. .addExternalSymbol("__chkstk");
  10254. BuildMI(*MBB, MI, DL, TII.get(gettBLXrOpcode(*MBB->getParent())))
  10255. .add(predOps(ARMCC::AL))
  10256. .addReg(Reg, RegState::Kill)
  10257. .addReg(ARM::R4, RegState::Implicit | RegState::Kill)
  10258. .addReg(ARM::R4, RegState::Implicit | RegState::Define)
  10259. .addReg(ARM::R12,
  10260. RegState::Implicit | RegState::Define | RegState::Dead)
  10261. .addReg(ARM::CPSR,
  10262. RegState::Implicit | RegState::Define | RegState::Dead);
  10263. break;
  10264. }
  10265. }
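// R4 now holds the stack adjustment in bytes (see the comment above);
// subtract it from SP to allocate the space.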
  10266. BuildMI(*MBB, MI, DL, TII.get(ARM::t2SUBrr), ARM::SP)
  10267. .addReg(ARM::SP, RegState::Kill)
  10268. .addReg(ARM::R4, RegState::Kill)
  10269. .setMIFlags(MachineInstr::FrameSetup)
  10270. .add(predOps(ARMCC::AL))
  10271. .add(condCodeOp());
  10272. MI.eraseFromParent();
  10273. return MBB;
  10274. }
  10275. MachineBasicBlock *
  10276. ARMTargetLowering::EmitLowered__dbzchk(MachineInstr &MI,
  10277. MachineBasicBlock *MBB) const {
  10278. DebugLoc DL = MI.getDebugLoc();
  10279. MachineFunction *MF = MBB->getParent();
  10280. const TargetInstrInfo *TII = Subtarget->getInstrInfo();
  10281. MachineBasicBlock *ContBB = MF->CreateMachineBasicBlock();
  10282. MF->insert(++MBB->getIterator(), ContBB);
  10283. ContBB->splice(ContBB->begin(), MBB,
  10284. std::next(MachineBasicBlock::iterator(MI)), MBB->end());
  10285. ContBB->transferSuccessorsAndUpdatePHIs(MBB);
  10286. MBB->addSuccessor(ContBB);
  10287. MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock();
  10288. BuildMI(TrapBB, DL, TII->get(ARM::t__brkdiv0));
  10289. MF->push_back(TrapBB);
  10290. MBB->addSuccessor(TrapBB);
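// Compare the operand against zero and branch to the trap block on equality;
// otherwise execution falls through to ContBB.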
  10291. BuildMI(*MBB, MI, DL, TII->get(ARM::tCMPi8))
  10292. .addReg(MI.getOperand(0).getReg())
  10293. .addImm(0)
  10294. .add(predOps(ARMCC::AL));
  10295. BuildMI(*MBB, MI, DL, TII->get(ARM::t2Bcc))
  10296. .addMBB(TrapBB)
  10297. .addImm(ARMCC::EQ)
  10298. .addReg(ARM::CPSR);
  10299. MI.eraseFromParent();
  10300. return ContBB;
  10301. }
  10302. // The CPSR operand of SelectItr might be missing a kill marker
  10303. // because there were multiple uses of CPSR, and ISel didn't know
  10304. // which to mark. Figure out whether SelectItr should have had a
  10305. // kill marker, and set it if it should. Returns the correct kill
  10306. // marker value.
  10307. static bool checkAndUpdateCPSRKill(MachineBasicBlock::iterator SelectItr,
  10308. MachineBasicBlock* BB,
  10309. const TargetRegisterInfo* TRI) {
  10310. // Scan forward through BB for a use/def of CPSR.
  10311. MachineBasicBlock::iterator miI(std::next(SelectItr));
  10312. for (MachineBasicBlock::iterator miE = BB->end(); miI != miE; ++miI) {
  10313. const MachineInstr& mi = *miI;
  10314. if (mi.readsRegister(ARM::CPSR))
  10315. return false;
  10316. if (mi.definesRegister(ARM::CPSR))
  10317. break; // Should have kill-flag - update below.
  10318. }
  10319. // If we hit the end of the block, check whether CPSR is live into a
  10320. // successor.
  10321. if (miI == BB->end()) {
  10322. for (MachineBasicBlock *Succ : BB->successors())
  10323. if (Succ->isLiveIn(ARM::CPSR))
  10324. return false;
  10325. }
  10326. // We found a def, or hit the end of the basic block and CPSR wasn't live
  10327. // out. SelectMI should have a kill flag on CPSR.
  10328. SelectItr->addRegisterKilled(ARM::CPSR, TRI);
  10329. return true;
  10330. }
/// Adds logic in the loop entry MBB to calculate the loop iteration count and
/// adds t2WhileLoopSetup and t2WhileLoopStart to generate a WLS loop
  10333. static Register genTPEntry(MachineBasicBlock *TpEntry,
  10334. MachineBasicBlock *TpLoopBody,
  10335. MachineBasicBlock *TpExit, Register OpSizeReg,
  10336. const TargetInstrInfo *TII, DebugLoc Dl,
  10337. MachineRegisterInfo &MRI) {
  10338. // Calculates loop iteration count = ceil(n/16) = (n + 15) >> 4.
  10339. Register AddDestReg = MRI.createVirtualRegister(&ARM::rGPRRegClass);
  10340. BuildMI(TpEntry, Dl, TII->get(ARM::t2ADDri), AddDestReg)
  10341. .addUse(OpSizeReg)
  10342. .addImm(15)
  10343. .add(predOps(ARMCC::AL))
  10344. .addReg(0);
  10345. Register LsrDestReg = MRI.createVirtualRegister(&ARM::rGPRRegClass);
  10346. BuildMI(TpEntry, Dl, TII->get(ARM::t2LSRri), LsrDestReg)
  10347. .addUse(AddDestReg, RegState::Kill)
  10348. .addImm(4)
  10349. .add(predOps(ARMCC::AL))
  10350. .addReg(0);
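// t2WhileLoopSetup copies the count into an LR-class register and
// t2WhileLoopStart branches to TpExit when that count is zero; otherwise the
// t2B below enters the loop body.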
  10351. Register TotalIterationsReg = MRI.createVirtualRegister(&ARM::GPRlrRegClass);
  10352. BuildMI(TpEntry, Dl, TII->get(ARM::t2WhileLoopSetup), TotalIterationsReg)
  10353. .addUse(LsrDestReg, RegState::Kill);
  10354. BuildMI(TpEntry, Dl, TII->get(ARM::t2WhileLoopStart))
  10355. .addUse(TotalIterationsReg)
  10356. .addMBB(TpExit);
  10357. BuildMI(TpEntry, Dl, TII->get(ARM::t2B))
  10358. .addMBB(TpLoopBody)
  10359. .add(predOps(ARMCC::AL));
  10360. return TotalIterationsReg;
  10361. }
  10362. /// Adds logic in the loopBody MBB to generate MVE_VCTP, t2DoLoopDec and
  10363. /// t2DoLoopEnd. These are used by later passes to generate tail predicated
  10364. /// loops.
  10365. static void genTPLoopBody(MachineBasicBlock *TpLoopBody,
  10366. MachineBasicBlock *TpEntry, MachineBasicBlock *TpExit,
  10367. const TargetInstrInfo *TII, DebugLoc Dl,
  10368. MachineRegisterInfo &MRI, Register OpSrcReg,
  10369. Register OpDestReg, Register ElementCountReg,
  10370. Register TotalIterationsReg, bool IsMemcpy) {
  10371. // First insert 4 PHI nodes for: Current pointer to Src (if memcpy), Dest
  10372. // array, loop iteration counter, predication counter.
  10373. Register SrcPhiReg, CurrSrcReg;
  10374. if (IsMemcpy) {
  10375. // Current position in the src array
  10376. SrcPhiReg = MRI.createVirtualRegister(&ARM::rGPRRegClass);
  10377. CurrSrcReg = MRI.createVirtualRegister(&ARM::rGPRRegClass);
  10378. BuildMI(TpLoopBody, Dl, TII->get(ARM::PHI), SrcPhiReg)
  10379. .addUse(OpSrcReg)
  10380. .addMBB(TpEntry)
  10381. .addUse(CurrSrcReg)
  10382. .addMBB(TpLoopBody);
  10383. }
  10384. // Current position in the dest array
  10385. Register DestPhiReg = MRI.createVirtualRegister(&ARM::rGPRRegClass);
  10386. Register CurrDestReg = MRI.createVirtualRegister(&ARM::rGPRRegClass);
  10387. BuildMI(TpLoopBody, Dl, TII->get(ARM::PHI), DestPhiReg)
  10388. .addUse(OpDestReg)
  10389. .addMBB(TpEntry)
  10390. .addUse(CurrDestReg)
  10391. .addMBB(TpLoopBody);
  10392. // Current loop counter
  10393. Register LoopCounterPhiReg = MRI.createVirtualRegister(&ARM::GPRlrRegClass);
  10394. Register RemainingLoopIterationsReg =
  10395. MRI.createVirtualRegister(&ARM::GPRlrRegClass);
  10396. BuildMI(TpLoopBody, Dl, TII->get(ARM::PHI), LoopCounterPhiReg)
  10397. .addUse(TotalIterationsReg)
  10398. .addMBB(TpEntry)
  10399. .addUse(RemainingLoopIterationsReg)
  10400. .addMBB(TpLoopBody);
  10401. // Predication counter
  10402. Register PredCounterPhiReg = MRI.createVirtualRegister(&ARM::rGPRRegClass);
  10403. Register RemainingElementsReg = MRI.createVirtualRegister(&ARM::rGPRRegClass);
  10404. BuildMI(TpLoopBody, Dl, TII->get(ARM::PHI), PredCounterPhiReg)
  10405. .addUse(ElementCountReg)
  10406. .addMBB(TpEntry)
  10407. .addUse(RemainingElementsReg)
  10408. .addMBB(TpLoopBody);
  10409. // Pass predication counter to VCTP
  10410. Register VccrReg = MRI.createVirtualRegister(&ARM::VCCRRegClass);
  10411. BuildMI(TpLoopBody, Dl, TII->get(ARM::MVE_VCTP8), VccrReg)
  10412. .addUse(PredCounterPhiReg)
  10413. .addImm(ARMVCC::None)
  10414. .addReg(0)
  10415. .addReg(0);
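// Each iteration handles up to 16 bytes, so the element counter is
// decremented by 16; the VCTP predicate above masks off the excess lanes on
// the final, partial iteration.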
  10416. BuildMI(TpLoopBody, Dl, TII->get(ARM::t2SUBri), RemainingElementsReg)
  10417. .addUse(PredCounterPhiReg)
  10418. .addImm(16)
  10419. .add(predOps(ARMCC::AL))
  10420. .addReg(0);
  10421. // VLDRB (only if memcpy) and VSTRB instructions, predicated using VPR
  10422. Register SrcValueReg;
  10423. if (IsMemcpy) {
  10424. SrcValueReg = MRI.createVirtualRegister(&ARM::MQPRRegClass);
  10425. BuildMI(TpLoopBody, Dl, TII->get(ARM::MVE_VLDRBU8_post))
  10426. .addDef(CurrSrcReg)
  10427. .addDef(SrcValueReg)
  10428. .addReg(SrcPhiReg)
  10429. .addImm(16)
  10430. .addImm(ARMVCC::Then)
  10431. .addUse(VccrReg)
  10432. .addReg(0);
  10433. } else
  10434. SrcValueReg = OpSrcReg;
  10435. BuildMI(TpLoopBody, Dl, TII->get(ARM::MVE_VSTRBU8_post))
  10436. .addDef(CurrDestReg)
  10437. .addUse(SrcValueReg)
  10438. .addReg(DestPhiReg)
  10439. .addImm(16)
  10440. .addImm(ARMVCC::Then)
  10441. .addUse(VccrReg)
  10442. .addReg(0);
  10443. // Add the pseudoInstrs for decrementing the loop counter and marking the
  10444. // end:t2DoLoopDec and t2DoLoopEnd
  10445. BuildMI(TpLoopBody, Dl, TII->get(ARM::t2LoopDec), RemainingLoopIterationsReg)
  10446. .addUse(LoopCounterPhiReg)
  10447. .addImm(1);
  10448. BuildMI(TpLoopBody, Dl, TII->get(ARM::t2LoopEnd))
  10449. .addUse(RemainingLoopIterationsReg)
  10450. .addMBB(TpLoopBody);
  10451. BuildMI(TpLoopBody, Dl, TII->get(ARM::t2B))
  10452. .addMBB(TpExit)
  10453. .add(predOps(ARMCC::AL));
  10454. }
  10455. MachineBasicBlock *
  10456. ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
  10457. MachineBasicBlock *BB) const {
  10458. const TargetInstrInfo *TII = Subtarget->getInstrInfo();
  10459. DebugLoc dl = MI.getDebugLoc();
  10460. bool isThumb2 = Subtarget->isThumb2();
  10461. switch (MI.getOpcode()) {
  10462. default: {
  10463. MI.print(errs());
  10464. llvm_unreachable("Unexpected instr type to insert");
  10465. }
  10466. // Thumb1 post-indexed loads are really just single-register LDMs.
  10467. case ARM::tLDR_postidx: {
  10468. MachineOperand Def(MI.getOperand(1));
  10469. BuildMI(*BB, MI, dl, TII->get(ARM::tLDMIA_UPD))
  10470. .add(Def) // Rn_wb
  10471. .add(MI.getOperand(2)) // Rn
  10472. .add(MI.getOperand(3)) // PredImm
  10473. .add(MI.getOperand(4)) // PredReg
  10474. .add(MI.getOperand(0)) // Rt
  10475. .cloneMemRefs(MI);
  10476. MI.eraseFromParent();
  10477. return BB;
  10478. }
  10479. case ARM::MVE_MEMCPYLOOPINST:
  10480. case ARM::MVE_MEMSETLOOPINST: {
// The transformation below expands the MVE_MEMCPYLOOPINST/MVE_MEMSETLOOPINST
// pseudo into a Tail Predicated (TP) loop. It adds the instructions to
// calculate the iteration count = ceil(size_in_bytes/16) in the TP entry
// block and adds the relevant instructions in the TP loop body to generate a
// WLSTP loop.
  10486. // Below is relevant portion of the CFG after the transformation.
  10487. // The Machine Basic Blocks are shown along with branch conditions (in
  10488. // brackets). Note that TP entry/exit MBBs depict the entry/exit of this
  10489. // portion of the CFG and may not necessarily be the entry/exit of the
  10490. // function.
  10491. // (Relevant) CFG after transformation:
  10492. // TP entry MBB
  10493. // |
  10494. // |-----------------|
  10495. // (n <= 0) (n > 0)
  10496. // | |
  10497. // | TP loop Body MBB<--|
  10498. // | | |
  10499. // \ |___________|
  10500. // \ /
  10501. // TP exit MBB
  10502. MachineFunction *MF = BB->getParent();
  10503. MachineFunctionProperties &Properties = MF->getProperties();
  10504. MachineRegisterInfo &MRI = MF->getRegInfo();
  10505. Register OpDestReg = MI.getOperand(0).getReg();
  10506. Register OpSrcReg = MI.getOperand(1).getReg();
  10507. Register OpSizeReg = MI.getOperand(2).getReg();
  10508. // Allocate the required MBBs and add to parent function.
  10509. MachineBasicBlock *TpEntry = BB;
  10510. MachineBasicBlock *TpLoopBody = MF->CreateMachineBasicBlock();
  10511. MachineBasicBlock *TpExit;
  10512. MF->push_back(TpLoopBody);
  10513. // If any instructions are present in the current block after
  10514. // MVE_MEMCPYLOOPINST or MVE_MEMSETLOOPINST, split the current block and
  10515. // move the instructions into the newly created exit block. If there are no
  10516. // instructions add an explicit branch to the FallThrough block and then
  10517. // split.
  10518. //
  10519. // The split is required for two reasons:
// 1) A terminator (t2WhileLoopStart) will be placed at that site.
  10521. // 2) Since a TPLoopBody will be added later, any phis in successive blocks
  10522. // need to be updated. splitAt() already handles this.
  10523. TpExit = BB->splitAt(MI, false);
  10524. if (TpExit == BB) {
  10525. assert(BB->canFallThrough() && "Exit Block must be Fallthrough of the "
  10526. "block containing memcpy/memset Pseudo");
  10527. TpExit = BB->getFallThrough();
  10528. BuildMI(BB, dl, TII->get(ARM::t2B))
  10529. .addMBB(TpExit)
  10530. .add(predOps(ARMCC::AL));
  10531. TpExit = BB->splitAt(MI, false);
  10532. }
  10533. // Add logic for iteration count
  10534. Register TotalIterationsReg =
  10535. genTPEntry(TpEntry, TpLoopBody, TpExit, OpSizeReg, TII, dl, MRI);
  10536. // Add the vectorized (and predicated) loads/store instructions
  10537. bool IsMemcpy = MI.getOpcode() == ARM::MVE_MEMCPYLOOPINST;
  10538. genTPLoopBody(TpLoopBody, TpEntry, TpExit, TII, dl, MRI, OpSrcReg,
  10539. OpDestReg, OpSizeReg, TotalIterationsReg, IsMemcpy);
  10540. // Required to avoid conflict with the MachineVerifier during testing.
  10541. Properties.reset(MachineFunctionProperties::Property::NoPHIs);
  10542. // Connect the blocks
  10543. TpEntry->addSuccessor(TpLoopBody);
  10544. TpLoopBody->addSuccessor(TpLoopBody);
  10545. TpLoopBody->addSuccessor(TpExit);
  10546. // Reorder for a more natural layout
  10547. TpLoopBody->moveAfter(TpEntry);
  10548. TpExit->moveAfter(TpLoopBody);
// Finally, remove the memcpy/memset pseudo instruction
  10550. MI.eraseFromParent();
  10551. // Return the exit block as it may contain other instructions requiring a
  10552. // custom inserter
  10553. return TpExit;
  10554. }
// The Thumb2 pre-indexed stores have the same MI operands; they are just
// defined differently in the .td files from the isel patterns, so they
// need pseudos.
  10558. case ARM::t2STR_preidx:
  10559. MI.setDesc(TII->get(ARM::t2STR_PRE));
  10560. return BB;
  10561. case ARM::t2STRB_preidx:
  10562. MI.setDesc(TII->get(ARM::t2STRB_PRE));
  10563. return BB;
  10564. case ARM::t2STRH_preidx:
  10565. MI.setDesc(TII->get(ARM::t2STRH_PRE));
  10566. return BB;
  10567. case ARM::STRi_preidx:
  10568. case ARM::STRBi_preidx: {
  10569. unsigned NewOpc = MI.getOpcode() == ARM::STRi_preidx ? ARM::STR_PRE_IMM
  10570. : ARM::STRB_PRE_IMM;
  10571. // Decode the offset.
  10572. unsigned Offset = MI.getOperand(4).getImm();
  10573. bool isSub = ARM_AM::getAM2Op(Offset) == ARM_AM::sub;
  10574. Offset = ARM_AM::getAM2Offset(Offset);
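// getAM2Offset returns the magnitude only, so negate it here when the
// addressing mode encodes a subtract.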
  10575. if (isSub)
  10576. Offset = -Offset;
  10577. MachineMemOperand *MMO = *MI.memoperands_begin();
  10578. BuildMI(*BB, MI, dl, TII->get(NewOpc))
  10579. .add(MI.getOperand(0)) // Rn_wb
  10580. .add(MI.getOperand(1)) // Rt
  10581. .add(MI.getOperand(2)) // Rn
  10582. .addImm(Offset) // offset (skip GPR==zero_reg)
  10583. .add(MI.getOperand(5)) // pred
  10584. .add(MI.getOperand(6))
  10585. .addMemOperand(MMO);
  10586. MI.eraseFromParent();
  10587. return BB;
  10588. }
  10589. case ARM::STRr_preidx:
  10590. case ARM::STRBr_preidx:
  10591. case ARM::STRH_preidx: {
  10592. unsigned NewOpc;
  10593. switch (MI.getOpcode()) {
  10594. default: llvm_unreachable("unexpected opcode!");
  10595. case ARM::STRr_preidx: NewOpc = ARM::STR_PRE_REG; break;
  10596. case ARM::STRBr_preidx: NewOpc = ARM::STRB_PRE_REG; break;
  10597. case ARM::STRH_preidx: NewOpc = ARM::STRH_PRE; break;
  10598. }
  10599. MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(NewOpc));
  10600. for (const MachineOperand &MO : MI.operands())
  10601. MIB.add(MO);
  10602. MI.eraseFromParent();
  10603. return BB;
  10604. }
  10605. case ARM::tMOVCCr_pseudo: {
  10606. // To "insert" a SELECT_CC instruction, we actually have to insert the
  10607. // diamond control-flow pattern. The incoming instruction knows the
  10608. // destination vreg to set, the condition code register to branch on, the
  10609. // true/false values to select between, and a branch opcode to use.
  10610. const BasicBlock *LLVM_BB = BB->getBasicBlock();
  10611. MachineFunction::iterator It = ++BB->getIterator();
  10612. // thisMBB:
  10613. // ...
  10614. // TrueVal = ...
  10615. // cmpTY ccX, r1, r2
  10616. // bCC copy1MBB
  10617. // fallthrough --> copy0MBB
  10618. MachineBasicBlock *thisMBB = BB;
  10619. MachineFunction *F = BB->getParent();
  10620. MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB);
  10621. MachineBasicBlock *sinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
  10622. F->insert(It, copy0MBB);
  10623. F->insert(It, sinkMBB);
  10624. // Check whether CPSR is live past the tMOVCCr_pseudo.
  10625. const TargetRegisterInfo *TRI = Subtarget->getRegisterInfo();
  10626. if (!MI.killsRegister(ARM::CPSR) &&
  10627. !checkAndUpdateCPSRKill(MI, thisMBB, TRI)) {
  10628. copy0MBB->addLiveIn(ARM::CPSR);
  10629. sinkMBB->addLiveIn(ARM::CPSR);
  10630. }
  10631. // Transfer the remainder of BB and its successor edges to sinkMBB.
  10632. sinkMBB->splice(sinkMBB->begin(), BB,
  10633. std::next(MachineBasicBlock::iterator(MI)), BB->end());
  10634. sinkMBB->transferSuccessorsAndUpdatePHIs(BB);
  10635. BB->addSuccessor(copy0MBB);
  10636. BB->addSuccessor(sinkMBB);
  10637. BuildMI(BB, dl, TII->get(ARM::tBcc))
  10638. .addMBB(sinkMBB)
  10639. .addImm(MI.getOperand(3).getImm())
  10640. .addReg(MI.getOperand(4).getReg());
  10641. // copy0MBB:
  10642. // %FalseValue = ...
  10643. // # fallthrough to sinkMBB
  10644. BB = copy0MBB;
  10645. // Update machine-CFG edges
  10646. BB->addSuccessor(sinkMBB);
  10647. // sinkMBB:
  10648. // %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ]
  10649. // ...
  10650. BB = sinkMBB;
  10651. BuildMI(*BB, BB->begin(), dl, TII->get(ARM::PHI), MI.getOperand(0).getReg())
  10652. .addReg(MI.getOperand(1).getReg())
  10653. .addMBB(copy0MBB)
  10654. .addReg(MI.getOperand(2).getReg())
  10655. .addMBB(thisMBB);
  10656. MI.eraseFromParent(); // The pseudo instruction is gone now.
  10657. return BB;
  10658. }
  10659. case ARM::BCCi64:
  10660. case ARM::BCCZi64: {
  10661. // If there is an unconditional branch to the other successor, remove it.
  10662. BB->erase(std::next(MachineBasicBlock::iterator(MI)), BB->end());
  10663. // Compare both parts that make up the double comparison separately for
  10664. // equality.
  10665. bool RHSisZero = MI.getOpcode() == ARM::BCCZi64;
  10666. Register LHS1 = MI.getOperand(1).getReg();
  10667. Register LHS2 = MI.getOperand(2).getReg();
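// The second compare below is predicated on EQ, so the flags stay 'equal'
// only if both halves compare equal.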
  10668. if (RHSisZero) {
  10669. BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPri : ARM::CMPri))
  10670. .addReg(LHS1)
  10671. .addImm(0)
  10672. .add(predOps(ARMCC::AL));
  10673. BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPri : ARM::CMPri))
  10674. .addReg(LHS2).addImm(0)
  10675. .addImm(ARMCC::EQ).addReg(ARM::CPSR);
  10676. } else {
  10677. Register RHS1 = MI.getOperand(3).getReg();
  10678. Register RHS2 = MI.getOperand(4).getReg();
  10679. BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPrr : ARM::CMPrr))
  10680. .addReg(LHS1)
  10681. .addReg(RHS1)
  10682. .add(predOps(ARMCC::AL));
  10683. BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPrr : ARM::CMPrr))
  10684. .addReg(LHS2).addReg(RHS2)
  10685. .addImm(ARMCC::EQ).addReg(ARM::CPSR);
  10686. }
  10687. MachineBasicBlock *destMBB = MI.getOperand(RHSisZero ? 3 : 5).getMBB();
  10688. MachineBasicBlock *exitMBB = OtherSucc(BB, destMBB);
  10689. if (MI.getOperand(0).getImm() == ARMCC::NE)
  10690. std::swap(destMBB, exitMBB);
  10691. BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2Bcc : ARM::Bcc))
  10692. .addMBB(destMBB).addImm(ARMCC::EQ).addReg(ARM::CPSR);
  10693. if (isThumb2)
  10694. BuildMI(BB, dl, TII->get(ARM::t2B))
  10695. .addMBB(exitMBB)
  10696. .add(predOps(ARMCC::AL));
  10697. else
  10698. BuildMI(BB, dl, TII->get(ARM::B)) .addMBB(exitMBB);
  10699. MI.eraseFromParent(); // The pseudo instruction is gone now.
  10700. return BB;
  10701. }
  10702. case ARM::Int_eh_sjlj_setjmp:
  10703. case ARM::Int_eh_sjlj_setjmp_nofp:
  10704. case ARM::tInt_eh_sjlj_setjmp:
  10705. case ARM::t2Int_eh_sjlj_setjmp:
  10706. case ARM::t2Int_eh_sjlj_setjmp_nofp:
  10707. return BB;
  10708. case ARM::Int_eh_sjlj_setup_dispatch:
  10709. EmitSjLjDispatchBlock(MI, BB);
  10710. return BB;
  10711. case ARM::ABS:
  10712. case ARM::t2ABS: {
  10713. // To insert an ABS instruction, we have to insert the
  10714. // diamond control-flow pattern. The incoming instruction knows the
  10715. // source vreg to test against 0, the destination vreg to set,
  10716. // the condition code register to branch on, the
  10717. // true/false values to select between, and a branch opcode to use.
  10718. // It transforms
  10719. // V1 = ABS V0
  10720. // into
  10721. // V2 = MOVS V0
  10722. // BCC (branch to SinkBB if V0 >= 0)
  10723. // RSBBB: V3 = RSBri V2, 0 (compute ABS if V2 < 0)
  10724. // SinkBB: V1 = PHI(V2, V3)
  10725. const BasicBlock *LLVM_BB = BB->getBasicBlock();
  10726. MachineFunction::iterator BBI = ++BB->getIterator();
  10727. MachineFunction *Fn = BB->getParent();
  10728. MachineBasicBlock *RSBBB = Fn->CreateMachineBasicBlock(LLVM_BB);
  10729. MachineBasicBlock *SinkBB = Fn->CreateMachineBasicBlock(LLVM_BB);
  10730. Fn->insert(BBI, RSBBB);
  10731. Fn->insert(BBI, SinkBB);
  10732. Register ABSSrcReg = MI.getOperand(1).getReg();
  10733. Register ABSDstReg = MI.getOperand(0).getReg();
bool ABSSrcKill = MI.getOperand(1).isKill();
  10735. bool isThumb2 = Subtarget->isThumb2();
  10736. MachineRegisterInfo &MRI = Fn->getRegInfo();
  10737. // In Thumb mode S must not be specified if source register is the SP or
  10738. // PC and if destination register is the SP, so restrict register class
  10739. Register NewRsbDstReg = MRI.createVirtualRegister(
  10740. isThumb2 ? &ARM::rGPRRegClass : &ARM::GPRRegClass);
  10741. // Transfer the remainder of BB and its successor edges to sinkMBB.
  10742. SinkBB->splice(SinkBB->begin(), BB,
  10743. std::next(MachineBasicBlock::iterator(MI)), BB->end());
  10744. SinkBB->transferSuccessorsAndUpdatePHIs(BB);
  10745. BB->addSuccessor(RSBBB);
  10746. BB->addSuccessor(SinkBB);
  10747. // fall through to SinkMBB
  10748. RSBBB->addSuccessor(SinkBB);
  10749. // insert a cmp at the end of BB
  10750. BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPri : ARM::CMPri))
  10751. .addReg(ABSSrcReg)
  10752. .addImm(0)
  10753. .add(predOps(ARMCC::AL));
  10754. // insert a bcc with opposite CC to ARMCC::MI at the end of BB
  10755. BuildMI(BB, dl,
  10756. TII->get(isThumb2 ? ARM::t2Bcc : ARM::Bcc)).addMBB(SinkBB)
  10757. .addImm(ARMCC::getOppositeCondition(ARMCC::MI)).addReg(ARM::CPSR);
  10758. // insert rsbri in RSBBB
  10759. // Note: BCC and rsbri will be converted into predicated rsbmi
  10760. // by if-conversion pass
  10761. BuildMI(*RSBBB, RSBBB->begin(), dl,
  10762. TII->get(isThumb2 ? ARM::t2RSBri : ARM::RSBri), NewRsbDstReg)
.addReg(ABSSrcReg, ABSSrcKill ? RegState::Kill : 0)
  10764. .addImm(0)
  10765. .add(predOps(ARMCC::AL))
  10766. .add(condCodeOp());
  10767. // insert PHI in SinkBB,
  10768. // reuse ABSDstReg to not change uses of ABS instruction
  10769. BuildMI(*SinkBB, SinkBB->begin(), dl,
  10770. TII->get(ARM::PHI), ABSDstReg)
  10771. .addReg(NewRsbDstReg).addMBB(RSBBB)
  10772. .addReg(ABSSrcReg).addMBB(BB);
  10773. // remove ABS instruction
  10774. MI.eraseFromParent();
  10775. // return last added BB
  10776. return SinkBB;
  10777. }
  10778. case ARM::COPY_STRUCT_BYVAL_I32:
  10779. ++NumLoopByVals;
  10780. return EmitStructByval(MI, BB);
  10781. case ARM::WIN__CHKSTK:
  10782. return EmitLowered__chkstk(MI, BB);
  10783. case ARM::WIN__DBZCHK:
  10784. return EmitLowered__dbzchk(MI, BB);
  10785. }
  10786. }
  10787. /// Attaches vregs to MEMCPY that it will use as scratch registers
  10788. /// when it is expanded into LDM/STM. This is done as a post-isel lowering
  10789. /// instead of as a custom inserter because we need the use list from the SDNode.
  10790. static void attachMEMCPYScratchRegs(const ARMSubtarget *Subtarget,
  10791. MachineInstr &MI, const SDNode *Node) {
  10792. bool isThumb1 = Subtarget->isThumb1Only();
  10793. DebugLoc DL = MI.getDebugLoc();
  10794. MachineFunction *MF = MI.getParent()->getParent();
  10795. MachineRegisterInfo &MRI = MF->getRegInfo();
  10796. MachineInstrBuilder MIB(*MF, MI);
// If the new dst/src is unused, mark it as dead.
  10798. if (!Node->hasAnyUseOfValue(0)) {
  10799. MI.getOperand(0).setIsDead(true);
  10800. }
  10801. if (!Node->hasAnyUseOfValue(1)) {
  10802. MI.getOperand(1).setIsDead(true);
  10803. }
  10804. // The MEMCPY both defines and kills the scratch registers.
  10805. for (unsigned I = 0; I != MI.getOperand(4).getImm(); ++I) {
  10806. Register TmpReg = MRI.createVirtualRegister(isThumb1 ? &ARM::tGPRRegClass
  10807. : &ARM::GPRRegClass);
  10808. MIB.addReg(TmpReg, RegState::Define|RegState::Dead);
  10809. }
  10810. }
  10811. void ARMTargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI,
  10812. SDNode *Node) const {
  10813. if (MI.getOpcode() == ARM::MEMCPY) {
  10814. attachMEMCPYScratchRegs(Subtarget, MI, Node);
  10815. return;
  10816. }
  10817. const MCInstrDesc *MCID = &MI.getDesc();
  10818. // Adjust potentially 's' setting instructions after isel, i.e. ADC, SBC, RSB,
  10819. // RSC. Coming out of isel, they have an implicit CPSR def, but the optional
  10820. // operand is still set to noreg. If needed, set the optional operand's
  10821. // register to CPSR, and remove the redundant implicit def.
  10822. //
  10823. // e.g. ADCS (..., implicit-def CPSR) -> ADC (... opt:def CPSR).
  10824. // Rename pseudo opcodes.
  10825. unsigned NewOpc = convertAddSubFlagsOpcode(MI.getOpcode());
  10826. unsigned ccOutIdx;
  10827. if (NewOpc) {
  10828. const ARMBaseInstrInfo *TII = Subtarget->getInstrInfo();
  10829. MCID = &TII->get(NewOpc);
  10830. assert(MCID->getNumOperands() ==
  10831. MI.getDesc().getNumOperands() + 5 - MI.getDesc().getSize()
  10832. && "converted opcode should be the same except for cc_out"
  10833. " (and, on Thumb1, pred)");
  10834. MI.setDesc(*MCID);
  10835. // Add the optional cc_out operand
  10836. MI.addOperand(MachineOperand::CreateReg(0, /*isDef=*/true));
  10837. // On Thumb1, move all input operands to the end, then add the predicate
  10838. if (Subtarget->isThumb1Only()) {
  10839. for (unsigned c = MCID->getNumOperands() - 4; c--;) {
  10840. MI.addOperand(MI.getOperand(1));
  10841. MI.RemoveOperand(1);
  10842. }
  10843. // Restore the ties
  10844. for (unsigned i = MI.getNumOperands(); i--;) {
  10845. const MachineOperand& op = MI.getOperand(i);
  10846. if (op.isReg() && op.isUse()) {
  10847. int DefIdx = MCID->getOperandConstraint(i, MCOI::TIED_TO);
  10848. if (DefIdx != -1)
  10849. MI.tieOperands(DefIdx, i);
  10850. }
  10851. }
  10852. MI.addOperand(MachineOperand::CreateImm(ARMCC::AL));
  10853. MI.addOperand(MachineOperand::CreateReg(0, /*isDef=*/false));
  10854. ccOutIdx = 1;
  10855. } else
  10856. ccOutIdx = MCID->getNumOperands() - 1;
  10857. } else
  10858. ccOutIdx = MCID->getNumOperands() - 1;
  10859. // Any ARM instruction that sets the 's' bit should specify an optional
  10860. // "cc_out" operand in the last operand position.
  10861. if (!MI.hasOptionalDef() || !MCID->OpInfo[ccOutIdx].isOptionalDef()) {
  10862. assert(!NewOpc && "Optional cc_out operand required");
  10863. return;
  10864. }
  10865. // Look for an implicit def of CPSR added by MachineInstr ctor. Remove it
  10866. // since we already have an optional CPSR def.
  10867. bool definesCPSR = false;
  10868. bool deadCPSR = false;
  10869. for (unsigned i = MCID->getNumOperands(), e = MI.getNumOperands(); i != e;
  10870. ++i) {
  10871. const MachineOperand &MO = MI.getOperand(i);
  10872. if (MO.isReg() && MO.isDef() && MO.getReg() == ARM::CPSR) {
  10873. definesCPSR = true;
  10874. if (MO.isDead())
  10875. deadCPSR = true;
  10876. MI.RemoveOperand(i);
  10877. break;
  10878. }
  10879. }
  10880. if (!definesCPSR) {
  10881. assert(!NewOpc && "Optional cc_out operand required");
  10882. return;
  10883. }
  10884. assert(deadCPSR == !Node->hasAnyUseOfValue(1) && "inconsistent dead flag");
  10885. if (deadCPSR) {
  10886. assert(!MI.getOperand(ccOutIdx).getReg() &&
  10887. "expect uninitialized optional cc_out operand");
  10888. // Thumb1 instructions must have the S bit even if the CPSR is dead.
  10889. if (!Subtarget->isThumb1Only())
  10890. return;
  10891. }
  10892. // If this instruction was defined with an optional CPSR def and its dag node
  10893. // had a live implicit CPSR def, then activate the optional CPSR def.
  10894. MachineOperand &MO = MI.getOperand(ccOutIdx);
  10895. MO.setReg(ARM::CPSR);
  10896. MO.setIsDef(true);
  10897. }
  10898. //===----------------------------------------------------------------------===//
  10899. // ARM Optimization Hooks
  10900. //===----------------------------------------------------------------------===//
  10901. // Helper function that checks if N is a null or all ones constant.
  10902. static inline bool isZeroOrAllOnes(SDValue N, bool AllOnes) {
  10903. return AllOnes ? isAllOnesConstant(N) : isNullConstant(N);
  10904. }
  10905. // Return true if N is conditionally 0 or all ones.
  10906. // Detects these expressions where cc is an i1 value:
  10907. //
  10908. // (select cc 0, y) [AllOnes=0]
  10909. // (select cc y, 0) [AllOnes=0]
  10910. // (zext cc) [AllOnes=0]
  10911. // (sext cc) [AllOnes=0/1]
  10912. // (select cc -1, y) [AllOnes=1]
  10913. // (select cc y, -1) [AllOnes=1]
  10914. //
// Invert is set when N is the null/all-ones constant for the CC == false case.
  10916. // OtherOp is set to the alternative value of N.
  10917. static bool isConditionalZeroOrAllOnes(SDNode *N, bool AllOnes,
  10918. SDValue &CC, bool &Invert,
  10919. SDValue &OtherOp,
  10920. SelectionDAG &DAG) {
  10921. switch (N->getOpcode()) {
  10922. default: return false;
  10923. case ISD::SELECT: {
  10924. CC = N->getOperand(0);
  10925. SDValue N1 = N->getOperand(1);
  10926. SDValue N2 = N->getOperand(2);
  10927. if (isZeroOrAllOnes(N1, AllOnes)) {
  10928. Invert = false;
  10929. OtherOp = N2;
  10930. return true;
  10931. }
  10932. if (isZeroOrAllOnes(N2, AllOnes)) {
  10933. Invert = true;
  10934. OtherOp = N1;
  10935. return true;
  10936. }
  10937. return false;
  10938. }
  10939. case ISD::ZERO_EXTEND:
  10940. // (zext cc) can never be the all ones value.
  10941. if (AllOnes)
  10942. return false;
  10943. LLVM_FALLTHROUGH;
  10944. case ISD::SIGN_EXTEND: {
  10945. SDLoc dl(N);
  10946. EVT VT = N->getValueType(0);
  10947. CC = N->getOperand(0);
  10948. if (CC.getValueType() != MVT::i1 || CC.getOpcode() != ISD::SETCC)
  10949. return false;
  10950. Invert = !AllOnes;
  10951. if (AllOnes)
  10952. // When looking for an AllOnes constant, N is an sext, and the 'other'
  10953. // value is 0.
  10954. OtherOp = DAG.getConstant(0, dl, VT);
  10955. else if (N->getOpcode() == ISD::ZERO_EXTEND)
  10956. // When looking for a 0 constant, N can be zext or sext.
  10957. OtherOp = DAG.getConstant(1, dl, VT);
  10958. else
  10959. OtherOp = DAG.getAllOnesConstant(dl, VT);
  10960. return true;
  10961. }
  10962. }
  10963. }
  10964. // Combine a constant select operand into its use:
  10965. //
  10966. // (add (select cc, 0, c), x) -> (select cc, x, (add, x, c))
  10967. // (sub x, (select cc, 0, c)) -> (select cc, x, (sub, x, c))
  10968. // (and (select cc, -1, c), x) -> (select cc, x, (and, x, c)) [AllOnes=1]
  10969. // (or (select cc, 0, c), x) -> (select cc, x, (or, x, c))
  10970. // (xor (select cc, 0, c), x) -> (select cc, x, (xor, x, c))
  10971. //
  10972. // The transform is rejected if the select doesn't have a constant operand that
  10973. // is null, or all ones when AllOnes is set.
  10974. //
  10975. // Also recognize sext/zext from i1:
  10976. //
  10977. // (add (zext cc), x) -> (select cc (add x, 1), x)
  10978. // (add (sext cc), x) -> (select cc (add x, -1), x)
  10979. //
  10980. // These transformations eventually create predicated instructions.
  10981. //
  10982. // @param N The node to transform.
  10983. // @param Slct The N operand that is a select.
  10984. // @param OtherOp The other N operand (x above).
  10985. // @param DCI Context.
  10986. // @param AllOnes Require the select constant to be all ones instead of null.
  10987. // @returns The new node, or SDValue() on failure.
  10988. static
  10989. SDValue combineSelectAndUse(SDNode *N, SDValue Slct, SDValue OtherOp,
  10990. TargetLowering::DAGCombinerInfo &DCI,
  10991. bool AllOnes = false) {
  10992. SelectionDAG &DAG = DCI.DAG;
  10993. EVT VT = N->getValueType(0);
  10994. SDValue NonConstantVal;
  10995. SDValue CCOp;
  10996. bool SwapSelectOps;
  10997. if (!isConditionalZeroOrAllOnes(Slct.getNode(), AllOnes, CCOp, SwapSelectOps,
  10998. NonConstantVal, DAG))
  10999. return SDValue();
// Slct is now known to be the desired identity constant when CC is true.
  11001. SDValue TrueVal = OtherOp;
  11002. SDValue FalseVal = DAG.getNode(N->getOpcode(), SDLoc(N), VT,
  11003. OtherOp, NonConstantVal);
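// If CC picks the identity constant, the result is simply OtherOp; otherwise
// the operation is applied to OtherOp and the non-constant value.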
  11004. // Unless SwapSelectOps says CC should be false.
  11005. if (SwapSelectOps)
  11006. std::swap(TrueVal, FalseVal);
  11007. return DAG.getNode(ISD::SELECT, SDLoc(N), VT,
  11008. CCOp, TrueVal, FalseVal);
  11009. }
  11010. // Attempt combineSelectAndUse on each operand of a commutative operator N.
  11011. static
  11012. SDValue combineSelectAndUseCommutative(SDNode *N, bool AllOnes,
  11013. TargetLowering::DAGCombinerInfo &DCI) {
  11014. SDValue N0 = N->getOperand(0);
  11015. SDValue N1 = N->getOperand(1);
  11016. if (N0.getNode()->hasOneUse())
  11017. if (SDValue Result = combineSelectAndUse(N, N0, N1, DCI, AllOnes))
  11018. return Result;
  11019. if (N1.getNode()->hasOneUse())
  11020. if (SDValue Result = combineSelectAndUse(N, N1, N0, DCI, AllOnes))
  11021. return Result;
  11022. return SDValue();
  11023. }
  11024. static bool IsVUZPShuffleNode(SDNode *N) {
  11025. // VUZP shuffle node.
  11026. if (N->getOpcode() == ARMISD::VUZP)
  11027. return true;
  11028. // "VUZP" on i32 is an alias for VTRN.
  11029. if (N->getOpcode() == ARMISD::VTRN && N->getValueType(0) == MVT::v2i32)
  11030. return true;
  11031. return false;
  11032. }
  11033. static SDValue AddCombineToVPADD(SDNode *N, SDValue N0, SDValue N1,
  11034. TargetLowering::DAGCombinerInfo &DCI,
  11035. const ARMSubtarget *Subtarget) {
  11036. // Look for ADD(VUZP.0, VUZP.1).
  11037. if (!IsVUZPShuffleNode(N0.getNode()) || N0.getNode() != N1.getNode() ||
  11038. N0 == N1)
  11039. return SDValue();
  11040. // Make sure the ADD is a 64-bit add; there is no 128-bit VPADD.
  11041. if (!N->getValueType(0).is64BitVector())
  11042. return SDValue();
  11043. // Generate vpadd.
  11044. SelectionDAG &DAG = DCI.DAG;
  11045. const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  11046. SDLoc dl(N);
  11047. SDNode *Unzip = N0.getNode();
  11048. EVT VT = N->getValueType(0);
  11049. SmallVector<SDValue, 8> Ops;
  11050. Ops.push_back(DAG.getConstant(Intrinsic::arm_neon_vpadd, dl,
  11051. TLI.getPointerTy(DAG.getDataLayout())));
  11052. Ops.push_back(Unzip->getOperand(0));
  11053. Ops.push_back(Unzip->getOperand(1));
  11054. return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, Ops);
  11055. }
  11056. static SDValue AddCombineVUZPToVPADDL(SDNode *N, SDValue N0, SDValue N1,
  11057. TargetLowering::DAGCombinerInfo &DCI,
  11058. const ARMSubtarget *Subtarget) {
  11059. // Check for two extended operands.
  11060. if (!(N0.getOpcode() == ISD::SIGN_EXTEND &&
  11061. N1.getOpcode() == ISD::SIGN_EXTEND) &&
  11062. !(N0.getOpcode() == ISD::ZERO_EXTEND &&
  11063. N1.getOpcode() == ISD::ZERO_EXTEND))
  11064. return SDValue();
  11065. SDValue N00 = N0.getOperand(0);
  11066. SDValue N10 = N1.getOperand(0);
  11067. // Look for ADD(SEXT(VUZP.0), SEXT(VUZP.1))
  11068. if (!IsVUZPShuffleNode(N00.getNode()) || N00.getNode() != N10.getNode() ||
  11069. N00 == N10)
  11070. return SDValue();
  11071. // We only recognize Q register paddl here; this can't be reached until
  11072. // after type legalization.
  11073. if (!N00.getValueType().is64BitVector() ||
  11074. !N0.getValueType().is128BitVector())
  11075. return SDValue();
  11076. // Generate vpaddl.
  11077. SelectionDAG &DAG = DCI.DAG;
  11078. const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  11079. SDLoc dl(N);
  11080. EVT VT = N->getValueType(0);
  11081. SmallVector<SDValue, 8> Ops;
  11082. // Form vpaddl.sN or vpaddl.uN depending on the kind of extension.
  11083. unsigned Opcode;
  11084. if (N0.getOpcode() == ISD::SIGN_EXTEND)
  11085. Opcode = Intrinsic::arm_neon_vpaddls;
  11086. else
  11087. Opcode = Intrinsic::arm_neon_vpaddlu;
  11088. Ops.push_back(DAG.getConstant(Opcode, dl,
  11089. TLI.getPointerTy(DAG.getDataLayout())));
  11090. EVT ElemTy = N00.getValueType().getVectorElementType();
  11091. unsigned NumElts = VT.getVectorNumElements();
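// vpaddl of the two original (pre-unzip) vectors concatenated back together
// sums adjacent element pairs, which matches VUZP.0 + VUZP.1 with widening.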
  11092. EVT ConcatVT = EVT::getVectorVT(*DAG.getContext(), ElemTy, NumElts * 2);
  11093. SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), ConcatVT,
  11094. N00.getOperand(0), N00.getOperand(1));
  11095. Ops.push_back(Concat);
  11096. return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, Ops);
  11097. }
  11098. // FIXME: This function shouldn't be necessary; if we lower BUILD_VECTOR in
  11099. // an appropriate manner, we end up with ADD(VUZP(ZEXT(N))), which is
  11100. // much easier to match.
  11101. static SDValue
  11102. AddCombineBUILD_VECTORToVPADDL(SDNode *N, SDValue N0, SDValue N1,
  11103. TargetLowering::DAGCombinerInfo &DCI,
  11104. const ARMSubtarget *Subtarget) {
// Only perform this optimization after legalization and if NEON is
// available. We also expect both operands to be BUILD_VECTORs.
  11107. if (DCI.isBeforeLegalize() || !Subtarget->hasNEON()
  11108. || N0.getOpcode() != ISD::BUILD_VECTOR
  11109. || N1.getOpcode() != ISD::BUILD_VECTOR)
  11110. return SDValue();
  11111. // Check output type since VPADDL operand elements can only be 8, 16, or 32.
  11112. EVT VT = N->getValueType(0);
  11113. if (!VT.isInteger() || VT.getVectorElementType() == MVT::i64)
  11114. return SDValue();
  11115. // Check that the vector operands are of the right form.
  11116. // N0 and N1 are BUILD_VECTOR nodes with N number of EXTRACT_VECTOR
  11117. // operands, where N is the size of the formed vector.
// Each EXTRACT_VECTOR should have the same input vector and an odd or even
// index such that we have a pairwise add pattern.
  11120. // Grab the vector that all EXTRACT_VECTOR nodes should be referencing.
  11121. if (N0->getOperand(0)->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
  11122. return SDValue();
  11123. SDValue Vec = N0->getOperand(0)->getOperand(0);
  11124. SDNode *V = Vec.getNode();
  11125. unsigned nextIndex = 0;
  11126. // For each operands to the ADD which are BUILD_VECTORs,
  11127. // check to see if each of their operands are an EXTRACT_VECTOR with
  11128. // the same vector and appropriate index.
  11129. for (unsigned i = 0, e = N0->getNumOperands(); i != e; ++i) {
  11130. if (N0->getOperand(i)->getOpcode() == ISD::EXTRACT_VECTOR_ELT
  11131. && N1->getOperand(i)->getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
  11132. SDValue ExtVec0 = N0->getOperand(i);
  11133. SDValue ExtVec1 = N1->getOperand(i);
// First operand is the vector; verify it is the same.
  11135. if (V != ExtVec0->getOperand(0).getNode() ||
  11136. V != ExtVec1->getOperand(0).getNode())
  11137. return SDValue();
// Second is the constant; verify it is correct.
  11139. ConstantSDNode *C0 = dyn_cast<ConstantSDNode>(ExtVec0->getOperand(1));
  11140. ConstantSDNode *C1 = dyn_cast<ConstantSDNode>(ExtVec1->getOperand(1));
  11141. // For the constant, we want to see all the even or all the odd.
  11142. if (!C0 || !C1 || C0->getZExtValue() != nextIndex
  11143. || C1->getZExtValue() != nextIndex+1)
  11144. return SDValue();
  11145. // Increment index.
  11146. nextIndex+=2;
  11147. } else
  11148. return SDValue();
  11149. }
  11150. // Don't generate vpaddl+vmovn; we'll match it to vpadd later. Also make sure
  11151. // we're using the entire input vector, otherwise there's a size/legality
  11152. // mismatch somewhere.
  11153. if (nextIndex != Vec.getValueType().getVectorNumElements() ||
  11154. Vec.getValueType().getVectorElementType() == VT.getVectorElementType())
  11155. return SDValue();
  11156. // Create VPADDL node.
  11157. SelectionDAG &DAG = DCI.DAG;
  11158. const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  11159. SDLoc dl(N);
  11160. // Build operand list.
  11161. SmallVector<SDValue, 8> Ops;
  11162. Ops.push_back(DAG.getConstant(Intrinsic::arm_neon_vpaddls, dl,
  11163. TLI.getPointerTy(DAG.getDataLayout())));
  11164. // Input is the vector.
  11165. Ops.push_back(Vec);
  11166. // Get widened type and narrowed type.
  11167. MVT widenType;
  11168. unsigned numElem = VT.getVectorNumElements();
  11169. EVT inputLaneType = Vec.getValueType().getVectorElementType();
  11170. switch (inputLaneType.getSimpleVT().SimpleTy) {
  11171. case MVT::i8: widenType = MVT::getVectorVT(MVT::i16, numElem); break;
  11172. case MVT::i16: widenType = MVT::getVectorVT(MVT::i32, numElem); break;
  11173. case MVT::i32: widenType = MVT::getVectorVT(MVT::i64, numElem); break;
  11174. default:
  11175. llvm_unreachable("Invalid vector element type for padd optimization.");
  11176. }
  11177. SDValue tmp = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, widenType, Ops);
  11178. unsigned ExtOp = VT.bitsGT(tmp.getValueType()) ? ISD::ANY_EXTEND : ISD::TRUNCATE;
  11179. return DAG.getNode(ExtOp, dl, VT, tmp);
  11180. }
  11181. static SDValue findMUL_LOHI(SDValue V) {
  11182. if (V->getOpcode() == ISD::UMUL_LOHI ||
  11183. V->getOpcode() == ISD::SMUL_LOHI)
  11184. return V;
  11185. return SDValue();
  11186. }
  11187. static SDValue AddCombineTo64BitSMLAL16(SDNode *AddcNode, SDNode *AddeNode,
  11188. TargetLowering::DAGCombinerInfo &DCI,
  11189. const ARMSubtarget *Subtarget) {
  11190. if (!Subtarget->hasBaseDSP())
  11191. return SDValue();
11192. // SMLALBB, SMLALBT, SMLALTB and SMLALTT multiply two 16-bit values and
11193. // accumulate the product into a 64-bit value. The 16-bit values will
11194. // be sign-extended somehow or SRA'd into 32-bit values
  11195. // (addc (adde (mul 16bit, 16bit), lo), hi)
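// Roughly, SMLAL<x><y> Rdlo, Rdhi, Rn, Rm computes
//   Rdhi:Rdlo += sext(half<x>(Rn)) * sext(half<y>(Rm))
// where <x> and <y> select the bottom (B) or top (T) 16-bit half of each
// operand; that is why the code below looks for operands that are either
// already sign-extended 16-bit values or an SRA by 16.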
  11196. SDValue Mul = AddcNode->getOperand(0);
  11197. SDValue Lo = AddcNode->getOperand(1);
  11198. if (Mul.getOpcode() != ISD::MUL) {
  11199. Lo = AddcNode->getOperand(0);
  11200. Mul = AddcNode->getOperand(1);
  11201. if (Mul.getOpcode() != ISD::MUL)
  11202. return SDValue();
  11203. }
  11204. SDValue SRA = AddeNode->getOperand(0);
  11205. SDValue Hi = AddeNode->getOperand(1);
  11206. if (SRA.getOpcode() != ISD::SRA) {
  11207. SRA = AddeNode->getOperand(1);
  11208. Hi = AddeNode->getOperand(0);
  11209. if (SRA.getOpcode() != ISD::SRA)
  11210. return SDValue();
  11211. }
  11212. if (auto Const = dyn_cast<ConstantSDNode>(SRA.getOperand(1))) {
  11213. if (Const->getZExtValue() != 31)
  11214. return SDValue();
  11215. } else
  11216. return SDValue();
  11217. if (SRA.getOperand(0) != Mul)
  11218. return SDValue();
  11219. SelectionDAG &DAG = DCI.DAG;
  11220. SDLoc dl(AddcNode);
  11221. unsigned Opcode = 0;
  11222. SDValue Op0;
  11223. SDValue Op1;
  11224. if (isS16(Mul.getOperand(0), DAG) && isS16(Mul.getOperand(1), DAG)) {
  11225. Opcode = ARMISD::SMLALBB;
  11226. Op0 = Mul.getOperand(0);
  11227. Op1 = Mul.getOperand(1);
  11228. } else if (isS16(Mul.getOperand(0), DAG) && isSRA16(Mul.getOperand(1))) {
  11229. Opcode = ARMISD::SMLALBT;
  11230. Op0 = Mul.getOperand(0);
  11231. Op1 = Mul.getOperand(1).getOperand(0);
  11232. } else if (isSRA16(Mul.getOperand(0)) && isS16(Mul.getOperand(1), DAG)) {
  11233. Opcode = ARMISD::SMLALTB;
  11234. Op0 = Mul.getOperand(0).getOperand(0);
  11235. Op1 = Mul.getOperand(1);
  11236. } else if (isSRA16(Mul.getOperand(0)) && isSRA16(Mul.getOperand(1))) {
  11237. Opcode = ARMISD::SMLALTT;
  11238. Op0 = Mul->getOperand(0).getOperand(0);
  11239. Op1 = Mul->getOperand(1).getOperand(0);
  11240. }
  11241. if (!Op0 || !Op1)
  11242. return SDValue();
  11243. SDValue SMLAL = DAG.getNode(Opcode, dl, DAG.getVTList(MVT::i32, MVT::i32),
  11244. Op0, Op1, Lo, Hi);
  11245. // Replace the ADDs' nodes uses by the MLA node's values.
  11246. SDValue HiMLALResult(SMLAL.getNode(), 1);
  11247. SDValue LoMLALResult(SMLAL.getNode(), 0);
  11248. DAG.ReplaceAllUsesOfValueWith(SDValue(AddcNode, 0), LoMLALResult);
  11249. DAG.ReplaceAllUsesOfValueWith(SDValue(AddeNode, 0), HiMLALResult);
  11250. // Return original node to notify the driver to stop replacing.
  11251. SDValue resNode(AddcNode, 0);
  11252. return resNode;
  11253. }
  11254. static SDValue AddCombineTo64bitMLAL(SDNode *AddeSubeNode,
  11255. TargetLowering::DAGCombinerInfo &DCI,
  11256. const ARMSubtarget *Subtarget) {
  11257. // Look for multiply add opportunities.
11258. // The pattern is an ISD::UMUL_LOHI followed by two add nodes, where
11259. // each add node consumes a value from ISD::UMUL_LOHI and there is
  11260. // a glue link from the first add to the second add.
  11261. // If we find this pattern, we can replace the U/SMUL_LOHI, ADDC, and ADDE by
  11262. // a S/UMLAL instruction.
11263. //            UMUL_LOHI
11264. //            / :lo    \ :hi
11265. //            V         \          [no multiline comment]
11266. //   loAdd -> ADDC       |
11267. //             \ :carry  /
11268. //              V       V
11269. //               ADDE <- hiAdd
  11270. //
  11271. // In the special case where only the higher part of a signed result is used
  11272. // and the add to the low part of the result of ISD::UMUL_LOHI adds or subtracts
  11273. // a constant with the exact value of 0x80000000, we recognize we are dealing
  11274. // with a "rounded multiply and add" (or subtract) and transform it into
11275. // either an ARMISD::SMMLAR or an ARMISD::SMMLSR, respectively.
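// For reference, UMLAL/SMLAL Rdlo, Rdhi, Rn, Rm computes, roughly,
//   Rdhi:Rdlo += Rn * Rm
// i.e. a 32x32->64 multiply accumulated into the register pair, which is
// exactly the ADDC/ADDE-of-MUL_LOHI shape described above.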
  11276. assert((AddeSubeNode->getOpcode() == ARMISD::ADDE ||
  11277. AddeSubeNode->getOpcode() == ARMISD::SUBE) &&
  11278. "Expect an ADDE or SUBE");
  11279. assert(AddeSubeNode->getNumOperands() == 3 &&
  11280. AddeSubeNode->getOperand(2).getValueType() == MVT::i32 &&
  11281. "ADDE node has the wrong inputs");
  11282. // Check that we are chained to the right ADDC or SUBC node.
  11283. SDNode *AddcSubcNode = AddeSubeNode->getOperand(2).getNode();
  11284. if ((AddeSubeNode->getOpcode() == ARMISD::ADDE &&
  11285. AddcSubcNode->getOpcode() != ARMISD::ADDC) ||
  11286. (AddeSubeNode->getOpcode() == ARMISD::SUBE &&
  11287. AddcSubcNode->getOpcode() != ARMISD::SUBC))
  11288. return SDValue();
  11289. SDValue AddcSubcOp0 = AddcSubcNode->getOperand(0);
  11290. SDValue AddcSubcOp1 = AddcSubcNode->getOperand(1);
  11291. // Check if the two operands are from the same mul_lohi node.
  11292. if (AddcSubcOp0.getNode() == AddcSubcOp1.getNode())
  11293. return SDValue();
  11294. assert(AddcSubcNode->getNumValues() == 2 &&
  11295. AddcSubcNode->getValueType(0) == MVT::i32 &&
  11296. "Expect ADDC with two result values. First: i32");
11297. // Check that the ADDC adds the low result of the S/UMUL_LOHI. If not, it
11298. // may be an SMLAL which multiplies two 16-bit values.
  11299. if (AddeSubeNode->getOpcode() == ARMISD::ADDE &&
  11300. AddcSubcOp0->getOpcode() != ISD::UMUL_LOHI &&
  11301. AddcSubcOp0->getOpcode() != ISD::SMUL_LOHI &&
  11302. AddcSubcOp1->getOpcode() != ISD::UMUL_LOHI &&
  11303. AddcSubcOp1->getOpcode() != ISD::SMUL_LOHI)
  11304. return AddCombineTo64BitSMLAL16(AddcSubcNode, AddeSubeNode, DCI, Subtarget);
  11305. // Check for the triangle shape.
  11306. SDValue AddeSubeOp0 = AddeSubeNode->getOperand(0);
  11307. SDValue AddeSubeOp1 = AddeSubeNode->getOperand(1);
  11308. // Make sure that the ADDE/SUBE operands are not coming from the same node.
  11309. if (AddeSubeOp0.getNode() == AddeSubeOp1.getNode())
  11310. return SDValue();
  11311. // Find the MUL_LOHI node walking up ADDE/SUBE's operands.
  11312. bool IsLeftOperandMUL = false;
  11313. SDValue MULOp = findMUL_LOHI(AddeSubeOp0);
  11314. if (MULOp == SDValue())
  11315. MULOp = findMUL_LOHI(AddeSubeOp1);
  11316. else
  11317. IsLeftOperandMUL = true;
  11318. if (MULOp == SDValue())
  11319. return SDValue();
  11320. // Figure out the right opcode.
  11321. unsigned Opc = MULOp->getOpcode();
  11322. unsigned FinalOpc = (Opc == ISD::SMUL_LOHI) ? ARMISD::SMLAL : ARMISD::UMLAL;
  11323. // Figure out the high and low input values to the MLAL node.
  11324. SDValue *HiAddSub = nullptr;
  11325. SDValue *LoMul = nullptr;
  11326. SDValue *LowAddSub = nullptr;
  11327. // Ensure that ADDE/SUBE is from high result of ISD::xMUL_LOHI.
  11328. if ((AddeSubeOp0 != MULOp.getValue(1)) && (AddeSubeOp1 != MULOp.getValue(1)))
  11329. return SDValue();
  11330. if (IsLeftOperandMUL)
  11331. HiAddSub = &AddeSubeOp1;
  11332. else
  11333. HiAddSub = &AddeSubeOp0;
11334. // Ensure that LoMul and LowAddSub are taken from the correct ISD::SMUL_LOHI
11335. // node whose low result is fed to the ADDC/SUBC we are checking.
  11336. if (AddcSubcOp0 == MULOp.getValue(0)) {
  11337. LoMul = &AddcSubcOp0;
  11338. LowAddSub = &AddcSubcOp1;
  11339. }
  11340. if (AddcSubcOp1 == MULOp.getValue(0)) {
  11341. LoMul = &AddcSubcOp1;
  11342. LowAddSub = &AddcSubcOp0;
  11343. }
  11344. if (!LoMul)
  11345. return SDValue();
  11346. // If HiAddSub is the same node as ADDC/SUBC or is a predecessor of ADDC/SUBC
  11347. // the replacement below will create a cycle.
  11348. if (AddcSubcNode == HiAddSub->getNode() ||
  11349. AddcSubcNode->isPredecessorOf(HiAddSub->getNode()))
  11350. return SDValue();
  11351. // Create the merged node.
  11352. SelectionDAG &DAG = DCI.DAG;
  11353. // Start building operand list.
  11354. SmallVector<SDValue, 8> Ops;
  11355. Ops.push_back(LoMul->getOperand(0));
  11356. Ops.push_back(LoMul->getOperand(1));
  11357. // Check whether we can use SMMLAR, SMMLSR or SMMULR instead. For this to be
  11358. // the case, we must be doing signed multiplication and only use the higher
11359. // part of the result of the MLAL; furthermore, the LowAddSub must be a
11360. // constant addition or subtraction with the value 0x80000000.
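// For reference, SMMLAR/SMMLSR Rd, Rn, Rm, Ra computes, roughly,
//   Rd = (Ra*2^32 +/- Rn*Rm + 0x80000000) >> 32
// so folding the 0x80000000 addend into the instruction is what makes this a
// "rounded" multiply accumulate/subtract.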
  11361. if (Subtarget->hasV6Ops() && Subtarget->hasDSP() && Subtarget->useMulOps() &&
  11362. FinalOpc == ARMISD::SMLAL && !AddeSubeNode->hasAnyUseOfValue(1) &&
  11363. LowAddSub->getNode()->getOpcode() == ISD::Constant &&
  11364. static_cast<ConstantSDNode *>(LowAddSub->getNode())->getZExtValue() ==
  11365. 0x80000000) {
  11366. Ops.push_back(*HiAddSub);
  11367. if (AddcSubcNode->getOpcode() == ARMISD::SUBC) {
  11368. FinalOpc = ARMISD::SMMLSR;
  11369. } else {
  11370. FinalOpc = ARMISD::SMMLAR;
  11371. }
  11372. SDValue NewNode = DAG.getNode(FinalOpc, SDLoc(AddcSubcNode), MVT::i32, Ops);
  11373. DAG.ReplaceAllUsesOfValueWith(SDValue(AddeSubeNode, 0), NewNode);
  11374. return SDValue(AddeSubeNode, 0);
  11375. } else if (AddcSubcNode->getOpcode() == ARMISD::SUBC)
  11376. // SMMLS is generated during instruction selection and the rest of this
11377. // function cannot handle the case where AddcSubcNode is a SUBC.
  11378. return SDValue();
  11379. // Finish building the operand list for {U/S}MLAL
  11380. Ops.push_back(*LowAddSub);
  11381. Ops.push_back(*HiAddSub);
  11382. SDValue MLALNode = DAG.getNode(FinalOpc, SDLoc(AddcSubcNode),
  11383. DAG.getVTList(MVT::i32, MVT::i32), Ops);
  11384. // Replace the ADDs' nodes uses by the MLA node's values.
  11385. SDValue HiMLALResult(MLALNode.getNode(), 1);
  11386. DAG.ReplaceAllUsesOfValueWith(SDValue(AddeSubeNode, 0), HiMLALResult);
  11387. SDValue LoMLALResult(MLALNode.getNode(), 0);
  11388. DAG.ReplaceAllUsesOfValueWith(SDValue(AddcSubcNode, 0), LoMLALResult);
  11389. // Return original node to notify the driver to stop replacing.
  11390. return SDValue(AddeSubeNode, 0);
  11391. }
  11392. static SDValue AddCombineTo64bitUMAAL(SDNode *AddeNode,
  11393. TargetLowering::DAGCombinerInfo &DCI,
  11394. const ARMSubtarget *Subtarget) {
  11395. // UMAAL is similar to UMLAL except that it adds two unsigned values.
  11396. // While trying to combine for the other MLAL nodes, first search for the
  11397. // chance to use UMAAL. Check if Addc uses a node which has already
  11398. // been combined into a UMLAL. The other pattern is UMLAL using Addc/Adde
  11399. // as the addend, and it's handled in PerformUMLALCombine.
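// For reference, UMAAL RdLo, RdHi, Rn, Rm computes, roughly,
//   RdHi:RdLo = Rn * Rm + RdLo + RdHi
// i.e. a 32x32->64 multiply plus two independent 32-bit addends, which is why
// a UMLAL with a zero high accumulator plus one extra 32-bit add can be
// merged into a single UMAAL.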
  11400. if (!Subtarget->hasV6Ops() || !Subtarget->hasDSP())
  11401. return AddCombineTo64bitMLAL(AddeNode, DCI, Subtarget);
  11402. // Check that we have a glued ADDC node.
  11403. SDNode* AddcNode = AddeNode->getOperand(2).getNode();
  11404. if (AddcNode->getOpcode() != ARMISD::ADDC)
  11405. return SDValue();
  11406. // Find the converted UMAAL or quit if it doesn't exist.
  11407. SDNode *UmlalNode = nullptr;
  11408. SDValue AddHi;
  11409. if (AddcNode->getOperand(0).getOpcode() == ARMISD::UMLAL) {
  11410. UmlalNode = AddcNode->getOperand(0).getNode();
  11411. AddHi = AddcNode->getOperand(1);
  11412. } else if (AddcNode->getOperand(1).getOpcode() == ARMISD::UMLAL) {
  11413. UmlalNode = AddcNode->getOperand(1).getNode();
  11414. AddHi = AddcNode->getOperand(0);
  11415. } else {
  11416. return AddCombineTo64bitMLAL(AddeNode, DCI, Subtarget);
  11417. }
  11418. // The ADDC should be glued to an ADDE node, which uses the same UMLAL as
  11419. // the ADDC as well as Zero.
  11420. if (!isNullConstant(UmlalNode->getOperand(3)))
  11421. return SDValue();
  11422. if ((isNullConstant(AddeNode->getOperand(0)) &&
  11423. AddeNode->getOperand(1).getNode() == UmlalNode) ||
  11424. (AddeNode->getOperand(0).getNode() == UmlalNode &&
  11425. isNullConstant(AddeNode->getOperand(1)))) {
  11426. SelectionDAG &DAG = DCI.DAG;
  11427. SDValue Ops[] = { UmlalNode->getOperand(0), UmlalNode->getOperand(1),
  11428. UmlalNode->getOperand(2), AddHi };
  11429. SDValue UMAAL = DAG.getNode(ARMISD::UMAAL, SDLoc(AddcNode),
  11430. DAG.getVTList(MVT::i32, MVT::i32), Ops);
  11431. // Replace the ADDs' nodes uses by the UMAAL node's values.
  11432. DAG.ReplaceAllUsesOfValueWith(SDValue(AddeNode, 0), SDValue(UMAAL.getNode(), 1));
  11433. DAG.ReplaceAllUsesOfValueWith(SDValue(AddcNode, 0), SDValue(UMAAL.getNode(), 0));
  11434. // Return original node to notify the driver to stop replacing.
  11435. return SDValue(AddeNode, 0);
  11436. }
  11437. return SDValue();
  11438. }
  11439. static SDValue PerformUMLALCombine(SDNode *N, SelectionDAG &DAG,
  11440. const ARMSubtarget *Subtarget) {
  11441. if (!Subtarget->hasV6Ops() || !Subtarget->hasDSP())
  11442. return SDValue();
  11443. // Check that we have a pair of ADDC and ADDE as operands.
  11444. // Both addends of the ADDE must be zero.
  11445. SDNode* AddcNode = N->getOperand(2).getNode();
  11446. SDNode* AddeNode = N->getOperand(3).getNode();
  11447. if ((AddcNode->getOpcode() == ARMISD::ADDC) &&
  11448. (AddeNode->getOpcode() == ARMISD::ADDE) &&
  11449. isNullConstant(AddeNode->getOperand(0)) &&
  11450. isNullConstant(AddeNode->getOperand(1)) &&
  11451. (AddeNode->getOperand(2).getNode() == AddcNode))
  11452. return DAG.getNode(ARMISD::UMAAL, SDLoc(N),
  11453. DAG.getVTList(MVT::i32, MVT::i32),
  11454. {N->getOperand(0), N->getOperand(1),
  11455. AddcNode->getOperand(0), AddcNode->getOperand(1)});
  11456. else
  11457. return SDValue();
  11458. }
  11459. static SDValue PerformAddcSubcCombine(SDNode *N,
  11460. TargetLowering::DAGCombinerInfo &DCI,
  11461. const ARMSubtarget *Subtarget) {
  11462. SelectionDAG &DAG(DCI.DAG);
  11463. if (N->getOpcode() == ARMISD::SUBC && N->hasAnyUseOfValue(1)) {
  11464. // (SUBC (ADDE 0, 0, C), 1) -> C
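// Roughly: ADDE(0, 0, C) just materialises the carry C as 0 or 1, and
// subtracting 1 from it borrows exactly when C == 0; since ARM's subtract
// carry means "no borrow", the carry-out of the SUBC is C again.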
  11465. SDValue LHS = N->getOperand(0);
  11466. SDValue RHS = N->getOperand(1);
  11467. if (LHS->getOpcode() == ARMISD::ADDE &&
  11468. isNullConstant(LHS->getOperand(0)) &&
  11469. isNullConstant(LHS->getOperand(1)) && isOneConstant(RHS)) {
  11470. return DCI.CombineTo(N, SDValue(N, 0), LHS->getOperand(2));
  11471. }
  11472. }
  11473. if (Subtarget->isThumb1Only()) {
  11474. SDValue RHS = N->getOperand(1);
  11475. if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS)) {
  11476. int32_t imm = C->getSExtValue();
  11477. if (imm < 0 && imm > std::numeric_limits<int>::min()) {
  11478. SDLoc DL(N);
  11479. RHS = DAG.getConstant(-imm, DL, MVT::i32);
  11480. unsigned Opcode = (N->getOpcode() == ARMISD::ADDC) ? ARMISD::SUBC
  11481. : ARMISD::ADDC;
  11482. return DAG.getNode(Opcode, DL, N->getVTList(), N->getOperand(0), RHS);
  11483. }
  11484. }
  11485. }
  11486. return SDValue();
  11487. }
  11488. static SDValue PerformAddeSubeCombine(SDNode *N,
  11489. TargetLowering::DAGCombinerInfo &DCI,
  11490. const ARMSubtarget *Subtarget) {
  11491. if (Subtarget->isThumb1Only()) {
  11492. SelectionDAG &DAG = DCI.DAG;
  11493. SDValue RHS = N->getOperand(1);
  11494. if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS)) {
  11495. int64_t imm = C->getSExtValue();
  11496. if (imm < 0) {
  11497. SDLoc DL(N);
  11498. // The with-carry-in form matches bitwise not instead of the negation.
  11499. // Effectively, the inverse interpretation of the carry flag already
  11500. // accounts for part of the negation.
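// Illustrative example: with imm == -5, ~imm == 4 and
//   SUBE(x, 4, carry) == x - 4 - (1 - carry) == x - 5 + carry
//                     == ADDE(x, -5, carry)
// so swapping ADDE<->SUBE and using ~imm preserves the value.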
  11501. RHS = DAG.getConstant(~imm, DL, MVT::i32);
  11502. unsigned Opcode = (N->getOpcode() == ARMISD::ADDE) ? ARMISD::SUBE
  11503. : ARMISD::ADDE;
  11504. return DAG.getNode(Opcode, DL, N->getVTList(),
  11505. N->getOperand(0), RHS, N->getOperand(2));
  11506. }
  11507. }
  11508. } else if (N->getOperand(1)->getOpcode() == ISD::SMUL_LOHI) {
  11509. return AddCombineTo64bitMLAL(N, DCI, Subtarget);
  11510. }
  11511. return SDValue();
  11512. }
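// Try to turn a scalar min/max selected against a vector reduction into one
// of the MVE VMINV/VMAXV operations, which reduce a vector into an existing
// scalar register. An illustrative sketch of the shape being matched:
//   select(setcc(x, vecreduce_umin(v), ult), x, vecreduce_umin(v))
//     == umin(x, vecreduce_umin(v))  ->  VMINVu(x, v)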
  11513. static SDValue PerformSELECTCombine(SDNode *N,
  11514. TargetLowering::DAGCombinerInfo &DCI,
  11515. const ARMSubtarget *Subtarget) {
  11516. if (!Subtarget->hasMVEIntegerOps())
  11517. return SDValue();
  11518. SDLoc dl(N);
  11519. SDValue SetCC;
  11520. SDValue LHS;
  11521. SDValue RHS;
  11522. ISD::CondCode CC;
  11523. SDValue TrueVal;
  11524. SDValue FalseVal;
  11525. if (N->getOpcode() == ISD::SELECT &&
  11526. N->getOperand(0)->getOpcode() == ISD::SETCC) {
  11527. SetCC = N->getOperand(0);
  11528. LHS = SetCC->getOperand(0);
  11529. RHS = SetCC->getOperand(1);
  11530. CC = cast<CondCodeSDNode>(SetCC->getOperand(2))->get();
  11531. TrueVal = N->getOperand(1);
  11532. FalseVal = N->getOperand(2);
  11533. } else if (N->getOpcode() == ISD::SELECT_CC) {
  11534. LHS = N->getOperand(0);
  11535. RHS = N->getOperand(1);
  11536. CC = cast<CondCodeSDNode>(N->getOperand(4))->get();
  11537. TrueVal = N->getOperand(2);
  11538. FalseVal = N->getOperand(3);
  11539. } else {
  11540. return SDValue();
  11541. }
  11542. unsigned int Opcode = 0;
  11543. if ((TrueVal->getOpcode() == ISD::VECREDUCE_UMIN ||
  11544. FalseVal->getOpcode() == ISD::VECREDUCE_UMIN) &&
  11545. (CC == ISD::SETULT || CC == ISD::SETUGT)) {
  11546. Opcode = ARMISD::VMINVu;
  11547. if (CC == ISD::SETUGT)
  11548. std::swap(TrueVal, FalseVal);
  11549. } else if ((TrueVal->getOpcode() == ISD::VECREDUCE_SMIN ||
  11550. FalseVal->getOpcode() == ISD::VECREDUCE_SMIN) &&
  11551. (CC == ISD::SETLT || CC == ISD::SETGT)) {
  11552. Opcode = ARMISD::VMINVs;
  11553. if (CC == ISD::SETGT)
  11554. std::swap(TrueVal, FalseVal);
  11555. } else if ((TrueVal->getOpcode() == ISD::VECREDUCE_UMAX ||
  11556. FalseVal->getOpcode() == ISD::VECREDUCE_UMAX) &&
  11557. (CC == ISD::SETUGT || CC == ISD::SETULT)) {
  11558. Opcode = ARMISD::VMAXVu;
  11559. if (CC == ISD::SETULT)
  11560. std::swap(TrueVal, FalseVal);
  11561. } else if ((TrueVal->getOpcode() == ISD::VECREDUCE_SMAX ||
  11562. FalseVal->getOpcode() == ISD::VECREDUCE_SMAX) &&
  11563. (CC == ISD::SETGT || CC == ISD::SETLT)) {
  11564. Opcode = ARMISD::VMAXVs;
  11565. if (CC == ISD::SETLT)
  11566. std::swap(TrueVal, FalseVal);
  11567. } else
  11568. return SDValue();
  11569. // Normalise to the right hand side being the vector reduction
  11570. switch (TrueVal->getOpcode()) {
  11571. case ISD::VECREDUCE_UMIN:
  11572. case ISD::VECREDUCE_SMIN:
  11573. case ISD::VECREDUCE_UMAX:
  11574. case ISD::VECREDUCE_SMAX:
  11575. std::swap(LHS, RHS);
  11576. std::swap(TrueVal, FalseVal);
  11577. break;
  11578. }
  11579. EVT VectorType = FalseVal->getOperand(0).getValueType();
  11580. if (VectorType != MVT::v16i8 && VectorType != MVT::v8i16 &&
  11581. VectorType != MVT::v4i32)
  11582. return SDValue();
  11583. EVT VectorScalarType = VectorType.getVectorElementType();
  11584. // The values being selected must also be the ones being compared
  11585. if (TrueVal != LHS || FalseVal != RHS)
  11586. return SDValue();
  11587. EVT LeftType = LHS->getValueType(0);
  11588. EVT RightType = RHS->getValueType(0);
  11589. // The types must match the reduced type too
  11590. if (LeftType != VectorScalarType || RightType != VectorScalarType)
  11591. return SDValue();
  11592. // Legalise the scalar to an i32
  11593. if (VectorScalarType != MVT::i32)
  11594. LHS = DCI.DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, LHS);
  11595. // Generate the reduction as an i32 for legalisation purposes
  11596. auto Reduction =
  11597. DCI.DAG.getNode(Opcode, dl, MVT::i32, LHS, RHS->getOperand(0));
  11598. // The result isn't actually an i32 so truncate it back to its original type
  11599. if (VectorScalarType != MVT::i32)
  11600. Reduction = DCI.DAG.getNode(ISD::TRUNCATE, dl, VectorScalarType, Reduction);
  11601. return Reduction;
  11602. }
11603. // A special combine for the vqdmulh family of instructions. This is one of the
11604. // potential set of patterns that could match this instruction. The base pattern
11605. // you would expect to be min(max(ashr(mul(mul(sext(x), 2), sext(y)), 16))).
11606. // This matches the different min(max(ashr(mul(mul(sext(x), sext(y)), 2), 16))),
11607. // which llvm will have optimized to min(ashr(mul(sext(x), sext(y)), 15)) as
11608. // the max is unnecessary.
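// Illustrative i16 example of the equivalence: vqdmulh.s16 computes
//   sat16((2 * sext(a) * sext(b)) >> 16) == sat16((sext(a) * sext(b)) >> 15)
// and (a*b)>>15 can only exceed 32767 when both inputs are -32768, so the
// smin against 32767 matched below provides all the saturation required.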
  11609. static SDValue PerformVQDMULHCombine(SDNode *N, SelectionDAG &DAG) {
  11610. EVT VT = N->getValueType(0);
  11611. SDValue Shft;
  11612. ConstantSDNode *Clamp;
  11613. if (!VT.isVector() || VT.getScalarSizeInBits() > 64)
  11614. return SDValue();
  11615. if (N->getOpcode() == ISD::SMIN) {
  11616. Shft = N->getOperand(0);
  11617. Clamp = isConstOrConstSplat(N->getOperand(1));
  11618. } else if (N->getOpcode() == ISD::VSELECT) {
  11619. // Detect a SMIN, which for an i64 node will be a vselect/setcc, not a smin.
  11620. SDValue Cmp = N->getOperand(0);
  11621. if (Cmp.getOpcode() != ISD::SETCC ||
  11622. cast<CondCodeSDNode>(Cmp.getOperand(2))->get() != ISD::SETLT ||
  11623. Cmp.getOperand(0) != N->getOperand(1) ||
  11624. Cmp.getOperand(1) != N->getOperand(2))
  11625. return SDValue();
  11626. Shft = N->getOperand(1);
  11627. Clamp = isConstOrConstSplat(N->getOperand(2));
  11628. } else
  11629. return SDValue();
  11630. if (!Clamp)
  11631. return SDValue();
  11632. MVT ScalarType;
  11633. int ShftAmt = 0;
  11634. switch (Clamp->getSExtValue()) {
  11635. case (1 << 7) - 1:
  11636. ScalarType = MVT::i8;
  11637. ShftAmt = 7;
  11638. break;
  11639. case (1 << 15) - 1:
  11640. ScalarType = MVT::i16;
  11641. ShftAmt = 15;
  11642. break;
  11643. case (1ULL << 31) - 1:
  11644. ScalarType = MVT::i32;
  11645. ShftAmt = 31;
  11646. break;
  11647. default:
  11648. return SDValue();
  11649. }
  11650. if (Shft.getOpcode() != ISD::SRA)
  11651. return SDValue();
  11652. ConstantSDNode *N1 = isConstOrConstSplat(Shft.getOperand(1));
  11653. if (!N1 || N1->getSExtValue() != ShftAmt)
  11654. return SDValue();
  11655. SDValue Mul = Shft.getOperand(0);
  11656. if (Mul.getOpcode() != ISD::MUL)
  11657. return SDValue();
  11658. SDValue Ext0 = Mul.getOperand(0);
  11659. SDValue Ext1 = Mul.getOperand(1);
  11660. if (Ext0.getOpcode() != ISD::SIGN_EXTEND ||
  11661. Ext1.getOpcode() != ISD::SIGN_EXTEND)
  11662. return SDValue();
  11663. EVT VecVT = Ext0.getOperand(0).getValueType();
  11664. if (!VecVT.isPow2VectorType() || VecVT.getVectorNumElements() == 1)
  11665. return SDValue();
  11666. if (Ext1.getOperand(0).getValueType() != VecVT ||
  11667. VecVT.getScalarType() != ScalarType ||
  11668. VT.getScalarSizeInBits() < ScalarType.getScalarSizeInBits() * 2)
  11669. return SDValue();
  11670. SDLoc DL(Mul);
  11671. unsigned LegalLanes = 128 / (ShftAmt + 1);
  11672. EVT LegalVecVT = MVT::getVectorVT(ScalarType, LegalLanes);
11673. // For types smaller than legal vectors, extend to be legal and only use the
11674. // needed lanes.
  11675. if (VecVT.getSizeInBits() < 128) {
  11676. EVT ExtVecVT =
  11677. MVT::getVectorVT(MVT::getIntegerVT(128 / VecVT.getVectorNumElements()),
  11678. VecVT.getVectorNumElements());
  11679. SDValue Inp0 =
  11680. DAG.getNode(ISD::ANY_EXTEND, DL, ExtVecVT, Ext0.getOperand(0));
  11681. SDValue Inp1 =
  11682. DAG.getNode(ISD::ANY_EXTEND, DL, ExtVecVT, Ext1.getOperand(0));
  11683. Inp0 = DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, LegalVecVT, Inp0);
  11684. Inp1 = DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, LegalVecVT, Inp1);
  11685. SDValue VQDMULH = DAG.getNode(ARMISD::VQDMULH, DL, LegalVecVT, Inp0, Inp1);
  11686. SDValue Trunc = DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, ExtVecVT, VQDMULH);
  11687. Trunc = DAG.getNode(ISD::TRUNCATE, DL, VecVT, Trunc);
  11688. return DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Trunc);
  11689. }
  11690. // For larger types, split into legal sized chunks.
  11691. assert(VecVT.getSizeInBits() % 128 == 0 && "Expected a power2 type");
  11692. unsigned NumParts = VecVT.getSizeInBits() / 128;
  11693. SmallVector<SDValue> Parts;
  11694. for (unsigned I = 0; I < NumParts; ++I) {
  11695. SDValue Inp0 =
  11696. DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, LegalVecVT, Ext0.getOperand(0),
  11697. DAG.getVectorIdxConstant(I * LegalLanes, DL));
  11698. SDValue Inp1 =
  11699. DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, LegalVecVT, Ext1.getOperand(0),
  11700. DAG.getVectorIdxConstant(I * LegalLanes, DL));
  11701. SDValue VQDMULH = DAG.getNode(ARMISD::VQDMULH, DL, LegalVecVT, Inp0, Inp1);
  11702. Parts.push_back(VQDMULH);
  11703. }
  11704. return DAG.getNode(ISD::SIGN_EXTEND, DL, VT,
  11705. DAG.getNode(ISD::CONCAT_VECTORS, DL, VecVT, Parts));
  11706. }
  11707. static SDValue PerformVSELECTCombine(SDNode *N,
  11708. TargetLowering::DAGCombinerInfo &DCI,
  11709. const ARMSubtarget *Subtarget) {
  11710. if (!Subtarget->hasMVEIntegerOps())
  11711. return SDValue();
  11712. if (SDValue V = PerformVQDMULHCombine(N, DCI.DAG))
  11713. return V;
  11714. // Transforms vselect(not(cond), lhs, rhs) into vselect(cond, rhs, lhs).
  11715. //
  11716. // We need to re-implement this optimization here as the implementation in the
  11717. // Target-Independent DAGCombiner does not handle the kind of constant we make
  11718. // (it calls isConstOrConstSplat with AllowTruncation set to false - and for
  11719. // good reason, allowing truncation there would break other targets).
  11720. //
  11721. // Currently, this is only done for MVE, as it's the only target that benefits
  11722. // from this transformation (e.g. VPNOT+VPSEL becomes a single VPSEL).
  11723. if (N->getOperand(0).getOpcode() != ISD::XOR)
  11724. return SDValue();
  11725. SDValue XOR = N->getOperand(0);
  11726. // Check if the XOR's RHS is either a 1, or a BUILD_VECTOR of 1s.
  11727. // It is important to check with truncation allowed as the BUILD_VECTORs we
  11728. // generate in those situations will truncate their operands.
  11729. ConstantSDNode *Const =
  11730. isConstOrConstSplat(XOR->getOperand(1), /*AllowUndefs*/ false,
  11731. /*AllowTruncation*/ true);
  11732. if (!Const || !Const->isOne())
  11733. return SDValue();
  11734. // Rewrite into vselect(cond, rhs, lhs).
  11735. SDValue Cond = XOR->getOperand(0);
  11736. SDValue LHS = N->getOperand(1);
  11737. SDValue RHS = N->getOperand(2);
  11738. EVT Type = N->getValueType(0);
  11739. return DCI.DAG.getNode(ISD::VSELECT, SDLoc(N), Type, Cond, RHS, LHS);
  11740. }
  11741. // Convert vsetcc([0,1,2,..], splat(n), ult) -> vctp n
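// Illustrative v4i1 example: setcc(<0,1,2,3>, splat(n), setult) yields a
// predicate whose first n lanes are true, which is what the MVE VCTP32
// instruction produces directly from the scalar n.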
  11742. static SDValue PerformVSetCCToVCTPCombine(SDNode *N,
  11743. TargetLowering::DAGCombinerInfo &DCI,
  11744. const ARMSubtarget *Subtarget) {
  11745. SDValue Op0 = N->getOperand(0);
  11746. SDValue Op1 = N->getOperand(1);
  11747. ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
  11748. EVT VT = N->getValueType(0);
  11749. if (!Subtarget->hasMVEIntegerOps() ||
  11750. !DCI.DAG.getTargetLoweringInfo().isTypeLegal(VT))
  11751. return SDValue();
  11752. if (CC == ISD::SETUGE) {
  11753. std::swap(Op0, Op1);
  11754. CC = ISD::SETULT;
  11755. }
  11756. if (CC != ISD::SETULT || VT.getScalarSizeInBits() != 1 ||
  11757. Op0.getOpcode() != ISD::BUILD_VECTOR)
  11758. return SDValue();
  11759. // Check first operand is BuildVector of 0,1,2,...
  11760. for (unsigned I = 0; I < VT.getVectorNumElements(); I++) {
  11761. if (!Op0.getOperand(I).isUndef() &&
  11762. !(isa<ConstantSDNode>(Op0.getOperand(I)) &&
  11763. Op0.getConstantOperandVal(I) == I))
  11764. return SDValue();
  11765. }
11766. // The second operand must be a splat; its splatted scalar value is Op1S.
  11767. SDValue Op1S = DCI.DAG.getSplatValue(Op1);
  11768. if (!Op1S)
  11769. return SDValue();
  11770. unsigned Opc;
  11771. switch (VT.getVectorNumElements()) {
  11772. case 2:
  11773. Opc = Intrinsic::arm_mve_vctp64;
  11774. break;
  11775. case 4:
  11776. Opc = Intrinsic::arm_mve_vctp32;
  11777. break;
  11778. case 8:
  11779. Opc = Intrinsic::arm_mve_vctp16;
  11780. break;
  11781. case 16:
  11782. Opc = Intrinsic::arm_mve_vctp8;
  11783. break;
  11784. default:
  11785. return SDValue();
  11786. }
  11787. SDLoc DL(N);
  11788. return DCI.DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
  11789. DCI.DAG.getConstant(Opc, DL, MVT::i32),
  11790. DCI.DAG.getZExtOrTrunc(Op1S, DL, MVT::i32));
  11791. }
  11792. static SDValue PerformABSCombine(SDNode *N,
  11793. TargetLowering::DAGCombinerInfo &DCI,
  11794. const ARMSubtarget *Subtarget) {
  11795. SelectionDAG &DAG = DCI.DAG;
  11796. const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  11797. if (TLI.isOperationLegal(N->getOpcode(), N->getValueType(0)))
  11798. return SDValue();
  11799. return TLI.expandABS(N, DAG);
  11800. }
  11801. /// PerformADDECombine - Target-specific dag combine transform from
11802. /// ARMISD::ADDC, ARMISD::ADDE and ISD::UMUL_LOHI/SMUL_LOHI to MLAL or
  11803. /// ARMISD::ADDC, ARMISD::ADDE and ARMISD::UMLAL to ARMISD::UMAAL
  11804. static SDValue PerformADDECombine(SDNode *N,
  11805. TargetLowering::DAGCombinerInfo &DCI,
  11806. const ARMSubtarget *Subtarget) {
  11807. // Only ARM and Thumb2 support UMLAL/SMLAL.
  11808. if (Subtarget->isThumb1Only())
  11809. return PerformAddeSubeCombine(N, DCI, Subtarget);
  11810. // Only perform the checks after legalize when the pattern is available.
  11811. if (DCI.isBeforeLegalize()) return SDValue();
  11812. return AddCombineTo64bitUMAAL(N, DCI, Subtarget);
  11813. }
  11814. /// PerformADDCombineWithOperands - Try DAG combinations for an ADD with
  11815. /// operands N0 and N1. This is a helper for PerformADDCombine that is
  11816. /// called with the default operands, and if that fails, with commuted
  11817. /// operands.
  11818. static SDValue PerformADDCombineWithOperands(SDNode *N, SDValue N0, SDValue N1,
  11819. TargetLowering::DAGCombinerInfo &DCI,
  11820. const ARMSubtarget *Subtarget){
  11821. // Attempt to create vpadd for this add.
  11822. if (SDValue Result = AddCombineToVPADD(N, N0, N1, DCI, Subtarget))
  11823. return Result;
  11824. // Attempt to create vpaddl for this add.
  11825. if (SDValue Result = AddCombineVUZPToVPADDL(N, N0, N1, DCI, Subtarget))
  11826. return Result;
  11827. if (SDValue Result = AddCombineBUILD_VECTORToVPADDL(N, N0, N1, DCI,
  11828. Subtarget))
  11829. return Result;
11830. // fold (add (select cc, 0, c), x) -> (select cc, x, (add x, c))
  11831. if (N0.getNode()->hasOneUse())
  11832. if (SDValue Result = combineSelectAndUse(N, N0, N1, DCI))
  11833. return Result;
  11834. return SDValue();
  11835. }
  11836. static SDValue TryDistrubutionADDVecReduce(SDNode *N, SelectionDAG &DAG) {
  11837. EVT VT = N->getValueType(0);
  11838. SDValue N0 = N->getOperand(0);
  11839. SDValue N1 = N->getOperand(1);
  11840. SDLoc dl(N);
  11841. auto IsVecReduce = [](SDValue Op) {
  11842. switch (Op.getOpcode()) {
  11843. case ISD::VECREDUCE_ADD:
  11844. case ARMISD::VADDVs:
  11845. case ARMISD::VADDVu:
  11846. case ARMISD::VMLAVs:
  11847. case ARMISD::VMLAVu:
  11848. return true;
  11849. }
  11850. return false;
  11851. };
  11852. auto DistrubuteAddAddVecReduce = [&](SDValue N0, SDValue N1) {
  11853. // Distribute add(X, add(vecreduce(Y), vecreduce(Z))) ->
  11854. // add(add(X, vecreduce(Y)), vecreduce(Z))
  11855. // to make better use of vaddva style instructions.
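// Illustratively, add(x, add(vecreduce(y), vecreduce(z))) can then become
// VADDVA(VADDVA(x, y), z): two accumulating reductions instead of two plain
// reductions followed by scalar adds.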
  11856. if (VT == MVT::i32 && N1.getOpcode() == ISD::ADD && !IsVecReduce(N0) &&
  11857. IsVecReduce(N1.getOperand(0)) && IsVecReduce(N1.getOperand(1)) &&
  11858. !isa<ConstantSDNode>(N0)) {
  11859. SDValue Add0 = DAG.getNode(ISD::ADD, dl, VT, N0, N1.getOperand(0));
  11860. return DAG.getNode(ISD::ADD, dl, VT, Add0, N1.getOperand(1));
  11861. }
  11862. // And turn add(add(A, reduce(B)), add(C, reduce(D))) ->
  11863. // add(add(add(A, C), reduce(B)), reduce(D))
  11864. if (VT == MVT::i32 && N0.getOpcode() == ISD::ADD &&
  11865. N1.getOpcode() == ISD::ADD) {
  11866. unsigned N0RedOp = 0;
  11867. if (!IsVecReduce(N0.getOperand(N0RedOp))) {
  11868. N0RedOp = 1;
  11869. if (!IsVecReduce(N0.getOperand(N0RedOp)))
  11870. return SDValue();
  11871. }
  11872. unsigned N1RedOp = 0;
  11873. if (!IsVecReduce(N1.getOperand(N1RedOp)))
  11874. N1RedOp = 1;
  11875. if (!IsVecReduce(N1.getOperand(N1RedOp)))
  11876. return SDValue();
  11877. SDValue Add0 = DAG.getNode(ISD::ADD, dl, VT, N0.getOperand(1 - N0RedOp),
  11878. N1.getOperand(1 - N1RedOp));
  11879. SDValue Add1 =
  11880. DAG.getNode(ISD::ADD, dl, VT, Add0, N0.getOperand(N0RedOp));
  11881. return DAG.getNode(ISD::ADD, dl, VT, Add1, N1.getOperand(N1RedOp));
  11882. }
  11883. return SDValue();
  11884. };
  11885. if (SDValue R = DistrubuteAddAddVecReduce(N0, N1))
  11886. return R;
  11887. if (SDValue R = DistrubuteAddAddVecReduce(N1, N0))
  11888. return R;
  11889. // Distribute add(vecreduce(load(Y)), vecreduce(load(Z)))
  11890. // Or add(add(X, vecreduce(load(Y))), vecreduce(load(Z)))
  11891. // by ascending load offsets. This can help cores prefetch if the order of
  11892. // loads is more predictable.
  11893. auto DistrubuteVecReduceLoad = [&](SDValue N0, SDValue N1, bool IsForward) {
  11894. // Check if two reductions are known to load data where one is before/after
11895. // another. Return negative if N0 loads data before N1, positive if N1 is
11896. // before N0, and 0 if nothing is known.
  11897. auto IsKnownOrderedLoad = [&](SDValue N0, SDValue N1) {
  11898. // Look through to the first operand of a MUL, for the VMLA case.
  11899. // Currently only looks at the first operand, in the hope they are equal.
  11900. if (N0.getOpcode() == ISD::MUL)
  11901. N0 = N0.getOperand(0);
  11902. if (N1.getOpcode() == ISD::MUL)
  11903. N1 = N1.getOperand(0);
  11904. // Return true if the two operands are loads to the same object and the
  11905. // offset of the first is known to be less than the offset of the second.
  11906. LoadSDNode *Load0 = dyn_cast<LoadSDNode>(N0);
  11907. LoadSDNode *Load1 = dyn_cast<LoadSDNode>(N1);
  11908. if (!Load0 || !Load1 || Load0->getChain() != Load1->getChain() ||
  11909. !Load0->isSimple() || !Load1->isSimple() || Load0->isIndexed() ||
  11910. Load1->isIndexed())
  11911. return 0;
  11912. auto BaseLocDecomp0 = BaseIndexOffset::match(Load0, DAG);
  11913. auto BaseLocDecomp1 = BaseIndexOffset::match(Load1, DAG);
  11914. if (!BaseLocDecomp0.getBase() ||
  11915. BaseLocDecomp0.getBase() != BaseLocDecomp1.getBase() ||
  11916. !BaseLocDecomp0.hasValidOffset() || !BaseLocDecomp1.hasValidOffset())
  11917. return 0;
  11918. if (BaseLocDecomp0.getOffset() < BaseLocDecomp1.getOffset())
  11919. return -1;
  11920. if (BaseLocDecomp0.getOffset() > BaseLocDecomp1.getOffset())
  11921. return 1;
  11922. return 0;
  11923. };
  11924. SDValue X;
  11925. if (N0.getOpcode() == ISD::ADD) {
  11926. if (IsVecReduce(N0.getOperand(0)) && IsVecReduce(N0.getOperand(1))) {
  11927. int IsBefore = IsKnownOrderedLoad(N0.getOperand(0).getOperand(0),
  11928. N0.getOperand(1).getOperand(0));
  11929. if (IsBefore < 0) {
  11930. X = N0.getOperand(0);
  11931. N0 = N0.getOperand(1);
  11932. } else if (IsBefore > 0) {
  11933. X = N0.getOperand(1);
  11934. N0 = N0.getOperand(0);
  11935. } else
  11936. return SDValue();
  11937. } else if (IsVecReduce(N0.getOperand(0))) {
  11938. X = N0.getOperand(1);
  11939. N0 = N0.getOperand(0);
  11940. } else if (IsVecReduce(N0.getOperand(1))) {
  11941. X = N0.getOperand(0);
  11942. N0 = N0.getOperand(1);
  11943. } else
  11944. return SDValue();
  11945. } else if (IsForward && IsVecReduce(N0) && IsVecReduce(N1) &&
  11946. IsKnownOrderedLoad(N0.getOperand(0), N1.getOperand(0)) < 0) {
11947. // Note this is backward to how you would expect. We create
11948. // add(reduce(load + 16), reduce(load + 0)) so that the
11949. // add(reduce(load + 16), X) is combined into VADDVA(X, load + 16), leaving
11950. // the X as VADDV(load + 0).
  11951. return DAG.getNode(ISD::ADD, dl, VT, N1, N0);
  11952. } else
  11953. return SDValue();
  11954. if (!IsVecReduce(N0) || !IsVecReduce(N1))
  11955. return SDValue();
  11956. if (IsKnownOrderedLoad(N1.getOperand(0), N0.getOperand(0)) >= 0)
  11957. return SDValue();
  11958. // Switch from add(add(X, N0), N1) to add(add(X, N1), N0)
  11959. SDValue Add0 = DAG.getNode(ISD::ADD, dl, VT, X, N1);
  11960. return DAG.getNode(ISD::ADD, dl, VT, Add0, N0);
  11961. };
  11962. if (SDValue R = DistrubuteVecReduceLoad(N0, N1, true))
  11963. return R;
  11964. if (SDValue R = DistrubuteVecReduceLoad(N1, N0, false))
  11965. return R;
  11966. return SDValue();
  11967. }
  11968. static SDValue PerformADDVecReduce(SDNode *N, SelectionDAG &DAG,
  11969. const ARMSubtarget *Subtarget) {
  11970. if (!Subtarget->hasMVEIntegerOps())
  11971. return SDValue();
  11972. if (SDValue R = TryDistrubutionADDVecReduce(N, DAG))
  11973. return R;
  11974. EVT VT = N->getValueType(0);
  11975. SDValue N0 = N->getOperand(0);
  11976. SDValue N1 = N->getOperand(1);
  11977. SDLoc dl(N);
  11978. if (VT != MVT::i64)
  11979. return SDValue();
11980. // We are looking for an i64 add of a VADDLVx. Due to these being i64s, this
  11981. // will look like:
  11982. // t1: i32,i32 = ARMISD::VADDLVs x
  11983. // t2: i64 = build_pair t1, t1:1
  11984. // t3: i64 = add t2, y
  11985. // Otherwise we try to push the add up above VADDLVAx, to potentially allow
11986. // the add to be simplified separately.
11987. // We also need to check for sext / zext and commutative adds.
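// Illustratively, the example above is rebuilt as roughly:
//   t4: i32,i32 = ARMISD::VADDLVAs y_lo, y_hi, x
//   t5: i64 = build_pair t4, t4:1
// where y_lo/y_hi are the two i32 halves of y, so the scalar add has been
// folded into the accumulating form of the reduction.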
  11988. auto MakeVecReduce = [&](unsigned Opcode, unsigned OpcodeA, SDValue NA,
  11989. SDValue NB) {
  11990. if (NB->getOpcode() != ISD::BUILD_PAIR)
  11991. return SDValue();
  11992. SDValue VecRed = NB->getOperand(0);
  11993. if ((VecRed->getOpcode() != Opcode && VecRed->getOpcode() != OpcodeA) ||
  11994. VecRed.getResNo() != 0 ||
  11995. NB->getOperand(1) != SDValue(VecRed.getNode(), 1))
  11996. return SDValue();
  11997. if (VecRed->getOpcode() == OpcodeA) {
  11998. // add(NA, VADDLVA(Inp), Y) -> VADDLVA(add(NA, Inp), Y)
  11999. SDValue Inp = DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64,
  12000. VecRed.getOperand(0), VecRed.getOperand(1));
  12001. NA = DAG.getNode(ISD::ADD, dl, MVT::i64, Inp, NA);
  12002. }
  12003. SmallVector<SDValue, 4> Ops;
  12004. Ops.push_back(DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, NA,
  12005. DAG.getConstant(0, dl, MVT::i32)));
  12006. Ops.push_back(DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, NA,
  12007. DAG.getConstant(1, dl, MVT::i32)));
  12008. unsigned S = VecRed->getOpcode() == OpcodeA ? 2 : 0;
  12009. for (unsigned I = S, E = VecRed.getNumOperands(); I < E; I++)
  12010. Ops.push_back(VecRed->getOperand(I));
  12011. SDValue Red =
  12012. DAG.getNode(OpcodeA, dl, DAG.getVTList({MVT::i32, MVT::i32}), Ops);
  12013. return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Red,
  12014. SDValue(Red.getNode(), 1));
  12015. };
  12016. if (SDValue M = MakeVecReduce(ARMISD::VADDLVs, ARMISD::VADDLVAs, N0, N1))
  12017. return M;
  12018. if (SDValue M = MakeVecReduce(ARMISD::VADDLVu, ARMISD::VADDLVAu, N0, N1))
  12019. return M;
  12020. if (SDValue M = MakeVecReduce(ARMISD::VADDLVs, ARMISD::VADDLVAs, N1, N0))
  12021. return M;
  12022. if (SDValue M = MakeVecReduce(ARMISD::VADDLVu, ARMISD::VADDLVAu, N1, N0))
  12023. return M;
  12024. if (SDValue M = MakeVecReduce(ARMISD::VADDLVps, ARMISD::VADDLVAps, N0, N1))
  12025. return M;
  12026. if (SDValue M = MakeVecReduce(ARMISD::VADDLVpu, ARMISD::VADDLVApu, N0, N1))
  12027. return M;
  12028. if (SDValue M = MakeVecReduce(ARMISD::VADDLVps, ARMISD::VADDLVAps, N1, N0))
  12029. return M;
  12030. if (SDValue M = MakeVecReduce(ARMISD::VADDLVpu, ARMISD::VADDLVApu, N1, N0))
  12031. return M;
  12032. if (SDValue M = MakeVecReduce(ARMISD::VMLALVs, ARMISD::VMLALVAs, N0, N1))
  12033. return M;
  12034. if (SDValue M = MakeVecReduce(ARMISD::VMLALVu, ARMISD::VMLALVAu, N0, N1))
  12035. return M;
  12036. if (SDValue M = MakeVecReduce(ARMISD::VMLALVs, ARMISD::VMLALVAs, N1, N0))
  12037. return M;
  12038. if (SDValue M = MakeVecReduce(ARMISD::VMLALVu, ARMISD::VMLALVAu, N1, N0))
  12039. return M;
  12040. if (SDValue M = MakeVecReduce(ARMISD::VMLALVps, ARMISD::VMLALVAps, N0, N1))
  12041. return M;
  12042. if (SDValue M = MakeVecReduce(ARMISD::VMLALVpu, ARMISD::VMLALVApu, N0, N1))
  12043. return M;
  12044. if (SDValue M = MakeVecReduce(ARMISD::VMLALVps, ARMISD::VMLALVAps, N1, N0))
  12045. return M;
  12046. if (SDValue M = MakeVecReduce(ARMISD::VMLALVpu, ARMISD::VMLALVApu, N1, N0))
  12047. return M;
  12048. return SDValue();
  12049. }
  12050. bool
  12051. ARMTargetLowering::isDesirableToCommuteWithShift(const SDNode *N,
  12052. CombineLevel Level) const {
  12053. if (Level == BeforeLegalizeTypes)
  12054. return true;
  12055. if (N->getOpcode() != ISD::SHL)
  12056. return true;
  12057. if (Subtarget->isThumb1Only()) {
  12058. // Avoid making expensive immediates by commuting shifts. (This logic
  12059. // only applies to Thumb1 because ARM and Thumb2 immediates can be shifted
  12060. // for free.)
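// Illustrative example: for (shl (add x, 200), 4), commuting would produce
// (add (shl x, 4), 3200); 200 fits an 8-bit immediate but 3200 would have to
// be materialised separately in Thumb1, so we return false below.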
  12061. if (N->getOpcode() != ISD::SHL)
  12062. return true;
  12063. SDValue N1 = N->getOperand(0);
  12064. if (N1->getOpcode() != ISD::ADD && N1->getOpcode() != ISD::AND &&
  12065. N1->getOpcode() != ISD::OR && N1->getOpcode() != ISD::XOR)
  12066. return true;
  12067. if (auto *Const = dyn_cast<ConstantSDNode>(N1->getOperand(1))) {
  12068. if (Const->getAPIntValue().ult(256))
  12069. return false;
  12070. if (N1->getOpcode() == ISD::ADD && Const->getAPIntValue().slt(0) &&
  12071. Const->getAPIntValue().sgt(-256))
  12072. return false;
  12073. }
  12074. return true;
  12075. }
  12076. // Turn off commute-with-shift transform after legalization, so it doesn't
  12077. // conflict with PerformSHLSimplify. (We could try to detect when
  12078. // PerformSHLSimplify would trigger more precisely, but it isn't
  12079. // really necessary.)
  12080. return false;
  12081. }
  12082. bool ARMTargetLowering::shouldFoldConstantShiftPairToMask(
  12083. const SDNode *N, CombineLevel Level) const {
  12084. if (!Subtarget->isThumb1Only())
  12085. return true;
  12086. if (Level == BeforeLegalizeTypes)
  12087. return true;
  12088. return false;
  12089. }
  12090. bool ARMTargetLowering::preferIncOfAddToSubOfNot(EVT VT) const {
  12091. if (!Subtarget->hasNEON()) {
  12092. if (Subtarget->isThumb1Only())
  12093. return VT.getScalarSizeInBits() <= 32;
  12094. return true;
  12095. }
  12096. return VT.isScalarInteger();
  12097. }
  12098. bool ARMTargetLowering::shouldConvertFpToSat(unsigned Op, EVT FPVT,
  12099. EVT VT) const {
  12100. if (!isOperationLegalOrCustom(Op, VT) || !FPVT.isSimple())
  12101. return false;
  12102. switch (FPVT.getSimpleVT().SimpleTy) {
  12103. case MVT::f16:
  12104. return Subtarget->hasVFP2Base();
  12105. case MVT::f32:
  12106. return Subtarget->hasVFP2Base();
  12107. case MVT::f64:
  12108. return Subtarget->hasFP64();
  12109. case MVT::v4f32:
  12110. case MVT::v8f16:
  12111. return Subtarget->hasMVEFloatOps();
  12112. default:
  12113. return false;
  12114. }
  12115. }
  12116. static SDValue PerformSHLSimplify(SDNode *N,
  12117. TargetLowering::DAGCombinerInfo &DCI,
  12118. const ARMSubtarget *ST) {
  12119. // Allow the generic combiner to identify potential bswaps.
  12120. if (DCI.isBeforeLegalize())
  12121. return SDValue();
  12122. // DAG combiner will fold:
  12123. // (shl (add x, c1), c2) -> (add (shl x, c2), c1 << c2)
12124. // (shl (or x, c1), c2) -> (or (shl x, c2), c1 << c2)
12125. // Other code patterns that can also be modified have the following form:
12126. // b + ((a << 1) | 510)
12127. // b + ((a << 1) & 510)
12128. // b + ((a << 1) ^ 510)
12129. // b + ((a << 1) + 510)
12130. // Many instructions can perform the shift for free, but it requires both
12131. // the operands to be registers. If c1 << c2 is too large, a mov immediate
12132. // instruction will be needed. So, unfold back to the original pattern if:
12133. // - c1 and c2 are small enough that they don't require mov imms.
12134. // - the user(s) of the node can perform an shl.
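// Illustrative example: for b + ((a << 1) | 510) the combiner has produced
// (or (shl a, 1), 510); since 510 == 255 << 1 this is unfolded back to
// (shl (or a, 255), 1), where both 255 and the shift are cheap to encode.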
  12135. // No shifted operands for 16-bit instructions.
  12136. if (ST->isThumb() && ST->isThumb1Only())
  12137. return SDValue();
  12138. // Check that all the users could perform the shl themselves.
  12139. for (auto U : N->uses()) {
  12140. switch(U->getOpcode()) {
  12141. default:
  12142. return SDValue();
  12143. case ISD::SUB:
  12144. case ISD::ADD:
  12145. case ISD::AND:
  12146. case ISD::OR:
  12147. case ISD::XOR:
  12148. case ISD::SETCC:
  12149. case ARMISD::CMP:
  12150. // Check that the user isn't already using a constant because there
  12151. // aren't any instructions that support an immediate operand and a
  12152. // shifted operand.
  12153. if (isa<ConstantSDNode>(U->getOperand(0)) ||
  12154. isa<ConstantSDNode>(U->getOperand(1)))
  12155. return SDValue();
  12156. // Check that it's not already using a shift.
  12157. if (U->getOperand(0).getOpcode() == ISD::SHL ||
  12158. U->getOperand(1).getOpcode() == ISD::SHL)
  12159. return SDValue();
  12160. break;
  12161. }
  12162. }
  12163. if (N->getOpcode() != ISD::ADD && N->getOpcode() != ISD::OR &&
  12164. N->getOpcode() != ISD::XOR && N->getOpcode() != ISD::AND)
  12165. return SDValue();
  12166. if (N->getOperand(0).getOpcode() != ISD::SHL)
  12167. return SDValue();
  12168. SDValue SHL = N->getOperand(0);
  12169. auto *C1ShlC2 = dyn_cast<ConstantSDNode>(N->getOperand(1));
  12170. auto *C2 = dyn_cast<ConstantSDNode>(SHL.getOperand(1));
  12171. if (!C1ShlC2 || !C2)
  12172. return SDValue();
  12173. APInt C2Int = C2->getAPIntValue();
  12174. APInt C1Int = C1ShlC2->getAPIntValue();
  12175. // Check that performing a lshr will not lose any information.
  12176. APInt Mask = APInt::getHighBitsSet(C2Int.getBitWidth(),
  12177. C2Int.getBitWidth() - C2->getZExtValue());
  12178. if ((C1Int & Mask) != C1Int)
  12179. return SDValue();
  12180. // Shift the first constant.
  12181. C1Int.lshrInPlace(C2Int);
  12182. // The immediates are encoded as an 8-bit value that can be rotated.
  12183. auto LargeImm = [](const APInt &Imm) {
  12184. unsigned Zeros = Imm.countLeadingZeros() + Imm.countTrailingZeros();
  12185. return Imm.getBitWidth() - Zeros > 8;
  12186. };
  12187. if (LargeImm(C1Int) || LargeImm(C2Int))
  12188. return SDValue();
  12189. SelectionDAG &DAG = DCI.DAG;
  12190. SDLoc dl(N);
  12191. SDValue X = SHL.getOperand(0);
  12192. SDValue BinOp = DAG.getNode(N->getOpcode(), dl, MVT::i32, X,
  12193. DAG.getConstant(C1Int, dl, MVT::i32));
  12194. // Shift left to compensate for the lshr of C1Int.
  12195. SDValue Res = DAG.getNode(ISD::SHL, dl, MVT::i32, BinOp, SHL.getOperand(1));
  12196. LLVM_DEBUG(dbgs() << "Simplify shl use:\n"; SHL.getOperand(0).dump();
  12197. SHL.dump(); N->dump());
  12198. LLVM_DEBUG(dbgs() << "Into:\n"; X.dump(); BinOp.dump(); Res.dump());
  12199. return Res;
  12200. }
  12201. /// PerformADDCombine - Target-specific dag combine xforms for ISD::ADD.
  12202. ///
  12203. static SDValue PerformADDCombine(SDNode *N,
  12204. TargetLowering::DAGCombinerInfo &DCI,
  12205. const ARMSubtarget *Subtarget) {
  12206. SDValue N0 = N->getOperand(0);
  12207. SDValue N1 = N->getOperand(1);
  12208. // Only works one way, because it needs an immediate operand.
  12209. if (SDValue Result = PerformSHLSimplify(N, DCI, Subtarget))
  12210. return Result;
  12211. if (SDValue Result = PerformADDVecReduce(N, DCI.DAG, Subtarget))
  12212. return Result;
  12213. // First try with the default operand order.
  12214. if (SDValue Result = PerformADDCombineWithOperands(N, N0, N1, DCI, Subtarget))
  12215. return Result;
  12216. // If that didn't work, try again with the operands commuted.
  12217. return PerformADDCombineWithOperands(N, N1, N0, DCI, Subtarget);
  12218. }
  12219. // Combine (sub 0, (csinc X, Y, CC)) -> (csinv -X, Y, CC)
  12220. // providing -X is as cheap as X (currently, just a constant).
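// This works because csinc selects either X or Y + 1, and negating gives
// either -X or -(Y + 1) == ~Y, which is exactly what csinv(-X, Y, CC) selects.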
  12221. static SDValue PerformSubCSINCCombine(SDNode *N, SelectionDAG &DAG) {
  12222. if (N->getValueType(0) != MVT::i32 || !isNullConstant(N->getOperand(0)))
  12223. return SDValue();
  12224. SDValue CSINC = N->getOperand(1);
  12225. if (CSINC.getOpcode() != ARMISD::CSINC || !CSINC.hasOneUse())
  12226. return SDValue();
  12227. ConstantSDNode *X = dyn_cast<ConstantSDNode>(CSINC.getOperand(0));
  12228. if (!X)
  12229. return SDValue();
  12230. return DAG.getNode(ARMISD::CSINV, SDLoc(N), MVT::i32,
  12231. DAG.getNode(ISD::SUB, SDLoc(N), MVT::i32, N->getOperand(0),
  12232. CSINC.getOperand(0)),
  12233. CSINC.getOperand(1), CSINC.getOperand(2),
  12234. CSINC.getOperand(3));
  12235. }
  12236. /// PerformSUBCombine - Target-specific dag combine xforms for ISD::SUB.
  12237. ///
  12238. static SDValue PerformSUBCombine(SDNode *N,
  12239. TargetLowering::DAGCombinerInfo &DCI,
  12240. const ARMSubtarget *Subtarget) {
  12241. SDValue N0 = N->getOperand(0);
  12242. SDValue N1 = N->getOperand(1);
12243. // fold (sub x, (select cc, 0, c)) -> (select cc, x, (sub x, c))
  12244. if (N1.getNode()->hasOneUse())
  12245. if (SDValue Result = combineSelectAndUse(N, N1, N0, DCI))
  12246. return Result;
  12247. if (SDValue R = PerformSubCSINCCombine(N, DCI.DAG))
  12248. return R;
  12249. if (!Subtarget->hasMVEIntegerOps() || !N->getValueType(0).isVector())
  12250. return SDValue();
  12251. // Fold (sub (ARMvmovImm 0), (ARMvdup x)) -> (ARMvdup (sub 0, x))
  12252. // so that we can readily pattern match more mve instructions which can use
  12253. // a scalar operand.
  12254. SDValue VDup = N->getOperand(1);
  12255. if (VDup->getOpcode() != ARMISD::VDUP)
  12256. return SDValue();
  12257. SDValue VMov = N->getOperand(0);
  12258. if (VMov->getOpcode() == ISD::BITCAST)
  12259. VMov = VMov->getOperand(0);
  12260. if (VMov->getOpcode() != ARMISD::VMOVIMM || !isZeroVector(VMov))
  12261. return SDValue();
  12262. SDLoc dl(N);
  12263. SDValue Negate = DCI.DAG.getNode(ISD::SUB, dl, MVT::i32,
  12264. DCI.DAG.getConstant(0, dl, MVT::i32),
  12265. VDup->getOperand(0));
  12266. return DCI.DAG.getNode(ARMISD::VDUP, dl, N->getValueType(0), Negate);
  12267. }
  12268. /// PerformVMULCombine
  12269. /// Distribute (A + B) * C to (A * C) + (B * C) to take advantage of the
  12270. /// special multiplier accumulator forwarding.
  12271. /// vmul d3, d0, d2
  12272. /// vmla d3, d1, d2
  12273. /// is faster than
  12274. /// vadd d3, d0, d1
  12275. /// vmul d3, d3, d2
  12276. // However, for (A + B) * (A + B),
  12277. // vadd d2, d0, d1
  12278. // vmul d3, d0, d2
  12279. // vmla d3, d1, d2
  12280. // is slower than
  12281. // vadd d2, d0, d1
  12282. // vmul d3, d2, d2
  12283. static SDValue PerformVMULCombine(SDNode *N,
  12284. TargetLowering::DAGCombinerInfo &DCI,
  12285. const ARMSubtarget *Subtarget) {
  12286. if (!Subtarget->hasVMLxForwarding())
  12287. return SDValue();
  12288. SelectionDAG &DAG = DCI.DAG;
  12289. SDValue N0 = N->getOperand(0);
  12290. SDValue N1 = N->getOperand(1);
  12291. unsigned Opcode = N0.getOpcode();
  12292. if (Opcode != ISD::ADD && Opcode != ISD::SUB &&
  12293. Opcode != ISD::FADD && Opcode != ISD::FSUB) {
  12294. Opcode = N1.getOpcode();
  12295. if (Opcode != ISD::ADD && Opcode != ISD::SUB &&
  12296. Opcode != ISD::FADD && Opcode != ISD::FSUB)
  12297. return SDValue();
  12298. std::swap(N0, N1);
  12299. }
  12300. if (N0 == N1)
  12301. return SDValue();
  12302. EVT VT = N->getValueType(0);
  12303. SDLoc DL(N);
  12304. SDValue N00 = N0->getOperand(0);
  12305. SDValue N01 = N0->getOperand(1);
  12306. return DAG.getNode(Opcode, DL, VT,
  12307. DAG.getNode(ISD::MUL, DL, VT, N00, N1),
  12308. DAG.getNode(ISD::MUL, DL, VT, N01, N1));
  12309. }
  12310. static SDValue PerformMVEVMULLCombine(SDNode *N, SelectionDAG &DAG,
  12311. const ARMSubtarget *Subtarget) {
  12312. EVT VT = N->getValueType(0);
  12313. if (VT != MVT::v2i64)
  12314. return SDValue();
  12315. SDValue N0 = N->getOperand(0);
  12316. SDValue N1 = N->getOperand(1);
  12317. auto IsSignExt = [&](SDValue Op) {
  12318. if (Op->getOpcode() != ISD::SIGN_EXTEND_INREG)
  12319. return SDValue();
  12320. EVT VT = cast<VTSDNode>(Op->getOperand(1))->getVT();
  12321. if (VT.getScalarSizeInBits() == 32)
  12322. return Op->getOperand(0);
  12323. return SDValue();
  12324. };
  12325. auto IsZeroExt = [&](SDValue Op) {
  12326. // Zero extends are a little more awkward. At the point we are matching
  12327. // this, we are looking for an AND with a (-1, 0, -1, 0) buildvector mask.
12328. // That might be before or after a bitcast depending on how the and is
  12329. // placed. Because this has to look through bitcasts, it is currently only
  12330. // supported on LE.
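// Illustrative LE example: and(v2i64 x, bitcast(v4i32 <-1, 0, -1, 0>)) keeps
// only the low 32 bits of each i64 lane, i.e. it acts as a zero-extension of
// each lane's low half, which is the input shape VMULLu wants.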
  12331. if (!Subtarget->isLittle())
  12332. return SDValue();
  12333. SDValue And = Op;
  12334. if (And->getOpcode() == ISD::BITCAST)
  12335. And = And->getOperand(0);
  12336. if (And->getOpcode() != ISD::AND)
  12337. return SDValue();
  12338. SDValue Mask = And->getOperand(1);
  12339. if (Mask->getOpcode() == ISD::BITCAST)
  12340. Mask = Mask->getOperand(0);
  12341. if (Mask->getOpcode() != ISD::BUILD_VECTOR ||
  12342. Mask.getValueType() != MVT::v4i32)
  12343. return SDValue();
  12344. if (isAllOnesConstant(Mask->getOperand(0)) &&
  12345. isNullConstant(Mask->getOperand(1)) &&
  12346. isAllOnesConstant(Mask->getOperand(2)) &&
  12347. isNullConstant(Mask->getOperand(3)))
  12348. return And->getOperand(0);
  12349. return SDValue();
  12350. };
  12351. SDLoc dl(N);
  12352. if (SDValue Op0 = IsSignExt(N0)) {
  12353. if (SDValue Op1 = IsSignExt(N1)) {
  12354. SDValue New0a = DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, MVT::v4i32, Op0);
  12355. SDValue New1a = DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, MVT::v4i32, Op1);
  12356. return DAG.getNode(ARMISD::VMULLs, dl, VT, New0a, New1a);
  12357. }
  12358. }
  12359. if (SDValue Op0 = IsZeroExt(N0)) {
  12360. if (SDValue Op1 = IsZeroExt(N1)) {
  12361. SDValue New0a = DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, MVT::v4i32, Op0);
  12362. SDValue New1a = DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, MVT::v4i32, Op1);
  12363. return DAG.getNode(ARMISD::VMULLu, dl, VT, New0a, New1a);
  12364. }
  12365. }
  12366. return SDValue();
  12367. }
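// PerformMULCombine - Expand i32 multiplications by suitable constants into
// shifts and adds/subs. Illustrative example: mul x, 20 has two trailing zero
// bits and 20 >> 2 == 5 == 2^2 + 1, so it becomes roughly ((x << 2) + x) << 2.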
  12368. static SDValue PerformMULCombine(SDNode *N,
  12369. TargetLowering::DAGCombinerInfo &DCI,
  12370. const ARMSubtarget *Subtarget) {
  12371. SelectionDAG &DAG = DCI.DAG;
  12372. EVT VT = N->getValueType(0);
  12373. if (Subtarget->hasMVEIntegerOps() && VT == MVT::v2i64)
  12374. return PerformMVEVMULLCombine(N, DAG, Subtarget);
  12375. if (Subtarget->isThumb1Only())
  12376. return SDValue();
  12377. if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
  12378. return SDValue();
  12379. if (VT.is64BitVector() || VT.is128BitVector())
  12380. return PerformVMULCombine(N, DCI, Subtarget);
  12381. if (VT != MVT::i32)
  12382. return SDValue();
  12383. ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1));
  12384. if (!C)
  12385. return SDValue();
  12386. int64_t MulAmt = C->getSExtValue();
  12387. unsigned ShiftAmt = countTrailingZeros<uint64_t>(MulAmt);
  12388. ShiftAmt = ShiftAmt & (32 - 1);
  12389. SDValue V = N->getOperand(0);
  12390. SDLoc DL(N);
  12391. SDValue Res;
  12392. MulAmt >>= ShiftAmt;
  12393. if (MulAmt >= 0) {
  12394. if (isPowerOf2_32(MulAmt - 1)) {
  12395. // (mul x, 2^N + 1) => (add (shl x, N), x)
  12396. Res = DAG.getNode(ISD::ADD, DL, VT,
  12397. V,
  12398. DAG.getNode(ISD::SHL, DL, VT,
  12399. V,
  12400. DAG.getConstant(Log2_32(MulAmt - 1), DL,
  12401. MVT::i32)));
  12402. } else if (isPowerOf2_32(MulAmt + 1)) {
  12403. // (mul x, 2^N - 1) => (sub (shl x, N), x)
  12404. Res = DAG.getNode(ISD::SUB, DL, VT,
  12405. DAG.getNode(ISD::SHL, DL, VT,
  12406. V,
  12407. DAG.getConstant(Log2_32(MulAmt + 1), DL,
  12408. MVT::i32)),
  12409. V);
  12410. } else
  12411. return SDValue();
  12412. } else {
  12413. uint64_t MulAmtAbs = -MulAmt;
  12414. if (isPowerOf2_32(MulAmtAbs + 1)) {
  12415. // (mul x, -(2^N - 1)) => (sub x, (shl x, N))
  12416. Res = DAG.getNode(ISD::SUB, DL, VT,
  12417. V,
  12418. DAG.getNode(ISD::SHL, DL, VT,
  12419. V,
  12420. DAG.getConstant(Log2_32(MulAmtAbs + 1), DL,
  12421. MVT::i32)));
  12422. } else if (isPowerOf2_32(MulAmtAbs - 1)) {
  12423. // (mul x, -(2^N + 1)) => - (add (shl x, N), x)
  12424. Res = DAG.getNode(ISD::ADD, DL, VT,
  12425. V,
  12426. DAG.getNode(ISD::SHL, DL, VT,
  12427. V,
  12428. DAG.getConstant(Log2_32(MulAmtAbs - 1), DL,
  12429. MVT::i32)));
  12430. Res = DAG.getNode(ISD::SUB, DL, VT,
  12431. DAG.getConstant(0, DL, MVT::i32), Res);
  12432. } else
  12433. return SDValue();
  12434. }
  12435. if (ShiftAmt != 0)
  12436. Res = DAG.getNode(ISD::SHL, DL, VT,
  12437. Res, DAG.getConstant(ShiftAmt, DL, MVT::i32));
  12438. // Do not add new nodes to DAG combiner worklist.
  12439. DCI.CombineTo(N, Res, false);
  12440. return SDValue();
  12441. }
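// Worked examples for PerformMULCombine above (scalar identities, modulo
// 2^32):
//   mul x, 10  : ShiftAmt = 1, reduced MulAmt = 5 = 2^2 + 1
//              -> (shl (add x, (shl x, 2)), 1)
//   mul x, -28 : ShiftAmt = 2, reduced MulAmt = -7, MulAmtAbs = 7 = 2^3 - 1
//              -> (shl (sub x, (shl x, 3)), 2)
// Each replaces a multiply by a constant with a shift/add or shift/sub pair
// plus an optional final shift for the trailing zero bits of the constant.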
  12442. static SDValue CombineANDShift(SDNode *N,
  12443. TargetLowering::DAGCombinerInfo &DCI,
  12444. const ARMSubtarget *Subtarget) {
  12445. // Allow DAGCombine to pattern-match before we touch the canonical form.
  12446. if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
  12447. return SDValue();
  12448. if (N->getValueType(0) != MVT::i32)
  12449. return SDValue();
  12450. ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N->getOperand(1));
  12451. if (!N1C)
  12452. return SDValue();
  12453. uint32_t C1 = (uint32_t)N1C->getZExtValue();
  12454. // Don't transform uxtb/uxth.
  12455. if (C1 == 255 || C1 == 65535)
  12456. return SDValue();
  12457. SDNode *N0 = N->getOperand(0).getNode();
  12458. if (!N0->hasOneUse())
  12459. return SDValue();
  12460. if (N0->getOpcode() != ISD::SHL && N0->getOpcode() != ISD::SRL)
  12461. return SDValue();
  12462. bool LeftShift = N0->getOpcode() == ISD::SHL;
  12463. ConstantSDNode *N01C = dyn_cast<ConstantSDNode>(N0->getOperand(1));
  12464. if (!N01C)
  12465. return SDValue();
  12466. uint32_t C2 = (uint32_t)N01C->getZExtValue();
  12467. if (!C2 || C2 >= 32)
  12468. return SDValue();
  12469. // Clear irrelevant bits in the mask.
  12470. if (LeftShift)
  12471. C1 &= (-1U << C2);
  12472. else
  12473. C1 &= (-1U >> C2);
  12474. SelectionDAG &DAG = DCI.DAG;
  12475. SDLoc DL(N);
  12476. // We have a pattern of the form "(and (shl x, c2) c1)" or
  12477. // "(and (srl x, c2) c1)", where c1 is a shifted mask. Try to
  12478. // transform to a pair of shifts, to save materializing c1.
  12479. // First pattern: right shift, then mask off leading bits.
  12480. // FIXME: Use demanded bits?
  12481. if (!LeftShift && isMask_32(C1)) {
  12482. uint32_t C3 = countLeadingZeros(C1);
  12483. if (C2 < C3) {
  12484. SDValue SHL = DAG.getNode(ISD::SHL, DL, MVT::i32, N0->getOperand(0),
  12485. DAG.getConstant(C3 - C2, DL, MVT::i32));
  12486. return DAG.getNode(ISD::SRL, DL, MVT::i32, SHL,
  12487. DAG.getConstant(C3, DL, MVT::i32));
  12488. }
  12489. }
  12490. // First pattern, reversed: left shift, then mask off trailing bits.
  12491. if (LeftShift && isMask_32(~C1)) {
  12492. uint32_t C3 = countTrailingZeros(C1);
  12493. if (C2 < C3) {
  12494. SDValue SHL = DAG.getNode(ISD::SRL, DL, MVT::i32, N0->getOperand(0),
  12495. DAG.getConstant(C3 - C2, DL, MVT::i32));
  12496. return DAG.getNode(ISD::SHL, DL, MVT::i32, SHL,
  12497. DAG.getConstant(C3, DL, MVT::i32));
  12498. }
  12499. }
  12500. // Second pattern: left shift, then mask off leading bits.
  12501. // FIXME: Use demanded bits?
  12502. if (LeftShift && isShiftedMask_32(C1)) {
  12503. uint32_t Trailing = countTrailingZeros(C1);
  12504. uint32_t C3 = countLeadingZeros(C1);
  12505. if (Trailing == C2 && C2 + C3 < 32) {
  12506. SDValue SHL = DAG.getNode(ISD::SHL, DL, MVT::i32, N0->getOperand(0),
  12507. DAG.getConstant(C2 + C3, DL, MVT::i32));
  12508. return DAG.getNode(ISD::SRL, DL, MVT::i32, SHL,
  12509. DAG.getConstant(C3, DL, MVT::i32));
  12510. }
  12511. }
  12512. // Second pattern, reversed: right shift, then mask off trailing bits.
  12513. // FIXME: Handle other patterns of known/demanded bits.
  12514. if (!LeftShift && isShiftedMask_32(C1)) {
  12515. uint32_t Leading = countLeadingZeros(C1);
  12516. uint32_t C3 = countTrailingZeros(C1);
  12517. if (Leading == C2 && C2 + C3 < 32) {
  12518. SDValue SHL = DAG.getNode(ISD::SRL, DL, MVT::i32, N0->getOperand(0),
  12519. DAG.getConstant(C2 + C3, DL, MVT::i32));
  12520. return DAG.getNode(ISD::SHL, DL, MVT::i32, SHL,
  12521. DAG.getConstant(C3, DL, MVT::i32));
  12522. }
  12523. }
  12524. // FIXME: Transform "(and (shl x, c2) c1)" ->
  12525. // "(shl (and x, c1>>c2), c2)" if "c1 >> c2" is a cheaper immediate than
  12526. // c1.
  12527. return SDValue();
  12528. }
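// Worked examples for CombineANDShift above (Thumb1, where wide immediates
// are expensive):
//   (and (srl x, 2), 0x3f)  : C1 = 0x3f is a mask, C3 = 26, C2 = 2 < C3
//                           -> (srl (shl x, 24), 26)
//   (and (shl x, 4), 0xff0) : shifted mask, Trailing = 4 == C2, C3 = 20
//                           -> (srl (shl x, 24), 20)
// Both replace the AND mask with a second shift, avoiding materializing the
// mask constant.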
  12529. static SDValue PerformANDCombine(SDNode *N,
  12530. TargetLowering::DAGCombinerInfo &DCI,
  12531. const ARMSubtarget *Subtarget) {
  12532. // Attempt to use immediate-form VBIC
  12533. BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(N->getOperand(1));
  12534. SDLoc dl(N);
  12535. EVT VT = N->getValueType(0);
  12536. SelectionDAG &DAG = DCI.DAG;
  12537. if (!DAG.getTargetLoweringInfo().isTypeLegal(VT) || VT == MVT::v2i1 ||
  12538. VT == MVT::v4i1 || VT == MVT::v8i1 || VT == MVT::v16i1)
  12539. return SDValue();
  12540. APInt SplatBits, SplatUndef;
  12541. unsigned SplatBitSize;
  12542. bool HasAnyUndefs;
  12543. if (BVN && (Subtarget->hasNEON() || Subtarget->hasMVEIntegerOps()) &&
  12544. BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) {
  12545. if (SplatBitSize == 8 || SplatBitSize == 16 || SplatBitSize == 32 ||
  12546. SplatBitSize == 64) {
  12547. EVT VbicVT;
  12548. SDValue Val = isVMOVModifiedImm((~SplatBits).getZExtValue(),
  12549. SplatUndef.getZExtValue(), SplatBitSize,
  12550. DAG, dl, VbicVT, VT, OtherModImm);
  12551. if (Val.getNode()) {
  12552. SDValue Input =
  12553. DAG.getNode(ISD::BITCAST, dl, VbicVT, N->getOperand(0));
  12554. SDValue Vbic = DAG.getNode(ARMISD::VBICIMM, dl, VbicVT, Input, Val);
  12555. return DAG.getNode(ISD::BITCAST, dl, VT, Vbic);
  12556. }
  12557. }
  12558. }
  12559. if (!Subtarget->isThumb1Only()) {
  12560. // fold (and (select cc, -1, c), x) -> (select cc, x, (and, x, c))
  12561. if (SDValue Result = combineSelectAndUseCommutative(N, true, DCI))
  12562. return Result;
  12563. if (SDValue Result = PerformSHLSimplify(N, DCI, Subtarget))
  12564. return Result;
  12565. }
  12566. if (Subtarget->isThumb1Only())
  12567. if (SDValue Result = CombineANDShift(N, DCI, Subtarget))
  12568. return Result;
  12569. return SDValue();
  12570. }
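// For illustration (PerformANDCombine above): an AND with a splatted constant
// whose complement fits a VMOV-style modified immediate, e.g. a v4i32
//   (and x, splat(0xffffff00))
// becomes (bitcast (VBICIMM (bitcast x), imm)), i.e. a single vbic with an
// encoded #0xff immediate rather than a separately materialized mask vector.
// (The exact encoding is whatever isVMOVModifiedImm returns for ~SplatBits.)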
  12571. // Try combining OR nodes to SMULWB, SMULWT.
  12572. static SDValue PerformORCombineToSMULWBT(SDNode *OR,
  12573. TargetLowering::DAGCombinerInfo &DCI,
  12574. const ARMSubtarget *Subtarget) {
  12575. if (!Subtarget->hasV6Ops() ||
  12576. (Subtarget->isThumb() &&
  12577. (!Subtarget->hasThumb2() || !Subtarget->hasDSP())))
  12578. return SDValue();
  12579. SDValue SRL = OR->getOperand(0);
  12580. SDValue SHL = OR->getOperand(1);
  12581. if (SRL.getOpcode() != ISD::SRL || SHL.getOpcode() != ISD::SHL) {
  12582. SRL = OR->getOperand(1);
  12583. SHL = OR->getOperand(0);
  12584. }
  12585. if (!isSRL16(SRL) || !isSHL16(SHL))
  12586. return SDValue();
  12587. // The first operands to the shifts need to be the two results from the
  12588. // same smul_lohi node.
  12589. if ((SRL.getOperand(0).getNode() != SHL.getOperand(0).getNode()) ||
  12590. SRL.getOperand(0).getOpcode() != ISD::SMUL_LOHI)
  12591. return SDValue();
  12592. SDNode *SMULLOHI = SRL.getOperand(0).getNode();
  12593. if (SRL.getOperand(0) != SDValue(SMULLOHI, 0) ||
  12594. SHL.getOperand(0) != SDValue(SMULLOHI, 1))
  12595. return SDValue();
  12596. // Now we have:
  12597. // (or (srl (smul_lohi ?, ?), 16), (shl (smul_lohi ?, ?), 16)))
12598. // For SMULW[B|T], smul_lohi will take a 32-bit and a 16-bit argument.
12599. // For SMULWB the 16-bit value will have been sign extended in some way.
  12600. // For SMULWT only the SRA is required.
  12601. // Check both sides of SMUL_LOHI
  12602. SDValue OpS16 = SMULLOHI->getOperand(0);
  12603. SDValue OpS32 = SMULLOHI->getOperand(1);
  12604. SelectionDAG &DAG = DCI.DAG;
  12605. if (!isS16(OpS16, DAG) && !isSRA16(OpS16)) {
  12606. OpS16 = OpS32;
  12607. OpS32 = SMULLOHI->getOperand(0);
  12608. }
  12609. SDLoc dl(OR);
  12610. unsigned Opcode = 0;
  12611. if (isS16(OpS16, DAG))
  12612. Opcode = ARMISD::SMULWB;
  12613. else if (isSRA16(OpS16)) {
  12614. Opcode = ARMISD::SMULWT;
  12615. OpS16 = OpS16->getOperand(0);
  12616. }
  12617. else
  12618. return SDValue();
  12619. SDValue Res = DAG.getNode(Opcode, dl, MVT::i32, OpS32, OpS16);
  12620. DAG.ReplaceAllUsesOfValueWith(SDValue(OR, 0), Res);
  12621. return SDValue(OR, 0);
  12622. }
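// Rationale for the SMULW fold above: smul_lohi produces the 64-bit product
// as {lo, hi} i32 halves, so (or (srl lo, 16), (shl hi, 16)) reconstructs
// bits [47:16] of the product. That is exactly the SMULWB/SMULWT result for a
// 32x16-bit multiply (the top 32 bits of the 48-bit product); the T variant
// takes its 16-bit operand from the top half, hence the stripped SRA #16.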
  12623. static SDValue PerformORCombineToBFI(SDNode *N,
  12624. TargetLowering::DAGCombinerInfo &DCI,
  12625. const ARMSubtarget *Subtarget) {
  12626. // BFI is only available on V6T2+
  12627. if (Subtarget->isThumb1Only() || !Subtarget->hasV6T2Ops())
  12628. return SDValue();
  12629. EVT VT = N->getValueType(0);
  12630. SDValue N0 = N->getOperand(0);
  12631. SDValue N1 = N->getOperand(1);
  12632. SelectionDAG &DAG = DCI.DAG;
  12633. SDLoc DL(N);
  12634. // 1) or (and A, mask), val => ARMbfi A, val, mask
  12635. // iff (val & mask) == val
  12636. //
  12637. // 2) or (and A, mask), (and B, mask2) => ARMbfi A, (lsr B, amt), mask
  12638. // 2a) iff isBitFieldInvertedMask(mask) && isBitFieldInvertedMask(~mask2)
  12639. // && mask == ~mask2
  12640. // 2b) iff isBitFieldInvertedMask(~mask) && isBitFieldInvertedMask(mask2)
  12641. // && ~mask == mask2
  12642. // (i.e., copy a bitfield value into another bitfield of the same width)
  12643. if (VT != MVT::i32)
  12644. return SDValue();
  12645. SDValue N00 = N0.getOperand(0);
  12646. // The value and the mask need to be constants so we can verify this is
  12647. // actually a bitfield set. If the mask is 0xffff, we can do better
  12648. // via a movt instruction, so don't use BFI in that case.
  12649. SDValue MaskOp = N0.getOperand(1);
  12650. ConstantSDNode *MaskC = dyn_cast<ConstantSDNode>(MaskOp);
  12651. if (!MaskC)
  12652. return SDValue();
  12653. unsigned Mask = MaskC->getZExtValue();
  12654. if (Mask == 0xffff)
  12655. return SDValue();
  12656. SDValue Res;
  12657. // Case (1): or (and A, mask), val => ARMbfi A, val, mask
  12658. ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
  12659. if (N1C) {
  12660. unsigned Val = N1C->getZExtValue();
  12661. if ((Val & ~Mask) != Val)
  12662. return SDValue();
  12663. if (ARM::isBitFieldInvertedMask(Mask)) {
  12664. Val >>= countTrailingZeros(~Mask);
  12665. Res = DAG.getNode(ARMISD::BFI, DL, VT, N00,
  12666. DAG.getConstant(Val, DL, MVT::i32),
  12667. DAG.getConstant(Mask, DL, MVT::i32));
  12668. DCI.CombineTo(N, Res, false);
12669. // Return value from the original node to inform the combiner that N is
  12670. // now dead.
  12671. return SDValue(N, 0);
  12672. }
  12673. } else if (N1.getOpcode() == ISD::AND) {
  12674. // case (2) or (and A, mask), (and B, mask2) => ARMbfi A, (lsr B, amt), mask
  12675. ConstantSDNode *N11C = dyn_cast<ConstantSDNode>(N1.getOperand(1));
  12676. if (!N11C)
  12677. return SDValue();
  12678. unsigned Mask2 = N11C->getZExtValue();
12679. // Mask and ~Mask2 (or the reverse) must be equivalent for the BFI pattern
12680. // to match as-is.
  12681. if (ARM::isBitFieldInvertedMask(Mask) &&
  12682. (Mask == ~Mask2)) {
  12683. // The pack halfword instruction works better for masks that fit it,
  12684. // so use that when it's available.
  12685. if (Subtarget->hasDSP() &&
  12686. (Mask == 0xffff || Mask == 0xffff0000))
  12687. return SDValue();
  12688. // 2a
  12689. unsigned amt = countTrailingZeros(Mask2);
  12690. Res = DAG.getNode(ISD::SRL, DL, VT, N1.getOperand(0),
  12691. DAG.getConstant(amt, DL, MVT::i32));
  12692. Res = DAG.getNode(ARMISD::BFI, DL, VT, N00, Res,
  12693. DAG.getConstant(Mask, DL, MVT::i32));
  12694. DCI.CombineTo(N, Res, false);
12695. // Return value from the original node to inform the combiner that N is
  12696. // now dead.
  12697. return SDValue(N, 0);
  12698. } else if (ARM::isBitFieldInvertedMask(~Mask) &&
  12699. (~Mask == Mask2)) {
  12700. // The pack halfword instruction works better for masks that fit it,
  12701. // so use that when it's available.
  12702. if (Subtarget->hasDSP() &&
  12703. (Mask2 == 0xffff || Mask2 == 0xffff0000))
  12704. return SDValue();
  12705. // 2b
  12706. unsigned lsb = countTrailingZeros(Mask);
  12707. Res = DAG.getNode(ISD::SRL, DL, VT, N00,
  12708. DAG.getConstant(lsb, DL, MVT::i32));
  12709. Res = DAG.getNode(ARMISD::BFI, DL, VT, N1.getOperand(0), Res,
  12710. DAG.getConstant(Mask2, DL, MVT::i32));
  12711. DCI.CombineTo(N, Res, false);
12712. // Return value from the original node to inform the combiner that N is
  12713. // now dead.
  12714. return SDValue(N, 0);
  12715. }
  12716. }
  12717. if (DAG.MaskedValueIsZero(N1, MaskC->getAPIntValue()) &&
  12718. N00.getOpcode() == ISD::SHL && isa<ConstantSDNode>(N00.getOperand(1)) &&
  12719. ARM::isBitFieldInvertedMask(~Mask)) {
  12720. // Case (3): or (and (shl A, #shamt), mask), B => ARMbfi B, A, ~mask
  12721. // where lsb(mask) == #shamt and masked bits of B are known zero.
  12722. SDValue ShAmt = N00.getOperand(1);
  12723. unsigned ShAmtC = cast<ConstantSDNode>(ShAmt)->getZExtValue();
  12724. unsigned LSB = countTrailingZeros(Mask);
  12725. if (ShAmtC != LSB)
  12726. return SDValue();
  12727. Res = DAG.getNode(ARMISD::BFI, DL, VT, N1, N00.getOperand(0),
  12728. DAG.getConstant(~Mask, DL, MVT::i32));
  12729. DCI.CombineTo(N, Res, false);
12730. // Return value from the original node to inform the combiner that N is
  12731. // now dead.
  12732. return SDValue(N, 0);
  12733. }
  12734. return SDValue();
  12735. }
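// Worked example for case (1) above:
//   (or (and A, 0xffff00ff), 0x00001200)
// has Mask = 0xffff00ff (an inverted bitfield mask), Val = 0x1200 with
// (Val & ~Mask) == Val, and Val >> countTrailingZeros(~Mask) = 0x12, so it
// becomes (ARMbfi A, 0x12, 0xffff00ff): insert 0x12 into bits [15:8] of A.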
  12736. static bool isValidMVECond(unsigned CC, bool IsFloat) {
  12737. switch (CC) {
  12738. case ARMCC::EQ:
  12739. case ARMCC::NE:
  12740. case ARMCC::LE:
  12741. case ARMCC::GT:
  12742. case ARMCC::GE:
  12743. case ARMCC::LT:
  12744. return true;
  12745. case ARMCC::HS:
  12746. case ARMCC::HI:
  12747. return !IsFloat;
  12748. default:
  12749. return false;
  12750. };
  12751. }
  12752. static ARMCC::CondCodes getVCMPCondCode(SDValue N) {
  12753. if (N->getOpcode() == ARMISD::VCMP)
  12754. return (ARMCC::CondCodes)N->getConstantOperandVal(2);
  12755. else if (N->getOpcode() == ARMISD::VCMPZ)
  12756. return (ARMCC::CondCodes)N->getConstantOperandVal(1);
  12757. else
  12758. llvm_unreachable("Not a VCMP/VCMPZ!");
  12759. }
  12760. static bool CanInvertMVEVCMP(SDValue N) {
  12761. ARMCC::CondCodes CC = ARMCC::getOppositeCondition(getVCMPCondCode(N));
  12762. return isValidMVECond(CC, N->getOperand(0).getValueType().isFloatingPoint());
  12763. }
  12764. static SDValue PerformORCombine_i1(SDNode *N, SelectionDAG &DAG,
  12765. const ARMSubtarget *Subtarget) {
  12766. // Try to invert "or A, B" -> "and ~A, ~B", as the "and" is easier to chain
  12767. // together with predicates
  12768. EVT VT = N->getValueType(0);
  12769. SDLoc DL(N);
  12770. SDValue N0 = N->getOperand(0);
  12771. SDValue N1 = N->getOperand(1);
  12772. auto IsFreelyInvertable = [&](SDValue V) {
  12773. if (V->getOpcode() == ARMISD::VCMP || V->getOpcode() == ARMISD::VCMPZ)
  12774. return CanInvertMVEVCMP(V);
  12775. return false;
  12776. };
  12777. // At least one operand must be freely invertable.
  12778. if (!(IsFreelyInvertable(N0) || IsFreelyInvertable(N1)))
  12779. return SDValue();
  12780. SDValue NewN0 = DAG.getLogicalNOT(DL, N0, VT);
  12781. SDValue NewN1 = DAG.getLogicalNOT(DL, N1, VT);
  12782. SDValue And = DAG.getNode(ISD::AND, DL, VT, NewN0, NewN1);
  12783. return DAG.getLogicalNOT(DL, And, VT);
  12784. }
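// For illustration (PerformORCombine_i1 above): with MVE compares the
// inversion is free because the condition can be flipped, so e.g.
//   (or (VCMP a, b, eq), (VCMP c, d, gt))
// becomes (not (and (VCMP a, b, ne), (VCMP c, d, le))) once the inner NOTs
// are folded back into the compares (see the xor-of-vcmp fold in
// PerformXORCombine); the remaining outer NOT can often become a VPNOT or an
// else-predicated block.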
  12785. /// PerformORCombine - Target-specific dag combine xforms for ISD::OR
  12786. static SDValue PerformORCombine(SDNode *N,
  12787. TargetLowering::DAGCombinerInfo &DCI,
  12788. const ARMSubtarget *Subtarget) {
  12789. // Attempt to use immediate-form VORR
  12790. BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(N->getOperand(1));
  12791. SDLoc dl(N);
  12792. EVT VT = N->getValueType(0);
  12793. SelectionDAG &DAG = DCI.DAG;
  12794. if(!DAG.getTargetLoweringInfo().isTypeLegal(VT))
  12795. return SDValue();
  12796. if (Subtarget->hasMVEIntegerOps() && (VT == MVT::v2i1 || VT == MVT::v4i1 ||
  12797. VT == MVT::v8i1 || VT == MVT::v16i1))
  12798. return PerformORCombine_i1(N, DAG, Subtarget);
  12799. APInt SplatBits, SplatUndef;
  12800. unsigned SplatBitSize;
  12801. bool HasAnyUndefs;
  12802. if (BVN && (Subtarget->hasNEON() || Subtarget->hasMVEIntegerOps()) &&
  12803. BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) {
  12804. if (SplatBitSize == 8 || SplatBitSize == 16 || SplatBitSize == 32 ||
  12805. SplatBitSize == 64) {
  12806. EVT VorrVT;
  12807. SDValue Val =
  12808. isVMOVModifiedImm(SplatBits.getZExtValue(), SplatUndef.getZExtValue(),
  12809. SplatBitSize, DAG, dl, VorrVT, VT, OtherModImm);
  12810. if (Val.getNode()) {
  12811. SDValue Input =
  12812. DAG.getNode(ISD::BITCAST, dl, VorrVT, N->getOperand(0));
  12813. SDValue Vorr = DAG.getNode(ARMISD::VORRIMM, dl, VorrVT, Input, Val);
  12814. return DAG.getNode(ISD::BITCAST, dl, VT, Vorr);
  12815. }
  12816. }
  12817. }
  12818. if (!Subtarget->isThumb1Only()) {
  12819. // fold (or (select cc, 0, c), x) -> (select cc, x, (or, x, c))
  12820. if (SDValue Result = combineSelectAndUseCommutative(N, false, DCI))
  12821. return Result;
  12822. if (SDValue Result = PerformORCombineToSMULWBT(N, DCI, Subtarget))
  12823. return Result;
  12824. }
  12825. SDValue N0 = N->getOperand(0);
  12826. SDValue N1 = N->getOperand(1);
  12827. // (or (and B, A), (and C, ~A)) => (VBSL A, B, C) when A is a constant.
  12828. if (Subtarget->hasNEON() && N1.getOpcode() == ISD::AND && VT.isVector() &&
  12829. DAG.getTargetLoweringInfo().isTypeLegal(VT)) {
  12830. // The code below optimizes (or (and X, Y), Z).
  12831. // The AND operand needs to have a single user to make these optimizations
  12832. // profitable.
  12833. if (N0.getOpcode() != ISD::AND || !N0.hasOneUse())
  12834. return SDValue();
  12835. APInt SplatUndef;
  12836. unsigned SplatBitSize;
  12837. bool HasAnyUndefs;
  12838. APInt SplatBits0, SplatBits1;
  12839. BuildVectorSDNode *BVN0 = dyn_cast<BuildVectorSDNode>(N0->getOperand(1));
  12840. BuildVectorSDNode *BVN1 = dyn_cast<BuildVectorSDNode>(N1->getOperand(1));
12841. // Ensure that the second operands of both ANDs are constant splats.
  12842. if (BVN0 && BVN0->isConstantSplat(SplatBits0, SplatUndef, SplatBitSize,
  12843. HasAnyUndefs) && !HasAnyUndefs) {
  12844. if (BVN1 && BVN1->isConstantSplat(SplatBits1, SplatUndef, SplatBitSize,
  12845. HasAnyUndefs) && !HasAnyUndefs) {
  12846. // Ensure that the bit width of the constants are the same and that
  12847. // the splat arguments are logical inverses as per the pattern we
  12848. // are trying to simplify.
  12849. if (SplatBits0.getBitWidth() == SplatBits1.getBitWidth() &&
  12850. SplatBits0 == ~SplatBits1) {
  12851. // Canonicalize the vector type to make instruction selection
  12852. // simpler.
  12853. EVT CanonicalVT = VT.is128BitVector() ? MVT::v4i32 : MVT::v2i32;
  12854. SDValue Result = DAG.getNode(ARMISD::VBSP, dl, CanonicalVT,
  12855. N0->getOperand(1),
  12856. N0->getOperand(0),
  12857. N1->getOperand(0));
  12858. return DAG.getNode(ISD::BITCAST, dl, VT, Result);
  12859. }
  12860. }
  12861. }
  12862. }
  12863. // Try to use the ARM/Thumb2 BFI (bitfield insert) instruction when
  12864. // reasonable.
  12865. if (N0.getOpcode() == ISD::AND && N0.hasOneUse()) {
  12866. if (SDValue Res = PerformORCombineToBFI(N, DCI, Subtarget))
  12867. return Res;
  12868. }
  12869. if (SDValue Result = PerformSHLSimplify(N, DCI, Subtarget))
  12870. return Result;
  12871. return SDValue();
  12872. }
  12873. static SDValue PerformXORCombine(SDNode *N,
  12874. TargetLowering::DAGCombinerInfo &DCI,
  12875. const ARMSubtarget *Subtarget) {
  12876. EVT VT = N->getValueType(0);
  12877. SelectionDAG &DAG = DCI.DAG;
  12878. if(!DAG.getTargetLoweringInfo().isTypeLegal(VT))
  12879. return SDValue();
  12880. if (!Subtarget->isThumb1Only()) {
  12881. // fold (xor (select cc, 0, c), x) -> (select cc, x, (xor, x, c))
  12882. if (SDValue Result = combineSelectAndUseCommutative(N, false, DCI))
  12883. return Result;
  12884. if (SDValue Result = PerformSHLSimplify(N, DCI, Subtarget))
  12885. return Result;
  12886. }
  12887. if (Subtarget->hasMVEIntegerOps()) {
  12888. // fold (xor(vcmp/z, 1)) into a vcmp with the opposite condition.
  12889. SDValue N0 = N->getOperand(0);
  12890. SDValue N1 = N->getOperand(1);
  12891. const TargetLowering *TLI = Subtarget->getTargetLowering();
  12892. if (TLI->isConstTrueVal(N1) &&
  12893. (N0->getOpcode() == ARMISD::VCMP || N0->getOpcode() == ARMISD::VCMPZ)) {
  12894. if (CanInvertMVEVCMP(N0)) {
  12895. SDLoc DL(N0);
  12896. ARMCC::CondCodes CC = ARMCC::getOppositeCondition(getVCMPCondCode(N0));
  12897. SmallVector<SDValue, 4> Ops;
  12898. Ops.push_back(N0->getOperand(0));
  12899. if (N0->getOpcode() == ARMISD::VCMP)
  12900. Ops.push_back(N0->getOperand(1));
  12901. Ops.push_back(DAG.getConstant(CC, DL, MVT::i32));
  12902. return DAG.getNode(N0->getOpcode(), DL, N0->getValueType(0), Ops);
  12903. }
  12904. }
  12905. }
  12906. return SDValue();
  12907. }
  12908. // ParseBFI - given a BFI instruction in N, extract the "from" value (Rn) and return it,
  12909. // and fill in FromMask and ToMask with (consecutive) bits in "from" to be extracted and
  12910. // their position in "to" (Rd).
  12911. static SDValue ParseBFI(SDNode *N, APInt &ToMask, APInt &FromMask) {
  12912. assert(N->getOpcode() == ARMISD::BFI);
  12913. SDValue From = N->getOperand(1);
  12914. ToMask = ~cast<ConstantSDNode>(N->getOperand(2))->getAPIntValue();
  12915. FromMask = APInt::getLowBitsSet(ToMask.getBitWidth(), ToMask.countPopulation());
12916. // If the From value came from a SRL #C, the bits being inserted really come
12917. // from bit #C upwards of the SRL's operand; adjust FromMask accordingly.
  12918. if (From->getOpcode() == ISD::SRL &&
  12919. isa<ConstantSDNode>(From->getOperand(1))) {
  12920. APInt Shift = cast<ConstantSDNode>(From->getOperand(1))->getAPIntValue();
  12921. assert(Shift.getLimitedValue() < 32 && "Shift too large!");
  12922. FromMask <<= Shift.getLimitedValue(31);
  12923. From = From->getOperand(0);
  12924. }
  12925. return From;
  12926. }
12927. // If A and B each contain a single contiguous run of set bits, is A | B the
12928. // concatenation of the two runs (A's bits immediately above B's)?
12929. // Neither A nor B may be zero.
  12930. static bool BitsProperlyConcatenate(const APInt &A, const APInt &B) {
  12931. unsigned LastActiveBitInA = A.countTrailingZeros();
  12932. unsigned FirstActiveBitInB = B.getBitWidth() - B.countLeadingZeros() - 1;
  12933. return LastActiveBitInA - 1 == FirstActiveBitInB;
  12934. }
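// For illustration (BitsProperlyConcatenate above): A = 0b1100 and B = 0b0011
// concatenate properly (LastActiveBitInA = 2, FirstActiveBitInB = 1, and
// 2 - 1 == 1), whereas A = 0b1100 and B = 0b0001 leave a hole at bit 1 and
// are rejected.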
  12935. static SDValue FindBFIToCombineWith(SDNode *N) {
  12936. // We have a BFI in N. Find a BFI it can combine with, if one exists.
  12937. APInt ToMask, FromMask;
  12938. SDValue From = ParseBFI(N, ToMask, FromMask);
  12939. SDValue To = N->getOperand(0);
  12940. SDValue V = To;
  12941. if (V.getOpcode() != ARMISD::BFI)
  12942. return SDValue();
  12943. APInt NewToMask, NewFromMask;
  12944. SDValue NewFrom = ParseBFI(V.getNode(), NewToMask, NewFromMask);
  12945. if (NewFrom != From)
  12946. return SDValue();
  12947. // Do the written bits conflict with any we've seen so far?
  12948. if ((NewToMask & ToMask).getBoolValue())
  12949. // Conflicting bits.
  12950. return SDValue();
  12951. // Are the new bits contiguous when combined with the old bits?
  12952. if (BitsProperlyConcatenate(ToMask, NewToMask) &&
  12953. BitsProperlyConcatenate(FromMask, NewFromMask))
  12954. return V;
  12955. if (BitsProperlyConcatenate(NewToMask, ToMask) &&
  12956. BitsProperlyConcatenate(NewFromMask, FromMask))
  12957. return V;
  12958. return SDValue();
  12959. }
  12960. static SDValue PerformBFICombine(SDNode *N, SelectionDAG &DAG) {
  12961. SDValue N0 = N->getOperand(0);
  12962. SDValue N1 = N->getOperand(1);
  12963. if (N1.getOpcode() == ISD::AND) {
  12964. // (bfi A, (and B, Mask1), Mask2) -> (bfi A, B, Mask2) iff
  12965. // the bits being cleared by the AND are not demanded by the BFI.
  12966. ConstantSDNode *N11C = dyn_cast<ConstantSDNode>(N1.getOperand(1));
  12967. if (!N11C)
  12968. return SDValue();
  12969. unsigned InvMask = cast<ConstantSDNode>(N->getOperand(2))->getZExtValue();
  12970. unsigned LSB = countTrailingZeros(~InvMask);
  12971. unsigned Width = (32 - countLeadingZeros(~InvMask)) - LSB;
  12972. assert(Width <
  12973. static_cast<unsigned>(std::numeric_limits<unsigned>::digits) &&
  12974. "undefined behavior");
  12975. unsigned Mask = (1u << Width) - 1;
  12976. unsigned Mask2 = N11C->getZExtValue();
  12977. if ((Mask & (~Mask2)) == 0)
  12978. return DAG.getNode(ARMISD::BFI, SDLoc(N), N->getValueType(0),
  12979. N->getOperand(0), N1.getOperand(0), N->getOperand(2));
  12980. return SDValue();
  12981. }
  12982. // Look for another BFI to combine with.
  12983. if (SDValue CombineBFI = FindBFIToCombineWith(N)) {
  12984. // We've found a BFI.
  12985. APInt ToMask1, FromMask1;
  12986. SDValue From1 = ParseBFI(N, ToMask1, FromMask1);
  12987. APInt ToMask2, FromMask2;
  12988. SDValue From2 = ParseBFI(CombineBFI.getNode(), ToMask2, FromMask2);
  12989. assert(From1 == From2);
  12990. (void)From2;
  12991. // Create a new BFI, combining the two together.
  12992. APInt NewFromMask = FromMask1 | FromMask2;
  12993. APInt NewToMask = ToMask1 | ToMask2;
  12994. EVT VT = N->getValueType(0);
  12995. SDLoc dl(N);
  12996. if (NewFromMask[0] == 0)
  12997. From1 = DAG.getNode(
  12998. ISD::SRL, dl, VT, From1,
  12999. DAG.getConstant(NewFromMask.countTrailingZeros(), dl, VT));
  13000. return DAG.getNode(ARMISD::BFI, dl, VT, CombineBFI.getOperand(0), From1,
  13001. DAG.getConstant(~NewToMask, dl, VT));
  13002. }
  13003. // Reassociate BFI(BFI (A, B, M1), C, M2) to BFI(BFI (A, C, M2), B, M1) so
13004. // that lower bit insertions are performed first, provided that M1 and M2
13005. // do not overlap. This can allow multiple BFI instructions to be combined
  13006. // together by the other folds above.
  13007. if (N->getOperand(0).getOpcode() == ARMISD::BFI) {
  13008. APInt ToMask1 = ~N->getConstantOperandAPInt(2);
  13009. APInt ToMask2 = ~N0.getConstantOperandAPInt(2);
  13010. if (!N0.hasOneUse() || (ToMask1 & ToMask2) != 0 ||
  13011. ToMask1.countLeadingZeros() < ToMask2.countLeadingZeros())
  13012. return SDValue();
  13013. EVT VT = N->getValueType(0);
  13014. SDLoc dl(N);
  13015. SDValue BFI1 = DAG.getNode(ARMISD::BFI, dl, VT, N0.getOperand(0),
  13016. N->getOperand(1), N->getOperand(2));
  13017. return DAG.getNode(ARMISD::BFI, dl, VT, BFI1, N0.getOperand(1),
  13018. N0.getOperand(2));
  13019. }
  13020. return SDValue();
  13021. }
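// Worked example for the BFI merge above:
//   (BFI (BFI A, X, ~0xff), (srl X, 8), ~0xff00)
// writes bits [7:0] and then bits [15:8] of X into the same positions of A.
// Both inserts read from X and their to/from masks are adjacent, so they
// merge into the single (BFI A, X, ~0xffff).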
  13022. // Check that N is CMPZ(CSINC(0, 0, CC, X)),
  13023. // or CMPZ(CMOV(1, 0, CC, $cpsr, X))
  13024. // return X if valid.
  13025. static SDValue IsCMPZCSINC(SDNode *Cmp, ARMCC::CondCodes &CC) {
  13026. if (Cmp->getOpcode() != ARMISD::CMPZ || !isNullConstant(Cmp->getOperand(1)))
  13027. return SDValue();
  13028. SDValue CSInc = Cmp->getOperand(0);
  13029. // Ignore any `And 1` nodes that may not yet have been removed. We are
  13030. // looking for a value that produces 1/0, so these have no effect on the
  13031. // code.
  13032. while (CSInc.getOpcode() == ISD::AND &&
  13033. isa<ConstantSDNode>(CSInc.getOperand(1)) &&
  13034. CSInc.getConstantOperandVal(1) == 1 && CSInc->hasOneUse())
  13035. CSInc = CSInc.getOperand(0);
  13036. if (CSInc.getOpcode() == ARMISD::CSINC &&
  13037. isNullConstant(CSInc.getOperand(0)) &&
  13038. isNullConstant(CSInc.getOperand(1)) && CSInc->hasOneUse()) {
  13039. CC = (ARMCC::CondCodes)CSInc.getConstantOperandVal(2);
  13040. return CSInc.getOperand(3);
  13041. }
  13042. if (CSInc.getOpcode() == ARMISD::CMOV && isOneConstant(CSInc.getOperand(0)) &&
  13043. isNullConstant(CSInc.getOperand(1)) && CSInc->hasOneUse()) {
  13044. CC = (ARMCC::CondCodes)CSInc.getConstantOperandVal(2);
  13045. return CSInc.getOperand(4);
  13046. }
  13047. if (CSInc.getOpcode() == ARMISD::CMOV && isOneConstant(CSInc.getOperand(1)) &&
  13048. isNullConstant(CSInc.getOperand(0)) && CSInc->hasOneUse()) {
  13049. CC = ARMCC::getOppositeCondition(
  13050. (ARMCC::CondCodes)CSInc.getConstantOperandVal(2));
  13051. return CSInc.getOperand(4);
  13052. }
  13053. return SDValue();
  13054. }
  13055. static SDValue PerformCMPZCombine(SDNode *N, SelectionDAG &DAG) {
13056. // Given CMPZ(CSINC(0, 0, EQ, C), 0), we can just use C directly. As in
  13057. // t92: glue = ARMISD::CMPZ t74, 0
  13058. // t93: i32 = ARMISD::CSINC 0, 0, 1, t92
  13059. // t96: glue = ARMISD::CMPZ t93, 0
  13060. // t114: i32 = ARMISD::CSINV 0, 0, 0, t96
  13061. ARMCC::CondCodes Cond;
  13062. if (SDValue C = IsCMPZCSINC(N, Cond))
  13063. if (Cond == ARMCC::EQ)
  13064. return C;
  13065. return SDValue();
  13066. }
  13067. static SDValue PerformCSETCombine(SDNode *N, SelectionDAG &DAG) {
13068. // Fold away an unnecessary CMPZ/CSINC
  13069. // CSXYZ A, B, C1 (CMPZ (CSINC 0, 0, C2, D), 0) ->
  13070. // if C1==EQ -> CSXYZ A, B, C2, D
  13071. // if C1==NE -> CSXYZ A, B, NOT(C2), D
  13072. ARMCC::CondCodes Cond;
  13073. if (SDValue C = IsCMPZCSINC(N->getOperand(3).getNode(), Cond)) {
  13074. if (N->getConstantOperandVal(2) == ARMCC::EQ)
  13075. return DAG.getNode(N->getOpcode(), SDLoc(N), MVT::i32, N->getOperand(0),
  13076. N->getOperand(1),
  13077. DAG.getConstant(Cond, SDLoc(N), MVT::i32), C);
  13078. if (N->getConstantOperandVal(2) == ARMCC::NE)
  13079. return DAG.getNode(
  13080. N->getOpcode(), SDLoc(N), MVT::i32, N->getOperand(0),
  13081. N->getOperand(1),
  13082. DAG.getConstant(ARMCC::getOppositeCondition(Cond), SDLoc(N), MVT::i32), C);
  13083. }
  13084. return SDValue();
  13085. }
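// For illustration (PerformCSETCombine above): the CSINC only re-materializes
// a condition that is then compared against zero again, so e.g.
//   (CSEL A, B, EQ, (CMPZ (CSINC 0, 0, GT, D), 0)) -> (CSEL A, B, GT, D)
//   (CSEL A, B, NE, (CMPZ (CSINC 0, 0, GT, D), 0)) -> (CSEL A, B, LE, D)
// and the same applies to the other conditional-select nodes this is called
// on.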
  13086. /// PerformVMOVRRDCombine - Target-specific dag combine xforms for
  13087. /// ARMISD::VMOVRRD.
  13088. static SDValue PerformVMOVRRDCombine(SDNode *N,
  13089. TargetLowering::DAGCombinerInfo &DCI,
  13090. const ARMSubtarget *Subtarget) {
  13091. // vmovrrd(vmovdrr x, y) -> x,y
  13092. SDValue InDouble = N->getOperand(0);
  13093. if (InDouble.getOpcode() == ARMISD::VMOVDRR && Subtarget->hasFP64())
  13094. return DCI.CombineTo(N, InDouble.getOperand(0), InDouble.getOperand(1));
  13095. // vmovrrd(load f64) -> (load i32), (load i32)
  13096. SDNode *InNode = InDouble.getNode();
  13097. if (ISD::isNormalLoad(InNode) && InNode->hasOneUse() &&
  13098. InNode->getValueType(0) == MVT::f64 &&
  13099. InNode->getOperand(1).getOpcode() == ISD::FrameIndex &&
  13100. !cast<LoadSDNode>(InNode)->isVolatile()) {
  13101. // TODO: Should this be done for non-FrameIndex operands?
  13102. LoadSDNode *LD = cast<LoadSDNode>(InNode);
  13103. SelectionDAG &DAG = DCI.DAG;
  13104. SDLoc DL(LD);
  13105. SDValue BasePtr = LD->getBasePtr();
  13106. SDValue NewLD1 =
  13107. DAG.getLoad(MVT::i32, DL, LD->getChain(), BasePtr, LD->getPointerInfo(),
  13108. LD->getAlignment(), LD->getMemOperand()->getFlags());
  13109. SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i32, BasePtr,
  13110. DAG.getConstant(4, DL, MVT::i32));
  13111. SDValue NewLD2 = DAG.getLoad(MVT::i32, DL, LD->getChain(), OffsetPtr,
  13112. LD->getPointerInfo().getWithOffset(4),
  13113. std::min(4U, LD->getAlignment()),
  13114. LD->getMemOperand()->getFlags());
  13115. DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), NewLD2.getValue(1));
  13116. if (DCI.DAG.getDataLayout().isBigEndian())
  13117. std::swap (NewLD1, NewLD2);
  13118. SDValue Result = DCI.CombineTo(N, NewLD1, NewLD2);
  13119. return Result;
  13120. }
  13121. // VMOVRRD(extract(..(build_vector(a, b, c, d)))) -> a,b or c,d
  13122. // VMOVRRD(extract(insert_vector(insert_vector(.., a, l1), b, l2))) -> a,b
  13123. if (InDouble.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
  13124. isa<ConstantSDNode>(InDouble.getOperand(1))) {
  13125. SDValue BV = InDouble.getOperand(0);
  13126. // Look up through any nop bitcasts and vector_reg_casts. bitcasts may
  13127. // change lane order under big endian.
  13128. bool BVSwap = BV.getOpcode() == ISD::BITCAST;
  13129. while (
  13130. (BV.getOpcode() == ISD::BITCAST ||
  13131. BV.getOpcode() == ARMISD::VECTOR_REG_CAST) &&
  13132. (BV.getValueType() == MVT::v2f64 || BV.getValueType() == MVT::v2i64)) {
  13133. BVSwap = BV.getOpcode() == ISD::BITCAST;
  13134. BV = BV.getOperand(0);
  13135. }
  13136. if (BV.getValueType() != MVT::v4i32)
  13137. return SDValue();
  13138. // Handle buildvectors, pulling out the correct lane depending on
  13139. // endianness.
  13140. unsigned Offset = InDouble.getConstantOperandVal(1) == 1 ? 2 : 0;
  13141. if (BV.getOpcode() == ISD::BUILD_VECTOR) {
  13142. SDValue Op0 = BV.getOperand(Offset);
  13143. SDValue Op1 = BV.getOperand(Offset + 1);
  13144. if (!Subtarget->isLittle() && BVSwap)
  13145. std::swap(Op0, Op1);
  13146. return DCI.DAG.getMergeValues({Op0, Op1}, SDLoc(N));
  13147. }
  13148. // A chain of insert_vectors, grabbing the correct value of the chain of
  13149. // inserts.
  13150. SDValue Op0, Op1;
  13151. while (BV.getOpcode() == ISD::INSERT_VECTOR_ELT) {
  13152. if (isa<ConstantSDNode>(BV.getOperand(2))) {
  13153. if (BV.getConstantOperandVal(2) == Offset)
  13154. Op0 = BV.getOperand(1);
  13155. if (BV.getConstantOperandVal(2) == Offset + 1)
  13156. Op1 = BV.getOperand(1);
  13157. }
  13158. BV = BV.getOperand(0);
  13159. }
  13160. if (!Subtarget->isLittle() && BVSwap)
  13161. std::swap(Op0, Op1);
  13162. if (Op0 && Op1)
  13163. return DCI.DAG.getMergeValues({Op0, Op1}, SDLoc(N));
  13164. }
  13165. return SDValue();
  13166. }
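// For illustration (PerformVMOVRRDCombine above): (VMOVRRD (load f64 [FI]))
// becomes two i32 loads from FI and FI+4, with the two results swapped on
// big-endian so that each half still lands in the GPR the VMOVRRD would have
// produced.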
  13167. /// PerformVMOVDRRCombine - Target-specific dag combine xforms for
  13168. /// ARMISD::VMOVDRR. This is also used for BUILD_VECTORs with 2 operands.
  13169. static SDValue PerformVMOVDRRCombine(SDNode *N, SelectionDAG &DAG) {
  13170. // N=vmovrrd(X); vmovdrr(N:0, N:1) -> bit_convert(X)
  13171. SDValue Op0 = N->getOperand(0);
  13172. SDValue Op1 = N->getOperand(1);
  13173. if (Op0.getOpcode() == ISD::BITCAST)
  13174. Op0 = Op0.getOperand(0);
  13175. if (Op1.getOpcode() == ISD::BITCAST)
  13176. Op1 = Op1.getOperand(0);
  13177. if (Op0.getOpcode() == ARMISD::VMOVRRD &&
  13178. Op0.getNode() == Op1.getNode() &&
  13179. Op0.getResNo() == 0 && Op1.getResNo() == 1)
  13180. return DAG.getNode(ISD::BITCAST, SDLoc(N),
  13181. N->getValueType(0), Op0.getOperand(0));
  13182. return SDValue();
  13183. }
  13184. static SDValue PerformVMOVhrCombine(SDNode *N,
  13185. TargetLowering::DAGCombinerInfo &DCI) {
  13186. SDValue Op0 = N->getOperand(0);
  13187. // VMOVhr (VMOVrh (X)) -> X
  13188. if (Op0->getOpcode() == ARMISD::VMOVrh)
  13189. return Op0->getOperand(0);
  13190. // FullFP16: half values are passed in S-registers, and we don't
13191. // need any of the bitcasts and moves:
  13192. //
  13193. // t2: f32,ch = CopyFromReg t0, Register:f32 %0
  13194. // t5: i32 = bitcast t2
  13195. // t18: f16 = ARMISD::VMOVhr t5
  13196. if (Op0->getOpcode() == ISD::BITCAST) {
  13197. SDValue Copy = Op0->getOperand(0);
  13198. if (Copy.getValueType() == MVT::f32 &&
  13199. Copy->getOpcode() == ISD::CopyFromReg) {
  13200. SDValue Ops[] = {Copy->getOperand(0), Copy->getOperand(1)};
  13201. SDValue NewCopy =
  13202. DCI.DAG.getNode(ISD::CopyFromReg, SDLoc(N), N->getValueType(0), Ops);
  13203. return NewCopy;
  13204. }
  13205. }
  13206. // fold (VMOVhr (load x)) -> (load (f16*)x)
  13207. if (LoadSDNode *LN0 = dyn_cast<LoadSDNode>(Op0)) {
  13208. if (LN0->hasOneUse() && LN0->isUnindexed() &&
  13209. LN0->getMemoryVT() == MVT::i16) {
  13210. SDValue Load =
  13211. DCI.DAG.getLoad(N->getValueType(0), SDLoc(N), LN0->getChain(),
  13212. LN0->getBasePtr(), LN0->getMemOperand());
  13213. DCI.DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Load.getValue(0));
  13214. DCI.DAG.ReplaceAllUsesOfValueWith(Op0.getValue(1), Load.getValue(1));
  13215. return Load;
  13216. }
  13217. }
  13218. // Only the bottom 16 bits of the source register are used.
  13219. APInt DemandedMask = APInt::getLowBitsSet(32, 16);
  13220. const TargetLowering &TLI = DCI.DAG.getTargetLoweringInfo();
  13221. if (TLI.SimplifyDemandedBits(Op0, DemandedMask, DCI))
  13222. return SDValue(N, 0);
  13223. return SDValue();
  13224. }
  13225. static SDValue PerformVMOVrhCombine(SDNode *N, SelectionDAG &DAG) {
  13226. SDValue N0 = N->getOperand(0);
  13227. EVT VT = N->getValueType(0);
  13228. // fold (VMOVrh (fpconst x)) -> const x
  13229. if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N0)) {
  13230. APFloat V = C->getValueAPF();
  13231. return DAG.getConstant(V.bitcastToAPInt().getZExtValue(), SDLoc(N), VT);
  13232. }
  13233. // fold (VMOVrh (load x)) -> (zextload (i16*)x)
  13234. if (ISD::isNormalLoad(N0.getNode()) && N0.hasOneUse()) {
  13235. LoadSDNode *LN0 = cast<LoadSDNode>(N0);
  13236. SDValue Load =
  13237. DAG.getExtLoad(ISD::ZEXTLOAD, SDLoc(N), VT, LN0->getChain(),
  13238. LN0->getBasePtr(), MVT::i16, LN0->getMemOperand());
  13239. DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Load.getValue(0));
  13240. DAG.ReplaceAllUsesOfValueWith(N0.getValue(1), Load.getValue(1));
  13241. return Load;
  13242. }
  13243. // Fold VMOVrh(extract(x, n)) -> vgetlaneu(x, n)
  13244. if (N0->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
  13245. isa<ConstantSDNode>(N0->getOperand(1)))
  13246. return DAG.getNode(ARMISD::VGETLANEu, SDLoc(N), VT, N0->getOperand(0),
  13247. N0->getOperand(1));
  13248. return SDValue();
  13249. }
  13250. /// hasNormalLoadOperand - Check if any of the operands of a BUILD_VECTOR node
  13251. /// are normal, non-volatile loads. If so, it is profitable to bitcast an
  13252. /// i64 vector to have f64 elements, since the value can then be loaded
  13253. /// directly into a VFP register.
  13254. static bool hasNormalLoadOperand(SDNode *N) {
  13255. unsigned NumElts = N->getValueType(0).getVectorNumElements();
  13256. for (unsigned i = 0; i < NumElts; ++i) {
  13257. SDNode *Elt = N->getOperand(i).getNode();
  13258. if (ISD::isNormalLoad(Elt) && !cast<LoadSDNode>(Elt)->isVolatile())
  13259. return true;
  13260. }
  13261. return false;
  13262. }
  13263. /// PerformBUILD_VECTORCombine - Target-specific dag combine xforms for
  13264. /// ISD::BUILD_VECTOR.
  13265. static SDValue PerformBUILD_VECTORCombine(SDNode *N,
  13266. TargetLowering::DAGCombinerInfo &DCI,
  13267. const ARMSubtarget *Subtarget) {
  13268. // build_vector(N=ARMISD::VMOVRRD(X), N:1) -> bit_convert(X):
  13269. // VMOVRRD is introduced when legalizing i64 types. It forces the i64 value
  13270. // into a pair of GPRs, which is fine when the value is used as a scalar,
  13271. // but if the i64 value is converted to a vector, we need to undo the VMOVRRD.
  13272. SelectionDAG &DAG = DCI.DAG;
  13273. if (N->getNumOperands() == 2)
  13274. if (SDValue RV = PerformVMOVDRRCombine(N, DAG))
  13275. return RV;
  13276. // Load i64 elements as f64 values so that type legalization does not split
  13277. // them up into i32 values.
  13278. EVT VT = N->getValueType(0);
  13279. if (VT.getVectorElementType() != MVT::i64 || !hasNormalLoadOperand(N))
  13280. return SDValue();
  13281. SDLoc dl(N);
  13282. SmallVector<SDValue, 8> Ops;
  13283. unsigned NumElts = VT.getVectorNumElements();
  13284. for (unsigned i = 0; i < NumElts; ++i) {
  13285. SDValue V = DAG.getNode(ISD::BITCAST, dl, MVT::f64, N->getOperand(i));
  13286. Ops.push_back(V);
  13287. // Make the DAGCombiner fold the bitcast.
  13288. DCI.AddToWorklist(V.getNode());
  13289. }
  13290. EVT FloatVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64, NumElts);
  13291. SDValue BV = DAG.getBuildVector(FloatVT, dl, Ops);
  13292. return DAG.getNode(ISD::BITCAST, dl, VT, BV);
  13293. }
  13294. /// Target-specific dag combine xforms for ARMISD::BUILD_VECTOR.
  13295. static SDValue
  13296. PerformARMBUILD_VECTORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
  13297. // ARMISD::BUILD_VECTOR is introduced when legalizing ISD::BUILD_VECTOR.
  13298. // At that time, we may have inserted bitcasts from integer to float.
  13299. // If these bitcasts have survived DAGCombine, change the lowering of this
  13300. // BUILD_VECTOR in something more vector friendly, i.e., that does not
  13301. // force to use floating point types.
  13302. // Make sure we can change the type of the vector.
  13303. // This is possible iff:
13304. // 1. The vector is only used in a bitcast to an integer type. I.e.,
  13305. // 1.1. Vector is used only once.
  13306. // 1.2. Use is a bit convert to an integer type.
13307. // 2. The size of its operands is 32 bits (64 bits is not legal).
  13308. EVT VT = N->getValueType(0);
  13309. EVT EltVT = VT.getVectorElementType();
  13310. // Check 1.1. and 2.
  13311. if (EltVT.getSizeInBits() != 32 || !N->hasOneUse())
  13312. return SDValue();
  13313. // By construction, the input type must be float.
  13314. assert(EltVT == MVT::f32 && "Unexpected type!");
  13315. // Check 1.2.
  13316. SDNode *Use = *N->use_begin();
  13317. if (Use->getOpcode() != ISD::BITCAST ||
  13318. Use->getValueType(0).isFloatingPoint())
  13319. return SDValue();
  13320. // Check profitability.
  13321. // Model is, if more than half of the relevant operands are bitcast from
  13322. // i32, turn the build_vector into a sequence of insert_vector_elt.
  13323. // Relevant operands are everything that is not statically
  13324. // (i.e., at compile time) bitcasted.
  13325. unsigned NumOfBitCastedElts = 0;
  13326. unsigned NumElts = VT.getVectorNumElements();
  13327. unsigned NumOfRelevantElts = NumElts;
  13328. for (unsigned Idx = 0; Idx < NumElts; ++Idx) {
  13329. SDValue Elt = N->getOperand(Idx);
  13330. if (Elt->getOpcode() == ISD::BITCAST) {
  13331. // Assume only bit cast to i32 will go away.
  13332. if (Elt->getOperand(0).getValueType() == MVT::i32)
  13333. ++NumOfBitCastedElts;
  13334. } else if (Elt.isUndef() || isa<ConstantSDNode>(Elt))
  13335. // Constants are statically casted, thus do not count them as
  13336. // relevant operands.
  13337. --NumOfRelevantElts;
  13338. }
  13339. // Check if more than half of the elements require a non-free bitcast.
  13340. if (NumOfBitCastedElts <= NumOfRelevantElts / 2)
  13341. return SDValue();
  13342. SelectionDAG &DAG = DCI.DAG;
  13343. // Create the new vector type.
  13344. EVT VecVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElts);
  13345. // Check if the type is legal.
  13346. const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  13347. if (!TLI.isTypeLegal(VecVT))
  13348. return SDValue();
  13349. // Combine:
  13350. // ARMISD::BUILD_VECTOR E1, E2, ..., EN.
  13351. // => BITCAST INSERT_VECTOR_ELT
  13352. // (INSERT_VECTOR_ELT (...), (BITCAST EN-1), N-1),
  13353. // (BITCAST EN), N.
  13354. SDValue Vec = DAG.getUNDEF(VecVT);
  13355. SDLoc dl(N);
  13356. for (unsigned Idx = 0 ; Idx < NumElts; ++Idx) {
  13357. SDValue V = N->getOperand(Idx);
  13358. if (V.isUndef())
  13359. continue;
  13360. if (V.getOpcode() == ISD::BITCAST &&
  13361. V->getOperand(0).getValueType() == MVT::i32)
  13362. // Fold obvious case.
  13363. V = V.getOperand(0);
  13364. else {
  13365. V = DAG.getNode(ISD::BITCAST, SDLoc(V), MVT::i32, V);
  13366. // Make the DAGCombiner fold the bitcasts.
  13367. DCI.AddToWorklist(V.getNode());
  13368. }
  13369. SDValue LaneIdx = DAG.getConstant(Idx, dl, MVT::i32);
  13370. Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VecVT, Vec, V, LaneIdx);
  13371. }
  13372. Vec = DAG.getNode(ISD::BITCAST, dl, VT, Vec);
  13373. // Make the DAGCombiner fold the bitcasts.
  13374. DCI.AddToWorklist(Vec.getNode());
  13375. return Vec;
  13376. }
  13377. static SDValue
  13378. PerformPREDICATE_CASTCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
  13379. EVT VT = N->getValueType(0);
  13380. SDValue Op = N->getOperand(0);
  13381. SDLoc dl(N);
  13382. // PREDICATE_CAST(PREDICATE_CAST(x)) == PREDICATE_CAST(x)
  13383. if (Op->getOpcode() == ARMISD::PREDICATE_CAST) {
  13384. // If the valuetypes are the same, we can remove the cast entirely.
  13385. if (Op->getOperand(0).getValueType() == VT)
  13386. return Op->getOperand(0);
  13387. return DCI.DAG.getNode(ARMISD::PREDICATE_CAST, dl, VT, Op->getOperand(0));
  13388. }
  13389. // Turn pred_cast(xor x, -1) into xor(pred_cast x, -1), in order to produce
  13390. // more VPNOT which might get folded as else predicates.
  13391. if (Op.getValueType() == MVT::i32 && isBitwiseNot(Op)) {
  13392. SDValue X =
  13393. DCI.DAG.getNode(ARMISD::PREDICATE_CAST, dl, VT, Op->getOperand(0));
  13394. SDValue C = DCI.DAG.getNode(ARMISD::PREDICATE_CAST, dl, VT,
  13395. DCI.DAG.getConstant(65535, dl, MVT::i32));
  13396. return DCI.DAG.getNode(ISD::XOR, dl, VT, X, C);
  13397. }
  13398. // Only the bottom 16 bits of the source register are used.
  13399. if (Op.getValueType() == MVT::i32) {
  13400. APInt DemandedMask = APInt::getLowBitsSet(32, 16);
  13401. const TargetLowering &TLI = DCI.DAG.getTargetLoweringInfo();
  13402. if (TLI.SimplifyDemandedBits(Op, DemandedMask, DCI))
  13403. return SDValue(N, 0);
  13404. }
  13405. return SDValue();
  13406. }
  13407. static SDValue PerformVECTOR_REG_CASTCombine(SDNode *N, SelectionDAG &DAG,
  13408. const ARMSubtarget *ST) {
  13409. EVT VT = N->getValueType(0);
  13410. SDValue Op = N->getOperand(0);
  13411. SDLoc dl(N);
  13412. // Under Little endian, a VECTOR_REG_CAST is equivalent to a BITCAST
  13413. if (ST->isLittle())
  13414. return DAG.getNode(ISD::BITCAST, dl, VT, Op);
  13415. // VECTOR_REG_CAST undef -> undef
  13416. if (Op.isUndef())
  13417. return DAG.getUNDEF(VT);
  13418. // VECTOR_REG_CAST(VECTOR_REG_CAST(x)) == VECTOR_REG_CAST(x)
  13419. if (Op->getOpcode() == ARMISD::VECTOR_REG_CAST) {
  13420. // If the valuetypes are the same, we can remove the cast entirely.
  13421. if (Op->getOperand(0).getValueType() == VT)
  13422. return Op->getOperand(0);
  13423. return DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, VT, Op->getOperand(0));
  13424. }
  13425. return SDValue();
  13426. }
  13427. static SDValue PerformVCMPCombine(SDNode *N, SelectionDAG &DAG,
  13428. const ARMSubtarget *Subtarget) {
  13429. if (!Subtarget->hasMVEIntegerOps())
  13430. return SDValue();
  13431. EVT VT = N->getValueType(0);
  13432. SDValue Op0 = N->getOperand(0);
  13433. SDValue Op1 = N->getOperand(1);
  13434. ARMCC::CondCodes Cond =
  13435. (ARMCC::CondCodes)cast<ConstantSDNode>(N->getOperand(2))->getZExtValue();
  13436. SDLoc dl(N);
  13437. // vcmp X, 0, cc -> vcmpz X, cc
  13438. if (isZeroVector(Op1))
  13439. return DAG.getNode(ARMISD::VCMPZ, dl, VT, Op0, N->getOperand(2));
  13440. unsigned SwappedCond = getSwappedCondition(Cond);
  13441. if (isValidMVECond(SwappedCond, VT.isFloatingPoint())) {
  13442. // vcmp 0, X, cc -> vcmpz X, reversed(cc)
  13443. if (isZeroVector(Op0))
  13444. return DAG.getNode(ARMISD::VCMPZ, dl, VT, Op1,
  13445. DAG.getConstant(SwappedCond, dl, MVT::i32));
  13446. // vcmp vdup(Y), X, cc -> vcmp X, vdup(Y), reversed(cc)
  13447. if (Op0->getOpcode() == ARMISD::VDUP && Op1->getOpcode() != ARMISD::VDUP)
  13448. return DAG.getNode(ARMISD::VCMP, dl, VT, Op1, Op0,
  13449. DAG.getConstant(SwappedCond, dl, MVT::i32));
  13450. }
  13451. return SDValue();
  13452. }
  13453. /// PerformInsertEltCombine - Target-specific dag combine xforms for
  13454. /// ISD::INSERT_VECTOR_ELT.
  13455. static SDValue PerformInsertEltCombine(SDNode *N,
  13456. TargetLowering::DAGCombinerInfo &DCI) {
  13457. // Bitcast an i64 load inserted into a vector to f64.
  13458. // Otherwise, the i64 value will be legalized to a pair of i32 values.
  13459. EVT VT = N->getValueType(0);
  13460. SDNode *Elt = N->getOperand(1).getNode();
  13461. if (VT.getVectorElementType() != MVT::i64 ||
  13462. !ISD::isNormalLoad(Elt) || cast<LoadSDNode>(Elt)->isVolatile())
  13463. return SDValue();
  13464. SelectionDAG &DAG = DCI.DAG;
  13465. SDLoc dl(N);
  13466. EVT FloatVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64,
  13467. VT.getVectorNumElements());
  13468. SDValue Vec = DAG.getNode(ISD::BITCAST, dl, FloatVT, N->getOperand(0));
  13469. SDValue V = DAG.getNode(ISD::BITCAST, dl, MVT::f64, N->getOperand(1));
  13470. // Make the DAGCombiner fold the bitcasts.
  13471. DCI.AddToWorklist(Vec.getNode());
  13472. DCI.AddToWorklist(V.getNode());
  13473. SDValue InsElt = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, FloatVT,
  13474. Vec, V, N->getOperand(2));
  13475. return DAG.getNode(ISD::BITCAST, dl, VT, InsElt);
  13476. }
  13477. // Convert a pair of extracts from the same base vector to a VMOVRRD. Either
  13478. // directly or bitcast to an integer if the original is a float vector.
  13479. // extract(x, n); extract(x, n+1) -> VMOVRRD(extract v2f64 x, n/2)
  13480. // bitcast(extract(x, n)); bitcast(extract(x, n+1)) -> VMOVRRD(extract x, n/2)
  13481. static SDValue
  13482. PerformExtractEltToVMOVRRD(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
  13483. EVT VT = N->getValueType(0);
  13484. SDLoc dl(N);
  13485. if (!DCI.isAfterLegalizeDAG() || VT != MVT::i32 ||
  13486. !DCI.DAG.getTargetLoweringInfo().isTypeLegal(MVT::f64))
  13487. return SDValue();
  13488. SDValue Ext = SDValue(N, 0);
  13489. if (Ext.getOpcode() == ISD::BITCAST &&
  13490. Ext.getOperand(0).getValueType() == MVT::f32)
  13491. Ext = Ext.getOperand(0);
  13492. if (Ext.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
  13493. !isa<ConstantSDNode>(Ext.getOperand(1)) ||
  13494. Ext.getConstantOperandVal(1) % 2 != 0)
  13495. return SDValue();
  13496. if (Ext->use_size() == 1 &&
  13497. (Ext->use_begin()->getOpcode() == ISD::SINT_TO_FP ||
  13498. Ext->use_begin()->getOpcode() == ISD::UINT_TO_FP))
  13499. return SDValue();
  13500. SDValue Op0 = Ext.getOperand(0);
  13501. EVT VecVT = Op0.getValueType();
  13502. unsigned ResNo = Op0.getResNo();
  13503. unsigned Lane = Ext.getConstantOperandVal(1);
  13504. if (VecVT.getVectorNumElements() != 4)
  13505. return SDValue();
  13506. // Find another extract, of Lane + 1
  13507. auto OtherIt = find_if(Op0->uses(), [&](SDNode *V) {
  13508. return V->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
  13509. isa<ConstantSDNode>(V->getOperand(1)) &&
  13510. V->getConstantOperandVal(1) == Lane + 1 &&
  13511. V->getOperand(0).getResNo() == ResNo;
  13512. });
  13513. if (OtherIt == Op0->uses().end())
  13514. return SDValue();
  13515. // For float extracts, we need to be converting to a i32 for both vector
  13516. // lanes.
  13517. SDValue OtherExt(*OtherIt, 0);
  13518. if (OtherExt.getValueType() != MVT::i32) {
  13519. if (OtherExt->use_size() != 1 ||
  13520. OtherExt->use_begin()->getOpcode() != ISD::BITCAST ||
  13521. OtherExt->use_begin()->getValueType(0) != MVT::i32)
  13522. return SDValue();
  13523. OtherExt = SDValue(*OtherExt->use_begin(), 0);
  13524. }
  13525. // Convert the type to a f64 and extract with a VMOVRRD.
  13526. SDValue F64 = DCI.DAG.getNode(
  13527. ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
  13528. DCI.DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, MVT::v2f64, Op0),
  13529. DCI.DAG.getConstant(Ext.getConstantOperandVal(1) / 2, dl, MVT::i32));
  13530. SDValue VMOVRRD =
  13531. DCI.DAG.getNode(ARMISD::VMOVRRD, dl, {MVT::i32, MVT::i32}, F64);
  13532. DCI.CombineTo(OtherExt.getNode(), SDValue(VMOVRRD.getNode(), 1));
  13533. return VMOVRRD;
  13534. }
  13535. static SDValue PerformExtractEltCombine(SDNode *N,
  13536. TargetLowering::DAGCombinerInfo &DCI,
  13537. const ARMSubtarget *ST) {
  13538. SDValue Op0 = N->getOperand(0);
  13539. EVT VT = N->getValueType(0);
  13540. SDLoc dl(N);
  13541. // extract (vdup x) -> x
  13542. if (Op0->getOpcode() == ARMISD::VDUP) {
  13543. SDValue X = Op0->getOperand(0);
  13544. if (VT == MVT::f16 && X.getValueType() == MVT::i32)
  13545. return DCI.DAG.getNode(ARMISD::VMOVhr, dl, VT, X);
  13546. if (VT == MVT::i32 && X.getValueType() == MVT::f16)
  13547. return DCI.DAG.getNode(ARMISD::VMOVrh, dl, VT, X);
  13548. if (VT == MVT::f32 && X.getValueType() == MVT::i32)
  13549. return DCI.DAG.getNode(ISD::BITCAST, dl, VT, X);
  13550. while (X.getValueType() != VT && X->getOpcode() == ISD::BITCAST)
  13551. X = X->getOperand(0);
  13552. if (X.getValueType() == VT)
  13553. return X;
  13554. }
  13555. // extract ARM_BUILD_VECTOR -> x
  13556. if (Op0->getOpcode() == ARMISD::BUILD_VECTOR &&
  13557. isa<ConstantSDNode>(N->getOperand(1)) &&
  13558. N->getConstantOperandVal(1) < Op0.getNumOperands()) {
  13559. return Op0.getOperand(N->getConstantOperandVal(1));
  13560. }
  13561. // extract(bitcast(BUILD_VECTOR(VMOVDRR(a, b), ..))) -> a or b
  13562. if (Op0.getValueType() == MVT::v4i32 &&
  13563. isa<ConstantSDNode>(N->getOperand(1)) &&
  13564. Op0.getOpcode() == ISD::BITCAST &&
  13565. Op0.getOperand(0).getOpcode() == ISD::BUILD_VECTOR &&
  13566. Op0.getOperand(0).getValueType() == MVT::v2f64) {
  13567. SDValue BV = Op0.getOperand(0);
  13568. unsigned Offset = N->getConstantOperandVal(1);
  13569. SDValue MOV = BV.getOperand(Offset < 2 ? 0 : 1);
  13570. if (MOV.getOpcode() == ARMISD::VMOVDRR)
  13571. return MOV.getOperand(ST->isLittle() ? Offset % 2 : 1 - Offset % 2);
  13572. }
  13573. // extract x, n; extract x, n+1 -> VMOVRRD x
  13574. if (SDValue R = PerformExtractEltToVMOVRRD(N, DCI))
  13575. return R;
  13576. // extract (MVETrunc(x)) -> extract x
  13577. if (Op0->getOpcode() == ARMISD::MVETRUNC) {
  13578. unsigned Idx = N->getConstantOperandVal(1);
  13579. unsigned Vec =
  13580. Idx / Op0->getOperand(0).getValueType().getVectorNumElements();
  13581. unsigned SubIdx =
  13582. Idx % Op0->getOperand(0).getValueType().getVectorNumElements();
  13583. return DCI.DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Op0.getOperand(Vec),
  13584. DCI.DAG.getConstant(SubIdx, dl, MVT::i32));
  13585. }
  13586. return SDValue();
  13587. }
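// For illustration (the MVETRUNC case above): with two v8i16 inputs,
//   (extract_vector_elt (MVETRUNC a, b), 11)
// has Vec = 11 / 8 = 1 and SubIdx = 3, so it becomes
//   (extract_vector_elt b, 3)
// reading straight from the untruncated operand.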
  13588. static SDValue PerformSignExtendInregCombine(SDNode *N, SelectionDAG &DAG) {
  13589. SDValue Op = N->getOperand(0);
  13590. EVT VT = N->getValueType(0);
  13591. // sext_inreg(VGETLANEu) -> VGETLANEs
  13592. if (Op.getOpcode() == ARMISD::VGETLANEu &&
  13593. cast<VTSDNode>(N->getOperand(1))->getVT() ==
  13594. Op.getOperand(0).getValueType().getScalarType())
  13595. return DAG.getNode(ARMISD::VGETLANEs, SDLoc(N), VT, Op.getOperand(0),
  13596. Op.getOperand(1));
  13597. return SDValue();
  13598. }
  13599. // When lowering complex nodes that we recognize, like VQDMULH and MULH, we
13600. // can end up with shuffle(binop(shuffle, shuffle)), which can be simplified
13601. // to the binop as the shuffles cancel out.
  13602. static SDValue FlattenVectorShuffle(ShuffleVectorSDNode *N, SelectionDAG &DAG) {
  13603. EVT VT = N->getValueType(0);
  13604. if (!N->getOperand(1).isUndef() || N->getOperand(0).getValueType() != VT)
  13605. return SDValue();
  13606. SDValue Op = N->getOperand(0);
  13607. // Looking for binary operators that will have been folded from
  13608. // truncates/extends.
  13609. switch (Op.getOpcode()) {
  13610. case ARMISD::VQDMULH:
  13611. case ISD::MULHS:
  13612. case ISD::MULHU:
  13613. case ISD::ABDS:
  13614. case ISD::ABDU:
  13615. break;
  13616. default:
  13617. return SDValue();
  13618. }
  13619. ShuffleVectorSDNode *Op0 = dyn_cast<ShuffleVectorSDNode>(Op.getOperand(0));
  13620. ShuffleVectorSDNode *Op1 = dyn_cast<ShuffleVectorSDNode>(Op.getOperand(1));
  13621. if (!Op0 || !Op1 || !Op0->getOperand(1).isUndef() ||
  13622. !Op1->getOperand(1).isUndef() || Op0->getMask() != Op1->getMask() ||
  13623. Op0->getOperand(0).getValueType() != VT)
  13624. return SDValue();
  13625. // Check the mask turns into an identity shuffle.
  13626. ArrayRef<int> NMask = N->getMask();
  13627. ArrayRef<int> OpMask = Op0->getMask();
  13628. for (int i = 0, e = NMask.size(); i != e; i++) {
  13629. if (NMask[i] > 0 && OpMask[NMask[i]] > 0 && OpMask[NMask[i]] != i)
  13630. return SDValue();
  13631. }
  13632. return DAG.getNode(Op.getOpcode(), SDLoc(Op), Op.getValueType(),
  13633. Op0->getOperand(0), Op1->getOperand(0));
  13634. }
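
// Fold insert_subvector(Vec, Sub, Idx) of legal fixed-width vectors into a
// concat_vectors of the subvector and the other half of Vec, when the
// subvector is half the width of Vec and the insertion is aligned.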
static SDValue
PerformInsertSubvectorCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
  SDValue Vec = N->getOperand(0);
  SDValue SubVec = N->getOperand(1);
  uint64_t IdxVal = N->getConstantOperandVal(2);
  EVT VecVT = Vec.getValueType();
  EVT SubVT = SubVec.getValueType();

  // Only do this for legal fixed vector types.
  if (!VecVT.isFixedLengthVector() ||
      !DCI.DAG.getTargetLoweringInfo().isTypeLegal(VecVT) ||
      !DCI.DAG.getTargetLoweringInfo().isTypeLegal(SubVT))
    return SDValue();

  // Ignore widening patterns.
  if (IdxVal == 0 && Vec.isUndef())
    return SDValue();

  // Subvector must be half the width and an "aligned" insertion.
  unsigned NumSubElts = SubVT.getVectorNumElements();
  if ((SubVT.getSizeInBits() * 2) != VecVT.getSizeInBits() ||
      (IdxVal != 0 && IdxVal != NumSubElts))
    return SDValue();

  // Fold insert_subvector -> concat_vectors
  // insert_subvector(Vec,Sub,lo) -> concat_vectors(Sub,extract(Vec,hi))
  // insert_subvector(Vec,Sub,hi) -> concat_vectors(extract(Vec,lo),Sub)
  SDLoc DL(N);
  SDValue Lo, Hi;
  if (IdxVal == 0) {
    Lo = SubVec;
    Hi = DCI.DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, Vec,
                         DCI.DAG.getVectorIdxConstant(NumSubElts, DL));
  } else {
    Lo = DCI.DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, Vec,
                         DCI.DAG.getVectorIdxConstant(0, DL));
    Hi = SubVec;
  }
  return DCI.DAG.getNode(ISD::CONCAT_VECTORS, DL, VecVT, Lo, Hi);
}

// shuffle(MVETrunc(x, y)) -> VMOVN(x, y)
static SDValue PerformShuffleVMOVNCombine(ShuffleVectorSDNode *N,
                                          SelectionDAG &DAG) {
  SDValue Trunc = N->getOperand(0);
  EVT VT = Trunc.getValueType();
  if (Trunc.getOpcode() != ARMISD::MVETRUNC || !N->getOperand(1).isUndef())
    return SDValue();

  SDLoc DL(Trunc);
  if (isVMOVNTruncMask(N->getMask(), VT, false))
    return DAG.getNode(
        ARMISD::VMOVN, DL, VT,
        DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, Trunc.getOperand(0)),
        DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, Trunc.getOperand(1)),
        DAG.getConstant(1, DL, MVT::i32));
  else if (isVMOVNTruncMask(N->getMask(), VT, true))
    return DAG.getNode(
        ARMISD::VMOVN, DL, VT,
        DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, Trunc.getOperand(1)),
        DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, Trunc.getOperand(0)),
        DAG.getConstant(1, DL, MVT::i32));
  return SDValue();
}

/// PerformVECTOR_SHUFFLECombine - Target-specific dag combine xforms for
/// ISD::VECTOR_SHUFFLE.
static SDValue PerformVECTOR_SHUFFLECombine(SDNode *N, SelectionDAG &DAG) {
  if (SDValue R = FlattenVectorShuffle(cast<ShuffleVectorSDNode>(N), DAG))
    return R;
  if (SDValue R = PerformShuffleVMOVNCombine(cast<ShuffleVectorSDNode>(N), DAG))
    return R;

  // The LLVM shufflevector instruction does not require the shuffle mask
  // length to match the operand vector length, but ISD::VECTOR_SHUFFLE does
  // have that requirement. When translating to ISD::VECTOR_SHUFFLE, if the
  // operands do not match the mask length, they are extended by concatenating
  // them with undef vectors. That is probably the right thing for other
  // targets, but for NEON it is better to concatenate two double-register
  // size vector operands into a single quad-register size vector. Do that
  // transformation here:
  //   shuffle(concat(v1, undef), concat(v2, undef)) ->
  //   shuffle(concat(v1, v2), undef)
  SDValue Op0 = N->getOperand(0);
  SDValue Op1 = N->getOperand(1);
  if (Op0.getOpcode() != ISD::CONCAT_VECTORS ||
      Op1.getOpcode() != ISD::CONCAT_VECTORS ||
      Op0.getNumOperands() != 2 ||
      Op1.getNumOperands() != 2)
    return SDValue();
  SDValue Concat0Op1 = Op0.getOperand(1);
  SDValue Concat1Op1 = Op1.getOperand(1);
  if (!Concat0Op1.isUndef() || !Concat1Op1.isUndef())
    return SDValue();
  // Skip the transformation if any of the types are illegal.
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  EVT VT = N->getValueType(0);
  if (!TLI.isTypeLegal(VT) ||
      !TLI.isTypeLegal(Concat0Op1.getValueType()) ||
      !TLI.isTypeLegal(Concat1Op1.getValueType()))
    return SDValue();

  SDValue NewConcat = DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), VT,
                                  Op0.getOperand(0), Op1.getOperand(0));
  // Translate the shuffle mask.
  SmallVector<int, 16> NewMask;
  unsigned NumElts = VT.getVectorNumElements();
  unsigned HalfElts = NumElts / 2;
  ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(N);
  for (unsigned n = 0; n < NumElts; ++n) {
    int MaskElt = SVN->getMaskElt(n);
    int NewElt = -1;
    if (MaskElt < (int)HalfElts)
      NewElt = MaskElt;
    else if (MaskElt >= (int)NumElts && MaskElt < (int)(NumElts + HalfElts))
      NewElt = HalfElts + MaskElt - NumElts;
    NewMask.push_back(NewElt);
  }
  return DAG.getVectorShuffle(VT, SDLoc(N), NewConcat,
                              DAG.getUNDEF(VT), NewMask);
}

/// Load/store instruction that can be merged with a base address
/// update
struct BaseUpdateTarget {
  SDNode *N;
  bool isIntrinsic;
  bool isStore;
  unsigned AddrOpIdx;
};

struct BaseUpdateUser {
  /// Instruction that updates a pointer
  SDNode *N;
  /// Pointer increment operand
  SDValue Inc;
  /// Pointer increment value if it is a constant, or 0 otherwise
  unsigned ConstInc;
};
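
// Try to fold the load/store described by Target with the pointer update
// described by User into a single post-incrementing _UPD node, reporting the
// combine to DCI and returning true on success. If SimpleConstIncOnly is set,
// only an increment that exactly matches the memory access size is folded.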
static bool TryCombineBaseUpdate(struct BaseUpdateTarget &Target,
                                 struct BaseUpdateUser &User,
                                 bool SimpleConstIncOnly,
                                 TargetLowering::DAGCombinerInfo &DCI) {
  SelectionDAG &DAG = DCI.DAG;
  SDNode *N = Target.N;
  MemSDNode *MemN = cast<MemSDNode>(N);
  SDLoc dl(N);

  // Find the new opcode for the updating load/store.
  bool isLoadOp = true;
  bool isLaneOp = false;
  // Workaround for vst1x and vld1x intrinsics which do not have alignment
  // as an operand.
  bool hasAlignment = true;
  unsigned NewOpc = 0;
  unsigned NumVecs = 0;
  if (Target.isIntrinsic) {
    unsigned IntNo = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
    switch (IntNo) {
    default:
      llvm_unreachable("unexpected intrinsic for Neon base update");
    case Intrinsic::arm_neon_vld1:
      NewOpc = ARMISD::VLD1_UPD;
      NumVecs = 1;
      break;
    case Intrinsic::arm_neon_vld2:
      NewOpc = ARMISD::VLD2_UPD;
      NumVecs = 2;
      break;
    case Intrinsic::arm_neon_vld3:
      NewOpc = ARMISD::VLD3_UPD;
      NumVecs = 3;
      break;
    case Intrinsic::arm_neon_vld4:
      NewOpc = ARMISD::VLD4_UPD;
      NumVecs = 4;
      break;
    case Intrinsic::arm_neon_vld1x2:
      NewOpc = ARMISD::VLD1x2_UPD;
      NumVecs = 2;
      hasAlignment = false;
      break;
    case Intrinsic::arm_neon_vld1x3:
      NewOpc = ARMISD::VLD1x3_UPD;
      NumVecs = 3;
      hasAlignment = false;
      break;
    case Intrinsic::arm_neon_vld1x4:
      NewOpc = ARMISD::VLD1x4_UPD;
      NumVecs = 4;
      hasAlignment = false;
      break;
    case Intrinsic::arm_neon_vld2dup:
      NewOpc = ARMISD::VLD2DUP_UPD;
      NumVecs = 2;
      break;
    case Intrinsic::arm_neon_vld3dup:
      NewOpc = ARMISD::VLD3DUP_UPD;
      NumVecs = 3;
      break;
    case Intrinsic::arm_neon_vld4dup:
      NewOpc = ARMISD::VLD4DUP_UPD;
      NumVecs = 4;
      break;
    case Intrinsic::arm_neon_vld2lane:
      NewOpc = ARMISD::VLD2LN_UPD;
      NumVecs = 2;
      isLaneOp = true;
      break;
    case Intrinsic::arm_neon_vld3lane:
      NewOpc = ARMISD::VLD3LN_UPD;
      NumVecs = 3;
      isLaneOp = true;
      break;
    case Intrinsic::arm_neon_vld4lane:
      NewOpc = ARMISD::VLD4LN_UPD;
      NumVecs = 4;
      isLaneOp = true;
      break;
    case Intrinsic::arm_neon_vst1:
      NewOpc = ARMISD::VST1_UPD;
      NumVecs = 1;
      isLoadOp = false;
      break;
    case Intrinsic::arm_neon_vst2:
      NewOpc = ARMISD::VST2_UPD;
      NumVecs = 2;
      isLoadOp = false;
      break;
    case Intrinsic::arm_neon_vst3:
      NewOpc = ARMISD::VST3_UPD;
      NumVecs = 3;
      isLoadOp = false;
      break;
    case Intrinsic::arm_neon_vst4:
      NewOpc = ARMISD::VST4_UPD;
      NumVecs = 4;
      isLoadOp = false;
      break;
    case Intrinsic::arm_neon_vst2lane:
      NewOpc = ARMISD::VST2LN_UPD;
      NumVecs = 2;
      isLoadOp = false;
      isLaneOp = true;
      break;
    case Intrinsic::arm_neon_vst3lane:
      NewOpc = ARMISD::VST3LN_UPD;
      NumVecs = 3;
      isLoadOp = false;
      isLaneOp = true;
      break;
    case Intrinsic::arm_neon_vst4lane:
      NewOpc = ARMISD::VST4LN_UPD;
      NumVecs = 4;
      isLoadOp = false;
      isLaneOp = true;
      break;
    case Intrinsic::arm_neon_vst1x2:
      NewOpc = ARMISD::VST1x2_UPD;
      NumVecs = 2;
      isLoadOp = false;
      hasAlignment = false;
      break;
    case Intrinsic::arm_neon_vst1x3:
      NewOpc = ARMISD::VST1x3_UPD;
      NumVecs = 3;
      isLoadOp = false;
      hasAlignment = false;
      break;
    case Intrinsic::arm_neon_vst1x4:
      NewOpc = ARMISD::VST1x4_UPD;
      NumVecs = 4;
      isLoadOp = false;
      hasAlignment = false;
      break;
    }
  } else {
    isLaneOp = true;
    switch (N->getOpcode()) {
    default:
      llvm_unreachable("unexpected opcode for Neon base update");
    case ARMISD::VLD1DUP:
      NewOpc = ARMISD::VLD1DUP_UPD;
      NumVecs = 1;
      break;
    case ARMISD::VLD2DUP:
      NewOpc = ARMISD::VLD2DUP_UPD;
      NumVecs = 2;
      break;
    case ARMISD::VLD3DUP:
      NewOpc = ARMISD::VLD3DUP_UPD;
      NumVecs = 3;
      break;
    case ARMISD::VLD4DUP:
      NewOpc = ARMISD::VLD4DUP_UPD;
      NumVecs = 4;
      break;
    case ISD::LOAD:
      NewOpc = ARMISD::VLD1_UPD;
      NumVecs = 1;
      isLaneOp = false;
      break;
    case ISD::STORE:
      NewOpc = ARMISD::VST1_UPD;
      NumVecs = 1;
      isLaneOp = false;
      isLoadOp = false;
      break;
    }
  }

  // Find the size of memory referenced by the load/store.
  EVT VecTy;
  if (isLoadOp) {
    VecTy = N->getValueType(0);
  } else if (Target.isIntrinsic) {
    VecTy = N->getOperand(Target.AddrOpIdx + 1).getValueType();
  } else {
    assert(Target.isStore &&
           "Node has to be a load, a store, or an intrinsic!");
    VecTy = N->getOperand(1).getValueType();
  }

  bool isVLDDUPOp =
      NewOpc == ARMISD::VLD1DUP_UPD || NewOpc == ARMISD::VLD2DUP_UPD ||
      NewOpc == ARMISD::VLD3DUP_UPD || NewOpc == ARMISD::VLD4DUP_UPD;

  unsigned NumBytes = NumVecs * VecTy.getSizeInBits() / 8;
  if (isLaneOp || isVLDDUPOp)
    NumBytes /= VecTy.getVectorNumElements();

  if (NumBytes >= 3 * 16 && User.ConstInc != NumBytes) {
    // VLD3/4 and VST3/4 for 128-bit vectors are implemented with two
    // separate instructions that make it harder to use a non-constant update.
    return false;
  }

  if (SimpleConstIncOnly && User.ConstInc != NumBytes)
    return false;

  // OK, we found an ADD we can fold into the base update.
  // Now, create a _UPD node, taking care of not breaking alignment.

  EVT AlignedVecTy = VecTy;
  unsigned Alignment = MemN->getAlignment();

  // If this is a less-than-standard-aligned load/store, change the type to
  // match the standard alignment.
  // The alignment is overlooked when selecting _UPD variants; and it's
  // easier to introduce bitcasts here than fix that.
  // There are 3 ways to get to this base-update combine:
  // - intrinsics: they are assumed to be properly aligned (to the standard
  //   alignment of the memory type), so we don't need to do anything.
  // - ARMISD::VLDx nodes: they are only generated from the aforementioned
  //   intrinsics, so, likewise, there's nothing to do.
  // - generic load/store instructions: the alignment is specified as an
  //   explicit operand, rather than implicitly as the standard alignment
  //   of the memory type (like the intrinsics). We need to change the
  //   memory type to match the explicit alignment. That way, we don't
  //   generate non-standard-aligned ARMISD::VLDx nodes.
  if (isa<LSBaseSDNode>(N)) {
    if (Alignment == 0)
      Alignment = 1;
    if (Alignment < VecTy.getScalarSizeInBits() / 8) {
      MVT EltTy = MVT::getIntegerVT(Alignment * 8);
      assert(NumVecs == 1 && "Unexpected multi-element generic load/store.");
      assert(!isLaneOp && "Unexpected generic load/store lane.");
      unsigned NumElts = NumBytes / (EltTy.getSizeInBits() / 8);
      AlignedVecTy = MVT::getVectorVT(EltTy, NumElts);
    }
    // Don't set an explicit alignment on regular load/stores that we want
    // to transform to VLD/VST 1_UPD nodes.
    // This matches the behavior of regular load/stores, which only get an
    // explicit alignment if the MMO alignment is larger than the standard
    // alignment of the memory type.
    // Intrinsics, however, always get an explicit alignment, set to the
    // alignment of the MMO.
    Alignment = 1;
  }

  // Create the new updating load/store node.
  // First, create an SDVTList for the new updating node's results.
  EVT Tys[6];
  unsigned NumResultVecs = (isLoadOp ? NumVecs : 0);
  unsigned n;
  for (n = 0; n < NumResultVecs; ++n)
    Tys[n] = AlignedVecTy;
  Tys[n++] = MVT::i32;
  Tys[n] = MVT::Other;
  SDVTList SDTys = DAG.getVTList(makeArrayRef(Tys, NumResultVecs + 2));

  // Then, gather the new node's operands.
  SmallVector<SDValue, 8> Ops;
  Ops.push_back(N->getOperand(0)); // incoming chain
  Ops.push_back(N->getOperand(Target.AddrOpIdx));
  Ops.push_back(User.Inc);

  if (StoreSDNode *StN = dyn_cast<StoreSDNode>(N)) {
    // Try to match the intrinsic's signature
    Ops.push_back(StN->getValue());
  } else {
    // Loads (and of course intrinsics) match the intrinsics' signature,
    // so just add all but the alignment operand.
    unsigned LastOperand =
        hasAlignment ? N->getNumOperands() - 1 : N->getNumOperands();
    for (unsigned i = Target.AddrOpIdx + 1; i < LastOperand; ++i)
      Ops.push_back(N->getOperand(i));
  }

  // For all node types, the alignment operand is always the last one.
  Ops.push_back(DAG.getConstant(Alignment, dl, MVT::i32));

  // If this is a non-standard-aligned STORE, the penultimate operand is the
  // stored value. Bitcast it to the aligned type.
  if (AlignedVecTy != VecTy && N->getOpcode() == ISD::STORE) {
    SDValue &StVal = Ops[Ops.size() - 2];
    StVal = DAG.getNode(ISD::BITCAST, dl, AlignedVecTy, StVal);
  }

  EVT LoadVT = isLaneOp ? VecTy.getVectorElementType() : AlignedVecTy;
  SDValue UpdN = DAG.getMemIntrinsicNode(NewOpc, dl, SDTys, Ops, LoadVT,
                                         MemN->getMemOperand());

  // Update the uses.
  SmallVector<SDValue, 5> NewResults;
  for (unsigned i = 0; i < NumResultVecs; ++i)
    NewResults.push_back(SDValue(UpdN.getNode(), i));

  // If this is a non-standard-aligned LOAD, the first result is the loaded
  // value. Bitcast it to the expected result type.
  if (AlignedVecTy != VecTy && N->getOpcode() == ISD::LOAD) {
    SDValue &LdVal = NewResults[0];
    LdVal = DAG.getNode(ISD::BITCAST, dl, VecTy, LdVal);
  }

  NewResults.push_back(SDValue(UpdN.getNode(), NumResultVecs + 1)); // chain
  DCI.CombineTo(N, NewResults);
  DCI.CombineTo(User.N, SDValue(UpdN.getNode(), NumResultVecs));

  return true;
}

// If (opcode ptr inc) is an ADD-like instruction, return the
// increment value. Otherwise return 0.
static unsigned getPointerConstIncrement(unsigned Opcode, SDValue Ptr,
                                         SDValue Inc, const SelectionDAG &DAG) {
  ConstantSDNode *CInc = dyn_cast<ConstantSDNode>(Inc.getNode());
  if (!CInc)
    return 0;

  switch (Opcode) {
  case ARMISD::VLD1_UPD:
  case ISD::ADD:
    return CInc->getZExtValue();
  case ISD::OR: {
    if (DAG.haveNoCommonBitsSet(Ptr, Inc)) {
      // (OR ptr inc) is the same as (ADD ptr inc)
      return CInc->getZExtValue();
    }
    return 0;
  }
  default:
    return 0;
  }
}
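
// If N computes a pointer as base-plus-constant (an ADD/OR with a constant
// operand, or the pointer result of a VLD1_UPD), return the base pointer and
// the constant increment through Ptr and CInc.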
static bool findPointerConstIncrement(SDNode *N, SDValue *Ptr, SDValue *CInc) {
  switch (N->getOpcode()) {
  case ISD::ADD:
  case ISD::OR: {
    if (isa<ConstantSDNode>(N->getOperand(1))) {
      *Ptr = N->getOperand(0);
      *CInc = N->getOperand(1);
      return true;
    }
    return false;
  }
  case ARMISD::VLD1_UPD: {
    if (isa<ConstantSDNode>(N->getOperand(2))) {
      *Ptr = N->getOperand(1);
      *CInc = N->getOperand(2);
      return true;
    }
    return false;
  }
  default:
    return false;
  }
}
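
// Return true if the pointer update User can be folded into the memory access
// N without creating a cycle in the DAG.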
static bool isValidBaseUpdate(SDNode *N, SDNode *User) {
  // Check that the add is independent of the load/store.
  // Otherwise, folding it would create a cycle. Search through Addr
  // as well, since the User may not be a direct user of Addr and
  // only share a base pointer.
  SmallPtrSet<const SDNode *, 32> Visited;
  SmallVector<const SDNode *, 16> Worklist;
  Worklist.push_back(N);
  Worklist.push_back(User);
  if (SDNode::hasPredecessorHelper(N, Visited, Worklist) ||
      SDNode::hasPredecessorHelper(User, Visited, Worklist))
    return false;
  return true;
}

/// CombineBaseUpdate - Target-specific DAG combine function for VLDDUP,
/// NEON load/store intrinsics, and generic vector load/stores, to merge
/// base address updates.
/// For generic load/stores, the memory type is assumed to be a vector.
/// The caller is assumed to have checked legality.
static SDValue CombineBaseUpdate(SDNode *N,
                                 TargetLowering::DAGCombinerInfo &DCI) {
  const bool isIntrinsic = (N->getOpcode() == ISD::INTRINSIC_VOID ||
                            N->getOpcode() == ISD::INTRINSIC_W_CHAIN);
  const bool isStore = N->getOpcode() == ISD::STORE;
  const unsigned AddrOpIdx = ((isIntrinsic || isStore) ? 2 : 1);
  BaseUpdateTarget Target = {N, isIntrinsic, isStore, AddrOpIdx};

  SDValue Addr = N->getOperand(AddrOpIdx);

  SmallVector<BaseUpdateUser, 8> BaseUpdates;

  // Search for a use of the address operand that is an increment.
  for (SDNode::use_iterator UI = Addr.getNode()->use_begin(),
                            UE = Addr.getNode()->use_end();
       UI != UE; ++UI) {
    SDNode *User = *UI;
    if (UI.getUse().getResNo() != Addr.getResNo() ||
        User->getNumOperands() != 2)
      continue;

    SDValue Inc = User->getOperand(UI.getOperandNo() == 1 ? 0 : 1);
    unsigned ConstInc =
        getPointerConstIncrement(User->getOpcode(), Addr, Inc, DCI.DAG);

    if (ConstInc || User->getOpcode() == ISD::ADD)
      BaseUpdates.push_back({User, Inc, ConstInc});
  }

  // If the address is a constant pointer increment itself, find
  // another constant increment that has the same base operand.
  SDValue Base;
  SDValue CInc;
  if (findPointerConstIncrement(Addr.getNode(), &Base, &CInc)) {
    unsigned Offset =
        getPointerConstIncrement(Addr->getOpcode(), Base, CInc, DCI.DAG);
    for (SDNode::use_iterator UI = Base->use_begin(), UE = Base->use_end();
         UI != UE; ++UI) {
      SDNode *User = *UI;
      if (UI.getUse().getResNo() != Base.getResNo() || User == Addr.getNode() ||
          User->getNumOperands() != 2)
        continue;

      SDValue UserInc = User->getOperand(UI.getOperandNo() == 0 ? 1 : 0);
      unsigned UserOffset =
          getPointerConstIncrement(User->getOpcode(), Base, UserInc, DCI.DAG);

      if (!UserOffset || UserOffset <= Offset)
        continue;

      unsigned NewConstInc = UserOffset - Offset;
      SDValue NewInc = DCI.DAG.getConstant(NewConstInc, SDLoc(N), MVT::i32);
      BaseUpdates.push_back({User, NewInc, NewConstInc});
    }
  }

  // Try to fold the load/store with an update that matches memory
  // access size. This should work well for sequential loads.
  //
  // Filter out invalid updates as well.
  unsigned NumValidUpd = BaseUpdates.size();
  for (unsigned I = 0; I < NumValidUpd;) {
    BaseUpdateUser &User = BaseUpdates[I];
    if (!isValidBaseUpdate(N, User.N)) {
      --NumValidUpd;
      std::swap(BaseUpdates[I], BaseUpdates[NumValidUpd]);
      continue;
    }

    if (TryCombineBaseUpdate(Target, User, /*SimpleConstIncOnly=*/true, DCI))
      return SDValue();
    ++I;
  }
  BaseUpdates.resize(NumValidUpd);

  // Try to fold with other users. Non-constant updates are considered
  // first, and constant updates are sorted to not break a sequence of
  // strided accesses (if there is any).
  std::sort(BaseUpdates.begin(), BaseUpdates.end(),
            [](BaseUpdateUser &LHS, BaseUpdateUser &RHS) {
              return LHS.ConstInc < RHS.ConstInc;
            });
  for (BaseUpdateUser &User : BaseUpdates) {
    if (TryCombineBaseUpdate(Target, User, /*SimpleConstIncOnly=*/false, DCI))
      return SDValue();
  }
  return SDValue();
}
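
/// PerformVLDCombine - Merge a base-address update into a NEON load/store
/// node by delegating to CombineBaseUpdate once legalization is done.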
static SDValue PerformVLDCombine(SDNode *N,
                                 TargetLowering::DAGCombinerInfo &DCI) {
  if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
    return SDValue();

  return CombineBaseUpdate(N, DCI);
}
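
// Combine an MVE vld2q/vld4q/vst2q/vst4q intrinsic with an ADD of the address
// by the memory access size into the post-incrementing VLDn_UPD/VSTn_UPD form.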
static SDValue PerformMVEVLDCombine(SDNode *N,
                                    TargetLowering::DAGCombinerInfo &DCI) {
  if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
    return SDValue();

  SelectionDAG &DAG = DCI.DAG;
  SDValue Addr = N->getOperand(2);
  MemSDNode *MemN = cast<MemSDNode>(N);
  SDLoc dl(N);

  // For the stores, where there are multiple intrinsics we only actually want
  // to post-inc the last of them.
  unsigned IntNo = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
  if (IntNo == Intrinsic::arm_mve_vst2q &&
      cast<ConstantSDNode>(N->getOperand(5))->getZExtValue() != 1)
    return SDValue();
  if (IntNo == Intrinsic::arm_mve_vst4q &&
      cast<ConstantSDNode>(N->getOperand(7))->getZExtValue() != 3)
    return SDValue();

  // Search for a use of the address operand that is an increment.
  for (SDNode::use_iterator UI = Addr.getNode()->use_begin(),
                            UE = Addr.getNode()->use_end();
       UI != UE; ++UI) {
    SDNode *User = *UI;
    if (User->getOpcode() != ISD::ADD ||
        UI.getUse().getResNo() != Addr.getResNo())
      continue;

    // Check that the add is independent of the load/store. Otherwise, folding
    // it would create a cycle. We can avoid searching through Addr as it's a
    // predecessor to both.
    SmallPtrSet<const SDNode *, 32> Visited;
    SmallVector<const SDNode *, 16> Worklist;
    Visited.insert(Addr.getNode());
    Worklist.push_back(N);
    Worklist.push_back(User);
    if (SDNode::hasPredecessorHelper(N, Visited, Worklist) ||
        SDNode::hasPredecessorHelper(User, Visited, Worklist))
      continue;

    // Find the new opcode for the updating load/store.
    bool isLoadOp = true;
    unsigned NewOpc = 0;
    unsigned NumVecs = 0;
    switch (IntNo) {
    default:
      llvm_unreachable("unexpected intrinsic for MVE VLDn combine");
    case Intrinsic::arm_mve_vld2q:
      NewOpc = ARMISD::VLD2_UPD;
      NumVecs = 2;
      break;
    case Intrinsic::arm_mve_vld4q:
      NewOpc = ARMISD::VLD4_UPD;
      NumVecs = 4;
      break;
    case Intrinsic::arm_mve_vst2q:
      NewOpc = ARMISD::VST2_UPD;
      NumVecs = 2;
      isLoadOp = false;
      break;
    case Intrinsic::arm_mve_vst4q:
      NewOpc = ARMISD::VST4_UPD;
      NumVecs = 4;
      isLoadOp = false;
      break;
    }

    // Find the size of memory referenced by the load/store.
    EVT VecTy;
    if (isLoadOp) {
      VecTy = N->getValueType(0);
    } else {
      VecTy = N->getOperand(3).getValueType();
    }

    unsigned NumBytes = NumVecs * VecTy.getSizeInBits() / 8;

    // If the increment is a constant, it must match the memory ref size.
    SDValue Inc = User->getOperand(User->getOperand(0) == Addr ? 1 : 0);
    ConstantSDNode *CInc = dyn_cast<ConstantSDNode>(Inc.getNode());
    if (!CInc || CInc->getZExtValue() != NumBytes)
      continue;

    // Create the new updating load/store node.
    // First, create an SDVTList for the new updating node's results.
    EVT Tys[6];
    unsigned NumResultVecs = (isLoadOp ? NumVecs : 0);
    unsigned n;
    for (n = 0; n < NumResultVecs; ++n)
      Tys[n] = VecTy;
    Tys[n++] = MVT::i32;
    Tys[n] = MVT::Other;
    SDVTList SDTys = DAG.getVTList(makeArrayRef(Tys, NumResultVecs + 2));

    // Then, gather the new node's operands.
    SmallVector<SDValue, 8> Ops;
    Ops.push_back(N->getOperand(0)); // incoming chain
    Ops.push_back(N->getOperand(2)); // ptr
    Ops.push_back(Inc);

    for (unsigned i = 3; i < N->getNumOperands(); ++i)
      Ops.push_back(N->getOperand(i));

    SDValue UpdN = DAG.getMemIntrinsicNode(NewOpc, dl, SDTys, Ops, VecTy,
                                           MemN->getMemOperand());

    // Update the uses.
    SmallVector<SDValue, 5> NewResults;
    for (unsigned i = 0; i < NumResultVecs; ++i)
      NewResults.push_back(SDValue(UpdN.getNode(), i));

    NewResults.push_back(SDValue(UpdN.getNode(), NumResultVecs + 1)); // chain
    DCI.CombineTo(N, NewResults);
    DCI.CombineTo(User, SDValue(UpdN.getNode(), NumResultVecs));

    break;
  }

  return SDValue();
}

/// CombineVLDDUP - For a VDUPLANE node N, check if its source operand is a
/// vldN-lane (N > 1) intrinsic, and if all the other uses of that intrinsic
/// are also VDUPLANEs. If so, combine them to a vldN-dup operation and
/// return true.
static bool CombineVLDDUP(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
  SelectionDAG &DAG = DCI.DAG;
  EVT VT = N->getValueType(0);
  // vldN-dup instructions only support 64-bit vectors for N > 1.
  if (!VT.is64BitVector())
    return false;

  // Check if the VDUPLANE operand is a vldN-dup intrinsic.
  SDNode *VLD = N->getOperand(0).getNode();
  if (VLD->getOpcode() != ISD::INTRINSIC_W_CHAIN)
    return false;
  unsigned NumVecs = 0;
  unsigned NewOpc = 0;
  unsigned IntNo = cast<ConstantSDNode>(VLD->getOperand(1))->getZExtValue();
  if (IntNo == Intrinsic::arm_neon_vld2lane) {
    NumVecs = 2;
    NewOpc = ARMISD::VLD2DUP;
  } else if (IntNo == Intrinsic::arm_neon_vld3lane) {
    NumVecs = 3;
    NewOpc = ARMISD::VLD3DUP;
  } else if (IntNo == Intrinsic::arm_neon_vld4lane) {
    NumVecs = 4;
    NewOpc = ARMISD::VLD4DUP;
  } else {
    return false;
  }

  // First check that all the vldN-lane uses are VDUPLANEs and that the lane
  // numbers match the load.
  unsigned VLDLaneNo =
      cast<ConstantSDNode>(VLD->getOperand(NumVecs + 3))->getZExtValue();
  for (SDNode::use_iterator UI = VLD->use_begin(), UE = VLD->use_end();
       UI != UE; ++UI) {
    // Ignore uses of the chain result.
    if (UI.getUse().getResNo() == NumVecs)
      continue;
    SDNode *User = *UI;
    if (User->getOpcode() != ARMISD::VDUPLANE ||
        VLDLaneNo != cast<ConstantSDNode>(User->getOperand(1))->getZExtValue())
      return false;
  }

  // Create the vldN-dup node.
  EVT Tys[5];
  unsigned n;
  for (n = 0; n < NumVecs; ++n)
    Tys[n] = VT;
  Tys[n] = MVT::Other;
  SDVTList SDTys = DAG.getVTList(makeArrayRef(Tys, NumVecs + 1));
  SDValue Ops[] = { VLD->getOperand(0), VLD->getOperand(2) };
  MemIntrinsicSDNode *VLDMemInt = cast<MemIntrinsicSDNode>(VLD);
  SDValue VLDDup = DAG.getMemIntrinsicNode(NewOpc, SDLoc(VLD), SDTys,
                                           Ops, VLDMemInt->getMemoryVT(),
                                           VLDMemInt->getMemOperand());

  // Update the uses.
  for (SDNode::use_iterator UI = VLD->use_begin(), UE = VLD->use_end();
       UI != UE; ++UI) {
    unsigned ResNo = UI.getUse().getResNo();
    // Ignore uses of the chain result.
    if (ResNo == NumVecs)
      continue;
    SDNode *User = *UI;
    DCI.CombineTo(User, SDValue(VLDDup.getNode(), ResNo));
  }

  // Now the vldN-lane intrinsic is dead except for its chain result.
  // Update uses of the chain.
  std::vector<SDValue> VLDDupResults;
  for (unsigned n = 0; n < NumVecs; ++n)
    VLDDupResults.push_back(SDValue(VLDDup.getNode(), n));
  VLDDupResults.push_back(SDValue(VLDDup.getNode(), NumVecs));
  DCI.CombineTo(VLD, VLDDupResults);

  return true;
}

/// PerformVDUPLANECombine - Target-specific dag combine xforms for
/// ARMISD::VDUPLANE.
static SDValue PerformVDUPLANECombine(SDNode *N,
                                      TargetLowering::DAGCombinerInfo &DCI,
                                      const ARMSubtarget *Subtarget) {
  SDValue Op = N->getOperand(0);
  EVT VT = N->getValueType(0);

  // On MVE, we just convert the VDUPLANE to a VDUP with an extract.
  if (Subtarget->hasMVEIntegerOps()) {
    EVT ExtractVT = VT.getVectorElementType();
    // We need to ensure we are creating a legal type.
    if (!DCI.DAG.getTargetLoweringInfo().isTypeLegal(ExtractVT))
      ExtractVT = MVT::i32;
    SDValue Extract = DCI.DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(N),
                                      ExtractVT, N->getOperand(0),
                                      N->getOperand(1));
    return DCI.DAG.getNode(ARMISD::VDUP, SDLoc(N), VT, Extract);
  }

  // If the source is a vldN-lane (N > 1) intrinsic, and all the other uses
  // of that intrinsic are also VDUPLANEs, combine them to a vldN-dup operation.
  if (CombineVLDDUP(N, DCI))
    return SDValue(N, 0);

  // If the source is already a VMOVIMM or VMVNIMM splat, the VDUPLANE is
  // redundant. Ignore bit_converts for now; element sizes are checked below.
  while (Op.getOpcode() == ISD::BITCAST)
    Op = Op.getOperand(0);
  if (Op.getOpcode() != ARMISD::VMOVIMM && Op.getOpcode() != ARMISD::VMVNIMM)
    return SDValue();

  // Make sure the VMOV element size is not bigger than the VDUPLANE elements.
  unsigned EltSize = Op.getScalarValueSizeInBits();
  // The canonical VMOV for a zero vector uses a 32-bit element size.
  unsigned Imm = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
  unsigned EltBits;
  if (ARM_AM::decodeVMOVModImm(Imm, EltBits) == 0)
    EltSize = 8;
  if (EltSize > VT.getScalarSizeInBits())
    return SDValue();

  return DCI.DAG.getNode(ISD::BITCAST, SDLoc(N), VT, Op);
}

/// PerformVDUPCombine - Target-specific dag combine xforms for ARMISD::VDUP.
static SDValue PerformVDUPCombine(SDNode *N, SelectionDAG &DAG,
                                  const ARMSubtarget *Subtarget) {
  SDValue Op = N->getOperand(0);
  SDLoc dl(N);

  if (Subtarget->hasMVEIntegerOps()) {
    // Convert VDUP f32 -> VDUP BITCAST i32 under MVE, as we know the value will
    // need to come from a GPR.
    if (Op.getValueType() == MVT::f32)
      return DAG.getNode(ARMISD::VDUP, dl, N->getValueType(0),
                         DAG.getNode(ISD::BITCAST, dl, MVT::i32, Op));
    else if (Op.getValueType() == MVT::f16)
      return DAG.getNode(ARMISD::VDUP, dl, N->getValueType(0),
                         DAG.getNode(ARMISD::VMOVrh, dl, MVT::i32, Op));
  }

  if (!Subtarget->hasNEON())
    return SDValue();

  // Match VDUP(LOAD) -> VLD1DUP.
  // We match this pattern here rather than waiting for isel because the
  // transform is only legal for unindexed loads.
  LoadSDNode *LD = dyn_cast<LoadSDNode>(Op.getNode());
  if (LD && Op.hasOneUse() && LD->isUnindexed() &&
      LD->getMemoryVT() == N->getValueType(0).getVectorElementType()) {
    SDValue Ops[] = {LD->getOperand(0), LD->getOperand(1),
                     DAG.getConstant(LD->getAlignment(), SDLoc(N), MVT::i32)};
    SDVTList SDTys = DAG.getVTList(N->getValueType(0), MVT::Other);
    SDValue VLDDup =
        DAG.getMemIntrinsicNode(ARMISD::VLD1DUP, SDLoc(N), SDTys, Ops,
                                LD->getMemoryVT(), LD->getMemOperand());
    DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), VLDDup.getValue(1));
    return VLDDup;
  }

  return SDValue();
}
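
// For legal NEON vector loads, try to fold a following address increment into
// a post-incrementing VLD1_UPD via CombineBaseUpdate.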
static SDValue PerformLOADCombine(SDNode *N,
                                  TargetLowering::DAGCombinerInfo &DCI,
                                  const ARMSubtarget *Subtarget) {
  EVT VT = N->getValueType(0);

  // If this is a legal vector load, try to combine it into a VLD1_UPD.
  if (Subtarget->hasNEON() && ISD::isNormalLoad(N) && VT.isVector() &&
      DCI.DAG.getTargetLoweringInfo().isTypeLegal(VT))
    return CombineBaseUpdate(N, DCI);

  return SDValue();
}

// Optimize trunc store (of multiple scalars) to shuffle and store. First,
// pack all of the elements in one place. Next, store to memory in fewer
// chunks.
static SDValue PerformTruncatingStoreCombine(StoreSDNode *St,
                                             SelectionDAG &DAG) {
  SDValue StVal = St->getValue();
  EVT VT = StVal.getValueType();
  if (!St->isTruncatingStore() || !VT.isVector())
    return SDValue();
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  EVT StVT = St->getMemoryVT();
  unsigned NumElems = VT.getVectorNumElements();
  assert(StVT != VT && "Cannot truncate to the same type");
  unsigned FromEltSz = VT.getScalarSizeInBits();
  unsigned ToEltSz = StVT.getScalarSizeInBits();

  // From, To sizes and ElemCount must be pow of two
  if (!isPowerOf2_32(NumElems * FromEltSz * ToEltSz))
    return SDValue();

  // We are going to use the original vector elt for storing.
  // Accumulated smaller vector elements must be a multiple of the store size.
  if (0 != (NumElems * FromEltSz) % ToEltSz)
    return SDValue();

  unsigned SizeRatio = FromEltSz / ToEltSz;
  assert(SizeRatio * NumElems * ToEltSz == VT.getSizeInBits());

  // Create a type on which we perform the shuffle.
  EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(), StVT.getScalarType(),
                                   NumElems * SizeRatio);
  assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());

  SDLoc DL(St);
  SDValue WideVec = DAG.getNode(ISD::BITCAST, DL, WideVecVT, StVal);
  SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1);
  for (unsigned i = 0; i < NumElems; ++i)
    ShuffleVec[i] = DAG.getDataLayout().isBigEndian() ? (i + 1) * SizeRatio - 1
                                                      : i * SizeRatio;

  // Can't shuffle using an illegal type.
  if (!TLI.isTypeLegal(WideVecVT))
    return SDValue();

  SDValue Shuff = DAG.getVectorShuffle(
      WideVecVT, DL, WideVec, DAG.getUNDEF(WideVec.getValueType()), ShuffleVec);
  // At this point all of the data is stored at the bottom of the
  // register. We now need to save it to mem.

  // Find the largest store unit
  MVT StoreType = MVT::i8;
  for (MVT Tp : MVT::integer_valuetypes()) {
    if (TLI.isTypeLegal(Tp) && Tp.getSizeInBits() <= NumElems * ToEltSz)
      StoreType = Tp;
  }
  // Didn't find a legal store type.
  if (!TLI.isTypeLegal(StoreType))
    return SDValue();

  // Bitcast the original vector into a vector of store-size units
  EVT StoreVecVT =
      EVT::getVectorVT(*DAG.getContext(), StoreType,
                       VT.getSizeInBits() / EVT(StoreType).getSizeInBits());
  assert(StoreVecVT.getSizeInBits() == VT.getSizeInBits());
  SDValue ShuffWide = DAG.getNode(ISD::BITCAST, DL, StoreVecVT, Shuff);
  SmallVector<SDValue, 8> Chains;
  SDValue Increment = DAG.getConstant(StoreType.getSizeInBits() / 8, DL,
                                      TLI.getPointerTy(DAG.getDataLayout()));
  SDValue BasePtr = St->getBasePtr();

  // Perform one or more big stores into memory.
  unsigned E = (ToEltSz * NumElems) / StoreType.getSizeInBits();
  for (unsigned I = 0; I < E; I++) {
    SDValue SubVec = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, StoreType,
                                 ShuffWide, DAG.getIntPtrConstant(I, DL));
    SDValue Ch =
        DAG.getStore(St->getChain(), DL, SubVec, BasePtr, St->getPointerInfo(),
                     St->getAlignment(), St->getMemOperand()->getFlags());
    BasePtr =
        DAG.getNode(ISD::ADD, DL, BasePtr.getValueType(), BasePtr, Increment);
    Chains.push_back(Ch);
  }
  return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
}

// Try taking a single vector store from an fpround (which would otherwise turn
// into an expensive buildvector) and splitting it into a series of narrowing
// stores.
static SDValue PerformSplittingToNarrowingStores(StoreSDNode *St,
                                                 SelectionDAG &DAG) {
  if (!St->isSimple() || St->isTruncatingStore() || !St->isUnindexed())
    return SDValue();
  SDValue Trunc = St->getValue();
  if (Trunc->getOpcode() != ISD::FP_ROUND)
    return SDValue();
  EVT FromVT = Trunc->getOperand(0).getValueType();
  EVT ToVT = Trunc.getValueType();
  if (!ToVT.isVector())
    return SDValue();
  assert(FromVT.getVectorNumElements() == ToVT.getVectorNumElements());
  EVT ToEltVT = ToVT.getVectorElementType();
  EVT FromEltVT = FromVT.getVectorElementType();

  if (FromEltVT != MVT::f32 || ToEltVT != MVT::f16)
    return SDValue();

  unsigned NumElements = 4;
  if (FromVT.getVectorNumElements() % NumElements != 0)
    return SDValue();

  // Test if the Trunc will be convertible to a VMOVN with a shuffle, and if so
  // use the VMOVN over splitting the store. We are looking for patterns of:
  // !rev: 0 N 1 N+1 2 N+2 ...
  //  rev: N 0 N+1 1 N+2 2 ...
  // The shuffle may either be a single source (in which case N = NumElts/2) or
  // two inputs extended with concat to the same size (in which case N =
  // NumElts).
  auto isVMOVNShuffle = [&](ShuffleVectorSDNode *SVN, bool Rev) {
    ArrayRef<int> M = SVN->getMask();
    unsigned NumElts = ToVT.getVectorNumElements();
    if (SVN->getOperand(1).isUndef())
      NumElts /= 2;

    unsigned Off0 = Rev ? NumElts : 0;
    unsigned Off1 = Rev ? 0 : NumElts;

    for (unsigned I = 0; I < NumElts; I += 2) {
      if (M[I] >= 0 && M[I] != (int)(Off0 + I / 2))
        return false;
      if (M[I + 1] >= 0 && M[I + 1] != (int)(Off1 + I / 2))
        return false;
    }

    return true;
  };

  if (auto *Shuffle = dyn_cast<ShuffleVectorSDNode>(Trunc.getOperand(0)))
    if (isVMOVNShuffle(Shuffle, false) || isVMOVNShuffle(Shuffle, true))
      return SDValue();

  LLVMContext &C = *DAG.getContext();
  SDLoc DL(St);
  // Details about the old store
  SDValue Ch = St->getChain();
  SDValue BasePtr = St->getBasePtr();
  Align Alignment = St->getOriginalAlign();
  MachineMemOperand::Flags MMOFlags = St->getMemOperand()->getFlags();
  AAMDNodes AAInfo = St->getAAInfo();

  // We split the store into slices of NumElements. fp16 trunc stores are vcvt
  // and then stored as truncating integer stores.
  EVT NewFromVT = EVT::getVectorVT(C, FromEltVT, NumElements);
  EVT NewToVT = EVT::getVectorVT(
      C, EVT::getIntegerVT(C, ToEltVT.getSizeInBits()), NumElements);

  SmallVector<SDValue, 4> Stores;
  for (unsigned i = 0; i < FromVT.getVectorNumElements() / NumElements; i++) {
    unsigned NewOffset = i * NumElements * ToEltVT.getSizeInBits() / 8;
    SDValue NewPtr =
        DAG.getObjectPtrOffset(DL, BasePtr, TypeSize::Fixed(NewOffset));

    SDValue Extract =
        DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, NewFromVT, Trunc.getOperand(0),
                    DAG.getConstant(i * NumElements, DL, MVT::i32));

    SDValue FPTrunc =
        DAG.getNode(ARMISD::VCVTN, DL, MVT::v8f16, DAG.getUNDEF(MVT::v8f16),
                    Extract, DAG.getConstant(0, DL, MVT::i32));
    Extract = DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, MVT::v4i32, FPTrunc);

    SDValue Store = DAG.getTruncStore(
        Ch, DL, Extract, NewPtr, St->getPointerInfo().getWithOffset(NewOffset),
        NewToVT, Alignment.value(), MMOFlags, AAInfo);
    Stores.push_back(Store);
  }
  return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Stores);
}

// Try taking a single vector store from an MVETRUNC (which would otherwise turn
// into an expensive buildvector) and splitting it into a series of narrowing
// stores.
static SDValue PerformSplittingMVETruncToNarrowingStores(StoreSDNode *St,
                                                         SelectionDAG &DAG) {
  if (!St->isSimple() || St->isTruncatingStore() || !St->isUnindexed())
    return SDValue();
  SDValue Trunc = St->getValue();
  if (Trunc->getOpcode() != ARMISD::MVETRUNC)
    return SDValue();
  EVT FromVT = Trunc->getOperand(0).getValueType();
  EVT ToVT = Trunc.getValueType();

  LLVMContext &C = *DAG.getContext();
  SDLoc DL(St);
  // Details about the old store
  SDValue Ch = St->getChain();
  SDValue BasePtr = St->getBasePtr();
  Align Alignment = St->getOriginalAlign();
  MachineMemOperand::Flags MMOFlags = St->getMemOperand()->getFlags();
  AAMDNodes AAInfo = St->getAAInfo();

  EVT NewToVT = EVT::getVectorVT(C, ToVT.getVectorElementType(),
                                 FromVT.getVectorNumElements());

  SmallVector<SDValue, 4> Stores;
  for (unsigned i = 0; i < Trunc.getNumOperands(); i++) {
    unsigned NewOffset =
        i * FromVT.getVectorNumElements() * ToVT.getScalarSizeInBits() / 8;
    SDValue NewPtr =
        DAG.getObjectPtrOffset(DL, BasePtr, TypeSize::Fixed(NewOffset));

    SDValue Extract = Trunc.getOperand(i);

    SDValue Store = DAG.getTruncStore(
        Ch, DL, Extract, NewPtr, St->getPointerInfo().getWithOffset(NewOffset),
        NewToVT, Alignment.value(), MMOFlags, AAInfo);
    Stores.push_back(Store);
  }
  return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Stores);
}

// Given a floating point store from an extracted vector, with an integer
// VGETLANE that already exists, store the existing VGETLANEu directly. This can
// help reduce fp register pressure, doesn't require the fp extract and allows
// use of more integer post-inc stores not available with vstr.
static SDValue PerformExtractFpToIntStores(StoreSDNode *St, SelectionDAG &DAG) {
  if (!St->isSimple() || St->isTruncatingStore() || !St->isUnindexed())
    return SDValue();
  SDValue Extract = St->getValue();
  EVT VT = Extract.getValueType();
  // For now only uses f16. This may be useful for f32 too, but that will
  // be bitcast(extract), not the VGETLANEu we currently check here.
  if (VT != MVT::f16 || Extract->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
    return SDValue();

  SDNode *GetLane =
      DAG.getNodeIfExists(ARMISD::VGETLANEu, DAG.getVTList(MVT::i32),
                          {Extract.getOperand(0), Extract.getOperand(1)});
  if (!GetLane)
    return SDValue();

  LLVMContext &C = *DAG.getContext();
  SDLoc DL(St);
  // Create a new integer store to replace the existing floating point version.
  SDValue Ch = St->getChain();
  SDValue BasePtr = St->getBasePtr();
  Align Alignment = St->getOriginalAlign();
  MachineMemOperand::Flags MMOFlags = St->getMemOperand()->getFlags();
  AAMDNodes AAInfo = St->getAAInfo();
  EVT NewToVT = EVT::getIntegerVT(C, VT.getSizeInBits());
  SDValue Store = DAG.getTruncStore(Ch, DL, SDValue(GetLane, 0), BasePtr,
                                    St->getPointerInfo(), NewToVT,
                                    Alignment.value(), MMOFlags, AAInfo);

  return Store;
}

/// PerformSTORECombine - Target-specific dag combine xforms for
/// ISD::STORE.
static SDValue PerformSTORECombine(SDNode *N,
                                   TargetLowering::DAGCombinerInfo &DCI,
                                   const ARMSubtarget *Subtarget) {
  StoreSDNode *St = cast<StoreSDNode>(N);
  if (St->isVolatile())
    return SDValue();
  SDValue StVal = St->getValue();
  EVT VT = StVal.getValueType();

  if (Subtarget->hasNEON())
    if (SDValue Store = PerformTruncatingStoreCombine(St, DCI.DAG))
      return Store;

  if (Subtarget->hasMVEIntegerOps()) {
    if (SDValue NewToken = PerformSplittingToNarrowingStores(St, DCI.DAG))
      return NewToken;
    if (SDValue NewChain = PerformExtractFpToIntStores(St, DCI.DAG))
      return NewChain;
    if (SDValue NewToken =
            PerformSplittingMVETruncToNarrowingStores(St, DCI.DAG))
      return NewToken;
  }

  if (!ISD::isNormalStore(St))
    return SDValue();

  // Split a store of a VMOVDRR into two integer stores to avoid mixing NEON and
  // ARM stores of arguments in the same cache line.
  if (StVal.getNode()->getOpcode() == ARMISD::VMOVDRR &&
      StVal.getNode()->hasOneUse()) {
    SelectionDAG &DAG = DCI.DAG;
    bool isBigEndian = DAG.getDataLayout().isBigEndian();
    SDLoc DL(St);
    SDValue BasePtr = St->getBasePtr();
    SDValue NewST1 = DAG.getStore(
        St->getChain(), DL, StVal.getNode()->getOperand(isBigEndian ? 1 : 0),
        BasePtr, St->getPointerInfo(), St->getOriginalAlign(),
        St->getMemOperand()->getFlags());

    SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i32, BasePtr,
                                    DAG.getConstant(4, DL, MVT::i32));
    return DAG.getStore(NewST1.getValue(0), DL,
                        StVal.getNode()->getOperand(isBigEndian ? 0 : 1),
                        OffsetPtr, St->getPointerInfo().getWithOffset(4),
                        St->getOriginalAlign(),
                        St->getMemOperand()->getFlags());
  }

  if (StVal.getValueType() == MVT::i64 &&
      StVal.getNode()->getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
    // Bitcast an i64 store extracted from a vector to f64.
    // Otherwise, the i64 value will be legalized to a pair of i32 values.
    SelectionDAG &DAG = DCI.DAG;
    SDLoc dl(StVal);
    SDValue IntVec = StVal.getOperand(0);
    EVT FloatVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64,
                                   IntVec.getValueType().getVectorNumElements());
    SDValue Vec = DAG.getNode(ISD::BITCAST, dl, FloatVT, IntVec);
    SDValue ExtElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
                                 Vec, StVal.getOperand(1));
    dl = SDLoc(N);
    SDValue V = DAG.getNode(ISD::BITCAST, dl, MVT::i64, ExtElt);
    // Make the DAGCombiner fold the bitcasts.
    DCI.AddToWorklist(Vec.getNode());
    DCI.AddToWorklist(ExtElt.getNode());
    DCI.AddToWorklist(V.getNode());
    return DAG.getStore(St->getChain(), dl, V, St->getBasePtr(),
                        St->getPointerInfo(), St->getAlignment(),
                        St->getMemOperand()->getFlags(), St->getAAInfo());
  }

  // If this is a legal vector store, try to combine it into a VST1_UPD.
  if (Subtarget->hasNEON() && ISD::isNormalStore(N) && VT.isVector() &&
      DCI.DAG.getTargetLoweringInfo().isTypeLegal(VT))
    return CombineBaseUpdate(N, DCI);

  return SDValue();
}

/// PerformVCVTCombine - VCVT (floating-point to fixed-point, Advanced SIMD)
/// can replace combinations of VMUL and VCVT (floating-point to integer)
/// when the VMUL has a constant operand that is a power of 2.
///
/// Example (assume d17 = <float 8.000000e+00, float 8.000000e+00>):
///  vmul.f32        d16, d17, d16
///  vcvt.s32.f32    d16, d16
/// becomes:
///  vcvt.s32.f32    d16, d16, #3
static SDValue PerformVCVTCombine(SDNode *N, SelectionDAG &DAG,
                                  const ARMSubtarget *Subtarget) {
  if (!Subtarget->hasNEON())
    return SDValue();

  SDValue Op = N->getOperand(0);
  if (!Op.getValueType().isVector() || !Op.getValueType().isSimple() ||
      Op.getOpcode() != ISD::FMUL)
    return SDValue();

  SDValue ConstVec = Op->getOperand(1);
  if (!isa<BuildVectorSDNode>(ConstVec))
    return SDValue();

  MVT FloatTy = Op.getSimpleValueType().getVectorElementType();
  uint32_t FloatBits = FloatTy.getSizeInBits();
  MVT IntTy = N->getSimpleValueType(0).getVectorElementType();
  uint32_t IntBits = IntTy.getSizeInBits();
  unsigned NumLanes = Op.getValueType().getVectorNumElements();
  if (FloatBits != 32 || IntBits > 32 || (NumLanes != 4 && NumLanes != 2)) {
    // These instructions only exist converting from f32 to i32. We can handle
    // smaller integers by generating an extra truncate, but larger ones would
    // be lossy. We also can't handle anything other than 2 or 4 lanes, since
    // these instructions only support v2i32/v4i32 types.
    return SDValue();
  }

  BitVector UndefElements;
  BuildVectorSDNode *BV = cast<BuildVectorSDNode>(ConstVec);
  int32_t C = BV->getConstantFPSplatPow2ToLog2Int(&UndefElements, 33);
  if (C == -1 || C == 0 || C > 32)
    return SDValue();

  SDLoc dl(N);
  bool isSigned = N->getOpcode() == ISD::FP_TO_SINT;
  unsigned IntrinsicOpcode = isSigned ? Intrinsic::arm_neon_vcvtfp2fxs
                                      : Intrinsic::arm_neon_vcvtfp2fxu;
  SDValue FixConv = DAG.getNode(
      ISD::INTRINSIC_WO_CHAIN, dl, NumLanes == 2 ? MVT::v2i32 : MVT::v4i32,
      DAG.getConstant(IntrinsicOpcode, dl, MVT::i32), Op->getOperand(0),
      DAG.getConstant(C, dl, MVT::i32));

  if (IntBits < FloatBits)
    FixConv = DAG.getNode(ISD::TRUNCATE, dl, N->getValueType(0), FixConv);

  return FixConv;
}
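
// With MVE float ops, fold (fadd x, (vselect c, y, -0.0)) into
// (vselect c, (fadd x, y), x), which maps more directly onto a predicated
// vadd/vfma.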
static SDValue PerformFAddVSelectCombine(SDNode *N, SelectionDAG &DAG,
                                         const ARMSubtarget *Subtarget) {
  if (!Subtarget->hasMVEFloatOps())
    return SDValue();

  // Turn (fadd x, (vselect c, y, -0.0)) into (vselect c, (fadd x, y), x)
  // The second form can be more easily turned into a predicated vadd, and
  // possibly combined into a fma to become a predicated vfma.
  SDValue Op0 = N->getOperand(0);
  SDValue Op1 = N->getOperand(1);
  EVT VT = N->getValueType(0);
  SDLoc DL(N);

  // The identity element for a fadd is -0.0, which these VMOV's represent.
  auto isNegativeZeroSplat = [&](SDValue Op) {
    if (Op.getOpcode() != ISD::BITCAST ||
        Op.getOperand(0).getOpcode() != ARMISD::VMOVIMM)
      return false;
    if (VT == MVT::v4f32 && Op.getOperand(0).getConstantOperandVal(0) == 1664)
      return true;
    if (VT == MVT::v8f16 && Op.getOperand(0).getConstantOperandVal(0) == 2688)
      return true;
    return false;
  };

  if (Op0.getOpcode() == ISD::VSELECT && Op1.getOpcode() != ISD::VSELECT)
    std::swap(Op0, Op1);

  if (Op1.getOpcode() != ISD::VSELECT ||
      !isNegativeZeroSplat(Op1.getOperand(2)))
    return SDValue();
  SDValue FAdd =
      DAG.getNode(ISD::FADD, DL, VT, Op0, Op1.getOperand(1), N->getFlags());
  return DAG.getNode(ISD::VSELECT, DL, VT, Op1.getOperand(0), FAdd, Op0);
}

/// PerformVDIVCombine - VCVT (fixed-point to floating-point, Advanced SIMD)
/// can replace combinations of VCVT (integer to floating-point) and VDIV
/// when the VDIV has a constant operand that is a power of 2.
///
/// Example (assume d17 = <float 8.000000e+00, float 8.000000e+00>):
///  vcvt.f32.s32    d16, d16
///  vdiv.f32        d16, d17, d16
/// becomes:
///  vcvt.f32.s32    d16, d16, #3
static SDValue PerformVDIVCombine(SDNode *N, SelectionDAG &DAG,
                                  const ARMSubtarget *Subtarget) {
  if (!Subtarget->hasNEON())
    return SDValue();

  SDValue Op = N->getOperand(0);
  unsigned OpOpcode = Op.getNode()->getOpcode();
  if (!N->getValueType(0).isVector() || !N->getValueType(0).isSimple() ||
      (OpOpcode != ISD::SINT_TO_FP && OpOpcode != ISD::UINT_TO_FP))
    return SDValue();

  SDValue ConstVec = N->getOperand(1);
  if (!isa<BuildVectorSDNode>(ConstVec))
    return SDValue();

  MVT FloatTy = N->getSimpleValueType(0).getVectorElementType();
  uint32_t FloatBits = FloatTy.getSizeInBits();
  MVT IntTy = Op.getOperand(0).getSimpleValueType().getVectorElementType();
  uint32_t IntBits = IntTy.getSizeInBits();
  unsigned NumLanes = Op.getValueType().getVectorNumElements();
  if (FloatBits != 32 || IntBits > 32 || (NumLanes != 4 && NumLanes != 2)) {
    // These instructions only exist converting from i32 to f32. We can handle
    // smaller integers by generating an extra extend, but larger ones would
    // be lossy. We also can't handle anything other than 2 or 4 lanes, since
    // these instructions only support v2i32/v4i32 types.
    return SDValue();
  }

  BitVector UndefElements;
  BuildVectorSDNode *BV = cast<BuildVectorSDNode>(ConstVec);
  int32_t C = BV->getConstantFPSplatPow2ToLog2Int(&UndefElements, 33);
  if (C == -1 || C == 0 || C > 32)
    return SDValue();

  SDLoc dl(N);
  bool isSigned = OpOpcode == ISD::SINT_TO_FP;
  SDValue ConvInput = Op.getOperand(0);
  if (IntBits < FloatBits)
    ConvInput = DAG.getNode(isSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND,
                            dl, NumLanes == 2 ? MVT::v2i32 : MVT::v4i32,
                            ConvInput);

  unsigned IntrinsicOpcode = isSigned ? Intrinsic::arm_neon_vcvtfxs2fp
                                      : Intrinsic::arm_neon_vcvtfxu2fp;
  return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl,
                     Op.getValueType(),
                     DAG.getConstant(IntrinsicOpcode, dl, MVT::i32),
                     ConvInput, DAG.getConstant(C, dl, MVT::i32));
}

static SDValue PerformVECREDUCE_ADDCombine(SDNode *N, SelectionDAG &DAG,
                                           const ARMSubtarget *ST) {
  if (!ST->hasMVEIntegerOps())
    return SDValue();
  assert(N->getOpcode() == ISD::VECREDUCE_ADD);
  EVT ResVT = N->getValueType(0);
  SDValue N0 = N->getOperand(0);
  SDLoc dl(N);
  // Try to turn vecreduce_add(add(x, y)) into vecreduce(x) + vecreduce(y)
  if (ResVT == MVT::i32 && N0.getOpcode() == ISD::ADD &&
      (N0.getValueType() == MVT::v4i32 || N0.getValueType() == MVT::v8i16 ||
       N0.getValueType() == MVT::v16i8)) {
    SDValue Red0 = DAG.getNode(ISD::VECREDUCE_ADD, dl, ResVT, N0.getOperand(0));
    SDValue Red1 = DAG.getNode(ISD::VECREDUCE_ADD, dl, ResVT, N0.getOperand(1));
    return DAG.getNode(ISD::ADD, dl, ResVT, Red0, Red1);
  }
  // We are looking for something that will have illegal types if left alone,
  // but that we can convert to a single instruction under MVE. For example
  //   vecreduce_add(sext(A, v8i32)) => VADDV.s16 A
  // or
  //   vecreduce_add(mul(zext(A, v16i32), zext(B, v16i32))) => VMLADAV.u8 A, B
  // The legal cases are:
  //   VADDV  u/s 8/16/32
  //   VMLAV  u/s 8/16/32
  //   VADDLV u/s 32
  //   VMLALV u/s 16/32
  // If the input vector is smaller than legal (v4i8/v4i16 for example) we can
  // extend it and use v4i32 instead.
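  // For example (illustrative), a smaller-than-legal input such as
  //   vecreduce_add(sext(A, v4i64)) with A : v4i16
  // is handled by first sign extending A to v4i32 and then emitting
  // VADDLV.s32 A.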
  auto ExtTypeMatches = [](SDValue A, ArrayRef<MVT> ExtTypes) {
    EVT AVT = A.getValueType();
    return any_of(ExtTypes, [&](MVT Ty) {
      return AVT.getVectorNumElements() == Ty.getVectorNumElements() &&
             AVT.bitsLE(Ty);
    });
  };
  auto ExtendIfNeeded = [&](SDValue A, unsigned ExtendCode) {
    EVT AVT = A.getValueType();
    if (!AVT.is128BitVector())
      A = DAG.getNode(ExtendCode, dl,
                      AVT.changeVectorElementType(MVT::getIntegerVT(
                          128 / AVT.getVectorMinNumElements())),
                      A);
    return A;
  };
  auto IsVADDV = [&](MVT RetTy, unsigned ExtendCode, ArrayRef<MVT> ExtTypes) {
    if (ResVT != RetTy || N0->getOpcode() != ExtendCode)
      return SDValue();
    SDValue A = N0->getOperand(0);
    if (ExtTypeMatches(A, ExtTypes))
      return ExtendIfNeeded(A, ExtendCode);
    return SDValue();
  };
  auto IsPredVADDV = [&](MVT RetTy, unsigned ExtendCode,
                         ArrayRef<MVT> ExtTypes, SDValue &Mask) {
    if (ResVT != RetTy || N0->getOpcode() != ISD::VSELECT ||
        !ISD::isBuildVectorAllZeros(N0->getOperand(2).getNode()))
      return SDValue();
    Mask = N0->getOperand(0);
    SDValue Ext = N0->getOperand(1);
    if (Ext->getOpcode() != ExtendCode)
      return SDValue();
    SDValue A = Ext->getOperand(0);
    if (ExtTypeMatches(A, ExtTypes))
      return ExtendIfNeeded(A, ExtendCode);
    return SDValue();
  };
  auto IsVMLAV = [&](MVT RetTy, unsigned ExtendCode, ArrayRef<MVT> ExtTypes,
                     SDValue &A, SDValue &B) {
    // For a vmla we are trying to match a larger pattern:
    //   ExtA = sext/zext A
    //   ExtB = sext/zext B
    //   Mul = mul ExtA, ExtB
    //   vecreduce.add Mul
    // There might also be an extra extend between the mul and the addreduce,
    // so long as the bitwidth is high enough to make them equivalent (for
    // example the original v8i16 might be mul'd at v8i32 and the reduce
    // happens at v8i64).
    if (ResVT != RetTy)
      return false;
    SDValue Mul = N0;
    if (Mul->getOpcode() == ExtendCode &&
        Mul->getOperand(0).getScalarValueSizeInBits() * 2 >=
            ResVT.getScalarSizeInBits())
      Mul = Mul->getOperand(0);
    if (Mul->getOpcode() != ISD::MUL)
      return false;
    SDValue ExtA = Mul->getOperand(0);
    SDValue ExtB = Mul->getOperand(1);
    if (ExtA->getOpcode() != ExtendCode || ExtB->getOpcode() != ExtendCode)
      return false;
    A = ExtA->getOperand(0);
    B = ExtB->getOperand(0);
    if (ExtTypeMatches(A, ExtTypes) && ExtTypeMatches(B, ExtTypes)) {
      A = ExtendIfNeeded(A, ExtendCode);
      B = ExtendIfNeeded(B, ExtendCode);
      return true;
    }
    return false;
  };
  auto IsPredVMLAV = [&](MVT RetTy, unsigned ExtendCode, ArrayRef<MVT> ExtTypes,
                         SDValue &A, SDValue &B, SDValue &Mask) {
    // Same as the pattern above with a select for the zero predicated lanes:
    //   ExtA = sext/zext A
    //   ExtB = sext/zext B
    //   Mul = mul ExtA, ExtB
    //   N0 = select Mask, Mul, 0
    //   vecreduce.add N0
    if (ResVT != RetTy || N0->getOpcode() != ISD::VSELECT ||
        !ISD::isBuildVectorAllZeros(N0->getOperand(2).getNode()))
      return false;
    Mask = N0->getOperand(0);
    SDValue Mul = N0->getOperand(1);
    if (Mul->getOpcode() == ExtendCode &&
        Mul->getOperand(0).getScalarValueSizeInBits() * 2 >=
            ResVT.getScalarSizeInBits())
      Mul = Mul->getOperand(0);
    if (Mul->getOpcode() != ISD::MUL)
      return false;
    SDValue ExtA = Mul->getOperand(0);
    SDValue ExtB = Mul->getOperand(1);
    if (ExtA->getOpcode() != ExtendCode || ExtB->getOpcode() != ExtendCode)
      return false;
    A = ExtA->getOperand(0);
    B = ExtB->getOperand(0);
    if (ExtTypeMatches(A, ExtTypes) && ExtTypeMatches(B, ExtTypes)) {
      A = ExtendIfNeeded(A, ExtendCode);
      B = ExtendIfNeeded(B, ExtendCode);
      return true;
    }
    return false;
  };
  auto Create64bitNode = [&](unsigned Opcode, ArrayRef<SDValue> Ops) {
    // Split illegal MVT::v16i8->i64 vector reductions into two legal
    // v8i16->i64 reductions. The operands are extended with MVEEXT, but as
    // they are reductions the lane orders do not matter. MVEEXT may be
    // combined with loads to produce two extending loads, or else they will
    // be expanded to VREV/VMOVL.
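    // Roughly (illustrative), for i64 vecreduce.add(mul(sext(A:v16i8),
    // sext(B:v16i8))):
    //   (A0, A1) = MVESEXT A        ; two v8i16 halves
    //   (B0, B1) = MVESEXT B
    //   R0 = VMLALVs A0, B0
    //   R1 = VMLALVAs R0, R0:1, A1, B1
    // with the two i32 halves of R1 recombined by a BUILD_PAIR.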
    EVT VT = Ops[0].getValueType();
    if (VT == MVT::v16i8) {
      assert((Opcode == ARMISD::VMLALVs || Opcode == ARMISD::VMLALVu) &&
             "Unexpected illegal long reduction opcode");
      bool IsUnsigned = Opcode == ARMISD::VMLALVu;
      SDValue Ext0 =
          DAG.getNode(IsUnsigned ? ARMISD::MVEZEXT : ARMISD::MVESEXT, dl,
                      DAG.getVTList(MVT::v8i16, MVT::v8i16), Ops[0]);
      SDValue Ext1 =
          DAG.getNode(IsUnsigned ? ARMISD::MVEZEXT : ARMISD::MVESEXT, dl,
                      DAG.getVTList(MVT::v8i16, MVT::v8i16), Ops[1]);
      SDValue MLA0 = DAG.getNode(Opcode, dl, DAG.getVTList(MVT::i32, MVT::i32),
                                 Ext0, Ext1);
      SDValue MLA1 =
          DAG.getNode(IsUnsigned ? ARMISD::VMLALVAu : ARMISD::VMLALVAs, dl,
                      DAG.getVTList(MVT::i32, MVT::i32), MLA0, MLA0.getValue(1),
                      Ext0.getValue(1), Ext1.getValue(1));
      return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, MLA1, MLA1.getValue(1));
    }
    SDValue Node = DAG.getNode(Opcode, dl, {MVT::i32, MVT::i32}, Ops);
    return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Node,
                       SDValue(Node.getNode(), 1));
  };
  SDValue A, B;
  SDValue Mask;
  if (IsVMLAV(MVT::i32, ISD::SIGN_EXTEND, {MVT::v8i16, MVT::v16i8}, A, B))
    return DAG.getNode(ARMISD::VMLAVs, dl, ResVT, A, B);
  if (IsVMLAV(MVT::i32, ISD::ZERO_EXTEND, {MVT::v8i16, MVT::v16i8}, A, B))
    return DAG.getNode(ARMISD::VMLAVu, dl, ResVT, A, B);
  if (IsVMLAV(MVT::i64, ISD::SIGN_EXTEND, {MVT::v16i8, MVT::v8i16, MVT::v4i32},
              A, B))
    return Create64bitNode(ARMISD::VMLALVs, {A, B});
  if (IsVMLAV(MVT::i64, ISD::ZERO_EXTEND, {MVT::v16i8, MVT::v8i16, MVT::v4i32},
              A, B))
    return Create64bitNode(ARMISD::VMLALVu, {A, B});
  if (IsVMLAV(MVT::i16, ISD::SIGN_EXTEND, {MVT::v16i8}, A, B))
    return DAG.getNode(ISD::TRUNCATE, dl, ResVT,
                       DAG.getNode(ARMISD::VMLAVs, dl, MVT::i32, A, B));
  if (IsVMLAV(MVT::i16, ISD::ZERO_EXTEND, {MVT::v16i8}, A, B))
    return DAG.getNode(ISD::TRUNCATE, dl, ResVT,
                       DAG.getNode(ARMISD::VMLAVu, dl, MVT::i32, A, B));
  if (IsPredVMLAV(MVT::i32, ISD::SIGN_EXTEND, {MVT::v8i16, MVT::v16i8}, A, B,
                  Mask))
    return DAG.getNode(ARMISD::VMLAVps, dl, ResVT, A, B, Mask);
  if (IsPredVMLAV(MVT::i32, ISD::ZERO_EXTEND, {MVT::v8i16, MVT::v16i8}, A, B,
                  Mask))
    return DAG.getNode(ARMISD::VMLAVpu, dl, ResVT, A, B, Mask);
  if (IsPredVMLAV(MVT::i64, ISD::SIGN_EXTEND, {MVT::v8i16, MVT::v4i32}, A, B,
                  Mask))
    return Create64bitNode(ARMISD::VMLALVps, {A, B, Mask});
  if (IsPredVMLAV(MVT::i64, ISD::ZERO_EXTEND, {MVT::v8i16, MVT::v4i32}, A, B,
                  Mask))
    return Create64bitNode(ARMISD::VMLALVpu, {A, B, Mask});
  if (IsPredVMLAV(MVT::i16, ISD::SIGN_EXTEND, {MVT::v16i8}, A, B, Mask))
    return DAG.getNode(ISD::TRUNCATE, dl, ResVT,
                       DAG.getNode(ARMISD::VMLAVps, dl, MVT::i32, A, B, Mask));
  if (IsPredVMLAV(MVT::i16, ISD::ZERO_EXTEND, {MVT::v16i8}, A, B, Mask))
    return DAG.getNode(ISD::TRUNCATE, dl, ResVT,
                       DAG.getNode(ARMISD::VMLAVpu, dl, MVT::i32, A, B, Mask));
  if (SDValue A = IsVADDV(MVT::i32, ISD::SIGN_EXTEND, {MVT::v8i16, MVT::v16i8}))
    return DAG.getNode(ARMISD::VADDVs, dl, ResVT, A);
  if (SDValue A = IsVADDV(MVT::i32, ISD::ZERO_EXTEND, {MVT::v8i16, MVT::v16i8}))
    return DAG.getNode(ARMISD::VADDVu, dl, ResVT, A);
  if (SDValue A = IsVADDV(MVT::i64, ISD::SIGN_EXTEND, {MVT::v4i32}))
    return Create64bitNode(ARMISD::VADDLVs, {A});
  if (SDValue A = IsVADDV(MVT::i64, ISD::ZERO_EXTEND, {MVT::v4i32}))
    return Create64bitNode(ARMISD::VADDLVu, {A});
  if (SDValue A = IsVADDV(MVT::i16, ISD::SIGN_EXTEND, {MVT::v16i8}))
    return DAG.getNode(ISD::TRUNCATE, dl, ResVT,
                       DAG.getNode(ARMISD::VADDVs, dl, MVT::i32, A));
  if (SDValue A = IsVADDV(MVT::i16, ISD::ZERO_EXTEND, {MVT::v16i8}))
    return DAG.getNode(ISD::TRUNCATE, dl, ResVT,
                       DAG.getNode(ARMISD::VADDVu, dl, MVT::i32, A));
  if (SDValue A = IsPredVADDV(MVT::i32, ISD::SIGN_EXTEND,
                              {MVT::v8i16, MVT::v16i8}, Mask))
    return DAG.getNode(ARMISD::VADDVps, dl, ResVT, A, Mask);
  if (SDValue A = IsPredVADDV(MVT::i32, ISD::ZERO_EXTEND,
                              {MVT::v8i16, MVT::v16i8}, Mask))
    return DAG.getNode(ARMISD::VADDVpu, dl, ResVT, A, Mask);
  if (SDValue A = IsPredVADDV(MVT::i64, ISD::SIGN_EXTEND, {MVT::v4i32}, Mask))
    return Create64bitNode(ARMISD::VADDLVps, {A, Mask});
  if (SDValue A = IsPredVADDV(MVT::i64, ISD::ZERO_EXTEND, {MVT::v4i32}, Mask))
    return Create64bitNode(ARMISD::VADDLVpu, {A, Mask});
  if (SDValue A = IsPredVADDV(MVT::i16, ISD::SIGN_EXTEND, {MVT::v16i8}, Mask))
    return DAG.getNode(ISD::TRUNCATE, dl, ResVT,
                       DAG.getNode(ARMISD::VADDVps, dl, MVT::i32, A, Mask));
  if (SDValue A = IsPredVADDV(MVT::i16, ISD::ZERO_EXTEND, {MVT::v16i8}, Mask))
    return DAG.getNode(ISD::TRUNCATE, dl, ResVT,
                       DAG.getNode(ARMISD::VADDVpu, dl, MVT::i32, A, Mask));
  // Some complications. We can get a case where the two inputs of the mul are
  // the same, in which case the output sext will have been helpfully
  // converted to a zext. Turn it back.
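  // e.g. (illustrative)
  //   vecreduce.add(zext(mul(sext(A), sext(A)), v8i32))
  // is rewritten to
  //   vecreduce.add(sext(mul(sext(A), sext(A)), v8i32))
  // so that the signed VMLAV patterns above can match on a revisit.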
  SDValue Op = N0;
  if (Op->getOpcode() == ISD::VSELECT)
    Op = Op->getOperand(1);
  if (Op->getOpcode() == ISD::ZERO_EXTEND &&
      Op->getOperand(0)->getOpcode() == ISD::MUL) {
    SDValue Mul = Op->getOperand(0);
    if (Mul->getOperand(0) == Mul->getOperand(1) &&
        Mul->getOperand(0)->getOpcode() == ISD::SIGN_EXTEND) {
      SDValue Ext =
          DAG.getNode(ISD::SIGN_EXTEND, dl, N0->getValueType(0), Mul);
      if (Op != N0)
        Ext = DAG.getNode(ISD::VSELECT, dl, N0->getValueType(0),
                          N0->getOperand(0), Ext, N0->getOperand(2));
      return DAG.getNode(ISD::VECREDUCE_ADD, dl, ResVT, Ext);
    }
  }
  return SDValue();
}

static SDValue PerformVMOVNCombine(SDNode *N,
                                   TargetLowering::DAGCombinerInfo &DCI) {
  SDValue Op0 = N->getOperand(0);
  SDValue Op1 = N->getOperand(1);
  unsigned IsTop = N->getConstantOperandVal(2);
  // VMOVNT a undef -> a
  // VMOVNB a undef -> a
  // VMOVNB undef a -> a
  if (Op1->isUndef())
    return Op0;
  if (Op0->isUndef() && !IsTop)
    return Op1;
  // VMOVNt(c, VQMOVNb(a, b)) => VQMOVNt(c, b)
  // VMOVNb(c, VQMOVNb(a, b)) => VQMOVNb(c, b)
  if ((Op1->getOpcode() == ARMISD::VQMOVNs ||
       Op1->getOpcode() == ARMISD::VQMOVNu) &&
      Op1->getConstantOperandVal(2) == 0)
    return DCI.DAG.getNode(Op1->getOpcode(), SDLoc(Op1), N->getValueType(0),
                           Op0, Op1->getOperand(1), N->getOperand(2));
  // Only the bottom lanes from Qm (Op1) and either the top or bottom lanes
  // from Qd (Op0) are demanded from a VMOVN, depending on whether we are
  // inserting into the top or bottom lanes.
  unsigned NumElts = N->getValueType(0).getVectorNumElements();
  APInt Op1DemandedElts = APInt::getSplat(NumElts, APInt::getLowBitsSet(2, 1));
  APInt Op0DemandedElts =
      IsTop ? Op1DemandedElts
            : APInt::getSplat(NumElts, APInt::getHighBitsSet(2, 1));
  APInt KnownUndef, KnownZero;
  const TargetLowering &TLI = DCI.DAG.getTargetLoweringInfo();
  if (TLI.SimplifyDemandedVectorElts(Op0, Op0DemandedElts, KnownUndef,
                                     KnownZero, DCI))
    return SDValue(N, 0);
  if (TLI.SimplifyDemandedVectorElts(Op1, Op1DemandedElts, KnownUndef,
                                     KnownZero, DCI))
    return SDValue(N, 0);
  return SDValue();
}

static SDValue PerformVQMOVNCombine(SDNode *N,
                                    TargetLowering::DAGCombinerInfo &DCI) {
  SDValue Op0 = N->getOperand(0);
  unsigned IsTop = N->getConstantOperandVal(2);
  unsigned NumElts = N->getValueType(0).getVectorNumElements();
  APInt Op0DemandedElts =
      APInt::getSplat(NumElts, IsTop ? APInt::getLowBitsSet(2, 1)
                                     : APInt::getHighBitsSet(2, 1));
  APInt KnownUndef, KnownZero;
  const TargetLowering &TLI = DCI.DAG.getTargetLoweringInfo();
  if (TLI.SimplifyDemandedVectorElts(Op0, Op0DemandedElts, KnownUndef,
                                     KnownZero, DCI))
    return SDValue(N, 0);
  return SDValue();
}

static SDValue PerformLongShiftCombine(SDNode *N, SelectionDAG &DAG) {
  SDLoc DL(N);
  SDValue Op0 = N->getOperand(0);
  SDValue Op1 = N->getOperand(1);
  // Turn X << -C -> X >> C and vice versa. The negative shifts can come up
  // from uses of the intrinsics.
  if (auto C = dyn_cast<ConstantSDNode>(N->getOperand(2))) {
    int ShiftAmt = C->getSExtValue();
    if (ShiftAmt == 0) {
      SDValue Merge = DAG.getMergeValues({Op0, Op1}, DL);
      DAG.ReplaceAllUsesWith(N, Merge.getNode());
      return SDValue();
    }
    if (ShiftAmt >= -32 && ShiftAmt < 0) {
      unsigned NewOpcode =
          N->getOpcode() == ARMISD::LSLL ? ARMISD::LSRL : ARMISD::LSLL;
      SDValue NewShift = DAG.getNode(NewOpcode, DL, N->getVTList(), Op0, Op1,
                                     DAG.getConstant(-ShiftAmt, DL, MVT::i32));
      DAG.ReplaceAllUsesWith(N, NewShift.getNode());
      return NewShift;
    }
  }
  return SDValue();
}

/// PerformIntrinsicCombine - ARM-specific DAG combining for intrinsics.
SDValue ARMTargetLowering::PerformIntrinsicCombine(SDNode *N,
                                                   DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  unsigned IntNo = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
  switch (IntNo) {
  default:
    // Don't do anything for most intrinsics.
    break;
  // Vector shifts: check for immediate versions and lower them.
  // Note: This is done during DAG combining instead of DAG legalizing because
  // the build_vectors for 64-bit vector element shift counts are generally
  // not legal, and it is hard to see their values after they get legalized to
  // loads from a constant pool.
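  // For example (illustrative):
  //   @llvm.arm.neon.vshifts(<4 x i32> %a, <4 x i32> <splat of 3>)
  // becomes a single VSHLIMM node shifting each lane left by 3, instead of a
  // shift by a register whose splat would be rebuilt from a constant pool.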
  case Intrinsic::arm_neon_vshifts:
  case Intrinsic::arm_neon_vshiftu:
  case Intrinsic::arm_neon_vrshifts:
  case Intrinsic::arm_neon_vrshiftu:
  case Intrinsic::arm_neon_vrshiftn:
  case Intrinsic::arm_neon_vqshifts:
  case Intrinsic::arm_neon_vqshiftu:
  case Intrinsic::arm_neon_vqshiftsu:
  case Intrinsic::arm_neon_vqshiftns:
  case Intrinsic::arm_neon_vqshiftnu:
  case Intrinsic::arm_neon_vqshiftnsu:
  case Intrinsic::arm_neon_vqrshiftns:
  case Intrinsic::arm_neon_vqrshiftnu:
  case Intrinsic::arm_neon_vqrshiftnsu: {
    EVT VT = N->getOperand(1).getValueType();
    int64_t Cnt;
    unsigned VShiftOpc = 0;
    switch (IntNo) {
    case Intrinsic::arm_neon_vshifts:
    case Intrinsic::arm_neon_vshiftu:
      if (isVShiftLImm(N->getOperand(2), VT, false, Cnt)) {
        VShiftOpc = ARMISD::VSHLIMM;
        break;
      }
      if (isVShiftRImm(N->getOperand(2), VT, false, true, Cnt)) {
        VShiftOpc = (IntNo == Intrinsic::arm_neon_vshifts ? ARMISD::VSHRsIMM
                                                          : ARMISD::VSHRuIMM);
        break;
      }
      return SDValue();
    case Intrinsic::arm_neon_vrshifts:
    case Intrinsic::arm_neon_vrshiftu:
      if (isVShiftRImm(N->getOperand(2), VT, false, true, Cnt))
        break;
      return SDValue();
    case Intrinsic::arm_neon_vqshifts:
    case Intrinsic::arm_neon_vqshiftu:
      if (isVShiftLImm(N->getOperand(2), VT, false, Cnt))
        break;
      return SDValue();
    case Intrinsic::arm_neon_vqshiftsu:
      if (isVShiftLImm(N->getOperand(2), VT, false, Cnt))
        break;
      llvm_unreachable("invalid shift count for vqshlu intrinsic");
    case Intrinsic::arm_neon_vrshiftn:
    case Intrinsic::arm_neon_vqshiftns:
    case Intrinsic::arm_neon_vqshiftnu:
    case Intrinsic::arm_neon_vqshiftnsu:
    case Intrinsic::arm_neon_vqrshiftns:
    case Intrinsic::arm_neon_vqrshiftnu:
    case Intrinsic::arm_neon_vqrshiftnsu:
      // Narrowing shifts require an immediate right shift.
      if (isVShiftRImm(N->getOperand(2), VT, true, true, Cnt))
        break;
      llvm_unreachable(
          "invalid shift count for narrowing vector shift intrinsic");
    default:
      llvm_unreachable("unhandled vector shift");
    }
    switch (IntNo) {
    case Intrinsic::arm_neon_vshifts:
    case Intrinsic::arm_neon_vshiftu:
      // Opcode already set above.
      break;
    case Intrinsic::arm_neon_vrshifts:
      VShiftOpc = ARMISD::VRSHRsIMM;
      break;
    case Intrinsic::arm_neon_vrshiftu:
      VShiftOpc = ARMISD::VRSHRuIMM;
      break;
    case Intrinsic::arm_neon_vrshiftn:
      VShiftOpc = ARMISD::VRSHRNIMM;
      break;
    case Intrinsic::arm_neon_vqshifts:
      VShiftOpc = ARMISD::VQSHLsIMM;
      break;
    case Intrinsic::arm_neon_vqshiftu:
      VShiftOpc = ARMISD::VQSHLuIMM;
      break;
    case Intrinsic::arm_neon_vqshiftsu:
      VShiftOpc = ARMISD::VQSHLsuIMM;
      break;
    case Intrinsic::arm_neon_vqshiftns:
      VShiftOpc = ARMISD::VQSHRNsIMM;
      break;
    case Intrinsic::arm_neon_vqshiftnu:
      VShiftOpc = ARMISD::VQSHRNuIMM;
      break;
    case Intrinsic::arm_neon_vqshiftnsu:
      VShiftOpc = ARMISD::VQSHRNsuIMM;
      break;
    case Intrinsic::arm_neon_vqrshiftns:
      VShiftOpc = ARMISD::VQRSHRNsIMM;
      break;
    case Intrinsic::arm_neon_vqrshiftnu:
      VShiftOpc = ARMISD::VQRSHRNuIMM;
      break;
    case Intrinsic::arm_neon_vqrshiftnsu:
      VShiftOpc = ARMISD::VQRSHRNsuIMM;
      break;
    }
    SDLoc dl(N);
    return DAG.getNode(VShiftOpc, dl, N->getValueType(0), N->getOperand(1),
                       DAG.getConstant(Cnt, dl, MVT::i32));
  }
  case Intrinsic::arm_neon_vshiftins: {
    EVT VT = N->getOperand(1).getValueType();
    int64_t Cnt;
    unsigned VShiftOpc = 0;
    if (isVShiftLImm(N->getOperand(3), VT, false, Cnt))
      VShiftOpc = ARMISD::VSLIIMM;
    else if (isVShiftRImm(N->getOperand(3), VT, false, true, Cnt))
      VShiftOpc = ARMISD::VSRIIMM;
    else {
      llvm_unreachable("invalid shift count for vsli/vsri intrinsic");
    }
    SDLoc dl(N);
    return DAG.getNode(VShiftOpc, dl, N->getValueType(0), N->getOperand(1),
                       N->getOperand(2), DAG.getConstant(Cnt, dl, MVT::i32));
  }
  case Intrinsic::arm_neon_vqrshifts:
  case Intrinsic::arm_neon_vqrshiftu:
    // No immediate versions of these to check for.
    break;
  case Intrinsic::arm_mve_vqdmlah:
  case Intrinsic::arm_mve_vqdmlash:
  case Intrinsic::arm_mve_vqrdmlah:
  case Intrinsic::arm_mve_vqrdmlash:
  case Intrinsic::arm_mve_vmla_n_predicated:
  case Intrinsic::arm_mve_vmlas_n_predicated:
  case Intrinsic::arm_mve_vqdmlah_predicated:
  case Intrinsic::arm_mve_vqdmlash_predicated:
  case Intrinsic::arm_mve_vqrdmlah_predicated:
  case Intrinsic::arm_mve_vqrdmlash_predicated: {
    // These intrinsics all take an i32 scalar operand which is narrowed to the
    // size of a single lane of the vector type they return. So we don't need
    // any bits of that operand above that point, which allows us to eliminate
    // uxth/sxth.
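    // For example (illustrative), for a v8i16 vqdmlah only the low 16 bits of
    // the scalar matter, so an sxth/uxth feeding operand 3 can be dropped once
    // SimplifyDemandedBits proves the high bits are unused.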
    unsigned BitWidth = N->getValueType(0).getScalarSizeInBits();
    APInt DemandedMask = APInt::getLowBitsSet(32, BitWidth);
    if (SimplifyDemandedBits(N->getOperand(3), DemandedMask, DCI))
      return SDValue();
    break;
  }
  case Intrinsic::arm_mve_minv:
  case Intrinsic::arm_mve_maxv:
  case Intrinsic::arm_mve_minav:
  case Intrinsic::arm_mve_maxav:
  case Intrinsic::arm_mve_minv_predicated:
  case Intrinsic::arm_mve_maxv_predicated:
  case Intrinsic::arm_mve_minav_predicated:
  case Intrinsic::arm_mve_maxav_predicated: {
    // These intrinsics all take an i32 scalar operand which is narrowed to the
    // size of a single lane of the vector type they take as the other input.
    unsigned BitWidth = N->getOperand(2)->getValueType(0).getScalarSizeInBits();
    APInt DemandedMask = APInt::getLowBitsSet(32, BitWidth);
    if (SimplifyDemandedBits(N->getOperand(1), DemandedMask, DCI))
      return SDValue();
    break;
  }
  case Intrinsic::arm_mve_addv: {
    // Turn this intrinsic straight into the appropriate ARMISD::VADDV node,
    // which allows PerformADDVecReduce to turn it into VADDLV when possible.
    bool Unsigned = cast<ConstantSDNode>(N->getOperand(2))->getZExtValue();
    unsigned Opc = Unsigned ? ARMISD::VADDVu : ARMISD::VADDVs;
    return DAG.getNode(Opc, SDLoc(N), N->getVTList(), N->getOperand(1));
  }
  case Intrinsic::arm_mve_addlv:
  case Intrinsic::arm_mve_addlv_predicated: {
    // Same for these, but ARMISD::VADDLV has to be followed by a BUILD_PAIR
    // which recombines the two outputs into an i64.
    bool Unsigned = cast<ConstantSDNode>(N->getOperand(2))->getZExtValue();
    unsigned Opc = IntNo == Intrinsic::arm_mve_addlv
                       ? (Unsigned ? ARMISD::VADDLVu : ARMISD::VADDLVs)
                       : (Unsigned ? ARMISD::VADDLVpu : ARMISD::VADDLVps);
    SmallVector<SDValue, 4> Ops;
    for (unsigned i = 1, e = N->getNumOperands(); i < e; i++)
      if (i != 2) // skip the unsigned flag
        Ops.push_back(N->getOperand(i));
    SDLoc dl(N);
    SDValue val = DAG.getNode(Opc, dl, {MVT::i32, MVT::i32}, Ops);
    return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, val.getValue(0),
                       val.getValue(1));
  }
  }
  return SDValue();
}

/// PerformShiftCombine - Checks for immediate versions of vector shifts and
/// lowers them. As with the vector shift intrinsics, this is done during DAG
/// combining instead of DAG legalizing because the build_vectors for 64-bit
/// vector element shift counts are generally not legal, and it is hard to see
/// their values after they get legalized to loads from a constant pool.
static SDValue PerformShiftCombine(SDNode *N,
                                   TargetLowering::DAGCombinerInfo &DCI,
                                   const ARMSubtarget *ST) {
  SelectionDAG &DAG = DCI.DAG;
  EVT VT = N->getValueType(0);
  if (ST->isThumb1Only() && N->getOpcode() == ISD::SHL && VT == MVT::i32 &&
      N->getOperand(0)->getOpcode() == ISD::AND &&
      N->getOperand(0)->hasOneUse()) {
    if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
      return SDValue();
    // Look for the pattern (shl (and x, AndMask), ShiftAmt). This doesn't
    // usually show up because instcombine prefers to canonicalize it to
    // (and (shl x, ShiftAmt) (shl AndMask, ShiftAmt)), but the shift can come
    // out of GEP lowering in some cases.
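    // For example (illustrative), with AndMask == 0x3ffff and ShiftAmt == 2,
    // MaskedBits == 14 and we emit
    //   (srl (shl x, 14), 12)
    // which computes (x & 0x3ffff) << 2 without materializing the mask.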
    SDValue N0 = N->getOperand(0);
    ConstantSDNode *ShiftAmtNode = dyn_cast<ConstantSDNode>(N->getOperand(1));
    if (!ShiftAmtNode)
      return SDValue();
    uint32_t ShiftAmt = static_cast<uint32_t>(ShiftAmtNode->getZExtValue());
    ConstantSDNode *AndMaskNode = dyn_cast<ConstantSDNode>(N0->getOperand(1));
    if (!AndMaskNode)
      return SDValue();
    uint32_t AndMask = static_cast<uint32_t>(AndMaskNode->getZExtValue());
    // Don't transform uxtb/uxth.
    if (AndMask == 255 || AndMask == 65535)
      return SDValue();
    if (isMask_32(AndMask)) {
      uint32_t MaskedBits = countLeadingZeros(AndMask);
      if (MaskedBits > ShiftAmt) {
        SDLoc DL(N);
        SDValue SHL = DAG.getNode(ISD::SHL, DL, MVT::i32, N0->getOperand(0),
                                  DAG.getConstant(MaskedBits, DL, MVT::i32));
        return DAG.getNode(
            ISD::SRL, DL, MVT::i32, SHL,
            DAG.getConstant(MaskedBits - ShiftAmt, DL, MVT::i32));
      }
    }
  }
  // Nothing to be done for scalar shifts.
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  if (!VT.isVector() || !TLI.isTypeLegal(VT))
    return SDValue();
  if (ST->hasMVEIntegerOps() && VT == MVT::v2i64)
    return SDValue();
  int64_t Cnt;
  switch (N->getOpcode()) {
  default: llvm_unreachable("unexpected shift opcode");
  case ISD::SHL:
    if (isVShiftLImm(N->getOperand(1), VT, false, Cnt)) {
      SDLoc dl(N);
      return DAG.getNode(ARMISD::VSHLIMM, dl, VT, N->getOperand(0),
                         DAG.getConstant(Cnt, dl, MVT::i32));
    }
    break;
  case ISD::SRA:
  case ISD::SRL:
    if (isVShiftRImm(N->getOperand(1), VT, false, false, Cnt)) {
      unsigned VShiftOpc =
          (N->getOpcode() == ISD::SRA ? ARMISD::VSHRsIMM : ARMISD::VSHRuIMM);
      SDLoc dl(N);
      return DAG.getNode(VShiftOpc, dl, VT, N->getOperand(0),
                         DAG.getConstant(Cnt, dl, MVT::i32));
    }
  }
  return SDValue();
}

// Look for a sign/zero/fpextend extend of a larger than legal load. This can
// be split into multiple extending loads, which are simpler to deal with than
// an arbitrary extend. For fp extends we use an integer extending load and a
// VCVTL to convert the type to an f32.
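// For example (illustrative):
//   (v8i32 zext (v8i8 load))
// becomes two v4i8->v4i32 zero-extending loads (at offsets 0 and 4 bytes)
// whose results are concatenated back into a v8i32.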
static SDValue PerformSplittingToWideningLoad(SDNode *N, SelectionDAG &DAG) {
  SDValue N0 = N->getOperand(0);
  if (N0.getOpcode() != ISD::LOAD)
    return SDValue();
  LoadSDNode *LD = cast<LoadSDNode>(N0.getNode());
  if (!LD->isSimple() || !N0.hasOneUse() || LD->isIndexed() ||
      LD->getExtensionType() != ISD::NON_EXTLOAD)
    return SDValue();
  EVT FromVT = LD->getValueType(0);
  EVT ToVT = N->getValueType(0);
  if (!ToVT.isVector())
    return SDValue();
  assert(FromVT.getVectorNumElements() == ToVT.getVectorNumElements());
  EVT ToEltVT = ToVT.getVectorElementType();
  EVT FromEltVT = FromVT.getVectorElementType();
  unsigned NumElements = 0;
  if (ToEltVT == MVT::i32 && FromEltVT == MVT::i8)
    NumElements = 4;
  if (ToEltVT == MVT::f32 && FromEltVT == MVT::f16)
    NumElements = 4;
  if (NumElements == 0 ||
      (FromEltVT != MVT::f16 && FromVT.getVectorNumElements() == NumElements) ||
      FromVT.getVectorNumElements() % NumElements != 0 ||
      !isPowerOf2_32(NumElements))
    return SDValue();
  LLVMContext &C = *DAG.getContext();
  SDLoc DL(LD);
  // Details about the old load
  SDValue Ch = LD->getChain();
  SDValue BasePtr = LD->getBasePtr();
  Align Alignment = LD->getOriginalAlign();
  MachineMemOperand::Flags MMOFlags = LD->getMemOperand()->getFlags();
  AAMDNodes AAInfo = LD->getAAInfo();
  ISD::LoadExtType NewExtType =
      N->getOpcode() == ISD::SIGN_EXTEND ? ISD::SEXTLOAD : ISD::ZEXTLOAD;
  SDValue Offset = DAG.getUNDEF(BasePtr.getValueType());
  EVT NewFromVT = EVT::getVectorVT(
      C, EVT::getIntegerVT(C, FromEltVT.getScalarSizeInBits()), NumElements);
  EVT NewToVT = EVT::getVectorVT(
      C, EVT::getIntegerVT(C, ToEltVT.getScalarSizeInBits()), NumElements);
  SmallVector<SDValue, 4> Loads;
  SmallVector<SDValue, 4> Chains;
  for (unsigned i = 0; i < FromVT.getVectorNumElements() / NumElements; i++) {
    unsigned NewOffset = (i * NewFromVT.getSizeInBits()) / 8;
    SDValue NewPtr =
        DAG.getObjectPtrOffset(DL, BasePtr, TypeSize::Fixed(NewOffset));
    SDValue NewLoad =
        DAG.getLoad(ISD::UNINDEXED, NewExtType, NewToVT, DL, Ch, NewPtr, Offset,
                    LD->getPointerInfo().getWithOffset(NewOffset), NewFromVT,
                    Alignment, MMOFlags, AAInfo);
    Loads.push_back(NewLoad);
    Chains.push_back(SDValue(NewLoad.getNode(), 1));
  }
  // The f16 elements need to be extended into their floating point type with
  // VCVTLs.
  if (FromEltVT == MVT::f16) {
    SmallVector<SDValue, 4> Extends;
    for (unsigned i = 0; i < Loads.size(); i++) {
      SDValue LoadBC =
          DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, MVT::v8f16, Loads[i]);
      SDValue FPExt = DAG.getNode(ARMISD::VCVTL, DL, MVT::v4f32, LoadBC,
                                  DAG.getConstant(0, DL, MVT::i32));
      Extends.push_back(FPExt);
    }
    Loads = Extends;
  }
  SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
  DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), NewChain);
  return DAG.getNode(ISD::CONCAT_VECTORS, DL, ToVT, Loads);
}

/// PerformExtendCombine - Target-specific DAG combining for ISD::SIGN_EXTEND,
/// ISD::ZERO_EXTEND, and ISD::ANY_EXTEND.
static SDValue PerformExtendCombine(SDNode *N, SelectionDAG &DAG,
                                    const ARMSubtarget *ST) {
  SDValue N0 = N->getOperand(0);
  // Check for sign- and zero-extensions of vector extract operations of 8- and
  // 16-bit vector elements. NEON and MVE support these directly. They are
  // handled during DAG combining because type legalization will promote them
  // to 32-bit types and it is messy to recognize the operations after that.
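  // For example (illustrative):
  //   (i32 sext (i16 extract_vector_elt v8i16:%v, 3))
  // becomes a single (VGETLANEs %v, 3), a signed lane move, instead of an
  // extract followed by a separate sxth.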
  if ((ST->hasNEON() || ST->hasMVEIntegerOps()) &&
      N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
    SDValue Vec = N0.getOperand(0);
    SDValue Lane = N0.getOperand(1);
    EVT VT = N->getValueType(0);
    EVT EltVT = N0.getValueType();
    const TargetLowering &TLI = DAG.getTargetLoweringInfo();
    if (VT == MVT::i32 && (EltVT == MVT::i8 || EltVT == MVT::i16) &&
        TLI.isTypeLegal(Vec.getValueType()) && isa<ConstantSDNode>(Lane)) {
      unsigned Opc = 0;
      switch (N->getOpcode()) {
      default: llvm_unreachable("unexpected opcode");
      case ISD::SIGN_EXTEND:
        Opc = ARMISD::VGETLANEs;
        break;
      case ISD::ZERO_EXTEND:
      case ISD::ANY_EXTEND:
        Opc = ARMISD::VGETLANEu;
        break;
      }
      return DAG.getNode(Opc, SDLoc(N), VT, Vec, Lane);
    }
  }
  if (ST->hasMVEIntegerOps())
    if (SDValue NewLoad = PerformSplittingToWideningLoad(N, DAG))
      return NewLoad;
  return SDValue();
}

static SDValue PerformFPExtendCombine(SDNode *N, SelectionDAG &DAG,
                                      const ARMSubtarget *ST) {
  if (ST->hasMVEFloatOps())
    if (SDValue NewLoad = PerformSplittingToWideningLoad(N, DAG))
      return NewLoad;
  return SDValue();
}

/// PerformMinMaxCombine - Target-specific DAG combining for creating
/// truncating saturates.
static SDValue PerformMinMaxCombine(SDNode *N, SelectionDAG &DAG,
                                    const ARMSubtarget *ST) {
  EVT VT = N->getValueType(0);
  SDValue N0 = N->getOperand(0);
  if (!ST->hasMVEIntegerOps())
    return SDValue();
  if (SDValue V = PerformVQDMULHCombine(N, DAG))
    return V;
  if (VT != MVT::v4i32 && VT != MVT::v8i16)
    return SDValue();
  auto IsSignedSaturate = [&](SDNode *Min, SDNode *Max) {
    // Check one is a smin and the other is a smax
    if (Min->getOpcode() != ISD::SMIN)
      std::swap(Min, Max);
    if (Min->getOpcode() != ISD::SMIN || Max->getOpcode() != ISD::SMAX)
      return false;
    APInt SaturateC;
    if (VT == MVT::v4i32)
      SaturateC = APInt(32, (1 << 15) - 1, true);
    else //if (VT == MVT::v8i16)
      SaturateC = APInt(16, (1 << 7) - 1, true);
    APInt MinC, MaxC;
    if (!ISD::isConstantSplatVector(Min->getOperand(1).getNode(), MinC) ||
        MinC != SaturateC)
      return false;
    if (!ISD::isConstantSplatVector(Max->getOperand(1).getNode(), MaxC) ||
        MaxC != ~SaturateC)
      return false;
    return true;
  };
  if (IsSignedSaturate(N, N0.getNode())) {
    SDLoc DL(N);
    MVT ExtVT, HalfVT;
    if (VT == MVT::v4i32) {
      HalfVT = MVT::v8i16;
      ExtVT = MVT::v4i16;
    } else { // if (VT == MVT::v8i16)
      HalfVT = MVT::v16i8;
      ExtVT = MVT::v8i8;
    }
    // Create a VQMOVNB with undef top lanes, then sign-extend into the top
    // half. That extend will hopefully be removed if only the bottom bits are
    // demanded (through a truncating store, for example).
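    // For example (illustrative), for v4i32:
    //   smin(smax(x, splat(-32768)), splat(32767))
    // becomes an ARMISD::VQMOVNs (a vqmovnb.s32) writing the bottom lanes of a
    // v8i16, followed by a sign_extend_inreg from v4i16 back up to v4i32.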
    SDValue VQMOVN =
        DAG.getNode(ARMISD::VQMOVNs, DL, HalfVT, DAG.getUNDEF(HalfVT),
                    N0->getOperand(0), DAG.getConstant(0, DL, MVT::i32));
    SDValue Bitcast = DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, VQMOVN);
    return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, Bitcast,
                       DAG.getValueType(ExtVT));
  }
  auto IsUnsignedSaturate = [&](SDNode *Min) {
    // For unsigned, we just need to check for <= 0xffff
    if (Min->getOpcode() != ISD::UMIN)
      return false;
    APInt SaturateC;
    if (VT == MVT::v4i32)
      SaturateC = APInt(32, (1 << 16) - 1, true);
    else //if (VT == MVT::v8i16)
      SaturateC = APInt(16, (1 << 8) - 1, true);
    APInt MinC;
    if (!ISD::isConstantSplatVector(Min->getOperand(1).getNode(), MinC) ||
        MinC != SaturateC)
      return false;
    return true;
  };
  if (IsUnsignedSaturate(N)) {
    SDLoc DL(N);
    MVT HalfVT;
    unsigned ExtConst;
    if (VT == MVT::v4i32) {
      HalfVT = MVT::v8i16;
      ExtConst = 0x0000FFFF;
    } else { // if (VT == MVT::v8i16)
      HalfVT = MVT::v16i8;
      ExtConst = 0x00FF;
    }
    // Create a VQMOVNB with undef top lanes, then zero-extend into the top
    // half with an AND. That extend will hopefully be removed if only the
    // bottom bits are demanded (through a truncating store, for example).
    SDValue VQMOVN =
        DAG.getNode(ARMISD::VQMOVNu, DL, HalfVT, DAG.getUNDEF(HalfVT), N0,
                    DAG.getConstant(0, DL, MVT::i32));
    SDValue Bitcast = DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, VQMOVN);
    return DAG.getNode(ISD::AND, DL, VT, Bitcast,
                       DAG.getConstant(ExtConst, DL, VT));
  }
  return SDValue();
}

static const APInt *isPowerOf2Constant(SDValue V) {
  ConstantSDNode *C = dyn_cast<ConstantSDNode>(V);
  if (!C)
    return nullptr;
  const APInt *CV = &C->getAPIntValue();
  return CV->isPowerOf2() ? CV : nullptr;
}

SDValue ARMTargetLowering::PerformCMOVToBFICombine(SDNode *CMOV,
                                                   SelectionDAG &DAG) const {
  // If we have a CMOV, OR and AND combination such as:
  //   if (x & CN)
  //     y |= CM;
  //
  // And:
  //   * CN is a single bit;
  //   * All bits covered by CM are known zero in y
  //
  // Then we can convert this into a sequence of BFI instructions. This will
  // always be a win if CM is a single bit, will always be no worse than the
  // TST&OR sequence if CM is two bits, and for thumb will be no worse if CM is
  // three bits (due to the extra IT instruction).
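  // For example (illustrative), with CN == 0x4 and CM == 0x30 (bits 4 and 5
  // known zero in y), the tst+orrne sequence becomes a single shift of x
  // followed by two BFIs copying the tested bit into bits 4 and 5 of y.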
  SDValue Op0 = CMOV->getOperand(0);
  SDValue Op1 = CMOV->getOperand(1);
  auto CCNode = cast<ConstantSDNode>(CMOV->getOperand(2));
  auto CC = CCNode->getAPIntValue().getLimitedValue();
  SDValue CmpZ = CMOV->getOperand(4);
  // The compare must be against zero.
  if (!isNullConstant(CmpZ->getOperand(1)))
    return SDValue();
  assert(CmpZ->getOpcode() == ARMISD::CMPZ);
  SDValue And = CmpZ->getOperand(0);
  if (And->getOpcode() != ISD::AND)
    return SDValue();
  const APInt *AndC = isPowerOf2Constant(And->getOperand(1));
  if (!AndC)
    return SDValue();
  SDValue X = And->getOperand(0);
  if (CC == ARMCC::EQ) {
    // We're performing an "equal to zero" compare. Swap the operands so we
    // canonicalize on a "not equal to zero" compare.
    std::swap(Op0, Op1);
  } else {
    assert(CC == ARMCC::NE && "How can a CMPZ node not be EQ or NE?");
  }
  if (Op1->getOpcode() != ISD::OR)
    return SDValue();
  ConstantSDNode *OrC = dyn_cast<ConstantSDNode>(Op1->getOperand(1));
  if (!OrC)
    return SDValue();
  SDValue Y = Op1->getOperand(0);
  if (Op0 != Y)
    return SDValue();
  // Now, is it profitable to continue?
  APInt OrCI = OrC->getAPIntValue();
  unsigned Heuristic = Subtarget->isThumb() ? 3 : 2;
  if (OrCI.countPopulation() > Heuristic)
    return SDValue();
  // Lastly, can we determine that the bits defined by OrCI
  // are zero in Y?
  KnownBits Known = DAG.computeKnownBits(Y);
  if ((OrCI & Known.Zero) != OrCI)
    return SDValue();
  // OK, we can do the combine.
  SDValue V = Y;
  SDLoc dl(X);
  EVT VT = X.getValueType();
  unsigned BitInX = AndC->logBase2();
  if (BitInX != 0) {
    // We must shift X first.
    X = DAG.getNode(ISD::SRL, dl, VT, X, DAG.getConstant(BitInX, dl, VT));
  }
  for (unsigned BitInY = 0, NumActiveBits = OrCI.getActiveBits();
       BitInY < NumActiveBits; ++BitInY) {
    if (OrCI[BitInY] == 0)
      continue;
    APInt Mask(VT.getSizeInBits(), 0);
    Mask.setBit(BitInY);
    V = DAG.getNode(ARMISD::BFI, dl, VT, V, X,
                    // Confusingly, the operand is an *inverted* mask.
                    DAG.getConstant(~Mask, dl, VT));
  }
  return V;
}

// Given N, the value controlling the conditional branch, search for the loop
// intrinsic, returning it, along with how the value is used. We need to handle
// patterns such as the following:
//   (brcond (xor (setcc (loop.decrement), 0, ne), 1), exit)
//   (brcond (setcc (loop.decrement), 0, eq), exit)
//   (brcond (setcc (loop.decrement), 0, ne), header)
static SDValue SearchLoopIntrinsic(SDValue N, ISD::CondCode &CC, int &Imm,
                                   bool &Negate) {
  switch (N->getOpcode()) {
  default:
    break;
  case ISD::XOR: {
    if (!isa<ConstantSDNode>(N.getOperand(1)))
      return SDValue();
    if (!cast<ConstantSDNode>(N.getOperand(1))->isOne())
      return SDValue();
    Negate = !Negate;
    return SearchLoopIntrinsic(N.getOperand(0), CC, Imm, Negate);
  }
  case ISD::SETCC: {
    auto *Const = dyn_cast<ConstantSDNode>(N.getOperand(1));
    if (!Const)
      return SDValue();
    if (Const->isZero())
      Imm = 0;
    else if (Const->isOne())
      Imm = 1;
    else
      return SDValue();
    CC = cast<CondCodeSDNode>(N.getOperand(2))->get();
    return SearchLoopIntrinsic(N->getOperand(0), CC, Imm, Negate);
  }
  case ISD::INTRINSIC_W_CHAIN: {
    unsigned IntOp = cast<ConstantSDNode>(N.getOperand(1))->getZExtValue();
    if (IntOp != Intrinsic::test_start_loop_iterations &&
        IntOp != Intrinsic::loop_decrement_reg)
      return SDValue();
    return N;
  }
  }
  return SDValue();
}

static SDValue PerformHWLoopCombine(SDNode *N,
                                    TargetLowering::DAGCombinerInfo &DCI,
                                    const ARMSubtarget *ST) {
  // The hwloop intrinsics that we're interested in are used for control-flow,
  // either for entering or exiting the loop:
  // - test.start.loop.iterations will test whether its operand is zero. If it
  //   is zero, the proceeding branch should not enter the loop.
  // - loop.decrement.reg also tests whether its operand is zero. If it is
  //   zero, the proceeding branch should not branch back to the beginning of
  //   the loop.
  // So here, we need to check how the brcond is using the result of each of
  // the intrinsics to ensure that we're branching to the right place at the
  // right time.
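  // For example (illustrative), a latch of the form
  //   (brcond (setcc (loop.decrement.reg ...), 0, ne), header)
  // becomes an ARMISD::LOOP_DEC feeding an ARMISD::LE back to the header; for
  // the inverted (eq, exit) form the LE targets the block of the following
  // unconditional br instead, and that br is retargeted to the exit.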
  ISD::CondCode CC;
  SDValue Cond;
  int Imm = 1;
  bool Negate = false;
  SDValue Chain = N->getOperand(0);
  SDValue Dest;
  if (N->getOpcode() == ISD::BRCOND) {
    CC = ISD::SETEQ;
    Cond = N->getOperand(1);
    Dest = N->getOperand(2);
  } else {
    assert(N->getOpcode() == ISD::BR_CC && "Expected BRCOND or BR_CC!");
    CC = cast<CondCodeSDNode>(N->getOperand(1))->get();
    Cond = N->getOperand(2);
    Dest = N->getOperand(4);
    if (auto *Const = dyn_cast<ConstantSDNode>(N->getOperand(3))) {
      if (!Const->isOne() && !Const->isZero())
        return SDValue();
      Imm = Const->getZExtValue();
    } else
      return SDValue();
  }
  SDValue Int = SearchLoopIntrinsic(Cond, CC, Imm, Negate);
  if (!Int)
    return SDValue();
  if (Negate)
    CC = ISD::getSetCCInverse(CC, /* Integer inverse */ MVT::i32);
  auto IsTrueIfZero = [](ISD::CondCode CC, int Imm) {
    return (CC == ISD::SETEQ && Imm == 0) ||
           (CC == ISD::SETNE && Imm == 1) ||
           (CC == ISD::SETLT && Imm == 1) ||
           (CC == ISD::SETULT && Imm == 1);
  };
  auto IsFalseIfZero = [](ISD::CondCode CC, int Imm) {
    return (CC == ISD::SETEQ && Imm == 1) ||
           (CC == ISD::SETNE && Imm == 0) ||
           (CC == ISD::SETGT && Imm == 0) ||
           (CC == ISD::SETUGT && Imm == 0) ||
           (CC == ISD::SETGE && Imm == 1) ||
           (CC == ISD::SETUGE && Imm == 1);
  };
  assert((IsTrueIfZero(CC, Imm) || IsFalseIfZero(CC, Imm)) &&
         "unsupported condition");
  SDLoc dl(Int);
  SelectionDAG &DAG = DCI.DAG;
  SDValue Elements = Int.getOperand(2);
  unsigned IntOp = cast<ConstantSDNode>(Int->getOperand(1))->getZExtValue();
  assert((N->hasOneUse() && N->use_begin()->getOpcode() == ISD::BR) &&
         "expected single br user");
  SDNode *Br = *N->use_begin();
  SDValue OtherTarget = Br->getOperand(1);
  // Update the unconditional branch to branch to the given Dest.
  auto UpdateUncondBr = [](SDNode *Br, SDValue Dest, SelectionDAG &DAG) {
    SDValue NewBrOps[] = {Br->getOperand(0), Dest};
    SDValue NewBr = DAG.getNode(ISD::BR, SDLoc(Br), MVT::Other, NewBrOps);
    DAG.ReplaceAllUsesOfValueWith(SDValue(Br, 0), NewBr);
  };
  if (IntOp == Intrinsic::test_start_loop_iterations) {
    SDValue Res;
    SDValue Setup = DAG.getNode(ARMISD::WLSSETUP, dl, MVT::i32, Elements);
    // We expect this 'instruction' to branch when the counter is zero.
    if (IsTrueIfZero(CC, Imm)) {
      SDValue Ops[] = {Chain, Setup, Dest};
      Res = DAG.getNode(ARMISD::WLS, dl, MVT::Other, Ops);
    } else {
      // The logic is the reverse of what we need for WLS, so find the other
      // basic block target: the target of the proceeding br.
      UpdateUncondBr(Br, Dest, DAG);
      SDValue Ops[] = {Chain, Setup, OtherTarget};
      Res = DAG.getNode(ARMISD::WLS, dl, MVT::Other, Ops);
    }
    // Update LR count to the new value.
    DAG.ReplaceAllUsesOfValueWith(Int.getValue(0), Setup);
    // Update the chain.
    DAG.ReplaceAllUsesOfValueWith(Int.getValue(2), Int.getOperand(0));
    return Res;
  } else {
    SDValue Size = DAG.getTargetConstant(
        cast<ConstantSDNode>(Int.getOperand(3))->getZExtValue(), dl, MVT::i32);
    SDValue Args[] = {Int.getOperand(0), Elements, Size};
    SDValue LoopDec = DAG.getNode(ARMISD::LOOP_DEC, dl,
                                  DAG.getVTList(MVT::i32, MVT::Other), Args);
    DAG.ReplaceAllUsesWith(Int.getNode(), LoopDec.getNode());
    // We expect this instruction to branch when the count is not zero.
    SDValue Target = IsFalseIfZero(CC, Imm) ? Dest : OtherTarget;
    // Update the unconditional branch to target the loop preheader if we've
    // found the condition has been reversed.
    if (Target == OtherTarget)
      UpdateUncondBr(Br, Dest, DAG);
    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
                        SDValue(LoopDec.getNode(), 1), Chain);
    SDValue EndArgs[] = {Chain, SDValue(LoopDec.getNode(), 0), Target};
    return DAG.getNode(ARMISD::LE, dl, MVT::Other, EndArgs);
  }
  return SDValue();
}

/// PerformBRCONDCombine - Target-specific DAG combining for ARMISD::BRCOND.
SDValue
ARMTargetLowering::PerformBRCONDCombine(SDNode *N, SelectionDAG &DAG) const {
  SDValue Cmp = N->getOperand(4);
  if (Cmp.getOpcode() != ARMISD::CMPZ)
    // Only looking at NE cases.
    return SDValue();
  EVT VT = N->getValueType(0);
  SDLoc dl(N);
  SDValue LHS = Cmp.getOperand(0);
  SDValue RHS = Cmp.getOperand(1);
  SDValue Chain = N->getOperand(0);
  SDValue BB = N->getOperand(1);
  SDValue ARMcc = N->getOperand(2);
  ARMCC::CondCodes CC =
      (ARMCC::CondCodes)cast<ConstantSDNode>(ARMcc)->getZExtValue();
  // (brcond Chain BB ne CPSR (cmpz (and (cmov 0 1 CC CPSR Cmp) 1) 0))
  // -> (brcond Chain BB CC CPSR Cmp)
  if (CC == ARMCC::NE && LHS.getOpcode() == ISD::AND && LHS->hasOneUse() &&
      LHS->getOperand(0)->getOpcode() == ARMISD::CMOV &&
      LHS->getOperand(0)->hasOneUse()) {
    auto *LHS00C = dyn_cast<ConstantSDNode>(LHS->getOperand(0)->getOperand(0));
    auto *LHS01C = dyn_cast<ConstantSDNode>(LHS->getOperand(0)->getOperand(1));
    auto *LHS1C = dyn_cast<ConstantSDNode>(LHS->getOperand(1));
    auto *RHSC = dyn_cast<ConstantSDNode>(RHS);
    if ((LHS00C && LHS00C->getZExtValue() == 0) &&
        (LHS01C && LHS01C->getZExtValue() == 1) &&
        (LHS1C && LHS1C->getZExtValue() == 1) &&
        (RHSC && RHSC->getZExtValue() == 0)) {
      return DAG.getNode(
          ARMISD::BRCOND, dl, VT, Chain, BB, LHS->getOperand(0)->getOperand(2),
          LHS->getOperand(0)->getOperand(3), LHS->getOperand(0)->getOperand(4));
    }
  }
  return SDValue();
}

/// PerformCMOVCombine - Target-specific DAG combining for ARMISD::CMOV.
SDValue
ARMTargetLowering::PerformCMOVCombine(SDNode *N, SelectionDAG &DAG) const {
  SDValue Cmp = N->getOperand(4);
  if (Cmp.getOpcode() != ARMISD::CMPZ)
    // Only looking at EQ and NE cases.
    return SDValue();
  EVT VT = N->getValueType(0);
  SDLoc dl(N);
  SDValue LHS = Cmp.getOperand(0);
  SDValue RHS = Cmp.getOperand(1);
  SDValue FalseVal = N->getOperand(0);
  SDValue TrueVal = N->getOperand(1);
  SDValue ARMcc = N->getOperand(2);
  ARMCC::CondCodes CC =
      (ARMCC::CondCodes)cast<ConstantSDNode>(ARMcc)->getZExtValue();
  // BFI is only available on V6T2+.
  if (!Subtarget->isThumb1Only() && Subtarget->hasV6T2Ops()) {
    SDValue R = PerformCMOVToBFICombine(N, DAG);
    if (R)
      return R;
  }
  // Simplify
  //   mov r1, r0
  //   cmp r1, x
  //   mov r0, y
  //   moveq r0, x
  // to
  //   cmp r0, x
  //   movne r0, y
  //
  //   mov r1, r0
  //   cmp r1, x
  //   mov r0, x
  //   movne r0, y
  // to
  //   cmp r0, x
  //   movne r0, y
  /// FIXME: Turn this into a target neutral optimization?
  SDValue Res;
  if (CC == ARMCC::NE && FalseVal == RHS && FalseVal != LHS) {
    Res = DAG.getNode(ARMISD::CMOV, dl, VT, LHS, TrueVal, ARMcc,
                      N->getOperand(3), Cmp);
  } else if (CC == ARMCC::EQ && TrueVal == RHS) {
    SDValue ARMcc;
    SDValue NewCmp = getARMCmp(LHS, RHS, ISD::SETNE, ARMcc, DAG, dl);
    Res = DAG.getNode(ARMISD::CMOV, dl, VT, LHS, FalseVal, ARMcc,
                      N->getOperand(3), NewCmp);
  }
  // (cmov F T ne CPSR (cmpz (cmov 0 1 CC CPSR Cmp) 0))
  // -> (cmov F T CC CPSR Cmp)
  if (CC == ARMCC::NE && LHS.getOpcode() == ARMISD::CMOV && LHS->hasOneUse()) {
    auto *LHS0C = dyn_cast<ConstantSDNode>(LHS->getOperand(0));
    auto *LHS1C = dyn_cast<ConstantSDNode>(LHS->getOperand(1));
    auto *RHSC = dyn_cast<ConstantSDNode>(RHS);
    if ((LHS0C && LHS0C->getZExtValue() == 0) &&
        (LHS1C && LHS1C->getZExtValue() == 1) &&
        (RHSC && RHSC->getZExtValue() == 0)) {
      return DAG.getNode(ARMISD::CMOV, dl, VT, FalseVal, TrueVal,
                         LHS->getOperand(2), LHS->getOperand(3),
                         LHS->getOperand(4));
    }
  }
  if (!VT.isInteger())
    return SDValue();
  // Fold away an unnecessary CMPZ/CMOV
  // CMOV A, B, C1, $cpsr, (CMPZ (CMOV 1, 0, C2, D), 0) ->
  //   if C1==EQ -> CMOV A, B, C2, $cpsr, D
  //   if C1==NE -> CMOV A, B, NOT(C2), $cpsr, D
  if (N->getConstantOperandVal(2) == ARMCC::EQ ||
      N->getConstantOperandVal(2) == ARMCC::NE) {
    ARMCC::CondCodes Cond;
    if (SDValue C = IsCMPZCSINC(N->getOperand(4).getNode(), Cond)) {
      if (N->getConstantOperandVal(2) == ARMCC::NE)
        Cond = ARMCC::getOppositeCondition(Cond);
      return DAG.getNode(N->getOpcode(), SDLoc(N), MVT::i32, N->getOperand(0),
                         N->getOperand(1),
                         DAG.getTargetConstant(Cond, SDLoc(N), MVT::i32),
                         N->getOperand(3), C);
    }
  }
  16029. // Materialize a boolean comparison for integers so we can avoid branching.
  16030. if (isNullConstant(FalseVal)) {
  16031. if (CC == ARMCC::EQ && isOneConstant(TrueVal)) {
  16032. if (!Subtarget->isThumb1Only() && Subtarget->hasV5TOps()) {
  16033. // If x == y then x - y == 0 and ARM's CLZ will return 32, shifting it
  16034. // right 5 bits will make that 32 be 1, otherwise it will be 0.
  16035. // CMOV 0, 1, ==, (CMPZ x, y) -> SRL (CTLZ (SUB x, y)), 5
  16036. SDValue Sub = DAG.getNode(ISD::SUB, dl, VT, LHS, RHS);
  16037. Res = DAG.getNode(ISD::SRL, dl, VT, DAG.getNode(ISD::CTLZ, dl, VT, Sub),
  16038. DAG.getConstant(5, dl, MVT::i32));
  16039. } else {
  16040. // CMOV 0, 1, ==, (CMPZ x, y) ->
  16041. // (ADDCARRY (SUB x, y), t:0, t:1)
  16042. // where t = (SUBCARRY 0, (SUB x, y), 0)
  16043. //
  16044. // The SUBCARRY computes 0 - (x - y) and this will give a borrow when
  16045. // x != y. In other words, a carry C == 1 when x == y, C == 0
  16046. // otherwise.
  16047. // The final ADDCARRY computes
  16048. // x - y + (0 - (x - y)) + C == C
  16049. SDValue Sub = DAG.getNode(ISD::SUB, dl, VT, LHS, RHS);
  16050. SDVTList VTs = DAG.getVTList(VT, MVT::i32);
  16051. SDValue Neg = DAG.getNode(ISD::USUBO, dl, VTs, FalseVal, Sub);
  16052. // ISD::SUBCARRY returns a borrow but we want the carry here
  16053. // actually.
  16054. SDValue Carry =
  16055. DAG.getNode(ISD::SUB, dl, MVT::i32,
  16056. DAG.getConstant(1, dl, MVT::i32), Neg.getValue(1));
  16057. Res = DAG.getNode(ISD::ADDCARRY, dl, VTs, Sub, Neg, Carry);
  16058. }
  16059. } else if (CC == ARMCC::NE && !isNullConstant(RHS) &&
  16060. (!Subtarget->isThumb1Only() || isPowerOf2Constant(TrueVal))) {
  16061. // This seems pointless but will allow us to combine it further below.
  16062. // CMOV 0, z, !=, (CMPZ x, y) -> CMOV (SUBS x, y), z, !=, (SUBS x, y):1
  16063. SDValue Sub =
  16064. DAG.getNode(ARMISD::SUBS, dl, DAG.getVTList(VT, MVT::i32), LHS, RHS);
  16065. SDValue CPSRGlue = DAG.getCopyToReg(DAG.getEntryNode(), dl, ARM::CPSR,
  16066. Sub.getValue(1), SDValue());
  16067. Res = DAG.getNode(ARMISD::CMOV, dl, VT, Sub, TrueVal, ARMcc,
  16068. N->getOperand(3), CPSRGlue.getValue(1));
  16069. FalseVal = Sub;
  16070. }
  16071. } else if (isNullConstant(TrueVal)) {
  16072. if (CC == ARMCC::EQ && !isNullConstant(RHS) &&
  16073. (!Subtarget->isThumb1Only() || isPowerOf2Constant(FalseVal))) {
  16074. // This seems pointless but will allow us to combine it further below
  16075. // Note that we change == for != as this is the dual for the case above.
  16076. // CMOV z, 0, ==, (CMPZ x, y) -> CMOV (SUBS x, y), z, !=, (SUBS x, y):1
  16077. SDValue Sub =
  16078. DAG.getNode(ARMISD::SUBS, dl, DAG.getVTList(VT, MVT::i32), LHS, RHS);
  16079. SDValue CPSRGlue = DAG.getCopyToReg(DAG.getEntryNode(), dl, ARM::CPSR,
  16080. Sub.getValue(1), SDValue());
  16081. Res = DAG.getNode(ARMISD::CMOV, dl, VT, Sub, FalseVal,
  16082. DAG.getConstant(ARMCC::NE, dl, MVT::i32),
  16083. N->getOperand(3), CPSRGlue.getValue(1));
  16084. FalseVal = Sub;
  16085. }
  16086. }
  16087. // On Thumb1, the DAG above may be further combined if z is a power of 2
  16088. // (z == 2 ^ K).
  16089. // CMOV (SUBS x, y), z, !=, (SUBS x, y):1 ->
  16090. // t1 = (USUBO (SUB x, y), 1)
  16091. // t2 = (SUBCARRY (SUB x, y), t1:0, t1:1)
  16092. // Result = if K != 0 then (SHL t2:0, K) else t2:0
  16093. //
  16094. // This also handles the special case of comparing against zero; it's
  16095. // essentially, the same pattern, except there's no SUBS:
  16096. // CMOV x, z, !=, (CMPZ x, 0) ->
  16097. // t1 = (USUBO x, 1)
  16098. // t2 = (SUBCARRY x, t1:0, t1:1)
  16099. // Result = if K != 0 then (SHL t2:0, K) else t2:0
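// Worked example (illustrative only), using the compare-against-zero form
// with z == 4 (so K == 2):
//   x == 0: t1 == 0 - 1, which borrows; t2 == 0 - (-1) - 1 == 0, and
//           0 << 2 == 0.
//   x == 5: t1 == 5 - 1 == 4 with no borrow; t2 == 5 - 4 - 0 == 1, and
//           1 << 2 == 4 == z.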
  16100. const APInt *TrueConst;
  16101. if (Subtarget->isThumb1Only() && CC == ARMCC::NE &&
  16102. ((FalseVal.getOpcode() == ARMISD::SUBS &&
  16103. FalseVal.getOperand(0) == LHS && FalseVal.getOperand(1) == RHS) ||
  16104. (FalseVal == LHS && isNullConstant(RHS))) &&
  16105. (TrueConst = isPowerOf2Constant(TrueVal))) {
  16106. SDVTList VTs = DAG.getVTList(VT, MVT::i32);
  16107. unsigned ShiftAmount = TrueConst->logBase2();
  16108. if (ShiftAmount)
  16109. TrueVal = DAG.getConstant(1, dl, VT);
  16110. SDValue Subc = DAG.getNode(ISD::USUBO, dl, VTs, FalseVal, TrueVal);
  16111. Res = DAG.getNode(ISD::SUBCARRY, dl, VTs, FalseVal, Subc, Subc.getValue(1));
  16112. if (ShiftAmount)
  16113. Res = DAG.getNode(ISD::SHL, dl, VT, Res,
  16114. DAG.getConstant(ShiftAmount, dl, MVT::i32));
  16115. }
  16116. if (Res.getNode()) {
  16117. KnownBits Known = DAG.computeKnownBits(SDValue(N,0));
  16118. // Capture demanded bits information that would be otherwise lost.
  16119. if (Known.Zero == 0xfffffffe)
  16120. Res = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Res,
  16121. DAG.getValueType(MVT::i1));
  16122. else if (Known.Zero == 0xffffff00)
  16123. Res = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Res,
  16124. DAG.getValueType(MVT::i8));
  16125. else if (Known.Zero == 0xffff0000)
  16126. Res = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Res,
  16127. DAG.getValueType(MVT::i16));
  16128. }
  16129. return Res;
  16130. }
  16131. static SDValue PerformBITCASTCombine(SDNode *N,
  16132. TargetLowering::DAGCombinerInfo &DCI,
  16133. const ARMSubtarget *ST) {
  16134. SelectionDAG &DAG = DCI.DAG;
  16135. SDValue Src = N->getOperand(0);
  16136. EVT DstVT = N->getValueType(0);
  16137. // Convert v4f32 bitcast (v4i32 vdup (i32)) -> v4f32 vdup (i32) under MVE.
  16138. if (ST->hasMVEIntegerOps() && Src.getOpcode() == ARMISD::VDUP) {
  16139. EVT SrcVT = Src.getValueType();
  16140. if (SrcVT.getScalarSizeInBits() == DstVT.getScalarSizeInBits())
  16141. return DAG.getNode(ARMISD::VDUP, SDLoc(N), DstVT, Src.getOperand(0));
  16142. }
  16143. // We may have a bitcast of something that has already had this bitcast
  16144. // combine performed on it, so skip past any VECTOR_REG_CASTs.
  16145. while (Src.getOpcode() == ARMISD::VECTOR_REG_CAST)
  16146. Src = Src.getOperand(0);
  16147. // Bitcast from element-wise VMOV or VMVN doesn't need VREV if the VREV that
  16148. // would be generated is at least the width of the element type.
  16149. EVT SrcVT = Src.getValueType();
  16150. if ((Src.getOpcode() == ARMISD::VMOVIMM ||
  16151. Src.getOpcode() == ARMISD::VMVNIMM ||
  16152. Src.getOpcode() == ARMISD::VMOVFPIMM) &&
  16153. SrcVT.getScalarSizeInBits() <= DstVT.getScalarSizeInBits() &&
  16154. DAG.getDataLayout().isBigEndian())
  16155. return DAG.getNode(ARMISD::VECTOR_REG_CAST, SDLoc(N), DstVT, Src);
  16156. // bitcast(extract(x, n)); bitcast(extract(x, n+1)) -> VMOVRRD x
  16157. if (SDValue R = PerformExtractEltToVMOVRRD(N, DCI))
  16158. return R;
  16159. return SDValue();
  16160. }
// Some combines for the MVETrunc truncation legalizer helper. This also
// lowers the node into stack operations after legalizeOps.
  16163. SDValue ARMTargetLowering::PerformMVETruncCombine(
  16164. SDNode *N, TargetLowering::DAGCombinerInfo &DCI) const {
  16165. SelectionDAG &DAG = DCI.DAG;
  16166. EVT VT = N->getValueType(0);
  16167. SDLoc DL(N);
  16168. // MVETrunc(Undef, Undef) -> Undef
  16169. if (all_of(N->ops(), [](SDValue Op) { return Op.isUndef(); }))
  16170. return DAG.getUNDEF(VT);
// MVETrunc(MVETrunc(a, b), MVETrunc(c, d)) -> MVETrunc(a, b, c, d)
  16172. if (N->getNumOperands() == 2 &&
  16173. N->getOperand(0).getOpcode() == ARMISD::MVETRUNC &&
  16174. N->getOperand(1).getOpcode() == ARMISD::MVETRUNC)
  16175. return DAG.getNode(ARMISD::MVETRUNC, DL, VT, N->getOperand(0).getOperand(0),
  16176. N->getOperand(0).getOperand(1),
  16177. N->getOperand(1).getOperand(0),
  16178. N->getOperand(1).getOperand(1));
  16179. // MVETrunc(shuffle, shuffle) -> VMOVN
  16180. if (N->getNumOperands() == 2 &&
  16181. N->getOperand(0).getOpcode() == ISD::VECTOR_SHUFFLE &&
  16182. N->getOperand(1).getOpcode() == ISD::VECTOR_SHUFFLE) {
  16183. auto *S0 = cast<ShuffleVectorSDNode>(N->getOperand(0).getNode());
  16184. auto *S1 = cast<ShuffleVectorSDNode>(N->getOperand(1).getNode());
  16185. if (S0->getOperand(0) == S1->getOperand(0) &&
  16186. S0->getOperand(1) == S1->getOperand(1)) {
  16187. // Construct complete shuffle mask
  16188. SmallVector<int, 8> Mask(S0->getMask().begin(), S0->getMask().end());
  16189. Mask.append(S1->getMask().begin(), S1->getMask().end());
  16190. if (isVMOVNTruncMask(Mask, VT, false))
  16191. return DAG.getNode(
  16192. ARMISD::VMOVN, DL, VT,
  16193. DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, S0->getOperand(0)),
  16194. DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, S0->getOperand(1)),
  16195. DAG.getConstant(1, DL, MVT::i32));
  16196. if (isVMOVNTruncMask(Mask, VT, true))
  16197. return DAG.getNode(
  16198. ARMISD::VMOVN, DL, VT,
  16199. DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, S0->getOperand(1)),
  16200. DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, S0->getOperand(0)),
  16201. DAG.getConstant(1, DL, MVT::i32));
  16202. }
  16203. }
  16204. // For MVETrunc of a buildvector or shuffle, it can be beneficial to lower the
  16205. // truncate to a buildvector to allow the generic optimisations to kick in.
  16206. if (all_of(N->ops(), [](SDValue Op) {
  16207. return Op.getOpcode() == ISD::BUILD_VECTOR ||
  16208. Op.getOpcode() == ISD::VECTOR_SHUFFLE ||
  16209. (Op.getOpcode() == ISD::BITCAST &&
  16210. Op.getOperand(0).getOpcode() == ISD::BUILD_VECTOR);
  16211. })) {
  16212. SmallVector<SDValue, 8> Extracts;
  16213. for (unsigned Op = 0; Op < N->getNumOperands(); Op++) {
  16214. SDValue O = N->getOperand(Op);
  16215. for (unsigned i = 0; i < O.getValueType().getVectorNumElements(); i++) {
  16216. SDValue Ext = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, O,
  16217. DAG.getConstant(i, DL, MVT::i32));
  16218. Extracts.push_back(Ext);
  16219. }
  16220. }
  16221. return DAG.getBuildVector(VT, DL, Extracts);
  16222. }
  16223. // If we are late in the legalization process and nothing has optimised
  16224. // the trunc to anything better, lower it to a stack store and reload,
  16225. // performing the truncation whilst keeping the lanes in the correct order:
  16226. // VSTRH.32 a, stack; VSTRH.32 b, stack+8; VLDRW.32 stack;
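// For example (a sketch of the two-operand case only): for
// MVETRUNC(v4i32 a, v4i32 b) -> v8i16, with a 16-byte stack slot we emit
//   truncstore a as v4i16 to slot+0   (lanes 0..3 of the result)
//   truncstore b as v4i16 to slot+8   (lanes 4..7 of the result)
//   load v8i16 from slot+0
// which corresponds to the VSTRH.32; VSTRH.32; VLDRW.32 sequence above.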
  16227. if (!DCI.isAfterLegalizeDAG())
  16228. return SDValue();
  16229. SDValue StackPtr = DAG.CreateStackTemporary(TypeSize::Fixed(16), Align(4));
  16230. int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
  16231. int NumIns = N->getNumOperands();
  16232. assert((NumIns == 2 || NumIns == 4) &&
  16233. "Expected 2 or 4 inputs to an MVETrunc");
  16234. EVT StoreVT = VT.getHalfNumVectorElementsVT(*DAG.getContext());
  16235. if (N->getNumOperands() == 4)
  16236. StoreVT = StoreVT.getHalfNumVectorElementsVT(*DAG.getContext());
  16237. SmallVector<SDValue> Chains;
  16238. for (int I = 0; I < NumIns; I++) {
  16239. SDValue Ptr = DAG.getNode(
  16240. ISD::ADD, DL, StackPtr.getValueType(), StackPtr,
  16241. DAG.getConstant(I * 16 / NumIns, DL, StackPtr.getValueType()));
  16242. MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(
  16243. DAG.getMachineFunction(), SPFI, I * 16 / NumIns);
  16244. SDValue Ch = DAG.getTruncStore(DAG.getEntryNode(), DL, N->getOperand(I),
  16245. Ptr, MPI, StoreVT, Align(4));
  16246. Chains.push_back(Ch);
  16247. }
  16248. SDValue Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
  16249. MachinePointerInfo MPI =
  16250. MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI, 0);
  16251. return DAG.getLoad(VT, DL, Chain, StackPtr, MPI, Align(4));
  16252. }
// Take an MVEEXT(load x) and split it into (extload x, extload x+8).
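// For example (illustrative only): a (v4i32, v4i32) MVESEXT of a v8i16
// load from p becomes two v4i32 sextloads of v4i16, one from p and one
// from p plus 8 bytes, preserving the original lane order.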
  16254. static SDValue PerformSplittingMVEEXTToWideningLoad(SDNode *N,
  16255. SelectionDAG &DAG) {
  16256. SDValue N0 = N->getOperand(0);
  16257. LoadSDNode *LD = dyn_cast<LoadSDNode>(N0.getNode());
  16258. if (!LD || !LD->isSimple() || !N0.hasOneUse() || LD->isIndexed())
  16259. return SDValue();
  16260. EVT FromVT = LD->getMemoryVT();
  16261. EVT ToVT = N->getValueType(0);
  16262. if (!ToVT.isVector())
  16263. return SDValue();
  16264. assert(FromVT.getVectorNumElements() == ToVT.getVectorNumElements() * 2);
  16265. EVT ToEltVT = ToVT.getVectorElementType();
  16266. EVT FromEltVT = FromVT.getVectorElementType();
  16267. unsigned NumElements = 0;
  16268. if (ToEltVT == MVT::i32 && (FromEltVT == MVT::i16 || FromEltVT == MVT::i8))
  16269. NumElements = 4;
  16270. if (ToEltVT == MVT::i16 && FromEltVT == MVT::i8)
  16271. NumElements = 8;
  16272. assert(NumElements != 0);
  16273. ISD::LoadExtType NewExtType =
  16274. N->getOpcode() == ARMISD::MVESEXT ? ISD::SEXTLOAD : ISD::ZEXTLOAD;
  16275. if (LD->getExtensionType() != ISD::NON_EXTLOAD &&
  16276. LD->getExtensionType() != ISD::EXTLOAD &&
  16277. LD->getExtensionType() != NewExtType)
  16278. return SDValue();
  16279. LLVMContext &C = *DAG.getContext();
  16280. SDLoc DL(LD);
  16281. // Details about the old load
  16282. SDValue Ch = LD->getChain();
  16283. SDValue BasePtr = LD->getBasePtr();
  16284. Align Alignment = LD->getOriginalAlign();
  16285. MachineMemOperand::Flags MMOFlags = LD->getMemOperand()->getFlags();
  16286. AAMDNodes AAInfo = LD->getAAInfo();
  16287. SDValue Offset = DAG.getUNDEF(BasePtr.getValueType());
  16288. EVT NewFromVT = EVT::getVectorVT(
  16289. C, EVT::getIntegerVT(C, FromEltVT.getScalarSizeInBits()), NumElements);
  16290. EVT NewToVT = EVT::getVectorVT(
  16291. C, EVT::getIntegerVT(C, ToEltVT.getScalarSizeInBits()), NumElements);
  16292. SmallVector<SDValue, 4> Loads;
  16293. SmallVector<SDValue, 4> Chains;
  16294. for (unsigned i = 0; i < FromVT.getVectorNumElements() / NumElements; i++) {
  16295. unsigned NewOffset = (i * NewFromVT.getSizeInBits()) / 8;
  16296. SDValue NewPtr =
  16297. DAG.getObjectPtrOffset(DL, BasePtr, TypeSize::Fixed(NewOffset));
  16298. SDValue NewLoad =
  16299. DAG.getLoad(ISD::UNINDEXED, NewExtType, NewToVT, DL, Ch, NewPtr, Offset,
  16300. LD->getPointerInfo().getWithOffset(NewOffset), NewFromVT,
  16301. Alignment, MMOFlags, AAInfo);
  16302. Loads.push_back(NewLoad);
  16303. Chains.push_back(SDValue(NewLoad.getNode(), 1));
  16304. }
  16305. SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
  16306. DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), NewChain);
  16307. return DAG.getMergeValues(Loads, DL);
  16308. }
// Perform combines for MVEEXT. If it has not been optimized to anything
// better before lowering, it gets converted to a stack store and extloads
// performing the extend whilst still keeping the same lane ordering.
  16312. SDValue ARMTargetLowering::PerformMVEExtCombine(
  16313. SDNode *N, TargetLowering::DAGCombinerInfo &DCI) const {
  16314. SelectionDAG &DAG = DCI.DAG;
  16315. EVT VT = N->getValueType(0);
  16316. SDLoc DL(N);
  16317. assert(N->getNumValues() == 2 && "Expected MVEEXT with 2 elements");
  16318. assert((VT == MVT::v4i32 || VT == MVT::v8i16) && "Unexpected MVEEXT type");
  16319. EVT ExtVT = N->getOperand(0).getValueType().getHalfNumVectorElementsVT(
  16320. *DAG.getContext());
  16321. auto Extend = [&](SDValue V) {
  16322. SDValue VVT = DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, V);
  16323. return N->getOpcode() == ARMISD::MVESEXT
  16324. ? DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, VVT,
  16325. DAG.getValueType(ExtVT))
  16326. : DAG.getZeroExtendInReg(VVT, DL, ExtVT);
  16327. };
  16328. // MVEEXT(VDUP) -> SIGN_EXTEND_INREG(VDUP)
  16329. if (N->getOperand(0).getOpcode() == ARMISD::VDUP) {
  16330. SDValue Ext = Extend(N->getOperand(0));
  16331. return DAG.getMergeValues({Ext, Ext}, DL);
  16332. }
  16333. // MVEEXT(shuffle) -> SIGN_EXTEND_INREG/ZERO_EXTEND_INREG
  16334. if (auto *SVN = dyn_cast<ShuffleVectorSDNode>(N->getOperand(0))) {
  16335. ArrayRef<int> Mask = SVN->getMask();
  16336. assert(Mask.size() == 2 * VT.getVectorNumElements());
  16337. assert(Mask.size() == SVN->getValueType(0).getVectorNumElements());
  16338. unsigned Rev = VT == MVT::v4i32 ? ARMISD::VREV32 : ARMISD::VREV16;
  16339. SDValue Op0 = SVN->getOperand(0);
  16340. SDValue Op1 = SVN->getOperand(1);
  16341. auto CheckInregMask = [&](int Start, int Offset) {
  16342. for (int Idx = 0, E = VT.getVectorNumElements(); Idx < E; ++Idx)
  16343. if (Mask[Start + Idx] >= 0 && Mask[Start + Idx] != Idx * 2 + Offset)
  16344. return false;
  16345. return true;
  16346. };
  16347. SDValue V0 = SDValue(N, 0);
  16348. SDValue V1 = SDValue(N, 1);
  16349. if (CheckInregMask(0, 0))
  16350. V0 = Extend(Op0);
  16351. else if (CheckInregMask(0, 1))
  16352. V0 = Extend(DAG.getNode(Rev, DL, SVN->getValueType(0), Op0));
  16353. else if (CheckInregMask(0, Mask.size()))
  16354. V0 = Extend(Op1);
  16355. else if (CheckInregMask(0, Mask.size() + 1))
  16356. V0 = Extend(DAG.getNode(Rev, DL, SVN->getValueType(0), Op1));
  16357. if (CheckInregMask(VT.getVectorNumElements(), Mask.size()))
  16358. V1 = Extend(Op1);
  16359. else if (CheckInregMask(VT.getVectorNumElements(), Mask.size() + 1))
  16360. V1 = Extend(DAG.getNode(Rev, DL, SVN->getValueType(0), Op1));
  16361. else if (CheckInregMask(VT.getVectorNumElements(), 0))
  16362. V1 = Extend(Op0);
  16363. else if (CheckInregMask(VT.getVectorNumElements(), 1))
  16364. V1 = Extend(DAG.getNode(Rev, DL, SVN->getValueType(0), Op0));
  16365. if (V0.getNode() != N || V1.getNode() != N)
  16366. return DAG.getMergeValues({V0, V1}, DL);
  16367. }
  16368. // MVEEXT(load) -> extload, extload
  16369. if (N->getOperand(0)->getOpcode() == ISD::LOAD)
  16370. if (SDValue L = PerformSplittingMVEEXTToWideningLoad(N, DAG))
  16371. return L;
  16372. if (!DCI.isAfterLegalizeDAG())
  16373. return SDValue();
  16374. // Lower to a stack store and reload:
  16375. // VSTRW.32 a, stack; VLDRH.32 stack; VLDRH.32 stack+8;
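// For example (a sketch of the two-result case only): for
// (v4i32, v4i32) MVESEXT(v8i16 a), with a 16-byte stack slot we emit
//   store a (v8i16) to slot+0
//   sextload v4i32 from v4i16 at slot+0   -> result 0
//   sextload v4i32 from v4i16 at slot+8   -> result 1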
  16376. SDValue StackPtr = DAG.CreateStackTemporary(TypeSize::Fixed(16), Align(4));
  16377. int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
  16378. int NumOuts = N->getNumValues();
  16379. assert((NumOuts == 2 || NumOuts == 4) &&
  16380. "Expected 2 or 4 outputs to an MVEEXT");
  16381. EVT LoadVT = N->getOperand(0).getValueType().getHalfNumVectorElementsVT(
  16382. *DAG.getContext());
  16383. if (N->getNumOperands() == 4)
  16384. LoadVT = LoadVT.getHalfNumVectorElementsVT(*DAG.getContext());
  16385. MachinePointerInfo MPI =
  16386. MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI, 0);
  16387. SDValue Chain = DAG.getStore(DAG.getEntryNode(), DL, N->getOperand(0),
  16388. StackPtr, MPI, Align(4));
  16389. SmallVector<SDValue> Loads;
  16390. for (int I = 0; I < NumOuts; I++) {
  16391. SDValue Ptr = DAG.getNode(
  16392. ISD::ADD, DL, StackPtr.getValueType(), StackPtr,
  16393. DAG.getConstant(I * 16 / NumOuts, DL, StackPtr.getValueType()));
  16394. MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(
  16395. DAG.getMachineFunction(), SPFI, I * 16 / NumOuts);
  16396. SDValue Load = DAG.getExtLoad(
  16397. N->getOpcode() == ARMISD::MVESEXT ? ISD::SEXTLOAD : ISD::ZEXTLOAD, DL,
  16398. VT, Chain, Ptr, MPI, LoadVT, Align(4));
  16399. Loads.push_back(Load);
  16400. }
  16401. return DAG.getMergeValues(Loads, DL);
  16402. }
  16403. SDValue ARMTargetLowering::PerformDAGCombine(SDNode *N,
  16404. DAGCombinerInfo &DCI) const {
  16405. switch (N->getOpcode()) {
  16406. default: break;
  16407. case ISD::SELECT_CC:
  16408. case ISD::SELECT: return PerformSELECTCombine(N, DCI, Subtarget);
  16409. case ISD::VSELECT: return PerformVSELECTCombine(N, DCI, Subtarget);
  16410. case ISD::SETCC: return PerformVSetCCToVCTPCombine(N, DCI, Subtarget);
  16411. case ISD::ABS: return PerformABSCombine(N, DCI, Subtarget);
  16412. case ARMISD::ADDE: return PerformADDECombine(N, DCI, Subtarget);
  16413. case ARMISD::UMLAL: return PerformUMLALCombine(N, DCI.DAG, Subtarget);
  16414. case ISD::ADD: return PerformADDCombine(N, DCI, Subtarget);
  16415. case ISD::SUB: return PerformSUBCombine(N, DCI, Subtarget);
  16416. case ISD::MUL: return PerformMULCombine(N, DCI, Subtarget);
  16417. case ISD::OR: return PerformORCombine(N, DCI, Subtarget);
  16418. case ISD::XOR: return PerformXORCombine(N, DCI, Subtarget);
  16419. case ISD::AND: return PerformANDCombine(N, DCI, Subtarget);
  16420. case ISD::BRCOND:
  16421. case ISD::BR_CC: return PerformHWLoopCombine(N, DCI, Subtarget);
  16422. case ARMISD::ADDC:
  16423. case ARMISD::SUBC: return PerformAddcSubcCombine(N, DCI, Subtarget);
  16424. case ARMISD::SUBE: return PerformAddeSubeCombine(N, DCI, Subtarget);
  16425. case ARMISD::BFI: return PerformBFICombine(N, DCI.DAG);
  16426. case ARMISD::VMOVRRD: return PerformVMOVRRDCombine(N, DCI, Subtarget);
  16427. case ARMISD::VMOVDRR: return PerformVMOVDRRCombine(N, DCI.DAG);
  16428. case ARMISD::VMOVhr: return PerformVMOVhrCombine(N, DCI);
  16429. case ARMISD::VMOVrh: return PerformVMOVrhCombine(N, DCI.DAG);
  16430. case ISD::STORE: return PerformSTORECombine(N, DCI, Subtarget);
  16431. case ISD::BUILD_VECTOR: return PerformBUILD_VECTORCombine(N, DCI, Subtarget);
  16432. case ISD::INSERT_VECTOR_ELT: return PerformInsertEltCombine(N, DCI);
  16433. case ISD::EXTRACT_VECTOR_ELT:
  16434. return PerformExtractEltCombine(N, DCI, Subtarget);
  16435. case ISD::SIGN_EXTEND_INREG: return PerformSignExtendInregCombine(N, DCI.DAG);
  16436. case ISD::INSERT_SUBVECTOR: return PerformInsertSubvectorCombine(N, DCI);
  16437. case ISD::VECTOR_SHUFFLE: return PerformVECTOR_SHUFFLECombine(N, DCI.DAG);
  16438. case ARMISD::VDUPLANE: return PerformVDUPLANECombine(N, DCI, Subtarget);
  16439. case ARMISD::VDUP: return PerformVDUPCombine(N, DCI.DAG, Subtarget);
  16440. case ISD::FP_TO_SINT:
  16441. case ISD::FP_TO_UINT:
  16442. return PerformVCVTCombine(N, DCI.DAG, Subtarget);
  16443. case ISD::FADD:
  16444. return PerformFAddVSelectCombine(N, DCI.DAG, Subtarget);
  16445. case ISD::FDIV:
  16446. return PerformVDIVCombine(N, DCI.DAG, Subtarget);
  16447. case ISD::INTRINSIC_WO_CHAIN:
  16448. return PerformIntrinsicCombine(N, DCI);
  16449. case ISD::SHL:
  16450. case ISD::SRA:
  16451. case ISD::SRL:
  16452. return PerformShiftCombine(N, DCI, Subtarget);
  16453. case ISD::SIGN_EXTEND:
  16454. case ISD::ZERO_EXTEND:
  16455. case ISD::ANY_EXTEND:
  16456. return PerformExtendCombine(N, DCI.DAG, Subtarget);
  16457. case ISD::FP_EXTEND:
  16458. return PerformFPExtendCombine(N, DCI.DAG, Subtarget);
  16459. case ISD::SMIN:
  16460. case ISD::UMIN:
  16461. case ISD::SMAX:
  16462. case ISD::UMAX:
  16463. return PerformMinMaxCombine(N, DCI.DAG, Subtarget);
  16464. case ARMISD::CMOV:
  16465. return PerformCMOVCombine(N, DCI.DAG);
  16466. case ARMISD::BRCOND:
  16467. return PerformBRCONDCombine(N, DCI.DAG);
  16468. case ARMISD::CMPZ:
  16469. return PerformCMPZCombine(N, DCI.DAG);
  16470. case ARMISD::CSINC:
  16471. case ARMISD::CSINV:
  16472. case ARMISD::CSNEG:
  16473. return PerformCSETCombine(N, DCI.DAG);
  16474. case ISD::LOAD:
  16475. return PerformLOADCombine(N, DCI, Subtarget);
  16476. case ARMISD::VLD1DUP:
  16477. case ARMISD::VLD2DUP:
  16478. case ARMISD::VLD3DUP:
  16479. case ARMISD::VLD4DUP:
  16480. return PerformVLDCombine(N, DCI);
  16481. case ARMISD::BUILD_VECTOR:
  16482. return PerformARMBUILD_VECTORCombine(N, DCI);
  16483. case ISD::BITCAST:
  16484. return PerformBITCASTCombine(N, DCI, Subtarget);
  16485. case ARMISD::PREDICATE_CAST:
  16486. return PerformPREDICATE_CASTCombine(N, DCI);
  16487. case ARMISD::VECTOR_REG_CAST:
  16488. return PerformVECTOR_REG_CASTCombine(N, DCI.DAG, Subtarget);
  16489. case ARMISD::MVETRUNC:
  16490. return PerformMVETruncCombine(N, DCI);
  16491. case ARMISD::MVESEXT:
  16492. case ARMISD::MVEZEXT:
  16493. return PerformMVEExtCombine(N, DCI);
  16494. case ARMISD::VCMP:
  16495. return PerformVCMPCombine(N, DCI.DAG, Subtarget);
  16496. case ISD::VECREDUCE_ADD:
  16497. return PerformVECREDUCE_ADDCombine(N, DCI.DAG, Subtarget);
  16498. case ARMISD::VMOVN:
  16499. return PerformVMOVNCombine(N, DCI);
  16500. case ARMISD::VQMOVNs:
  16501. case ARMISD::VQMOVNu:
  16502. return PerformVQMOVNCombine(N, DCI);
  16503. case ARMISD::ASRL:
  16504. case ARMISD::LSRL:
  16505. case ARMISD::LSLL:
  16506. return PerformLongShiftCombine(N, DCI.DAG);
  16507. case ARMISD::SMULWB: {
  16508. unsigned BitWidth = N->getValueType(0).getSizeInBits();
  16509. APInt DemandedMask = APInt::getLowBitsSet(BitWidth, 16);
  16510. if (SimplifyDemandedBits(N->getOperand(1), DemandedMask, DCI))
  16511. return SDValue();
  16512. break;
  16513. }
  16514. case ARMISD::SMULWT: {
  16515. unsigned BitWidth = N->getValueType(0).getSizeInBits();
  16516. APInt DemandedMask = APInt::getHighBitsSet(BitWidth, 16);
  16517. if (SimplifyDemandedBits(N->getOperand(1), DemandedMask, DCI))
  16518. return SDValue();
  16519. break;
  16520. }
  16521. case ARMISD::SMLALBB:
  16522. case ARMISD::QADD16b:
  16523. case ARMISD::QSUB16b:
  16524. case ARMISD::UQADD16b:
  16525. case ARMISD::UQSUB16b: {
  16526. unsigned BitWidth = N->getValueType(0).getSizeInBits();
  16527. APInt DemandedMask = APInt::getLowBitsSet(BitWidth, 16);
  16528. if ((SimplifyDemandedBits(N->getOperand(0), DemandedMask, DCI)) ||
  16529. (SimplifyDemandedBits(N->getOperand(1), DemandedMask, DCI)))
  16530. return SDValue();
  16531. break;
  16532. }
  16533. case ARMISD::SMLALBT: {
  16534. unsigned LowWidth = N->getOperand(0).getValueType().getSizeInBits();
  16535. APInt LowMask = APInt::getLowBitsSet(LowWidth, 16);
  16536. unsigned HighWidth = N->getOperand(1).getValueType().getSizeInBits();
  16537. APInt HighMask = APInt::getHighBitsSet(HighWidth, 16);
  16538. if ((SimplifyDemandedBits(N->getOperand(0), LowMask, DCI)) ||
  16539. (SimplifyDemandedBits(N->getOperand(1), HighMask, DCI)))
  16540. return SDValue();
  16541. break;
  16542. }
  16543. case ARMISD::SMLALTB: {
  16544. unsigned HighWidth = N->getOperand(0).getValueType().getSizeInBits();
  16545. APInt HighMask = APInt::getHighBitsSet(HighWidth, 16);
  16546. unsigned LowWidth = N->getOperand(1).getValueType().getSizeInBits();
  16547. APInt LowMask = APInt::getLowBitsSet(LowWidth, 16);
  16548. if ((SimplifyDemandedBits(N->getOperand(0), HighMask, DCI)) ||
  16549. (SimplifyDemandedBits(N->getOperand(1), LowMask, DCI)))
  16550. return SDValue();
  16551. break;
  16552. }
  16553. case ARMISD::SMLALTT: {
  16554. unsigned BitWidth = N->getValueType(0).getSizeInBits();
  16555. APInt DemandedMask = APInt::getHighBitsSet(BitWidth, 16);
  16556. if ((SimplifyDemandedBits(N->getOperand(0), DemandedMask, DCI)) ||
  16557. (SimplifyDemandedBits(N->getOperand(1), DemandedMask, DCI)))
  16558. return SDValue();
  16559. break;
  16560. }
  16561. case ARMISD::QADD8b:
  16562. case ARMISD::QSUB8b:
  16563. case ARMISD::UQADD8b:
  16564. case ARMISD::UQSUB8b: {
  16565. unsigned BitWidth = N->getValueType(0).getSizeInBits();
  16566. APInt DemandedMask = APInt::getLowBitsSet(BitWidth, 8);
  16567. if ((SimplifyDemandedBits(N->getOperand(0), DemandedMask, DCI)) ||
  16568. (SimplifyDemandedBits(N->getOperand(1), DemandedMask, DCI)))
  16569. return SDValue();
  16570. break;
  16571. }
  16572. case ISD::INTRINSIC_VOID:
  16573. case ISD::INTRINSIC_W_CHAIN:
  16574. switch (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue()) {
  16575. case Intrinsic::arm_neon_vld1:
  16576. case Intrinsic::arm_neon_vld1x2:
  16577. case Intrinsic::arm_neon_vld1x3:
  16578. case Intrinsic::arm_neon_vld1x4:
  16579. case Intrinsic::arm_neon_vld2:
  16580. case Intrinsic::arm_neon_vld3:
  16581. case Intrinsic::arm_neon_vld4:
  16582. case Intrinsic::arm_neon_vld2lane:
  16583. case Intrinsic::arm_neon_vld3lane:
  16584. case Intrinsic::arm_neon_vld4lane:
  16585. case Intrinsic::arm_neon_vld2dup:
  16586. case Intrinsic::arm_neon_vld3dup:
  16587. case Intrinsic::arm_neon_vld4dup:
  16588. case Intrinsic::arm_neon_vst1:
  16589. case Intrinsic::arm_neon_vst1x2:
  16590. case Intrinsic::arm_neon_vst1x3:
  16591. case Intrinsic::arm_neon_vst1x4:
  16592. case Intrinsic::arm_neon_vst2:
  16593. case Intrinsic::arm_neon_vst3:
  16594. case Intrinsic::arm_neon_vst4:
  16595. case Intrinsic::arm_neon_vst2lane:
  16596. case Intrinsic::arm_neon_vst3lane:
  16597. case Intrinsic::arm_neon_vst4lane:
  16598. return PerformVLDCombine(N, DCI);
  16599. case Intrinsic::arm_mve_vld2q:
  16600. case Intrinsic::arm_mve_vld4q:
  16601. case Intrinsic::arm_mve_vst2q:
  16602. case Intrinsic::arm_mve_vst4q:
  16603. return PerformMVEVLDCombine(N, DCI);
  16604. default: break;
  16605. }
  16606. break;
  16607. }
  16608. return SDValue();
  16609. }
  16610. bool ARMTargetLowering::isDesirableToTransformToIntegerOp(unsigned Opc,
  16611. EVT VT) const {
  16612. return (VT == MVT::f32) && (Opc == ISD::LOAD || Opc == ISD::STORE);
  16613. }
  16614. bool ARMTargetLowering::allowsMisalignedMemoryAccesses(EVT VT, unsigned,
  16615. Align Alignment,
  16616. MachineMemOperand::Flags,
  16617. bool *Fast) const {
// Depends on what it gets converted into if the type is weird.
  16619. if (!VT.isSimple())
  16620. return false;
  16621. // The AllowsUnaligned flag models the SCTLR.A setting in ARM cpus
  16622. bool AllowsUnaligned = Subtarget->allowsUnalignedMem();
  16623. auto Ty = VT.getSimpleVT().SimpleTy;
  16624. if (Ty == MVT::i8 || Ty == MVT::i16 || Ty == MVT::i32) {
// Unaligned access can use (for example) LDRB, LDRH, LDR
  16626. if (AllowsUnaligned) {
  16627. if (Fast)
  16628. *Fast = Subtarget->hasV7Ops();
  16629. return true;
  16630. }
  16631. }
  16632. if (Ty == MVT::f64 || Ty == MVT::v2f64) {
// For any little-endian targets with NEON, we can support unaligned ld/st
// of D and Q (e.g. {D0,D1}) registers by using vld1.i8/vst1.i8.
// A big-endian target may also explicitly support unaligned accesses.
  16636. if (Subtarget->hasNEON() && (AllowsUnaligned || Subtarget->isLittle())) {
  16637. if (Fast)
  16638. *Fast = true;
  16639. return true;
  16640. }
  16641. }
  16642. if (!Subtarget->hasMVEIntegerOps())
  16643. return false;
  16644. // These are for predicates
  16645. if ((Ty == MVT::v16i1 || Ty == MVT::v8i1 || Ty == MVT::v4i1 ||
  16646. Ty == MVT::v2i1)) {
  16647. if (Fast)
  16648. *Fast = true;
  16649. return true;
  16650. }
  16651. // These are for truncated stores/narrowing loads. They are fine so long as
  16652. // the alignment is at least the size of the item being loaded
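// For example (illustrative only): a v4i16 narrowing load or truncating
// store is accepted with 2-byte (or better) alignment, and v4i8/v8i8 with
// any alignment, but a v4i16 access with only 1-byte alignment falls
// through and is rejected below.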
  16653. if ((Ty == MVT::v4i8 || Ty == MVT::v8i8 || Ty == MVT::v4i16) &&
  16654. Alignment >= VT.getScalarSizeInBits() / 8) {
  16655. if (Fast)
  16656. *Fast = true;
  16657. return true;
  16658. }
  16659. // In little-endian MVE, the store instructions VSTRB.U8, VSTRH.U16 and
  16660. // VSTRW.U32 all store the vector register in exactly the same format, and
  16661. // differ only in the range of their immediate offset field and the required
  16662. // alignment. So there is always a store that can be used, regardless of
  16663. // actual type.
  16664. //
// For big endian, that is not the case. But we can still emit a
// (VSTRB.U8; VREV64.8) pair and get the same effect. This will likely be
// better than aligning the vector through the stack.
  16668. if (Ty == MVT::v16i8 || Ty == MVT::v8i16 || Ty == MVT::v8f16 ||
  16669. Ty == MVT::v4i32 || Ty == MVT::v4f32 || Ty == MVT::v2i64 ||
  16670. Ty == MVT::v2f64) {
  16671. if (Fast)
  16672. *Fast = true;
  16673. return true;
  16674. }
  16675. return false;
  16676. }
  16677. EVT ARMTargetLowering::getOptimalMemOpType(
  16678. const MemOp &Op, const AttributeList &FuncAttributes) const {
  16679. // See if we can use NEON instructions for this...
  16680. if ((Op.isMemcpy() || Op.isZeroMemset()) && Subtarget->hasNEON() &&
  16681. !FuncAttributes.hasFnAttr(Attribute::NoImplicitFloat)) {
  16682. bool Fast;
  16683. if (Op.size() >= 16 &&
  16684. (Op.isAligned(Align(16)) ||
  16685. (allowsMisalignedMemoryAccesses(MVT::v2f64, 0, Align(1),
  16686. MachineMemOperand::MONone, &Fast) &&
  16687. Fast))) {
  16688. return MVT::v2f64;
  16689. } else if (Op.size() >= 8 &&
  16690. (Op.isAligned(Align(8)) ||
  16691. (allowsMisalignedMemoryAccesses(
  16692. MVT::f64, 0, Align(1), MachineMemOperand::MONone, &Fast) &&
  16693. Fast))) {
  16694. return MVT::f64;
  16695. }
  16696. }
  16697. // Let the target-independent logic figure it out.
  16698. return MVT::Other;
  16699. }
  16700. // 64-bit integers are split into their high and low parts and held in two
  16701. // different registers, so the trunc is free since the low register can just
  16702. // be used.
  16703. bool ARMTargetLowering::isTruncateFree(Type *SrcTy, Type *DstTy) const {
  16704. if (!SrcTy->isIntegerTy() || !DstTy->isIntegerTy())
  16705. return false;
  16706. unsigned SrcBits = SrcTy->getPrimitiveSizeInBits();
  16707. unsigned DestBits = DstTy->getPrimitiveSizeInBits();
  16708. return (SrcBits == 64 && DestBits == 32);
  16709. }
  16710. bool ARMTargetLowering::isTruncateFree(EVT SrcVT, EVT DstVT) const {
  16711. if (SrcVT.isVector() || DstVT.isVector() || !SrcVT.isInteger() ||
  16712. !DstVT.isInteger())
  16713. return false;
  16714. unsigned SrcBits = SrcVT.getSizeInBits();
  16715. unsigned DestBits = DstVT.getSizeInBits();
  16716. return (SrcBits == 64 && DestBits == 32);
  16717. }
  16718. bool ARMTargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
  16719. if (Val.getOpcode() != ISD::LOAD)
  16720. return false;
  16721. EVT VT1 = Val.getValueType();
  16722. if (!VT1.isSimple() || !VT1.isInteger() ||
  16723. !VT2.isSimple() || !VT2.isInteger())
  16724. return false;
  16725. switch (VT1.getSimpleVT().SimpleTy) {
  16726. default: break;
  16727. case MVT::i1:
  16728. case MVT::i8:
  16729. case MVT::i16:
  16730. // 8-bit and 16-bit loads implicitly zero-extend to 32-bits.
  16731. return true;
  16732. }
  16733. return false;
  16734. }
  16735. bool ARMTargetLowering::isFNegFree(EVT VT) const {
  16736. if (!VT.isSimple())
  16737. return false;
// There are quite a few FP16 instructions (e.g. VNMLA, VNMLS, etc.) that
// can negate values directly (fneg is free). So, we don't want to let the
// DAG combiner rewrite fneg into xors and some other instructions. For f16
// and FullFP16 argument passing, some bitcast nodes may be introduced,
// triggering this DAG combine rewrite, so we avoid that here.
  16743. switch (VT.getSimpleVT().SimpleTy) {
  16744. default: break;
  16745. case MVT::f16:
  16746. return Subtarget->hasFullFP16();
  16747. }
  16748. return false;
  16749. }
  16750. /// Check if Ext1 and Ext2 are extends of the same type, doubling the bitwidth
  16751. /// of the vector elements.
  16752. static bool areExtractExts(Value *Ext1, Value *Ext2) {
  16753. auto areExtDoubled = [](Instruction *Ext) {
  16754. return Ext->getType()->getScalarSizeInBits() ==
  16755. 2 * Ext->getOperand(0)->getType()->getScalarSizeInBits();
  16756. };
  16757. if (!match(Ext1, m_ZExtOrSExt(m_Value())) ||
  16758. !match(Ext2, m_ZExtOrSExt(m_Value())) ||
  16759. !areExtDoubled(cast<Instruction>(Ext1)) ||
  16760. !areExtDoubled(cast<Instruction>(Ext2)))
  16761. return false;
  16762. return true;
  16763. }
  16764. /// Check if sinking \p I's operands to I's basic block is profitable, because
  16765. /// the operands can be folded into a target instruction, e.g.
  16766. /// sext/zext can be folded into vsubl.
  16767. bool ARMTargetLowering::shouldSinkOperands(Instruction *I,
  16768. SmallVectorImpl<Use *> &Ops) const {
  16769. if (!I->getType()->isVectorTy())
  16770. return false;
  16771. if (Subtarget->hasNEON()) {
  16772. switch (I->getOpcode()) {
  16773. case Instruction::Sub:
  16774. case Instruction::Add: {
  16775. if (!areExtractExts(I->getOperand(0), I->getOperand(1)))
  16776. return false;
  16777. Ops.push_back(&I->getOperandUse(0));
  16778. Ops.push_back(&I->getOperandUse(1));
  16779. return true;
  16780. }
  16781. default:
  16782. return false;
  16783. }
  16784. }
  16785. if (!Subtarget->hasMVEIntegerOps())
  16786. return false;
  16787. auto IsFMSMul = [&](Instruction *I) {
  16788. if (!I->hasOneUse())
  16789. return false;
  16790. auto *Sub = cast<Instruction>(*I->users().begin());
  16791. return Sub->getOpcode() == Instruction::FSub && Sub->getOperand(1) == I;
  16792. };
  16793. auto IsFMS = [&](Instruction *I) {
  16794. if (match(I->getOperand(0), m_FNeg(m_Value())) ||
  16795. match(I->getOperand(1), m_FNeg(m_Value())))
  16796. return true;
  16797. return false;
  16798. };
  16799. auto IsSinker = [&](Instruction *I, int Operand) {
  16800. switch (I->getOpcode()) {
  16801. case Instruction::Add:
  16802. case Instruction::Mul:
  16803. case Instruction::FAdd:
  16804. case Instruction::ICmp:
  16805. case Instruction::FCmp:
  16806. return true;
  16807. case Instruction::FMul:
  16808. return !IsFMSMul(I);
  16809. case Instruction::Sub:
  16810. case Instruction::FSub:
  16811. case Instruction::Shl:
  16812. case Instruction::LShr:
  16813. case Instruction::AShr:
  16814. return Operand == 1;
  16815. case Instruction::Call:
  16816. if (auto *II = dyn_cast<IntrinsicInst>(I)) {
  16817. switch (II->getIntrinsicID()) {
  16818. case Intrinsic::fma:
  16819. return !IsFMS(I);
  16820. case Intrinsic::sadd_sat:
  16821. case Intrinsic::uadd_sat:
  16822. case Intrinsic::arm_mve_add_predicated:
  16823. case Intrinsic::arm_mve_mul_predicated:
  16824. case Intrinsic::arm_mve_qadd_predicated:
  16825. case Intrinsic::arm_mve_vhadd:
  16826. case Intrinsic::arm_mve_hadd_predicated:
  16827. case Intrinsic::arm_mve_vqdmull:
  16828. case Intrinsic::arm_mve_vqdmull_predicated:
  16829. case Intrinsic::arm_mve_vqdmulh:
  16830. case Intrinsic::arm_mve_qdmulh_predicated:
  16831. case Intrinsic::arm_mve_vqrdmulh:
  16832. case Intrinsic::arm_mve_qrdmulh_predicated:
  16833. case Intrinsic::arm_mve_fma_predicated:
  16834. return true;
  16835. case Intrinsic::ssub_sat:
  16836. case Intrinsic::usub_sat:
  16837. case Intrinsic::arm_mve_sub_predicated:
  16838. case Intrinsic::arm_mve_qsub_predicated:
  16839. case Intrinsic::arm_mve_hsub_predicated:
  16840. case Intrinsic::arm_mve_vhsub:
  16841. return Operand == 1;
  16842. default:
  16843. return false;
  16844. }
  16845. }
  16846. return false;
  16847. default:
  16848. return false;
  16849. }
  16850. };
  16851. for (auto OpIdx : enumerate(I->operands())) {
  16852. Instruction *Op = dyn_cast<Instruction>(OpIdx.value().get());
  16853. // Make sure we are not already sinking this operand
  16854. if (!Op || any_of(Ops, [&](Use *U) { return U->get() == Op; }))
  16855. continue;
  16856. Instruction *Shuffle = Op;
  16857. if (Shuffle->getOpcode() == Instruction::BitCast)
  16858. Shuffle = dyn_cast<Instruction>(Shuffle->getOperand(0));
  16859. // We are looking for a splat that can be sunk.
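// Illustrative only: the IR shape being matched is a splat such as
//   %i = insertelement <4 x i32> undef, i32 %s, i32 0
//   %splat = shufflevector <4 x i32> %i, <4 x i32> undef,
//                          <4 x i32> zeroinitializer
// possibly behind a bitcast, feeding an instruction that can take the
// scalar %s directly from a GPR.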
  16860. if (!Shuffle ||
  16861. !match(Shuffle, m_Shuffle(
  16862. m_InsertElt(m_Undef(), m_Value(), m_ZeroInt()),
  16863. m_Undef(), m_ZeroMask())))
  16864. continue;
  16865. if (!IsSinker(I, OpIdx.index()))
  16866. continue;
// All uses of the shuffle should be sunk to avoid duplicating it across
// GPR and vector registers.
  16869. for (Use &U : Op->uses()) {
  16870. Instruction *Insn = cast<Instruction>(U.getUser());
  16871. if (!IsSinker(Insn, U.getOperandNo()))
  16872. return false;
  16873. }
  16874. Ops.push_back(&Shuffle->getOperandUse(0));
  16875. if (Shuffle != Op)
  16876. Ops.push_back(&Op->getOperandUse(0));
  16877. Ops.push_back(&OpIdx.value());
  16878. }
  16879. return true;
  16880. }
  16881. Type *ARMTargetLowering::shouldConvertSplatType(ShuffleVectorInst *SVI) const {
  16882. if (!Subtarget->hasMVEIntegerOps())
  16883. return nullptr;
  16884. Type *SVIType = SVI->getType();
  16885. Type *ScalarType = SVIType->getScalarType();
  16886. if (ScalarType->isFloatTy())
  16887. return Type::getInt32Ty(SVIType->getContext());
  16888. if (ScalarType->isHalfTy())
  16889. return Type::getInt16Ty(SVIType->getContext());
  16890. return nullptr;
  16891. }
  16892. bool ARMTargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const {
  16893. EVT VT = ExtVal.getValueType();
  16894. if (!isTypeLegal(VT))
  16895. return false;
  16896. if (auto *Ld = dyn_cast<MaskedLoadSDNode>(ExtVal.getOperand(0))) {
  16897. if (Ld->isExpandingLoad())
  16898. return false;
  16899. }
  16900. if (Subtarget->hasMVEIntegerOps())
  16901. return true;
  16902. // Don't create a loadext if we can fold the extension into a wide/long
  16903. // instruction.
  16904. // If there's more than one user instruction, the loadext is desirable no
  16905. // matter what. There can be two uses by the same instruction.
  16906. if (ExtVal->use_empty() ||
  16907. !ExtVal->use_begin()->isOnlyUserOf(ExtVal.getNode()))
  16908. return true;
  16909. SDNode *U = *ExtVal->use_begin();
  16910. if ((U->getOpcode() == ISD::ADD || U->getOpcode() == ISD::SUB ||
  16911. U->getOpcode() == ISD::SHL || U->getOpcode() == ARMISD::VSHLIMM))
  16912. return false;
  16913. return true;
  16914. }
  16915. bool ARMTargetLowering::allowTruncateForTailCall(Type *Ty1, Type *Ty2) const {
  16916. if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
  16917. return false;
  16918. if (!isTypeLegal(EVT::getEVT(Ty1)))
  16919. return false;
  16920. assert(Ty1->getPrimitiveSizeInBits() <= 64 && "i128 is probably not a noop");
  16921. // Assuming the caller doesn't have a zeroext or signext return parameter,
  16922. // truncation all the way down to i1 is valid.
  16923. return true;
  16924. }
  16925. InstructionCost ARMTargetLowering::getScalingFactorCost(const DataLayout &DL,
  16926. const AddrMode &AM,
  16927. Type *Ty,
  16928. unsigned AS) const {
  16929. if (isLegalAddressingMode(DL, AM, Ty, AS)) {
  16930. if (Subtarget->hasFPAO())
  16931. return AM.Scale < 0 ? 1 : 0; // positive offsets execute faster
  16932. return 0;
  16933. }
  16934. return -1;
  16935. }
  16936. /// isFMAFasterThanFMulAndFAdd - Return true if an FMA operation is faster
  16937. /// than a pair of fmul and fadd instructions. fmuladd intrinsics will be
  16938. /// expanded to FMAs when this method returns true, otherwise fmuladd is
  16939. /// expanded to fmul + fadd.
  16940. ///
  16941. /// ARM supports both fused and unfused multiply-add operations; we already
  16942. /// lower a pair of fmul and fadd to the latter so it's not clear that there
  16943. /// would be a gain or that the gain would be worthwhile enough to risk
  16944. /// correctness bugs.
  16945. ///
  16946. /// For MVE, we set this to true as it helps simplify the need for some
  16947. /// patterns (and we don't have the non-fused floating point instruction).
  16948. bool ARMTargetLowering::isFMAFasterThanFMulAndFAdd(const MachineFunction &MF,
  16949. EVT VT) const {
  16950. if (!VT.isSimple())
  16951. return false;
  16952. switch (VT.getSimpleVT().SimpleTy) {
  16953. case MVT::v4f32:
  16954. case MVT::v8f16:
  16955. return Subtarget->hasMVEFloatOps();
  16956. case MVT::f16:
  16957. return Subtarget->useFPVFMx16();
  16958. case MVT::f32:
  16959. return Subtarget->useFPVFMx();
  16960. case MVT::f64:
  16961. return Subtarget->useFPVFMx64();
  16962. default:
  16963. break;
  16964. }
  16965. return false;
  16966. }
  16967. static bool isLegalT1AddressImmediate(int64_t V, EVT VT) {
  16968. if (V < 0)
  16969. return false;
  16970. unsigned Scale = 1;
  16971. switch (VT.getSimpleVT().SimpleTy) {
  16972. case MVT::i1:
  16973. case MVT::i8:
  16974. // Scale == 1;
  16975. break;
  16976. case MVT::i16:
  16977. // Scale == 2;
  16978. Scale = 2;
  16979. break;
  16980. default:
// On Thumb1 we load most things (i32, i64, floats, etc.) with an LDR
  16982. // Scale == 4;
  16983. Scale = 4;
  16984. break;
  16985. }
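// Illustrative only: combined with the scaled 5-bit unsigned immediate
// check below, this accepts offsets 0..31 for i8, even offsets up to 62
// for i16, and multiples of 4 up to 124 for i32 and wider types.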
  16986. if ((V & (Scale - 1)) != 0)
  16987. return false;
  16988. return isUInt<5>(V / Scale);
  16989. }
  16990. static bool isLegalT2AddressImmediate(int64_t V, EVT VT,
  16991. const ARMSubtarget *Subtarget) {
  16992. if (!VT.isInteger() && !VT.isFloatingPoint())
  16993. return false;
  16994. if (VT.isVector() && Subtarget->hasNEON())
  16995. return false;
  16996. if (VT.isVector() && VT.isFloatingPoint() && Subtarget->hasMVEIntegerOps() &&
  16997. !Subtarget->hasMVEFloatOps())
  16998. return false;
  16999. bool IsNeg = false;
  17000. if (V < 0) {
  17001. IsNeg = true;
  17002. V = -V;
  17003. }
  17004. unsigned NumBytes = std::max((unsigned)VT.getSizeInBits() / 8, 1U);
  17005. // MVE: size * imm7
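// Illustrative only: the 7-bit immediate is scaled by the element size,
// so i32/f32 accepts multiples of 4 up to 508, i16/f16 multiples of 2 up
// to 254, and i8 any offset up to 127. Negative offsets were negated
// above, so the same magnitudes apply on both sides.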
  17006. if (VT.isVector() && Subtarget->hasMVEIntegerOps()) {
  17007. switch (VT.getSimpleVT().getVectorElementType().SimpleTy) {
  17008. case MVT::i32:
  17009. case MVT::f32:
  17010. return isShiftedUInt<7,2>(V);
  17011. case MVT::i16:
  17012. case MVT::f16:
  17013. return isShiftedUInt<7,1>(V);
  17014. case MVT::i8:
  17015. return isUInt<7>(V);
  17016. default:
  17017. return false;
  17018. }
  17019. }
  17020. // half VLDR: 2 * imm8
  17021. if (VT.isFloatingPoint() && NumBytes == 2 && Subtarget->hasFPRegs16())
  17022. return isShiftedUInt<8, 1>(V);
  17023. // VLDR and LDRD: 4 * imm8
  17024. if ((VT.isFloatingPoint() && Subtarget->hasVFP2Base()) || NumBytes == 8)
  17025. return isShiftedUInt<8, 2>(V);
  17026. if (NumBytes == 1 || NumBytes == 2 || NumBytes == 4) {
  17027. // + imm12 or - imm8
  17028. if (IsNeg)
  17029. return isUInt<8>(V);
  17030. return isUInt<12>(V);
  17031. }
  17032. return false;
  17033. }
  17034. /// isLegalAddressImmediate - Return true if the integer value can be used
  17035. /// as the offset of the target addressing mode for load / store of the
  17036. /// given type.
  17037. static bool isLegalAddressImmediate(int64_t V, EVT VT,
  17038. const ARMSubtarget *Subtarget) {
  17039. if (V == 0)
  17040. return true;
  17041. if (!VT.isSimple())
  17042. return false;
  17043. if (Subtarget->isThumb1Only())
  17044. return isLegalT1AddressImmediate(V, VT);
  17045. else if (Subtarget->isThumb2())
  17046. return isLegalT2AddressImmediate(V, VT, Subtarget);
  17047. // ARM mode.
  17048. if (V < 0)
  17049. V = - V;
  17050. switch (VT.getSimpleVT().SimpleTy) {
  17051. default: return false;
  17052. case MVT::i1:
  17053. case MVT::i8:
  17054. case MVT::i32:
  17055. // +- imm12
  17056. return isUInt<12>(V);
  17057. case MVT::i16:
  17058. // +- imm8
  17059. return isUInt<8>(V);
  17060. case MVT::f32:
  17061. case MVT::f64:
  17062. if (!Subtarget->hasVFP2Base()) // FIXME: NEON?
  17063. return false;
  17064. return isShiftedUInt<8, 2>(V);
  17065. }
  17066. }
  17067. bool ARMTargetLowering::isLegalT2ScaledAddressingMode(const AddrMode &AM,
  17068. EVT VT) const {
  17069. int Scale = AM.Scale;
  17070. if (Scale < 0)
  17071. return false;
  17072. switch (VT.getSimpleVT().SimpleTy) {
  17073. default: return false;
  17074. case MVT::i1:
  17075. case MVT::i8:
  17076. case MVT::i16:
  17077. case MVT::i32:
  17078. if (Scale == 1)
  17079. return true;
  17080. // r + r << imm
  17081. Scale = Scale & ~1;
  17082. return Scale == 2 || Scale == 4 || Scale == 8;
  17083. case MVT::i64:
  17084. // FIXME: What are we trying to model here? ldrd doesn't have an r + r
  17085. // version in Thumb mode.
  17086. // r + r
  17087. if (Scale == 1)
  17088. return true;
  17089. // r * 2 (this can be lowered to r + r).
  17090. if (!AM.HasBaseReg && Scale == 2)
  17091. return true;
  17092. return false;
  17093. case MVT::isVoid:
  17094. // Note, we allow "void" uses (basically, uses that aren't loads or
  17095. // stores), because arm allows folding a scale into many arithmetic
  17096. // operations. This should be made more precise and revisited later.
  17097. // Allow r << imm, but the imm has to be a multiple of two.
  17098. if (Scale & 1) return false;
  17099. return isPowerOf2_32(Scale);
  17100. }
  17101. }
  17102. bool ARMTargetLowering::isLegalT1ScaledAddressingMode(const AddrMode &AM,
  17103. EVT VT) const {
  17104. const int Scale = AM.Scale;
  17105. // Negative scales are not supported in Thumb1.
  17106. if (Scale < 0)
  17107. return false;
// Thumb1 addressing modes do not support register scaling, except in the
// following cases:
  17110. // 1. Scale == 1 means no scaling.
  17111. // 2. Scale == 2 this can be lowered to r + r if there is no base register.
  17112. return (Scale == 1) || (!AM.HasBaseReg && Scale == 2);
  17113. }
  17114. /// isLegalAddressingMode - Return true if the addressing mode represented
  17115. /// by AM is legal for this target, for a load/store of the specified type.
  17116. bool ARMTargetLowering::isLegalAddressingMode(const DataLayout &DL,
  17117. const AddrMode &AM, Type *Ty,
  17118. unsigned AS, Instruction *I) const {
  17119. EVT VT = getValueType(DL, Ty, true);
  17120. if (!isLegalAddressImmediate(AM.BaseOffs, VT, Subtarget))
  17121. return false;
  17122. // Can never fold addr of global into load/store.
  17123. if (AM.BaseGV)
  17124. return false;
  17125. switch (AM.Scale) {
  17126. case 0: // no scale reg, must be "r+i" or "r", or "i".
  17127. break;
  17128. default:
  17129. // ARM doesn't support any R+R*scale+imm addr modes.
  17130. if (AM.BaseOffs)
  17131. return false;
  17132. if (!VT.isSimple())
  17133. return false;
  17134. if (Subtarget->isThumb1Only())
  17135. return isLegalT1ScaledAddressingMode(AM, VT);
  17136. if (Subtarget->isThumb2())
  17137. return isLegalT2ScaledAddressingMode(AM, VT);
  17138. int Scale = AM.Scale;
  17139. switch (VT.getSimpleVT().SimpleTy) {
  17140. default: return false;
  17141. case MVT::i1:
  17142. case MVT::i8:
  17143. case MVT::i32:
  17144. if (Scale < 0) Scale = -Scale;
  17145. if (Scale == 1)
  17146. return true;
  17147. // r + r << imm
  17148. return isPowerOf2_32(Scale & ~1);
  17149. case MVT::i16:
  17150. case MVT::i64:
  17151. // r +/- r
  17152. if (Scale == 1 || (AM.HasBaseReg && Scale == -1))
  17153. return true;
  17154. // r * 2 (this can be lowered to r + r).
  17155. if (!AM.HasBaseReg && Scale == 2)
  17156. return true;
  17157. return false;
  17158. case MVT::isVoid:
  17159. // Note, we allow "void" uses (basically, uses that aren't loads or
  17160. // stores), because arm allows folding a scale into many arithmetic
  17161. // operations. This should be made more precise and revisited later.
  17162. // Allow r << imm, but the imm has to be a multiple of two.
  17163. if (Scale & 1) return false;
  17164. return isPowerOf2_32(Scale);
  17165. }
  17166. }
  17167. return true;
  17168. }
  17169. /// isLegalICmpImmediate - Return true if the specified immediate is legal
  17170. /// icmp immediate, that is the target has icmp instructions which can compare
  17171. /// a register against the immediate without having to materialize the
  17172. /// immediate into a register.
  17173. bool ARMTargetLowering::isLegalICmpImmediate(int64_t Imm) const {
  17174. // Thumb2 and ARM modes can use cmn for negative immediates.
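// Illustrative only: "cmp r0, #-10" has no encoding, but it is equivalent
// to "cmn r0, #10", so an immediate is treated as legal here if either it
// or its negation fits the so_imm / t2_so_imm encoding.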
  17175. if (!Subtarget->isThumb())
  17176. return ARM_AM::getSOImmVal((uint32_t)Imm) != -1 ||
  17177. ARM_AM::getSOImmVal(-(uint32_t)Imm) != -1;
  17178. if (Subtarget->isThumb2())
  17179. return ARM_AM::getT2SOImmVal((uint32_t)Imm) != -1 ||
  17180. ARM_AM::getT2SOImmVal(-(uint32_t)Imm) != -1;
// Thumb1 doesn't have cmn, and only has 8-bit immediates.
  17182. return Imm >= 0 && Imm <= 255;
  17183. }
  17184. /// isLegalAddImmediate - Return true if the specified immediate is a legal add
  17185. /// *or sub* immediate, that is the target has add or sub instructions which can
  17186. /// add a register with the immediate without having to materialize the
  17187. /// immediate into a register.
  17188. bool ARMTargetLowering::isLegalAddImmediate(int64_t Imm) const {
  17189. // Same encoding for add/sub, just flip the sign.
  17190. int64_t AbsImm = std::abs(Imm);
  17191. if (!Subtarget->isThumb())
  17192. return ARM_AM::getSOImmVal(AbsImm) != -1;
  17193. if (Subtarget->isThumb2())
  17194. return ARM_AM::getT2SOImmVal(AbsImm) != -1;
// Thumb1 only has 8-bit unsigned immediates.
  17196. return AbsImm >= 0 && AbsImm <= 255;
  17197. }
  17198. // Return false to prevent folding
  17199. // (mul (add r, c0), c1) -> (add (mul r, c1), c0*c1) in DAGCombine,
  17200. // if the folding leads to worse code.
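// Hypothetical example: with c0 == 255 (a valid add immediate) and
// c1 == 257, c0*c1 == 0xFFFF is not an add/sub immediate and, on
// subtargets without movw, takes two instructions to materialize, so we
// return false and keep the (mul (add r, 255), 257) form.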
  17201. bool ARMTargetLowering::isMulAddWithConstProfitable(
  17202. const SDValue &AddNode, const SDValue &ConstNode) const {
  17203. // Let the DAGCombiner decide for vector types and large types.
  17204. const EVT VT = AddNode.getValueType();
  17205. if (VT.isVector() || VT.getScalarSizeInBits() > 32)
  17206. return true;
  17207. // It is worse if c0 is legal add immediate, while c1*c0 is not
  17208. // and has to be composed by at least two instructions.
  17209. const ConstantSDNode *C0Node = cast<ConstantSDNode>(AddNode.getOperand(1));
  17210. const ConstantSDNode *C1Node = cast<ConstantSDNode>(ConstNode);
  17211. const int64_t C0 = C0Node->getSExtValue();
  17212. APInt CA = C0Node->getAPIntValue() * C1Node->getAPIntValue();
  17213. if (!isLegalAddImmediate(C0) || isLegalAddImmediate(CA.getSExtValue()))
  17214. return true;
  17215. if (ConstantMaterializationCost((unsigned)CA.getZExtValue(), Subtarget) > 1)
  17216. return false;
  17217. // Default to true and let the DAGCombiner decide.
  17218. return true;
  17219. }
  17220. static bool getARMIndexedAddressParts(SDNode *Ptr, EVT VT,
  17221. bool isSEXTLoad, SDValue &Base,
  17222. SDValue &Offset, bool &isInc,
  17223. SelectionDAG &DAG) {
  17224. if (Ptr->getOpcode() != ISD::ADD && Ptr->getOpcode() != ISD::SUB)
  17225. return false;
  17226. if (VT == MVT::i16 || ((VT == MVT::i8 || VT == MVT::i1) && isSEXTLoad)) {
  17227. // AddressingMode 3
  17228. Base = Ptr->getOperand(0);
  17229. if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(Ptr->getOperand(1))) {
  17230. int RHSC = (int)RHS->getZExtValue();
  17231. if (RHSC < 0 && RHSC > -256) {
  17232. assert(Ptr->getOpcode() == ISD::ADD);
  17233. isInc = false;
  17234. Offset = DAG.getConstant(-RHSC, SDLoc(Ptr), RHS->getValueType(0));
  17235. return true;
  17236. }
  17237. }
  17238. isInc = (Ptr->getOpcode() == ISD::ADD);
  17239. Offset = Ptr->getOperand(1);
  17240. return true;
  17241. } else if (VT == MVT::i32 || VT == MVT::i8 || VT == MVT::i1) {
  17242. // AddressingMode 2
  17243. if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(Ptr->getOperand(1))) {
  17244. int RHSC = (int)RHS->getZExtValue();
  17245. if (RHSC < 0 && RHSC > -0x1000) {
  17246. assert(Ptr->getOpcode() == ISD::ADD);
  17247. isInc = false;
  17248. Offset = DAG.getConstant(-RHSC, SDLoc(Ptr), RHS->getValueType(0));
  17249. Base = Ptr->getOperand(0);
  17250. return true;
  17251. }
  17252. }
  17253. if (Ptr->getOpcode() == ISD::ADD) {
  17254. isInc = true;
  17255. ARM_AM::ShiftOpc ShOpcVal=
  17256. ARM_AM::getShiftOpcForNode(Ptr->getOperand(0).getOpcode());
  17257. if (ShOpcVal != ARM_AM::no_shift) {
  17258. Base = Ptr->getOperand(1);
  17259. Offset = Ptr->getOperand(0);
  17260. } else {
  17261. Base = Ptr->getOperand(0);
  17262. Offset = Ptr->getOperand(1);
  17263. }
  17264. return true;
  17265. }
  17266. isInc = (Ptr->getOpcode() == ISD::ADD);
  17267. Base = Ptr->getOperand(0);
  17268. Offset = Ptr->getOperand(1);
  17269. return true;
  17270. }
  17271. // FIXME: Use VLDM / VSTM to emulate indexed FP load / store.
  17272. return false;
  17273. }
  17274. static bool getT2IndexedAddressParts(SDNode *Ptr, EVT VT,
  17275. bool isSEXTLoad, SDValue &Base,
  17276. SDValue &Offset, bool &isInc,
  17277. SelectionDAG &DAG) {
  17278. if (Ptr->getOpcode() != ISD::ADD && Ptr->getOpcode() != ISD::SUB)
  17279. return false;
  17280. Base = Ptr->getOperand(0);
  17281. if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(Ptr->getOperand(1))) {
  17282. int RHSC = (int)RHS->getZExtValue();
  17283. if (RHSC < 0 && RHSC > -0x100) { // 8 bits.
  17284. assert(Ptr->getOpcode() == ISD::ADD);
  17285. isInc = false;
  17286. Offset = DAG.getConstant(-RHSC, SDLoc(Ptr), RHS->getValueType(0));
  17287. return true;
  17288. } else if (RHSC > 0 && RHSC < 0x100) { // 8 bit, no zero.
  17289. isInc = Ptr->getOpcode() == ISD::ADD;
  17290. Offset = DAG.getConstant(RHSC, SDLoc(Ptr), RHS->getValueType(0));
  17291. return true;
  17292. }
  17293. }
  17294. return false;
  17295. }
  17296. static bool getMVEIndexedAddressParts(SDNode *Ptr, EVT VT, Align Alignment,
  17297. bool isSEXTLoad, bool IsMasked, bool isLE,
  17298. SDValue &Base, SDValue &Offset,
  17299. bool &isInc, SelectionDAG &DAG) {
  17300. if (Ptr->getOpcode() != ISD::ADD && Ptr->getOpcode() != ISD::SUB)
  17301. return false;
  17302. if (!isa<ConstantSDNode>(Ptr->getOperand(1)))
  17303. return false;
  17304. // We allow LE non-masked loads to change the type (for example use a vldrb.8
  17305. // as opposed to a vldrw.32). This can allow extra addressing modes or
  17306. // alignments for what is otherwise an equivalent instruction.
  17307. bool CanChangeType = isLE && !IsMasked;
  17308. ConstantSDNode *RHS = cast<ConstantSDNode>(Ptr->getOperand(1));
  17309. int RHSC = (int)RHS->getZExtValue();
  17310. auto IsInRange = [&](int RHSC, int Limit, int Scale) {
  17311. if (RHSC < 0 && RHSC > -Limit * Scale && RHSC % Scale == 0) {
  17312. assert(Ptr->getOpcode() == ISD::ADD);
  17313. isInc = false;
  17314. Offset = DAG.getConstant(-RHSC, SDLoc(Ptr), RHS->getValueType(0));
  17315. return true;
  17316. } else if (RHSC > 0 && RHSC < Limit * Scale && RHSC % Scale == 0) {
  17317. isInc = Ptr->getOpcode() == ISD::ADD;
  17318. Offset = DAG.getConstant(RHSC, SDLoc(Ptr), RHS->getValueType(0));
  17319. return true;
  17320. }
  17321. return false;
  17322. };
  17323. // Try to find a matching instruction based on s/zext, Alignment, Offset and
  17324. // (in BE/masked) type.
  17325. Base = Ptr->getOperand(0);
  17326. if (VT == MVT::v4i16) {
  17327. if (Alignment >= 2 && IsInRange(RHSC, 0x80, 2))
  17328. return true;
  17329. } else if (VT == MVT::v4i8 || VT == MVT::v8i8) {
  17330. if (IsInRange(RHSC, 0x80, 1))
  17331. return true;
  17332. } else if (Alignment >= 4 &&
  17333. (CanChangeType || VT == MVT::v4i32 || VT == MVT::v4f32) &&
  17334. IsInRange(RHSC, 0x80, 4))
  17335. return true;
  17336. else if (Alignment >= 2 &&
  17337. (CanChangeType || VT == MVT::v8i16 || VT == MVT::v8f16) &&
  17338. IsInRange(RHSC, 0x80, 2))
  17339. return true;
  17340. else if ((CanChangeType || VT == MVT::v16i8) && IsInRange(RHSC, 0x80, 1))
  17341. return true;
  17342. return false;
  17343. }
  17344. /// getPreIndexedAddressParts - returns true by value, base pointer and
  17345. /// offset pointer and addressing mode by reference if the node's address
  17346. /// can be legally represented as pre-indexed load / store address.
  17347. bool
  17348. ARMTargetLowering::getPreIndexedAddressParts(SDNode *N, SDValue &Base,
  17349. SDValue &Offset,
  17350. ISD::MemIndexedMode &AM,
  17351. SelectionDAG &DAG) const {
  17352. if (Subtarget->isThumb1Only())
  17353. return false;
  17354. EVT VT;
  17355. SDValue Ptr;
  17356. Align Alignment;
  17357. bool isSEXTLoad = false;
  17358. bool IsMasked = false;
  17359. if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
  17360. Ptr = LD->getBasePtr();
  17361. VT = LD->getMemoryVT();
  17362. Alignment = LD->getAlign();
  17363. isSEXTLoad = LD->getExtensionType() == ISD::SEXTLOAD;
  17364. } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
  17365. Ptr = ST->getBasePtr();
  17366. VT = ST->getMemoryVT();
  17367. Alignment = ST->getAlign();
  17368. } else if (MaskedLoadSDNode *LD = dyn_cast<MaskedLoadSDNode>(N)) {
  17369. Ptr = LD->getBasePtr();
  17370. VT = LD->getMemoryVT();
  17371. Alignment = LD->getAlign();
  17372. isSEXTLoad = LD->getExtensionType() == ISD::SEXTLOAD;
  17373. IsMasked = true;
  17374. } else if (MaskedStoreSDNode *ST = dyn_cast<MaskedStoreSDNode>(N)) {
  17375. Ptr = ST->getBasePtr();
  17376. VT = ST->getMemoryVT();
  17377. Alignment = ST->getAlign();
  17378. IsMasked = true;
  17379. } else
  17380. return false;
  17381. bool isInc;
  17382. bool isLegal = false;
  17383. if (VT.isVector())
  17384. isLegal = Subtarget->hasMVEIntegerOps() &&
  17385. getMVEIndexedAddressParts(
  17386. Ptr.getNode(), VT, Alignment, isSEXTLoad, IsMasked,
  17387. Subtarget->isLittle(), Base, Offset, isInc, DAG);
  17388. else {
  17389. if (Subtarget->isThumb2())
  17390. isLegal = getT2IndexedAddressParts(Ptr.getNode(), VT, isSEXTLoad, Base,
  17391. Offset, isInc, DAG);
  17392. else
  17393. isLegal = getARMIndexedAddressParts(Ptr.getNode(), VT, isSEXTLoad, Base,
  17394. Offset, isInc, DAG);
  17395. }
  17396. if (!isLegal)
  17397. return false;
  17398. AM = isInc ? ISD::PRE_INC : ISD::PRE_DEC;
  17399. return true;
  17400. }
/// getPostIndexedAddressParts - Returns true, and sets the base pointer,
/// offset and addressing mode by reference, if this node can be combined
/// with a load / store to form a post-indexed load / store.
  17404. bool ARMTargetLowering::getPostIndexedAddressParts(SDNode *N, SDNode *Op,
  17405. SDValue &Base,
  17406. SDValue &Offset,
  17407. ISD::MemIndexedMode &AM,
  17408. SelectionDAG &DAG) const {
  17409. EVT VT;
  17410. SDValue Ptr;
  17411. Align Alignment;
  17412. bool isSEXTLoad = false, isNonExt;
  17413. bool IsMasked = false;
  17414. if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
  17415. VT = LD->getMemoryVT();
  17416. Ptr = LD->getBasePtr();
  17417. Alignment = LD->getAlign();
  17418. isSEXTLoad = LD->getExtensionType() == ISD::SEXTLOAD;
  17419. isNonExt = LD->getExtensionType() == ISD::NON_EXTLOAD;
  17420. } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
  17421. VT = ST->getMemoryVT();
  17422. Ptr = ST->getBasePtr();
  17423. Alignment = ST->getAlign();
  17424. isNonExt = !ST->isTruncatingStore();
  17425. } else if (MaskedLoadSDNode *LD = dyn_cast<MaskedLoadSDNode>(N)) {
  17426. VT = LD->getMemoryVT();
  17427. Ptr = LD->getBasePtr();
  17428. Alignment = LD->getAlign();
  17429. isSEXTLoad = LD->getExtensionType() == ISD::SEXTLOAD;
  17430. isNonExt = LD->getExtensionType() == ISD::NON_EXTLOAD;
  17431. IsMasked = true;
  17432. } else if (MaskedStoreSDNode *ST = dyn_cast<MaskedStoreSDNode>(N)) {
  17433. VT = ST->getMemoryVT();
  17434. Ptr = ST->getBasePtr();
  17435. Alignment = ST->getAlign();
  17436. isNonExt = !ST->isTruncatingStore();
  17437. IsMasked = true;
  17438. } else
  17439. return false;
  17440. if (Subtarget->isThumb1Only()) {
  17441. // Thumb-1 can do a limited post-inc load or store as an updating LDM. It
  17442. // must be non-extending/truncating, i32, with an offset of 4.
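// For example (illustrative), such an access can be selected as
// "ldm r0!, {r1}", which loads one word and post-increments r0 by 4.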
  17443. assert(Op->getValueType(0) == MVT::i32 && "Non-i32 post-inc op?!");
  17444. if (Op->getOpcode() != ISD::ADD || !isNonExt)
  17445. return false;
  17446. auto *RHS = dyn_cast<ConstantSDNode>(Op->getOperand(1));
  17447. if (!RHS || RHS->getZExtValue() != 4)
  17448. return false;
  17449. if (Alignment < Align(4))
  17450. return false;
  17451. Offset = Op->getOperand(1);
  17452. Base = Op->getOperand(0);
  17453. AM = ISD::POST_INC;
  17454. return true;
  17455. }
  17456. bool isInc;
  17457. bool isLegal = false;
  17458. if (VT.isVector())
  17459. isLegal = Subtarget->hasMVEIntegerOps() &&
  17460. getMVEIndexedAddressParts(Op, VT, Alignment, isSEXTLoad, IsMasked,
  17461. Subtarget->isLittle(), Base, Offset,
  17462. isInc, DAG);
  17463. else {
  17464. if (Subtarget->isThumb2())
  17465. isLegal = getT2IndexedAddressParts(Op, VT, isSEXTLoad, Base, Offset,
  17466. isInc, DAG);
  17467. else
  17468. isLegal = getARMIndexedAddressParts(Op, VT, isSEXTLoad, Base, Offset,
  17469. isInc, DAG);
  17470. }
  17471. if (!isLegal)
  17472. return false;
  17473. if (Ptr != Base) {
  17474. // Swap base ptr and offset to catch more post-index load / store when
  17475. // it's legal. In Thumb2 mode, offset must be an immediate.
  17476. if (Ptr == Offset && Op->getOpcode() == ISD::ADD &&
  17477. !Subtarget->isThumb2())
  17478. std::swap(Base, Offset);
  17479. // Post-indexed load / store update the base pointer.
  17480. if (Ptr != Base)
  17481. return false;
  17482. }
  17483. AM = isInc ? ISD::POST_INC : ISD::POST_DEC;
  17484. return true;
  17485. }
  17486. void ARMTargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
  17487. KnownBits &Known,
  17488. const APInt &DemandedElts,
  17489. const SelectionDAG &DAG,
  17490. unsigned Depth) const {
  17491. unsigned BitWidth = Known.getBitWidth();
  17492. Known.resetAll();
  17493. switch (Op.getOpcode()) {
  17494. default: break;
  17495. case ARMISD::ADDC:
  17496. case ARMISD::ADDE:
  17497. case ARMISD::SUBC:
  17498. case ARMISD::SUBE:
  17499. // Special cases when we convert a carry to a boolean.
  17500. if (Op.getResNo() == 0) {
  17501. SDValue LHS = Op.getOperand(0);
  17502. SDValue RHS = Op.getOperand(1);
  17503. // (ADDE 0, 0, C) will give us a single bit.
  17504. if (Op->getOpcode() == ARMISD::ADDE && isNullConstant(LHS) &&
  17505. isNullConstant(RHS)) {
  17506. Known.Zero |= APInt::getHighBitsSet(BitWidth, BitWidth - 1);
  17507. return;
  17508. }
  17509. }
  17510. break;
  17511. case ARMISD::CMOV: {
  17512. // Bits are known zero/one if known on the LHS and RHS.
  17513. Known = DAG.computeKnownBits(Op.getOperand(0), Depth+1);
  17514. if (Known.isUnknown())
  17515. return;
  17516. KnownBits KnownRHS = DAG.computeKnownBits(Op.getOperand(1), Depth+1);
  17517. Known = KnownBits::commonBits(Known, KnownRHS);
  17518. return;
  17519. }
  17520. case ISD::INTRINSIC_W_CHAIN: {
  17521. ConstantSDNode *CN = cast<ConstantSDNode>(Op->getOperand(1));
  17522. Intrinsic::ID IntID = static_cast<Intrinsic::ID>(CN->getZExtValue());
  17523. switch (IntID) {
  17524. default: return;
  17525. case Intrinsic::arm_ldaex:
  17526. case Intrinsic::arm_ldrex: {
  17527. EVT VT = cast<MemIntrinsicSDNode>(Op)->getMemoryVT();
  17528. unsigned MemBits = VT.getScalarSizeInBits();
  17529. Known.Zero |= APInt::getHighBitsSet(BitWidth, BitWidth - MemBits);
  17530. return;
  17531. }
  17532. }
  17533. }
  17534. case ARMISD::BFI: {
  17535. // Conservatively, we can recurse down the first operand
  17536. // and just mask out all affected bits.
  17537. Known = DAG.computeKnownBits(Op.getOperand(0), Depth + 1);
  17538. // The operand to BFI is already a mask suitable for removing the bits it
  17539. // sets.
  17540. ConstantSDNode *CI = cast<ConstantSDNode>(Op.getOperand(2));
  17541. const APInt &Mask = CI->getAPIntValue();
  17542. Known.Zero &= Mask;
  17543. Known.One &= Mask;
  17544. return;
  17545. }
  17546. case ARMISD::VGETLANEs:
  17547. case ARMISD::VGETLANEu: {
  17548. const SDValue &SrcSV = Op.getOperand(0);
  17549. EVT VecVT = SrcSV.getValueType();
  17550. assert(VecVT.isVector() && "VGETLANE expected a vector type");
  17551. const unsigned NumSrcElts = VecVT.getVectorNumElements();
  17552. ConstantSDNode *Pos = cast<ConstantSDNode>(Op.getOperand(1).getNode());
  17553. assert(Pos->getAPIntValue().ult(NumSrcElts) &&
  17554. "VGETLANE index out of bounds");
  17555. unsigned Idx = Pos->getZExtValue();
  17556. APInt DemandedElt = APInt::getOneBitSet(NumSrcElts, Idx);
  17557. Known = DAG.computeKnownBits(SrcSV, DemandedElt, Depth + 1);
  17558. EVT VT = Op.getValueType();
  17559. const unsigned DstSz = VT.getScalarSizeInBits();
  17560. const unsigned SrcSz = VecVT.getVectorElementType().getSizeInBits();
  17561. (void)SrcSz;
  17562. assert(SrcSz == Known.getBitWidth());
  17563. assert(DstSz > SrcSz);
  17564. if (Op.getOpcode() == ARMISD::VGETLANEs)
  17565. Known = Known.sext(DstSz);
  17566. else {
  17567. Known = Known.zext(DstSz);
  17568. }
  17569. assert(DstSz == Known.getBitWidth());
  17570. break;
  17571. }
  17572. case ARMISD::VMOVrh: {
  17573. KnownBits KnownOp = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
  17574. assert(KnownOp.getBitWidth() == 16);
  17575. Known = KnownOp.zext(32);
  17576. break;
  17577. }
  17578. case ARMISD::CSINC:
  17579. case ARMISD::CSINV:
  17580. case ARMISD::CSNEG: {
  17581. KnownBits KnownOp0 = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
  17582. KnownBits KnownOp1 = DAG.computeKnownBits(Op->getOperand(1), Depth + 1);
  17583. // The result is either:
  17584. // CSINC: KnownOp0 or KnownOp1 + 1
  17585. // CSINV: KnownOp0 or ~KnownOp1
  17586. // CSNEG: KnownOp0 or KnownOp1 * -1
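// For example (illustrative): a CSINC with constant 0 in both operands can
// only produce 0 or 1, so all bits except the low bit become known zero.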
  17587. if (Op.getOpcode() == ARMISD::CSINC)
  17588. KnownOp1 = KnownBits::computeForAddSub(
  17589. true, false, KnownOp1, KnownBits::makeConstant(APInt(32, 1)));
  17590. else if (Op.getOpcode() == ARMISD::CSINV)
  17591. std::swap(KnownOp1.Zero, KnownOp1.One);
  17592. else if (Op.getOpcode() == ARMISD::CSNEG)
  17593. KnownOp1 = KnownBits::mul(
  17594. KnownOp1, KnownBits::makeConstant(APInt(32, -1)));
  17595. Known = KnownBits::commonBits(KnownOp0, KnownOp1);
  17596. break;
  17597. }
  17598. }
  17599. }
  17600. bool ARMTargetLowering::targetShrinkDemandedConstant(
  17601. SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts,
  17602. TargetLoweringOpt &TLO) const {
  17603. // Delay optimization, so we don't have to deal with illegal types, or block
  17604. // optimizations.
  17605. if (!TLO.LegalOps)
  17606. return false;
  17607. // Only optimize AND for now.
  17608. if (Op.getOpcode() != ISD::AND)
  17609. return false;
  17610. EVT VT = Op.getValueType();
  17611. // Ignore vectors.
  17612. if (VT.isVector())
  17613. return false;
  17614. assert(VT == MVT::i32 && "Unexpected integer type");
  17615. // Make sure the RHS really is a constant.
  17616. ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1));
  17617. if (!C)
  17618. return false;
  17619. unsigned Mask = C->getZExtValue();
  17620. unsigned Demanded = DemandedBits.getZExtValue();
  17621. unsigned ShrunkMask = Mask & Demanded;
  17622. unsigned ExpandedMask = Mask | ~Demanded;
  17623. // If the mask is all zeros, let the target-independent code replace the
  17624. // result with zero.
  17625. if (ShrunkMask == 0)
  17626. return false;
  17627. // If the mask is all ones, erase the AND. (Currently, the target-independent
  17628. // code won't do this, so we have to do it explicitly to avoid an infinite
  17629. // loop in obscure cases.)
  17630. if (ExpandedMask == ~0U)
  17631. return TLO.CombineTo(Op, Op.getOperand(0));
  17632. auto IsLegalMask = [ShrunkMask, ExpandedMask](unsigned Mask) -> bool {
  17633. return (ShrunkMask & Mask) == ShrunkMask && (~ExpandedMask & Mask) == 0;
  17634. };
  17635. auto UseMask = [Mask, Op, VT, &TLO](unsigned NewMask) -> bool {
  17636. if (NewMask == Mask)
  17637. return true;
  17638. SDLoc DL(Op);
  17639. SDValue NewC = TLO.DAG.getConstant(NewMask, DL, VT);
  17640. SDValue NewOp = TLO.DAG.getNode(ISD::AND, DL, VT, Op.getOperand(0), NewC);
  17641. return TLO.CombineTo(Op, NewOp);
  17642. };
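// Worked example (illustrative): for "and r0, r0, #0x1FF" where only the low
// 8 bits are demanded, ShrunkMask is 0xFF, so the AND is narrowed below to
// the uxtb-friendly mask 0xFF.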
  17643. // Prefer uxtb mask.
  17644. if (IsLegalMask(0xFF))
  17645. return UseMask(0xFF);
  17646. // Prefer uxth mask.
  17647. if (IsLegalMask(0xFFFF))
  17648. return UseMask(0xFFFF);
  17649. // [1, 255] is Thumb1 movs+ands, legal immediate for ARM/Thumb2.
  17650. // FIXME: Prefer a contiguous sequence of bits for other optimizations.
  17651. if (ShrunkMask < 256)
  17652. return UseMask(ShrunkMask);
  17653. // [-256, -2] is Thumb1 movs+bics, legal immediate for ARM/Thumb2.
  17654. // FIXME: Prefer a contiguous sequence of bits for other optimizations.
  17655. if ((int)ExpandedMask <= -2 && (int)ExpandedMask >= -256)
  17656. return UseMask(ExpandedMask);
  17657. // Potential improvements:
  17658. //
  17659. // We could try to recognize lsls+lsrs or lsrs+lsls pairs here.
  17660. // We could try to prefer Thumb1 immediates which can be lowered to a
  17661. // two-instruction sequence.
  17662. // We could try to recognize more legal ARM/Thumb2 immediates here.
  17663. return false;
  17664. }
  17665. bool ARMTargetLowering::SimplifyDemandedBitsForTargetNode(
  17666. SDValue Op, const APInt &OriginalDemandedBits,
  17667. const APInt &OriginalDemandedElts, KnownBits &Known, TargetLoweringOpt &TLO,
  17668. unsigned Depth) const {
  17669. unsigned Opc = Op.getOpcode();
  17670. switch (Opc) {
  17671. case ARMISD::ASRL:
  17672. case ARMISD::LSRL: {
  17673. // If this is result 0 and the other result is unused, see if the demand
  17674. // bits allow us to shrink this long shift into a standard small shift in
  17675. // the opposite direction.
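// Example (illustrative): for a 64-bit LSRL by 8 where only the top 8 bits
// of the low result are demanded, those bits equal the low 8 bits of the
// original high half (operand 1), so the node can be replaced by
// "high << 24".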
  17676. if (Op.getResNo() == 0 && !Op->hasAnyUseOfValue(1) &&
  17677. isa<ConstantSDNode>(Op->getOperand(2))) {
  17678. unsigned ShAmt = Op->getConstantOperandVal(2);
  17679. if (ShAmt < 32 && OriginalDemandedBits.isSubsetOf(APInt::getAllOnes(32)
  17680. << (32 - ShAmt)))
  17681. return TLO.CombineTo(
  17682. Op, TLO.DAG.getNode(
  17683. ISD::SHL, SDLoc(Op), MVT::i32, Op.getOperand(1),
  17684. TLO.DAG.getConstant(32 - ShAmt, SDLoc(Op), MVT::i32)));
  17685. }
  17686. break;
  17687. }
  17688. }
  17689. return TargetLowering::SimplifyDemandedBitsForTargetNode(
  17690. Op, OriginalDemandedBits, OriginalDemandedElts, Known, TLO, Depth);
  17691. }
  17692. //===----------------------------------------------------------------------===//
  17693. // ARM Inline Assembly Support
  17694. //===----------------------------------------------------------------------===//
  17695. bool ARMTargetLowering::ExpandInlineAsm(CallInst *CI) const {
  17696. // Looking for "rev" which is V6+.
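// For example (illustrative): IR inline asm such as
//   call i32 asm "rev $0, $1", "=l,l"(i32 %x)
// is rewritten below into a call to the llvm.bswap intrinsic.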
  17697. if (!Subtarget->hasV6Ops())
  17698. return false;
  17699. InlineAsm *IA = cast<InlineAsm>(CI->getCalledOperand());
  17700. std::string AsmStr = IA->getAsmString();
  17701. SmallVector<StringRef, 4> AsmPieces;
  17702. SplitString(AsmStr, AsmPieces, ";\n");
  17703. switch (AsmPieces.size()) {
  17704. default: return false;
  17705. case 1:
  17706. AsmStr = std::string(AsmPieces[0]);
  17707. AsmPieces.clear();
  17708. SplitString(AsmStr, AsmPieces, " \t,");
  17709. // rev $0, $1
  17710. if (AsmPieces.size() == 3 &&
  17711. AsmPieces[0] == "rev" && AsmPieces[1] == "$0" && AsmPieces[2] == "$1" &&
  17712. IA->getConstraintString().compare(0, 4, "=l,l") == 0) {
  17713. IntegerType *Ty = dyn_cast<IntegerType>(CI->getType());
  17714. if (Ty && Ty->getBitWidth() == 32)
  17715. return IntrinsicLowering::LowerToByteSwap(CI);
  17716. }
  17717. break;
  17718. }
  17719. return false;
  17720. }
  17721. const char *ARMTargetLowering::LowerXConstraint(EVT ConstraintVT) const {
  17722. // At this point, we have to lower this constraint to something else, so we
  17723. // lower it to an "r" or "w". However, by doing this we will force the result
  17724. // to be in register, while the X constraint is much more permissive.
  17725. //
  17726. // Although we are correct (we are free to emit anything, without
  17727. // constraints), we might break use cases that would expect us to be more
  17728. // efficient and emit something else.
  17729. if (!Subtarget->hasVFP2Base())
  17730. return "r";
  17731. if (ConstraintVT.isFloatingPoint())
  17732. return "w";
  17733. if (ConstraintVT.isVector() && Subtarget->hasNEON() &&
  17734. (ConstraintVT.getSizeInBits() == 64 ||
  17735. ConstraintVT.getSizeInBits() == 128))
  17736. return "w";
  17737. return "r";
  17738. }
  17739. /// getConstraintType - Given a constraint letter, return the type of
  17740. /// constraint it is for this target.
  17741. ARMTargetLowering::ConstraintType
  17742. ARMTargetLowering::getConstraintType(StringRef Constraint) const {
  17743. unsigned S = Constraint.size();
  17744. if (S == 1) {
  17745. switch (Constraint[0]) {
  17746. default: break;
  17747. case 'l': return C_RegisterClass;
  17748. case 'w': return C_RegisterClass;
  17749. case 'h': return C_RegisterClass;
  17750. case 'x': return C_RegisterClass;
  17751. case 't': return C_RegisterClass;
  17752. case 'j': return C_Immediate; // Constant for movw.
  17753. // An address with a single base register. Due to the way we
  17754. // currently handle addresses it is the same as an 'r' memory constraint.
  17755. case 'Q': return C_Memory;
  17756. }
  17757. } else if (S == 2) {
  17758. switch (Constraint[0]) {
  17759. default: break;
  17760. case 'T': return C_RegisterClass;
  17761. // All 'U+' constraints are addresses.
  17762. case 'U': return C_Memory;
  17763. }
  17764. }
  17765. return TargetLowering::getConstraintType(Constraint);
  17766. }
  17767. /// Examine constraint type and operand type and determine a weight value.
  17768. /// This object must already have been set up with the operand type
  17769. /// and the current alternative constraint selected.
  17770. TargetLowering::ConstraintWeight
  17771. ARMTargetLowering::getSingleConstraintMatchWeight(
  17772. AsmOperandInfo &info, const char *constraint) const {
  17773. ConstraintWeight weight = CW_Invalid;
  17774. Value *CallOperandVal = info.CallOperandVal;
  17775. // If we don't have a value, we can't do a match,
  17776. // but allow it at the lowest weight.
  17777. if (!CallOperandVal)
  17778. return CW_Default;
  17779. Type *type = CallOperandVal->getType();
  17780. // Look at the constraint type.
  17781. switch (*constraint) {
  17782. default:
  17783. weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint);
  17784. break;
  17785. case 'l':
  17786. if (type->isIntegerTy()) {
  17787. if (Subtarget->isThumb())
  17788. weight = CW_SpecificReg;
  17789. else
  17790. weight = CW_Register;
  17791. }
  17792. break;
  17793. case 'w':
  17794. if (type->isFloatingPointTy())
  17795. weight = CW_Register;
  17796. break;
  17797. }
  17798. return weight;
  17799. }
  17800. using RCPair = std::pair<unsigned, const TargetRegisterClass *>;
  17801. RCPair ARMTargetLowering::getRegForInlineAsmConstraint(
  17802. const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const {
  17803. switch (Constraint.size()) {
  17804. case 1:
  17805. // GCC ARM Constraint Letters
  17806. switch (Constraint[0]) {
  17807. case 'l': // Low regs or general regs.
  17808. if (Subtarget->isThumb())
  17809. return RCPair(0U, &ARM::tGPRRegClass);
  17810. return RCPair(0U, &ARM::GPRRegClass);
  17811. case 'h': // High regs or no regs.
  17812. if (Subtarget->isThumb())
  17813. return RCPair(0U, &ARM::hGPRRegClass);
  17814. break;
  17815. case 'r':
  17816. if (Subtarget->isThumb1Only())
  17817. return RCPair(0U, &ARM::tGPRRegClass);
  17818. return RCPair(0U, &ARM::GPRRegClass);
  17819. case 'w':
  17820. if (VT == MVT::Other)
  17821. break;
  17822. if (VT == MVT::f32)
  17823. return RCPair(0U, &ARM::SPRRegClass);
  17824. if (VT.getSizeInBits() == 64)
  17825. return RCPair(0U, &ARM::DPRRegClass);
  17826. if (VT.getSizeInBits() == 128)
  17827. return RCPair(0U, &ARM::QPRRegClass);
  17828. break;
  17829. case 'x':
  17830. if (VT == MVT::Other)
  17831. break;
  17832. if (VT == MVT::f32)
  17833. return RCPair(0U, &ARM::SPR_8RegClass);
  17834. if (VT.getSizeInBits() == 64)
  17835. return RCPair(0U, &ARM::DPR_8RegClass);
  17836. if (VT.getSizeInBits() == 128)
  17837. return RCPair(0U, &ARM::QPR_8RegClass);
  17838. break;
  17839. case 't':
  17840. if (VT == MVT::Other)
  17841. break;
  17842. if (VT == MVT::f32 || VT == MVT::i32)
  17843. return RCPair(0U, &ARM::SPRRegClass);
  17844. if (VT.getSizeInBits() == 64)
  17845. return RCPair(0U, &ARM::DPR_VFP2RegClass);
  17846. if (VT.getSizeInBits() == 128)
  17847. return RCPair(0U, &ARM::QPR_VFP2RegClass);
  17848. break;
  17849. }
  17850. break;
  17851. case 2:
  17852. if (Constraint[0] == 'T') {
  17853. switch (Constraint[1]) {
  17854. default:
  17855. break;
  17856. case 'e':
  17857. return RCPair(0U, &ARM::tGPREvenRegClass);
  17858. case 'o':
  17859. return RCPair(0U, &ARM::tGPROddRegClass);
  17860. }
  17861. }
  17862. break;
  17863. default:
  17864. break;
  17865. }
  17866. if (StringRef("{cc}").equals_insensitive(Constraint))
  17867. return std::make_pair(unsigned(ARM::CPSR), &ARM::CCRRegClass);
  17868. return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
  17869. }
  17870. /// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
  17871. /// vector. If it is invalid, don't add anything to Ops.
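/// For example (illustrative): on Thumb1 the 'I' constraint below accepts an
/// ADD-style immediate such as 200, but rejects 300 because it does not fit
/// in 8 bits.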
  17872. void ARMTargetLowering::LowerAsmOperandForConstraint(SDValue Op,
  17873. std::string &Constraint,
  17874. std::vector<SDValue>&Ops,
  17875. SelectionDAG &DAG) const {
  17876. SDValue Result;
  17877. // Currently only support length 1 constraints.
  17878. if (Constraint.length() != 1) return;
  17879. char ConstraintLetter = Constraint[0];
  17880. switch (ConstraintLetter) {
  17881. default: break;
  17882. case 'j':
  17883. case 'I': case 'J': case 'K': case 'L':
  17884. case 'M': case 'N': case 'O':
  17885. ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op);
  17886. if (!C)
  17887. return;
  17888. int64_t CVal64 = C->getSExtValue();
  17889. int CVal = (int) CVal64;
  17890. // None of these constraints allow values larger than 32 bits. Check
  17891. // that the value fits in an int.
  17892. if (CVal != CVal64)
  17893. return;
  17894. switch (ConstraintLetter) {
  17895. case 'j':
  17896. // Constant suitable for movw, must be between 0 and
  17897. // 65535.
  17898. if (Subtarget->hasV6T2Ops() || (Subtarget->hasV8MBaselineOps()))
  17899. if (CVal >= 0 && CVal <= 65535)
  17900. break;
  17901. return;
  17902. case 'I':
  17903. if (Subtarget->isThumb1Only()) {
  17904. // This must be a constant between 0 and 255, for ADD
  17905. // immediates.
  17906. if (CVal >= 0 && CVal <= 255)
  17907. break;
  17908. } else if (Subtarget->isThumb2()) {
  17909. // A constant that can be used as an immediate value in a
  17910. // data-processing instruction.
  17911. if (ARM_AM::getT2SOImmVal(CVal) != -1)
  17912. break;
  17913. } else {
  17914. // A constant that can be used as an immediate value in a
  17915. // data-processing instruction.
  17916. if (ARM_AM::getSOImmVal(CVal) != -1)
  17917. break;
  17918. }
  17919. return;
  17920. case 'J':
  17921. if (Subtarget->isThumb1Only()) {
  17922. // This must be a constant between -255 and -1, for negated ADD
  17923. // immediates. This can be used in GCC with an "n" modifier that
  17924. // prints the negated value, for use with SUB instructions. It is
  17925. // not useful otherwise but is implemented for compatibility.
  17926. if (CVal >= -255 && CVal <= -1)
  17927. break;
  17928. } else {
  17929. // This must be a constant between -4095 and 4095. It is not clear
  17930. // what this constraint is intended for. Implemented for
  17931. // compatibility with GCC.
  17932. if (CVal >= -4095 && CVal <= 4095)
  17933. break;
  17934. }
  17935. return;
  17936. case 'K':
  17937. if (Subtarget->isThumb1Only()) {
  17938. // A 32-bit value where only one byte has a nonzero value. Exclude
  17939. // zero to match GCC. This constraint is used by GCC internally for
  17940. // constants that can be loaded with a move/shift combination.
  17941. // It is not useful otherwise but is implemented for compatibility.
  17942. if (CVal != 0 && ARM_AM::isThumbImmShiftedVal(CVal))
  17943. break;
  17944. } else if (Subtarget->isThumb2()) {
  17945. // A constant whose bitwise inverse can be used as an immediate
  17946. // value in a data-processing instruction. This can be used in GCC
  17947. // with a "B" modifier that prints the inverted value, for use with
  17948. // BIC and MVN instructions. It is not useful otherwise but is
  17949. // implemented for compatibility.
  17950. if (ARM_AM::getT2SOImmVal(~CVal) != -1)
  17951. break;
  17952. } else {
  17953. // A constant whose bitwise inverse can be used as an immediate
  17954. // value in a data-processing instruction. This can be used in GCC
  17955. // with a "B" modifier that prints the inverted value, for use with
  17956. // BIC and MVN instructions. It is not useful otherwise but is
  17957. // implemented for compatibility.
  17958. if (ARM_AM::getSOImmVal(~CVal) != -1)
  17959. break;
  17960. }
  17961. return;
  17962. case 'L':
  17963. if (Subtarget->isThumb1Only()) {
  17964. // This must be a constant between -7 and 7,
  17965. // for 3-operand ADD/SUB immediate instructions.
  17966. if (CVal >= -7 && CVal < 7)
  17967. break;
  17968. } else if (Subtarget->isThumb2()) {
  17969. // A constant whose negation can be used as an immediate value in a
  17970. // data-processing instruction. This can be used in GCC with an "n"
  17971. // modifier that prints the negated value, for use with SUB
  17972. // instructions. It is not useful otherwise but is implemented for
  17973. // compatibility.
  17974. if (ARM_AM::getT2SOImmVal(-CVal) != -1)
  17975. break;
  17976. } else {
  17977. // A constant whose negation can be used as an immediate value in a
  17978. // data-processing instruction. This can be used in GCC with an "n"
  17979. // modifier that prints the negated value, for use with SUB
  17980. // instructions. It is not useful otherwise but is implemented for
  17981. // compatibility.
  17982. if (ARM_AM::getSOImmVal(-CVal) != -1)
  17983. break;
  17984. }
  17985. return;
  17986. case 'M':
  17987. if (Subtarget->isThumb1Only()) {
  17988. // This must be a multiple of 4 between 0 and 1020, for
  17989. // ADD sp + immediate.
  17990. if ((CVal >= 0 && CVal <= 1020) && ((CVal & 3) == 0))
  17991. break;
  17992. } else {
  17993. // A power of two or a constant between 0 and 32. This is used in
  17994. // GCC for the shift amount on shifted register operands, but it is
  17995. // useful in general for any shift amounts.
  17996. if ((CVal >= 0 && CVal <= 32) || ((CVal & (CVal - 1)) == 0))
  17997. break;
  17998. }
  17999. return;
  18000. case 'N':
  18001. if (Subtarget->isThumb1Only()) {
  18002. // This must be a constant between 0 and 31, for shift amounts.
  18003. if (CVal >= 0 && CVal <= 31)
  18004. break;
  18005. }
  18006. return;
  18007. case 'O':
  18008. if (Subtarget->isThumb1Only()) {
  18009. // This must be a multiple of 4 between -508 and 508, for
  18010. // ADD/SUB sp = sp + immediate.
  18011. if ((CVal >= -508 && CVal <= 508) && ((CVal & 3) == 0))
  18012. break;
  18013. }
  18014. return;
  18015. }
  18016. Result = DAG.getTargetConstant(CVal, SDLoc(Op), Op.getValueType());
  18017. break;
  18018. }
  18019. if (Result.getNode()) {
  18020. Ops.push_back(Result);
  18021. return;
  18022. }
  18023. return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
  18024. }
  18025. static RTLIB::Libcall getDivRemLibcall(
  18026. const SDNode *N, MVT::SimpleValueType SVT) {
  18027. assert((N->getOpcode() == ISD::SDIVREM || N->getOpcode() == ISD::UDIVREM ||
  18028. N->getOpcode() == ISD::SREM || N->getOpcode() == ISD::UREM) &&
  18029. "Unhandled Opcode in getDivRemLibcall");
  18030. bool isSigned = N->getOpcode() == ISD::SDIVREM ||
  18031. N->getOpcode() == ISD::SREM;
  18032. RTLIB::Libcall LC;
  18033. switch (SVT) {
  18034. default: llvm_unreachable("Unexpected request for libcall!");
  18035. case MVT::i8: LC = isSigned ? RTLIB::SDIVREM_I8 : RTLIB::UDIVREM_I8; break;
  18036. case MVT::i16: LC = isSigned ? RTLIB::SDIVREM_I16 : RTLIB::UDIVREM_I16; break;
  18037. case MVT::i32: LC = isSigned ? RTLIB::SDIVREM_I32 : RTLIB::UDIVREM_I32; break;
  18038. case MVT::i64: LC = isSigned ? RTLIB::SDIVREM_I64 : RTLIB::UDIVREM_I64; break;
  18039. }
  18040. return LC;
  18041. }
  18042. static TargetLowering::ArgListTy getDivRemArgList(
  18043. const SDNode *N, LLVMContext *Context, const ARMSubtarget *Subtarget) {
  18044. assert((N->getOpcode() == ISD::SDIVREM || N->getOpcode() == ISD::UDIVREM ||
  18045. N->getOpcode() == ISD::SREM || N->getOpcode() == ISD::UREM) &&
  18046. "Unhandled Opcode in getDivRemArgList");
  18047. bool isSigned = N->getOpcode() == ISD::SDIVREM ||
  18048. N->getOpcode() == ISD::SREM;
  18049. TargetLowering::ArgListTy Args;
  18050. TargetLowering::ArgListEntry Entry;
  18051. for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) {
  18052. EVT ArgVT = N->getOperand(i).getValueType();
  18053. Type *ArgTy = ArgVT.getTypeForEVT(*Context);
  18054. Entry.Node = N->getOperand(i);
  18055. Entry.Ty = ArgTy;
  18056. Entry.IsSExt = isSigned;
  18057. Entry.IsZExt = !isSigned;
  18058. Args.push_back(Entry);
  18059. }
  18060. if (Subtarget->isTargetWindows() && Args.size() >= 2)
  18061. std::swap(Args[0], Args[1]);
  18062. return Args;
  18063. }
  18064. SDValue ARMTargetLowering::LowerDivRem(SDValue Op, SelectionDAG &DAG) const {
  18065. assert((Subtarget->isTargetAEABI() || Subtarget->isTargetAndroid() ||
  18066. Subtarget->isTargetGNUAEABI() || Subtarget->isTargetMuslAEABI() ||
  18067. Subtarget->isTargetWindows()) &&
  18068. "Register-based DivRem lowering only");
  18069. unsigned Opcode = Op->getOpcode();
  18070. assert((Opcode == ISD::SDIVREM || Opcode == ISD::UDIVREM) &&
  18071. "Invalid opcode for Div/Rem lowering");
  18072. bool isSigned = (Opcode == ISD::SDIVREM);
  18073. EVT VT = Op->getValueType(0);
  18074. Type *Ty = VT.getTypeForEVT(*DAG.getContext());
  18075. SDLoc dl(Op);
  18076. // If the target has hardware divide, use divide + multiply + subtract:
  18077. // div = a / b
  18078. // rem = a - b * div
  18079. // return {div, rem}
  18080. // This should be lowered into UDIV/SDIV + MLS later on.
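// For example (illustrative), an i32 sdivrem then becomes roughly:
//   sdiv r2, r0, r1
//   mls  r3, r2, r1, r0   ; r3 = r0 - r2*r1 (the remainder)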
  18081. bool hasDivide = Subtarget->isThumb() ? Subtarget->hasDivideInThumbMode()
  18082. : Subtarget->hasDivideInARMMode();
  18083. if (hasDivide && Op->getValueType(0).isSimple() &&
  18084. Op->getSimpleValueType(0) == MVT::i32) {
  18085. unsigned DivOpcode = isSigned ? ISD::SDIV : ISD::UDIV;
  18086. const SDValue Dividend = Op->getOperand(0);
  18087. const SDValue Divisor = Op->getOperand(1);
  18088. SDValue Div = DAG.getNode(DivOpcode, dl, VT, Dividend, Divisor);
  18089. SDValue Mul = DAG.getNode(ISD::MUL, dl, VT, Div, Divisor);
  18090. SDValue Rem = DAG.getNode(ISD::SUB, dl, VT, Dividend, Mul);
  18091. SDValue Values[2] = {Div, Rem};
  18092. return DAG.getNode(ISD::MERGE_VALUES, dl, DAG.getVTList(VT, VT), Values);
  18093. }
  18094. RTLIB::Libcall LC = getDivRemLibcall(Op.getNode(),
  18095. VT.getSimpleVT().SimpleTy);
  18096. SDValue InChain = DAG.getEntryNode();
  18097. TargetLowering::ArgListTy Args = getDivRemArgList(Op.getNode(),
  18098. DAG.getContext(),
  18099. Subtarget);
  18100. SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC),
  18101. getPointerTy(DAG.getDataLayout()));
  18102. Type *RetTy = StructType::get(Ty, Ty);
  18103. if (Subtarget->isTargetWindows())
  18104. InChain = WinDBZCheckDenominator(DAG, Op.getNode(), InChain);
  18105. TargetLowering::CallLoweringInfo CLI(DAG);
  18106. CLI.setDebugLoc(dl).setChain(InChain)
  18107. .setCallee(getLibcallCallingConv(LC), RetTy, Callee, std::move(Args))
  18108. .setInRegister().setSExtResult(isSigned).setZExtResult(!isSigned);
  18109. std::pair<SDValue, SDValue> CallInfo = LowerCallTo(CLI);
  18110. return CallInfo.first;
  18111. }
// Lowers REM using the divmod helpers; see RTABI sections 4.2/4.3.
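// For example (illustrative), an i32 srem becomes a call to __aeabi_idivmod,
// which returns the quotient in r0 and the remainder in r1; only the
// remainder (the second result) is used here.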
  18114. SDValue ARMTargetLowering::LowerREM(SDNode *N, SelectionDAG &DAG) const {
  18115. // Build return types (div and rem)
  18116. std::vector<Type*> RetTyParams;
  18117. Type *RetTyElement;
  18118. switch (N->getValueType(0).getSimpleVT().SimpleTy) {
  18119. default: llvm_unreachable("Unexpected request for libcall!");
  18120. case MVT::i8: RetTyElement = Type::getInt8Ty(*DAG.getContext()); break;
  18121. case MVT::i16: RetTyElement = Type::getInt16Ty(*DAG.getContext()); break;
  18122. case MVT::i32: RetTyElement = Type::getInt32Ty(*DAG.getContext()); break;
  18123. case MVT::i64: RetTyElement = Type::getInt64Ty(*DAG.getContext()); break;
  18124. }
  18125. RetTyParams.push_back(RetTyElement);
  18126. RetTyParams.push_back(RetTyElement);
  18127. ArrayRef<Type*> ret = ArrayRef<Type*>(RetTyParams);
  18128. Type *RetTy = StructType::get(*DAG.getContext(), ret);
  18129. RTLIB::Libcall LC = getDivRemLibcall(N, N->getValueType(0).getSimpleVT().
  18130. SimpleTy);
  18131. SDValue InChain = DAG.getEntryNode();
  18132. TargetLowering::ArgListTy Args = getDivRemArgList(N, DAG.getContext(),
  18133. Subtarget);
  18134. bool isSigned = N->getOpcode() == ISD::SREM;
  18135. SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC),
  18136. getPointerTy(DAG.getDataLayout()));
  18137. if (Subtarget->isTargetWindows())
  18138. InChain = WinDBZCheckDenominator(DAG, N, InChain);
  18139. // Lower call
  18140. CallLoweringInfo CLI(DAG);
  18141. CLI.setChain(InChain)
  18142. .setCallee(CallingConv::ARM_AAPCS, RetTy, Callee, std::move(Args))
  18143. .setSExtResult(isSigned).setZExtResult(!isSigned).setDebugLoc(SDLoc(N));
  18144. std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
  18145. // Return second (rem) result operand (first contains div)
  18146. SDNode *ResNode = CallResult.first.getNode();
  18147. assert(ResNode->getNumOperands() == 2 && "divmod should return two operands");
  18148. return ResNode->getOperand(1);
  18149. }
  18150. SDValue
  18151. ARMTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const {
  18152. assert(Subtarget->isTargetWindows() && "unsupported target platform");
  18153. SDLoc DL(Op);
  18154. // Get the inputs.
  18155. SDValue Chain = Op.getOperand(0);
  18156. SDValue Size = Op.getOperand(1);
  18157. if (DAG.getMachineFunction().getFunction().hasFnAttribute(
  18158. "no-stack-arg-probe")) {
  18159. MaybeAlign Align =
  18160. cast<ConstantSDNode>(Op.getOperand(2))->getMaybeAlignValue();
  18161. SDValue SP = DAG.getCopyFromReg(Chain, DL, ARM::SP, MVT::i32);
  18162. Chain = SP.getValue(1);
  18163. SP = DAG.getNode(ISD::SUB, DL, MVT::i32, SP, Size);
  18164. if (Align)
  18165. SP =
  18166. DAG.getNode(ISD::AND, DL, MVT::i32, SP.getValue(0),
  18167. DAG.getConstant(-(uint64_t)Align->value(), DL, MVT::i32));
  18168. Chain = DAG.getCopyToReg(Chain, DL, ARM::SP, SP);
  18169. SDValue Ops[2] = { SP, Chain };
  18170. return DAG.getMergeValues(Ops, DL);
  18171. }
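// Otherwise probe the stack via the Windows __chkstk helper, which (as
// modelled here) receives the allocation size in 4-byte words in r4; after
// the probe sp holds the adjusted stack pointer, hence the shift right by 2
// below and the copy from sp afterwards.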
  18172. SDValue Words = DAG.getNode(ISD::SRL, DL, MVT::i32, Size,
  18173. DAG.getConstant(2, DL, MVT::i32));
  18174. SDValue Flag;
  18175. Chain = DAG.getCopyToReg(Chain, DL, ARM::R4, Words, Flag);
  18176. Flag = Chain.getValue(1);
  18177. SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
  18178. Chain = DAG.getNode(ARMISD::WIN__CHKSTK, DL, NodeTys, Chain, Flag);
  18179. SDValue NewSP = DAG.getCopyFromReg(Chain, DL, ARM::SP, MVT::i32);
  18180. Chain = NewSP.getValue(1);
  18181. SDValue Ops[2] = { NewSP, Chain };
  18182. return DAG.getMergeValues(Ops, DL);
  18183. }
  18184. SDValue ARMTargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const {
  18185. bool IsStrict = Op->isStrictFPOpcode();
  18186. SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0);
  18187. const unsigned DstSz = Op.getValueType().getSizeInBits();
  18188. const unsigned SrcSz = SrcVal.getValueType().getSizeInBits();
  18189. assert(DstSz > SrcSz && DstSz <= 64 && SrcSz >= 16 &&
  18190. "Unexpected type for custom-lowering FP_EXTEND");
  18191. assert((!Subtarget->hasFP64() || !Subtarget->hasFPARMv8Base()) &&
  18192. "With both FP DP and 16, any FP conversion is legal!");
  18193. assert(!(DstSz == 32 && Subtarget->hasFP16()) &&
  18194. "With FP16, 16 to 32 conversion is legal!");
  18195. // Converting from 32 -> 64 is valid if we have FP64.
  18196. if (SrcSz == 32 && DstSz == 64 && Subtarget->hasFP64()) {
  18197. // FIXME: Remove this when we have strict fp instruction selection patterns
  18198. if (IsStrict) {
  18199. SDLoc Loc(Op);
  18200. SDValue Result = DAG.getNode(ISD::FP_EXTEND,
  18201. Loc, Op.getValueType(), SrcVal);
  18202. return DAG.getMergeValues({Result, Op.getOperand(0)}, Loc);
  18203. }
  18204. return Op;
  18205. }
// Either we are converting from 16 -> 64 without FP16 and/or
// double-precision FP (or without Armv8-FP), so the conversion must be done
// in two steps; or we are converting from 32 -> 64 without double-precision
// FP, or from 16 -> 32 without FP16, so a library call is required.
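// For example (illustrative): extending f16 to f64 with FP16 but without
// FP64 first uses a native f16 -> f32 extend and then a libcall (e.g.
// __aeabi_f2d / __extendsfdf2) for the f32 -> f64 step.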
  18211. SDLoc Loc(Op);
  18212. RTLIB::Libcall LC;
  18213. MakeLibCallOptions CallOptions;
  18214. SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
  18215. for (unsigned Sz = SrcSz; Sz <= 32 && Sz < DstSz; Sz *= 2) {
  18216. bool Supported = (Sz == 16 ? Subtarget->hasFP16() : Subtarget->hasFP64());
  18217. MVT SrcVT = (Sz == 16 ? MVT::f16 : MVT::f32);
  18218. MVT DstVT = (Sz == 16 ? MVT::f32 : MVT::f64);
  18219. if (Supported) {
  18220. if (IsStrict) {
  18221. SrcVal = DAG.getNode(ISD::STRICT_FP_EXTEND, Loc,
  18222. {DstVT, MVT::Other}, {Chain, SrcVal});
  18223. Chain = SrcVal.getValue(1);
  18224. } else {
  18225. SrcVal = DAG.getNode(ISD::FP_EXTEND, Loc, DstVT, SrcVal);
  18226. }
  18227. } else {
  18228. LC = RTLIB::getFPEXT(SrcVT, DstVT);
  18229. assert(LC != RTLIB::UNKNOWN_LIBCALL &&
  18230. "Unexpected type for custom-lowering FP_EXTEND");
  18231. std::tie(SrcVal, Chain) = makeLibCall(DAG, LC, DstVT, SrcVal, CallOptions,
  18232. Loc, Chain);
  18233. }
  18234. }
  18235. return IsStrict ? DAG.getMergeValues({SrcVal, Chain}, Loc) : SrcVal;
  18236. }
  18237. SDValue ARMTargetLowering::LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
  18238. bool IsStrict = Op->isStrictFPOpcode();
  18239. SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0);
  18240. EVT SrcVT = SrcVal.getValueType();
  18241. EVT DstVT = Op.getValueType();
  18242. const unsigned DstSz = Op.getValueType().getSizeInBits();
  18243. const unsigned SrcSz = SrcVT.getSizeInBits();
  18244. (void)DstSz;
  18245. assert(DstSz < SrcSz && SrcSz <= 64 && DstSz >= 16 &&
  18246. "Unexpected type for custom-lowering FP_ROUND");
  18247. assert((!Subtarget->hasFP64() || !Subtarget->hasFPARMv8Base()) &&
  18248. "With both FP DP and 16, any FP conversion is legal!");
  18249. SDLoc Loc(Op);
  18250. // Instruction from 32 -> 16 if hasFP16 is valid
  18251. if (SrcSz == 32 && Subtarget->hasFP16())
  18252. return Op;
  18253. // Lib call from 32 -> 16 / 64 -> [32, 16]
  18254. RTLIB::Libcall LC = RTLIB::getFPROUND(SrcVT, DstVT);
  18255. assert(LC != RTLIB::UNKNOWN_LIBCALL &&
  18256. "Unexpected type for custom-lowering FP_ROUND");
  18257. MakeLibCallOptions CallOptions;
  18258. SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
  18259. SDValue Result;
  18260. std::tie(Result, Chain) = makeLibCall(DAG, LC, DstVT, SrcVal, CallOptions,
  18261. Loc, Chain);
  18262. return IsStrict ? DAG.getMergeValues({Result, Chain}, Loc) : Result;
  18263. }
  18264. void ARMTargetLowering::lowerABS(SDNode *N, SmallVectorImpl<SDValue> &Results,
  18265. SelectionDAG &DAG) const {
  18266. assert(N->getValueType(0) == MVT::i64 && "Unexpected type (!= i64) on ABS.");
  18267. MVT HalfT = MVT::i32;
  18268. SDLoc dl(N);
  18269. SDValue Hi, Lo, Tmp;
  18270. if (!isOperationLegalOrCustom(ISD::ADDCARRY, HalfT) ||
  18271. !isOperationLegalOrCustom(ISD::UADDO, HalfT))
return;
  18273. unsigned OpTypeBits = HalfT.getScalarSizeInBits();
  18274. SDVTList VTList = DAG.getVTList(HalfT, MVT::i1);
  18275. Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(0),
  18276. DAG.getConstant(0, dl, HalfT));
  18277. Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(0),
  18278. DAG.getConstant(1, dl, HalfT));
  18279. Tmp = DAG.getNode(ISD::SRA, dl, HalfT, Hi,
  18280. DAG.getConstant(OpTypeBits - 1, dl,
  18281. getShiftAmountTy(HalfT, DAG.getDataLayout())));
  18282. Lo = DAG.getNode(ISD::UADDO, dl, VTList, Tmp, Lo);
  18283. Hi = DAG.getNode(ISD::ADDCARRY, dl, VTList, Tmp, Hi,
  18284. SDValue(Lo.getNode(), 1));
  18285. Hi = DAG.getNode(ISD::XOR, dl, HalfT, Tmp, Hi);
  18286. Lo = DAG.getNode(ISD::XOR, dl, HalfT, Tmp, Lo);
  18287. Results.push_back(Lo);
  18288. Results.push_back(Hi);
  18289. }
  18290. bool
  18291. ARMTargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const {
  18292. // The ARM target isn't yet aware of offsets.
  18293. return false;
  18294. }
  18295. bool ARM::isBitFieldInvertedMask(unsigned v) {
  18296. if (v == 0xffffffff)
  18297. return false;
// There can be 1s on either or both "outsides"; all of the "inside" bits
// must be 0s.
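// For example (illustrative): 0xf000000f qualifies (its complement
// 0x0ffffff0 is a single contiguous run of ones), while 0xff00ff00 does not.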
  18300. return isShiftedMask_32(~v);
  18301. }
  18302. /// isFPImmLegal - Returns true if the target can instruction select the
  18303. /// specified FP immediate natively. If false, the legalizer will
  18304. /// materialize the FP immediate as a load from a constant pool.
  18305. bool ARMTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT,
  18306. bool ForCodeSize) const {
  18307. if (!Subtarget->hasVFP3Base())
  18308. return false;
  18309. if (VT == MVT::f16 && Subtarget->hasFullFP16())
  18310. return ARM_AM::getFP16Imm(Imm) != -1;
  18311. if (VT == MVT::f32 && Subtarget->hasFullFP16() &&
  18312. ARM_AM::getFP32FP16Imm(Imm) != -1)
  18313. return true;
  18314. if (VT == MVT::f32)
  18315. return ARM_AM::getFP32Imm(Imm) != -1;
  18316. if (VT == MVT::f64 && Subtarget->hasFP64())
  18317. return ARM_AM::getFP64Imm(Imm) != -1;
  18318. return false;
  18319. }
  18320. /// getTgtMemIntrinsic - Represent NEON load and store intrinsics as
  18321. /// MemIntrinsicNodes. The associated MachineMemOperands record the alignment
  18322. /// specified in the intrinsic calls.
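/// For example (illustrative): an arm.neon.vld2 returning two <4 x i32>
/// vectors is conservatively described below as a 256-bit (v4i64) load from
/// its pointer argument, with the alignment taken from the trailing
/// alignment operand.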
  18323. bool ARMTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
  18324. const CallInst &I,
  18325. MachineFunction &MF,
  18326. unsigned Intrinsic) const {
  18327. switch (Intrinsic) {
  18328. case Intrinsic::arm_neon_vld1:
  18329. case Intrinsic::arm_neon_vld2:
  18330. case Intrinsic::arm_neon_vld3:
  18331. case Intrinsic::arm_neon_vld4:
  18332. case Intrinsic::arm_neon_vld2lane:
  18333. case Intrinsic::arm_neon_vld3lane:
  18334. case Intrinsic::arm_neon_vld4lane:
  18335. case Intrinsic::arm_neon_vld2dup:
  18336. case Intrinsic::arm_neon_vld3dup:
  18337. case Intrinsic::arm_neon_vld4dup: {
  18338. Info.opc = ISD::INTRINSIC_W_CHAIN;
  18339. // Conservatively set memVT to the entire set of vectors loaded.
  18340. auto &DL = I.getCalledFunction()->getParent()->getDataLayout();
  18341. uint64_t NumElts = DL.getTypeSizeInBits(I.getType()) / 64;
  18342. Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
  18343. Info.ptrVal = I.getArgOperand(0);
  18344. Info.offset = 0;
  18345. Value *AlignArg = I.getArgOperand(I.arg_size() - 1);
  18346. Info.align = cast<ConstantInt>(AlignArg)->getMaybeAlignValue();
  18347. // volatile loads with NEON intrinsics not supported
  18348. Info.flags = MachineMemOperand::MOLoad;
  18349. return true;
  18350. }
  18351. case Intrinsic::arm_neon_vld1x2:
  18352. case Intrinsic::arm_neon_vld1x3:
  18353. case Intrinsic::arm_neon_vld1x4: {
  18354. Info.opc = ISD::INTRINSIC_W_CHAIN;
  18355. // Conservatively set memVT to the entire set of vectors loaded.
  18356. auto &DL = I.getCalledFunction()->getParent()->getDataLayout();
  18357. uint64_t NumElts = DL.getTypeSizeInBits(I.getType()) / 64;
  18358. Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
  18359. Info.ptrVal = I.getArgOperand(I.arg_size() - 1);
  18360. Info.offset = 0;
  18361. Info.align.reset();
  18362. // volatile loads with NEON intrinsics not supported
  18363. Info.flags = MachineMemOperand::MOLoad;
  18364. return true;
  18365. }
  18366. case Intrinsic::arm_neon_vst1:
  18367. case Intrinsic::arm_neon_vst2:
  18368. case Intrinsic::arm_neon_vst3:
  18369. case Intrinsic::arm_neon_vst4:
  18370. case Intrinsic::arm_neon_vst2lane:
  18371. case Intrinsic::arm_neon_vst3lane:
  18372. case Intrinsic::arm_neon_vst4lane: {
  18373. Info.opc = ISD::INTRINSIC_VOID;
  18374. // Conservatively set memVT to the entire set of vectors stored.
  18375. auto &DL = I.getCalledFunction()->getParent()->getDataLayout();
  18376. unsigned NumElts = 0;
  18377. for (unsigned ArgI = 1, ArgE = I.arg_size(); ArgI < ArgE; ++ArgI) {
  18378. Type *ArgTy = I.getArgOperand(ArgI)->getType();
  18379. if (!ArgTy->isVectorTy())
  18380. break;
  18381. NumElts += DL.getTypeSizeInBits(ArgTy) / 64;
  18382. }
  18383. Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
  18384. Info.ptrVal = I.getArgOperand(0);
  18385. Info.offset = 0;
  18386. Value *AlignArg = I.getArgOperand(I.arg_size() - 1);
  18387. Info.align = cast<ConstantInt>(AlignArg)->getMaybeAlignValue();
  18388. // volatile stores with NEON intrinsics not supported
  18389. Info.flags = MachineMemOperand::MOStore;
  18390. return true;
  18391. }
  18392. case Intrinsic::arm_neon_vst1x2:
  18393. case Intrinsic::arm_neon_vst1x3:
  18394. case Intrinsic::arm_neon_vst1x4: {
  18395. Info.opc = ISD::INTRINSIC_VOID;
  18396. // Conservatively set memVT to the entire set of vectors stored.
  18397. auto &DL = I.getCalledFunction()->getParent()->getDataLayout();
  18398. unsigned NumElts = 0;
  18399. for (unsigned ArgI = 1, ArgE = I.arg_size(); ArgI < ArgE; ++ArgI) {
  18400. Type *ArgTy = I.getArgOperand(ArgI)->getType();
  18401. if (!ArgTy->isVectorTy())
  18402. break;
  18403. NumElts += DL.getTypeSizeInBits(ArgTy) / 64;
  18404. }
  18405. Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
  18406. Info.ptrVal = I.getArgOperand(0);
  18407. Info.offset = 0;
  18408. Info.align.reset();
  18409. // volatile stores with NEON intrinsics not supported
  18410. Info.flags = MachineMemOperand::MOStore;
  18411. return true;
  18412. }
  18413. case Intrinsic::arm_mve_vld2q:
  18414. case Intrinsic::arm_mve_vld4q: {
  18415. Info.opc = ISD::INTRINSIC_W_CHAIN;
  18416. // Conservatively set memVT to the entire set of vectors loaded.
  18417. Type *VecTy = cast<StructType>(I.getType())->getElementType(1);
  18418. unsigned Factor = Intrinsic == Intrinsic::arm_mve_vld2q ? 2 : 4;
  18419. Info.memVT = EVT::getVectorVT(VecTy->getContext(), MVT::i64, Factor * 2);
  18420. Info.ptrVal = I.getArgOperand(0);
  18421. Info.offset = 0;
  18422. Info.align = Align(VecTy->getScalarSizeInBits() / 8);
  18423. // volatile loads with MVE intrinsics not supported
  18424. Info.flags = MachineMemOperand::MOLoad;
  18425. return true;
  18426. }
  18427. case Intrinsic::arm_mve_vst2q:
  18428. case Intrinsic::arm_mve_vst4q: {
  18429. Info.opc = ISD::INTRINSIC_VOID;
  18430. // Conservatively set memVT to the entire set of vectors stored.
  18431. Type *VecTy = I.getArgOperand(1)->getType();
  18432. unsigned Factor = Intrinsic == Intrinsic::arm_mve_vst2q ? 2 : 4;
  18433. Info.memVT = EVT::getVectorVT(VecTy->getContext(), MVT::i64, Factor * 2);
  18434. Info.ptrVal = I.getArgOperand(0);
  18435. Info.offset = 0;
  18436. Info.align = Align(VecTy->getScalarSizeInBits() / 8);
  18437. // volatile stores with MVE intrinsics not supported
  18438. Info.flags = MachineMemOperand::MOStore;
  18439. return true;
  18440. }
  18441. case Intrinsic::arm_mve_vldr_gather_base:
  18442. case Intrinsic::arm_mve_vldr_gather_base_predicated: {
  18443. Info.opc = ISD::INTRINSIC_W_CHAIN;
  18444. Info.ptrVal = nullptr;
  18445. Info.memVT = MVT::getVT(I.getType());
  18446. Info.align = Align(1);
  18447. Info.flags |= MachineMemOperand::MOLoad;
  18448. return true;
  18449. }
  18450. case Intrinsic::arm_mve_vldr_gather_base_wb:
  18451. case Intrinsic::arm_mve_vldr_gather_base_wb_predicated: {
  18452. Info.opc = ISD::INTRINSIC_W_CHAIN;
  18453. Info.ptrVal = nullptr;
  18454. Info.memVT = MVT::getVT(I.getType()->getContainedType(0));
  18455. Info.align = Align(1);
  18456. Info.flags |= MachineMemOperand::MOLoad;
  18457. return true;
  18458. }
  18459. case Intrinsic::arm_mve_vldr_gather_offset:
  18460. case Intrinsic::arm_mve_vldr_gather_offset_predicated: {
  18461. Info.opc = ISD::INTRINSIC_W_CHAIN;
  18462. Info.ptrVal = nullptr;
  18463. MVT DataVT = MVT::getVT(I.getType());
  18464. unsigned MemSize = cast<ConstantInt>(I.getArgOperand(2))->getZExtValue();
  18465. Info.memVT = MVT::getVectorVT(MVT::getIntegerVT(MemSize),
  18466. DataVT.getVectorNumElements());
  18467. Info.align = Align(1);
  18468. Info.flags |= MachineMemOperand::MOLoad;
  18469. return true;
  18470. }
  18471. case Intrinsic::arm_mve_vstr_scatter_base:
  18472. case Intrinsic::arm_mve_vstr_scatter_base_predicated: {
  18473. Info.opc = ISD::INTRINSIC_VOID;
  18474. Info.ptrVal = nullptr;
  18475. Info.memVT = MVT::getVT(I.getArgOperand(2)->getType());
  18476. Info.align = Align(1);
  18477. Info.flags |= MachineMemOperand::MOStore;
  18478. return true;
  18479. }
  18480. case Intrinsic::arm_mve_vstr_scatter_base_wb:
  18481. case Intrinsic::arm_mve_vstr_scatter_base_wb_predicated: {
  18482. Info.opc = ISD::INTRINSIC_W_CHAIN;
  18483. Info.ptrVal = nullptr;
  18484. Info.memVT = MVT::getVT(I.getArgOperand(2)->getType());
  18485. Info.align = Align(1);
  18486. Info.flags |= MachineMemOperand::MOStore;
  18487. return true;
  18488. }
  18489. case Intrinsic::arm_mve_vstr_scatter_offset:
  18490. case Intrinsic::arm_mve_vstr_scatter_offset_predicated: {
  18491. Info.opc = ISD::INTRINSIC_VOID;
  18492. Info.ptrVal = nullptr;
  18493. MVT DataVT = MVT::getVT(I.getArgOperand(2)->getType());
  18494. unsigned MemSize = cast<ConstantInt>(I.getArgOperand(3))->getZExtValue();
  18495. Info.memVT = MVT::getVectorVT(MVT::getIntegerVT(MemSize),
  18496. DataVT.getVectorNumElements());
  18497. Info.align = Align(1);
  18498. Info.flags |= MachineMemOperand::MOStore;
  18499. return true;
  18500. }
  18501. case Intrinsic::arm_ldaex:
  18502. case Intrinsic::arm_ldrex: {
  18503. auto &DL = I.getCalledFunction()->getParent()->getDataLayout();
  18504. PointerType *PtrTy = cast<PointerType>(I.getArgOperand(0)->getType());
  18505. Info.opc = ISD::INTRINSIC_W_CHAIN;
  18506. Info.memVT = MVT::getVT(PtrTy->getPointerElementType());
  18507. Info.ptrVal = I.getArgOperand(0);
  18508. Info.offset = 0;
  18509. Info.align = DL.getABITypeAlign(PtrTy->getPointerElementType());
  18510. Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile;
  18511. return true;
  18512. }
  18513. case Intrinsic::arm_stlex:
  18514. case Intrinsic::arm_strex: {
  18515. auto &DL = I.getCalledFunction()->getParent()->getDataLayout();
  18516. PointerType *PtrTy = cast<PointerType>(I.getArgOperand(1)->getType());
  18517. Info.opc = ISD::INTRINSIC_W_CHAIN;
  18518. Info.memVT = MVT::getVT(PtrTy->getPointerElementType());
  18519. Info.ptrVal = I.getArgOperand(1);
  18520. Info.offset = 0;
  18521. Info.align = DL.getABITypeAlign(PtrTy->getPointerElementType());
  18522. Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MOVolatile;
  18523. return true;
  18524. }
  18525. case Intrinsic::arm_stlexd:
  18526. case Intrinsic::arm_strexd:
  18527. Info.opc = ISD::INTRINSIC_W_CHAIN;
  18528. Info.memVT = MVT::i64;
  18529. Info.ptrVal = I.getArgOperand(2);
  18530. Info.offset = 0;
  18531. Info.align = Align(8);
  18532. Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MOVolatile;
  18533. return true;
  18534. case Intrinsic::arm_ldaexd:
  18535. case Intrinsic::arm_ldrexd:
  18536. Info.opc = ISD::INTRINSIC_W_CHAIN;
  18537. Info.memVT = MVT::i64;
  18538. Info.ptrVal = I.getArgOperand(0);
  18539. Info.offset = 0;
  18540. Info.align = Align(8);
  18541. Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile;
  18542. return true;
  18543. default:
  18544. break;
  18545. }
  18546. return false;
  18547. }
  18548. /// Returns true if it is beneficial to convert a load of a constant
  18549. /// to just the constant itself.
  18550. bool ARMTargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
  18551. Type *Ty) const {
  18552. assert(Ty->isIntegerTy());
  18553. unsigned Bits = Ty->getPrimitiveSizeInBits();
  18554. if (Bits == 0 || Bits > 32)
  18555. return false;
  18556. return true;
  18557. }
  18558. bool ARMTargetLowering::isExtractSubvectorCheap(EVT ResVT, EVT SrcVT,
  18559. unsigned Index) const {
  18560. if (!isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, ResVT))
  18561. return false;
  18562. return (Index == 0 || Index == ResVT.getVectorNumElements());
  18563. }
  18564. Instruction *ARMTargetLowering::makeDMB(IRBuilderBase &Builder,
  18565. ARM_MB::MemBOpt Domain) const {
  18566. Module *M = Builder.GetInsertBlock()->getParent()->getParent();
  18567. // First, if the target has no DMB, see what fallback we can use.
  18568. if (!Subtarget->hasDataBarrier()) {
  18569. // Some ARMv6 cpus can support data barriers with an mcr instruction.
  18570. // Thumb1 and pre-v6 ARM mode use a libcall instead and should never get
  18571. // here.
  18572. if (Subtarget->hasV6Ops() && !Subtarget->isThumb()) {
  18573. Function *MCR = Intrinsic::getDeclaration(M, Intrinsic::arm_mcr);
  18574. Value* args[6] = {Builder.getInt32(15), Builder.getInt32(0),
  18575. Builder.getInt32(0), Builder.getInt32(7),
  18576. Builder.getInt32(10), Builder.getInt32(5)};
  18577. return Builder.CreateCall(MCR, args);
  18578. } else {
  18579. // Instead of using barriers, atomic accesses on these subtargets use
  18580. // libcalls.
  18581. llvm_unreachable("makeDMB on a target so old that it has no barriers");
  18582. }
  18583. } else {
  18584. Function *DMB = Intrinsic::getDeclaration(M, Intrinsic::arm_dmb);
  18585. // Only a full system barrier exists in the M-class architectures.
  18586. Domain = Subtarget->isMClass() ? ARM_MB::SY : Domain;
  18587. Constant *CDomain = Builder.getInt32(Domain);
  18588. return Builder.CreateCall(DMB, CDomain);
  18589. }
  18590. }
  18591. // Based on http://www.cl.cam.ac.uk/~pes20/cpp/cpp0xmappings.html
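// For example (illustrative), under these mappings a seq_cst store is
// bracketed roughly as "dmb ish; str; dmb ish" (an ISHST barrier may be used
// for the leading fence on some subtargets): the leading fence is emitted
// here and the trailing one in emitTrailingFence below.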
  18592. Instruction *ARMTargetLowering::emitLeadingFence(IRBuilderBase &Builder,
  18593. Instruction *Inst,
  18594. AtomicOrdering Ord) const {
  18595. switch (Ord) {
  18596. case AtomicOrdering::NotAtomic:
  18597. case AtomicOrdering::Unordered:
  18598. llvm_unreachable("Invalid fence: unordered/non-atomic");
  18599. case AtomicOrdering::Monotonic:
  18600. case AtomicOrdering::Acquire:
  18601. return nullptr; // Nothing to do
  18602. case AtomicOrdering::SequentiallyConsistent:
  18603. if (!Inst->hasAtomicStore())
  18604. return nullptr; // Nothing to do
  18605. LLVM_FALLTHROUGH;
  18606. case AtomicOrdering::Release:
  18607. case AtomicOrdering::AcquireRelease:
  18608. if (Subtarget->preferISHSTBarriers())
  18609. return makeDMB(Builder, ARM_MB::ISHST);
  18610. // FIXME: add a comment with a link to documentation justifying this.
  18611. else
  18612. return makeDMB(Builder, ARM_MB::ISH);
  18613. }
  18614. llvm_unreachable("Unknown fence ordering in emitLeadingFence");
  18615. }
  18616. Instruction *ARMTargetLowering::emitTrailingFence(IRBuilderBase &Builder,
  18617. Instruction *Inst,
  18618. AtomicOrdering Ord) const {
  18619. switch (Ord) {
  18620. case AtomicOrdering::NotAtomic:
  18621. case AtomicOrdering::Unordered:
  18622. llvm_unreachable("Invalid fence: unordered/not-atomic");
  18623. case AtomicOrdering::Monotonic:
  18624. case AtomicOrdering::Release:
  18625. return nullptr; // Nothing to do
  18626. case AtomicOrdering::Acquire:
  18627. case AtomicOrdering::AcquireRelease:
  18628. case AtomicOrdering::SequentiallyConsistent:
  18629. return makeDMB(Builder, ARM_MB::ISH);
  18630. }
  18631. llvm_unreachable("Unknown fence ordering in emitTrailingFence");
  18632. }
  18633. // Loads and stores less than 64-bits are already atomic; ones above that
  18634. // are doomed anyway, so defer to the default libcall and blame the OS when
  18635. // things go wrong. Cortex M doesn't have ldrexd/strexd though, so don't emit
  18636. // anything for those.
bool ARMTargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
  unsigned Size = SI->getValueOperand()->getType()->getPrimitiveSizeInBits();
  return (Size == 64) && !Subtarget->isMClass();
}

// Loads and stores less than 64-bits are already atomic; ones above that
// are doomed anyway, so defer to the default libcall and blame the OS when
// things go wrong. Cortex M doesn't have ldrexd/strexd though, so don't emit
// anything for those.
// FIXME: ldrd and strd are atomic if the CPU has LPAE (e.g. A15 has that
// guarantee, see DDI0406C ARM architecture reference manual,
// sections A8.8.72-74 LDRD)
TargetLowering::AtomicExpansionKind
ARMTargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
  unsigned Size = LI->getType()->getPrimitiveSizeInBits();
  return ((Size == 64) && !Subtarget->isMClass()) ? AtomicExpansionKind::LLOnly
                                                  : AtomicExpansionKind::None;
}

// For the real atomic operations, we have ldrex/strex up to 32 bits,
// and up to 64 bits on the non-M profiles
TargetLowering::AtomicExpansionKind
ARMTargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
  if (AI->isFloatingPointOperation())
    return AtomicExpansionKind::CmpXChg;

  // At -O0, fast-regalloc cannot cope with the live vregs necessary to
  // implement atomicrmw without spilling. If the target address is also on the
  // stack and close enough to the spill slot, this can lead to a situation
  // where the monitor always gets cleared and the atomic operation can never
  // succeed. So at -O0 lower this operation to a CAS loop.
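  // (The resulting cmpxchg is then not expanded in IR at -O0 either; see
  // shouldExpandAtomicCmpXchgInIR below, which leaves it to a late-expanded
  // pseudo-instruction.)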
  if (getTargetMachine().getOptLevel() == CodeGenOpt::None)
    return AtomicExpansionKind::CmpXChg;

  unsigned Size = AI->getType()->getPrimitiveSizeInBits();
  bool hasAtomicRMW = !Subtarget->isThumb() || Subtarget->hasV8MBaselineOps();
  return (Size <= (Subtarget->isMClass() ? 32U : 64U) && hasAtomicRMW)
             ? AtomicExpansionKind::LLSC
             : AtomicExpansionKind::None;
}

// Similar to shouldExpandAtomicRMWInIR, ldrex/strex can be used up to 32
// bits, and up to 64 bits on the non-M profiles.
TargetLowering::AtomicExpansionKind
ARMTargetLowering::shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *AI) const {
  // At -O0, fast-regalloc cannot cope with the live vregs necessary to
  // implement cmpxchg without spilling. If the address being exchanged is also
  // on the stack and close enough to the spill slot, this can lead to a
  // situation where the monitor always gets cleared and the atomic operation
  // can never succeed. So at -O0 we need a late-expanded pseudo-inst instead.
  unsigned Size = AI->getOperand(1)->getType()->getPrimitiveSizeInBits();
  bool HasAtomicCmpXchg =
      !Subtarget->isThumb() || Subtarget->hasV8MBaselineOps();
  if (getTargetMachine().getOptLevel() != 0 && HasAtomicCmpXchg &&
      Size <= (Subtarget->isMClass() ? 32U : 64U))
    return AtomicExpansionKind::LLSC;
  return AtomicExpansionKind::None;
}

bool ARMTargetLowering::shouldInsertFencesForAtomic(
    const Instruction *I) const {
  return InsertFencesForAtomic;
}

bool ARMTargetLowering::useLoadStackGuardNode() const { return true; }

void ARMTargetLowering::insertSSPDeclarations(Module &M) const {
  if (!Subtarget->getTargetTriple().isWindowsMSVCEnvironment())
    return TargetLowering::insertSSPDeclarations(M);

  // MSVC CRT has a global variable holding security cookie.
  M.getOrInsertGlobal("__security_cookie",
                      Type::getInt8PtrTy(M.getContext()));

  // MSVC CRT has a function to validate security cookie.
  FunctionCallee SecurityCheckCookie = M.getOrInsertFunction(
      "__security_check_cookie", Type::getVoidTy(M.getContext()),
      Type::getInt8PtrTy(M.getContext()));
  if (Function *F = dyn_cast<Function>(SecurityCheckCookie.getCallee()))
    F->addParamAttr(0, Attribute::AttrKind::InReg);
}

Value *ARMTargetLowering::getSDagStackGuard(const Module &M) const {
  // MSVC CRT has a global variable holding security cookie.
  if (Subtarget->getTargetTriple().isWindowsMSVCEnvironment())
    return M.getGlobalVariable("__security_cookie");
  return TargetLowering::getSDagStackGuard(M);
}

Function *ARMTargetLowering::getSSPStackGuardCheck(const Module &M) const {
  // MSVC CRT has a function to validate security cookie.
  if (Subtarget->getTargetTriple().isWindowsMSVCEnvironment())
    return M.getFunction("__security_check_cookie");
  return TargetLowering::getSSPStackGuardCheck(M);
}

bool ARMTargetLowering::canCombineStoreAndExtract(Type *VectorTy, Value *Idx,
                                                  unsigned &Cost) const {
  // If we do not have NEON, vector types are not natively supported.
  if (!Subtarget->hasNEON())
    return false;

  // Floating point values and vector values map to the same register file.
  // Therefore, although we could do a store extract of a vector type, it is
  // better to leave those as float, since we have more freedom in the
  // addressing modes for floating-point values.
  if (VectorTy->isFPOrFPVectorTy())
    return false;

  // If the index is unknown at compile time, this is very expensive to lower
  // and it is not possible to combine the store with the extract.
  if (!isa<ConstantInt>(Idx))
    return false;

  assert(VectorTy->isVectorTy() && "VectorTy is not a vector type");
  unsigned BitWidth = VectorTy->getPrimitiveSizeInBits().getFixedSize();
  // We can do a store + vector extract on any vector that fits perfectly in a D
  // or Q register.
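  // (e.g. <8 x i8>, <4 x i16> or <2 x i32> fill a D register; <16 x i8>,
  // <8 x i16> or <4 x i32> fill a Q register.)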
  if (BitWidth == 64 || BitWidth == 128) {
    Cost = 0;
    return true;
  }
  return false;
}

bool ARMTargetLowering::isCheapToSpeculateCttz() const {
  return Subtarget->hasV6T2Ops();
}

bool ARMTargetLowering::isCheapToSpeculateCtlz() const {
  return Subtarget->hasV6T2Ops();
}

bool ARMTargetLowering::shouldExpandShift(SelectionDAG &DAG, SDNode *N) const {
  return !Subtarget->hasMinSize() || Subtarget->isTargetWindows();
}

Value *ARMTargetLowering::emitLoadLinked(IRBuilderBase &Builder, Type *ValueTy,
                                         Value *Addr,
                                         AtomicOrdering Ord) const {
  Module *M = Builder.GetInsertBlock()->getParent()->getParent();
  bool IsAcquire = isAcquireOrStronger(Ord);

  // Since i64 isn't legal and intrinsics don't get type-lowered, the ldrexd
  // intrinsic must return {i32, i32} and we have to recombine them into a
  // single i64 here.
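  // Roughly, for the acquire case on a little-endian target (an illustrative
  // sketch, not verbatim output):
  //   %lohi = call { i32, i32 } @llvm.arm.ldaexd(i8* %addr)
  //   %lo = extractvalue { i32, i32 } %lohi, 0
  //   %hi = extractvalue { i32, i32 } %lohi, 1
  //   %val64 = or i64 (zext %lo), (shl (zext %hi), 32)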
  if (ValueTy->getPrimitiveSizeInBits() == 64) {
    Intrinsic::ID Int =
        IsAcquire ? Intrinsic::arm_ldaexd : Intrinsic::arm_ldrexd;
    Function *Ldrex = Intrinsic::getDeclaration(M, Int);

    Addr = Builder.CreateBitCast(Addr, Type::getInt8PtrTy(M->getContext()));

    Value *LoHi = Builder.CreateCall(Ldrex, Addr, "lohi");
    Value *Lo = Builder.CreateExtractValue(LoHi, 0, "lo");
    Value *Hi = Builder.CreateExtractValue(LoHi, 1, "hi");
    if (!Subtarget->isLittle())
      std::swap(Lo, Hi);
    Lo = Builder.CreateZExt(Lo, ValueTy, "lo64");
    Hi = Builder.CreateZExt(Hi, ValueTy, "hi64");
    return Builder.CreateOr(
        Lo, Builder.CreateShl(Hi, ConstantInt::get(ValueTy, 32)), "val64");
  }

  Type *Tys[] = { Addr->getType() };
  Intrinsic::ID Int = IsAcquire ? Intrinsic::arm_ldaex : Intrinsic::arm_ldrex;
  Function *Ldrex = Intrinsic::getDeclaration(M, Int, Tys);

  return Builder.CreateTruncOrBitCast(Builder.CreateCall(Ldrex, Addr), ValueTy);
}
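
// On the no-store path of an expanded cmpxchg the exclusive monitor set by the
// ldrex has no matching strex. On v7 and later this emits a clrex to clear the
// monitor; earlier targets have no clrex, so nothing is inserted.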
void ARMTargetLowering::emitAtomicCmpXchgNoStoreLLBalance(
    IRBuilderBase &Builder) const {
  if (!Subtarget->hasV7Ops())
    return;
  Module *M = Builder.GetInsertBlock()->getParent()->getParent();
  Builder.CreateCall(Intrinsic::getDeclaration(M, Intrinsic::arm_clrex));
}

Value *ARMTargetLowering::emitStoreConditional(IRBuilderBase &Builder,
                                               Value *Val, Value *Addr,
                                               AtomicOrdering Ord) const {
  Module *M = Builder.GetInsertBlock()->getParent()->getParent();
  bool IsRelease = isReleaseOrStronger(Ord);

  // Since the intrinsics must have legal type, the i64 intrinsics take two
  // parameters: "i32, i32". We must marshal Val into the appropriate form
  // before the call.
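  // Sketch of the i64 case (illustrative, release ordering shown):
  //   %lo = trunc i64 %val to i32
  //   %hi = trunc i64 (lshr i64 %val, 32) to i32
  //   %status = call i32 @llvm.arm.stlexd(i32 %lo, i32 %hi, i8* %addr)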
  if (Val->getType()->getPrimitiveSizeInBits() == 64) {
    Intrinsic::ID Int =
        IsRelease ? Intrinsic::arm_stlexd : Intrinsic::arm_strexd;
    Function *Strex = Intrinsic::getDeclaration(M, Int);
    Type *Int32Ty = Type::getInt32Ty(M->getContext());

    Value *Lo = Builder.CreateTrunc(Val, Int32Ty, "lo");
    Value *Hi = Builder.CreateTrunc(Builder.CreateLShr(Val, 32), Int32Ty, "hi");
    if (!Subtarget->isLittle())
      std::swap(Lo, Hi);
    Addr = Builder.CreateBitCast(Addr, Type::getInt8PtrTy(M->getContext()));
    return Builder.CreateCall(Strex, {Lo, Hi, Addr});
  }

  Intrinsic::ID Int = IsRelease ? Intrinsic::arm_stlex : Intrinsic::arm_strex;
  Type *Tys[] = { Addr->getType() };
  Function *Strex = Intrinsic::getDeclaration(M, Int, Tys);

  return Builder.CreateCall(
      Strex, {Builder.CreateZExtOrBitCast(
                  Val, Strex->getFunctionType()->getParamType(0)),
              Addr});
}

bool ARMTargetLowering::alignLoopsWithOptSize() const {
  return Subtarget->isMClass();
}

/// A helper function for determining the number of interleaved accesses we
/// will generate when lowering accesses of the given type.
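/// For example, a <16 x i32> access (512 bits) is counted as
/// (512 + 127) / 128 = 4 accesses, while a 64-bit <8 x i8> rounds up to 1.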
unsigned
ARMTargetLowering::getNumInterleavedAccesses(VectorType *VecTy,
                                             const DataLayout &DL) const {
  return (DL.getTypeSizeInBits(VecTy) + 127) / 128;
}

bool ARMTargetLowering::isLegalInterleavedAccessType(
    unsigned Factor, FixedVectorType *VecTy, Align Alignment,
    const DataLayout &DL) const {

  unsigned VecSize = DL.getTypeSizeInBits(VecTy);
  unsigned ElSize = DL.getTypeSizeInBits(VecTy->getElementType());

  if (!Subtarget->hasNEON() && !Subtarget->hasMVEIntegerOps())
    return false;

  // Ensure the vector doesn't have f16 elements. Even though we could do an
  // i16 vldN, we can't hold the f16 vectors and will end up converting via
  // f32.
  if (Subtarget->hasNEON() && VecTy->getElementType()->isHalfTy())
    return false;
  if (Subtarget->hasMVEIntegerOps() && Factor == 3)
    return false;

  // Ensure the number of vector elements is greater than 1.
  if (VecTy->getNumElements() < 2)
    return false;

  // Ensure the element type is legal.
  if (ElSize != 8 && ElSize != 16 && ElSize != 32)
    return false;
  // And that the alignment is high enough under MVE.
  if (Subtarget->hasMVEIntegerOps() && Alignment < ElSize / 8)
    return false;

  // Ensure the total vector size is 64 or a multiple of 128. Types larger than
  // 128 will be split into multiple interleaved accesses.
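  // (e.g. under NEON a <2 x i32> (64 bits) or <4 x i32> (128 bits) is legal,
  // an <8 x i32> (256 bits) is legal but will be split into two accesses, and
  // a <3 x i32> (96 bits) is not legal.)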
  if (Subtarget->hasNEON() && VecSize == 64)
    return true;
  return VecSize % 128 == 0;
}

unsigned ARMTargetLowering::getMaxSupportedInterleaveFactor() const {
  if (Subtarget->hasNEON())
    return 4;
  if (Subtarget->hasMVEIntegerOps())
    return MVEMaxSupportedInterleaveFactor;
  return TargetLoweringBase::getMaxSupportedInterleaveFactor();
}

/// Lower an interleaved load into a vldN intrinsic.
///
/// E.g. Lower an interleaved load (Factor = 2):
///        %wide.vec = load <8 x i32>, <8 x i32>* %ptr, align 4
///        %v0 = shuffle %wide.vec, undef, <0, 2, 4, 6>  ; Extract even elements
///        %v1 = shuffle %wide.vec, undef, <1, 3, 5, 7>  ; Extract odd elements
///
///      Into:
///        %vld2 = { <4 x i32>, <4 x i32> } call llvm.arm.neon.vld2(%ptr, 4)
///        %vec0 = extractelement { <4 x i32>, <4 x i32> } %vld2, i32 0
///        %vec1 = extractelement { <4 x i32>, <4 x i32> } %vld2, i32 1
bool ARMTargetLowering::lowerInterleavedLoad(
    LoadInst *LI, ArrayRef<ShuffleVectorInst *> Shuffles,
    ArrayRef<unsigned> Indices, unsigned Factor) const {
  assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
         "Invalid interleave factor");
  assert(!Shuffles.empty() && "Empty shufflevector input");
  assert(Shuffles.size() == Indices.size() &&
         "Unmatched number of shufflevectors and indices");

  auto *VecTy = cast<FixedVectorType>(Shuffles[0]->getType());
  Type *EltTy = VecTy->getElementType();

  const DataLayout &DL = LI->getModule()->getDataLayout();
  Align Alignment = LI->getAlign();

  // Skip if we do not have NEON and skip illegal vector types. We can
  // "legalize" wide vector types into multiple interleaved accesses as long as
  // the vector types are divisible by 128.
  if (!isLegalInterleavedAccessType(Factor, VecTy, Alignment, DL))
    return false;

  unsigned NumLoads = getNumInterleavedAccesses(VecTy, DL);

  // A pointer vector can not be the return type of the ldN intrinsics. Need to
  // load integer vectors first and then convert to pointer vectors.
  if (EltTy->isPointerTy())
    VecTy = FixedVectorType::get(DL.getIntPtrType(EltTy), VecTy);

  IRBuilder<> Builder(LI);

  // The base address of the load.
  Value *BaseAddr = LI->getPointerOperand();

  if (NumLoads > 1) {
    // If we're going to generate more than one load, reset the sub-vector type
    // to something legal.
    VecTy = FixedVectorType::get(VecTy->getElementType(),
                                 VecTy->getNumElements() / NumLoads);

    // We will compute the pointer operand of each load from the original base
    // address using GEPs. Cast the base address to a pointer to the scalar
    // element type.
    BaseAddr = Builder.CreateBitCast(
        BaseAddr,
        VecTy->getElementType()->getPointerTo(LI->getPointerAddressSpace()));
  }

  assert(isTypeLegal(EVT::getEVT(VecTy)) && "Illegal vldN vector type!");

  auto createLoadIntrinsic = [&](Value *BaseAddr) {
    if (Subtarget->hasNEON()) {
      Type *Int8Ptr = Builder.getInt8PtrTy(LI->getPointerAddressSpace());
      Type *Tys[] = {VecTy, Int8Ptr};
      static const Intrinsic::ID LoadInts[3] = {Intrinsic::arm_neon_vld2,
                                                Intrinsic::arm_neon_vld3,
                                                Intrinsic::arm_neon_vld4};
      Function *VldnFunc =
          Intrinsic::getDeclaration(LI->getModule(), LoadInts[Factor - 2], Tys);

      SmallVector<Value *, 2> Ops;
      Ops.push_back(Builder.CreateBitCast(BaseAddr, Int8Ptr));
      Ops.push_back(Builder.getInt32(LI->getAlignment()));
      return Builder.CreateCall(VldnFunc, Ops, "vldN");
    } else {
      assert((Factor == 2 || Factor == 4) &&
             "expected interleave factor of 2 or 4 for MVE");
      Intrinsic::ID LoadInts =
          Factor == 2 ? Intrinsic::arm_mve_vld2q : Intrinsic::arm_mve_vld4q;
      Type *VecEltTy =
          VecTy->getElementType()->getPointerTo(LI->getPointerAddressSpace());
      Type *Tys[] = {VecTy, VecEltTy};
      Function *VldnFunc =
          Intrinsic::getDeclaration(LI->getModule(), LoadInts, Tys);

      SmallVector<Value *, 2> Ops;
      Ops.push_back(Builder.CreateBitCast(BaseAddr, VecEltTy));
      return Builder.CreateCall(VldnFunc, Ops, "vldN");
    }
  };

  // Holds sub-vectors extracted from the load intrinsic return values. The
  // sub-vectors are associated with the shufflevector instructions they will
  // replace.
  DenseMap<ShuffleVectorInst *, SmallVector<Value *, 4>> SubVecs;

  for (unsigned LoadCount = 0; LoadCount < NumLoads; ++LoadCount) {
    // If we're generating more than one load, compute the base address of
    // subsequent loads as an offset from the previous.
    if (LoadCount > 0)
      BaseAddr = Builder.CreateConstGEP1_32(VecTy->getElementType(), BaseAddr,
                                            VecTy->getNumElements() * Factor);

    CallInst *VldN = createLoadIntrinsic(BaseAddr);

    // Replace uses of each shufflevector with the corresponding vector loaded
    // by ldN.
    for (unsigned i = 0; i < Shuffles.size(); i++) {
      ShuffleVectorInst *SV = Shuffles[i];
      unsigned Index = Indices[i];

      Value *SubVec = Builder.CreateExtractValue(VldN, Index);

      // Convert the integer vector to pointer vector if the element is pointer.
      if (EltTy->isPointerTy())
        SubVec = Builder.CreateIntToPtr(
            SubVec,
            FixedVectorType::get(SV->getType()->getElementType(), VecTy));

      SubVecs[SV].push_back(SubVec);
    }
  }

  // Replace uses of the shufflevector instructions with the sub-vectors
  // returned by the load intrinsic. If a shufflevector instruction is
  // associated with more than one sub-vector, those sub-vectors will be
  // concatenated into a single wide vector.
  for (ShuffleVectorInst *SVI : Shuffles) {
    auto &SubVec = SubVecs[SVI];
    auto *WideVec =
        SubVec.size() > 1 ? concatenateVectors(Builder, SubVec) : SubVec[0];
    SVI->replaceAllUsesWith(WideVec);
  }

  return true;
}

/// Lower an interleaved store into a vstN intrinsic.
///
/// E.g. Lower an interleaved store (Factor = 3):
///        %i.vec = shuffle <8 x i32> %v0, <8 x i32> %v1,
///                 <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>
///        store <12 x i32> %i.vec, <12 x i32>* %ptr, align 4
///
///      Into:
///        %sub.v0 = shuffle <8 x i32> %v0, <8 x i32> v1, <0, 1, 2, 3>
///        %sub.v1 = shuffle <8 x i32> %v0, <8 x i32> v1, <4, 5, 6, 7>
///        %sub.v2 = shuffle <8 x i32> %v0, <8 x i32> v1, <8, 9, 10, 11>
///        call void llvm.arm.neon.vst3(%ptr, %sub.v0, %sub.v1, %sub.v2, 4)
///
/// Note that the new shufflevectors will be removed and we'll only generate one
/// vst3 instruction in CodeGen.
///
/// Example for a more general valid mask (Factor 3). Lower:
///        %i.vec = shuffle <32 x i32> %v0, <32 x i32> %v1,
///                 <4, 32, 16, 5, 33, 17, 6, 34, 18, 7, 35, 19>
///        store <12 x i32> %i.vec, <12 x i32>* %ptr
///
///      Into:
///        %sub.v0 = shuffle <32 x i32> %v0, <32 x i32> v1, <4, 5, 6, 7>
///        %sub.v1 = shuffle <32 x i32> %v0, <32 x i32> v1, <32, 33, 34, 35>
///        %sub.v2 = shuffle <32 x i32> %v0, <32 x i32> v1, <16, 17, 18, 19>
///        call void llvm.arm.neon.vst3(%ptr, %sub.v0, %sub.v1, %sub.v2, 4)
bool ARMTargetLowering::lowerInterleavedStore(StoreInst *SI,
                                              ShuffleVectorInst *SVI,
                                              unsigned Factor) const {
  assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
         "Invalid interleave factor");

  auto *VecTy = cast<FixedVectorType>(SVI->getType());
  assert(VecTy->getNumElements() % Factor == 0 && "Invalid interleaved store");

  unsigned LaneLen = VecTy->getNumElements() / Factor;
  Type *EltTy = VecTy->getElementType();
  auto *SubVecTy = FixedVectorType::get(EltTy, LaneLen);

  const DataLayout &DL = SI->getModule()->getDataLayout();
  Align Alignment = SI->getAlign();

  // Skip if we do not have NEON and skip illegal vector types. We can
  // "legalize" wide vector types into multiple interleaved accesses as long as
  // the vector types are divisible by 128.
  if (!isLegalInterleavedAccessType(Factor, SubVecTy, Alignment, DL))
    return false;

  unsigned NumStores = getNumInterleavedAccesses(SubVecTy, DL);

  Value *Op0 = SVI->getOperand(0);
  Value *Op1 = SVI->getOperand(1);
  IRBuilder<> Builder(SI);

  // StN intrinsics don't support pointer vectors as arguments. Convert pointer
  // vectors to integer vectors.
  if (EltTy->isPointerTy()) {
    Type *IntTy = DL.getIntPtrType(EltTy);

    // Convert to the corresponding integer vector.
    auto *IntVecTy =
        FixedVectorType::get(IntTy, cast<FixedVectorType>(Op0->getType()));
    Op0 = Builder.CreatePtrToInt(Op0, IntVecTy);
    Op1 = Builder.CreatePtrToInt(Op1, IntVecTy);

    SubVecTy = FixedVectorType::get(IntTy, LaneLen);
  }

  // The base address of the store.
  Value *BaseAddr = SI->getPointerOperand();

  if (NumStores > 1) {
    // If we're going to generate more than one store, reset the lane length
    // and sub-vector type to something legal.
    LaneLen /= NumStores;
    SubVecTy = FixedVectorType::get(SubVecTy->getElementType(), LaneLen);

    // We will compute the pointer operand of each store from the original base
    // address using GEPs. Cast the base address to a pointer to the scalar
    // element type.
    BaseAddr = Builder.CreateBitCast(
        BaseAddr,
        SubVecTy->getElementType()->getPointerTo(SI->getPointerAddressSpace()));
  }

  assert(isTypeLegal(EVT::getEVT(SubVecTy)) && "Illegal vstN vector type!");

  auto Mask = SVI->getShuffleMask();

  auto createStoreIntrinsic = [&](Value *BaseAddr,
                                  SmallVectorImpl<Value *> &Shuffles) {
    if (Subtarget->hasNEON()) {
      static const Intrinsic::ID StoreInts[3] = {Intrinsic::arm_neon_vst2,
                                                 Intrinsic::arm_neon_vst3,
                                                 Intrinsic::arm_neon_vst4};
      Type *Int8Ptr = Builder.getInt8PtrTy(SI->getPointerAddressSpace());
      Type *Tys[] = {Int8Ptr, SubVecTy};

      Function *VstNFunc = Intrinsic::getDeclaration(
          SI->getModule(), StoreInts[Factor - 2], Tys);

      SmallVector<Value *, 6> Ops;
      Ops.push_back(Builder.CreateBitCast(BaseAddr, Int8Ptr));
      append_range(Ops, Shuffles);
      Ops.push_back(Builder.getInt32(SI->getAlignment()));
      Builder.CreateCall(VstNFunc, Ops);
    } else {
      assert((Factor == 2 || Factor == 4) &&
             "expected interleave factor of 2 or 4 for MVE");
      Intrinsic::ID StoreInts =
          Factor == 2 ? Intrinsic::arm_mve_vst2q : Intrinsic::arm_mve_vst4q;
      Type *EltPtrTy = SubVecTy->getElementType()->getPointerTo(
          SI->getPointerAddressSpace());
      Type *Tys[] = {EltPtrTy, SubVecTy};
      Function *VstNFunc =
          Intrinsic::getDeclaration(SI->getModule(), StoreInts, Tys);

      SmallVector<Value *, 6> Ops;
      Ops.push_back(Builder.CreateBitCast(BaseAddr, EltPtrTy));
      append_range(Ops, Shuffles);
      for (unsigned F = 0; F < Factor; F++) {
        Ops.push_back(Builder.getInt32(F));
        Builder.CreateCall(VstNFunc, Ops);
        Ops.pop_back();
      }
    }
  };

  for (unsigned StoreCount = 0; StoreCount < NumStores; ++StoreCount) {
    // If we're generating more than one store, compute the base address of
    // subsequent stores as an offset from the previous.
    if (StoreCount > 0)
      BaseAddr = Builder.CreateConstGEP1_32(SubVecTy->getElementType(),
                                            BaseAddr, LaneLen * Factor);

    SmallVector<Value *, 4> Shuffles;

    // Split the shufflevector operands into sub vectors for the new vstN call.
    for (unsigned i = 0; i < Factor; i++) {
      unsigned IdxI = StoreCount * LaneLen * Factor + i;
      if (Mask[IdxI] >= 0) {
        Shuffles.push_back(Builder.CreateShuffleVector(
            Op0, Op1, createSequentialMask(Mask[IdxI], LaneLen, 0)));
      } else {
        unsigned StartMask = 0;
        for (unsigned j = 1; j < LaneLen; j++) {
          unsigned IdxJ = StoreCount * LaneLen * Factor + j;
          if (Mask[IdxJ * Factor + IdxI] >= 0) {
            StartMask = Mask[IdxJ * Factor + IdxI] - IdxJ;
            break;
          }
        }
        // Note: If all elements in a chunk are undefs, StartMask=0!
        // Note: Filling undef gaps with random elements is ok, since
        // those elements were being written anyway (with undefs).
        // In the case of all undefs we default to using elements from 0.
        // Note: StartMask cannot be negative, it's checked in
        // isReInterleaveMask.
        Shuffles.push_back(Builder.CreateShuffleVector(
            Op0, Op1, createSequentialMask(StartMask, LaneLen, 0)));
      }
    }

    createStoreIntrinsic(BaseAddr, Shuffles);
  }
  return true;
}

enum HABaseType {
  HA_UNKNOWN = 0,
  HA_FLOAT,
  HA_DOUBLE,
  HA_VECT64,
  HA_VECT128
};
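
// Illustrative examples (not exhaustive): struct { float x, y, z; } is
// homogeneous with Base == HA_FLOAT and Members == 3, so it qualifies;
// struct { float f; double d; } mixes base types and does not; more than
// four members also disqualifies the aggregate (Members <= 4 below).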
static bool isHomogeneousAggregate(Type *Ty, HABaseType &Base,
                                   uint64_t &Members) {
  if (auto *ST = dyn_cast<StructType>(Ty)) {
    for (unsigned i = 0; i < ST->getNumElements(); ++i) {
      uint64_t SubMembers = 0;
      if (!isHomogeneousAggregate(ST->getElementType(i), Base, SubMembers))
        return false;
      Members += SubMembers;
    }
  } else if (auto *AT = dyn_cast<ArrayType>(Ty)) {
    uint64_t SubMembers = 0;
    if (!isHomogeneousAggregate(AT->getElementType(), Base, SubMembers))
      return false;
    Members += SubMembers * AT->getNumElements();
  } else if (Ty->isFloatTy()) {
    if (Base != HA_UNKNOWN && Base != HA_FLOAT)
      return false;
    Members = 1;
    Base = HA_FLOAT;
  } else if (Ty->isDoubleTy()) {
    if (Base != HA_UNKNOWN && Base != HA_DOUBLE)
      return false;
    Members = 1;
    Base = HA_DOUBLE;
  } else if (auto *VT = dyn_cast<VectorType>(Ty)) {
    Members = 1;
    switch (Base) {
    case HA_FLOAT:
    case HA_DOUBLE:
      return false;
    case HA_VECT64:
      return VT->getPrimitiveSizeInBits().getFixedSize() == 64;
    case HA_VECT128:
      return VT->getPrimitiveSizeInBits().getFixedSize() == 128;
    case HA_UNKNOWN:
      switch (VT->getPrimitiveSizeInBits().getFixedSize()) {
      case 64:
        Base = HA_VECT64;
        return true;
      case 128:
        Base = HA_VECT128;
        return true;
      default:
        return false;
      }
    }
  }

  return (Members > 0 && Members <= 4);
}

/// Return the correct alignment for the current calling convention.
Align ARMTargetLowering::getABIAlignmentForCallingConv(
    Type *ArgTy, const DataLayout &DL) const {
  const Align ABITypeAlign = DL.getABITypeAlign(ArgTy);
  if (!ArgTy->isVectorTy())
    return ABITypeAlign;

  // Avoid over-aligning vector parameters. It would require realigning the
  // stack and waste space for no real benefit.
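  // (Illustrative only: assuming the usual AAPCS data layout with an 8-byte
  // stack alignment, a 128-bit vector whose ABI alignment is 16 bytes is
  // passed with 8-byte alignment instead.)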
  return std::min(ABITypeAlign, DL.getStackAlignment());
}

/// Return true if a type is an AAPCS-VFP homogeneous aggregate or one of
/// [N x i32] or [N x i64]. This allows front-ends to skip emitting padding when
/// passing according to AAPCS rules.
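/// For example (sketch): { double, double } is a homogeneous aggregate and
/// returns true, as does [4 x i32]; a plain i32 or a struct mixing base types
/// returns false.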
bool ARMTargetLowering::functionArgumentNeedsConsecutiveRegisters(
    Type *Ty, CallingConv::ID CallConv, bool isVarArg,
    const DataLayout &DL) const {
  if (getEffectiveCallingConv(CallConv, isVarArg) !=
      CallingConv::ARM_AAPCS_VFP)
    return false;

  HABaseType Base = HA_UNKNOWN;
  uint64_t Members = 0;
  bool IsHA = isHomogeneousAggregate(Ty, Base, Members);
  LLVM_DEBUG(dbgs() << "isHA: " << IsHA << " "; Ty->dump());

  bool IsIntArray = Ty->isArrayTy() && Ty->getArrayElementType()->isIntegerTy();
  return IsHA || IsIntArray;
}

Register ARMTargetLowering::getExceptionPointerRegister(
    const Constant *PersonalityFn) const {
  // Platforms which do not use SjLj EH may return values in these registers
  // via the personality function.
  return Subtarget->useSjLjEH() ? Register() : ARM::R0;
}

Register ARMTargetLowering::getExceptionSelectorRegister(
    const Constant *PersonalityFn) const {
  // Platforms which do not use SjLj EH may return values in these registers
  // via the personality function.
  return Subtarget->useSjLjEH() ? Register() : ARM::R1;
}

void ARMTargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {
  // Update IsSplitCSR in ARMFunctionInfo.
  ARMFunctionInfo *AFI = Entry->getParent()->getInfo<ARMFunctionInfo>();
  AFI->setIsSplitCSR(true);
}

void ARMTargetLowering::insertCopiesSplitCSR(
    MachineBasicBlock *Entry,
    const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
  const ARMBaseRegisterInfo *TRI = Subtarget->getRegisterInfo();
  const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
  if (!IStart)
    return;

  const TargetInstrInfo *TII = Subtarget->getInstrInfo();
  MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
  MachineBasicBlock::iterator MBBI = Entry->begin();
  for (const MCPhysReg *I = IStart; *I; ++I) {
    const TargetRegisterClass *RC = nullptr;
    if (ARM::GPRRegClass.contains(*I))
      RC = &ARM::GPRRegClass;
    else if (ARM::DPRRegClass.contains(*I))
      RC = &ARM::DPRRegClass;
    else
      llvm_unreachable("Unexpected register class in CSRsViaCopy!");

    Register NewVR = MRI->createVirtualRegister(RC);
    // Create copy from CSR to a virtual register.
    // FIXME: this currently does not emit CFI pseudo-instructions, it works
    // fine for CXX_FAST_TLS since the C++-style TLS access functions should be
    // nounwind. If we want to generalize this later, we may need to emit
    // CFI pseudo-instructions.
    assert(Entry->getParent()->getFunction().hasFnAttribute(
               Attribute::NoUnwind) &&
           "Function should be nounwind in insertCopiesSplitCSR!");
    Entry->addLiveIn(*I);
    BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
        .addReg(*I);

    // Insert the copy-back instructions right before the terminator.
    for (auto *Exit : Exits)
      BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
              TII->get(TargetOpcode::COPY), *I)
          .addReg(NewVR);
  }
}

void ARMTargetLowering::finalizeLowering(MachineFunction &MF) const {
  MF.getFrameInfo().computeMaxCallFrameSize(MF);
  TargetLoweringBase::finalizeLowering(MF);
}