ARMISelLowering.cpp 838 KB

430914310143111431214313143141431514316143171431814319143201432114322143231432414325143261432714328143291433014331143321433314334143351433614337143381433914340143411434214343143441434514346143471434814349143501435114352143531435414355143561435714358143591436014361143621436314364143651436614367143681436914370143711437214373143741437514376143771437814379143801438114382143831438414385143861438714388143891439014391143921439314394143951439614397143981439914400144011440214403144041440514406144071440814409144101441114412144131441414415144161441714418144191442014421144221442314424144251442614427144281442914430144311443214433144341443514436144371443814439144401444114442144431444414445144461444714448144491445014451144521445314454144551445614457144581445914460144611446214463144641446514466144671446814469144701447114472144731447414475144761447714478144791448014481144821448314484144851448614487144881448914490144911449214493144941449514496144971449814499145001450114502145031450414505145061450714508145091451014511145121451314514145151451614517145181451914520145211452214523145241452514526145271452814529145301453114532145331453414535145361453714538145391454014541145421454314544145451454614547145481454914550145511455214553145541455514556145571455814559145601456114562145631456414565145661456714568145691457014571145721457314574145751457614577145781457914580145811458214583145841458514586145871458814589145901459114592145931459414595145961459714598145991460014601146021460314604146051460614607146081460914610146111461214613146141461514616146171461814619146201462114622146231462414625146261462714628146291463014631146321463314634146351463614637146381463914640146411464214643146441464514646146471464814649146501465114652146531465414655146561465714658146591466014661146621466314664146651466614667146681466914670146711467214673146741467514676146771467814679146801468114682146831468414685146861468714688146891469014691146921469314694146951469614697146981469914700147011470214703147041470514706147071470814709147101471114712147131471414715147161471714718147191472014721147221472314724147251472614727147281472914730147311473214733147341473514736147371473814739147401474114742147431474414745147461474714748147491475014751147521475314754147551475614757147581475914760147611476214763147641476514766147671476814769147701477114772147731477414775147761477714778147791478014781147821478314784147851478614787147881478914790147911479214793147941479514796147971479814799148001480114802148031480414805148061480714808148091481014811148121481314814148151481614817148181481914820148211482214823148241482514826148271482814829148301483114832148331483414835148361483714838148391484014841148421484314844148451484614847148481484914850148511485214853148541485514856148571485814859148601486114862148631486414865148661486714868148691487014871148721487314874148751487614877148781487914880148811488214883148841488514886148871488814889148901489114892148931489414895148961489714898148991490014901149021490314904149051490614907149081490914910149111491214913149141491514916149171491814919149201492114922149231492414925149261492714928149291493014931149321493314934149351493614937149381493914940149411494214943149441494514946149471494814949149501495114952149531495414955149561495714958149591496014961149621496314964149651496614967149681496914970149711497214973149741497514976149771497814979149801498114982149831498414985149861498714988149891499014991149921499314994149951499614997149981499915000150011500215003150041500515006150071500815009150101501115012150131501415015150161501715018150191
502015021150221502315024150251502615027150281502915030150311503215033150341503515036150371503815039150401504115042150431504415045150461504715048150491505015051150521505315054150551505615057150581505915060150611506215063150641506515066150671506815069150701507115072150731507415075150761507715078150791508015081150821508315084150851508615087150881508915090150911509215093150941509515096150971509815099151001510115102151031510415105151061510715108151091511015111151121511315114151151511615117151181511915120151211512215123151241512515126151271512815129151301513115132151331513415135151361513715138151391514015141151421514315144151451514615147151481514915150151511515215153151541515515156151571515815159151601516115162151631516415165151661516715168151691517015171151721517315174151751517615177151781517915180151811518215183151841518515186151871518815189151901519115192151931519415195151961519715198151991520015201152021520315204152051520615207152081520915210152111521215213152141521515216152171521815219152201522115222152231522415225152261522715228152291523015231152321523315234152351523615237152381523915240152411524215243152441524515246152471524815249152501525115252152531525415255152561525715258152591526015261152621526315264152651526615267152681526915270152711527215273152741527515276152771527815279152801528115282152831528415285152861528715288152891529015291152921529315294152951529615297152981529915300153011530215303153041530515306153071530815309153101531115312153131531415315153161531715318153191532015321153221532315324153251532615327153281532915330153311533215333153341533515336153371533815339153401534115342153431534415345153461534715348153491535015351153521535315354153551535615357153581535915360153611536215363153641536515366153671536815369153701537115372153731537415375153761537715378153791538015381153821538315384153851538615387153881538915390153911539215393153941539515396153971539815399154001540115402154031540415405154061540715408154091541015411154121541315414154151541615417154181541915420154211542215423154241542515426154271542815429154301543115432154331543415435154361543715438154391544015441154421544315444154451544615447154481544915450154511545215453154541545515456154571545815459154601546115462154631546415465154661546715468154691547015471154721547315474154751547615477154781547915480154811548215483154841548515486154871548815489154901549115492154931549415495154961549715498154991550015501155021550315504155051550615507155081550915510155111551215513155141551515516155171551815519155201552115522155231552415525155261552715528155291553015531155321553315534155351553615537155381553915540155411554215543155441554515546155471554815549155501555115552155531555415555155561555715558155591556015561155621556315564155651556615567155681556915570155711557215573155741557515576155771557815579155801558115582155831558415585155861558715588155891559015591155921559315594155951559615597155981559915600156011560215603156041560515606156071560815609156101561115612156131561415615156161561715618156191562015621156221562315624156251562615627156281562915630156311563215633156341563515636156371563815639156401564115642156431564415645156461564715648156491565015651156521565315654156551565615657156581565915660156611566215663156641566515666156671566815669156701567115672156731567415675156761567715678156791568015681156821568315684156851568615687156881568915690156911569215693156941569515696156971569815699157001570115702157031570415705157061570715708157091571015711157121571315714157151571615717157181571915720157211572215723157241572515726157271572815729157301
573115732157331573415735157361573715738157391574015741157421574315744157451574615747157481574915750157511575215753157541575515756157571575815759157601576115762157631576415765157661576715768157691577015771157721577315774157751577615777157781577915780157811578215783157841578515786157871578815789157901579115792157931579415795157961579715798157991580015801158021580315804158051580615807158081580915810158111581215813158141581515816158171581815819158201582115822158231582415825158261582715828158291583015831158321583315834158351583615837158381583915840158411584215843158441584515846158471584815849158501585115852158531585415855158561585715858158591586015861158621586315864158651586615867158681586915870158711587215873158741587515876158771587815879158801588115882158831588415885158861588715888158891589015891158921589315894158951589615897158981589915900159011590215903159041590515906159071590815909159101591115912159131591415915159161591715918159191592015921159221592315924159251592615927159281592915930159311593215933159341593515936159371593815939159401594115942159431594415945159461594715948159491595015951159521595315954159551595615957159581595915960159611596215963159641596515966159671596815969159701597115972159731597415975159761597715978159791598015981159821598315984159851598615987159881598915990159911599215993159941599515996159971599815999160001600116002160031600416005160061600716008160091601016011160121601316014160151601616017160181601916020160211602216023160241602516026160271602816029160301603116032160331603416035160361603716038160391604016041160421604316044160451604616047160481604916050160511605216053160541605516056160571605816059160601606116062160631606416065160661606716068160691607016071160721607316074160751607616077160781607916080160811608216083160841608516086160871608816089160901609116092160931609416095160961609716098160991610016101161021610316104161051610616107161081610916110161111611216113161141611516116161171611816119161201612116122161231612416125161261612716128161291613016131161321613316134161351613616137161381613916140161411614216143161441614516146161471614816149161501615116152161531615416155161561615716158161591616016161161621616316164161651616616167161681616916170161711617216173161741617516176161771617816179161801618116182161831618416185161861618716188161891619016191161921619316194161951619616197161981619916200162011620216203162041620516206162071620816209162101621116212162131621416215162161621716218162191622016221162221622316224162251622616227162281622916230162311623216233162341623516236162371623816239162401624116242162431624416245162461624716248162491625016251162521625316254162551625616257162581625916260162611626216263162641626516266162671626816269162701627116272162731627416275162761627716278162791628016281162821628316284162851628616287162881628916290162911629216293162941629516296162971629816299163001630116302163031630416305163061630716308163091631016311163121631316314163151631616317163181631916320163211632216323163241632516326163271632816329163301633116332163331633416335163361633716338163391634016341163421634316344163451634616347163481634916350163511635216353163541635516356163571635816359163601636116362163631636416365163661636716368163691637016371163721637316374163751637616377163781637916380163811638216383163841638516386163871638816389163901639116392163931639416395163961639716398163991640016401164021640316404164051640616407164081640916410164111641216413164141641516416164171641816419164201642116422164231642416425164261642716428164291643016431164321643316434164351643616437164381643916440164411
644216443164441644516446164471644816449164501645116452164531645416455164561645716458164591646016461164621646316464164651646616467164681646916470164711647216473164741647516476164771647816479164801648116482164831648416485164861648716488164891649016491164921649316494164951649616497164981649916500165011650216503165041650516506165071650816509165101651116512165131651416515165161651716518165191652016521165221652316524165251652616527165281652916530165311653216533165341653516536165371653816539165401654116542165431654416545165461654716548165491655016551165521655316554165551655616557165581655916560165611656216563165641656516566165671656816569165701657116572165731657416575165761657716578165791658016581165821658316584165851658616587165881658916590165911659216593165941659516596165971659816599166001660116602166031660416605166061660716608166091661016611166121661316614166151661616617166181661916620166211662216623166241662516626166271662816629166301663116632166331663416635166361663716638166391664016641166421664316644166451664616647166481664916650166511665216653166541665516656166571665816659166601666116662166631666416665166661666716668166691667016671166721667316674166751667616677166781667916680166811668216683166841668516686166871668816689166901669116692166931669416695166961669716698166991670016701167021670316704167051670616707167081670916710167111671216713167141671516716167171671816719167201672116722167231672416725167261672716728167291673016731167321673316734167351673616737167381673916740167411674216743167441674516746167471674816749167501675116752167531675416755167561675716758167591676016761167621676316764167651676616767167681676916770167711677216773167741677516776167771677816779167801678116782167831678416785167861678716788167891679016791167921679316794167951679616797167981679916800168011680216803168041680516806168071680816809168101681116812168131681416815168161681716818168191682016821168221682316824168251682616827168281682916830168311683216833168341683516836168371683816839168401684116842168431684416845168461684716848168491685016851168521685316854168551685616857168581685916860168611686216863168641686516866168671686816869168701687116872168731687416875168761687716878168791688016881168821688316884168851688616887168881688916890168911689216893168941689516896168971689816899169001690116902169031690416905169061690716908169091691016911169121691316914169151691616917169181691916920169211692216923169241692516926169271692816929169301693116932169331693416935169361693716938169391694016941169421694316944169451694616947169481694916950169511695216953169541695516956169571695816959169601696116962169631696416965169661696716968169691697016971169721697316974169751697616977169781697916980169811698216983169841698516986169871698816989169901699116992169931699416995169961699716998169991700017001170021700317004170051700617007170081700917010170111701217013170141701517016170171701817019170201702117022170231702417025170261702717028170291703017031170321703317034170351703617037170381703917040170411704217043170441704517046170471704817049170501705117052170531705417055170561705717058170591706017061170621706317064170651706617067170681706917070170711707217073170741707517076170771707817079170801708117082170831708417085170861708717088170891709017091170921709317094170951709617097170981709917100171011710217103171041710517106171071710817109171101711117112171131711417115171161711717118171191712017121171221712317124171251712617127171281712917130171311713217133171341713517136171371713817139171401714117142171431714417145171461714717148171491715017151171521
715317154171551715617157171581715917160171611716217163171641716517166171671716817169171701717117172171731717417175171761717717178171791718017181171821718317184171851718617187171881718917190171911719217193171941719517196171971719817199172001720117202172031720417205172061720717208172091721017211172121721317214172151721617217172181721917220172211722217223172241722517226172271722817229172301723117232172331723417235172361723717238172391724017241172421724317244172451724617247172481724917250172511725217253172541725517256172571725817259172601726117262172631726417265172661726717268172691727017271172721727317274172751727617277172781727917280172811728217283172841728517286172871728817289172901729117292172931729417295172961729717298172991730017301173021730317304173051730617307173081730917310173111731217313173141731517316173171731817319173201732117322173231732417325173261732717328173291733017331173321733317334173351733617337173381733917340173411734217343173441734517346173471734817349173501735117352173531735417355173561735717358173591736017361173621736317364173651736617367173681736917370173711737217373173741737517376173771737817379173801738117382173831738417385173861738717388173891739017391173921739317394173951739617397173981739917400174011740217403174041740517406174071740817409174101741117412174131741417415174161741717418174191742017421174221742317424174251742617427174281742917430174311743217433174341743517436174371743817439174401744117442174431744417445174461744717448174491745017451174521745317454174551745617457174581745917460174611746217463174641746517466174671746817469174701747117472174731747417475174761747717478174791748017481174821748317484174851748617487174881748917490174911749217493174941749517496174971749817499175001750117502175031750417505175061750717508175091751017511175121751317514175151751617517175181751917520175211752217523175241752517526175271752817529175301753117532175331753417535175361753717538175391754017541175421754317544175451754617547175481754917550175511755217553175541755517556175571755817559175601756117562175631756417565175661756717568175691757017571175721757317574175751757617577175781757917580175811758217583175841758517586175871758817589175901759117592175931759417595175961759717598175991760017601176021760317604176051760617607176081760917610176111761217613176141761517616176171761817619176201762117622176231762417625176261762717628176291763017631176321763317634176351763617637176381763917640176411764217643176441764517646176471764817649176501765117652176531765417655176561765717658176591766017661176621766317664176651766617667176681766917670176711767217673176741767517676176771767817679176801768117682176831768417685176861768717688176891769017691176921769317694176951769617697176981769917700177011770217703177041770517706177071770817709177101771117712177131771417715177161771717718177191772017721177221772317724177251772617727177281772917730177311773217733177341773517736177371773817739177401774117742177431774417745177461774717748177491775017751177521775317754177551775617757177581775917760177611776217763177641776517766177671776817769177701777117772177731777417775177761777717778177791778017781177821778317784177851778617787177881778917790177911779217793177941779517796177971779817799178001780117802178031780417805178061780717808178091781017811178121781317814178151781617817178181781917820178211782217823178241782517826178271782817829178301783117832178331783417835178361783717838178391784017841178421784317844178451784617847178481784917850178511785217853178541785517856178571785817859178601786117862178631
786417865178661786717868178691787017871178721787317874178751787617877178781787917880178811788217883178841788517886178871788817889178901789117892178931789417895178961789717898178991790017901179021790317904179051790617907179081790917910179111791217913179141791517916179171791817919179201792117922179231792417925179261792717928179291793017931179321793317934179351793617937179381793917940179411794217943179441794517946179471794817949179501795117952179531795417955179561795717958179591796017961179621796317964179651796617967179681796917970179711797217973179741797517976179771797817979179801798117982179831798417985179861798717988179891799017991179921799317994179951799617997179981799918000180011800218003180041800518006180071800818009180101801118012180131801418015180161801718018180191802018021180221802318024180251802618027180281802918030180311803218033180341803518036180371803818039180401804118042180431804418045180461804718048180491805018051180521805318054180551805618057180581805918060180611806218063180641806518066180671806818069180701807118072180731807418075180761807718078180791808018081180821808318084180851808618087180881808918090180911809218093180941809518096180971809818099181001810118102181031810418105181061810718108181091811018111181121811318114181151811618117181181811918120181211812218123181241812518126181271812818129181301813118132181331813418135181361813718138181391814018141181421814318144181451814618147181481814918150181511815218153181541815518156181571815818159181601816118162181631816418165181661816718168181691817018171181721817318174181751817618177181781817918180181811818218183181841818518186181871818818189181901819118192181931819418195181961819718198181991820018201182021820318204182051820618207182081820918210182111821218213182141821518216182171821818219182201822118222182231822418225182261822718228182291823018231182321823318234182351823618237182381823918240182411824218243182441824518246182471824818249182501825118252182531825418255182561825718258182591826018261182621826318264182651826618267182681826918270182711827218273182741827518276182771827818279182801828118282182831828418285182861828718288182891829018291182921829318294182951829618297182981829918300183011830218303183041830518306183071830818309183101831118312183131831418315183161831718318183191832018321183221832318324183251832618327183281832918330183311833218333183341833518336183371833818339183401834118342183431834418345183461834718348183491835018351183521835318354183551835618357183581835918360183611836218363183641836518366183671836818369183701837118372183731837418375183761837718378183791838018381183821838318384183851838618387183881838918390183911839218393183941839518396183971839818399184001840118402184031840418405184061840718408184091841018411184121841318414184151841618417184181841918420184211842218423184241842518426184271842818429184301843118432184331843418435184361843718438184391844018441184421844318444184451844618447184481844918450184511845218453184541845518456184571845818459184601846118462184631846418465184661846718468184691847018471184721847318474184751847618477184781847918480184811848218483184841848518486184871848818489184901849118492184931849418495184961849718498184991850018501185021850318504185051850618507185081850918510185111851218513185141851518516185171851818519185201852118522185231852418525185261852718528185291853018531185321853318534185351853618537185381853918540185411854218543185441854518546185471854818549185501855118552185531855418555185561855718558185591856018561185621856318564185651856618567185681856918570185711857218573185741
857518576185771857818579185801858118582185831858418585185861858718588185891859018591185921859318594185951859618597185981859918600186011860218603186041860518606186071860818609186101861118612186131861418615186161861718618186191862018621186221862318624186251862618627186281862918630186311863218633186341863518636186371863818639186401864118642186431864418645186461864718648186491865018651186521865318654186551865618657186581865918660186611866218663186641866518666186671866818669186701867118672186731867418675186761867718678186791868018681186821868318684186851868618687186881868918690186911869218693186941869518696186971869818699187001870118702187031870418705187061870718708187091871018711187121871318714187151871618717187181871918720187211872218723187241872518726187271872818729187301873118732187331873418735187361873718738187391874018741187421874318744187451874618747187481874918750187511875218753187541875518756187571875818759187601876118762187631876418765187661876718768187691877018771187721877318774187751877618777187781877918780187811878218783187841878518786187871878818789187901879118792187931879418795187961879718798187991880018801188021880318804188051880618807188081880918810188111881218813188141881518816188171881818819188201882118822188231882418825188261882718828188291883018831188321883318834188351883618837188381883918840188411884218843188441884518846188471884818849188501885118852188531885418855188561885718858188591886018861188621886318864188651886618867188681886918870188711887218873188741887518876188771887818879188801888118882188831888418885188861888718888188891889018891188921889318894188951889618897188981889918900189011890218903189041890518906189071890818909189101891118912189131891418915189161891718918189191892018921189221892318924189251892618927189281892918930189311893218933189341893518936189371893818939189401894118942189431894418945189461894718948189491895018951189521895318954189551895618957189581895918960189611896218963189641896518966189671896818969189701897118972189731897418975189761897718978189791898018981189821898318984189851898618987189881898918990189911899218993189941899518996189971899818999190001900119002190031900419005190061900719008190091901019011190121901319014190151901619017190181901919020190211902219023190241902519026190271902819029190301903119032190331903419035190361903719038190391904019041190421904319044190451904619047190481904919050190511905219053190541905519056190571905819059190601906119062190631906419065190661906719068190691907019071190721907319074190751907619077190781907919080190811908219083190841908519086190871908819089190901909119092190931909419095190961909719098190991910019101191021910319104191051910619107191081910919110191111911219113191141911519116191171911819119191201912119122191231912419125191261912719128191291913019131191321913319134191351913619137191381913919140191411914219143191441914519146191471914819149191501915119152191531915419155191561915719158191591916019161191621916319164191651916619167191681916919170191711917219173191741917519176191771917819179191801918119182191831918419185191861918719188191891919019191191921919319194191951919619197191981919919200192011920219203192041920519206192071920819209192101921119212192131921419215192161921719218192191922019221192221922319224192251922619227192281922919230192311923219233192341923519236192371923819239192401924119242192431924419245192461924719248192491925019251192521925319254192551925619257192581925919260192611926219263192641926519266192671926819269192701927119272192731927419275192761927719278192791928019281192821928319284192851
928619287192881928919290192911929219293192941929519296192971929819299193001930119302193031930419305193061930719308193091931019311193121931319314193151931619317193181931919320193211932219323193241932519326193271932819329193301933119332193331933419335193361933719338193391934019341193421934319344193451934619347193481934919350193511935219353193541935519356193571935819359193601936119362193631936419365193661936719368193691937019371193721937319374193751937619377193781937919380193811938219383193841938519386193871938819389193901939119392193931939419395193961939719398193991940019401194021940319404194051940619407194081940919410194111941219413194141941519416194171941819419194201942119422194231942419425194261942719428194291943019431194321943319434194351943619437194381943919440194411944219443194441944519446194471944819449194501945119452194531945419455194561945719458194591946019461194621946319464194651946619467194681946919470194711947219473194741947519476194771947819479194801948119482194831948419485194861948719488194891949019491194921949319494194951949619497194981949919500195011950219503195041950519506195071950819509195101951119512195131951419515195161951719518195191952019521195221952319524195251952619527195281952919530195311953219533195341953519536195371953819539195401954119542195431954419545195461954719548195491955019551195521955319554195551955619557195581955919560195611956219563195641956519566195671956819569195701957119572195731957419575195761957719578195791958019581195821958319584195851958619587195881958919590195911959219593195941959519596195971959819599196001960119602196031960419605196061960719608196091961019611196121961319614196151961619617196181961919620196211962219623196241962519626196271962819629196301963119632196331963419635196361963719638196391964019641196421964319644196451964619647196481964919650196511965219653196541965519656196571965819659196601966119662196631966419665196661966719668196691967019671196721967319674196751967619677196781967919680196811968219683196841968519686196871968819689196901969119692196931969419695196961969719698196991970019701197021970319704197051970619707197081970919710197111971219713197141971519716197171971819719197201972119722197231972419725197261972719728197291973019731197321973319734197351973619737197381973919740197411974219743197441974519746197471974819749197501975119752197531975419755197561975719758197591976019761197621976319764197651976619767197681976919770197711977219773197741977519776197771977819779197801978119782197831978419785197861978719788197891979019791197921979319794197951979619797197981979919800198011980219803198041980519806198071980819809198101981119812198131981419815198161981719818198191982019821198221982319824198251982619827198281982919830198311983219833198341983519836198371983819839198401984119842198431984419845198461984719848198491985019851198521985319854198551985619857198581985919860198611986219863198641986519866198671986819869198701987119872198731987419875198761987719878198791988019881198821988319884198851988619887198881988919890198911989219893198941989519896198971989819899199001990119902199031990419905199061990719908199091991019911199121991319914199151991619917199181991919920199211992219923199241992519926199271992819929199301993119932199331993419935199361993719938199391994019941199421994319944199451994619947199481994919950199511995219953199541995519956199571995819959199601996119962199631996419965199661996719968199691997019971199721997319974199751997619977199781997919980199811998219983199841998519986199871998819989199901999119992199931999419995199961
999719998199992000020001200022000320004200052000620007200082000920010200112001220013200142001520016200172001820019200202002120022200232002420025200262002720028200292003020031200322003320034200352003620037200382003920040200412004220043200442004520046200472004820049200502005120052200532005420055200562005720058200592006020061200622006320064200652006620067200682006920070200712007220073200742007520076200772007820079200802008120082200832008420085200862008720088200892009020091200922009320094200952009620097200982009920100201012010220103201042010520106201072010820109201102011120112201132011420115201162011720118201192012020121201222012320124201252012620127201282012920130201312013220133201342013520136201372013820139201402014120142201432014420145201462014720148201492015020151201522015320154201552015620157201582015920160201612016220163201642016520166201672016820169201702017120172201732017420175201762017720178201792018020181201822018320184201852018620187201882018920190201912019220193201942019520196201972019820199202002020120202202032020420205202062020720208202092021020211202122021320214202152021620217202182021920220202212022220223202242022520226202272022820229202302023120232202332023420235202362023720238202392024020241202422024320244202452024620247202482024920250202512025220253202542025520256202572025820259202602026120262202632026420265202662026720268202692027020271202722027320274202752027620277202782027920280202812028220283202842028520286202872028820289202902029120292202932029420295202962029720298202992030020301203022030320304203052030620307203082030920310203112031220313203142031520316203172031820319203202032120322203232032420325203262032720328203292033020331203322033320334203352033620337203382033920340203412034220343203442034520346203472034820349203502035120352203532035420355203562035720358203592036020361203622036320364203652036620367203682036920370203712037220373203742037520376203772037820379203802038120382203832038420385203862038720388203892039020391203922039320394203952039620397203982039920400204012040220403204042040520406204072040820409204102041120412204132041420415204162041720418204192042020421204222042320424204252042620427204282042920430204312043220433204342043520436204372043820439204402044120442204432044420445204462044720448204492045020451204522045320454204552045620457204582045920460204612046220463204642046520466204672046820469204702047120472204732047420475204762047720478204792048020481204822048320484204852048620487204882048920490204912049220493204942049520496204972049820499205002050120502205032050420505205062050720508205092051020511205122051320514205152051620517205182051920520205212052220523205242052520526205272052820529205302053120532205332053420535205362053720538205392054020541205422054320544205452054620547205482054920550205512055220553205542055520556205572055820559205602056120562205632056420565205662056720568205692057020571205722057320574205752057620577205782057920580205812058220583205842058520586205872058820589205902059120592205932059420595205962059720598205992060020601206022060320604206052060620607206082060920610206112061220613206142061520616206172061820619206202062120622206232062420625206262062720628206292063020631206322063320634206352063620637206382063920640206412064220643206442064520646206472064820649206502065120652206532065420655206562065720658206592066020661206622066320664206652066620667206682066920670206712067220673206742067520676206772067820679206802068120682206832068420685206862068720688206892069020691206922069320694206952069620697206982069920700207012070220703207042070520706207072
070820709207102071120712207132071420715207162071720718207192072020721207222072320724207252072620727207282072920730207312073220733207342073520736207372073820739207402074120742207432074420745207462074720748207492075020751207522075320754207552075620757207582075920760207612076220763207642076520766207672076820769207702077120772207732077420775207762077720778207792078020781207822078320784207852078620787207882078920790207912079220793207942079520796207972079820799208002080120802208032080420805208062080720808208092081020811208122081320814208152081620817208182081920820208212082220823208242082520826208272082820829208302083120832208332083420835208362083720838208392084020841208422084320844208452084620847208482084920850208512085220853208542085520856208572085820859208602086120862208632086420865208662086720868208692087020871208722087320874208752087620877208782087920880208812088220883208842088520886208872088820889208902089120892208932089420895208962089720898208992090020901209022090320904209052090620907209082090920910209112091220913209142091520916209172091820919209202092120922209232092420925209262092720928209292093020931209322093320934209352093620937209382093920940209412094220943209442094520946209472094820949209502095120952209532095420955209562095720958209592096020961209622096320964209652096620967209682096920970209712097220973209742097520976209772097820979209802098120982209832098420985209862098720988209892099020991209922099320994209952099620997209982099921000210012100221003210042100521006210072100821009210102101121012210132101421015210162101721018210192102021021210222102321024210252102621027210282102921030210312103221033210342103521036210372103821039210402104121042210432104421045210462104721048210492105021051210522105321054210552105621057210582105921060210612106221063210642106521066210672106821069210702107121072210732107421075210762107721078210792108021081210822108321084210852108621087210882108921090210912109221093210942109521096210972109821099211002110121102211032110421105211062110721108211092111021111211122111321114211152111621117211182111921120211212112221123211242112521126211272112821129211302113121132211332113421135211362113721138211392114021141211422114321144211452114621147211482114921150211512115221153211542115521156211572115821159211602116121162211632116421165211662116721168211692117021171211722117321174211752117621177211782117921180211812118221183211842118521186211872118821189211902119121192211932119421195211962119721198211992120021201212022120321204212052120621207212082120921210212112121221213212142121521216212172121821219212202122121222212232122421225212262122721228212292123021231212322123321234212352123621237212382123921240212412124221243212442124521246212472124821249212502125121252212532125421255212562125721258212592126021261212622126321264212652126621267212682126921270212712127221273212742127521276212772127821279212802128121282212832128421285212862128721288212892129021291212922129321294212952129621297212982129921300213012130221303213042130521306213072130821309213102131121312213132131421315213162131721318213192132021321213222132321324213252132621327213282132921330213312133221333213342133521336213372133821339213402134121342213432134421345213462134721348213492135021351213522135321354213552135621357213582135921360213612136221363213642136521366213672136821369213702137121372213732137421375213762137721378213792138021381213822138321384213852138621387213882138921390213912139221393213942139521396213972139821399214002140121402214032140421405214062140721408214092141021411214122141321414214152141621417214182
141921420214212142221423214242142521426214272142821429214302143121432214332143421435214362143721438214392144021441214422144321444214452144621447214482144921450214512145221453214542145521456214572145821459214602146121462214632146421465214662146721468214692147021471214722147321474214752147621477214782147921480214812148221483214842148521486214872148821489214902149121492214932149421495214962149721498214992150021501215022150321504215052150621507215082150921510215112151221513215142151521516215172151821519215202152121522215232152421525215262152721528215292153021531215322153321534215352153621537215382153921540215412154221543215442154521546215472154821549215502155121552215532155421555215562155721558215592156021561215622156321564215652156621567215682156921570215712157221573215742157521576215772157821579215802158121582215832158421585215862158721588215892159021591215922159321594215952159621597215982159921600216012160221603216042160521606216072160821609216102161121612216132161421615216162161721618216192162021621216222162321624216252162621627216282162921630216312163221633216342163521636216372163821639216402164121642216432164421645216462164721648216492165021651216522165321654216552165621657216582165921660216612166221663216642166521666216672166821669216702167121672216732167421675216762167721678216792168021681216822168321684216852168621687216882168921690216912169221693216942169521696216972169821699217002170121702217032170421705217062170721708217092171021711217122171321714217152171621717217182171921720217212172221723217242172521726217272172821729217302173121732217332173421735217362173721738217392174021741217422174321744217452174621747217482174921750217512175221753217542175521756217572175821759217602176121762217632176421765217662176721768217692177021771217722177321774217752177621777217782177921780217812178221783217842178521786217872178821789217902179121792217932179421795217962179721798217992180021801218022180321804218052180621807218082180921810218112181221813218142181521816218172181821819218202182121822218232182421825218262182721828218292183021831218322183321834218352183621837218382183921840218412184221843218442184521846218472184821849218502185121852218532185421855218562185721858218592186021861218622186321864218652186621867218682186921870218712187221873218742187521876218772187821879218802188121882218832188421885218862188721888218892189021891218922189321894218952189621897218982189921900219012190221903219042190521906219072190821909219102191121912219132191421915219162191721918219192192021921219222192321924219252192621927219282192921930219312193221933219342193521936219372193821939219402194121942219432194421945219462194721948219492195021951219522195321954219552195621957219582195921960219612196221963219642196521966219672196821969219702197121972219732197421975219762197721978219792198021981219822198321984219852198621987219882198921990219912199221993219942199521996
//===- ARMISelLowering.cpp - ARM DAG Lowering Implementation --------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file defines the interfaces that ARM uses to lower LLVM code into a
// selection DAG.
//
//===----------------------------------------------------------------------===//

#include "ARMISelLowering.h"
#include "ARMBaseInstrInfo.h"
#include "ARMBaseRegisterInfo.h"
#include "ARMCallingConv.h"
#include "ARMConstantPoolValue.h"
#include "ARMMachineFunctionInfo.h"
#include "ARMPerfectShuffle.h"
#include "ARMRegisterInfo.h"
#include "ARMSelectionDAGInfo.h"
#include "ARMSubtarget.h"
#include "ARMTargetTransformInfo.h"
#include "MCTargetDesc/ARMAddressingModes.h"
#include "MCTargetDesc/ARMBaseInfo.h"
#include "Utils/ARMBaseInfo.h"
#include "llvm/ADT/APFloat.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/BitVector.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/StringSwitch.h"
#include "llvm/ADT/Triple.h"
#include "llvm/ADT/Twine.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/CodeGen/CallingConvLower.h"
#include "llvm/CodeGen/ISDOpcodes.h"
#include "llvm/CodeGen/IntrinsicLowering.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineConstantPool.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineJumpTableInfo.h"
#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/RuntimeLibcalls.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/SelectionDAGAddressAnalysis.h"
#include "llvm/CodeGen/SelectionDAGNodes.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/CodeGen/TargetOpcodes.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GlobalAlias.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/GlobalVariable.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InlineAsm.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/IntrinsicsARM.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/User.h"
#include "llvm/IR/Value.h"
#include "llvm/MC/MCInstrDesc.h"
#include "llvm/MC/MCInstrItineraries.h"
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/MC/MCSchedule.h"
#include "llvm/Support/AtomicOrdering.h"
#include "llvm/Support/BranchProbability.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CodeGen.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Support/MachineValueType.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetOptions.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <cstdlib>
#include <iterator>
#include <limits>
#include <optional>
#include <string>
#include <tuple>
#include <utility>
#include <vector>

using namespace llvm;
using namespace llvm::PatternMatch;

#define DEBUG_TYPE "arm-isel"

STATISTIC(NumTailCalls, "Number of tail calls");
STATISTIC(NumMovwMovt, "Number of GAs materialized with movw + movt");
STATISTIC(NumLoopByVals, "Number of loops generated for byval arguments");
STATISTIC(NumConstpoolPromoted,
          "Number of constants with their storage promoted into constant pools");

static cl::opt<bool>
ARMInterworking("arm-interworking", cl::Hidden,
                cl::desc("Enable / disable ARM interworking (for debugging only)"),
                cl::init(true));

static cl::opt<bool> EnableConstpoolPromotion(
    "arm-promote-constant", cl::Hidden,
    cl::desc("Enable / disable promotion of unnamed_addr constants into "
             "constant pools"),
    cl::init(false)); // FIXME: set to true by default once PR32780 is fixed

static cl::opt<unsigned> ConstpoolPromotionMaxSize(
    "arm-promote-constant-max-size", cl::Hidden,
    cl::desc("Maximum size of constant to promote into a constant pool"),
    cl::init(64));

static cl::opt<unsigned> ConstpoolPromotionMaxTotal(
    "arm-promote-constant-max-total", cl::Hidden,
    cl::desc("Maximum size of ALL constants to promote into a constant pool"),
    cl::init(128));

cl::opt<unsigned>
MVEMaxSupportedInterleaveFactor("mve-max-interleave-factor", cl::Hidden,
                                cl::desc("Maximum interleave factor for MVE VLDn to generate."),
                                cl::init(2));

// The APCS parameter registers.
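// Under the (A)APCS, r0-r3 carry the first four word-sized arguments; anything
// that does not fit in them is passed on the stack.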
static const MCPhysReg GPRArgRegs[] = {
  ARM::R0, ARM::R1, ARM::R2, ARM::R3
};
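
// Configure the operation actions for a NEON vector type VT.  Loads and stores
// of VT are promoted to PromotedLdStVT when the two types differ, and
// operations NEON has no instructions for (division, remainder, ...) are
// marked Expand so the legalizer breaks them up or emits library calls.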
void ARMTargetLowering::addTypeForNEON(MVT VT, MVT PromotedLdStVT) {
  if (VT != PromotedLdStVT) {
    setOperationAction(ISD::LOAD, VT, Promote);
    AddPromotedToType (ISD::LOAD, VT, PromotedLdStVT);

    setOperationAction(ISD::STORE, VT, Promote);
    AddPromotedToType (ISD::STORE, VT, PromotedLdStVT);
  }

  MVT ElemTy = VT.getVectorElementType();
  if (ElemTy != MVT::f64)
    setOperationAction(ISD::SETCC, VT, Custom);
  setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
  setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
  if (ElemTy == MVT::i32) {
    setOperationAction(ISD::SINT_TO_FP, VT, Custom);
    setOperationAction(ISD::UINT_TO_FP, VT, Custom);
    setOperationAction(ISD::FP_TO_SINT, VT, Custom);
    setOperationAction(ISD::FP_TO_UINT, VT, Custom);
  } else {
    setOperationAction(ISD::SINT_TO_FP, VT, Expand);
    setOperationAction(ISD::UINT_TO_FP, VT, Expand);
    setOperationAction(ISD::FP_TO_SINT, VT, Expand);
    setOperationAction(ISD::FP_TO_UINT, VT, Expand);
  }
  setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
  setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
  setOperationAction(ISD::CONCAT_VECTORS, VT, Legal);
  setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal);
  setOperationAction(ISD::SELECT, VT, Expand);
  setOperationAction(ISD::SELECT_CC, VT, Expand);
  setOperationAction(ISD::VSELECT, VT, Expand);
  setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Expand);
  if (VT.isInteger()) {
    setOperationAction(ISD::SHL, VT, Custom);
    setOperationAction(ISD::SRA, VT, Custom);
    setOperationAction(ISD::SRL, VT, Custom);
  }

  // Neon does not support vector divide/remainder operations.
  setOperationAction(ISD::SDIV, VT, Expand);
  setOperationAction(ISD::UDIV, VT, Expand);
  setOperationAction(ISD::FDIV, VT, Expand);
  setOperationAction(ISD::SREM, VT, Expand);
  setOperationAction(ISD::UREM, VT, Expand);
  setOperationAction(ISD::FREM, VT, Expand);
  setOperationAction(ISD::SDIVREM, VT, Expand);
  setOperationAction(ISD::UDIVREM, VT, Expand);

  if (!VT.isFloatingPoint() &&
      VT != MVT::v2i64 && VT != MVT::v1i64)
    for (auto Opcode : {ISD::ABS, ISD::SMIN, ISD::SMAX, ISD::UMIN, ISD::UMAX})
      setOperationAction(Opcode, VT, Legal);
  if (!VT.isFloatingPoint())
    for (auto Opcode : {ISD::SADDSAT, ISD::UADDSAT, ISD::SSUBSAT, ISD::USUBSAT})
      setOperationAction(Opcode, VT, Legal);
}
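
// NEON D registers hold 64-bit vectors and Q registers (pairs of D registers)
// hold 128-bit vectors; the helpers below put a vector type in the matching
// register class and then apply the common NEON actions set up above.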
void ARMTargetLowering::addDRTypeForNEON(MVT VT) {
  addRegisterClass(VT, &ARM::DPRRegClass);
  addTypeForNEON(VT, MVT::f64);
}

void ARMTargetLowering::addQRTypeForNEON(MVT VT) {
  addRegisterClass(VT, &ARM::DPairRegClass);
  addTypeForNEON(VT, MVT::v2f64);
}

void ARMTargetLowering::setAllExpand(MVT VT) {
  for (unsigned Opc = 0; Opc < ISD::BUILTIN_OP_END; ++Opc)
    setOperationAction(Opc, VT, Expand);

  // We support these really simple operations even on types where all
  // the actual arithmetic has to be broken down into simpler
  // operations or turned into library calls.
  setOperationAction(ISD::BITCAST, VT, Legal);
  setOperationAction(ISD::LOAD, VT, Legal);
  setOperationAction(ISD::STORE, VT, Legal);
  setOperationAction(ISD::UNDEF, VT, Legal);
}
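
// Register the same legalization action for the any-, zero- and sign-extending
// loads that produce a From-typed value from a To-typed memory access.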
void ARMTargetLowering::addAllExtLoads(const MVT From, const MVT To,
                                       LegalizeAction Action) {
  setLoadExtAction(ISD::EXTLOAD, From, To, Action);
  setLoadExtAction(ISD::ZEXTLOAD, From, To, Action);
  setLoadExtAction(ISD::SEXTLOAD, From, To, Action);
}
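
// Register the 128-bit MVE vector types in the MQPR register class and set up
// their operation actions.  The integer types get the full set of MVE integer
// operations; what is done with the floating-point types additionally depends
// on whether the MVE.fp extension is present (HasMVEFP).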
void ARMTargetLowering::addMVEVectorTypes(bool HasMVEFP) {
  const MVT IntTypes[] = { MVT::v16i8, MVT::v8i16, MVT::v4i32 };

  for (auto VT : IntTypes) {
    addRegisterClass(VT, &ARM::MQPRRegClass);
    setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
    setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
    setOperationAction(ISD::SHL, VT, Custom);
    setOperationAction(ISD::SRA, VT, Custom);
    setOperationAction(ISD::SRL, VT, Custom);
    setOperationAction(ISD::SMIN, VT, Legal);
    setOperationAction(ISD::SMAX, VT, Legal);
    setOperationAction(ISD::UMIN, VT, Legal);
    setOperationAction(ISD::UMAX, VT, Legal);
    setOperationAction(ISD::ABS, VT, Legal);
    setOperationAction(ISD::SETCC, VT, Custom);
    setOperationAction(ISD::MLOAD, VT, Custom);
    setOperationAction(ISD::MSTORE, VT, Legal);
    setOperationAction(ISD::CTLZ, VT, Legal);
    setOperationAction(ISD::CTTZ, VT, Custom);
    setOperationAction(ISD::BITREVERSE, VT, Legal);
    setOperationAction(ISD::BSWAP, VT, Legal);
    setOperationAction(ISD::SADDSAT, VT, Legal);
    setOperationAction(ISD::UADDSAT, VT, Legal);
    setOperationAction(ISD::SSUBSAT, VT, Legal);
    setOperationAction(ISD::USUBSAT, VT, Legal);
    setOperationAction(ISD::ABDS, VT, Legal);
    setOperationAction(ISD::ABDU, VT, Legal);
    setOperationAction(ISD::AVGFLOORS, VT, Legal);
    setOperationAction(ISD::AVGFLOORU, VT, Legal);
    setOperationAction(ISD::AVGCEILS, VT, Legal);
    setOperationAction(ISD::AVGCEILU, VT, Legal);

    // No native support for these.
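    // (MVE has no vector integer divide or remainder, so Expand scalarizes
    // them; the resulting scalar divisions may in turn become
    // __aeabi_idiv-style libcalls on cores without hardware divide.)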
    setOperationAction(ISD::UDIV, VT, Expand);
    setOperationAction(ISD::SDIV, VT, Expand);
    setOperationAction(ISD::UREM, VT, Expand);
    setOperationAction(ISD::SREM, VT, Expand);
    setOperationAction(ISD::UDIVREM, VT, Expand);
    setOperationAction(ISD::SDIVREM, VT, Expand);
    setOperationAction(ISD::CTPOP, VT, Expand);
    setOperationAction(ISD::SELECT, VT, Expand);
    setOperationAction(ISD::SELECT_CC, VT, Expand);

    // Vector reductions
    setOperationAction(ISD::VECREDUCE_ADD, VT, Legal);
    setOperationAction(ISD::VECREDUCE_SMAX, VT, Legal);
    setOperationAction(ISD::VECREDUCE_UMAX, VT, Legal);
    setOperationAction(ISD::VECREDUCE_SMIN, VT, Legal);
    setOperationAction(ISD::VECREDUCE_UMIN, VT, Legal);
    setOperationAction(ISD::VECREDUCE_MUL, VT, Custom);
    setOperationAction(ISD::VECREDUCE_AND, VT, Custom);
    setOperationAction(ISD::VECREDUCE_OR, VT, Custom);
    setOperationAction(ISD::VECREDUCE_XOR, VT, Custom);

    if (!HasMVEFP) {
      setOperationAction(ISD::SINT_TO_FP, VT, Expand);
      setOperationAction(ISD::UINT_TO_FP, VT, Expand);
      setOperationAction(ISD::FP_TO_SINT, VT, Expand);
      setOperationAction(ISD::FP_TO_UINT, VT, Expand);
    } else {
      setOperationAction(ISD::FP_TO_SINT_SAT, VT, Custom);
      setOperationAction(ISD::FP_TO_UINT_SAT, VT, Custom);
    }

    // Pre and Post inc are supported on loads and stores
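    // (the contiguous MVE VLDR/VSTR forms provide pre/post-indexed addressing
    // with base-register writeback, so every indexed mode is marked Legal).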
  291. for (unsigned im = (unsigned)ISD::PRE_INC;
  292. im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) {
  293. setIndexedLoadAction(im, VT, Legal);
  294. setIndexedStoreAction(im, VT, Legal);
  295. setIndexedMaskedLoadAction(im, VT, Legal);
  296. setIndexedMaskedStoreAction(im, VT, Legal);
  297. }
  298. }
  const MVT FloatTypes[] = { MVT::v8f16, MVT::v4f32 };
  for (auto VT : FloatTypes) {
    addRegisterClass(VT, &ARM::MQPRRegClass);
    if (!HasMVEFP)
      setAllExpand(VT);

    // These are legal or custom whether we have MVE.fp or not
    setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, VT.getVectorElementType(),
                       Custom);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
    setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
    setOperationAction(ISD::BUILD_VECTOR, VT.getVectorElementType(), Custom);
    setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Legal);
    setOperationAction(ISD::SETCC, VT, Custom);
    setOperationAction(ISD::MLOAD, VT, Custom);
    setOperationAction(ISD::MSTORE, VT, Legal);
    setOperationAction(ISD::SELECT, VT, Expand);
    setOperationAction(ISD::SELECT_CC, VT, Expand);

    // Pre and Post inc are supported on loads and stores
    for (unsigned im = (unsigned)ISD::PRE_INC;
         im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) {
      setIndexedLoadAction(im, VT, Legal);
      setIndexedStoreAction(im, VT, Legal);
      setIndexedMaskedLoadAction(im, VT, Legal);
      setIndexedMaskedStoreAction(im, VT, Legal);
    }

    if (HasMVEFP) {
      setOperationAction(ISD::FMINNUM, VT, Legal);
      setOperationAction(ISD::FMAXNUM, VT, Legal);
      setOperationAction(ISD::FROUND, VT, Legal);
      setOperationAction(ISD::VECREDUCE_FADD, VT, Custom);
      setOperationAction(ISD::VECREDUCE_FMUL, VT, Custom);
      setOperationAction(ISD::VECREDUCE_FMIN, VT, Custom);
      setOperationAction(ISD::VECREDUCE_FMAX, VT, Custom);

      // No native support for these.
      setOperationAction(ISD::FDIV, VT, Expand);
      setOperationAction(ISD::FREM, VT, Expand);
      setOperationAction(ISD::FSQRT, VT, Expand);
      setOperationAction(ISD::FSIN, VT, Expand);
      setOperationAction(ISD::FCOS, VT, Expand);
      setOperationAction(ISD::FPOW, VT, Expand);
      setOperationAction(ISD::FLOG, VT, Expand);
      setOperationAction(ISD::FLOG2, VT, Expand);
      setOperationAction(ISD::FLOG10, VT, Expand);
      setOperationAction(ISD::FEXP, VT, Expand);
      setOperationAction(ISD::FEXP2, VT, Expand);
      setOperationAction(ISD::FNEARBYINT, VT, Expand);
    }
  }
  // Custom-expand vector reductions that are smaller than legal so that no
  // spurious zero elements are included in the reduction.
  setOperationAction(ISD::VECREDUCE_FADD, MVT::v4f16, Custom);
  setOperationAction(ISD::VECREDUCE_FMUL, MVT::v4f16, Custom);
  setOperationAction(ISD::VECREDUCE_FMIN, MVT::v4f16, Custom);
  setOperationAction(ISD::VECREDUCE_FMAX, MVT::v4f16, Custom);
  setOperationAction(ISD::VECREDUCE_FADD, MVT::v2f16, Custom);
  setOperationAction(ISD::VECREDUCE_FMUL, MVT::v2f16, Custom);
  setOperationAction(ISD::VECREDUCE_FMIN, MVT::v2f16, Custom);
  setOperationAction(ISD::VECREDUCE_FMAX, MVT::v2f16, Custom);

  // We 'support' these types up to bitcast/load/store level, regardless of
  // MVE integer-only / float support. Only FP data processing on the FP
  // vector types is inhibited at the integer-only level.
  const MVT LongTypes[] = { MVT::v2i64, MVT::v2f64 };
  for (auto VT : LongTypes) {
    addRegisterClass(VT, &ARM::MQPRRegClass);
    setAllExpand(VT);
    setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
    setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
    setOperationAction(ISD::VSELECT, VT, Legal);
    setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
  }
  setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v2f64, Legal);

  // We can do bitwise operations on v2i64 vectors
  setOperationAction(ISD::AND, MVT::v2i64, Legal);
  setOperationAction(ISD::OR, MVT::v2i64, Legal);
  setOperationAction(ISD::XOR, MVT::v2i64, Legal);

  // It is legal to extload from v8i8 to v8i16 and from v4i8/v4i16 to v4i32.
  addAllExtLoads(MVT::v8i16, MVT::v8i8, Legal);
  addAllExtLoads(MVT::v4i32, MVT::v4i16, Legal);
  addAllExtLoads(MVT::v4i32, MVT::v4i8, Legal);

  // It is legal to sign extend from v4i8/v4i16 to v4i32 or v8i8 to v8i16.
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i8, Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i16, Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i32, Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v8i8, Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v8i16, Legal);

  // Some truncating stores are legal too.
  setTruncStoreAction(MVT::v4i32, MVT::v4i16, Legal);
  setTruncStoreAction(MVT::v4i32, MVT::v4i8, Legal);
  setTruncStoreAction(MVT::v8i16, MVT::v8i8, Legal);

  // Pre and Post inc on these are legal, given the correct extends
  for (unsigned im = (unsigned)ISD::PRE_INC;
       im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) {
    for (auto VT : {MVT::v8i8, MVT::v4i8, MVT::v4i16}) {
      setIndexedLoadAction(im, VT, Legal);
      setIndexedStoreAction(im, VT, Legal);
      setIndexedMaskedLoadAction(im, VT, Legal);
      setIndexedMaskedStoreAction(im, VT, Legal);
    }
  }
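  // Rough illustration of why the extending loads above can be Legal: MVE
  // widening loads such as
  //   vldrb.u16 q0, [r0]   ; load 8 bytes, zero-extend each to 16 bits
  //   vldrb.s32 q0, [r0]   ; load 4 bytes, sign-extend each to 32 bits
  // perform the extension as part of the memory access, so no separate
  // extend node needs to be materialised.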
  // Predicate types
  const MVT pTypes[] = {MVT::v16i1, MVT::v8i1, MVT::v4i1, MVT::v2i1};
  for (auto VT : pTypes) {
    addRegisterClass(VT, &ARM::VCCRRegClass);
    setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
    setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
    setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
    setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
    setOperationAction(ISD::SETCC, VT, Custom);
    setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Expand);
    setOperationAction(ISD::LOAD, VT, Custom);
    setOperationAction(ISD::STORE, VT, Custom);
    setOperationAction(ISD::TRUNCATE, VT, Custom);
    setOperationAction(ISD::VSELECT, VT, Expand);
    setOperationAction(ISD::SELECT, VT, Expand);
    setOperationAction(ISD::SELECT_CC, VT, Expand);

    if (!HasMVEFP) {
      setOperationAction(ISD::SINT_TO_FP, VT, Expand);
      setOperationAction(ISD::UINT_TO_FP, VT, Expand);
      setOperationAction(ISD::FP_TO_SINT, VT, Expand);
      setOperationAction(ISD::FP_TO_UINT, VT, Expand);
    }
  }
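  // Background sketch: the VCCR register class above models the MVE predicate
  // register (VPR.P0), whose 16 bits each gate one byte lane of a 128-bit
  // vector. A v4i1 element is therefore presumably represented by four
  // identical predicate bits, which is why most operations on these i1 vector
  // types take the Custom paths above rather than the generic expansions.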
  setOperationAction(ISD::SETCC, MVT::v2i1, Expand);
  setOperationAction(ISD::TRUNCATE, MVT::v2i1, Expand);
  setOperationAction(ISD::AND, MVT::v2i1, Expand);
  setOperationAction(ISD::OR, MVT::v2i1, Expand);
  setOperationAction(ISD::XOR, MVT::v2i1, Expand);
  setOperationAction(ISD::SINT_TO_FP, MVT::v2i1, Expand);
  setOperationAction(ISD::UINT_TO_FP, MVT::v2i1, Expand);
  setOperationAction(ISD::FP_TO_SINT, MVT::v2i1, Expand);
  setOperationAction(ISD::FP_TO_UINT, MVT::v2i1, Expand);

  setOperationAction(ISD::SIGN_EXTEND, MVT::v8i32, Custom);
  setOperationAction(ISD::SIGN_EXTEND, MVT::v16i16, Custom);
  setOperationAction(ISD::SIGN_EXTEND, MVT::v16i32, Custom);
  setOperationAction(ISD::ZERO_EXTEND, MVT::v8i32, Custom);
  setOperationAction(ISD::ZERO_EXTEND, MVT::v16i16, Custom);
  setOperationAction(ISD::ZERO_EXTEND, MVT::v16i32, Custom);
  setOperationAction(ISD::TRUNCATE, MVT::v8i32, Custom);
  setOperationAction(ISD::TRUNCATE, MVT::v16i16, Custom);
}
ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
                                     const ARMSubtarget &STI)
    : TargetLowering(TM), Subtarget(&STI) {
  RegInfo = Subtarget->getRegisterInfo();
  Itins = Subtarget->getInstrItineraryData();

  setBooleanContents(ZeroOrOneBooleanContent);
  setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);

  if (!Subtarget->isTargetDarwin() && !Subtarget->isTargetIOS() &&
      !Subtarget->isTargetWatchOS() && !Subtarget->isTargetDriverKit()) {
    bool IsHFTarget = TM.Options.FloatABIType == FloatABI::Hard;
    for (int LCID = 0; LCID < RTLIB::UNKNOWN_LIBCALL; ++LCID)
      setLibcallCallingConv(static_cast<RTLIB::Libcall>(LCID),
                            IsHFTarget ? CallingConv::ARM_AAPCS_VFP
                                       : CallingConv::ARM_AAPCS);
  }
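  // For context (a simplification): ARM_AAPCS_VFP is the "hard-float" variant
  // of the AAPCS, in which floating-point arguments and results are passed in
  // VFP registers (s0-s15 / d0-d7), whereas plain ARM_AAPCS passes them in
  // core registers r0-r3. Library calls must match whichever convention the
  // target was built for, hence the blanket override above.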
  if (Subtarget->isTargetMachO()) {
    // Uses VFP for Thumb libfuncs if available.
    if (Subtarget->isThumb() && Subtarget->hasVFP2Base() &&
        Subtarget->hasARMOps() && !Subtarget->useSoftFloat()) {
      static const struct {
        const RTLIB::Libcall Op;
        const char * const Name;
        const ISD::CondCode Cond;
      } LibraryCalls[] = {
          // Single-precision floating-point arithmetic.
          { RTLIB::ADD_F32, "__addsf3vfp", ISD::SETCC_INVALID },
          { RTLIB::SUB_F32, "__subsf3vfp", ISD::SETCC_INVALID },
          { RTLIB::MUL_F32, "__mulsf3vfp", ISD::SETCC_INVALID },
          { RTLIB::DIV_F32, "__divsf3vfp", ISD::SETCC_INVALID },

          // Double-precision floating-point arithmetic.
          { RTLIB::ADD_F64, "__adddf3vfp", ISD::SETCC_INVALID },
          { RTLIB::SUB_F64, "__subdf3vfp", ISD::SETCC_INVALID },
          { RTLIB::MUL_F64, "__muldf3vfp", ISD::SETCC_INVALID },
          { RTLIB::DIV_F64, "__divdf3vfp", ISD::SETCC_INVALID },

          // Single-precision comparisons.
          { RTLIB::OEQ_F32, "__eqsf2vfp", ISD::SETNE },
          { RTLIB::UNE_F32, "__nesf2vfp", ISD::SETNE },
          { RTLIB::OLT_F32, "__ltsf2vfp", ISD::SETNE },
          { RTLIB::OLE_F32, "__lesf2vfp", ISD::SETNE },
          { RTLIB::OGE_F32, "__gesf2vfp", ISD::SETNE },
          { RTLIB::OGT_F32, "__gtsf2vfp", ISD::SETNE },
          { RTLIB::UO_F32, "__unordsf2vfp", ISD::SETNE },

          // Double-precision comparisons.
          { RTLIB::OEQ_F64, "__eqdf2vfp", ISD::SETNE },
          { RTLIB::UNE_F64, "__nedf2vfp", ISD::SETNE },
          { RTLIB::OLT_F64, "__ltdf2vfp", ISD::SETNE },
          { RTLIB::OLE_F64, "__ledf2vfp", ISD::SETNE },
          { RTLIB::OGE_F64, "__gedf2vfp", ISD::SETNE },
          { RTLIB::OGT_F64, "__gtdf2vfp", ISD::SETNE },
          { RTLIB::UO_F64, "__unorddf2vfp", ISD::SETNE },

          // Floating-point to integer conversions.
          // i64 conversions are done via library routines even when
          // generating VFP instructions, so use the same ones.
          { RTLIB::FPTOSINT_F64_I32, "__fixdfsivfp", ISD::SETCC_INVALID },
          { RTLIB::FPTOUINT_F64_I32, "__fixunsdfsivfp", ISD::SETCC_INVALID },
          { RTLIB::FPTOSINT_F32_I32, "__fixsfsivfp", ISD::SETCC_INVALID },
          { RTLIB::FPTOUINT_F32_I32, "__fixunssfsivfp", ISD::SETCC_INVALID },

          // Conversions between floating types.
          { RTLIB::FPROUND_F64_F32, "__truncdfsf2vfp", ISD::SETCC_INVALID },
          { RTLIB::FPEXT_F32_F64, "__extendsfdf2vfp", ISD::SETCC_INVALID },

          // Integer to floating-point conversions.
          // i64 conversions are done via library routines even when
          // generating VFP instructions, so use the same ones.
          // FIXME: There appears to be some naming inconsistency in ARM
          // libgcc: e.g., __floatunsidf vs. __floatunssidfvfp.
          { RTLIB::SINTTOFP_I32_F64, "__floatsidfvfp", ISD::SETCC_INVALID },
          { RTLIB::UINTTOFP_I32_F64, "__floatunssidfvfp", ISD::SETCC_INVALID },
          { RTLIB::SINTTOFP_I32_F32, "__floatsisfvfp", ISD::SETCC_INVALID },
          { RTLIB::UINTTOFP_I32_F32, "__floatunssisfvfp", ISD::SETCC_INVALID },
      };

      for (const auto &LC : LibraryCalls) {
        setLibcallName(LC.Op, LC.Name);
        if (LC.Cond != ISD::SETCC_INVALID)
          setCmpLibcallCC(LC.Op, LC.Cond);
      }
    }
  }
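  // How the comparison entries above are used (sketch): the *vfp comparison
  // helpers return a nonzero value when the tested relation holds, so the
  // lowered code looks roughly like
  //   bl   __eqsf2vfp        ; r0 = (a == b) ? nonzero : 0
  //   cmp  r0, #0
  //   bne  <equal>           ; SETNE against zero recovers the predicate
  // setCmpLibcallCC records the condition code (here SETNE) that must be
  // applied to the call's result to reproduce the original comparison.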
  // These libcalls are not available in 32-bit.
  setLibcallName(RTLIB::SHL_I128, nullptr);
  setLibcallName(RTLIB::SRL_I128, nullptr);
  setLibcallName(RTLIB::SRA_I128, nullptr);
  setLibcallName(RTLIB::MUL_I128, nullptr);
  setLibcallName(RTLIB::MULO_I64, nullptr);
  setLibcallName(RTLIB::MULO_I128, nullptr);

  // RTLIB
  if (Subtarget->isAAPCS_ABI() &&
      (Subtarget->isTargetAEABI() || Subtarget->isTargetGNUAEABI() ||
       Subtarget->isTargetMuslAEABI() || Subtarget->isTargetAndroid())) {
    static const struct {
      const RTLIB::Libcall Op;
      const char * const Name;
      const CallingConv::ID CC;
      const ISD::CondCode Cond;
    } LibraryCalls[] = {
        // Double-precision floating-point arithmetic helper functions
        // RTABI chapter 4.1.2, Table 2
        { RTLIB::ADD_F64, "__aeabi_dadd", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
        { RTLIB::DIV_F64, "__aeabi_ddiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
        { RTLIB::MUL_F64, "__aeabi_dmul", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
        { RTLIB::SUB_F64, "__aeabi_dsub", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },

        // Double-precision floating-point comparison helper functions
        // RTABI chapter 4.1.2, Table 3
        { RTLIB::OEQ_F64, "__aeabi_dcmpeq", CallingConv::ARM_AAPCS, ISD::SETNE },
        { RTLIB::UNE_F64, "__aeabi_dcmpeq", CallingConv::ARM_AAPCS, ISD::SETEQ },
        { RTLIB::OLT_F64, "__aeabi_dcmplt", CallingConv::ARM_AAPCS, ISD::SETNE },
        { RTLIB::OLE_F64, "__aeabi_dcmple", CallingConv::ARM_AAPCS, ISD::SETNE },
        { RTLIB::OGE_F64, "__aeabi_dcmpge", CallingConv::ARM_AAPCS, ISD::SETNE },
        { RTLIB::OGT_F64, "__aeabi_dcmpgt", CallingConv::ARM_AAPCS, ISD::SETNE },
        { RTLIB::UO_F64, "__aeabi_dcmpun", CallingConv::ARM_AAPCS, ISD::SETNE },

        // Single-precision floating-point arithmetic helper functions
        // RTABI chapter 4.1.2, Table 4
        { RTLIB::ADD_F32, "__aeabi_fadd", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
        { RTLIB::DIV_F32, "__aeabi_fdiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
        { RTLIB::MUL_F32, "__aeabi_fmul", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
        { RTLIB::SUB_F32, "__aeabi_fsub", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },

        // Single-precision floating-point comparison helper functions
        // RTABI chapter 4.1.2, Table 5
        { RTLIB::OEQ_F32, "__aeabi_fcmpeq", CallingConv::ARM_AAPCS, ISD::SETNE },
        { RTLIB::UNE_F32, "__aeabi_fcmpeq", CallingConv::ARM_AAPCS, ISD::SETEQ },
        { RTLIB::OLT_F32, "__aeabi_fcmplt", CallingConv::ARM_AAPCS, ISD::SETNE },
        { RTLIB::OLE_F32, "__aeabi_fcmple", CallingConv::ARM_AAPCS, ISD::SETNE },
        { RTLIB::OGE_F32, "__aeabi_fcmpge", CallingConv::ARM_AAPCS, ISD::SETNE },
        { RTLIB::OGT_F32, "__aeabi_fcmpgt", CallingConv::ARM_AAPCS, ISD::SETNE },
        { RTLIB::UO_F32, "__aeabi_fcmpun", CallingConv::ARM_AAPCS, ISD::SETNE },

        // Floating-point to integer conversions.
        // RTABI chapter 4.1.2, Table 6
        { RTLIB::FPTOSINT_F64_I32, "__aeabi_d2iz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
        { RTLIB::FPTOUINT_F64_I32, "__aeabi_d2uiz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
        { RTLIB::FPTOSINT_F64_I64, "__aeabi_d2lz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
        { RTLIB::FPTOUINT_F64_I64, "__aeabi_d2ulz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
        { RTLIB::FPTOSINT_F32_I32, "__aeabi_f2iz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
        { RTLIB::FPTOUINT_F32_I32, "__aeabi_f2uiz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
        { RTLIB::FPTOSINT_F32_I64, "__aeabi_f2lz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
        { RTLIB::FPTOUINT_F32_I64, "__aeabi_f2ulz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },

        // Conversions between floating types.
        // RTABI chapter 4.1.2, Table 7
        { RTLIB::FPROUND_F64_F32, "__aeabi_d2f", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
        { RTLIB::FPROUND_F64_F16, "__aeabi_d2h", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
        { RTLIB::FPEXT_F32_F64, "__aeabi_f2d", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },

        // Integer to floating-point conversions.
        // RTABI chapter 4.1.2, Table 8
        { RTLIB::SINTTOFP_I32_F64, "__aeabi_i2d", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
        { RTLIB::UINTTOFP_I32_F64, "__aeabi_ui2d", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
        { RTLIB::SINTTOFP_I64_F64, "__aeabi_l2d", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
        { RTLIB::UINTTOFP_I64_F64, "__aeabi_ul2d", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
        { RTLIB::SINTTOFP_I32_F32, "__aeabi_i2f", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
        { RTLIB::UINTTOFP_I32_F32, "__aeabi_ui2f", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
        { RTLIB::SINTTOFP_I64_F32, "__aeabi_l2f", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
        { RTLIB::UINTTOFP_I64_F32, "__aeabi_ul2f", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },

        // Long long helper functions
        // RTABI chapter 4.2, Table 9
        { RTLIB::MUL_I64, "__aeabi_lmul", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
        { RTLIB::SHL_I64, "__aeabi_llsl", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
        { RTLIB::SRL_I64, "__aeabi_llsr", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
        { RTLIB::SRA_I64, "__aeabi_lasr", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },

        // Integer division functions
        // RTABI chapter 4.3.1
        { RTLIB::SDIV_I8, "__aeabi_idiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
        { RTLIB::SDIV_I16, "__aeabi_idiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
        { RTLIB::SDIV_I32, "__aeabi_idiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
        { RTLIB::SDIV_I64, "__aeabi_ldivmod", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
        { RTLIB::UDIV_I8, "__aeabi_uidiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
        { RTLIB::UDIV_I16, "__aeabi_uidiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
        { RTLIB::UDIV_I32, "__aeabi_uidiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
        { RTLIB::UDIV_I64, "__aeabi_uldivmod", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
    };

    for (const auto &LC : LibraryCalls) {
      setLibcallName(LC.Op, LC.Name);
      setLibcallCallingConv(LC.Op, LC.CC);
      if (LC.Cond != ISD::SETCC_INVALID)
        setCmpLibcallCC(LC.Op, LC.Cond);
    }

    // EABI dependent RTLIB
    if (TM.Options.EABIVersion == EABI::EABI4 ||
        TM.Options.EABIVersion == EABI::EABI5) {
      static const struct {
        const RTLIB::Libcall Op;
        const char *const Name;
        const CallingConv::ID CC;
        const ISD::CondCode Cond;
      } MemOpsLibraryCalls[] = {
          // Memory operations
          // RTABI chapter 4.3.4
          { RTLIB::MEMCPY, "__aeabi_memcpy", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
          { RTLIB::MEMMOVE, "__aeabi_memmove", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
          { RTLIB::MEMSET, "__aeabi_memset", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      };

      for (const auto &LC : MemOpsLibraryCalls) {
        setLibcallName(LC.Op, LC.Name);
        setLibcallCallingConv(LC.Op, LC.CC);
        if (LC.Cond != ISD::SETCC_INVALID)
          setCmpLibcallCC(LC.Op, LC.Cond);
      }
    }
  }
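  // A point worth noting about the RTABI comparison table above: the AEABI
  // helpers only provide an "equal" entry point, so both the ordered-equal
  // (OEQ) and unordered-not-equal (UNE) libcalls map onto __aeabi_dcmpeq /
  // __aeabi_fcmpeq. They are distinguished purely by the condition code the
  // result is tested with (SETNE for OEQ, SETEQ for UNE), roughly
  //   bl   __aeabi_fcmpeq    ; r0 != 0 iff a == b (ordered)
  //   cmp  r0, #0
  //   bne  <a == b>          ; OEQ; using 'beq' here would test UNE instead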
  if (Subtarget->isTargetWindows()) {
    static const struct {
      const RTLIB::Libcall Op;
      const char * const Name;
      const CallingConv::ID CC;
    } LibraryCalls[] = {
        { RTLIB::FPTOSINT_F32_I64, "__stoi64", CallingConv::ARM_AAPCS_VFP },
        { RTLIB::FPTOSINT_F64_I64, "__dtoi64", CallingConv::ARM_AAPCS_VFP },
        { RTLIB::FPTOUINT_F32_I64, "__stou64", CallingConv::ARM_AAPCS_VFP },
        { RTLIB::FPTOUINT_F64_I64, "__dtou64", CallingConv::ARM_AAPCS_VFP },
        { RTLIB::SINTTOFP_I64_F32, "__i64tos", CallingConv::ARM_AAPCS_VFP },
        { RTLIB::SINTTOFP_I64_F64, "__i64tod", CallingConv::ARM_AAPCS_VFP },
        { RTLIB::UINTTOFP_I64_F32, "__u64tos", CallingConv::ARM_AAPCS_VFP },
        { RTLIB::UINTTOFP_I64_F64, "__u64tod", CallingConv::ARM_AAPCS_VFP },
    };

    for (const auto &LC : LibraryCalls) {
      setLibcallName(LC.Op, LC.Name);
      setLibcallCallingConv(LC.Op, LC.CC);
    }
  }

  // Use divmod compiler-rt calls for iOS 5.0 and later.
  if (Subtarget->isTargetMachO() &&
      !(Subtarget->isTargetIOS() &&
        Subtarget->getTargetTriple().isOSVersionLT(5, 0))) {
    setLibcallName(RTLIB::SDIVREM_I32, "__divmodsi4");
    setLibcallName(RTLIB::UDIVREM_I32, "__udivmodsi4");
  }

  // The half <-> float conversion functions are always soft-float on
  // non-watchos platforms, but are needed for some targets which use a
  // hard-float calling convention by default.
  if (!Subtarget->isTargetWatchABI()) {
    if (Subtarget->isAAPCS_ABI()) {
      setLibcallCallingConv(RTLIB::FPROUND_F32_F16, CallingConv::ARM_AAPCS);
      setLibcallCallingConv(RTLIB::FPROUND_F64_F16, CallingConv::ARM_AAPCS);
      setLibcallCallingConv(RTLIB::FPEXT_F16_F32, CallingConv::ARM_AAPCS);
    } else {
      setLibcallCallingConv(RTLIB::FPROUND_F32_F16, CallingConv::ARM_APCS);
      setLibcallCallingConv(RTLIB::FPROUND_F64_F16, CallingConv::ARM_APCS);
      setLibcallCallingConv(RTLIB::FPEXT_F16_F32, CallingConv::ARM_APCS);
    }
  }

  // In EABI, these functions have an __aeabi_ prefix, but in GNUEABI they
  // have a __gnu_ prefix (which is the default).
  if (Subtarget->isTargetAEABI()) {
    static const struct {
      const RTLIB::Libcall Op;
      const char * const Name;
      const CallingConv::ID CC;
    } LibraryCalls[] = {
        { RTLIB::FPROUND_F32_F16, "__aeabi_f2h", CallingConv::ARM_AAPCS },
        { RTLIB::FPROUND_F64_F16, "__aeabi_d2h", CallingConv::ARM_AAPCS },
        { RTLIB::FPEXT_F16_F32, "__aeabi_h2f", CallingConv::ARM_AAPCS },
    };

    for (const auto &LC : LibraryCalls) {
      setLibcallName(LC.Op, LC.Name);
      setLibcallCallingConv(LC.Op, LC.CC);
    }
  }

  if (Subtarget->isThumb1Only())
    addRegisterClass(MVT::i32, &ARM::tGPRRegClass);
  else
    addRegisterClass(MVT::i32, &ARM::GPRRegClass);

  if (!Subtarget->useSoftFloat() && !Subtarget->isThumb1Only() &&
      Subtarget->hasFPRegs()) {
    addRegisterClass(MVT::f32, &ARM::SPRRegClass);
    addRegisterClass(MVT::f64, &ARM::DPRRegClass);

    setOperationAction(ISD::FP_TO_SINT_SAT, MVT::i32, Custom);
    setOperationAction(ISD::FP_TO_UINT_SAT, MVT::i32, Custom);
    setOperationAction(ISD::FP_TO_SINT_SAT, MVT::i64, Custom);
    setOperationAction(ISD::FP_TO_UINT_SAT, MVT::i64, Custom);

    if (!Subtarget->hasVFP2Base())
      setAllExpand(MVT::f32);
    if (!Subtarget->hasFP64())
      setAllExpand(MVT::f64);
  }

  if (Subtarget->hasFullFP16()) {
    addRegisterClass(MVT::f16, &ARM::HPRRegClass);
    setOperationAction(ISD::BITCAST, MVT::i16, Custom);
    setOperationAction(ISD::BITCAST, MVT::f16, Custom);

    setOperationAction(ISD::FMINNUM, MVT::f16, Legal);
    setOperationAction(ISD::FMAXNUM, MVT::f16, Legal);
  }

  if (Subtarget->hasBF16()) {
    addRegisterClass(MVT::bf16, &ARM::HPRRegClass);
    setAllExpand(MVT::bf16);
    if (!Subtarget->hasFullFP16())
      setOperationAction(ISD::BITCAST, MVT::bf16, Custom);
  }
  for (MVT VT : MVT::fixedlen_vector_valuetypes()) {
    for (MVT InnerVT : MVT::fixedlen_vector_valuetypes()) {
      setTruncStoreAction(VT, InnerVT, Expand);
      addAllExtLoads(VT, InnerVT, Expand);
    }

    setOperationAction(ISD::SMUL_LOHI, VT, Expand);
    setOperationAction(ISD::UMUL_LOHI, VT, Expand);

    setOperationAction(ISD::BSWAP, VT, Expand);
  }

  setOperationAction(ISD::ConstantFP, MVT::f32, Custom);
  setOperationAction(ISD::ConstantFP, MVT::f64, Custom);

  setOperationAction(ISD::READ_REGISTER, MVT::i64, Custom);
  setOperationAction(ISD::WRITE_REGISTER, MVT::i64, Custom);

  if (Subtarget->hasMVEIntegerOps())
    addMVEVectorTypes(Subtarget->hasMVEFloatOps());

  // Combine low-overhead loop intrinsics so that we can lower i1 types.
  if (Subtarget->hasLOB()) {
    setTargetDAGCombine({ISD::BRCOND, ISD::BR_CC});
  }

  if (Subtarget->hasNEON()) {
    addDRTypeForNEON(MVT::v2f32);
    addDRTypeForNEON(MVT::v8i8);
    addDRTypeForNEON(MVT::v4i16);
    addDRTypeForNEON(MVT::v2i32);
    addDRTypeForNEON(MVT::v1i64);

    addQRTypeForNEON(MVT::v4f32);
    addQRTypeForNEON(MVT::v2f64);
    addQRTypeForNEON(MVT::v16i8);
    addQRTypeForNEON(MVT::v8i16);
    addQRTypeForNEON(MVT::v4i32);
    addQRTypeForNEON(MVT::v2i64);

    if (Subtarget->hasFullFP16()) {
      addQRTypeForNEON(MVT::v8f16);
      addDRTypeForNEON(MVT::v4f16);
    }

    if (Subtarget->hasBF16()) {
      addQRTypeForNEON(MVT::v8bf16);
      addDRTypeForNEON(MVT::v4bf16);
    }
  }

  if (Subtarget->hasMVEIntegerOps() || Subtarget->hasNEON()) {
    // v2f64 is legal so that QR subregs can be extracted as f64 elements, but
    // none of Neon, MVE or VFP supports any arithmetic operations on it.
    setOperationAction(ISD::FADD, MVT::v2f64, Expand);
    setOperationAction(ISD::FSUB, MVT::v2f64, Expand);
    setOperationAction(ISD::FMUL, MVT::v2f64, Expand);
    // FIXME: Code duplication: FDIV and FREM are expanded always, see
    // ARMTargetLowering::addTypeForNEON method for details.
    setOperationAction(ISD::FDIV, MVT::v2f64, Expand);
    setOperationAction(ISD::FREM, MVT::v2f64, Expand);
    // FIXME: Create unittest.
    // In other words, find a way to exercise the case where "copysign"
    // appears in the DAG with vector operands.
    setOperationAction(ISD::FCOPYSIGN, MVT::v2f64, Expand);
    // FIXME: Code duplication: SETCC has custom operation action, see
    // ARMTargetLowering::addTypeForNEON method for details.
    setOperationAction(ISD::SETCC, MVT::v2f64, Expand);
    // FIXME: Create unittest for FNEG and for FABS.
    setOperationAction(ISD::FNEG, MVT::v2f64, Expand);
    setOperationAction(ISD::FABS, MVT::v2f64, Expand);
    setOperationAction(ISD::FSQRT, MVT::v2f64, Expand);
    setOperationAction(ISD::FSIN, MVT::v2f64, Expand);
    setOperationAction(ISD::FCOS, MVT::v2f64, Expand);
    setOperationAction(ISD::FPOW, MVT::v2f64, Expand);
    setOperationAction(ISD::FLOG, MVT::v2f64, Expand);
    setOperationAction(ISD::FLOG2, MVT::v2f64, Expand);
    setOperationAction(ISD::FLOG10, MVT::v2f64, Expand);
    setOperationAction(ISD::FEXP, MVT::v2f64, Expand);
    setOperationAction(ISD::FEXP2, MVT::v2f64, Expand);
    // FIXME: Create unittest for FCEIL, FTRUNC, FRINT, FNEARBYINT, FFLOOR.
    setOperationAction(ISD::FCEIL, MVT::v2f64, Expand);
    setOperationAction(ISD::FTRUNC, MVT::v2f64, Expand);
    setOperationAction(ISD::FRINT, MVT::v2f64, Expand);
    setOperationAction(ISD::FNEARBYINT, MVT::v2f64, Expand);
    setOperationAction(ISD::FFLOOR, MVT::v2f64, Expand);
    setOperationAction(ISD::FMA, MVT::v2f64, Expand);
  }
  if (Subtarget->hasNEON()) {
    // The same applies to v4f32, but keep in mind that vadd, vsub and vmul
    // are natively supported for v4f32.
    setOperationAction(ISD::FSQRT, MVT::v4f32, Expand);
    setOperationAction(ISD::FSIN, MVT::v4f32, Expand);
    setOperationAction(ISD::FCOS, MVT::v4f32, Expand);
    setOperationAction(ISD::FPOW, MVT::v4f32, Expand);
    setOperationAction(ISD::FLOG, MVT::v4f32, Expand);
    setOperationAction(ISD::FLOG2, MVT::v4f32, Expand);
    setOperationAction(ISD::FLOG10, MVT::v4f32, Expand);
    setOperationAction(ISD::FEXP, MVT::v4f32, Expand);
    setOperationAction(ISD::FEXP2, MVT::v4f32, Expand);
    setOperationAction(ISD::FCEIL, MVT::v4f32, Expand);
    setOperationAction(ISD::FTRUNC, MVT::v4f32, Expand);
    setOperationAction(ISD::FRINT, MVT::v4f32, Expand);
    setOperationAction(ISD::FNEARBYINT, MVT::v4f32, Expand);
    setOperationAction(ISD::FFLOOR, MVT::v4f32, Expand);

    // Mark v2f32 intrinsics.
    setOperationAction(ISD::FSQRT, MVT::v2f32, Expand);
    setOperationAction(ISD::FSIN, MVT::v2f32, Expand);
    setOperationAction(ISD::FCOS, MVT::v2f32, Expand);
    setOperationAction(ISD::FPOW, MVT::v2f32, Expand);
    setOperationAction(ISD::FLOG, MVT::v2f32, Expand);
    setOperationAction(ISD::FLOG2, MVT::v2f32, Expand);
    setOperationAction(ISD::FLOG10, MVT::v2f32, Expand);
    setOperationAction(ISD::FEXP, MVT::v2f32, Expand);
    setOperationAction(ISD::FEXP2, MVT::v2f32, Expand);
    setOperationAction(ISD::FCEIL, MVT::v2f32, Expand);
    setOperationAction(ISD::FTRUNC, MVT::v2f32, Expand);
    setOperationAction(ISD::FRINT, MVT::v2f32, Expand);
    setOperationAction(ISD::FNEARBYINT, MVT::v2f32, Expand);
    setOperationAction(ISD::FFLOOR, MVT::v2f32, Expand);

    // Neon does not support some operations on v1i64 and v2i64 types.
    setOperationAction(ISD::MUL, MVT::v1i64, Expand);
    // Custom handling for some quad-vector types to detect VMULL.
    setOperationAction(ISD::MUL, MVT::v8i16, Custom);
    setOperationAction(ISD::MUL, MVT::v4i32, Custom);
    setOperationAction(ISD::MUL, MVT::v2i64, Custom);
    // Custom handling for some vector types to avoid expensive expansions
    setOperationAction(ISD::SDIV, MVT::v4i16, Custom);
    setOperationAction(ISD::SDIV, MVT::v8i8, Custom);
    setOperationAction(ISD::UDIV, MVT::v4i16, Custom);
    setOperationAction(ISD::UDIV, MVT::v8i8, Custom);
    // Neon does not have single-instruction SINT_TO_FP and UINT_TO_FP with a
    // destination type that is wider than the source, nor does it have a
    // FP_TO_[SU]INT instruction with a narrower destination than source.
    setOperationAction(ISD::SINT_TO_FP, MVT::v4i16, Custom);
    setOperationAction(ISD::SINT_TO_FP, MVT::v8i16, Custom);
    setOperationAction(ISD::UINT_TO_FP, MVT::v4i16, Custom);
    setOperationAction(ISD::UINT_TO_FP, MVT::v8i16, Custom);
    setOperationAction(ISD::FP_TO_UINT, MVT::v4i16, Custom);
    setOperationAction(ISD::FP_TO_UINT, MVT::v8i16, Custom);
    setOperationAction(ISD::FP_TO_SINT, MVT::v4i16, Custom);
    setOperationAction(ISD::FP_TO_SINT, MVT::v8i16, Custom);

    setOperationAction(ISD::FP_ROUND, MVT::v2f32, Expand);
    setOperationAction(ISD::FP_EXTEND, MVT::v2f64, Expand);

    // NEON does not have a single-instruction CTPOP for vectors with element
    // types wider than 8 bits. However, custom lowering can leverage the
    // v8i8/v16i8 vcnt instruction.
    setOperationAction(ISD::CTPOP, MVT::v2i32, Custom);
    setOperationAction(ISD::CTPOP, MVT::v4i32, Custom);
    setOperationAction(ISD::CTPOP, MVT::v4i16, Custom);
    setOperationAction(ISD::CTPOP, MVT::v8i16, Custom);
    setOperationAction(ISD::CTPOP, MVT::v1i64, Custom);
    setOperationAction(ISD::CTPOP, MVT::v2i64, Custom);

    setOperationAction(ISD::CTLZ, MVT::v1i64, Expand);
    setOperationAction(ISD::CTLZ, MVT::v2i64, Expand);

    // NEON does not have single instruction CTTZ for vectors.
    setOperationAction(ISD::CTTZ, MVT::v8i8, Custom);
    setOperationAction(ISD::CTTZ, MVT::v4i16, Custom);
    setOperationAction(ISD::CTTZ, MVT::v2i32, Custom);
    setOperationAction(ISD::CTTZ, MVT::v1i64, Custom);
    setOperationAction(ISD::CTTZ, MVT::v16i8, Custom);
    setOperationAction(ISD::CTTZ, MVT::v8i16, Custom);
    setOperationAction(ISD::CTTZ, MVT::v4i32, Custom);
    setOperationAction(ISD::CTTZ, MVT::v2i64, Custom);

    setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v8i8, Custom);
    setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v4i16, Custom);
    setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v2i32, Custom);
    setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v1i64, Custom);
    setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v16i8, Custom);
    setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v8i16, Custom);
    setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v4i32, Custom);
    setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v2i64, Custom);

    for (MVT VT : MVT::fixedlen_vector_valuetypes()) {
      setOperationAction(ISD::MULHS, VT, Expand);
      setOperationAction(ISD::MULHU, VT, Expand);
    }

    // NEON only has FMA instructions as of VFP4.
    if (!Subtarget->hasVFP4Base()) {
      setOperationAction(ISD::FMA, MVT::v2f32, Expand);
      setOperationAction(ISD::FMA, MVT::v4f32, Expand);
    }

    setTargetDAGCombine({ISD::SHL, ISD::SRL, ISD::SRA, ISD::FP_TO_SINT,
                         ISD::FP_TO_UINT, ISD::FDIV, ISD::LOAD});

    // It is legal to extload from v4i8 to v4i16 or v4i32.
    for (MVT Ty : {MVT::v8i8, MVT::v4i8, MVT::v2i8, MVT::v4i16, MVT::v2i16,
                   MVT::v2i32}) {
      for (MVT VT : MVT::integer_fixedlen_vector_valuetypes()) {
        setLoadExtAction(ISD::EXTLOAD, VT, Ty, Legal);
        setLoadExtAction(ISD::ZEXTLOAD, VT, Ty, Legal);
        setLoadExtAction(ISD::SEXTLOAD, VT, Ty, Legal);
      }
    }
  }
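  // Sketch of the VMULL detection mentioned above (a simplification): a
  // widening multiply written in IR as
  //   %a = sext <4 x i16> %x to <4 x i32>
  //   %b = sext <4 x i16> %y to <4 x i32>
  //   %m = mul <4 x i32> %a, %b
  // can be matched during Custom lowering of the v4i32 MUL to a single
  //   vmull.s16 q0, d0, d1
  // instead of extending both operands and doing a full-width multiply.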
  if (Subtarget->hasNEON() || Subtarget->hasMVEIntegerOps()) {
    setTargetDAGCombine(
        {ISD::BUILD_VECTOR, ISD::VECTOR_SHUFFLE, ISD::INSERT_SUBVECTOR,
         ISD::INSERT_VECTOR_ELT, ISD::EXTRACT_VECTOR_ELT,
         ISD::SIGN_EXTEND_INREG, ISD::STORE, ISD::SIGN_EXTEND, ISD::ZERO_EXTEND,
         ISD::ANY_EXTEND, ISD::INTRINSIC_WO_CHAIN, ISD::INTRINSIC_W_CHAIN,
         ISD::INTRINSIC_VOID, ISD::VECREDUCE_ADD, ISD::ADD, ISD::BITCAST});
  }
  if (Subtarget->hasMVEIntegerOps()) {
    setTargetDAGCombine({ISD::SMIN, ISD::UMIN, ISD::SMAX, ISD::UMAX,
                         ISD::FP_EXTEND, ISD::SELECT, ISD::SELECT_CC,
                         ISD::SETCC});
  }
  if (Subtarget->hasMVEFloatOps()) {
    setTargetDAGCombine(ISD::FADD);
  }

  if (!Subtarget->hasFP64()) {
    // When targeting a floating-point unit with only single-precision
    // operations, f64 is legal for the few double-precision instructions
    // which are present. However, no double-precision operations other than
    // moves, loads and stores are provided by the hardware.
    setOperationAction(ISD::FADD, MVT::f64, Expand);
    setOperationAction(ISD::FSUB, MVT::f64, Expand);
    setOperationAction(ISD::FMUL, MVT::f64, Expand);
    setOperationAction(ISD::FMA, MVT::f64, Expand);
    setOperationAction(ISD::FDIV, MVT::f64, Expand);
    setOperationAction(ISD::FREM, MVT::f64, Expand);
    setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
    setOperationAction(ISD::FGETSIGN, MVT::f64, Expand);
    setOperationAction(ISD::FNEG, MVT::f64, Expand);
    setOperationAction(ISD::FABS, MVT::f64, Expand);
    setOperationAction(ISD::FSQRT, MVT::f64, Expand);
    setOperationAction(ISD::FSIN, MVT::f64, Expand);
    setOperationAction(ISD::FCOS, MVT::f64, Expand);
    setOperationAction(ISD::FPOW, MVT::f64, Expand);
    setOperationAction(ISD::FLOG, MVT::f64, Expand);
    setOperationAction(ISD::FLOG2, MVT::f64, Expand);
    setOperationAction(ISD::FLOG10, MVT::f64, Expand);
    setOperationAction(ISD::FEXP, MVT::f64, Expand);
    setOperationAction(ISD::FEXP2, MVT::f64, Expand);
    setOperationAction(ISD::FCEIL, MVT::f64, Expand);
    setOperationAction(ISD::FTRUNC, MVT::f64, Expand);
    setOperationAction(ISD::FRINT, MVT::f64, Expand);
    setOperationAction(ISD::FNEARBYINT, MVT::f64, Expand);
    setOperationAction(ISD::FFLOOR, MVT::f64, Expand);
    setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
    setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom);
    setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
    setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom);
    setOperationAction(ISD::FP_TO_SINT, MVT::f64, Custom);
    setOperationAction(ISD::FP_TO_UINT, MVT::f64, Custom);
    setOperationAction(ISD::FP_ROUND, MVT::f32, Custom);
    setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i32, Custom);
    setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i32, Custom);
    setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::f64, Custom);
    setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::f64, Custom);
    setOperationAction(ISD::STRICT_FP_ROUND, MVT::f32, Custom);
  }

  if (!Subtarget->hasFP64() || !Subtarget->hasFPARMv8Base()) {
    setOperationAction(ISD::FP_EXTEND, MVT::f64, Custom);
    setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f64, Custom);
    if (Subtarget->hasFullFP16()) {
      setOperationAction(ISD::FP_ROUND, MVT::f16, Custom);
      setOperationAction(ISD::STRICT_FP_ROUND, MVT::f16, Custom);
    }
  }

  if (!Subtarget->hasFP16()) {
    setOperationAction(ISD::FP_EXTEND, MVT::f32, Custom);
    setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f32, Custom);
  }

  computeRegisterProperties(Subtarget->getRegisterInfo());

  // ARM does not have floating-point extending loads.
  for (MVT VT : MVT::fp_valuetypes()) {
    setLoadExtAction(ISD::EXTLOAD, VT, MVT::f32, Expand);
    setLoadExtAction(ISD::EXTLOAD, VT, MVT::f16, Expand);
  }

  // ... or truncating stores
  setTruncStoreAction(MVT::f64, MVT::f32, Expand);
  setTruncStoreAction(MVT::f32, MVT::f16, Expand);
  setTruncStoreAction(MVT::f64, MVT::f16, Expand);

  // ARM does not have i1 sign extending load.
  for (MVT VT : MVT::integer_valuetypes())
    setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);

  // ARM supports all 4 flavors of integer indexed load / store.
  if (!Subtarget->isThumb1Only()) {
    for (unsigned im = (unsigned)ISD::PRE_INC;
         im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) {
      setIndexedLoadAction(im, MVT::i1, Legal);
      setIndexedLoadAction(im, MVT::i8, Legal);
      setIndexedLoadAction(im, MVT::i16, Legal);
      setIndexedLoadAction(im, MVT::i32, Legal);
      setIndexedStoreAction(im, MVT::i1, Legal);
      setIndexedStoreAction(im, MVT::i8, Legal);
      setIndexedStoreAction(im, MVT::i16, Legal);
      setIndexedStoreAction(im, MVT::i32, Legal);
    }
  } else {
    // Thumb-1 has limited post-inc load/store support - LDM r0!, {r1}.
    setIndexedLoadAction(ISD::POST_INC, MVT::i32, Legal);
    setIndexedStoreAction(ISD::POST_INC, MVT::i32, Legal);
  }
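  // For reference (simplified): the "indexed" load/store modes marked Legal
  // above correspond to ARM addressing forms with writeback, e.g.
  //   ldr r1, [r0, #4]!   ; pre-indexed:  r0 += 4, then load from [r0]
  //   ldr r1, [r0], #4    ; post-indexed: load from [r0], then r0 += 4
  // which lets pointer-bumping loops fold the increment into the access.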
  setOperationAction(ISD::SADDO, MVT::i32, Custom);
  setOperationAction(ISD::UADDO, MVT::i32, Custom);
  setOperationAction(ISD::SSUBO, MVT::i32, Custom);
  setOperationAction(ISD::USUBO, MVT::i32, Custom);

  setOperationAction(ISD::ADDCARRY, MVT::i32, Custom);
  setOperationAction(ISD::SUBCARRY, MVT::i32, Custom);

  if (Subtarget->hasDSP()) {
    setOperationAction(ISD::SADDSAT, MVT::i8, Custom);
    setOperationAction(ISD::SSUBSAT, MVT::i8, Custom);
    setOperationAction(ISD::SADDSAT, MVT::i16, Custom);
    setOperationAction(ISD::SSUBSAT, MVT::i16, Custom);
    setOperationAction(ISD::UADDSAT, MVT::i8, Custom);
    setOperationAction(ISD::USUBSAT, MVT::i8, Custom);
    setOperationAction(ISD::UADDSAT, MVT::i16, Custom);
    setOperationAction(ISD::USUBSAT, MVT::i16, Custom);
  }
  if (Subtarget->hasBaseDSP()) {
    setOperationAction(ISD::SADDSAT, MVT::i32, Legal);
    setOperationAction(ISD::SSUBSAT, MVT::i32, Legal);
  }
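  // Illustrative note: with the DSP extension a 32-bit saturating add such as
  // llvm.sadd.sat.i32 can presumably be selected directly to
  //   qadd r0, r0, r1
  // which is why the i32 forms are simply Legal, while the narrower i8/i16
  // forms above go through Custom lowering to map onto the corresponding
  // saturating DSP instructions at the right width.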
  // i64 operation support.
  setOperationAction(ISD::MUL, MVT::i64, Expand);
  setOperationAction(ISD::MULHU, MVT::i32, Expand);
  if (Subtarget->isThumb1Only()) {
    setOperationAction(ISD::UMUL_LOHI, MVT::i32, Expand);
    setOperationAction(ISD::SMUL_LOHI, MVT::i32, Expand);
  }
  if (Subtarget->isThumb1Only() || !Subtarget->hasV6Ops() ||
      (Subtarget->isThumb2() && !Subtarget->hasDSP()))
    setOperationAction(ISD::MULHS, MVT::i32, Expand);

  setOperationAction(ISD::SHL_PARTS, MVT::i32, Custom);
  setOperationAction(ISD::SRA_PARTS, MVT::i32, Custom);
  setOperationAction(ISD::SRL_PARTS, MVT::i32, Custom);
  setOperationAction(ISD::SRL, MVT::i64, Custom);
  setOperationAction(ISD::SRA, MVT::i64, Custom);
  setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i64, Custom);
  setOperationAction(ISD::LOAD, MVT::i64, Custom);
  setOperationAction(ISD::STORE, MVT::i64, Custom);

  // MVE lowers 64-bit shifts to lsll and lsrl; this assumes that ISD::SRL
  // and ISD::SRA of i64 are already marked Custom above.
  if (Subtarget->hasMVEIntegerOps())
    setOperationAction(ISD::SHL, MVT::i64, Custom);

  // Expand to __aeabi_l{lsl,lsr,asr} calls for Thumb1.
  if (Subtarget->isThumb1Only()) {
    setOperationAction(ISD::SHL_PARTS, MVT::i32, Expand);
    setOperationAction(ISD::SRA_PARTS, MVT::i32, Expand);
    setOperationAction(ISD::SRL_PARTS, MVT::i32, Expand);
  }

  if (!Subtarget->isThumb1Only() && Subtarget->hasV6T2Ops())
    setOperationAction(ISD::BITREVERSE, MVT::i32, Legal);

  // ARM does not have ROTL.
  setOperationAction(ISD::ROTL, MVT::i32, Expand);
  for (MVT VT : MVT::fixedlen_vector_valuetypes()) {
    setOperationAction(ISD::ROTL, VT, Expand);
    setOperationAction(ISD::ROTR, VT, Expand);
  }
  setOperationAction(ISD::CTTZ, MVT::i32, Custom);
  setOperationAction(ISD::CTPOP, MVT::i32, Expand);
  if (!Subtarget->hasV5TOps() || Subtarget->isThumb1Only()) {
    setOperationAction(ISD::CTLZ, MVT::i32, Expand);
    setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32, LibCall);
  }

  // @llvm.readcyclecounter requires the Performance Monitors extension.
  // Default to the 0 expansion on unsupported platforms.
  // FIXME: Technically there are older ARM CPUs that have
  // implementation-specific ways of obtaining this information.
  if (Subtarget->hasPerfMon())
    setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Custom);

  // Only ARMv6 has BSWAP.
  if (!Subtarget->hasV6Ops())
    setOperationAction(ISD::BSWAP, MVT::i32, Expand);

  bool hasDivide = Subtarget->isThumb() ? Subtarget->hasDivideInThumbMode()
                                        : Subtarget->hasDivideInARMMode();
  if (!hasDivide) {
    // These are expanded into libcalls if the CPU doesn't have a HW divider.
    setOperationAction(ISD::SDIV, MVT::i32, LibCall);
    setOperationAction(ISD::UDIV, MVT::i32, LibCall);
  }

  if (Subtarget->isTargetWindows() && !Subtarget->hasDivideInThumbMode()) {
    setOperationAction(ISD::SDIV, MVT::i32, Custom);
    setOperationAction(ISD::UDIV, MVT::i32, Custom);

    setOperationAction(ISD::SDIV, MVT::i64, Custom);
    setOperationAction(ISD::UDIV, MVT::i64, Custom);
  }

  setOperationAction(ISD::SREM, MVT::i32, Expand);
  setOperationAction(ISD::UREM, MVT::i32, Expand);

  // Register based DivRem for AEABI (RTABI 4.2)
  if (Subtarget->isTargetAEABI() || Subtarget->isTargetAndroid() ||
      Subtarget->isTargetGNUAEABI() || Subtarget->isTargetMuslAEABI() ||
      Subtarget->isTargetWindows()) {
    setOperationAction(ISD::SREM, MVT::i64, Custom);
    setOperationAction(ISD::UREM, MVT::i64, Custom);
    HasStandaloneRem = false;

    if (Subtarget->isTargetWindows()) {
      const struct {
        const RTLIB::Libcall Op;
        const char * const Name;
        const CallingConv::ID CC;
      } LibraryCalls[] = {
          { RTLIB::SDIVREM_I8, "__rt_sdiv", CallingConv::ARM_AAPCS },
          { RTLIB::SDIVREM_I16, "__rt_sdiv", CallingConv::ARM_AAPCS },
          { RTLIB::SDIVREM_I32, "__rt_sdiv", CallingConv::ARM_AAPCS },
          { RTLIB::SDIVREM_I64, "__rt_sdiv64", CallingConv::ARM_AAPCS },
          { RTLIB::UDIVREM_I8, "__rt_udiv", CallingConv::ARM_AAPCS },
          { RTLIB::UDIVREM_I16, "__rt_udiv", CallingConv::ARM_AAPCS },
          { RTLIB::UDIVREM_I32, "__rt_udiv", CallingConv::ARM_AAPCS },
          { RTLIB::UDIVREM_I64, "__rt_udiv64", CallingConv::ARM_AAPCS },
      };

      for (const auto &LC : LibraryCalls) {
        setLibcallName(LC.Op, LC.Name);
        setLibcallCallingConv(LC.Op, LC.CC);
      }
    } else {
      const struct {
        const RTLIB::Libcall Op;
        const char * const Name;
        const CallingConv::ID CC;
      } LibraryCalls[] = {
          { RTLIB::SDIVREM_I8, "__aeabi_idivmod", CallingConv::ARM_AAPCS },
          { RTLIB::SDIVREM_I16, "__aeabi_idivmod", CallingConv::ARM_AAPCS },
          { RTLIB::SDIVREM_I32, "__aeabi_idivmod", CallingConv::ARM_AAPCS },
          { RTLIB::SDIVREM_I64, "__aeabi_ldivmod", CallingConv::ARM_AAPCS },
          { RTLIB::UDIVREM_I8, "__aeabi_uidivmod", CallingConv::ARM_AAPCS },
          { RTLIB::UDIVREM_I16, "__aeabi_uidivmod", CallingConv::ARM_AAPCS },
          { RTLIB::UDIVREM_I32, "__aeabi_uidivmod", CallingConv::ARM_AAPCS },
          { RTLIB::UDIVREM_I64, "__aeabi_uldivmod", CallingConv::ARM_AAPCS },
      };

      for (const auto &LC : LibraryCalls) {
        setLibcallName(LC.Op, LC.Name);
        setLibcallCallingConv(LC.Op, LC.CC);
      }
    }

    setOperationAction(ISD::SDIVREM, MVT::i32, Custom);
    setOperationAction(ISD::UDIVREM, MVT::i32, Custom);
    setOperationAction(ISD::SDIVREM, MVT::i64, Custom);
    setOperationAction(ISD::UDIVREM, MVT::i64, Custom);
  } else {
    setOperationAction(ISD::SDIVREM, MVT::i32, Expand);
    setOperationAction(ISD::UDIVREM, MVT::i32, Expand);
  }
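  // What "register based DivRem" means in practice (sketch): __aeabi_idivmod
  // returns both results in registers, the quotient in r0 and the remainder
  // in r1, so a source-level
  //   q = a / b;  r = a % b;
  // can be lowered to a single helper call with both values live afterwards,
  // instead of one call for the division and another for the remainder.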
  if (Subtarget->getTargetTriple().isOSMSVCRT()) {
    // MSVCRT doesn't have powi; fall back to pow
    setLibcallName(RTLIB::POWI_F32, nullptr);
    setLibcallName(RTLIB::POWI_F64, nullptr);
  }

  setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
  setOperationAction(ISD::ConstantPool, MVT::i32, Custom);
  setOperationAction(ISD::GlobalTLSAddress, MVT::i32, Custom);
  setOperationAction(ISD::BlockAddress, MVT::i32, Custom);

  setOperationAction(ISD::TRAP, MVT::Other, Legal);
  setOperationAction(ISD::DEBUGTRAP, MVT::Other, Legal);

  // Use the default implementation.
  setOperationAction(ISD::VASTART, MVT::Other, Custom);
  setOperationAction(ISD::VAARG, MVT::Other, Expand);
  setOperationAction(ISD::VACOPY, MVT::Other, Expand);
  setOperationAction(ISD::VAEND, MVT::Other, Expand);
  setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
  setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);

  if (Subtarget->isTargetWindows())
    setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Custom);
  else
    setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Expand);

  // ARMv6 Thumb1 (except for CPUs that support dmb / dsb) and earlier use
  // the default expansion.
  InsertFencesForAtomic = false;
  if (Subtarget->hasAnyDataBarrier() &&
      (!Subtarget->isThumb() || Subtarget->hasV8MBaselineOps())) {
    // ATOMIC_FENCE needs custom lowering; the others should have been
    // expanded to ldrex/strex loops already.
    setOperationAction(ISD::ATOMIC_FENCE, MVT::Other, Custom);
    if (!Subtarget->isThumb() || !Subtarget->isMClass())
      setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i64, Custom);

    // On v8, we have particularly efficient implementations of atomic fences
    // if they can be combined with nearby atomic loads and stores.
    if (!Subtarget->hasAcquireRelease() ||
        getTargetMachine().getOptLevel() == 0) {
      // Automatically insert fences (dmb ish) around ATOMIC_SWAP etc.
      InsertFencesForAtomic = true;
    }
  } else {
    // If there's anything we can use as a barrier, go through custom lowering
    // for ATOMIC_FENCE.
    // If the target has DMB in Thumb mode, fences can be inserted.
    if (Subtarget->hasDataBarrier())
      InsertFencesForAtomic = true;

    setOperationAction(ISD::ATOMIC_FENCE, MVT::Other,
                       Subtarget->hasAnyDataBarrier() ? Custom : Expand);

    // Set them all for expansion, which will force libcalls.
    setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i32, Expand);
    setOperationAction(ISD::ATOMIC_SWAP, MVT::i32, Expand);
    setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i32, Expand);
    setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i32, Expand);
    setOperationAction(ISD::ATOMIC_LOAD_AND, MVT::i32, Expand);
    setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i32, Expand);
    setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i32, Expand);
    setOperationAction(ISD::ATOMIC_LOAD_NAND, MVT::i32, Expand);
    setOperationAction(ISD::ATOMIC_LOAD_MIN, MVT::i32, Expand);
    setOperationAction(ISD::ATOMIC_LOAD_MAX, MVT::i32, Expand);
    setOperationAction(ISD::ATOMIC_LOAD_UMIN, MVT::i32, Expand);
    setOperationAction(ISD::ATOMIC_LOAD_UMAX, MVT::i32, Expand);

    // Mark ATOMIC_LOAD and ATOMIC_STORE custom so we can handle the
    // Unordered/Monotonic case.
    if (!InsertFencesForAtomic) {
      setOperationAction(ISD::ATOMIC_LOAD, MVT::i32, Custom);
      setOperationAction(ISD::ATOMIC_STORE, MVT::i32, Custom);
    }
  }
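  // Rough shape of the ldrex/strex expansion referred to above, shown here
  // for an atomic add on an LL/SC-capable core:
  //   retry:
  //     ldrex  r2, [r0]        ; load-exclusive the current value
  //     add    r2, r2, r1
  //     strex  r3, r2, [r0]    ; store-exclusive; r3 = 0 on success
  //     cmp    r3, #0
  //     bne    retry
  // On cores without exclusives the same operations become libcalls instead.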
  // Compute supported atomic widths.
  if (Subtarget->isTargetLinux() ||
      (!Subtarget->isMClass() && Subtarget->hasV6Ops())) {
    // For targets where __sync_* routines are reliably available, we use them
    // if necessary.
    //
    // ARM Linux always supports 64-bit atomics through kernel-assisted atomic
    // routines (kernel 3.1 or later). FIXME: Not with compiler-rt?
    //
    // ARMv6 targets have native instructions in ARM mode. For Thumb mode,
    // such targets should provide __sync_* routines, which use the ARM mode
    // instructions. (ARMv6 doesn't have dmb, but it has an equivalent
    // encoding; see ARMISD::MEMBARRIER_MCR.)
    setMaxAtomicSizeInBitsSupported(64);
  } else if ((Subtarget->isMClass() && Subtarget->hasV8MBaselineOps()) ||
             Subtarget->hasForced32BitAtomics()) {
    // Cortex-M (besides Cortex-M0) have 32-bit atomics.
    setMaxAtomicSizeInBitsSupported(32);
  } else {
    // We can't assume anything about other targets; just use libatomic
    // routines.
    setMaxAtomicSizeInBitsSupported(0);
  }

  setMaxDivRemBitWidthSupported(64);

  setOperationAction(ISD::PREFETCH, MVT::Other, Custom);

  // Requires SXTB/SXTH, available on v6 and up in both ARM and Thumb modes.
  if (!Subtarget->hasV6Ops()) {
    setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Expand);
    setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8, Expand);
  }
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);

  if (!Subtarget->useSoftFloat() && Subtarget->hasFPRegs() &&
      !Subtarget->isThumb1Only()) {
    // Turn f64->i64 into VMOVRRD, and i64->f64 into VMOVDRR, iff the target
    // supports VFP2.
    setOperationAction(ISD::BITCAST, MVT::i64, Custom);
    setOperationAction(ISD::GET_ROUNDING, MVT::i32, Custom);
    setOperationAction(ISD::SET_ROUNDING, MVT::Other, Custom);
  }

  // We want to custom lower some of our intrinsics.
  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
  setOperationAction(ISD::EH_SJLJ_SETJMP, MVT::i32, Custom);
  setOperationAction(ISD::EH_SJLJ_LONGJMP, MVT::Other, Custom);
  setOperationAction(ISD::EH_SJLJ_SETUP_DISPATCH, MVT::Other, Custom);
  if (Subtarget->useSjLjEH())
    setLibcallName(RTLIB::UNWIND_RESUME, "_Unwind_SjLj_Resume");

  setOperationAction(ISD::SETCC, MVT::i32, Expand);
  setOperationAction(ISD::SETCC, MVT::f32, Expand);
  setOperationAction(ISD::SETCC, MVT::f64, Expand);
  setOperationAction(ISD::SELECT, MVT::i32, Custom);
  setOperationAction(ISD::SELECT, MVT::f32, Custom);
  setOperationAction(ISD::SELECT, MVT::f64, Custom);
  setOperationAction(ISD::SELECT_CC, MVT::i32, Custom);
  setOperationAction(ISD::SELECT_CC, MVT::f32, Custom);
  setOperationAction(ISD::SELECT_CC, MVT::f64, Custom);
  if (Subtarget->hasFullFP16()) {
    setOperationAction(ISD::SETCC, MVT::f16, Expand);
    setOperationAction(ISD::SELECT, MVT::f16, Custom);
    setOperationAction(ISD::SELECT_CC, MVT::f16, Custom);
  }

  setOperationAction(ISD::SETCCCARRY, MVT::i32, Custom);

  setOperationAction(ISD::BRCOND, MVT::Other, Custom);
  setOperationAction(ISD::BR_CC, MVT::i32, Custom);
  if (Subtarget->hasFullFP16())
    setOperationAction(ISD::BR_CC, MVT::f16, Custom);
  setOperationAction(ISD::BR_CC, MVT::f32, Custom);
  setOperationAction(ISD::BR_CC, MVT::f64, Custom);
  setOperationAction(ISD::BR_JT, MVT::Other, Custom);

  // We don't support sin/cos/fmod/copysign/pow
  setOperationAction(ISD::FSIN, MVT::f64, Expand);
  setOperationAction(ISD::FSIN, MVT::f32, Expand);
  setOperationAction(ISD::FCOS, MVT::f32, Expand);
  setOperationAction(ISD::FCOS, MVT::f64, Expand);
  setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
  setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
  setOperationAction(ISD::FREM, MVT::f64, Expand);
  setOperationAction(ISD::FREM, MVT::f32, Expand);
  if (!Subtarget->useSoftFloat() && Subtarget->hasVFP2Base() &&
      !Subtarget->isThumb1Only()) {
    setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom);
    setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);
  }
  setOperationAction(ISD::FPOW, MVT::f64, Expand);
  setOperationAction(ISD::FPOW, MVT::f32, Expand);

  if (!Subtarget->hasVFP4Base()) {
    setOperationAction(ISD::FMA, MVT::f64, Expand);
    setOperationAction(ISD::FMA, MVT::f32, Expand);
  }

  // Various VFP goodness
  if (!Subtarget->useSoftFloat() && !Subtarget->isThumb1Only()) {
    // FP-ARMv8 adds f64 <-> f16 conversion. Before that it should be
    // expanded.
    if (!Subtarget->hasFPARMv8Base() || !Subtarget->hasFP64()) {
      setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand);
      setOperationAction(ISD::FP_TO_FP16, MVT::f64, Expand);
    }

    // fp16 is a special v7 extension that adds f16 <-> f32 conversions.
    if (!Subtarget->hasFP16()) {
      setOperationAction(ISD::FP16_TO_FP, MVT::f32, Expand);
      setOperationAction(ISD::FP_TO_FP16, MVT::f32, Expand);
    }

    // Strict floating-point comparisons need custom lowering.
    setOperationAction(ISD::STRICT_FSETCC, MVT::f16, Custom);
    setOperationAction(ISD::STRICT_FSETCCS, MVT::f16, Custom);
    setOperationAction(ISD::STRICT_FSETCC, MVT::f32, Custom);
    setOperationAction(ISD::STRICT_FSETCCS, MVT::f32, Custom);
    setOperationAction(ISD::STRICT_FSETCC, MVT::f64, Custom);
    setOperationAction(ISD::STRICT_FSETCCS, MVT::f64, Custom);
  }

  // Use __sincos_stret if available.
  if (getLibcallName(RTLIB::SINCOS_STRET_F32) != nullptr &&
      getLibcallName(RTLIB::SINCOS_STRET_F64) != nullptr) {
    setOperationAction(ISD::FSINCOS, MVT::f64, Custom);
    setOperationAction(ISD::FSINCOS, MVT::f32, Custom);
  }

  // FP-ARMv8 implements a lot of rounding-like FP operations.
  if (Subtarget->hasFPARMv8Base()) {
    setOperationAction(ISD::FFLOOR, MVT::f32, Legal);
    setOperationAction(ISD::FCEIL, MVT::f32, Legal);
    setOperationAction(ISD::FROUND, MVT::f32, Legal);
    setOperationAction(ISD::FTRUNC, MVT::f32, Legal);
    setOperationAction(ISD::FNEARBYINT, MVT::f32, Legal);
    setOperationAction(ISD::FRINT, MVT::f32, Legal);
    setOperationAction(ISD::FMINNUM, MVT::f32, Legal);
    setOperationAction(ISD::FMAXNUM, MVT::f32, Legal);
    if (Subtarget->hasNEON()) {
      setOperationAction(ISD::FMINNUM, MVT::v2f32, Legal);
      setOperationAction(ISD::FMAXNUM, MVT::v2f32, Legal);
      setOperationAction(ISD::FMINNUM, MVT::v4f32, Legal);
      setOperationAction(ISD::FMAXNUM, MVT::v4f32, Legal);
    }

    if (Subtarget->hasFP64()) {
      setOperationAction(ISD::FFLOOR, MVT::f64, Legal);
      setOperationAction(ISD::FCEIL, MVT::f64, Legal);
      setOperationAction(ISD::FROUND, MVT::f64, Legal);
      setOperationAction(ISD::FTRUNC, MVT::f64, Legal);
      setOperationAction(ISD::FNEARBYINT, MVT::f64, Legal);
      setOperationAction(ISD::FRINT, MVT::f64, Legal);
      setOperationAction(ISD::FMINNUM, MVT::f64, Legal);
      setOperationAction(ISD::FMAXNUM, MVT::f64, Legal);
    }
  }

  // FP16 often needs to be promoted to call library functions.
  if (Subtarget->hasFullFP16()) {
    setOperationAction(ISD::FREM, MVT::f16, Promote);
    setOperationAction(ISD::FCOPYSIGN, MVT::f16, Expand);
    setOperationAction(ISD::FSIN, MVT::f16, Promote);
    setOperationAction(ISD::FCOS, MVT::f16, Promote);
    setOperationAction(ISD::FSINCOS, MVT::f16, Promote);
    setOperationAction(ISD::FPOWI, MVT::f16, Promote);
    setOperationAction(ISD::FPOW, MVT::f16, Promote);
    setOperationAction(ISD::FEXP, MVT::f16, Promote);
    setOperationAction(ISD::FEXP2, MVT::f16, Promote);
    setOperationAction(ISD::FLOG, MVT::f16, Promote);
    setOperationAction(ISD::FLOG10, MVT::f16, Promote);
    setOperationAction(ISD::FLOG2, MVT::f16, Promote);

    setOperationAction(ISD::FROUND, MVT::f16, Legal);
  }
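  // What Promote means for the f16 entries above (a simplification): the
  // operation is performed in f32 and the result truncated back, so a call
  // like llvm.sin.f16 is lowered to roughly
  //   vcvtb.f32.f16 s0, s0    ; extend half to single
  //   bl   sinf               ; do the work in f32
  //   vcvtb.f16.f32 s0, s0    ; truncate back to half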
  1375. if (Subtarget->hasNEON()) {
  1376. // vmin and vmax aren't available in a scalar form, so we can use
  1377. // a NEON instruction with an undef lane instead. This has a performance
  1378. // penalty on some cores, so we don't do this unless we have been
  1379. // asked to by the core tuning model.
  1380. if (Subtarget->useNEONForSinglePrecisionFP()) {
  1381. setOperationAction(ISD::FMINIMUM, MVT::f32, Legal);
  1382. setOperationAction(ISD::FMAXIMUM, MVT::f32, Legal);
  1383. setOperationAction(ISD::FMINIMUM, MVT::f16, Legal);
  1384. setOperationAction(ISD::FMAXIMUM, MVT::f16, Legal);
  1385. }
  1386. setOperationAction(ISD::FMINIMUM, MVT::v2f32, Legal);
  1387. setOperationAction(ISD::FMAXIMUM, MVT::v2f32, Legal);
  1388. setOperationAction(ISD::FMINIMUM, MVT::v4f32, Legal);
  1389. setOperationAction(ISD::FMAXIMUM, MVT::v4f32, Legal);
  1390. if (Subtarget->hasFullFP16()) {
  1391. setOperationAction(ISD::FMINNUM, MVT::v4f16, Legal);
  1392. setOperationAction(ISD::FMAXNUM, MVT::v4f16, Legal);
  1393. setOperationAction(ISD::FMINNUM, MVT::v8f16, Legal);
  1394. setOperationAction(ISD::FMAXNUM, MVT::v8f16, Legal);
  1395. setOperationAction(ISD::FMINIMUM, MVT::v4f16, Legal);
  1396. setOperationAction(ISD::FMAXIMUM, MVT::v4f16, Legal);
  1397. setOperationAction(ISD::FMINIMUM, MVT::v8f16, Legal);
  1398. setOperationAction(ISD::FMAXIMUM, MVT::v8f16, Legal);
  1399. }
  1400. }
  1401. // We have target-specific dag combine patterns for the following nodes:
  1402. // ARMISD::VMOVRRD - No need to call setTargetDAGCombine
  1403. setTargetDAGCombine(
  1404. {ISD::ADD, ISD::SUB, ISD::MUL, ISD::AND, ISD::OR, ISD::XOR});
  1405. if (Subtarget->hasMVEIntegerOps())
  1406. setTargetDAGCombine(ISD::VSELECT);
  1407. if (Subtarget->hasV6Ops())
  1408. setTargetDAGCombine(ISD::SRL);
  1409. if (Subtarget->isThumb1Only())
  1410. setTargetDAGCombine(ISD::SHL);
  1411. // Attempt to lower smin/smax to ssat/usat
  1412. if ((!Subtarget->isThumb() && Subtarget->hasV6Ops()) ||
  1413. Subtarget->isThumb2()) {
  1414. setTargetDAGCombine({ISD::SMIN, ISD::SMAX});
  1415. }
  1416. setStackPointerRegisterToSaveRestore(ARM::SP);
  1417. if (Subtarget->useSoftFloat() || Subtarget->isThumb1Only() ||
  1418. !Subtarget->hasVFP2Base() || Subtarget->hasMinSize())
  1419. setSchedulingPreference(Sched::RegPressure);
  1420. else
  1421. setSchedulingPreference(Sched::Hybrid);
  1422. //// temporary - rewrite interface to use type
  1423. MaxStoresPerMemset = 8;
  1424. MaxStoresPerMemsetOptSize = 4;
  1425. MaxStoresPerMemcpy = 4; // For @llvm.memcpy -> sequence of stores
  1426. MaxStoresPerMemcpyOptSize = 2;
  1427. MaxStoresPerMemmove = 4; // For @llvm.memmove -> sequence of stores
  1428. MaxStoresPerMemmoveOptSize = 2;
  1429. // On ARM arguments smaller than 4 bytes are extended, so all arguments
  1430. // are at least 4 bytes aligned.
  1431. setMinStackArgumentAlignment(Align(4));
  1432. // Prefer likely predicted branches to selects on out-of-order cores.
  1433. PredictableSelectIsExpensive = Subtarget->getSchedModel().isOutOfOrder();
  1434. setPrefLoopAlignment(Align(1ULL << Subtarget->getPrefLoopLogAlignment()));
  1435. setMinFunctionAlignment(Subtarget->isThumb() ? Align(2) : Align(4));
  1436. if (Subtarget->isThumb() || Subtarget->isThumb2())
  1437. setTargetDAGCombine(ISD::ABS);
  1438. }
  1439. bool ARMTargetLowering::useSoftFloat() const {
  1440. return Subtarget->useSoftFloat();
  1441. }
  1442. // FIXME: It might make sense to define the representative register class as the
  1443. // nearest super-register that has a non-null superset. For example, DPR_VFP2 is
1444. // a super-register of SPR, and DPR is a superset of DPR_VFP2. Consequently,
1445. // SPR's representative would be DPR_VFP2. This would work well if register
1446. // pressure tracking were modified such that a register use would increment the
1447. // pressure of the register class's representative and all of its super
  1448. // classes' representatives transitively. We have not implemented this because
  1449. // of the difficulty prior to coalescing of modeling operand register classes
  1450. // due to the common occurrence of cross class copies and subregister insertions
  1451. // and extractions.
  1452. std::pair<const TargetRegisterClass *, uint8_t>
  1453. ARMTargetLowering::findRepresentativeClass(const TargetRegisterInfo *TRI,
  1454. MVT VT) const {
  1455. const TargetRegisterClass *RRC = nullptr;
  1456. uint8_t Cost = 1;
  1457. switch (VT.SimpleTy) {
  1458. default:
  1459. return TargetLowering::findRepresentativeClass(TRI, VT);
  1460. // Use DPR as representative register class for all floating point
1461. // and vector types. Since there are 32 SPR registers and 32 DPR registers,
1462. // the cost is 1 for both f32 and f64.
  1463. case MVT::f32: case MVT::f64: case MVT::v8i8: case MVT::v4i16:
  1464. case MVT::v2i32: case MVT::v1i64: case MVT::v2f32:
  1465. RRC = &ARM::DPRRegClass;
  1466. // When NEON is used for SP, only half of the register file is available
  1467. // because operations that define both SP and DP results will be constrained
  1468. // to the VFP2 class (D0-D15). We currently model this constraint prior to
  1469. // coalescing by double-counting the SP regs. See the FIXME above.
  1470. if (Subtarget->useNEONForSinglePrecisionFP())
  1471. Cost = 2;
  1472. break;
  1473. case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64:
  1474. case MVT::v4f32: case MVT::v2f64:
  1475. RRC = &ARM::DPRRegClass;
  1476. Cost = 2;
  1477. break;
  1478. case MVT::v4i64:
  1479. RRC = &ARM::DPRRegClass;
  1480. Cost = 4;
  1481. break;
  1482. case MVT::v8i64:
  1483. RRC = &ARM::DPRRegClass;
  1484. Cost = 8;
  1485. break;
  1486. }
  1487. return std::make_pair(RRC, Cost);
  1488. }
  1489. const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const {
  1490. #define MAKE_CASE(V) \
  1491. case V: \
  1492. return #V;
  1493. switch ((ARMISD::NodeType)Opcode) {
  1494. case ARMISD::FIRST_NUMBER:
  1495. break;
  1496. MAKE_CASE(ARMISD::Wrapper)
  1497. MAKE_CASE(ARMISD::WrapperPIC)
  1498. MAKE_CASE(ARMISD::WrapperJT)
  1499. MAKE_CASE(ARMISD::COPY_STRUCT_BYVAL)
  1500. MAKE_CASE(ARMISD::CALL)
  1501. MAKE_CASE(ARMISD::CALL_PRED)
  1502. MAKE_CASE(ARMISD::CALL_NOLINK)
  1503. MAKE_CASE(ARMISD::tSECALL)
  1504. MAKE_CASE(ARMISD::t2CALL_BTI)
  1505. MAKE_CASE(ARMISD::BRCOND)
  1506. MAKE_CASE(ARMISD::BR_JT)
  1507. MAKE_CASE(ARMISD::BR2_JT)
  1508. MAKE_CASE(ARMISD::RET_FLAG)
  1509. MAKE_CASE(ARMISD::SERET_FLAG)
  1510. MAKE_CASE(ARMISD::INTRET_FLAG)
  1511. MAKE_CASE(ARMISD::PIC_ADD)
  1512. MAKE_CASE(ARMISD::CMP)
  1513. MAKE_CASE(ARMISD::CMN)
  1514. MAKE_CASE(ARMISD::CMPZ)
  1515. MAKE_CASE(ARMISD::CMPFP)
  1516. MAKE_CASE(ARMISD::CMPFPE)
  1517. MAKE_CASE(ARMISD::CMPFPw0)
  1518. MAKE_CASE(ARMISD::CMPFPEw0)
  1519. MAKE_CASE(ARMISD::BCC_i64)
  1520. MAKE_CASE(ARMISD::FMSTAT)
  1521. MAKE_CASE(ARMISD::CMOV)
  1522. MAKE_CASE(ARMISD::SUBS)
  1523. MAKE_CASE(ARMISD::SSAT)
  1524. MAKE_CASE(ARMISD::USAT)
  1525. MAKE_CASE(ARMISD::ASRL)
  1526. MAKE_CASE(ARMISD::LSRL)
  1527. MAKE_CASE(ARMISD::LSLL)
  1528. MAKE_CASE(ARMISD::SRL_FLAG)
  1529. MAKE_CASE(ARMISD::SRA_FLAG)
  1530. MAKE_CASE(ARMISD::RRX)
  1531. MAKE_CASE(ARMISD::ADDC)
  1532. MAKE_CASE(ARMISD::ADDE)
  1533. MAKE_CASE(ARMISD::SUBC)
  1534. MAKE_CASE(ARMISD::SUBE)
  1535. MAKE_CASE(ARMISD::LSLS)
  1536. MAKE_CASE(ARMISD::VMOVRRD)
  1537. MAKE_CASE(ARMISD::VMOVDRR)
  1538. MAKE_CASE(ARMISD::VMOVhr)
  1539. MAKE_CASE(ARMISD::VMOVrh)
  1540. MAKE_CASE(ARMISD::VMOVSR)
  1541. MAKE_CASE(ARMISD::EH_SJLJ_SETJMP)
  1542. MAKE_CASE(ARMISD::EH_SJLJ_LONGJMP)
  1543. MAKE_CASE(ARMISD::EH_SJLJ_SETUP_DISPATCH)
  1544. MAKE_CASE(ARMISD::TC_RETURN)
  1545. MAKE_CASE(ARMISD::THREAD_POINTER)
  1546. MAKE_CASE(ARMISD::DYN_ALLOC)
  1547. MAKE_CASE(ARMISD::MEMBARRIER_MCR)
  1548. MAKE_CASE(ARMISD::PRELOAD)
  1549. MAKE_CASE(ARMISD::LDRD)
  1550. MAKE_CASE(ARMISD::STRD)
  1551. MAKE_CASE(ARMISD::WIN__CHKSTK)
  1552. MAKE_CASE(ARMISD::WIN__DBZCHK)
  1553. MAKE_CASE(ARMISD::PREDICATE_CAST)
  1554. MAKE_CASE(ARMISD::VECTOR_REG_CAST)
  1555. MAKE_CASE(ARMISD::MVESEXT)
  1556. MAKE_CASE(ARMISD::MVEZEXT)
  1557. MAKE_CASE(ARMISD::MVETRUNC)
  1558. MAKE_CASE(ARMISD::VCMP)
  1559. MAKE_CASE(ARMISD::VCMPZ)
  1560. MAKE_CASE(ARMISD::VTST)
  1561. MAKE_CASE(ARMISD::VSHLs)
  1562. MAKE_CASE(ARMISD::VSHLu)
  1563. MAKE_CASE(ARMISD::VSHLIMM)
  1564. MAKE_CASE(ARMISD::VSHRsIMM)
  1565. MAKE_CASE(ARMISD::VSHRuIMM)
  1566. MAKE_CASE(ARMISD::VRSHRsIMM)
  1567. MAKE_CASE(ARMISD::VRSHRuIMM)
  1568. MAKE_CASE(ARMISD::VRSHRNIMM)
  1569. MAKE_CASE(ARMISD::VQSHLsIMM)
  1570. MAKE_CASE(ARMISD::VQSHLuIMM)
  1571. MAKE_CASE(ARMISD::VQSHLsuIMM)
  1572. MAKE_CASE(ARMISD::VQSHRNsIMM)
  1573. MAKE_CASE(ARMISD::VQSHRNuIMM)
  1574. MAKE_CASE(ARMISD::VQSHRNsuIMM)
  1575. MAKE_CASE(ARMISD::VQRSHRNsIMM)
  1576. MAKE_CASE(ARMISD::VQRSHRNuIMM)
  1577. MAKE_CASE(ARMISD::VQRSHRNsuIMM)
  1578. MAKE_CASE(ARMISD::VSLIIMM)
  1579. MAKE_CASE(ARMISD::VSRIIMM)
  1580. MAKE_CASE(ARMISD::VGETLANEu)
  1581. MAKE_CASE(ARMISD::VGETLANEs)
  1582. MAKE_CASE(ARMISD::VMOVIMM)
  1583. MAKE_CASE(ARMISD::VMVNIMM)
  1584. MAKE_CASE(ARMISD::VMOVFPIMM)
  1585. MAKE_CASE(ARMISD::VDUP)
  1586. MAKE_CASE(ARMISD::VDUPLANE)
  1587. MAKE_CASE(ARMISD::VEXT)
  1588. MAKE_CASE(ARMISD::VREV64)
  1589. MAKE_CASE(ARMISD::VREV32)
  1590. MAKE_CASE(ARMISD::VREV16)
  1591. MAKE_CASE(ARMISD::VZIP)
  1592. MAKE_CASE(ARMISD::VUZP)
  1593. MAKE_CASE(ARMISD::VTRN)
  1594. MAKE_CASE(ARMISD::VTBL1)
  1595. MAKE_CASE(ARMISD::VTBL2)
  1596. MAKE_CASE(ARMISD::VMOVN)
  1597. MAKE_CASE(ARMISD::VQMOVNs)
  1598. MAKE_CASE(ARMISD::VQMOVNu)
  1599. MAKE_CASE(ARMISD::VCVTN)
  1600. MAKE_CASE(ARMISD::VCVTL)
  1601. MAKE_CASE(ARMISD::VIDUP)
  1602. MAKE_CASE(ARMISD::VMULLs)
  1603. MAKE_CASE(ARMISD::VMULLu)
  1604. MAKE_CASE(ARMISD::VQDMULH)
  1605. MAKE_CASE(ARMISD::VADDVs)
  1606. MAKE_CASE(ARMISD::VADDVu)
  1607. MAKE_CASE(ARMISD::VADDVps)
  1608. MAKE_CASE(ARMISD::VADDVpu)
  1609. MAKE_CASE(ARMISD::VADDLVs)
  1610. MAKE_CASE(ARMISD::VADDLVu)
  1611. MAKE_CASE(ARMISD::VADDLVAs)
  1612. MAKE_CASE(ARMISD::VADDLVAu)
  1613. MAKE_CASE(ARMISD::VADDLVps)
  1614. MAKE_CASE(ARMISD::VADDLVpu)
  1615. MAKE_CASE(ARMISD::VADDLVAps)
  1616. MAKE_CASE(ARMISD::VADDLVApu)
  1617. MAKE_CASE(ARMISD::VMLAVs)
  1618. MAKE_CASE(ARMISD::VMLAVu)
  1619. MAKE_CASE(ARMISD::VMLAVps)
  1620. MAKE_CASE(ARMISD::VMLAVpu)
  1621. MAKE_CASE(ARMISD::VMLALVs)
  1622. MAKE_CASE(ARMISD::VMLALVu)
  1623. MAKE_CASE(ARMISD::VMLALVps)
  1624. MAKE_CASE(ARMISD::VMLALVpu)
  1625. MAKE_CASE(ARMISD::VMLALVAs)
  1626. MAKE_CASE(ARMISD::VMLALVAu)
  1627. MAKE_CASE(ARMISD::VMLALVAps)
  1628. MAKE_CASE(ARMISD::VMLALVApu)
  1629. MAKE_CASE(ARMISD::VMINVu)
  1630. MAKE_CASE(ARMISD::VMINVs)
  1631. MAKE_CASE(ARMISD::VMAXVu)
  1632. MAKE_CASE(ARMISD::VMAXVs)
  1633. MAKE_CASE(ARMISD::UMAAL)
  1634. MAKE_CASE(ARMISD::UMLAL)
  1635. MAKE_CASE(ARMISD::SMLAL)
  1636. MAKE_CASE(ARMISD::SMLALBB)
  1637. MAKE_CASE(ARMISD::SMLALBT)
  1638. MAKE_CASE(ARMISD::SMLALTB)
  1639. MAKE_CASE(ARMISD::SMLALTT)
  1640. MAKE_CASE(ARMISD::SMULWB)
  1641. MAKE_CASE(ARMISD::SMULWT)
  1642. MAKE_CASE(ARMISD::SMLALD)
  1643. MAKE_CASE(ARMISD::SMLALDX)
  1644. MAKE_CASE(ARMISD::SMLSLD)
  1645. MAKE_CASE(ARMISD::SMLSLDX)
  1646. MAKE_CASE(ARMISD::SMMLAR)
  1647. MAKE_CASE(ARMISD::SMMLSR)
  1648. MAKE_CASE(ARMISD::QADD16b)
  1649. MAKE_CASE(ARMISD::QSUB16b)
  1650. MAKE_CASE(ARMISD::QADD8b)
  1651. MAKE_CASE(ARMISD::QSUB8b)
  1652. MAKE_CASE(ARMISD::UQADD16b)
  1653. MAKE_CASE(ARMISD::UQSUB16b)
  1654. MAKE_CASE(ARMISD::UQADD8b)
  1655. MAKE_CASE(ARMISD::UQSUB8b)
  1656. MAKE_CASE(ARMISD::BUILD_VECTOR)
  1657. MAKE_CASE(ARMISD::BFI)
  1658. MAKE_CASE(ARMISD::VORRIMM)
  1659. MAKE_CASE(ARMISD::VBICIMM)
  1660. MAKE_CASE(ARMISD::VBSP)
  1661. MAKE_CASE(ARMISD::MEMCPY)
  1662. MAKE_CASE(ARMISD::VLD1DUP)
  1663. MAKE_CASE(ARMISD::VLD2DUP)
  1664. MAKE_CASE(ARMISD::VLD3DUP)
  1665. MAKE_CASE(ARMISD::VLD4DUP)
  1666. MAKE_CASE(ARMISD::VLD1_UPD)
  1667. MAKE_CASE(ARMISD::VLD2_UPD)
  1668. MAKE_CASE(ARMISD::VLD3_UPD)
  1669. MAKE_CASE(ARMISD::VLD4_UPD)
  1670. MAKE_CASE(ARMISD::VLD1x2_UPD)
  1671. MAKE_CASE(ARMISD::VLD1x3_UPD)
  1672. MAKE_CASE(ARMISD::VLD1x4_UPD)
  1673. MAKE_CASE(ARMISD::VLD2LN_UPD)
  1674. MAKE_CASE(ARMISD::VLD3LN_UPD)
  1675. MAKE_CASE(ARMISD::VLD4LN_UPD)
  1676. MAKE_CASE(ARMISD::VLD1DUP_UPD)
  1677. MAKE_CASE(ARMISD::VLD2DUP_UPD)
  1678. MAKE_CASE(ARMISD::VLD3DUP_UPD)
  1679. MAKE_CASE(ARMISD::VLD4DUP_UPD)
  1680. MAKE_CASE(ARMISD::VST1_UPD)
  1681. MAKE_CASE(ARMISD::VST2_UPD)
  1682. MAKE_CASE(ARMISD::VST3_UPD)
  1683. MAKE_CASE(ARMISD::VST4_UPD)
  1684. MAKE_CASE(ARMISD::VST1x2_UPD)
  1685. MAKE_CASE(ARMISD::VST1x3_UPD)
  1686. MAKE_CASE(ARMISD::VST1x4_UPD)
  1687. MAKE_CASE(ARMISD::VST2LN_UPD)
  1688. MAKE_CASE(ARMISD::VST3LN_UPD)
  1689. MAKE_CASE(ARMISD::VST4LN_UPD)
  1690. MAKE_CASE(ARMISD::WLS)
  1691. MAKE_CASE(ARMISD::WLSSETUP)
  1692. MAKE_CASE(ARMISD::LE)
  1693. MAKE_CASE(ARMISD::LOOP_DEC)
  1694. MAKE_CASE(ARMISD::CSINV)
  1695. MAKE_CASE(ARMISD::CSNEG)
  1696. MAKE_CASE(ARMISD::CSINC)
  1697. MAKE_CASE(ARMISD::MEMCPYLOOP)
  1698. MAKE_CASE(ARMISD::MEMSETLOOP)
  1699. #undef MAKE_CASE
  1700. }
  1701. return nullptr;
  1702. }
  1703. EVT ARMTargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &,
  1704. EVT VT) const {
  1705. if (!VT.isVector())
  1706. return getPointerTy(DL);
  1707. // MVE has a predicate register.
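// Vector compares therefore produce a vector of i1 (e.g. a v4i32 setcc yields
// v4i1), which is held in the VPR predicate register.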
  1708. if ((Subtarget->hasMVEIntegerOps() &&
  1709. (VT == MVT::v2i64 || VT == MVT::v4i32 || VT == MVT::v8i16 ||
  1710. VT == MVT::v16i8)) ||
  1711. (Subtarget->hasMVEFloatOps() &&
  1712. (VT == MVT::v2f64 || VT == MVT::v4f32 || VT == MVT::v8f16)))
  1713. return MVT::getVectorVT(MVT::i1, VT.getVectorElementCount());
  1714. return VT.changeVectorElementTypeToInteger();
  1715. }
  1716. /// getRegClassFor - Return the register class that should be used for the
  1717. /// specified value type.
  1718. const TargetRegisterClass *
  1719. ARMTargetLowering::getRegClassFor(MVT VT, bool isDivergent) const {
  1720. (void)isDivergent;
  1721. // Map v4i64 to QQ registers but do not make the type legal. Similarly map
  1722. // v8i64 to QQQQ registers. v4i64 and v8i64 are only used for REG_SEQUENCE to
  1723. // load / store 4 to 8 consecutive NEON D registers, or 2 to 4 consecutive
  1724. // MVE Q registers.
  1725. if (Subtarget->hasNEON()) {
  1726. if (VT == MVT::v4i64)
  1727. return &ARM::QQPRRegClass;
  1728. if (VT == MVT::v8i64)
  1729. return &ARM::QQQQPRRegClass;
  1730. }
  1731. if (Subtarget->hasMVEIntegerOps()) {
  1732. if (VT == MVT::v4i64)
  1733. return &ARM::MQQPRRegClass;
  1734. if (VT == MVT::v8i64)
  1735. return &ARM::MQQQQPRRegClass;
  1736. }
  1737. return TargetLowering::getRegClassFor(VT);
  1738. }
1739. // memcpy, and other memory intrinsics, typically try to use LDM/STM if the
  1740. // source/dest is aligned and the copy size is large enough. We therefore want
  1741. // to align such objects passed to memory intrinsics.
  1742. bool ARMTargetLowering::shouldAlignPointerArgs(CallInst *CI, unsigned &MinSize,
  1743. Align &PrefAlign) const {
  1744. if (!isa<MemIntrinsic>(CI))
  1745. return false;
  1746. MinSize = 8;
  1747. // On ARM11 onwards (excluding M class) 8-byte aligned LDM is typically 1
  1748. // cycle faster than 4-byte aligned LDM.
  1749. PrefAlign =
  1750. (Subtarget->hasV6Ops() && !Subtarget->isMClass() ? Align(8) : Align(4));
  1751. return true;
  1752. }
  1753. // Create a fast isel object.
  1754. FastISel *
  1755. ARMTargetLowering::createFastISel(FunctionLoweringInfo &funcInfo,
  1756. const TargetLibraryInfo *libInfo) const {
  1757. return ARM::createFastISel(funcInfo, libInfo);
  1758. }
  1759. Sched::Preference ARMTargetLowering::getSchedulingPreference(SDNode *N) const {
  1760. unsigned NumVals = N->getNumValues();
  1761. if (!NumVals)
  1762. return Sched::RegPressure;
  1763. for (unsigned i = 0; i != NumVals; ++i) {
  1764. EVT VT = N->getValueType(i);
  1765. if (VT == MVT::Glue || VT == MVT::Other)
  1766. continue;
  1767. if (VT.isFloatingPoint() || VT.isVector())
  1768. return Sched::ILP;
  1769. }
  1770. if (!N->isMachineOpcode())
  1771. return Sched::RegPressure;
1772. // Loads are scheduled for latency even if the instruction itinerary
  1773. // is not available.
  1774. const TargetInstrInfo *TII = Subtarget->getInstrInfo();
  1775. const MCInstrDesc &MCID = TII->get(N->getMachineOpcode());
  1776. if (MCID.getNumDefs() == 0)
  1777. return Sched::RegPressure;
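// Prefer ILP scheduling when the instruction's first result has a long
// latency (more than 2 cycles) according to the itinerary.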
  1778. if (!Itins->isEmpty() &&
  1779. Itins->getOperandCycle(MCID.getSchedClass(), 0) > 2)
  1780. return Sched::ILP;
  1781. return Sched::RegPressure;
  1782. }
  1783. //===----------------------------------------------------------------------===//
  1784. // Lowering Code
  1785. //===----------------------------------------------------------------------===//
  1786. static bool isSRL16(const SDValue &Op) {
  1787. if (Op.getOpcode() != ISD::SRL)
  1788. return false;
  1789. if (auto Const = dyn_cast<ConstantSDNode>(Op.getOperand(1)))
  1790. return Const->getZExtValue() == 16;
  1791. return false;
  1792. }
  1793. static bool isSRA16(const SDValue &Op) {
  1794. if (Op.getOpcode() != ISD::SRA)
  1795. return false;
  1796. if (auto Const = dyn_cast<ConstantSDNode>(Op.getOperand(1)))
  1797. return Const->getZExtValue() == 16;
  1798. return false;
  1799. }
  1800. static bool isSHL16(const SDValue &Op) {
  1801. if (Op.getOpcode() != ISD::SHL)
  1802. return false;
  1803. if (auto Const = dyn_cast<ConstantSDNode>(Op.getOperand(1)))
  1804. return Const->getZExtValue() == 16;
  1805. return false;
  1806. }
1807. // Check for a signed 16-bit value. We special case SRA because it makes
1808. // things simpler when also looking for SRAs that aren't sign extending a
  1809. // smaller value. Without the check, we'd need to take extra care with
  1810. // checking order for some operations.
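// (In a 32-bit value, 17 known sign bits means bits [31:15] are all copies of
// the sign bit, i.e. the operand is a sign-extended 16-bit quantity.)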
  1811. static bool isS16(const SDValue &Op, SelectionDAG &DAG) {
  1812. if (isSRA16(Op))
  1813. return isSHL16(Op.getOperand(0));
  1814. return DAG.ComputeNumSignBits(Op) == 17;
  1815. }
  1816. /// IntCCToARMCC - Convert a DAG integer condition code to an ARM CC
  1817. static ARMCC::CondCodes IntCCToARMCC(ISD::CondCode CC) {
  1818. switch (CC) {
  1819. default: llvm_unreachable("Unknown condition code!");
  1820. case ISD::SETNE: return ARMCC::NE;
  1821. case ISD::SETEQ: return ARMCC::EQ;
  1822. case ISD::SETGT: return ARMCC::GT;
  1823. case ISD::SETGE: return ARMCC::GE;
  1824. case ISD::SETLT: return ARMCC::LT;
  1825. case ISD::SETLE: return ARMCC::LE;
  1826. case ISD::SETUGT: return ARMCC::HI;
  1827. case ISD::SETUGE: return ARMCC::HS;
  1828. case ISD::SETULT: return ARMCC::LO;
  1829. case ISD::SETULE: return ARMCC::LS;
  1830. }
  1831. }
  1832. /// FPCCToARMCC - Convert a DAG fp condition code to an ARM CC.
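/// Some conditions cannot be expressed with a single ARM condition code after
/// an FP compare; in that case a second condition is returned in CondCode2
/// (CondCode2 == AL means it is unused) and the caller checks both, combining
/// the results as a logical OR.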
  1833. static void FPCCToARMCC(ISD::CondCode CC, ARMCC::CondCodes &CondCode,
  1834. ARMCC::CondCodes &CondCode2) {
  1835. CondCode2 = ARMCC::AL;
  1836. switch (CC) {
  1837. default: llvm_unreachable("Unknown FP condition!");
  1838. case ISD::SETEQ:
  1839. case ISD::SETOEQ: CondCode = ARMCC::EQ; break;
  1840. case ISD::SETGT:
  1841. case ISD::SETOGT: CondCode = ARMCC::GT; break;
  1842. case ISD::SETGE:
  1843. case ISD::SETOGE: CondCode = ARMCC::GE; break;
  1844. case ISD::SETOLT: CondCode = ARMCC::MI; break;
  1845. case ISD::SETOLE: CondCode = ARMCC::LS; break;
  1846. case ISD::SETONE: CondCode = ARMCC::MI; CondCode2 = ARMCC::GT; break;
  1847. case ISD::SETO: CondCode = ARMCC::VC; break;
  1848. case ISD::SETUO: CondCode = ARMCC::VS; break;
  1849. case ISD::SETUEQ: CondCode = ARMCC::EQ; CondCode2 = ARMCC::VS; break;
  1850. case ISD::SETUGT: CondCode = ARMCC::HI; break;
  1851. case ISD::SETUGE: CondCode = ARMCC::PL; break;
  1852. case ISD::SETLT:
  1853. case ISD::SETULT: CondCode = ARMCC::LT; break;
  1854. case ISD::SETLE:
  1855. case ISD::SETULE: CondCode = ARMCC::LE; break;
  1856. case ISD::SETNE:
  1857. case ISD::SETUNE: CondCode = ARMCC::NE; break;
  1858. }
  1859. }
  1860. //===----------------------------------------------------------------------===//
  1861. // Calling Convention Implementation
  1862. //===----------------------------------------------------------------------===//
  1863. /// getEffectiveCallingConv - Get the effective calling convention, taking into
  1864. /// account presence of floating point hardware and calling convention
  1865. /// limitations, such as support for variadic functions.
  1866. CallingConv::ID
  1867. ARMTargetLowering::getEffectiveCallingConv(CallingConv::ID CC,
  1868. bool isVarArg) const {
  1869. switch (CC) {
  1870. default:
  1871. report_fatal_error("Unsupported calling convention");
  1872. case CallingConv::ARM_AAPCS:
  1873. case CallingConv::ARM_APCS:
  1874. case CallingConv::GHC:
  1875. case CallingConv::CFGuard_Check:
  1876. return CC;
  1877. case CallingConv::PreserveMost:
  1878. return CallingConv::PreserveMost;
  1879. case CallingConv::ARM_AAPCS_VFP:
  1880. case CallingConv::Swift:
  1881. case CallingConv::SwiftTail:
  1882. return isVarArg ? CallingConv::ARM_AAPCS : CallingConv::ARM_AAPCS_VFP;
  1883. case CallingConv::C:
  1884. case CallingConv::Tail:
  1885. if (!Subtarget->isAAPCS_ABI())
  1886. return CallingConv::ARM_APCS;
  1887. else if (Subtarget->hasVFP2Base() && !Subtarget->isThumb1Only() &&
  1888. getTargetMachine().Options.FloatABIType == FloatABI::Hard &&
  1889. !isVarArg)
  1890. return CallingConv::ARM_AAPCS_VFP;
  1891. else
  1892. return CallingConv::ARM_AAPCS;
  1893. case CallingConv::Fast:
  1894. case CallingConv::CXX_FAST_TLS:
  1895. if (!Subtarget->isAAPCS_ABI()) {
  1896. if (Subtarget->hasVFP2Base() && !Subtarget->isThumb1Only() && !isVarArg)
  1897. return CallingConv::Fast;
  1898. return CallingConv::ARM_APCS;
  1899. } else if (Subtarget->hasVFP2Base() &&
  1900. !Subtarget->isThumb1Only() && !isVarArg)
  1901. return CallingConv::ARM_AAPCS_VFP;
  1902. else
  1903. return CallingConv::ARM_AAPCS;
  1904. }
  1905. }
  1906. CCAssignFn *ARMTargetLowering::CCAssignFnForCall(CallingConv::ID CC,
  1907. bool isVarArg) const {
  1908. return CCAssignFnForNode(CC, false, isVarArg);
  1909. }
  1910. CCAssignFn *ARMTargetLowering::CCAssignFnForReturn(CallingConv::ID CC,
  1911. bool isVarArg) const {
  1912. return CCAssignFnForNode(CC, true, isVarArg);
  1913. }
  1914. /// CCAssignFnForNode - Selects the correct CCAssignFn for the given
  1915. /// CallingConvention.
  1916. CCAssignFn *ARMTargetLowering::CCAssignFnForNode(CallingConv::ID CC,
  1917. bool Return,
  1918. bool isVarArg) const {
  1919. switch (getEffectiveCallingConv(CC, isVarArg)) {
  1920. default:
  1921. report_fatal_error("Unsupported calling convention");
  1922. case CallingConv::ARM_APCS:
  1923. return (Return ? RetCC_ARM_APCS : CC_ARM_APCS);
  1924. case CallingConv::ARM_AAPCS:
  1925. return (Return ? RetCC_ARM_AAPCS : CC_ARM_AAPCS);
  1926. case CallingConv::ARM_AAPCS_VFP:
  1927. return (Return ? RetCC_ARM_AAPCS_VFP : CC_ARM_AAPCS_VFP);
  1928. case CallingConv::Fast:
  1929. return (Return ? RetFastCC_ARM_APCS : FastCC_ARM_APCS);
  1930. case CallingConv::GHC:
  1931. return (Return ? RetCC_ARM_APCS : CC_ARM_APCS_GHC);
  1932. case CallingConv::PreserveMost:
  1933. return (Return ? RetCC_ARM_AAPCS : CC_ARM_AAPCS);
  1934. case CallingConv::CFGuard_Check:
  1935. return (Return ? RetCC_ARM_AAPCS : CC_ARM_Win32_CFGuard_Check);
  1936. }
  1937. }
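/// MoveToHPR - Convert a value that arrived in its 32-bit location type (i32
/// or f32) into the f16/bf16 value type. With +fullfp16 this is a direct move
/// into a half-precision register (VMOVhr); otherwise the value is narrowed
/// with an integer truncate and bitcast.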
  1938. SDValue ARMTargetLowering::MoveToHPR(const SDLoc &dl, SelectionDAG &DAG,
  1939. MVT LocVT, MVT ValVT, SDValue Val) const {
  1940. Val = DAG.getNode(ISD::BITCAST, dl, MVT::getIntegerVT(LocVT.getSizeInBits()),
  1941. Val);
  1942. if (Subtarget->hasFullFP16()) {
  1943. Val = DAG.getNode(ARMISD::VMOVhr, dl, ValVT, Val);
  1944. } else {
  1945. Val = DAG.getNode(ISD::TRUNCATE, dl,
  1946. MVT::getIntegerVT(ValVT.getSizeInBits()), Val);
  1947. Val = DAG.getNode(ISD::BITCAST, dl, ValVT, Val);
  1948. }
  1949. return Val;
  1950. }
  1951. SDValue ARMTargetLowering::MoveFromHPR(const SDLoc &dl, SelectionDAG &DAG,
  1952. MVT LocVT, MVT ValVT,
  1953. SDValue Val) const {
  1954. if (Subtarget->hasFullFP16()) {
  1955. Val = DAG.getNode(ARMISD::VMOVrh, dl,
  1956. MVT::getIntegerVT(LocVT.getSizeInBits()), Val);
  1957. } else {
  1958. Val = DAG.getNode(ISD::BITCAST, dl,
  1959. MVT::getIntegerVT(ValVT.getSizeInBits()), Val);
  1960. Val = DAG.getNode(ISD::ZERO_EXTEND, dl,
  1961. MVT::getIntegerVT(LocVT.getSizeInBits()), Val);
  1962. }
  1963. return DAG.getNode(ISD::BITCAST, dl, LocVT, Val);
  1964. }
  1965. /// LowerCallResult - Lower the result values of a call into the
  1966. /// appropriate copies out of appropriate physical registers.
  1967. SDValue ARMTargetLowering::LowerCallResult(
  1968. SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool isVarArg,
  1969. const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
  1970. SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, bool isThisReturn,
  1971. SDValue ThisVal) const {
  1972. // Assign locations to each value returned by this call.
  1973. SmallVector<CCValAssign, 16> RVLocs;
  1974. CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
  1975. *DAG.getContext());
  1976. CCInfo.AnalyzeCallResult(Ins, CCAssignFnForReturn(CallConv, isVarArg));
  1977. // Copy all of the result registers out of their specified physreg.
  1978. for (unsigned i = 0; i != RVLocs.size(); ++i) {
  1979. CCValAssign VA = RVLocs[i];
  1980. // Pass 'this' value directly from the argument to return value, to avoid
  1981. // reg unit interference
  1982. if (i == 0 && isThisReturn) {
  1983. assert(!VA.needsCustom() && VA.getLocVT() == MVT::i32 &&
  1984. "unexpected return calling convention register assignment");
  1985. InVals.push_back(ThisVal);
  1986. continue;
  1987. }
  1988. SDValue Val;
  1989. if (VA.needsCustom() &&
  1990. (VA.getLocVT() == MVT::f64 || VA.getLocVT() == MVT::v2f64)) {
  1991. // Handle f64 or half of a v2f64.
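// The two i32 halves are copied out of consecutive location registers and
// reassembled into an f64 with VMOVDRR (swapping the halves on big-endian
// targets); a v2f64 repeats this for each 64-bit lane.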
  1992. SDValue Lo = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32,
  1993. InFlag);
  1994. Chain = Lo.getValue(1);
  1995. InFlag = Lo.getValue(2);
  1996. VA = RVLocs[++i]; // skip ahead to next loc
  1997. SDValue Hi = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32,
  1998. InFlag);
  1999. Chain = Hi.getValue(1);
  2000. InFlag = Hi.getValue(2);
  2001. if (!Subtarget->isLittle())
  2002. std::swap (Lo, Hi);
  2003. Val = DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi);
  2004. if (VA.getLocVT() == MVT::v2f64) {
  2005. SDValue Vec = DAG.getNode(ISD::UNDEF, dl, MVT::v2f64);
  2006. Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Vec, Val,
  2007. DAG.getConstant(0, dl, MVT::i32));
  2008. VA = RVLocs[++i]; // skip ahead to next loc
  2009. Lo = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32, InFlag);
  2010. Chain = Lo.getValue(1);
  2011. InFlag = Lo.getValue(2);
  2012. VA = RVLocs[++i]; // skip ahead to next loc
  2013. Hi = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32, InFlag);
  2014. Chain = Hi.getValue(1);
  2015. InFlag = Hi.getValue(2);
  2016. if (!Subtarget->isLittle())
  2017. std::swap (Lo, Hi);
  2018. Val = DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi);
  2019. Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Vec, Val,
  2020. DAG.getConstant(1, dl, MVT::i32));
  2021. }
  2022. } else {
  2023. Val = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), VA.getLocVT(),
  2024. InFlag);
  2025. Chain = Val.getValue(1);
  2026. InFlag = Val.getValue(2);
  2027. }
  2028. switch (VA.getLocInfo()) {
  2029. default: llvm_unreachable("Unknown loc info!");
  2030. case CCValAssign::Full: break;
  2031. case CCValAssign::BCvt:
  2032. Val = DAG.getNode(ISD::BITCAST, dl, VA.getValVT(), Val);
  2033. break;
  2034. }
2035. // f16 arguments have their size extended to 4 bytes and are passed as if they
2036. // had been copied to the LSBs of a 32-bit register.
2037. // For that, they are passed extended to i32 (soft ABI) or to f32 (hard ABI).
  2038. if (VA.needsCustom() &&
  2039. (VA.getValVT() == MVT::f16 || VA.getValVT() == MVT::bf16))
  2040. Val = MoveToHPR(dl, DAG, VA.getLocVT(), VA.getValVT(), Val);
  2041. InVals.push_back(Val);
  2042. }
  2043. return Chain;
  2044. }
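/// computeAddrForCallArg - Compute the address and MachinePointerInfo for an
/// outgoing call argument that is passed on the stack. For tail calls the slot
/// is a fixed object in the caller's own frame, offset by SPDiff; otherwise it
/// is SP plus the location's stack offset.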
  2045. std::pair<SDValue, MachinePointerInfo> ARMTargetLowering::computeAddrForCallArg(
  2046. const SDLoc &dl, SelectionDAG &DAG, const CCValAssign &VA, SDValue StackPtr,
  2047. bool IsTailCall, int SPDiff) const {
  2048. SDValue DstAddr;
  2049. MachinePointerInfo DstInfo;
  2050. int32_t Offset = VA.getLocMemOffset();
  2051. MachineFunction &MF = DAG.getMachineFunction();
  2052. if (IsTailCall) {
  2053. Offset += SPDiff;
  2054. auto PtrVT = getPointerTy(DAG.getDataLayout());
  2055. int Size = VA.getLocVT().getFixedSizeInBits() / 8;
  2056. int FI = MF.getFrameInfo().CreateFixedObject(Size, Offset, true);
  2057. DstAddr = DAG.getFrameIndex(FI, PtrVT);
  2058. DstInfo =
  2059. MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI);
  2060. } else {
  2061. SDValue PtrOff = DAG.getIntPtrConstant(Offset, dl);
  2062. DstAddr = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
  2063. StackPtr, PtrOff);
  2064. DstInfo =
  2065. MachinePointerInfo::getStack(DAG.getMachineFunction(), Offset);
  2066. }
  2067. return std::make_pair(DstAddr, DstInfo);
  2068. }
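/// PassF64ArgInRegs - Split an f64 (or half of a v2f64) argument into two i32
/// halves with VMOVRRD and pass them in two GPRs, or in one GPR plus a stack
/// slot if only one register location is left, taking endianness into account
/// when choosing which half is passed first.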
  2069. void ARMTargetLowering::PassF64ArgInRegs(const SDLoc &dl, SelectionDAG &DAG,
  2070. SDValue Chain, SDValue &Arg,
  2071. RegsToPassVector &RegsToPass,
  2072. CCValAssign &VA, CCValAssign &NextVA,
  2073. SDValue &StackPtr,
  2074. SmallVectorImpl<SDValue> &MemOpChains,
  2075. bool IsTailCall,
  2076. int SPDiff) const {
  2077. SDValue fmrrd = DAG.getNode(ARMISD::VMOVRRD, dl,
  2078. DAG.getVTList(MVT::i32, MVT::i32), Arg);
  2079. unsigned id = Subtarget->isLittle() ? 0 : 1;
  2080. RegsToPass.push_back(std::make_pair(VA.getLocReg(), fmrrd.getValue(id)));
  2081. if (NextVA.isRegLoc())
  2082. RegsToPass.push_back(std::make_pair(NextVA.getLocReg(), fmrrd.getValue(1-id)));
  2083. else {
  2084. assert(NextVA.isMemLoc());
  2085. if (!StackPtr.getNode())
  2086. StackPtr = DAG.getCopyFromReg(Chain, dl, ARM::SP,
  2087. getPointerTy(DAG.getDataLayout()));
  2088. SDValue DstAddr;
  2089. MachinePointerInfo DstInfo;
  2090. std::tie(DstAddr, DstInfo) =
  2091. computeAddrForCallArg(dl, DAG, NextVA, StackPtr, IsTailCall, SPDiff);
  2092. MemOpChains.push_back(
  2093. DAG.getStore(Chain, dl, fmrrd.getValue(1 - id), DstAddr, DstInfo));
  2094. }
  2095. }
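/// canGuaranteeTCO - Return true if the calling convention guarantees that
/// tail calls will be honoured: fastcc under GuaranteedTailCallOpt, tailcc
/// (CallingConv::Tail) or swifttailcc (CallingConv::SwiftTail).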
  2096. static bool canGuaranteeTCO(CallingConv::ID CC, bool GuaranteeTailCalls) {
  2097. return (CC == CallingConv::Fast && GuaranteeTailCalls) ||
  2098. CC == CallingConv::Tail || CC == CallingConv::SwiftTail;
  2099. }
2100. /// LowerCall - Lower a call into a callseq_start <-
2101. /// ARMISD::CALL <- callseq_end chain. Also add input and output parameter
  2102. /// nodes.
  2103. SDValue
  2104. ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
  2105. SmallVectorImpl<SDValue> &InVals) const {
  2106. SelectionDAG &DAG = CLI.DAG;
  2107. SDLoc &dl = CLI.DL;
  2108. SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
  2109. SmallVectorImpl<SDValue> &OutVals = CLI.OutVals;
  2110. SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins;
  2111. SDValue Chain = CLI.Chain;
  2112. SDValue Callee = CLI.Callee;
  2113. bool &isTailCall = CLI.IsTailCall;
  2114. CallingConv::ID CallConv = CLI.CallConv;
  2115. bool doesNotRet = CLI.DoesNotReturn;
  2116. bool isVarArg = CLI.IsVarArg;
  2117. MachineFunction &MF = DAG.getMachineFunction();
  2118. ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
  2119. MachineFunction::CallSiteInfo CSInfo;
  2120. bool isStructRet = (Outs.empty()) ? false : Outs[0].Flags.isSRet();
  2121. bool isThisReturn = false;
  2122. bool isCmseNSCall = false;
  2123. bool isSibCall = false;
  2124. bool PreferIndirect = false;
  2125. bool GuardWithBTI = false;
  2126. // Lower 'returns_twice' calls to a pseudo-instruction.
  2127. if (CLI.CB && CLI.CB->getAttributes().hasFnAttr(Attribute::ReturnsTwice) &&
  2128. !Subtarget->noBTIAtReturnTwice())
  2129. GuardWithBTI = AFI->branchTargetEnforcement();
  2130. // Determine whether this is a non-secure function call.
  2131. if (CLI.CB && CLI.CB->getAttributes().hasFnAttr("cmse_nonsecure_call"))
  2132. isCmseNSCall = true;
  2133. // Disable tail calls if they're not supported.
  2134. if (!Subtarget->supportsTailCall())
  2135. isTailCall = false;
2136. // For both non-secure calls and returns from a CMSE entry function,
2137. // the function needs to do some extra work after the call, or before the
2138. // return, respectively, so it cannot end with a tail call.
  2139. if (isCmseNSCall || AFI->isCmseNSEntryFunction())
  2140. isTailCall = false;
  2141. if (isa<GlobalAddressSDNode>(Callee)) {
  2142. // If we're optimizing for minimum size and the function is called three or
  2143. // more times in this block, we can improve codesize by calling indirectly
  2144. // as BLXr has a 16-bit encoding.
  2145. auto *GV = cast<GlobalAddressSDNode>(Callee)->getGlobal();
  2146. if (CLI.CB) {
  2147. auto *BB = CLI.CB->getParent();
  2148. PreferIndirect = Subtarget->isThumb() && Subtarget->hasMinSize() &&
  2149. count_if(GV->users(), [&BB](const User *U) {
  2150. return isa<Instruction>(U) &&
  2151. cast<Instruction>(U)->getParent() == BB;
  2152. }) > 2;
  2153. }
  2154. }
  2155. if (isTailCall) {
  2156. // Check if it's really possible to do a tail call.
  2157. isTailCall = IsEligibleForTailCallOptimization(
  2158. Callee, CallConv, isVarArg, isStructRet,
  2159. MF.getFunction().hasStructRetAttr(), Outs, OutVals, Ins, DAG,
  2160. PreferIndirect);
  2161. if (isTailCall && !getTargetMachine().Options.GuaranteedTailCallOpt &&
  2162. CallConv != CallingConv::Tail && CallConv != CallingConv::SwiftTail)
  2163. isSibCall = true;
  2164. // We don't support GuaranteedTailCallOpt for ARM, only automatically
  2165. // detected sibcalls.
  2166. if (isTailCall)
  2167. ++NumTailCalls;
  2168. }
  2169. if (!isTailCall && CLI.CB && CLI.CB->isMustTailCall())
  2170. report_fatal_error("failed to perform tail call elimination on a call "
  2171. "site marked musttail");
  2172. // Analyze operands of the call, assigning locations to each operand.
  2173. SmallVector<CCValAssign, 16> ArgLocs;
  2174. CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
  2175. *DAG.getContext());
  2176. CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CallConv, isVarArg));
  2177. // Get a count of how many bytes are to be pushed on the stack.
  2178. unsigned NumBytes = CCInfo.getNextStackOffset();
  2179. // SPDiff is the byte offset of the call's argument area from the callee's.
  2180. // Stores to callee stack arguments will be placed in FixedStackSlots offset
  2181. // by this amount for a tail call. In a sibling call it must be 0 because the
  2182. // caller will deallocate the entire stack and the callee still expects its
  2183. // arguments to begin at SP+0. Completely unused for non-tail calls.
  2184. int SPDiff = 0;
  2185. if (isTailCall && !isSibCall) {
  2186. auto FuncInfo = MF.getInfo<ARMFunctionInfo>();
  2187. unsigned NumReusableBytes = FuncInfo->getArgumentStackSize();
2188. // Since the callee will pop the argument stack as a tail call, we must keep
2189. // the popped size 16-byte aligned.
  2190. Align StackAlign = DAG.getDataLayout().getStackAlignment();
  2191. NumBytes = alignTo(NumBytes, StackAlign);
  2192. // SPDiff will be negative if this tail call requires more space than we
  2193. // would automatically have in our incoming argument space. Positive if we
  2194. // can actually shrink the stack.
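// For example, if the caller reserved 16 bytes of incoming argument space and
// this tail call needs 24 bytes after alignment, SPDiff is -8 and frame
// lowering must reserve the extra 8 bytes (see setArgRegsSaveSize below).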
  2195. SPDiff = NumReusableBytes - NumBytes;
  2196. // If this call requires more stack than we have available from
  2197. // LowerFormalArguments, tell FrameLowering to reserve space for it.
  2198. if (SPDiff < 0 && AFI->getArgRegsSaveSize() < (unsigned)-SPDiff)
  2199. AFI->setArgRegsSaveSize(-SPDiff);
  2200. }
  2201. if (isSibCall) {
  2202. // For sibling tail calls, memory operands are available in our caller's stack.
  2203. NumBytes = 0;
  2204. } else {
  2205. // Adjust the stack pointer for the new arguments...
  2206. // These operations are automatically eliminated by the prolog/epilog pass
  2207. Chain = DAG.getCALLSEQ_START(Chain, isTailCall ? 0 : NumBytes, 0, dl);
  2208. }
  2209. SDValue StackPtr =
  2210. DAG.getCopyFromReg(Chain, dl, ARM::SP, getPointerTy(DAG.getDataLayout()));
  2211. RegsToPassVector RegsToPass;
  2212. SmallVector<SDValue, 8> MemOpChains;
  2213. // During a tail call, stores to the argument area must happen after all of
  2214. // the function's incoming arguments have been loaded because they may alias.
  2215. // This is done by folding in a TokenFactor from LowerFormalArguments, but
  2216. // there's no point in doing so repeatedly so this tracks whether that's
  2217. // happened yet.
  2218. bool AfterFormalArgLoads = false;
  2219. // Walk the register/memloc assignments, inserting copies/loads. In the case
  2220. // of tail call optimization, arguments are handled later.
  2221. for (unsigned i = 0, realArgIdx = 0, e = ArgLocs.size();
  2222. i != e;
  2223. ++i, ++realArgIdx) {
  2224. CCValAssign &VA = ArgLocs[i];
  2225. SDValue Arg = OutVals[realArgIdx];
  2226. ISD::ArgFlagsTy Flags = Outs[realArgIdx].Flags;
  2227. bool isByVal = Flags.isByVal();
  2228. // Promote the value if needed.
  2229. switch (VA.getLocInfo()) {
  2230. default: llvm_unreachable("Unknown loc info!");
  2231. case CCValAssign::Full: break;
  2232. case CCValAssign::SExt:
  2233. Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), Arg);
  2234. break;
  2235. case CCValAssign::ZExt:
  2236. Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), Arg);
  2237. break;
  2238. case CCValAssign::AExt:
  2239. Arg = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), Arg);
  2240. break;
  2241. case CCValAssign::BCvt:
  2242. Arg = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), Arg);
  2243. break;
  2244. }
  2245. if (isTailCall && VA.isMemLoc() && !AfterFormalArgLoads) {
  2246. Chain = DAG.getStackArgumentTokenFactor(Chain);
  2247. AfterFormalArgLoads = true;
  2248. }
2249. // f16 arguments have their size extended to 4 bytes and are passed as if they
2250. // had been copied to the LSBs of a 32-bit register.
2251. // For that, they are passed extended to i32 (soft ABI) or to f32 (hard ABI).
  2252. if (VA.needsCustom() &&
  2253. (VA.getValVT() == MVT::f16 || VA.getValVT() == MVT::bf16)) {
  2254. Arg = MoveFromHPR(dl, DAG, VA.getLocVT(), VA.getValVT(), Arg);
  2255. } else {
  2256. // f16 arguments could have been extended prior to argument lowering.
2257. // Mask these arguments if this is a CMSE non-secure call.
  2258. auto ArgVT = Outs[realArgIdx].ArgVT;
  2259. if (isCmseNSCall && (ArgVT == MVT::f16)) {
  2260. auto LocBits = VA.getLocVT().getSizeInBits();
  2261. auto MaskValue = APInt::getLowBitsSet(LocBits, ArgVT.getSizeInBits());
  2262. SDValue Mask =
  2263. DAG.getConstant(MaskValue, dl, MVT::getIntegerVT(LocBits));
  2264. Arg = DAG.getNode(ISD::BITCAST, dl, MVT::getIntegerVT(LocBits), Arg);
  2265. Arg = DAG.getNode(ISD::AND, dl, MVT::getIntegerVT(LocBits), Arg, Mask);
  2266. Arg = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), Arg);
  2267. }
  2268. }
  2269. // f64 and v2f64 might be passed in i32 pairs and must be split into pieces
  2270. if (VA.needsCustom() && VA.getLocVT() == MVT::v2f64) {
  2271. SDValue Op0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg,
  2272. DAG.getConstant(0, dl, MVT::i32));
  2273. SDValue Op1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg,
  2274. DAG.getConstant(1, dl, MVT::i32));
  2275. PassF64ArgInRegs(dl, DAG, Chain, Op0, RegsToPass, VA, ArgLocs[++i],
  2276. StackPtr, MemOpChains, isTailCall, SPDiff);
  2277. VA = ArgLocs[++i]; // skip ahead to next loc
  2278. if (VA.isRegLoc()) {
  2279. PassF64ArgInRegs(dl, DAG, Chain, Op1, RegsToPass, VA, ArgLocs[++i],
  2280. StackPtr, MemOpChains, isTailCall, SPDiff);
  2281. } else {
  2282. assert(VA.isMemLoc());
  2283. SDValue DstAddr;
  2284. MachinePointerInfo DstInfo;
  2285. std::tie(DstAddr, DstInfo) =
  2286. computeAddrForCallArg(dl, DAG, VA, StackPtr, isTailCall, SPDiff);
  2287. MemOpChains.push_back(DAG.getStore(Chain, dl, Op1, DstAddr, DstInfo));
  2288. }
  2289. } else if (VA.needsCustom() && VA.getLocVT() == MVT::f64) {
  2290. PassF64ArgInRegs(dl, DAG, Chain, Arg, RegsToPass, VA, ArgLocs[++i],
  2291. StackPtr, MemOpChains, isTailCall, SPDiff);
  2292. } else if (VA.isRegLoc()) {
  2293. if (realArgIdx == 0 && Flags.isReturned() && !Flags.isSwiftSelf() &&
  2294. Outs[0].VT == MVT::i32) {
  2295. assert(VA.getLocVT() == MVT::i32 &&
  2296. "unexpected calling convention register assignment");
  2297. assert(!Ins.empty() && Ins[0].VT == MVT::i32 &&
  2298. "unexpected use of 'returned'");
  2299. isThisReturn = true;
  2300. }
  2301. const TargetOptions &Options = DAG.getTarget().Options;
  2302. if (Options.EmitCallSiteInfo)
  2303. CSInfo.emplace_back(VA.getLocReg(), i);
  2304. RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
  2305. } else if (isByVal) {
  2306. assert(VA.isMemLoc());
  2307. unsigned offset = 0;
  2308. // True if this byval aggregate will be split between registers
  2309. // and memory.
  2310. unsigned ByValArgsCount = CCInfo.getInRegsParamsCount();
  2311. unsigned CurByValIdx = CCInfo.getInRegsParamsProcessed();
  2312. if (CurByValIdx < ByValArgsCount) {
  2313. unsigned RegBegin, RegEnd;
  2314. CCInfo.getInRegsParamInfo(CurByValIdx, RegBegin, RegEnd);
  2315. EVT PtrVT =
  2316. DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
  2317. unsigned int i, j;
  2318. for (i = 0, j = RegBegin; j < RegEnd; i++, j++) {
  2319. SDValue Const = DAG.getConstant(4*i, dl, MVT::i32);
  2320. SDValue AddArg = DAG.getNode(ISD::ADD, dl, PtrVT, Arg, Const);
  2321. SDValue Load =
  2322. DAG.getLoad(PtrVT, dl, Chain, AddArg, MachinePointerInfo(),
  2323. DAG.InferPtrAlign(AddArg));
  2324. MemOpChains.push_back(Load.getValue(1));
  2325. RegsToPass.push_back(std::make_pair(j, Load));
  2326. }
2327. // If the parameter size exceeds the register area, the "offset" value
2328. // helps us calculate the stack slot for the remaining part properly.
  2329. offset = RegEnd - RegBegin;
  2330. CCInfo.nextInRegsParam();
  2331. }
  2332. if (Flags.getByValSize() > 4*offset) {
  2333. auto PtrVT = getPointerTy(DAG.getDataLayout());
  2334. SDValue Dst;
  2335. MachinePointerInfo DstInfo;
  2336. std::tie(Dst, DstInfo) =
  2337. computeAddrForCallArg(dl, DAG, VA, StackPtr, isTailCall, SPDiff);
  2338. SDValue SrcOffset = DAG.getIntPtrConstant(4*offset, dl);
  2339. SDValue Src = DAG.getNode(ISD::ADD, dl, PtrVT, Arg, SrcOffset);
  2340. SDValue SizeNode = DAG.getConstant(Flags.getByValSize() - 4*offset, dl,
  2341. MVT::i32);
  2342. SDValue AlignNode =
  2343. DAG.getConstant(Flags.getNonZeroByValAlign().value(), dl, MVT::i32);
  2344. SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
  2345. SDValue Ops[] = { Chain, Dst, Src, SizeNode, AlignNode};
  2346. MemOpChains.push_back(DAG.getNode(ARMISD::COPY_STRUCT_BYVAL, dl, VTs,
  2347. Ops));
  2348. }
  2349. } else {
  2350. assert(VA.isMemLoc());
  2351. SDValue DstAddr;
  2352. MachinePointerInfo DstInfo;
  2353. std::tie(DstAddr, DstInfo) =
  2354. computeAddrForCallArg(dl, DAG, VA, StackPtr, isTailCall, SPDiff);
  2355. SDValue Store = DAG.getStore(Chain, dl, Arg, DstAddr, DstInfo);
  2356. MemOpChains.push_back(Store);
  2357. }
  2358. }
  2359. if (!MemOpChains.empty())
  2360. Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains);
  2361. // Build a sequence of copy-to-reg nodes chained together with token chain
  2362. // and flag operands which copy the outgoing args into the appropriate regs.
  2363. SDValue InFlag;
  2364. for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
  2365. Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
  2366. RegsToPass[i].second, InFlag);
  2367. InFlag = Chain.getValue(1);
  2368. }
  2369. // If the callee is a GlobalAddress/ExternalSymbol node (quite common, every
  2370. // direct call is) turn it into a TargetGlobalAddress/TargetExternalSymbol
  2371. // node so that legalize doesn't hack it.
  2372. bool isDirect = false;
  2373. const TargetMachine &TM = getTargetMachine();
  2374. const Module *Mod = MF.getFunction().getParent();
  2375. const GlobalValue *GVal = nullptr;
  2376. if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee))
  2377. GVal = G->getGlobal();
  2378. bool isStub =
  2379. !TM.shouldAssumeDSOLocal(*Mod, GVal) && Subtarget->isTargetMachO();
  2380. bool isARMFunc = !Subtarget->isThumb() || (isStub && !Subtarget->isMClass());
  2381. bool isLocalARMFunc = false;
  2382. auto PtrVt = getPointerTy(DAG.getDataLayout());
  2383. if (Subtarget->genLongCalls()) {
  2384. assert((!isPositionIndependent() || Subtarget->isTargetWindows()) &&
  2385. "long-calls codegen is not position independent!");
  2386. // Handle a global address or an external symbol. If it's not one of
  2387. // those, the target's already in a register, so we don't need to do
  2388. // anything extra.
  2389. if (isa<GlobalAddressSDNode>(Callee)) {
  2390. // When generating execute-only code we use movw movt pair.
  2391. // Currently execute-only is only available for architectures that
  2392. // support movw movt, so we are safe to assume that.
  2393. if (Subtarget->genExecuteOnly()) {
  2394. assert(Subtarget->useMovt() &&
  2395. "long-calls with execute-only requires movt and movw!");
  2396. ++NumMovwMovt;
  2397. Callee = DAG.getNode(ARMISD::Wrapper, dl, PtrVt,
  2398. DAG.getTargetGlobalAddress(GVal, dl, PtrVt));
  2399. } else {
  2400. // Create a constant pool entry for the callee address
  2401. unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
  2402. ARMConstantPoolValue *CPV = ARMConstantPoolConstant::Create(
  2403. GVal, ARMPCLabelIndex, ARMCP::CPValue, 0);
  2404. // Get the address of the callee into a register
  2405. SDValue Addr = DAG.getTargetConstantPool(CPV, PtrVt, Align(4));
  2406. Addr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Addr);
  2407. Callee = DAG.getLoad(
  2408. PtrVt, dl, DAG.getEntryNode(), Addr,
  2409. MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
  2410. }
  2411. } else if (ExternalSymbolSDNode *S=dyn_cast<ExternalSymbolSDNode>(Callee)) {
  2412. const char *Sym = S->getSymbol();
  2413. // When generating execute-only code we use movw movt pair.
  2414. // Currently execute-only is only available for architectures that
  2415. // support movw movt, so we are safe to assume that.
  2416. if (Subtarget->genExecuteOnly()) {
  2417. assert(Subtarget->useMovt() &&
  2418. "long-calls with execute-only requires movt and movw!");
  2419. ++NumMovwMovt;
2420. Callee = DAG.getNode(ARMISD::Wrapper, dl, PtrVt,
2421. DAG.getTargetExternalSymbol(Sym, PtrVt));
  2422. } else {
  2423. // Create a constant pool entry for the callee address
  2424. unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
  2425. ARMConstantPoolValue *CPV = ARMConstantPoolSymbol::Create(
  2426. *DAG.getContext(), Sym, ARMPCLabelIndex, 0);
  2427. // Get the address of the callee into a register
  2428. SDValue Addr = DAG.getTargetConstantPool(CPV, PtrVt, Align(4));
  2429. Addr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Addr);
  2430. Callee = DAG.getLoad(
  2431. PtrVt, dl, DAG.getEntryNode(), Addr,
  2432. MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
  2433. }
  2434. }
  2435. } else if (isa<GlobalAddressSDNode>(Callee)) {
  2436. if (!PreferIndirect) {
  2437. isDirect = true;
  2438. bool isDef = GVal->isStrongDefinitionForLinker();
  2439. // ARM call to a local ARM function is predicable.
  2440. isLocalARMFunc = !Subtarget->isThumb() && (isDef || !ARMInterworking);
  2441. // tBX takes a register source operand.
  2442. if (isStub && Subtarget->isThumb1Only() && !Subtarget->hasV5TOps()) {
  2443. assert(Subtarget->isTargetMachO() && "WrapperPIC use on non-MachO?");
  2444. Callee = DAG.getNode(
  2445. ARMISD::WrapperPIC, dl, PtrVt,
  2446. DAG.getTargetGlobalAddress(GVal, dl, PtrVt, 0, ARMII::MO_NONLAZY));
  2447. Callee = DAG.getLoad(
  2448. PtrVt, dl, DAG.getEntryNode(), Callee,
  2449. MachinePointerInfo::getGOT(DAG.getMachineFunction()), MaybeAlign(),
  2450. MachineMemOperand::MODereferenceable |
  2451. MachineMemOperand::MOInvariant);
  2452. } else if (Subtarget->isTargetCOFF()) {
  2453. assert(Subtarget->isTargetWindows() &&
  2454. "Windows is the only supported COFF target");
  2455. unsigned TargetFlags = ARMII::MO_NO_FLAG;
  2456. if (GVal->hasDLLImportStorageClass())
  2457. TargetFlags = ARMII::MO_DLLIMPORT;
  2458. else if (!TM.shouldAssumeDSOLocal(*GVal->getParent(), GVal))
  2459. TargetFlags = ARMII::MO_COFFSTUB;
  2460. Callee = DAG.getTargetGlobalAddress(GVal, dl, PtrVt, /*offset=*/0,
  2461. TargetFlags);
  2462. if (TargetFlags & (ARMII::MO_DLLIMPORT | ARMII::MO_COFFSTUB))
  2463. Callee =
  2464. DAG.getLoad(PtrVt, dl, DAG.getEntryNode(),
  2465. DAG.getNode(ARMISD::Wrapper, dl, PtrVt, Callee),
  2466. MachinePointerInfo::getGOT(DAG.getMachineFunction()));
  2467. } else {
  2468. Callee = DAG.getTargetGlobalAddress(GVal, dl, PtrVt, 0, 0);
  2469. }
  2470. }
  2471. } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
  2472. isDirect = true;
  2473. // tBX takes a register source operand.
  2474. const char *Sym = S->getSymbol();
  2475. if (isARMFunc && Subtarget->isThumb1Only() && !Subtarget->hasV5TOps()) {
  2476. unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
  2477. ARMConstantPoolValue *CPV =
  2478. ARMConstantPoolSymbol::Create(*DAG.getContext(), Sym,
  2479. ARMPCLabelIndex, 4);
  2480. SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVt, Align(4));
  2481. CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
  2482. Callee = DAG.getLoad(
  2483. PtrVt, dl, DAG.getEntryNode(), CPAddr,
  2484. MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
  2485. SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32);
  2486. Callee = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVt, Callee, PICLabel);
  2487. } else {
  2488. Callee = DAG.getTargetExternalSymbol(Sym, PtrVt, 0);
  2489. }
  2490. }
  2491. if (isCmseNSCall) {
  2492. assert(!isARMFunc && !isDirect &&
  2493. "Cannot handle call to ARM function or direct call");
  2494. if (NumBytes > 0) {
  2495. DiagnosticInfoUnsupported Diag(DAG.getMachineFunction().getFunction(),
  2496. "call to non-secure function would "
  2497. "require passing arguments on stack",
  2498. dl.getDebugLoc());
  2499. DAG.getContext()->diagnose(Diag);
  2500. }
  2501. if (isStructRet) {
  2502. DiagnosticInfoUnsupported Diag(
  2503. DAG.getMachineFunction().getFunction(),
  2504. "call to non-secure function would return value through pointer",
  2505. dl.getDebugLoc());
  2506. DAG.getContext()->diagnose(Diag);
  2507. }
  2508. }
  2509. // FIXME: handle tail calls differently.
  2510. unsigned CallOpc;
  2511. if (Subtarget->isThumb()) {
  2512. if (GuardWithBTI)
  2513. CallOpc = ARMISD::t2CALL_BTI;
  2514. else if (isCmseNSCall)
  2515. CallOpc = ARMISD::tSECALL;
  2516. else if ((!isDirect || isARMFunc) && !Subtarget->hasV5TOps())
  2517. CallOpc = ARMISD::CALL_NOLINK;
  2518. else
  2519. CallOpc = ARMISD::CALL;
  2520. } else {
  2521. if (!isDirect && !Subtarget->hasV5TOps())
  2522. CallOpc = ARMISD::CALL_NOLINK;
  2523. else if (doesNotRet && isDirect && Subtarget->hasRetAddrStack() &&
  2524. // Emit regular call when code size is the priority
  2525. !Subtarget->hasMinSize())
  2526. // "mov lr, pc; b _foo" to avoid confusing the RSP
  2527. CallOpc = ARMISD::CALL_NOLINK;
  2528. else
  2529. CallOpc = isLocalARMFunc ? ARMISD::CALL_PRED : ARMISD::CALL;
  2530. }
  2531. // We don't usually want to end the call-sequence here because we would tidy
2532. // the frame up *after* the call; however, in the ABI-changing tail-call case
  2533. // we've carefully laid out the parameters so that when sp is reset they'll be
  2534. // in the correct location.
  2535. if (isTailCall && !isSibCall) {
  2536. Chain = DAG.getCALLSEQ_END(Chain, 0, 0, InFlag, dl);
  2537. InFlag = Chain.getValue(1);
  2538. }
  2539. std::vector<SDValue> Ops;
  2540. Ops.push_back(Chain);
  2541. Ops.push_back(Callee);
  2542. if (isTailCall) {
  2543. Ops.push_back(DAG.getTargetConstant(SPDiff, dl, MVT::i32));
  2544. }
  2545. // Add argument registers to the end of the list so that they are known live
  2546. // into the call.
  2547. for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
  2548. Ops.push_back(DAG.getRegister(RegsToPass[i].first,
  2549. RegsToPass[i].second.getValueType()));
  2550. // Add a register mask operand representing the call-preserved registers.
  2551. const uint32_t *Mask;
  2552. const ARMBaseRegisterInfo *ARI = Subtarget->getRegisterInfo();
  2553. if (isThisReturn) {
  2554. // For 'this' returns, use the R0-preserving mask if applicable
  2555. Mask = ARI->getThisReturnPreservedMask(MF, CallConv);
  2556. if (!Mask) {
  2557. // Set isThisReturn to false if the calling convention is not one that
  2558. // allows 'returned' to be modeled in this way, so LowerCallResult does
  2559. // not try to pass 'this' straight through
  2560. isThisReturn = false;
  2561. Mask = ARI->getCallPreservedMask(MF, CallConv);
  2562. }
  2563. } else
  2564. Mask = ARI->getCallPreservedMask(MF, CallConv);
  2565. assert(Mask && "Missing call preserved mask for calling convention");
  2566. Ops.push_back(DAG.getRegisterMask(Mask));
  2567. if (InFlag.getNode())
  2568. Ops.push_back(InFlag);
  2569. SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
  2570. if (isTailCall) {
  2571. MF.getFrameInfo().setHasTailCall();
  2572. SDValue Ret = DAG.getNode(ARMISD::TC_RETURN, dl, NodeTys, Ops);
  2573. DAG.addCallSiteInfo(Ret.getNode(), std::move(CSInfo));
  2574. return Ret;
  2575. }
  2576. // Returns a chain and a flag for retval copy to use.
  2577. Chain = DAG.getNode(CallOpc, dl, NodeTys, Ops);
  2578. DAG.addNoMergeSiteInfo(Chain.getNode(), CLI.NoMerge);
  2579. InFlag = Chain.getValue(1);
  2580. DAG.addCallSiteInfo(Chain.getNode(), std::move(CSInfo));
  2581. // If we're guaranteeing tail-calls will be honoured, the callee must
  2582. // pop its own argument stack on return. But this call is *not* a tail call so
  2583. // we need to undo that after it returns to restore the status-quo.
  2584. bool TailCallOpt = getTargetMachine().Options.GuaranteedTailCallOpt;
  2585. uint64_t CalleePopBytes =
  2586. canGuaranteeTCO(CallConv, TailCallOpt) ? alignTo(NumBytes, 16) : -1ULL;
  2587. Chain = DAG.getCALLSEQ_END(Chain, NumBytes, CalleePopBytes, InFlag, dl);
  2588. if (!Ins.empty())
  2589. InFlag = Chain.getValue(1);
  2590. // Handle result values, copying them out of physregs into vregs that we
  2591. // return.
  2592. return LowerCallResult(Chain, InFlag, CallConv, isVarArg, Ins, dl, DAG,
  2593. InVals, isThisReturn,
  2594. isThisReturn ? OutVals[0] : SDValue());
  2595. }
  2596. /// HandleByVal - Every parameter *after* a byval parameter is passed
  2597. /// on the stack. Remember the next parameter register to allocate,
2598. /// and then confiscate the rest of the parameter registers to ensure
  2599. /// this.
  2600. void ARMTargetLowering::HandleByVal(CCState *State, unsigned &Size,
  2601. Align Alignment) const {
  2602. // Byval (as with any stack) slots are always at least 4 byte aligned.
  2603. Alignment = std::max(Alignment, Align(4));
  2604. unsigned Reg = State->AllocateReg(GPRArgRegs);
  2605. if (!Reg)
  2606. return;
  2607. unsigned AlignInRegs = Alignment.value() / 4;
  2608. unsigned Waste = (ARM::R4 - Reg) % AlignInRegs;
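// For example, with 8-byte alignment AlignInRegs is 2; if the next free
// register is R1, Waste is 1, so R1 is skipped and the byval starts in R2.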
  2609. for (unsigned i = 0; i < Waste; ++i)
  2610. Reg = State->AllocateReg(GPRArgRegs);
  2611. if (!Reg)
  2612. return;
  2613. unsigned Excess = 4 * (ARM::R4 - Reg);
2614. // Special case when NSAA != SP and the parameter size is greater than the
2615. // size of all remaining GPR regs. In that case we can't split the parameter;
2616. // we must send it to the stack. We also must set the NCRN to R4, so we waste
2617. // all remaining registers.
  2618. const unsigned NSAAOffset = State->getNextStackOffset();
  2619. if (NSAAOffset != 0 && Size > Excess) {
  2620. while (State->AllocateReg(GPRArgRegs))
  2621. ;
  2622. return;
  2623. }
2624. // The first register for the byval parameter is the first register that
2625. // wasn't allocated before this method call, i.e. "Reg".
2626. // If the parameter is small enough to fit in the range [Reg, R4), then the
2627. // end (one past the last) register would be Reg + param-size-in-regs;
2628. // otherwise the parameter is split between registers and the stack, and
2629. // the end register is R4.
  2630. unsigned ByValRegBegin = Reg;
  2631. unsigned ByValRegEnd = std::min<unsigned>(Reg + Size / 4, ARM::R4);
  2632. State->addInRegsParamInfo(ByValRegBegin, ByValRegEnd);
// Note that the first register was already allocated at the beginning of
// this function; allocate the remaining registers we need.
  2635. for (unsigned i = Reg + 1; i != ByValRegEnd; ++i)
  2636. State->AllocateReg(GPRArgRegs);
  2637. // A byval parameter that is split between registers and memory needs its
  2638. // size truncated here.
  2639. // In the case where the entire structure fits in registers, we set the
  2640. // size in memory to zero.
  2641. Size = std::max<int>(Size - Excess, 0);
  2642. }
  2643. /// MatchingStackOffset - Return true if the given stack call argument is
  2644. /// already available in the same position (relatively) of the caller's
  2645. /// incoming argument stack.
  2646. static
  2647. bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags,
  2648. MachineFrameInfo &MFI, const MachineRegisterInfo *MRI,
  2649. const TargetInstrInfo *TII) {
  2650. unsigned Bytes = Arg.getValueSizeInBits() / 8;
  2651. int FI = std::numeric_limits<int>::max();
  2652. if (Arg.getOpcode() == ISD::CopyFromReg) {
  2653. Register VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg();
  2654. if (!VR.isVirtual())
  2655. return false;
  2656. MachineInstr *Def = MRI->getVRegDef(VR);
  2657. if (!Def)
  2658. return false;
  2659. if (!Flags.isByVal()) {
  2660. if (!TII->isLoadFromStackSlot(*Def, FI))
  2661. return false;
  2662. } else {
  2663. return false;
  2664. }
  2665. } else if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Arg)) {
  2666. if (Flags.isByVal())
  2667. // ByVal argument is passed in as a pointer but it's now being
  2668. // dereferenced. e.g.
  2669. // define @foo(%struct.X* %A) {
  2670. // tail call @bar(%struct.X* byval %A)
  2671. // }
  2672. return false;
  2673. SDValue Ptr = Ld->getBasePtr();
  2674. FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr);
  2675. if (!FINode)
  2676. return false;
  2677. FI = FINode->getIndex();
  2678. } else
  2679. return false;
  2680. assert(FI != std::numeric_limits<int>::max());
  2681. if (!MFI.isFixedObjectIndex(FI))
  2682. return false;
  2683. return Offset == MFI.getObjectOffset(FI) && Bytes == MFI.getObjectSize(FI);
  2684. }
  2685. /// IsEligibleForTailCallOptimization - Check whether the call is eligible
  2686. /// for tail call optimization. Targets which want to do tail call
  2687. /// optimization should implement this function.
  2688. bool ARMTargetLowering::IsEligibleForTailCallOptimization(
  2689. SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg,
  2690. bool isCalleeStructRet, bool isCallerStructRet,
  2691. const SmallVectorImpl<ISD::OutputArg> &Outs,
  2692. const SmallVectorImpl<SDValue> &OutVals,
  2693. const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG,
  2694. const bool isIndirect) const {
  2695. MachineFunction &MF = DAG.getMachineFunction();
  2696. const Function &CallerF = MF.getFunction();
  2697. CallingConv::ID CallerCC = CallerF.getCallingConv();
  2698. assert(Subtarget->supportsTailCall());
  2699. // Indirect tail calls cannot be optimized for Thumb1 if the args
  2700. // to the call take up r0-r3. The reason is that there are no legal registers
  2701. // left to hold the pointer to the function to be called.
  2702. // Similarly, if the function uses return address sign and authentication,
  2703. // r12 is needed to hold the PAC and is not available to hold the callee
  2704. // address.
  2705. if (Outs.size() >= 4 &&
  2706. (!isa<GlobalAddressSDNode>(Callee.getNode()) || isIndirect)) {
  2707. if (Subtarget->isThumb1Only())
  2708. return false;
  2709. // Conservatively assume the function spills LR.
  2710. if (MF.getInfo<ARMFunctionInfo>()->shouldSignReturnAddress(true))
  2711. return false;
  2712. }
  2713. // Look for obvious safe cases to perform tail call optimization that do not
  2714. // require ABI changes. This is what gcc calls sibcall.
  2715. // Exception-handling functions need a special set of instructions to indicate
  2716. // a return to the hardware. Tail-calling another function would probably
  2717. // break this.
  2718. if (CallerF.hasFnAttribute("interrupt"))
  2719. return false;
  2720. if (canGuaranteeTCO(CalleeCC, getTargetMachine().Options.GuaranteedTailCallOpt))
  2721. return CalleeCC == CallerCC;
  2722. // Also avoid sibcall optimization if either caller or callee uses struct
  2723. // return semantics.
  2724. if (isCalleeStructRet || isCallerStructRet)
  2725. return false;
  2726. // Externally-defined functions with weak linkage should not be
  2727. // tail-called on ARM when the OS does not support dynamic
  2728. // pre-emption of symbols, as the AAELF spec requires normal calls
  2729. // to undefined weak functions to be replaced with a NOP or jump to the
  2730. // next instruction. The behaviour of branch instructions in this
  2731. // situation (as used for tail calls) is implementation-defined, so we
  2732. // cannot rely on the linker replacing the tail call with a return.
  2733. if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
  2734. const GlobalValue *GV = G->getGlobal();
  2735. const Triple &TT = getTargetMachine().getTargetTriple();
  2736. if (GV->hasExternalWeakLinkage() &&
  2737. (!TT.isOSWindows() || TT.isOSBinFormatELF() || TT.isOSBinFormatMachO()))
  2738. return false;
  2739. }
  2740. // Check that the call results are passed in the same way.
  2741. LLVMContext &C = *DAG.getContext();
  2742. if (!CCState::resultsCompatible(
  2743. getEffectiveCallingConv(CalleeCC, isVarArg),
  2744. getEffectiveCallingConv(CallerCC, CallerF.isVarArg()), MF, C, Ins,
  2745. CCAssignFnForReturn(CalleeCC, isVarArg),
  2746. CCAssignFnForReturn(CallerCC, CallerF.isVarArg())))
  2747. return false;
  2748. // The callee has to preserve all registers the caller needs to preserve.
  2749. const ARMBaseRegisterInfo *TRI = Subtarget->getRegisterInfo();
  2750. const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
  2751. if (CalleeCC != CallerCC) {
  2752. const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
  2753. if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
  2754. return false;
  2755. }
// If the caller's vararg or byval argument has been split between registers
// and stack, do not perform a tail call, since part of the argument is in
// the caller's local frame.
  2759. const ARMFunctionInfo *AFI_Caller = MF.getInfo<ARMFunctionInfo>();
  2760. if (AFI_Caller->getArgRegsSaveSize())
  2761. return false;
  2762. // If the callee takes no arguments then go on to check the results of the
  2763. // call.
  2764. if (!Outs.empty()) {
  2765. // Check if stack adjustment is needed. For now, do not do this if any
  2766. // argument is passed on the stack.
  2767. SmallVector<CCValAssign, 16> ArgLocs;
  2768. CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C);
  2769. CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, isVarArg));
  2770. if (CCInfo.getNextStackOffset()) {
  2771. // Check if the arguments are already laid out in the right way as
  2772. // the caller's fixed stack objects.
  2773. MachineFrameInfo &MFI = MF.getFrameInfo();
  2774. const MachineRegisterInfo *MRI = &MF.getRegInfo();
  2775. const TargetInstrInfo *TII = Subtarget->getInstrInfo();
  2776. for (unsigned i = 0, realArgIdx = 0, e = ArgLocs.size();
  2777. i != e;
  2778. ++i, ++realArgIdx) {
  2779. CCValAssign &VA = ArgLocs[i];
  2780. EVT RegVT = VA.getLocVT();
  2781. SDValue Arg = OutVals[realArgIdx];
  2782. ISD::ArgFlagsTy Flags = Outs[realArgIdx].Flags;
  2783. if (VA.getLocInfo() == CCValAssign::Indirect)
  2784. return false;
  2785. if (VA.needsCustom() && (RegVT == MVT::f64 || RegVT == MVT::v2f64)) {
  2786. // f64 and vector types are split into multiple registers or
  2787. // register/stack-slot combinations. The types will not match
  2788. // the registers; give up on memory f64 refs until we figure
  2789. // out what to do about this.
  2790. if (!VA.isRegLoc())
  2791. return false;
  2792. if (!ArgLocs[++i].isRegLoc())
  2793. return false;
  2794. if (RegVT == MVT::v2f64) {
  2795. if (!ArgLocs[++i].isRegLoc())
  2796. return false;
  2797. if (!ArgLocs[++i].isRegLoc())
  2798. return false;
  2799. }
  2800. } else if (!VA.isRegLoc()) {
  2801. if (!MatchingStackOffset(Arg, VA.getLocMemOffset(), Flags,
  2802. MFI, MRI, TII))
  2803. return false;
  2804. }
  2805. }
  2806. }
  2807. const MachineRegisterInfo &MRI = MF.getRegInfo();
  2808. if (!parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals))
  2809. return false;
  2810. }
  2811. return true;
  2812. }
  2813. bool
  2814. ARMTargetLowering::CanLowerReturn(CallingConv::ID CallConv,
  2815. MachineFunction &MF, bool isVarArg,
  2816. const SmallVectorImpl<ISD::OutputArg> &Outs,
  2817. LLVMContext &Context) const {
  2818. SmallVector<CCValAssign, 16> RVLocs;
  2819. CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context);
  2820. return CCInfo.CheckReturn(Outs, CCAssignFnForReturn(CallConv, isVarArg));
  2821. }
  2822. static SDValue LowerInterruptReturn(SmallVectorImpl<SDValue> &RetOps,
  2823. const SDLoc &DL, SelectionDAG &DAG) {
  2824. const MachineFunction &MF = DAG.getMachineFunction();
  2825. const Function &F = MF.getFunction();
  2826. StringRef IntKind = F.getFnAttribute("interrupt").getValueAsString();
// See ARM ARM v7 B1.8.3. On exception entry LR is set to a possibly offset
// version of the "preferred return address". These offsets affect the return
// instruction if this is a return from PL1 without hypervisor extensions.
//   IRQ/FIQ: +4    "subs pc, lr, #4"
//   SWI:      0    "subs pc, lr, #0"
//   ABORT:   +4    "subs pc, lr, #4"
//   UNDEF:   +4/+2 "subs pc, lr, #0"
// The UNDEF offset varies depending on whether the exception came from ARM
// or Thumb mode. Like GCC, we throw our hands up in disgust and pretend
// it's 0.
  2836. int64_t LROffset;
  2837. if (IntKind == "" || IntKind == "IRQ" || IntKind == "FIQ" ||
  2838. IntKind == "ABORT")
  2839. LROffset = 4;
  2840. else if (IntKind == "SWI" || IntKind == "UNDEF")
  2841. LROffset = 0;
  2842. else
  2843. report_fatal_error("Unsupported interrupt attribute. If present, value "
  2844. "must be one of: IRQ, FIQ, SWI, ABORT or UNDEF");
  2845. RetOps.insert(RetOps.begin() + 1,
  2846. DAG.getConstant(LROffset, DL, MVT::i32, false));
  2847. return DAG.getNode(ARMISD::INTRET_FLAG, DL, MVT::Other, RetOps);
  2848. }
  2849. SDValue
  2850. ARMTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
  2851. bool isVarArg,
  2852. const SmallVectorImpl<ISD::OutputArg> &Outs,
  2853. const SmallVectorImpl<SDValue> &OutVals,
  2854. const SDLoc &dl, SelectionDAG &DAG) const {
  2855. // CCValAssign - represent the assignment of the return value to a location.
  2856. SmallVector<CCValAssign, 16> RVLocs;
  2857. // CCState - Info about the registers and stack slots.
  2858. CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
  2859. *DAG.getContext());
  2860. // Analyze outgoing return values.
  2861. CCInfo.AnalyzeReturn(Outs, CCAssignFnForReturn(CallConv, isVarArg));
  2862. SDValue Flag;
  2863. SmallVector<SDValue, 4> RetOps;
  2864. RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
  2865. bool isLittleEndian = Subtarget->isLittle();
  2866. MachineFunction &MF = DAG.getMachineFunction();
  2867. ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
  2868. AFI->setReturnRegsCount(RVLocs.size());
  2869. // Report error if cmse entry function returns structure through first ptr arg.
  2870. if (AFI->isCmseNSEntryFunction() && MF.getFunction().hasStructRetAttr()) {
  2871. // Note: using an empty SDLoc(), as the first line of the function is a
  2872. // better place to report than the last line.
  2873. DiagnosticInfoUnsupported Diag(
  2874. DAG.getMachineFunction().getFunction(),
  2875. "secure entry function would return value through pointer",
  2876. SDLoc().getDebugLoc());
  2877. DAG.getContext()->diagnose(Diag);
  2878. }
  2879. // Copy the result values into the output registers.
  2880. for (unsigned i = 0, realRVLocIdx = 0;
  2881. i != RVLocs.size();
  2882. ++i, ++realRVLocIdx) {
  2883. CCValAssign &VA = RVLocs[i];
  2884. assert(VA.isRegLoc() && "Can only return in registers!");
  2885. SDValue Arg = OutVals[realRVLocIdx];
  2886. bool ReturnF16 = false;
  2887. if (Subtarget->hasFullFP16() && Subtarget->isTargetHardFloat()) {
  2888. // Half-precision return values can be returned like this:
  2889. //
// t11: f16 = fadd ...
  2891. // t12: i16 = bitcast t11
  2892. // t13: i32 = zero_extend t12
  2893. // t14: f32 = bitcast t13 <~~~~~~~ Arg
  2894. //
  2895. // to avoid code generation for bitcasts, we simply set Arg to the node
  2896. // that produces the f16 value, t11 in this case.
  2897. //
  2898. if (Arg.getValueType() == MVT::f32 && Arg.getOpcode() == ISD::BITCAST) {
  2899. SDValue ZE = Arg.getOperand(0);
  2900. if (ZE.getOpcode() == ISD::ZERO_EXTEND && ZE.getValueType() == MVT::i32) {
  2901. SDValue BC = ZE.getOperand(0);
  2902. if (BC.getOpcode() == ISD::BITCAST && BC.getValueType() == MVT::i16) {
  2903. Arg = BC.getOperand(0);
  2904. ReturnF16 = true;
  2905. }
  2906. }
  2907. }
  2908. }
  2909. switch (VA.getLocInfo()) {
  2910. default: llvm_unreachable("Unknown loc info!");
  2911. case CCValAssign::Full: break;
  2912. case CCValAssign::BCvt:
  2913. if (!ReturnF16)
  2914. Arg = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), Arg);
  2915. break;
  2916. }
// Mask f16 return values if this is a CMSE nonsecure entry function.
  2918. auto RetVT = Outs[realRVLocIdx].ArgVT;
  2919. if (AFI->isCmseNSEntryFunction() && (RetVT == MVT::f16)) {
  2920. if (VA.needsCustom() && VA.getValVT() == MVT::f16) {
  2921. Arg = MoveFromHPR(dl, DAG, VA.getLocVT(), VA.getValVT(), Arg);
  2922. } else {
  2923. auto LocBits = VA.getLocVT().getSizeInBits();
  2924. auto MaskValue = APInt::getLowBitsSet(LocBits, RetVT.getSizeInBits());
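// Illustrative example: an f16 value returned in a 32-bit location gives
// LocBits == 32 and MaskValue == 0x0000FFFF, so the AND below clears the
// upper half-word and no stale register contents leak to the nonsecure
// caller.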
  2925. SDValue Mask =
  2926. DAG.getConstant(MaskValue, dl, MVT::getIntegerVT(LocBits));
  2927. Arg = DAG.getNode(ISD::BITCAST, dl, MVT::getIntegerVT(LocBits), Arg);
  2928. Arg = DAG.getNode(ISD::AND, dl, MVT::getIntegerVT(LocBits), Arg, Mask);
  2929. Arg = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), Arg);
  2930. }
  2931. }
  2932. if (VA.needsCustom() &&
  2933. (VA.getLocVT() == MVT::v2f64 || VA.getLocVT() == MVT::f64)) {
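// Note (illustrative summary): with a soft-float style return, an f64 is
// returned in a pair of GPRs and a v2f64 in up to four GPRs. Each double is
// moved out of the VFP side with VMOVRRD below, and isLittleEndian decides
// which half lands in the lower-numbered register.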
  2934. if (VA.getLocVT() == MVT::v2f64) {
  2935. // Extract the first half and return it in two registers.
  2936. SDValue Half = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg,
  2937. DAG.getConstant(0, dl, MVT::i32));
  2938. SDValue HalfGPRs = DAG.getNode(ARMISD::VMOVRRD, dl,
  2939. DAG.getVTList(MVT::i32, MVT::i32), Half);
  2940. Chain =
  2941. DAG.getCopyToReg(Chain, dl, VA.getLocReg(),
  2942. HalfGPRs.getValue(isLittleEndian ? 0 : 1), Flag);
  2943. Flag = Chain.getValue(1);
  2944. RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
  2945. VA = RVLocs[++i]; // skip ahead to next loc
  2946. Chain =
  2947. DAG.getCopyToReg(Chain, dl, VA.getLocReg(),
  2948. HalfGPRs.getValue(isLittleEndian ? 1 : 0), Flag);
  2949. Flag = Chain.getValue(1);
  2950. RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
  2951. VA = RVLocs[++i]; // skip ahead to next loc
  2952. // Extract the 2nd half and fall through to handle it as an f64 value.
  2953. Arg = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg,
  2954. DAG.getConstant(1, dl, MVT::i32));
  2955. }
  2956. // Legalize ret f64 -> ret 2 x i32. We always have fmrrd if f64 is
  2957. // available.
  2958. SDValue fmrrd = DAG.getNode(ARMISD::VMOVRRD, dl,
  2959. DAG.getVTList(MVT::i32, MVT::i32), Arg);
  2960. Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(),
  2961. fmrrd.getValue(isLittleEndian ? 0 : 1), Flag);
  2962. Flag = Chain.getValue(1);
  2963. RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
  2964. VA = RVLocs[++i]; // skip ahead to next loc
  2965. Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(),
  2966. fmrrd.getValue(isLittleEndian ? 1 : 0), Flag);
  2967. } else
  2968. Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), Arg, Flag);
// Guarantee that all emitted copies are glued together, so that the
// register copies cannot be scheduled apart from one another.
  2971. Flag = Chain.getValue(1);
  2972. RetOps.push_back(DAG.getRegister(
  2973. VA.getLocReg(), ReturnF16 ? Arg.getValueType() : VA.getLocVT()));
  2974. }
  2975. const ARMBaseRegisterInfo *TRI = Subtarget->getRegisterInfo();
  2976. const MCPhysReg *I =
  2977. TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction());
  2978. if (I) {
  2979. for (; *I; ++I) {
  2980. if (ARM::GPRRegClass.contains(*I))
  2981. RetOps.push_back(DAG.getRegister(*I, MVT::i32));
  2982. else if (ARM::DPRRegClass.contains(*I))
  2983. RetOps.push_back(DAG.getRegister(*I, MVT::getFloatingPointVT(64)));
  2984. else
  2985. llvm_unreachable("Unexpected register class in CSRsViaCopy!");
  2986. }
  2987. }
  2988. // Update chain and glue.
  2989. RetOps[0] = Chain;
  2990. if (Flag.getNode())
  2991. RetOps.push_back(Flag);
  2992. // CPUs which aren't M-class use a special sequence to return from
  2993. // exceptions (roughly, any instruction setting pc and cpsr simultaneously,
  2994. // though we use "subs pc, lr, #N").
  2995. //
  2996. // M-class CPUs actually use a normal return sequence with a special
  2997. // (hardware-provided) value in LR, so the normal code path works.
  2998. if (DAG.getMachineFunction().getFunction().hasFnAttribute("interrupt") &&
  2999. !Subtarget->isMClass()) {
  3000. if (Subtarget->isThumb1Only())
  3001. report_fatal_error("interrupt attribute is not supported in Thumb1");
  3002. return LowerInterruptReturn(RetOps, dl, DAG);
  3003. }
  3004. ARMISD::NodeType RetNode = AFI->isCmseNSEntryFunction() ? ARMISD::SERET_FLAG :
  3005. ARMISD::RET_FLAG;
  3006. return DAG.getNode(RetNode, dl, MVT::Other, RetOps);
  3007. }
  3008. bool ARMTargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const {
  3009. if (N->getNumValues() != 1)
  3010. return false;
  3011. if (!N->hasNUsesOfValue(1, 0))
  3012. return false;
  3013. SDValue TCChain = Chain;
  3014. SDNode *Copy = *N->use_begin();
  3015. if (Copy->getOpcode() == ISD::CopyToReg) {
  3016. // If the copy has a glue operand, we conservatively assume it isn't safe to
  3017. // perform a tail call.
  3018. if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue)
  3019. return false;
  3020. TCChain = Copy->getOperand(0);
  3021. } else if (Copy->getOpcode() == ARMISD::VMOVRRD) {
  3022. SDNode *VMov = Copy;
  3023. // f64 returned in a pair of GPRs.
  3024. SmallPtrSet<SDNode*, 2> Copies;
  3025. for (SDNode *U : VMov->uses()) {
  3026. if (U->getOpcode() != ISD::CopyToReg)
  3027. return false;
  3028. Copies.insert(U);
  3029. }
  3030. if (Copies.size() > 2)
  3031. return false;
  3032. for (SDNode *U : VMov->uses()) {
  3033. SDValue UseChain = U->getOperand(0);
  3034. if (Copies.count(UseChain.getNode()))
  3035. // Second CopyToReg
  3036. Copy = U;
  3037. else {
  3038. // We are at the top of this chain.
  3039. // If the copy has a glue operand, we conservatively assume it
  3040. // isn't safe to perform a tail call.
  3041. if (U->getOperand(U->getNumOperands() - 1).getValueType() == MVT::Glue)
  3042. return false;
  3043. // First CopyToReg
  3044. TCChain = UseChain;
  3045. }
  3046. }
  3047. } else if (Copy->getOpcode() == ISD::BITCAST) {
  3048. // f32 returned in a single GPR.
  3049. if (!Copy->hasOneUse())
  3050. return false;
  3051. Copy = *Copy->use_begin();
  3052. if (Copy->getOpcode() != ISD::CopyToReg || !Copy->hasNUsesOfValue(1, 0))
  3053. return false;
  3054. // If the copy has a glue operand, we conservatively assume it isn't safe to
  3055. // perform a tail call.
  3056. if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue)
  3057. return false;
  3058. TCChain = Copy->getOperand(0);
  3059. } else {
  3060. return false;
  3061. }
  3062. bool HasRet = false;
  3063. for (const SDNode *U : Copy->uses()) {
  3064. if (U->getOpcode() != ARMISD::RET_FLAG &&
  3065. U->getOpcode() != ARMISD::INTRET_FLAG)
  3066. return false;
  3067. HasRet = true;
  3068. }
  3069. if (!HasRet)
  3070. return false;
  3071. Chain = TCChain;
  3072. return true;
  3073. }
  3074. bool ARMTargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
  3075. if (!Subtarget->supportsTailCall())
  3076. return false;
  3077. if (!CI->isTailCall())
  3078. return false;
  3079. return true;
  3080. }
// We are trying to write a 64-bit value, so we need to split it into two
// 32-bit values first and pass the low and high parts through.
  3083. static SDValue LowerWRITE_REGISTER(SDValue Op, SelectionDAG &DAG) {
  3084. SDLoc DL(Op);
  3085. SDValue WriteValue = Op->getOperand(2);
  3086. // This function is only supposed to be called for i64 type argument.
  3087. assert(WriteValue.getValueType() == MVT::i64
  3088. && "LowerWRITE_REGISTER called for non-i64 type argument.");
  3089. SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, WriteValue,
  3090. DAG.getConstant(0, DL, MVT::i32));
  3091. SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, WriteValue,
  3092. DAG.getConstant(1, DL, MVT::i32));
  3093. SDValue Ops[] = { Op->getOperand(0), Op->getOperand(1), Lo, Hi };
  3094. return DAG.getNode(ISD::WRITE_REGISTER, DL, MVT::Other, Ops);
  3095. }
// ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as
// their target counterpart wrapped in the ARMISD::Wrapper node. Suppose N is
// one of the above-mentioned nodes. It has to be wrapped because otherwise
// Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only
// be used to form an addressing mode. These wrapped nodes will be selected
// into MOVi.
  3102. SDValue ARMTargetLowering::LowerConstantPool(SDValue Op,
  3103. SelectionDAG &DAG) const {
  3104. EVT PtrVT = Op.getValueType();
  3105. // FIXME there is no actual debug info here
  3106. SDLoc dl(Op);
  3107. ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
  3108. SDValue Res;
// When generating execute-only code, constant pools must be promoted to the
// global data section. It's a bit ugly that we can't share them across basic
// blocks, but this way we guarantee that execute-only behaves correctly with
// position-independent addressing modes.
  3113. if (Subtarget->genExecuteOnly()) {
  3114. auto AFI = DAG.getMachineFunction().getInfo<ARMFunctionInfo>();
  3115. auto T = const_cast<Type*>(CP->getType());
  3116. auto C = const_cast<Constant*>(CP->getConstVal());
  3117. auto M = const_cast<Module*>(DAG.getMachineFunction().
  3118. getFunction().getParent());
  3119. auto GV = new GlobalVariable(
  3120. *M, T, /*isConstant=*/true, GlobalVariable::InternalLinkage, C,
  3121. Twine(DAG.getDataLayout().getPrivateGlobalPrefix()) + "CP" +
  3122. Twine(DAG.getMachineFunction().getFunctionNumber()) + "_" +
  3123. Twine(AFI->createPICLabelUId())
  3124. );
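// The promoted global created above gets a name of the form
// <private-prefix>CP<function-number>_<pic-label-uid>, e.g. ".LCP1_0" with
// the default ELF private prefix (illustrative).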
  3125. SDValue GA = DAG.getTargetGlobalAddress(dyn_cast<GlobalValue>(GV),
  3126. dl, PtrVT);
  3127. return LowerGlobalAddress(GA, DAG);
  3128. }
  3129. // The 16-bit ADR instruction can only encode offsets that are multiples of 4,
  3130. // so we need to align to at least 4 bytes when we don't have 32-bit ADR.
  3131. Align CPAlign = CP->getAlign();
  3132. if (Subtarget->isThumb1Only())
  3133. CPAlign = std::max(CPAlign, Align(4));
  3134. if (CP->isMachineConstantPoolEntry())
  3135. Res =
  3136. DAG.getTargetConstantPool(CP->getMachineCPVal(), PtrVT, CPAlign);
  3137. else
  3138. Res = DAG.getTargetConstantPool(CP->getConstVal(), PtrVT, CPAlign);
  3139. return DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Res);
  3140. }
  3141. unsigned ARMTargetLowering::getJumpTableEncoding() const {
  3142. return MachineJumpTableInfo::EK_Inline;
  3143. }
  3144. SDValue ARMTargetLowering::LowerBlockAddress(SDValue Op,
  3145. SelectionDAG &DAG) const {
  3146. MachineFunction &MF = DAG.getMachineFunction();
  3147. ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
  3148. unsigned ARMPCLabelIndex = 0;
  3149. SDLoc DL(Op);
  3150. EVT PtrVT = getPointerTy(DAG.getDataLayout());
  3151. const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
  3152. SDValue CPAddr;
  3153. bool IsPositionIndependent = isPositionIndependent() || Subtarget->isROPI();
  3154. if (!IsPositionIndependent) {
  3155. CPAddr = DAG.getTargetConstantPool(BA, PtrVT, Align(4));
  3156. } else {
  3157. unsigned PCAdj = Subtarget->isThumb() ? 4 : 8;
  3158. ARMPCLabelIndex = AFI->createPICLabelUId();
  3159. ARMConstantPoolValue *CPV =
  3160. ARMConstantPoolConstant::Create(BA, ARMPCLabelIndex,
  3161. ARMCP::CPBlockAddress, PCAdj);
  3162. CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, Align(4));
  3163. }
  3164. CPAddr = DAG.getNode(ARMISD::Wrapper, DL, PtrVT, CPAddr);
  3165. SDValue Result = DAG.getLoad(
  3166. PtrVT, DL, DAG.getEntryNode(), CPAddr,
  3167. MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
  3168. if (!IsPositionIndependent)
  3169. return Result;
  3170. SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, DL, MVT::i32);
  3171. return DAG.getNode(ARMISD::PIC_ADD, DL, PtrVT, Result, PICLabel);
  3172. }
  3173. /// Convert a TLS address reference into the correct sequence of loads
  3174. /// and calls to compute the variable's address for Darwin, and return an
  3175. /// SDValue containing the final node.
  3176. /// Darwin only has one TLS scheme which must be capable of dealing with the
  3177. /// fully general situation, in the worst case. This means:
  3178. /// + "extern __thread" declaration.
  3179. /// + Defined in a possibly unknown dynamic library.
  3180. ///
  3181. /// The general system is that each __thread variable has a [3 x i32] descriptor
  3182. /// which contains information used by the runtime to calculate the address. The
  3183. /// only part of this the compiler needs to know about is the first word, which
  3184. /// contains a function pointer that must be called with the address of the
  3185. /// entire descriptor in "r0".
  3186. ///
  3187. /// Since this descriptor may be in a different unit, in general access must
  3188. /// proceed along the usual ARM rules. A common sequence to produce is:
  3189. ///
  3190. /// movw rT1, :lower16:_var$non_lazy_ptr
  3191. /// movt rT1, :upper16:_var$non_lazy_ptr
  3192. /// ldr r0, [rT1]
  3193. /// ldr rT2, [r0]
  3194. /// blx rT2
  3195. /// [...address now in r0...]
  3196. SDValue
  3197. ARMTargetLowering::LowerGlobalTLSAddressDarwin(SDValue Op,
  3198. SelectionDAG &DAG) const {
  3199. assert(Subtarget->isTargetDarwin() &&
  3200. "This function expects a Darwin target");
  3201. SDLoc DL(Op);
// The first step is to get the address of the actual global symbol. This is
// where the TLS descriptor lives.
  3204. SDValue DescAddr = LowerGlobalAddressDarwin(Op, DAG);
  3205. // The first entry in the descriptor is a function pointer that we must call
  3206. // to obtain the address of the variable.
  3207. SDValue Chain = DAG.getEntryNode();
  3208. SDValue FuncTLVGet = DAG.getLoad(
  3209. MVT::i32, DL, Chain, DescAddr,
  3210. MachinePointerInfo::getGOT(DAG.getMachineFunction()), Align(4),
  3211. MachineMemOperand::MONonTemporal | MachineMemOperand::MODereferenceable |
  3212. MachineMemOperand::MOInvariant);
  3213. Chain = FuncTLVGet.getValue(1);
  3214. MachineFunction &F = DAG.getMachineFunction();
  3215. MachineFrameInfo &MFI = F.getFrameInfo();
  3216. MFI.setAdjustsStack(true);
  3217. // TLS calls preserve all registers except those that absolutely must be
  3218. // trashed: R0 (it takes an argument), LR (it's a call) and CPSR (let's not be
  3219. // silly).
  3220. auto TRI =
  3221. getTargetMachine().getSubtargetImpl(F.getFunction())->getRegisterInfo();
  3222. auto ARI = static_cast<const ARMRegisterInfo *>(TRI);
  3223. const uint32_t *Mask = ARI->getTLSCallPreservedMask(DAG.getMachineFunction());
// Finally, we can make the call. This is just a degenerate version of a
// normal ARM call node: r0 takes the address of the descriptor, and the
// call returns the address of the variable in this thread.
  3227. Chain = DAG.getCopyToReg(Chain, DL, ARM::R0, DescAddr, SDValue());
  3228. Chain =
  3229. DAG.getNode(ARMISD::CALL, DL, DAG.getVTList(MVT::Other, MVT::Glue),
  3230. Chain, FuncTLVGet, DAG.getRegister(ARM::R0, MVT::i32),
  3231. DAG.getRegisterMask(Mask), Chain.getValue(1));
  3232. return DAG.getCopyFromReg(Chain, DL, ARM::R0, MVT::i32, Chain.getValue(1));
  3233. }
  3234. SDValue
  3235. ARMTargetLowering::LowerGlobalTLSAddressWindows(SDValue Op,
  3236. SelectionDAG &DAG) const {
  3237. assert(Subtarget->isTargetWindows() && "Windows specific TLS lowering");
  3238. SDValue Chain = DAG.getEntryNode();
  3239. EVT PtrVT = getPointerTy(DAG.getDataLayout());
  3240. SDLoc DL(Op);
  3241. // Load the current TEB (thread environment block)
  3242. SDValue Ops[] = {Chain,
  3243. DAG.getTargetConstant(Intrinsic::arm_mrc, DL, MVT::i32),
  3244. DAG.getTargetConstant(15, DL, MVT::i32),
  3245. DAG.getTargetConstant(0, DL, MVT::i32),
  3246. DAG.getTargetConstant(13, DL, MVT::i32),
  3247. DAG.getTargetConstant(0, DL, MVT::i32),
  3248. DAG.getTargetConstant(2, DL, MVT::i32)};
  3249. SDValue CurrentTEB = DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL,
  3250. DAG.getVTList(MVT::i32, MVT::Other), Ops);
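// The intrinsic operands above encode "mrc p15, #0, <Rt>, c13, c0, #2",
// i.e. a read of the CP15 c13 thread ID register that Windows uses to hold
// the TEB pointer (explanatory note).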
  3251. SDValue TEB = CurrentTEB.getValue(0);
  3252. Chain = CurrentTEB.getValue(1);
  3253. // Load the ThreadLocalStoragePointer from the TEB
  3254. // A pointer to the TLS array is located at offset 0x2c from the TEB.
  3255. SDValue TLSArray =
  3256. DAG.getNode(ISD::ADD, DL, PtrVT, TEB, DAG.getIntPtrConstant(0x2c, DL));
  3257. TLSArray = DAG.getLoad(PtrVT, DL, Chain, TLSArray, MachinePointerInfo());
// The pointer to the thread's TLS data area is located at the offset
// TLSIndex * 4 from the start of the TLS array.
  3260. // Load the TLS index from the C runtime
  3261. SDValue TLSIndex =
  3262. DAG.getTargetExternalSymbol("_tls_index", PtrVT, ARMII::MO_NO_FLAG);
  3263. TLSIndex = DAG.getNode(ARMISD::Wrapper, DL, PtrVT, TLSIndex);
  3264. TLSIndex = DAG.getLoad(PtrVT, DL, Chain, TLSIndex, MachinePointerInfo());
  3265. SDValue Slot = DAG.getNode(ISD::SHL, DL, PtrVT, TLSIndex,
  3266. DAG.getConstant(2, DL, MVT::i32));
  3267. SDValue TLS = DAG.getLoad(PtrVT, DL, Chain,
  3268. DAG.getNode(ISD::ADD, DL, PtrVT, TLSArray, Slot),
  3269. MachinePointerInfo());
  3270. // Get the offset of the start of the .tls section (section base)
  3271. const auto *GA = cast<GlobalAddressSDNode>(Op);
  3272. auto *CPV = ARMConstantPoolConstant::Create(GA->getGlobal(), ARMCP::SECREL);
  3273. SDValue Offset = DAG.getLoad(
  3274. PtrVT, DL, Chain,
  3275. DAG.getNode(ARMISD::Wrapper, DL, MVT::i32,
  3276. DAG.getTargetConstantPool(CPV, PtrVT, Align(4))),
  3277. MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
  3278. return DAG.getNode(ISD::ADD, DL, PtrVT, TLS, Offset);
  3279. }
  3280. // Lower ISD::GlobalTLSAddress using the "general dynamic" model
  3281. SDValue
  3282. ARMTargetLowering::LowerToTLSGeneralDynamicModel(GlobalAddressSDNode *GA,
  3283. SelectionDAG &DAG) const {
  3284. SDLoc dl(GA);
  3285. EVT PtrVT = getPointerTy(DAG.getDataLayout());
  3286. unsigned char PCAdj = Subtarget->isThumb() ? 4 : 8;
  3287. MachineFunction &MF = DAG.getMachineFunction();
  3288. ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
  3289. unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
  3290. ARMConstantPoolValue *CPV =
  3291. ARMConstantPoolConstant::Create(GA->getGlobal(), ARMPCLabelIndex,
  3292. ARMCP::CPValue, PCAdj, ARMCP::TLSGD, true);
  3293. SDValue Argument = DAG.getTargetConstantPool(CPV, PtrVT, Align(4));
  3294. Argument = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Argument);
  3295. Argument = DAG.getLoad(
  3296. PtrVT, dl, DAG.getEntryNode(), Argument,
  3297. MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
  3298. SDValue Chain = Argument.getValue(1);
  3299. SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32);
  3300. Argument = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVT, Argument, PICLabel);
  3301. // call __tls_get_addr.
  3302. ArgListTy Args;
  3303. ArgListEntry Entry;
  3304. Entry.Node = Argument;
  3305. Entry.Ty = (Type *) Type::getInt32Ty(*DAG.getContext());
  3306. Args.push_back(Entry);
  3307. // FIXME: is there useful debug info available here?
  3308. TargetLowering::CallLoweringInfo CLI(DAG);
  3309. CLI.setDebugLoc(dl).setChain(Chain).setLibCallee(
  3310. CallingConv::C, Type::getInt32Ty(*DAG.getContext()),
  3311. DAG.getExternalSymbol("__tls_get_addr", PtrVT), std::move(Args));
  3312. std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
  3313. return CallResult.first;
  3314. }
  3315. // Lower ISD::GlobalTLSAddress using the "initial exec" or
  3316. // "local exec" model.
  3317. SDValue
  3318. ARMTargetLowering::LowerToTLSExecModels(GlobalAddressSDNode *GA,
  3319. SelectionDAG &DAG,
  3320. TLSModel::Model model) const {
  3321. const GlobalValue *GV = GA->getGlobal();
  3322. SDLoc dl(GA);
  3323. SDValue Offset;
  3324. SDValue Chain = DAG.getEntryNode();
  3325. EVT PtrVT = getPointerTy(DAG.getDataLayout());
  3326. // Get the Thread Pointer
  3327. SDValue ThreadPointer = DAG.getNode(ARMISD::THREAD_POINTER, dl, PtrVT);
  3328. if (model == TLSModel::InitialExec) {
  3329. MachineFunction &MF = DAG.getMachineFunction();
  3330. ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
  3331. unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
  3332. // Initial exec model.
  3333. unsigned char PCAdj = Subtarget->isThumb() ? 4 : 8;
  3334. ARMConstantPoolValue *CPV =
  3335. ARMConstantPoolConstant::Create(GA->getGlobal(), ARMPCLabelIndex,
  3336. ARMCP::CPValue, PCAdj, ARMCP::GOTTPOFF,
  3337. true);
  3338. Offset = DAG.getTargetConstantPool(CPV, PtrVT, Align(4));
  3339. Offset = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Offset);
  3340. Offset = DAG.getLoad(
  3341. PtrVT, dl, Chain, Offset,
  3342. MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
  3343. Chain = Offset.getValue(1);
  3344. SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32);
  3345. Offset = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVT, Offset, PICLabel);
  3346. Offset = DAG.getLoad(
  3347. PtrVT, dl, Chain, Offset,
  3348. MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
  3349. } else {
  3350. // local exec model
  3351. assert(model == TLSModel::LocalExec);
  3352. ARMConstantPoolValue *CPV =
  3353. ARMConstantPoolConstant::Create(GV, ARMCP::TPOFF);
  3354. Offset = DAG.getTargetConstantPool(CPV, PtrVT, Align(4));
  3355. Offset = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Offset);
  3356. Offset = DAG.getLoad(
  3357. PtrVT, dl, Chain, Offset,
  3358. MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
  3359. }
  3360. // The address of the thread local variable is the add of the thread
  3361. // pointer with the offset of the variable.
  3362. return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset);
  3363. }
  3364. SDValue
  3365. ARMTargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
  3366. GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
  3367. if (DAG.getTarget().useEmulatedTLS())
  3368. return LowerToTLSEmulatedModel(GA, DAG);
  3369. if (Subtarget->isTargetDarwin())
  3370. return LowerGlobalTLSAddressDarwin(Op, DAG);
  3371. if (Subtarget->isTargetWindows())
  3372. return LowerGlobalTLSAddressWindows(Op, DAG);
  3373. // TODO: implement the "local dynamic" model
  3374. assert(Subtarget->isTargetELF() && "Only ELF implemented here");
  3375. TLSModel::Model model = getTargetMachine().getTLSModel(GA->getGlobal());
  3376. switch (model) {
  3377. case TLSModel::GeneralDynamic:
  3378. case TLSModel::LocalDynamic:
  3379. return LowerToTLSGeneralDynamicModel(GA, DAG);
  3380. case TLSModel::InitialExec:
  3381. case TLSModel::LocalExec:
  3382. return LowerToTLSExecModels(GA, DAG, model);
  3383. }
  3384. llvm_unreachable("bogus TLS model");
  3385. }
  3386. /// Return true if all users of V are within function F, looking through
  3387. /// ConstantExprs.
  3388. static bool allUsersAreInFunction(const Value *V, const Function *F) {
  3389. SmallVector<const User*,4> Worklist(V->users());
  3390. while (!Worklist.empty()) {
  3391. auto *U = Worklist.pop_back_val();
  3392. if (isa<ConstantExpr>(U)) {
  3393. append_range(Worklist, U->users());
  3394. continue;
  3395. }
  3396. auto *I = dyn_cast<Instruction>(U);
  3397. if (!I || I->getParent()->getParent() != F)
  3398. return false;
  3399. }
  3400. return true;
  3401. }
  3402. static SDValue promoteToConstantPool(const ARMTargetLowering *TLI,
  3403. const GlobalValue *GV, SelectionDAG &DAG,
  3404. EVT PtrVT, const SDLoc &dl) {
  3405. // If we're creating a pool entry for a constant global with unnamed address,
  3406. // and the global is small enough, we can emit it inline into the constant pool
  3407. // to save ourselves an indirection.
  3408. //
  3409. // This is a win if the constant is only used in one function (so it doesn't
  3410. // need to be duplicated) or duplicating the constant wouldn't increase code
  3411. // size (implying the constant is no larger than 4 bytes).
  3412. const Function &F = DAG.getMachineFunction().getFunction();
// We rely on this decision to inline being idempotent and unrelated to the
// use-site. We know that if we inline a variable at one use site, we'll
// inline it elsewhere too (and reuse the constant pool entry). Fast-isel
// doesn't know about this optimization, so bail out if it's enabled;
// otherwise we could decide to inline here (and thus never emit the GV) but
// still require the GV from fast-isel generated code.
  3419. if (!EnableConstpoolPromotion ||
  3420. DAG.getMachineFunction().getTarget().Options.EnableFastISel)
  3421. return SDValue();
  3422. auto *GVar = dyn_cast<GlobalVariable>(GV);
  3423. if (!GVar || !GVar->hasInitializer() ||
  3424. !GVar->isConstant() || !GVar->hasGlobalUnnamedAddr() ||
  3425. !GVar->hasLocalLinkage())
  3426. return SDValue();
  3427. // If we inline a value that contains relocations, we move the relocations
  3428. // from .data to .text. This is not allowed in position-independent code.
  3429. auto *Init = GVar->getInitializer();
  3430. if ((TLI->isPositionIndependent() || TLI->getSubtarget()->isROPI()) &&
  3431. Init->needsDynamicRelocation())
  3432. return SDValue();
// The constant islands pass can only really deal with alignment requests
// <= 4 bytes and cannot pad constants itself. Therefore we cannot promote
// any type with alignment requirements greater than 4 bytes. We also can
// only promote constants that are multiples of 4 bytes in size, or that can
// be padded to a multiple of 4. For simplicity, we currently only try to
// pad constants that are strings.
  3439. auto *CDAInit = dyn_cast<ConstantDataArray>(Init);
  3440. unsigned Size = DAG.getDataLayout().getTypeAllocSize(Init->getType());
  3441. Align PrefAlign = DAG.getDataLayout().getPreferredAlign(GVar);
  3442. unsigned RequiredPadding = 4 - (Size % 4);
  3443. bool PaddingPossible =
  3444. RequiredPadding == 4 || (CDAInit && CDAInit->isString());
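// Illustrative example: a 6-byte string initializer gives
// RequiredPadding == 2 and can be NUL-padded to 8 bytes, whereas a 12-byte
// initializer gives RequiredPadding == 4, which means no padding is needed
// at all.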
  3445. if (!PaddingPossible || PrefAlign > 4 || Size > ConstpoolPromotionMaxSize ||
  3446. Size == 0)
  3447. return SDValue();
  3448. unsigned PaddedSize = Size + ((RequiredPadding == 4) ? 0 : RequiredPadding);
  3449. MachineFunction &MF = DAG.getMachineFunction();
  3450. ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
// We can't bloat the constant pool too much, else the ConstantIslands pass
// may fail to converge. If we haven't promoted this global yet (it may have
// multiple uses), and promoting it would increase the constant pool size
// (Size > 4), ensure we have space to do so up to
// ConstpoolPromotionMaxTotal.
  3455. if (!AFI->getGlobalsPromotedToConstantPool().count(GVar) && Size > 4)
  3456. if (AFI->getPromotedConstpoolIncrease() + PaddedSize - 4 >=
  3457. ConstpoolPromotionMaxTotal)
  3458. return SDValue();
  3459. // This is only valid if all users are in a single function; we can't clone
  3460. // the constant in general. The LLVM IR unnamed_addr allows merging
  3461. // constants, but not cloning them.
  3462. //
  3463. // We could potentially allow cloning if we could prove all uses of the
  3464. // constant in the current function don't care about the address, like
  3465. // printf format strings. But that isn't implemented for now.
  3466. if (!allUsersAreInFunction(GVar, &F))
  3467. return SDValue();
  3468. // We're going to inline this global. Pad it out if needed.
  3469. if (RequiredPadding != 4) {
  3470. StringRef S = CDAInit->getAsString();
  3471. SmallVector<uint8_t,16> V(S.size());
  3472. std::copy(S.bytes_begin(), S.bytes_end(), V.begin());
  3473. while (RequiredPadding--)
  3474. V.push_back(0);
  3475. Init = ConstantDataArray::get(*DAG.getContext(), V);
  3476. }
  3477. auto CPVal = ARMConstantPoolConstant::Create(GVar, Init);
  3478. SDValue CPAddr = DAG.getTargetConstantPool(CPVal, PtrVT, Align(4));
  3479. if (!AFI->getGlobalsPromotedToConstantPool().count(GVar)) {
  3480. AFI->markGlobalAsPromotedToConstantPool(GVar);
  3481. AFI->setPromotedConstpoolIncrease(AFI->getPromotedConstpoolIncrease() +
  3482. PaddedSize - 4);
  3483. }
  3484. ++NumConstpoolPromoted;
  3485. return DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
  3486. }
  3487. bool ARMTargetLowering::isReadOnly(const GlobalValue *GV) const {
  3488. if (const GlobalAlias *GA = dyn_cast<GlobalAlias>(GV))
  3489. if (!(GV = GA->getAliaseeObject()))
  3490. return false;
  3491. if (const auto *V = dyn_cast<GlobalVariable>(GV))
  3492. return V->isConstant();
  3493. return isa<Function>(GV);
  3494. }
  3495. SDValue ARMTargetLowering::LowerGlobalAddress(SDValue Op,
  3496. SelectionDAG &DAG) const {
  3497. switch (Subtarget->getTargetTriple().getObjectFormat()) {
  3498. default: llvm_unreachable("unknown object format");
  3499. case Triple::COFF:
  3500. return LowerGlobalAddressWindows(Op, DAG);
  3501. case Triple::ELF:
  3502. return LowerGlobalAddressELF(Op, DAG);
  3503. case Triple::MachO:
  3504. return LowerGlobalAddressDarwin(Op, DAG);
  3505. }
  3506. }
  3507. SDValue ARMTargetLowering::LowerGlobalAddressELF(SDValue Op,
  3508. SelectionDAG &DAG) const {
  3509. EVT PtrVT = getPointerTy(DAG.getDataLayout());
  3510. SDLoc dl(Op);
  3511. const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
  3512. const TargetMachine &TM = getTargetMachine();
  3513. bool IsRO = isReadOnly(GV);
// Only promote to a constant pool when not generating an execute-only (XO)
// text section.
  3515. if (TM.shouldAssumeDSOLocal(*GV->getParent(), GV) && !Subtarget->genExecuteOnly())
  3516. if (SDValue V = promoteToConstantPool(this, GV, DAG, PtrVT, dl))
  3517. return V;
  3518. if (isPositionIndependent()) {
  3519. bool UseGOT_PREL = !TM.shouldAssumeDSOLocal(*GV->getParent(), GV);
  3520. SDValue G = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0,
  3521. UseGOT_PREL ? ARMII::MO_GOT : 0);
  3522. SDValue Result = DAG.getNode(ARMISD::WrapperPIC, dl, PtrVT, G);
  3523. if (UseGOT_PREL)
  3524. Result =
  3525. DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Result,
  3526. MachinePointerInfo::getGOT(DAG.getMachineFunction()));
  3527. return Result;
  3528. } else if (Subtarget->isROPI() && IsRO) {
  3529. // PC-relative.
  3530. SDValue G = DAG.getTargetGlobalAddress(GV, dl, PtrVT);
  3531. SDValue Result = DAG.getNode(ARMISD::WrapperPIC, dl, PtrVT, G);
  3532. return Result;
  3533. } else if (Subtarget->isRWPI() && !IsRO) {
  3534. // SB-relative.
  3535. SDValue RelAddr;
  3536. if (Subtarget->useMovt()) {
  3537. ++NumMovwMovt;
  3538. SDValue G = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, ARMII::MO_SBREL);
  3539. RelAddr = DAG.getNode(ARMISD::Wrapper, dl, PtrVT, G);
  3540. } else { // use literal pool for address constant
  3541. ARMConstantPoolValue *CPV =
  3542. ARMConstantPoolConstant::Create(GV, ARMCP::SBREL);
  3543. SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, Align(4));
  3544. CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
  3545. RelAddr = DAG.getLoad(
  3546. PtrVT, dl, DAG.getEntryNode(), CPAddr,
  3547. MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
  3548. }
  3549. SDValue SB = DAG.getCopyFromReg(DAG.getEntryNode(), dl, ARM::R9, PtrVT);
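// Under RWPI, r9 is the static base (SB) register; RelAddr is the
// SB-relative offset of the global, so adding it to SB below yields the
// final address.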
  3550. SDValue Result = DAG.getNode(ISD::ADD, dl, PtrVT, SB, RelAddr);
  3551. return Result;
  3552. }
  3553. // If we have T2 ops, we can materialize the address directly via movt/movw
  3554. // pair. This is always cheaper.
  3555. if (Subtarget->useMovt()) {
  3556. ++NumMovwMovt;
  3557. // FIXME: Once remat is capable of dealing with instructions with register
  3558. // operands, expand this into two nodes.
  3559. return DAG.getNode(ARMISD::Wrapper, dl, PtrVT,
  3560. DAG.getTargetGlobalAddress(GV, dl, PtrVT));
  3561. } else {
  3562. SDValue CPAddr = DAG.getTargetConstantPool(GV, PtrVT, Align(4));
  3563. CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
  3564. return DAG.getLoad(
  3565. PtrVT, dl, DAG.getEntryNode(), CPAddr,
  3566. MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
  3567. }
  3568. }
  3569. SDValue ARMTargetLowering::LowerGlobalAddressDarwin(SDValue Op,
  3570. SelectionDAG &DAG) const {
  3571. assert(!Subtarget->isROPI() && !Subtarget->isRWPI() &&
  3572. "ROPI/RWPI not currently supported for Darwin");
  3573. EVT PtrVT = getPointerTy(DAG.getDataLayout());
  3574. SDLoc dl(Op);
  3575. const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
  3576. if (Subtarget->useMovt())
  3577. ++NumMovwMovt;
  3578. // FIXME: Once remat is capable of dealing with instructions with register
  3579. // operands, expand this into multiple nodes
  3580. unsigned Wrapper =
  3581. isPositionIndependent() ? ARMISD::WrapperPIC : ARMISD::Wrapper;
  3582. SDValue G = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, ARMII::MO_NONLAZY);
  3583. SDValue Result = DAG.getNode(Wrapper, dl, PtrVT, G);
  3584. if (Subtarget->isGVIndirectSymbol(GV))
  3585. Result = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Result,
  3586. MachinePointerInfo::getGOT(DAG.getMachineFunction()));
  3587. return Result;
  3588. }
  3589. SDValue ARMTargetLowering::LowerGlobalAddressWindows(SDValue Op,
  3590. SelectionDAG &DAG) const {
  3591. assert(Subtarget->isTargetWindows() && "non-Windows COFF is not supported");
  3592. assert(Subtarget->useMovt() &&
  3593. "Windows on ARM expects to use movw/movt");
  3594. assert(!Subtarget->isROPI() && !Subtarget->isRWPI() &&
  3595. "ROPI/RWPI not currently supported for Windows");
  3596. const TargetMachine &TM = getTargetMachine();
  3597. const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
  3598. ARMII::TOF TargetFlags = ARMII::MO_NO_FLAG;
  3599. if (GV->hasDLLImportStorageClass())
  3600. TargetFlags = ARMII::MO_DLLIMPORT;
  3601. else if (!TM.shouldAssumeDSOLocal(*GV->getParent(), GV))
  3602. TargetFlags = ARMII::MO_COFFSTUB;
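// Both kinds of globals are reached through a pointer: dllimport'ed globals
// through their import-table entry (an "__imp_" symbol) and non-DSO-local
// globals through a linker/compiler-generated ".refptr" stub, which is why
// the extra load below is emitted for these target flags (explanatory note).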
  3603. EVT PtrVT = getPointerTy(DAG.getDataLayout());
  3604. SDValue Result;
  3605. SDLoc DL(Op);
  3606. ++NumMovwMovt;
  3607. // FIXME: Once remat is capable of dealing with instructions with register
  3608. // operands, expand this into two nodes.
  3609. Result = DAG.getNode(ARMISD::Wrapper, DL, PtrVT,
  3610. DAG.getTargetGlobalAddress(GV, DL, PtrVT, /*offset=*/0,
  3611. TargetFlags));
  3612. if (TargetFlags & (ARMII::MO_DLLIMPORT | ARMII::MO_COFFSTUB))
  3613. Result = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Result,
  3614. MachinePointerInfo::getGOT(DAG.getMachineFunction()));
  3615. return Result;
  3616. }
  3617. SDValue
  3618. ARMTargetLowering::LowerEH_SJLJ_SETJMP(SDValue Op, SelectionDAG &DAG) const {
  3619. SDLoc dl(Op);
  3620. SDValue Val = DAG.getConstant(0, dl, MVT::i32);
  3621. return DAG.getNode(ARMISD::EH_SJLJ_SETJMP, dl,
  3622. DAG.getVTList(MVT::i32, MVT::Other), Op.getOperand(0),
  3623. Op.getOperand(1), Val);
  3624. }
  3625. SDValue
  3626. ARMTargetLowering::LowerEH_SJLJ_LONGJMP(SDValue Op, SelectionDAG &DAG) const {
  3627. SDLoc dl(Op);
  3628. return DAG.getNode(ARMISD::EH_SJLJ_LONGJMP, dl, MVT::Other, Op.getOperand(0),
  3629. Op.getOperand(1), DAG.getConstant(0, dl, MVT::i32));
  3630. }
  3631. SDValue ARMTargetLowering::LowerEH_SJLJ_SETUP_DISPATCH(SDValue Op,
  3632. SelectionDAG &DAG) const {
  3633. SDLoc dl(Op);
  3634. return DAG.getNode(ARMISD::EH_SJLJ_SETUP_DISPATCH, dl, MVT::Other,
  3635. Op.getOperand(0));
  3636. }
  3637. SDValue ARMTargetLowering::LowerINTRINSIC_VOID(
  3638. SDValue Op, SelectionDAG &DAG, const ARMSubtarget *Subtarget) const {
  3639. unsigned IntNo =
  3640. cast<ConstantSDNode>(
  3641. Op.getOperand(Op.getOperand(0).getValueType() == MVT::Other))
  3642. ->getZExtValue();
  3643. switch (IntNo) {
  3644. default:
  3645. return SDValue(); // Don't custom lower most intrinsics.
  3646. case Intrinsic::arm_gnu_eabi_mcount: {
  3647. MachineFunction &MF = DAG.getMachineFunction();
  3648. EVT PtrVT = getPointerTy(DAG.getDataLayout());
  3649. SDLoc dl(Op);
  3650. SDValue Chain = Op.getOperand(0);
  3651. // call "\01__gnu_mcount_nc"
  3652. const ARMBaseRegisterInfo *ARI = Subtarget->getRegisterInfo();
  3653. const uint32_t *Mask =
  3654. ARI->getCallPreservedMask(DAG.getMachineFunction(), CallingConv::C);
  3655. assert(Mask && "Missing call preserved mask for calling convention");
  3656. // Mark LR an implicit live-in.
  3657. Register Reg = MF.addLiveIn(ARM::LR, getRegClassFor(MVT::i32));
  3658. SDValue ReturnAddress =
  3659. DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg, PtrVT);
  3660. constexpr EVT ResultTys[] = {MVT::Other, MVT::Glue};
  3661. SDValue Callee =
  3662. DAG.getTargetExternalSymbol("\01__gnu_mcount_nc", PtrVT, 0);
  3663. SDValue RegisterMask = DAG.getRegisterMask(Mask);
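// __gnu_mcount_nc expects the caller's original return address on top of
// the stack, so the BL_PUSHLR / tBL_PUSHLR pseudos push the LR live-in
// (ReturnAddress) before branching to the callee (explanatory note).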
  3664. if (Subtarget->isThumb())
  3665. return SDValue(
  3666. DAG.getMachineNode(
  3667. ARM::tBL_PUSHLR, dl, ResultTys,
  3668. {ReturnAddress, DAG.getTargetConstant(ARMCC::AL, dl, PtrVT),
  3669. DAG.getRegister(0, PtrVT), Callee, RegisterMask, Chain}),
  3670. 0);
  3671. return SDValue(
  3672. DAG.getMachineNode(ARM::BL_PUSHLR, dl, ResultTys,
  3673. {ReturnAddress, Callee, RegisterMask, Chain}),
  3674. 0);
  3675. }
  3676. }
  3677. }
  3678. SDValue
  3679. ARMTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG,
  3680. const ARMSubtarget *Subtarget) const {
  3681. unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
  3682. SDLoc dl(Op);
  3683. switch (IntNo) {
  3684. default: return SDValue(); // Don't custom lower most intrinsics.
  3685. case Intrinsic::thread_pointer: {
  3686. EVT PtrVT = getPointerTy(DAG.getDataLayout());
  3687. return DAG.getNode(ARMISD::THREAD_POINTER, dl, PtrVT);
  3688. }
  3689. case Intrinsic::arm_cls: {
  3690. const SDValue &Operand = Op.getOperand(1);
  3691. const EVT VTy = Op.getValueType();
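// cls(x) is computed as ctlz(((x ^ (x >> 31)) << 1) | 1): xor'ing with the
// sign-extended sign bit turns leading sign bits into leading zeros, the
// shift by one discounts the sign bit itself, and or'ing in 1 keeps the
// ctlz input nonzero so x == 0 and x == -1 correctly yield 31.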
  3692. SDValue SRA =
  3693. DAG.getNode(ISD::SRA, dl, VTy, Operand, DAG.getConstant(31, dl, VTy));
  3694. SDValue XOR = DAG.getNode(ISD::XOR, dl, VTy, SRA, Operand);
  3695. SDValue SHL =
  3696. DAG.getNode(ISD::SHL, dl, VTy, XOR, DAG.getConstant(1, dl, VTy));
  3697. SDValue OR =
  3698. DAG.getNode(ISD::OR, dl, VTy, SHL, DAG.getConstant(1, dl, VTy));
  3699. SDValue Result = DAG.getNode(ISD::CTLZ, dl, VTy, OR);
  3700. return Result;
  3701. }
  3702. case Intrinsic::arm_cls64: {
  3703. // cls(x) = if cls(hi(x)) != 31 then cls(hi(x))
  3704. // else 31 + clz(if hi(x) == 0 then lo(x) else not(lo(x)))
  3705. const SDValue &Operand = Op.getOperand(1);
  3706. const EVT VTy = Op.getValueType();
  3707. SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, VTy, Operand,
  3708. DAG.getConstant(1, dl, VTy));
  3709. SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, VTy, Operand,
  3710. DAG.getConstant(0, dl, VTy));
  3711. SDValue Constant0 = DAG.getConstant(0, dl, VTy);
  3712. SDValue Constant1 = DAG.getConstant(1, dl, VTy);
  3713. SDValue Constant31 = DAG.getConstant(31, dl, VTy);
  3714. SDValue SRAHi = DAG.getNode(ISD::SRA, dl, VTy, Hi, Constant31);
  3715. SDValue XORHi = DAG.getNode(ISD::XOR, dl, VTy, SRAHi, Hi);
  3716. SDValue SHLHi = DAG.getNode(ISD::SHL, dl, VTy, XORHi, Constant1);
  3717. SDValue ORHi = DAG.getNode(ISD::OR, dl, VTy, SHLHi, Constant1);
  3718. SDValue CLSHi = DAG.getNode(ISD::CTLZ, dl, VTy, ORHi);
  3719. SDValue CheckLo =
  3720. DAG.getSetCC(dl, MVT::i1, CLSHi, Constant31, ISD::CondCode::SETEQ);
  3721. SDValue HiIsZero =
  3722. DAG.getSetCC(dl, MVT::i1, Hi, Constant0, ISD::CondCode::SETEQ);
  3723. SDValue AdjustedLo =
  3724. DAG.getSelect(dl, VTy, HiIsZero, Lo, DAG.getNOT(dl, Lo, VTy));
  3725. SDValue CLZAdjustedLo = DAG.getNode(ISD::CTLZ, dl, VTy, AdjustedLo);
  3726. SDValue Result =
  3727. DAG.getSelect(dl, VTy, CheckLo,
  3728. DAG.getNode(ISD::ADD, dl, VTy, CLZAdjustedLo, Constant31), CLSHi);
  3729. return Result;
  3730. }
  3731. case Intrinsic::eh_sjlj_lsda: {
  3732. MachineFunction &MF = DAG.getMachineFunction();
  3733. ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
  3734. unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
  3735. EVT PtrVT = getPointerTy(DAG.getDataLayout());
  3736. SDValue CPAddr;
  3737. bool IsPositionIndependent = isPositionIndependent();
  3738. unsigned PCAdj = IsPositionIndependent ? (Subtarget->isThumb() ? 4 : 8) : 0;
  3739. ARMConstantPoolValue *CPV =
  3740. ARMConstantPoolConstant::Create(&MF.getFunction(), ARMPCLabelIndex,
  3741. ARMCP::CPLSDA, PCAdj);
  3742. CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, Align(4));
  3743. CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
  3744. SDValue Result = DAG.getLoad(
  3745. PtrVT, dl, DAG.getEntryNode(), CPAddr,
  3746. MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
  3747. if (IsPositionIndependent) {
  3748. SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32);
  3749. Result = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVT, Result, PICLabel);
  3750. }
  3751. return Result;
  3752. }
  3753. case Intrinsic::arm_neon_vabs:
  3754. return DAG.getNode(ISD::ABS, SDLoc(Op), Op.getValueType(),
  3755. Op.getOperand(1));
  3756. case Intrinsic::arm_neon_vmulls:
  3757. case Intrinsic::arm_neon_vmullu: {
  3758. unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vmulls)
  3759. ? ARMISD::VMULLs : ARMISD::VMULLu;
  3760. return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(),
  3761. Op.getOperand(1), Op.getOperand(2));
  3762. }
  3763. case Intrinsic::arm_neon_vminnm:
  3764. case Intrinsic::arm_neon_vmaxnm: {
  3765. unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vminnm)
  3766. ? ISD::FMINNUM : ISD::FMAXNUM;
  3767. return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(),
  3768. Op.getOperand(1), Op.getOperand(2));
  3769. }
  3770. case Intrinsic::arm_neon_vminu:
  3771. case Intrinsic::arm_neon_vmaxu: {
  3772. if (Op.getValueType().isFloatingPoint())
  3773. return SDValue();
  3774. unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vminu)
  3775. ? ISD::UMIN : ISD::UMAX;
  3776. return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(),
  3777. Op.getOperand(1), Op.getOperand(2));
  3778. }
  3779. case Intrinsic::arm_neon_vmins:
  3780. case Intrinsic::arm_neon_vmaxs: {
  3781. // v{min,max}s is overloaded between signed integers and floats.
  3782. if (!Op.getValueType().isFloatingPoint()) {
  3783. unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vmins)
  3784. ? ISD::SMIN : ISD::SMAX;
  3785. return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(),
  3786. Op.getOperand(1), Op.getOperand(2));
  3787. }
  3788. unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vmins)
  3789. ? ISD::FMINIMUM : ISD::FMAXIMUM;
  3790. return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(),
  3791. Op.getOperand(1), Op.getOperand(2));
  3792. }
  3793. case Intrinsic::arm_neon_vtbl1:
  3794. return DAG.getNode(ARMISD::VTBL1, SDLoc(Op), Op.getValueType(),
  3795. Op.getOperand(1), Op.getOperand(2));
  3796. case Intrinsic::arm_neon_vtbl2:
  3797. return DAG.getNode(ARMISD::VTBL2, SDLoc(Op), Op.getValueType(),
  3798. Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
  3799. case Intrinsic::arm_mve_pred_i2v:
  3800. case Intrinsic::arm_mve_pred_v2i:
  3801. return DAG.getNode(ARMISD::PREDICATE_CAST, SDLoc(Op), Op.getValueType(),
  3802. Op.getOperand(1));
  3803. case Intrinsic::arm_mve_vreinterpretq:
  3804. return DAG.getNode(ARMISD::VECTOR_REG_CAST, SDLoc(Op), Op.getValueType(),
  3805. Op.getOperand(1));
  3806. case Intrinsic::arm_mve_lsll:
  3807. return DAG.getNode(ARMISD::LSLL, SDLoc(Op), Op->getVTList(),
  3808. Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
  3809. case Intrinsic::arm_mve_asrl:
  3810. return DAG.getNode(ARMISD::ASRL, SDLoc(Op), Op->getVTList(),
  3811. Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
  3812. }
  3813. }
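// Rough summary of the fence lowering below: single-thread fences are
// dropped, targets without DMB fall back to the MCR-based barrier, and
// otherwise an arm.dmb intrinsic is emitted with a suitable domain, e.g. ISH
// in the common case, SY on M-class (which only has the full-system barrier),
// or ISHST for release fences on cores that prefer it.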
  3814. static SDValue LowerATOMIC_FENCE(SDValue Op, SelectionDAG &DAG,
  3815. const ARMSubtarget *Subtarget) {
  3816. SDLoc dl(Op);
  3817. ConstantSDNode *SSIDNode = cast<ConstantSDNode>(Op.getOperand(2));
  3818. auto SSID = static_cast<SyncScope::ID>(SSIDNode->getZExtValue());
  3819. if (SSID == SyncScope::SingleThread)
  3820. return Op;
  3821. if (!Subtarget->hasDataBarrier()) {
  3822. // Some ARMv6 cpus can support data barriers with an mcr instruction.
  3823. // Thumb1 and pre-v6 ARM mode use a libcall instead and should never get
  3824. // here.
  3825. assert(Subtarget->hasV6Ops() && !Subtarget->isThumb() &&
  3826. "Unexpected ISD::ATOMIC_FENCE encountered. Should be libcall!");
  3827. return DAG.getNode(ARMISD::MEMBARRIER_MCR, dl, MVT::Other, Op.getOperand(0),
  3828. DAG.getConstant(0, dl, MVT::i32));
  3829. }
  3830. ConstantSDNode *OrdN = cast<ConstantSDNode>(Op.getOperand(1));
  3831. AtomicOrdering Ord = static_cast<AtomicOrdering>(OrdN->getZExtValue());
  3832. ARM_MB::MemBOpt Domain = ARM_MB::ISH;
  3833. if (Subtarget->isMClass()) {
  3834. // Only a full system barrier exists in the M-class architectures.
  3835. Domain = ARM_MB::SY;
  3836. } else if (Subtarget->preferISHSTBarriers() &&
  3837. Ord == AtomicOrdering::Release) {
  3838. // Swift happens to implement ISHST barriers in a way that's compatible with
  3839. // Release semantics but weaker than ISH so we'd be fools not to use
  3840. // it. Beware: other processors probably don't!
  3841. Domain = ARM_MB::ISHST;
  3842. }
  3843. return DAG.getNode(ISD::INTRINSIC_VOID, dl, MVT::Other, Op.getOperand(0),
  3844. DAG.getConstant(Intrinsic::arm_dmb, dl, MVT::i32),
  3845. DAG.getConstant(Domain, dl, MVT::i32));
  3846. }
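// Rough summary of the prefetch lowering below: @llvm.prefetch becomes an
// ARMISD::PRELOAD node carrying isRead/isData bits (inverted on Thumb), which
// is later selected to a PLD/PLDW/PLI-style preload; targets without preload
// instructions simply keep the chain and drop the hint.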
  3847. static SDValue LowerPREFETCH(SDValue Op, SelectionDAG &DAG,
  3848. const ARMSubtarget *Subtarget) {
3849. // ARM prior to v5TE and Thumb1 do not have preload instructions.
  3850. if (!(Subtarget->isThumb2() ||
  3851. (!Subtarget->isThumb1Only() && Subtarget->hasV5TEOps())))
  3852. // Just preserve the chain.
  3853. return Op.getOperand(0);
  3854. SDLoc dl(Op);
  3855. unsigned isRead = ~cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue() & 1;
  3856. if (!isRead &&
  3857. (!Subtarget->hasV7Ops() || !Subtarget->hasMPExtension()))
  3858. // ARMv7 with MP extension has PLDW.
  3859. return Op.getOperand(0);
  3860. unsigned isData = cast<ConstantSDNode>(Op.getOperand(4))->getZExtValue();
  3861. if (Subtarget->isThumb()) {
  3862. // Invert the bits.
  3863. isRead = ~isRead & 1;
  3864. isData = ~isData & 1;
  3865. }
  3866. return DAG.getNode(ARMISD::PRELOAD, dl, MVT::Other, Op.getOperand(0),
  3867. Op.getOperand(1), DAG.getConstant(isRead, dl, MVT::i32),
  3868. DAG.getConstant(isData, dl, MVT::i32));
  3869. }
  3870. static SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG) {
  3871. MachineFunction &MF = DAG.getMachineFunction();
  3872. ARMFunctionInfo *FuncInfo = MF.getInfo<ARMFunctionInfo>();
  3873. // vastart just stores the address of the VarArgsFrameIndex slot into the
  3874. // memory location argument.
  3875. SDLoc dl(Op);
  3876. EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
  3877. SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
  3878. const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
  3879. return DAG.getStore(Op.getOperand(0), dl, FR, Op.getOperand(1),
  3880. MachinePointerInfo(SV));
  3881. }
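// Illustrative example for the helper below: with a soft-float calling
// convention an f64 argument can arrive split across two GPRs, or a GPR and a
// stack slot; the two i32 halves are reassembled into an f64 with
// ARMISD::VMOVDRR, swapping the halves on big-endian targets.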
  3882. SDValue ARMTargetLowering::GetF64FormalArgument(CCValAssign &VA,
  3883. CCValAssign &NextVA,
  3884. SDValue &Root,
  3885. SelectionDAG &DAG,
  3886. const SDLoc &dl) const {
  3887. MachineFunction &MF = DAG.getMachineFunction();
  3888. ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
  3889. const TargetRegisterClass *RC;
  3890. if (AFI->isThumb1OnlyFunction())
  3891. RC = &ARM::tGPRRegClass;
  3892. else
  3893. RC = &ARM::GPRRegClass;
  3894. // Transform the arguments stored in physical registers into virtual ones.
  3895. Register Reg = MF.addLiveIn(VA.getLocReg(), RC);
  3896. SDValue ArgValue = DAG.getCopyFromReg(Root, dl, Reg, MVT::i32);
  3897. SDValue ArgValue2;
  3898. if (NextVA.isMemLoc()) {
  3899. MachineFrameInfo &MFI = MF.getFrameInfo();
  3900. int FI = MFI.CreateFixedObject(4, NextVA.getLocMemOffset(), true);
  3901. // Create load node to retrieve arguments from the stack.
  3902. SDValue FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
  3903. ArgValue2 = DAG.getLoad(
  3904. MVT::i32, dl, Root, FIN,
  3905. MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
  3906. } else {
  3907. Reg = MF.addLiveIn(NextVA.getLocReg(), RC);
  3908. ArgValue2 = DAG.getCopyFromReg(Root, dl, Reg, MVT::i32);
  3909. }
  3910. if (!Subtarget->isLittle())
3911. std::swap(ArgValue, ArgValue2);
  3912. return DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, ArgValue, ArgValue2);
  3913. }
  3914. // The remaining GPRs hold either the beginning of variable-argument
  3915. // data, or the beginning of an aggregate passed by value (usually
  3916. // byval). Either way, we allocate stack slots adjacent to the data
  3917. // provided by our caller, and store the unallocated registers there.
  3918. // If this is a variadic function, the va_list pointer will begin with
  3919. // these values; otherwise, this reassembles a (byval) structure that
  3920. // was split between registers and memory.
3921. // Returns the frame index that the registers were stored into.
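// For example (illustrative): if a byval aggregate was partly passed in
// r2-r3, those registers are spilled to stack slots placed directly below the
// caller-provided portion, so the whole object is contiguous again and can be
// addressed through the returned frame index.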
  3922. int ARMTargetLowering::StoreByValRegs(CCState &CCInfo, SelectionDAG &DAG,
  3923. const SDLoc &dl, SDValue &Chain,
  3924. const Value *OrigArg,
  3925. unsigned InRegsParamRecordIdx,
  3926. int ArgOffset, unsigned ArgSize) const {
3927. // Currently, two use cases are possible:
3928. // Case #1. Non-varargs function, and we meet the first byval parameter.
3929. // Set up the first unallocated register as the first byval register and
3930. // consume all remaining registers
3931. // (these two actions are performed by the HandleByVal method).
3932. // Then, here, we initialize the stack frame with
3933. // "store-reg" instructions.
3934. // Case #2. Varargs function that doesn't contain byval parameters.
3935. // The same: consume all remaining unallocated registers and
3936. // initialize the stack frame.
  3937. MachineFunction &MF = DAG.getMachineFunction();
  3938. MachineFrameInfo &MFI = MF.getFrameInfo();
  3939. ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
  3940. unsigned RBegin, REnd;
  3941. if (InRegsParamRecordIdx < CCInfo.getInRegsParamsCount()) {
  3942. CCInfo.getInRegsParamInfo(InRegsParamRecordIdx, RBegin, REnd);
  3943. } else {
  3944. unsigned RBeginIdx = CCInfo.getFirstUnallocated(GPRArgRegs);
  3945. RBegin = RBeginIdx == 4 ? (unsigned)ARM::R4 : GPRArgRegs[RBeginIdx];
  3946. REnd = ARM::R4;
  3947. }
  3948. if (REnd != RBegin)
  3949. ArgOffset = -4 * (ARM::R4 - RBegin);
  3950. auto PtrVT = getPointerTy(DAG.getDataLayout());
  3951. int FrameIndex = MFI.CreateFixedObject(ArgSize, ArgOffset, false);
  3952. SDValue FIN = DAG.getFrameIndex(FrameIndex, PtrVT);
  3953. SmallVector<SDValue, 4> MemOps;
  3954. const TargetRegisterClass *RC =
  3955. AFI->isThumb1OnlyFunction() ? &ARM::tGPRRegClass : &ARM::GPRRegClass;
  3956. for (unsigned Reg = RBegin, i = 0; Reg < REnd; ++Reg, ++i) {
  3957. Register VReg = MF.addLiveIn(Reg, RC);
  3958. SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i32);
  3959. SDValue Store = DAG.getStore(Val.getValue(1), dl, Val, FIN,
  3960. MachinePointerInfo(OrigArg, 4 * i));
  3961. MemOps.push_back(Store);
  3962. FIN = DAG.getNode(ISD::ADD, dl, PtrVT, FIN, DAG.getConstant(4, dl, PtrVT));
  3963. }
  3964. if (!MemOps.empty())
  3965. Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps);
  3966. return FrameIndex;
  3967. }
3968. // Set up the stack frame that the va_list pointer will start from.
  3969. void ARMTargetLowering::VarArgStyleRegisters(CCState &CCInfo, SelectionDAG &DAG,
  3970. const SDLoc &dl, SDValue &Chain,
  3971. unsigned ArgOffset,
  3972. unsigned TotalArgRegsSaveSize,
  3973. bool ForceMutable) const {
  3974. MachineFunction &MF = DAG.getMachineFunction();
  3975. ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
  3976. // Try to store any remaining integer argument regs
  3977. // to their spots on the stack so that they may be loaded by dereferencing
  3978. // the result of va_next.
3979. // If there are no registers to be stored, just point the address past the
3980. // last argument passed via the stack.
  3981. int FrameIndex = StoreByValRegs(CCInfo, DAG, dl, Chain, nullptr,
  3982. CCInfo.getInRegsParamsCount(),
  3983. CCInfo.getNextStackOffset(),
  3984. std::max(4U, TotalArgRegsSaveSize));
  3985. AFI->setVarArgsFrameIndex(FrameIndex);
  3986. }
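// Illustrative example for the hook below: under the hard-float ABI an f16
// (or bf16) value travelling in an f32 register part is bitcast to i16,
// any-extended to i32 and bitcast to f32, so the half occupies the low 16
// bits of the s-register; joinRegisterPartsIntoValue reverses this sequence.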
  3987. bool ARMTargetLowering::splitValueIntoRegisterParts(
  3988. SelectionDAG &DAG, const SDLoc &DL, SDValue Val, SDValue *Parts,
  3989. unsigned NumParts, MVT PartVT, std::optional<CallingConv::ID> CC) const {
  3990. bool IsABIRegCopy = CC.has_value();
  3991. EVT ValueVT = Val.getValueType();
  3992. if (IsABIRegCopy && (ValueVT == MVT::f16 || ValueVT == MVT::bf16) &&
  3993. PartVT == MVT::f32) {
  3994. unsigned ValueBits = ValueVT.getSizeInBits();
  3995. unsigned PartBits = PartVT.getSizeInBits();
  3996. Val = DAG.getNode(ISD::BITCAST, DL, MVT::getIntegerVT(ValueBits), Val);
  3997. Val = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::getIntegerVT(PartBits), Val);
  3998. Val = DAG.getNode(ISD::BITCAST, DL, PartVT, Val);
  3999. Parts[0] = Val;
  4000. return true;
  4001. }
  4002. return false;
  4003. }
  4004. SDValue ARMTargetLowering::joinRegisterPartsIntoValue(
  4005. SelectionDAG &DAG, const SDLoc &DL, const SDValue *Parts, unsigned NumParts,
  4006. MVT PartVT, EVT ValueVT, std::optional<CallingConv::ID> CC) const {
  4007. bool IsABIRegCopy = CC.has_value();
  4008. if (IsABIRegCopy && (ValueVT == MVT::f16 || ValueVT == MVT::bf16) &&
  4009. PartVT == MVT::f32) {
  4010. unsigned ValueBits = ValueVT.getSizeInBits();
  4011. unsigned PartBits = PartVT.getSizeInBits();
  4012. SDValue Val = Parts[0];
  4013. Val = DAG.getNode(ISD::BITCAST, DL, MVT::getIntegerVT(PartBits), Val);
  4014. Val = DAG.getNode(ISD::TRUNCATE, DL, MVT::getIntegerVT(ValueBits), Val);
  4015. Val = DAG.getNode(ISD::BITCAST, DL, ValueVT, Val);
  4016. return Val;
  4017. }
  4018. return SDValue();
  4019. }
  4020. SDValue ARMTargetLowering::LowerFormalArguments(
  4021. SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
  4022. const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
  4023. SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
  4024. MachineFunction &MF = DAG.getMachineFunction();
  4025. MachineFrameInfo &MFI = MF.getFrameInfo();
  4026. ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
  4027. // Assign locations to all of the incoming arguments.
  4028. SmallVector<CCValAssign, 16> ArgLocs;
  4029. CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
  4030. *DAG.getContext());
  4031. CCInfo.AnalyzeFormalArguments(Ins, CCAssignFnForCall(CallConv, isVarArg));
  4032. SmallVector<SDValue, 16> ArgValues;
  4033. SDValue ArgValue;
  4034. Function::const_arg_iterator CurOrigArg = MF.getFunction().arg_begin();
  4035. unsigned CurArgIdx = 0;
  4036. // Initially ArgRegsSaveSize is zero.
  4037. // Then we increase this value each time we meet byval parameter.
  4038. // We also increase this value in case of varargs function.
  4039. AFI->setArgRegsSaveSize(0);
  4040. // Calculate the amount of stack space that we need to allocate to store
  4041. // byval and variadic arguments that are passed in registers.
  4042. // We need to know this before we allocate the first byval or variadic
  4043. // argument, as they will be allocated a stack slot below the CFA (Canonical
  4044. // Frame Address, the stack pointer at entry to the function).
  4045. unsigned ArgRegBegin = ARM::R4;
  4046. for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
  4047. if (CCInfo.getInRegsParamsProcessed() >= CCInfo.getInRegsParamsCount())
  4048. break;
  4049. CCValAssign &VA = ArgLocs[i];
  4050. unsigned Index = VA.getValNo();
  4051. ISD::ArgFlagsTy Flags = Ins[Index].Flags;
  4052. if (!Flags.isByVal())
  4053. continue;
  4054. assert(VA.isMemLoc() && "unexpected byval pointer in reg");
  4055. unsigned RBegin, REnd;
  4056. CCInfo.getInRegsParamInfo(CCInfo.getInRegsParamsProcessed(), RBegin, REnd);
  4057. ArgRegBegin = std::min(ArgRegBegin, RBegin);
  4058. CCInfo.nextInRegsParam();
  4059. }
  4060. CCInfo.rewindByValRegsInfo();
  4061. int lastInsIndex = -1;
  4062. if (isVarArg && MFI.hasVAStart()) {
  4063. unsigned RegIdx = CCInfo.getFirstUnallocated(GPRArgRegs);
  4064. if (RegIdx != std::size(GPRArgRegs))
  4065. ArgRegBegin = std::min(ArgRegBegin, (unsigned)GPRArgRegs[RegIdx]);
  4066. }
  4067. unsigned TotalArgRegsSaveSize = 4 * (ARM::R4 - ArgRegBegin);
  4068. AFI->setArgRegsSaveSize(TotalArgRegsSaveSize);
  4069. auto PtrVT = getPointerTy(DAG.getDataLayout());
  4070. for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
  4071. CCValAssign &VA = ArgLocs[i];
  4072. if (Ins[VA.getValNo()].isOrigArg()) {
  4073. std::advance(CurOrigArg,
  4074. Ins[VA.getValNo()].getOrigArgIndex() - CurArgIdx);
  4075. CurArgIdx = Ins[VA.getValNo()].getOrigArgIndex();
  4076. }
  4077. // Arguments stored in registers.
  4078. if (VA.isRegLoc()) {
  4079. EVT RegVT = VA.getLocVT();
  4080. if (VA.needsCustom() && VA.getLocVT() == MVT::v2f64) {
  4081. // f64 and vector types are split up into multiple registers or
  4082. // combinations of registers and stack slots.
  4083. SDValue ArgValue1 =
  4084. GetF64FormalArgument(VA, ArgLocs[++i], Chain, DAG, dl);
  4085. VA = ArgLocs[++i]; // skip ahead to next loc
  4086. SDValue ArgValue2;
  4087. if (VA.isMemLoc()) {
  4088. int FI = MFI.CreateFixedObject(8, VA.getLocMemOffset(), true);
  4089. SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
  4090. ArgValue2 = DAG.getLoad(
  4091. MVT::f64, dl, Chain, FIN,
  4092. MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
  4093. } else {
  4094. ArgValue2 = GetF64FormalArgument(VA, ArgLocs[++i], Chain, DAG, dl);
  4095. }
  4096. ArgValue = DAG.getNode(ISD::UNDEF, dl, MVT::v2f64);
  4097. ArgValue = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, ArgValue,
  4098. ArgValue1, DAG.getIntPtrConstant(0, dl));
  4099. ArgValue = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, ArgValue,
  4100. ArgValue2, DAG.getIntPtrConstant(1, dl));
  4101. } else if (VA.needsCustom() && VA.getLocVT() == MVT::f64) {
  4102. ArgValue = GetF64FormalArgument(VA, ArgLocs[++i], Chain, DAG, dl);
  4103. } else {
  4104. const TargetRegisterClass *RC;
  4105. if (RegVT == MVT::f16 || RegVT == MVT::bf16)
  4106. RC = &ARM::HPRRegClass;
  4107. else if (RegVT == MVT::f32)
  4108. RC = &ARM::SPRRegClass;
  4109. else if (RegVT == MVT::f64 || RegVT == MVT::v4f16 ||
  4110. RegVT == MVT::v4bf16)
  4111. RC = &ARM::DPRRegClass;
  4112. else if (RegVT == MVT::v2f64 || RegVT == MVT::v8f16 ||
  4113. RegVT == MVT::v8bf16)
  4114. RC = &ARM::QPRRegClass;
  4115. else if (RegVT == MVT::i32)
  4116. RC = AFI->isThumb1OnlyFunction() ? &ARM::tGPRRegClass
  4117. : &ARM::GPRRegClass;
  4118. else
  4119. llvm_unreachable("RegVT not supported by FORMAL_ARGUMENTS Lowering");
  4120. // Transform the arguments in physical registers into virtual ones.
  4121. Register Reg = MF.addLiveIn(VA.getLocReg(), RC);
  4122. ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT);
  4123. // If this value is passed in r0 and has the returned attribute (e.g.
  4124. // C++ 'structors), record this fact for later use.
  4125. if (VA.getLocReg() == ARM::R0 && Ins[VA.getValNo()].Flags.isReturned()) {
  4126. AFI->setPreservesR0();
  4127. }
  4128. }
  4129. // If this is an 8 or 16-bit value, it is really passed promoted
  4130. // to 32 bits. Insert an assert[sz]ext to capture this, then
  4131. // truncate to the right size.
  4132. switch (VA.getLocInfo()) {
  4133. default: llvm_unreachable("Unknown loc info!");
  4134. case CCValAssign::Full: break;
  4135. case CCValAssign::BCvt:
  4136. ArgValue = DAG.getNode(ISD::BITCAST, dl, VA.getValVT(), ArgValue);
  4137. break;
  4138. case CCValAssign::SExt:
  4139. ArgValue = DAG.getNode(ISD::AssertSext, dl, RegVT, ArgValue,
  4140. DAG.getValueType(VA.getValVT()));
  4141. ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue);
  4142. break;
  4143. case CCValAssign::ZExt:
  4144. ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue,
  4145. DAG.getValueType(VA.getValVT()));
  4146. ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue);
  4147. break;
  4148. }
  4149. // f16 arguments have their size extended to 4 bytes and passed as if they
  4150. // had been copied to the LSBs of a 32-bit register.
4151. // For that, the value is passed extended to i32 (soft ABI) or to f32 (hard ABI).
  4152. if (VA.needsCustom() &&
  4153. (VA.getValVT() == MVT::f16 || VA.getValVT() == MVT::bf16))
  4154. ArgValue = MoveToHPR(dl, DAG, VA.getLocVT(), VA.getValVT(), ArgValue);
  4155. InVals.push_back(ArgValue);
4156. } else { // !VA.isRegLoc()
  4157. // Only arguments passed on the stack should make it here.
  4158. assert(VA.isMemLoc());
  4159. assert(VA.getValVT() != MVT::i64 && "i64 should already be lowered");
  4160. int index = VA.getValNo();
  4161. // Some Ins[] entries become multiple ArgLoc[] entries.
  4162. // Process them only once.
  4163. if (index != lastInsIndex)
  4164. {
  4165. ISD::ArgFlagsTy Flags = Ins[index].Flags;
  4166. // FIXME: For now, all byval parameter objects are marked mutable.
  4167. // This can be changed with more analysis.
4168. // In the case of tail call optimization, mark all arguments mutable,
4169. // since they could be overwritten by the lowering of arguments in case of
4170. // a tail call.
  4171. if (Flags.isByVal()) {
  4172. assert(Ins[index].isOrigArg() &&
  4173. "Byval arguments cannot be implicit");
  4174. unsigned CurByValIndex = CCInfo.getInRegsParamsProcessed();
  4175. int FrameIndex = StoreByValRegs(
  4176. CCInfo, DAG, dl, Chain, &*CurOrigArg, CurByValIndex,
  4177. VA.getLocMemOffset(), Flags.getByValSize());
  4178. InVals.push_back(DAG.getFrameIndex(FrameIndex, PtrVT));
  4179. CCInfo.nextInRegsParam();
  4180. } else {
  4181. unsigned FIOffset = VA.getLocMemOffset();
  4182. int FI = MFI.CreateFixedObject(VA.getLocVT().getSizeInBits()/8,
  4183. FIOffset, true);
  4184. // Create load nodes to retrieve arguments from the stack.
  4185. SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
  4186. InVals.push_back(DAG.getLoad(VA.getValVT(), dl, Chain, FIN,
  4187. MachinePointerInfo::getFixedStack(
  4188. DAG.getMachineFunction(), FI)));
  4189. }
  4190. lastInsIndex = index;
  4191. }
  4192. }
  4193. }
  4194. // varargs
  4195. if (isVarArg && MFI.hasVAStart()) {
  4196. VarArgStyleRegisters(CCInfo, DAG, dl, Chain, CCInfo.getNextStackOffset(),
  4197. TotalArgRegsSaveSize);
  4198. if (AFI->isCmseNSEntryFunction()) {
  4199. DiagnosticInfoUnsupported Diag(
  4200. DAG.getMachineFunction().getFunction(),
  4201. "secure entry function must not be variadic", dl.getDebugLoc());
  4202. DAG.getContext()->diagnose(Diag);
  4203. }
  4204. }
  4205. unsigned StackArgSize = CCInfo.getNextStackOffset();
  4206. bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
  4207. if (canGuaranteeTCO(CallConv, TailCallOpt)) {
  4208. // The only way to guarantee a tail call is if the callee restores its
  4209. // argument area, but it must also keep the stack aligned when doing so.
  4210. const DataLayout &DL = DAG.getDataLayout();
  4211. StackArgSize = alignTo(StackArgSize, DL.getStackAlignment());
  4212. AFI->setArgumentStackToRestore(StackArgSize);
  4213. }
  4214. AFI->setArgumentStackSize(StackArgSize);
  4215. if (CCInfo.getNextStackOffset() > 0 && AFI->isCmseNSEntryFunction()) {
  4216. DiagnosticInfoUnsupported Diag(
  4217. DAG.getMachineFunction().getFunction(),
  4218. "secure entry function requires arguments on stack", dl.getDebugLoc());
  4219. DAG.getContext()->diagnose(Diag);
  4220. }
  4221. return Chain;
  4222. }
  4223. /// isFloatingPointZero - Return true if this is +0.0.
  4224. static bool isFloatingPointZero(SDValue Op) {
  4225. if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(Op))
  4226. return CFP->getValueAPF().isPosZero();
  4227. else if (ISD::isEXTLoad(Op.getNode()) || ISD::isNON_EXTLoad(Op.getNode())) {
  4228. // Maybe this has already been legalized into the constant pool?
  4229. if (Op.getOperand(1).getOpcode() == ARMISD::Wrapper) {
  4230. SDValue WrapperOp = Op.getOperand(1).getOperand(0);
  4231. if (ConstantPoolSDNode *CP = dyn_cast<ConstantPoolSDNode>(WrapperOp))
  4232. if (const ConstantFP *CFP = dyn_cast<ConstantFP>(CP->getConstVal()))
  4233. return CFP->getValueAPF().isPosZero();
  4234. }
  4235. } else if (Op->getOpcode() == ISD::BITCAST &&
  4236. Op->getValueType(0) == MVT::f64) {
  4237. // Handle (ISD::BITCAST (ARMISD::VMOVIMM (ISD::TargetConstant 0)) MVT::f64)
  4238. // created by LowerConstantFP().
  4239. SDValue BitcastOp = Op->getOperand(0);
  4240. if (BitcastOp->getOpcode() == ARMISD::VMOVIMM &&
  4241. isNullConstant(BitcastOp->getOperand(0)))
  4242. return true;
  4243. }
  4244. return false;
  4245. }
4246. /// Returns the appropriate ARM CMP (cmp) and the corresponding condition code
4247. /// for the given operands.
  4248. SDValue ARMTargetLowering::getARMCmp(SDValue LHS, SDValue RHS, ISD::CondCode CC,
  4249. SDValue &ARMcc, SelectionDAG &DAG,
  4250. const SDLoc &dl) const {
  4251. if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS.getNode())) {
  4252. unsigned C = RHSC->getZExtValue();
  4253. if (!isLegalICmpImmediate((int32_t)C)) {
  4254. // Constant does not fit, try adjusting it by one.
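// For example (illustrative, Thumb1): "x < 256" can be rewritten as
// "x <= 255", since 255 fits the 8-bit compare immediate while 256 does not.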
  4255. switch (CC) {
  4256. default: break;
  4257. case ISD::SETLT:
  4258. case ISD::SETGE:
  4259. if (C != 0x80000000 && isLegalICmpImmediate(C-1)) {
  4260. CC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGT;
  4261. RHS = DAG.getConstant(C - 1, dl, MVT::i32);
  4262. }
  4263. break;
  4264. case ISD::SETULT:
  4265. case ISD::SETUGE:
  4266. if (C != 0 && isLegalICmpImmediate(C-1)) {
  4267. CC = (CC == ISD::SETULT) ? ISD::SETULE : ISD::SETUGT;
  4268. RHS = DAG.getConstant(C - 1, dl, MVT::i32);
  4269. }
  4270. break;
  4271. case ISD::SETLE:
  4272. case ISD::SETGT:
  4273. if (C != 0x7fffffff && isLegalICmpImmediate(C+1)) {
  4274. CC = (CC == ISD::SETLE) ? ISD::SETLT : ISD::SETGE;
  4275. RHS = DAG.getConstant(C + 1, dl, MVT::i32);
  4276. }
  4277. break;
  4278. case ISD::SETULE:
  4279. case ISD::SETUGT:
  4280. if (C != 0xffffffff && isLegalICmpImmediate(C+1)) {
  4281. CC = (CC == ISD::SETULE) ? ISD::SETULT : ISD::SETUGE;
  4282. RHS = DAG.getConstant(C + 1, dl, MVT::i32);
  4283. }
  4284. break;
  4285. }
  4286. }
  4287. } else if ((ARM_AM::getShiftOpcForNode(LHS.getOpcode()) != ARM_AM::no_shift) &&
  4288. (ARM_AM::getShiftOpcForNode(RHS.getOpcode()) == ARM_AM::no_shift)) {
  4289. // In ARM and Thumb-2, the compare instructions can shift their second
  4290. // operand.
  4291. CC = ISD::getSetCCSwappedOperands(CC);
  4292. std::swap(LHS, RHS);
  4293. }
  4294. // Thumb1 has very limited immediate modes, so turning an "and" into a
  4295. // shift can save multiple instructions.
  4296. //
  4297. // If we have (x & C1), and C1 is an appropriate mask, we can transform it
  4298. // into "((x << n) >> n)". But that isn't necessarily profitable on its
  4299. // own. If it's the operand to an unsigned comparison with an immediate,
  4300. // we can eliminate one of the shifts: we transform
  4301. // "((x << n) >> n) == C2" to "(x << n) == (C2 << n)".
  4302. //
  4303. // We avoid transforming cases which aren't profitable due to encoding
  4304. // details:
  4305. //
  4306. // 1. C2 fits into the immediate field of a cmp, and the transformed version
  4307. // would not; in that case, we're essentially trading one immediate load for
  4308. // another.
  4309. // 2. C1 is 255 or 65535, so we can use uxtb or uxth.
  4310. // 3. C2 is zero; we have other code for this special case.
  4311. //
  4312. // FIXME: Figure out profitability for Thumb2; we usually can't save an
  4313. // instruction, since the AND is always one instruction anyway, but we could
  4314. // use narrow instructions in some cases.
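// For instance (illustrative): with C1 == 0xfff and C2 == 0x300, the check
// "(x & 0xfff) == 0x300" can become "(x << 20) == (0x300 << 20)", replacing
// the AND (and its materialized mask constant) with a single shift.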
  4315. if (Subtarget->isThumb1Only() && LHS->getOpcode() == ISD::AND &&
  4316. LHS->hasOneUse() && isa<ConstantSDNode>(LHS.getOperand(1)) &&
  4317. LHS.getValueType() == MVT::i32 && isa<ConstantSDNode>(RHS) &&
  4318. !isSignedIntSetCC(CC)) {
  4319. unsigned Mask = cast<ConstantSDNode>(LHS.getOperand(1))->getZExtValue();
  4320. auto *RHSC = cast<ConstantSDNode>(RHS.getNode());
  4321. uint64_t RHSV = RHSC->getZExtValue();
  4322. if (isMask_32(Mask) && (RHSV & ~Mask) == 0 && Mask != 255 && Mask != 65535) {
  4323. unsigned ShiftBits = countLeadingZeros(Mask);
  4324. if (RHSV && (RHSV > 255 || (RHSV << ShiftBits) <= 255)) {
  4325. SDValue ShiftAmt = DAG.getConstant(ShiftBits, dl, MVT::i32);
  4326. LHS = DAG.getNode(ISD::SHL, dl, MVT::i32, LHS.getOperand(0), ShiftAmt);
  4327. RHS = DAG.getConstant(RHSV << ShiftBits, dl, MVT::i32);
  4328. }
  4329. }
  4330. }
  4331. // The specific comparison "(x<<c) > 0x80000000U" can be optimized to a
  4332. // single "lsls x, c+1". The shift sets the "C" and "Z" flags the same
  4333. // way a cmp would.
  4334. // FIXME: Add support for ARM/Thumb2; this would need isel patterns, and
  4335. // some tweaks to the heuristics for the previous and->shift transform.
  4336. // FIXME: Optimize cases where the LHS isn't a shift.
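// For example (illustrative): "(x << 2) > 0x80000000u" can be emitted as
// "lsls r0, r0, #3", with the HI condition consuming the resulting C and Z
// flags just as a cmp would.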
  4337. if (Subtarget->isThumb1Only() && LHS->getOpcode() == ISD::SHL &&
  4338. isa<ConstantSDNode>(RHS) &&
  4339. cast<ConstantSDNode>(RHS)->getZExtValue() == 0x80000000U &&
  4340. CC == ISD::SETUGT && isa<ConstantSDNode>(LHS.getOperand(1)) &&
  4341. cast<ConstantSDNode>(LHS.getOperand(1))->getZExtValue() < 31) {
  4342. unsigned ShiftAmt =
  4343. cast<ConstantSDNode>(LHS.getOperand(1))->getZExtValue() + 1;
  4344. SDValue Shift = DAG.getNode(ARMISD::LSLS, dl,
  4345. DAG.getVTList(MVT::i32, MVT::i32),
  4346. LHS.getOperand(0),
  4347. DAG.getConstant(ShiftAmt, dl, MVT::i32));
  4348. SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, ARM::CPSR,
  4349. Shift.getValue(1), SDValue());
  4350. ARMcc = DAG.getConstant(ARMCC::HI, dl, MVT::i32);
  4351. return Chain.getValue(1);
  4352. }
  4353. ARMCC::CondCodes CondCode = IntCCToARMCC(CC);
  4354. // If the RHS is a constant zero then the V (overflow) flag will never be
  4355. // set. This can allow us to simplify GE to PL or LT to MI, which can be
  4356. // simpler for other passes (like the peephole optimiser) to deal with.
  4357. if (isNullConstant(RHS)) {
  4358. switch (CondCode) {
  4359. default: break;
  4360. case ARMCC::GE:
  4361. CondCode = ARMCC::PL;
  4362. break;
  4363. case ARMCC::LT:
  4364. CondCode = ARMCC::MI;
  4365. break;
  4366. }
  4367. }
  4368. ARMISD::NodeType CompareType;
  4369. switch (CondCode) {
  4370. default:
  4371. CompareType = ARMISD::CMP;
  4372. break;
  4373. case ARMCC::EQ:
  4374. case ARMCC::NE:
  4375. // Uses only Z Flag
  4376. CompareType = ARMISD::CMPZ;
  4377. break;
  4378. }
  4379. ARMcc = DAG.getConstant(CondCode, dl, MVT::i32);
  4380. return DAG.getNode(CompareType, dl, MVT::Glue, LHS, RHS);
  4381. }
4382. /// Returns an appropriate VFP CMP (fcmp{s|d}+fmstat) for the given operands.
  4383. SDValue ARMTargetLowering::getVFPCmp(SDValue LHS, SDValue RHS,
  4384. SelectionDAG &DAG, const SDLoc &dl,
  4385. bool Signaling) const {
  4386. assert(Subtarget->hasFP64() || RHS.getValueType() != MVT::f64);
  4387. SDValue Cmp;
  4388. if (!isFloatingPointZero(RHS))
  4389. Cmp = DAG.getNode(Signaling ? ARMISD::CMPFPE : ARMISD::CMPFP,
  4390. dl, MVT::Glue, LHS, RHS);
  4391. else
  4392. Cmp = DAG.getNode(Signaling ? ARMISD::CMPFPEw0 : ARMISD::CMPFPw0,
  4393. dl, MVT::Glue, LHS);
  4394. return DAG.getNode(ARMISD::FMSTAT, dl, MVT::Glue, Cmp);
  4395. }
  4396. /// duplicateCmp - Glue values can have only one use, so this function
  4397. /// duplicates a comparison node.
  4398. SDValue
  4399. ARMTargetLowering::duplicateCmp(SDValue Cmp, SelectionDAG &DAG) const {
  4400. unsigned Opc = Cmp.getOpcode();
  4401. SDLoc DL(Cmp);
  4402. if (Opc == ARMISD::CMP || Opc == ARMISD::CMPZ)
  4403. return DAG.getNode(Opc, DL, MVT::Glue, Cmp.getOperand(0),Cmp.getOperand(1));
  4404. assert(Opc == ARMISD::FMSTAT && "unexpected comparison operation");
  4405. Cmp = Cmp.getOperand(0);
  4406. Opc = Cmp.getOpcode();
  4407. if (Opc == ARMISD::CMPFP)
  4408. Cmp = DAG.getNode(Opc, DL, MVT::Glue, Cmp.getOperand(0),Cmp.getOperand(1));
  4409. else {
  4410. assert(Opc == ARMISD::CMPFPw0 && "unexpected operand of FMSTAT");
  4411. Cmp = DAG.getNode(Opc, DL, MVT::Glue, Cmp.getOperand(0));
  4412. }
  4413. return DAG.getNode(ARMISD::FMSTAT, DL, MVT::Glue, Cmp);
  4414. }
  4415. // This function returns three things: the arithmetic computation itself
  4416. // (Value), a comparison (OverflowCmp), and a condition code (ARMcc). The
  4417. // comparison and the condition code define the case in which the arithmetic
  4418. // computation *does not* overflow.
  4419. std::pair<SDValue, SDValue>
  4420. ARMTargetLowering::getARMXALUOOp(SDValue Op, SelectionDAG &DAG,
  4421. SDValue &ARMcc) const {
  4422. assert(Op.getValueType() == MVT::i32 && "Unsupported value type");
  4423. SDValue Value, OverflowCmp;
  4424. SDValue LHS = Op.getOperand(0);
  4425. SDValue RHS = Op.getOperand(1);
  4426. SDLoc dl(Op);
  4427. // FIXME: We are currently always generating CMPs because we don't support
  4428. // generating CMN through the backend. This is not as good as the natural
  4429. // CMP case because it causes a register dependency and cannot be folded
  4430. // later.
  4431. switch (Op.getOpcode()) {
  4432. default:
  4433. llvm_unreachable("Unknown overflow instruction!");
  4434. case ISD::SADDO:
  4435. ARMcc = DAG.getConstant(ARMCC::VC, dl, MVT::i32);
  4436. Value = DAG.getNode(ISD::ADD, dl, Op.getValueType(), LHS, RHS);
  4437. OverflowCmp = DAG.getNode(ARMISD::CMP, dl, MVT::Glue, Value, LHS);
  4438. break;
  4439. case ISD::UADDO:
  4440. ARMcc = DAG.getConstant(ARMCC::HS, dl, MVT::i32);
  4441. // We use ADDC here to correspond to its use in LowerUnsignedALUO.
  4442. // We do not use it in the USUBO case as Value may not be used.
  4443. Value = DAG.getNode(ARMISD::ADDC, dl,
  4444. DAG.getVTList(Op.getValueType(), MVT::i32), LHS, RHS)
  4445. .getValue(0);
  4446. OverflowCmp = DAG.getNode(ARMISD::CMP, dl, MVT::Glue, Value, LHS);
  4447. break;
  4448. case ISD::SSUBO:
  4449. ARMcc = DAG.getConstant(ARMCC::VC, dl, MVT::i32);
  4450. Value = DAG.getNode(ISD::SUB, dl, Op.getValueType(), LHS, RHS);
  4451. OverflowCmp = DAG.getNode(ARMISD::CMP, dl, MVT::Glue, LHS, RHS);
  4452. break;
  4453. case ISD::USUBO:
  4454. ARMcc = DAG.getConstant(ARMCC::HS, dl, MVT::i32);
  4455. Value = DAG.getNode(ISD::SUB, dl, Op.getValueType(), LHS, RHS);
  4456. OverflowCmp = DAG.getNode(ARMISD::CMP, dl, MVT::Glue, LHS, RHS);
  4457. break;
  4458. case ISD::UMULO:
  4459. // We generate a UMUL_LOHI and then check if the high word is 0.
  4460. ARMcc = DAG.getConstant(ARMCC::EQ, dl, MVT::i32);
  4461. Value = DAG.getNode(ISD::UMUL_LOHI, dl,
  4462. DAG.getVTList(Op.getValueType(), Op.getValueType()),
  4463. LHS, RHS);
  4464. OverflowCmp = DAG.getNode(ARMISD::CMP, dl, MVT::Glue, Value.getValue(1),
  4465. DAG.getConstant(0, dl, MVT::i32));
  4466. Value = Value.getValue(0); // We only want the low 32 bits for the result.
  4467. break;
  4468. case ISD::SMULO:
  4469. // We generate a SMUL_LOHI and then check if all the bits of the high word
  4470. // are the same as the sign bit of the low word.
  4471. ARMcc = DAG.getConstant(ARMCC::EQ, dl, MVT::i32);
  4472. Value = DAG.getNode(ISD::SMUL_LOHI, dl,
  4473. DAG.getVTList(Op.getValueType(), Op.getValueType()),
  4474. LHS, RHS);
  4475. OverflowCmp = DAG.getNode(ARMISD::CMP, dl, MVT::Glue, Value.getValue(1),
  4476. DAG.getNode(ISD::SRA, dl, Op.getValueType(),
  4477. Value.getValue(0),
  4478. DAG.getConstant(31, dl, MVT::i32)));
  4479. Value = Value.getValue(0); // We only want the low 32 bits for the result.
  4480. break;
  4481. } // switch (...)
  4482. return std::make_pair(Value, OverflowCmp);
  4483. }
  4484. SDValue
  4485. ARMTargetLowering::LowerSignedALUO(SDValue Op, SelectionDAG &DAG) const {
  4486. // Let legalize expand this if it isn't a legal type yet.
  4487. if (!DAG.getTargetLoweringInfo().isTypeLegal(Op.getValueType()))
  4488. return SDValue();
  4489. SDValue Value, OverflowCmp;
  4490. SDValue ARMcc;
  4491. std::tie(Value, OverflowCmp) = getARMXALUOOp(Op, DAG, ARMcc);
  4492. SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
  4493. SDLoc dl(Op);
  4494. // We use 0 and 1 as false and true values.
  4495. SDValue TVal = DAG.getConstant(1, dl, MVT::i32);
  4496. SDValue FVal = DAG.getConstant(0, dl, MVT::i32);
  4497. EVT VT = Op.getValueType();
  4498. SDValue Overflow = DAG.getNode(ARMISD::CMOV, dl, VT, TVal, FVal,
  4499. ARMcc, CCR, OverflowCmp);
  4500. SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
  4501. return DAG.getNode(ISD::MERGE_VALUES, dl, VTs, Value, Overflow);
  4502. }
  4503. static SDValue ConvertBooleanCarryToCarryFlag(SDValue BoolCarry,
  4504. SelectionDAG &DAG) {
  4505. SDLoc DL(BoolCarry);
  4506. EVT CarryVT = BoolCarry.getValueType();
  4507. // This converts the boolean value carry into the carry flag by doing
  4508. // ARMISD::SUBC Carry, 1
  4509. SDValue Carry = DAG.getNode(ARMISD::SUBC, DL,
  4510. DAG.getVTList(CarryVT, MVT::i32),
  4511. BoolCarry, DAG.getConstant(1, DL, CarryVT));
  4512. return Carry.getValue(1);
  4513. }
  4514. static SDValue ConvertCarryFlagToBooleanCarry(SDValue Flags, EVT VT,
  4515. SelectionDAG &DAG) {
  4516. SDLoc DL(Flags);
  4517. // Now convert the carry flag into a boolean carry. We do this
  4518. // using ARMISD:ADDE 0, 0, Carry
  4519. return DAG.getNode(ARMISD::ADDE, DL, DAG.getVTList(VT, MVT::i32),
  4520. DAG.getConstant(0, DL, MVT::i32),
  4521. DAG.getConstant(0, DL, MVT::i32), Flags);
  4522. }
  4523. SDValue ARMTargetLowering::LowerUnsignedALUO(SDValue Op,
  4524. SelectionDAG &DAG) const {
  4525. // Let legalize expand this if it isn't a legal type yet.
  4526. if (!DAG.getTargetLoweringInfo().isTypeLegal(Op.getValueType()))
  4527. return SDValue();
  4528. SDValue LHS = Op.getOperand(0);
  4529. SDValue RHS = Op.getOperand(1);
  4530. SDLoc dl(Op);
  4531. EVT VT = Op.getValueType();
  4532. SDVTList VTs = DAG.getVTList(VT, MVT::i32);
  4533. SDValue Value;
  4534. SDValue Overflow;
  4535. switch (Op.getOpcode()) {
  4536. default:
  4537. llvm_unreachable("Unknown overflow instruction!");
  4538. case ISD::UADDO:
  4539. Value = DAG.getNode(ARMISD::ADDC, dl, VTs, LHS, RHS);
  4540. // Convert the carry flag into a boolean value.
  4541. Overflow = ConvertCarryFlagToBooleanCarry(Value.getValue(1), VT, DAG);
  4542. break;
  4543. case ISD::USUBO: {
  4544. Value = DAG.getNode(ARMISD::SUBC, dl, VTs, LHS, RHS);
  4545. // Convert the carry flag into a boolean value.
  4546. Overflow = ConvertCarryFlagToBooleanCarry(Value.getValue(1), VT, DAG);
  4547. // ARMISD::SUBC returns 0 when we have to borrow, so make it an overflow
  4548. // value. So compute 1 - C.
  4549. Overflow = DAG.getNode(ISD::SUB, dl, MVT::i32,
  4550. DAG.getConstant(1, dl, MVT::i32), Overflow);
  4551. break;
  4552. }
  4553. }
  4554. return DAG.getNode(ISD::MERGE_VALUES, dl, VTs, Value, Overflow);
  4555. }
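// Rough sketch of the lowering below: on targets with the DSP extension,
// i8/i16 saturating add/sub are extended to i32, emitted as the matching
// QADD8b/QSUB8b/QADD16b/... nodes (which are expected to select to the
// bottom-lane qadd8/qsub8/qadd16-family instructions), and truncated back to
// the original type.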
  4556. static SDValue LowerADDSUBSAT(SDValue Op, SelectionDAG &DAG,
  4557. const ARMSubtarget *Subtarget) {
  4558. EVT VT = Op.getValueType();
  4559. if (!Subtarget->hasV6Ops() || !Subtarget->hasDSP())
  4560. return SDValue();
  4561. if (!VT.isSimple())
  4562. return SDValue();
  4563. unsigned NewOpcode;
  4564. switch (VT.getSimpleVT().SimpleTy) {
  4565. default:
  4566. return SDValue();
  4567. case MVT::i8:
  4568. switch (Op->getOpcode()) {
  4569. case ISD::UADDSAT:
  4570. NewOpcode = ARMISD::UQADD8b;
  4571. break;
  4572. case ISD::SADDSAT:
  4573. NewOpcode = ARMISD::QADD8b;
  4574. break;
  4575. case ISD::USUBSAT:
  4576. NewOpcode = ARMISD::UQSUB8b;
  4577. break;
  4578. case ISD::SSUBSAT:
  4579. NewOpcode = ARMISD::QSUB8b;
  4580. break;
  4581. }
  4582. break;
  4583. case MVT::i16:
  4584. switch (Op->getOpcode()) {
  4585. case ISD::UADDSAT:
  4586. NewOpcode = ARMISD::UQADD16b;
  4587. break;
  4588. case ISD::SADDSAT:
  4589. NewOpcode = ARMISD::QADD16b;
  4590. break;
  4591. case ISD::USUBSAT:
  4592. NewOpcode = ARMISD::UQSUB16b;
  4593. break;
  4594. case ISD::SSUBSAT:
  4595. NewOpcode = ARMISD::QSUB16b;
  4596. break;
  4597. }
  4598. break;
  4599. }
  4600. SDLoc dl(Op);
  4601. SDValue Add =
  4602. DAG.getNode(NewOpcode, dl, MVT::i32,
  4603. DAG.getSExtOrTrunc(Op->getOperand(0), dl, MVT::i32),
  4604. DAG.getSExtOrTrunc(Op->getOperand(1), dl, MVT::i32));
  4605. return DAG.getNode(ISD::TRUNCATE, dl, VT, Add);
  4606. }
  4607. SDValue ARMTargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
  4608. SDValue Cond = Op.getOperand(0);
  4609. SDValue SelectTrue = Op.getOperand(1);
  4610. SDValue SelectFalse = Op.getOperand(2);
  4611. SDLoc dl(Op);
  4612. unsigned Opc = Cond.getOpcode();
  4613. if (Cond.getResNo() == 1 &&
  4614. (Opc == ISD::SADDO || Opc == ISD::UADDO || Opc == ISD::SSUBO ||
  4615. Opc == ISD::USUBO)) {
  4616. if (!DAG.getTargetLoweringInfo().isTypeLegal(Cond->getValueType(0)))
  4617. return SDValue();
  4618. SDValue Value, OverflowCmp;
  4619. SDValue ARMcc;
  4620. std::tie(Value, OverflowCmp) = getARMXALUOOp(Cond, DAG, ARMcc);
  4621. SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
  4622. EVT VT = Op.getValueType();
  4623. return getCMOV(dl, VT, SelectTrue, SelectFalse, ARMcc, CCR,
  4624. OverflowCmp, DAG);
  4625. }
  4626. // Convert:
  4627. //
  4628. // (select (cmov 1, 0, cond), t, f) -> (cmov t, f, cond)
  4629. // (select (cmov 0, 1, cond), t, f) -> (cmov f, t, cond)
  4630. //
  4631. if (Cond.getOpcode() == ARMISD::CMOV && Cond.hasOneUse()) {
  4632. const ConstantSDNode *CMOVTrue =
  4633. dyn_cast<ConstantSDNode>(Cond.getOperand(0));
  4634. const ConstantSDNode *CMOVFalse =
  4635. dyn_cast<ConstantSDNode>(Cond.getOperand(1));
  4636. if (CMOVTrue && CMOVFalse) {
  4637. unsigned CMOVTrueVal = CMOVTrue->getZExtValue();
  4638. unsigned CMOVFalseVal = CMOVFalse->getZExtValue();
  4639. SDValue True;
  4640. SDValue False;
  4641. if (CMOVTrueVal == 1 && CMOVFalseVal == 0) {
  4642. True = SelectTrue;
  4643. False = SelectFalse;
  4644. } else if (CMOVTrueVal == 0 && CMOVFalseVal == 1) {
  4645. True = SelectFalse;
  4646. False = SelectTrue;
  4647. }
  4648. if (True.getNode() && False.getNode()) {
  4649. EVT VT = Op.getValueType();
  4650. SDValue ARMcc = Cond.getOperand(2);
  4651. SDValue CCR = Cond.getOperand(3);
  4652. SDValue Cmp = duplicateCmp(Cond.getOperand(4), DAG);
  4653. assert(True.getValueType() == VT);
  4654. return getCMOV(dl, VT, True, False, ARMcc, CCR, Cmp, DAG);
  4655. }
  4656. }
  4657. }
  4658. // ARM's BooleanContents value is UndefinedBooleanContent. Mask out the
  4659. // undefined bits before doing a full-word comparison with zero.
  4660. Cond = DAG.getNode(ISD::AND, dl, Cond.getValueType(), Cond,
  4661. DAG.getConstant(1, dl, Cond.getValueType()));
  4662. return DAG.getSelectCC(dl, Cond,
  4663. DAG.getConstant(0, dl, Cond.getValueType()),
  4664. SelectTrue, SelectFalse, ISD::SETNE);
  4665. }
  4666. static void checkVSELConstraints(ISD::CondCode CC, ARMCC::CondCodes &CondCode,
  4667. bool &swpCmpOps, bool &swpVselOps) {
  4668. // Start by selecting the GE condition code for opcodes that return true for
  4669. // 'equality'
  4670. if (CC == ISD::SETUGE || CC == ISD::SETOGE || CC == ISD::SETOLE ||
  4671. CC == ISD::SETULE || CC == ISD::SETGE || CC == ISD::SETLE)
  4672. CondCode = ARMCC::GE;
  4673. // and GT for opcodes that return false for 'equality'.
  4674. else if (CC == ISD::SETUGT || CC == ISD::SETOGT || CC == ISD::SETOLT ||
  4675. CC == ISD::SETULT || CC == ISD::SETGT || CC == ISD::SETLT)
  4676. CondCode = ARMCC::GT;
  4677. // Since we are constrained to GE/GT, if the opcode contains 'less', we need
  4678. // to swap the compare operands.
  4679. if (CC == ISD::SETOLE || CC == ISD::SETULE || CC == ISD::SETOLT ||
  4680. CC == ISD::SETULT || CC == ISD::SETLE || CC == ISD::SETLT)
  4681. swpCmpOps = true;
  4682. // Both GT and GE are ordered comparisons, and return false for 'unordered'.
  4683. // If we have an unordered opcode, we need to swap the operands to the VSEL
  4684. // instruction (effectively negating the condition).
  4685. //
  4686. // This also has the effect of swapping which one of 'less' or 'greater'
  4687. // returns true, so we also swap the compare operands. It also switches
  4688. // whether we return true for 'equality', so we compensate by picking the
  4689. // opposite condition code to our original choice.
  4690. if (CC == ISD::SETULE || CC == ISD::SETULT || CC == ISD::SETUGE ||
  4691. CC == ISD::SETUGT) {
  4692. swpCmpOps = !swpCmpOps;
  4693. swpVselOps = !swpVselOps;
  4694. CondCode = CondCode == ARMCC::GT ? ARMCC::GE : ARMCC::GT;
  4695. }
  4696. // 'ordered' is 'anything but unordered', so use the VS condition code and
  4697. // swap the VSEL operands.
  4698. if (CC == ISD::SETO) {
  4699. CondCode = ARMCC::VS;
  4700. swpVselOps = true;
  4701. }
  4702. // 'unordered or not equal' is 'anything but equal', so use the EQ condition
  4703. // code and swap the VSEL operands. Also do this if we don't care about the
  4704. // unordered case.
  4705. if (CC == ISD::SETUNE || CC == ISD::SETNE) {
  4706. CondCode = ARMCC::EQ;
  4707. swpVselOps = true;
  4708. }
  4709. }
  4710. SDValue ARMTargetLowering::getCMOV(const SDLoc &dl, EVT VT, SDValue FalseVal,
  4711. SDValue TrueVal, SDValue ARMcc, SDValue CCR,
  4712. SDValue Cmp, SelectionDAG &DAG) const {
  4713. if (!Subtarget->hasFP64() && VT == MVT::f64) {
  4714. FalseVal = DAG.getNode(ARMISD::VMOVRRD, dl,
  4715. DAG.getVTList(MVT::i32, MVT::i32), FalseVal);
  4716. TrueVal = DAG.getNode(ARMISD::VMOVRRD, dl,
  4717. DAG.getVTList(MVT::i32, MVT::i32), TrueVal);
  4718. SDValue TrueLow = TrueVal.getValue(0);
  4719. SDValue TrueHigh = TrueVal.getValue(1);
  4720. SDValue FalseLow = FalseVal.getValue(0);
  4721. SDValue FalseHigh = FalseVal.getValue(1);
  4722. SDValue Low = DAG.getNode(ARMISD::CMOV, dl, MVT::i32, FalseLow, TrueLow,
  4723. ARMcc, CCR, Cmp);
  4724. SDValue High = DAG.getNode(ARMISD::CMOV, dl, MVT::i32, FalseHigh, TrueHigh,
  4725. ARMcc, CCR, duplicateCmp(Cmp, DAG));
  4726. return DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Low, High);
  4727. } else {
  4728. return DAG.getNode(ARMISD::CMOV, dl, VT, FalseVal, TrueVal, ARMcc, CCR,
  4729. Cmp);
  4730. }
  4731. }
  4732. static bool isGTorGE(ISD::CondCode CC) {
  4733. return CC == ISD::SETGT || CC == ISD::SETGE;
  4734. }
  4735. static bool isLTorLE(ISD::CondCode CC) {
  4736. return CC == ISD::SETLT || CC == ISD::SETLE;
  4737. }
  4738. // See if a conditional (LHS CC RHS ? TrueVal : FalseVal) is lower-saturating.
  4739. // All of these conditions (and their <= and >= counterparts) will do:
  4740. // x < k ? k : x
  4741. // x > k ? x : k
  4742. // k < x ? x : k
  4743. // k > x ? k : x
  4744. static bool isLowerSaturate(const SDValue LHS, const SDValue RHS,
  4745. const SDValue TrueVal, const SDValue FalseVal,
  4746. const ISD::CondCode CC, const SDValue K) {
  4747. return (isGTorGE(CC) &&
  4748. ((K == LHS && K == TrueVal) || (K == RHS && K == FalseVal))) ||
  4749. (isLTorLE(CC) &&
  4750. ((K == RHS && K == TrueVal) || (K == LHS && K == FalseVal)));
  4751. }
  4752. // Check if two chained conditionals could be converted into SSAT or USAT.
  4753. //
  4754. // SSAT can replace a set of two conditional selectors that bound a number to an
  4755. // interval of type [k, ~k] when k + 1 is a power of 2. Here are some examples:
  4756. //
  4757. // x < -k ? -k : (x > k ? k : x)
  4758. // x < -k ? -k : (x < k ? x : k)
  4759. // x > -k ? (x > k ? k : x) : -k
  4760. // x < k ? (x < -k ? -k : x) : k
  4761. // etc.
  4762. //
  4763. // LLVM canonicalizes these to either a min(max()) or a max(min())
  4764. // pattern. This function tries to match one of these and will return a SSAT
  4765. // node if successful.
  4766. //
4767. // USAT works similarly to SSAT, but bounds the value to the interval [0, k],
4768. // where k + 1 is a power of 2.
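// For example (illustrative): clamping x to [-128, 127] (k + 1 == 128, a
// power of 2) can be matched to an SSAT, and clamping to [0, 255] to a USAT,
// replacing two conditional selects with a single saturating operation.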
  4769. static SDValue LowerSaturatingConditional(SDValue Op, SelectionDAG &DAG) {
  4770. EVT VT = Op.getValueType();
  4771. SDValue V1 = Op.getOperand(0);
  4772. SDValue K1 = Op.getOperand(1);
  4773. SDValue TrueVal1 = Op.getOperand(2);
  4774. SDValue FalseVal1 = Op.getOperand(3);
  4775. ISD::CondCode CC1 = cast<CondCodeSDNode>(Op.getOperand(4))->get();
  4776. const SDValue Op2 = isa<ConstantSDNode>(TrueVal1) ? FalseVal1 : TrueVal1;
  4777. if (Op2.getOpcode() != ISD::SELECT_CC)
  4778. return SDValue();
  4779. SDValue V2 = Op2.getOperand(0);
  4780. SDValue K2 = Op2.getOperand(1);
  4781. SDValue TrueVal2 = Op2.getOperand(2);
  4782. SDValue FalseVal2 = Op2.getOperand(3);
  4783. ISD::CondCode CC2 = cast<CondCodeSDNode>(Op2.getOperand(4))->get();
  4784. SDValue V1Tmp = V1;
  4785. SDValue V2Tmp = V2;
  4786. // Check that the registers and the constants match a max(min()) or min(max())
  4787. // pattern
  4788. if (V1Tmp != TrueVal1 || V2Tmp != TrueVal2 || K1 != FalseVal1 ||
  4789. K2 != FalseVal2 ||
  4790. !((isGTorGE(CC1) && isLTorLE(CC2)) || (isLTorLE(CC1) && isGTorGE(CC2))))
  4791. return SDValue();
4792. // Check that the constant in the lower-bound check is
4793. // the one's complement of the constant
4794. // in the upper-bound check.
  4795. if (!isa<ConstantSDNode>(K1) || !isa<ConstantSDNode>(K2))
  4796. return SDValue();
  4797. int64_t Val1 = cast<ConstantSDNode>(K1)->getSExtValue();
  4798. int64_t Val2 = cast<ConstantSDNode>(K2)->getSExtValue();
  4799. int64_t PosVal = std::max(Val1, Val2);
  4800. int64_t NegVal = std::min(Val1, Val2);
  4801. if (!((Val1 > Val2 && isLTorLE(CC1)) || (Val1 < Val2 && isLTorLE(CC2))) ||
  4802. !isPowerOf2_64(PosVal + 1))
  4803. return SDValue();
  4804. // Handle the difference between USAT (unsigned) and SSAT (signed)
  4805. // saturation
  4806. // At this point, PosVal is guaranteed to be positive
  4807. uint64_t K = PosVal;
  4808. SDLoc dl(Op);
  4809. if (Val1 == ~Val2)
  4810. return DAG.getNode(ARMISD::SSAT, dl, VT, V2Tmp,
  4811. DAG.getConstant(countTrailingOnes(K), dl, VT));
  4812. if (NegVal == 0)
  4813. return DAG.getNode(ARMISD::USAT, dl, VT, V2Tmp,
  4814. DAG.getConstant(countTrailingOnes(K), dl, VT));
  4815. return SDValue();
  4816. }
  4817. // Check if a condition of the type x < k ? k : x can be converted into a
  4818. // bit operation instead of conditional moves.
  4819. // Currently this is allowed given:
  4820. // - The conditions and values match up
  4821. // - k is 0 or -1 (all ones)
4822. // This function will not check the last condition; that's up to the caller.
4823. // It returns true if the transformation can be made, and in such case
  4824. // returns x in V, and k in SatK.
  4825. static bool isLowerSaturatingConditional(const SDValue &Op, SDValue &V,
  4826. SDValue &SatK)
  4827. {
  4828. SDValue LHS = Op.getOperand(0);
  4829. SDValue RHS = Op.getOperand(1);
  4830. ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get();
  4831. SDValue TrueVal = Op.getOperand(2);
  4832. SDValue FalseVal = Op.getOperand(3);
  4833. SDValue *K = isa<ConstantSDNode>(LHS) ? &LHS : isa<ConstantSDNode>(RHS)
  4834. ? &RHS
  4835. : nullptr;
4836. // No constant operand in the comparison, early out.
  4837. if (!K)
  4838. return false;
  4839. SDValue KTmp = isa<ConstantSDNode>(TrueVal) ? TrueVal : FalseVal;
  4840. V = (KTmp == TrueVal) ? FalseVal : TrueVal;
  4841. SDValue VTmp = (K && *K == LHS) ? RHS : LHS;
4842. // If the constant on the left and right side, or the variable on the left
4843. // and right side, does not match, early out.
  4844. if (*K != KTmp || V != VTmp)
  4845. return false;
  4846. if (isLowerSaturate(LHS, RHS, TrueVal, FalseVal, CC, *K)) {
  4847. SatK = *K;
  4848. return true;
  4849. }
  4850. return false;
  4851. }
  4852. bool ARMTargetLowering::isUnsupportedFloatingType(EVT VT) const {
  4853. if (VT == MVT::f32)
  4854. return !Subtarget->hasVFP2Base();
  4855. if (VT == MVT::f64)
  4856. return !Subtarget->hasFP64();
  4857. if (VT == MVT::f16)
  4858. return !Subtarget->hasFullFP16();
  4859. return false;
  4860. }
  4861. SDValue ARMTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
  4862. EVT VT = Op.getValueType();
  4863. SDLoc dl(Op);
  4864. // Try to convert two saturating conditional selects into a single SSAT
  4865. if ((!Subtarget->isThumb() && Subtarget->hasV6Ops()) || Subtarget->isThumb2())
  4866. if (SDValue SatValue = LowerSaturatingConditional(Op, DAG))
  4867. return SatValue;
  4868. // Try to convert expressions of the form x < k ? k : x (and similar forms)
4869. // into more efficient bit operations, which is possible when k is 0 or -1.
4870. // On ARM and Thumb-2, which have a flexible second operand, this will result
4871. // in single instructions. On Thumb1 the shift and the bit operation will be
4872. // two instructions.
  4873. // Only allow this transformation on full-width (32-bit) operations
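// For example (illustrative): max(x, 0), i.e. "x < 0 ? 0 : x", becomes
// "x & ~(x >> 31)", and max(x, -1) becomes "x | (x >> 31)" (arithmetic shift).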
  4874. SDValue LowerSatConstant;
  4875. SDValue SatValue;
  4876. if (VT == MVT::i32 &&
  4877. isLowerSaturatingConditional(Op, SatValue, LowerSatConstant)) {
  4878. SDValue ShiftV = DAG.getNode(ISD::SRA, dl, VT, SatValue,
  4879. DAG.getConstant(31, dl, VT));
  4880. if (isNullConstant(LowerSatConstant)) {
  4881. SDValue NotShiftV = DAG.getNode(ISD::XOR, dl, VT, ShiftV,
  4882. DAG.getAllOnesConstant(dl, VT));
  4883. return DAG.getNode(ISD::AND, dl, VT, SatValue, NotShiftV);
  4884. } else if (isAllOnesConstant(LowerSatConstant))
  4885. return DAG.getNode(ISD::OR, dl, VT, SatValue, ShiftV);
  4886. }
  4887. SDValue LHS = Op.getOperand(0);
  4888. SDValue RHS = Op.getOperand(1);
  4889. ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get();
  4890. SDValue TrueVal = Op.getOperand(2);
  4891. SDValue FalseVal = Op.getOperand(3);
  4892. ConstantSDNode *CFVal = dyn_cast<ConstantSDNode>(FalseVal);
  4893. ConstantSDNode *CTVal = dyn_cast<ConstantSDNode>(TrueVal);
  4894. if (Subtarget->hasV8_1MMainlineOps() && CFVal && CTVal &&
  4895. LHS.getValueType() == MVT::i32 && RHS.getValueType() == MVT::i32) {
  4896. unsigned TVal = CTVal->getZExtValue();
  4897. unsigned FVal = CFVal->getZExtValue();
  4898. unsigned Opcode = 0;
  4899. if (TVal == ~FVal) {
  4900. Opcode = ARMISD::CSINV;
  4901. } else if (TVal == ~FVal + 1) {
  4902. Opcode = ARMISD::CSNEG;
  4903. } else if (TVal + 1 == FVal) {
  4904. Opcode = ARMISD::CSINC;
  4905. } else if (TVal == FVal + 1) {
  4906. Opcode = ARMISD::CSINC;
  4907. std::swap(TrueVal, FalseVal);
  4908. std::swap(TVal, FVal);
  4909. CC = ISD::getSetCCInverse(CC, LHS.getValueType());
  4910. }
  4911. if (Opcode) {
  4912. // If one of the constants is cheaper than another, materialise the
  4913. // cheaper one and let the csel generate the other.
  4914. if (Opcode != ARMISD::CSINC &&
  4915. HasLowerConstantMaterializationCost(FVal, TVal, Subtarget)) {
  4916. std::swap(TrueVal, FalseVal);
  4917. std::swap(TVal, FVal);
  4918. CC = ISD::getSetCCInverse(CC, LHS.getValueType());
  4919. }
4920. // Attempt to use ZR by checking whether TVal is 0, possibly inverting the
4921. // condition to get there. CSINC is not invertible like the other two
4922. // (~(~a) == a, -(-a) == a, but (a+1)+1 != a).
  4923. if (FVal == 0 && Opcode != ARMISD::CSINC) {
  4924. std::swap(TrueVal, FalseVal);
  4925. std::swap(TVal, FVal);
  4926. CC = ISD::getSetCCInverse(CC, LHS.getValueType());
  4927. }
  4928. // Drops F's value because we can get it by inverting/negating TVal.
  4929. FalseVal = TrueVal;
  4930. SDValue ARMcc;
  4931. SDValue Cmp = getARMCmp(LHS, RHS, CC, ARMcc, DAG, dl);
  4932. EVT VT = TrueVal.getValueType();
  4933. return DAG.getNode(Opcode, dl, VT, TrueVal, FalseVal, ARMcc, Cmp);
  4934. }
  4935. }
  4936. if (isUnsupportedFloatingType(LHS.getValueType())) {
  4937. DAG.getTargetLoweringInfo().softenSetCCOperands(
  4938. DAG, LHS.getValueType(), LHS, RHS, CC, dl, LHS, RHS);
  4939. // If softenSetCCOperands only returned one value, we should compare it to
  4940. // zero.
  4941. if (!RHS.getNode()) {
  4942. RHS = DAG.getConstant(0, dl, LHS.getValueType());
  4943. CC = ISD::SETNE;
  4944. }
  4945. }
  4946. if (LHS.getValueType() == MVT::i32) {
  4947. // Try to generate VSEL on ARMv8.
  4948. // The VSEL instruction can't use all the usual ARM condition
  4949. // codes: it only has two bits to select the condition code, so it's
  4950. // constrained to use only GE, GT, VS and EQ.
  4951. //
  4952. // To implement all the various ISD::SETXXX opcodes, we sometimes need to
  4953. // swap the operands of the previous compare instruction (effectively
  4954. // inverting the compare condition, swapping 'less' and 'greater') and
  4955. // sometimes need to swap the operands to the VSEL (which inverts the
  4956. // condition in the sense of firing whenever the previous condition didn't)
  4957. if (Subtarget->hasFPARMv8Base() && (TrueVal.getValueType() == MVT::f16 ||
  4958. TrueVal.getValueType() == MVT::f32 ||
  4959. TrueVal.getValueType() == MVT::f64)) {
  4960. ARMCC::CondCodes CondCode = IntCCToARMCC(CC);
  4961. if (CondCode == ARMCC::LT || CondCode == ARMCC::LE ||
  4962. CondCode == ARMCC::VC || CondCode == ARMCC::NE) {
  4963. CC = ISD::getSetCCInverse(CC, LHS.getValueType());
  4964. std::swap(TrueVal, FalseVal);
  4965. }
  4966. }
  4967. SDValue ARMcc;
  4968. SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
  4969. SDValue Cmp = getARMCmp(LHS, RHS, CC, ARMcc, DAG, dl);
4970. // Choose GE over PL, which vsel does not support.
  4971. if (cast<ConstantSDNode>(ARMcc)->getZExtValue() == ARMCC::PL)
  4972. ARMcc = DAG.getConstant(ARMCC::GE, dl, MVT::i32);
  4973. return getCMOV(dl, VT, FalseVal, TrueVal, ARMcc, CCR, Cmp, DAG);
  4974. }
  4975. ARMCC::CondCodes CondCode, CondCode2;
  4976. FPCCToARMCC(CC, CondCode, CondCode2);
  4977. // Normalize the fp compare. If RHS is zero we prefer to keep it there so we
  4978. // match CMPFPw0 instead of CMPFP, though we don't do this for f16 because we
  4979. // must use VSEL (limited condition codes), due to not having conditional f16
  4980. // moves.
  4981. if (Subtarget->hasFPARMv8Base() &&
  4982. !(isFloatingPointZero(RHS) && TrueVal.getValueType() != MVT::f16) &&
  4983. (TrueVal.getValueType() == MVT::f16 ||
  4984. TrueVal.getValueType() == MVT::f32 ||
  4985. TrueVal.getValueType() == MVT::f64)) {
  4986. bool swpCmpOps = false;
  4987. bool swpVselOps = false;
  4988. checkVSELConstraints(CC, CondCode, swpCmpOps, swpVselOps);
  4989. if (CondCode == ARMCC::GT || CondCode == ARMCC::GE ||
  4990. CondCode == ARMCC::VS || CondCode == ARMCC::EQ) {
  4991. if (swpCmpOps)
  4992. std::swap(LHS, RHS);
  4993. if (swpVselOps)
  4994. std::swap(TrueVal, FalseVal);
  4995. }
  4996. }
  4997. SDValue ARMcc = DAG.getConstant(CondCode, dl, MVT::i32);
  4998. SDValue Cmp = getVFPCmp(LHS, RHS, DAG, dl);
  4999. SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
  5000. SDValue Result = getCMOV(dl, VT, FalseVal, TrueVal, ARMcc, CCR, Cmp, DAG);
  5001. if (CondCode2 != ARMCC::AL) {
  5002. SDValue ARMcc2 = DAG.getConstant(CondCode2, dl, MVT::i32);
  5003. // FIXME: Needs another CMP because flag can have but one use.
  5004. SDValue Cmp2 = getVFPCmp(LHS, RHS, DAG, dl);
  5005. Result = getCMOV(dl, VT, Result, TrueVal, ARMcc2, CCR, Cmp2, DAG);
  5006. }
  5007. return Result;
  5008. }
  5009. /// canChangeToInt - Given the fp compare operand, return true if it is suitable
  5010. /// to morph to an integer compare sequence.
  5011. static bool canChangeToInt(SDValue Op, bool &SeenZero,
  5012. const ARMSubtarget *Subtarget) {
  5013. SDNode *N = Op.getNode();
  5014. if (!N->hasOneUse())
  5015. // Otherwise it requires moving the value from fp to integer registers.
  5016. return false;
  5017. if (!N->getNumValues())
  5018. return false;
  5019. EVT VT = Op.getValueType();
  5020. if (VT != MVT::f32 && !Subtarget->isFPBrccSlow())
  5021. // f32 case is generally profitable. f64 case only makes sense when vcmpe +
  5022. // vmrs are very slow, e.g. cortex-a8.
  5023. return false;
  5024. if (isFloatingPointZero(Op)) {
  5025. SeenZero = true;
  5026. return true;
  5027. }
  5028. return ISD::isNormalLoad(N);
  5029. }
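// bitcastf32Toi32 - Reinterpret an f32 operand as an i32 without moving it
// through a VFP register: a floating-point zero becomes the i32 constant 0,
// and a normal load is re-issued as an i32 load of the same address.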
  5030. static SDValue bitcastf32Toi32(SDValue Op, SelectionDAG &DAG) {
  5031. if (isFloatingPointZero(Op))
  5032. return DAG.getConstant(0, SDLoc(Op), MVT::i32);
  5033. if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Op))
  5034. return DAG.getLoad(MVT::i32, SDLoc(Op), Ld->getChain(), Ld->getBasePtr(),
  5035. Ld->getPointerInfo(), Ld->getAlign(),
  5036. Ld->getMemOperand()->getFlags());
  5037. llvm_unreachable("Unknown VFP cmp argument!");
  5038. }
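// expandf64Toi32 - Split an f64 operand into its two 32-bit halves: zero
// becomes a pair of i32 zero constants, and a load becomes two i32 loads at
// offsets 0 and 4 from the original address.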
  5039. static void expandf64Toi32(SDValue Op, SelectionDAG &DAG,
  5040. SDValue &RetVal1, SDValue &RetVal2) {
  5041. SDLoc dl(Op);
  5042. if (isFloatingPointZero(Op)) {
  5043. RetVal1 = DAG.getConstant(0, dl, MVT::i32);
  5044. RetVal2 = DAG.getConstant(0, dl, MVT::i32);
  5045. return;
  5046. }
  5047. if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Op)) {
  5048. SDValue Ptr = Ld->getBasePtr();
  5049. RetVal1 =
  5050. DAG.getLoad(MVT::i32, dl, Ld->getChain(), Ptr, Ld->getPointerInfo(),
  5051. Ld->getAlign(), Ld->getMemOperand()->getFlags());
  5052. EVT PtrType = Ptr.getValueType();
  5053. SDValue NewPtr = DAG.getNode(ISD::ADD, dl,
  5054. PtrType, Ptr, DAG.getConstant(4, dl, PtrType));
  5055. RetVal2 = DAG.getLoad(MVT::i32, dl, Ld->getChain(), NewPtr,
  5056. Ld->getPointerInfo().getWithOffset(4),
  5057. commonAlignment(Ld->getAlign(), 4),
  5058. Ld->getMemOperand()->getFlags());
  5059. return;
  5060. }
  5061. llvm_unreachable("Unknown VFP cmp argument!");
  5062. }
  5063. /// OptimizeVFPBrcond - With -enable-unsafe-fp-math, it's legal to optimize some
  5064. /// f32 and even f64 comparisons to integer ones.
  5065. SDValue
  5066. ARMTargetLowering::OptimizeVFPBrcond(SDValue Op, SelectionDAG &DAG) const {
  5067. SDValue Chain = Op.getOperand(0);
  5068. ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(1))->get();
  5069. SDValue LHS = Op.getOperand(2);
  5070. SDValue RHS = Op.getOperand(3);
  5071. SDValue Dest = Op.getOperand(4);
  5072. SDLoc dl(Op);
  5073. bool LHSSeenZero = false;
  5074. bool LHSOk = canChangeToInt(LHS, LHSSeenZero, Subtarget);
  5075. bool RHSSeenZero = false;
  5076. bool RHSOk = canChangeToInt(RHS, RHSSeenZero, Subtarget);
  5077. if (LHSOk && RHSOk && (LHSSeenZero || RHSSeenZero)) {
  5078. // If unsafe fp math optimization is enabled and there are no other uses of
  5079. // the CMP operands, and the condition code is EQ or NE, we can optimize it
  5080. // to an integer comparison.
  5081. if (CC == ISD::SETOEQ)
  5082. CC = ISD::SETEQ;
  5083. else if (CC == ISD::SETUNE)
  5084. CC = ISD::SETNE;
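// Mask off the sign bits so that +0.0 and -0.0 both compare equal to integer
// zero below, matching the behaviour of the original FP compare against zero.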
  5085. SDValue Mask = DAG.getConstant(0x7fffffff, dl, MVT::i32);
  5086. SDValue ARMcc;
  5087. if (LHS.getValueType() == MVT::f32) {
  5088. LHS = DAG.getNode(ISD::AND, dl, MVT::i32,
  5089. bitcastf32Toi32(LHS, DAG), Mask);
  5090. RHS = DAG.getNode(ISD::AND, dl, MVT::i32,
  5091. bitcastf32Toi32(RHS, DAG), Mask);
  5092. SDValue Cmp = getARMCmp(LHS, RHS, CC, ARMcc, DAG, dl);
  5093. SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
  5094. return DAG.getNode(ARMISD::BRCOND, dl, MVT::Other,
  5095. Chain, Dest, ARMcc, CCR, Cmp);
  5096. }
  5097. SDValue LHS1, LHS2;
  5098. SDValue RHS1, RHS2;
  5099. expandf64Toi32(LHS, DAG, LHS1, LHS2);
  5100. expandf64Toi32(RHS, DAG, RHS1, RHS2);
  5101. LHS2 = DAG.getNode(ISD::AND, dl, MVT::i32, LHS2, Mask);
  5102. RHS2 = DAG.getNode(ISD::AND, dl, MVT::i32, RHS2, Mask);
  5103. ARMCC::CondCodes CondCode = IntCCToARMCC(CC);
  5104. ARMcc = DAG.getConstant(CondCode, dl, MVT::i32);
  5105. SDVTList VTList = DAG.getVTList(MVT::Other, MVT::Glue);
  5106. SDValue Ops[] = { Chain, ARMcc, LHS1, LHS2, RHS1, RHS2, Dest };
  5107. return DAG.getNode(ARMISD::BCC_i64, dl, VTList, Ops);
  5108. }
  5109. return SDValue();
  5110. }
  5111. SDValue ARMTargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
  5112. SDValue Chain = Op.getOperand(0);
  5113. SDValue Cond = Op.getOperand(1);
  5114. SDValue Dest = Op.getOperand(2);
  5115. SDLoc dl(Op);
  5116. // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a branch
  5117. // instruction.
  5118. unsigned Opc = Cond.getOpcode();
  5119. bool OptimizeMul = (Opc == ISD::SMULO || Opc == ISD::UMULO) &&
  5120. !Subtarget->isThumb1Only();
  5121. if (Cond.getResNo() == 1 &&
  5122. (Opc == ISD::SADDO || Opc == ISD::UADDO || Opc == ISD::SSUBO ||
  5123. Opc == ISD::USUBO || OptimizeMul)) {
  5124. // Only lower legal XALUO ops.
  5125. if (!DAG.getTargetLoweringInfo().isTypeLegal(Cond->getValueType(0)))
  5126. return SDValue();
  5127. // The actual operation with overflow check.
  5128. SDValue Value, OverflowCmp;
  5129. SDValue ARMcc;
  5130. std::tie(Value, OverflowCmp) = getARMXALUOOp(Cond, DAG, ARMcc);
  5131. // Reverse the condition code.
  5132. ARMCC::CondCodes CondCode =
  5133. (ARMCC::CondCodes)cast<const ConstantSDNode>(ARMcc)->getZExtValue();
  5134. CondCode = ARMCC::getOppositeCondition(CondCode);
  5135. ARMcc = DAG.getConstant(CondCode, SDLoc(ARMcc), MVT::i32);
  5136. SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
  5137. return DAG.getNode(ARMISD::BRCOND, dl, MVT::Other, Chain, Dest, ARMcc, CCR,
  5138. OverflowCmp);
  5139. }
  5140. return SDValue();
  5141. }
  5142. SDValue ARMTargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const {
  5143. SDValue Chain = Op.getOperand(0);
  5144. ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(1))->get();
  5145. SDValue LHS = Op.getOperand(2);
  5146. SDValue RHS = Op.getOperand(3);
  5147. SDValue Dest = Op.getOperand(4);
  5148. SDLoc dl(Op);
  5149. if (isUnsupportedFloatingType(LHS.getValueType())) {
  5150. DAG.getTargetLoweringInfo().softenSetCCOperands(
  5151. DAG, LHS.getValueType(), LHS, RHS, CC, dl, LHS, RHS);
  5152. // If softenSetCCOperands only returned one value, we should compare it to
  5153. // zero.
  5154. if (!RHS.getNode()) {
  5155. RHS = DAG.getConstant(0, dl, LHS.getValueType());
  5156. CC = ISD::SETNE;
  5157. }
  5158. }
  5159. // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a branch
  5160. // instruction.
  5161. unsigned Opc = LHS.getOpcode();
  5162. bool OptimizeMul = (Opc == ISD::SMULO || Opc == ISD::UMULO) &&
  5163. !Subtarget->isThumb1Only();
  5164. if (LHS.getResNo() == 1 && (isOneConstant(RHS) || isNullConstant(RHS)) &&
  5165. (Opc == ISD::SADDO || Opc == ISD::UADDO || Opc == ISD::SSUBO ||
  5166. Opc == ISD::USUBO || OptimizeMul) &&
  5167. (CC == ISD::SETEQ || CC == ISD::SETNE)) {
  5168. // Only lower legal XALUO ops.
  5169. if (!DAG.getTargetLoweringInfo().isTypeLegal(LHS->getValueType(0)))
  5170. return SDValue();
  5171. // The actual operation with overflow check.
  5172. SDValue Value, OverflowCmp;
  5173. SDValue ARMcc;
  5174. std::tie(Value, OverflowCmp) = getARMXALUOOp(LHS.getValue(0), DAG, ARMcc);
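// Decide whether the branch should fire on the overflow or the no-overflow
// case, and flip the ARM condition from getARMXALUOOp accordingly.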
  5175. if ((CC == ISD::SETNE) != isOneConstant(RHS)) {
  5176. // Reverse the condition code.
  5177. ARMCC::CondCodes CondCode =
  5178. (ARMCC::CondCodes)cast<const ConstantSDNode>(ARMcc)->getZExtValue();
  5179. CondCode = ARMCC::getOppositeCondition(CondCode);
  5180. ARMcc = DAG.getConstant(CondCode, SDLoc(ARMcc), MVT::i32);
  5181. }
  5182. SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
  5183. return DAG.getNode(ARMISD::BRCOND, dl, MVT::Other, Chain, Dest, ARMcc, CCR,
  5184. OverflowCmp);
  5185. }
  5186. if (LHS.getValueType() == MVT::i32) {
  5187. SDValue ARMcc;
  5188. SDValue Cmp = getARMCmp(LHS, RHS, CC, ARMcc, DAG, dl);
  5189. SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
  5190. return DAG.getNode(ARMISD::BRCOND, dl, MVT::Other,
  5191. Chain, Dest, ARMcc, CCR, Cmp);
  5192. }
  5193. if (getTargetMachine().Options.UnsafeFPMath &&
  5194. (CC == ISD::SETEQ || CC == ISD::SETOEQ ||
  5195. CC == ISD::SETNE || CC == ISD::SETUNE)) {
  5196. if (SDValue Result = OptimizeVFPBrcond(Op, DAG))
  5197. return Result;
  5198. }
  5199. ARMCC::CondCodes CondCode, CondCode2;
  5200. FPCCToARMCC(CC, CondCode, CondCode2);
  5201. SDValue ARMcc = DAG.getConstant(CondCode, dl, MVT::i32);
  5202. SDValue Cmp = getVFPCmp(LHS, RHS, DAG, dl);
  5203. SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
  5204. SDVTList VTList = DAG.getVTList(MVT::Other, MVT::Glue);
  5205. SDValue Ops[] = { Chain, Dest, ARMcc, CCR, Cmp };
  5206. SDValue Res = DAG.getNode(ARMISD::BRCOND, dl, VTList, Ops);
  5207. if (CondCode2 != ARMCC::AL) {
  5208. ARMcc = DAG.getConstant(CondCode2, dl, MVT::i32);
  5209. SDValue Ops[] = { Res, Dest, ARMcc, CCR, Res.getValue(1) };
  5210. Res = DAG.getNode(ARMISD::BRCOND, dl, VTList, Ops);
  5211. }
  5212. return Res;
  5213. }
  5214. SDValue ARMTargetLowering::LowerBR_JT(SDValue Op, SelectionDAG &DAG) const {
  5215. SDValue Chain = Op.getOperand(0);
  5216. SDValue Table = Op.getOperand(1);
  5217. SDValue Index = Op.getOperand(2);
  5218. SDLoc dl(Op);
  5219. EVT PTy = getPointerTy(DAG.getDataLayout());
  5220. JumpTableSDNode *JT = cast<JumpTableSDNode>(Table);
  5221. SDValue JTI = DAG.getTargetJumpTable(JT->getIndex(), PTy);
  5222. Table = DAG.getNode(ARMISD::WrapperJT, dl, MVT::i32, JTI);
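// Each jump table entry is 4 bytes wide, so scale the index by 4 to form a
// byte offset into the table.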
  5223. Index = DAG.getNode(ISD::MUL, dl, PTy, Index, DAG.getConstant(4, dl, PTy));
  5224. SDValue Addr = DAG.getNode(ISD::ADD, dl, PTy, Table, Index);
  5225. if (Subtarget->isThumb2() || (Subtarget->hasV8MBaselineOps() && Subtarget->isThumb())) {
5226. // Thumb2 and ARMv8-M use a two-level jump: the branch first jumps into the
5227. // jump table, which then does another jump to the final destination. This
5228. // also makes it easier to translate the table to TBB / TBH later (Thumb2 only).
  5229. // FIXME: This might not work if the function is extremely large.
  5230. return DAG.getNode(ARMISD::BR2_JT, dl, MVT::Other, Chain,
  5231. Addr, Op.getOperand(2), JTI);
  5232. }
  5233. if (isPositionIndependent() || Subtarget->isROPI()) {
  5234. Addr =
  5235. DAG.getLoad((EVT)MVT::i32, dl, Chain, Addr,
  5236. MachinePointerInfo::getJumpTable(DAG.getMachineFunction()));
  5237. Chain = Addr.getValue(1);
  5238. Addr = DAG.getNode(ISD::ADD, dl, PTy, Table, Addr);
  5239. return DAG.getNode(ARMISD::BR_JT, dl, MVT::Other, Chain, Addr, JTI);
  5240. } else {
  5241. Addr =
  5242. DAG.getLoad(PTy, dl, Chain, Addr,
  5243. MachinePointerInfo::getJumpTable(DAG.getMachineFunction()));
  5244. Chain = Addr.getValue(1);
  5245. return DAG.getNode(ARMISD::BR_JT, dl, MVT::Other, Chain, Addr, JTI);
  5246. }
  5247. }
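// LowerVectorFP_TO_INT - Conversions producing i32 lanes from f32 are legal
// as-is; narrower integer results are obtained by converting in a wider type
// (or directly from f16 when full fp16 is available) and truncating; anything
// else is scalarized.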
  5248. static SDValue LowerVectorFP_TO_INT(SDValue Op, SelectionDAG &DAG) {
  5249. EVT VT = Op.getValueType();
  5250. SDLoc dl(Op);
  5251. if (Op.getValueType().getVectorElementType() == MVT::i32) {
  5252. if (Op.getOperand(0).getValueType().getVectorElementType() == MVT::f32)
  5253. return Op;
  5254. return DAG.UnrollVectorOp(Op.getNode());
  5255. }
  5256. const bool HasFullFP16 = DAG.getSubtarget<ARMSubtarget>().hasFullFP16();
  5257. EVT NewTy;
  5258. const EVT OpTy = Op.getOperand(0).getValueType();
  5259. if (OpTy == MVT::v4f32)
  5260. NewTy = MVT::v4i32;
  5261. else if (OpTy == MVT::v4f16 && HasFullFP16)
  5262. NewTy = MVT::v4i16;
  5263. else if (OpTy == MVT::v8f16 && HasFullFP16)
  5264. NewTy = MVT::v8i16;
  5265. else
  5266. llvm_unreachable("Invalid type for custom lowering!");
  5267. if (VT != MVT::v4i16 && VT != MVT::v8i16)
  5268. return DAG.UnrollVectorOp(Op.getNode());
  5269. Op = DAG.getNode(Op.getOpcode(), dl, NewTy, Op.getOperand(0));
  5270. return DAG.getNode(ISD::TRUNCATE, dl, VT, Op);
  5271. }
  5272. SDValue ARMTargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const {
  5273. EVT VT = Op.getValueType();
  5274. if (VT.isVector())
  5275. return LowerVectorFP_TO_INT(Op, DAG);
  5276. bool IsStrict = Op->isStrictFPOpcode();
  5277. SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0);
  5278. if (isUnsupportedFloatingType(SrcVal.getValueType())) {
  5279. RTLIB::Libcall LC;
  5280. if (Op.getOpcode() == ISD::FP_TO_SINT ||
  5281. Op.getOpcode() == ISD::STRICT_FP_TO_SINT)
  5282. LC = RTLIB::getFPTOSINT(SrcVal.getValueType(),
  5283. Op.getValueType());
  5284. else
  5285. LC = RTLIB::getFPTOUINT(SrcVal.getValueType(),
  5286. Op.getValueType());
  5287. SDLoc Loc(Op);
  5288. MakeLibCallOptions CallOptions;
  5289. SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
  5290. SDValue Result;
  5291. std::tie(Result, Chain) = makeLibCall(DAG, LC, Op.getValueType(), SrcVal,
  5292. CallOptions, Loc, Chain);
  5293. return IsStrict ? DAG.getMergeValues({Result, Chain}, Loc) : Result;
  5294. }
  5295. // FIXME: Remove this when we have strict fp instruction selection patterns
  5296. if (IsStrict) {
  5297. SDLoc Loc(Op);
  5298. SDValue Result =
  5299. DAG.getNode(Op.getOpcode() == ISD::STRICT_FP_TO_SINT ? ISD::FP_TO_SINT
  5300. : ISD::FP_TO_UINT,
  5301. Loc, Op.getValueType(), SrcVal);
  5302. return DAG.getMergeValues({Result, Op.getOperand(0)}, Loc);
  5303. }
  5304. return Op;
  5305. }
  5306. static SDValue LowerFP_TO_INT_SAT(SDValue Op, SelectionDAG &DAG,
  5307. const ARMSubtarget *Subtarget) {
  5308. EVT VT = Op.getValueType();
  5309. EVT ToVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
  5310. EVT FromVT = Op.getOperand(0).getValueType();
  5311. if (VT == MVT::i32 && ToVT == MVT::i32 && FromVT == MVT::f32)
  5312. return Op;
  5313. if (VT == MVT::i32 && ToVT == MVT::i32 && FromVT == MVT::f64 &&
  5314. Subtarget->hasFP64())
  5315. return Op;
  5316. if (VT == MVT::i32 && ToVT == MVT::i32 && FromVT == MVT::f16 &&
  5317. Subtarget->hasFullFP16())
  5318. return Op;
  5319. if (VT == MVT::v4i32 && ToVT == MVT::i32 && FromVT == MVT::v4f32 &&
  5320. Subtarget->hasMVEFloatOps())
  5321. return Op;
  5322. if (VT == MVT::v8i16 && ToVT == MVT::i16 && FromVT == MVT::v8f16 &&
  5323. Subtarget->hasMVEFloatOps())
  5324. return Op;
  5325. if (FromVT != MVT::v4f32 && FromVT != MVT::v8f16)
  5326. return SDValue();
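// Otherwise saturate to the full lane width (legal per the checks above) and
// then clamp to the narrower requested range with UMIN or SMIN/SMAX; BW
// reserves a bit for the sign in the signed case.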
  5327. SDLoc DL(Op);
  5328. bool IsSigned = Op.getOpcode() == ISD::FP_TO_SINT_SAT;
  5329. unsigned BW = ToVT.getScalarSizeInBits() - IsSigned;
  5330. SDValue CVT = DAG.getNode(Op.getOpcode(), DL, VT, Op.getOperand(0),
  5331. DAG.getValueType(VT.getScalarType()));
  5332. SDValue Max = DAG.getNode(IsSigned ? ISD::SMIN : ISD::UMIN, DL, VT, CVT,
  5333. DAG.getConstant((1 << BW) - 1, DL, VT));
  5334. if (IsSigned)
  5335. Max = DAG.getNode(ISD::SMAX, DL, VT, Max,
  5336. DAG.getConstant(-(1 << BW), DL, VT));
  5337. return Max;
  5338. }
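// LowerVectorINT_TO_FP - i32 lane sources converting to f32 are legal as-is;
// v4i16/v8i16 sources are first sign- or zero-extended (matching the opcode)
// to the integer type corresponding to the destination and then converted;
// unsupported combinations are scalarized.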
  5339. static SDValue LowerVectorINT_TO_FP(SDValue Op, SelectionDAG &DAG) {
  5340. EVT VT = Op.getValueType();
  5341. SDLoc dl(Op);
  5342. if (Op.getOperand(0).getValueType().getVectorElementType() == MVT::i32) {
  5343. if (VT.getVectorElementType() == MVT::f32)
  5344. return Op;
  5345. return DAG.UnrollVectorOp(Op.getNode());
  5346. }
  5347. assert((Op.getOperand(0).getValueType() == MVT::v4i16 ||
  5348. Op.getOperand(0).getValueType() == MVT::v8i16) &&
  5349. "Invalid type for custom lowering!");
  5350. const bool HasFullFP16 = DAG.getSubtarget<ARMSubtarget>().hasFullFP16();
  5351. EVT DestVecType;
  5352. if (VT == MVT::v4f32)
  5353. DestVecType = MVT::v4i32;
  5354. else if (VT == MVT::v4f16 && HasFullFP16)
  5355. DestVecType = MVT::v4i16;
  5356. else if (VT == MVT::v8f16 && HasFullFP16)
  5357. DestVecType = MVT::v8i16;
  5358. else
  5359. return DAG.UnrollVectorOp(Op.getNode());
  5360. unsigned CastOpc;
  5361. unsigned Opc;
  5362. switch (Op.getOpcode()) {
  5363. default: llvm_unreachable("Invalid opcode!");
  5364. case ISD::SINT_TO_FP:
  5365. CastOpc = ISD::SIGN_EXTEND;
  5366. Opc = ISD::SINT_TO_FP;
  5367. break;
  5368. case ISD::UINT_TO_FP:
  5369. CastOpc = ISD::ZERO_EXTEND;
  5370. Opc = ISD::UINT_TO_FP;
  5371. break;
  5372. }
  5373. Op = DAG.getNode(CastOpc, dl, DestVecType, Op.getOperand(0));
  5374. return DAG.getNode(Opc, dl, VT, Op);
  5375. }
  5376. SDValue ARMTargetLowering::LowerINT_TO_FP(SDValue Op, SelectionDAG &DAG) const {
  5377. EVT VT = Op.getValueType();
  5378. if (VT.isVector())
  5379. return LowerVectorINT_TO_FP(Op, DAG);
  5380. if (isUnsupportedFloatingType(VT)) {
  5381. RTLIB::Libcall LC;
  5382. if (Op.getOpcode() == ISD::SINT_TO_FP)
  5383. LC = RTLIB::getSINTTOFP(Op.getOperand(0).getValueType(),
  5384. Op.getValueType());
  5385. else
  5386. LC = RTLIB::getUINTTOFP(Op.getOperand(0).getValueType(),
  5387. Op.getValueType());
  5388. MakeLibCallOptions CallOptions;
  5389. return makeLibCall(DAG, LC, Op.getValueType(), Op.getOperand(0),
  5390. CallOptions, SDLoc(Op)).first;
  5391. }
  5392. return Op;
  5393. }
  5394. SDValue ARMTargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const {
  5395. // Implement fcopysign with a fabs and a conditional fneg.
  5396. SDValue Tmp0 = Op.getOperand(0);
  5397. SDValue Tmp1 = Op.getOperand(1);
  5398. SDLoc dl(Op);
  5399. EVT VT = Op.getValueType();
  5400. EVT SrcVT = Tmp1.getValueType();
  5401. bool InGPR = Tmp0.getOpcode() == ISD::BITCAST ||
  5402. Tmp0.getOpcode() == ARMISD::VMOVDRR;
  5403. bool UseNEON = !InGPR && Subtarget->hasNEON();
  5404. if (UseNEON) {
  5405. // Use VBSL to copy the sign bit.
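// createVMOVModImm(0x6, 0x80) is the modified-immediate encoding of
// 0x80000000, i.e. only the sign bit set in each 32-bit lane.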
  5406. unsigned EncodedVal = ARM_AM::createVMOVModImm(0x6, 0x80);
  5407. SDValue Mask = DAG.getNode(ARMISD::VMOVIMM, dl, MVT::v2i32,
  5408. DAG.getTargetConstant(EncodedVal, dl, MVT::i32));
  5409. EVT OpVT = (VT == MVT::f32) ? MVT::v2i32 : MVT::v1i64;
  5410. if (VT == MVT::f64)
  5411. Mask = DAG.getNode(ARMISD::VSHLIMM, dl, OpVT,
  5412. DAG.getNode(ISD::BITCAST, dl, OpVT, Mask),
  5413. DAG.getConstant(32, dl, MVT::i32));
  5414. else /*if (VT == MVT::f32)*/
  5415. Tmp0 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f32, Tmp0);
  5416. if (SrcVT == MVT::f32) {
  5417. Tmp1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f32, Tmp1);
  5418. if (VT == MVT::f64)
  5419. Tmp1 = DAG.getNode(ARMISD::VSHLIMM, dl, OpVT,
  5420. DAG.getNode(ISD::BITCAST, dl, OpVT, Tmp1),
  5421. DAG.getConstant(32, dl, MVT::i32));
  5422. } else if (VT == MVT::f32)
  5423. Tmp1 = DAG.getNode(ARMISD::VSHRuIMM, dl, MVT::v1i64,
  5424. DAG.getNode(ISD::BITCAST, dl, MVT::v1i64, Tmp1),
  5425. DAG.getConstant(32, dl, MVT::i32));
  5426. Tmp0 = DAG.getNode(ISD::BITCAST, dl, OpVT, Tmp0);
  5427. Tmp1 = DAG.getNode(ISD::BITCAST, dl, OpVT, Tmp1);
  5428. SDValue AllOnes = DAG.getTargetConstant(ARM_AM::createVMOVModImm(0xe, 0xff),
  5429. dl, MVT::i32);
  5430. AllOnes = DAG.getNode(ARMISD::VMOVIMM, dl, MVT::v8i8, AllOnes);
  5431. SDValue MaskNot = DAG.getNode(ISD::XOR, dl, OpVT, Mask,
  5432. DAG.getNode(ISD::BITCAST, dl, OpVT, AllOnes));
  5433. SDValue Res = DAG.getNode(ISD::OR, dl, OpVT,
  5434. DAG.getNode(ISD::AND, dl, OpVT, Tmp1, Mask),
  5435. DAG.getNode(ISD::AND, dl, OpVT, Tmp0, MaskNot));
  5436. if (VT == MVT::f32) {
  5437. Res = DAG.getNode(ISD::BITCAST, dl, MVT::v2f32, Res);
  5438. Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, Res,
  5439. DAG.getConstant(0, dl, MVT::i32));
  5440. } else {
  5441. Res = DAG.getNode(ISD::BITCAST, dl, MVT::f64, Res);
  5442. }
  5443. return Res;
  5444. }
  5445. // Bitcast operand 1 to i32.
  5446. if (SrcVT == MVT::f64)
  5447. Tmp1 = DAG.getNode(ARMISD::VMOVRRD, dl, DAG.getVTList(MVT::i32, MVT::i32),
  5448. Tmp1).getValue(1);
  5449. Tmp1 = DAG.getNode(ISD::BITCAST, dl, MVT::i32, Tmp1);
  5450. // Or in the signbit with integer operations.
  5451. SDValue Mask1 = DAG.getConstant(0x80000000, dl, MVT::i32);
  5452. SDValue Mask2 = DAG.getConstant(0x7fffffff, dl, MVT::i32);
  5453. Tmp1 = DAG.getNode(ISD::AND, dl, MVT::i32, Tmp1, Mask1);
  5454. if (VT == MVT::f32) {
  5455. Tmp0 = DAG.getNode(ISD::AND, dl, MVT::i32,
  5456. DAG.getNode(ISD::BITCAST, dl, MVT::i32, Tmp0), Mask2);
  5457. return DAG.getNode(ISD::BITCAST, dl, MVT::f32,
  5458. DAG.getNode(ISD::OR, dl, MVT::i32, Tmp0, Tmp1));
  5459. }
5460. // f64: OR the high part with the sign bit, then combine the two parts.
  5461. Tmp0 = DAG.getNode(ARMISD::VMOVRRD, dl, DAG.getVTList(MVT::i32, MVT::i32),
  5462. Tmp0);
  5463. SDValue Lo = Tmp0.getValue(0);
  5464. SDValue Hi = DAG.getNode(ISD::AND, dl, MVT::i32, Tmp0.getValue(1), Mask2);
  5465. Hi = DAG.getNode(ISD::OR, dl, MVT::i32, Hi, Tmp1);
  5466. return DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi);
  5467. }
  5468. SDValue ARMTargetLowering::LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const{
  5469. MachineFunction &MF = DAG.getMachineFunction();
  5470. MachineFrameInfo &MFI = MF.getFrameInfo();
  5471. MFI.setReturnAddressIsTaken(true);
  5472. if (verifyReturnAddressArgumentIsConstant(Op, DAG))
  5473. return SDValue();
  5474. EVT VT = Op.getValueType();
  5475. SDLoc dl(Op);
  5476. unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
  5477. if (Depth) {
  5478. SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
  5479. SDValue Offset = DAG.getConstant(4, dl, MVT::i32);
  5480. return DAG.getLoad(VT, dl, DAG.getEntryNode(),
  5481. DAG.getNode(ISD::ADD, dl, VT, FrameAddr, Offset),
  5482. MachinePointerInfo());
  5483. }
  5484. // Return LR, which contains the return address. Mark it an implicit live-in.
  5485. Register Reg = MF.addLiveIn(ARM::LR, getRegClassFor(MVT::i32));
  5486. return DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg, VT);
  5487. }
  5488. SDValue ARMTargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
  5489. const ARMBaseRegisterInfo &ARI =
  5490. *static_cast<const ARMBaseRegisterInfo*>(RegInfo);
  5491. MachineFunction &MF = DAG.getMachineFunction();
  5492. MachineFrameInfo &MFI = MF.getFrameInfo();
  5493. MFI.setFrameAddressIsTaken(true);
  5494. EVT VT = Op.getValueType();
  5495. SDLoc dl(Op); // FIXME probably not meaningful
  5496. unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
  5497. Register FrameReg = ARI.getFrameRegister(MF);
  5498. SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT);
  5499. while (Depth--)
  5500. FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr,
  5501. MachinePointerInfo());
  5502. return FrameAddr;
  5503. }
  5504. // FIXME? Maybe this could be a TableGen attribute on some registers and
  5505. // this table could be generated automatically from RegInfo.
  5506. Register ARMTargetLowering::getRegisterByName(const char* RegName, LLT VT,
  5507. const MachineFunction &MF) const {
  5508. Register Reg = StringSwitch<unsigned>(RegName)
  5509. .Case("sp", ARM::SP)
  5510. .Default(0);
  5511. if (Reg)
  5512. return Reg;
  5513. report_fatal_error(Twine("Invalid register name \""
  5514. + StringRef(RegName) + "\"."));
  5515. }
5516. // The result is a 64-bit value, so split it into two 32-bit values and
5517. // return them as a pair of values.
  5518. static void ExpandREAD_REGISTER(SDNode *N, SmallVectorImpl<SDValue> &Results,
  5519. SelectionDAG &DAG) {
  5520. SDLoc DL(N);
  5521. // This function is only supposed to be called for i64 type destination.
  5522. assert(N->getValueType(0) == MVT::i64
  5523. && "ExpandREAD_REGISTER called for non-i64 type result.");
  5524. SDValue Read = DAG.getNode(ISD::READ_REGISTER, DL,
  5525. DAG.getVTList(MVT::i32, MVT::i32, MVT::Other),
  5526. N->getOperand(0),
  5527. N->getOperand(1));
  5528. Results.push_back(DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Read.getValue(0),
  5529. Read.getValue(1)));
  5530. Results.push_back(Read.getOperand(0));
  5531. }
  5532. /// \p BC is a bitcast that is about to be turned into a VMOVDRR.
  5533. /// When \p DstVT, the destination type of \p BC, is on the vector
  5534. /// register bank and the source of bitcast, \p Op, operates on the same bank,
  5535. /// it might be possible to combine them, such that everything stays on the
  5536. /// vector register bank.
5537. /// \return The node that would replace \p BC, if the combine
5538. /// is possible.
  5539. static SDValue CombineVMOVDRRCandidateWithVecOp(const SDNode *BC,
  5540. SelectionDAG &DAG) {
  5541. SDValue Op = BC->getOperand(0);
  5542. EVT DstVT = BC->getValueType(0);
  5543. // The only vector instruction that can produce a scalar (remember,
  5544. // since the bitcast was about to be turned into VMOVDRR, the source
  5545. // type is i64) from a vector is EXTRACT_VECTOR_ELT.
  5546. // Moreover, we can do this combine only if there is one use.
  5547. // Finally, if the destination type is not a vector, there is not
5548. // much point in forcing everything onto the vector bank.
  5549. if (!DstVT.isVector() || Op.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
  5550. !Op.hasOneUse())
  5551. return SDValue();
  5552. // If the index is not constant, we will introduce an additional
  5553. // multiply that will stick.
  5554. // Give up in that case.
  5555. ConstantSDNode *Index = dyn_cast<ConstantSDNode>(Op.getOperand(1));
  5556. if (!Index)
  5557. return SDValue();
  5558. unsigned DstNumElt = DstVT.getVectorNumElements();
  5559. // Compute the new index.
  5560. const APInt &APIntIndex = Index->getAPIntValue();
  5561. APInt NewIndex(APIntIndex.getBitWidth(), DstNumElt);
  5562. NewIndex *= APIntIndex;
  5563. // Check if the new constant index fits into i32.
  5564. if (NewIndex.getBitWidth() > 32)
  5565. return SDValue();
  5566. // vMTy bitcast(i64 extractelt vNi64 src, i32 index) ->
  5567. // vMTy extractsubvector vNxMTy (bitcast vNi64 src), i32 index*M)
  5568. SDLoc dl(Op);
  5569. SDValue ExtractSrc = Op.getOperand(0);
  5570. EVT VecVT = EVT::getVectorVT(
  5571. *DAG.getContext(), DstVT.getScalarType(),
  5572. ExtractSrc.getValueType().getVectorNumElements() * DstNumElt);
  5573. SDValue BitCast = DAG.getNode(ISD::BITCAST, dl, VecVT, ExtractSrc);
  5574. return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DstVT, BitCast,
  5575. DAG.getConstant(NewIndex.getZExtValue(), dl, MVT::i32));
  5576. }
  5577. /// ExpandBITCAST - If the target supports VFP, this function is called to
  5578. /// expand a bit convert where either the source or destination type is i64 to
  5579. /// use a VMOVDRR or VMOVRRD node. This should not be done when the non-i64
  5580. /// operand type is illegal (e.g., v2f32 for a target that doesn't support
  5581. /// vectors), since the legalizer won't know what to do with that.
  5582. SDValue ARMTargetLowering::ExpandBITCAST(SDNode *N, SelectionDAG &DAG,
  5583. const ARMSubtarget *Subtarget) const {
  5584. const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  5585. SDLoc dl(N);
  5586. SDValue Op = N->getOperand(0);
  5587. // This function is only supposed to be called for i16 and i64 types, either
  5588. // as the source or destination of the bit convert.
  5589. EVT SrcVT = Op.getValueType();
  5590. EVT DstVT = N->getValueType(0);
  5591. if ((SrcVT == MVT::i16 || SrcVT == MVT::i32) &&
  5592. (DstVT == MVT::f16 || DstVT == MVT::bf16))
  5593. return MoveToHPR(SDLoc(N), DAG, MVT::i32, DstVT.getSimpleVT(),
  5594. DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), MVT::i32, Op));
  5595. if ((DstVT == MVT::i16 || DstVT == MVT::i32) &&
  5596. (SrcVT == MVT::f16 || SrcVT == MVT::bf16))
  5597. return DAG.getNode(
  5598. ISD::TRUNCATE, SDLoc(N), DstVT,
  5599. MoveFromHPR(SDLoc(N), DAG, MVT::i32, SrcVT.getSimpleVT(), Op));
  5600. if (!(SrcVT == MVT::i64 || DstVT == MVT::i64))
  5601. return SDValue();
  5602. // Turn i64->f64 into VMOVDRR.
  5603. if (SrcVT == MVT::i64 && TLI.isTypeLegal(DstVT)) {
  5604. // Do not force values to GPRs (this is what VMOVDRR does for the inputs)
  5605. // if we can combine the bitcast with its source.
  5606. if (SDValue Val = CombineVMOVDRRCandidateWithVecOp(N, DAG))
  5607. return Val;
  5608. SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Op,
  5609. DAG.getConstant(0, dl, MVT::i32));
  5610. SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Op,
  5611. DAG.getConstant(1, dl, MVT::i32));
  5612. return DAG.getNode(ISD::BITCAST, dl, DstVT,
  5613. DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi));
  5614. }
  5615. // Turn f64->i64 into VMOVRRD.
  5616. if (DstVT == MVT::i64 && TLI.isTypeLegal(SrcVT)) {
  5617. SDValue Cvt;
  5618. if (DAG.getDataLayout().isBigEndian() && SrcVT.isVector() &&
  5619. SrcVT.getVectorNumElements() > 1)
  5620. Cvt = DAG.getNode(ARMISD::VMOVRRD, dl,
  5621. DAG.getVTList(MVT::i32, MVT::i32),
  5622. DAG.getNode(ARMISD::VREV64, dl, SrcVT, Op));
  5623. else
  5624. Cvt = DAG.getNode(ARMISD::VMOVRRD, dl,
  5625. DAG.getVTList(MVT::i32, MVT::i32), Op);
  5626. // Merge the pieces into a single i64 value.
  5627. return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Cvt, Cvt.getValue(1));
  5628. }
  5629. return SDValue();
  5630. }
  5631. /// getZeroVector - Returns a vector of specified type with all zero elements.
  5632. /// Zero vectors are used to represent vector negation and in those cases
  5633. /// will be implemented with the NEON VNEG instruction. However, VNEG does
  5634. /// not support i64 elements, so sometimes the zero vectors will need to be
  5635. /// explicitly constructed. Regardless, use a canonical VMOV to create the
  5636. /// zero vector.
  5637. static SDValue getZeroVector(EVT VT, SelectionDAG &DAG, const SDLoc &dl) {
  5638. assert(VT.isVector() && "Expected a vector type");
  5639. // The canonical modified immediate encoding of a zero vector is....0!
  5640. SDValue EncodedVal = DAG.getTargetConstant(0, dl, MVT::i32);
  5641. EVT VmovVT = VT.is128BitVector() ? MVT::v4i32 : MVT::v2i32;
  5642. SDValue Vmov = DAG.getNode(ARMISD::VMOVIMM, dl, VmovVT, EncodedVal);
  5643. return DAG.getNode(ISD::BITCAST, dl, VT, Vmov);
  5644. }
  5645. /// LowerShiftRightParts - Lower SRA_PARTS, which returns two
5646. /// i32 values and takes a 2 x i32 value to shift plus a shift amount.
  5647. SDValue ARMTargetLowering::LowerShiftRightParts(SDValue Op,
  5648. SelectionDAG &DAG) const {
  5649. assert(Op.getNumOperands() == 3 && "Not a double-shift!");
  5650. EVT VT = Op.getValueType();
  5651. unsigned VTBits = VT.getSizeInBits();
  5652. SDLoc dl(Op);
  5653. SDValue ShOpLo = Op.getOperand(0);
  5654. SDValue ShOpHi = Op.getOperand(1);
  5655. SDValue ShAmt = Op.getOperand(2);
  5656. SDValue ARMcc;
  5657. SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
  5658. unsigned Opc = (Op.getOpcode() == ISD::SRA_PARTS) ? ISD::SRA : ISD::SRL;
  5659. assert(Op.getOpcode() == ISD::SRA_PARTS || Op.getOpcode() == ISD::SRL_PARTS);
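// Compute both possible results: the 'small shift' case (amount < 32), which
// combines bits from the two halves, and the 'big shift' case (amount >= 32),
// which shifts the high half alone; then select between them with a CMOV on
// the sign of (amount - 32).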
  5660. SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32,
  5661. DAG.getConstant(VTBits, dl, MVT::i32), ShAmt);
  5662. SDValue Tmp1 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, ShAmt);
  5663. SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, ShAmt,
  5664. DAG.getConstant(VTBits, dl, MVT::i32));
  5665. SDValue Tmp2 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, RevShAmt);
  5666. SDValue LoSmallShift = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2);
  5667. SDValue LoBigShift = DAG.getNode(Opc, dl, VT, ShOpHi, ExtraShAmt);
  5668. SDValue CmpLo = getARMCmp(ExtraShAmt, DAG.getConstant(0, dl, MVT::i32),
  5669. ISD::SETGE, ARMcc, DAG, dl);
  5670. SDValue Lo = DAG.getNode(ARMISD::CMOV, dl, VT, LoSmallShift, LoBigShift,
  5671. ARMcc, CCR, CmpLo);
  5672. SDValue HiSmallShift = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt);
  5673. SDValue HiBigShift = Opc == ISD::SRA
  5674. ? DAG.getNode(Opc, dl, VT, ShOpHi,
  5675. DAG.getConstant(VTBits - 1, dl, VT))
  5676. : DAG.getConstant(0, dl, VT);
  5677. SDValue CmpHi = getARMCmp(ExtraShAmt, DAG.getConstant(0, dl, MVT::i32),
  5678. ISD::SETGE, ARMcc, DAG, dl);
  5679. SDValue Hi = DAG.getNode(ARMISD::CMOV, dl, VT, HiSmallShift, HiBigShift,
  5680. ARMcc, CCR, CmpHi);
  5681. SDValue Ops[2] = { Lo, Hi };
  5682. return DAG.getMergeValues(Ops, dl);
  5683. }
  5684. /// LowerShiftLeftParts - Lower SHL_PARTS, which returns two
5685. /// i32 values and takes a 2 x i32 value to shift plus a shift amount.
  5686. SDValue ARMTargetLowering::LowerShiftLeftParts(SDValue Op,
  5687. SelectionDAG &DAG) const {
  5688. assert(Op.getNumOperands() == 3 && "Not a double-shift!");
  5689. EVT VT = Op.getValueType();
  5690. unsigned VTBits = VT.getSizeInBits();
  5691. SDLoc dl(Op);
  5692. SDValue ShOpLo = Op.getOperand(0);
  5693. SDValue ShOpHi = Op.getOperand(1);
  5694. SDValue ShAmt = Op.getOperand(2);
  5695. SDValue ARMcc;
  5696. SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
  5697. assert(Op.getOpcode() == ISD::SHL_PARTS);
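// Mirror image of LowerShiftRightParts: build results for shift amounts below
// and at-or-above 32, then pick between them with a CMOV on (amount - 32).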
  5698. SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32,
  5699. DAG.getConstant(VTBits, dl, MVT::i32), ShAmt);
  5700. SDValue Tmp1 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, RevShAmt);
  5701. SDValue Tmp2 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, ShAmt);
  5702. SDValue HiSmallShift = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2);
  5703. SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, ShAmt,
  5704. DAG.getConstant(VTBits, dl, MVT::i32));
  5705. SDValue HiBigShift = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ExtraShAmt);
  5706. SDValue CmpHi = getARMCmp(ExtraShAmt, DAG.getConstant(0, dl, MVT::i32),
  5707. ISD::SETGE, ARMcc, DAG, dl);
  5708. SDValue Hi = DAG.getNode(ARMISD::CMOV, dl, VT, HiSmallShift, HiBigShift,
  5709. ARMcc, CCR, CmpHi);
  5710. SDValue CmpLo = getARMCmp(ExtraShAmt, DAG.getConstant(0, dl, MVT::i32),
  5711. ISD::SETGE, ARMcc, DAG, dl);
  5712. SDValue LoSmallShift = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt);
  5713. SDValue Lo = DAG.getNode(ARMISD::CMOV, dl, VT, LoSmallShift,
  5714. DAG.getConstant(0, dl, VT), ARMcc, CCR, CmpLo);
  5715. SDValue Ops[2] = { Lo, Hi };
  5716. return DAG.getMergeValues(Ops, dl);
  5717. }
  5718. SDValue ARMTargetLowering::LowerGET_ROUNDING(SDValue Op,
  5719. SelectionDAG &DAG) const {
  5720. // The rounding mode is in bits 23:22 of the FPSCR.
  5721. // The ARM rounding mode value to FLT_ROUNDS mapping is 0->1, 1->2, 2->3, 3->0
5722. // The formula we use to implement this is (((FPSCR + (1 << 22)) >> 22) & 3)
  5723. // so that the shift + and get folded into a bitfield extract.
  5724. SDLoc dl(Op);
  5725. SDValue Chain = Op.getOperand(0);
  5726. SDValue Ops[] = {Chain,
  5727. DAG.getConstant(Intrinsic::arm_get_fpscr, dl, MVT::i32)};
  5728. SDValue FPSCR =
  5729. DAG.getNode(ISD::INTRINSIC_W_CHAIN, dl, {MVT::i32, MVT::Other}, Ops);
  5730. Chain = FPSCR.getValue(1);
  5731. SDValue FltRounds = DAG.getNode(ISD::ADD, dl, MVT::i32, FPSCR,
  5732. DAG.getConstant(1U << 22, dl, MVT::i32));
  5733. SDValue RMODE = DAG.getNode(ISD::SRL, dl, MVT::i32, FltRounds,
  5734. DAG.getConstant(22, dl, MVT::i32));
  5735. SDValue And = DAG.getNode(ISD::AND, dl, MVT::i32, RMODE,
  5736. DAG.getConstant(3, dl, MVT::i32));
  5737. return DAG.getMergeValues({And, Chain}, dl);
  5738. }
  5739. SDValue ARMTargetLowering::LowerSET_ROUNDING(SDValue Op,
  5740. SelectionDAG &DAG) const {
  5741. SDLoc DL(Op);
  5742. SDValue Chain = Op->getOperand(0);
  5743. SDValue RMValue = Op->getOperand(1);
  5744. // The rounding mode is in bits 23:22 of the FPSCR.
  5745. // The llvm.set.rounding argument value to ARM rounding mode value mapping
  5746. // is 0->3, 1->0, 2->1, 3->2. The formula we use to implement this is
5747. // (((arg - 1) & 3) << 22).
  5748. //
  5749. // It is expected that the argument of llvm.set.rounding is within the
5750. // segment [0, 3], so NearestTiesToAway (4) is not handled here. It is the
5751. // responsibility of the code that generates llvm.set.rounding to ensure this
  5752. // condition.
  5753. // Calculate new value of FPSCR[23:22].
  5754. RMValue = DAG.getNode(ISD::SUB, DL, MVT::i32, RMValue,
  5755. DAG.getConstant(1, DL, MVT::i32));
  5756. RMValue = DAG.getNode(ISD::AND, DL, MVT::i32, RMValue,
  5757. DAG.getConstant(0x3, DL, MVT::i32));
  5758. RMValue = DAG.getNode(ISD::SHL, DL, MVT::i32, RMValue,
  5759. DAG.getConstant(ARM::RoundingBitsPos, DL, MVT::i32));
  5760. // Get current value of FPSCR.
  5761. SDValue Ops[] = {Chain,
  5762. DAG.getConstant(Intrinsic::arm_get_fpscr, DL, MVT::i32)};
  5763. SDValue FPSCR =
  5764. DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL, {MVT::i32, MVT::Other}, Ops);
  5765. Chain = FPSCR.getValue(1);
  5766. FPSCR = FPSCR.getValue(0);
  5767. // Put new rounding mode into FPSCR[23:22].
  5768. const unsigned RMMask = ~(ARM::Rounding::rmMask << ARM::RoundingBitsPos);
  5769. FPSCR = DAG.getNode(ISD::AND, DL, MVT::i32, FPSCR,
  5770. DAG.getConstant(RMMask, DL, MVT::i32));
  5771. FPSCR = DAG.getNode(ISD::OR, DL, MVT::i32, FPSCR, RMValue);
  5772. SDValue Ops2[] = {
  5773. Chain, DAG.getConstant(Intrinsic::arm_set_fpscr, DL, MVT::i32), FPSCR};
  5774. return DAG.getNode(ISD::INTRINSIC_VOID, DL, MVT::Other, Ops2);
  5775. }
  5776. static SDValue LowerCTTZ(SDNode *N, SelectionDAG &DAG,
  5777. const ARMSubtarget *ST) {
  5778. SDLoc dl(N);
  5779. EVT VT = N->getValueType(0);
  5780. if (VT.isVector() && ST->hasNEON()) {
  5781. // Compute the least significant set bit: LSB = X & -X
  5782. SDValue X = N->getOperand(0);
  5783. SDValue NX = DAG.getNode(ISD::SUB, dl, VT, getZeroVector(VT, DAG, dl), X);
  5784. SDValue LSB = DAG.getNode(ISD::AND, dl, VT, X, NX);
  5785. EVT ElemTy = VT.getVectorElementType();
  5786. if (ElemTy == MVT::i8) {
  5787. // Compute with: cttz(x) = ctpop(lsb - 1)
  5788. SDValue One = DAG.getNode(ARMISD::VMOVIMM, dl, VT,
  5789. DAG.getTargetConstant(1, dl, ElemTy));
  5790. SDValue Bits = DAG.getNode(ISD::SUB, dl, VT, LSB, One);
  5791. return DAG.getNode(ISD::CTPOP, dl, VT, Bits);
  5792. }
  5793. if ((ElemTy == MVT::i16 || ElemTy == MVT::i32) &&
  5794. (N->getOpcode() == ISD::CTTZ_ZERO_UNDEF)) {
  5795. // Compute with: cttz(x) = (width - 1) - ctlz(lsb), if x != 0
  5796. unsigned NumBits = ElemTy.getSizeInBits();
  5797. SDValue WidthMinus1 =
  5798. DAG.getNode(ARMISD::VMOVIMM, dl, VT,
  5799. DAG.getTargetConstant(NumBits - 1, dl, ElemTy));
  5800. SDValue CTLZ = DAG.getNode(ISD::CTLZ, dl, VT, LSB);
  5801. return DAG.getNode(ISD::SUB, dl, VT, WidthMinus1, CTLZ);
  5802. }
  5803. // Compute with: cttz(x) = ctpop(lsb - 1)
  5804. // Compute LSB - 1.
  5805. SDValue Bits;
  5806. if (ElemTy == MVT::i64) {
  5807. // Load constant 0xffff'ffff'ffff'ffff to register.
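// (0x1eff is the VMOV modified-immediate encoding of all-ones for 64-bit
// lanes; adding all-ones is the same as subtracting 1, giving LSB - 1.)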
  5808. SDValue FF = DAG.getNode(ARMISD::VMOVIMM, dl, VT,
  5809. DAG.getTargetConstant(0x1eff, dl, MVT::i32));
  5810. Bits = DAG.getNode(ISD::ADD, dl, VT, LSB, FF);
  5811. } else {
  5812. SDValue One = DAG.getNode(ARMISD::VMOVIMM, dl, VT,
  5813. DAG.getTargetConstant(1, dl, ElemTy));
  5814. Bits = DAG.getNode(ISD::SUB, dl, VT, LSB, One);
  5815. }
  5816. return DAG.getNode(ISD::CTPOP, dl, VT, Bits);
  5817. }
  5818. if (!ST->hasV6T2Ops())
  5819. return SDValue();
  5820. SDValue rbit = DAG.getNode(ISD::BITREVERSE, dl, VT, N->getOperand(0));
  5821. return DAG.getNode(ISD::CTLZ, dl, VT, rbit);
  5822. }
  5823. static SDValue LowerCTPOP(SDNode *N, SelectionDAG &DAG,
  5824. const ARMSubtarget *ST) {
  5825. EVT VT = N->getValueType(0);
  5826. SDLoc DL(N);
  5827. assert(ST->hasNEON() && "Custom ctpop lowering requires NEON.");
  5828. assert((VT == MVT::v1i64 || VT == MVT::v2i64 || VT == MVT::v2i32 ||
  5829. VT == MVT::v4i32 || VT == MVT::v4i16 || VT == MVT::v8i16) &&
  5830. "Unexpected type for custom ctpop lowering");
  5831. const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  5832. EVT VT8Bit = VT.is64BitVector() ? MVT::v8i8 : MVT::v16i8;
  5833. SDValue Res = DAG.getBitcast(VT8Bit, N->getOperand(0));
  5834. Res = DAG.getNode(ISD::CTPOP, DL, VT8Bit, Res);
  5835. // Widen v8i8/v16i8 CTPOP result to VT by repeatedly widening pairwise adds.
  5836. unsigned EltSize = 8;
  5837. unsigned NumElts = VT.is64BitVector() ? 8 : 16;
  5838. while (EltSize != VT.getScalarSizeInBits()) {
  5839. SmallVector<SDValue, 8> Ops;
  5840. Ops.push_back(DAG.getConstant(Intrinsic::arm_neon_vpaddlu, DL,
  5841. TLI.getPointerTy(DAG.getDataLayout())));
  5842. Ops.push_back(Res);
  5843. EltSize *= 2;
  5844. NumElts /= 2;
  5845. MVT WidenVT = MVT::getVectorVT(MVT::getIntegerVT(EltSize), NumElts);
  5846. Res = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, WidenVT, Ops);
  5847. }
  5848. return Res;
  5849. }
5850. /// getVShiftImm - Check if this is a valid build_vector for the immediate
  5851. /// operand of a vector shift operation, where all the elements of the
  5852. /// build_vector must have the same constant integer value.
  5853. static bool getVShiftImm(SDValue Op, unsigned ElementBits, int64_t &Cnt) {
  5854. // Ignore bit_converts.
  5855. while (Op.getOpcode() == ISD::BITCAST)
  5856. Op = Op.getOperand(0);
  5857. BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(Op.getNode());
  5858. APInt SplatBits, SplatUndef;
  5859. unsigned SplatBitSize;
  5860. bool HasAnyUndefs;
  5861. if (!BVN ||
  5862. !BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs,
  5863. ElementBits) ||
  5864. SplatBitSize > ElementBits)
  5865. return false;
  5866. Cnt = SplatBits.getSExtValue();
  5867. return true;
  5868. }
  5869. /// isVShiftLImm - Check if this is a valid build_vector for the immediate
  5870. /// operand of a vector shift left operation. That value must be in the range:
  5871. /// 0 <= Value < ElementBits for a left shift; or
  5872. /// 0 <= Value <= ElementBits for a long left shift.
  5873. static bool isVShiftLImm(SDValue Op, EVT VT, bool isLong, int64_t &Cnt) {
  5874. assert(VT.isVector() && "vector shift count is not a vector type");
  5875. int64_t ElementBits = VT.getScalarSizeInBits();
  5876. if (!getVShiftImm(Op, ElementBits, Cnt))
  5877. return false;
  5878. return (Cnt >= 0 && (isLong ? Cnt - 1 : Cnt) < ElementBits);
  5879. }
  5880. /// isVShiftRImm - Check if this is a valid build_vector for the immediate
  5881. /// operand of a vector shift right operation. For a shift opcode, the value
5882. /// is positive, but for an intrinsic the count must be negative. The
  5883. /// absolute value must be in the range:
  5884. /// 1 <= |Value| <= ElementBits for a right shift; or
  5885. /// 1 <= |Value| <= ElementBits/2 for a narrow right shift.
  5886. static bool isVShiftRImm(SDValue Op, EVT VT, bool isNarrow, bool isIntrinsic,
  5887. int64_t &Cnt) {
  5888. assert(VT.isVector() && "vector shift count is not a vector type");
  5889. int64_t ElementBits = VT.getScalarSizeInBits();
  5890. if (!getVShiftImm(Op, ElementBits, Cnt))
  5891. return false;
  5892. if (!isIntrinsic)
  5893. return (Cnt >= 1 && Cnt <= (isNarrow ? ElementBits / 2 : ElementBits));
  5894. if (Cnt >= -(isNarrow ? ElementBits / 2 : ElementBits) && Cnt <= -1) {
  5895. Cnt = -Cnt;
  5896. return true;
  5897. }
  5898. return false;
  5899. }
  5900. static SDValue LowerShift(SDNode *N, SelectionDAG &DAG,
  5901. const ARMSubtarget *ST) {
  5902. EVT VT = N->getValueType(0);
  5903. SDLoc dl(N);
  5904. int64_t Cnt;
  5905. if (!VT.isVector())
  5906. return SDValue();
  5907. // We essentially have two forms here. Shift by an immediate and shift by a
5908. // vector register (there is also a shift by a GPR, but that is just handled
  5909. // with a tablegen pattern). We cannot easily match shift by an immediate in
  5910. // tablegen so we do that here and generate a VSHLIMM/VSHRsIMM/VSHRuIMM.
  5911. // For shifting by a vector, we don't have VSHR, only VSHL (which can be
  5912. // signed or unsigned, and a negative shift indicates a shift right).
  5913. if (N->getOpcode() == ISD::SHL) {
  5914. if (isVShiftLImm(N->getOperand(1), VT, false, Cnt))
  5915. return DAG.getNode(ARMISD::VSHLIMM, dl, VT, N->getOperand(0),
  5916. DAG.getConstant(Cnt, dl, MVT::i32));
  5917. return DAG.getNode(ARMISD::VSHLu, dl, VT, N->getOperand(0),
  5918. N->getOperand(1));
  5919. }
  5920. assert((N->getOpcode() == ISD::SRA || N->getOpcode() == ISD::SRL) &&
  5921. "unexpected vector shift opcode");
  5922. if (isVShiftRImm(N->getOperand(1), VT, false, false, Cnt)) {
  5923. unsigned VShiftOpc =
  5924. (N->getOpcode() == ISD::SRA ? ARMISD::VSHRsIMM : ARMISD::VSHRuIMM);
  5925. return DAG.getNode(VShiftOpc, dl, VT, N->getOperand(0),
  5926. DAG.getConstant(Cnt, dl, MVT::i32));
  5927. }
  5928. // Other right shifts we don't have operations for (we use a shift left by a
  5929. // negative number).
  5930. EVT ShiftVT = N->getOperand(1).getValueType();
  5931. SDValue NegatedCount = DAG.getNode(
  5932. ISD::SUB, dl, ShiftVT, getZeroVector(ShiftVT, DAG, dl), N->getOperand(1));
  5933. unsigned VShiftOpc =
  5934. (N->getOpcode() == ISD::SRA ? ARMISD::VSHLs : ARMISD::VSHLu);
  5935. return DAG.getNode(VShiftOpc, dl, VT, N->getOperand(0), NegatedCount);
  5936. }
  5937. static SDValue Expand64BitShift(SDNode *N, SelectionDAG &DAG,
  5938. const ARMSubtarget *ST) {
  5939. EVT VT = N->getValueType(0);
  5940. SDLoc dl(N);
  5941. // We can get here for a node like i32 = ISD::SHL i32, i64
  5942. if (VT != MVT::i64)
  5943. return SDValue();
  5944. assert((N->getOpcode() == ISD::SRL || N->getOpcode() == ISD::SRA ||
  5945. N->getOpcode() == ISD::SHL) &&
  5946. "Unknown shift to lower!");
  5947. unsigned ShOpc = N->getOpcode();
  5948. if (ST->hasMVEIntegerOps()) {
  5949. SDValue ShAmt = N->getOperand(1);
  5950. unsigned ShPartsOpc = ARMISD::LSLL;
  5951. ConstantSDNode *Con = dyn_cast<ConstantSDNode>(ShAmt);
5952. // If the shift amount is wider than 64 bits, or is a constant that is zero
5953. // or at least 32, then use the default lowering
  5954. if (ShAmt->getValueType(0).getSizeInBits() > 64 ||
  5955. (Con && (Con->getZExtValue() == 0 || Con->getZExtValue() >= 32)))
  5956. return SDValue();
  5957. // Extract the lower 32 bits of the shift amount if it's not an i32
  5958. if (ShAmt->getValueType(0) != MVT::i32)
  5959. ShAmt = DAG.getZExtOrTrunc(ShAmt, dl, MVT::i32);
  5960. if (ShOpc == ISD::SRL) {
  5961. if (!Con)
  5962. // There is no t2LSRLr instruction so negate and perform an lsll if the
  5963. // shift amount is in a register, emulating a right shift.
  5964. ShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32,
  5965. DAG.getConstant(0, dl, MVT::i32), ShAmt);
  5966. else
  5967. // Else generate an lsrl on the immediate shift amount
  5968. ShPartsOpc = ARMISD::LSRL;
  5969. } else if (ShOpc == ISD::SRA)
  5970. ShPartsOpc = ARMISD::ASRL;
  5971. // Lower 32 bits of the destination/source
  5972. SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(0),
  5973. DAG.getConstant(0, dl, MVT::i32));
  5974. // Upper 32 bits of the destination/source
  5975. SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(0),
  5976. DAG.getConstant(1, dl, MVT::i32));
  5977. // Generate the shift operation as computed above
  5978. Lo = DAG.getNode(ShPartsOpc, dl, DAG.getVTList(MVT::i32, MVT::i32), Lo, Hi,
  5979. ShAmt);
  5980. // The upper 32 bits come from the second return value of lsll
  5981. Hi = SDValue(Lo.getNode(), 1);
  5982. return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lo, Hi);
  5983. }
  5984. // We only lower SRA, SRL of 1 here, all others use generic lowering.
  5985. if (!isOneConstant(N->getOperand(1)) || N->getOpcode() == ISD::SHL)
  5986. return SDValue();
  5987. // If we are in thumb mode, we don't have RRX.
  5988. if (ST->isThumb1Only())
  5989. return SDValue();
  5990. // Okay, we have a 64-bit SRA or SRL of 1. Lower this to an RRX expr.
  5991. SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(0),
  5992. DAG.getConstant(0, dl, MVT::i32));
  5993. SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(0),
  5994. DAG.getConstant(1, dl, MVT::i32));
  5995. // First, build a SRA_FLAG/SRL_FLAG op, which shifts the top part by one and
  5996. // captures the result into a carry flag.
  5997. unsigned Opc = N->getOpcode() == ISD::SRL ? ARMISD::SRL_FLAG:ARMISD::SRA_FLAG;
  5998. Hi = DAG.getNode(Opc, dl, DAG.getVTList(MVT::i32, MVT::Glue), Hi);
  5999. // The low part is an ARMISD::RRX operand, which shifts the carry in.
  6000. Lo = DAG.getNode(ARMISD::RRX, dl, MVT::i32, Lo, Hi.getValue(1));
  6001. // Merge the pieces into a single i64 value.
  6002. return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lo, Hi);
  6003. }
  6004. static SDValue LowerVSETCC(SDValue Op, SelectionDAG &DAG,
  6005. const ARMSubtarget *ST) {
  6006. bool Invert = false;
  6007. bool Swap = false;
  6008. unsigned Opc = ARMCC::AL;
  6009. SDValue Op0 = Op.getOperand(0);
  6010. SDValue Op1 = Op.getOperand(1);
  6011. SDValue CC = Op.getOperand(2);
  6012. EVT VT = Op.getValueType();
  6013. ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
  6014. SDLoc dl(Op);
  6015. EVT CmpVT;
  6016. if (ST->hasNEON())
  6017. CmpVT = Op0.getValueType().changeVectorElementTypeToInteger();
  6018. else {
  6019. assert(ST->hasMVEIntegerOps() &&
  6020. "No hardware support for integer vector comparison!");
  6021. if (Op.getValueType().getVectorElementType() != MVT::i1)
  6022. return SDValue();
  6023. // Make sure we expand floating point setcc to scalar if we do not have
  6024. // mve.fp, so that we can handle them from there.
  6025. if (Op0.getValueType().isFloatingPoint() && !ST->hasMVEFloatOps())
  6026. return SDValue();
  6027. CmpVT = VT;
  6028. }
  6029. if (Op0.getValueType().getVectorElementType() == MVT::i64 &&
  6030. (SetCCOpcode == ISD::SETEQ || SetCCOpcode == ISD::SETNE)) {
  6031. // Special-case integer 64-bit equality comparisons. They aren't legal,
  6032. // but they can be lowered with a few vector instructions.
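// Compare the halves as i32 lanes, then AND each lane with its VREV64-swapped
// partner so that a 64-bit element reads as all-ones only when both of its
// 32-bit halves matched.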
  6033. unsigned CmpElements = CmpVT.getVectorNumElements() * 2;
  6034. EVT SplitVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, CmpElements);
  6035. SDValue CastOp0 = DAG.getNode(ISD::BITCAST, dl, SplitVT, Op0);
  6036. SDValue CastOp1 = DAG.getNode(ISD::BITCAST, dl, SplitVT, Op1);
  6037. SDValue Cmp = DAG.getNode(ISD::SETCC, dl, SplitVT, CastOp0, CastOp1,
  6038. DAG.getCondCode(ISD::SETEQ));
  6039. SDValue Reversed = DAG.getNode(ARMISD::VREV64, dl, SplitVT, Cmp);
  6040. SDValue Merged = DAG.getNode(ISD::AND, dl, SplitVT, Cmp, Reversed);
  6041. Merged = DAG.getNode(ISD::BITCAST, dl, CmpVT, Merged);
  6042. if (SetCCOpcode == ISD::SETNE)
  6043. Merged = DAG.getNOT(dl, Merged, CmpVT);
  6044. Merged = DAG.getSExtOrTrunc(Merged, dl, VT);
  6045. return Merged;
  6046. }
  6047. if (CmpVT.getVectorElementType() == MVT::i64)
  6048. // 64-bit comparisons are not legal in general.
  6049. return SDValue();
  6050. if (Op1.getValueType().isFloatingPoint()) {
  6051. switch (SetCCOpcode) {
  6052. default: llvm_unreachable("Illegal FP comparison");
  6053. case ISD::SETUNE:
  6054. case ISD::SETNE:
  6055. if (ST->hasMVEFloatOps()) {
  6056. Opc = ARMCC::NE; break;
  6057. } else {
  6058. Invert = true; [[fallthrough]];
  6059. }
  6060. case ISD::SETOEQ:
  6061. case ISD::SETEQ: Opc = ARMCC::EQ; break;
  6062. case ISD::SETOLT:
  6063. case ISD::SETLT: Swap = true; [[fallthrough]];
  6064. case ISD::SETOGT:
  6065. case ISD::SETGT: Opc = ARMCC::GT; break;
  6066. case ISD::SETOLE:
  6067. case ISD::SETLE: Swap = true; [[fallthrough]];
  6068. case ISD::SETOGE:
  6069. case ISD::SETGE: Opc = ARMCC::GE; break;
  6070. case ISD::SETUGE: Swap = true; [[fallthrough]];
  6071. case ISD::SETULE: Invert = true; Opc = ARMCC::GT; break;
  6072. case ISD::SETUGT: Swap = true; [[fallthrough]];
  6073. case ISD::SETULT: Invert = true; Opc = ARMCC::GE; break;
  6074. case ISD::SETUEQ: Invert = true; [[fallthrough]];
  6075. case ISD::SETONE: {
  6076. // Expand this to (OLT | OGT).
  6077. SDValue TmpOp0 = DAG.getNode(ARMISD::VCMP, dl, CmpVT, Op1, Op0,
  6078. DAG.getConstant(ARMCC::GT, dl, MVT::i32));
  6079. SDValue TmpOp1 = DAG.getNode(ARMISD::VCMP, dl, CmpVT, Op0, Op1,
  6080. DAG.getConstant(ARMCC::GT, dl, MVT::i32));
  6081. SDValue Result = DAG.getNode(ISD::OR, dl, CmpVT, TmpOp0, TmpOp1);
  6082. if (Invert)
  6083. Result = DAG.getNOT(dl, Result, VT);
  6084. return Result;
  6085. }
  6086. case ISD::SETUO: Invert = true; [[fallthrough]];
  6087. case ISD::SETO: {
  6088. // Expand this to (OLT | OGE).
  6089. SDValue TmpOp0 = DAG.getNode(ARMISD::VCMP, dl, CmpVT, Op1, Op0,
  6090. DAG.getConstant(ARMCC::GT, dl, MVT::i32));
  6091. SDValue TmpOp1 = DAG.getNode(ARMISD::VCMP, dl, CmpVT, Op0, Op1,
  6092. DAG.getConstant(ARMCC::GE, dl, MVT::i32));
  6093. SDValue Result = DAG.getNode(ISD::OR, dl, CmpVT, TmpOp0, TmpOp1);
  6094. if (Invert)
  6095. Result = DAG.getNOT(dl, Result, VT);
  6096. return Result;
  6097. }
  6098. }
  6099. } else {
  6100. // Integer comparisons.
  6101. switch (SetCCOpcode) {
  6102. default: llvm_unreachable("Illegal integer comparison");
  6103. case ISD::SETNE:
  6104. if (ST->hasMVEIntegerOps()) {
  6105. Opc = ARMCC::NE; break;
  6106. } else {
  6107. Invert = true; [[fallthrough]];
  6108. }
  6109. case ISD::SETEQ: Opc = ARMCC::EQ; break;
  6110. case ISD::SETLT: Swap = true; [[fallthrough]];
  6111. case ISD::SETGT: Opc = ARMCC::GT; break;
  6112. case ISD::SETLE: Swap = true; [[fallthrough]];
  6113. case ISD::SETGE: Opc = ARMCC::GE; break;
  6114. case ISD::SETULT: Swap = true; [[fallthrough]];
  6115. case ISD::SETUGT: Opc = ARMCC::HI; break;
  6116. case ISD::SETULE: Swap = true; [[fallthrough]];
  6117. case ISD::SETUGE: Opc = ARMCC::HS; break;
  6118. }
  6119. // Detect VTST (Vector Test Bits) = icmp ne (and (op0, op1), zero).
  6120. if (ST->hasNEON() && Opc == ARMCC::EQ) {
  6121. SDValue AndOp;
  6122. if (ISD::isBuildVectorAllZeros(Op1.getNode()))
  6123. AndOp = Op0;
  6124. else if (ISD::isBuildVectorAllZeros(Op0.getNode()))
  6125. AndOp = Op1;
  6126. // Ignore bitconvert.
  6127. if (AndOp.getNode() && AndOp.getOpcode() == ISD::BITCAST)
  6128. AndOp = AndOp.getOperand(0);
  6129. if (AndOp.getNode() && AndOp.getOpcode() == ISD::AND) {
  6130. Op0 = DAG.getNode(ISD::BITCAST, dl, CmpVT, AndOp.getOperand(0));
  6131. Op1 = DAG.getNode(ISD::BITCAST, dl, CmpVT, AndOp.getOperand(1));
  6132. SDValue Result = DAG.getNode(ARMISD::VTST, dl, CmpVT, Op0, Op1);
  6133. if (!Invert)
  6134. Result = DAG.getNOT(dl, Result, VT);
  6135. return Result;
  6136. }
  6137. }
  6138. }
  6139. if (Swap)
  6140. std::swap(Op0, Op1);
  6141. // If one of the operands is a constant vector zero, attempt to fold the
  6142. // comparison to a specialized compare-against-zero form.
  6143. if (ISD::isBuildVectorAllZeros(Op0.getNode()) &&
  6144. (Opc == ARMCC::GE || Opc == ARMCC::GT || Opc == ARMCC::EQ ||
  6145. Opc == ARMCC::NE)) {
  6146. if (Opc == ARMCC::GE)
  6147. Opc = ARMCC::LE;
  6148. else if (Opc == ARMCC::GT)
  6149. Opc = ARMCC::LT;
  6150. std::swap(Op0, Op1);
  6151. }
  6152. SDValue Result;
  6153. if (ISD::isBuildVectorAllZeros(Op1.getNode()) &&
  6154. (Opc == ARMCC::GE || Opc == ARMCC::GT || Opc == ARMCC::LE ||
  6155. Opc == ARMCC::LT || Opc == ARMCC::NE || Opc == ARMCC::EQ))
  6156. Result = DAG.getNode(ARMISD::VCMPZ, dl, CmpVT, Op0,
  6157. DAG.getConstant(Opc, dl, MVT::i32));
  6158. else
  6159. Result = DAG.getNode(ARMISD::VCMP, dl, CmpVT, Op0, Op1,
  6160. DAG.getConstant(Opc, dl, MVT::i32));
  6161. Result = DAG.getSExtOrTrunc(Result, dl, VT);
  6162. if (Invert)
  6163. Result = DAG.getNOT(dl, Result, VT);
  6164. return Result;
  6165. }
  6166. static SDValue LowerSETCCCARRY(SDValue Op, SelectionDAG &DAG) {
  6167. SDValue LHS = Op.getOperand(0);
  6168. SDValue RHS = Op.getOperand(1);
  6169. SDValue Carry = Op.getOperand(2);
  6170. SDValue Cond = Op.getOperand(3);
  6171. SDLoc DL(Op);
  6172. assert(LHS.getSimpleValueType().isInteger() && "SETCCCARRY is integer only.");
6173. // ARMISD::SUBE expects a carry, not a borrow like ISD::SUBCARRY, so we
6174. // have to invert the carry first.
  6175. Carry = DAG.getNode(ISD::SUB, DL, MVT::i32,
  6176. DAG.getConstant(1, DL, MVT::i32), Carry);
  6177. // This converts the boolean value carry into the carry flag.
  6178. Carry = ConvertBooleanCarryToCarryFlag(Carry, DAG);
  6179. SDVTList VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
  6180. SDValue Cmp = DAG.getNode(ARMISD::SUBE, DL, VTs, LHS, RHS, Carry);
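// The subtraction is performed only for the flags it produces; the CMOV below
// materializes the requested condition as a 0/1 value from those flags.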
  6181. SDValue FVal = DAG.getConstant(0, DL, MVT::i32);
  6182. SDValue TVal = DAG.getConstant(1, DL, MVT::i32);
  6183. SDValue ARMcc = DAG.getConstant(
  6184. IntCCToARMCC(cast<CondCodeSDNode>(Cond)->get()), DL, MVT::i32);
  6185. SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
  6186. SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), DL, ARM::CPSR,
  6187. Cmp.getValue(1), SDValue());
  6188. return DAG.getNode(ARMISD::CMOV, DL, Op.getValueType(), FVal, TVal, ARMcc,
  6189. CCR, Chain.getValue(1));
  6190. }
  6191. /// isVMOVModifiedImm - Check if the specified splat value corresponds to a
  6192. /// valid vector constant for a NEON or MVE instruction with a "modified
  6193. /// immediate" operand (e.g., VMOV). If so, return the encoded value.
  6194. static SDValue isVMOVModifiedImm(uint64_t SplatBits, uint64_t SplatUndef,
  6195. unsigned SplatBitSize, SelectionDAG &DAG,
  6196. const SDLoc &dl, EVT &VT, EVT VectorVT,
  6197. VMOVModImmType type) {
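// For example, a 32-bit splat of 0x00ab0000 can be encoded with OpCmode = 0x4
// and Imm = 0xab (see the SplatBitSize == 32 cases below).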
  6198. unsigned OpCmode, Imm;
  6199. bool is128Bits = VectorVT.is128BitVector();
  6200. // SplatBitSize is set to the smallest size that splats the vector, so a
  6201. // zero vector will always have SplatBitSize == 8. However, NEON modified
6202. // immediate instructions other than VMOV do not support the 8-bit encoding
  6203. // of a zero vector, and the default encoding of zero is supposed to be the
  6204. // 32-bit version.
  6205. if (SplatBits == 0)
  6206. SplatBitSize = 32;
  6207. switch (SplatBitSize) {
  6208. case 8:
  6209. if (type != VMOVModImm)
  6210. return SDValue();
  6211. // Any 1-byte value is OK. Op=0, Cmode=1110.
  6212. assert((SplatBits & ~0xff) == 0 && "one byte splat value is too big");
  6213. OpCmode = 0xe;
  6214. Imm = SplatBits;
  6215. VT = is128Bits ? MVT::v16i8 : MVT::v8i8;
  6216. break;
  6217. case 16:
  6218. // NEON's 16-bit VMOV supports splat values where only one byte is nonzero.
  6219. VT = is128Bits ? MVT::v8i16 : MVT::v4i16;
  6220. if ((SplatBits & ~0xff) == 0) {
  6221. // Value = 0x00nn: Op=x, Cmode=100x.
  6222. OpCmode = 0x8;
  6223. Imm = SplatBits;
  6224. break;
  6225. }
  6226. if ((SplatBits & ~0xff00) == 0) {
  6227. // Value = 0xnn00: Op=x, Cmode=101x.
  6228. OpCmode = 0xa;
  6229. Imm = SplatBits >> 8;
  6230. break;
  6231. }
  6232. return SDValue();
  6233. case 32:
  6234. // NEON's 32-bit VMOV supports splat values where:
  6235. // * only one byte is nonzero, or
  6236. // * the least significant byte is 0xff and the second byte is nonzero, or
  6237. // * the least significant 2 bytes are 0xff and the third is nonzero.
  6238. VT = is128Bits ? MVT::v4i32 : MVT::v2i32;
  6239. if ((SplatBits & ~0xff) == 0) {
  6240. // Value = 0x000000nn: Op=x, Cmode=000x.
  6241. OpCmode = 0;
  6242. Imm = SplatBits;
  6243. break;
  6244. }
  6245. if ((SplatBits & ~0xff00) == 0) {
  6246. // Value = 0x0000nn00: Op=x, Cmode=001x.
  6247. OpCmode = 0x2;
  6248. Imm = SplatBits >> 8;
  6249. break;
  6250. }
  6251. if ((SplatBits & ~0xff0000) == 0) {
  6252. // Value = 0x00nn0000: Op=x, Cmode=010x.
  6253. OpCmode = 0x4;
  6254. Imm = SplatBits >> 16;
  6255. break;
  6256. }
  6257. if ((SplatBits & ~0xff000000) == 0) {
  6258. // Value = 0xnn000000: Op=x, Cmode=011x.
  6259. OpCmode = 0x6;
  6260. Imm = SplatBits >> 24;
  6261. break;
  6262. }
  6263. // cmode == 0b1100 and cmode == 0b1101 are not supported for VORR or VBIC
  6264. if (type == OtherModImm) return SDValue();
  6265. if ((SplatBits & ~0xffff) == 0 &&
  6266. ((SplatBits | SplatUndef) & 0xff) == 0xff) {
  6267. // Value = 0x0000nnff: Op=x, Cmode=1100.
  6268. OpCmode = 0xc;
  6269. Imm = SplatBits >> 8;
  6270. break;
  6271. }
  6272. // cmode == 0b1101 is not supported for MVE VMVN
  6273. if (type == MVEVMVNModImm)
  6274. return SDValue();
  6275. if ((SplatBits & ~0xffffff) == 0 &&
  6276. ((SplatBits | SplatUndef) & 0xffff) == 0xffff) {
  6277. // Value = 0x00nnffff: Op=x, Cmode=1101.
  6278. OpCmode = 0xd;
  6279. Imm = SplatBits >> 16;
  6280. break;
  6281. }
  6282. // Note: there are a few 32-bit splat values (specifically: 00ffff00,
  6283. // ff000000, ff0000ff, and ffff00ff) that are valid for VMOV.I64 but not
  6284. // VMOV.I32. A (very) minor optimization would be to replicate the value
  6285. // and fall through here to test for a valid 64-bit splat. But, then the
  6286. // caller would also need to check and handle the change in size.
  6287. return SDValue();
  6288. case 64: {
  6289. if (type != VMOVModImm)
  6290. return SDValue();
  6291. // NEON has a 64-bit VMOV splat where each byte is either 0 or 0xff.
  6292. uint64_t BitMask = 0xff;
  6293. unsigned ImmMask = 1;
  6294. Imm = 0;
  6295. for (int ByteNum = 0; ByteNum < 8; ++ByteNum) {
  6296. if (((SplatBits | SplatUndef) & BitMask) == BitMask) {
  6297. Imm |= ImmMask;
  6298. } else if ((SplatBits & BitMask) != 0) {
  6299. return SDValue();
  6300. }
  6301. BitMask <<= 8;
  6302. ImmMask <<= 1;
  6303. }
  6304. if (DAG.getDataLayout().isBigEndian()) {
  6305. // Reverse the order of elements within the vector.
  6306. unsigned BytesPerElem = VectorVT.getScalarSizeInBits() / 8;
  6307. unsigned Mask = (1 << BytesPerElem) - 1;
  6308. unsigned NumElems = 8 / BytesPerElem;
  6309. unsigned NewImm = 0;
  6310. for (unsigned ElemNum = 0; ElemNum < NumElems; ++ElemNum) {
  6311. unsigned Elem = ((Imm >> ElemNum * BytesPerElem) & Mask);
  6312. NewImm |= Elem << (NumElems - ElemNum - 1) * BytesPerElem;
  6313. }
  6314. Imm = NewImm;
  6315. }
  6316. // Op=1, Cmode=1110.
  6317. OpCmode = 0x1e;
  6318. VT = is128Bits ? MVT::v2i64 : MVT::v1i64;
  6319. break;
  6320. }
  6321. default:
  6322. llvm_unreachable("unexpected size for isVMOVModifiedImm");
  6323. }
  6324. unsigned EncodedVal = ARM_AM::createVMOVModImm(OpCmode, Imm);
  6325. return DAG.getTargetConstant(EncodedVal, dl, MVT::i32);
  6326. }
  6327. SDValue ARMTargetLowering::LowerConstantFP(SDValue Op, SelectionDAG &DAG,
  6328. const ARMSubtarget *ST) const {
  6329. EVT VT = Op.getValueType();
  6330. bool IsDouble = (VT == MVT::f64);
  6331. ConstantFPSDNode *CFP = cast<ConstantFPSDNode>(Op);
  6332. const APFloat &FPVal = CFP->getValueAPF();
  6333. // Prevent floating-point constants from using literal loads
  6334. // when execute-only is enabled.
  6335. if (ST->genExecuteOnly()) {
  6336. // If we can represent the constant as an immediate, don't lower it
  6337. if (isFPImmLegal(FPVal, VT))
  6338. return Op;
  6339. // Otherwise, construct as integer, and move to float register
  6340. APInt INTVal = FPVal.bitcastToAPInt();
  6341. SDLoc DL(CFP);
  6342. switch (VT.getSimpleVT().SimpleTy) {
  6343. default:
  6344. llvm_unreachable("Unknown floating point type!");
  6345. break;
  6346. case MVT::f64: {
  6347. SDValue Lo = DAG.getConstant(INTVal.trunc(32), DL, MVT::i32);
  6348. SDValue Hi = DAG.getConstant(INTVal.lshr(32).trunc(32), DL, MVT::i32);
  6349. return DAG.getNode(ARMISD::VMOVDRR, DL, MVT::f64, Lo, Hi);
  6350. }
  6351. case MVT::f32:
  6352. return DAG.getNode(ARMISD::VMOVSR, DL, VT,
  6353. DAG.getConstant(INTVal, DL, MVT::i32));
  6354. }
  6355. }
  6356. if (!ST->hasVFP3Base())
  6357. return SDValue();
  6358. // Use the default (constant pool) lowering for double constants when we have
  6359. // an SP-only FPU
  6360. if (IsDouble && !Subtarget->hasFP64())
  6361. return SDValue();
  6362. // Try splatting with a VMOV.f32...
  6363. int ImmVal = IsDouble ? ARM_AM::getFP64Imm(FPVal) : ARM_AM::getFP32Imm(FPVal);
  6364. if (ImmVal != -1) {
  6365. if (IsDouble || !ST->useNEONForSinglePrecisionFP()) {
  6366. // We have code in place to select a valid ConstantFP already, no need to
  6367. // do any mangling.
  6368. return Op;
  6369. }
  6370. // It's a float and we are trying to use NEON operations where
  6371. // possible. Lower it to a splat followed by an extract.
  6372. SDLoc DL(Op);
  6373. SDValue NewVal = DAG.getTargetConstant(ImmVal, DL, MVT::i32);
  6374. SDValue VecConstant = DAG.getNode(ARMISD::VMOVFPIMM, DL, MVT::v2f32,
  6375. NewVal);
  6376. return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, VecConstant,
  6377. DAG.getConstant(0, DL, MVT::i32));
  6378. }
  6379. // The rest of our options are NEON only, make sure that's allowed before
6380. // proceeding.
  6381. if (!ST->hasNEON() || (!IsDouble && !ST->useNEONForSinglePrecisionFP()))
  6382. return SDValue();
  6383. EVT VMovVT;
  6384. uint64_t iVal = FPVal.bitcastToAPInt().getZExtValue();
  6385. // It wouldn't really be worth bothering for doubles except for one very
  6386. // important value, which does happen to match: 0.0. So make sure we don't do
  6387. // anything stupid.
  6388. if (IsDouble && (iVal & 0xffffffff) != (iVal >> 32))
  6389. return SDValue();
  6390. // Try a VMOV.i32 (FIXME: i8, i16, or i64 could work too).
  6391. SDValue NewVal = isVMOVModifiedImm(iVal & 0xffffffffU, 0, 32, DAG, SDLoc(Op),
  6392. VMovVT, VT, VMOVModImm);
  6393. if (NewVal != SDValue()) {
  6394. SDLoc DL(Op);
  6395. SDValue VecConstant = DAG.getNode(ARMISD::VMOVIMM, DL, VMovVT,
  6396. NewVal);
  6397. if (IsDouble)
  6398. return DAG.getNode(ISD::BITCAST, DL, MVT::f64, VecConstant);
  6399. // It's a float: cast and extract a vector element.
  6400. SDValue VecFConstant = DAG.getNode(ISD::BITCAST, DL, MVT::v2f32,
  6401. VecConstant);
  6402. return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, VecFConstant,
  6403. DAG.getConstant(0, DL, MVT::i32));
  6404. }
  6405. // Finally, try a VMVN.i32
  6406. NewVal = isVMOVModifiedImm(~iVal & 0xffffffffU, 0, 32, DAG, SDLoc(Op), VMovVT,
  6407. VT, VMVNModImm);
  6408. if (NewVal != SDValue()) {
  6409. SDLoc DL(Op);
  6410. SDValue VecConstant = DAG.getNode(ARMISD::VMVNIMM, DL, VMovVT, NewVal);
  6411. if (IsDouble)
  6412. return DAG.getNode(ISD::BITCAST, DL, MVT::f64, VecConstant);
  6413. // It's a float: cast and extract a vector element.
  6414. SDValue VecFConstant = DAG.getNode(ISD::BITCAST, DL, MVT::v2f32,
  6415. VecConstant);
  6416. return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, VecFConstant,
  6417. DAG.getConstant(0, DL, MVT::i32));
  6418. }
  6419. return SDValue();
  6420. }
6421. // Check if a VEXT instruction can handle the shuffle mask when the vector
6422. // sources of the shuffle are the same.
  6423. static bool isSingletonVEXTMask(ArrayRef<int> M, EVT VT, unsigned &Imm) {
  6424. unsigned NumElts = VT.getVectorNumElements();
  6425. // Assume that the first shuffle index is not UNDEF. Fail if it is.
  6426. if (M[0] < 0)
  6427. return false;
  6428. Imm = M[0];
  6429. // If this is a VEXT shuffle, the immediate value is the index of the first
  6430. // element. The other shuffle indices must be the successive elements after
  6431. // the first one.
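// e.g. for v8i8, <2, 3, 4, 5, 6, 7, 0, 1> is such a mask, with Imm = 2.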
  6432. unsigned ExpectedElt = Imm;
  6433. for (unsigned i = 1; i < NumElts; ++i) {
  6434. // Increment the expected index. If it wraps around, just follow it
  6435. // back to index zero and keep going.
  6436. ++ExpectedElt;
  6437. if (ExpectedElt == NumElts)
  6438. ExpectedElt = 0;
  6439. if (M[i] < 0) continue; // ignore UNDEF indices
  6440. if (ExpectedElt != static_cast<unsigned>(M[i]))
  6441. return false;
  6442. }
  6443. return true;
  6444. }
  6445. static bool isVEXTMask(ArrayRef<int> M, EVT VT,
  6446. bool &ReverseVEXT, unsigned &Imm) {
  6447. unsigned NumElts = VT.getVectorNumElements();
  6448. ReverseVEXT = false;
  6449. // Assume that the first shuffle index is not UNDEF. Fail if it is.
  6450. if (M[0] < 0)
  6451. return false;
  6452. Imm = M[0];
  6453. // If this is a VEXT shuffle, the immediate value is the index of the first
  6454. // element. The other shuffle indices must be the successive elements after
  6455. // the first one.
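// e.g. for v8i8, <3, 4, 5, 6, 7, 8, 9, 10> is VEXT(v1, v2) with Imm = 3.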
  6456. unsigned ExpectedElt = Imm;
  6457. for (unsigned i = 1; i < NumElts; ++i) {
  6458. // Increment the expected index. If it wraps around, it may still be
  6459. // a VEXT but the source vectors must be swapped.
  6460. ExpectedElt += 1;
  6461. if (ExpectedElt == NumElts * 2) {
  6462. ExpectedElt = 0;
  6463. ReverseVEXT = true;
  6464. }
  6465. if (M[i] < 0) continue; // ignore UNDEF indices
  6466. if (ExpectedElt != static_cast<unsigned>(M[i]))
  6467. return false;
  6468. }
  6469. // Adjust the index value if the source operands will be swapped.
  6470. if (ReverseVEXT)
  6471. Imm -= NumElts;
  6472. return true;
  6473. }
  6474. static bool isVTBLMask(ArrayRef<int> M, EVT VT) {
  6475. // We can handle <8 x i8> vector shuffles. If the index in the mask is out of
  6476. // range, then 0 is placed into the resulting vector. So pretty much any mask
  6477. // of 8 elements can work here.
  6478. return VT == MVT::v8i8 && M.size() == 8;
  6479. }
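// Determine which of the two results (0 or 1) the element at Index belongs
// to: for a double-length mask this is simply the half that Index falls in;
// otherwise it is inferred from whether the mask element at Index is zero.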
  6480. static unsigned SelectPairHalf(unsigned Elements, ArrayRef<int> Mask,
  6481. unsigned Index) {
  6482. if (Mask.size() == Elements * 2)
  6483. return Index / Elements;
  6484. return Mask[Index] == 0 ? 0 : 1;
  6485. }
  6486. // Checks whether the shuffle mask represents a vector transpose (VTRN) by
  6487. // checking that pairs of elements in the shuffle mask represent the same index
  6488. // in each vector, incrementing the expected index by 2 at each step.
  6489. // e.g. For v1,v2 of type v4i32 a valid shuffle mask is: [0, 4, 2, 6]
  6490. // v1={a,b,c,d} => x=shufflevector v1, v2 shufflemask => x={a,e,c,g}
  6491. // v2={e,f,g,h}
  6492. // WhichResult gives the offset for each element in the mask based on which
  6493. // of the two results it belongs to.
  6494. //
  6495. // The transpose can be represented either as:
  6496. // result1 = shufflevector v1, v2, result1_shuffle_mask
  6497. // result2 = shufflevector v1, v2, result2_shuffle_mask
  6498. // where v1/v2 and the shuffle masks have the same number of elements
  6499. // (here WhichResult (see below) indicates which result is being checked)
  6500. //
  6501. // or as:
  6502. // results = shufflevector v1, v2, shuffle_mask
  6503. // where both results are returned in one vector and the shuffle mask has twice
6504. // as many elements as v1/v2 (in this case WhichResult will always be 0 if the
6505. // mask matches); here we check the low half and the high half of the shuffle
6506. // mask as if each were a mask of the first form.
  6507. static bool isVTRNMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
  6508. unsigned EltSz = VT.getScalarSizeInBits();
  6509. if (EltSz == 64)
  6510. return false;
  6511. unsigned NumElts = VT.getVectorNumElements();
  6512. if (M.size() != NumElts && M.size() != NumElts*2)
  6513. return false;
  6514. // If the mask is twice as long as the input vector then we need to check the
  6515. // upper and lower parts of the mask with a matching value for WhichResult
  6516. // FIXME: A mask with only even values will be rejected in case the first
  6517. // element is undefined, e.g. [-1, 4, 2, 6] will be rejected, because only
  6518. // M[0] is used to determine WhichResult
  6519. for (unsigned i = 0; i < M.size(); i += NumElts) {
  6520. WhichResult = SelectPairHalf(NumElts, M, i);
  6521. for (unsigned j = 0; j < NumElts; j += 2) {
  6522. if ((M[i+j] >= 0 && (unsigned) M[i+j] != j + WhichResult) ||
  6523. (M[i+j+1] >= 0 && (unsigned) M[i+j+1] != j + NumElts + WhichResult))
  6524. return false;
  6525. }
  6526. }
  6527. if (M.size() == NumElts*2)
  6528. WhichResult = 0;
  6529. return true;
  6530. }
  6531. /// isVTRN_v_undef_Mask - Special case of isVTRNMask for canonical form of
  6532. /// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
  6533. /// Mask is e.g., <0, 0, 2, 2> instead of <0, 4, 2, 6>.
  6534. static bool isVTRN_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult){
  6535. unsigned EltSz = VT.getScalarSizeInBits();
  6536. if (EltSz == 64)
  6537. return false;
  6538. unsigned NumElts = VT.getVectorNumElements();
  6539. if (M.size() != NumElts && M.size() != NumElts*2)
  6540. return false;
  6541. for (unsigned i = 0; i < M.size(); i += NumElts) {
  6542. WhichResult = SelectPairHalf(NumElts, M, i);
  6543. for (unsigned j = 0; j < NumElts; j += 2) {
  6544. if ((M[i+j] >= 0 && (unsigned) M[i+j] != j + WhichResult) ||
  6545. (M[i+j+1] >= 0 && (unsigned) M[i+j+1] != j + WhichResult))
  6546. return false;
  6547. }
  6548. }
  6549. if (M.size() == NumElts*2)
  6550. WhichResult = 0;
  6551. return true;
  6552. }
  6553. // Checks whether the shuffle mask represents a vector unzip (VUZP) by checking
  6554. // that the mask elements are either all even and in steps of size 2 or all odd
  6555. // and in steps of size 2.
  6556. // e.g. For v1,v2 of type v4i32 a valid shuffle mask is: [0, 2, 4, 6]
  6557. // v1={a,b,c,d} => x=shufflevector v1, v2 shufflemask => x={a,c,e,g}
  6558. // v2={e,f,g,h}
6559. // Requires checks similar to those of isVTRNMask with respect to how the
6560. // results are returned.
  6561. static bool isVUZPMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
  6562. unsigned EltSz = VT.getScalarSizeInBits();
  6563. if (EltSz == 64)
  6564. return false;
  6565. unsigned NumElts = VT.getVectorNumElements();
  6566. if (M.size() != NumElts && M.size() != NumElts*2)
  6567. return false;
  6568. for (unsigned i = 0; i < M.size(); i += NumElts) {
  6569. WhichResult = SelectPairHalf(NumElts, M, i);
  6570. for (unsigned j = 0; j < NumElts; ++j) {
  6571. if (M[i+j] >= 0 && (unsigned) M[i+j] != 2 * j + WhichResult)
  6572. return false;
  6573. }
  6574. }
  6575. if (M.size() == NumElts*2)
  6576. WhichResult = 0;
  6577. // VUZP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32.
  6578. if (VT.is64BitVector() && EltSz == 32)
  6579. return false;
  6580. return true;
  6581. }
  6582. /// isVUZP_v_undef_Mask - Special case of isVUZPMask for canonical form of
  6583. /// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
6584. /// Mask is e.g., <0, 2, 0, 2> instead of <0, 2, 4, 6>.
  6585. static bool isVUZP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult){
  6586. unsigned EltSz = VT.getScalarSizeInBits();
  6587. if (EltSz == 64)
  6588. return false;
  6589. unsigned NumElts = VT.getVectorNumElements();
  6590. if (M.size() != NumElts && M.size() != NumElts*2)
  6591. return false;
  6592. unsigned Half = NumElts / 2;
  6593. for (unsigned i = 0; i < M.size(); i += NumElts) {
  6594. WhichResult = SelectPairHalf(NumElts, M, i);
  6595. for (unsigned j = 0; j < NumElts; j += Half) {
  6596. unsigned Idx = WhichResult;
  6597. for (unsigned k = 0; k < Half; ++k) {
  6598. int MIdx = M[i + j + k];
  6599. if (MIdx >= 0 && (unsigned) MIdx != Idx)
  6600. return false;
  6601. Idx += 2;
  6602. }
  6603. }
  6604. }
  6605. if (M.size() == NumElts*2)
  6606. WhichResult = 0;
  6607. // VUZP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32.
  6608. if (VT.is64BitVector() && EltSz == 32)
  6609. return false;
  6610. return true;
  6611. }
  6612. // Checks whether the shuffle mask represents a vector zip (VZIP) by checking
  6613. // that pairs of elements of the shufflemask represent the same index in each
  6614. // vector incrementing sequentially through the vectors.
  6615. // e.g. For v1,v2 of type v4i32 a valid shuffle mask is: [0, 4, 1, 5]
  6616. // v1={a,b,c,d} => x=shufflevector v1, v2 shufflemask => x={a,e,b,f}
  6617. // v2={e,f,g,h}
6618. // Requires checks similar to those of isVTRNMask with respect to how the
6619. // results are returned.
  6620. static bool isVZIPMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
  6621. unsigned EltSz = VT.getScalarSizeInBits();
  6622. if (EltSz == 64)
  6623. return false;
  6624. unsigned NumElts = VT.getVectorNumElements();
  6625. if (M.size() != NumElts && M.size() != NumElts*2)
  6626. return false;
  6627. for (unsigned i = 0; i < M.size(); i += NumElts) {
  6628. WhichResult = SelectPairHalf(NumElts, M, i);
  6629. unsigned Idx = WhichResult * NumElts / 2;
  6630. for (unsigned j = 0; j < NumElts; j += 2) {
  6631. if ((M[i+j] >= 0 && (unsigned) M[i+j] != Idx) ||
  6632. (M[i+j+1] >= 0 && (unsigned) M[i+j+1] != Idx + NumElts))
  6633. return false;
  6634. Idx += 1;
  6635. }
  6636. }
  6637. if (M.size() == NumElts*2)
  6638. WhichResult = 0;
  6639. // VZIP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32.
  6640. if (VT.is64BitVector() && EltSz == 32)
  6641. return false;
  6642. return true;
  6643. }
  6644. /// isVZIP_v_undef_Mask - Special case of isVZIPMask for canonical form of
  6645. /// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
  6646. /// Mask is e.g., <0, 0, 1, 1> instead of <0, 4, 1, 5>.
  6647. static bool isVZIP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult){
  6648. unsigned EltSz = VT.getScalarSizeInBits();
  6649. if (EltSz == 64)
  6650. return false;
  6651. unsigned NumElts = VT.getVectorNumElements();
  6652. if (M.size() != NumElts && M.size() != NumElts*2)
  6653. return false;
  6654. for (unsigned i = 0; i < M.size(); i += NumElts) {
  6655. WhichResult = SelectPairHalf(NumElts, M, i);
  6656. unsigned Idx = WhichResult * NumElts / 2;
  6657. for (unsigned j = 0; j < NumElts; j += 2) {
  6658. if ((M[i+j] >= 0 && (unsigned) M[i+j] != Idx) ||
  6659. (M[i+j+1] >= 0 && (unsigned) M[i+j+1] != Idx))
  6660. return false;
  6661. Idx += 1;
  6662. }
  6663. }
  6664. if (M.size() == NumElts*2)
  6665. WhichResult = 0;
  6666. // VZIP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32.
  6667. if (VT.is64BitVector() && EltSz == 32)
  6668. return false;
  6669. return true;
  6670. }
  6671. /// Check if \p ShuffleMask is a NEON two-result shuffle (VZIP, VUZP, VTRN),
  6672. /// and return the corresponding ARMISD opcode if it is, or 0 if it isn't.
  6673. static unsigned isNEONTwoResultShuffleMask(ArrayRef<int> ShuffleMask, EVT VT,
  6674. unsigned &WhichResult,
  6675. bool &isV_UNDEF) {
  6676. isV_UNDEF = false;
  6677. if (isVTRNMask(ShuffleMask, VT, WhichResult))
  6678. return ARMISD::VTRN;
  6679. if (isVUZPMask(ShuffleMask, VT, WhichResult))
  6680. return ARMISD::VUZP;
  6681. if (isVZIPMask(ShuffleMask, VT, WhichResult))
  6682. return ARMISD::VZIP;
  6683. isV_UNDEF = true;
  6684. if (isVTRN_v_undef_Mask(ShuffleMask, VT, WhichResult))
  6685. return ARMISD::VTRN;
  6686. if (isVUZP_v_undef_Mask(ShuffleMask, VT, WhichResult))
  6687. return ARMISD::VUZP;
  6688. if (isVZIP_v_undef_Mask(ShuffleMask, VT, WhichResult))
  6689. return ARMISD::VZIP;
  6690. return 0;
  6691. }
6692. /// \return true if this is a reverse operation on a vector.
  6693. static bool isReverseMask(ArrayRef<int> M, EVT VT) {
  6694. unsigned NumElts = VT.getVectorNumElements();
  6695. // Make sure the mask has the right size.
  6696. if (NumElts != M.size())
  6697. return false;
  6698. // Look for <15, ..., 3, -1, 1, 0>.
  6699. for (unsigned i = 0; i != NumElts; ++i)
  6700. if (M[i] >= 0 && M[i] != (int) (NumElts - 1 - i))
  6701. return false;
  6702. return true;
  6703. }
  6704. static bool isTruncMask(ArrayRef<int> M, EVT VT, bool Top, bool SingleSource) {
  6705. unsigned NumElts = VT.getVectorNumElements();
  6706. // Make sure the mask has the right size.
  6707. if (NumElts != M.size() || (VT != MVT::v8i16 && VT != MVT::v16i8))
  6708. return false;
  6709. // Half-width truncation patterns (e.g. v4i32 -> v8i16):
  6710. // !Top && SingleSource: <0, 2, 4, 6, 0, 2, 4, 6>
  6711. // !Top && !SingleSource: <0, 2, 4, 6, 8, 10, 12, 14>
  6712. // Top && SingleSource: <1, 3, 5, 7, 1, 3, 5, 7>
  6713. // Top && !SingleSource: <1, 3, 5, 7, 9, 11, 13, 15>
  6714. int Ofs = Top ? 1 : 0;
  6715. int Upper = SingleSource ? 0 : NumElts;
  6716. for (int i = 0, e = NumElts / 2; i != e; ++i) {
  6717. if (M[i] >= 0 && M[i] != (i * 2) + Ofs)
  6718. return false;
  6719. if (M[i + e] >= 0 && M[i + e] != (i * 2) + Ofs + Upper)
  6720. return false;
  6721. }
  6722. return true;
  6723. }
  6724. static bool isVMOVNMask(ArrayRef<int> M, EVT VT, bool Top, bool SingleSource) {
  6725. unsigned NumElts = VT.getVectorNumElements();
  6726. // Make sure the mask has the right size.
  6727. if (NumElts != M.size() || (VT != MVT::v8i16 && VT != MVT::v16i8))
  6728. return false;
  6729. // If Top
  6730. // Look for <0, N, 2, N+2, 4, N+4, ..>.
  6731. // This inserts Input2 into Input1
  6732. // else if not Top
  6733. // Look for <0, N+1, 2, N+3, 4, N+5, ..>
  6734. // This inserts Input1 into Input2
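// e.g. for v8i16 with two sources, Top expects <0, 8, 2, 10, 4, 12, 6, 14>
// and !Top expects <0, 9, 2, 11, 4, 13, 6, 15>.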
  6735. unsigned Offset = Top ? 0 : 1;
  6736. unsigned N = SingleSource ? 0 : NumElts;
  6737. for (unsigned i = 0; i < NumElts; i += 2) {
  6738. if (M[i] >= 0 && M[i] != (int)i)
  6739. return false;
  6740. if (M[i + 1] >= 0 && M[i + 1] != (int)(N + i + Offset))
  6741. return false;
  6742. }
  6743. return true;
  6744. }
  6745. static bool isVMOVNTruncMask(ArrayRef<int> M, EVT ToVT, bool rev) {
  6746. unsigned NumElts = ToVT.getVectorNumElements();
  6747. if (NumElts != M.size())
  6748. return false;
6749. // Test if the Trunc can be converted to a VMOVN with this shuffle. We are
  6750. // looking for patterns of:
  6751. // !rev: 0 N/2 1 N/2+1 2 N/2+2 ...
  6752. // rev: N/2 0 N/2+1 1 N/2+2 2 ...
  6753. unsigned Off0 = rev ? NumElts / 2 : 0;
  6754. unsigned Off1 = rev ? 0 : NumElts / 2;
  6755. for (unsigned i = 0; i < NumElts; i += 2) {
  6756. if (M[i] >= 0 && M[i] != (int)(Off0 + i / 2))
  6757. return false;
  6758. if (M[i + 1] >= 0 && M[i + 1] != (int)(Off1 + i / 2))
  6759. return false;
  6760. }
  6761. return true;
  6762. }
  6763. // Reconstruct an MVE VCVT from a BuildVector of scalar fptrunc, all extracted
  6764. // from a pair of inputs. For example:
  6765. // BUILDVECTOR(FP_ROUND(EXTRACT_ELT(X, 0),
  6766. // FP_ROUND(EXTRACT_ELT(Y, 0),
  6767. // FP_ROUND(EXTRACT_ELT(X, 1),
  6768. // FP_ROUND(EXTRACT_ELT(Y, 1), ...)
  6769. static SDValue LowerBuildVectorOfFPTrunc(SDValue BV, SelectionDAG &DAG,
  6770. const ARMSubtarget *ST) {
  6771. assert(BV.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!");
  6772. if (!ST->hasMVEFloatOps())
  6773. return SDValue();
  6774. SDLoc dl(BV);
  6775. EVT VT = BV.getValueType();
  6776. if (VT != MVT::v8f16)
  6777. return SDValue();
  6778. // We are looking for a buildvector of fptrunc elements, where all the
  6779. // elements are interleavingly extracted from two sources. Check the first two
  6780. // items are valid enough and extract some info from them (they are checked
  6781. // properly in the loop below).
  6782. if (BV.getOperand(0).getOpcode() != ISD::FP_ROUND ||
  6783. BV.getOperand(0).getOperand(0).getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
  6784. BV.getOperand(0).getOperand(0).getConstantOperandVal(1) != 0)
  6785. return SDValue();
  6786. if (BV.getOperand(1).getOpcode() != ISD::FP_ROUND ||
  6787. BV.getOperand(1).getOperand(0).getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
  6788. BV.getOperand(1).getOperand(0).getConstantOperandVal(1) != 0)
  6789. return SDValue();
  6790. SDValue Op0 = BV.getOperand(0).getOperand(0).getOperand(0);
  6791. SDValue Op1 = BV.getOperand(1).getOperand(0).getOperand(0);
  6792. if (Op0.getValueType() != MVT::v4f32 || Op1.getValueType() != MVT::v4f32)
  6793. return SDValue();
  6794. // Check all the values in the BuildVector line up with our expectations.
  6795. for (unsigned i = 1; i < 4; i++) {
  6796. auto Check = [](SDValue Trunc, SDValue Op, unsigned Idx) {
  6797. return Trunc.getOpcode() == ISD::FP_ROUND &&
  6798. Trunc.getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
  6799. Trunc.getOperand(0).getOperand(0) == Op &&
  6800. Trunc.getOperand(0).getConstantOperandVal(1) == Idx;
  6801. };
  6802. if (!Check(BV.getOperand(i * 2 + 0), Op0, i))
  6803. return SDValue();
  6804. if (!Check(BV.getOperand(i * 2 + 1), Op1, i))
  6805. return SDValue();
  6806. }
  6807. SDValue N1 = DAG.getNode(ARMISD::VCVTN, dl, VT, DAG.getUNDEF(VT), Op0,
  6808. DAG.getConstant(0, dl, MVT::i32));
  6809. return DAG.getNode(ARMISD::VCVTN, dl, VT, N1, Op1,
  6810. DAG.getConstant(1, dl, MVT::i32));
  6811. }
  6812. // Reconstruct an MVE VCVT from a BuildVector of scalar fpext, all extracted
  6813. // from a single input on alternating lanes. For example:
6814. // BUILDVECTOR(FP_EXTEND(EXTRACT_ELT(X, 0),
6815. // FP_EXTEND(EXTRACT_ELT(X, 2),
6816. // FP_EXTEND(EXTRACT_ELT(X, 4), ...)
  6817. static SDValue LowerBuildVectorOfFPExt(SDValue BV, SelectionDAG &DAG,
  6818. const ARMSubtarget *ST) {
  6819. assert(BV.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!");
  6820. if (!ST->hasMVEFloatOps())
  6821. return SDValue();
  6822. SDLoc dl(BV);
  6823. EVT VT = BV.getValueType();
  6824. if (VT != MVT::v4f32)
  6825. return SDValue();
6826. // We are looking for a buildvector of fpext elements, where all the
  6827. // elements are alternating lanes from a single source. For example <0,2,4,6>
  6828. // or <1,3,5,7>. Check the first two items are valid enough and extract some
  6829. // info from them (they are checked properly in the loop below).
  6830. if (BV.getOperand(0).getOpcode() != ISD::FP_EXTEND ||
  6831. BV.getOperand(0).getOperand(0).getOpcode() != ISD::EXTRACT_VECTOR_ELT)
  6832. return SDValue();
  6833. SDValue Op0 = BV.getOperand(0).getOperand(0).getOperand(0);
  6834. int Offset = BV.getOperand(0).getOperand(0).getConstantOperandVal(1);
  6835. if (Op0.getValueType() != MVT::v8f16 || (Offset != 0 && Offset != 1))
  6836. return SDValue();
  6837. // Check all the values in the BuildVector line up with our expectations.
  6838. for (unsigned i = 1; i < 4; i++) {
  6839. auto Check = [](SDValue Trunc, SDValue Op, unsigned Idx) {
  6840. return Trunc.getOpcode() == ISD::FP_EXTEND &&
  6841. Trunc.getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
  6842. Trunc.getOperand(0).getOperand(0) == Op &&
  6843. Trunc.getOperand(0).getConstantOperandVal(1) == Idx;
  6844. };
  6845. if (!Check(BV.getOperand(i), Op0, 2 * i + Offset))
  6846. return SDValue();
  6847. }
  6848. return DAG.getNode(ARMISD::VCVTL, dl, VT, Op0,
  6849. DAG.getConstant(Offset, dl, MVT::i32));
  6850. }
  6851. // If N is an integer constant that can be moved into a register in one
  6852. // instruction, return an SDValue of such a constant (will become a MOV
  6853. // instruction). Otherwise return null.
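// On Thumb1 that means a value (or its bitwise complement) that fits in 8
// bits; otherwise any ARM modified immediate (so_imm) or the complement of one.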
  6854. static SDValue IsSingleInstrConstant(SDValue N, SelectionDAG &DAG,
  6855. const ARMSubtarget *ST, const SDLoc &dl) {
  6856. uint64_t Val;
  6857. if (!isa<ConstantSDNode>(N))
  6858. return SDValue();
  6859. Val = cast<ConstantSDNode>(N)->getZExtValue();
  6860. if (ST->isThumb1Only()) {
  6861. if (Val <= 255 || ~Val <= 255)
  6862. return DAG.getConstant(Val, dl, MVT::i32);
  6863. } else {
  6864. if (ARM_AM::getSOImmVal(Val) != -1 || ARM_AM::getSOImmVal(~Val) != -1)
  6865. return DAG.getConstant(Val, dl, MVT::i32);
  6866. }
  6867. return SDValue();
  6868. }
  6869. static SDValue LowerBUILD_VECTOR_i1(SDValue Op, SelectionDAG &DAG,
  6870. const ARMSubtarget *ST) {
  6871. SDLoc dl(Op);
  6872. EVT VT = Op.getValueType();
  6873. assert(ST->hasMVEIntegerOps() && "LowerBUILD_VECTOR_i1 called without MVE!");
  6874. unsigned NumElts = VT.getVectorNumElements();
  6875. unsigned BoolMask;
  6876. unsigned BitsPerBool;
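// An MVE predicate covers 16 bits in total, so each boolean lane is
// replicated across 16 / NumElts bits of the predicate register.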
  6877. if (NumElts == 2) {
  6878. BitsPerBool = 8;
  6879. BoolMask = 0xff;
  6880. } else if (NumElts == 4) {
  6881. BitsPerBool = 4;
  6882. BoolMask = 0xf;
  6883. } else if (NumElts == 8) {
  6884. BitsPerBool = 2;
  6885. BoolMask = 0x3;
  6886. } else if (NumElts == 16) {
  6887. BitsPerBool = 1;
  6888. BoolMask = 0x1;
  6889. } else
  6890. return SDValue();
  6891. // If this is a single value copied into all lanes (a splat), we can just sign
  6892. // extend that single value
  6893. SDValue FirstOp = Op.getOperand(0);
  6894. if (!isa<ConstantSDNode>(FirstOp) &&
  6895. llvm::all_of(llvm::drop_begin(Op->ops()), [&FirstOp](const SDUse &U) {
  6896. return U.get().isUndef() || U.get() == FirstOp;
  6897. })) {
  6898. SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::i32, FirstOp,
  6899. DAG.getValueType(MVT::i1));
  6900. return DAG.getNode(ARMISD::PREDICATE_CAST, dl, Op.getValueType(), Ext);
  6901. }
  6902. // First create base with bits set where known
  6903. unsigned Bits32 = 0;
  6904. for (unsigned i = 0; i < NumElts; ++i) {
  6905. SDValue V = Op.getOperand(i);
  6906. if (!isa<ConstantSDNode>(V) && !V.isUndef())
  6907. continue;
  6908. bool BitSet = V.isUndef() ? false : cast<ConstantSDNode>(V)->getZExtValue();
  6909. if (BitSet)
  6910. Bits32 |= BoolMask << (i * BitsPerBool);
  6911. }
  6912. // Add in unknown nodes
  6913. SDValue Base = DAG.getNode(ARMISD::PREDICATE_CAST, dl, VT,
  6914. DAG.getConstant(Bits32, dl, MVT::i32));
  6915. for (unsigned i = 0; i < NumElts; ++i) {
  6916. SDValue V = Op.getOperand(i);
  6917. if (isa<ConstantSDNode>(V) || V.isUndef())
  6918. continue;
  6919. Base = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Base, V,
  6920. DAG.getConstant(i, dl, MVT::i32));
  6921. }
  6922. return Base;
  6923. }
  6924. static SDValue LowerBUILD_VECTORToVIDUP(SDValue Op, SelectionDAG &DAG,
  6925. const ARMSubtarget *ST) {
  6926. if (!ST->hasMVEIntegerOps())
  6927. return SDValue();
  6928. // We are looking for a buildvector where each element is Op[0] + i*N
  6929. EVT VT = Op.getValueType();
  6930. SDValue Op0 = Op.getOperand(0);
  6931. unsigned NumElts = VT.getVectorNumElements();
  6932. // Get the increment value from operand 1
  6933. SDValue Op1 = Op.getOperand(1);
  6934. if (Op1.getOpcode() != ISD::ADD || Op1.getOperand(0) != Op0 ||
  6935. !isa<ConstantSDNode>(Op1.getOperand(1)))
  6936. return SDValue();
  6937. unsigned N = Op1.getConstantOperandVal(1);
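// VIDUP only supports an increment of 1, 2, 4 or 8.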
  6938. if (N != 1 && N != 2 && N != 4 && N != 8)
  6939. return SDValue();
  6940. // Check that each other operand matches
  6941. for (unsigned I = 2; I < NumElts; I++) {
  6942. SDValue OpI = Op.getOperand(I);
  6943. if (OpI.getOpcode() != ISD::ADD || OpI.getOperand(0) != Op0 ||
  6944. !isa<ConstantSDNode>(OpI.getOperand(1)) ||
  6945. OpI.getConstantOperandVal(1) != I * N)
  6946. return SDValue();
  6947. }
  6948. SDLoc DL(Op);
  6949. return DAG.getNode(ARMISD::VIDUP, DL, DAG.getVTList(VT, MVT::i32), Op0,
  6950. DAG.getConstant(N, DL, MVT::i32));
  6951. }
6952. // Returns true if the operation N can be treated as a qr instruction variant
6953. // at operand Op.
  6954. static bool IsQRMVEInstruction(const SDNode *N, const SDNode *Op) {
  6955. switch (N->getOpcode()) {
  6956. case ISD::ADD:
  6957. case ISD::MUL:
  6958. case ISD::SADDSAT:
  6959. case ISD::UADDSAT:
  6960. return true;
  6961. case ISD::SUB:
  6962. case ISD::SSUBSAT:
  6963. case ISD::USUBSAT:
  6964. return N->getOperand(1).getNode() == Op;
  6965. case ISD::INTRINSIC_WO_CHAIN:
  6966. switch (N->getConstantOperandVal(0)) {
  6967. case Intrinsic::arm_mve_add_predicated:
  6968. case Intrinsic::arm_mve_mul_predicated:
  6969. case Intrinsic::arm_mve_qadd_predicated:
  6970. case Intrinsic::arm_mve_vhadd:
  6971. case Intrinsic::arm_mve_hadd_predicated:
  6972. case Intrinsic::arm_mve_vqdmulh:
  6973. case Intrinsic::arm_mve_qdmulh_predicated:
  6974. case Intrinsic::arm_mve_vqrdmulh:
  6975. case Intrinsic::arm_mve_qrdmulh_predicated:
  6976. case Intrinsic::arm_mve_vqdmull:
  6977. case Intrinsic::arm_mve_vqdmull_predicated:
  6978. return true;
  6979. case Intrinsic::arm_mve_sub_predicated:
  6980. case Intrinsic::arm_mve_qsub_predicated:
  6981. case Intrinsic::arm_mve_vhsub:
  6982. case Intrinsic::arm_mve_hsub_predicated:
  6983. return N->getOperand(2).getNode() == Op;
  6984. default:
  6985. return false;
  6986. }
  6987. default:
  6988. return false;
  6989. }
  6990. }
  6991. // If this is a case we can't handle, return null and let the default
  6992. // expansion code take care of it.
  6993. SDValue ARMTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG,
  6994. const ARMSubtarget *ST) const {
  6995. BuildVectorSDNode *BVN = cast<BuildVectorSDNode>(Op.getNode());
  6996. SDLoc dl(Op);
  6997. EVT VT = Op.getValueType();
  6998. if (ST->hasMVEIntegerOps() && VT.getScalarSizeInBits() == 1)
  6999. return LowerBUILD_VECTOR_i1(Op, DAG, ST);
  7000. if (SDValue R = LowerBUILD_VECTORToVIDUP(Op, DAG, ST))
  7001. return R;
  7002. APInt SplatBits, SplatUndef;
  7003. unsigned SplatBitSize;
  7004. bool HasAnyUndefs;
  7005. if (BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) {
  7006. if (SplatUndef.isAllOnes())
  7007. return DAG.getUNDEF(VT);
  7008. // If all the users of this constant splat are qr instruction variants,
  7009. // generate a vdup of the constant.
  7010. if (ST->hasMVEIntegerOps() && VT.getScalarSizeInBits() == SplatBitSize &&
  7011. (SplatBitSize == 8 || SplatBitSize == 16 || SplatBitSize == 32) &&
  7012. all_of(BVN->uses(),
  7013. [BVN](const SDNode *U) { return IsQRMVEInstruction(U, BVN); })) {
  7014. EVT DupVT = SplatBitSize == 32 ? MVT::v4i32
  7015. : SplatBitSize == 16 ? MVT::v8i16
  7016. : MVT::v16i8;
  7017. SDValue Const = DAG.getConstant(SplatBits.getZExtValue(), dl, MVT::i32);
  7018. SDValue VDup = DAG.getNode(ARMISD::VDUP, dl, DupVT, Const);
  7019. return DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, VT, VDup);
  7020. }
  7021. if ((ST->hasNEON() && SplatBitSize <= 64) ||
  7022. (ST->hasMVEIntegerOps() && SplatBitSize <= 64)) {
  7023. // Check if an immediate VMOV works.
  7024. EVT VmovVT;
  7025. SDValue Val =
  7026. isVMOVModifiedImm(SplatBits.getZExtValue(), SplatUndef.getZExtValue(),
  7027. SplatBitSize, DAG, dl, VmovVT, VT, VMOVModImm);
  7028. if (Val.getNode()) {
  7029. SDValue Vmov = DAG.getNode(ARMISD::VMOVIMM, dl, VmovVT, Val);
  7030. return DAG.getNode(ISD::BITCAST, dl, VT, Vmov);
  7031. }
  7032. // Try an immediate VMVN.
  7033. uint64_t NegatedImm = (~SplatBits).getZExtValue();
  7034. Val = isVMOVModifiedImm(
  7035. NegatedImm, SplatUndef.getZExtValue(), SplatBitSize, DAG, dl, VmovVT,
  7036. VT, ST->hasMVEIntegerOps() ? MVEVMVNModImm : VMVNModImm);
  7037. if (Val.getNode()) {
  7038. SDValue Vmov = DAG.getNode(ARMISD::VMVNIMM, dl, VmovVT, Val);
  7039. return DAG.getNode(ISD::BITCAST, dl, VT, Vmov);
  7040. }
  7041. // Use vmov.f32 to materialize other v2f32 and v4f32 splats.
  7042. if ((VT == MVT::v2f32 || VT == MVT::v4f32) && SplatBitSize == 32) {
  7043. int ImmVal = ARM_AM::getFP32Imm(SplatBits);
  7044. if (ImmVal != -1) {
  7045. SDValue Val = DAG.getTargetConstant(ImmVal, dl, MVT::i32);
  7046. return DAG.getNode(ARMISD::VMOVFPIMM, dl, VT, Val);
  7047. }
  7048. }
  7049. // If we are under MVE, generate a VDUP(constant), bitcast to the original
  7050. // type.
  7051. if (ST->hasMVEIntegerOps() &&
  7052. (SplatBitSize == 8 || SplatBitSize == 16 || SplatBitSize == 32)) {
  7053. EVT DupVT = SplatBitSize == 32 ? MVT::v4i32
  7054. : SplatBitSize == 16 ? MVT::v8i16
  7055. : MVT::v16i8;
  7056. SDValue Const = DAG.getConstant(SplatBits.getZExtValue(), dl, MVT::i32);
  7057. SDValue VDup = DAG.getNode(ARMISD::VDUP, dl, DupVT, Const);
  7058. return DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, VT, VDup);
  7059. }
  7060. }
  7061. }
  7062. // Scan through the operands to see if only one value is used.
  7063. //
  7064. // As an optimisation, even if more than one value is used it may be more
  7065. // profitable to splat with one value then change some lanes.
  7066. //
  7067. // Heuristically we decide to do this if the vector has a "dominant" value,
  7068. // defined as splatted to more than half of the lanes.
  7069. unsigned NumElts = VT.getVectorNumElements();
  7070. bool isOnlyLowElement = true;
  7071. bool usesOnlyOneValue = true;
  7072. bool hasDominantValue = false;
  7073. bool isConstant = true;
  7074. // Map of the number of times a particular SDValue appears in the
  7075. // element list.
  7076. DenseMap<SDValue, unsigned> ValueCounts;
  7077. SDValue Value;
  7078. for (unsigned i = 0; i < NumElts; ++i) {
  7079. SDValue V = Op.getOperand(i);
  7080. if (V.isUndef())
  7081. continue;
  7082. if (i > 0)
  7083. isOnlyLowElement = false;
  7084. if (!isa<ConstantFPSDNode>(V) && !isa<ConstantSDNode>(V))
  7085. isConstant = false;
  7086. ValueCounts.insert(std::make_pair(V, 0));
  7087. unsigned &Count = ValueCounts[V];
  7088. // Is this value dominant? (takes up more than half of the lanes)
  7089. if (++Count > (NumElts / 2)) {
  7090. hasDominantValue = true;
  7091. Value = V;
  7092. }
  7093. }
  7094. if (ValueCounts.size() != 1)
  7095. usesOnlyOneValue = false;
  7096. if (!Value.getNode() && !ValueCounts.empty())
  7097. Value = ValueCounts.begin()->first;
  7098. if (ValueCounts.empty())
  7099. return DAG.getUNDEF(VT);
  7100. // Loads are better lowered with insert_vector_elt/ARMISD::BUILD_VECTOR.
  7101. // Keep going if we are hitting this case.
  7102. if (isOnlyLowElement && !ISD::isNormalLoad(Value.getNode()))
  7103. return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Value);
  7104. unsigned EltSize = VT.getScalarSizeInBits();
  7105. // Use VDUP for non-constant splats. For f32 constant splats, reduce to
  7106. // i32 and try again.
  7107. if (hasDominantValue && EltSize <= 32) {
  7108. if (!isConstant) {
  7109. SDValue N;
  7110. // If we are VDUPing a value that comes directly from a vector, that will
  7111. // cause an unnecessary move to and from a GPR, where instead we could
  7112. // just use VDUPLANE. We can only do this if the lane being extracted
  7113. // is at a constant index, as the VDUP from lane instructions only have
  7114. // constant-index forms.
  7115. ConstantSDNode *constIndex;
  7116. if (Value->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
  7117. (constIndex = dyn_cast<ConstantSDNode>(Value->getOperand(1)))) {
  7118. // We need to create a new undef vector to use for the VDUPLANE if the
  7119. // size of the vector from which we get the value is different than the
  7120. // size of the vector that we need to create. We will insert the element
  7121. // such that the register coalescer will remove unnecessary copies.
  7122. if (VT != Value->getOperand(0).getValueType()) {
  7123. unsigned index = constIndex->getAPIntValue().getLimitedValue() %
  7124. VT.getVectorNumElements();
  7125. N = DAG.getNode(ARMISD::VDUPLANE, dl, VT,
  7126. DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, DAG.getUNDEF(VT),
  7127. Value, DAG.getConstant(index, dl, MVT::i32)),
  7128. DAG.getConstant(index, dl, MVT::i32));
  7129. } else
  7130. N = DAG.getNode(ARMISD::VDUPLANE, dl, VT,
  7131. Value->getOperand(0), Value->getOperand(1));
  7132. } else
  7133. N = DAG.getNode(ARMISD::VDUP, dl, VT, Value);
  7134. if (!usesOnlyOneValue) {
  7135. // The dominant value was splatted as 'N', but we now have to insert
  7136. // all differing elements.
  7137. for (unsigned I = 0; I < NumElts; ++I) {
  7138. if (Op.getOperand(I) == Value)
  7139. continue;
  7140. SmallVector<SDValue, 3> Ops;
  7141. Ops.push_back(N);
  7142. Ops.push_back(Op.getOperand(I));
  7143. Ops.push_back(DAG.getConstant(I, dl, MVT::i32));
  7144. N = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Ops);
  7145. }
  7146. }
  7147. return N;
  7148. }
  7149. if (VT.getVectorElementType().isFloatingPoint()) {
  7150. SmallVector<SDValue, 8> Ops;
  7151. MVT FVT = VT.getVectorElementType().getSimpleVT();
  7152. assert(FVT == MVT::f32 || FVT == MVT::f16);
  7153. MVT IVT = (FVT == MVT::f32) ? MVT::i32 : MVT::i16;
  7154. for (unsigned i = 0; i < NumElts; ++i)
  7155. Ops.push_back(DAG.getNode(ISD::BITCAST, dl, IVT,
  7156. Op.getOperand(i)));
  7157. EVT VecVT = EVT::getVectorVT(*DAG.getContext(), IVT, NumElts);
  7158. SDValue Val = DAG.getBuildVector(VecVT, dl, Ops);
  7159. Val = LowerBUILD_VECTOR(Val, DAG, ST);
  7160. if (Val.getNode())
  7161. return DAG.getNode(ISD::BITCAST, dl, VT, Val);
  7162. }
  7163. if (usesOnlyOneValue) {
  7164. SDValue Val = IsSingleInstrConstant(Value, DAG, ST, dl);
  7165. if (isConstant && Val.getNode())
  7166. return DAG.getNode(ARMISD::VDUP, dl, VT, Val);
  7167. }
  7168. }
  7169. // If all elements are constants and the case above didn't get hit, fall back
  7170. // to the default expansion, which will generate a load from the constant
  7171. // pool.
  7172. if (isConstant)
  7173. return SDValue();
  7174. // Reconstruct the BUILDVECTOR to one of the legal shuffles (such as vext and
  7175. // vmovn). Empirical tests suggest this is rarely worth it for vectors of
  7176. // length <= 2.
  7177. if (NumElts >= 4)
  7178. if (SDValue shuffle = ReconstructShuffle(Op, DAG))
  7179. return shuffle;
  7180. // Attempt to turn a buildvector of scalar fptrunc's or fpext's back into
  7181. // VCVT's
  7182. if (SDValue VCVT = LowerBuildVectorOfFPTrunc(Op, DAG, Subtarget))
  7183. return VCVT;
  7184. if (SDValue VCVT = LowerBuildVectorOfFPExt(Op, DAG, Subtarget))
  7185. return VCVT;
  7186. if (ST->hasNEON() && VT.is128BitVector() && VT != MVT::v2f64 && VT != MVT::v4f32) {
  7187. // If we haven't found an efficient lowering, try splitting a 128-bit vector
  7188. // into two 64-bit vectors; we might discover a better way to lower it.
  7189. SmallVector<SDValue, 64> Ops(Op->op_begin(), Op->op_begin() + NumElts);
  7190. EVT ExtVT = VT.getVectorElementType();
  7191. EVT HVT = EVT::getVectorVT(*DAG.getContext(), ExtVT, NumElts / 2);
  7192. SDValue Lower = DAG.getBuildVector(HVT, dl, ArrayRef(&Ops[0], NumElts / 2));
  7193. if (Lower.getOpcode() == ISD::BUILD_VECTOR)
  7194. Lower = LowerBUILD_VECTOR(Lower, DAG, ST);
  7195. SDValue Upper =
  7196. DAG.getBuildVector(HVT, dl, ArrayRef(&Ops[NumElts / 2], NumElts / 2));
  7197. if (Upper.getOpcode() == ISD::BUILD_VECTOR)
  7198. Upper = LowerBUILD_VECTOR(Upper, DAG, ST);
  7199. if (Lower && Upper)
  7200. return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lower, Upper);
  7201. }
  7202. // Vectors with 32- or 64-bit elements can be built by directly assigning
  7203. // the subregisters. Lower it to an ARMISD::BUILD_VECTOR so the operands
  7204. // will be legalized.
  7205. if (EltSize >= 32) {
  7206. // Do the expansion with floating-point types, since that is what the VFP
  7207. // registers are defined to use, and since i64 is not legal.
  7208. EVT EltVT = EVT::getFloatingPointVT(EltSize);
  7209. EVT VecVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumElts);
  7210. SmallVector<SDValue, 8> Ops;
  7211. for (unsigned i = 0; i < NumElts; ++i)
  7212. Ops.push_back(DAG.getNode(ISD::BITCAST, dl, EltVT, Op.getOperand(i)));
  7213. SDValue Val = DAG.getNode(ARMISD::BUILD_VECTOR, dl, VecVT, Ops);
  7214. return DAG.getNode(ISD::BITCAST, dl, VT, Val);
  7215. }
  7216. // If all else fails, just use a sequence of INSERT_VECTOR_ELT when we
  7217. // know the default expansion would otherwise fall back on something even
  7218. // worse. For a vector with one or two non-undef values, that's
  7219. // scalar_to_vector for the elements followed by a shuffle (provided the
  7220. // shuffle is valid for the target) and materialization element by element
  7221. // on the stack followed by a load for everything else.
  7222. if (!isConstant && !usesOnlyOneValue) {
  7223. SDValue Vec = DAG.getUNDEF(VT);
  7224. for (unsigned i = 0 ; i < NumElts; ++i) {
  7225. SDValue V = Op.getOperand(i);
  7226. if (V.isUndef())
  7227. continue;
  7228. SDValue LaneIdx = DAG.getConstant(i, dl, MVT::i32);
  7229. Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Vec, V, LaneIdx);
  7230. }
  7231. return Vec;
  7232. }
  7233. return SDValue();
  7234. }
  7235. // Gather data to see if the operation can be modelled as a
  7236. // shuffle in combination with VEXTs.
  7237. SDValue ARMTargetLowering::ReconstructShuffle(SDValue Op,
  7238. SelectionDAG &DAG) const {
  7239. assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!");
  7240. SDLoc dl(Op);
  7241. EVT VT = Op.getValueType();
  7242. unsigned NumElts = VT.getVectorNumElements();
  7243. struct ShuffleSourceInfo {
  7244. SDValue Vec;
  7245. unsigned MinElt = std::numeric_limits<unsigned>::max();
  7246. unsigned MaxElt = 0;
  7247. // We may insert some combination of BITCASTs and VEXT nodes to force Vec to
  7248. // be compatible with the shuffle we intend to construct. As a result
  7249. // ShuffleVec will be some sliding window into the original Vec.
  7250. SDValue ShuffleVec;
  7251. // Code should guarantee that element i in Vec starts at element "WindowBase
  7252. // + i * WindowScale in ShuffleVec".
  7253. int WindowBase = 0;
  7254. int WindowScale = 1;
  7255. ShuffleSourceInfo(SDValue Vec) : Vec(Vec), ShuffleVec(Vec) {}
  7256. bool operator ==(SDValue OtherVec) { return Vec == OtherVec; }
  7257. };
  7258. // First gather all vectors used as an immediate source for this BUILD_VECTOR
  7259. // node.
  7260. SmallVector<ShuffleSourceInfo, 2> Sources;
  7261. for (unsigned i = 0; i < NumElts; ++i) {
  7262. SDValue V = Op.getOperand(i);
  7263. if (V.isUndef())
  7264. continue;
  7265. else if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT) {
  7266. // A shuffle can only come from building a vector from various
  7267. // elements of other vectors.
  7268. return SDValue();
  7269. } else if (!isa<ConstantSDNode>(V.getOperand(1))) {
  7270. // Furthermore, shuffles require a constant mask, whereas extractelts
  7271. // accept variable indices.
  7272. return SDValue();
  7273. }
  7274. // Add this element source to the list if it's not already there.
  7275. SDValue SourceVec = V.getOperand(0);
  7276. auto Source = llvm::find(Sources, SourceVec);
  7277. if (Source == Sources.end())
  7278. Source = Sources.insert(Sources.end(), ShuffleSourceInfo(SourceVec));
  7279. // Update the minimum and maximum lane number seen.
  7280. unsigned EltNo = cast<ConstantSDNode>(V.getOperand(1))->getZExtValue();
  7281. Source->MinElt = std::min(Source->MinElt, EltNo);
  7282. Source->MaxElt = std::max(Source->MaxElt, EltNo);
  7283. }
  7284. // Currently only do something sane when at most two source vectors
  7285. // are involved.
  7286. if (Sources.size() > 2)
  7287. return SDValue();
  7288. // Find out the smallest element size among result and two sources, and use
  7289. // it as element size to build the shuffle_vector.
  7290. EVT SmallestEltTy = VT.getVectorElementType();
  7291. for (auto &Source : Sources) {
  7292. EVT SrcEltTy = Source.Vec.getValueType().getVectorElementType();
  7293. if (SrcEltTy.bitsLT(SmallestEltTy))
  7294. SmallestEltTy = SrcEltTy;
  7295. }
  7296. unsigned ResMultiplier =
  7297. VT.getScalarSizeInBits() / SmallestEltTy.getSizeInBits();
  7298. NumElts = VT.getSizeInBits() / SmallestEltTy.getSizeInBits();
  7299. EVT ShuffleVT = EVT::getVectorVT(*DAG.getContext(), SmallestEltTy, NumElts);
  7300. // If the source vector is too wide or too narrow, we may nevertheless be able
  7301. // to construct a compatible shuffle either by concatenating it with UNDEF or
  7302. // extracting a suitable range of elements.
  7303. for (auto &Src : Sources) {
  7304. EVT SrcVT = Src.ShuffleVec.getValueType();
  7305. uint64_t SrcVTSize = SrcVT.getFixedSizeInBits();
  7306. uint64_t VTSize = VT.getFixedSizeInBits();
  7307. if (SrcVTSize == VTSize)
  7308. continue;
  7309. // This stage of the search produces a source with the same element type as
  7310. // the original, but with a total width matching the BUILD_VECTOR output.
  7311. EVT EltVT = SrcVT.getVectorElementType();
  7312. unsigned NumSrcElts = VTSize / EltVT.getFixedSizeInBits();
  7313. EVT DestVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumSrcElts);
  7314. if (SrcVTSize < VTSize) {
  7315. if (2 * SrcVTSize != VTSize)
  7316. return SDValue();
  7317. // We can pad out the smaller vector for free, so if it's part of a
  7318. // shuffle...
  7319. Src.ShuffleVec =
  7320. DAG.getNode(ISD::CONCAT_VECTORS, dl, DestVT, Src.ShuffleVec,
  7321. DAG.getUNDEF(Src.ShuffleVec.getValueType()));
  7322. continue;
  7323. }
  7324. if (SrcVTSize != 2 * VTSize)
  7325. return SDValue();
  7326. if (Src.MaxElt - Src.MinElt >= NumSrcElts) {
  7327. // Span too large for a VEXT to cope
  7328. return SDValue();
  7329. }
  7330. if (Src.MinElt >= NumSrcElts) {
  7331. // The extraction can just take the second half
  7332. Src.ShuffleVec =
  7333. DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
  7334. DAG.getConstant(NumSrcElts, dl, MVT::i32));
  7335. Src.WindowBase = -NumSrcElts;
  7336. } else if (Src.MaxElt < NumSrcElts) {
  7337. // The extraction can just take the first half
  7338. Src.ShuffleVec =
  7339. DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
  7340. DAG.getConstant(0, dl, MVT::i32));
  7341. } else {
  7342. // An actual VEXT is needed
  7343. SDValue VEXTSrc1 =
  7344. DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
  7345. DAG.getConstant(0, dl, MVT::i32));
  7346. SDValue VEXTSrc2 =
  7347. DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
  7348. DAG.getConstant(NumSrcElts, dl, MVT::i32));
  7349. Src.ShuffleVec = DAG.getNode(ARMISD::VEXT, dl, DestVT, VEXTSrc1,
  7350. VEXTSrc2,
  7351. DAG.getConstant(Src.MinElt, dl, MVT::i32));
  7352. Src.WindowBase = -Src.MinElt;
  7353. }
  7354. }
  7355. // Another possible incompatibility occurs from the vector element types. We
  7356. // can fix this by bitcasting the source vectors to the same type we intend
  7357. // for the shuffle.
  7358. for (auto &Src : Sources) {
  7359. EVT SrcEltTy = Src.ShuffleVec.getValueType().getVectorElementType();
  7360. if (SrcEltTy == SmallestEltTy)
  7361. continue;
  7362. assert(ShuffleVT.getVectorElementType() == SmallestEltTy);
  7363. Src.ShuffleVec = DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, ShuffleVT, Src.ShuffleVec);
  7364. Src.WindowScale = SrcEltTy.getSizeInBits() / SmallestEltTy.getSizeInBits();
  7365. Src.WindowBase *= Src.WindowScale;
  7366. }
  7367. // Final check before we try to actually produce a shuffle.
  7368. LLVM_DEBUG(for (auto Src
  7369. : Sources)
  7370. assert(Src.ShuffleVec.getValueType() == ShuffleVT););
  7371. // The stars all align, our next step is to produce the mask for the shuffle.
  7372. SmallVector<int, 8> Mask(ShuffleVT.getVectorNumElements(), -1);
  7373. int BitsPerShuffleLane = ShuffleVT.getScalarSizeInBits();
  7374. for (unsigned i = 0; i < VT.getVectorNumElements(); ++i) {
  7375. SDValue Entry = Op.getOperand(i);
  7376. if (Entry.isUndef())
  7377. continue;
  7378. auto Src = llvm::find(Sources, Entry.getOperand(0));
  7379. int EltNo = cast<ConstantSDNode>(Entry.getOperand(1))->getSExtValue();
  7380. // EXTRACT_VECTOR_ELT performs an implicit any_ext; BUILD_VECTOR an implicit
  7381. // trunc. So only std::min(SrcBits, DestBits) actually get defined in this
  7382. // segment.
  7383. EVT OrigEltTy = Entry.getOperand(0).getValueType().getVectorElementType();
  7384. int BitsDefined = std::min(OrigEltTy.getScalarSizeInBits(),
  7385. VT.getScalarSizeInBits());
  7386. int LanesDefined = BitsDefined / BitsPerShuffleLane;
  7387. // This source is expected to fill ResMultiplier lanes of the final shuffle,
  7388. // starting at the appropriate offset.
  7389. int *LaneMask = &Mask[i * ResMultiplier];
  7390. int ExtractBase = EltNo * Src->WindowScale + Src->WindowBase;
  7391. ExtractBase += NumElts * (Src - Sources.begin());
  7392. for (int j = 0; j < LanesDefined; ++j)
  7393. LaneMask[j] = ExtractBase + j;
  7394. }
  7395. // We can't handle more than two sources. This should have already
  7396. // been checked before this point.
  7397. assert(Sources.size() <= 2 && "Too many sources!");
  7398. SDValue ShuffleOps[] = { DAG.getUNDEF(ShuffleVT), DAG.getUNDEF(ShuffleVT) };
  7399. for (unsigned i = 0; i < Sources.size(); ++i)
  7400. ShuffleOps[i] = Sources[i].ShuffleVec;
  7401. SDValue Shuffle = buildLegalVectorShuffle(ShuffleVT, dl, ShuffleOps[0],
  7402. ShuffleOps[1], Mask, DAG);
  7403. if (!Shuffle)
  7404. return SDValue();
  7405. return DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, VT, Shuffle);
  7406. }
  7407. enum ShuffleOpCodes {
  7408. OP_COPY = 0, // Copy, used for things like <u,u,u,3> to say it is <0,1,2,3>
  7409. OP_VREV,
  7410. OP_VDUP0,
  7411. OP_VDUP1,
  7412. OP_VDUP2,
  7413. OP_VDUP3,
  7414. OP_VEXT1,
  7415. OP_VEXT2,
  7416. OP_VEXT3,
  7417. OP_VUZPL, // VUZP, left result
  7418. OP_VUZPR, // VUZP, right result
  7419. OP_VZIPL, // VZIP, left result
  7420. OP_VZIPR, // VZIP, right result
  7421. OP_VTRNL, // VTRN, left result
  7422. OP_VTRNR // VTRN, right result
  7423. };
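// Returns true if the given perfect-shuffle entry uses only operations that
// can also be selected under MVE (copies, reversals and lane duplications).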
  7424. static bool isLegalMVEShuffleOp(unsigned PFEntry) {
  7425. unsigned OpNum = (PFEntry >> 26) & 0x0F;
  7426. switch (OpNum) {
  7427. case OP_COPY:
  7428. case OP_VREV:
  7429. case OP_VDUP0:
  7430. case OP_VDUP1:
  7431. case OP_VDUP2:
  7432. case OP_VDUP3:
  7433. return true;
  7434. }
  7435. return false;
  7436. }
  7437. /// isShuffleMaskLegal - Targets can use this to indicate that they only
  7438. /// support *some* VECTOR_SHUFFLE operations, those with specific masks.
  7439. /// By default, if a target supports the VECTOR_SHUFFLE node, all mask values
  7440. /// are assumed to be legal.
  7441. bool ARMTargetLowering::isShuffleMaskLegal(ArrayRef<int> M, EVT VT) const {
  7442. if (VT.getVectorNumElements() == 4 &&
  7443. (VT.is128BitVector() || VT.is64BitVector())) {
  7444. unsigned PFIndexes[4];
  7445. for (unsigned i = 0; i != 4; ++i) {
  7446. if (M[i] < 0)
  7447. PFIndexes[i] = 8;
  7448. else
  7449. PFIndexes[i] = M[i];
  7450. }
  7451. // Compute the index in the perfect shuffle table.
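// For example, the mask <0,2,1,3> (with undef lanes encoded as 8) maps to
// index 0*729 + 2*81 + 1*9 + 3 = 174.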
  7452. unsigned PFTableIndex =
  7453. PFIndexes[0]*9*9*9+PFIndexes[1]*9*9+PFIndexes[2]*9+PFIndexes[3];
  7454. unsigned PFEntry = PerfectShuffleTable[PFTableIndex];
  7455. unsigned Cost = (PFEntry >> 30);
  7456. if (Cost <= 4 && (Subtarget->hasNEON() || isLegalMVEShuffleOp(PFEntry)))
  7457. return true;
  7458. }
  7459. bool ReverseVEXT, isV_UNDEF;
  7460. unsigned Imm, WhichResult;
  7461. unsigned EltSize = VT.getScalarSizeInBits();
  7462. if (EltSize >= 32 ||
  7463. ShuffleVectorSDNode::isSplatMask(&M[0], VT) ||
  7464. ShuffleVectorInst::isIdentityMask(M) ||
  7465. isVREVMask(M, VT, 64) ||
  7466. isVREVMask(M, VT, 32) ||
  7467. isVREVMask(M, VT, 16))
  7468. return true;
  7469. else if (Subtarget->hasNEON() &&
  7470. (isVEXTMask(M, VT, ReverseVEXT, Imm) ||
  7471. isVTBLMask(M, VT) ||
  7472. isNEONTwoResultShuffleMask(M, VT, WhichResult, isV_UNDEF)))
  7473. return true;
  7474. else if ((VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v16i8) &&
  7475. isReverseMask(M, VT))
  7476. return true;
  7477. else if (Subtarget->hasMVEIntegerOps() &&
  7478. (isVMOVNMask(M, VT, true, false) ||
  7479. isVMOVNMask(M, VT, false, false) || isVMOVNMask(M, VT, true, true)))
  7480. return true;
  7481. else if (Subtarget->hasMVEIntegerOps() &&
  7482. (isTruncMask(M, VT, false, false) ||
  7483. isTruncMask(M, VT, false, true) ||
  7484. isTruncMask(M, VT, true, false) || isTruncMask(M, VT, true, true)))
  7485. return true;
  7486. else
  7487. return false;
  7488. }
  7489. /// GeneratePerfectShuffle - Given an entry in the perfect-shuffle table, emit
  7490. /// the specified operations to build the shuffle.
  7491. static SDValue GeneratePerfectShuffle(unsigned PFEntry, SDValue LHS,
  7492. SDValue RHS, SelectionDAG &DAG,
  7493. const SDLoc &dl) {
  7494. unsigned OpNum = (PFEntry >> 26) & 0x0F;
  7495. unsigned LHSID = (PFEntry >> 13) & ((1 << 13)-1);
  7496. unsigned RHSID = (PFEntry >> 0) & ((1 << 13)-1);
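// For OP_COPY the lane list is encoded in base 9 (digits 0-8, 8 meaning
// undef): (1*9+2)*9+3 spells <0,1,2,3>, i.e. a plain copy of LHS, and
// ((4*9+5)*9+6)*9+7 spells <4,5,6,7>, i.e. a copy of RHS.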
  7497. if (OpNum == OP_COPY) {
  7498. if (LHSID == (1*9+2)*9+3) return LHS;
  7499. assert(LHSID == ((4*9+5)*9+6)*9+7 && "Illegal OP_COPY!");
  7500. return RHS;
  7501. }
  7502. SDValue OpLHS, OpRHS;
  7503. OpLHS = GeneratePerfectShuffle(PerfectShuffleTable[LHSID], LHS, RHS, DAG, dl);
  7504. OpRHS = GeneratePerfectShuffle(PerfectShuffleTable[RHSID], LHS, RHS, DAG, dl);
  7505. EVT VT = OpLHS.getValueType();
  7506. switch (OpNum) {
  7507. default: llvm_unreachable("Unknown shuffle opcode!");
  7508. case OP_VREV:
  7509. // VREV divides the vector in half and swaps within the half.
  7510. if (VT.getScalarSizeInBits() == 32)
  7511. return DAG.getNode(ARMISD::VREV64, dl, VT, OpLHS);
  7512. // vrev <4 x i16> -> VREV32
  7513. if (VT.getScalarSizeInBits() == 16)
  7514. return DAG.getNode(ARMISD::VREV32, dl, VT, OpLHS);
  7515. // vrev <4 x i8> -> VREV16
  7516. assert(VT.getScalarSizeInBits() == 8);
  7517. return DAG.getNode(ARMISD::VREV16, dl, VT, OpLHS);
  7518. case OP_VDUP0:
  7519. case OP_VDUP1:
  7520. case OP_VDUP2:
  7521. case OP_VDUP3:
  7522. return DAG.getNode(ARMISD::VDUPLANE, dl, VT,
  7523. OpLHS, DAG.getConstant(OpNum-OP_VDUP0, dl, MVT::i32));
  7524. case OP_VEXT1:
  7525. case OP_VEXT2:
  7526. case OP_VEXT3:
  7527. return DAG.getNode(ARMISD::VEXT, dl, VT,
  7528. OpLHS, OpRHS,
  7529. DAG.getConstant(OpNum - OP_VEXT1 + 1, dl, MVT::i32));
  7530. case OP_VUZPL:
  7531. case OP_VUZPR:
  7532. return DAG.getNode(ARMISD::VUZP, dl, DAG.getVTList(VT, VT),
  7533. OpLHS, OpRHS).getValue(OpNum-OP_VUZPL);
  7534. case OP_VZIPL:
  7535. case OP_VZIPR:
  7536. return DAG.getNode(ARMISD::VZIP, dl, DAG.getVTList(VT, VT),
  7537. OpLHS, OpRHS).getValue(OpNum-OP_VZIPL);
  7538. case OP_VTRNL:
  7539. case OP_VTRNR:
  7540. return DAG.getNode(ARMISD::VTRN, dl, DAG.getVTList(VT, VT),
  7541. OpLHS, OpRHS).getValue(OpNum-OP_VTRNL);
  7542. }
  7543. }
  7544. static SDValue LowerVECTOR_SHUFFLEv8i8(SDValue Op,
  7545. ArrayRef<int> ShuffleMask,
  7546. SelectionDAG &DAG) {
  7547. // Check to see if we can use the VTBL instruction.
  7548. SDValue V1 = Op.getOperand(0);
  7549. SDValue V2 = Op.getOperand(1);
  7550. SDLoc DL(Op);
  7551. SmallVector<SDValue, 8> VTBLMask;
  7552. for (int I : ShuffleMask)
  7553. VTBLMask.push_back(DAG.getConstant(I, DL, MVT::i32));
  7554. if (V2.getNode()->isUndef())
  7555. return DAG.getNode(ARMISD::VTBL1, DL, MVT::v8i8, V1,
  7556. DAG.getBuildVector(MVT::v8i8, DL, VTBLMask));
  7557. return DAG.getNode(ARMISD::VTBL2, DL, MVT::v8i8, V1, V2,
  7558. DAG.getBuildVector(MVT::v8i8, DL, VTBLMask));
  7559. }
  7560. static SDValue LowerReverse_VECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) {
  7561. SDLoc DL(Op);
  7562. EVT VT = Op.getValueType();
  7563. assert((VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v16i8) &&
7564. "Expected a v8i16, v8f16 or v16i8 type");
  7565. SDValue OpLHS = DAG.getNode(ARMISD::VREV64, DL, VT, Op.getOperand(0));
  7566. // For a v16i8 type: After the VREV, we have got <7, ..., 0, 15, ..., 8>. Now,
  7567. // extract the first 8 bytes into the top double word and the last 8 bytes
  7568. // into the bottom double word, through a new vector shuffle that will be
  7569. // turned into a VEXT on Neon, or a couple of VMOVDs on MVE.
  7570. std::vector<int> NewMask;
  7571. for (unsigned i = 0; i < VT.getVectorNumElements() / 2; i++)
  7572. NewMask.push_back(VT.getVectorNumElements() / 2 + i);
  7573. for (unsigned i = 0; i < VT.getVectorNumElements() / 2; i++)
  7574. NewMask.push_back(i);
  7575. return DAG.getVectorShuffle(VT, DL, OpLHS, OpLHS, NewMask);
  7576. }
  7577. static EVT getVectorTyFromPredicateVector(EVT VT) {
  7578. switch (VT.getSimpleVT().SimpleTy) {
  7579. case MVT::v2i1:
  7580. return MVT::v2f64;
  7581. case MVT::v4i1:
  7582. return MVT::v4i32;
  7583. case MVT::v8i1:
  7584. return MVT::v8i16;
  7585. case MVT::v16i1:
  7586. return MVT::v16i8;
  7587. default:
  7588. llvm_unreachable("Unexpected vector predicate type");
  7589. }
  7590. }
  7591. static SDValue PromoteMVEPredVector(SDLoc dl, SDValue Pred, EVT VT,
  7592. SelectionDAG &DAG) {
  7593. // Converting from boolean predicates to integers involves creating a vector
  7594. // of all ones or all zeroes and selecting the lanes based upon the real
  7595. // predicate.
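// For example, a v4i1 predicate is first viewed as a v16i1 (in hardware each
// v4i1 lane corresponds to four predicate bits), used to select between the
// all-ones and all-zeroes v16i8 vectors built below, and the result is then
// bitcast to a v4i32 of 0/-1 lanes.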
  7596. SDValue AllOnes =
  7597. DAG.getTargetConstant(ARM_AM::createVMOVModImm(0xe, 0xff), dl, MVT::i32);
  7598. AllOnes = DAG.getNode(ARMISD::VMOVIMM, dl, MVT::v16i8, AllOnes);
  7599. SDValue AllZeroes =
  7600. DAG.getTargetConstant(ARM_AM::createVMOVModImm(0xe, 0x0), dl, MVT::i32);
  7601. AllZeroes = DAG.getNode(ARMISD::VMOVIMM, dl, MVT::v16i8, AllZeroes);
  7602. // Get full vector type from predicate type
  7603. EVT NewVT = getVectorTyFromPredicateVector(VT);
  7604. SDValue RecastV1;
7605. // If the real predicate is a v8i1 or v4i1 (not v16i1) then we need to recast
  7606. // this to a v16i1. This cannot be done with an ordinary bitcast because the
7607. // sizes are not the same. We have to use an MVE-specific PREDICATE_CAST node,
  7608. // since we know in hardware the sizes are really the same.
  7609. if (VT != MVT::v16i1)
  7610. RecastV1 = DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::v16i1, Pred);
  7611. else
  7612. RecastV1 = Pred;
  7613. // Select either all ones or zeroes depending upon the real predicate bits.
  7614. SDValue PredAsVector =
  7615. DAG.getNode(ISD::VSELECT, dl, MVT::v16i8, RecastV1, AllOnes, AllZeroes);
  7616. // Recast our new predicate-as-integer v16i8 vector into something
  7617. // appropriate for the shuffle, i.e. v4i32 for a real v4i1 predicate.
  7618. return DAG.getNode(ISD::BITCAST, dl, NewVT, PredAsVector);
  7619. }
  7620. static SDValue LowerVECTOR_SHUFFLE_i1(SDValue Op, SelectionDAG &DAG,
  7621. const ARMSubtarget *ST) {
  7622. EVT VT = Op.getValueType();
  7623. ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op.getNode());
  7624. ArrayRef<int> ShuffleMask = SVN->getMask();
  7625. assert(ST->hasMVEIntegerOps() &&
  7626. "No support for vector shuffle of boolean predicates");
  7627. SDValue V1 = Op.getOperand(0);
  7628. SDValue V2 = Op.getOperand(1);
  7629. SDLoc dl(Op);
  7630. if (isReverseMask(ShuffleMask, VT)) {
  7631. SDValue cast = DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::i32, V1);
  7632. SDValue rbit = DAG.getNode(ISD::BITREVERSE, dl, MVT::i32, cast);
  7633. SDValue srl = DAG.getNode(ISD::SRL, dl, MVT::i32, rbit,
  7634. DAG.getConstant(16, dl, MVT::i32));
  7635. return DAG.getNode(ARMISD::PREDICATE_CAST, dl, VT, srl);
  7636. }
  7637. // Until we can come up with optimised cases for every single vector
  7638. // shuffle in existence we have chosen the least painful strategy. This is
7639. // to essentially promote the boolean predicate to an 8-bit integer, where
7640. // each predicate lane becomes a byte. Then we fall back on a normal integer
  7641. // vector shuffle and convert the result back into a predicate vector. In
  7642. // many cases the generated code might be even better than scalar code
  7643. // operating on bits. Just imagine trying to shuffle 8 arbitrary 2-bit
  7644. // fields in a register into 8 other arbitrary 2-bit fields!
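// Concretely, both predicates are promoted to 0/-1 integer vectors (a v8i1
// becomes a v8i16, for instance), those are shuffled as ordinary vectors, and
// the result is compared against zero to recreate the predicate.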
  7645. SDValue PredAsVector1 = PromoteMVEPredVector(dl, V1, VT, DAG);
  7646. EVT NewVT = PredAsVector1.getValueType();
  7647. SDValue PredAsVector2 = V2.isUndef() ? DAG.getUNDEF(NewVT)
  7648. : PromoteMVEPredVector(dl, V2, VT, DAG);
  7649. assert(PredAsVector2.getValueType() == NewVT &&
  7650. "Expected identical vector type in expanded i1 shuffle!");
  7651. // Do the shuffle!
  7652. SDValue Shuffled = DAG.getVectorShuffle(NewVT, dl, PredAsVector1,
  7653. PredAsVector2, ShuffleMask);
  7654. // Now return the result of comparing the shuffled vector with zero,
  7655. // which will generate a real predicate, i.e. v4i1, v8i1 or v16i1. For a v2i1
  7656. // we convert to a v4i1 compare to fill in the two halves of the i64 as i32s.
  7657. if (VT == MVT::v2i1) {
  7658. SDValue BC = DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, MVT::v4i32, Shuffled);
  7659. SDValue Cmp = DAG.getNode(ARMISD::VCMPZ, dl, MVT::v4i1, BC,
  7660. DAG.getConstant(ARMCC::NE, dl, MVT::i32));
  7661. return DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::v2i1, Cmp);
  7662. }
  7663. return DAG.getNode(ARMISD::VCMPZ, dl, VT, Shuffled,
  7664. DAG.getConstant(ARMCC::NE, dl, MVT::i32));
  7665. }
  7666. static SDValue LowerVECTOR_SHUFFLEUsingMovs(SDValue Op,
  7667. ArrayRef<int> ShuffleMask,
  7668. SelectionDAG &DAG) {
  7669. // Attempt to lower the vector shuffle using as many whole register movs as
7670. // possible. This is useful for types smaller than 32 bits, which would
7671. // often otherwise become a series of GPR movs.
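// For example, a v8i16 shuffle with mask <4,5,2,3,0,1,6,7> can be built from
// four f32 lane moves (lanes 2, 1, 0 and 3 of the first operand viewed as a
// v4f32).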
  7672. SDLoc dl(Op);
  7673. EVT VT = Op.getValueType();
  7674. if (VT.getScalarSizeInBits() >= 32)
  7675. return SDValue();
  7676. assert((VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v16i8) &&
  7677. "Unexpected vector type");
  7678. int NumElts = VT.getVectorNumElements();
  7679. int QuarterSize = NumElts / 4;
  7680. // The four final parts of the vector, as i32's
  7681. SDValue Parts[4];
7682. // Look for full lane vmovs like <0,1,2,3> or <u,5,6,7> etc. (but not
  7683. // <u,u,u,u>), returning the vmov lane index
  7684. auto getMovIdx = [](ArrayRef<int> ShuffleMask, int Start, int Length) {
  7685. // Detect which mov lane this would be from the first non-undef element.
  7686. int MovIdx = -1;
  7687. for (int i = 0; i < Length; i++) {
  7688. if (ShuffleMask[Start + i] >= 0) {
  7689. if (ShuffleMask[Start + i] % Length != i)
  7690. return -1;
  7691. MovIdx = ShuffleMask[Start + i] / Length;
  7692. break;
  7693. }
  7694. }
  7695. // If all items are undef, leave this for other combines
  7696. if (MovIdx == -1)
  7697. return -1;
  7698. // Check the remaining values are the correct part of the same mov
  7699. for (int i = 1; i < Length; i++) {
  7700. if (ShuffleMask[Start + i] >= 0 &&
  7701. (ShuffleMask[Start + i] / Length != MovIdx ||
  7702. ShuffleMask[Start + i] % Length != i))
  7703. return -1;
  7704. }
  7705. return MovIdx;
  7706. };
  7707. for (int Part = 0; Part < 4; ++Part) {
  7708. // Does this part look like a mov
  7709. int Elt = getMovIdx(ShuffleMask, Part * QuarterSize, QuarterSize);
  7710. if (Elt != -1) {
  7711. SDValue Input = Op->getOperand(0);
  7712. if (Elt >= 4) {
  7713. Input = Op->getOperand(1);
  7714. Elt -= 4;
  7715. }
  7716. SDValue BitCast = DAG.getBitcast(MVT::v4f32, Input);
  7717. Parts[Part] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, BitCast,
  7718. DAG.getConstant(Elt, dl, MVT::i32));
  7719. }
  7720. }
  7721. // Nothing interesting found, just return
  7722. if (!Parts[0] && !Parts[1] && !Parts[2] && !Parts[3])
  7723. return SDValue();
  7724. // The other parts need to be built with the old shuffle vector, cast to a
  7725. // v4i32 and extract_vector_elts
  7726. if (!Parts[0] || !Parts[1] || !Parts[2] || !Parts[3]) {
  7727. SmallVector<int, 16> NewShuffleMask;
  7728. for (int Part = 0; Part < 4; ++Part)
  7729. for (int i = 0; i < QuarterSize; i++)
  7730. NewShuffleMask.push_back(
  7731. Parts[Part] ? -1 : ShuffleMask[Part * QuarterSize + i]);
  7732. SDValue NewShuffle = DAG.getVectorShuffle(
  7733. VT, dl, Op->getOperand(0), Op->getOperand(1), NewShuffleMask);
  7734. SDValue BitCast = DAG.getBitcast(MVT::v4f32, NewShuffle);
  7735. for (int Part = 0; Part < 4; ++Part)
  7736. if (!Parts[Part])
  7737. Parts[Part] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32,
  7738. BitCast, DAG.getConstant(Part, dl, MVT::i32));
  7739. }
  7740. // Build a vector out of the various parts and bitcast it back to the original
  7741. // type.
  7742. SDValue NewVec = DAG.getNode(ARMISD::BUILD_VECTOR, dl, MVT::v4f32, Parts);
  7743. return DAG.getBitcast(VT, NewVec);
  7744. }
  7745. static SDValue LowerVECTOR_SHUFFLEUsingOneOff(SDValue Op,
  7746. ArrayRef<int> ShuffleMask,
  7747. SelectionDAG &DAG) {
  7748. SDValue V1 = Op.getOperand(0);
  7749. SDValue V2 = Op.getOperand(1);
  7750. EVT VT = Op.getValueType();
  7751. unsigned NumElts = VT.getVectorNumElements();
7752. // A One-Off Identity mask is one that is mostly an identity mask from a
7753. // single source but contains a single element out-of-place, either from a
7754. // different vector or from another position in the same vector. As opposed to
7755. // lowering this via an ARMISD::BUILD_VECTOR we can generate an extract/insert
  7756. // pair directly.
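// For example, on a v4i32 the mask <0,1,7,3> is an identity of V1 except for
// lane 2, so it becomes an extract of lane 3 from V2 followed by an insert
// into lane 2 of V1.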
  7757. auto isOneOffIdentityMask = [](ArrayRef<int> Mask, EVT VT, int BaseOffset,
  7758. int &OffElement) {
  7759. OffElement = -1;
  7760. int NonUndef = 0;
  7761. for (int i = 0, NumMaskElts = Mask.size(); i < NumMaskElts; ++i) {
  7762. if (Mask[i] == -1)
  7763. continue;
  7764. NonUndef++;
  7765. if (Mask[i] != i + BaseOffset) {
  7766. if (OffElement == -1)
  7767. OffElement = i;
  7768. else
  7769. return false;
  7770. }
  7771. }
  7772. return NonUndef > 2 && OffElement != -1;
  7773. };
  7774. int OffElement;
  7775. SDValue VInput;
  7776. if (isOneOffIdentityMask(ShuffleMask, VT, 0, OffElement))
  7777. VInput = V1;
  7778. else if (isOneOffIdentityMask(ShuffleMask, VT, NumElts, OffElement))
  7779. VInput = V2;
  7780. else
  7781. return SDValue();
  7782. SDLoc dl(Op);
  7783. EVT SVT = VT.getScalarType() == MVT::i8 || VT.getScalarType() == MVT::i16
  7784. ? MVT::i32
  7785. : VT.getScalarType();
  7786. SDValue Elt = DAG.getNode(
  7787. ISD::EXTRACT_VECTOR_ELT, dl, SVT,
  7788. ShuffleMask[OffElement] < (int)NumElts ? V1 : V2,
  7789. DAG.getVectorIdxConstant(ShuffleMask[OffElement] % NumElts, dl));
  7790. return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, VInput, Elt,
  7791. DAG.getVectorIdxConstant(OffElement % NumElts, dl));
  7792. }
  7793. static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG,
  7794. const ARMSubtarget *ST) {
  7795. SDValue V1 = Op.getOperand(0);
  7796. SDValue V2 = Op.getOperand(1);
  7797. SDLoc dl(Op);
  7798. EVT VT = Op.getValueType();
  7799. ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op.getNode());
  7800. unsigned EltSize = VT.getScalarSizeInBits();
  7801. if (ST->hasMVEIntegerOps() && EltSize == 1)
  7802. return LowerVECTOR_SHUFFLE_i1(Op, DAG, ST);
  7803. // Convert shuffles that are directly supported on NEON to target-specific
  7804. // DAG nodes, instead of keeping them as shuffles and matching them again
  7805. // during code selection. This is more efficient and avoids the possibility
  7806. // of inconsistencies between legalization and selection.
  7807. // FIXME: floating-point vectors should be canonicalized to integer vectors
7808. // of the same size so that they get CSEd properly.
  7809. ArrayRef<int> ShuffleMask = SVN->getMask();
  7810. if (EltSize <= 32) {
  7811. if (SVN->isSplat()) {
  7812. int Lane = SVN->getSplatIndex();
7813. // If this is an undef splat, generate it via "just" vdup, if possible.
  7814. if (Lane == -1) Lane = 0;
  7815. // Test if V1 is a SCALAR_TO_VECTOR.
  7816. if (Lane == 0 && V1.getOpcode() == ISD::SCALAR_TO_VECTOR) {
  7817. return DAG.getNode(ARMISD::VDUP, dl, VT, V1.getOperand(0));
  7818. }
  7819. // Test if V1 is a BUILD_VECTOR which is equivalent to a SCALAR_TO_VECTOR
  7820. // (and probably will turn into a SCALAR_TO_VECTOR once legalization
  7821. // reaches it).
  7822. if (Lane == 0 && V1.getOpcode() == ISD::BUILD_VECTOR &&
  7823. !isa<ConstantSDNode>(V1.getOperand(0))) {
  7824. bool IsScalarToVector = true;
  7825. for (unsigned i = 1, e = V1.getNumOperands(); i != e; ++i)
  7826. if (!V1.getOperand(i).isUndef()) {
  7827. IsScalarToVector = false;
  7828. break;
  7829. }
  7830. if (IsScalarToVector)
  7831. return DAG.getNode(ARMISD::VDUP, dl, VT, V1.getOperand(0));
  7832. }
  7833. return DAG.getNode(ARMISD::VDUPLANE, dl, VT, V1,
  7834. DAG.getConstant(Lane, dl, MVT::i32));
  7835. }
  7836. bool ReverseVEXT = false;
  7837. unsigned Imm = 0;
  7838. if (ST->hasNEON() && isVEXTMask(ShuffleMask, VT, ReverseVEXT, Imm)) {
  7839. if (ReverseVEXT)
  7840. std::swap(V1, V2);
  7841. return DAG.getNode(ARMISD::VEXT, dl, VT, V1, V2,
  7842. DAG.getConstant(Imm, dl, MVT::i32));
  7843. }
  7844. if (isVREVMask(ShuffleMask, VT, 64))
  7845. return DAG.getNode(ARMISD::VREV64, dl, VT, V1);
  7846. if (isVREVMask(ShuffleMask, VT, 32))
  7847. return DAG.getNode(ARMISD::VREV32, dl, VT, V1);
  7848. if (isVREVMask(ShuffleMask, VT, 16))
  7849. return DAG.getNode(ARMISD::VREV16, dl, VT, V1);
  7850. if (ST->hasNEON() && V2->isUndef() && isSingletonVEXTMask(ShuffleMask, VT, Imm)) {
  7851. return DAG.getNode(ARMISD::VEXT, dl, VT, V1, V1,
  7852. DAG.getConstant(Imm, dl, MVT::i32));
  7853. }
  7854. // Check for Neon shuffles that modify both input vectors in place.
  7855. // If both results are used, i.e., if there are two shuffles with the same
  7856. // source operands and with masks corresponding to both results of one of
  7857. // these operations, DAG memoization will ensure that a single node is
  7858. // used for both shuffles.
  7859. unsigned WhichResult = 0;
  7860. bool isV_UNDEF = false;
  7861. if (ST->hasNEON()) {
  7862. if (unsigned ShuffleOpc = isNEONTwoResultShuffleMask(
  7863. ShuffleMask, VT, WhichResult, isV_UNDEF)) {
  7864. if (isV_UNDEF)
  7865. V2 = V1;
  7866. return DAG.getNode(ShuffleOpc, dl, DAG.getVTList(VT, VT), V1, V2)
  7867. .getValue(WhichResult);
  7868. }
  7869. }
  7870. if (ST->hasMVEIntegerOps()) {
  7871. if (isVMOVNMask(ShuffleMask, VT, false, false))
  7872. return DAG.getNode(ARMISD::VMOVN, dl, VT, V2, V1,
  7873. DAG.getConstant(0, dl, MVT::i32));
  7874. if (isVMOVNMask(ShuffleMask, VT, true, false))
  7875. return DAG.getNode(ARMISD::VMOVN, dl, VT, V1, V2,
  7876. DAG.getConstant(1, dl, MVT::i32));
  7877. if (isVMOVNMask(ShuffleMask, VT, true, true))
  7878. return DAG.getNode(ARMISD::VMOVN, dl, VT, V1, V1,
  7879. DAG.getConstant(1, dl, MVT::i32));
  7880. }
  7881. // Also check for these shuffles through CONCAT_VECTORS: we canonicalize
  7882. // shuffles that produce a result larger than their operands with:
  7883. // shuffle(concat(v1, undef), concat(v2, undef))
  7884. // ->
  7885. // shuffle(concat(v1, v2), undef)
  7886. // because we can access quad vectors (see PerformVECTOR_SHUFFLECombine).
  7887. //
  7888. // This is useful in the general case, but there are special cases where
  7889. // native shuffles produce larger results: the two-result ops.
  7890. //
  7891. // Look through the concat when lowering them:
  7892. // shuffle(concat(v1, v2), undef)
  7893. // ->
  7894. // concat(VZIP(v1, v2):0, :1)
  7895. //
  7896. if (ST->hasNEON() && V1->getOpcode() == ISD::CONCAT_VECTORS && V2->isUndef()) {
  7897. SDValue SubV1 = V1->getOperand(0);
  7898. SDValue SubV2 = V1->getOperand(1);
  7899. EVT SubVT = SubV1.getValueType();
  7900. // We expect these to have been canonicalized to -1.
  7901. assert(llvm::all_of(ShuffleMask, [&](int i) {
  7902. return i < (int)VT.getVectorNumElements();
  7903. }) && "Unexpected shuffle index into UNDEF operand!");
  7904. if (unsigned ShuffleOpc = isNEONTwoResultShuffleMask(
  7905. ShuffleMask, SubVT, WhichResult, isV_UNDEF)) {
  7906. if (isV_UNDEF)
  7907. SubV2 = SubV1;
  7908. assert((WhichResult == 0) &&
  7909. "In-place shuffle of concat can only have one result!");
  7910. SDValue Res = DAG.getNode(ShuffleOpc, dl, DAG.getVTList(SubVT, SubVT),
  7911. SubV1, SubV2);
  7912. return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Res.getValue(0),
  7913. Res.getValue(1));
  7914. }
  7915. }
  7916. }
  7917. if (ST->hasMVEIntegerOps() && EltSize <= 32) {
  7918. if (SDValue V = LowerVECTOR_SHUFFLEUsingOneOff(Op, ShuffleMask, DAG))
  7919. return V;
  7920. for (bool Top : {false, true}) {
  7921. for (bool SingleSource : {false, true}) {
  7922. if (isTruncMask(ShuffleMask, VT, Top, SingleSource)) {
  7923. MVT FromSVT = MVT::getIntegerVT(EltSize * 2);
  7924. MVT FromVT = MVT::getVectorVT(FromSVT, ShuffleMask.size() / 2);
  7925. SDValue Lo = DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, FromVT, V1);
  7926. SDValue Hi = DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, FromVT,
  7927. SingleSource ? V1 : V2);
  7928. if (Top) {
  7929. SDValue Amt = DAG.getConstant(EltSize, dl, FromVT);
  7930. Lo = DAG.getNode(ISD::SRL, dl, FromVT, Lo, Amt);
  7931. Hi = DAG.getNode(ISD::SRL, dl, FromVT, Hi, Amt);
  7932. }
  7933. return DAG.getNode(ARMISD::MVETRUNC, dl, VT, Lo, Hi);
  7934. }
  7935. }
  7936. }
  7937. }
  7938. // If the shuffle is not directly supported and it has 4 elements, use
  7939. // the PerfectShuffle-generated table to synthesize it from other shuffles.
  7940. unsigned NumElts = VT.getVectorNumElements();
  7941. if (NumElts == 4) {
  7942. unsigned PFIndexes[4];
  7943. for (unsigned i = 0; i != 4; ++i) {
  7944. if (ShuffleMask[i] < 0)
  7945. PFIndexes[i] = 8;
  7946. else
  7947. PFIndexes[i] = ShuffleMask[i];
  7948. }
  7949. // Compute the index in the perfect shuffle table.
  7950. unsigned PFTableIndex =
  7951. PFIndexes[0]*9*9*9+PFIndexes[1]*9*9+PFIndexes[2]*9+PFIndexes[3];
  7952. unsigned PFEntry = PerfectShuffleTable[PFTableIndex];
  7953. unsigned Cost = (PFEntry >> 30);
  7954. if (Cost <= 4) {
  7955. if (ST->hasNEON())
  7956. return GeneratePerfectShuffle(PFEntry, V1, V2, DAG, dl);
  7957. else if (isLegalMVEShuffleOp(PFEntry)) {
  7958. unsigned LHSID = (PFEntry >> 13) & ((1 << 13)-1);
  7959. unsigned RHSID = (PFEntry >> 0) & ((1 << 13)-1);
  7960. unsigned PFEntryLHS = PerfectShuffleTable[LHSID];
  7961. unsigned PFEntryRHS = PerfectShuffleTable[RHSID];
  7962. if (isLegalMVEShuffleOp(PFEntryLHS) && isLegalMVEShuffleOp(PFEntryRHS))
  7963. return GeneratePerfectShuffle(PFEntry, V1, V2, DAG, dl);
  7964. }
  7965. }
  7966. }
  7967. // Implement shuffles with 32- or 64-bit elements as ARMISD::BUILD_VECTORs.
  7968. if (EltSize >= 32) {
  7969. // Do the expansion with floating-point types, since that is what the VFP
  7970. // registers are defined to use, and since i64 is not legal.
  7971. EVT EltVT = EVT::getFloatingPointVT(EltSize);
  7972. EVT VecVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumElts);
  7973. V1 = DAG.getNode(ISD::BITCAST, dl, VecVT, V1);
  7974. V2 = DAG.getNode(ISD::BITCAST, dl, VecVT, V2);
  7975. SmallVector<SDValue, 8> Ops;
  7976. for (unsigned i = 0; i < NumElts; ++i) {
  7977. if (ShuffleMask[i] < 0)
  7978. Ops.push_back(DAG.getUNDEF(EltVT));
  7979. else
  7980. Ops.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT,
  7981. ShuffleMask[i] < (int)NumElts ? V1 : V2,
  7982. DAG.getConstant(ShuffleMask[i] & (NumElts-1),
  7983. dl, MVT::i32)));
  7984. }
  7985. SDValue Val = DAG.getNode(ARMISD::BUILD_VECTOR, dl, VecVT, Ops);
  7986. return DAG.getNode(ISD::BITCAST, dl, VT, Val);
  7987. }
  7988. if ((VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v16i8) &&
  7989. isReverseMask(ShuffleMask, VT))
  7990. return LowerReverse_VECTOR_SHUFFLE(Op, DAG);
  7991. if (ST->hasNEON() && VT == MVT::v8i8)
  7992. if (SDValue NewOp = LowerVECTOR_SHUFFLEv8i8(Op, ShuffleMask, DAG))
  7993. return NewOp;
  7994. if (ST->hasMVEIntegerOps())
  7995. if (SDValue NewOp = LowerVECTOR_SHUFFLEUsingMovs(Op, ShuffleMask, DAG))
  7996. return NewOp;
  7997. return SDValue();
  7998. }
  7999. static SDValue LowerINSERT_VECTOR_ELT_i1(SDValue Op, SelectionDAG &DAG,
  8000. const ARMSubtarget *ST) {
  8001. EVT VecVT = Op.getOperand(0).getValueType();
  8002. SDLoc dl(Op);
  8003. assert(ST->hasMVEIntegerOps() &&
  8004. "LowerINSERT_VECTOR_ELT_i1 called without MVE!");
  8005. SDValue Conv =
  8006. DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::i32, Op->getOperand(0));
  8007. unsigned Lane = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
  8008. unsigned LaneWidth =
  8009. getVectorTyFromPredicateVector(VecVT).getScalarSizeInBits() / 8;
  8010. unsigned Mask = ((1 << LaneWidth) - 1) << Lane * LaneWidth;
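// For example, inserting into lane 2 of a v8i1 (LaneWidth == 2) replaces
// bits [5:4] of the 16-bit predicate value held in Conv.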
  8011. SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::i32,
  8012. Op.getOperand(1), DAG.getValueType(MVT::i1));
  8013. SDValue BFI = DAG.getNode(ARMISD::BFI, dl, MVT::i32, Conv, Ext,
  8014. DAG.getConstant(~Mask, dl, MVT::i32));
  8015. return DAG.getNode(ARMISD::PREDICATE_CAST, dl, Op.getValueType(), BFI);
  8016. }
  8017. SDValue ARMTargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
  8018. SelectionDAG &DAG) const {
  8019. // INSERT_VECTOR_ELT is legal only for immediate indexes.
  8020. SDValue Lane = Op.getOperand(2);
  8021. if (!isa<ConstantSDNode>(Lane))
  8022. return SDValue();
  8023. SDValue Elt = Op.getOperand(1);
  8024. EVT EltVT = Elt.getValueType();
  8025. if (Subtarget->hasMVEIntegerOps() &&
  8026. Op.getValueType().getScalarSizeInBits() == 1)
  8027. return LowerINSERT_VECTOR_ELT_i1(Op, DAG, Subtarget);
  8028. if (getTypeAction(*DAG.getContext(), EltVT) ==
  8029. TargetLowering::TypePromoteFloat) {
  8030. // INSERT_VECTOR_ELT doesn't want f16 operands promoting to f32,
  8031. // but the type system will try to do that if we don't intervene.
  8032. // Reinterpret any such vector-element insertion as one with the
  8033. // corresponding integer types.
  8034. SDLoc dl(Op);
  8035. EVT IEltVT = MVT::getIntegerVT(EltVT.getScalarSizeInBits());
  8036. assert(getTypeAction(*DAG.getContext(), IEltVT) !=
  8037. TargetLowering::TypePromoteFloat);
  8038. SDValue VecIn = Op.getOperand(0);
  8039. EVT VecVT = VecIn.getValueType();
  8040. EVT IVecVT = EVT::getVectorVT(*DAG.getContext(), IEltVT,
  8041. VecVT.getVectorNumElements());
  8042. SDValue IElt = DAG.getNode(ISD::BITCAST, dl, IEltVT, Elt);
  8043. SDValue IVecIn = DAG.getNode(ISD::BITCAST, dl, IVecVT, VecIn);
  8044. SDValue IVecOut = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, IVecVT,
  8045. IVecIn, IElt, Lane);
  8046. return DAG.getNode(ISD::BITCAST, dl, VecVT, IVecOut);
  8047. }
  8048. return Op;
  8049. }
  8050. static SDValue LowerEXTRACT_VECTOR_ELT_i1(SDValue Op, SelectionDAG &DAG,
  8051. const ARMSubtarget *ST) {
  8052. EVT VecVT = Op.getOperand(0).getValueType();
  8053. SDLoc dl(Op);
  8054. assert(ST->hasMVEIntegerOps() &&
8055. "LowerEXTRACT_VECTOR_ELT_i1 called without MVE!");
  8056. SDValue Conv =
  8057. DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::i32, Op->getOperand(0));
  8058. unsigned Lane = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
  8059. unsigned LaneWidth =
  8060. getVectorTyFromPredicateVector(VecVT).getScalarSizeInBits() / 8;
  8061. SDValue Shift = DAG.getNode(ISD::SRL, dl, MVT::i32, Conv,
  8062. DAG.getConstant(Lane * LaneWidth, dl, MVT::i32));
  8063. return Shift;
  8064. }
  8065. static SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG,
  8066. const ARMSubtarget *ST) {
  8067. // EXTRACT_VECTOR_ELT is legal only for immediate indexes.
  8068. SDValue Lane = Op.getOperand(1);
  8069. if (!isa<ConstantSDNode>(Lane))
  8070. return SDValue();
  8071. SDValue Vec = Op.getOperand(0);
  8072. EVT VT = Vec.getValueType();
  8073. if (ST->hasMVEIntegerOps() && VT.getScalarSizeInBits() == 1)
  8074. return LowerEXTRACT_VECTOR_ELT_i1(Op, DAG, ST);
  8075. if (Op.getValueType() == MVT::i32 && Vec.getScalarValueSizeInBits() < 32) {
  8076. SDLoc dl(Op);
  8077. return DAG.getNode(ARMISD::VGETLANEu, dl, MVT::i32, Vec, Lane);
  8078. }
  8079. return Op;
  8080. }
  8081. static SDValue LowerCONCAT_VECTORS_i1(SDValue Op, SelectionDAG &DAG,
  8082. const ARMSubtarget *ST) {
  8083. SDLoc dl(Op);
  8084. assert(Op.getValueType().getScalarSizeInBits() == 1 &&
  8085. "Unexpected custom CONCAT_VECTORS lowering");
  8086. assert(isPowerOf2_32(Op.getNumOperands()) &&
  8087. "Unexpected custom CONCAT_VECTORS lowering");
  8088. assert(ST->hasMVEIntegerOps() &&
  8089. "CONCAT_VECTORS lowering only supported for MVE");
  8090. auto ConcatPair = [&](SDValue V1, SDValue V2) {
  8091. EVT Op1VT = V1.getValueType();
  8092. EVT Op2VT = V2.getValueType();
  8093. assert(Op1VT == Op2VT && "Operand types don't match!");
  8094. EVT VT = Op1VT.getDoubleNumVectorElementsVT(*DAG.getContext());
  8095. SDValue NewV1 = PromoteMVEPredVector(dl, V1, Op1VT, DAG);
  8096. SDValue NewV2 = PromoteMVEPredVector(dl, V2, Op2VT, DAG);
  8097. // We now have Op1 + Op2 promoted to vectors of integers, where v8i1 gets
  8098. // promoted to v8i16, etc.
  8099. MVT ElType =
  8100. getVectorTyFromPredicateVector(VT).getScalarType().getSimpleVT();
  8101. unsigned NumElts = 2 * Op1VT.getVectorNumElements();
  8102. // Extract the vector elements from Op1 and Op2 one by one and truncate them
  8103. // to be the right size for the destination. For example, if Op1 is v4i1
  8104. // then the promoted vector is v4i32. The result of concatenation gives a
  8105. // v8i1, which when promoted is v8i16. That means each i32 element from Op1
  8106. // needs truncating to i16 and inserting in the result.
  8107. EVT ConcatVT = MVT::getVectorVT(ElType, NumElts);
  8108. SDValue ConVec = DAG.getNode(ISD::UNDEF, dl, ConcatVT);
  8109. auto ExtractInto = [&DAG, &dl](SDValue NewV, SDValue ConVec, unsigned &j) {
  8110. EVT NewVT = NewV.getValueType();
  8111. EVT ConcatVT = ConVec.getValueType();
  8112. for (unsigned i = 0, e = NewVT.getVectorNumElements(); i < e; i++, j++) {
  8113. SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, NewV,
  8114. DAG.getIntPtrConstant(i, dl));
  8115. ConVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ConcatVT, ConVec, Elt,
  8116. DAG.getConstant(j, dl, MVT::i32));
  8117. }
  8118. return ConVec;
  8119. };
  8120. unsigned j = 0;
  8121. ConVec = ExtractInto(NewV1, ConVec, j);
  8122. ConVec = ExtractInto(NewV2, ConVec, j);
  8123. // Now return the result of comparing the subvector with zero, which will
  8124. // generate a real predicate, i.e. v4i1, v8i1 or v16i1. For a v2i1 we
  8125. // convert to a v4i1 compare to fill in the two halves of the i64 as i32s.
  8126. if (VT == MVT::v2i1) {
  8127. SDValue BC = DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, MVT::v4i32, ConVec);
  8128. SDValue Cmp = DAG.getNode(ARMISD::VCMPZ, dl, MVT::v4i1, BC,
  8129. DAG.getConstant(ARMCC::NE, dl, MVT::i32));
  8130. return DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::v2i1, Cmp);
  8131. }
  8132. return DAG.getNode(ARMISD::VCMPZ, dl, VT, ConVec,
  8133. DAG.getConstant(ARMCC::NE, dl, MVT::i32));
  8134. };
  8135. // Concat each pair of subvectors and pack into the lower half of the array.
  8136. SmallVector<SDValue> ConcatOps(Op->op_begin(), Op->op_end());
  8137. while (ConcatOps.size() > 1) {
  8138. for (unsigned I = 0, E = ConcatOps.size(); I != E; I += 2) {
  8139. SDValue V1 = ConcatOps[I];
  8140. SDValue V2 = ConcatOps[I + 1];
  8141. ConcatOps[I / 2] = ConcatPair(V1, V2);
  8142. }
  8143. ConcatOps.resize(ConcatOps.size() / 2);
  8144. }
  8145. return ConcatOps[0];
  8146. }
  8147. static SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG,
  8148. const ARMSubtarget *ST) {
  8149. EVT VT = Op->getValueType(0);
  8150. if (ST->hasMVEIntegerOps() && VT.getScalarSizeInBits() == 1)
  8151. return LowerCONCAT_VECTORS_i1(Op, DAG, ST);
  8152. // The only time a CONCAT_VECTORS operation can have legal types is when
  8153. // two 64-bit vectors are concatenated to a 128-bit vector.
  8154. assert(Op.getValueType().is128BitVector() && Op.getNumOperands() == 2 &&
  8155. "unexpected CONCAT_VECTORS");
  8156. SDLoc dl(Op);
  8157. SDValue Val = DAG.getUNDEF(MVT::v2f64);
  8158. SDValue Op0 = Op.getOperand(0);
  8159. SDValue Op1 = Op.getOperand(1);
  8160. if (!Op0.isUndef())
  8161. Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Val,
  8162. DAG.getNode(ISD::BITCAST, dl, MVT::f64, Op0),
  8163. DAG.getIntPtrConstant(0, dl));
  8164. if (!Op1.isUndef())
  8165. Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Val,
  8166. DAG.getNode(ISD::BITCAST, dl, MVT::f64, Op1),
  8167. DAG.getIntPtrConstant(1, dl));
  8168. return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Val);
  8169. }
  8170. static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, SelectionDAG &DAG,
  8171. const ARMSubtarget *ST) {
  8172. SDValue V1 = Op.getOperand(0);
  8173. SDValue V2 = Op.getOperand(1);
  8174. SDLoc dl(Op);
  8175. EVT VT = Op.getValueType();
  8176. EVT Op1VT = V1.getValueType();
  8177. unsigned NumElts = VT.getVectorNumElements();
  8178. unsigned Index = cast<ConstantSDNode>(V2)->getZExtValue();
  8179. assert(VT.getScalarSizeInBits() == 1 &&
  8180. "Unexpected custom EXTRACT_SUBVECTOR lowering");
  8181. assert(ST->hasMVEIntegerOps() &&
  8182. "EXTRACT_SUBVECTOR lowering only supported for MVE");
  8183. SDValue NewV1 = PromoteMVEPredVector(dl, V1, Op1VT, DAG);
  8184. // We now have Op1 promoted to a vector of integers, where v8i1 gets
  8185. // promoted to v8i16, etc.
  8186. MVT ElType = getVectorTyFromPredicateVector(VT).getScalarType().getSimpleVT();
  8187. if (NumElts == 2) {
  8188. EVT SubVT = MVT::v4i32;
  8189. SDValue SubVec = DAG.getNode(ISD::UNDEF, dl, SubVT);
  8190. for (unsigned i = Index, j = 0; i < (Index + NumElts); i++, j += 2) {
  8191. SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, NewV1,
  8192. DAG.getIntPtrConstant(i, dl));
  8193. SubVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, SubVT, SubVec, Elt,
  8194. DAG.getConstant(j, dl, MVT::i32));
  8195. SubVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, SubVT, SubVec, Elt,
  8196. DAG.getConstant(j + 1, dl, MVT::i32));
  8197. }
  8198. SDValue Cmp = DAG.getNode(ARMISD::VCMPZ, dl, MVT::v4i1, SubVec,
  8199. DAG.getConstant(ARMCC::NE, dl, MVT::i32));
  8200. return DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::v2i1, Cmp);
  8201. }
  8202. EVT SubVT = MVT::getVectorVT(ElType, NumElts);
  8203. SDValue SubVec = DAG.getNode(ISD::UNDEF, dl, SubVT);
  8204. for (unsigned i = Index, j = 0; i < (Index + NumElts); i++, j++) {
  8205. SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, NewV1,
  8206. DAG.getIntPtrConstant(i, dl));
  8207. SubVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, SubVT, SubVec, Elt,
  8208. DAG.getConstant(j, dl, MVT::i32));
  8209. }
  8210. // Now return the result of comparing the subvector with zero,
  8211. // which will generate a real predicate, i.e. v4i1, v8i1 or v16i1.
  8212. return DAG.getNode(ARMISD::VCMPZ, dl, VT, SubVec,
  8213. DAG.getConstant(ARMCC::NE, dl, MVT::i32));
  8214. }
8215. // Turn a truncate to a predicate (an i1 vector) into icmp(and(x, 1), 0).
  8216. static SDValue LowerTruncatei1(SDNode *N, SelectionDAG &DAG,
  8217. const ARMSubtarget *ST) {
  8218. assert(ST->hasMVEIntegerOps() && "Expected MVE!");
  8219. EVT VT = N->getValueType(0);
  8220. assert((VT == MVT::v16i1 || VT == MVT::v8i1 || VT == MVT::v4i1) &&
  8221. "Expected a vector i1 type!");
  8222. SDValue Op = N->getOperand(0);
  8223. EVT FromVT = Op.getValueType();
  8224. SDLoc DL(N);
  8225. SDValue And =
  8226. DAG.getNode(ISD::AND, DL, FromVT, Op, DAG.getConstant(1, DL, FromVT));
  8227. return DAG.getNode(ISD::SETCC, DL, VT, And, DAG.getConstant(0, DL, FromVT),
  8228. DAG.getCondCode(ISD::SETNE));
  8229. }
  8230. static SDValue LowerTruncate(SDNode *N, SelectionDAG &DAG,
  8231. const ARMSubtarget *Subtarget) {
  8232. if (!Subtarget->hasMVEIntegerOps())
  8233. return SDValue();
  8234. EVT ToVT = N->getValueType(0);
  8235. if (ToVT.getScalarType() == MVT::i1)
  8236. return LowerTruncatei1(N, DAG, Subtarget);
  8237. // MVE does not have a single instruction to perform the truncation of a v4i32
  8238. // into the lower half of a v8i16, in the same way that a NEON vmovn would.
  8239. // Most of the instructions in MVE follow the 'Beats' system, where moving
  8240. // values from different lanes is usually something that the instructions
  8241. // avoid.
  8242. //
  8243. // Instead it has top/bottom instructions such as VMOVLT/B and VMOVNT/B,
8244. // which take the top/bottom half of a larger lane and extend it (or do the
  8245. // opposite, truncating into the top/bottom lane from a larger lane). Note
  8246. // that because of the way we widen lanes, a v4i16 is really a v4i32 using the
8247. // bottom 16 bits of each vector lane. This works really well with T/B
8248. // instructions, but that doesn't extend to v8i32->v8i16, where the lanes
8249. // need to be reordered.
  8250. //
  8251. // But truncates and sext/zext are always going to be fairly common from llvm.
  8252. // We have several options for how to deal with them:
  8253. // - Wherever possible combine them into an instruction that makes them
  8254. // "free". This includes loads/stores, which can perform the trunc as part
  8255. // of the memory operation. Or certain shuffles that can be turned into
  8256. // VMOVN/VMOVL.
  8257. // - Lane Interleaving to transform blocks surrounded by ext/trunc. So
  8258. // trunc(mul(sext(a), sext(b))) may become
  8259. // VMOVNT(VMUL(VMOVLB(a), VMOVLB(b)), VMUL(VMOVLT(a), VMOVLT(b))). (Which in
  8260. // this case can use VMULL). This is performed in the
  8261. // MVELaneInterleavingPass.
  8262. // - Otherwise we have an option. By default we would expand the
  8263. // zext/sext/trunc into a series of lane extract/inserts going via GPR
8264. // registers, one for each vector lane. This can obviously be
  8265. // very expensive.
8266. // - The other option is to use the fact that loads/stores can extend/truncate
  8267. // to turn a trunc into two truncating stack stores and a stack reload. This
  8268. // becomes 3 back-to-back memory operations, but at least that is less than
  8269. // all the insert/extracts.
  8270. //
8271. // In order to do the last, we convert certain truncs into MVETRUNC, which
8272. // are either optimized where they can be, or eventually lowered into stack
8273. // stores/loads. This prevents us from splitting a v8i16 trunc into two stores
8274. // too early, where other instructions would be better, and stops us from
  8275. // having to reconstruct multiple buildvector shuffles into loads/stores.
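// For example, a trunc of a v8i32 to v8i16 is split below into two v4i32
// halves and emitted as a single MVETRUNC(lo, hi), which is later either
// combined into better instructions or lowered via the stack as described
// above.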
  8276. if (ToVT != MVT::v8i16 && ToVT != MVT::v16i8)
  8277. return SDValue();
  8278. EVT FromVT = N->getOperand(0).getValueType();
  8279. if (FromVT != MVT::v8i32 && FromVT != MVT::v16i16)
  8280. return SDValue();
  8281. SDValue Lo, Hi;
  8282. std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 0);
  8283. SDLoc DL(N);
  8284. return DAG.getNode(ARMISD::MVETRUNC, DL, ToVT, Lo, Hi);
  8285. }
  8286. static SDValue LowerVectorExtend(SDNode *N, SelectionDAG &DAG,
  8287. const ARMSubtarget *Subtarget) {
  8288. if (!Subtarget->hasMVEIntegerOps())
  8289. return SDValue();
  8290. // See LowerTruncate above for an explanation of MVEEXT/MVETRUNC.
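// For example, sext(v8i16) to v8i32 becomes an ARMISD::MVESEXT producing two
// v4i32 halves, which are concatenated back into the v8i32 result; the
// i8 -> i32 case below goes via v8i16 and a second extend.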
  8291. EVT ToVT = N->getValueType(0);
  8292. if (ToVT != MVT::v16i32 && ToVT != MVT::v8i32 && ToVT != MVT::v16i16)
  8293. return SDValue();
  8294. SDValue Op = N->getOperand(0);
  8295. EVT FromVT = Op.getValueType();
  8296. if (FromVT != MVT::v8i16 && FromVT != MVT::v16i8)
  8297. return SDValue();
  8298. SDLoc DL(N);
  8299. EVT ExtVT = ToVT.getHalfNumVectorElementsVT(*DAG.getContext());
  8300. if (ToVT.getScalarType() == MVT::i32 && FromVT.getScalarType() == MVT::i8)
  8301. ExtVT = MVT::v8i16;
  8302. unsigned Opcode =
  8303. N->getOpcode() == ISD::SIGN_EXTEND ? ARMISD::MVESEXT : ARMISD::MVEZEXT;
  8304. SDValue Ext = DAG.getNode(Opcode, DL, DAG.getVTList(ExtVT, ExtVT), Op);
  8305. SDValue Ext1 = Ext.getValue(1);
  8306. if (ToVT.getScalarType() == MVT::i32 && FromVT.getScalarType() == MVT::i8) {
  8307. Ext = DAG.getNode(N->getOpcode(), DL, MVT::v8i32, Ext);
  8308. Ext1 = DAG.getNode(N->getOpcode(), DL, MVT::v8i32, Ext1);
  8309. }
  8310. return DAG.getNode(ISD::CONCAT_VECTORS, DL, ToVT, Ext, Ext1);
  8311. }
  8312. /// isExtendedBUILD_VECTOR - Check if N is a constant BUILD_VECTOR where each
  8313. /// element has been zero/sign-extended, depending on the isSigned parameter,
  8314. /// from an integer type half its size.
  8315. static bool isExtendedBUILD_VECTOR(SDNode *N, SelectionDAG &DAG,
  8316. bool isSigned) {
  8317. // A v2i64 BUILD_VECTOR will have been legalized to a BITCAST from v4i32.
  8318. EVT VT = N->getValueType(0);
  8319. if (VT == MVT::v2i64 && N->getOpcode() == ISD::BITCAST) {
  8320. SDNode *BVN = N->getOperand(0).getNode();
  8321. if (BVN->getValueType(0) != MVT::v4i32 ||
  8322. BVN->getOpcode() != ISD::BUILD_VECTOR)
  8323. return false;
  8324. unsigned LoElt = DAG.getDataLayout().isBigEndian() ? 1 : 0;
  8325. unsigned HiElt = 1 - LoElt;
  8326. ConstantSDNode *Lo0 = dyn_cast<ConstantSDNode>(BVN->getOperand(LoElt));
  8327. ConstantSDNode *Hi0 = dyn_cast<ConstantSDNode>(BVN->getOperand(HiElt));
  8328. ConstantSDNode *Lo1 = dyn_cast<ConstantSDNode>(BVN->getOperand(LoElt+2));
  8329. ConstantSDNode *Hi1 = dyn_cast<ConstantSDNode>(BVN->getOperand(HiElt+2));
  8330. if (!Lo0 || !Hi0 || !Lo1 || !Hi1)
  8331. return false;
  8332. if (isSigned) {
  8333. if (Hi0->getSExtValue() == Lo0->getSExtValue() >> 32 &&
  8334. Hi1->getSExtValue() == Lo1->getSExtValue() >> 32)
  8335. return true;
  8336. } else {
  8337. if (Hi0->isZero() && Hi1->isZero())
  8338. return true;
  8339. }
  8340. return false;
  8341. }
  8342. if (N->getOpcode() != ISD::BUILD_VECTOR)
  8343. return false;
  8344. for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) {
  8345. SDNode *Elt = N->getOperand(i).getNode();
  8346. if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Elt)) {
  8347. unsigned EltSize = VT.getScalarSizeInBits();
  8348. unsigned HalfSize = EltSize / 2;
  8349. if (isSigned) {
  8350. if (!isIntN(HalfSize, C->getSExtValue()))
  8351. return false;
  8352. } else {
  8353. if (!isUIntN(HalfSize, C->getZExtValue()))
  8354. return false;
  8355. }
  8356. continue;
  8357. }
  8358. return false;
  8359. }
  8360. return true;
  8361. }
  8362. /// isSignExtended - Check if a node is a vector value that is sign-extended
  8363. /// or a constant BUILD_VECTOR with sign-extended elements.
  8364. static bool isSignExtended(SDNode *N, SelectionDAG &DAG) {
  8365. if (N->getOpcode() == ISD::SIGN_EXTEND || ISD::isSEXTLoad(N))
  8366. return true;
  8367. if (isExtendedBUILD_VECTOR(N, DAG, true))
  8368. return true;
  8369. return false;
  8370. }
  8371. /// isZeroExtended - Check if a node is a vector value that is zero-extended (or
  8372. /// any-extended) or a constant BUILD_VECTOR with zero-extended elements.
  8373. static bool isZeroExtended(SDNode *N, SelectionDAG &DAG) {
  8374. if (N->getOpcode() == ISD::ZERO_EXTEND || N->getOpcode() == ISD::ANY_EXTEND ||
  8375. ISD::isZEXTLoad(N))
  8376. return true;
  8377. if (isExtendedBUILD_VECTOR(N, DAG, false))
  8378. return true;
  8379. return false;
  8380. }
  8381. static EVT getExtensionTo64Bits(const EVT &OrigVT) {
  8382. if (OrigVT.getSizeInBits() >= 64)
  8383. return OrigVT;
  8384. assert(OrigVT.isSimple() && "Expecting a simple value type");
  8385. MVT::SimpleValueType OrigSimpleTy = OrigVT.getSimpleVT().SimpleTy;
  8386. switch (OrigSimpleTy) {
  8387. default: llvm_unreachable("Unexpected Vector Type");
  8388. case MVT::v2i8:
  8389. case MVT::v2i16:
  8390. return MVT::v2i32;
  8391. case MVT::v4i8:
  8392. return MVT::v4i16;
  8393. }
  8394. }
  8395. /// AddRequiredExtensionForVMULL - Add a sign/zero extension to extend the total
  8396. /// value size to 64 bits. We need a 64-bit D register as an operand to VMULL.
  8397. /// We insert the required extension here to get the vector to fill a D register.
  8398. static SDValue AddRequiredExtensionForVMULL(SDValue N, SelectionDAG &DAG,
  8399. const EVT &OrigTy,
  8400. const EVT &ExtTy,
  8401. unsigned ExtOpcode) {
  8402. // The vector originally had a size of OrigTy. It was then extended to ExtTy.
8403. // We expect the ExtTy to be 128 bits total. If the OrigTy is less than
8404. // 64 bits we need to insert a new extension so that it will be 64 bits.
  8405. assert(ExtTy.is128BitVector() && "Unexpected extension size");
  8406. if (OrigTy.getSizeInBits() >= 64)
  8407. return N;
  8408. // Must extend size to at least 64 bits to be used as an operand for VMULL.
  8409. EVT NewVT = getExtensionTo64Bits(OrigTy);
  8410. return DAG.getNode(ExtOpcode, SDLoc(N), NewVT, N);
  8411. }
  8412. /// SkipLoadExtensionForVMULL - return a load of the original vector size that
  8413. /// does not do any sign/zero extension. If the original vector is less
  8414. /// than 64 bits, an appropriate extension will be added after the load to
  8415. /// reach a total size of 64 bits. We have to add the extension separately
  8416. /// because ARM does not have a sign/zero extending load for vectors.
  8417. static SDValue SkipLoadExtensionForVMULL(LoadSDNode *LD, SelectionDAG& DAG) {
  8418. EVT ExtendedTy = getExtensionTo64Bits(LD->getMemoryVT());
  8419. // The load already has the right type.
  8420. if (ExtendedTy == LD->getMemoryVT())
  8421. return DAG.getLoad(LD->getMemoryVT(), SDLoc(LD), LD->getChain(),
  8422. LD->getBasePtr(), LD->getPointerInfo(), LD->getAlign(),
  8423. LD->getMemOperand()->getFlags());
  8424. // We need to create a zextload/sextload. We cannot just create a load
8425. // followed by a zext/sext node because LowerMUL is also run during normal
  8426. // operation legalization where we can't create illegal types.
  8427. return DAG.getExtLoad(LD->getExtensionType(), SDLoc(LD), ExtendedTy,
  8428. LD->getChain(), LD->getBasePtr(), LD->getPointerInfo(),
  8429. LD->getMemoryVT(), LD->getAlign(),
  8430. LD->getMemOperand()->getFlags());
  8431. }
  8432. /// SkipExtensionForVMULL - For a node that is a SIGN_EXTEND, ZERO_EXTEND,
  8433. /// ANY_EXTEND, extending load, or BUILD_VECTOR with extended elements, return
  8434. /// the unextended value. The unextended vector should be 64 bits so that it can
  8435. /// be used as an operand to a VMULL instruction. If the original vector size
8436. /// before extension is less than 64 bits we add an extension to resize
  8437. /// the vector to 64 bits.
  8438. static SDValue SkipExtensionForVMULL(SDNode *N, SelectionDAG &DAG) {
  8439. if (N->getOpcode() == ISD::SIGN_EXTEND ||
  8440. N->getOpcode() == ISD::ZERO_EXTEND || N->getOpcode() == ISD::ANY_EXTEND)
  8441. return AddRequiredExtensionForVMULL(N->getOperand(0), DAG,
  8442. N->getOperand(0)->getValueType(0),
  8443. N->getValueType(0),
  8444. N->getOpcode());
  8445. if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
  8446. assert((ISD::isSEXTLoad(LD) || ISD::isZEXTLoad(LD)) &&
  8447. "Expected extending load");
  8448. SDValue newLoad = SkipLoadExtensionForVMULL(LD, DAG);
  8449. DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), newLoad.getValue(1));
  8450. unsigned Opcode = ISD::isSEXTLoad(LD) ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
  8451. SDValue extLoad =
  8452. DAG.getNode(Opcode, SDLoc(newLoad), LD->getValueType(0), newLoad);
  8453. DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 0), extLoad);
  8454. return newLoad;
  8455. }
  8456. // Otherwise, the value must be a BUILD_VECTOR. For v2i64, it will
  8457. // have been legalized as a BITCAST from v4i32.
  8458. if (N->getOpcode() == ISD::BITCAST) {
  8459. SDNode *BVN = N->getOperand(0).getNode();
  8460. assert(BVN->getOpcode() == ISD::BUILD_VECTOR &&
  8461. BVN->getValueType(0) == MVT::v4i32 && "expected v4i32 BUILD_VECTOR");
  8462. unsigned LowElt = DAG.getDataLayout().isBigEndian() ? 1 : 0;
  8463. return DAG.getBuildVector(
  8464. MVT::v2i32, SDLoc(N),
  8465. {BVN->getOperand(LowElt), BVN->getOperand(LowElt + 2)});
  8466. }
  8467. // Construct a new BUILD_VECTOR with elements truncated to half the size.
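// For example, a constant v4i32 BUILD_VECTOR whose elements all fit in 16
// bits is rebuilt here as a v4i16 BUILD_VECTOR (using i32 operands, which are
// implicitly truncated).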
  8468. assert(N->getOpcode() == ISD::BUILD_VECTOR && "expected BUILD_VECTOR");
  8469. EVT VT = N->getValueType(0);
  8470. unsigned EltSize = VT.getScalarSizeInBits() / 2;
  8471. unsigned NumElts = VT.getVectorNumElements();
  8472. MVT TruncVT = MVT::getIntegerVT(EltSize);
  8473. SmallVector<SDValue, 8> Ops;
  8474. SDLoc dl(N);
  8475. for (unsigned i = 0; i != NumElts; ++i) {
  8476. ConstantSDNode *C = cast<ConstantSDNode>(N->getOperand(i));
  8477. const APInt &CInt = C->getAPIntValue();
  8478. // Element types smaller than 32 bits are not legal, so use i32 elements.
  8479. // The values are implicitly truncated so sext vs. zext doesn't matter.
  8480. Ops.push_back(DAG.getConstant(CInt.zextOrTrunc(32), dl, MVT::i32));
  8481. }
  8482. return DAG.getBuildVector(MVT::getVectorVT(TruncVT, NumElts), dl, Ops);
  8483. }
  8484. static bool isAddSubSExt(SDNode *N, SelectionDAG &DAG) {
  8485. unsigned Opcode = N->getOpcode();
  8486. if (Opcode == ISD::ADD || Opcode == ISD::SUB) {
  8487. SDNode *N0 = N->getOperand(0).getNode();
  8488. SDNode *N1 = N->getOperand(1).getNode();
  8489. return N0->hasOneUse() && N1->hasOneUse() &&
  8490. isSignExtended(N0, DAG) && isSignExtended(N1, DAG);
  8491. }
  8492. return false;
  8493. }
  8494. static bool isAddSubZExt(SDNode *N, SelectionDAG &DAG) {
  8495. unsigned Opcode = N->getOpcode();
  8496. if (Opcode == ISD::ADD || Opcode == ISD::SUB) {
  8497. SDNode *N0 = N->getOperand(0).getNode();
  8498. SDNode *N1 = N->getOperand(1).getNode();
  8499. return N0->hasOneUse() && N1->hasOneUse() &&
  8500. isZeroExtended(N0, DAG) && isZeroExtended(N1, DAG);
  8501. }
  8502. return false;
  8503. }
  8504. static SDValue LowerMUL(SDValue Op, SelectionDAG &DAG) {
  8505. // Multiplications are only custom-lowered for 128-bit vectors so that
  8506. // VMULL can be detected. Otherwise v2i64 multiplications are not legal.
  8507. EVT VT = Op.getValueType();
  8508. assert(VT.is128BitVector() && VT.isInteger() &&
  8509. "unexpected type for custom-lowering ISD::MUL");
  8510. SDNode *N0 = Op.getOperand(0).getNode();
  8511. SDNode *N1 = Op.getOperand(1).getNode();
  8512. unsigned NewOpc = 0;
  8513. bool isMLA = false;
  8514. bool isN0SExt = isSignExtended(N0, DAG);
  8515. bool isN1SExt = isSignExtended(N1, DAG);
  8516. if (isN0SExt && isN1SExt)
  8517. NewOpc = ARMISD::VMULLs;
  8518. else {
  8519. bool isN0ZExt = isZeroExtended(N0, DAG);
  8520. bool isN1ZExt = isZeroExtended(N1, DAG);
  8521. if (isN0ZExt && isN1ZExt)
  8522. NewOpc = ARMISD::VMULLu;
  8523. else if (isN1SExt || isN1ZExt) {
  8524. // Look for (s/zext A + s/zext B) * (s/zext C). We want to turn these
  8525. // into (s/zext A * s/zext C) + (s/zext B * s/zext C)
  8526. if (isN1SExt && isAddSubSExt(N0, DAG)) {
  8527. NewOpc = ARMISD::VMULLs;
  8528. isMLA = true;
  8529. } else if (isN1ZExt && isAddSubZExt(N0, DAG)) {
  8530. NewOpc = ARMISD::VMULLu;
  8531. isMLA = true;
  8532. } else if (isN0ZExt && isAddSubZExt(N1, DAG)) {
  8533. std::swap(N0, N1);
  8534. NewOpc = ARMISD::VMULLu;
  8535. isMLA = true;
  8536. }
  8537. }
  8538. if (!NewOpc) {
  8539. if (VT == MVT::v2i64)
  8540. // Fall through to expand this. It is not legal.
  8541. return SDValue();
  8542. else
  8543. // Other vector multiplications are legal.
  8544. return Op;
  8545. }
  8546. }
  8547. // Legalize to a VMULL instruction.
  8548. SDLoc DL(Op);
  8549. SDValue Op0;
  8550. SDValue Op1 = SkipExtensionForVMULL(N1, DAG);
  8551. if (!isMLA) {
  8552. Op0 = SkipExtensionForVMULL(N0, DAG);
  8553. assert(Op0.getValueType().is64BitVector() &&
  8554. Op1.getValueType().is64BitVector() &&
  8555. "unexpected types for extended operands to VMULL");
  8556. return DAG.getNode(NewOpc, DL, VT, Op0, Op1);
  8557. }
8558. // Optimizing (zext A + zext B) * C to (VMULL A, C) + (VMULL B, C) during
  8559. // isel lowering to take advantage of no-stall back to back vmul + vmla.
  8560. // vmull q0, d4, d6
  8561. // vmlal q0, d5, d6
  8562. // is faster than
  8563. // vaddl q0, d4, d5
  8564. // vmovl q1, d6
  8565. // vmul q0, q0, q1
  8566. SDValue N00 = SkipExtensionForVMULL(N0->getOperand(0).getNode(), DAG);
  8567. SDValue N01 = SkipExtensionForVMULL(N0->getOperand(1).getNode(), DAG);
  8568. EVT Op1VT = Op1.getValueType();
  8569. return DAG.getNode(N0->getOpcode(), DL, VT,
  8570. DAG.getNode(NewOpc, DL, VT,
  8571. DAG.getNode(ISD::BITCAST, DL, Op1VT, N00), Op1),
  8572. DAG.getNode(NewOpc, DL, VT,
  8573. DAG.getNode(ISD::BITCAST, DL, Op1VT, N01), Op1));
  8574. }
  8575. static SDValue LowerSDIV_v4i8(SDValue X, SDValue Y, const SDLoc &dl,
  8576. SelectionDAG &DAG) {
  8577. // TODO: Should this propagate fast-math-flags?
  8578. // Convert to float
  8579. // float4 xf = vcvt_f32_s32(vmovl_s16(a.lo));
  8580. // float4 yf = vcvt_f32_s32(vmovl_s16(b.lo));
  8581. X = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, X);
  8582. Y = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, Y);
  8583. X = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, X);
  8584. Y = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, Y);
  8585. // Get reciprocal estimate.
  8586. // float4 recip = vrecpeq_f32(yf);
  8587. Y = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
  8588. DAG.getConstant(Intrinsic::arm_neon_vrecpe, dl, MVT::i32),
  8589. Y);
  8590. // Because char has a smaller range than uchar, we can actually get away
  8591. // without any newton steps. This requires that we use a weird bias
  8592. // of 0xb000, however (again, this has been exhaustively tested).
  8593. // float4 result = as_float4(as_int4(xf*recip) + 0xb000);
  8594. X = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, X, Y);
  8595. X = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, X);
  8596. Y = DAG.getConstant(0xb000, dl, MVT::v4i32);
  8597. X = DAG.getNode(ISD::ADD, dl, MVT::v4i32, X, Y);
  8598. X = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, X);
  8599. // Convert back to short.
  8600. X = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::v4i32, X);
  8601. X = DAG.getNode(ISD::TRUNCATE, dl, MVT::v4i16, X);
  8602. return X;
  8603. }
static SDValue LowerSDIV_v4i16(SDValue N0, SDValue N1, const SDLoc &dl,
                               SelectionDAG &DAG) {
  // TODO: Should this propagate fast-math-flags?
  SDValue N2;
  // Convert to float.
  // float4 yf = vcvt_f32_s32(vmovl_s16(y));
  // float4 xf = vcvt_f32_s32(vmovl_s16(x));
  N0 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, N0);
  N1 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, N1);
  N0 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, N0);
  N1 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, N1);
  // Use reciprocal estimate and one refinement step.
  // float4 recip = vrecpeq_f32(yf);
  // recip *= vrecpsq_f32(yf, recip);
  N2 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
                   DAG.getConstant(Intrinsic::arm_neon_vrecpe, dl, MVT::i32),
                   N1);
  N1 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
                   DAG.getConstant(Intrinsic::arm_neon_vrecps, dl, MVT::i32),
                   N1, N2);
  N2 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N1, N2);
  // Because short has a smaller range than ushort, we can actually get away
  // with only a single newton step. This requires that we use a weird bias
  // of 0x89, however (again, this has been exhaustively tested).
  // float4 result = as_float4(as_int4(xf*recip) + 0x89);
  N0 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N0, N2);
  N0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, N0);
  N1 = DAG.getConstant(0x89, dl, MVT::v4i32);
  N0 = DAG.getNode(ISD::ADD, dl, MVT::v4i32, N0, N1);
  N0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, N0);
  // Convert back to integer and return.
  // return vmovn_s32(vcvt_s32_f32(result));
  N0 = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::v4i32, N0);
  N0 = DAG.getNode(ISD::TRUNCATE, dl, MVT::v4i16, N0);
  return N0;
}
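// Custom lowering for ISD::SDIV on v8i8 and v4i16. A v8i8 divide is
// sign-extended to v8i16, split into two v4i16 halves that go through
// LowerSDIV_v4i8, and the results are concatenated and truncated back.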
  8640. static SDValue LowerSDIV(SDValue Op, SelectionDAG &DAG,
  8641. const ARMSubtarget *ST) {
  8642. EVT VT = Op.getValueType();
  8643. assert((VT == MVT::v4i16 || VT == MVT::v8i8) &&
  8644. "unexpected type for custom-lowering ISD::SDIV");
  8645. SDLoc dl(Op);
  8646. SDValue N0 = Op.getOperand(0);
  8647. SDValue N1 = Op.getOperand(1);
  8648. SDValue N2, N3;
  8649. if (VT == MVT::v8i8) {
  8650. N0 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i16, N0);
  8651. N1 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i16, N1);
  8652. N2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N0,
  8653. DAG.getIntPtrConstant(4, dl));
  8654. N3 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N1,
  8655. DAG.getIntPtrConstant(4, dl));
  8656. N0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N0,
  8657. DAG.getIntPtrConstant(0, dl));
  8658. N1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N1,
  8659. DAG.getIntPtrConstant(0, dl));
  8660. N0 = LowerSDIV_v4i8(N0, N1, dl, DAG); // v4i16
  8661. N2 = LowerSDIV_v4i8(N2, N3, dl, DAG); // v4i16
  8662. N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i16, N0, N2);
  8663. N0 = LowerCONCAT_VECTORS(N0, DAG, ST);
  8664. N0 = DAG.getNode(ISD::TRUNCATE, dl, MVT::v8i8, N0);
  8665. return N0;
  8666. }
  8667. return LowerSDIV_v4i16(N0, N1, dl, DAG);
  8668. }
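// Custom lowering for ISD::UDIV on v8i8 and v4i16. The structure mirrors
// LowerSDIV, except that the inputs are zero-extended, the v4i16 path uses two
// reciprocal refinement steps, and the v8i8 result is narrowed with a
// saturating vqmovn.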
  8669. static SDValue LowerUDIV(SDValue Op, SelectionDAG &DAG,
  8670. const ARMSubtarget *ST) {
  8671. // TODO: Should this propagate fast-math-flags?
  8672. EVT VT = Op.getValueType();
  8673. assert((VT == MVT::v4i16 || VT == MVT::v8i8) &&
  8674. "unexpected type for custom-lowering ISD::UDIV");
  8675. SDLoc dl(Op);
  8676. SDValue N0 = Op.getOperand(0);
  8677. SDValue N1 = Op.getOperand(1);
  8678. SDValue N2, N3;
  8679. if (VT == MVT::v8i8) {
  8680. N0 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v8i16, N0);
  8681. N1 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v8i16, N1);
  8682. N2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N0,
  8683. DAG.getIntPtrConstant(4, dl));
  8684. N3 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N1,
  8685. DAG.getIntPtrConstant(4, dl));
  8686. N0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N0,
  8687. DAG.getIntPtrConstant(0, dl));
  8688. N1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N1,
  8689. DAG.getIntPtrConstant(0, dl));
  8690. N0 = LowerSDIV_v4i16(N0, N1, dl, DAG); // v4i16
  8691. N2 = LowerSDIV_v4i16(N2, N3, dl, DAG); // v4i16
  8692. N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i16, N0, N2);
  8693. N0 = LowerCONCAT_VECTORS(N0, DAG, ST);
  8694. N0 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v8i8,
  8695. DAG.getConstant(Intrinsic::arm_neon_vqmovnsu, dl,
  8696. MVT::i32),
  8697. N0);
  8698. return N0;
  8699. }
8700. // v4i16 udiv ... Convert to float.
  8701. // float4 yf = vcvt_f32_s32(vmovl_u16(y));
  8702. // float4 xf = vcvt_f32_s32(vmovl_u16(x));
  8703. N0 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v4i32, N0);
  8704. N1 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v4i32, N1);
  8705. N0 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, N0);
  8706. SDValue BN1 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, N1);
  8707. // Use reciprocal estimate and two refinement steps.
  8708. // float4 recip = vrecpeq_f32(yf);
  8709. // recip *= vrecpsq_f32(yf, recip);
  8710. // recip *= vrecpsq_f32(yf, recip);
  8711. N2 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
  8712. DAG.getConstant(Intrinsic::arm_neon_vrecpe, dl, MVT::i32),
  8713. BN1);
  8714. N1 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
  8715. DAG.getConstant(Intrinsic::arm_neon_vrecps, dl, MVT::i32),
  8716. BN1, N2);
  8717. N2 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N1, N2);
  8718. N1 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
  8719. DAG.getConstant(Intrinsic::arm_neon_vrecps, dl, MVT::i32),
  8720. BN1, N2);
  8721. N2 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N1, N2);
  8722. // Simply multiplying by the reciprocal estimate can leave us a few ulps
  8723. // too low, so we add 2 ulps (exhaustive testing shows that this is enough,
  8724. // and that it will never cause us to return an answer too large).
  8725. // float4 result = as_float4(as_int4(xf*recip) + 2);
  8726. N0 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N0, N2);
  8727. N0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, N0);
  8728. N1 = DAG.getConstant(2, dl, MVT::v4i32);
  8729. N0 = DAG.getNode(ISD::ADD, dl, MVT::v4i32, N0, N1);
  8730. N0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, N0);
  8731. // Convert back to integer and return.
  8732. // return vmovn_u32(vcvt_s32_f32(result));
  8733. N0 = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::v4i32, N0);
  8734. N0 = DAG.getNode(ISD::TRUNCATE, dl, MVT::v4i16, N0);
  8735. return N0;
  8736. }
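// Lower ISD::ADDCARRY/ISD::SUBCARRY to ARMISD::ADDE/ARMISD::SUBE. The generic
// nodes use a boolean carry, the ARM nodes use the flags register, and SUBE
// consumes/produces a carry rather than a borrow, so roughly:
//   addcarry(a, b, c) -> ADDE(a, b, bool-to-flag(c)),     carry-out as bool
//   subcarry(a, b, w) -> SUBE(a, b, bool-to-flag(1 - w)), borrow-out = 1 - carry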
  8737. static SDValue LowerADDSUBCARRY(SDValue Op, SelectionDAG &DAG) {
  8738. SDNode *N = Op.getNode();
  8739. EVT VT = N->getValueType(0);
  8740. SDVTList VTs = DAG.getVTList(VT, MVT::i32);
  8741. SDValue Carry = Op.getOperand(2);
  8742. SDLoc DL(Op);
  8743. SDValue Result;
  8744. if (Op.getOpcode() == ISD::ADDCARRY) {
  8745. // This converts the boolean value carry into the carry flag.
  8746. Carry = ConvertBooleanCarryToCarryFlag(Carry, DAG);
  8747. // Do the addition proper using the carry flag we wanted.
  8748. Result = DAG.getNode(ARMISD::ADDE, DL, VTs, Op.getOperand(0),
  8749. Op.getOperand(1), Carry);
  8750. // Now convert the carry flag into a boolean value.
  8751. Carry = ConvertCarryFlagToBooleanCarry(Result.getValue(1), VT, DAG);
  8752. } else {
  8753. // ARMISD::SUBE expects a carry not a borrow like ISD::SUBCARRY so we
  8754. // have to invert the carry first.
  8755. Carry = DAG.getNode(ISD::SUB, DL, MVT::i32,
  8756. DAG.getConstant(1, DL, MVT::i32), Carry);
  8757. // This converts the boolean value carry into the carry flag.
  8758. Carry = ConvertBooleanCarryToCarryFlag(Carry, DAG);
  8759. // Do the subtraction proper using the carry flag we wanted.
  8760. Result = DAG.getNode(ARMISD::SUBE, DL, VTs, Op.getOperand(0),
  8761. Op.getOperand(1), Carry);
  8762. // Now convert the carry flag into a boolean value.
  8763. Carry = ConvertCarryFlagToBooleanCarry(Result.getValue(1), VT, DAG);
  8764. // But the carry returned by ARMISD::SUBE is not a borrow as expected
  8765. // by ISD::SUBCARRY, so compute 1 - C.
  8766. Carry = DAG.getNode(ISD::SUB, DL, MVT::i32,
  8767. DAG.getConstant(1, DL, MVT::i32), Carry);
  8768. }
  8769. // Return both values.
  8770. return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Result, Carry);
  8771. }
  8772. SDValue ARMTargetLowering::LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const {
  8773. assert(Subtarget->isTargetDarwin());
8774. // For iOS, we want to call an alternative entry point: __sincos_stret,
8775. // whose return values are passed via sret.
  8776. SDLoc dl(Op);
  8777. SDValue Arg = Op.getOperand(0);
  8778. EVT ArgVT = Arg.getValueType();
  8779. Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
  8780. auto PtrVT = getPointerTy(DAG.getDataLayout());
  8781. MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
  8782. const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  8783. // Pair of floats / doubles used to pass the result.
  8784. Type *RetTy = StructType::get(ArgTy, ArgTy);
  8785. auto &DL = DAG.getDataLayout();
  8786. ArgListTy Args;
  8787. bool ShouldUseSRet = Subtarget->isAPCS_ABI();
  8788. SDValue SRet;
  8789. if (ShouldUseSRet) {
  8790. // Create stack object for sret.
  8791. const uint64_t ByteSize = DL.getTypeAllocSize(RetTy);
  8792. const Align StackAlign = DL.getPrefTypeAlign(RetTy);
  8793. int FrameIdx = MFI.CreateStackObject(ByteSize, StackAlign, false);
  8794. SRet = DAG.getFrameIndex(FrameIdx, TLI.getPointerTy(DL));
  8795. ArgListEntry Entry;
  8796. Entry.Node = SRet;
  8797. Entry.Ty = RetTy->getPointerTo();
  8798. Entry.IsSExt = false;
  8799. Entry.IsZExt = false;
  8800. Entry.IsSRet = true;
  8801. Args.push_back(Entry);
  8802. RetTy = Type::getVoidTy(*DAG.getContext());
  8803. }
  8804. ArgListEntry Entry;
  8805. Entry.Node = Arg;
  8806. Entry.Ty = ArgTy;
  8807. Entry.IsSExt = false;
  8808. Entry.IsZExt = false;
  8809. Args.push_back(Entry);
  8810. RTLIB::Libcall LC =
  8811. (ArgVT == MVT::f64) ? RTLIB::SINCOS_STRET_F64 : RTLIB::SINCOS_STRET_F32;
  8812. const char *LibcallName = getLibcallName(LC);
  8813. CallingConv::ID CC = getLibcallCallingConv(LC);
  8814. SDValue Callee = DAG.getExternalSymbol(LibcallName, getPointerTy(DL));
  8815. TargetLowering::CallLoweringInfo CLI(DAG);
  8816. CLI.setDebugLoc(dl)
  8817. .setChain(DAG.getEntryNode())
  8818. .setCallee(CC, RetTy, Callee, std::move(Args))
  8819. .setDiscardResult(ShouldUseSRet);
  8820. std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
  8821. if (!ShouldUseSRet)
  8822. return CallResult.first;
  8823. SDValue LoadSin =
  8824. DAG.getLoad(ArgVT, dl, CallResult.second, SRet, MachinePointerInfo());
  8825. // Address of cos field.
  8826. SDValue Add = DAG.getNode(ISD::ADD, dl, PtrVT, SRet,
  8827. DAG.getIntPtrConstant(ArgVT.getStoreSize(), dl));
  8828. SDValue LoadCos =
  8829. DAG.getLoad(ArgVT, dl, LoadSin.getValue(1), Add, MachinePointerInfo());
  8830. SDVTList Tys = DAG.getVTList(ArgVT, ArgVT);
  8831. return DAG.getNode(ISD::MERGE_VALUES, dl, Tys,
  8832. LoadSin.getValue(0), LoadCos.getValue(0));
  8833. }
  8834. SDValue ARMTargetLowering::LowerWindowsDIVLibCall(SDValue Op, SelectionDAG &DAG,
  8835. bool Signed,
  8836. SDValue &Chain) const {
  8837. EVT VT = Op.getValueType();
  8838. assert((VT == MVT::i32 || VT == MVT::i64) &&
  8839. "unexpected type for custom lowering DIV");
  8840. SDLoc dl(Op);
  8841. const auto &DL = DAG.getDataLayout();
  8842. const auto &TLI = DAG.getTargetLoweringInfo();
  8843. const char *Name = nullptr;
  8844. if (Signed)
  8845. Name = (VT == MVT::i32) ? "__rt_sdiv" : "__rt_sdiv64";
  8846. else
  8847. Name = (VT == MVT::i32) ? "__rt_udiv" : "__rt_udiv64";
  8848. SDValue ES = DAG.getExternalSymbol(Name, TLI.getPointerTy(DL));
  8849. ARMTargetLowering::ArgListTy Args;
  8850. for (auto AI : {1, 0}) {
  8851. ArgListEntry Arg;
  8852. Arg.Node = Op.getOperand(AI);
  8853. Arg.Ty = Arg.Node.getValueType().getTypeForEVT(*DAG.getContext());
  8854. Args.push_back(Arg);
  8855. }
  8856. CallLoweringInfo CLI(DAG);
  8857. CLI.setDebugLoc(dl)
  8858. .setChain(Chain)
  8859. .setCallee(CallingConv::ARM_AAPCS_VFP, VT.getTypeForEVT(*DAG.getContext()),
  8860. ES, std::move(Args));
  8861. return LowerCallTo(CLI).first;
  8862. }
  8863. // This is a code size optimisation: return the original SDIV node to
  8864. // DAGCombiner when we don't want to expand SDIV into a sequence of
  8865. // instructions, and an empty node otherwise which will cause the
  8866. // SDIV to be expanded in DAGCombine.
  8867. SDValue
  8868. ARMTargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor,
  8869. SelectionDAG &DAG,
  8870. SmallVectorImpl<SDNode *> &Created) const {
  8871. // TODO: Support SREM
  8872. if (N->getOpcode() != ISD::SDIV)
  8873. return SDValue();
  8874. const auto &ST = DAG.getSubtarget<ARMSubtarget>();
  8875. const bool MinSize = ST.hasMinSize();
  8876. const bool HasDivide = ST.isThumb() ? ST.hasDivideInThumbMode()
  8877. : ST.hasDivideInARMMode();
  8878. // Don't touch vector types; rewriting this may lead to scalarizing
  8879. // the int divs.
  8880. if (N->getOperand(0).getValueType().isVector())
  8881. return SDValue();
8882. // Bail if MinSize is not set; for both ARM and Thumb mode we also need
8883. // hwdiv support for this to be really profitable.
  8884. if (!(MinSize && HasDivide))
  8885. return SDValue();
  8886. // ARM mode is a bit simpler than Thumb: we can handle large power
  8887. // of 2 immediates with 1 mov instruction; no further checks required,
  8888. // just return the sdiv node.
  8889. if (!ST.isThumb())
  8890. return SDValue(N, 0);
8891. // In Thumb mode, immediates larger than 128 need a wide 4-byte MOV,
8892. // which loses the code-size benefit of a 2-byte MOVS.
8893. // TargetTransformInfo's 'getIntImmCodeSizeCost' could be helpful here,
8894. // but as this check does exactly that, it's not worth the trouble to get TTI.
  8895. if (Divisor.sgt(128))
  8896. return SDValue();
  8897. return SDValue(N, 0);
  8898. }
  8899. SDValue ARMTargetLowering::LowerDIV_Windows(SDValue Op, SelectionDAG &DAG,
  8900. bool Signed) const {
  8901. assert(Op.getValueType() == MVT::i32 &&
  8902. "unexpected type for custom lowering DIV");
  8903. SDLoc dl(Op);
  8904. SDValue DBZCHK = DAG.getNode(ARMISD::WIN__DBZCHK, dl, MVT::Other,
  8905. DAG.getEntryNode(), Op.getOperand(1));
  8906. return LowerWindowsDIVLibCall(Op, DAG, Signed, DBZCHK);
  8907. }
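// Chain a WIN__DBZCHK node that traps if the divisor (operand 1 of N) is zero.
// For i64 divisors the two 32-bit halves are ORed together first so the check
// operates on a single i32 value.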
  8908. static SDValue WinDBZCheckDenominator(SelectionDAG &DAG, SDNode *N, SDValue InChain) {
  8909. SDLoc DL(N);
  8910. SDValue Op = N->getOperand(1);
  8911. if (N->getValueType(0) == MVT::i32)
  8912. return DAG.getNode(ARMISD::WIN__DBZCHK, DL, MVT::Other, InChain, Op);
  8913. SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, Op,
  8914. DAG.getConstant(0, DL, MVT::i32));
  8915. SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, Op,
  8916. DAG.getConstant(1, DL, MVT::i32));
  8917. return DAG.getNode(ARMISD::WIN__DBZCHK, DL, MVT::Other, InChain,
  8918. DAG.getNode(ISD::OR, DL, MVT::i32, Lo, Hi));
  8919. }
  8920. void ARMTargetLowering::ExpandDIV_Windows(
  8921. SDValue Op, SelectionDAG &DAG, bool Signed,
  8922. SmallVectorImpl<SDValue> &Results) const {
  8923. const auto &DL = DAG.getDataLayout();
  8924. const auto &TLI = DAG.getTargetLoweringInfo();
  8925. assert(Op.getValueType() == MVT::i64 &&
  8926. "unexpected type for custom lowering DIV");
  8927. SDLoc dl(Op);
  8928. SDValue DBZCHK = WinDBZCheckDenominator(DAG, Op.getNode(), DAG.getEntryNode());
  8929. SDValue Result = LowerWindowsDIVLibCall(Op, DAG, Signed, DBZCHK);
  8930. SDValue Lower = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Result);
  8931. SDValue Upper = DAG.getNode(ISD::SRL, dl, MVT::i64, Result,
  8932. DAG.getConstant(32, dl, TLI.getPointerTy(DL)));
  8933. Upper = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Upper);
  8934. Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lower, Upper));
  8935. }
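// Lower a load of an MVE predicate type (v2i1/v4i1/v8i1/v16i1) by loading the
// underlying integer and casting it back to a predicate with
// ARMISD::PREDICATE_CAST, fixing up the lane order for big-endian targets.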
static SDValue LowerPredicateLoad(SDValue Op, SelectionDAG &DAG) {
  LoadSDNode *LD = cast<LoadSDNode>(Op.getNode());
  EVT MemVT = LD->getMemoryVT();
  assert((MemVT == MVT::v2i1 || MemVT == MVT::v4i1 || MemVT == MVT::v8i1 ||
          MemVT == MVT::v16i1) &&
         "Expected a predicate type!");
  assert(MemVT == Op.getValueType());
  assert(LD->getExtensionType() == ISD::NON_EXTLOAD &&
         "Expected a non-extending load");
  assert(LD->isUnindexed() && "Expected an unindexed load");
  // The basic MVE VLDR on a v2i1/v4i1/v8i1 actually loads the entire 16-bit
  // predicate, with the "v4i1" bits spread out over the 16 bits loaded. We
  // need to make sure that 8/4/2 bits are actually loaded into the correct
  // place, which means loading the value and then shuffling the values into
  // the bottom bits of the predicate.
  // Equally, VLDR for a v16i1 will actually load 32 bits (so will be incorrect
  // for BE).
  // For big-endian targets, the rest of LLVM expects the reverse order to a
  // natural VMSR(load), so the loaded value needs to be reversed.
  SDLoc dl(Op);
  SDValue Load = DAG.getExtLoad(
      ISD::EXTLOAD, dl, MVT::i32, LD->getChain(), LD->getBasePtr(),
      EVT::getIntegerVT(*DAG.getContext(), MemVT.getSizeInBits()),
      LD->getMemOperand());
  SDValue Val = Load;
  if (DAG.getDataLayout().isBigEndian())
    Val = DAG.getNode(ISD::SRL, dl, MVT::i32,
                      DAG.getNode(ISD::BITREVERSE, dl, MVT::i32, Load),
                      DAG.getConstant(32 - MemVT.getSizeInBits(), dl, MVT::i32));
  SDValue Pred = DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::v16i1, Val);
  if (MemVT != MVT::v16i1)
    Pred = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MemVT, Pred,
                       DAG.getConstant(0, dl, MVT::i32));
  return DAG.getMergeValues({Pred, Load.getValue(1)}, dl);
}
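// Custom lowering used from ReplaceNodeResults: volatile i64 loads on targets
// with LDRD (v5TE and later, not Thumb1) are turned into an ARMISD::LDRD
// memory intrinsic returning two i32 halves.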
  8971. void ARMTargetLowering::LowerLOAD(SDNode *N, SmallVectorImpl<SDValue> &Results,
  8972. SelectionDAG &DAG) const {
  8973. LoadSDNode *LD = cast<LoadSDNode>(N);
  8974. EVT MemVT = LD->getMemoryVT();
  8975. assert(LD->isUnindexed() && "Loads should be unindexed at this point.");
  8976. if (MemVT == MVT::i64 && Subtarget->hasV5TEOps() &&
  8977. !Subtarget->isThumb1Only() && LD->isVolatile()) {
  8978. SDLoc dl(N);
  8979. SDValue Result = DAG.getMemIntrinsicNode(
  8980. ARMISD::LDRD, dl, DAG.getVTList({MVT::i32, MVT::i32, MVT::Other}),
  8981. {LD->getChain(), LD->getBasePtr()}, MemVT, LD->getMemOperand());
  8982. SDValue Lo = Result.getValue(DAG.getDataLayout().isLittleEndian() ? 0 : 1);
  8983. SDValue Hi = Result.getValue(DAG.getDataLayout().isLittleEndian() ? 1 : 0);
  8984. SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lo, Hi);
  8985. Results.append({Pair, Result.getValue(2)});
  8986. }
  8987. }
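// Lower a store of an MVE predicate type by casting the predicate to an i32
// with ARMISD::PREDICATE_CAST and emitting a truncating scalar store of the
// relevant 2/4/8/16 bits.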
static SDValue LowerPredicateStore(SDValue Op, SelectionDAG &DAG) {
  StoreSDNode *ST = cast<StoreSDNode>(Op.getNode());
  EVT MemVT = ST->getMemoryVT();
  assert((MemVT == MVT::v2i1 || MemVT == MVT::v4i1 || MemVT == MVT::v8i1 ||
          MemVT == MVT::v16i1) &&
         "Expected a predicate type!");
  assert(MemVT == ST->getValue().getValueType());
  assert(!ST->isTruncatingStore() && "Expected a non-truncating store");
  assert(ST->isUnindexed() && "Expected an unindexed store");
  // Only store the v2i1/v4i1/v8i1 worth of bits, via a BUILD_VECTOR with the
  // top bits unset and a scalar truncating store.
  SDLoc dl(Op);
  SDValue Build = ST->getValue();
  if (MemVT != MVT::v16i1) {
    SmallVector<SDValue, 16> Ops;
    for (unsigned I = 0; I < MemVT.getVectorNumElements(); I++) {
      unsigned Elt = DAG.getDataLayout().isBigEndian()
                         ? MemVT.getVectorNumElements() - I - 1
                         : I;
      Ops.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, Build,
                                DAG.getConstant(Elt, dl, MVT::i32)));
    }
    for (unsigned I = MemVT.getVectorNumElements(); I < 16; I++)
      Ops.push_back(DAG.getUNDEF(MVT::i32));
    Build = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v16i1, Ops);
  }
  SDValue GRP = DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::i32, Build);
  if (MemVT == MVT::v16i1 && DAG.getDataLayout().isBigEndian())
    GRP = DAG.getNode(ISD::SRL, dl, MVT::i32,
                      DAG.getNode(ISD::BITREVERSE, dl, MVT::i32, GRP),
                      DAG.getConstant(16, dl, MVT::i32));
  return DAG.getTruncStore(
      ST->getChain(), dl, GRP, ST->getBasePtr(),
      EVT::getIntegerVT(*DAG.getContext(), MemVT.getSizeInBits()),
      ST->getMemOperand());
}
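// Custom store lowering: volatile i64 stores become ARMISD::STRD on targets
// that have it, and MVE predicate stores are forwarded to LowerPredicateStore.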
  9024. static SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG,
  9025. const ARMSubtarget *Subtarget) {
  9026. StoreSDNode *ST = cast<StoreSDNode>(Op.getNode());
  9027. EVT MemVT = ST->getMemoryVT();
  9028. assert(ST->isUnindexed() && "Stores should be unindexed at this point.");
  9029. if (MemVT == MVT::i64 && Subtarget->hasV5TEOps() &&
  9030. !Subtarget->isThumb1Only() && ST->isVolatile()) {
  9031. SDNode *N = Op.getNode();
  9032. SDLoc dl(N);
  9033. SDValue Lo = DAG.getNode(
  9034. ISD::EXTRACT_ELEMENT, dl, MVT::i32, ST->getValue(),
  9035. DAG.getTargetConstant(DAG.getDataLayout().isLittleEndian() ? 0 : 1, dl,
  9036. MVT::i32));
  9037. SDValue Hi = DAG.getNode(
  9038. ISD::EXTRACT_ELEMENT, dl, MVT::i32, ST->getValue(),
  9039. DAG.getTargetConstant(DAG.getDataLayout().isLittleEndian() ? 1 : 0, dl,
  9040. MVT::i32));
  9041. return DAG.getMemIntrinsicNode(ARMISD::STRD, dl, DAG.getVTList(MVT::Other),
  9042. {ST->getChain(), Lo, Hi, ST->getBasePtr()},
  9043. MemVT, ST->getMemOperand());
  9044. } else if (Subtarget->hasMVEIntegerOps() &&
  9045. ((MemVT == MVT::v2i1 || MemVT == MVT::v4i1 || MemVT == MVT::v8i1 ||
  9046. MemVT == MVT::v16i1))) {
  9047. return LowerPredicateStore(Op, DAG);
  9048. }
  9049. return SDValue();
  9050. }
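// Return true if N is a vector of all zeros, either as a BUILD_VECTOR of
// zeros or as a VMOVIMM of 0.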
static bool isZeroVector(SDValue N) {
  return (ISD::isBuildVectorAllZeros(N.getNode()) ||
          (N->getOpcode() == ARMISD::VMOVIMM &&
           isNullConstant(N->getOperand(0))));
}
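// MVE masked loads zero the disabled lanes, so a zero (or undef) passthru can
// use the hardware behaviour directly; any other passthru value is merged back
// in with a VSELECT on the mask.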
  9056. static SDValue LowerMLOAD(SDValue Op, SelectionDAG &DAG) {
  9057. MaskedLoadSDNode *N = cast<MaskedLoadSDNode>(Op.getNode());
  9058. MVT VT = Op.getSimpleValueType();
  9059. SDValue Mask = N->getMask();
  9060. SDValue PassThru = N->getPassThru();
  9061. SDLoc dl(Op);
  9062. if (isZeroVector(PassThru))
  9063. return Op;
  9064. // MVE Masked loads use zero as the passthru value. Here we convert undef to
  9065. // zero too, and other values are lowered to a select.
  9066. SDValue ZeroVec = DAG.getNode(ARMISD::VMOVIMM, dl, VT,
  9067. DAG.getTargetConstant(0, dl, MVT::i32));
  9068. SDValue NewLoad = DAG.getMaskedLoad(
  9069. VT, dl, N->getChain(), N->getBasePtr(), N->getOffset(), Mask, ZeroVec,
  9070. N->getMemoryVT(), N->getMemOperand(), N->getAddressingMode(),
  9071. N->getExtensionType(), N->isExpandingLoad());
  9072. SDValue Combo = NewLoad;
  9073. bool PassThruIsCastZero = (PassThru.getOpcode() == ISD::BITCAST ||
  9074. PassThru.getOpcode() == ARMISD::VECTOR_REG_CAST) &&
  9075. isZeroVector(PassThru->getOperand(0));
  9076. if (!PassThru.isUndef() && !PassThruIsCastZero)
  9077. Combo = DAG.getNode(ISD::VSELECT, dl, VT, Mask, NewLoad, PassThru);
  9078. return DAG.getMergeValues({Combo, NewLoad.getValue(1)}, dl);
  9079. }
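// Lower VECREDUCE_* operations for MVE by repeatedly combining the vector with
// a VREV of itself until at most 4 lanes remain, then folding the remaining
// lanes with scalar operations.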
  9080. static SDValue LowerVecReduce(SDValue Op, SelectionDAG &DAG,
  9081. const ARMSubtarget *ST) {
  9082. if (!ST->hasMVEIntegerOps())
  9083. return SDValue();
  9084. SDLoc dl(Op);
  9085. unsigned BaseOpcode = 0;
  9086. switch (Op->getOpcode()) {
  9087. default: llvm_unreachable("Expected VECREDUCE opcode");
  9088. case ISD::VECREDUCE_FADD: BaseOpcode = ISD::FADD; break;
  9089. case ISD::VECREDUCE_FMUL: BaseOpcode = ISD::FMUL; break;
  9090. case ISD::VECREDUCE_MUL: BaseOpcode = ISD::MUL; break;
  9091. case ISD::VECREDUCE_AND: BaseOpcode = ISD::AND; break;
  9092. case ISD::VECREDUCE_OR: BaseOpcode = ISD::OR; break;
  9093. case ISD::VECREDUCE_XOR: BaseOpcode = ISD::XOR; break;
  9094. case ISD::VECREDUCE_FMAX: BaseOpcode = ISD::FMAXNUM; break;
  9095. case ISD::VECREDUCE_FMIN: BaseOpcode = ISD::FMINNUM; break;
  9096. }
  9097. SDValue Op0 = Op->getOperand(0);
  9098. EVT VT = Op0.getValueType();
  9099. EVT EltVT = VT.getVectorElementType();
  9100. unsigned NumElts = VT.getVectorNumElements();
  9101. unsigned NumActiveLanes = NumElts;
9102. assert((NumActiveLanes == 16 || NumActiveLanes == 8 || NumActiveLanes == 4 ||
9103. NumActiveLanes == 2) &&
9104. "Only expected a power-of-2 vector size");
9105. // Use Op(X, Rev(X)) until 4 items remain. Going down to 4 vector elements
  9106. // allows us to easily extract vector elements from the lanes.
  9107. while (NumActiveLanes > 4) {
  9108. unsigned RevOpcode = NumActiveLanes == 16 ? ARMISD::VREV16 : ARMISD::VREV32;
  9109. SDValue Rev = DAG.getNode(RevOpcode, dl, VT, Op0);
  9110. Op0 = DAG.getNode(BaseOpcode, dl, VT, Op0, Rev);
  9111. NumActiveLanes /= 2;
  9112. }
  9113. SDValue Res;
  9114. if (NumActiveLanes == 4) {
9115. // The remaining 4 elements are combined sequentially
  9116. SDValue Ext0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Op0,
  9117. DAG.getConstant(0 * NumElts / 4, dl, MVT::i32));
  9118. SDValue Ext1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Op0,
  9119. DAG.getConstant(1 * NumElts / 4, dl, MVT::i32));
  9120. SDValue Ext2 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Op0,
  9121. DAG.getConstant(2 * NumElts / 4, dl, MVT::i32));
  9122. SDValue Ext3 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Op0,
  9123. DAG.getConstant(3 * NumElts / 4, dl, MVT::i32));
  9124. SDValue Res0 = DAG.getNode(BaseOpcode, dl, EltVT, Ext0, Ext1, Op->getFlags());
  9125. SDValue Res1 = DAG.getNode(BaseOpcode, dl, EltVT, Ext2, Ext3, Op->getFlags());
  9126. Res = DAG.getNode(BaseOpcode, dl, EltVT, Res0, Res1, Op->getFlags());
  9127. } else {
  9128. SDValue Ext0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Op0,
  9129. DAG.getConstant(0, dl, MVT::i32));
  9130. SDValue Ext1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Op0,
  9131. DAG.getConstant(1, dl, MVT::i32));
  9132. Res = DAG.getNode(BaseOpcode, dl, EltVT, Ext0, Ext1, Op->getFlags());
  9133. }
  9134. // Result type may be wider than element type.
  9135. if (EltVT != Op->getValueType(0))
  9136. Res = DAG.getNode(ISD::ANY_EXTEND, dl, Op->getValueType(0), Res);
  9137. return Res;
  9138. }
static SDValue LowerVecReduceF(SDValue Op, SelectionDAG &DAG,
                               const ARMSubtarget *ST) {
  if (!ST->hasMVEFloatOps())
    return SDValue();
  return LowerVecReduce(Op, DAG, ST);
}
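// Atomic loads/stores with monotonic (or weaker) ordering are legal as-is;
// anything stronger is returned as an empty SDValue and handled elsewhere.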
static SDValue LowerAtomicLoadStore(SDValue Op, SelectionDAG &DAG) {
  if (isStrongerThanMonotonic(cast<AtomicSDNode>(Op)->getSuccessOrdering()))
    // Acquire/Release load/store is not legal for targets without a dmb or
    // equivalent available.
    return SDValue();
  // Monotonic load/store is legal for all targets.
  return Op;
}
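// READCYCLECOUNTER is expanded to an MRC read of the PMU cycle counter
// (PMCCNTR: p15, c9, c13, 0), zero-extended to the i64 result type.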
  9153. static void ReplaceREADCYCLECOUNTER(SDNode *N,
  9154. SmallVectorImpl<SDValue> &Results,
  9155. SelectionDAG &DAG,
  9156. const ARMSubtarget *Subtarget) {
  9157. SDLoc DL(N);
  9158. // Under Power Management extensions, the cycle-count is:
  9159. // mrc p15, #0, <Rt>, c9, c13, #0
  9160. SDValue Ops[] = { N->getOperand(0), // Chain
  9161. DAG.getTargetConstant(Intrinsic::arm_mrc, DL, MVT::i32),
  9162. DAG.getTargetConstant(15, DL, MVT::i32),
  9163. DAG.getTargetConstant(0, DL, MVT::i32),
  9164. DAG.getTargetConstant(9, DL, MVT::i32),
  9165. DAG.getTargetConstant(13, DL, MVT::i32),
  9166. DAG.getTargetConstant(0, DL, MVT::i32)
  9167. };
  9168. SDValue Cycles32 = DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL,
  9169. DAG.getVTList(MVT::i32, MVT::Other), Ops);
  9170. Results.push_back(DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Cycles32,
  9171. DAG.getConstant(0, DL, MVT::i32)));
  9172. Results.push_back(Cycles32.getValue(1));
  9173. }
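// Pack an i64 value into an untyped GPRPair REG_SEQUENCE, placing the low and
// high halves in gsub_0/gsub_1 (swapped on big-endian targets).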
  9174. static SDValue createGPRPairNode(SelectionDAG &DAG, SDValue V) {
  9175. SDLoc dl(V.getNode());
  9176. SDValue VLo = DAG.getAnyExtOrTrunc(V, dl, MVT::i32);
  9177. SDValue VHi = DAG.getAnyExtOrTrunc(
  9178. DAG.getNode(ISD::SRL, dl, MVT::i64, V, DAG.getConstant(32, dl, MVT::i32)),
  9179. dl, MVT::i32);
  9180. bool isBigEndian = DAG.getDataLayout().isBigEndian();
  9181. if (isBigEndian)
  9182. std::swap (VLo, VHi);
  9183. SDValue RegClass =
  9184. DAG.getTargetConstant(ARM::GPRPairRegClassID, dl, MVT::i32);
  9185. SDValue SubReg0 = DAG.getTargetConstant(ARM::gsub_0, dl, MVT::i32);
  9186. SDValue SubReg1 = DAG.getTargetConstant(ARM::gsub_1, dl, MVT::i32);
  9187. const SDValue Ops[] = { RegClass, VLo, SubReg0, VHi, SubReg1 };
  9188. return SDValue(
  9189. DAG.getMachineNode(TargetOpcode::REG_SEQUENCE, dl, MVT::Untyped, Ops), 0);
  9190. }
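// Expand a 64-bit ATOMIC_CMP_SWAP into the CMP_SWAP_64 pseudo: the expected
// and new values are packed into GPRPairs and the result is split back into an
// i64 BUILD_PAIR plus the output chain.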
  9191. static void ReplaceCMP_SWAP_64Results(SDNode *N,
  9192. SmallVectorImpl<SDValue> & Results,
  9193. SelectionDAG &DAG) {
  9194. assert(N->getValueType(0) == MVT::i64 &&
  9195. "AtomicCmpSwap on types less than 64 should be legal");
  9196. SDValue Ops[] = {N->getOperand(1),
  9197. createGPRPairNode(DAG, N->getOperand(2)),
  9198. createGPRPairNode(DAG, N->getOperand(3)),
  9199. N->getOperand(0)};
  9200. SDNode *CmpSwap = DAG.getMachineNode(
  9201. ARM::CMP_SWAP_64, SDLoc(N),
  9202. DAG.getVTList(MVT::Untyped, MVT::i32, MVT::Other), Ops);
  9203. MachineMemOperand *MemOp = cast<MemSDNode>(N)->getMemOperand();
  9204. DAG.setNodeMemRefs(cast<MachineSDNode>(CmpSwap), {MemOp});
  9205. bool isBigEndian = DAG.getDataLayout().isBigEndian();
  9206. SDValue Lo =
  9207. DAG.getTargetExtractSubreg(isBigEndian ? ARM::gsub_1 : ARM::gsub_0,
  9208. SDLoc(N), MVT::i32, SDValue(CmpSwap, 0));
  9209. SDValue Hi =
  9210. DAG.getTargetExtractSubreg(isBigEndian ? ARM::gsub_0 : ARM::gsub_1,
  9211. SDLoc(N), MVT::i32, SDValue(CmpSwap, 0));
  9212. Results.push_back(DAG.getNode(ISD::BUILD_PAIR, SDLoc(N), MVT::i64, Lo, Hi));
  9213. Results.push_back(SDValue(CmpSwap, 2));
  9214. }
  9215. SDValue ARMTargetLowering::LowerFSETCC(SDValue Op, SelectionDAG &DAG) const {
  9216. SDLoc dl(Op);
  9217. EVT VT = Op.getValueType();
  9218. SDValue Chain = Op.getOperand(0);
  9219. SDValue LHS = Op.getOperand(1);
  9220. SDValue RHS = Op.getOperand(2);
  9221. ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(3))->get();
  9222. bool IsSignaling = Op.getOpcode() == ISD::STRICT_FSETCCS;
  9223. // If we don't have instructions of this float type then soften to a libcall
  9224. // and use SETCC instead.
  9225. if (isUnsupportedFloatingType(LHS.getValueType())) {
  9226. DAG.getTargetLoweringInfo().softenSetCCOperands(
  9227. DAG, LHS.getValueType(), LHS, RHS, CC, dl, LHS, RHS, Chain, IsSignaling);
  9228. if (!RHS.getNode()) {
  9229. RHS = DAG.getConstant(0, dl, LHS.getValueType());
  9230. CC = ISD::SETNE;
  9231. }
  9232. SDValue Result = DAG.getNode(ISD::SETCC, dl, VT, LHS, RHS,
  9233. DAG.getCondCode(CC));
  9234. return DAG.getMergeValues({Result, Chain}, dl);
  9235. }
  9236. ARMCC::CondCodes CondCode, CondCode2;
  9237. FPCCToARMCC(CC, CondCode, CondCode2);
  9238. // FIXME: Chain is not handled correctly here. Currently the FPSCR is implicit
  9239. // in CMPFP and CMPFPE, but instead it should be made explicit by these
  9240. // instructions using a chain instead of glue. This would also fix the problem
  9241. // here (and also in LowerSELECT_CC) where we generate two comparisons when
  9242. // CondCode2 != AL.
  9243. SDValue True = DAG.getConstant(1, dl, VT);
  9244. SDValue False = DAG.getConstant(0, dl, VT);
  9245. SDValue ARMcc = DAG.getConstant(CondCode, dl, MVT::i32);
  9246. SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
  9247. SDValue Cmp = getVFPCmp(LHS, RHS, DAG, dl, IsSignaling);
  9248. SDValue Result = getCMOV(dl, VT, False, True, ARMcc, CCR, Cmp, DAG);
  9249. if (CondCode2 != ARMCC::AL) {
  9250. ARMcc = DAG.getConstant(CondCode2, dl, MVT::i32);
  9251. Cmp = getVFPCmp(LHS, RHS, DAG, dl, IsSignaling);
  9252. Result = getCMOV(dl, VT, Result, True, ARMcc, CCR, Cmp, DAG);
  9253. }
  9254. return DAG.getMergeValues({Result, Chain}, dl);
  9255. }
  9256. SDValue ARMTargetLowering::LowerSPONENTRY(SDValue Op, SelectionDAG &DAG) const {
  9257. MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
  9258. EVT VT = getPointerTy(DAG.getDataLayout());
  9259. SDLoc DL(Op);
  9260. int FI = MFI.CreateFixedObject(4, 0, false);
  9261. return DAG.getFrameIndex(FI, VT);
  9262. }
  9263. SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
  9264. LLVM_DEBUG(dbgs() << "Lowering node: "; Op.dump());
  9265. switch (Op.getOpcode()) {
  9266. default: llvm_unreachable("Don't know how to custom lower this!");
  9267. case ISD::WRITE_REGISTER: return LowerWRITE_REGISTER(Op, DAG);
  9268. case ISD::ConstantPool: return LowerConstantPool(Op, DAG);
  9269. case ISD::BlockAddress: return LowerBlockAddress(Op, DAG);
  9270. case ISD::GlobalAddress: return LowerGlobalAddress(Op, DAG);
  9271. case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG);
  9272. case ISD::SELECT: return LowerSELECT(Op, DAG);
  9273. case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG);
  9274. case ISD::BRCOND: return LowerBRCOND(Op, DAG);
  9275. case ISD::BR_CC: return LowerBR_CC(Op, DAG);
  9276. case ISD::BR_JT: return LowerBR_JT(Op, DAG);
  9277. case ISD::VASTART: return LowerVASTART(Op, DAG);
  9278. case ISD::ATOMIC_FENCE: return LowerATOMIC_FENCE(Op, DAG, Subtarget);
  9279. case ISD::PREFETCH: return LowerPREFETCH(Op, DAG, Subtarget);
  9280. case ISD::SINT_TO_FP:
  9281. case ISD::UINT_TO_FP: return LowerINT_TO_FP(Op, DAG);
  9282. case ISD::STRICT_FP_TO_SINT:
  9283. case ISD::STRICT_FP_TO_UINT:
  9284. case ISD::FP_TO_SINT:
  9285. case ISD::FP_TO_UINT: return LowerFP_TO_INT(Op, DAG);
  9286. case ISD::FP_TO_SINT_SAT:
  9287. case ISD::FP_TO_UINT_SAT: return LowerFP_TO_INT_SAT(Op, DAG, Subtarget);
  9288. case ISD::FCOPYSIGN: return LowerFCOPYSIGN(Op, DAG);
  9289. case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG);
  9290. case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG);
  9291. case ISD::EH_SJLJ_SETJMP: return LowerEH_SJLJ_SETJMP(Op, DAG);
  9292. case ISD::EH_SJLJ_LONGJMP: return LowerEH_SJLJ_LONGJMP(Op, DAG);
  9293. case ISD::EH_SJLJ_SETUP_DISPATCH: return LowerEH_SJLJ_SETUP_DISPATCH(Op, DAG);
  9294. case ISD::INTRINSIC_VOID: return LowerINTRINSIC_VOID(Op, DAG, Subtarget);
  9295. case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG,
  9296. Subtarget);
  9297. case ISD::BITCAST: return ExpandBITCAST(Op.getNode(), DAG, Subtarget);
  9298. case ISD::SHL:
  9299. case ISD::SRL:
  9300. case ISD::SRA: return LowerShift(Op.getNode(), DAG, Subtarget);
  9301. case ISD::SREM: return LowerREM(Op.getNode(), DAG);
  9302. case ISD::UREM: return LowerREM(Op.getNode(), DAG);
  9303. case ISD::SHL_PARTS: return LowerShiftLeftParts(Op, DAG);
  9304. case ISD::SRL_PARTS:
  9305. case ISD::SRA_PARTS: return LowerShiftRightParts(Op, DAG);
  9306. case ISD::CTTZ:
  9307. case ISD::CTTZ_ZERO_UNDEF: return LowerCTTZ(Op.getNode(), DAG, Subtarget);
  9308. case ISD::CTPOP: return LowerCTPOP(Op.getNode(), DAG, Subtarget);
  9309. case ISD::SETCC: return LowerVSETCC(Op, DAG, Subtarget);
  9310. case ISD::SETCCCARRY: return LowerSETCCCARRY(Op, DAG);
  9311. case ISD::ConstantFP: return LowerConstantFP(Op, DAG, Subtarget);
  9312. case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG, Subtarget);
  9313. case ISD::VECTOR_SHUFFLE: return LowerVECTOR_SHUFFLE(Op, DAG, Subtarget);
  9314. case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op, DAG, Subtarget);
  9315. case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG);
  9316. case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG, Subtarget);
  9317. case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG, Subtarget);
  9318. case ISD::TRUNCATE: return LowerTruncate(Op.getNode(), DAG, Subtarget);
  9319. case ISD::SIGN_EXTEND:
  9320. case ISD::ZERO_EXTEND: return LowerVectorExtend(Op.getNode(), DAG, Subtarget);
  9321. case ISD::GET_ROUNDING: return LowerGET_ROUNDING(Op, DAG);
  9322. case ISD::SET_ROUNDING: return LowerSET_ROUNDING(Op, DAG);
  9323. case ISD::MUL: return LowerMUL(Op, DAG);
  9324. case ISD::SDIV:
  9325. if (Subtarget->isTargetWindows() && !Op.getValueType().isVector())
  9326. return LowerDIV_Windows(Op, DAG, /* Signed */ true);
  9327. return LowerSDIV(Op, DAG, Subtarget);
  9328. case ISD::UDIV:
  9329. if (Subtarget->isTargetWindows() && !Op.getValueType().isVector())
  9330. return LowerDIV_Windows(Op, DAG, /* Signed */ false);
  9331. return LowerUDIV(Op, DAG, Subtarget);
  9332. case ISD::ADDCARRY:
  9333. case ISD::SUBCARRY: return LowerADDSUBCARRY(Op, DAG);
  9334. case ISD::SADDO:
  9335. case ISD::SSUBO:
  9336. return LowerSignedALUO(Op, DAG);
  9337. case ISD::UADDO:
  9338. case ISD::USUBO:
  9339. return LowerUnsignedALUO(Op, DAG);
  9340. case ISD::SADDSAT:
  9341. case ISD::SSUBSAT:
  9342. case ISD::UADDSAT:
  9343. case ISD::USUBSAT:
  9344. return LowerADDSUBSAT(Op, DAG, Subtarget);
  9345. case ISD::LOAD:
  9346. return LowerPredicateLoad(Op, DAG);
  9347. case ISD::STORE:
  9348. return LowerSTORE(Op, DAG, Subtarget);
  9349. case ISD::MLOAD:
  9350. return LowerMLOAD(Op, DAG);
  9351. case ISD::VECREDUCE_MUL:
  9352. case ISD::VECREDUCE_AND:
  9353. case ISD::VECREDUCE_OR:
  9354. case ISD::VECREDUCE_XOR:
  9355. return LowerVecReduce(Op, DAG, Subtarget);
  9356. case ISD::VECREDUCE_FADD:
  9357. case ISD::VECREDUCE_FMUL:
  9358. case ISD::VECREDUCE_FMIN:
  9359. case ISD::VECREDUCE_FMAX:
  9360. return LowerVecReduceF(Op, DAG, Subtarget);
  9361. case ISD::ATOMIC_LOAD:
  9362. case ISD::ATOMIC_STORE: return LowerAtomicLoadStore(Op, DAG);
  9363. case ISD::FSINCOS: return LowerFSINCOS(Op, DAG);
  9364. case ISD::SDIVREM:
  9365. case ISD::UDIVREM: return LowerDivRem(Op, DAG);
  9366. case ISD::DYNAMIC_STACKALLOC:
  9367. if (Subtarget->isTargetWindows())
  9368. return LowerDYNAMIC_STACKALLOC(Op, DAG);
  9369. llvm_unreachable("Don't know how to custom lower this!");
  9370. case ISD::STRICT_FP_ROUND:
  9371. case ISD::FP_ROUND: return LowerFP_ROUND(Op, DAG);
  9372. case ISD::STRICT_FP_EXTEND:
  9373. case ISD::FP_EXTEND: return LowerFP_EXTEND(Op, DAG);
  9374. case ISD::STRICT_FSETCC:
  9375. case ISD::STRICT_FSETCCS: return LowerFSETCC(Op, DAG);
  9376. case ISD::SPONENTRY:
  9377. return LowerSPONENTRY(Op, DAG);
  9378. case ARMISD::WIN__DBZCHK: return SDValue();
  9379. }
  9380. }
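// Replace the arm.smlald/smlaldx/smlsld/smlsldx intrinsics, which produce an
// i64 accumulator, with the corresponding ARMISD long multiply-accumulate
// nodes operating on two i32 halves.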
  9381. static void ReplaceLongIntrinsic(SDNode *N, SmallVectorImpl<SDValue> &Results,
  9382. SelectionDAG &DAG) {
  9383. unsigned IntNo = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
  9384. unsigned Opc = 0;
  9385. if (IntNo == Intrinsic::arm_smlald)
  9386. Opc = ARMISD::SMLALD;
  9387. else if (IntNo == Intrinsic::arm_smlaldx)
  9388. Opc = ARMISD::SMLALDX;
  9389. else if (IntNo == Intrinsic::arm_smlsld)
  9390. Opc = ARMISD::SMLSLD;
  9391. else if (IntNo == Intrinsic::arm_smlsldx)
  9392. Opc = ARMISD::SMLSLDX;
  9393. else
  9394. return;
  9395. SDLoc dl(N);
  9396. SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32,
  9397. N->getOperand(3),
  9398. DAG.getConstant(0, dl, MVT::i32));
  9399. SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32,
  9400. N->getOperand(3),
  9401. DAG.getConstant(1, dl, MVT::i32));
  9402. SDValue LongMul = DAG.getNode(Opc, dl,
  9403. DAG.getVTList(MVT::i32, MVT::i32),
  9404. N->getOperand(1), N->getOperand(2),
  9405. Lo, Hi);
  9406. Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64,
  9407. LongMul.getValue(0), LongMul.getValue(1)));
  9408. }
  9409. /// ReplaceNodeResults - Replace the results of node with an illegal result
  9410. /// type with new values built out of custom code.
  9411. void ARMTargetLowering::ReplaceNodeResults(SDNode *N,
  9412. SmallVectorImpl<SDValue> &Results,
  9413. SelectionDAG &DAG) const {
  9414. SDValue Res;
  9415. switch (N->getOpcode()) {
  9416. default:
  9417. llvm_unreachable("Don't know how to custom expand this!");
  9418. case ISD::READ_REGISTER:
  9419. ExpandREAD_REGISTER(N, Results, DAG);
  9420. break;
  9421. case ISD::BITCAST:
  9422. Res = ExpandBITCAST(N, DAG, Subtarget);
  9423. break;
  9424. case ISD::SRL:
  9425. case ISD::SRA:
  9426. case ISD::SHL:
  9427. Res = Expand64BitShift(N, DAG, Subtarget);
  9428. break;
  9429. case ISD::SREM:
  9430. case ISD::UREM:
  9431. Res = LowerREM(N, DAG);
  9432. break;
  9433. case ISD::SDIVREM:
  9434. case ISD::UDIVREM:
  9435. Res = LowerDivRem(SDValue(N, 0), DAG);
  9436. assert(Res.getNumOperands() == 2 && "DivRem needs two values");
  9437. Results.push_back(Res.getValue(0));
  9438. Results.push_back(Res.getValue(1));
  9439. return;
  9440. case ISD::SADDSAT:
  9441. case ISD::SSUBSAT:
  9442. case ISD::UADDSAT:
  9443. case ISD::USUBSAT:
  9444. Res = LowerADDSUBSAT(SDValue(N, 0), DAG, Subtarget);
  9445. break;
  9446. case ISD::READCYCLECOUNTER:
  9447. ReplaceREADCYCLECOUNTER(N, Results, DAG, Subtarget);
  9448. return;
  9449. case ISD::UDIV:
  9450. case ISD::SDIV:
  9451. assert(Subtarget->isTargetWindows() && "can only expand DIV on Windows");
  9452. return ExpandDIV_Windows(SDValue(N, 0), DAG, N->getOpcode() == ISD::SDIV,
  9453. Results);
  9454. case ISD::ATOMIC_CMP_SWAP:
  9455. ReplaceCMP_SWAP_64Results(N, Results, DAG);
  9456. return;
  9457. case ISD::INTRINSIC_WO_CHAIN:
  9458. return ReplaceLongIntrinsic(N, Results, DAG);
  9459. case ISD::LOAD:
  9460. LowerLOAD(N, Results, DAG);
  9461. break;
  9462. case ISD::TRUNCATE:
  9463. Res = LowerTruncate(N, DAG, Subtarget);
  9464. break;
  9465. case ISD::SIGN_EXTEND:
  9466. case ISD::ZERO_EXTEND:
  9467. Res = LowerVectorExtend(N, DAG, Subtarget);
  9468. break;
  9469. case ISD::FP_TO_SINT_SAT:
  9470. case ISD::FP_TO_UINT_SAT:
  9471. Res = LowerFP_TO_INT_SAT(SDValue(N, 0), DAG, Subtarget);
  9472. break;
  9473. }
  9474. if (Res.getNode())
  9475. Results.push_back(Res);
  9476. }
  9477. //===----------------------------------------------------------------------===//
  9478. // ARM Scheduler Hooks
  9479. //===----------------------------------------------------------------------===//
  9480. /// SetupEntryBlockForSjLj - Insert code into the entry block that creates and
  9481. /// registers the function context.
  9482. void ARMTargetLowering::SetupEntryBlockForSjLj(MachineInstr &MI,
  9483. MachineBasicBlock *MBB,
  9484. MachineBasicBlock *DispatchBB,
  9485. int FI) const {
  9486. assert(!Subtarget->isROPI() && !Subtarget->isRWPI() &&
  9487. "ROPI/RWPI not currently supported with SjLj");
  9488. const TargetInstrInfo *TII = Subtarget->getInstrInfo();
  9489. DebugLoc dl = MI.getDebugLoc();
  9490. MachineFunction *MF = MBB->getParent();
  9491. MachineRegisterInfo *MRI = &MF->getRegInfo();
  9492. MachineConstantPool *MCP = MF->getConstantPool();
  9493. ARMFunctionInfo *AFI = MF->getInfo<ARMFunctionInfo>();
  9494. const Function &F = MF->getFunction();
  9495. bool isThumb = Subtarget->isThumb();
  9496. bool isThumb2 = Subtarget->isThumb2();
  9497. unsigned PCLabelId = AFI->createPICLabelUId();
  9498. unsigned PCAdj = (isThumb || isThumb2) ? 4 : 8;
  9499. ARMConstantPoolValue *CPV =
  9500. ARMConstantPoolMBB::Create(F.getContext(), DispatchBB, PCLabelId, PCAdj);
  9501. unsigned CPI = MCP->getConstantPoolIndex(CPV, Align(4));
  9502. const TargetRegisterClass *TRC = isThumb ? &ARM::tGPRRegClass
  9503. : &ARM::GPRRegClass;
  9504. // Grab constant pool and fixed stack memory operands.
  9505. MachineMemOperand *CPMMO =
  9506. MF->getMachineMemOperand(MachinePointerInfo::getConstantPool(*MF),
  9507. MachineMemOperand::MOLoad, 4, Align(4));
  9508. MachineMemOperand *FIMMOSt =
  9509. MF->getMachineMemOperand(MachinePointerInfo::getFixedStack(*MF, FI),
  9510. MachineMemOperand::MOStore, 4, Align(4));
  9511. // Load the address of the dispatch MBB into the jump buffer.
  9512. if (isThumb2) {
  9513. // Incoming value: jbuf
  9514. // ldr.n r5, LCPI1_1
  9515. // orr r5, r5, #1
  9516. // add r5, pc
  9517. // str r5, [$jbuf, #+4] ; &jbuf[1]
  9518. Register NewVReg1 = MRI->createVirtualRegister(TRC);
  9519. BuildMI(*MBB, MI, dl, TII->get(ARM::t2LDRpci), NewVReg1)
  9520. .addConstantPoolIndex(CPI)
  9521. .addMemOperand(CPMMO)
  9522. .add(predOps(ARMCC::AL));
  9523. // Set the low bit because of thumb mode.
  9524. Register NewVReg2 = MRI->createVirtualRegister(TRC);
  9525. BuildMI(*MBB, MI, dl, TII->get(ARM::t2ORRri), NewVReg2)
  9526. .addReg(NewVReg1, RegState::Kill)
  9527. .addImm(0x01)
  9528. .add(predOps(ARMCC::AL))
  9529. .add(condCodeOp());
  9530. Register NewVReg3 = MRI->createVirtualRegister(TRC);
  9531. BuildMI(*MBB, MI, dl, TII->get(ARM::tPICADD), NewVReg3)
  9532. .addReg(NewVReg2, RegState::Kill)
  9533. .addImm(PCLabelId);
  9534. BuildMI(*MBB, MI, dl, TII->get(ARM::t2STRi12))
  9535. .addReg(NewVReg3, RegState::Kill)
  9536. .addFrameIndex(FI)
  9537. .addImm(36) // &jbuf[1] :: pc
  9538. .addMemOperand(FIMMOSt)
  9539. .add(predOps(ARMCC::AL));
  9540. } else if (isThumb) {
  9541. // Incoming value: jbuf
  9542. // ldr.n r1, LCPI1_4
  9543. // add r1, pc
  9544. // mov r2, #1
  9545. // orrs r1, r2
  9546. // add r2, $jbuf, #+4 ; &jbuf[1]
  9547. // str r1, [r2]
  9548. Register NewVReg1 = MRI->createVirtualRegister(TRC);
  9549. BuildMI(*MBB, MI, dl, TII->get(ARM::tLDRpci), NewVReg1)
  9550. .addConstantPoolIndex(CPI)
  9551. .addMemOperand(CPMMO)
  9552. .add(predOps(ARMCC::AL));
  9553. Register NewVReg2 = MRI->createVirtualRegister(TRC);
  9554. BuildMI(*MBB, MI, dl, TII->get(ARM::tPICADD), NewVReg2)
  9555. .addReg(NewVReg1, RegState::Kill)
  9556. .addImm(PCLabelId);
  9557. // Set the low bit because of thumb mode.
  9558. Register NewVReg3 = MRI->createVirtualRegister(TRC);
  9559. BuildMI(*MBB, MI, dl, TII->get(ARM::tMOVi8), NewVReg3)
  9560. .addReg(ARM::CPSR, RegState::Define)
  9561. .addImm(1)
  9562. .add(predOps(ARMCC::AL));
  9563. Register NewVReg4 = MRI->createVirtualRegister(TRC);
  9564. BuildMI(*MBB, MI, dl, TII->get(ARM::tORR), NewVReg4)
  9565. .addReg(ARM::CPSR, RegState::Define)
  9566. .addReg(NewVReg2, RegState::Kill)
  9567. .addReg(NewVReg3, RegState::Kill)
  9568. .add(predOps(ARMCC::AL));
  9569. Register NewVReg5 = MRI->createVirtualRegister(TRC);
  9570. BuildMI(*MBB, MI, dl, TII->get(ARM::tADDframe), NewVReg5)
  9571. .addFrameIndex(FI)
  9572. .addImm(36); // &jbuf[1] :: pc
  9573. BuildMI(*MBB, MI, dl, TII->get(ARM::tSTRi))
  9574. .addReg(NewVReg4, RegState::Kill)
  9575. .addReg(NewVReg5, RegState::Kill)
  9576. .addImm(0)
  9577. .addMemOperand(FIMMOSt)
  9578. .add(predOps(ARMCC::AL));
  9579. } else {
  9580. // Incoming value: jbuf
  9581. // ldr r1, LCPI1_1
  9582. // add r1, pc, r1
  9583. // str r1, [$jbuf, #+4] ; &jbuf[1]
  9584. Register NewVReg1 = MRI->createVirtualRegister(TRC);
  9585. BuildMI(*MBB, MI, dl, TII->get(ARM::LDRi12), NewVReg1)
  9586. .addConstantPoolIndex(CPI)
  9587. .addImm(0)
  9588. .addMemOperand(CPMMO)
  9589. .add(predOps(ARMCC::AL));
  9590. Register NewVReg2 = MRI->createVirtualRegister(TRC);
  9591. BuildMI(*MBB, MI, dl, TII->get(ARM::PICADD), NewVReg2)
  9592. .addReg(NewVReg1, RegState::Kill)
  9593. .addImm(PCLabelId)
  9594. .add(predOps(ARMCC::AL));
  9595. BuildMI(*MBB, MI, dl, TII->get(ARM::STRi12))
  9596. .addReg(NewVReg2, RegState::Kill)
  9597. .addFrameIndex(FI)
  9598. .addImm(36) // &jbuf[1] :: pc
  9599. .addMemOperand(FIMMOSt)
  9600. .add(predOps(ARMCC::AL));
  9601. }
  9602. }
  9603. void ARMTargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI,
  9604. MachineBasicBlock *MBB) const {
  9605. const TargetInstrInfo *TII = Subtarget->getInstrInfo();
  9606. DebugLoc dl = MI.getDebugLoc();
  9607. MachineFunction *MF = MBB->getParent();
  9608. MachineRegisterInfo *MRI = &MF->getRegInfo();
  9609. MachineFrameInfo &MFI = MF->getFrameInfo();
  9610. int FI = MFI.getFunctionContextIndex();
  9611. const TargetRegisterClass *TRC = Subtarget->isThumb() ? &ARM::tGPRRegClass
  9612. : &ARM::GPRnopcRegClass;
  9613. // Get a mapping of the call site numbers to all of the landing pads they're
  9614. // associated with.
  9615. DenseMap<unsigned, SmallVector<MachineBasicBlock*, 2>> CallSiteNumToLPad;
  9616. unsigned MaxCSNum = 0;
  9617. for (MachineBasicBlock &BB : *MF) {
  9618. if (!BB.isEHPad())
  9619. continue;
  9620. // FIXME: We should assert that the EH_LABEL is the first MI in the landing
  9621. // pad.
  9622. for (MachineInstr &II : BB) {
  9623. if (!II.isEHLabel())
  9624. continue;
  9625. MCSymbol *Sym = II.getOperand(0).getMCSymbol();
  9626. if (!MF->hasCallSiteLandingPad(Sym)) continue;
  9627. SmallVectorImpl<unsigned> &CallSiteIdxs = MF->getCallSiteLandingPad(Sym);
  9628. for (unsigned Idx : CallSiteIdxs) {
  9629. CallSiteNumToLPad[Idx].push_back(&BB);
  9630. MaxCSNum = std::max(MaxCSNum, Idx);
  9631. }
  9632. break;
  9633. }
  9634. }
  9635. // Get an ordered list of the machine basic blocks for the jump table.
  9636. std::vector<MachineBasicBlock*> LPadList;
  9637. SmallPtrSet<MachineBasicBlock*, 32> InvokeBBs;
  9638. LPadList.reserve(CallSiteNumToLPad.size());
  9639. for (unsigned I = 1; I <= MaxCSNum; ++I) {
  9640. SmallVectorImpl<MachineBasicBlock*> &MBBList = CallSiteNumToLPad[I];
  9641. for (MachineBasicBlock *MBB : MBBList) {
  9642. LPadList.push_back(MBB);
  9643. InvokeBBs.insert(MBB->pred_begin(), MBB->pred_end());
  9644. }
  9645. }
  9646. assert(!LPadList.empty() &&
  9647. "No landing pad destinations for the dispatch jump table!");
  9648. // Create the jump table and associated information.
  9649. MachineJumpTableInfo *JTI =
  9650. MF->getOrCreateJumpTableInfo(MachineJumpTableInfo::EK_Inline);
  9651. unsigned MJTI = JTI->createJumpTableIndex(LPadList);
  9652. // Create the MBBs for the dispatch code.
  9653. // Shove the dispatch's address into the return slot in the function context.
  9654. MachineBasicBlock *DispatchBB = MF->CreateMachineBasicBlock();
  9655. DispatchBB->setIsEHPad();
  9656. MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock();
  9657. unsigned trap_opcode;
  9658. if (Subtarget->isThumb())
  9659. trap_opcode = ARM::tTRAP;
  9660. else
  9661. trap_opcode = Subtarget->useNaClTrap() ? ARM::TRAPNaCl : ARM::TRAP;
  9662. BuildMI(TrapBB, dl, TII->get(trap_opcode));
  9663. DispatchBB->addSuccessor(TrapBB);
  9664. MachineBasicBlock *DispContBB = MF->CreateMachineBasicBlock();
  9665. DispatchBB->addSuccessor(DispContBB);
9666. // Insert the MBBs into the function.
  9667. MF->insert(MF->end(), DispatchBB);
  9668. MF->insert(MF->end(), DispContBB);
  9669. MF->insert(MF->end(), TrapBB);
  9670. // Insert code into the entry block that creates and registers the function
  9671. // context.
  9672. SetupEntryBlockForSjLj(MI, MBB, DispatchBB, FI);
  9673. MachineMemOperand *FIMMOLd = MF->getMachineMemOperand(
  9674. MachinePointerInfo::getFixedStack(*MF, FI),
  9675. MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile, 4, Align(4));
  9676. MachineInstrBuilder MIB;
  9677. MIB = BuildMI(DispatchBB, dl, TII->get(ARM::Int_eh_sjlj_dispatchsetup));
  9678. const ARMBaseInstrInfo *AII = static_cast<const ARMBaseInstrInfo*>(TII);
  9679. const ARMBaseRegisterInfo &RI = AII->getRegisterInfo();
  9680. // Add a register mask with no preserved registers. This results in all
  9681. // registers being marked as clobbered. This can't work if the dispatch block
  9682. // is in a Thumb1 function and is linked with ARM code which uses the FP
  9683. // registers, as there is no way to preserve the FP registers in Thumb1 mode.
  9684. MIB.addRegMask(RI.getSjLjDispatchPreservedMask(*MF));
  9685. bool IsPositionIndependent = isPositionIndependent();
  9686. unsigned NumLPads = LPadList.size();
  9687. if (Subtarget->isThumb2()) {
  9688. Register NewVReg1 = MRI->createVirtualRegister(TRC);
  9689. BuildMI(DispatchBB, dl, TII->get(ARM::t2LDRi12), NewVReg1)
  9690. .addFrameIndex(FI)
  9691. .addImm(4)
  9692. .addMemOperand(FIMMOLd)
  9693. .add(predOps(ARMCC::AL));
  9694. if (NumLPads < 256) {
  9695. BuildMI(DispatchBB, dl, TII->get(ARM::t2CMPri))
  9696. .addReg(NewVReg1)
  9697. .addImm(LPadList.size())
  9698. .add(predOps(ARMCC::AL));
  9699. } else {
  9700. Register VReg1 = MRI->createVirtualRegister(TRC);
  9701. BuildMI(DispatchBB, dl, TII->get(ARM::t2MOVi16), VReg1)
  9702. .addImm(NumLPads & 0xFFFF)
  9703. .add(predOps(ARMCC::AL));
  9704. unsigned VReg2 = VReg1;
  9705. if ((NumLPads & 0xFFFF0000) != 0) {
  9706. VReg2 = MRI->createVirtualRegister(TRC);
  9707. BuildMI(DispatchBB, dl, TII->get(ARM::t2MOVTi16), VReg2)
  9708. .addReg(VReg1)
  9709. .addImm(NumLPads >> 16)
  9710. .add(predOps(ARMCC::AL));
  9711. }
  9712. BuildMI(DispatchBB, dl, TII->get(ARM::t2CMPrr))
  9713. .addReg(NewVReg1)
  9714. .addReg(VReg2)
  9715. .add(predOps(ARMCC::AL));
  9716. }
  9717. BuildMI(DispatchBB, dl, TII->get(ARM::t2Bcc))
  9718. .addMBB(TrapBB)
  9719. .addImm(ARMCC::HI)
  9720. .addReg(ARM::CPSR);
  9721. Register NewVReg3 = MRI->createVirtualRegister(TRC);
  9722. BuildMI(DispContBB, dl, TII->get(ARM::t2LEApcrelJT), NewVReg3)
  9723. .addJumpTableIndex(MJTI)
  9724. .add(predOps(ARMCC::AL));
  9725. Register NewVReg4 = MRI->createVirtualRegister(TRC);
  9726. BuildMI(DispContBB, dl, TII->get(ARM::t2ADDrs), NewVReg4)
  9727. .addReg(NewVReg3, RegState::Kill)
  9728. .addReg(NewVReg1)
  9729. .addImm(ARM_AM::getSORegOpc(ARM_AM::lsl, 2))
  9730. .add(predOps(ARMCC::AL))
  9731. .add(condCodeOp());
  9732. BuildMI(DispContBB, dl, TII->get(ARM::t2BR_JT))
  9733. .addReg(NewVReg4, RegState::Kill)
  9734. .addReg(NewVReg1)
  9735. .addJumpTableIndex(MJTI);
  9736. } else if (Subtarget->isThumb()) {
  9737. Register NewVReg1 = MRI->createVirtualRegister(TRC);
  9738. BuildMI(DispatchBB, dl, TII->get(ARM::tLDRspi), NewVReg1)
  9739. .addFrameIndex(FI)
  9740. .addImm(1)
  9741. .addMemOperand(FIMMOLd)
  9742. .add(predOps(ARMCC::AL));
  9743. if (NumLPads < 256) {
  9744. BuildMI(DispatchBB, dl, TII->get(ARM::tCMPi8))
  9745. .addReg(NewVReg1)
  9746. .addImm(NumLPads)
  9747. .add(predOps(ARMCC::AL));
  9748. } else {
  9749. MachineConstantPool *ConstantPool = MF->getConstantPool();
  9750. Type *Int32Ty = Type::getInt32Ty(MF->getFunction().getContext());
  9751. const Constant *C = ConstantInt::get(Int32Ty, NumLPads);
  9752. // MachineConstantPool wants an explicit alignment.
  9753. Align Alignment = MF->getDataLayout().getPrefTypeAlign(Int32Ty);
  9754. unsigned Idx = ConstantPool->getConstantPoolIndex(C, Alignment);
  9755. Register VReg1 = MRI->createVirtualRegister(TRC);
  9756. BuildMI(DispatchBB, dl, TII->get(ARM::tLDRpci))
  9757. .addReg(VReg1, RegState::Define)
  9758. .addConstantPoolIndex(Idx)
  9759. .add(predOps(ARMCC::AL));
  9760. BuildMI(DispatchBB, dl, TII->get(ARM::tCMPr))
  9761. .addReg(NewVReg1)
  9762. .addReg(VReg1)
  9763. .add(predOps(ARMCC::AL));
  9764. }
  9765. BuildMI(DispatchBB, dl, TII->get(ARM::tBcc))
  9766. .addMBB(TrapBB)
  9767. .addImm(ARMCC::HI)
  9768. .addReg(ARM::CPSR);
  9769. Register NewVReg2 = MRI->createVirtualRegister(TRC);
  9770. BuildMI(DispContBB, dl, TII->get(ARM::tLSLri), NewVReg2)
  9771. .addReg(ARM::CPSR, RegState::Define)
  9772. .addReg(NewVReg1)
  9773. .addImm(2)
  9774. .add(predOps(ARMCC::AL));
  9775. Register NewVReg3 = MRI->createVirtualRegister(TRC);
  9776. BuildMI(DispContBB, dl, TII->get(ARM::tLEApcrelJT), NewVReg3)
  9777. .addJumpTableIndex(MJTI)
  9778. .add(predOps(ARMCC::AL));
  9779. Register NewVReg4 = MRI->createVirtualRegister(TRC);
  9780. BuildMI(DispContBB, dl, TII->get(ARM::tADDrr), NewVReg4)
  9781. .addReg(ARM::CPSR, RegState::Define)
  9782. .addReg(NewVReg2, RegState::Kill)
  9783. .addReg(NewVReg3)
  9784. .add(predOps(ARMCC::AL));
  9785. MachineMemOperand *JTMMOLd =
  9786. MF->getMachineMemOperand(MachinePointerInfo::getJumpTable(*MF),
  9787. MachineMemOperand::MOLoad, 4, Align(4));
  9788. Register NewVReg5 = MRI->createVirtualRegister(TRC);
  9789. BuildMI(DispContBB, dl, TII->get(ARM::tLDRi), NewVReg5)
  9790. .addReg(NewVReg4, RegState::Kill)
  9791. .addImm(0)
  9792. .addMemOperand(JTMMOLd)
  9793. .add(predOps(ARMCC::AL));
  9794. unsigned NewVReg6 = NewVReg5;
  9795. if (IsPositionIndependent) {
  9796. NewVReg6 = MRI->createVirtualRegister(TRC);
  9797. BuildMI(DispContBB, dl, TII->get(ARM::tADDrr), NewVReg6)
  9798. .addReg(ARM::CPSR, RegState::Define)
  9799. .addReg(NewVReg5, RegState::Kill)
  9800. .addReg(NewVReg3)
  9801. .add(predOps(ARMCC::AL));
  9802. }
  9803. BuildMI(DispContBB, dl, TII->get(ARM::tBR_JTr))
  9804. .addReg(NewVReg6, RegState::Kill)
  9805. .addJumpTableIndex(MJTI);
  9806. } else {
  9807. Register NewVReg1 = MRI->createVirtualRegister(TRC);
  9808. BuildMI(DispatchBB, dl, TII->get(ARM::LDRi12), NewVReg1)
  9809. .addFrameIndex(FI)
  9810. .addImm(4)
  9811. .addMemOperand(FIMMOLd)
  9812. .add(predOps(ARMCC::AL));
  9813. if (NumLPads < 256) {
  9814. BuildMI(DispatchBB, dl, TII->get(ARM::CMPri))
  9815. .addReg(NewVReg1)
  9816. .addImm(NumLPads)
  9817. .add(predOps(ARMCC::AL));
  9818. } else if (Subtarget->hasV6T2Ops() && isUInt<16>(NumLPads)) {
  9819. Register VReg1 = MRI->createVirtualRegister(TRC);
  9820. BuildMI(DispatchBB, dl, TII->get(ARM::MOVi16), VReg1)
  9821. .addImm(NumLPads & 0xFFFF)
  9822. .add(predOps(ARMCC::AL));
  9823. unsigned VReg2 = VReg1;
  9824. if ((NumLPads & 0xFFFF0000) != 0) {
  9825. VReg2 = MRI->createVirtualRegister(TRC);
  9826. BuildMI(DispatchBB, dl, TII->get(ARM::MOVTi16), VReg2)
  9827. .addReg(VReg1)
  9828. .addImm(NumLPads >> 16)
  9829. .add(predOps(ARMCC::AL));
  9830. }
  9831. BuildMI(DispatchBB, dl, TII->get(ARM::CMPrr))
  9832. .addReg(NewVReg1)
  9833. .addReg(VReg2)
  9834. .add(predOps(ARMCC::AL));
  9835. } else {
  9836. MachineConstantPool *ConstantPool = MF->getConstantPool();
  9837. Type *Int32Ty = Type::getInt32Ty(MF->getFunction().getContext());
  9838. const Constant *C = ConstantInt::get(Int32Ty, NumLPads);
  9839. // MachineConstantPool wants an explicit alignment.
  9840. Align Alignment = MF->getDataLayout().getPrefTypeAlign(Int32Ty);
  9841. unsigned Idx = ConstantPool->getConstantPoolIndex(C, Alignment);
  9842. Register VReg1 = MRI->createVirtualRegister(TRC);
  9843. BuildMI(DispatchBB, dl, TII->get(ARM::LDRcp))
  9844. .addReg(VReg1, RegState::Define)
  9845. .addConstantPoolIndex(Idx)
  9846. .addImm(0)
  9847. .add(predOps(ARMCC::AL));
  9848. BuildMI(DispatchBB, dl, TII->get(ARM::CMPrr))
  9849. .addReg(NewVReg1)
  9850. .addReg(VReg1, RegState::Kill)
  9851. .add(predOps(ARMCC::AL));
  9852. }
  9853. BuildMI(DispatchBB, dl, TII->get(ARM::Bcc))
  9854. .addMBB(TrapBB)
  9855. .addImm(ARMCC::HI)
  9856. .addReg(ARM::CPSR);
  9857. Register NewVReg3 = MRI->createVirtualRegister(TRC);
  9858. BuildMI(DispContBB, dl, TII->get(ARM::MOVsi), NewVReg3)
  9859. .addReg(NewVReg1)
  9860. .addImm(ARM_AM::getSORegOpc(ARM_AM::lsl, 2))
  9861. .add(predOps(ARMCC::AL))
  9862. .add(condCodeOp());
  9863. Register NewVReg4 = MRI->createVirtualRegister(TRC);
  9864. BuildMI(DispContBB, dl, TII->get(ARM::LEApcrelJT), NewVReg4)
  9865. .addJumpTableIndex(MJTI)
  9866. .add(predOps(ARMCC::AL));
  9867. MachineMemOperand *JTMMOLd =
  9868. MF->getMachineMemOperand(MachinePointerInfo::getJumpTable(*MF),
  9869. MachineMemOperand::MOLoad, 4, Align(4));
  9870. Register NewVReg5 = MRI->createVirtualRegister(TRC);
  9871. BuildMI(DispContBB, dl, TII->get(ARM::LDRrs), NewVReg5)
  9872. .addReg(NewVReg3, RegState::Kill)
  9873. .addReg(NewVReg4)
  9874. .addImm(0)
  9875. .addMemOperand(JTMMOLd)
  9876. .add(predOps(ARMCC::AL));
  9877. if (IsPositionIndependent) {
  9878. BuildMI(DispContBB, dl, TII->get(ARM::BR_JTadd))
  9879. .addReg(NewVReg5, RegState::Kill)
  9880. .addReg(NewVReg4)
  9881. .addJumpTableIndex(MJTI);
  9882. } else {
  9883. BuildMI(DispContBB, dl, TII->get(ARM::BR_JTr))
  9884. .addReg(NewVReg5, RegState::Kill)
  9885. .addJumpTableIndex(MJTI);
  9886. }
  9887. }
  9888. // Add the jump table entries as successors to the MBB.
  9889. SmallPtrSet<MachineBasicBlock*, 8> SeenMBBs;
  9890. for (MachineBasicBlock *CurMBB : LPadList) {
  9891. if (SeenMBBs.insert(CurMBB).second)
  9892. DispContBB->addSuccessor(CurMBB);
  9893. }
  9894. // N.B. the order the invoke BBs are processed in doesn't matter here.
  9895. const MCPhysReg *SavedRegs = RI.getCalleeSavedRegs(MF);
  9896. SmallVector<MachineBasicBlock*, 64> MBBLPads;
  9897. for (MachineBasicBlock *BB : InvokeBBs) {
  9898. // Remove the landing pad successor from the invoke block and replace it
  9899. // with the new dispatch block.
  9900. SmallVector<MachineBasicBlock*, 4> Successors(BB->successors());
  9901. while (!Successors.empty()) {
  9902. MachineBasicBlock *SMBB = Successors.pop_back_val();
  9903. if (SMBB->isEHPad()) {
  9904. BB->removeSuccessor(SMBB);
  9905. MBBLPads.push_back(SMBB);
  9906. }
  9907. }
  9908. BB->addSuccessor(DispatchBB, BranchProbability::getZero());
  9909. BB->normalizeSuccProbs();
  9910. // Find the invoke call and mark all of the callee-saved registers as
9911. // 'implicit defined' so that they're spilled. This prevents instructions
9912. // from being moved to before the EH block, where they would never be
9913. // executed.
  9914. for (MachineBasicBlock::reverse_iterator
  9915. II = BB->rbegin(), IE = BB->rend(); II != IE; ++II) {
  9916. if (!II->isCall()) continue;
  9917. DenseMap<unsigned, bool> DefRegs;
  9918. for (MachineInstr::mop_iterator
  9919. OI = II->operands_begin(), OE = II->operands_end();
  9920. OI != OE; ++OI) {
  9921. if (!OI->isReg()) continue;
  9922. DefRegs[OI->getReg()] = true;
  9923. }
  9924. MachineInstrBuilder MIB(*MF, &*II);
  9925. for (unsigned i = 0; SavedRegs[i] != 0; ++i) {
  9926. unsigned Reg = SavedRegs[i];
  9927. if (Subtarget->isThumb2() &&
  9928. !ARM::tGPRRegClass.contains(Reg) &&
  9929. !ARM::hGPRRegClass.contains(Reg))
  9930. continue;
  9931. if (Subtarget->isThumb1Only() && !ARM::tGPRRegClass.contains(Reg))
  9932. continue;
  9933. if (!Subtarget->isThumb() && !ARM::GPRRegClass.contains(Reg))
  9934. continue;
  9935. if (!DefRegs[Reg])
  9936. MIB.addReg(Reg, RegState::ImplicitDefine | RegState::Dead);
  9937. }
  9938. break;
  9939. }
  9940. }
  9941. // Mark all former landing pads as non-landing pads. The dispatch is the only
  9942. // landing pad now.
  9943. for (MachineBasicBlock *MBBLPad : MBBLPads)
  9944. MBBLPad->setIsEHPad(false);
  9945. // The instruction is gone now.
  9946. MI.eraseFromParent();
  9947. }
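/// Return the successor of MBB that is not Succ. MBB is expected to have
/// exactly two successors.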
  9948. static
  9949. MachineBasicBlock *OtherSucc(MachineBasicBlock *MBB, MachineBasicBlock *Succ) {
  9950. for (MachineBasicBlock *S : MBB->successors())
  9951. if (S != Succ)
  9952. return S;
  9953. llvm_unreachable("Expecting a BB with two successors!");
  9954. }
9955. /// Return the load opcode for a given load size. For load sizes of 8 or
9956. /// more, a NEON opcode is returned.
  9957. static unsigned getLdOpcode(unsigned LdSize, bool IsThumb1, bool IsThumb2) {
  9958. if (LdSize >= 8)
  9959. return LdSize == 16 ? ARM::VLD1q32wb_fixed
  9960. : LdSize == 8 ? ARM::VLD1d32wb_fixed : 0;
  9961. if (IsThumb1)
  9962. return LdSize == 4 ? ARM::tLDRi
  9963. : LdSize == 2 ? ARM::tLDRHi
  9964. : LdSize == 1 ? ARM::tLDRBi : 0;
  9965. if (IsThumb2)
  9966. return LdSize == 4 ? ARM::t2LDR_POST
  9967. : LdSize == 2 ? ARM::t2LDRH_POST
  9968. : LdSize == 1 ? ARM::t2LDRB_POST : 0;
  9969. return LdSize == 4 ? ARM::LDR_POST_IMM
  9970. : LdSize == 2 ? ARM::LDRH_POST
  9971. : LdSize == 1 ? ARM::LDRB_POST_IMM : 0;
  9972. }
9973. /// Return the store opcode for a given store size. For store sizes of 8 or
9974. /// more, a NEON opcode is returned.
  9975. static unsigned getStOpcode(unsigned StSize, bool IsThumb1, bool IsThumb2) {
  9976. if (StSize >= 8)
  9977. return StSize == 16 ? ARM::VST1q32wb_fixed
  9978. : StSize == 8 ? ARM::VST1d32wb_fixed : 0;
  9979. if (IsThumb1)
  9980. return StSize == 4 ? ARM::tSTRi
  9981. : StSize == 2 ? ARM::tSTRHi
  9982. : StSize == 1 ? ARM::tSTRBi : 0;
  9983. if (IsThumb2)
  9984. return StSize == 4 ? ARM::t2STR_POST
  9985. : StSize == 2 ? ARM::t2STRH_POST
  9986. : StSize == 1 ? ARM::t2STRB_POST : 0;
  9987. return StSize == 4 ? ARM::STR_POST_IMM
  9988. : StSize == 2 ? ARM::STRH_POST
  9989. : StSize == 1 ? ARM::STRB_POST_IMM : 0;
  9990. }
  9991. /// Emit a post-increment load operation with given size. The instructions
  9992. /// will be added to BB at Pos.
  9993. static void emitPostLd(MachineBasicBlock *BB, MachineBasicBlock::iterator Pos,
  9994. const TargetInstrInfo *TII, const DebugLoc &dl,
  9995. unsigned LdSize, unsigned Data, unsigned AddrIn,
  9996. unsigned AddrOut, bool IsThumb1, bool IsThumb2) {
  9997. unsigned LdOpc = getLdOpcode(LdSize, IsThumb1, IsThumb2);
  9998. assert(LdOpc != 0 && "Should have a load opcode");
  9999. if (LdSize >= 8) {
  10000. BuildMI(*BB, Pos, dl, TII->get(LdOpc), Data)
  10001. .addReg(AddrOut, RegState::Define)
  10002. .addReg(AddrIn)
  10003. .addImm(0)
  10004. .add(predOps(ARMCC::AL));
  10005. } else if (IsThumb1) {
  10006. // load + update AddrIn
  10007. BuildMI(*BB, Pos, dl, TII->get(LdOpc), Data)
  10008. .addReg(AddrIn)
  10009. .addImm(0)
  10010. .add(predOps(ARMCC::AL));
  10011. BuildMI(*BB, Pos, dl, TII->get(ARM::tADDi8), AddrOut)
  10012. .add(t1CondCodeOp())
  10013. .addReg(AddrIn)
  10014. .addImm(LdSize)
  10015. .add(predOps(ARMCC::AL));
  10016. } else if (IsThumb2) {
  10017. BuildMI(*BB, Pos, dl, TII->get(LdOpc), Data)
  10018. .addReg(AddrOut, RegState::Define)
  10019. .addReg(AddrIn)
  10020. .addImm(LdSize)
  10021. .add(predOps(ARMCC::AL));
  10022. } else { // arm
  10023. BuildMI(*BB, Pos, dl, TII->get(LdOpc), Data)
  10024. .addReg(AddrOut, RegState::Define)
  10025. .addReg(AddrIn)
  10026. .addReg(0)
  10027. .addImm(LdSize)
  10028. .add(predOps(ARMCC::AL));
  10029. }
  10030. }
  10031. /// Emit a post-increment store operation with given size. The instructions
  10032. /// will be added to BB at Pos.
  10033. static void emitPostSt(MachineBasicBlock *BB, MachineBasicBlock::iterator Pos,
  10034. const TargetInstrInfo *TII, const DebugLoc &dl,
  10035. unsigned StSize, unsigned Data, unsigned AddrIn,
  10036. unsigned AddrOut, bool IsThumb1, bool IsThumb2) {
  10037. unsigned StOpc = getStOpcode(StSize, IsThumb1, IsThumb2);
  10038. assert(StOpc != 0 && "Should have a store opcode");
  10039. if (StSize >= 8) {
  10040. BuildMI(*BB, Pos, dl, TII->get(StOpc), AddrOut)
  10041. .addReg(AddrIn)
  10042. .addImm(0)
  10043. .addReg(Data)
  10044. .add(predOps(ARMCC::AL));
  10045. } else if (IsThumb1) {
  10046. // store + update AddrIn
  10047. BuildMI(*BB, Pos, dl, TII->get(StOpc))
  10048. .addReg(Data)
  10049. .addReg(AddrIn)
  10050. .addImm(0)
  10051. .add(predOps(ARMCC::AL));
  10052. BuildMI(*BB, Pos, dl, TII->get(ARM::tADDi8), AddrOut)
  10053. .add(t1CondCodeOp())
  10054. .addReg(AddrIn)
  10055. .addImm(StSize)
  10056. .add(predOps(ARMCC::AL));
  10057. } else if (IsThumb2) {
  10058. BuildMI(*BB, Pos, dl, TII->get(StOpc), AddrOut)
  10059. .addReg(Data)
  10060. .addReg(AddrIn)
  10061. .addImm(StSize)
  10062. .add(predOps(ARMCC::AL));
  10063. } else { // arm
  10064. BuildMI(*BB, Pos, dl, TII->get(StOpc), AddrOut)
  10065. .addReg(Data)
  10066. .addReg(AddrIn)
  10067. .addReg(0)
  10068. .addImm(StSize)
  10069. .add(predOps(ARMCC::AL));
  10070. }
  10071. }
  10072. MachineBasicBlock *
  10073. ARMTargetLowering::EmitStructByval(MachineInstr &MI,
  10074. MachineBasicBlock *BB) const {
10075. // This pseudo instruction has 4 operands: dst, src, size, alignment.
10076. // We expand it to a loop if size > Subtarget->getMaxInlineSizeThreshold();
10077. // otherwise, we generate unrolled scalar copies.
  10078. const TargetInstrInfo *TII = Subtarget->getInstrInfo();
  10079. const BasicBlock *LLVM_BB = BB->getBasicBlock();
  10080. MachineFunction::iterator It = ++BB->getIterator();
  10081. Register dest = MI.getOperand(0).getReg();
  10082. Register src = MI.getOperand(1).getReg();
  10083. unsigned SizeVal = MI.getOperand(2).getImm();
  10084. unsigned Alignment = MI.getOperand(3).getImm();
  10085. DebugLoc dl = MI.getDebugLoc();
  10086. MachineFunction *MF = BB->getParent();
  10087. MachineRegisterInfo &MRI = MF->getRegInfo();
  10088. unsigned UnitSize = 0;
  10089. const TargetRegisterClass *TRC = nullptr;
  10090. const TargetRegisterClass *VecTRC = nullptr;
  10091. bool IsThumb1 = Subtarget->isThumb1Only();
  10092. bool IsThumb2 = Subtarget->isThumb2();
  10093. bool IsThumb = Subtarget->isThumb();
  10094. if (Alignment & 1) {
  10095. UnitSize = 1;
  10096. } else if (Alignment & 2) {
  10097. UnitSize = 2;
  10098. } else {
  10099. // Check whether we can use NEON instructions.
  10100. if (!MF->getFunction().hasFnAttribute(Attribute::NoImplicitFloat) &&
  10101. Subtarget->hasNEON()) {
  10102. if ((Alignment % 16 == 0) && SizeVal >= 16)
  10103. UnitSize = 16;
  10104. else if ((Alignment % 8 == 0) && SizeVal >= 8)
  10105. UnitSize = 8;
  10106. }
  10107. // Can't use NEON instructions.
  10108. if (UnitSize == 0)
  10109. UnitSize = 4;
  10110. }
  10111. // Select the correct opcode and register class for unit size load/store
  10112. bool IsNeon = UnitSize >= 8;
  10113. TRC = IsThumb ? &ARM::tGPRRegClass : &ARM::GPRRegClass;
  10114. if (IsNeon)
  10115. VecTRC = UnitSize == 16 ? &ARM::DPairRegClass
  10116. : UnitSize == 8 ? &ARM::DPRRegClass
  10117. : nullptr;
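// Copy LoopSize bytes in UnitSize-sized transfers; the remaining BytesLeft
// bytes are copied one byte at a time afterwards.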
  10118. unsigned BytesLeft = SizeVal % UnitSize;
  10119. unsigned LoopSize = SizeVal - BytesLeft;
  10120. if (SizeVal <= Subtarget->getMaxInlineSizeThreshold()) {
  10121. // Use LDR and STR to copy.
  10122. // [scratch, srcOut] = LDR_POST(srcIn, UnitSize)
  10123. // [destOut] = STR_POST(scratch, destIn, UnitSize)
  10124. unsigned srcIn = src;
  10125. unsigned destIn = dest;
  10126. for (unsigned i = 0; i < LoopSize; i+=UnitSize) {
  10127. Register srcOut = MRI.createVirtualRegister(TRC);
  10128. Register destOut = MRI.createVirtualRegister(TRC);
  10129. Register scratch = MRI.createVirtualRegister(IsNeon ? VecTRC : TRC);
  10130. emitPostLd(BB, MI, TII, dl, UnitSize, scratch, srcIn, srcOut,
  10131. IsThumb1, IsThumb2);
  10132. emitPostSt(BB, MI, TII, dl, UnitSize, scratch, destIn, destOut,
  10133. IsThumb1, IsThumb2);
  10134. srcIn = srcOut;
  10135. destIn = destOut;
  10136. }
  10137. // Handle the leftover bytes with LDRB and STRB.
  10138. // [scratch, srcOut] = LDRB_POST(srcIn, 1)
  10139. // [destOut] = STRB_POST(scratch, destIn, 1)
  10140. for (unsigned i = 0; i < BytesLeft; i++) {
  10141. Register srcOut = MRI.createVirtualRegister(TRC);
  10142. Register destOut = MRI.createVirtualRegister(TRC);
  10143. Register scratch = MRI.createVirtualRegister(TRC);
  10144. emitPostLd(BB, MI, TII, dl, 1, scratch, srcIn, srcOut,
  10145. IsThumb1, IsThumb2);
  10146. emitPostSt(BB, MI, TII, dl, 1, scratch, destIn, destOut,
  10147. IsThumb1, IsThumb2);
  10148. srcIn = srcOut;
  10149. destIn = destOut;
  10150. }
  10151. MI.eraseFromParent(); // The instruction is gone now.
  10152. return BB;
  10153. }
  10154. // Expand the pseudo op to a loop.
  10155. // thisMBB:
  10156. // ...
10157. // movw varEnd, # --> if movt is available
10158. // movt varEnd, #
10159. // ldrcp varEnd, idx --> otherwise (constant-pool load)
  10160. // fallthrough --> loopMBB
  10161. // loopMBB:
  10162. // PHI varPhi, varEnd, varLoop
  10163. // PHI srcPhi, src, srcLoop
  10164. // PHI destPhi, dst, destLoop
  10165. // [scratch, srcLoop] = LDR_POST(srcPhi, UnitSize)
  10166. // [destLoop] = STR_POST(scratch, destPhi, UnitSize)
  10167. // subs varLoop, varPhi, #UnitSize
  10168. // bne loopMBB
  10169. // fallthrough --> exitMBB
  10170. // exitMBB:
  10171. // epilogue to handle left-over bytes
  10172. // [scratch, srcOut] = LDRB_POST(srcLoop, 1)
  10173. // [destOut] = STRB_POST(scratch, destLoop, 1)
  10174. MachineBasicBlock *loopMBB = MF->CreateMachineBasicBlock(LLVM_BB);
  10175. MachineBasicBlock *exitMBB = MF->CreateMachineBasicBlock(LLVM_BB);
  10176. MF->insert(It, loopMBB);
  10177. MF->insert(It, exitMBB);
  10178. // Transfer the remainder of BB and its successor edges to exitMBB.
  10179. exitMBB->splice(exitMBB->begin(), BB,
  10180. std::next(MachineBasicBlock::iterator(MI)), BB->end());
  10181. exitMBB->transferSuccessorsAndUpdatePHIs(BB);
  10182. // Load an immediate to varEnd.
  10183. Register varEnd = MRI.createVirtualRegister(TRC);
  10184. if (Subtarget->useMovt()) {
  10185. unsigned Vtmp = varEnd;
  10186. if ((LoopSize & 0xFFFF0000) != 0)
  10187. Vtmp = MRI.createVirtualRegister(TRC);
  10188. BuildMI(BB, dl, TII->get(IsThumb ? ARM::t2MOVi16 : ARM::MOVi16), Vtmp)
  10189. .addImm(LoopSize & 0xFFFF)
  10190. .add(predOps(ARMCC::AL));
  10191. if ((LoopSize & 0xFFFF0000) != 0)
  10192. BuildMI(BB, dl, TII->get(IsThumb ? ARM::t2MOVTi16 : ARM::MOVTi16), varEnd)
  10193. .addReg(Vtmp)
  10194. .addImm(LoopSize >> 16)
  10195. .add(predOps(ARMCC::AL));
  10196. } else {
  10197. MachineConstantPool *ConstantPool = MF->getConstantPool();
  10198. Type *Int32Ty = Type::getInt32Ty(MF->getFunction().getContext());
  10199. const Constant *C = ConstantInt::get(Int32Ty, LoopSize);
  10200. // MachineConstantPool wants an explicit alignment.
  10201. Align Alignment = MF->getDataLayout().getPrefTypeAlign(Int32Ty);
  10202. unsigned Idx = ConstantPool->getConstantPoolIndex(C, Alignment);
  10203. MachineMemOperand *CPMMO =
  10204. MF->getMachineMemOperand(MachinePointerInfo::getConstantPool(*MF),
  10205. MachineMemOperand::MOLoad, 4, Align(4));
  10206. if (IsThumb)
  10207. BuildMI(*BB, MI, dl, TII->get(ARM::tLDRpci))
  10208. .addReg(varEnd, RegState::Define)
  10209. .addConstantPoolIndex(Idx)
  10210. .add(predOps(ARMCC::AL))
  10211. .addMemOperand(CPMMO);
  10212. else
  10213. BuildMI(*BB, MI, dl, TII->get(ARM::LDRcp))
  10214. .addReg(varEnd, RegState::Define)
  10215. .addConstantPoolIndex(Idx)
  10216. .addImm(0)
  10217. .add(predOps(ARMCC::AL))
  10218. .addMemOperand(CPMMO);
  10219. }
  10220. BB->addSuccessor(loopMBB);
  10221. // Generate the loop body:
  10222. // varPhi = PHI(varLoop, varEnd)
  10223. // srcPhi = PHI(srcLoop, src)
  10224. // destPhi = PHI(destLoop, dst)
  10225. MachineBasicBlock *entryBB = BB;
  10226. BB = loopMBB;
  10227. Register varLoop = MRI.createVirtualRegister(TRC);
  10228. Register varPhi = MRI.createVirtualRegister(TRC);
  10229. Register srcLoop = MRI.createVirtualRegister(TRC);
  10230. Register srcPhi = MRI.createVirtualRegister(TRC);
  10231. Register destLoop = MRI.createVirtualRegister(TRC);
  10232. Register destPhi = MRI.createVirtualRegister(TRC);
  10233. BuildMI(*BB, BB->begin(), dl, TII->get(ARM::PHI), varPhi)
  10234. .addReg(varLoop).addMBB(loopMBB)
  10235. .addReg(varEnd).addMBB(entryBB);
  10236. BuildMI(BB, dl, TII->get(ARM::PHI), srcPhi)
  10237. .addReg(srcLoop).addMBB(loopMBB)
  10238. .addReg(src).addMBB(entryBB);
  10239. BuildMI(BB, dl, TII->get(ARM::PHI), destPhi)
  10240. .addReg(destLoop).addMBB(loopMBB)
  10241. .addReg(dest).addMBB(entryBB);
  10242. // [scratch, srcLoop] = LDR_POST(srcPhi, UnitSize)
10243. // [destLoop] = STR_POST(scratch, destPhi, UnitSize)
  10244. Register scratch = MRI.createVirtualRegister(IsNeon ? VecTRC : TRC);
  10245. emitPostLd(BB, BB->end(), TII, dl, UnitSize, scratch, srcPhi, srcLoop,
  10246. IsThumb1, IsThumb2);
  10247. emitPostSt(BB, BB->end(), TII, dl, UnitSize, scratch, destPhi, destLoop,
  10248. IsThumb1, IsThumb2);
  10249. // Decrement loop variable by UnitSize.
  10250. if (IsThumb1) {
  10251. BuildMI(*BB, BB->end(), dl, TII->get(ARM::tSUBi8), varLoop)
  10252. .add(t1CondCodeOp())
  10253. .addReg(varPhi)
  10254. .addImm(UnitSize)
  10255. .add(predOps(ARMCC::AL));
  10256. } else {
  10257. MachineInstrBuilder MIB =
  10258. BuildMI(*BB, BB->end(), dl,
  10259. TII->get(IsThumb2 ? ARM::t2SUBri : ARM::SUBri), varLoop);
  10260. MIB.addReg(varPhi)
  10261. .addImm(UnitSize)
  10262. .add(predOps(ARMCC::AL))
  10263. .add(condCodeOp());
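// condCodeOp() added a disabled optional cc_out operand (operand 5); rewrite
// it to define CPSR so this SUB sets the flags consumed by the branch below.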
  10264. MIB->getOperand(5).setReg(ARM::CPSR);
  10265. MIB->getOperand(5).setIsDef(true);
  10266. }
  10267. BuildMI(*BB, BB->end(), dl,
  10268. TII->get(IsThumb1 ? ARM::tBcc : IsThumb2 ? ARM::t2Bcc : ARM::Bcc))
  10269. .addMBB(loopMBB).addImm(ARMCC::NE).addReg(ARM::CPSR);
  10270. // loopMBB can loop back to loopMBB or fall through to exitMBB.
  10271. BB->addSuccessor(loopMBB);
  10272. BB->addSuccessor(exitMBB);
  10273. // Add epilogue to handle BytesLeft.
  10274. BB = exitMBB;
  10275. auto StartOfExit = exitMBB->begin();
  10276. // [scratch, srcOut] = LDRB_POST(srcLoop, 1)
  10277. // [destOut] = STRB_POST(scratch, destLoop, 1)
  10278. unsigned srcIn = srcLoop;
  10279. unsigned destIn = destLoop;
  10280. for (unsigned i = 0; i < BytesLeft; i++) {
  10281. Register srcOut = MRI.createVirtualRegister(TRC);
  10282. Register destOut = MRI.createVirtualRegister(TRC);
  10283. Register scratch = MRI.createVirtualRegister(TRC);
  10284. emitPostLd(BB, StartOfExit, TII, dl, 1, scratch, srcIn, srcOut,
  10285. IsThumb1, IsThumb2);
  10286. emitPostSt(BB, StartOfExit, TII, dl, 1, scratch, destIn, destOut,
  10287. IsThumb1, IsThumb2);
  10288. srcIn = srcOut;
  10289. destIn = destOut;
  10290. }
  10291. MI.eraseFromParent(); // The instruction is gone now.
  10292. return BB;
  10293. }
  10294. MachineBasicBlock *
  10295. ARMTargetLowering::EmitLowered__chkstk(MachineInstr &MI,
  10296. MachineBasicBlock *MBB) const {
  10297. const TargetMachine &TM = getTargetMachine();
  10298. const TargetInstrInfo &TII = *Subtarget->getInstrInfo();
  10299. DebugLoc DL = MI.getDebugLoc();
  10300. assert(Subtarget->isTargetWindows() &&
  10301. "__chkstk is only supported on Windows");
  10302. assert(Subtarget->isThumb2() && "Windows on ARM requires Thumb-2 mode");
  10303. // __chkstk takes the number of words to allocate on the stack in R4, and
  10304. // returns the stack adjustment in number of bytes in R4. This will not
10305. // clobber any other registers (other than the obvious lr).
  10306. //
  10307. // Although, technically, IP should be considered a register which may be
  10308. // clobbered, the call itself will not touch it. Windows on ARM is a pure
  10309. // thumb-2 environment, so there is no interworking required. As a result, we
  10310. // do not expect a veneer to be emitted by the linker, clobbering IP.
  10311. //
  10312. // Each module receives its own copy of __chkstk, so no import thunk is
  10313. // required, again, ensuring that IP is not clobbered.
  10314. //
  10315. // Finally, although some linkers may theoretically provide a trampoline for
  10316. // out of range calls (which is quite common due to a 32M range limitation of
  10317. // branches for Thumb), we can generate the long-call version via
  10318. // -mcmodel=large, alleviating the need for the trampoline which may clobber
  10319. // IP.
  10320. switch (TM.getCodeModel()) {
  10321. case CodeModel::Tiny:
  10322. llvm_unreachable("Tiny code model not available on ARM.");
  10323. case CodeModel::Small:
  10324. case CodeModel::Medium:
  10325. case CodeModel::Kernel:
  10326. BuildMI(*MBB, MI, DL, TII.get(ARM::tBL))
  10327. .add(predOps(ARMCC::AL))
  10328. .addExternalSymbol("__chkstk")
  10329. .addReg(ARM::R4, RegState::Implicit | RegState::Kill)
  10330. .addReg(ARM::R4, RegState::Implicit | RegState::Define)
  10331. .addReg(ARM::R12,
  10332. RegState::Implicit | RegState::Define | RegState::Dead)
  10333. .addReg(ARM::CPSR,
  10334. RegState::Implicit | RegState::Define | RegState::Dead);
  10335. break;
  10336. case CodeModel::Large: {
  10337. MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
  10338. Register Reg = MRI.createVirtualRegister(&ARM::rGPRRegClass);
  10339. BuildMI(*MBB, MI, DL, TII.get(ARM::t2MOVi32imm), Reg)
  10340. .addExternalSymbol("__chkstk");
  10341. BuildMI(*MBB, MI, DL, TII.get(gettBLXrOpcode(*MBB->getParent())))
  10342. .add(predOps(ARMCC::AL))
  10343. .addReg(Reg, RegState::Kill)
  10344. .addReg(ARM::R4, RegState::Implicit | RegState::Kill)
  10345. .addReg(ARM::R4, RegState::Implicit | RegState::Define)
  10346. .addReg(ARM::R12,
  10347. RegState::Implicit | RegState::Define | RegState::Dead)
  10348. .addReg(ARM::CPSR,
  10349. RegState::Implicit | RegState::Define | RegState::Dead);
  10350. break;
  10351. }
  10352. }
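// __chkstk returned the stack adjustment in bytes in R4; subtract it from SP
// to perform the allocation.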
  10353. BuildMI(*MBB, MI, DL, TII.get(ARM::t2SUBrr), ARM::SP)
  10354. .addReg(ARM::SP, RegState::Kill)
  10355. .addReg(ARM::R4, RegState::Kill)
  10356. .setMIFlags(MachineInstr::FrameSetup)
  10357. .add(predOps(ARMCC::AL))
  10358. .add(condCodeOp());
  10359. MI.eraseFromParent();
  10360. return MBB;
  10361. }
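/// Lower WIN__DBZCHK: compare the divisor operand against zero and branch to
/// a trap block containing __brkdiv0 when it is zero; otherwise fall through
/// to the continuation block.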
  10362. MachineBasicBlock *
  10363. ARMTargetLowering::EmitLowered__dbzchk(MachineInstr &MI,
  10364. MachineBasicBlock *MBB) const {
  10365. DebugLoc DL = MI.getDebugLoc();
  10366. MachineFunction *MF = MBB->getParent();
  10367. const TargetInstrInfo *TII = Subtarget->getInstrInfo();
  10368. MachineBasicBlock *ContBB = MF->CreateMachineBasicBlock();
  10369. MF->insert(++MBB->getIterator(), ContBB);
  10370. ContBB->splice(ContBB->begin(), MBB,
  10371. std::next(MachineBasicBlock::iterator(MI)), MBB->end());
  10372. ContBB->transferSuccessorsAndUpdatePHIs(MBB);
  10373. MBB->addSuccessor(ContBB);
  10374. MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock();
  10375. BuildMI(TrapBB, DL, TII->get(ARM::t__brkdiv0));
  10376. MF->push_back(TrapBB);
  10377. MBB->addSuccessor(TrapBB);
  10378. BuildMI(*MBB, MI, DL, TII->get(ARM::tCMPi8))
  10379. .addReg(MI.getOperand(0).getReg())
  10380. .addImm(0)
  10381. .add(predOps(ARMCC::AL));
  10382. BuildMI(*MBB, MI, DL, TII->get(ARM::t2Bcc))
  10383. .addMBB(TrapBB)
  10384. .addImm(ARMCC::EQ)
  10385. .addReg(ARM::CPSR);
  10386. MI.eraseFromParent();
  10387. return ContBB;
  10388. }
  10389. // The CPSR operand of SelectItr might be missing a kill marker
  10390. // because there were multiple uses of CPSR, and ISel didn't know
  10391. // which to mark. Figure out whether SelectItr should have had a
  10392. // kill marker, and set it if it should. Returns the correct kill
  10393. // marker value.
  10394. static bool checkAndUpdateCPSRKill(MachineBasicBlock::iterator SelectItr,
  10395. MachineBasicBlock* BB,
  10396. const TargetRegisterInfo* TRI) {
  10397. // Scan forward through BB for a use/def of CPSR.
  10398. MachineBasicBlock::iterator miI(std::next(SelectItr));
  10399. for (MachineBasicBlock::iterator miE = BB->end(); miI != miE; ++miI) {
  10400. const MachineInstr& mi = *miI;
  10401. if (mi.readsRegister(ARM::CPSR))
  10402. return false;
  10403. if (mi.definesRegister(ARM::CPSR))
  10404. break; // Should have kill-flag - update below.
  10405. }
  10406. // If we hit the end of the block, check whether CPSR is live into a
  10407. // successor.
  10408. if (miI == BB->end()) {
  10409. for (MachineBasicBlock *Succ : BB->successors())
  10410. if (Succ->isLiveIn(ARM::CPSR))
  10411. return false;
  10412. }
  10413. // We found a def, or hit the end of the basic block and CPSR wasn't live
  10414. // out. SelectMI should have a kill flag on CPSR.
  10415. SelectItr->addRegisterKilled(ARM::CPSR, TRI);
  10416. return true;
  10417. }
10418. /// Adds logic in the loop entry MBB to calculate the loop iteration count and
10419. /// adds t2WhileLoopSetup and t2WhileLoopStart to generate a WLS loop.
  10420. static Register genTPEntry(MachineBasicBlock *TpEntry,
  10421. MachineBasicBlock *TpLoopBody,
  10422. MachineBasicBlock *TpExit, Register OpSizeReg,
  10423. const TargetInstrInfo *TII, DebugLoc Dl,
  10424. MachineRegisterInfo &MRI) {
  10425. // Calculates loop iteration count = ceil(n/16) = (n + 15) >> 4.
  10426. Register AddDestReg = MRI.createVirtualRegister(&ARM::rGPRRegClass);
  10427. BuildMI(TpEntry, Dl, TII->get(ARM::t2ADDri), AddDestReg)
  10428. .addUse(OpSizeReg)
  10429. .addImm(15)
  10430. .add(predOps(ARMCC::AL))
  10431. .addReg(0);
  10432. Register LsrDestReg = MRI.createVirtualRegister(&ARM::rGPRRegClass);
  10433. BuildMI(TpEntry, Dl, TII->get(ARM::t2LSRri), LsrDestReg)
  10434. .addUse(AddDestReg, RegState::Kill)
  10435. .addImm(4)
  10436. .add(predOps(ARMCC::AL))
  10437. .addReg(0);
  10438. Register TotalIterationsReg = MRI.createVirtualRegister(&ARM::GPRlrRegClass);
  10439. BuildMI(TpEntry, Dl, TII->get(ARM::t2WhileLoopSetup), TotalIterationsReg)
  10440. .addUse(LsrDestReg, RegState::Kill);
  10441. BuildMI(TpEntry, Dl, TII->get(ARM::t2WhileLoopStart))
  10442. .addUse(TotalIterationsReg)
  10443. .addMBB(TpExit);
  10444. BuildMI(TpEntry, Dl, TII->get(ARM::t2B))
  10445. .addMBB(TpLoopBody)
  10446. .add(predOps(ARMCC::AL));
  10447. return TotalIterationsReg;
  10448. }
10449. /// Adds logic in the loopBody MBB to generate MVE_VCTP, t2LoopDec and
10450. /// t2LoopEnd. These are used by later passes to generate tail-predicated
10451. /// loops.
  10452. static void genTPLoopBody(MachineBasicBlock *TpLoopBody,
  10453. MachineBasicBlock *TpEntry, MachineBasicBlock *TpExit,
  10454. const TargetInstrInfo *TII, DebugLoc Dl,
  10455. MachineRegisterInfo &MRI, Register OpSrcReg,
  10456. Register OpDestReg, Register ElementCountReg,
  10457. Register TotalIterationsReg, bool IsMemcpy) {
  10458. // First insert 4 PHI nodes for: Current pointer to Src (if memcpy), Dest
  10459. // array, loop iteration counter, predication counter.
  10460. Register SrcPhiReg, CurrSrcReg;
  10461. if (IsMemcpy) {
  10462. // Current position in the src array
  10463. SrcPhiReg = MRI.createVirtualRegister(&ARM::rGPRRegClass);
  10464. CurrSrcReg = MRI.createVirtualRegister(&ARM::rGPRRegClass);
  10465. BuildMI(TpLoopBody, Dl, TII->get(ARM::PHI), SrcPhiReg)
  10466. .addUse(OpSrcReg)
  10467. .addMBB(TpEntry)
  10468. .addUse(CurrSrcReg)
  10469. .addMBB(TpLoopBody);
  10470. }
  10471. // Current position in the dest array
  10472. Register DestPhiReg = MRI.createVirtualRegister(&ARM::rGPRRegClass);
  10473. Register CurrDestReg = MRI.createVirtualRegister(&ARM::rGPRRegClass);
  10474. BuildMI(TpLoopBody, Dl, TII->get(ARM::PHI), DestPhiReg)
  10475. .addUse(OpDestReg)
  10476. .addMBB(TpEntry)
  10477. .addUse(CurrDestReg)
  10478. .addMBB(TpLoopBody);
  10479. // Current loop counter
  10480. Register LoopCounterPhiReg = MRI.createVirtualRegister(&ARM::GPRlrRegClass);
  10481. Register RemainingLoopIterationsReg =
  10482. MRI.createVirtualRegister(&ARM::GPRlrRegClass);
  10483. BuildMI(TpLoopBody, Dl, TII->get(ARM::PHI), LoopCounterPhiReg)
  10484. .addUse(TotalIterationsReg)
  10485. .addMBB(TpEntry)
  10486. .addUse(RemainingLoopIterationsReg)
  10487. .addMBB(TpLoopBody);
  10488. // Predication counter
  10489. Register PredCounterPhiReg = MRI.createVirtualRegister(&ARM::rGPRRegClass);
  10490. Register RemainingElementsReg = MRI.createVirtualRegister(&ARM::rGPRRegClass);
  10491. BuildMI(TpLoopBody, Dl, TII->get(ARM::PHI), PredCounterPhiReg)
  10492. .addUse(ElementCountReg)
  10493. .addMBB(TpEntry)
  10494. .addUse(RemainingElementsReg)
  10495. .addMBB(TpLoopBody);
  10496. // Pass predication counter to VCTP
  10497. Register VccrReg = MRI.createVirtualRegister(&ARM::VCCRRegClass);
  10498. BuildMI(TpLoopBody, Dl, TII->get(ARM::MVE_VCTP8), VccrReg)
  10499. .addUse(PredCounterPhiReg)
  10500. .addImm(ARMVCC::None)
  10501. .addReg(0)
  10502. .addReg(0);
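// Each iteration handles 16 elements, so step the predication counter down
// by 16.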
  10503. BuildMI(TpLoopBody, Dl, TII->get(ARM::t2SUBri), RemainingElementsReg)
  10504. .addUse(PredCounterPhiReg)
  10505. .addImm(16)
  10506. .add(predOps(ARMCC::AL))
  10507. .addReg(0);
  10508. // VLDRB (only if memcpy) and VSTRB instructions, predicated using VPR
  10509. Register SrcValueReg;
  10510. if (IsMemcpy) {
  10511. SrcValueReg = MRI.createVirtualRegister(&ARM::MQPRRegClass);
  10512. BuildMI(TpLoopBody, Dl, TII->get(ARM::MVE_VLDRBU8_post))
  10513. .addDef(CurrSrcReg)
  10514. .addDef(SrcValueReg)
  10515. .addReg(SrcPhiReg)
  10516. .addImm(16)
  10517. .addImm(ARMVCC::Then)
  10518. .addUse(VccrReg)
  10519. .addReg(0);
  10520. } else
  10521. SrcValueReg = OpSrcReg;
  10522. BuildMI(TpLoopBody, Dl, TII->get(ARM::MVE_VSTRBU8_post))
  10523. .addDef(CurrDestReg)
  10524. .addUse(SrcValueReg)
  10525. .addReg(DestPhiReg)
  10526. .addImm(16)
  10527. .addImm(ARMVCC::Then)
  10528. .addUse(VccrReg)
  10529. .addReg(0);
10530. // Add the pseudo instructions for decrementing the loop counter and marking
10531. // the end: t2LoopDec and t2LoopEnd.
  10532. BuildMI(TpLoopBody, Dl, TII->get(ARM::t2LoopDec), RemainingLoopIterationsReg)
  10533. .addUse(LoopCounterPhiReg)
  10534. .addImm(1);
  10535. BuildMI(TpLoopBody, Dl, TII->get(ARM::t2LoopEnd))
  10536. .addUse(RemainingLoopIterationsReg)
  10537. .addMBB(TpLoopBody);
  10538. BuildMI(TpLoopBody, Dl, TII->get(ARM::t2B))
  10539. .addMBB(TpExit)
  10540. .add(predOps(ARMCC::AL));
  10541. }
  10542. MachineBasicBlock *
  10543. ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
  10544. MachineBasicBlock *BB) const {
  10545. const TargetInstrInfo *TII = Subtarget->getInstrInfo();
  10546. DebugLoc dl = MI.getDebugLoc();
  10547. bool isThumb2 = Subtarget->isThumb2();
  10548. switch (MI.getOpcode()) {
  10549. default: {
  10550. MI.print(errs());
  10551. llvm_unreachable("Unexpected instr type to insert");
  10552. }
  10553. // Thumb1 post-indexed loads are really just single-register LDMs.
  10554. case ARM::tLDR_postidx: {
  10555. MachineOperand Def(MI.getOperand(1));
  10556. BuildMI(*BB, MI, dl, TII->get(ARM::tLDMIA_UPD))
  10557. .add(Def) // Rn_wb
  10558. .add(MI.getOperand(2)) // Rn
  10559. .add(MI.getOperand(3)) // PredImm
  10560. .add(MI.getOperand(4)) // PredReg
  10561. .add(MI.getOperand(0)) // Rt
  10562. .cloneMemRefs(MI);
  10563. MI.eraseFromParent();
  10564. return BB;
  10565. }
  10566. case ARM::MVE_MEMCPYLOOPINST:
  10567. case ARM::MVE_MEMSETLOOPINST: {
10568. // The transformation below expands the MVE_MEMCPYLOOPINST/MVE_MEMSETLOOPINST
10569. // pseudo into a Tail Predicated (TP) loop. It adds the instructions to
10570. // calculate the iteration count (= ceil(size_in_bytes / 16)) in the TP entry
10571. // block and adds the relevant instructions in the TP loop body for the
10572. // generation of a WLSTP loop.
10573. // Below is the relevant portion of the CFG after the transformation.
  10574. // The Machine Basic Blocks are shown along with branch conditions (in
  10575. // brackets). Note that TP entry/exit MBBs depict the entry/exit of this
  10576. // portion of the CFG and may not necessarily be the entry/exit of the
  10577. // function.
10578. //   (Relevant) CFG after transformation:
10579. //       TP entry MBB
10580. //            |
10581. //      |-------------------|
10582. //   (n <= 0)            (n > 0)
10583. //      |                   |
10584. //      |          TP loop Body MBB <--|
10585. //      |                   |          |
10586. //       \                  |__________|
10587. //        \                /
10588. //          TP exit MBB
  10589. MachineFunction *MF = BB->getParent();
  10590. MachineFunctionProperties &Properties = MF->getProperties();
  10591. MachineRegisterInfo &MRI = MF->getRegInfo();
  10592. Register OpDestReg = MI.getOperand(0).getReg();
  10593. Register OpSrcReg = MI.getOperand(1).getReg();
  10594. Register OpSizeReg = MI.getOperand(2).getReg();
  10595. // Allocate the required MBBs and add to parent function.
  10596. MachineBasicBlock *TpEntry = BB;
  10597. MachineBasicBlock *TpLoopBody = MF->CreateMachineBasicBlock();
  10598. MachineBasicBlock *TpExit;
  10599. MF->push_back(TpLoopBody);
  10600. // If any instructions are present in the current block after
  10601. // MVE_MEMCPYLOOPINST or MVE_MEMSETLOOPINST, split the current block and
  10602. // move the instructions into the newly created exit block. If there are no
10603. // instructions, add an explicit branch to the fall-through block and then
  10604. // split.
  10605. //
  10606. // The split is required for two reasons:
10607. // 1) A terminator (t2WhileLoopStart) will be placed at that site.
10608. // 2) Since a TPLoopBody will be added later, any phis in successor blocks
  10609. // need to be updated. splitAt() already handles this.
  10610. TpExit = BB->splitAt(MI, false);
  10611. if (TpExit == BB) {
  10612. assert(BB->canFallThrough() && "Exit Block must be Fallthrough of the "
  10613. "block containing memcpy/memset Pseudo");
  10614. TpExit = BB->getFallThrough();
  10615. BuildMI(BB, dl, TII->get(ARM::t2B))
  10616. .addMBB(TpExit)
  10617. .add(predOps(ARMCC::AL));
  10618. TpExit = BB->splitAt(MI, false);
  10619. }
  10620. // Add logic for iteration count
  10621. Register TotalIterationsReg =
  10622. genTPEntry(TpEntry, TpLoopBody, TpExit, OpSizeReg, TII, dl, MRI);
  10623. // Add the vectorized (and predicated) loads/store instructions
  10624. bool IsMemcpy = MI.getOpcode() == ARM::MVE_MEMCPYLOOPINST;
  10625. genTPLoopBody(TpLoopBody, TpEntry, TpExit, TII, dl, MRI, OpSrcReg,
  10626. OpDestReg, OpSizeReg, TotalIterationsReg, IsMemcpy);
  10627. // Required to avoid conflict with the MachineVerifier during testing.
  10628. Properties.reset(MachineFunctionProperties::Property::NoPHIs);
  10629. // Connect the blocks
  10630. TpEntry->addSuccessor(TpLoopBody);
  10631. TpLoopBody->addSuccessor(TpLoopBody);
  10632. TpLoopBody->addSuccessor(TpExit);
  10633. // Reorder for a more natural layout
  10634. TpLoopBody->moveAfter(TpEntry);
  10635. TpExit->moveAfter(TpLoopBody);
10636. // Finally, remove the memcpy/memset pseudo instruction.
  10637. MI.eraseFromParent();
  10638. // Return the exit block as it may contain other instructions requiring a
  10639. // custom inserter
  10640. return TpExit;
  10641. }
  10642. // The Thumb2 pre-indexed stores have the same MI operands, they just
  10643. // define them differently in the .td files from the isel patterns, so
  10644. // they need pseudos.
  10645. case ARM::t2STR_preidx:
  10646. MI.setDesc(TII->get(ARM::t2STR_PRE));
  10647. return BB;
  10648. case ARM::t2STRB_preidx:
  10649. MI.setDesc(TII->get(ARM::t2STRB_PRE));
  10650. return BB;
  10651. case ARM::t2STRH_preidx:
  10652. MI.setDesc(TII->get(ARM::t2STRH_PRE));
  10653. return BB;
  10654. case ARM::STRi_preidx:
  10655. case ARM::STRBi_preidx: {
  10656. unsigned NewOpc = MI.getOpcode() == ARM::STRi_preidx ? ARM::STR_PRE_IMM
  10657. : ARM::STRB_PRE_IMM;
  10658. // Decode the offset.
  10659. unsigned Offset = MI.getOperand(4).getImm();
  10660. bool isSub = ARM_AM::getAM2Op(Offset) == ARM_AM::sub;
  10661. Offset = ARM_AM::getAM2Offset(Offset);
  10662. if (isSub)
  10663. Offset = -Offset;
  10664. MachineMemOperand *MMO = *MI.memoperands_begin();
  10665. BuildMI(*BB, MI, dl, TII->get(NewOpc))
  10666. .add(MI.getOperand(0)) // Rn_wb
  10667. .add(MI.getOperand(1)) // Rt
  10668. .add(MI.getOperand(2)) // Rn
  10669. .addImm(Offset) // offset (skip GPR==zero_reg)
  10670. .add(MI.getOperand(5)) // pred
  10671. .add(MI.getOperand(6))
  10672. .addMemOperand(MMO);
  10673. MI.eraseFromParent();
  10674. return BB;
  10675. }
  10676. case ARM::STRr_preidx:
  10677. case ARM::STRBr_preidx:
  10678. case ARM::STRH_preidx: {
  10679. unsigned NewOpc;
  10680. switch (MI.getOpcode()) {
  10681. default: llvm_unreachable("unexpected opcode!");
  10682. case ARM::STRr_preidx: NewOpc = ARM::STR_PRE_REG; break;
  10683. case ARM::STRBr_preidx: NewOpc = ARM::STRB_PRE_REG; break;
  10684. case ARM::STRH_preidx: NewOpc = ARM::STRH_PRE; break;
  10685. }
  10686. MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(NewOpc));
  10687. for (const MachineOperand &MO : MI.operands())
  10688. MIB.add(MO);
  10689. MI.eraseFromParent();
  10690. return BB;
  10691. }
  10692. case ARM::tMOVCCr_pseudo: {
  10693. // To "insert" a SELECT_CC instruction, we actually have to insert the
  10694. // diamond control-flow pattern. The incoming instruction knows the
  10695. // destination vreg to set, the condition code register to branch on, the
  10696. // true/false values to select between, and a branch opcode to use.
  10697. const BasicBlock *LLVM_BB = BB->getBasicBlock();
  10698. MachineFunction::iterator It = ++BB->getIterator();
  10699. // thisMBB:
  10700. // ...
  10701. // TrueVal = ...
  10702. // cmpTY ccX, r1, r2
  10703. // bCC copy1MBB
  10704. // fallthrough --> copy0MBB
  10705. MachineBasicBlock *thisMBB = BB;
  10706. MachineFunction *F = BB->getParent();
  10707. MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB);
  10708. MachineBasicBlock *sinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
  10709. F->insert(It, copy0MBB);
  10710. F->insert(It, sinkMBB);
  10711. // Check whether CPSR is live past the tMOVCCr_pseudo.
  10712. const TargetRegisterInfo *TRI = Subtarget->getRegisterInfo();
  10713. if (!MI.killsRegister(ARM::CPSR) &&
  10714. !checkAndUpdateCPSRKill(MI, thisMBB, TRI)) {
  10715. copy0MBB->addLiveIn(ARM::CPSR);
  10716. sinkMBB->addLiveIn(ARM::CPSR);
  10717. }
  10718. // Transfer the remainder of BB and its successor edges to sinkMBB.
  10719. sinkMBB->splice(sinkMBB->begin(), BB,
  10720. std::next(MachineBasicBlock::iterator(MI)), BB->end());
  10721. sinkMBB->transferSuccessorsAndUpdatePHIs(BB);
  10722. BB->addSuccessor(copy0MBB);
  10723. BB->addSuccessor(sinkMBB);
  10724. BuildMI(BB, dl, TII->get(ARM::tBcc))
  10725. .addMBB(sinkMBB)
  10726. .addImm(MI.getOperand(3).getImm())
  10727. .addReg(MI.getOperand(4).getReg());
  10728. // copy0MBB:
  10729. // %FalseValue = ...
  10730. // # fallthrough to sinkMBB
  10731. BB = copy0MBB;
  10732. // Update machine-CFG edges
  10733. BB->addSuccessor(sinkMBB);
  10734. // sinkMBB:
  10735. // %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ]
  10736. // ...
  10737. BB = sinkMBB;
  10738. BuildMI(*BB, BB->begin(), dl, TII->get(ARM::PHI), MI.getOperand(0).getReg())
  10739. .addReg(MI.getOperand(1).getReg())
  10740. .addMBB(copy0MBB)
  10741. .addReg(MI.getOperand(2).getReg())
  10742. .addMBB(thisMBB);
  10743. MI.eraseFromParent(); // The pseudo instruction is gone now.
  10744. return BB;
  10745. }
  10746. case ARM::BCCi64:
  10747. case ARM::BCCZi64: {
  10748. // If there is an unconditional branch to the other successor, remove it.
  10749. BB->erase(std::next(MachineBasicBlock::iterator(MI)), BB->end());
  10750. // Compare both parts that make up the double comparison separately for
  10751. // equality.
  10752. bool RHSisZero = MI.getOpcode() == ARM::BCCZi64;
  10753. Register LHS1 = MI.getOperand(1).getReg();
  10754. Register LHS2 = MI.getOperand(2).getReg();
  10755. if (RHSisZero) {
  10756. BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPri : ARM::CMPri))
  10757. .addReg(LHS1)
  10758. .addImm(0)
  10759. .add(predOps(ARMCC::AL));
  10760. BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPri : ARM::CMPri))
  10761. .addReg(LHS2).addImm(0)
  10762. .addImm(ARMCC::EQ).addReg(ARM::CPSR);
  10763. } else {
  10764. Register RHS1 = MI.getOperand(3).getReg();
  10765. Register RHS2 = MI.getOperand(4).getReg();
  10766. BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPrr : ARM::CMPrr))
  10767. .addReg(LHS1)
  10768. .addReg(RHS1)
  10769. .add(predOps(ARMCC::AL));
  10770. BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPrr : ARM::CMPrr))
  10771. .addReg(LHS2).addReg(RHS2)
  10772. .addImm(ARMCC::EQ).addReg(ARM::CPSR);
  10773. }
  10774. MachineBasicBlock *destMBB = MI.getOperand(RHSisZero ? 3 : 5).getMBB();
  10775. MachineBasicBlock *exitMBB = OtherSucc(BB, destMBB);
  10776. if (MI.getOperand(0).getImm() == ARMCC::NE)
  10777. std::swap(destMBB, exitMBB);
  10778. BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2Bcc : ARM::Bcc))
  10779. .addMBB(destMBB).addImm(ARMCC::EQ).addReg(ARM::CPSR);
  10780. if (isThumb2)
  10781. BuildMI(BB, dl, TII->get(ARM::t2B))
  10782. .addMBB(exitMBB)
  10783. .add(predOps(ARMCC::AL));
  10784. else
10785. BuildMI(BB, dl, TII->get(ARM::B)).addMBB(exitMBB);
  10786. MI.eraseFromParent(); // The pseudo instruction is gone now.
  10787. return BB;
  10788. }
  10789. case ARM::Int_eh_sjlj_setjmp:
  10790. case ARM::Int_eh_sjlj_setjmp_nofp:
  10791. case ARM::tInt_eh_sjlj_setjmp:
  10792. case ARM::t2Int_eh_sjlj_setjmp:
  10793. case ARM::t2Int_eh_sjlj_setjmp_nofp:
  10794. return BB;
  10795. case ARM::Int_eh_sjlj_setup_dispatch:
  10796. EmitSjLjDispatchBlock(MI, BB);
  10797. return BB;
  10798. case ARM::ABS:
  10799. case ARM::t2ABS: {
  10800. // To insert an ABS instruction, we have to insert the
  10801. // diamond control-flow pattern. The incoming instruction knows the
  10802. // source vreg to test against 0, the destination vreg to set,
  10803. // the condition code register to branch on, the
  10804. // true/false values to select between, and a branch opcode to use.
  10805. // It transforms
  10806. // V1 = ABS V0
  10807. // into
  10808. // V2 = MOVS V0
  10809. // BCC (branch to SinkBB if V0 >= 0)
  10810. // RSBBB: V3 = RSBri V2, 0 (compute ABS if V2 < 0)
  10811. // SinkBB: V1 = PHI(V2, V3)
  10812. const BasicBlock *LLVM_BB = BB->getBasicBlock();
  10813. MachineFunction::iterator BBI = ++BB->getIterator();
  10814. MachineFunction *Fn = BB->getParent();
  10815. MachineBasicBlock *RSBBB = Fn->CreateMachineBasicBlock(LLVM_BB);
  10816. MachineBasicBlock *SinkBB = Fn->CreateMachineBasicBlock(LLVM_BB);
  10817. Fn->insert(BBI, RSBBB);
  10818. Fn->insert(BBI, SinkBB);
  10819. Register ABSSrcReg = MI.getOperand(1).getReg();
  10820. Register ABSDstReg = MI.getOperand(0).getReg();
  10821. bool ABSSrcKIll = MI.getOperand(1).isKill();
  10822. bool isThumb2 = Subtarget->isThumb2();
  10823. MachineRegisterInfo &MRI = Fn->getRegInfo();
  10824. // In Thumb mode S must not be specified if source register is the SP or
  10825. // PC and if destination register is the SP, so restrict register class
  10826. Register NewRsbDstReg = MRI.createVirtualRegister(
  10827. isThumb2 ? &ARM::rGPRRegClass : &ARM::GPRRegClass);
  10828. // Transfer the remainder of BB and its successor edges to sinkMBB.
  10829. SinkBB->splice(SinkBB->begin(), BB,
  10830. std::next(MachineBasicBlock::iterator(MI)), BB->end());
  10831. SinkBB->transferSuccessorsAndUpdatePHIs(BB);
  10832. BB->addSuccessor(RSBBB);
  10833. BB->addSuccessor(SinkBB);
  10834. // fall through to SinkMBB
  10835. RSBBB->addSuccessor(SinkBB);
  10836. // insert a cmp at the end of BB
  10837. BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPri : ARM::CMPri))
  10838. .addReg(ABSSrcReg)
  10839. .addImm(0)
  10840. .add(predOps(ARMCC::AL));
  10841. // insert a bcc with opposite CC to ARMCC::MI at the end of BB
  10842. BuildMI(BB, dl,
  10843. TII->get(isThumb2 ? ARM::t2Bcc : ARM::Bcc)).addMBB(SinkBB)
  10844. .addImm(ARMCC::getOppositeCondition(ARMCC::MI)).addReg(ARM::CPSR);
  10845. // insert rsbri in RSBBB
  10846. // Note: BCC and rsbri will be converted into predicated rsbmi
  10847. // by if-conversion pass
  10848. BuildMI(*RSBBB, RSBBB->begin(), dl,
  10849. TII->get(isThumb2 ? ARM::t2RSBri : ARM::RSBri), NewRsbDstReg)
  10850. .addReg(ABSSrcReg, ABSSrcKIll ? RegState::Kill : 0)
  10851. .addImm(0)
  10852. .add(predOps(ARMCC::AL))
  10853. .add(condCodeOp());
  10854. // insert PHI in SinkBB,
  10855. // reuse ABSDstReg to not change uses of ABS instruction
  10856. BuildMI(*SinkBB, SinkBB->begin(), dl,
  10857. TII->get(ARM::PHI), ABSDstReg)
  10858. .addReg(NewRsbDstReg).addMBB(RSBBB)
  10859. .addReg(ABSSrcReg).addMBB(BB);
  10860. // remove ABS instruction
  10861. MI.eraseFromParent();
  10862. // return last added BB
  10863. return SinkBB;
  10864. }
  10865. case ARM::COPY_STRUCT_BYVAL_I32:
  10866. ++NumLoopByVals;
  10867. return EmitStructByval(MI, BB);
  10868. case ARM::WIN__CHKSTK:
  10869. return EmitLowered__chkstk(MI, BB);
  10870. case ARM::WIN__DBZCHK:
  10871. return EmitLowered__dbzchk(MI, BB);
  10872. }
  10873. }
  10874. /// Attaches vregs to MEMCPY that it will use as scratch registers
  10875. /// when it is expanded into LDM/STM. This is done as a post-isel lowering
  10876. /// instead of as a custom inserter because we need the use list from the SDNode.
  10877. static void attachMEMCPYScratchRegs(const ARMSubtarget *Subtarget,
  10878. MachineInstr &MI, const SDNode *Node) {
  10879. bool isThumb1 = Subtarget->isThumb1Only();
  10880. DebugLoc DL = MI.getDebugLoc();
  10881. MachineFunction *MF = MI.getParent()->getParent();
  10882. MachineRegisterInfo &MRI = MF->getRegInfo();
  10883. MachineInstrBuilder MIB(*MF, MI);
  10884. // If the new dst/src is unused mark it as dead.
  10885. if (!Node->hasAnyUseOfValue(0)) {
  10886. MI.getOperand(0).setIsDead(true);
  10887. }
  10888. if (!Node->hasAnyUseOfValue(1)) {
  10889. MI.getOperand(1).setIsDead(true);
  10890. }
  10891. // The MEMCPY both defines and kills the scratch registers.
  10892. for (unsigned I = 0; I != MI.getOperand(4).getImm(); ++I) {
  10893. Register TmpReg = MRI.createVirtualRegister(isThumb1 ? &ARM::tGPRRegClass
  10894. : &ARM::GPRRegClass);
  10895. MIB.addReg(TmpReg, RegState::Define|RegState::Dead);
  10896. }
  10897. }
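// Post-isel hook: attach scratch registers to MEMCPY and rewrite the
// flag-setting pseudos (ADCS, SBCS, RSBS, RSCS, ...) to their real opcodes
// with an explicit cc_out operand (see convertAddSubFlagsOpcode).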
  10898. void ARMTargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI,
  10899. SDNode *Node) const {
  10900. if (MI.getOpcode() == ARM::MEMCPY) {
  10901. attachMEMCPYScratchRegs(Subtarget, MI, Node);
  10902. return;
  10903. }
  10904. const MCInstrDesc *MCID = &MI.getDesc();
  10905. // Adjust potentially 's' setting instructions after isel, i.e. ADC, SBC, RSB,
  10906. // RSC. Coming out of isel, they have an implicit CPSR def, but the optional
  10907. // operand is still set to noreg. If needed, set the optional operand's
  10908. // register to CPSR, and remove the redundant implicit def.
  10909. //
  10910. // e.g. ADCS (..., implicit-def CPSR) -> ADC (... opt:def CPSR).
  10911. // Rename pseudo opcodes.
  10912. unsigned NewOpc = convertAddSubFlagsOpcode(MI.getOpcode());
  10913. unsigned ccOutIdx;
  10914. if (NewOpc) {
  10915. const ARMBaseInstrInfo *TII = Subtarget->getInstrInfo();
  10916. MCID = &TII->get(NewOpc);
  10917. assert(MCID->getNumOperands() ==
  10918. MI.getDesc().getNumOperands() + 5 - MI.getDesc().getSize()
  10919. && "converted opcode should be the same except for cc_out"
  10920. " (and, on Thumb1, pred)");
  10921. MI.setDesc(*MCID);
  10922. // Add the optional cc_out operand
  10923. MI.addOperand(MachineOperand::CreateReg(0, /*isDef=*/true));
  10924. // On Thumb1, move all input operands to the end, then add the predicate
  10925. if (Subtarget->isThumb1Only()) {
  10926. for (unsigned c = MCID->getNumOperands() - 4; c--;) {
  10927. MI.addOperand(MI.getOperand(1));
  10928. MI.removeOperand(1);
  10929. }
  10930. // Restore the ties
  10931. for (unsigned i = MI.getNumOperands(); i--;) {
  10932. const MachineOperand& op = MI.getOperand(i);
  10933. if (op.isReg() && op.isUse()) {
  10934. int DefIdx = MCID->getOperandConstraint(i, MCOI::TIED_TO);
  10935. if (DefIdx != -1)
  10936. MI.tieOperands(DefIdx, i);
  10937. }
  10938. }
  10939. MI.addOperand(MachineOperand::CreateImm(ARMCC::AL));
  10940. MI.addOperand(MachineOperand::CreateReg(0, /*isDef=*/false));
  10941. ccOutIdx = 1;
  10942. } else
  10943. ccOutIdx = MCID->getNumOperands() - 1;
  10944. } else
  10945. ccOutIdx = MCID->getNumOperands() - 1;
  10946. // Any ARM instruction that sets the 's' bit should specify an optional
  10947. // "cc_out" operand in the last operand position.
  10948. if (!MI.hasOptionalDef() || !MCID->operands()[ccOutIdx].isOptionalDef()) {
  10949. assert(!NewOpc && "Optional cc_out operand required");
  10950. return;
  10951. }
  10952. // Look for an implicit def of CPSR added by MachineInstr ctor. Remove it
  10953. // since we already have an optional CPSR def.
  10954. bool definesCPSR = false;
  10955. bool deadCPSR = false;
  10956. for (unsigned i = MCID->getNumOperands(), e = MI.getNumOperands(); i != e;
  10957. ++i) {
  10958. const MachineOperand &MO = MI.getOperand(i);
  10959. if (MO.isReg() && MO.isDef() && MO.getReg() == ARM::CPSR) {
  10960. definesCPSR = true;
  10961. if (MO.isDead())
  10962. deadCPSR = true;
  10963. MI.removeOperand(i);
  10964. break;
  10965. }
  10966. }
  10967. if (!definesCPSR) {
  10968. assert(!NewOpc && "Optional cc_out operand required");
  10969. return;
  10970. }
  10971. assert(deadCPSR == !Node->hasAnyUseOfValue(1) && "inconsistent dead flag");
  10972. if (deadCPSR) {
  10973. assert(!MI.getOperand(ccOutIdx).getReg() &&
  10974. "expect uninitialized optional cc_out operand");
  10975. // Thumb1 instructions must have the S bit even if the CPSR is dead.
  10976. if (!Subtarget->isThumb1Only())
  10977. return;
  10978. }
  10979. // If this instruction was defined with an optional CPSR def and its dag node
  10980. // had a live implicit CPSR def, then activate the optional CPSR def.
  10981. MachineOperand &MO = MI.getOperand(ccOutIdx);
  10982. MO.setReg(ARM::CPSR);
  10983. MO.setIsDef(true);
  10984. }
  10985. //===----------------------------------------------------------------------===//
  10986. // ARM Optimization Hooks
  10987. //===----------------------------------------------------------------------===//
  10988. // Helper function that checks if N is a null or all ones constant.
  10989. static inline bool isZeroOrAllOnes(SDValue N, bool AllOnes) {
  10990. return AllOnes ? isAllOnesConstant(N) : isNullConstant(N);
  10991. }
  10992. // Return true if N is conditionally 0 or all ones.
  10993. // Detects these expressions where cc is an i1 value:
  10994. //
  10995. // (select cc 0, y) [AllOnes=0]
  10996. // (select cc y, 0) [AllOnes=0]
  10997. // (zext cc) [AllOnes=0]
  10998. // (sext cc) [AllOnes=0/1]
  10999. // (select cc -1, y) [AllOnes=1]
  11000. // (select cc y, -1) [AllOnes=1]
  11001. //
  11002. // Invert is set when N is the null/all ones constant when CC is false.
  11003. // OtherOp is set to the alternative value of N.
  11004. static bool isConditionalZeroOrAllOnes(SDNode *N, bool AllOnes,
  11005. SDValue &CC, bool &Invert,
  11006. SDValue &OtherOp,
  11007. SelectionDAG &DAG) {
  11008. switch (N->getOpcode()) {
  11009. default: return false;
  11010. case ISD::SELECT: {
  11011. CC = N->getOperand(0);
  11012. SDValue N1 = N->getOperand(1);
  11013. SDValue N2 = N->getOperand(2);
  11014. if (isZeroOrAllOnes(N1, AllOnes)) {
  11015. Invert = false;
  11016. OtherOp = N2;
  11017. return true;
  11018. }
  11019. if (isZeroOrAllOnes(N2, AllOnes)) {
  11020. Invert = true;
  11021. OtherOp = N1;
  11022. return true;
  11023. }
  11024. return false;
  11025. }
  11026. case ISD::ZERO_EXTEND:
  11027. // (zext cc) can never be the all ones value.
  11028. if (AllOnes)
  11029. return false;
  11030. [[fallthrough]];
  11031. case ISD::SIGN_EXTEND: {
  11032. SDLoc dl(N);
  11033. EVT VT = N->getValueType(0);
  11034. CC = N->getOperand(0);
  11035. if (CC.getValueType() != MVT::i1 || CC.getOpcode() != ISD::SETCC)
  11036. return false;
  11037. Invert = !AllOnes;
  11038. if (AllOnes)
  11039. // When looking for an AllOnes constant, N is an sext, and the 'other'
  11040. // value is 0.
  11041. OtherOp = DAG.getConstant(0, dl, VT);
  11042. else if (N->getOpcode() == ISD::ZERO_EXTEND)
  11043. // When looking for a 0 constant, N can be zext or sext.
  11044. OtherOp = DAG.getConstant(1, dl, VT);
  11045. else
  11046. OtherOp = DAG.getAllOnesConstant(dl, VT);
  11047. return true;
  11048. }
  11049. }
  11050. }
  11051. // Combine a constant select operand into its use:
  11052. //
  11053. // (add (select cc, 0, c), x) -> (select cc, x, (add, x, c))
  11054. // (sub x, (select cc, 0, c)) -> (select cc, x, (sub, x, c))
  11055. // (and (select cc, -1, c), x) -> (select cc, x, (and, x, c)) [AllOnes=1]
  11056. // (or (select cc, 0, c), x) -> (select cc, x, (or, x, c))
  11057. // (xor (select cc, 0, c), x) -> (select cc, x, (xor, x, c))
  11058. //
  11059. // The transform is rejected if the select doesn't have a constant operand that
  11060. // is null, or all ones when AllOnes is set.
  11061. //
  11062. // Also recognize sext/zext from i1:
  11063. //
  11064. // (add (zext cc), x) -> (select cc (add x, 1), x)
  11065. // (add (sext cc), x) -> (select cc (add x, -1), x)
  11066. //
  11067. // These transformations eventually create predicated instructions.
  11068. //
  11069. // @param N The node to transform.
  11070. // @param Slct The N operand that is a select.
  11071. // @param OtherOp The other N operand (x above).
  11072. // @param DCI Context.
  11073. // @param AllOnes Require the select constant to be all ones instead of null.
  11074. // @returns The new node, or SDValue() on failure.
  11075. static
  11076. SDValue combineSelectAndUse(SDNode *N, SDValue Slct, SDValue OtherOp,
  11077. TargetLowering::DAGCombinerInfo &DCI,
  11078. bool AllOnes = false) {
  11079. SelectionDAG &DAG = DCI.DAG;
  11080. EVT VT = N->getValueType(0);
  11081. SDValue NonConstantVal;
  11082. SDValue CCOp;
  11083. bool SwapSelectOps;
  11084. if (!isConditionalZeroOrAllOnes(Slct.getNode(), AllOnes, CCOp, SwapSelectOps,
  11085. NonConstantVal, DAG))
  11086. return SDValue();
11087. // Slct is now known to be the desired identity constant when CC is true.
  11088. SDValue TrueVal = OtherOp;
  11089. SDValue FalseVal = DAG.getNode(N->getOpcode(), SDLoc(N), VT,
  11090. OtherOp, NonConstantVal);
  11091. // Unless SwapSelectOps says CC should be false.
  11092. if (SwapSelectOps)
  11093. std::swap(TrueVal, FalseVal);
  11094. return DAG.getNode(ISD::SELECT, SDLoc(N), VT,
  11095. CCOp, TrueVal, FalseVal);
  11096. }
// Attempt combineSelectAndUse on each operand of a commutative operator N.
static
SDValue combineSelectAndUseCommutative(SDNode *N, bool AllOnes,
                                       TargetLowering::DAGCombinerInfo &DCI) {
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  if (N0.getNode()->hasOneUse())
    if (SDValue Result = combineSelectAndUse(N, N0, N1, DCI, AllOnes))
      return Result;
  if (N1.getNode()->hasOneUse())
    if (SDValue Result = combineSelectAndUse(N, N1, N0, DCI, AllOnes))
      return Result;
  return SDValue();
}

static bool IsVUZPShuffleNode(SDNode *N) {
  // VUZP shuffle node.
  if (N->getOpcode() == ARMISD::VUZP)
    return true;

  // "VUZP" on i32 is an alias for VTRN.
  if (N->getOpcode() == ARMISD::VTRN && N->getValueType(0) == MVT::v2i32)
    return true;

  return false;
}
static SDValue AddCombineToVPADD(SDNode *N, SDValue N0, SDValue N1,
                                 TargetLowering::DAGCombinerInfo &DCI,
                                 const ARMSubtarget *Subtarget) {
  // Look for ADD(VUZP.0, VUZP.1).
  if (!IsVUZPShuffleNode(N0.getNode()) || N0.getNode() != N1.getNode() ||
      N0 == N1)
    return SDValue();

  // Make sure the ADD is a 64-bit add; there is no 128-bit VPADD.
  if (!N->getValueType(0).is64BitVector())
    return SDValue();

  // Generate vpadd.
  SelectionDAG &DAG = DCI.DAG;
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  SDLoc dl(N);
  SDNode *Unzip = N0.getNode();
  EVT VT = N->getValueType(0);

  SmallVector<SDValue, 8> Ops;
  Ops.push_back(DAG.getConstant(Intrinsic::arm_neon_vpadd, dl,
                                TLI.getPointerTy(DAG.getDataLayout())));
  Ops.push_back(Unzip->getOperand(0));
  Ops.push_back(Unzip->getOperand(1));

  return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, Ops);
}
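
// For example (illustrative only): for a v8i8 ADD of the two results of a
// VUZP of d-register values a and b, the intrinsic built above selects to a
// pairwise add of the original inputs, roughly:
//   vpadd.i8 dDst, dA, dB
// where dDst/dA/dB are placeholder register names.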
static SDValue AddCombineVUZPToVPADDL(SDNode *N, SDValue N0, SDValue N1,
                                      TargetLowering::DAGCombinerInfo &DCI,
                                      const ARMSubtarget *Subtarget) {
  // Check for two extended operands.
  if (!(N0.getOpcode() == ISD::SIGN_EXTEND &&
        N1.getOpcode() == ISD::SIGN_EXTEND) &&
      !(N0.getOpcode() == ISD::ZERO_EXTEND &&
        N1.getOpcode() == ISD::ZERO_EXTEND))
    return SDValue();

  SDValue N00 = N0.getOperand(0);
  SDValue N10 = N1.getOperand(0);

  // Look for ADD(SEXT(VUZP.0), SEXT(VUZP.1))
  if (!IsVUZPShuffleNode(N00.getNode()) || N00.getNode() != N10.getNode() ||
      N00 == N10)
    return SDValue();

  // We only recognize Q register paddl here; this can't be reached until
  // after type legalization.
  if (!N00.getValueType().is64BitVector() ||
      !N0.getValueType().is128BitVector())
    return SDValue();

  // Generate vpaddl.
  SelectionDAG &DAG = DCI.DAG;
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  SDLoc dl(N);
  EVT VT = N->getValueType(0);

  SmallVector<SDValue, 8> Ops;
  // Form vpaddl.sN or vpaddl.uN depending on the kind of extension.
  unsigned Opcode;
  if (N0.getOpcode() == ISD::SIGN_EXTEND)
    Opcode = Intrinsic::arm_neon_vpaddls;
  else
    Opcode = Intrinsic::arm_neon_vpaddlu;
  Ops.push_back(DAG.getConstant(Opcode, dl,
                                TLI.getPointerTy(DAG.getDataLayout())));
  EVT ElemTy = N00.getValueType().getVectorElementType();
  unsigned NumElts = VT.getVectorNumElements();
  EVT ConcatVT = EVT::getVectorVT(*DAG.getContext(), ElemTy, NumElts * 2);
  SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), ConcatVT,
                               N00.getOperand(0), N00.getOperand(1));
  Ops.push_back(Concat);

  return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, Ops);
}
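
// For example (illustrative only), with a and b both v8i8:
//   add(sext(vuzp.0(a, b)) to v8i16, sext(vuzp.1(a, b)) to v8i16)
// is the widening pairwise sum of concat(a, b), so it is rewritten here to a
// vpaddl.s8 of that concatenation.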
// FIXME: This function shouldn't be necessary; if we lower BUILD_VECTOR in
// an appropriate manner, we end up with ADD(VUZP(ZEXT(N))), which is
// much easier to match.
static SDValue
AddCombineBUILD_VECTORToVPADDL(SDNode *N, SDValue N0, SDValue N1,
                               TargetLowering::DAGCombinerInfo &DCI,
                               const ARMSubtarget *Subtarget) {
  // Only perform optimization if after legalize, and if NEON is available. We
  // also expect both operands to be BUILD_VECTORs.
  if (DCI.isBeforeLegalize() || !Subtarget->hasNEON()
      || N0.getOpcode() != ISD::BUILD_VECTOR
      || N1.getOpcode() != ISD::BUILD_VECTOR)
    return SDValue();

  // Check output type since VPADDL operand elements can only be 8, 16, or 32.
  EVT VT = N->getValueType(0);
  if (!VT.isInteger() || VT.getVectorElementType() == MVT::i64)
    return SDValue();

  // Check that the vector operands are of the right form.
  // N0 and N1 are BUILD_VECTOR nodes with N number of EXTRACT_VECTOR
  // operands, where N is the size of the formed vector.
  // Each EXTRACT_VECTOR should have the same input vector and an odd or even
  // index such that we have a pairwise add pattern.

  // Grab the vector that all EXTRACT_VECTOR nodes should be referencing.
  if (N0->getOperand(0)->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
    return SDValue();
  SDValue Vec = N0->getOperand(0)->getOperand(0);
  SDNode *V = Vec.getNode();
  unsigned nextIndex = 0;

  // For each operand of the ADD which is a BUILD_VECTOR,
  // check to see if each of its operands is an EXTRACT_VECTOR with
  // the same vector and appropriate index.
  for (unsigned i = 0, e = N0->getNumOperands(); i != e; ++i) {
    if (N0->getOperand(i)->getOpcode() == ISD::EXTRACT_VECTOR_ELT
        && N1->getOperand(i)->getOpcode() == ISD::EXTRACT_VECTOR_ELT) {

      SDValue ExtVec0 = N0->getOperand(i);
      SDValue ExtVec1 = N1->getOperand(i);

      // First operand is the vector, verify it's the same.
      if (V != ExtVec0->getOperand(0).getNode() ||
          V != ExtVec1->getOperand(0).getNode())
        return SDValue();

      // Second is the constant, verify it's correct.
      ConstantSDNode *C0 = dyn_cast<ConstantSDNode>(ExtVec0->getOperand(1));
      ConstantSDNode *C1 = dyn_cast<ConstantSDNode>(ExtVec1->getOperand(1));

      // For the constants, we want to see all the even or all the odd indices.
      if (!C0 || !C1 || C0->getZExtValue() != nextIndex
          || C1->getZExtValue() != nextIndex+1)
        return SDValue();

      // Increment index.
      nextIndex+=2;
    } else
      return SDValue();
  }

  // Don't generate vpaddl+vmovn; we'll match it to vpadd later. Also make sure
  // we're using the entire input vector, otherwise there's a size/legality
  // mismatch somewhere.
  if (nextIndex != Vec.getValueType().getVectorNumElements() ||
      Vec.getValueType().getVectorElementType() == VT.getVectorElementType())
    return SDValue();

  // Create VPADDL node.
  SelectionDAG &DAG = DCI.DAG;
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  SDLoc dl(N);

  // Build operand list.
  SmallVector<SDValue, 8> Ops;
  Ops.push_back(DAG.getConstant(Intrinsic::arm_neon_vpaddls, dl,
                                TLI.getPointerTy(DAG.getDataLayout())));

  // Input is the vector.
  Ops.push_back(Vec);

  // Get widened type and narrowed type.
  MVT widenType;
  unsigned numElem = VT.getVectorNumElements();
  EVT inputLaneType = Vec.getValueType().getVectorElementType();
  switch (inputLaneType.getSimpleVT().SimpleTy) {
    case MVT::i8: widenType = MVT::getVectorVT(MVT::i16, numElem); break;
    case MVT::i16: widenType = MVT::getVectorVT(MVT::i32, numElem); break;
    case MVT::i32: widenType = MVT::getVectorVT(MVT::i64, numElem); break;
    default:
      llvm_unreachable("Invalid vector element type for padd optimization.");
  }

  SDValue tmp = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, widenType, Ops);
  unsigned ExtOp = VT.bitsGT(tmp.getValueType()) ? ISD::ANY_EXTEND
                                                 : ISD::TRUNCATE;
  return DAG.getNode(ExtOp, dl, VT, tmp);
}
static SDValue findMUL_LOHI(SDValue V) {
  if (V->getOpcode() == ISD::UMUL_LOHI ||
      V->getOpcode() == ISD::SMUL_LOHI)
    return V;
  return SDValue();
}
static SDValue AddCombineTo64BitSMLAL16(SDNode *AddcNode, SDNode *AddeNode,
                                        TargetLowering::DAGCombinerInfo &DCI,
                                        const ARMSubtarget *Subtarget) {
  if (!Subtarget->hasBaseDSP())
    return SDValue();

  // SMLALBB, SMLALBT, SMLALTB, SMLALTT multiply two 16-bit values and
  // accumulate the product into a 64-bit value. The 16-bit values will
  // be sign extended somehow or SRA'd into 32-bit values:
  //   (addc (adde (mul 16bit, 16bit), lo), hi)
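  //
  // For example (roughly, as an illustration only), C source such as
  //   int64_t acc; int16_t a, b;  acc += a * b;
  // tends to legalize into this addc/adde-of-mul shape and can then be
  // selected as a single SMLALBB.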
  SDValue Mul = AddcNode->getOperand(0);
  SDValue Lo = AddcNode->getOperand(1);
  if (Mul.getOpcode() != ISD::MUL) {
    Lo = AddcNode->getOperand(0);
    Mul = AddcNode->getOperand(1);
    if (Mul.getOpcode() != ISD::MUL)
      return SDValue();
  }

  SDValue SRA = AddeNode->getOperand(0);
  SDValue Hi = AddeNode->getOperand(1);
  if (SRA.getOpcode() != ISD::SRA) {
    SRA = AddeNode->getOperand(1);
    Hi = AddeNode->getOperand(0);
    if (SRA.getOpcode() != ISD::SRA)
      return SDValue();
  }
  if (auto Const = dyn_cast<ConstantSDNode>(SRA.getOperand(1))) {
    if (Const->getZExtValue() != 31)
      return SDValue();
  } else
    return SDValue();

  if (SRA.getOperand(0) != Mul)
    return SDValue();

  SelectionDAG &DAG = DCI.DAG;
  SDLoc dl(AddcNode);
  unsigned Opcode = 0;
  SDValue Op0;
  SDValue Op1;

  if (isS16(Mul.getOperand(0), DAG) && isS16(Mul.getOperand(1), DAG)) {
    Opcode = ARMISD::SMLALBB;
    Op0 = Mul.getOperand(0);
    Op1 = Mul.getOperand(1);
  } else if (isS16(Mul.getOperand(0), DAG) && isSRA16(Mul.getOperand(1))) {
    Opcode = ARMISD::SMLALBT;
    Op0 = Mul.getOperand(0);
    Op1 = Mul.getOperand(1).getOperand(0);
  } else if (isSRA16(Mul.getOperand(0)) && isS16(Mul.getOperand(1), DAG)) {
    Opcode = ARMISD::SMLALTB;
    Op0 = Mul.getOperand(0).getOperand(0);
    Op1 = Mul.getOperand(1);
  } else if (isSRA16(Mul.getOperand(0)) && isSRA16(Mul.getOperand(1))) {
    Opcode = ARMISD::SMLALTT;
    Op0 = Mul->getOperand(0).getOperand(0);
    Op1 = Mul->getOperand(1).getOperand(0);
  }

  if (!Op0 || !Op1)
    return SDValue();

  SDValue SMLAL = DAG.getNode(Opcode, dl, DAG.getVTList(MVT::i32, MVT::i32),
                              Op0, Op1, Lo, Hi);
  // Replace the ADD nodes' uses with the SMLAL node's values.
  SDValue HiMLALResult(SMLAL.getNode(), 1);
  SDValue LoMLALResult(SMLAL.getNode(), 0);

  DAG.ReplaceAllUsesOfValueWith(SDValue(AddcNode, 0), LoMLALResult);
  DAG.ReplaceAllUsesOfValueWith(SDValue(AddeNode, 0), HiMLALResult);

  // Return original node to notify the driver to stop replacing.
  SDValue resNode(AddcNode, 0);
  return resNode;
}
static SDValue AddCombineTo64bitMLAL(SDNode *AddeSubeNode,
                                     TargetLowering::DAGCombinerInfo &DCI,
                                     const ARMSubtarget *Subtarget) {
  // Look for multiply add opportunities.
  // The pattern is an ISD::UMUL_LOHI followed by two add nodes, where
  // each add node consumes a value from ISD::UMUL_LOHI and there is
  // a glue link from the first add to the second add.
  // If we find this pattern, we can replace the U/SMUL_LOHI, ADDC, and ADDE by
  // a S/UMLAL instruction.
  //
  //            UMUL_LOHI
  //           / :lo    \ :hi
  //          V          \          [no multiline comment]
  //  loAdd -> ADDC      |
  //              \ :carry /
  //               V      V
  //                 ADDE <- hiAdd
  //
  // In the special case where only the higher part of a signed result is used
  // and the add to the low part of the result of ISD::UMUL_LOHI adds or
  // subtracts a constant with the exact value of 0x80000000, we recognize we
  // are dealing with a "rounded multiply and add" (or subtract) and transform
  // it into either an ARMISD::SMMLAR or ARMISD::SMMLSR respectively.
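  //
  // For example (illustrative only), an accumulation such as
  //   acc64 += (uint64_t)a * (uint64_t)b
  // with 32-bit a and b legalizes to UMUL_LOHI feeding an ADDC/ADDE pair and
  // is folded here into a single UMLAL.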
  assert((AddeSubeNode->getOpcode() == ARMISD::ADDE ||
          AddeSubeNode->getOpcode() == ARMISD::SUBE) &&
         "Expect an ADDE or SUBE");

  assert(AddeSubeNode->getNumOperands() == 3 &&
         AddeSubeNode->getOperand(2).getValueType() == MVT::i32 &&
         "ADDE node has the wrong inputs");

  // Check that we are chained to the right ADDC or SUBC node.
  SDNode *AddcSubcNode = AddeSubeNode->getOperand(2).getNode();
  if ((AddeSubeNode->getOpcode() == ARMISD::ADDE &&
       AddcSubcNode->getOpcode() != ARMISD::ADDC) ||
      (AddeSubeNode->getOpcode() == ARMISD::SUBE &&
       AddcSubcNode->getOpcode() != ARMISD::SUBC))
    return SDValue();

  SDValue AddcSubcOp0 = AddcSubcNode->getOperand(0);
  SDValue AddcSubcOp1 = AddcSubcNode->getOperand(1);

  // Check if the two operands are from the same mul_lohi node.
  if (AddcSubcOp0.getNode() == AddcSubcOp1.getNode())
    return SDValue();

  assert(AddcSubcNode->getNumValues() == 2 &&
         AddcSubcNode->getValueType(0) == MVT::i32 &&
         "Expect ADDC with two result values. First: i32");

  // Check that the ADDC adds the low result of the S/UMUL_LOHI. If not, it
  // may be an SMLAL which multiplies two 16-bit values.
  if (AddeSubeNode->getOpcode() == ARMISD::ADDE &&
      AddcSubcOp0->getOpcode() != ISD::UMUL_LOHI &&
      AddcSubcOp0->getOpcode() != ISD::SMUL_LOHI &&
      AddcSubcOp1->getOpcode() != ISD::UMUL_LOHI &&
      AddcSubcOp1->getOpcode() != ISD::SMUL_LOHI)
    return AddCombineTo64BitSMLAL16(AddcSubcNode, AddeSubeNode, DCI, Subtarget);

  // Check for the triangle shape.
  SDValue AddeSubeOp0 = AddeSubeNode->getOperand(0);
  SDValue AddeSubeOp1 = AddeSubeNode->getOperand(1);

  // Make sure that the ADDE/SUBE operands are not coming from the same node.
  if (AddeSubeOp0.getNode() == AddeSubeOp1.getNode())
    return SDValue();

  // Find the MUL_LOHI node walking up ADDE/SUBE's operands.
  bool IsLeftOperandMUL = false;
  SDValue MULOp = findMUL_LOHI(AddeSubeOp0);
  if (MULOp == SDValue())
    MULOp = findMUL_LOHI(AddeSubeOp1);
  else
    IsLeftOperandMUL = true;
  if (MULOp == SDValue())
    return SDValue();

  // Figure out the right opcode.
  unsigned Opc = MULOp->getOpcode();
  unsigned FinalOpc = (Opc == ISD::SMUL_LOHI) ? ARMISD::SMLAL : ARMISD::UMLAL;

  // Figure out the high and low input values to the MLAL node.
  SDValue *HiAddSub = nullptr;
  SDValue *LoMul = nullptr;
  SDValue *LowAddSub = nullptr;

  // Ensure that ADDE/SUBE is from high result of ISD::xMUL_LOHI.
  if ((AddeSubeOp0 != MULOp.getValue(1)) && (AddeSubeOp1 != MULOp.getValue(1)))
    return SDValue();

  if (IsLeftOperandMUL)
    HiAddSub = &AddeSubeOp1;
  else
    HiAddSub = &AddeSubeOp0;

  // Ensure that LoMul and LowAddSub are taken from the correct ISD::SMUL_LOHI
  // node whose low result is fed to the ADDC/SUBC we are checking.
  if (AddcSubcOp0 == MULOp.getValue(0)) {
    LoMul = &AddcSubcOp0;
    LowAddSub = &AddcSubcOp1;
  }
  if (AddcSubcOp1 == MULOp.getValue(0)) {
    LoMul = &AddcSubcOp1;
    LowAddSub = &AddcSubcOp0;
  }

  if (!LoMul)
    return SDValue();

  // If HiAddSub is the same node as ADDC/SUBC or is a predecessor of ADDC/SUBC
  // the replacement below will create a cycle.
  if (AddcSubcNode == HiAddSub->getNode() ||
      AddcSubcNode->isPredecessorOf(HiAddSub->getNode()))
    return SDValue();

  // Create the merged node.
  SelectionDAG &DAG = DCI.DAG;

  // Start building operand list.
  SmallVector<SDValue, 8> Ops;
  Ops.push_back(LoMul->getOperand(0));
  Ops.push_back(LoMul->getOperand(1));

  // Check whether we can use SMMLAR, SMMLSR or SMMULR instead. For this to be
  // the case, we must be doing signed multiplication and only use the higher
  // part of the result of the MLAL; furthermore the LowAddSub must be a
  // constant addition or subtraction with the value of 0x80000000.
  if (Subtarget->hasV6Ops() && Subtarget->hasDSP() && Subtarget->useMulOps() &&
      FinalOpc == ARMISD::SMLAL && !AddeSubeNode->hasAnyUseOfValue(1) &&
      LowAddSub->getNode()->getOpcode() == ISD::Constant &&
      static_cast<ConstantSDNode *>(LowAddSub->getNode())->getZExtValue() ==
          0x80000000) {
    Ops.push_back(*HiAddSub);
    if (AddcSubcNode->getOpcode() == ARMISD::SUBC) {
      FinalOpc = ARMISD::SMMLSR;
    } else {
      FinalOpc = ARMISD::SMMLAR;
    }
    SDValue NewNode = DAG.getNode(FinalOpc, SDLoc(AddcSubcNode), MVT::i32, Ops);
    DAG.ReplaceAllUsesOfValueWith(SDValue(AddeSubeNode, 0), NewNode);

    return SDValue(AddeSubeNode, 0);
  } else if (AddcSubcNode->getOpcode() == ARMISD::SUBC)
    // SMMLS is generated during instruction selection and the rest of this
    // function cannot handle the case where AddcSubcNode is a SUBC.
    return SDValue();

  // Finish building the operand list for {U/S}MLAL.
  Ops.push_back(*LowAddSub);
  Ops.push_back(*HiAddSub);

  SDValue MLALNode = DAG.getNode(FinalOpc, SDLoc(AddcSubcNode),
                                 DAG.getVTList(MVT::i32, MVT::i32), Ops);

  // Replace the ADD nodes' uses with the MLAL node's values.
  SDValue HiMLALResult(MLALNode.getNode(), 1);
  DAG.ReplaceAllUsesOfValueWith(SDValue(AddeSubeNode, 0), HiMLALResult);

  SDValue LoMLALResult(MLALNode.getNode(), 0);
  DAG.ReplaceAllUsesOfValueWith(SDValue(AddcSubcNode, 0), LoMLALResult);

  // Return original node to notify the driver to stop replacing.
  return SDValue(AddeSubeNode, 0);
}
static SDValue AddCombineTo64bitUMAAL(SDNode *AddeNode,
                                      TargetLowering::DAGCombinerInfo &DCI,
                                      const ARMSubtarget *Subtarget) {
  // UMAAL is similar to UMLAL except that it adds two unsigned values.
  // While trying to combine for the other MLAL nodes, first search for the
  // chance to use UMAAL. Check if Addc uses a node which has already
  // been combined into a UMLAL. The other pattern is UMLAL using Addc/Adde
  // as the addend, and it's handled in PerformUMLALCombine.
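  //
  // For reference (illustrative only): UMAAL computes the 64-bit value
  //   Rn * Rm + RdLo + RdHi,
  // so something like  acc64 = (uint64_t)a * b + x + y  can become a single
  // UMAAL rather than a UMLAL plus extra additions.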
  if (!Subtarget->hasV6Ops() || !Subtarget->hasDSP())
    return AddCombineTo64bitMLAL(AddeNode, DCI, Subtarget);

  // Check that we have a glued ADDC node.
  SDNode *AddcNode = AddeNode->getOperand(2).getNode();
  if (AddcNode->getOpcode() != ARMISD::ADDC)
    return SDValue();

  // Find the converted UMAAL or quit if it doesn't exist.
  SDNode *UmlalNode = nullptr;
  SDValue AddHi;
  if (AddcNode->getOperand(0).getOpcode() == ARMISD::UMLAL) {
    UmlalNode = AddcNode->getOperand(0).getNode();
    AddHi = AddcNode->getOperand(1);
  } else if (AddcNode->getOperand(1).getOpcode() == ARMISD::UMLAL) {
    UmlalNode = AddcNode->getOperand(1).getNode();
    AddHi = AddcNode->getOperand(0);
  } else {
    return AddCombineTo64bitMLAL(AddeNode, DCI, Subtarget);
  }

  // The ADDC should be glued to an ADDE node, which uses the same UMLAL as
  // the ADDC as well as Zero.
  if (!isNullConstant(UmlalNode->getOperand(3)))
    return SDValue();

  if ((isNullConstant(AddeNode->getOperand(0)) &&
       AddeNode->getOperand(1).getNode() == UmlalNode) ||
      (AddeNode->getOperand(0).getNode() == UmlalNode &&
       isNullConstant(AddeNode->getOperand(1)))) {

    SelectionDAG &DAG = DCI.DAG;
    SDValue Ops[] = { UmlalNode->getOperand(0), UmlalNode->getOperand(1),
                      UmlalNode->getOperand(2), AddHi };
    SDValue UMAAL = DAG.getNode(ARMISD::UMAAL, SDLoc(AddcNode),
                                DAG.getVTList(MVT::i32, MVT::i32), Ops);

    // Replace the ADD nodes' uses with the UMAAL node's values.
    DAG.ReplaceAllUsesOfValueWith(SDValue(AddeNode, 0),
                                  SDValue(UMAAL.getNode(), 1));
    DAG.ReplaceAllUsesOfValueWith(SDValue(AddcNode, 0),
                                  SDValue(UMAAL.getNode(), 0));

    // Return original node to notify the driver to stop replacing.
    return SDValue(AddeNode, 0);
  }
  return SDValue();
}
static SDValue PerformUMLALCombine(SDNode *N, SelectionDAG &DAG,
                                   const ARMSubtarget *Subtarget) {
  if (!Subtarget->hasV6Ops() || !Subtarget->hasDSP())
    return SDValue();

  // Check that we have a pair of ADDC and ADDE as operands.
  // Both addends of the ADDE must be zero.
  SDNode *AddcNode = N->getOperand(2).getNode();
  SDNode *AddeNode = N->getOperand(3).getNode();
  if ((AddcNode->getOpcode() == ARMISD::ADDC) &&
      (AddeNode->getOpcode() == ARMISD::ADDE) &&
      isNullConstant(AddeNode->getOperand(0)) &&
      isNullConstant(AddeNode->getOperand(1)) &&
      (AddeNode->getOperand(2).getNode() == AddcNode))
    return DAG.getNode(ARMISD::UMAAL, SDLoc(N),
                       DAG.getVTList(MVT::i32, MVT::i32),
                       {N->getOperand(0), N->getOperand(1),
                        AddcNode->getOperand(0), AddcNode->getOperand(1)});
  else
    return SDValue();
}
static SDValue PerformAddcSubcCombine(SDNode *N,
                                      TargetLowering::DAGCombinerInfo &DCI,
                                      const ARMSubtarget *Subtarget) {
  SelectionDAG &DAG(DCI.DAG);

  if (N->getOpcode() == ARMISD::SUBC && N->hasAnyUseOfValue(1)) {
    // (SUBC (ADDE 0, 0, C), 1) -> C
    SDValue LHS = N->getOperand(0);
    SDValue RHS = N->getOperand(1);
    if (LHS->getOpcode() == ARMISD::ADDE &&
        isNullConstant(LHS->getOperand(0)) &&
        isNullConstant(LHS->getOperand(1)) && isOneConstant(RHS)) {
      return DCI.CombineTo(N, SDValue(N, 0), LHS->getOperand(2));
    }
  }

  if (Subtarget->isThumb1Only()) {
    SDValue RHS = N->getOperand(1);
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS)) {
      int32_t imm = C->getSExtValue();
      if (imm < 0 && imm > std::numeric_limits<int>::min()) {
        SDLoc DL(N);
        RHS = DAG.getConstant(-imm, DL, MVT::i32);
        unsigned Opcode = (N->getOpcode() == ARMISD::ADDC) ? ARMISD::SUBC
                                                           : ARMISD::ADDC;
        return DAG.getNode(Opcode, DL, N->getVTList(), N->getOperand(0), RHS);
      }
    }
  }

  return SDValue();
}
static SDValue PerformAddeSubeCombine(SDNode *N,
                                      TargetLowering::DAGCombinerInfo &DCI,
                                      const ARMSubtarget *Subtarget) {
  if (Subtarget->isThumb1Only()) {
    SelectionDAG &DAG = DCI.DAG;
    SDValue RHS = N->getOperand(1);
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS)) {
      int64_t imm = C->getSExtValue();
      if (imm < 0) {
        SDLoc DL(N);

        // The with-carry-in form matches bitwise not instead of the negation.
        // Effectively, the inverse interpretation of the carry flag already
        // accounts for part of the negation.
        RHS = DAG.getConstant(~imm, DL, MVT::i32);

        unsigned Opcode = (N->getOpcode() == ARMISD::ADDE) ? ARMISD::SUBE
                                                           : ARMISD::ADDE;
        return DAG.getNode(Opcode, DL, N->getVTList(),
                           N->getOperand(0), RHS, N->getOperand(2));
      }
    }
  } else if (N->getOperand(1)->getOpcode() == ISD::SMUL_LOHI) {
    return AddCombineTo64bitMLAL(N, DCI, Subtarget);
  }
  return SDValue();
}
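
// Try to turn a scalar min/max selection of a vector reduction into one of
// the MVE VMINV/VMAXV family. Roughly (as an illustration, not the exact
// matcher):
//   select(setult(x, vecreduce_umin(v)), x, vecreduce_umin(v))
// i.e. min(x, vecreduce_umin(v)), becomes VMINVu(x, v), accumulating the
// reduction into the scalar operand.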
static SDValue PerformSELECTCombine(SDNode *N,
                                    TargetLowering::DAGCombinerInfo &DCI,
                                    const ARMSubtarget *Subtarget) {
  if (!Subtarget->hasMVEIntegerOps())
    return SDValue();

  SDLoc dl(N);
  SDValue SetCC;
  SDValue LHS;
  SDValue RHS;
  ISD::CondCode CC;
  SDValue TrueVal;
  SDValue FalseVal;

  if (N->getOpcode() == ISD::SELECT &&
      N->getOperand(0)->getOpcode() == ISD::SETCC) {
    SetCC = N->getOperand(0);
    LHS = SetCC->getOperand(0);
    RHS = SetCC->getOperand(1);
    CC = cast<CondCodeSDNode>(SetCC->getOperand(2))->get();
    TrueVal = N->getOperand(1);
    FalseVal = N->getOperand(2);
  } else if (N->getOpcode() == ISD::SELECT_CC) {
    LHS = N->getOperand(0);
    RHS = N->getOperand(1);
    CC = cast<CondCodeSDNode>(N->getOperand(4))->get();
    TrueVal = N->getOperand(2);
    FalseVal = N->getOperand(3);
  } else {
    return SDValue();
  }

  unsigned int Opcode = 0;
  if ((TrueVal->getOpcode() == ISD::VECREDUCE_UMIN ||
       FalseVal->getOpcode() == ISD::VECREDUCE_UMIN) &&
      (CC == ISD::SETULT || CC == ISD::SETUGT)) {
    Opcode = ARMISD::VMINVu;
    if (CC == ISD::SETUGT)
      std::swap(TrueVal, FalseVal);
  } else if ((TrueVal->getOpcode() == ISD::VECREDUCE_SMIN ||
              FalseVal->getOpcode() == ISD::VECREDUCE_SMIN) &&
             (CC == ISD::SETLT || CC == ISD::SETGT)) {
    Opcode = ARMISD::VMINVs;
    if (CC == ISD::SETGT)
      std::swap(TrueVal, FalseVal);
  } else if ((TrueVal->getOpcode() == ISD::VECREDUCE_UMAX ||
              FalseVal->getOpcode() == ISD::VECREDUCE_UMAX) &&
             (CC == ISD::SETUGT || CC == ISD::SETULT)) {
    Opcode = ARMISD::VMAXVu;
    if (CC == ISD::SETULT)
      std::swap(TrueVal, FalseVal);
  } else if ((TrueVal->getOpcode() == ISD::VECREDUCE_SMAX ||
              FalseVal->getOpcode() == ISD::VECREDUCE_SMAX) &&
             (CC == ISD::SETGT || CC == ISD::SETLT)) {
    Opcode = ARMISD::VMAXVs;
    if (CC == ISD::SETLT)
      std::swap(TrueVal, FalseVal);
  } else
    return SDValue();

  // Normalise to the right hand side being the vector reduction.
  switch (TrueVal->getOpcode()) {
  case ISD::VECREDUCE_UMIN:
  case ISD::VECREDUCE_SMIN:
  case ISD::VECREDUCE_UMAX:
  case ISD::VECREDUCE_SMAX:
    std::swap(LHS, RHS);
    std::swap(TrueVal, FalseVal);
    break;
  }

  EVT VectorType = FalseVal->getOperand(0).getValueType();

  if (VectorType != MVT::v16i8 && VectorType != MVT::v8i16 &&
      VectorType != MVT::v4i32)
    return SDValue();

  EVT VectorScalarType = VectorType.getVectorElementType();

  // The values being selected must also be the ones being compared.
  if (TrueVal != LHS || FalseVal != RHS)
    return SDValue();

  EVT LeftType = LHS->getValueType(0);
  EVT RightType = RHS->getValueType(0);

  // The types must match the reduced type too.
  if (LeftType != VectorScalarType || RightType != VectorScalarType)
    return SDValue();

  // Legalise the scalar to an i32.
  if (VectorScalarType != MVT::i32)
    LHS = DCI.DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, LHS);

  // Generate the reduction as an i32 for legalisation purposes.
  auto Reduction =
      DCI.DAG.getNode(Opcode, dl, MVT::i32, LHS, RHS->getOperand(0));

  // The result isn't actually an i32 so truncate it back to its original type.
  if (VectorScalarType != MVT::i32)
    Reduction = DCI.DAG.getNode(ISD::TRUNCATE, dl, VectorScalarType, Reduction);

  return Reduction;
}
// A special combine for the vqdmulh family of instructions. This is one of
// the potential set of patterns that could match this instruction. The base
// pattern you would expect to see is
//   min(max(ashr(mul(mul(sext(x), 2), sext(y)), 16)))
// This matches the different
//   min(max(ashr(mul(mul(sext(x), sext(y)), 2), 16)))
// which llvm will have optimized to min(ashr(mul(sext(x), sext(y)), 15)) as
// the max is unnecessary.
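// (For reference, and only as a rough illustration: vqdmulh(a, b) returns the
// high half of the doubled product, i.e. (2 * a * b) >> lane-size, saturating
// in the single overflow case where both inputs are the minimum lane value.)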
static SDValue PerformVQDMULHCombine(SDNode *N, SelectionDAG &DAG) {
  EVT VT = N->getValueType(0);
  SDValue Shft;
  ConstantSDNode *Clamp;

  if (!VT.isVector() || VT.getScalarSizeInBits() > 64)
    return SDValue();

  if (N->getOpcode() == ISD::SMIN) {
    Shft = N->getOperand(0);
    Clamp = isConstOrConstSplat(N->getOperand(1));
  } else if (N->getOpcode() == ISD::VSELECT) {
    // Detect a SMIN, which for an i64 node will be a vselect/setcc, not a smin.
    SDValue Cmp = N->getOperand(0);
    if (Cmp.getOpcode() != ISD::SETCC ||
        cast<CondCodeSDNode>(Cmp.getOperand(2))->get() != ISD::SETLT ||
        Cmp.getOperand(0) != N->getOperand(1) ||
        Cmp.getOperand(1) != N->getOperand(2))
      return SDValue();
    Shft = N->getOperand(1);
    Clamp = isConstOrConstSplat(N->getOperand(2));
  } else
    return SDValue();

  if (!Clamp)
    return SDValue();

  MVT ScalarType;
  int ShftAmt = 0;
  switch (Clamp->getSExtValue()) {
  case (1 << 7) - 1:
    ScalarType = MVT::i8;
    ShftAmt = 7;
    break;
  case (1 << 15) - 1:
    ScalarType = MVT::i16;
    ShftAmt = 15;
    break;
  case (1ULL << 31) - 1:
    ScalarType = MVT::i32;
    ShftAmt = 31;
    break;
  default:
    return SDValue();
  }

  if (Shft.getOpcode() != ISD::SRA)
    return SDValue();
  ConstantSDNode *N1 = isConstOrConstSplat(Shft.getOperand(1));
  if (!N1 || N1->getSExtValue() != ShftAmt)
    return SDValue();

  SDValue Mul = Shft.getOperand(0);
  if (Mul.getOpcode() != ISD::MUL)
    return SDValue();

  SDValue Ext0 = Mul.getOperand(0);
  SDValue Ext1 = Mul.getOperand(1);
  if (Ext0.getOpcode() != ISD::SIGN_EXTEND ||
      Ext1.getOpcode() != ISD::SIGN_EXTEND)
    return SDValue();
  EVT VecVT = Ext0.getOperand(0).getValueType();
  if (!VecVT.isPow2VectorType() || VecVT.getVectorNumElements() == 1)
    return SDValue();
  if (Ext1.getOperand(0).getValueType() != VecVT ||
      VecVT.getScalarType() != ScalarType ||
      VT.getScalarSizeInBits() < ScalarType.getScalarSizeInBits() * 2)
    return SDValue();

  SDLoc DL(Mul);
  unsigned LegalLanes = 128 / (ShftAmt + 1);
  EVT LegalVecVT = MVT::getVectorVT(ScalarType, LegalLanes);
  // For types smaller than legal vectors extend to be legal and only use
  // needed lanes.
  if (VecVT.getSizeInBits() < 128) {
    EVT ExtVecVT =
        MVT::getVectorVT(MVT::getIntegerVT(128 / VecVT.getVectorNumElements()),
                         VecVT.getVectorNumElements());
    SDValue Inp0 =
        DAG.getNode(ISD::ANY_EXTEND, DL, ExtVecVT, Ext0.getOperand(0));
    SDValue Inp1 =
        DAG.getNode(ISD::ANY_EXTEND, DL, ExtVecVT, Ext1.getOperand(0));
    Inp0 = DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, LegalVecVT, Inp0);
    Inp1 = DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, LegalVecVT, Inp1);
    SDValue VQDMULH = DAG.getNode(ARMISD::VQDMULH, DL, LegalVecVT, Inp0, Inp1);
    SDValue Trunc = DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, ExtVecVT, VQDMULH);
    Trunc = DAG.getNode(ISD::TRUNCATE, DL, VecVT, Trunc);
    return DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Trunc);
  }

  // For larger types, split into legal sized chunks.
  assert(VecVT.getSizeInBits() % 128 == 0 && "Expected a power2 type");
  unsigned NumParts = VecVT.getSizeInBits() / 128;
  SmallVector<SDValue> Parts;
  for (unsigned I = 0; I < NumParts; ++I) {
    SDValue Inp0 =
        DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, LegalVecVT, Ext0.getOperand(0),
                    DAG.getVectorIdxConstant(I * LegalLanes, DL));
    SDValue Inp1 =
        DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, LegalVecVT, Ext1.getOperand(0),
                    DAG.getVectorIdxConstant(I * LegalLanes, DL));
    SDValue VQDMULH = DAG.getNode(ARMISD::VQDMULH, DL, LegalVecVT, Inp0, Inp1);
    Parts.push_back(VQDMULH);
  }
  return DAG.getNode(ISD::SIGN_EXTEND, DL, VT,
                     DAG.getNode(ISD::CONCAT_VECTORS, DL, VecVT, Parts));
}
static SDValue PerformVSELECTCombine(SDNode *N,
                                     TargetLowering::DAGCombinerInfo &DCI,
                                     const ARMSubtarget *Subtarget) {
  if (!Subtarget->hasMVEIntegerOps())
    return SDValue();

  if (SDValue V = PerformVQDMULHCombine(N, DCI.DAG))
    return V;

  // Transforms vselect(not(cond), lhs, rhs) into vselect(cond, rhs, lhs).
  //
  // We need to re-implement this optimization here as the implementation in
  // the Target-Independent DAGCombiner does not handle the kind of constant
  // we make (it calls isConstOrConstSplat with AllowTruncation set to false -
  // and for good reason, allowing truncation there would break other targets).
  //
  // Currently, this is only done for MVE, as it's the only target that
  // benefits from this transformation (e.g. VPNOT+VPSEL becomes a single
  // VPSEL).
  if (N->getOperand(0).getOpcode() != ISD::XOR)
    return SDValue();
  SDValue XOR = N->getOperand(0);

  // Check if the XOR's RHS is either a 1, or a BUILD_VECTOR of 1s.
  // It is important to check with truncation allowed as the BUILD_VECTORs we
  // generate in those situations will truncate their operands.
  ConstantSDNode *Const =
      isConstOrConstSplat(XOR->getOperand(1), /*AllowUndefs*/ false,
                          /*AllowTruncation*/ true);
  if (!Const || !Const->isOne())
    return SDValue();

  // Rewrite into vselect(cond, rhs, lhs).
  SDValue Cond = XOR->getOperand(0);
  SDValue LHS = N->getOperand(1);
  SDValue RHS = N->getOperand(2);
  EVT Type = N->getValueType(0);
  return DCI.DAG.getNode(ISD::VSELECT, SDLoc(N), Type, Cond, RHS, LHS);
}
// Convert vsetcc([0,1,2,..], splat(n), ult) -> vctp n
static SDValue PerformVSetCCToVCTPCombine(SDNode *N,
                                          TargetLowering::DAGCombinerInfo &DCI,
                                          const ARMSubtarget *Subtarget) {
  SDValue Op0 = N->getOperand(0);
  SDValue Op1 = N->getOperand(1);
  ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
  EVT VT = N->getValueType(0);

  if (!Subtarget->hasMVEIntegerOps() ||
      !DCI.DAG.getTargetLoweringInfo().isTypeLegal(VT))
    return SDValue();

  if (CC == ISD::SETUGE) {
    std::swap(Op0, Op1);
    CC = ISD::SETULT;
  }

  if (CC != ISD::SETULT || VT.getScalarSizeInBits() != 1 ||
      Op0.getOpcode() != ISD::BUILD_VECTOR)
    return SDValue();

  // Check first operand is BuildVector of 0,1,2,...
  for (unsigned I = 0; I < VT.getVectorNumElements(); I++) {
    if (!Op0.getOperand(I).isUndef() &&
        !(isa<ConstantSDNode>(Op0.getOperand(I)) &&
          Op0.getConstantOperandVal(I) == I))
      return SDValue();
  }

  // The second is a Splat of Op1S.
  SDValue Op1S = DCI.DAG.getSplatValue(Op1);
  if (!Op1S)
    return SDValue();

  unsigned Opc;
  switch (VT.getVectorNumElements()) {
  case 2:
    Opc = Intrinsic::arm_mve_vctp64;
    break;
  case 4:
    Opc = Intrinsic::arm_mve_vctp32;
    break;
  case 8:
    Opc = Intrinsic::arm_mve_vctp16;
    break;
  case 16:
    Opc = Intrinsic::arm_mve_vctp8;
    break;
  default:
    return SDValue();
  }

  SDLoc DL(N);
  return DCI.DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
                         DCI.DAG.getConstant(Opc, DL, MVT::i32),
                         DCI.DAG.getZExtOrTrunc(Op1S, DL, MVT::i32));
}
static SDValue PerformABSCombine(SDNode *N,
                                 TargetLowering::DAGCombinerInfo &DCI,
                                 const ARMSubtarget *Subtarget) {
  SelectionDAG &DAG = DCI.DAG;
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();

  if (TLI.isOperationLegal(N->getOpcode(), N->getValueType(0)))
    return SDValue();

  return TLI.expandABS(N, DAG);
}
/// PerformADDECombine - Target-specific dag combine transform from
/// ARMISD::ADDC, ARMISD::ADDE, and ISD::MUL_LOHI to MLAL or
/// ARMISD::ADDC, ARMISD::ADDE and ARMISD::UMLAL to ARMISD::UMAAL
static SDValue PerformADDECombine(SDNode *N,
                                  TargetLowering::DAGCombinerInfo &DCI,
                                  const ARMSubtarget *Subtarget) {
  // Only ARM and Thumb2 support UMLAL/SMLAL.
  if (Subtarget->isThumb1Only())
    return PerformAddeSubeCombine(N, DCI, Subtarget);

  // Only perform the checks after legalize when the pattern is available.
  if (DCI.isBeforeLegalize()) return SDValue();

  return AddCombineTo64bitUMAAL(N, DCI, Subtarget);
}
/// PerformADDCombineWithOperands - Try DAG combinations for an ADD with
/// operands N0 and N1. This is a helper for PerformADDCombine that is
/// called with the default operands, and if that fails, with commuted
/// operands.
static SDValue PerformADDCombineWithOperands(SDNode *N, SDValue N0, SDValue N1,
                                             TargetLowering::DAGCombinerInfo &DCI,
                                             const ARMSubtarget *Subtarget) {
  // Attempt to create vpadd for this add.
  if (SDValue Result = AddCombineToVPADD(N, N0, N1, DCI, Subtarget))
    return Result;

  // Attempt to create vpaddl for this add.
  if (SDValue Result = AddCombineVUZPToVPADDL(N, N0, N1, DCI, Subtarget))
    return Result;
  if (SDValue Result = AddCombineBUILD_VECTORToVPADDL(N, N0, N1, DCI,
                                                      Subtarget))
    return Result;

  // fold (add (select cc, 0, c), x) -> (select cc, x, (add x, c))
  if (N0.getNode()->hasOneUse())
    if (SDValue Result = combineSelectAndUse(N, N0, N1, DCI))
      return Result;
  return SDValue();
}
static SDValue TryDistrubutionADDVecReduce(SDNode *N, SelectionDAG &DAG) {
  EVT VT = N->getValueType(0);
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  SDLoc dl(N);

  auto IsVecReduce = [](SDValue Op) {
    switch (Op.getOpcode()) {
    case ISD::VECREDUCE_ADD:
    case ARMISD::VADDVs:
    case ARMISD::VADDVu:
    case ARMISD::VMLAVs:
    case ARMISD::VMLAVu:
      return true;
    }
    return false;
  };

  auto DistrubuteAddAddVecReduce = [&](SDValue N0, SDValue N1) {
    // Distribute add(X, add(vecreduce(Y), vecreduce(Z))) ->
    //   add(add(X, vecreduce(Y)), vecreduce(Z))
    // to make better use of vaddva style instructions.
    if (VT == MVT::i32 && N1.getOpcode() == ISD::ADD && !IsVecReduce(N0) &&
        IsVecReduce(N1.getOperand(0)) && IsVecReduce(N1.getOperand(1)) &&
        !isa<ConstantSDNode>(N0) && N1->hasOneUse()) {
      SDValue Add0 = DAG.getNode(ISD::ADD, dl, VT, N0, N1.getOperand(0));
      return DAG.getNode(ISD::ADD, dl, VT, Add0, N1.getOperand(1));
    }
    // And turn add(add(A, reduce(B)), add(C, reduce(D))) ->
    //   add(add(add(A, C), reduce(B)), reduce(D))
    if (VT == MVT::i32 && N0.getOpcode() == ISD::ADD &&
        N1.getOpcode() == ISD::ADD && N0->hasOneUse() && N1->hasOneUse()) {
      unsigned N0RedOp = 0;
      if (!IsVecReduce(N0.getOperand(N0RedOp))) {
        N0RedOp = 1;
        if (!IsVecReduce(N0.getOperand(N0RedOp)))
          return SDValue();
      }
      unsigned N1RedOp = 0;
      if (!IsVecReduce(N1.getOperand(N1RedOp)))
        N1RedOp = 1;
      if (!IsVecReduce(N1.getOperand(N1RedOp)))
        return SDValue();

      SDValue Add0 = DAG.getNode(ISD::ADD, dl, VT, N0.getOperand(1 - N0RedOp),
                                 N1.getOperand(1 - N1RedOp));
      SDValue Add1 =
          DAG.getNode(ISD::ADD, dl, VT, Add0, N0.getOperand(N0RedOp));
      return DAG.getNode(ISD::ADD, dl, VT, Add1, N1.getOperand(N1RedOp));
    }
    return SDValue();
  };
  if (SDValue R = DistrubuteAddAddVecReduce(N0, N1))
    return R;
  if (SDValue R = DistrubuteAddAddVecReduce(N1, N0))
    return R;

  // Distribute add(vecreduce(load(Y)), vecreduce(load(Z)))
  // or add(add(X, vecreduce(load(Y))), vecreduce(load(Z)))
  // by ascending load offsets. This can help cores prefetch if the order of
  // loads is more predictable.
  auto DistrubuteVecReduceLoad = [&](SDValue N0, SDValue N1, bool IsForward) {
    // Check if two reductions are known to load data where one is before/after
    // another. Return negative if N0 loads data before N1, positive if N1 is
    // before N0, and 0 if nothing is known.
    auto IsKnownOrderedLoad = [&](SDValue N0, SDValue N1) {
      // Look through to the first operand of a MUL, for the VMLA case.
      // Currently only looks at the first operand, in the hope they are equal.
      if (N0.getOpcode() == ISD::MUL)
        N0 = N0.getOperand(0);
      if (N1.getOpcode() == ISD::MUL)
        N1 = N1.getOperand(0);

      // Return true if the two operands are loads to the same object and the
      // offset of the first is known to be less than the offset of the second.
      LoadSDNode *Load0 = dyn_cast<LoadSDNode>(N0);
      LoadSDNode *Load1 = dyn_cast<LoadSDNode>(N1);
      if (!Load0 || !Load1 || Load0->getChain() != Load1->getChain() ||
          !Load0->isSimple() || !Load1->isSimple() || Load0->isIndexed() ||
          Load1->isIndexed())
        return 0;

      auto BaseLocDecomp0 = BaseIndexOffset::match(Load0, DAG);
      auto BaseLocDecomp1 = BaseIndexOffset::match(Load1, DAG);

      if (!BaseLocDecomp0.getBase() ||
          BaseLocDecomp0.getBase() != BaseLocDecomp1.getBase() ||
          !BaseLocDecomp0.hasValidOffset() || !BaseLocDecomp1.hasValidOffset())
        return 0;
      if (BaseLocDecomp0.getOffset() < BaseLocDecomp1.getOffset())
        return -1;
      if (BaseLocDecomp0.getOffset() > BaseLocDecomp1.getOffset())
        return 1;
      return 0;
    };

    SDValue X;
    if (N0.getOpcode() == ISD::ADD && N0->hasOneUse()) {
      if (IsVecReduce(N0.getOperand(0)) && IsVecReduce(N0.getOperand(1))) {
        int IsBefore = IsKnownOrderedLoad(N0.getOperand(0).getOperand(0),
                                          N0.getOperand(1).getOperand(0));
        if (IsBefore < 0) {
          X = N0.getOperand(0);
          N0 = N0.getOperand(1);
        } else if (IsBefore > 0) {
          X = N0.getOperand(1);
          N0 = N0.getOperand(0);
        } else
          return SDValue();
      } else if (IsVecReduce(N0.getOperand(0))) {
        X = N0.getOperand(1);
        N0 = N0.getOperand(0);
      } else if (IsVecReduce(N0.getOperand(1))) {
        X = N0.getOperand(0);
        N0 = N0.getOperand(1);
      } else
        return SDValue();
    } else if (IsForward && IsVecReduce(N0) && IsVecReduce(N1) &&
               IsKnownOrderedLoad(N0.getOperand(0), N1.getOperand(0)) < 0) {
      // Note this is backward to how you would expect. We create
      // add(reduce(load + 16), reduce(load + 0)) so that the
      // add(reduce(load+16), X) is combined into VADDVA(X, load+16), leaving
      // the X as VADDV(load + 0).
      return DAG.getNode(ISD::ADD, dl, VT, N1, N0);
    } else
      return SDValue();

    if (!IsVecReduce(N0) || !IsVecReduce(N1))
      return SDValue();

    if (IsKnownOrderedLoad(N1.getOperand(0), N0.getOperand(0)) >= 0)
      return SDValue();

    // Switch from add(add(X, N0), N1) to add(add(X, N1), N0)
    SDValue Add0 = DAG.getNode(ISD::ADD, dl, VT, X, N1);
    return DAG.getNode(ISD::ADD, dl, VT, Add0, N0);
  };
  if (SDValue R = DistrubuteVecReduceLoad(N0, N1, true))
    return R;
  if (SDValue R = DistrubuteVecReduceLoad(N1, N0, false))
    return R;
  return SDValue();
}
static SDValue PerformADDVecReduce(SDNode *N, SelectionDAG &DAG,
                                   const ARMSubtarget *Subtarget) {
  if (!Subtarget->hasMVEIntegerOps())
    return SDValue();

  if (SDValue R = TryDistrubutionADDVecReduce(N, DAG))
    return R;

  EVT VT = N->getValueType(0);
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  SDLoc dl(N);

  if (VT != MVT::i64)
    return SDValue();

  // We are looking for an i64 add of a VADDLVx. Due to these being i64's, this
  // will look like:
  //   t1: i32,i32 = ARMISD::VADDLVs x
  //   t2: i64 = build_pair t1, t1:1
  //   t3: i64 = add t2, y
  // Otherwise we try to push the add up above VADDLVAx, to potentially allow
  // the add to be simplified separately.
  // We also need to check for sext / zext and commutative adds.
  auto MakeVecReduce = [&](unsigned Opcode, unsigned OpcodeA, SDValue NA,
                           SDValue NB) {
    if (NB->getOpcode() != ISD::BUILD_PAIR)
      return SDValue();
    SDValue VecRed = NB->getOperand(0);
    if ((VecRed->getOpcode() != Opcode && VecRed->getOpcode() != OpcodeA) ||
        VecRed.getResNo() != 0 ||
        NB->getOperand(1) != SDValue(VecRed.getNode(), 1))
      return SDValue();

    if (VecRed->getOpcode() == OpcodeA) {
      // add(NA, VADDLVA(Inp), Y) -> VADDLVA(add(NA, Inp), Y)
      SDValue Inp = DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64,
                                VecRed.getOperand(0), VecRed.getOperand(1));
      NA = DAG.getNode(ISD::ADD, dl, MVT::i64, Inp, NA);
    }

    SmallVector<SDValue, 4> Ops;
    Ops.push_back(DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, NA,
                              DAG.getConstant(0, dl, MVT::i32)));
    Ops.push_back(DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, NA,
                              DAG.getConstant(1, dl, MVT::i32)));
    unsigned S = VecRed->getOpcode() == OpcodeA ? 2 : 0;
    for (unsigned I = S, E = VecRed.getNumOperands(); I < E; I++)
      Ops.push_back(VecRed->getOperand(I));
    SDValue Red =
        DAG.getNode(OpcodeA, dl, DAG.getVTList({MVT::i32, MVT::i32}), Ops);
    return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Red,
                       SDValue(Red.getNode(), 1));
  };

  if (SDValue M = MakeVecReduce(ARMISD::VADDLVs, ARMISD::VADDLVAs, N0, N1))
    return M;
  if (SDValue M = MakeVecReduce(ARMISD::VADDLVu, ARMISD::VADDLVAu, N0, N1))
    return M;
  if (SDValue M = MakeVecReduce(ARMISD::VADDLVs, ARMISD::VADDLVAs, N1, N0))
    return M;
  if (SDValue M = MakeVecReduce(ARMISD::VADDLVu, ARMISD::VADDLVAu, N1, N0))
    return M;
  if (SDValue M = MakeVecReduce(ARMISD::VADDLVps, ARMISD::VADDLVAps, N0, N1))
    return M;
  if (SDValue M = MakeVecReduce(ARMISD::VADDLVpu, ARMISD::VADDLVApu, N0, N1))
    return M;
  if (SDValue M = MakeVecReduce(ARMISD::VADDLVps, ARMISD::VADDLVAps, N1, N0))
    return M;
  if (SDValue M = MakeVecReduce(ARMISD::VADDLVpu, ARMISD::VADDLVApu, N1, N0))
    return M;
  if (SDValue M = MakeVecReduce(ARMISD::VMLALVs, ARMISD::VMLALVAs, N0, N1))
    return M;
  if (SDValue M = MakeVecReduce(ARMISD::VMLALVu, ARMISD::VMLALVAu, N0, N1))
    return M;
  if (SDValue M = MakeVecReduce(ARMISD::VMLALVs, ARMISD::VMLALVAs, N1, N0))
    return M;
  if (SDValue M = MakeVecReduce(ARMISD::VMLALVu, ARMISD::VMLALVAu, N1, N0))
    return M;
  if (SDValue M = MakeVecReduce(ARMISD::VMLALVps, ARMISD::VMLALVAps, N0, N1))
    return M;
  if (SDValue M = MakeVecReduce(ARMISD::VMLALVpu, ARMISD::VMLALVApu, N0, N1))
    return M;
  if (SDValue M = MakeVecReduce(ARMISD::VMLALVps, ARMISD::VMLALVAps, N1, N0))
    return M;
  if (SDValue M = MakeVecReduce(ARMISD::VMLALVpu, ARMISD::VMLALVApu, N1, N0))
    return M;
  return SDValue();
}
bool
ARMTargetLowering::isDesirableToCommuteWithShift(const SDNode *N,
                                                 CombineLevel Level) const {
  assert((N->getOpcode() == ISD::SHL || N->getOpcode() == ISD::SRA ||
          N->getOpcode() == ISD::SRL) &&
         "Expected shift op");

  if (Level == BeforeLegalizeTypes)
    return true;

  if (N->getOpcode() != ISD::SHL)
    return true;

  if (Subtarget->isThumb1Only()) {
    // Avoid making expensive immediates by commuting shifts. (This logic
    // only applies to Thumb1 because ARM and Thumb2 immediates can be shifted
    // for free.)
    if (N->getOpcode() != ISD::SHL)
      return true;
    SDValue N1 = N->getOperand(0);
    if (N1->getOpcode() != ISD::ADD && N1->getOpcode() != ISD::AND &&
        N1->getOpcode() != ISD::OR && N1->getOpcode() != ISD::XOR)
      return true;
    if (auto *Const = dyn_cast<ConstantSDNode>(N1->getOperand(1))) {
      if (Const->getAPIntValue().ult(256))
        return false;
      if (N1->getOpcode() == ISD::ADD && Const->getAPIntValue().slt(0) &&
          Const->getAPIntValue().sgt(-256))
        return false;
    }
    return true;
  }

  // Turn off commute-with-shift transform after legalization, so it doesn't
  // conflict with PerformSHLSimplify. (We could try to detect when
  // PerformSHLSimplify would trigger more precisely, but it isn't
  // really necessary.)
  return false;
}
bool ARMTargetLowering::isDesirableToCommuteXorWithShift(
    const SDNode *N) const {
  assert(N->getOpcode() == ISD::XOR &&
         (N->getOperand(0).getOpcode() == ISD::SHL ||
          N->getOperand(0).getOpcode() == ISD::SRL) &&
         "Expected XOR(SHIFT) pattern");

  // Only commute if the entire NOT mask is a hidden shifted mask.
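  // For example (illustrative only), with a 32-bit (xor (shl X, 8), C) the
  // commute is allowed when C == 0xffffff00, i.e. the mask covers exactly the
  // bits that the shift can produce.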
  auto *XorC = dyn_cast<ConstantSDNode>(N->getOperand(1));
  auto *ShiftC = dyn_cast<ConstantSDNode>(N->getOperand(0).getOperand(1));
  if (XorC && ShiftC) {
    unsigned MaskIdx, MaskLen;
    if (XorC->getAPIntValue().isShiftedMask(MaskIdx, MaskLen)) {
      unsigned ShiftAmt = ShiftC->getZExtValue();
      unsigned BitWidth = N->getValueType(0).getScalarSizeInBits();
      if (N->getOperand(0).getOpcode() == ISD::SHL)
        return MaskIdx == ShiftAmt && MaskLen == (BitWidth - ShiftAmt);
      return MaskIdx == 0 && MaskLen == (BitWidth - ShiftAmt);
    }
  }
  return false;
}
bool ARMTargetLowering::shouldFoldConstantShiftPairToMask(
    const SDNode *N, CombineLevel Level) const {
  assert(((N->getOpcode() == ISD::SHL &&
           N->getOperand(0).getOpcode() == ISD::SRL) ||
          (N->getOpcode() == ISD::SRL &&
           N->getOperand(0).getOpcode() == ISD::SHL)) &&
         "Expected shift-shift mask");

  if (!Subtarget->isThumb1Only())
    return true;

  if (Level == BeforeLegalizeTypes)
    return true;

  return false;
}

bool ARMTargetLowering::preferIncOfAddToSubOfNot(EVT VT) const {
  if (!Subtarget->hasNEON()) {
    if (Subtarget->isThumb1Only())
      return VT.getScalarSizeInBits() <= 32;
    return true;
  }
  return VT.isScalarInteger();
}
bool ARMTargetLowering::shouldConvertFpToSat(unsigned Op, EVT FPVT,
                                             EVT VT) const {
  if (!isOperationLegalOrCustom(Op, VT) || !FPVT.isSimple())
    return false;

  switch (FPVT.getSimpleVT().SimpleTy) {
  case MVT::f16:
    return Subtarget->hasVFP2Base();
  case MVT::f32:
    return Subtarget->hasVFP2Base();
  case MVT::f64:
    return Subtarget->hasFP64();
  case MVT::v4f32:
  case MVT::v8f16:
    return Subtarget->hasMVEFloatOps();
  default:
    return false;
  }
}
static SDValue PerformSHLSimplify(SDNode *N,
                                  TargetLowering::DAGCombinerInfo &DCI,
                                  const ARMSubtarget *ST) {
  // Allow the generic combiner to identify potential bswaps.
  if (DCI.isBeforeLegalize())
    return SDValue();

  // DAG combiner will fold:
  //   (shl (add x, c1), c2) -> (add (shl x, c2), c1 << c2)
  //   (shl (or x, c1), c2)  -> (or (shl x, c2), c1 << c2)
  // Other code patterns that can also be modified have the following form:
  //   b + ((a << 1) | 510)
  //   b + ((a << 1) & 510)
  //   b + ((a << 1) ^ 510)
  //   b + ((a << 1) + 510)
  // Many instructions can perform the shift for free, but it requires both
  // the operands to be registers. If c1 << c2 is too large, a mov immediate
  // instruction will be needed. So, unfold back to the original pattern if:
  // - c1 and c2 are small enough that they don't require mov imms.
  // - the user(s) of the node can perform an shl.
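  //
  // For example (illustrative only): (add (shl a, 1), 510) can be unfolded
  // back to (shl (add a, 255), 1) when every user of the add can fold the
  // shift into its own operand, since 510 == 255 << 1.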
  // No shifted operands for 16-bit instructions.
  if (ST->isThumb() && ST->isThumb1Only())
    return SDValue();

  // Check that all the users could perform the shl themselves.
  for (auto *U : N->uses()) {
    switch (U->getOpcode()) {
    default:
      return SDValue();
    case ISD::SUB:
    case ISD::ADD:
    case ISD::AND:
    case ISD::OR:
    case ISD::XOR:
    case ISD::SETCC:
    case ARMISD::CMP:
      // Check that the user isn't already using a constant because there
      // aren't any instructions that support an immediate operand and a
      // shifted operand.
      if (isa<ConstantSDNode>(U->getOperand(0)) ||
          isa<ConstantSDNode>(U->getOperand(1)))
        return SDValue();

      // Check that it's not already using a shift.
      if (U->getOperand(0).getOpcode() == ISD::SHL ||
          U->getOperand(1).getOpcode() == ISD::SHL)
        return SDValue();
      break;
    }
  }

  if (N->getOpcode() != ISD::ADD && N->getOpcode() != ISD::OR &&
      N->getOpcode() != ISD::XOR && N->getOpcode() != ISD::AND)
    return SDValue();

  if (N->getOperand(0).getOpcode() != ISD::SHL)
    return SDValue();

  SDValue SHL = N->getOperand(0);

  auto *C1ShlC2 = dyn_cast<ConstantSDNode>(N->getOperand(1));
  auto *C2 = dyn_cast<ConstantSDNode>(SHL.getOperand(1));
  if (!C1ShlC2 || !C2)
    return SDValue();

  APInt C2Int = C2->getAPIntValue();
  APInt C1Int = C1ShlC2->getAPIntValue();
  unsigned C2Width = C2Int.getBitWidth();
  if (C2Int.uge(C2Width))
    return SDValue();
  uint64_t C2Value = C2Int.getZExtValue();

  // Check that performing a lshr will not lose any information.
  APInt Mask = APInt::getHighBitsSet(C2Width, C2Width - C2Value);
  if ((C1Int & Mask) != C1Int)
    return SDValue();

  // Shift the first constant.
  C1Int.lshrInPlace(C2Int);

  // The immediates are encoded as an 8-bit value that can be rotated.
  auto LargeImm = [](const APInt &Imm) {
    unsigned Zeros = Imm.countLeadingZeros() + Imm.countTrailingZeros();
    return Imm.getBitWidth() - Zeros > 8;
  };

  if (LargeImm(C1Int) || LargeImm(C2Int))
    return SDValue();

  SelectionDAG &DAG = DCI.DAG;
  SDLoc dl(N);
  SDValue X = SHL.getOperand(0);
  SDValue BinOp = DAG.getNode(N->getOpcode(), dl, MVT::i32, X,
                              DAG.getConstant(C1Int, dl, MVT::i32));
  // Shift left to compensate for the lshr of C1Int.
  SDValue Res = DAG.getNode(ISD::SHL, dl, MVT::i32, BinOp, SHL.getOperand(1));

  LLVM_DEBUG(dbgs() << "Simplify shl use:\n"; SHL.getOperand(0).dump();
             SHL.dump(); N->dump());
  LLVM_DEBUG(dbgs() << "Into:\n"; X.dump(); BinOp.dump(); Res.dump());
  return Res;
}
/// PerformADDCombine - Target-specific dag combine xforms for ISD::ADD.
///
static SDValue PerformADDCombine(SDNode *N,
                                 TargetLowering::DAGCombinerInfo &DCI,
                                 const ARMSubtarget *Subtarget) {
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);

  // Only works one way, because it needs an immediate operand.
  if (SDValue Result = PerformSHLSimplify(N, DCI, Subtarget))
    return Result;

  if (SDValue Result = PerformADDVecReduce(N, DCI.DAG, Subtarget))
    return Result;

  // First try with the default operand order.
  if (SDValue Result = PerformADDCombineWithOperands(N, N0, N1, DCI, Subtarget))
    return Result;

  // If that didn't work, try again with the operands commuted.
  return PerformADDCombineWithOperands(N, N1, N0, DCI, Subtarget);
}
  12338. // Combine (sub 0, (csinc X, Y, CC)) -> (csinv -X, Y, CC)
  12339. // providing -X is as cheap as X (currently, just a constant).
  12340. static SDValue PerformSubCSINCCombine(SDNode *N, SelectionDAG &DAG) {
  12341. if (N->getValueType(0) != MVT::i32 || !isNullConstant(N->getOperand(0)))
  12342. return SDValue();
  12343. SDValue CSINC = N->getOperand(1);
  12344. if (CSINC.getOpcode() != ARMISD::CSINC || !CSINC.hasOneUse())
  12345. return SDValue();
  12346. ConstantSDNode *X = dyn_cast<ConstantSDNode>(CSINC.getOperand(0));
  12347. if (!X)
  12348. return SDValue();
  12349. return DAG.getNode(ARMISD::CSINV, SDLoc(N), MVT::i32,
  12350. DAG.getNode(ISD::SUB, SDLoc(N), MVT::i32, N->getOperand(0),
  12351. CSINC.getOperand(0)),
  12352. CSINC.getOperand(1), CSINC.getOperand(2),
  12353. CSINC.getOperand(3));
  12354. }
  12355. /// PerformSUBCombine - Target-specific dag combine xforms for ISD::SUB.
  12356. ///
  12357. static SDValue PerformSUBCombine(SDNode *N,
  12358. TargetLowering::DAGCombinerInfo &DCI,
  12359. const ARMSubtarget *Subtarget) {
  12360. SDValue N0 = N->getOperand(0);
  12361. SDValue N1 = N->getOperand(1);
  12362. // fold (sub x, (select cc, 0, c)) -> (select cc, x, (sub x, c))
  12363. if (N1.getNode()->hasOneUse())
  12364. if (SDValue Result = combineSelectAndUse(N, N1, N0, DCI))
  12365. return Result;
  12366. if (SDValue R = PerformSubCSINCCombine(N, DCI.DAG))
  12367. return R;
  12368. if (!Subtarget->hasMVEIntegerOps() || !N->getValueType(0).isVector())
  12369. return SDValue();
  12370. // Fold (sub (ARMvmovImm 0), (ARMvdup x)) -> (ARMvdup (sub 0, x))
  12371. // so that we can readily pattern match more MVE instructions which can use
  12372. // a scalar operand.
  12373. SDValue VDup = N->getOperand(1);
  12374. if (VDup->getOpcode() != ARMISD::VDUP)
  12375. return SDValue();
  12376. SDValue VMov = N->getOperand(0);
  12377. if (VMov->getOpcode() == ISD::BITCAST)
  12378. VMov = VMov->getOperand(0);
  12379. if (VMov->getOpcode() != ARMISD::VMOVIMM || !isZeroVector(VMov))
  12380. return SDValue();
  12381. SDLoc dl(N);
  12382. SDValue Negate = DCI.DAG.getNode(ISD::SUB, dl, MVT::i32,
  12383. DCI.DAG.getConstant(0, dl, MVT::i32),
  12384. VDup->getOperand(0));
  12385. return DCI.DAG.getNode(ARMISD::VDUP, dl, N->getValueType(0), Negate);
  12386. }
  12387. /// PerformVMULCombine
  12388. /// Distribute (A + B) * C to (A * C) + (B * C) to take advantage of the
  12389. /// special multiplier accumulator forwarding.
  12390. /// vmul d3, d0, d2
  12391. /// vmla d3, d1, d2
  12392. /// is faster than
  12393. /// vadd d3, d0, d1
  12394. /// vmul d3, d3, d2
  12395. // However, for (A + B) * (A + B),
  12396. // vadd d2, d0, d1
  12397. // vmul d3, d0, d2
  12398. // vmla d3, d1, d2
  12399. // is slower than
  12400. // vadd d2, d0, d1
  12401. // vmul d3, d2, d2
  12402. static SDValue PerformVMULCombine(SDNode *N,
  12403. TargetLowering::DAGCombinerInfo &DCI,
  12404. const ARMSubtarget *Subtarget) {
  12405. if (!Subtarget->hasVMLxForwarding())
  12406. return SDValue();
  12407. SelectionDAG &DAG = DCI.DAG;
  12408. SDValue N0 = N->getOperand(0);
  12409. SDValue N1 = N->getOperand(1);
  12410. unsigned Opcode = N0.getOpcode();
  12411. if (Opcode != ISD::ADD && Opcode != ISD::SUB &&
  12412. Opcode != ISD::FADD && Opcode != ISD::FSUB) {
  12413. Opcode = N1.getOpcode();
  12414. if (Opcode != ISD::ADD && Opcode != ISD::SUB &&
  12415. Opcode != ISD::FADD && Opcode != ISD::FSUB)
  12416. return SDValue();
  12417. std::swap(N0, N1);
  12418. }
  12419. if (N0 == N1)
  12420. return SDValue();
  12421. EVT VT = N->getValueType(0);
  12422. SDLoc DL(N);
  12423. SDValue N00 = N0->getOperand(0);
  12424. SDValue N01 = N0->getOperand(1);
  12425. return DAG.getNode(Opcode, DL, VT,
  12426. DAG.getNode(ISD::MUL, DL, VT, N00, N1),
  12427. DAG.getNode(ISD::MUL, DL, VT, N01, N1));
  12428. }
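// Try to turn a v2i64 multiply whose operands are sign- or zero-extended from
// 32 bits into an ARMISD::VMULLs/VMULLu of the underlying v4i32 values, so
// that MVE's widening multiply instructions can be selected.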
  12429. static SDValue PerformMVEVMULLCombine(SDNode *N, SelectionDAG &DAG,
  12430. const ARMSubtarget *Subtarget) {
  12431. EVT VT = N->getValueType(0);
  12432. if (VT != MVT::v2i64)
  12433. return SDValue();
  12434. SDValue N0 = N->getOperand(0);
  12435. SDValue N1 = N->getOperand(1);
  12436. auto IsSignExt = [&](SDValue Op) {
  12437. if (Op->getOpcode() != ISD::SIGN_EXTEND_INREG)
  12438. return SDValue();
  12439. EVT VT = cast<VTSDNode>(Op->getOperand(1))->getVT();
  12440. if (VT.getScalarSizeInBits() == 32)
  12441. return Op->getOperand(0);
  12442. return SDValue();
  12443. };
  12444. auto IsZeroExt = [&](SDValue Op) {
  12445. // Zero extends are a little more awkward. At the point we are matching
  12446. // this, we are looking for an AND with a (-1, 0, -1, 0) buildvector mask.
  12447. // That might be before or after a bitcast depending on how the and is
  12448. // placed. Because this has to look through bitcasts, it is currently only
  12449. // supported on LE.
  12450. if (!Subtarget->isLittle())
  12451. return SDValue();
  12452. SDValue And = Op;
  12453. if (And->getOpcode() == ISD::BITCAST)
  12454. And = And->getOperand(0);
  12455. if (And->getOpcode() != ISD::AND)
  12456. return SDValue();
  12457. SDValue Mask = And->getOperand(1);
  12458. if (Mask->getOpcode() == ISD::BITCAST)
  12459. Mask = Mask->getOperand(0);
  12460. if (Mask->getOpcode() != ISD::BUILD_VECTOR ||
  12461. Mask.getValueType() != MVT::v4i32)
  12462. return SDValue();
  12463. if (isAllOnesConstant(Mask->getOperand(0)) &&
  12464. isNullConstant(Mask->getOperand(1)) &&
  12465. isAllOnesConstant(Mask->getOperand(2)) &&
  12466. isNullConstant(Mask->getOperand(3)))
  12467. return And->getOperand(0);
  12468. return SDValue();
  12469. };
  12470. SDLoc dl(N);
  12471. if (SDValue Op0 = IsSignExt(N0)) {
  12472. if (SDValue Op1 = IsSignExt(N1)) {
  12473. SDValue New0a = DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, MVT::v4i32, Op0);
  12474. SDValue New1a = DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, MVT::v4i32, Op1);
  12475. return DAG.getNode(ARMISD::VMULLs, dl, VT, New0a, New1a);
  12476. }
  12477. }
  12478. if (SDValue Op0 = IsZeroExt(N0)) {
  12479. if (SDValue Op1 = IsZeroExt(N1)) {
  12480. SDValue New0a = DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, MVT::v4i32, Op0);
  12481. SDValue New1a = DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, MVT::v4i32, Op1);
  12482. return DAG.getNode(ARMISD::VMULLu, dl, VT, New0a, New1a);
  12483. }
  12484. }
  12485. return SDValue();
  12486. }
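/// PerformMULCombine - Target-specific dag combine xforms for ISD::MUL.
/// For i32 multiplies by a constant of the form +/-(2^N +/- 1), possibly
/// scaled by a further power of two, replace the multiply with a
/// shift/add or shift/sub sequence.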
  12487. static SDValue PerformMULCombine(SDNode *N,
  12488. TargetLowering::DAGCombinerInfo &DCI,
  12489. const ARMSubtarget *Subtarget) {
  12490. SelectionDAG &DAG = DCI.DAG;
  12491. EVT VT = N->getValueType(0);
  12492. if (Subtarget->hasMVEIntegerOps() && VT == MVT::v2i64)
  12493. return PerformMVEVMULLCombine(N, DAG, Subtarget);
  12494. if (Subtarget->isThumb1Only())
  12495. return SDValue();
  12496. if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
  12497. return SDValue();
  12498. if (VT.is64BitVector() || VT.is128BitVector())
  12499. return PerformVMULCombine(N, DCI, Subtarget);
  12500. if (VT != MVT::i32)
  12501. return SDValue();
  12502. ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1));
  12503. if (!C)
  12504. return SDValue();
  12505. int64_t MulAmt = C->getSExtValue();
  12506. unsigned ShiftAmt = countTrailingZeros<uint64_t>(MulAmt);
  12507. ShiftAmt = ShiftAmt & (32 - 1);
  12508. SDValue V = N->getOperand(0);
  12509. SDLoc DL(N);
  12510. SDValue Res;
  12511. MulAmt >>= ShiftAmt;
  12512. if (MulAmt >= 0) {
  12513. if (isPowerOf2_32(MulAmt - 1)) {
  12514. // (mul x, 2^N + 1) => (add (shl x, N), x)
  12515. Res = DAG.getNode(ISD::ADD, DL, VT,
  12516. V,
  12517. DAG.getNode(ISD::SHL, DL, VT,
  12518. V,
  12519. DAG.getConstant(Log2_32(MulAmt - 1), DL,
  12520. MVT::i32)));
  12521. } else if (isPowerOf2_32(MulAmt + 1)) {
  12522. // (mul x, 2^N - 1) => (sub (shl x, N), x)
  12523. Res = DAG.getNode(ISD::SUB, DL, VT,
  12524. DAG.getNode(ISD::SHL, DL, VT,
  12525. V,
  12526. DAG.getConstant(Log2_32(MulAmt + 1), DL,
  12527. MVT::i32)),
  12528. V);
  12529. } else
  12530. return SDValue();
  12531. } else {
  12532. uint64_t MulAmtAbs = -MulAmt;
  12533. if (isPowerOf2_32(MulAmtAbs + 1)) {
  12534. // (mul x, -(2^N - 1)) => (sub x, (shl x, N))
  12535. Res = DAG.getNode(ISD::SUB, DL, VT,
  12536. V,
  12537. DAG.getNode(ISD::SHL, DL, VT,
  12538. V,
  12539. DAG.getConstant(Log2_32(MulAmtAbs + 1), DL,
  12540. MVT::i32)));
  12541. } else if (isPowerOf2_32(MulAmtAbs - 1)) {
  12542. // (mul x, -(2^N + 1)) => - (add (shl x, N), x)
  12543. Res = DAG.getNode(ISD::ADD, DL, VT,
  12544. V,
  12545. DAG.getNode(ISD::SHL, DL, VT,
  12546. V,
  12547. DAG.getConstant(Log2_32(MulAmtAbs - 1), DL,
  12548. MVT::i32)));
  12549. Res = DAG.getNode(ISD::SUB, DL, VT,
  12550. DAG.getConstant(0, DL, MVT::i32), Res);
  12551. } else
  12552. return SDValue();
  12553. }
  12554. if (ShiftAmt != 0)
  12555. Res = DAG.getNode(ISD::SHL, DL, VT,
  12556. Res, DAG.getConstant(ShiftAmt, DL, MVT::i32));
  12557. // Do not add new nodes to DAG combiner worklist.
  12558. DCI.CombineTo(N, Res, false);
  12559. return SDValue();
  12560. }
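// CombineANDShift - For (and (shl/srl x, c2), c1), where c1 is a mask, an
// inverted mask, or a shifted mask, try to rewrite the node as a pair of
// shifts so that the mask constant c1 never needs to be materialized.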
  12561. static SDValue CombineANDShift(SDNode *N,
  12562. TargetLowering::DAGCombinerInfo &DCI,
  12563. const ARMSubtarget *Subtarget) {
  12564. // Allow DAGCombine to pattern-match before we touch the canonical form.
  12565. if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
  12566. return SDValue();
  12567. if (N->getValueType(0) != MVT::i32)
  12568. return SDValue();
  12569. ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N->getOperand(1));
  12570. if (!N1C)
  12571. return SDValue();
  12572. uint32_t C1 = (uint32_t)N1C->getZExtValue();
  12573. // Don't transform uxtb/uxth.
  12574. if (C1 == 255 || C1 == 65535)
  12575. return SDValue();
  12576. SDNode *N0 = N->getOperand(0).getNode();
  12577. if (!N0->hasOneUse())
  12578. return SDValue();
  12579. if (N0->getOpcode() != ISD::SHL && N0->getOpcode() != ISD::SRL)
  12580. return SDValue();
  12581. bool LeftShift = N0->getOpcode() == ISD::SHL;
  12582. ConstantSDNode *N01C = dyn_cast<ConstantSDNode>(N0->getOperand(1));
  12583. if (!N01C)
  12584. return SDValue();
  12585. uint32_t C2 = (uint32_t)N01C->getZExtValue();
  12586. if (!C2 || C2 >= 32)
  12587. return SDValue();
  12588. // Clear irrelevant bits in the mask.
  12589. if (LeftShift)
  12590. C1 &= (-1U << C2);
  12591. else
  12592. C1 &= (-1U >> C2);
  12593. SelectionDAG &DAG = DCI.DAG;
  12594. SDLoc DL(N);
  12595. // We have a pattern of the form "(and (shl x, c2) c1)" or
  12596. // "(and (srl x, c2) c1)", where c1 is a shifted mask. Try to
  12597. // transform to a pair of shifts, to save materializing c1.
  12598. // First pattern: right shift, then mask off leading bits.
  12599. // FIXME: Use demanded bits?
  12600. if (!LeftShift && isMask_32(C1)) {
  12601. uint32_t C3 = countLeadingZeros(C1);
  12602. if (C2 < C3) {
  12603. SDValue SHL = DAG.getNode(ISD::SHL, DL, MVT::i32, N0->getOperand(0),
  12604. DAG.getConstant(C3 - C2, DL, MVT::i32));
  12605. return DAG.getNode(ISD::SRL, DL, MVT::i32, SHL,
  12606. DAG.getConstant(C3, DL, MVT::i32));
  12607. }
  12608. }
  12609. // First pattern, reversed: left shift, then mask off trailing bits.
  12610. if (LeftShift && isMask_32(~C1)) {
  12611. uint32_t C3 = countTrailingZeros(C1);
  12612. if (C2 < C3) {
  12613. SDValue SHL = DAG.getNode(ISD::SRL, DL, MVT::i32, N0->getOperand(0),
  12614. DAG.getConstant(C3 - C2, DL, MVT::i32));
  12615. return DAG.getNode(ISD::SHL, DL, MVT::i32, SHL,
  12616. DAG.getConstant(C3, DL, MVT::i32));
  12617. }
  12618. }
  12619. // Second pattern: left shift, then mask off leading bits.
  12620. // FIXME: Use demanded bits?
  12621. if (LeftShift && isShiftedMask_32(C1)) {
  12622. uint32_t Trailing = countTrailingZeros(C1);
  12623. uint32_t C3 = countLeadingZeros(C1);
  12624. if (Trailing == C2 && C2 + C3 < 32) {
  12625. SDValue SHL = DAG.getNode(ISD::SHL, DL, MVT::i32, N0->getOperand(0),
  12626. DAG.getConstant(C2 + C3, DL, MVT::i32));
  12627. return DAG.getNode(ISD::SRL, DL, MVT::i32, SHL,
  12628. DAG.getConstant(C3, DL, MVT::i32));
  12629. }
  12630. }
  12631. // Second pattern, reversed: right shift, then mask off trailing bits.
  12632. // FIXME: Handle other patterns of known/demanded bits.
  12633. if (!LeftShift && isShiftedMask_32(C1)) {
  12634. uint32_t Leading = countLeadingZeros(C1);
  12635. uint32_t C3 = countTrailingZeros(C1);
  12636. if (Leading == C2 && C2 + C3 < 32) {
  12637. SDValue SHL = DAG.getNode(ISD::SRL, DL, MVT::i32, N0->getOperand(0),
  12638. DAG.getConstant(C2 + C3, DL, MVT::i32));
  12639. return DAG.getNode(ISD::SHL, DL, MVT::i32, SHL,
  12640. DAG.getConstant(C3, DL, MVT::i32));
  12641. }
  12642. }
  12643. // FIXME: Transform "(and (shl x, c2) c1)" ->
  12644. // "(shl (and x, c1>>c2), c2)" if "c1 >> c2" is a cheaper immediate than
  12645. // c1.
  12646. return SDValue();
  12647. }
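/// PerformANDCombine - Target-specific dag combine xforms for ISD::AND.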
  12648. static SDValue PerformANDCombine(SDNode *N,
  12649. TargetLowering::DAGCombinerInfo &DCI,
  12650. const ARMSubtarget *Subtarget) {
  12651. // Attempt to use immediate-form VBIC
  12652. BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(N->getOperand(1));
  12653. SDLoc dl(N);
  12654. EVT VT = N->getValueType(0);
  12655. SelectionDAG &DAG = DCI.DAG;
  12656. if (!DAG.getTargetLoweringInfo().isTypeLegal(VT) || VT == MVT::v2i1 ||
  12657. VT == MVT::v4i1 || VT == MVT::v8i1 || VT == MVT::v16i1)
  12658. return SDValue();
  12659. APInt SplatBits, SplatUndef;
  12660. unsigned SplatBitSize;
  12661. bool HasAnyUndefs;
  12662. if (BVN && (Subtarget->hasNEON() || Subtarget->hasMVEIntegerOps()) &&
  12663. BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) {
  12664. if (SplatBitSize == 8 || SplatBitSize == 16 || SplatBitSize == 32 ||
  12665. SplatBitSize == 64) {
  12666. EVT VbicVT;
  12667. SDValue Val = isVMOVModifiedImm((~SplatBits).getZExtValue(),
  12668. SplatUndef.getZExtValue(), SplatBitSize,
  12669. DAG, dl, VbicVT, VT, OtherModImm);
  12670. if (Val.getNode()) {
  12671. SDValue Input =
  12672. DAG.getNode(ISD::BITCAST, dl, VbicVT, N->getOperand(0));
  12673. SDValue Vbic = DAG.getNode(ARMISD::VBICIMM, dl, VbicVT, Input, Val);
  12674. return DAG.getNode(ISD::BITCAST, dl, VT, Vbic);
  12675. }
  12676. }
  12677. }
  12678. if (!Subtarget->isThumb1Only()) {
  12679. // fold (and (select cc, -1, c), x) -> (select cc, x, (and x, c))
  12680. if (SDValue Result = combineSelectAndUseCommutative(N, true, DCI))
  12681. return Result;
  12682. if (SDValue Result = PerformSHLSimplify(N, DCI, Subtarget))
  12683. return Result;
  12684. }
  12685. if (Subtarget->isThumb1Only())
  12686. if (SDValue Result = CombineANDShift(N, DCI, Subtarget))
  12687. return Result;
  12688. return SDValue();
  12689. }
  12690. // Try combining OR nodes to SMULWB, SMULWT.
  12691. static SDValue PerformORCombineToSMULWBT(SDNode *OR,
  12692. TargetLowering::DAGCombinerInfo &DCI,
  12693. const ARMSubtarget *Subtarget) {
  12694. if (!Subtarget->hasV6Ops() ||
  12695. (Subtarget->isThumb() &&
  12696. (!Subtarget->hasThumb2() || !Subtarget->hasDSP())))
  12697. return SDValue();
  12698. SDValue SRL = OR->getOperand(0);
  12699. SDValue SHL = OR->getOperand(1);
  12700. if (SRL.getOpcode() != ISD::SRL || SHL.getOpcode() != ISD::SHL) {
  12701. SRL = OR->getOperand(1);
  12702. SHL = OR->getOperand(0);
  12703. }
  12704. if (!isSRL16(SRL) || !isSHL16(SHL))
  12705. return SDValue();
  12706. // The first operands to the shifts need to be the two results from the
  12707. // same smul_lohi node.
  12708. if ((SRL.getOperand(0).getNode() != SHL.getOperand(0).getNode()) ||
  12709. SRL.getOperand(0).getOpcode() != ISD::SMUL_LOHI)
  12710. return SDValue();
  12711. SDNode *SMULLOHI = SRL.getOperand(0).getNode();
  12712. if (SRL.getOperand(0) != SDValue(SMULLOHI, 0) ||
  12713. SHL.getOperand(0) != SDValue(SMULLOHI, 1))
  12714. return SDValue();
  12715. // Now we have:
  12716. // (or (srl (smul_lohi ?, ?), 16), (shl (smul_lohi ?, ?), 16))
  12717. // For SMUL[B|T] smul_lohi will take a 32-bit and a 16-bit argument.
  12718. // For SMULWB the 16-bit value will be sign extended somehow.
  12719. // For SMULWT only the SRA is required.
  12720. // Check both sides of SMUL_LOHI
  12721. SDValue OpS16 = SMULLOHI->getOperand(0);
  12722. SDValue OpS32 = SMULLOHI->getOperand(1);
  12723. SelectionDAG &DAG = DCI.DAG;
  12724. if (!isS16(OpS16, DAG) && !isSRA16(OpS16)) {
  12725. OpS16 = OpS32;
  12726. OpS32 = SMULLOHI->getOperand(0);
  12727. }
  12728. SDLoc dl(OR);
  12729. unsigned Opcode = 0;
  12730. if (isS16(OpS16, DAG))
  12731. Opcode = ARMISD::SMULWB;
  12732. else if (isSRA16(OpS16)) {
  12733. Opcode = ARMISD::SMULWT;
  12734. OpS16 = OpS16->getOperand(0);
  12735. }
  12736. else
  12737. return SDValue();
  12738. SDValue Res = DAG.getNode(Opcode, dl, MVT::i32, OpS32, OpS16);
  12739. DAG.ReplaceAllUsesOfValueWith(SDValue(OR, 0), Res);
  12740. return SDValue(OR, 0);
  12741. }
  12742. static SDValue PerformORCombineToBFI(SDNode *N,
  12743. TargetLowering::DAGCombinerInfo &DCI,
  12744. const ARMSubtarget *Subtarget) {
  12745. // BFI is only available on V6T2+
  12746. if (Subtarget->isThumb1Only() || !Subtarget->hasV6T2Ops())
  12747. return SDValue();
  12748. EVT VT = N->getValueType(0);
  12749. SDValue N0 = N->getOperand(0);
  12750. SDValue N1 = N->getOperand(1);
  12751. SelectionDAG &DAG = DCI.DAG;
  12752. SDLoc DL(N);
  12753. // 1) or (and A, mask), val => ARMbfi A, val, mask
  12754. // iff (val & mask) == val
  12755. //
  12756. // 2) or (and A, mask), (and B, mask2) => ARMbfi A, (lsr B, amt), mask
  12757. // 2a) iff isBitFieldInvertedMask(mask) && isBitFieldInvertedMask(~mask2)
  12758. // && mask == ~mask2
  12759. // 2b) iff isBitFieldInvertedMask(~mask) && isBitFieldInvertedMask(mask2)
  12760. // && ~mask == mask2
  12761. // (i.e., copy a bitfield value into another bitfield of the same width)
  12762. if (VT != MVT::i32)
  12763. return SDValue();
  12764. SDValue N00 = N0.getOperand(0);
  12765. // The value and the mask need to be constants so we can verify this is
  12766. // actually a bitfield set. If the mask is 0xffff, we can do better
  12767. // via a movt instruction, so don't use BFI in that case.
  12768. SDValue MaskOp = N0.getOperand(1);
  12769. ConstantSDNode *MaskC = dyn_cast<ConstantSDNode>(MaskOp);
  12770. if (!MaskC)
  12771. return SDValue();
  12772. unsigned Mask = MaskC->getZExtValue();
  12773. if (Mask == 0xffff)
  12774. return SDValue();
  12775. SDValue Res;
  12776. // Case (1): or (and A, mask), val => ARMbfi A, val, mask
  12777. ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
  12778. if (N1C) {
  12779. unsigned Val = N1C->getZExtValue();
  12780. if ((Val & ~Mask) != Val)
  12781. return SDValue();
  12782. if (ARM::isBitFieldInvertedMask(Mask)) {
  12783. Val >>= countTrailingZeros(~Mask);
  12784. Res = DAG.getNode(ARMISD::BFI, DL, VT, N00,
  12785. DAG.getConstant(Val, DL, MVT::i32),
  12786. DAG.getConstant(Mask, DL, MVT::i32));
  12787. DCI.CombineTo(N, Res, false);
  12788. // Return value from the original node to inform the combiner that N is
  12789. // now dead.
  12790. return SDValue(N, 0);
  12791. }
  12792. } else if (N1.getOpcode() == ISD::AND) {
  12793. // case (2) or (and A, mask), (and B, mask2) => ARMbfi A, (lsr B, amt), mask
  12794. ConstantSDNode *N11C = dyn_cast<ConstantSDNode>(N1.getOperand(1));
  12795. if (!N11C)
  12796. return SDValue();
  12797. unsigned Mask2 = N11C->getZExtValue();
  12798. // Mask and ~Mask2 (or reverse) must be equivalent for the BFI pattern
  12799. // to match as-is.
  12800. if (ARM::isBitFieldInvertedMask(Mask) &&
  12801. (Mask == ~Mask2)) {
  12802. // The pack halfword instruction works better for masks that fit it,
  12803. // so use that when it's available.
  12804. if (Subtarget->hasDSP() &&
  12805. (Mask == 0xffff || Mask == 0xffff0000))
  12806. return SDValue();
  12807. // 2a
  12808. unsigned amt = countTrailingZeros(Mask2);
  12809. Res = DAG.getNode(ISD::SRL, DL, VT, N1.getOperand(0),
  12810. DAG.getConstant(amt, DL, MVT::i32));
  12811. Res = DAG.getNode(ARMISD::BFI, DL, VT, N00, Res,
  12812. DAG.getConstant(Mask, DL, MVT::i32));
  12813. DCI.CombineTo(N, Res, false);
  12814. // Return value from the original node to inform the combiner that N is
  12815. // now dead.
  12816. return SDValue(N, 0);
  12817. } else if (ARM::isBitFieldInvertedMask(~Mask) &&
  12818. (~Mask == Mask2)) {
  12819. // The pack halfword instruction works better for masks that fit it,
  12820. // so use that when it's available.
  12821. if (Subtarget->hasDSP() &&
  12822. (Mask2 == 0xffff || Mask2 == 0xffff0000))
  12823. return SDValue();
  12824. // 2b
  12825. unsigned lsb = countTrailingZeros(Mask);
  12826. Res = DAG.getNode(ISD::SRL, DL, VT, N00,
  12827. DAG.getConstant(lsb, DL, MVT::i32));
  12828. Res = DAG.getNode(ARMISD::BFI, DL, VT, N1.getOperand(0), Res,
  12829. DAG.getConstant(Mask2, DL, MVT::i32));
  12830. DCI.CombineTo(N, Res, false);
  12831. // Return value from the original node to inform the combiner that N is
  12832. // now dead.
  12833. return SDValue(N, 0);
  12834. }
  12835. }
  12836. if (DAG.MaskedValueIsZero(N1, MaskC->getAPIntValue()) &&
  12837. N00.getOpcode() == ISD::SHL && isa<ConstantSDNode>(N00.getOperand(1)) &&
  12838. ARM::isBitFieldInvertedMask(~Mask)) {
  12839. // Case (3): or (and (shl A, #shamt), mask), B => ARMbfi B, A, ~mask
  12840. // where lsb(mask) == #shamt and masked bits of B are known zero.
  12841. SDValue ShAmt = N00.getOperand(1);
  12842. unsigned ShAmtC = cast<ConstantSDNode>(ShAmt)->getZExtValue();
  12843. unsigned LSB = countTrailingZeros(Mask);
  12844. if (ShAmtC != LSB)
  12845. return SDValue();
  12846. Res = DAG.getNode(ARMISD::BFI, DL, VT, N1, N00.getOperand(0),
  12847. DAG.getConstant(~Mask, DL, MVT::i32));
  12848. DCI.CombineTo(N, Res, false);
  12849. // Return value from the original node to inform the combiner that N is
  12850. // now dead.
  12851. return SDValue(N, 0);
  12852. }
  12853. return SDValue();
  12854. }
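// Return true if the given condition code can be used in an MVE VCMP/VCMPZ;
// the unsigned conditions HS and HI are only valid for integer compares.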
  12855. static bool isValidMVECond(unsigned CC, bool IsFloat) {
  12856. switch (CC) {
  12857. case ARMCC::EQ:
  12858. case ARMCC::NE:
  12859. case ARMCC::LE:
  12860. case ARMCC::GT:
  12861. case ARMCC::GE:
  12862. case ARMCC::LT:
  12863. return true;
  12864. case ARMCC::HS:
  12865. case ARMCC::HI:
  12866. return !IsFloat;
  12867. default:
  12868. return false;
  12869. };
  12870. }
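// Extract the condition code operand of a VCMP or VCMPZ node.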
  12871. static ARMCC::CondCodes getVCMPCondCode(SDValue N) {
  12872. if (N->getOpcode() == ARMISD::VCMP)
  12873. return (ARMCC::CondCodes)N->getConstantOperandVal(2);
  12874. else if (N->getOpcode() == ARMISD::VCMPZ)
  12875. return (ARMCC::CondCodes)N->getConstantOperandVal(1);
  12876. else
  12877. llvm_unreachable("Not a VCMP/VCMPZ!");
  12878. }
  12879. static bool CanInvertMVEVCMP(SDValue N) {
  12880. ARMCC::CondCodes CC = ARMCC::getOppositeCondition(getVCMPCondCode(N));
  12881. return isValidMVECond(CC, N->getOperand(0).getValueType().isFloatingPoint());
  12882. }
  12883. static SDValue PerformORCombine_i1(SDNode *N, SelectionDAG &DAG,
  12884. const ARMSubtarget *Subtarget) {
  12885. // Try to invert "or A, B" -> "and ~A, ~B", as the "and" is easier to chain
  12886. // together with predicates
  12887. EVT VT = N->getValueType(0);
  12888. SDLoc DL(N);
  12889. SDValue N0 = N->getOperand(0);
  12890. SDValue N1 = N->getOperand(1);
  12891. auto IsFreelyInvertable = [&](SDValue V) {
  12892. if (V->getOpcode() == ARMISD::VCMP || V->getOpcode() == ARMISD::VCMPZ)
  12893. return CanInvertMVEVCMP(V);
  12894. return false;
  12895. };
  12896. // At least one operand must be freely invertible.
  12897. if (!(IsFreelyInvertable(N0) || IsFreelyInvertable(N1)))
  12898. return SDValue();
  12899. SDValue NewN0 = DAG.getLogicalNOT(DL, N0, VT);
  12900. SDValue NewN1 = DAG.getLogicalNOT(DL, N1, VT);
  12901. SDValue And = DAG.getNode(ISD::AND, DL, VT, NewN0, NewN1);
  12902. return DAG.getLogicalNOT(DL, And, VT);
  12903. }
  12904. /// PerformORCombine - Target-specific dag combine xforms for ISD::OR
  12905. static SDValue PerformORCombine(SDNode *N,
  12906. TargetLowering::DAGCombinerInfo &DCI,
  12907. const ARMSubtarget *Subtarget) {
  12908. // Attempt to use immediate-form VORR
  12909. BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(N->getOperand(1));
  12910. SDLoc dl(N);
  12911. EVT VT = N->getValueType(0);
  12912. SelectionDAG &DAG = DCI.DAG;
  12913. if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
  12914. return SDValue();
  12915. if (Subtarget->hasMVEIntegerOps() && (VT == MVT::v2i1 || VT == MVT::v4i1 ||
  12916. VT == MVT::v8i1 || VT == MVT::v16i1))
  12917. return PerformORCombine_i1(N, DAG, Subtarget);
  12918. APInt SplatBits, SplatUndef;
  12919. unsigned SplatBitSize;
  12920. bool HasAnyUndefs;
  12921. if (BVN && (Subtarget->hasNEON() || Subtarget->hasMVEIntegerOps()) &&
  12922. BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) {
  12923. if (SplatBitSize == 8 || SplatBitSize == 16 || SplatBitSize == 32 ||
  12924. SplatBitSize == 64) {
  12925. EVT VorrVT;
  12926. SDValue Val =
  12927. isVMOVModifiedImm(SplatBits.getZExtValue(), SplatUndef.getZExtValue(),
  12928. SplatBitSize, DAG, dl, VorrVT, VT, OtherModImm);
  12929. if (Val.getNode()) {
  12930. SDValue Input =
  12931. DAG.getNode(ISD::BITCAST, dl, VorrVT, N->getOperand(0));
  12932. SDValue Vorr = DAG.getNode(ARMISD::VORRIMM, dl, VorrVT, Input, Val);
  12933. return DAG.getNode(ISD::BITCAST, dl, VT, Vorr);
  12934. }
  12935. }
  12936. }
  12937. if (!Subtarget->isThumb1Only()) {
  12938. // fold (or (select cc, 0, c), x) -> (select cc, x, (or x, c))
  12939. if (SDValue Result = combineSelectAndUseCommutative(N, false, DCI))
  12940. return Result;
  12941. if (SDValue Result = PerformORCombineToSMULWBT(N, DCI, Subtarget))
  12942. return Result;
  12943. }
  12944. SDValue N0 = N->getOperand(0);
  12945. SDValue N1 = N->getOperand(1);
  12946. // (or (and B, A), (and C, ~A)) => (VBSL A, B, C) when A is a constant.
  12947. if (Subtarget->hasNEON() && N1.getOpcode() == ISD::AND && VT.isVector() &&
  12948. DAG.getTargetLoweringInfo().isTypeLegal(VT)) {
  12949. // The code below optimizes (or (and X, Y), Z).
  12950. // The AND operand needs to have a single user to make these optimizations
  12951. // profitable.
  12952. if (N0.getOpcode() != ISD::AND || !N0.hasOneUse())
  12953. return SDValue();
  12954. APInt SplatUndef;
  12955. unsigned SplatBitSize;
  12956. bool HasAnyUndefs;
  12957. APInt SplatBits0, SplatBits1;
  12958. BuildVectorSDNode *BVN0 = dyn_cast<BuildVectorSDNode>(N0->getOperand(1));
  12959. BuildVectorSDNode *BVN1 = dyn_cast<BuildVectorSDNode>(N1->getOperand(1));
  12960. // Ensure that the second operand of both ands are constants
  12961. if (BVN0 && BVN0->isConstantSplat(SplatBits0, SplatUndef, SplatBitSize,
  12962. HasAnyUndefs) && !HasAnyUndefs) {
  12963. if (BVN1 && BVN1->isConstantSplat(SplatBits1, SplatUndef, SplatBitSize,
  12964. HasAnyUndefs) && !HasAnyUndefs) {
  12965. // Ensure that the bit width of the constants are the same and that
  12966. // the splat arguments are logical inverses as per the pattern we
  12967. // are trying to simplify.
  12968. if (SplatBits0.getBitWidth() == SplatBits1.getBitWidth() &&
  12969. SplatBits0 == ~SplatBits1) {
  12970. // Canonicalize the vector type to make instruction selection
  12971. // simpler.
  12972. EVT CanonicalVT = VT.is128BitVector() ? MVT::v4i32 : MVT::v2i32;
  12973. SDValue Result = DAG.getNode(ARMISD::VBSP, dl, CanonicalVT,
  12974. N0->getOperand(1),
  12975. N0->getOperand(0),
  12976. N1->getOperand(0));
  12977. return DAG.getNode(ISD::BITCAST, dl, VT, Result);
  12978. }
  12979. }
  12980. }
  12981. }
  12982. // Try to use the ARM/Thumb2 BFI (bitfield insert) instruction when
  12983. // reasonable.
  12984. if (N0.getOpcode() == ISD::AND && N0.hasOneUse()) {
  12985. if (SDValue Res = PerformORCombineToBFI(N, DCI, Subtarget))
  12986. return Res;
  12987. }
  12988. if (SDValue Result = PerformSHLSimplify(N, DCI, Subtarget))
  12989. return Result;
  12990. return SDValue();
  12991. }
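/// PerformXORCombine - Target-specific dag combine xforms for ISD::XOR.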
  12992. static SDValue PerformXORCombine(SDNode *N,
  12993. TargetLowering::DAGCombinerInfo &DCI,
  12994. const ARMSubtarget *Subtarget) {
  12995. EVT VT = N->getValueType(0);
  12996. SelectionDAG &DAG = DCI.DAG;
  12997. if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
  12998. return SDValue();
  12999. if (!Subtarget->isThumb1Only()) {
  13000. // fold (xor (select cc, 0, c), x) -> (select cc, x, (xor x, c))
  13001. if (SDValue Result = combineSelectAndUseCommutative(N, false, DCI))
  13002. return Result;
  13003. if (SDValue Result = PerformSHLSimplify(N, DCI, Subtarget))
  13004. return Result;
  13005. }
  13006. if (Subtarget->hasMVEIntegerOps()) {
  13007. // fold (xor(vcmp/z, 1)) into a vcmp with the opposite condition.
  13008. SDValue N0 = N->getOperand(0);
  13009. SDValue N1 = N->getOperand(1);
  13010. const TargetLowering *TLI = Subtarget->getTargetLowering();
  13011. if (TLI->isConstTrueVal(N1) &&
  13012. (N0->getOpcode() == ARMISD::VCMP || N0->getOpcode() == ARMISD::VCMPZ)) {
  13013. if (CanInvertMVEVCMP(N0)) {
  13014. SDLoc DL(N0);
  13015. ARMCC::CondCodes CC = ARMCC::getOppositeCondition(getVCMPCondCode(N0));
  13016. SmallVector<SDValue, 4> Ops;
  13017. Ops.push_back(N0->getOperand(0));
  13018. if (N0->getOpcode() == ARMISD::VCMP)
  13019. Ops.push_back(N0->getOperand(1));
  13020. Ops.push_back(DAG.getConstant(CC, DL, MVT::i32));
  13021. return DAG.getNode(N0->getOpcode(), DL, N0->getValueType(0), Ops);
  13022. }
  13023. }
  13024. }
  13025. return SDValue();
  13026. }
  13027. // ParseBFI - given a BFI instruction in N, extract the "from" value (Rn) and return it,
  13028. // and fill in FromMask and ToMask with (consecutive) bits in "from" to be extracted and
  13029. // their position in "to" (Rd).
  13030. static SDValue ParseBFI(SDNode *N, APInt &ToMask, APInt &FromMask) {
  13031. assert(N->getOpcode() == ARMISD::BFI);
  13032. SDValue From = N->getOperand(1);
  13033. ToMask = ~cast<ConstantSDNode>(N->getOperand(2))->getAPIntValue();
  13034. FromMask = APInt::getLowBitsSet(ToMask.getBitWidth(), ToMask.countPopulation());
  13035. // If the Base came from a SHR #C, we can deduce that it is really testing bit
  13036. // #C in the base of the SHR.
  13037. if (From->getOpcode() == ISD::SRL &&
  13038. isa<ConstantSDNode>(From->getOperand(1))) {
  13039. APInt Shift = cast<ConstantSDNode>(From->getOperand(1))->getAPIntValue();
  13040. assert(Shift.getLimitedValue() < 32 && "Shift too large!");
  13041. FromMask <<= Shift.getLimitedValue(31);
  13042. From = From->getOperand(0);
  13043. }
  13044. return From;
  13045. }
  13046. // If A and B contain one contiguous set of bits, does A | B == A . B?
  13047. //
  13048. // Neither A nor B may be zero.
  13049. static bool BitsProperlyConcatenate(const APInt &A, const APInt &B) {
  13050. unsigned LastActiveBitInA = A.countTrailingZeros();
  13051. unsigned FirstActiveBitInB = B.getBitWidth() - B.countLeadingZeros() - 1;
  13052. return LastActiveBitInA - 1 == FirstActiveBitInB;
  13053. }
  13054. static SDValue FindBFIToCombineWith(SDNode *N) {
  13055. // We have a BFI in N. Find a BFI it can combine with, if one exists.
  13056. APInt ToMask, FromMask;
  13057. SDValue From = ParseBFI(N, ToMask, FromMask);
  13058. SDValue To = N->getOperand(0);
  13059. SDValue V = To;
  13060. if (V.getOpcode() != ARMISD::BFI)
  13061. return SDValue();
  13062. APInt NewToMask, NewFromMask;
  13063. SDValue NewFrom = ParseBFI(V.getNode(), NewToMask, NewFromMask);
  13064. if (NewFrom != From)
  13065. return SDValue();
  13066. // Do the written bits conflict with any we've seen so far?
  13067. if ((NewToMask & ToMask).getBoolValue())
  13068. // Conflicting bits.
  13069. return SDValue();
  13070. // Are the new bits contiguous when combined with the old bits?
  13071. if (BitsProperlyConcatenate(ToMask, NewToMask) &&
  13072. BitsProperlyConcatenate(FromMask, NewFromMask))
  13073. return V;
  13074. if (BitsProperlyConcatenate(NewToMask, ToMask) &&
  13075. BitsProperlyConcatenate(NewFromMask, FromMask))
  13076. return V;
  13077. return SDValue();
  13078. }
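/// PerformBFICombine - Target-specific dag combine xforms for ARMISD::BFI.
/// This drops an AND on the value being inserted when the cleared bits are
/// not demanded, merges a pair of compatible BFIs, and reassociates chained
/// BFIs so that lower-bit insertions happen first.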
  13079. static SDValue PerformBFICombine(SDNode *N, SelectionDAG &DAG) {
  13080. SDValue N0 = N->getOperand(0);
  13081. SDValue N1 = N->getOperand(1);
  13082. if (N1.getOpcode() == ISD::AND) {
  13083. // (bfi A, (and B, Mask1), Mask2) -> (bfi A, B, Mask2) iff
  13084. // the bits being cleared by the AND are not demanded by the BFI.
  13085. ConstantSDNode *N11C = dyn_cast<ConstantSDNode>(N1.getOperand(1));
  13086. if (!N11C)
  13087. return SDValue();
  13088. unsigned InvMask = cast<ConstantSDNode>(N->getOperand(2))->getZExtValue();
  13089. unsigned LSB = countTrailingZeros(~InvMask);
  13090. unsigned Width = llvm::bit_width<unsigned>(~InvMask) - LSB;
  13091. assert(Width <
  13092. static_cast<unsigned>(std::numeric_limits<unsigned>::digits) &&
  13093. "undefined behavior");
  13094. unsigned Mask = (1u << Width) - 1;
  13095. unsigned Mask2 = N11C->getZExtValue();
  13096. if ((Mask & (~Mask2)) == 0)
  13097. return DAG.getNode(ARMISD::BFI, SDLoc(N), N->getValueType(0),
  13098. N->getOperand(0), N1.getOperand(0), N->getOperand(2));
  13099. return SDValue();
  13100. }
  13101. // Look for another BFI to combine with.
  13102. if (SDValue CombineBFI = FindBFIToCombineWith(N)) {
  13103. // We've found a BFI.
  13104. APInt ToMask1, FromMask1;
  13105. SDValue From1 = ParseBFI(N, ToMask1, FromMask1);
  13106. APInt ToMask2, FromMask2;
  13107. SDValue From2 = ParseBFI(CombineBFI.getNode(), ToMask2, FromMask2);
  13108. assert(From1 == From2);
  13109. (void)From2;
  13110. // Create a new BFI, combining the two together.
  13111. APInt NewFromMask = FromMask1 | FromMask2;
  13112. APInt NewToMask = ToMask1 | ToMask2;
  13113. EVT VT = N->getValueType(0);
  13114. SDLoc dl(N);
  13115. if (NewFromMask[0] == 0)
  13116. From1 = DAG.getNode(
  13117. ISD::SRL, dl, VT, From1,
  13118. DAG.getConstant(NewFromMask.countTrailingZeros(), dl, VT));
  13119. return DAG.getNode(ARMISD::BFI, dl, VT, CombineBFI.getOperand(0), From1,
  13120. DAG.getConstant(~NewToMask, dl, VT));
  13121. }
  13122. // Reassociate BFI(BFI (A, B, M1), C, M2) to BFI(BFI (A, C, M2), B, M1) so
  13123. // that lower bit insertions are performed first, providing that M1 and M2
  13124. // do no overlap. This can allow multiple BFI instructions to be combined
  13125. // together by the other folds above.
  13126. if (N->getOperand(0).getOpcode() == ARMISD::BFI) {
  13127. APInt ToMask1 = ~N->getConstantOperandAPInt(2);
  13128. APInt ToMask2 = ~N0.getConstantOperandAPInt(2);
  13129. if (!N0.hasOneUse() || (ToMask1 & ToMask2) != 0 ||
  13130. ToMask1.countLeadingZeros() < ToMask2.countLeadingZeros())
  13131. return SDValue();
  13132. EVT VT = N->getValueType(0);
  13133. SDLoc dl(N);
  13134. SDValue BFI1 = DAG.getNode(ARMISD::BFI, dl, VT, N0.getOperand(0),
  13135. N->getOperand(1), N->getOperand(2));
  13136. return DAG.getNode(ARMISD::BFI, dl, VT, BFI1, N0.getOperand(1),
  13137. N0.getOperand(2));
  13138. }
  13139. return SDValue();
  13140. }
  13141. // Check that N is CMPZ(CSINC(0, 0, CC, X)),
  13142. // or CMPZ(CMOV(1, 0, CC, $cpsr, X))
  13143. // return X if valid.
  13144. static SDValue IsCMPZCSINC(SDNode *Cmp, ARMCC::CondCodes &CC) {
  13145. if (Cmp->getOpcode() != ARMISD::CMPZ || !isNullConstant(Cmp->getOperand(1)))
  13146. return SDValue();
  13147. SDValue CSInc = Cmp->getOperand(0);
  13148. // Ignore any `And 1` nodes that may not yet have been removed. We are
  13149. // looking for a value that produces 1/0, so these have no effect on the
  13150. // code.
  13151. while (CSInc.getOpcode() == ISD::AND &&
  13152. isa<ConstantSDNode>(CSInc.getOperand(1)) &&
  13153. CSInc.getConstantOperandVal(1) == 1 && CSInc->hasOneUse())
  13154. CSInc = CSInc.getOperand(0);
  13155. if (CSInc.getOpcode() == ARMISD::CSINC &&
  13156. isNullConstant(CSInc.getOperand(0)) &&
  13157. isNullConstant(CSInc.getOperand(1)) && CSInc->hasOneUse()) {
  13158. CC = (ARMCC::CondCodes)CSInc.getConstantOperandVal(2);
  13159. return CSInc.getOperand(3);
  13160. }
  13161. if (CSInc.getOpcode() == ARMISD::CMOV && isOneConstant(CSInc.getOperand(0)) &&
  13162. isNullConstant(CSInc.getOperand(1)) && CSInc->hasOneUse()) {
  13163. CC = (ARMCC::CondCodes)CSInc.getConstantOperandVal(2);
  13164. return CSInc.getOperand(4);
  13165. }
  13166. if (CSInc.getOpcode() == ARMISD::CMOV && isOneConstant(CSInc.getOperand(1)) &&
  13167. isNullConstant(CSInc.getOperand(0)) && CSInc->hasOneUse()) {
  13168. CC = ARMCC::getOppositeCondition(
  13169. (ARMCC::CondCodes)CSInc.getConstantOperandVal(2));
  13170. return CSInc.getOperand(4);
  13171. }
  13172. return SDValue();
  13173. }
  13174. static SDValue PerformCMPZCombine(SDNode *N, SelectionDAG &DAG) {
  13175. // Given CMPZ(CSINC(C, 0, 0, EQ), 0), we can just use C directly. As in
  13176. // t92: glue = ARMISD::CMPZ t74, 0
  13177. // t93: i32 = ARMISD::CSINC 0, 0, 1, t92
  13178. // t96: glue = ARMISD::CMPZ t93, 0
  13179. // t114: i32 = ARMISD::CSINV 0, 0, 0, t96
  13180. ARMCC::CondCodes Cond;
  13181. if (SDValue C = IsCMPZCSINC(N, Cond))
  13182. if (Cond == ARMCC::EQ)
  13183. return C;
  13184. return SDValue();
  13185. }
  13186. static SDValue PerformCSETCombine(SDNode *N, SelectionDAG &DAG) {
  13187. // Fold away an unnecessary CMPZ/CSINC
  13188. // CSXYZ A, B, C1 (CMPZ (CSINC 0, 0, C2, D), 0) ->
  13189. // if C1==EQ -> CSXYZ A, B, C2, D
  13190. // if C1==NE -> CSXYZ A, B, NOT(C2), D
  13191. ARMCC::CondCodes Cond;
  13192. if (SDValue C = IsCMPZCSINC(N->getOperand(3).getNode(), Cond)) {
  13193. if (N->getConstantOperandVal(2) == ARMCC::EQ)
  13194. return DAG.getNode(N->getOpcode(), SDLoc(N), MVT::i32, N->getOperand(0),
  13195. N->getOperand(1),
  13196. DAG.getConstant(Cond, SDLoc(N), MVT::i32), C);
  13197. if (N->getConstantOperandVal(2) == ARMCC::NE)
  13198. return DAG.getNode(
  13199. N->getOpcode(), SDLoc(N), MVT::i32, N->getOperand(0),
  13200. N->getOperand(1),
  13201. DAG.getConstant(ARMCC::getOppositeCondition(Cond), SDLoc(N), MVT::i32), C);
  13202. }
  13203. return SDValue();
  13204. }
  13205. /// PerformVMOVRRDCombine - Target-specific dag combine xforms for
  13206. /// ARMISD::VMOVRRD.
  13207. static SDValue PerformVMOVRRDCombine(SDNode *N,
  13208. TargetLowering::DAGCombinerInfo &DCI,
  13209. const ARMSubtarget *Subtarget) {
  13210. // vmovrrd(vmovdrr x, y) -> x,y
  13211. SDValue InDouble = N->getOperand(0);
  13212. if (InDouble.getOpcode() == ARMISD::VMOVDRR && Subtarget->hasFP64())
  13213. return DCI.CombineTo(N, InDouble.getOperand(0), InDouble.getOperand(1));
  13214. // vmovrrd(load f64) -> (load i32), (load i32)
  13215. SDNode *InNode = InDouble.getNode();
  13216. if (ISD::isNormalLoad(InNode) && InNode->hasOneUse() &&
  13217. InNode->getValueType(0) == MVT::f64 &&
  13218. InNode->getOperand(1).getOpcode() == ISD::FrameIndex &&
  13219. !cast<LoadSDNode>(InNode)->isVolatile()) {
  13220. // TODO: Should this be done for non-FrameIndex operands?
  13221. LoadSDNode *LD = cast<LoadSDNode>(InNode);
  13222. SelectionDAG &DAG = DCI.DAG;
  13223. SDLoc DL(LD);
  13224. SDValue BasePtr = LD->getBasePtr();
  13225. SDValue NewLD1 =
  13226. DAG.getLoad(MVT::i32, DL, LD->getChain(), BasePtr, LD->getPointerInfo(),
  13227. LD->getAlign(), LD->getMemOperand()->getFlags());
  13228. SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i32, BasePtr,
  13229. DAG.getConstant(4, DL, MVT::i32));
  13230. SDValue NewLD2 = DAG.getLoad(MVT::i32, DL, LD->getChain(), OffsetPtr,
  13231. LD->getPointerInfo().getWithOffset(4),
  13232. commonAlignment(LD->getAlign(), 4),
  13233. LD->getMemOperand()->getFlags());
  13234. DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), NewLD2.getValue(1));
  13235. if (DCI.DAG.getDataLayout().isBigEndian())
  13236. std::swap (NewLD1, NewLD2);
  13237. SDValue Result = DCI.CombineTo(N, NewLD1, NewLD2);
  13238. return Result;
  13239. }
  13240. // VMOVRRD(extract(..(build_vector(a, b, c, d)))) -> a,b or c,d
  13241. // VMOVRRD(extract(insert_vector(insert_vector(.., a, l1), b, l2))) -> a,b
  13242. if (InDouble.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
  13243. isa<ConstantSDNode>(InDouble.getOperand(1))) {
  13244. SDValue BV = InDouble.getOperand(0);
  13245. // Look up through any nop bitcasts and vector_reg_casts. Bitcasts may
  13246. // change lane order under big endian.
  13247. bool BVSwap = BV.getOpcode() == ISD::BITCAST;
  13248. while (
  13249. (BV.getOpcode() == ISD::BITCAST ||
  13250. BV.getOpcode() == ARMISD::VECTOR_REG_CAST) &&
  13251. (BV.getValueType() == MVT::v2f64 || BV.getValueType() == MVT::v2i64)) {
  13252. BVSwap = BV.getOpcode() == ISD::BITCAST;
  13253. BV = BV.getOperand(0);
  13254. }
  13255. if (BV.getValueType() != MVT::v4i32)
  13256. return SDValue();
  13257. // Handle buildvectors, pulling out the correct lane depending on
  13258. // endianness.
  13259. unsigned Offset = InDouble.getConstantOperandVal(1) == 1 ? 2 : 0;
  13260. if (BV.getOpcode() == ISD::BUILD_VECTOR) {
  13261. SDValue Op0 = BV.getOperand(Offset);
  13262. SDValue Op1 = BV.getOperand(Offset + 1);
  13263. if (!Subtarget->isLittle() && BVSwap)
  13264. std::swap(Op0, Op1);
  13265. return DCI.DAG.getMergeValues({Op0, Op1}, SDLoc(N));
  13266. }
  13267. // A chain of insert_vectors, grabbing the correct value of the chain of
  13268. // inserts.
  13269. SDValue Op0, Op1;
  13270. while (BV.getOpcode() == ISD::INSERT_VECTOR_ELT) {
  13271. if (isa<ConstantSDNode>(BV.getOperand(2))) {
  13272. if (BV.getConstantOperandVal(2) == Offset)
  13273. Op0 = BV.getOperand(1);
  13274. if (BV.getConstantOperandVal(2) == Offset + 1)
  13275. Op1 = BV.getOperand(1);
  13276. }
  13277. BV = BV.getOperand(0);
  13278. }
  13279. if (!Subtarget->isLittle() && BVSwap)
  13280. std::swap(Op0, Op1);
  13281. if (Op0 && Op1)
  13282. return DCI.DAG.getMergeValues({Op0, Op1}, SDLoc(N));
  13283. }
  13284. return SDValue();
  13285. }
  13286. /// PerformVMOVDRRCombine - Target-specific dag combine xforms for
  13287. /// ARMISD::VMOVDRR. This is also used for BUILD_VECTORs with 2 operands.
  13288. static SDValue PerformVMOVDRRCombine(SDNode *N, SelectionDAG &DAG) {
  13289. // N=vmovrrd(X); vmovdrr(N:0, N:1) -> bit_convert(X)
  13290. SDValue Op0 = N->getOperand(0);
  13291. SDValue Op1 = N->getOperand(1);
  13292. if (Op0.getOpcode() == ISD::BITCAST)
  13293. Op0 = Op0.getOperand(0);
  13294. if (Op1.getOpcode() == ISD::BITCAST)
  13295. Op1 = Op1.getOperand(0);
  13296. if (Op0.getOpcode() == ARMISD::VMOVRRD &&
  13297. Op0.getNode() == Op1.getNode() &&
  13298. Op0.getResNo() == 0 && Op1.getResNo() == 1)
  13299. return DAG.getNode(ISD::BITCAST, SDLoc(N),
  13300. N->getValueType(0), Op0.getOperand(0));
  13301. return SDValue();
  13302. }
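/// PerformVMOVhrCombine - Target-specific dag combine xforms for ARMISD::VMOVhr.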
  13303. static SDValue PerformVMOVhrCombine(SDNode *N,
  13304. TargetLowering::DAGCombinerInfo &DCI) {
  13305. SDValue Op0 = N->getOperand(0);
  13306. // VMOVhr (VMOVrh (X)) -> X
  13307. if (Op0->getOpcode() == ARMISD::VMOVrh)
  13308. return Op0->getOperand(0);
  13309. // FullFP16: half values are passed in S-registers, and we don't
  13310. // need any of the bitcast and moves:
  13311. //
  13312. // t2: f32,ch = CopyFromReg t0, Register:f32 %0
  13313. // t5: i32 = bitcast t2
  13314. // t18: f16 = ARMISD::VMOVhr t5
  13315. if (Op0->getOpcode() == ISD::BITCAST) {
  13316. SDValue Copy = Op0->getOperand(0);
  13317. if (Copy.getValueType() == MVT::f32 &&
  13318. Copy->getOpcode() == ISD::CopyFromReg) {
  13319. SDValue Ops[] = {Copy->getOperand(0), Copy->getOperand(1)};
  13320. SDValue NewCopy =
  13321. DCI.DAG.getNode(ISD::CopyFromReg, SDLoc(N), N->getValueType(0), Ops);
  13322. return NewCopy;
  13323. }
  13324. }
  13325. // fold (VMOVhr (load x)) -> (load (f16*)x)
  13326. if (LoadSDNode *LN0 = dyn_cast<LoadSDNode>(Op0)) {
  13327. if (LN0->hasOneUse() && LN0->isUnindexed() &&
  13328. LN0->getMemoryVT() == MVT::i16) {
  13329. SDValue Load =
  13330. DCI.DAG.getLoad(N->getValueType(0), SDLoc(N), LN0->getChain(),
  13331. LN0->getBasePtr(), LN0->getMemOperand());
  13332. DCI.DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Load.getValue(0));
  13333. DCI.DAG.ReplaceAllUsesOfValueWith(Op0.getValue(1), Load.getValue(1));
  13334. return Load;
  13335. }
  13336. }
  13337. // Only the bottom 16 bits of the source register are used.
  13338. APInt DemandedMask = APInt::getLowBitsSet(32, 16);
  13339. const TargetLowering &TLI = DCI.DAG.getTargetLoweringInfo();
  13340. if (TLI.SimplifyDemandedBits(Op0, DemandedMask, DCI))
  13341. return SDValue(N, 0);
  13342. return SDValue();
  13343. }
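/// PerformVMOVrhCombine - Target-specific dag combine xforms for ARMISD::VMOVrh.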
  13344. static SDValue PerformVMOVrhCombine(SDNode *N, SelectionDAG &DAG) {
  13345. SDValue N0 = N->getOperand(0);
  13346. EVT VT = N->getValueType(0);
  13347. // fold (VMOVrh (fpconst x)) -> const x
  13348. if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N0)) {
  13349. APFloat V = C->getValueAPF();
  13350. return DAG.getConstant(V.bitcastToAPInt().getZExtValue(), SDLoc(N), VT);
  13351. }
  13352. // fold (VMOVrh (load x)) -> (zextload (i16*)x)
  13353. if (ISD::isNormalLoad(N0.getNode()) && N0.hasOneUse()) {
  13354. LoadSDNode *LN0 = cast<LoadSDNode>(N0);
  13355. SDValue Load =
  13356. DAG.getExtLoad(ISD::ZEXTLOAD, SDLoc(N), VT, LN0->getChain(),
  13357. LN0->getBasePtr(), MVT::i16, LN0->getMemOperand());
  13358. DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Load.getValue(0));
  13359. DAG.ReplaceAllUsesOfValueWith(N0.getValue(1), Load.getValue(1));
  13360. return Load;
  13361. }
  13362. // Fold VMOVrh(extract(x, n)) -> vgetlaneu(x, n)
  13363. if (N0->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
  13364. isa<ConstantSDNode>(N0->getOperand(1)))
  13365. return DAG.getNode(ARMISD::VGETLANEu, SDLoc(N), VT, N0->getOperand(0),
  13366. N0->getOperand(1));
  13367. return SDValue();
  13368. }
  13369. /// hasNormalLoadOperand - Check if any of the operands of a BUILD_VECTOR node
  13370. /// are normal, non-volatile loads. If so, it is profitable to bitcast an
  13371. /// i64 vector to have f64 elements, since the value can then be loaded
  13372. /// directly into a VFP register.
  13373. static bool hasNormalLoadOperand(SDNode *N) {
  13374. unsigned NumElts = N->getValueType(0).getVectorNumElements();
  13375. for (unsigned i = 0; i < NumElts; ++i) {
  13376. SDNode *Elt = N->getOperand(i).getNode();
  13377. if (ISD::isNormalLoad(Elt) && !cast<LoadSDNode>(Elt)->isVolatile())
  13378. return true;
  13379. }
  13380. return false;
  13381. }
  13382. /// PerformBUILD_VECTORCombine - Target-specific dag combine xforms for
  13383. /// ISD::BUILD_VECTOR.
  13384. static SDValue PerformBUILD_VECTORCombine(SDNode *N,
  13385. TargetLowering::DAGCombinerInfo &DCI,
  13386. const ARMSubtarget *Subtarget) {
  13387. // build_vector(N=ARMISD::VMOVRRD(X), N:1) -> bit_convert(X):
  13388. // VMOVRRD is introduced when legalizing i64 types. It forces the i64 value
  13389. // into a pair of GPRs, which is fine when the value is used as a scalar,
  13390. // but if the i64 value is converted to a vector, we need to undo the VMOVRRD.
  13391. SelectionDAG &DAG = DCI.DAG;
  13392. if (N->getNumOperands() == 2)
  13393. if (SDValue RV = PerformVMOVDRRCombine(N, DAG))
  13394. return RV;
  13395. // Load i64 elements as f64 values so that type legalization does not split
  13396. // them up into i32 values.
  13397. EVT VT = N->getValueType(0);
  13398. if (VT.getVectorElementType() != MVT::i64 || !hasNormalLoadOperand(N))
  13399. return SDValue();
  13400. SDLoc dl(N);
  13401. SmallVector<SDValue, 8> Ops;
  13402. unsigned NumElts = VT.getVectorNumElements();
  13403. for (unsigned i = 0; i < NumElts; ++i) {
  13404. SDValue V = DAG.getNode(ISD::BITCAST, dl, MVT::f64, N->getOperand(i));
  13405. Ops.push_back(V);
  13406. // Make the DAGCombiner fold the bitcast.
  13407. DCI.AddToWorklist(V.getNode());
  13408. }
  13409. EVT FloatVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64, NumElts);
  13410. SDValue BV = DAG.getBuildVector(FloatVT, dl, Ops);
  13411. return DAG.getNode(ISD::BITCAST, dl, VT, BV);
  13412. }
  13413. /// Target-specific dag combine xforms for ARMISD::BUILD_VECTOR.
  13414. static SDValue
  13415. PerformARMBUILD_VECTORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
  13416. // ARMISD::BUILD_VECTOR is introduced when legalizing ISD::BUILD_VECTOR.
  13417. // At that time, we may have inserted bitcasts from integer to float.
  13418. // If these bitcasts have survived DAGCombine, change the lowering of this
  13419. // BUILD_VECTOR into something more vector friendly, i.e., one that does not
  13420. // force the use of floating point types.
  13421. // Make sure we can change the type of the vector.
  13422. // This is possible iff:
  13423. // 1. The vector is only used in a bitcast to an integer type. I.e.,
  13424. // 1.1. Vector is used only once.
  13425. // 1.2. Use is a bit convert to an integer type.
  13426. // 2. The size of its operands is 32 bits (64 bits are not legal).
  13427. EVT VT = N->getValueType(0);
  13428. EVT EltVT = VT.getVectorElementType();
  13429. // Check 1.1. and 2.
  13430. if (EltVT.getSizeInBits() != 32 || !N->hasOneUse())
  13431. return SDValue();
  13432. // By construction, the input type must be float.
  13433. assert(EltVT == MVT::f32 && "Unexpected type!");
  13434. // Check 1.2.
  13435. SDNode *Use = *N->use_begin();
  13436. if (Use->getOpcode() != ISD::BITCAST ||
  13437. Use->getValueType(0).isFloatingPoint())
  13438. return SDValue();
  13439. // Check profitability.
  13440. // The model is: if more than half of the relevant operands are bitcast from
  13441. // i32, turn the build_vector into a sequence of insert_vector_elt.
  13442. // Relevant operands are everything that is not statically
  13443. // (i.e., at compile time) bitcasted.
  13444. unsigned NumOfBitCastedElts = 0;
  13445. unsigned NumElts = VT.getVectorNumElements();
  13446. unsigned NumOfRelevantElts = NumElts;
  13447. for (unsigned Idx = 0; Idx < NumElts; ++Idx) {
  13448. SDValue Elt = N->getOperand(Idx);
  13449. if (Elt->getOpcode() == ISD::BITCAST) {
  13450. // Assume only bit cast to i32 will go away.
  13451. if (Elt->getOperand(0).getValueType() == MVT::i32)
  13452. ++NumOfBitCastedElts;
  13453. } else if (Elt.isUndef() || isa<ConstantSDNode>(Elt))
  13454. // Constants are statically cast, thus do not count them as
  13455. // relevant operands.
  13456. --NumOfRelevantElts;
  13457. }
  13458. // Check if more than half of the elements require a non-free bitcast.
  13459. if (NumOfBitCastedElts <= NumOfRelevantElts / 2)
  13460. return SDValue();
  13461. SelectionDAG &DAG = DCI.DAG;
  13462. // Create the new vector type.
  13463. EVT VecVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElts);
  13464. // Check if the type is legal.
  13465. const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  13466. if (!TLI.isTypeLegal(VecVT))
  13467. return SDValue();
  13468. // Combine:
  13469. // ARMISD::BUILD_VECTOR E1, E2, ..., EN.
  13470. // => BITCAST INSERT_VECTOR_ELT
  13471. // (INSERT_VECTOR_ELT (...), (BITCAST EN-1), N-1),
  13472. // (BITCAST EN), N.
  13473. SDValue Vec = DAG.getUNDEF(VecVT);
  13474. SDLoc dl(N);
  13475. for (unsigned Idx = 0 ; Idx < NumElts; ++Idx) {
  13476. SDValue V = N->getOperand(Idx);
  13477. if (V.isUndef())
  13478. continue;
  13479. if (V.getOpcode() == ISD::BITCAST &&
  13480. V->getOperand(0).getValueType() == MVT::i32)
  13481. // Fold obvious case.
  13482. V = V.getOperand(0);
  13483. else {
  13484. V = DAG.getNode(ISD::BITCAST, SDLoc(V), MVT::i32, V);
  13485. // Make the DAGCombiner fold the bitcasts.
  13486. DCI.AddToWorklist(V.getNode());
  13487. }
  13488. SDValue LaneIdx = DAG.getConstant(Idx, dl, MVT::i32);
  13489. Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VecVT, Vec, V, LaneIdx);
  13490. }
  13491. Vec = DAG.getNode(ISD::BITCAST, dl, VT, Vec);
  13492. // Make the DAGCombiner fold the bitcasts.
  13493. DCI.AddToWorklist(Vec.getNode());
  13494. return Vec;
  13495. }
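/// Target-specific dag combine xforms for ARMISD::PREDICATE_CAST.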
  13496. static SDValue
  13497. PerformPREDICATE_CASTCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
  13498. EVT VT = N->getValueType(0);
  13499. SDValue Op = N->getOperand(0);
  13500. SDLoc dl(N);
  13501. // PREDICATE_CAST(PREDICATE_CAST(x)) == PREDICATE_CAST(x)
  13502. if (Op->getOpcode() == ARMISD::PREDICATE_CAST) {
  13503. // If the valuetypes are the same, we can remove the cast entirely.
  13504. if (Op->getOperand(0).getValueType() == VT)
  13505. return Op->getOperand(0);
  13506. return DCI.DAG.getNode(ARMISD::PREDICATE_CAST, dl, VT, Op->getOperand(0));
  13507. }
  13508. // Turn pred_cast(xor x, -1) into xor(pred_cast x, -1), in order to produce
  13509. // more VPNOT which might get folded as else predicates.
  13510. if (Op.getValueType() == MVT::i32 && isBitwiseNot(Op)) {
  13511. SDValue X =
  13512. DCI.DAG.getNode(ARMISD::PREDICATE_CAST, dl, VT, Op->getOperand(0));
  13513. SDValue C = DCI.DAG.getNode(ARMISD::PREDICATE_CAST, dl, VT,
  13514. DCI.DAG.getConstant(65535, dl, MVT::i32));
  13515. return DCI.DAG.getNode(ISD::XOR, dl, VT, X, C);
  13516. }
  13517. // Only the bottom 16 bits of the source register are used.
  13518. if (Op.getValueType() == MVT::i32) {
  13519. APInt DemandedMask = APInt::getLowBitsSet(32, 16);
  13520. const TargetLowering &TLI = DCI.DAG.getTargetLoweringInfo();
  13521. if (TLI.SimplifyDemandedBits(Op, DemandedMask, DCI))
  13522. return SDValue(N, 0);
  13523. }
  13524. return SDValue();
  13525. }
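/// Target-specific dag combine xforms for ARMISD::VECTOR_REG_CAST.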
  13526. static SDValue PerformVECTOR_REG_CASTCombine(SDNode *N, SelectionDAG &DAG,
  13527. const ARMSubtarget *ST) {
  13528. EVT VT = N->getValueType(0);
  13529. SDValue Op = N->getOperand(0);
  13530. SDLoc dl(N);
  13531. // Under Little endian, a VECTOR_REG_CAST is equivalent to a BITCAST
  13532. if (ST->isLittle())
  13533. return DAG.getNode(ISD::BITCAST, dl, VT, Op);
  13534. // VECTOR_REG_CAST undef -> undef
  13535. if (Op.isUndef())
  13536. return DAG.getUNDEF(VT);
  13537. // VECTOR_REG_CAST(VECTOR_REG_CAST(x)) == VECTOR_REG_CAST(x)
  13538. if (Op->getOpcode() == ARMISD::VECTOR_REG_CAST) {
  13539. // If the valuetypes are the same, we can remove the cast entirely.
  13540. if (Op->getOperand(0).getValueType() == VT)
  13541. return Op->getOperand(0);
  13542. return DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, VT, Op->getOperand(0));
  13543. }
  13544. return SDValue();
  13545. }
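/// Target-specific dag combine xforms for ARMISD::VCMP: fold compares against
/// zero into VCMPZ and swap the operands of a vdup-vs-vector compare when the
/// reversed condition is also valid.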
  13546. static SDValue PerformVCMPCombine(SDNode *N, SelectionDAG &DAG,
  13547. const ARMSubtarget *Subtarget) {
  13548. if (!Subtarget->hasMVEIntegerOps())
  13549. return SDValue();
  13550. EVT VT = N->getValueType(0);
  13551. SDValue Op0 = N->getOperand(0);
  13552. SDValue Op1 = N->getOperand(1);
  13553. ARMCC::CondCodes Cond =
  13554. (ARMCC::CondCodes)cast<ConstantSDNode>(N->getOperand(2))->getZExtValue();
  13555. SDLoc dl(N);
  13556. // vcmp X, 0, cc -> vcmpz X, cc
  13557. if (isZeroVector(Op1))
  13558. return DAG.getNode(ARMISD::VCMPZ, dl, VT, Op0, N->getOperand(2));
  13559. unsigned SwappedCond = getSwappedCondition(Cond);
  13560. if (isValidMVECond(SwappedCond, VT.isFloatingPoint())) {
  13561. // vcmp 0, X, cc -> vcmpz X, reversed(cc)
  13562. if (isZeroVector(Op0))
  13563. return DAG.getNode(ARMISD::VCMPZ, dl, VT, Op1,
  13564. DAG.getConstant(SwappedCond, dl, MVT::i32));
  13565. // vcmp vdup(Y), X, cc -> vcmp X, vdup(Y), reversed(cc)
  13566. if (Op0->getOpcode() == ARMISD::VDUP && Op1->getOpcode() != ARMISD::VDUP)
  13567. return DAG.getNode(ARMISD::VCMP, dl, VT, Op1, Op0,
  13568. DAG.getConstant(SwappedCond, dl, MVT::i32));
  13569. }
  13570. return SDValue();
  13571. }
  13572. /// PerformInsertEltCombine - Target-specific dag combine xforms for
  13573. /// ISD::INSERT_VECTOR_ELT.
  13574. static SDValue PerformInsertEltCombine(SDNode *N,
  13575. TargetLowering::DAGCombinerInfo &DCI) {
  13576. // Bitcast an i64 load inserted into a vector to f64.
  13577. // Otherwise, the i64 value will be legalized to a pair of i32 values.
  13578. EVT VT = N->getValueType(0);
  13579. SDNode *Elt = N->getOperand(1).getNode();
  13580. if (VT.getVectorElementType() != MVT::i64 ||
  13581. !ISD::isNormalLoad(Elt) || cast<LoadSDNode>(Elt)->isVolatile())
  13582. return SDValue();
  13583. SelectionDAG &DAG = DCI.DAG;
  13584. SDLoc dl(N);
  13585. EVT FloatVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64,
  13586. VT.getVectorNumElements());
  13587. SDValue Vec = DAG.getNode(ISD::BITCAST, dl, FloatVT, N->getOperand(0));
  13588. SDValue V = DAG.getNode(ISD::BITCAST, dl, MVT::f64, N->getOperand(1));
  13589. // Make the DAGCombiner fold the bitcasts.
  13590. DCI.AddToWorklist(Vec.getNode());
  13591. DCI.AddToWorklist(V.getNode());
  13592. SDValue InsElt = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, FloatVT,
  13593. Vec, V, N->getOperand(2));
  13594. return DAG.getNode(ISD::BITCAST, dl, VT, InsElt);
  13595. }
  13596. // Convert a pair of extracts from the same base vector to a VMOVRRD. Either
  13597. // directly or bitcast to an integer if the original is a float vector.
  13598. // extract(x, n); extract(x, n+1) -> VMOVRRD(extract v2f64 x, n/2)
  13599. // bitcast(extract(x, n)); bitcast(extract(x, n+1)) -> VMOVRRD(extract x, n/2)
  13600. static SDValue
  13601. PerformExtractEltToVMOVRRD(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
  13602. EVT VT = N->getValueType(0);
  13603. SDLoc dl(N);
  13604. if (!DCI.isAfterLegalizeDAG() || VT != MVT::i32 ||
  13605. !DCI.DAG.getTargetLoweringInfo().isTypeLegal(MVT::f64))
  13606. return SDValue();
  13607. SDValue Ext = SDValue(N, 0);
  13608. if (Ext.getOpcode() == ISD::BITCAST &&
  13609. Ext.getOperand(0).getValueType() == MVT::f32)
  13610. Ext = Ext.getOperand(0);
  13611. if (Ext.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
  13612. !isa<ConstantSDNode>(Ext.getOperand(1)) ||
  13613. Ext.getConstantOperandVal(1) % 2 != 0)
  13614. return SDValue();
  13615. if (Ext->use_size() == 1 &&
  13616. (Ext->use_begin()->getOpcode() == ISD::SINT_TO_FP ||
  13617. Ext->use_begin()->getOpcode() == ISD::UINT_TO_FP))
  13618. return SDValue();
  13619. SDValue Op0 = Ext.getOperand(0);
  13620. EVT VecVT = Op0.getValueType();
  13621. unsigned ResNo = Op0.getResNo();
  13622. unsigned Lane = Ext.getConstantOperandVal(1);
  13623. if (VecVT.getVectorNumElements() != 4)
  13624. return SDValue();
  13625. // Find another extract, of Lane + 1
  13626. auto OtherIt = find_if(Op0->uses(), [&](SDNode *V) {
  13627. return V->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
  13628. isa<ConstantSDNode>(V->getOperand(1)) &&
  13629. V->getConstantOperandVal(1) == Lane + 1 &&
  13630. V->getOperand(0).getResNo() == ResNo;
  13631. });
  13632. if (OtherIt == Op0->uses().end())
  13633. return SDValue();
  13634. // For float extracts, we need to be converting to a i32 for both vector
  13635. // lanes.
  13636. SDValue OtherExt(*OtherIt, 0);
  13637. if (OtherExt.getValueType() != MVT::i32) {
  13638. if (OtherExt->use_size() != 1 ||
  13639. OtherExt->use_begin()->getOpcode() != ISD::BITCAST ||
  13640. OtherExt->use_begin()->getValueType(0) != MVT::i32)
  13641. return SDValue();
  13642. OtherExt = SDValue(*OtherExt->use_begin(), 0);
  13643. }
  13644. // Convert the type to a f64 and extract with a VMOVRRD.
  13645. SDValue F64 = DCI.DAG.getNode(
  13646. ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
  13647. DCI.DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, MVT::v2f64, Op0),
  13648. DCI.DAG.getConstant(Ext.getConstantOperandVal(1) / 2, dl, MVT::i32));
  13649. SDValue VMOVRRD =
  13650. DCI.DAG.getNode(ARMISD::VMOVRRD, dl, {MVT::i32, MVT::i32}, F64);
  13651. DCI.CombineTo(OtherExt.getNode(), SDValue(VMOVRRD.getNode(), 1));
  13652. return VMOVRRD;
  13653. }
  13654. static SDValue PerformExtractEltCombine(SDNode *N,
  13655. TargetLowering::DAGCombinerInfo &DCI,
  13656. const ARMSubtarget *ST) {
  13657. SDValue Op0 = N->getOperand(0);
  13658. EVT VT = N->getValueType(0);
  13659. SDLoc dl(N);
  13660. // extract (vdup x) -> x
  13661. if (Op0->getOpcode() == ARMISD::VDUP) {
  13662. SDValue X = Op0->getOperand(0);
  13663. if (VT == MVT::f16 && X.getValueType() == MVT::i32)
  13664. return DCI.DAG.getNode(ARMISD::VMOVhr, dl, VT, X);
  13665. if (VT == MVT::i32 && X.getValueType() == MVT::f16)
  13666. return DCI.DAG.getNode(ARMISD::VMOVrh, dl, VT, X);
  13667. if (VT == MVT::f32 && X.getValueType() == MVT::i32)
  13668. return DCI.DAG.getNode(ISD::BITCAST, dl, VT, X);
  13669. while (X.getValueType() != VT && X->getOpcode() == ISD::BITCAST)
  13670. X = X->getOperand(0);
  13671. if (X.getValueType() == VT)
  13672. return X;
  13673. }
  13674. // extract ARM_BUILD_VECTOR -> x
  13675. if (Op0->getOpcode() == ARMISD::BUILD_VECTOR &&
  13676. isa<ConstantSDNode>(N->getOperand(1)) &&
  13677. N->getConstantOperandVal(1) < Op0.getNumOperands()) {
  13678. return Op0.getOperand(N->getConstantOperandVal(1));
  13679. }
  13680. // extract(bitcast(BUILD_VECTOR(VMOVDRR(a, b), ..))) -> a or b
  13681. if (Op0.getValueType() == MVT::v4i32 &&
  13682. isa<ConstantSDNode>(N->getOperand(1)) &&
  13683. Op0.getOpcode() == ISD::BITCAST &&
  13684. Op0.getOperand(0).getOpcode() == ISD::BUILD_VECTOR &&
  13685. Op0.getOperand(0).getValueType() == MVT::v2f64) {
  13686. SDValue BV = Op0.getOperand(0);
  13687. unsigned Offset = N->getConstantOperandVal(1);
  13688. SDValue MOV = BV.getOperand(Offset < 2 ? 0 : 1);
  13689. if (MOV.getOpcode() == ARMISD::VMOVDRR)
  13690. return MOV.getOperand(ST->isLittle() ? Offset % 2 : 1 - Offset % 2);
  13691. }
  13692. // extract x, n; extract x, n+1 -> VMOVRRD x
  13693. if (SDValue R = PerformExtractEltToVMOVRRD(N, DCI))
  13694. return R;
  13695. // extract (MVETrunc(x)) -> extract x
  13696. if (Op0->getOpcode() == ARMISD::MVETRUNC) {
  13697. unsigned Idx = N->getConstantOperandVal(1);
  13698. unsigned Vec =
  13699. Idx / Op0->getOperand(0).getValueType().getVectorNumElements();
  13700. unsigned SubIdx =
  13701. Idx % Op0->getOperand(0).getValueType().getVectorNumElements();
  13702. return DCI.DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Op0.getOperand(Vec),
  13703. DCI.DAG.getConstant(SubIdx, dl, MVT::i32));
  13704. }
  13705. return SDValue();
  13706. }
  13707. static SDValue PerformSignExtendInregCombine(SDNode *N, SelectionDAG &DAG) {
  13708. SDValue Op = N->getOperand(0);
  13709. EVT VT = N->getValueType(0);
  13710. // sext_inreg(VGETLANEu) -> VGETLANEs
  13711. if (Op.getOpcode() == ARMISD::VGETLANEu &&
  13712. cast<VTSDNode>(N->getOperand(1))->getVT() ==
  13713. Op.getOperand(0).getValueType().getScalarType())
  13714. return DAG.getNode(ARMISD::VGETLANEs, SDLoc(N), VT, Op.getOperand(0),
  13715. Op.getOperand(1));
  13716. return SDValue();
  13717. }
  13718. // When lowering complex nodes that we recognize, like VQDMULH and MULH, we
  13719. // can end up with shuffle(binop(shuffle, shuffle)), that can be simplified to
  13720. // binop as the shuffles cancel out.
  13721. static SDValue FlattenVectorShuffle(ShuffleVectorSDNode *N, SelectionDAG &DAG) {
  13722. EVT VT = N->getValueType(0);
  13723. if (!N->getOperand(1).isUndef() || N->getOperand(0).getValueType() != VT)
  13724. return SDValue();
  13725. SDValue Op = N->getOperand(0);
  13726. // Looking for binary operators that will have been folded from
  13727. // truncates/extends.
  13728. switch (Op.getOpcode()) {
  13729. case ARMISD::VQDMULH:
  13730. case ISD::MULHS:
  13731. case ISD::MULHU:
  13732. case ISD::ABDS:
  13733. case ISD::ABDU:
  13734. case ISD::AVGFLOORS:
  13735. case ISD::AVGFLOORU:
  13736. case ISD::AVGCEILS:
  13737. case ISD::AVGCEILU:
  13738. break;
  13739. default:
  13740. return SDValue();
  13741. }
  13742. ShuffleVectorSDNode *Op0 = dyn_cast<ShuffleVectorSDNode>(Op.getOperand(0));
  13743. ShuffleVectorSDNode *Op1 = dyn_cast<ShuffleVectorSDNode>(Op.getOperand(1));
  13744. if (!Op0 || !Op1 || !Op0->getOperand(1).isUndef() ||
  13745. !Op1->getOperand(1).isUndef() || Op0->getMask() != Op1->getMask() ||
  13746. Op0->getOperand(0).getValueType() != VT)
  13747. return SDValue();
  13748. // Check the mask turns into an identity shuffle.
  13749. ArrayRef<int> NMask = N->getMask();
  13750. ArrayRef<int> OpMask = Op0->getMask();
  13751. for (int i = 0, e = NMask.size(); i != e; i++) {
  13752. if (NMask[i] > 0 && OpMask[NMask[i]] > 0 && OpMask[NMask[i]] != i)
  13753. return SDValue();
  13754. }
  13755. return DAG.getNode(Op.getOpcode(), SDLoc(Op), Op.getValueType(),
  13756. Op0->getOperand(0), Op1->getOperand(0));
  13757. }
  13758. static SDValue
  13759. PerformInsertSubvectorCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
  13760. SDValue Vec = N->getOperand(0);
  13761. SDValue SubVec = N->getOperand(1);
  13762. uint64_t IdxVal = N->getConstantOperandVal(2);
  13763. EVT VecVT = Vec.getValueType();
  13764. EVT SubVT = SubVec.getValueType();
  13765. // Only do this for legal fixed vector types.
  13766. if (!VecVT.isFixedLengthVector() ||
  13767. !DCI.DAG.getTargetLoweringInfo().isTypeLegal(VecVT) ||
  13768. !DCI.DAG.getTargetLoweringInfo().isTypeLegal(SubVT))
  13769. return SDValue();
  13770. // Ignore widening patterns.
  13771. if (IdxVal == 0 && Vec.isUndef())
  13772. return SDValue();
  13773. // Subvector must be half the width and an "aligned" insertion.
  13774. unsigned NumSubElts = SubVT.getVectorNumElements();
  13775. if ((SubVT.getSizeInBits() * 2) != VecVT.getSizeInBits() ||
  13776. (IdxVal != 0 && IdxVal != NumSubElts))
  13777. return SDValue();
  13778. // Fold insert_subvector -> concat_vectors
  13779. // insert_subvector(Vec,Sub,lo) -> concat_vectors(Sub,extract(Vec,hi))
  13780. // insert_subvector(Vec,Sub,hi) -> concat_vectors(extract(Vec,lo),Sub)
  13781. SDLoc DL(N);
  13782. SDValue Lo, Hi;
  13783. if (IdxVal == 0) {
  13784. Lo = SubVec;
  13785. Hi = DCI.DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, Vec,
  13786. DCI.DAG.getVectorIdxConstant(NumSubElts, DL));
  13787. } else {
  13788. Lo = DCI.DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, Vec,
  13789. DCI.DAG.getVectorIdxConstant(0, DL));
  13790. Hi = SubVec;
  13791. }
  13792. return DCI.DAG.getNode(ISD::CONCAT_VECTORS, DL, VecVT, Lo, Hi);
  13793. }
  13794. // shuffle(MVETrunc(x, y)) -> VMOVN(x, y)
  13795. static SDValue PerformShuffleVMOVNCombine(ShuffleVectorSDNode *N,
  13796. SelectionDAG &DAG) {
  13797. SDValue Trunc = N->getOperand(0);
  13798. EVT VT = Trunc.getValueType();
  13799. if (Trunc.getOpcode() != ARMISD::MVETRUNC || !N->getOperand(1).isUndef())
  13800. return SDValue();
  13801. SDLoc DL(Trunc);
  13802. if (isVMOVNTruncMask(N->getMask(), VT, false))
  13803. return DAG.getNode(
  13804. ARMISD::VMOVN, DL, VT,
  13805. DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, Trunc.getOperand(0)),
  13806. DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, Trunc.getOperand(1)),
  13807. DAG.getConstant(1, DL, MVT::i32));
  13808. else if (isVMOVNTruncMask(N->getMask(), VT, true))
  13809. return DAG.getNode(
  13810. ARMISD::VMOVN, DL, VT,
  13811. DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, Trunc.getOperand(1)),
  13812. DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, Trunc.getOperand(0)),
  13813. DAG.getConstant(1, DL, MVT::i32));
  13814. return SDValue();
  13815. }
  13816. /// PerformVECTOR_SHUFFLECombine - Target-specific dag combine xforms for
  13817. /// ISD::VECTOR_SHUFFLE.
  13818. static SDValue PerformVECTOR_SHUFFLECombine(SDNode *N, SelectionDAG &DAG) {
  13819. if (SDValue R = FlattenVectorShuffle(cast<ShuffleVectorSDNode>(N), DAG))
  13820. return R;
  13821. if (SDValue R = PerformShuffleVMOVNCombine(cast<ShuffleVectorSDNode>(N), DAG))
  13822. return R;
  13823. // The LLVM shufflevector instruction does not require the shuffle mask
  13824. // length to match the operand vector length, but ISD::VECTOR_SHUFFLE does
  13825. // have that requirement. When translating to ISD::VECTOR_SHUFFLE, if the
  13826. // operands do not match the mask length, they are extended by concatenating
  13827. // them with undef vectors. That is probably the right thing for other
  13828. // targets, but for NEON it is better to concatenate two double-register
  13829. // size vector operands into a single quad-register size vector. Do that
  13830. // transformation here:
  13831. // shuffle(concat(v1, undef), concat(v2, undef)) ->
  13832. // shuffle(concat(v1, v2), undef)
  13833. SDValue Op0 = N->getOperand(0);
  13834. SDValue Op1 = N->getOperand(1);
  13835. if (Op0.getOpcode() != ISD::CONCAT_VECTORS ||
  13836. Op1.getOpcode() != ISD::CONCAT_VECTORS ||
  13837. Op0.getNumOperands() != 2 ||
  13838. Op1.getNumOperands() != 2)
  13839. return SDValue();
  13840. SDValue Concat0Op1 = Op0.getOperand(1);
  13841. SDValue Concat1Op1 = Op1.getOperand(1);
  13842. if (!Concat0Op1.isUndef() || !Concat1Op1.isUndef())
  13843. return SDValue();
  13844. // Skip the transformation if any of the types are illegal.
  13845. const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  13846. EVT VT = N->getValueType(0);
  13847. if (!TLI.isTypeLegal(VT) ||
  13848. !TLI.isTypeLegal(Concat0Op1.getValueType()) ||
  13849. !TLI.isTypeLegal(Concat1Op1.getValueType()))
  13850. return SDValue();
  13851. SDValue NewConcat = DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), VT,
  13852. Op0.getOperand(0), Op1.getOperand(0));
  13853. // Translate the shuffle mask.
  13854. SmallVector<int, 16> NewMask;
  13855. unsigned NumElts = VT.getVectorNumElements();
  13856. unsigned HalfElts = NumElts/2;
  13857. ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(N);
  13858. for (unsigned n = 0; n < NumElts; ++n) {
  13859. int MaskElt = SVN->getMaskElt(n);
  13860. int NewElt = -1;
  13861. if (MaskElt < (int)HalfElts)
  13862. NewElt = MaskElt;
  13863. else if (MaskElt >= (int)NumElts && MaskElt < (int)(NumElts + HalfElts))
  13864. NewElt = HalfElts + MaskElt - NumElts;
  13865. NewMask.push_back(NewElt);
  13866. }
  13867. return DAG.getVectorShuffle(VT, SDLoc(N), NewConcat,
  13868. DAG.getUNDEF(VT), NewMask);
  13869. }
  13870. /// Load/store instruction that can be merged with a base address
  13871. /// update
  13872. struct BaseUpdateTarget {
  13873. SDNode *N;
  13874. bool isIntrinsic;
  13875. bool isStore;
  13876. unsigned AddrOpIdx;
  13877. };
  13878. struct BaseUpdateUser {
  13879. /// Instruction that updates a pointer
  13880. SDNode *N;
  13881. /// Pointer increment operand
  13882. SDValue Inc;
  13883. /// Pointer increment value if it is a constant, or 0 otherwise
  13884. unsigned ConstInc;
  13885. };
  13886. static bool TryCombineBaseUpdate(struct BaseUpdateTarget &Target,
  13887. struct BaseUpdateUser &User,
  13888. bool SimpleConstIncOnly,
  13889. TargetLowering::DAGCombinerInfo &DCI) {
  13890. SelectionDAG &DAG = DCI.DAG;
  13891. SDNode *N = Target.N;
  13892. MemSDNode *MemN = cast<MemSDNode>(N);
  13893. SDLoc dl(N);
  13894. // Find the new opcode for the updating load/store.
  13895. bool isLoadOp = true;
  13896. bool isLaneOp = false;
  13897. // Workaround for vst1x and vld1x intrinsics which do not have alignment
  13898. // as an operand.
  13899. bool hasAlignment = true;
  13900. unsigned NewOpc = 0;
  13901. unsigned NumVecs = 0;
  13902. if (Target.isIntrinsic) {
  13903. unsigned IntNo = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
  13904. switch (IntNo) {
  13905. default:
  13906. llvm_unreachable("unexpected intrinsic for Neon base update");
  13907. case Intrinsic::arm_neon_vld1:
  13908. NewOpc = ARMISD::VLD1_UPD;
  13909. NumVecs = 1;
  13910. break;
  13911. case Intrinsic::arm_neon_vld2:
  13912. NewOpc = ARMISD::VLD2_UPD;
  13913. NumVecs = 2;
  13914. break;
  13915. case Intrinsic::arm_neon_vld3:
  13916. NewOpc = ARMISD::VLD3_UPD;
  13917. NumVecs = 3;
  13918. break;
  13919. case Intrinsic::arm_neon_vld4:
  13920. NewOpc = ARMISD::VLD4_UPD;
  13921. NumVecs = 4;
  13922. break;
  13923. case Intrinsic::arm_neon_vld1x2:
  13924. NewOpc = ARMISD::VLD1x2_UPD;
  13925. NumVecs = 2;
  13926. hasAlignment = false;
  13927. break;
  13928. case Intrinsic::arm_neon_vld1x3:
  13929. NewOpc = ARMISD::VLD1x3_UPD;
  13930. NumVecs = 3;
  13931. hasAlignment = false;
  13932. break;
  13933. case Intrinsic::arm_neon_vld1x4:
  13934. NewOpc = ARMISD::VLD1x4_UPD;
  13935. NumVecs = 4;
  13936. hasAlignment = false;
  13937. break;
  13938. case Intrinsic::arm_neon_vld2dup:
  13939. NewOpc = ARMISD::VLD2DUP_UPD;
  13940. NumVecs = 2;
  13941. break;
  13942. case Intrinsic::arm_neon_vld3dup:
  13943. NewOpc = ARMISD::VLD3DUP_UPD;
  13944. NumVecs = 3;
  13945. break;
  13946. case Intrinsic::arm_neon_vld4dup:
  13947. NewOpc = ARMISD::VLD4DUP_UPD;
  13948. NumVecs = 4;
  13949. break;
  13950. case Intrinsic::arm_neon_vld2lane:
  13951. NewOpc = ARMISD::VLD2LN_UPD;
  13952. NumVecs = 2;
  13953. isLaneOp = true;
  13954. break;
  13955. case Intrinsic::arm_neon_vld3lane:
  13956. NewOpc = ARMISD::VLD3LN_UPD;
  13957. NumVecs = 3;
  13958. isLaneOp = true;
  13959. break;
  13960. case Intrinsic::arm_neon_vld4lane:
  13961. NewOpc = ARMISD::VLD4LN_UPD;
  13962. NumVecs = 4;
  13963. isLaneOp = true;
  13964. break;
  13965. case Intrinsic::arm_neon_vst1:
  13966. NewOpc = ARMISD::VST1_UPD;
  13967. NumVecs = 1;
  13968. isLoadOp = false;
  13969. break;
  13970. case Intrinsic::arm_neon_vst2:
  13971. NewOpc = ARMISD::VST2_UPD;
  13972. NumVecs = 2;
  13973. isLoadOp = false;
  13974. break;
  13975. case Intrinsic::arm_neon_vst3:
  13976. NewOpc = ARMISD::VST3_UPD;
  13977. NumVecs = 3;
  13978. isLoadOp = false;
  13979. break;
  13980. case Intrinsic::arm_neon_vst4:
  13981. NewOpc = ARMISD::VST4_UPD;
  13982. NumVecs = 4;
  13983. isLoadOp = false;
  13984. break;
  13985. case Intrinsic::arm_neon_vst2lane:
  13986. NewOpc = ARMISD::VST2LN_UPD;
  13987. NumVecs = 2;
  13988. isLoadOp = false;
  13989. isLaneOp = true;
  13990. break;
  13991. case Intrinsic::arm_neon_vst3lane:
  13992. NewOpc = ARMISD::VST3LN_UPD;
  13993. NumVecs = 3;
  13994. isLoadOp = false;
  13995. isLaneOp = true;
  13996. break;
  13997. case Intrinsic::arm_neon_vst4lane:
  13998. NewOpc = ARMISD::VST4LN_UPD;
  13999. NumVecs = 4;
  14000. isLoadOp = false;
  14001. isLaneOp = true;
  14002. break;
  14003. case Intrinsic::arm_neon_vst1x2:
  14004. NewOpc = ARMISD::VST1x2_UPD;
  14005. NumVecs = 2;
  14006. isLoadOp = false;
  14007. hasAlignment = false;
  14008. break;
  14009. case Intrinsic::arm_neon_vst1x3:
  14010. NewOpc = ARMISD::VST1x3_UPD;
  14011. NumVecs = 3;
  14012. isLoadOp = false;
  14013. hasAlignment = false;
  14014. break;
  14015. case Intrinsic::arm_neon_vst1x4:
  14016. NewOpc = ARMISD::VST1x4_UPD;
  14017. NumVecs = 4;
  14018. isLoadOp = false;
  14019. hasAlignment = false;
  14020. break;
  14021. }
  14022. } else {
  14023. isLaneOp = true;
  14024. switch (N->getOpcode()) {
  14025. default:
  14026. llvm_unreachable("unexpected opcode for Neon base update");
  14027. case ARMISD::VLD1DUP:
  14028. NewOpc = ARMISD::VLD1DUP_UPD;
  14029. NumVecs = 1;
  14030. break;
  14031. case ARMISD::VLD2DUP:
  14032. NewOpc = ARMISD::VLD2DUP_UPD;
  14033. NumVecs = 2;
  14034. break;
  14035. case ARMISD::VLD3DUP:
  14036. NewOpc = ARMISD::VLD3DUP_UPD;
  14037. NumVecs = 3;
  14038. break;
  14039. case ARMISD::VLD4DUP:
  14040. NewOpc = ARMISD::VLD4DUP_UPD;
  14041. NumVecs = 4;
  14042. break;
  14043. case ISD::LOAD:
  14044. NewOpc = ARMISD::VLD1_UPD;
  14045. NumVecs = 1;
  14046. isLaneOp = false;
  14047. break;
  14048. case ISD::STORE:
  14049. NewOpc = ARMISD::VST1_UPD;
  14050. NumVecs = 1;
  14051. isLaneOp = false;
  14052. isLoadOp = false;
  14053. break;
  14054. }
  14055. }
  14056. // Find the size of memory referenced by the load/store.
  14057. EVT VecTy;
  14058. if (isLoadOp) {
  14059. VecTy = N->getValueType(0);
  14060. } else if (Target.isIntrinsic) {
  14061. VecTy = N->getOperand(Target.AddrOpIdx + 1).getValueType();
  14062. } else {
  14063. assert(Target.isStore &&
  14064. "Node has to be a load, a store, or an intrinsic!");
  14065. VecTy = N->getOperand(1).getValueType();
  14066. }
  14067. bool isVLDDUPOp =
  14068. NewOpc == ARMISD::VLD1DUP_UPD || NewOpc == ARMISD::VLD2DUP_UPD ||
  14069. NewOpc == ARMISD::VLD3DUP_UPD || NewOpc == ARMISD::VLD4DUP_UPD;
  14070. unsigned NumBytes = NumVecs * VecTy.getSizeInBits() / 8;
  14071. if (isLaneOp || isVLDDUPOp)
  14072. NumBytes /= VecTy.getVectorNumElements();
  14073. if (NumBytes >= 3 * 16 && User.ConstInc != NumBytes) {
  14074. // VLD3/4 and VST3/4 for 128-bit vectors are implemented with two
  14075. // separate instructions that make it harder to use a non-constant update.
  14076. return false;
  14077. }
  14078. if (SimpleConstIncOnly && User.ConstInc != NumBytes)
  14079. return false;
  14080. // OK, we found an ADD we can fold into the base update.
  14081. // Now, create a _UPD node, taking care of not breaking alignment.
  14082. EVT AlignedVecTy = VecTy;
  14083. Align Alignment = MemN->getAlign();
  14084. // If this is a less-than-standard-aligned load/store, change the type to
  14085. // match the standard alignment.
  14086. // The alignment is overlooked when selecting _UPD variants; and it's
  14087. // easier to introduce bitcasts here than fix that.
  14088. // There are 3 ways to get to this base-update combine:
  14089. // - intrinsics: they are assumed to be properly aligned (to the standard
  14090. // alignment of the memory type), so we don't need to do anything.
  14091. // - ARMISD::VLDx nodes: they are only generated from the aforementioned
  14092. // intrinsics, so, likewise, there's nothing to do.
  14093. // - generic load/store instructions: the alignment is specified as an
  14094. // explicit operand, rather than implicitly as the standard alignment
  14095. // of the memory type (like the intrisics). We need to change the
  14096. // memory type to match the explicit alignment. That way, we don't
  14097. // generate non-standard-aligned ARMISD::VLDx nodes.
  14098. if (isa<LSBaseSDNode>(N)) {
  14099. if (Alignment.value() < VecTy.getScalarSizeInBits() / 8) {
  14100. MVT EltTy = MVT::getIntegerVT(Alignment.value() * 8);
  14101. assert(NumVecs == 1 && "Unexpected multi-element generic load/store.");
  14102. assert(!isLaneOp && "Unexpected generic load/store lane.");
  14103. unsigned NumElts = NumBytes / (EltTy.getSizeInBits() / 8);
  14104. AlignedVecTy = MVT::getVectorVT(EltTy, NumElts);
  14105. }
  14106. // Don't set an explicit alignment on regular load/stores that we want
  14107. // to transform to VLD/VST 1_UPD nodes.
  14108. // This matches the behavior of regular load/stores, which only get an
  14109. // explicit alignment if the MMO alignment is larger than the standard
  14110. // alignment of the memory type.
  14111. // Intrinsics, however, always get an explicit alignment, set to the
  14112. // alignment of the MMO.
  14113. Alignment = Align(1);
  14114. }
  14115. // Create the new updating load/store node.
  14116. // First, create an SDVTList for the new updating node's results.
  14117. EVT Tys[6];
  14118. unsigned NumResultVecs = (isLoadOp ? NumVecs : 0);
  14119. unsigned n;
  14120. for (n = 0; n < NumResultVecs; ++n)
  14121. Tys[n] = AlignedVecTy;
  14122. Tys[n++] = MVT::i32;
  14123. Tys[n] = MVT::Other;
  14124. SDVTList SDTys = DAG.getVTList(ArrayRef(Tys, NumResultVecs + 2));
  14125. // Then, gather the new node's operands.
  14126. SmallVector<SDValue, 8> Ops;
  14127. Ops.push_back(N->getOperand(0)); // incoming chain
  14128. Ops.push_back(N->getOperand(Target.AddrOpIdx));
  14129. Ops.push_back(User.Inc);
  14130. if (StoreSDNode *StN = dyn_cast<StoreSDNode>(N)) {
  14131. // Try to match the intrinsic's signature
  14132. Ops.push_back(StN->getValue());
  14133. } else {
  14134. // Loads (and of course intrinsics) match the intrinsics' signature,
  14135. // so just add all but the alignment operand.
  14136. unsigned LastOperand =
  14137. hasAlignment ? N->getNumOperands() - 1 : N->getNumOperands();
  14138. for (unsigned i = Target.AddrOpIdx + 1; i < LastOperand; ++i)
  14139. Ops.push_back(N->getOperand(i));
  14140. }
  14141. // For all node types, the alignment operand is always the last one.
  14142. Ops.push_back(DAG.getConstant(Alignment.value(), dl, MVT::i32));
  14143. // If this is a non-standard-aligned STORE, the penultimate operand is the
  14144. // stored value. Bitcast it to the aligned type.
  14145. if (AlignedVecTy != VecTy && N->getOpcode() == ISD::STORE) {
  14146. SDValue &StVal = Ops[Ops.size() - 2];
  14147. StVal = DAG.getNode(ISD::BITCAST, dl, AlignedVecTy, StVal);
  14148. }
  14149. EVT LoadVT = isLaneOp ? VecTy.getVectorElementType() : AlignedVecTy;
  14150. SDValue UpdN = DAG.getMemIntrinsicNode(NewOpc, dl, SDTys, Ops, LoadVT,
  14151. MemN->getMemOperand());
  14152. // Update the uses.
  14153. SmallVector<SDValue, 5> NewResults;
  14154. for (unsigned i = 0; i < NumResultVecs; ++i)
  14155. NewResults.push_back(SDValue(UpdN.getNode(), i));
  14156. // If this is an non-standard-aligned LOAD, the first result is the loaded
  14157. // value. Bitcast it to the expected result type.
  14158. if (AlignedVecTy != VecTy && N->getOpcode() == ISD::LOAD) {
  14159. SDValue &LdVal = NewResults[0];
  14160. LdVal = DAG.getNode(ISD::BITCAST, dl, VecTy, LdVal);
  14161. }
  14162. NewResults.push_back(SDValue(UpdN.getNode(), NumResultVecs + 1)); // chain
  14163. DCI.CombineTo(N, NewResults);
  14164. DCI.CombineTo(User.N, SDValue(UpdN.getNode(), NumResultVecs));
  14165. return true;
  14166. }
  14167. // If (opcode ptr inc) is and ADD-like instruction, return the
  14168. // increment value. Otherwise return 0.
  14169. static unsigned getPointerConstIncrement(unsigned Opcode, SDValue Ptr,
  14170. SDValue Inc, const SelectionDAG &DAG) {
  14171. ConstantSDNode *CInc = dyn_cast<ConstantSDNode>(Inc.getNode());
  14172. if (!CInc)
  14173. return 0;
  14174. switch (Opcode) {
  14175. case ARMISD::VLD1_UPD:
  14176. case ISD::ADD:
  14177. return CInc->getZExtValue();
  14178. case ISD::OR: {
  14179. if (DAG.haveNoCommonBitsSet(Ptr, Inc)) {
  14180. // (OR ptr inc) is the same as (ADD ptr inc)
  14181. return CInc->getZExtValue();
  14182. }
  14183. return 0;
  14184. }
  14185. default:
  14186. return 0;
  14187. }
  14188. }
  14189. static bool findPointerConstIncrement(SDNode *N, SDValue *Ptr, SDValue *CInc) {
  14190. switch (N->getOpcode()) {
  14191. case ISD::ADD:
  14192. case ISD::OR: {
  14193. if (isa<ConstantSDNode>(N->getOperand(1))) {
  14194. *Ptr = N->getOperand(0);
  14195. *CInc = N->getOperand(1);
  14196. return true;
  14197. }
  14198. return false;
  14199. }
  14200. case ARMISD::VLD1_UPD: {
  14201. if (isa<ConstantSDNode>(N->getOperand(2))) {
  14202. *Ptr = N->getOperand(1);
  14203. *CInc = N->getOperand(2);
  14204. return true;
  14205. }
  14206. return false;
  14207. }
  14208. default:
  14209. return false;
  14210. }
  14211. }
  14212. static bool isValidBaseUpdate(SDNode *N, SDNode *User) {
  14213. // Check that the add is independent of the load/store.
  14214. // Otherwise, folding it would create a cycle. Search through Addr
  14215. // as well, since the User may not be a direct user of Addr and
  14216. // only share a base pointer.
  14217. SmallPtrSet<const SDNode *, 32> Visited;
  14218. SmallVector<const SDNode *, 16> Worklist;
  14219. Worklist.push_back(N);
  14220. Worklist.push_back(User);
  14221. if (SDNode::hasPredecessorHelper(N, Visited, Worklist) ||
  14222. SDNode::hasPredecessorHelper(User, Visited, Worklist))
  14223. return false;
  14224. return true;
  14225. }
  14226. /// CombineBaseUpdate - Target-specific DAG combine function for VLDDUP,
  14227. /// NEON load/store intrinsics, and generic vector load/stores, to merge
  14228. /// base address updates.
  14229. /// For generic load/stores, the memory type is assumed to be a vector.
  14230. /// The caller is assumed to have checked legality.
  14231. static SDValue CombineBaseUpdate(SDNode *N,
  14232. TargetLowering::DAGCombinerInfo &DCI) {
  14233. const bool isIntrinsic = (N->getOpcode() == ISD::INTRINSIC_VOID ||
  14234. N->getOpcode() == ISD::INTRINSIC_W_CHAIN);
  14235. const bool isStore = N->getOpcode() == ISD::STORE;
  14236. const unsigned AddrOpIdx = ((isIntrinsic || isStore) ? 2 : 1);
  14237. BaseUpdateTarget Target = {N, isIntrinsic, isStore, AddrOpIdx};
  14238. SDValue Addr = N->getOperand(AddrOpIdx);
  14239. SmallVector<BaseUpdateUser, 8> BaseUpdates;
  14240. // Search for a use of the address operand that is an increment.
  14241. for (SDNode::use_iterator UI = Addr.getNode()->use_begin(),
  14242. UE = Addr.getNode()->use_end(); UI != UE; ++UI) {
  14243. SDNode *User = *UI;
  14244. if (UI.getUse().getResNo() != Addr.getResNo() ||
  14245. User->getNumOperands() != 2)
  14246. continue;
  14247. SDValue Inc = User->getOperand(UI.getOperandNo() == 1 ? 0 : 1);
  14248. unsigned ConstInc =
  14249. getPointerConstIncrement(User->getOpcode(), Addr, Inc, DCI.DAG);
  14250. if (ConstInc || User->getOpcode() == ISD::ADD)
  14251. BaseUpdates.push_back({User, Inc, ConstInc});
  14252. }
  14253. // If the address is a constant pointer increment itself, find
  14254. // another constant increment that has the same base operand
  14255. SDValue Base;
  14256. SDValue CInc;
  14257. if (findPointerConstIncrement(Addr.getNode(), &Base, &CInc)) {
  14258. unsigned Offset =
  14259. getPointerConstIncrement(Addr->getOpcode(), Base, CInc, DCI.DAG);
  14260. for (SDNode::use_iterator UI = Base->use_begin(), UE = Base->use_end();
  14261. UI != UE; ++UI) {
  14262. SDNode *User = *UI;
  14263. if (UI.getUse().getResNo() != Base.getResNo() || User == Addr.getNode() ||
  14264. User->getNumOperands() != 2)
  14265. continue;
  14266. SDValue UserInc = User->getOperand(UI.getOperandNo() == 0 ? 1 : 0);
  14267. unsigned UserOffset =
  14268. getPointerConstIncrement(User->getOpcode(), Base, UserInc, DCI.DAG);
  14269. if (!UserOffset || UserOffset <= Offset)
  14270. continue;
  14271. unsigned NewConstInc = UserOffset - Offset;
  14272. SDValue NewInc = DCI.DAG.getConstant(NewConstInc, SDLoc(N), MVT::i32);
  14273. BaseUpdates.push_back({User, NewInc, NewConstInc});
  14274. }
  14275. }
  14276. // Try to fold the load/store with an update that matches memory
  14277. // access size. This should work well for sequential loads.
  14278. //
  14279. // Filter out invalid updates as well.
  14280. unsigned NumValidUpd = BaseUpdates.size();
  14281. for (unsigned I = 0; I < NumValidUpd;) {
  14282. BaseUpdateUser &User = BaseUpdates[I];
  14283. if (!isValidBaseUpdate(N, User.N)) {
  14284. --NumValidUpd;
  14285. std::swap(BaseUpdates[I], BaseUpdates[NumValidUpd]);
  14286. continue;
  14287. }
  14288. if (TryCombineBaseUpdate(Target, User, /*SimpleConstIncOnly=*/true, DCI))
  14289. return SDValue();
  14290. ++I;
  14291. }
  14292. BaseUpdates.resize(NumValidUpd);
  14293. // Try to fold with other users. Non-constant updates are considered
  14294. // first, and constant updates are sorted to not break a sequence of
  14295. // strided accesses (if there is any).
  14296. std::stable_sort(BaseUpdates.begin(), BaseUpdates.end(),
  14297. [](const BaseUpdateUser &LHS, const BaseUpdateUser &RHS) {
  14298. return LHS.ConstInc < RHS.ConstInc;
  14299. });
  14300. for (BaseUpdateUser &User : BaseUpdates) {
  14301. if (TryCombineBaseUpdate(Target, User, /*SimpleConstIncOnly=*/false, DCI))
  14302. return SDValue();
  14303. }
  14304. return SDValue();
  14305. }
  14306. static SDValue PerformVLDCombine(SDNode *N,
  14307. TargetLowering::DAGCombinerInfo &DCI) {
  14308. if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
  14309. return SDValue();
  14310. return CombineBaseUpdate(N, DCI);
  14311. }
  14312. static SDValue PerformMVEVLDCombine(SDNode *N,
  14313. TargetLowering::DAGCombinerInfo &DCI) {
  14314. if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
  14315. return SDValue();
  14316. SelectionDAG &DAG = DCI.DAG;
  14317. SDValue Addr = N->getOperand(2);
  14318. MemSDNode *MemN = cast<MemSDNode>(N);
  14319. SDLoc dl(N);
  14320. // For the stores, where there are multiple intrinsics we only actually want
  14321. // to post-inc the last of the them.
  14322. unsigned IntNo = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
  14323. if (IntNo == Intrinsic::arm_mve_vst2q &&
  14324. cast<ConstantSDNode>(N->getOperand(5))->getZExtValue() != 1)
  14325. return SDValue();
  14326. if (IntNo == Intrinsic::arm_mve_vst4q &&
  14327. cast<ConstantSDNode>(N->getOperand(7))->getZExtValue() != 3)
  14328. return SDValue();
  14329. // Search for a use of the address operand that is an increment.
  14330. for (SDNode::use_iterator UI = Addr.getNode()->use_begin(),
  14331. UE = Addr.getNode()->use_end();
  14332. UI != UE; ++UI) {
  14333. SDNode *User = *UI;
  14334. if (User->getOpcode() != ISD::ADD ||
  14335. UI.getUse().getResNo() != Addr.getResNo())
  14336. continue;
  14337. // Check that the add is independent of the load/store. Otherwise, folding
  14338. // it would create a cycle. We can avoid searching through Addr as it's a
  14339. // predecessor to both.
  14340. SmallPtrSet<const SDNode *, 32> Visited;
  14341. SmallVector<const SDNode *, 16> Worklist;
  14342. Visited.insert(Addr.getNode());
  14343. Worklist.push_back(N);
  14344. Worklist.push_back(User);
  14345. if (SDNode::hasPredecessorHelper(N, Visited, Worklist) ||
  14346. SDNode::hasPredecessorHelper(User, Visited, Worklist))
  14347. continue;
  14348. // Find the new opcode for the updating load/store.
  14349. bool isLoadOp = true;
  14350. unsigned NewOpc = 0;
  14351. unsigned NumVecs = 0;
  14352. switch (IntNo) {
  14353. default:
  14354. llvm_unreachable("unexpected intrinsic for MVE VLDn combine");
  14355. case Intrinsic::arm_mve_vld2q:
  14356. NewOpc = ARMISD::VLD2_UPD;
  14357. NumVecs = 2;
  14358. break;
  14359. case Intrinsic::arm_mve_vld4q:
  14360. NewOpc = ARMISD::VLD4_UPD;
  14361. NumVecs = 4;
  14362. break;
  14363. case Intrinsic::arm_mve_vst2q:
  14364. NewOpc = ARMISD::VST2_UPD;
  14365. NumVecs = 2;
  14366. isLoadOp = false;
  14367. break;
  14368. case Intrinsic::arm_mve_vst4q:
  14369. NewOpc = ARMISD::VST4_UPD;
  14370. NumVecs = 4;
  14371. isLoadOp = false;
  14372. break;
  14373. }
  14374. // Find the size of memory referenced by the load/store.
  14375. EVT VecTy;
  14376. if (isLoadOp) {
  14377. VecTy = N->getValueType(0);
  14378. } else {
  14379. VecTy = N->getOperand(3).getValueType();
  14380. }
  14381. unsigned NumBytes = NumVecs * VecTy.getSizeInBits() / 8;
  14382. // If the increment is a constant, it must match the memory ref size.
  14383. SDValue Inc = User->getOperand(User->getOperand(0) == Addr ? 1 : 0);
  14384. ConstantSDNode *CInc = dyn_cast<ConstantSDNode>(Inc.getNode());
  14385. if (!CInc || CInc->getZExtValue() != NumBytes)
  14386. continue;
  14387. // Create the new updating load/store node.
  14388. // First, create an SDVTList for the new updating node's results.
  14389. EVT Tys[6];
  14390. unsigned NumResultVecs = (isLoadOp ? NumVecs : 0);
  14391. unsigned n;
  14392. for (n = 0; n < NumResultVecs; ++n)
  14393. Tys[n] = VecTy;
  14394. Tys[n++] = MVT::i32;
  14395. Tys[n] = MVT::Other;
  14396. SDVTList SDTys = DAG.getVTList(ArrayRef(Tys, NumResultVecs + 2));
  14397. // Then, gather the new node's operands.
  14398. SmallVector<SDValue, 8> Ops;
  14399. Ops.push_back(N->getOperand(0)); // incoming chain
  14400. Ops.push_back(N->getOperand(2)); // ptr
  14401. Ops.push_back(Inc);
  14402. for (unsigned i = 3; i < N->getNumOperands(); ++i)
  14403. Ops.push_back(N->getOperand(i));
  14404. SDValue UpdN = DAG.getMemIntrinsicNode(NewOpc, dl, SDTys, Ops, VecTy,
  14405. MemN->getMemOperand());
  14406. // Update the uses.
  14407. SmallVector<SDValue, 5> NewResults;
  14408. for (unsigned i = 0; i < NumResultVecs; ++i)
  14409. NewResults.push_back(SDValue(UpdN.getNode(), i));
  14410. NewResults.push_back(SDValue(UpdN.getNode(), NumResultVecs + 1)); // chain
  14411. DCI.CombineTo(N, NewResults);
  14412. DCI.CombineTo(User, SDValue(UpdN.getNode(), NumResultVecs));
  14413. break;
  14414. }
  14415. return SDValue();
  14416. }
  14417. /// CombineVLDDUP - For a VDUPLANE node N, check if its source operand is a
  14418. /// vldN-lane (N > 1) intrinsic, and if all the other uses of that intrinsic
  14419. /// are also VDUPLANEs. If so, combine them to a vldN-dup operation and
  14420. /// return true.
  14421. static bool CombineVLDDUP(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
  14422. SelectionDAG &DAG = DCI.DAG;
  14423. EVT VT = N->getValueType(0);
  14424. // vldN-dup instructions only support 64-bit vectors for N > 1.
  14425. if (!VT.is64BitVector())
  14426. return false;
  14427. // Check if the VDUPLANE operand is a vldN-dup intrinsic.
  14428. SDNode *VLD = N->getOperand(0).getNode();
  14429. if (VLD->getOpcode() != ISD::INTRINSIC_W_CHAIN)
  14430. return false;
  14431. unsigned NumVecs = 0;
  14432. unsigned NewOpc = 0;
  14433. unsigned IntNo = cast<ConstantSDNode>(VLD->getOperand(1))->getZExtValue();
  14434. if (IntNo == Intrinsic::arm_neon_vld2lane) {
  14435. NumVecs = 2;
  14436. NewOpc = ARMISD::VLD2DUP;
  14437. } else if (IntNo == Intrinsic::arm_neon_vld3lane) {
  14438. NumVecs = 3;
  14439. NewOpc = ARMISD::VLD3DUP;
  14440. } else if (IntNo == Intrinsic::arm_neon_vld4lane) {
  14441. NumVecs = 4;
  14442. NewOpc = ARMISD::VLD4DUP;
  14443. } else {
  14444. return false;
  14445. }
  14446. // First check that all the vldN-lane uses are VDUPLANEs and that the lane
  14447. // numbers match the load.
  14448. unsigned VLDLaneNo =
  14449. cast<ConstantSDNode>(VLD->getOperand(NumVecs+3))->getZExtValue();
  14450. for (SDNode::use_iterator UI = VLD->use_begin(), UE = VLD->use_end();
  14451. UI != UE; ++UI) {
  14452. // Ignore uses of the chain result.
  14453. if (UI.getUse().getResNo() == NumVecs)
  14454. continue;
  14455. SDNode *User = *UI;
  14456. if (User->getOpcode() != ARMISD::VDUPLANE ||
  14457. VLDLaneNo != cast<ConstantSDNode>(User->getOperand(1))->getZExtValue())
  14458. return false;
  14459. }
  14460. // Create the vldN-dup node.
  14461. EVT Tys[5];
  14462. unsigned n;
  14463. for (n = 0; n < NumVecs; ++n)
  14464. Tys[n] = VT;
  14465. Tys[n] = MVT::Other;
  14466. SDVTList SDTys = DAG.getVTList(ArrayRef(Tys, NumVecs + 1));
  14467. SDValue Ops[] = { VLD->getOperand(0), VLD->getOperand(2) };
  14468. MemIntrinsicSDNode *VLDMemInt = cast<MemIntrinsicSDNode>(VLD);
  14469. SDValue VLDDup = DAG.getMemIntrinsicNode(NewOpc, SDLoc(VLD), SDTys,
  14470. Ops, VLDMemInt->getMemoryVT(),
  14471. VLDMemInt->getMemOperand());
  14472. // Update the uses.
  14473. for (SDNode::use_iterator UI = VLD->use_begin(), UE = VLD->use_end();
  14474. UI != UE; ++UI) {
  14475. unsigned ResNo = UI.getUse().getResNo();
  14476. // Ignore uses of the chain result.
  14477. if (ResNo == NumVecs)
  14478. continue;
  14479. SDNode *User = *UI;
  14480. DCI.CombineTo(User, SDValue(VLDDup.getNode(), ResNo));
  14481. }
  14482. // Now the vldN-lane intrinsic is dead except for its chain result.
  14483. // Update uses of the chain.
  14484. std::vector<SDValue> VLDDupResults;
  14485. for (unsigned n = 0; n < NumVecs; ++n)
  14486. VLDDupResults.push_back(SDValue(VLDDup.getNode(), n));
  14487. VLDDupResults.push_back(SDValue(VLDDup.getNode(), NumVecs));
  14488. DCI.CombineTo(VLD, VLDDupResults);
  14489. return true;
  14490. }
  14491. /// PerformVDUPLANECombine - Target-specific dag combine xforms for
  14492. /// ARMISD::VDUPLANE.
  14493. static SDValue PerformVDUPLANECombine(SDNode *N,
  14494. TargetLowering::DAGCombinerInfo &DCI,
  14495. const ARMSubtarget *Subtarget) {
  14496. SDValue Op = N->getOperand(0);
  14497. EVT VT = N->getValueType(0);
  14498. // On MVE, we just convert the VDUPLANE to a VDUP with an extract.
  14499. if (Subtarget->hasMVEIntegerOps()) {
  14500. EVT ExtractVT = VT.getVectorElementType();
  14501. // We need to ensure we are creating a legal type.
  14502. if (!DCI.DAG.getTargetLoweringInfo().isTypeLegal(ExtractVT))
  14503. ExtractVT = MVT::i32;
  14504. SDValue Extract = DCI.DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(N), ExtractVT,
  14505. N->getOperand(0), N->getOperand(1));
  14506. return DCI.DAG.getNode(ARMISD::VDUP, SDLoc(N), VT, Extract);
  14507. }
  14508. // If the source is a vldN-lane (N > 1) intrinsic, and all the other uses
  14509. // of that intrinsic are also VDUPLANEs, combine them to a vldN-dup operation.
  14510. if (CombineVLDDUP(N, DCI))
  14511. return SDValue(N, 0);
  14512. // If the source is already a VMOVIMM or VMVNIMM splat, the VDUPLANE is
  14513. // redundant. Ignore bit_converts for now; element sizes are checked below.
  14514. while (Op.getOpcode() == ISD::BITCAST)
  14515. Op = Op.getOperand(0);
  14516. if (Op.getOpcode() != ARMISD::VMOVIMM && Op.getOpcode() != ARMISD::VMVNIMM)
  14517. return SDValue();
  14518. // Make sure the VMOV element size is not bigger than the VDUPLANE elements.
  14519. unsigned EltSize = Op.getScalarValueSizeInBits();
  14520. // The canonical VMOV for a zero vector uses a 32-bit element size.
  14521. unsigned Imm = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
  14522. unsigned EltBits;
  14523. if (ARM_AM::decodeVMOVModImm(Imm, EltBits) == 0)
  14524. EltSize = 8;
  14525. if (EltSize > VT.getScalarSizeInBits())
  14526. return SDValue();
  14527. return DCI.DAG.getNode(ISD::BITCAST, SDLoc(N), VT, Op);
  14528. }
  14529. /// PerformVDUPCombine - Target-specific dag combine xforms for ARMISD::VDUP.
  14530. static SDValue PerformVDUPCombine(SDNode *N, SelectionDAG &DAG,
  14531. const ARMSubtarget *Subtarget) {
  14532. SDValue Op = N->getOperand(0);
  14533. SDLoc dl(N);
  14534. if (Subtarget->hasMVEIntegerOps()) {
  14535. // Convert VDUP f32 -> VDUP BITCAST i32 under MVE, as we know the value will
  14536. // need to come from a GPR.
  14537. if (Op.getValueType() == MVT::f32)
  14538. return DAG.getNode(ARMISD::VDUP, dl, N->getValueType(0),
  14539. DAG.getNode(ISD::BITCAST, dl, MVT::i32, Op));
  14540. else if (Op.getValueType() == MVT::f16)
  14541. return DAG.getNode(ARMISD::VDUP, dl, N->getValueType(0),
  14542. DAG.getNode(ARMISD::VMOVrh, dl, MVT::i32, Op));
  14543. }
  14544. if (!Subtarget->hasNEON())
  14545. return SDValue();
  14546. // Match VDUP(LOAD) -> VLD1DUP.
  14547. // We match this pattern here rather than waiting for isel because the
  14548. // transform is only legal for unindexed loads.
  14549. LoadSDNode *LD = dyn_cast<LoadSDNode>(Op.getNode());
  14550. if (LD && Op.hasOneUse() && LD->isUnindexed() &&
  14551. LD->getMemoryVT() == N->getValueType(0).getVectorElementType()) {
  14552. SDValue Ops[] = {LD->getOperand(0), LD->getOperand(1),
  14553. DAG.getConstant(LD->getAlign().value(), SDLoc(N), MVT::i32)};
  14554. SDVTList SDTys = DAG.getVTList(N->getValueType(0), MVT::Other);
  14555. SDValue VLDDup =
  14556. DAG.getMemIntrinsicNode(ARMISD::VLD1DUP, SDLoc(N), SDTys, Ops,
  14557. LD->getMemoryVT(), LD->getMemOperand());
  14558. DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), VLDDup.getValue(1));
  14559. return VLDDup;
  14560. }
  14561. return SDValue();
  14562. }
  14563. static SDValue PerformLOADCombine(SDNode *N,
  14564. TargetLowering::DAGCombinerInfo &DCI,
  14565. const ARMSubtarget *Subtarget) {
  14566. EVT VT = N->getValueType(0);
  14567. // If this is a legal vector load, try to combine it into a VLD1_UPD.
  14568. if (Subtarget->hasNEON() && ISD::isNormalLoad(N) && VT.isVector() &&
  14569. DCI.DAG.getTargetLoweringInfo().isTypeLegal(VT))
  14570. return CombineBaseUpdate(N, DCI);
  14571. return SDValue();
  14572. }
  14573. // Optimize trunc store (of multiple scalars) to shuffle and store. First,
  14574. // pack all of the elements in one place. Next, store to memory in fewer
  14575. // chunks.
  14576. static SDValue PerformTruncatingStoreCombine(StoreSDNode *St,
  14577. SelectionDAG &DAG) {
  14578. SDValue StVal = St->getValue();
  14579. EVT VT = StVal.getValueType();
  14580. if (!St->isTruncatingStore() || !VT.isVector())
  14581. return SDValue();
  14582. const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  14583. EVT StVT = St->getMemoryVT();
  14584. unsigned NumElems = VT.getVectorNumElements();
  14585. assert(StVT != VT && "Cannot truncate to the same type");
  14586. unsigned FromEltSz = VT.getScalarSizeInBits();
  14587. unsigned ToEltSz = StVT.getScalarSizeInBits();
  14588. // From, To sizes and ElemCount must be pow of two
  14589. if (!isPowerOf2_32(NumElems * FromEltSz * ToEltSz))
  14590. return SDValue();
  14591. // We are going to use the original vector elt for storing.
  14592. // Accumulated smaller vector elements must be a multiple of the store size.
  14593. if (0 != (NumElems * FromEltSz) % ToEltSz)
  14594. return SDValue();
  14595. unsigned SizeRatio = FromEltSz / ToEltSz;
  14596. assert(SizeRatio * NumElems * ToEltSz == VT.getSizeInBits());
  14597. // Create a type on which we perform the shuffle.
  14598. EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(), StVT.getScalarType(),
  14599. NumElems * SizeRatio);
  14600. assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());
  14601. SDLoc DL(St);
  14602. SDValue WideVec = DAG.getNode(ISD::BITCAST, DL, WideVecVT, StVal);
  14603. SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1);
  14604. for (unsigned i = 0; i < NumElems; ++i)
  14605. ShuffleVec[i] = DAG.getDataLayout().isBigEndian() ? (i + 1) * SizeRatio - 1
  14606. : i * SizeRatio;
  14607. // Can't shuffle using an illegal type.
  14608. if (!TLI.isTypeLegal(WideVecVT))
  14609. return SDValue();
  14610. SDValue Shuff = DAG.getVectorShuffle(
  14611. WideVecVT, DL, WideVec, DAG.getUNDEF(WideVec.getValueType()), ShuffleVec);
  14612. // At this point all of the data is stored at the bottom of the
  14613. // register. We now need to save it to mem.
  14614. // Find the largest store unit
  14615. MVT StoreType = MVT::i8;
  14616. for (MVT Tp : MVT::integer_valuetypes()) {
  14617. if (TLI.isTypeLegal(Tp) && Tp.getSizeInBits() <= NumElems * ToEltSz)
  14618. StoreType = Tp;
  14619. }
  14620. // Didn't find a legal store type.
  14621. if (!TLI.isTypeLegal(StoreType))
  14622. return SDValue();
  14623. // Bitcast the original vector into a vector of store-size units
  14624. EVT StoreVecVT =
  14625. EVT::getVectorVT(*DAG.getContext(), StoreType,
  14626. VT.getSizeInBits() / EVT(StoreType).getSizeInBits());
  14627. assert(StoreVecVT.getSizeInBits() == VT.getSizeInBits());
  14628. SDValue ShuffWide = DAG.getNode(ISD::BITCAST, DL, StoreVecVT, Shuff);
  14629. SmallVector<SDValue, 8> Chains;
  14630. SDValue Increment = DAG.getConstant(StoreType.getSizeInBits() / 8, DL,
  14631. TLI.getPointerTy(DAG.getDataLayout()));
  14632. SDValue BasePtr = St->getBasePtr();
  14633. // Perform one or more big stores into memory.
  14634. unsigned E = (ToEltSz * NumElems) / StoreType.getSizeInBits();
  14635. for (unsigned I = 0; I < E; I++) {
  14636. SDValue SubVec = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, StoreType,
  14637. ShuffWide, DAG.getIntPtrConstant(I, DL));
  14638. SDValue Ch =
  14639. DAG.getStore(St->getChain(), DL, SubVec, BasePtr, St->getPointerInfo(),
  14640. St->getAlign(), St->getMemOperand()->getFlags());
  14641. BasePtr =
  14642. DAG.getNode(ISD::ADD, DL, BasePtr.getValueType(), BasePtr, Increment);
  14643. Chains.push_back(Ch);
  14644. }
  14645. return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
  14646. }
  14647. // Try taking a single vector store from an fpround (which would otherwise turn
  14648. // into an expensive buildvector) and splitting it into a series of narrowing
  14649. // stores.
  14650. static SDValue PerformSplittingToNarrowingStores(StoreSDNode *St,
  14651. SelectionDAG &DAG) {
  14652. if (!St->isSimple() || St->isTruncatingStore() || !St->isUnindexed())
  14653. return SDValue();
  14654. SDValue Trunc = St->getValue();
  14655. if (Trunc->getOpcode() != ISD::FP_ROUND)
  14656. return SDValue();
  14657. EVT FromVT = Trunc->getOperand(0).getValueType();
  14658. EVT ToVT = Trunc.getValueType();
  14659. if (!ToVT.isVector())
  14660. return SDValue();
  14661. assert(FromVT.getVectorNumElements() == ToVT.getVectorNumElements());
  14662. EVT ToEltVT = ToVT.getVectorElementType();
  14663. EVT FromEltVT = FromVT.getVectorElementType();
  14664. if (FromEltVT != MVT::f32 || ToEltVT != MVT::f16)
  14665. return SDValue();
  14666. unsigned NumElements = 4;
  14667. if (FromVT.getVectorNumElements() % NumElements != 0)
  14668. return SDValue();
  14669. // Test if the Trunc will be convertable to a VMOVN with a shuffle, and if so
  14670. // use the VMOVN over splitting the store. We are looking for patterns of:
  14671. // !rev: 0 N 1 N+1 2 N+2 ...
  14672. // rev: N 0 N+1 1 N+2 2 ...
  14673. // The shuffle may either be a single source (in which case N = NumElts/2) or
  14674. // two inputs extended with concat to the same size (in which case N =
  14675. // NumElts).
  14676. auto isVMOVNShuffle = [&](ShuffleVectorSDNode *SVN, bool Rev) {
  14677. ArrayRef<int> M = SVN->getMask();
  14678. unsigned NumElts = ToVT.getVectorNumElements();
  14679. if (SVN->getOperand(1).isUndef())
  14680. NumElts /= 2;
  14681. unsigned Off0 = Rev ? NumElts : 0;
  14682. unsigned Off1 = Rev ? 0 : NumElts;
  14683. for (unsigned I = 0; I < NumElts; I += 2) {
  14684. if (M[I] >= 0 && M[I] != (int)(Off0 + I / 2))
  14685. return false;
  14686. if (M[I + 1] >= 0 && M[I + 1] != (int)(Off1 + I / 2))
  14687. return false;
  14688. }
  14689. return true;
  14690. };
  14691. if (auto *Shuffle = dyn_cast<ShuffleVectorSDNode>(Trunc.getOperand(0)))
  14692. if (isVMOVNShuffle(Shuffle, false) || isVMOVNShuffle(Shuffle, true))
  14693. return SDValue();
  14694. LLVMContext &C = *DAG.getContext();
  14695. SDLoc DL(St);
  14696. // Details about the old store
  14697. SDValue Ch = St->getChain();
  14698. SDValue BasePtr = St->getBasePtr();
  14699. Align Alignment = St->getOriginalAlign();
  14700. MachineMemOperand::Flags MMOFlags = St->getMemOperand()->getFlags();
  14701. AAMDNodes AAInfo = St->getAAInfo();
  14702. // We split the store into slices of NumElements. fp16 trunc stores are vcvt
  14703. // and then stored as truncating integer stores.
  14704. EVT NewFromVT = EVT::getVectorVT(C, FromEltVT, NumElements);
  14705. EVT NewToVT = EVT::getVectorVT(
  14706. C, EVT::getIntegerVT(C, ToEltVT.getSizeInBits()), NumElements);
  14707. SmallVector<SDValue, 4> Stores;
  14708. for (unsigned i = 0; i < FromVT.getVectorNumElements() / NumElements; i++) {
  14709. unsigned NewOffset = i * NumElements * ToEltVT.getSizeInBits() / 8;
  14710. SDValue NewPtr =
  14711. DAG.getObjectPtrOffset(DL, BasePtr, TypeSize::Fixed(NewOffset));
  14712. SDValue Extract =
  14713. DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, NewFromVT, Trunc.getOperand(0),
  14714. DAG.getConstant(i * NumElements, DL, MVT::i32));
  14715. SDValue FPTrunc =
  14716. DAG.getNode(ARMISD::VCVTN, DL, MVT::v8f16, DAG.getUNDEF(MVT::v8f16),
  14717. Extract, DAG.getConstant(0, DL, MVT::i32));
  14718. Extract = DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, MVT::v4i32, FPTrunc);
  14719. SDValue Store = DAG.getTruncStore(
  14720. Ch, DL, Extract, NewPtr, St->getPointerInfo().getWithOffset(NewOffset),
  14721. NewToVT, Alignment, MMOFlags, AAInfo);
  14722. Stores.push_back(Store);
  14723. }
  14724. return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Stores);
  14725. }
  14726. // Try taking a single vector store from an MVETRUNC (which would otherwise turn
  14727. // into an expensive buildvector) and splitting it into a series of narrowing
  14728. // stores.
  14729. static SDValue PerformSplittingMVETruncToNarrowingStores(StoreSDNode *St,
  14730. SelectionDAG &DAG) {
  14731. if (!St->isSimple() || St->isTruncatingStore() || !St->isUnindexed())
  14732. return SDValue();
  14733. SDValue Trunc = St->getValue();
  14734. if (Trunc->getOpcode() != ARMISD::MVETRUNC)
  14735. return SDValue();
  14736. EVT FromVT = Trunc->getOperand(0).getValueType();
  14737. EVT ToVT = Trunc.getValueType();
  14738. LLVMContext &C = *DAG.getContext();
  14739. SDLoc DL(St);
  14740. // Details about the old store
  14741. SDValue Ch = St->getChain();
  14742. SDValue BasePtr = St->getBasePtr();
  14743. Align Alignment = St->getOriginalAlign();
  14744. MachineMemOperand::Flags MMOFlags = St->getMemOperand()->getFlags();
  14745. AAMDNodes AAInfo = St->getAAInfo();
  14746. EVT NewToVT = EVT::getVectorVT(C, ToVT.getVectorElementType(),
  14747. FromVT.getVectorNumElements());
  14748. SmallVector<SDValue, 4> Stores;
  14749. for (unsigned i = 0; i < Trunc.getNumOperands(); i++) {
  14750. unsigned NewOffset =
  14751. i * FromVT.getVectorNumElements() * ToVT.getScalarSizeInBits() / 8;
  14752. SDValue NewPtr =
  14753. DAG.getObjectPtrOffset(DL, BasePtr, TypeSize::Fixed(NewOffset));
  14754. SDValue Extract = Trunc.getOperand(i);
  14755. SDValue Store = DAG.getTruncStore(
  14756. Ch, DL, Extract, NewPtr, St->getPointerInfo().getWithOffset(NewOffset),
  14757. NewToVT, Alignment, MMOFlags, AAInfo);
  14758. Stores.push_back(Store);
  14759. }
  14760. return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Stores);
  14761. }
  14762. // Given a floating point store from an extracted vector, with an integer
  14763. // VGETLANE that already exists, store the existing VGETLANEu directly. This can
  14764. // help reduce fp register pressure, doesn't require the fp extract and allows
  14765. // use of more integer post-inc stores not available with vstr.
  14766. static SDValue PerformExtractFpToIntStores(StoreSDNode *St, SelectionDAG &DAG) {
  14767. if (!St->isSimple() || St->isTruncatingStore() || !St->isUnindexed())
  14768. return SDValue();
  14769. SDValue Extract = St->getValue();
  14770. EVT VT = Extract.getValueType();
  14771. // For now only uses f16. This may be useful for f32 too, but that will
  14772. // be bitcast(extract), not the VGETLANEu we currently check here.
  14773. if (VT != MVT::f16 || Extract->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
  14774. return SDValue();
  14775. SDNode *GetLane =
  14776. DAG.getNodeIfExists(ARMISD::VGETLANEu, DAG.getVTList(MVT::i32),
  14777. {Extract.getOperand(0), Extract.getOperand(1)});
  14778. if (!GetLane)
  14779. return SDValue();
  14780. LLVMContext &C = *DAG.getContext();
  14781. SDLoc DL(St);
  14782. // Create a new integer store to replace the existing floating point version.
  14783. SDValue Ch = St->getChain();
  14784. SDValue BasePtr = St->getBasePtr();
  14785. Align Alignment = St->getOriginalAlign();
  14786. MachineMemOperand::Flags MMOFlags = St->getMemOperand()->getFlags();
  14787. AAMDNodes AAInfo = St->getAAInfo();
  14788. EVT NewToVT = EVT::getIntegerVT(C, VT.getSizeInBits());
  14789. SDValue Store = DAG.getTruncStore(Ch, DL, SDValue(GetLane, 0), BasePtr,
  14790. St->getPointerInfo(), NewToVT, Alignment,
  14791. MMOFlags, AAInfo);
  14792. return Store;
  14793. }

/// PerformSTORECombine - Target-specific dag combine xforms for
/// ISD::STORE.
static SDValue PerformSTORECombine(SDNode *N,
                                   TargetLowering::DAGCombinerInfo &DCI,
                                   const ARMSubtarget *Subtarget) {
  StoreSDNode *St = cast<StoreSDNode>(N);
  if (St->isVolatile())
    return SDValue();
  SDValue StVal = St->getValue();
  EVT VT = StVal.getValueType();
  if (Subtarget->hasNEON())
    if (SDValue Store = PerformTruncatingStoreCombine(St, DCI.DAG))
      return Store;
  if (Subtarget->hasMVEIntegerOps()) {
    if (SDValue NewToken = PerformSplittingToNarrowingStores(St, DCI.DAG))
      return NewToken;
    if (SDValue NewChain = PerformExtractFpToIntStores(St, DCI.DAG))
      return NewChain;
    if (SDValue NewToken =
            PerformSplittingMVETruncToNarrowingStores(St, DCI.DAG))
      return NewToken;
  }
  if (!ISD::isNormalStore(St))
    return SDValue();
  // Split a store of a VMOVDRR into two integer stores to avoid mixing NEON and
  // ARM stores of arguments in the same cache line.
  if (StVal.getNode()->getOpcode() == ARMISD::VMOVDRR &&
      StVal.getNode()->hasOneUse()) {
    SelectionDAG &DAG = DCI.DAG;
    bool isBigEndian = DAG.getDataLayout().isBigEndian();
    SDLoc DL(St);
    SDValue BasePtr = St->getBasePtr();
    SDValue NewST1 = DAG.getStore(
        St->getChain(), DL, StVal.getNode()->getOperand(isBigEndian ? 1 : 0),
        BasePtr, St->getPointerInfo(), St->getOriginalAlign(),
        St->getMemOperand()->getFlags());
    SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i32, BasePtr,
                                    DAG.getConstant(4, DL, MVT::i32));
    return DAG.getStore(NewST1.getValue(0), DL,
                        StVal.getNode()->getOperand(isBigEndian ? 0 : 1),
                        OffsetPtr, St->getPointerInfo().getWithOffset(4),
                        St->getOriginalAlign(),
                        St->getMemOperand()->getFlags());
  }
  if (StVal.getValueType() == MVT::i64 &&
      StVal.getNode()->getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
    // Bitcast an i64 store extracted from a vector to f64.
    // Otherwise, the i64 value will be legalized to a pair of i32 values.
    SelectionDAG &DAG = DCI.DAG;
    SDLoc dl(StVal);
    SDValue IntVec = StVal.getOperand(0);
    EVT FloatVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64,
                                   IntVec.getValueType().getVectorNumElements());
    SDValue Vec = DAG.getNode(ISD::BITCAST, dl, FloatVT, IntVec);
    SDValue ExtElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
                                 Vec, StVal.getOperand(1));
    dl = SDLoc(N);
    SDValue V = DAG.getNode(ISD::BITCAST, dl, MVT::i64, ExtElt);
    // Make the DAGCombiner fold the bitcasts.
    DCI.AddToWorklist(Vec.getNode());
    DCI.AddToWorklist(ExtElt.getNode());
    DCI.AddToWorklist(V.getNode());
    return DAG.getStore(St->getChain(), dl, V, St->getBasePtr(),
                        St->getPointerInfo(), St->getAlign(),
                        St->getMemOperand()->getFlags(), St->getAAInfo());
  }
  // If this is a legal vector store, try to combine it into a VST1_UPD.
  if (Subtarget->hasNEON() && ISD::isNormalStore(N) && VT.isVector() &&
      DCI.DAG.getTargetLoweringInfo().isTypeLegal(VT))
    return CombineBaseUpdate(N, DCI);
  return SDValue();
}

/// PerformVCVTCombine - VCVT (floating-point to fixed-point, Advanced SIMD)
/// can replace combinations of VMUL and VCVT (floating-point to integer)
/// when the VMUL has a constant operand that is a power of 2.
///
/// Example (assume d17 = <float 8.000000e+00, float 8.000000e+00>):
/// vmul.f32 d16, d17, d16
/// vcvt.s32.f32 d16, d16
/// becomes:
/// vcvt.s32.f32 d16, d16, #3
static SDValue PerformVCVTCombine(SDNode *N, SelectionDAG &DAG,
                                  const ARMSubtarget *Subtarget) {
  if (!Subtarget->hasNEON())
    return SDValue();
  SDValue Op = N->getOperand(0);
  if (!Op.getValueType().isVector() || !Op.getValueType().isSimple() ||
      Op.getOpcode() != ISD::FMUL)
    return SDValue();
  SDValue ConstVec = Op->getOperand(1);
  if (!isa<BuildVectorSDNode>(ConstVec))
    return SDValue();
  MVT FloatTy = Op.getSimpleValueType().getVectorElementType();
  uint32_t FloatBits = FloatTy.getSizeInBits();
  MVT IntTy = N->getSimpleValueType(0).getVectorElementType();
  uint32_t IntBits = IntTy.getSizeInBits();
  unsigned NumLanes = Op.getValueType().getVectorNumElements();
  if (FloatBits != 32 || IntBits > 32 || (NumLanes != 4 && NumLanes != 2)) {
    // These instructions only exist converting from f32 to i32. We can handle
    // smaller integers by generating an extra truncate, but larger ones would
    // be lossy. We also can't handle anything other than 2 or 4 lanes, since
    // these instructions only support v2i32/v4i32 types.
    return SDValue();
  }
  BitVector UndefElements;
  BuildVectorSDNode *BV = cast<BuildVectorSDNode>(ConstVec);
  int32_t C = BV->getConstantFPSplatPow2ToLog2Int(&UndefElements, 33);
  if (C == -1 || C == 0 || C > 32)
    return SDValue();
  SDLoc dl(N);
  bool isSigned = N->getOpcode() == ISD::FP_TO_SINT;
  unsigned IntrinsicOpcode = isSigned ? Intrinsic::arm_neon_vcvtfp2fxs :
                                        Intrinsic::arm_neon_vcvtfp2fxu;
  SDValue FixConv = DAG.getNode(
      ISD::INTRINSIC_WO_CHAIN, dl, NumLanes == 2 ? MVT::v2i32 : MVT::v4i32,
      DAG.getConstant(IntrinsicOpcode, dl, MVT::i32), Op->getOperand(0),
      DAG.getConstant(C, dl, MVT::i32));
  if (IntBits < FloatBits)
    FixConv = DAG.getNode(ISD::TRUNCATE, dl, N->getValueType(0), FixConv);
  return FixConv;
}
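
// PerformFAddVSelectCombine - Fold (fadd x, (vselect c, y, -0.0/0.0)) into
// (vselect c, (fadd x, y), x), which maps more directly onto an MVE
// predicated vadd (and possibly a predicated vfma).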
static SDValue PerformFAddVSelectCombine(SDNode *N, SelectionDAG &DAG,
                                         const ARMSubtarget *Subtarget) {
  if (!Subtarget->hasMVEFloatOps())
    return SDValue();
  // Turn (fadd x, (vselect c, y, -0.0)) into (vselect c, (fadd x, y), x)
  // The second form can be more easily turned into a predicated vadd, and
  // possibly combined into a fma to become a predicated vfma.
  SDValue Op0 = N->getOperand(0);
  SDValue Op1 = N->getOperand(1);
  EVT VT = N->getValueType(0);
  SDLoc DL(N);
  // The identity element for a fadd is -0.0 or +0.0 when the nsz flag is set,
  // which these VMOV's represent.
  auto isIdentitySplat = [&](SDValue Op, bool NSZ) {
    if (Op.getOpcode() != ISD::BITCAST ||
        Op.getOperand(0).getOpcode() != ARMISD::VMOVIMM)
      return false;
    uint64_t ImmVal = Op.getOperand(0).getConstantOperandVal(0);
    if (VT == MVT::v4f32 && (ImmVal == 1664 || (ImmVal == 0 && NSZ)))
      return true;
    if (VT == MVT::v8f16 && (ImmVal == 2688 || (ImmVal == 0 && NSZ)))
      return true;
    return false;
  };
  if (Op0.getOpcode() == ISD::VSELECT && Op1.getOpcode() != ISD::VSELECT)
    std::swap(Op0, Op1);
  if (Op1.getOpcode() != ISD::VSELECT)
    return SDValue();
  SDNodeFlags FaddFlags = N->getFlags();
  bool NSZ = FaddFlags.hasNoSignedZeros();
  if (!isIdentitySplat(Op1.getOperand(2), NSZ))
    return SDValue();
  SDValue FAdd =
      DAG.getNode(ISD::FADD, DL, VT, Op0, Op1.getOperand(1), FaddFlags);
  return DAG.getNode(ISD::VSELECT, DL, VT, Op1.getOperand(0), FAdd, Op0,
                     FaddFlags);
}

/// PerformVDIVCombine - VCVT (fixed-point to floating-point, Advanced SIMD)
/// can replace combinations of VCVT (integer to floating-point) and VDIV
/// when the VDIV has a constant operand that is a power of 2.
///
/// Example (assume d17 = <float 8.000000e+00, float 8.000000e+00>):
/// vcvt.f32.s32 d16, d16
/// vdiv.f32 d16, d17, d16
/// becomes:
/// vcvt.f32.s32 d16, d16, #3
static SDValue PerformVDIVCombine(SDNode *N, SelectionDAG &DAG,
                                  const ARMSubtarget *Subtarget) {
  if (!Subtarget->hasNEON())
    return SDValue();
  SDValue Op = N->getOperand(0);
  unsigned OpOpcode = Op.getNode()->getOpcode();
  if (!N->getValueType(0).isVector() || !N->getValueType(0).isSimple() ||
      (OpOpcode != ISD::SINT_TO_FP && OpOpcode != ISD::UINT_TO_FP))
    return SDValue();
  SDValue ConstVec = N->getOperand(1);
  if (!isa<BuildVectorSDNode>(ConstVec))
    return SDValue();
  MVT FloatTy = N->getSimpleValueType(0).getVectorElementType();
  uint32_t FloatBits = FloatTy.getSizeInBits();
  MVT IntTy = Op.getOperand(0).getSimpleValueType().getVectorElementType();
  uint32_t IntBits = IntTy.getSizeInBits();
  unsigned NumLanes = Op.getValueType().getVectorNumElements();
  if (FloatBits != 32 || IntBits > 32 || (NumLanes != 4 && NumLanes != 2)) {
    // These instructions only exist converting from i32 to f32. We can handle
    // smaller integers by generating an extra extend, but larger ones would
    // be lossy. We also can't handle anything other than 2 or 4 lanes, since
    // these instructions only support v2i32/v4i32 types.
    return SDValue();
  }
  BitVector UndefElements;
  BuildVectorSDNode *BV = cast<BuildVectorSDNode>(ConstVec);
  int32_t C = BV->getConstantFPSplatPow2ToLog2Int(&UndefElements, 33);
  if (C == -1 || C == 0 || C > 32)
    return SDValue();
  SDLoc dl(N);
  bool isSigned = OpOpcode == ISD::SINT_TO_FP;
  SDValue ConvInput = Op.getOperand(0);
  if (IntBits < FloatBits)
    ConvInput = DAG.getNode(isSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND,
                            dl, NumLanes == 2 ? MVT::v2i32 : MVT::v4i32,
                            ConvInput);
  unsigned IntrinsicOpcode = isSigned ? Intrinsic::arm_neon_vcvtfxs2fp :
                                        Intrinsic::arm_neon_vcvtfxu2fp;
  return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl,
                     Op.getValueType(),
                     DAG.getConstant(IntrinsicOpcode, dl, MVT::i32),
                     ConvInput, DAG.getConstant(C, dl, MVT::i32));
}
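
// PerformVECREDUCE_ADDCombine - Match vecreduce.add patterns (optionally fed
// by extends, multiplies and zero-predicating vselects) onto the MVE
// VADDV/VADDLV/VMLAV/VMLALV family of horizontal reductions.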
static SDValue PerformVECREDUCE_ADDCombine(SDNode *N, SelectionDAG &DAG,
                                           const ARMSubtarget *ST) {
  if (!ST->hasMVEIntegerOps())
    return SDValue();
  assert(N->getOpcode() == ISD::VECREDUCE_ADD);
  EVT ResVT = N->getValueType(0);
  SDValue N0 = N->getOperand(0);
  SDLoc dl(N);
  // Try to turn vecreduce_add(add(x, y)) into vecreduce(x) + vecreduce(y)
  if (ResVT == MVT::i32 && N0.getOpcode() == ISD::ADD &&
      (N0.getValueType() == MVT::v4i32 || N0.getValueType() == MVT::v8i16 ||
       N0.getValueType() == MVT::v16i8)) {
    SDValue Red0 = DAG.getNode(ISD::VECREDUCE_ADD, dl, ResVT, N0.getOperand(0));
    SDValue Red1 = DAG.getNode(ISD::VECREDUCE_ADD, dl, ResVT, N0.getOperand(1));
    return DAG.getNode(ISD::ADD, dl, ResVT, Red0, Red1);
  }
  // We are looking for something that will have illegal types if left alone,
  // but that we can convert to a single instruction under MVE. For example
  // vecreduce_add(sext(A, v8i32)) => VADDV.s16 A
  // or
  // vecreduce_add(mul(zext(A, v16i32), zext(B, v16i32))) => VMLADAV.u8 A, B
  // The legal cases are:
  //   VADDV u/s 8/16/32
  //   VMLAV u/s 8/16/32
  //   VADDLV u/s 32
  //   VMLALV u/s 16/32
  // If the input vector is smaller than legal (v4i8/v4i16 for example) we can
  // extend it and use v4i32 instead.
  auto ExtTypeMatches = [](SDValue A, ArrayRef<MVT> ExtTypes) {
    EVT AVT = A.getValueType();
    return any_of(ExtTypes, [&](MVT Ty) {
      return AVT.getVectorNumElements() == Ty.getVectorNumElements() &&
             AVT.bitsLE(Ty);
    });
  };
  auto ExtendIfNeeded = [&](SDValue A, unsigned ExtendCode) {
    EVT AVT = A.getValueType();
    if (!AVT.is128BitVector())
      A = DAG.getNode(ExtendCode, dl,
                      AVT.changeVectorElementType(MVT::getIntegerVT(
                          128 / AVT.getVectorMinNumElements())),
                      A);
    return A;
  };
  auto IsVADDV = [&](MVT RetTy, unsigned ExtendCode, ArrayRef<MVT> ExtTypes) {
    if (ResVT != RetTy || N0->getOpcode() != ExtendCode)
      return SDValue();
    SDValue A = N0->getOperand(0);
    if (ExtTypeMatches(A, ExtTypes))
      return ExtendIfNeeded(A, ExtendCode);
    return SDValue();
  };
  auto IsPredVADDV = [&](MVT RetTy, unsigned ExtendCode,
                         ArrayRef<MVT> ExtTypes, SDValue &Mask) {
    if (ResVT != RetTy || N0->getOpcode() != ISD::VSELECT ||
        !ISD::isBuildVectorAllZeros(N0->getOperand(2).getNode()))
      return SDValue();
    Mask = N0->getOperand(0);
    SDValue Ext = N0->getOperand(1);
    if (Ext->getOpcode() != ExtendCode)
      return SDValue();
    SDValue A = Ext->getOperand(0);
    if (ExtTypeMatches(A, ExtTypes))
      return ExtendIfNeeded(A, ExtendCode);
    return SDValue();
  };
  auto IsVMLAV = [&](MVT RetTy, unsigned ExtendCode, ArrayRef<MVT> ExtTypes,
                     SDValue &A, SDValue &B) {
    // For a vmla we are trying to match a larger pattern:
    // ExtA = sext/zext A
    // ExtB = sext/zext B
    // Mul = mul ExtA, ExtB
    // vecreduce.add Mul
    // There might also be an extra extend between the mul and the addreduce,
    // so long as the bitwidth is high enough to make them equivalent (for
    // example original v8i16 might be mul at v8i32 and the reduce happens at
    // v8i64).
    if (ResVT != RetTy)
      return false;
    SDValue Mul = N0;
    if (Mul->getOpcode() == ExtendCode &&
        Mul->getOperand(0).getScalarValueSizeInBits() * 2 >=
            ResVT.getScalarSizeInBits())
      Mul = Mul->getOperand(0);
    if (Mul->getOpcode() != ISD::MUL)
      return false;
    SDValue ExtA = Mul->getOperand(0);
    SDValue ExtB = Mul->getOperand(1);
    if (ExtA->getOpcode() != ExtendCode || ExtB->getOpcode() != ExtendCode)
      return false;
    A = ExtA->getOperand(0);
    B = ExtB->getOperand(0);
    if (ExtTypeMatches(A, ExtTypes) && ExtTypeMatches(B, ExtTypes)) {
      A = ExtendIfNeeded(A, ExtendCode);
      B = ExtendIfNeeded(B, ExtendCode);
      return true;
    }
    return false;
  };
  auto IsPredVMLAV = [&](MVT RetTy, unsigned ExtendCode, ArrayRef<MVT> ExtTypes,
                         SDValue &A, SDValue &B, SDValue &Mask) {
    // Same as the pattern above with a select for the zero predicated lanes
    // ExtA = sext/zext A
    // ExtB = sext/zext B
    // Mul = mul ExtA, ExtB
    // N0 = select Mask, Mul, 0
    // vecreduce.add N0
    if (ResVT != RetTy || N0->getOpcode() != ISD::VSELECT ||
        !ISD::isBuildVectorAllZeros(N0->getOperand(2).getNode()))
      return false;
    Mask = N0->getOperand(0);
    SDValue Mul = N0->getOperand(1);
    if (Mul->getOpcode() == ExtendCode &&
        Mul->getOperand(0).getScalarValueSizeInBits() * 2 >=
            ResVT.getScalarSizeInBits())
      Mul = Mul->getOperand(0);
    if (Mul->getOpcode() != ISD::MUL)
      return false;
    SDValue ExtA = Mul->getOperand(0);
    SDValue ExtB = Mul->getOperand(1);
    if (ExtA->getOpcode() != ExtendCode || ExtB->getOpcode() != ExtendCode)
      return false;
    A = ExtA->getOperand(0);
    B = ExtB->getOperand(0);
    if (ExtTypeMatches(A, ExtTypes) && ExtTypeMatches(B, ExtTypes)) {
      A = ExtendIfNeeded(A, ExtendCode);
      B = ExtendIfNeeded(B, ExtendCode);
      return true;
    }
    return false;
  };
  auto Create64bitNode = [&](unsigned Opcode, ArrayRef<SDValue> Ops) {
    // Split illegal MVT::v16i8->i64 vector reductions into two legal
    // v8i16->i64 reductions. The operands are extended with MVEEXT, but as
    // they are reductions the lane orders do not matter. MVEEXT may be
    // combined with loads to produce two extending loads, or else they will
    // be expanded to VREV/VMOVL.
    EVT VT = Ops[0].getValueType();
    if (VT == MVT::v16i8) {
      assert((Opcode == ARMISD::VMLALVs || Opcode == ARMISD::VMLALVu) &&
             "Unexpected illegal long reduction opcode");
      bool IsUnsigned = Opcode == ARMISD::VMLALVu;
      SDValue Ext0 =
          DAG.getNode(IsUnsigned ? ARMISD::MVEZEXT : ARMISD::MVESEXT, dl,
                      DAG.getVTList(MVT::v8i16, MVT::v8i16), Ops[0]);
      SDValue Ext1 =
          DAG.getNode(IsUnsigned ? ARMISD::MVEZEXT : ARMISD::MVESEXT, dl,
                      DAG.getVTList(MVT::v8i16, MVT::v8i16), Ops[1]);
      SDValue MLA0 = DAG.getNode(Opcode, dl, DAG.getVTList(MVT::i32, MVT::i32),
                                 Ext0, Ext1);
      SDValue MLA1 =
          DAG.getNode(IsUnsigned ? ARMISD::VMLALVAu : ARMISD::VMLALVAs, dl,
                      DAG.getVTList(MVT::i32, MVT::i32), MLA0, MLA0.getValue(1),
                      Ext0.getValue(1), Ext1.getValue(1));
      return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, MLA1, MLA1.getValue(1));
    }
    SDValue Node = DAG.getNode(Opcode, dl, {MVT::i32, MVT::i32}, Ops);
    return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Node,
                       SDValue(Node.getNode(), 1));
  };
  SDValue A, B;
  SDValue Mask;
  if (IsVMLAV(MVT::i32, ISD::SIGN_EXTEND, {MVT::v8i16, MVT::v16i8}, A, B))
    return DAG.getNode(ARMISD::VMLAVs, dl, ResVT, A, B);
  if (IsVMLAV(MVT::i32, ISD::ZERO_EXTEND, {MVT::v8i16, MVT::v16i8}, A, B))
    return DAG.getNode(ARMISD::VMLAVu, dl, ResVT, A, B);
  if (IsVMLAV(MVT::i64, ISD::SIGN_EXTEND, {MVT::v16i8, MVT::v8i16, MVT::v4i32},
              A, B))
    return Create64bitNode(ARMISD::VMLALVs, {A, B});
  if (IsVMLAV(MVT::i64, ISD::ZERO_EXTEND, {MVT::v16i8, MVT::v8i16, MVT::v4i32},
              A, B))
    return Create64bitNode(ARMISD::VMLALVu, {A, B});
  if (IsVMLAV(MVT::i16, ISD::SIGN_EXTEND, {MVT::v16i8}, A, B))
    return DAG.getNode(ISD::TRUNCATE, dl, ResVT,
                       DAG.getNode(ARMISD::VMLAVs, dl, MVT::i32, A, B));
  if (IsVMLAV(MVT::i16, ISD::ZERO_EXTEND, {MVT::v16i8}, A, B))
    return DAG.getNode(ISD::TRUNCATE, dl, ResVT,
                       DAG.getNode(ARMISD::VMLAVu, dl, MVT::i32, A, B));
  if (IsPredVMLAV(MVT::i32, ISD::SIGN_EXTEND, {MVT::v8i16, MVT::v16i8}, A, B,
                  Mask))
    return DAG.getNode(ARMISD::VMLAVps, dl, ResVT, A, B, Mask);
  if (IsPredVMLAV(MVT::i32, ISD::ZERO_EXTEND, {MVT::v8i16, MVT::v16i8}, A, B,
                  Mask))
    return DAG.getNode(ARMISD::VMLAVpu, dl, ResVT, A, B, Mask);
  if (IsPredVMLAV(MVT::i64, ISD::SIGN_EXTEND, {MVT::v8i16, MVT::v4i32}, A, B,
                  Mask))
    return Create64bitNode(ARMISD::VMLALVps, {A, B, Mask});
  if (IsPredVMLAV(MVT::i64, ISD::ZERO_EXTEND, {MVT::v8i16, MVT::v4i32}, A, B,
                  Mask))
    return Create64bitNode(ARMISD::VMLALVpu, {A, B, Mask});
  if (IsPredVMLAV(MVT::i16, ISD::SIGN_EXTEND, {MVT::v16i8}, A, B, Mask))
    return DAG.getNode(ISD::TRUNCATE, dl, ResVT,
                       DAG.getNode(ARMISD::VMLAVps, dl, MVT::i32, A, B, Mask));
  if (IsPredVMLAV(MVT::i16, ISD::ZERO_EXTEND, {MVT::v16i8}, A, B, Mask))
    return DAG.getNode(ISD::TRUNCATE, dl, ResVT,
                       DAG.getNode(ARMISD::VMLAVpu, dl, MVT::i32, A, B, Mask));
  if (SDValue A = IsVADDV(MVT::i32, ISD::SIGN_EXTEND, {MVT::v8i16, MVT::v16i8}))
    return DAG.getNode(ARMISD::VADDVs, dl, ResVT, A);
  if (SDValue A = IsVADDV(MVT::i32, ISD::ZERO_EXTEND, {MVT::v8i16, MVT::v16i8}))
    return DAG.getNode(ARMISD::VADDVu, dl, ResVT, A);
  if (SDValue A = IsVADDV(MVT::i64, ISD::SIGN_EXTEND, {MVT::v4i32}))
    return Create64bitNode(ARMISD::VADDLVs, {A});
  if (SDValue A = IsVADDV(MVT::i64, ISD::ZERO_EXTEND, {MVT::v4i32}))
    return Create64bitNode(ARMISD::VADDLVu, {A});
  if (SDValue A = IsVADDV(MVT::i16, ISD::SIGN_EXTEND, {MVT::v16i8}))
    return DAG.getNode(ISD::TRUNCATE, dl, ResVT,
                       DAG.getNode(ARMISD::VADDVs, dl, MVT::i32, A));
  if (SDValue A = IsVADDV(MVT::i16, ISD::ZERO_EXTEND, {MVT::v16i8}))
    return DAG.getNode(ISD::TRUNCATE, dl, ResVT,
                       DAG.getNode(ARMISD::VADDVu, dl, MVT::i32, A));
  if (SDValue A = IsPredVADDV(MVT::i32, ISD::SIGN_EXTEND,
                              {MVT::v8i16, MVT::v16i8}, Mask))
    return DAG.getNode(ARMISD::VADDVps, dl, ResVT, A, Mask);
  if (SDValue A = IsPredVADDV(MVT::i32, ISD::ZERO_EXTEND,
                              {MVT::v8i16, MVT::v16i8}, Mask))
    return DAG.getNode(ARMISD::VADDVpu, dl, ResVT, A, Mask);
  if (SDValue A = IsPredVADDV(MVT::i64, ISD::SIGN_EXTEND, {MVT::v4i32}, Mask))
    return Create64bitNode(ARMISD::VADDLVps, {A, Mask});
  if (SDValue A = IsPredVADDV(MVT::i64, ISD::ZERO_EXTEND, {MVT::v4i32}, Mask))
    return Create64bitNode(ARMISD::VADDLVpu, {A, Mask});
  if (SDValue A = IsPredVADDV(MVT::i16, ISD::SIGN_EXTEND, {MVT::v16i8}, Mask))
    return DAG.getNode(ISD::TRUNCATE, dl, ResVT,
                       DAG.getNode(ARMISD::VADDVps, dl, MVT::i32, A, Mask));
  if (SDValue A = IsPredVADDV(MVT::i16, ISD::ZERO_EXTEND, {MVT::v16i8}, Mask))
    return DAG.getNode(ISD::TRUNCATE, dl, ResVT,
                       DAG.getNode(ARMISD::VADDVpu, dl, MVT::i32, A, Mask));
  // Some complications. We can get a case where the two inputs of the mul are
  // the same, then the output sext will have been helpfully converted to a
  // zext. Turn it back.
  SDValue Op = N0;
  if (Op->getOpcode() == ISD::VSELECT)
    Op = Op->getOperand(1);
  if (Op->getOpcode() == ISD::ZERO_EXTEND &&
      Op->getOperand(0)->getOpcode() == ISD::MUL) {
    SDValue Mul = Op->getOperand(0);
    if (Mul->getOperand(0) == Mul->getOperand(1) &&
        Mul->getOperand(0)->getOpcode() == ISD::SIGN_EXTEND) {
      SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, dl, N0->getValueType(0), Mul);
      if (Op != N0)
        Ext = DAG.getNode(ISD::VSELECT, dl, N0->getValueType(0),
                          N0->getOperand(0), Ext, N0->getOperand(2));
      return DAG.getNode(ISD::VECREDUCE_ADD, dl, ResVT, Ext);
    }
  }
  return SDValue();
}
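
// PerformVMOVNCombine - Target-specific DAG combine for ARMISD::VMOVN: drop
// undef operands, fold through bottom-lane VQMOVNs and narrow the demanded
// lanes of both inputs.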
static SDValue PerformVMOVNCombine(SDNode *N,
                                   TargetLowering::DAGCombinerInfo &DCI) {
  SDValue Op0 = N->getOperand(0);
  SDValue Op1 = N->getOperand(1);
  unsigned IsTop = N->getConstantOperandVal(2);
  // VMOVNT a undef -> a
  // VMOVNB a undef -> a
  // VMOVNB undef a -> a
  if (Op1->isUndef())
    return Op0;
  if (Op0->isUndef() && !IsTop)
    return Op1;
  // VMOVNt(c, VQMOVNb(a, b)) => VQMOVNt(c, b)
  // VMOVNb(c, VQMOVNb(a, b)) => VQMOVNb(c, b)
  if ((Op1->getOpcode() == ARMISD::VQMOVNs ||
       Op1->getOpcode() == ARMISD::VQMOVNu) &&
      Op1->getConstantOperandVal(2) == 0)
    return DCI.DAG.getNode(Op1->getOpcode(), SDLoc(Op1), N->getValueType(0),
                           Op0, Op1->getOperand(1), N->getOperand(2));
  // Only the bottom lanes from Qm (Op1) and either the top or bottom lanes from
  // Qd (Op0) are demanded from a VMOVN, depending on whether we are inserting
  // into the top or bottom lanes.
  unsigned NumElts = N->getValueType(0).getVectorNumElements();
  APInt Op1DemandedElts = APInt::getSplat(NumElts, APInt::getLowBitsSet(2, 1));
  APInt Op0DemandedElts =
      IsTop ? Op1DemandedElts
            : APInt::getSplat(NumElts, APInt::getHighBitsSet(2, 1));
  const TargetLowering &TLI = DCI.DAG.getTargetLoweringInfo();
  if (TLI.SimplifyDemandedVectorElts(Op0, Op0DemandedElts, DCI))
    return SDValue(N, 0);
  if (TLI.SimplifyDemandedVectorElts(Op1, Op1DemandedElts, DCI))
    return SDValue(N, 0);
  return SDValue();
}
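
// PerformVQMOVNCombine - Only the lanes of the Qd operand that are not
// overwritten by the VQMOVN insertion are demanded, so try to simplify the
// remaining lanes away.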
static SDValue PerformVQMOVNCombine(SDNode *N,
                                    TargetLowering::DAGCombinerInfo &DCI) {
  SDValue Op0 = N->getOperand(0);
  unsigned IsTop = N->getConstantOperandVal(2);
  unsigned NumElts = N->getValueType(0).getVectorNumElements();
  APInt Op0DemandedElts =
      APInt::getSplat(NumElts, IsTop ? APInt::getLowBitsSet(2, 1)
                                     : APInt::getHighBitsSet(2, 1));
  const TargetLowering &TLI = DCI.DAG.getTargetLoweringInfo();
  if (TLI.SimplifyDemandedVectorElts(Op0, Op0DemandedElts, DCI))
    return SDValue(N, 0);
  return SDValue();
}
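
// PerformLongShiftCombine - Canonicalise ARMISD::LSLL/LSRL long shifts: a
// shift by zero is just the pair of inputs, and a negative immediate becomes
// the opposite shift direction.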
static SDValue PerformLongShiftCombine(SDNode *N, SelectionDAG &DAG) {
  SDLoc DL(N);
  SDValue Op0 = N->getOperand(0);
  SDValue Op1 = N->getOperand(1);
  // Turn X << -C -> X >> C and vice versa. The negative shifts can come up
  // from uses of the intrinsics.
  if (auto C = dyn_cast<ConstantSDNode>(N->getOperand(2))) {
    int ShiftAmt = C->getSExtValue();
    if (ShiftAmt == 0) {
      SDValue Merge = DAG.getMergeValues({Op0, Op1}, DL);
      DAG.ReplaceAllUsesWith(N, Merge.getNode());
      return SDValue();
    }
    if (ShiftAmt >= -32 && ShiftAmt < 0) {
      unsigned NewOpcode =
          N->getOpcode() == ARMISD::LSLL ? ARMISD::LSRL : ARMISD::LSLL;
      SDValue NewShift = DAG.getNode(NewOpcode, DL, N->getVTList(), Op0, Op1,
                                     DAG.getConstant(-ShiftAmt, DL, MVT::i32));
      DAG.ReplaceAllUsesWith(N, NewShift.getNode());
      return NewShift;
    }
  }
  return SDValue();
}

/// PerformIntrinsicCombine - ARM-specific DAG combining for intrinsics.
SDValue ARMTargetLowering::PerformIntrinsicCombine(SDNode *N,
                                                   DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  unsigned IntNo = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
  switch (IntNo) {
  default:
    // Don't do anything for most intrinsics.
    break;
  // Vector shifts: check for immediate versions and lower them.
  // Note: This is done during DAG combining instead of DAG legalizing because
  // the build_vectors for 64-bit vector element shift counts are generally
  // not legal, and it is hard to see their values after they get legalized to
  // loads from a constant pool.
  case Intrinsic::arm_neon_vshifts:
  case Intrinsic::arm_neon_vshiftu:
  case Intrinsic::arm_neon_vrshifts:
  case Intrinsic::arm_neon_vrshiftu:
  case Intrinsic::arm_neon_vrshiftn:
  case Intrinsic::arm_neon_vqshifts:
  case Intrinsic::arm_neon_vqshiftu:
  case Intrinsic::arm_neon_vqshiftsu:
  case Intrinsic::arm_neon_vqshiftns:
  case Intrinsic::arm_neon_vqshiftnu:
  case Intrinsic::arm_neon_vqshiftnsu:
  case Intrinsic::arm_neon_vqrshiftns:
  case Intrinsic::arm_neon_vqrshiftnu:
  case Intrinsic::arm_neon_vqrshiftnsu: {
    EVT VT = N->getOperand(1).getValueType();
    int64_t Cnt;
    unsigned VShiftOpc = 0;
    switch (IntNo) {
    case Intrinsic::arm_neon_vshifts:
    case Intrinsic::arm_neon_vshiftu:
      if (isVShiftLImm(N->getOperand(2), VT, false, Cnt)) {
        VShiftOpc = ARMISD::VSHLIMM;
        break;
      }
      if (isVShiftRImm(N->getOperand(2), VT, false, true, Cnt)) {
        VShiftOpc = (IntNo == Intrinsic::arm_neon_vshifts ? ARMISD::VSHRsIMM
                                                          : ARMISD::VSHRuIMM);
        break;
      }
      return SDValue();
    case Intrinsic::arm_neon_vrshifts:
    case Intrinsic::arm_neon_vrshiftu:
      if (isVShiftRImm(N->getOperand(2), VT, false, true, Cnt))
        break;
      return SDValue();
    case Intrinsic::arm_neon_vqshifts:
    case Intrinsic::arm_neon_vqshiftu:
      if (isVShiftLImm(N->getOperand(2), VT, false, Cnt))
        break;
      return SDValue();
    case Intrinsic::arm_neon_vqshiftsu:
      if (isVShiftLImm(N->getOperand(2), VT, false, Cnt))
        break;
      llvm_unreachable("invalid shift count for vqshlu intrinsic");
    case Intrinsic::arm_neon_vrshiftn:
    case Intrinsic::arm_neon_vqshiftns:
    case Intrinsic::arm_neon_vqshiftnu:
    case Intrinsic::arm_neon_vqshiftnsu:
    case Intrinsic::arm_neon_vqrshiftns:
    case Intrinsic::arm_neon_vqrshiftnu:
    case Intrinsic::arm_neon_vqrshiftnsu:
      // Narrowing shifts require an immediate right shift.
      if (isVShiftRImm(N->getOperand(2), VT, true, true, Cnt))
        break;
      llvm_unreachable("invalid shift count for narrowing vector shift "
                       "intrinsic");
    default:
      llvm_unreachable("unhandled vector shift");
    }
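    // The shift amount has been validated above; now select the corresponding
    // target-specific immediate shift opcode.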
    switch (IntNo) {
    case Intrinsic::arm_neon_vshifts:
    case Intrinsic::arm_neon_vshiftu:
      // Opcode already set above.
      break;
    case Intrinsic::arm_neon_vrshifts:
      VShiftOpc = ARMISD::VRSHRsIMM;
      break;
    case Intrinsic::arm_neon_vrshiftu:
      VShiftOpc = ARMISD::VRSHRuIMM;
      break;
    case Intrinsic::arm_neon_vrshiftn:
      VShiftOpc = ARMISD::VRSHRNIMM;
      break;
    case Intrinsic::arm_neon_vqshifts:
      VShiftOpc = ARMISD::VQSHLsIMM;
      break;
    case Intrinsic::arm_neon_vqshiftu:
      VShiftOpc = ARMISD::VQSHLuIMM;
      break;
    case Intrinsic::arm_neon_vqshiftsu:
      VShiftOpc = ARMISD::VQSHLsuIMM;
      break;
    case Intrinsic::arm_neon_vqshiftns:
      VShiftOpc = ARMISD::VQSHRNsIMM;
      break;
    case Intrinsic::arm_neon_vqshiftnu:
      VShiftOpc = ARMISD::VQSHRNuIMM;
      break;
    case Intrinsic::arm_neon_vqshiftnsu:
      VShiftOpc = ARMISD::VQSHRNsuIMM;
      break;
    case Intrinsic::arm_neon_vqrshiftns:
      VShiftOpc = ARMISD::VQRSHRNsIMM;
      break;
    case Intrinsic::arm_neon_vqrshiftnu:
      VShiftOpc = ARMISD::VQRSHRNuIMM;
      break;
    case Intrinsic::arm_neon_vqrshiftnsu:
      VShiftOpc = ARMISD::VQRSHRNsuIMM;
      break;
    }
    SDLoc dl(N);
    return DAG.getNode(VShiftOpc, dl, N->getValueType(0),
                       N->getOperand(1), DAG.getConstant(Cnt, dl, MVT::i32));
  }
  case Intrinsic::arm_neon_vshiftins: {
    EVT VT = N->getOperand(1).getValueType();
    int64_t Cnt;
    unsigned VShiftOpc = 0;
    if (isVShiftLImm(N->getOperand(3), VT, false, Cnt))
      VShiftOpc = ARMISD::VSLIIMM;
    else if (isVShiftRImm(N->getOperand(3), VT, false, true, Cnt))
      VShiftOpc = ARMISD::VSRIIMM;
    else {
      llvm_unreachable("invalid shift count for vsli/vsri intrinsic");
    }
    SDLoc dl(N);
    return DAG.getNode(VShiftOpc, dl, N->getValueType(0),
                       N->getOperand(1), N->getOperand(2),
                       DAG.getConstant(Cnt, dl, MVT::i32));
  }
  case Intrinsic::arm_neon_vqrshifts:
  case Intrinsic::arm_neon_vqrshiftu:
    // No immediate versions of these to check for.
    break;
  case Intrinsic::arm_mve_vqdmlah:
  case Intrinsic::arm_mve_vqdmlash:
  case Intrinsic::arm_mve_vqrdmlah:
  case Intrinsic::arm_mve_vqrdmlash:
  case Intrinsic::arm_mve_vmla_n_predicated:
  case Intrinsic::arm_mve_vmlas_n_predicated:
  case Intrinsic::arm_mve_vqdmlah_predicated:
  case Intrinsic::arm_mve_vqdmlash_predicated:
  case Intrinsic::arm_mve_vqrdmlah_predicated:
  case Intrinsic::arm_mve_vqrdmlash_predicated: {
    // These intrinsics all take an i32 scalar operand which is narrowed to the
    // size of a single lane of the vector type they return. So we don't need
    // any bits of that operand above that point, which allows us to eliminate
    // uxth/sxth.
    unsigned BitWidth = N->getValueType(0).getScalarSizeInBits();
    APInt DemandedMask = APInt::getLowBitsSet(32, BitWidth);
    if (SimplifyDemandedBits(N->getOperand(3), DemandedMask, DCI))
      return SDValue();
    break;
  }
  case Intrinsic::arm_mve_minv:
  case Intrinsic::arm_mve_maxv:
  case Intrinsic::arm_mve_minav:
  case Intrinsic::arm_mve_maxav:
  case Intrinsic::arm_mve_minv_predicated:
  case Intrinsic::arm_mve_maxv_predicated:
  case Intrinsic::arm_mve_minav_predicated:
  case Intrinsic::arm_mve_maxav_predicated: {
    // These intrinsics all take an i32 scalar operand which is narrowed to the
    // size of a single lane of the vector type they take as the other input.
    unsigned BitWidth = N->getOperand(2)->getValueType(0).getScalarSizeInBits();
    APInt DemandedMask = APInt::getLowBitsSet(32, BitWidth);
    if (SimplifyDemandedBits(N->getOperand(1), DemandedMask, DCI))
      return SDValue();
    break;
  }
  case Intrinsic::arm_mve_addv: {
    // Turn this intrinsic straight into the appropriate ARMISD::VADDV node,
    // which allows PerformADDVecReduce to turn it into VADDLV when possible.
    bool Unsigned = cast<ConstantSDNode>(N->getOperand(2))->getZExtValue();
    unsigned Opc = Unsigned ? ARMISD::VADDVu : ARMISD::VADDVs;
    return DAG.getNode(Opc, SDLoc(N), N->getVTList(), N->getOperand(1));
  }
  case Intrinsic::arm_mve_addlv:
  case Intrinsic::arm_mve_addlv_predicated: {
    // Same for these, but ARMISD::VADDLV has to be followed by a BUILD_PAIR
    // which recombines the two outputs into an i64.
    bool Unsigned = cast<ConstantSDNode>(N->getOperand(2))->getZExtValue();
    unsigned Opc = IntNo == Intrinsic::arm_mve_addlv ?
                     (Unsigned ? ARMISD::VADDLVu : ARMISD::VADDLVs) :
                     (Unsigned ? ARMISD::VADDLVpu : ARMISD::VADDLVps);
    SmallVector<SDValue, 4> Ops;
    for (unsigned i = 1, e = N->getNumOperands(); i < e; i++)
      if (i != 2) // skip the unsigned flag
        Ops.push_back(N->getOperand(i));
    SDLoc dl(N);
    SDValue val = DAG.getNode(Opc, dl, {MVT::i32, MVT::i32}, Ops);
    return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, val.getValue(0),
                       val.getValue(1));
  }
  }
  return SDValue();
}

/// PerformShiftCombine - Checks for immediate versions of vector shifts and
/// lowers them. As with the vector shift intrinsics, this is done during DAG
/// combining instead of DAG legalizing because the build_vectors for 64-bit
/// vector element shift counts are generally not legal, and it is hard to see
/// their values after they get legalized to loads from a constant pool.
static SDValue PerformShiftCombine(SDNode *N,
                                   TargetLowering::DAGCombinerInfo &DCI,
                                   const ARMSubtarget *ST) {
  SelectionDAG &DAG = DCI.DAG;
  EVT VT = N->getValueType(0);
  if (ST->isThumb1Only() && N->getOpcode() == ISD::SHL && VT == MVT::i32 &&
      N->getOperand(0)->getOpcode() == ISD::AND &&
      N->getOperand(0)->hasOneUse()) {
    if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
      return SDValue();
    // Look for the pattern (shl (and x, AndMask), ShiftAmt). This doesn't
    // usually show up because instcombine prefers to canonicalize it to
    // (and (shl x, ShiftAmt) (shl AndMask, ShiftAmt)), but the shift can come
    // out of GEP lowering in some cases.
    SDValue N0 = N->getOperand(0);
    ConstantSDNode *ShiftAmtNode = dyn_cast<ConstantSDNode>(N->getOperand(1));
    if (!ShiftAmtNode)
      return SDValue();
    uint32_t ShiftAmt = static_cast<uint32_t>(ShiftAmtNode->getZExtValue());
    ConstantSDNode *AndMaskNode = dyn_cast<ConstantSDNode>(N0->getOperand(1));
    if (!AndMaskNode)
      return SDValue();
    uint32_t AndMask = static_cast<uint32_t>(AndMaskNode->getZExtValue());
    // Don't transform uxtb/uxth.
    if (AndMask == 255 || AndMask == 65535)
      return SDValue();
    if (isMask_32(AndMask)) {
      uint32_t MaskedBits = countLeadingZeros(AndMask);
      if (MaskedBits > ShiftAmt) {
        SDLoc DL(N);
        SDValue SHL = DAG.getNode(ISD::SHL, DL, MVT::i32, N0->getOperand(0),
                                  DAG.getConstant(MaskedBits, DL, MVT::i32));
        return DAG.getNode(
            ISD::SRL, DL, MVT::i32, SHL,
            DAG.getConstant(MaskedBits - ShiftAmt, DL, MVT::i32));
      }
    }
  }
  // Nothing to be done for scalar shifts.
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  if (!VT.isVector() || !TLI.isTypeLegal(VT))
    return SDValue();
  if (ST->hasMVEIntegerOps())
    return SDValue();
  int64_t Cnt;
  switch (N->getOpcode()) {
  default: llvm_unreachable("unexpected shift opcode");
  case ISD::SHL:
    if (isVShiftLImm(N->getOperand(1), VT, false, Cnt)) {
      SDLoc dl(N);
      return DAG.getNode(ARMISD::VSHLIMM, dl, VT, N->getOperand(0),
                         DAG.getConstant(Cnt, dl, MVT::i32));
    }
    break;
  case ISD::SRA:
  case ISD::SRL:
    if (isVShiftRImm(N->getOperand(1), VT, false, false, Cnt)) {
      unsigned VShiftOpc =
          (N->getOpcode() == ISD::SRA ? ARMISD::VSHRsIMM : ARMISD::VSHRuIMM);
      SDLoc dl(N);
      return DAG.getNode(VShiftOpc, dl, VT, N->getOperand(0),
                         DAG.getConstant(Cnt, dl, MVT::i32));
    }
  }
  return SDValue();
}

// Look for a sign/zero/fpextend extend of a larger than legal load. This can be
// split into multiple extending loads, which are simpler to deal with than an
// arbitrary extend. For fp extends we use an integer extending load and a VCVTL
// to convert the type to an f32.
static SDValue PerformSplittingToWideningLoad(SDNode *N, SelectionDAG &DAG) {
  SDValue N0 = N->getOperand(0);
  if (N0.getOpcode() != ISD::LOAD)
    return SDValue();
  LoadSDNode *LD = cast<LoadSDNode>(N0.getNode());
  if (!LD->isSimple() || !N0.hasOneUse() || LD->isIndexed() ||
      LD->getExtensionType() != ISD::NON_EXTLOAD)
    return SDValue();
  EVT FromVT = LD->getValueType(0);
  EVT ToVT = N->getValueType(0);
  if (!ToVT.isVector())
    return SDValue();
  assert(FromVT.getVectorNumElements() == ToVT.getVectorNumElements());
  EVT ToEltVT = ToVT.getVectorElementType();
  EVT FromEltVT = FromVT.getVectorElementType();
  unsigned NumElements = 0;
  if (ToEltVT == MVT::i32 && FromEltVT == MVT::i8)
    NumElements = 4;
  if (ToEltVT == MVT::f32 && FromEltVT == MVT::f16)
    NumElements = 4;
  if (NumElements == 0 ||
      (FromEltVT != MVT::f16 && FromVT.getVectorNumElements() == NumElements) ||
      FromVT.getVectorNumElements() % NumElements != 0 ||
      !isPowerOf2_32(NumElements))
    return SDValue();
  LLVMContext &C = *DAG.getContext();
  SDLoc DL(LD);
  // Details about the old load
  SDValue Ch = LD->getChain();
  SDValue BasePtr = LD->getBasePtr();
  Align Alignment = LD->getOriginalAlign();
  MachineMemOperand::Flags MMOFlags = LD->getMemOperand()->getFlags();
  AAMDNodes AAInfo = LD->getAAInfo();
  ISD::LoadExtType NewExtType =
      N->getOpcode() == ISD::SIGN_EXTEND ? ISD::SEXTLOAD : ISD::ZEXTLOAD;
  SDValue Offset = DAG.getUNDEF(BasePtr.getValueType());
  EVT NewFromVT = EVT::getVectorVT(
      C, EVT::getIntegerVT(C, FromEltVT.getScalarSizeInBits()), NumElements);
  EVT NewToVT = EVT::getVectorVT(
      C, EVT::getIntegerVT(C, ToEltVT.getScalarSizeInBits()), NumElements);
  SmallVector<SDValue, 4> Loads;
  SmallVector<SDValue, 4> Chains;
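  // Create one smaller extending load per legal-sized chunk of the original
  // vector, offsetting the pointer for each piece.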
  for (unsigned i = 0; i < FromVT.getVectorNumElements() / NumElements; i++) {
    unsigned NewOffset = (i * NewFromVT.getSizeInBits()) / 8;
    SDValue NewPtr =
        DAG.getObjectPtrOffset(DL, BasePtr, TypeSize::Fixed(NewOffset));
    SDValue NewLoad =
        DAG.getLoad(ISD::UNINDEXED, NewExtType, NewToVT, DL, Ch, NewPtr, Offset,
                    LD->getPointerInfo().getWithOffset(NewOffset), NewFromVT,
                    Alignment, MMOFlags, AAInfo);
    Loads.push_back(NewLoad);
    Chains.push_back(SDValue(NewLoad.getNode(), 1));
  }
  // Float truncs need to be extended with VCVTB's into their floating point
  // types.
  if (FromEltVT == MVT::f16) {
    SmallVector<SDValue, 4> Extends;
    for (unsigned i = 0; i < Loads.size(); i++) {
      SDValue LoadBC =
          DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, MVT::v8f16, Loads[i]);
      SDValue FPExt = DAG.getNode(ARMISD::VCVTL, DL, MVT::v4f32, LoadBC,
                                  DAG.getConstant(0, DL, MVT::i32));
      Extends.push_back(FPExt);
    }
    Loads = Extends;
  }
  SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
  DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), NewChain);
  return DAG.getNode(ISD::CONCAT_VECTORS, DL, ToVT, Loads);
}

/// PerformExtendCombine - Target-specific DAG combining for ISD::SIGN_EXTEND,
/// ISD::ZERO_EXTEND, and ISD::ANY_EXTEND.
static SDValue PerformExtendCombine(SDNode *N, SelectionDAG &DAG,
                                    const ARMSubtarget *ST) {
  SDValue N0 = N->getOperand(0);
  // Check for sign- and zero-extensions of vector extract operations of 8- and
  // 16-bit vector elements. NEON and MVE support these directly. They are
  // handled during DAG combining because type legalization will promote them
  // to 32-bit types and it is messy to recognize the operations after that.
  if ((ST->hasNEON() || ST->hasMVEIntegerOps()) &&
      N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
    SDValue Vec = N0.getOperand(0);
    SDValue Lane = N0.getOperand(1);
    EVT VT = N->getValueType(0);
    EVT EltVT = N0.getValueType();
    const TargetLowering &TLI = DAG.getTargetLoweringInfo();
    if (VT == MVT::i32 &&
        (EltVT == MVT::i8 || EltVT == MVT::i16) &&
        TLI.isTypeLegal(Vec.getValueType()) &&
        isa<ConstantSDNode>(Lane)) {
      unsigned Opc = 0;
      switch (N->getOpcode()) {
      default: llvm_unreachable("unexpected opcode");
      case ISD::SIGN_EXTEND:
        Opc = ARMISD::VGETLANEs;
        break;
      case ISD::ZERO_EXTEND:
      case ISD::ANY_EXTEND:
        Opc = ARMISD::VGETLANEu;
        break;
      }
      return DAG.getNode(Opc, SDLoc(N), VT, Vec, Lane);
    }
  }
  if (ST->hasMVEIntegerOps())
    if (SDValue NewLoad = PerformSplittingToWideningLoad(N, DAG))
      return NewLoad;
  return SDValue();
}
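
// PerformFPExtendCombine - Target-specific DAG combining for floating point
// extends; with MVE float ops, a larger-than-legal extending load can be
// split into multiple widening loads.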
static SDValue PerformFPExtendCombine(SDNode *N, SelectionDAG &DAG,
                                      const ARMSubtarget *ST) {
  if (ST->hasMVEFloatOps())
    if (SDValue NewLoad = PerformSplittingToWideningLoad(N, DAG))
      return NewLoad;
  return SDValue();
}

// Lower smin(smax(x, C1), C2) to ssat or usat, if they have saturating
// constant bounds.
static SDValue PerformMinMaxToSatCombine(SDValue Op, SelectionDAG &DAG,
                                         const ARMSubtarget *Subtarget) {
  if ((Subtarget->isThumb() || !Subtarget->hasV6Ops()) &&
      !Subtarget->isThumb2())
    return SDValue();
  EVT VT = Op.getValueType();
  SDValue Op0 = Op.getOperand(0);
  if (VT != MVT::i32 ||
      (Op0.getOpcode() != ISD::SMIN && Op0.getOpcode() != ISD::SMAX) ||
      !isa<ConstantSDNode>(Op.getOperand(1)) ||
      !isa<ConstantSDNode>(Op0.getOperand(1)))
    return SDValue();
  SDValue Min = Op;
  SDValue Max = Op0;
  SDValue Input = Op0.getOperand(0);
  if (Min.getOpcode() == ISD::SMAX)
    std::swap(Min, Max);
  APInt MinC = Min.getConstantOperandAPInt(1);
  APInt MaxC = Max.getConstantOperandAPInt(1);
  if (Min.getOpcode() != ISD::SMIN || Max.getOpcode() != ISD::SMAX ||
      !(MinC + 1).isPowerOf2())
    return SDValue();
  SDLoc DL(Op);
  if (MinC == ~MaxC)
    return DAG.getNode(ARMISD::SSAT, DL, VT, Input,
                       DAG.getConstant(MinC.countTrailingOnes(), DL, VT));
  if (MaxC == 0)
    return DAG.getNode(ARMISD::USAT, DL, VT, Input,
                       DAG.getConstant(MinC.countTrailingOnes(), DL, VT));
  return SDValue();
}

/// PerformMinMaxCombine - Target-specific DAG combining for creating truncating
/// saturates.
static SDValue PerformMinMaxCombine(SDNode *N, SelectionDAG &DAG,
                                    const ARMSubtarget *ST) {
  EVT VT = N->getValueType(0);
  SDValue N0 = N->getOperand(0);
  if (VT == MVT::i32)
    return PerformMinMaxToSatCombine(SDValue(N, 0), DAG, ST);
  if (!ST->hasMVEIntegerOps())
    return SDValue();
  if (SDValue V = PerformVQDMULHCombine(N, DAG))
    return V;
  if (VT != MVT::v4i32 && VT != MVT::v8i16)
    return SDValue();
  auto IsSignedSaturate = [&](SDNode *Min, SDNode *Max) {
    // Check one is a smin and the other is a smax
    if (Min->getOpcode() != ISD::SMIN)
      std::swap(Min, Max);
    if (Min->getOpcode() != ISD::SMIN || Max->getOpcode() != ISD::SMAX)
      return false;
    APInt SaturateC;
    if (VT == MVT::v4i32)
      SaturateC = APInt(32, (1 << 15) - 1, true);
    else // if (VT == MVT::v8i16)
      SaturateC = APInt(16, (1 << 7) - 1, true);
    APInt MinC, MaxC;
    if (!ISD::isConstantSplatVector(Min->getOperand(1).getNode(), MinC) ||
        MinC != SaturateC)
      return false;
    if (!ISD::isConstantSplatVector(Max->getOperand(1).getNode(), MaxC) ||
        MaxC != ~SaturateC)
      return false;
    return true;
  };
  if (IsSignedSaturate(N, N0.getNode())) {
    SDLoc DL(N);
    MVT ExtVT, HalfVT;
    if (VT == MVT::v4i32) {
      HalfVT = MVT::v8i16;
      ExtVT = MVT::v4i16;
    } else { // if (VT == MVT::v8i16)
      HalfVT = MVT::v16i8;
      ExtVT = MVT::v8i8;
    }
    // Create a VQMOVNB with undef top lanes, then sign-extend into the top
    // half. That extend will hopefully be removed if only the bottom bits are
    // demanded (through a truncating store, for example).
    SDValue VQMOVN =
        DAG.getNode(ARMISD::VQMOVNs, DL, HalfVT, DAG.getUNDEF(HalfVT),
                    N0->getOperand(0), DAG.getConstant(0, DL, MVT::i32));
    SDValue Bitcast = DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, VQMOVN);
    return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, Bitcast,
                       DAG.getValueType(ExtVT));
  }
  auto IsUnsignedSaturate = [&](SDNode *Min) {
    // For unsigned, we just need to check for <= 0xffff
    if (Min->getOpcode() != ISD::UMIN)
      return false;
    APInt SaturateC;
    if (VT == MVT::v4i32)
      SaturateC = APInt(32, (1 << 16) - 1, true);
    else // if (VT == MVT::v8i16)
      SaturateC = APInt(16, (1 << 8) - 1, true);
    APInt MinC;
    if (!ISD::isConstantSplatVector(Min->getOperand(1).getNode(), MinC) ||
        MinC != SaturateC)
      return false;
    return true;
  };
  if (IsUnsignedSaturate(N)) {
    SDLoc DL(N);
    MVT HalfVT;
    unsigned ExtConst;
    if (VT == MVT::v4i32) {
      HalfVT = MVT::v8i16;
      ExtConst = 0x0000FFFF;
    } else { // if (VT == MVT::v8i16)
      HalfVT = MVT::v16i8;
      ExtConst = 0x00FF;
    }
    // Create a VQMOVNB with undef top lanes, then ZExt into the top half with
    // an AND. That extend will hopefully be removed if only the bottom bits are
    // demanded (through a truncating store, for example).
    SDValue VQMOVN =
        DAG.getNode(ARMISD::VQMOVNu, DL, HalfVT, DAG.getUNDEF(HalfVT), N0,
                    DAG.getConstant(0, DL, MVT::i32));
    SDValue Bitcast = DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, VQMOVN);
    return DAG.getNode(ISD::AND, DL, VT, Bitcast,
                       DAG.getConstant(ExtConst, DL, VT));
  }
  return SDValue();
}
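
// Return a pointer to the APInt value of V if it is a power-of-2 constant,
// or nullptr otherwise.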
static const APInt *isPowerOf2Constant(SDValue V) {
  ConstantSDNode *C = dyn_cast<ConstantSDNode>(V);
  if (!C)
    return nullptr;
  const APInt *CV = &C->getAPIntValue();
  return CV->isPowerOf2() ? CV : nullptr;
}

SDValue ARMTargetLowering::PerformCMOVToBFICombine(SDNode *CMOV,
                                                   SelectionDAG &DAG) const {
  // If we have a CMOV, OR and AND combination such as:
  //   if (x & CN)
  //     y |= CM;
  //
  // And:
  //   * CN is a single bit;
  //   * All bits covered by CM are known zero in y
  //
  // Then we can convert this into a sequence of BFI instructions. This will
  // always be a win if CM is a single bit, will always be no worse than the
  // TST&OR sequence if CM is two bits, and for thumb will be no worse if CM is
  // three bits (due to the extra IT instruction).
  SDValue Op0 = CMOV->getOperand(0);
  SDValue Op1 = CMOV->getOperand(1);
  auto CCNode = cast<ConstantSDNode>(CMOV->getOperand(2));
  auto CC = CCNode->getAPIntValue().getLimitedValue();
  SDValue CmpZ = CMOV->getOperand(4);
  // The compare must be against zero.
  if (!isNullConstant(CmpZ->getOperand(1)))
    return SDValue();
  assert(CmpZ->getOpcode() == ARMISD::CMPZ);
  SDValue And = CmpZ->getOperand(0);
  if (And->getOpcode() != ISD::AND)
    return SDValue();
  const APInt *AndC = isPowerOf2Constant(And->getOperand(1));
  if (!AndC)
    return SDValue();
  SDValue X = And->getOperand(0);
  if (CC == ARMCC::EQ) {
    // We're performing an "equal to zero" compare. Swap the operands so we
    // canonicalize on a "not equal to zero" compare.
    std::swap(Op0, Op1);
  } else {
    assert(CC == ARMCC::NE && "How can a CMPZ node not be EQ or NE?");
  }
  if (Op1->getOpcode() != ISD::OR)
    return SDValue();
  ConstantSDNode *OrC = dyn_cast<ConstantSDNode>(Op1->getOperand(1));
  if (!OrC)
    return SDValue();
  SDValue Y = Op1->getOperand(0);
  if (Op0 != Y)
    return SDValue();
  // Now, is it profitable to continue?
  APInt OrCI = OrC->getAPIntValue();
  unsigned Heuristic = Subtarget->isThumb() ? 3 : 2;
  if (OrCI.countPopulation() > Heuristic)
    return SDValue();
  // Lastly, can we determine that the bits defined by OrCI
  // are zero in Y?
  KnownBits Known = DAG.computeKnownBits(Y);
  if ((OrCI & Known.Zero) != OrCI)
    return SDValue();
  // OK, we can do the combine.
  SDValue V = Y;
  SDLoc dl(X);
  EVT VT = X.getValueType();
  unsigned BitInX = AndC->logBase2();
  if (BitInX != 0) {
    // We must shift X first.
    X = DAG.getNode(ISD::SRL, dl, VT, X,
                    DAG.getConstant(BitInX, dl, VT));
  }
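  // X now has the tested bit in position 0; insert it into each set bit
  // position of the OR constant with a chain of BFIs.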
  for (unsigned BitInY = 0, NumActiveBits = OrCI.getActiveBits();
       BitInY < NumActiveBits; ++BitInY) {
    if (OrCI[BitInY] == 0)
      continue;
    APInt Mask(VT.getSizeInBits(), 0);
    Mask.setBit(BitInY);
    V = DAG.getNode(ARMISD::BFI, dl, VT, V, X,
                    // Confusingly, the operand is an *inverted* mask.
                    DAG.getConstant(~Mask, dl, VT));
  }
  return V;
}

// Given N, the value controlling the conditional branch, search for the loop
// intrinsic, returning it, along with how the value is used. We need to handle
// patterns such as the following:
// (brcond (xor (setcc (loop.decrement), 0, ne), 1), exit)
// (brcond (setcc (loop.decrement), 0, eq), exit)
// (brcond (setcc (loop.decrement), 0, ne), header)
static SDValue SearchLoopIntrinsic(SDValue N, ISD::CondCode &CC, int &Imm,
                                   bool &Negate) {
  switch (N->getOpcode()) {
  default:
    break;
  case ISD::XOR: {
    if (!isa<ConstantSDNode>(N.getOperand(1)))
      return SDValue();
    if (!cast<ConstantSDNode>(N.getOperand(1))->isOne())
      return SDValue();
    Negate = !Negate;
    return SearchLoopIntrinsic(N.getOperand(0), CC, Imm, Negate);
  }
  case ISD::SETCC: {
    auto *Const = dyn_cast<ConstantSDNode>(N.getOperand(1));
    if (!Const)
      return SDValue();
    if (Const->isZero())
      Imm = 0;
    else if (Const->isOne())
      Imm = 1;
    else
      return SDValue();
    CC = cast<CondCodeSDNode>(N.getOperand(2))->get();
    return SearchLoopIntrinsic(N->getOperand(0), CC, Imm, Negate);
  }
  case ISD::INTRINSIC_W_CHAIN: {
    unsigned IntOp = cast<ConstantSDNode>(N.getOperand(1))->getZExtValue();
    if (IntOp != Intrinsic::test_start_loop_iterations &&
        IntOp != Intrinsic::loop_decrement_reg)
      return SDValue();
    return N;
  }
  }
  return SDValue();
}
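
// PerformHWLoopCombine - Convert brcond/br_cc uses of the hardware loop
// intrinsics (test.start.loop.iterations / loop.decrement.reg) into
// ARMISD::WLS and ARMISD::LOOP_DEC/LE nodes.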
static SDValue PerformHWLoopCombine(SDNode *N,
                                    TargetLowering::DAGCombinerInfo &DCI,
                                    const ARMSubtarget *ST) {
  // The hwloop intrinsics that we're interested in are used for control-flow,
  // either for entering or exiting the loop:
  // - test.start.loop.iterations will test whether its operand is zero. If it
  //   is zero, the proceeding branch should not enter the loop.
  // - loop.decrement.reg also tests whether its operand is zero. If it is
  //   zero, the proceeding branch should not branch back to the beginning of
  //   the loop.
  // So here, we need to check how the brcond is using the result of each of
  // the intrinsics to ensure that we're branching to the right place at the
  // right time.
  ISD::CondCode CC;
  SDValue Cond;
  int Imm = 1;
  bool Negate = false;
  SDValue Chain = N->getOperand(0);
  SDValue Dest;
  if (N->getOpcode() == ISD::BRCOND) {
    CC = ISD::SETEQ;
    Cond = N->getOperand(1);
    Dest = N->getOperand(2);
  } else {
    assert(N->getOpcode() == ISD::BR_CC && "Expected BRCOND or BR_CC!");
    CC = cast<CondCodeSDNode>(N->getOperand(1))->get();
    Cond = N->getOperand(2);
    Dest = N->getOperand(4);
    if (auto *Const = dyn_cast<ConstantSDNode>(N->getOperand(3))) {
      if (!Const->isOne() && !Const->isZero())
        return SDValue();
      Imm = Const->getZExtValue();
    } else
      return SDValue();
  }
  SDValue Int = SearchLoopIntrinsic(Cond, CC, Imm, Negate);
  if (!Int)
    return SDValue();
  if (Negate)
    CC = ISD::getSetCCInverse(CC, /* Integer inverse */ MVT::i32);
  auto IsTrueIfZero = [](ISD::CondCode CC, int Imm) {
    return (CC == ISD::SETEQ && Imm == 0) ||
           (CC == ISD::SETNE && Imm == 1) ||
           (CC == ISD::SETLT && Imm == 1) ||
           (CC == ISD::SETULT && Imm == 1);
  };
  auto IsFalseIfZero = [](ISD::CondCode CC, int Imm) {
    return (CC == ISD::SETEQ && Imm == 1) ||
           (CC == ISD::SETNE && Imm == 0) ||
           (CC == ISD::SETGT && Imm == 0) ||
           (CC == ISD::SETUGT && Imm == 0) ||
           (CC == ISD::SETGE && Imm == 1) ||
           (CC == ISD::SETUGE && Imm == 1);
  };
  assert((IsTrueIfZero(CC, Imm) || IsFalseIfZero(CC, Imm)) &&
         "unsupported condition");
  SDLoc dl(Int);
  SelectionDAG &DAG = DCI.DAG;
  SDValue Elements = Int.getOperand(2);
  unsigned IntOp = cast<ConstantSDNode>(Int->getOperand(1))->getZExtValue();
  assert((N->hasOneUse() && N->use_begin()->getOpcode() == ISD::BR)
         && "expected single br user");
  SDNode *Br = *N->use_begin();
  SDValue OtherTarget = Br->getOperand(1);
  // Update the unconditional branch to branch to the given Dest.
  auto UpdateUncondBr = [](SDNode *Br, SDValue Dest, SelectionDAG &DAG) {
    SDValue NewBrOps[] = { Br->getOperand(0), Dest };
    SDValue NewBr = DAG.getNode(ISD::BR, SDLoc(Br), MVT::Other, NewBrOps);
    DAG.ReplaceAllUsesOfValueWith(SDValue(Br, 0), NewBr);
  };
  if (IntOp == Intrinsic::test_start_loop_iterations) {
    SDValue Res;
    SDValue Setup = DAG.getNode(ARMISD::WLSSETUP, dl, MVT::i32, Elements);
    // We expect this 'instruction' to branch when the counter is zero.
    if (IsTrueIfZero(CC, Imm)) {
      SDValue Ops[] = {Chain, Setup, Dest};
      Res = DAG.getNode(ARMISD::WLS, dl, MVT::Other, Ops);
    } else {
      // The logic is the reverse of what we need for WLS, so find the other
      // basic block target: the target of the proceeding br.
      UpdateUncondBr(Br, Dest, DAG);
      SDValue Ops[] = {Chain, Setup, OtherTarget};
      Res = DAG.getNode(ARMISD::WLS, dl, MVT::Other, Ops);
    }
    // Update LR count to the new value
    DAG.ReplaceAllUsesOfValueWith(Int.getValue(0), Setup);
    // Update chain
    DAG.ReplaceAllUsesOfValueWith(Int.getValue(2), Int.getOperand(0));
    return Res;
  } else {
    SDValue Size = DAG.getTargetConstant(
        cast<ConstantSDNode>(Int.getOperand(3))->getZExtValue(), dl, MVT::i32);
    SDValue Args[] = { Int.getOperand(0), Elements, Size, };
    SDValue LoopDec = DAG.getNode(ARMISD::LOOP_DEC, dl,
                                  DAG.getVTList(MVT::i32, MVT::Other), Args);
    DAG.ReplaceAllUsesWith(Int.getNode(), LoopDec.getNode());
    // We expect this instruction to branch when the count is not zero.
    SDValue Target = IsFalseIfZero(CC, Imm) ? Dest : OtherTarget;
    // Update the unconditional branch to target the loop preheader if we've
    // found the condition has been reversed.
    if (Target == OtherTarget)
      UpdateUncondBr(Br, Dest, DAG);
    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
                        SDValue(LoopDec.getNode(), 1), Chain);
    SDValue EndArgs[] = { Chain, SDValue(LoopDec.getNode(), 0), Target };
    return DAG.getNode(ARMISD::LE, dl, MVT::Other, EndArgs);
  }
  return SDValue();
}
  16068. /// PerformBRCONDCombine - Target-specific DAG combining for ARMISD::BRCOND.
  16069. SDValue
  16070. ARMTargetLowering::PerformBRCONDCombine(SDNode *N, SelectionDAG &DAG) const {
  16071. SDValue Cmp = N->getOperand(4);
  16072. if (Cmp.getOpcode() != ARMISD::CMPZ)
  16073. // Only looking at NE cases.
  16074. return SDValue();
  16075. EVT VT = N->getValueType(0);
  16076. SDLoc dl(N);
  16077. SDValue LHS = Cmp.getOperand(0);
  16078. SDValue RHS = Cmp.getOperand(1);
  16079. SDValue Chain = N->getOperand(0);
  16080. SDValue BB = N->getOperand(1);
  16081. SDValue ARMcc = N->getOperand(2);
  16082. ARMCC::CondCodes CC =
  16083. (ARMCC::CondCodes)cast<ConstantSDNode>(ARMcc)->getZExtValue();
  16084. // (brcond Chain BB ne CPSR (cmpz (and (cmov 0 1 CC CPSR Cmp) 1) 0))
  16085. // -> (brcond Chain BB CC CPSR Cmp)
  16086. if (CC == ARMCC::NE && LHS.getOpcode() == ISD::AND && LHS->hasOneUse() &&
  16087. LHS->getOperand(0)->getOpcode() == ARMISD::CMOV &&
  16088. LHS->getOperand(0)->hasOneUse()) {
  16089. auto *LHS00C = dyn_cast<ConstantSDNode>(LHS->getOperand(0)->getOperand(0));
  16090. auto *LHS01C = dyn_cast<ConstantSDNode>(LHS->getOperand(0)->getOperand(1));
  16091. auto *LHS1C = dyn_cast<ConstantSDNode>(LHS->getOperand(1));
  16092. auto *RHSC = dyn_cast<ConstantSDNode>(RHS);
  16093. if ((LHS00C && LHS00C->getZExtValue() == 0) &&
  16094. (LHS01C && LHS01C->getZExtValue() == 1) &&
  16095. (LHS1C && LHS1C->getZExtValue() == 1) &&
  16096. (RHSC && RHSC->getZExtValue() == 0)) {
  16097. return DAG.getNode(
  16098. ARMISD::BRCOND, dl, VT, Chain, BB, LHS->getOperand(0)->getOperand(2),
  16099. LHS->getOperand(0)->getOperand(3), LHS->getOperand(0)->getOperand(4));
  16100. }
  16101. }
  16102. return SDValue();
  16103. }
  16104. /// PerformCMOVCombine - Target-specific DAG combining for ARMISD::CMOV.
  16105. SDValue
  16106. ARMTargetLowering::PerformCMOVCombine(SDNode *N, SelectionDAG &DAG) const {
  16107. SDValue Cmp = N->getOperand(4);
  16108. if (Cmp.getOpcode() != ARMISD::CMPZ)
  16109. // Only looking at EQ and NE cases.
  16110. return SDValue();
  16111. EVT VT = N->getValueType(0);
  16112. SDLoc dl(N);
  16113. SDValue LHS = Cmp.getOperand(0);
  16114. SDValue RHS = Cmp.getOperand(1);
  16115. SDValue FalseVal = N->getOperand(0);
  16116. SDValue TrueVal = N->getOperand(1);
  16117. SDValue ARMcc = N->getOperand(2);
  16118. ARMCC::CondCodes CC =
  16119. (ARMCC::CondCodes)cast<ConstantSDNode>(ARMcc)->getZExtValue();
  16120. // BFI is only available on V6T2+.
  16121. if (!Subtarget->isThumb1Only() && Subtarget->hasV6T2Ops()) {
  16122. SDValue R = PerformCMOVToBFICombine(N, DAG);
  16123. if (R)
  16124. return R;
  16125. }
  16126. // Simplify
  16127. // mov r1, r0
  16128. // cmp r1, x
  16129. // mov r0, y
  16130. // moveq r0, x
  16131. // to
  16132. // cmp r0, x
  16133. // movne r0, y
  16134. //
  16135. // mov r1, r0
  16136. // cmp r1, x
  16137. // mov r0, x
  16138. // movne r0, y
  16139. // to
  16140. // cmp r0, x
  16141. // movne r0, y
  16142. /// FIXME: Turn this into a target neutral optimization?
  16143. SDValue Res;
  16144. if (CC == ARMCC::NE && FalseVal == RHS && FalseVal != LHS) {
  16145. Res = DAG.getNode(ARMISD::CMOV, dl, VT, LHS, TrueVal, ARMcc,
  16146. N->getOperand(3), Cmp);
  16147. } else if (CC == ARMCC::EQ && TrueVal == RHS) {
  16148. SDValue ARMcc;
  16149. SDValue NewCmp = getARMCmp(LHS, RHS, ISD::SETNE, ARMcc, DAG, dl);
  16150. Res = DAG.getNode(ARMISD::CMOV, dl, VT, LHS, FalseVal, ARMcc,
  16151. N->getOperand(3), NewCmp);
  16152. }
  16153. // (cmov F T ne CPSR (cmpz (cmov 0 1 CC CPSR Cmp) 0))
  16154. // -> (cmov F T CC CPSR Cmp)
  16155. if (CC == ARMCC::NE && LHS.getOpcode() == ARMISD::CMOV && LHS->hasOneUse()) {
  16156. auto *LHS0C = dyn_cast<ConstantSDNode>(LHS->getOperand(0));
  16157. auto *LHS1C = dyn_cast<ConstantSDNode>(LHS->getOperand(1));
  16158. auto *RHSC = dyn_cast<ConstantSDNode>(RHS);
  16159. if ((LHS0C && LHS0C->getZExtValue() == 0) &&
  16160. (LHS1C && LHS1C->getZExtValue() == 1) &&
  16161. (RHSC && RHSC->getZExtValue() == 0)) {
  16162. return DAG.getNode(ARMISD::CMOV, dl, VT, FalseVal, TrueVal,
  16163. LHS->getOperand(2), LHS->getOperand(3),
  16164. LHS->getOperand(4));
  16165. }
  16166. }
  16167. if (!VT.isInteger())
  16168. return SDValue();
  // Fold away an unnecessary CMPZ/CMOV
  // CMOV A, B, C1, $cpsr, (CMPZ (CMOV 1, 0, C2, D), 0) ->
  //       if C1==EQ -> CMOV A, B, C2, $cpsr, D
  //       if C1==NE -> CMOV A, B, NOT(C2), $cpsr, D
  if (N->getConstantOperandVal(2) == ARMCC::EQ ||
      N->getConstantOperandVal(2) == ARMCC::NE) {
    ARMCC::CondCodes Cond;
    if (SDValue C = IsCMPZCSINC(N->getOperand(4).getNode(), Cond)) {
      if (N->getConstantOperandVal(2) == ARMCC::NE)
        Cond = ARMCC::getOppositeCondition(Cond);
      return DAG.getNode(N->getOpcode(), SDLoc(N), MVT::i32, N->getOperand(0),
                         N->getOperand(1),
                         DAG.getTargetConstant(Cond, SDLoc(N), MVT::i32),
                         N->getOperand(3), C);
    }
  }

  // Materialize a boolean comparison for integers so we can avoid branching.
  if (isNullConstant(FalseVal)) {
    if (CC == ARMCC::EQ && isOneConstant(TrueVal)) {
      if (!Subtarget->isThumb1Only() && Subtarget->hasV5TOps()) {
        // If x == y then x - y == 0 and ARM's CLZ will return 32, shifting it
        // right 5 bits will make that 32 be 1, otherwise it will be 0.
        // CMOV 0, 1, ==, (CMPZ x, y) -> SRL (CTLZ (SUB x, y)), 5
        SDValue Sub = DAG.getNode(ISD::SUB, dl, VT, LHS, RHS);
        Res = DAG.getNode(ISD::SRL, dl, VT, DAG.getNode(ISD::CTLZ, dl, VT, Sub),
                          DAG.getConstant(5, dl, MVT::i32));
      } else {
        // CMOV 0, 1, ==, (CMPZ x, y) ->
        //   (ADDCARRY (SUB x, y), t:0, t:1)
        // where t = (SUBCARRY 0, (SUB x, y), 0)
        //
        // The SUBCARRY computes 0 - (x - y) and this will give a borrow when
        // x != y. In other words, a carry C == 1 when x == y, C == 0
        // otherwise.
        // The final ADDCARRY computes
        //   x - y + (0 - (x - y)) + C == C
        SDValue Sub = DAG.getNode(ISD::SUB, dl, VT, LHS, RHS);
        SDVTList VTs = DAG.getVTList(VT, MVT::i32);
        SDValue Neg = DAG.getNode(ISD::USUBO, dl, VTs, FalseVal, Sub);
        // ISD::SUBCARRY returns a borrow but we want the carry here
        // actually.
        SDValue Carry =
            DAG.getNode(ISD::SUB, dl, MVT::i32,
                        DAG.getConstant(1, dl, MVT::i32), Neg.getValue(1));
        Res = DAG.getNode(ISD::ADDCARRY, dl, VTs, Sub, Neg, Carry);
      }
    } else if (CC == ARMCC::NE && !isNullConstant(RHS) &&
               (!Subtarget->isThumb1Only() || isPowerOf2Constant(TrueVal))) {
      // This seems pointless but will allow us to combine it further below.
      // CMOV 0, z, !=, (CMPZ x, y) -> CMOV (SUBS x, y), z, !=, (SUBS x, y):1
      SDValue Sub =
          DAG.getNode(ARMISD::SUBS, dl, DAG.getVTList(VT, MVT::i32), LHS, RHS);
      SDValue CPSRGlue = DAG.getCopyToReg(DAG.getEntryNode(), dl, ARM::CPSR,
                                          Sub.getValue(1), SDValue());
      Res = DAG.getNode(ARMISD::CMOV, dl, VT, Sub, TrueVal, ARMcc,
                        N->getOperand(3), CPSRGlue.getValue(1));
      FalseVal = Sub;
    }
  } else if (isNullConstant(TrueVal)) {
    if (CC == ARMCC::EQ && !isNullConstant(RHS) &&
        (!Subtarget->isThumb1Only() || isPowerOf2Constant(FalseVal))) {
      // This seems pointless but will allow us to combine it further below
      // Note that we change == for != as this is the dual for the case above.
      // CMOV z, 0, ==, (CMPZ x, y) -> CMOV (SUBS x, y), z, !=, (SUBS x, y):1
      SDValue Sub =
          DAG.getNode(ARMISD::SUBS, dl, DAG.getVTList(VT, MVT::i32), LHS, RHS);
      SDValue CPSRGlue = DAG.getCopyToReg(DAG.getEntryNode(), dl, ARM::CPSR,
                                          Sub.getValue(1), SDValue());
      Res = DAG.getNode(ARMISD::CMOV, dl, VT, Sub, FalseVal,
                        DAG.getConstant(ARMCC::NE, dl, MVT::i32),
                        N->getOperand(3), CPSRGlue.getValue(1));
      FalseVal = Sub;
    }
  }

  // On Thumb1, the DAG above may be further combined if z is a power of 2
  // (z == 2 ^ K).
  // CMOV (SUBS x, y), z, !=, (SUBS x, y):1 ->
  // t1 = (USUBO (SUB x, y), 1)
  // t2 = (SUBCARRY (SUB x, y), t1:0, t1:1)
  // Result = if K != 0 then (SHL t2:0, K) else t2:0
  //
  // This also handles the special case of comparing against zero; it's
  // essentially, the same pattern, except there's no SUBS:
  // CMOV x, z, !=, (CMPZ x, 0) ->
  // t1 = (USUBO x, 1)
  // t2 = (SUBCARRY x, t1:0, t1:1)
  // Result = if K != 0 then (SHL t2:0, K) else t2:0
  const APInt *TrueConst;
  if (Subtarget->isThumb1Only() && CC == ARMCC::NE &&
      ((FalseVal.getOpcode() == ARMISD::SUBS &&
        FalseVal.getOperand(0) == LHS && FalseVal.getOperand(1) == RHS) ||
       (FalseVal == LHS && isNullConstant(RHS))) &&
      (TrueConst = isPowerOf2Constant(TrueVal))) {
    SDVTList VTs = DAG.getVTList(VT, MVT::i32);
    unsigned ShiftAmount = TrueConst->logBase2();
    if (ShiftAmount)
      TrueVal = DAG.getConstant(1, dl, VT);
    SDValue Subc = DAG.getNode(ISD::USUBO, dl, VTs, FalseVal, TrueVal);
    Res = DAG.getNode(ISD::SUBCARRY, dl, VTs, FalseVal, Subc, Subc.getValue(1));
    if (ShiftAmount)
      Res = DAG.getNode(ISD::SHL, dl, VT, Res,
                        DAG.getConstant(ShiftAmount, dl, MVT::i32));
  }

  if (Res.getNode()) {
    KnownBits Known = DAG.computeKnownBits(SDValue(N, 0));
    // Capture demanded bits information that would be otherwise lost.
    if (Known.Zero == 0xfffffffe)
      Res = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Res,
                        DAG.getValueType(MVT::i1));
    else if (Known.Zero == 0xffffff00)
      Res = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Res,
                        DAG.getValueType(MVT::i8));
    else if (Known.Zero == 0xffff0000)
      Res = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Res,
                        DAG.getValueType(MVT::i16));
  }

  return Res;
}
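
/// PerformBITCASTCombine - Target-specific DAG combining for ISD::BITCAST.
/// Folds bitcasts of VDUPs and of element-wise immediate vector constants
/// (VMOVIMM/VMVNIMM/VMOVFPIMM) into VDUP/VECTOR_REG_CAST nodes when no lane
/// reordering is required.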
static SDValue PerformBITCASTCombine(SDNode *N,
                                     TargetLowering::DAGCombinerInfo &DCI,
                                     const ARMSubtarget *ST) {
  SelectionDAG &DAG = DCI.DAG;
  SDValue Src = N->getOperand(0);
  EVT DstVT = N->getValueType(0);

  // Convert v4f32 bitcast (v4i32 vdup (i32)) -> v4f32 vdup (i32) under MVE.
  if (ST->hasMVEIntegerOps() && Src.getOpcode() == ARMISD::VDUP) {
    EVT SrcVT = Src.getValueType();
    if (SrcVT.getScalarSizeInBits() == DstVT.getScalarSizeInBits())
      return DAG.getNode(ARMISD::VDUP, SDLoc(N), DstVT, Src.getOperand(0));
  }

  // We may have a bitcast of something that has already had this bitcast
  // combine performed on it, so skip past any VECTOR_REG_CASTs.
  while (Src.getOpcode() == ARMISD::VECTOR_REG_CAST)
    Src = Src.getOperand(0);

  // Bitcast from element-wise VMOV or VMVN doesn't need VREV if the VREV that
  // would be generated is at least the width of the element type.
  EVT SrcVT = Src.getValueType();
  if ((Src.getOpcode() == ARMISD::VMOVIMM ||
       Src.getOpcode() == ARMISD::VMVNIMM ||
       Src.getOpcode() == ARMISD::VMOVFPIMM) &&
      SrcVT.getScalarSizeInBits() <= DstVT.getScalarSizeInBits() &&
      DAG.getDataLayout().isBigEndian())
    return DAG.getNode(ARMISD::VECTOR_REG_CAST, SDLoc(N), DstVT, Src);

  // bitcast(extract(x, n)); bitcast(extract(x, n+1)) -> VMOVRRD x
  if (SDValue R = PerformExtractEltToVMOVRRD(N, DCI))
    return R;

  return SDValue();
}

// Some combines for the MVETrunc truncations legalizer helper. Also lowers the
// node into stack operations after legalizeOps.
SDValue ARMTargetLowering::PerformMVETruncCombine(
    SDNode *N, TargetLowering::DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  EVT VT = N->getValueType(0);
  SDLoc DL(N);

  // MVETrunc(Undef, Undef) -> Undef
  if (all_of(N->ops(), [](SDValue Op) { return Op.isUndef(); }))
    return DAG.getUNDEF(VT);

  // MVETrunc(MVETrunc a b, MVETrunc c, d) -> MVETrunc
  if (N->getNumOperands() == 2 &&
      N->getOperand(0).getOpcode() == ARMISD::MVETRUNC &&
      N->getOperand(1).getOpcode() == ARMISD::MVETRUNC)
    return DAG.getNode(ARMISD::MVETRUNC, DL, VT, N->getOperand(0).getOperand(0),
                       N->getOperand(0).getOperand(1),
                       N->getOperand(1).getOperand(0),
                       N->getOperand(1).getOperand(1));

  // MVETrunc(shuffle, shuffle) -> VMOVN
  if (N->getNumOperands() == 2 &&
      N->getOperand(0).getOpcode() == ISD::VECTOR_SHUFFLE &&
      N->getOperand(1).getOpcode() == ISD::VECTOR_SHUFFLE) {
    auto *S0 = cast<ShuffleVectorSDNode>(N->getOperand(0).getNode());
    auto *S1 = cast<ShuffleVectorSDNode>(N->getOperand(1).getNode());

    if (S0->getOperand(0) == S1->getOperand(0) &&
        S0->getOperand(1) == S1->getOperand(1)) {
      // Construct complete shuffle mask
      SmallVector<int, 8> Mask(S0->getMask());
      Mask.append(S1->getMask().begin(), S1->getMask().end());

      if (isVMOVNTruncMask(Mask, VT, false))
        return DAG.getNode(
            ARMISD::VMOVN, DL, VT,
            DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, S0->getOperand(0)),
            DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, S0->getOperand(1)),
            DAG.getConstant(1, DL, MVT::i32));
      if (isVMOVNTruncMask(Mask, VT, true))
        return DAG.getNode(
            ARMISD::VMOVN, DL, VT,
            DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, S0->getOperand(1)),
            DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, S0->getOperand(0)),
            DAG.getConstant(1, DL, MVT::i32));
    }
  }

  // For MVETrunc of a buildvector or shuffle, it can be beneficial to lower the
  // truncate to a buildvector to allow the generic optimisations to kick in.
  if (all_of(N->ops(), [](SDValue Op) {
        return Op.getOpcode() == ISD::BUILD_VECTOR ||
               Op.getOpcode() == ISD::VECTOR_SHUFFLE ||
               (Op.getOpcode() == ISD::BITCAST &&
                Op.getOperand(0).getOpcode() == ISD::BUILD_VECTOR);
      })) {
    SmallVector<SDValue, 8> Extracts;
    for (unsigned Op = 0; Op < N->getNumOperands(); Op++) {
      SDValue O = N->getOperand(Op);
      for (unsigned i = 0; i < O.getValueType().getVectorNumElements(); i++) {
        SDValue Ext = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, O,
                                  DAG.getConstant(i, DL, MVT::i32));
        Extracts.push_back(Ext);
      }
    }
    return DAG.getBuildVector(VT, DL, Extracts);
  }

  // If we are late in the legalization process and nothing has optimised
  // the trunc to anything better, lower it to a stack store and reload,
  // performing the truncation whilst keeping the lanes in the correct order:
  //   VSTRH.32 a, stack; VSTRH.32 b, stack+8; VLDRW.32 stack;
  if (!DCI.isAfterLegalizeDAG())
    return SDValue();

  SDValue StackPtr = DAG.CreateStackTemporary(TypeSize::Fixed(16), Align(4));
  int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
  int NumIns = N->getNumOperands();
  assert((NumIns == 2 || NumIns == 4) &&
         "Expected 2 or 4 inputs to an MVETrunc");
  EVT StoreVT = VT.getHalfNumVectorElementsVT(*DAG.getContext());
  if (N->getNumOperands() == 4)
    StoreVT = StoreVT.getHalfNumVectorElementsVT(*DAG.getContext());

  SmallVector<SDValue> Chains;
  for (int I = 0; I < NumIns; I++) {
    SDValue Ptr = DAG.getNode(
        ISD::ADD, DL, StackPtr.getValueType(), StackPtr,
        DAG.getConstant(I * 16 / NumIns, DL, StackPtr.getValueType()));
    MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(
        DAG.getMachineFunction(), SPFI, I * 16 / NumIns);
    SDValue Ch = DAG.getTruncStore(DAG.getEntryNode(), DL, N->getOperand(I),
                                   Ptr, MPI, StoreVT, Align(4));
    Chains.push_back(Ch);
  }

  SDValue Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
  MachinePointerInfo MPI =
      MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI, 0);
  return DAG.getLoad(VT, DL, Chain, StackPtr, MPI, Align(4));
}

// Take a MVEEXT(load x) and split that into (extload x, extload x+8)
static SDValue PerformSplittingMVEEXTToWideningLoad(SDNode *N,
                                                    SelectionDAG &DAG) {
  SDValue N0 = N->getOperand(0);
  LoadSDNode *LD = dyn_cast<LoadSDNode>(N0.getNode());
  if (!LD || !LD->isSimple() || !N0.hasOneUse() || LD->isIndexed())
    return SDValue();

  EVT FromVT = LD->getMemoryVT();
  EVT ToVT = N->getValueType(0);
  if (!ToVT.isVector())
    return SDValue();
  assert(FromVT.getVectorNumElements() == ToVT.getVectorNumElements() * 2);
  EVT ToEltVT = ToVT.getVectorElementType();
  EVT FromEltVT = FromVT.getVectorElementType();

  unsigned NumElements = 0;
  if (ToEltVT == MVT::i32 && (FromEltVT == MVT::i16 || FromEltVT == MVT::i8))
    NumElements = 4;
  if (ToEltVT == MVT::i16 && FromEltVT == MVT::i8)
    NumElements = 8;
  assert(NumElements != 0);

  ISD::LoadExtType NewExtType =
      N->getOpcode() == ARMISD::MVESEXT ? ISD::SEXTLOAD : ISD::ZEXTLOAD;
  if (LD->getExtensionType() != ISD::NON_EXTLOAD &&
      LD->getExtensionType() != ISD::EXTLOAD &&
      LD->getExtensionType() != NewExtType)
    return SDValue();

  LLVMContext &C = *DAG.getContext();
  SDLoc DL(LD);
  // Details about the old load
  SDValue Ch = LD->getChain();
  SDValue BasePtr = LD->getBasePtr();
  Align Alignment = LD->getOriginalAlign();
  MachineMemOperand::Flags MMOFlags = LD->getMemOperand()->getFlags();
  AAMDNodes AAInfo = LD->getAAInfo();

  SDValue Offset = DAG.getUNDEF(BasePtr.getValueType());
  EVT NewFromVT = EVT::getVectorVT(
      C, EVT::getIntegerVT(C, FromEltVT.getScalarSizeInBits()), NumElements);
  EVT NewToVT = EVT::getVectorVT(
      C, EVT::getIntegerVT(C, ToEltVT.getScalarSizeInBits()), NumElements);

  SmallVector<SDValue, 4> Loads;
  SmallVector<SDValue, 4> Chains;
  for (unsigned i = 0; i < FromVT.getVectorNumElements() / NumElements; i++) {
    unsigned NewOffset = (i * NewFromVT.getSizeInBits()) / 8;
    SDValue NewPtr =
        DAG.getObjectPtrOffset(DL, BasePtr, TypeSize::Fixed(NewOffset));
    SDValue NewLoad =
        DAG.getLoad(ISD::UNINDEXED, NewExtType, NewToVT, DL, Ch, NewPtr, Offset,
                    LD->getPointerInfo().getWithOffset(NewOffset), NewFromVT,
                    Alignment, MMOFlags, AAInfo);
    Loads.push_back(NewLoad);
    Chains.push_back(SDValue(NewLoad.getNode(), 1));
  }

  SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
  DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), NewChain);
  return DAG.getMergeValues(Loads, DL);
}
// Perform combines for MVEEXT. If it has not been optimized to anything better
// before lowering, it gets converted to a stack store and extloads performing
// the extend whilst still keeping the same lane ordering.
SDValue ARMTargetLowering::PerformMVEExtCombine(
    SDNode *N, TargetLowering::DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  EVT VT = N->getValueType(0);
  SDLoc DL(N);
  assert(N->getNumValues() == 2 && "Expected MVEEXT with 2 elements");
  assert((VT == MVT::v4i32 || VT == MVT::v8i16) && "Unexpected MVEEXT type");

  EVT ExtVT = N->getOperand(0).getValueType().getHalfNumVectorElementsVT(
      *DAG.getContext());
  auto Extend = [&](SDValue V) {
    SDValue VVT = DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, V);
    return N->getOpcode() == ARMISD::MVESEXT
               ? DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, VVT,
                             DAG.getValueType(ExtVT))
               : DAG.getZeroExtendInReg(VVT, DL, ExtVT);
  };

  // MVEEXT(VDUP) -> SIGN_EXTEND_INREG(VDUP)
  if (N->getOperand(0).getOpcode() == ARMISD::VDUP) {
    SDValue Ext = Extend(N->getOperand(0));
    return DAG.getMergeValues({Ext, Ext}, DL);
  }

  // MVEEXT(shuffle) -> SIGN_EXTEND_INREG/ZERO_EXTEND_INREG
  if (auto *SVN = dyn_cast<ShuffleVectorSDNode>(N->getOperand(0))) {
    ArrayRef<int> Mask = SVN->getMask();
    assert(Mask.size() == 2 * VT.getVectorNumElements());
    assert(Mask.size() == SVN->getValueType(0).getVectorNumElements());
    unsigned Rev = VT == MVT::v4i32 ? ARMISD::VREV32 : ARMISD::VREV16;
    SDValue Op0 = SVN->getOperand(0);
    SDValue Op1 = SVN->getOperand(1);

    auto CheckInregMask = [&](int Start, int Offset) {
      for (int Idx = 0, E = VT.getVectorNumElements(); Idx < E; ++Idx)
        if (Mask[Start + Idx] >= 0 && Mask[Start + Idx] != Idx * 2 + Offset)
          return false;
      return true;
    };
    SDValue V0 = SDValue(N, 0);
    SDValue V1 = SDValue(N, 1);
    if (CheckInregMask(0, 0))
      V0 = Extend(Op0);
    else if (CheckInregMask(0, 1))
      V0 = Extend(DAG.getNode(Rev, DL, SVN->getValueType(0), Op0));
    else if (CheckInregMask(0, Mask.size()))
      V0 = Extend(Op1);
    else if (CheckInregMask(0, Mask.size() + 1))
      V0 = Extend(DAG.getNode(Rev, DL, SVN->getValueType(0), Op1));

    if (CheckInregMask(VT.getVectorNumElements(), Mask.size()))
      V1 = Extend(Op1);
    else if (CheckInregMask(VT.getVectorNumElements(), Mask.size() + 1))
      V1 = Extend(DAG.getNode(Rev, DL, SVN->getValueType(0), Op1));
    else if (CheckInregMask(VT.getVectorNumElements(), 0))
      V1 = Extend(Op0);
    else if (CheckInregMask(VT.getVectorNumElements(), 1))
      V1 = Extend(DAG.getNode(Rev, DL, SVN->getValueType(0), Op0));

    if (V0.getNode() != N || V1.getNode() != N)
      return DAG.getMergeValues({V0, V1}, DL);
  }

  // MVEEXT(load) -> extload, extload
  if (N->getOperand(0)->getOpcode() == ISD::LOAD)
    if (SDValue L = PerformSplittingMVEEXTToWideningLoad(N, DAG))
      return L;

  if (!DCI.isAfterLegalizeDAG())
    return SDValue();

  // Lower to a stack store and reload:
  //   VSTRW.32 a, stack; VLDRH.32 stack; VLDRH.32 stack+8;
  SDValue StackPtr = DAG.CreateStackTemporary(TypeSize::Fixed(16), Align(4));
  int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
  int NumOuts = N->getNumValues();
  assert((NumOuts == 2 || NumOuts == 4) &&
         "Expected 2 or 4 outputs to an MVEEXT");
  EVT LoadVT = N->getOperand(0).getValueType().getHalfNumVectorElementsVT(
      *DAG.getContext());
  if (N->getNumOperands() == 4)
    LoadVT = LoadVT.getHalfNumVectorElementsVT(*DAG.getContext());

  MachinePointerInfo MPI =
      MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI, 0);
  SDValue Chain = DAG.getStore(DAG.getEntryNode(), DL, N->getOperand(0),
                               StackPtr, MPI, Align(4));

  SmallVector<SDValue> Loads;
  for (int I = 0; I < NumOuts; I++) {
    SDValue Ptr = DAG.getNode(
        ISD::ADD, DL, StackPtr.getValueType(), StackPtr,
        DAG.getConstant(I * 16 / NumOuts, DL, StackPtr.getValueType()));
    MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(
        DAG.getMachineFunction(), SPFI, I * 16 / NumOuts);
    SDValue Load = DAG.getExtLoad(
        N->getOpcode() == ARMISD::MVESEXT ? ISD::SEXTLOAD : ISD::ZEXTLOAD, DL,
        VT, Chain, Ptr, MPI, LoadVT, Align(4));
    Loads.push_back(Load);
  }

  return DAG.getMergeValues(Loads, DL);
}
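
/// PerformDAGCombine - Top-level dispatch for ARM target-specific DAG
/// combining; forwards each node to the appropriate combine routine above
/// based on its opcode.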
SDValue ARMTargetLowering::PerformDAGCombine(SDNode *N,
                                             DAGCombinerInfo &DCI) const {
  switch (N->getOpcode()) {
  default: break;
  case ISD::SELECT_CC:
  case ISD::SELECT: return PerformSELECTCombine(N, DCI, Subtarget);
  case ISD::VSELECT: return PerformVSELECTCombine(N, DCI, Subtarget);
  case ISD::SETCC: return PerformVSetCCToVCTPCombine(N, DCI, Subtarget);
  case ISD::ABS: return PerformABSCombine(N, DCI, Subtarget);
  case ARMISD::ADDE: return PerformADDECombine(N, DCI, Subtarget);
  case ARMISD::UMLAL: return PerformUMLALCombine(N, DCI.DAG, Subtarget);
  case ISD::ADD: return PerformADDCombine(N, DCI, Subtarget);
  case ISD::SUB: return PerformSUBCombine(N, DCI, Subtarget);
  case ISD::MUL: return PerformMULCombine(N, DCI, Subtarget);
  case ISD::OR: return PerformORCombine(N, DCI, Subtarget);
  case ISD::XOR: return PerformXORCombine(N, DCI, Subtarget);
  case ISD::AND: return PerformANDCombine(N, DCI, Subtarget);
  case ISD::BRCOND:
  case ISD::BR_CC: return PerformHWLoopCombine(N, DCI, Subtarget);
  case ARMISD::ADDC:
  case ARMISD::SUBC: return PerformAddcSubcCombine(N, DCI, Subtarget);
  case ARMISD::SUBE: return PerformAddeSubeCombine(N, DCI, Subtarget);
  case ARMISD::BFI: return PerformBFICombine(N, DCI.DAG);
  case ARMISD::VMOVRRD: return PerformVMOVRRDCombine(N, DCI, Subtarget);
  case ARMISD::VMOVDRR: return PerformVMOVDRRCombine(N, DCI.DAG);
  case ARMISD::VMOVhr: return PerformVMOVhrCombine(N, DCI);
  case ARMISD::VMOVrh: return PerformVMOVrhCombine(N, DCI.DAG);
  case ISD::STORE: return PerformSTORECombine(N, DCI, Subtarget);
  case ISD::BUILD_VECTOR: return PerformBUILD_VECTORCombine(N, DCI, Subtarget);
  case ISD::INSERT_VECTOR_ELT: return PerformInsertEltCombine(N, DCI);
  case ISD::EXTRACT_VECTOR_ELT:
    return PerformExtractEltCombine(N, DCI, Subtarget);
  case ISD::SIGN_EXTEND_INREG: return PerformSignExtendInregCombine(N, DCI.DAG);
  case ISD::INSERT_SUBVECTOR: return PerformInsertSubvectorCombine(N, DCI);
  case ISD::VECTOR_SHUFFLE: return PerformVECTOR_SHUFFLECombine(N, DCI.DAG);
  case ARMISD::VDUPLANE: return PerformVDUPLANECombine(N, DCI, Subtarget);
  case ARMISD::VDUP: return PerformVDUPCombine(N, DCI.DAG, Subtarget);
  case ISD::FP_TO_SINT:
  case ISD::FP_TO_UINT:
    return PerformVCVTCombine(N, DCI.DAG, Subtarget);
  case ISD::FADD:
    return PerformFAddVSelectCombine(N, DCI.DAG, Subtarget);
  case ISD::FDIV:
    return PerformVDIVCombine(N, DCI.DAG, Subtarget);
  case ISD::INTRINSIC_WO_CHAIN:
    return PerformIntrinsicCombine(N, DCI);
  case ISD::SHL:
  case ISD::SRA:
  case ISD::SRL:
    return PerformShiftCombine(N, DCI, Subtarget);
  case ISD::SIGN_EXTEND:
  case ISD::ZERO_EXTEND:
  case ISD::ANY_EXTEND:
    return PerformExtendCombine(N, DCI.DAG, Subtarget);
  case ISD::FP_EXTEND:
    return PerformFPExtendCombine(N, DCI.DAG, Subtarget);
  case ISD::SMIN:
  case ISD::UMIN:
  case ISD::SMAX:
  case ISD::UMAX:
    return PerformMinMaxCombine(N, DCI.DAG, Subtarget);
  case ARMISD::CMOV:
    return PerformCMOVCombine(N, DCI.DAG);
  case ARMISD::BRCOND:
    return PerformBRCONDCombine(N, DCI.DAG);
  case ARMISD::CMPZ:
    return PerformCMPZCombine(N, DCI.DAG);
  case ARMISD::CSINC:
  case ARMISD::CSINV:
  case ARMISD::CSNEG:
    return PerformCSETCombine(N, DCI.DAG);
  case ISD::LOAD:
    return PerformLOADCombine(N, DCI, Subtarget);
  case ARMISD::VLD1DUP:
  case ARMISD::VLD2DUP:
  case ARMISD::VLD3DUP:
  case ARMISD::VLD4DUP:
    return PerformVLDCombine(N, DCI);
  case ARMISD::BUILD_VECTOR:
    return PerformARMBUILD_VECTORCombine(N, DCI);
  case ISD::BITCAST:
    return PerformBITCASTCombine(N, DCI, Subtarget);
  case ARMISD::PREDICATE_CAST:
    return PerformPREDICATE_CASTCombine(N, DCI);
  case ARMISD::VECTOR_REG_CAST:
    return PerformVECTOR_REG_CASTCombine(N, DCI.DAG, Subtarget);
  case ARMISD::MVETRUNC:
    return PerformMVETruncCombine(N, DCI);
  case ARMISD::MVESEXT:
  case ARMISD::MVEZEXT:
    return PerformMVEExtCombine(N, DCI);
  case ARMISD::VCMP:
    return PerformVCMPCombine(N, DCI.DAG, Subtarget);
  case ISD::VECREDUCE_ADD:
    return PerformVECREDUCE_ADDCombine(N, DCI.DAG, Subtarget);
  case ARMISD::VMOVN:
    return PerformVMOVNCombine(N, DCI);
  case ARMISD::VQMOVNs:
  case ARMISD::VQMOVNu:
    return PerformVQMOVNCombine(N, DCI);
  case ARMISD::ASRL:
  case ARMISD::LSRL:
  case ARMISD::LSLL:
    return PerformLongShiftCombine(N, DCI.DAG);
  case ARMISD::SMULWB: {
    unsigned BitWidth = N->getValueType(0).getSizeInBits();
    APInt DemandedMask = APInt::getLowBitsSet(BitWidth, 16);
    if (SimplifyDemandedBits(N->getOperand(1), DemandedMask, DCI))
      return SDValue();
    break;
  }
  case ARMISD::SMULWT: {
    unsigned BitWidth = N->getValueType(0).getSizeInBits();
    APInt DemandedMask = APInt::getHighBitsSet(BitWidth, 16);
    if (SimplifyDemandedBits(N->getOperand(1), DemandedMask, DCI))
      return SDValue();
    break;
  }
  case ARMISD::SMLALBB:
  case ARMISD::QADD16b:
  case ARMISD::QSUB16b:
  case ARMISD::UQADD16b:
  case ARMISD::UQSUB16b: {
    unsigned BitWidth = N->getValueType(0).getSizeInBits();
    APInt DemandedMask = APInt::getLowBitsSet(BitWidth, 16);
    if ((SimplifyDemandedBits(N->getOperand(0), DemandedMask, DCI)) ||
        (SimplifyDemandedBits(N->getOperand(1), DemandedMask, DCI)))
      return SDValue();
    break;
  }
  case ARMISD::SMLALBT: {
    unsigned LowWidth = N->getOperand(0).getValueType().getSizeInBits();
    APInt LowMask = APInt::getLowBitsSet(LowWidth, 16);
    unsigned HighWidth = N->getOperand(1).getValueType().getSizeInBits();
    APInt HighMask = APInt::getHighBitsSet(HighWidth, 16);
    if ((SimplifyDemandedBits(N->getOperand(0), LowMask, DCI)) ||
        (SimplifyDemandedBits(N->getOperand(1), HighMask, DCI)))
      return SDValue();
    break;
  }
  case ARMISD::SMLALTB: {
    unsigned HighWidth = N->getOperand(0).getValueType().getSizeInBits();
    APInt HighMask = APInt::getHighBitsSet(HighWidth, 16);
    unsigned LowWidth = N->getOperand(1).getValueType().getSizeInBits();
    APInt LowMask = APInt::getLowBitsSet(LowWidth, 16);
    if ((SimplifyDemandedBits(N->getOperand(0), HighMask, DCI)) ||
        (SimplifyDemandedBits(N->getOperand(1), LowMask, DCI)))
      return SDValue();
    break;
  }
  case ARMISD::SMLALTT: {
    unsigned BitWidth = N->getValueType(0).getSizeInBits();
    APInt DemandedMask = APInt::getHighBitsSet(BitWidth, 16);
    if ((SimplifyDemandedBits(N->getOperand(0), DemandedMask, DCI)) ||
        (SimplifyDemandedBits(N->getOperand(1), DemandedMask, DCI)))
      return SDValue();
    break;
  }
  case ARMISD::QADD8b:
  case ARMISD::QSUB8b:
  case ARMISD::UQADD8b:
  case ARMISD::UQSUB8b: {
    unsigned BitWidth = N->getValueType(0).getSizeInBits();
    APInt DemandedMask = APInt::getLowBitsSet(BitWidth, 8);
    if ((SimplifyDemandedBits(N->getOperand(0), DemandedMask, DCI)) ||
        (SimplifyDemandedBits(N->getOperand(1), DemandedMask, DCI)))
      return SDValue();
    break;
  }
  case ISD::INTRINSIC_VOID:
  case ISD::INTRINSIC_W_CHAIN:
    switch (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue()) {
    case Intrinsic::arm_neon_vld1:
    case Intrinsic::arm_neon_vld1x2:
    case Intrinsic::arm_neon_vld1x3:
    case Intrinsic::arm_neon_vld1x4:
    case Intrinsic::arm_neon_vld2:
    case Intrinsic::arm_neon_vld3:
    case Intrinsic::arm_neon_vld4:
    case Intrinsic::arm_neon_vld2lane:
    case Intrinsic::arm_neon_vld3lane:
    case Intrinsic::arm_neon_vld4lane:
    case Intrinsic::arm_neon_vld2dup:
    case Intrinsic::arm_neon_vld3dup:
    case Intrinsic::arm_neon_vld4dup:
    case Intrinsic::arm_neon_vst1:
    case Intrinsic::arm_neon_vst1x2:
    case Intrinsic::arm_neon_vst1x3:
    case Intrinsic::arm_neon_vst1x4:
    case Intrinsic::arm_neon_vst2:
    case Intrinsic::arm_neon_vst3:
    case Intrinsic::arm_neon_vst4:
    case Intrinsic::arm_neon_vst2lane:
    case Intrinsic::arm_neon_vst3lane:
    case Intrinsic::arm_neon_vst4lane:
      return PerformVLDCombine(N, DCI);
    case Intrinsic::arm_mve_vld2q:
    case Intrinsic::arm_mve_vld4q:
    case Intrinsic::arm_mve_vst2q:
    case Intrinsic::arm_mve_vst4q:
      return PerformMVEVLDCombine(N, DCI);
    default: break;
    }
    break;
  }
  return SDValue();
}
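
// f32 loads and stores can be performed as plain integer (i32) operations, so
// report that transforming them is desirable.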
bool ARMTargetLowering::isDesirableToTransformToIntegerOp(unsigned Opc,
                                                          EVT VT) const {
  return (VT == MVT::f32) && (Opc == ISD::LOAD || Opc == ISD::STORE);
}
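
/// allowsMisalignedMemoryAccesses - Return true if an unaligned access of the
/// given type is permitted on this subtarget, and set *Fast to whether such
/// an access is also expected to be fast.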
bool ARMTargetLowering::allowsMisalignedMemoryAccesses(EVT VT, unsigned,
                                                       Align Alignment,
                                                       MachineMemOperand::Flags,
                                                       unsigned *Fast) const {
  // Depends what it gets converted into if the type is weird.
  if (!VT.isSimple())
    return false;

  // The AllowsUnaligned flag models the SCTLR.A setting in ARM cpus
  bool AllowsUnaligned = Subtarget->allowsUnalignedMem();
  auto Ty = VT.getSimpleVT().SimpleTy;

  if (Ty == MVT::i8 || Ty == MVT::i16 || Ty == MVT::i32) {
    // Unaligned access can use (for example) LDRB, LDRH, LDR
    if (AllowsUnaligned) {
      if (Fast)
        *Fast = Subtarget->hasV7Ops();
      return true;
    }
  }

  if (Ty == MVT::f64 || Ty == MVT::v2f64) {
    // For any little-endian targets with neon, we can support unaligned ld/st
    // of D and Q (e.g. {D0,D1}) registers by using vld1.i8/vst1.i8.
    // A big-endian target may also explicitly support unaligned accesses
    if (Subtarget->hasNEON() && (AllowsUnaligned || Subtarget->isLittle())) {
      if (Fast)
        *Fast = 1;
      return true;
    }
  }

  if (!Subtarget->hasMVEIntegerOps())
    return false;

  // These are for predicates
  if ((Ty == MVT::v16i1 || Ty == MVT::v8i1 || Ty == MVT::v4i1 ||
       Ty == MVT::v2i1)) {
    if (Fast)
      *Fast = 1;
    return true;
  }

  // These are for truncated stores/narrowing loads. They are fine so long as
  // the alignment is at least the size of the item being loaded
  if ((Ty == MVT::v4i8 || Ty == MVT::v8i8 || Ty == MVT::v4i16) &&
      Alignment >= VT.getScalarSizeInBits() / 8) {
    if (Fast)
      *Fast = true;
    return true;
  }

  // In little-endian MVE, the store instructions VSTRB.U8, VSTRH.U16 and
  // VSTRW.U32 all store the vector register in exactly the same format, and
  // differ only in the range of their immediate offset field and the required
  // alignment. So there is always a store that can be used, regardless of
  // actual type.
  //
  // For big endian, that is not the case, but we can still emit a (VSTRB.U8;
  // VREV64.8) pair and get the same effect. This will likely be better than
  // aligning the vector through the stack.
  if (Ty == MVT::v16i8 || Ty == MVT::v8i16 || Ty == MVT::v8f16 ||
      Ty == MVT::v4i32 || Ty == MVT::v4f32 || Ty == MVT::v2i64 ||
      Ty == MVT::v2f64) {
    if (Fast)
      *Fast = 1;
    return true;
  }

  return false;
}
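
// Pick a wide vector type for inline memcpy/memset expansion when NEON is
// available and the access is (or may legally be) sufficiently aligned;
// otherwise defer to the generic lowering.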
EVT ARMTargetLowering::getOptimalMemOpType(
    const MemOp &Op, const AttributeList &FuncAttributes) const {
  // See if we can use NEON instructions for this...
  if ((Op.isMemcpy() || Op.isZeroMemset()) && Subtarget->hasNEON() &&
      !FuncAttributes.hasFnAttr(Attribute::NoImplicitFloat)) {
    unsigned Fast;
    if (Op.size() >= 16 &&
        (Op.isAligned(Align(16)) ||
         (allowsMisalignedMemoryAccesses(MVT::v2f64, 0, Align(1),
                                         MachineMemOperand::MONone, &Fast) &&
          Fast))) {
      return MVT::v2f64;
    } else if (Op.size() >= 8 &&
               (Op.isAligned(Align(8)) ||
                (allowsMisalignedMemoryAccesses(
                     MVT::f64, 0, Align(1), MachineMemOperand::MONone, &Fast) &&
                 Fast))) {
      return MVT::f64;
    }
  }

  // Let the target-independent logic figure it out.
  return MVT::Other;
}

// 64-bit integers are split into their high and low parts and held in two
// different registers, so the trunc is free since the low register can just
// be used.
bool ARMTargetLowering::isTruncateFree(Type *SrcTy, Type *DstTy) const {
  if (!SrcTy->isIntegerTy() || !DstTy->isIntegerTy())
    return false;
  unsigned SrcBits = SrcTy->getPrimitiveSizeInBits();
  unsigned DestBits = DstTy->getPrimitiveSizeInBits();
  return (SrcBits == 64 && DestBits == 32);
}
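
// As above, but for value types that have already been legalized to EVTs.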
bool ARMTargetLowering::isTruncateFree(EVT SrcVT, EVT DstVT) const {
  if (SrcVT.isVector() || DstVT.isVector() || !SrcVT.isInteger() ||
      !DstVT.isInteger())
    return false;
  unsigned SrcBits = SrcVT.getSizeInBits();
  unsigned DestBits = DstVT.getSizeInBits();
  return (SrcBits == 64 && DestBits == 32);
}
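
// Zero-extending the result of a narrow load is free, since LDRB/LDRH already
// zero-extend the loaded value to 32 bits.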
bool ARMTargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
  if (Val.getOpcode() != ISD::LOAD)
    return false;

  EVT VT1 = Val.getValueType();
  if (!VT1.isSimple() || !VT1.isInteger() ||
      !VT2.isSimple() || !VT2.isInteger())
    return false;

  switch (VT1.getSimpleVT().SimpleTy) {
  default: break;
  case MVT::i1:
  case MVT::i8:
  case MVT::i16:
    // 8-bit and 16-bit loads implicitly zero-extend to 32-bits.
    return true;
  }

  return false;
}
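
// Report fneg as free for types where the negation can be folded directly
// into other floating-point instructions; see the FP16 rationale below.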
bool ARMTargetLowering::isFNegFree(EVT VT) const {
  if (!VT.isSimple())
    return false;

  // There are quite a few FP16 instructions (e.g. VNMLA, VNMLS, etc.) that
  // negate values directly (fneg is free). So, we don't want to let the DAG
  // combiner rewrite fneg into xors and some other instructions. For f16 and
  // FullFP16 argument passing, some bitcast nodes may be introduced,
  // triggering this DAG combine rewrite, so we are avoiding that with this.
  switch (VT.getSimpleVT().SimpleTy) {
  default: break;
  case MVT::f16:
    return Subtarget->hasFullFP16();
  }

  return false;
}

/// Check if Ext1 and Ext2 are extends of the same type, doubling the bitwidth
/// of the vector elements.
static bool areExtractExts(Value *Ext1, Value *Ext2) {
  auto areExtDoubled = [](Instruction *Ext) {
    return Ext->getType()->getScalarSizeInBits() ==
           2 * Ext->getOperand(0)->getType()->getScalarSizeInBits();
  };

  if (!match(Ext1, m_ZExtOrSExt(m_Value())) ||
      !match(Ext2, m_ZExtOrSExt(m_Value())) ||
      !areExtDoubled(cast<Instruction>(Ext1)) ||
      !areExtDoubled(cast<Instruction>(Ext2)))
    return false;

  return true;
}

/// Check if sinking \p I's operands to I's basic block is profitable, because
/// the operands can be folded into a target instruction, e.g.
/// sext/zext can be folded into vsubl.
bool ARMTargetLowering::shouldSinkOperands(Instruction *I,
                                           SmallVectorImpl<Use *> &Ops) const {
  if (!I->getType()->isVectorTy())
    return false;

  if (Subtarget->hasNEON()) {
    switch (I->getOpcode()) {
    case Instruction::Sub:
    case Instruction::Add: {
      if (!areExtractExts(I->getOperand(0), I->getOperand(1)))
        return false;
      Ops.push_back(&I->getOperandUse(0));
      Ops.push_back(&I->getOperandUse(1));
      return true;
    }
    default:
      return false;
    }
  }

  if (!Subtarget->hasMVEIntegerOps())
    return false;

  auto IsFMSMul = [&](Instruction *I) {
    if (!I->hasOneUse())
      return false;
    auto *Sub = cast<Instruction>(*I->users().begin());
    return Sub->getOpcode() == Instruction::FSub && Sub->getOperand(1) == I;
  };
  auto IsFMS = [&](Instruction *I) {
    if (match(I->getOperand(0), m_FNeg(m_Value())) ||
        match(I->getOperand(1), m_FNeg(m_Value())))
      return true;
    return false;
  };

  auto IsSinker = [&](Instruction *I, int Operand) {
    switch (I->getOpcode()) {
    case Instruction::Add:
    case Instruction::Mul:
    case Instruction::FAdd:
    case Instruction::ICmp:
    case Instruction::FCmp:
      return true;
    case Instruction::FMul:
      return !IsFMSMul(I);
    case Instruction::Sub:
    case Instruction::FSub:
    case Instruction::Shl:
    case Instruction::LShr:
    case Instruction::AShr:
      return Operand == 1;
    case Instruction::Call:
      if (auto *II = dyn_cast<IntrinsicInst>(I)) {
        switch (II->getIntrinsicID()) {
        case Intrinsic::fma:
          return !IsFMS(I);
        case Intrinsic::sadd_sat:
        case Intrinsic::uadd_sat:
        case Intrinsic::arm_mve_add_predicated:
        case Intrinsic::arm_mve_mul_predicated:
        case Intrinsic::arm_mve_qadd_predicated:
        case Intrinsic::arm_mve_vhadd:
        case Intrinsic::arm_mve_hadd_predicated:
        case Intrinsic::arm_mve_vqdmull:
        case Intrinsic::arm_mve_vqdmull_predicated:
        case Intrinsic::arm_mve_vqdmulh:
        case Intrinsic::arm_mve_qdmulh_predicated:
        case Intrinsic::arm_mve_vqrdmulh:
        case Intrinsic::arm_mve_qrdmulh_predicated:
        case Intrinsic::arm_mve_fma_predicated:
          return true;
        case Intrinsic::ssub_sat:
        case Intrinsic::usub_sat:
        case Intrinsic::arm_mve_sub_predicated:
        case Intrinsic::arm_mve_qsub_predicated:
        case Intrinsic::arm_mve_hsub_predicated:
        case Intrinsic::arm_mve_vhsub:
          return Operand == 1;
        default:
          return false;
        }
      }
      return false;
    default:
      return false;
    }
  };

  for (auto OpIdx : enumerate(I->operands())) {
    Instruction *Op = dyn_cast<Instruction>(OpIdx.value().get());
    // Make sure we are not already sinking this operand
    if (!Op || any_of(Ops, [&](Use *U) { return U->get() == Op; }))
      continue;

    Instruction *Shuffle = Op;
    if (Shuffle->getOpcode() == Instruction::BitCast)
      Shuffle = dyn_cast<Instruction>(Shuffle->getOperand(0));
    // We are looking for a splat that can be sunk.
    if (!Shuffle ||
        !match(Shuffle, m_Shuffle(
                            m_InsertElt(m_Undef(), m_Value(), m_ZeroInt()),
                            m_Undef(), m_ZeroMask())))
      continue;
    if (!IsSinker(I, OpIdx.index()))
      continue;

    // All uses of the shuffle should be sunk to avoid duplicating it across gpr
    // and vector registers
    for (Use &U : Op->uses()) {
      Instruction *Insn = cast<Instruction>(U.getUser());
      if (!IsSinker(Insn, U.getOperandNo()))
        return false;
    }

    Ops.push_back(&Shuffle->getOperandUse(0));
    if (Shuffle != Op)
      Ops.push_back(&Op->getOperandUse(0));
    Ops.push_back(&OpIdx.value());
  }
  return true;
}
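
/// shouldConvertSplatType - For MVE, a splat of an f32/f16 value is better
/// materialized as a splat of the equivalent-width integer type, so return
/// that integer type; otherwise return nullptr.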
Type *ARMTargetLowering::shouldConvertSplatType(ShuffleVectorInst *SVI) const {
  if (!Subtarget->hasMVEIntegerOps())
    return nullptr;
  Type *SVIType = SVI->getType();
  Type *ScalarType = SVIType->getScalarType();

  if (ScalarType->isFloatTy())
    return Type::getInt32Ty(SVIType->getContext());
  if (ScalarType->isHalfTy())
    return Type::getInt16Ty(SVIType->getContext());
  return nullptr;
}
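
/// Return true if folding the extension into a vector load is desirable. For
/// NEON, avoid creating an extending load when the extension could instead be
/// folded into a single widening/long instruction by its only user.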
bool ARMTargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const {
  EVT VT = ExtVal.getValueType();

  if (!isTypeLegal(VT))
    return false;

  if (auto *Ld = dyn_cast<MaskedLoadSDNode>(ExtVal.getOperand(0))) {
    if (Ld->isExpandingLoad())
      return false;
  }

  if (Subtarget->hasMVEIntegerOps())
    return true;

  // Don't create a loadext if we can fold the extension into a wide/long
  // instruction.
  // If there's more than one user instruction, the loadext is desirable no
  // matter what. There can be two uses by the same instruction.
  if (ExtVal->use_empty() ||
      !ExtVal->use_begin()->isOnlyUserOf(ExtVal.getNode()))
    return true;

  SDNode *U = *ExtVal->use_begin();
  if ((U->getOpcode() == ISD::ADD || U->getOpcode() == ISD::SUB ||
       U->getOpcode() == ISD::SHL || U->getOpcode() == ARMISD::VSHLIMM))
    return false;

  return true;
}
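
// A truncate of a legal integer return type is a no-op for tail-call
// purposes, since the value simply lives in the low bits of the register.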
bool ARMTargetLowering::allowTruncateForTailCall(Type *Ty1, Type *Ty2) const {
  if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
    return false;

  if (!isTypeLegal(EVT::getEVT(Ty1)))
    return false;

  assert(Ty1->getPrimitiveSizeInBits() <= 64 && "i128 is probably not a noop");

  // Assuming the caller doesn't have a zeroext or signext return parameter,
  // truncation all the way down to i1 is valid.
  return true;
}

/// isFMAFasterThanFMulAndFAdd - Return true if an FMA operation is faster
/// than a pair of fmul and fadd instructions. fmuladd intrinsics will be
/// expanded to FMAs when this method returns true, otherwise fmuladd is
/// expanded to fmul + fadd.
///
/// ARM supports both fused and unfused multiply-add operations; we already
/// lower a pair of fmul and fadd to the latter so it's not clear that there
/// would be a gain or that the gain would be worthwhile enough to risk
/// correctness bugs.
///
/// For MVE, we set this to true as it helps simplify the need for some
/// patterns (and we don't have the non-fused floating point instruction).
bool ARMTargetLowering::isFMAFasterThanFMulAndFAdd(const MachineFunction &MF,
                                                   EVT VT) const {
  if (!VT.isSimple())
    return false;

  switch (VT.getSimpleVT().SimpleTy) {
  case MVT::v4f32:
  case MVT::v8f16:
    return Subtarget->hasMVEFloatOps();
  case MVT::f16:
    return Subtarget->useFPVFMx16();
  case MVT::f32:
    return Subtarget->useFPVFMx();
  case MVT::f64:
    return Subtarget->useFPVFMx64();
  default:
    break;
  }

  return false;
}
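
// Thumb1 load/store offsets are unsigned, scaled by the access size and
// limited to 5 bits, e.g. an i32 access allows byte offsets 0, 4, ..., 124.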
static bool isLegalT1AddressImmediate(int64_t V, EVT VT) {
  if (V < 0)
    return false;

  unsigned Scale = 1;
  switch (VT.getSimpleVT().SimpleTy) {
  case MVT::i1:
  case MVT::i8:
    // Scale == 1;
    break;
  case MVT::i16:
    // Scale == 2;
    Scale = 2;
    break;
  default:
    // On thumb1 we load most things (i32, i64, floats, etc) with a LDR
    // Scale == 4;
    Scale = 4;
    break;
  }

  if ((V & (Scale - 1)) != 0)
    return false;
  return isUInt<5>(V / Scale);
}
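
// Thumb2 / MVE / VFP immediate offset ranges: MVE uses a 7-bit unsigned
// immediate scaled by the element size, VLDR and LDRD use an 8-bit immediate
// scaled by 4 (or by 2 for 16-bit FP), and integer loads use +imm12 / -imm8.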
static bool isLegalT2AddressImmediate(int64_t V, EVT VT,
                                      const ARMSubtarget *Subtarget) {
  if (!VT.isInteger() && !VT.isFloatingPoint())
    return false;
  if (VT.isVector() && Subtarget->hasNEON())
    return false;
  if (VT.isVector() && VT.isFloatingPoint() && Subtarget->hasMVEIntegerOps() &&
      !Subtarget->hasMVEFloatOps())
    return false;

  bool IsNeg = false;
  if (V < 0) {
    IsNeg = true;
    V = -V;
  }

  unsigned NumBytes = std::max((unsigned)VT.getSizeInBits() / 8, 1U);

  // MVE: size * imm7
  if (VT.isVector() && Subtarget->hasMVEIntegerOps()) {
    switch (VT.getSimpleVT().getVectorElementType().SimpleTy) {
    case MVT::i32:
    case MVT::f32:
      return isShiftedUInt<7,2>(V);
    case MVT::i16:
    case MVT::f16:
      return isShiftedUInt<7,1>(V);
    case MVT::i8:
      return isUInt<7>(V);
    default:
      return false;
    }
  }

  // half VLDR: 2 * imm8
  if (VT.isFloatingPoint() && NumBytes == 2 && Subtarget->hasFPRegs16())
    return isShiftedUInt<8, 1>(V);
  // VLDR and LDRD: 4 * imm8
  if ((VT.isFloatingPoint() && Subtarget->hasVFP2Base()) || NumBytes == 8)
    return isShiftedUInt<8, 2>(V);

  if (NumBytes == 1 || NumBytes == 2 || NumBytes == 4) {
    // + imm12 or - imm8
    if (IsNeg)
      return isUInt<8>(V);
    return isUInt<12>(V);
  }

  return false;
}

/// isLegalAddressImmediate - Return true if the integer value can be used
/// as the offset of the target addressing mode for load / store of the
/// given type.
static bool isLegalAddressImmediate(int64_t V, EVT VT,
                                    const ARMSubtarget *Subtarget) {
  if (V == 0)
    return true;

  if (!VT.isSimple())
    return false;

  if (Subtarget->isThumb1Only())
    return isLegalT1AddressImmediate(V, VT);
  else if (Subtarget->isThumb2())
    return isLegalT2AddressImmediate(V, VT, Subtarget);

  // ARM mode.
  if (V < 0)
    V = - V;
  switch (VT.getSimpleVT().SimpleTy) {
  default: return false;
  case MVT::i1:
  case MVT::i8:
  case MVT::i32:
    // +- imm12
    return isUInt<12>(V);
  case MVT::i16:
    // +- imm8
    return isUInt<8>(V);
  case MVT::f32:
  case MVT::f64:
    if (!Subtarget->hasVFP2Base()) // FIXME: NEON?
      return false;
    return isShiftedUInt<8, 2>(V);
  }
}
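
// Check whether a Thumb2 scaled addressing mode (base + reg << imm) is
// encodable for the given value type.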
bool ARMTargetLowering::isLegalT2ScaledAddressingMode(const AddrMode &AM,
                                                      EVT VT) const {
  int Scale = AM.Scale;
  if (Scale < 0)
    return false;

  switch (VT.getSimpleVT().SimpleTy) {
  default: return false;
  case MVT::i1:
  case MVT::i8:
  case MVT::i16:
  case MVT::i32:
    if (Scale == 1)
      return true;
    // r + r << imm
    Scale = Scale & ~1;
    return Scale == 2 || Scale == 4 || Scale == 8;
  case MVT::i64:
    // FIXME: What are we trying to model here? ldrd doesn't have an r + r
    // version in Thumb mode.
    // r + r
    if (Scale == 1)
      return true;
    // r * 2 (this can be lowered to r + r).
    if (!AM.HasBaseReg && Scale == 2)
      return true;
    return false;
  case MVT::isVoid:
    // Note, we allow "void" uses (basically, uses that aren't loads or
    // stores), because arm allows folding a scale into many arithmetic
    // operations. This should be made more precise and revisited later.
    // Allow r << imm, but the imm has to be a multiple of two.
    if (Scale & 1) return false;
    return isPowerOf2_32(Scale);
  }
}

bool ARMTargetLowering::isLegalT1ScaledAddressingMode(const AddrMode &AM,
                                                      EVT VT) const {
  const int Scale = AM.Scale;

  // Negative scales are not supported in Thumb1.
  if (Scale < 0)
    return false;

  // Thumb1 addressing modes do not support register scaling excepting the
  // following cases:
  // 1. Scale == 1 means no scaling.
  // 2. Scale == 2 this can be lowered to r + r if there is no base register.
  return (Scale == 1) || (!AM.HasBaseReg && Scale == 2);
}

/// isLegalAddressingMode - Return true if the addressing mode represented
/// by AM is legal for this target, for a load/store of the specified type.
bool ARMTargetLowering::isLegalAddressingMode(const DataLayout &DL,
                                              const AddrMode &AM, Type *Ty,
                                              unsigned AS,
                                              Instruction *I) const {
  EVT VT = getValueType(DL, Ty, true);
  if (!isLegalAddressImmediate(AM.BaseOffs, VT, Subtarget))
    return false;

  // Can never fold addr of global into load/store.
  if (AM.BaseGV)
    return false;

  switch (AM.Scale) {
  case 0:  // no scale reg, must be "r+i" or "r", or "i".
    break;
  default:
    // ARM doesn't support any R+R*scale+imm addr modes.
    if (AM.BaseOffs)
      return false;

    if (!VT.isSimple())
      return false;

    if (Subtarget->isThumb1Only())
      return isLegalT1ScaledAddressingMode(AM, VT);

    if (Subtarget->isThumb2())
      return isLegalT2ScaledAddressingMode(AM, VT);

    int Scale = AM.Scale;
    switch (VT.getSimpleVT().SimpleTy) {
    default: return false;
    case MVT::i1:
    case MVT::i8:
    case MVT::i32:
      if (Scale < 0) Scale = -Scale;
      if (Scale == 1)
        return true;
      // r + r << imm
      return isPowerOf2_32(Scale & ~1);
    case MVT::i16:
    case MVT::i64:
      // r +/- r
      if (Scale == 1 || (AM.HasBaseReg && Scale == -1))
        return true;
      // r * 2 (this can be lowered to r + r).
      if (!AM.HasBaseReg && Scale == 2)
        return true;
      return false;
    case MVT::isVoid:
      // Note, we allow "void" uses (basically, uses that aren't loads or
      // stores), because arm allows folding a scale into many arithmetic
      // operations. This should be made more precise and revisited later.
      // Allow r << imm, but the imm has to be a multiple of two.
      if (Scale & 1) return false;
      return isPowerOf2_32(Scale);
    }
  }
  return true;
}

/// isLegalICmpImmediate - Return true if the specified immediate is legal
/// icmp immediate, that is the target has icmp instructions which can compare
/// a register against the immediate without having to materialize the
/// immediate into a register.
bool ARMTargetLowering::isLegalICmpImmediate(int64_t Imm) const {
  // Thumb2 and ARM modes can use cmn for negative immediates.
  if (!Subtarget->isThumb())
    return ARM_AM::getSOImmVal((uint32_t)Imm) != -1 ||
           ARM_AM::getSOImmVal(-(uint32_t)Imm) != -1;
  if (Subtarget->isThumb2())
    return ARM_AM::getT2SOImmVal((uint32_t)Imm) != -1 ||
           ARM_AM::getT2SOImmVal(-(uint32_t)Imm) != -1;
17326. // Thumb1 doesn't have cmn, and its cmp only takes 8-bit immediates.
  17327. return Imm >= 0 && Imm <= 255;
  17328. }
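// Worked examples for the rules above (illustrative only):
//   ARM/Thumb2: cmp r0, #255 and cmp r0, #0xff00 are single instructions, and
//               Imm = -2 is still legal because it can be emitted as cmn r0, #2.
//   Thumb1:     only cmp rN, #imm8 exists, so just 0..255 is accepted.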
  17329. /// isLegalAddImmediate - Return true if the specified immediate is a legal add
  17330. /// *or sub* immediate, that is the target has add or sub instructions which can
  17331. /// add a register with the immediate without having to materialize the
  17332. /// immediate into a register.
  17333. bool ARMTargetLowering::isLegalAddImmediate(int64_t Imm) const {
  17334. // Same encoding for add/sub, just flip the sign.
  17335. int64_t AbsImm = std::abs(Imm);
  17336. if (!Subtarget->isThumb())
  17337. return ARM_AM::getSOImmVal(AbsImm) != -1;
  17338. if (Subtarget->isThumb2())
  17339. return ARM_AM::getT2SOImmVal(AbsImm) != -1;
  17340. // Thumb1 only has 8-bit unsigned immediate.
  17341. return AbsImm >= 0 && AbsImm <= 255;
  17342. }
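// For example, add r0, r1, #0x1f00 and sub r0, r1, #0x1f00 use the same
// modified-immediate encoding, which is why only std::abs(Imm) is checked.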
  17343. // Return false to prevent folding
  17344. // (mul (add r, c0), c1) -> (add (mul r, c1), c0*c1) in DAGCombine,
  17345. // if the folding leads to worse code.
  17346. bool ARMTargetLowering::isMulAddWithConstProfitable(SDValue AddNode,
  17347. SDValue ConstNode) const {
  17348. // Let the DAGCombiner decide for vector types and large types.
  17349. const EVT VT = AddNode.getValueType();
  17350. if (VT.isVector() || VT.getScalarSizeInBits() > 32)
  17351. return true;
17352. // The fold is worse if c0 is a legal add immediate while c1*c0 is not
17353. // and c1*c0 would have to be materialized with at least two instructions.
  17354. const ConstantSDNode *C0Node = cast<ConstantSDNode>(AddNode.getOperand(1));
  17355. const ConstantSDNode *C1Node = cast<ConstantSDNode>(ConstNode);
  17356. const int64_t C0 = C0Node->getSExtValue();
  17357. APInt CA = C0Node->getAPIntValue() * C1Node->getAPIntValue();
  17358. if (!isLegalAddImmediate(C0) || isLegalAddImmediate(CA.getSExtValue()))
  17359. return true;
  17360. if (ConstantMaterializationCost((unsigned)CA.getZExtValue(), Subtarget) > 1)
  17361. return false;
  17362. // Default to true and let the DAGCombiner decide.
  17363. return true;
  17364. }
  17365. static bool getARMIndexedAddressParts(SDNode *Ptr, EVT VT,
  17366. bool isSEXTLoad, SDValue &Base,
  17367. SDValue &Offset, bool &isInc,
  17368. SelectionDAG &DAG) {
  17369. if (Ptr->getOpcode() != ISD::ADD && Ptr->getOpcode() != ISD::SUB)
  17370. return false;
  17371. if (VT == MVT::i16 || ((VT == MVT::i8 || VT == MVT::i1) && isSEXTLoad)) {
  17372. // AddressingMode 3
  17373. Base = Ptr->getOperand(0);
  17374. if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(Ptr->getOperand(1))) {
  17375. int RHSC = (int)RHS->getZExtValue();
  17376. if (RHSC < 0 && RHSC > -256) {
  17377. assert(Ptr->getOpcode() == ISD::ADD);
  17378. isInc = false;
  17379. Offset = DAG.getConstant(-RHSC, SDLoc(Ptr), RHS->getValueType(0));
  17380. return true;
  17381. }
  17382. }
  17383. isInc = (Ptr->getOpcode() == ISD::ADD);
  17384. Offset = Ptr->getOperand(1);
  17385. return true;
  17386. } else if (VT == MVT::i32 || VT == MVT::i8 || VT == MVT::i1) {
  17387. // AddressingMode 2
  17388. if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(Ptr->getOperand(1))) {
  17389. int RHSC = (int)RHS->getZExtValue();
  17390. if (RHSC < 0 && RHSC > -0x1000) {
  17391. assert(Ptr->getOpcode() == ISD::ADD);
  17392. isInc = false;
  17393. Offset = DAG.getConstant(-RHSC, SDLoc(Ptr), RHS->getValueType(0));
  17394. Base = Ptr->getOperand(0);
  17395. return true;
  17396. }
  17397. }
  17398. if (Ptr->getOpcode() == ISD::ADD) {
  17399. isInc = true;
17400. ARM_AM::ShiftOpc ShOpcVal =
  17401. ARM_AM::getShiftOpcForNode(Ptr->getOperand(0).getOpcode());
  17402. if (ShOpcVal != ARM_AM::no_shift) {
  17403. Base = Ptr->getOperand(1);
  17404. Offset = Ptr->getOperand(0);
  17405. } else {
  17406. Base = Ptr->getOperand(0);
  17407. Offset = Ptr->getOperand(1);
  17408. }
  17409. return true;
  17410. }
  17411. isInc = (Ptr->getOpcode() == ISD::ADD);
  17412. Base = Ptr->getOperand(0);
  17413. Offset = Ptr->getOperand(1);
  17414. return true;
  17415. }
  17416. // FIXME: Use VLDM / VSTM to emulate indexed FP load / store.
  17417. return false;
  17418. }
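// The two constant ranges above mirror the ARM-mode indexed forms, e.g.
//   ldrh r0, [r1], #-4     ; addressing mode 3, +/-255 byte offset
//   ldr  r0, [r1, #255]!   ; addressing mode 2, +/-4095 byte offset
//   ldr  r0, [r1], r2      ; a plain register offset is also accepted
// (the offsets shown are illustrative only).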
  17419. static bool getT2IndexedAddressParts(SDNode *Ptr, EVT VT,
  17420. bool isSEXTLoad, SDValue &Base,
  17421. SDValue &Offset, bool &isInc,
  17422. SelectionDAG &DAG) {
  17423. if (Ptr->getOpcode() != ISD::ADD && Ptr->getOpcode() != ISD::SUB)
  17424. return false;
  17425. Base = Ptr->getOperand(0);
  17426. if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(Ptr->getOperand(1))) {
  17427. int RHSC = (int)RHS->getZExtValue();
  17428. if (RHSC < 0 && RHSC > -0x100) { // 8 bits.
  17429. assert(Ptr->getOpcode() == ISD::ADD);
  17430. isInc = false;
  17431. Offset = DAG.getConstant(-RHSC, SDLoc(Ptr), RHS->getValueType(0));
  17432. return true;
  17433. } else if (RHSC > 0 && RHSC < 0x100) { // 8 bit, no zero.
  17434. isInc = Ptr->getOpcode() == ISD::ADD;
  17435. Offset = DAG.getConstant(RHSC, SDLoc(Ptr), RHS->getValueType(0));
  17436. return true;
  17437. }
  17438. }
  17439. return false;
  17440. }
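// Thumb2 indexed loads/stores only take an 8-bit offset, e.g.
//   ldr r0, [r1], #255    or    ldr r0, [r1, #-255]!
// which is why both branches above are limited to |RHSC| < 0x100.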
  17441. static bool getMVEIndexedAddressParts(SDNode *Ptr, EVT VT, Align Alignment,
  17442. bool isSEXTLoad, bool IsMasked, bool isLE,
  17443. SDValue &Base, SDValue &Offset,
  17444. bool &isInc, SelectionDAG &DAG) {
  17445. if (Ptr->getOpcode() != ISD::ADD && Ptr->getOpcode() != ISD::SUB)
  17446. return false;
  17447. if (!isa<ConstantSDNode>(Ptr->getOperand(1)))
  17448. return false;
  17449. // We allow LE non-masked loads to change the type (for example use a vldrb.8
  17450. // as opposed to a vldrw.32). This can allow extra addressing modes or
  17451. // alignments for what is otherwise an equivalent instruction.
  17452. bool CanChangeType = isLE && !IsMasked;
  17453. ConstantSDNode *RHS = cast<ConstantSDNode>(Ptr->getOperand(1));
  17454. int RHSC = (int)RHS->getZExtValue();
  17455. auto IsInRange = [&](int RHSC, int Limit, int Scale) {
  17456. if (RHSC < 0 && RHSC > -Limit * Scale && RHSC % Scale == 0) {
  17457. assert(Ptr->getOpcode() == ISD::ADD);
  17458. isInc = false;
  17459. Offset = DAG.getConstant(-RHSC, SDLoc(Ptr), RHS->getValueType(0));
  17460. return true;
  17461. } else if (RHSC > 0 && RHSC < Limit * Scale && RHSC % Scale == 0) {
  17462. isInc = Ptr->getOpcode() == ISD::ADD;
  17463. Offset = DAG.getConstant(RHSC, SDLoc(Ptr), RHS->getValueType(0));
  17464. return true;
  17465. }
  17466. return false;
  17467. };
  17468. // Try to find a matching instruction based on s/zext, Alignment, Offset and
  17469. // (in BE/masked) type.
  17470. Base = Ptr->getOperand(0);
  17471. if (VT == MVT::v4i16) {
  17472. if (Alignment >= 2 && IsInRange(RHSC, 0x80, 2))
  17473. return true;
  17474. } else if (VT == MVT::v4i8 || VT == MVT::v8i8) {
  17475. if (IsInRange(RHSC, 0x80, 1))
  17476. return true;
  17477. } else if (Alignment >= 4 &&
  17478. (CanChangeType || VT == MVT::v4i32 || VT == MVT::v4f32) &&
  17479. IsInRange(RHSC, 0x80, 4))
  17480. return true;
  17481. else if (Alignment >= 2 &&
  17482. (CanChangeType || VT == MVT::v8i16 || VT == MVT::v8f16) &&
  17483. IsInRange(RHSC, 0x80, 2))
  17484. return true;
  17485. else if ((CanChangeType || VT == MVT::v16i8) && IsInRange(RHSC, 0x80, 1))
  17486. return true;
  17487. return false;
  17488. }
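// The 0x80 limit combined with the per-type Scale gives the usual MVE
// immediate ranges: e.g. a vldrw.u32 post-increment may use any multiple of 4
// up to +/-508, while the vldrb forms are limited to +/-127 (values here are
// only meant to illustrate the IsInRange checks above).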
17489. /// getPreIndexedAddressParts - Returns true (by value) and sets the base
17490. /// pointer, offset and addressing mode (by reference) if the node's address
17491. /// can be legally represented as a pre-indexed load / store address.
  17492. bool
  17493. ARMTargetLowering::getPreIndexedAddressParts(SDNode *N, SDValue &Base,
  17494. SDValue &Offset,
  17495. ISD::MemIndexedMode &AM,
  17496. SelectionDAG &DAG) const {
  17497. if (Subtarget->isThumb1Only())
  17498. return false;
  17499. EVT VT;
  17500. SDValue Ptr;
  17501. Align Alignment;
  17502. bool isSEXTLoad = false;
  17503. bool IsMasked = false;
  17504. if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
  17505. Ptr = LD->getBasePtr();
  17506. VT = LD->getMemoryVT();
  17507. Alignment = LD->getAlign();
  17508. isSEXTLoad = LD->getExtensionType() == ISD::SEXTLOAD;
  17509. } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
  17510. Ptr = ST->getBasePtr();
  17511. VT = ST->getMemoryVT();
  17512. Alignment = ST->getAlign();
  17513. } else if (MaskedLoadSDNode *LD = dyn_cast<MaskedLoadSDNode>(N)) {
  17514. Ptr = LD->getBasePtr();
  17515. VT = LD->getMemoryVT();
  17516. Alignment = LD->getAlign();
  17517. isSEXTLoad = LD->getExtensionType() == ISD::SEXTLOAD;
  17518. IsMasked = true;
  17519. } else if (MaskedStoreSDNode *ST = dyn_cast<MaskedStoreSDNode>(N)) {
  17520. Ptr = ST->getBasePtr();
  17521. VT = ST->getMemoryVT();
  17522. Alignment = ST->getAlign();
  17523. IsMasked = true;
  17524. } else
  17525. return false;
  17526. bool isInc;
  17527. bool isLegal = false;
  17528. if (VT.isVector())
  17529. isLegal = Subtarget->hasMVEIntegerOps() &&
  17530. getMVEIndexedAddressParts(
  17531. Ptr.getNode(), VT, Alignment, isSEXTLoad, IsMasked,
  17532. Subtarget->isLittle(), Base, Offset, isInc, DAG);
  17533. else {
  17534. if (Subtarget->isThumb2())
  17535. isLegal = getT2IndexedAddressParts(Ptr.getNode(), VT, isSEXTLoad, Base,
  17536. Offset, isInc, DAG);
  17537. else
  17538. isLegal = getARMIndexedAddressParts(Ptr.getNode(), VT, isSEXTLoad, Base,
  17539. Offset, isInc, DAG);
  17540. }
  17541. if (!isLegal)
  17542. return false;
  17543. AM = isInc ? ISD::PRE_INC : ISD::PRE_DEC;
  17544. return true;
  17545. }
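// E.g. a load whose address is (add Base, -8) can be rewritten here into the
// pre-decrement form ldr rD, [rN, #-8]!, with the acceptable immediate ranges
// delegated to the per-ISA helpers above.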
17546. /// getPostIndexedAddressParts - Returns true (by value) and sets the base
17547. /// pointer, offset and addressing mode (by reference) if this node can be
17548. /// combined with a load / store to form a post-indexed load / store.
  17549. bool ARMTargetLowering::getPostIndexedAddressParts(SDNode *N, SDNode *Op,
  17550. SDValue &Base,
  17551. SDValue &Offset,
  17552. ISD::MemIndexedMode &AM,
  17553. SelectionDAG &DAG) const {
  17554. EVT VT;
  17555. SDValue Ptr;
  17556. Align Alignment;
  17557. bool isSEXTLoad = false, isNonExt;
  17558. bool IsMasked = false;
  17559. if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
  17560. VT = LD->getMemoryVT();
  17561. Ptr = LD->getBasePtr();
  17562. Alignment = LD->getAlign();
  17563. isSEXTLoad = LD->getExtensionType() == ISD::SEXTLOAD;
  17564. isNonExt = LD->getExtensionType() == ISD::NON_EXTLOAD;
  17565. } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
  17566. VT = ST->getMemoryVT();
  17567. Ptr = ST->getBasePtr();
  17568. Alignment = ST->getAlign();
  17569. isNonExt = !ST->isTruncatingStore();
  17570. } else if (MaskedLoadSDNode *LD = dyn_cast<MaskedLoadSDNode>(N)) {
  17571. VT = LD->getMemoryVT();
  17572. Ptr = LD->getBasePtr();
  17573. Alignment = LD->getAlign();
  17574. isSEXTLoad = LD->getExtensionType() == ISD::SEXTLOAD;
  17575. isNonExt = LD->getExtensionType() == ISD::NON_EXTLOAD;
  17576. IsMasked = true;
  17577. } else if (MaskedStoreSDNode *ST = dyn_cast<MaskedStoreSDNode>(N)) {
  17578. VT = ST->getMemoryVT();
  17579. Ptr = ST->getBasePtr();
  17580. Alignment = ST->getAlign();
  17581. isNonExt = !ST->isTruncatingStore();
  17582. IsMasked = true;
  17583. } else
  17584. return false;
  17585. if (Subtarget->isThumb1Only()) {
  17586. // Thumb-1 can do a limited post-inc load or store as an updating LDM. It
  17587. // must be non-extending/truncating, i32, with an offset of 4.
  17588. assert(Op->getValueType(0) == MVT::i32 && "Non-i32 post-inc op?!");
  17589. if (Op->getOpcode() != ISD::ADD || !isNonExt)
  17590. return false;
  17591. auto *RHS = dyn_cast<ConstantSDNode>(Op->getOperand(1));
  17592. if (!RHS || RHS->getZExtValue() != 4)
  17593. return false;
  17594. if (Alignment < Align(4))
  17595. return false;
  17596. Offset = Op->getOperand(1);
  17597. Base = Op->getOperand(0);
  17598. AM = ISD::POST_INC;
  17599. return true;
  17600. }
  17601. bool isInc;
  17602. bool isLegal = false;
  17603. if (VT.isVector())
  17604. isLegal = Subtarget->hasMVEIntegerOps() &&
  17605. getMVEIndexedAddressParts(Op, VT, Alignment, isSEXTLoad, IsMasked,
  17606. Subtarget->isLittle(), Base, Offset,
  17607. isInc, DAG);
  17608. else {
  17609. if (Subtarget->isThumb2())
  17610. isLegal = getT2IndexedAddressParts(Op, VT, isSEXTLoad, Base, Offset,
  17611. isInc, DAG);
  17612. else
  17613. isLegal = getARMIndexedAddressParts(Op, VT, isSEXTLoad, Base, Offset,
  17614. isInc, DAG);
  17615. }
  17616. if (!isLegal)
  17617. return false;
  17618. if (Ptr != Base) {
  17619. // Swap base ptr and offset to catch more post-index load / store when
  17620. // it's legal. In Thumb2 mode, offset must be an immediate.
  17621. if (Ptr == Offset && Op->getOpcode() == ISD::ADD &&
  17622. !Subtarget->isThumb2())
  17623. std::swap(Base, Offset);
  17624. // Post-indexed load / store update the base pointer.
  17625. if (Ptr != Base)
  17626. return false;
  17627. }
  17628. AM = isInc ? ISD::POST_INC : ISD::POST_DEC;
  17629. return true;
  17630. }
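// Taken together with the helpers above, this is what lets a plain load
// followed by an increment of the same pointer be selected as one
// post-indexed access, e.g. ldr r0, [r1], #4 (or, on Thumb1, the updating
// LDM handled by the special case above).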
  17631. void ARMTargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
  17632. KnownBits &Known,
  17633. const APInt &DemandedElts,
  17634. const SelectionDAG &DAG,
  17635. unsigned Depth) const {
  17636. unsigned BitWidth = Known.getBitWidth();
  17637. Known.resetAll();
  17638. switch (Op.getOpcode()) {
  17639. default: break;
  17640. case ARMISD::ADDC:
  17641. case ARMISD::ADDE:
  17642. case ARMISD::SUBC:
  17643. case ARMISD::SUBE:
  17644. // Special cases when we convert a carry to a boolean.
  17645. if (Op.getResNo() == 0) {
  17646. SDValue LHS = Op.getOperand(0);
  17647. SDValue RHS = Op.getOperand(1);
  17648. // (ADDE 0, 0, C) will give us a single bit.
  17649. if (Op->getOpcode() == ARMISD::ADDE && isNullConstant(LHS) &&
  17650. isNullConstant(RHS)) {
  17651. Known.Zero |= APInt::getHighBitsSet(BitWidth, BitWidth - 1);
  17652. return;
  17653. }
  17654. }
  17655. break;
  17656. case ARMISD::CMOV: {
  17657. // Bits are known zero/one if known on the LHS and RHS.
  17658. Known = DAG.computeKnownBits(Op.getOperand(0), Depth+1);
  17659. if (Known.isUnknown())
  17660. return;
  17661. KnownBits KnownRHS = DAG.computeKnownBits(Op.getOperand(1), Depth+1);
  17662. Known = KnownBits::commonBits(Known, KnownRHS);
  17663. return;
  17664. }
  17665. case ISD::INTRINSIC_W_CHAIN: {
  17666. ConstantSDNode *CN = cast<ConstantSDNode>(Op->getOperand(1));
  17667. Intrinsic::ID IntID = static_cast<Intrinsic::ID>(CN->getZExtValue());
  17668. switch (IntID) {
  17669. default: return;
  17670. case Intrinsic::arm_ldaex:
  17671. case Intrinsic::arm_ldrex: {
  17672. EVT VT = cast<MemIntrinsicSDNode>(Op)->getMemoryVT();
  17673. unsigned MemBits = VT.getScalarSizeInBits();
  17674. Known.Zero |= APInt::getHighBitsSet(BitWidth, BitWidth - MemBits);
  17675. return;
  17676. }
  17677. }
  17678. }
  17679. case ARMISD::BFI: {
  17680. // Conservatively, we can recurse down the first operand
  17681. // and just mask out all affected bits.
  17682. Known = DAG.computeKnownBits(Op.getOperand(0), Depth + 1);
  17683. // The operand to BFI is already a mask suitable for removing the bits it
  17684. // sets.
  17685. ConstantSDNode *CI = cast<ConstantSDNode>(Op.getOperand(2));
  17686. const APInt &Mask = CI->getAPIntValue();
  17687. Known.Zero &= Mask;
  17688. Known.One &= Mask;
  17689. return;
  17690. }
  17691. case ARMISD::VGETLANEs:
  17692. case ARMISD::VGETLANEu: {
  17693. const SDValue &SrcSV = Op.getOperand(0);
  17694. EVT VecVT = SrcSV.getValueType();
  17695. assert(VecVT.isVector() && "VGETLANE expected a vector type");
  17696. const unsigned NumSrcElts = VecVT.getVectorNumElements();
  17697. ConstantSDNode *Pos = cast<ConstantSDNode>(Op.getOperand(1).getNode());
  17698. assert(Pos->getAPIntValue().ult(NumSrcElts) &&
  17699. "VGETLANE index out of bounds");
  17700. unsigned Idx = Pos->getZExtValue();
  17701. APInt DemandedElt = APInt::getOneBitSet(NumSrcElts, Idx);
  17702. Known = DAG.computeKnownBits(SrcSV, DemandedElt, Depth + 1);
  17703. EVT VT = Op.getValueType();
  17704. const unsigned DstSz = VT.getScalarSizeInBits();
  17705. const unsigned SrcSz = VecVT.getVectorElementType().getSizeInBits();
  17706. (void)SrcSz;
  17707. assert(SrcSz == Known.getBitWidth());
  17708. assert(DstSz > SrcSz);
  17709. if (Op.getOpcode() == ARMISD::VGETLANEs)
  17710. Known = Known.sext(DstSz);
  17711. else {
  17712. Known = Known.zext(DstSz);
  17713. }
  17714. assert(DstSz == Known.getBitWidth());
  17715. break;
  17716. }
  17717. case ARMISD::VMOVrh: {
  17718. KnownBits KnownOp = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
  17719. assert(KnownOp.getBitWidth() == 16);
  17720. Known = KnownOp.zext(32);
  17721. break;
  17722. }
  17723. case ARMISD::CSINC:
  17724. case ARMISD::CSINV:
  17725. case ARMISD::CSNEG: {
  17726. KnownBits KnownOp0 = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
  17727. KnownBits KnownOp1 = DAG.computeKnownBits(Op->getOperand(1), Depth + 1);
  17728. // The result is either:
  17729. // CSINC: KnownOp0 or KnownOp1 + 1
  17730. // CSINV: KnownOp0 or ~KnownOp1
  17731. // CSNEG: KnownOp0 or KnownOp1 * -1
  17732. if (Op.getOpcode() == ARMISD::CSINC)
  17733. KnownOp1 = KnownBits::computeForAddSub(
  17734. true, false, KnownOp1, KnownBits::makeConstant(APInt(32, 1)));
  17735. else if (Op.getOpcode() == ARMISD::CSINV)
  17736. std::swap(KnownOp1.Zero, KnownOp1.One);
  17737. else if (Op.getOpcode() == ARMISD::CSNEG)
  17738. KnownOp1 = KnownBits::mul(
  17739. KnownOp1, KnownBits::makeConstant(APInt(32, -1)));
  17740. Known = KnownBits::commonBits(KnownOp0, KnownOp1);
  17741. break;
  17742. }
  17743. }
  17744. }
  17745. bool ARMTargetLowering::targetShrinkDemandedConstant(
  17746. SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts,
  17747. TargetLoweringOpt &TLO) const {
  17748. // Delay optimization, so we don't have to deal with illegal types, or block
  17749. // optimizations.
  17750. if (!TLO.LegalOps)
  17751. return false;
  17752. // Only optimize AND for now.
  17753. if (Op.getOpcode() != ISD::AND)
  17754. return false;
  17755. EVT VT = Op.getValueType();
  17756. // Ignore vectors.
  17757. if (VT.isVector())
  17758. return false;
  17759. assert(VT == MVT::i32 && "Unexpected integer type");
  17760. // Make sure the RHS really is a constant.
  17761. ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1));
  17762. if (!C)
  17763. return false;
  17764. unsigned Mask = C->getZExtValue();
  17765. unsigned Demanded = DemandedBits.getZExtValue();
  17766. unsigned ShrunkMask = Mask & Demanded;
  17767. unsigned ExpandedMask = Mask | ~Demanded;
  17768. // If the mask is all zeros, let the target-independent code replace the
  17769. // result with zero.
  17770. if (ShrunkMask == 0)
  17771. return false;
  17772. // If the mask is all ones, erase the AND. (Currently, the target-independent
  17773. // code won't do this, so we have to do it explicitly to avoid an infinite
  17774. // loop in obscure cases.)
  17775. if (ExpandedMask == ~0U)
  17776. return TLO.CombineTo(Op, Op.getOperand(0));
  17777. auto IsLegalMask = [ShrunkMask, ExpandedMask](unsigned Mask) -> bool {
  17778. return (ShrunkMask & Mask) == ShrunkMask && (~ExpandedMask & Mask) == 0;
  17779. };
  17780. auto UseMask = [Mask, Op, VT, &TLO](unsigned NewMask) -> bool {
  17781. if (NewMask == Mask)
  17782. return true;
  17783. SDLoc DL(Op);
  17784. SDValue NewC = TLO.DAG.getConstant(NewMask, DL, VT);
  17785. SDValue NewOp = TLO.DAG.getNode(ISD::AND, DL, VT, Op.getOperand(0), NewC);
  17786. return TLO.CombineTo(Op, NewOp);
  17787. };
  17788. // Prefer uxtb mask.
  17789. if (IsLegalMask(0xFF))
  17790. return UseMask(0xFF);
  17791. // Prefer uxth mask.
  17792. if (IsLegalMask(0xFFFF))
  17793. return UseMask(0xFFFF);
  17794. // [1, 255] is Thumb1 movs+ands, legal immediate for ARM/Thumb2.
  17795. // FIXME: Prefer a contiguous sequence of bits for other optimizations.
  17796. if (ShrunkMask < 256)
  17797. return UseMask(ShrunkMask);
  17798. // [-256, -2] is Thumb1 movs+bics, legal immediate for ARM/Thumb2.
  17799. // FIXME: Prefer a contiguous sequence of bits for other optimizations.
  17800. if ((int)ExpandedMask <= -2 && (int)ExpandedMask >= -256)
  17801. return UseMask(ExpandedMask);
  17802. // Potential improvements:
  17803. //
  17804. // We could try to recognize lsls+lsrs or lsrs+lsls pairs here.
  17805. // We could try to prefer Thumb1 immediates which can be lowered to a
  17806. // two-instruction sequence.
  17807. // We could try to recognize more legal ARM/Thumb2 immediates here.
  17808. return false;
  17809. }
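// Worked example (hypothetical masks): if Op is (and X, 0xF00000FF) and only
// the low 16 bits are demanded, ShrunkMask is 0xFF and ExpandedMask is
// 0xFFFF00FF, so the constant is shrunk to 0xFF and the AND can later be
// selected as a uxtb.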
  17810. bool ARMTargetLowering::SimplifyDemandedBitsForTargetNode(
  17811. SDValue Op, const APInt &OriginalDemandedBits,
  17812. const APInt &OriginalDemandedElts, KnownBits &Known, TargetLoweringOpt &TLO,
  17813. unsigned Depth) const {
  17814. unsigned Opc = Op.getOpcode();
  17815. switch (Opc) {
  17816. case ARMISD::ASRL:
  17817. case ARMISD::LSRL: {
17818. // If this is result 0 and the other result is unused, see if the demanded
  17819. // bits allow us to shrink this long shift into a standard small shift in
  17820. // the opposite direction.
  17821. if (Op.getResNo() == 0 && !Op->hasAnyUseOfValue(1) &&
  17822. isa<ConstantSDNode>(Op->getOperand(2))) {
  17823. unsigned ShAmt = Op->getConstantOperandVal(2);
  17824. if (ShAmt < 32 && OriginalDemandedBits.isSubsetOf(APInt::getAllOnes(32)
  17825. << (32 - ShAmt)))
  17826. return TLO.CombineTo(
  17827. Op, TLO.DAG.getNode(
  17828. ISD::SHL, SDLoc(Op), MVT::i32, Op.getOperand(1),
  17829. TLO.DAG.getConstant(32 - ShAmt, SDLoc(Op), MVT::i32)));
  17830. }
  17831. break;
  17832. }
  17833. case ARMISD::VBICIMM: {
  17834. SDValue Op0 = Op.getOperand(0);
  17835. unsigned ModImm = Op.getConstantOperandVal(1);
  17836. unsigned EltBits = 0;
  17837. uint64_t Mask = ARM_AM::decodeVMOVModImm(ModImm, EltBits);
  17838. if ((OriginalDemandedBits & Mask) == 0)
  17839. return TLO.CombineTo(Op, Op0);
  17840. }
  17841. }
  17842. return TargetLowering::SimplifyDemandedBitsForTargetNode(
  17843. Op, OriginalDemandedBits, OriginalDemandedElts, Known, TLO, Depth);
  17844. }
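// Example for the ASRL/LSRL case (with a hypothetical shift amount): for a
// long shift right by 8 whose first result only has its top 8 bits demanded,
// those bits come entirely from the second input, so the node is replaced by
// (shl Op1, 24), i.e. a standard shift in the opposite direction.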
  17845. //===----------------------------------------------------------------------===//
  17846. // ARM Inline Assembly Support
  17847. //===----------------------------------------------------------------------===//
  17848. bool ARMTargetLowering::ExpandInlineAsm(CallInst *CI) const {
  17849. // Looking for "rev" which is V6+.
  17850. if (!Subtarget->hasV6Ops())
  17851. return false;
  17852. InlineAsm *IA = cast<InlineAsm>(CI->getCalledOperand());
  17853. std::string AsmStr = IA->getAsmString();
  17854. SmallVector<StringRef, 4> AsmPieces;
  17855. SplitString(AsmStr, AsmPieces, ";\n");
  17856. switch (AsmPieces.size()) {
  17857. default: return false;
  17858. case 1:
  17859. AsmStr = std::string(AsmPieces[0]);
  17860. AsmPieces.clear();
  17861. SplitString(AsmStr, AsmPieces, " \t,");
  17862. // rev $0, $1
  17863. if (AsmPieces.size() == 3 &&
  17864. AsmPieces[0] == "rev" && AsmPieces[1] == "$0" && AsmPieces[2] == "$1" &&
  17865. IA->getConstraintString().compare(0, 4, "=l,l") == 0) {
  17866. IntegerType *Ty = dyn_cast<IntegerType>(CI->getType());
  17867. if (Ty && Ty->getBitWidth() == 32)
  17868. return IntrinsicLowering::LowerToByteSwap(CI);
  17869. }
  17870. break;
  17871. }
  17872. return false;
  17873. }
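// In practice this means something like
//   asm("rev %0, %1" : "=l"(out) : "l"(in));
// on a v6+ target is lowered straight to the llvm.bswap.i32 intrinsic rather
// than being kept as opaque inline assembly.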
  17874. const char *ARMTargetLowering::LowerXConstraint(EVT ConstraintVT) const {
  17875. // At this point, we have to lower this constraint to something else, so we
  17876. // lower it to an "r" or "w". However, by doing this we will force the result
17877. // to be in a register, while the X constraint is much more permissive.
  17878. //
  17879. // Although we are correct (we are free to emit anything, without
  17880. // constraints), we might break use cases that would expect us to be more
  17881. // efficient and emit something else.
  17882. if (!Subtarget->hasVFP2Base())
  17883. return "r";
  17884. if (ConstraintVT.isFloatingPoint())
  17885. return "w";
  17886. if (ConstraintVT.isVector() && Subtarget->hasNEON() &&
  17887. (ConstraintVT.getSizeInBits() == 64 ||
  17888. ConstraintVT.getSizeInBits() == 128))
  17889. return "w";
  17890. return "r";
  17891. }
  17892. /// getConstraintType - Given a constraint letter, return the type of
  17893. /// constraint it is for this target.
  17894. ARMTargetLowering::ConstraintType
  17895. ARMTargetLowering::getConstraintType(StringRef Constraint) const {
  17896. unsigned S = Constraint.size();
  17897. if (S == 1) {
  17898. switch (Constraint[0]) {
  17899. default: break;
  17900. case 'l': return C_RegisterClass;
  17901. case 'w': return C_RegisterClass;
  17902. case 'h': return C_RegisterClass;
  17903. case 'x': return C_RegisterClass;
  17904. case 't': return C_RegisterClass;
  17905. case 'j': return C_Immediate; // Constant for movw.
  17906. // An address with a single base register. Due to the way we
  17907. // currently handle addresses it is the same as an 'r' memory constraint.
  17908. case 'Q': return C_Memory;
  17909. }
  17910. } else if (S == 2) {
  17911. switch (Constraint[0]) {
  17912. default: break;
  17913. case 'T': return C_RegisterClass;
  17914. // All 'U+' constraints are addresses.
  17915. case 'U': return C_Memory;
  17916. }
  17917. }
  17918. return TargetLowering::getConstraintType(Constraint);
  17919. }
  17920. /// Examine constraint type and operand type and determine a weight value.
  17921. /// This object must already have been set up with the operand type
  17922. /// and the current alternative constraint selected.
  17923. TargetLowering::ConstraintWeight
  17924. ARMTargetLowering::getSingleConstraintMatchWeight(
  17925. AsmOperandInfo &info, const char *constraint) const {
  17926. ConstraintWeight weight = CW_Invalid;
  17927. Value *CallOperandVal = info.CallOperandVal;
  17928. // If we don't have a value, we can't do a match,
  17929. // but allow it at the lowest weight.
  17930. if (!CallOperandVal)
  17931. return CW_Default;
  17932. Type *type = CallOperandVal->getType();
  17933. // Look at the constraint type.
  17934. switch (*constraint) {
  17935. default:
  17936. weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint);
  17937. break;
  17938. case 'l':
  17939. if (type->isIntegerTy()) {
  17940. if (Subtarget->isThumb())
  17941. weight = CW_SpecificReg;
  17942. else
  17943. weight = CW_Register;
  17944. }
  17945. break;
  17946. case 'w':
  17947. if (type->isFloatingPointTy())
  17948. weight = CW_Register;
  17949. break;
  17950. }
  17951. return weight;
  17952. }
  17953. using RCPair = std::pair<unsigned, const TargetRegisterClass *>;
  17954. RCPair ARMTargetLowering::getRegForInlineAsmConstraint(
  17955. const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const {
  17956. switch (Constraint.size()) {
  17957. case 1:
  17958. // GCC ARM Constraint Letters
  17959. switch (Constraint[0]) {
  17960. case 'l': // Low regs or general regs.
  17961. if (Subtarget->isThumb())
  17962. return RCPair(0U, &ARM::tGPRRegClass);
  17963. return RCPair(0U, &ARM::GPRRegClass);
  17964. case 'h': // High regs or no regs.
  17965. if (Subtarget->isThumb())
  17966. return RCPair(0U, &ARM::hGPRRegClass);
  17967. break;
  17968. case 'r':
  17969. if (Subtarget->isThumb1Only())
  17970. return RCPair(0U, &ARM::tGPRRegClass);
  17971. return RCPair(0U, &ARM::GPRRegClass);
  17972. case 'w':
  17973. if (VT == MVT::Other)
  17974. break;
  17975. if (VT == MVT::f16 || VT == MVT::bf16)
  17976. return RCPair(0U, &ARM::HPRRegClass);
  17977. if (VT == MVT::f32)
  17978. return RCPair(0U, &ARM::SPRRegClass);
  17979. if (VT.getSizeInBits() == 64)
  17980. return RCPair(0U, &ARM::DPRRegClass);
  17981. if (VT.getSizeInBits() == 128)
  17982. return RCPair(0U, &ARM::QPRRegClass);
  17983. break;
  17984. case 'x':
  17985. if (VT == MVT::Other)
  17986. break;
  17987. if (VT == MVT::f32)
  17988. return RCPair(0U, &ARM::SPR_8RegClass);
  17989. if (VT.getSizeInBits() == 64)
  17990. return RCPair(0U, &ARM::DPR_8RegClass);
  17991. if (VT.getSizeInBits() == 128)
  17992. return RCPair(0U, &ARM::QPR_8RegClass);
  17993. break;
  17994. case 't':
  17995. if (VT == MVT::Other)
  17996. break;
  17997. if (VT == MVT::f16 || VT == MVT::bf16)
  17998. return RCPair(0U, &ARM::HPRRegClass);
  17999. if (VT == MVT::f32 || VT == MVT::i32)
  18000. return RCPair(0U, &ARM::SPRRegClass);
  18001. if (VT.getSizeInBits() == 64)
  18002. return RCPair(0U, &ARM::DPR_VFP2RegClass);
  18003. if (VT.getSizeInBits() == 128)
  18004. return RCPair(0U, &ARM::QPR_VFP2RegClass);
  18005. break;
  18006. }
  18007. break;
  18008. case 2:
  18009. if (Constraint[0] == 'T') {
  18010. switch (Constraint[1]) {
  18011. default:
  18012. break;
  18013. case 'e':
  18014. return RCPair(0U, &ARM::tGPREvenRegClass);
  18015. case 'o':
  18016. return RCPair(0U, &ARM::tGPROddRegClass);
  18017. }
  18018. }
  18019. break;
  18020. default:
  18021. break;
  18022. }
  18023. if (StringRef("{cc}").equals_insensitive(Constraint))
  18024. return std::make_pair(unsigned(ARM::CPSR), &ARM::CCRRegClass);
  18025. return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
  18026. }
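// So, for example, "l" hands out a low GPR (r0-r7) in Thumb mode but any GPR
// in ARM mode, while "w" and "t" pick an FP register class purely from the
// requested value type, mirroring the GCC constraint letters for ARM.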
  18027. /// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
  18028. /// vector. If it is invalid, don't add anything to Ops.
  18029. void ARMTargetLowering::LowerAsmOperandForConstraint(SDValue Op,
  18030. std::string &Constraint,
  18031. std::vector<SDValue>&Ops,
  18032. SelectionDAG &DAG) const {
  18033. SDValue Result;
  18034. // Currently only support length 1 constraints.
  18035. if (Constraint.length() != 1) return;
  18036. char ConstraintLetter = Constraint[0];
  18037. switch (ConstraintLetter) {
  18038. default: break;
  18039. case 'j':
  18040. case 'I': case 'J': case 'K': case 'L':
  18041. case 'M': case 'N': case 'O':
  18042. ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op);
  18043. if (!C)
  18044. return;
  18045. int64_t CVal64 = C->getSExtValue();
  18046. int CVal = (int) CVal64;
  18047. // None of these constraints allow values larger than 32 bits. Check
  18048. // that the value fits in an int.
  18049. if (CVal != CVal64)
  18050. return;
  18051. switch (ConstraintLetter) {
  18052. case 'j':
  18053. // Constant suitable for movw, must be between 0 and
  18054. // 65535.
  18055. if (Subtarget->hasV6T2Ops() || (Subtarget->hasV8MBaselineOps()))
  18056. if (CVal >= 0 && CVal <= 65535)
  18057. break;
  18058. return;
  18059. case 'I':
  18060. if (Subtarget->isThumb1Only()) {
  18061. // This must be a constant between 0 and 255, for ADD
  18062. // immediates.
  18063. if (CVal >= 0 && CVal <= 255)
  18064. break;
  18065. } else if (Subtarget->isThumb2()) {
  18066. // A constant that can be used as an immediate value in a
  18067. // data-processing instruction.
  18068. if (ARM_AM::getT2SOImmVal(CVal) != -1)
  18069. break;
  18070. } else {
  18071. // A constant that can be used as an immediate value in a
  18072. // data-processing instruction.
  18073. if (ARM_AM::getSOImmVal(CVal) != -1)
  18074. break;
  18075. }
  18076. return;
  18077. case 'J':
  18078. if (Subtarget->isThumb1Only()) {
  18079. // This must be a constant between -255 and -1, for negated ADD
  18080. // immediates. This can be used in GCC with an "n" modifier that
  18081. // prints the negated value, for use with SUB instructions. It is
  18082. // not useful otherwise but is implemented for compatibility.
  18083. if (CVal >= -255 && CVal <= -1)
  18084. break;
  18085. } else {
  18086. // This must be a constant between -4095 and 4095. It is not clear
  18087. // what this constraint is intended for. Implemented for
  18088. // compatibility with GCC.
  18089. if (CVal >= -4095 && CVal <= 4095)
  18090. break;
  18091. }
  18092. return;
  18093. case 'K':
  18094. if (Subtarget->isThumb1Only()) {
  18095. // A 32-bit value where only one byte has a nonzero value. Exclude
  18096. // zero to match GCC. This constraint is used by GCC internally for
  18097. // constants that can be loaded with a move/shift combination.
  18098. // It is not useful otherwise but is implemented for compatibility.
  18099. if (CVal != 0 && ARM_AM::isThumbImmShiftedVal(CVal))
  18100. break;
  18101. } else if (Subtarget->isThumb2()) {
  18102. // A constant whose bitwise inverse can be used as an immediate
  18103. // value in a data-processing instruction. This can be used in GCC
  18104. // with a "B" modifier that prints the inverted value, for use with
  18105. // BIC and MVN instructions. It is not useful otherwise but is
  18106. // implemented for compatibility.
  18107. if (ARM_AM::getT2SOImmVal(~CVal) != -1)
  18108. break;
  18109. } else {
  18110. // A constant whose bitwise inverse can be used as an immediate
  18111. // value in a data-processing instruction. This can be used in GCC
  18112. // with a "B" modifier that prints the inverted value, for use with
  18113. // BIC and MVN instructions. It is not useful otherwise but is
  18114. // implemented for compatibility.
  18115. if (ARM_AM::getSOImmVal(~CVal) != -1)
  18116. break;
  18117. }
  18118. return;
  18119. case 'L':
  18120. if (Subtarget->isThumb1Only()) {
  18121. // This must be a constant between -7 and 7,
  18122. // for 3-operand ADD/SUB immediate instructions.
  18123. if (CVal >= -7 && CVal < 7)
  18124. break;
  18125. } else if (Subtarget->isThumb2()) {
  18126. // A constant whose negation can be used as an immediate value in a
  18127. // data-processing instruction. This can be used in GCC with an "n"
  18128. // modifier that prints the negated value, for use with SUB
  18129. // instructions. It is not useful otherwise but is implemented for
  18130. // compatibility.
  18131. if (ARM_AM::getT2SOImmVal(-CVal) != -1)
  18132. break;
  18133. } else {
  18134. // A constant whose negation can be used as an immediate value in a
  18135. // data-processing instruction. This can be used in GCC with an "n"
  18136. // modifier that prints the negated value, for use with SUB
  18137. // instructions. It is not useful otherwise but is implemented for
  18138. // compatibility.
  18139. if (ARM_AM::getSOImmVal(-CVal) != -1)
  18140. break;
  18141. }
  18142. return;
  18143. case 'M':
  18144. if (Subtarget->isThumb1Only()) {
  18145. // This must be a multiple of 4 between 0 and 1020, for
  18146. // ADD sp + immediate.
  18147. if ((CVal >= 0 && CVal <= 1020) && ((CVal & 3) == 0))
  18148. break;
  18149. } else {
  18150. // A power of two or a constant between 0 and 32. This is used in
  18151. // GCC for the shift amount on shifted register operands, but it is
  18152. // useful in general for any shift amounts.
  18153. if ((CVal >= 0 && CVal <= 32) || ((CVal & (CVal - 1)) == 0))
  18154. break;
  18155. }
  18156. return;
  18157. case 'N':
  18158. if (Subtarget->isThumb1Only()) {
  18159. // This must be a constant between 0 and 31, for shift amounts.
  18160. if (CVal >= 0 && CVal <= 31)
  18161. break;
  18162. }
  18163. return;
  18164. case 'O':
  18165. if (Subtarget->isThumb1Only()) {
  18166. // This must be a multiple of 4 between -508 and 508, for
  18167. // ADD/SUB sp = sp + immediate.
  18168. if ((CVal >= -508 && CVal <= 508) && ((CVal & 3) == 0))
  18169. break;
  18170. }
  18171. return;
  18172. }
  18173. Result = DAG.getTargetConstant(CVal, SDLoc(Op), Op.getValueType());
  18174. break;
  18175. }
  18176. if (Result.getNode()) {
  18177. Ops.push_back(Result);
  18178. return;
  18179. }
  18180. return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
  18181. }
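// For instance, asm("add %0, %1, %2" : "=r"(d) : "r"(a), "I"(200)) is
// accepted on every subtarget (200 satisfies each 'I' rule above), whereas
// "I"(300) on Thumb1 fails the check and no operand is added here.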
  18182. static RTLIB::Libcall getDivRemLibcall(
  18183. const SDNode *N, MVT::SimpleValueType SVT) {
  18184. assert((N->getOpcode() == ISD::SDIVREM || N->getOpcode() == ISD::UDIVREM ||
  18185. N->getOpcode() == ISD::SREM || N->getOpcode() == ISD::UREM) &&
  18186. "Unhandled Opcode in getDivRemLibcall");
  18187. bool isSigned = N->getOpcode() == ISD::SDIVREM ||
  18188. N->getOpcode() == ISD::SREM;
  18189. RTLIB::Libcall LC;
  18190. switch (SVT) {
  18191. default: llvm_unreachable("Unexpected request for libcall!");
  18192. case MVT::i8: LC = isSigned ? RTLIB::SDIVREM_I8 : RTLIB::UDIVREM_I8; break;
  18193. case MVT::i16: LC = isSigned ? RTLIB::SDIVREM_I16 : RTLIB::UDIVREM_I16; break;
  18194. case MVT::i32: LC = isSigned ? RTLIB::SDIVREM_I32 : RTLIB::UDIVREM_I32; break;
  18195. case MVT::i64: LC = isSigned ? RTLIB::SDIVREM_I64 : RTLIB::UDIVREM_I64; break;
  18196. }
  18197. return LC;
  18198. }
  18199. static TargetLowering::ArgListTy getDivRemArgList(
  18200. const SDNode *N, LLVMContext *Context, const ARMSubtarget *Subtarget) {
  18201. assert((N->getOpcode() == ISD::SDIVREM || N->getOpcode() == ISD::UDIVREM ||
  18202. N->getOpcode() == ISD::SREM || N->getOpcode() == ISD::UREM) &&
  18203. "Unhandled Opcode in getDivRemArgList");
  18204. bool isSigned = N->getOpcode() == ISD::SDIVREM ||
  18205. N->getOpcode() == ISD::SREM;
  18206. TargetLowering::ArgListTy Args;
  18207. TargetLowering::ArgListEntry Entry;
  18208. for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) {
  18209. EVT ArgVT = N->getOperand(i).getValueType();
  18210. Type *ArgTy = ArgVT.getTypeForEVT(*Context);
  18211. Entry.Node = N->getOperand(i);
  18212. Entry.Ty = ArgTy;
  18213. Entry.IsSExt = isSigned;
  18214. Entry.IsZExt = !isSigned;
  18215. Args.push_back(Entry);
  18216. }
  18217. if (Subtarget->isTargetWindows() && Args.size() >= 2)
  18218. std::swap(Args[0], Args[1]);
  18219. return Args;
  18220. }
  18221. SDValue ARMTargetLowering::LowerDivRem(SDValue Op, SelectionDAG &DAG) const {
  18222. assert((Subtarget->isTargetAEABI() || Subtarget->isTargetAndroid() ||
  18223. Subtarget->isTargetGNUAEABI() || Subtarget->isTargetMuslAEABI() ||
  18224. Subtarget->isTargetWindows()) &&
  18225. "Register-based DivRem lowering only");
  18226. unsigned Opcode = Op->getOpcode();
  18227. assert((Opcode == ISD::SDIVREM || Opcode == ISD::UDIVREM) &&
  18228. "Invalid opcode for Div/Rem lowering");
  18229. bool isSigned = (Opcode == ISD::SDIVREM);
  18230. EVT VT = Op->getValueType(0);
  18231. SDLoc dl(Op);
  18232. if (VT == MVT::i64 && isa<ConstantSDNode>(Op.getOperand(1))) {
  18233. SmallVector<SDValue> Result;
  18234. if (expandDIVREMByConstant(Op.getNode(), Result, MVT::i32, DAG)) {
  18235. SDValue Res0 =
  18236. DAG.getNode(ISD::BUILD_PAIR, dl, VT, Result[0], Result[1]);
  18237. SDValue Res1 =
  18238. DAG.getNode(ISD::BUILD_PAIR, dl, VT, Result[2], Result[3]);
  18239. return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(),
  18240. {Res0, Res1});
  18241. }
  18242. }
  18243. Type *Ty = VT.getTypeForEVT(*DAG.getContext());
  18244. // If the target has hardware divide, use divide + multiply + subtract:
  18245. // div = a / b
  18246. // rem = a - b * div
  18247. // return {div, rem}
  18248. // This should be lowered into UDIV/SDIV + MLS later on.
  18249. bool hasDivide = Subtarget->isThumb() ? Subtarget->hasDivideInThumbMode()
  18250. : Subtarget->hasDivideInARMMode();
  18251. if (hasDivide && Op->getValueType(0).isSimple() &&
  18252. Op->getSimpleValueType(0) == MVT::i32) {
  18253. unsigned DivOpcode = isSigned ? ISD::SDIV : ISD::UDIV;
  18254. const SDValue Dividend = Op->getOperand(0);
  18255. const SDValue Divisor = Op->getOperand(1);
  18256. SDValue Div = DAG.getNode(DivOpcode, dl, VT, Dividend, Divisor);
  18257. SDValue Mul = DAG.getNode(ISD::MUL, dl, VT, Div, Divisor);
  18258. SDValue Rem = DAG.getNode(ISD::SUB, dl, VT, Dividend, Mul);
  18259. SDValue Values[2] = {Div, Rem};
  18260. return DAG.getNode(ISD::MERGE_VALUES, dl, DAG.getVTList(VT, VT), Values);
  18261. }
  18262. RTLIB::Libcall LC = getDivRemLibcall(Op.getNode(),
  18263. VT.getSimpleVT().SimpleTy);
  18264. SDValue InChain = DAG.getEntryNode();
  18265. TargetLowering::ArgListTy Args = getDivRemArgList(Op.getNode(),
  18266. DAG.getContext(),
  18267. Subtarget);
  18268. SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC),
  18269. getPointerTy(DAG.getDataLayout()));
  18270. Type *RetTy = StructType::get(Ty, Ty);
  18271. if (Subtarget->isTargetWindows())
  18272. InChain = WinDBZCheckDenominator(DAG, Op.getNode(), InChain);
  18273. TargetLowering::CallLoweringInfo CLI(DAG);
  18274. CLI.setDebugLoc(dl).setChain(InChain)
  18275. .setCallee(getLibcallCallingConv(LC), RetTy, Callee, std::move(Args))
  18276. .setInRegister().setSExtResult(isSigned).setZExtResult(!isSigned);
  18277. std::pair<SDValue, SDValue> CallInfo = LowerCallTo(CLI);
  18278. return CallInfo.first;
  18279. }
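// On AEABI targets the call built above ends up as __aeabi_idivmod /
// __aeabi_uidivmod (for the common i32 case), which return the quotient in r0
// and the remainder in r1, so both results of the DIVREM node come back from
// a single libcall.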
18280. // Lowers REM using the divmod helpers;
18281. // see RTABI section 4.2/4.3.
  18282. SDValue ARMTargetLowering::LowerREM(SDNode *N, SelectionDAG &DAG) const {
  18283. EVT VT = N->getValueType(0);
  18284. if (VT == MVT::i64 && isa<ConstantSDNode>(N->getOperand(1))) {
  18285. SmallVector<SDValue> Result;
  18286. if (expandDIVREMByConstant(N, Result, MVT::i32, DAG))
  18287. return DAG.getNode(ISD::BUILD_PAIR, SDLoc(N), N->getValueType(0),
  18288. Result[0], Result[1]);
  18289. }
  18290. // Build return types (div and rem)
  18291. std::vector<Type*> RetTyParams;
  18292. Type *RetTyElement;
  18293. switch (VT.getSimpleVT().SimpleTy) {
  18294. default: llvm_unreachable("Unexpected request for libcall!");
  18295. case MVT::i8: RetTyElement = Type::getInt8Ty(*DAG.getContext()); break;
  18296. case MVT::i16: RetTyElement = Type::getInt16Ty(*DAG.getContext()); break;
  18297. case MVT::i32: RetTyElement = Type::getInt32Ty(*DAG.getContext()); break;
  18298. case MVT::i64: RetTyElement = Type::getInt64Ty(*DAG.getContext()); break;
  18299. }
  18300. RetTyParams.push_back(RetTyElement);
  18301. RetTyParams.push_back(RetTyElement);
  18302. ArrayRef<Type*> ret = ArrayRef<Type*>(RetTyParams);
  18303. Type *RetTy = StructType::get(*DAG.getContext(), ret);
  18304. RTLIB::Libcall LC = getDivRemLibcall(N, N->getValueType(0).getSimpleVT().
  18305. SimpleTy);
  18306. SDValue InChain = DAG.getEntryNode();
  18307. TargetLowering::ArgListTy Args = getDivRemArgList(N, DAG.getContext(),
  18308. Subtarget);
  18309. bool isSigned = N->getOpcode() == ISD::SREM;
  18310. SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC),
  18311. getPointerTy(DAG.getDataLayout()));
  18312. if (Subtarget->isTargetWindows())
  18313. InChain = WinDBZCheckDenominator(DAG, N, InChain);
  18314. // Lower call
  18315. CallLoweringInfo CLI(DAG);
  18316. CLI.setChain(InChain)
  18317. .setCallee(CallingConv::ARM_AAPCS, RetTy, Callee, std::move(Args))
  18318. .setSExtResult(isSigned).setZExtResult(!isSigned).setDebugLoc(SDLoc(N));
  18319. std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
  18320. // Return second (rem) result operand (first contains div)
  18321. SDNode *ResNode = CallResult.first.getNode();
  18322. assert(ResNode->getNumOperands() == 2 && "divmod should return two operands");
  18323. return ResNode->getOperand(1);
  18324. }
  18325. SDValue
  18326. ARMTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const {
  18327. assert(Subtarget->isTargetWindows() && "unsupported target platform");
  18328. SDLoc DL(Op);
  18329. // Get the inputs.
  18330. SDValue Chain = Op.getOperand(0);
  18331. SDValue Size = Op.getOperand(1);
  18332. if (DAG.getMachineFunction().getFunction().hasFnAttribute(
  18333. "no-stack-arg-probe")) {
  18334. MaybeAlign Align =
  18335. cast<ConstantSDNode>(Op.getOperand(2))->getMaybeAlignValue();
  18336. SDValue SP = DAG.getCopyFromReg(Chain, DL, ARM::SP, MVT::i32);
  18337. Chain = SP.getValue(1);
  18338. SP = DAG.getNode(ISD::SUB, DL, MVT::i32, SP, Size);
  18339. if (Align)
  18340. SP =
  18341. DAG.getNode(ISD::AND, DL, MVT::i32, SP.getValue(0),
  18342. DAG.getConstant(-(uint64_t)Align->value(), DL, MVT::i32));
  18343. Chain = DAG.getCopyToReg(Chain, DL, ARM::SP, SP);
  18344. SDValue Ops[2] = { SP, Chain };
  18345. return DAG.getMergeValues(Ops, DL);
  18346. }
  18347. SDValue Words = DAG.getNode(ISD::SRL, DL, MVT::i32, Size,
  18348. DAG.getConstant(2, DL, MVT::i32));
  18349. SDValue Flag;
  18350. Chain = DAG.getCopyToReg(Chain, DL, ARM::R4, Words, Flag);
  18351. Flag = Chain.getValue(1);
  18352. SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
  18353. Chain = DAG.getNode(ARMISD::WIN__CHKSTK, DL, NodeTys, Chain, Flag);
  18354. SDValue NewSP = DAG.getCopyFromReg(Chain, DL, ARM::SP, MVT::i32);
  18355. Chain = NewSP.getValue(1);
  18356. SDValue Ops[2] = { NewSP, Chain };
  18357. return DAG.getMergeValues(Ops, DL);
  18358. }
  18359. SDValue ARMTargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const {
  18360. bool IsStrict = Op->isStrictFPOpcode();
  18361. SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0);
  18362. const unsigned DstSz = Op.getValueType().getSizeInBits();
  18363. const unsigned SrcSz = SrcVal.getValueType().getSizeInBits();
  18364. assert(DstSz > SrcSz && DstSz <= 64 && SrcSz >= 16 &&
  18365. "Unexpected type for custom-lowering FP_EXTEND");
  18366. assert((!Subtarget->hasFP64() || !Subtarget->hasFPARMv8Base()) &&
  18367. "With both FP DP and 16, any FP conversion is legal!");
  18368. assert(!(DstSz == 32 && Subtarget->hasFP16()) &&
  18369. "With FP16, 16 to 32 conversion is legal!");
  18370. // Converting from 32 -> 64 is valid if we have FP64.
  18371. if (SrcSz == 32 && DstSz == 64 && Subtarget->hasFP64()) {
  18372. // FIXME: Remove this when we have strict fp instruction selection patterns
  18373. if (IsStrict) {
  18374. SDLoc Loc(Op);
  18375. SDValue Result = DAG.getNode(ISD::FP_EXTEND,
  18376. Loc, Op.getValueType(), SrcVal);
  18377. return DAG.getMergeValues({Result, Op.getOperand(0)}, Loc);
  18378. }
  18379. return Op;
  18380. }
18381. // Either we are converting from 16 -> 64 without FP16 and/or without
18382. // double-precision FP (or without Armv8 FP), so we must do it in two
18383. // steps;
18384. // or we are converting from 32 -> 64 without double-precision FP, or from
18385. // 16 -> 32 without FP16, so we must make a libcall.
  18386. SDLoc Loc(Op);
  18387. RTLIB::Libcall LC;
  18388. MakeLibCallOptions CallOptions;
  18389. SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
  18390. for (unsigned Sz = SrcSz; Sz <= 32 && Sz < DstSz; Sz *= 2) {
  18391. bool Supported = (Sz == 16 ? Subtarget->hasFP16() : Subtarget->hasFP64());
  18392. MVT SrcVT = (Sz == 16 ? MVT::f16 : MVT::f32);
  18393. MVT DstVT = (Sz == 16 ? MVT::f32 : MVT::f64);
  18394. if (Supported) {
  18395. if (IsStrict) {
  18396. SrcVal = DAG.getNode(ISD::STRICT_FP_EXTEND, Loc,
  18397. {DstVT, MVT::Other}, {Chain, SrcVal});
  18398. Chain = SrcVal.getValue(1);
  18399. } else {
  18400. SrcVal = DAG.getNode(ISD::FP_EXTEND, Loc, DstVT, SrcVal);
  18401. }
  18402. } else {
  18403. LC = RTLIB::getFPEXT(SrcVT, DstVT);
  18404. assert(LC != RTLIB::UNKNOWN_LIBCALL &&
  18405. "Unexpected type for custom-lowering FP_EXTEND");
  18406. std::tie(SrcVal, Chain) = makeLibCall(DAG, LC, DstVT, SrcVal, CallOptions,
  18407. Loc, Chain);
  18408. }
  18409. }
  18410. return IsStrict ? DAG.getMergeValues({SrcVal, Chain}, Loc) : SrcVal;
  18411. }
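// E.g. extending f16 to f64 with neither FP16 nor FP64 support is done as two
// hops (f16 -> f32 -> f64), where each unsupported hop becomes the
// corresponding RTLIB::getFPEXT libcall instead of a hardware conversion.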
  18412. SDValue ARMTargetLowering::LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
  18413. bool IsStrict = Op->isStrictFPOpcode();
  18414. SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0);
  18415. EVT SrcVT = SrcVal.getValueType();
  18416. EVT DstVT = Op.getValueType();
  18417. const unsigned DstSz = Op.getValueType().getSizeInBits();
  18418. const unsigned SrcSz = SrcVT.getSizeInBits();
  18419. (void)DstSz;
  18420. assert(DstSz < SrcSz && SrcSz <= 64 && DstSz >= 16 &&
  18421. "Unexpected type for custom-lowering FP_ROUND");
  18422. assert((!Subtarget->hasFP64() || !Subtarget->hasFPARMv8Base()) &&
  18423. "With both FP DP and 16, any FP conversion is legal!");
  18424. SDLoc Loc(Op);
18425. // A single instruction handles 32 -> 16 when the subtarget has FP16.
  18426. if (SrcSz == 32 && Subtarget->hasFP16())
  18427. return Op;
  18428. // Lib call from 32 -> 16 / 64 -> [32, 16]
  18429. RTLIB::Libcall LC = RTLIB::getFPROUND(SrcVT, DstVT);
  18430. assert(LC != RTLIB::UNKNOWN_LIBCALL &&
  18431. "Unexpected type for custom-lowering FP_ROUND");
  18432. MakeLibCallOptions CallOptions;
  18433. SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
  18434. SDValue Result;
  18435. std::tie(Result, Chain) = makeLibCall(DAG, LC, DstVT, SrcVal, CallOptions,
  18436. Loc, Chain);
  18437. return IsStrict ? DAG.getMergeValues({Result, Chain}, Loc) : Result;
  18438. }
  18439. bool
  18440. ARMTargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const {
  18441. // The ARM target isn't yet aware of offsets.
  18442. return false;
  18443. }
  18444. bool ARM::isBitFieldInvertedMask(unsigned v) {
  18445. if (v == 0xffffffff)
  18446. return false;
18447. // There can be 1's on either or both "outsides"; all the "inside"
18448. // bits must be 0's.
  18449. return isShiftedMask_32(~v);
  18450. }
  18451. /// isFPImmLegal - Returns true if the target can instruction select the
  18452. /// specified FP immediate natively. If false, the legalizer will
  18453. /// materialize the FP immediate as a load from a constant pool.
  18454. bool ARMTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT,
  18455. bool ForCodeSize) const {
  18456. if (!Subtarget->hasVFP3Base())
  18457. return false;
  18458. if (VT == MVT::f16 && Subtarget->hasFullFP16())
  18459. return ARM_AM::getFP16Imm(Imm) != -1;
  18460. if (VT == MVT::f32 && Subtarget->hasFullFP16() &&
  18461. ARM_AM::getFP32FP16Imm(Imm) != -1)
  18462. return true;
  18463. if (VT == MVT::f32)
  18464. return ARM_AM::getFP32Imm(Imm) != -1;
  18465. if (VT == MVT::f64 && Subtarget->hasFP64())
  18466. return ARM_AM::getFP64Imm(Imm) != -1;
  18467. return false;
  18468. }
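// The helpers above accept exactly the 8-bit VFP-encodable immediates, so
// constants such as 1.0, 0.5 or 2.0 become a single vmov, while something
// like 0.1 (or 0.0, which has no such encoding) is instead materialized from
// the constant pool by the legalizer.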
  18469. /// getTgtMemIntrinsic - Represent NEON load and store intrinsics as
  18470. /// MemIntrinsicNodes. The associated MachineMemOperands record the alignment
  18471. /// specified in the intrinsic calls.
  18472. bool ARMTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
  18473. const CallInst &I,
  18474. MachineFunction &MF,
  18475. unsigned Intrinsic) const {
  18476. switch (Intrinsic) {
  18477. case Intrinsic::arm_neon_vld1:
  18478. case Intrinsic::arm_neon_vld2:
  18479. case Intrinsic::arm_neon_vld3:
  18480. case Intrinsic::arm_neon_vld4:
  18481. case Intrinsic::arm_neon_vld2lane:
  18482. case Intrinsic::arm_neon_vld3lane:
  18483. case Intrinsic::arm_neon_vld4lane:
  18484. case Intrinsic::arm_neon_vld2dup:
  18485. case Intrinsic::arm_neon_vld3dup:
  18486. case Intrinsic::arm_neon_vld4dup: {
  18487. Info.opc = ISD::INTRINSIC_W_CHAIN;
  18488. // Conservatively set memVT to the entire set of vectors loaded.
  18489. auto &DL = I.getCalledFunction()->getParent()->getDataLayout();
  18490. uint64_t NumElts = DL.getTypeSizeInBits(I.getType()) / 64;
  18491. Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
  18492. Info.ptrVal = I.getArgOperand(0);
  18493. Info.offset = 0;
  18494. Value *AlignArg = I.getArgOperand(I.arg_size() - 1);
  18495. Info.align = cast<ConstantInt>(AlignArg)->getMaybeAlignValue();
  18496. // volatile loads with NEON intrinsics not supported
  18497. Info.flags = MachineMemOperand::MOLoad;
  18498. return true;
  18499. }
  18500. case Intrinsic::arm_neon_vld1x2:
  18501. case Intrinsic::arm_neon_vld1x3:
  18502. case Intrinsic::arm_neon_vld1x4: {
  18503. Info.opc = ISD::INTRINSIC_W_CHAIN;
  18504. // Conservatively set memVT to the entire set of vectors loaded.
  18505. auto &DL = I.getCalledFunction()->getParent()->getDataLayout();
  18506. uint64_t NumElts = DL.getTypeSizeInBits(I.getType()) / 64;
  18507. Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
  18508. Info.ptrVal = I.getArgOperand(I.arg_size() - 1);
  18509. Info.offset = 0;
  18510. Info.align.reset();
  18511. // volatile loads with NEON intrinsics not supported
  18512. Info.flags = MachineMemOperand::MOLoad;
  18513. return true;
  18514. }
  18515. case Intrinsic::arm_neon_vst1:
  18516. case Intrinsic::arm_neon_vst2:
  18517. case Intrinsic::arm_neon_vst3:
  18518. case Intrinsic::arm_neon_vst4:
  18519. case Intrinsic::arm_neon_vst2lane:
  18520. case Intrinsic::arm_neon_vst3lane:
  18521. case Intrinsic::arm_neon_vst4lane: {
  18522. Info.opc = ISD::INTRINSIC_VOID;
  18523. // Conservatively set memVT to the entire set of vectors stored.
  18524. auto &DL = I.getCalledFunction()->getParent()->getDataLayout();
  18525. unsigned NumElts = 0;
  18526. for (unsigned ArgI = 1, ArgE = I.arg_size(); ArgI < ArgE; ++ArgI) {
  18527. Type *ArgTy = I.getArgOperand(ArgI)->getType();
  18528. if (!ArgTy->isVectorTy())
  18529. break;
  18530. NumElts += DL.getTypeSizeInBits(ArgTy) / 64;
  18531. }
  18532. Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
  18533. Info.ptrVal = I.getArgOperand(0);
  18534. Info.offset = 0;
  18535. Value *AlignArg = I.getArgOperand(I.arg_size() - 1);
  18536. Info.align = cast<ConstantInt>(AlignArg)->getMaybeAlignValue();
  18537. // volatile stores with NEON intrinsics not supported
  18538. Info.flags = MachineMemOperand::MOStore;
  18539. return true;
  18540. }
  18541. case Intrinsic::arm_neon_vst1x2:
  18542. case Intrinsic::arm_neon_vst1x3:
  18543. case Intrinsic::arm_neon_vst1x4: {
  18544. Info.opc = ISD::INTRINSIC_VOID;
  18545. // Conservatively set memVT to the entire set of vectors stored.
  18546. auto &DL = I.getCalledFunction()->getParent()->getDataLayout();
  18547. unsigned NumElts = 0;
  18548. for (unsigned ArgI = 1, ArgE = I.arg_size(); ArgI < ArgE; ++ArgI) {
  18549. Type *ArgTy = I.getArgOperand(ArgI)->getType();
  18550. if (!ArgTy->isVectorTy())
  18551. break;
  18552. NumElts += DL.getTypeSizeInBits(ArgTy) / 64;
  18553. }
  18554. Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
  18555. Info.ptrVal = I.getArgOperand(0);
  18556. Info.offset = 0;
  18557. Info.align.reset();
  18558. // volatile stores with NEON intrinsics not supported
  18559. Info.flags = MachineMemOperand::MOStore;
  18560. return true;
  18561. }
  18562. case Intrinsic::arm_mve_vld2q:
  18563. case Intrinsic::arm_mve_vld4q: {
  18564. Info.opc = ISD::INTRINSIC_W_CHAIN;
  18565. // Conservatively set memVT to the entire set of vectors loaded.
  18566. Type *VecTy = cast<StructType>(I.getType())->getElementType(1);
  18567. unsigned Factor = Intrinsic == Intrinsic::arm_mve_vld2q ? 2 : 4;
  18568. Info.memVT = EVT::getVectorVT(VecTy->getContext(), MVT::i64, Factor * 2);
  18569. Info.ptrVal = I.getArgOperand(0);
  18570. Info.offset = 0;
  18571. Info.align = Align(VecTy->getScalarSizeInBits() / 8);
  18572. // volatile loads with MVE intrinsics not supported
  18573. Info.flags = MachineMemOperand::MOLoad;
  18574. return true;
  18575. }
  18576. case Intrinsic::arm_mve_vst2q:
  18577. case Intrinsic::arm_mve_vst4q: {
  18578. Info.opc = ISD::INTRINSIC_VOID;
  18579. // Conservatively set memVT to the entire set of vectors stored.
  18580. Type *VecTy = I.getArgOperand(1)->getType();
  18581. unsigned Factor = Intrinsic == Intrinsic::arm_mve_vst2q ? 2 : 4;
  18582. Info.memVT = EVT::getVectorVT(VecTy->getContext(), MVT::i64, Factor * 2);
  18583. Info.ptrVal = I.getArgOperand(0);
  18584. Info.offset = 0;
  18585. Info.align = Align(VecTy->getScalarSizeInBits() / 8);
  18586. // volatile stores with MVE intrinsics not supported
  18587. Info.flags = MachineMemOperand::MOStore;
  18588. return true;
  18589. }
  18590. case Intrinsic::arm_mve_vldr_gather_base:
  18591. case Intrinsic::arm_mve_vldr_gather_base_predicated: {
  18592. Info.opc = ISD::INTRINSIC_W_CHAIN;
    Info.ptrVal = nullptr;
    Info.memVT = MVT::getVT(I.getType());
    Info.align = Align(1);
    Info.flags |= MachineMemOperand::MOLoad;
    return true;
  }
  case Intrinsic::arm_mve_vldr_gather_base_wb:
  case Intrinsic::arm_mve_vldr_gather_base_wb_predicated: {
    Info.opc = ISD::INTRINSIC_W_CHAIN;
    Info.ptrVal = nullptr;
    Info.memVT = MVT::getVT(I.getType()->getContainedType(0));
    Info.align = Align(1);
    Info.flags |= MachineMemOperand::MOLoad;
    return true;
  }
  case Intrinsic::arm_mve_vldr_gather_offset:
  case Intrinsic::arm_mve_vldr_gather_offset_predicated: {
    Info.opc = ISD::INTRINSIC_W_CHAIN;
    Info.ptrVal = nullptr;
    MVT DataVT = MVT::getVT(I.getType());
    unsigned MemSize = cast<ConstantInt>(I.getArgOperand(2))->getZExtValue();
    Info.memVT = MVT::getVectorVT(MVT::getIntegerVT(MemSize),
                                  DataVT.getVectorNumElements());
    Info.align = Align(1);
    Info.flags |= MachineMemOperand::MOLoad;
    return true;
  }
  case Intrinsic::arm_mve_vstr_scatter_base:
  case Intrinsic::arm_mve_vstr_scatter_base_predicated: {
    Info.opc = ISD::INTRINSIC_VOID;
    Info.ptrVal = nullptr;
    Info.memVT = MVT::getVT(I.getArgOperand(2)->getType());
    Info.align = Align(1);
    Info.flags |= MachineMemOperand::MOStore;
    return true;
  }
  case Intrinsic::arm_mve_vstr_scatter_base_wb:
  case Intrinsic::arm_mve_vstr_scatter_base_wb_predicated: {
    Info.opc = ISD::INTRINSIC_W_CHAIN;
    Info.ptrVal = nullptr;
    Info.memVT = MVT::getVT(I.getArgOperand(2)->getType());
    Info.align = Align(1);
    Info.flags |= MachineMemOperand::MOStore;
    return true;
  }
  case Intrinsic::arm_mve_vstr_scatter_offset:
  case Intrinsic::arm_mve_vstr_scatter_offset_predicated: {
    Info.opc = ISD::INTRINSIC_VOID;
    Info.ptrVal = nullptr;
    MVT DataVT = MVT::getVT(I.getArgOperand(2)->getType());
    unsigned MemSize = cast<ConstantInt>(I.getArgOperand(3))->getZExtValue();
    Info.memVT = MVT::getVectorVT(MVT::getIntegerVT(MemSize),
                                  DataVT.getVectorNumElements());
    Info.align = Align(1);
    Info.flags |= MachineMemOperand::MOStore;
    return true;
  }
  case Intrinsic::arm_ldaex:
  case Intrinsic::arm_ldrex: {
    auto &DL = I.getCalledFunction()->getParent()->getDataLayout();
    Type *ValTy = I.getParamElementType(0);
    Info.opc = ISD::INTRINSIC_W_CHAIN;
    Info.memVT = MVT::getVT(ValTy);
    Info.ptrVal = I.getArgOperand(0);
    Info.offset = 0;
    Info.align = DL.getABITypeAlign(ValTy);
    Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile;
    return true;
  }
  case Intrinsic::arm_stlex:
  case Intrinsic::arm_strex: {
    auto &DL = I.getCalledFunction()->getParent()->getDataLayout();
    Type *ValTy = I.getParamElementType(1);
    Info.opc = ISD::INTRINSIC_W_CHAIN;
    Info.memVT = MVT::getVT(ValTy);
    Info.ptrVal = I.getArgOperand(1);
    Info.offset = 0;
    Info.align = DL.getABITypeAlign(ValTy);
    Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MOVolatile;
    return true;
  }
  case Intrinsic::arm_stlexd:
  case Intrinsic::arm_strexd:
    Info.opc = ISD::INTRINSIC_W_CHAIN;
    Info.memVT = MVT::i64;
    Info.ptrVal = I.getArgOperand(2);
    Info.offset = 0;
    Info.align = Align(8);
    Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MOVolatile;
    return true;
  case Intrinsic::arm_ldaexd:
  case Intrinsic::arm_ldrexd:
    Info.opc = ISD::INTRINSIC_W_CHAIN;
    Info.memVT = MVT::i64;
    Info.ptrVal = I.getArgOperand(0);
    Info.offset = 0;
    Info.align = Align(8);
    Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile;
    return true;
  default:
    break;
  }
  return false;
}

/// Returns true if it is beneficial to convert a load of a constant
/// to just the constant itself.
bool ARMTargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
                                                          Type *Ty) const {
  assert(Ty->isIntegerTy());

  unsigned Bits = Ty->getPrimitiveSizeInBits();
  if (Bits == 0 || Bits > 32)
    return false;
  return true;
}

bool ARMTargetLowering::isExtractSubvectorCheap(EVT ResVT, EVT SrcVT,
                                                unsigned Index) const {
  if (!isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, ResVT))
    return false;
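  // Extracting the low half (index 0) or the subvector that starts right after
  // it is expected to map onto a plain subregister copy (for example a D
  // register out of a Q register), so only those indices are treated as cheap.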
  return (Index == 0 || Index == ResVT.getVectorNumElements());
}

Instruction *ARMTargetLowering::makeDMB(IRBuilderBase &Builder,
                                        ARM_MB::MemBOpt Domain) const {
  Module *M = Builder.GetInsertBlock()->getParent()->getParent();

  // First, if the target has no DMB, see what fallback we can use.
  if (!Subtarget->hasDataBarrier()) {
    // Some ARMv6 cpus can support data barriers with an mcr instruction.
    // Thumb1 and pre-v6 ARM mode use a libcall instead and should never get
    // here.
    if (Subtarget->hasV6Ops() && !Subtarget->isThumb()) {
      Function *MCR = Intrinsic::getDeclaration(M, Intrinsic::arm_mcr);
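      // These operands encode "mcr p15, 0, <Rt>, c7, c10, 5", the CP15 data
      // memory barrier operation on ARMv6.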
      Value* args[6] = {Builder.getInt32(15), Builder.getInt32(0),
                        Builder.getInt32(0), Builder.getInt32(7),
                        Builder.getInt32(10), Builder.getInt32(5)};
      return Builder.CreateCall(MCR, args);
    } else {
      // Instead of using barriers, atomic accesses on these subtargets use
      // libcalls.
      llvm_unreachable("makeDMB on a target so old that it has no barriers");
    }
  } else {
    Function *DMB = Intrinsic::getDeclaration(M, Intrinsic::arm_dmb);
    // Only a full system barrier exists in the M-class architectures.
    Domain = Subtarget->isMClass() ? ARM_MB::SY : Domain;
    Constant *CDomain = Builder.getInt32(Domain);
    return Builder.CreateCall(DMB, CDomain);
  }
}

// Based on http://www.cl.cam.ac.uk/~pes20/cpp/cpp0xmappings.html
Instruction *ARMTargetLowering::emitLeadingFence(IRBuilderBase &Builder,
                                                 Instruction *Inst,
                                                 AtomicOrdering Ord) const {
  switch (Ord) {
  case AtomicOrdering::NotAtomic:
  case AtomicOrdering::Unordered:
    llvm_unreachable("Invalid fence: unordered/non-atomic");
  case AtomicOrdering::Monotonic:
  case AtomicOrdering::Acquire:
    return nullptr; // Nothing to do
  case AtomicOrdering::SequentiallyConsistent:
    if (!Inst->hasAtomicStore())
      return nullptr; // Nothing to do
    [[fallthrough]];
  case AtomicOrdering::Release:
  case AtomicOrdering::AcquireRelease:
    if (Subtarget->preferISHSTBarriers())
      return makeDMB(Builder, ARM_MB::ISHST);
    // FIXME: add a comment with a link to documentation justifying this.
    else
      return makeDMB(Builder, ARM_MB::ISH);
  }
  llvm_unreachable("Unknown fence ordering in emitLeadingFence");
}

Instruction *ARMTargetLowering::emitTrailingFence(IRBuilderBase &Builder,
                                                  Instruction *Inst,
                                                  AtomicOrdering Ord) const {
  switch (Ord) {
  case AtomicOrdering::NotAtomic:
  case AtomicOrdering::Unordered:
    llvm_unreachable("Invalid fence: unordered/not-atomic");
  case AtomicOrdering::Monotonic:
  case AtomicOrdering::Release:
    return nullptr; // Nothing to do
  case AtomicOrdering::Acquire:
  case AtomicOrdering::AcquireRelease:
  case AtomicOrdering::SequentiallyConsistent:
    return makeDMB(Builder, ARM_MB::ISH);
  }
  llvm_unreachable("Unknown fence ordering in emitTrailingFence");
}

// Loads and stores less than 64-bits are already atomic; ones above that
// are doomed anyway, so defer to the default libcall and blame the OS when
// things go wrong. Cortex M doesn't have ldrexd/strexd though, so don't emit
// anything for those.
TargetLoweringBase::AtomicExpansionKind
ARMTargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
  bool has64BitAtomicStore;
  if (Subtarget->isMClass())
    has64BitAtomicStore = false;
  else if (Subtarget->isThumb())
    has64BitAtomicStore = Subtarget->hasV7Ops();
  else
    has64BitAtomicStore = Subtarget->hasV6Ops();

  unsigned Size = SI->getValueOperand()->getType()->getPrimitiveSizeInBits();
  return Size == 64 && has64BitAtomicStore ? AtomicExpansionKind::Expand
                                           : AtomicExpansionKind::None;
}

// Loads and stores less than 64-bits are already atomic; ones above that
// are doomed anyway, so defer to the default libcall and blame the OS when
// things go wrong. Cortex M doesn't have ldrexd/strexd though, so don't emit
// anything for those.
// FIXME: ldrd and strd are atomic if the CPU has LPAE (e.g. A15 has that
// guarantee, see DDI0406C ARM architecture reference manual,
// sections A8.8.72-74 LDRD)
TargetLowering::AtomicExpansionKind
ARMTargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
  bool has64BitAtomicLoad;
  if (Subtarget->isMClass())
    has64BitAtomicLoad = false;
  else if (Subtarget->isThumb())
    has64BitAtomicLoad = Subtarget->hasV7Ops();
  else
    has64BitAtomicLoad = Subtarget->hasV6Ops();

  unsigned Size = LI->getType()->getPrimitiveSizeInBits();
  return (Size == 64 && has64BitAtomicLoad) ? AtomicExpansionKind::LLOnly
                                            : AtomicExpansionKind::None;
}

// For the real atomic operations, we have ldrex/strex up to 32 bits,
// and up to 64 bits on the non-M profiles
TargetLowering::AtomicExpansionKind
ARMTargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
  if (AI->isFloatingPointOperation())
    return AtomicExpansionKind::CmpXChg;

  unsigned Size = AI->getType()->getPrimitiveSizeInBits();
  bool hasAtomicRMW;
  if (Subtarget->isMClass())
    hasAtomicRMW = Subtarget->hasV8MBaselineOps();
  else if (Subtarget->isThumb())
    hasAtomicRMW = Subtarget->hasV7Ops();
  else
    hasAtomicRMW = Subtarget->hasV6Ops();
  if (Size <= (Subtarget->isMClass() ? 32U : 64U) && hasAtomicRMW) {
    // At -O0, fast-regalloc cannot cope with the live vregs necessary to
    // implement atomicrmw without spilling. If the target address is also on
    // the stack and close enough to the spill slot, this can lead to a
    // situation where the monitor always gets cleared and the atomic operation
    // can never succeed. So at -O0 lower this operation to a CAS loop.
    if (getTargetMachine().getOptLevel() == CodeGenOpt::None)
      return AtomicExpansionKind::CmpXChg;
    return AtomicExpansionKind::LLSC;
  }
  return AtomicExpansionKind::None;
}

// Similar to shouldExpandAtomicRMWInIR, ldrex/strex can be used up to 32
// bits, and up to 64 bits on the non-M profiles.
TargetLowering::AtomicExpansionKind
ARMTargetLowering::shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *AI) const {
  // At -O0, fast-regalloc cannot cope with the live vregs necessary to
  // implement cmpxchg without spilling. If the address being exchanged is also
  // on the stack and close enough to the spill slot, this can lead to a
  // situation where the monitor always gets cleared and the atomic operation
  // can never succeed. So at -O0 we need a late-expanded pseudo-inst instead.
  unsigned Size = AI->getOperand(1)->getType()->getPrimitiveSizeInBits();
  bool HasAtomicCmpXchg;
  if (Subtarget->isMClass())
    HasAtomicCmpXchg = Subtarget->hasV8MBaselineOps();
  else if (Subtarget->isThumb())
    HasAtomicCmpXchg = Subtarget->hasV7Ops();
  else
    HasAtomicCmpXchg = Subtarget->hasV6Ops();
  if (getTargetMachine().getOptLevel() != 0 && HasAtomicCmpXchg &&
      Size <= (Subtarget->isMClass() ? 32U : 64U))
    return AtomicExpansionKind::LLSC;
  return AtomicExpansionKind::None;
}

bool ARMTargetLowering::shouldInsertFencesForAtomic(
    const Instruction *I) const {
  return InsertFencesForAtomic;
}

bool ARMTargetLowering::useLoadStackGuardNode() const {
  // ROPI/RWPI are not supported currently.
  return !Subtarget->isROPI() && !Subtarget->isRWPI();
}

void ARMTargetLowering::insertSSPDeclarations(Module &M) const {
  if (!Subtarget->getTargetTriple().isWindowsMSVCEnvironment())
    return TargetLowering::insertSSPDeclarations(M);

  // MSVC CRT has a global variable holding security cookie.
  M.getOrInsertGlobal("__security_cookie",
                      Type::getInt8PtrTy(M.getContext()));

  // MSVC CRT has a function to validate security cookie.
  FunctionCallee SecurityCheckCookie = M.getOrInsertFunction(
      "__security_check_cookie", Type::getVoidTy(M.getContext()),
      Type::getInt8PtrTy(M.getContext()));
  if (Function *F = dyn_cast<Function>(SecurityCheckCookie.getCallee()))
    F->addParamAttr(0, Attribute::AttrKind::InReg);
}

Value *ARMTargetLowering::getSDagStackGuard(const Module &M) const {
  // MSVC CRT has a global variable holding security cookie.
  if (Subtarget->getTargetTriple().isWindowsMSVCEnvironment())
    return M.getGlobalVariable("__security_cookie");
  return TargetLowering::getSDagStackGuard(M);
}

Function *ARMTargetLowering::getSSPStackGuardCheck(const Module &M) const {
  // MSVC CRT has a function to validate security cookie.
  if (Subtarget->getTargetTriple().isWindowsMSVCEnvironment())
    return M.getFunction("__security_check_cookie");
  return TargetLowering::getSSPStackGuardCheck(M);
}

bool ARMTargetLowering::canCombineStoreAndExtract(Type *VectorTy, Value *Idx,
                                                  unsigned &Cost) const {
  // If we do not have NEON, vector types are not natively supported.
  if (!Subtarget->hasNEON())
    return false;

  // Floating point values and vector values map to the same register file.
  // Therefore, although we could do a store extract of a vector type, this is
  // better to leave at float as we have more freedom in the addressing mode for
  // those.
  if (VectorTy->isFPOrFPVectorTy())
    return false;

  // If the index is unknown at compile time, this is very expensive to lower
  // and it is not possible to combine the store with the extract.
  if (!isa<ConstantInt>(Idx))
    return false;

  assert(VectorTy->isVectorTy() && "VectorTy is not a vector type");
  unsigned BitWidth = VectorTy->getPrimitiveSizeInBits().getFixedValue();
  // We can do a store + vector extract on any vector that fits perfectly in a D
  // or Q register.
  if (BitWidth == 64 || BitWidth == 128) {
    Cost = 0;
    return true;
  }
  return false;
}

bool ARMTargetLowering::isCheapToSpeculateCttz(Type *Ty) const {
  return Subtarget->hasV6T2Ops();
}

bool ARMTargetLowering::isCheapToSpeculateCtlz(Type *Ty) const {
  return Subtarget->hasV6T2Ops();
}

bool ARMTargetLowering::isMaskAndCmp0FoldingBeneficial(
    const Instruction &AndI) const {
  if (!Subtarget->hasV7Ops())
    return false;

  // Sink the `and` instruction only if the mask would fit into a modified
  // immediate operand.
  ConstantInt *Mask = dyn_cast<ConstantInt>(AndI.getOperand(1));
  if (!Mask || Mask->getValue().getBitWidth() > 32u)
    return false;
  auto MaskVal = unsigned(Mask->getValue().getZExtValue());
  return (Subtarget->isThumb2() ? ARM_AM::getT2SOImmVal(MaskVal)
                                : ARM_AM::getSOImmVal(MaskVal)) != -1;
}

TargetLowering::ShiftLegalizationStrategy
ARMTargetLowering::preferredShiftLegalizationStrategy(
    SelectionDAG &DAG, SDNode *N, unsigned ExpansionFactor) const {
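  // When optimising for minimum size, a shift of an illegal type is smaller as
  // a runtime library call than as an inline expansion; Windows targets keep
  // the generic strategy.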
  if (Subtarget->hasMinSize() && !Subtarget->isTargetWindows())
    return ShiftLegalizationStrategy::LowerToLibcall;
  return TargetLowering::preferredShiftLegalizationStrategy(DAG, N,
                                                            ExpansionFactor);
}

Value *ARMTargetLowering::emitLoadLinked(IRBuilderBase &Builder, Type *ValueTy,
                                         Value *Addr,
                                         AtomicOrdering Ord) const {
  Module *M = Builder.GetInsertBlock()->getParent()->getParent();
  bool IsAcquire = isAcquireOrStronger(Ord);

  // Since i64 isn't legal and intrinsics don't get type-lowered, the ldrexd
  // intrinsic must return {i32, i32} and we have to recombine them into a
  // single i64 here.
  if (ValueTy->getPrimitiveSizeInBits() == 64) {
    Intrinsic::ID Int =
        IsAcquire ? Intrinsic::arm_ldaexd : Intrinsic::arm_ldrexd;
    Function *Ldrex = Intrinsic::getDeclaration(M, Int);

    Addr = Builder.CreateBitCast(Addr, Type::getInt8PtrTy(M->getContext()));
    Value *LoHi = Builder.CreateCall(Ldrex, Addr, "lohi");

    Value *Lo = Builder.CreateExtractValue(LoHi, 0, "lo");
    Value *Hi = Builder.CreateExtractValue(LoHi, 1, "hi");
    if (!Subtarget->isLittle())
      std::swap (Lo, Hi);
    Lo = Builder.CreateZExt(Lo, ValueTy, "lo64");
    Hi = Builder.CreateZExt(Hi, ValueTy, "hi64");
    return Builder.CreateOr(
        Lo, Builder.CreateShl(Hi, ConstantInt::get(ValueTy, 32)), "val64");
  }

  Type *Tys[] = { Addr->getType() };
  Intrinsic::ID Int = IsAcquire ? Intrinsic::arm_ldaex : Intrinsic::arm_ldrex;
  Function *Ldrex = Intrinsic::getDeclaration(M, Int, Tys);
  CallInst *CI = Builder.CreateCall(Ldrex, Addr);

  CI->addParamAttr(
      0, Attribute::get(M->getContext(), Attribute::ElementType, ValueTy));
  return Builder.CreateTruncOrBitCast(CI, ValueTy);
}
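
// A cmpxchg that fails its compare never reaches the store-conditional, so
// release the exclusive monitor with CLREX to balance the earlier
// load-exclusive; this is only emitted on v7 and later, matching the check
// below.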
void ARMTargetLowering::emitAtomicCmpXchgNoStoreLLBalance(
    IRBuilderBase &Builder) const {
  if (!Subtarget->hasV7Ops())
    return;
  Module *M = Builder.GetInsertBlock()->getParent()->getParent();
  Builder.CreateCall(Intrinsic::getDeclaration(M, Intrinsic::arm_clrex));
}

Value *ARMTargetLowering::emitStoreConditional(IRBuilderBase &Builder,
                                               Value *Val, Value *Addr,
                                               AtomicOrdering Ord) const {
  Module *M = Builder.GetInsertBlock()->getParent()->getParent();
  bool IsRelease = isReleaseOrStronger(Ord);

  // Since the intrinsics must have legal type, the i64 intrinsics take two
  // parameters: "i32, i32". We must marshal Val into the appropriate form
  // before the call.
  if (Val->getType()->getPrimitiveSizeInBits() == 64) {
    Intrinsic::ID Int =
        IsRelease ? Intrinsic::arm_stlexd : Intrinsic::arm_strexd;
    Function *Strex = Intrinsic::getDeclaration(M, Int);
    Type *Int32Ty = Type::getInt32Ty(M->getContext());

    Value *Lo = Builder.CreateTrunc(Val, Int32Ty, "lo");
    Value *Hi = Builder.CreateTrunc(Builder.CreateLShr(Val, 32), Int32Ty, "hi");
    if (!Subtarget->isLittle())
      std::swap(Lo, Hi);
    Addr = Builder.CreateBitCast(Addr, Type::getInt8PtrTy(M->getContext()));
    return Builder.CreateCall(Strex, {Lo, Hi, Addr});
  }

  Intrinsic::ID Int = IsRelease ? Intrinsic::arm_stlex : Intrinsic::arm_strex;
  Type *Tys[] = { Addr->getType() };
  Function *Strex = Intrinsic::getDeclaration(M, Int, Tys);

  CallInst *CI = Builder.CreateCall(
      Strex, {Builder.CreateZExtOrBitCast(
                  Val, Strex->getFunctionType()->getParamType(0)),
              Addr});
  CI->addParamAttr(1, Attribute::get(M->getContext(), Attribute::ElementType,
                                     Val->getType()));
  return CI;
}

bool ARMTargetLowering::alignLoopsWithOptSize() const {
  return Subtarget->isMClass();
}

/// A helper function for determining the number of interleaved accesses we
/// will generate when lowering accesses of the given type.
unsigned
ARMTargetLowering::getNumInterleavedAccesses(VectorType *VecTy,
                                             const DataLayout &DL) const {
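  // Each interleaved access covers at most one 128-bit Q register, so round
  // the total size in bits up to a multiple of 128.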
  return (DL.getTypeSizeInBits(VecTy) + 127) / 128;
}

bool ARMTargetLowering::isLegalInterleavedAccessType(
    unsigned Factor, FixedVectorType *VecTy, Align Alignment,
    const DataLayout &DL) const {

  unsigned VecSize = DL.getTypeSizeInBits(VecTy);
  unsigned ElSize = DL.getTypeSizeInBits(VecTy->getElementType());

  if (!Subtarget->hasNEON() && !Subtarget->hasMVEIntegerOps())
    return false;

  // Ensure the vector doesn't have f16 elements. Even though we could do an
  // i16 vldN, we can't hold the f16 vectors and will end up converting via
  // f32.
  if (Subtarget->hasNEON() && VecTy->getElementType()->isHalfTy())
    return false;
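  // MVE only provides the factor-2 and factor-4 forms (vld2/vld4, vst2/vst4),
  // so a factor of 3 cannot be lowered to a single wide access.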
  if (Subtarget->hasMVEIntegerOps() && Factor == 3)
    return false;

  // Ensure the number of vector elements is greater than 1.
  if (VecTy->getNumElements() < 2)
    return false;

  // Ensure the element type is legal.
  if (ElSize != 8 && ElSize != 16 && ElSize != 32)
    return false;
  // And the alignment is high enough under MVE.
  if (Subtarget->hasMVEIntegerOps() && Alignment < ElSize / 8)
    return false;

  // Ensure the total vector size is 64 or a multiple of 128. Types larger than
  // 128 will be split into multiple interleaved accesses.
  if (Subtarget->hasNEON() && VecSize == 64)
    return true;
  return VecSize % 128 == 0;
}

unsigned ARMTargetLowering::getMaxSupportedInterleaveFactor() const {
  if (Subtarget->hasNEON())
    return 4;
  if (Subtarget->hasMVEIntegerOps())
    return MVEMaxSupportedInterleaveFactor;
  return TargetLoweringBase::getMaxSupportedInterleaveFactor();
}

/// Lower an interleaved load into a vldN intrinsic.
///
/// E.g. Lower an interleaved load (Factor = 2):
///        %wide.vec = load <8 x i32>, <8 x i32>* %ptr, align 4
///        %v0 = shuffle %wide.vec, undef, <0, 2, 4, 6>  ; Extract even elements
///        %v1 = shuffle %wide.vec, undef, <1, 3, 5, 7>  ; Extract odd elements
///
///      Into:
///        %vld2 = { <4 x i32>, <4 x i32> } call llvm.arm.neon.vld2(%ptr, 4)
///        %vec0 = extractelement { <4 x i32>, <4 x i32> } %vld2, i32 0
///        %vec1 = extractelement { <4 x i32>, <4 x i32> } %vld2, i32 1
bool ARMTargetLowering::lowerInterleavedLoad(
    LoadInst *LI, ArrayRef<ShuffleVectorInst *> Shuffles,
    ArrayRef<unsigned> Indices, unsigned Factor) const {
  assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
         "Invalid interleave factor");
  assert(!Shuffles.empty() && "Empty shufflevector input");
  assert(Shuffles.size() == Indices.size() &&
         "Unmatched number of shufflevectors and indices");

  auto *VecTy = cast<FixedVectorType>(Shuffles[0]->getType());
  Type *EltTy = VecTy->getElementType();

  const DataLayout &DL = LI->getModule()->getDataLayout();
  Align Alignment = LI->getAlign();

  // Skip if we do not have NEON and skip illegal vector types. We can
  // "legalize" wide vector types into multiple interleaved accesses as long as
  // the vector types are divisible by 128.
  if (!isLegalInterleavedAccessType(Factor, VecTy, Alignment, DL))
    return false;

  unsigned NumLoads = getNumInterleavedAccesses(VecTy, DL);

  // A pointer vector can not be the return type of the ldN intrinsics. Need to
  // load integer vectors first and then convert to pointer vectors.
  if (EltTy->isPointerTy())
    VecTy = FixedVectorType::get(DL.getIntPtrType(EltTy), VecTy);

  IRBuilder<> Builder(LI);

  // The base address of the load.
  Value *BaseAddr = LI->getPointerOperand();

  if (NumLoads > 1) {
    // If we're going to generate more than one load, reset the sub-vector type
    // to something legal.
    VecTy = FixedVectorType::get(VecTy->getElementType(),
                                 VecTy->getNumElements() / NumLoads);

    // We will compute the pointer operand of each load from the original base
    // address using GEPs. Cast the base address to a pointer to the scalar
    // element type.
    BaseAddr = Builder.CreateBitCast(
        BaseAddr,
        VecTy->getElementType()->getPointerTo(LI->getPointerAddressSpace()));
  }

  assert(isTypeLegal(EVT::getEVT(VecTy)) && "Illegal vldN vector type!");

  auto createLoadIntrinsic = [&](Value *BaseAddr) {
    if (Subtarget->hasNEON()) {
      Type *Int8Ptr = Builder.getInt8PtrTy(LI->getPointerAddressSpace());
      Type *Tys[] = {VecTy, Int8Ptr};
      static const Intrinsic::ID LoadInts[3] = {Intrinsic::arm_neon_vld2,
                                                Intrinsic::arm_neon_vld3,
                                                Intrinsic::arm_neon_vld4};
      Function *VldnFunc =
          Intrinsic::getDeclaration(LI->getModule(), LoadInts[Factor - 2], Tys);

      SmallVector<Value *, 2> Ops;
      Ops.push_back(Builder.CreateBitCast(BaseAddr, Int8Ptr));
      Ops.push_back(Builder.getInt32(LI->getAlign().value()));
      return Builder.CreateCall(VldnFunc, Ops, "vldN");
    } else {
      assert((Factor == 2 || Factor == 4) &&
             "expected interleave factor of 2 or 4 for MVE");
      Intrinsic::ID LoadInts =
          Factor == 2 ? Intrinsic::arm_mve_vld2q : Intrinsic::arm_mve_vld4q;
      Type *VecEltTy =
          VecTy->getElementType()->getPointerTo(LI->getPointerAddressSpace());
      Type *Tys[] = {VecTy, VecEltTy};
      Function *VldnFunc =
          Intrinsic::getDeclaration(LI->getModule(), LoadInts, Tys);

      SmallVector<Value *, 2> Ops;
      Ops.push_back(Builder.CreateBitCast(BaseAddr, VecEltTy));
      return Builder.CreateCall(VldnFunc, Ops, "vldN");
    }
  };

  // Holds sub-vectors extracted from the load intrinsic return values. The
  // sub-vectors are associated with the shufflevector instructions they will
  // replace.
  DenseMap<ShuffleVectorInst *, SmallVector<Value *, 4>> SubVecs;

  for (unsigned LoadCount = 0; LoadCount < NumLoads; ++LoadCount) {
    // If we're generating more than one load, compute the base address of
    // subsequent loads as an offset from the previous.
    if (LoadCount > 0)
      BaseAddr = Builder.CreateConstGEP1_32(VecTy->getElementType(), BaseAddr,
                                            VecTy->getNumElements() * Factor);

    CallInst *VldN = createLoadIntrinsic(BaseAddr);

    // Replace uses of each shufflevector with the corresponding vector loaded
    // by ldN.
    for (unsigned i = 0; i < Shuffles.size(); i++) {
      ShuffleVectorInst *SV = Shuffles[i];
      unsigned Index = Indices[i];

      Value *SubVec = Builder.CreateExtractValue(VldN, Index);

      // Convert the integer vector to pointer vector if the element is pointer.
      if (EltTy->isPointerTy())
        SubVec = Builder.CreateIntToPtr(
            SubVec,
            FixedVectorType::get(SV->getType()->getElementType(), VecTy));

      SubVecs[SV].push_back(SubVec);
    }
  }

  // Replace uses of the shufflevector instructions with the sub-vectors
  // returned by the load intrinsic. If a shufflevector instruction is
  // associated with more than one sub-vector, those sub-vectors will be
  // concatenated into a single wide vector.
  for (ShuffleVectorInst *SVI : Shuffles) {
    auto &SubVec = SubVecs[SVI];
    auto *WideVec =
        SubVec.size() > 1 ? concatenateVectors(Builder, SubVec) : SubVec[0];
    SVI->replaceAllUsesWith(WideVec);
  }

  return true;
}

/// Lower an interleaved store into a vstN intrinsic.
///
/// E.g. Lower an interleaved store (Factor = 3):
///        %i.vec = shuffle <8 x i32> %v0, <8 x i32> %v1,
///                                  <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>
///        store <12 x i32> %i.vec, <12 x i32>* %ptr, align 4
///
///      Into:
///        %sub.v0 = shuffle <8 x i32> %v0, <8 x i32> v1, <0, 1, 2, 3>
///        %sub.v1 = shuffle <8 x i32> %v0, <8 x i32> v1, <4, 5, 6, 7>
///        %sub.v2 = shuffle <8 x i32> %v0, <8 x i32> v1, <8, 9, 10, 11>
///        call void llvm.arm.neon.vst3(%ptr, %sub.v0, %sub.v1, %sub.v2, 4)
///
/// Note that the new shufflevectors will be removed and we'll only generate one
/// vst3 instruction in CodeGen.
///
/// Example for a more general valid mask (Factor 3). Lower:
///        %i.vec = shuffle <32 x i32> %v0, <32 x i32> %v1,
///                 <4, 32, 16, 5, 33, 17, 6, 34, 18, 7, 35, 19>
///        store <12 x i32> %i.vec, <12 x i32>* %ptr
///
///      Into:
///        %sub.v0 = shuffle <32 x i32> %v0, <32 x i32> v1, <4, 5, 6, 7>
///        %sub.v1 = shuffle <32 x i32> %v0, <32 x i32> v1, <32, 33, 34, 35>
///        %sub.v2 = shuffle <32 x i32> %v0, <32 x i32> v1, <16, 17, 18, 19>
///        call void llvm.arm.neon.vst3(%ptr, %sub.v0, %sub.v1, %sub.v2, 4)
bool ARMTargetLowering::lowerInterleavedStore(StoreInst *SI,
                                              ShuffleVectorInst *SVI,
                                              unsigned Factor) const {
  assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
         "Invalid interleave factor");

  auto *VecTy = cast<FixedVectorType>(SVI->getType());
  assert(VecTy->getNumElements() % Factor == 0 && "Invalid interleaved store");

  unsigned LaneLen = VecTy->getNumElements() / Factor;
  Type *EltTy = VecTy->getElementType();
  auto *SubVecTy = FixedVectorType::get(EltTy, LaneLen);

  const DataLayout &DL = SI->getModule()->getDataLayout();
  Align Alignment = SI->getAlign();

  // Skip if we do not have NEON and skip illegal vector types. We can
  // "legalize" wide vector types into multiple interleaved accesses as long as
  // the vector types are divisible by 128.
  if (!isLegalInterleavedAccessType(Factor, SubVecTy, Alignment, DL))
    return false;

  unsigned NumStores = getNumInterleavedAccesses(SubVecTy, DL);

  Value *Op0 = SVI->getOperand(0);
  Value *Op1 = SVI->getOperand(1);
  IRBuilder<> Builder(SI);

  // StN intrinsics don't support pointer vectors as arguments. Convert pointer
  // vectors to integer vectors.
  if (EltTy->isPointerTy()) {
    Type *IntTy = DL.getIntPtrType(EltTy);

    // Convert to the corresponding integer vector.
    auto *IntVecTy =
        FixedVectorType::get(IntTy, cast<FixedVectorType>(Op0->getType()));
    Op0 = Builder.CreatePtrToInt(Op0, IntVecTy);
    Op1 = Builder.CreatePtrToInt(Op1, IntVecTy);

    SubVecTy = FixedVectorType::get(IntTy, LaneLen);
  }

  // The base address of the store.
  Value *BaseAddr = SI->getPointerOperand();

  if (NumStores > 1) {
    // If we're going to generate more than one store, reset the lane length
    // and sub-vector type to something legal.
    LaneLen /= NumStores;
    SubVecTy = FixedVectorType::get(SubVecTy->getElementType(), LaneLen);

    // We will compute the pointer operand of each store from the original base
    // address using GEPs. Cast the base address to a pointer to the scalar
    // element type.
    BaseAddr = Builder.CreateBitCast(
        BaseAddr,
        SubVecTy->getElementType()->getPointerTo(SI->getPointerAddressSpace()));
  }

  assert(isTypeLegal(EVT::getEVT(SubVecTy)) && "Illegal vstN vector type!");

  auto Mask = SVI->getShuffleMask();

  auto createStoreIntrinsic = [&](Value *BaseAddr,
                                  SmallVectorImpl<Value *> &Shuffles) {
    if (Subtarget->hasNEON()) {
      static const Intrinsic::ID StoreInts[3] = {Intrinsic::arm_neon_vst2,
                                                 Intrinsic::arm_neon_vst3,
                                                 Intrinsic::arm_neon_vst4};
      Type *Int8Ptr = Builder.getInt8PtrTy(SI->getPointerAddressSpace());
      Type *Tys[] = {Int8Ptr, SubVecTy};
      Function *VstNFunc = Intrinsic::getDeclaration(
          SI->getModule(), StoreInts[Factor - 2], Tys);

      SmallVector<Value *, 6> Ops;
      Ops.push_back(Builder.CreateBitCast(BaseAddr, Int8Ptr));
      append_range(Ops, Shuffles);
      Ops.push_back(Builder.getInt32(SI->getAlign().value()));
      Builder.CreateCall(VstNFunc, Ops);
    } else {
      assert((Factor == 2 || Factor == 4) &&
             "expected interleave factor of 2 or 4 for MVE");
      Intrinsic::ID StoreInts =
          Factor == 2 ? Intrinsic::arm_mve_vst2q : Intrinsic::arm_mve_vst4q;
      Type *EltPtrTy = SubVecTy->getElementType()->getPointerTo(
          SI->getPointerAddressSpace());
      Type *Tys[] = {EltPtrTy, SubVecTy};
      Function *VstNFunc =
          Intrinsic::getDeclaration(SI->getModule(), StoreInts, Tys);

      SmallVector<Value *, 6> Ops;
      Ops.push_back(Builder.CreateBitCast(BaseAddr, EltPtrTy));
      append_range(Ops, Shuffles);
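      // The MVE vst2q/vst4q intrinsics write their data in stages; emit one
      // call per stage, selected by the trailing operand.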
      for (unsigned F = 0; F < Factor; F++) {
        Ops.push_back(Builder.getInt32(F));
        Builder.CreateCall(VstNFunc, Ops);
        Ops.pop_back();
      }
    }
  };

  for (unsigned StoreCount = 0; StoreCount < NumStores; ++StoreCount) {
    // If we're generating more than one store, we compute the base address of
    // subsequent stores as an offset from the previous.
    if (StoreCount > 0)
      BaseAddr = Builder.CreateConstGEP1_32(SubVecTy->getElementType(),
                                            BaseAddr, LaneLen * Factor);

    SmallVector<Value *, 4> Shuffles;

    // Split the shufflevector operands into sub vectors for the new vstN call.
    for (unsigned i = 0; i < Factor; i++) {
      unsigned IdxI = StoreCount * LaneLen * Factor + i;
      if (Mask[IdxI] >= 0) {
        Shuffles.push_back(Builder.CreateShuffleVector(
            Op0, Op1, createSequentialMask(Mask[IdxI], LaneLen, 0)));
      } else {
        unsigned StartMask = 0;
        for (unsigned j = 1; j < LaneLen; j++) {
          unsigned IdxJ = StoreCount * LaneLen * Factor + j;
          if (Mask[IdxJ * Factor + IdxI] >= 0) {
            StartMask = Mask[IdxJ * Factor + IdxI] - IdxJ;
            break;
          }
        }
        // Note: If all elements in a chunk are undefs, StartMask=0!
        // Note: Filling undef gaps with random elements is ok, since
        // those elements were being written anyway (with undefs).
        // In the case of all undefs we're defaulting to using elems from 0.
        // Note: StartMask cannot be negative, it's checked in
        // isReInterleaveMask.
        Shuffles.push_back(Builder.CreateShuffleVector(
            Op0, Op1, createSequentialMask(StartMask, LaneLen, 0)));
      }
    }

    createStoreIntrinsic(BaseAddr, Shuffles);
  }
  return true;
}

enum HABaseType {
  HA_UNKNOWN = 0,
  HA_FLOAT,
  HA_DOUBLE,
  HA_VECT64,
  HA_VECT128
};
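
/// Return true if Ty is an AAPCS-VFP homogeneous aggregate: between one and
/// four members of the same base type (float, double, or a 64-bit or 128-bit
/// vector), counted recursively through nested structs and arrays.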
static bool isHomogeneousAggregate(Type *Ty, HABaseType &Base,
                                   uint64_t &Members) {
  if (auto *ST = dyn_cast<StructType>(Ty)) {
    for (unsigned i = 0; i < ST->getNumElements(); ++i) {
      uint64_t SubMembers = 0;
      if (!isHomogeneousAggregate(ST->getElementType(i), Base, SubMembers))
        return false;
      Members += SubMembers;
    }
  } else if (auto *AT = dyn_cast<ArrayType>(Ty)) {
    uint64_t SubMembers = 0;
    if (!isHomogeneousAggregate(AT->getElementType(), Base, SubMembers))
      return false;
    Members += SubMembers * AT->getNumElements();
  } else if (Ty->isFloatTy()) {
    if (Base != HA_UNKNOWN && Base != HA_FLOAT)
      return false;
    Members = 1;
    Base = HA_FLOAT;
  } else if (Ty->isDoubleTy()) {
    if (Base != HA_UNKNOWN && Base != HA_DOUBLE)
      return false;
    Members = 1;
    Base = HA_DOUBLE;
  } else if (auto *VT = dyn_cast<VectorType>(Ty)) {
    Members = 1;
    switch (Base) {
    case HA_FLOAT:
    case HA_DOUBLE:
      return false;
    case HA_VECT64:
      return VT->getPrimitiveSizeInBits().getFixedValue() == 64;
    case HA_VECT128:
      return VT->getPrimitiveSizeInBits().getFixedValue() == 128;
    case HA_UNKNOWN:
      switch (VT->getPrimitiveSizeInBits().getFixedValue()) {
      case 64:
        Base = HA_VECT64;
        return true;
      case 128:
        Base = HA_VECT128;
        return true;
      default:
        return false;
      }
    }
  }

  return (Members > 0 && Members <= 4);
}

/// Return the correct alignment for the current calling convention.
Align ARMTargetLowering::getABIAlignmentForCallingConv(
    Type *ArgTy, const DataLayout &DL) const {
  const Align ABITypeAlign = DL.getABITypeAlign(ArgTy);
  if (!ArgTy->isVectorTy())
    return ABITypeAlign;

  // Avoid over-aligning vector parameters. It would require realigning the
  // stack and waste space for no real benefit.
  return std::min(ABITypeAlign, DL.getStackAlignment());
}

/// Return true if a type is an AAPCS-VFP homogeneous aggregate or one of
/// [N x i32] or [N x i64]. This allows front-ends to skip emitting padding when
/// passing according to AAPCS rules.
bool ARMTargetLowering::functionArgumentNeedsConsecutiveRegisters(
    Type *Ty, CallingConv::ID CallConv, bool isVarArg,
    const DataLayout &DL) const {
  if (getEffectiveCallingConv(CallConv, isVarArg) !=
      CallingConv::ARM_AAPCS_VFP)
    return false;

  HABaseType Base = HA_UNKNOWN;
  uint64_t Members = 0;
  bool IsHA = isHomogeneousAggregate(Ty, Base, Members);
  LLVM_DEBUG(dbgs() << "isHA: " << IsHA << " "; Ty->dump());

  bool IsIntArray = Ty->isArrayTy() && Ty->getArrayElementType()->isIntegerTy();
  return IsHA || IsIntArray;
}

Register ARMTargetLowering::getExceptionPointerRegister(
    const Constant *PersonalityFn) const {
  // Platforms which do not use SjLj EH may return values in these registers
  // via the personality function.
  return Subtarget->useSjLjEH() ? Register() : ARM::R0;
}

Register ARMTargetLowering::getExceptionSelectorRegister(
    const Constant *PersonalityFn) const {
  // Platforms which do not use SjLj EH may return values in these registers
  // via the personality function.
  return Subtarget->useSjLjEH() ? Register() : ARM::R1;
}

void ARMTargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {
  // Update IsSplitCSR in ARMFunctionInfo.
  ARMFunctionInfo *AFI = Entry->getParent()->getInfo<ARMFunctionInfo>();
  AFI->setIsSplitCSR(true);
}

void ARMTargetLowering::insertCopiesSplitCSR(
    MachineBasicBlock *Entry,
    const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
  const ARMBaseRegisterInfo *TRI = Subtarget->getRegisterInfo();
  const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
  if (!IStart)
    return;

  const TargetInstrInfo *TII = Subtarget->getInstrInfo();
  MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
  MachineBasicBlock::iterator MBBI = Entry->begin();
  for (const MCPhysReg *I = IStart; *I; ++I) {
    const TargetRegisterClass *RC = nullptr;
    if (ARM::GPRRegClass.contains(*I))
      RC = &ARM::GPRRegClass;
    else if (ARM::DPRRegClass.contains(*I))
      RC = &ARM::DPRRegClass;
    else
      llvm_unreachable("Unexpected register class in CSRsViaCopy!");

    Register NewVR = MRI->createVirtualRegister(RC);
    // Create copy from CSR to a virtual register.
    // FIXME: this currently does not emit CFI pseudo-instructions, it works
    // fine for CXX_FAST_TLS since the C++-style TLS access functions should be
    // nounwind. If we want to generalize this later, we may need to emit
    // CFI pseudo-instructions.
    assert(Entry->getParent()->getFunction().hasFnAttribute(
               Attribute::NoUnwind) &&
           "Function should be nounwind in insertCopiesSplitCSR!");
    Entry->addLiveIn(*I);
    BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
        .addReg(*I);

    // Insert the copy-back instructions right before the terminator.
    for (auto *Exit : Exits)
      BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
              TII->get(TargetOpcode::COPY), *I)
          .addReg(NewVR);
  }
}

void ARMTargetLowering::finalizeLowering(MachineFunction &MF) const {
  MF.getFrameInfo().computeMaxCallFrameSize(MF);
  TargetLoweringBase::finalizeLowering(MF);
}

bool ARMTargetLowering::isComplexDeinterleavingSupported() const {
  return Subtarget->hasMVEIntegerOps();
}

bool ARMTargetLowering::isComplexDeinterleavingOperationSupported(
    ComplexDeinterleavingOperation Operation, Type *Ty) const {
  auto *VTy = dyn_cast<FixedVectorType>(Ty);
  if (!VTy)
    return false;

  auto *ScalarTy = VTy->getScalarType();
  unsigned NumElements = VTy->getNumElements();

  unsigned VTyWidth = VTy->getScalarSizeInBits() * NumElements;
  if (VTyWidth < 128 || !llvm::isPowerOf2_32(VTyWidth))
    return false;

  // Both VCADD and VCMUL/VCMLA support the same types, F16 and F32
  if (ScalarTy->isHalfTy() || ScalarTy->isFloatTy())
    return Subtarget->hasMVEFloatOps();

  if (Operation != ComplexDeinterleavingOperation::CAdd)
    return false;

  return Subtarget->hasMVEIntegerOps() &&
         (ScalarTy->isIntegerTy(8) || ScalarTy->isIntegerTy(16) ||
          ScalarTy->isIntegerTy(32));
}

Value *ARMTargetLowering::createComplexDeinterleavingIR(
    Instruction *I, ComplexDeinterleavingOperation OperationType,
    ComplexDeinterleavingRotation Rotation, Value *InputA, Value *InputB,
    Value *Accumulator) const {
  FixedVectorType *Ty = cast<FixedVectorType>(InputA->getType());

  IRBuilder<> B(I);

  unsigned TyWidth = Ty->getScalarSizeInBits() * Ty->getNumElements();

  assert(TyWidth >= 128 && "Width of vector type must be at least 128 bits");
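  // Vectors wider than 128 bits are split in half, the operation is emitted
  // recursively on each half, and the two results are concatenated back
  // together.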
  if (TyWidth > 128) {
    int Stride = Ty->getNumElements() / 2;
    auto SplitSeq = llvm::seq<int>(0, Ty->getNumElements());
    auto SplitSeqVec = llvm::to_vector(SplitSeq);
    ArrayRef<int> LowerSplitMask(&SplitSeqVec[0], Stride);
    ArrayRef<int> UpperSplitMask(&SplitSeqVec[Stride], Stride);

    auto *LowerSplitA = B.CreateShuffleVector(InputA, LowerSplitMask);
    auto *LowerSplitB = B.CreateShuffleVector(InputB, LowerSplitMask);
    auto *UpperSplitA = B.CreateShuffleVector(InputA, UpperSplitMask);
    auto *UpperSplitB = B.CreateShuffleVector(InputB, UpperSplitMask);
    Value *LowerSplitAcc = nullptr;
    Value *UpperSplitAcc = nullptr;

    if (Accumulator) {
      LowerSplitAcc = B.CreateShuffleVector(Accumulator, LowerSplitMask);
      UpperSplitAcc = B.CreateShuffleVector(Accumulator, UpperSplitMask);
    }

    auto *LowerSplitInt = createComplexDeinterleavingIR(
        I, OperationType, Rotation, LowerSplitA, LowerSplitB, LowerSplitAcc);
    auto *UpperSplitInt = createComplexDeinterleavingIR(
        I, OperationType, Rotation, UpperSplitA, UpperSplitB, UpperSplitAcc);

    ArrayRef<int> JoinMask(&SplitSeqVec[0], Ty->getNumElements());
    return B.CreateShuffleVector(LowerSplitInt, UpperSplitInt, JoinMask);
  }

  auto *IntTy = Type::getInt32Ty(B.getContext());

  ConstantInt *ConstRotation = nullptr;
  if (OperationType == ComplexDeinterleavingOperation::CMulPartial) {
    ConstRotation = ConstantInt::get(IntTy, (int)Rotation);

    if (Accumulator)
      return B.CreateIntrinsic(Intrinsic::arm_mve_vcmlaq, Ty,
                               {ConstRotation, Accumulator, InputB, InputA});
    return B.CreateIntrinsic(Intrinsic::arm_mve_vcmulq, Ty,
                             {ConstRotation, InputB, InputA});
  }

  if (OperationType == ComplexDeinterleavingOperation::CAdd) {
    // 1 means the value is not halved.
    auto *ConstHalving = ConstantInt::get(IntTy, 1);

    if (Rotation == ComplexDeinterleavingRotation::Rotation_90)
      ConstRotation = ConstantInt::get(IntTy, 0);
    else if (Rotation == ComplexDeinterleavingRotation::Rotation_270)
      ConstRotation = ConstantInt::get(IntTy, 1);

    if (!ConstRotation)
      return nullptr; // Invalid rotation for arm_mve_vcaddq

    return B.CreateIntrinsic(Intrinsic::arm_mve_vcaddq, Ty,
                             {ConstHalving, ConstRotation, InputA, InputB});
  }

  return nullptr;
}